/*
*******************************************************************************
*
* Copyright (C) 2004-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: xmlparser.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2004jul21
* created by: Andy Heninger
*/
#include <stdio.h>
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/regex.h"
#include "filestrm.h"
#include "xmlparser.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
// character constants
enum {
x_QUOT=0x22,
x_AMP=0x26,
x_APOS=0x27,
x_LT=0x3c,
x_GT=0x3e,
x_l=0x6c
};
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"
// XML #4
#define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
"[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"
// XML #5
#define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
// XML #6
#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
//
// UXMLParser constructor. Mostly just initializes the ICU regexes that are
// used for parsing.
//
UXMLParser::UXMLParser(UErrorCode &status) :
// XML Declaration. XML Production #23.
// example: "<?xml version=1.0 encoding="utf-16" ?>
// This is a sloppy implementation - just look for the leading <?xml and the closing ?>
// allow for a possible leading BOM.
mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
// XML Comment production #15
// example: "<!-- whatever -->
// note, does not detect an illegal "--" within comments
mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
// XML Spaces
// production [3]
mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
// XML Doctype decl production #28
// example "<!DOCTYPE foo SYSTEM "somewhere" >
// or "<!DOCTYPE foo [internal dtd]>
// TODO: we don't actually parse the DOCTYPE or internal subsets.
// Some internal dtd subsets could confuse this simple-minded
// attempt at skipping over them, specifically, occcurences
// of closeing square brackets. These could appear in comments,
// or in parameter entity declarations, for example.
mXMLDoctype(UnicodeString(
"(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
), 0, status),
// XML PI production #16
// example "<?target stuff?>
mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
// XML Element Start Productions #40, #41
// example <foo att1='abc' att2="d e f" >
// capture #1: the tag name
//
mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"
"(?:"
XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
"(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
")*" // * for zero or more attributes.
XML_SPACES "*?>", -1, US_INV), 0, status), // match " >"
// XML Element End production #42
// example </foo>
mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
// XML Element Empty production #44
// example <foo att1="abc" att2="d e f" />
mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"
"(?:"
XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
"(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
")*" // * for zero or more attributes.
XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />"
// XMLCharData. Everything but '<'. Note that & will be dealt with later.
mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
// Attribute name = "value". XML Productions 10, 40/41
// Capture group 1 is name,
// 2 is the attribute value, including the quotes.
//
// Note that attributes are scanned twice. The first time is with
// the regex for an entire element start. There, the attributes
// are checked syntactically, but not separted out one by one.
// Here, we match a single attribute, and make its name and
// attribute value available to the parser code.
mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*"
"((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
// Match any of the new-line sequences in content.
// All are changed to \u000a.
mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
// & char references
// We will figure out what we've got based on which capture group has content.
// The last one is a catchall for unrecognized entity references..
// 1 2 3 4 5 6 7 8
mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
0, status),
fNames(status),
fElementStack(status),
fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization.
{
}
UXMLParser *
UXMLParser::createParser(UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) {
return NULL;
} else {
return new UXMLParser(errorCode);
}
}
UXMLParser::~UXMLParser() {}
UXMLElement *
UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
char bytes[4096], charsetBuffer[100];
FileStream *f;
const char *charset, *pb;
UnicodeString src;
UConverter *cnv;
UChar *buffer, *pu;
int32_t fileLength, bytesLength, length, capacity;
UBool flush;
if(U_FAILURE(errorCode)) {
return NULL;
}
f=T_FileStream_open(filename, "rb");
if(f==NULL) {
errorCode=U_FILE_ACCESS_ERROR;
return NULL;
}
bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
if(bytesLength<(int32_t)sizeof(bytes)) {
// we have already read the entire file
fileLength=bytesLength;
} else {
// get the file length
fileLength=T_FileStream_size(f);
}
/*
* get the charset:
* 1. Unicode signature
* 2. treat as ISO-8859-1 and read XML encoding="charser"
* 3. default to UTF-8
*/
charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
if(U_SUCCESS(errorCode) && charset!=NULL) {
// open converter according to Unicode signature
cnv=ucnv_open(charset, &errorCode);
} else {
// read as Latin-1 and parse the XML declaration and encoding
cnv=ucnv_open("ISO-8859-1", &errorCode);
if(U_FAILUR
没有合适的资源?快使用搜索试试~ 我知道了~
xmlparser.rar_The Just
共4个文件
h:2个
c:2个
1.该资源内容由用户上传,如若侵权请联系客服进行举报
2.虚拟产品一经售出概不退款(资源遇到问题,请及时私信上传者)
2.虚拟产品一经售出概不退款(资源遇到问题,请及时私信上传者)
版权申诉
0 下载量 23 浏览量
2022-09-21
19:42:31
上传
评论
收藏 11KB RAR 举报
温馨提示
outside the clip. So we can t just fill where the user bits are 0. We also need to check that the clip bit is set.
资源推荐
资源详情
资源评论
收起资源包目录
xmlparser.rar (4个子文件)
xmlparser.h 4KB
xmlparser.c 28KB
GrStencilAndCoverPathRenderer.h 1KB
GrStencilAndCoverPathRenderer.c 4KB
共 4 条
- 1
资源评论
JonSco
- 粉丝: 66
- 资源: 1万+
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功