xmlparser.rar_TheJust资源-CSDN文库

共4个文件

h：2个

c：2个

版权申诉

23 浏览量 2022-09-21 19:42:31 上传评论收藏 11KB RAR 举报

资源推荐

资源详情

资源评论

收起资源包目录

xmlparser.rar （4个子文件）

xmlparser.h 4KB

xmlparser.c 28KB

GrStencilAndCoverPathRenderer.h 1KB

GrStencilAndCoverPathRenderer.c 4KB

/* ******************************************************************************* * * Copyright (C) 2004-2010, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: xmlparser.cpp * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2004jul21 * created by: Andy Heninger */ #include <stdio.h> #include "unicode/uchar.h" #include "unicode/ucnv.h" #include "unicode/regex.h" #include "filestrm.h" #include "xmlparser.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION // character constants enum { x_QUOT=0x22, x_AMP=0x26, x_APOS=0x27, x_LT=0x3c, x_GT=0x3e, x_l=0x6c }; #define XML_SPACES "[ \\u0009\\u000d\\u000a]" // XML #4 #define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \ "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \ "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \ "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" // XML #5 #define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]" // XML #6 #define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*" U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser) UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement) // // UXMLParser constructor. Mostly just initializes the ICU regexes that are // used for parsing. // UXMLParser::UXMLParser(UErrorCode &status) : // XML Declaration. XML Production #23. // example: "<?xml version=1.0 encoding="utf-16" ?> // This is a sloppy implementation - just look for the leading <?xml and the closing ?> // allow for a possible leading BOM. mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), // XML Comment production #15 // example: " // note, does not detect an illegal "--" within comments mXMLComment(UnicodeString("(?s)", -1, US_INV), 0, status), // XML Spaces // production [3] mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), // XML Doctype decl production #28 // example "<!DOCTYPE foo SYSTEM "somewhere" > // or "<!DOCTYPE foo [internal dtd]> // TODO: we don't actually parse the DOCTYPE or internal subsets. // Some internal dtd subsets could confuse this simple-minded // attempt at skipping over them, specifically, occcurences // of closeing square brackets. These could appear in comments, // or in parameter entity declarations, for example. mXMLDoctype(UnicodeString( "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV ), 0, status), // XML PI production #16 // example "<?target stuff?> mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), // XML Element Start Productions #40, #41 // example <foo att1='abc' att2="d e f" > // capture #1: the tag name // mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" "(?:" XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' ")*" // * for zero or more attributes. XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" // XML Element End production #42 // example </foo> mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status), // XML Element Empty production #44 // example <foo att1="abc" att2="d e f" /> mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" "(?:" XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' ")*" // * for zero or more attributes. XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" // XMLCharData. Everything but '<'. Note that & will be dealt with later. mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), // Attribute name = "value". XML Productions 10, 40/41 // Capture group 1 is name, // 2 is the attribute value, including the quotes. // // Note that attributes are scanned twice. The first time is with // the regex for an entire element start. There, the attributes // are checked syntactically, but not separted out one by one. // Here, we match a single attribute, and make its name and // attribute value available to the parser code. mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), // Match any of the new-line sequences in content. // All are changed to \u000a. mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), // & char references // We will figure out what we've got based on which capture group has content. // The last one is a catchall for unrecognized entity references.. // 1 2 3 4 5 6 7 8 mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), 0, status), fNames(status), fElementStack(status), fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization. { } UXMLParser * UXMLParser::createParser(UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return NULL; } else { return new UXMLParser(errorCode); } } UXMLParser::~UXMLParser() {} UXMLElement * UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { char bytes[4096], charsetBuffer[100]; FileStream *f; const char *charset, *pb; UnicodeString src; UConverter *cnv; UChar *buffer, *pu; int32_t fileLength, bytesLength, length, capacity; UBool flush; if(U_FAILURE(errorCode)) { return NULL; } f=T_FileStream_open(filename, "rb"); if(f==NULL) { errorCode=U_FILE_ACCESS_ERROR; return NULL; } bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); if(bytesLength<(int32_t)sizeof(bytes)) { // we have already read the entire file fileLength=bytesLength; } else { // get the file length fileLength=T_FileStream_size(f); } /* * get the charset: * 1. Unicode signature * 2. treat as ISO-8859-1 and read XML encoding="charser" * 3. default to UTF-8 */ charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode); if(U_SUCCESS(errorCode) && charset!=NULL) { // open converter according to Unicode signature cnv=ucnv_open(charset, &errorCode); } else { // read as Latin-1 and parse the XML declaration and encoding cnv=ucnv_open("ISO-8859-1", &errorCode); if(U_FAILUR

评论收藏

内容反馈

版权申诉