/*
www.sourceforge.net/projects/tinyxml
Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any
damages arising from the use of this software.
Permission is granted to anyone to use this software for any
purpose, including commercial applications, and to alter it and
redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product documentation
would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and
must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source
distribution.
*/
#include <ctype.h>
#include <stddef.h>
#include "tinyxml.h"
// Parser debug logging. Uncomment the define below to make TIXML_LOG available
// in this file; while DEBUG_PARSER is undefined, TIXML_LOG is not defined here.
//#define DEBUG_PARSER
#if defined( DEBUG_PARSER )
// Choose the logging sink: OutputDebugString for MSVC builds that also define
// DEBUG, printf for every other build.
# if defined( DEBUG ) && defined( _MSC_VER )
# include <windows.h>
# define TIXML_LOG OutputDebugString
# else
// NOTE(review): printf needs <stdio.h>, which is not included directly in this
// file -- presumably it arrives through tinyxml.h; confirm.
# define TIXML_LOG printf
# endif
#endif
// The five predefined XML entities. Each entry holds the entity text as it
// appears in a document, the length of that text, and the character it
// stands for.
//
// Note that "PutString" hardcodes the same list. This
// is less flexible than it appears. Changing the entries
// or order will break putstring.
TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
{
    { "&amp;",  5, '&' },
    { "&lt;",   4, '<' },
    { "&gt;",   4, '>' },
    { "&quot;", 6, '\"' },
    { "&apos;", 6, '\'' }
};
// A good collection of Unicode information lives at:
// http://www.unicode.org/faq/utf_bom.html
// That page is also the basis of the utf8ByteTable below, which gives the
// number of bytes in a UTF-8 sequence from its lead byte. Invalid lead bytes
// are assigned a length of 1 -- the result will be junk, but the input is
// passed through as far as possible.
// Beware of these special 3-byte sequences in UTF-8:
// ef bb bf (U+FEFF, the byte order mark; Microsoft "lead bytes")
// ef bf be (U+FFFE, a non-character)
// ef bf bf (U+FFFF, a non-character)
// The byte values EF, BB and BF, kept as unsigned char so they can be compared
// directly against input bytes that have been read as unsigned.
const unsigned char TIXML_UTF_LEAD_0 = 0xEFu;
const unsigned char TIXML_UTF_LEAD_1 = 0xBBu;
const unsigned char TIXML_UTF_LEAD_2 = 0xBFu;
// Lookup table indexed by the first (lead) byte of a UTF-8 sequence; the entry
// is the number of bytes in that sequence. Bytes that cannot start a valid
// sequence map to 1 so the input is still consumed one byte at a time.
// Each row covers 16 byte values; the trailing comment gives the row's base.
const int TiXmlBase::utf8ByteTable[256] =
{
// Low nibble of the byte value:
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range (single-byte characters)
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 are invalid as lead bytes
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0
1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc0 and 0xc1 invalid; 0xc2 to 0xdf start a 2-byte sequence
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef start a 3-byte sequence
4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 start a 4-byte sequence; 0xf5 and higher invalid
};
// Encode one UTF-32 value as a UTF-8 byte sequence.
//   input  - the value to encode.
//   output - receives the encoded bytes; needs room for up to 4 bytes.
//            No terminating null is written.
//   length - receives the number of bytes written (1 to 4), or 0 when input
//            is 0x200000 or larger, in which case output is left untouched.
// No check is made for surrogates or for values above 0x10FFFF: every value
// below 0x200000 is encoded.
void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
{
    const unsigned long TRAIL_MASK = 0xBF;
    const unsigned long TRAIL_MARK = 0x80;
    // Prefix bits of the first byte, indexed by the total sequence length.
    const unsigned long LEAD_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

    // Work out how many bytes the value needs.
    int byteCount = 0;
    if ( input < 0x80 )
        byteCount = 1;
    else if ( input < 0x800 )
        byteCount = 2;
    else if ( input < 0x10000 )
        byteCount = 3;
    else if ( input < 0x200000 )
        byteCount = 4;

    *length = byteCount;
    if ( byteCount == 0 )
        return;     // Out of range: this code cannot convert it correctly.

    // Fill the buffer from the last byte back to the first. Every trailing
    // byte carries the low 6 bits of what is left of the value.
    char* cursor = output + byteCount;
    for ( int remaining = byteCount; remaining > 1; --remaining )
    {
        --cursor;
        *cursor = (char)((input | TRAIL_MARK) & TRAIL_MASK);
        input >>= 6;
    }

    // The first byte holds the remaining high bits plus the length prefix.
    --cursor;
    *cursor = (char)(input | LEAD_MARK[byteCount]);
}
// Report whether a byte may be treated as a letter.
//   anyByte  - the byte to classify.
//   encoding - unused; every encoding is handled the same way.
// Returns the result of isalpha() for 7-bit ASCII bytes (0x00-0x7F) and 1
// for every byte with the high bit set.
//
// This only classifies low ASCII properly; everything else is assumed to be a
// valid letter. Working out alphabetical vs. not across encodings is tricky
// and the Unicode set is huge, so the conservative choice is to get the
// English letters right and accept the rest.
/*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
{
    // The boundary is 128, not 127: DEL (0x7F) is still ASCII and must be
    // classified by isalpha() rather than accepted blindly as a letter.
    if ( anyByte < 128 )
        return isalpha( anyByte );

    return 1;
}
// Report whether a byte may be treated as a letter or a digit.
//   anyByte  - the byte to classify.
//   encoding - unused; every encoding is handled the same way.
// Returns the result of isalnum() for 7-bit ASCII bytes (0x00-0x7F) and 1
// for every byte with the high bit set.
//
// This only classifies low ASCII properly; everything else is assumed to be a
// valid letter. Working out alphabetical vs. not across encodings is tricky
// and the Unicode set is huge, so the conservative choice is to get the
// English characters right and accept the rest.
/*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
{
    // The boundary is 128, not 127: DEL (0x7F) is still ASCII and must be
    // classified by isalnum() rather than accepted blindly.
    if ( anyByte < 128 )
        return isalnum( anyByte );

    return 1;
}
// Bookkeeping for the row/column position while text is being parsed.
// The constructor is private, so only the friend class TiXmlDocument can
// create an instance.
class TiXmlParsingData
{
    friend class TiXmlDocument;

  public:
    // Scans the text from the stored stamp position up to `now`, counting
    // rows and columns. Defined out of line; it returns immediately when
    // the tab size is below 1.
    void Stamp( const char* now, TiXmlEncoding encoding );

    // Read-only access to the tracked position.
    const TiXmlCursor& Cursor() { return cursor; }

  private:
    // Only used by the document!
    //   start    - text position the cursor values refer to; must not be null.
    //   _tabsize - tab width used when counting columns.
    //   row, col - initial cursor position.
    TiXmlParsingData( const char* start, int _tabsize, int row, int col )
        : stamp( start ),
          tabsize( _tabsize )
    {
        assert( start );
        cursor.row = row;
        cursor.col = col;
    }

    TiXmlCursor cursor;     // Current row/column.
    const char* stamp;      // Text position matching `cursor`.
    int tabsize;            // Tab width.
};
void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
{
assert( now );
// Do nothing if the tabsize is 0.
if ( tabsize < 1 )
{
return;
}
// Get the current row, column.
int row = cursor.row;
int col = cursor.col;
const char* p = stamp;
assert( p );
while ( p < now )
{
// Treat p as unsigned, so we have a happy compiler.
const unsigned char* pU = (const unsigned char*)p;
// Code contributed by Fletcher Dunn: (modified by lee)
switch (*pU) {
case 0:
// We *should* never get here, but in case we do, don't
// advance past the terminating null character, ever
return;
case '\r':
// bump down to the next line
++row;
col = 0;
// Eat the character
++p;
// Check for \r\n sequence, and treat this as a single character
if (*p == '\n') {
++p;
}
break;
case '\n':
// bump down to the next line
++row;
col = 0;
// Eat the character
++p;
// Check for \n\r sequence, and treat this as a single
// character. (Yes, this bizarre thing does occur still
// on some arcane platforms...)
if (*p == '\r') {
++p;
}
break;
case '\t':
// Eat the character
++p;
// Skip to next tab stop
col = (col / tabsize + 1) * tabsize;
break;
case TIXML_UTF_LEAD_0:
if ( encoding == TIXML_ENCODING_UTF8 )
{
if ( *(p+1) && *(p+2) )
{
// In these cases, don't advance the column. These are
// 0-width spaces.
if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
p += 3;
else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
p += 3;
else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
p += 3;
else
{ p +=3; ++col; } // A normal character.
}
}
else
{
++p;
++col;
}
break;
default:
if ( encoding == TIX
XML解析器,用于解析XML文件
4星 · 超过85%的资源 需积分: 23 136 浏览量
2011-11-06
16:37:26
上传
评论
收藏 250KB RAR 举报
kj270
- 粉丝: 5
- 资源: 5
最新资源
- AIS2024 valid
- 最入门的爬虫代码 python.docx
- 爬虫零基础入门-爬取天气预报.pdf
- 最通俗易懂的 MongoDB 非结构化文档存储数据库教程.zip
- 以mongodb为数据库的订单物流小项目.zip
- 腾讯云-mongodb数据库, 项目部署.zip
- 腾讯 APIJSON 的 MongoDB 数据库插件.zip
- 理解非关系型数据库和关系型数据库的区别.zip
- 操作简单的Mongodb网页web管理工具,基于Spring Boot2.0支持mongodb集群.zip
- tms-mongodb-web,提供访问mongodb数据的REST API和可灵活扩展的mongodb web 客户端.zip
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈