// Markup.cpp: implementation of the CMarkup class.
//
// Markup Release 11.5
// Copyright (C) 2011 First Objective Software, Inc. All rights reserved
// Go to www.firstobject.com for the latest CMarkup and EDOM documentation
// Use in commercial applications requires written permission
// This software is provided "as is", with no warranty.
//
#include <stdio.h>
//#include "..\stdafx.h"
#include "Markup.h"
#if defined(MCD_STRERROR) // C error routine
#include <errno.h>
#endif // C error routine
#if defined (MARKUP_ICONV)
#include <iconv.h>
#endif
#define x_ATTRIBQUOTE '\"' // can be double or single quote
#if defined(MARKUP_STL) && ( defined(MARKUP_WINCONV) || (! defined(MCD_STRERROR)))
#include <windows.h> // for MultiByteToWideChar, WideCharToMultiByte, FormatMessage
#endif // need windows.h when STL and (not setlocale or not strerror), MFC afx.h includes it already
#if defined(MARKUP_MBCS) // MBCS/double byte
#pragma message( "Note: MBCS build (not UTF-8)" )
// For UTF-8, remove MBCS from project settings C/C++ preprocessor definitions
#if defined (MARKUP_WINCONV)
#include <mbstring.h> // for VC++ _mbclen
#endif // WINCONV
#endif // MBCS/double byte
#if defined(_DEBUG) && _MSC_VER > 1000 // VC++ DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#if defined(DEBUG_NEW)
#define new DEBUG_NEW
#endif // DEBUG_NEW
#endif // VC++ DEBUG
// Disable warning C4127 (conditional expression is constant, e.g. "while ( 1 )") in VC++ 2002 and later
#if _MSC_VER >= 1300 // VC++ 2002 (7.0)
#pragma warning(disable:4127)
#endif // VC++ 2002 (7.0)
//////////////////////////////////////////////////////////////////////
// Internal static utility functions
//
void x_StrInsertReplace( MCD_STR& str, int nLeft, int nReplace, const MCD_STR& strInsert )
{
    // Overwrite nReplace chars of str, starting at offset nLeft, with strInsert
    //   str        string modified in place
    //   nLeft      offset in str where the replaced span begins
    //   nReplace   number of chars removed at nLeft (0 gives a pure insert)
    //   strInsert  text placed at nLeft (empty gives a pure removal)
    // When the result does not fit in the current capacity, space for
    // 1.5 times the new length plus 128 is requested so that a string
    // which keeps growing is not reallocated on every call
    //
    const int nOldLen = MCD_STRLENGTH(str);
    const int nInsertLen = MCD_STRLENGTH(strInsert);
    const int nResultLen = nInsertLen + nOldLen - nReplace;
    const bool bMustGrow = nResultLen > MCD_STRCAPACITY(str);
#if defined(MCD_STRINSERTREPLACE) // STL, replace method
    if ( bMustGrow )
        MCD_BLDRESERVE( str, (nResultLen + nResultLen/2 + 128) );
    MCD_STRINSERTREPLACE( str, nLeft, nReplace, strInsert );
#else // MFC, no replace method
    // Ask for the padded size only when the buffer has to grow anyway
    const int nRequestLen = bMustGrow ? nResultLen + (nResultLen/2 + 128) : nResultLen;
    MCD_CHAR* pDoc = MCD_GETBUFFER( str, nRequestLen );
    // Shift the text following the replaced span to its final position;
    // nothing to shift if the span reaches the end or the size is unchanged
    const int nTailStart = nLeft + nReplace;
    const int nTailLen = nOldLen - nLeft - nReplace;
    if ( nTailLen > 0 && nInsertLen != nReplace )
        memmove( pDoc + nLeft + nInsertLen, pDoc + nTailStart, nTailLen*sizeof(MCD_CHAR) );
    // Copy the inserted text into the gap
    if ( nInsertLen != 0 )
        memcpy( pDoc + nLeft, strInsert, nInsertLen*sizeof(MCD_CHAR) );
    MCD_RELEASEBUFFER( str, pDoc, nResultLen );
#endif // MFC, no replace method
}
int x_Hash( MCD_PCSZ p, int nSize )
{
    // Additive hash of the null-terminated string p, reduced modulo nSize
    // Each char is converted to unsigned int and summed (unsigned arithmetic,
    // so the sum wraps rather than overflowing)
    // nSize must not be 0 because it is the modulo divisor
    //
    unsigned int nSum = 0;
    for ( ; *p; ++p )
        nSum += (unsigned int)(*p);
    return nSum % nSize;
}
MCD_STR x_IntToStr( int n )
{
    // Return n formatted as a decimal ("%d") string
    // The 25 char stack buffer receives the formatted digits before
    // they are copied into the returned string object
    //
    MCD_CHAR szDigits[25];
    MCD_SPRINTF( MCD_SSZ(szDigits), MCD_T("%d"), n );
    MCD_STR strNumber( szDigits );
    return strNumber;
}
int x_StrNCmp( MCD_PCSZ p1, MCD_PCSZ p2, int n, int bIgnoreCase = 0 )
{
    // Compare the first n chars of p1 and p2, intended as a fast equality test
    // Returns 0 when all n chars match, otherwise the difference between the
    // first pair of differing chars (after case folding when that applies)
    // The loops do not stop at a null terminator; they only end after n chars
    // or at the first mismatch
    //
    if ( ! bIgnoreCase )
    {
        // Exact comparison
        for ( ; n; --n, ++p1, ++p2 )
        {
            if ( *p1 != *p2 )
                return *p1 - *p2;
        }
        return 0;
    }

    // Case-insensitive comparison: only ASCII a-z is folded to upper case
    // Once an identical pair of chars above 127 has been passed, folding is
    // switched off for the rest of the comparison
    // NOTE(review): presumably this avoids folding bytes that belong to a
    // multi-byte character -- confirm
    bool bSeenNonAscii = false;
    for ( ; n; --n )
    {
        MCD_CHAR chLeft = *p1++;
        MCD_CHAR chRight = *p2++;
        if ( chLeft == chRight )
        {
            if ( (unsigned int)chLeft > 127 )
                bSeenNonAscii = true;
            continue;
        }
        if ( ! bSeenNonAscii )
        {
            if ( chLeft >= 'a' && chLeft <= 'z' )
                chLeft = (MCD_CHAR)( chLeft - ('a'-'A') );
            if ( chRight >= 'a' && chRight <= 'z' )
                chRight = (MCD_CHAR)( chRight - ('a'-'A') );
        }
        if ( chLeft != chRight )
            return chLeft - chRight;
    }
    return 0;
}
enum MarkupResultCode
{
    // Bit flags for the nResultCode argument of x_AddResult
    // They select which attribute receives the string value or the number n
    MRC_COUNT    = 1 << 0, // n is stored in the "count" attribute
    MRC_TYPE     = 1 << 1, // string value is stored in the "type" attribute
    MRC_NUMBER   = 1 << 2, // n is stored in the "n" attribute
    MRC_ENCODING = 1 << 3, // string value is stored in the "encoding" attribute
    MRC_LENGTH   = 1 << 4, // n is stored in the "length" attribute
    MRC_MODIFY   = 1 << 5, // find the existing element instead of adding one
    MRC_MSG      = 1 << 6  // string value is stored in the "msg" attribute
};
void x_AddResult( MCD_STR& strResult, MCD_CSTR pszID, MCD_CSTR pszVal = NULL, int nResultCode = 0, int n = -1, int n2 = -1 )
{
    // Record one result item in the markup text held in strResult
    //   strResult    accumulated result document, updated in place
    //   pszID        name of the element to add (or to find with MRC_MODIFY)
    //   pszVal       optional string value; its attribute is chosen by
    //                MRC_TYPE, MRC_ENCODING or MRC_MSG, otherwise "tagname"
    //   nResultCode  combination of MarkupResultCode flags
    //   n            number stored as "n", "count" or "length" depending on
    //                the flags; with none of those flags it is stored as
    //                "offset" unless it is -1
    //   n2           stored as "offset2" unless it is -1
    // Nothing is recorded once strResult has reached 1000 chars, so the
    // result text cannot keep accumulating
    //
    if ( MCD_STRLENGTH(strResult) >= 1000 )
        return;

    // Use a temporary CMarkup object but keep strResult in a string to minimize memory footprint
    CMarkup mResult( strResult );

    // Position on the element that receives the attributes
    if ( (nResultCode & MRC_MODIFY) == 0 )
    {
        mResult.AddElem( pszID, MCD_T(""), CMarkup::MNF_WITHNOLINES );
    }
    else
    {
        mResult.FindElem( pszID );
    }

    // String value: the first matching flag decides the attribute name
    if ( pszVal.pcsz != NULL )
    {
        if ( nResultCode & MRC_TYPE )
        {
            mResult.SetAttrib( MCD_T("type"), pszVal );
        }
        else if ( nResultCode & MRC_ENCODING )
        {
            mResult.SetAttrib( MCD_T("encoding"), pszVal );
        }
        else if ( nResultCode & MRC_MSG )
        {
            mResult.SetAttrib( MCD_T("msg"), pszVal );
        }
        else
        {
            mResult.SetAttrib( MCD_T("tagname"), pszVal );
        }
    }

    // First number: a flagged attribute is written whatever n is,
    // the plain "offset" only when n was supplied
    if ( nResultCode & MRC_NUMBER )
    {
        mResult.SetAttrib( MCD_T("n"), n );
    }
    else if ( nResultCode & MRC_COUNT )
    {
        mResult.SetAttrib( MCD_T("count"), n );
    }
    else if ( nResultCode & MRC_LENGTH )
    {
        mResult.SetAttrib( MCD_T("length"), n );
    }
    else if ( n != -1 )
    {
        mResult.SetAttrib( MCD_T("offset"), n );
    }

    // Second number
    if ( n2 != -1 )
    {
        mResult.SetAttrib( MCD_T("offset2"), n2 );
    }

    // Hand the updated document text back to the caller
    strResult = mResult.GetDoc();
}
//////////////////////////////////////////////////////////////////////
// Encoding conversion struct and methods
//
struct TextEncoding
{
    // Holds the source buffer and encoding names for one text conversion
    // The constructor records the source encoding name, the source buffer
    // and its length, and zeroes both counters; the target encoding name
    // starts out empty
    TextEncoding( MCD_CSTR pszFromEncoding, const void* pFromBuffer, int nFromBufferLen )
        : m_pFrom( pFromBuffer )
        , m_nFromLen( nFromBufferLen )
        , m_nToCount( 0 )
        , m_nFailedChars( 0 )
    {
        m_strFromEncoding = pszFromEncoding;
    }

    // Conversion methods, implemented outside this struct definition
    int PerformConversion( void* pTo, MCD_CSTR pszToEncoding = NULL );
    bool FindRaggedEnd( int& nTruncBeforeBytes );
#if defined(MARKUP_ICONV)
    static const char* IConvName( char* szEncoding, MCD_CSTR pszEncoding );
    int IConv( void* pTo, int nToCharSize, int nFromCharSize );
#endif // ICONV
#if ! defined(MARKUP_WCHAR)
    static bool CanConvert( MCD_CSTR pszToEncoding, MCD_CSTR pszFromEncoding );
#endif // WCHAR

    MCD_STR m_strToEncoding; // target encoding name, not set by the constructor
    MCD_STR m_strFromEncoding; // source encoding name given to the constructor
    const void* m_pFrom; // source buffer, not copied
    int m_nFromLen; // source buffer length as given to the constructor
    int m_nToCount; // starts at 0; presumably the amount of output produced -- confirm
    int m_nFailedChars; // starts at 0; presumably counts chars that could not be converted -- confirm
};
// Encoding names
// This is a precompiled ASCII hash table for speed and minimum memory requirement
// Each entry consists of a 2 digit name length, 5 digit code page, and the encoding name
// Each table slot can have multiple entries, table size 155 was chosen for even distribution
//
MCD_PCSZ EncodingNameTable[155] =
{
MCD_T("0800949ksc_5601"),MCD_T("1920932cseucpkdfmtjapanese0920003x-cp20003"),
MCD_T("1250221_iso-2022-jp0228591l10920004x-cp20004"),
MCD_T("0228592l20920005x-cp20005"),
MCD_T("0228593l30600850ibm8501000858ccsid00858"),
MCD_T("0228594l40600437ibm4370701201ucs-2be0600860ibm860"),
MCD_T("0600852ibm8520501250ms-ee0600861ibm8610228599l50751932cp51932"),
MCD_T("0600862ibm8620620127ibm3670700858cp008581010021x-mac-thai0920261x-cp20261"),
MCD_T("0600737ibm7370500869cp-gr1057003x-iscii-be0600863ibm863"),
MCD_T("0750221ms502210628591ibm8190600855ibm8550600864ibm864"),
MCD_T("0600775ibm7751057002x-iscii-de0300949uhc0228605l91028591iso-ir-1000600865ibm865"),
MCD_T("1028594iso-ir-1101028592iso-ir-1010600866ibm8660500861cp-is0600857ibm857"),
MCD_T("0950227x-cp50227"),
MCD_T("0320866koi1628598csisolatinhebrew1057008x-iscii-ka"),
MCD_T("1000950big5-hkscs1220106x-ia5-german0600869ibm869"),
MCD_T("1057009x-iscii-ma0701200ucs-2le0712001utf32be0920269x-cp20269"),
MCD_T("0800708asmo-7080500437cspc81765000unicode-1-1-utf-70612000utf-320920936x-cp20936"),
MCD_T("1200775ebcdic-cp-be0628598hebrew0701201utf16be1765001unicode-1-1-utf-81765001unicode-2-0-utf-80551932x-euc"),
MCD_T("1028595iso-ir-1441028597iso-ir-1260728605latin-90601200utf-161057011x-iscii-pa"),
MCD_T("1028596iso-ir-1271028593iso-ir-1090751932ms51932"),
MCD_T("0801253ms-greek0