#ifndef Py_UNICODEOBJECT_H
#define Py_UNICODEOBJECT_H
#include <stdarg.h>
/*
Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Unicode Integration Proposal. (See
http://www.egenix.com/files/python/unicode-proposal.txt).
Copyright (c) Corporation for National Research Initiatives.
Original header:
--------------------------------------------------------------------
* Yet another Unicode string type for Python. This type supports the
* 16-bit Basic Multilingual Plane (BMP) only.
*
* Written by Fredrik Lundh, January 1999.
*
* Copyright (c) 1999 by Secret Labs AB.
* Copyright (c) 1999 by Fredrik Lundh.
*
* fredrik@pythonware.com
* http://www.pythonware.com
*
* --------------------------------------------------------------------
* This Unicode String Type is
*
* Copyright (c) 1999 by Secret Labs AB
* Copyright (c) 1999 by Fredrik Lundh
*
* By obtaining, using, and/or copying this software and/or its
* associated documentation, you agree that you have read, understood,
* and will comply with the following terms and conditions:
*
* Permission to use, copy, modify, and distribute this software and its
* associated documentation for any purpose and without fee is hereby
* granted, provided that the above copyright notice appears in all
* copies, and that both that copyright notice and this permission notice
* appear in supporting documentation, and that the name of Secret Labs
* AB or the author not be used in advertising or publicity pertaining to
* distribution of the software without specific, written prior
* permission.
*
* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
* -------------------------------------------------------------------- */
#include <ctype.h>
/* === Internal API ======================================================= */
/* --- Internal Unicode Format -------------------------------------------- */
/* Python 3.x requires unicode */
#define Py_USING_UNICODE
#ifndef SIZEOF_WCHAR_T
#error Must define SIZEOF_WCHAR_T
#endif
#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
Otherwise, Unicode strings are stored as UCS-2 (with limited support
for UTF-16) */
#if Py_UNICODE_SIZE >= 4
#define Py_UNICODE_WIDE
#endif
/* Set these flags if the platform has "wchar.h" and the
wchar_t type is a 16-bit unsigned type */
/* #define HAVE_WCHAR_H */
/* #define HAVE_USABLE_WCHAR_T */
/* Py_UNICODE was the native Unicode storage format (code unit) used by
Python and represents a single Unicode element in the Unicode type.
With PEP 393, Py_UNICODE is deprecated and replaced with a
typedef to wchar_t. */
#ifndef Py_LIMITED_API
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
#endif
/* If the compiler provides a wchar_t type we try to support it
through the interface functions PyUnicode_FromWideChar(),
PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
# define HAVE_WCHAR_H
# endif
#endif
#ifdef HAVE_WCHAR_H
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
# ifdef _HAVE_BSDI
# include <time.h>
# endif
# include <wchar.h>
#endif
/* Py_UCS4 and Py_UCS2 are typedefs for the respective
unicode representations. */
typedef uint32_t Py_UCS4;
typedef uint16_t Py_UCS2;
typedef uint8_t Py_UCS1;
/* --- Internal Unicode Operations ---------------------------------------- */
/* Since splitting on whitespace is an important use case, and
whitespace in most situations is solely ASCII whitespace, we
optimize for the common case by using a quick look-up table
_Py_ascii_whitespace (see below) with an inlined check.
*/
#ifndef Py_LIMITED_API
#define Py_UNICODE_ISSPACE(ch) \
((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
#define Py_UNICODE_ISALNUM(ch) \
(Py_UNICODE_ISALPHA(ch) || \
Py_UNICODE_ISDECIMAL(ch) || \
Py_UNICODE_ISDIGIT(ch) || \
Py_UNICODE_ISNUMERIC(ch))
#define Py_UNICODE_COPY(target, source, length) \
memcpy((target), (source), (length)*sizeof(Py_UNICODE))
#define Py_UNICODE_FILL(target, value, length) \
do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\
} while (0)
/* macros to work with surrogates */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low) \
(((((Py_UCS4)(high) & 0x03FF) << 10) | \
((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800 */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
/* Check if substring matches at given offset. The offset must be
valid, and the substring must not be empty. */
#define Py_UNICODE_MATCH(string, offset, substring) \
((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
!memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
#endif /* Py_LIMITED_API */
#ifdef __cplusplus
extern "C" {
#endif
/* --- Unicode Type ------------------------------------------------------- */
#ifndef Py_LIMITED_API
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
structure. state.ascii and state.compact are set, and the data
immediately follow the structure. utf8_length and wstr_length can be found
in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
/* There are 4 forms of Unicode strings:
- compact ascii:
* structure = PyASCIIObject
* test: PyUnicode_IS_COMPACT_ASCII(op)
* kind = PyUnicode_1BYTE_KIND
* compact = 1
* ascii = 1
* ready = 1
* (length is the length of the utf8 and wstr strings)
* (data starts just after the structure)
* (since ASCII is decoded from UTF-8, the utf8 string are the data)
- compact:
* structure = PyCompactUnicodeObject
* test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
* kind = PyUni
评论0