///////////////////////////////////////////////////////////////////////
// File: unicharset.h
// Description: Unicode character/ligature set class.
// Author: Thomas Kielbus
// Created: Wed Jun 28 17:05:01 PDT 2006
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCUTIL_UNICHARSET_H__
#define TESSERACT_CCUTIL_UNICHARSET_H__
#include "errcode.h"
#include "helpers.h"
#include "strngs.h"
#include "tesscallback.h"
#include "unichar.h"
#include "unicharmap.h"
class CHAR_FRAGMENT {
public:
// Minimum number of characters used for fragment representation.
static const int kMinLen = 6;
// Maximum number of characters used for fragment representation.
static const int kMaxLen = 3 + UNICHAR_LEN + 2;
// Maximum number of fragments per character.
static const int kMaxChunks = 5;
// Setters and Getters.
inline void set_all(const char *unichar, int pos, int total, bool natural) {
set_unichar(unichar);
set_pos(pos);
set_total(total);
set_natural(natural);
}
inline void set_unichar(const char *uch) {
strncpy(this->unichar, uch, UNICHAR_LEN);
this->unichar[UNICHAR_LEN] = '\0';
}
inline void set_pos(int p) { this->pos = p; }
inline void set_total(int t) { this->total = t; }
inline const char* get_unichar() const { return this->unichar; }
inline int get_pos() const { return this->pos; }
inline int get_total() const { return this->total; }
// Returns the string that represents a fragment
// with the given unichar, pos and total.
static STRING to_string(const char *unichar, int pos, int total,
bool natural);
// Returns the string that represents this fragment.
STRING to_string() const {
return to_string(unichar, pos, total, natural);
}
// Checks whether a fragment has the same unichar,
// position and total as the given inputs.
inline bool equals(const char *other_unichar,
int other_pos, int other_total) const {
return (strcmp(this->unichar, other_unichar) == 0 &&
this->pos == other_pos && this->total == other_total);
}
inline bool equals(const CHAR_FRAGMENT *other) const {
return this->equals(other->get_unichar(),
other->get_pos(),
other->get_total());
}
// Checks whether a given fragment is a continuation of this fragment.
// Assumes that the given fragment pointer is not NULL.
inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
this->total == fragment->get_total() &&
this->pos == fragment->get_pos() + 1);
}
// Returns true if this fragment is a beginning fragment.
inline bool is_beginning() const { return this->pos == 0; }
// Returns true if this fragment is an ending fragment.
inline bool is_ending() const { return this->pos == this->total-1; }
// Returns true if the fragment was a separate component to begin with,
// ie did not need chopping to be isolated, but may have been separated
// out from a multi-outline blob.
inline bool is_natural() const { return natural; }
void set_natural(bool value) { natural = value; }
// Parses the string to see whether it represents a character fragment
// (rather than a regular character). If so, allocates memory for a new
// CHAR_FRAGMENT instance and fills it in with the corresponding fragment
// information. Fragments are of the form:
// |m|1|2, meaning chunk 1 of 2 of character m, or
// |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
// to divide the parts, as they were already separate connected components.
//
// If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
// instance, otherwise (if the string does not represent a fragment or it
// looks like it does, but parsing it as a fragment fails) returns NULL.
//
// Note: The caller is responsible for deallocating memory
// associated with the returned pointer.
static CHAR_FRAGMENT *parse_from_string(const char *str);
private:
char unichar[UNICHAR_LEN + 1];
// True if the fragment was a separate component to begin with,
// ie did not need chopping to be isolated, but may have been separated
// out from a multi-outline blob.
bool natural;
inT16 pos; // fragment position in the character
inT16 total; // total number of fragments in the character
};
// The UNICHARSET class is a utility class for Tesseract that holds the
// set of characters that are used by the engine. Each character is identified
// by a unique number, from 0 to (size - 1).
class UNICHARSET {
public:
// Custom list of characters and their ligature forms (UTF8)
// These map to unicode values in the private use area (PUC) and are supported
// by only a few font families (e.g. Wyld, Adobe Caslon Pro).
static const char* kCustomLigatures[][2];
// ICU 2.0 UCharDirection enum (from third_party/icu/include/unicode/uchar.h)
enum Direction {
U_LEFT_TO_RIGHT = 0,
U_RIGHT_TO_LEFT = 1,
U_EUROPEAN_NUMBER = 2,
U_EUROPEAN_NUMBER_SEPARATOR = 3,
U_EUROPEAN_NUMBER_TERMINATOR = 4,
U_ARABIC_NUMBER = 5,
U_COMMON_NUMBER_SEPARATOR = 6,
U_BLOCK_SEPARATOR = 7,
U_SEGMENT_SEPARATOR = 8,
U_WHITE_SPACE_NEUTRAL = 9,
U_OTHER_NEUTRAL = 10,
U_LEFT_TO_RIGHT_EMBEDDING = 11,
U_LEFT_TO_RIGHT_OVERRIDE = 12,
U_RIGHT_TO_LEFT_ARABIC = 13,
U_RIGHT_TO_LEFT_EMBEDDING = 14,
U_RIGHT_TO_LEFT_OVERRIDE = 15,
U_POP_DIRECTIONAL_FORMAT = 16,
U_DIR_NON_SPACING_MARK = 17,
U_BOUNDARY_NEUTRAL = 18,
U_CHAR_DIRECTION_COUNT
};
// Create an empty UNICHARSET
UNICHARSET();
~UNICHARSET();
// Return the UNICHAR_ID of a given unichar representation within the
// UNICHARSET.
const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
// Return the UNICHAR_ID of a given unichar representation within the
// UNICHARSET. Only the first length characters from unichar_repr are used.
const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
int length) const;
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
// while leaving a legal UNICHAR_ID afterwards. In other words, if there
// is both a short and a long match to the string, return the length that
// ensures there is a legal match after it.
int step(const char* str) const;
// Return whether the given UTF-8 string is encodable with this UNICHARSET.
// If not encodable, write the first byte offset which cannot be converted
// into the second (return) argument.
bool encodable_string(const char *str, int *first_bad_position) const;
// Return the unichar representation corresponding to the given UNICHAR_ID
// within the UNICHARSET.
const char* const id_to_unichar(UNICHAR_ID id) const;
// Return the UTF8 representation corresponding to the given UNICHAR_ID after
// resolving any private encodings internal to Tesseract. This method is
// preferable to id_to_unichar for outputting text that will be visible to
// external applications.
const char* const id_to_unichar
没有合适的资源?快使用搜索试试~ 我知道了~
tesseract-3.02.02-win32-lib-include-dirs.zip
共33个文件
h:24个
lib:4个
vsprops:2个
需积分: 9 38 下载量 84 浏览量
2017-10-17
09:57:41
上传
评论
收藏 28MB ZIP 举报
温馨提示
tesseract-3.02.02-win32-lib-include-dirs.zip win32 SDK
资源推荐
资源详情
资源评论
收起资源包目录
tesseract-3.02.02-win32-lib-include-dirs.zip (33个子文件)
tesseract-3.02.02-win32-lib-include-dirs
include
tesseract_versionnumbers.vsprops 433B
tesseract
baseapi.h 30KB
fileerr.h 1KB
helpers.h 5KB
unicharset.h 35KB
unicharmap.h 3KB
ndminx.h 1KB
apitypes.h 1KB
params.h 10KB
capi.h 16KB
genericvector.h 25KB
platform.h 2KB
basedir.h 1KB
resultiterator.h 9KB
publictypes.h 11KB
tesscallback.h 32KB
serialis.h 2KB
errcode.h 3KB
ltrresultiterator.h 9KB
strngs.h 6KB
unichar.h 3KB
thresholder.h 8KB
memry.h 2KB
host.h 6KB
pageiterator.h 13KB
leptonica_versionnumbers.vsprops 660B
lib
libtesseract302-static.lib 14MB
libtesseract302d.dll 4.08MB
libtesseract302.lib 111KB
libtesseract302.exp 65KB
libtesseract302.dll 1.5MB
libtesseract302d.lib 111KB
libtesseract302-static-debug.lib 84.78MB
共 33 条
- 1
资源评论
_静以修身
- 粉丝: 7
- 资源: 3
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功