#include "include/Application.hpp"
#include <Windows.h>
#include <iostream>
#include <fstream>
using namespace CppJieba;
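// How the conversions work: Unicode (wide-character) text can be produced from
// text in any supported encoding and can be converted back into any of them,
// so it serves as the intermediary: any encoding <-> Unicode <-> any encoding.
// MultiByteToWideChar converts a string in a given code page to Unicode.
// WideCharToMultiByte converts a Unicode string to a given code page.
// Code page identifiers for these conversions:
//CP_ACP ANSI code page
//CP_MACCP Not supported
//CP_OEMCP OEM code page
//CP_SYMBOL Not supported
//CP_THREAD_ACP Not supported
//CP_UTF7 UTF-7 code page
//CP_UTF8 UTF-8 code page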
/**
 * Converts a string encoded in the system ANSI code page (CP_ACP) to a wide
 * string via MultiByteToWideChar.
 *
 * The whole input is converted, including any embedded NUL characters.
 *
 * @param str Input bytes, interpreted as CP_ACP text.
 * @return The converted wide string; empty when the input is empty or when
 *         MultiByteToWideChar reports failure.
 */
std::wstring AnsitoWideChar(std::string str)
{
    if (str.empty())
    {
        return std::wstring();
    }
    const int srcLen = static_cast<int>(str.size());
    // Probe call: with no output buffer the API only reports the required
    // length, in wide characters.
    const int nLen = MultiByteToWideChar(CP_ACP, 0, str.data(), srcLen, nullptr, 0);
    if (nLen <= 0)
    {
        return std::wstring();
    }
    // Convert straight into the result's own storage: no manual new/delete,
    // and no C-string append that would stop at the first embedded NUL.
    std::wstring wstr(static_cast<std::wstring::size_type>(nLen), L'\0');
    const int written = MultiByteToWideChar(CP_ACP, 0, str.data(), srcLen, &wstr[0], nLen);
    if (written <= 0)
    {
        return std::wstring();
    }
    wstr.resize(static_cast<std::wstring::size_type>(written));
    return wstr;
}
/**
 * Converts a UTF-8 encoded string (CP_UTF8) to a wide string via
 * MultiByteToWideChar.
 *
 * The whole input is converted, including any embedded NUL characters.
 *
 * @param str Input bytes, interpreted as UTF-8 text.
 * @return The converted wide string; empty when the input is empty or when
 *         MultiByteToWideChar reports failure.
 */
std::wstring UTF8toWideChar(std::string str)
{
    if (str.empty())
    {
        return std::wstring();
    }
    const int srcLen = static_cast<int>(str.size());
    // Probe call: with no output buffer the API only reports the required
    // length, in wide characters.
    const int nLen = MultiByteToWideChar(CP_UTF8, 0, str.data(), srcLen, nullptr, 0);
    if (nLen <= 0)
    {
        return std::wstring();
    }
    // Convert straight into the result's own storage: no manual new/delete,
    // and no C-string append that would stop at the first embedded NUL.
    std::wstring wstr(static_cast<std::wstring::size_type>(nLen), L'\0');
    const int written = MultiByteToWideChar(CP_UTF8, 0, str.data(), srcLen, &wstr[0], nLen);
    if (written <= 0)
    {
        return std::wstring();
    }
    wstr.resize(static_cast<std::wstring::size_type>(written));
    return wstr;
}
/**
 * Converts a wide string to the system ANSI code page (CP_ACP) via
 * WideCharToMultiByte.
 *
 * The whole input is converted, including any embedded NUL characters.
 *
 * @param wstr Wide-character input text.
 * @return The converted byte string; empty when the input is empty or when
 *         WideCharToMultiByte reports failure.
 */
std::string WideChartoAnsi(std::wstring wstr)
{
    if (wstr.empty())
    {
        return std::string();
    }
    const int srcLen = static_cast<int>(wstr.size());
    // Probe call: with no output buffer the API only reports the required
    // length, in bytes.
    const int nLen = WideCharToMultiByte(CP_ACP, 0, wstr.data(), srcLen, nullptr, 0, nullptr, nullptr);
    if (nLen <= 0)
    {
        return std::string();
    }
    // Convert straight into the result's own storage: no manual new/delete,
    // and no C-string append that would stop at the first embedded NUL.
    std::string str(static_cast<std::string::size_type>(nLen), '\0');
    const int written = WideCharToMultiByte(CP_ACP, 0, wstr.data(), srcLen, &str[0], nLen, nullptr, nullptr);
    if (written <= 0)
    {
        return std::string();
    }
    str.resize(static_cast<std::string::size_type>(written));
    return str;
}
/**
 * Converts a wide string to UTF-8 (CP_UTF8) via WideCharToMultiByte.
 *
 * The whole input is converted, including any embedded NUL characters.
 *
 * @param wstr Wide-character input text.
 * @return The UTF-8 encoded byte string; empty when the input is empty or
 *         when WideCharToMultiByte reports failure.
 */
std::string WideChartoUTF8(std::wstring wstr)
{
    if (wstr.empty())
    {
        return std::string();
    }
    const int srcLen = static_cast<int>(wstr.size());
    // Probe call: with no output buffer the API only reports the required
    // length, in bytes.
    const int nLen = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), srcLen, nullptr, 0, nullptr, nullptr);
    if (nLen <= 0)
    {
        return std::string();
    }
    // Convert straight into the result's own storage: no manual new/delete,
    // and no C-string append that would stop at the first embedded NUL.
    std::string str(static_cast<std::string::size_type>(nLen), '\0');
    const int written = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), srcLen, &str[0], nLen, nullptr, nullptr);
    if (written <= 0)
    {
        return std::string();
    }
    str.resize(static_cast<std::string::size_type>(written));
    return str;
}
/**
 * Re-encodes an ANSI code page string as UTF-8 by going through a wide
 * string: AnsitoWideChar followed by WideChartoUTF8.
 *
 * @param str Text in the ANSI code page.
 * @return The result of WideChartoUTF8 applied to the widened text.
 */
std::string stringToUTF8string(std::string str)
{
    return WideChartoUTF8(AnsitoWideChar(str));
}
/**
 * Re-encodes a UTF-8 string in the ANSI code page by going through a wide
 * string: UTF8toWideChar followed by WideChartoAnsi.
 *
 * @param str UTF-8 encoded text.
 * @return The result of WideChartoAnsi applied to the widened text.
 */
std::string UTF8Tostring(std::string str)
{
    return WideChartoAnsi(UTF8toWideChar(str));
}
/**
 * Replaces every element of `words` with the result of UTF8Tostring on it.
 *
 * @param words Strings to convert in place.
 */
void WordToAnsi(vector<string> &words)
{
    for (string &word : words)
    {
        word = UTF8Tostring(word);
    }
}
/**
 * Replaces both members of every pair in `p` with the result of
 * UTF8Tostring on them.
 *
 * @param p Pairs of strings to convert in place.
 */
void PairToAnsi(vector<pair<string, string> > &p)
{
    for (pair<string, string> &entry : p)
    {
        entry.first = UTF8Tostring(entry.first);
        entry.second = UTF8Tostring(entry.second);
    }
}
/**
 * Replaces the string member of every pair in `p` with the result of
 * UTF8Tostring on it; the double member is left untouched.
 *
 * @param p (string, double) pairs whose strings are converted in place.
 */
void PairToAnsi(vector<pair<string, double> > &p)
{
    for (pair<string, double> &entry : p)
    {
        entry.first = UTF8Tostring(entry.first);
    }
}
/**
 * Reads the entire contents of a file into a string.
 *
 * The file is opened with std::ifstream's default open mode. If it cannot be
 * opened, a message is written to std::cerr and an empty string is returned.
 *
 * @param filename Path of the file to read.
 * @return The file's contents, or an empty string if the file could not be
 *         opened.
 */
std::string readFileToString(const std::string& filename) {
    std::ifstream input(filename);
    std::string text;
    if (input.is_open()) {
        using CharIter = std::istreambuf_iterator<char>;
        // Pull every character from the stream buffer until end of stream.
        text.assign(CharIter(input), CharIter());
        input.close();
    } else {
        std::cerr << "Failed to open file: " << filename << std::endl;
    }
    return text;
}
/**
 * Demo driver for CppJieba::Application.
 *
 * Builds a sample text from string literals, converts it with
 * stringToUTF8string (which interprets the bytes as ANSI code page text),
 * then runs app.cut with each METHOD_* constant, app.tag and app.extract.
 * Results are converted back with WordToAnsi / PairToAnsi / UTF8Tostring
 * before being printed to the console. The METHOD_QUERY result is also
 * written to "output.txt".
 *
 * @param argc Unused.
 * @param argv Unused.
 * @return EXIT_SUCCESS.
 */
int main(int argc, char** argv) {
    CppJieba::Application app("./dict/jieba.dict.utf8",
                              "./dict/hmm_model.utf8",
                              "./dict/user.dict.utf8",
                              "./dict/idf.utf8",
                              "./dict/stop_words.utf8");

    // Sample text: a leading fragment plus the paragraphs below, appended in order.
    string s = "35当世界年纪还小的时候";
    const char* const paragraphs[] = {
        "当世界年纪还小的时候,每样东西都必须学习怎么生活。",
        "太阳开始学发光,学着怎么上山下山。它也试过做别的事,但是都没有成功。譬如说唱歌,它粗糙的声音,把这个敏感的新世界吓坏了。",
        "月亮不知道自己该学些什么。学发光吗?白天它觉得这主意不好,晚上它又觉得这主意不错。它一直无法决定,只好反反复复,一阵子这样,一阵子那样,所以看起来有时圆有时缺。它学会的是不断变化。",
        "水开始学习流动。它很快就学会了,因为只有一种方式,那就是:一直往低处流,往低处流,往低处流……",
        "那时候,生活就是这么简单。每样东西只要弄明白自己做什么最容易就行了。",
        "世界在慢慢变化,万物在自由生长。雨从云里落下,滴进泥土里;人睁开眼睛,就可以看到一切有多美好……只要万物都做它最容易做的事,这世界就很有秩序了。",
        "这世界还相当有秩序……",
        "哦!不要往下讲了,最好再从头开始。这个故事没有结局,却有很多开头,很多很多开头。",
        "很久很久以前,当世界年纪还小的时候……"
    };
    for (const char* paragraph : paragraphs) {
        s.append(paragraph);
    }
    // Alternative input source, disabled:
    //s = readFileToString("123.txt");
    s = stringToUTF8string(s);

    vector<string> words;

    // Prints the banner, runs app.cut with the given method, and converts
    // the resulting words with WordToAnsi.
    auto runCut = [&](const char* banner, decltype(METHOD_MP) method) {
        cout << banner << endl;
        app.cut(s, words, method);
        WordToAnsi(words);
    };
    // Writes the current words, joined by "/", to the console.
    auto printWords = [&]() {
        cout << join(words.begin(), words.end(), "/") << endl;
    };

    runCut("\nMETHOD_MP", METHOD_MP);
    printWords();

    runCut("\nMETHOD_HMM", METHOD_HMM);
    printWords();

    runCut("\nMETHOD_MIX", METHOD_MIX);
    printWords();

    runCut("\nMETHOD_FULL", METHOD_FULL);
    printWords();

    // The METHOD_QUERY result goes to output.txt as well as to the console.
    runCut("\nMETHOD_QUERY", METHOD_QUERY);
    std::ofstream outputFile("output.txt");
    if (!outputFile.is_open()) {
        std::cout << "无法打开文件。" << std::endl;
    } else {
        outputFile << join(words.begin(), words.end(), "/") << endl;
        outputFile.close();
    }
    printWords();

    cout << "\nTAGGING" << endl;
    vector<pair<string, string> > tagres;
    app.tag(s, tagres);
    cout << UTF8Tostring(s) << endl << endl;
    PairToAnsi(tagres);
    cout << tagres << endl;

    cout << "\nKEYWORD" << endl;
    vector<pair<string, double> > keywordres;
    app.extract(s, keywordres, 5);
    cout << UTF8Tostring(s) << endl << endl;
    PairToAnsi(keywordres);
    cout << keywordres << endl;

    return EXIT_SUCCESS;
}
xiaoyaodefengliu
- 粉丝: 0
- 资源: 6
最新资源
- 基于java的在线动漫周边店的设计与实现
- 生成式 AI 爆发:医疗 AI 走到十字路口
- 基于Matlab实现三相电压型PWM逆变电路仿真模型.rar
- LVBench: An Extreme Long Video Understanding Benchmark
- 基于javaweb的在线投票系统论文.doc
- 在digital电路中,用两个或非门实现一个锁存器
- 基于web的在线心理咨询系统的设计与实现论文.doc
- 圣诞节代码html飘雪花 代码实现示例.docx
- 基于java的足球直播论坛的设计与实现.doc
- Autoregressive Image Generation without Vector Quantization
- 基于web的中小企业信息管理系统
- 2024中国数字经济企业出海报告
- EFC-main.zip
- 基于Python的招聘数据采集分析平台的设计与实现.doc
- MDPO: Conditional Preference Optimization for Multimodal Large Language Models
- 使用C语言将二进制转为Verilog可识别的hex文件(如jpeg文件转mif文件)
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈