# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-06-06 13:19
# 《自然语言处理入门》3.3 训练
# 配套书籍:http://nlp.hankcs.com/book.php
# 讨论答疑:https://bbs.hankcs.com/
from jpype import JString
from pyhanlp import *
from tests.book.ch03.demo_corpus_loader import my_cws_corpus
from tests.book.ch03.msr import msr_model
from tests.test_utility import test_data_path
# Java classes exposed through pyhanlp's JPype bridge.
# SafeJClass loads eagerly and is thread-safe; LazyLoadingJClass defers
# loading the (large) core dictionary until first use.
NatureDictionaryMaker = SafeJClass('com.hankcs.hanlp.corpus.dictionary.NatureDictionaryMaker')  # builds unigram/bigram dictionaries from a corpus
CorpusLoader = SafeJClass('com.hankcs.hanlp.corpus.document.CorpusLoader')  # reads a tokenized corpus into sentence lists
WordNet = JClass('com.hankcs.hanlp.seg.common.WordNet')  # word lattice over a sentence
Vertex = JClass('com.hankcs.hanlp.seg.common.Vertex')  # a node (candidate word) in the lattice
ViterbiSegment = JClass('com.hankcs.hanlp.seg.Viterbi.ViterbiSegment')  # shortest-path segmenter (Viterbi search)
DijkstraSegment = JClass('com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment')  # shortest-path segmenter (Dijkstra search)
CoreDictionary = LazyLoadingJClass('com.hankcs.hanlp.dictionary.CoreDictionary')  # core unigram dictionary (lazy-loaded)
Nature = JClass('com.hankcs.hanlp.corpus.tag.Nature')  # part-of-speech tag enum
def train_bigram(corpus_path, model_path):
    """Train a bigram language model from a tokenized corpus.

    :param corpus_path: path to the CWS (word-segmented) corpus file
    :param model_path: output path prefix; HanLP writes
        ``model_path + '.txt'`` (unigram counts) and
        ``model_path + '.ngram.txt'`` (bigram counts)
    """
    sents = CorpusLoader.convert2SentenceList(corpus_path)
    # NatureDictionaryMaker expects every word to carry a POS label;
    # fill in a placeholder tag 'n' where the corpus provides none.
    for sent in sents:
        for word in sent:
            if word.label is None:
                word.setLabel("n")
    maker = NatureDictionaryMaker()
    maker.compute(sents)
    maker.saveTxtTo(model_path)  # e.g. tests/data/my_cws_model.txt
def load_bigram(model_path, verbose=True, ret_viterbi=True):
    """Point HanLP at a trained bigram model and return a segmenter.

    :param model_path: path prefix used by :func:`train_bigram`
    :param verbose: if True, print demo frequencies and a sample word net
    :param ret_viterbi: if True return a ``ViterbiSegment``,
        otherwise a ``DijkstraSegment`` (both with NER and the custom
        dictionary disabled)
    :return: a HanLP segmenter backed by the loaded model
    """
    HanLP.Config.CoreDictionaryPath = model_path + ".txt"  # unigram
    HanLP.Config.BiGramDictionaryPath = model_path + ".ngram.txt"  # bigram
    # --- Compatibility shim for the newer tag set; skip if not interested ---
    HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath = model_path + ".tr.txt"  # POS transition matrix; ignorable for segmentation
    if model_path != msr_model:
        # Register every tag from the transition matrix header so that
        # Nature lookups do not fail on tags unknown to HanLP.
        with open(HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath) as src:
            for tag in src.readline().strip().split(',')[1:]:
                Nature.create(tag)
    CoreBiGramTableDictionary = SafeJClass('com.hankcs.hanlp.dictionary.CoreBiGramTableDictionary')
    CoreDictionary.getTermFrequency("商品")  # touch the dictionary to force lazy loading
    # --- End of compatibility shim ---
    if verbose:
        print(CoreDictionary.getTermFrequency("商品"))
        print(CoreBiGramTableDictionary.getBiFrequency("商品", "和"))
        sent = '商品和服务'
        # sent = '货币和服务'
        wordnet = generate_wordnet(sent, CoreDictionary.trie)
        print(wordnet)
        print(viterbi(wordnet))
    return ViterbiSegment().enableAllNamedEntityRecognize(False).enableCustomDictionary(
        False) if ret_viterbi else DijkstraSegment().enableAllNamedEntityRecognize(False).enableCustomDictionary(False)
def generate_wordnet(sent, trie):
    """
    Build the word net (lattice of candidate words) for a sentence.

    :param sent: the sentence to segment
    :param trie: the (unigram) dictionary trie
    :return: the word net
    """
    searcher = trie.getSearcher(JString(sent), 0)
    wordnet = WordNet(sent)
    # Add every dictionary match as a vertex; rows are 1-based
    # (row 0 / last row are the begin/end sentinels).
    while searcher.next():
        wordnet.add(searcher.begin + 1,
                    Vertex(sent[searcher.begin:searcher.begin + searcher.length], searcher.value, searcher.index))
    # Atomic segmentation: fill gaps so the lattice stays connected.
    vertexes = wordnet.getVertexes()
    i = 0
    while i < len(vertexes):
        if len(vertexes[i]) == 0:  # blank row
            j = i + 1
            for j in range(i + 1, len(vertexes) - 1):  # find the first non-blank row j
                if len(vertexes[j]):
                    break
            wordnet.add(i, Vertex.newPunctuationInstance(sent[i - 1: j - 1]))  # fill the blank rows [i, j)
            i = j
        else:
            i += len(vertexes[i][-1].realWord)
    return wordnet
def viterbi(wordnet):
    """Run Viterbi (shortest-path) decoding over a word net.

    :param wordnet: a HanLP ``WordNet`` lattice (see :func:`generate_wordnet`)
    :return: list of words (``realWord``) on the shortest path
    """
    nodes = wordnet.getVertexes()
    # Forward pass: relax every edge, maintaining each node's best
    # predecessor pointer (`from`) via the distance formula.
    for i in range(0, len(nodes) - 1):
        for node in nodes[i]:
            for to in nodes[i + len(node.realWord)]:
                to.updateFrom(node)
    # Backward pass: trace the predecessor pointers from the end node.
    path = []  # the shortest path
    f = nodes[len(nodes) - 1].getFirst()
    while f:
        path.insert(0, f)
        f = f.getFrom()
    return [v.realWord for v in path]
if __name__ == '__main__':
    # Train a toy bigram model on the demo corpus, then load it and
    # segment a sample sentence (load_bigram prints the demo output).
    corpus_path = my_cws_corpus()
    model_path = os.path.join(test_data_path(), 'my_cws_model')
    train_bigram(corpus_path, model_path)
    load_bigram(model_path)
没有合适的资源?快使用搜索试试~ 我知道了~
自然语言处理代码.zip
共80个文件
py:80个
需积分: 0 1 下载量 18 浏览量
2023-04-06
07:50:56
上传
评论
收藏 60KB ZIP 举报
温馨提示
自然语言处理代码.zip
资源推荐
资源详情
资源评论
收起资源包目录
自然语言处理代码.zip (80个子文件)
book
ch06
__init__.py 228B
evaluate_crf_cws.py 1KB
plot_3d_sgd_newton.py 3KB
crfpp_train_hanlp_load.py 2KB
plot_2d_newton.py 2KB
ch08
__init__.py 216B
demo_role_tag_nt.py 1KB
demo_crf_ner.py 733B
demo_hmm_ner.py 2KB
demo_num_eng.py 675B
demo_role_tag_ns.py 1KB
demo_plane.py 1KB
demo_sp_ner.py 1KB
demo_role_tag_nr.py 2KB
ch01
hello_word.py 431B
__init__.py 209B
ch04
__init__.py 234B
hmm_cws.py 1KB
doctor_hmm.py 3KB
ch10
__init__.py 211B
demo_clustering_f.py 561B
demo_text_clustering.py 1KB
ch09
__init__.py 210B
demo_tfidf.py 983B
demo_term_freq.py 905B
demo_extract_word.py 2KB
ch13
__init__.py 231B
sigmoid.py 502B
demo_neual_parser.py 2KB
demo_word2vec.py 3KB
ch12
__init__.py 216B
opinion_mining.py 2KB
demo_train_parser.py 976B
ch11
demo_load_text_classification_corpus.py 1KB
__init__.py 210B
demo_text_classification_evaluation.py 2KB
demo_text_classification.py 281B
demo_svm_text_classification.py 2KB
ch05
plot_corpus_ratio_f1.py 2KB
__init__.py 188B
plot_name.py 2KB
plot_3d_sgd.py 3KB
eval_perceptron_cws.py 1KB
plot_compressed_f1.py 2KB
perceptron_cws.py 1KB
plot_2d_sgd.py 1KB
online_learning.py 1KB
classify_name.py 1KB
ch07
__init__.py 210B
demo_crf_pos.py 1KB
demo_hmm_pos.py 1KB
pku.py 611B
custom_corpus_pos.py 782B
custom_pos.py 533B
demo_perceptron_pos.py 1KB
evaluate_pos.py 1KB
ch02
bidirectional_segment.py 1KB
aho_corasick_double_array_trie.py 736B
zipf_law.py 2KB
speed_benchmark.py 1KB
dat.py 2KB
trie.py 2KB
utility.py 641B
forward_segment.py 1KB
demo_acdat_segment.py 444B
fully_segment.py 844B
demo_dat_segment.py 958B
aho_corasick.py 597B
backward_segment.py 1KB
evaluate_cws.py 3KB
demo_stopwords.py 2KB
ch03
__init__.py 225B
demo_corpus_loader.py 912B
eval_bigram_cws.py 709B
msr.py 759B
adjust_model.py 549B
japanese_segment.py 785B
ngram_segment.py 4KB
demo_custom_dict.py 710B
sighan05_statistics.py 2KB
共 80 条
- 1
资源评论
鹿129
- 粉丝: 0
- 资源: 1
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功