# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-06-06 13:19
# 《自然语言处理入门》3.3 训练
# 配套书籍:http://nlp.hankcs.com/book.php
# 讨论答疑:https://bbs.hankcs.com/
from jpype import JString
from pyhanlp import *
from tests.book.ch03.demo_corpus_loader import my_cws_corpus
from tests.book.ch03.msr import msr_model
from tests.test_utility import test_data_path
# Java classes exposed through pyhanlp's JPype bridge.
# SafeJClass loads eagerly and is thread-safe; LazyLoadingJClass defers
# loading the (large) core dictionary until first use.
NatureDictionaryMaker = SafeJClass('com.hankcs.hanlp.corpus.dictionary.NatureDictionaryMaker')  # builds unigram/bigram dictionaries from a corpus
CorpusLoader = SafeJClass('com.hankcs.hanlp.corpus.document.CorpusLoader')  # reads a tokenized corpus into sentence lists
WordNet = JClass('com.hankcs.hanlp.seg.common.WordNet')  # word lattice over a sentence
Vertex = JClass('com.hankcs.hanlp.seg.common.Vertex')  # a node (candidate word) in the lattice
ViterbiSegment = JClass('com.hankcs.hanlp.seg.Viterbi.ViterbiSegment')  # shortest-path segmenter (Viterbi search)
DijkstraSegment = JClass('com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment')  # shortest-path segmenter (Dijkstra search)
CoreDictionary = LazyLoadingJClass('com.hankcs.hanlp.dictionary.CoreDictionary')  # core unigram dictionary (lazy-loaded)
Nature = JClass('com.hankcs.hanlp.corpus.tag.Nature')  # part-of-speech tag enum
def train_bigram(corpus_path, model_path):
    """Train a bigram language model from a tokenized corpus.

    :param corpus_path: path to the CWS (word-segmented) corpus file
    :param model_path: output path prefix; HanLP writes
        ``model_path + '.txt'`` (unigram counts) and
        ``model_path + '.ngram.txt'`` (bigram counts)
    """
    sents = CorpusLoader.convert2SentenceList(corpus_path)
    # NatureDictionaryMaker expects every word to carry a POS label;
    # fill in a placeholder tag 'n' where the corpus provides none.
    for sent in sents:
        for word in sent:
            if word.label is None:
                word.setLabel("n")
    maker = NatureDictionaryMaker()
    maker.compute(sents)
    maker.saveTxtTo(model_path)  # e.g. tests/data/my_cws_model.txt
def load_bigram(model_path, verbose=True, ret_viterbi=True):
    """Point HanLP at a trained bigram model and return a segmenter.

    :param model_path: path prefix used by :func:`train_bigram`
    :param verbose: if True, print demo frequencies and a sample word net
    :param ret_viterbi: if True return a ``ViterbiSegment``,
        otherwise a ``DijkstraSegment`` (both with NER and the custom
        dictionary disabled)
    :return: a HanLP segmenter backed by the loaded model
    """
    HanLP.Config.CoreDictionaryPath = model_path + ".txt"  # unigram
    HanLP.Config.BiGramDictionaryPath = model_path + ".ngram.txt"  # bigram
    # --- Compatibility shim for the newer tag set; skip if not interested ---
    HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath = model_path + ".tr.txt"  # POS transition matrix; ignorable for segmentation
    if model_path != msr_model:
        # Register every tag from the transition matrix header so that
        # Nature lookups do not fail on tags unknown to HanLP.
        with open(HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath) as src:
            for tag in src.readline().strip().split(',')[1:]:
                Nature.create(tag)
    CoreBiGramTableDictionary = SafeJClass('com.hankcs.hanlp.dictionary.CoreBiGramTableDictionary')
    CoreDictionary.getTermFrequency("商品")  # touch the dictionary to force lazy loading
    # --- End of compatibility shim ---
    if verbose:
        print(CoreDictionary.getTermFrequency("商品"))
        print(CoreBiGramTableDictionary.getBiFrequency("商品", "和"))
        sent = '商品和服务'
        # sent = '货币和服务'
        wordnet = generate_wordnet(sent, CoreDictionary.trie)
        print(wordnet)
        print(viterbi(wordnet))
    return ViterbiSegment().enableAllNamedEntityRecognize(False).enableCustomDictionary(
        False) if ret_viterbi else DijkstraSegment().enableAllNamedEntityRecognize(False).enableCustomDictionary(False)
def generate_wordnet(sent, trie):
    """
    Build the word net (lattice of candidate words) for a sentence.

    :param sent: the sentence to segment
    :param trie: the (unigram) dictionary trie
    :return: the word net
    """
    searcher = trie.getSearcher(JString(sent), 0)
    wordnet = WordNet(sent)
    # Add every dictionary match as a vertex; rows are 1-based
    # (row 0 / last row are the begin/end sentinels).
    while searcher.next():
        wordnet.add(searcher.begin + 1,
                    Vertex(sent[searcher.begin:searcher.begin + searcher.length], searcher.value, searcher.index))
    # Atomic segmentation: fill gaps so the lattice stays connected.
    vertexes = wordnet.getVertexes()
    i = 0
    while i < len(vertexes):
        if len(vertexes[i]) == 0:  # blank row
            j = i + 1
            for j in range(i + 1, len(vertexes) - 1):  # find the first non-blank row j
                if len(vertexes[j]):
                    break
            wordnet.add(i, Vertex.newPunctuationInstance(sent[i - 1: j - 1]))  # fill the blank rows [i, j)
            i = j
        else:
            i += len(vertexes[i][-1].realWord)
    return wordnet
def viterbi(wordnet):
    """Run Viterbi (shortest-path) decoding over a word net.

    :param wordnet: a HanLP ``WordNet`` lattice (see :func:`generate_wordnet`)
    :return: list of words (``realWord``) on the shortest path
    """
    nodes = wordnet.getVertexes()
    # Forward pass: relax every edge, maintaining each node's best
    # predecessor pointer (`from`) via the distance formula.
    for i in range(0, len(nodes) - 1):
        for node in nodes[i]:
            for to in nodes[i + len(node.realWord)]:
                to.updateFrom(node)
    # Backward pass: trace the predecessor pointers from the end node.
    path = []  # the shortest path
    f = nodes[len(nodes) - 1].getFirst()
    while f:
        path.insert(0, f)
        f = f.getFrom()
    return [v.realWord for v in path]
if __name__ == '__main__':
    # Train a toy bigram model on the demo corpus, then load it and
    # segment a sample sentence (load_bigram prints the demo output).
    corpus_path = my_cws_corpus()
    model_path = os.path.join(test_data_path(), 'my_cws_model')
    train_bigram(corpus_path, model_path)
    load_bigram(model_path)
没有合适的资源?快使用搜索试试~ 我知道了~
自然语言处理代码.zip
共80个文件
py:80个
需积分: 0 1 下载量 18 浏览量
2023-04-06
07:50:56
上传
评论
收藏 60KB ZIP 举报
温馨提示
自然语言处理代码.zip
资源推荐
资源详情
资源评论
收起资源包目录
自然语言处理代码.zip (80个子文件)
book
ch06
__init__.py 228B
evaluate_crf_cws.py 1KB
plot_3d_sgd_newton.py 3KB
crfpp_train_hanlp_load.py 2KB
plot_2d_newton.py 2KB
ch08
__init__.py 216B
demo_role_tag_nt.py 1KB
demo_crf_ner.py 733B
demo_hmm_ner.py 2KB
demo_num_eng.py 675B
demo_role_tag_ns.py 1KB
demo_plane.py 1KB
demo_sp_ner.py 1KB
demo_role_tag_nr.py 2KB
ch01
hello_word.py 431B
__init__.py 209B
ch04
__init__.py 234B
hmm_cws.py 1KB
doctor_hmm.py 3KB
ch10
__init__.py 211B
demo_clustering_f.py 561B
demo_text_clustering.py 1KB
ch09
__init__.py 210B
demo_tfidf.py 983B
demo_term_freq.py 905B
demo_extract_word.py 2KB
ch13
__init__.py 231B
sigmoid.py 502B
demo_neual_parser.py 2KB
demo_word2vec.py 3KB
ch12
__init__.py 216B
opinion_mining.py 2KB
demo_train_parser.py 976B
ch11
demo_load_text_classification_corpus.py 1KB
__init__.py 210B
demo_text_classification_evaluation.py 2KB
demo_text_classification.py 281B
demo_svm_text_classification.py 2KB
ch05
plot_corpus_ratio_f1.py 2KB
__init__.py 188B
plot_name.py 2KB
plot_3d_sgd.py 3KB
eval_perceptron_cws.py 1KB
plot_compressed_f1.py 2KB
perceptron_cws.py 1KB
plot_2d_sgd.py 1KB
online_learning.py 1KB
classify_name.py 1KB
ch07
__init__.py 210B
demo_crf_pos.py 1KB
demo_hmm_pos.py 1KB
pku.py 611B
custom_corpus_pos.py 782B
custom_pos.py 533B
demo_perceptron_pos.py 1KB
evaluate_pos.py 1KB
ch02
bidirectional_segment.py 1KB
aho_corasick_double_array_trie.py 736B
zipf_law.py 2KB
speed_benchmark.py 1KB
dat.py 2KB
trie.py 2KB
utility.py 641B
forward_segment.py 1KB
demo_acdat_segment.py 444B
fully_segment.py 844B
demo_dat_segment.py 958B
aho_corasick.py 597B
backward_segment.py 1KB
evaluate_cws.py 3KB
demo_stopwords.py 2KB
ch03
__init__.py 225B
demo_corpus_loader.py 912B
eval_bigram_cws.py 709B
msr.py 759B
adjust_model.py 549B
japanese_segment.py 785B
ngram_segment.py 4KB
demo_custom_dict.py 710B
sighan05_statistics.py 2KB
共 80 条
- 1
资源评论
鹿129
- 粉丝: 0
- 资源: 1
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功