【免费】CRF条件随机场-中文分词_crf条件随机场资源-CSDN文库

共19个文件

utf8：6个

xlsx：5个

xml：4个

NLP

需积分：0 · 115 浏览量 · 2022-12-29 12:09:19 上传 · 评论 · 收藏 · 15.97MB · RAR · 举报

资源推荐

资源详情

资源评论

收起资源包目录

CRF随机条件场进行中文文本分词.rar （19个子文件）

CRF随机条件场进行中文文本分词

dataset

球鞋.xlsx 841KB

手机.xlsx 802KB

dataset1

template.utf8 354B

train.utf8 6.03MB

labels.utf8 7B

茅台.xlsx 912KB

图书（局外人）.xlsx 787KB

dataset2

template.utf8 354B

train.utf8 9.96MB

labels.utf8 7B

电脑桌.xlsx 874KB

.idea

workspace.xml 8KB

misc.xml 198B

CRF随机条件场进行中文文本分词.iml 334B

inspectionProfiles

profiles_settings.xml 174B

modules.xml 343B

.gitignore 184B

savemodel

crf.model 43.72MB

model

myCRF.py 17KB

import torch
import pandas as pd
import re

count = 0


def excel_one_line_to_list():
    """Return one column of ``../dataset/手机.xlsx`` as a flat list.

    Only the column at index 3 is read (the original author's note calls it
    the item-name column and says the column name is not wanted).

    :return: list holding the single cell value of every data row
    """
    df = pd.read_excel(r'../dataset/手机.xlsx', usecols=[3], names=None)
    # Every row comes back as a one-element list; unwrap it.
    return [row[0] for row in df.values.tolist()]


class myCRF:
    """Template-driven sequence tagger for Chinese word segmentation.

    Characters are tagged with B / I / E / S (see ``num2Tag``).  Feature
    scores live in ``scoreMap`` and are keyed by the strings produced by
    ``makeKey``.  Decoding relies on a ``Viterbi`` method that is not part
    of the visible source.
    """

    def __init__(self):
        self.scoreMap = {}  # feature key -> score
        self.UnigramTemplates = []  # state feature templates (lists of offsets)
        self.BigramTemplates = []  # transition feature templates (lists of offsets)
        self.readTemplate()  # load the feature templates from disk

    @staticmethod
    def _parseTemplateLine(line):
        """Extract the character offset(s) encoded in one template line.

        The offset is the integer between the last ``[`` and the following
        ``,``.  A line containing ``/`` (not at column 0) yields two offsets:
        one from the text before the first ``/`` and one from the text after
        the last ``/``.  Presumably lines look like ``U05:%x[-1,0]/%x[0,0]``
        (CRF++ style) -- TODO confirm against the real template file.

        :param line: raw template line
        :return: list with one or two integer offsets
        :raises ValueError: if no integer can be parsed from the line
        """
        def first_offset(fragment):
            return int(fragment.split("[")[-1].split(",")[0])

        if line.find("/") > 0:
            pieces = line.split("/")
            return [first_offset(pieces[0]), first_offset(pieces[-1])]
        return [first_offset(line)]

    def readTemplate(self, debug=False, path="../dataset/dataset2/template.utf8"):
        """Load the unigram and bigram feature templates.

        Lines mentioning ``Unigram`` or ``Bigram`` are treated as section
        headers and skipped.  Templates read before the first blank line are
        appended to ``UnigramTemplates``; everything after it is appended to
        ``BigramTemplates``.  Blank lines inside the bigram section (for
        example a trailing newline at the end of the file) are ignored
        instead of aborting with ``ValueError``.

        :param debug: when truthy, print both template lists after loading
        :param path: template file to read (defaults to the project file)
        :return: None
        """
        in_bigram_section = False  # unigram templates come first, then bigram
        # The context manager guarantees the file handle is released.
        with open(path, encoding='utf-8') as tempFile:
            for line in tempFile:
                if "Unigram" in line or "Bigram" in line:
                    continue  # section header, carries no template
                if not line.strip():
                    # The first blank line separates the two sections;
                    # later blank lines are harmless.
                    in_bigram_section = True
                    continue
                offsets = self._parseTemplateLine(line)
                if in_bigram_section:
                    self.BigramTemplates.append(offsets)
                else:
                    self.UnigramTemplates.append(offsets)
        if debug:
            print(self.UnigramTemplates)
            print(self.BigramTemplates)

    def getTrainData(self, path='../dataset/dataset2/train.utf8'):
        """Read the training corpus as parallel sentence / tag strings.

        Each non-blank line is split on a single space: the first field is
        appended to the current sentence, the second to the current tag
        string.  A blank line closes the current sentence.  The final
        sentence is also emitted when the file does not end with a blank
        line (it used to be dropped silently).

        :param path: training file to read (defaults to the project file)
        :return: ``[sentences, results]``, two lists of equal length
        :raises IndexError: if a non-blank line has no second field
        """
        sentences = []
        results = []
        chars = []
        tags = []

        def flush():
            # Emit the pending sentence only when both parts are non-empty;
            # otherwise keep accumulating, exactly like the original loop.
            sentence = "".join(chars)
            result = "".join(tags)
            if sentence != "" and result != "":
                sentences.append(sentence)
                results.append(result)
                chars.clear()
                tags.clear()

        with open(path, encoding='utf-8') as tempFile:
            for line in tempFile:
                line = line.strip()
                if line == "":
                    flush()
                else:
                    data = line.split(" ")
                    chars.append(data[0])
                    tags.append(data[1])
        flush()  # do not lose a last sentence that lacks a trailing blank line
        return [sentences, results]

    def getUnigramScore(self, sentence, thisPos, thisStatus):
        """Sum the state-feature scores for one position and tag.

        :param sentence: the sentence being tagged
        :param thisPos: index of the current character
        :param thisStatus: candidate tag for the current character
        :return: sum of ``scoreMap`` entries over all unigram templates
                 (features absent from ``scoreMap`` contribute nothing)
        """
        return sum(
            self.scoreMap.get(
                self.makeKey(template, str(i), sentence, thisPos, thisStatus), 0
            )
            for i, template in enumerate(self.UnigramTemplates)
        )

    def getBigramScore(self, sentence, thisPos, preStatus, thisStatus):
        """Sum the transition-feature scores for one position and tag pair.

        :param sentence: the sentence being tagged
        :param thisPos: index of the current character
        :param preStatus: tag of the previous character
        :param thisStatus: candidate tag for the current character
        :return: sum of ``scoreMap`` entries over all bigram templates
                 (features absent from ``scoreMap`` contribute nothing)
        """
        return sum(
            self.scoreMap.get(
                self.makeKey(
                    template, str(i), sentence, thisPos, preStatus + thisStatus
                ),
                0,
            )
            for i, template in enumerate(self.BigramTemplates)
        )

    def num2Tag(self, number):
        """Convert a tag index to its tag letter.

        :param number: 0, 1, 2 or 3
        :return: ``"B"``, ``"I"``, ``"E"`` or ``"S"``; ``None`` otherwise
        """
        # Plain equality tests are kept so any value comparing equal to the
        # index is accepted, not only hashable ints.
        for index, tag in enumerate(("B", "I", "E", "S")):
            if number == index:
                return tag
        return None

    def tag2Num(self, status):
        """Convert a tag letter to its tag index.

        B marks the beginning of a word, I the middle of a word (often
        written M elsewhere), E the end of a word, and S a single-character
        word.

        :param status: tag letter
        :return: 0, 1, 2 or 3 for B, I, E, S; -1 for anything else
        """
        for index, tag in enumerate(("B", "I", "E", "S")):
            if status == tag:
                return index
        return -1

    def getMaxIndex(self, list):
        """Return the index of the first occurrence of the largest element.

        The parameter keeps its original name ``list`` for backward
        compatibility even though it shadows the builtin.

        :param list: non-empty list of comparable values
        :return: index of the maximum value
        """
        largest = max(list)  # no need to copy and sort just to find the maximum
        return list.index(largest)

    def getDuplicate(self, realstring, string):
        """Count the positions at which two tag sequences agree.

        Only the common prefix length of the two sequences is compared.

        :param realstring: reference tag sequence
        :param string: predicted tag sequence
        :return: number of matching positions
        """
        return sum(1 for real, mine in zip(realstring, string) if real == mine)

    def makeKey(self, template, identity, sentence, pos, statusCovered, debug=False):
        """Build the ``scoreMap`` key of one feature at one position.

        The key is the template identity, followed by the characters found at
        ``pos + offset`` for every offset in the template (a space stands in
        for positions outside the sentence), followed by ``/`` and the tag
        string.

        :param template: list of character offsets
        :param identity: template number as a string
        :param sentence: the sentence being tagged
        :param pos: index of the current character
        :param statusCovered: one tag (state feature) or two tags (transition)
        :param debug: when truthy, print the key
        :return: the feature key
        """
        parts = [identity]
        for offset in template:
            index = pos + offset
            if index < 0 or index >= len(sentence):
                parts.append(" ")  # padding outside the sentence boundaries
            else:
                parts.append(sentence[index])
        parts.append("/")
        parts.append(statusCovered)
        result = "".join(parts)
        if debug:
            print(result)
        return result

    def getWrongNum(self, sentence, realRes):
        """Count the positions where the Viterbi output differs from the truth.

        :param sentence: the sentence to tag
        :param realRes: reference tag sequence for the sentence
        :return: number of wrongly tagged positions
        """
        myRes = self.Viterbi(sentence)  # predicted tags
        return sum(1 for i in range(len(sentence)) if myRes[i] != realRes[i])

    def setScoreMap(self, sentence, realRes, debug=False):
        """Update the feature scores from one training sentence.

        The original note says this builds the state / transition feature
        table and scores each entry according to the result.

        NOTE(review): the available source is cut off inside the mismatch
        branch below, so the actual update rule is unknown.  The visible
        statements are kept as they are and the cut-off point raises instead
        of silently doing nothing.

        :param sentence: the sentence to tag
        :param realRes: reference tag sequence for the sentence
        :param debug: debugging switch (its use is not visible)
        :return: None
        :raises NotImplementedError: when a position is tagged incorrectly
        """
        myRes = self.Viterbi(sentence)  # predicted tags
        for word in range(0, len(sentence)):
            myResI = myRes[word]  # predicted tag
            realResI = realRes[word]  # reference tag
            if myResI != realResI:  # prediction differs from the reference
                # p   <- the source text ends here
                raise NotImplementedError(
                    "setScoreMap: the score update is truncated in the "
                    "available source"
                )

评论收藏

内容反馈