import torch
import pandas as pd
import re
# Module-level counter, initialised to zero.
# NOTE(review): nothing in the visible part of this file reads or updates it
# (getDuplicate assigns its own local `count`) — confirm it is still needed.
count = 0
def excel_one_line_to_list():
    '''
    Load one spreadsheet column as a flat Python list.

    Reads ``../dataset/手机.xlsx`` restricted to column index 3 (the
    "project name" column according to the original author's note) and
    returns the single cell of every row that pandas produced.
    :return: list with one entry per data row
    '''
    frame = pd.read_excel(r'../dataset/手机.xlsx', usecols=[3], names=None)
    # Each row comes back as a one-element list; unwrap it.
    return [row[0] for row in frame.values.tolist()]
class myCRF:
def __init__(self):
    '''
    Create empty feature containers, then load the feature templates.
    '''
    # The template lists must exist before readTemplate() runs,
    # because readTemplate() appends to them.
    self.BigramTemplates = []   # transition-feature templates
    self.UnigramTemplates = []  # state-feature templates
    self.scoreMap = {}          # feature key -> score
    self.readTemplate()         # fill both template lists
def readTemplate(self, debug=False, templatePath="../dataset/dataset2/template.utf8"):
    '''
    Read the feature-template file into UnigramTemplates / BigramTemplates.

    Lines are handled as follows:
      * a line containing "Unigram" or "Bigram" anywhere after its first
        character is treated as a section header and skipped;
      * the first blank line switches from the Unigram section to the
        Bigram section; later blank lines are ignored;
      * every other line is parsed into a list of integer offsets and
        appended to the list of the current section.

    :param debug: when truthy, print both template lists after loading
    :param templatePath: template file to read (defaults to the original
        hard-coded location)
    :return: None
    :raises ValueError: if a template line does not contain an integer offset
    '''

    def parseOffsets(text):
        # For each "/"-separated part the offset is the text between the
        # last "[" and the following ",".  With a "/" present, only the
        # first and the last part are used.
        if text.find("/") > 0:
            parts = text.split("/")
            pieces = (parts[0], parts[-1])
        else:
            pieces = (text,)
        return [int(piece.split("[")[-1].split(",")[0]) for piece in pieces]

    inBigramSection = False  # Unigram templates come first, Bigram second
    # `with` guarantees the handle is closed even if parsing raises.
    with open(templatePath, encoding='utf-8') as tempFile:
        for line in tempFile:
            if line.find("Unigram") > 0 or line.find("Bigram") > 0:
                continue  # section header
            if not line.strip():
                # Blank separator line.  Ignoring repeated/trailing blank
                # lines avoids int('') failing inside the Bigram section.
                inBigramSection = True
                continue
            if inBigramSection:
                self.BigramTemplates.append(parseOffsets(line))
            else:
                self.UnigramTemplates.append(parseOffsets(line))

    if debug:
        print(self.UnigramTemplates)
        print(self.BigramTemplates)
def getTrainData(self, trainPath='../dataset/dataset2/train.utf8'):
    '''
    Load the training corpus.

    Every non-blank line holds a character and its tag separated by
    whitespace; a blank line ends the current sentence.  The characters of
    a sentence are concatenated into one string, and so are its tags.

    :param trainPath: corpus file to read (defaults to the original
        hard-coded location)
    :return: ``[sentences, results]`` - two parallel lists of strings
    :raises IndexError: if a non-blank line has fewer than two fields
    '''
    sentences = []
    results = []
    chars = []
    tags = []

    def flush():
        # Store the sentence collected so far (if any) and start a new one.
        if chars and tags:
            sentences.append("".join(chars))
            results.append("".join(tags))
        chars.clear()
        tags.clear()

    # `with` guarantees the handle is closed.
    with open(trainPath, encoding='utf-8') as trainFile:
        for line in trainFile:
            # split() without arguments accepts tabs and repeated spaces.
            fields = line.split()
            if not fields:
                flush()
                continue
            chars.append(fields[0])
            tags.append(fields[1])

    # Keep the last sentence when the file does not end with a blank line.
    flush()
    return [sentences, results]
def getUnigramScore(self, sentence, thisPos, thisStatus):
    '''
    Sum the state-feature scores for one position and one tag.

    One key is built per unigram template (the template's list index is
    its identity); keys missing from the score table contribute nothing.
    :param sentence: the sentence
    :param thisPos: current position
    :param thisStatus: tag at the current position
    :return: accumulated score
    '''
    total = 0
    for templateId, template in enumerate(self.UnigramTemplates):
        featureKey = self.makeKey(template, str(templateId), sentence, thisPos, thisStatus)
        if featureKey in self.scoreMap:
            # The scores of all matching templates are added together.
            total += self.scoreMap[featureKey]
    return total
def getBigramScore(self, sentence, thisPos, preStatus, thisStatus):
    '''
    Sum the transition-feature scores for one position and one tag pair.

    The status part of every key is ``preStatus + thisStatus``; keys
    missing from the score table contribute nothing.
    :param sentence: the sentence
    :param thisPos: current position
    :param preStatus: tag of the previous position
    :param thisStatus: tag of the current position
    :return: accumulated score
    '''
    total = 0
    for templateId, template in enumerate(self.BigramTemplates):
        featureKey = self.makeKey(template, str(templateId), sentence, thisPos, preStatus + thisStatus)
        if featureKey not in self.scoreMap:
            continue
        total += self.scoreMap[featureKey]
    return total
def num2Tag(self, number):
    '''
    Convert a numeric code into its tag letter.

    0 -> "B", 1 -> "I", 2 -> "E", 3 -> "S".
    :param number: numeric code
    :return: the tag letter, or None for any other value
    '''
    # Compare with == so anything equal to 0..3 is accepted.
    for code, tag in enumerate("BIES"):
        if number == code:
            return tag
    return None
def tag2Num(self, status):
    '''
    Convert a tag letter into its numeric code.

    B - beginning of a word        -> 0
    I - middle of a word           -> 1
    E - end of a word              -> 2
    S - single-character word      -> 3
    :param status: tag letter
    :return: the numeric code, or -1 for any other value
    '''
    # Whole-value equality only: "" or "BI" must not match.
    for code, tag in enumerate(("B", "I", "E", "S")):
        if status == tag:
            return code
    return -1
def getMaxIndex(self, list):
    '''
    Return the index of the largest element.

    When the maximum occurs more than once, the index of its first
    occurrence is returned.  The input is not modified.
    :param list: non-empty sequence providing ``index`` (list, tuple, ...)
    :return: index of the maximum value
    :raises IndexError: if the sequence is empty
    '''
    # NOTE: the parameter name shadows the builtin `list`; it is kept
    # because callers may pass it by keyword.
    if len(list) == 0:
        # Same exception type the previous sort-and-take-last code raised.
        raise IndexError("getMaxIndex() called with an empty sequence")
    # One linear scan for the maximum instead of copying and sorting.
    return list.index(max(list))
def getDuplicate(self, realstring, string):
    '''
    Count the positions at which two tag sequences agree.

    Only the overlapping prefix (length of the shorter sequence) is
    compared.
    :param realstring: reference sequence
    :param string: predicted sequence
    :return: number of matching positions
    '''
    # zip() stops at the shorter input.
    return sum(1 for expected, actual in zip(realstring, string) if expected == actual)
def makeKey(self, template, identity, sentence, pos, statusCovered,debug=False):
    '''
    Build the score-table key for one template at one position.

    The key is ``identity``, then one character per template offset
    (``sentence[pos + offset]``, or a single space when that index falls
    outside the sentence), then "/" and ``statusCovered``.
    :param template: list of offsets relative to ``pos``
    :param identity: template identifier (string)
    :param sentence: the sentence
    :param pos: current position
    :param statusCovered: tag, or concatenated tags, covered by the feature
    :param debug: print the key when equal to True
    :return: the key string
    '''
    pieces = [identity]
    for offset in template:
        target = pos + offset
        # Out-of-range positions are padded with a blank.
        pieces.append(sentence[target] if 0 <= target < len(sentence) else " ")
    pieces.append("/")
    pieces.append(statusCovered)
    key = "".join(pieces)
    if debug == True:
        print(key)
    return key
def getWrongNum(self, sentence, realRes):
    '''
    Count the positions where the decoded tags differ from the reference.

    The sentence is decoded with ``self.Viterbi`` and compared with
    ``realRes`` position by position over the length of the sentence.
    :param sentence: the sentence
    :param realRes: reference tag sequence
    :return: number of mismatching positions
    '''
    predicted = self.Viterbi(sentence)
    # Indexed access on purpose: both sequences are read at every
    # position of the sentence.
    return sum(1 for i in range(len(sentence)) if predicted[i] != realRes[i])
def setScoreMap(self, sentence, realRes,debug =False):
'''
建立状态特征和转移特征的特征矩阵,并依据结果为每个元素打分
:param sentence: 句子
:param realRes: 正确解
:param debug: 调试用
:return:
'''
myRes = self.Viterbi(sentence) # 我的解
for word in range(0, len(sentence)):
myResI = myRes[word] # 我的解
realResI = realRes[word] # 理论解
if myResI != realResI: # 如果和理论值不同
# p