import re
import jieba
import pandas as pd
# 导入文本特征向量转化模块
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
def text_save(filename, data):
    """Append each row of `data` to `filename` as one space-separated line.

    Each row is stringified, then '[', ']', "'" and ',' are stripped, so a
    row like [0.1, 0.9] (or a numpy 1-D array) becomes the line "0.1 0.9".

    filename: path of the output file (opened in append mode; created if
        missing, existing content is kept).
    data: sequence of rows (lists / numpy array rows) to write.
    """
    # Context manager guarantees the file is closed even if a write raises
    # (the original open()/close() pair leaked the handle on error).
    with open(filename, 'a') as fh:
        for row in data:
            line = str(row).replace('[', '').replace(']', '')  # strip brackets
            line = line.replace("'", '').replace(',', '')      # strip quotes/commas
            fh.write(line + '\n')
    print("保存文件成功")
def process():
    """Train and evaluate a Naive Bayes sentiment classifier on microwave reviews.

    Pipeline: read 'DataC/microwave.csv', binarize star ratings (>= 3 -> 1
    positive, else 0 negative), clean the review text (newlines, URLs,
    digits), segment with jieba and drop stop words, build a bag-of-words
    count matrix, train MultinomialNB on the first 1300 rows and evaluate
    on the remainder.

    Side effects: writes predicted probabilities to 'result/microoutputs.txt'
    and true test labels to 'result/microtargets.txt'; prints progress and
    metrics to stdout. Returns None.
    """
    # -------- Part 1: load data --------
    fulltrain = pd.read_csv('DataC/microwave.csv', encoding='gb18030')
    datatrain = fulltrain['star_rating'].values  # ratings as a numpy array
    # Binarize labels in place: rating >= 3 -> 1 (positive), else 0 (negative).
    for i in range(len(datatrain)):
        datatrain[i] = 1 if datatrain[i] >= 3 else 0
    dataxtrain = fulltrain['review_body'].values
    print(len(dataxtrain))
    # -------- Part 2: text cleaning --------
    # Removes URLs / bare domain names. NOTE(review): the second alternative
    # contains an unescaped '.' after [a-zA-Z]+ and so matches more than a
    # literal dot — kept as-is to preserve existing behavior.
    pattern = r"(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)|([a-zA-Z]+.\w+\.+[a-zA-Z0-9\/_]+)"
    url_re = re.compile(pattern)                            # compile once, reuse in loop
    remove_digits = str.maketrans('', '', '0123456789')     # hoisted: build table once
    restrain = []
    for review in dataxtrain:
        tmp = str(review).replace('\n', '')   # drop embedded newlines
        tmp = url_re.sub('', tmp)             # drop URLs / domains
        tmp = tmp.translate(remove_digits)    # drop digits
        restrain.append(tmp)
    print("train总计:", len(restrain))
    # -------- Part 3: segmentation + stop-word removal --------
    stop_word = {}.fromkeys([',', '。', '!', 'this', 'me', 'very', 'is', '、', ':', ';', '(', ')', '-', ':'])
    print("中文分词后结果:")
    corpustrain = []
    for text in restrain:
        # First pass: segment (accurate mode) and drop stop words, re-joining
        # the surviving tokens with no separator; second pass: segment the
        # filtered text again and join with spaces so CountVectorizer can
        # tokenize on whitespace.
        segs = jieba.cut(text.strip(), cut_all=False)
        final = ''.join(seg for seg in segs if seg not in stop_word)
        output = ' '.join(jieba.cut(final, cut_all=False))
        corpustrain.append(output)
    print(len(corpustrain))
    # -------- Part 4: bag-of-words term-frequency matrix --------
    vectorizer = CountVectorizer()
    Xtrain = vectorizer.fit_transform(corpustrain).toarray()
    # -------- Part 5: train / evaluate --------
    print("数据分析:")
    # Hard-coded split: first 1300 rows are the training set, the rest test.
    x_train = Xtrain[:1300]
    y_train = datatrain[:1300]  # 1 = positive review, 0 = negative review
    clf = MultinomialNB().fit(x_train, y_train)
    pre = clf.predict(Xtrain[1300:])
    # Per-class probabilities: column 0 = P(label 0), column 1 = P(label 1).
    pvalues = clf.predict_proba(Xtrain[1300:])
    text_save("result/microoutputs.txt", pvalues)           # save probabilities
    text_save("result/microtargets.txt", datatrain[1300:])  # save true test labels
    print("预测结果:", pre)
    # BUG FIX: the original printed y_train (the TRAINING labels) here; the
    # ground truth for the predictions above is the held-out slice.
    print("真实结果:", datatrain[1300:])
    print(classification_report(datatrain[1300:], pre))
    # Tally predicted positives/negatives.
    good = sum(1 for p in pre if p != 0)
    bad = len(pre) - good
    print("good:", good, ',', "bad:", bad)
# Script entry point: run the full train/evaluate pipeline when executed directly.
if __name__ == '__main__':
    process()