# coding=UTF-8
from numpy import *
import matplotlib.pyplot as plt
import time#time是python自带的模块,用于处理时间问题,提供了一系列的操作时间的函数
import math#自带
import re#正则表达式
import jieba#中文分词
import pandas as pd
import chardet
import numpy as np
# ---- Load the test spreadsheet, segment each row with jieba, strip stopwords ----
data_list = pd.read_csv('测试表格.csv', header=None, encoding='gbk')
train_data = np.array(data_list)
print(train_data)

# Stopword list; a set gives O(1) membership tests inside the filtering loop,
# and 'with' guarantees the file handle is closed (the original leaked it).
with open('stoplist.txt', encoding='UTF-8') as _stop_file:
    stopwords = {line.strip() for line in _stop_file}

# NOTE(review): testEntry is overwritten on every iteration, so only the LAST
# row's segmented text survives for the classifiers below — confirm intended.
testEntry = ''  # initialized so the name exists even for an empty CSV
for row in train_data:
    # str() guards against non-string cells (e.g. NaN) in the spreadsheet.
    result = ''.join(str(cell) for cell in row)
    outstr = ''
    for word in jieba.cut(result):
        if word not in stopwords and word != '\t':
            outstr += word + " "
    testEntry = outstr
    print(testEntry)
# -------------------------------------------------------------------- Urban-construction classification model --------------------------------------------------------------------
def loadTrainDataSet1():
    """Read the urban-construction training set from out1.txt.

    Each non-blank line is ``<label> <token> <token> ...`` (whitespace-split).

    Returns:
        postingList: list of token lists, one per training document.
        classVec: list of int labels, parallel to postingList.
    """
    postingList = []
    classVec = []
    # 'with' guarantees the file handle is closed (original leaked it).
    with open('out1.txt', encoding='utf-8') as fileIn:
        for line in fileIn:
            lineArr = line.strip().split()
            if not lineArr:
                # Blank lines previously appended an empty doc WITHOUT a label,
                # desynchronizing postingList and classVec — skip them instead.
                continue
            classVec.append(int(lineArr[0]))
            postingList.append(lineArr[1:])
    return postingList, classVec
def createVocabList1(dataSet):
    """Build the vocabulary list: the union of all tokens across documents."""
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)
    return list(vocab)
def setOfWords2Vec1(vocabList, inputSet):
    """Binary bag-of-words vector: 1 in the slot of each vocab word present."""
    vec = [0] * len(vocabList)
    for token in inputSet:
        try:
            vec[vocabList.index(token)] = 1
        except ValueError:
            pass  # token absent from the vocabulary — silently ignored
    return vec
def createTrainMatrix1(vocabList, postingList):
    """Vectorize every training document against vocabList."""
    return [setOfWords2Vec1(vocabList, doc) for doc in postingList]
def trainNB01(trainMatrix, trainCategory):
    """Train a Bernoulli naive Bayes model with Laplace smoothing.

    Args:
        trainMatrix: list of binary feature vectors (one per document).
        trainCategory: parallel list of 0/1 labels.

    Returns:
        (p0Vect, p1Vect, pAbusive): per-word log-probability vectors for
        class 0 and class 1, and the prior P(y=1).
    """
    docCount = len(trainMatrix)
    featCount = len(trainMatrix[0])
    prior1 = sum(trainCategory) / float(docCount)  # P(y=1)
    # Laplace smoothing: word counts start at 1, class totals at 2.
    num = {0: ones(featCount), 1: ones(featCount)}
    denom = {0: 2.0, 1: 2.0}
    for vec, label in zip(trainMatrix, trainCategory):
        cls = 1 if label == 1 else 0
        num[cls] += vec
        denom[cls] += sum(vec)
    # Log space turns the classifier's products into sums, avoiding underflow.
    return log(num[0] / denom[0]), log(num[1] / denom[1]), prior1
def classifyNB1(vocabList, testEntry, p0Vec, p1Vec, pClass1):
    """Classify raw text testEntry with the trained naive Bayes model.

    Returns 1 for the positive class, 0 otherwise.
    """
    # Tokenize by splitting on runs of non-word characters.
    tokens = re.split(r'\W+', testEntry)
    features = array(setOfWords2Vec1(vocabList, tokens))
    # Element-wise product selects the log-probabilities of present words;
    # adding the log prior gives log P(x|y)P(y) up to a shared constant.
    score1 = sum(features * p1Vec) + log(pClass1)
    score0 = sum(features * p0Vec) + log(1.0 - pClass1)
    return 1 if score1 > score0 else 0
# Test driver for the urban-construction model
def testingNB1():
    """End-to-end run of the urban-construction model on the global testEntry."""
    docs, labels = loadTrainDataSet1()
    vocab = createVocabList1(docs)
    matrix = createTrainMatrix1(vocab, docs)
    p0V, p1V, pAb = trainNB01(matrix, labels)
    # classifyNB1 already yields 0/1; keep the original's explicit mapping.
    return 1 if classifyNB1(vocab, testEntry, p0V, p1V, pAb) else 0
# ------------------------------------------------------------------------ Environmental-protection classification model --------------------------------------------------------------------------------
def loadTrainDataSet2():
    """Read the environmental-protection training set from out2.txt.

    Each non-blank line is ``<label> <token> <token> ...`` (whitespace-split).

    Returns:
        postingList: list of token lists, one per training document.
        classVec: list of int labels, parallel to postingList.
    """
    postingList = []
    classVec = []
    # 'with' guarantees the file handle is closed (original leaked it).
    with open('out2.txt', encoding='utf-8') as fileIn:
        for line in fileIn:
            lineArr = line.strip().split()
            if not lineArr:
                # Blank lines previously appended an empty doc WITHOUT a label,
                # desynchronizing postingList and classVec — skip them instead.
                continue
            classVec.append(int(lineArr[0]))
            postingList.append(lineArr[1:])
    return postingList, classVec
def createVocabList2(dataSet):
    """Build the vocabulary list: the union of all tokens across documents."""
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)
    return list(vocab)
def setOfWords2Vec2(vocabList, inputSet):
    """Binary bag-of-words vector: 1 in the slot of each vocab word present."""
    vec = [0] * len(vocabList)
    for token in inputSet:
        try:
            vec[vocabList.index(token)] = 1
        except ValueError:
            pass  # token absent from the vocabulary — silently ignored
    return vec
def createTrainMatrix2(vocabList, postingList):
    """Vectorize every training document against vocabList."""
    return [setOfWords2Vec2(vocabList, doc) for doc in postingList]
def trainNB02(trainMatrix, trainCategory):
    """Train the environmental-protection Bernoulli naive Bayes model.

    Args:
        trainMatrix: list of binary feature vectors (one per document).
        trainCategory: parallel list of 0/1 labels.

    Returns:
        (p0Vect, p1Vect, pAbusive): per-word log-probability vectors for
        class 0 and class 1, and the prior P(y=1).
    """
    numTrainDocs = len(trainMatrix)   # number of documents
    numWords = len(trainMatrix[0])    # number of features per document
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # P(y=1)
    # Laplace smoothing: word counts start at 1, class totals at 2.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Log space turns the classifier's products into sums, avoiding underflow.
    p1Vect = log(p1Num / p1Denom)
    # NOTE(review): the source file was truncated here by pasted webpage text;
    # the class-0 branch and return are reconstructed to mirror trainNB01.
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive