# -*- coding: utf-8 -*-
# @Author : jiabing
import string
import numpy as np
import pandas as pd
import jieba
import jieba.analyse
from gensim.models.word2vec import Word2Vec, LineSentence
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from utils.config import *


class ESearch(object):
    def __init__(self, index_name, doc_type_name,
                 stop_words_file=STOP_WORD_PATH, model_name='es', es_weight=0.2):
        '''
        Tested against Elasticsearch 6.2.4.
        '''
        # Load user-defined stop words
        self.stop_list = set()  # container for stop words
        self.load_stop_words_from_file(stop_words_file)  # read stop words from file into stop_list
        self.word2vec = Word2Vec.load(WV_MODEL_PATH)  # load pre-trained word vectors
        self.model_name = model_name
        self.index_name = index_name
        self.doc_type_name = doc_type_name
        self.es = Elasticsearch()
        self.es_status = False  # ES readiness flag
        self.check_es_status()  # verify that the index and doc_type exist
        self.questions_hash = set()
        self.punctuation = set(string.punctuation)  # punctuation to strip
        # Hyperparameters: relative contributions to the scoring function of ES
        # and of the embedding models (tfidf, word2vec, bert, etc.)
        self.es_weight = es_weight
        self.bert_weight = 1 - es_weight

    def load_stop_words_from_file(self, file):
        if file:
            with open(file, encoding='utf-8') as f:
                lines = f.readlines()
            self.stop_list.update([x.strip() for x in lines])

    def check_es_status(self):
        print('==========')
        if self.check_exist_index(self.index_name):
            print('[OK] index:', self.index_name)
            if self.check_exist_doc_type(self.index_name, self.doc_type_name):
                print('[OK] doc type:', self.doc_type_name)
                self.es_status = True
            else:
                print('[WARN] doc type not found: %s in index: %s' %
                      (self.doc_type_name, self.index_name))
        else:
            print('[WARN] index not found:', self.index_name)
        if self.es_status:
            print('Enjoy query!')
        else:
            print('Please load data into ES from a file or text list first!')
        print('==========')

    def check_exist_index(self, index_name):
        return self.es.indices.exists(index=index_name)

    def check_exist_doc_type(self, index_name, doc_type_name):
        return self.es.indices.exists_type(index=index_name, doc_type=doc_type_name)
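
    # Note: mapping types were deprecated in Elasticsearch 7.x, and with them
    # indices.exists_type(); this module targets the 6.x client, matching the
    # version noted in __init__.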

    def set_mapping(self):
        # The raw question text is stored but not indexed, so it cannot be queried directly.
        # The pre-segmented words use the 'simple' analyzer, which splits on
        # non-letter characters and lowercases the tokens.
        my_mapping = {"mappings":
            {self.doc_type_name: {
                "properties": {
                    "context": {"type": "text", "index": "false"},      # raw question, not indexed
                    "splwords": {"type": "text", "index": "false"},     # word segmentation result
                    "keywords": {"type": "text", "analyzer": "simple"},  # extracted keywords
                    "embeding": {"type": "object", "enabled": "false"},  # sentence vector
                    "context_id": {"type": "text"}}
            }}
        }
        # Create the index with this mapping, dropping any existing index first
        self.es.indices.delete(index=self.index_name, ignore=[400, 404])
        create_index = self.es.indices.create(
            index=self.index_name, body=my_mapping)  # {u'acknowledged': True}
        if not create_index["acknowledged"]:
            print("Failed to create index...")

    def make_action(self, fields, actions, id):
        # Guards against: IndexError: list index out of range
        try:
            splwords, keywords, embeding = self.split_word(fields)
        except Exception as e:  # catch unexpected bugs in the segmentation module
            print(e)
            print('failed at:', fields)
            return
        # Guards against: 'NoneType' object has no attribute 'tolist'
        if not keywords:
            print('[Error] no keywords found:', fields)
            return
        if embeding is None:
            return
        try:
            action = {
                "_index": self.index_name,
                "_type": self.doc_type_name,
                "_source": {
                    "context": fields,
                    "splwords": splwords,
                    "keywords": keywords,
                    "embeding": embeding.tolist(),
                    "context_id": id,
                }
            }
            actions.append(action)
            # print(actions)
        except Exception as e:
            print('fields:', fields)
            print(e)

    def load_data_from_file(self, input_file, overwrite=False):
        '''
        Read data from a file and write it into the ES database.
        input_file: path to a CSV file with a 'knowledge_q' column
        overwrite: whether to overwrite the existing database
        '''
        # In overwrite mode, or when ES is not ready, redefine the mapping
        if overwrite or (not self.es_status):
            self.set_mapping()
        print("Indexing data.....")
        ACTION = []  # collected bulk actions
        df = pd.read_csv(input_file)
        for i in range(len(df)):
            self.make_action(df["knowledge_q"][i], ACTION, i)
            self.questions_hash.add(i)
        # Bulk import
        success, _ = bulk(
            self.es, ACTION, index=self.index_name, raise_on_error=True)
        print('Performed %d actions' % success)
        print("Index data success")
        self.check_es_status()  # re-check ES status
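
    # Note: helpers.bulk() above collects every action in a single list; for
    # large input files, elasticsearch.helpers.streaming_bulk (or the
    # chunk_size argument of bulk) keeps memory bounded.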

    # Average word vectors to obtain a sentence vector
    def sentence2vec(self, query):
        sentence = []
        for word in query:
            if word in self.word2vec:  # gensim 3.x membership test
                sentence.append(self.word2vec[word])
        sentence = np.array(sentence)
        if len(sentence) > 0:
            sentence = sentence.sum(axis=0) / len(sentence)  # mean of the word vectors
        else:
            sentence = np.zeros(shape=(self.word2vec.vector_size, ))
        return sentence
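
    # Example: for query tokens ['天气', '不错'] whose vectors v1 and v2 are in
    # the vocabulary, sentence2vec returns (v1 + v2) / 2; out-of-vocabulary
    # tokens are skipped, and an all-zeros vector is returned if none match.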

    def split_word(self, query):
        '''
        jieba tokenizer: returns the segmentation, keywords, and sentence embedding.
        '''
        cut = jieba.lcut(query, cut_all=True)  # full mode, feeds the embedding
        splwords = "/".join(jieba.cut(query))  # accurate mode, stored for display
        embeding = self.sentence2vec(cut)
        keywords = " ".join(jieba.analyse.extract_tags(query, topK=8))
        return splwords, keywords, embeding

    @classmethod
    def softmax(cls, x, inverse=False):
        if inverse:
            ex = np.exp(-np.array(x))
        else:
            ex = np.exp(x)
        return ex / ex.sum()
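
    # A quick sanity check of softmax (values rounded to 4 decimals):
    #   softmax([1.0, 2.0])               -> [0.2689, 0.7311]
    #   softmax([1.0, 2.0], inverse=True) -> [0.7311, 0.2689]
    # inverse=True turns distances into weights: smaller distance, larger weight.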

    def set_es_weight(self, es_weight):
        '''
        Adjust the hyperparameter balancing the ES and BERT contributions;
        the two weights always sum to 1.
        '''
        self.es_weight = es_weight
        self.bert_weight = 1 - es_weight

    def calc_blue(self, sent1, sent2):
        '''
        Compute a unigram BLEU score.
        sent1: reference token sequence, '/'-separated
        sent2: candidate (predicted) token sequence, '/'-separated
        '''
        sent1_list = [w for w in sent1.split('/')
                      if w not in self.punctuation]  # drop punctuation tokens
        sent2_list = [w for w in sent2.split('/')
                      if w not in self.punctuation]  # drop punctuation tokens
        sent1_len = len(sent1_list)
        sent2_len = len(sent2_list)
        if sent2_len == 0:  # avoid division by zero on an empty candidate
            return 0.0
        # Brevity penalty BP
        if sent2_len >= sent1_len:
            BP = 1
        else:
            BP = np.exp(1 - sent1_len / sent2_len)
        # Clipped unigram precision p: BLEU is BP * exp(sum_n w_n * log p_n),
        # which for a single n-gram order reduces to BP * p
        pv = [min(sent1_list.count(w), sent2_list.count(w))
              for w in set(sent2_list)]
        return BP * (sum(pv) / sent2_len)
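
    # A worked example of calc_blue with hypothetical inputs:
    #   sent1 = '我/爱/自然/语言' (reference), sent2 = '我/爱/语言' (candidate)
    #   sent1_len = 4, sent2_len = 3, so BP = exp(1 - 4/3) ≈ 0.7165
    #   each candidate token occurs once in the reference, so sum(pv) = 3,
    #   p = 3/3 = 1.0, and the score is BP * p ≈ 0.7165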

    def calc_similarity(self, embed1, embed2):
        '''
        Cosine similarity between two sentence vectors.
        '''
        num = np.dot(embed1, embed2)
        denom = np.linalg.norm(embed1) * np.linalg.norm(embed2)
        if denom == 0:  # zero vector, e.g. no token was in the vocabulary
            return 0.0
        return num / denom
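

# A minimal usage sketch, assuming utils.config provides STOP_WORD_PATH and
# WV_MODEL_PATH, an Elasticsearch 6.x node is reachable on localhost:9200,
# and 'qa.csv' is a hypothetical CSV file with a 'knowledge_q' column; the
# index and doc-type names below are also hypothetical.
if __name__ == '__main__':
    searcher = ESearch(index_name='qa_index', doc_type_name='qa_doc')
    if not searcher.es_status:
        searcher.load_data_from_file('qa.csv', overwrite=True)  # hypothetical file
    # Compare two questions via embedding cosine similarity
    _, _, emb1 = searcher.split_word('如何申请退款')
    _, _, emb2 = searcher.split_word('退款怎么申请')
    print('cosine similarity:', searcher.calc_similarity(emb1, emb2))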