# -*- coding: utf-8 -*-
# @Author : jiabing
import string

import numpy as np
import pandas as pd
import jieba
import jieba.analyse
from gensim.models.word2vec import Word2Vec, LineSentence
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

from utils.config import *


class ESearch(object):
    def __init__(self, index_name, doc_type_name,
                 stop_words_file=STOP_WORD_PATH, model_name='es', es_weight=0.2):
        '''
        Tested against Elasticsearch 6.2.4.
        '''
        # Load user-defined stop words
        self.stop_list = set()                           # stop-word container
        self.load_stop_words_from_file(stop_words_file)  # load stop words from file into stop_list
        self.word2vec = Word2Vec.load(WV_MODEL_PATH)     # load the word2vec vectors
        self.model_name = model_name
        self.index_name = index_name
        self.doc_type_name = doc_type_name
        self.es = Elasticsearch()
        self.es_status = False                           # ES readiness flag
        self.check_es_status()                           # check index and doc_type
        self.questions_hash = set()
        self.punctuation = set(string.punctuation)       # punctuation characters
        # Hyper-parameter: relative contribution of the ES score versus the
        # tfidf/word2vec/bert scores in the scoring function
        self.es_weight = es_weight
        self.bert_weight = 1 - es_weight

    def load_stop_words_from_file(self, file):
        if file:
            with open(file, encoding='utf-8') as f:
                lines = f.readlines()
                self.stop_list.update([x.strip() for x in lines])

    def check_es_status(self):
        print('==========')
        if self.check_exist_index(self.index_name):
            print('[OK] index:', self.index_name)
            if self.check_exist_doc_type(self.index_name, self.doc_type_name):
                print('[OK] doc type:', self.doc_type_name)
                self.es_status = True
            else:
                print('[WARN] not found doc type: %s in index: %s' %
                      (self.doc_type_name, self.index_name))
        else:
            print('[WARN] not found index:', self.index_name)
        if self.es_status:
            print('Enjoy query!')
        else:
            print('Please load data to es from file or textlist!')
        print('==========')

    def check_exist_index(self, index_name):
        return self.es.indices.exists(index=index_name)

    def check_exist_doc_type(self, index_name, doc_type_name):
        return self.es.indices.exists_type(index=index_name, doc_type=doc_type_name)

    def set_mapping(self):
        # The original question text is stored but not indexed, so it cannot
        # be queried directly.
        # The extracted keywords use the ES "simple" analyzer, which splits on
        # non-letter characters and lower-cases tokens.
        my_mapping = {"mappings":
            {self.doc_type_name: {
                "properties": {
                    "context": {"type": "text", "index": "false"},       # original question, not indexed
                    "splwords": {"type": "text", "index": "false"},      # word segmentation result
                    "keywords": {"type": "text", "analyzer": "simple"},  # extracted keywords
                    "embeding": {"type": "object", "enabled": "false"},  # sentence vector
                    "context_id": {"type": "text"}}
            }}
        }
        # (Re)create the index with this mapping
        self.es.indices.delete(index=self.index_name, ignore=[400, 404])
        create_index = self.es.indices.create(
            index=self.index_name, body=my_mapping)  # {u'acknowledged': True}
        if not create_index["acknowledged"]:
            print("[Error] failed to create index:", self.index_name)

    def make_action(self, fields, actions, id):
        # fix bug: IndexError: list index out of range
        try:
            splwords, keywords, embeding = self.split_word(fields)
        except Exception as e:  # catch unexpected errors from the segmentation module
            print(e)
            print('failed at:', fields)
            return
        # fix bug: 'NoneType' object has no attribute 'tolist'
        if not keywords:
            print('[Error] not found any keywords:', fields)
            return
        if embeding is None:
            return
        try:
            action = {
                "_index": self.index_name,
                "_type": self.doc_type_name,
                "_source": {
                    "context": fields,
                    "splwords": splwords,
                    "keywords": keywords,
                    "embeding": embeding.tolist(),
                    "context_id": id,
                }
            }
            actions.append(action)
            # print(actions)
        except Exception as e:
            print('fields:', fields)
            print(e)

    def load_data_from_file(self, input_file, overwrite=False):
        '''
        Read data from a CSV file and index it into ES.
        input_file: path of the CSV file (must contain a 'knowledge_q' column)
        overwrite: whether to overwrite the existing index
        '''
        # In overwrite mode, or when ES is not ready yet, redefine the mapping
        if overwrite or (not self.es_status):
            self.set_mapping()
        print("Indexing data.....")
        ACTION = []  # bulk actions buffer
        df = pd.read_csv(input_file)
        for i in range(len(df)):
            self.make_action(df["knowledge_q"][i], ACTION, i)
            self.questions_hash.add(i)
        # bulk import
        success, _ = bulk(
            self.es, ACTION, index=self.index_name, raise_on_error=True)
        print('Performed %d actions' % success)
        print("Index data success")
        self.check_es_status()  # re-check ES status
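
    # Example usage (a minimal sketch; the index and file names below are
    # hypothetical, not part of the original project):
    #   searcher = ESearch(index_name='qa_index', doc_type_name='qa_doc')
    #   searcher.load_data_from_file('knowledge.csv', overwrite=True)
    # The CSV is expected to provide a 'knowledge_q' column, as read above.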

    # Compute the sentence vector (used for similarity scoring)
    def sentence2vec(self, query):
        sentence = []
        for word in query:
            if word in self.word2vec:
                sentence.append(self.word2vec[word])
        sentence = np.array(sentence)
        if len(sentence) > 0:
            sentence = sentence.sum(axis=0) / len(sentence)  # mean of the word vectors
        else:
            sentence = np.zeros(shape=(300, ))  # fallback: zero vector
        return sentence
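    # The sentence vector above is the arithmetic mean of the word vectors,
    # v(sentence) = (1 / n) * sum_i v(word_i); when no token of the query is
    # in the word2vec vocabulary it falls back to a zero vector of dimension
    # 300 (assumed here to match the loaded model's vector size).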

    def split_word(self, query):
        '''
        jieba tokenizer: returns the segmented words, the extracted keywords
        and the averaged sentence vector.
        '''
        cut = jieba.lcut(query, cut_all=True)  # full-mode tokens, used for the embedding
        splwords = "/".join(jieba.cut(query))  # precise-mode tokens joined by '/'
        embeding = self.sentence2vec(cut)
        keywords = " ".join(jieba.analyse.extract_tags(query, topK=8))  # top-8 TF-IDF keywords
        return splwords, keywords, embeding
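    # Illustration (hedged example; actual jieba output depends on its
    # dictionary): for a query like "我来到北京清华大学", splwords might look
    # like "我/来到/北京/清华大学", keywords is a space-joined string of at
    # most 8 TF-IDF keywords, and embeding is the averaged word2vec vector
    # of the full-mode tokens.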

    @classmethod
    def softmax(cls, x, inverse=False):
        if inverse:
            ex = np.exp(-np.array(x))
        else:
            ex = np.exp(x)
        return ex / ex.sum()

    def set_es_weight(self, es_weight):
        '''
        Adjust the hyper-parameter controlling the relative contributions of
        es and bert; the two weights always sum to 1.
        '''
        self.es_weight = es_weight
        self.bert_weight = 1 - es_weight

    def calc_blue(self, sent1, sent2):
        '''
        Compute a BLEU-style score.
        sent1: reference (ground-truth) word sequence, '/'-joined
        sent2: candidate (predicted) word sequence, '/'-joined
        '''
        sent1_list = [w for w in sent1.split('/')
                      if w not in self.punctuation]  # drop punctuation tokens
        sent2_list = [w for w in sent2.split('/')
                      if w not in self.punctuation]  # drop punctuation tokens
        sent1_len = len(sent1_list)
        sent2_len = len(sent2_list)
        # brevity penalty BP
        if sent2_len >= sent1_len:
            BP = 1
        else:
            BP = np.exp(1 - sent1_len / sent2_len)
        # clipped unigram matches of the candidate
        pv = [min(sent1_list.count(w), sent2_list.count(w))
              for w in set(sent2_list)]
        return BP * np.log(sum(pv) / sent2_len)
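    # Worked example for the formula above (illustrative values): with
    # sent1 = "a/b/c/d" and sent2 = "a/b/e", sent1_len = 4 and sent2_len = 3,
    # so BP = exp(1 - 4/3) ≈ 0.717; the clipped unigram matches are
    # a:1, b:1, e:0, giving a precision of 2/3, and the method returns
    # 0.717 * ln(2/3) ≈ -0.29. The log keeps the result <= 0.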

    def calc_similarity(self, embed1, embed2):
        '''
        Compute the cosine similarity between two sentence vectors.
        '''
        num = np.dot(embed1, embed2)
        denom = np.linalg.norm(embed1) * np.linalg.norm(embed2)
        if denom == 0:
            return 0.0  # guard against zero vectors (see the fallback in sentence2vec)
        return num / denom

    # Scoring (from the project description): in this retrieval-based QA
    # system the es_weight hyper-parameter balances the two signals that
    # determine the final score:
    #   - BLEU: scored from the two sentences' lengths and word overlap; the
    #     larger the length gap, the lower the score.
    #   - Similarity: cosine similarity between the two sentence vectors.
    # Sentence vectors are obtained by averaging gensim word2vec word
    # vectors; the BERT variant obtains them directly via Bert_serving.
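
    # A minimal sketch of how the final score could be assembled, assuming
    # (per the description above) a weighted combination of the BLEU-style
    # score and the cosine similarity. The method name `score` and the exact
    # combination are assumptions for illustration, not part of the original file.
    def score(self, query_splwords, query_embeding, doc_splwords, doc_embeding):
        blue = self.calc_blue(doc_splwords, query_splwords)       # length/overlap signal
        sim = self.calc_similarity(query_embeding, doc_embeding)  # embedding similarity
        return self.es_weight * blue + self.bert_weight * sim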