# import MeCab
import itertools
import collections
import pandas as pd
# mecab = MeCab.Tagger ("-Owakati")
# a = [1, 2, 3, 4, 3]
# b = a.count(0)
# print(b)
# strs = ''
# lst = mecab.parse(strs).split()
# print(lst)
#
# res = []
# dics = {'a':['1', '2'], 'b':['3', '2', '4', '2']}
# df = pd.DataFrame.from_dict({k: ' '.join(v) for k, v in dics.items()}, orient='index', columns=['product_id'])
# df = df.reset_index().rename(columns = {'index':'word'})
# df.to_csv(r'D:\KDD相关\任务1数据集\aaa.csv', index=False, encoding='utf-8')
# # from bs4 import BeautifulSoup
# import re
# def clean(desstr, restr=''):
# soup = BeautifulSoup(desstr, 'html.parser')
# strs = soup.get_text()
# try:
# co = re.compile(u'['u'\U0001F300-\U0001F64F' u'\U0001F680-\U0001F6FF'u'\u2600-\u2B55]+')
# except re.error:
# co = re.compile(u'('u'\ud83c[\udf00-\udfff]|'u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'u'[\u2600-\u2B55])+')
# return co.sub(restr, strs)
# print(clean(''))
##清洗product的文本,并构建词表
# def create_product_word_lst():
# stop_words = set([line.strip() for line in open(r'D:\KDD相关\任务1数据集\停用词.txt', encoding='utf-8').readlines()])
# word_dic = {'es':set(), 'us':set(), 'jp':set()} ##相应语言的词表
# product_data = pd.read_csv(r'D:\KDD相关\任务1数据集\product_catalogue-v0.1.csv')
# n_word = 0
# for i in range(len(product_data)):
# strs = str(product_data['product_title'][i])
# if not strs:
# strs = str(product_data['product_description'][i])
# if not strs:
# strs = str(product_data['product_bullet_point'][i])
# strs = clean(strs)
# local = product_data['product_locale'][i]
# if local != 'jp': ##英西直接根据空格分词
# lst = strs.strip().split()
# for v in lst:
# if v not in stop_words:
# n_word += 1
# word_dic[local].add(v)
# else: ##日调用分词软件分词
# lst = mecab.parse(strs).split()
# for v in lst:
# if v not in stop_words:
# n_word += 1
# word_dic[local].add(v)
# print("词表总词数:", n_word) ##词表总词数: 116,231,275 构建倒排索引计算量太大 13,910,047
# return word_dic
##词所对应的商品id列表(倒排索引)
# def create_query_productid_dic():
# product_data = pd.read_csv(r'D:\KDD相关\任务1数据集\product_catalogue-v0.1.csv')
# word_dic = create_product_word_lst()
# word_productid_dic = {} ##词语及词语对应的product_id列表
# indx = 0
# for local in word_dic: ##时间复杂度:3*word的数量*1*某语言的句子(word的数量*所有句子)
# word_lst = word_dic[local] ##某语言的词表
# sub_data = product_data.groupby('product_locale')
# for word in word_lst:
# indx += 1
# if indx % 100000 == 0:
# print(indx)
# for l, group in sub_data:
# if l == local:
# group1 = group.reset_index()
# for i in range(len(group1)):
# strs = str(product_data['product_title'][i])
# if not strs:
# strs = str(product_data['product_description'][i])
# if not strs:
# strs = str(product_data['product_bullet_point'][i])
# strs = clean(strs)
# lst = strs.split()
# if word in lst:
# word_productid_dic.setdefault(word, set()).add(group1['product_id'][i])
# df = pd.DataFrame.from_dict({k: ' '.join(v) for k, v in word_productid_dic.items()}, orient='index', columns=['product_id'])
# df.to_csv(r'D:\KDD相关\任务1数据集\word_productid_dic.csv', encoding='utf-8', index=False)
# return word_productid_dic
# for i in range(len(test_data)): ##进入query
# query = test_data['query'][i]
#
# id_score_dic = {}
# for j in range(len(product_data)): ##进入product
# if product_data['product_locale'][j] == test_data['query_locale'][i]:
#
#
# dl_lst, avgdl = avg_len_procuct() ##
# dl = dl_lst[j] ##
# score_q_d = 0
# for word in query_lst:
# nqi = 0 ##
# if word in word_id_dic:
# nqi = len(word_id_dic[word])
# fi = doc_lst.count(word) ##
# part1 = log((N - nqi + 0.5) / (nqi + 0.5))
# part2 = fi * (k1 + 1) / (fi + k1 * (1 - b + b * (dl / avgdl)))
# score_q_d += (part1 * part2)
# id_score_dic[product_data['product_id'][j]] = score_q_d
# sort_id_score_dic = sorted(id_score_dic.items(), key = lambda s:s[1], reverse=True)
# for n in range(10):
# top_10_product_id.append(sort_id_score_dic[n][0])
# query_id.append(test_data['query_id'][i])
# queryid_productid_dic.setdefault(test_data['query_id'][i], []).append(sort_id_score_dic[n][0])
# top_10_product_id = pd.DataFrame(top_10_product_id, columns=['product_id'])
# query_id = pd.DataFrame(query_id, columns=['query_id'])
# res = pd.concat([top_10_product_id, query_id], axis=1)
# res.to_csv(r'D:\KDD相关\任务1数据集\submit.csv', index=False, encoding='utf-8')
# return queryid_productid_dic
#
# from nlp_datasets.seq_match.seq_match_dataset import SeqMatchDataset
# 参数们:
'''
'buffer_size': 10000000, 重复
'seed': None, 重复
'reshuffle_each_iteration': True, 重复
'prefetch_size': tf.data.experimental.AUTOTUNE,
'num_parallel_calls': tf.data.experimental.AUTOTUNE,
'add_sos': True,
'add_eos': True,
'skip_count': 0,
'padding_by_eos': False,
'drop_remainder': True,
'bucket_width': 10,
'train_batch_size': 32, 重复
'eval_batch_size': 32, 重复
'predict_batch_size': 32, 重复
'repeat': 1,
-----
'sep': '@',
'num_parallel_calls': 1,
'buffer_size': 1000, 重复
'seed': None, 重复
'reshuffle_each_iteration': True, 重复
'train_batch_size': 2, 重复
'eval_batch_size': 2, 重复
'predict_batch_size': 2, 重复
'query_max_len': 5,
'doc_max_len': 5,
'vocab_file': 'data/vocab.txt',
'train_files': ['data/train.txt'],
'eval_files': ['data/train.txt'],
'predict_files': ['data/train.txt']
-----
'vocab_size': 10,
'embedding_size': 256,
'vec_dim': 256,
-----
'ckpt_period': 1,
'model_dir': '/tmp/dssm'
-----
'xyz_sep': '@',
'sep': ' ',
'x_max_len': -1,
'y_max_len': -1,
'''
# files = ['C:/Users/admin\Desktop/train1.txt']
# dataset = tf.data.Dataset.from_tensor_slices(files)
# import MeCab
# mecab = MeCab.Tagger ("-Owakati")
# sentence = '無印良品 体にフィットするソファー用綿デニムカバー ネイビー 44105634'
# print(mecab.parse(sentence))
# def len_control(word_lst, max_len, pad_token):
# m = len(word_lst)
# if m > max_len:
# word_lst = word_lst[:max_len]
# elif m < max_len:
# cha = max_len - m
# padding = [pad_token] * cha
# word_lst += padding
# return ' '.join(word_lst)
# print(len_control(['dqe', 'rte', 'qee', 'dqe', 'rte', 'qee'], 5, 'unk'))
'''
罗马数字包含以下七种字符: I, V, X, L,C,D 和 M。
字符 数值
I 1
V 5
X 10
L 50
C 100
D 500
M 1000
例如, 罗马数字 2 写做 II ,即为两个并列的 1。12 写做 XII ,即为 X + II 。 27 写做 XXVII, 即为 XX + V + II 。
通常情况下,罗马数字中小的数字在大的数字的右边。但也存在特例,例如 4 不写做 IIII,而是 IV。
数字 1 在数字 5 的左边,所表示的数等于大数 5 减小数 1 得到的数值 4 。同样地,数字 9 表示为 IX。这个特殊的规则只适用于以下六种情况:I 可以放在 V (5) 和 X (10) 的左边;X 可以放在 L (50) 和 C (100) 的左边;C 可以放在 D (500) 和 M (1000) 的左边。
人工智能-项目实践-强化学习-文本相似度计算模型(双塔).zip
版权申诉
47 浏览量
2023-12-26
18:20:58
上传
评论
收藏 60KB ZIP 举报
博士僧小星
- 粉丝: 1745
- 资源: 5850
'''