# import pandas as pd
# import requests
# import math
# import re
# import jieba
# # pip install wordcloud -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
# from collections import Counter
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt
# import random
# import matplotlib
#
# matplotlib.rcParams['font.sans-serif'] = ['SimHei']
# matplotlib.rcParams['axes.unicode_minus'] = False
# import sklearn
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
# import matplotlib.pyplot as plt
# import matplotlib
# from sklearn.feature_extraction.text import TfidfVectorizer
#
# matplotlib.rcParams['font.sans-serif'] = ['SimHei']
# matplotlib.rcParams['axes.unicode_minus'] = False
# import random
# import numpy as np
# import matplotlib.mlab as mlab
# import matplotlib.pyplot as plt
# from collections import Counter
# import bertvec
# from bertvec import bert_vec
# def plt_picture_ciyun(n):
# """
# 词云
# 传入的n是:我 老婆 走 了 四年 总会 不经意 想起 那 感觉 没 经历 的 很 难 体会
# """
# wc = WordCloud(
# # 设置字体,不指定就会出现乱码
# font_path='simhei.ttf',
# # 设置背景色
# background_color='white',
# # 设置背景宽
# width=500,
# # 设置背景高
# height=350,
# # 最大字体
# max_font_size=50,
# # 最小字体
# min_font_size=10,
# mode='RGBA'
# # colormap='pink'
# )
# # 产生词云
# wc.generate(n)
# # 显示图片
# # 指定所绘图名称
# plt.figure("jay")
#
# # 以图片的形式显示词云
# plt.imshow(wc)
# # 关闭图像坐标系
# plt.axis("off")
# # 保存词云图片
# # plt.savefig("2209070221.png")
# plt.show()
# def tfidf_word(corpus):
# """
# TF-IDF keyword extraction:
# import sklearn
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.feature_extraction.text import TfidfVectorizer
# corpus = ['TF-IDF 主要 思想 是', '算法 一个 重要 特点 可以 脱离 语料库 背景',
# '如果 一个 网页 被 很多 其他 网页 链接 说明 网页 重要',
# '原始 文本 进行 标记',
# '主要 思想']
# """
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(corpus)
# data = {'word': vectorizer.get_feature_names_out(),
# 'tfidf': X.toarray().sum(axis=0).tolist()}
# df = pd.DataFrame(data)
# df = df.sort_values(by='tfidf', ascending=True) # ascending=True sorts smallest to largest; False sorts largest to smallest
# res = {}
# for i in df.values:
# res[i[0]] = i[1]
# # print(res){'链接': 0.2556396904397093, '说明': 0.2556396904397093...}
# return res
# def dict_sort(dic):
# """
# 字典排序
# """
# count = sorted(dic.items(), key=lambda x: x[1], reverse=True) # True 是降序 False是升序
# return count
# date=pd.read_excel('待处理数据 0812.xlsx')
# # date=pd.read_excel('已评分数据.xlsx')
# train_data=date['Comment'].values
#
# datas=[]
# for line in train_data:
# try:
# line =line.split(' ')
# temp=''
# for i in line:
# temp=temp+i+" "
# # print(temp)
# datas.append(temp)
# except:
# pass
#
# # print(len(datas))
# res=tfidf_word(datas)
# # print(res)
# res=dict_sort(res)
# word_list=[]
# n=''
# for idx,(k ,v) in enumerate(res):
# # print(k,v)
# word_list.append(k)
# n=n+k+" "
# if idx==200:break
#
# # 词云
# plt_picture_ciyun(n)
# word_list_vec=[]
# for word in word_list:
# vec = bert_vec(word)
# # print(vec)
# word_list_vec.append([word,vec])
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
import random
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.manifold import TSNE
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei'] #中文字体
mpl.rcParams['axes.unicode_minus'] = False #防止负号出现异常显示
def tsne_em(train_x, train_y):
    """Project vectors to 2-D with t-SNE and draw each sample's label as text.

    Args:
        train_x: Sequence of equal-length numeric vectors, one per sample.
        train_y: Sequence of labels; ``train_y[i]`` is drawn at the projected
            position of ``train_x[i]``. Must be at least as long as
            ``train_x`` (extra labels are ignored).

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        IndexError: If ``train_y`` is shorter than ``train_x``.
        ValueError: If fewer than two samples are supplied.
    """
    import inspect  # local import: only needed for the TSNE keyword probe below

    point_color = "r"
    label_color = "r"
    # " " is matplotlib's "nothing" marker, so only the text labels are visible;
    # the scatter call still makes the axes autoscale to the projected points.
    point_marker = " "

    tokens = list(train_x)  # the vectors to embed
    # Index explicitly (instead of zip) so a too-short train_y still raises.
    labels = [train_y[idx] for idx in range(len(tokens))]

    print(len(tokens))
    print(tokens)

    n_samples = len(tokens)
    if n_samples < 2:
        raise ValueError(
            "tsne_em needs at least 2 samples, got %d" % n_samples
        )

    # scikit-learn renamed TSNE's ``n_iter`` to ``max_iter`` (1.5) and later
    # removed the old name, so pick whichever keyword this version accepts.
    tsne_params = inspect.signature(TSNE).parameters
    iter_kw = "max_iter" if "max_iter" in tsne_params else "n_iter"

    # perplexity: sklearn's default is 30 and the suggested range is 5-50;
    # larger datasets usually want a larger value. It must be strictly smaller
    # than the number of samples, so clamp the preferred value of 10 for tiny
    # inputs. n_components=2 is the dimension of the embedding (output) space.
    tsne_model = TSNE(
        perplexity=min(10, n_samples - 1),
        n_components=2,
        init='pca',
        random_state=23,
        **{iter_kw: 2500}
    )

    # Reduce the vectors to 2-D coordinates, one row per sample.
    new_values = tsne_model.fit_transform(np.array(tokens))
    xs = new_values[:, 0]
    ys = new_values[:, 1]

    plt.figure(figsize=(15, 15))  # canvas size
    # One vectorized scatter call instead of one artist per point.
    plt.scatter(xs, ys, c=point_color, marker=point_marker)
    for px, py, label in zip(xs, ys, labels):
        plt.text(px, py, label, fontsize=10, color=label_color)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.show()