package test;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.*;
import java.util.*;
import java.util.regex.Pattern;
import static java.lang.Math.log;
import static java.lang.Math.pow;
/**
* Created by yizhou on 14-4-9.
*/
public class TezhenXuanze_副本 {
private static Map<String,Integer> WordList = new HashMap<String, Integer>(); // all words: token -> vocabulary index, assigned in discovery order
private static Map<String,Integer> ClassList = new HashMap<String,Integer>(); // all classes: class directory name -> class index
// private static Map<String,Map<Integer,Double>> WordFeatureValue = new HashMap<String, Map<Integer, Double>>(); // (superseded) per-class feature scores ("mutual information of all classes")
private static Map<String,Double> WordFeatureValue = new HashMap<String, Double>(); // feature score per word (per the commented-out predecessor, presumably mutual information — confirm)
private static List<String> KeyWord = new ArrayList<String>(); // selected feature/key words
private static int wordnum=0; // next vocabulary index to assign == current size of WordList (intended)
private static int classnum=0; // number of classes (subdirectories of the base path)
private static Pattern pattern = Pattern.compile("[0-9]{6}"); // matches a 6-digit sequence (presumably a stock code — confirm)
private static Pattern pattern1 = Pattern.compile("[0-9]{1,10}"); // matches any digit run; used to skip numeric tokens
// basefilepath is the root directory containing one subdirectory per class (e.g. a "bullish" directory,
// a "bearish" directory, ...); each filepath[i] subdirectory holds all documents of that class.
// Input documents are raw, unsegmented text.
// Tokenizes every training document under basefilepath/<class>/ into a
// "word matrix" file of the same name under writepath/<class>/: one
// tab-separated line of tokens per surviving input line, terminated by a
// running global line counter.
// Side effects: fills ClassList (class name -> index), WordList
// (token -> vocabulary index) and updates classnum/wordnum.
// Throws IOException on any read/write failure.
public static void Product_WordMatrix(File basefilepath, File writepath) throws IOException {
    IKAnalyzer analyzer = new IKAnalyzer(true); // true = smart (coarse-grained) segmentation
    File[] filepath = basefilepath.listFiles();
    Integer count = 0; // global running line id, appended to every output line
    classnum = filepath.length;
    for (int i = 0; i < classnum; i++) {
        ClassList.put(filepath[i].getName(), i);
        File[] files = filepath[i].listFiles(); // all documents of this class
        // File.separator instead of a hard-coded "\\" so the code also works off Windows
        File path = new File(writepath.getPath() + File.separator + filepath[i].getName()); // per-class output dir
        if (!path.exists()) path.mkdirs();
        int dayux00 = 0; // number of lines dropped by the shaixuan() filter
        for (File filename : files) {
            File writefile = new File(path.getPath() + File.separator + filename.getName());
            // try-with-resources: the original leaked writer/reader when an
            // exception occurred between open and close.
            // NOTE(review): platform-default charset is used for both read and
            // write (Chinese text, presumably GBK or UTF-8) — confirm and
            // consider passing an explicit Charset.
            try (BufferedWriter writer = new BufferedWriter(new FileWriter(writefile));
                 BufferedReader br = new BufferedReader(
                         new InputStreamReader(new FileInputStream(filename)))) {
                String line = br.readLine();
                while (line != null) {
                    count++;
                    line = shaixuan(line, filename.getName());
                    if (line == null) { dayux00++; line = br.readLine(); continue; }
                    // NOTE(review): newer Lucene versions require tokenStream.reset()
                    // before incrementToken() and end()/close() afterwards — confirm
                    // against the IKAnalyzer/Lucene version in use.
                    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(line));
                    tokenStream.addAttribute(CharTermAttribute.class);
                    while (tokenStream.incrementToken()) {
                        String word = tokenStream.getAttribute(CharTermAttribute.class).toString();
                        writer.write(word + "\t");
                        // Skip tokens containing digits; register a word only once so
                        // vocabulary indices stay dense (the original re-put every
                        // occurrence, overwriting the index and bumping wordnum each
                        // time, which left gaps and shifted indices).
                        if (!pattern1.matcher(word).find() && !WordList.containsKey(word)) {
                            WordList.put(word, wordnum);
                            wordnum++;
                        }
                    }
                    writer.write(count.toString());
                    writer.write("\r\n");
                    writer.flush();
                    line = br.readLine();
                }
            }
        }
        System.out.println("类别:" + filepath[i].getName() + "过滤数目:" + dayux00 + ":" + count);
    }
}
// Filters/normalizes one raw line of a report document.
// Returns the trimmed line, or null when the line should be discarded
// (research-report boilerplate, announcement-date lines, or lines carrying a
// 6-digit code that does not belong to this file's name).
public static String shaixuan(String line, String filename) {
    // strip noisy symbols (the class contains duplicates; harmless)
    line = line.replaceAll("[+~$`^=|<>~`$^+=|<>¥×]", "");
    if (line.indexOf("评级") != -1) { // "rating"
        line = line.replaceAll("[“”\"的]", "");
        int idx = line.indexOf("评级"); // recomputed after the replaceAll above
        if (idx > 5)
            line = line.substring(idx - 6, idx); // keep the 6 chars preceding "评级"
        else
            // BUG FIX: the original computed this substring but discarded the
            // result (missing assignment), leaving the line unfiltered
            line = line.substring(0, idx);
    } else if (line.indexOf("投资建议") != -1) { // "investment advice"
        if (line.indexOf("风险提示") != -1) // cut before "risk warning"
            line = line.substring(line.indexOf("投资建议"), line.indexOf("风险提示"));
        else
            line = line.substring(line.indexOf("投资建议"));
    } else if (line.indexOf("研报") != -1 && line.indexOf("评级") == -1) {
        // "research report" mentioned without a rating: drop the line
        line = null;
    } else if (line.indexOf("公告日期") != -1
            || (pattern.matcher(line).find()
                && line.indexOf(filename.substring(0, Math.min(5, filename.length()))) == -1)) {
        // Drop announcement-date lines, and lines with a 6-digit code whose
        // filename prefix does not appear in them. Math.min guards against the
        // StringIndexOutOfBoundsException the original threw for file names
        // shorter than 5 characters.
        line = null;
    }
    return line;
}
// Like Product_WordMatrix, but for the test split: tokenizes every document
// under basefilepath/<class>/ into writepath/<class>/ WITHOUT touching the
// global vocabulary (WordList/wordnum) or the class registry (ClassList).
// Throws IOException on any read/write failure.
public static void Product_WordMatrix_Test(File basefilepath, File writepath) throws IOException {
    IKAnalyzer analyzer = new IKAnalyzer(true); // true = smart (coarse-grained) segmentation
    File[] filepath = basefilepath.listFiles();
    Integer count = 0; // global running line id, appended to every output line
    for (int i = 0; i < filepath.length; i++) {
        File[] files = filepath[i].listFiles(); // all documents of this class
        // File.separator instead of a hard-coded "\\" so the code also works off Windows
        File path = new File(writepath.getPath() + File.separator + filepath[i].getName()); // per-class output dir
        if (!path.exists()) path.mkdirs();
        int dayux00 = 0; // number of lines dropped by the shaixuan() filter
        for (File filename : files) {
            File writefile = new File(path.getPath() + File.separator + filename.getName());
            // try-with-resources: the original leaked writer/reader when an
            // exception occurred between open and close.
            // NOTE(review): platform-default charset is used for both read and
            // write — confirm and consider passing an explicit Charset.
            try (BufferedWriter writer = new BufferedWriter(new FileWriter(writefile));
                 BufferedReader br = new BufferedReader(
                         new InputStreamReader(new FileInputStream(filename)))) {
                String line = br.readLine();
                while (line != null) {
                    count++;
                    line = shaixuan(line, filename.getName());
                    if (line == null) { dayux00++; line = br.readLine(); continue; }
                    // NOTE(review): newer Lucene versions require tokenStream.reset()
                    // before incrementToken() and end()/close() afterwards — confirm
                    // against the IKAnalyzer/Lucene version in use.
                    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(line));
                    tokenStream.addAttribute(CharTermAttribute.class);
                    while (tokenStream.incrementToken()) {
                        String word = tokenStream.getAttribute(CharTermAttribute.class).toString();
                        writer.write(word + "\t");
                    }
                    writer.write(count.toString());
                    writer.write("\r\n");
                    writer.flush();
                    line = br.readLine();
                }
            }
        }
        System.out.println("测试类别:" + filepath[i].getName() + "\t过滤数目:" + dayux00);
    }
}
public static void Product_ZiMatrix(File basefilepath,File writepath) throws IOException {
File [] filepath = basefilepath.listFiles();
classnum = filepath.length;
for(int i=0;i<classnum;i++){
ClassList.put(filepath[i].getName(),i);
File []files = filepath[i].listFiles();//存储子目录下所有文件
File path = new File(writepath.getPath()+"\\"+filepath[i].getName(
没有合适的资源?快使用搜索试试~ 我知道了~
股吧情绪倾向量化DEMO(毕业设计+机器学习课程project).zip
共93个文件
java:23个
xml:19个
jsp:11个
0 下载量 10 浏览量
2024-01-05
15:45:34
上传
评论
收藏 1.07MB ZIP 举报
温馨提示
股吧情绪倾向量化DEMO(毕业设计+机器学习课程project).zip
资源推荐
资源详情
资源评论
收起资源包目录
股吧情绪倾向量化DEMO(毕业设计+机器学习课程project).zip (93个子文件)
haah
.gitattributes 378B
src
cidian
stopword.dic 129B
guba.dic 7KB
股票词汇大全.dic 139KB
股票基金2.dic 169KB
股票基金.dic 80KB
IKAnalyzer.cfg.xml 536B
test
Spider.java 4KB
TxtValue.java 487B
CiPing.java 2KB
jisuan.java 2KB
DataClassify_old.java 6KB
IKAnalyzerTest.java 15KB
DataPreProcess_old.java 14KB
AttributeSelectionTest.java 4KB
TezhenXuanze.java 9KB
WebSpider_old.java 5KB
TezhenXuanze_副本.java 21KB
TezhenXuanze_old.java 20KB
Servlet
ResultServlet.java 647B
PaquServlet.java 1KB
LiangHuaServlet.java 1KB
TrainServlet.java 1KB
product
WebSpider.java 5KB
DataPreProcess.java 10KB
DataClassify.java 6KB
DataClassify_old.java 6KB
DataSample.java 1KB
Quantization.java 4KB
cidianfa.java 2KB
exportToHTML
股吧情绪倾向分析平台.iml.html 6KB
index.html 186B
out
artifacts
bishe
fonts.css 19KB
WEB-INF
web.xml 1KB
index.jsp 2KB
tbzs.jsp 2KB
default.css 10KB
ichart.js 95KB
bootstrap.css 123KB
sjxl.jsp 2KB
images
pic02.jpg 2KB
logo.jpg 8KB
pic01.jpg 81KB
result.jsp 9KB
fonts
FontAwesome.otf 62KB
fontawesome-webfont.eot 37KB
fontawesome-webfont.woff 43KB
fontawesome-webfont.svg 198KB
fontawesome-webfont.ttf 79KB
qxlh.jsp 2KB
web_war_exploded
WEB-INF
web.xml 313B
index.jsp 175B
production
股吧情绪倾向分析平台
cidian
stopword.dic 129B
guba.dic 7KB
股票词汇大全.dic 139KB
股票基金2.dic 169KB
股票基金.dic 80KB
IKAnalyzer.cfg.xml 536B
web
fonts.css 19KB
WEB-INF
web.xml 1KB
index.jsp 2KB
tbzs.jsp 2KB
default.css 10KB
ichart.js 95KB
bootstrap.css 123KB
sjxl.jsp 2KB
images
pic02.jpg 2KB
logo.jpg 8KB
pic01.jpg 81KB
result.jsp 9KB
fonts
FontAwesome.otf 62KB
fontawesome-webfont.eot 37KB
fontawesome-webfont.woff 43KB
fontawesome-webfont.svg 198KB
fontawesome-webfont.ttf 79KB
qxlh.jsp 2KB
.idea
project-template.xml 186B
description.html 36B
uiDesigner.xml 9KB
artifacts
bishe.xml 776B
scopes
scope_settings.xml 139B
libraries
jsoup_1_7_2.xml 217B
weka.xml 449B
IKAnalyzer2012.xml 319B
vcs.xml 166B
workspace.xml 58KB
misc.xml 5KB
compiler.xml 711B
modules.xml 302B
encodings.xml 264B
copyright
profiles_settings.xml 111B
.gitignore 803B
股吧情绪倾向分析平台.iml 1KB
共 93 条
- 1
资源评论
Lei宝啊
- 粉丝: 2051
- 资源: 1330
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功