package com.action;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.RAMDirectory;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
/**
 * Picks a title for a document by scoring candidate lines of its extracted
 * text (plus the "title" metadata value, when present) against the most
 * frequent terms of the document.
 */
public class Title {

    /** Number of most-frequent terms a candidate is compared against. */
    private static final int MAX_TERMS = 25;

    /** Starting score per remaining position; earlier candidates start higher. */
    private static final float POSITION_WEIGHT = 0.3f;

    /** Score added per rank step for every frequent term a candidate contains. */
    private static final float TERM_WEIGHT = 0.2f;

    /** The best candidate is only returned when it reaches this score. */
    private static final float MIN_SCORE = 3.5f;

    /** Rows shorter than this (after trimming) are never candidates. */
    private static final int MIN_RAW_LENGTH = 7;

    /** Allowed candidate length once prefix and suffix have been stripped. */
    private static final int MIN_TITLE_LENGTH = 6;
    private static final int MAX_TITLE_LENGTH = 20;

    /** At most this many leading / trailing characters are stripped. */
    private static final int MAX_PREFIX_CHARS = 4;
    private static final int MAX_SUFFIX_CHARS = 2;

    // NOTE(review): the original tables listed several ASCII punctuation marks
    // twice, which suggests their full-width forms were lost when the file was
    // transcoded. They are restored below as Unicode escapes so the source
    // encoding cannot damage them again -- confirm against the upstream file.

    /** Numbering and punctuation characters removed from the start of a row. */
    private static final String PREFIX_CHARS =
            "123456789一二三四五六七八九.、() ?/" + "\uFF08\uFF09\uFF1F";

    /** Punctuation characters removed from the end of a row. */
    private static final String SUFFIX_CHARS = ".,。;、?" + "\uFF0C\uFF1B\uFF1F";

    /** A row containing any of these is rejected (space or semicolon separated lists). */
    private static final String REJECT_CHARS = " ;" + "\uFF1B";

    /** A row made up exclusively of these characters is considered meaningless. */
    private static final String NOISE_CHARS =
            "abcdefghijklmnopqrstuvwxyz"
            + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
            + "0123456789"
            + ".()?,=+[]!&|<>;'\"*-{}:/_"
            + "\uFF1F\uFF1A";

    /**
     * Chooses the most plausible title for {@code file}.
     *
     * <p>Every line of the text extracted by Tika, followed by the first
     * "title" metadata value if there is one, is offered to
     * {@link #addToRows(ArrayList, String)}. Candidate {@code i} of {@code n}
     * starts with a score of {@code 0.3 * (n - i)}; for each of the (at most
     * 25) most frequent terms it contains, {@code 0.2 * (termCount - rank)} is
     * added. The scores and the chosen title are printed to standard output.
     *
     * @param file the document to analyse
     * @return the highest-scoring candidate, or {@code null} when there is no
     *         candidate or the best score is below 3.5
     * @throws Exception if text extraction or indexing fails
     */
    public static String generateTitle(File file) throws Exception {
        Tika tika = new Tika();
        String rawText = tika.parseToString(file);

        // Collect the body lines that qualify as candidate titles.
        ArrayList<String> rows = new ArrayList<String>();
        BufferedReader lines = new BufferedReader(new StringReader(rawText.trim()));
        try {
            for (String line = lines.readLine(); line != null; line = lines.readLine()) {
                addToRows(rows, line);
            }
        } finally {
            lines.close();
        }

        // Second pass over the file, only to obtain its metadata.
        Metadata md = new Metadata();
        FileInputStream is = new FileInputStream(file);
        try {
            tika.parseToString(is, md);
        } finally {
            // Closed here regardless of what the parser did with the stream.
            is.close();
        }
        String[] values = md.getValues("title");
        if (values.length != 0) {
            addToRows(rows, values[0]);
        }

        if (rows.isEmpty()) {
            return null;
        }

        int candidateCount = rows.size();
        Float[] scores = new Float[candidateCount];
        for (int i = 0; i < candidateCount; i++) {
            scores[i] = POSITION_WEIGHT * (candidateCount - i);
        }

        // Reuse the text extracted above instead of parsing the file again.
        ArrayList<TermItem> terms = termsByFrequency(rawText);
        int topTerms = Math.min(terms.size(), MAX_TERMS);
        for (int i = 0; i < candidateCount; i++) {
            String candidate = rows.get(i);
            for (int j = 0; j < topTerms; j++) {
                if (candidate.indexOf(terms.get(j).getTermName()) != -1) {
                    scores[i] = scores[i] + (topTerms - j) * TERM_WEIGHT;
                }
            }
        }

        System.out.println("候选标题对应统计结果");
        for (int i = 0; i < scores.length; i++) {
            System.out.println(scores[i]);
        }

        int best = titleId(scores);
        if (scores[best] < MIN_SCORE) {
            return null;
        }
        System.out.println("得分:" + scores[best]);
        System.out.println("最终标题:" + rows.get(best));
        return rows.get(best);
    }

    /**
     * Returns the index of the largest value; the first one wins on ties.
     *
     * @param compare the scores to search; must be non-null and non-empty
     * @return index of the maximum element
     * @throws IllegalArgumentException if {@code compare} is null or empty
     */
    public static int titleId(Float[] compare) {
        if (compare == null || compare.length == 0) {
            throw new IllegalArgumentException("compare must contain at least one score");
        }
        int best = 0;
        for (int i = 1; i < compare.length; i++) {
            if (compare[best] < compare[i]) {
                best = i;
            }
        }
        return best;
    }

    /**
     * Appends {@code row} to {@code rows} if it qualifies as a candidate title.
     *
     * <p>The row is trimmed and rejected if shorter than 7 characters. Up to 4
     * leading numbering/punctuation characters and up to 2 trailing
     * punctuation characters are then stripped; only a contiguous run at the
     * very start or end is removed. The remainder is accepted when it is 6 to
     * 20 characters long, contains no space or semicolon, and is not made up
     * entirely of ASCII letters, digits and punctuation.
     *
     * @param rows the candidate list, modified in place
     * @param row  the raw line to examine; {@code null} is ignored
     * @return the same {@code rows} instance
     */
    public static ArrayList<String> addToRows(ArrayList<String> rows, String row) {
        if (row == null) {
            return rows;
        }
        String trimmed = row.trim();
        if (trimmed.length() < MIN_RAW_LENGTH) {
            return rows;
        }

        // Strip the leading numbering; stop at the first ordinary character so
        // that text in front of an embedded digit is never cut away.
        int start = 0;
        while (start < MAX_PREFIX_CHARS
                && PREFIX_CHARS.indexOf(trimmed.charAt(start)) >= 0) {
            start++;
        }

        // Strip the trailing punctuation under the same contiguity rule.
        int end = trimmed.length();
        while (trimmed.length() - end < MAX_SUFFIX_CHARS
                && end > start
                && SUFFIX_CHARS.indexOf(trimmed.charAt(end - 1)) >= 0) {
            end--;
        }

        String candidate = trimmed.substring(start, end);
        int length = candidate.length();
        if (length < MIN_TITLE_LENGTH || length > MAX_TITLE_LENGTH) {
            return rows;
        }

        // Rows containing a space are dropped; the semicolon test also
        // excludes keyword lines.
        boolean allNoise = true;
        for (int i = 0; i < length; i++) {
            char c = candidate.charAt(i);
            if (REJECT_CHARS.indexOf(c) >= 0) {
                return rows;
            }
            if (NOISE_CHARS.indexOf(c) < 0) {
                allNoise = false;
            }
        }
        // Filter out meaningless strings (nothing but letters, digits, symbols).
        if (allNoise) {
            return rows;
        }

        rows.add(candidate);
        return rows;
    }

    /**
     * Builds the term/frequency list of a file's extracted text.
     *
     * @param file the document to analyse
     * @return the terms of the document ordered by descending frequency
     * @throws Exception if text extraction or indexing fails
     */
    public static ArrayList<TermItem> sortList(File file) throws Exception {
        return termsByFrequency(new Tika().parseToString(file));
    }

    /**
     * Indexes {@code text} as a single document in an in-memory Lucene index
     * and returns its terms ordered by descending frequency. Terms with equal
     * frequency keep the order in which the index enumerates them.
     */
    private static ArrayList<TermItem> termsByFrequency(String text) throws Exception {
        // Build the index.
        RAMDirectory ramDir = new RAMDirectory();
        Analyzer analyzer = new PaodingAnalyzer();
        IndexWriter writer = new IndexWriter(ramDir, analyzer, true);
        try {
            writer.setUseCompoundFile(false);
            Document document = new Document();
            document.add(new Field("content", text,
                    Field.Store.YES, Field.Index.TOKENIZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
            writer.addDocument(document);
            writer.optimize();
        } finally {
            writer.close();
        }

        // Collect every term together with its frequency in the document.
        ArrayList<TermItem> termList = new ArrayList<TermItem>();
        IndexReader reader = IndexReader.open(ramDir);
        try {
            TermEnum termEnum = reader.terms();
            try {
                while (termEnum.next()) {
                    Term term = termEnum.term();
                    TermItem item = new TermItem();
                    item.setTermName(term.text());
                    TermPositions positions = reader.termPositions(term);
                    try {
                        while (positions.next()) {
                            item.setTermFreq(positions.freq());
                        }
                    } finally {
                        positions.close();
                    }
                    termList.add(item);
                }
            } finally {
                termEnum.close();
            }
        } finally {
            reader.close();
        }

        // Highest frequency first.
        Collections.sort(termList, new Comparator<TermItem>() {
            @Override
            public int compare(TermItem a, TermItem b) {
                int freqA = a.getTermFreq();
                int freqB = b.getTermFreq();
                return freqA < freqB ? 1 : (freqA > freqB ? -1 : 0);
            }
        });
        return termList;
    }
}
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
基于lucene的全文检索系统 (343个子文件)
all-wcprops 1KB
all-wcprops 662B
Title.class 8KB
SearchAction.class 5KB
FileUploadAction.class 5KB
Test.class 4KB
CorpusUploadAction.class 4KB
FileDownloadAction.class 3KB
PaodingChineseAnalyzer.class 2KB
IndexerAction.class 2KB
Index.class 2KB
FileShowAction.class 2KB
Extract.class 2KB
CorpusShowAction.class 2KB
TermItem.class 744B
.classpath 529B
org.eclipse.wst.common.component 452B
org.eclipse.wst.jsdt.ui.superType.container 49B
t-base.dic 2.36MB
china.dic 71KB
fuzhou.dic 27KB
festival.dic 2KB
x-confucian-family-name.dic 2KB
x-unit.dic 1KB
x-noise-charactor.dic 626B
x-noise-word.dic 592B
nation.dic 535B
star-domestic.dic 379B
appellation.dic 260B
company.dic 256B
administrative.dic 214B
org-foreign.dic 204B
beijing.dic 187B
x-for-combinatorics.dic 167B
language.dic 141B
name-foreign.dic 101B
contemporary-words.dic 41B
quanzhou.dic 38B
comupter-science.dic 24B
star-foreign.dic 20B
xiamen.dic 9B
oceania.dic 0B
africa.dic 0B
europe.dic 0B
america.dic 0B
japan.dic 0B
korea.dic 0B
taiwan.dic 0B
org-domestic.dic 0B
entries 1KB
entries 905B
format 2B
format 2B
text5-2-1.htm 80KB
text7-1-0.htm 61KB
text13-2-1old.htm 41KB
text11-2-4a.htm 39KB
text15-4-0.htm 32KB
text6-4-0.htm 32KB
text13-2-1.htm 31KB
text6-1-3.htm 28KB
text11-4-2b.htm 28KB
text6-1-2.htm 27KB
text15-5-0.htm 26KB
text11-4-1d.htm 26KB
6-1-0.htm 25KB
text7-2-0.htm 24KB
text13-3-2old.htm 23KB
text4-1-0.htm 23KB
text13-3-2.htm 21KB
text13-3-1old.htm 21KB
text13-2-2old.htm 20KB
text2-2-0.htm 20KB
text3-1-0.htm 20KB
text10-3-0a.htm 20KB
text13-3-1.htm 20KB
text2-1-0.htm 19KB
text9-1-0.htm 19KB
text6-1-1.htm 19KB
text15-2-0.htm 19KB
text15-6-0.htm 19KB
text6-3-0.htm 18KB
text4-2-0.htm 18KB
6-1-1.htm 18KB
text5-4-0.htm 18KB
text2-4-0.htm 17KB
text6-1-0.htm 17KB
text9-3-0.htm 17KB
text13-1-0b.htm 17KB
text3-2-0.htm 16KB
text5-2-0.htm 16KB
text11-1-2c.htm 16KB
t8-2-1.htm 15KB
text11-3-2b.htm 14KB
text5-1-0.htm 14KB
text6-5-0.htm 14KB
text14-2-0.htm 14KB
text2-3-0.htm 14KB
text3-3-0.htm 14KB
text5-3-0.htm 14KB
共 343 条
- 1
- 2
- 3
- 4
wugen1
- 粉丝: 1
- 资源: 40
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功
- 1
- 2
前往页