基于lucene的全文检索系统_计量规程全文检索系统资源-CSDN文库

共343个文件

htm：207个

dic：31个

jar：20个

lucene

索引

4星 · 超过85%的资源需积分: 11 35 浏览量 2012-05-08 19:10:16 上传评论 3 收藏 31.09MB RAR 举报

资源推荐

资源详情

资源评论

收起资源包目录

基于lucene的全文检索系统（343个子文件）

all-wcprops 1KB

all-wcprops 662B

Title.class 8KB

SearchAction.class 5KB

FileUploadAction.class 5KB

Test.class 4KB

CorpusUploadAction.class 4KB

FileDownloadAction.class 3KB

PaodingChineseAnalyzer.class 2KB

IndexerAction.class 2KB

Index.class 2KB

FileShowAction.class 2KB

Extract.class 2KB

CorpusShowAction.class 2KB

TermItem.class 744B

.classpath 529B

org.eclipse.wst.common.component 452B

org.eclipse.wst.jsdt.ui.superType.container 49B

t-base.dic 2.36MB

china.dic 71KB

fuzhou.dic 27KB

festival.dic 2KB

x-confucian-family-name.dic 2KB

x-unit.dic 1KB

x-noise-charactor.dic 626B

x-noise-word.dic 592B

nation.dic 535B

star-domestic.dic 379B

appellation.dic 260B

company.dic 256B

administrative.dic 214B

org-foreign.dic 204B

beijing.dic 187B

x-for-combinatorics.dic 167B

language.dic 141B

name-foreign.dic 101B

contemporary-words.dic 41B

quanzhou.dic 38B

comupter-science.dic 24B

star-foreign.dic 20B

xiamen.dic 9B

oceania.dic 0B

africa.dic 0B

europe.dic 0B

america.dic 0B

japan.dic 0B

korea.dic 0B

taiwan.dic 0B

org-domestic.dic 0B

entries 1KB

entries 905B

format 2B

text5-2-1.htm 80KB

text7-1-0.htm 61KB

text13-2-1old.htm 41KB

text11-2-4a.htm 39KB

text15-4-0.htm 32KB

text6-4-0.htm 32KB

text13-2-1.htm 31KB

text6-1-3.htm 28KB

text11-4-2b.htm 28KB

text6-1-2.htm 27KB

text15-5-0.htm 26KB

text11-4-1d.htm 26KB

6-1-0.htm 25KB

text7-2-0.htm 24KB

text13-3-2old.htm 23KB

text4-1-0.htm 23KB

text13-3-2.htm 21KB

text13-3-1old.htm 21KB

text13-2-2old.htm 20KB

text2-2-0.htm 20KB

text3-1-0.htm 20KB

text10-3-0a.htm 20KB

text13-3-1.htm 20KB

text2-1-0.htm 19KB

text9-1-0.htm 19KB

text6-1-1.htm 19KB

text15-2-0.htm 19KB

text15-6-0.htm 19KB

text6-3-0.htm 18KB

text4-2-0.htm 18KB

6-1-1.htm 18KB

text5-4-0.htm 18KB

text2-4-0.htm 17KB

text6-1-0.htm 17KB

text9-3-0.htm 17KB

text13-1-0b.htm 17KB

text3-2-0.htm 16KB

text5-2-0.htm 16KB

text11-1-2c.htm 16KB

t8-2-1.htm 15KB

text11-3-2b.htm 14KB

text5-1-0.htm 14KB

text6-5-0.htm 14KB

text14-2-0.htm 14KB

text2-3-0.htm 14KB

text3-3-0.htm 14KB

text5-3-0.htm 14KB

共 343 条

package com.action; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.StringReader; import java.util.ArrayList; import net.paoding.analysis.analyzer.PaodingAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.TermPositions; import org.apache.lucene.store.RAMDirectory; import org.apache.tika.Tika; import org.apache.tika.metadata.Metadata; public class Title { public static String generateTitle(File file) throws Exception { Tika tika = new Tika(); String content = tika.parseToString(file).trim(); BufferedReader fileContent = new BufferedReader(new StringReader( content)); int count = 0; // 把符合条件的正文行加入到候选标题中 String row = fileContent.readLine(); ArrayList<String> rows = new ArrayList<String>(); while (null != row) { rows = addToRows(rows, row); row = fileContent.readLine(); count++; } fileContent.close(); FileInputStream is = new FileInputStream(file); Metadata md = new Metadata(); tika.parseToString(is, md); String[] values = md.getValues("title"); if (values.length != 0) { rows = addToRows(rows, values[0]); } if (rows.size() != 0) { Float[] compare = new Float[rows.size()]; int len=compare.length; for (int i = 0; i < len; i++) { compare[i] =(float)0.3*(len-i); } ArrayList termList = sortList(file); for (int i = 0; i < rows.size(); i++) { String title = rows.get(i); int compareTimes; if (termList.size() > 25) { compareTimes = 25; } else { compareTimes = termList.size(); } for (int j = 0; j < compareTimes; j++) { TermItem term = (TermItem) termList.get(j); if (title.indexOf(term.getTermName()) != -1) { compare[i] = compare[i] + (compareTimes - j) * 0.2f; } } } System.out.println("候选标题对应统计结果"); for(int i=0;i<compare.length;i++){ System.out.println(compare[i]); } if (compare[titleId(compare)] < 3.5) { return null; } else { System.out.println("得分：" + compare[titleId(compare)]); System.out.println("最终标题：" + rows.get(titleId(compare))); return rows.get(titleId(compare)); } } return null; } public static int titleId(Float[] compare) { float max = compare[0]; int id = 0; for (int i = 1; i < compare.length; i++) { if (max < compare[i]) { max = compare[i]; id = i; } } return id; } public static ArrayList<String> addToRows(ArrayList<String> rows, String row) { char[] prefix = { '1', '2', '3', '4', '5', '6', '7', '8', '9', '一', '二', '三', '四', '五', '六', '七', '八', '九', '.', '、', '(', ')', '（', '）', ' ', '?', '/', '？' }; char[] suffix = { '.', ',', '。', '，', ';', '；', '、', '?', '？' }; char[] check = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '.', '(', ')', '?', ',', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '=', '+', '[', ']', '!', '&', '|', '<', '>', ';', '\'', '\"', '*', '？', '-', '{', '}', ':', '/', '_','：' }; row = row.trim(); String tempRow = row; int rlength = row.length(); if (rlength < 7) { return rows; } char[] rowChar = row.toCharArray(); // 删除前缀 for (int i = 0; i < 4; i++) { for (int j = 0; j < prefix.length; j++) { if (rowChar[i] == prefix[j]) { row = tempRow.substring(i + 1); break; } } } rowChar = row.toCharArray(); String tempRow1 = row; int temp = tempRow1.length() - 1; // 删除后缀 for (int i = 0; i < 2; i++) { for (int j = 0; j < suffix.length; j++) { if (rowChar[temp - i] == suffix[j]) { row = tempRow1.substring(0, temp - i); break; } } } if (row.length() < 6 || row.length() > 20) { return rows; } // 如果行内容包含空格，则删除该行作为候选标题,同时排除关键字行 rowChar = row.toCharArray(); for (int i = 0; i < row.length(); i++) { if (rowChar[i] == ' ' || rowChar[i] == ';' || rowChar[i] == '；') { return rows; } } // 刷选无意义的字符串 int Slength = 0; for (int i = 0; i < row.length(); i++) { for (int j = 0; j < check.length; j++) { if (rowChar[i] == check[j]) { Slength++; break; } } } if (Slength == row.length()) { return rows; } rows.add(row); return rows; } //获取词项-词频列表 public static ArrayList sortList(File file) throws Exception { // 建立索引 Analyzer TextAnalyzer = new PaodingAnalyzer(); RAMDirectory ramDir = new RAMDirectory(); IndexWriter TextWriter = new IndexWriter(ramDir, TextAnalyzer, true); TextWriter.setUseCompoundFile(false); Document document = new Document(); Tika tika = new Tika(); Field content = new Field("content", tika.parseToString(file), Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); document.add(content); TextWriter.addDocument(document); TextWriter.optimize(); TextWriter.close(); // 把词项和词项对应的文档中出现的频率存入List中，并按频率从高到低对词项排序 IndexReader reader = IndexReader.open(ramDir); TermEnum termEnum = reader.terms(); ArrayList<TermItem> termList = new ArrayList<TermItem>(); while (termEnum.next()) { TermItem termItem = new TermItem(); Term term = termEnum.term(); termItem.setTermName(term.text()); TermPositions termPositions = reader.termPositions(termEnum.term()); while (termPositions.next()) { termItem.setTermFreq(termPositions.freq()); } termList.add(termItem); } for (int i = 0; i < termList.size(); i++) { TermItem ti = termList.get(i); for (int j = i + 1; j < termList.size(); j++) { TermItem tj = termList.get(j); int tiFreq = ti.getTermFreq(); int tjFreq = tj.getTermFreq(); if (tiFreq < tjFreq) { String tiTerm = ti.getTermName(); String tjTerm = tj.getTermName(); String termTemp = tiTerm; ti.setTermName(tjTerm); tj.setTermName(termTemp); int freqTemp = tiFreq; ti.setTermFreq(tjFreq); tj.setTermFreq(freqTemp); } } } return termList; } }

评论收藏

内容反馈