人工智能-项目实践-搜索引擎-Web信息检索与处理课程实验1-一个简易的搜索引擎

共70个文件

class：11个

java：10个

jar：7个

版权申诉

人工智能

搜索引擎

课程资源

java

194 浏览量 · 2024-02-26 14:41:15 上传 · 评论 · 收藏 · 4.53MB ZIP · 举报

资源推荐

资源详情

资源评论

收起资源包目录

web_lab01-master.zip （70个子文件）

web_lab01-master

S_E_by_Jeeping

.classpath 1KB

.settings

org.eclipse.wst.jsdt.ui.superType.name 6B

org.eclipse.jdt.core.prefs 357B

.jsdtscope 555B

org.eclipse.wst.common.component 486B

org.eclipse.wst.common.project.facet.core.xml 335B

org.eclipse.wst.jsdt.ui.superType.container 49B

src

workspace

Indexer.java 1KB

Doc.java 247B

QuerySearcher.java 3KB

Result.java 670B

Day2Weekday.java 489B

CreateIndex.java 4KB

Txt2String.java 1KB

IKTokenizer.java 3KB

IKAnalyzer.java 1KB

Html2Docs.java 2KB

img

book48px.png 3KB

search32.png 2KB

search.png 18KB

build

classes

workspace

IKTokenizer.class 2KB

Result.class 705B

CreateIndex.class 4KB

Day2Weekday.class 694B

Html2Docs.class 3KB

SpellCheck.class 2KB

IKAnalyzer.class 1KB

Txt2String.class 1KB

QuerySearcher.class 5KB

Indexer.class 1KB

Doc.class 420B

WebContent

searcher.jsp 4KB

WEB-INF

lib

lucene-queryparser-5.3.1.jar 392KB

lucene-suggest-5.3.1.jar 240KB

lucene-core-5.3.1.jar 2.25MB

servlet-api.jar 193KB

IKAnalyzer-5.0.jar 1.1MB

lucene-memory-5.3.1.jar 33KB

lucene-highlighter-5.3.1.jar 141KB

web.xml 646B

bootstrap

bootstrap.js 68KB

npm.js 484B

bootstrap.min.js 36KB

jquery.min.js 94KB

css

bootstrap-theme.min.css.map 25KB

bootstrap.min.css 118KB

bootstrap-theme.css.map 47KB

bootstrap-theme.css 26KB

bootstrap.css.map 380KB

bootstrap.css 143KB

bootstrap-theme.min.css 23KB

bootstrap.min.css.map 529KB

fonts

glyphicons-halflings-regular.svg 106KB

glyphicons-halflings-regular.ttf 44KB

glyphicons-halflings-regular.woff 23KB

glyphicons-halflings-regular.eot 20KB

glyphicons-halflings-regular.woff2 18KB

img

book48px.png 3KB

search1.jpg 24KB

search3.jpg 14KB

search32.png 2KB

search.png 18KB

search1_2.jpg 3KB

search1_1.jpg 2KB

ss1_2.jpg 10KB

advance.jsp 1KB

index.html 1KB

scut.css 257B

META-INF

MANIFEST.MF 36B

.project 1011B

// Web page preprocessing & index building
package workspace;

import java.io.IOException;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;

/**
 * Batch entry point that turns the raw news dump files into a Lucene index.
 *
 * <p>The pipeline has three stages, each exposed as a static method:
 * {@link #txt2String()} reads the dump files and splits them into per-document
 * strings, {@link #string2Docs(List)} extracts the individual fields of every
 * document, and {@link #docs2Index(Doc[])} writes those fields into the index.
 * Input file locations and the index directory are hard-coded Windows paths.
 */
public class CreateIndex {

    /**
     * Reads the dump files and returns their content split into one string per document.
     *
     * <p>The five quarterly files are read in order; q1+q2 and q3+q4 of 2012 are each
     * concatenated before being handed to {@code Html2Docs.matchDoc}, and 2013 q1 is
     * processed on its own. The results are appended to a single list in that order.
     *
     * @return the list returned by {@code matchDoc} for the first buffer, extended with
     *         the matches of the remaining buffers
     * @throws IOException if one of the input files cannot be read
     */
    public static List<String> txt2String() throws IOException {
        // All files are read before any splitting happens, so an unreadable file
        // fails fast without doing partial work.
        String firstHalf2012 = Txt2String.readFile("d:\\SinaNews\\2012.q1.txt")
                + Txt2String.readFile("d:\\SinaNews\\2012.q2.txt");
        String secondHalf2012 = Txt2String.readFile("d:\\SinaNews\\2012.q3.txt")
                + Txt2String.readFile("d:\\SinaNews\\2012.q4.txt");
        String firstQuarter2013 = Txt2String.readFile("d:\\SinaNews\\2013.q1.txt");

        Html2Docs retyper = new Html2Docs();
        // Each element is the raw text of a single document.
        List<String> rawDocs = retyper.matchDoc(firstHalf2012);
        rawDocs.addAll(retyper.matchDoc(secondHalf2012));
        rawDocs.addAll(retyper.matchDoc(firstQuarter2013));
        return rawDocs;
    }

    /**
     * Converts raw per-document strings into {@link Doc} objects with their fields extracted.
     *
     * <p>For every input string a {@code Doc} is created, the raw text is kept in
     * {@code buffer}, and the remaining fields are filled in by the corresponding
     * {@code Html2Docs} extraction calls ({@code matchLabel} for url/title,
     * {@code match} for description/keywords/publishid/subjectid, {@code getcontent}
     * for the body).
     *
     * @param temp raw document strings, one per document
     * @return an array with one populated {@code Doc} per input string, in input order
     */
    public static Doc[] string2Docs(List<String> temp) {
        int docCount = temp.size();
        Doc[] docs = new Doc[docCount];
        Html2Docs retyper = new Html2Docs();

        for (int i = 0; i < docCount; i++) {
            String raw = temp.get(i);
            Doc doc = new Doc();
            doc.buffer = raw;
            doc.url = retyper.matchLabel(raw, "url");
            doc.description = retyper.match(raw, "description");
            doc.keyword = retyper.match(raw, "keywords");
            doc.title = retyper.matchLabel(raw, "title");
            doc.publishid = retyper.match(raw, "publishid");
            doc.subjectid = retyper.match(raw, "subjectid");
            doc.content = retyper.getcontent(raw);
            docs[i] = doc;
        }
        return docs;
    }

    /**
     * Writes every {@link Doc} into the Lucene index located at {@code D:\outputIndex}.
     *
     * <p>Each {@code Doc} becomes one Lucene {@link Document}. The url, publishid and
     * subjectid fields use {@link StringField#TYPE_STORED}; description, keywords,
     * title and content use {@link TextField#TYPE_STORED}.
     *
     * <p>Failures are propagated to the caller instead of being printed and ignored,
     * so a broken or partial index build is not mistaken for a successful one. The
     * indexer is always closed; if closing fails after an indexing failure, the close
     * error is attached to the original exception as a suppressed exception.
     *
     * @param docs the documents to index
     * @throws Exception if the indexer cannot be opened, a document cannot be added,
     *                   or the indexer cannot be closed
     */
    public static void docs2Index(Doc[] docs) throws Exception {
        int docCount = docs.length;
        String indexDir = "D:\\outputIndex";
        Indexer index = new Indexer(indexDir);

        Exception failure = null;
        try {
            for (int i = 0; i < docCount; i++) {
                Doc doc = docs[i];
                // One Document at a time: nothing needs it after addDocument, so there
                // is no reason to keep the whole corpus reachable in an array.
                Document document = new Document();
                document.add(new Field("url", doc.url, StringField.TYPE_STORED));
                document.add(new Field("description", doc.description, TextField.TYPE_STORED));
                document.add(new Field("keywords", doc.keyword, TextField.TYPE_STORED));
                document.add(new Field("title", doc.title, TextField.TYPE_STORED));
                document.add(new Field("publishid", doc.publishid, StringField.TYPE_STORED));
                document.add(new Field("subjectid", doc.subjectid, StringField.TYPE_STORED));
                document.add(new Field("content", doc.content, TextField.TYPE_STORED));
                index.writer.addDocument(document);
            }
        } catch (Exception e) {
            failure = e;
            throw e;
        } finally {
            try {
                index.close();
            } catch (Exception closeError) {
                if (failure == null) {
                    throw closeError;
                }
                // Keep the indexing failure as the primary error; do not let the
                // close failure mask it.
                failure.addSuppressed(closeError);
            }
        }
    }

    /**
     * Runs the full pipeline: read the dump files, extract the documents, build the index.
     *
     * @param args unused
     * @throws Exception if reading the input or building the index fails
     */
    public static void main(String[] args) throws Exception {
        // Original author's note: the methods above had to be made static, but after
        // making them static the program reportedly blew the stack.
        docs2Index(string2Docs(txt2String()));
    }
}

评论收藏

内容反馈

版权申诉