package ir.vsr;
import java.io.*;
import java.util.*;
import java.lang.*;
import ir.utilities.*;
import ir.classifiers.*;
/**
* An inverted index for vector-space information retrieval. Contains
* methods for creating an inverted index from a set of documents
* and retrieving ranked matches to queries using standard TF/IDF
* weighting and cosine similarity.
*
* @author Ray Mooney
*/
public class InvertedIndex{
/** The maximum number of retrieved documents for a query to present to the user
* at a time */
public static final int MAX_RETRIEVALS = 10;
/** A HashMap where tokens are indexed. Each indexed token maps
* to a TokenInfo. */
public HashMap tokenHash = null;
/** A list of all indexed documents. Elements are DocumentReference's. */
public ArrayList docRefs = null;
/** The directory from which the indexed documents come. */
public File dirFile = null;
/** The type of Documents (text, HTML). See docType in DocumentIterator. */
public short docType = DocumentIterator.TYPE_TEXT;
/** Whether tokens should be stemmed with Porter stemmer */
public boolean stem = false;
/** Whether relevance feedback using the Ide_regular algorithm is used */
public boolean feedback = false;
/** Create an inverted index of the documents in a directory.
 * @param dirFile The directory of files to index.
 * @param docType The type of documents to index (See docType in DocumentIterator)
 * @param stem Whether tokens should be stemmed with Porter stemmer.
 * @param feedback Whether relevance feedback should be used.
 */
public InvertedIndex(File dirFile, short docType, boolean stem, boolean feedback) {
  // Record the indexing configuration before any processing starts.
  this.feedback = feedback;
  this.stem = stem;
  this.docType = docType;
  this.dirFile = dirFile;
  // Begin with an empty token table and an empty document list.
  this.tokenHash = new HashMap();
  this.docRefs = new ArrayList();
  // Build the whole index immediately from the given directory.
  indexDocuments();
}
/** Create an inverted index of the documents in a directory.
 * @param dirVector The vector containing the Example objects of the files to Index
 * @param docType The type of documents to index (See docType in DocumentIterator)
 * @param stem Whether tokens should be stemmed with Porter stemmer.
 * @param feedback Whether relevance feedback should be used.
 */
public InvertedIndex(Vector dirVector, short docType, boolean stem, boolean feedback) {
  // Record the indexing configuration before any processing starts.
  // Note: no source directory applies here, so dirFile stays null.
  this.feedback = feedback;
  this.stem = stem;
  this.docType = docType;
  // Begin with an empty token table and an empty document list.
  this.tokenHash = new HashMap();
  this.docRefs = new ArrayList();
  // Build the whole index immediately from the supplied examples.
  indexDocuments(dirVector);
}
/** Index the documents in dirFile. */
protected void indexDocuments() {
  if (!tokenHash.isEmpty() || !docRefs.isEmpty()) {
    // Re-indexing into an already-populated index is not supported; abort.
    System.out.println("\nCannot indexDocuments more than once in the same InvertedIndex");
    System.exit(1);
  }
  // Walk every document found in the directory.
  DocumentIterator docIter = new DocumentIterator(dirFile, docType, stem);
  System.out.println("Indexing documents in " + dirFile);
  while (docIter.hasMoreDocuments()) {
    FileDocument doc = docIter.nextDocument();
    // Bag-of-words vector for this document (token -> occurrence Weight).
    HashMapVector vector = doc.hashMapVector();
    DocumentReference docRef = new DocumentReference(doc);
    // Remember this document as part of the indexed collection.
    docRefs.add(docRef);
    // Post every token occurrence in the vector to the inverted index.
    for (Iterator mapEntries = vector.iterator(); mapEntries.hasNext(); ) {
      Map.Entry entry = (Map.Entry) mapEntries.next();
      String token = (String) entry.getKey();
      // The occurrence count is held in the value of the Weight.
      int count = (int) ((Weight) entry.getValue()).getValue();
      indexToken(token, count, docRef);
    }
  }
  // With every occurrence posted, finalize the IDF weights and the
  // lengths of the weighted document vectors.
  computeIDFandDocumentLengths();
  System.out.println("Indexed " + docRefs.size() + " documents with " + size() + " unique terms.");
}
/** Index the training files in dirVector. */
public void indexDocuments(Vector dirVector) {
if (!tokenHash.isEmpty() || !docRefs.isEmpty()) {
// Currently can only index one set of documents when an index is created
System.out.println("\nCannot indexDocuments more than once in the same InvertedIndex");
System.exit(1);
}
Enumeration enum = dirVector.elements();
// Loop, processing each of the documents
while (enum.hasMoreElements()) {
Example example = (Example) enum.nextElement();
FileDocument doc = example.getDocument();
// Create a document vector for this document
HashMapVector vector = example.getHashMapVector();
// Create a reference to this document
DocumentReference docRef = new DocumentReference(doc);
// Add this document to the list of documents indexed
docRefs.add(docRef);
// Iterate through each of the tokens in the document
Iterator mapEntries = vector.iterator();
while (mapEntries.hasNext()) {
Map.Entry entry = (Map.Entry)mapEntries.next();
// An entry in the HashMap maps a token to a Weight
String token = (String)entry.getKey();
// The count for the token is in the value of the Weight
int count = (int)((Weight)entry.getValue()).getValue();
// Add an occurence of this token to the inverted index pointing to this document
indexToken(token, count, docRef);
}
}
// Now that all documents have been processed, we can calculate the IDF weights for
// all tokens and the resulting lengths of all weighted document vectors.
computeIDFandDocumentLengths();
System.out.println("Indexed " + docRefs.size() + " documents with " + size() + " unique terms.");
}
/** Add a token occurrence to the index.
 * @param token The token to index.
 * @param count The number of times it occurs in the document.
 * @param docRef A reference to the Document it occurs in.
 */
protected void indexToken(String token, int count, DocumentReference docRef) {
  // Look up the posting information for this token, creating a fresh
  // entry the first time the token is ever seen.
  TokenInfo info = (TokenInfo) tokenHash.get(token);
  if (info == null) {
    info = new TokenInfo();
    tokenHash.put(token, info);
  }
  // Append this (document, count) posting to the token's occurrence list.
  info.occList.add(new TokenOccurrence(docRef, count));
}
/** Compute the IDF factor for every token in the index and the length
* of the document vector for every document referenced in the index. */
protected void computeIDFandDocumentLengths() {
// Let N be the total number of documents indexed
double N = docRefs.size();
// Iterate through each of the tokens in the index
Iterator mapEntries = tokenHash.entrySet().iterator();
while (mapEntries.hasNext()) {
// Get the token and the tokenInfo for each entry in the HashMap
Map.Entry entry = (Map.Entry)mapEntries.next();
String token = (String)entry.getKey();
TokenInfo tokenInfo = (TokenInfo)entry.getValue();
// Get the total number of documents in which this token occurs
double numDocRefs = tokenInfo.occList.size();
// Calculate the IDF factor for this token
double idf = Math.log(N/numDocRefs);
// System.out.println(token + " occurs in " + Math.round(numDocRefs) + " docs so IDF=" + idf);
if (idf == 0.0)
// If IDF is 0, then just remove this inconsequential token from the index
mapEntries.remove();
else {
tokenInfo.idf = idf;
// In order to compute document vector lengths, sum the
// square of the weights (IDF * occurrence count) across
// every token occurrence for each document.
for(int i = 0; i < tokenInfo.o
没有合适的资源?快使用搜索试试~ 我知道了~
vsr.tar.gz_operation
1.该资源内容由用户上传,如若侵权请联系客服进行举报
2.虚拟产品一经售出概不退款(资源遇到问题,请及时私信上传者)
2.虚拟产品一经售出概不退款(资源遇到问题,请及时私信上传者)
版权申诉
0 下载量 4 浏览量
2022-09-20
22:48:01
上传
评论
收藏 25KB GZ 举报
温馨提示
共26个文件
java:13个
class:13个
a space consisting of vectors, together with the associative and commutative operation of addition of vectors, and the associative and distributive operation of multiplication of vectors by scalars.
资源推荐
资源详情
资源评论
收起资源包目录
vsr.tar.gz (26个子文件)
vsr
InvertedIndex.java 21KB
Document.class 2KB
DocumentIterator.java 3KB
TextFileDocument.class 2KB
Retrieval.java 1KB
Retrieval.class 497B
Feedback.class 3KB
TokenInfo.class 337B
InvertedIndex.class 9KB
DocumentReference.java 1KB
Document.java 5KB
DocumentIterator.class 2KB
TokenOccurrence.class 370B
HTMLFileDocument.java 3KB
TokenOccurrence.java 546B
HTMLFileDocument.class 2KB
FileDocument.java 660B
FileDocument.class 967B
TextStringDocument.class 1KB
HashMapVector.class 2KB
HashMapVector.java 4KB
TextFileDocument.java 2KB
DocumentReference.class 835B
Feedback.java 4KB
TextStringDocument.java 1KB
TokenInfo.java 700B
共 26 条
- 1
资源评论
JonSco
- 粉丝: 72
- 资源: 1万+
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
最新资源
- 微信小程序 - 图书管理系统源码.zip
- 微信小程序 - 图片自适应 ,富文本解析源码.zip
- 微信小程序 - 同乐居商城:购物车合算源码
- 1、根据输入的三条边值判断能组成何种三角形,并设计测试数据进行判定覆盖测试 三条边为变量a、b、c,范围为1≤边值≤10,不在范
- SQL server 练习题目8道(小白教学).zip
- Python 手写实现 iD3 决策树算法-根据信息增益公式.zip
- 411675952289057车联助手-小窗版(三星)3.5.1.apk
- 三种快速排序方法合并在一个文件中以便直接运行的Python代码示例
- 937712277954201实习5.word
- 2程序语言基础知识pdf1_1716337722703.jpeg
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功