package ir.vsr;
import java.io.*;
import java.util.*;
import java.lang.*;
import ir.utilities.*;
import ir.classifiers.*;
/**
* An inverted index for vector-space information retrieval. Contains
* methods for creating an inverted index from a set of documents
* and retrieving ranked matches to queries using standard TF/IDF
* weighting and cosine similarity.
*
* @author Ray Mooney
*/
public class InvertedIndex{
/** The maximum number of retrieved documents for a query to present to the user
* at a time */
public static final int MAX_RETRIEVALS = 10;
/** A HashMap where tokens are indexed. Each indexed token maps
* to a TokenInfo. */
public HashMap tokenHash = null;
/** A list of all indexed documents. Elements are DocumentReference's. */
public ArrayList docRefs = null;
/** The directory from which the indexed documents come. */
public File dirFile = null;
/** The type of Documents (text, HTML). See docType in DocumentIterator. */
public short docType = DocumentIterator.TYPE_TEXT;
/** Whether tokens should be stemmed with Porter stemmer */
public boolean stem = false;
/** Whether relevance feedback using the Ide_regular algorithm is used */
public boolean feedback = false;
/** Create an inverted index of the documents in a directory.
 * @param dirFile The directory of files to index.
 * @param docType The type of documents to index (See docType in DocumentIterator)
 * @param stem Whether tokens should be stemmed with Porter stemmer.
 * @param feedback Whether relevance feedback should be used.
 */
public InvertedIndex(File dirFile, short docType, boolean stem, boolean feedback) {
  // Record the indexing configuration before any processing starts.
  this.feedback = feedback;
  this.stem = stem;
  this.docType = docType;
  this.dirFile = dirFile;
  // Begin with an empty token table and an empty document list.
  this.tokenHash = new HashMap();
  this.docRefs = new ArrayList();
  // Build the whole index immediately from the given directory.
  indexDocuments();
}
/** Create an inverted index of the documents in a directory.
 * @param dirVector The vector containing the Example objects of the files to Index
 * @param docType The type of documents to index (See docType in DocumentIterator)
 * @param stem Whether tokens should be stemmed with Porter stemmer.
 * @param feedback Whether relevance feedback should be used.
 */
public InvertedIndex(Vector dirVector, short docType, boolean stem, boolean feedback) {
  // Record the indexing configuration before any processing starts.
  // Note: no source directory applies here, so dirFile stays null.
  this.feedback = feedback;
  this.stem = stem;
  this.docType = docType;
  // Begin with an empty token table and an empty document list.
  this.tokenHash = new HashMap();
  this.docRefs = new ArrayList();
  // Build the whole index immediately from the supplied examples.
  indexDocuments(dirVector);
}
/** Index the documents in dirFile. */
protected void indexDocuments() {
  if (!tokenHash.isEmpty() || !docRefs.isEmpty()) {
    // Re-indexing into an already-populated index is not supported; abort.
    System.out.println("\nCannot indexDocuments more than once in the same InvertedIndex");
    System.exit(1);
  }
  // Walk every document found in the directory.
  DocumentIterator docIter = new DocumentIterator(dirFile, docType, stem);
  System.out.println("Indexing documents in " + dirFile);
  while (docIter.hasMoreDocuments()) {
    FileDocument doc = docIter.nextDocument();
    // Bag-of-words vector for this document (token -> occurrence Weight).
    HashMapVector vector = doc.hashMapVector();
    DocumentReference docRef = new DocumentReference(doc);
    // Remember this document as part of the indexed collection.
    docRefs.add(docRef);
    // Post every token occurrence in the vector to the inverted index.
    for (Iterator mapEntries = vector.iterator(); mapEntries.hasNext(); ) {
      Map.Entry entry = (Map.Entry) mapEntries.next();
      String token = (String) entry.getKey();
      // The occurrence count is held in the value of the Weight.
      int count = (int) ((Weight) entry.getValue()).getValue();
      indexToken(token, count, docRef);
    }
  }
  // With every occurrence posted, finalize the IDF weights and the
  // lengths of the weighted document vectors.
  computeIDFandDocumentLengths();
  System.out.println("Indexed " + docRefs.size() + " documents with " + size() + " unique terms.");
}
/** Index the training files in dirVector. */
public void indexDocuments(Vector dirVector) {
if (!tokenHash.isEmpty() || !docRefs.isEmpty()) {
// Currently can only index one set of documents when an index is created
System.out.println("\nCannot indexDocuments more than once in the same InvertedIndex");
System.exit(1);
}
Enumeration enum = dirVector.elements();
// Loop, processing each of the documents
while (enum.hasMoreElements()) {
Example example = (Example) enum.nextElement();
FileDocument doc = example.getDocument();
// Create a document vector for this document
HashMapVector vector = example.getHashMapVector();
// Create a reference to this document
DocumentReference docRef = new DocumentReference(doc);
// Add this document to the list of documents indexed
docRefs.add(docRef);
// Iterate through each of the tokens in the document
Iterator mapEntries = vector.iterator();
while (mapEntries.hasNext()) {
Map.Entry entry = (Map.Entry)mapEntries.next();
// An entry in the HashMap maps a token to a Weight
String token = (String)entry.getKey();
// The count for the token is in the value of the Weight
int count = (int)((Weight)entry.getValue()).getValue();
// Add an occurence of this token to the inverted index pointing to this document
indexToken(token, count, docRef);
}
}
// Now that all documents have been processed, we can calculate the IDF weights for
// all tokens and the resulting lengths of all weighted document vectors.
computeIDFandDocumentLengths();
System.out.println("Indexed " + docRefs.size() + " documents with " + size() + " unique terms.");
}
/** Add a token occurrence to the index.
 * @param token The token to index.
 * @param count The number of times it occurs in the document.
 * @param docRef A reference to the Document it occurs in.
 */
protected void indexToken(String token, int count, DocumentReference docRef) {
  // Look up the posting information for this token, creating a fresh
  // entry the first time the token is ever seen.
  TokenInfo info = (TokenInfo) tokenHash.get(token);
  if (info == null) {
    info = new TokenInfo();
    tokenHash.put(token, info);
  }
  // Append this (document, count) posting to the token's occurrence list.
  info.occList.add(new TokenOccurrence(docRef, count));
}
/** Compute the IDF factor for every token in the index and the length
* of the document vector for every document referenced in the index. */
protected void computeIDFandDocumentLengths() {
// Let N be the total number of documents indexed
double N = docRefs.size();
// Iterate through each of the tokens in the index
Iterator mapEntries = tokenHash.entrySet().iterator();
while (mapEntries.hasNext()) {
// Get the token and the tokenInfo for each entry in the HashMap
Map.Entry entry = (Map.Entry)mapEntries.next();
String token = (String)entry.getKey();
TokenInfo tokenInfo = (TokenInfo)entry.getValue();
// Get the total number of documents in which this token occurs
double numDocRefs = tokenInfo.occList.size();
// Calculate the IDF factor for this token
double idf = Math.log(N/numDocRefs);
// System.out.println(token + " occurs in " + Math.round(numDocRefs) + " docs so IDF=" + idf);
if (idf == 0.0)
// If IDF is 0, then just remove this inconsequential token from the index
mapEntries.remove();
else {
tokenInfo.idf = idf;
// In order to compute document vector lengths, sum the
// square of the weights (IDF * occurrence count) across
// every token occurrence for each document.
for(int i = 0; i < tokenInfo.o
没有合适的资源?快使用搜索试试~ 我知道了~
vsr.tar.gz_operation
1.该资源内容由用户上传,如若侵权请联系客服进行举报
2.虚拟产品一经售出概不退款(资源遇到问题,请及时私信上传者)
2.虚拟产品一经售出概不退款(资源遇到问题,请及时私信上传者)
版权申诉
0 下载量 4 浏览量
2022-09-20
22:48:01
上传
评论
收藏 25KB GZ 举报
温馨提示
共26个文件
java:13个
class:13个
a space consisting of vectors, together with the associative and commutative operation of addition of vectors, and the associative and distributive operation of multiplication of vectors by scalars.
资源推荐
资源详情
资源评论
收起资源包目录
vsr.tar.gz (26个子文件)
vsr
InvertedIndex.java 21KB
Document.class 2KB
DocumentIterator.java 3KB
TextFileDocument.class 2KB
Retrieval.java 1KB
Retrieval.class 497B
Feedback.class 3KB
TokenInfo.class 337B
InvertedIndex.class 9KB
DocumentReference.java 1KB
Document.java 5KB
DocumentIterator.class 2KB
TokenOccurrence.class 370B
HTMLFileDocument.java 3KB
TokenOccurrence.java 546B
HTMLFileDocument.class 2KB
FileDocument.java 660B
FileDocument.class 967B
TextStringDocument.class 1KB
HashMapVector.class 2KB
HashMapVector.java 4KB
TextFileDocument.java 2KB
DocumentReference.class 835B
Feedback.java 4KB
TextStringDocument.java 1KB
TokenInfo.java 700B
共 26 条
- 1
资源评论
JonSco
- 粉丝: 72
- 资源: 1万+
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
最新资源
- 微信小程序 - 图书管理系统源码.zip
- 微信小程序 - 图片自适应 ,富文本解析源码.zip
- 微信小程序 - 同乐居商城:购物车合算源码
- 1、根据输入的三条边值判断能组成何种三角形,并设计测试数据进行判定覆盖测试 三条边为变量a、b、c,范围为1≤边值≤10,不在范
- SQL server 练习题目8道(小白教学).zip
- Python 手写实现 iD3 决策树算法-根据信息增益公式.zip
- 411675952289057车联助手-小窗版(三星)3.5.1.apk
- 三种快速排序方法合并在一个文件中以便直接运行的Python代码示例
- 937712277954201实习5.word
- 2程序语言基础知识pdf1_1716337722703.jpeg
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功