/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org/
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is CompressingMetaIndex.java
*
* The Original Code is Copyright (C) 2004-2014 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original contributor)
*/
package org.terrier.structures;
import gnu.trove.TObjectIntHashMap;
import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.terrier.sorting.HeapSortInt;
import org.terrier.structures.collections.FSOrderedMapFile;
import org.terrier.structures.collections.OrderedMap;
import org.terrier.structures.seralization.FixedSizeIntWritableFactory;
import org.terrier.structures.seralization.FixedSizeTextFactory;
import org.terrier.structures.seralization.FixedSizeWriteableFactory;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.Files;
import org.terrier.utility.TerrierTimer;
import org.terrier.utility.Wrapper;
import org.terrier.utility.io.HadoopUtility;
import org.terrier.utility.io.RandomDataInput;
import org.terrier.utility.io.RandomDataInputMemory;
import org.terrier.utility.io.WrappedIOException;
/** A {@link MetaIndex} implementation that compresses contents.
 * Each value has a fixed maximum length; the values for a document are
 * concatenated into one blob, stored compressed, and decompressed on
 * access using java.util.zip.Inflater.
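 * <p>Lookup sketch (illustrative; assumes an already-opened index whose
 * meta structure contains a "docno" key):
 * <pre>{@code
 * MetaIndex meta = index.getMetaIndex();
 * String docno = meta.getItem("docno", 10); //value of key "docno" for docid 10
 * String[] all = meta.getAllItems(10); //values of all keys for docid 10
 * }</pre>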
 * @author Craig Macdonald &amp; Vassilis Plachouras
* @since 3.0
*/
@SuppressWarnings("deprecation")
public class CompressingMetaIndex implements MetaIndex {
private final static Pattern SPLIT_SPACE = Pattern.compile("\\s+");
/** logger to be used in this class */
private static Logger logger = Logger.getLogger(CompressingMetaIndex.class);
/**
	 * A Hadoop input format for a compressing meta index (allows the reading
	 * of a meta index as input to a MapReduce job).
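	 * <p>A minimal configuration sketch (illustrative; "meta" is the usual
	 * name of the meta structure):
	 * <pre>{@code
	 * JobConf jc = new JobConf();
	 * CompressingMetaIndexInputFormat.setStructure(jc, "meta");
	 * jc.setInputFormat(CompressingMetaIndexInputFormat.class);
	 * }</pre>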
*/
public static class CompressingMetaIndexInputFormat implements InputFormat<IntWritable, Wrapper<String[]>>
{
static String STRUCTURE_NAME_JC_KEY = "MetaIndexInputStreamRecordReader.structureName";
		/**
		 * Sets in the JobConf the name of the meta index structure to be read.
		 * @param jc the job configuration
		 * @param metaStructureName the name of the meta structure, normally "meta"
		 */
public static void setStructure(JobConf jc, String metaStructureName)
{
jc.set(STRUCTURE_NAME_JC_KEY, metaStructureName);
}
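		/** A {@link FileSplit} that additionally records the first and last
		 * document ids covered by its byte range. */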
static class MetaIndexSplit extends FileSplit
{
int startId;
int endId;
			//no-argument constructor, required for Writable deserialization
			public MetaIndexSplit(){
				super(null, (long)0, (long)0, new String[0]);
			}
public MetaIndexSplit(Path file, long start, long length, String[] hosts, int _startId, int _endId) {
super(file, start, length, hosts);
startId = _startId;
endId = _endId;
}
public void readFields(DataInput in) throws IOException {
super.readFields(in);
startId = in.readInt();
endId = in.readInt();
}
public void write(DataOutput out) throws IOException {
super.write(out);
out.writeInt(startId);
out.writeInt(endId);
}
public String toString()
{
StringBuilder rtr = new StringBuilder();
rtr.append("MetaIndexSplit: BlockSize=").append(this.getLength());
rtr.append(" startAt=").append(+this.getStart());
try{
rtr.append(" hosts=");
rtr.append(ArrayUtils.join(this.getLocations(), ","));
}
catch (IOException ioe ) {
logger.warn("Problem getting locations", ioe);
}
rtr.append(" ids=["+startId+","+endId +"]");
return rtr.toString();
}
}
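		/** A RecordReader that, for each document in a split, emits the docid
		 * as key and that document's metadata values as value. */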
static class MetaIndexInputStreamRecordReader implements RecordReader<IntWritable, Wrapper<String[]>>
{
			final InputStream in; //the nested CompressingMetaIndex.InputStream, not java.io.InputStream
final int startID;
final int endID;
public MetaIndexInputStreamRecordReader(IndexOnDisk index, String structureName, int startingDocID, int endingID)
throws IOException
{
in = new InputStream(index, structureName, startingDocID, endingID);
startID = startingDocID;
endID = endingID;
}
public void close() throws IOException {
in.close();
}
public IntWritable createKey() {
return new IntWritable();
}
public Wrapper<String[]> createValue() {
return new Wrapper<String[]>();
}
			public long getPos() throws IOException {
				return 0; //byte position is not tracked; progress comes from getProgress()
			}
public float getProgress() throws IOException {
return (float)(in.getIndex() - startID)/(float)(endID - startID);
}
public boolean next(IntWritable docid, Wrapper<String[]> values)
throws IOException
{
if (! in.hasNext())
return false;
				//these calls MUST stay in this order: in.next() advances the
				//stream, and in.getIndex() then reports the docid just read
				values.setObject(in.next());
				docid.set(in.getIndex());
return true;
}
}
/**
* {@inheritDoc}
*/
public RecordReader<IntWritable, Wrapper<String[]>> getRecordReader(
InputSplit _split, JobConf jc, Reporter reporter)
throws IOException
{
HadoopUtility.loadTerrierJob(jc);
//load the index
Index.setIndexLoadingProfileAsRetrieval(false);
IndexOnDisk index = HadoopUtility.fromHConfiguration(jc);
if (index == null)
throw new IOException("Index could not be loaded from JobConf: " + Index.getLastIndexLoadError() );
//determine the structure to work on
String structureName = jc.get(STRUCTURE_NAME_JC_KEY);
if (structureName == null)
throw new IOException("JobConf property "+STRUCTURE_NAME_JC_KEY+" not specified");
//get the split
MetaIndexSplit s = (MetaIndexSplit)_split;
return new MetaIndexInputStreamRecordReader(index, structureName, s.startId, s.endId);
}
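		/** Returns the distinct hosts holding any block of the given byte
		 * range of a file, used as locality hints when creating splits. */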
private static String[] getHosts(FileStatus fs, FileSystem f, long start, long len) throws IOException
{
BlockLocation[] bs = f.getFileBlockLocations(fs, start, len);
Set<String> hosts = new HashSet<String>();
for(BlockLocation b : bs)
{
for(String host : b.getHosts())
{
hosts.add(host);
}
}
return hosts.toArray(new String[0]);
}
/**
* {@inheritDoc}
*/
public InputSplit[] getSplits(JobConf jc, int advisedNumberOfSplits)
throws IOException
{
logger.setLevel(Level.DEBUG);
HadoopUtility.loadTerrierJob(jc);
List<InputSplit> splits = new ArrayList<InputSplit>(advisedNumberOfSplits);
IndexOnDisk index = HadoopUtility.fromHConfiguration(jc);
String structureName = jc.get(STRUCTURE_NAME_JC_KEY);
			final String dataFilename = index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + structureName + ".zdata";
			//...