基于Hadoop的文本相似度计算_hive相似度计算资源-CSDN文库

共41个文件

class：18个

java：18个

dic：2个

Hadoop

文本相似度

TFIDF

5星 · 超过95%的资源需积分: 46 11 浏览量 2013-06-11 20:52:08 上传评论 8 收藏 2.02MB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

Similarity.zip （41个子文件）

Similarity

.project 421B

ext.dic 1.64MB

src

com

bjtu

similarity

SimilarityMapper.java 974B

SimilarityReducer.java 1023B

WordFileMapper.java 1KB

WordFileReducer.java 2KB

process

TFProcess.java 2KB

WordFileProcess.java 2KB

AnalyseProcess.java 2KB

SimilarityProcess.java 2KB

DFProcess.java 1KB

WordCountProcess.java 2KB

tfid

TFMapper.java 701B

AnalyseReducer.java 861B

WordCountReducer.java 1KB

WordCountMapper.java 774B

AnalyseMapper.java 1KB

DFReducer.java 1KB

DFMapper.java 565B

TFReducer.java 1021B

stopword.dic 7KB

IKAnalyzer2012.jar 1.11MB

.classpath 5KB

bin

com

bjtu

similarity

WordFileReducer.class 3KB

SimilarityMapper.class 2KB

WordFileMapper.class 3KB

SimilarityReducer.class 3KB

process

SimilarityProcess.class 2KB

WordFileProcess.class 2KB

DFProcess.class 2KB

AnalyseProcess.class 3KB

TFProcess.class 2KB

WordCountProcess.class 2KB

tfid

AnalyseReducer.class 3KB

AnalyseMapper.class 3KB

TFMapper.class 2KB

TFReducer.class 3KB

WordCountReducer.class 3KB

DFReducer.class 3KB

WordCountMapper.class 2KB

DFMapper.class 2KB

package com.bjtu.process; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import com.bjtu.tfid.AnalyseMapper; import com.bjtu.tfid.AnalyseReducer; /** * <code>AnayseProcess</code> is used to analyze text of files<br /> * it uses {@link com.bjtu.tfid.AnalyseMapper} and {@link com.bjtu.tfid.AnalyseReducer} to * analyze. * The data files is in "hdfs://localhost:9002/tmp/hello/". * And output path is "hdfs://localhost:9002/tmp/analyze/analyse-map-reduce" * The output file contains file name , words, and the number of words in this text. * * @author wuyadong & tangxu */ class AnalyseProcess { public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { args = new String[]{"hdfs://localhost:9002/tmp/hello/", "hdfs://localhost:9002/tmp"}; Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args) .getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: Tf-idf <in> <out>"); System.exit(2); } AnalyseProcess process = new AnalyseProcess(); process.analyze(conf, otherArgs[0],"hdfs://localhost:9002/tmp/analyze/analyse-map-reduce"); } /** * @throws IOException * @throws ClassNotFoundException * @throws InterruptedException */ public void analyze(Configuration conf, String inFileName, String outFileName) throws IOException, InterruptedException, ClassNotFoundException { FileSystem hdfs = FileSystem.get(conf); FileStatus p[] = hdfs.listStatus(new Path(inFileName)); Job job = new Job(conf, "analyze"); job.setJarByClass(AnalyseProcess.class); job.setMapperClass(AnalyseMapper.class); job.setReducerClass(AnalyseReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setNumReduceTasks(p.length); FileInputFormat.addInputPath(job, new Path(inFileName)); FileOutputFormat.setOutputPath(job, new Path(outFileName)); job.waitForCompletion(true); } }

评论收藏

内容反馈