package com.bjtu.process;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import com.bjtu.tfid.AnalyseMapper;
import com.bjtu.tfid.AnalyseReducer;
/**
 * <code>AnalyseProcess</code> is used to analyze the text of files.<br />
 * It uses {@link com.bjtu.tfid.AnalyseMapper} and {@link com.bjtu.tfid.AnalyseReducer} to
 * analyze.
 * When no paths are given on the command line, the data files are read from
 * "hdfs://localhost:9002/tmp/hello/"
 * and the output path is "hdfs://localhost:9002/tmp/analyze/analyse-map-reduce".
 * The output file contains file name, words, and the number of words in this text.
 *
 * @author wuyadong & tangxu
 */
class AnalyseProcess {
    /** Input directory used when no paths are supplied on the command line. */
    private static final String DEFAULT_INPUT = "hdfs://localhost:9002/tmp/hello/";

    /** Output directory used when no paths are supplied on the command line. */
    private static final String DEFAULT_OUTPUT = "hdfs://localhost:9002/tmp/analyze/analyse-map-reduce";

    /**
     * Command-line entry point.
     * <p>
     * Generic Hadoop options are consumed by {@link GenericOptionsParser}; the remaining
     * arguments must be either empty (the default input/output paths are used) or exactly
     * {@code <in> <out>}. Any other count prints the usage line and exits with status 2.
     *
     * @param args generic Hadoop options, optionally followed by {@code <in> <out>}
     * @throws IOException if the input cannot be listed or the job does not succeed
     * @throws InterruptedException if the thread is interrupted while waiting for the job
     * @throws ClassNotFoundException if a job class cannot be found while submitting
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();

        String inFileName;
        String outFileName;
        if (otherArgs.length == 0) {
            // No explicit paths: keep the locations this driver has always used.
            inFileName = DEFAULT_INPUT;
            outFileName = DEFAULT_OUTPUT;
        } else if (otherArgs.length == 2) {
            inFileName = otherArgs[0];
            outFileName = otherArgs[1];
        } else {
            System.err.println("Usage: Tf-idf <in> <out>");
            System.exit(2);
            return; // unreachable at runtime; needed for definite assignment
        }

        AnalyseProcess process = new AnalyseProcess();
        process.analyze(conf, inFileName, outFileName);
    }

    /**
     * Configures and runs the "analyze" job, blocking until it finishes.
     * <p>
     * The number of reduce tasks is set to the number of entries listed directly under
     * {@code inFileName}, with a minimum of one so that the reducer always runs.
     *
     * @param conf        Hadoop configuration used to resolve the file system and create the job
     * @param inFileName  input path whose entries are fed to the mapper
     * @param outFileName output path of the job
     * @throws IOException if the input path cannot be listed, or the job completes unsuccessfully
     * @throws ClassNotFoundException if a job class cannot be found while submitting
     * @throws InterruptedException if the thread is interrupted while waiting for the job
     */
    public void analyze(Configuration conf, String inFileName, String outFileName) throws IOException, InterruptedException, ClassNotFoundException {
        Path inputPath = new Path(inFileName);
        Path outputPath = new Path(outFileName);

        // Resolve the file system from the path itself: FileSystem.get(conf) returns the
        // default file system, which rejects a path whose scheme/authority differs from it.
        FileSystem fs = inputPath.getFileSystem(conf);
        FileStatus[] inputFiles = fs.listStatus(inputPath);

        // One reducer per listed input entry, but never zero: zero reduce tasks would turn
        // this into a map-only job and skip AnalyseReducer entirely.
        // NOTE(review): some older Hadoop releases return null for a missing path — confirm.
        int reduceTasks = (inputFiles == null) ? 1 : Math.max(1, inputFiles.length);

        // NOTE(review): this constructor is deprecated in newer Hadoop releases in favour of
        // Job.getInstance; kept because the target Hadoop version is not visible here.
        Job job = new Job(conf, "analyze");
        job.setJarByClass(AnalyseProcess.class);
        job.setMapperClass(AnalyseMapper.class);
        job.setReducerClass(AnalyseReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(reduceTasks);
        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // Surface a failed job instead of silently returning as if it had succeeded.
        if (!job.waitForCompletion(true)) {
            throw new IOException("Job 'analyze' failed for input " + inFileName
                    + " and output " + outFileName);
        }
    }
}
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
Similarity.zip (41个子文件)
Similarity
.project 421B
ext.dic 1.64MB
src
com
bjtu
similarity
SimilarityMapper.java 974B
SimilarityReducer.java 1023B
WordFileMapper.java 1KB
WordFileReducer.java 2KB
process
TFProcess.java 2KB
WordFileProcess.java 2KB
AnalyseProcess.java 2KB
SimilarityProcess.java 2KB
DFProcess.java 1KB
WordCountProcess.java 2KB
tfid
TFMapper.java 701B
AnalyseReducer.java 861B
WordCountReducer.java 1KB
WordCountMapper.java 774B
AnalyseMapper.java 1KB
DFReducer.java 1KB
DFMapper.java 565B
TFReducer.java 1021B
stopword.dic 7KB
IKAnalyzer2012.jar 1.11MB
.classpath 5KB
bin
com
bjtu
similarity
WordFileReducer.class 3KB
SimilarityMapper.class 2KB
WordFileMapper.class 3KB
SimilarityReducer.class 3KB
process
SimilarityProcess.class 2KB
WordFileProcess.class 2KB
DFProcess.class 2KB
AnalyseProcess.class 3KB
TFProcess.class 2KB
WordCountProcess.class 2KB
tfid
AnalyseReducer.class 3KB
AnalyseMapper.class 3KB
TFMapper.class 2KB
TFReducer.class 3KB
WordCountReducer.class 3KB
DFReducer.class 3KB
WordCountMapper.class 2KB
DFMapper.class 2KB
共 41 条
- 1
zxogj
- 粉丝: 11
- 资源: 26
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
最新资源
- Android基础面试题(附赠答案赠送版).md
- Vci.db
- 世界各国-经济距离数据汇总(2005-2022年).xlsx
- 基于Pytorch实现Kaggle竞赛“猫狗分类”源码+数据集(准确率超过99%).zip
- 基于SpringBoot实现的协同过滤美食推荐系统【源码+运行视频+论文】.rar
- 基于SpringBoot+Vue前后端分离型图书管理系统源码+数据库脚本(95分以上).zip
- com.wakdev.nfctasks.apk
- java毕业设计基于SpringBoot+Vue前后端分离型图书管理系统源码+数据库脚本.zip
- 机械与电气信息学院+电子信息工程本2002班+王梦亚+刘现伟+初稿(2).doc
- 中国各地级市的海拔标准差数据集.xlsx
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功
- 1
- 2
前往页