【甘道夫】通过Mahout构建贝叶斯文本分类器案例详解--配套源码资源-CSDN文库

共5个文件

java：5个

4星 · 超过85%的资源 | 需积分: 10 | 88 浏览量 | 2015-01-07 12:02:36 上传 | 评论 | 收藏 | 3KB | RAR | 举报

资源推荐

资源详情

资源评论

收起资源包目录

tokenize.rar （5个子文件）

tokenize

TokenizeMapper.java 1KB

Counter.java 58B

TokenizeDriver.java 2KB

inputformat

MyRecordReader.java 3KB

MyInputFormat.java 2KB

package tokenize.inputformat; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit; public class MyRecordReader extends RecordReader<Text, Text> { private CombineFileSplit combineFileSplit; // 当前处理的分片 private int totalLength; // 分片包含的文件数量 private int currentIndex; // 当前处理的文件索引 private float currentProgress = 0; // 当前的进度 private Text currentKey = new Text(); // 当前的Key private Text currentValue = new Text(); // 当前的Value private Configuration conf; // 任务信息 private boolean processed; // 记录当前文件是否已经读取 public MyRecordReader(CombineFileSplit combineFileSplit, TaskAttemptContext context, Integer index) throws IOException { super(); this.currentIndex = index; this.combineFileSplit = combineFileSplit; conf = context.getConfiguration(); totalLength = combineFileSplit.getPaths().length; processed = false; } @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { } @Override public Text getCurrentKey() throws IOException, InterruptedException { return currentKey; } @Override public Text getCurrentValue() throws IOException, InterruptedException { return currentValue; } @Override public float getProgress() throws IOException { if (currentIndex >= 0 && currentIndex < totalLength) { currentProgress = (float) currentIndex / totalLength; return currentProgress; } return currentProgress; } @Override public void close() throws IOException { } @Override public boolean nextKeyValue() throws IOException { if (!processed) { // 如果文件未处理则读取文件并设置key-value // set key Path file = combineFileSplit.getPath(currentIndex); 
currentKey.set(file.getParent().getName()); // category's name // set value FSDataInputStream in = null; byte[] contents = new byte[(int)combineFileSplit.getLength(currentIndex)]; try { FileSystem fs = file.getFileSystem(conf); in = fs.open(file); in.readFully(contents); currentValue.set(contents); } catch (Exception e) { } finally { in.close(); } processed = true; return true; } return false; //如果文件已经处理，必须返回false } }

评论收藏

内容反馈