package classifier;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.classifier.ClassifierResult;
import org.apache.mahout.classifier.bayes.Algorithm;
import org.apache.mahout.classifier.bayes.BayesAlgorithm;
import org.apache.mahout.classifier.bayes.BayesParameters;
import org.apache.mahout.classifier.bayes.CBayesAlgorithm;
import org.apache.mahout.classifier.bayes.ClassifierContext;
import org.apache.mahout.classifier.bayes.Datastore;
import org.apache.mahout.classifier.bayes.InMemoryBayesDatastore;
import org.apache.mahout.classifier.bayes.InvalidDatastoreException;
import org.apache.mahout.common.nlp.NGrams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import classifier.Counter;
/**
 * Mapper that classifies each input record with a Mahout Bayes model and
 * emits a count of one for every (user ID, predicted label) pair.
 *
 * <p>The model is loaded once per task in {@link #setup(Context)}; each call
 * to {@link #map(Text, Text, Context)} classifies a single document.
 */
public class ClassifierMapper extends Mapper<Text, Text, Text, IntWritable> {
  private static final Logger log = LoggerFactory.getLogger(ClassifierMapper.class);
  private static final IntWritable ONE = new IntWritable(1);

  /** Reused output key so that no new Text is allocated per record. */
  private final Text outKey = new Text();
  private int gramSize = 1;
  private ClassifierContext classifier;
  private String defaultCategory;

  /**
   * Classifies one document and writes {@code userID|label -> 1}.
   *
   * <p>If classification throws {@link InvalidDatastoreException}, the error
   * is logged, the {@code Counter.FAILDOCS} counter is incremented, and the
   * record is still written with an empty label (i.e. the key is
   * {@code userID|}).
   *
   * @param key
   *          the user ID the document belongs to
   * @param value
   *          the document text; it is split into n-grams of size
   *          {@code gramSize} before classification
   * @param context
   *          the Hadoop task context used for output and counters
   * @throws IOException
   *           if writing the output record fails
   * @throws InterruptedException
   *           if the task is interrupted while writing
   */
  @Override
  public void map(Text key, Text value, Context context)
      throws IOException, InterruptedException {
    String userId = key.toString();
    List<String> ngrams = new NGrams(value.toString(), gramSize).generateNGramsWithoutLabel();

    // Stays empty when classification fails, so the record is still emitted.
    String docLabel = "";
    try {
      ClassifierResult result = classifier.classifyDocument(
          ngrams.toArray(new String[ngrams.size()]), defaultCategory);
      docLabel = result.getLabel();
    } catch (InvalidDatastoreException e) {
      log.error(e.toString(), e);
      context.getCounter(Counter.FAILDOCS).increment(1);
    }

    // key is userID and docLabel
    outKey.set(userId + "|" + docLabel);
    context.write(outKey, ONE);
  }

  /**
   * Reads the Bayes parameters from the job configuration and loads the model.
   *
   * <p>The parameters are taken from the {@code bayes.parameters}
   * configuration entry (empty string if absent). The {@code classifierType}
   * parameter selects the algorithm: {@code bayes} or {@code cbayes}
   * (case-insensitive).
   *
   * @param context
   *          the Hadoop task context providing the configuration
   * @throws IOException
   *           if the model datastore cannot be initialized
   * @throws IllegalArgumentException
   *           if {@code classifierType} is neither {@code bayes} nor
   *           {@code cbayes}
   */
  @Override
  public void setup(Context context) throws IOException {
    // get bayes parameters
    Configuration conf = context.getConfiguration();
    BayesParameters params = new BayesParameters(conf.get("bayes.parameters", ""));
    log.info("Bayes Parameter {}", params.print());

    String classifierType = params.get("classifierType");
    Algorithm algorithm;
    if ("bayes".equalsIgnoreCase(classifierType)) {
      algorithm = new BayesAlgorithm();
    } else if ("cbayes".equalsIgnoreCase(classifierType)) {
      algorithm = new CBayesAlgorithm();
    } else {
      throw new IllegalArgumentException(
          "Unrecognized classifier type: " + classifierType);
    }
    Datastore datastore = new InMemoryBayesDatastore(params);

    classifier = new ClassifierContext(algorithm, datastore);
    try {
      classifier.initialize();
    } catch (InvalidDatastoreException e) {
      // Fail the task instead of continuing with a model that never loaded;
      // otherwise every subsequent map() call would run against it.
      throw new IOException("Failed to initialize Bayes classifier datastore", e);
    }

    defaultCategory = params.get("defaultCat");
    gramSize = params.getGramSize();
  }
}
没有合适的资源?快使用搜索试试~ 我知道了~
【甘道夫】通过Mahout构建贝叶斯文本分类器案例详解--配套代码
共12个文件
class:4个
java:4个
project:1个
5星 · 超过95%的资源 需积分: 9 106 下载量 167 浏览量
2015-01-07
12:00:16
上传
评论 1
收藏 9.4MB RAR 举报
温馨提示
http://blog.csdn.net/u010967382/article/details/25368795#comments 博客配套代码
资源推荐
资源详情
资源评论
收起资源包目录
MRClassify.rar (12个子文件)
MRClassify
bin
classifier
Counter.class 902B
ClassifierReducer.class 3KB
ClassifierDriver.class 3KB
ClassifierMapper.class 6KB
.settings
org.eclipse.jdt.core.prefs 598B
src
classifier
ClassifierMapper.java 3KB
ClassifierDriver.java 2KB
Counter.java 60B
ClassifierReducer.java 743B
.project 386B
.classpath 7KB
lib
mahout-core-0.6-job.jar 10.67MB
共 12 条
- 1
资源评论
- 小嘎子闯天涯2016-04-29不错很有帮助
- henrylrc2016-03-03不错很有帮助
Gandalf_lee
- 粉丝: 194
- 资源: 4
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功