import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
public class InvertedIndex
{
public static class invMap extends Mapper<LongWritable,Text,Text,Text>
{
private Text keyInfo = new Text();
private final Text one = new Text("1");
public void map(LongWritable key, Text value,Context context) throws IOException, InterruptedException
{
String line = new String(value.getBytes(), 0, value.getLength(), "UTF8");
// System.out.println("line:" + line);
StringTokenizer token = new StringTokenizer(line);
String keyv = token.nextToken();//缃戦〉閾炬帴
if(!token.hasMoreTokens()) {
return;
}
line = token.nextToken();
// System.out.println("next token:" + line);
StringTokenizer stk = new StringTokenizer(line,"/");//鍒嗚瘝
// System.out.println("key:" + keyv);
while (stk.hasMoreElements())
{
//keyinfo:鍒嗚瘝&&&閾炬帴
line = stk.nextToken();
// System.out.println("token:" + line);
keyInfo.set(line + "&&&" + keyv);
// System.out.println("keyinfo:" + keyInfo.toString());
context.write(keyInfo, one);
}
}
}
public static class Combiner extends Reducer<Text,Text,Text,Text>
{
private Text valueInfo = new Text();
public void reduce(Text key, Iterable<Text> values,Context context) throws IOException, InterruptedException
{
//杈撳叆锛�<鍒嗚瘝&&&閾炬帴锛�1>
String line = key.toString();
int splitIndex = line.indexOf("&&&");
if(splitIndex == 0) {
return;
}
int sum = 0;
for (Text value : values)
{
sum += Integer.valueOf(value.toString());
}
//valueInfo:閾炬帴|鏁板瓧
valueInfo.set(line.substring(splitIndex+3) + "|" + String.valueOf(sum));
//key:鍒嗚瘝
// System.out.println(" substr(0...):" + line.substring(0,splitIndex) + "\n substr(index...):" + line.substring(splitIndex+3));
// System.out.println("key:" + key.toString());
key.set(line.substring(0,splitIndex));
context.write(key, valueInfo);
}
}
public static class invReduce extends TableReducer<Text,Text,ImmutableBytesWritable>
{
public void reduce(Text key, Iterable<Text> values,Context contex) throws IOException, InterruptedException
{
//杈撳叆锛�<鍒嗚瘝锛岄摼鎺sum>
//鐢熸垚閾炬帴鍒楄〃
//valueList:鍒嗚瘝锛岄摼鎺sum;....閾炬帴|sum;
// String valueList = new String();
//linkList:閾炬帴;...閾炬帴;
String linkList = new String();
String line = "";
for (Text value : values)
{
line = value.toString();
// valueList += line + ";";
line = StringUtils.split(line,'|')[0];
// System.out.println("line:" + line);
linkList += line + ";";
}
// System.out.println("key:" + key.toString());
Put put = new Put(Bytes.toBytes(key.toString()));
//列族为word,列修饰符为key,列值为链接
put.add(Bytes.toBytes("word"), Bytes.toBytes("key"), Bytes.toBytes(linkList));
contex.write(new ImmutableBytesWritable(key.getBytes()), put);// 输出求和后的<key,value>
}
}
/**
 * Recreates the "search" HBase table and runs the inverted-index job.
 *
 * The process exits with status 0 when the job succeeds and 1 otherwise.
 *
 * @param args command-line arguments (not used)
 * @throws IOException            if talking to HBase or submitting the job fails
 * @throws InterruptedException   if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be resolved
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException
{
    System.setProperty("hadoop.home.dir","C:/hadoop-2.7.1");
    String tablename = "search";
    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", "your_ip_address");
    conf.set("hbase.zookeeper.property.clientPort","2181");

    // The admin handle is only needed to (re)create the table; release it
    // even when one of the admin calls throws.
    HBaseAdmin admin = new HBaseAdmin(conf);
    try {
        if (admin.tableExists(tablename)) {
            System.out.println("table exists!recreating.......");
            admin.disableTable(tablename);
            admin.deleteTable(tablename);
        }
        HTableDescriptor htd = new HTableDescriptor(tablename);
        HColumnDescriptor tcd = new HColumnDescriptor("word");
        htd.addFamily(tcd); // create the column family
        admin.createTable(htd); // create the table
    } finally {
        admin.close();
    }

    Job job = new Job(conf); // create the job
    job.setJarByClass(InvertedIndex.class); // job class
    job.setMapperClass(invMap.class); // mapper
    job.setCombinerClass(Combiner.class); // combiner
    job.setJobName("InvertedIndex");
    TableMapReduceUtil.initTableReducerJob(tablename, invReduce.class, job);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    // NOTE(review): the reducer writes Put values; confirm this output value
    // class is intended.
    job.setOutputValueClass(ImmutableBytesWritable.class);
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path("C:/Users/luyunyyyyy/Documents/xlc/saved_html"));

    // Report job failure to the caller through the exit status.
    boolean succeeded = job.waitForCompletion(true);
    System.exit(succeeded ? 0 : 1);
}
}
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
人工智能-项目实践-搜索引擎-利用hadoop等实现的搜索引擎 (198个子文件)
InvertedIndex.class 3KB
search.class 3KB
InvertedIndex$invReduce.class 3KB
daoProcess.class 3KB
InvertedIndex$Combiner.class 3KB
InvertedIndex$invMap.class 3KB
daoTest.class 1KB
.classpath 12KB
.classpath 12KB
org.eclipse.wst.common.component 480B
org.eclipse.wst.jsdt.ui.superType.container 49B
bootstrap.css 169KB
bootstrap.min.css 137KB
jquery.mCustomScrollbar.min.css 42KB
bootstrap-grid.css 37KB
bootstrap-grid.min.css 28KB
font-awesome.min.css 23KB
bootstrap-reboot.css 5KB
bootstrap-reboot.min.css 4KB
wave.gif 622B
hadoop-12171894811784015 0B
hadoop-1515304150704734882 0B
hadoop-155713907273237085 0B
hadoop-2620289509115473735 0B
hadoop-2937792075736465602 0B
hadoop-3198405759983164690 0B
hadoop-3552208930499889449 0B
hadoop-3834428077032457166 0B
hadoop-3950721015139514471 0B
hadoop-589786364720861678 0B
hadoop-618259365142470382 0B
hadoop-6653092954722541462 0B
hadoop-6714562227415542858 0B
hadoop-7113168790230120231 0B
hadoop-7412144493786822904 0B
hadoop-7938579777958017466 0B
index.html 17KB
hadoop_homework_1.ipynb 24KB
jruby-complete-1.6.8.jar 13.19MB
hbase-server-1.2.9-tests.jar 7.32MB
hadoop-hdfs-2.5.1.jar 6.77MB
hbase-protocol-1.2.9.jar 4.16MB
hbase-server-1.2.9.jar 4MB
hadoop-common-2.5.1.jar 2.83MB
hbase-it-1.2.9-tests.jar 2.7MB
hbase-thrift-1.2.9.jar 2.63MB
netty-all-4.0.50.Final.jar 2.14MB
guava-12.0.1.jar 1.71MB
hadoop-yarn-api-2.5.1.jar 1.57MB
commons-math3-3.1.1.jar 1.53MB
hadoop-mapreduce-client-core-2.5.1.jar 1.43MB
htrace-core-3.1.0-incubating.jar 1.41MB
zookeeper-3.4.10.jar 1.39MB
hadoop-yarn-common-2.5.1.jar 1.35MB
netty-3.10.5.Final.jar 1.27MB
hbase-client-1.2.9.jar 1.25MB
jcodings-1.0.8.jar 1.23MB
leveldbjni-all-1.8.jar 1021KB
jsp-2.1-6.1.14.jar 1001KB
snappy-java-1.0.4.1.jar 973KB
commons-math-2.2.jar 965KB
jaxb-impl-2.2.3-1.jar 869KB
zookeeper-3.4.6.jar 774KB
jackson-mapper-asl-1.9.13.jar 762KB
jersey-server-1.9.jar 696KB
guice-3.0.jar 694KB
apacheds-kerberos-codec-2.0.0-M15.jar 675KB
hadoop-mapreduce-client-common-2.5.1.jar 647KB
commons-collections-3.2.2.jar 575KB
hbase-common-1.2.9.jar 557KB
jetty-6.1.26.jar 527KB
jets3t-0.9.0.jar 527KB
protobuf-java-2.5.0.jar 521KB
hadoop-mapreduce-client-app-2.5.1.jar 480KB
log4j-1.2.17.jar 478KB
log4j-1.2.16.jar 470KB
spymemcached-2.11.6.jar 456KB
jersey-core-1.9.jar 448KB
httpclient-4.2.5.jar 423KB
hbase-rest-1.2.9.jar 422KB
jasper-compiler-5.5.23.jar 399KB
httpcore-4.4.1.jar 315KB
junit-4.12.jar 308KB
commons-httpclient-3.1.jar 298KB
avro-1.7.4.jar 296KB
commons-configuration-1.6.jar 292KB
commons-lang-2.6.jar 278KB
commons-net-3.1.jar 267KB
commons-codec-1.9.jar 258KB
hadoop-yarn-server-common-2.5.1.jar 237KB
commons-compress-1.4.1.jar 236KB
hbase-common-1.2.9-tests.jar 230KB
libthrift-0.9.3.jar 229KB
jackson-core-asl-1.9.13.jar 227KB
commons-beanutils-core-1.8.0.jar 201KB
commons-beanutils-1.7.0.jar 184KB
joni-2.1.2.jar 183KB
jsch-0.1.42.jar 181KB
commons-io-2.4.jar 181KB
jetty-util-6.1.26.jar 173KB
共 198 条
- 1
- 2
资源评论
博士僧小星
- 粉丝: 1940
- 资源: 5901
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功