/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.html;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.io.*;
import java.util.regex.*;
import org.cyberneko.html.parsers.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.w3c.dom.*;
import org.apache.html.dom.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.Content;
import org.apache.hadoop.conf.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.util.*;
public class HtmlParser implements Parser {
/** Logger for this parser, registered under the name "org.apache.nutch.parse.html". */
public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.html");
/**
 * URL suffixes obtained by splitting SysProperty.PICTURE_FORMAT on commas.
 * getParse() returns null for any page URL that ends with one of them.
 */
public static final String [] pictureFormat = SysProperty.PICTURE_FORMAT.split(",");
/**
 * URL suffixes obtained by splitting SysProperty.MUSIC_FORMAT on commas.
 * getParse() returns null for any page URL that ends with one of them.
 */
public static final String [] musicFormat = SysProperty.MUSIC_FORMAT.split(",");
// I used 1000 bytes at first, but found that some documents have
// meta tag well past the first 1000 bytes.
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
// Take the first 2000 bytes of the document in order to find the meta tag.
private static final int CHUNK_SIZE = 2000;
// Matches a <meta ...> tag whose attributes contain http-equiv=content-type
// (quotes optional, case-insensitive); group 1 is the tag's attribute text.
private static Pattern metaPattern =
Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>",
Pattern.CASE_INSENSITIVE);
// Matches "charset=<name>" (case-insensitive); group 1 is the charset name.
private static Pattern charsetPattern =
Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
Pattern.CASE_INSENSITIVE);
// Prepared for parsing WML (which is actually an XML file), add by zqm.
// Matches an XML declaration with double-quoted version and encoding
// attributes; group 1 is the encoding value.
private static Pattern xmlMetaPattern =
Pattern.compile("<\\?xml\\s+version=\"[^>]+\"\\s+encoding=\"([^>]+)\"\\?>", Pattern.CASE_INSENSITIVE);
// NOTE(review): not referenced in the visible part of this file; presumably
// names the underlying HTML parser implementation to use -- confirm.
private String parserImpl;
/**
 * Given a <code>byte[]</code> representing an html (or xml/wml) file of an
 * <em>unknown</em> encoding, read out the encoding declared in the first
 * <code>CHUNK_SIZE</code> bytes.
 * <p>
 * The lookup order is:
 * <ol>
 * <li>the <code>charset</code> parameter of a
 * <code>&lt;meta http-equiv="Content-Type" ...&gt;</code> tag;</li>
 * <li>failing that, the <code>encoding</code> attribute of an
 * <code>&lt;?xml version="..." encoding="..."?&gt;</code> declaration
 * (added for WAP/WML pages).</li>
 * </ol>
 * If neither is present, <code>null</code> is returned. <br />
 * FIXME: non-byte oriented character encodings (UTF-16, UTF-32)
 * can't be handled with this.
 * We need to do something similar to what's done by mozilla
 * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993).
 * See also http://www.w3.org/TR/REC-xml/#sec-guessing
 * <br />
 *
 * @param content <code>byte[]</code> representation of an html file;
 *        <code>null</code> is tolerated and yields <code>null</code>
 * @return the encoding name exactly as written in the document, or
 *         <code>null</code> if none was found
 */
private static String sniffCharacterEncoding(byte[] content) {
  if (content == null) {
    return null;
  }
  int length = Math.min(content.length, CHUNK_SIZE);

  // The markup we are looking for is pure ASCII, so decoding the chunk as
  // US-ASCII is sufficient: every non-ASCII byte is turned into the
  // replacement character, which the patterns below simply skip over.
  // Passing a Charset (rather than a charset name) cannot throw
  // UnsupportedEncodingException, so no catch block is needed.
  String str = new String(content, 0, length, Charset.forName("US-ASCII"));

  // 1) <meta http-equiv="Content-Type" content="...; charset=xxx">
  Matcher metaMatcher = metaPattern.matcher(str);
  if (metaMatcher.find()) {
    Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
    if (charsetMatcher.find()) {
      return charsetMatcher.group(1);
    }
  }

  // 2) No meta charset: fall back to the XML declaration (WAP/WML pages).
  // The shared, precompiled xmlMetaPattern is used as-is; it must not be
  // recompiled and reassigned here, since it is static state shared by all
  // parser instances and threads.
  Matcher xmlMetaMatcher = xmlMetaPattern.matcher(str);
  if (xmlMetaMatcher.find()) {
    return xmlMetaMatcher.group(1);
  }
  return null;
}
/** Fallback encoding passed to EncodingDetector.guessEncoding() in getParse(). */
private String defaultCharEncoding;
/** Configuration handed to the EncodingDetector constructed in getParse(). */
private Configuration conf;
// NOTE(review): not referenced in the visible part of this file; presumably a
// DOM helper for extracting text/title/outlinks -- confirm.
private DOMContentUtils utils;
// NOTE(review): not referenced in the visible part of this file; presumably the
// chain of HTML parse filter plugins applied to the parse result -- confirm.
private HtmlParseFilters htmlParseFilters;
// NOTE(review): not referenced in the visible part of this file; presumably the
// caching directive recorded for parsed pages -- confirm.
private String cachingPolicy;
public ParseResult getParse(Content content) {
List<OtherData> parseResultList = new ArrayList<OtherData>();
HTMLMetaTags metaTags = new HTMLMetaTags();
//页面的基址
URL base;
//页面的uri,如果有转向,则基址跟页面url有可能不同
URL surl;
try {
base = new URL(content.getBaseUrl());
//get the page url fetched by crawl
surl = new URL(content.getUrl());
//图片分析,目前不做处理。add by zqm
for (int i=0; i<pictureFormat.length; i++){
if (surl.toString().endsWith(pictureFormat[i])){
return null;
}
}
//音频分析,目前不做处理。add by zqm
for (int i=0; i<musicFormat.length; i++){
if (surl.toString().endsWith(musicFormat[i])){
return null;
}
}
//end
} catch (MalformedURLException e) {
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
}
String text = "";
String title = "";
Outlink[] outlinks = new Outlink[0];
Metadata metadata = new Metadata();
// parse the content
DocumentFragment root;
try {
byte[] contentInOctets = content.getContent();
InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
EncodingDetector detector = new EncodingDetector(conf);
detector.autoDetectClues(content, true);
detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
String encoding = detector.guessEncoding(content, defaultCharEncoding);
metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
input.setEncoding(encoding);
if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); }
root = parse(input);
//find image and key word
// Date begin = new Date();
//分析网页的img标签 add by zqm
findimg(root, base, surl, parseResultList);
// Date end = new Date();
// LOG.info("=== find image total time(ms):" + (end.getTime() - begin.getTime()) + "ms");
// LOG.info("=== find image total time(s):" + (end.getTime() - begin.getTime())/1000 + "s");
} catch (IOException e) {
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
} catch (DOMException e) {
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
} catch (SAXException e) {
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
} catch (Exception e) {
e.printStackTrace(LogUtil.getWarnStream(LOG));
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
}
// get meta directives
HTMLMetaProcessor.getMetaTags(metaTags, root, base);
if (LOG.isTraceEnabled()) {
LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
}
// check meta directives
if (!metaTags.getNoIndex()) {
mp3文件信息解析-nutch使用
4星 · 超过85%的资源 需积分: 9 33 浏览量
2009-10-20
11:12:09
上传
评论
收藏 38KB RAR 举报
michaelzqm
- 粉丝: 0
- 资源: 5
最新资源
- 2001~2022年上市公司数字赋能指数.dta
- 2001~2022年上市公司数字赋能指数.xlsx
- 信息办公石大在线财务管理系统(含源码)-shidacaiwu.rar
- 信息办公电信计费系统完整代码-netctossconformity.rar
- matlab实现TD-SCDMA中初始同步捕捉DwPTS下行同步导频时隙的仿真.zip
- 信息办公玉玺学生信息管理系统-webapps.rar
- 信息办公基于struts的图书管理系统-struts-ts.rar
- 管家婆分销ERP V1 V3 A8II TOP V10.0.2最新全版本通用
- 信息办公基于Ajax+J2EE的MicroERP源码下载-microerp-0.1.rar
- 信息办公双鱼林jsp人事工资系统-wagesmanagesystem.rar
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈