mp3文件信息解析-nutch使用资源-CSDN文库

共20个文件

java：19个

html：1个

mp3解析

nutch

ID3-V1-V2

4星 · 超过85%的资源 | 需积分: 9 | 33 浏览量 | 2009-10-20 11:12:09 上传 | 评论 | 收藏 | 38KB | RAR | 举报

资源推荐

资源详情

资源评论

收起资源包目录

org.rar （20个子文件）

org

apache

nutch

parse

html

package.html 177B

HtmlParser.java 30KB

DOMContentUtils.java 14KB

ImageModel.java 3KB

DOMBuilder.java 23KB

HTMLMetaProcessor.java 7KB

DbCon.java 3KB

SingleTreadHandle.java 365B

Test.java 10KB

DbOp.java 16KB

Constants.java 873B

BatchHandle.java 4KB

OtherData.java 596B

SiteAndChannelLists.java 1KB

XMLCharacterRecognizer.java 3KB

KeywordHandle.java 4KB

SysProperty.java 2KB

AnalysisMusic.java 2KB

SongInfo.java 8KB

KeyWord.java 2KB

/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.parse.html; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Map; import java.net.HttpURLConnection; import java.net.URL; import java.net.MalformedURLException; import java.nio.charset.Charset; import java.io.*; import java.util.regex.*; import org.cyberneko.html.parsers.*; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.w3c.dom.*; import org.apache.html.dom.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.protocol.Content; import org.apache.hadoop.conf.*; import org.apache.nutch.parse.*; import org.apache.nutch.util.*; public class HtmlParser implements Parser { public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.html"); public static final String [] pictureFormat = SysProperty.PICTURE_FORMAT.split(","); public static final String [] musicFormat = SysProperty.MUSIC_FORMAT.split(","); // I used 1000 bytes at first, but found that some documents have // meta tag well past the first 1000 bytes. // (e.g. 
http://cn.promo.yahoo.com/customcare/music.html) //取出前2000字节，为了找meta标签 private static final int CHUNK_SIZE = 2000; private static Pattern metaPattern = Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>", Pattern.CASE_INSENSITIVE); private static Pattern charsetPattern = Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE); //为解析wml（其实是xml文件）准备,add by zqm private static Pattern xmlMetaPattern = Pattern.compile("<\\?xml\\s+version=\"[^>]+\"\\s+encoding=\"([^>]+)\"\\?>", Pattern.CASE_INSENSITIVE); private String parserImpl; /** * Given a <code>byte[]</code> representing an html file of an * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag * from the first <code>CHUNK_SIZE</code> bytes. * If there's no meta tag for Content-Type or no charset is specified, * <code>null</code> is returned. <br /> * FIXME: non-byte oriented character encodings (UTF-16, UTF-32) * can't be handled with this. * We need to do something similar to what's done by mozilla * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993). * See also http://www.w3.org/TR/REC-xml/#sec-guessing * <br /> * * @param content <code>byte[]</code> representation of an html file */ private static String sniffCharacterEncoding(byte[] content) { int length = content.length < CHUNK_SIZE ? content.length : CHUNK_SIZE; //非assii码会在高8位里填0补充 // We don't care about non-ASCII parts so that it's sufficient // to just inflate each byte to a 16-bit value by padding. // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into // {U+0041, U+0082, U+00B7}. String str = ""; try { //用US-ASCII字符集解码content数组,在这部分对于非ascii字符不需处理，乱码也没关系 str = new String(content, 0, length, Charset.forName("ASCII").toString()); } catch (UnsupportedEncodingException e) { // code should never come here, but just in case... 
return null; } Matcher metaMatcher = metaPattern.matcher(str); String encoding = null; if (metaMatcher.find()) { Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1)); if (charsetMatcher.find()) encoding = new String(charsetMatcher.group(1)); } //若编码为null，则检测是否有xml的编码标示,为wap网页分析，add by zqm if (encoding == null){ xmlMetaPattern = Pattern.compile("<\\?xml\\s+version=\"[^>]+\"\\s+encoding=\"([^>]+)\"\\?>", Pattern.CASE_INSENSITIVE); Matcher xmlMetaMatcher = xmlMetaPattern.matcher(str); if (xmlMetaMatcher.find()) { encoding = xmlMetaMatcher.group(1); } } return encoding; } private String defaultCharEncoding; private Configuration conf; private DOMContentUtils utils; private HtmlParseFilters htmlParseFilters; private String cachingPolicy; public ParseResult getParse(Content content) { List<OtherData> parseResultList = new ArrayList<OtherData>(); HTMLMetaTags metaTags = new HTMLMetaTags(); //页面的基址 URL base; //页面的uri,如果有转向，则基址跟页面url有可能不同 URL surl; try { base = new URL(content.getBaseUrl()); //get the page url fetched by crawl surl = new URL(content.getUrl()); //图片分析，目前不做处理。add by zqm for (int i=0; i<pictureFormat.length; i++){ if (surl.toString().endsWith(pictureFormat[i])){ return null; } } //音频分析，目前不做处理。add by zqm for (int i=0; i<musicFormat.length; i++){ if (surl.toString().endsWith(musicFormat[i])){ return null; } } //end } catch (MalformedURLException e) { return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf()); } String text = ""; String title = ""; Outlink[] outlinks = new Outlink[0]; Metadata metadata = new Metadata(); // parse the content DocumentFragment root; try { byte[] contentInOctets = content.getContent(); InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets)); EncodingDetector detector = new EncodingDetector(conf); detector.autoDetectClues(content, true); detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed"); String encoding = detector.guessEncoding(content, defaultCharEncoding); 
metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding); metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding); input.setEncoding(encoding); if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); } root = parse(input); //find image and key word // Date begin = new Date(); //分析网页的img标签 add by zqm findimg(root, base, surl, parseResultList); // Date end = new Date(); // LOG.info("=== find image total time(ms)：" + (end.getTime() - begin.getTime()) + "ms"); // LOG.info("=== find image total time(s)：" + (end.getTime() - begin.getTime())/1000 + "s"); } catch (IOException e) { return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf()); } catch (DOMException e) { return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf()); } catch (SAXException e) { return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf()); } catch (Exception e) { e.printStackTrace(LogUtil.getWarnStream(LOG)); return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf()); } // get meta directives HTMLMetaProcessor.getMetaTags(metaTags, root, base); if (LOG.isTraceEnabled()) { LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); } // check meta directives if (!metaTags.getNoIndex()) {

评论收藏

内容反馈