# -*- coding: utf-8 -*-
# 引入模拟浏览器框架支持库
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# 引入ActionChains鼠标操作类支持库
from selenium.webdriver.common.action_chains import ActionChains
# xpath解析支持库
from lxml import etree
# 正则表达式支持库
import re
# py代码中直接调用js方法支持库
import execjs
import execjs.runtime_names
# 发网络请求支持库
import requests
# 自定义的新闻结构体
from newsInfo import NewsInfo
# 自定义解析html结构的实现类
from parseNews import PaseNews
class ParseTouTiao(object):
"""
构造函数,初始化资源
"""
def __init__(self):
    """Start the headless Firefox session used to render pages.

    Builds a ``FirefoxOptions`` object carrying the ``--headless`` and
    ``--disable-gpu`` arguments and launches ``webdriver.Firefox`` with it.
    Any exception raised while launching the browser propagates to the
    caller.
    """
    # Define the attribute before launching so that __del__ can safely
    # test it even when starting the browser raises.
    self.__browser = None
    self.__firefox_options = webdriver.FirefoxOptions()
    self.__firefox_options.add_argument('--headless')
    self.__firefox_options.add_argument('--disable-gpu')
    # ``firefox_options=`` is the deprecated spelling of this keyword and
    # was removed in Selenium 4; ``options=`` is the supported one.
    self.__browser = webdriver.Firefox(options=self.__firefox_options)
"""
析构函数,释放资源
"""
def __del__(self):
if self.__browser:
try:
self.__browser.close()
self.__browser.quit()
except Exception as ex:
print(ex)
"""
获取头条首页内容
"""
def __getTouTiaoHtml(self, url):
# 简单的入参校验
if url and '' != url and url.startswith("http"):
# 浏览器打开页面
self.__browser.get(url)
try:
# 此处等到我们所需的热文元素加载出来了再进行下一步,避免页面还没加载完成就去解析内容导致内容为空
element = WebDriverWait(self.__browser, 10).until(
EC.presence_of_element_located((By.XPATH, "//ul/li/div[@ga_event=article_item_click]"))
)
except Exception as ex:
print(ex)
finally:
pass
resHtml = self.__browser.page_source
return resHtml
resHtml = """
<html>
<head>
<script mod_name="tanxssp-probe" async="" src="https://atanx.alicdn.com/t/tanxssp/probe.js" charset="utf-8"></script>
<style class="vjs-styles-defaults">
.video-js {
width: 300px;
height: 150px;
}
.vjs-fluid {
padding-top: 56.25%
}
</style>
<meta charset="utf-8">
<title>今日头条</title>
<meta http-equiv="x-dns-prefetch-control" content="on">
<meta name="renderer" content="webkit">
<link rel="dns-prefetch" href="//s3.pstatp.com/">
<link rel="dns-prefetch" href="//s3a.pstatp.com/">
<link rel="dns-prefetch" href="//s3b.pstatp.com">
<link rel="dns-prefetch" href="//p1.pstatp.com/">
<link rel="dns-prefetch" href="//p3.pstatp.com/">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1,minimum-scale=1,user-scalable=no,minimal-ui">
<meta name="360-site-verification" content="b96e1758dfc9156a410a4fb9520c5956">
<meta name="360_ssp_verify" content="2ae4ad39552c45425bddb738efda3dbb">
<meta name="google-site-verification" content="3PYTTW0s7IAfkReV8wAECfjIdKY-bQeSkVTyJNZpBKE">
<meta name="shenma-site-verification" content="34c05607e2a9430ad4249ed48faaf7cb_1432711730">
<meta name="baidu_union_verify" content="b88dd3920f970845bad8ad9f90d687f7">
<meta name="domain_verify" content="pmrgi33nmfuw4ir2ej2g65lunfqw6ltdn5wselbcm52wszbchirdqyztge3tenrsgq3dknjume2tayrvmqytemlfmiydimddgu4gcnzcfqrhi2lnmvjwc5tfei5dcnbwhazdcobuhe2dqobrpu">
<meta name="keywords" content="今日头条,头条,头条网,头条新闻,今日头条官网">
<meta name="description" content="《今日头条》(www.toutiao.com)是一款基于数据挖掘的推荐引擎产品,它为用户推荐有价值的、个性化的信息,提供连接人与信息的新型服务,是国内移动互联网领域成长最快的产品服务之一。">
<link rel="alternate" media="only screen and (max-width: 640px)" href="//m.toutiao.com/">
<link rel="shortcut icon" href="//s3a.pstatp.com/toutiao/resource/ntoutiao_web/static/image/favicon_8e9c9c7.ico"
type="image/x-icon">
<link rel="stylesheet" href="//s3.pstatp.com/toutiao/player/dist/pc_vue.css" media="screen" title="no title">
<!--[if lt IE 9]>
<p>您的浏览器版本过低,请<a href="http://browsehappy.com/">升级浏览器</a></p>
<![endif]-->
<script src="//s3.pstatp.com/toutiao/monitor/sdk/slardar.js?ver=20171221_1" crossorigin="anonymous"></script>
<script>window.Slardar && window.Slardar.install({
sampleRate: 1,
bid: 'toutiao_pc',
pid: 'index_new',
ignoreAjax: [/\/action_log\//, /\/stream\/widget\//],
ignoreStatic: [/\.tanx\.com\//, /\.alicdn\.com\//, /\.mediav\.com/]
});</script>
<link rel="stylesheet" href="//s3a.pstatp.com/toutiao/static/css/page/index_node/index.31d54c35426ed45610edf7c5e7ac8b74.css">
<script>!function (e) { function t(a) { if (o[a]) return o[a].exports; var r = o[a] = { exports: {}, id: a, loaded: !1 }; return e[a].call(r.exports, r, r.exports, t), r.loaded = !0, r.exports } var a = window.webpackJsonp; window.webpackJsonp = function (n, c) { for (var p, s, l = 0, i = []; l < n.length; l++)s = n[l], r[s] && i.push.apply(i, r[s]), r[s] = 0; for (p in c) Object.prototype.hasOwnProperty.call(c, p) && (e[p] = c[p]); for (a && a(n, c); i.length;)i.shift().call(null, t); if (c[0]) return o[0] = 0, t(0) }; var o = {}, r = { 0: 0 }; t.e = function (e, a) { if (0 === r[e]) return a.call(null, t); if (void 0 !== r[e]) r[e].push(a); else { r[e] = [a]; var o = document.getElementsByTagName("head")[0], n = document.createElement("script"); n.type = "text/javascript", n.charset = "utf-8", n.async = !0, n.src = t.p + "static/js/" + e + "." + { 1: "24063eb49e6e579544d4", 2: "4cfa0b7d433696569909", 3: "f1ff2831f57d148eec40", 4: "cce2c7d4326515e8702e" }[e] + ".js", o.appendChild(n) } }, t.m = e, t.c = o, t.p = "/toutiao/", t.p = "//s3.pstatp.com/toutiao/" }([]);</script>
<style type="text/css">
@keyframes resizeanim {
from {
opacity: 0;
}
to {
opacity: 0;
}
}
.resize-triggers {
animation: 1ms resizeanim;
visibility: hidden;
opacity: 0;
}
.resize-triggers,
.resize-triggers>div,
.contract-trigger:before {
content: " ";
display: block;
position: absolute;
top: 0;
left: 0;
height: 100%;
width: 100%;
overflow: hidden;
}
.resize-triggers>div {
background: #eee;
overflow: auto;
}
.contract-trigger:before {
width: 200%;
height: 200%;
}
</style>
<script src="https://show-g.mediav.com/s?jsonp=_qihu_jsonpFun_&type=1&of=4&newf=1&showid=P5AcFE&ref=toutiao.com&uid=15378869247912674181990789027651&scheme=https&impct=5&time=ts_1537886924791"
async=""></script>
<style id="5e59571">.imga_wrapper{position:relative;display:inline-block;}.imga_flag{display: inline-block;position: absolute;bottom: 8px;left: 8px;border: 1px solid #fff;color: #ff
评论0
最新资源