# coding:utf8
from baike_spider import url_manager, html_downloader, html_parser,\
html_outputer
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import time
from matplotlib.cbook import Null
from sqlalchemy.sql.expression import except_
from selenium.common.exceptions import NoSuchElementException
class SpiderMain(object):
    """Crawl stock announcements from data.eastmoney.com with Selenium.

    For every stock id in ``self.stockids`` the announcement listing is
    opened in Chrome and walked page by page.  Each listed announcement is
    downloaded through ``self.downloader``, parsed by ``self.parser`` and
    written to its own CSV file below ``OUTPUT_DIR``.
    """

    # Labels written to the first CSV column, in the order the values are
    # collected for one announcement row.
    FIELD_NAMES = ['title', 'detail', 'type', 'time']
    # A row dated before this year ends the crawl for the current stock.
    MIN_YEAR = 2014
    # Directory that receives one CSV file per announcement.
    OUTPUT_DIR = "./ann/"

    def __init__(self):
        # urls, outputer and goToNext are not used by the methods of this
        # class; they are kept so existing attribute access keeps working.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
        self.goToNext = True
        self.stockids = ['600372', '300483', '002758', '600777', '002430']

    def craw(self, root_url):
        """Crawl every page of announcements for each configured stock.

        Args:
            root_url: Accepted for interface compatibility only; the listing
                URL is built from each stock id instead.
        """
        for stockid in self.stockids:
            # e.g. http://data.eastmoney.com/notices/stock/000063.html
            url = "http://data.eastmoney.com/notices/stock/" + stockid + ".html"
            driver = webdriver.Chrome()
            try:
                driver.get(url)
                while True:
                    if self.singlePageCraw(driver, stockid) == 0:
                        break
                    try:
                        nextPage = WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located(
                                (By.XPATH, "//a[contains(text(),'下一页')]"))
                        )
                        # The link carries class "nolink" on the last page.
                        if nextPage.get_attribute('class') == 'nolink':
                            break
                        nextPage.click()
                    except Exception as e:
                        # Best effort: report why paging stopped, then move
                        # on to the next stock.
                        print('stop paging for ' + stockid + ': ' + str(e))
                        break
                    time.sleep(5)
            finally:
                # Always release the browser, even when loading the page or
                # crawling it fails.
                driver.quit()
            time.sleep(5)

    def singlePageCraw(self, driver, stockid):
        """Save every announcement listed on the page currently loaded.

        Args:
            driver: Selenium driver positioned on an announcement listing.
            stockid: Stock id used as the prefix of the output file names.

        Returns:
            0 when crawling of this stock should stop (a row without a link
            or a row dated before ``MIN_YEAR`` was met), otherwise 1.  Any
            other error is printed and also yields 1, so the caller still
            tries the next page.
        """
        try:
            table = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "dt_1"))
            )
            rows = table.find_elements_by_tag_name("tr")
            # Return from inside the try block; a ``return`` in ``finally``
            # would override the stop signal produced by _crawl_rows.
            return self._crawl_rows(rows, stockid)
        except Exception as e:
            print(str(e))
            return 1

    def _crawl_rows(self, rows, stockid):
        """Process the table rows of one page; return 0 to stop, 1 to go on."""
        for row in rows:
            cells = row.find_elements_by_tag_name("td")
            if not cells:
                # Rows without <td> cells (e.g. a header row) carry no
                # announcement and must not produce an output file.
                continue
            title_cell = cells[0]
            try:
                link = title_cell.find_element_by_tag_name("a")
            except NoSuchElementException:
                # No detail link in the first cell: nothing left to crawl.
                return 0
            detail_url = link.get_attribute('href')
            html_cont = self.downloader.download(detail_url)
            detail_content = self.parser.parse(detail_url, html_cont)
            # Collected fresh for every row so values never leak between
            # announcements.
            records = [
                [self.FIELD_NAMES[0], str(title_cell.text).strip()],
                [self.FIELD_NAMES[1], str(detail_content).strip()],
            ]
            filename = stockid + '_' + detail_url.split(',')[0].split('/')[-1]
            year = ''
            # Cells beyond the named fields are ignored instead of raising
            # IndexError and aborting the rest of the page.
            for name, cell in zip(self.FIELD_NAMES[2:], cells[1:]):
                text = str(cell.text).strip()
                records.append([name, text])
                parsed = self._leading_year(text)
                if parsed is not None:
                    year = parsed
                    if year < self.MIN_YEAR:
                        return 0
            print(filename + '-' + str(year))
            self._save_announcement(filename, records)
        return 1

    @staticmethod
    def _leading_year(text):
        """Return the integer in front of the first '-' of text, else None."""
        if '-' not in text:
            return None
        head = text.split('-')[0].strip()
        if head.isdigit():
            return int(head)
        return None

    def _save_announcement(self, filename, records):
        """Write the [label, value] pairs of one announcement to a CSV file."""
        import os

        if not os.path.isdir(self.OUTPUT_DIR):
            os.makedirs(self.OUTPUT_DIR)
        df = pd.DataFrame(records, columns=['title', 'content'])
        df.to_csv(os.path.join(self.OUTPUT_DIR, filename + '.csv'), index=False)
if __name__ == "__main__":
    # Script entry point: build the spider and start crawling.  The root URL
    # is passed through to craw(), which takes it as its only argument.
    SpiderMain().craw("http://data.eastmoney.com/notices/")
# --- Text below appears to be web-page residue captured with the source; it is
# --- not part of the program and is kept verbatim as comments.
# 东方财富网公告爬取2
# 4星 · 超过85%的资源 需积分: 49 109 浏览量
# 2017-04-23
# 23:06:53
# 上传
# 评论 9
# 收藏 8KB RAR 举报
# crediks
# - 粉丝: 213
# - 资源: 6
# 最新资源
# - 基于QT+QML+C++实现的仿 Windows10 画图3D 的颜色选择器+源码
# - 32代码四驱简易电动小车蓝牙控制、语音控制、寻光、巡线、避障(内含语音模块为天问ASR-PRO开发板与32单片机串口通信的代码)
# - 基于STM32的计步器的设计
# - 基于Qt与STM32平台开发的汽车车机系统上位机
# - 基于MQTT的智能宠物投喂系统
# - 312749069629470selfieU重绘.apk
# - http%3A%2F%2Fimg.wsdl.vivo.com.cn%2Fappstore%2Fdeveloper%2Ficon%2F201412%2F201412231038336
# - 主要记录B站up主莫烦matplotlib教程中的代码.zip
# - Java SE Development Kit 8u411 Windows x64 Installer
# - 用于科学绘图的 Matplotlib 样式.zip
# 资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
# 点击此处反馈