#@chenqin
#@ucas
#╭︿︿︿╮
#{/ o o /}
#( (oo) )
#
# ︶︶︶
# coding:utf8  # NOTE(review): per PEP 263 a coding declaration is only honored on line 1 or 2 of the file; here it has no effect
from baike_spider import url_manager, html_downloader, html_parser,\
html_outputer
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import time
class SpiderMain(object):
    """Crawl company-announcement listings from data.eastmoney.com.

    Drives a Selenium Chrome session through the paginated notices table,
    scrapes each table row, downloads the linked announcement detail page,
    and writes one CSV (field name / value pairs) per announcement.
    """

    # Logical names for the notices-table columns, in display order.
    # (The original bound this to a local named `list`, shadowing the builtin.)
    FIELD_NAMES = ['id', 'name', 'title', 'detail', 'type', 'time']

    def __init__(self):
        # Collaborators from the baike_spider package.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url, output_dir='D:/'):
        """Crawl every page of the notices list starting at root_url.

        root_url   -- listing page to open (the original ignored this
                      parameter and hard-coded the URL).
        output_dir -- directory prefix for the per-announcement CSV files;
                      defaults to the previously hard-coded 'D:/'.
        """
        driver = webdriver.Chrome()
        driver.get(root_url)
        while True:
            self.singlePageCraw(driver, output_dir)
            try:
                # Advance to the next page; when the link can no longer be
                # found (last page, layout change) stop instead of crashing.
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//a[contains(text(),'下一页')]"))
                ).click()
            except Exception as e:
                print(str(e))
                break
            time.sleep(5)  # throttle between pages

    def singlePageCraw(self, driver, output_dir='D:/'):
        """Scrape the notices table on the currently loaded page.

        Each table row becomes one CSV named '<stockid>_<detail id>.csv'
        holding (field-name, value) pairs plus the parsed detail text.
        Errors are reported and swallowed so the page loop keeps going.
        """
        try:
            table = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "dt_1"))
            )
            for row in table.find_elements_by_tag_name("tr"):
                # Fresh frame per row: the original reused one DataFrame
                # across rows, so leftover rows from a longer previous
                # announcement leaked into the next CSV.
                df = pd.DataFrame(columns=['title', 'content'])
                stockid = ''
                filename = ''
                count = 0
                for cell in row.find_elements_by_tag_name("td"):
                    if '更多公告 股吧 研报' in cell.text:
                        continue
                    if count >= len(self.FIELD_NAMES):
                        break  # more cells than known fields; ignore extras
                    text = str(cell.text).strip()
                    df.loc[count] = [self.FIELD_NAMES[count], text]
                    count += 1
                    if text.isdigit():
                        stockid = cell.text  # numeric cell = stock code
                    if ':' in cell.text:
                        # Cell carrying the announcement link: fetch and
                        # parse the detail page, stored as its own row.
                        detail_url = cell.find_element_by_tag_name(
                            "a").get_attribute('href')
                        html_cont = self.downloader.download(detail_url)
                        detail_content = self.parser.parse(detail_url, html_cont)
                        if count < len(self.FIELD_NAMES):
                            df.loc[count] = [self.FIELD_NAMES[count],
                                             str(detail_content).strip()]
                            count += 1
                        filename = (stockid + '_' +
                                    detail_url.split(',')[0].split('/')[-1])
                print(filename)
                # Skip rows (e.g. the header row) that produced no link;
                # the original wrote a junk '<dir>/.csv' for those.
                if filename:
                    df.to_csv(output_dir + filename + '.csv', index=False)
        except Exception as e:
            print(str(e))
if __name__ == "__main__":
    # Entry point: start crawling the eastmoney notices listing.
    start_url = "http://data.eastmoney.com/notices/"
    spider = SpiderMain()
    spider.craw(start_url)