东方财富网公告爬取2_东方财富api接口怎么收费资源-CSDN文库

共15个文件

py：7个

pyc：5个

pydevproject：1个

爬虫

ajax

4星 · 超过85%的资源 · 需积分: 49 · 109 浏览量 · 2017-04-23 23:06:53 上传 · 评论 9 收藏 · 8KB · RAR · 举报

资源推荐

资源详情

资源评论

收起资源包目录

EastFortuneByStockIdx.rar （15个子文件）

EastFortuneByStockIdx

.project 391B

.settings

org.eclipse.core.resources.prefs 257B

test

__init__.py 47B

.pydevproject 431B

baike_spider

ann

url_manager.py 1KB

__init__.pyc 158B

html_outputer.py 769B

html_parser.py 1KB

__init__.py 0B

html_downloader.pyc 996B

spider_main.py 4KB

url_manager.pyc 2KB

html_outputer.pyc 2KB

html_parser.pyc 2KB

html_downloader.py 515B

# coding:utf8
"""Crawl per-stock announcement lists from data.eastmoney.com.

For every configured stock id a Chrome session is opened on the stock's
notice page, each table row is turned into a small two-column CSV (one file
per announcement) under ``./ann/``, and the "next page" link is followed
until the last page, an announcement older than the cutoff year, or a row
without a detail link is reached.

The syntax used here is valid on both Python 2.6+ and Python 3.
"""
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# NOTE(review): the two imports below are not referenced anywhere in this
# file and look like IDE auto-imports; kept untouched, confirm before removing.
from matplotlib.cbook import Null
from sqlalchemy.sql.expression import except_

from baike_spider import (
    url_manager,
    html_downloader,
    html_parser,
    html_outputer,
)

# Row labels written to the first CSV column, in the order they are filled.
FIELD_NAMES = ('title', 'detail', 'type', 'time')

# Directory that receives one CSV per announcement.
OUTPUT_DIR = "./ann/"

# Stock ids crawled when the caller does not supply its own list.
DEFAULT_STOCK_IDS = ('600372', '300483', '002758', '600777', '002430')


class SpiderMain(object):
    """Drive Chrome through the announcement pages of a list of stocks."""

    def __init__(self, stockids=None, min_year=2014):
        """Create the helper objects and remember the crawl settings.

        Args:
            stockids: iterable of stock id strings to crawl; defaults to
                the five ids that were previously hard-coded.
            min_year: announcements dated before this year stop the crawl
                of the current stock (previously the literal 2014).
        """
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
        self.goToNext = True
        if stockids is None:
            stockids = DEFAULT_STOCK_IDS
        self.stockids = list(stockids)
        self.min_year = min_year

    def craw(self, root_url):
        """Crawl every stock in ``self.stockids``, page by page.

        Args:
            root_url: accepted for interface compatibility; the per-stock
                URL is built from the stock id and this value is not used.
        """
        for stockid in self.stockids:
            # e.g. http://data.eastmoney.com/notices/stock/000063.html
            url = "http://data.eastmoney.com/notices/stock/" + stockid + ".html"
            driver = webdriver.Chrome()
            try:
                driver.get(url)
                while True:
                    if self.singlePageCraw(driver, stockid) == 0:
                        break
                    try:
                        nextPage = WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located(
                                (By.XPATH, "//a[contains(text(),'下一页')]"))
                        )
                        # A 'nolink' class marks the disabled link on the
                        # last page.
                        if nextPage.get_attribute('class') == 'nolink':
                            break
                        nextPage.click()
                    except Exception:
                        # Best effort: a missing or unclickable "next page"
                        # link simply ends paging for this stock.
                        break
                    # Give the ajax-loaded table time to refresh.
                    time.sleep(5)
            finally:
                # Always release the browser, even when navigation raised.
                driver.quit()
            time.sleep(5)

    def singlePageCraw(self, driver, stockid):
        """Save every announcement row of the currently displayed page.

        Args:
            driver: Selenium driver already showing a stock's notice page.
            stockid: stock id used as the prefix of the CSV file names.

        Returns:
            0 when the crawl of this stock should stop (a row is older than
            ``self.min_year`` or its first cell has no detail link),
            1 otherwise -- including when an unexpected error cut the page
            short, in which case the error is printed and paging goes on.
        """
        try:
            table = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "dt_1"))
            )
            for row in table.find_elements(By.TAG_NAME, "tr"):
                cells = row.find_elements(By.TAG_NAME, "td")
                if not cells:
                    # Rows without <td> cells carry no announcement; skip
                    # them instead of writing an empty "./ann/.csv".
                    continue
                if self._crawRow(cells, stockid) == 0:
                    # Returned directly: a ``finally: return 1`` here would
                    # silently override this stop signal.
                    return 0
        except Exception as e:
            print(str(e))
        return 1

    def _crawRow(self, cells, stockid):
        """Write one announcement row to its own CSV file.

        Returns 0 when the row signals that crawling should stop, else 1.
        """
        min_year = getattr(self, 'min_year', 2014)
        # A fresh frame per row so a short row never inherits stale cells.
        df = pd.DataFrame(columns=['title', 'content'])
        year = ''
        filename = ''
        count = 0
        for cell in cells:
            text = str(cell.text).strip()
            if count == 0:
                # The first cell holds the title and the detail-page link.
                try:
                    link = cell.find_element(By.TAG_NAME, "a")
                except NoSuchElementException:
                    return 0
                detail_url = link.get_attribute('href')
                df.loc[count] = [FIELD_NAMES[count], text]
                count = count + 1
                html_cont = self.downloader.download(detail_url)
                detail_content = self.parser.parse(detail_url, html_cont)
                df.loc[count] = [FIELD_NAMES[count], str(detail_content).strip()]
                count = count + 1
                filename = stockid + '_' + detail_url.split(',')[0].split('/')[-1]
            else:
                # Cells beyond the known labels are not stored rather than
                # raising IndexError and aborting the whole page.
                if count < len(FIELD_NAMES):
                    df.loc[count] = [FIELD_NAMES[count], text]
                    count = count + 1
                if '-' in text:
                    # A dash-separated cell is treated as a date whose first
                    # part is the year.
                    year = text.split('-')[0]
                    if year.isdigit():
                        year = int(year)
                        if year < min_year:
                            return 0
        print(filename + '-' + str(year))
        if not os.path.isdir(OUTPUT_DIR):
            os.makedirs(OUTPUT_DIR)
        df.to_csv(OUTPUT_DIR + filename + '.csv', index=False)
        return 1


if __name__ == "__main__":
    root_url = "http://data.eastmoney.com/notices/"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)

评论收藏

内容反馈