# -*- coding:UTF-8 -*-
import sys
import importlib

# Legacy Python 2 default-encoding hack. Guarded so the module still imports
# under Python 3, where reload() moved to importlib and setdefaultencoding
# no longer exists (Py3 strings are already Unicode).
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")

import re, requests, codecs, time, random
import pandas as pd
from lxml import html

# Optional HTTP proxy, e.g. {"http": "123.53.86.133:61234"}; None = direct.
proxies = None
# Browser-like headers so guba.eastmoney.com serves the normal HTML pages.
headers = {
'Host': 'guba.eastmoney.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}
def get_url(page, stocknum=600570):
    """Fetch one page of the stock's message-board list and return post links.

    Parameters:
        page: 1-based page number of the list view.
        stocknum: stock code to crawl; default keeps the original
            hard-coded 600570 for backward compatibility.

    Returns:
        A list of relative URLs (one per post) extracted from the page,
        or an empty list if the request/parse fails.
    """
    url = 'http://guba.eastmoney.com/list,{}_{}.html'.format(stocknum, page)
    try:
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=50)
        # Parse once; the original parsed the same document twice and also
        # configured retries/session *after* the request, which had no effect.
        tree = html.fromstring(resp.text)
        urls = tree.xpath('//div[@id="articlelistnew"]/div[@class="articleh normal_post"]/span[3]/a/@href')
    except Exception as e:
        print(e)
        # Random back-off before the caller moves on to the next page.
        time.sleep(random.random() + random.randint(0, 3))
        urls = []  # was '': keep the failure value the same type as success
    return urls
def get_comments(urls):
    """Visit up to the first 10 post URLs and persist their time/comment text.

    For each post page the publish times and comment bodies are extracted
    via XPath, lightly cleaned, and appended to the CSV through
    save_to_file(). Failures on a single post are logged and skipped so
    one bad page does not abort the whole batch.

    Parameters:
        urls: iterable of relative post URLs as returned by get_url().
    """
    for rel_url in urls[0:10]:
        post_url = 'http://guba.eastmoney.com' + rel_url
        try:
            resp = requests.get(post_url, headers=headers, proxies=proxies, timeout=50)
            tree = html.fromstring(resp.text)
            times1 = tree.xpath('//div[@class="zwlitx"]/div/div[2]/text()')
            # xpath() returns a list and is never None; the original
            # `not times1 is None` test was always true, so empty pages
            # produced blank rows. Truthiness skips them instead.
            if times1:
                # Strip spaces, then slice chars [3:14] of each time string
                # (drops a fixed-width prefix); join/split keeps a flat list.
                times0 = '!'.join(re.sub(re.compile('| '), '', x)[3:14] for x in times1).split('!')
                comments1 = tree.xpath('//div[@class="zwlitx"]/div/div[3]/div[1]/text()')
                comments0 = '!'.join(w.strip() for w in comments1).split('!')
                save_to_file(times0, comments0)
        except Exception as e:
            # Best-effort crawl: log and continue with the next post.
            print(e)
def save_to_file(times, comments):
    """Append paired time/comment columns as rows to eastmoney.csv.

    The two lists become the columns of a transposed DataFrame; if one list
    is shorter, pandas pads the missing cells with NaN.

    Parameters:
        times: list of publish-time strings (column 0 of the CSV rows).
        comments: list of comment-text strings (column 1).

    Returns:
        The DataFrame that was written (the original returned None; the
        return value is additive and existing callers ignore it).
    """
    df = pd.DataFrame([times, comments]).T
    # mode='a': called once per post page, so rows accumulate across calls.
    # utf_8_sig writes a BOM so Chinese text opens correctly in Excel.
    df.to_csv('eastmoney.csv', encoding="utf_8_sig", mode='a', header=False)
    return df
if __name__ == '__main__':
    # Crawl pages 306-999 of the 600570 message board; 306 was the resume
    # point of the original run. The guard keeps importing this module from
    # kicking off a full crawl as a side effect.
    for page in range(306, 1000):
        print('Crawling to page {}'.format(page))
        urls = get_url(page)
        get_comments(urls)
# --- NOTE(review): the lines below are not code. They are boilerplate text
# --- scraped from the download page this file was taken from, and as bare
# --- text they made the module a SyntaxError. Commented out to preserve the
# --- information while keeping the file valid Python.
# 没有合适的资源?快使用搜索试试~ 我知道了~
# 资源推荐
# 资源详情
# 资源评论
# 收起资源包目录
# 股票涨跌、评论完整.rar (10个子文件)
# strcmparrayTime.m 654B
# date.xlsx 17KB
# 股票2.png 83KB
# paStockCommentAddCsv.py 4KB
# analysisComment.py 2KB
# sentiment.xlsx 783KB
# date.png 31KB
# eastnoney.csv 1.87MB
# 股票.png 257KB
# 合并.png 15KB
# 共 10 条
# - 1
# 资源评论
# - 对弈二级市场2020-08-30可以可以,能用
# - weixin_424654992020-04-26谢谢您的分享~
# zhyl4669
# - 粉丝: 35
# - 资源: 13
# 上传资源 快速赚钱
# - 我的内容管理 展开
# - 我的资源 快来上传第一个资源
# - 我的收益 登录查看自己的收益
# - 我的积分 登录查看自己的积分
# - 我的C币 登录后查看C币余额
# - 我的收藏
# - 我的下载
# - 下载帮助
# 安全验证
# 文档复制为VIP权益,开通VIP直接复制
# 信息提交成功