# -*- coding:UTF-8 -*-
import sys
import importlib

# Legacy Python 2 default-encoding hack. Guarded so the module still imports
# under Python 3, where reload() moved to importlib and setdefaultencoding
# no longer exists (Py3 strings are already Unicode).
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")

import re, requests, codecs, time, random
import pandas as pd
from lxml import html

# Optional HTTP proxy, e.g. {"http": "123.53.86.133:61234"}; None = direct.
proxies = None
# Browser-like headers so guba.eastmoney.com serves the normal HTML pages.
headers = {
'Host': 'guba.eastmoney.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}
def get_url(page, stocknum=600570):
    """Fetch one page of the stock's message-board list and return post links.

    Parameters:
        page: 1-based page number of the list view.
        stocknum: stock code to crawl; default keeps the original
            hard-coded 600570 for backward compatibility.

    Returns:
        A list of relative URLs (one per post) extracted from the page,
        or an empty list if the request/parse fails.
    """
    url = 'http://guba.eastmoney.com/list,{}_{}.html'.format(stocknum, page)
    try:
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=50)
        # Parse once; the original parsed the same document twice and also
        # configured retries/session *after* the request, which had no effect.
        tree = html.fromstring(resp.text)
        urls = tree.xpath('//div[@id="articlelistnew"]/div[@class="articleh normal_post"]/span[3]/a/@href')
    except Exception as e:
        print(e)
        # Random back-off before the caller moves on to the next page.
        time.sleep(random.random() + random.randint(0, 3))
        urls = []  # was '': keep the failure value the same type as success
    return urls
def get_comments(urls):
    """Visit up to the first 10 post URLs and persist their time/comment text.

    For each post page the publish times and comment bodies are extracted
    via XPath, lightly cleaned, and appended to the CSV through
    save_to_file(). Failures on a single post are logged and skipped so
    one bad page does not abort the whole batch.

    Parameters:
        urls: iterable of relative post URLs as returned by get_url().
    """
    for rel_url in urls[0:10]:
        post_url = 'http://guba.eastmoney.com' + rel_url
        try:
            resp = requests.get(post_url, headers=headers, proxies=proxies, timeout=50)
            tree = html.fromstring(resp.text)
            times1 = tree.xpath('//div[@class="zwlitx"]/div/div[2]/text()')
            # xpath() returns a list and is never None; the original
            # `not times1 is None` test was always true, so empty pages
            # produced blank rows. Truthiness skips them instead.
            if times1:
                # Strip spaces, then slice chars [3:14] of each time string
                # (drops a fixed-width prefix); join/split keeps a flat list.
                times0 = '!'.join(re.sub(re.compile('| '), '', x)[3:14] for x in times1).split('!')
                comments1 = tree.xpath('//div[@class="zwlitx"]/div/div[3]/div[1]/text()')
                comments0 = '!'.join(w.strip() for w in comments1).split('!')
                save_to_file(times0, comments0)
        except Exception as e:
            # Best-effort crawl: log and continue with the next post.
            print(e)
def save_to_file(times, comments):
    """Append paired time/comment columns as rows to eastmoney.csv.

    The two lists become the columns of a transposed DataFrame; if one list
    is shorter, pandas pads the missing cells with NaN.

    Parameters:
        times: list of publish-time strings (column 0 of the CSV rows).
        comments: list of comment-text strings (column 1).

    Returns:
        The DataFrame that was written (the original returned None; the
        return value is additive and existing callers ignore it).
    """
    df = pd.DataFrame([times, comments]).T
    # mode='a': called once per post page, so rows accumulate across calls.
    # utf_8_sig writes a BOM so Chinese text opens correctly in Excel.
    df.to_csv('eastmoney.csv', encoding="utf_8_sig", mode='a', header=False)
    return df
if __name__ == '__main__':
    # Crawl pages 306-999 of the 600570 message board; 306 was the resume
    # point of the original run. The guard keeps importing this module from
    # kicking off a full crawl as a side effect.
    for page in range(306, 1000):
        print('Crawling to page {}'.format(page))
        urls = get_url(page)
        get_comments(urls)
# --- NOTE(review): the lines below are not code. They are boilerplate text
# --- scraped from the download page this file was taken from, and as bare
# --- text they made the module a SyntaxError. Commented out to preserve the
# --- information while keeping the file valid Python.
# 没有合适的资源?快使用搜索试试~ 我知道了~
# 资源推荐
# 资源详情
# 资源评论
# 收起资源包目录
# 股票涨跌、评论完整.rar (10个子文件)
# strcmparrayTime.m 654B
# date.xlsx 17KB
# 股票2.png 83KB
# paStockCommentAddCsv.py 4KB
# analysisComment.py 2KB
# sentiment.xlsx 783KB
# date.png 31KB
# eastnoney.csv 1.87MB
# 股票.png 257KB
# 合并.png 15KB
# 共 10 条
# - 1
# 资源评论
# - 对弈二级市场2020-08-30可以可以,能用
# - weixin_424654992020-04-26谢谢您的分享~
# zhyl4669
# - 粉丝: 35
# - 资源: 13
# 上传资源 快速赚钱
# - 我的内容管理 展开
# - 我的资源 快来上传第一个资源
# - 我的收益 登录查看自己的收益
# - 我的积分 登录查看自己的积分
# - 我的C币 登录后查看C币余额
# - 我的收藏
# - 我的下载
# - 下载帮助
# 安全验证
# 文档复制为VIP权益,开通VIP直接复制
# 信息提交成功