#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/7/17 16:33
# @Version:V 0.1
# @File : user_twitter.py
import csv
import datetime
import random
import time
import urllib.parse
import requests
from dateparser.search import search_dates
proxies = {'http': 'socks5://127.0.0.1:10808', "https": "socks5://127.0.0.1:10808"}
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
]
def get_spider_time():
    g_spider_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    return g_spider_time
def get_time(a):
    # Strip the "+0000" offset, parse the date string, and shift it to UTC+8.
    a = a.replace("+0000", "")
    time_str = search_dates(a)
    times = time_str[0][-1]
    print(times, type(times))
    publish_time_end = (times + datetime.timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S")
    return publish_time_end
def get_session():
    s = requests.session()
    s.keep_alive = False
    s.proxies = proxies
    s.allow_redirects = False
    s.verify = False
    return s
def get_token():
    # Read the x-guest-token from a local file; retry until it can be read.
    while True:
        try:
            with open('token.txt', 'r') as f:
                tokens = f.read()
            return tokens
        except Exception as e:
            print("Error reading token:", e)
def get_html(url):
    s = get_session()
    # print("tokens obtained on the first fetch:", tokens)
    num = 0
    while num < 50:
        try:
            num += 1
            tokens = get_token()
            headers = {
                'Referer': 'https://twitter.com/algore',
                'Origin': 'https://twitter.com',
                'User-Agent': random.choice(user_agents),
                'x-guest-token': tokens,
                'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
            }
            print("Requesting URL:", url)
            # Possible error responses:
            # {'errors': [{'code': 200, 'message': 'Forbidden.'}]}  (rate limit exceeded)
            # {'errors': [{'code': 34, 'message': 'Sorry, that page does not exist.'}]}
            r = s.get(url, headers=headers, proxies=proxies, timeout=20)
            res = str(r.json())
            if "Rate limit exceeded" in res or 'Forbidden' in res or "Sorry, that page does not exist" in res:
                s = get_session()
                continue
            else:
                return r
        except Exception as e:
            s = get_session()
            print(e)
            tokens = get_token()
def get_twitter_info(tim, threeDayAgosss, j):
    # tim: "until" date, threeDayAgosss: "since" date, j: search keyword.
    print("Search window:", tim, threeDayAgosss)
    cursors = ''
    num = 0
    while num < 500:
        num += 1
        try:
            url = 'https://twitter.com/i/api/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&include_ext_has_nft_avatar=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&include_ext_sensitive_media_warning=true&include_ext_trusted_friends_metadata=true&send_error_codes=true&simple_quoted_tweet=true&q={}%20until%3A{}%20since%3A{}&count=20&query_source=typed_query{}&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2ChasNftAvatar%2CvoiceInfo%2Cenrichments%2CsuperFollowMetadata%2CunmentionInfo'.format(
                j, tim, threeDayAgosss, cursors)
            # url = 'https://twitter.com/i/api/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&include_ext_has_nft_avatar=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&include_ext_sensitive_media_warning=true&send_error_codes=true&simple_quoted_tweet=true&q=(%23BLM)%20lang%3Aen%20until%3A{}%20since%3A{}&count=20&query_source=typed_query{}&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2ChasNftAvatar%2CvoiceInfo%2CsuperFollowMetadata'
            res = get_html(url)
            res_dict = res.json()
            # print(res_dict)
            if res_dict['globalObjects']['tweets']:
                print("Tweets found on this page, parsing them")
                get_twitter_article(res_dict['globalObjects'], j)
                try:
                    if res_dict['timeline']['instructions'][0]['addEntries']['entries'][-1]:
                        page_next = res_dict['timeline']['instructions'][0]['addEntries']['entries'][-1]
                        print(page_next)
                        cursor = page_next['content']['operation']['cursor']['value']
                        print(cursor)
                        cursor = urllib.parse.quote(cursor)
                        cursors = "&cursor={}".format(cursor)
                except Exception as e:
                    print("Error getting the next-page cursor:", e)
                    break
            else:
                break
        except Exception as e:
            print("Error fetching the next page:", e)
            break
def get_twitter_article(data, keyword):
    try:
        if data:
            for key, i in data['tweets'].items():
                print('------------------------------')
                info = {}
                forward_info = {}
                user = data['users']
                info['m_content_location'] = ''
                try:
                    m_parent_id = i['retweeted_status_id_str']
                    info['m_parent_id'] = m_parent_id
                    forward_info['r_parent_id'] = m_parent_id
                except Exception:
                    info['m_parent_id'] = ''
                    forward_info['r_parent_id'] = ''
                lang = i['lang']
                # Retweeted tweet
                if info['m_parent_id']:
                    # URL of the retweeted tweet
                    # m_parent_url = '{}/status/'.format(user_url) + info['m_parent_id']
                    # info['m_parent_url'] = m_parent_url
                    # print("URL of the retweeted tweet:", m_parent_url)
                    # Text of the retweeted tweet
                    m_parent_content = i['full_text']
                    info['m_parent_content'] = m_parent_content
                    # print("Text of the retweeted tweet:", m_parent_content)
                    r_is_trans = 1
                    info['r_is_trans'] = r_is_trans
                    print(r_is_trans)
                    info['m_content'] = ''
                else:
                    # Not a retweet. The original file is truncated here; the next
                    # two lines are an assumed completion mirroring the retweet branch.
                    info['r_is_trans'] = 0
                    info['m_content'] = i['full_text']
    except Exception as e:
        print("Error parsing tweet data:", e)
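# Usage sketch (an assumption, not present in the original file): the dates and
# keyword below are illustrative values only. get_twitter_info expects the
# "until" date first, the "since" date second, and the search keyword last,
# matching how they are substituted into the search URL above.
if __name__ == '__main__':
    until_date = '2020-07-17'   # hypothetical upper bound of the search window
    since_date = '2020-07-14'   # hypothetical lower bound of the search window
    keyword = 'BLM'             # hypothetical search keyword
    get_twitter_info(until_date, since_date, keyword)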