#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/7/17 16:33
# @Version:V 0.1
# @File : user_twitter.py
import csv
import datetime
import random
import time
import urllib.parse
import requests
from dateparser.search import search_dates
proxies = {'http': 'socks5://127.0.0.1:10808', "https": "socks5://127.0.0.1:10808"}
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
]
def get_spider_time():
    g_spider_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    return g_spider_time
def get_time(a):
    # Strip the "+0000" offset, parse the date string, and shift it to UTC+8.
    a = a.replace("+0000", "")
    time_str = search_dates(a)
    times = time_str[0][-1]
    print(times, type(times))
    publish_time_end = (times + datetime.timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S")
    return publish_time_end
def get_session():
    s = requests.session()
    s.keep_alive = False
    s.proxies = proxies
    s.allow_redirects = False
    s.verify = False
    return s
def get_token():
    # Read the x-guest-token from a local file; retry until it can be read.
    while True:
        try:
            with open('token.txt', 'r') as f:
                tokens = f.read()
            return tokens
        except Exception as e:
            print("Error reading token:", e)
def get_html(url):
    s = get_session()
    # print("tokens obtained on the first fetch:", tokens)
    num = 0
    while num < 50:
        try:
            num += 1
            tokens = get_token()
            headers = {
                'Referer': 'https://twitter.com/algore',
                'Origin': 'https://twitter.com',
                'User-Agent': random.choice(user_agents),
                'x-guest-token': tokens,
                'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
            }
            print("Requesting URL:", url)
            # Possible error responses:
            # {'errors': [{'code': 200, 'message': 'Forbidden.'}]}  (rate limit exceeded)
            # {'errors': [{'code': 34, 'message': 'Sorry, that page does not exist.'}]}
            r = s.get(url, headers=headers, proxies=proxies, timeout=20)
            res = str(r.json())
            if "Rate limit exceeded" in res or 'Forbidden' in res or "Sorry, that page does not exist" in res:
                s = get_session()
                continue
            else:
                return r
        except Exception as e:
            s = get_session()
            print(e)
            tokens = get_token()
def get_twitter_info(tim, threeDayAgosss, j):
    # tim: "until" date, threeDayAgosss: "since" date, j: search keyword.
    print("Search window:", tim, threeDayAgosss)
    cursors = ''
    num = 0
    while num < 500:
        num += 1
        try:
            url = 'https://twitter.com/i/api/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&include_ext_has_nft_avatar=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&include_ext_sensitive_media_warning=true&include_ext_trusted_friends_metadata=true&send_error_codes=true&simple_quoted_tweet=true&q={}%20until%3A{}%20since%3A{}&count=20&query_source=typed_query{}&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2ChasNftAvatar%2CvoiceInfo%2Cenrichments%2CsuperFollowMetadata%2CunmentionInfo'.format(
                j, tim, threeDayAgosss, cursors)
            # url = 'https://twitter.com/i/api/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&include_ext_has_nft_avatar=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&include_ext_sensitive_media_warning=true&send_error_codes=true&simple_quoted_tweet=true&q=(%23BLM)%20lang%3Aen%20until%3A{}%20since%3A{}&count=20&query_source=typed_query{}&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2ChasNftAvatar%2CvoiceInfo%2CsuperFollowMetadata'
            res = get_html(url)
            res_dict = res.json()
            # print(res_dict)
            if res_dict['globalObjects']['tweets']:
                print("Tweets found on this page, parsing them")
                get_twitter_article(res_dict['globalObjects'], j)
                try:
                    if res_dict['timeline']['instructions'][0]['addEntries']['entries'][-1]:
                        page_next = res_dict['timeline']['instructions'][0]['addEntries']['entries'][-1]
                        print(page_next)
                        cursor = page_next['content']['operation']['cursor']['value']
                        print(cursor)
                        cursor = urllib.parse.quote(cursor)
                        cursors = "&cursor={}".format(cursor)
                except Exception as e:
                    print("Error getting the next-page cursor:", e)
                    break
            else:
                break
        except Exception as e:
            print("Error fetching the next page:", e)
            break
def get_twitter_article(data, keyword):
    try:
        if data:
            for key, i in data['tweets'].items():
                print('------------------------------')
                info = {}
                forward_info = {}
                user = data['users']
                info['m_content_location'] = ''
                try:
                    m_parent_id = i['retweeted_status_id_str']
                    info['m_parent_id'] = m_parent_id
                    forward_info['r_parent_id'] = m_parent_id
                except Exception:
                    info['m_parent_id'] = ''
                    forward_info['r_parent_id'] = ''
                lang = i['lang']
                # Retweeted tweet
                if info['m_parent_id']:
                    # URL of the retweeted tweet
                    # m_parent_url = '{}/status/'.format(user_url) + info['m_parent_id']
                    # info['m_parent_url'] = m_parent_url
                    # print("URL of the retweeted tweet:", m_parent_url)
                    # Text of the retweeted tweet
                    m_parent_content = i['full_text']
                    info['m_parent_content'] = m_parent_content
                    # print("Text of the retweeted tweet:", m_parent_content)
                    r_is_trans = 1
                    info['r_is_trans'] = r_is_trans
                    print(r_is_trans)
                    info['m_content'] = ''
                else:
                    # Not a retweet. The original file is truncated here; the next
                    # two lines are an assumed completion mirroring the retweet branch.
                    info['r_is_trans'] = 0
                    info['m_content'] = i['full_text']
    except Exception as e:
        print("Error parsing tweet data:", e)
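# Usage sketch (an assumption, not present in the original file): the dates and
# keyword below are illustrative values only. get_twitter_info expects the
# "until" date first, the "since" date second, and the search keyword last,
# matching how they are substituted into the search URL above.
if __name__ == '__main__':
    until_date = '2020-07-17'   # hypothetical upper bound of the search window
    since_date = '2020-07-14'   # hypothetical lower bound of the search window
    keyword = 'BLM'             # hypothetical search keyword
    get_twitter_info(until_date, since_date, keyword)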