#!/usr/bin/env python
# coding:utf-8
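"""Scrapy CrawlSpider for Google Play Store app detail pages.

Seed URLs are gathered from the store landing pages, detail pages are matched by
the '/store/apps/details' pattern, and each page is parsed into a GoogleItem
(name, category, developer, score, rating histogram, installs, version and
Android requirements).
"""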
import json
import re
import time
from lxml import etree
import requests
from requests import Response
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from store.google.items import GoogleItem


def get_start_urls():
    """Collect seed URLs by scraping /store links from the Play Store landing pages."""
    google_urls = {'https://play.google.com/store', 'https://play.google.com/store/apps/top',
                   'https://play.google.com/store/apps/new'}
    start_urls = list()
    for google_url in google_urls:
        try:
            result = requests.get(google_url, timeout=5)
        except requests.RequestException:
            # Skip landing pages that cannot be fetched instead of aborting the whole seed list.
            continue
        if isinstance(result, Response):
            result = result.content
        dom_tree = etree.HTML(result)
        for link in dom_tree.xpath("//@href"):
            # Only relative /store links are turned into absolute seed URLs.
            if link.startswith('/store'):
                link = 'https://play.google.com' + link
                if link not in start_urls:
                    start_urls.append(link)
    return start_urls


class GoogleSpider(CrawlSpider):
    name = "google"
    allowed_domains = ["play.google.com"]
    start_urls = get_start_urls()
    rules = [
        Rule(LinkExtractor(allow=('/store/apps/details',)),
             callback='parse_app', follow=True),
    ]

    def parse_app(self, response):
        self.do_nothing()
        item = GoogleItem()
        self.init_item(item)
        try:
            item['url'] = response.url
            # App name: strip astral-plane characters (emoji etc.) before storing.
            tmp = response.xpath('//h1[@class="AHFaub"]/span[1]').xpath('text()').extract()
            tmp = tmp[0] if tmp else ''
            try:
                # Wide (UCS-4) builds can match astral characters directly.
                high_points = re.compile(u'[\U00010000-\U0010ffff]')
            except re.error:
                # Narrow (UCS-2) builds see them as surrogate pairs instead.
                high_points = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
            item['app_name'] = high_points.sub(u'', tmp)
            list_tmp = response.xpath('//a[@itemprop="genre"]').xpath('text()').extract()
            tmp = ''
            for lt in list_tmp:
                tmp += ' ' + lt
            item['category'] = tmp
            # Score: whole stars come from counting the filled-star divs; the fractional
            # part is read from the width style of the partially filled star.
            tmp = response.xpath('//div[@class="dNLKff"]/c-wiz/div/div//'
                                 'div[@class="L0jl5e bUWb7c cm4lTe"]/div/@style').extract()
            tmp = float(tmp[0].strip('%').strip('width: ')) / 100.0 if tmp else float(0)
            tmp2 = response.xpath('count(//div[@class="dNLKff"]/c-wiz/div/div//'
                                  'div[@class="vQHuPe bUWb7c"])').extract()
            item['score'] = float(tmp2[0]) + tmp if tmp2 else tmp
            tmp = response.xpath('//a[@class="hrTbp R8zArc"]').xpath('text()').extract()
            tmp = tmp[0] if tmp else ''
            item['provider'] = high_points.sub(u'', tmp)
            tmp = response.xpath('//span[@class="AYi5wd TBRnV"]/span[1]').xpath('text()').extract()
            tmp = tmp[0] if tmp else ''
            item['raters'] = tmp
            # "Additional information" block: update time, size, installs, version, requirements.
            list_tmp = response.xpath('//div[@class="W4P4ne "]/div[2]/div/div/span/div/span') \
                .xpath('text()').extract()
            try:
                self.get_some_cols(list_tmp, item)
            except Exception as e:
                self.logger.error(str(e) + str(list_tmp) + str(item['url']))
                self.re_get_some_cols(item)
            tmp = response.xpath('//div[@class="W4P4ne "]/div[2]/div/div/span/div/span/div') \
                .xpath('text()').extract()
            tmp = tmp[0] if tmp else ''
            item['content_level'] = tmp
            # Rating histogram: titles are listed from 5 stars down to 1, so reverse them
            # into a mapping keyed 1 (lowest) to 5 (highest) before serialising to JSON.
            list_tmp = response.xpath('//div[@class="mMF0fd"]/span/@title').extract()
            tmp = {}
            for i in range(len(list_tmp)):
                tmp[i + 1] = str(list_tmp[len(list_tmp) - i - 1])
            item['score_detail'] = json.dumps(tmp)
            yield item
        except Exception as e:
            self.logger.error(str(e) + str(item['url']))

    def format_time(self, uf_time):
        """Convert a Play Store date such as 'March 7, 2019' to '2019-03-07'."""
        month_dict = {'January': '01', 'February': '02', 'March': '03', 'April': '04',
                      'May': '05', 'June': '06', 'July': '07', 'August': '08',
                      'September': '09', 'October': '10', 'November': '11', 'December': '12'}
        if '' != uf_time:
            try:
                split_time = uf_time.replace(',', '').split()
                if len(split_time) == 3:
                    month = month_dict[split_time[0]]
                    day = split_time[1]
                    year = split_time[2]
                    st_created_time = year + '-' + month + '-' + day
                    time_array = time.strptime(st_created_time, '%Y-%m-%d')
                    return time.strftime('%Y-%m-%d', time_array)
            except Exception as e:
                self.logger.error(str(e))
        # Fall back to the raw string if the date cannot be parsed.
        return uf_time

    def init_item(self, item):
        self.do_nothing()
        # Pre-fill every field with an empty string so partially parsed items stay consistent.
        for col in 'url,app_name,category,provider,score,score_detail,raters,app_update_time,' \
                   'app_size,installation_times,current_app_version,android_requirements,' \
                   'content_level'.split(','):
            item[col] = ''

    def re_get_some_cols(self, item):
        """Re-fetch the detail page directly and retry the 'additional information' fields."""
        result = ''
        try:
            result = requests.get(item['url'], timeout=5)
        except requests.RequestException as e:
            self.logger.error(str(e))
        if isinstance(result, Response):
            result = result.content
        else:
            # Nothing was fetched; keep whatever values are already on the item.
            return
        dom_tree = etree.HTML(result)
        list_tmp = dom_tree.xpath('//div[@class="W4P4ne "]/div[2]/div/div/span/div/span/text()')
        try:
            self.get_some_cols(list_tmp, item)
        except Exception as e:
            self.logger.error(str(e) + str(list_tmp) + str(item['url']))

    def get_some_cols(self, list_tmp, item):
        # The 'additional information' rows omit the app size for some apps, so field
        # positions shift depending on whether the second entry is the install count
        # (contains '+') or the size.
        if list_tmp:
            item['app_update_time'] = self.format_time(list_tmp[0])
            if len(list_tmp) > 1:
                if '+' in list_tmp[1]:
                    item['installation_times'] = list_tmp[1]
                    item['current_app_version'] = list_tmp[2].strip()
                    item['android_requirements'] = list_tmp[3]
                else:
                    item['app_size'] = list_tmp[1]
                    item['installation_times'] = list_tmp[2]
                    if 'and up' in list_tmp[3]:
                        item['android_requirements'] = list_tmp[3]
                    else:
                        item['current_app_version'] = list_tmp[3].strip()
                        item['android_requirements'] = list_tmp[4]

    def do_nothing(self):
        # Intentional no-op.
        pass
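

# Usage note (a sketch, assuming a standard Scrapy project layout with this spider
# registered under the store.google package; run from the project root):
#   scrapy crawl google -o google_apps.json
#
# GoogleItem (imported from store.google.items above) is assumed to declare one
# scrapy.Field per column initialised in init_item(), roughly:
#
#   import scrapy
#
#   class GoogleItem(scrapy.Item):
#       url = scrapy.Field()
#       app_name = scrapy.Field()
#       category = scrapy.Field()
#       provider = scrapy.Field()
#       score = scrapy.Field()
#       score_detail = scrapy.Field()
#       raters = scrapy.Field()
#       app_update_time = scrapy.Field()
#       app_size = scrapy.Field()
#       installation_times = scrapy.Field()
#       current_app_version = scrapy.Field()
#       android_requirements = scrapy.Field()
#       content_level = scrapy.Field()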