# -*- coding: utf-8 -*-
import scrapy
from just.items import FanyiItem
import urllib.request, urllib.parse, urllib.error
import json
import execjs
import time
import random
import hashlib
import chardet
import sys
class FanyiSpider(scrapy.Spider):
    """Translate a command-line phrase through several public translation sites.

    Usage:
        scrapy crawl fanyi -a source="hello"

    Baidu's language-detection endpoint decides the direction (zh <-> en);
    each supported site is then queried (POST for most, GET for Google) and
    every answer is yielded as a ``FanyiItem`` with the site's display name.
    Only Chinese and English input is supported.
    """
    name = 'fanyi'
    allowed_domains = ['fanyi.baidu.com', 'translate.google.cn',
                       'fanyi.youdao.com', 'fy.iciba.com', 'cn.bing.com']
    # Translation endpoints queried in parse1.
    start_url = ["http://fanyi.baidu.com/basetrans",
                 "http://translate.google.cn/translate_a",
                 "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule",
                 "http://fy.iciba.com/ajax.php?a=fy",
                 "https://cn.bing.com/ttranslate?&IG=B9C57B6523AF41F68572068D1F457A9F&IID=translator.5036.3"]
    # Baidu endpoint that guesses the language of the input text.
    lang_detect_url = "http://fanyi.baidu.com/langdetect"

    def __init__(self, *args, **kwargs):
        # The text to translate arrives via `-a source="..."` on the
        # command line; scrapy forwards it as a keyword argument.
        self.source = kwargs.get('source')

    def start_requests(self):
        """Entry point: ask Baidu which language the input is in."""
        yield scrapy.FormRequest(
            url=self.lang_detect_url,
            formdata={"query": self.source},
            callback=self.parse1
        )

    def parse1(self, response):
        """Fan out one translation request per site.

        The detection response carries the language code in "lan".
        Anything other than Chinese ("zh") or English ("en") is ignored.
        """
        lang_type = json.loads(response.body.decode())["lan"]
        print(lang_type)
        if lang_type != "zh" and lang_type != "en":
            return
        # Sites queried with a POST form.
        for url in self.start_url:
            # Host part of the URL selects the per-site form layout.
            # (urllib.request.splittype/splithost are deprecated and were
            # removed in Python 3.13; urlsplit is the supported API.)
            host = urllib.parse.urlsplit(url).netloc
            data = {}
            if host == "fanyi.baidu.com":
                if lang_type == "zh":
                    data = {"from": "zh", "to": "en", "query": self.source}
                elif lang_type == "en":
                    data = {"from": "en", "to": "zh", "query": self.source}
            elif host == "fanyi.youdao.com":
                # Youdao signs each request with md5(client + text + salt + key).
                u = 'fanyideskweb'
                d = self.source
                f = str(int(time.time() * 1000) + random.randint(1, 10))
                c = 'ebSeFb%=XZ%T[KZ)c(sy!'
                sign = hashlib.md5((u + d + f + c).encode('utf-8')).hexdigest()
                common = {"i": self.source, "smartresult": "dict",
                          "client": "fanyideskweb", "salt": f, "sign": sign,
                          "doctype": "json", "version": "2.1",
                          "keyfrom": "fanyi.web",
                          "action": "FY_BY_CL1CKBUTTON", "typoResult": "false"}
                if lang_type == "zh":
                    data = dict(common, **{"from": "zh-CHS", "to": "en"})
                elif lang_type == "en":
                    # BUG FIX: the original sent zh-CHS -> en here as well,
                    # so English input was never translated into Chinese.
                    data = dict(common, **{"from": "en", "to": "zh-CHS"})
            elif host == "fy.iciba.com":
                data = {"f": "auto", "t": "auto", "w": self.source}
            elif host == "cn.bing.com":
                if lang_type == "zh":
                    data = {"from": "zh-CHS", "to": "en", "text": self.source}
                elif lang_type == "en":
                    data = {"from": "en", "to": "zh-CHS", "text": self.source}
            yield scrapy.FormRequest(
                url=url,
                formdata=data,
                callback=self.parse
            )
        # Sites queried with a plain GET (Google needs a computed "tk" token).
        for url in self.start_url:
            host = urllib.parse.urlsplit(url).netloc
            if host == "translate.google.cn":
                tk = self.getTk(self.source)
                if lang_type == "zh":
                    from_lang = "zh-CN"
                    to_lang = "en"
                elif lang_type == "en":
                    from_lang = "en"
                    to_lang = "zh-CN"
                local_url = "https://translate.google.cn/translate_a/single?client=webapp" \
                    "&sl=" + from_lang + "&tl=" + to_lang + "&hl=" + to_lang + "&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw" \
                    "&dt=rm&dt=ss&dt=t&source=btn&ssel=0&tsel=0&kc=0&tk=" + tk + "&q=" + self.source
                yield scrapy.Request(url=local_url, callback=self.parse)

    def parse(self, response):
        """Extract the translated text from a site response.

        The responding host decides which JSON path holds the translation;
        the result is yielded as a FanyiItem (source text, site name, target).
        """
        host = urllib.parse.urlsplit(response.url).netloc
        dict_ret = json.loads(response.body.decode())
        ret = ''
        website = ''
        if host == "fanyi.baidu.com":
            ret = dict_ret["trans"][0]["dst"]
            website = "百度翻译"
        elif host == "translate.google.cn":
            ret = dict_ret[0][0][0]
            website = "谷歌翻译"
        elif host == "fanyi.youdao.com":
            ret = dict_ret['translateResult'][0][0]['tgt']
            website = "有道翻译"
        elif host == "fy.iciba.com":
            ret = dict_ret['content']['out']
            website = "爱词霸翻译"
        elif host == "cn.bing.com":
            ret = dict_ret['translationResponse']
            website = "必应翻译"
        item = FanyiItem()
        item['source'] = self.source
        item['website'] = website
        item['target'] = ret
        yield item

    def getTk(self, text):
        """Compute Google Translate's "tk" token by running its own JS.

        The snippet below is Google's obfuscated token function, executed
        verbatim through execjs. (Baidu is queried via its mobile endpoint
        instead because its desktop token/sign algorithm is not known.)
        """
        ctx = execjs.compile("""
        function TL(a)
        {
        var k = "";
        var b = 406644;
        var b1 = 3293161072;
        var jd = ".";
        var $b = "+-a^+6";
        var Zb = "+-3^+b+-f";
        for (var e = [], f = 0, g = 0; g < a.length; g++) {
            var m = a.charCodeAt(g);
            128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
            e[f++] = m >> 18 | 240,
            e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
            e[f++] = m >> 6 & 63 | 128),
            e[f++] = m & 63 | 128)
        }
        a = b;
        for (f = 0; f < e.length; f++) a += e[f],
        a = RL(a, $b);
        a = RL(a, Zb);
        a ^= b1 || 0;
        0 > a && (a = (a & 2147483647) + 2147483648);
        a %= 1E6;
        return a.toString() + jd + (a ^ b)
        };
        function RL(a, b) {
        var t = "a";
        var Yb = "+";
        for (var c = 0; c < b.length - 2; c += 3) {
            var d = b.charAt(c + 2),
            d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
            d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
            a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
        }
        return a
        }
        """)
        return ctx.call("TL", text)
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
just.zip (27个子文件)
just
ghostdriver.log 0B
scrapy.cfg 251B
.idea
workspace.xml 35KB
misc.xml 199B
modules.xml 260B
inspectionProfiles
encodings(1).xml 181B
just.iml 398B
baidu.json 320B
just
spiders
__pycache__
__init__.cpython-36.pyc 122B
baidu.cpython-36.pyc 526B
spdier58.cpython-36.pyc 1KB
fanyi.cpython-36.pyc 5KB
__init__.py 161B
spdier58.py 1KB
baidu.py 221B
fanyi.py 8KB
items.py 667B
pipelines.py 490B
middlewares.py 5KB
settings.py 4KB
__pycache__
__init__.cpython-36.pyc 114B
items.cpython-36.pyc 740B
pipelines.cpython-36.pyc 812B
middlewares.cpython-36.pyc 3KB
settings.cpython-36.pyc 653B
__init__.py 0B
run.py 375B
共 27 条
- 1
资源评论
- only礼拜天2018-10-07一般,不好用
- Insist_Tao2018-10-25一般般吧,我感觉
fzcbx
- 粉丝: 1
- 资源: 3
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功