# -*- coding: utf-8 -*-
import scrapy
from just.items import FanyiItem
import urllib.request, urllib.parse, urllib.error
import json
import execjs
import time
import random
import hashlib
import chardet
import sys
class FanyiSpider(scrapy.Spider):
    """Translate a command-line phrase through several public translation sites.

    Usage:
        scrapy crawl fanyi -a source="hello"

    Baidu's language-detection endpoint decides the direction (zh <-> en);
    each supported site is then queried (POST for most, GET for Google) and
    every answer is yielded as a ``FanyiItem`` with the site's display name.
    Only Chinese and English input is supported.
    """
    name = 'fanyi'
    allowed_domains = ['fanyi.baidu.com', 'translate.google.cn',
                       'fanyi.youdao.com', 'fy.iciba.com', 'cn.bing.com']
    # Translation endpoints queried in parse1.
    start_url = ["http://fanyi.baidu.com/basetrans",
                 "http://translate.google.cn/translate_a",
                 "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule",
                 "http://fy.iciba.com/ajax.php?a=fy",
                 "https://cn.bing.com/ttranslate?&IG=B9C57B6523AF41F68572068D1F457A9F&IID=translator.5036.3"]
    # Baidu endpoint that guesses the language of the input text.
    lang_detect_url = "http://fanyi.baidu.com/langdetect"

    def __init__(self, *args, **kwargs):
        # The text to translate arrives via `-a source="..."` on the
        # command line; scrapy forwards it as a keyword argument.
        self.source = kwargs.get('source')

    def start_requests(self):
        """Entry point: ask Baidu which language the input is in."""
        yield scrapy.FormRequest(
            url=self.lang_detect_url,
            formdata={"query": self.source},
            callback=self.parse1
        )

    def parse1(self, response):
        """Fan out one translation request per site.

        The detection response carries the language code in "lan".
        Anything other than Chinese ("zh") or English ("en") is ignored.
        """
        lang_type = json.loads(response.body.decode())["lan"]
        print(lang_type)
        if lang_type != "zh" and lang_type != "en":
            return
        # Sites queried with a POST form.
        for url in self.start_url:
            # Host part of the URL selects the per-site form layout.
            # (urllib.request.splittype/splithost are deprecated and were
            # removed in Python 3.13; urlsplit is the supported API.)
            host = urllib.parse.urlsplit(url).netloc
            data = {}
            if host == "fanyi.baidu.com":
                if lang_type == "zh":
                    data = {"from": "zh", "to": "en", "query": self.source}
                elif lang_type == "en":
                    data = {"from": "en", "to": "zh", "query": self.source}
            elif host == "fanyi.youdao.com":
                # Youdao signs each request with md5(client + text + salt + key).
                u = 'fanyideskweb'
                d = self.source
                f = str(int(time.time() * 1000) + random.randint(1, 10))
                c = 'ebSeFb%=XZ%T[KZ)c(sy!'
                sign = hashlib.md5((u + d + f + c).encode('utf-8')).hexdigest()
                common = {"i": self.source, "smartresult": "dict",
                          "client": "fanyideskweb", "salt": f, "sign": sign,
                          "doctype": "json", "version": "2.1",
                          "keyfrom": "fanyi.web",
                          "action": "FY_BY_CL1CKBUTTON", "typoResult": "false"}
                if lang_type == "zh":
                    data = dict(common, **{"from": "zh-CHS", "to": "en"})
                elif lang_type == "en":
                    # BUG FIX: the original sent zh-CHS -> en here as well,
                    # so English input was never translated into Chinese.
                    data = dict(common, **{"from": "en", "to": "zh-CHS"})
            elif host == "fy.iciba.com":
                data = {"f": "auto", "t": "auto", "w": self.source}
            elif host == "cn.bing.com":
                if lang_type == "zh":
                    data = {"from": "zh-CHS", "to": "en", "text": self.source}
                elif lang_type == "en":
                    data = {"from": "en", "to": "zh-CHS", "text": self.source}
            yield scrapy.FormRequest(
                url=url,
                formdata=data,
                callback=self.parse
            )
        # Sites queried with a plain GET (Google needs a computed "tk" token).
        for url in self.start_url:
            host = urllib.parse.urlsplit(url).netloc
            if host == "translate.google.cn":
                tk = self.getTk(self.source)
                if lang_type == "zh":
                    from_lang = "zh-CN"
                    to_lang = "en"
                elif lang_type == "en":
                    from_lang = "en"
                    to_lang = "zh-CN"
                local_url = "https://translate.google.cn/translate_a/single?client=webapp" \
                    "&sl=" + from_lang + "&tl=" + to_lang + "&hl=" + to_lang + "&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw" \
                    "&dt=rm&dt=ss&dt=t&source=btn&ssel=0&tsel=0&kc=0&tk=" + tk + "&q=" + self.source
                yield scrapy.Request(url=local_url, callback=self.parse)

    def parse(self, response):
        """Extract the translated text from a site response.

        The responding host decides which JSON path holds the translation;
        the result is yielded as a FanyiItem (source text, site name, target).
        """
        host = urllib.parse.urlsplit(response.url).netloc
        dict_ret = json.loads(response.body.decode())
        ret = ''
        website = ''
        if host == "fanyi.baidu.com":
            ret = dict_ret["trans"][0]["dst"]
            website = "百度翻译"
        elif host == "translate.google.cn":
            ret = dict_ret[0][0][0]
            website = "谷歌翻译"
        elif host == "fanyi.youdao.com":
            ret = dict_ret['translateResult'][0][0]['tgt']
            website = "有道翻译"
        elif host == "fy.iciba.com":
            ret = dict_ret['content']['out']
            website = "爱词霸翻译"
        elif host == "cn.bing.com":
            ret = dict_ret['translationResponse']
            website = "必应翻译"
        item = FanyiItem()
        item['source'] = self.source
        item['website'] = website
        item['target'] = ret
        yield item

    def getTk(self, text):
        """Compute Google Translate's "tk" token by running its own JS.

        The snippet below is Google's obfuscated token function, executed
        verbatim through execjs. (Baidu is queried via its mobile endpoint
        instead because its desktop token/sign algorithm is not known.)
        """
        ctx = execjs.compile("""
        function TL(a)
        {
        var k = "";
        var b = 406644;
        var b1 = 3293161072;
        var jd = ".";
        var $b = "+-a^+6";
        var Zb = "+-3^+b+-f";
        for (var e = [], f = 0, g = 0; g < a.length; g++) {
            var m = a.charCodeAt(g);
            128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
            e[f++] = m >> 18 | 240,
            e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
            e[f++] = m >> 6 & 63 | 128),
            e[f++] = m & 63 | 128)
        }
        a = b;
        for (f = 0; f < e.length; f++) a += e[f],
        a = RL(a, $b);
        a = RL(a, Zb);
        a ^= b1 || 0;
        0 > a && (a = (a & 2147483647) + 2147483648);
        a %= 1E6;
        return a.toString() + jd + (a ^ b)
        };
        function RL(a, b) {
        var t = "a";
        var Yb = "+";
        for (var c = 0; c < b.length - 2; c += 3) {
            var d = b.charAt(c + 2),
            d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
            d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
            a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
        }
        return a
        }
        """)
        return ctx.call("TL", text)
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
just.zip (27个子文件)
just
ghostdriver.log 0B
scrapy.cfg 251B
.idea
workspace.xml 35KB
misc.xml 199B
modules.xml 260B
inspectionProfiles
encodings(1).xml 181B
just.iml 398B
baidu.json 320B
just
spiders
__pycache__
__init__.cpython-36.pyc 122B
baidu.cpython-36.pyc 526B
spdier58.cpython-36.pyc 1KB
fanyi.cpython-36.pyc 5KB
__init__.py 161B
spdier58.py 1KB
baidu.py 221B
fanyi.py 8KB
items.py 667B
pipelines.py 490B
middlewares.py 5KB
settings.py 4KB
__pycache__
__init__.cpython-36.pyc 114B
items.cpython-36.pyc 740B
pipelines.cpython-36.pyc 812B
middlewares.cpython-36.pyc 3KB
settings.cpython-36.pyc 653B
__init__.py 0B
run.py 375B
共 27 条
- 1
资源评论
- only礼拜天2018-10-07一般,不好用
- Insist_Tao2018-10-25一般般吧,我感觉
fzcbx
- 粉丝: 1
- 资源: 3
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功