package com.fullstackyang.crawler.weibo.client;
import com.fullstackyang.httpclient.HttpClientInstance;
import com.fullstackyang.httpclient.HttpRequestUtils;
import com.google.common.base.Strings;
import com.google.common.collect.Maps;
import com.google.common.net.HttpHeaders;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.json.JSONObject;
import java.io.UnsupportedEncodingException;
import java.math.BigDecimal;
import java.net.URLEncoder;
import java.util.Map;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
/**
 * Login-free request client for Weibo.
 *
 * <p>Instead of logging in with an account, the client asks passport.weibo.com
 * for visitor cookies (SUB / SUBP) via {@link CookieFetcher} and attaches them
 * to every request it sends.</p>
 *
 * @author fullstackyang
 */
@Slf4j
public class WeiboClient {

    /** Shared fetcher used by all clients to obtain visitor cookies. */
    private static final CookieFetcher cookieFetcher = new CookieFetcher();

    /** Guards cookie refreshes; shared by all instances because the lock is static. */
    private static final Lock lock = new ReentrantLock();

    /** Cookie header value sent with every request; replaced by {@link #cookieReset()}. */
    private volatile String cookie;

    /**
     * Creates a client and immediately fetches a visitor cookie.
     * The constructor blocks until {@link CookieFetcher#getCookie()} returns.
     */
    public WeiboClient() {
        this.cookie = cookieFetcher.getCookie();
    }

    /**
     * Switches proxy and fetches a new cookie; call this when getting html failed
     * and then try again.
     *
     * <p>Uses {@code tryLock()}: when the lock is already held, this call returns
     * immediately without changing the cookie.</p>
     */
    public void cookieReset() {
        if (lock.tryLock()) {
            try {
                HttpClientInstance.instance().changeProxy();
                this.cookie = cookieFetcher.getCookie();
                log.info("cookie :{}", cookie);
            } finally {
                lock.unlock();
            }
        }
    }

    /**
     * Fetches a page with the visitor cookie attached.
     *
     * <p>Example:</p>
     * <p>WeiboClient weiboClient = new WeiboClient();</p>
     * <p>weiboClient.get("http://weibo.com");</p>
     *
     * <p>The request is repeated, with {@link #cookieReset()} between attempts,
     * until the underlying client returns a non-null body. There is no upper
     * bound on the number of attempts.</p>
     *
     * @param url a URL under weibo.com
     * @return the response body, or an empty string when {@code url} is null or empty
     */
    public String get(String url) {
        if (Strings.isNullOrEmpty(url)) {
            return "";
        }

        while (true) {
            HttpGet httpGet = new HttpGet(url);
            httpGet.addHeader(HttpHeaders.COOKIE, cookie);
            httpGet.addHeader(HttpHeaders.HOST, "weibo.com");
            httpGet.addHeader("Upgrade-Insecure-Requests", "1");
            httpGet.setConfig(RequestConfig.custom().setSocketTimeout(3000)
                    .setConnectTimeout(3000).setConnectionRequestTimeout(3000).build());

            String html = HttpClientInstance.instance().tryExecute(httpGet, null, null);
            if (html != null) {
                return html;
            }
            // A null body is treated as a failed fetch: refresh the cookie and retry.
            cookieReset();
        }
    }

    /**
     * Obtains the cookies that are required for visiting Weibo.
     *
     * <p>Two requests are made: a POST to {@link #GEN_VISITOR_URL} that yields a
     * {@code tid}, followed by a GET to {@link #VISITOR_URL} that yields the
     * {@code sub} / {@code subp} values.</p>
     */
    @NoArgsConstructor
    static class CookieFetcher {

        static final String PASSPORT_URL = "https://passport.weibo.com/visitor/visitor?entry=miniblog&a=enter&url=http://weibo.com/?category=2"
                + "&domain=.weibo.com&ua=php-sso_sdk_client-0.6.23";

        static final String GEN_VISITOR_URL = "https://passport.weibo.com/visitor/genvisitor";

        static final String VISITOR_URL = "https://passport.weibo.com/visitor/visitor?a=incarnate";

        /** Marker that both passport endpoints include in a successful response. */
        private static final String RETCODE_OK = "\"retcode\":20000000";

        /**
         * Builds the Cookie header value.
         *
         * <p>Loops, changing proxy between attempts, until both SUB and SUBP are
         * non-empty. There is no upper bound on the number of attempts.</p>
         *
         * @return the cookie string containing the SUB and SUBP values
         */
        private String getCookie() {
            Map<String, String> map;
            while (true) {
                map = getCookieParam();
                // isNoneEmpty is false for null as well as "", so it also covers missing keys.
                if (StringUtils.isNoneEmpty(map.get("SUB"), map.get("SUBP"))) {
                    break;
                }
                HttpClientInstance.instance().changeProxy();
            }
            return " YF-Page-G0=" + "; _s_tentry=-; SUB=" + map.get("SUB") + "; SUBP=" + map.get("SUBP");
        }

        /**
         * Runs the two-step visitor handshake once.
         *
         * @return a map with keys {@code SUB} and {@code SUBP}; the values are empty
         *         strings when either step did not report success
         */
        private Map<String, String> getCookieParam() {
            // Turn the 13-digit millisecond timestamp into "sssssssss.mmmm" form for _rand.
            String time = System.currentTimeMillis() + "";
            time = time.substring(0, 9) + "." + time.substring(9, 13);
            String passporturl = PASSPORT_URL + "&_rand=" + time;

            String tid = "";
            String c = "";
            String w = "";
            String genResponse = postGenvisitor(passporturl);
            if (genResponse.contains(RETCODE_OK)) {
                JSONObject data = new JSONObject(genResponse).getJSONObject("data");
                tid = data.optString("tid");
                try {
                    tid = URLEncoder.encode(tid, "utf-8");
                } catch (UnsupportedEncodingException e) {
                    // Keep the un-encoded tid (best effort), but do not hide the failure.
                    log.warn("could not url-encode tid", e);
                }
                c = data.has("confidence") ? "000" + data.getInt("confidence") : "100";
                w = data.optBoolean("new_tid") ? "3" : "2";
            }

            String s = "";
            String sp = "";
            if (StringUtils.isNoneEmpty(tid, w, c)) {
                String visitorResponse = stripCallback(getVisitor(tid, w, c, passporturl));
                if (visitorResponse.contains(RETCODE_OK)) {
                    JSONObject data = new JSONObject(visitorResponse).getJSONObject("data");
                    // optString yields "" for a missing key, which makes getCookie() retry
                    // instead of failing with an exception.
                    s = data.optString("sub");
                    sp = data.optString("subp");
                }
            }

            Map<String, String> map = Maps.newHashMap();
            map.put("SUB", s);
            map.put("SUBP", sp);
            return map;
        }

        /**
         * Posts the browser fingerprint to {@link #GEN_VISITOR_URL}.
         *
         * @param passporturl sent as the Referer header
         * @return the response with its callback wrapper removed; empty when the
         *         response was null
         */
        private String postGenvisitor(String passporturl) {
            Map<String, String> headers = Maps.newHashMap();
            headers.put(HttpHeaders.ACCEPT, "*/*");
            headers.put(HttpHeaders.ORIGIN, "https://passport.weibo.com");
            headers.put(HttpHeaders.REFERER, passporturl);

            Map<String, String> params = Maps.newHashMap();
            params.put("cb", "gen_callback");
            params.put("fp", fp());

            HttpPost httpPost = HttpRequestUtils.createHttpPost(GEN_VISITOR_URL, headers, params);
            String str = HttpClientInstance.instance().execute(httpPost, null);
            return stripCallback(str);
        }

        /**
         * Requests the visitor cookies for a previously issued {@code tid}.
         *
         * @param tid         url-encoded visitor id
         * @param w           "3" for a new tid, otherwise "2"
         * @param c           confidence string; only its last three characters go into the url
         * @param passporturl sent as the Referer header
         * @return the raw response, still wrapped in its callback; the value comes
         *         straight from the underlying client
         */
        private String getVisitor(String tid, String w, String c, String passporturl) {
            String url = VISITOR_URL + "&t=" + tid + "&w=" + w + "&c=" + c.substring(c.length() - 3)
                    + "&gc=&cb=cross_domain&from=weibo&_rand=0." + rand();

            Map<String, String> headers = Maps.newHashMap();
            headers.put(HttpHeaders.ACCEPT, "*/*");
            headers.put(HttpHeaders.HOST, "passport.weibo.com");
            headers.put(HttpHeaders.COOKIE, "tid=" + tid + "__0" + c);
            headers.put(HttpHeaders.REFERER, passporturl);

            HttpGet httpGet = HttpRequestUtils.createHttpGet(url, headers);
            httpGet.setConfig(RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD).build());
            return HttpClientInstance.instance().execute(httpGet, null);
        }

        /**
         * Removes a {@code callback( ... )} wrapper from a response.
         *
         * @param response raw response text, may be null
         * @return the text between the first "(" and the last ")"; the text after
         *         the first "(" when no usable ")" follows it; the whole text when
         *         there is no "("; an empty string for null
         */
        private static String stripCallback(String response) {
            if (response == null) {
                return "";
            }
            int start = response.indexOf("(") + 1;
            int end = response.lastIndexOf(")");
            if (end < start) {
                end = response.length();
            }
            return response.substring(start, end);
        }

        /** Returns a random non-negative integer below 10^16 as a decimal string. */
        private static String rand() {
            return new BigDecimal(Math.floor(Math.random() * 10000000000000000L)).toString();
        }

        /** Returns the fixed browser fingerprint, serialized as JSON, sent as the "fp" parameter. */
        private static String fp() {
            JSONObject jsonObject = new JSONObject();
            jsonObject.put("os", "1");
            jsonObject.put("browser", "Chrome59,0,3071,115");
            jsonObject.put("fonts", "undefined");
            jsonObject.put("screenInfo", "1680*1050*24");
            jsonObject.put("plugins",
                    "Enables Widevine licenses for playback of HTML audio/video content. (version: 1.4.8.984)::widevinecdmadapter.dll::Widevine Content Decryption Module|Shockwave Flash 26.0 r0::pepflashplayer.dll::Shockwave Flash|::mhjfbmdgcfjbbpaeojofohoefgiehjai::Chrome PDF Viewer|::internal-nacl-plugin::Native Client|Portable Document Format::internal-pdf-viewer::Chrome PDF Viewer");
            return jsonObject.toString();
        }
    }
}
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
爬虫(Web Crawler)是一种自动化程序,用于从互联网上收集信息。其主要功能是访问网页、提取数据并存储,以便后续分析或展示。爬虫通常由搜索引擎、数据挖掘工具、监测系统等应用于网络数据抓取的场景。 爬虫的工作流程包括以下几个关键步骤: URL收集: 爬虫从一个或多个初始URL开始,递归或迭代地发现新的URL,构建一个URL队列。这些URL可以通过链接分析、站点地图、搜索引擎等方式获取。 请求网页: 爬虫使用HTTP或其他协议向目标URL发起请求,获取网页的HTML内容。这通常通过HTTP请求库实现,如Python中的Requests库。 解析内容: 爬虫对获取的HTML进行解析,提取有用的信息。常用的解析工具有正则表达式、XPath、Beautiful Soup等。这些工具帮助爬虫定位和提取目标数据,如文本、图片、链接等。 数据存储: 爬虫将提取的数据存储到数据库、文件或其他存储介质中,以备后续分析或展示。常用的存储形式包括关系型数据库、NoSQL数据库、JSON文件等。 遵守规则: 为避免对网站造成过大负担或触发反爬虫机制,爬虫需要遵守网站的robots.txt协议,限制访问频率和深度,并模拟人类访问行为,如设置User-Agent。 反爬虫应对: 由于爬虫的存在,一些网站采取了反爬虫措施,如验证码、IP封锁等。爬虫工程师需要设计相应的策略来应对这些挑战。 爬虫在各个领域都有广泛的应用,包括搜索引擎索引、数据挖掘、价格监测、新闻聚合等。然而,使用爬虫需要遵守法律和伦理规范,尊重网站的使用政策,并确保对被访问网站的服务器负责。
资源推荐
资源详情
资源评论
收起资源包目录
新型的免登录微博爬虫,自动获取Cookie直接进行抓取和解析微博数据,免去了账号登录的过程,彻底摆脱账号被封的困扰.zip (37个子文件)
SJT-code
pom.xml 3KB
modules
pom.xml 3KB
src
main
java
com
fullstackyang
crawler
weibo
client
WeiboClient.java 8KB
parser
detail
DetailContentHandler.java 606B
WeiboDetailParser.java 1KB
WeiboDetailHandler.java 256B
WeiboBaseHandler.java 190B
feed
WeiboFeedHandler.java 248B
FeedForwardCountHandler.java 844B
FeedHasOriginHandler.java 2KB
FeedOriginUrlHandler.java 750B
FeedOriginPubTimeHandler.java 705B
FeedOriginMidlHandler.java 704B
FeedUrlHandler.java 739B
FeedMidHandler.java 667B
FeedOriginContentHandler.java 866B
WeiboFeedParser.java 2KB
FeedNicknameHandler.java 701B
FeedOriginForwardCountHandler.java 845B
FeedPubTimeHandler.java 697B
FeedContentHandler.java 1KB
FeedOidHandler.java 856B
user
WeiboForwardParser.java 1KB
ForwardUserHandler.java 557B
WeiboUserHandler.java 248B
WeiboBaseParser.java 5KB
dto
WeiboFeed.java 728B
converter
DetailFeedConverter.java 1KB
OriginWeiboFeed.java 249B
WeiboUser.java 410B
NormalWeiboFeed.java 550B
AbstractDTO.java 1KB
WeiboDetail.java 271B
commons
pom.xml 1019B
src
main
java
com
fullstackyang
crawler
weibo
utils
DateConvertor.java 4KB
EncodeConvertor.java 545B
.gitignore 103B
共 37 条
- 1
资源评论
JJJ69
- 粉丝: 6006
- 资源: 5613
下载权益
C知道特权
VIP文章
课程特权
开通VIP
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功