package com.mengkeng.selenium_demo.test;
import com.alibaba.fastjson.JSON;
import com.mengkeng.selenium_demo.entity.BuildAreaUrlLj;
import com.mengkeng.selenium_demo.entity.IdAndNamePO;
import com.mengkeng.selenium_demo.entity.TkBuildingsAreaInfolj;
import com.mengkeng.selenium_demo.entity.TkBuildingsMonthPriceLj;
import com.mengkeng.selenium_demo.mapper.BuildAreaUrlLjMapper;
import com.mengkeng.selenium_demo.service.ProxyService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateFormatUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.PageLoadStrategy;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.redis.core.HashOperations;
import org.springframework.data.redis.core.SetOperations;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 *
 * Date: 2022-09-05 13:58
 * Description: residential communities (xiaoqu)
 */
@RestController
@RequestMapping("areaInfo")
@Slf4j
public class LianjiaAreaInfoDemo {
// Redis access; this class obtains set and hash operations from it.
@Autowired
private StringRedisTemplate redisTemplate;
// Mapper used to load the BuildAreaUrlLj rows (one area URL each) that are crawled.
@Autowired
private BuildAreaUrlLjMapper buildAreaUrlLjMapper;
// Supplies the proxy address used when a new browser session is created.
@Autowired
private ProxyService proxyService;
// Redis set key: area URLs that are members of this set are skipped by parsePagePre.
public static final String SKIP_URLS = "SKIP_URLS_AREAINFO_LIANJIA";
// NOTE(review): presumably further Redis keys (pending URLs / community codes);
// their use is not visible in this part of the file — confirm against the callers.
public static final String URLS = "URLS_AREAINFO_LIANJIA";
public static final String AREA_INFO_COMMUNITY_CODE_LJ = "AREA_INFO_COMMUNITY_CODE_LJ";
// NOTE(review): not referenced in the visible part of the class; shared static mutable state.
private static LinkedList<String> pages = new LinkedList<>();
// Worker pool that runs one parsePage task per area URL (core 2, max 10, 30 s keep-alive).
// NOTE(review): the work queue is an unbounded LinkedBlockingQueue; per the
// ThreadPoolExecutor documentation the pool then never grows past the core size,
// so the maximum of 10 has no effect.
ThreadPoolExecutor pagepoolExecutor = new ThreadPoolExecutor(2,
10, 30L,
TimeUnit.SECONDS, new LinkedBlockingQueue<>());
/**
 * Entry point of the crawl, exposed under the "sync" mapping.
 *
 * <p>Each attempt opens a browser session, calls {@code getUrls} and then
 * {@code parsePagePre} with the Redis set operations, and always closes the
 * browser afterwards. A failed attempt is logged and retried after a
 * 10 second pause; the loop only ends once an attempt completes without an
 * exception.
 *
 * @throws InterruptedException if the thread is interrupted during an attempt
 *                              or while waiting before a retry
 */
@RequestMapping("sync")
public void sync() throws InterruptedException {
    System.setProperty("webdriver.chrome.driver", "D://chromedriver.exe");
    boolean finished = false;
    while (!finished) {
        try {
            ChromeDriver driver = getChromeDriver();
            try {
                // Obtained inside the guarded region so the browser is closed
                // even if this lookup fails.
                SetOperations ops = redisTemplate.opsForSet();
                getUrls(driver, ops);
                parsePagePre(ops);
            } finally {
                try {
                    sleep(1000);
                } finally {
                    // Must run even when the pause above fails, otherwise the
                    // Chrome process is leaked.
                    driver.quit();
                }
            }
            finished = true;
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // An interrupt is a request to stop: restore the flag and
                // propagate instead of retrying forever.
                Thread.currentThread().interrupt();
                throw (InterruptedException) e;
            }
            // Previously swallowed silently; keep the retry but record the cause.
            log.error("Crawl attempt failed, retrying in 10 seconds", e);
            Thread.sleep(10000);
        }
    }
    System.out.println("完成");
}
/**
 * Creates a new headless, incognito Chrome session.
 *
 * <p>The session is configured with image loading disabled, the EAGER page
 * load strategy, WebRTC preferences that disable non-proxied UDP, a randomly
 * chosen user agent, and the proxy returned by {@code proxyService} unless
 * that value is blank or {@code "local"}.
 *
 * @return a newly started driver; the caller is responsible for calling {@code quit()}
 */
private ChromeDriver getChromeDriver() {
    String nextProxy = proxyService.getNextProxy();
    System.out.println("当前ip是" + nextProxy);
    String[] userAgents = {"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
            "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)"};

    ChromeOptions chromeOptions = new ChromeOptions();
    chromeOptions.setPageLoadStrategy(PageLoadStrategy.EAGER);
    chromeOptions.addArguments("--incognito");
    chromeOptions.addArguments("--blink-settings=imagesEnabled=false");
    chromeOptions.addArguments("--headless");
    chromeOptions.addArguments("--no-sandbox");
    chromeOptions.addArguments("--disable-gpu");
    // "local" means: connect directly, without a proxy. Constant-first
    // comparison keeps the check null-safe on its own.
    if (StringUtils.isNotBlank(nextProxy) && !"local".equals(nextProxy)) {
        chromeOptions.addArguments("--proxy-server=" + nextProxy);
    }

    // WebRTC preferences: disable non-proxied UDP and multiple routes.
    Map<String, Object> prefs = new HashMap<>();
    prefs.put("webrtc.ip_handling_policy", "disable_non_proxied_udp");
    prefs.put("webrtc.multiple_routes_enabled", false);
    prefs.put("webrtc.nonproxied_udp_enabled", false);
    chromeOptions.setExperimentalOption("prefs", prefs);

    // Bound the random index by the array length instead of a literal so the
    // list can be edited without going out of range or ignoring new entries.
    Random random = new Random();
    String userAgent = userAgents[random.nextInt(userAgents.length)];
    // Chromium's documented switch is "--user-agent"; the previous
    // "User-Agent=" spelling is not a documented switch name.
    chromeOptions.addArguments("--user-agent=" + userAgent);

    ChromeDriver driver = new ChromeDriver(chromeOptions);
    driver.manage().window().maximize();
    return driver;
}
/**
 * Loads the area URLs from the mapper and submits one {@code parsePage} task
 * per URL to {@code pagepoolExecutor}, skipping URLs that are members of the
 * {@code SKIP_URLS} Redis set.
 *
 * <p>Only the rows at indexes 1 (inclusive) to 3500 (exclusive) are processed;
 * the window is clamped to the number of rows actually returned.
 *
 * @param ops Redis set operations used for the {@code SKIP_URLS} membership check
 */
private void parsePagePre(SetOperations ops) {
    HashOperations<String, Object, Object> opsForHash = redisTemplate.opsForHash();
    List<BuildAreaUrlLj> buildAreaUrlLjs = buildAreaUrlLjMapper.selectList(null);

    // subList(1, 3500) threw IndexOutOfBoundsException whenever fewer than
    // 3500 rows existed; clamp both bounds to the real size instead.
    // NOTE(review): index 0 is skipped exactly as before — confirm that this
    // is intentional and not an off-by-one.
    int rowCount = buildAreaUrlLjs.size();
    int fromIndex = Math.min(1, rowCount);
    int toIndex = Math.min(3500, rowCount);

    for (BuildAreaUrlLj buildAreaUrlLj : buildAreaUrlLjs.subList(fromIndex, toIndex)) {
        // isMember returns a Boolean that may be null (pipeline/transaction),
        // so avoid unboxing it directly.
        if (Boolean.TRUE.equals(ops.isMember(SKIP_URLS, buildAreaUrlLj.getAreaUrl()))) {
            System.out.println("跳过当前区域" + buildAreaUrlLj.getCityName() + "-" + buildAreaUrlLj.getCountyName());
            continue;
        }
        pagepoolExecutor.execute(() -> parsePage(ops, opsForHash, buildAreaUrlLj));
    }
}
/**
* 解析列表
* @param ops
* @param opsForHash
* @param buildAreaUrlLj
*/
private void parsePage(SetOperations ops, HashOperations<String, Object, Object> opsForHash, BuildAreaUrlLj buildAreaUrlLj) {
ChromeDriver driver = getChromeDriver();
try {
driver.get(buildAreaUrlLj.getAreaUrl());
String windowHandlePage = driver.getWindowHandle();
WebElement totalNumStr = validElement("//h2[@class='total fl']/span", driver);
if (null != totalNumStr) {
Integer total = Integer.valueOf(totalNumStr.getText());
// 有数据
if (total > 1) {
String pageData = driver.findElement(By.xpath("//div[@class='page-box house-lst-page-box']")).getAttribute("page-data");
Integer pageNumStr = Integer.valueOf(JSON.parseObject(pageData).getString("totalPage"));
System.out.println("当前区域页数" + pageNumStr + "---" + buildAreaUrlLj.getAreaUrl());
for (int x = 1; x <= pageNumStr; x++) {
List<WebElement> elements = driver.findElements(By.xpath("//ul[@class='listContent']/li/div[1]/div[1]/a"));
for (int i = 0; i < elements.size(); i++) {
WebElement item = elements.get(i);
String code = "";
Pattern compile1 = Pattern.compile("xiaoqu/(\\w+)/");
Matcher matcher1 = compile1.matcher(item.getAttribute("href"));
while (matcher1.find()) {
code = matcher1.group(1);
}
driver.executeScript("arguments[0].click();", item);
sleepAndCutoverNewPage(300, driver);
没有合适的资源?快使用搜索试试~ 我知道了~
资源详情
资源评论
资源推荐
收起资源包目录
java+selenium(房天下 链家小区 多线程+代理实现) (236个子文件)
$PROJECT_FILE$ 364B
LianjiaAreaInfoDemo.class 22KB
FangtianxiaDemo.class 12KB
TkBuildingsAreaInfolj.class 11KB
TkBuildingsMonthPriceLj.class 6KB
BuildAreaUrlLj.class 6KB
TkBuildingsPriceAjk.class 5KB
RestTemplateConfig.class 4KB
QQEmaIlLoginDemo.class 3KB
RedisConfig.class 3KB
BaiduDemo.class 3KB
IdAndNamePO.class 2KB
ProxyService.class 988B
SeleniumDemoApplication.class 878B
SeleniumDemoApplicationTests.class 575B
BuildAreaUrlLjMapper.class 438B
mvnw.cmd 7KB
.gitignore 395B
.gitignore 0B
selenium_demo.iml 21KB
maven-wrapper.jar 57KB
LianjiaAreaInfoDemo.java 21KB
Demo.java 18KB
FangtianxiaDemo.java 10KB
RestTemplateConfig.java 2KB
QQEmaIlLoginDemo.java 2KB
TkBuildingsAreaInfolj.java 2KB
RedisConfig.java 2KB
BaiduDemo.java 1KB
TkBuildingsMonthPriceLj.java 1KB
TkBuildingsPriceAjk.java 867B
BuildAreaUrlLj.java 651B
ProxyService.java 543B
SeleniumDemoApplication.java 441B
IdAndNamePO.java 376B
BuildAreaUrlLjMapper.java 308B
SeleniumDemoApplicationTests.java 233B
selenium_demo.kotlin_module 16B
HELP.md 429B
mvnw 10KB
application.properties 623B
application.properties 613B
maven-wrapper.properties 233B
workspace.xml 10KB
pom.xml 4KB
Project_Default.xml 1KB
compiler.xml 894B
jarRepositories.xml 880B
Maven__com_google_guava_listenablefuture_9999_0_empty_to_avoid_conflict_with_guava.xml 835B
Maven__io_opentelemetry_opentelemetry_sdk_extension_autoconfigure_1_13_0_alpha.xml 807B
Maven__io_opentelemetry_opentelemetry_sdk_extension_autoconfigure_spi_1_13_0.xml 793B
Maven__com_fasterxml_jackson_module_jackson_module_parameter_names_2_13_3.xml 736B
Maven__io_netty_netty_transport_native_unix_common_4_1_77_Final.xml 726B
Maven__org_springframework_boot_spring_boot_starter_data_redis_2_7_0.xml 713B
Maven__org_springframework_boot_spring_boot_test_autoconfigure_2_7_0.xml 713B
Maven__io_netty_netty_transport_native_kqueue_osx_x86_64_4_1_77_Final.xml 713B
Maven__io_netty_netty_transport_native_epoll_linux_x86_64_4_1_77_Final.xml 710B
Maven__org_eclipse_jetty_websocket_websocket_client_9_4_46_v20220331.xml 704B
Maven__org_eclipse_jetty_websocket_websocket_common_9_4_46_v20220331.xml 704B
Maven__com_vaadin_external_google_android_json_0_0_20131108_vaadin1.xml 700B
Maven__io_netty_netty_transport_classes_kqueue_4_1_77_Final.xml 698B
Maven__io_opentelemetry_opentelemetry_sdk_metrics_1_13_0_alpha.xml 695B
Maven__com_fasterxml_jackson_datatype_jackson_datatype_jsr310_2_13_3.xml 695B
Maven__org_asynchttpclient_async_http_client_netty_utils_2_12_3.xml 693B
Maven__org_springframework_boot_spring_boot_starter_logging_2_7_0.xml 692B
Maven__io_netty_netty_transport_native_kqueue_4_1_77_Final.xml 691B
Maven__io_netty_netty_transport_classes_epoll_4_1_77_Final.xml 691B
Maven__io_opentelemetry_opentelemetry_exporter_logging_1_13_0.xml 688B
Maven__org_springframework_boot_spring_boot_starter_tomcat_2_7_0.xml 685B
Maven__io_netty_netty_transport_native_epoll_4_1_77_Final.xml 684B
Maven__org_eclipse_jetty_websocket_websocket_api_9_4_46_v20220331.xml 683B
Maven__com_fasterxml_jackson_datatype_jackson_datatype_jdk8_2_13_3.xml 681B
Maven__org_springframework_boot_spring_boot_autoconfigure_2_7_0.xml 678B
Maven__io_opentelemetry_opentelemetry_sdk_logs_1_13_0_alpha.xml 674B
Maven__org_springframework_boot_spring_boot_starter_jdbc_2_7_0.xml 671B
Maven__org_springframework_boot_spring_boot_starter_test_2_7_0.xml 671B
Maven__org_springframework_boot_spring_boot_starter_json_2_7_0.xml 671B
Maven__org_seleniumhq_selenium_selenium_chromium_driver_4_1_4.xml 667B
Maven__com_google_auto_service_auto_service_annotations_1_0_1.xml 667B
Maven__io_opentelemetry_opentelemetry_semconv_1_13_0_alpha.xml 667B
Maven__org_springframework_boot_spring_boot_starter_web_2_7_0.xml 664B
Maven__org_seleniumhq_selenium_selenium_firefox_driver_4_1_4.xml 660B
Maven__org_apache_tomcat_embed_tomcat_embed_websocket_9_0_63.xml 660B
Maven__com_google_errorprone_error_prone_annotations_2_11_0.xml 659B
Maven__org_seleniumhq_selenium_selenium_remote_driver_4_1_4.xml 653B
Maven__org_seleniumhq_selenium_selenium_chrome_driver_4_1_4.xml 653B
Maven__org_seleniumhq_selenium_selenium_safari_driver_4_1_4.xml 653B
Maven__com_fasterxml_jackson_core_jackson_annotations_2_13_3.xml 651B
Maven__io_opentelemetry_opentelemetry_sdk_common_1_13_0.xml 646B
Maven__org_seleniumhq_selenium_selenium_opera_driver_4_1_4.xml 646B
Maven__org_seleniumhq_selenium_selenium_devtools_v85_4_1_4.xml 646B
Maven__org_springframework_spring_context_support_5_3_20.xml 644B
Maven__org_springframework_data_spring_data_keyvalue_2_7_0.xml 643B
Maven__io_opentelemetry_opentelemetry_sdk_trace_1_13_0.xml 639B
Maven__org_seleniumhq_selenium_selenium_edge_driver_4_1_4.xml 639B
Maven__org_springframework_boot_spring_boot_starter_2_7_0.xml 636B
Maven__org_springframework_data_spring_data_commons_2_7_0.xml 636B
Maven__net_sourceforge_htmlunit_htmlunit_cssparser_1_11_0.xml 636B
Maven__org_eclipse_jetty_jetty_client_9_4_46_v20220331.xml 636B
Maven__org_junit_platform_junit_platform_commons_1_8_2.xml 633B
共 236 条
- 1
- 2
- 3
萌坑
- 粉丝: 9
- 资源: 2
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功
评论0