package com.example.jsoupdemo;
import java.io.InputStream;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.servlet.http.HttpServletResponse;
import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.poi.xssf.usermodel.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.STCellType;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
/**
 * REST controller that converts an uploaded {@code .xlsx} list of product-page
 * URLs into a downloadable {@code .xlsx} of scraped product details.
 *
 * <p>Workflow of {@code POST /excel/upload}:
 * <ol>
 *   <li>read the URL from the first column of every row of the first sheet;</li>
 *   <li>download the HTML of each URL;</li>
 *   <li>extract the product code, name and category from the HTML;</li>
 *   <li>stream the collected rows back as a new workbook.</li>
 * </ol>
 *
 * <p>NOTE(review): the server fetches whatever http(s) URL the uploaded file
 * contains, so this endpoint can be used to reach internal hosts (SSRF).
 * Restrict the allowed hosts before exposing it to untrusted users.
 */
@RestController
@RequestMapping("/excel")
public class ExcelController {

    private static final Logger LOGGER = Logger.getLogger(ExcelController.class.getName());

    /** Header captions of the generated sheet, in column order. */
    private static final String[] TITLES = {"商品编码", "商品名称", "商品分类"};

    /** Keys of the map returned by {@link #getData(String)}, in column order. */
    private static final String[] FIELDS = {"sku", "name", "cat"};

    /** Column widths of the generated sheet, in 1/256ths of a character width. */
    private static final int[] COLUMN_WIDTHS = {2560, 25600, 5120};

    /** Browser-like User-Agent sent with every page request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0";

    /**
     * Reads product URLs from the uploaded workbook, scrapes each page and
     * writes the result workbook to the HTTP response as an attachment.
     *
     * <p>Input row {@code i} is written to output row {@code i + 1} (row 0 holds
     * the titles). Rows that are missing, hold no text, do not start with
     * {@code http://} or {@code https://}, or whose page cannot be fetched are
     * left blank so the remaining rows are still exported.
     *
     * <p>If the export as a whole fails, the failure is logged and the response
     * status is set to 500 (when the response has not been committed yet).
     *
     * @param response servlet response that receives the generated workbook
     * @param file     uploaded {@code .xlsx} file; URLs are expected in column 0
     *                 of the first sheet
     */
    @PostMapping("/upload")
    public void uploadExcel(HttpServletResponse response, @RequestParam("file") MultipartFile file) {
        try (InputStream upload = file.getInputStream()) {
            XSSFSheet sheet = new XSSFWorkbook(upload).getSheetAt(0);

            XSSFWorkbook newWorkbook = new XSSFWorkbook();
            XSSFSheet newSheet = newWorkbook.createSheet();
            writeTitleRow(newSheet);

            // getLastRowNum() is the 0-based index of the last row, so the
            // bound must be inclusive or the final URL is silently dropped.
            int lastRow = sheet.getLastRowNum();
            for (int i = 0; i <= lastRow; i++) {
                String url = readUrl(sheet.getRow(i));
                XSSFRow newRow = newSheet.createRow(i + 1);
                if (isHttpUrl(url)) {
                    fillProductRow(newRow, url);
                }
                // Debug trace of the processed cell content.
                System.out.println("\n内容是:" + url);
            }

            // Headers are set only once the workbook is complete, so a failure
            // above can still be reported with an error status.
            response.setContentType("application/octet-stream");
            // The download is named after the current timestamp.
            String fileName = System.currentTimeMillis() + ".xlsx";
            response.setHeader("Content-disposition", "attachment;filename=" + fileName);
            newWorkbook.write(response.getOutputStream());
            response.flushBuffer();
        } catch (Exception e) {
            LOGGER.log(Level.SEVERE, "Failed to build the product export", e);
            if (!response.isCommitted()) {
                // Drop any partially written headers/body and signal the failure
                // instead of answering with an empty 200 response.
                response.reset();
                response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
            }
        }
    }

    /**
     * Creates the title row (row 0) and applies the column widths.
     *
     * <p>Cells are created without an explicit type; {@code setCellValue(String)}
     * turns them into text cells.
     */
    private static void writeTitleRow(XSSFSheet sheet) {
        XSSFRow titleRow = sheet.createRow(0);
        for (int col = 0; col < TITLES.length; col++) {
            titleRow.createCell(col).setCellValue(TITLES[col]);
            sheet.setColumnWidth(col, COLUMN_WIDTHS[col]);
        }
    }

    /**
     * Returns the trimmed text of the first cell of {@code row}, or an empty
     * string when the row or cell is absent or the cell does not hold text.
     */
    private static String readUrl(XSSFRow row) {
        if (row == null) {
            return "";
        }
        XSSFCell cell = row.getCell(0);
        if (cell == null) {
            return "";
        }
        try {
            String text = cell.getStringCellValue();
            return text == null ? "" : text.trim();
        } catch (IllegalStateException notText) {
            // Thrown for cells that do not hold text (e.g. numeric cells);
            // such a cell cannot contain a URL, so treat it as empty.
            return "";
        }
    }

    /** Tells whether {@code url} carries an explicit http or https scheme. */
    private static boolean isHttpUrl(String url) {
        return url.startsWith("http://") || url.startsWith("https://");
    }

    /**
     * Scrapes {@code url} and writes the product fields into {@code target}.
     * A page that cannot be fetched or parsed is logged and leaves the row
     * blank, so one broken link does not abort the whole export.
     */
    private void fillProductRow(XSSFRow target, String url) {
        Map<String, String> data;
        try {
            data = getProductInfo(url);
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, "Could not fetch product page: " + url, e);
            return;
        }
        if (data == null) {
            return;
        }
        for (int col = 0; col < FIELDS.length; col++) {
            target.createCell(col).setCellValue(data.get(FIELDS[col]));
        }
    }

    /**
     * Downloads the page at {@code url} and extracts the product information.
     *
     * @param url absolute http(s) address of the product page
     * @return the extracted fields (see {@link #getData(String)}), or
     *         {@code null} when the server did not answer with status 200 or
     *         sent no body
     * @throws Exception if the request cannot be built or executed, or the
     *                   body cannot be read
     */
    private Map<String, String> getProductInfo(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        // Present the request as coming from a regular browser.
        httpGet.setHeader("User-Agent", USER_AGENT);

        // Both the client and the response are closed on every path.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse pageResponse = httpclient.execute(httpGet)) {
            HttpEntity entity = pageResponse.getEntity();
            if (pageResponse.getStatusLine().getStatusCode() != 200 || entity == null) {
                // Drain the body (no-op for a null entity) before giving up.
                EntityUtils.consume(entity);
                return null;
            }
            // toString() reads the entity to the end, so no extra consume is needed.
            return getData(EntityUtils.toString(entity, Consts.UTF_8));
        }
    }

    /**
     * Parses product-page HTML with jsoup and collects the product fields.
     *
     * @param html HTML source of the product page
     * @return map with the keys {@code "name"}, {@code "sku"} and {@code "cat"};
     *         a value is an empty string when the page has no matching element
     */
    private static Map<String, String> getData(String html) {
        Document doc = Jsoup.parse(html);

        // Product title.
        String name = doc.select("div[class=sku-name]").text();
        // Product code: data-sku of the first matching link, "" when none matches.
        String sku = doc.select("a[data-sku]").attr("data-sku");
        // Product category taken from the breadcrumb link.
        String cat = doc.select("a[clstag=shangpin|keycount|product|mbNav-1]").text();

        Map<String, String> data = new HashMap<>();
        data.put("name", name);
        data.put("sku", sku);
        data.put("cat", cat);

        // Debug trace of the extracted values.
        System.out.print(sku + "---------" + cat + "---------" + name);
        return data;
    }
}
没有合适的资源?快使用搜索试试~ 我知道了~
SpringBoot+jsoup爬虫
共114个文件
xml:90个
java:6个
class:5个
需积分: 50 17 下载量 23 浏览量
2020-05-14
11:10:18
上传
评论 1
收藏 122KB ZIP 举报
温馨提示
本地创建.xlsx文件,里面存放京东商品地址,运行项目启动导入本地xlsx文件,自动爬取文件中地址信息下载到本地
资源推荐
资源详情
资源评论
收起资源包目录
SpringBoot+jsoup爬虫 (114个子文件)
ExcelController.class 8KB
NewsController.class 4KB
JsoupdemoApplication.class 758B
PageController.class 588B
JsoupdemoApplicationTests.class 556B
mvnw.cmd 6KB
.gitignore 333B
list.html 1KB
list.html 1KB
index.html 609B
index.html 609B
jsoupdemo.iml 11KB
maven-wrapper.jar 50KB
ExcelController.java 7KB
MavenWrapperDownloader.java 5KB
NewsController.java 2KB
JsoupdemoApplication.java 330B
PageController.java 283B
JsoupdemoApplicationTests.java 225B
HELP.md 902B
mvnw 10KB
maven-wrapper.properties 218B
workspace.xml 58KB
pom.xml 3KB
compiler.xml 818B
Maven__org_springframework_boot_spring_boot_test_autoconfigure_2_2_6_RELEASE.xml 793B
Maven__org_springframework_boot_spring_boot_starter_validation_2_2_6_RELEASE.xml 793B
Maven__org_springframework_boot_spring_boot_starter_thymeleaf_2_2_6_RELEASE.xml 786B
Maven__org_springframework_boot_spring_boot_starter_logging_2_2_6_RELEASE.xml 772B
Maven__org_springframework_boot_spring_boot_starter_tomcat_2_2_6_RELEASE.xml 765B
Maven__com_fasterxml_jackson_module_jackson_module_parameter_names_2_10_3.xml 760B
Maven__org_springframework_boot_spring_boot_autoconfigure_2_2_6_RELEASE.xml 758B
Maven__org_springframework_boot_spring_boot_starter_json_2_2_6_RELEASE.xml 751B
Maven__org_springframework_boot_spring_boot_starter_test_2_2_6_RELEASE.xml 751B
Maven__org_thymeleaf_extras_thymeleaf_extras_java8time_3_0_4_RELEASE.xml 749B
Maven__org_springframework_boot_spring_boot_starter_web_2_2_6_RELEASE.xml 744B
Maven__com_vaadin_external_google_android_json_0_0_20131108_vaadin1.xml 724B
Maven__org_springframework_boot_spring_boot_devtools_2_2_6_RELEASE.xml 723B
Maven__com_fasterxml_jackson_datatype_jackson_datatype_jsr310_2_10_3.xml 719B
Maven__org_springframework_boot_spring_boot_starter_2_2_6_RELEASE.xml 716B
Maven__com_fasterxml_jackson_datatype_jackson_datatype_jdk8_2_10_3.xml 705B
Maven__org_hibernate_validator_hibernate_validator_6_0_18_Final.xml 705B
Maven__org_springframework_boot_spring_boot_test_2_2_6_RELEASE.xml 695B
Maven__org_apache_tomcat_embed_tomcat_embed_websocket_9_0_33.xml 684B
Maven__org_springframework_spring_expression_5_2_5_RELEASE.xml 682B
Maven__com_fasterxml_jackson_core_jackson_annotations_2_10_3.xml 675B
Maven__org_thymeleaf_thymeleaf_spring5_3_0_11_RELEASE.xml 665B
Maven__org_springframework_spring_context_5_2_5_RELEASE.xml 661B
Maven__org_springframework_boot_spring_boot_2_2_6_RELEASE.xml 660B
Maven__jakarta_activation_jakarta_activation_api_1_2_2.xml 657B
Maven__jakarta_validation_jakarta_validation_api_2_0_2.xml 657B
Maven__jakarta_annotation_jakarta_annotation_api_1_3_5.xml 657B
Maven__org_junit_platform_junit_platform_commons_1_5_2.xml 657B
Maven__com_fasterxml_jackson_core_jackson_databind_2_10_3.xml 654B
Maven__org_springframework_spring_webmvc_5_2_5_RELEASE.xml 654B
Maven__org_junit_platform_junit_platform_engine_1_5_2.xml 650B
Maven__org_apache_tomcat_embed_tomcat_embed_core_9_0_33.xml 649B
Maven__org_springframework_spring_beans_5_2_5_RELEASE.xml 647B
Maven__org_apache_poi_poi_ooxml_schemas_3_10_FINAL.xml 641B
Maven__org_springframework_spring_core_5_2_5_RELEASE.xml 640B
Maven__org_springframework_spring_test_5_2_5_RELEASE.xml 640B
Maven__org_junit_jupiter_junit_jupiter_params_5_5_2.xml 639B
Maven__org_junit_jupiter_junit_jupiter_engine_5_5_2.xml 639B
Maven__org_apache_tomcat_embed_tomcat_embed_el_9_0_33.xml 635B
Maven__jakarta_xml_bind_jakarta_xml_bind_api_2_3_3.xml 635B
Maven__org_springframework_spring_web_5_2_5_RELEASE.xml 633B
Maven__org_springframework_spring_aop_5_2_5_RELEASE.xml 633B
Maven__org_springframework_spring_jcl_5_2_5_RELEASE.xml 633B
Maven__org_jboss_logging_jboss_logging_3_4_1_Final.xml 632B
Maven__org_apache_logging_log4j_log4j_to_slf4j_2_12_1.xml 632B
Maven__com_fasterxml_jackson_core_jackson_core_2_10_3.xml 626B
Maven__org_mockito_mockito_junit_jupiter_3_1_0.xml 622B
Maven__org_junit_jupiter_junit_jupiter_api_5_5_2.xml 618B
Maven__org_attoparser_attoparser_2_0_5_RELEASE.xml 613B
Maven__org_thymeleaf_thymeleaf_3_0_11_RELEASE.xml 609B
Maven__org_unbescape_unbescape_1_1_6_RELEASE.xml 602B
Maven__net_bytebuddy_byte_buddy_agent_1_10_8.xml 602B
Maven__org_apache_httpcomponents_httpclient_4_5_1.xml 601B
Maven__org_apache_logging_log4j_log4j_api_2_12_1.xml 597B
Maven__org_apiguardian_apiguardian_api_1_1_0.xml 596B
Maven__org_apache_httpcomponents_httpcore_4_4_13.xml 594B
Maven__ch_qos_logback_logback_classic_1_2_3.xml 592B
Maven__org_junit_jupiter_junit_jupiter_5_5_2.xml 590B
Maven__org_apache_poi_poi_ooxml_3_10_FINAL.xml 585B
Maven__ch_qos_logback_logback_core_1_2_3.xml 571B
Maven__com_jayway_jsonpath_json_path_2_4_0.xml 570B
Maven__commons_codec_commons_codec_1_13.xml 567B
Maven__net_minidev_accessors_smart_1_2.xml 566B
Maven__org_assertj_assertj_core_3_13_2.xml 566B
Maven__org_apache_xmlbeans_xmlbeans_2_3_0.xml 563B
Maven__org_skyscreamer_jsonassert_1_5_0.xml 561B
Maven__net_bytebuddy_byte_buddy_1_10_8.xml 560B
Maven__org_xmlunit_xmlunit_core_2_6_4.xml 559B
Maven__org_mockito_mockito_core_3_1_0.xml 559B
Maven__org_slf4j_jul_to_slf4j_1_7_30.xml 558B
Maven__org_opentest4j_opentest4j_1_2_0.xml 557B
Maven__com_fasterxml_classmate_1_5_1.xml 546B
Maven__org_apache_poi_poi_3_10_FINAL.xml 543B
Maven__org_slf4j_slf4j_api_1_7_30.xml 537B
Maven__org_objenesis_objenesis_2_6.xml 532B
共 114 条
- 1
- 2
资源评论
Shao_13691378252
- 粉丝: 55
- 资源: 2
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功