import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
class HtmlParser(object):
    """Extract follow-up crawl links and page data from a fetched HTML page.

    Intended usage: ``new_urls, new_data = parser.parse(page_url, html)``.
    """

    def __init__(self):
        # Cumulative record of every absolute URL this parser instance has
        # already reported, used to avoid handing the same link to the
        # caller twice across pages.
        self.new_urls = set()

    def _get_new_urls(self, page_url, soup):
        """Collect absolute doc-page URLs newly discovered on this page.

        Bug fix: the previous version added into and returned the shared
        ``self.new_urls`` set, so every call re-reported all URLs ever
        seen and exposed internal mutable state to the caller.  Now a
        fresh per-page set is returned; ``self.new_urls`` still keeps the
        cumulative record.

        :param page_url: URL of the page the links were found on (base
            for resolving relative hrefs).
        :param soup: parsed BeautifulSoup document.
        :return: set of absolute URLs not reported on earlier calls.
        """
        # Target hrefs look like /doc/5912108-6125016.html or /doc/3745498.html.
        page_links = set()
        for link in soup.find_all('a', href=re.compile(r'/doc/[\d-]+\.html')):
            full_url = urljoin(page_url, link['href'])
            if full_url not in self.new_urls:
                self.new_urls.add(full_url)
                page_links.add(full_url)
        return page_links

    def _get_new_data(self, page_url, soup):
        """Extract the title and summary of a doc page.

        Robustness fix: ``soup.find`` returns ``None`` when a node is
        absent, so the old unconditional ``.get_text()`` / ``.find('p')``
        raised ``AttributeError`` on pages lacking the expected markup.
        Missing nodes now yield empty strings instead of crashing.

        :param page_url: URL of the page (stored under key ``'url'``).
        :param soup: parsed BeautifulSoup document.
        :return: dict with keys ``'url'``, ``'title'``, ``'summary'``.
        """
        res_data = {'url': page_url}
        # Title markup: <span class="title">Python</span>
        title_node = soup.find('span', class_='title')
        res_data['title'] = title_node.get_text() if title_node is not None else ''
        # Summary markup: <div class="card_content" id="js-card-content"><p>...</p></div>
        card_node = soup.find('div', class_='card_content')
        summary_node = card_node.find('p') if card_node is not None else None
        res_data['summary'] = summary_node.get_text() if summary_node is not None else ''
        return res_data

    def parse(self, page_url, html_cont):
        """Parse raw HTML content fetched from *page_url*.

        :param page_url: URL the content was downloaded from.
        :param html_cont: raw HTML (``bytes`` or ``str``).
        :return: ``(new_urls, new_data)`` tuple, or ``None`` when either
            argument is ``None`` (unchanged guard behavior).
        """
        if page_url is None or html_cont is None:
            return None
        # NOTE: from_encoding only takes effect when html_cont is bytes;
        # for str input BeautifulSoup ignores it (with a warning).
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
phoenxior
- 粉丝: 23
- 资源: 5
最新资源
- python005-基于Python爬虫的网络小说数据分析系统的设计与实现.zip
- vs2015 udp 广播 demo
- 创维42L20HW(8DA6)软件数据.rar
- gcc15交叉编译工具链windows版,用于编译龙芯应用,gcc version 15.0.0 20241119 (experimental) (GCC)
- python004-基于python的抑郁症患者看护系统.zip
- 基于TensorFlow2的图像分类模型训练预测项目(支持ResNet和MobileNet等主干网络,热力图生成、可视化模型).zip
- C语言程序设计-1-4章-课后习题答案(1).zip
- python003-python电影数据分析及可视化系统建设.zip
- gcc15交叉编译工具链windows版,支持32位和64位windows软件编译,gcc version 15.0.0 20241111 (experimental) (GCC)
- STM32汇编语言点亮led灯
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈