import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
class HtmlParser(object):
    """Extract follow-up article links and basic article data from a page.

    Attributes:
        new_urls: Set of absolute article URLs collected so far. The set
            lives on the instance, so it keeps growing across ``parse``
            calls and every call returns the same set object.
    """

    # Article links look like /doc/5912108-6125016.html or /doc/3745498.html.
    # Compiled once here instead of on every call.
    _DOC_LINK_RE = re.compile(r'/doc/[\d-]+\.html')

    def __init__(self):
        self.new_urls = set()

    @staticmethod
    def _text_of(node):
        """Return the text of *node*, or an empty string when it is None."""
        return node.get_text() if node is not None else ''

    def _get_new_urls(self, page_url, soup):
        """Collect article links found in *soup* as absolute URLs.

        Args:
            page_url: URL of the page being parsed; relative links are
                resolved against it.
            soup: Parsed document offering ``find_all``.

        Returns:
            ``self.new_urls`` after adding every matching link.
        """
        for link in soup.find_all('a', href=self._DOC_LINK_RE):
            self.new_urls.add(urljoin(page_url, link['href']))
        return self.new_urls

    def _get_new_data(self, page_url, soup):
        """Build the data record (url, title, summary) for one page.

        Args:
            page_url: URL of the page being parsed.
            soup: Parsed document offering ``find``.

        Returns:
            A dict with keys ``'url'``, ``'title'`` and ``'summary'``.
            ``'title'`` and ``'summary'`` are empty strings when the
            corresponding node is absent, rather than raising
            ``AttributeError`` on a missing node.
        """
        # <span class="title">Python</span>
        title_node = soup.find('span', class_='title')

        # <div class="card_content" id="js-card-content"> ... <p>summary</p>
        card_node = soup.find('div', class_='card_content')
        summary_node = card_node.find('p') if card_node is not None else None

        return {
            'url': page_url,
            'title': self._text_of(title_node),
            'summary': self._text_of(summary_node),
        }

    def parse(self, page_url, html_cont):
        """Parse a downloaded page into new links and article data.

        Args:
            page_url: URL the content was fetched from.
            html_cont: Page markup, either ``bytes`` or ``str``.

        Returns:
            ``None`` when either argument is ``None``; otherwise a tuple
            ``(new_urls, new_data)`` as produced by ``_get_new_urls`` and
            ``_get_new_data``.
        """
        if page_url is None or html_cont is None:
            return None

        if isinstance(html_cont, (bytes, bytearray)):
            soup = BeautifulSoup(html_cont, 'html.parser',
                                 from_encoding='utf-8')
        else:
            # from_encoding only applies to byte input; for already-decoded
            # text bs4 ignores it and emits a warning, so leave it out.
            soup = BeautifulSoup(html_cont, 'html.parser')

        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
phoenxior
- 粉丝: 23
- 资源: 5
最新资源
- fed54987-3a28-4a7a-9c89-52d3ac6bc048.vsidx
- (177367038)QT实现教务管理系统.zip
- (178041422)基于springboot网上书城系统.zip
- (3127654)超级玛丽游戏源码下载
- (175717016)CTGU单总线CPU设计(变长指令周期3级时序)(HUST)(circ文件)
- (133916396)单总线CPU设计(变长指令周期3级时序)(HUST).rar
- Unity In-game Debug Console
- (3292010)Java图书管理系统(源码)
- Oracle期末复习题:选择题详解与数据库管理技术
- (176721246)200行C++代码写一个Qt俄罗斯方块
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈