【免费】Python爬虫课程作业，requests+xpath学习

共1个文件

py：1个

python

爬虫

课程资源

需积分: 0 · 154 浏览量 · 2023-05-08 11:44:15 上传 · 评论 · 收藏 · 1KB · ZIP · 举报

资源推荐

资源详情

资源评论

收起资源包目录

crawl_learn.zip （1个子文件）

crawl_learn.py 3KB

"""Scrape book listings, a chapter catalog and one chapter from biqugei.net.

A learning exercise for ``requests`` + XPath (``lxml``).  When run as a
script it:

1. fetches the home page and collects books from four sections (weekly hot
   reads, categories, newly added, recently updated), printing the
   accumulated list after each section;
2. fetches one book detail page and prints the href(s) of its full-catalog
   link;
3. fetches one catalog page and prints every catalog-page URL and every
   chapter title with its URL;
4. fetches one chapter page and prints its title and text.

``requests`` and ``lxml`` are imported inside :func:`fetch_html` so that the
pure helpers in this module can be imported without those packages.
"""
from urllib.parse import urljoin

# Site roots used to resolve hrefs found in the pages.  The home-page
# listings are resolved against the http root, the catalog against https.
base_url = 'http://www.biqugei.net/'
SECURE_BASE_URL = 'https://www.biqugei.net/'

# Fixed sample pages used by the exercise.
DETAIL_URL = 'http://www.biqugei.net/page/detail165482.html'
CATALOG_URL = 'https://www.biqugei.net/catalog/165547.html'
CHAPTER_URL = 'https://www.biqugei.net/read/165547/406054.html'

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10


def first(nodes, default=None):
    """Return the first item of *nodes*, or *default* when it is empty."""
    return nodes[0] if nodes else default


def absolute_url(base, href):
    """Resolve *href* against *base*.

    Unlike plain string concatenation this never produces a doubled slash
    (``base`` ending in ``/`` plus ``href`` starting with ``/``) and leaves
    already-absolute hrefs untouched.
    """
    return urljoin(base, href)


def make_book(name, href, author, info=None):
    """Build one book record, or return ``None`` if *name* or *href* is missing.

    The record has the keys ``name``, ``url`` and ``author``; ``info`` is
    added only when *info* is not ``None``.
    """
    if name is None or href is None:
        return None
    book = {
        'name': name,
        'url': absolute_url(base_url, href),  # build the full URL
        'author': author,
    }
    if info is not None:
        book['info'] = info
    return book


def fetch_html(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    Raises ``requests.HTTPError`` on a 4xx/5xx response instead of silently
    parsing an error page.
    """
    import requests
    from lxml import etree

    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    return etree.HTML(res.text)


def parse_weekly_hot(html):
    """Collect the "weekly hot reads" books: one per ``<h4>`` heading."""
    books = []
    for h4 in html.xpath('//h4'):
        book = make_book(
            first(h4.xpath('a/text()')),
            first(h4.xpath('a/@href')),
            first(h4.xpath('../small/text()'), ''),
            first(h4.xpath('../p/text()'), ''),
        )
        if book is not None:
            books.append(book)
    return books


def parse_categories(html):
    """Collect the books listed in the category panels (``ul > li``)."""
    books = []
    uls = html.xpath(
        '//div[contains(@class, "panel") and contains(@class, "panel-default")]/ul'
    )
    for ul in uls:
        for li in ul.xpath('li'):
            book = make_book(
                first(li.xpath('a/text()')),
                first(li.xpath('a/@href')),
                first(li.xpath('span/text()'), ''),
            )
            if book is not None:
                books.append(book)
    return books


def parse_table(html, table_index, author_index):
    """Collect books from the rows of the *table_index*-th ``table/tbody``.

    *author_index* selects which direct ``td`` text node holds the author.
    Returns an empty list when the page has no such table body.
    """
    tbodies = html.xpath('//table/tbody')
    if table_index >= len(tbodies):
        return []
    books = []
    for tr in tbodies[table_index].xpath('tr'):
        texts = tr.xpath('td/text()')
        author = texts[author_index] if author_index < len(texts) else ''
        book = make_book(
            first(tr.xpath('td/a/text()')),
            first(tr.xpath('td/a/@href')),
            author,
        )
        if book is not None:
            books.append(book)
    return books


def main():
    """Run the whole scraping walkthrough and print the results."""
    book_list = []
    html = fetch_html(base_url)

    # Weekly hot reads
    book_list.extend(parse_weekly_hot(html))
    print(book_list)

    # Categories
    book_list.extend(parse_categories(html))
    print(book_list)

    # Newly added (first table; author is the first td text node)
    book_list.extend(parse_table(html, 0, 0))
    print(book_list)

    # Recently updated (second table; author is the second td text node)
    book_list.extend(parse_table(html, 1, 1))
    print(book_list)

    # URL of the novel's full catalog
    html = fetch_html(DETAIL_URL)
    catalog_url = html.xpath(
        '/html/body/div[1]/div[1]/div/div/div[2]/div/a[2]/@href'
    )
    print(catalog_url)

    # All page URLs of the full catalog.  They could be built from a page
    # number, or read straight from the page; the second approach is used.
    html = fetch_html(CATALOG_URL)
    select = first(html.xpath('//*[@id="indexselect"]'))
    # Compare with None: an lxml element without children is falsy.
    options = select.xpath('option') if select is not None else []
    for option in options:
        value = first(option.xpath('@value'))
        if value is not None:
            print(absolute_url(SECURE_BASE_URL, value))

    # Chapter titles and their URLs
    for dd in html.xpath('//dl/dd'):
        chapter_name = first(dd.xpath('a/text()'))
        chapter_href = first(dd.xpath('a/@href'))
        if chapter_name is None or chapter_href is None:
            continue
        print(chapter_name, absolute_url(SECURE_BASE_URL, chapter_href))

    # Chapter title and chapter text
    html = fetch_html(CHAPTER_URL)
    title = first(html.xpath('//h1/text()'), '')
    content = ''.join(html.xpath('//div[@id="booktxt"]/p/text()'))
    print(title)
    print(content)


if __name__ == '__main__':
    main()

评论收藏

内容反馈