
Python 爬虫代码
方式一:
requests+lxml/etree+xpath 表达式
# lxml/etree method
import requests
from lxml import etree
# Request headers carrying a desktop-Chrome User-Agent string.
# The value is assembled from adjacent string literals so it remains a
# single-line string at runtime; a quoted literal cannot contain raw
# line breaks.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/64.0.3282.119 Safari/537.36'
    ),
}
url = 'http://news.qq.com/'

# Download the page. The timeout (seconds) makes a stalled connection raise
# instead of blocking the script forever.
html = requests.get(url=url, headers=headers, timeout=10)

# Parse the response text into an element tree that supports XPath queries.
con = etree.HTML(html.text)

# Both queries target <a> elements directly under <em class="f14 l24">:
# the first collects their text nodes, the second their href attributes.
title = con.xpath('//em[@class="f14 l24"]/a/text()')
link = con.xpath('//em[@class="f14 l24"]/a/@href')

# Pair results by position and print one dict per pair; zip() stops at the
# shorter list if the two queries return different numbers of items.
for i in zip(title, link):
    print({' 标 题 ': i[0],
           '链接': i[1]
           })
方式二:
requests+BeautifulSoup+find_all 进行信息提取
# find_all method
import requests
from bs4 import BeautifulSoup
# Request headers carrying a desktop-Chrome User-Agent string.
# The value is assembled from adjacent string literals so it remains a
# single-line string at runtime; a quoted literal cannot contain raw
# line breaks.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/64.0.3282.119 Safari/537.36'
    ),
}
# Address of the news.qq.com front page used by this example.
url = 'http://news.qq.com/'