from bs4 import BeautifulSoup
# 调用requests库获取网页
import requests
# Fetch the Baidu News front page and inspect its first <a> tag.
url = "https://news.baidu.com/"
# A browser-like User-Agent header; some sites reject requests without one.
ua = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
}
# timeout keeps the script from hanging forever on a stalled connection.
req = requests.get(url, headers=ua, timeout=10)
# Fail fast on HTTP errors (4xx/5xx) instead of parsing an error page.
req.raise_for_status()
html = req.content.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')  # build the parse tree with the lxml parser

# First <a> tag in the document; soup.a is None when the page has no links,
# which would otherwise make tag.attrs raise an opaque AttributeError.
tag = soup.a
if tag is None:
    raise SystemExit("no <a> tag found in fetched page")
print("tag的内容:", tag)
# All attributes of the tag as a dict (e.g. href, class).
print("tag对象的全部属性:", tag.attrs)
# Value of the href attribute.
print("href属性的值:", tag.attrs['href'])