python爬虫，含注释讲解，来自于大数据分析课程.zip资源-CSDN文库

共1个文件

py：1个

版权申诉

66 浏览量 2024-03-21 14:03:01 上传评论收藏 4KB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

python爬虫，含注释讲解，来自于大数据分析课程.zip （1个子文件）

python爬虫，含注释讲解，来自于大数据分析课程.py 11KB

""" 爬虫 """ # 爬虫需要哪些知识体系： # 1.python基础（数据类型，运算符、流程控制、模块、类和对象，文件操作） # 2.web知识，html css javascript # 3.正则表达式、bs、xpath——对于得到的源码进行解析（信息的提取） # 4.数据库 # 5.爬虫的框架：scrapy #本次课程涉及到的前三点。 # 【url】链接，统一资源定位。 # url:整体上分为两个部分：使用://分隔： # https://hao.360.com/ # 协议标识符: https # 资源名称：详细的地址 # hao.360.com 域名 # http://主机ip:端口号/文件名/相关引用（参数） # 一、爬虫的基础知识 # 1.python使用urlparse解析url以及openurl # urlparse # from urllib.parse import urlparse # url="http://192.168.0.1:8080/abc/def/index.html?a=1&b=2" # result=urlparse(url) # print(result) # 2. 爬虫的步骤 # （1）使用python获得url的原码：urlopen(url)（向服务器发送请求）（不是是以前的使用浏览器向服务器发送请求） # （2）会获得response的响应的对象，获得响应的源码:response.read()得到的是字节。 # （3）解析源码（正则表达式、bs），获得需要抓取的数据 # （4）存储爬取的资源（可以写入文件中，也可以写入到数据库中） #细节： # （1）使用python获得url的源码（向服务器发送请求）（不是是以前的使用浏览器向服务器发送请求） from urllib.request import urlopen from bs4 import BeautifulSoup import re import csv # urlopen(参数url) # url="https://www.lagou.com/" # 能够打开url，获取源码（以response对象的形式返回） # response=urlopen(url) # print(response) # (2)会获得response的响应的对象，获得响应的源码 # read() 方法获得response对象下的源码信息——得到的结果是字节 # print(response.read()) #都会调用字节下的decode()的方法，将字节转换成字符串 # print(response.read().decode()) # html=response.read().decode() #字符串 # (3)解析源码 # 正则表达式、bs # 先介绍正则表达式 # res_url=r"<a.*?href=\"(http.*?)\"" # r代表字符串以原样输出。 # re.findall(正则表达式，待匹配字符串) # urls=re.findall(res_url,html) # for i in urls: # print(i) # # (4)存储 # with open("c:/lagou_urls.txt" ,"wt") as f: # # f.write() # f.writelines(urls) # with open("c:/lagou_urls.txt","wt") as f: # for i in urls: # f.write(i+"\n") # with open("c:/lagou_urls.csv","wt",newline="") as f: # writer=csv.writer(f) # for i in urls: # writer.writerow([i]) # 练习：boss直聘链接信息的爬取 # 通过爬虫来获得文件（图片） # （1）urlopen打开链接 # （2）response.read()获得字节 # （3）存储 # url_imag="https://www.lgstatic.com/i/image2/M01/22/3C/CgoB5ly_5LeABgLIAADWGM9TJQU214.PNG" # response=urlopen(url_imag) # # print(response.read()) # with open("c:/lagou.png" ,"wb") as f: # f.write(response.read()) # 二、BS4解析源码 # 其中解析源码：不仅仅可以使用正则表达式，还可以是beatifulsoup bs4 # 
如果希望使用，则需要安装：在terminal下输入命令：pip install bs4 # html=""" # <html> # <head> # <title>The Dormouse's story</title> # </head> # <body> # <p class="title"> # <b>The Dormouse's story</b> # </p> # <p class="story"> # Once upon a time there were three little sisters; and their names were # <a href="http://example.com/elsie" class="sister" id="link1"></a>, # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> # and # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; # and they lived at the bottom of a well. # ... # </p> # </body> # </html> # """ # bs4下常用的方法 # select # 使用bs4的时候，需要创建bs4对象 # from bs4 import BeautifulSoup as BS #第一个参数：要处理的字符串信息（源码） #第二个参数：要处理的信息类型，指定是html.parser # bs=BS(html,"html.parser") # 1.select(标签名) 根据标签名进行查找（匹配） # 格式：标签名 # 返回的值是列表，如果标签在源码中存在多个，则多个都会以列表元素的形式返回。 # print(bs.select("title")) # print(bs.select("a")) # 2.通过类名class # class属性对应的值进行查找 # 格式：.class的属性值 # print(bs.select(".sister")) # 3.通过id的值 # id属性对应的值进行查找 # 格式：#id属性的值 # print(bs.select("#link1")) # 4.组合查找 # 不同的查找之间使用空格隔开即可 # print(bs.select("p #link1")) # 5.通过属性进行查找 # 语法：标签名[属性=属性值] # print(bs.select("a[href=\"http://example.com/elsie\"]")) #find 和find_all # find("标签名",{"属性名":"属性值",}) # print(bs.find("a",{"href":"http://example.com/elsie"})) # 练习：获取拉勾中所有的链接，使用bs4 # (1)urlopen # (2)response.read().decode() # (3)创建bs对象 # (4)使用bs下的select、find_all来获取所有href的属性值 # from bs4 import BeautifulSoup # url="https://www.lagou.com/" # # (1)urlopen访问服务器链接，获取response对象 # response=urlopen(url) # # (2)获得response对象下的源码 # html_text=response.read().decode() # # （3）创建bs对象 # bs=BeautifulSoup(html_text,"html.parser") # # （4）信息提取 # a_list=bs.select("a") # with open("c:/a.txt","wt") as f: # for i in a_list: # # print(i) # #使用每一个元素（Tag类型的对象） # # print(type(i)) # # (1)select 和find find_all完全同bs对象下的方法一致，也就是可以继续对i进行进一步的标签分析 # # (2)i.get(key) key代表传入的属性 # # print(i.get("href")) # # (3)text属性，获得标签中间夹的文本内容 # # print(i.text) # v=i.get("href") # f.write(f"{v}\n") # 三、爬虫的案例。 # 正则表达式、bs4来做信息的提取。 # 1. 发送请求，通过python urlopen(url)，获得response # 2. 
通过response.read()获得字节；如果获取的是二进制文件，则不需要转换成字符串，否则需要使用decode()将字节转换成字符串 # 3. 使用正则表达式，或者bs来做信息（字符模式）的提取 # 4. 对爬取的信息进行存储 # （一）名言 # 方式一：正则表达式 # url1="http://quotes.toscrape.com/" # response=urlopen(url1) # html_text=response.read().decode() # # print(html_text) # res_div="<div class=\"quote\" itemscope itemtype=\"http://schema.org/CreativeWork\">(.*?)</div>" # res_quote="“(.*?)”" # res_author="<small class=\"author\" itemprop=\"author\">(.*?)</small>" # res_tags="<a class=\"tag\".*?>(.*?)</a>" # li=re.findall(res_div,html_text,re.S|re.M|re.I) # # with open("c:/quotes.csv","wt",newline="") as f: # writer=csv.writer(f) # for i in li: # temp = [] # 为了存储一条名言的三个内容[名言,作者，标签] # # 1.取名言的：名言部分 # quote=re.search(res_quote,i,re.S|re.M|re.I) # # print(f"名言：{quote.group(1)}") # temp.append(quote.group(1)) # # # 2.取名言的作者 # author=re.search(res_author,i,re.S|re.M|re.I) # # print(f"作者：{author.group(1)}") # temp.append(author.group(1)) # # # 3.取名言的tags部分 # tags=re.findall(res_tags,i,re.S|re.M|re.I) # 列表 [tag1,tag2,tag3....] # # print("标签内容：",end="") # # for j in tags: # # print(j,end=",") # # print() # #将tags列表中的元素拼接成字符串 # s="" # for j in tag

评论收藏

内容反馈

版权申诉