import ImgView
import Message
import pymysql
import requests
import sys
import time
import DownLoadImg
import DetailMessageView
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from requests import Response
from bs4 import BeautifulSoup
import urllib
# Module-level MySQL connection shared by every helper below.
# Keyword arguments are required: PyMySQL >= 1.0 removed positional args.
# utf8mb4 so Chinese product/brand names round-trip safely.
# NOTE(review): credentials are hard-coded — move to config/env in production.
con = pymysql.connect(host='127.0.0.1',
                      user='root',
                      password='yb19961215000000',
                      database='jingdong',
                      charset='utf8mb4')
cur = con.cursor()
def DeleteTable(table):  # drop a table
    """Drop the given table if it exists.

    Args:
        table: table name (internal, trusted value — it is interpolated into
            the SQL text because identifiers cannot be bound as parameters).
    """
    # IF EXISTS keeps the very first run (no table created yet) from raising.
    sql = "DROP TABLE IF EXISTS " + table
    cur.execute(sql)
def CreatTable(cur):  # create the brand table
    """Create the Brand table (auto-increment id + brand name) if missing."""
    sql = ("CREATE TABLE IF NOT EXISTS Brand("
           "Id INT PRIMARY KEY AUTO_INCREMENT,"
           "BrandName VARCHAR(50))")
    cur.execute(sql)
def CreatCommodityTable(cur):  # create the commodity table
    """Create the Commodity table (id, name, price, store, comment count,
    promo icons, detail-page URL) if it does not already exist."""
    sql = ("CREATE TABLE IF NOT EXISTS Commodity("
           "CId INT PRIMARY KEY AUTO_INCREMENT,"
           "CommodityName VARCHAR(300),"
           "Price VARCHAR(20),"
           "StoreName VARCHAR(40),"
           "Comment VARCHAR(20),"
           "goodicon VARCHAR(100),"
           "DetaileUrl VARCHAR(100))")
    cur.execute(sql)
def InsertBrand(name):  # insert one row into the brand table
    """Insert a single brand name into the Brand table.

    Security fix: the original concatenated `name` (scraped, untrusted text)
    directly into the SQL string; use a parameterized query instead.

    Args:
        name: brand name string scraped from the page.
    """
    sql = "insert into Brand(BrandName) values(%s)"
    print("生成SQL语句》》》》》》》" + sql)
    try:
        print("语句执行")
        cur.execute(sql, (name,))
        print("插入成功")
        # commit the insert
        con.commit()
        print("已经提交")
    except Exception:
        # roll back on any failure so the connection stays usable
        con.rollback()
def InsertCommotity(CommodityName, Price, StoreName, Comment, goodicon, DetaileUrl):
    """Insert one product row into the Commodity table.

    Security fix: all six values are scraped (untrusted) text and were
    concatenated into the SQL string; a single quote in any field broke the
    statement and allowed injection. Use a parameterized query.
    """
    sql = ("insert into Commodity"
           "(CommodityName,Price,StoreName,Comment,goodicon,DetaileUrl) "
           "values(%s,%s,%s,%s,%s,%s)")
    try:
        cur.execute(sql, (CommodityName, Price, StoreName, Comment,
                          goodicon, DetaileUrl))
        # commit the insert
        con.commit()
        print("已经提交")
    except Exception:
        # roll back on any failure so the connection stays usable
        con.rollback()
def ClearTable(table):
    """Delete every row from `table` (the table itself is kept).

    Bug fix: the original had `finally: con.close()`, which closed the shared
    module-level connection after a single call and broke every subsequent
    database operation in the process. The connection is now left open.

    Args:
        table: table name (internal, trusted — identifiers cannot be bound
            as query parameters).
    """
    sql = "DELETE FROM " + table + " WHERE id > 0"
    try:
        # execute and commit the bulk delete
        cur.execute(sql)
        con.commit()
    except Exception:
        # roll back on failure so the connection stays usable
        con.rollback()
def getSoup(url):  # fetch a page and return its soup
    """Fetch `url` and return a BeautifulSoup of the page, or '' on failure.

    Improvements: a request timeout so a hung server cannot block forever,
    and a narrowed except clause (the original bare `except:` swallowed
    everything, including KeyboardInterrupt).
    """
    try:
        r = requests.get(url, timeout=10)
        r.encoding = "UTF-8"  # JD search pages are UTF-8
        r.raise_for_status()  # any non-2xx status raises HTTPError
        return BeautifulSoup(r.text, "html.parser")
    except requests.RequestException:
        # keep the original '' sentinel on any network/HTTP failure
        return ''
def downLoadImage(img_url, dest_dir='F:\\python\\img\\'):  # save an image locally
    """Download the image at `img_url` into `dest_dir`.

    Bug fix: the file only does `import urllib`, which does not make
    `urllib.request` available in Python 3 — import the submodule explicitly.
    Generalization: the destination directory is now a parameter whose
    default preserves the original hard-coded path.

    Args:
        img_url: direct URL of the image; the last path segment is used as
            the local file name.
        dest_dir: directory (with trailing separator) to save into.
    """
    import urllib.request  # submodule must be imported explicitly
    filename = img_url.split('/')[-1]
    urllib.request.urlretrieve(img_url, dest_dir + filename)
def findAllBrand(soup):  # find and store all brands on the page
    """Extract every brand name from the search page's brand filter list and
    insert each into the Brand table.

    Returns:
        True  if the page has no brand filter block (nothing inserted),
        False if brands were found and inserted.
    """
    assert isinstance(soup, BeautifulSoup)
    data = soup.find('ul', {"class": "J_valueList v-fixed"})
    if data is None:  # idiom fix: `is None`, not `== None`
        return True
    for li in data.find_all('li'):
        # the brand URL (li.a['href']) was read but never used — dropped
        InsertBrand(li.a.attrs['title'])
        print("插入成功***")
    return False
def getPages(url):
    """Open `url` in Chrome and return the total page count JD displays
    (text of the bottom pager's first <b> element).

    Fix: `find_element_by_xpath` was removed in Selenium 4 — use
    `find_element(By.XPATH, ...)` (By is already imported at file top).
    """
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        pages = browser.find_element(
            By.XPATH, "//*[@id='J_bottomPage']/span[2]/em[1]/b").text
    finally:
        # always release the browser, even if the element is not found
        browser.close()
    return pages
def findAllGoods(soup):
    """Parse every product card on a JD search page and insert each into the
    Commodity table; prints the count of inserted products at the end.

    Cards with a missing or empty store name are skipped. Idiom fixes only:
    `is None` comparisons and removal of the no-op `+ ""` concatenations —
    the inserted values are unchanged.
    """
    assert isinstance(soup, BeautifulSoup)
    data = soup.find('ul', {"class": "gl-warp clearfix"})
    cnt = 0
    for li in data.find_all("li"):
        store = li.find('div', {"class": "p-shop"})
        if store is None:
            continue  # card without a shop block — skip
        if store.text == '' or store.text is None:
            continue  # empty shop name — skip
        print(store.text)
        cnt += 1
        # product name: strip surrounding whitespace, drop inner spaces
        name = li.find('div', {"class": "p-name p-name-type-2"}).text
        name = name.strip().replace(" ", "")
        comment = li.find('div', {"class": "p-commit"}).strong.text
        price = li.find('div', {"class": "p-price"}).strong.text
        icon = li.find('div', {"class": "p-icons"}).text.strip()
        detail_url = li.find('div', {"class": "p-img"}).a.attrs['href'].strip()
        InsertCommotity(str(name), str(price), str(store.text), str(comment),
                        str(icon), str(detail_url))
        print("商品数据插入数据库成功")
    print(cnt)
def seletBrand():
    """Return every Brand row as a list of [Id, BrandName] pairs, or '' if
    the query fails."""
    sql = "select *from Brand"
    try:
        cur.execute(sql)
        # first two columns of each row: [Id, BrandName]
        return [[row[0], row[1]] for row in cur.fetchall()]
    except Exception:
        # roll back any partial transaction state, keep the '' sentinel
        con.rollback()
        return ''
def selectGoods():
    """Return commodity rows as lists of their first six columns
    (CId, CommodityName, Price, StoreName, Comment, goodicon) — the
    detail URL column is deliberately not included, matching the
    original behavior. Returns '' if the query fails."""
    sql = "select *from commodity"
    try:
        cur.execute(sql)
        return [list(row[:6]) for row in cur.fetchall()]
    except Exception:
        # roll back any partial transaction state, keep the '' sentinel
        con.rollback()
        return ''
def selectUrl(Id):
    """Return the (DetaileUrl,) row for commodity `Id`, or '' on error.

    Security/robustness fix: the original concatenated `Id` into the SQL
    text (injection-prone, and required `Id` to already be a str). The
    parameterized form is safe and accepts int or str ids — backward
    compatible with existing callers.
    """
    sql = "select `DetaileUrl` FROM commodity WHERE CId=%s"
    try:
        cur.execute(sql, (Id,))
        return cur.fetchone()
    except Exception:
        # roll back on failure so the connection stays usable
        con.rollback()
        return ''
def addGoodsToCart(url):
    """Open a JD product page in Chrome and click its add-to-cart button.

    Fix: `find_element_by_id` was removed in Selenium 4 — use
    `find_element(By.ID, ...)`. The browser window is left open, as in the
    original, so the user can see the resulting cart page.
    """
    browser = webdriver.Chrome()
    browser.get(url)
    browser.find_element(By.ID, "InitCartUrl").click()
def main():
DeleteTable("brand")
DeleteTable("commodity")
CreatTable(cur)
CreatCommodityTable(cur)
inputCommodity = input("请输入你想要查询的商品")
url = "https://search.jd.com/Search?keyword=" + inputCommodity + "&enc=utf-8&wq=" + inputCommodity
soup = getSoup(url)
flag = findAllBrand(soup)
if flag == False:
YesOrNo=input("您是否想要选择您想要的品牌,如果想要选择请输入yes,不想要选择请输入no>>>>>>")
if YesOrNo=="yes":
brandlist = seletBrand()
BrandMessage=[]
for singleBrand in brandlist:
print("编号>>>>>"+str(singleBrand[0]), end='')
print("品牌>>>>>>"+singleBrand[1])
BrandMessage.append("编号>>>>>"+str(singleBrand[0])+" "+"品牌>>>>>>"+singleBrand[1])
Message.main(BrandMessage)
idNumber=input("请输入你想要选择的品牌编号>>>>")
print(brandlist[eval(idNumber)-brandlist[0][0]][1])
url=url+"&ev=exbrand_"+brandlist[eval(idNumber)-brandlist[0][0]][1]
print(url)
soup=getSoup(url)
pages=getPages(url)#得到总共的页数
for i in range(1,2):
print("这是第"+str(i)+"页"+"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
a = time.time()
b = '%.5f' % a
url=url+"&page="+str(i*2-1)+"&s=58&click=0"
soupfirst=getSoup(url)
# NOTE(review): the lines below are web-page residue (CSDN download-page
# metadata) accidentally fused onto the source during extraction. They are
# not Python and broke parsing; preserved here as a comment.
# 京东爬虫一站式爬虫的相关爬虫文件以及代码
# 需积分: 36  93 浏览量
# 2018-12-20 11:53:47 上传
# 评论 3
# 收藏 1.82MB RAR 举报
# 刹风
# - 粉丝: 0
# - 资源: 1