#!/usr/bin/env python
# coding: utf-8
# In[1]:
from threading import Thread
from DrissionPage import ChromiumPage
from DrissionPage.common import ActionChains
from DataRecorder import Recorder
import json
import pandas as pd
from multiprocessing.dummy import Process,Pool
import json,time,os
from bs4 import BeautifulSoup as bs
import jieba
import requests
# In[25]:
# Open search-result pages 1-137, one browser tab per page, and keep the
# tab objects in dataList so they can be parsed afterwards.
dataList = []
page = ChromiumPage()
for i in range(1, 138):
    # Only the ``page`` query parameter varies between iterations; the
    # adjacent literals below concatenate to the original URL unchanged.
    url = (
        'https://zc-paimai.taobao.com/wow/pm/default/pc/zichansearch'
        '?disableNav=YES'
        '&fcatV4Ids=[%22206051702%22]'
        '&page=' + str(i) +
        '&spm=a2129.27064540.puimod-zc-focus-2021_2860107850.category-1-2'
        '&pmid=7794258071_1702609330195'
        '&pmtk=20140647.0.0.0.27064540.puimod-zc-focus-2021_2860107850.category-1-2'
        '&path=27064540'
        '&statusOrders=[%222%22]'
    )
    # new_tab()'s return value is handed to get_tab() to obtain the tab object.
    tabid = page.new_tab(url)
    tab = page.get_tab(tabid)
    dataList.append(tab)
# In[68]:
def parse(tab, dataAAA, max_items=40):
    """Extract one row per result card from a loaded search-results tab.

    The page source in ``tab.html`` is parsed with BeautifulSoup and the
    children of the result-list container are read purely by position
    (``.contents[n]``), so this function is tightly coupled to the page's
    current DOM layout.

    Args:
        tab: Object exposing the rendered page source as ``tab.html``.
        dataAAA: List that receives one dict per card; mutated in place.
        max_items: Maximum number of cards read from the container.
            Defaults to 40, the limit that used to be hard-coded; ``None``
            reads every child of the container.

    Raises:
        IndexError: If the result-list container is absent from the page.
            Positional lookups inside a card are not guarded and will
            also raise if a card deviates from the expected layout.
    """
    soup = bs(tab.html, 'lxml')
    containers = soup.find_all(
        name='div',
        attrs={"class": "rax-view-v2 pc-search-list--area--DmjoOeu"},
    )
    if not containers:
        # Keep the exception type of the former bare ``[0]`` lookup, but say
        # what is actually missing instead of "list index out of range".
        raise IndexError(
            "search result list container "
            "'rax-view-v2 pc-search-list--area--DmjoOeu' not found in tab "
            "HTML (page not fully loaded, or layout changed?)"
        )

    for div in containers[0].contents[:max_items]:
        a = div.contents[0]
        cons = a.contents[1].contents
        info = cons[2]    # price / appraisal / end-time section
        stats = cons[4]   # view count / sign-up count / optional type

        # Resolve the deeply nested price node once; the amount and its
        # unit are siblings inside it.
        price = info.contents[0].contents[1].contents[0].contents[0]
        val = price.contents[1].get_text().strip()
        unit = price.contents[2].get_text().strip()

        dataDict = {}
        dataDict['链接'] = "http:" + a['href']
        dataDict['标题'] = cons[0].get_text().strip()
        dataDict['当前价'] = val + unit
        dataDict['评估价'] = info.contents[1].contents[2].get_text().strip()
        dataDict['结束时间'] = info.contents[2].contents[1].get_text().strip()
        dataDict['围观次数'] = stats.contents[0].contents[0].get_text().strip()
        dataDict['报名人数'] = stats.contents[1].contents[0].get_text().strip()
        # The third stats child is optional; the key is simply omitted
        # from the row when it is not there.
        if len(stats.contents) > 2:
            dataDict['拍卖类型'] = stats.contents[2].contents[1].get_text().strip()
        dataAAA.append(dataDict)
# Collect the rows of every opened results tab into one flat list;
# parse() appends to the list it is given.
dataAAA = list()
for tab in dataList:
    parse(tab, dataAAA)
# In[70]:
# Write all collected rows to a UTF-8 CSV, leaving out the DataFrame index.
df = pd.DataFrame(data=dataAAA)
df.to_csv('ali_zc.csv', index=False, encoding='utf-8')
# In[71]:
# Directory prefix from which getxiangqing() reads "<id>.html" files
# (the path is concatenated directly, so the trailing slash is required).
# NOTE(review): hard-coded absolute Windows-style path — adjust per machine.
basepath = "C://Users/MI/Desktop/python/cache/"
# Create a new page object
#page = ChromiumPage()
# Open the URL in the first tab
#url = 'https://sf-item.taobao.com/sf_item/720622371601.htm?track_id=cc10dbe7-4dbb-4b39-b979-c2de2145cafe'
#page.get(url)
# Get the first tab object
#tab1 = page.get_tab()
# In[37]:
def getxiangqing(first):
try:
tableDict = {}
url = 'https:'+first['itemUrl']
file = basepath+str(first['id'])+".html"
with open(file,'r',encoding='utf8') as r:
html = r.read()
soup = bs(html, 'lxml')
J_TimeLeft = soup.find_all(name='span',attrs={"class":"countdown J_TimeLeft"})[0]
tableDict['结束时间'] = J_TimeLeft.get_text().strip()
J_Delay_spans = soup.find_all(name='span',attrs={"id":"J_Delay"})
if len(J_Delay_spans) > 0:
delayCnt_ems = J_Delay_spans[0].find_all(name='em',attrs={"class":"delayCnt"})
if len(delayCnt_ems) > 0:
tableDict['延时次数'] = delayCnt_ems[0].get_text().strip()
J_Price_spans = soup.find_all(name='span',attrs={"class":"class=pm-current-price J_Price"})
if len(J_Price_spans) > 0:
tableDict['当前价'] = J_Price_spans[0].get_text()
pai_pay_infor_tables = soup.find_all(name="table",attrs={"class":"pai-pay-infor"})
if len(pai_pay_infor_tables) > 0:
trs = pai_pay_infor_tables[0].find_all(name="tbody",attrs={"id":"J_HoverShow"})[0].find_all(name="tr")
if trs and len(trs)>0:
tableDict['保证金'] = trs[0].find_all(name="span",attrs={"class":"J_Price"})[0].get_text().strip().split(" ")[0].replace('¥','')
if len(trs)>3:
tdx = trs[3].find_all(name="td")
for td in tdx:
data_spans = td.find_all(name="span")
tableDict[data_spans[0].get_text().strip()] = data_spans[1].get_text().strip().split(" ")[1].replace('¥','')
if len(trs)>3:
tdx = trs[4].find_all(name="td")
for td in tdx:
data_spans = td.find_all(name="span")
tableDict[data_spans[0].get_text().strip()] = data_spans[1].get_text().strip().split(" ")[1].replace('¥','')
spans0s = soup.find_all(name='span',attrs={"id":"J_Applyer"})
if len(spans0s)>0:
tableDict['报名人数']= spans0s[0].get_text().strip()
spans1s = soup.find_all(name='span',attrs={"id":"J_NotifyNum"})
if len(spans1s)>0:
tableDict['设置提醒人数']= spans1s[0].get_text().strip()
spans2s = soup.find_all(name='span',attrs={"id":"J_Looker"})
if len(spans1s)>0:
tableDict['围观人数']= spans2s[0].get_text().strip()
J_desc_divs = soup.find_all(name='div',attrs={"id":"J_desc"})
if len(J_desc_divs) > 0:
tables = J_desc_divs[0].find_all(name="table")
if tables and len(tables) > 0:
tabDict = {}
for table in tables:
trs = table.find_all(name="tr")
if trs and len(trs)>0:
for tr in trs:
tds = tr.find_all(name="td")
if tds and len(tds) > 0:
td = tds[0]
if len(tds) == 1:
key = ''
x = 0
while len(key)==0:
key=td.contents[x].get_text().strip()
x+=1
tabDict[key] = "\n".join([x.get_text().strip() for x in td.contents[x:]])
else:
td = tds[0]
tabDict[td.get_text().strip()] = ":".join([x.get_text().strip() for x in tds[1:]])
tableDict['标的物介绍'] = str(tabDict)
else:
tableDict['标的物介绍'] = J_desc_divs[0].get_text().strip()
#标的物位置
coordinateEle = soup.find_all(name='input',attrs={"id":"J_Coordinate"})
if len(coordinateEle)>0:
coordinate = coordinateEle[0]["value"].split(',')
tableDict['经度']=coordinate[0]
tableDict['纬度']=coordinate[1]
else:
tableDict['经度']=None
tableDict['纬度']=None
org_name = soup.find_all(name='p',attrs={"class":"org-name"})[0]
tableDict['处置单位']=org_name.get_text().strip() if org_name else None
itemAddress = soup.find_all(name='div',attrs={"id":"itemAddress"})
if len(itemAddress) > 0:
address = itemAddress[0].get_text().strip()
addresses = address.split(" ")
tableDict['省'] = addresses[0]
tableDict['市'] = addresses[1] if len(addresses) > 1 else None
tableDict['区'] = addresses[2] if len(addresses) > 2 else None
itemAddressDetail = soup.find_all(name='div',attrs={"id":"itemAddressDetail"})
if len(itemAddress) > 0:
tableDict['详细地址'] = itemAddressDetail[0].get_text().strip()
item_status = soup.find_all(name='span',attrs={"class":"item-status"})
if len(item_status) > 0:
tableDict['拍卖状态'] = item_status[0].get_text().strip()
J_DownLoadFirst = soup.find_all(name='p',attrs={"id":"J_DownLoadFirst"})
if len(J_DownLoadFirst) > 0 :
tableDict['附件数量'] = len(J_DownLoadFirst[0].find_all(name='a'))
J_DetailTabMenu = soup.find_all(name='ul',attrs={"id":"J_DetailTabMenu"})[0]
lis = J_DetailTabMenu.find_all(name='li') if J_Detail