from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# import lxml
# List of URLs to crawl
def getUrls(start=2268, end=2269):
    """Return a generator of Tencent HR position-list URLs.

    Generalized from the original hard-coded ``range(2268, 2269)``:
    with no arguments it yields exactly the same single URL as before
    (``lid=2268``), but callers may now request any page-id range.

    :param start: first ``lid`` page id (inclusive), default 2268
    :param end: last ``lid`` page id (exclusive), default 2269
    :return: generator of URL strings
    """
    return (
        'https://hr.tencent.com/position.php?keywords=&tid=0&lid={}'.format(i)
        for i in range(start, end)
    )
# Define a class; one instance corresponds to one row in the database
class Item(object):
def __init__(self, positionName, positionType, peopleNum, workLocation, publicTime):
self.positionName = positionName
self.positionType = positionType
self.peopleNum = peopleNum
self.workLocation = workLocation
self.publicTime = publicTime
if __name__ == '__main__':
    # Run Chrome headless so the scraper works on servers without a display.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        for url in getUrls():
            driver.get(url)
            page = driver.page_source
            # Parse the rendered HTML and extract the listing table's cells.
            html = etree.HTML(page)
            tds = html.xpath('//*[@id="position"]//table//tr[position()>1]//td/text()')
            a_s = html.xpath('//*[@id="position"]//table//a/text()')
            # Group the flat td-text list into rows of 4 fields each.
            # BUG FIX: the original flushed a row only when the NEXT row's
            # first cell arrived, so the last table row was silently dropped
            # and an empty placeholder entry was appended first.  It also
            # shadowed the builtins `str` and `list` and built rows with
            # quadratic string concatenation followed by re-splitting.
            rows = [tds[i:i + 4] for i in range(0, len(tds), 4)]
            # Position names come from the anchors; anchors with short text
            # are navigation links, not job titles (heuristic from original).
            names = [a for a in a_s if len(a) > 10]
            all_list = []
            for index, fields in enumerate(rows):
                if len(fields) < 4 or index >= len(names):
                    # Incomplete trailing group or missing title — skip safely.
                    continue
                list_item = Item(names[index], fields[0], fields[1], fields[2], fields[3])
                print("爬取数据结果:"+list_item.positionName+","+list_item.positionType+","+list_item.peopleNum+","+list_item.workLocation+","+list_item.publicTime)
                all_list.append(list_item)
    finally:
        # BUG FIX: the original never closed the browser, leaking a Chrome
        # process on every run.
        driver.quit()