# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
import re
import time
import requests
# Interactive Baidu Map POI scraper.
#
# Asks for a search keyword and a city, pages through the map.baidu.com
# search endpoint, pulls a handful of fields out of every result with regular
# expressions and appends one tab-separated row per result to
# E:\project\logo\<city><keyword>.xls.
#
# The crawl is started from main(), which runs only when the file is executed
# as a script.

SEARCH_URL = 'http://map.baidu.com/'
OUTPUT_DIR = 'E:\\project\\logo\\'
HEADER = '名称\t市\t区\t地址\t电话\t坐标(X,Y)\t图片路径\t标签\t关键字\n'

# The "nn" request parameter is sent as page * 10.
# NOTE(review): presumably the endpoint returns 10 results per page - confirm.
RESULTS_PER_PAGE = 10
MAX_PAGES = 9998               # upper bound on the number of pages requested
MAX_CONSECUTIVE_FAILURES = 5   # give up after this many failed requests in a row
REQUEST_TIMEOUT = 30           # seconds; without it a stalled request blocks forever
PAGE_DELAY = 2                 # seconds to wait between two requests

# One match per result: everything between 'acc_flag"' and '"view_type":'.
RECORD_RE = re.compile(r'(?<=acc_flag").+?(?="view_type":)')
NAME_RE = re.compile(r'(?<=,"name":").+?(?=")')
ADDRESS_RE = re.compile(r'(?<="addr":").+?(?=")')
AREA_NAME_RE = re.compile(r'(?<="area_name":").+?(?=")')
PHONE_RE = re.compile(r'(?<="phone":").+?(?=")')
POINT_RE = re.compile(r'(?<="point":{").+?(?=})')
IMAGE_RE = re.compile(r'(?<="image":").+?(?=")')
STD_TAG_RE = re.compile(r'(?<="std_tag":").+?(?=")')


def build_query(city_code, key_word, page):
    """Return the query-string parameters for result page *page* (0-based)."""
    # Parameters that appeared only as commented-out notes in earlier
    # revisions (u_loc, b = map bounds, t = timestamp) are not sent.
    return {
        "newmap": "1",
        "reqflag": "pcmap",
        "biz": "1",
        "from": "webmap",
        "da_par": "direct",
        "pcevaname": "pc4.1",
        "qt": "con",
        "c": city_code,   # city
        "wd": key_word,   # search keyword
        "wd2": "",
        "pn": page,       # page number
        "nn": page * RESULTS_PER_PAGE,
        "db": "0",
        "sug": "0",
        "addr": "0",
        "da_src": "pcmappg.poi.page",
        "on_gel": "1",
        "src": "7",
        "gr": "3",
        "l": "12",
        "tn": "B_NORMAL_MAP",
        "ie": "utf-8",
    }


def unescape_response(text):
    r"""Turn backslash escapes such as ``\u5e97`` in *text* into characters.

    Characters that do not fit into latin-1 are first rewritten as
    backslash escapes, so text that already contains such characters is
    decoded back unchanged instead of raising UnicodeEncodeError.

    Raises:
        UnicodeDecodeError: if *text* contains a malformed escape sequence.
    """
    return text.encode('latin-1', 'backslashreplace').decode('unicode_escape')


def split_records(text):
    """Return the list of per-result fragments found in the decoded response."""
    return RECORD_RE.findall(text)


def extract_field(pattern, record, default='-'):
    """Return the first match of *pattern* in *record*, or *default*.

    The field patterns require at least one character, so an empty JSON
    string (``"phone":"",``) makes them capture the ``",`` delimiter
    instead; that artifact is reported as *default* as well.
    """
    match = pattern.search(record)
    if match is None:
        return default
    value = match.group(0)
    if value.strip() == '",':
        return default
    return value


def build_row(record, city_code, key_word):
    """Return the tab-separated output line (with newline) for one result.

    Column order matches HEADER: name, city, district, address, phone,
    coordinates, image URL, tag, keyword. Missing fields become '-'.
    """
    columns = [
        extract_field(NAME_RE, record),
        city_code,
        extract_field(AREA_NAME_RE, record),
        extract_field(ADDRESS_RE, record),
        extract_field(PHONE_RE, record),
        extract_field(POINT_RE, record),
        extract_field(IMAGE_RE, record),
        extract_field(STD_TAG_RE, record),
        key_word,
    ]
    return '\t'.join(columns) + '\n'


def main():
    """Prompt for keyword and city, crawl all result pages and save the rows."""
    key_word = input('输入关键字:')
    city_code = input('查询的城市:')
    print("爬虫开始,现在抓取", city_code, '的数据,抓取关键字:', key_word)

    f_path = OUTPUT_DIR + city_code + key_word + '.xls'
    total = 0      # rows actually written
    page = 0
    failures = 0   # consecutive failed requests for the current page

    # The platform default encoding is kept on purpose so the output stays
    # compatible with files produced by earlier runs.
    with open(f_path, 'w') as f_re:
        f_re.write(HEADER)
        while page < MAX_PAGES:
            try:
                response = requests.get(
                    SEARCH_URL,
                    params=build_query(city_code, key_word, page),
                    timeout=REQUEST_TIMEOUT,
                )
                records = split_records(unescape_response(response.text))
            except (requests.RequestException, UnicodeError) as e:
                # Retry the same page, but report the error, pause between
                # attempts and stop after a bounded number of failures.
                failures += 1
                print("调用百度地图错误", e)
                if failures >= MAX_CONSECUTIVE_FAILURES:
                    break
                time.sleep(PAGE_DELAY)
                continue
            failures = 0

            if not records:
                # A page without results marks the end of the listing.
                break

            for record in records:
                try:
                    # The row is written with a single call so a failure
                    # cannot leave a partial line in the file.
                    f_re.write(build_row(record, city_code, key_word))
                except (OSError, UnicodeError) as e:
                    print("存储错误:", e)
                    continue
                total += 1

            time.sleep(PAGE_DELAY)
            page += 1
            print('导入了', total, '条数据')

    print('文件存储路径:', f_path)
    input("爬虫结束")


if __name__ == '__main__':
    main()
# NOTE: the two lines below are web-page residue ("Comment 1", "Latest resources"), not code.
# 评论1
# 最新资源