# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import json
from geopy.geocoders import Nominatim
from geopy.distance import great_circle
from numpy import *
# 采用urllib2+BS4进行静态网页解析
def getCityByBS4(url):
html = urllib2.urlopen(urllib2.Request(url)).read().decode('gbk')
soup = BeautifulSoup(html,'lxml')
x = soup.find_all('p')
t=1
cityLocationList = []
cityLocationListOnlyLatLng =[]
for i in x:
if t > 5 and t<106:
city1 = i.get_text()
city1 = city1.encode('utf-8')
city1 = re.split('\.',city1)[1]
city1 = re.split('(',city1)[0]
if len(city1)>13:
city2 = city1[0:6]
else:
city2 =city1.strip()
try:
write2txt(city2,"cityname.txt")
lat_lng,lat_lngWithCommon = getLocation_xml(city2+"市")
city_location = "%s %s"%(city2,str(lat_lng))
city_locationOnlyLatLng = lat_lng
cityLocationList.append(city_location)
cityLocationListOnlyLatLng.append(city_locationOnlyLatLng)
#write2txt(city_location,"City&LocationBS4.txt")
print city_location
except:
print "something wrong about city:",city2
t +=1
return cityLocationList,cityLocationListOnlyLatLng
#采用selenium+phantomjs进行动态解析
def getCityBySelenium(url):
driver = webdriver.PhantomJS(executable_path="phantomjs.exe")
driver.get(url)
cityLocationList = []
cityLocationListOnlyLatLng =[]
for i in range(6,106):
elem_city = driver.find_element_by_xpath("//font[@id='zoom']/p[%d]"%i).text
elem_city = elem_city.encode('utf-8')
try:
city = re.split("[\.]",elem_city)[1]
city = re.split("(",city)
city1 = city[0]
# 一个中文字符占3个长度!!
if len(city1)>13:
city2 = city1[0:6]
else:
city2 =city1.strip()
lat_lng,lat_lngWithCommon = getLocation_xml(city2+"市")
city_location = "%s %s"%(city2,str(lat_lng))
city_locationOnlyLatLng = lat_lng
#write2txt(city_location,"City&LocationBySelenium.txt")
print city_location
cityLocationList.append(city_location)
cityLocationListOnlyLatLng.append(city_locationOnlyLatLng)
except:
print 'something wrong with ',elem_city
# 调用完后记得关闭!!!!!不然phantomjs一直占用内存
driver.close()
driver.quit()
return cityLocationList,cityLocationListOnlyLatLng
# Helper: append one line of text to a txt file.
def write2txt(file, txtname):
    """Append `file` (one line of text) plus a trailing newline to the
    file named `txtname`.

    NOTE(review): the first parameter shadows the Python 2 builtin
    `file`; the name is kept for call compatibility.  Using `with`
    guarantees the handle is closed even if the write raises (the
    original's own comment asked for exactly this).
    """
    with open(txtname, 'a') as f:
        f.write(file)
        f.write("\n")
# Helper: read a txt file line by line.
def readFromTxt(path):
    """Return a list of the stripped lines of `path` (blank lines become
    empty strings, matching the original readline loop).

    Iterating the file object streams one line at a time, so the whole
    file is never loaded into memory at once; `with` guarantees the
    handle is closed even if reading raises.
    """
    line_list = []
    with open(path, 'r') as f:
        for line in f:
            line_list.append(line.strip())
    return line_list
# Call the geocoding API; the response comes back as JSON.
def getLocation_json(addr):
    """Geocode `addr` through the Baidu map API (JSON output).

    Returns a pair of strings: "lng lat" (space separated) and
    "lng,lat" (comma separated).
    """
    url = 'http://api.map.baidu.com/geocoder?address=%s&output=json&key=f247cdb592eb43ebac6ccd27f796e2d2' % (addr)
    response = urllib2.urlopen(urllib2.Request(url))
    payload = response.read()        # raw str body
    parsed = json.loads(payload)     # -> dict
    location = parsed['result']['location']
    lng = location['lng']            # longitude
    lat = location['lat']            # latitude
    return "%s %s" % (lng, lat), "%s,%s" % (lng, lat)
# Call the geocoding API; the response comes back as XML.
def getLocation_xml(addr):
    """Geocode `addr` through the Baidu map API (XML output).

    Returns a pair of strings: "lng lat" (space separated) and
    "lng,lat" (comma separated).  Raises if the response lacks a
    <result><location> node (e.g. unknown address) -- callers wrap
    this function in try/except.
    """
    url = 'http://api.map.baidu.com/geocoder?address=%s&output=xml&key=f247cdb592eb43ebac6ccd27f796e2d2' % (addr)
    html = urllib2.urlopen(urllib2.Request(url))
    xml = html.read()
    bs_getDetail = BeautifulSoup(xml, 'lxml')
    # Walk the tag path directly: <result><location><lng>/<lat>.
    # (A previously commented-out find()+regex variant was removed.)
    lng = float(bs_getDetail.result.location.lng.string)
    lat = float(bs_getDetail.result.location.lat.string)
    lng_lat = "%f %f" % (lng, lat)
    lng_latWithCommon = "%f,%f" % (lng, lat)
    return lng_lat, lng_latWithCommon
# 调用Geopy包进行处理-获取城市名
def getCitynameByGeo(lat_lng):
#知道经纬度获取地址
geolocator = Nominatim()
location = geolocator.reverse(lat_lng)
addr = location.address
print addr
cityname = re.split("[/,]",addr)[-5].strip()
print cityname
return addr,cityname
# Use the geopy package: forward-geocode a city name to coordinates.
def getLocationByGeo(cityname):
    """Geocode `cityname` with Nominatim and return the string
    "<cityname> <lat>,<lng>"."""
    geocoder = Nominatim()
    hit = geocoder.geocode(cityname)
    return "%s %s,%s" % (cityname, hit.latitude, hit.longitude)
# Adapted for the data: tolist() turns the numpy vectors into plain lists
# before handing them to geopy.
def distSLC(vecA, vecB):
    """Great-circle distance in miles between two coordinate vectors
    (numpy arrays; presumably (lat, lng) order -- as great_circle expects)."""
    point_a = vecA.tolist()
    point_b = vecB.tolist()
    return great_circle(point_a, point_b).miles
# --- Disabled driver snippets ---------------------------------------------
# NOTE(review): the three triple-quoted blocks below are commented-out
# usage examples kept as module-level string literals (no-op statements).
# The third one calls getCity(), which is not defined in this file --
# presumably an older name for getCityByBS4/getCityBySelenium; confirm
# before re-enabling.  Content left byte-identical on purpose.
'''
if __name__ == '__main__':
url = 'http://www.redsh.com/a/20160126/171501.shtml'
city_locationList,cityLocationListOnlyLatLng= getCityByBS4(url)
print city_locationList
for j in cityLocationListOnlyLatLng:
write2txt(j,"LocaionOnlyBySeleniumNoCommon.txt")
'''
'''
addr = readFromTxt('city.txt')
for i in range(len(addr)):
try:
lng_lat = getLocation_xml(addr[i]+"市")
print "%s:%s"%(addr[i],str(lng_lat))
write2txt("%s:%s"%(addr[i],str(lng_lat)),"city_location.txt")
except:
print "something wrong about %s"%(addr[i])
'''
'''
addr = getCity(url)
for i in range(len(addr)):
try:
lng_lat = getLocation_xml(addr[i]+"市")
print "%s:%s"%(addr[i],str(lng_lat))
write2txt("%s:%s"%(addr[i],str(lng_lat)),"city_location.txt")
except:
print "something wrong about %s"%(addr[i])
'''
# NOTE(review): the text below is extraneous boilerplate from the web page
# this file was downloaded from (download-site chrome, not Python source).
# It previously sat here as bare text and broke the module with a
# SyntaxError; it is preserved verbatim as comments.
# 没有合适的资源?快使用搜索试试~ 我知道了~
# python数据挖掘最佳吃货住宿点KmeansEating
# 共3个文件
# py:3个
# 需积分: 9 5 下载量 35 浏览量
# 2017-08-17
# 23:02:55
# 上传
# 评论 2
# 收藏 4KB ZIP 举报
# 温馨提示
# python爬取百度地图饭馆地址排行,利用爬取数据挖掘最佳吃货住宿点,用到机器学习方法
# 资源推荐
# 资源详情
# 资源评论
# 收起资源包目录
# 最佳吃货住宿点KmeansEating.zip (3个子文件)
# kmeansEatHarbin.py 1KB
# API_get.py 6KB
# kmeans_eatlocation.py 3KB
# 共 3 条
# - 1
# 资源评论
# 冷雨热
# - 粉丝: 11
# - 资源: 41
# 上传资源 快速赚钱
# - 我的内容管理 展开
# - 我的资源 快来上传第一个资源
# - 我的收益 登录查看自己的收益
# - 我的积分 登录查看自己的积分
# - 我的C币 登录后查看C币余额
# - 我的收藏
# - 我的下载
# - 下载帮助
# 安全验证
# 文档复制为VIP权益,开通VIP直接复制
# 信息提交成功