# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import json
from geopy.geocoders import Nominatim
from geopy.distance import great_circle
from numpy import *
# 采用urllib2+BS4进行静态网页解析
def getCityByBS4(url):
html = urllib2.urlopen(urllib2.Request(url)).read().decode('gbk')
soup = BeautifulSoup(html,'lxml')
x = soup.find_all('p')
t=1
cityLocationList = []
cityLocationListOnlyLatLng =[]
for i in x:
if t > 5 and t<106:
city1 = i.get_text()
city1 = city1.encode('utf-8')
city1 = re.split('\.',city1)[1]
city1 = re.split('(',city1)[0]
if len(city1)>13:
city2 = city1[0:6]
else:
city2 =city1.strip()
try:
write2txt(city2,"cityname.txt")
lat_lng,lat_lngWithCommon = getLocation_xml(city2+"市")
city_location = "%s %s"%(city2,str(lat_lng))
city_locationOnlyLatLng = lat_lng
cityLocationList.append(city_location)
cityLocationListOnlyLatLng.append(city_locationOnlyLatLng)
#write2txt(city_location,"City&LocationBS4.txt")
print city_location
except:
print "something wrong about city:",city2
t +=1
return cityLocationList,cityLocationListOnlyLatLng
#采用selenium+phantomjs进行动态解析
def getCityBySelenium(url):
driver = webdriver.PhantomJS(executable_path="phantomjs.exe")
driver.get(url)
cityLocationList = []
cityLocationListOnlyLatLng =[]
for i in range(6,106):
elem_city = driver.find_element_by_xpath("//font[@id='zoom']/p[%d]"%i).text
elem_city = elem_city.encode('utf-8')
try:
city = re.split("[\.]",elem_city)[1]
city = re.split("(",city)
city1 = city[0]
# 一个中文字符占3个长度!!
if len(city1)>13:
city2 = city1[0:6]
else:
city2 =city1.strip()
lat_lng,lat_lngWithCommon = getLocation_xml(city2+"市")
city_location = "%s %s"%(city2,str(lat_lng))
city_locationOnlyLatLng = lat_lng
#write2txt(city_location,"City&LocationBySelenium.txt")
print city_location
cityLocationList.append(city_location)
cityLocationListOnlyLatLng.append(city_locationOnlyLatLng)
except:
print 'something wrong with ',elem_city
# 调用完后记得关闭!!!!!不然phantomjs一直占用内存
driver.close()
driver.quit()
return cityLocationList,cityLocationListOnlyLatLng
# Helper: append one line of text to a txt file.
def write2txt(file, txtname):
    """Append `file` (one line of text) plus a trailing newline to the
    file named `txtname`.

    NOTE(review): the first parameter shadows the Python 2 builtin
    `file`; the name is kept for call compatibility.  Using `with`
    guarantees the handle is closed even if the write raises (the
    original's own comment asked for exactly this).
    """
    with open(txtname, 'a') as f:
        f.write(file)
        f.write("\n")
# Helper: read a txt file line by line.
def readFromTxt(path):
    """Return a list of the stripped lines of `path` (blank lines become
    empty strings, matching the original readline loop).

    Iterating the file object streams one line at a time, so the whole
    file is never loaded into memory at once; `with` guarantees the
    handle is closed even if reading raises.
    """
    line_list = []
    with open(path, 'r') as f:
        for line in f:
            line_list.append(line.strip())
    return line_list
# Call the geocoding API; the response comes back as JSON.
def getLocation_json(addr):
    """Geocode `addr` through the Baidu map API (JSON output).

    Returns a pair of strings: "lng lat" (space separated) and
    "lng,lat" (comma separated).
    """
    url = 'http://api.map.baidu.com/geocoder?address=%s&output=json&key=f247cdb592eb43ebac6ccd27f796e2d2' % (addr)
    response = urllib2.urlopen(urllib2.Request(url))
    payload = response.read()        # raw str body
    parsed = json.loads(payload)     # -> dict
    location = parsed['result']['location']
    lng = location['lng']            # longitude
    lat = location['lat']            # latitude
    return "%s %s" % (lng, lat), "%s,%s" % (lng, lat)
# Call the geocoding API; the response comes back as XML.
def getLocation_xml(addr):
    """Geocode `addr` through the Baidu map API (XML output).

    Returns a pair of strings: "lng lat" (space separated) and
    "lng,lat" (comma separated).  Raises if the response lacks a
    <result><location> node (e.g. unknown address) -- callers wrap
    this function in try/except.
    """
    url = 'http://api.map.baidu.com/geocoder?address=%s&output=xml&key=f247cdb592eb43ebac6ccd27f796e2d2' % (addr)
    html = urllib2.urlopen(urllib2.Request(url))
    xml = html.read()
    bs_getDetail = BeautifulSoup(xml, 'lxml')
    # Walk the tag path directly: <result><location><lng>/<lat>.
    # (A previously commented-out find()+regex variant was removed.)
    lng = float(bs_getDetail.result.location.lng.string)
    lat = float(bs_getDetail.result.location.lat.string)
    lng_lat = "%f %f" % (lng, lat)
    lng_latWithCommon = "%f,%f" % (lng, lat)
    return lng_lat, lng_latWithCommon
# 调用Geopy包进行处理-获取城市名
def getCitynameByGeo(lat_lng):
#知道经纬度获取地址
geolocator = Nominatim()
location = geolocator.reverse(lat_lng)
addr = location.address
print addr
cityname = re.split("[/,]",addr)[-5].strip()
print cityname
return addr,cityname
# Use the geopy package: forward-geocode a city name to coordinates.
def getLocationByGeo(cityname):
    """Geocode `cityname` with Nominatim and return the string
    "<cityname> <lat>,<lng>"."""
    geocoder = Nominatim()
    hit = geocoder.geocode(cityname)
    return "%s %s,%s" % (cityname, hit.latitude, hit.longitude)
# Adapted for the data: tolist() turns the numpy vectors into plain lists
# before handing them to geopy.
def distSLC(vecA, vecB):
    """Great-circle distance in miles between two coordinate vectors
    (numpy arrays; presumably (lat, lng) order -- as great_circle expects)."""
    point_a = vecA.tolist()
    point_b = vecB.tolist()
    return great_circle(point_a, point_b).miles
# --- Disabled driver snippets ---------------------------------------------
# NOTE(review): the three triple-quoted blocks below are commented-out
# usage examples kept as module-level string literals (no-op statements).
# The third one calls getCity(), which is not defined in this file --
# presumably an older name for getCityByBS4/getCityBySelenium; confirm
# before re-enabling.  Content left byte-identical on purpose.
'''
if __name__ == '__main__':
url = 'http://www.redsh.com/a/20160126/171501.shtml'
city_locationList,cityLocationListOnlyLatLng= getCityByBS4(url)
print city_locationList
for j in cityLocationListOnlyLatLng:
write2txt(j,"LocaionOnlyBySeleniumNoCommon.txt")
'''
'''
addr = readFromTxt('city.txt')
for i in range(len(addr)):
try:
lng_lat = getLocation_xml(addr[i]+"市")
print "%s:%s"%(addr[i],str(lng_lat))
write2txt("%s:%s"%(addr[i],str(lng_lat)),"city_location.txt")
except:
print "something wrong about %s"%(addr[i])
'''
'''
addr = getCity(url)
for i in range(len(addr)):
try:
lng_lat = getLocation_xml(addr[i]+"市")
print "%s:%s"%(addr[i],str(lng_lat))
write2txt("%s:%s"%(addr[i],str(lng_lat)),"city_location.txt")
except:
print "something wrong about %s"%(addr[i])
'''
# NOTE(review): the text below is extraneous boilerplate from the web page
# this file was downloaded from (download-site chrome, not Python source).
# It previously sat here as bare text and broke the module with a
# SyntaxError; it is preserved verbatim as comments.
# 没有合适的资源?快使用搜索试试~ 我知道了~
# python数据挖掘最佳吃货住宿点KmeansEating
# 共3个文件
# py:3个
# 需积分: 9 5 下载量 35 浏览量
# 2017-08-17
# 23:02:55
# 上传
# 评论 2
# 收藏 4KB ZIP 举报
# 温馨提示
# python爬取百度地图饭馆地址排行,利用爬取数据挖掘最佳吃货住宿点,用到机器学习方法
# 资源推荐
# 资源详情
# 资源评论
# 收起资源包目录
# 最佳吃货住宿点KmeansEating.zip (3个子文件)
# kmeansEatHarbin.py 1KB
# API_get.py 6KB
# kmeans_eatlocation.py 3KB
# 共 3 条
# - 1
# 资源评论
# 冷雨热
# - 粉丝: 11
# - 资源: 41
# 上传资源 快速赚钱
# - 我的内容管理 展开
# - 我的资源 快来上传第一个资源
# - 我的收益 登录查看自己的收益
# - 我的积分 登录查看自己的积分
# - 我的C币 登录后查看C币余额
# - 我的收藏
# - 我的下载
# - 下载帮助
# 安全验证
# 文档复制为VIP权益,开通VIP直接复制
# 信息提交成功