This post continues from the last one. Assuming you have the list of hotels with their URLs from Booking, you can now extract the address of each hotel. The address can then be converted to latitude and longitude, since the geo information that can be crawled from Booking is not quite right. The script below extracts the address and hotel star rating from Booking using Python and Selenium. The address is then converted to latitude and longitude. Finally, all the info is saved to a JSON file.
If you have any questions regarding the script, don’t hesitate to leave a comment.
Go crazy with it… 😉
#! /usr/bin/python
# -*- coding: utf-8 -*-
# To change this template, choose Tools | Templates
# and open the template in the editor.
__author__= "selfconstruct3d"
__date__ = "$Jun 18, 2016 01:16:36 AM$"
from selenium import webdriver
import re
import json
import urllib
import uuid
def my_random_string(string_length=10):
    '''Return a random string of exactly string_length characters.

    The characters are the uppercase hexadecimal digits (0-9, A-F) of one or
    more random UUIDs with the '-' separators removed.

    string_length -- number of characters wanted (default 10). A single UUID
                     only provides 32 hex digits, so further UUIDs are
                     appended until enough characters are available.
                     Zero or a negative value yields an empty string.
    '''
    hex_chunks = []
    remaining = string_length
    while remaining > 0:
        # uuid4().hex is the UUID without dashes: 32 lowercase hex digits.
        hex_chunks.append(uuid.uuid4().hex.upper())
        remaining -= 32
    # Clamp at 0 so a negative length cannot turn into a "drop the tail" slice.
    return "".join(hex_chunks)[:max(string_length, 0)]
# NOTE: this script targets Python 2 (urllib.urlopen / urllib.urlencode).
# print is always called with a single parenthesised argument so the output
# is the same as with the Python 2 print statement.

CITY_NAME = "Vienna"
# Country name that terminates the address text; everything after it is cut.
COUNTRY_NAME = "Austria"

# Input: JSON object keyed by hotel page URL; each value carries a "rating".
with open('crawlbooking-Vienna-hotel-urls-ratings.json') as data_file2:
    booking_raw_hotel_urls = json.load(data_file2)

driver = webdriver.Firefox()
outputDict = {}
counter = 1

try:
    for urlkey in booking_raw_hotel_urls.keys():
        # just to follow the progress
        print(counter)
        hotel_rating = booking_raw_hotel_urls[urlkey]["rating"]
        counter += 1
        # For a quick test run, break out of the loop here once counter
        # exceeds some small limit (e.g. 100).
        try:
            # switch to english
            urlkey = urlkey.replace(".de.html", ".en.html")
            # open url in browser
            driver.get(urlkey)

            name = driver.find_element_by_css_selector("span#hp_hotel_name.fn")
            fulladdrwithcat = driver.find_element_by_css_selector(
                "p#showMap2.address.address_clean")

            # Star rating: stays 0 when the element is missing, empty or
            # contains no digits. find_elements (plural) returns an empty
            # list instead of raising, so such hotels are not discarded.
            stars = 0
            star_elems = driver.find_elements_by_css_selector(
                "span.hp__hotel_ratings__stars")
            if star_elems:
                star_match = re.search(r'\d+', star_elems[0].text)
                if star_match:
                    stars = int(star_match.group())

            # Raw coordinates as published on the page: the "data-bbox"
            # attribute of the last <span> in the address box that has one.
            rawCoords = None
            for item in fulladdrwithcat.find_elements_by_tag_name("span"):
                bbox = item.get_attribute("data-bbox")
                if bbox is not None:
                    rawCoords = bbox

            # filter address string: drop whatever follows the country name.
            # When the country name is absent, keep the full text (find()
            # returns -1, which must not be used as a slice bound).
            booking_address = fulladdrwithcat.text
            country_index = booking_address.find(COUNTRY_NAME)
            if country_index != -1:
                booking_address = booking_address[:country_index + len(COUNTRY_NAME)]
            booking_address = booking_address.encode('utf-8')

            # find lat and lng coordinates for address
            # urlencode escapes the address and joins the parameters with
            # '&', so "sensor=false" is a separate query parameter.
            query = urllib.urlencode([("address", booking_address),
                                      ("sensor", "false")])
            url = "http://maps.googleapis.com/maps/api/geocode/json?" + query
            lat = None
            lng = None
            try:
                response = urllib.urlopen(url)
                try:
                    geodata = json.loads(response.read())
                finally:
                    response.close()
                location = geodata["results"][0]["geometry"]["location"]
                lat = location["lat"]
                lng = location["lng"]
            except (IOError, ValueError, KeyError, IndexError, TypeError) as e:
                # Geocoding is best effort: the hotel is kept with lat/lng
                # set to None when the lookup fails or returns no result.
                print(e)

            record = {
                "name": name.text,
                "url": urlkey,
                "addr": booking_address,
                "lat": lat,
                "lng": lng,
                "cords": rawCoords,
                "stars": stars,
                "rating": hotel_rating,
            }

            # print some info
            print(record["name"])
            print(urlkey)
            print(booking_address)
            print(lat)
            print(lng)
            print(rawCoords)
            print("stars: %s" % (stars,))
            print("rating %s" % (hotel_rating,))
            print("---")

            # generate random ID for hotel; retry on the (unlikely) collision
            hotels_hash = my_random_string(10)
            while hotels_hash in outputDict:
                hotels_hash = my_random_string(10)
            # Store the record only once it is complete, so a hotel that
            # failed half-way never leaves an empty entry in the output.
            outputDict[hotels_hash] = record
        except Exception as e:
            # Per-hotel boundary: report the problem and go on with the next.
            print(e)
finally:
    # Always close the browser, even when the crawl is interrupted.
    driver.quit()

# save to file
with open("crawlbooking-" + CITY_NAME + "-hotel-details.json", "w") as f:
    json.dump(outputDict, f)
