How to crawl hotel information from Booking.com using Python and Selenium

This post continues from the previous one. Assuming you already have the list of hotels with their URLs from Booking.com, you can now extract the address of each hotel. The address is then converted to latitude and longitude, because the geo information that can be crawled from Booking.com directly is not quite accurate. The script below extracts the address and the hotel star rating using Python and Selenium, geocodes the address, and finally saves all the information to a JSON file.

If you have any questions regarding the script, don’t hesitate to leave a comment.

Go crazy with it… 😉

#! /usr/bin/python
# -*- coding: utf-8 -*-

# To change this template, choose Tools | Templates
# and open the template in the editor.

__author__= "selfconstruct3d"
__date__ = "$Jun 18, 2016 01:16:36 AM$"

from selenium import webdriver
import re
import json
import urllib
import uuid

def my_random_string(string_length=10):
    '''Return an uppercase hexadecimal string taken from a fresh UUID4.

    The result is the first string_length characters of the UUID's 32 hex
    digits (dashes excluded), so it is never longer than 32 characters.
    '''
    token = uuid.uuid4().hex.upper()  # .hex is the UUID without the '-' separators
    return token[:string_length]

# Crawl every hotel page listed in the input JSON, read name, address and
# star rating with Selenium, geocode the address and save the collected
# details to "crawlbooking-<CITY_NAME>-hotel-details.json".
#
# This is a Python 2 script (urllib.urlopen / urllib.quote_plus).  Every
# print(...) call below takes a single argument, so it produces the same
# output as the Python 2 print statement.

CITY_NAME = "Vienna"

# Addresses are cut off right after this country name before geocoding.
COUNTRY_NAME = "Austria"

# NOTE(review): the base URL was missing from the published listing.  It is
# reconstructed as the Google Geocoding endpoint because the response is read
# as results[0].geometry.location -- confirm before use.  The current version
# of that API presumably also needs a "key" parameter.
GEOCODE_URL = ("https://maps.googleapis.com/maps/api/geocode/json"
               "?address=%s&sensor=false")


def _parse_stars(text):
    """Return the first integer found in *text*, or 0 if there is none."""
    match = re.search(r"\d+", text or "")
    return int(match.group()) if match else 0


def _trim_to_country(address, country):
    """Return *address* cut off right after *country*.

    The address is returned unchanged when *country* does not occur in it.
    """
    end = address.find(country)
    if end == -1:
        return address
    return address[:end + len(country)]


def _geocode(address):
    """Geocode *address* and return (lat, lng), or (None, None) on failure.

    *address* is expected to be a UTF-8 encoded byte string; it is
    URL-quoted here before being inserted into GEOCODE_URL.
    """
    try:
        response = urllib.urlopen(GEOCODE_URL % urllib.quote_plus(address))
        try:
            geodata = json.loads(response.read())
        finally:
            response.close()
        location = geodata["results"][0]["geometry"]["location"]
        return location["lat"], location["lng"]
    except (IOError, ValueError, KeyError, IndexError, TypeError) as e:
        # A failed lookup must not discard the rest of the hotel's data.
        print("geocoding failed: %r" % (e,))
        return None, None


with open('crawlbooking-' + CITY_NAME + '-hotel-urls-ratings.json') as data_file2:
    booking_raw_hotel_urls = json.load(data_file2)

driver = webdriver.Firefox()

outputDict = {}

try:
    for counter, urlkey in enumerate(booking_raw_hotel_urls, 1):

        # just to follow the progress
        print(counter)
        hotel_rating = booking_raw_hotel_urls[urlkey]["rating"]

        try:
            # switch to english
            urlkey = urlkey.replace(".de.html", ".en.html")

            # open url in browser
            driver.get(urlkey)

            name = driver.find_element_by_css_selector("span#hp_hotel_name.fn")
            fulladdrwithcat = driver.find_element_by_css_selector("p#showMap2.address.address_clean")
            starsRaw = driver.find_element_by_css_selector("span.hp__hotel_ratings__stars")

            stars = _parse_stars(starsRaw.text)

            # the last <span> carrying a data-bbox attribute wins
            rawCoords = None
            for item in fulladdrwithcat.find_elements_by_tag_name("span"):
                bbox = item.get_attribute("data-bbox")
                if bbox is not None:
                    rawCoords = bbox

            # filter address string: drop everything after the country name
            booking_address = _trim_to_country(
                fulladdrwithcat.text, COUNTRY_NAME).encode('utf-8')

            # find lat and lng coordinates for address
            lat, lng = _geocode(booking_address)

            # Store the record only once the page was scraped successfully,
            # under a freshly generated random ID, so failed hotels do not
            # leave empty entries behind.
            # NOTE(review): only the "url" key is visible in the published
            # listing; the other keys follow the post's statement that all
            # info is saved -- adjust names to whatever consumes the file.
            hotels_hash = my_random_string(10)
            outputDict[hotels_hash] = {
                "url": urlkey,
                "name": name.text,
                "address": booking_address,
                "lat": lat,
                "lng": lng,
                "bbox": rawCoords,
                "stars": stars,
                "rating": hotel_rating,
            }

            # print some info
            print(name.text)
            print(urlkey)
            print(booking_address)
            print(lat)
            print(lng)
            print(rawCoords)
            print("stars: %s" % (stars,))
            print("rating %s" % (hotel_rating,))

            print("---")
        except Exception as e:
            # Best effort: report the problem and continue with the next hotel.
            print(e)
finally:
    # Always close the browser, even when the crawl is interrupted.
    driver.quit()

# save to file
with open("crawlbooking-" + CITY_NAME + "-hotel-details.json", "w") as f:
    json.dump(outputDict, f)


Leave a Reply

Fill in your details below or click an icon to log in: Logo

You are commenting using your account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: