How to Crawl Hotel Information from Booking.com Using Python and Selenium

This post continues from the previous one. Assuming you already have the list of hotel URLs from Booking.com, you can now extract the address of each hotel. The address can then be converted to latitude and longitude, since the geo information that can be crawled from Booking.com is not quite accurate. The script below extracts the address and star rating of each hotel from Booking.com using Python and Selenium, and then converts the address to latitude and longitude. Finally, all the information is saved to a JSON file.

If you have any questions regarding the script, don’t hesitate to leave a comment.

Go crazy with it… 😉

#! /usr/bin/python
# -*- coding: utf-8 -*-

# To change this template, choose Tools | Templates
# and open the template in the editor.

__author__= "selfconstruct3d"
__date__ = "$Jun 18, 2016 01:16:36 AM$"

from selenium import webdriver
import re
import json
import urllib
import uuid

def my_random_string(string_length=10):
    """Build a random ID of uppercase hexadecimal characters from a new UUID4.

    The result is cut to ``string_length`` characters. A UUID only has
    32 hex digits, so asking for more than that still yields 32.
    """
    # .hex is the UUID's 32 hex digits without the '-' separators.
    hex_digits = uuid.uuid4().hex.upper()
    return hex_digits[:string_length]

CITY_NAME = "Vienna"
# Country name that ends the useful part of the address box text.
COUNTRY_NAME = "Austria"
GEOCODE_URL = "http://maps.googleapis.com/maps/api/geocode/json"

# NOTE: this script targets Python 2 (urllib.urlopen / urllib.urlencode) and
# the Selenium find_element(s)_by_* API. print() is always called with a
# single argument so the output is the same as with the print statement.


def _trim_address(full_text, country=COUNTRY_NAME):
    """Return the address-box text cut right after ``country``, UTF-8 encoded.

    The address box contains extra text after the country name. If
    ``country`` does not occur at all, the whole text is kept instead of
    being cut to an arbitrary prefix.
    """
    end = full_text.find(country)
    if end != -1:
        full_text = full_text[:end + len(country)]
    return full_text.encode('utf-8')


def _geocode(address):
    """Look up ``address`` with the geocoding service.

    Returns a ``(lat, lng)`` tuple taken from the first result, or
    ``(None, None)`` when the response contains no usable result.
    """
    # urlencode escapes spaces, commas and non-ASCII bytes and joins the
    # parameters with '&', so the address cannot corrupt the query string.
    query = urllib.urlencode([("address", address), ("sensor", "false")])
    response = urllib.urlopen(GEOCODE_URL + "?" + query)
    try:
        geodata = json.loads(response.read())
    finally:
        response.close()

    try:
        location = geodata["results"][0]["geometry"]["location"]
        return location["lat"], location["lng"]
    except (KeyError, IndexError, TypeError) as e:
        # No result for this address: report it and carry on without coords.
        print(e)
        return None, None


with open("crawlbooking-" + CITY_NAME + "-hotel-urls-ratings.json") as data_file2:
    booking_raw_hotel_urls = json.load(data_file2)

driver = webdriver.Firefox()

outputDict = {}

try:
    for counter, (urlkey, hotel_info) in enumerate(booking_raw_hotel_urls.items(), 1):

        # just to follow the progress
        print(counter)
        hotel_rating = hotel_info["rating"]

        try:
            # switch to english
            urlkey = urlkey.replace(".de.html", ".en.html")

            # open url in browser
            driver.get(urlkey)

            name = driver.find_element_by_css_selector("span#hp_hotel_name.fn")
            fulladdrwithcat = driver.find_element_by_css_selector("p#showMap2.address.address_clean")

            # A missing star element or star text without digits means
            # "0 stars" rather than a reason to drop the whole hotel.
            star_elems = driver.find_elements_by_css_selector("span.hp__hotel_ratings__stars")
            star_match = re.search(r'\d+', star_elems[0].text) if star_elems else None
            stars = int(star_match.group()) if star_match else 0

            # keep the data-bbox value of the last span that carries one
            rawCoords = None
            for item in fulladdrwithcat.find_elements_by_tag_name("span"):
                bbox = item.get_attribute("data-bbox")
                if bbox is not None:
                    rawCoords = bbox

            # filter address string
            booking_address = _trim_address(fulladdrwithcat.text)

            # find lat and lng coordinates for address
            lat, lng = _geocode(booking_address)

            hotel_name = name.text

            # print some info
            print(hotel_name)
            print(urlkey)
            print(booking_address)
            print(lat)
            print(lng)
            print(rawCoords)
            print("stars: %s" % (stars,))
            print("rating %s" % (hotel_rating,))
            print("---")

            # Register the hotel under a random ID only once every field has
            # been collected, so a failed page leaves no empty entry behind.
            hotels_hash = my_random_string(10)
            outputDict[hotels_hash] = {
                "name": hotel_name,
                "url": urlkey,
                "addr": booking_address,
                "lat": lat,
                "lng": lng,
                "cords": rawCoords,
                "stars": stars,
                "rating": hotel_rating,
            }
        except Exception as e:
            # Best effort: report the problem and continue with the next hotel.
            print(e)
finally:
    # Always shut the browser down, even if the crawl is interrupted.
    driver.quit()

# save to file
with open("crawlbooking-" + CITY_NAME + "-hotel-details.json", "w") as f:
    json.dump(outputDict, f)

 

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: