This post continues from the previous one. Assuming you already have the list of hotels with their URLs from Booking.com, you can now extract the address of each hotel. The address can then be converted to latitude and longitude, since the geo information that can be crawled from Booking.com is not quite accurate. The script below extracts the address and the hotel star rating from Booking.com using Python and Selenium. The address is then converted to latitude and longitude. Finally, all the information is saved to a JSON file.
If you have any questions regarding the script, don’t hesitate to leave a comment.
Go crazy with it… 😉
#! /usr/bin/python
# -*- coding: utf-8 -*-
"""Crawl hotel detail pages and geocode their addresses.

Reads the hotel URL/rating mapping produced by the previous crawl step
("crawlbooking-<CITY>-hotel-urls-ratings.json"), opens every hotel page in
Firefox through Selenium, extracts name, address, star rating and the raw
"data-bbox" attribute, converts the address to latitude/longitude through the
Google geocoding web service and writes everything to
"crawlbooking-<CITY>-hotel-details.json".

The code is written so that it runs unchanged on Python 2 and Python 3.
"""

__author__ = "selfconstruct3d"
__date__ = "$Jun 18, 2016 01:16:36 AM$"

import json
import re
import uuid
from contextlib import closing

try:  # Python 3
    from urllib.parse import quote_plus
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import quote_plus, urlopen

CITY_NAME = "Vienna"
COUNTRY_NAME = "Austria"
INPUT_FILE = "crawlbooking-" + CITY_NAME + "-hotel-urls-ratings.json"
OUTPUT_FILE = "crawlbooking-" + CITY_NAME + "-hotel-details.json"

# NOTE(review): current versions of the Google Geocoding API appear to require
# HTTPS and an API key -- confirm against the official documentation before
# relying on this endpoint.
GEOCODE_URL = "http://maps.googleapis.com/maps/api/geocode/json"

_DIGITS = re.compile(r"\d+")


def my_random_string(string_length=10):
    """Return a random uppercase hexadecimal string.

    The string is taken from the start of a fresh UUID4, so at most 32
    characters are available; ``string_length`` values above 32 yield a
    32-character string.
    """
    # ``.hex`` is the UUID without dashes; upper-casing it matches the
    # historical format of the generated hotel IDs.
    return uuid.uuid4().hex.upper()[0:string_length]


def trim_address(raw_text, country=COUNTRY_NAME):
    """Cut ``raw_text`` right after the first occurrence of ``country``.

    The address paragraph on the hotel page carries extra text after the
    country name. If the country name does not occur at all, the text is
    returned unchanged instead of being truncated to a meaningless prefix.
    """
    end = raw_text.find(country)
    if end == -1:
        return raw_text
    return raw_text[:end + len(country)]


def parse_stars(text):
    """Return the first integer found in ``text``, or 0 if there is none."""
    match = _DIGITS.search(text or "")
    return int(match.group()) if match else 0


def build_geocode_url(address):
    """Return the geocoding request URL for ``address``.

    The address is UTF-8 encoded and percent-escaped, and the query
    parameters are separated by ``&`` so that ``sensor=false`` is not glued
    onto the address.
    """
    if not isinstance(address, bytes):
        # quote_plus on Python 2 cannot handle non-ASCII unicode objects.
        address = address.encode("utf-8")
    return GEOCODE_URL + "?address=" + quote_plus(address) + "&sensor=false"


def geocode(address):
    """Look up ``address`` and return a ``(lat, lng)`` tuple.

    Either value is ``None`` when the response does not contain it. Network
    errors raised by ``urlopen`` propagate to the caller.
    """
    with closing(urlopen(build_geocode_url(address))) as response:
        geodata = json.loads(response.read().decode("utf-8"))

    lat = None
    lng = None
    try:
        location = geodata["results"][0]["geometry"]["location"]
        lat = location["lat"]
        lng = location["lng"]
    except (KeyError, IndexError, TypeError) as e:
        # No usable result for this address; keep the hotel without coords.
        print(e)
    return lat, lng


def scrape_hotel(driver, url, rating):
    """Open one hotel page and return its detail record as a dict.

    ``driver`` is the Selenium WebDriver to use, ``url`` the hotel page URL
    and ``rating`` the rating value carried over from the input file. Any
    exception raised while loading the page, locating elements or geocoding
    propagates to the caller.
    """
    # switch to english
    url = url.replace(".de.html", ".en.html")
    driver.get(url)

    name = driver.find_element_by_css_selector("span#hp_hotel_name.fn").text
    address_box = driver.find_element_by_css_selector(
        "p#showMap2.address.address_clean")
    stars_elem = driver.find_element_by_css_selector(
        "span.hp__hotel_ratings__stars")
    stars = parse_stars(stars_elem.text)

    # Keep the data-bbox value of the last span that carries one.
    raw_coords = None
    for span in address_box.find_elements_by_tag_name("span"):
        bbox = span.get_attribute("data-bbox")
        if bbox is not None:
            raw_coords = bbox

    address = trim_address(address_box.text)
    lat, lng = geocode(address)

    return {
        "name": name,
        "url": url,
        "addr": address,
        "lat": lat,
        "lng": lng,
        "cords": raw_coords,
        "stars": stars,
        "rating": rating,
    }


def print_record(record):
    """Print one hotel record to stdout for progress monitoring."""
    print(record["name"])
    print(record["url"])
    print(record["addr"])
    print(record["lat"])
    print(record["lng"])
    print(record["cords"])
    print("stars: %s" % (record["stars"],))
    print("rating %s" % (record["rating"],))
    print("---")


def main():
    """Crawl every hotel listed in INPUT_FILE and write OUTPUT_FILE."""
    # Imported here rather than at module level so that the pure helper
    # functions above can be imported without Selenium being installed.
    from selenium import webdriver

    with open(INPUT_FILE) as data_file:
        hotel_urls = json.load(data_file)

    output = {}
    driver = webdriver.Firefox()
    try:
        for counter, (url, info) in enumerate(hotel_urls.items(), 1):
            # just to follow the progress
            print(counter)
            rating = info["rating"]
            try:
                record = scrape_hotel(driver, url, rating)
                # Store only complete records, keyed by a random hotel ID, so
                # that a failed page does not leave an empty entry behind.
                output[my_random_string(10)] = record
                print_record(record)
            except Exception as e:
                # Best-effort crawl: report the failure and go on with the
                # next hotel instead of aborting the whole run.
                print(e)
    finally:
        # Always close the browser, even if the crawl is interrupted.
        driver.quit()

    # save to file
    with open(OUTPUT_FILE, "w") as f:
        json.dump(output, f)


if __name__ == "__main__":
    main()