I have managed to use Python Selenium to search broadband prices by postcode. I want to loop over multiple postcodes and record the results to pandas (which I should be capable of doing). The issue is that, no matter which way I slice it, I can't extract the information from the HTML. Is the JavaScript preventing me from collecting the information? I tried using the underlying API, but I was redirected and the response wasn't usable. I think I am close, but after several hours I still can't get the information I need (provider name, speed, price) from each result. I have tried both Selenium and Beautiful Soup.
`log_code.py
# Configure the root logger once: INFO+ goes to a dated log file,
# DEBUG+ goes to the console. Imported elsewhere via `from log_code import *`,
# so every module-level name here is part of the public interface.
import logging, datetime, os

working_dir = os.getcwd()
now = datetime.datetime.now()
niceDate = now.date()  # date-only stamp used in the log file name

# THIS CREATES A LOG OBJECT THAT WE CAN CALL LATER IN THE SCRIPT
log = logging.getLogger()
log.setLevel(logging.DEBUG)  # Lowest level we want to see in any output (console or log file)

# THIS CODE DEFINES THE OUTPUT FILE FOR THE LOG AND THE FORMAT OF EACH ENTRY
# BUG FIX: the original built the path with a hard-coded Windows "\\" separator;
# os.path.join is portable and produces the same path on Windows.
file = logging.FileHandler(os.path.join(working_dir, "Log_" + str(niceDate) + ".txt"))
fileformat = logging.Formatter('%(asctime)s: %(levelname)s - LINE %(lineno)d: - %(message)s', datefmt="%H:%M:%S")
file.setLevel(logging.INFO)  # Only INFO and above will be output to the log file, no DEBUG messages
file.setFormatter(fileformat)

# THIS CODE ADDS A STREAM TO THE LOG OUTPUT THAT PRINTS IT TO THE CONSOLE
stream = logging.StreamHandler()
streamformat = logging.Formatter('%(asctime)s: %(levelname)s: - %(message)s', datefmt="%H:%M:%S")
stream.setLevel(logging.DEBUG)
stream.setFormatter(streamformat)

log.addHandler(file)
log.addHandler(stream)
# Visual separator so each run is easy to spot in the accumulated log file.
log.info(
    "\n[====][====][====][====][====][====][====][====][====][====][====][====][====][====][====][====][====][====]\n")
`main.py
from log_code import *
#Selenium imports here
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
# import Action chains
from selenium.webdriver.common.action_chains import ActionChains
#Other imports here
import os, re
import wget
#import mechanicalsoup
from bs4 import BeautifulSoup as bs
import time, sys
# --- One-time browser setup ------------------------------------------------
startTime = time.time()  # wall-clock start; used for the timing summary at the end

# CHROME - Load page without images to make it run faster
# CHROME - disk caching
chromeOptions = webdriver.ChromeOptions()
# prefs = {'profile.managed_default_content_settings.images':2}
# 'images': 2 blocks image downloads; 'disk-cache-size' caps the cache size
prefs = {'profile.managed_default_content_settings.images':2, 'disk-cache-size': 4096}
chromeOptions.add_experimental_option("prefs", prefs)
# url = "https://www.thinkbroadband.com/packages"
url = "https://www.moneysupermarket.com/broadband/check-my-area/"
# NOTE(review): the positional executable path and the `chrome_options=` keyword
# are Selenium 3 style; Selenium 4 expects Service(...) and `options=` —
# confirm which selenium version is installed.
driver = webdriver.Chrome('O:/Rackspace/DP/All_Share/Python/Selenium Webdrivers/chromedriver.exe',chrome_options=chromeOptions)
# driver = webdriver.Edge('O:/Rackspace/DP/All_Share/Python/Selenium Webdrivers/msedgedriver.exe')
try:
    driver.get(url)
except Exception as e:
    # Log and fall through; the scraping function will fail loudly later if
    # the browser never reached the site.
    log.critical("UNABLE TO CONNECT TO " + url)
    log.critical(str(e))
def scrape_data_MONEYSUPERMARKET(pc, url):
    """Search MoneySuperMarket broadband deals for postcode ``pc`` and log
    each visible deal's provider, speed and price.

    Uses the module-level ``driver`` and ``log``. Exits the whole script if
    the results page never loads (the address dropdown check times out).
    """
    driver.get(url)
    postcode_input_box = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "page_lead_input")))
    search_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//*[@id='page_lead_form']/div/button")))
    postcode_input_box.clear()
    postcode_input_box.send_keys(pc)
    # Click the search button
    search_button.click()

    # Wait for search results. The address dropdown and the "Your postcode"
    # box are only rendered by the page's JavaScript once results exist, so
    # their presence is the "results are ready" signal.
    check1 = False
    check2 = False
    timeout = 10
    try:
        element_present = EC.presence_of_element_located(
            (By.XPATH, '//*[@id="alad_form"]/alad-address-dropdown/div/select'))
        WebDriverWait(driver, timeout).until(element_present)
        log.info(f"{pc}: Address selection dropdown has loaded")
        check1 = True
    except TimeoutException:
        log.critical(f"{pc}: Timed out waiting for page to load")
        driver.quit()
        sys.exit("EXITING SCRIPT AFTER PAGE FAILED TO LOAD.")
    try:
        element_present = EC.presence_of_element_located(
            (By.XPATH, '//*[@id="current-provider-filter__location-text"]'))
        WebDriverWait(driver, timeout).until(element_present)
        log.info(f"{pc}: 'Your postcode' box has loaded")
        check2 = True
    except TimeoutException:
        log.critical(f"{pc}: Timed out waiting for 'Your postcode' box to load")

    # SCRAPING
    if check1 and check2:
        log.info(f"{pc}: Both checks passed. Continue to scraping.")
        time.sleep(1)  # Sleep to make sure search results can be interrogated
        # BUG FIX: the original collected elements with class "ng-hide". That
        # is AngularJS's *hidden-element* marker (display:none), so the loop
        # iterated invisible template nodes whose text is empty — which is why
        # no data could be extracted. Select the rendered result cards and
        # keep only the displayed ones instead. (Card class "results-turbo"
        # taken from the commented-out bs4 line — TODO confirm in devtools.)
        all_results_boxes = [box for box in driver.find_elements(By.CLASS_NAME, "results-turbo")
                             if box.is_displayed()]
        log.info(f"{pc}: {len(all_results_boxes)} visible result boxes found")
        # Search each result box for provider, speed, price. There should be
        # only one of each within each result.
        for each_result in all_results_boxes:
            try:
                provider_img = each_result.find_element(By.CLASS_NAME, "results-turbo-provider__logo")
                provider = provider_img.accessible_name
                # By.CLASS_NAME cannot take a compound class string, so use
                # CSS selectors for speed and price.
                speed = each_result.find_element(
                    By.CSS_SELECTOR, "span[data-dtl-id='speed-measure']").text
                cost = each_result.find_element(
                    By.CSS_SELECTOR, ".turbo-info-list__value--main-price").text
            except Exception as e:
                # Was a bare `except:` that quit the driver and killed the
                # whole run on the first bad card; skip the card instead.
                log.critical(f"{pc}: ERROR EXTRACTING RESULT FIELDS: {e}")
                continue
            log.info(provider + "---" + speed + "---" + cost)
    else:
        log.info(f"{pc}: Scraping aborted. CHECK1: {check1} - CHECK2: {check2}")
# --- Main run ----------------------------------------------------------------
postcodes = ['ha11pt','E111LE','dy102rr','hd11aa']

# Dismiss the cookie-consent banner once, before the search loop.
if url == "https://www.moneysupermarket.com/broadband/check-my-area/":
    try:
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
            (By.ID, "banner-accept"))).click()
    except Exception:
        # Was a bare `except:` (which also swallows SystemExit/KeyboardInterrupt);
        # a missing banner is fine — just note it and continue.
        log.info("CONSENT POP-UPS NOT FOUND - CONTINUING")

log.info(f"URL: {url}")
for pc in postcodes:
    scrape_data_MONEYSUPERMARKET(pc, url)
driver.quit()

# Timing summary.
executionTime = (time.time() - startTime)
log.info("\n\t" + url + "\n")
# Guard the average against an empty postcode list (ZeroDivisionError).
av = executionTime / len(postcodes) if postcodes else 0.0
log.info(f"{str(len(postcodes))} postcodes searched. (Average of {av:.1f}s per postcode)")
log.info(f'TOTAL execution time: {time.strftime("%H:%M:%S",time.gmtime(executionTime))}.')