
I have managed to use Python Selenium to search broadband prices by postcode. I want to loop over multiple postcodes and record the results to pandas (which I should be capable of doing). The issue is that, no matter which way I slice it, I can't extract the information from the HTML. Is the JavaScript preventing me from collecting the information? I tried using the underlying API, but I was redirected and the response wasn't usable. I think I'm close, but after several hours I still can't get the information I need (provider name, speed, price) from each result. I have tried both Selenium and BeautifulSoup.
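For reference, this is roughly the loop I'm aiming for once the extraction works (a minimal sketch: `scrape_postcode` is a hypothetical helper standing in for the working scraper, returning one dict per result):

import pandas as pd

rows = []
for pc in ['ha11pt', 'E111LE', 'dy102rr', 'hd11aa']:
    # scrape_postcode is a placeholder; each dict would hold one result,
    # e.g. {'postcode': pc, 'provider': ..., 'speed': ..., 'price': ...}
    rows.extend(scrape_postcode(pc))

df = pd.DataFrame(rows, columns=['postcode', 'provider', 'speed', 'price'])
df.to_csv('broadband_prices.csv', index=False)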

`log_code.py`

import logging, datetime, os
working_dir = os.getcwd()

now = datetime.datetime.now()
niceDate = now.date()

# THIS CREATES A LOG OBJECT THAT WE CAN CALL LATER IN THE SCRIPT
log = logging.getLogger()
log.setLevel(logging.DEBUG)  # This is the lowest level we want to see in any of our outputs (console or log file)

# THIS CODE DEFINES THE OUTPUT FILE FOR THE LOG AND THE FORMAT OF EACH ENTRY
file = logging.FileHandler(os.path.join(working_dir, "Log_" + str(niceDate) + ".txt"))
fileformat = logging.Formatter('%(asctime)s: %(levelname)s - LINE %(lineno)d:  - %(message)s', datefmt="%H:%M:%S")
file.setLevel(logging.INFO)  # Only INFO and above will be output to the log file, no DEBUG messages
file.setFormatter(fileformat)

# THIS CODE ADDS A STREAM TO THE LOG OUTPUT THAT PRINTS IT TO THE CONSOLE
stream = logging.StreamHandler()
streamformat = logging.Formatter('%(asctime)s: %(levelname)s:  - %(message)s', datefmt="%H:%M:%S")
stream.setLevel(logging.DEBUG)
stream.setFormatter(streamformat)

log.addHandler(file)
log.addHandler(stream)

log.info(
    "\n[====][====][====][====][====][====][====][====][====][====][====][====][====][====][====][====][====][====]\n")


`main.py`

from log_code import *

# Selenium imports here
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Other imports here
from bs4 import BeautifulSoup as bs

import time, sys
startTime = time.time()

# CHROME - load pages without images to make them run faster, and set a disk cache size
# (note: cache size is normally set via Chrome's --disk-cache-size command-line switch,
# so the 'disk-cache-size' pref here may have no effect)
chromeOptions = webdriver.ChromeOptions()
prefs = {'profile.managed_default_content_settings.images': 2, 'disk-cache-size': 4096}
chromeOptions.add_experimental_option("prefs", prefs)

# url = "https://www.thinkbroadband.com/packages"
url = "https://www.moneysupermarket.com/broadband/check-my-area/"

driver = webdriver.Chrome('O:/Rackspace/DP/All_Share/Python/Selenium Webdrivers/chromedriver.exe', options=chromeOptions)
# driver = webdriver.Edge('O:/Rackspace/DP/All_Share/Python/Selenium Webdrivers/msedgedriver.exe')


try:
    driver.get(url)
except Exception as e:
    log.critical("UNABLE TO CONNECT TO " + url)
    log.critical(str(e))


def scrape_data_MONEYSUPERMARKET(pc,url):
    driver.get(url)
    postcode_input_box = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "page_lead_input")))
    search_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//*[@id='page_lead_form']/div/button")))

    postcode_input_box.clear()
    postcode_input_box.send_keys(pc)
    # Click the search button
    # search_button.send_keys(Keys.ENTER)
    search_button.click()

    # Wait for the search results.
    # An address dropdown appears on the page so results can be filtered to a single household;
    # once it is present, the results have been returned.
    check1 = False
    check2 = False
    timeout = 10
    try:
        element_present = EC.presence_of_element_located((By.XPATH, '//*[@id="alad_form"]/alad-address-dropdown/div/select'))
        WebDriverWait(driver, timeout).until(element_present)
        log.info(f"{pc}: Address selection dropdown has loaded")
        check1 = True
    except TimeoutException:
        log.critical(f"{pc}: Timed out waiting for page to load")
        driver.quit()
        sys.exit("EXITING SCRIPT AFTER PAGE FAILED TO LOAD.")

    try:
        element_present = EC.presence_of_element_located((By.XPATH, '//*[@id="current-provider-filter__location-text"]'))
        WebDriverWait(driver, timeout).until(element_present)
        log.info(f"{pc}: 'Your postcode' box has loaded")
        check2 = True
    except TimeoutException:
        log.critical(f"{pc}: Timed out waiting for 'Your postcode' box to load")


    # SCRAPING

    if check1 and check2:
        log.info(f"{pc}: Both checks passed. Continue to scraping.")
        time.sleep(1) # Sleep to make sure search results can be interrogated

        # Use BeautifulSoup (bs) to search the HTML, as it is sometimes faster than Selenium.
        # Parse the page source into bs_obj, using lxml as the parser.
        bs_obj = bs(driver.page_source, 'lxml')
        # print(bs_obj)  # debug: dump the parsed page for inspection

        #all_results_boxes = bs_obj.find_all('div', class_="results-turbo")
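        # If the bs route works, the extraction might look something like this
        # (a sketch only -- the class and data-dtl-id selectors are guessed from
        # the commented-out attempts in this script and may not match the live DOM):
        # for box in bs_obj.find_all('div', class_='results-turbo'):
        #     speed = box.find('span', {'data-dtl-id': 'speed-measure'})
        #     price = box.find('span', class_='turbo-info-list__value--main-price')
        #     print(speed.get_text() if speed else None,
        #           price.get_text() if price else None)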

        # NOTE: 'ng-hide' is the AngularJS class applied to hidden elements,
        # so this may be matching hidden nodes rather than the visible result boxes
        all_results_boxes = driver.find_elements(By.CLASS_NAME, "ng-hide")
        print(len(all_results_boxes))
        # Search each result box for provider, speed, price etc. There should only be 1 of each within each result.
        
        for each_result in all_results_boxes:

            try:
                provider_img = each_result.find_element(By.CLASS_NAME, "results-turbo-provider__logo")
                provider = provider_img.accessible_name
            except NoSuchElementException:
                log.critical("ERROR GETTING PROVIDER NAME")
                driver.quit()
                sys.exit()
                
            #speed = each_result.find('span',{'data-dtl-id': 'speed-measure'}).get_text()
            #cost  = each_result.find(By.CLASS_NAME,"turbo-info-list__value turbo-info-list__value--main-price" ).get_text()

            #log.info(provider_name + "---" + speed + "---" + cost)

            print(provider)
    else:
        log.info(f"{pc}: Scraping aborted. CHECK1: {check1} - CHECK2: {check2}")


    print("x")



postcodes = ['ha11pt','E111LE','dy102rr','hd11aa']


if url == "https://www.moneysupermarket.com/broadband/check-my-area/":

    try:
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "banner-accept"))).click()
    except TimeoutException:
        log.info("CONSENT POP-UPS NOT FOUND - CONTINUING")

    log.info(f"URL: {url}")

    for pc in postcodes:
        scrape_data_MONEYSUPERMARKET(pc,url)


driver.quit()



executionTime = time.time() - startTime
log.info("\n\t" + url + "\n")
av = executionTime / len(postcodes)
log.info(f"{len(postcodes)} postcodes searched. (Average of {av:.1f}s per postcode)")
log.info(f'TOTAL execution time: {time.strftime("%H:%M:%S",time.gmtime(executionTime))}.')