
I'm quite new to Python and have written a script using Selenium to scrape a website. I've tried everything but can't get the loop to cycle through pages: it currently just scrapes the first page five times and repeats that data. I want to scrape all of the result pages for 'BR1'. Any help would be great.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

with open('rightmove.csv', 'w') as file:
    file.write('PropertyCardcontent \n')

PATH = ("/usr/local/bin/chromedriver")
driver = webdriver.Chrome(PATH)

driver.get("https://www.rightmove.co.uk/house-prices.html")
print(driver.title)

elem = driver.find_element(By.NAME, 'searchLocation')  # Find the search box
elem.send_keys('BR1' + Keys.RETURN)

try:
    content = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID,'content'))
            )

finally:
    time.sleep(3)

for p in range(5):
    sold = content.find_elements(By.CLASS_NAME, 'sold-prices-content-wrapper ')
    for solds in sold:
        address = solds.find_elements(By.CLASS_NAME, 'sold-prices-content ')
        for addresses in address:
            result = addresses.find_elements(By.CLASS_NAME, 'results ')
            for results in result:
                card = results.find_elements(By.CLASS_NAME,'propertyCard')
                for propertyCard in card:
                    header = propertyCard.find_elements(By.CLASS_NAME,'propertyCard-content')
                    for propertyCardcontent in header:
                        road = propertyCardcontent.find_elements(By.CLASS_NAME,'title')
                    for propertyCardcontent in header:
                        road = propertyCardcontent.find_elements(By.CLASS_NAME,'subTitle')
                        for subtitle in road:
                            bed = subtitle.find_elements(By.CLASS_NAME, 'propertyType')
    with open('rightmove.csv', 'a') as file:
        for i in range(len(result)):
            file.write(header[i].text + '\n')
        
        button = driver.find_element(By.XPATH, '//*[@id="content"]/div[2]/div[2]/div[4]/div[27]/div[3]/div')
        button.click()
    file.close()

time.sleep(3)
driver.quit()

2 Answers


You don't have to walk down the DOM element by element; you can just use an XPath or a class name (if it's unique, otherwise an XPath or CSS selector is better) and grab the item you are looking for directly. Anyway, try something like this:

import time
import selenium.webdriver as webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome("/usr/local/bin/chromedriver")
driver.get("https://www.rightmove.co.uk/house-prices.html")

# send query
query = "BR1"
search_bar = driver.find_element(By.XPATH, '//input[@class="searchBox ac_input"]')
search_bar.send_keys(query)
search_bar.send_keys(Keys.ENTER)

# wait for the results to load
WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'propertyCard'))
    )

# get the number of result pages
pages = driver.find_element(By.XPATH, '//span[@class="pagination-label"][2]').text
pages = int(pages.replace('of ', ''))
data = []
i = 1

while i <= pages:
    # wait for the property cards on the current page to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//div//div[@class="propertyCard"]'))
    )
    # collect the data from every property card on this page
    propertyCards = driver.find_elements(By.XPATH, '//div//div[@class="propertyCard"]')
    for propertyCard in propertyCards:
        title = propertyCard.find_element(By.CLASS_NAME, 'title').text
        propertyType = propertyCard.find_element(By.CLASS_NAME, 'propertyType').text
        data.append((title, propertyType))
    # move to the next page, unless we are already on the last one
    if i < pages:
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//div[contains(text(), "Next")]'))
        ).click()
    time.sleep(2)
    i += 1
    
print("you reach the last page")

#get number of results
driver.close()

I use a list of tuples because in your example you only want to store two items per property; if you want to store more data, you can use a dict and then write it to CSV directly with csv.DictWriter. Enjoy.
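For instance, a minimal sketch of writing the collected data out that way; the 'rightmove.csv' filename and the column names are just placeholders taken from your script:

import csv

# rough sketch: turn each scraped (title, propertyType) tuple into a dict
# and write it out with DictWriter
fieldnames = ['title', 'propertyType']
with open('rightmove.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for title, propertyType in data:
        writer.writerow({'title': title, 'propertyType': propertyType})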


Since the website exposes the page number in the URL, I recommend using "https://www.rightmove.co.uk/house-prices/br1.html?page=1" as the base URL and looping through the pages while changing the page index at the end of the URL, for example with an f-string (see the sketch below).

One other thing: you don't need all those nested for loops. You can simply assign each variable its value directly, since everything you need sits inside one HTML block that is easy to navigate.
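A minimal sketch of that approach, reusing the selectors from the other answer; the hard-coded page count of 5 and the 'propertyCard', 'title' and 'propertyType' class names are assumptions you would need to check against the live page:

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome("/usr/local/bin/chromedriver")
data = []

for page in range(1, 6):  # assumed 5 result pages; adjust to the real count
    # build each page URL with an f-string instead of clicking "Next"
    driver.get(f"https://www.rightmove.co.uk/house-prices/br1.html?page={page}")
    time.sleep(2)  # crude wait; a WebDriverWait would be more robust
    for card in driver.find_elements(By.CLASS_NAME, 'propertyCard'):
        # read each value straight off the card, no nested loops needed
        title = card.find_element(By.CLASS_NAME, 'title').text
        propertyType = card.find_element(By.CLASS_NAME, 'propertyType').text
        data.append((title, propertyType))

driver.quit()
print(data)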