Trying to scrape the internet archive website (Wayback Machine): https://web.archive.org/web/20150906222155mp_/https://www.zalando.co.uk/womens-clothing/.
I am succesful in scraping the 1st page content, but can't move to the next page. I have tried multiple xpath
to move to next pages:
# 1
next_page_url = response.xpath("//li[a[contains(.,'>')]]//@href").extract_first() # does not work
# 2
next_page_url = response.xpath(//a[@class='catalogPagination_page' and text() ='>'])[1]//@href).get() # does not work
I have tried converting to absolute url
(and without) but again with no luck.
Can anyone help with new xpath
or css selectors
that I can finally scrape
the next pages?
Below you can see my full code:
# -*- coding: utf-8 -*-
import scrapy
class ZalandoWomenSpider(scrapy.Spider):
name = 'zalando_women_historic_2015'
allowed_domains = ['www.web.archive.org']
start_urls = ['https://web.archive.org/web/20150906222155mp_/https://www.zalando.co.uk/womens-clothing/']
def parse(self, response):
products = response.xpath("//a[@class='catalogArticlesList_productBox']")
for product in products:
link = product.xpath(".//@href").get()
absolute_url = f"https://web.archive.org{link}"
yield scrapy.Request(url=absolute_url,callback=self.parse_product,dont_filter=True,meta={'link':link})
# process next page
next_page_url = response.xpath("//li[a[contains(.,'>')]]//@href").extract_first() #(//a[@class='catalogPagination_page' and text() ='>'])[1]//@href
absolute_next_page_url = f"https://web.archive.org{next_page_url}"
#absolute_next_page_url = next_page_url
#absolute_next_page_url = response.urljoin(next_page_url)
if next_page_url:
yield scrapy.Request(url=absolute_next_page_url,callback=self.parse)
def parse_product(self, response):
link = response.request.meta['link']
brand = response.xpath("//span[@itemprop='brand']/text()").get()
price = response.xpath("//span[@class='price oldPrice nowrap']/text()").get()
price1 = response.xpath("//span[@itemprop='price']/text()").get()
price2 = response.xpath("//div[@class='boxPrice']//span[contains(@class,'price')]/text()").get()
disc_price = response.xpath("//span[@class='price specialPrice nowrap']/text()").get()
product_type = response.xpath("//span[@itemprop='name']/text()").get()
material = response.xpath("//div[@class='content']//li[contains(.,'material')]/text()").get()
yield {
'brand_name': brand,
'product_price':price,
'product_price1':price1,
'product_price2':price2,
'product_price_b4_disc':disc_price,
'link':link,
'product_type':product_type,
'material':material}