0
votes

Data are overwritten and they give the data of only last page how to solve these problem is any solution for these kindly recommend me I've seen several solutions to scrape multiple pages from a website, but couldn't make it work on my code

    import scrapy
    from scrapy import FormRequest
    from scrapy.crawler import CrawlerProcess
    from scrapy.http import Request
    
    class TestSpider(scrapy.Spider):
        name = 'test'
        url = 'https://advpalata.vrn.ru/registers/reestr_lawyers/'
        
        for x in range(0,5):
            payload='p='+str(x)+'&letterfilter=%D0%90'
            
            headers = {
            'authority': 'advpalata.vrn.ru',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-language': 'en-US,en;q=0.9',
            'cache-control': 'max-age=0',
            'content-type': 'application/x-www-form-urlencoded',
            'cookie': 'PHPSESSID=546743b283bb9e3b2e78dabbfb894220; ie=yes; stat_id=546743b283bb9e3b2e78dabbfb894220; _ym_uid=1658939896610936176; _ym_d=1658939896; _ym_isad=2; PHPSESSID=546743b283bb9e3b2e78dabbfb894220',
            'origin': 'https://advpalata.vrn.ru',
            'referer': 'https://advpalata.vrn.ru/registers/reestr_lawyers/',
            'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
            }
    
    
            def start_requests(self):
                yield scrapy.FormRequest(
                    url=self.url,
                    method='POST',
                    body=self.payload,
                    headers=self.headers,
                    callback=self.parse_item,
                    )
    
            def parse_item(self, response):
            
                books = response.xpath("//td[@class='name']//a//@href").extract()
                for book in books:
                    absolute_url = response.urljoin(book)
                    yield Request(absolute_url, callback=self.parse_book)
                    
            def parse_book(self, response):
                title=response.css("h3::text").get()
                # phone = response.xpath("//div[@class='advocate-right']//p['@umi:field-name=phone']//text()").get()
                # email = response.xpath("//div[@class='advocate-right']//p['@umi:field-name=email']//a//text()").get()
    
            
                yield{
                    'title':title,
                    # 'phone':phone,
                    # 'email':email
                }