Scrapy CrawlSpider not joining

Question

I've been reading a lot here and on other websites about Scrapy, but I can't fix this problem, so I'm asking you :P I hope someone can help me.

I want to log in on the main client page, then crawl all the categories and all the products, and save each product's title, category, quantity and price.

My code:

# -*- coding: utf-8 -*-

import scrapy
from scrapy.item import Item, Field
from scrapy.spiders import CrawlSpider
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
import logging

class article(Item):
    """Scrapy item holding the four values scraped for one product."""
    # One Field per value; they are filled in by parse_items via XPath.
    category = Field()
    title = Field()
    quantity = Field()
    price = Field()

class combatzone_spider(CrawlSpider):
    """CrawlSpider for www.combatzone.es configured with three link rules.

    Only the attributes below belong to this class body.
    """
    name = 'combatzone_spider'
    allowed_domains = ['www.combatzone.es']
    start_urls = ['http://www.combatzone.es/areadeclientes/']

    rules = (
        # Category listing links: followed, no callback.
        # NOTE(review): "?" is not escaped in this pattern, so the regex
        # engine reads it as a quantifier (making the preceding "p"
        # optional) instead of a literal "?".
        Rule(LinkExtractor(allow=r'/category.php?id=\d+'),follow=True),
        # Pagination links ("&page=N"): followed, no callback.
        Rule(LinkExtractor(allow=r'&page=\d+'),follow=True),
        # Product links: followed and handed to the 'parse_items' callback.
        # NOTE(review): same unescaped "?" as in the category pattern.
        Rule(LinkExtractor(allow=r'goods.php?id=\d+'),follow=True,callback='parse_items'),
    )

def init_request(self):
    """Log a message and return a Request whose callback is ``self.login``.

    NOTE(review): this ``def`` starts at column 0, so it is a module-level
    function, not a method of the spider class defined before it.
    """
    logging.info("You are in initRequest")
    # NOTE(review): ``url=self`` passes the object itself where a URL
    # string would be expected.
    return Request(url=self,callback=self.login)

def login(self,response):
    """Log a message and return a form submission built from ``response``.

    The request targets the form named ``ECS_LOGINFORM``, fills in the
    ``username``/``password`` placeholders and uses
    ``self.check_login_response`` as its callback.
    """
    logging.info("You are in login")
    return scrapy.FormRequest.from_response(response,formname='ECS_LOGINFORM',formdata={'username':'XXXX','password':'YYYY'},callback=self.check_login_response)

def check_login_response(self,response):
    """Check the post-login page for the greeting text and log the outcome.

    Returns ``self.initialized()`` when the greeting is found; otherwise
    only logs a message and implicitly returns ``None``.
    """
    logging.info("You are in checkLogin")
    # The greeting literal is searched for in the raw response body.
    if "Hola，XXXX" in response.body:
        self.log("Succesfully logged in.")
        # NOTE(review): ``initialized`` is not defined anywhere in this
        # listing; presumably it is the InitSpider hook - confirm that the
        # spider's base class actually provides it.
        return self.initialized()
    else:
        self.log("Something wrong in login.")

def parse_items(self,response):
    """Build one ``article`` item from a product page and yield it.

    An ItemLoader bound to ``response`` collects the category, title,
    quantity and price through the XPath expressions below.
    """
    logging.info("You are in item")
    item = scrapy.loader.ItemLoader(article(),response)
    # The first two expressions are absolute paths from the document root;
    # the last two are anchored on element ids.
    item.add_xpath('category','/html/body/div[3]/div[2]/div[2]/a[2]/text()')
    item.add_xpath('title','/html/body/div[3]/div[2]/div[2]/div/div[2]/h1/text()')
    item.add_xpath('quantity','//*[@id="ECS_FORMBUY"]/div[1]/ul/li[2]/font/text()')
    item.add_xpath('price','//*[@id="ECS_RANKPRICE_2"]/text()')
    yield item.load_item()

When I run `scrapy crawl combatzone_spider` in the terminal, I get this:

SCRAPY) pi@raspberry:~/SCRAPY/combatzone/combatzone/spiders $ scrapy crawl combatzone_spider /home/pi/SCRAPY/combatzone/combatzone/spiders/combatzone_spider.py:9: ScrapyDeprecationWarning: Module scrapy.contrib.spiders is deprecated, use scrapy.spiders instead from scrapy.contrib.spiders.init import InitSpider /home/pi/SCRAPY/combatzone/combatzone/spiders/combatzone_spider.py:9: ScrapyDeprecationWarning: Module scrapy.contrib.spiders.init is deprecated, use scrapy.spiders.init instead from scrapy.contrib.spiders.init import InitSpider 2018-07-24 22:14:53 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: combatzone) 2018-07-24 22:14:53 [scrapy.utils.log] INFO: Versions: lxml 4.2.3.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.0, w3lib 1.19.0, Twisted 18.7.0, Python 2.7.13 (default, Nov 24 2017, 17:33:09) - [GCC 6.3.0 20170516], pyOpenSSL 18.0.0 (OpenSSL 1.1.0h 27 Mar 2018), cryptography 2.3, Platform Linux-4.9.0-6-686-i686-with-debian-9.5 2018-07-24 22:14:53 [scrapy.crawler] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'combatzone.spiders', 'SPIDER_MODULES': ['combatzone.spiders'], 'LOG_LEVEL': 'INFO', 'BOT_NAME': 'combatzone'} 2018-07-24 22:14:53 [scrapy.middleware] INFO: Enabled extensions: ['scrapy.extensions.memusage.MemoryUsage', 'scrapy.extensions.logstats.LogStats', 'scrapy.extensions.telnet.TelnetConsole', 'scrapy.extensions.corestats.CoreStats'] 2018-07-24 22:14:53 [scrapy.middleware] INFO: Enabled downloader middlewares: ['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware', 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware', 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware', 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware', 'scrapy.downloadermiddlewares.retry.RetryMiddleware', 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware', 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware', 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware', 
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware', 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware', 'scrapy.downloadermiddlewares.stats.DownloaderStats'] 2018-07-24 22:14:53 [scrapy.middleware] INFO: Enabled spider middlewares: ['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware', 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware', 'scrapy.spidermiddlewares.referer.RefererMiddleware', 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware', 'scrapy.spidermiddlewares.depth.DepthMiddleware'] 2018-07-24 22:14:53 [scrapy.middleware] INFO: Enabled item pipelines: [] 2018-07-24 22:14:53 [scrapy.core.engine] INFO: Spider opened 2018-07-24 22:14:53 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 2018-07-24 22:14:54 [scrapy.core.engine] INFO: Closing spider (finished) 2018-07-24 22:14:54 [scrapy.statscollectors] INFO: Dumping Scrapy stats: {'downloader/request_bytes': 231, 'downloader/request_count': 1, 'downloader/request_method_count/GET': 1, 'downloader/response_bytes': 7152, 'downloader/response_count': 1, 'downloader/response_status_count/200': 1, 'finish_reason': 'finished', 'finish_time': datetime.datetime(2018, 7, 24, 21, 14, 54, 410938), 'log_count/INFO': 7, 'memusage/max': 36139008, 'memusage/startup': 36139008, 'response_received_count': 1, 'scheduler/dequeued': 1, 'scheduler/dequeued/memory': 1, 'scheduler/enqueued': 1, 'scheduler/enqueued/memory': 1, 'start_time': datetime.datetime(2018, 7, 24, 21, 14, 53, 998619)} 2018-07-24 22:14:54 [scrapy.core.engine] INFO: Spider closed (finished)

The spider doesn't seem to be working. Any idea why that could be? Thank you very much, mates :D

If I try some response XPath extractions in the Scrapy shell they work fine, but with this CrawlSpider they don't — Pepe Sospechas

Vic Vic · Accepted Answer · 2018-07-25T02:55:50

There are 2 problems:

The first is the regular expression: you should escape the "?". For example, /category.php?id=\d+ should be changed to /category.php\?id=\d+ (notice the "\?").
The second is that you should indent all the methods; otherwise they are not part of class combatzone_spider and cannot be found.

As for the login, I tried to make your code work but I failed. I usually override start_requests to log in before crawling.

Here is the code:

# -*- coding: utf-8 -*-

import scrapy
from scrapy.item import Item, Field
from scrapy.spiders import CrawlSpider
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
import logging

class article(Item):
    """Scrapy item holding the four values scraped for one product."""
    # One Field per value a product page is expected to provide.
    category = Field()
    title = Field()
    quantity = Field()
    price = Field()

class CombatZoneSpider(CrawlSpider):
    """Crawl category, pagination and product links on www.combatzone.es.

    ``start_requests`` first fetches the user page; its callback ``login``
    then schedules every URL in ``start_urls``. Those requests carry no
    explicit callback, so the spider's default callback handles them and
    applies ``rules``.
    """

    name = 'CombatZoneSpider'
    allowed_domains = ['www.combatzone.es']
    start_urls = ['http://www.combatzone.es/areadeclientes/']

    rules = (
        # "?" and "." are regex metacharacters: escape both so they only
        # match the literal characters of the URL.
        Rule(LinkExtractor(allow=r'category\.php\?id=\d+'),follow=False),
        Rule(LinkExtractor(allow=r'&page=\d+'),follow=False),
        Rule(LinkExtractor(allow=r'goods\.php\?id=\d+'),follow=False,callback='parse_items'),
    )

    def parse_items(self,response):
        """Log the product title found on a product page.

        Item building is left commented out below; for now the callback only
        prints what the title XPath extracts.
        """
        logging.info("You are in item")

        # This is used to print the results
        selector = scrapy.Selector(response=response)
        res = selector.xpath("/html/body/div[3]/div[2]/div[2]/div/div[2]/h1/text()").extract()
        self.logger.info(res)

        # item = scrapy.loader.ItemLoader(article(),response)
        # item.add_xpath('category','/html/body/div[3]/div[2]/div[2]/a[2]/text()')
        # item.add_xpath('title','/html/body/div[3]/div[2]/div[2]/div/div[2]/h1/text()')
        # item.add_xpath('quantity','//*[@id="ECS_FORMBUY"]/div[1]/ul/li[2]/font/text()')
        # item.add_xpath('price','//*[@id="ECS_RANKPRICE_2"]/text()')
        # yield item.load_item()

    # login part
    # I didn't test if it can login because I have no accounts, but they will print something in console.

    def start_requests(self):
        """Return the single initial request: the user page, handled by ``login``."""
        logging.info("You are in initRequest")
        return [scrapy.Request(url="http://www.combatzone.es/areadeclientes/user.php",callback=self.login)]

    def login(self,response):
        """Schedule the regular crawl once the user page has been fetched.

        The actual form submission is left commented out below; this callback
        currently only re-issues ``start_urls``.
        """
        logging.info("You are in login")

        # generate the start_urls again:
        for url in self.start_urls:
            # Spider.make_requests_from_url is deprecated (since Scrapy 1.4);
            # build the same request directly. dont_filter=True keeps the
            # duplicate filter from dropping these start URLs.
            yield scrapy.Request(url, dont_filter=True)

        # yield scrapy.FormRequest.from_response(response,formname='ECS_LOGINFORM',formdata={'username':'XXXX','password':'YYYY'},callback=self.check_login_response)

    # def check_login_response(self,response):
    #     logging.info("You are in checkLogin")
    #     if "Hola，XXXX" in response.body:
    #         self.log("Succesfully logged in.")
    #         return self.initialized()
    #     else:
    #         self.log("Something wrong in login.")

Scrapy CrawlSpider not joining

1 Answer