0
votes

I use Django, Celery, and Scrapy.

My settings for celery:

# Broker connection and worker options.
CELERY_BROKER_URL = 'amqp://****/myvhost'
CELERY_TIMEZONE = TIME_ZONE
CELERYD_CONCURRENCY = 1000
CELERYD_MAX_TASKS_PER_CHILD = 4
CELERY_IGNORE_RESULT = True

# django celery
CELERY_RESULT_BACKEND = 'django-db'

# celery queues setup
# Each parse task gets a dedicated queue whose exchange and routing key
# share the queue's name; everything else falls back to 'default'.

CELERY_DEFAULT_QUEUE = 'default'
CELERY_DEFAULT_ROUTING_KEY = 'default'
CELERY_QUEUES = tuple(
    Queue(queue_name, Exchange(queue_name), routing_key=queue_name)
    for queue_name in ('get_context', 'get_article')
)
CELERY_ROUTES = {
    'parse.tasks.%s' % queue_name: {
        'queue': queue_name,
        'routing_key': queue_name,
    }
    for queue_name in ('get_context', 'get_article')
}

There are two Celery tasks:

from api_parser import celery_app
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy_parser.scrapy_parser.spiders.map_links import MapLinksSpider
from scrapy_parser.scrapy_parser.spiders.articles import ArticlesSpider
from threading import Thread


@celery_app.task
def get_context(rules_id, rules):
    """Start a MapLinksSpider crawl for the given rules in a background thread.

    A CrawlerProcess is built from the project settings, the spider is
    scheduled with ``rules_id`` and ``rules`` as spider arguments, and
    ``CrawlerProcess.start`` is handed to a new thread, so the task itself
    returns without waiting for the crawl to finish.
    """
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(MapLinksSpider, rules_id=rules_id, rules=rules)
    crawl_thread = Thread(target=crawler_process.start)
    crawl_thread.start()


@celery_app.task
def get_article(rules_id, link_id, rules, link):
    """Start an ArticlesSpider crawl for one link in a background thread.

    A CrawlerProcess is built from the project settings, the spider is
    scheduled with the rule set and the link to fetch as spider arguments,
    and ``CrawlerProcess.start`` is handed to a new thread, so the task
    itself returns without waiting for the crawl to finish.
    """
    spider_kwargs = {
        'rules_id': rules_id,
        'link_id': link_id,
        'rules': rules,
        'link': link,
    }
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(ArticlesSpider, **spider_kwargs)
    crawl_thread = Thread(target=crawler_process.start)
    crawl_thread.start()

The first task is triggered by a signal and maps the links.

The second task is started when a new link is added to the database.

My signals in django:

from django.db.models.signals import post_save
from django.dispatch import receiver
from parse.models.rules import Scheduler, Rules, ParseLinks
from parse.tasks import get_context, get_article


@receiver(post_save, sender=Scheduler)
def create_task_get_context(sender, instance, created, **kwargs):
    """Queue a ``get_context`` task whenever a new Scheduler row is created.

    Saves of existing rows are ignored. The Rules row referenced by the
    scheduler is re-read from the database and its id and rules text are
    sent to the task as plain ``int`` / ``str`` values.
    """
    if not created:
        return

    rules_pk = int(instance.rules.id)
    rules_row = Rules.objects.get(id=rules_pk)
    get_context.delay(int(rules_row.id), str(rules_row.rules))


@receiver(post_save, sender=ParseLinks)
def create_task_get_article(sender, instance, created, **kwargs):
    """Queue a ``get_article`` task whenever a new ParseLinks row is created.

    Saves of existing rows are ignored. The link row is re-read from the
    database and its rule id, own id, rules text and URL are sent to the
    task as plain ``int`` / ``str`` values.
    """
    if not created:
        return

    link_row = ParseLinks.objects.get(id=int(instance.id))
    task_args = (
        int(link_row.rules.id),
        int(link_row.id),
        str(link_row.rules.rules),
        str(link_row.link),
    )
    get_article.delay(*task_args)

My spiders:

map_links.py

from parse.models.rules import ParseLinks
import scrapy
import json


class MapLinksSpider(scrapy.Spider):
    """Collect article links from a listing page and store them as ParseLinks.

    Spider arguments (passed as keyword arguments):
        rules_id: primary key of the rule set the links belong to.
        rules: JSON string with at least ``url``, ``item.templates`` and
            ``pagination`` entries, as read in ``__init__`` below.
    """
    name = "map_links"
    start_urls = []

    def __init__(self, **kw):
        super(MapLinksSpider, self).__init__(**kw)
        self.rules_id = kw.get('rules_id')
        self.rules = json.loads(kw.get('rules'))

        self.start_urls = [self.rules['url']]
        self.templates = self.rules['item']['templates']
        self.pagination = self.rules['pagination']

    def parse(self, response):
        """Store every link found on the page, then follow the pagination link.

        For each template, every node matching the template's ``context``
        selector is inspected for a link, a title and a date. Rows without a
        link are skipped. A request for the next page is yielded when the
        pagination selector matches.
        """
        next_page_selector = '%s::attr(%s)' % (
            self.pagination['link']['cssSelector'],
            self.pagination['link']['attr'],
        )

        for item in self.templates:
            link_selector = '%s::attr(%s)' % (item['link']['cssSelector'], item['link']['attr'])
            title_selector = '%s::text' % item['options']['title']['cssSelector']
            date_selector = '%s::text' % item['options']['date']['cssSelector']

            for row in response.css(str(item['context'])):
                # No trailing commas here: they would wrap the extracted
                # values in 1-tuples and a tuple would be saved as the title.
                link = row.css(link_selector).extract_first()
                if link is None:
                    # Nothing to store for a row that has no link.
                    continue
                title = row.css(title_selector).extract_first()
                date = row.css(date_selector).extract_first()

                ParseLinks.objects.get_or_create(
                    rules_id=self.rules_id,
                    link=self.rules['url'] + link,
                    title=title,
                    date=date,
                )

            next_page = response.css(next_page_selector).extract_first()
            if next_page is not None:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse)

articles.py

from parse.models.rules import ParseData
import scrapy
import json


class ArticlesSpider(scrapy.Spider):
    """Download a single article page and store its title and text as ParseData.

    Spider arguments (passed as keyword arguments):
        rules_id: primary key of the rule set used for parsing.
        link_id: primary key of the link being fetched.
        rules: JSON string with ``article.title`` and ``article.text``
            CSS selectors, as read in ``parse`` below.
        link: URL of the article to download.
    """
    name = "articles"
    start_urls = []

    def __init__(self, **kw):
        super(ArticlesSpider, self).__init__(**kw)
        self.rules_id = kw.get('rules_id')
        self.link_id = kw.get('link_id')
        self.rules = json.loads(kw.get('rules'))
        self.link = kw.get('link')
        # The start URL must be known before the crawl begins. Assigning it
        # inside parse() is too late, because parse() is only called for
        # responses to requests generated from start_urls.
        self.start_urls = [self.link]

    def parse(self, response):
        """Extract the article title and text, save them, and yield them as an item."""
        title = response.css('%s::text' % self.rules['article']['title']['cssSelector']).extract_first()
        text = response.css('%s::text' % self.rules['article']['text']['cssSelector']).extract_first()

        ParseData.objects.create(rules_id=self.rules_id, link_id=self.link_id, title=title, text=text)

        yield {
            "title": title,
            'text': text
        }

But I get the error: twisted.internet.error.ReactorNotRestartable

I understand that the error is caused by launching a new process for the spider. However, I am starting it in a thread, and I do not understand why that does not solve my problem.

1

1 Answer

0
votes

I think everyone who is new to scraping runs into this problem :)
Try the following:
0) pip install crochet

  1. Add the import: from crochet import setup

  2. Call setup() at the top of the file

  3. Remove these two lines:
    a) d.addBoth(lambda _: reactor.stop())
    b) reactor.run()

    The only meaningful lines from the [Scrapy docs][2] that remain are the last two lines of my code below:

    # some more imports
    from crochet import setup
    setup()


    def run_spider(spiderName):
        """Schedule the spider defined in ``first_scrapy.spiders.<spiderName>``.

        The module is imported dynamically and its ``mySpider`` class is
        passed to a CrawlerRunner built from the project settings. The call
        returns right after scheduling the crawl, so run_spider can be
        called again later with another spider name.
        """
        module_name = "first_scrapy.spiders.{}".format(spiderName)
        scrapy_var = import_module(module_name)  # do some dynamic import of selected spider
        # Pass the spider class, not an instance: the crawler instantiates
        # the spider itself.
        spider_cls = scrapy_var.mySpider  # get mySpider class from spider module
        crawler = CrawlerRunner(get_project_settings())  # from Scrapy docs
        crawler.crawl(spider_cls)  # from Scrapy docs

This code lets you choose which spider to run simply by passing its name to the run_spider function and, once scraping has finished, choose another spider and run it again.
Then simply call run_spider from your Celery task.

[1]: ReactorNotRestartable - Twisted and scrapy
[2]: https://doc.scrapy.org/en/latest/topics/practices.html