I'm trying to run a Scrapy spider/crawl from a Django project (as a Celery task triggered from the admin interface). Below is my code; I get an error when I call the task from a Python shell.
djangoproject:
    - monapp:
        - tasks.py
        - spider.py
        - myspider.py
        - models.py
        - .....
tasks.py:

    from djcelery import celery
    from demoapp.spider import *
    from demoapp.myspider import *


    @celery.task
    def add(x, y):
        return x + y


    @celery.task
    def scra():
        result_queue = Queue()
        crawler = CrawlerWorker(MySpider(), result_queue)
        crawler.start()
        return "success"
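For reference, this is roughly how I call the task from the Django shell (a sketch; the import path assumes the app is importable as demoapp, matching the imports above):

    # inside `python manage.py shell`
    from demoapp.tasks import scra

    scra.delay()   # queue it on the Celery worker (this is where it fails)
    scra()         # or run it in-process to see the traceback directly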
spider.py:

    from scrapy import project, signals
    from scrapy.settings import Settings
    from scrapy.crawler import Crawler
    from scrapy.xlib.pydispatch import dispatcher
    from multiprocessing.queues import Queue
    import multiprocessing


    class CrawlerWorker(multiprocessing.Process):

        def __init__(self, spider, result_queue):
            multiprocessing.Process.__init__(self)
            self.result_queue = result_queue
            self.crawler = Crawler(Settings())
            if not hasattr(project, 'crawler'):
                self.crawler.install()
            self.crawler.configure()
            self.items = []
            self.spider = spider
            dispatcher.connect(self._item_passed, signals.item_passed)

        def _item_passed(self, item):
            self.items.append(item)

        def run(self):
            self.crawler.crawl(self.spider)
            self.crawler.start()
            self.crawler.stop()
            self.result_queue.put(self.items)
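The idea behind the result_queue hand-off is roughly the sketch below (not something scra() does yet; the parent process would read the scraped items back once the crawl finishes):

    # minimal sketch, assuming CrawlerWorker and MySpider are imported as above
    from multiprocessing import Queue

    result_queue = Queue()
    crawler = CrawlerWorker(MySpider(), result_queue)
    crawler.start()              # spawns a process that executes run()
    items = result_queue.get()   # blocks until run() puts self.items
    crawler.join()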
myspider.py:

    from scrapy.selector import HtmlXPathSelector
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.item import Item, Field


    class TorentItem(Item):
        title = Field()
        desc = Field()


    class MySpider(CrawlSpider):
        name = 'job'
        allowed_domains = ['tanitjobs.com']
        start_urls = ['http://tanitjobs.com/browse-by-category/Nurse/']

        rules = (
            Rule(SgmlLinkExtractor(allow=('page=*',),
                                   restrict_xpaths=('//div[@class="pageNavigation"]',),
                                   unique=True),
                 callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            hxs = HtmlXPathSelector(response)
            items = hxs.select('//div[@class="offre"]/div[@class="detail"]')
            scraped_items = []
            for item in items:
                scraped_item = TorentItem()
                scraped_item['title'] = item.select('a/strong/text()').extract()
                scraped_item['desc'] = item.select('./div[@class="descriptionjob"]/text()').extract()
                scraped_items.append(scraped_item)
            return scraped_items
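To rule out the XPaths themselves, I can exercise parse_item outside Django/Celery with a fake response (a sketch; the HTML is a made-up stand-in for the real page markup, and the import path assumes the same demoapp package as in tasks.py):

    from scrapy.http import HtmlResponse
    from demoapp.myspider import MySpider

    # made-up markup mirroring the structure the XPaths expect
    fake_html = ('<div class="offre"><div class="detail">'
                 '<a><strong>Nurse - example title</strong></a>'
                 '<div class="descriptionjob">example description</div>'
                 '</div></div>')

    response = HtmlResponse(url='http://tanitjobs.com/browse-by-category/Nurse/',
                            body=fake_html, encoding='utf-8')
    for item in MySpider().parse_item(response):
        print(item)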