
I've used CrawlSpider successfully before. But after I changed the code to integrate with Redis and added my own middlewares to set the User-Agent and cookies, the spider no longer parses the responses, so it generates no new requests and closes shortly after starting.

Here are the running stats:

Even if I add this:

def parse_start_url(self, response):
    return self.parse_item(response)

it only parses the response from the first URL.

Here's my code.

Spider:

# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from yydzh.items import YydzhItem


class YydzhSpider(CrawlSpider):
    name = 'yydzhSpider'
    allowed_domains = ['yydzh.com']
    start_urls = ['http://www.yydzh.com/thread.php?fid=198']
    rules = (
        Rule(LinkExtractor(allow=r'thread\.php\?fid=198&page=([1-9]|1[0-9])#s',
                           restrict_xpaths="//div[@class='pages']"),
             callback='parse_item', follow=True),
    )

    # def parse_start_url(self, response):
    #     return self.parse_item(response)

    def parse_item(self, response):
        # Every row after the last header row (class 'tr2') is a thread entry.
        for each in response.xpath(
                "//*[@id='ajaxtable']//tr[@class='tr2'][last()]"
                "/following-sibling::tr[@class!='tr2']"):
            item = YydzhItem()  # fresh item per row
            item['title'] = each.xpath("./td[2]/h3[1]/a//text()").extract()[0]
            item['author'] = each.xpath("./td[3]/a//text()").extract()[0]
            item['category'] = each.xpath("./td[2]/span[1]//text()").extract()[0]
            item['url'] = each.xpath("./td[2]/h3[1]//a/@href").extract()[0]
            yield item

Settings I think are crucial:

SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
DOWNLOADER_MIDDLEWARES = {
    'yydzh.middlewares.UserAgentmiddleware': 500,
    'yydzh.middlewares.CookieMiddleware': 600,
}
COOKIES_ENABLED = True

Middlewares: UserAgentmiddleware picks a random user agent for each request so the crawler is less likely to be noticed by the server.

CookieMiddleware attaches cookies stored in Redis to requests, so that pages which require a login can be crawled.

# -*- coding: utf-8 -*-
import json
import logging
import random

import redis
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

# agents, init_cookie, update_cookie, remove_cookie and the REDIS_* settings
# come from the project's own modules (not shown in the question).

logger = logging.getLogger(__name__)


class UserAgentmiddleware(UserAgentMiddleware):

    def process_request(self, request, spider):
        # Pick a random user agent for every outgoing request.
        agent = random.choice(agents)
        request.headers["User-Agent"] = agent


class CookieMiddleware(RetryMiddleware):

    def __init__(self, settings, crawler):
        RetryMiddleware.__init__(self, settings)
        self.rconn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT,
                                 password=REDIS_PASS, db=1, decode_responses=True)
        init_cookie(self.rconn, crawler.spider.name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings, crawler)

    def process_request(self, request, spider):
        # Attach the cookies of a randomly chosen account stored in Redis.
        redisKeys = self.rconn.keys()
        while len(redisKeys) > 0:
            elem = random.choice(redisKeys)
            if spider.name + ':Cookies' in elem:
                cookie = json.loads(self.rconn.get(elem))
                request.cookies = cookie
                request.meta["accountText"] = elem.split("Cookies:")[-1]
                break
            else:
                redisKeys.remove(elem)

    def process_response(self, request, response, spider):
        # The site returns this message ("You are not logged in or you do not
        # have permission to access this page") when the session is invalid;
        # refresh the account's cookie and retry the request.
        if '您没有登录或者您没有权限访问此页面' in str(response.body):
            accountText = request.meta["accountText"]
            remove_cookie(self.rconn, spider.name, accountText)
            update_cookie(self.rconn, spider.name, accountText)
            logger.warning("Cookie updated successfully! (account: %s)" % accountText)
            return request

        return response
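
The helpers init_cookie, update_cookie and remove_cookie are not shown in the question. For context, here is a minimal sketch of what they might look like, assuming cookies are stored as JSON under keys of the form '<spider>:Cookies:<account>' (which is what process_request above expects); ACCOUNTS and login_account are hypothetical placeholders:

import json

def init_cookie(rconn, spider_name):
    # Make sure at least one account's cookies are cached in Redis.
    if not rconn.keys(spider_name + ':Cookies:*'):
        for account in ACCOUNTS:  # hypothetical list of site accounts
            cookies = login_account(account)  # hypothetical login helper
            rconn.set('%s:Cookies:%s' % (spider_name, account), json.dumps(cookies))

def remove_cookie(rconn, spider_name, account):
    # Drop the stale cookie entry for this account.
    rconn.delete('%s:Cookies:%s' % (spider_name, account))

def update_cookie(rconn, spider_name, account):
    # Log the account in again and cache the fresh cookies.
    cookies = login_account(account)  # hypothetical login helper
    rconn.set('%s:Cookies:%s' % (spider_name, account), json.dumps(cookies))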

1 Answer


Found the problem: all of the URLs had already been recorded by the Redis dupefilter during previous runs, so every new request was filtered out as a duplicate. Clearing the Redis server and restarting the crawl solves the problem.
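
For reference, a minimal way to clear the persisted state by hand, assuming the default scrapy_redis key names ('<spider>:dupefilter' for fingerprints, '<spider>:requests' for the queue) and the same REDIS_* connection settings as in the question:

import redis

r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASS)
# Drop the persisted fingerprint set and any leftover queued requests
# so the next crawl starts from a clean slate.
r.delete('yydzhSpider:dupefilter')
r.delete('yydzhSpider:requests')

scrapy_redis also has a SCHEDULER_FLUSH_ON_START setting that flushes these keys automatically each time the spider starts, if persistence between runs isn't needed.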