I've used CrawlSpider successfully before. But after I changed the code to integrate with Redis and added my own middlewares to set the User-Agent and cookies, the spider no longer parses the responses, so it doesn't generate new requests and closes soon after starting.
Even if I add this:
def parse_start_url(self, response):
    return self.parse_item(response)
it only parses the response from the first URL.
Here's my code. The spider:
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from yydzh.items import YydzhItem


class YydzhSpider(CrawlSpider):
    name = 'yydzhSpider'
    allowed_domains = ['yydzh.com']
    start_urls = ['http://www.yydzh.com/thread.php?fid=198']

    rules = (
        Rule(LinkExtractor(allow=r'thread\.php\?fid=198&page=([1-9]|1[0-9])#s',
                           restrict_xpaths="//div[@class='pages']"),
             callback='parse_item', follow=True),
    )

    # def parse_start_url(self, response):
    #     return self.parse_item(response)

    def parse_item(self, response):
        # Rows in the thread list table that follow the last header row (class 'tr2')
        for each in response.xpath(
                "//*[@id='ajaxtable']//tr[@class='tr2'][last()]/following-sibling::tr[@class!='tr2']"):
            item = YydzhItem()  # fresh item per row, not one shared instance
            item['title'] = each.xpath("./td[2]/h3[1]/a//text()").extract()[0]
            item['author'] = each.xpath("./td[3]/a//text()").extract()[0]
            item['category'] = each.xpath("./td[2]/span[1]//text()").extract()[0]
            item['url'] = each.xpath("./td[2]/h3[1]//a/@href").extract()[0]
            yield item
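For reference, YydzhItem only declares the four fields used above; items.py is a minimal sketch along these lines:
import scrapy


class YydzhItem(scrapy.Item):
    # Fields populated in parse_item()
    title = scrapy.Field()
    author = scrapy.Field()
    category = scrapy.Field()
    url = scrapy.Field()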
The settings I think are crucial:
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

DOWNLOADER_MIDDLEWARES = {
    'yydzh.middlewares.UserAgentmiddleware': 500,
    'yydzh.middlewares.CookieMiddleware': 600,
}

COOKIES_ENABLED = True
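The middleware below also needs Redis connection constants; mine look roughly like this (placeholder values):
REDIS_HOST = '127.0.0.1'   # placeholder
REDIS_PORT = 6379          # placeholder
REDIS_PASS = 'mypassword'  # placeholder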
Middlewares: UserAgentmiddleware picks a random User-Agent for each request so the crawler is less likely to be noticed by the server. CookieMiddleware attaches cookies to requests so that pages requiring a log-in can be crawled.
import json
import logging
import random

import redis
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

# agents, REDIS_HOST/REDIS_PORT/REDIS_PASS and the cookie helpers
# (init_cookie, remove_cookie, update_cookie) come from my own modules.

logger = logging.getLogger(__name__)


class UserAgentmiddleware(UserAgentMiddleware):
    def process_request(self, request, spider):
        # Set a random User-Agent on every request
        agent = random.choice(agents)
        request.headers["User-Agent"] = agent


class CookieMiddleware(RetryMiddleware):
    def __init__(self, settings, crawler):
        RetryMiddleware.__init__(self, settings)
        self.rconn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT,
                                 password=REDIS_PASS, db=1,
                                 decode_responses=True)
        init_cookie(self.rconn, crawler.spider.name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings, crawler)

    def process_request(self, request, spider):
        # Attach a randomly chosen cookie set belonging to this spider
        redisKeys = self.rconn.keys()
        while len(redisKeys) > 0:
            elem = random.choice(redisKeys)
            if spider.name + ':Cookies' in elem:
                cookie = json.loads(self.rconn.get(elem))
                request.cookies = cookie
                request.meta["accountText"] = elem.split("Cookies:")[-1]
                break
            else:
                redisKeys.remove(elem)

    def process_response(self, request, response, spider):
        # The site returns "You are not logged in or you have no permission
        # to view this page" when the cookie is missing or has expired
        if '您没有登录或者您没有权限访问此页面' in str(response.body):
            accountText = request.meta["accountText"]
            remove_cookie(self.rconn, spider.name, accountText)
            update_cookie(self.rconn, spider.name, accountText)
            logger.warning("Cookie updated! (account: %s)" % accountText)
            return request
        return response
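For reference, CookieMiddleware expects the cookies to already sit in Redis under keys like '<spider name>:Cookies:<account>' (the layout inferred from process_request above), with a JSON-serialized cookie dict as the value. A minimal sketch with made-up key and cookie names:
import json
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=1, decode_responses=True)

# Key layout: "<spider name>:Cookies:<account>"; cookie names here are made up
cookies = {'sessionid': 'abc123', 'userid': '42'}
r.set('yydzhSpider:Cookies:myaccount', json.dumps(cookies))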