Spider:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.http import TextResponse
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from selenium import webdriver
import time


class Product(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    data = scrapy.Field()
    name_reviewer = scrapy.Field()
    date = scrapy.Field()
    model_name = scrapy.Field()
    rating = scrapy.Field()
    review = scrapy.Field()
    url_print = scrapy.Field()


class FooSpider(CrawlSpider):
    name = "snap_reviews"
    allowed_domains = ["snapdeal.com"]
    # Build one start URL per review page (pages 1-24) for each product id
    url = []
    ids = ['http://www.snapdeal.com/product/micromax-a114-canvas-22-black/1485635784']
    for id in ids:
        for i in range(1, 25):
            url.append(id + '/reviews?page=' + str(i) + '&vsrc=rcnt')
    start_urls = url

    def __init__(self, *args, **kwargs):
        super(FooSpider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Firefox()
        self.browser.implicitly_wait(60)  # wait up to 60s for elements to appear

    def parse(self, response):
        self.browser.get(response.url)
        sites = response.xpath('//div[@class="reviewareain"]/div/div')
        #self.browser.implicitly_wait(30)
        items = []


        #sel = Selector(text=self.browser.page_source)
        #ites = sel.xpath('//div[@class="reviewareain"]')
        model = response.xpath('//span[contains(@class,"section-head customer_review_tab")]/text()').extract()[0].lstrip()
        for site in sites:
            item = Product()
            item['model_name'] = model
            item['name_reviewer'] = site.xpath('.//span[contains(@class,"_reviewUserName")]/text()').extract()[0]
            item['review'] = site.xpath('.//div[contains(@class, "user-review")]/p/text()').extract()[0]
            item['title'] = site.xpath('.//div[contains(@class,"user-review")]/div[@class="head"]/text()').extract()[0]
            #item['date'] = site.xpath('.//span[contains(@style,"vertical-align:middle;")]/nobr/text()').extract()[0]
            #item['rating'] = site.xpath('.//span[contains(@class,"swSprite")]/@title').extract()[0].split()[0]
            items.append(item)
            return items

Pipeline:

import pymongo
from pymongo import MongoClient
from scrapy.conf import settings
from scrapy import log


class MongoDBPipeline(object):
    def __init__(self):
        connection = pymongo.MongoClient(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DATABASE']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        self.collection.insert(item)
        log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
            settings['MONGODB_DATABASE'],
            settings['MONGODB_COLLECTION'],
            settings['MONGODB_HOST'],
            settings['MONGODB_PORT']))
        return item

Settings:

# -*- coding: utf-8 -*-

# Scrapy settings for snapdeal_review_13jul project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'snapdeal_review_13jul'

SPIDER_MODULES = ['snapdeal_review_13jul.spiders']
NEWSPIDER_MODULE = 'snapdeal_review_13jul.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'snapdeal_review_13jul (+http://www.yourdomain.com)'

ITEM_PIPELINES = {'snapdeal_review_13jul.pipelines.MongoDBPipeline':300}
MONGODB_HOST = 'localhost' # Change in prod
MONGODB_PORT = 27017 # Change in prod
MONGODB_DATABASE = "snapdeal_reviews" # Change in prod
MONGODB_COLLECTION = "snap_r_1"
MONGODB_USERNAME = "" # Change in prod
MONGODB_PASSWORD = "" # Change in prod

Error:

Traceback (most recent call last):
  File "/usr/lib/python2.7/dist-packages/scrapy/middleware.py", line 62, in _process_chain
    return process_chain(self.methods[methodname], obj, *args)
  File "/usr/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 65, in process_chain
    d.callback(input)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 382, in callback
    self._startRunCallbacks(result)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 490, in _startRunCallbacks
    self._runCallbacks()
--- <exception caught here> ---
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/home/nikhil/Desktop/Scrapers/mouth/mouth/pipelines.py", line 22, in process_item
    self.collection.insert(item)
  File "/home/nikhil/.local/lib/python2.7/site-packages/pymongo/collection.py", line 1926, in insert
    check_keys, manipulate, write_concern)
  File "/home/nikhil/.local/lib/python2.7/site-packages/pymongo/collection.py", line 430, in _insert
    gen(), check_keys, self.codec_options, sock_info)
  File "/home/nikhil/.local/lib/python2.7/site-packages/pymongo/collection.py", line 405, in gen
    doc['_id'] = ObjectId()
exceptions.TypeError: 'str' object does not support item assignment

This code was working earlier, but it now fails with the error above.

It looks like a string is being passed into the pipeline instead of an Item instance. You need to debug it a little: log the value of item before doing the collection insert (sketched below). Also, is return items properly indented? — alecxe
@alecxe I didn't get you. But this code was working fine a few days back. — John Dene
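
A minimal sketch of the check alecxe suggests, assuming the MongoDBPipeline from the question (the log message wording is illustrative only):

    def process_item(self, item, spider):
        # Debugging aid: log the type and value of item before inserting,
        # to confirm the pipeline receives an Item rather than a plain string.
        log.msg("process_item got {}: {!r}".format(type(item).__name__, item))
        self.collection.insert(item)
        return item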

1 Answer


Try casting item to a dict, so that pymongo gets a plain mutable mapping it can attach the generated _id field to (which is exactly the assignment failing in your traceback):

self.collection.insert(dict(item))

See if that works.
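
For reference, a sketch of the full process_item with the cast applied, assuming the pipeline and settings keys shown in the question:

    def process_item(self, item, spider):
        # dict(item) hands pymongo a plain mutable mapping, so insert()
        # can set doc['_id'] without raising a TypeError.
        self.collection.insert(dict(item))
        log.msg("Item written to MongoDB database {}, collection {}".format(
            settings['MONGODB_DATABASE'],
            settings['MONGODB_COLLECTION']))
        return item

If the TypeError persists after the cast, the logging sketched in the comments above should show whether a plain string is reaching the pipeline instead of a Product item.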