I want to use scrapy in a larger project, but I am unsure how to pass args like name,start_urls,and allowed_domains. As I understand it name,start_urls,and allowed_domains variables are settings for process.crawl, but I am not able to use self.var like I have with line- site = self.site since self obviously isn't defined there. There is also the problem of the proper way to return. At the end of the day I just want a way to crawl all urls on a single domain from within a script.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse
from scrapy.crawler import CrawlerProcess
#from project.spiders.test_spider import SpiderName
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(settings={
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
crawledUrls = []
class MySpider(CrawlSpider):
name = 'spider_example_name'
def __init__(self,site):
self.site=site
site = self.site
domain = urlparse(site).netloc
start_urls = [site]
allowed_domains = [domain]
rules = (
Rule(LinkExtractor(unique=True), callback='parse_item', follow=True),
)
def parse_item(self, response):
#I think there is a way to do this with yeild
print(self.site)
crawledUrls.append(response.url)
def main():
spider = MySpider('http://quotes.toscrape.com')
process.crawl(spider)
process.start() # the script will block here until the crawling is finished
print("###########################################")
print(len(crawledUrls))
print(crawledUrls)
print("###########################################")
if __name__ == "__main__":
main()