Is there a way to trigger a method in a Spider class just before it terminates?
I can terminate the spider myself, like this:
class MySpider(CrawlSpider):
    """Spider that terminates itself once a termination condition is met."""

    # Config stuff goes here...

    def quit(self):
        """Run any final work, then stop the spider.

        Raises:
            CloseSpider: always, with the reason 'MySpider is quitting now.'.
        """
        # Do some stuff...
        raise CloseSpider('MySpider is quitting now.')

    def my_parser(self, response):
        """Parse *response*, quitting first when the termination condition holds."""
        # `termination_condition` is a placeholder for the real check.
        if termination_condition:
            self.quit()
        # Parsing stuff goes here...
But I can’t find any information on how to determine when the spider is about to quit naturally.
Answers:
Thank you for visiting the Q&A section on Magenaut. Please note that the answers may not solve your issue immediately, so please treat them as suggestions. If you found the post helpful (or not), leave a comment and I’ll get back to you as soon as possible.
Method 1
It looks like you can register a signal listener through dispatcher.
I would try something like:
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
class MySpider(CrawlSpider):
    """Spider that registers a listener for the ``spider_closed`` signal."""

    def __init__(self, *args, **kwargs):
        """Initialise the spider and connect the ``spider_closed`` listener.

        Args:
            *args: positional arguments forwarded to the parent initializer.
            **kwargs: keyword arguments forwarded to the parent initializer.
        """
        # Run the parent initializer first so the base spider is fully set up
        # before the signal handler is registered.
        super(MySpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Handle the ``spider_closed`` signal.

        Args:
            spider: instance of the spider about to be closed.
        """
        # Put the work to run just before the spider terminates here.
        pass
In newer versions of Scrapy, scrapy.xlib.pydispatch is deprecated. Instead, you can use: from pydispatch import dispatcher.
Method 2
Just to update: you can simply define a closed method on your spider, like this:
class MySpider(CrawlSpider):
    """Spider that defines the ``closed`` method to run work on shutdown."""

    def closed(self, reason):
        """Run shutdown work when the spider closes.

        Args:
            reason: the value passed in describing why the spider closed.
        """
        # Do something here (placeholder for the actual shutdown work).
        pass
Method 3
For Scrapy version 1.0.0+ (it may also work for older versions).
from scrapy import signals
class MySpider(CrawlSpider):
    """Spider that reports when it is opened and when it is closed."""

    name = 'myspider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and connect its open/close signal handlers.

        Args:
            crawler: the crawler whose ``signals`` object the handlers are
                connected to.
            *args: forwarded to the parent ``from_crawler``.
            **kwargs: forwarded to the parent ``from_crawler``.

        Returns:
            The spider instance created by the parent ``from_crawler``.
        """
        spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_opened(self, spider):
        """Print a message naming the spider that is opening."""
        print('Opening {} spider'.format(spider.name))

    def spider_closed(self, spider):
        """Print a message naming the spider that is closing."""
        print('Closing {} spider'.format(spider.name))
One good use of this is adding a tqdm progress bar to a Scrapy spider.
# -*- coding: utf-8 -*-
from scrapy import signals
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from tqdm import tqdm
class MySpider(CrawlSpider):
    """Crawl spider that shows a tqdm progress bar while items are parsed."""

    name = 'myspider'
    allowed_domains = ['somedomain.comm']
    start_urls = ['http://www.somedomain.comm/ccid.php']

    rules = (
        # The literal '.' and '?' characters are escaped. An unescaped '?'
        # would make the preceding 'p' optional, so the pattern could never
        # match a URL that actually contains '?id='.
        Rule(
            LinkExtractor(allow=r'^http://www\.somedomain\.comm/ccds\.php\?id=.*'),
            callback='parse_item',
        ),
        Rule(
            LinkExtractor(
                allow=r'^http://www\.somedomain\.comm/ccid\.php$',
                restrict_xpaths='//table/tr[contains(., "SMTH")]',
            ),
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Advance the progress bar and return an item for *response*."""
        self.pbar.update()  # update progress bar by 1
        item = MyItem()
        # parse response
        return item

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and connect its open/close signal handlers.

        Args:
            crawler: the crawler whose ``signals`` object the handlers are
                connected to.
            *args: forwarded to the parent ``from_crawler``.
            **kwargs: forwarded to the parent ``from_crawler``.

        Returns:
            The spider instance created by the parent ``from_crawler``.
        """
        spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_opened(self, spider):
        """Create the progress bar and announce that the spider is opening."""
        self.pbar = tqdm()  # initialize progress bar
        self.pbar.clear()
        self.pbar.write('Opening {} spider'.format(spider.name))

    def spider_closed(self, spider):
        """Announce that the spider is closing, then close the progress bar."""
        self.pbar.clear()
        self.pbar.write('Closing {} spider'.format(spider.name))
        self.pbar.close()  # close progress bar
Method 4
For the latest version (v1.7), just define a closed(reason) method in your spider class.
closed(reason):Called when the spider closes. This method provides a shortcut to
signals.connect() for the spider_closed signal.
Scrapy Doc : scrapy.spiders.Spider.closed
Method 5
For me, the accepted answer did not work / is outdated, at least for Scrapy 0.19.
I got it to work with the following though:
from scrapy import signals
from scrapy.signalmanager import SignalManager
from scrapy.xlib.pydispatch import dispatcher


class MySpider(CrawlSpider):
    """Spider that connects a ``spider_closed`` handler via SignalManager."""

    def __init__(self, *args, **kwargs):
        """Initialise the spider, then connect the close handler.

        Args:
            *args: positional arguments forwarded to the parent initializer.
            **kwargs: keyword arguments forwarded to the parent initializer.
        """
        super(MySpider, self).__init__(*args, **kwargs)
        SignalManager(dispatcher.Any).connect(
            self.closed_handler, signal=signals.spider_closed)

    def closed_handler(self, spider):
        """Handle the ``spider_closed`` signal for *spider*."""
        # do stuff here
        pass
Method 6
If you have many spiders and want to do something before each of them closes, it may be convenient to add a custom stats collector to your project.
in settings:
# Dotted path of the stats collector class to use for the project
# (here, the custom MyStatsCollector).
STATS_CLASS = 'scraper.stats.MyStatsCollector'
and collector:
from scrapy.statscollectors import StatsCollector
class MyStatsCollector(StatsCollector):
    """Stats collector that overrides ``_persist_stats`` as a shared hook.

    NOTE(review): intended to run as each spider closes; confirm against
    the StatsCollector implementation of the Scrapy version in use.
    """

    def _persist_stats(self, stats, spider):
        """Hook that receives the collected *stats* and the *spider*."""
        # do something here
        pass
All methods were sourced from stackoverflow.com or stackexchange.com and are licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.