Skip to content

Commit

Permalink
Merge pull request #487 from CIRCL/crawler_v2
Browse files (browse the repository at this point in the history)
fix: [crawler] error catcher
  • Loading branch information
Terrtia authored Apr 1, 2020
2 parents d72f28f + 179fba2 commit 00573c9
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions bin/torcrawler/TorSplashCrawler.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -156,7 +156,7 @@ def start_requests(self):
self.parse,
errback=self.errback_catcher,
endpoint='execute',
-meta={'father': self.original_item},
+meta={'father': self.original_item, 'current_url': self.start_urls},
args=l_cookies
)

Expand Down Expand Up @@ -217,7 +217,7 @@ def parse(self,response):
self.parse,
errback=self.errback_catcher,
endpoint='execute',
-meta={'father': item_id},
+meta={'father': item_id, 'current_url': link.url},
args=l_cookies
)

Expand All @@ -227,7 +227,7 @@ def errback_catcher(self, failure):

if failure.check(ResponseNeverReceived):
request = failure.request
-url= response.data['last_url']
+url= request.meta['current_url']
father = request.meta['father']

self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
Expand All @@ -242,7 +242,7 @@ def errback_catcher(self, failure):
errback=self.errback_catcher,
endpoint='execute',
cache_args=['lua_source'],
-meta={'father': father},
+meta={'father': father, 'current_url': url},
args=self.build_request_arg(response.cookiejar)
)

Expand Down

0 comments on commit 00573c9

Please sign in to comment.