Module scrapfly.scrapy.middleware
Classes
class ScrapflyMiddleware
Expand source code
class ScrapflyMiddleware:
    """Scrapy downloader middleware that routes ScrapflyScrapyRequest objects
    through the Scrapfly API and retries retryable failures."""

    # Upper bound on API-level retries (consumed by the retry machinery).
    MAX_API_RETRIES = 20

    def process_request(self, request: Union[Request, ScrapflyScrapyRequest], spider: Union[Spider, ScrapflySpider]) -> Optional[ScrapflyScrapyResponse]:
        """Tag outgoing Scrapfly requests and apply the spider-wide proxy pool.

        Plain Scrapy requests pass through untouched. Always returns None so
        the download proceeds normally.
        """
        if not isinstance(request, ScrapflyScrapyRequest):
            # Not a Scrapfly request - nothing for us to do.
            return None

        if not isinstance(spider, ScrapflySpider):
            raise RuntimeError('ScrapflyScrapyRequest must be fired from ScrapflySpider, %s given' % type(spider))

        config = request.scrape_config

        # Tag every scrape with the spider name and run id for traceability.
        if config.tags is None:
            config.tags = set()
        config.tags.add(spider.name)
        config.tags.add(str(spider.run_id))

        # Fall back to the project-wide proxy pool when none was set per-request.
        default_pool = spider.settings.get('SCRAPFLY_PROXY_POOL')
        if config.proxy_pool is None and default_pool:
            config.proxy_pool = default_pool

        return None

    def process_exception(self, request, exception: Union[str, Exception], spider: ScrapflySpider):
        """Retry retryable connection/Scrapfly errors; re-raise everything else."""
        retry_delay = 1

        if isinstance(exception, ResponseNeverReceived):
            # Connection-level failure: always worth another attempt.
            return spider.retry(request, exception, retry_delay)

        if isinstance(exception, ScrapflyError):
            if exception.is_retryable:
                # Honour the API's back-off hint when one was sent.
                if isinstance(exception, HttpError) and exception.response is not None:
                    headers = exception.response.headers
                    if 'retry-after' in headers:
                        retry_delay = int(headers['retry-after'])
                return spider.retry(request, exception, retry_delay)

            custom_codes = spider.settings.get('SCRAPFLY_CUSTOM_RETRY_CODE', False)
            if custom_codes and exception.code in custom_codes:
                return spider.retry(request, exception, retry_delay)

        raise exception

    def process_response(self, request: Union[Request, ScrapflyScrapyRequest], response: Union[Response, ScrapflyScrapyResponse], spider: Union[Spider, ScrapflySpider]) -> Union[ScrapflyScrapyResponse, ScrapflyScrapyRequest]:
        """No-op hook: responses are passed through unchanged."""
        return response
var MAX_API_RETRIES
Methods
def process_exception(self,
request,
exception: str | Exception,
spider: ScrapflySpider)
Expand source code
def process_exception(self, request, exception: Union[str, Exception], spider: ScrapflySpider):
    """Decide whether a failed download should be retried.

    Retries connection failures (``ResponseNeverReceived``) and retryable
    ``ScrapflyError`` instances, honouring the server's ``Retry-After`` hint
    when present; also retries error codes listed in the
    ``SCRAPFLY_CUSTOM_RETRY_CODE`` setting.

    :param request: the request that failed.
    :param exception: the exception raised while downloading it.
    :param spider: the running ScrapflySpider (provides ``retry`` and settings).
    :raises: re-raises ``exception`` when it is not retryable.
    """
    delay = 1

    if isinstance(exception, ResponseNeverReceived):
        # Connection-level failure: always worth another attempt.
        return spider.retry(request, exception, delay)

    if isinstance(exception, ScrapflyError):
        if exception.is_retryable:
            if isinstance(exception, HttpError) and exception.response is not None:
                if 'retry-after' in exception.response.headers:
                    # Retry-After may be an HTTP-date rather than seconds
                    # (RFC 7231 §7.1.3); fall back to the default delay
                    # instead of letting int() raise ValueError here.
                    try:
                        delay = int(exception.response.headers['retry-after'])
                    except (TypeError, ValueError):
                        delay = 1
            return spider.retry(request, exception, delay)

        # Fetch the setting once (the original looked it up twice).
        custom_retry_codes = spider.settings.get('SCRAPFLY_CUSTOM_RETRY_CODE', False)
        if custom_retry_codes and exception.code in custom_retry_codes:
            return spider.retry(request, exception, delay)

    raise exception
request: scrapy.http.request.Request | ScrapflyScrapyRequest,
spider: scrapy.spiders.Spider | ScrapflySpider) -> ScrapflyScrapyResponse | None
Expand source code
def process_request(self, request: Union[Request, ScrapflyScrapyRequest], spider: Union[Spider, ScrapflySpider]) -> Optional[ScrapflyScrapyResponse]:
    """Tag outgoing Scrapfly requests and apply the spider-wide proxy pool.

    Plain Scrapy requests pass through untouched. Always returns None so the
    download proceeds normally.
    """
    if not isinstance(request, ScrapflyScrapyRequest):
        # Not a Scrapfly request - let the regular handler deal with it.
        return None

    if not isinstance(spider, ScrapflySpider):
        raise RuntimeError('ScrapflyScrapyRequest must be fired from ScrapflySpider, %s given' % type(spider))

    config = request.scrape_config

    # Tag every scrape with the spider name and run id for traceability.
    if config.tags is None:
        config.tags = set()
    config.tags.add(spider.name)
    config.tags.add(str(spider.run_id))

    # Fall back to the project-wide proxy pool when none was set per-request.
    default_pool = spider.settings.get('SCRAPFLY_PROXY_POOL')
    if config.proxy_pool is None and default_pool:
        config.proxy_pool = default_pool

    return None
request: scrapy.http.request.Request | ScrapflyScrapyRequest,
response: scrapy.http.response.Response | ScrapflyScrapyResponse,
spider: scrapy.spiders.Spider | ScrapflySpider) -> ScrapflyScrapyResponse | ScrapflyScrapyRequest
Expand source code
def process_response(self, request: Union[Request, ScrapflyScrapyRequest], response: Union[Response, ScrapflyScrapyResponse], spider: Union[Spider, ScrapflySpider]) -> Union[ScrapflyScrapyResponse, ScrapflyScrapyRequest]:
    """No-op downloader hook: the response is handed back unchanged."""
    return response
class ScrapflyRefererMiddleware (settings: BaseSettings | None = None)
Expand source code
class ScrapflyRefererMiddleware(ScrapyRefererMidleware):
    """Referer middleware that stands down for session-bound Scrapfly traffic.

    When a scrape is pinned to a Scrapfly session, referer bookkeeping is
    already handled by the session system, so the stock Scrapy behaviour is
    bypassed for those requests and responses.
    """

    def process_spider_output(self, response, result, spider) -> Iterable:
        """Skip referer processing for responses tied to a Scrapfly session."""
        if isinstance(response, ScrapflyScrapyResponse) and response.scrape_config.session is not None:
            return result  # bypass - already handled by scrapfly session system
        return ScrapyRefererMidleware.process_spider_output(self, response, result, spider)

    def request_scheduled(self, request, spider):
        """Skip referer bookkeeping for requests tied to a Scrapfly session."""
        if isinstance(request, ScrapflyScrapyRequest) and request.scrape_config.session is not None:
            return  # bypass - already handled by scrapfly session system
        ScrapyRefererMidleware.request_scheduled(self, request, spider)
Added in version: 2.13
This class provides helper methods for asynchronous
`process_spider_output()` and `process_start()` methods. Middlewares that don't have either of these methods don't need to use this class. You can override the :meth:`~scrapy.spidermiddlewares.base.BaseSpiderMiddleware.get_processed_request` method to add processing code for requests and the :meth:`~scrapy.spidermiddlewares.base.BaseSpiderMiddleware.get_processed_item` method to add processing code for items. These methods take a single request or item from the spider output iterable and return a request or item (the same or a new one), or `None` to remove this request or item from the processing.
Ancestors
- scrapy.spidermiddlewares.referer.RefererMiddleware
- scrapy.spidermiddlewares.base.BaseSpiderMiddleware
Methods
def process_spider_output(self, response, result, spider) -> Iterable
Expand source code
def process_spider_output(self, response, result, spider) -> Iterable:
    """Skip referer processing for responses tied to a Scrapfly session."""
    session_bound = (
        isinstance(response, ScrapflyScrapyResponse)
        and response.scrape_config.session is not None
    )
    if session_bound:
        return result  # bypass - already handled by scrapfly session system
    return ScrapyRefererMidleware.process_spider_output(self, response, result, spider)
Expand source code
def request_scheduled(self, request, spider):
    """Skip referer bookkeeping for requests tied to a Scrapfly session."""
    if isinstance(request, ScrapflyScrapyRequest):
        if request.scrape_config.session is not None:
            return  # bypass - already handled by scrapfly session system
    ScrapyRefererMidleware.request_scheduled(self, request, spider)