Module scrapfly.scrapy.middleware
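
Scrapy integration middlewares for the Scrapfly SDK: ScrapflyMiddleware is a downloader middleware that tags outgoing ScrapflyScrapyRequest objects and retries failed Scrapfly API calls, while ScrapflyRefererMiddleware is a spider middleware that defers Referer handling to Scrapfly's session system.

A minimal sketch of wiring both middlewares into a Scrapy project follows. DOWNLOADER_MIDDLEWARES and SPIDER_MIDDLEWARES are standard Scrapy settings; the priority values are illustrative assumptions, not values mandated by the SDK.

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'scrapfly.scrapy.middleware.ScrapflyMiddleware': 543,  # priority is an assumption
}
SPIDER_MIDDLEWARES = {
    # Disable Scrapy's built-in RefererMiddleware in favour of the Scrapfly
    # variant, which skips Referer bookkeeping for session-bound requests.
    'scrapy.spidermiddlewares.referer.RefererMiddleware': None,
    'scrapfly.scrapy.middleware.ScrapflyRefererMiddleware': 700,  # priority is an assumption
}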

Expand source code
from typing import Union, Optional, Iterable

from scrapy import Spider
from scrapy.http import Request, Response
from scrapy.spidermiddlewares.referer import RefererMiddleware as ScrapyRefererMiddleware
from twisted.web._newclient import ResponseNeverReceived

from .spider import ScrapflySpider
from .request import ScrapflyScrapyRequest
from .response import ScrapflyScrapyResponse

from .. import HttpError, ScrapflyError


# spider middleware
class ScrapflyRefererMiddleware(ScrapyRefererMiddleware):

    def process_spider_output(self, response, result, spider) -> Iterable:
        if isinstance(response, ScrapflyScrapyResponse) and response.scrape_config.session is not None:
            return result # bypass - already handled by scrapfly session system

        return ScrapyRefererMiddleware.process_spider_output(self, response, result, spider)

    def request_scheduled(self, request, spider):
        if isinstance(request, ScrapflyScrapyRequest) and request.scrape_config.session is not None:
            return # bypass - already handled by scrapfly session system

        ScrapyRefererMiddleware.request_scheduled(self, request, spider)


# downloader middleware
class ScrapflyMiddleware:
    MAX_API_RETRIES = 20

    def process_request(self, request: Union[Request, ScrapflyScrapyRequest], spider: Union[Spider, ScrapflySpider]) -> Optional[ScrapflyScrapyResponse]:
        if not isinstance(request, ScrapflyScrapyRequest):
            return None

        if not isinstance(spider, ScrapflySpider):
            raise RuntimeError('ScrapflyScrapyRequest must be fired from ScrapflySpider, %s given' % type(spider))

        if request.scrape_config.tags is None:
            request.scrape_config.tags = set()

        request.scrape_config.tags.add(spider.name)
        request.scrape_config.tags.add(str(spider.run_id))

        if request.scrape_config.proxy_pool is None and spider.settings.get('SCRAPFLY_PROXY_POOL'):
            request.scrape_config.proxy_pool = spider.settings.get('SCRAPFLY_PROXY_POOL')

        return None

    def process_exception(self, request, exception: Union[str, Exception], spider: ScrapflySpider):
        delay = 1

        if isinstance(exception, ResponseNeverReceived):
            return spider.retry(request, exception, delay)

        if isinstance(exception, ScrapflyError):
            if exception.is_retryable:
                if isinstance(exception, HttpError) and exception.response is not None:
                    if 'retry-after' in exception.response.headers:
                        delay = int(exception.response.headers['retry-after'])

                return spider.retry(request, exception, delay)

            custom_retry_codes = spider.settings.get('SCRAPFLY_CUSTOM_RETRY_CODE')
            if custom_retry_codes and exception.code in custom_retry_codes:
                return spider.retry(request, exception, delay)

        raise exception

    def process_response(self, request: Union[Request, ScrapflyScrapyRequest], response: Union[Response, ScrapflyScrapyResponse], spider: Union[Spider, ScrapflySpider]) -> Union[ScrapflyScrapyResponse, ScrapflyScrapyRequest]:
        return response

Classes

class ScrapflyMiddleware
Expand source code
class ScrapflyMiddleware:
    MAX_API_RETRIES = 20

    def process_request(self, request: Union[Request, ScrapflyScrapyRequest], spider: Union[Spider, ScrapflySpider]) -> Optional[ScrapflyScrapyResponse]:
        if not isinstance(request, ScrapflyScrapyRequest):
            return None

        if not isinstance(spider, ScrapflySpider):
            raise RuntimeError('ScrapflyScrapyRequest must be fired from ScrapflySpider, %s given' % type(spider))

        if request.scrape_config.tags is None:
            request.scrape_config.tags = set()

        request.scrape_config.tags.add(spider.name)
        request.scrape_config.tags.add(str(spider.run_id))

        if request.scrape_config.proxy_pool is None and spider.settings.get('SCRAPFLY_PROXY_POOL'):
            request.scrape_config.proxy_pool = spider.settings.get('SCRAPFLY_PROXY_POOL')

        return None

    def process_exception(self, request, exception: Union[str, Exception], spider: ScrapflySpider):
        delay = 1

        if isinstance(exception, ResponseNeverReceived):
            return spider.retry(request, exception, delay)

        if isinstance(exception, ScrapflyError):
            if exception.is_retryable:
                if isinstance(exception, HttpError) and exception.response is not None:
                    if 'retry-after' in exception.response.headers:
                        delay = int(exception.response.headers['retry-after'])

                return spider.retry(request, exception, delay)

            custom_retry_codes = spider.settings.get('SCRAPFLY_CUSTOM_RETRY_CODE')
            if custom_retry_codes and exception.code in custom_retry_codes:
                return spider.retry(request, exception, delay)

        raise exception

    def process_response(self, request: Union[Request, ScrapflyScrapyRequest], response: Union[Response, ScrapflyScrapyResponse], spider: Union[Spider, ScrapflySpider]) -> Union[ScrapflyScrapyResponse, ScrapflyScrapyRequest]:
        return response

Class variables

var MAX_API_RETRIES
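Upper bound on Scrapfly API retry attempts (set to 20). Nothing in this module reads it, so it is presumably consumed by the spider's retry machinery; treat that reading as an assumption.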

Methods

def process_exception(self, request, exception: Union[str, Exception], spider: ScrapflySpider)
Expand source code
def process_exception(self, request, exception: Union[str, Exception], spider: ScrapflySpider):
    delay = 1

    if isinstance(exception, ResponseNeverReceived):
        return spider.retry(request, exception, delay)

    if isinstance(exception, ScrapflyError):
        if exception.is_retryable:
            if isinstance(exception, HttpError) and exception.response is not None:
                if 'retry-after' in exception.response.headers:
                    delay = int(exception.response.headers['retry-after'])

            return spider.retry(request, exception, delay)

        custom_retry_codes = spider.settings.get('SCRAPFLY_CUSTOM_RETRY_CODE')
        if custom_retry_codes and exception.code in custom_retry_codes:
            return spider.retry(request, exception, delay)

    raise exception
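
Connection failures (ResponseNeverReceived) and retryable Scrapfly errors are re-scheduled through spider.retry, honouring a retry-after response header when present. Errors the SDK marks non-retryable can still be retried by listing their codes in the SCRAPFLY_CUSTOM_RETRY_CODE setting. A sketch, assuming error codes are strings; the code shown is a hypothetical placeholder:

# settings.py
# Retry these Scrapfly error codes even though the SDK marks them
# non-retryable. 'ERR::EXAMPLE::CODE' is a hypothetical placeholder.
SCRAPFLY_CUSTOM_RETRY_CODE = ['ERR::EXAMPLE::CODE']
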
def process_request(self, request: Union[scrapy.http.request.Request, ScrapflyScrapyRequest], spider: Union[scrapy.spiders.Spider, ScrapflySpider]) ‑> Optional[ScrapflyScrapyResponse]
Expand source code
def process_request(self, request: Union[Request, ScrapflyScrapyRequest], spider: Union[Spider, ScrapflySpider]) -> Optional[ScrapflyScrapyResponse]:
    if not isinstance(request, ScrapflyScrapyRequest):
        return None

    if not isinstance(spider, ScrapflySpider):
        raise RuntimeError('ScrapflyScrapyRequest must be fired from ScrapflySpider, %s given' % type(spider))

    if request.scrape_config.tags is None:
        request.scrape_config.tags = set()

    request.scrape_config.tags.add(spider.name)
    request.scrape_config.tags.add(str(spider.run_id))

    if request.scrape_config.proxy_pool is None and spider.settings.get('SCRAPFLY_PROXY_POOL'):
        request.scrape_config.proxy_pool = spider.settings.get('SCRAPFLY_PROXY_POOL')

    return None
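
process_request tags every ScrapflyScrapyRequest with the spider name and run id, and falls back to the SCRAPFLY_PROXY_POOL setting when the request's scrape config does not already pin a proxy pool. A sketch; the pool name is an illustrative assumption, not a value defined by this module:

# settings.py
# Fallback proxy pool, applied only when ScrapeConfig.proxy_pool is unset.
SCRAPFLY_PROXY_POOL = 'public_residential_pool'
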
def process_response(self, request: Union[scrapy.http.request.Request, ScrapflyScrapyRequest], response: Union[scrapy.http.response.Response, ScrapflyScrapyResponse], spider: Union[scrapy.spiders.Spider, ScrapflySpider]) ‑> Union[ScrapflyScrapyResponse, ScrapflyScrapyRequest]
Expand source code
def process_response(self, request: Union[Request, ScrapflyScrapyRequest], response: Union[Response, ScrapflyScrapyResponse], spider: Union[Spider, ScrapflySpider]) -> Union[ScrapflyScrapyResponse, ScrapflyScrapyRequest]:
    return response
class ScrapflyRefererMiddleware (settings=None)
Expand source code
class ScrapflyRefererMiddleware(ScrapyRefererMiddleware):

    def process_spider_output(self, response, result, spider) -> Iterable:
        if isinstance(response, ScrapflyScrapyResponse) and response.scrape_config.session is not None:
            return result # bypass - already handled by scrapfly session system

        return ScrapyRefererMiddleware.process_spider_output(self, response, result, spider)

    def request_scheduled(self, request, spider):
        if isinstance(request, ScrapflyScrapyRequest) and request.scrape_config.session is not None:
            return # bypass - already handled by scrapfly session system

        ScrapyRefererMiddleware.request_scheduled(self, request, spider)

Ancestors

  • scrapy.spidermiddlewares.referer.RefererMiddleware

Methods

def process_spider_output(self, response, result, spider) ‑> Iterable
Expand source code
def process_spider_output(self, response, result, spider) -> Iterable:
    if isinstance(response, ScrapflyScrapyResponse) and response.scrape_config.session is not None:
        return result # bypass - already handled by scrapfly session system

    return ScrapyRefererMiddleware.process_spider_output(self, response, result, spider)
def request_scheduled(self, request, spider)
Expand source code
def request_scheduled(self, request, spider):
    if isinstance(request, ScrapflyScrapyRequest) and request.scrape_config.session is not None:
        return # bypass - already handled by scrapfly session system

    ScrapyRefererMiddleware.request_scheduled(self, request, spider)
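
Both hooks short-circuit when the scrape config carries a Scrapfly session, leaving Referer management to the session system. A sketch of a spider whose requests take that bypass path; the ScrapflyScrapyRequest constructor keyword is assumed from the scrape_config attribute the middleware reads:

from scrapfly import ScrapeConfig
from scrapfly.scrapy.request import ScrapflyScrapyRequest
from scrapfly.scrapy.spider import ScrapflySpider

class SessionSpider(ScrapflySpider):
    name = 'session-example'

    def start_requests(self):
        # session is set, so ScrapflyRefererMiddleware leaves Referer
        # handling to the Scrapfly session system for this request.
        yield ScrapflyScrapyRequest(
            scrape_config=ScrapeConfig(url='https://example.com', session='my-session'),
            callback=self.parse,
        )

    def parse(self, response):
        yield {'url': response.url}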