Module scrapfly.scrapy.downloader

Classes

class BinaryBody (body: _io.BytesIO)
class BinaryBody(BytesIO):

    def __init__(self, body:BytesIO):
        self.body = body
        BytesIO.__init__(self)

    def encode(self, encoding:str):
        pass

Buffered I/O implementation using an in-memory bytes buffer.

Ancestors

  • _io.BytesIO
  • _io._BufferedIOBase
  • _io._IOBase

Methods

def encode(self, encoding: str)
def encode(self, encoding:str):
    pass
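
BinaryBody wraps an existing BytesIO and exposes a no-op encode(), so a binary payload can travel through code paths that expect an encodable body. A minimal, hypothetical sketch (the wrapped PDF bytes are purely illustrative):

from io import BytesIO
from scrapfly.scrapy.downloader import BinaryBody

# Hypothetical usage: wrap raw binary content (e.g. a PDF payload) so that
# downstream code calling .encode(...) becomes a harmless no-op instead of a failure.
raw = BytesIO(b"%PDF-1.7 binary payload")
body = BinaryBody(raw)

body.encode("utf-8")              # no-op by design
print(body.body.getvalue()[:8])   # the wrapped buffer is still reachable via .body
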
class BodyProducer (body)
@implementer(IBodyProducer)
class BodyProducer(object):
    def __init__(self, body):
        self.body = body
        self.length = len(body)

    def startProducing(self, consumer):
        consumer.write(self.body)
        return succeed(None)

    def pauseProducing(self):
        pass

    def stopProducing(self):
        pass

Methods

def pauseProducing(self)
def pauseProducing(self):
    pass
def startProducing(self, consumer)
def startProducing(self, consumer):
    consumer.write(self.body)
    return succeed(None)
def stopProducing(self)
def stopProducing(self):
    pass
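
BodyProducer is a minimal IBodyProducer: startProducing writes the whole body at once and returns an already-fired Deferred, while pauseProducing and stopProducing are no-ops. A sketch of handing it to a Twisted Agent, mirroring how download_request below builds its request (URL and header values are placeholders):

from twisted.internet import reactor
from twisted.web.client import Agent
from twisted.web.http_headers import Headers

from scrapfly.scrapy.downloader import BodyProducer

agent = Agent(reactor)

# BodyProducer.length lets the Agent emit a Content-Length header for the POST body.
d = agent.request(
    b"POST",
    b"https://example.com/endpoint",  # placeholder URL
    Headers({b"Content-Type": [b"application/json"]}),
    bodyProducer=BodyProducer(b'{"hello": "world"}'),
)
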
class BodyReceiver (deferred: twisted.internet.defer.Deferred)
class BodyReceiver(Protocol):

    def __init__(self, deferred:Deferred):
        self.deferred = deferred
        self.content = BytesIO()

    def dataReceived(self, bytes):
        self.content.write(bytes)

    def connectionLost(self, reason):
        self.deferred.callback(self.content.getvalue())

This is the base class for streaming connection-oriented protocols.

If you are going to write a new connection-oriented protocol for Twisted, start here. Any protocol implementation, either client or server, should be a subclass of this class.

The API is quite simple. Implement dataReceived to handle both event-based and synchronous input; output can be sent through the 'transport' attribute, which is to be an instance that implements twisted.internet.interfaces.ITransport. Override connectionLost to be notified when the connection ends.

Some subclasses exist already to help you write common types of protocols: see the twisted.protocols.basic module for a few of them.
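
In practice the receiver is paired with deliverBody: the Deferred passed to BodyReceiver fires with the complete body once the connection is closed, which is how _cb_bodydone in ScrapflyHTTPDownloader below consumes its responses. A minimal sketch (the URL is a placeholder):

from twisted.internet import reactor
from twisted.internet.defer import Deferred
from twisted.web.client import Agent

from scrapfly.scrapy.downloader import BodyReceiver

agent = Agent(reactor)

def read_body(twisted_response):
    # Stream the response body into the receiver; the Deferred fires with the
    # full bytes when connectionLost is called.
    body_deferred = Deferred()
    twisted_response.deliverBody(BodyReceiver(body_deferred))
    return body_deferred

d = agent.request(b"GET", b"https://example.com/")  # placeholder URL
d.addCallback(read_body)
d.addCallback(lambda body: print(len(body), "bytes received"))
d.addBoth(lambda _: reactor.stop())
reactor.run()
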

Ancestors

  • twisted.internet.protocol.Protocol
  • twisted.internet.protocol.BaseProtocol

Methods

def connectionLost(self, reason)
def connectionLost(self, reason):
    self.deferred.callback(self.content.getvalue())

Called when the connection is shut down.

Clear any circular references here, and any external references to this Protocol. The connection has been closed.

The reason argument is a twisted.python.failure.Failure.

def dataReceived(self, bytes)
def dataReceived(self, bytes):
    self.content.write(bytes)

Called whenever data is received.

Use this method to translate to a higher-level message. Usually, some callback will be made upon the receipt of each complete protocol message.

The data argument (named bytes in this implementation) is a string of indeterminate length. Please keep in mind that you will probably need to buffer some data, as partial (or multiple) protocol messages may be received! I recommend that unit tests for protocols call through to this method with differing chunk sizes, down to one byte at a time.
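
Following that advice, a small sketch that exercises BodyReceiver.dataReceived one byte at a time and checks that the Deferred fires with the reassembled payload:

from twisted.internet.defer import Deferred
from scrapfly.scrapy.downloader import BodyReceiver

payload = b"chunked payload"
received = []

deferred = Deferred()
deferred.addCallback(received.append)

receiver = BodyReceiver(deferred)
for i in range(len(payload)):
    receiver.dataReceived(payload[i:i + 1])  # deliver one byte per call
receiver.connectionLost(reason=None)         # fires the Deferred with the buffered body

assert received == [payload]
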

class ScrapflyHTTPDownloader (settings, crawler=None)
class ScrapflyHTTPDownloader:

    def __init__(self, settings, crawler=None):
        self._crawler = crawler
        self.agent = Agent(reactor)
        self._donwload_handler = DownloadHandlers(crawler)

        # Restore Scrapy's default download handlers for http/https so requests that are not ScrapflyScrapyRequest follow Scrapy's default behavior
        settings_without_scrapfly_http_downloader = copy(settings)
        del settings_without_scrapfly_http_downloader['DOWNLOAD_HANDLERS']['http']
        del settings_without_scrapfly_http_downloader['DOWNLOAD_HANDLERS']['https']
        self._donwload_handler.handlers = without_none_values(settings_without_scrapfly_http_downloader.getwithbase("DOWNLOAD_HANDLERS"))

        for scheme, clspath in self._donwload_handler.handlers.items():
            self._donwload_handler._schemes[scheme] = clspath
            self._donwload_handler._load_handler(scheme, skip_lazy=True)
        # End

        if settings.get('SCRAPFLY_SSL_VERIFY') is False:
            import twisted.internet._sslverify as v
            v.platformTrust = lambda : None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings, crawler)

    def _cb_bodydone(self, twisted_response:Response, request:ScrapflyScrapyRequest, spider:ScrapflySpider) -> Deferred:

        headers = CaseInsensitiveDict()
        status_code = twisted_response.code
        reason = twisted_response.phrase.decode('utf-8')

        for name, values in twisted_response.headers.getAllRawHeaders():
            headers[name.decode('utf-8')] = '; '.join([value.decode('utf-8') for value in values])

        deferred = Deferred()
        body_receiver = BodyReceiver(deferred)

        if 'x-scrapfly-api-cost' in headers:
            self._crawler.stats.inc_value('scrapfly/api_call_cost', count=int(headers['x-scrapfly-api-cost']))

        def on_body_downloaded(body):
            if 'content-encoding' in headers:
                if headers['content-encoding'] == 'gzip':
                    body = zlib.decompress(body, 16+zlib.MAX_WBITS)
                elif headers['content-encoding'] == 'br':
                    try:
                        try:
                            import brotlicffi as brotli
                        except ImportError:
                            import brotli
                    except ImportError:
                        print('You must run pip install scrapfly-sdk[speedups] - brotli is missing - or disable brotli compression')
                        raise

                    body = brotli.decompress(body)
        
            response = requests.Response()
            response.status_code = status_code
            response.reason = reason
            response._content = body

            response.headers.update(headers)
            response.url = request.url

            request.scrape_config.raise_on_upstream_error = False

            scrapfly_api_response:ScrapeApiResponse = spider.scrapfly_client._handle_response(
                response=response,
                scrape_config=request.scrape_config
            )

            return ScrapflyScrapyResponse(request=request, scrape_api_response=scrapfly_api_response)

        deferred.addCallback(on_body_downloaded)
        twisted_response.deliverBody(body_receiver)

        return deferred

    def download_request(self, request, spider):
        if not isinstance(request, ScrapflyScrapyRequest) or not isinstance(spider, ScrapflySpider):
            return mustbe_deferred(self._donwload_handler.download_request, request, spider)

        request_data = spider.scrapfly_client._scrape_request(scrape_config=request.scrape_config)

        uri = '%s?%s' % (request_data['url'], urlencode(request_data['params']))

        request_kwargs = {
            'method': request_data['method'].encode('utf-8'),
            'uri': uri.encode('utf-8'),
            'headers': Headers({name: [value] for name, value in request_data['headers'].items()})
        }

        if request_data['method'] in ['POST', 'PUT', 'PATCH']:
            request_kwargs['bodyProducer'] = BodyProducer(request_data['data'].encode('utf-8'))

        d = self.agent.request(**request_kwargs)
        d.addCallback(self._cb_bodydone, request, spider)

        return d

    def close(self):
        pass
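
The constructor removes the http/https entries from DOWNLOAD_HANDLERS and reloads Scrapy's defaults for them, which implies the downloader itself is registered under both schemes. A plausible settings.py sketch (the exact registration keys are an assumption; check the SDK documentation):

# settings.py (sketch, assumed registration)

# Route http/https through the Scrapfly downloader; plain Scrapy requests
# still fall back to the default handlers restored in __init__ above.
DOWNLOAD_HANDLERS = {
    "http": "scrapfly.scrapy.downloader.ScrapflyHTTPDownloader",
    "https": "scrapfly.scrapy.downloader.ScrapflyHTTPDownloader",
}

# Optional: disable TLS verification (see the SCRAPFLY_SSL_VERIFY check in __init__).
SCRAPFLY_SSL_VERIFY = False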

Static methods

def from_crawler(crawler)

Methods

def close(self)
def close(self):
    pass
def download_request(self, request, spider)
def download_request(self, request, spider):
    if not isinstance(request, ScrapflyScrapyRequest) or not isinstance(spider, ScrapflySpider):
        return mustbe_deferred(self._donwload_handler.download_request, request, spider)

    request_data = spider.scrapfly_client._scrape_request(scrape_config=request.scrape_config)

    uri = '%s?%s' % (request_data['url'], urlencode(request_data['params']))

    request_kwargs = {
        'method': request_data['method'].encode('utf-8'),
        'uri': uri.encode('utf-8'),
        'headers': Headers({name: [value] for name, value in request_data['headers'].items()})
    }

    if request_data['method'] in ['POST', 'PUT', 'PATCH']:
        request_kwargs['bodyProducer'] = BodyProducer(request_data['data'].encode('utf-8'))

    d = self.agent.request(**request_kwargs)
    d.addCallback(self._cb_bodydone, request, spider)

    return d