Module scrapfly.scrapy
from typing import Tuple
from functools import cache
from .request import ScrapflyScrapyRequest
from .response import ScrapflyScrapyResponse
from .middleware import ScrapflyMiddleware
from .spider import ScrapflySpider, ScrapflyCrawlSpider
from .pipelines import FilesPipeline, ImagesPipeline
current_scrapy_version = 0

@cache
def comparable_version(version: str) -> int:
    l = [int(x, 10) for x in version.split('.')]
    l.reverse()
    return sum(x * (10 ** i) for i, x in enumerate(l))

try:
    from scrapy import __version__
    current_scrapy_version = comparable_version(__version__)
except ModuleNotFoundError:
    # scrapy is not installed; keep the default of 0
    pass

__all__: Tuple[str, ...] = (
    'ScrapflyScrapyRequest',
    'ScrapflyScrapyResponse',
    'ScrapflyMiddleware',
    'ScrapflySpider',
    'ScrapflyCrawlSpider',
    'FilesPipeline',
    'ImagesPipeline',
    'current_scrapy_version',
    'comparable_version'
)
Sub-modules
scrapfly.scrapy.downloader
scrapfly.scrapy.middleware
scrapfly.scrapy.pipelines
scrapfly.scrapy.request
scrapfly.scrapy.response
scrapfly.scrapy.spider
Functions
def comparable_version(version: str) ‑> int
-
Convert a dotted version string into a single integer so versions can be compared numerically.
@cache
def comparable_version(version: str) -> int:
    l = [int(x, 10) for x in version.split('.')]
    l.reverse()
    return sum(x * (10 ** i) for i, x in enumerate(l))
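A short usage sketch (illustrative, not part of the module source): each dot-separated component is weighted by an increasing power of ten, which is sufficient for the Scrapy version gates used in this package.
from scrapfly.scrapy import comparable_version, current_scrapy_version

# '2.11.0' -> 0*1 + 11*10 + 2*100 = 310, '2.10.0' -> 300
assert comparable_version('2.11.0') > comparable_version('2.10.0')

if current_scrapy_version >= comparable_version('2.11.0'):
    print('running on scrapy 2.11 or newer')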
Classes
class FilesPipeline (store_uri, download_func=None, settings=None)
-
Abstract pipeline that implements file downloading.
This pipeline tries to minimize network transfers and file processing by stat-ing stored files and determining whether each file is new, up to date, or expired.
- new: files the pipeline has never processed; they need to be downloaded from the supplier site for the first time.
- uptodate: files the pipeline has already processed and that are still valid.
- expired: files the pipeline has processed before, but whose last modification is old enough that reprocessing is recommended to pick up any changes.
class FilesPipeline(ScrapyFilesPipeline):

    def get_media_requests(self, item, info):
        scrape_configs = ItemAdapter(item).get(self.files_urls_field, [])

        requests = []

        for config in scrape_configs:
            # If the pipeline was not migrated to scrapfly, config is a plain URL
            # instead of a ScrapeConfig object - auto migrate it
            if isinstance(config, str):
                config = ScrapeConfig(url=config)

            if isinstance(config, ScrapeConfig):
                requests.append(ScrapflyScrapyRequest(scrape_config=config))
            else:
                raise ValueError('FilesPipeline item must be a ScrapeConfig object or a string URL')

        return requests
Ancestors
- scrapy.pipelines.files.FilesPipeline
- scrapy.pipelines.media.MediaPipeline
Methods
def get_media_requests(self, item, info)
-
Returns the media requests to download
def get_media_requests(self, item, info):
    scrape_configs = ItemAdapter(item).get(self.files_urls_field, [])

    requests = []

    for config in scrape_configs:
        # If the pipeline was not migrated to scrapfly, config is a plain URL
        # instead of a ScrapeConfig object - auto migrate it
        if isinstance(config, str):
            config = ScrapeConfig(url=config)

        if isinstance(config, ScrapeConfig):
            requests.append(ScrapflyScrapyRequest(scrape_config=config))
        else:
            raise ValueError('FilesPipeline item must be a ScrapeConfig object or a string URL')

    return requests
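For illustration, a minimal sketch of an item feeding this pipeline; it assumes Scrapy's default files_urls_field ('file_urls') and a configured FILES_STORE, neither of which appears in the source above.
from scrapfly import ScrapeConfig

# Item as yielded from a spider callback; file_urls entries may be ScrapeConfig
# objects or plain string URLs (strings are auto-migrated by get_media_requests).
item = {
    'file_urls': [
        ScrapeConfig(url='https://example.com/report.pdf'),
        'https://example.com/archive.zip',
    ],
}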
class ImagesPipeline (store_uri, download_func=None, settings=None)
-
Abstract pipeline that implements the image thumbnail generation logic.
class ImagesPipeline(ScrapyImagesPipeline):

    def get_media_requests(self, item, info):
        scrape_configs = ItemAdapter(item).get(self.images_urls_field, [])

        requests = []

        for config in scrape_configs:
            # If the pipeline was not migrated to scrapfly, config is a plain URL
            # instead of a ScrapeConfig object - auto migrate it
            if isinstance(config, str):
                config = ScrapeConfig(url=config)

            if isinstance(config, ScrapeConfig):
                requests.append(ScrapflyScrapyRequest(scrape_config=config))
            else:
                raise ValueError('ImagesPipeline item must be a ScrapeConfig object or a string URL')

        return requests
Ancestors
- scrapy.pipelines.images.ImagesPipeline
- scrapy.pipelines.files.FilesPipeline
- scrapy.pipelines.media.MediaPipeline
Methods
def get_media_requests(self, item, info)
-
Returns the media requests to download
def get_media_requests(self, item, info):
    scrape_configs = ItemAdapter(item).get(self.images_urls_field, [])

    requests = []

    for config in scrape_configs:
        # If the pipeline was not migrated to scrapfly, config is a plain URL
        # instead of a ScrapeConfig object - auto migrate it
        if isinstance(config, str):
            config = ScrapeConfig(url=config)

        if isinstance(config, ScrapeConfig):
            requests.append(ScrapflyScrapyRequest(scrape_config=config))
        else:
            raise ValueError('ImagesPipeline item must be a ScrapeConfig object or a string URL')

    return requests
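As with the files pipeline, items reach this pipeline through Scrapy's default images_urls_field ('image_urls'); the store path below is an illustrative assumption.
from scrapfly import ScrapeConfig

# Hypothetical settings.py fragment: IMAGES_STORE is Scrapy's standard setting
# telling ImagesPipeline where to persist downloaded images.
IMAGES_STORE = './images'

# Item as yielded from a spider callback; entries may be ScrapeConfig objects
# or plain string URLs.
item = {'image_urls': [ScrapeConfig(url='https://example.com/cover.jpg')]}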
class ScrapflyCrawlSpider (*a, **kw)
-
Base class for scrapy spiders. All spiders must inherit from this class.
class ScrapflyCrawlSpider(ScrapflySpider):

    def _scrape_config_factory(self, rule_index, link):
        return ScrapeConfig(url=link.url)

    def _build_request(self, rule_index, link):
        return ScrapflyScrapyRequest(
            scrape_config=self._scrape_config_factory(rule_index, link),
            callback=self._callback,
            errback=self._errback,
            meta=dict(rule=rule_index, link_text=link.text),
        )

    rules: Sequence[Rule] = ()

    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)
        self._compile_rules()

    def _parse(self, response, **kwargs):
        return self._parse_response(
            response=response,
            callback=self.parse_start_url,
            cb_kwargs=kwargs,
            follow=True,
        )

    def parse_start_url(self, response, **kwargs):
        return []

    def process_results(self, response, results):
        return results

    def _requests_to_follow(self, response):
        if not isinstance(response, ScrapflyScrapyResponse):
            return
        seen = set()
        for rule_index, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response) if lnk not in seen]
            for link in rule.process_links(links):
                seen.add(link)
                request = self._build_request(rule_index, link)
                yield rule.process_request(request, response)

    def _callback(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _errback(self, failure):
        rule = self._rules[failure.request.meta['rule']]
        return self._handle_failure(failure, rule.errback)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for request_or_item in iterate_spider_output(cb_res):
                yield request_or_item

        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item

    def _handle_failure(self, failure, errback):
        if errback:
            results = errback(failure) or ()
            for request_or_item in iterate_spider_output(results):
                yield request_or_item

    def _compile_rules(self):
        self._rules = []
        for rule in self.rules:
            self._rules.append(copy.copy(rule))
            self._rules[-1]._compile(self)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
        return spider
Ancestors
- ScrapflySpider
- scrapy.spiders.Spider
- scrapy.utils.trackref.object_ref
Class variables
var rules : Sequence[scrapy.spiders.crawl.Rule]
Static methods
def from_crawler(crawler, *args, **kwargs)
-
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super().from_crawler(crawler, *args, **kwargs)
    spider._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
    return spider
Methods
def parse_start_url(self, response, **kwargs)
-
def parse_start_url(self, response, **kwargs):
    return []
def process_results(self, response, results)
-
def process_results(self, response, results):
    return results
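For orientation, a minimal sketch of a crawl spider built on this class; the spider name, domain, link-extractor pattern, and callback are illustrative assumptions.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapfly import ScrapeConfig
from scrapfly.scrapy import ScrapflyCrawlSpider

class ExampleCrawlSpider(ScrapflyCrawlSpider):
    name = 'example-crawl'
    # start_urls must contain ScrapeConfig objects (see ScrapflySpider.start_requests)
    start_urls = [ScrapeConfig(url='https://example.com/catalog')]
    # extracted links are turned into ScrapflyScrapyRequest objects by _build_request
    rules = (
        Rule(LinkExtractor(allow=r'/product/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}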
class ScrapflyMiddleware
-
class ScrapflyMiddleware:
    MAX_API_RETRIES = 20

    def process_request(self, request: Union[Request, ScrapflyScrapyRequest], spider: Union[Spider, ScrapflySpider]) -> Optional[ScrapflyScrapyResponse]:
        if not isinstance(request, ScrapflyScrapyRequest):
            return None

        if not isinstance(spider, ScrapflySpider):
            raise RuntimeError('ScrapflyScrapyRequest must be fired from ScrapflySpider, %s given' % type(spider))

        if request.scrape_config.tags is None:
            request.scrape_config.tags = set()

        request.scrape_config.tags.add(spider.name)
        request.scrape_config.tags.add(str(spider.run_id))

        if request.scrape_config.proxy_pool is None and spider.settings.get('SCRAPFLY_PROXY_POOL'):
            request.scrape_config.proxy_pool = spider.settings.get('SCRAPFLY_PROXY_POOL')

        return None

    def process_exception(self, request, exception: Union[str, Exception], spider: ScrapflySpider):
        delay = 1

        if isinstance(exception, ResponseNeverReceived):
            return spider.retry(request, exception, delay)

        if isinstance(exception, ScrapflyError):
            if exception.is_retryable:
                if isinstance(exception, HttpError) and exception.response is not None:
                    if 'retry-after' in exception.response.headers:
                        delay = int(exception.response.headers['retry-after'])
                return spider.retry(request, exception, delay)

            if spider.settings.get('SCRAPFLY_CUSTOM_RETRY_CODE', False) and exception.code in spider.settings.get('SCRAPFLY_CUSTOM_RETRY_CODE'):
                return spider.retry(request, exception, delay)

        raise exception

    def process_response(self, request: Union[Request, ScrapflyScrapyRequest], response: Union[Response, ScrapflyScrapyResponse], spider: Union[Spider, ScrapflySpider]) -> Union[ScrapflyScrapyResponse, ScrapflyScrapyRequest]:
        return response
Class variables
var MAX_API_RETRIES
Methods
def process_exception(self, request, exception: Union[str, Exception], spider: ScrapflySpider)
-
def process_exception(self, request, exception: Union[str, Exception], spider: ScrapflySpider):
    delay = 1

    if isinstance(exception, ResponseNeverReceived):
        return spider.retry(request, exception, delay)

    if isinstance(exception, ScrapflyError):
        if exception.is_retryable:
            if isinstance(exception, HttpError) and exception.response is not None:
                if 'retry-after' in exception.response.headers:
                    delay = int(exception.response.headers['retry-after'])
            return spider.retry(request, exception, delay)

        if spider.settings.get('SCRAPFLY_CUSTOM_RETRY_CODE', False) and exception.code in spider.settings.get('SCRAPFLY_CUSTOM_RETRY_CODE'):
            return spider.retry(request, exception, delay)

    raise exception
def process_request(self, request: Union[scrapy.http.request.Request, ScrapflyScrapyRequest], spider: Union[scrapy.spiders.Spider, ScrapflySpider]) ‑> Optional[ScrapflyScrapyResponse]
-
def process_request(self, request: Union[Request, ScrapflyScrapyRequest], spider: Union[Spider, ScrapflySpider]) -> Optional[ScrapflyScrapyResponse]:
    if not isinstance(request, ScrapflyScrapyRequest):
        return None

    if not isinstance(spider, ScrapflySpider):
        raise RuntimeError('ScrapflyScrapyRequest must be fired from ScrapflySpider, %s given' % type(spider))

    if request.scrape_config.tags is None:
        request.scrape_config.tags = set()

    request.scrape_config.tags.add(spider.name)
    request.scrape_config.tags.add(str(spider.run_id))

    if request.scrape_config.proxy_pool is None and spider.settings.get('SCRAPFLY_PROXY_POOL'):
        request.scrape_config.proxy_pool = spider.settings.get('SCRAPFLY_PROXY_POOL')

    return None
def process_response(self, request: Union[scrapy.http.request.Request, ScrapflyScrapyRequest], response: Union[scrapy.http.response.Response, ScrapflyScrapyResponse], spider: Union[scrapy.spiders.Spider, ScrapflySpider]) ‑> Union[ScrapflyScrapyResponse, ScrapflyScrapyRequest]
-
def process_response(self, request: Union[Request, ScrapflyScrapyRequest], response: Union[Response, ScrapflyScrapyResponse], spider: Union[Spider, ScrapflySpider]) -> Union[ScrapflyScrapyResponse, ScrapflyScrapyRequest]:
    return response
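The middleware reads a handful of project settings; a sketch of how they might look in settings.py. The setting names come from the source above, but the concrete values (pool name, error code) are illustrative, so check Scrapfly's documentation for the ones valid for your subscription.
# Hypothetical settings.py fragment
SCRAPFLY_API_KEY = 'YOUR-SCRAPFLY-KEY'            # consumed by ScrapflySpider.from_crawler
SCRAPFLY_PROXY_POOL = 'public_residential_pool'   # applied when a request's scrape_config has no proxy_pool
SCRAPFLY_CUSTOM_RETRY_CODE = ['ERR::THROTTLE::MAX_CONCURRENT_REQUEST_EXCEEDED']  # extra retryable Scrapfly error codes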
class ScrapflyScrapyRequest (scrape_config: ScrapeConfig, meta: Dict = {}, *args, **kwargs)
-
Represents an HTTP request, which is usually generated in a Spider and executed by the Downloader, thus generating a Response.
class ScrapflyScrapyRequest(Request):

    scrape_config: ScrapeConfig
    # url:str inherited
    # method:str inherited
    # body:bytes inherited
    # headers:Dict inherited
    # encoding:Dict inherited

    def __init__(self, scrape_config: ScrapeConfig, meta: Dict = {}, *args, **kwargs):
        self.scrape_config = scrape_config
        meta['scrapfly_scrape_config'] = self.scrape_config

        super().__init__(
            *args,
            url=self.scrape_config.url,
            headers=self.scrape_config.headers,
            cookies=self.scrape_config.cookies,
            body=self.scrape_config.body,
            meta=meta,
            **kwargs
        )

    def replace(self, *args, **kwargs):
        for x in [
            'meta',
            'flags',
            'encoding',
            'priority',
            'dont_filter',
            'callback',
            'errback',
            'cb_kwargs',
        ]:
            kwargs.setdefault(x, getattr(self, x))

        kwargs['scrape_config'] = deepcopy(self.scrape_config)

        cls = kwargs.pop('cls', self.__class__)

        return cls(*args, **kwargs)
Ancestors
- scrapy.http.request.Request
- scrapy.utils.trackref.object_ref
Class variables
var scrape_config : ScrapeConfig
Methods
def replace(self, *args, **kwargs)
-
Create a new Request with the same attributes except for those given new values
def replace(self, *args, **kwargs):
    for x in [
        'meta',
        'flags',
        'encoding',
        'priority',
        'dont_filter',
        'callback',
        'errback',
        'cb_kwargs',
    ]:
        kwargs.setdefault(x, getattr(self, x))

    kwargs['scrape_config'] = deepcopy(self.scrape_config)

    cls = kwargs.pop('cls', self.__class__)

    return cls(*args, **kwargs)
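A minimal construction sketch (illustrative; render_js and country are standard ScrapeConfig options, not specific to this class).
from scrapfly import ScrapeConfig
from scrapfly.scrapy import ScrapflyScrapyRequest

# The request takes url/headers/cookies/body from the ScrapeConfig in __init__,
# and the config itself travels along in request.meta['scrapfly_scrape_config'].
request = ScrapflyScrapyRequest(
    scrape_config=ScrapeConfig(
        url='https://example.com/next-page',
        render_js=True,   # ask Scrapfly to render JavaScript
        country='us',     # route the scrape through a US proxy
    ),
)

# replace() deep-copies the scrape_config, so the clone can be tweaked safely.
retry_request = request.replace(dont_filter=True)
retry_request.scrape_config.cache_clear = True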
class ScrapflyScrapyResponse (request: ScrapflyScrapyRequest, scrape_api_response: ScrapeApiResponse)
-
An object that represents an HTTP response, which is usually downloaded (by the Downloader) and fed to the Spiders for processing.
class ScrapflyScrapyResponse(TextResponse):

    content: Union[str, BytesIO]
    scrape_api_response: ScrapeApiResponse

    context: Dict
    scrape_config: ScrapeConfig
    log_url: str
    status: str
    config: Dict
    success: bool
    duration: float
    format: str
    screenshots: Dict
    dns: Optional[Dict]
    ssl: Optional[Dict]
    iframes: Dict
    browser_data: Dict
    error: Optional[Dict]

    DEFAULT_ENCODING = 'utf-8'

    def __init__(self, request: ScrapflyScrapyRequest, scrape_api_response: ScrapeApiResponse):
        self.scrape_api_response = scrape_api_response
        self.content = self.scrape_api_response.scrape_result['content']

        self.context = self.scrape_api_response.context
        self.scrape_config = self.scrape_api_response.scrape_config
        self.log_url = self.scrape_api_response.scrape_result['log_url']
        self.status = self.scrape_api_response.scrape_result['status']
        self.success = self.scrape_api_response.scrape_result['success']
        self.duration = self.scrape_api_response.scrape_result['duration']
        self.format = self.scrape_api_response.scrape_result['format']
        self.screenshots = self.scrape_api_response.scrape_result['screenshots']
        self.dns = self.scrape_api_response.scrape_result['dns']
        self.ssl = self.scrape_api_response.scrape_result['ssl']
        self.iframes = self.scrape_api_response.scrape_result['iframes']
        self.browser_data = self.scrape_api_response.scrape_result['browser_data']
        self.error = self.scrape_api_response.scrape_result['error']
        self.ip_address = None

        if isinstance(self.content, str):
            content = self.content.encode('utf-8')
        elif isinstance(self.content, (BytesIO, TextIO)):
            content = self.content.read()
        else:
            raise RuntimeError('Unsupported body %s' % type(self.content))

        TextResponse.__init__(
            self,
            url=self.scrape_api_response.scrape_result['url'],
            status=self.scrape_api_response.scrape_result['status_code'],
            headers=self.scrape_api_response.scrape_result['response_headers'],
            body=content,
            request=request,
            ip_address=None
        )

    @property
    def __class__(self):
        response_headers = self.scrape_api_response.scrape_result['response_headers']

        if 'content-type' in response_headers and response_headers['content-type'].find('text/html') >= 0:
            return HtmlResponse
        elif 'content-type' in response_headers and response_headers['content-type'].find('application/xml') >= 0:
            return XmlResponse
        else:
            return TextResponse

    def sink(self, path: Optional[str] = None, name: Optional[str] = None, file: Optional[Union[TextIO, BytesIO]] = None):
        self.scrape_api_response.sink(path=path, name=name, file=file)
Ancestors
- scrapy.http.response.text.TextResponse
- scrapy.http.response.Response
- scrapy.utils.trackref.object_ref
Class variables
var DEFAULT_ENCODING
var browser_data : Dict
var config : Dict
var content : Union[str, _io.BytesIO]
var context : Dict
var dns : Optional[Dict]
var duration : float
var error : Optional[Dict]
var format : str
var iframes : Dict
var log_url : str
var scrape_api_response : ScrapeApiResponse
var scrape_config : ScrapeConfig
var screenshots : Dict
var ssl : Optional[Dict]
var status : str
var success : bool
Methods
def sink(self, path: Optional[str] = None, name: Optional[str] = None, file: Optional[Union[TextIO, _io.BytesIO]] = None)
-
def sink(self, path: Optional[str] = None, name: Optional[str] = None, file: Optional[Union[TextIO, BytesIO]] = None):
    self.scrape_api_response.sink(path=path, name=name, file=file)
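A short sketch of a callback using the extra response attributes; the spider name, URL, selector, and ./dumps path are illustrative assumptions.
from scrapfly import ScrapeConfig
from scrapfly.scrapy import ScrapflySpider

class DumpSpider(ScrapflySpider):
    name = 'dump'
    start_urls = [ScrapeConfig(url='https://example.com')]

    def parse(self, response):
        # response is a ScrapflyScrapyResponse: Scrapfly metadata is available
        # alongside the usual TextResponse/selector API.
        if not response.success:
            self.logger.warning('scrape failed, see %s', response.log_url)
            return
        yield {'title': response.css('title::text').get(), 'duration': response.duration}
        # persist the raw API result through the underlying ScrapeApiResponse
        response.sink(path='./dumps')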
class ScrapflySpider (name: Optional[str] = None, **kwargs: Any)
-
Base class for scrapy spiders. All spiders must inherit from this class.
class ScrapflySpider(scrapy.Spider):

    scrapfly_client: ScrapflyClient
    account_info: Dict
    run_id: int

    custom_settings: Dict = {}

    # settings merged on top of the project settings by update_settings(): swap
    # scrapy's stock download handlers, middlewares and pipelines for Scrapfly-aware ones
    scrapfly_settings: Dict = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapfly.scrapy.middleware.ScrapflyMiddleware': 725,
            'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': None,
            'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': None,
            'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': None,
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': None,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': None,
            'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
        },
        'DOWNLOAD_HANDLERS_BASE': {
            'http': 'scrapfly.scrapy.downloader.ScrapflyHTTPDownloader',
            'https': 'scrapfly.scrapy.downloader.ScrapflyHTTPDownloader'
        },
        'SPIDER_MIDDLEWARES': {
            'scrapfly.scrapy.middleware.ScrapflyRefererMiddleware': 10,
            'scrapy.spidermiddlewares.referer.RefererMiddleware': None,
        },
        'ITEM_PIPELINES': {
            'scrapfly.scrapy.pipelines.FilesPipeline': 1,
            'scrapfly.scrapy.pipelines.ImagesPipeline': 1,
            'scrapy.pipelines.files.FilesPipeline': None,
            'scrapy.pipelines.images.ImagesPipeline': None
        }
    }

    @classmethod
    def _merge_settings(cls, d, u):
        for k, v in u.items():
            if isinstance(v, collections.abc.Mapping):
                d[k] = cls._merge_settings(d.get(k, {}), v)
            else:
                d[k] = v
        return d

    @classmethod
    def update_settings(cls, settings):
        settings.update(cls._merge_settings(dict(settings), cls.scrapfly_settings), priority='spider')

    @cached_property
    def run_id(self):
        return environ.get('SPIDER_RUN_ID') or str(uuid.uuid4())

    def closed(self, reason: str):
        self.scrapfly_client.close()

    def start_requests(self) -> Iterable[ScrapflyScrapyRequest]:
        for scrape_config in self.start_urls:
            if not isinstance(scrape_config, ScrapeConfig):
                raise RuntimeError('start_urls must contain ScrapeConfig objects when using ScrapflySpider')
            yield ScrapflyScrapyRequest(scrape_config=scrape_config)

    def retry(self, request: ScrapflyScrapyRequest, reason: Union[str, Exception], delay: Optional[int] = None):
        logger.info('==> Retrying request for reason %s' % reason)
        stats = self.crawler.stats
        retries = request.meta.get('retry_times', 0) + 1

        if retries >= self.custom_settings.get('SCRAPFLY_MAX_API_RETRIES', 5):
            return None

        retryreq = request.replace(dont_filter=True)
        retryreq.priority += 100

        if retryreq.scrape_config.cache is True:
            retryreq.scrape_config.cache_clear = True

        retryreq.meta['retry_times'] = retries

        if stats:
            stats.inc_value('scrapfly/api_retry/count')

            if isinstance(reason, ScrapflyError):
                stats.inc_value(f'scrapfly/api_retry/{reason.code}')

        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)

        logger.warning(f"Retrying {request} for x{retries - 1}: {reason}", extra={'spider': self})

        if delay is None:
            deferred = Deferred()
            deferred.addCallback(self.crawler.engine.schedule, request=retryreq, spider=self)
        else:
            from twisted.internet import reactor  # prevent reactor already installed issue
            from . import current_scrapy_version, comparable_version
            if current_scrapy_version >= comparable_version('2.10.0'):
                deferred = task.deferLater(reactor, delay, self.crawler.engine.crawl, retryreq)
            else:
                deferred = task.deferLater(reactor, delay, self.crawler.engine.crawl, retryreq, self)

        return deferred

    @classmethod
    def from_crawler(cls, crawler: Crawler, *args, **kwargs):
        from . import current_scrapy_version, comparable_version

        scrapfly_client = ScrapflyClient(
            key=crawler.settings.get('SCRAPFLY_API_KEY'),
            host=crawler.settings.get('SCRAPFLY_HOST', ScrapflyClient.HOST),
            verify=crawler.settings.get('SCRAPFLY_SSL_VERIFY', True),
            debug=crawler.settings.get('SCRAPFLY_DEBUG', False),
            distributed_mode=crawler.settings.get('SCRAPFLY_DISTRIBUTED_MODE', False),
            connect_timeout=crawler.settings.get('SCRAPFLY_CONNECT_TIMEOUT', ScrapflyClient.DEFAULT_CONNECT_TIMEOUT),
            read_timeout=crawler.settings.get('SCRAPFLY_READ_TIMEOUT', ScrapflyClient.DEFAULT_READ_TIMEOUT),
        )

        settings_max_concurrency = crawler.settings.get('CONCURRENT_REQUESTS', -1)
        account_info = scrapfly_client.account()

        if account_info['account']['suspended'] is True:
            raise RuntimeError('Your account is suspended, please check your subscription status. Reason: %s' % account_info['account']['suspension_reason'])

        max_account_concurrency = account_info['subscription']['max_concurrency']
        project_concurrency_limit = account_info['project']['concurrency_limit']
        maximum_allowed_concurrency = max_account_concurrency

        if project_concurrency_limit is not None:
            maximum_allowed_concurrency = project_concurrency_limit

        if settings_max_concurrency == -1:
            crawler.settings.set('CONCURRENT_REQUESTS', maximum_allowed_concurrency, 255)
            logger.warning('Concurrent request auto configured to %d' % maximum_allowed_concurrency)
        else:
            if settings_max_concurrency > maximum_allowed_concurrency:
                logger.warning('==> Your maximum concurrency has been adjusted following your subscription because it\'s misconfigured. Configured: %d, Maximum Allowed: %d' % (settings_max_concurrency, maximum_allowed_concurrency))
                crawler.settings.set('CONCURRENT_REQUESTS', maximum_allowed_concurrency, 255)

        if current_scrapy_version >= comparable_version('2.11.0'):
            crawler._apply_settings()

        if crawler.stats:
            crawler.stats.set_value('scrapfly/api_call_cost', 0)

        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        spider.scrapfly_client = scrapfly_client
        spider.scrapfly_client.version += "+scrapy@%s" % scrapy.__version__
        spider.scrapfly_client.open()

        return spider
Ancestors
- scrapy.spiders.Spider
- scrapy.utils.trackref.object_ref
Subclasses
- ScrapflyCrawlSpider
Class variables
var account_info : Dict
var custom_settings : Dict
var scrapfly_client : ScrapflyClient
var scrapfly_settings : Dict
Static methods
def from_crawler(crawler: scrapy.crawler.Crawler, *args, **kwargs)
-
@classmethod
def from_crawler(cls, crawler: Crawler, *args, **kwargs):
    from . import current_scrapy_version, comparable_version

    scrapfly_client = ScrapflyClient(
        key=crawler.settings.get('SCRAPFLY_API_KEY'),
        host=crawler.settings.get('SCRAPFLY_HOST', ScrapflyClient.HOST),
        verify=crawler.settings.get('SCRAPFLY_SSL_VERIFY', True),
        debug=crawler.settings.get('SCRAPFLY_DEBUG', False),
        distributed_mode=crawler.settings.get('SCRAPFLY_DISTRIBUTED_MODE', False),
        connect_timeout=crawler.settings.get('SCRAPFLY_CONNECT_TIMEOUT', ScrapflyClient.DEFAULT_CONNECT_TIMEOUT),
        read_timeout=crawler.settings.get('SCRAPFLY_READ_TIMEOUT', ScrapflyClient.DEFAULT_READ_TIMEOUT),
    )

    settings_max_concurrency = crawler.settings.get('CONCURRENT_REQUESTS', -1)
    account_info = scrapfly_client.account()

    if account_info['account']['suspended'] is True:
        raise RuntimeError('Your account is suspended, please check your subscription status. Reason: %s' % account_info['account']['suspension_reason'])

    max_account_concurrency = account_info['subscription']['max_concurrency']
    project_concurrency_limit = account_info['project']['concurrency_limit']
    maximum_allowed_concurrency = max_account_concurrency

    if project_concurrency_limit is not None:
        maximum_allowed_concurrency = project_concurrency_limit

    if settings_max_concurrency == -1:
        crawler.settings.set('CONCURRENT_REQUESTS', maximum_allowed_concurrency, 255)
        logger.warning('Concurrent request auto configured to %d' % maximum_allowed_concurrency)
    else:
        if settings_max_concurrency > maximum_allowed_concurrency:
            logger.warning('==> Your maximum concurrency has been adjusted following your subscription because it\'s misconfigured. Configured: %d, Maximum Allowed: %d' % (settings_max_concurrency, maximum_allowed_concurrency))
            crawler.settings.set('CONCURRENT_REQUESTS', maximum_allowed_concurrency, 255)

    if current_scrapy_version >= comparable_version('2.11.0'):
        crawler._apply_settings()

    if crawler.stats:
        crawler.stats.set_value('scrapfly/api_call_cost', 0)

    spider = cls(*args, **kwargs)
    spider._set_crawler(crawler)
    spider.scrapfly_client = scrapfly_client
    spider.scrapfly_client.version += "+scrapy@%s" % scrapy.__version__
    spider.scrapfly_client.open()

    return spider
def update_settings(settings)
-
@classmethod
def update_settings(cls, settings):
    settings.update(cls._merge_settings(dict(settings), cls.scrapfly_settings), priority='spider')
Instance variables
var run_id : int
-
Unique identifier of the spider run, read from the SPIDER_RUN_ID environment variable or generated as a random UUID (cached after first access).
@cached_property
def run_id(self):
    return environ.get('SPIDER_RUN_ID') or str(uuid.uuid4())
Methods
def closed(self, reason: str)
-
def closed(self, reason: str):
    self.scrapfly_client.close()
def retry(self, request: ScrapflyScrapyRequest, reason: Union[str, Exception], delay: Optional[int] = None)
-
def retry(self, request: ScrapflyScrapyRequest, reason: Union[str, Exception], delay: Optional[int] = None):
    logger.info('==> Retrying request for reason %s' % reason)
    stats = self.crawler.stats
    retries = request.meta.get('retry_times', 0) + 1

    if retries >= self.custom_settings.get('SCRAPFLY_MAX_API_RETRIES', 5):
        return None

    retryreq = request.replace(dont_filter=True)
    retryreq.priority += 100

    if retryreq.scrape_config.cache is True:
        retryreq.scrape_config.cache_clear = True

    retryreq.meta['retry_times'] = retries

    if stats:
        stats.inc_value('scrapfly/api_retry/count')

        if isinstance(reason, ScrapflyError):
            stats.inc_value(f'scrapfly/api_retry/{reason.code}')

    if isinstance(reason, Exception):
        reason = global_object_name(reason.__class__)

    logger.warning(f"Retrying {request} for x{retries - 1}: {reason}", extra={'spider': self})

    if delay is None:
        deferred = Deferred()
        deferred.addCallback(self.crawler.engine.schedule, request=retryreq, spider=self)
    else:
        from twisted.internet import reactor  # prevent reactor already installed issue
        from . import current_scrapy_version, comparable_version
        if current_scrapy_version >= comparable_version('2.10.0'):
            deferred = task.deferLater(reactor, delay, self.crawler.engine.crawl, retryreq)
        else:
            deferred = task.deferLater(reactor, delay, self.crawler.engine.crawl, retryreq, self)

    return deferred
def start_requests(self) ‑> Iterable[ScrapflyScrapyRequest]
-
def start_requests(self) -> Iterable[ScrapflyScrapyRequest]:
    for scrape_config in self.start_urls:
        if not isinstance(scrape_config, ScrapeConfig):
            raise RuntimeError('start_urls must contain ScrapeConfig objects when using ScrapflySpider')
        yield ScrapflyScrapyRequest(scrape_config=scrape_config)
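To tie things together, a minimal sketch of a concrete spider; the name, URL, and retry count are illustrative assumptions, and SCRAPFLY_API_KEY itself belongs in the project settings (see the settings sketch under ScrapflyMiddleware).
from scrapfly import ScrapeConfig
from scrapfly.scrapy import ScrapflySpider

class ExampleSpider(ScrapflySpider):
    name = 'example'
    # start_urls entries must be ScrapeConfig objects (see start_requests above)
    start_urls = [ScrapeConfig(url='https://example.com', render_js=True)]
    # retry() reads SCRAPFLY_MAX_API_RETRIES directly from custom_settings
    custom_settings = {'SCRAPFLY_MAX_API_RETRIES': 3}

    def parse(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}

Run it with the usual scrapy crawl example; scrapfly_settings are merged into the crawler settings automatically via update_settings().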