Module scrapfly.scrape_config
Classes
class Format (*args, **kwds)
-
Expand source code
class Format(Enum):
    """
    Attributes:
        JSON: JSON format.
        TEXT: Text format.
        MARKDOWN: Markdown format.
        CLEAN_HTML: Clean HTML format.
    """

    JSON = "json"
    TEXT = "text"
    MARKDOWN = "markdown"
    CLEAN_HTML = "clean_html"
Attributes
JSON
- JSON format.
TEXT
- Text format.
MARKDOWN
- Markdown format.
CLEAN_HTML
- Clean HTML format.
Ancestors
- enum.Enum
Class variables
var CLEAN_HTML
- Clean HTML format ("clean_html").
var JSON
- JSON format ("json").
var MARKDOWN
- Markdown format ("markdown").
var TEXT
- Text format ("text").
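Formats are passed through ScrapeConfig's format argument, and to_api_params() serializes the enum's value. A minimal sketch (the target URL is a placeholder):

from scrapfly.scrape_config import ScrapeConfig, Format

# request the scraped page converted to markdown instead of raw HTML
config = ScrapeConfig(url='https://example.com', format=Format.MARKDOWN)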
class FormatOption (*args, **kwds)
-
Expand source code
class FormatOption(Enum):
    """
    Attributes:
        NO_IMAGES: exclude images from `markdown` format
        NO_LINKS: exclude links from `markdown` format
    """

    NO_IMAGES = "no_images"
    NO_LINKS = "no_links"
    ONLY_CONTENT = "only_content"
Attributes
NO_IMAGES
- exclude images from `markdown` format
NO_LINKS
- exclude links from `markdown` format
Ancestors
- enum.Enum
Class variables
var NO_IMAGES
- Exclude images from the `markdown` format ("no_images").
var NO_LINKS
- Exclude links from the `markdown` format ("no_links").
var ONLY_CONTENT
- "only_content"
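Format options only apply together with a format; to_api_params() appends them after a colon, e.g. format=markdown:no_images,no_links. A short sketch:

from scrapfly.scrape_config import ScrapeConfig, Format, FormatOption

# markdown output without images or links; to_api_params() will send
# format=markdown:no_images,no_links
config = ScrapeConfig(
    url='https://example.com',
    format=Format.MARKDOWN,
    format_options=[FormatOption.NO_IMAGES, FormatOption.NO_LINKS],
)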
class ScrapeConfig (url: str,
retry: bool = True,
method: str = 'GET',
country: str | None = None,
render_js: bool = False,
cache: bool = False,
cache_clear: bool = False,
ssl: bool = False,
dns: bool = False,
asp: bool = False,
debug: bool = False,
raise_on_upstream_error: bool = True,
cache_ttl: int | None = None,
proxy_pool: str | None = None,
session: str | None = None,
tags: List[str] | Set[str] | None = None,
format: Format | None = None,
format_options: List[FormatOption] | None = None,
extraction_template: str | None = None,
extraction_ephemeral_template: Dict | None = None,
extraction_prompt: str | None = None,
extraction_model: str | None = None,
correlation_id: str | None = None,
cookies: requests.structures.CaseInsensitiveDict | None = None,
body: str | None = None,
data: Dict | None = None,
headers: requests.structures.CaseInsensitiveDict | Dict[str, str] | None = None,
js: str = None,
rendering_wait: int = None,
rendering_stage: Literal['complete', 'domcontentloaded'] = 'complete',
wait_for_selector: str | None = None,
screenshots: Dict | None = None,
screenshot_flags: List[ScreenshotFlag] | None = None,
session_sticky_proxy: bool | None = None,
webhook: str | None = None,
timeout: int | None = None,
js_scenario: List | None = None,
extract: Dict | None = None,
os: str | None = None,
lang: List[str] | None = None,
auto_scroll: bool | None = None,
cost_budget: int | None = None)
-
Expand source code
class ScrapeConfig(BaseApiConfig):

    PUBLIC_DATACENTER_POOL = 'public_datacenter_pool'
    PUBLIC_RESIDENTIAL_POOL = 'public_residential_pool'

    url: str
    retry: bool = True
    method: str = 'GET'
    country: Optional[str] = None
    render_js: bool = False
    cache: bool = False
    cache_clear: bool = False
    ssl: bool = False
    dns: bool = False
    asp: bool = False
    debug: bool = False
    raise_on_upstream_error: bool = True
    cache_ttl: Optional[int] = None
    proxy_pool: Optional[str] = None
    session: Optional[str] = None
    tags: Optional[List[str]] = None
    format: Optional[Format] = None  # raw (unchanged)
    format_options: Optional[List[FormatOption]] = None
    extraction_template: Optional[str] = None  # a saved template name
    extraction_ephemeral_template: Optional[Dict] = None  # ephemerally declared json template
    extraction_prompt: Optional[str] = None
    extraction_model: Optional[str] = None
    correlation_id: Optional[str] = None
    cookies: Optional[CaseInsensitiveDict] = None
    body: Optional[str] = None
    data: Optional[Dict] = None
    headers: Optional[CaseInsensitiveDict] = None
    js: str = None
    rendering_wait: int = None
    rendering_stage: Literal["complete", "domcontentloaded"] = "complete"
    wait_for_selector: Optional[str] = None
    session_sticky_proxy: bool = True
    screenshots: Optional[Dict] = None
    screenshot_flags: Optional[List[ScreenshotFlag]] = None
    webhook: Optional[str] = None
    timeout: Optional[int] = None  # in milliseconds
    js_scenario: Dict = None
    extract: Dict = None
    lang: Optional[List[str]] = None
    os: Optional[str] = None
    auto_scroll: Optional[bool] = None
    cost_budget: Optional[int] = None

    def __init__(
        self,
        url: str,
        retry: bool = True,
        method: str = 'GET',
        country: Optional[str] = None,
        render_js: bool = False,
        cache: bool = False,
        cache_clear: bool = False,
        ssl: bool = False,
        dns: bool = False,
        asp: bool = False,
        debug: bool = False,
        raise_on_upstream_error: bool = True,
        cache_ttl: Optional[int] = None,
        proxy_pool: Optional[str] = None,
        session: Optional[str] = None,
        tags: Optional[Union[List[str], Set[str]]] = None,
        format: Optional[Format] = None,  # raw (unchanged)
        format_options: Optional[List[FormatOption]] = None,  # raw (unchanged)
        extraction_template: Optional[str] = None,  # a saved template name
        extraction_ephemeral_template: Optional[Dict] = None,  # ephemerally declared json template
        extraction_prompt: Optional[str] = None,
        extraction_model: Optional[str] = None,
        correlation_id: Optional[str] = None,
        cookies: Optional[CaseInsensitiveDict] = None,
        body: Optional[str] = None,
        data: Optional[Dict] = None,
        headers: Optional[Union[CaseInsensitiveDict, Dict[str, str]]] = None,
        js: str = None,
        rendering_wait: int = None,
        rendering_stage: Literal["complete", "domcontentloaded"] = "complete",
        wait_for_selector: Optional[str] = None,
        screenshots: Optional[Dict] = None,
        screenshot_flags: Optional[List[ScreenshotFlag]] = None,
        session_sticky_proxy: Optional[bool] = None,
        webhook: Optional[str] = None,
        timeout: Optional[int] = None,  # in milliseconds
        js_scenario: Optional[List] = None,
        extract: Optional[Dict] = None,
        os: Optional[str] = None,
        lang: Optional[List[str]] = None,
        auto_scroll: Optional[bool] = None,
        cost_budget: Optional[int] = None
    ):
        assert type(url) is str

        if isinstance(tags, List):
            tags = set(tags)

        cookies = cookies or {}
        headers = headers or {}

        self.cookies = CaseInsensitiveDict(cookies)
        self.headers = CaseInsensitiveDict(headers)
        self.url = url
        self.retry = retry
        self.method = method
        self.country = country
        self.session_sticky_proxy = session_sticky_proxy
        self.render_js = render_js
        self.cache = cache
        self.cache_clear = cache_clear
        self.asp = asp
        self.webhook = webhook
        self.session = session
        self.debug = debug
        self.cache_ttl = cache_ttl
        self.proxy_pool = proxy_pool
        self.tags = tags or set()
        self.format = format
        self.format_options = format_options
        self.extraction_template = extraction_template
        self.extraction_ephemeral_template = extraction_ephemeral_template
        self.extraction_prompt = extraction_prompt
        self.extraction_model = extraction_model
        self.correlation_id = correlation_id
        self.wait_for_selector = wait_for_selector
        self.body = body
        self.data = data
        self.js = js
        self.rendering_wait = rendering_wait
        self.rendering_stage = rendering_stage
        self.raise_on_upstream_error = raise_on_upstream_error
        self.screenshots = screenshots
        self.screenshot_flags = screenshot_flags
        self.key = None
        self.dns = dns
        self.ssl = ssl
        self.js_scenario = js_scenario
        self.timeout = timeout
        self.extract = extract
        self.lang = lang
        self.os = os
        self.auto_scroll = auto_scroll
        self.cost_budget = cost_budget

        if cookies:
            _cookies = []

            for name, value in cookies.items():
                _cookies.append(name + '=' + value)

            if 'cookie' in self.headers:
                if self.headers['cookie'][-1] != ';':
                    self.headers['cookie'] += ';'
            else:
                self.headers['cookie'] = ''

            self.headers['cookie'] += '; '.join(_cookies)

        if self.body and self.data:
            raise ScrapeConfigError('You cannot pass both parameters body and data. You must choose')

        if method in ['POST', 'PUT', 'PATCH']:
            if self.body is None and self.data is not None:
                if 'content-type' not in self.headers:
                    self.headers['content-type'] = 'application/x-www-form-urlencoded'
                    self.body = urlencode(data)
                else:
                    if self.headers['content-type'].find('application/json') != -1:
                        self.body = json.dumps(data)
                    elif self.headers['content-type'].find('application/x-www-form-urlencoded') != -1:
                        self.body = urlencode(data)
                    else:
                        raise ScrapeConfigError('Content-Type "%s" not supported, use body parameter to pass pre encoded body according to your content type' % self.headers['content-type'])
            elif self.body is None and self.data is None:
                self.headers['content-type'] = 'text/plain'

    # The full source of to_api_params(), from_exported_config(), to_dict() and
    # from_dict() is listed under "Static methods" and "Methods" below.
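A minimal construction sketch (the URL, header, and tag values are placeholders):

from scrapfly.scrape_config import ScrapeConfig

config = ScrapeConfig(
    url='https://example.com',
    render_js=True,                  # enable the headless browser
    country='us',                    # proxy geolocation
    headers={'x-example': 'value'},  # forwarded to the upstream site
    tags=['project:demo'],           # stored as a set on the instance
)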
Ancestors
- BaseApiConfig
Class variables
var PUBLIC_DATACENTER_POOL
- 'public_datacenter_pool'
var PUBLIC_RESIDENTIAL_POOL
- 'public_residential_pool'
var asp : bool
var auto_scroll : bool | None
var body : str | None
var cache : bool
var cache_clear : bool
var cache_ttl : int | None
var cookies : requests.structures.CaseInsensitiveDict | None
var correlation_id : str | None
var cost_budget : int | None
var country : str | None
var data : Dict | None
var debug : bool
var dns : bool
var extract : Dict
var extraction_ephemeral_template : Dict | None
var extraction_model : str | None
var extraction_prompt : str | None
var extraction_template : str | None
var format : Format | None
var format_options : List[FormatOption] | None
var headers : requests.structures.CaseInsensitiveDict | None
var js : str
var js_scenario : Dict
var lang : List[str] | None
var method : str
var os : str | None
var proxy_pool : str | None
var raise_on_upstream_error : bool
var render_js : bool
var rendering_stage : Literal['complete', 'domcontentloaded']
var rendering_wait : int
var retry : bool
var screenshot_flags : List[ScreenshotFlag] | None
var screenshots : Dict | None
var session : str | None
var session_sticky_proxy : bool
var ssl : bool
var tags : List[str] | None
var timeout : int | None
var url : str
var wait_for_selector : str | None
var webhook : str | None
Static methods
def from_dict(scrape_config_dict: Dict) ‑> ScrapeConfig
-
Expand source code
@staticmethod
def from_dict(scrape_config_dict: Dict) -> 'ScrapeConfig':
    """Create a ScrapeConfig instance from a dictionary."""
    url = scrape_config_dict.get('url', None)
    retry = scrape_config_dict.get('retry', False)
    method = scrape_config_dict.get('method', 'GET')
    country = scrape_config_dict.get('country', None)
    render_js = scrape_config_dict.get('render_js', False)
    cache = scrape_config_dict.get('cache', False)
    cache_clear = scrape_config_dict.get('cache_clear', False)
    ssl = scrape_config_dict.get('ssl', False)
    dns = scrape_config_dict.get('dns', False)
    asp = scrape_config_dict.get('asp', False)
    debug = scrape_config_dict.get('debug', False)
    raise_on_upstream_error = scrape_config_dict.get('raise_on_upstream_error', True)
    cache_ttl = scrape_config_dict.get('cache_ttl', None)
    proxy_pool = scrape_config_dict.get('proxy_pool', None)
    session = scrape_config_dict.get('session', None)
    tags = scrape_config_dict.get('tags', [])
    format = scrape_config_dict.get('format', None)
    format = Format(format) if format else None
    format_options = scrape_config_dict.get('format_options', None)
    format_options = [FormatOption(option) for option in format_options] if format_options else None
    extraction_template = scrape_config_dict.get('extraction_template', None)
    extraction_ephemeral_template = scrape_config_dict.get('extraction_ephemeral_template', None)
    extraction_prompt = scrape_config_dict.get('extraction_prompt', None)
    extraction_model = scrape_config_dict.get('extraction_model', None)
    correlation_id = scrape_config_dict.get('correlation_id', None)
    cookies = scrape_config_dict.get('cookies', {})
    body = scrape_config_dict.get('body', None)
    data = scrape_config_dict.get('data', None)
    headers = scrape_config_dict.get('headers', {})
    js = scrape_config_dict.get('js', None)
    rendering_wait = scrape_config_dict.get('rendering_wait', None)
    wait_for_selector = scrape_config_dict.get('wait_for_selector', None)
    screenshots = scrape_config_dict.get('screenshots', [])
    screenshot_flags = scrape_config_dict.get('screenshot_flags', [])
    screenshot_flags = [ScreenshotFlag(flag) for flag in screenshot_flags] if screenshot_flags else None
    session_sticky_proxy = scrape_config_dict.get('session_sticky_proxy', False)
    webhook = scrape_config_dict.get('webhook', None)
    timeout = scrape_config_dict.get('timeout', None)
    js_scenario = scrape_config_dict.get('js_scenario', None)
    extract = scrape_config_dict.get('extract', None)
    os = scrape_config_dict.get('os', None)
    lang = scrape_config_dict.get('lang', None)
    auto_scroll = scrape_config_dict.get('auto_scroll', None)
    cost_budget = scrape_config_dict.get('cost_budget', None)

    return ScrapeConfig(
        url=url, retry=retry, method=method, country=country, render_js=render_js,
        cache=cache, cache_clear=cache_clear, ssl=ssl, dns=dns, asp=asp, debug=debug,
        raise_on_upstream_error=raise_on_upstream_error, cache_ttl=cache_ttl,
        proxy_pool=proxy_pool, session=session, tags=tags, format=format,
        format_options=format_options, extraction_template=extraction_template,
        extraction_ephemeral_template=extraction_ephemeral_template,
        extraction_prompt=extraction_prompt, extraction_model=extraction_model,
        correlation_id=correlation_id, cookies=cookies, body=body, data=data,
        headers=headers, js=js, rendering_wait=rendering_wait,
        wait_for_selector=wait_for_selector, screenshots=screenshots,
        screenshot_flags=screenshot_flags, session_sticky_proxy=session_sticky_proxy,
        webhook=webhook, timeout=timeout, js_scenario=js_scenario, extract=extract,
        os=os, lang=lang, auto_scroll=auto_scroll, cost_budget=cost_budget,
    )
Create a ScrapeConfig instance from a dictionary.
def from_exported_config(config: str) ‑> ScrapeConfig
-
Expand source code
@staticmethod
def from_exported_config(config: str) -> 'ScrapeConfig':
    try:
        from msgpack import loads as msgpack_loads
    except ImportError:
        print('You must install the msgpack package - run: pip install "scrapfly-sdk[speedups]" or pip install msgpack')
        raise

    data = msgpack_loads(base64.b64decode(config))

    headers = {}

    for name, value in data['headers'].items():
        if isinstance(value, Iterable):
            headers[name] = '; '.join(value)
        else:
            headers[name] = value

    return ScrapeConfig(
        url=data['url'],
        retry=data['retry'],
        headers=headers,
        session=data['session'],
        session_sticky_proxy=data['session_sticky_proxy'],
        cache=data['cache'],
        cache_ttl=data['cache_ttl'],
        cache_clear=data['cache_clear'],
        render_js=data['render_js'],
        method=data['method'],
        asp=data['asp'],
        body=data['body'],
        ssl=data['ssl'],
        dns=data['dns'],
        country=data['country'],
        debug=data['debug'],
        correlation_id=data['correlation_id'],
        tags=data['tags'],
        format=data['format'],
        js=data['js'],
        rendering_wait=data['rendering_wait'],
        screenshots=data['screenshots'] or {},
        screenshot_flags=data['screenshot_flags'],
        proxy_pool=data['proxy_pool'],
        auto_scroll=data['auto_scroll'],
        cost_budget=data['cost_budget']
    )
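from_exported_config() expects a base64-encoded msgpack document containing every key the method reads. A self-contained sketch that fabricates such a payload purely for illustration; real payloads come from Scrapfly's config export feature:

import base64
from msgpack import dumps as msgpack_dumps  # pip install msgpack
from scrapfly.scrape_config import ScrapeConfig

# every key that from_exported_config() reads must be present in the payload
payload = {key: None for key in (
    'session', 'cache_ttl', 'body', 'country', 'correlation_id', 'format',
    'js', 'rendering_wait', 'screenshots', 'screenshot_flags', 'proxy_pool',
    'auto_scroll', 'cost_budget',
)}
payload.update({
    'url': 'https://example.com', 'retry': True, 'headers': {}, 'method': 'GET',
    'session_sticky_proxy': False, 'cache': False, 'cache_clear': False,
    'render_js': False, 'asp': False, 'ssl': False, 'dns': False,
    'debug': False, 'tags': [],
})

exported = base64.b64encode(msgpack_dumps(payload))
config = ScrapeConfig.from_exported_config(exported)
assert config.url == 'https://example.com'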
Methods
def to_api_params(self, key: str) ‑> Dict
-
Expand source code
def to_api_params(self, key: str) -> Dict:
    params = {
        'key': self.key or key,
        'url': self.url
    }

    if self.country is not None:
        params['country'] = self.country

    for name, value in self.headers.items():
        params['headers[%s]' % name] = value

    if self.webhook is not None:
        params['webhook_name'] = self.webhook

    if self.timeout is not None:
        params['timeout'] = self.timeout

    if self.extract is not None:
        params['extract'] = base64.urlsafe_b64encode(json.dumps(self.extract).encode('utf-8')).decode('utf-8')

    if self.cost_budget is not None:
        params['cost_budget'] = self.cost_budget

    if self.render_js is True:
        params['render_js'] = self._bool_to_http(self.render_js)

        if self.wait_for_selector is not None:
            params['wait_for_selector'] = self.wait_for_selector

        if self.js:
            params['js'] = base64.urlsafe_b64encode(self.js.encode('utf-8')).decode('utf-8')

        if self.js_scenario:
            params['js_scenario'] = base64.urlsafe_b64encode(json.dumps(self.js_scenario).encode('utf-8')).decode('utf-8')

        if self.rendering_wait:
            params['rendering_wait'] = self.rendering_wait

        if self.rendering_stage:
            params['rendering_stage'] = self.rendering_stage

        if self.screenshots is not None:
            for name, element in self.screenshots.items():
                params['screenshots[%s]' % name] = element

            if self.screenshot_flags is not None:
                self.screenshot_flags = [ScreenshotFlag(flag) for flag in self.screenshot_flags]
                params["screenshot_flags"] = ",".join(flag.value for flag in self.screenshot_flags)
        else:
            if self.screenshot_flags is not None:
                logging.warning('Params "screenshot_flags" is ignored. Works only if screenshots is enabled')

        if self.auto_scroll is True:
            params['auto_scroll'] = self._bool_to_http(self.auto_scroll)
    else:
        if self.wait_for_selector is not None:
            logging.warning('Params "wait_for_selector" is ignored. Works only if render_js is enabled')

        if self.screenshots:
            logging.warning('Params "screenshots" is ignored. Works only if render_js is enabled')

        if self.js_scenario:
            logging.warning('Params "js_scenario" is ignored. Works only if render_js is enabled')

        if self.js:
            logging.warning('Params "js" is ignored. Works only if render_js is enabled')

        if self.rendering_wait:
            logging.warning('Params "rendering_wait" is ignored. Works only if render_js is enabled')

    if self.asp is True:
        params['asp'] = self._bool_to_http(self.asp)

    if self.retry is False:
        params['retry'] = self._bool_to_http(self.retry)

    if self.cache is True:
        params['cache'] = self._bool_to_http(self.cache)

        if self.cache_clear is True:
            params['cache_clear'] = self._bool_to_http(self.cache_clear)

        if self.cache_ttl is not None:
            params['cache_ttl'] = self.cache_ttl
    else:
        if self.cache_clear is True:
            logging.warning('Params "cache_clear" is ignored. Works only if cache is enabled')

        if self.cache_ttl is not None:
            logging.warning('Params "cache_ttl" is ignored. Works only if cache is enabled')

    if self.dns is True:
        params['dns'] = self._bool_to_http(self.dns)

    if self.ssl is True:
        params['ssl'] = self._bool_to_http(self.ssl)

    if self.tags:
        params['tags'] = ','.join(self.tags)

    if self.format:
        params['format'] = Format(self.format).value

        if self.format_options:
            params['format'] += ':' + ','.join(FormatOption(option).value for option in self.format_options)

    if self.extraction_template and self.extraction_ephemeral_template:
        raise ScrapeConfigError('You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose')

    if self.extraction_template:
        params['extraction_template'] = self.extraction_template

    if self.extraction_ephemeral_template:
        self.extraction_ephemeral_template = json.dumps(self.extraction_ephemeral_template)
        params['extraction_template'] = 'ephemeral:' + urlsafe_b64encode(self.extraction_ephemeral_template.encode('utf-8')).decode('utf-8')

    if self.extraction_prompt:
        params['extraction_prompt'] = quote_plus(self.extraction_prompt)

    if self.extraction_model:
        params['extraction_model'] = self.extraction_model

    if self.correlation_id:
        params['correlation_id'] = self.correlation_id

    if self.session:
        params['session'] = self.session

        if self.session_sticky_proxy is True:  # false by default
            params['session_sticky_proxy'] = self._bool_to_http(self.session_sticky_proxy)
    else:
        if self.session_sticky_proxy:
            logging.warning('Params "session_sticky_proxy" is ignored. Works only if session is enabled')

    if self.debug is True:
        params['debug'] = self._bool_to_http(self.debug)

    if self.proxy_pool is not None:
        params['proxy_pool'] = self.proxy_pool

    if self.lang is not None:
        params['lang'] = ','.join(self.lang)

    if self.os is not None:
        params['os'] = self.os

    return params
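A sketch of the resulting mapping; the exact boolean encoding depends on BaseApiConfig._bool_to_http(), assumed here to yield 'true'/'false':

config = ScrapeConfig(url='https://example.com', render_js=True, cache=True, cache_ttl=3600)
params = config.to_api_params(key='YOUR_API_KEY')
# params is roughly:
# {'key': 'YOUR_API_KEY', 'url': 'https://example.com', 'render_js': 'true',
#  'rendering_stage': 'complete', 'cache': 'true', 'cache_ttl': 3600}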
def to_dict(self) ‑> Dict
-
Expand source code
def to_dict(self) -> Dict:
    """
    Export the ScrapeConfig instance to a plain dictionary.
    Useful for JSON serialization or other external storage.
    """
    return {
        'url': self.url,
        'retry': self.retry,
        'method': self.method,
        'country': self.country,
        'render_js': self.render_js,
        'cache': self.cache,
        'cache_clear': self.cache_clear,
        'ssl': self.ssl,
        'dns': self.dns,
        'asp': self.asp,
        'debug': self.debug,
        'raise_on_upstream_error': self.raise_on_upstream_error,
        'cache_ttl': self.cache_ttl,
        'proxy_pool': self.proxy_pool,
        'session': self.session,
        'tags': list(self.tags),
        'format': Format(self.format).value if self.format else None,
        'format_options': [FormatOption(option).value for option in self.format_options] if self.format_options else None,
        'extraction_template': self.extraction_template,
        'extraction_ephemeral_template': self.extraction_ephemeral_template,
        'extraction_prompt': self.extraction_prompt,
        'extraction_model': self.extraction_model,
        'correlation_id': self.correlation_id,
        'cookies': CaseInsensitiveDict(self.cookies),
        'body': self.body,
        'data': None if self.body else self.data,
        'headers': CaseInsensitiveDict(self.headers),
        'js': self.js,
        'rendering_wait': self.rendering_wait,
        'wait_for_selector': self.wait_for_selector,
        'session_sticky_proxy': self.session_sticky_proxy,
        'screenshots': self.screenshots,
        'screenshot_flags': [ScreenshotFlag(flag).value for flag in self.screenshot_flags] if self.screenshot_flags else None,
        'webhook': self.webhook,
        'timeout': self.timeout,
        'js_scenario': self.js_scenario,
        'extract': self.extract,
        'lang': self.lang,
        'os': self.os,
        'auto_scroll': self.auto_scroll,
        'cost_budget': self.cost_budget,
    }
Export the ScrapeConfig instance to a plain dictionary. Useful for JSON serialization or other external storage.
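to_dict() and from_dict() are designed to round-trip; a quick sketch:

from scrapfly.scrape_config import ScrapeConfig

config = ScrapeConfig(url='https://example.com', render_js=True, tags=['demo'])
as_dict = config.to_dict()
restored = ScrapeConfig.from_dict(as_dict)
assert restored.url == config.url and restored.render_js is True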
class ScrapeConfigError (*args, **kwargs)
-
Expand source code
class ScrapeConfigError(Exception):
    pass
Raised when a ScrapeConfig is invalid, for example when both body and data are passed, or when an unsupported content type is used.
Ancestors
- builtins.Exception
- builtins.BaseException
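The constructor raises it, for example, when both body and data are supplied:

from scrapfly.scrape_config import ScrapeConfig, ScrapeConfigError

try:
    ScrapeConfig(url='https://example.com', body='raw payload', data={'a': '1'})
except ScrapeConfigError as e:
    print(e)  # You cannot pass both parameters body and data. You must choose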
class ScreenshotFlag (*args, **kwds)
-
Expand source code
class ScreenshotFlag(Enum):
    """
    Attributes:
        LOAD_IMAGES: Enable image rendering with the request; adds extra usage for the bandwidth consumed.
        DARK_MODE: Enable dark mode display.
        BLOCK_BANNERS: Block cookie banners and overlays that cover the screen.
        HIGH_QUALITY: No compression on the output image.
        PRINT_MEDIA_FORMAT: Render the page in print mode.
    """

    LOAD_IMAGES = "load_images"
    DARK_MODE = "dark_mode"
    BLOCK_BANNERS = "block_banners"
    HIGH_QUALITY = "high_quality"
    PRINT_MEDIA_FORMAT = "print_media_format"
Attributes
LOAD_IMAGES
- Enable image rendering with the request; adds extra usage for the bandwidth consumed.
DARK_MODE
- Enable dark mode display.
BLOCK_BANNERS
- Block cookie banners and overlays that cover the screen.
HIGH_QUALITY
- No compression on the output image.
PRINT_MEDIA_FORMAT
- Render the page in print mode.
Ancestors
- enum.Enum
Class variables
var BLOCK_BANNERS
- Block cookie banners and overlays that cover the screen ("block_banners").
var DARK_MODE
- Enable dark mode display ("dark_mode").
var HIGH_QUALITY
- No compression on the output image ("high_quality").
var LOAD_IMAGES
- Enable image rendering with the request; adds extra usage for the bandwidth consumed ("load_images").
var PRINT_MEDIA_FORMAT
- Render the page in print mode ("print_media_format").
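Flags are passed alongside screenshots and require render_js; the 'fullpage' capture target below is assumed from the Scrapfly API's screenshot options:

from scrapfly.scrape_config import ScrapeConfig, ScreenshotFlag

config = ScrapeConfig(
    url='https://example.com',
    render_js=True,                    # screenshots only apply when render_js is enabled
    screenshots={'main': 'fullpage'},  # name -> capture target
    screenshot_flags=[ScreenshotFlag.DARK_MODE, ScreenshotFlag.BLOCK_BANNERS],
)
# to_api_params() will emit screenshots[main]=fullpage and
# screenshot_flags=dark_mode,block_banners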