Module scrapfly.scrape_config
import base64
import json
import logging
from enum import Enum
from urllib.parse import urlencode, quote_plus
from base64 import urlsafe_b64encode
from typing import Optional, List, Dict, Iterable, Union, Set
from requests.structures import CaseInsensitiveDict
from .api_config import BaseApiConfig
class ScreenshotFlag(Enum):
    """
    Flags that tune how a screenshot is rendered.

    Attributes:
        LOAD_IMAGES: Enable image rendering with the request; adds extra usage for the bandwidth consumed.
        DARK_MODE: Enable dark mode display.
        BLOCK_BANNERS: Block cookie banners and overlays that cover the screen.
        HIGH_QUALITY: No compression on the output image.
        PRINT_MEDIA_FORMAT: Render the page in print mode.
    """

    LOAD_IMAGES = "load_images"
    DARK_MODE = "dark_mode"
    BLOCK_BANNERS = "block_banners"
    HIGH_QUALITY = "high_quality"
    PRINT_MEDIA_FORMAT = "print_media_format"
class Format(Enum):
    """
    Output formats that can be requested for the scraped content.

    Attributes:
        JSON: JSON format.
        TEXT: Text format.
        MARKDOWN: Markdown format.
        CLEAN_HTML: Clean HTML format.
    """

    JSON = "json"
    TEXT = "text"
    MARKDOWN = "markdown"
    CLEAN_HTML = "clean_html"
class FormatOption(Enum):
    """
    Modifiers appended to the requested format.

    Attributes:
        NO_IMAGES: exclude images from `markdown` format
        NO_LINKS: exclude links from `markdown` format
        ONLY_CONTENT: not described in the original documentation —
            presumably restricts the output to the main content; TODO confirm.
    """

    NO_IMAGES = "no_images"
    NO_LINKS = "no_links"
    ONLY_CONTENT = "only_content"
class ScrapeConfigError(Exception):
    """Raised when a ScrapeConfig is given conflicting or unsupported options."""
class ScrapeConfig(BaseApiConfig):
    """
    Describes a single scrape request.

    Holds the target URL, the HTTP method/headers/cookies/body to send and the
    scraping options (rendering, cache, session, extraction, ...). The
    class-level values below only declare the attributes and their types;
    ``__init__`` assigns every one of them on the instance.
    """

    PUBLIC_DATACENTER_POOL = 'public_datacenter_pool'
    PUBLIC_RESIDENTIAL_POOL = 'public_residential_pool'

    url: str
    retry: bool = True
    method: str = 'GET'
    country: Optional[str] = None
    render_js: bool = False
    cache: bool = False
    cache_clear: bool = False
    ssl: bool = False
    dns: bool = False
    asp: bool = False
    debug: bool = False
    raise_on_upstream_error: bool = True
    cache_ttl: Optional[int] = None
    proxy_pool: Optional[str] = None
    session: Optional[str] = None
    tags: Optional[List[str]] = None
    # No trailing comma here: `= None,` would make the default the tuple (None,).
    format: Optional[Format] = None  # raw(unchanged)
    format_options: Optional[List[FormatOption]] = None
    extraction_template: Optional[str] = None  # a saved template name
    extraction_ephemeral_template: Optional[Dict] = None  # ephemerally declared json template
    extraction_prompt: Optional[str] = None
    extraction_model: Optional[str] = None
    correlation_id: Optional[str] = None
    cookies: Optional[CaseInsensitiveDict] = None
    body: Optional[str] = None
    data: Optional[Dict] = None
    headers: Optional[CaseInsensitiveDict] = None
    js: Optional[str] = None
    rendering_wait: Optional[int] = None
    wait_for_selector: Optional[str] = None
    # __init__ always overwrites this with its own argument (default None).
    session_sticky_proxy: bool = True
    screenshots: Optional[Dict] = None
    screenshot_flags: Optional[List[ScreenshotFlag]] = None
    webhook: Optional[str] = None
    timeout: Optional[int] = None  # in milliseconds
    # Annotated as a list to agree with the `js_scenario` parameter of __init__.
    js_scenario: Optional[List] = None
    extract: Optional[Dict] = None
    lang: Optional[List[str]] = None
    os: Optional[str] = None
    auto_scroll: Optional[bool] = None
    cost_budget: Optional[int] = None
    # Per-config API key; when left as None, to_api_params() uses the key it is given.
    key: Optional[str] = None
def __init__(
    self,
    url: str,
    retry: bool = True,
    method: str = 'GET',
    country: Optional[str] = None,
    render_js: bool = False,
    cache: bool = False,
    cache_clear: bool = False,
    ssl: bool = False,
    dns: bool = False,
    asp: bool = False,
    debug: bool = False,
    raise_on_upstream_error: bool = True,
    cache_ttl: Optional[int] = None,
    proxy_pool: Optional[str] = None,
    session: Optional[str] = None,
    tags: Optional[Union[List[str], Set[str]]] = None,
    format: Optional[Format] = None,  # raw(unchanged)
    format_options: Optional[List[FormatOption]] = None,  # raw(unchanged)
    extraction_template: Optional[str] = None,  # a saved template name
    extraction_ephemeral_template: Optional[Dict] = None,  # ephemerally declared json template
    extraction_prompt: Optional[str] = None,
    extraction_model: Optional[str] = None,
    correlation_id: Optional[str] = None,
    cookies: Optional[CaseInsensitiveDict] = None,
    body: Optional[str] = None,
    data: Optional[Dict] = None,
    headers: Optional[Union[CaseInsensitiveDict, Dict[str, str]]] = None,
    js: str = None,
    rendering_wait: int = None,
    wait_for_selector: Optional[str] = None,
    # The body assigns `self.screenshots = screenshots`, so the parameter must
    # exist. NOTE(review): its position mirrors the variable order used by
    # from_dict() (wait_for_selector, screenshots, screenshot_flags) — confirm
    # against the published signature.
    screenshots: Optional[Dict] = None,
    screenshot_flags: Optional[List[ScreenshotFlag]] = None,
    session_sticky_proxy: Optional[bool] = None,
    webhook: Optional[str] = None,
    timeout: Optional[int] = None,  # in milliseconds
    js_scenario: Optional[List] = None,
    extract: Optional[Dict] = None,
    os: Optional[str] = None,
    lang: Optional[List[str]] = None,
    auto_scroll: Optional[bool] = None,
    cost_budget: Optional[int] = None
):
    """
    Build a scrape configuration.

    Every argument is stored on the instance under the same name. On top of
    that the constructor:

    * converts a list of ``tags`` into a set,
    * wraps ``cookies`` and ``headers`` in ``CaseInsensitiveDict`` and merges
      the cookies into the ``cookie`` header,
    * for POST/PUT/PATCH, encodes ``data`` into ``body`` according to the
      ``content-type`` header (form-urlencoded when no header is given), or
      sets ``content-type: text/plain`` when neither ``body`` nor ``data`` is
      provided.

    Raises:
        AssertionError: if ``url`` is not exactly a ``str``.
        ScrapeConfigError: if both ``body`` and ``data`` are given, or if
            ``data`` must be encoded for an unsupported content type.
    """
    # Kept as an assert so callers that expect AssertionError are unaffected.
    assert type(url) is str

    if isinstance(tags, list):
        tags = set(tags)

    cookies = cookies or {}
    headers = headers or {}

    self.cookies = CaseInsensitiveDict(cookies)
    self.headers = CaseInsensitiveDict(headers)
    self.url = url
    self.retry = retry
    self.method = method
    self.country = country
    self.session_sticky_proxy = session_sticky_proxy
    self.render_js = render_js
    self.cache = cache
    self.cache_clear = cache_clear
    self.asp = asp
    self.webhook = webhook
    self.session = session
    self.debug = debug
    self.cache_ttl = cache_ttl
    self.proxy_pool = proxy_pool
    self.tags = tags or set()
    self.format = format
    self.format_options = format_options
    self.extraction_template = extraction_template
    self.extraction_ephemeral_template = extraction_ephemeral_template
    self.extraction_prompt = extraction_prompt
    self.extraction_model = extraction_model
    self.correlation_id = correlation_id
    self.wait_for_selector = wait_for_selector
    self.body = body
    self.data = data
    self.js = js
    self.rendering_wait = rendering_wait
    self.raise_on_upstream_error = raise_on_upstream_error
    self.screenshots = screenshots
    self.screenshot_flags = screenshot_flags
    self.key = None
    self.dns = dns
    self.ssl = ssl
    self.js_scenario = js_scenario
    self.timeout = timeout
    self.extract = extract
    self.lang = lang
    self.os = os
    self.auto_scroll = auto_scroll
    self.cost_budget = cost_budget

    if cookies:
        cookie_pairs = [name + '=' + value for name, value in cookies.items()]

        # Append to a cookie header the caller already supplied, adding the
        # separating ';' only when it is missing. An empty existing header is
        # treated like an absent one instead of failing on `[-1]`.
        existing = self.headers.get('cookie', '')

        if existing and not existing.endswith(';'):
            existing += ';'

        self.headers['cookie'] = existing + '; '.join(cookie_pairs)

    if self.body and self.data:
        raise ScrapeConfigError('You cannot pass both parameters body and data. You must choose')

    if method in ['POST', 'PUT', 'PATCH']:
        if self.body is None and self.data is not None:
            if 'content-type' not in self.headers:
                # No explicit content type: default to a form-encoded body.
                self.headers['content-type'] = 'application/x-www-form-urlencoded'
                self.body = urlencode(data)
            else:
                content_type = self.headers['content-type']

                if content_type.find('application/json') != -1:
                    self.body = json.dumps(data)
                elif content_type.find('application/x-www-form-urlencoded') != -1:
                    self.body = urlencode(data)
                else:
                    raise ScrapeConfigError('Content-Type "%s" not supported, use body parameter to pass pre encoded body according to your content type' % self.headers['content-type'])
        elif self.body is None and self.data is None:
            self.headers['content-type'] = 'text/plain'
def to_api_params(self, key:str) -> Dict:
    """
    Translate this configuration into the flat parameter dict sent to the API.

    Options that depend on another feature (rendering options without
    ``render_js``, cache options without ``cache``, ``session_sticky_proxy``
    without ``session``, ``screenshot_flags`` without ``screenshots``) are
    left out and a warning is logged instead.

    The method does not modify the instance, so it can be called repeatedly
    with the same result.

    Args:
        key: API key to use when the config carries no ``key`` of its own.

    Returns:
        Dict of parameter names to values.

    Raises:
        ScrapeConfigError: if both ``extraction_template`` and
            ``extraction_ephemeral_template`` are set.
    """
    params = {
        'key': self.key or key,
        'url': self.url
    }

    if self.country is not None:
        params['country'] = self.country

    for name, value in self.headers.items():
        params['headers[%s]' % name] = value

    if self.webhook is not None:
        params['webhook_name'] = self.webhook

    if self.timeout is not None:
        params['timeout'] = self.timeout

    if self.extract is not None:
        params['extract'] = base64.urlsafe_b64encode(json.dumps(self.extract).encode('utf-8')).decode('utf-8')

    if self.cost_budget is not None:
        params['cost_budget'] = self.cost_budget

    if self.render_js is True:
        params['render_js'] = self._bool_to_http(self.render_js)

        if self.wait_for_selector is not None:
            params['wait_for_selector'] = self.wait_for_selector

        if self.js:
            params['js'] = base64.urlsafe_b64encode(self.js.encode('utf-8')).decode('utf-8')

        if self.js_scenario:
            params['js_scenario'] = base64.urlsafe_b64encode(json.dumps(self.js_scenario).encode('utf-8')).decode('utf-8')

        if self.rendering_wait:
            params['rendering_wait'] = self.rendering_wait

        if self.screenshots is not None:
            for name, element in self.screenshots.items():
                params['screenshots[%s]' % name] = element

            if self.screenshot_flags is not None:
                # Accept both enum members and their raw string values,
                # normalising into a local list rather than onto self.
                flags = [ScreenshotFlag(flag) for flag in self.screenshot_flags]
                params["screenshot_flags"] = ",".join(flag.value for flag in flags)
        else:
            if self.screenshot_flags is not None:
                logging.warning('Params "screenshot_flags" is ignored. Works only if screenshots is enabled')

        if self.auto_scroll is True:
            params['auto_scroll'] = self._bool_to_http(self.auto_scroll)
    else:
        if self.wait_for_selector is not None:
            logging.warning('Params "wait_for_selector" is ignored. Works only if render_js is enabled')

        if self.screenshots:
            logging.warning('Params "screenshots" is ignored. Works only if render_js is enabled')

        if self.js_scenario:
            logging.warning('Params "js_scenario" is ignored. Works only if render_js is enabled')

        if self.js:
            logging.warning('Params "js" is ignored. Works only if render_js is enabled')

        if self.rendering_wait:
            logging.warning('Params "rendering_wait" is ignored. Works only if render_js is enabled')

    if self.asp is True:
        params['asp'] = self._bool_to_http(self.asp)

    # Only an explicit False is sent for `retry`; True is never transmitted.
    if self.retry is False:
        params['retry'] = self._bool_to_http(self.retry)

    if self.cache is True:
        params['cache'] = self._bool_to_http(self.cache)

        if self.cache_clear is True:
            params['cache_clear'] = self._bool_to_http(self.cache_clear)

        if self.cache_ttl is not None:
            params['cache_ttl'] = self.cache_ttl
    else:
        if self.cache_clear is True:
            logging.warning('Params "cache_clear" is ignored. Works only if cache is enabled')

        if self.cache_ttl is not None:
            logging.warning('Params "cache_ttl" is ignored. Works only if cache is enabled')

    if self.dns is True:
        params['dns'] = self._bool_to_http(self.dns)

    if self.ssl is True:
        params['ssl'] = self._bool_to_http(self.ssl)

    if self.tags:
        params['tags'] = ','.join(self.tags)

    if self.format:
        params['format'] = Format(self.format).value

        # Options are appended to the format value as "<format>:<opt>,<opt>".
        if self.format_options:
            params['format'] += ':' + ','.join(FormatOption(option).value for option in self.format_options)

    if self.extraction_template and self.extraction_ephemeral_template:
        raise ScrapeConfigError('You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose')

    if self.extraction_template:
        params['extraction_template'] = self.extraction_template

    if self.extraction_ephemeral_template:
        # Serialise into a local: overwriting the attribute with its JSON
        # string would double-encode the template on the next call.
        template_json = json.dumps(self.extraction_ephemeral_template)
        params['extraction_template'] = 'ephemeral:' + urlsafe_b64encode(template_json.encode('utf-8')).decode('utf-8')

    if self.extraction_prompt:
        params['extraction_prompt'] = quote_plus(self.extraction_prompt)

    if self.extraction_model:
        params['extraction_model'] = self.extraction_model

    if self.correlation_id:
        params['correlation_id'] = self.correlation_id

    if self.session:
        params['session'] = self.session

        if self.session_sticky_proxy is True:  # false by default
            params['session_sticky_proxy'] = self._bool_to_http(self.session_sticky_proxy)
    else:
        if self.session_sticky_proxy:
            logging.warning('Params "session_sticky_proxy" is ignored. Works only if session is enabled')

    if self.debug is True:
        params['debug'] = self._bool_to_http(self.debug)

    if self.proxy_pool is not None:
        params['proxy_pool'] = self.proxy_pool

    if self.lang is not None:
        params['lang'] = ','.join(self.lang)

    if self.os is not None:
        params['os'] = self.os

    return params
# Rebuild a ScrapeConfig from an exported configuration string: the input is
# base64-decoded, unpacked with msgpack, and the resulting mapping is used to
# populate a new ScrapeConfig.
# NOTE(review): this excerpt is incomplete — the `try:` that pairs with the
# `except ImportError`, the `else:` of the header loop, and all but one of the
# ScrapeConfig(...) keyword arguments are not visible here. Presumably this is
# a @staticmethod (it takes no `self`) — confirm against the real source.
def from_exported_config(config:str) -> 'ScrapeConfig':
# msgpack is imported lazily so it is only needed when this function is used.
from msgpack import loads as msgpack_loads
except ImportError as e:
print('You must install msgpack package - run: pip install "scrapfly-sdk[seepdup] or pip install msgpack')
# `config` is base64 text wrapping a msgpack payload; `data` is indexed by
# string keys below ('headers', 'screenshots').
data = msgpack_loads(base64.b64decode(config))
headers = {}
# Collapse each exported header into a single string value.
for name, value in data['headers'].items():
# NOTE(review): `str` is itself an Iterable, so a plain string value would be
# joined character by character here — confirm exported header values are
# always lists of strings.
if isinstance(value, Iterable):
headers[name] = '; '.join(value)
headers[name] = value
return ScrapeConfig(
# A falsy `screenshots` entry is replaced by an empty dict.
screenshots=data['screenshots'] or {},
def to_dict(self) -> Dict:
    """
    Export the ScrapeConfig instance to a plain dictionary.
    Useful for JSON-serialization or other external storage.

    Enum-valued options (``format``, ``format_options``, ``screenshot_flags``)
    are exported as their string values; ``tags`` is exported as a list.
    ``cookies`` and ``headers`` are returned as ``CaseInsensitiveDict`` copies.
    ``data`` is exported as ``None`` whenever a ``body`` is present, so the
    exported dictionary never carries both.
    """
    return {
        'url': self.url,
        'retry': self.retry,
        'method': self.method,
        # Exported so that from_dict(), which reads 'country', round-trips it.
        'country': self.country,
        'render_js': self.render_js,
        'cache': self.cache,
        'cache_clear': self.cache_clear,
        'ssl': self.ssl,
        'dns': self.dns,
        'asp': self.asp,
        'debug': self.debug,
        'raise_on_upstream_error': self.raise_on_upstream_error,
        'cache_ttl': self.cache_ttl,
        'proxy_pool': self.proxy_pool,
        'session': self.session,
        'tags': list(self.tags),
        'format': Format(self.format).value if self.format else None,
        'format_options': [FormatOption(option).value for option in self.format_options] if self.format_options else None,
        'extraction_template': self.extraction_template,
        'extraction_ephemeral_template': self.extraction_ephemeral_template,
        'extraction_prompt': self.extraction_prompt,
        'extraction_model': self.extraction_model,
        'correlation_id': self.correlation_id,
        'cookies': CaseInsensitiveDict(self.cookies),
        'body': self.body,
        'data': None if self.body else self.data,
        'headers': CaseInsensitiveDict(self.headers),
        'js': self.js,
        'rendering_wait': self.rendering_wait,
        'wait_for_selector': self.wait_for_selector,
        'session_sticky_proxy': self.session_sticky_proxy,
        'screenshots': self.screenshots,
        'screenshot_flags': [ScreenshotFlag(flag).value for flag in self.screenshot_flags] if self.screenshot_flags else None,
        'webhook': self.webhook,
        'timeout': self.timeout,
        'js_scenario': self.js_scenario,
        'extract': self.extract,
        'lang': self.lang,
        'os': self.os,
        'auto_scroll': self.auto_scroll,
        'cost_budget': self.cost_budget,
    }
def from_dict(scrape_config_dict: Dict) -> 'ScrapeConfig':
"""Create a ScrapeConfig instance from a dictionary."""
url = scrape_config_dict.get('url', None)
retry = scrape_config_dict.get('retry', False)
method = scrape_config_dict.get('method', 'GET')
country = scrape_config_dict.get('country', None)
render_js = scrape_config_dict.get('render_js', False)
cache = scrape_config_dict.get('cache', False)
cache_clear = scrape_config_dict.get('cache_clear', False)
ssl = scrape_config_dict.get('ssl', False)
dns = scrape_config_dict.get('dns', False)
asp = scrape_config_dict.get('asp', False)
debug = scrape_config_dict.get('debug', False)
raise_on_upstream_error = scrape_config_dict.get('raise_on_upstream_error', True)
cache_ttl = scrape_config_dict.get('cache_ttl', None)
proxy_pool = scrape_config_dict.get('proxy_pool', None)
session = scrape_config_dict.get('session', None)
tags = scrape_config_dict.get('tags', [])
format = scrape_config_dict.get('format', None)
format = Format(format) if format else None
format_options = scrape_config_dict.get('format_options', None)
format_options = [FormatOption(option) for option in format_options] if format_options else None
extraction_template = scrape_config_dict.get('extraction_template', None)
extraction_ephemeral_template = scrape_config_dict.get('extraction_ephemeral_template', None)
extraction_prompt = scrape_config_dict.get('extraction_prompt', None)
extraction_model = scrape_config_dict.get('extraction_model', None)
correlation_id = scrape_config_dict.get('correlation_id', None)
cookies = scrape_config_dict.get('cookies', {})
body = scrape_config_dict.get('body', None)
data = scrape_config_dict.get('data', None)
headers = scrape_config_dict.get('headers', {})
js = scrape_config_dict.get('js', None)
rendering_wait = scrape_config_dict.get('rendering_wait', None)
wait_for_selector = scrape_config_dict.get('wait_for_selector', None)
screenshots = scrape_config_dict.get('screenshots', [])
screenshot_flags = scrape_config_dict.get('screenshot_flags', [])
screenshot_flags = [ScreenshotFlag(flag) for flag in screenshot_flags] if screenshot_flags else None
session_sticky_proxy = scrape_config_dict.get('session_sticky_proxy', False)
webhook = scrape_config_dict.get('webhook', None)
timeout = scrape_config_dict.get('timeout', None)
js_scenario = scrape_config_dict.get('js_scenario', None)
extract = scrape_config_dict.get('extract', None)
os = scrape_config_dict.get('os', None)
lang = scrape_config_dict.get('lang', None)
auto_scroll = scrape_config_dict.get('auto_scroll', None)
cost_budget = scrape_config_dict.get('cost_budget', None)
return ScrapeConfig(
