Module `scrapfly.scrape_config`

Classes

class Format (*args, **kwds)

Expand source code

class Format(Enum):
    """
    Attributes:
        JSON: JSON format.
        TEXT: Text format.
        MARKDOWN: Markdown format.
        CLEAN_HTML: Clean HTML format.
    """

    JSON = "json"
    TEXT = "text"
    MARKDOWN = "markdown"
    CLEAN_HTML = "clean_html"

Attributes

JSON: JSON format.
TEXT: Text format.
MARKDOWN: Markdown format.
CLEAN_HTML: Clean HTML format.

Ancestors

enum.Enum

Class variables

var CLEAN_HTML: The type of the None singleton.
var JSON: The type of the None singleton.
var MARKDOWN: The type of the None singleton.
var TEXT: The type of the None singleton.

class FormatOption (*args, **kwds)

Expand source code

class FormatOption(Enum):
    """
    Attributes:
        NO_IMAGES: exlude images from `markdown` format
        NO_LINKS: exlude links from `markdown` format
    """

    NO_IMAGES = "no_images"
    NO_LINKS = "no_links"
    ONLY_CONTENT = "only_content"

Attributes

NO_IMAGES: exlude images from markdown format
NO_LINKS: exlude links from markdown format

Ancestors

enum.Enum

Class variables

var NO_IMAGES: The type of the None singleton.
var NO_LINKS: The type of the None singleton.
var ONLY_CONTENT: The type of the None singleton.

class ScrapeConfig (url: str, retry: bool = True, method: str = 'GET', country: str | None = None, render_js: bool = False, cache: bool = False, cache_clear: bool = False, ssl: bool = False, dns: bool = False, asp: bool = False, debug: bool = False, raise_on_upstream_error: bool = True, cache_ttl: int | None = None, proxy_pool: str | None = None, session: str | None = None, tags: List[str] | Set[str] | None = None, format: Format | None = None, format_options: List[FormatOption] | None = None, extraction_template: str | None = None, extraction_ephemeral_template: Dict | None = None, extraction_prompt: str | None = None, extraction_model: str | None = None, correlation_id: str | None = None, cookies: requests.structures.CaseInsensitiveDict | None = None, body: str | None = None, data: Dict | None = None, headers: requests.structures.CaseInsensitiveDict | Dict[str, str] | None = None, js: str = None, rendering_wait: int = None, rendering_stage: Literal['complete', 'domcontentloaded'] = 'complete', wait_for_selector: str | None = None, screenshots: Dict | None = None, screenshot_flags: List[ScreenshotFlag] | None = None, session_sticky_proxy: bool | None = None, webhook: str | None = None, timeout: int | None = None, js_scenario: List | None = None, extract: Dict | None = None, os: str | None = None, lang: List[str] | None = None, auto_scroll: bool | None = None, cost_budget: int | None = None)

Expand source code

class ScrapeConfig(BaseApiConfig):

    PUBLIC_DATACENTER_POOL = 'public_datacenter_pool'
    PUBLIC_RESIDENTIAL_POOL = 'public_residential_pool'

    url: str
    retry: bool = True
    method: str = 'GET'
    country: Optional[str] = None
    render_js: bool = False
    cache: bool = False
    cache_clear:bool = False
    ssl:bool = False
    dns:bool = False
    asp:bool = False
    debug: bool = False
    raise_on_upstream_error:bool = True
    cache_ttl:Optional[int] = None
    proxy_pool:Optional[str] = None
    session: Optional[str] = None
    tags: Optional[List[str]] = None
    format: Optional[Format] = None, # raw(unchanged)
    format_options: Optional[List[FormatOption]] 
    extraction_template: Optional[str] = None  # a saved template name
    extraction_ephemeral_template: Optional[Dict]  # ephemeraly declared json template
    extraction_prompt: Optional[str] = None
    extraction_model: Optional[str] = None    
    correlation_id: Optional[str] = None
    cookies: Optional[CaseInsensitiveDict] = None
    body: Optional[str] = None
    data: Optional[Dict] = None
    headers: Optional[CaseInsensitiveDict] = None
    js: str = None
    rendering_wait: int = None
    rendering_stage: Literal["complete", "domcontentloaded"] = "complete"
    wait_for_selector: Optional[str] = None
    session_sticky_proxy:bool = True
    screenshots:Optional[Dict]=None
    screenshot_flags: Optional[List[ScreenshotFlag]] = None,
    webhook:Optional[str]=None
    timeout:Optional[int]=None # in milliseconds
    js_scenario: Dict = None
    extract: Dict = None
    lang:Optional[List[str]] = None
    os:Optional[str] = None
    auto_scroll:Optional[bool] = None
    cost_budget:Optional[int] = None

    def __init__(
        self,
        url: str,
        retry: bool = True,
        method: str = 'GET',
        country: Optional[str] = None,
        render_js: bool = False,
        cache: bool = False,
        cache_clear:bool = False,
        ssl:bool = False,
        dns:bool = False,
        asp:bool = False,
        debug: bool = False,
        raise_on_upstream_error:bool = True,
        cache_ttl:Optional[int] = None,
        proxy_pool:Optional[str] = None,
        session: Optional[str] = None,
        tags: Optional[Union[List[str], Set[str]]] = None,
        format: Optional[Format] = None, # raw(unchanged)
        format_options: Optional[List[FormatOption]] = None, # raw(unchanged)
        extraction_template: Optional[str] = None,  # a saved template name
        extraction_ephemeral_template: Optional[Dict] = None,  # ephemeraly declared json template
        extraction_prompt: Optional[str] = None,
        extraction_model: Optional[str] = None,        
        correlation_id: Optional[str] = None,
        cookies: Optional[CaseInsensitiveDict] = None,
        body: Optional[str] = None,
        data: Optional[Dict] = None,
        headers: Optional[Union[CaseInsensitiveDict, Dict[str, str]]] = None,
        js: str = None,
        rendering_wait: int = None,
        rendering_stage: Literal["complete", "domcontentloaded"] = "complete",
        wait_for_selector: Optional[str] = None,
        screenshots:Optional[Dict]=None,
        screenshot_flags: Optional[List[ScreenshotFlag]] = None,
        session_sticky_proxy:Optional[bool] = None,
        webhook:Optional[str] = None,
        timeout:Optional[int] = None, # in milliseconds
        js_scenario:Optional[List] = None,
        extract:Optional[Dict] = None,
        os:Optional[str] = None,
        lang:Optional[List[str]] = None,
        auto_scroll:Optional[bool] = None,
        cost_budget:Optional[int] = None
    ):
        assert(type(url) is str)

        if isinstance(tags, List):
            tags = set(tags)

        cookies = cookies or {}
        headers = headers or {}

        self.cookies = CaseInsensitiveDict(cookies)
        self.headers = CaseInsensitiveDict(headers)
        self.url = url
        self.retry = retry
        self.method = method
        self.country = country
        self.session_sticky_proxy = session_sticky_proxy
        self.render_js = render_js
        self.cache = cache
        self.cache_clear = cache_clear
        self.asp = asp
        self.webhook = webhook
        self.session = session
        self.debug = debug
        self.cache_ttl = cache_ttl
        self.proxy_pool = proxy_pool
        self.tags = tags or set()
        self.format = format
        self.format_options = format_options
        self.extraction_template = extraction_template
        self.extraction_ephemeral_template = extraction_ephemeral_template
        self.extraction_prompt = extraction_prompt
        self.extraction_model = extraction_model        
        self.correlation_id = correlation_id
        self.wait_for_selector = wait_for_selector
        self.body = body
        self.data = data
        self.js = js
        self.rendering_wait = rendering_wait
        self.rendering_stage = rendering_stage
        self.raise_on_upstream_error = raise_on_upstream_error
        self.screenshots = screenshots
        self.screenshot_flags = screenshot_flags
        self.key = None
        self.dns = dns
        self.ssl = ssl
        self.js_scenario = js_scenario
        self.timeout = timeout
        self.extract = extract
        self.lang = lang
        self.os = os
        self.auto_scroll = auto_scroll
        self.cost_budget = cost_budget

        if cookies:
            _cookies = []

            for name, value in cookies.items():
                _cookies.append(name + '=' + value)

            if 'cookie' in self.headers:
                if self.headers['cookie'][-1] != ';':
                    self.headers['cookie'] += ';'
            else:
                self.headers['cookie'] = ''

            self.headers['cookie'] += '; '.join(_cookies)

        if self.body and self.data:
            raise ScrapeConfigError('You cannot pass both parameters body and data. You must choose')

        if method in ['POST', 'PUT', 'PATCH']:
            if self.body is None and self.data is not None:
                if 'content-type' not in self.headers:
                    self.headers['content-type'] = 'application/x-www-form-urlencoded'
                    self.body = urlencode(data)
                else:
                    if self.headers['content-type'].find('application/json') != -1:
                        self.body = json.dumps(data)
                    elif self.headers['content-type'].find('application/x-www-form-urlencoded') != -1:
                        self.body = urlencode(data)
                    else:
                        raise ScrapeConfigError('Content-Type "%s" not supported, use body parameter to pass pre encoded body according to your content type' % self.headers['content-type'])
            elif self.body is None and self.data is None:
                self.headers['content-type'] = 'text/plain'

    def to_api_params(self, key:str) -> Dict:
        params = {
            'key': self.key or key,
            'url': self.url
        }

        if self.country is not None:
            params['country'] = self.country

        for name, value in self.headers.items():
            params['headers[%s]' % name] = value

        if self.webhook is not None:
            params['webhook_name'] = self.webhook

        if self.timeout is not None:
            params['timeout'] = self.timeout

        if self.extract is not None:
            params['extract'] = base64.urlsafe_b64encode(json.dumps(self.extract).encode('utf-8')).decode('utf-8')

        if self.cost_budget is not None:
            params['cost_budget'] = self.cost_budget

        if self.render_js is True:
            params['render_js'] = self._bool_to_http(self.render_js)

            if self.wait_for_selector is not None:
                params['wait_for_selector'] = self.wait_for_selector

            if self.js:
                params['js'] = base64.urlsafe_b64encode(self.js.encode('utf-8')).decode('utf-8')

            if self.js_scenario:
                params['js_scenario'] = base64.urlsafe_b64encode(json.dumps(self.js_scenario).encode('utf-8')).decode('utf-8')

            if self.rendering_wait:
                params['rendering_wait'] = self.rendering_wait
            
            if self.rendering_stage:
                params['rendering_stage'] = self.rendering_stage

            if self.screenshots is not None:
                for name, element in self.screenshots.items():
                    params['screenshots[%s]' % name] = element

            if self.screenshot_flags is not None:
                self.screenshot_flags = [ScreenshotFlag(flag) for flag in self.screenshot_flags]
                params["screenshot_flags"] = ",".join(flag.value for flag in self.screenshot_flags)
            else:
                if self.screenshot_flags is not None:
                    logging.warning('Params "screenshot_flags" is ignored. Works only if screenshots is enabled')

            if self.auto_scroll is True:
                params['auto_scroll'] = self._bool_to_http(self.auto_scroll)
        else:
            if self.wait_for_selector is not None:
                logging.warning('Params "wait_for_selector" is ignored. Works only if render_js is enabled')

            if self.screenshots:
                logging.warning('Params "screenshots" is ignored. Works only if render_js is enabled')

            if self.js_scenario:
                logging.warning('Params "js_scenario" is ignored. Works only if render_js is enabled')

            if self.js:
                logging.warning('Params "js" is ignored. Works only if render_js is enabled')

            if self.rendering_wait:
                logging.warning('Params "rendering_wait" is ignored. Works only if render_js is enabled')

        if self.asp is True:
            params['asp'] = self._bool_to_http(self.asp)

        if self.retry is False:
            params['retry'] = self._bool_to_http(self.retry)

        if self.cache is True:
            params['cache'] = self._bool_to_http(self.cache)

            if self.cache_clear is True:
                params['cache_clear'] = self._bool_to_http(self.cache_clear)

            if self.cache_ttl is not None:
                params['cache_ttl'] = self.cache_ttl
        else:
            if self.cache_clear is True:
                logging.warning('Params "cache_clear" is ignored. Works only if cache is enabled')

            if self.cache_ttl is not None:
                logging.warning('Params "cache_ttl" is ignored. Works only if cache is enabled')

        if self.dns is True:
            params['dns'] = self._bool_to_http(self.dns)

        if self.ssl is True:
            params['ssl'] = self._bool_to_http(self.ssl)

        if self.tags:
            params['tags'] = ','.join(self.tags)

        if self.format:
            params['format'] = Format(self.format).value
            if self.format_options:
                params['format'] += ':' + ','.join(FormatOption(option).value for option in self.format_options)

        if self.extraction_template and self.extraction_ephemeral_template:
            raise ScrapeConfigError('You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose')

        if self.extraction_template:
            params['extraction_template'] = self.extraction_template

        if self.extraction_ephemeral_template:
            self.extraction_ephemeral_template = json.dumps(self.extraction_ephemeral_template)
            params['extraction_template'] = 'ephemeral:' + urlsafe_b64encode(self.extraction_ephemeral_template.encode('utf-8')).decode('utf-8')

        if self.extraction_prompt:
            params['extraction_prompt'] = quote_plus(self.extraction_prompt)

        if self.extraction_model:
            params['extraction_model'] = self.extraction_model

        if self.correlation_id:
            params['correlation_id'] = self.correlation_id

        if self.session:
            params['session'] = self.session

            if self.session_sticky_proxy is True: # false by default
                params['session_sticky_proxy'] = self._bool_to_http(self.session_sticky_proxy)
        else:
            if self.session_sticky_proxy:
                logging.warning('Params "session_sticky_proxy" is ignored. Works only if session is enabled')

        if self.debug is True:
            params['debug'] = self._bool_to_http(self.debug)

        if self.proxy_pool is not None:
            params['proxy_pool'] = self.proxy_pool

        if self.lang is not None:
            params['lang'] = ','.join(self.lang)

        if self.os is not None:
            params['os'] = self.os

        return params

    @staticmethod
    def from_exported_config(config:str) -> 'ScrapeConfig':
        try:
            from msgpack import loads as msgpack_loads
        except ImportError as e:
            print('You must install msgpack package - run: pip install "scrapfly-sdk[seepdup] or pip install msgpack')
            raise

        data = msgpack_loads(base64.b64decode(config))

        headers = {}

        for name, value in data['headers'].items():
            if isinstance(value, Iterable):
                headers[name] = '; '.join(value)
            else:
                headers[name] = value

        return ScrapeConfig(
            url=data['url'],
            retry=data['retry'],
            headers=headers,
            session=data['session'],
            session_sticky_proxy=data['session_sticky_proxy'],
            cache=data['cache'],
            cache_ttl=data['cache_ttl'],
            cache_clear=data['cache_clear'],
            render_js=data['render_js'],
            method=data['method'],
            asp=data['asp'],
            body=data['body'],
            ssl=data['ssl'],
            dns=data['dns'],
            country=data['country'],
            debug=data['debug'],
            correlation_id=data['correlation_id'],
            tags=data['tags'],
            format=data['format'],
            js=data['js'],
            rendering_wait=data['rendering_wait'],
            screenshots=data['screenshots'] or {},
            screenshot_flags=data['screenshot_flags'],
            proxy_pool=data['proxy_pool'],
            auto_scroll=data['auto_scroll'],
            cost_budget=data['cost_budget']
        )

    def to_dict(self) -> Dict:
        """
        Export the ScrapeConfig instance to a plain dictionary. 
        Useful for JSON-serialization or other external storage.
        """
        
        return {
            'url': self.url,
            'retry': self.retry,
            'method': self.method,
            'country': self.country,
            'render_js': self.render_js,
            'cache': self.cache,
            'cache_clear': self.cache_clear,
            'ssl': self.ssl,
            'dns': self.dns,
            'asp': self.asp,
            'debug': self.debug,
            'raise_on_upstream_error': self.raise_on_upstream_error,
            'cache_ttl': self.cache_ttl,
            'proxy_pool': self.proxy_pool,
            'session': self.session,
            'tags': list(self.tags),
            'format': Format(self.format).value if self.format else None,
            'format_options': [FormatOption(option).value for option in self.format_options] if self.format_options else None,
            'extraction_template': self.extraction_template,
            'extraction_ephemeral_template': self.extraction_ephemeral_template,
            'extraction_prompt': self.extraction_prompt,
            'extraction_model': self.extraction_model,
            'correlation_id': self.correlation_id,
            'cookies': CaseInsensitiveDict(self.cookies),
            'body': self.body,
            'data': None if self.body else self.data,
            'headers': CaseInsensitiveDict(self.headers),
            'js': self.js,
            'rendering_wait': self.rendering_wait,
            'wait_for_selector': self.wait_for_selector,
            'session_sticky_proxy': self.session_sticky_proxy,
            'screenshots': self.screenshots,
            'screenshot_flags': [ScreenshotFlag(flag).value for flag in self.screenshot_flags] if self.screenshot_flags else None,
            'webhook': self.webhook,
            'timeout': self.timeout,
            'js_scenario': self.js_scenario,
            'extract': self.extract,
            'lang': self.lang,
            'os': self.os,
            'auto_scroll': self.auto_scroll,
            'cost_budget': self.cost_budget,
        }

    @staticmethod
    def from_dict(scrape_config_dict: Dict) -> 'ScrapeConfig':
        """Create a ScrapeConfig instance from a dictionary."""
        url = scrape_config_dict.get('url', None)
        retry = scrape_config_dict.get('retry', False)
        method = scrape_config_dict.get('method', 'GET')
        country = scrape_config_dict.get('country', None)
        render_js = scrape_config_dict.get('render_js', False)
        cache = scrape_config_dict.get('cache', False)
        cache_clear = scrape_config_dict.get('cache_clear', False)
        ssl = scrape_config_dict.get('ssl', False)
        dns = scrape_config_dict.get('dns', False)
        asp = scrape_config_dict.get('asp', False)
        debug = scrape_config_dict.get('debug', False)
        raise_on_upstream_error = scrape_config_dict.get('raise_on_upstream_error', True)
        cache_ttl = scrape_config_dict.get('cache_ttl', None)
        proxy_pool = scrape_config_dict.get('proxy_pool', None)
        session = scrape_config_dict.get('session', None)
        tags = scrape_config_dict.get('tags', [])

        format = scrape_config_dict.get('format', None)
        format = Format(format) if format else None

        format_options = scrape_config_dict.get('format_options', None)
        format_options = [FormatOption(option) for option in format_options] if format_options else None

        extraction_template = scrape_config_dict.get('extraction_template', None)
        extraction_ephemeral_template = scrape_config_dict.get('extraction_ephemeral_template', None)
        extraction_prompt = scrape_config_dict.get('extraction_prompt', None)
        extraction_model = scrape_config_dict.get('extraction_model', None)
        correlation_id = scrape_config_dict.get('correlation_id', None)
        cookies = scrape_config_dict.get('cookies', {})
        body = scrape_config_dict.get('body', None)
        data = scrape_config_dict.get('data', None)
        headers = scrape_config_dict.get('headers', {})
        js = scrape_config_dict.get('js', None)
        rendering_wait = scrape_config_dict.get('rendering_wait', None)
        wait_for_selector = scrape_config_dict.get('wait_for_selector', None)
        screenshots = scrape_config_dict.get('screenshots', [])
        
        screenshot_flags = scrape_config_dict.get('screenshot_flags', [])
        screenshot_flags = [ScreenshotFlag(flag) for flag in screenshot_flags] if screenshot_flags else None

        session_sticky_proxy = scrape_config_dict.get('session_sticky_proxy', False)
        webhook = scrape_config_dict.get('webhook', None)
        timeout = scrape_config_dict.get('timeout', None)
        js_scenario = scrape_config_dict.get('js_scenario', None)
        extract = scrape_config_dict.get('extract', None)
        os = scrape_config_dict.get('os', None)
        lang = scrape_config_dict.get('lang', None)
        auto_scroll = scrape_config_dict.get('auto_scroll', None)
        cost_budget = scrape_config_dict.get('cost_budget', None)

        return ScrapeConfig(
            url=url,
            retry=retry,
            method=method,
            country=country,
            render_js=render_js,
            cache=cache,
            cache_clear=cache_clear,
            ssl=ssl,
            dns=dns,
            asp=asp,
            debug=debug,
            raise_on_upstream_error=raise_on_upstream_error,
            cache_ttl=cache_ttl,
            proxy_pool=proxy_pool,
            session=session,
            tags=tags,
            format=format,
            format_options=format_options,
            extraction_template=extraction_template,
            extraction_ephemeral_template=extraction_ephemeral_template,
            extraction_prompt=extraction_prompt,
            extraction_model=extraction_model,
            correlation_id=correlation_id,
            cookies=cookies,
            body=body,
            data=data,
            headers=headers,
            js=js,
            rendering_wait=rendering_wait,
            wait_for_selector=wait_for_selector,
            screenshots=screenshots,
            screenshot_flags=screenshot_flags,
            session_sticky_proxy=session_sticky_proxy,
            webhook=webhook,
            timeout=timeout,
            js_scenario=js_scenario,
            extract=extract,
            os=os,
            lang=lang,
            auto_scroll=auto_scroll,
            cost_budget=cost_budget,
        )

Ancestors

BaseApiConfig

Class variables

var PUBLIC_DATACENTER_POOL: The type of the None singleton.
var PUBLIC_RESIDENTIAL_POOL: The type of the None singleton.
var asp : bool: The type of the None singleton.
var auto_scroll : bool | None: The type of the None singleton.
var body : str | None: The type of the None singleton.
var cache : bool: The type of the None singleton.
var cache_clear : bool: The type of the None singleton.
var cache_ttl : int | None: The type of the None singleton.
var cookies : requests.structures.CaseInsensitiveDict | None: The type of the None singleton.
var correlation_id : str | None: The type of the None singleton.
var cost_budget : int | None: The type of the None singleton.
var country : str | None: The type of the None singleton.
var data : Dict | None: The type of the None singleton.
var debug : bool: The type of the None singleton.
var dns : bool: The type of the None singleton.
var extract : Dict: The type of the None singleton.
var extraction_ephemeral_template : Dict | None: The type of the None singleton.
var extraction_model : str | None: The type of the None singleton.
var extraction_prompt : str | None: The type of the None singleton.
var extraction_template : str | None: The type of the None singleton.
var format : Format | None: The type of the None singleton.
var format_options : List[FormatOption] | None: The type of the None singleton.
var headers : requests.structures.CaseInsensitiveDict | None: The type of the None singleton.
var js : str: The type of the None singleton.
var js_scenario : Dict: The type of the None singleton.
var lang : List[str] | None: The type of the None singleton.
var method : str: The type of the None singleton.
var os : str | None: The type of the None singleton.
var proxy_pool : str | None: The type of the None singleton.
var raise_on_upstream_error : bool: The type of the None singleton.
var render_js : bool: The type of the None singleton.
var rendering_stage : Literal['complete', 'domcontentloaded']: The type of the None singleton.
var rendering_wait : int: The type of the None singleton.
var retry : bool: The type of the None singleton.
var screenshot_flags : List[ScreenshotFlag] | None: The type of the None singleton.
var screenshots : Dict | None: The type of the None singleton.
var session : str | None: The type of the None singleton.
var session_sticky_proxy : bool: The type of the None singleton.
var ssl : bool: The type of the None singleton.
var tags : List[str] | None: The type of the None singleton.
var timeout : int | None: The type of the None singleton.
var url : str: The type of the None singleton.
var wait_for_selector : str | None: The type of the None singleton.
var webhook : str | None: The type of the None singleton.

Static methods

def from_dict(scrape_config_dict: Dict) ‑> ScrapeConfig

Expand source code

@staticmethod
def from_dict(scrape_config_dict: Dict) -> 'ScrapeConfig':
    """Create a ScrapeConfig instance from a dictionary."""
    url = scrape_config_dict.get('url', None)
    retry = scrape_config_dict.get('retry', False)
    method = scrape_config_dict.get('method', 'GET')
    country = scrape_config_dict.get('country', None)
    render_js = scrape_config_dict.get('render_js', False)
    cache = scrape_config_dict.get('cache', False)
    cache_clear = scrape_config_dict.get('cache_clear', False)
    ssl = scrape_config_dict.get('ssl', False)
    dns = scrape_config_dict.get('dns', False)
    asp = scrape_config_dict.get('asp', False)
    debug = scrape_config_dict.get('debug', False)
    raise_on_upstream_error = scrape_config_dict.get('raise_on_upstream_error', True)
    cache_ttl = scrape_config_dict.get('cache_ttl', None)
    proxy_pool = scrape_config_dict.get('proxy_pool', None)
    session = scrape_config_dict.get('session', None)
    tags = scrape_config_dict.get('tags', [])

    format = scrape_config_dict.get('format', None)
    format = Format(format) if format else None

    format_options = scrape_config_dict.get('format_options', None)
    format_options = [FormatOption(option) for option in format_options] if format_options else None

    extraction_template = scrape_config_dict.get('extraction_template', None)
    extraction_ephemeral_template = scrape_config_dict.get('extraction_ephemeral_template', None)
    extraction_prompt = scrape_config_dict.get('extraction_prompt', None)
    extraction_model = scrape_config_dict.get('extraction_model', None)
    correlation_id = scrape_config_dict.get('correlation_id', None)
    cookies = scrape_config_dict.get('cookies', {})
    body = scrape_config_dict.get('body', None)
    data = scrape_config_dict.get('data', None)
    headers = scrape_config_dict.get('headers', {})
    js = scrape_config_dict.get('js', None)
    rendering_wait = scrape_config_dict.get('rendering_wait', None)
    wait_for_selector = scrape_config_dict.get('wait_for_selector', None)
    screenshots = scrape_config_dict.get('screenshots', [])
    
    screenshot_flags = scrape_config_dict.get('screenshot_flags', [])
    screenshot_flags = [ScreenshotFlag(flag) for flag in screenshot_flags] if screenshot_flags else None

    session_sticky_proxy = scrape_config_dict.get('session_sticky_proxy', False)
    webhook = scrape_config_dict.get('webhook', None)
    timeout = scrape_config_dict.get('timeout', None)
    js_scenario = scrape_config_dict.get('js_scenario', None)
    extract = scrape_config_dict.get('extract', None)
    os = scrape_config_dict.get('os', None)
    lang = scrape_config_dict.get('lang', None)
    auto_scroll = scrape_config_dict.get('auto_scroll', None)
    cost_budget = scrape_config_dict.get('cost_budget', None)

    return ScrapeConfig(
        url=url,
        retry=retry,
        method=method,
        country=country,
        render_js=render_js,
        cache=cache,
        cache_clear=cache_clear,
        ssl=ssl,
        dns=dns,
        asp=asp,
        debug=debug,
        raise_on_upstream_error=raise_on_upstream_error,
        cache_ttl=cache_ttl,
        proxy_pool=proxy_pool,
        session=session,
        tags=tags,
        format=format,
        format_options=format_options,
        extraction_template=extraction_template,
        extraction_ephemeral_template=extraction_ephemeral_template,
        extraction_prompt=extraction_prompt,
        extraction_model=extraction_model,
        correlation_id=correlation_id,
        cookies=cookies,
        body=body,
        data=data,
        headers=headers,
        js=js,
        rendering_wait=rendering_wait,
        wait_for_selector=wait_for_selector,
        screenshots=screenshots,
        screenshot_flags=screenshot_flags,
        session_sticky_proxy=session_sticky_proxy,
        webhook=webhook,
        timeout=timeout,
        js_scenario=js_scenario,
        extract=extract,
        os=os,
        lang=lang,
        auto_scroll=auto_scroll,
        cost_budget=cost_budget,
    )

Create a ScrapeConfig instance from a dictionary.

def from_exported_config(config: str) ‑> ScrapeConfig

Expand source code

@staticmethod
def from_exported_config(config:str) -> 'ScrapeConfig':
    try:
        from msgpack import loads as msgpack_loads
    except ImportError as e:
        print('You must install msgpack package - run: pip install "scrapfly-sdk[seepdup] or pip install msgpack')
        raise

    data = msgpack_loads(base64.b64decode(config))

    headers = {}

    for name, value in data['headers'].items():
        if isinstance(value, Iterable):
            headers[name] = '; '.join(value)
        else:
            headers[name] = value

    return ScrapeConfig(
        url=data['url'],
        retry=data['retry'],
        headers=headers,
        session=data['session'],
        session_sticky_proxy=data['session_sticky_proxy'],
        cache=data['cache'],
        cache_ttl=data['cache_ttl'],
        cache_clear=data['cache_clear'],
        render_js=data['render_js'],
        method=data['method'],
        asp=data['asp'],
        body=data['body'],
        ssl=data['ssl'],
        dns=data['dns'],
        country=data['country'],
        debug=data['debug'],
        correlation_id=data['correlation_id'],
        tags=data['tags'],
        format=data['format'],
        js=data['js'],
        rendering_wait=data['rendering_wait'],
        screenshots=data['screenshots'] or {},
        screenshot_flags=data['screenshot_flags'],
        proxy_pool=data['proxy_pool'],
        auto_scroll=data['auto_scroll'],
        cost_budget=data['cost_budget']
    )

Methods

def to_api_params(self, key: str) ‑> Dict

Expand source code

def to_api_params(self, key:str) -> Dict:
    params = {
        'key': self.key or key,
        'url': self.url
    }

    if self.country is not None:
        params['country'] = self.country

    for name, value in self.headers.items():
        params['headers[%s]' % name] = value

    if self.webhook is not None:
        params['webhook_name'] = self.webhook

    if self.timeout is not None:
        params['timeout'] = self.timeout

    if self.extract is not None:
        params['extract'] = base64.urlsafe_b64encode(json.dumps(self.extract).encode('utf-8')).decode('utf-8')

    if self.cost_budget is not None:
        params['cost_budget'] = self.cost_budget

    if self.render_js is True:
        params['render_js'] = self._bool_to_http(self.render_js)

        if self.wait_for_selector is not None:
            params['wait_for_selector'] = self.wait_for_selector

        if self.js:
            params['js'] = base64.urlsafe_b64encode(self.js.encode('utf-8')).decode('utf-8')

        if self.js_scenario:
            params['js_scenario'] = base64.urlsafe_b64encode(json.dumps(self.js_scenario).encode('utf-8')).decode('utf-8')

        if self.rendering_wait:
            params['rendering_wait'] = self.rendering_wait
        
        if self.rendering_stage:
            params['rendering_stage'] = self.rendering_stage

        if self.screenshots is not None:
            for name, element in self.screenshots.items():
                params['screenshots[%s]' % name] = element

        if self.screenshot_flags is not None:
            self.screenshot_flags = [ScreenshotFlag(flag) for flag in self.screenshot_flags]
            params["screenshot_flags"] = ",".join(flag.value for flag in self.screenshot_flags)
        else:
            if self.screenshot_flags is not None:
                logging.warning('Params "screenshot_flags" is ignored. Works only if screenshots is enabled')

        if self.auto_scroll is True:
            params['auto_scroll'] = self._bool_to_http(self.auto_scroll)
    else:
        if self.wait_for_selector is not None:
            logging.warning('Params "wait_for_selector" is ignored. Works only if render_js is enabled')

        if self.screenshots:
            logging.warning('Params "screenshots" is ignored. Works only if render_js is enabled')

        if self.js_scenario:
            logging.warning('Params "js_scenario" is ignored. Works only if render_js is enabled')

        if self.js:
            logging.warning('Params "js" is ignored. Works only if render_js is enabled')

        if self.rendering_wait:
            logging.warning('Params "rendering_wait" is ignored. Works only if render_js is enabled')

    if self.asp is True:
        params['asp'] = self._bool_to_http(self.asp)

    if self.retry is False:
        params['retry'] = self._bool_to_http(self.retry)

    if self.cache is True:
        params['cache'] = self._bool_to_http(self.cache)

        if self.cache_clear is True:
            params['cache_clear'] = self._bool_to_http(self.cache_clear)

        if self.cache_ttl is not None:
            params['cache_ttl'] = self.cache_ttl
    else:
        if self.cache_clear is True:
            logging.warning('Params "cache_clear" is ignored. Works only if cache is enabled')

        if self.cache_ttl is not None:
            logging.warning('Params "cache_ttl" is ignored. Works only if cache is enabled')

    if self.dns is True:
        params['dns'] = self._bool_to_http(self.dns)

    if self.ssl is True:
        params['ssl'] = self._bool_to_http(self.ssl)

    if self.tags:
        params['tags'] = ','.join(self.tags)

    if self.format:
        params['format'] = Format(self.format).value
        if self.format_options:
            params['format'] += ':' + ','.join(FormatOption(option).value for option in self.format_options)

    if self.extraction_template and self.extraction_ephemeral_template:
        raise ScrapeConfigError('You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose')

    if self.extraction_template:
        params['extraction_template'] = self.extraction_template

    if self.extraction_ephemeral_template:
        self.extraction_ephemeral_template = json.dumps(self.extraction_ephemeral_template)
        params['extraction_template'] = 'ephemeral:' + urlsafe_b64encode(self.extraction_ephemeral_template.encode('utf-8')).decode('utf-8')

    if self.extraction_prompt:
        params['extraction_prompt'] = quote_plus(self.extraction_prompt)

    if self.extraction_model:
        params['extraction_model'] = self.extraction_model

    if self.correlation_id:
        params['correlation_id'] = self.correlation_id

    if self.session:
        params['session'] = self.session

        if self.session_sticky_proxy is True: # false by default
            params['session_sticky_proxy'] = self._bool_to_http(self.session_sticky_proxy)
    else:
        if self.session_sticky_proxy:
            logging.warning('Params "session_sticky_proxy" is ignored. Works only if session is enabled')

    if self.debug is True:
        params['debug'] = self._bool_to_http(self.debug)

    if self.proxy_pool is not None:
        params['proxy_pool'] = self.proxy_pool

    if self.lang is not None:
        params['lang'] = ','.join(self.lang)

    if self.os is not None:
        params['os'] = self.os

    return params

def to_dict(self) ‑> Dict

Expand source code

def to_dict(self) -> Dict:
    """
    Export the ScrapeConfig instance to a plain dictionary. 
    Useful for JSON-serialization or other external storage.
    """
    
    return {
        'url': self.url,
        'retry': self.retry,
        'method': self.method,
        'country': self.country,
        'render_js': self.render_js,
        'cache': self.cache,
        'cache_clear': self.cache_clear,
        'ssl': self.ssl,
        'dns': self.dns,
        'asp': self.asp,
        'debug': self.debug,
        'raise_on_upstream_error': self.raise_on_upstream_error,
        'cache_ttl': self.cache_ttl,
        'proxy_pool': self.proxy_pool,
        'session': self.session,
        'tags': list(self.tags),
        'format': Format(self.format).value if self.format else None,
        'format_options': [FormatOption(option).value for option in self.format_options] if self.format_options else None,
        'extraction_template': self.extraction_template,
        'extraction_ephemeral_template': self.extraction_ephemeral_template,
        'extraction_prompt': self.extraction_prompt,
        'extraction_model': self.extraction_model,
        'correlation_id': self.correlation_id,
        'cookies': CaseInsensitiveDict(self.cookies),
        'body': self.body,
        'data': None if self.body else self.data,
        'headers': CaseInsensitiveDict(self.headers),
        'js': self.js,
        'rendering_wait': self.rendering_wait,
        'wait_for_selector': self.wait_for_selector,
        'session_sticky_proxy': self.session_sticky_proxy,
        'screenshots': self.screenshots,
        'screenshot_flags': [ScreenshotFlag(flag).value for flag in self.screenshot_flags] if self.screenshot_flags else None,
        'webhook': self.webhook,
        'timeout': self.timeout,
        'js_scenario': self.js_scenario,
        'extract': self.extract,
        'lang': self.lang,
        'os': self.os,
        'auto_scroll': self.auto_scroll,
        'cost_budget': self.cost_budget,
    }

Export the ScrapeConfig instance to a plain dictionary. Useful for JSON-serialization or other external storage.

class ScrapeConfigError (*args, **kwargs)

Expand source code

class ScrapeConfigError(Exception):
    pass

Common base class for all non-exit exceptions.

Ancestors

builtins.Exception
builtins.BaseException

class ScreenshotFlag (*args, **kwds)

Expand source code

class ScreenshotFlag(Enum):
    """
    Attributes:
        LOAD_IMAGES: Enable image rendering with the request, add extra usage for the bandwidth consumed.
        DARK_MODE: Enable dark mode display.
        BLOCK_BANNERS: Block cookies banners and overlay that cover the screen.
        HIGH_QUALITY: No compression on the output image.
        PRINT_MEDIA_FORMAT: Render the page in the print mode.
    """

    LOAD_IMAGES = "load_images"
    DARK_MODE = "dark_mode"
    BLOCK_BANNERS = "block_banners"
    HIGH_QUALITY = "high_quality"
    PRINT_MEDIA_FORMAT = "print_media_format"

Attributes

LOAD_IMAGES: Enable image rendering with the request, add extra usage for the bandwidth consumed.
DARK_MODE: Enable dark mode display.
BLOCK_BANNERS: Block cookies banners and overlay that cover the screen.
HIGH_QUALITY: No compression on the output image.
PRINT_MEDIA_FORMAT: Render the page in the print mode.

Ancestors

enum.Enum

Class variables

var BLOCK_BANNERS: The type of the None singleton.
var DARK_MODE: The type of the None singleton.
var HIGH_QUALITY: The type of the None singleton.
var LOAD_IMAGES: The type of the None singleton.
var PRINT_MEDIA_FORMAT: The type of the None singleton.