Module scrapfly.scrape_config
import base64
import json
import logging
from enum import Enum
from urllib.parse import urlencode, quote_plus
from base64 import urlsafe_b64encode
from typing import Optional, List, Dict, Iterable, Union, Set
from requests.structures import CaseInsensitiveDict
from .api_config import BaseApiConfig
class ScreenshotFlag(Enum):
    """
    Attributes:
        LOAD_IMAGES: Enable image rendering with the request; adds extra usage for the bandwidth consumed.
        DARK_MODE: Enable dark mode display.
        BLOCK_BANNERS: Block cookie banners and overlays that cover the screen.
        HIGH_QUALITY: No compression on the output image.
        PRINT_MEDIA_FORMAT: Render the page in print mode.
    """

    LOAD_IMAGES = "load_images"
    DARK_MODE = "dark_mode"
    BLOCK_BANNERS = "block_banners"
    HIGH_QUALITY = "high_quality"
    PRINT_MEDIA_FORMAT = "print_media_format"
class Format(Enum):
    """
    Attributes:
        JSON: JSON format.
        TEXT: Text format.
        MARKDOWN: Markdown format.
        CLEAN_HTML: Clean HTML format.
    """

    JSON = "json"
    TEXT = "text"
    MARKDOWN = "markdown"
    CLEAN_HTML = "clean_html"
class FormatOption(Enum):
    """
    Attributes:
        NO_IMAGES: exclude images from the `markdown` format
        NO_LINKS: exclude links from the `markdown` format
    """

    NO_IMAGES = "no_images"
    NO_LINKS = "no_links"
    ONLY_CONTENT = "only_content"
class ScrapeConfigError(Exception):
    pass
class ScrapeConfig(BaseApiConfig):
    PUBLIC_DATACENTER_POOL = 'public_datacenter_pool'
    PUBLIC_RESIDENTIAL_POOL = 'public_residential_pool'

    url: str
    retry: bool = True
    method: str = 'GET'
    country: Optional[str] = None
    render_js: bool = False
    cache: bool = False
    cache_clear: bool = False
    ssl: bool = False
    dns: bool = False
    asp: bool = False
    debug: bool = False
    raise_on_upstream_error: bool = True
    cache_ttl: Optional[int] = None
    proxy_pool: Optional[str] = None
    session: Optional[str] = None
    tags: Optional[List[str]] = None
    format: Optional[Format] = None  # raw(unchanged)
    format_options: Optional[List[FormatOption]] = None
    extraction_template: Optional[str] = None  # a saved template name
    extraction_ephemeral_template: Optional[Dict] = None  # ephemerally declared json template
    extraction_prompt: Optional[str] = None
    extraction_model: Optional[str] = None
    correlation_id: Optional[str] = None
    cookies: Optional[CaseInsensitiveDict] = None
    body: Optional[str] = None
    data: Optional[Dict] = None
    headers: Optional[CaseInsensitiveDict] = None
    js: Optional[str] = None
    rendering_wait: Optional[int] = None
    wait_for_selector: Optional[str] = None
    session_sticky_proxy: bool = True
    screenshots: Optional[Dict] = None
    screenshot_flags: Optional[List[ScreenshotFlag]] = None
    webhook: Optional[str] = None
    timeout: Optional[int] = None  # in milliseconds
    js_scenario: Optional[List] = None
    extract: Optional[Dict] = None
    lang: Optional[List[str]] = None
    os: Optional[str] = None
    auto_scroll: Optional[bool] = None
    cost_budget: Optional[int] = None
    def __init__(
        self,
        url: str,
        retry: bool = True,
        method: str = 'GET',
        country: Optional[str] = None,
        render_js: bool = False,
        cache: bool = False,
        cache_clear: bool = False,
        ssl: bool = False,
        dns: bool = False,
        asp: bool = False,
        debug: bool = False,
        raise_on_upstream_error: bool = True,
        cache_ttl: Optional[int] = None,
        proxy_pool: Optional[str] = None,
        session: Optional[str] = None,
        tags: Optional[Union[List[str], Set[str]]] = None,
        format: Optional[Format] = None,  # raw(unchanged)
        format_options: Optional[List[FormatOption]] = None,  # raw(unchanged)
        extraction_template: Optional[str] = None,  # a saved template name
        extraction_ephemeral_template: Optional[Dict] = None,  # ephemerally declared json template
        extraction_prompt: Optional[str] = None,
        extraction_model: Optional[str] = None,
        correlation_id: Optional[str] = None,
        cookies: Optional[CaseInsensitiveDict] = None,
        body: Optional[str] = None,
        data: Optional[Dict] = None,
        headers: Optional[Union[CaseInsensitiveDict, Dict[str, str]]] = None,
        js: Optional[str] = None,
        rendering_wait: Optional[int] = None,
        wait_for_selector: Optional[str] = None,
        screenshots: Optional[Dict] = None,
        screenshot_flags: Optional[List[ScreenshotFlag]] = None,
        session_sticky_proxy: Optional[bool] = None,
        webhook: Optional[str] = None,
        timeout: Optional[int] = None,  # in milliseconds
        js_scenario: Optional[List] = None,
        extract: Optional[Dict] = None,
        os: Optional[str] = None,
        lang: Optional[List[str]] = None,
        auto_scroll: Optional[bool] = None,
        cost_budget: Optional[int] = None
    ):
        assert type(url) is str

        if isinstance(tags, List):
            tags = set(tags)

        cookies = cookies or {}
        headers = headers or {}

        self.cookies = CaseInsensitiveDict(cookies)
        self.headers = CaseInsensitiveDict(headers)
        self.url = url
        self.retry = retry
        self.method = method
        self.country = country
        self.session_sticky_proxy = session_sticky_proxy
        self.render_js = render_js
        self.cache = cache
        self.cache_clear = cache_clear
        self.asp = asp
        self.webhook = webhook
        self.session = session
        self.debug = debug
        self.cache_ttl = cache_ttl
        self.proxy_pool = proxy_pool
        self.tags = tags or set()
        self.format = format
        self.format_options = format_options
        self.extraction_template = extraction_template
        self.extraction_ephemeral_template = extraction_ephemeral_template
        self.extraction_prompt = extraction_prompt
        self.extraction_model = extraction_model
        self.correlation_id = correlation_id
        self.wait_for_selector = wait_for_selector
        self.body = body
        self.data = data
        self.js = js
        self.rendering_wait = rendering_wait
        self.raise_on_upstream_error = raise_on_upstream_error
        self.screenshots = screenshots
        self.screenshot_flags = screenshot_flags
        self.key = None
        self.dns = dns
        self.ssl = ssl
        self.js_scenario = js_scenario
        self.timeout = timeout
        self.extract = extract
        self.lang = lang
        self.os = os
        self.auto_scroll = auto_scroll
        self.cost_budget = cost_budget

        if cookies:
            _cookies = []

            for name, value in cookies.items():
                _cookies.append(name + '=' + value)

            if 'cookie' in self.headers:
                if self.headers['cookie'][-1] != ';':
                    self.headers['cookie'] += ';'
            else:
                self.headers['cookie'] = ''

            self.headers['cookie'] += '; '.join(_cookies)

        if self.body and self.data:
            raise ScrapeConfigError('You cannot pass both parameters body and data. You must choose')

        if method in ['POST', 'PUT', 'PATCH']:
            if self.body is None and self.data is not None:
                if 'content-type' not in self.headers:
                    self.headers['content-type'] = 'application/x-www-form-urlencoded'
                    self.body = urlencode(data)
                else:
                    if self.headers['content-type'].find('application/json') != -1:
                        self.body = json.dumps(data)
                    elif self.headers['content-type'].find('application/x-www-form-urlencoded') != -1:
                        self.body = urlencode(data)
                    else:
                        raise ScrapeConfigError('Content-Type "%s" not supported, use body parameter to pass a pre-encoded body according to your content type' % self.headers['content-type'])
            elif self.body is None and self.data is None:
                self.headers['content-type'] = 'text/plain'
    def to_api_params(self, key: str) -> Dict:
        params = {
            'key': self.key or key,
            'url': self.url
        }

        if self.country is not None:
            params['country'] = self.country

        for name, value in self.headers.items():
            params['headers[%s]' % name] = value

        if self.webhook is not None:
            params['webhook_name'] = self.webhook

        if self.timeout is not None:
            params['timeout'] = self.timeout

        if self.extract is not None:
            params['extract'] = base64.urlsafe_b64encode(json.dumps(self.extract).encode('utf-8')).decode('utf-8')

        if self.cost_budget is not None:
            params['cost_budget'] = self.cost_budget

        if self.render_js is True:
            params['render_js'] = self._bool_to_http(self.render_js)

            if self.wait_for_selector is not None:
                params['wait_for_selector'] = self.wait_for_selector

            if self.js:
                params['js'] = base64.urlsafe_b64encode(self.js.encode('utf-8')).decode('utf-8')

            if self.js_scenario:
                params['js_scenario'] = base64.urlsafe_b64encode(json.dumps(self.js_scenario).encode('utf-8')).decode('utf-8')

            if self.rendering_wait:
                params['rendering_wait'] = self.rendering_wait

            if self.screenshots is not None:
                for name, element in self.screenshots.items():
                    params['screenshots[%s]' % name] = element

                if self.screenshot_flags is not None:
                    self.screenshot_flags = [ScreenshotFlag(flag) for flag in self.screenshot_flags]
                    params["screenshot_flags"] = ",".join(flag.value for flag in self.screenshot_flags)
            else:
                if self.screenshot_flags is not None:
                    logging.warning('Params "screenshot_flags" is ignored. Works only if screenshots is enabled')

            if self.auto_scroll is True:
                params['auto_scroll'] = self._bool_to_http(self.auto_scroll)
        else:
            if self.wait_for_selector is not None:
                logging.warning('Params "wait_for_selector" is ignored. Works only if render_js is enabled')

            if self.screenshots:
                logging.warning('Params "screenshots" is ignored. Works only if render_js is enabled')

            if self.js_scenario:
                logging.warning('Params "js_scenario" is ignored. Works only if render_js is enabled')

            if self.js:
                logging.warning('Params "js" is ignored. Works only if render_js is enabled')

            if self.rendering_wait:
                logging.warning('Params "rendering_wait" is ignored. Works only if render_js is enabled')

        if self.asp is True:
            params['asp'] = self._bool_to_http(self.asp)

        if self.retry is False:
            params['retry'] = self._bool_to_http(self.retry)

        if self.cache is True:
            params['cache'] = self._bool_to_http(self.cache)

            if self.cache_clear is True:
                params['cache_clear'] = self._bool_to_http(self.cache_clear)

            if self.cache_ttl is not None:
                params['cache_ttl'] = self.cache_ttl
        else:
            if self.cache_clear is True:
                logging.warning('Params "cache_clear" is ignored. Works only if cache is enabled')

            if self.cache_ttl is not None:
                logging.warning('Params "cache_ttl" is ignored. Works only if cache is enabled')

        if self.dns is True:
            params['dns'] = self._bool_to_http(self.dns)

        if self.ssl is True:
            params['ssl'] = self._bool_to_http(self.ssl)

        if self.tags:
            params['tags'] = ','.join(self.tags)

        if self.format:
            params['format'] = Format(self.format).value

            if self.format_options:
                params['format'] += ':' + ','.join(FormatOption(option).value for option in self.format_options)

        if self.extraction_template and self.extraction_ephemeral_template:
            raise ScrapeConfigError('You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose')

        if self.extraction_template:
            params['extraction_template'] = self.extraction_template

        if self.extraction_ephemeral_template:
            self.extraction_ephemeral_template = json.dumps(self.extraction_ephemeral_template)
            params['extraction_template'] = 'ephemeral:' + urlsafe_b64encode(self.extraction_ephemeral_template.encode('utf-8')).decode('utf-8')

        if self.extraction_prompt:
            params['extraction_prompt'] = quote_plus(self.extraction_prompt)

        if self.extraction_model:
            params['extraction_model'] = self.extraction_model

        if self.correlation_id:
            params['correlation_id'] = self.correlation_id

        if self.session:
            params['session'] = self.session

            if self.session_sticky_proxy is True:  # false by default on the API side
                params['session_sticky_proxy'] = self._bool_to_http(self.session_sticky_proxy)
        else:
            if self.session_sticky_proxy:
                logging.warning('Params "session_sticky_proxy" is ignored. Works only if session is enabled')

        if self.debug is True:
            params['debug'] = self._bool_to_http(self.debug)

        if self.proxy_pool is not None:
            params['proxy_pool'] = self.proxy_pool

        if self.lang is not None:
            params['lang'] = ','.join(self.lang)

        if self.os is not None:
            params['os'] = self.os

        return params
    @staticmethod
    def from_exported_config(config: str) -> 'ScrapeConfig':
        try:
            from msgpack import loads as msgpack_loads
        except ImportError:
            print('You must install the msgpack package - run: pip install "scrapfly-sdk[speedups]" or pip install msgpack')
            raise

        data = msgpack_loads(base64.b64decode(config))

        headers = {}

        for name, value in data['headers'].items():
            if isinstance(value, Iterable):
                headers[name] = '; '.join(value)
            else:
                headers[name] = value

        return ScrapeConfig(
            url=data['url'],
            retry=data['retry'],
            headers=headers,
            session=data['session'],
            session_sticky_proxy=data['session_sticky_proxy'],
            cache=data['cache'],
            cache_ttl=data['cache_ttl'],
            cache_clear=data['cache_clear'],
            render_js=data['render_js'],
            method=data['method'],
            asp=data['asp'],
            body=data['body'],
            ssl=data['ssl'],
            dns=data['dns'],
            country=data['country'],
            debug=data['debug'],
            correlation_id=data['correlation_id'],
            tags=data['tags'],
            format=data['format'],
            js=data['js'],
            rendering_wait=data['rendering_wait'],
            screenshots=data['screenshots'] or {},
            screenshot_flags=data['screenshot_flags'],
            proxy_pool=data['proxy_pool'],
            auto_scroll=data['auto_scroll'],
            cost_budget=data['cost_budget']
        )
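Example: a minimal usage sketch (the URL and credentials below are placeholders, not part of the module). Passing data on a POST without an explicit content-type shows how the constructor encodes the body and folds cookies into the Cookie header.

from scrapfly import ScrapeConfig

# Hypothetical form login; no content-type header is set, so the
# constructor defaults to application/x-www-form-urlencoded.
config = ScrapeConfig(
    url='https://example.com/login',
    method='POST',
    data={'user': 'demo', 'password': 'secret'},
    cookies={'locale': 'en'},
)

print(config.body)                     # user=demo&password=secret
print(config.headers['content-type'])  # application/x-www-form-urlencoded
print(config.headers['cookie'])        # locale=en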
Classes
class Format (value, names=None, *, module=None, qualname=None, type=None, start=1)
Attributes
JSON
- JSON format.
TEXT
- Text format.
MARKDOWN
- Markdown format.
CLEAN_HTML
- Clean HTML format.
Ancestors
- enum.Enum
Class variables
var CLEAN_HTML
var JSON
var MARKDOWN
var TEXT
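A short sketch of how format reaches the API (the key is a placeholder): to_api_params serializes the enum to its string value.

from scrapfly import ScrapeConfig
from scrapfly.scrape_config import Format

config = ScrapeConfig(url='https://example.com', format=Format.MARKDOWN)
params = config.to_api_params(key='YOUR-API-KEY')  # placeholder key
print(params['format'])  # markdown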
class FormatOption (value, names=None, *, module=None, qualname=None, type=None, start=1)
Attributes
NO_IMAGES
- exclude images from the `markdown` format
NO_LINKS
- exclude links from the `markdown` format
Ancestors
- enum.Enum
Class variables
var NO_IMAGES
var NO_LINKS
var ONLY_CONTENT
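Format options only take effect when format is set; to_api_params appends them to the format parameter after a colon, as this sketch (placeholder key) shows.

from scrapfly import ScrapeConfig
from scrapfly.scrape_config import Format, FormatOption

config = ScrapeConfig(
    url='https://example.com',
    format=Format.MARKDOWN,
    format_options=[FormatOption.NO_IMAGES, FormatOption.NO_LINKS],
)
params = config.to_api_params(key='YOUR-API-KEY')  # placeholder key
print(params['format'])  # markdown:no_images,no_links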
class ScrapeConfig (url: str, retry: bool = True, method: str = 'GET', country: Optional[str] = None, render_js: bool = False, cache: bool = False, cache_clear: bool = False, ssl: bool = False, dns: bool = False, asp: bool = False, debug: bool = False, raise_on_upstream_error: bool = True, cache_ttl: Optional[int] = None, proxy_pool: Optional[str] = None, session: Optional[str] = None, tags: Optional[Union[List[str], Set[str]]] = None, format: Optional[Format] = None, format_options: Optional[List[FormatOption]] = None, extraction_template: Optional[str] = None, extraction_ephemeral_template: Optional[Dict] = None, extraction_prompt: Optional[str] = None, extraction_model: Optional[str] = None, correlation_id: Optional[str] = None, cookies: Optional[requests.structures.CaseInsensitiveDict] = None, body: Optional[str] = None, data: Optional[Dict] = None, headers: Optional[Union[requests.structures.CaseInsensitiveDict, Dict[str, str]]] = None, js: Optional[str] = None, rendering_wait: Optional[int] = None, wait_for_selector: Optional[str] = None, screenshots: Optional[Dict] = None, screenshot_flags: Optional[List[ScreenshotFlag]] = None, session_sticky_proxy: Optional[bool] = None, webhook: Optional[str] = None, timeout: Optional[int] = None, js_scenario: Optional[List] = None, extract: Optional[Dict] = None, os: Optional[str] = None, lang: Optional[List[str]] = None, auto_scroll: Optional[bool] = None, cost_budget: Optional[int] = None)
Ancestors
- scrapfly.api_config.BaseApiConfig
Class variables
var PUBLIC_DATACENTER_POOL
var PUBLIC_RESIDENTIAL_POOL
var asp : bool
var auto_scroll : Optional[bool]
var body : Optional[str]
var cache : bool
var cache_clear : bool
var cache_ttl : Optional[int]
var cookies : Optional[requests.structures.CaseInsensitiveDict]
var correlation_id : Optional[str]
var cost_budget : Optional[int]
var country : Optional[str]
var data : Optional[Dict]
var debug : bool
var dns : bool
var extract : Optional[Dict]
var extraction_ephemeral_template : Optional[Dict]
var extraction_model : Optional[str]
var extraction_prompt : Optional[str]
var extraction_template : Optional[str]
var format : Optional[Format]
var format_options : Optional[List[FormatOption]]
var headers : Optional[requests.structures.CaseInsensitiveDict]
var js : Optional[str]
var js_scenario : Optional[List]
var lang : Optional[List[str]]
var method : str
var os : Optional[str]
var proxy_pool : Optional[str]
var raise_on_upstream_error : bool
var render_js : bool
var rendering_wait : Optional[int]
var retry : bool
var screenshot_flags : Optional[List[ScreenshotFlag]]
var screenshots : Optional[Dict]
var session : Optional[str]
var session_sticky_proxy : bool
var ssl : bool
var tags : Optional[List[str]]
var timeout : Optional[int]
var url : str
var wait_for_selector : Optional[str]
var webhook : Optional[str]
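Several options are only honored together: browser features require render_js, and cache_ttl/cache_clear require cache (otherwise to_api_params logs a warning and drops them). A sketch with placeholder values:

from scrapfly import ScrapeConfig

config = ScrapeConfig(
    url='https://example.com/products',  # placeholder URL
    render_js=True,                      # required by the two options below
    wait_for_selector='.review',
    rendering_wait=2000,                 # milliseconds
    cache=True,
    cache_ttl=3600,                      # honored because cache=True
)
params = config.to_api_params(key='YOUR-API-KEY')
# params now includes render_js, wait_for_selector, rendering_wait, cache and cache_ttl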
Static methods
def from_exported_config(config: str) ‑> ScrapeConfig
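A round-trip sketch: the payload below hand-builds the msgpack blob this method expects (every key it reads must be present; all values here are placeholders).

import base64
from msgpack import dumps as msgpack_dumps  # pip install msgpack

from scrapfly import ScrapeConfig

payload = {
    'url': 'https://example.com', 'retry': True, 'headers': {},
    'session': None, 'session_sticky_proxy': None, 'cache': False,
    'cache_ttl': None, 'cache_clear': False, 'render_js': False,
    'method': 'GET', 'asp': False, 'body': None, 'ssl': False, 'dns': False,
    'country': None, 'debug': False, 'correlation_id': None, 'tags': None,
    'format': None, 'js': None, 'rendering_wait': None, 'screenshots': None,
    'screenshot_flags': None, 'proxy_pool': None, 'auto_scroll': None,
    'cost_budget': None,
}

exported = base64.b64encode(msgpack_dumps(payload)).decode()
config = ScrapeConfig.from_exported_config(exported)
assert config.url == 'https://example.com'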
Methods
def to_api_params(self, key: str) ‑> Dict
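A sketch of the flattening this method performs (placeholder key and header): each header becomes a headers[<name>] entry and js is URL-safe base64-encoded.

import base64

from scrapfly import ScrapeConfig

config = ScrapeConfig(
    url='https://example.com',
    render_js=True,  # js is only sent when render_js is enabled
    js='return document.title',
    headers={'User-Agent': 'my-bot/1.0'},  # hypothetical header
)
params = config.to_api_params(key='YOUR-API-KEY')

print(params['headers[User-Agent]'])                    # my-bot/1.0
print(base64.urlsafe_b64decode(params['js']).decode())  # return document.title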
class ScrapeConfigError (*args, **kwargs)
Common base class for all non-exit exceptions.
Ancestors
- builtins.Exception
- builtins.BaseException
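ScrapeConfig raises this error when options conflict, for example when both body and data are passed (a sketch):

from scrapfly import ScrapeConfig
from scrapfly.scrape_config import ScrapeConfigError

try:
    ScrapeConfig(url='https://example.com', body='raw payload', data={'a': '1'})
except ScrapeConfigError as exc:
    print(exc)  # You cannot pass both parameters body and data. You must choose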
class ScreenshotFlag (value, names=None, *, module=None, qualname=None, type=None, start=1)
Attributes
LOAD_IMAGES
- Enable image rendering with the request; adds extra usage for the bandwidth consumed.
DARK_MODE
- Enable dark mode display.
BLOCK_BANNERS
- Block cookie banners and overlays that cover the screen.
HIGH_QUALITY
- No compression on the output image.
PRINT_MEDIA_FORMAT
- Render the page in print mode.
Ancestors
- enum.Enum
Class variables
var BLOCK_BANNERS
var DARK_MODE
var HIGH_QUALITY
var LOAD_IMAGES
var PRINT_MEDIA_FORMAT
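Flags apply only when screenshots are requested and render_js is enabled. A sketch (placeholder key; the "fullpage" capture value is an assumption taken from Scrapfly's screenshot convention, and "main" is an arbitrary name):

from scrapfly import ScrapeConfig
from scrapfly.scrape_config import ScreenshotFlag

config = ScrapeConfig(
    url='https://example.com',
    render_js=True,  # screenshots require the headless browser
    screenshots={'main': 'fullpage'},
    screenshot_flags=[ScreenshotFlag.DARK_MODE, ScreenshotFlag.BLOCK_BANNERS],
)
params = config.to_api_params(key='YOUR-API-KEY')

print(params['screenshots[main]'])  # fullpage
print(params['screenshot_flags'])   # dark_mode,block_banners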