scrapfly

 1 __version__ = '0.8.5'
 2
 3 from typing import Tuple
 4 from .errors import ScrapflyError
 5 from .errors import ScrapflyAspError
 6 from .errors import ScrapflyProxyError
 7 from .errors import ScrapflyScheduleError
 8 from .errors import ScrapflyScrapeError
 9 from .errors import ScrapflySessionError
10 from .errors import ScrapflyThrottleError
11 from .errors import ScrapflyWebhookError
12 from .errors import EncoderError
13 from .errors import ErrorFactory
14 from .errors import HttpError
15 from .errors import UpstreamHttpError
16 from .errors import UpstreamHttpClientError
17 from .errors import UpstreamHttpServerError
18 from .errors import ApiHttpClientError
19 from .errors import ApiHttpServerError
20 from .api_response import ScrapeApiResponse, ResponseBodyHandler
21 from .client import ScrapflyClient
22 from .scrape_config import ScrapeConfig
23
24 __all__:Tuple[str, ...] = (
25    'ScrapflyError',
26    'ScrapflyAspError',
27    'ScrapflyProxyError',
28    'ScrapflyScheduleError',
29    'ScrapflyScrapeError',
30    'ScrapflySessionError',
31    'ScrapflyThrottleError',
32    'ScrapflyWebhookError',
33    'UpstreamHttpError',
34    'UpstreamHttpClientError',
35    'UpstreamHttpServerError',
36    'ApiHttpClientError',
37    'ApiHttpServerError',
38    'EncoderError',
39    'ScrapeApiResponse',
40    'ErrorFactory',
41    'HttpError',
42    'ScrapflyClient',
43    'ResponseBodyHandler',
44    'ScrapeConfig'
45)
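A minimal usage sketch based on these exports; the API key and URL are placeholders, not working values:

    from scrapfly import ScrapflyClient, ScrapeConfig, ScrapflyError

    client = ScrapflyClient(key='YOUR-API-KEY')  # placeholder key

    try:
        api_response = client.scrape(ScrapeConfig(url='https://httpbin.org/html'))  # placeholder URL
        print(api_response.status_code)           # status of the Scrapfly API call
        print(api_response.upstream_status_code)  # status of the scraped website
        print(len(api_response.content))          # scraped page content
    except ScrapflyError as e:
        print('scrape failed:', e)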
class ScrapflyError(builtins.Exception):
 6 class ScrapflyError(Exception):
 7    KIND_HTTP_BAD_RESPONSE = 'HTTP_BAD_RESPONSE'
 8    KIND_SCRAPFLY_ERROR = 'SCRAPFLY_ERROR'
 9
10    RESOURCE_PROXY = 'PROXY'
11    RESOURCE_THROTTLE = 'THROTTLE'
12    RESOURCE_SCRAPE = 'SCRAPE'
13    RESOURCE_ASP = 'ASP'
14    RESOURCE_SCHEDULE = 'SCHEDULE'
15    RESOURCE_WEBHOOK = 'WEBHOOK'
16    RESOURCE_SESSION = 'SESSION'
17
18    RETRYABLE_CODE = [
19        'ERR::SCRAPE::OPERATION_TIMEOUT',
20        'ERR::SCRAPE::TOO_MANY_CONCURRENT_REQUEST',
21        'ERR::PROXY::RESOURCES_SATURATION',
22        'ERR::PROXY::NOT_REACHABLE',
23        'ERR::PROXY::UNAVAILABLE',
24        'ERR::THROTTLE::MAX_CONCURRENT_REQUEST_EXCEEDED',
25        'ERR::THROTTLE::MAX_REQUEST_RATE_EXCEEDED',
26        'ERR::SESSION::CONCURRENT_ACCESS',
27        'ERR::ASP::SHIELD_EXPIRED',
28        'ERR::SCRAPE::NETWORK_ISSUE',
29        'ERR::SCRAPE::DRIVER_TIMEOUT'
30    ]
31
32    def __init__(
33        self,
34        message: str,
35        code: str,
36        http_status_code: int,
37        resource: Optional[str]=None,
38        is_retryable: bool = False,
39        retry_delay: Optional[int] = None,
40        retry_times: Optional[int] = None,
41        documentation_url: Optional[str] = None,
42        api_response: Optional['ApiResponse'] = None
43    ):
44        self.message = message
45        self.code = code
46        self.retry_delay = retry_delay
47        self.retry_times = retry_times
48        self.resource = resource
49        self.is_retryable = is_retryable
50        self.documentation_url = documentation_url
51        self.api_response = api_response
52        self.http_status_code = http_status_code
53
54        super().__init__(self.message, str(self.code))
55
56    def __str__(self):
57        message = self.message
58
59        if self.documentation_url is not None:
60            message += '. Learn more: %s' % self.documentation_url
61
62        return message

Base exception for every error raised by the Scrapfly SDK. It carries the Scrapfly error code, the HTTP status code, retry metadata (is_retryable, retry_delay, retry_times) and, when available, the originating API response.

ScrapflyError( message: str, code: str, http_status_code: int, resource: Optional[str] = None, is_retryable: bool = False, retry_delay: Optional[int] = None, retry_times: Optional[int] = None, documentation_url: Optional[str] = None, api_response: Optional[ForwardRef('ApiResponse')] = None)
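The retry metadata exposed above can drive a simple retry loop; a minimal sketch (the API key and URL are placeholders):

    import time
    from scrapfly import ScrapflyClient, ScrapeConfig, ScrapflyError

    client = ScrapflyClient(key='YOUR-API-KEY')  # placeholder key
    config = ScrapeConfig(url='https://example.com')  # placeholder URL

    for attempt in range(3):
        try:
            api_response = client.scrape(config)
            break
        except ScrapflyError as e:
            if not e.is_retryable:
                raise
            # ErrorFactory fills retry_delay (5 seconds by default) for retryable errors
            time.sleep(e.retry_delay or 5)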
class ScrapflyAspError(scrapfly.HttpError):
142 class ScrapflyAspError(HttpError):
143    pass

Error reported for the ASP (Anti Scraping Protection) resource (error codes of the form ERR::ASP::*).
class ScrapflyProxyError(scrapfly.HttpError):
134 class ScrapflyProxyError(HttpError):
135    pass

Error reported for the PROXY resource (error codes of the form ERR::PROXY::*).
class ScrapflyScheduleError(scrapfly.HttpError):
146 class ScrapflyScheduleError(HttpError):
147    pass

Error reported for the SCHEDULE resource (error codes of the form ERR::SCHEDULE::*).
class ScrapflyScrapeError(scrapfly.HttpError):
130 class ScrapflyScrapeError(HttpError):
131    pass

Error reported for the SCRAPE resource (error codes of the form ERR::SCRAPE::*).
class ScrapflySessionError(scrapfly.HttpError):
154 class ScrapflySessionError(HttpError):
155    pass

Error reported for the SESSION resource (error codes of the form ERR::SESSION::*).
class ScrapflyThrottleError(scrapfly.HttpError):
138 class ScrapflyThrottleError(HttpError):
139    pass

Error reported for the THROTTLE resource (error codes of the form ERR::THROTTLE::*).
class ScrapflyWebhookError(scrapfly.HttpError):
150 class ScrapflyWebhookError(HttpError):
151    pass

Error reported for the WEBHOOK resource (error codes of the form ERR::WEBHOOK::*).
class UpstreamHttpError(scrapfly.HttpError):
102 class UpstreamHttpError(HttpError):
103    pass

Base class for HTTP errors returned by the upstream (scraped) website rather than by the Scrapfly API.
class UpstreamHttpClientError(scrapfly.UpstreamHttpError):
106 class UpstreamHttpClientError(UpstreamHttpError):
107    pass

Raised when the upstream website replies with a 4xx client error status code.
class UpstreamHttpServerError(scrapfly.UpstreamHttpClientError):
110 class UpstreamHttpServerError(UpstreamHttpClientError):
111    pass

Raised when the upstream website replies with a 5xx server error status code.
class ApiHttpClientError(scrapfly.HttpError):
114 class ApiHttpClientError(HttpError):
115    pass

Raised when a call to the Scrapfly API itself fails with a client-side (4xx) error.
class ApiHttpServerError(scrapfly.ApiHttpClientError):
126 class ApiHttpServerError(ApiHttpClientError):
127    pass

Raised when the Scrapfly API itself replies with a 5xx server error status code.
class EncoderError(builtins.BaseException):
65 class EncoderError(BaseException):
66
67    def __init__(self, content:str):
68        self.content = content
69        super().__init__()
70
71    def __str__(self) -> str:
72        return self.content

Raised when a response body cannot be encoded or decoded; str() returns the offending content.

EncoderError(content: str)
class ScrapeApiResponse:
111 class ScrapeApiResponse:
112
113    def __init__(self, request: Request, response: Response, scrape_config: ScrapeConfig, api_result: Optional[Dict] = None):
114        self.request = request
115        self.response = response
116        self.scrape_config = scrape_config
117
118        if self.scrape_config.method == 'HEAD':
119            api_result = {
120                'result': {
121                    'request_headers': {},
122                    'status': 'DONE',
123                    'success': 200 <= self.response.status_code < 300,
124                    'response_headers': self.response.headers,
125                    'status_code': self.response.status_code,
126                    'reason': self.response.reason,
127                    'format': 'text',
128                    'content': ''
129                },
130                'context': {},
131                'config': self.scrape_config.__dict__
132            }
133
134            if 'X-Scrapfly-Reject-Code' in self.response.headers:
135                api_result['result']['error'] = {
136                    'code': self.response.headers['X-Scrapfly-Reject-Code'],
137                    'http_code': int(self.response.headers['X-Scrapfly-Reject-Http-Code']),
138                    'message': self.response.headers['X-Scrapfly-Reject-Description'],
139                    'error_id': self.response.headers['X-Scrapfly-Reject-ID'],
140                    'retryable': True if self.response.headers['X-Scrapfly-Reject-Retryable'] == 'yes' else False,
141                    'doc_url': '',
142                    'links': {}
143                }
144
145                if 'X-Scrapfly-Reject-Doc' in self.response.headers:
146                    api_result['result']['error']['doc_url'] = self.response.headers['X-Scrapfly-Reject-Doc']
147                    api_result['result']['error']['links']['Related Docs'] = self.response.headers['X-Scrapfly-Reject-Doc']
148
149        if isinstance(api_result, str):
150            raise HttpError(
151                request=request,
152                response=response,
153                message='Bad gateway',
154                code=502,
155                http_status_code=502,
156                is_retryable=True
157            )
158
159        self.result = self.handle_api_result(api_result=api_result)
160
161    @property
162    def scrape_result(self) -> Dict:
163        return self.result['result']
164
165    @property
166    def config(self) -> Dict:
167        return self.result['config']
168
169    @property
170    def context(self) -> Dict:
171        return self.result['context']
172
173    @property
174    def content(self) -> str:
175        return self.scrape_result['content']
176
177    @property
178    def success(self) -> bool:
179        """
180            /!\ Success means the Scrapfly API replied to the call correctly; the scrape itself can still be unsuccessful if the upstream website replied with an error status code
181        """
182        return 200 <= self.response.status_code <= 299
183
184    @property
185    def scrape_success(self) -> bool:
186        return self.scrape_result['success']
187
188    @property
189    def error(self) -> Optional[Dict]:
190        if self.scrape_success is False:
191            return self.scrape_result['error']
192
193    @property
194    def status_code(self) -> int:
195        """
196            /!\ This is the status code of the Scrapfly API, not of the upstream website
197        """
198        return self.response.status_code
199
200    @property
201    def upstream_status_code(self) -> Optional[int]:
202        if 'status_code' in self.scrape_result:
203            return self.scrape_result['status_code']
204
205        return None
206
207    def prevent_extra_usage(self):
208        if self.remaining_quota == 0:
209            raise ExtraUsageForbidden(
210                message='All Pre Paid Quota Used',
211                code='ERR::ACCOUNT::PREVENT_EXTRA_USAGE',
212                http_status_code=429,
213                is_retryable=False
214            )
215
216    @property
217    def remaining_quota(self) -> Optional[int]:
218        remaining_scrape = self.response.headers.get('X-Scrapfly-Remaining-Scrape')
219
220        if remaining_scrape:
221            remaining_scrape = int(remaining_scrape)
222
223        return remaining_scrape
224
225    @property
226    def cost(self) -> Optional[int]:
227        cost = self.response.headers.get('X-Scrapfly-Api-Cost')
228
229        if cost:
230            cost = int(cost)
231
232        return cost
233
234    @property
235    def duration_ms(self) -> Optional[float]:
236        duration = self.response.headers.get('X-Scrapfly-Response-Time')
237
238        if duration:
239            duration = float(duration)
240
241        return duration
242
243    @property
244    def headers(self) -> CaseInsensitiveDict:
245        return self.response.headers
246
247    def handle_api_result(self, api_result: Dict) -> Optional[FrozenDict]:
248        if self._is_api_error(api_result=api_result) is True:
249            return FrozenDict(api_result)
250
251        try:
252            if isinstance(api_result['config']['headers'], list):
253                api_result['config']['headers'] = {}
254        except TypeError:
255            logger.info(api_result)
256            raise
257
258        with suppress(KeyError):
259            api_result['result']['request_headers'] = CaseInsensitiveDict(api_result['result']['request_headers'])
260            api_result['result']['response_headers'] = CaseInsensitiveDict(api_result['result']['response_headers'])
261
262        if api_result['result']['format'] == 'binary' and api_result['result']['content']:
263            api_result['result']['content'] = BytesIO(b64decode(api_result['result']['content']))
264
265        return FrozenDict(api_result)
266
267    @cached_property
268    def soup(self) -> 'BeautifulSoup':
269        try:
270            from bs4 import BeautifulSoup
271            soup = BeautifulSoup(self.content, "lxml")
272            return soup
273        except ImportError as e:
274            logger.error('You must install scrapfly[parser] to enable this feature')
275
276    @cached_property
277    def selector(self) -> 'Selector':
278        try:
279            from scrapy import Selector
280            return Selector(text=self.content)
281        except ImportError as e:
282            logger.error('You must install scrapfly[scrapy] to enable this feature')
283            raise e
284
285    @property
286    def error_message(self) :
287        if self.error:
288            message = "<-- %s | %s - %s." % (self.response.status_code, self.error['code'], self.error['message'])
289
290            if self.error['links']:
291                message += " Check out the related doc: %s" % list(self.error['links'].values())[0]
292
293            return message
294
295        return '<-- %s - %s %s | Doc: %s' % (self.response.status_code, self.http_status_code, self.code, self.documentation_url)
296
297    def _is_api_error(self, api_result: Dict) -> bool:
298        if self.scrape_config.method == 'HEAD':
299            if 'X-Reject-Reason' in self.response.headers:
300                return True
301            return False
302
303        if api_result is None:
304            return True
305
306        return 'error_id' in api_result
307
308    def raise_for_result(self, raise_on_upstream_error: bool = True):
309
310        try:
311            self.response.raise_for_status()
312        except HTTPError as e:
313            if 'http_code' in self.result:
314                if e.response.status_code >= 500:
315                    raise ApiHttpServerError(
316                        request=e.request,
317                        response=e.response,
318                        message=self.result['message'],
319                        code='',
320                        resource='',
321                        http_status_code=e.response.status_code,
322                        documentation_url=self.result.get('links')
323                    ) from e
324                else:
325                    raise ApiHttpClientError(
326                        request=e.request,
327                        response=e.response,
328                        message=self.result['message'],
329                        code='',
330                        resource='API',
331                        http_status_code=self.result['http_code'],
332                        documentation_url=self.result.get('links')
333                    ) from e
334
335        if self.result['result']['status'] == 'DONE' and self.scrape_success is False:
336            error = ErrorFactory.create(api_response=self)
337
338            if error:
339                if isinstance(error, UpstreamHttpError):
340                    if raise_on_upstream_error is True:
341                        raise error
342                else:
343                    raise error
344
345    def upstream_result_into_response(self, _class=Response) -> Optional[Response]:
346        if _class != Response:
347            raise RuntimeError('only Response from requests package is supported at the moment')
348
349        if self.result is None:
350            return None
351
352        if self.response.status_code != 200:
353            return None
354
355        response = Response()
356        response.status_code = self.scrape_result['status_code']
357        response.reason = self.scrape_result['reason']
358        response._content = self.scrape_result['content'].encode('utf-8') if self.scrape_result['content'] else None
359        response.headers.update(self.scrape_result['response_headers'])
360        response.url = self.scrape_result['url']
361
362        response.request = Request(
363            method=self.config['method'],
364            url=self.config['url'],
365            headers=self.scrape_result['request_headers'],
366            data=self.config['body'] if self.config['body'] else None
367        )
368
369        if 'set-cookie' in response.headers:
370            for raw_cookie in response.headers['set-cookie']:
371                for name, cookie in SimpleCookie(raw_cookie).items():
372                    expires = cookie.get('expires')
373
374                    if expires == '':
375                        expires = None
376
377                    if expires:
378                        try:
379                            expires = parse(expires).timestamp()
380                        except ValueError:
381                            expires = None
382
383                    if type(expires) == str:
384                        if '.' in expires:
385                            expires = float(expires)
386                        else:
387                            expires = int(expires)
388
389                    response.cookies.set_cookie(Cookie(
390                        version=cookie.get('version') if cookie.get('version') else None,
391                        name=name,
392                        value=cookie.value,
393                        path=cookie.get('path', ''),
394                        expires=expires,
395                        comment=cookie.get('comment'),
396                        domain=cookie.get('domain', ''),
397                        secure=cookie.get('secure'),
398                        port=None,
399                        port_specified=False,
400                        domain_specified=cookie.get('domain') is not None and cookie.get('domain') != '',
401                        domain_initial_dot=bool(cookie.get('domain').startswith('.')) if cookie.get('domain') is not None else False,
402                        path_specified=cookie.get('path') != '' and cookie.get('path') is not None,
403                        discard=False,
404                        comment_url=None,
405                        rest={
406                            'httponly': cookie.get('httponly'),
407                            'samesite': cookie.get('samesite'),
408                            'max-age': cookie.get('max-age')
409                        }
410                    ))
411
412        return response
413
414    def sink(self, path: Optional[str] = None, name: Optional[str] = None, file: Optional[Union[TextIO, BytesIO]] = None, content:Optional[Union[str, bytes]]=None):
415        file_content = content or self.scrape_result['content']
416        file_path = None
417        file_extension = None
418
419        if name:
420            name_parts = name.split('.')
421            if len(name_parts) > 1:
422                file_extension = name_parts[-1]
423
424        if not file:
425            if file_extension is None:
426                try:
427                    mime_type = self.scrape_result['response_headers']['content-type']
428                except KeyError:
429                    mime_type = 'application/octet-stream'
430
431                if ';' in mime_type:
432                    mime_type = mime_type.split(';')[0]
433
434                file_extension = '.' + mime_type.split('/')[1]
435
436            if not name:
437                name = self.config['url'].split('/')[-1]
438
439            if name.find(file_extension) == -1:
440                name += file_extension
441
442            file_path = path + '/' + name if path is not None else name
443
444            if file_path == file_extension:
445                url = re.sub(r'(https|http)?://', '', self.config['url']).replace('/', '-')
446
447                if url[-1] == '-':
448                    url = url[:-1]
449
450                url += file_extension
451
452                file_path = url
453
454            file = open(file_path, 'wb')
455
456        if isinstance(file_content, str):
457            file_content = BytesIO(file_content.encode('utf-8'))
458        elif isinstance(file_content, bytes):
459            file_content = BytesIO(file_content)
460
461        file_content.seek(0)
462        with file as f:
463            shutil.copyfileobj(file_content, f, length=131072)
464
465        logger.info('file %s created' % file_path)
ScrapeApiResponse( request: requests.models.Request, response: requests.models.Response, scrape_config: scrapfly.ScrapeConfig, api_result: Optional[Dict] = None)
success: bool

/!\ Success means the Scrapfly API replied to the call correctly; the scrape itself can still be unsuccessful if the upstream website replied with an error status code

status_code: int

/!\ This is the status code of the Scrapfly API, not of the upstream website
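To make the distinction concrete, a short sketch; the values in the comments are illustrative, and raise_on_upstream_error is toggled on the config the same way concurrent_scrape does internally so the call returns instead of raising on an upstream error:

    from scrapfly import ScrapflyClient, ScrapeConfig

    client = ScrapflyClient(key='YOUR-API-KEY')  # placeholder key
    config = ScrapeConfig(url='https://example.com/missing-page')  # placeholder URL
    config.raise_on_upstream_error = False  # return the response instead of raising on upstream errors

    api_response = client.scrape(config)

    print(api_response.status_code)           # status of the Scrapfly API call, e.g. 200
    print(api_response.success)               # True: the API call itself succeeded
    print(api_response.upstream_status_code)  # status of the scraped website, e.g. 404
    print(api_response.scrape_success)        # False when the upstream replied with an error
    if not api_response.scrape_success:
        print(api_response.error)             # error payload reported by the API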

def prevent_extra_usage(self):
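prevent_extra_usage raises once the prepaid quota reported by the API headers is exhausted. A sketch, assuming ExtraUsageForbidden can be imported from scrapfly.errors (the import path is an assumption); key and URL are placeholders:

    from scrapfly import ScrapflyClient, ScrapeConfig
    from scrapfly.errors import ExtraUsageForbidden  # assumed import path

    client = ScrapflyClient(key='YOUR-API-KEY')  # placeholder key
    api_response = client.scrape(ScrapeConfig(url='https://example.com'))  # placeholder URL

    print('remaining quota:', api_response.remaining_quota)  # from X-Scrapfly-Remaining-Scrape
    print('request cost:', api_response.cost)                # from X-Scrapfly-Api-Cost

    try:
        api_response.prevent_extra_usage()
    except ExtraUsageForbidden:
        print('all prepaid quota used, stopping here')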
def handle_api_result(self, api_result: Dict) -> Optional[scrapfly.frozen_dict.FrozenDict]:
def raise_for_result(self, raise_on_upstream_error: bool = True):
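raise_for_result is what maps a failed result onto the exception hierarchy: upstream 4xx/5xx responses surface as UpstreamHttpClientError/UpstreamHttpServerError (unless raise_on_upstream_error is disabled), while failures of the Scrapfly API call itself surface as ApiHttpClientError/ApiHttpServerError. A hedged sketch of handling both cases (placeholder key and URL):

    from scrapfly import (
        ScrapflyClient, ScrapeConfig,
        UpstreamHttpClientError, UpstreamHttpServerError,
        ApiHttpClientError, ApiHttpServerError,
    )

    client = ScrapflyClient(key='YOUR-API-KEY')  # placeholder key

    try:
        api_response = client.scrape(ScrapeConfig(url='https://example.com'))  # placeholder URL
    except (UpstreamHttpClientError, UpstreamHttpServerError) as e:
        # the scraped website answered with an error status code
        print('upstream error:', e.api_response.upstream_status_code)
    except (ApiHttpClientError, ApiHttpServerError) as e:
        # the Scrapfly API itself rejected or failed the call
        print('api error:', e)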
def upstream_result_into_response( self, _class=<class 'requests.models.Response'>) -> Optional[requests.models.Response]:
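The converted object is a plain requests.Response, so it can be fed to existing tooling; a short sketch (placeholder key and URL):

    from scrapfly import ScrapflyClient, ScrapeConfig

    client = ScrapflyClient(key='YOUR-API-KEY')  # placeholder key
    api_response = client.scrape(ScrapeConfig(url='https://example.com'))  # placeholder URL

    response = api_response.upstream_result_into_response()
    if response is not None:
        print(response.status_code, response.reason)
        print(response.headers.get('content-type'))
        for cookie in response.cookies:
            print(cookie.name, cookie.value)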
def sink( self, path: Optional[str] = None, name: Optional[str] = None, file: Union[TextIO, _io.BytesIO, NoneType] = None, content: Union[str, bytes, NoneType] = None):
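sink writes the scraped content to disk, deriving the file name and extension from the URL and the response content-type header when they are not given. A sketch (key, URL and paths are placeholders; the target directory must already exist):

    from scrapfly import ScrapflyClient, ScrapeConfig

    client = ScrapflyClient(key='YOUR-API-KEY')  # placeholder key
    api_response = client.scrape(ScrapeConfig(url='https://example.com/report.pdf'))  # placeholder URL

    # write to an explicit directory and file name
    api_response.sink(path='/tmp/scrapes', name='report.pdf')

    # or let the name and extension be derived from the URL and content-type header
    api_response.sink()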
class ErrorFactory:
166 class ErrorFactory:
167    RESOURCE_TO_ERROR = {
168        ScrapflyError.RESOURCE_SCRAPE: ScrapflyScrapeError,
169        ScrapflyError.RESOURCE_WEBHOOK: ScrapflyWebhookError,
170        ScrapflyError.RESOURCE_PROXY: ScrapflyProxyError,
171        ScrapflyError.RESOURCE_SCHEDULE: ScrapflyScheduleError,
172        ScrapflyError.RESOURCE_ASP: ScrapflyAspError,
173        ScrapflyError.RESOURCE_SESSION: ScrapflySessionError
174    }
175
176    # Notable HTTP errors have their own class for convenience
177    HTTP_STATUS_TO_ERROR = {
178        401: BadApiKeyError,
179        429: TooManyRequest
180    }
181
182    @staticmethod
183    def _get_resource(code: str) -> Optional[str]:
184
185        if isinstance(code, str) and '::' in code:
186            _, resource, _ = code.split('::')
187            return resource
188
189        return None
190
191    @staticmethod
192    def create(api_response: 'ScrapeApiResponse'):
193        is_retryable = False
194        kind = ScrapflyError.KIND_HTTP_BAD_RESPONSE if api_response.success is False else ScrapflyError.KIND_SCRAPFLY_ERROR
195        http_code = api_response.status_code
196        retry_delay = 5
197        retry_times = 3
198        description = None
199        error_url = 'https://scrapfly.io/docs/scrape-api/errors#api'
200        code = api_response.error['code']
201
202        if code == 'ERR::SCRAPE::BAD_UPSTREAM_RESPONSE':
203            http_code = api_response.scrape_result['status_code']
204
205        if 'description' in api_response.error:
206            description = api_response.error['description']
207
208        message = '%s %s %s' % (str(http_code), code, api_response.error['message'])
209
210        if 'doc_url' in api_response.error:
211            error_url = api_response.error['doc_url']
212
213        if 'retryable' in api_response.error:
214            is_retryable = api_response.error['retryable']
215
216        resource = ErrorFactory._get_resource(code=code)
217
218        if is_retryable is True:
219            if 'Retry-After' in api_response.headers:
220                retry_delay = int(api_response.headers['Retry-After'])
221
222        message = '%s: %s' % (message, description) if description else message
223
224        if retry_delay is not None and is_retryable is True:
225            message = '%s. Retry delay : %s seconds' % (message, str(retry_delay))
226
227        args = {
228            'message': message,
229            'code': code,
230            'http_status_code': http_code,
231            'is_retryable': is_retryable,
232            'api_response': api_response,
233            'resource': resource,
234            'retry_delay': retry_delay,
235            'retry_times': retry_times,
236            'documentation_url': error_url,
237            'request': api_response.request,
238            'response': api_response.response
239        }
240
241        if kind == ScrapflyError.KIND_HTTP_BAD_RESPONSE:
242            if http_code >= 500:
243                return ApiHttpServerError(**args)
244
245            if http_code in ErrorFactory.HTTP_STATUS_TO_ERROR:
246                return ErrorFactory.HTTP_STATUS_TO_ERROR[http_code](**args)
247
248            if resource in ErrorFactory.RESOURCE_TO_ERROR:
249                return ErrorFactory.RESOURCE_TO_ERROR[resource](**args)
250
251            return ApiHttpClientError(**args)
252
253        elif kind == ScrapflyError.KIND_SCRAPFLY_ERROR:
254            if code == 'ERR::SCRAPE::BAD_UPSTREAM_RESPONSE':
255                if http_code >= 500:
256                    return UpstreamHttpServerError(**args)
257
258                if http_code >= 400:
259                    return UpstreamHttpClientError(**args)
260
261            if resource in ErrorFactory.RESOURCE_TO_ERROR:
262                return ErrorFactory.RESOURCE_TO_ERROR[resource](**args)
263
264            return ScrapflyError(**args)
@staticmethod
def create(api_response: 'ScrapeApiResponse'):
class HttpError(scrapfly.ScrapflyError):
79 class HttpError(ScrapflyError):
80
81    def __init__(self, request:Request, response:Optional[Response]=None, **kwargs):
82        self.request = request
83        self.response = response
84        super().__init__(**kwargs)
85
86    def __str__(self) -> str:
87
88        if isinstance(self, UpstreamHttpError):
89            text = "%s -- %s " % (self.api_response.scrape_result['status_code'], self.api_response.scrape_result['reason'])
90        else:
91            text = "%s -- %s " % (self.response.status_code, self.response.reason)
92
93            if isinstance(self, (ApiHttpClientError, ApiHttpServerError)):
94                try:
95                    text += self.response.content.decode('utf-8')
96                except UnicodeError:
97                    text += str(self.response.content)
98
99        return text

Base class for HTTP-level errors; it keeps a reference to the originating request and, when available, the response.

HttpError( request: requests.models.Request, response: Optional[requests.models.Response] = None, **kwargs)
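Because every HttpError keeps the originating request/response pair, handlers can log the raw HTTP details; a minimal sketch (placeholder key and URL):

    from scrapfly import ScrapflyClient, ScrapeConfig, HttpError

    client = ScrapflyClient(key='YOUR-API-KEY')  # placeholder key

    try:
        client.scrape(ScrapeConfig(url='https://example.com'))  # placeholder URL
    except HttpError as e:
        # the original requests objects are attached to the error
        if e.response is not None:
            print(e.response.status_code, e.response.reason)
        print(e.code, e.message)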
class ScrapflyClient:
 43 class ScrapflyClient:
 44
 45    HOST = 'https://api.scrapfly.io'
 46    DEFAULT_CONNECT_TIMEOUT = 30
 47    DEFAULT_READ_TIMEOUT = 160 # 155 real
 48
 49    host:str
 50    key:str
 51    max_concurrency:int
 52    verify:bool
 53    debug:bool
 54    distributed_mode:bool
 55    connect_timeout:int
 56    read_timeout:int
 57    brotli: bool
 58    reporter:Reporter
 59    version:str
 60
 61    CONCURRENCY_AUTO = 'auto' # retrieve the allowed concurrency from your account
 62
 63    def __init__(
 64        self,
 65        key: str,
 66        host: Optional[str] = HOST,
 67        verify=True,
 68        debug: bool = False,
 69        max_concurrency:int=1,
 70        connect_timeout:int = DEFAULT_CONNECT_TIMEOUT,
 71        read_timeout:int = DEFAULT_READ_TIMEOUT,
 72        reporter:Optional[Callable]=None,
 73        **kwargs
 74    ):
 75        if host[-1] == '/':  # remove last '/' if exists
 76            host = host[:-1]
 77
 78        if 'distributed_mode' in kwargs:
 79            warnings.warn("distributed mode is deprecated and will be remove the next version -"
 80              " user should handle themself the session name based on the concurrency",
 81              DeprecationWarning,
 82              stacklevel=2
 83            )
 84
 85        if 'brotli' in kwargs:
 86            warnings.warn("brotli arg is deprecated and will be remove the next version - "
 87                "brotli is disabled by default",
 88                DeprecationWarning,
 89                stacklevel=2
 90            )
 91
 92        self.version = __version__
 93        self.host = host
 94        self.key = key
 95        self.verify = verify
 96        self.debug = debug
 97        self.connect_timeout = connect_timeout
 98        self.read_timeout = read_timeout
 99        self.max_concurrency = max_concurrency
100        self.body_handler = ResponseBodyHandler(use_brotli=False)
101        self.async_executor = ThreadPoolExecutor()
102        self.http_session = None
103
104        if not self.verify and not self.HOST.endswith('.local'):
105            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
106
107        if self.debug is True:
108            http.client.HTTPConnection.debuglevel = 5
109
110        if reporter is None:
111            from .reporter import NoopReporter
112
113            reporter = NoopReporter()
114
115        self.reporter = Reporter(reporter)
116
117    @property
118    def ua(self) -> str:
119        return 'ScrapflySDK/%s (Python %s, %s, %s)' % (
120            self.version,
121            platform.python_version(),
122            platform.uname().system,
123            platform.uname().machine
124        )
125
126    @cached_property
127    def _http_handler(self):
128        return partial(self.http_session.request if self.http_session else requests.request)
129
130    @property
131    def http(self):
132        return self._http_handler
133
134    def _scrape_request(self, scrape_config:ScrapeConfig):
135        return {
136            'method': scrape_config.method,
137            'url': self.host + '/scrape',
138            'data': scrape_config.body,
139            'verify': self.verify,
140            'timeout': (self.connect_timeout, self.read_timeout),
141            'headers': {
142                'content-type': scrape_config.headers['content-type'] if scrape_config.method in ['POST', 'PUT', 'PATCH'] else self.body_handler.content_type,
143                'accept-encoding': self.body_handler.content_encoding,
144                'accept': self.body_handler.accept,
145                'user-agent': self.ua
146            },
147            'params': scrape_config.to_api_params(key=self.key)
148        }
149
150    def account(self) -> Union[str, Dict]:
151        response = self._http_handler(
152            method='GET',
153            url=self.host + '/account',
154            params={'key': self.key},
155            verify=self.verify,
156            headers={
157                'accept-encoding': self.body_handler.content_encoding,
158                'accept': self.body_handler.accept,
159                'user-agent': self.ua
160            },
161        )
162
163        response.raise_for_status()
164
165        if self.body_handler.support(response.headers):
166            return self.body_handler(response.content)
167
168        return response.content.decode('utf-8')
169
170    def resilient_scrape(
171        self,
172        scrape_config:ScrapeConfig,
173        retry_on_errors:Set[Exception]={ScrapflyError},
174        retry_on_status_code:Optional[List[int]]=None,
175        tries: int = 5,
176        delay: int = 20,
177    ) -> ScrapeApiResponse:
178        assert retry_on_errors is not None, 'Retry on error is None'
179        assert isinstance(retry_on_errors, set), 'retry_on_errors is not a set()'
180
181        @backoff.on_exception(backoff.expo, exception=tuple(retry_on_errors), max_tries=tries, max_time=delay)
182        def inner() -> ScrapeApiResponse:
183
184            try:
185                return self.scrape(scrape_config=scrape_config)
186            except (UpstreamHttpClientError, UpstreamHttpServerError) as e:
187                if retry_on_status_code is not None and e.api_response:
188                    if e.api_response.upstream_status_code in retry_on_status_code:
189                        raise e
190                    else:
191                        return e.api_response
192
193                raise e
194
195        return inner()
196
197    def open(self):
198        if self.http_session is None:
199            self.http_session = Session()
200            self.http_session.verify = self.verify
201            self.http_session.timeout = (self.connect_timeout, self.read_timeout)
202            self.http_session.params['key'] = self.key
203            self.http_session.headers['accept-encoding'] = self.body_handler.content_encoding
204            self.http_session.headers['accept'] = self.body_handler.accept
205            self.http_session.headers['user-agent'] = self.ua
206
207    def close(self):
208        self.http_session.close()
209        self.http_session = None
210
211    def __enter__(self) -> 'ScrapflyClient':
212        self.open()
213        return self
214
215    def __exit__(self, exc_type, exc_val, exc_tb):
216        self.close()
217
218    async def async_scrape(self, scrape_config:ScrapeConfig, loop:Optional[AbstractEventLoop]=None) -> ScrapeApiResponse:
219        if loop is None:
220            loop = asyncio.get_running_loop()
221
222        return await loop.run_in_executor(self.async_executor, self.scrape, scrape_config)
223
224    async def concurrent_scrape(self, scrape_configs:List[ScrapeConfig], concurrency:Optional[int]=None):
225        if concurrency is None:
226            concurrency = self.max_concurrency
227        elif concurrency == self.CONCURRENCY_AUTO:
228            concurrency = self.account()['subscription']['max_concurrency']
229
230        loop = asyncio.get_running_loop()
231        processing_tasks = []
232        results = []
233        processed_tasks = 0
234        expected_tasks = len(scrape_configs)
235
236        def scrape_done_callback(task:Task):
237            nonlocal processed_tasks
238
239            try:
240                if task.cancelled() is True:
241                    return
242
243                error = task.exception()
244
245                if error is not None:
246                    results.append(error)
247                else:
248                    results.append(task.result())
249            finally:
250                processing_tasks.remove(task)
251                processed_tasks += 1
252
253        while scrape_configs or results or processing_tasks:
254            logger.info("Scrape %d/%d - %d running" % (processed_tasks, expected_tasks, len(processing_tasks)))
255
256            if scrape_configs:
257                if len(processing_tasks) < concurrency:
258                    # @todo handle backpressure
259                    for _ in range(0, concurrency - len(processing_tasks)):
260                        try:
261                            scrape_config = scrape_configs.pop()
262                        except:
263                            break
264
265                        scrape_config.raise_on_upstream_error = False
266                        task = loop.create_task(self.async_scrape(scrape_config=scrape_config, loop=loop))
267                        processing_tasks.append(task)
268                        task.add_done_callback(scrape_done_callback)
269
270            for _ in results:
271                result = results.pop()
272                yield result
273
274            await asyncio.sleep(.5)
275
276        logger.debug("Scrape %d/%d - %d running" % (processed_tasks, expected_tasks, len(processing_tasks)))
277
278    @backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
279    def scrape(self, scrape_config:ScrapeConfig) -> ScrapeApiResponse:
280
281        try:
282            logger.debug('--> %s Scraping %s' % (scrape_config.method, scrape_config.url))
283            request_data = self._scrape_request(scrape_config=scrape_config)
284            response = self._http_handler(**request_data)
285            scrape_api_response = self._handle_response(response=response, scrape_config=scrape_config)
286
287            self.reporter.report(scrape_api_response=scrape_api_response)
288
289            return scrape_api_response
290        except BaseException as e:
291            self.reporter.report(error=e)
292            raise e
293
294    def _handle_response(self, response:Response, scrape_config:ScrapeConfig) -> ScrapeApiResponse:
295        try:
296            api_response = self._handle_api_response(
297                response=response,
298                scrape_config=scrape_config,
299                raise_on_upstream_error=scrape_config.raise_on_upstream_error
300            )
301
302            if scrape_config.method == 'HEAD':
303                logger.debug('<-- [%s %s] %s | %ss' % (
304                    api_response.response.status_code,
305                    api_response.response.reason,
306                    api_response.response.request.url,
307                    0
308                ))
309            else:
310                logger.debug('<-- [%s %s] %s | %ss' % (
311                    api_response.result['result']['status_code'],
312                    api_response.result['result']['reason'],
313                    api_response.result['config']['url'],
314                    api_response.result['result']['duration'])
315                )
316
317                logger.debug('Log url: %s' % api_response.result['result']['log_url'])
318
319            return api_response
320        except UpstreamHttpError as e:
321            logger.critical(e.api_response.error_message)
322            raise
323        except ScrapflyScrapeError as e:
324            if e.api_response is not None:
325                logger.critical(e.api_response.error_message)
326            else:
327                logger.critical(e.message)
328            raise
329        except HttpError as e:
330            if e.api_response is not None:
331                logger.critical(e.api_response.error_message)
332            else:
333                logger.critical(e.message)
334            raise
335        except ScrapflyError as e:
336            logger.critical('<-- %s | Docs: %s' % (str(e), e.documentation_url))
337            raise
338
339    def save_screenshot(self, api_response:ScrapeApiResponse, name:str, path:Optional[str]=None):
340
341        if not api_response.scrape_result['screenshots']:
342            raise RuntimeError('Screenshot %s does not exist' % name)
343
344        try:
345            api_response.scrape_result['screenshots'][name]
346        except KeyError:
347            raise RuntimeError('Screenshot %s does not exist' % name)
348
349        screenshot_response = self._http_handler(
350            method='GET',
351            url=api_response.scrape_result['screenshots'][name]['url'],
352            params={'key': self.key},
353            verify=self.verify
354        )
355
356        screenshot_response.raise_for_status()
357
358        if not name.endswith('.jpg'):
359            name += '.jpg'
360
361        api_response.sink(path=path, name=name, content=screenshot_response.content)
362
363    def screenshot(self, url:str, path:Optional[str]=None, name:Optional[str]=None) -> str:
364        # for advanced configuration, take screenshots via the scrape method with ScrapeConfig
365        api_response = self.scrape(scrape_config=ScrapeConfig(
366            url=url,
367            render_js=True,
368            screenshots={'main': 'fullpage'}
369        ))
370
371        name = name or 'main.jpg'
372
373        if not name.endswith('.jpg'):
374            name += '.jpg'
375
376        response = self._http_handler(
377            method='GET',
378            url=api_response.scrape_result['screenshots']['main']['url'],
379            params={'key': self.key}
380        )
381
382        response.raise_for_status()
383
384        return self.sink(api_response, path=path, name=name, content=response.content)
385
386    def sink(self, api_response:ScrapeApiResponse, content:Optional[Union[str, bytes]]=None, path: Optional[str] = None, name: Optional[str] = None, file: Optional[Union[TextIO, BytesIO]] = None) -> str:
387        scrape_result = api_response.result['result']
388        scrape_config = api_response.result['config']
389
390        file_content = content or scrape_result['content']
391        file_path = None
392        file_extension = None
393
394        if name:
395            name_parts = name.split('.')
396            if len(name_parts) > 1:
397                file_extension = name_parts[-1]
398
399        if not file:
400            if file_extension is None:
401                try:
402                    mime_type = scrape_result['response_headers']['content-type']
403                except KeyError:
404                    mime_type = 'application/octet-stream'
405
406                if ';' in mime_type:
407                    mime_type = mime_type.split(';')[0]
408
409                file_extension = '.' + mime_type.split('/')[1]
410
411            if not name:
412                name = scrape_config['url'].split('/')[-1]
413
414            if name.find(file_extension) == -1:
415                name += file_extension
416
417            file_path = path + '/' + name if path else name
418
419            if file_path == file_extension:
420                url = re.sub(r'(https|http)?://', '', api_response.config['url']).replace('/', '-')
421
422                if url[-1] == '-':
423                    url = url[:-1]
424
425                url += file_extension
426
427                file_path = url
428
429            file = open(file_path, 'wb')
430
431        if isinstance(file_content, str):
432            file_content = BytesIO(file_content.encode('utf-8'))
433        elif isinstance(file_content, bytes):
434            file_content = BytesIO(file_content)
435
436        file_content.seek(0)
437        with file as f:
438            shutil.copyfileobj(file_content, f, length=131072)
439
440        logger.info('file %s created' % file_path)
441        return file_path
442
443    def _handle_api_response(
444        self,
445        response: Response,
446        scrape_config:ScrapeConfig,
447        raise_on_upstream_error: Optional[bool] = True
448    ) -> ScrapeApiResponse:
449
450        if scrape_config.method == 'HEAD':
451            body = None
452        else:
453            if self.body_handler.support(headers=response.headers):
454                body = self.body_handler(response.content)
455            else:
456                body = response.content.decode('utf-8')
457
458        api_response:ScrapeApiResponse = ScrapeApiResponse(
459            response=response,
460            request=response.request,
461            api_result=body,
462            scrape_config=scrape_config
463        )
464
465        api_response.raise_for_result(raise_on_upstream_error=raise_on_upstream_error)
466
467        return api_response
ScrapflyClient( key: str, host: Optional[str] = 'https://api.scrapfly.io', verify=True, debug: bool = False, max_concurrency: int = 1, connect_timeout: int = 30, read_timeout: int = 160, reporter: Optional[Callable] = None, **kwargs)
 63    def __init__(
 64        self,
 65        key: str,
 66        host: Optional[str] = HOST,
 67        verify=True,
 68        debug: bool = False,
 69        max_concurrency:int=1,
 70        connect_timeout:int = DEFAULT_CONNECT_TIMEOUT,
 71        read_timeout:int = DEFAULT_READ_TIMEOUT,
 72        reporter:Optional[Callable]=None,
 73        **kwargs
 74    ):
 75        if host[-1] == '/':  # remove last '/' if exists
 76            host = host[:-1]
 77
 78        if 'distributed_mode' in kwargs:
 79            warnings.warn("distributed mode is deprecated and will be removed in the next version -"
 80              " users should handle the session name themselves based on the concurrency",
 81              DeprecationWarning,
 82              stacklevel=2
 83            )
 84
 85        if 'brotli' in kwargs:
 86            warnings.warn("brotli arg is deprecated and will be removed in the next version - "
 87                "brotli is disabled by default",
 88                DeprecationWarning,
 89                stacklevel=2
 90            )
 91
 92        self.version = __version__
 93        self.host = host
 94        self.key = key
 95        self.verify = verify
 96        self.debug = debug
 97        self.connect_timeout = connect_timeout
 98        self.read_timeout = read_timeout
 99        self.max_concurrency = max_concurrency
100        self.body_handler = ResponseBodyHandler(use_brotli=False)
101        self.async_executor = ThreadPoolExecutor()
102        self.http_session = None
103
104        if not self.verify and not self.HOST.endswith('.local'):
105            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
106
107        if self.debug is True:
108            http.client.HTTPConnection.debuglevel = 5
109
110        if reporter is None:
111            from .reporter import NoopReporter
112
113            reporter = NoopReporter()
114
115        self.reporter = Reporter(reporter)
def account(self) -> Union[str, Dict]:
150    def account(self) -> Union[str, Dict]:
151        response = self._http_handler(
152            method='GET',
153            url=self.host + '/account',
154            params={'key': self.key},
155            verify=self.verify,
156            headers={
157                'accept-encoding': self.body_handler.content_encoding,
158                'accept': self.body_handler.accept,
159                'user-agent': self.ua
160            },
161        )
162
163        response.raise_for_status()
164
165        if self.body_handler.support(response.headers):
166            return self.body_handler(response.content)
167
168        return response.content.decode('utf-8')
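
A minimal usage sketch for account(): with a valid API key it returns the decoded account payload (a dict when the response is JSON/msgpack, otherwise the raw text). The '__API_KEY__' placeholder below is illustrative only.

from scrapfly import ScrapflyClient

client = ScrapflyClient(key='__API_KEY__')  # placeholder key, not a real credential
account_info = client.account()

if isinstance(account_info, dict):
    # 'subscription' is the key concurrent_scrape() relies on for max_concurrency
    print(account_info['subscription'])
else:
    print(account_info)
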
def resilient_scrape( self, scrape_config: scrapfly.ScrapeConfig, retry_on_errors: Set[Exception] = {<class 'scrapfly.ScrapflyError'>}, retry_on_status_code: Optional[List[int]] = None, tries: int = 5, delay: int = 20) -> scrapfly.ScrapeApiResponse:
170    def resilient_scrape(
171        self,
172        scrape_config:ScrapeConfig,
173        retry_on_errors:Set[Exception]={ScrapflyError},
174        retry_on_status_code:Optional[List[int]]=None,
175        tries: int = 5,
176        delay: int = 20,
177    ) -> ScrapeApiResponse:
178        assert retry_on_errors is not None, 'Retry on error is None'
179        assert isinstance(retry_on_errors, set), 'retry_on_errors is not a set()'
180
181        @backoff.on_exception(backoff.expo, exception=tuple(retry_on_errors), max_tries=tries, max_time=delay)
182        def inner() -> ScrapeApiResponse:
183
184            try:
185                return self.scrape(scrape_config=scrape_config)
186            except (UpstreamHttpClientError, UpstreamHttpServerError) as e:
187                if retry_on_status_code is not None and e.api_response:
188                    if e.api_response.upstream_status_code in retry_on_status_code:
189                        raise e
190                    else:
191                        return e.api_response
192
193                raise e
194
195        return inner()
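
A hedged usage sketch for resilient_scrape(): backoff retries the inner scrape on any exception class listed in retry_on_errors, while retry_on_status_code re-raises (and therefore retries) upstream errors whose upstream_status_code is in the list and returns the error's response otherwise.

from scrapfly import ScrapflyClient, ScrapeConfig, ScrapflyError

client = ScrapflyClient(key='__API_KEY__')  # placeholder key

api_response = client.resilient_scrape(
    scrape_config=ScrapeConfig(url='https://httpbin.org/html'),
    retry_on_errors={ScrapflyError},     # retried with exponential backoff
    retry_on_status_code=[502, 503],     # upstream 502/503 responses are retried too
    tries=5,
    delay=20
)
print(api_response.scrape_result is not None)
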
def open(self):
197    def open(self):
198        if self.http_session is None:
199            self.http_session = Session()
200            self.http_session.verify = self.verify
201            self.http_session.timeout = (self.connect_timeout, self.read_timeout)
202            self.http_session.params['key'] = self.key
203            self.http_session.headers['accept-encoding'] = self.body_handler.content_encoding
204            self.http_session.headers['accept'] = self.body_handler.accept
205            self.http_session.headers['user-agent'] = self.ua
def close(self):
207    def close(self):
208        self.http_session.close()
209        self.http_session = None
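
open() lazily builds a requests.Session preloaded with the API key, accept/accept-encoding and user-agent headers, and close() tears it down. Below is a sketch of explicit lifecycle management; whether individual calls actually reuse this session depends on _http_handler, which is outside this excerpt.

from scrapfly import ScrapflyClient, ScrapeConfig

client = ScrapflyClient(key='__API_KEY__')  # placeholder key
client.open()  # create the shared requests.Session
try:
    for url in ('https://httpbin.org/html', 'https://httpbin.org/json'):
        api_response = client.scrape(scrape_config=ScrapeConfig(url=url))
        print(url, api_response.scrape_result is not None)
finally:
    client.close()  # dispose of the session
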
async def async_scrape( self, scrape_config: scrapfly.ScrapeConfig, loop: Optional[asyncio.events.AbstractEventLoop] = None) -> scrapfly.ScrapeApiResponse:
218    async def async_scrape(self, scrape_config:ScrapeConfig, loop:Optional[AbstractEventLoop]=None) -> ScrapeApiResponse:
219        if loop is None:
220            loop = asyncio.get_running_loop()
221
222        return await loop.run_in_executor(self.async_executor, self.scrape, scrape_config)
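
async_scrape() simply offloads the blocking scrape() call to the client's ThreadPoolExecutor so it can be awaited; a minimal sketch with a placeholder key:

import asyncio
from scrapfly import ScrapflyClient, ScrapeConfig

async def main():
    client = ScrapflyClient(key='__API_KEY__')  # placeholder key
    # runs client.scrape() in a worker thread so the event loop is not blocked
    api_response = await client.async_scrape(ScrapeConfig(url='https://httpbin.org/html'))
    print(api_response.scrape_result is not None)

asyncio.run(main())
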
async def concurrent_scrape( self, scrape_configs: List[scrapfly.ScrapeConfig], concurrency: Optional[int] = None):
224    async def concurrent_scrape(self, scrape_configs:List[ScrapeConfig], concurrency:Optional[int]=None):
225        if concurrency is None:
226            concurrency = self.max_concurrency
227        elif concurrency == self.CONCURRENCY_AUTO:
228            concurrency = self.account()['subscription']['max_concurrency']
229
230        loop = asyncio.get_running_loop()
231        processing_tasks = []
232        results = []
233        processed_tasks = 0
234        expected_tasks = len(scrape_configs)
235
236        def scrape_done_callback(task:Task):
237            nonlocal processed_tasks
238
239            try:
240                if task.cancelled() is True:
241                    return
242
243                error = task.exception()
244
245                if error is not None:
246                    results.append(error)
247                else:
248                    results.append(task.result())
249            finally:
250                processing_tasks.remove(task)
251                processed_tasks += 1
252
253        while scrape_configs or results or processing_tasks:
254            logger.info("Scrape %d/%d - %d running" % (processed_tasks, expected_tasks, len(processing_tasks)))
255
256            if scrape_configs:
257                if len(processing_tasks) < concurrency:
258                    # @todo handle backpressure
259                    for _ in range(0, concurrency - len(processing_tasks)):
260                        try:
261                            scrape_config = scrape_configs.pop()
262                        except IndexError:  # no scrape_configs left to schedule
263                            break
264
265                        scrape_config.raise_on_upstream_error = False
266                        task = loop.create_task(self.async_scrape(scrape_config=scrape_config, loop=loop))
267                        processing_tasks.append(task)
268                        task.add_done_callback(scrape_done_callback)
269
270            for _ in results:
271                result = results.pop()
272                yield result
273
274            await asyncio.sleep(.5)
275
276        logger.debug("Scrape %d/%d - %d running" % (processed_tasks, expected_tasks, len(processing_tasks)))
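
concurrent_scrape() is an async generator: it keeps at most `concurrency` tasks in flight and yields each result as soon as its task finishes, in completion order. Failed tasks yield their exception instead of raising it, so the consumer has to check the type. A sketch with a placeholder key:

import asyncio
from scrapfly import ScrapflyClient, ScrapeConfig

async def main():
    client = ScrapflyClient(key='__API_KEY__', max_concurrency=2)  # placeholder key
    configs = [ScrapeConfig(url='https://httpbin.org/html') for _ in range(5)]

    async for result in client.concurrent_scrape(scrape_configs=configs):
        if isinstance(result, BaseException):
            print('failed:', result)          # exceptions are yielded, not raised
        else:
            print('ok:', result.scrape_result is not None)

asyncio.run(main())
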
@backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
def scrape( self, scrape_config: scrapfly.ScrapeConfig) -> scrapfly.ScrapeApiResponse:
278    @backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
279    def scrape(self, scrape_config:ScrapeConfig) -> ScrapeApiResponse:
280
281        try:
282            logger.debug('--> %s Scraping %s' % (scrape_config.method, scrape_config.url))
283            request_data = self._scrape_request(scrape_config=scrape_config)
284            response = self._http_handler(**request_data)
285            scrape_api_response = self._handle_response(response=response, scrape_config=scrape_config)
286
287            self.reporter.report(scrape_api_response=scrape_api_response)
288
289            return scrape_api_response
290        except BaseException as e:
291            self.reporter.report(error=e)
292            raise e
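
scrape() is the core entry point; the backoff decorator retries transient NetworkError up to five times and every call is reported through the configured reporter. A minimal call, assuming a valid key:

from scrapfly import ScrapflyClient, ScrapeConfig

client = ScrapflyClient(key='__API_KEY__')  # placeholder key
api_response = client.scrape(scrape_config=ScrapeConfig(
    url='https://httpbin.org/html',
    country='us',      # optional geotargeting
    asp=True           # enable anti-scraping protection bypass
))

# payload layout assumed from sink(): result['result'] holds the scrape output
print(api_response.result['result']['content'][:200])
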
def save_screenshot( self, api_response: scrapfly.ScrapeApiResponse, name: str, path: Optional[str] = None):
339    def save_screenshot(self, api_response:ScrapeApiResponse, name:str, path:Optional[str]=None):
340
341        if not api_response.scrape_result['screenshots']:
342            raise RuntimeError('Screenshot %s does not exist' % name)
343
344        try:
345            api_response.scrape_result['screenshots'][name]
346        except KeyError:
347            raise RuntimeError('Screenshot %s does not exist' % name)
348
349        screenshot_response = self._http_handler(
350            method='GET',
351            url=api_response.scrape_result['screenshots'][name]['url'],
352            params={'key': self.key},
353            verify=self.verify
354        )
355
356        screenshot_response.raise_for_status()
357
358        if not name.endswith('.jpg'):
359            name += '.jpg'
360
361        api_response.sink(path=path, name=name, content=screenshot_response.content)
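
save_screenshot() downloads a screenshot that was requested through ScrapeConfig(screenshots=...) and writes it to disk via ScrapeApiResponse.sink (a .jpg suffix is appended if missing). A sketch with a placeholder key:

from scrapfly import ScrapflyClient, ScrapeConfig

client = ScrapflyClient(key='__API_KEY__')  # placeholder key
api_response = client.scrape(scrape_config=ScrapeConfig(
    url='https://httpbin.org/html',
    render_js=True,                    # screenshots require JavaScript rendering
    screenshots={'main': 'fullpage'}   # screenshot name -> area to capture
))

client.save_screenshot(api_response, name='main')  # writes main.jpg via ScrapeApiResponse.sink
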
def screenshot( self, url: str, path: Optional[str] = None, name: Optional[str] = None) -> str:
363    def screenshot(self, url:str, path:Optional[str]=None, name:Optional[str]=None) -> str:
364        # for advanced configuration, take screenshots via the scrape method with ScrapeConfig
365        api_response = self.scrape(scrape_config=ScrapeConfig(
366            url=url,
367            render_js=True,
368            screenshots={'main': 'fullpage'}
369        ))
370
371        name = name or 'main.jpg'
372
373        if not name.endswith('.jpg'):
374            name += '.jpg'
375
376        response = self._http_handler(
377            method='GET',
378            url=api_response.scrape_result['screenshots']['main']['url'],
379            params={'key': self.key}
380        )
381
382        response.raise_for_status()
383
384        return self.sink(api_response, path=path, name=name, content=response.content)
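
screenshot() is the convenience wrapper around the above: it scrapes the URL with render_js enabled and a single 'fullpage' screenshot, downloads the image and returns the path the JPEG was written to. Sketch:

from scrapfly import ScrapflyClient

client = ScrapflyClient(key='__API_KEY__')  # placeholder key

# writes main.jpg in the working directory (or under `path` when given) and returns the path
file_path = client.screenshot(url='https://httpbin.org/html', name='main')
print(file_path)
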
def sink( self, api_response: scrapfly.ScrapeApiResponse, content: Union[str, bytes, NoneType] = None, path: Optional[str] = None, name: Optional[str] = None, file: Union[TextIO, _io.BytesIO, NoneType] = None) -> str:
386    def sink(self, api_response:ScrapeApiResponse, content:Optional[Union[str, bytes]]=None, path: Optional[str] = None, name: Optional[str] = None, file: Optional[Union[TextIO, BytesIO]] = None) -> str:
387        scrape_result = api_response.result['result']
388        scrape_config = api_response.result['config']
389
390        file_content = content or scrape_result['content']
391        file_path = None
392        file_extension = None
393
394        if name:
395            name_parts = name.split('.')
396            if len(name_parts) > 1:
397                file_extension = name_parts[-1]
398
399        if not file:
400            if file_extension is None:
401                try:
402                    mime_type = scrape_result['response_headers']['content-type']
403                except KeyError:
404                    mime_type = 'application/octet-stream'
405
406                if ';' in mime_type:
407                    mime_type = mime_type.split(';')[0]
408
409                file_extension = '.' + mime_type.split('/')[1]
410
411            if not name:
412                name = scrape_config['url'].split('/')[-1]
413
414            if name.find(file_extension) == -1:
415                name += file_extension
416
417            file_path = path + '/' + name if path else name
418
419            if file_path == file_extension:
420                url = re.sub(r'(https|http)?://', '', api_response.config['url']).replace('/', '-')
421
422                if url[-1] == '-':
423                    url = url[:-1]
424
425                url += file_extension
426
427                file_path = url
428
429            file = open(file_path, 'wb')
430
431        if isinstance(file_content, str):
432            file_content = BytesIO(file_content.encode('utf-8'))
433        elif isinstance(file_content, bytes):
434            file_content = BytesIO(file_content)
435
436        file_content.seek(0)
437        with file as f:
438            shutil.copyfileobj(file_content, f, length=131072)
439
440        logger.info('file %s created' % file_path)
441        return file_path
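
sink() writes a scrape result (or an explicit content payload) to disk. When no name is given it derives one from the scraped URL and picks the extension from the upstream content-type header. A small sketch of both modes; the resulting file names are assumptions based on the code above:

from scrapfly import ScrapflyClient, ScrapeConfig

client = ScrapflyClient(key='__API_KEY__')  # placeholder key
api_response = client.scrape(scrape_config=ScrapeConfig(url='https://httpbin.org/html'))

# 1. let sink() derive name and extension from the URL and content-type header
auto_path = client.sink(api_response)

# 2. explicit file name; the extension is taken from the name itself
named_path = client.sink(api_response, name='page.html')

print(auto_path, named_path)
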
class ResponseBodyHandler:
 59class ResponseBodyHandler:
 60
 61    SUPPORTED_COMPRESSION = ['gzip', 'deflate']
 62    SUPPORTED_CONTENT_TYPES = ['application/msgpack', 'application/json']
 63
 64    class JSONDateTimeDecoder(JSONDecoder):
 65        def __init__(self, *args, **kargs):
 66            JSONDecoder.__init__(self, *args, object_hook=_date_parser, **kargs)
 67
 68    # brotli underperforms at the same gzip level and higher levels destroy the cpu, so
 69    # the trade-off is not worth it for most usage
 70    def __init__(self, use_brotli:bool=False):
 71        if use_brotli is True and 'br' not in self.SUPPORTED_COMPRESSION:
 72            try:
 73                try:
 74                    import brotlicffi as brotli
 75                    self.SUPPORTED_COMPRESSION.insert(0, 'br')
 76                except ImportError:
 77                    import brotli
 78                    self.SUPPORTED_COMPRESSION.insert(0, 'br')
 79            except ImportError:
 80                pass
 81
 82        self.content_encoding = ', '.join(self.SUPPORTED_COMPRESSION)
 83
 84        try:  # automatically use msgpack if available https://msgpack.org/
 85            import msgpack
 86            self.accept = 'application/msgpack;charset=utf-8'
 87            self.content_type = 'application/msgpack;charset=utf-8'
 88            self.content_loader = partial(msgpack.loads, object_hook=_date_parser, strict_map_key=False)
 89        except ImportError:
 90            self.accept = 'application/json;charset=utf-8'
 91            self.content_type = 'application/json;charset=utf-8'
 92            self.content_loader = partial(loads, cls=self.JSONDateTimeDecoder)
 93
 94    def support(self, headers:Dict) -> bool:
 95        if 'content-type' not in headers:
 96            return False
 97
 98        for content_type in self.SUPPORTED_CONTENT_TYPES:
 99            if headers['content-type'].find(content_type) != -1:
100                return True
101
102        return False
103
104    def __call__(self, content: bytes) -> Union[str, Dict]:
105        try:
106            return self.content_loader(content)
107        except Exception as e:
108            raise EncoderError(content=content.decode('utf-8')) from e
ResponseBodyHandler(use_brotli: bool = False)
70    def __init__(self, use_brotli:bool=False):
71        if use_brotli is True and 'br' not in self.SUPPORTED_COMPRESSION:
72            try:
73                try:
74                    import brotlicffi as brotli
75                    self.SUPPORTED_COMPRESSION.insert(0, 'br')
76                except ImportError:
77                    import brotli
78                    self.SUPPORTED_COMPRESSION.insert(0, 'br')
79            except ImportError:
80                pass
81
82        self.content_encoding = ', '.join(self.SUPPORTED_COMPRESSION)
83
84        try:  # automatically use msgpack if available https://msgpack.org/
85            import msgpack
86            self.accept = 'application/msgpack;charset=utf-8'
87            self.content_type = 'application/msgpack;charset=utf-8'
88            self.content_loader = partial(msgpack.loads, object_hook=_date_parser, strict_map_key=False)
89        except ImportError:
90            self.accept = 'application/json;charset=utf-8'
91            self.content_type = 'application/json;charset=utf-8'
92            self.content_loader = partial(loads, cls=self.JSONDateTimeDecoder)
def support(self, headers: Dict) -> bool:
 94    def support(self, headers:Dict) -> bool:
 95        if 'content-type' not in headers:
 96            return False
 97
 98        for content_type in self.SUPPORTED_CONTENT_TYPES:
 99            if headers['content-type'].find(content_type) != -1:
100                return True
101
102        return False
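
ResponseBodyHandler negotiates compression and content type for API responses: it advertises gzip/deflate (plus br when a brotli package is importable), prefers msgpack when that package is installed, and decodes response bodies via __call__, raising EncoderError on failure. A small sketch that only exercises the JSON path when it is the active decoder:

import json
from scrapfly import ResponseBodyHandler

handler = ResponseBodyHandler(use_brotli=False)
print(handler.accept)            # e.g. 'application/json;charset=utf-8' or 'application/msgpack;charset=utf-8'
print(handler.content_encoding)  # e.g. 'gzip, deflate'

headers = {'content-type': 'application/json; charset=utf-8'}
print(handler.support(headers))  # True: application/json is a supported content type

# decode only if the JSON loader is active (with msgpack installed the handler expects msgpack bytes)
if handler.content_type.startswith('application/json'):
    print(handler(json.dumps({'ok': True}).encode('utf-8')))  # {'ok': True}
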
class ResponseBodyHandler.JSONDateTimeDecoder(json.decoder.JSONDecoder):
64    class JSONDateTimeDecoder(JSONDecoder):
65        def __init__(self, *args, **kargs):
66            JSONDecoder.__init__(self, *args, object_hook=_date_parser, **kargs)

Simple JSON https://json.org decoder

Performs the following translations in decoding by default:

+---------------+-------------------+
| JSON          | Python            |
+===============+===================+
| object        | dict              |
+---------------+-------------------+
| array         | list              |
+---------------+-------------------+
| string        | str               |
+---------------+-------------------+
| number (int)  | int               |
+---------------+-------------------+
| number (real) | float             |
+---------------+-------------------+
| true          | True              |
+---------------+-------------------+
| false         | False             |
+---------------+-------------------+
| null          | None              |
+---------------+-------------------+

It also understands NaN, Infinity, and -Infinity as their corresponding float values, which is outside the JSON spec.

ResponseBodyHandler.JSONDateTimeDecoder(*args, **kargs)
65        def __init__(self, *args, **kargs):
66            JSONDecoder.__init__(self, *args, object_hook=_date_parser, **kargs)

object_hook, if specified, will be called with the result of every JSON object decoded and its return value will be used in place of the given dict. This can be used to provide custom deserializations (e.g. to support JSON-RPC class hinting).

object_pairs_hook, if specified will be called with the result of every JSON object decoded with an ordered list of pairs. The return value of object_pairs_hook will be used instead of the dict. This feature can be used to implement custom decoders. If object_hook is also defined, the object_pairs_hook takes priority.

parse_float, if specified, will be called with the string of every JSON float to be decoded. By default this is equivalent to float(num_str). This can be used to use another datatype or parser for JSON floats (e.g. decimal.Decimal).

parse_int, if specified, will be called with the string of every JSON int to be decoded. By default this is equivalent to int(num_str). This can be used to use another datatype or parser for JSON integers (e.g. float).

parse_constant, if specified, will be called with one of the following strings: -Infinity, Infinity, NaN. This can be used to raise an exception if invalid JSON numbers are encountered.

If strict is false (true is the default), then control characters will be allowed inside strings. Control characters in this context are those with character codes in the 0-31 range, including '\t' (tab), '\n', '\r' and '\0'.

Inherited Members
json.decoder.JSONDecoder
decode
raw_decode
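
JSONDateTimeDecoder is simply JSONDecoder wired to the module's _date_parser object_hook, so date-like strings inside decoded objects come back as datetime values. Below is a standalone sketch of the same object_hook pattern; the parser is illustrative and not the SDK's _date_parser:

from datetime import datetime
from json import JSONDecoder, loads

def parse_dates(obj: dict) -> dict:
    # illustrative hook: turn ISO-8601 looking string values into datetime objects
    for key, value in obj.items():
        if isinstance(value, str):
            try:
                obj[key] = datetime.fromisoformat(value)
            except ValueError:
                pass
    return obj

class DateTimeDecoder(JSONDecoder):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, object_hook=parse_dates, **kwargs)

print(loads('{"created_at": "2023-01-01T10:00:00"}', cls=DateTimeDecoder))
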
class ScrapeConfig:
 14class ScrapeConfig:
 15
 16    PUBLIC_DATACENTER_POOL = 'public_datacenter_pool'
 17    PUBLIC_RESIDENTIAL_POOL = 'public_residential_pool'
 18
 19    url: str
 20    retry: bool = True
 21    method: str = 'GET'
 22    country: Optional[str] = None
 23    render_js: bool = False
 24    cache: bool = False
 25    cache_clear:bool = False
 26    ssl:bool = False
 27    dns:bool = False
 28    asp:bool = False
 29    debug: bool = False
 30    raise_on_upstream_error:bool = True
 31    cache_ttl:Optional[int] = None
 32    proxy_pool:Optional[str] = None
 33    session: Optional[str] = None
 34    tags: Optional[List[str]] = None
 35    correlation_id: Optional[str] = None
 36    cookies: Optional[CaseInsensitiveDict] = None
 37    body: Optional[str] = None
 38    data: Optional[Dict] = None
 39    headers: Optional[CaseInsensitiveDict] = None
 40    js: str = None
 41    rendering_wait: int = None
 42    wait_for_selector: Optional[str] = None
 43    session_sticky_proxy:bool = True
 44    screenshots:Optional[Dict]=None
 45    webhook:Optional[str]=None
 46    timeout:Optional[int]=None # in milliseconds
 47    js_scenario: Dict = None
 48    extract: Dict = None
 49    lang:Optional[List[str]] = None
 50    os:Optional[str] = None
 51    auto_scroll:Optional[bool] = None
 52
 53    def __init__(
 54        self,
 55        url: str,
 56        retry: bool = True,
 57        method: str = 'GET',
 58        country: Optional[str] = None,
 59        render_js: bool = False,
 60        cache: bool = False,
 61        cache_clear:bool = False,
 62        ssl:bool = False,
 63        dns:bool = False,
 64        asp:bool = False,
 65        debug: bool = False,
 66        raise_on_upstream_error:bool = True,
 67        cache_ttl:Optional[int] = None,
 68        proxy_pool:Optional[str] = None,
 69        session: Optional[str] = None,
 70        tags: Optional[Set[str]] = None,
 71        correlation_id: Optional[str] = None,
 72        cookies: Optional[CaseInsensitiveDict] = None,
 73        body: Optional[str] = None,
 74        data: Optional[Dict] = None,
 75        headers: Optional[Union[CaseInsensitiveDict, Dict[str, str]]] = None,
 76        js: str = None,
 77        rendering_wait: int = None,
 78        wait_for_selector: Optional[str] = None,
 79        screenshots:Optional[Dict]=None,
 80        session_sticky_proxy:Optional[bool] = None,
 81        webhook:Optional[str] = None,
 82        timeout:Optional[int] = None, # in milliseconds
 83        js_scenario:Optional[Dict] = None,
 84        extract:Optional[Dict] = None,
 85        os:Optional[str] = None,
 86        lang:Optional[List[str]] = None,
 87        auto_scroll:Optional[bool] = None
 88    ):
 89        assert(type(url) is str)
 90
 91        if isinstance(tags, List):
 92            tags = set(tags)
 93
 94        cookies = cookies or {}
 95        headers = headers or {}
 96
 97        self.cookies = CaseInsensitiveDict(cookies)
 98        self.headers = CaseInsensitiveDict(headers)
 99        self.url = url
100        self.retry = retry
101        self.method = method
102        self.country = country
103        self.session_sticky_proxy = session_sticky_proxy
104        self.render_js = render_js
105        self.cache = cache
106        self.cache_clear = cache_clear
107        self.asp = asp
108        self.webhook = webhook
109        self.session = session
110        self.debug = debug
111        self.cache_ttl = cache_ttl
112        self.proxy_pool = proxy_pool
113        self.tags = tags or set()
114        self.correlation_id = correlation_id
115        self.wait_for_selector = wait_for_selector
116        self.body = body
117        self.data = data
118        self.js = js
119        self.rendering_wait = rendering_wait
120        self.raise_on_upstream_error = raise_on_upstream_error
121        self.screenshots = screenshots
122        self.key = None
123        self.dns = dns
124        self.ssl = ssl
125        self.js_scenario = js_scenario
126        self.timeout = timeout
127        self.extract = extract
128        self.lang = lang
129        self.os = os
130        self.auto_scroll = auto_scroll
131
132        if cookies:
133            _cookies = []
134
135            for name, value in cookies.items():
136                _cookies.append(name + '=' + value)
137
138            if 'cookie' in self.headers:
139                if self.headers['cookie'][-1] != ';':
140                    self.headers['cookie'] += ';'
141            else:
142                self.headers['cookie'] = ''
143
144            self.headers['cookie'] += '; '.join(_cookies)
145
146        if self.body and self.data:
147            raise ScrapeConfigError('You cannot pass both body and data parameters - choose one')
148
149        if method in ['POST', 'PUT', 'PATCH']:
150            if self.body is None and self.data is not None:
151                if 'content-type' not in self.headers:
152                    self.headers['content-type'] = 'application/x-www-form-urlencoded'
153                    self.body = urlencode(data)
154                else:
155                    if self.headers['content-type'].find('application/json') != -1:
156                        self.body = json.dumps(data)
157                    elif self.headers['content-type'].find('application/x-www-form-urlencoded') != -1:
158                        self.body = urlencode(data)
159                    else:
160                        raise ScrapeConfigError('Content-Type "%s" not supported, use the body parameter to pass a pre-encoded body matching your content type' % self.headers['content-type'])
161            elif self.body is None and self.data is None:
162                self.headers['content-type'] = 'text/plain'
163
164    def _bool_to_http(self, _bool:bool) -> str:
165        return 'true' if _bool is True else 'false'
166
167    def to_api_params(self, key:str) -> Dict:
168        params = {
169            'key': self.key if self.key is not None else key,
170            'url': self.url
171        }
172
173        if self.country is not None:
174            params['country'] = self.country
175
176        for name, value in self.headers.items():
177            params['headers[%s]' % name] = value
178
179        if self.webhook is not None:
180            params['webhook_name'] = self.webhook
181
182        if self.timeout is not None:
183            params['timeout'] = self.timeout
184
185        if self.extract is not None:
186            params['extract'] = base64.urlsafe_b64encode(json.dumps(self.extract).encode('utf-8')).decode('utf-8')
187
188        if self.render_js is True:
189            params['render_js'] = self._bool_to_http(self.render_js)
190
191            if self.wait_for_selector is not None:
192                params['wait_for_selector'] = self.wait_for_selector
193
194            if self.js:
195                params['js'] = base64.urlsafe_b64encode(self.js.encode('utf-8')).decode('utf-8')
196
197            if self.js_scenario:
198                params['js_scenario'] = base64.urlsafe_b64encode(json.dumps(self.js_scenario).encode('utf-8')).decode('utf-8')
199
200            if self.rendering_wait:
201                params['rendering_wait'] = self.rendering_wait
202
203            if self.screenshots is not None:
204                for name, element in self.screenshots.items():
205                    params['screenshots[%s]' % name] = element
206
207            if self.auto_scroll is True:
208                params['auto_scroll'] = self._bool_to_http(self.auto_scroll)
209        else:
210            if self.wait_for_selector is not None:
211                logging.warning('Param "wait_for_selector" is ignored. Works only if render_js is enabled')
212
213            if self.screenshots:
214                logging.warning('Param "screenshots" is ignored. Works only if render_js is enabled')
215
216            if self.js_scenario:
217                logging.warning('Param "js_scenario" is ignored. Works only if render_js is enabled')
218
219            if self.js:
220                logging.warning('Param "js" is ignored. Works only if render_js is enabled')
221
222            if self.rendering_wait:
223                logging.warning('Param "rendering_wait" is ignored. Works only if render_js is enabled')
224
225        if self.asp is True:
226            params['asp'] = self._bool_to_http(self.asp)
227
228        if self.retry is False:
229            params['retry'] = self._bool_to_http(self.retry)
230
231        if self.cache is True:
232            params['cache'] = self._bool_to_http(self.cache)
233
234            if self.cache_clear is True:
235                params['cache_clear'] = self._bool_to_http(self.cache_clear)
236
237            if self.cache_ttl is not None:
238                params['cache_ttl'] = self.cache_ttl
239        else:
240            if self.cache_clear is True:
241                logging.warning('Param "cache_clear" is ignored. Works only if cache is enabled')
242
243            if self.cache_ttl is not None:
244                logging.warning('Param "cache_ttl" is ignored. Works only if cache is enabled')
245
246        if self.dns is True:
247            params['dns'] = self._bool_to_http(self.dns)
248
249        if self.ssl is True:
250            params['ssl'] = self._bool_to_http(self.ssl)
251
252        if self.tags:
253            params['tags'] = ','.join(self.tags)
254
255        if self.correlation_id:
256            params['correlation_id'] = self.correlation_id
257
258        if self.session:
259            params['session'] = self.session
260
261            if self.session_sticky_proxy is True: # false by default
262                params['session_sticky_proxy'] = self._bool_to_http(self.session_sticky_proxy)
263        else:
264            if self.session_sticky_proxy:
265                logging.warning('Param "session_sticky_proxy" is ignored. Works only if session is enabled')
266
267        if self.debug is True:
268            params['debug'] = self._bool_to_http(self.debug)
269
270        if self.proxy_pool is not None:
271            params['proxy_pool'] = self.proxy_pool
272
273        if self.lang is not None:
274            params['lang'] = ','.join(self.lang)
275
276        if self.os is not None:
277            params['os'] = self.os
278
279        return params
280
281    @staticmethod
282    def from_exported_config(config:str) -> 'ScrapeConfig':
283        try:
284            from msgpack import loads as msgpack_loads
285        except ImportError as e:
286            print('You must install the msgpack package - run: pip install "scrapfly-sdk[speedups]" or pip install msgpack')
287            raise
288
289        data = msgpack_loads(base64.b64decode(config))
290
291        headers = {}
292
293        for name, value in data['headers'].items():
294            if isinstance(value, Iterable):
295                headers[name] = '; '.join(value)
296            else:
297                headers[name] = value
298
299        return ScrapeConfig(
300            url=data['url'],
301            retry=data['retry'],
302            headers=headers,
303            session=data['session'],
304            session_sticky_proxy=data['session_sticky_proxy'],
305            cache=data['cache'],
306            cache_ttl=data['cache_ttl'],
307            cache_clear=data['cache_clear'],
308            render_js=data['render_js'],
309            method=data['method'],
310            asp=data['asp'],
311            body=data['body'],
312            ssl=data['ssl'],
313            dns=data['dns'],
314            country=data['country'],
315            debug=data['debug'],
316            correlation_id=data['correlation_id'],
317            tags=data['tags'],
318            js=data['js'],
319            rendering_wait=data['rendering_wait'],
320            screenshots=data['screenshots'] or {},
321            proxy_pool=data['proxy_pool'],
322            auto_scroll=data['auto_scroll']
323        )
ScrapeConfig( url: str, retry: bool = True, method: str = 'GET', country: Optional[str] = None, render_js: bool = False, cache: bool = False, cache_clear: bool = False, ssl: bool = False, dns: bool = False, asp: bool = False, debug: bool = False, raise_on_upstream_error: bool = True, cache_ttl: Optional[int] = None, proxy_pool: Optional[str] = None, session: Optional[str] = None, tags: Optional[Set[str]] = None, correlation_id: Optional[str] = None, cookies: Optional[requests.structures.CaseInsensitiveDict] = None, body: Optional[str] = None, data: Optional[Dict] = None, headers: Union[requests.structures.CaseInsensitiveDict, Dict[str, str], NoneType] = None, js: str = None, rendering_wait: int = None, wait_for_selector: Optional[str] = None, screenshots: Optional[Dict] = None, session_sticky_proxy: Optional[bool] = None, webhook: Optional[str] = None, timeout: Optional[int] = None, js_scenario: Optional[Dict] = None, extract: Optional[Dict] = None, os: Optional[str] = None, lang: Optional[List[str]] = None, auto_scroll: Optional[bool] = None)
 53    def __init__(
 54        self,
 55        url: str,
 56        retry: bool = True,
 57        method: str = 'GET',
 58        country: Optional[str] = None,
 59        render_js: bool = False,
 60        cache: bool = False,
 61        cache_clear:bool = False,
 62        ssl:bool = False,
 63        dns:bool = False,
 64        asp:bool = False,
 65        debug: bool = False,
 66        raise_on_upstream_error:bool = True,
 67        cache_ttl:Optional[int] = None,
 68        proxy_pool:Optional[str] = None,
 69        session: Optional[str] = None,
 70        tags: Optional[Set[str]] = None,
 71        correlation_id: Optional[str] = None,
 72        cookies: Optional[CaseInsensitiveDict] = None,
 73        body: Optional[str] = None,
 74        data: Optional[Dict] = None,
 75        headers: Optional[Union[CaseInsensitiveDict, Dict[str, str]]] = None,
 76        js: str = None,
 77        rendering_wait: int = None,
 78        wait_for_selector: Optional[str] = None,
 79        screenshots:Optional[Dict]=None,
 80        session_sticky_proxy:Optional[bool] = None,
 81        webhook:Optional[str] = None,
 82        timeout:Optional[int] = None, # in milliseconds
 83        js_scenario:Optional[Dict] = None,
 84        extract:Optional[Dict] = None,
 85        os:Optional[str] = None,
 86        lang:Optional[List[str]] = None,
 87        auto_scroll:Optional[bool] = None
 88    ):
 89        assert(type(url) is str)
 90
 91        if isinstance(tags, List):
 92            tags = set(tags)
 93
 94        cookies = cookies or {}
 95        headers = headers or {}
 96
 97        self.cookies = CaseInsensitiveDict(cookies)
 98        self.headers = CaseInsensitiveDict(headers)
 99        self.url = url
100        self.retry = retry
101        self.method = method
102        self.country = country
103        self.session_sticky_proxy = session_sticky_proxy
104        self.render_js = render_js
105        self.cache = cache
106        self.cache_clear = cache_clear
107        self.asp = asp
108        self.webhook = webhook
109        self.session = session
110        self.debug = debug
111        self.cache_ttl = cache_ttl
112        self.proxy_pool = proxy_pool
113        self.tags = tags or set()
114        self.correlation_id = correlation_id
115        self.wait_for_selector = wait_for_selector
116        self.body = body
117        self.data = data
118        self.js = js
119        self.rendering_wait = rendering_wait
120        self.raise_on_upstream_error = raise_on_upstream_error
121        self.screenshots = screenshots
122        self.key = None
123        self.dns = dns
124        self.ssl = ssl
125        self.js_scenario = js_scenario
126        self.timeout = timeout
127        self.extract = extract
128        self.lang = lang
129        self.os = os
130        self.auto_scroll = auto_scroll
131
132        if cookies:
133            _cookies = []
134
135            for name, value in cookies.items():
136                _cookies.append(name + '=' + value)
137
138            if 'cookie' in self.headers:
139                if self.headers['cookie'][-1] != ';':
140                    self.headers['cookie'] += ';'
141            else:
142                self.headers['cookie'] = ''
143
144            self.headers['cookie'] += '; '.join(_cookies)
145
146        if self.body and self.data:
147            raise ScrapeConfigError('You cannot pass both body and data parameters - choose one')
148
149        if method in ['POST', 'PUT', 'PATCH']:
150            if self.body is None and self.data is not None:
151                if 'content-type' not in self.headers:
152                    self.headers['content-type'] = 'application/x-www-form-urlencoded'
153                    self.body = urlencode(data)
154                else:
155                    if self.headers['content-type'].find('application/json') != -1:
156                        self.body = json.dumps(data)
157                    elif self.headers['content-type'].find('application/x-www-form-urlencoded') != -1:
158                        self.body = urlencode(data)
159                    else:
160                        raise ScrapeConfigError('Content-Type "%s" not supported, use the body parameter to pass a pre-encoded body matching your content type' % self.headers['content-type'])
161            elif self.body is None and self.data is None:
162                self.headers['content-type'] = 'text/plain'
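
The constructor folds cookies into the cookie header and, for POST/PUT/PATCH, encodes data into body according to the content-type header (form-encoded by default, JSON when the header says so). A deterministic sketch:

from scrapfly import ScrapeConfig

# no content-type given: data is form-encoded and the header is set accordingly
form_config = ScrapeConfig(
    url='https://httpbin.org/post',
    method='POST',
    data={'q': 'scrapfly', 'page': '1'}
)
print(form_config.headers['content-type'])  # application/x-www-form-urlencoded
print(form_config.body)                     # q=scrapfly&page=1

# explicit application/json content-type: data is serialized with json.dumps instead
json_config = ScrapeConfig(
    url='https://httpbin.org/post',
    method='POST',
    headers={'content-type': 'application/json'},
    data={'q': 'scrapfly'}
)
print(json_config.body)                     # {"q": "scrapfly"}
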
def to_api_params(self, key: str) -> Dict:
167    def to_api_params(self, key:str) -> Dict:
168        params = {
169            'key': self.key if self.key is not None else key,
170            'url': self.url
171        }
172
173        if self.country is not None:
174            params['country'] = self.country
175
176        for name, value in self.headers.items():
177            params['headers[%s]' % name] = value
178
179        if self.webhook is not None:
180            params['webhook_name'] = self.webhook
181
182        if self.timeout is not None:
183            params['timeout'] = self.timeout
184
185        if self.extract is not None:
186            params['extract'] = base64.urlsafe_b64encode(json.dumps(self.extract).encode('utf-8')).decode('utf-8')
187
188        if self.render_js is True:
189            params['render_js'] = self._bool_to_http(self.render_js)
190
191            if self.wait_for_selector is not None:
192                params['wait_for_selector'] = self.wait_for_selector
193
194            if self.js:
195                params['js'] = base64.urlsafe_b64encode(self.js.encode('utf-8')).decode('utf-8')
196
197            if self.js_scenario:
198                params['js_scenario'] = base64.urlsafe_b64encode(json.dumps(self.js_scenario).encode('utf-8')).decode('utf-8')
199
200            if self.rendering_wait:
201                params['rendering_wait'] = self.rendering_wait
202
203            if self.screenshots is not None:
204                for name, element in self.screenshots.items():
205                    params['screenshots[%s]' % name] = element
206
207            if self.auto_scroll is True:
208                params['auto_scroll'] = self._bool_to_http(self.auto_scroll)
209        else:
210            if self.wait_for_selector is not None:
211                logging.warning('Param "wait_for_selector" is ignored. Works only if render_js is enabled')
212
213            if self.screenshots:
214                logging.warning('Param "screenshots" is ignored. Works only if render_js is enabled')
215
216            if self.js_scenario:
217                logging.warning('Param "js_scenario" is ignored. Works only if render_js is enabled')
218
219            if self.js:
220                logging.warning('Param "js" is ignored. Works only if render_js is enabled')
221
222            if self.rendering_wait:
223                logging.warning('Param "rendering_wait" is ignored. Works only if render_js is enabled')
224
225        if self.asp is True:
226            params['asp'] = self._bool_to_http(self.asp)
227
228        if self.retry is False:
229            params['retry'] = self._bool_to_http(self.retry)
230
231        if self.cache is True:
232            params['cache'] = self._bool_to_http(self.cache)
233
234            if self.cache_clear is True:
235                params['cache_clear'] = self._bool_to_http(self.cache_clear)
236
237            if self.cache_ttl is not None:
238                params['cache_ttl'] = self.cache_ttl
239        else:
240            if self.cache_clear is True:
241                logging.warning('Param "cache_clear" is ignored. Works only if cache is enabled')
242
243            if self.cache_ttl is not None:
244                logging.warning('Param "cache_ttl" is ignored. Works only if cache is enabled')
245
246        if self.dns is True:
247            params['dns'] = self._bool_to_http(self.dns)
248
249        if self.ssl is True:
250            params['ssl'] = self._bool_to_http(self.ssl)
251
252        if self.tags:
253            params['tags'] = ','.join(self.tags)
254
255        if self.correlation_id:
256            params['correlation_id'] = self.correlation_id
257
258        if self.session:
259            params['session'] = self.session
260
261            if self.session_sticky_proxy is True: # false by default
262                params['session_sticky_proxy'] = self._bool_to_http(self.session_sticky_proxy)
263        else:
264            if self.session_sticky_proxy:
265                logging.warning('Param "session_sticky_proxy" is ignored. Works only if session is enabled')
266
267        if self.debug is True:
268            params['debug'] = self._bool_to_http(self.debug)
269
270        if self.proxy_pool is not None:
271            params['proxy_pool'] = self.proxy_pool
272
273        if self.lang is not None:
274            params['lang'] = ','.join(self.lang)
275
276        if self.os is not None:
277            params['os'] = self.os
278
279        return params
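
to_api_params() flattens the config into the flat query-parameter form the API expects: headers become headers[name], booleans become 'true'/'false', and js/js_scenario/extract are base64url-encoded. A quick illustration with a placeholder key:

from scrapfly import ScrapeConfig

config = ScrapeConfig(
    url='https://httpbin.org/html',
    render_js=True,
    rendering_wait=2000,
    country='us',
    asp=True,
    headers={'x-test': '1'}
)

params = config.to_api_params(key='__API_KEY__')  # placeholder key
# expected shape based on the code above:
# {'key': '__API_KEY__', 'url': 'https://httpbin.org/html', 'country': 'us',
#  'headers[x-test]': '1', 'render_js': 'true', 'rendering_wait': 2000, 'asp': 'true'}
print(params)
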
@staticmethod
def from_exported_config(config: str) -> scrapfly.ScrapeConfig:
281    @staticmethod
282    def from_exported_config(config:str) -> 'ScrapeConfig':
283        try:
284            from msgpack import loads as msgpack_loads
285        except ImportError as e:
286            print('You must install the msgpack package - run: pip install "scrapfly-sdk[speedups]" or pip install msgpack')
287            raise
288
289        data = msgpack_loads(base64.b64decode(config))
290
291        headers = {}
292
293        for name, value in data['headers'].items():
294            if isinstance(value, Iterable):
295                headers[name] = '; '.join(value)
296            else:
297                headers[name] = value
298
299        return ScrapeConfig(
300            url=data['url'],
301            retry=data['retry'],
302            headers=headers,
303            session=data['session'],
304            session_sticky_proxy=data['session_sticky_proxy'],
305            cache=data['cache'],
306            cache_ttl=data['cache_ttl'],
307            cache_clear=data['cache_clear'],
308            render_js=data['render_js'],
309            method=data['method'],
310            asp=data['asp'],
311            body=data['body'],
312            ssl=data['ssl'],
313            dns=data['dns'],
314            country=data['country'],
315            debug=data['debug'],
316            correlation_id=data['correlation_id'],
317            tags=data['tags'],
318            js=data['js'],
319            rendering_wait=data['rendering_wait'],
320            screenshots=data['screenshots'] or {},
321            proxy_pool=data['proxy_pool'],
322            auto_scroll=data['auto_scroll']
323        )
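
from_exported_config() rebuilds a ScrapeConfig from a base64-encoded msgpack blob and requires the msgpack package. Since a real exported string cannot be reproduced here, the sketch below round-trips a synthetic blob containing exactly the keys the method reads:

import base64
import msgpack  # pip install msgpack
from scrapfly import ScrapeConfig

# synthetic stand-in for a real Scrapfly export blob
exported_dict = {
    'url': 'https://httpbin.org/html', 'retry': True, 'headers': {}, 'session': None,
    'session_sticky_proxy': None, 'cache': False, 'cache_ttl': None, 'cache_clear': False,
    'render_js': False, 'method': 'GET', 'asp': False, 'body': None, 'ssl': False,
    'dns': False, 'country': None, 'debug': False, 'correlation_id': None, 'tags': None,
    'js': None, 'rendering_wait': None, 'screenshots': None, 'proxy_pool': None,
    'auto_scroll': None
}
exported = base64.b64encode(msgpack.packb(exported_dict)).decode('utf-8')

config = ScrapeConfig.from_exported_config(exported)
print(config.url, config.method, config.render_js)
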