scrapfly
__version__ = '0.8.5'

from typing import Tuple
from .errors import ScrapflyError
from .errors import ScrapflyAspError
from .errors import ScrapflyProxyError
from .errors import ScrapflyScheduleError
from .errors import ScrapflyScrapeError
from .errors import ScrapflySessionError
from .errors import ScrapflyThrottleError
from .errors import ScrapflyWebhookError
from .errors import EncoderError
from .errors import ErrorFactory
from .errors import HttpError
from .errors import UpstreamHttpError
from .errors import UpstreamHttpClientError
from .errors import UpstreamHttpServerError
from .errors import ApiHttpClientError
from .errors import ApiHttpServerError
from .api_response import ScrapeApiResponse, ResponseBodyHandler
from .client import ScrapflyClient
from .scrape_config import ScrapeConfig

__all__: Tuple[str, ...] = (
    'ScrapflyError',
    'ScrapflyAspError',
    'ScrapflyProxyError',
    'ScrapflyScheduleError',
    'ScrapflyScrapeError',
    'ScrapflySessionError',
    'ScrapflyThrottleError',
    'ScrapflyWebhookError',
    'UpstreamHttpError',
    'UpstreamHttpClientError',
    'UpstreamHttpServerError',
    'ApiHttpClientError',
    'ApiHttpServerError',
    'EncoderError',
    'ScrapeApiResponse',
    'ErrorFactory',
    'HttpError',
    'ScrapflyClient',
    'ResponseBodyHandler',
    'ScrapeConfig'
)
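The package root re-exports the client, the request configuration and the whole error hierarchy, so everything documented below can be imported directly from scrapfly. A minimal usage sketch; the API key and target URL are placeholders:

from scrapfly import ScrapflyClient, ScrapeConfig

client = ScrapflyClient(key='YOUR_API_KEY')  # hypothetical key
api_response = client.scrape(ScrapeConfig(url='https://httpbin.org/html'))

print(api_response.upstream_status_code)  # status code returned by the scraped website
print(api_response.content)               # body returned by the scraped website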
class ScrapflyError(Exception):
    KIND_HTTP_BAD_RESPONSE = 'HTTP_BAD_RESPONSE'
    KIND_SCRAPFLY_ERROR = 'SCRAPFLY_ERROR'

    RESOURCE_PROXY = 'PROXY'
    RESOURCE_THROTTLE = 'THROTTLE'
    RESOURCE_SCRAPE = 'SCRAPE'
    RESOURCE_ASP = 'ASP'
    RESOURCE_SCHEDULE = 'SCHEDULE'
    RESOURCE_WEBHOOK = 'WEBHOOK'
    RESOURCE_SESSION = 'SESSION'

    RETRYABLE_CODE = [
        'ERR::SCRAPE::OPERATION_TIMEOUT',
        'ERR::SCRAPE::TOO_MANY_CONCURRENT_REQUEST',
        'ERR::PROXY::RESOURCES_SATURATION',
        'ERR::PROXY::NOT_REACHABLE',
        'ERR::PROXY::UNAVAILABLE',
        'ERR::THROTTLE::MAX_CONCURRENT_REQUEST_EXCEEDED',
        'ERR::THROTTLE::MAX_REQUEST_RATE_EXCEEDED',
        'ERR::SESSION::CONCURRENT_ACCESS',
        'ERR::ASP::SHIELD_EXPIRED',
        'ERR::SCRAPE::NETWORK_ISSUE',
        'ERR::SCRAPE::DRIVER_TIMEOUT'
    ]

    def __init__(
        self,
        message: str,
        code: str,
        http_status_code: int,
        resource: Optional[str] = None,
        is_retryable: bool = False,
        retry_delay: Optional[int] = None,
        retry_times: Optional[int] = None,
        documentation_url: Optional[str] = None,
        api_response: Optional['ApiResponse'] = None
    ):
        self.message = message
        self.code = code
        self.retry_delay = retry_delay
        self.retry_times = retry_times
        self.resource = resource
        self.is_retryable = is_retryable
        self.documentation_url = documentation_url
        self.api_response = api_response
        self.http_status_code = http_status_code

        super().__init__(self.message, str(self.code))

    def __str__(self):
        message = self.message

        if self.documentation_url is not None:
            message += '. Learn more: %s' % self.documentation_url

        return message
Common base class for all non-exit exceptions.
Inherited Members
- builtins.BaseException
- with_traceback
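Every exception raised by the SDK derives from ScrapflyError and carries the API error code plus retry metadata. A minimal sketch of how those fields can be used; the key and URL are placeholders:

import time
from scrapfly import ScrapflyClient, ScrapeConfig, ScrapflyError

client = ScrapflyClient(key='YOUR_API_KEY')

try:
    client.scrape(ScrapeConfig(url='https://example.com'))
except ScrapflyError as e:
    print(e.code, e.http_status_code)     # e.g. 'ERR::PROXY::UNAVAILABLE', 503
    if e.is_retryable and e.retry_delay:  # codes listed in RETRYABLE_CODE are safe to retry
        time.sleep(e.retry_delay)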
The remaining exported error classes (ScrapflyAspError, ScrapflyProxyError, ScrapflyScheduleError, ScrapflyScrapeError, ScrapflySessionError, ScrapflyThrottleError, ScrapflyWebhookError, UpstreamHttpError, UpstreamHttpClientError, UpstreamHttpServerError, ApiHttpClientError, ApiHttpServerError) define no docstring of their own; they indicate which resource or side (API or upstream) a failure belongs to, as selected by ErrorFactory below.
class EncoderError(BaseException):

    def __init__(self, content: str):
        self.content = content
        super().__init__()

    def __str__(self) -> str:
        return self.content
Common base class for all exceptions
Inherited Members
- builtins.BaseException
- with_traceback
class ScrapeApiResponse:

    def __init__(self, request: Request, response: Response, scrape_config: ScrapeConfig, api_result: Optional[Dict] = None):
        self.request = request
        self.response = response
        self.scrape_config = scrape_config

        if self.scrape_config.method == 'HEAD':
            api_result = {
                'result': {
                    'request_headers': {},
                    'status': 'DONE',
                    'success': 200 <= self.response.status_code < 300,
                    'response_headers': self.response.headers,
                    'status_code': self.response.status_code,
                    'reason': self.response.reason,
                    'format': 'text',
                    'content': ''
                },
                'context': {},
                'config': self.scrape_config.__dict__
            }

            if 'X-Scrapfly-Reject-Code' in self.response.headers:
                api_result['result']['error'] = {
                    'code': self.response.headers['X-Scrapfly-Reject-Code'],
                    'http_code': int(self.response.headers['X-Scrapfly-Reject-Http-Code']),
                    'message': self.response.headers['X-Scrapfly-Reject-Description'],
                    'error_id': self.response.headers['X-Scrapfly-Reject-ID'],
                    'retryable': True if self.response.headers['X-Scrapfly-Reject-Retryable'] == 'yes' else False,
                    'doc_url': '',
                    'links': {}
                }

                if 'X-Scrapfly-Reject-Doc' in self.response.headers:
                    api_result['result']['error']['doc_url'] = self.response.headers['X-Scrapfly-Reject-Doc']
                    api_result['result']['error']['links']['Related Docs'] = self.response.headers['X-Scrapfly-Reject-Doc']

        if isinstance(api_result, str):
            raise HttpError(
                request=request,
                response=response,
                message='Bad gateway',
                code=502,
                http_status_code=502,
                is_retryable=True
            )

        self.result = self.handle_api_result(api_result=api_result)

    @property
    def scrape_result(self) -> Dict:
        return self.result['result']

    @property
    def config(self) -> Dict:
        return self.result['config']

    @property
    def context(self) -> Dict:
        return self.result['context']

    @property
    def content(self) -> str:
        return self.scrape_result['content']

    @property
    def success(self) -> bool:
        """
        /!\ Success means the Scrapfly API replied correctly to the call; the scrape itself
        can still be unsuccessful if the upstream website replied with an error status code
        """
        return 200 <= self.response.status_code <= 299

    @property
    def scrape_success(self) -> bool:
        return self.scrape_result['success']

    @property
    def error(self) -> Optional[Dict]:
        if self.scrape_success is False:
            return self.scrape_result['error']

    @property
    def status_code(self) -> int:
        """
        /!\ This is the status code of the Scrapfly API, not of the upstream website
        """
        return self.response.status_code

    @property
    def upstream_status_code(self) -> Optional[int]:
        if 'status_code' in self.scrape_result:
            return self.scrape_result['status_code']

        return None

    def prevent_extra_usage(self):
        if self.remaining_quota == 0:
            raise ExtraUsageForbidden(
                message='All Pre Paid Quota Used',
                code='ERR::ACCOUNT::PREVENT_EXTRA_USAGE',
                http_status_code=429,
                is_retryable=False
            )

    @property
    def remaining_quota(self) -> Optional[int]:
        remaining_scrape = self.response.headers.get('X-Scrapfly-Remaining-Scrape')

        if remaining_scrape:
            remaining_scrape = int(remaining_scrape)

        return remaining_scrape

    @property
    def cost(self) -> Optional[int]:
        cost = self.response.headers.get('X-Scrapfly-Api-Cost')

        if cost:
            cost = int(cost)

        return cost

    @property
    def duration_ms(self) -> Optional[float]:
        duration = self.response.headers.get('X-Scrapfly-Response-Time')

        if duration:
            duration = float(duration)

        return duration

    @property
    def headers(self) -> CaseInsensitiveDict:
        return self.response.headers

    def handle_api_result(self, api_result: Dict) -> Optional[FrozenDict]:
        if self._is_api_error(api_result=api_result) is True:
            return FrozenDict(api_result)

        try:
            if isinstance(api_result['config']['headers'], list):
                api_result['config']['headers'] = {}
        except TypeError:
            logger.info(api_result)
            raise

        with suppress(KeyError):
            api_result['result']['request_headers'] = CaseInsensitiveDict(api_result['result']['request_headers'])
            api_result['result']['response_headers'] = CaseInsensitiveDict(api_result['result']['response_headers'])

        if api_result['result']['format'] == 'binary' and api_result['result']['content']:
            api_result['result']['content'] = BytesIO(b64decode(api_result['result']['content']))

        return FrozenDict(api_result)

    @cached_property
    def soup(self) -> 'BeautifulSoup':
        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(self.content, "lxml")
            return soup
        except ImportError as e:
            logger.error('You must install scrapfly[parser] to enable this feature')

    @cached_property
    def selector(self) -> 'Selector':
        try:
            from scrapy import Selector
            return Selector(text=self.content)
        except ImportError as e:
            logger.error('You must install scrapfly[scrapy] to enable this feature')
            raise e

    @property
    def error_message(self):
        if self.error:
            message = "<-- %s | %s - %s." % (self.response.status_code, self.error['code'], self.error['message'])

            if self.error['links']:
                message += "Checkout the related doc: %s" % list(self.error['links'].values())[0]

            return message

        return '<-- %s - %s %s | Doc: %s' % (self.response.status_code, self.http_status_code, self.code, self.documentation_url)

    def _is_api_error(self, api_result: Dict) -> bool:
        if self.scrape_config.method == 'HEAD':
            if 'X-Reject-Reason' in self.response.headers:
                return True
            return False

        if api_result is None:
            return True

        return 'error_id' in api_result

    def raise_for_result(self, raise_on_upstream_error: bool = True):
        try:
            self.response.raise_for_status()
        except HTTPError as e:
            if 'http_code' in self.result:
                if e.response.status_code >= 500:
                    raise ApiHttpServerError(
                        request=e.request,
                        response=e.response,
                        message=self.result['message'],
                        code='',
                        resource='',
                        http_status_code=e.response.status_code,
                        documentation_url=self.result.get('links')
                    ) from e
                else:
                    raise ApiHttpClientError(
                        request=e.request,
                        response=e.response,
                        message=self.result['message'],
                        code='',
                        resource='API',
                        http_status_code=self.result['http_code'],
                        documentation_url=self.result.get('links')
                    ) from e

        if self.result['result']['status'] == 'DONE' and self.scrape_success is False:
            error = ErrorFactory.create(api_response=self)

            if error:
                if isinstance(error, UpstreamHttpError):
                    if raise_on_upstream_error is True:
                        raise error
                else:
                    raise error

    def upstream_result_into_response(self, _class=Response) -> Optional[Response]:
        if _class != Response:
            raise RuntimeError('only Response from requests package is supported at the moment')

        if self.result is None:
            return None

        if self.response.status_code != 200:
            return None

        response = Response()
        response.status_code = self.scrape_result['status_code']
        response.reason = self.scrape_result['reason']
        response._content = self.scrape_result['content'].encode('utf-8') if self.scrape_result['content'] else None
        response.headers.update(self.scrape_result['response_headers'])
        response.url = self.scrape_result['url']

        response.request = Request(
            method=self.config['method'],
            url=self.config['url'],
            headers=self.scrape_result['request_headers'],
            data=self.config['body'] if self.config['body'] else None
        )

        if 'set-cookie' in response.headers:
            for raw_cookie in response.headers['set-cookie']:
                for name, cookie in SimpleCookie(raw_cookie).items():
                    expires = cookie.get('expires')

                    if expires == '':
                        expires = None

                    if expires:
                        try:
                            expires = parse(expires).timestamp()
                        except ValueError:
                            expires = None

                    if type(expires) == str:
                        if '.' in expires:
                            expires = float(expires)
                        else:
                            expires = int(expires)

                    response.cookies.set_cookie(Cookie(
                        version=cookie.get('version') if cookie.get('version') else None,
                        name=name,
                        value=cookie.value,
                        path=cookie.get('path', ''),
                        expires=expires,
                        comment=cookie.get('comment'),
                        domain=cookie.get('domain', ''),
                        secure=cookie.get('secure'),
                        port=None,
                        port_specified=False,
                        domain_specified=cookie.get('domain') is not None and cookie.get('domain') != '',
                        domain_initial_dot=bool(cookie.get('domain').startswith('.')) if cookie.get('domain') is not None else False,
                        path_specified=cookie.get('path') != '' and cookie.get('path') is not None,
                        discard=False,
                        comment_url=None,
                        rest={
                            'httponly': cookie.get('httponly'),
                            'samesite': cookie.get('samesite'),
                            'max-age': cookie.get('max-age')
                        }
                    ))

        return response

    def sink(self, path: Optional[str] = None, name: Optional[str] = None, file: Optional[Union[TextIO, BytesIO]] = None, content: Optional[Union[str, bytes]] = None):
        file_content = content or self.scrape_result['content']
        file_path = None
        file_extension = None

        if name:
            name_parts = name.split('.')
            if len(name_parts) > 1:
                file_extension = name_parts[-1]

        if not file:
            if file_extension is None:
                try:
                    mime_type = self.scrape_result['response_headers']['content-type']
                except KeyError:
                    mime_type = 'application/octet-stream'

                if ';' in mime_type:
                    mime_type = mime_type.split(';')[0]

                file_extension = '.' + mime_type.split('/')[1]

            if not name:
                name = self.config['url'].split('/')[-1]

            if name.find(file_extension) == -1:
                name += file_extension

            file_path = path + '/' + name if path is not None else name

            if file_path == file_extension:
                url = re.sub(r'(https|http)?://', '', self.config['url']).replace('/', '-')

                if url[-1] == '-':
                    url = url[:-1]

                url += file_extension

                file_path = url

            file = open(file_path, 'wb')

        if isinstance(file_content, str):
            file_content = BytesIO(file_content.encode('utf-8'))
        elif isinstance(file_content, bytes):
            file_content = BytesIO(file_content)

        file_content.seek(0)
        with file as f:
            shutil.copyfileobj(file_content, f, length=131072)

        logger.info('file %s created' % file_path)
/!\ Success means the Scrapfly API replied correctly to the call; the scrape itself can still be unsuccessful if the upstream website replied with an error status code.
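The sketch below illustrates that distinction together with the content helpers; the client and URLs are placeholders, and the soup property additionally requires the scrapfly[parser] extra (BeautifulSoup with lxml):

from scrapfly import ScrapflyClient, ScrapeConfig

client = ScrapflyClient(key='YOUR_API_KEY')

api_response = client.scrape(ScrapeConfig(
    url='https://httpbin.org/status/404',  # placeholder URL returning an error
    raise_on_upstream_error=False           # keep the upstream error in the response instead of raising
))

print(api_response.success)               # whether the Scrapfly API call itself succeeded
print(api_response.scrape_success)        # whether the upstream fetch succeeded
print(api_response.upstream_status_code)  # status code returned by the target website, e.g. 404
print(api_response.cost, api_response.remaining_quota)

ok = client.scrape(ScrapeConfig(url='https://httpbin.org/html'))
print(ok.soup.find('h1').text)            # parsed with BeautifulSoup, needs scrapfly[parser]
ok.sink(path='.', name='page.html')       # write the scraped body to ./page.html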
class ErrorFactory:
    RESOURCE_TO_ERROR = {
        ScrapflyError.RESOURCE_SCRAPE: ScrapflyScrapeError,
        ScrapflyError.RESOURCE_WEBHOOK: ScrapflyWebhookError,
        ScrapflyError.RESOURCE_PROXY: ScrapflyProxyError,
        ScrapflyError.RESOURCE_SCHEDULE: ScrapflyScheduleError,
        ScrapflyError.RESOURCE_ASP: ScrapflyAspError,
        ScrapflyError.RESOURCE_SESSION: ScrapflySessionError
    }

    # Notable HTTP errors have their own class for convenience
    HTTP_STATUS_TO_ERROR = {
        401: BadApiKeyError,
        429: TooManyRequest
    }

    @staticmethod
    def _get_resource(code: str) -> Optional[str]:
        if isinstance(code, str) and '::' in code:
            _, resource, _ = code.split('::')
            return resource

        return None

    @staticmethod
    def create(api_response: 'ScrapeApiResponse'):
        is_retryable = False
        kind = ScrapflyError.KIND_HTTP_BAD_RESPONSE if api_response.success is False else ScrapflyError.KIND_SCRAPFLY_ERROR
        http_code = api_response.status_code
        retry_delay = 5
        retry_times = 3
        description = None
        error_url = 'https://scrapfly.io/docs/scrape-api/errors#api'
        code = api_response.error['code']

        if code == 'ERR::SCRAPE::BAD_UPSTREAM_RESPONSE':
            http_code = api_response.scrape_result['status_code']

        if 'description' in api_response.error:
            description = api_response.error['description']

        message = '%s %s %s' % (str(http_code), code, api_response.error['message'])

        if 'doc_url' in api_response.error:
            error_url = api_response.error['doc_url']

        if 'retryable' in api_response.error:
            is_retryable = api_response.error['retryable']

        resource = ErrorFactory._get_resource(code=code)

        if is_retryable is True:
            if 'X-Retry' in api_response.headers:
                retry_delay = int(api_response.headers['Retry-After'])

        message = '%s: %s' % (message, description) if description else message

        if retry_delay is not None and is_retryable is True:
            message = '%s. Retry delay : %s seconds' % (message, str(retry_delay))

        args = {
            'message': message,
            'code': code,
            'http_status_code': http_code,
            'is_retryable': is_retryable,
            'api_response': api_response,
            'resource': resource,
            'retry_delay': retry_delay,
            'retry_times': retry_times,
            'documentation_url': error_url,
            'request': api_response.request,
            'response': api_response.response
        }

        if kind == ScrapflyError.KIND_HTTP_BAD_RESPONSE:
            if http_code >= 500:
                return ApiHttpServerError(**args)

            if http_code in ErrorFactory.HTTP_STATUS_TO_ERROR:
                return ErrorFactory.HTTP_STATUS_TO_ERROR[http_code](**args)

            if resource in ErrorFactory.RESOURCE_TO_ERROR:
                return ErrorFactory.RESOURCE_TO_ERROR[resource](**args)

            return ApiHttpClientError(**args)

        elif kind == ScrapflyError.KIND_SCRAPFLY_ERROR:
            if code == 'ERR::SCRAPE::BAD_UPSTREAM_RESPONSE':
                if http_code >= 500:
                    return UpstreamHttpServerError(**args)

                if http_code >= 400:
                    return UpstreamHttpClientError(**args)

            if resource in ErrorFactory.RESOURCE_TO_ERROR:
                return ErrorFactory.RESOURCE_TO_ERROR[resource](**args)

            return ScrapflyError(**args)
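ErrorFactory maps the ERR::RESOURCE::NAME code of a failed scrape to one of the resource-specific exception classes, which makes targeted handling possible. A sketch, assuming a client configured as above and a placeholder URL:

from scrapfly import ScrapflyClient, ScrapeConfig
from scrapfly import ScrapflyAspError, ScrapflyProxyError, UpstreamHttpClientError

client = ScrapflyClient(key='YOUR_API_KEY')

try:
    client.scrape(ScrapeConfig(url='https://example.com', asp=True))
except UpstreamHttpClientError as e:
    print('target site answered 4xx:', e.api_response.upstream_status_code)
except (ScrapflyAspError, ScrapflyProxyError) as e:
    print('scrapfly side issue:', e.code, 'retryable:', e.is_retryable)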
class HttpError(ScrapflyError):

    def __init__(self, request: Request, response: Optional[Response] = None, **kwargs):
        self.request = request
        self.response = response
        super().__init__(**kwargs)

    def __str__(self) -> str:
        if isinstance(self, UpstreamHttpError):
            text = "%s -- %s " % (self.api_response.scrape_result['status_code'], self.api_response.scrape_result['reason'])
        else:
            text = "%s -- %s " % (self.response.status_code, self.response.reason)

        if isinstance(self, (ApiHttpClientError, ApiHttpServerError)):
            try:
                text += self.response.content.decode('utf-8')
            except UnicodeError:
                text += str(self.response.content)

        return text
Common base class for all non-exit exceptions.
Inherited Members
- builtins.BaseException
- with_traceback
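For API-level failures, __str__ appends the raw API response body, which is convenient for logging. A sketch assuming the API answers a bad key with its usual JSON error payload; if it does not, the underlying requests.HTTPError propagates instead:

from scrapfly import ScrapflyClient, ScrapeConfig, ApiHttpClientError

try:
    ScrapflyClient(key='invalid-key').scrape(ScrapeConfig(url='https://example.com'))
except ApiHttpClientError as e:
    print(e.http_status_code)  # e.g. 401
    print(str(e))              # status and reason followed by the raw API response body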
class ScrapflyClient:

    HOST = 'https://api.scrapfly.io'
    DEFAULT_CONNECT_TIMEOUT = 30
    DEFAULT_READ_TIMEOUT = 160  # 155 real

    host: str
    key: str
    max_concurrency: int
    verify: bool
    debug: bool
    distributed_mode: bool
    connect_timeout: int
    read_timeout: int
    brotli: bool
    reporter: Reporter
    version: str

    CONCURRENCY_AUTO = 'auto'  # retrieve the allowed concurrency from your account

    def __init__(
        self,
        key: str,
        host: Optional[str] = HOST,
        verify=True,
        debug: bool = False,
        max_concurrency: int = 1,
        connect_timeout: int = DEFAULT_CONNECT_TIMEOUT,
        read_timeout: int = DEFAULT_READ_TIMEOUT,
        reporter: Optional[Callable] = None,
        **kwargs
    ):
        if host[-1] == '/':  # remove trailing '/' if present
            host = host[:-1]

        if 'distributed_mode' in kwargs:
            warnings.warn("distributed_mode is deprecated and will be removed in the next version - "
                          "users should handle the session name themselves based on the concurrency",
                          DeprecationWarning,
                          stacklevel=2
                          )

        if 'brotli' in kwargs:
            warnings.warn("brotli arg is deprecated and will be removed in the next version - "
                          "brotli is disabled by default",
                          DeprecationWarning,
                          stacklevel=2
                          )

        self.version = __version__
        self.host = host
        self.key = key
        self.verify = verify
        self.debug = debug
        self.connect_timeout = connect_timeout
        self.read_timeout = read_timeout
        self.max_concurrency = max_concurrency
        self.body_handler = ResponseBodyHandler(use_brotli=False)
        self.async_executor = ThreadPoolExecutor()
        self.http_session = None

        if not self.verify and not self.HOST.endswith('.local'):
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        if self.debug is True:
            http.client.HTTPConnection.debuglevel = 5

        if reporter is None:
            from .reporter import NoopReporter

            reporter = NoopReporter()

        self.reporter = Reporter(reporter)

    @property
    def ua(self) -> str:
        return 'ScrapflySDK/%s (Python %s, %s, %s)' % (
            self.version,
            platform.python_version(),
            platform.uname().system,
            platform.uname().machine
        )

    @cached_property
    def _http_handler(self):
        return partial(self.http_session.request if self.http_session else requests.request)

    @property
    def http(self):
        return self._http_handler

    def _scrape_request(self, scrape_config: ScrapeConfig):
        return {
            'method': scrape_config.method,
            'url': self.host + '/scrape',
            'data': scrape_config.body,
            'verify': self.verify,
            'timeout': (self.connect_timeout, self.read_timeout),
            'headers': {
                'content-type': scrape_config.headers['content-type'] if scrape_config.method in ['POST', 'PUT', 'PATCH'] else self.body_handler.content_type,
                'accept-encoding': self.body_handler.content_encoding,
                'accept': self.body_handler.accept,
                'user-agent': self.ua
            },
            'params': scrape_config.to_api_params(key=self.key)
        }

    def account(self) -> Union[str, Dict]:
        response = self._http_handler(
            method='GET',
            url=self.host + '/account',
            params={'key': self.key},
            verify=self.verify,
            headers={
                'accept-encoding': self.body_handler.content_encoding,
                'accept': self.body_handler.accept,
                'user-agent': self.ua
            },
        )

        response.raise_for_status()

        if self.body_handler.support(response.headers):
            return self.body_handler(response.content)

        return response.content.decode('utf-8')

    def resilient_scrape(
        self,
        scrape_config: ScrapeConfig,
        retry_on_errors: Set[Exception] = {ScrapflyError},
        retry_on_status_code: Optional[List[int]] = None,
        tries: int = 5,
        delay: int = 20,
    ) -> ScrapeApiResponse:
        assert retry_on_errors is not None, 'Retry on error is None'
        assert isinstance(retry_on_errors, set), 'retry_on_errors is not a set()'

        @backoff.on_exception(backoff.expo, exception=tuple(retry_on_errors), max_tries=tries, max_time=delay)
        def inner() -> ScrapeApiResponse:
            try:
                return self.scrape(scrape_config=scrape_config)
            except (UpstreamHttpClientError, UpstreamHttpServerError) as e:
                if retry_on_status_code is not None and e.api_response:
                    if e.api_response.upstream_status_code in retry_on_status_code:
                        raise e
                    else:
                        return e.api_response

                raise e

        return inner()

    def open(self):
        if self.http_session is None:
            self.http_session = Session()
            self.http_session.verify = self.verify
            self.http_session.timeout = (self.connect_timeout, self.read_timeout)
            self.http_session.params['key'] = self.key
            self.http_session.headers['accept-encoding'] = self.body_handler.content_encoding
            self.http_session.headers['accept'] = self.body_handler.accept
            self.http_session.headers['user-agent'] = self.ua

    def close(self):
        self.http_session.close()
        self.http_session = None

    def __enter__(self) -> 'ScrapflyClient':
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    async def async_scrape(self, scrape_config: ScrapeConfig, loop: Optional[AbstractEventLoop] = None) -> ScrapeApiResponse:
        if loop is None:
            loop = asyncio.get_running_loop()

        return await loop.run_in_executor(self.async_executor, self.scrape, scrape_config)

    async def concurrent_scrape(self, scrape_configs: List[ScrapeConfig], concurrency: Optional[int] = None):
        if concurrency is None:
            concurrency = self.max_concurrency
        elif concurrency == self.CONCURRENCY_AUTO:
            concurrency = self.account()['subscription']['max_concurrency']

        loop = asyncio.get_running_loop()
        processing_tasks = []
        results = []
        processed_tasks = 0
        expected_tasks = len(scrape_configs)

        def scrape_done_callback(task: Task):
            nonlocal processed_tasks

            try:
                if task.cancelled() is True:
                    return

                error = task.exception()

                if error is not None:
                    results.append(error)
                else:
                    results.append(task.result())
            finally:
                processing_tasks.remove(task)
                processed_tasks += 1

        while scrape_configs or results or processing_tasks:
            logger.info("Scrape %d/%d - %d running" % (processed_tasks, expected_tasks, len(processing_tasks)))

            if scrape_configs:
                if len(processing_tasks) < concurrency:
                    # @todo handle backpressure
                    for _ in range(0, concurrency - len(processing_tasks)):
                        try:
                            scrape_config = scrape_configs.pop()
                        except:
                            break

                        scrape_config.raise_on_upstream_error = False
                        task = loop.create_task(self.async_scrape(scrape_config=scrape_config, loop=loop))
                        processing_tasks.append(task)
                        task.add_done_callback(scrape_done_callback)

            for _ in results:
                result = results.pop()
                yield result

            await asyncio.sleep(.5)

        logger.debug("Scrape %d/%d - %d running" % (processed_tasks, expected_tasks, len(processing_tasks)))

    @backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
    def scrape(self, scrape_config: ScrapeConfig) -> ScrapeApiResponse:
        try:
            logger.debug('--> %s Scraping %s' % (scrape_config.method, scrape_config.url))
            request_data = self._scrape_request(scrape_config=scrape_config)
            response = self._http_handler(**request_data)
            scrape_api_response = self._handle_response(response=response, scrape_config=scrape_config)

            self.reporter.report(scrape_api_response=scrape_api_response)

            return scrape_api_response
        except BaseException as e:
            self.reporter.report(error=e)
            raise e

    def _handle_response(self, response: Response, scrape_config: ScrapeConfig) -> ScrapeApiResponse:
        try:
            api_response = self._handle_api_response(
                response=response,
                scrape_config=scrape_config,
                raise_on_upstream_error=scrape_config.raise_on_upstream_error
            )

            if scrape_config.method == 'HEAD':
                logger.debug('<-- [%s %s] %s | %ss' % (
                    api_response.response.status_code,
                    api_response.response.reason,
                    api_response.response.request.url,
                    0
                ))
            else:
                logger.debug('<-- [%s %s] %s | %ss' % (
                    api_response.result['result']['status_code'],
                    api_response.result['result']['reason'],
                    api_response.result['config']['url'],
                    api_response.result['result']['duration'])
                )

                logger.debug('Log url: %s' % api_response.result['result']['log_url'])

            return api_response
        except UpstreamHttpError as e:
            logger.critical(e.api_response.error_message)
            raise
        except ScrapflyScrapeError as e:
            if e.api_response is not None:
                logger.critical(e.api_response.error_message)
            else:
                logger.critical(e.message)
            raise
        except HttpError as e:
            if e.api_response is not None:
                logger.critical(e.api_response.error_message)
            else:
                logger.critical(e.message)
            raise
        except ScrapflyError as e:
            logger.critical('<-- %s | Docs: %s' % (str(e), e.documentation_url))
            raise

    def save_screenshot(self, api_response: ScrapeApiResponse, name: str, path: Optional[str] = None):
        if not api_response.scrape_result['screenshots']:
            raise RuntimeError('Screenshot %s does not exist' % name)

        try:
            api_response.scrape_result['screenshots'][name]
        except KeyError:
            raise RuntimeError('Screenshot %s does not exist' % name)

        screenshot_response = self._http_handler(
            method='GET',
            url=api_response.scrape_result['screenshots'][name]['url'],
            params={'key': self.key},
            verify=self.verify
        )

        screenshot_response.raise_for_status()

        if not name.endswith('.jpg'):
            name += '.jpg'

        api_response.sink(path=path, name=name, content=screenshot_response.content)

    def screenshot(self, url: str, path: Optional[str] = None, name: Optional[str] = None) -> str:
        # for advanced configuration, take screenshots via the scrape method with ScrapeConfig
        api_response = self.scrape(scrape_config=ScrapeConfig(
            url=url,
            render_js=True,
            screenshots={'main': 'fullpage'}
        ))

        name = name or 'main.jpg'

        if not name.endswith('.jpg'):
            name += '.jpg'

        response = self._http_handler(
            method='GET',
            url=api_response.scrape_result['screenshots']['main']['url'],
            params={'key': self.key}
        )

        response.raise_for_status()

        return self.sink(api_response, path=path, name=name, content=response.content)

    def sink(self, api_response: ScrapeApiResponse, content: Optional[Union[str, bytes]] = None, path: Optional[str] = None, name: Optional[str] = None, file: Optional[Union[TextIO, BytesIO]] = None) -> str:
        scrape_result = api_response.result['result']
        scrape_config = api_response.result['config']

        file_content = content or scrape_result['content']
        file_path = None
        file_extension = None

        if name:
            name_parts = name.split('.')
            if len(name_parts) > 1:
                file_extension = name_parts[-1]

        if not file:
            if file_extension is None:
                try:
                    mime_type = scrape_result['response_headers']['content-type']
                except KeyError:
                    mime_type = 'application/octet-stream'

                if ';' in mime_type:
                    mime_type = mime_type.split(';')[0]

                file_extension = '.' + mime_type.split('/')[1]

            if not name:
                name = scrape_config['url'].split('/')[-1]

            if name.find(file_extension) == -1:
                name += file_extension

            file_path = path + '/' + name if path else name

            if file_path == file_extension:
                url = re.sub(r'(https|http)?://', '', api_response.config['url']).replace('/', '-')

                if url[-1] == '-':
                    url = url[:-1]

                url += file_extension

                file_path = url

            file = open(file_path, 'wb')

        if isinstance(file_content, str):
            file_content = BytesIO(file_content.encode('utf-8'))
        elif isinstance(file_content, bytes):
            file_content = BytesIO(file_content)

        file_content.seek(0)
        with file as f:
            shutil.copyfileobj(file_content, f, length=131072)

        logger.info('file %s created' % file_path)
        return file_path

    def _handle_api_response(
        self,
        response: Response,
        scrape_config: ScrapeConfig,
        raise_on_upstream_error: Optional[bool] = True
    ) -> ScrapeApiResponse:
        if scrape_config.method == 'HEAD':
            body = None
        else:
            if self.body_handler.support(headers=response.headers):
                body = self.body_handler(response.content)
            else:
                body = response.content.decode('utf-8')

        api_response: ScrapeApiResponse = ScrapeApiResponse(
            response=response,
            request=response.request,
            api_result=body,
            scrape_config=scrape_config
        )

        api_response.raise_for_result(raise_on_upstream_error=raise_on_upstream_error)

        return api_response
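Used as a context manager, the client opens a persistent requests.Session via open()/close() so consecutive calls reuse the connection and default headers. A minimal sketch with placeholder values:

from scrapfly import ScrapflyClient, ScrapeConfig

with ScrapflyClient(key='YOUR_API_KEY', max_concurrency=2) as client:
    account = client.account()   # subscription and usage details as a dict (or raw string)
    api_response = client.scrape(ScrapeConfig(url='https://httpbin.org/html'))
    print(api_response.scrape_result['status_code'], api_response.duration_ms)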
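resilient_scrape wraps scrape() with the backoff library and retries on the given exception types and, optionally, on specific upstream status codes. A sketch, assuming the client from the previous example and a placeholder URL:

from scrapfly import ScrapeConfig, ScrapflyError

api_response = client.resilient_scrape(
    scrape_config=ScrapeConfig(url='https://example.com'),
    retry_on_errors={ScrapflyError},   # exception types handed to backoff
    retry_on_status_code=[502, 503],   # upstream codes that should be retried
    tries=3,
    delay=30
)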
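concurrent_scrape is an async generator: it schedules async_scrape tasks up to the concurrency limit and yields results (or exceptions) as they complete, so the caller iterates with async for. A sketch with placeholder key and URLs:

import asyncio
from scrapfly import ScrapflyClient, ScrapeConfig

async def main():
    client = ScrapflyClient(key='YOUR_API_KEY', max_concurrency=2)
    configs = [ScrapeConfig(url='https://httpbin.org/html') for _ in range(5)]

    async for result in client.concurrent_scrape(scrape_configs=configs):
        if isinstance(result, Exception):   # failed scrapes are yielded, not raised
            print('failed:', result)
        else:
            print('done:', result.config['url'])

asyncio.run(main())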
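screenshot() is a convenience wrapper that scrapes the page with render_js enabled, downloads the 'main' full-page capture and writes it to disk through sink(); save_screenshot() does the same for screenshots requested explicitly via ScrapeConfig. A sketch with placeholder URL and paths, assuming the client from above:

from scrapfly import ScrapeConfig

# one-liner helper: returns the path of the saved .jpg
saved_path = client.screenshot(url='https://example.com', path='/tmp', name='homepage')

# or request screenshots explicitly and save them from the response
api_response = client.scrape(ScrapeConfig(
    url='https://example.com',
    render_js=True,
    screenshots={'main': 'fullpage'}
))
client.save_screenshot(api_response=api_response, name='main', path='/tmp')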
class ResponseBodyHandler:

    SUPPORTED_COMPRESSION = ['gzip', 'deflate']
    SUPPORTED_CONTENT_TYPES = ['application/msgpack', 'application/json']

    class JSONDateTimeDecoder(JSONDecoder):
        def __init__(self, *args, **kargs):
            JSONDecoder.__init__(self, *args, object_hook=_date_parser, **kargs)

    # brotli underperforms gzip at the same compression level and higher levels are too CPU-intensive,
    # so the trade-off is not worth it for most usage
    def __init__(self, use_brotli: bool = False):
        if use_brotli is True and 'br' not in self.SUPPORTED_COMPRESSION:
            try:
                try:
                    import brotlicffi as brotli
                    self.SUPPORTED_COMPRESSION.insert(0, 'br')
                except ImportError:
                    import brotli
                    self.SUPPORTED_COMPRESSION.insert(0, 'br')
            except ImportError:
                pass

        self.content_encoding = ', '.join(self.SUPPORTED_COMPRESSION)

        try:  # automatically use msgpack if available https://msgpack.org/
            import msgpack
            self.accept = 'application/msgpack;charset=utf-8'
            self.content_type = 'application/msgpack;charset=utf-8'
            self.content_loader = partial(msgpack.loads, object_hook=_date_parser, strict_map_key=False)
        except ImportError:
            self.accept = 'application/json;charset=utf-8'
            self.content_type = 'application/json;charset=utf-8'
            self.content_loader = partial(loads, cls=self.JSONDateTimeDecoder)

    def support(self, headers: Dict) -> bool:
        if 'content-type' not in headers:
            return False

        for content_type in self.SUPPORTED_CONTENT_TYPES:
            if headers['content-type'].find(content_type) != -1:
                return True

        return False

    def __call__(self, content: bytes) -> Union[str, Dict]:
        try:
            return self.content_loader(content)
        except Exception as e:
            raise EncoderError(content=content.decode('utf-8')) from e
Simple JSON (https://json.org) decoder.

Performs the following translations in decoding by default:

+---------------+-------------------+
| JSON          | Python            |
+===============+===================+
| object        | dict              |
+---------------+-------------------+
| array         | list              |
+---------------+-------------------+
| string        | str               |
+---------------+-------------------+
| number (int)  | int               |
+---------------+-------------------+
| number (real) | float             |
+---------------+-------------------+
| true          | True              |
+---------------+-------------------+
| false         | False             |
+---------------+-------------------+
| null          | None              |
+---------------+-------------------+

It also understands NaN, Infinity, and -Infinity as their corresponding float values, which is outside the JSON spec.
object_hook, if specified, will be called with the result of every JSON object decoded and its return value will be used in place of the given dict. This can be used to provide custom deserializations (e.g. to support JSON-RPC class hinting).

object_pairs_hook, if specified, will be called with the result of every JSON object decoded with an ordered list of pairs. The return value of object_pairs_hook will be used instead of the dict. This feature can be used to implement custom decoders. If object_hook is also defined, the object_pairs_hook takes priority.

parse_float, if specified, will be called with the string of every JSON float to be decoded. By default this is equivalent to float(num_str). This can be used to use another datatype or parser for JSON floats (e.g. decimal.Decimal).

parse_int, if specified, will be called with the string of every JSON int to be decoded. By default this is equivalent to int(num_str). This can be used to use another datatype or parser for JSON integers (e.g. float).

parse_constant, if specified, will be called with one of the following strings: -Infinity, Infinity, NaN. This can be used to raise an exception if invalid JSON numbers are encountered.

If strict is false (true is the default), then control characters will be allowed inside strings. Control characters in this context are those with character codes in the 0-31 range, including '\t' (tab), '\n', '\r' and '\0'.
Inherited Members
- json.decoder.JSONDecoder
- decode
- raw_decode
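ResponseBodyHandler negotiates the body encoding with the API: it advertises the supported compressions and content types, prefers msgpack when that package is installed and otherwise falls back to JSON with date parsing, and decodes a raw body when called. A small sketch of that behavior; the payload bytes are purely illustrative:

from scrapfly import ResponseBodyHandler, EncoderError

handler = ResponseBodyHandler(use_brotli=False)

print(handler.content_encoding)  # sent as accept-encoding, e.g. 'gzip, deflate'
print(handler.accept)            # msgpack when available, JSON otherwise

print(handler.support({'content-type': 'application/json;charset=utf-8'}))  # True
print(handler.support({'content-type': 'text/html'}))                       # False

try:
    handler(b'not a valid payload')   # decoding failures are wrapped
except EncoderError as e:
    print('decode failed:', str(e))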
# NOTE: the module header is not part of this excerpt; the imports below are
# reconstructed from usage, and the import location of ScrapeConfigError is an assumption.
import base64
import json
import logging
from typing import Optional, List, Dict, Iterable, Union, Set
from urllib.parse import urlencode

from requests.structures import CaseInsensitiveDict

from .errors import ScrapeConfigError  # assumed to live in the package's errors module


class ScrapeConfig:

    # named proxy pools accepted by the proxy_pool parameter
    PUBLIC_DATACENTER_POOL = 'public_datacenter_pool'
    PUBLIC_RESIDENTIAL_POOL = 'public_residential_pool'

    url: str
    retry: bool = True
    method: str = 'GET'
    country: Optional[str] = None
    render_js: bool = False
    cache: bool = False
    cache_clear: bool = False
    ssl: bool = False
    dns: bool = False
    asp: bool = False
    debug: bool = False
    raise_on_upstream_error: bool = True
    cache_ttl: Optional[int] = None
    proxy_pool: Optional[str] = None
    session: Optional[str] = None
    tags: Optional[List[str]] = None
    correlation_id: Optional[str] = None
    cookies: Optional[CaseInsensitiveDict] = None
    body: Optional[str] = None
    data: Optional[Dict] = None
    headers: Optional[CaseInsensitiveDict] = None
    js: Optional[str] = None
    rendering_wait: Optional[int] = None
    wait_for_selector: Optional[str] = None
    session_sticky_proxy: bool = True
    screenshots: Optional[Dict] = None
    webhook: Optional[str] = None
    timeout: Optional[int] = None  # in milliseconds
    js_scenario: Optional[Dict] = None
    extract: Optional[Dict] = None
    lang: Optional[List[str]] = None
    os: Optional[str] = None
    auto_scroll: Optional[bool] = None

    def __init__(
        self,
        url: str,
        retry: bool = True,
        method: str = 'GET',
        country: Optional[str] = None,
        render_js: bool = False,
        cache: bool = False,
        cache_clear: bool = False,
        ssl: bool = False,
        dns: bool = False,
        asp: bool = False,
        debug: bool = False,
        raise_on_upstream_error: bool = True,
        cache_ttl: Optional[int] = None,
        proxy_pool: Optional[str] = None,
        session: Optional[str] = None,
        tags: Optional[Set[str]] = None,
        correlation_id: Optional[str] = None,
        cookies: Optional[CaseInsensitiveDict] = None,
        body: Optional[str] = None,
        data: Optional[Dict] = None,
        headers: Optional[Union[CaseInsensitiveDict, Dict[str, str]]] = None,
        js: Optional[str] = None,
        rendering_wait: Optional[int] = None,
        wait_for_selector: Optional[str] = None,
        screenshots: Optional[Dict] = None,
        session_sticky_proxy: Optional[bool] = None,
        webhook: Optional[str] = None,
        timeout: Optional[int] = None,  # in milliseconds
        js_scenario: Optional[Dict] = None,
        extract: Optional[Dict] = None,
        os: Optional[str] = None,
        lang: Optional[List[str]] = None,
        auto_scroll: Optional[bool] = None
    ):
        assert(type(url) is str)

        if isinstance(tags, list):
            tags = set(tags)

        cookies = cookies or {}
        headers = headers or {}

        self.cookies = CaseInsensitiveDict(cookies)
        self.headers = CaseInsensitiveDict(headers)
        self.url = url
        self.retry = retry
        self.method = method
        self.country = country
        self.session_sticky_proxy = session_sticky_proxy
        self.render_js = render_js
        self.cache = cache
        self.cache_clear = cache_clear
        self.asp = asp
        self.webhook = webhook
        self.session = session
        self.debug = debug
        self.cache_ttl = cache_ttl
        self.proxy_pool = proxy_pool
        self.tags = tags or set()
        self.correlation_id = correlation_id
        self.wait_for_selector = wait_for_selector
        self.body = body
        self.data = data
        self.js = js
        self.rendering_wait = rendering_wait
        self.raise_on_upstream_error = raise_on_upstream_error
        self.screenshots = screenshots
        self.key = None
        self.dns = dns
        self.ssl = ssl
        self.js_scenario = js_scenario
        self.timeout = timeout
        self.extract = extract
        self.lang = lang
        self.os = os
        self.auto_scroll = auto_scroll

        if cookies:
            _cookies = []

            # fold explicit cookies into the Cookie header
            for name, value in cookies.items():
                _cookies.append(name + '=' + value)

            if 'cookie' in self.headers:
                if self.headers['cookie'][-1] != ';':
                    self.headers['cookie'] += ';'
            else:
                self.headers['cookie'] = ''

            self.headers['cookie'] += '; '.join(_cookies)

        if self.body and self.data:
            raise ScrapeConfigError('You cannot pass both the body and data parameters. You must choose one')

        # derive the request body from `data` according to the content-type header
        if method in ['POST', 'PUT', 'PATCH']:
            if self.body is None and self.data is not None:
                if 'content-type' not in self.headers:
                    self.headers['content-type'] = 'application/x-www-form-urlencoded'
                    self.body = urlencode(data)
                else:
                    if self.headers['content-type'].find('application/json') != -1:
                        self.body = json.dumps(data)
                    elif self.headers['content-type'].find('application/x-www-form-urlencoded') != -1:
                        self.body = urlencode(data)
                    else:
                        raise ScrapeConfigError('Content-Type "%s" is not supported, use the body parameter to pass a pre-encoded body matching your content type' % self.headers['content-type'])
            elif self.body is None and self.data is None:
                self.headers['content-type'] = 'text/plain'

    def _bool_to_http(self, _bool: bool) -> str:
        return 'true' if _bool is True else 'false'

    def to_api_params(self, key: str) -> Dict:
        # flatten the config into the key/value query parameters expected by the scrape API
        params = {
            'key': self.key if self.key is not None else key,
            'url': self.url
        }

        if self.country is not None:
            params['country'] = self.country

        for name, value in self.headers.items():
            params['headers[%s]' % name] = value

        if self.webhook is not None:
            params['webhook_name'] = self.webhook

        if self.timeout is not None:
            params['timeout'] = self.timeout

        if self.extract is not None:
            params['extract'] = base64.urlsafe_b64encode(json.dumps(self.extract).encode('utf-8')).decode('utf-8')

        if self.render_js is True:
            params['render_js'] = self._bool_to_http(self.render_js)

            if self.wait_for_selector is not None:
                params['wait_for_selector'] = self.wait_for_selector

            if self.js:
                params['js'] = base64.urlsafe_b64encode(self.js.encode('utf-8')).decode('utf-8')

            if self.js_scenario:
                params['js_scenario'] = base64.urlsafe_b64encode(json.dumps(self.js_scenario).encode('utf-8')).decode('utf-8')

            if self.rendering_wait:
                params['rendering_wait'] = self.rendering_wait

            if self.screenshots is not None:
                for name, element in self.screenshots.items():
                    params['screenshots[%s]' % name] = element

            if self.auto_scroll is True:
                params['auto_scroll'] = self._bool_to_http(self.auto_scroll)
        else:
            # browser-only options are dropped with a warning when render_js is disabled
            if self.wait_for_selector is not None:
                logging.warning('Param "wait_for_selector" is ignored. Works only if render_js is enabled')

            if self.screenshots:
                logging.warning('Param "screenshots" is ignored. Works only if render_js is enabled')

            if self.js_scenario:
                logging.warning('Param "js_scenario" is ignored. Works only if render_js is enabled')

            if self.js:
                logging.warning('Param "js" is ignored. Works only if render_js is enabled')

            if self.rendering_wait:
                logging.warning('Param "rendering_wait" is ignored. Works only if render_js is enabled')

        if self.asp is True:
            params['asp'] = self._bool_to_http(self.asp)

        if self.retry is False:
            params['retry'] = self._bool_to_http(self.retry)

        if self.cache is True:
            params['cache'] = self._bool_to_http(self.cache)

            if self.cache_clear is True:
                params['cache_clear'] = self._bool_to_http(self.cache_clear)

            if self.cache_ttl is not None:
                params['cache_ttl'] = self.cache_ttl
        else:
            if self.cache_clear is True:
                logging.warning('Param "cache_clear" is ignored. Works only if cache is enabled')

            if self.cache_ttl is not None:
                logging.warning('Param "cache_ttl" is ignored. Works only if cache is enabled')

        if self.dns is True:
            params['dns'] = self._bool_to_http(self.dns)

        if self.ssl is True:
            params['ssl'] = self._bool_to_http(self.ssl)

        if self.tags:
            params['tags'] = ','.join(self.tags)

        if self.correlation_id:
            params['correlation_id'] = self.correlation_id

        if self.session:
            params['session'] = self.session

            if self.session_sticky_proxy is True:  # false by default
                params['session_sticky_proxy'] = self._bool_to_http(self.session_sticky_proxy)
        else:
            if self.session_sticky_proxy:
                logging.warning('Param "session_sticky_proxy" is ignored. Works only if session is enabled')

        if self.debug is True:
            params['debug'] = self._bool_to_http(self.debug)

        if self.proxy_pool is not None:
            params['proxy_pool'] = self.proxy_pool

        if self.lang is not None:
            params['lang'] = ','.join(self.lang)

        if self.os is not None:
            params['os'] = self.os

        return params

    @staticmethod
    def from_exported_config(config: str) -> 'ScrapeConfig':
        # rebuild a ScrapeConfig from a base64-encoded msgpack blob (exported config)
        try:
            from msgpack import loads as msgpack_loads
        except ImportError:
            print('You must install the msgpack package - run: pip install "scrapfly-sdk[seepdup]" or pip install msgpack')
            raise

        data = msgpack_loads(base64.b64decode(config))

        headers = {}

        # join iterable header values (e.g. lists) back into a single string
        for name, value in data['headers'].items():
            if isinstance(value, Iterable):
                headers[name] = '; '.join(value)
            else:
                headers[name] = value

        return ScrapeConfig(
            url=data['url'],
            retry=data['retry'],
            headers=headers,
            session=data['session'],
            session_sticky_proxy=data['session_sticky_proxy'],
            cache=data['cache'],
            cache_ttl=data['cache_ttl'],
            cache_clear=data['cache_clear'],
            render_js=data['render_js'],
            method=data['method'],
            asp=data['asp'],
            body=data['body'],
            ssl=data['ssl'],
            dns=data['dns'],
            country=data['country'],
            debug=data['debug'],
            correlation_id=data['correlation_id'],
            tags=data['tags'],
            js=data['js'],
            rendering_wait=data['rendering_wait'],
            screenshots=data['screenshots'] or {},
            proxy_pool=data['proxy_pool'],
            auto_scroll=data['auto_scroll']
        )
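For orientation, here is a minimal usage sketch (not taken from the SDK source; the URL, header, cookie, and screenshot values are illustrative assumptions). It builds a ScrapeConfig with browser rendering enabled and flattens it with to_api_params(), which is the representation the client sends to the API:

# Hypothetical usage sketch - illustrative values only, not part of the SDK source.
from scrapfly import ScrapeConfig

config = ScrapeConfig(
    url='https://example.com/products',           # assumed example URL
    render_js=True,                                # unlocks js/screenshots/rendering_wait params
    rendering_wait=2000,                           # wait 2s after rendering
    screenshots={'main': 'fullpage'},              # sent as screenshots[main]=fullpage
    headers={'Referer': 'https://example.com'},    # sent as headers[Referer]=...
    cookies={'currency': 'USD'},                   # folded into headers[cookie]
    cache=True,
    cache_ttl=3600,
)

params = config.to_api_params(key='YOUR_API_KEY')  # placeholder key
# params now holds flat API parameters, e.g.:
# {'key': 'YOUR_API_KEY', 'url': 'https://example.com/products', 'render_js': 'true',
#  'rendering_wait': 2000, 'screenshots[main]': 'fullpage', 'cache': 'true', 'cache_ttl': 3600,
#  'headers[Referer]': 'https://example.com', 'headers[cookie]': 'currency=USD'}
print(params)

Had render_js been left disabled, the screenshots and rendering_wait values would be dropped with a logged warning instead of being sent.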
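from_exported_config() expects a base64-encoded msgpack payload carrying exactly the keys it reads above. A hedged round-trip sketch, assuming the optional msgpack dependency is installed and using made-up values:

# Hypothetical round-trip sketch - assumes `msgpack` is installed; all values are illustrative.
import base64
import msgpack

from scrapfly import ScrapeConfig

exported = {
    'url': 'https://example.com', 'retry': True, 'method': 'GET',
    'headers': {'accept-language': ['en-US', 'en']},  # list values are re-joined with '; '
    'session': None, 'session_sticky_proxy': None,
    'cache': False, 'cache_ttl': None, 'cache_clear': False,
    'render_js': False, 'asp': False, 'body': None,
    'ssl': False, 'dns': False, 'country': None, 'debug': False,
    'correlation_id': None, 'tags': None, 'js': None, 'rendering_wait': None,
    'screenshots': None, 'proxy_pool': None, 'auto_scroll': None,
}

blob = base64.b64encode(msgpack.dumps(exported)).decode('utf-8')
config = ScrapeConfig.from_exported_config(blob)
print(config.url, dict(config.headers))  # https://example.com {'accept-language': 'en-US; en'}

If msgpack is not installed, the method prints an install hint and re-raises the ImportError rather than failing silently.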