Module scrapfly.extraction_config
Expand source code
import json
import warnings
from enum import Enum
from typing import Optional, Dict, Union
from urllib.parse import quote_plus
from base64 import urlsafe_b64encode
from .api_config import BaseApiConfig
class CompressionFormat(Enum):
"""
Document compression format.
Attributes:
GZIP: gzip format.
ZSTD: zstd format.
DEFLATE: deflate format.
"""
GZIP = "gzip"
ZSTD = "zstd"
DEFLATE = "deflate"
class ExtractionConfigError(Exception):
pass
class ExtractionConfig(BaseApiConfig):
body: Union[str, bytes]
content_type: str
url: Optional[str] = None
charset: Optional[str] = None
extraction_template: Optional[str] = None # a saved template name
extraction_ephemeral_template: Optional[Dict] # ephemeraly declared json template
extraction_prompt: Optional[str] = None
extraction_model: Optional[str] = None
is_document_compressed: Optional[bool] = None
document_compression_format: Optional[CompressionFormat] = None
webhook: Optional[str] = None
raise_on_upstream_error: bool = True
# deprecated options
template: Optional[str] = None
ephemeral_template: Optional[Dict] = None
def __init__(
self,
body: Union[str, bytes],
content_type: str,
url: Optional[str] = None,
charset: Optional[str] = None,
extraction_template: Optional[str] = None, # a saved template name
extraction_ephemeral_template: Optional[Dict] = None, # ephemeraly declared json template
extraction_prompt: Optional[str] = None,
extraction_model: Optional[str] = None,
is_document_compressed: Optional[bool] = None,
document_compression_format: Optional[CompressionFormat] = None,
webhook: Optional[str] = None,
raise_on_upstream_error: bool = True,
# deprecated options
template: Optional[str] = None,
ephemeral_template: Optional[Dict] = None
):
if template:
print("WARNGING")
warnings.warn(
"Deprecation warning: 'template' is deprecated. Use 'extraction_template' instead."
)
extraction_template = template
if ephemeral_template:
warnings.warn(
"Deprecation warning: 'ephemeral_template' is deprecated. Use 'extraction_ephemeral_template' instead."
)
extraction_ephemeral_template = ephemeral_template
self.key = None
self.body = body
self.content_type = content_type
self.url = url
self.charset = charset
self.extraction_template = extraction_template
self.extraction_ephemeral_template = extraction_ephemeral_template
self.extraction_prompt = extraction_prompt
self.extraction_model = extraction_model
self.is_document_compressed = is_document_compressed
self.document_compression_format = CompressionFormat(document_compression_format) if document_compression_format else None
self.webhook = webhook
self.raise_on_upstream_error = raise_on_upstream_error
if isinstance(body, bytes) or document_compression_format:
compression_format = detect_compression_format(body)
if compression_format is not None:
self.is_document_compressed = True
if self.document_compression_format and compression_format != self.document_compression_format:
raise ExtractionConfigError(
f'The detected compression format `{compression_format}` does not match declared format `{self.document_compression_format}`. '
f'You must pass the compression format or disable compression.'
)
self.document_compression_format = compression_format
else:
self.is_document_compressed = False
if self.is_document_compressed is False:
compression_foramt = CompressionFormat(self.document_compression_format) if self.document_compression_format else None
if isinstance(self.body, str) and compression_foramt:
self.body = self.body.encode('utf-8')
if compression_foramt == CompressionFormat.GZIP:
import gzip
self.body = gzip.compress(self.body)
elif compression_foramt == CompressionFormat.ZSTD:
try:
import zstandard as zstd
except ImportError:
raise ExtractionConfigError(
f'zstandard is not installed. You must run pip install zstandard'
f' to auto compress into zstd or use compression formats.'
)
self.body = zstd.compress(self.body)
elif compression_foramt == CompressionFormat.DEFLATE:
import zlib
compressor = zlib.compressobj(wbits=-zlib.MAX_WBITS) # raw deflate compression
self.body = compressor.compress(self.body) + compressor.flush()
def to_api_params(self, key: str) -> Dict:
params = {
'key': self.key or key,
'content_type': self.content_type
}
if self.url:
params['url'] = self.url
if self.charset:
params['charset'] = self.charset
if self.extraction_template and self.extraction_ephemeral_template:
raise ExtractionConfigError('You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose')
if self.extraction_template:
params['extraction_template'] = self.extraction_template
if self.extraction_ephemeral_template:
self.extraction_ephemeral_template = json.dumps(self.extraction_ephemeral_template)
params['extraction_template'] = 'ephemeral:' + urlsafe_b64encode(self.extraction_ephemeral_template.encode('utf-8')).decode('utf-8')
if self.extraction_prompt:
params['extraction_prompt'] = quote_plus(self.extraction_prompt)
if self.extraction_model:
params['extraction_model'] = self.extraction_model
if self.webhook:
params['webhook_name'] = self.webhook
return params
def to_dict(self) -> Dict:
"""
Export the ExtractionConfig instance to a plain dictionary.
"""
if self.is_document_compressed is True:
compression_foramt = CompressionFormat(self.document_compression_format) if self.document_compression_format else None
if compression_foramt == CompressionFormat.GZIP:
import gzip
self.body = gzip.decompress(self.body)
elif compression_foramt == CompressionFormat.ZSTD:
import zstandard as zstd
self.body = zstd.decompress(self.body)
elif compression_foramt == CompressionFormat.DEFLATE:
import zlib
decompressor = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
self.body = decompressor.decompress(self.body) + decompressor.flush()
if isinstance(self.body, bytes):
self.body = self.body.decode('utf-8')
self.is_document_compressed = False
return {
'body': self.body,
'content_type': self.content_type,
'url': self.url,
'charset': self.charset,
'extraction_template': self.extraction_template,
'extraction_ephemeral_template': self.extraction_ephemeral_template,
'extraction_prompt': self.extraction_prompt,
'extraction_model': self.extraction_model,
'is_document_compressed': self.is_document_compressed,
'document_compression_format': CompressionFormat(self.document_compression_format).value if self.document_compression_format else None,
'webhook': self.webhook,
'raise_on_upstream_error': self.raise_on_upstream_error,
}
@staticmethod
def from_dict(extraction_config_dict: Dict) -> 'ExtractionConfig':
"""Create an ExtractionConfig instance from a dictionary."""
body = extraction_config_dict.get('body', None)
content_type = extraction_config_dict.get('content_type', None)
url = extraction_config_dict.get('url', None)
charset = extraction_config_dict.get('charset', None)
extraction_template = extraction_config_dict.get('extraction_template', None)
extraction_ephemeral_template = extraction_config_dict.get('extraction_ephemeral_template', None)
extraction_prompt = extraction_config_dict.get('extraction_prompt', None)
extraction_model = extraction_config_dict.get('extraction_model', None)
is_document_compressed = extraction_config_dict.get('is_document_compressed', None)
document_compression_format = extraction_config_dict.get('document_compression_format', None)
document_compression_format = CompressionFormat(document_compression_format) if document_compression_format else None
webhook = extraction_config_dict.get('webhook', None)
raise_on_upstream_error = extraction_config_dict.get('raise_on_upstream_error', True)
return ExtractionConfig(
body=body,
content_type=content_type,
url=url,
charset=charset,
extraction_template=extraction_template,
extraction_ephemeral_template=extraction_ephemeral_template,
extraction_prompt=extraction_prompt,
extraction_model=extraction_model,
is_document_compressed=is_document_compressed,
document_compression_format=document_compression_format,
webhook=webhook,
raise_on_upstream_error=raise_on_upstream_error
)
def detect_compression_format(data) -> Optional[CompressionFormat]:
"""
Detects the compression type of the given data.
Args:
data: The compressed data as bytes.
Returns:
The name of the compression type ("gzip", "zstd", "deflate", "unknown").
"""
if len(data) < 2:
return None
# gzip
if data[0] == 0x1f and data[1] == 0x8b:
return CompressionFormat.GZIP
# zstd
zstd_magic_numbers = [
b'\x1e\xb5\x2f\xfd', # v0.1
b'\x22\xb5\x2f\xfd', # v0.2
b'\x23\xb5\x2f\xfd', # v0.3
b'\x24\xb5\x2f\xfd', # v0.4
b'\x25\xb5\x2f\xfd', # v0.5
b'\x26\xb5\x2f\xfd', # v0.6
b'\x27\xb5\x2f\xfd', # v0.7
b'\x28\xb5\x2f\xfd', # v0.8
]
for magic in zstd_magic_numbers:
if data[:len(magic)] == magic:
return CompressionFormat.ZSTD
# deflate
if data[0] == 0x78:
if data[1] in (0x01, 0x5E, 0x9C, 0xDA):
return CompressionFormat.DEFLATE
return None
Functions
def detect_compression_format(data) ‑> Optional[CompressionFormat]
-
Detects the compression type of the given data.
Args
data
- The compressed data as bytes.
Returns
The name of the compression type ("gzip", "zstd", "deflate", "unknown").
Expand source code
def detect_compression_format(data) -> Optional[CompressionFormat]: """ Detects the compression type of the given data. Args: data: The compressed data as bytes. Returns: The name of the compression type ("gzip", "zstd", "deflate", "unknown"). """ if len(data) < 2: return None # gzip if data[0] == 0x1f and data[1] == 0x8b: return CompressionFormat.GZIP # zstd zstd_magic_numbers = [ b'\x1e\xb5\x2f\xfd', # v0.1 b'\x22\xb5\x2f\xfd', # v0.2 b'\x23\xb5\x2f\xfd', # v0.3 b'\x24\xb5\x2f\xfd', # v0.4 b'\x25\xb5\x2f\xfd', # v0.5 b'\x26\xb5\x2f\xfd', # v0.6 b'\x27\xb5\x2f\xfd', # v0.7 b'\x28\xb5\x2f\xfd', # v0.8 ] for magic in zstd_magic_numbers: if data[:len(magic)] == magic: return CompressionFormat.ZSTD # deflate if data[0] == 0x78: if data[1] in (0x01, 0x5E, 0x9C, 0xDA): return CompressionFormat.DEFLATE return None
Classes
class CompressionFormat (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
Document compression format.
Attributes
GZIP
- gzip format.
ZSTD
- zstd format.
DEFLATE
- deflate format.
Expand source code
class CompressionFormat(Enum): """ Document compression format. Attributes: GZIP: gzip format. ZSTD: zstd format. DEFLATE: deflate format. """ GZIP = "gzip" ZSTD = "zstd" DEFLATE = "deflate"
Ancestors
- enum.Enum
Class variables
var DEFLATE
var GZIP
var ZSTD
class ExtractionConfig (body: Union[str, bytes], content_type: str, url: Optional[str] = None, charset: Optional[str] = None, extraction_template: Optional[str] = None, extraction_ephemeral_template: Optional[Dict] = None, extraction_prompt: Optional[str] = None, extraction_model: Optional[str] = None, is_document_compressed: Optional[bool] = None, document_compression_format: Optional[CompressionFormat] = None, webhook: Optional[str] = None, raise_on_upstream_error: bool = True, template: Optional[str] = None, ephemeral_template: Optional[Dict] = None)
-
Expand source code
class ExtractionConfig(BaseApiConfig): body: Union[str, bytes] content_type: str url: Optional[str] = None charset: Optional[str] = None extraction_template: Optional[str] = None # a saved template name extraction_ephemeral_template: Optional[Dict] # ephemeraly declared json template extraction_prompt: Optional[str] = None extraction_model: Optional[str] = None is_document_compressed: Optional[bool] = None document_compression_format: Optional[CompressionFormat] = None webhook: Optional[str] = None raise_on_upstream_error: bool = True # deprecated options template: Optional[str] = None ephemeral_template: Optional[Dict] = None def __init__( self, body: Union[str, bytes], content_type: str, url: Optional[str] = None, charset: Optional[str] = None, extraction_template: Optional[str] = None, # a saved template name extraction_ephemeral_template: Optional[Dict] = None, # ephemeraly declared json template extraction_prompt: Optional[str] = None, extraction_model: Optional[str] = None, is_document_compressed: Optional[bool] = None, document_compression_format: Optional[CompressionFormat] = None, webhook: Optional[str] = None, raise_on_upstream_error: bool = True, # deprecated options template: Optional[str] = None, ephemeral_template: Optional[Dict] = None ): if template: print("WARNGING") warnings.warn( "Deprecation warning: 'template' is deprecated. Use 'extraction_template' instead." ) extraction_template = template if ephemeral_template: warnings.warn( "Deprecation warning: 'ephemeral_template' is deprecated. Use 'extraction_ephemeral_template' instead." ) extraction_ephemeral_template = ephemeral_template self.key = None self.body = body self.content_type = content_type self.url = url self.charset = charset self.extraction_template = extraction_template self.extraction_ephemeral_template = extraction_ephemeral_template self.extraction_prompt = extraction_prompt self.extraction_model = extraction_model self.is_document_compressed = is_document_compressed self.document_compression_format = CompressionFormat(document_compression_format) if document_compression_format else None self.webhook = webhook self.raise_on_upstream_error = raise_on_upstream_error if isinstance(body, bytes) or document_compression_format: compression_format = detect_compression_format(body) if compression_format is not None: self.is_document_compressed = True if self.document_compression_format and compression_format != self.document_compression_format: raise ExtractionConfigError( f'The detected compression format `{compression_format}` does not match declared format `{self.document_compression_format}`. ' f'You must pass the compression format or disable compression.' ) self.document_compression_format = compression_format else: self.is_document_compressed = False if self.is_document_compressed is False: compression_foramt = CompressionFormat(self.document_compression_format) if self.document_compression_format else None if isinstance(self.body, str) and compression_foramt: self.body = self.body.encode('utf-8') if compression_foramt == CompressionFormat.GZIP: import gzip self.body = gzip.compress(self.body) elif compression_foramt == CompressionFormat.ZSTD: try: import zstandard as zstd except ImportError: raise ExtractionConfigError( f'zstandard is not installed. You must run pip install zstandard' f' to auto compress into zstd or use compression formats.' ) self.body = zstd.compress(self.body) elif compression_foramt == CompressionFormat.DEFLATE: import zlib compressor = zlib.compressobj(wbits=-zlib.MAX_WBITS) # raw deflate compression self.body = compressor.compress(self.body) + compressor.flush() def to_api_params(self, key: str) -> Dict: params = { 'key': self.key or key, 'content_type': self.content_type } if self.url: params['url'] = self.url if self.charset: params['charset'] = self.charset if self.extraction_template and self.extraction_ephemeral_template: raise ExtractionConfigError('You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose') if self.extraction_template: params['extraction_template'] = self.extraction_template if self.extraction_ephemeral_template: self.extraction_ephemeral_template = json.dumps(self.extraction_ephemeral_template) params['extraction_template'] = 'ephemeral:' + urlsafe_b64encode(self.extraction_ephemeral_template.encode('utf-8')).decode('utf-8') if self.extraction_prompt: params['extraction_prompt'] = quote_plus(self.extraction_prompt) if self.extraction_model: params['extraction_model'] = self.extraction_model if self.webhook: params['webhook_name'] = self.webhook return params def to_dict(self) -> Dict: """ Export the ExtractionConfig instance to a plain dictionary. """ if self.is_document_compressed is True: compression_foramt = CompressionFormat(self.document_compression_format) if self.document_compression_format else None if compression_foramt == CompressionFormat.GZIP: import gzip self.body = gzip.decompress(self.body) elif compression_foramt == CompressionFormat.ZSTD: import zstandard as zstd self.body = zstd.decompress(self.body) elif compression_foramt == CompressionFormat.DEFLATE: import zlib decompressor = zlib.decompressobj(wbits=-zlib.MAX_WBITS) self.body = decompressor.decompress(self.body) + decompressor.flush() if isinstance(self.body, bytes): self.body = self.body.decode('utf-8') self.is_document_compressed = False return { 'body': self.body, 'content_type': self.content_type, 'url': self.url, 'charset': self.charset, 'extraction_template': self.extraction_template, 'extraction_ephemeral_template': self.extraction_ephemeral_template, 'extraction_prompt': self.extraction_prompt, 'extraction_model': self.extraction_model, 'is_document_compressed': self.is_document_compressed, 'document_compression_format': CompressionFormat(self.document_compression_format).value if self.document_compression_format else None, 'webhook': self.webhook, 'raise_on_upstream_error': self.raise_on_upstream_error, } @staticmethod def from_dict(extraction_config_dict: Dict) -> 'ExtractionConfig': """Create an ExtractionConfig instance from a dictionary.""" body = extraction_config_dict.get('body', None) content_type = extraction_config_dict.get('content_type', None) url = extraction_config_dict.get('url', None) charset = extraction_config_dict.get('charset', None) extraction_template = extraction_config_dict.get('extraction_template', None) extraction_ephemeral_template = extraction_config_dict.get('extraction_ephemeral_template', None) extraction_prompt = extraction_config_dict.get('extraction_prompt', None) extraction_model = extraction_config_dict.get('extraction_model', None) is_document_compressed = extraction_config_dict.get('is_document_compressed', None) document_compression_format = extraction_config_dict.get('document_compression_format', None) document_compression_format = CompressionFormat(document_compression_format) if document_compression_format else None webhook = extraction_config_dict.get('webhook', None) raise_on_upstream_error = extraction_config_dict.get('raise_on_upstream_error', True) return ExtractionConfig( body=body, content_type=content_type, url=url, charset=charset, extraction_template=extraction_template, extraction_ephemeral_template=extraction_ephemeral_template, extraction_prompt=extraction_prompt, extraction_model=extraction_model, is_document_compressed=is_document_compressed, document_compression_format=document_compression_format, webhook=webhook, raise_on_upstream_error=raise_on_upstream_error )
Ancestors
Class variables
var body : Union[str, bytes]
var charset : Optional[str]
var content_type : str
var document_compression_format : Optional[CompressionFormat]
var ephemeral_template : Optional[Dict]
var extraction_ephemeral_template : Optional[Dict]
var extraction_model : Optional[str]
var extraction_prompt : Optional[str]
var extraction_template : Optional[str]
var is_document_compressed : Optional[bool]
var raise_on_upstream_error : bool
var template : Optional[str]
var url : Optional[str]
var webhook : Optional[str]
Static methods
def from_dict(extraction_config_dict: Dict) ‑> ExtractionConfig
-
Create an ExtractionConfig instance from a dictionary.
Expand source code
@staticmethod def from_dict(extraction_config_dict: Dict) -> 'ExtractionConfig': """Create an ExtractionConfig instance from a dictionary.""" body = extraction_config_dict.get('body', None) content_type = extraction_config_dict.get('content_type', None) url = extraction_config_dict.get('url', None) charset = extraction_config_dict.get('charset', None) extraction_template = extraction_config_dict.get('extraction_template', None) extraction_ephemeral_template = extraction_config_dict.get('extraction_ephemeral_template', None) extraction_prompt = extraction_config_dict.get('extraction_prompt', None) extraction_model = extraction_config_dict.get('extraction_model', None) is_document_compressed = extraction_config_dict.get('is_document_compressed', None) document_compression_format = extraction_config_dict.get('document_compression_format', None) document_compression_format = CompressionFormat(document_compression_format) if document_compression_format else None webhook = extraction_config_dict.get('webhook', None) raise_on_upstream_error = extraction_config_dict.get('raise_on_upstream_error', True) return ExtractionConfig( body=body, content_type=content_type, url=url, charset=charset, extraction_template=extraction_template, extraction_ephemeral_template=extraction_ephemeral_template, extraction_prompt=extraction_prompt, extraction_model=extraction_model, is_document_compressed=is_document_compressed, document_compression_format=document_compression_format, webhook=webhook, raise_on_upstream_error=raise_on_upstream_error )
Methods
def to_api_params(self, key: str) ‑> Dict
-
Expand source code
def to_api_params(self, key: str) -> Dict: params = { 'key': self.key or key, 'content_type': self.content_type } if self.url: params['url'] = self.url if self.charset: params['charset'] = self.charset if self.extraction_template and self.extraction_ephemeral_template: raise ExtractionConfigError('You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose') if self.extraction_template: params['extraction_template'] = self.extraction_template if self.extraction_ephemeral_template: self.extraction_ephemeral_template = json.dumps(self.extraction_ephemeral_template) params['extraction_template'] = 'ephemeral:' + urlsafe_b64encode(self.extraction_ephemeral_template.encode('utf-8')).decode('utf-8') if self.extraction_prompt: params['extraction_prompt'] = quote_plus(self.extraction_prompt) if self.extraction_model: params['extraction_model'] = self.extraction_model if self.webhook: params['webhook_name'] = self.webhook return params
def to_dict(self) ‑> Dict
-
Export the ExtractionConfig instance to a plain dictionary.
Expand source code
def to_dict(self) -> Dict: """ Export the ExtractionConfig instance to a plain dictionary. """ if self.is_document_compressed is True: compression_foramt = CompressionFormat(self.document_compression_format) if self.document_compression_format else None if compression_foramt == CompressionFormat.GZIP: import gzip self.body = gzip.decompress(self.body) elif compression_foramt == CompressionFormat.ZSTD: import zstandard as zstd self.body = zstd.decompress(self.body) elif compression_foramt == CompressionFormat.DEFLATE: import zlib decompressor = zlib.decompressobj(wbits=-zlib.MAX_WBITS) self.body = decompressor.decompress(self.body) + decompressor.flush() if isinstance(self.body, bytes): self.body = self.body.decode('utf-8') self.is_document_compressed = False return { 'body': self.body, 'content_type': self.content_type, 'url': self.url, 'charset': self.charset, 'extraction_template': self.extraction_template, 'extraction_ephemeral_template': self.extraction_ephemeral_template, 'extraction_prompt': self.extraction_prompt, 'extraction_model': self.extraction_model, 'is_document_compressed': self.is_document_compressed, 'document_compression_format': CompressionFormat(self.document_compression_format).value if self.document_compression_format else None, 'webhook': self.webhook, 'raise_on_upstream_error': self.raise_on_upstream_error, }
class ExtractionConfigError (*args, **kwargs)
-
Common base class for all non-exit exceptions.
Expand source code
class ExtractionConfigError(Exception): pass
Ancestors
- builtins.Exception
- builtins.BaseException