Module scrapfly.crawler.crawler_response

Crawler API Response Classes

This module provides response wrapper classes for the Crawler API.

Classes

class CrawlerArtifactResponse (artifact_data: bytes, artifact_type: str = 'warc')
Expand source code
class CrawlerArtifactResponse:
    """
    Response from downloading crawler artifacts

    Returned by ScrapflyClient.get_crawl_artifact() method.

    Provides high-level access to crawl results with automatic WARC/HAR parsing.
    Users don't need to understand WARC or HAR format to use this class.

    Example:
        ```python
        # Get WARC artifact (default)
        artifact = client.get_crawl_artifact(uuid)

        # Get HAR artifact
        artifact = client.get_crawl_artifact(uuid, artifact_type='har')

        # Easy mode: get all pages as dicts
        pages = artifact.get_pages()
        for page in pages:
            print(f"{page['url']}: {page['status_code']}")
            html = page['content'].decode('utf-8')

        # Memory-efficient: iterate one page at a time
        for record in artifact.iter_responses():
            print(f"{record.url}: {record.status_code}")
            process(record.content)

        # Save to file
        artifact.save('crawl_results.warc.gz')
        ```
    """

    def __init__(self, artifact_data: bytes, artifact_type: str = 'warc'):
        """
        Initialize from artifact data

        Args:
            artifact_data: Raw artifact file bytes
            artifact_type: Type of artifact ('warc' or 'har')
        """
        self._artifact_data = artifact_data
        self._artifact_type = artifact_type
        self._warc_parser: Optional[WarcParser] = None
        self._har_parser: Optional[HarArchive] = None

    @property
    def artifact_type(self) -> str:
        """Get artifact type ('warc' or 'har')"""
        return self._artifact_type

    @property
    def artifact_data(self) -> bytes:
        """Get raw artifact data (for advanced users)"""
        return self._artifact_data

    @property
    def warc_data(self) -> bytes:
        """Get raw WARC data (deprecated, use artifact_data)"""
        return self._artifact_data

    @property
    def parser(self) -> Union[WarcParser, HarArchive]:
        """Get artifact parser instance (lazy-loaded)"""
        if self._artifact_type == 'har':
            if self._har_parser is None:
                self._har_parser = HarArchive(self._artifact_data)
            return self._har_parser
        else:
            if self._warc_parser is None:
                self._warc_parser = parse_warc(self._artifact_data)
            return self._warc_parser

    def iter_records(self) -> Iterator[Union[WarcRecord, HarEntry]]:
        """
        Iterate through all records

        For WARC: iterates through all WARC records
        For HAR: iterates through all HAR entries

        Yields:
            WarcRecord or HarEntry: Each record in the artifact
        """
        if self._artifact_type == 'har':
            return self.parser.iter_entries()
        else:
            return self.parser.iter_records()

    def iter_responses(self) -> Iterator[Union[WarcRecord, HarEntry]]:
        """
        Iterate through HTTP response records only

        This is more memory-efficient than get_pages() for large crawls.

        For WARC: iterates through response records
        For HAR: iterates through all entries (HAR only contains responses)

        Yields:
            WarcRecord or HarEntry: HTTP response records with url, status_code, headers, content
        """
        if self._artifact_type == 'har':
            return self.parser.iter_entries()
        else:
            return self.parser.iter_responses()

    def get_pages(self) -> List[Dict]:
        """
        Get all crawled pages as simple dictionaries

        This is the easiest way to access crawl results.
        Works with both WARC and HAR formats.

        Returns:
            List of dicts with keys: url, status_code, headers, content

        Example:
            ```python
            pages = artifact.get_pages()
            for page in pages:
                print(f"{page['url']}: {len(page['content'])} bytes")
                html = page['content'].decode('utf-8')
            ```
        """
        if self._artifact_type == 'har':
            # Convert HAR entries to page dicts
            pages = []
            for entry in self.parser.iter_entries():
                pages.append({
                    'url': entry.url,
                    'status_code': entry.status_code,
                    'headers': entry.response_headers,
                    'content': entry.content
                })
            return pages
        else:
            return self.parser.get_pages()

    @property
    def total_pages(self) -> int:
        """Get total number of pages in the artifact"""
        return len(self.get_pages())

    def save(self, filepath: str):
        """
        Save the raw artifact data to a file (WARC or HAR, matching ``artifact_type``)

        Args:
            filepath: Path to save the artifact file

        Example:
            ```python
            artifact.save('crawl_results.warc.gz')
            ```
        """
        with open(filepath, 'wb') as f:
            f.write(self.warc_data)

    def __repr__(self):
        return f"CrawlerArtifactResponse(size={len(self.warc_data)} bytes)"

Response from downloading crawler artifacts

Returned by ScrapflyClient.get_crawl_artifact() method.

Provides high-level access to crawl results with automatic WARC/HAR parsing. Users don't need to understand WARC or HAR format to use this class.

Example

# Get WARC artifact (default)
artifact = client.get_crawl_artifact(uuid)

# Get HAR artifact
artifact = client.get_crawl_artifact(uuid, artifact_type='har')

# Easy mode: get all pages as dicts
pages = artifact.get_pages()
for page in pages:
    print(f"{page['url']}: {page['status_code']}")
    html = page['content'].decode('utf-8')

# Memory-efficient: iterate one page at a time
for record in artifact.iter_responses():
    print(f"{record.url}: {record.status_code}")
    process(record.content)

# Save to file
artifact.save('crawl_results.warc.gz')

Initialize from artifact data

Args

artifact_data
Raw artifact file bytes
artifact_type
Type of artifact ('warc' or 'har')
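
For illustration, a minimal sketch of re-loading a previously saved artifact from disk (the file name is illustrative; see save() below):

from scrapfly.crawler.crawler_response import CrawlerArtifactResponse

with open('crawl_results.warc.gz', 'rb') as f:
    raw = f.read()

artifact = CrawlerArtifactResponse(raw, artifact_type='warc')
for page in artifact.get_pages():
    print(page['url'], page['status_code'])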

Instance variables

prop artifact_data : bytes
Expand source code
@property
def artifact_data(self) -> bytes:
    """Get raw artifact data (for advanced users)"""
    return self._artifact_data

Get raw artifact data (for advanced users)

prop artifact_type : str
Expand source code
@property
def artifact_type(self) -> str:
    """Get artifact type ('warc' or 'har')"""
    return self._artifact_type

Get artifact type ('warc' or 'har')

prop parser : WarcParser | HarArchive
Expand source code
@property
def parser(self) -> Union[WarcParser, HarArchive]:
    """Get artifact parser instance (lazy-loaded)"""
    if self._artifact_type == 'har':
        if self._har_parser is None:
            self._har_parser = HarArchive(self._artifact_data)
        return self._har_parser
    else:
        if self._warc_parser is None:
            self._warc_parser = parse_warc(self._artifact_data)
        return self._warc_parser

Get artifact parser instance (lazy-loaded)
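
For advanced use, a minimal sketch of working with the underlying parser directly; only methods already shown in the class source above are assumed:

parser = artifact.parser  # WarcParser or HarArchive, built lazily on first access
if artifact.artifact_type == 'har':
    records = parser.iter_entries()
else:
    records = parser.iter_responses()
for record in records:
    print(record.url, record.status_code)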

prop total_pages : int
Expand source code
@property
def total_pages(self) -> int:
    """Get total number of pages in the artifact"""
    return len(self.get_pages())

Get total number of pages in the artifact

prop warc_data : bytes
Expand source code
@property
def warc_data(self) -> bytes:
    """Get raw WARC data (deprecated, use artifact_data)"""
    return self._artifact_data

Get raw WARC data (deprecated, use artifact_data)

Methods

def get_pages(self) -> List[Dict]
Expand source code
def get_pages(self) -> List[Dict]:
    """
    Get all crawled pages as simple dictionaries

    This is the easiest way to access crawl results.
    Works with both WARC and HAR formats.

    Returns:
        List of dicts with keys: url, status_code, headers, content

    Example:
        ```python
        pages = artifact.get_pages()
        for page in pages:
            print(f"{page['url']}: {len(page['content'])} bytes")
            html = page['content'].decode('utf-8')
        ```
    """
    if self._artifact_type == 'har':
        # Convert HAR entries to page dicts
        pages = []
        for entry in self.parser.iter_entries():
            pages.append({
                'url': entry.url,
                'status_code': entry.status_code,
                'headers': entry.response_headers,
                'content': entry.content
            })
        return pages
    else:
        return self.parser.get_pages()

Get all crawled pages as simple dictionaries

This is the easiest way to access crawl results. Works with both WARC and HAR formats.

Returns

List of dicts with keys: url, status_code, headers, content

Example

pages = artifact.get_pages()
for page in pages:
    print(f"{page['url']}: {len(page['content'])} bytes")
    html = page['content'].decode('utf-8')

def iter_records(self) -> Iterator[WarcRecord | HarEntry]
Expand source code
def iter_records(self) -> Iterator[Union[WarcRecord, HarEntry]]:
    """
    Iterate through all records

    For WARC: iterates through all WARC records
    For HAR: iterates through all HAR entries

    Yields:
        WarcRecord or HarEntry: Each record in the artifact
    """
    if self._artifact_type == 'har':
        return self.parser.iter_entries()
    else:
        return self.parser.iter_records()

Iterate through all records

For WARC: iterates through all WARC records For HAR: iterates through all HAR entries

Yields

WarcRecord or HarEntry
Each record in the artifact

def iter_responses(self) -> Iterator[WarcRecord | HarEntry]
Expand source code
def iter_responses(self) -> Iterator[Union[WarcRecord, HarEntry]]:
    """
    Iterate through HTTP response records only

    This is more memory-efficient than get_pages() for large crawls.

    For WARC: iterates through response records
    For HAR: iterates through all entries (HAR only contains responses)

    Yields:
        WarcRecord or HarEntry: HTTP response records with url, status_code, headers, content
    """
    if self._artifact_type == 'har':
        return self.parser.iter_entries()
    else:
        return self.parser.iter_responses()

Iterate through HTTP response records only

This is more memory-efficient than get_pages() for large crawls.

For WARC: iterates through response records For HAR: iterates through all entries (HAR only contains responses)

Yields

WarcRecord or HarEntry
HTTP response records with url, status_code, headers, content
def save(self, filepath: str)
Expand source code
def save(self, filepath: str):
    """
    Save the raw artifact data to a file (WARC or HAR, matching ``artifact_type``)

    Args:
        filepath: Path to save the artifact file

    Example:
        ```python
        artifact.save('crawl_results.warc.gz')
        ```
    """
    with open(filepath, 'wb') as f:
        f.write(self.warc_data)

Save the raw artifact data to a file (WARC or HAR, matching artifact_type)

Args

filepath
Path to save the artifact file

Example

artifact.save('crawl_results.warc.gz')
class CrawlerStartResponse (response_data: Dict[str, Any])
Expand source code
class CrawlerStartResponse:
    """
    Response from starting a crawler job

    Returned by ScrapflyClient.start_crawl() method.

    Strict parsing: ``uuid`` and ``status`` are part of the documented contract
    and are required. A missing field raises ``KeyError`` so the caller knows
    immediately that the API contract changed.

    Attributes:
        uuid: Unique identifier for the crawler job
        status: Initial status (typically 'PENDING')
    """

    def __init__(self, response_data: Dict[str, Any]):
        """
        Initialize from API response

        Args:
            response_data: Raw API response dictionary
        """
        self._data = response_data
        # API canonical name is `crawler_uuid`; we accept `uuid` only as a
        # legacy fallback, in case an older server emits the short form.
        if 'crawler_uuid' in response_data:
            self.uuid = response_data['crawler_uuid']
        elif 'uuid' in response_data:
            self.uuid = response_data['uuid']
        else:
            raise KeyError(
                "CrawlerStartResponse: required field 'crawler_uuid' (or legacy 'uuid') is missing"
            )
        self.status = response_data['status']
        assert isinstance(self.uuid, str) and self.uuid, (
            f"CrawlerStartResponse: uuid must be a non-empty string, got {self.uuid!r}"
        )
        assert isinstance(self.status, str) and self.status, (
            f"CrawlerStartResponse: status must be a non-empty string, got {self.status!r}"
        )

    def __repr__(self):
        return f"CrawlerStartResponse(uuid={self.uuid}, status={self.status})"

Response from starting a crawler job

Returned by ScrapflyClient.start_crawl() method.

Strict parsing: uuid and status are part of the documented contract and are required. A missing field raises KeyError so the caller knows immediately that the API contract changed.

Attributes

uuid
Unique identifier for the crawler job
status
Initial status (typically 'PENDING')

Initialize from API response

Args

response_data
Raw API response dictionary
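
A minimal usage sketch, assuming client is a ScrapflyClient and crawl_config is a crawl configuration prepared elsewhere (its shape is not documented in this module):

response = client.start_crawl(crawl_config)
print(response.uuid)    # job identifier, used for later status and artifact calls
print(response.status)  # typically 'PENDING' right after submission
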
class CrawlerState (state: Dict[str, Any])
Expand source code
class CrawlerState:
    """
    Nested ``state`` block of a crawler status response.

    Field names match the wire format emitted by the scrape-engine
    (``apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py``), which
    is the single source of truth. Go and TypeScript SDKs expose the same
    names on their ``status.state`` object.

    Attributes:
        urls_visited: Number of URLs successfully crawled.
        urls_extracted: Total URLs discovered (seed + links + sitemaps).
        urls_to_crawl: Derived as ``urls_extracted - urls_skipped`` server-side.
        urls_failed: URLs that failed to crawl.
        urls_skipped: URLs skipped (filtered by exclude rules, robots.txt, etc.).
        api_credit_used: Total API credits consumed by this crawl.
        duration: Elapsed time in seconds.
        start_time: Unix epoch seconds when the first worker picked up the job,
            or ``None`` while the job is still in ``PENDING``.
        stop_time: Unix epoch seconds when the crawler reached a terminal state,
            or ``None`` while still running.
        stop_reason: Reason for stop (``page_limit``, ``max_duration``, etc.),
            or ``None`` while still running.
    """

    __slots__ = (
        'urls_visited', 'urls_extracted', 'urls_to_crawl',
        'urls_failed', 'urls_skipped',
        'api_credit_used', 'duration',
        'start_time', 'stop_time', 'stop_reason',
    )

    def __init__(self, state: Dict[str, Any]):
        assert isinstance(state, dict), (
            f"CrawlerState: expected dict, got {type(state).__name__}"
        )
        self.urls_visited: int = state['urls_visited']
        self.urls_extracted: int = state['urls_extracted']
        self.urls_to_crawl: int = state['urls_to_crawl']
        self.urls_failed: int = state['urls_failed']
        self.urls_skipped: int = state['urls_skipped']
        self.api_credit_used = state['api_credit_used']
        self.duration = state['duration']
        # Nullable during PENDING — before a worker has picked up the job.
        self.start_time: Optional[int] = state.get('start_time')
        self.stop_time: Optional[int] = state.get('stop_time')
        self.stop_reason: Optional[str] = state.get('stop_reason')

    def __repr__(self):
        return (
            f"CrawlerState(visited={self.urls_visited}, extracted={self.urls_extracted}, "
            f"to_crawl={self.urls_to_crawl}, failed={self.urls_failed}, "
            f"skipped={self.urls_skipped})"
        )

Nested state block of a crawler status response.

Field names match the wire format emitted by the scrape-engine (apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py), which is the single source of truth. Go and TypeScript SDKs expose the same names on their status.state object.

Attributes

urls_visited
Number of URLs successfully crawled.
urls_extracted
Total URLs discovered (seed + links + sitemaps).
urls_to_crawl
Derived as urls_extracted - urls_skipped server-side.
urls_failed
URLs that failed to crawl.
urls_skipped
URLs skipped (filtered by exclude rules, robots.txt, etc.).
api_credit_used
Total API credits consumed by this crawl.
duration
Elapsed time in seconds.
start_time
Unix epoch seconds when the first worker picked up the job, or None while the job is still in PENDING.
stop_time
Unix epoch seconds when the crawler reached a terminal state, or None while still running.
stop_reason
Reason for stop (page_limit, max_duration, etc.), or None while still running.
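
For illustration, a minimal sketch of deriving simple metrics from these counters, assuming status was returned by ScrapflyClient.get_crawl_status() (see CrawlerStatusResponse below):

state = status.state
remaining = max(state.urls_to_crawl - state.urls_visited, 0)  # rough estimate of work left
failure_rate = state.urls_failed / state.urls_visited if state.urls_visited else 0.0
print(f"visited={state.urls_visited}, remaining~{remaining}, failed={failure_rate:.1%}")
if state.stop_reason is not None:
    print(f"stopped after {state.duration}s: {state.stop_reason}")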

Instance variables

var api_credit_used
Total API credits consumed by this crawl.

var duration
Elapsed time in seconds.

var start_time
Unix epoch seconds when the first worker picked up the job, or None while the job is still in PENDING.

var stop_reason
Reason for stop (page_limit, max_duration, etc.), or None while still running.

var stop_time
Unix epoch seconds when the crawler reached a terminal state, or None while still running.

var urls_extracted
Total URLs discovered (seed + links + sitemaps).

var urls_failed
URLs that failed to crawl.

var urls_skipped
URLs skipped (filtered by exclude rules, robots.txt, etc.).

var urls_to_crawl
Derived as urls_extracted - urls_skipped server-side.

var urls_visited
Number of URLs successfully crawled.

class CrawlerStatusResponse (response_data: Dict[str, Any])
Expand source code
class CrawlerStatusResponse:
    """
    Response from checking crawler job status.

    Returned by :py:meth:`ScrapflyClient.get_crawl_status`. Provides real-time
    progress tracking for crawler jobs.

    **Field names match the wire format.** The scrape-engine is the source of
    truth; the Go and TypeScript SDKs expose identical names. Access state
    counters via the nested ``state`` attribute:

        >>> status.state.urls_visited
        12
        >>> status.state.urls_extracted
        34

    Attributes:
        uuid: Crawler job UUID.
        status: Current status (``PENDING``, ``RUNNING``, ``DONE``, ``CANCELLED``).
        is_success: Whether the crawler job completed successfully (``None`` while running).
        is_finished: Whether the crawler job has finished (regardless of success/failure).
        state: :class:`CrawlerState` — all the per-crawl counters and timings.
    """

    # Status constants
    STATUS_PENDING = 'PENDING'
    STATUS_RUNNING = 'RUNNING'
    STATUS_DONE = 'DONE'
    STATUS_CANCELLED = 'CANCELLED'

    def __init__(self, response_data: Dict[str, Any]):
        """
        Initialize from API response.

        Strict parsing: required fields (``crawler_uuid``, ``status``,
        ``is_success``, ``is_finished``, and the documented ``state.*``
        metrics) are read with direct access so missing keys raise
        ``KeyError`` at parse time. This catches API contract drift loud and
        early.

        Args:
            response_data: Raw API response dictionary.
        """
        self._data = response_data

        # Identification — accept legacy `uuid` only as fallback.
        if 'crawler_uuid' in response_data:
            self.uuid = response_data['crawler_uuid']
        elif 'uuid' in response_data:
            self.uuid = response_data['uuid']
        else:
            raise KeyError(
                "CrawlerStatusResponse: required field 'crawler_uuid' (or legacy 'uuid') is missing"
            )
        self.status = response_data['status']
        # `is_success` may legitimately be `null` while still running.
        self.is_success = response_data['is_success']
        self.is_finished = response_data['is_finished']

        assert isinstance(self.uuid, str) and self.uuid, (
            f"CrawlerStatusResponse: uuid must be a non-empty string, got {self.uuid!r}"
        )
        assert isinstance(self.status, str) and self.status, (
            f"CrawlerStatusResponse: status must be a non-empty string, got {self.status!r}"
        )
        assert isinstance(self.is_finished, bool), (
            f"CrawlerStatusResponse: is_finished must be bool, got {type(self.is_finished).__name__}"
        )
        assert self.is_success is None or isinstance(self.is_success, bool), (
            f"CrawlerStatusResponse: is_success must be bool or None, got {type(self.is_success).__name__}"
        )

        # Nested state — canonical shape matching Go / TS SDKs.
        self.state = CrawlerState(response_data['state'])

    @property
    def is_complete(self) -> bool:
        """Whether the crawler reached DONE with is_success=True."""
        return self.status == self.STATUS_DONE and self.is_success is True

    @property
    def is_running(self) -> bool:
        """Whether the crawler is currently PENDING or RUNNING."""
        return self.status in (self.STATUS_PENDING, self.STATUS_RUNNING)

    @property
    def is_failed(self) -> bool:
        """Whether the crawler reached DONE with is_success=False."""
        return self.status == self.STATUS_DONE and self.is_success is False

    @property
    def is_cancelled(self) -> bool:
        """Whether the crawler was cancelled."""
        return self.status == self.STATUS_CANCELLED

    @property
    def progress_pct(self) -> float:
        """
        Visited/extracted ratio as a percentage (0-100).

        Returns 0.0 when no URLs have been extracted yet.
        """
        if self.state.urls_extracted == 0:
            return 0.0
        return (self.state.urls_visited / self.state.urls_extracted) * 100

    def __repr__(self):
        return (f"CrawlerStatusResponse(uuid={self.uuid}, status={self.status}, "
                f"progress={self.progress_pct:.1f}%, "
                f"visited={self.state.urls_visited}/{self.state.urls_extracted})")

Response from checking crawler job status.

Returned by ScrapflyClient.get_crawl_status(). Provides real-time progress tracking for crawler jobs.

Field names match the wire format. The scrape-engine is the source of truth; the Go and TypeScript SDKs expose identical names. Access state counters via the nested state attribute:

>>> status.state.urls_visited
12
>>> status.state.urls_extracted
34

Attributes

uuid
Crawler job UUID.
status
Current status (PENDING, RUNNING, DONE, CANCELLED).
is_success
Whether the crawler job completed successfully (None while running).
is_finished
Whether the crawler job has finished (regardless of success/failure).
state
CrawlerState — all the per-crawl counters and timings.

Initialize from API response.

Strict parsing: required fields (crawler_uuid, status, is_success, is_finished, and the documented state.* metrics) are read with direct access so missing keys raise KeyError at parse time. This catches API contract drift loud and early.

Args

response_data
Raw API response dictionary.

Class variables

var STATUS_CANCELLED
var STATUS_DONE
var STATUS_PENDING
var STATUS_RUNNING

Instance variables

prop is_cancelled : bool
Expand source code
@property
def is_cancelled(self) -> bool:
    """Whether the crawler was cancelled."""
    return self.status == self.STATUS_CANCELLED

Whether the crawler was cancelled.

prop is_complete : bool
Expand source code
@property
def is_complete(self) -> bool:
    """Whether the crawler reached DONE with is_success=True."""
    return self.status == self.STATUS_DONE and self.is_success is True

Whether the crawler reached DONE with is_success=True.

prop is_failed : bool
Expand source code
@property
def is_failed(self) -> bool:
    """Whether the crawler reached DONE with is_success=False."""
    return self.status == self.STATUS_DONE and self.is_success is False

Whether the crawler reached DONE with is_success=False.

prop is_running : bool
Expand source code
@property
def is_running(self) -> bool:
    """Whether the crawler is currently PENDING or RUNNING."""
    return self.status in (self.STATUS_PENDING, self.STATUS_RUNNING)

Whether the crawler is currently PENDING or RUNNING.

prop progress_pct : float
Expand source code
@property
def progress_pct(self) -> float:
    """
    Visited/extracted ratio as a percentage (0-100).

    Returns 0.0 when no URLs have been extracted yet.
    """
    if self.state.urls_extracted == 0:
        return 0.0
    return (self.state.urls_visited / self.state.urls_extracted) * 100

Visited/extracted ratio as a percentage (0-100).

Returns 0.0 when no URLs have been extracted yet.
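
A minimal polling sketch, assuming client is a ScrapflyClient and get_crawl_status() accepts the job uuid returned by start_crawl():

import time

while True:
    status = client.get_crawl_status(uuid)
    print(f"{status.status}: {status.progress_pct:.1f}% "
          f"({status.state.urls_visited}/{status.state.urls_extracted})")
    if status.is_finished:
        break
    time.sleep(5)

if status.is_complete:
    artifact = client.get_crawl_artifact(uuid)
elif status.is_failed or status.is_cancelled:
    print("crawl did not complete:", status.state.stop_reason)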

class CrawlerUrlEntry (url: str, status: str, reason: str | None = None)
Expand source code
class CrawlerUrlEntry:
    """
    Single URL entry from ``GET /crawl/{uuid}/urls``.

    The endpoint streams one record per line as ``text/plain``. For
    ``visited`` and ``pending`` URLs each line is just the URL; for ``failed``
    or ``skipped`` URLs the line is ``url,reason``. Streaming text is used
    because this endpoint is expected to scale to millions of records per
    job — JSON is not a suitable wire format at that volume.

    Attributes:
        url: The crawled URL
        status: The filter status used by the caller (``visited``, ``pending``,
            ``failed`` or ``skipped``). Echoed from the request parameter so
            downstream code can disambiguate mixed buffers.
        reason: Only set for ``failed`` / ``skipped`` URLs; ``None`` otherwise.
    """

    __slots__ = ('url', 'status', 'reason')

    def __init__(self, url: str, status: str, reason: Optional[str] = None):
        assert isinstance(url, str) and url, (
            f"CrawlerUrlEntry: url must be a non-empty string, got {url!r}"
        )
        assert isinstance(status, str) and status, (
            f"CrawlerUrlEntry: status must be a non-empty string, got {status!r}"
        )
        self.url = url
        self.status = status
        self.reason = reason

    def __repr__(self):
        if self.reason is not None:
            return f"CrawlerUrlEntry(url={self.url!r}, status={self.status!r}, reason={self.reason!r})"
        return f"CrawlerUrlEntry(url={self.url!r}, status={self.status!r})"

Single URL entry from GET /crawl/{uuid}/urls.

The endpoint streams one record per line as text/plain. For visited and pending URLs each line is just the URL; for failed or skipped URLs the line is url,reason. Streaming text is used because this endpoint is expected to scale to millions of records per job — JSON is not a suitable wire format at that volume.

Attributes

url
The crawled URL
status
The filter status used by the caller (visited, pending, failed or skipped). Echoed from the request parameter so downstream code can disambiguate mixed buffers.
reason
Only set for failed / skipped URLs; None otherwise.

Instance variables

var reason
Only set for failed / skipped URLs; None otherwise.

var status
The filter status used by the caller (visited, pending, failed or skipped). Echoed from the request parameter so downstream code can disambiguate mixed buffers.

var url
The crawled URL

class CrawlerUrlsResponse (urls: List[CrawlerUrlEntry], page: int, per_page: int)
Expand source code
class CrawlerUrlsResponse:
    """
    Response from ``GET /crawl/{crawler_uuid}/urls``.

    The server returns a streaming ``text/plain`` body with one record per
    line. This class parses that stream into a materialised ``List`` of
    :class:`CrawlerUrlEntry` records for caller convenience.

    Pagination: the wire protocol carries no global ``total``. ``page`` and
    ``per_page`` are echoes of the caller's request parameters — request
    further pages by incrementing ``page`` until the response has no records.

    Attributes:
        urls: List of :class:`CrawlerUrlEntry` records on this page
        page: 1-based page number (echoed from the request)
        per_page: Page size (echoed from the request)
    """

    __slots__ = ('urls', 'page', 'per_page')

    def __init__(self, urls: List['CrawlerUrlEntry'], page: int, per_page: int):
        self.urls = urls
        self.page = page
        self.per_page = per_page

    @classmethod
    def from_text(
        cls,
        body: str,
        status_hint: str,
        page: int,
        per_page: int,
    ) -> 'CrawlerUrlsResponse':
        """
        Parse the raw text body returned by ``GET /crawl/{uuid}/urls``.

        - Empty lines are ignored (trailing newlines, blank records).
        - For ``visited`` / ``pending`` status each line is one URL.
        - For ``failed`` / ``skipped`` status each line is ``url,reason``.
        - When the caller passed no ``status`` filter, the server defaults to
          ``visited``; the caller is expected to pass that as ``status_hint``
          so every parsed record gets the right status tag.

        Args:
            body: Raw response body text.
            status_hint: The status filter the caller used.
            page: Caller-provided page (echoed on the response object).
            per_page: Caller-provided per_page (echoed on the response object).
        """
        entries: List[CrawlerUrlEntry] = []
        for raw_line in body.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            if status_hint in ('visited', 'pending'):
                entries.append(CrawlerUrlEntry(url=line, status=status_hint))
            else:
                # `url,reason` — split on the first comma only. URLs never
                # contain an unencoded comma in the path/query, so this is
                # unambiguous.
                comma_idx = line.find(',')
                if comma_idx == -1:
                    entries.append(CrawlerUrlEntry(url=line, status=status_hint))
                else:
                    entries.append(
                        CrawlerUrlEntry(
                            url=line[:comma_idx],
                            status=status_hint,
                            reason=line[comma_idx + 1:] or None,
                        )
                    )
        return cls(entries, page, per_page)

    def __len__(self) -> int:
        return len(self.urls)

    def __iter__(self) -> Iterator[CrawlerUrlEntry]:
        return iter(self.urls)

    def __repr__(self):
        return (
            f"CrawlerUrlsResponse(page={self.page}, per_page={self.per_page}, "
            f"urls={len(self.urls)})"
        )

Response from GET /crawl/{crawler_uuid}/urls.

The server returns a streaming text/plain body with one record per line. This class parses that stream into a materialised List of CrawlerUrlEntry records for caller convenience.

Pagination: the wire protocol carries no global total. page and per_page are echoes of the caller's request parameters — request further pages by incrementing page until the response has no records.

Attributes

urls
List of CrawlerUrlEntry records on this page
page
1-based page number (echoed from the request)
per_page
Page size (echoed from the request)

Static methods

def from_text(body: str, status_hint: str, page: int, per_page: int) -> CrawlerUrlsResponse

Parse the raw text body returned by GET /crawl/{uuid}/urls.

  • Empty lines are ignored (trailing newlines, blank records).
  • For visited / pending status each line is one URL.
  • For failed / skipped status each line is url,reason.
  • When the caller passed no status filter, the server defaults to visited; the caller is expected to pass that as status_hint so every parsed record gets the right status tag.

Args

body
Raw response body text.
status_hint
The status filter the caller used.
page
Caller-provided page (echoed on the response object).
per_page
Caller-provided per_page (echoed on the response object).
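
As a rough illustration of the parsing rules above (the sample body and reasons are made up, not real API output), from_text can be exercised directly on an in-memory string:

```python
from scrapfly.crawler.crawler_response import CrawlerUrlsResponse

# Simulated text/plain body for a `failed` filter: one `url,reason` per line.
body = (
    "https://example.com/a,connection timed out\n"
    "https://example.com/b,blocked by robots.txt\n"
    "\n"  # blank records are skipped
)
failed = CrawlerUrlsResponse.from_text(body, status_hint="failed", page=1, per_page=100)
print(len(failed))    # 2 -- __len__ delegates to the parsed list
for entry in failed:  # __iter__ yields CrawlerUrlEntry records
    print(entry.url, entry.reason)

# For `visited` / `pending` each line is just the URL, so no reason is set.
visited = CrawlerUrlsResponse.from_text(
    "https://example.com/\nhttps://example.com/about\n",
    status_hint="visited",
    page=1,
    per_page=100,
)
assert all(e.reason is None for e in visited)
```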

Instance variables

var page
1-based page number (echoed from the request)
var per_page
Page size (echoed from the request)
var urls
List of CrawlerUrlEntry records on this page
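
Because the wire protocol carries no total, draining a crawl's URL listing means requesting pages until one comes back empty. The sketch below assumes a hypothetical fetch_page callable standing in for whatever client call performs the HTTP request; it is not part of this module's API:

```python
from typing import Callable, Iterator

from scrapfly.crawler.crawler_response import CrawlerUrlEntry, CrawlerUrlsResponse


def iter_all_urls(
    fetch_page: Callable[[str, int, int], str],  # hypothetical: (status, page, per_page) -> raw text body
    status: str = "visited",
    per_page: int = 1000,
) -> Iterator[CrawlerUrlEntry]:
    """Yield every CrawlerUrlEntry across all pages of GET /crawl/{uuid}/urls."""
    page = 1
    while True:
        body = fetch_page(status, page, per_page)
        response = CrawlerUrlsResponse.from_text(body, status_hint=status, page=page, per_page=per_page)
        if len(response) == 0:
            return  # an empty page means the listing is exhausted
        yield from response
        page += 1
```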