Module scrapfly.scrapy.pipelines
Expand source code
from scrapy.pipelines.files import FilesPipeline as ScrapyFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as ScrapyImagesPipeline
from itemadapter import ItemAdapter
from . import ScrapflyScrapyRequest, ScrapflyScrapyResponse
from .. import ScrapeConfig
class FilesPipeline(ScrapyFilesPipeline):
    """Scrapfly-aware drop-in replacement for scrapy's ``FilesPipeline``.

    Entries in the item's files-urls field may be ``ScrapeConfig`` objects
    or plain url strings; every entry is turned into a
    ``ScrapflyScrapyRequest`` so the download goes through the Scrapfly API.
    """

    def get_media_requests(self, item, info):
        """Return the media requests to download.

        :param item: scraped item whose ``files_urls_field`` holds the entries
        :param info: pipeline media info (unused here, required by scrapy)
        :return: list of ``ScrapflyScrapyRequest``
        :raises ValueError: if an entry is neither a ``ScrapeConfig`` nor a ``str``
        """
        scrape_configs = ItemAdapter(item).get(self.files_urls_field, [])
        requests = []
        for config in scrape_configs:
            # Pipelines not migrated to scrapfly still store plain url strings;
            # auto-migrate them to ScrapeConfig objects.
            if isinstance(config, str):
                config = ScrapeConfig(url=config)
            if isinstance(config, ScrapeConfig):
                requests.append(ScrapflyScrapyRequest(scrape_config=config))
            else:
                raise ValueError('FilesPipeline item must be a ScrapeConfig object or a string url')
        return requests
class ImagesPipeline(ScrapyImagesPipeline):
    """Scrapfly-aware drop-in replacement for scrapy's ``ImagesPipeline``.

    Entries in the item's images-urls field may be ``ScrapeConfig`` objects
    or plain url strings; every entry is turned into a
    ``ScrapflyScrapyRequest`` so the download goes through the Scrapfly API.
    """

    def get_media_requests(self, item, info):
        """Return the media requests to download.

        :param item: scraped item whose ``images_urls_field`` holds the entries
        :param info: pipeline media info (unused here, required by scrapy)
        :return: list of ``ScrapflyScrapyRequest``
        :raises ValueError: if an entry is neither a ``ScrapeConfig`` nor a ``str``
        """
        scrape_configs = ItemAdapter(item).get(self.images_urls_field, [])
        requests = []
        for config in scrape_configs:
            # Pipelines not migrated to scrapfly still store plain url strings;
            # auto-migrate them to ScrapeConfig objects.
            if isinstance(config, str):
                config = ScrapeConfig(url=config)
            if isinstance(config, ScrapeConfig):
                requests.append(ScrapflyScrapyRequest(scrape_config=config))
            else:
                raise ValueError('ImagesPipeline item must be a ScrapeConfig object or a string url')
        return requests
Classes
class FilesPipeline (store_uri, download_func=None, settings=None)
-
Abstract pipeline that implements file downloading
This pipeline tries to minimize network transfers and file processing, doing stat of the files and determining if file is new, up-to-date or expired.
- *new* files are those that the pipeline never processed and that need to be downloaded from the supplier site for the first time.
- *uptodate* files are those that the pipeline already processed and that are still valid.
- *expired* files are those that the pipeline already processed, but whose last modification was made long ago, so reprocessing is recommended to refresh them in case of change.
Expand source code
class FilesPipeline(ScrapyFilesPipeline): def get_media_requests(self, item, info): scrape_configs = ItemAdapter(item).get(self.files_urls_field, []) requests = [] for config in scrape_configs: # If pipeline are not migrated to scrapfly - config is the url instead of ScrapeConfig object # Auto migrate string url to ScrapeConfig object if isinstance(config, str): config = scrape_config=ScrapeConfig(url=config) if isinstance(config, ScrapeConfig): requests.append(ScrapflyScrapyRequest(scrape_config=config)) else: raise ValueError('FilesPipeline item must ScrapeConfig Object or string url') return requests
Ancestors
- scrapy.pipelines.files.FilesPipeline
- scrapy.pipelines.media.MediaPipeline
Methods
def get_media_requests(self, item, info)
-
Returns the media requests to download
Expand source code
def get_media_requests(self, item, info): scrape_configs = ItemAdapter(item).get(self.files_urls_field, []) requests = [] for config in scrape_configs: # If pipeline are not migrated to scrapfly - config is the url instead of ScrapeConfig object # Auto migrate string url to ScrapeConfig object if isinstance(config, str): config = scrape_config=ScrapeConfig(url=config) if isinstance(config, ScrapeConfig): requests.append(ScrapflyScrapyRequest(scrape_config=config)) else: raise ValueError('FilesPipeline item must ScrapeConfig Object or string url') return requests
class ImagesPipeline (store_uri, download_func=None, settings=None)
-
Abstract pipeline that implements the image thumbnail generation logic
Expand source code
class ImagesPipeline(ScrapyImagesPipeline): def get_media_requests(self, item, info): scrape_configs = ItemAdapter(item).get(self.images_urls_field, []) requests = [] for config in scrape_configs: # If pipeline are not migrated to scrapfly - config is the url instead of ScrapeConfig object # Auto migrate string url to ScrapeConfig object if isinstance(config, str): config = scrape_config = ScrapeConfig(url=config) if isinstance(config, ScrapeConfig): requests.append(ScrapflyScrapyRequest(scrape_config=config)) else: raise ValueError('ImagesPipeline item must ScrapeConfig Object or string url') return requests
Ancestors
- scrapy.pipelines.images.ImagesPipeline
- scrapy.pipelines.files.FilesPipeline
- scrapy.pipelines.media.MediaPipeline
Methods
def get_media_requests(self, item, info)
-
Returns the media requests to download
Expand source code
def get_media_requests(self, item, info): scrape_configs = ItemAdapter(item).get(self.images_urls_field, []) requests = [] for config in scrape_configs: # If pipeline are not migrated to scrapfly - config is the url instead of ScrapeConfig object # Auto migrate string url to ScrapeConfig object if isinstance(config, str): config = scrape_config = ScrapeConfig(url=config) if isinstance(config, ScrapeConfig): requests.append(ScrapflyScrapyRequest(scrape_config=config)) else: raise ValueError('ImagesPipeline item must ScrapeConfig Object or string url') return requests