# Source code for image_crawler_utils.classes.image_info

import dataclasses
import pathvalidate
import json, os, traceback
from typing import Iterable, Optional
from rich import markup

from image_crawler_utils.log import Log
from image_crawler_utils.progress_bar import CustomProgress
from image_crawler_utils.utils import check_dir



##### Classes


@dataclasses.dataclass
class ImageInfo:
    """
    A record made of an image URL, a file name, an info dict and backup URLs.

    Can be used to download images and to write the result to files.
    """

    url: str
    """The URL tried FIRST when downloading the image."""

    name: str
    """Name of the image when saved. Sanitized into a valid file name on construction."""

    info: dict = dataclasses.field(default_factory=dict)  # Info should be a dict
    """
    A :py:class:`dict` containing information of the image.

    + ``info`` will not affect Downloader directly. It only works if you set the ``image_info_filter`` parameter in the Downloader class.
    + Different sites may have different ``info`` structures which are defined respectively by their Parsers.
    + **ATTENTION:** If you define your own ``info`` structure, please ENSURE it can be JSON-serialized
      (e.g. the values of the :py:class:`dict` should be ``int``, ``float``, :py:class:`str`, :py:class:`list`, :py:class:`dict`, etc.)
      in order to make it compatible with ``save_image_infos()`` and ``load_image_infos()``.
    """

    backup_urls: Iterable[str] = dataclasses.field(default_factory=list)
    """When downloading from ``.url`` fails, the URLs in ``.backup_urls`` are tried instead."""

    def __post_init__(self):
        # Characters that are not allowed in a file name are replaced by "_".
        sanitized_name = pathvalidate.sanitize_filename(self.name, replacement_text="_")
        self.name = sanitized_name
##### Functions
def _write_image_info_json(payload: bytes, json_file: str, log: Log) -> str:
    """
    Write already-encoded JSON bytes to ``json_file`` and return the path actually written.

    Args:
        payload (bytes): Encoded JSON document.
        json_file (str): Name / Path of the JSON file. Suffix (.json) is optional.
        log (image_crawler_utils.log.Log): Logging config.

    Returns:
        The path of the written file (directory part of ``json_file`` joined with the final file name).
    """

    path, filename = os.path.split(json_file)
    # NOTE(review): check_dir presumably makes sure the target directory exists - confirm in utils.
    check_dir(path, log)

    # Append ".json" only when the file name itself lacks that suffix.
    # A substring replace over the whole path would also rewrite directory names
    # and mid-name matches (e.g. "a.json.jsonb" -> "a.jsonb.json").
    if not filename.endswith(".json"):
        filename = f"{filename}.json"
    f_name = os.path.join(path, filename)

    with open(f_name, mode="wb") as f:
        f.write(payload)
    log.info(f'The list of ImageInfo has been saved at [repr.filename]{markup.escape(os.path.abspath(f_name))}[reset]', extra={"markup": True})
    return f_name


def save_image_infos(
    image_info_list: Iterable[ImageInfo],
    json_file: str,
    encoding: str='UTF-8',
    display_progress: bool=True,
    log: Log=Log(),
) -> Optional[tuple[str, str]]:
    """
    Save the ImageInfo list into a JSON file.

    ONLY WORKS IF the info can be JSON serialized.

    Args:
        image_info_list (Iterable[image_crawler_utils.ImageInfo]): An iterable (e.g. :py:class:`list` or :py:class:`tuple`) of :class:`image_crawler_utils.ImageInfo`.
        json_file (str): Name / Path of the JSON file. Suffix (.json) is optional.
        encoding (str): Encoding of the JSON file.
        display_progress (bool): Display a ``rich`` progress bar when running. Progress bar will be hidden after finishing.
        log (image_crawler_utils.log.Log, None): Logging config.

    Returns:
        (Saved file name, Absolute path of the saved file), or :py:data:`None` if failed.
        Any exception raised while converting, encoding or writing is logged and turned into :py:data:`None`.
    """

    try:
        if display_progress:
            with CustomProgress(has_spinner=True, transient=True) as progress:
                # Three coarse steps: convert -> dump -> save.
                task = progress.add_task(description="Converting ImageInfo to dict:", total=3)
                dict_list = [
                    dataclasses.asdict(image_info)
                    for image_info in progress.track(image_info_list, description="Converting ImageInfo...")
                ]
                progress.update(task, description="Dumping dict list into JSON:", advance=1)
                dict_list_data = json.dumps(dict_list, indent=4, ensure_ascii=False).encode(encoding)
                progress.update(task, description="Saving into a JSON file:", advance=1)
                f_name = _write_image_info_json(dict_list_data, json_file, log)
                progress.update(task, description="[green]ImageInfo successfully saved!", advance=1)
        else:
            dict_list = [dataclasses.asdict(image_info) for image_info in image_info_list]
            dict_list_data = json.dumps(dict_list, indent=4, ensure_ascii=False).encode(encoding)
            f_name = _write_image_info_json(dict_list_data, json_file, log)
        return f_name, os.path.abspath(f_name)
    except Exception as e:
        # Deliberate catch-all: the documented contract is "return None on any failure".
        log.error(f'Failed to save the list of ImageInfo at [repr.filename]{markup.escape(os.path.abspath(json_file))}[reset] because {e}\n{traceback.format_exc()}', extra={"markup": True})
        return None
def _image_info_from_dict(item: dict) -> ImageInfo:
    """
    Build one ImageInfo from a decoded JSON object.

    ``url`` and ``name`` are required. ``info`` and ``backup_urls`` fall back to an empty
    dict / list when absent, mirroring the defaults declared on ImageInfo.
    """

    return ImageInfo(
        url=item["url"],
        backup_urls=item.get("backup_urls", []),
        name=item["name"],
        info=item.get("info", {}),
    )


def load_image_infos(
    json_file: str,
    encoding: str='UTF-8',
    display_progress: bool=True,
    log: Log=Log(),
) -> Optional[list[ImageInfo]]:
    """
    Load the ImageInfo list from a JSON file.

    ONLY WORKS IF the info can be JSON serialized.

    Args:
        json_file (str): Name / Path of the JSON file.
        encoding (str): Encoding of the JSON file.
        display_progress (bool): Display a ``rich`` progress bar when running. Progress bar will be hidden after finishing.
        log (image_crawler_utils.log.Log, None): Logging config.

    Returns:
        List of ImageInfo, or None if failed.
        Any exception raised while reading, decoding or parsing is logged and turned into None.
    """

    try:
        if display_progress:
            with CustomProgress(has_spinner=True, transient=True) as progress:
                # Three coarse steps: read -> parse JSON -> build ImageInfo objects.
                task = progress.add_task(description="Loading JSON file:", total=3)
                with open(json_file, mode="rb") as f:
                    file_data = f.read()
                progress.update(task, description="Parsing JSON from loaded data:", advance=1)
                dict_list = json.loads(file_data.decode(encoding))
                progress.update(task, description="Parsing ImageInfo from JSON data:", advance=1)
                image_info_list = [
                    _image_info_from_dict(item)
                    for item in progress.track(dict_list, description="Parsing ImageInfo...")
                ]
                progress.update(task, description="[green]ImageInfo successfully loaded!", advance=1)
        else:
            with open(json_file, mode="rb") as f:
                file_data = f.read()
            dict_list = json.loads(file_data.decode(encoding))
            image_info_list = [_image_info_from_dict(item) for item in dict_list]

        log.info(f'The list of ImageInfo has been loaded from [repr.filename]{markup.escape(os.path.abspath(json_file))}[reset]', extra={"markup": True})
        return image_info_list
    except Exception as e:
        # Deliberate catch-all: the documented contract is "return None on any failure".
        log.error(f'Failed to load the list of ImageInfo from [repr.filename]{markup.escape(os.path.abspath(json_file))}[reset] because {e}\n{traceback.format_exc()}', extra={"markup": True})
        return None