Source code for image_crawler_utils.configs

from __future__ import annotations
import dataclasses
from typing import Optional, Callable, Union

import random



@dataclasses.dataclass
class DebugConfig:
    """
    Switches that decide which levels of messages are printed to the console.

    A default-constructed instance corresponds to the "info" level: every level except debug is shown.
    """

    show_debug: bool = False
    """
    Whether debug-level messages are displayed.

    + Defaults to :py:data:`False`.
    + Debug messages carry detailed information about the running crawler, especially its connections with websites.
    + Setting ``show_debug`` to :py:data:`False` does not stop any ``.display_all_configs()`` from displaying its debug messages.
    """

    show_info: bool = True
    """
    Whether info-level messages are displayed.

    + Defaults to :py:data:`True`.
    + Info messages report basic information about the progress of the crawler.
    """

    show_warning: bool = True
    """
    Whether warning-level messages are displayed.

    + Defaults to :py:data:`True`.
    + Warning messages report errors that basically do not affect the final results, mostly connection failures with the websites.
    """

    show_error: bool = True
    """
    Whether error-level messages are displayed.

    + Defaults to :py:data:`True`.
    + Error messages report errors that may affect the final results but do not interrupt the crawler.
    """

    show_critical: bool = True
    """
    Whether critical-level messages are displayed.

    + Defaults to :py:data:`True`.
    + Critical messages report errors that interrupt the crawler. Usually a Python error is raised when a critical error happens.
    """

    def set_level(self, level_str: str):
        """
        Make this DebugConfig display messages at and above the given level.

        Args:
            level_str (str): One of (from lower to higher) "debug", "info", "warning", "error", "critical" or "silenced". Matching is case-insensitive, and "warn" is accepted as an alias of "warning".

                + Every level at or above the chosen one is switched on and every level below it is switched off. For example, ``.set_level("warning")`` displays only "warning", "error" and "critical" messages.
                + "silenced" switches all five levels off, so no messages are displayed except those generated by the progress bars.
                + Any other string leaves the current settings untouched.
        """

        # Rank of every accepted spelling; "silenced" sits one past the last real level.
        rank_of_name = {
            "debug": 0,
            "info": 1,
            "warn": 2,
            "warning": 2,
            "error": 3,
            "critical": 4,
            "silenced": 5,
        }
        switches = ("show_debug", "show_info", "show_warning", "show_error", "show_critical")

        threshold = rank_of_name.get(level_str.lower())
        if threshold is None:
            # Unrecognised level name: keep whatever is currently configured.
            return

        for rank, switch in enumerate(switches):
            setattr(self, switch, rank >= threshold)

    @classmethod
    def level(cls, level_str: str) -> DebugConfig:
        """
        Build a DebugConfig that displays messages at and above the given level.

        Args:
            level_str (str): One of (from lower to higher) "debug", "info", "warning", "error", "critical" or "silenced"; it is handed to :py:meth:`set_level` on a default-constructed instance.

                + For example, ``.level("warning")`` displays only "warning", "error" and "critical" messages.
                + "silenced" displays no messages except those generated by the progress bars.

        Returns:
            The created DebugConfig.
        """

        instance = cls()
        instance.set_level(level_str)
        return instance
@dataclasses.dataclass
class CapacityCountConfig:
    """
    Limits on how many images, how many bytes and how many gallery pages are processed.
    """

    image_num: Optional[int] = None
    """
    Maximum number of images to be parsed from websites or downloaded.

    + Defaults to :py:data:`None`, meaning no restriction.
    + Mostly used by the Downloader to control the number of images to be downloaded, but some Parsers may also use this parameter.
    """

    capacity: Optional[float] = None
    """
    Maximum total size (in bytes) of the images to be downloaded.

    + Defaults to :py:data:`None`, meaning no restriction.
    + Once the capacity is reached, no new downloading threads are added. Threads that have already started are not affected, so the actual total size will be larger than the capacity.
    """

    page_num: Optional[int] = None
    """
    Maximum number of gallery pages in which images are detected.

    + Defaults to :py:data:`None`, meaning no restriction.
    + Some websites, like Twitter / X, do not use gallery pages or JSON-API pages (Image Crawler Utils scrolls the webpages to get Twitter / X images), and this parameter is not used for them.
    """
@dataclasses.dataclass
class DownloadConfig:
    """
    Contains config for downloading.
    """

    headers: Optional[Union[dict, Callable]] = None
    """
    Headers of the requests.

    + Both fetching webpages and downloading images will use this parameter.
    + Headers should be :py:data:`None`, a :py:class:`dict` or a callable function that returns a :py:class:`dict`.

        + If you want to have a random header with every request, you can set ``headers`` to a callable function. This function should not accept any parameters (which can be implemented by ``lambda``) and returns a :py:class:`dict`.

    + This only works when the requests are sent by :py:mod:`requests` (like :py:func:`requests.get()`). For webpages loaded by browsers, this parameter is omitted.
    + Basically, this contains the user agent of the requests.
    + **ATTENTION**: Not all user agents are supported by the websites you are accessing!
    """

    proxies: Optional[Union[dict, Callable]] = None
    """
    Proxies used by the crawler.

    + Both fetching webpages and downloading images will use this parameter.
    + Proxies should be :py:data:`None`, a :py:class:`dict` or a callable function that returns a :py:class:`dict`. Any other value raises :py:class:`TypeError` on construction.

        + Set to :py:data:`None` (Default) will let the crawler use system proxies.
        + If you want to have a random proxy with every request, you can set ``proxies`` to a callable function. This function should not accept any parameters (which can be implemented by ``lambda``) and returns a :py:class:`dict`.

    + Both :py:mod:`requests` and browsers use these proxies, but the structure should be in a :py:mod:`requests`-acceptable form like:

        + HTTP type: ``{'http': '127.0.0.1:7890'}``
        + HTTPS type: ``{'https': '127.0.0.1:7890'}``
        + SOCKS type: ``{'https': 'socks5://127.0.0.1:7890'}``

    + If you input a :py:class:`dict` with ``'https'`` proxies but no ``'http'`` proxies, ``'http'`` proxies will be automatically generated. The generated entry is stored in a copy, so the :py:class:`dict` you passed in is not modified.
    + **ATTENTION:** Using usernames and passwords is currently not supported.
    """

    thread_delay: float = 5
    """
    Delaying time (seconds) before every thread starts.

    + Both fetching webpages and downloading images will use this parameter.
    + Some Parsers may use different parameters to control their delaying time.
    """

    fail_delay: float = 3
    """
    Delaying time (seconds) after every failure.

    + Both fetching webpages and downloading images will use this parameter.
    + Some Parsers may use different parameters to control their delaying time when a failure happens.
    """

    randomize_delay: bool = True
    """
    Randomize ``thread_delay`` and ``fail_delay`` between 0 and their values.

    + For example, ``thread_delay=5.0`` and ``randomize_delay=True`` will cause the ``thread_delay`` to choose a random value between 0 and 5.0 every time.
    """

    thread_num: int = 5
    """
    Total number of threads.

    + Both fetching webpages and downloading images will use this parameter.
    + Some Parsers do not use threading to fetch pages, and this parameter is not used.
    """

    timeout: Optional[float] = 10
    """
    Timeout for connection. When no response is returned in ``timeout`` seconds, a failure will happen.

    + Both fetching webpages and downloading images will use this parameter.
    + Setting to :py:data:`None` means (barely) no restrictions.
    """

    max_download_time: Optional[float] = None
    """
    When no new data is fetched when downloading images in ``max_download_time`` seconds, a failure will happen.

    + Only downloading images will use this parameter.
    + Default is set to :py:data:`None`, meaning no restrictions.
    """

    retry_times: int = 5
    """
    Total times of retrying to fetch a webpage / download an image.

    + Both fetching webpages and downloading images will use this parameter.
    """

    overwrite_images: bool = True
    """
    Overwrite existing images when downloading.

    + Only downloading images will use this parameter.
    """

    def __post_init__(self):
        """
        Validate ``proxies`` and derive the ``'http'`` entry from ``'https'`` when it is missing.

        Raises:
            TypeError: ``proxies`` is neither a dict, a callable nor None.
        """

        if isinstance(self.proxies, dict):
            # Process HTTPS proxies
            if "https" in self.proxies and "http" not in self.proxies:
                # Build a new dict rather than writing into the caller's object,
                # which may be shared with other configs or reused elsewhere.
                self.proxies = {**self.proxies, "http": self.proxies["https"]}
        elif not (callable(self.proxies) or self.proxies is None):
            raise TypeError("Proxies should be a dict, a callable or None.")

    @property
    def result_headers(self) -> Optional[dict]:
        """
        Generate the headers. If the headers attribute is callable, it will be called for every usage.
        """

        if callable(self.headers):
            return self.headers()
        return self.headers

    @property
    def result_proxies(self) -> Optional[dict]:
        """
        Generate the proxies. If the proxies attribute is callable, it will be called for every usage.

        The value returned by a callable is passed through as-is; no ``'http'`` entry is generated for it.
        """

        if callable(self.proxies):
            return self.proxies()
        return self.proxies

    @property
    def result_thread_delay(self) -> float:
        """
        Generate the thread delay. If the randomize_delay attribute is set to :py:data:`True`, the delay will be randomized between 0 and thread_delay for every usage.
        """

        if self.randomize_delay:
            return random.random() * self.thread_delay
        return self.thread_delay

    @property
    def result_fail_delay(self) -> float:
        """
        Generate the fail delay. If the randomize_delay attribute is set to :py:data:`True`, the delay will be randomized between 0 and fail_delay for every usage.
        """

        if self.randomize_delay:
            return random.random() * self.fail_delay
        return self.fail_delay