# Source code for image_crawler_utils.stations.pixiv.parser_assets.keyword_parser

from typing import Optional, Union

import time, random
import json
from collections import ChainMap
from collections.abc import Callable
import requests
import ua_generator

from urllib import parse

from image_crawler_utils import Cookies, KeywordParser, ImageInfo, CrawlerSettings
from image_crawler_utils.keyword import KeywordLogicTree, min_len_keyword_group, construct_keyword_tree_from_list
from image_crawler_utils.progress_bar import CustomProgress, ProgressGroup

from .search_settings import PixivSearchSettings



##### Pixiv Keyword Parser


class PixivKeywordParser(KeywordParser):
    """
    Keyword parser for Pixiv.

    It builds a Pixiv search string from the keyword tree, walks the result pages
    of the Pixiv ajax JSON API and turns every artwork found into
    :class:`image_crawler_utils.ImageInfo` objects.

    Args:
        station_url (str): The URL of the main page of a website.

            + This parameter works when several websites use the same structure. For example, https://yande.re/ and https://konachan.com/ both use Moebooru to build their websites, and this parameter must be filled to deal with these sites respectively.
            + For websites like https://www.pixiv.net/, as no other website uses its structure, this parameter has already been initialized and does not need to be filled.
        crawler_settings (image_crawler_utils.CrawlerSettings): The CrawlerSettings used in this Parser.
        standard_keyword_string (str): Query keyword string using standard syntax. Refer to the documentation for detailed instructions.
        keyword_string (str, None): If you want to directly specify the keywords used in searching, set ``keyword_string`` to a custom non-empty string. It will OVERWRITE ``standard_keyword_string``.

            + For example, set ``keyword_string`` to ``"kuon_(utawarerumono) rating:safe"`` in DanbooruKeywordParser means searching directly with this string in Danbooru, and its standard keyword string equivalent is ``"kuon_(utawarerumono) AND rating:safe"``.
        cookies (image_crawler_utils.Cookies, str, dict, list, None): Cookies containing logging information. :meth:`run` raises :class:`ValueError` when they are empty.
        pixiv_search_settings (image_crawler_utils.stations.pixiv.PixivSearchSettings): A PixivSearchSettings class that contains extra options when searching.
        use_keyword_include (bool): Using a new keyword string whose searching results can contain all images belonging to the original keyword string result. Default set to False.

            + Example: search "A" can contain all results by "A and B"
        quick_mode (bool): Only collect the basic information.

            + Pixiv has a strict anti-crawling restriction on acquiring the pages containing information of images. Setting this parameter to :py:data:`True` will not request these pages and collect only the basic information of images for downloading.
            + Different Parsers may have different structures of image information. Refer to the [ImageInfo Structure](#imageinfo-structure-4) chapter for the difference between results.
            + If set to :py:data:`False` (get full information), the thread delay used when downloading information pages is ``CrawlerSettings.download_config.result_thread_delay``, forced to be no lower than ``CrawlerSettings.download_config.thread_num * 1.0``. Other pages are not affected.
        info_page_batch_num (int): After downloading ``info_page_batch_num`` number of image information pages, the crawler will wait for ``info_page_batch_delay`` seconds before continuing.
        info_page_batch_delay (float, Callable): After downloading ``info_page_batch_num`` number of image information pages, the crawler will wait for ``info_page_batch_delay`` seconds before continuing.

            + If ``quick_mode`` is set to :py:data:`True`, both ``info_page_batch_num`` and ``info_page_batch_delay`` will be ignored.
            + If you are not sure, leaving both ``info_page_batch_num`` and ``info_page_batch_delay`` blank (use their default values) is likely enough for preventing your account from being suspended.
            + ``info_page_batch_delay`` can be a function that will be called for every usage.
    """

    def __init__(
        self,
        station_url: str="https://www.pixiv.net/",
        crawler_settings: CrawlerSettings=CrawlerSettings(),
        standard_keyword_string: Optional[str]=None,
        keyword_string: Optional[str]=None,
        cookies: Optional[Union[Cookies, list, dict, str]]=Cookies(),
        pixiv_search_settings: PixivSearchSettings=PixivSearchSettings(),
        use_keyword_include: bool=False,
        quick_mode: bool=False,
        info_page_batch_num: Optional[int]=100,
        info_page_batch_delay: Union[float, Callable]=300,
    ):
        super().__init__(
            station_url=station_url,
            crawler_settings=crawler_settings,
            standard_keyword_string=standard_keyword_string,
            keyword_string=keyword_string,
            cookies=cookies,
        )
        self.pixiv_search_settings = pixiv_search_settings
        self.use_keyword_include = use_keyword_include
        self.quick_mode = quick_mode
        self.info_page_batch_num = info_page_batch_num
        self.info_page_batch_delay = info_page_batch_delay

    def run(self) -> list[ImageInfo]:
        """
        The main function that runs the Parser and returns a list of :class:`image_crawler_utils.ImageInfo`.

        Raises:
            ValueError: If the cookies are empty (checked before any request is sent).
        """

        # Fail fast: every request below needs a logged-in session.
        if self.cookies.is_none():
            raise ValueError('Cookies cannot be empty!')

        with requests.Session() as session:
            session.cookies.update(self.cookies.cookies_dict)

            if self.keyword_string is None:
                if self.use_keyword_include:
                    self.generate_keyword_string_include(session=session)
                else:
                    self.generate_keyword_string()

            # Reuse the managed session instead of letting each step open (and leak) its own.
            self.get_json_page_num(session=session)
            self.get_json_page_urls()
            self.get_image_basic_info(session=session)
            if self.quick_mode:
                return self.get_image_info_quick(session=session)
            return self.get_image_info_full(session=session)

    ##### Custom funcs

    # Create a requests session carrying the cookies of this parser
    def __new_session(self) -> requests.Session:
        session = requests.Session()
        session.cookies.update(self.cookies.cookies_dict)
        return session

    # Headers shared by all JSON API requests (without the Referer)
    def __result_headers(self) -> dict:
        configured_headers = self.crawler_settings.download_config.result_headers
        if configured_headers is not None:
            return configured_headers
        # Pixiv must have user-agents!
        ua = ua_generator.generate(browser=('chrome', 'edge'))
        ua.headers.accept_ch('Sec-CH-UA-Platform-Version, Sec-CH-UA-Full-Version-List')
        return ua.headers.get()

    # Convert one child of a keyword tree node into its Pixiv keyword string
    def __keyword_child_to_str(self, child: Union[str, KeywordLogicTree]) -> str:
        if isinstance(child, str):
            # Pixiv does not support _ and *; str.replace already removes every occurrence.
            return child.replace("_", "").replace("*", "")
        return self.__build_keyword_str(child)

    # Generate keyword string from keyword tree
    def __build_keyword_str(self, tree: KeywordLogicTree) -> str:
        """
        Recursively convert a keyword tree into a Pixiv search string.

        Args:
            tree (KeywordLogicTree): Node whose ``logic_operator`` is "AND", "OR", "NOT" or "SINGLE".

        Returns:
            The search string of this node.

        Raises:
            ValueError: If the node has any other logic operator.
        """

        operator = tree.logic_operator
        if operator not in ("AND", "OR", "NOT", "SINGLE"):
            raise ValueError(f'Unsupported logic operator in keyword tree: {operator!r}')

        right = self.__keyword_child_to_str(tree.rchild)
        # Unary operators only use the right child, so the left one is not converted for them.
        if operator == "NOT":
            return f'(-{right})'
        if operator == "SINGLE":
            return f'{right}'

        left = self.__keyword_child_to_str(tree.lchild)
        if operator == "AND":
            return f'({left} {right})'
        return f'({left} OR {right})'

    # Basic keyword string
    def generate_keyword_string(self) -> str:
        """
        Build the search string from ``self.keyword_tree`` and store it in ``self.keyword_string``.

        Returns:
            The generated keyword string.
        """

        self.keyword_string = self.__build_keyword_str(self.keyword_tree)
        return self.keyword_string

    # Keyword (include) string
    def generate_keyword_string_include(self, session: Optional[requests.Session]=None) -> str:
        """
        Test every keyword (include) group and keep the one with the fewest result pages.

        Ties on the page count are broken by the number of artworks. On return,
        ``self.keyword_string``, ``self.json_page_num`` and ``self.artworks_num``
        all describe the selected keyword string.

        Args:
            session (requests.Session, None): Session used for the requests. If omitted, a temporary one is created and closed afterwards.

        Returns:
            The selected keyword string.

        Raises:
            ValueError: If no keyword (include) group can be built from the keyword tree.
        """

        keyword_group = min_len_keyword_group(self.keyword_tree.keyword_include_group_list())
        keyword_strings = [
            self.__build_keyword_str(construct_keyword_tree_from_list(group, log=self.crawler_settings.log))
            for group in keyword_group
        ]
        if not keyword_strings:
            raise ValueError('No keyword (include) group could be generated from the keyword tree.')

        owns_session = session is None
        if owns_session:
            session = self.__new_session()

        best_key = None
        best_string = None
        self.crawler_settings.log.info("Testing the image num of keyword (include) groups to find the one with fewest pages.")
        try:
            with CustomProgress(transient=True) as progress:
                task = progress.add_task(description="Requesting pages:", total=len(keyword_strings))
                for string in keyword_strings:
                    self.crawler_settings.log.debug(f'Testing the image num of keyword string: {string}')
                    self.keyword_string = string
                    # get_json_page_num() returns the PAGE count; the artwork count is stored in self.artworks_num.
                    page_num = self.get_json_page_num(session=session)
                    artworks_num = self.artworks_num
                    self.crawler_settings.log.debug(f'The image num of {string} is {artworks_num} ({page_num} pages).')
                    candidate_key = (page_num, artworks_num)
                    if best_key is None or candidate_key < best_key:
                        best_key = candidate_key
                        best_string = string
                    progress.update(task, advance=1)
                progress.update(task, description="[green]Requesting pages finished!")
        finally:
            if owns_session:
                session.close()

        # Leave the parser state describing the selected string, not the last one tested.
        self.keyword_string = best_string
        self.json_page_num, self.artworks_num = best_key
        image_noun = "image" if self.artworks_num == 1 else "images"
        self.crawler_settings.log.info(f'The keyword string the parser will use is "{self.keyword_string}" which has {self.artworks_num} {image_noun}.')
        return self.keyword_string

    # Get the number of Pixiv ajax API json pages
    def get_json_page_num(self, session: Optional[requests.Session]=None) -> int:
        """
        Request the first JSON search page and read the number of artworks and result pages.

        Sets ``self.artworks_num`` and ``self.json_page_num``.

        Args:
            session (requests.Session, None): Session used for the request. If omitted, a temporary one is created and closed afterwards.

        Returns:
            The number of JSON result pages (0 when there is no artwork).

        Raises:
            ConnectionError: If the first JSON page cannot be fetched.
        """

        owns_session = session is None
        if owns_session:
            session = self.__new_session()
        try:
            # ChainMap gives priority to its first mapping, so configured headers may override the Referer.
            json_search_page_headers = dict(ChainMap(self.__result_headers(), {"Referer": "www.pixiv.net"}))
            first_page_url = parse.quote(f"{self.station_url}{self.pixiv_search_settings.build_search_appending_str_json(self.keyword_string)}", safe='/:?=&')
            self.crawler_settings.log.info(f'Connecting to the first gallery page using keyword "{self.keyword_string}" and URL [repr.url]{first_page_url}[reset] ...', extra={"markup": True})
            content = self.request_page_content(first_page_url, session=session, headers=json_search_page_headers)
        finally:
            if owns_session:
                session.close()

        if content is None:
            self.crawler_settings.log.critical(f"CANNOT connect to the first JSON page, URL: [repr.url]{first_page_url}[reset]", extra={"markup": True})
            # Exceptions accept no ``extra`` keyword and do not render log markup.
            raise ConnectionError(f"CANNOT connect to the first JSON page, URL: {first_page_url}")
        self.crawler_settings.log.info('Successfully connected to the first JSON page.')

        body = json.loads(content)["body"]
        # Start from zero so a response without any known list cannot leave stale or missing attributes.
        self.artworks_num = 0
        self.json_page_num = 0
        for image_list_type in ("illust", "illustManga", "manga"):
            if image_list_type in body:
                self.artworks_num = int(body[image_list_type]["total"])
                self.json_page_num = int(body[image_list_type]["lastPage"])

        if self.json_page_num == 1000:
            self.crawler_settings.log.warning("Number of result pages has reached 1000. Due to Pixiv restrictions, result in pages exceeding 1000 cannot be fetched through JSON API directly.")
        self.crawler_settings.log.info(f"Number of artworks: {self.artworks_num}")
        if self.artworks_num == 0:
            # No result, no pages!
            self.json_page_num = 0
        return self.json_page_num

    # Get Pixiv ajax API json page URLs
    def get_json_page_urls(self) -> list[str]:
        """
        Build the URL of every JSON result page (1 to ``self.json_page_num``).

        Sets ``self.json_page_urls``.

        Returns:
            The list of JSON page URLs.
        """

        search_url = f"{self.station_url}{self.pixiv_search_settings.build_search_appending_str_json(self.keyword_string)}"
        self.json_page_urls = [
            parse.quote(f"{search_url}&p={page_num}", safe='/:?=&')
            for page_num in range(1, self.json_page_num + 1)
        ]
        return self.json_page_urls

    # Get image ID and basic info
    def get_image_basic_info(self, session: Optional[requests.Session]=None) -> dict:
        """
        Download all JSON result pages and collect the basic info of every artwork.

        Pages whose artwork list is empty are requested again after a pause of
        1 to 2 minutes. Pages that could not be downloaded at all are skipped
        with a warning. Sets ``self.json_basic_info``.

        Args:
            session (requests.Session, None): Unused; kept for interface compatibility. The pages are fetched with ``nodriver_threading_request_page_content``, which is not given a session.

        Returns:
            A dict mapping artwork ID to its basic info, sorted by ID from large to small.
        """

        self.crawler_settings.log.info("Downloading pages including Pixiv IDs...")

        # Get pages until all pages are fetched
        pending_urls = list(self.json_page_urls)
        json_basic_info = {}
        while pending_urls:
            download_urls = pending_urls
            pending_urls = []

            # Get and parse json page info
            json_page_contents = self.nodriver_threading_request_page_content(
                download_urls,
                restriction_num=self.crawler_settings.capacity_count_config.image_num,
                is_json=True,
                deconstruct_browser=True,
                # It seems that pixiv has less restrictions on crawling this type of pages, so no batch download is set.
            )

            # Get dict
            for page_url, content in zip(download_urls, json_page_contents):
                if content is None:
                    # Failed request: nothing to parse.
                    self.crawler_settings.log.warning(f'Failed to download JSON page [repr.url]{page_url}[reset], it will be skipped.', extra={"markup": True})
                    continue
                body = json.loads(content)["body"]
                needs_retry = False
                for image_list_type in ("illust", "illustManga", "manga"):
                    if image_list_type not in body:
                        continue
                    page_data = body[image_list_type]["data"]
                    if page_data:
                        for image_data in page_data:
                            json_basic_info[image_data["id"]] = image_data
                    else:
                        needs_retry = True
                if needs_retry:
                    # Queue the page once, however many of its lists were empty.
                    pending_urls.append(page_url)

            if pending_urls:
                self.crawler_settings.log.warning(f'{len(pending_urls)} {"pages are" if len(pending_urls) > 1 else "page is"} empty, possibly because requests were too frequent. Retrying to request pages in 1 to 2 minutes.')
                time.sleep(60 + random.random() * 60)

        # Sort with ID from large to small
        self.json_basic_info = dict(sorted(json_basic_info.items(), key=lambda item: int(item[0]), reverse=True))
        return self.json_basic_info

    # Download the ".../pages" JSON (image URLs and sizes) of every artwork
    def __request_image_pages(self) -> list:
        self.crawler_settings.log.info("Downloading image URLs for every Pixiv ID...")
        json_image_download_urls = [f'{self.station_url}ajax/illust/{artwork_id}/pages' for artwork_id in self.json_basic_info]
        return self.nodriver_threading_request_page_content(
            json_image_download_urls,
            restriction_num=self.crawler_settings.capacity_count_config.image_num,
            is_json=True,
            deconstruct_browser=True,
            # It seems that pixiv has less restrictions on crawling this type of pages, so no batch download is set.
        )

    # Build the ImageInfo list from the ".../pages" JSON contents
    def __parse_image_pages(self, page_contents: list, info_by_id: dict, extract_tags: Callable) -> list[ImageInfo]:
        self.crawler_settings.log.info('Parsing image info...')
        image_info_list = []
        missing_ids = set()
        with ProgressGroup(panel_title="Parsing Image Info") as progress_group:
            progress = progress_group.main_count_bar
            task = progress.add_task(description="Parsing image info pages:", total=len(page_contents))
            for content in page_contents:
                if content is not None:  # None means an empty page!
                    for image_url_size in json.loads(content)["body"]:
                        original_url = image_url_size["urls"]["original"]
                        file_name = original_url.split('/')[-1]
                        image_id = file_name.split('_')[0]
                        detail = info_by_id.get(image_id)
                        if detail is None:
                            # The info of this artwork was not downloaded; skip it instead of crashing.
                            missing_ids.add(image_id)
                            continue
                        image_info_list.append(ImageInfo(
                            url=original_url,
                            name=file_name,
                            info={
                                "id": image_id,
                                "width": image_url_size["width"],
                                "height": image_url_size["height"],
                                "tags": extract_tags(detail),
                                "info": detail,
                            },
                        ))
                # Count skipped pages too, so that the bar reaches its total.
                progress.update(task, advance=1)
            progress.update(task, description="[green]Parsing image info pages finished!")

        if missing_ids:
            self.crawler_settings.log.warning(f'Information of {len(missing_ids)} Pixiv {"IDs is" if len(missing_ids) > 1 else "ID is"} missing, their images are skipped: {", ".join(sorted(missing_ids))}')
        return image_info_list

    # Get image info: full
    def get_image_info_full(self, session: Optional[requests.Session]=None) -> list[ImageInfo]:
        """
        Download the information page and the image URLs of every artwork in ``self.json_basic_info``.

        Artworks whose information page could not be downloaded are skipped with
        a warning. Sets ``self.image_info_list``.

        Args:
            session (requests.Session, None): Session used for the information pages. If omitted, a temporary one is created and closed afterwards.

        Returns:
            A list of :class:`image_crawler_utils.ImageInfo` whose ``info["info"]`` is the full artwork information.
        """

        artwork_ids = list(self.json_basic_info)

        # Update headers for illust detection
        base_headers = self.__result_headers()
        json_image_url_page_headers = [
            dict(ChainMap(base_headers, {"Referer": f"www.pixiv.net/artworks/{artwork_id}"}))
            for artwork_id in artwork_ids
        ]

        # Get and parse json page info
        self.crawler_settings.log.info("Downloading image info for every Pixiv ID...")
        json_image_info_urls = [f'{self.station_url}ajax/illust/{artwork_id}' for artwork_id in artwork_ids]

        owns_session = session is None
        if owns_session:
            session = self.__new_session()
        try:
            json_image_info_page_contents = self.threading_request_page_content(
                json_image_info_urls,
                restriction_num=self.crawler_settings.capacity_count_config.image_num,
                session=session,
                headers=json_image_url_page_headers,
                # Force not to be lower than a certain threshold in case the account gets suspended because of too many requests.
                # NOTE(review): max() assumes result_thread_delay is a number; confirm it can never be None or a callable.
                thread_delay=max(1.0 * self.crawler_settings.download_config.thread_num, self.crawler_settings.download_config.result_thread_delay),
                batch_num=self.info_page_batch_num,
                batch_delay=self.info_page_batch_delay,
            )
        finally:
            if owns_session:
                session.close()

        image_info_dict = {}
        for content in json_image_info_page_contents:
            if content is None:  # Empty page!
                continue
            artwork_body = json.loads(content)["body"]
            image_info_dict[artwork_body["id"]] = artwork_body

        # Get and parse json page info
        json_image_url_page_contents = self.__request_image_pages()
        self.image_info_list = self.__parse_image_pages(
            json_image_url_page_contents,
            image_info_dict,
            extract_tags=lambda detail: [item["tag"] for item in detail["tags"]["tags"]],
        )
        return self.image_info_list

    # Get image info: quick
    def get_image_info_quick(self, session: Optional[requests.Session]=None) -> list[ImageInfo]:
        """
        Download only the image URLs of every artwork in ``self.json_basic_info``.

        The information stored for each image is the basic info gathered from
        the search result pages. Sets ``self.image_info_list``.

        Args:
            session (requests.Session, None): Unused; kept for interface compatibility. The pages are fetched with ``nodriver_threading_request_page_content``, which is not given a session.

        Returns:
            A list of :class:`image_crawler_utils.ImageInfo` whose ``info["info"]`` is the basic artwork information.
        """

        # Get and parse json page info
        json_image_url_page_contents = self.__request_image_pages()
        self.image_info_list = self.__parse_image_pages(
            json_image_url_page_contents,
            self.json_basic_info,
            extract_tags=lambda detail: detail["tags"],
        )
        return self.image_info_list