Source code for image_crawler_utils.stations.pixiv.parser_assets.user_parser

from typing import Optional, Union
from collections.abc import Callable

import json
from collections import ChainMap
import requests
import ua_generator

from image_crawler_utils import Cookies, Parser, ImageInfo, CrawlerSettings
from image_crawler_utils.progress_bar import ProgressGroup



##### Pixiv User Parser


class PixivUserParser(Parser):
    """
    Parser that collects the images uploaded by one Pixiv member.

    Args:
        member_id (str): Pixiv ID of the user.
        station_url (str): The URL of the main page of a website.

            + This parameter works when several websites use the same structure. For example, https://yande.re/ and https://konachan.com/ both use Moebooru to build their websites, and this parameter must be filled to deal with these sites respectively.
            + For websites like https://www.pixiv.net/, as no other website uses its structure, this parameter has already been initialized and does not need to be filled.
        crawler_settings (image_crawler_utils.CrawlerSettings): The CrawlerSettings used in this Parser.
        cookies (image_crawler_utils.Cookies, str, dict, list, None): Cookies containing login information. ``run()`` raises ``ValueError`` when they are empty.
        thread_delay (float, Callable): Stored on the instance. ``run()`` replaces a value of 0 with ``1.0 * thread_num``.

            + NOTE(review): the request calls in this class compute their own delay from ``download_config`` and do not read this attribute - confirm whether the base class uses it.
        quick_mode (bool): When True, only the ``ajax/illust/<id>/pages`` endpoint is requested, so each result carries just id, width and height (no tags, no detailed info). This is faster. Default set to False.
        info_page_batch_num (int, None): Batch size used when downloading the detailed info pages. A delay is inserted after each batch.
        info_page_batch_delay (float, Callable): Delay after each batch of detailed info pages is downloaded.
    """

    # NOTE(review): the CrawlerSettings() / Cookies() defaults below are evaluated once,
    # when the class is defined, so parsers built without these arguments share them.
    def __init__(
        self,
        member_id: str,
        station_url: str="https://www.pixiv.net/",
        crawler_settings: CrawlerSettings=CrawlerSettings(),
        cookies: Optional[Union[Cookies, list, dict, str]]=Cookies(),
        thread_delay: Union[float, Callable]=0,
        quick_mode: bool=False,
        info_page_batch_num: Optional[int]=100,
        info_page_batch_delay: Union[float, Callable]=300,
    ):
        super().__init__(
            station_url=station_url,
            crawler_settings=crawler_settings,
            cookies=cookies,
        )
        self.member_id = member_id
        self.thread_delay = thread_delay
        self.quick_mode = quick_mode
        self.info_page_batch_num = info_page_batch_num
        self.info_page_batch_delay = info_page_batch_delay


    def run(self) -> list[ImageInfo]:
        """
        The main function that runs the Parser and returns a list of :class:`image_crawler_utils.ImageInfo`.

        Raises:
            ValueError: If the cookies are empty.
        """

        if self.thread_delay == 0:
            # Pixiv does not accept frequent requests, so default the delay to 1.0 * thread num.
            self.thread_delay = self.crawler_settings.download_config.thread_num * 1.0

        # Check the cookies first so no session is opened when they are missing.
        if self.cookies.is_none():
            raise ValueError('Cookies cannot be empty!')

        with requests.Session() as session:
            session.cookies.update(self.cookies.cookies_dict)
            self.get_image_ids(session=session)
            if self.quick_mode:
                return self.get_image_info_quick(session=session)
            return self.get_image_info_full(session=session)


    ##### Custom funcs


    # Get Pixiv ajax API image IDs
    def get_image_ids(self, session: Optional[requests.Session]=None) -> list[str]:
        """
        Request the member's works list and store the artwork IDs in ``self.image_ids``.

        Args:
            session (requests.Session, None): Session used for the request. When None, a temporary session carrying this parser's cookies is created and closed afterwards.

        Returns:
            The artwork IDs sorted from newest (largest ID) to oldest.

        Raises:
            ValueError: If the response reports an error, or its ``body.illusts`` is not a dict.
        """

        own_session = self._new_session() if session is None else None
        active_session = own_session if session is None else session
        try:
            latest_works_url = f"{self.station_url}ajax/user/{self.member_id}/works/latest"
            self.crawler_settings.log.info(f"Downloading info of images uploaded by Pixiv member ID: {self.member_id} ...")

            # Update headers for json download
            json_user_page_headers = self._with_referer(self._base_headers(), "www.pixiv.net")

            # Get and parse json page info
            user_page_json_content = self.request_page_content(
                url=latest_works_url,
                session=active_session,
                headers=json_user_page_headers,
            )
        finally:
            # Only close a session this method created; a caller's session stays open.
            if own_session is not None:
                own_session.close()

        content_dict = json.loads(user_page_json_content)

        # Errors detected!
        if content_dict["error"] is True:
            error_msg = f'An error happens in [repr.url]{latest_works_url}[reset].'
            self.crawler_settings.log.critical(error_msg, extra={"markup": True})
            raise ValueError(error_msg)

        illust_dict = content_dict["body"]["illusts"]
        if not isinstance(illust_dict, dict):
            error_msg = f'Illustrations not detected in [repr.url]{latest_works_url}[reset].'
            self.crawler_settings.log.critical(error_msg, extra={"markup": True})
            raise ValueError(error_msg)

        # Sort from newest to oldest
        self.image_ids = sorted(illust_dict, key=int, reverse=True)
        return self.image_ids


    # Get image info: full
    def get_image_info_full(self, session: Optional[requests.Session]=None) -> list[ImageInfo]:
        """
        Build the image list with detailed info (tags and the full ``ajax/illust/<id>`` body).

        Requires ``self.image_ids``, which ``get_image_ids()`` sets.

        Args:
            session (requests.Session, None): Session used for the detailed info pages. When None, a temporary session carrying this parser's cookies is created and closed afterwards.

        Returns:
            One :class:`image_crawler_utils.ImageInfo` per image page. When the detailed info of an artwork could not be downloaded, its images are still returned with empty ``tags`` and ``info``.
        """

        own_session = self._new_session() if session is None else None
        active_session = own_session if session is None else session
        try:
            image_info_dict = self._request_image_info(active_session)
        finally:
            # The page-URL download below does not receive the session, so release it now.
            if own_session is not None:
                own_session.close()

        json_image_url_page_contents = self._request_image_pages()
        self.image_info_list = self._parse_image_pages(json_image_url_page_contents, image_info_dict)
        return self.image_info_list


    # Get image info: quick
    def get_image_info_quick(self, session: Optional[requests.Session]=None) -> list[ImageInfo]:
        """
        Build the image list from the ``ajax/illust/<id>/pages`` endpoint only (id, width, height).

        Requires ``self.image_ids``, which ``get_image_ids()`` sets.

        Args:
            session (requests.Session, None): Accepted so the signature matches ``get_image_info_full()``. It is not used, because the only request made here does not take a session.

        Returns:
            One :class:`image_crawler_utils.ImageInfo` per image page.
        """

        json_image_url_page_contents = self._request_image_pages()
        self.image_info_list = self._parse_image_pages(json_image_url_page_contents)
        return self.image_info_list


    ##### Private helpers


    # Create a session that carries this parser's cookies; the caller must close it.
    def _new_session(self) -> requests.Session:
        session = requests.Session()
        session.cookies.update(self.cookies.cookies_dict)
        return session


    # Return the configured result headers, or generated browser headers when none are configured.
    def _base_headers(self):
        result_headers = self.crawler_settings.download_config.result_headers
        if result_headers is not None:
            return result_headers

        # Pixiv must have user-agents!
        ua = ua_generator.generate(browser=('chrome', 'edge'))
        ua.headers.accept_ch('Sec-CH-UA-Platform-Version, Sec-CH-UA-Full-Version-List')
        return ua.headers.get()


    # Merge a Referer into the headers; ChainMap reads base_headers first, so a Referer already set there wins.
    @staticmethod
    def _with_referer(base_headers, referer: str) -> dict:
        return dict(ChainMap(base_headers, {"Referer": referer}))


    # Download the ajax/illust/<id> pages and return their bodies keyed by artwork ID.
    def _request_image_info(self, session: requests.Session) -> dict:
        # Update headers for illust detection
        base_headers = self._base_headers()
        json_image_url_page_headers = [
            self._with_referer(base_headers, f"www.pixiv.net/artworks/{artwork_id}")
            for artwork_id in self.image_ids
        ]

        # Get and parse json page info
        self.crawler_settings.log.info("Downloading image info for every Pixiv ID...")
        json_image_info_urls = [f'{self.station_url}ajax/illust/{artwork_id}' for artwork_id in self.image_ids]
        download_config = self.crawler_settings.download_config
        json_image_info_page_contents = self.threading_request_page_content(
            json_image_info_urls,
            restriction_num=self.crawler_settings.capacity_count_config.image_num,
            session=session,
            headers=json_image_url_page_headers,
            # Force not to be lower than a certain threshold
            thread_delay=max(1.0 * download_config.thread_num, download_config.result_thread_delay),
            batch_num=self.info_page_batch_num,
            batch_delay=self.info_page_batch_delay,
        )

        image_info_dict = {}
        for content in json_image_info_page_contents:
            if content is None:  # Empty page!
                continue
            parsed_content = json.loads(content)
            body = parsed_content.get("body") if isinstance(parsed_content, dict) else None
            # A response without a dict body carrying an "id" holds no usable info; skip it
            # like an empty page instead of crashing the whole run.
            if not isinstance(body, dict) or "id" not in body:
                continue
            image_info_dict[body["id"]] = body
        return image_info_dict


    # Download the ajax/illust/<id>/pages pages, which list the image URLs and sizes.
    def _request_image_pages(self) -> list:
        self.crawler_settings.log.info("Downloading image URLs for every Pixiv ID...")
        json_image_download_urls = [f'{self.station_url}ajax/illust/{artwork_id}/pages' for artwork_id in self.image_ids]
        # It seems that pixiv has less restrictions on crawling this type of pages, so no batch download is set.
        return self.nodriver_threading_request_page_content(
            json_image_download_urls,
            restriction_num=self.crawler_settings.capacity_count_config.image_num,
            is_json=True,
            deconstruct_browser=True,
        )


    # Turn downloaded "pages" contents into ImageInfo objects; image_info_dict (full mode only) adds tags and info.
    def _parse_image_pages(self, page_contents: list, image_info_dict: Optional[dict]=None) -> list[ImageInfo]:
        self.crawler_settings.log.info('Parsing image info...')
        image_info_list = []
        missing_info_ids = set()

        with ProgressGroup(panel_title="Parsing Image Info") as progress_group:
            progress = progress_group.main_count_bar
            task = progress.add_task(description="Parsing image info pages:", total=len(page_contents))
            for content in page_contents:
                if content is not None:  # None means an empty page!
                    for image_url_size in json.loads(content)["body"]:
                        original_url = image_url_size["urls"]["original"]
                        file_name = original_url.split('/')[-1]
                        # The file name starts with the artwork ID, followed by "_" and a page suffix.
                        image_id = file_name.split('_')[0]
                        info = {
                            "id": image_id,
                            "width": image_url_size["width"],
                            "height": image_url_size["height"],
                        }
                        if image_info_dict is not None:
                            detail = image_info_dict.get(image_id)
                            if detail is None:
                                # The detailed info page failed to download; keep the image anyway.
                                missing_info_ids.add(image_id)
                                info["tags"] = []
                                info["info"] = {}
                            else:
                                info["tags"] = [item["tag"] for item in detail["tags"]["tags"]]
                                info["info"] = detail
                        image_info_list.append(ImageInfo(
                            url=original_url,
                            name=file_name,
                            info=info,
                        ))
                # Advance for skipped pages too, so the bar reaches its total.
                progress.update(task, advance=1)
            progress.update(task, description="[green]Parsing image info pages finished!")

        if missing_info_ids:
            self.crawler_settings.log.info(
                f"Detailed info is missing for {len(missing_info_ids)} Pixiv ID(s); "
                f"their tags are left empty: {', '.join(sorted(missing_info_ids))}"
            )
        return image_info_list