Source code for image_crawler_utils.stations.pixiv.parser_assets.user_parser

from typing import Optional, Union
from collections.abc import Callable

import json
from collections import ChainMap
import requests
import ua_generator

from image_crawler_utils import Cookies, Parser, ImageInfo, CrawlerSettings
from image_crawler_utils.progress_bar import ProgressGroup



##### Pixiv User Parser



[docs]
class PixivUserParser(Parser):
    """
    Args:
        crawler_settings (image_crawler_utils.CrawlerSettings): The CrawlerSettings used in this Parser.
        member_id: Pixiv ID of the user.
        station_url (str): The URL of the main page of a website.

            + This parameter works when several websites use the same structure. For example, https://yande.re/ and https://konachan.com/ both use Moebooru to build their websites, and this parameter must be filled to deal with these sites respectively.
            + For websites like https://www.pixiv.net/, as no other website uses its structure, this parameter has already been initialized and does not need to be filled.

        use_keyword_include (bool): Use a new keyword string whose search results contain all images belonging to the original keyword string's results. Defaults to False. (NOTE: the ``__init__`` of this class does not accept this parameter.)

            + Example: search "A" can contain all results by "A and B"
            
        cookies (image_crawler_utils.Cookies, str, dict, list, None): Cookies containing logging information.
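        quick_mode (bool): Do NOT download the detailed info (e.g. tags) of any image; only image URLs and sizes are fetched. This speeds up parsing.
        info_page_batch_num (int, None): Number of image info pages downloaded in one batch. After a batch finishes, the Parser waits for a rather long time (see ``info_page_batch_delay``).
        info_page_batch_delay (float, Callable): Delay time after each batch of image info pages is downloaded.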
    """

    def __init__(
        self,
        member_id: str,
        station_url: str="https://www.pixiv.net/",
        crawler_settings: CrawlerSettings=CrawlerSettings(),
        cookies: Optional[Union[Cookies, list, dict, str]]=Cookies(),
        thread_delay: Union[float, Callable]=0,
        quick_mode: bool=False,
        info_page_batch_num: Optional[int]=100,
        info_page_batch_delay: Union[float, Callable]=300,
    ):
        """
        Forward the shared options to the base Parser and store the Pixiv-specific ones.

        Args:
            member_id: Pixiv ID of the user whose works are parsed.
            station_url: Passed to the base Parser unchanged.
            crawler_settings: Passed to the base Parser unchanged.
            cookies: Passed to the base Parser unchanged.
            thread_delay: Stored as ``self.thread_delay``; ``run()`` replaces a value of 0.
            quick_mode: Stored as ``self.quick_mode``; selects the quick or full info method in ``run()``.
            info_page_batch_num: Stored as ``self.info_page_batch_num``.
            info_page_batch_delay: Stored as ``self.info_page_batch_delay``.
        """
        # NOTE(review): the CrawlerSettings() and Cookies() defaults are evaluated once, when
        # this function is defined, so every call relying on them receives the same objects.
        # Whether the base Parser copies them is not visible here - confirm.
        shared_options = {
            "station_url": station_url,
            "crawler_settings": crawler_settings,
            "cookies": cookies,
        }
        super().__init__(**shared_options)

        # Batch behaviour of the image info page requests.
        self.info_page_batch_delay = info_page_batch_delay
        self.info_page_batch_num = info_page_batch_num

        # What to crawl and how.
        self.quick_mode = quick_mode
        self.thread_delay = thread_delay
        self.member_id = member_id



[docs]
    def run(self) -> list[ImageInfo]:
        """
        Run the Parser: collect the image IDs of the member, then build and return
        a list of :class:`image_crawler_utils.ImageInfo`.

        Returns:
            The result of ``get_image_info_quick()`` when ``self.quick_mode`` is set,
            otherwise the result of ``get_image_info_full()``.

        Raises:
            ValueError: If ``self.cookies`` is empty.
        """
        # Pixiv does not accept frequent requests: an unset (zero) delay becomes 1.0 * thread num.
        if self.thread_delay == 0:
            self.thread_delay = self.crawler_settings.download_config.thread_num * 1.0

        with requests.Session() as session:
            # Guard clause: nothing can be requested without cookies.
            if self.cookies.is_none():
                raise ValueError('Cookies cannot be empty!')
            session.cookies.update(self.cookies.cookies_dict)

            # Fills self.image_ids, which both info methods read.
            self.get_image_ids(session=session)

            fetch_image_info = self.get_image_info_quick if self.quick_mode else self.get_image_info_full
            return fetch_image_info(session=session)



    ##### Custom funcs
    

    # Get Pixiv ajax API image IDs

[docs]
    def get_image_ids(self, session: requests.Session=None) -> list[str]:
        """
        Download the works list of ``self.member_id`` and extract the illustration IDs.

        The result is also stored in ``self.image_ids``.

        Args:
            session: Session used for the request. When None, a temporary session carrying
                ``self.cookies`` is created and closed again before this method returns.

        Returns:
            The illustration IDs as strings, sorted by numeric value from largest (newest)
            to smallest (oldest).

        Raises:
            ValueError: If the page could not be downloaded, the response reports an error,
                or its ``body.illusts`` entry is not a dict.
        """
        works_url = f"{self.station_url}ajax/user/{self.member_id}/works/latest"

        # Only close the session when it was created here; a caller-provided one stays open.
        owns_session = session is None
        if owns_session:
            session = requests.Session()
        try:
            if owns_session:
                session.cookies.update(self.cookies.cookies_dict)

            self.crawler_settings.log.info(f"Downloading info of images uploaded by Pixiv member ID: {self.member_id} ...")

            # Update headers for json download. Pixiv must have user-agents!
            base_headers = self.crawler_settings.download_config.result_headers
            if base_headers is None:
                ua = ua_generator.generate(browser=('chrome', 'edge'))
                ua.headers.accept_ch('Sec-CH-UA-Platform-Version, Sec-CH-UA-Full-Version-List')
                base_headers = ua.headers.get()
            # ChainMap gives priority to its first mapping, so a "Referer" key already
            # present in base_headers overrides the fallback one.
            json_user_page_headers = dict(ChainMap(base_headers, {"Referer": "www.pixiv.net"}))

            # Get json page info
            user_page_json_content = self.request_page_content(
                url=works_url,
                session=session,
                headers=json_user_page_headers,
            )
        finally:
            if owns_session:
                session.close()

        # A missing page would otherwise surface as an opaque TypeError from json.loads().
        if user_page_json_content is None:
            error_msg = f'Failed to download [repr.url]{works_url}[reset].'
            self.crawler_settings.log.critical(error_msg, extra={"markup": True})
            raise ValueError(error_msg)

        # Errors detected!
        content_dict = json.loads(user_page_json_content)
        if content_dict["error"] is True:
            error_msg = f'An error happens in [repr.url]{works_url}[reset].'
            self.crawler_settings.log.critical(error_msg, extra={"markup": True})
            raise ValueError(error_msg)

        illust_dict = content_dict["body"]["illusts"]
        if not isinstance(illust_dict, dict):
            error_msg = f'Illustrations not detected in [repr.url]{works_url}[reset].'
            self.crawler_settings.log.critical(error_msg, extra={"markup": True})
            raise ValueError(error_msg)

        # Sort from newest to oldest
        self.image_ids = sorted(illust_dict, key=int, reverse=True)
        return self.image_ids

    

    # Get image info: full

[docs]
    def get_image_info_full(self, session: requests.Session=None) -> list[ImageInfo]:
        """
        Build the ImageInfo list for every ID in ``self.image_ids``, including tags and
        the full info body of each artwork.

        Two kinds of pages are requested per artwork: ``ajax/illust/<id>`` (info) and
        ``ajax/illust/<id>/pages`` (image URLs and sizes). The result is also stored in
        ``self.image_info_list``.

        Args:
            session: Session used for the info page requests. When None, a temporary session
                carrying ``self.cookies`` is created and closed once those requests are done.

        Returns:
            One ImageInfo per image listed in the downloaded ``/pages`` responses. When the
            info page of an artwork is unavailable, its images are still returned, with empty
            ``"tags"`` and ``"info"`` entries.
        """
        download_config = self.crawler_settings.download_config
        image_num = self.crawler_settings.capacity_count_config.image_num

        # Only close the session when it was created here; a caller-provided one stays open.
        owns_session = session is None
        if owns_session:
            session = requests.Session()
        try:
            if owns_session:
                session.cookies.update(self.cookies.cookies_dict)

            # Update headers for illust detection. Pixiv must have user-agents!
            base_headers = download_config.result_headers
            if base_headers is None:
                ua = ua_generator.generate(browser=('chrome', 'edge'))
                ua.headers.accept_ch('Sec-CH-UA-Platform-Version, Sec-CH-UA-Full-Version-List')
                base_headers = ua.headers.get()
            # One header dict per artwork; ChainMap lets a "Referer" in base_headers win.
            json_image_url_page_headers = [
                dict(ChainMap(base_headers, {"Referer": f"www.pixiv.net/artworks/{artwork_id}"}))
                for artwork_id in self.image_ids
            ]

            # Get json info pages
            self.crawler_settings.log.info("Downloading image info for every Pixiv ID...")
            json_image_info_urls = [f'{self.station_url}ajax/illust/{artwork_id}'
                                    for artwork_id in self.image_ids]
            json_image_info_page_contents = self.threading_request_page_content(
                json_image_info_urls,
                restriction_num=image_num,
                session=session,
                headers=json_image_url_page_headers,
                # Force not to be lower than a certain threshold
                thread_delay=max(1.0 * download_config.thread_num, download_config.result_thread_delay),
                batch_num=self.info_page_batch_num,
                batch_delay=self.info_page_batch_delay,
            )
        finally:
            if owns_session:
                session.close()

        # Map artwork ID -> info body. Empty pages and responses without a usable body are skipped.
        image_info_dict = {}
        for content in json_image_info_page_contents:
            if content is None:  # Empty page!
                continue
            parsed_content = json.loads(content)
            body = parsed_content.get("body") if isinstance(parsed_content, dict) else None
            if not isinstance(body, dict) or "id" not in body:
                continue
            # Keys are looked up below by the ID string taken from the image file name.
            image_info_dict[str(body["id"])] = body

        # Get json pages with image URLs
        self.crawler_settings.log.info("Downloading image URLs for every Pixiv ID...")
        json_image_download_urls = [f'{self.station_url}ajax/illust/{artwork_id}/pages'
                                    for artwork_id in self.image_ids]
        json_image_url_page_contents = self.nodriver_threading_request_page_content(
            json_image_download_urls,
            restriction_num=image_num,
            is_json=True,
            deconstruct_browser=True,
            # It seems that pixiv has less restrictions on crawling this type of pages, so no batch download is set.
        )

        self.crawler_settings.log.info('Parsing image info...')
        image_info_list = []
        ids_without_info = set()
        with ProgressGroup(panel_title="Parsing Image Info") as progress_group:
            progress = progress_group.main_count_bar
            task = progress.add_task(description="Parsing image info pages:", total=len(json_image_url_page_contents))
            for content in json_image_url_page_contents:
                if content is not None:  # Empty pages contribute no images.
                    parsed_content = json.loads(content)
                    for image_url_size in parsed_content["body"]:
                        original_url = image_url_size["urls"]["original"]
                        file_name = original_url.split('/')[-1]
                        # The part of the file name before the first "_" is the artwork ID.
                        image_id = file_name.split('_')[0]

                        artwork_info = image_info_dict.get(image_id)
                        if artwork_info is None:
                            # The info page failed; keep the image instead of aborting with KeyError.
                            if image_id not in ids_without_info:
                                ids_without_info.add(image_id)
                                self.crawler_settings.log.warning(f"Image info of Pixiv ID {image_id} is not available; its tags and info are left empty.")
                            tags, artwork_info = [], {}
                        else:
                            tags = [item["tag"] for item in artwork_info["tags"]["tags"]]

                        image_info_list.append(ImageInfo(
                            url=original_url,
                            name=file_name,
                            info={
                                "id": image_id,
                                "width": image_url_size["width"],
                                "height": image_url_size["height"],
                                "tags": tags,
                                "info": artwork_info,
                            },
                        ))
                # Every page counts as handled, so the bar reaches its total even with empty pages.
                progress.update(task, advance=1)

            progress.update(task, description="[green]Parsing image info pages finished!")

        self.image_info_list = image_info_list
        return self.image_info_list



    # Get image info: quick

[docs]
    def get_image_info_quick(self, session: requests.Session=None) -> list[ImageInfo]:
        """
        Build the ImageInfo list for every ID in ``self.image_ids`` from the
        ``ajax/illust/<id>/pages`` responses only (no tags, no info body).

        The result is also stored in ``self.image_info_list``.

        Args:
            session: Not used by this method; the pages are requested through
                ``nodriver_threading_request_page_content()``. The parameter is kept so the
                signature matches ``get_image_info_full()``.

        Returns:
            One ImageInfo per image listed in the downloaded responses, whose ``info``
            holds the keys ``"id"``, ``"width"`` and ``"height"``.
        """
        # No session is created here: nothing below would use it, and it would never be closed.

        # Get json pages with image URLs
        self.crawler_settings.log.info("Downloading image URLs for every Pixiv ID...")
        json_image_download_urls = [f'{self.station_url}ajax/illust/{artwork_id}/pages'
                                    for artwork_id in self.image_ids]
        json_image_url_page_contents = self.nodriver_threading_request_page_content(
            json_image_download_urls,
            restriction_num=self.crawler_settings.capacity_count_config.image_num,
            is_json=True,
            deconstruct_browser=True,
            # It seems that pixiv has less restrictions on crawling this type of pages, so no batch download is set.
        )

        self.crawler_settings.log.info('Parsing image info...')
        image_info_list = []
        with ProgressGroup(panel_title="Parsing Image Info") as progress_group:
            progress = progress_group.main_count_bar
            task = progress.add_task(description="Parsing image info pages:", total=len(json_image_url_page_contents))
            for content in json_image_url_page_contents:
                if content is not None:  # Empty pages contribute no images.
                    parsed_content = json.loads(content)
                    for image_url_size in parsed_content["body"]:
                        original_url = image_url_size["urls"]["original"]
                        file_name = original_url.split('/')[-1]
                        image_info_list.append(ImageInfo(
                            url=original_url,
                            name=file_name,
                            info={
                                # The part of the file name before the first "_" is the artwork ID.
                                "id": file_name.split('_')[0],
                                "width": image_url_size["width"],
                                "height": image_url_size["height"],
                            },
                        ))
                # Every page counts as handled, so the bar reaches its total even with empty pages.
                progress.update(task, advance=1)

            progress.update(task, description="[green]Parsing image info pages finished!")

        self.image_info_list = image_info_list
        return self.image_info_list