Source code for image_crawler_utils.stations.pixiv.parser_assets.keyword_parser

from typing import Optional, Union

import time, random
import json
from collections import ChainMap
from collections.abc import Callable
import requests
import ua_generator

from urllib import parse

from image_crawler_utils import Cookies, KeywordParser, ImageInfo, CrawlerSettings
from image_crawler_utils.keyword import KeywordLogicTree, min_len_keyword_group, construct_keyword_tree_from_list
from image_crawler_utils.progress_bar import CustomProgress, ProgressGroup

from .search_settings import PixivSearchSettings



##### Pixiv Keyword Parser



[docs]
class PixivKeywordParser(KeywordParser):
    """
    Args:
        crawler_settings (image_crawler_utils.CrawlerSettings): The CrawlerSettings used in this Parser.
        station_url (str): The URL of the main page of a website.

            + This parameter works when several websites use the same structure. For example, https://yande.re/ and https://konachan.com/ both use Moebooru to build their websites, and this parameter must be filled to deal with these sites respectively.
            + For websites like https://www.pixiv.net/, as no other website uses its structure, this parameter has already been initialized and does not need to be filled.

        standard_keyword_string (str): Query keyword string using standard syntax. Refer to the documentation for detailed instructions.
        pixiv_search_settings (image_crawler_utils.stations.pixiv.PixivSearchSettings): A PixivSearchSettings class that contains extra options when searching.
        keyword_string (str, None): If you want to directly specify the keywords used in searching, set ``keyword_string`` to a custom non-empty string. It will OVERWRITE ``standard_keyword_string``.

            + For example, set ``keyword_string`` to ``"kuon_(utawarerumono) rating:safe"`` in DanbooruKeywordParser means searching directly with this string in Danbooru, and its standard keyword string equivalent is ``"kuon_(utawarerumono) AND rating:safe"``.

        use_keyword_include (bool): Use a new keyword string whose search results contain all images that belong to the original keyword string's results. Defaults to False.

            + Example: search "A" can contain all results by "A and B"

        cookies (image_crawler_utils.Cookies, str, dict, list, None): Cookies containing logging information.
        thread_delay (float, Callable, None): As Pixiv restricts number of requests in a certain period, this argument defines the delay time (seconds) before every downloading thread of websites.
        quick_mode (bool): Only collect the basic information.

            + Pixiv has a strict anti-crawling restriction on acquiring the pages containing information of images. Setting this parameter to :py:data:`True` skips requesting these pages and collects only the basic information of images for downloading.
            + Different Parsers may have different structures of image information. Refer to the [ImageInfo Structure](#imageinfo-structure-4) chapter for the difference between results.
                + If set to :py:data:`False` (get full information), then the ``thread_delay`` when downloading information pages will be forced to be set to no lower than ``CrawlerSettings.download_config.thread_num * 1.0``. Other pages are not affected.

        info_page_batch_num (int): After downloading ``info_page_batch_num`` number of image information pages, the crawler will wait for ``info_page_batch_delay`` seconds before continuing.
        info_page_batch_delay (float, Callable): After downloading ``info_page_batch_num`` number of image information pages, the crawler will wait for ``info_page_batch_delay`` seconds before continuing.

            + If ``quick_mode`` is set to :py:data:`True`, both ``info_page_batch_num`` and ``info_page_batch_delay`` will be ignored.
            + If you are not sure, leaving both ``info_page_batch_num`` and ``info_page_batch_delay`` blank (using their default values) is likely enough to prevent your account from being suspended.
            + ``info_page_batch_delay`` can be a function that will be called for every usage.
    """

    def __init__(
        self,
        station_url: str="https://www.pixiv.net/",
        crawler_settings: CrawlerSettings=CrawlerSettings(),
        standard_keyword_string: Optional[str]=None,
        keyword_string: Optional[str]=None,
        cookies: Optional[Union[Cookies, list, dict, str]]=Cookies(),
        pixiv_search_settings: PixivSearchSettings=PixivSearchSettings(),
        use_keyword_include: bool=False,
        quick_mode: bool=False,
        info_page_batch_num: Optional[int]=100,
        info_page_batch_delay: Union[float, Callable]=300,
    ):
        """
        Forward the generic parser arguments to the base class and store the Pixiv-only options.
        """
        # NOTE(review): the CrawlerSettings(), Cookies() and PixivSearchSettings() defaults
        # are evaluated once, when this function is defined, so every parser that relies
        # on the defaults shares the same instances.

        # Arguments understood by the generic KeywordParser.
        base_arguments = dict(
            station_url=station_url,
            crawler_settings=crawler_settings,
            standard_keyword_string=standard_keyword_string,
            keyword_string=keyword_string,
            cookies=cookies,
        )
        super().__init__(**base_arguments)

        # Search behaviour.
        self.pixiv_search_settings = pixiv_search_settings
        self.use_keyword_include = use_keyword_include

        # Information-page throttling; the batch values are only used when quick_mode is False.
        self.quick_mode = quick_mode
        self.info_page_batch_num = info_page_batch_num
        self.info_page_batch_delay = info_page_batch_delay



[docs]
    def run(self) -> list[ImageInfo]:
        """
        The main function that runs the Parser and returns a list of :class:`image_crawler_utils.ImageInfo`.

        The steps are: build the keyword string (unless one was given), fetch the number
        of JSON result pages, build their URLs, collect the basic info of every artwork
        and finally resolve the image URLs in quick or full mode.

        Returns:
            A list of :class:`image_crawler_utils.ImageInfo`.

        Raises:
            ValueError: If ``self.cookies`` is empty.
        """
        # Fail before any request is sent: generating the keyword (include) string
        # already queries Pixiv, which is pointless without cookies.
        if self.cookies.is_none():
            raise ValueError('Cookies cannot be empty!')

        if self.keyword_string is None:
            if self.use_keyword_include:
                self.generate_keyword_string_include()
            else:
                self.generate_keyword_string()

        with requests.Session() as session:
            session.cookies.update(self.cookies.cookies_dict)
            # Reuse the session opened here instead of letting get_json_page_num()
            # create (and never close) another one.
            self.get_json_page_num(session=session)
            self.get_json_page_urls()
            self.get_image_basic_info(session=session)
            if self.quick_mode:
                return self.get_image_info_quick(session=session)
            return self.get_image_info_full(session=session)



    ##### Custom funcs

    
    # Generate keyword string from keyword tree
    def __build_keyword_str(self, tree: KeywordLogicTree) -> str:
        """
        Recursively convert a keyword tree into a Pixiv search string.

        String children have every ``_`` and ``*`` removed (Pixiv does not support them);
        non-string children are rendered recursively. ``AND`` joins both sides with a space,
        ``OR`` joins them with `` OR ``, ``NOT`` negates the right child and ``SINGLE``
        returns the right child alone. Any other operator yields None.
        """
        # Deletion table for the characters Pixiv does not support.
        unsupported_chars = str.maketrans('', '', '_*')

        def render(child):
            # Plain keywords are cleaned up; sub-trees are rendered recursively.
            if not isinstance(child, str):
                return self.__build_keyword_str(child)
            return child.translate(unsupported_chars)

        # Both children are always rendered, left first, whatever the operator is.
        left_part = render(tree.lchild)
        right_part = render(tree.rchild)

        operator = tree.logic_operator
        if operator == "AND":
            return f'({left_part} {right_part})'
        if operator == "OR":
            return f'({left_part} OR {right_part})'
        if operator == "NOT":
            return f'(-{right_part})'
        if operator == "SINGLE":
            return f'{right_part}'
        return None


    # Basic keyword string

[docs]
    def generate_keyword_string(self) -> str:
        """
        Build the keyword string from ``self.keyword_tree``, store it in ``self.keyword_string`` and return it.
        """
        built_string = self.__build_keyword_str(self.keyword_tree)
        self.keyword_string = built_string
        return self.keyword_string



    # Keyword (include) string

[docs]
    def generate_keyword_string_include(self) -> str:
        """
        Pick the keyword (include) string that matches the fewest artworks.

        Every candidate group is rendered to a keyword string and its first JSON page
        is requested; the candidate with the smallest artwork count is stored in
        ``self.keyword_string`` and returned.

        Candidates are compared by ``self.artworks_num`` (set by ``get_json_page_num()``),
        not by page count, so the logged "image num" is really a number of images.

        If there is no candidate group, this falls back to ``generate_keyword_string()``.

        Returns:
            The selected keyword string.
        """
        keyword_group = min_len_keyword_group(self.keyword_tree.keyword_include_group_list())
        keyword_strings = [self.__build_keyword_str(construct_keyword_tree_from_list(group, log=self.crawler_settings.log))
                           for group in keyword_group]

        if not keyword_strings:
            # Nothing to compare; the full keyword string trivially includes all of its own results.
            self.crawler_settings.log.warning("No keyword (include) group is available. Using the standard keyword string instead.")
            return self.generate_keyword_string()

        min_image_num = None
        min_string = None

        self.crawler_settings.log.info("Testing the image num of keyword (include) groups to find the one with fewest pages.")
        # One session is shared by all test requests and closed afterwards.
        with requests.Session() as session:
            session.cookies.update(self.cookies.cookies_dict)
            with CustomProgress(transient=True) as progress:
                task = progress.add_task(description="Requesting pages:", total=len(keyword_strings))
                for string in keyword_strings:
                    self.crawler_settings.log.debug(f'Testing the image num of keyword string: {string}')
                    # get_json_page_num() reads self.keyword_string, so it has to be set first.
                    self.keyword_string = string
                    self.get_json_page_num(session=session)
                    image_num = self.artworks_num
                    self.crawler_settings.log.debug(f'The image num of {string} is {image_num}.')
                    if min_image_num is None or image_num < min_image_num:
                        min_image_num = image_num
                        min_string = string
                    progress.update(task, advance=1)

                progress.update(task, description="[green]Requesting pages finished!")

        self.keyword_string = min_string
        self.crawler_settings.log.info(f'The keyword string the parser will use is "{self.keyword_string}" which has {min_image_num} {"images" if min_image_num != 1 else "image"}.')
        return self.keyword_string




[docs]
    def get_json_page_num(self, session: requests.Session=None) -> int:
        """
        Request the first JSON search page and record the number of artworks and result pages.

        Args:
            session (requests.Session, None): Session used for the request. If None, a temporary session carrying ``self.cookies`` is created and closed before returning.

        Returns:
            The number of JSON result pages (0 when there are no artworks). The counts are also stored in ``self.json_page_num`` and ``self.artworks_num``.

        Raises:
            ConnectionError: If the first JSON page cannot be fetched.
        """
        base_headers = self.crawler_settings.download_config.result_headers
        if base_headers is None:  # Pixiv must have user-agents!
            ua = ua_generator.generate(browser=('chrome', 'edge'))
            ua.headers.accept_ch('Sec-CH-UA-Platform-Version, Sec-CH-UA-Full-Version-List')
            base_headers = ua.headers.get()
        json_search_page_headers = dict(ChainMap(base_headers, {"Referer": "www.pixiv.net"}))

        first_page_url = parse.quote(f"{self.station_url}{self.pixiv_search_settings.build_search_appending_str_json(self.keyword_string)}", safe='/:?=&')

        self.crawler_settings.log.info(f'Connecting to the first gallery page using keyword "{self.keyword_string}" and URL [repr.url]{first_page_url}[reset] ...', extra={"markup": True})

        # Only a session created here is closed here; a caller-supplied one stays open.
        owns_session = session is None
        if owns_session:
            session = requests.Session()
        try:
            if owns_session:
                session.cookies.update(self.cookies.cookies_dict)
            content = self.request_page_content(first_page_url, session=session, headers=json_search_page_headers)
        finally:
            if owns_session:
                session.close()

        if content is None:
            self.crawler_settings.log.critical(f"CANNOT connect to the first JSON page, URL: [repr.url]{first_page_url}[reset]", extra={"markup": True})
            # Built-in exceptions accept no keyword arguments, and the rich markup
            # is only meaningful to the logger, so the exception gets the plain URL.
            raise ConnectionError(f"CANNOT connect to the first JSON page, URL: {first_page_url}")
        else:
            self.crawler_settings.log.info(f'Successfully connected to the first JSON page.')

        parsed_content = json.loads(content)
        body = parsed_content["body"]
        # Reset first so that a response without any result list cannot leave the
        # counts of a previous call (or no attribute at all) behind.
        self.artworks_num = 0
        self.json_page_num = 0
        for image_list_type in ("illust", "illustManga", "manga"):
            if image_list_type in body:
                self.artworks_num = int(body[image_list_type]["total"])
                self.json_page_num = int(body[image_list_type]["lastPage"])

        if self.json_page_num == 1000:
            self.crawler_settings.log.warning("Number of result pages has reached 1000. Due to Pixiv restrictions, result in pages exceeding 1000 cannot be fetched through JSON API directly.")

        self.crawler_settings.log.info(f"Number of artworks: {self.artworks_num}")
        if self.artworks_num == 0:  # No result, no pages!
            self.json_page_num = 0
        return self.json_page_num

        

    # Get Pixiv ajax API json page URLs

[docs]
    def get_json_page_urls(self) -> list[str]:
        """
        Build the URL of every JSON result page (pages 1 to ``self.json_page_num``).

        The list is stored in ``self.json_page_urls`` and returned.
        """
        page_urls = []
        for page_num in range(1, self.json_page_num + 1):
            # Search path and query string for the current keyword, plus the page index.
            search_part = self.pixiv_search_settings.build_search_appending_str_json(self.keyword_string)
            unquoted_url = f"{self.station_url}{search_part}&p={page_num}"
            page_urls.append(parse.quote(unquoted_url, safe='/:?=&'))

        self.json_page_urls = page_urls
        return self.json_page_urls

    

    # Get image ID and basic info

[docs]
    def get_image_basic_info(self, session: requests.Session=None) -> dict:
        """
        Download every JSON result page and collect the basic info of each artwork.

        Pages whose result list is empty are requested again after a random delay
        of 1 to 2 minutes, until no page is empty. Pages that could not be fetched
        at all (content is None) are skipped.

        Args:
            session (requests.Session, None): Kept for interface compatibility. The pages are requested through ``nodriver_threading_request_page_content()``, which is not given a session here, so this argument is unused.

        Returns:
            A dict mapping artwork ID to its basic info, ordered by numeric ID from large to small. It is also stored in ``self.json_basic_info``.
        """
        self.crawler_settings.log.info("Downloading pages including Pixiv IDs...")

        # Get pages until all pages are fetched
        # NOTE(review): there is no retry limit, so a page that stays empty is retried indefinitely.
        empty_urls = self.json_page_urls.copy()
        json_basic_info = {}

        while len(empty_urls) > 0:
            download_urls = empty_urls.copy()
            empty_urls = []

            # Get and parse json page info
            json_page_contents = self.nodriver_threading_request_page_content(
                download_urls,
                restriction_num=self.crawler_settings.capacity_count_config.image_num,
                is_json=True,
                deconstruct_browser=True,
                # It seems that pixiv has less restrictions on crawling this type of pages, so no batch download is set.
            )

            # Get dict
            skipped_num = 0
            for url, content in zip(download_urls, json_page_contents):
                if content is None:  # Page not fetched; skipped like in the other parsing steps
                    skipped_num += 1
                    continue
                body = json.loads(content)["body"]
                page_is_empty = False
                for image_list_type in ("illust", "illustManga", "manga"):
                    if image_list_type not in body:
                        continue
                    image_data_list = body[image_list_type]["data"]
                    if len(image_data_list) > 0:
                        for image_data in image_data_list:
                            json_basic_info[image_data["id"]] = image_data
                    else:
                        page_is_empty = True
                # Queue the URL once, even if several result lists are empty.
                if page_is_empty:
                    empty_urls.append(url)

            if skipped_num > 0:
                self.crawler_settings.log.warning(f'{skipped_num} {"pages were" if skipped_num > 1 else "page was"} not fetched and will be skipped.')

            if len(empty_urls) > 0:
                self.crawler_settings.log.warning(f'{len(empty_urls)} {"pages are" if len(empty_urls) > 1 else "page is"} empty, possibly because requests were too frequent. Retrying to request pages in 1 to 2 minutes.')
                time.sleep(60 + random.random() * 60)

        # Sort with ID from large to small
        json_basic_info = dict(sorted(json_basic_info.items(), key=lambda item: int(item[0]), reverse=True))

        self.json_basic_info = json_basic_info
        return self.json_basic_info



    # Get image info: full

[docs]
    def get_image_info_full(self, session: requests.Session=None) -> list[ImageInfo]:
        """
        Collect the image URLs together with the full information of every artwork.

        For every ID in ``self.json_basic_info`` two pages are requested: the
        information page (``ajax/illust/<id>``) and the page listing the image URLs
        (``ajax/illust/<id>/pages``). Pages that could not be fetched are skipped.
        If the information page of an artwork is missing, its basic info from
        ``self.json_basic_info`` is used instead of failing.

        Args:
            session (requests.Session, None): Session used for the information pages. If None, a temporary session carrying ``self.cookies`` is created and closed once those pages are downloaded.

        Returns:
            A list of :class:`image_crawler_utils.ImageInfo`, also stored in ``self.image_info_list``.
        """
        artwork_ids = list(self.json_basic_info.keys())

        # Update headers for illust detection
        base_headers = self.crawler_settings.download_config.result_headers
        if base_headers is None:  # Pixiv must have user-agents!
            ua = ua_generator.generate(browser=('chrome', 'edge'))
            ua.headers.accept_ch('Sec-CH-UA-Platform-Version, Sec-CH-UA-Full-Version-List')
            base_headers = ua.headers.get()
        json_image_url_page_headers = [dict(ChainMap(base_headers, {"Referer": f"www.pixiv.net/artworks/{artwork_id}"}))
                                       for artwork_id in artwork_ids]

        # Get and parse json page info
        self.crawler_settings.log.info("Downloading image info for every Pixiv ID...")
        json_image_info_urls = [f'{self.station_url}ajax/illust/{artwork_id}'
                                for artwork_id in artwork_ids]

        # Only a session created here is closed here; a caller-supplied one stays open.
        owns_session = session is None
        if owns_session:
            session = requests.Session()
        try:
            if owns_session:
                session.cookies.update(self.cookies.cookies_dict)
            # NOTE(review): max() assumes result_thread_delay is a number — confirm it can never be None or a callable.
            json_image_info_page_contents = self.threading_request_page_content(
                json_image_info_urls,
                restriction_num=self.crawler_settings.capacity_count_config.image_num,
                session=session,
                headers=json_image_url_page_headers,
                thread_delay=max(1.0 * self.crawler_settings.download_config.thread_num, self.crawler_settings.download_config.result_thread_delay),  # Force not to be lower than a certain threshold in case the account get suspended because of too many requests
                batch_num=self.info_page_batch_num,
                batch_delay=self.info_page_batch_delay,
            )
        finally:
            if owns_session:
                session.close()

        image_info_dict = {}
        for content in json_image_info_page_contents:
            if content is None:  # Empty page!
                continue
            parsed_content = json.loads(content)
            image_info_dict[parsed_content["body"]["id"]] = parsed_content["body"]

        # Get and parse json page info
        self.crawler_settings.log.info("Downloading image URLs for every Pixiv ID...")
        json_image_download_urls = [f'{self.station_url}ajax/illust/{artwork_id}/pages'
                                    for artwork_id in artwork_ids]
        json_image_url_page_contents = self.nodriver_threading_request_page_content(
            json_image_download_urls,
            restriction_num=self.crawler_settings.capacity_count_config.image_num,
            is_json=True,
            deconstruct_browser=True,
            # It seems that pixiv has less restrictions on crawling this type of pages, so no batch download is set.
        )

        self.crawler_settings.log.info(f'Parsing image info...')
        image_info_list = []
        missing_info_ids = set()
        with ProgressGroup(panel_title="Parsing Image Info") as progress_group:
            progress = progress_group.main_count_bar
            task = progress.add_task(description="Parsing image info pages:", total=len(json_image_url_page_contents))
            for content in json_image_url_page_contents:
                if content is not None:  # None means an empty page, nothing to parse
                    parsed_content = json.loads(content)
                    for image_url_size in parsed_content["body"]:
                        original_url = image_url_size["urls"]["original"]
                        file_name = original_url.split('/')[-1]
                        # File names start with the artwork ID followed by "_".
                        image_id = file_name.split('_')[0]

                        detailed_info = image_info_dict.get(image_id)
                        if detailed_info is not None:
                            tags = [item["tag"] for item in detailed_info["tags"]["tags"]]
                        else:
                            # The information page was not fetched: use the basic info instead of raising KeyError.
                            missing_info_ids.add(image_id)
                            detailed_info = self.json_basic_info.get(image_id, {})
                            tags = detailed_info.get("tags", [])

                        image_info_list.append(ImageInfo(
                            url=original_url,
                            name=file_name,
                            info={
                                "id": image_id,
                                "width": image_url_size["width"],
                                "height": image_url_size["height"],
                                "tags": tags,
                                "info": detailed_info,
                            },
                        ))
                # Skipped pages count as processed too, so that the bar can reach its total.
                progress.update(task, advance=1)

            progress.update(task, description="[green]Parsing image info pages finished!")

        if missing_info_ids:
            self.crawler_settings.log.warning(f'Full information of {len(missing_info_ids)} {"artworks is" if len(missing_info_ids) > 1 else "artwork is"} missing. Basic information is used instead.')

        self.image_info_list = image_info_list
        return self.image_info_list



    # Get image info: quick

[docs]
    def get_image_info_quick(self, session: requests.Session=None) -> list[ImageInfo]:
        """
        Collect the image URLs of every artwork using only the basic information.

        For every ID in ``self.json_basic_info`` the page listing the image URLs
        (``ajax/illust/<id>/pages``) is requested; tags and info come from
        ``self.json_basic_info``. Pages that could not be fetched are skipped.

        Args:
            session (requests.Session, None): Kept for interface compatibility. The pages are requested through ``nodriver_threading_request_page_content()``, which is not given a session here, so this argument is unused.

        Returns:
            A list of :class:`image_crawler_utils.ImageInfo`, also stored in ``self.image_info_list``.
        """
        # Get and parse json page info
        self.crawler_settings.log.info("Downloading image URLs for every Pixiv ID...")
        json_image_download_urls = [f'{self.station_url}ajax/illust/{artwork_id}/pages'
                                    for artwork_id in self.json_basic_info.keys()]
        json_image_url_page_contents = self.nodriver_threading_request_page_content(
            json_image_download_urls,
            restriction_num=self.crawler_settings.capacity_count_config.image_num,
            is_json=True,
            deconstruct_browser=True,
            # It seems that pixiv has less restrictions on crawling this type of pages, so no batch download is set.
        )

        self.crawler_settings.log.info(f'Parsing image info...')
        image_info_list = []
        with ProgressGroup(panel_title="Parsing Image Info") as progress_group:
            progress = progress_group.main_count_bar
            task = progress.add_task(description="Parsing image info pages:", total=len(json_image_url_page_contents))
            for content in json_image_url_page_contents:
                if content is not None:  # None means an empty page, nothing to parse
                    parsed_content = json.loads(content)
                    for image_url_size in parsed_content["body"]:
                        original_url = image_url_size["urls"]["original"]
                        file_name = original_url.split('/')[-1]
                        # File names start with the artwork ID followed by "_".
                        image_id = file_name.split('_')[0]
                        basic_info = self.json_basic_info[image_id]
                        image_info_list.append(ImageInfo(
                            url=original_url,
                            name=file_name,
                            info={
                                "id": image_id,
                                "width": image_url_size["width"],
                                "height": image_url_size["height"],
                                "tags": basic_info["tags"],
                                "info": basic_info,
                            },
                        ))
                # Skipped pages count as processed too, so that the bar can reach its total.
                progress.update(task, advance=1)

            progress.update(task, description="[green]Parsing image info pages finished!")

        self.image_info_list = image_info_list
        return self.image_info_list