Source code for image_crawler_utils.classes.cookies

from __future__ import annotations
import dataclasses
from collections import Counter
from typing import Optional, Union

import copy
import json
import os, traceback
from rich import markup

import nodriver

from image_crawler_utils.log import Log
from image_crawler_utils.utils import check_dir



@dataclasses.dataclass(init=False, frozen=True)
class Cookies:
    """
    Convert format of cookies between nodriver, selenium, requests and string.

    Use ``Cookies(cookies_from_certain_source)`` or ``Cookies.load_from_json()`` to create a Cookies class.

    Use ``.cookies_nodriver`` / ``.cookies_selenium`` / ``.cookies_dict`` / ``.cookies_string`` to get the cookies of suitable format.

    Args:
        cookies (list, dict, str, None): Cookies generated from string, dict (requests), list (selenium or nodriver).

            + Leave blank (like ``Cookies()``) will create an empty cookies, whose ``.is_none()`` returns :py:data:`True`.
            + An empty / whitespace-only string, an empty dict and an empty list also create an empty cookies.

    Raises:
        ValueError: If ``cookies`` is not a str, dict, list or None.
    """

    cookies_nodriver: Optional[list[nodriver.cdp.network.Cookie]]
    """
    Cookies in nodriver form.

    This form of cookies can be generated by nodriver-related functions and classes, etc. A generation example is like:

    .. code-block:: python

        import nodriver
        from image_crawler_utils.utils import set_up_nodriver_browser
        from image_crawler_utils import Cookies

        async def nodriver_func():
            browser = await set_up_nodriver_browser()
            tab = await browser.get('https://foo.bar.com')
            # Some other process
            nodriver_cookies = await browser.cookies.get_all()
            return nodriver_cookies

        nodriver_cookies = nodriver.loop().run_until_complete(nodriver_func())
        cookies = Cookies(nodriver_cookies)
    """

    cookies_selenium: Optional[list[dict]]
    """
    Cookies in selenium form.

    This form of cookies can be generated by selenium-related functions and classes, etc. A generation example is like:

    .. code-block:: python

        from selenium import webdriver
        from image_crawler_utils import Cookies

        chrome_driver_path = '/path/to/chromedriver'
        chrome_browser = webdriver.Chrome(executable_path=chrome_driver_path)
        chrome_browser.get('https://foo.bar.com')
        # Some other process
        selenium_cookies = chrome_browser.get_cookies()  # A list of dicts
        cookies = Cookies(selenium_cookies)
    """

    cookies_dict: Optional[dict]
    """
    Cookies in dict form. Mostly for requests module usage.

    This form of cookies can be generated by :py:mod:`requests`-related functions and classes, or other cookie functions that generates a :py:class:`dict`, etc. A generation example is like:

    .. code-block:: python

        import requests
        from image_crawler_utils import Cookies

        session = requests.Session()
        # Some process that adds cookies to session
        requests_cookies = session.cookies.get_dict()  # A dict
        cookies = Cookies(requests_cookies)
    """

    cookies_string: Optional[str]
    """
    Cookies in string form.

    This form of cookies can be acquired by using Developer Mode (F12) in some browsers, etc. A generation example is like:

    .. code-block:: python

        from image_crawler_utils import Cookies

        cookies = Cookies("your_cookies_string")
    """

    def __init__(self, cookies: Optional[Union[list, dict, str]]=None):
        # The dataclass is frozen, so attributes must be set through object.__setattr__.
        object.__setattr__(self, "cookies_string", '')
        object.__setattr__(self, "cookies_dict", {})
        object.__setattr__(self, "cookies_selenium", [])
        object.__setattr__(self, "cookies_nodriver", None)

        if cookies is None:
            return
        if isinstance(cookies, str):
            # An empty / whitespace-only string is an empty Cookies, not an error.
            if cookies.strip():
                self.__create_by_string(cookies)
            return
        if isinstance(cookies, dict):
            if cookies:
                self.__create_by_dict(cookies)
            return
        if isinstance(cookies, list):
            if not cookies:
                return
            # A list of dicts is treated as selenium-form; any other list as nodriver-form.
            if isinstance(cookies[0], dict):
                self.__create_by_selenium(cookies)
            else:
                self.__create_by_nodriver(cookies)
            return
        raise ValueError("Cookies type not identifiable.")

    def __create_by_nodriver(self, cookies_nodriver: list[nodriver.cdp.network.Cookie]) -> None:
        """
        Input nodriver-type cookies. The other types will be automatically converted.

        Args:
            cookies_nodriver (list[nodriver.cdp.network.Cookie]): nodriver-generated cookies.
        """

        object.__setattr__(self, "cookies_nodriver", cookies_nodriver)
        object.__setattr__(self, "cookies_selenium", [cookie.to_json() for cookie in cookies_nodriver])
        object.__setattr__(self, "cookies_dict", {cookie.name: cookie.value for cookie in cookies_nodriver})
        object.__setattr__(self, "cookies_string", '; '.join(f'{cookie.name}={cookie.value}' for cookie in cookies_nodriver))

    def __create_by_selenium(self, cookies_selenium: list[dict]) -> None:
        """
        Input selenium-type cookies. The other types will be automatically converted.

        Args:
            cookies_selenium (list[dict]): Selenium-generated cookies. Every dict must have "name" and "value" keys.
        """

        object.__setattr__(self, "cookies_selenium", cookies_selenium)
        object.__setattr__(self, "cookies_dict", {cookie['name']: cookie['value'] for cookie in cookies_selenium})
        object.__setattr__(self, "cookies_string", '; '.join(f'{cookie["name"]}={cookie["value"]}' for cookie in cookies_selenium))
        self.__selenium_cookies_to_nodriver()

    def __create_by_dict(self, cookies_dict: dict) -> None:
        """
        Input requests-type cookies. The other types will be automatically converted.

        Args:
            cookies_dict (dict): Requests-generated cookies.
        """

        object.__setattr__(self, "cookies_dict", cookies_dict)
        object.__setattr__(self, "cookies_selenium", [{"name": key, "value": value} for key, value in cookies_dict.items()])
        object.__setattr__(self, "cookies_string", '; '.join(f'{key}={value}' for key, value in cookies_dict.items()))
        self.__selenium_cookies_to_nodriver()

    def __create_by_string(self, cookies_string: str) -> None:
        """
        Input string-type cookies. The other types will be automatically converted.

        Args:
            cookies_string (str): Cookies in string, in the form of ``name1=value1; name2=value2``.
        """

        # Strip surrounding whitespace first, THEN the trailing separator(s), so that
        # inputs like "a=b; " or "a=b;\n" are cleaned as well.
        cleaned_string = cookies_string.strip().rstrip(';').rstrip()

        selenium_list = []
        name_value_dict = {}
        for item in cookies_string.replace('\n', '').replace('\r', '').split(';'):
            if "=" not in item:
                continue
            # Split on the FIRST "=" only: cookie values may contain "=" themselves (e.g. base64 padding).
            name, _, value = item.partition("=")
            name, value = name.strip(), value.strip()
            selenium_list.append({"name": name, "value": value})
            name_value_dict[name] = value

        object.__setattr__(self, "cookies_string", cleaned_string)
        object.__setattr__(self, "cookies_selenium", selenium_list)
        object.__setattr__(self, "cookies_dict", name_value_dict)
        self.__selenium_cookies_to_nodriver()

    def __add__(self, other: Cookies) -> Cookies:
        """
        Concatenate two cookies.

        If a cookie in ``other`` has the same name as one in this Cookies, the one in ``other`` will be omitted.
        """

        if not isinstance(other, Cookies):
            return NotImplemented

        # Copy every dict so the result does not share cookie objects with its operands.
        cookies_list = [cookie.copy() for cookie in self.cookies_selenium]
        cookies_list.extend(
            cookie.copy()
            for cookie in other.cookies_selenium
            if cookie['name'] not in self.cookies_dict
        )
        return Cookies(cookies_list)

    def __most_domain(self, cookies: list) -> Optional[str]:
        """
        Find the most frequent non-empty domain in a list of selenium-form (dict) or nodriver-form cookies.

        Returns:
            The most frequent domain, or None if no cookie carries a non-empty domain.
        """

        domain_list = []
        for cookie in cookies:
            if isinstance(cookie, dict):
                domain = cookie.get("domain")
            else:
                domain = getattr(cookie, "domain", None)
            if domain:
                domain_list.append(domain)
        if not domain_list:
            return None
        return Counter(domain_list).most_common(1)[0][0]

    def __selenium_cookies_to_nodriver(self) -> None:
        """
        Build ``cookies_nodriver`` from ``cookies_selenium``, filling attributes missing from a cookie with defaults.

        Raises:
            ValueError: If ``cookies_selenium`` is None.
        """

        if self.cookies_selenium is None:
            raise ValueError("cookies_selenium cannot be None.")

        # Values used for keys that a cookie does not provide itself.
        attribute_dict = {
            "name": "",
            "value": "",
            "domain": "",
            "path": "/",
            "size": 0,
            "httpOnly": False,
            "secure": True,
            "session": False,
            "priority": "Medium",
            "sameParty": False,
            "sourceScheme": "Secure",
            "sourcePort": 443,
        }

        cookies_nodriver = []
        for cookie in self.cookies_selenium:
            insert_cookies = cookie.copy()
            for key, item in attribute_dict.items():
                if key in insert_cookies:
                    continue
                if key == "size":
                    # "name" and "value" precede "size" in attribute_dict, so both exist here.
                    insert_cookies[key] = len(str(insert_cookies["name"])) + len(str(insert_cookies["value"]))
                else:
                    insert_cookies[key] = item
            cookies_nodriver.append(nodriver.cdp.network.Cookie.from_json(insert_cookies))
        object.__setattr__(self, "cookies_nodriver", cookies_nodriver)

    @classmethod
    def load_from_json(cls, json_file: str, encoding: str='UTF-8', log: Log=Log()) -> Optional[Cookies]:
        """
        Load the Cookies from a json file.

        ONLY WORKS IF the info can be JSON serialized.

        Args:
            json_file (str): Name / path of json file. Suffix (.json) must be included.
            encoding (str): Encoding of JSON file.
            log (image_crawler_utils.log.Log, None): Logging config. None falls back to a default ``Log()``.

        Returns:
            The Cookies, or None if failed.
        """

        if log is None:
            log = Log()

        try:
            with open(json_file, "r", encoding=encoding) as f:
                new_cls = cls(json.load(f))
            log.info(f'Cookies has been loaded from [repr.filename]{markup.escape(os.path.abspath(json_file))}[reset]', extra={"markup": True})
            return new_cls
        except Exception as e:
            # Any failure (missing file, bad JSON, unidentifiable content) is logged and reported as None.
            log.error(f'Failed to load Cookies from [repr.filename]{markup.escape(os.path.abspath(json_file))}[reset] because {e}\n{traceback.format_exc()}', extra={"markup": True})
            return None

    def save_to_json(self, json_file: str, encoding: str='UTF-8', log: Log=Log()) -> Optional[tuple[str, str]]:
        """
        Save the Cookies (in selenium form) into a json file.

        Args:
            json_file (str): Name / path of json file. (Suffix is optional.)
            encoding (str): Encoding of JSON file.
            log (image_crawler_utils.log.Log, None): Logging config. None falls back to a default ``Log()``.

        Returns:
            (Saved file name, Absolute path of the saved file), or None if failed.
        """

        if log is None:
            log = Log()

        path, filename = os.path.split(json_file)
        check_dir(path, log)
        # Only touch the file name itself: append ".json" unless it is already the suffix.
        if not filename.endswith(".json"):
            filename = f"{filename}.json"
        f_name = os.path.join(path, filename)

        try:
            with open(f_name, "w", encoding=encoding) as f:
                json.dump(self.cookies_selenium, f, indent=4, ensure_ascii=False)
            log.info(f'Cookies has been saved at [repr.filename]{markup.escape(os.path.abspath(f_name))}[reset]', extra={"markup": True})
            return f_name, os.path.abspath(f_name)
        except Exception as e:
            log.error(f'Failed to save Cookies at [repr.filename]{markup.escape(os.path.abspath(f_name))}[reset] because {e}\n{traceback.format_exc()}', extra={"markup": True})
            return None

    def update_selenium_cookies(self, old_selenium_cookies: list[dict]):
        """
        Update selenium-form cookies.

        For every cookie in the input with the same name as the one in the Cookies class, replace the values with the latter one.
        Also add cookies in Cookies class which not exists in input cookies.
        Cookies without a domain are set to the most frequent domain in the result, if there is one.

        Neither the input list nor this Cookies is modified.

        Args:
            old_selenium_cookies (list[dict]): Cookies from selenium.

        Returns:
            New selenium cookies (a list[dict]).
        """

        new_selenium_cookies = copy.deepcopy(old_selenium_cookies)
        for cookie in self.cookies_selenium:
            name_found = False
            for new_cookie in new_selenium_cookies:
                if new_cookie['name'] == cookie['name']:
                    new_cookie['value'] = cookie['value']
                    name_found = True
            if not name_found:
                # Append a copy: the domain fix-up below must not write into this (frozen) instance.
                new_selenium_cookies.append(cookie.copy())

        # Set those without domain to the most frequent domain in cookies provided
        most_domain = self.__most_domain(new_selenium_cookies)
        if most_domain is not None:
            for new_cookie in new_selenium_cookies:
                # Cookies created from a string or dict have no "domain" key at all.
                if not new_cookie.get("domain"):
                    new_cookie["domain"] = most_domain
        return new_selenium_cookies

    def update_nodriver_cookies(self, old_nodriver_cookies: list[nodriver.cdp.network.Cookie]):
        """
        Update nodriver-form cookies. NOT SUGGESTED TO BE USED DIRECTLY.

        For every cookie in the input with the same name as the one in the Cookies class, replace the values with the latter one.
        Also add cookies in Cookies class which not exists in input cookies.
        Cookies without a domain are set to the most frequent domain in the result, if there is one.

        Neither the input list nor this Cookies is modified.

        Args:
            old_nodriver_cookies (list[nodriver.cdp.network.Cookie]): Cookies from nodriver.

        Returns:
            New nodriver cookies (a list[nodriver.cdp.network.Cookie]).
        """

        new_nodriver_cookies = copy.deepcopy(old_nodriver_cookies)
        # cookies_nodriver is None for an empty Cookies; treat that as "nothing to merge".
        for cookie in self.cookies_nodriver or []:
            name_found = False
            for new_cookie in new_nodriver_cookies:
                if new_cookie.name == cookie.name:
                    new_cookie.value = cookie.value
                    new_cookie.size = len(new_cookie.name) + len(new_cookie.value)
                    name_found = True
            if not name_found:
                # Append a copy: the domain fix-up below must not write into this (frozen) instance.
                new_nodriver_cookies.append(copy.copy(cookie))

        # Set those without domain to the most frequent domain in cookies provided
        most_domain = self.__most_domain(new_nodriver_cookies)
        if most_domain is not None:
            for new_cookie in new_nodriver_cookies:
                if not new_cookie.domain:
                    new_cookie.domain = most_domain
        return new_nodriver_cookies

    def is_none(self) -> bool:
        """
        Check whether Cookies is empty (created by :py:data:`None`, "", etc.).

        Returns:
            A bool, telling whether Cookies is empty.
        """

        # An empty nodriver list counts as empty too, not only None.
        return not (
            self.cookies_nodriver
            or self.cookies_selenium
            or self.cookies_dict
            or self.cookies_string
        )
##### nodriver's built-in cookie setter is currently buggy, so the cookies are set manually below.
async def update_nodriver_browser_cookies(
    browser: nodriver.Browser,
    cookies: Cookies,
):
    """
    Update the cookies of a nodriver browser with the Cookies provided.

    nodriver ships ``browser.cookies.set_all()``, but it has a critical bug that has stayed unfixed for a long time, so the cookies are sent through the CDP ``storage.set_cookies`` command here instead.

    Args:
        browser (nodriver.Browser): The browser created by nodriver.
        cookies (image_crawler_utils.Cookies): The cookies containing account information.
    """

    # Merge the browser's current cookies with the ones provided.
    current_cookies = await browser.cookies.get_all()
    merged_cookies = cookies.update_nodriver_cookies(current_cookies)

    # Send through the first tab that is still open; fall back to the
    # browser-level connection when every tab is closed (or there are none).
    target = next((tab for tab in browser.tabs if not tab.closed), None)
    if target is None:
        target = browser.connection

    await target.send(nodriver.cdp.storage.set_cookies(merged_cookies))