Initial commit: Add webtoon downloader

commit d5a73f342e
2024-12-19 13:58:12 +01:00
53 changed files with 1173 additions and 0 deletions

downloaders/__init__.py Normal file (empty)


downloaders/bomtoon.py Normal file

@@ -0,0 +1,62 @@
from pathlib import Path

import requests
from bs4 import BeautifulSoup

from data.webtoon_request import get_bomtoon_headers
from downloaders.downloader import Downloader


class Bomtoon(Downloader):
def __init__(self, webtoon_id):
super().__init__(webtoon_id)
self.headers = get_bomtoon_headers()
def _fetch_information(self, url):
res = requests.get(url, headers=self.headers)
if res.status_code == 200:
soup = BeautifulSoup(res.content, 'html.parser')
title = soup.find('title')
if title:
self.title = title.get_text().split('-')[0].strip()
author = soup.find('meta', attrs={'name': 'author'})
if author:
self.author = author.get('content')
description = soup.find('meta', attrs={'property': 'og:description'})
if description:
self.description = description.get('content')
            tags = soup.find('meta', attrs={'name': 'keywords'})
            if tags:
                tags_list = tags.get('content').split(',')
                # '連載' ("serializing") is a status keyword, not a genre; skip it.
                if '連載' in tags_list[0]:
                    self.tag = tags_list[1]
                else:
                    self.tag = tags_list[0]
            # No thumbnail URL is scraped for Bomtoon; download_webtoon()
            # skips the thumbnail download while it is empty.
            self.thumbnail_url = ""
            self.thumbnail_name = f"{self.webtoon_id}.jpg"
else:
print(f"fetch_information: {res.status_code}")
    def _fetch_episode_information(self):
        # Not implemented for Bomtoon yet.
        pass

    def _get_episode_image_urls(self, episode_index) -> list[str]:
        # Not implemented for Bomtoon yet.
        pass

    async def _download_image(
        self,
        episode_path: Path,
        url: str,
        image_no: int
    ) -> None:
        # Not implemented for Bomtoon yet.
        pass
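
The metadata scraping above follows a pattern shared by all the downloaders in this commit: fetch the page, then read the title and meta tags. A minimal self-contained sketch of that pattern; the URL and headers below are placeholders, not Bomtoon's real values:

import requests
from bs4 import BeautifulSoup

def scrape_page_metadata(url: str, headers: dict) -> dict:
    # Fetch a page and pull title/author/description out of its meta tags.
    res = requests.get(url, headers=headers)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')
    meta = {}
    title = soup.find('title')
    if title:
        # Site titles often look like "<series name> - <site name>".
        meta['title'] = title.get_text().split('-')[0].strip()
    author = soup.find('meta', attrs={'name': 'author'})
    if author:
        meta['author'] = author.get('content')
    description = soup.find('meta', attrs={'property': 'og:description'})
    if description:
        meta['description'] = description.get('content')
    return meta

print(scrape_page_metadata("https://example.com/series", {"User-Agent": "Mozilla/5.0"}))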

downloaders/decrypt.py Normal file

@@ -0,0 +1,43 @@
import base64
import hashlib
from contextlib import suppress


class Decrypt:
    def __init__(self, aid, episodeId, timestamp, nonce, userId, zid):
self._aid = aid
self._episodeId = episodeId
self._timestamp = timestamp
self._nonce = nonce
self._userId = userId
self._zid = zid
    @classmethod
    def get_aes(cls):
        # Import AES lazily and cache it on the class, so the optional
        # pycryptodomex dependency is only needed when decryption is used.
        with suppress(AttributeError):
            return cls.AES
        try:
            from Cryptodome.Cipher import AES
        except ImportError:
            raise ImportError(
                "Missing optional dependency 'pycryptodomex'. "
                "Please install it to use this functionality."
            )
        cls.AES = AES
        return cls.AES
@classmethod
def _decrypt(cls, data: bytes, key: bytes, iv: bytes) -> bytes:
AES = cls.get_aes()
cipher = AES.new(key, AES.MODE_CBC, iv)
return cipher.decrypt(data)
    def get_decrypt_informations(self) -> tuple[bytes, bytes]:
        # Derive a temporary key/IV from session metadata, then use them to
        # decrypt the real per-episode AES key and IV, which arrive
        # base64-encoded as aid and zid.
        temp_key = hashlib.sha256(f"{self._userId}{self._episodeId}{self._timestamp}".encode()).digest()
        temp_iv = hashlib.sha256(f"{self._nonce}{self._timestamp}".encode()).digest()[:16]
        encrypted_key = base64.b64decode(self._aid)
        encrypted_iv = base64.b64decode(self._zid)
        key = self._decrypt(encrypted_key, temp_key, temp_iv)[:16]
        iv = self._decrypt(encrypted_iv, temp_key, temp_iv)[:16]
        return key, iv
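
A self-contained round trip of the same two-stage scheme, useful for sanity-checking the derivation. All values below are made up; in the real flow, aid and zid arrive base64-encoded in Kakao's media-resources response:

import base64
import hashlib

from Cryptodome.Cipher import AES  # provided by pycryptodomex

# Hypothetical session values; the real ones come from the cookie and API.
user_id, episode_id, timestamp, nonce = "u1", "e1", 1700000000000, "abc123"

temp_key = hashlib.sha256(f"{user_id}{episode_id}{timestamp}".encode()).digest()
temp_iv = hashlib.sha256(f"{nonce}{timestamp}".encode()).digest()[:16]

# Simulate the server: encrypt a known key/IV pair under the temp key/IV.
real_key, real_iv = b"0123456789abcdef", b"fedcba9876543210"
aid = base64.b64encode(AES.new(temp_key, AES.MODE_CBC, temp_iv).encrypt(real_key))
zid = base64.b64encode(AES.new(temp_key, AES.MODE_CBC, temp_iv).encrypt(real_iv))

# Client side, mirroring get_decrypt_informations():
key = AES.new(temp_key, AES.MODE_CBC, temp_iv).decrypt(base64.b64decode(aid))[:16]
iv = AES.new(temp_key, AES.MODE_CBC, temp_iv).decrypt(base64.b64decode(zid))[:16]
assert (key, iv) == (real_key, real_iv)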

downloaders/downloader.py Normal file

@@ -0,0 +1,168 @@
import asyncio
import html
import json
import shutil
from pathlib import Path
from typing import Any

import pyfilename as pf
import requests
from httpx import AsyncClient

from data.special_list import WEBTOON_18_BONUS


class Downloader:
    def __init__(self, webtoon_id: Any) -> None:
        self.webtoon_id = webtoon_id
        self.client = AsyncClient()
        self.lately_downloaded_episode: list[Path] = []
        self.new_webtoon = ""
    def download_webtoon(self, url, path: Path) -> None:
self._fetch_information(url)
self.webtoon_path = path / self.title
self.webtoon_path.mkdir(parents=True, exist_ok=True)
self._save_information()
if self.thumbnail_url != "":
self._download_thumbnail()
self._fetch_episode_information()
unobtained_episodes = self._get_unobtained_episodes()
if len(unobtained_episodes) > 0:
self.new_webtoon = self.title
try:
asyncio.run(
self._download_episodes(unobtained_episodes)
)
except Exception as e:
print(f"Error _download_episodes: {e}")
    def _fetch_information(self, url) -> None:
        # Hook: implemented by the site-specific subclasses.
        pass
def _save_information(self) -> None:
information_path = self.webtoon_path / 'information.json'
save_necessary = True
if information_path.exists():
with open(information_path, "r", encoding='utf-8') as json_file:
existing_information = json.load(json_file)
if (
existing_information["title"] == self.title and
existing_information["author"] == self.author and
existing_information["description"] == self.description and
existing_information["thumbnail_name"] == self.thumbnail_name
):
save_necessary = False
        if save_necessary:
information = {
"title": self.title,
"author": self.author,
"tag": self.tag,
"description": self.description,
"thumbnail_name": self.thumbnail_name
}
with open(information_path, 'w', encoding='utf-8') as json_file:
json.dump(information, json_file, ensure_ascii=False, indent=2)
print(f"{information_path} is saved.")
def _download_thumbnail(self) -> None:
thumbnail_path = self.webtoon_path / self.thumbnail_name
if not thumbnail_path.exists():
response = requests.get(self.thumbnail_url)
if response.status_code == 200:
image_raw = response.content
thumbnail_path.write_bytes(image_raw)
print(f"{thumbnail_path} is saved.")
else:
print(response.status_code)
    def _fetch_episode_information(self) -> None:
        # Hook: implemented by the site-specific subclasses.
        pass
    def _get_unobtained_episodes(self) -> list[int]:
        downloaded_episodes = []
        for directory in self.webtoon_path.glob('*'):
            if directory.is_dir():
                downloaded_episodes.append(int(directory.name.split('.')[0]))
        if self.title in WEBTOON_18_BONUS:
            # For titles in WEBTOON_18_BONUS, take the newest missing episodes
            # by count instead of a set difference.
            count = len(self.readabilities_index_list) - len(downloaded_episodes)
            episodes = self.readabilities_index_list[-count:] if count > 0 else []
        else:
            difference = set(self.readabilities_index_list) - set(downloaded_episodes)
            episodes = list(difference)
        print(f"{self.title} unobtained episodes: {episodes}")
        return episodes
    async def _download_episodes(self, episode_index_list: list[int]) -> None:
        async with self.client:
            for episode_index in episode_index_list:
                episode_name = self.episode_titles[episode_index]
                episode_title = self._get_safe_file_name(episode_index, episode_name)
                print(episode_title)
                episode_path = self.webtoon_path / episode_title
                episode_path.mkdir(parents=True, exist_ok=True)
                # Throttle requests without blocking the event loop.
                await asyncio.sleep(2)
                is_download_successful = await self._download_episode(episode_index, episode_path)
                if is_download_successful:
                    self.lately_downloaded_episode.append(episode_path)
                    print(f"Download {episode_name} successful.")
                else:
                    print(f"Error _download_episode: {episode_name}")
                    break
async def _download_episode(self, episode_index: int, episode_path: Path) -> bool:
episode_images_url = self._get_episode_image_urls(episode_index)
if not episode_images_url:
print(f"Failed get image url for: {episode_path}")
return False
try:
await asyncio.gather(
*(
self._download_image(episode_path, element, i)
for i, element in enumerate(episode_images_url)
)
)
        except Exception:
            # Remove the partially downloaded episode folder so it is retried next run.
            shutil.rmtree(episode_path)
            raise
return True
    def _get_episode_image_urls(self, episode_index: int) -> list[str] | None:
        # Hook: implemented by the site-specific subclasses.
        pass

    async def _download_image(self, episode_path: Path, url: str, image_no: int) -> None:
        # Hook: implemented by the site-specific subclasses.
        pass
def _get_safe_file_name(self, episode_index: int, episode_name: str) -> str:
if self.title == '全知讀者視角':
episode_name = f"Ep{episode_name.split('.')[2]}"
episode_name = episode_name.replace("", " (")
episode_name = episode_name.replace("", ")")
elif self.title == '怪力亂神':
episode_name = episode_name.replace('話. ', '')
episode_title = f"{episode_index}.{episode_name}"
return pf.convert(html.unescape(episode_title))
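
Downloader is a template-method base class: download_webtoon drives the fixed pipeline (fetch info, save metadata, diff against already-downloaded folders, download what is missing) while subclasses supply the site-specific hooks. A hypothetical subclass sketch showing that contract; every name and URL here is invented for illustration:

from pathlib import Path

from downloaders.downloader import Downloader

class ExampleSite(Downloader):
    def _fetch_information(self, url) -> None:
        # Must set everything download_webtoon() and _save_information() read.
        self.title = "Example Title"
        self.author = "Example Author"
        self.tag = "action"
        self.description = "An example series."
        self.thumbnail_url = ""  # empty string makes download_webtoon() skip the thumbnail
        self.thumbnail_name = f"{self.webtoon_id}.jpg"

    def _fetch_episode_information(self) -> None:
        # Must set the episode titles and the indices of readable episodes.
        self.episode_titles = ["Ep 1", "Ep 2"]
        self.readabilities_index_list = [0, 1]

    def _get_episode_image_urls(self, episode_index: int) -> list[str]:
        return [f"https://example.com/{episode_index}/{i}.jpg" for i in range(3)]

    async def _download_image(self, episode_path: Path, url: str, image_no: int) -> None:
        image_raw = (await self.client.get(url)).content
        (episode_path / f"{image_no:03d}.jpg").write_bytes(image_raw)

ExampleSite(webtoon_id=1).download_webtoon("https://example.com/series/1", Path("downloads"))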

downloaders/kakao_webtoon.py Normal file

@@ -0,0 +1,141 @@
from pathlib import Path
import random
import sys
import time

from bs4 import BeautifulSoup
import httpx
import requests

from data.kakao_cookie import Cookie
from data.kakao_request import KakaoRequest
from downloaders.decrypt import Decrypt
from downloaders.downloader import Downloader


class KakaoWebtoon(Downloader):
    def __init__(self, webtoon_id: int, cookie: Cookie):
        super().__init__(webtoon_id)
        self._timestamp = int(time.time() * 1000)
        # Build a ten-character nonce from the digits 0-9 and lowercase a-z.
        chars = [*range(0x30, 0x3A), *range(0x61, 0x7B)]
        self._nonce = "".join(chr(i) for i in random.choices(chars, k=10))
        self.kakao_request = KakaoRequest(self._timestamp, self._nonce)
        self.cookie = cookie
        self.episode_headers = self.kakao_request.get_episode_headers(self.cookie.ant)
        self.post_headers = self.kakao_request.get_post_headers(self.cookie.ant)
    def verify_cookie(self) -> bool:
        # A 200 from the episode list endpoint means the session cookie is still valid.
        url = f"https://gateway.tw.kakaowebtoon.com/episode/v2/views/content-home/contents/{self.webtoon_id}/episodes?sort=-NO&offset=0&limit=30"
        res = requests.get(url, headers=self.episode_headers)
        return res.status_code == 200
def _fetch_information(self, url):
res = requests.get(url, headers=self.episode_headers)
if res.status_code == 200:
soup = BeautifulSoup(res.content, 'html.parser')
description = soup.find('meta', attrs={'name': 'description'})
if description:
self.description = description.get('content')
thumbnail_url = soup.find('meta', attrs={'property': 'og:image'})
if thumbnail_url:
self.thumbnail_url = thumbnail_url.get('content')
            # The first three <p> tags on the page hold title, author, and genre.
            all_p = soup.find_all('p')
            self.title = all_p[0].get_text()
            self.author = all_p[1].get_text()
            self.tag = all_p[2].get_text()
            self.thumbnail_name = f"{self.webtoon_id}.{self.thumbnail_url.split('.')[-1]}"
def _fetch_episode_information(self):
offset = 0
limit = 30
is_last: bool = False
webtoon_episodes_data = []
while not is_last:
url = f"https://gateway.tw.kakaowebtoon.com/episode/v2/views/content-home/contents/{self.webtoon_id}/episodes?sort=-NO&offset={offset}&limit={limit}"
res = requests.get(url, headers=self.episode_headers)
if res.status_code == 200:
json_data = res.json()
webtoon_episodes_data += json_data["data"]["episodes"]
offset += limit
is_last = json_data["meta"]["pagination"]["last"]
            else:
                print(f"_fetch_episode_information failed for {self.cookie.name}: {res.status_code}")
                sys.exit()
        episode_ids: list[int] = []
        seo_ids: list[str] = []
        numbers: list[int] = []
        episode_titles: list[str] = []
        readabilities: list[bool] = []
        # The API returns newest-first; reverse so index 0 is the first episode.
        for information in reversed(webtoon_episodes_data):
            episode_ids.append(information["id"])
            seo_ids.append(information["seoId"])
            numbers.append(information["no"])
            episode_titles.append(information["title"])
            readabilities.append(information["readable"])
        self.episode_ids = episode_ids
        self.seo_ids = seo_ids
        self.episode_titles = episode_titles
        self.readabilities_index_list = [index for index, value in enumerate(readabilities) if value]
    def _get_episode_image_urls(self, episode_index) -> list[tuple[str, bytes, bytes]] | None:
        episode_id = self.episode_ids[episode_index]
        url = f"https://gateway.tw.kakaowebtoon.com/episode/v1/views/viewer/episodes/{episode_id}/media-resources"
        payload = self.kakao_request.get_payload(episode_id)
        res = requests.post(url, headers=self.post_headers, json=payload)
        data = res.json()["data"]
        # aid/zid carry the encrypted AES key and IV for this episode's images.
        aid = data["media"]["aid"]
        zid = data["media"]["zid"]
        self.decrypt = Decrypt(aid, episode_id, self._timestamp, self._nonce, self.cookie.userID, zid)
        key, iv = self.decrypt.get_decrypt_informations()
        return [(i["url"], key, iv) for i in data["media"]["files"]]
    async def _download_image(
        self,
        episode_path: Path,
        url: tuple[str, bytes, bytes],
        image_no: int
    ) -> None:
        real_url, key, iv = url
        file_name = f"{image_no:03d}.webp"
        file_path = episode_path / file_name
        try:
            response = await self.client.get(real_url, headers=self.episode_headers)
            response.raise_for_status()
            image_raw: bytes = response.content
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e}")
            return
        except httpx.RequestError as e:
            # Covers timeouts, unsupported protocols, and other transport errors.
            print(f"An error occurred while requesting {real_url}: {e}")
            return
        decrypted_data = self.decrypt._decrypt(image_raw, key, iv)
        file_path.write_bytes(decrypted_data)
async def close(self):
await self.client.aclose()
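
Putting the pieces together, a hedged usage sketch. The Cookie constructor arguments below are assumptions; from the code above, the object must at least expose name, ant, and userID:

from pathlib import Path

from data.kakao_cookie import Cookie
from downloaders.kakao_webtoon import KakaoWebtoon

cookie = Cookie(name="main-account", ant="<ant token>", userID="<user id>")  # hypothetical constructor

downloader = KakaoWebtoon(webtoon_id=1234, cookie=cookie)
if downloader.verify_cookie():
    downloader.download_webtoon(
        "https://www.kakaowebtoon.com/content/example/1234",  # placeholder URL
        Path("downloads"),
    )
else:
    print("Session cookie rejected; refresh the ant token first.")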

downloaders/webtoon_com.py Normal file

@@ -0,0 +1,122 @@
from pathlib import Path
from typing import TYPE_CHECKING

from bs4 import BeautifulSoup
import httpx
import requests

from data.webtoon_request import get_webtoon_headers
from downloaders.downloader import Downloader


class Webtoon(Downloader):
    def __init__(self, webtoon_id: int):
        super().__init__(webtoon_id)
        self.headers = get_webtoon_headers()
        # All episode/viewer URLs are built from this series' base path.
        self.base_url = "https://www.webtoons.com/en/action/jungle-juice"
def _fetch_information(self, url):
res = requests.get(url, headers=self.headers)
if res.status_code == 200:
soup = BeautifulSoup(res.content, 'html.parser')
title = soup.find('meta', attrs={'property': 'og:title'})
if title:
self.title = title.get('content')
description = soup.find('meta', attrs={'property': 'og:description'})
if description:
self.description = description.get('content')
thumbnail_url = soup.find('meta', attrs={'property': 'og:image'})
if thumbnail_url:
self.thumbnail_url = thumbnail_url.get('content')
            # Author names are rendered as <h3> elements; join them into one string.
            author_list = soup.find_all('h3')
            h3_texts = [h3.get_text().strip() for h3 in author_list]
            self.author = ', '.join(h3_texts)
            self.tag = soup.find('h2', class_='genre').get_text()
            seo_id = url.split('/')[-2]
            thumbnail_type = 'png' if 'png' in self.thumbnail_url else 'jpg'
            self.thumbnail_name = f"{seo_id}.{thumbnail_type}"
            self.latest_title_no = soup.find('li', class_='_episodeItem').get('data-episode-no')
else:
print(f"fetch_information: {res.status_code}")
    def _fetch_episode_information(self):
        # The viewer page of any episode lists every episode in its sidebar.
        url = f"{self.base_url}/prologue/viewer?title_no={self.webtoon_id}&episode_no={self.latest_title_no}"
        res = requests.get(url, headers=self.headers)
        if res.status_code == 200:
            soup = BeautifulSoup(res.content, 'html.parser')
            li_tags = soup.find('div', class_='episode_cont').find_all('li', attrs={'data-episode-no': True})
            self.episode_titles = [li.find('span', class_='subj').get_text() for li in li_tags if li.find('span', class_='subj')]
            self.episode_urls = [li.find('a')['href'] for li in li_tags]
            self.episode_ids = [int(li.get('data-episode-no')) for li in li_tags]  # episode numbers start at 1
            self.readabilities_index_list = [episode_id - 1 for episode_id in self.episode_ids]
        else:
            print(f"fetch_episode_information: {res.status_code}")
    def _get_episode_image_urls(self, episode_index) -> list[str]:
        episode_id = self.episode_ids[episode_index]
        url = f"{self.base_url}/prologue/viewer?title_no={self.webtoon_id}&episode_no={episode_id}"
        episode_image_urls = []
        res = requests.get(url, headers=self.headers)
        if res.status_code == 200:
            soup = BeautifulSoup(res.content, 'html.parser')
            img_tags = soup.select("#_imageList > img")
            # The full-resolution image URL sits in each tag's data-url attribute.
            episode_image_urls = [element["data-url"] for element in img_tags]
            if TYPE_CHECKING:
                # Runtime no-op; narrows the attribute values to str for the type checker.
                episode_image_urls = [
                    episode_image_url for episode_image_url in episode_image_urls if isinstance(episode_image_url, str)
                ]
        else:
            print(f"get_episode_image_urls: {res.status_code}")
        return episode_image_urls
    async def _download_image(
        self,
        episode_path: Path,
        url: str,
        image_no: int
    ) -> None:
        file_name = f"{image_no:03d}.jpg"
        file_path = episode_path / file_name
        try:
            response = await self.client.get(url, headers=self.headers)
            response.raise_for_status()  # raises HTTPStatusError for 4xx/5xx responses
            image_raw: bytes = response.content
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e}")
            return
        except httpx.RequestError as e:
            # Covers timeouts, unsupported protocols, and other transport errors.
            print(f"An error occurred while requesting {url}: {e}")
            return
        file_path.write_bytes(image_raw)
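
Usage mirrors the other sites through the shared download_webtoon entry point. A hedged sketch; the title_no and list-page URL are placeholders, though the URL shape matters because _fetch_information derives the seo_id from the second-to-last path segment:

from pathlib import Path

from downloaders.webtoon_com import Webtoon

downloader = Webtoon(webtoon_id=1234)  # placeholder title_no
downloader.download_webtoon(
    # Placeholder list-page URL; url.split('/')[-2] must yield the seo id.
    "https://www.webtoons.com/en/action/jungle-juice/list?title_no=1234",
    Path("downloads"),
)
# Newly downloaded episode folders accumulate in downloader.lately_downloaded_episode.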