Initial commit: Add webtoon downloader

commit d5a73f342e
2024-12-19 13:58:12 +01:00
53 changed files with 1173 additions and 0 deletions

downloaders/__init__.py Normal file (empty)


downloaders/bomtoon.py Normal file

@@ -0,0 +1,62 @@
from pathlib import Path

import requests
from bs4 import BeautifulSoup

from data.webtoon_request import get_bomtoon_headers
from downloaders.downloader import Downloader


class Bomtoon(Downloader):
def __init__(self, webtoon_id):
super().__init__(webtoon_id)
self.headers = get_bomtoon_headers()
def _fetch_information(self, url):
res = requests.get(url, headers=self.headers)
if res.status_code == 200:
soup = BeautifulSoup(res.content, 'html.parser')
title = soup.find('title')
if title:
self.title = title.get_text().split('-')[0].strip()
author = soup.find('meta', attrs={'name': 'author'})
if author:
self.author = author.get('content')
description = soup.find('meta', attrs={'property': 'og:description'})
if description:
self.description = description.get('content')
            tags = soup.find('meta', attrs={'name': 'keywords'})
            if tags:
                tags_list = tags.get('content').split(',')
                # '連載' ("serializing") is a status keyword, not a genre; skip it.
                if '連載' in tags_list[0]:
                    self.tag = tags_list[1]
                else:
                    self.tag = tags_list[0]
            # No thumbnail URL is scraped for Bomtoon; download_webtoon()
            # skips the thumbnail download while it is empty.
            self.thumbnail_url = ""
            self.thumbnail_name = f"{self.webtoon_id}.jpg"
else:
print(f"fetch_information: {res.status_code}")
    def _fetch_episode_information(self):
        # Not implemented for Bomtoon yet.
        pass

    def _get_episode_image_urls(self, episode_index) -> list[str]:
        # Not implemented for Bomtoon yet.
        pass

    async def _download_image(
        self,
        episode_path: Path,
        url: str,
        image_no: int
    ) -> None:
        # Not implemented for Bomtoon yet.
        pass
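
The metadata scraping above follows a pattern shared by all the downloaders in this commit: fetch the page, then read the title and meta tags. A minimal self-contained sketch of that pattern; the URL and headers below are placeholders, not Bomtoon's real values:

import requests
from bs4 import BeautifulSoup

def scrape_page_metadata(url: str, headers: dict) -> dict:
    # Fetch a page and pull title/author/description out of its meta tags.
    res = requests.get(url, headers=headers)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')
    meta = {}
    title = soup.find('title')
    if title:
        # Site titles often look like "<series name> - <site name>".
        meta['title'] = title.get_text().split('-')[0].strip()
    author = soup.find('meta', attrs={'name': 'author'})
    if author:
        meta['author'] = author.get('content')
    description = soup.find('meta', attrs={'property': 'og:description'})
    if description:
        meta['description'] = description.get('content')
    return meta

print(scrape_page_metadata("https://example.com/series", {"User-Agent": "Mozilla/5.0"}))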

downloaders/decrypt.py Normal file

@@ -0,0 +1,43 @@
import base64
import hashlib
from contextlib import suppress


class Decrypt:
    def __init__(self, aid, episodeId, timestamp, nonce, userId, zid):
self._aid = aid
self._episodeId = episodeId
self._timestamp = timestamp
self._nonce = nonce
self._userId = userId
self._zid = zid
    @classmethod
    def get_aes(cls):
        # Import AES lazily and cache it on the class, so the optional
        # pycryptodomex dependency is only needed when decryption is used.
        with suppress(AttributeError):
            return cls.AES
        try:
            from Cryptodome.Cipher import AES
        except ImportError:
            raise ImportError(
                "Missing optional dependency 'pycryptodomex'. "
                "Please install it to use this functionality."
            )
        cls.AES = AES
        return cls.AES
@classmethod
def _decrypt(cls, data: bytes, key: bytes, iv: bytes) -> bytes:
AES = cls.get_aes()
cipher = AES.new(key, AES.MODE_CBC, iv)
return cipher.decrypt(data)
    def get_decrypt_informations(self) -> tuple[bytes, bytes]:
        # Derive a temporary key/IV from session metadata, then use them to
        # decrypt the real per-episode AES key and IV, which arrive
        # base64-encoded as aid and zid.
        temp_key = hashlib.sha256(f"{self._userId}{self._episodeId}{self._timestamp}".encode()).digest()
        temp_iv = hashlib.sha256(f"{self._nonce}{self._timestamp}".encode()).digest()[:16]
        encrypted_key = base64.b64decode(self._aid)
        encrypted_iv = base64.b64decode(self._zid)
        key = self._decrypt(encrypted_key, temp_key, temp_iv)[:16]
        iv = self._decrypt(encrypted_iv, temp_key, temp_iv)[:16]
        return key, iv
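
A self-contained round trip of the same two-stage scheme, useful for sanity-checking the derivation. All values below are made up; in the real flow, aid and zid arrive base64-encoded in Kakao's media-resources response:

import base64
import hashlib

from Cryptodome.Cipher import AES  # provided by pycryptodomex

# Hypothetical session values; the real ones come from the cookie and API.
user_id, episode_id, timestamp, nonce = "u1", "e1", 1700000000000, "abc123"

temp_key = hashlib.sha256(f"{user_id}{episode_id}{timestamp}".encode()).digest()
temp_iv = hashlib.sha256(f"{nonce}{timestamp}".encode()).digest()[:16]

# Simulate the server: encrypt a known key/IV pair under the temp key/IV.
real_key, real_iv = b"0123456789abcdef", b"fedcba9876543210"
aid = base64.b64encode(AES.new(temp_key, AES.MODE_CBC, temp_iv).encrypt(real_key))
zid = base64.b64encode(AES.new(temp_key, AES.MODE_CBC, temp_iv).encrypt(real_iv))

# Client side, mirroring get_decrypt_informations():
key = AES.new(temp_key, AES.MODE_CBC, temp_iv).decrypt(base64.b64decode(aid))[:16]
iv = AES.new(temp_key, AES.MODE_CBC, temp_iv).decrypt(base64.b64decode(zid))[:16]
assert (key, iv) == (real_key, real_iv)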

downloaders/downloader.py Normal file

@@ -0,0 +1,168 @@
import asyncio
import html
import json
import shutil
from pathlib import Path
from typing import Any

import pyfilename as pf
import requests
from httpx import AsyncClient

from data.special_list import WEBTOON_18_BONUS


class Downloader:
    def __init__(self, webtoon_id: Any) -> None:
        self.webtoon_id = webtoon_id
        self.client = AsyncClient()
        self.lately_downloaded_episode: list[Path] = []
        self.new_webtoon = ""
    def download_webtoon(self, url, path: Path) -> None:
self._fetch_information(url)
self.webtoon_path = path / self.title
self.webtoon_path.mkdir(parents=True, exist_ok=True)
self._save_information()
if self.thumbnail_url != "":
self._download_thumbnail()
self._fetch_episode_information()
unobtained_episodes = self._get_unobtained_episodes()
if len(unobtained_episodes) > 0:
self.new_webtoon = self.title
try:
asyncio.run(
self._download_episodes(unobtained_episodes)
)
except Exception as e:
print(f"Error _download_episodes: {e}")
    def _fetch_information(self, url) -> None:
        # Hook: implemented by the site-specific subclasses.
        pass
def _save_information(self) -> None:
information_path = self.webtoon_path / 'information.json'
save_necessary = True
if information_path.exists():
with open(information_path, "r", encoding='utf-8') as json_file:
existing_information = json.load(json_file)
if (
existing_information["title"] == self.title and
existing_information["author"] == self.author and
existing_information["description"] == self.description and
existing_information["thumbnail_name"] == self.thumbnail_name
):
save_necessary = False
        if save_necessary:
information = {
"title": self.title,
"author": self.author,
"tag": self.tag,
"description": self.description,
"thumbnail_name": self.thumbnail_name
}
with open(information_path, 'w', encoding='utf-8') as json_file:
json.dump(information, json_file, ensure_ascii=False, indent=2)
print(f"{information_path} is saved.")
def _download_thumbnail(self) -> None:
thumbnail_path = self.webtoon_path / self.thumbnail_name
if not thumbnail_path.exists():
response = requests.get(self.thumbnail_url)
if response.status_code == 200:
image_raw = response.content
thumbnail_path.write_bytes(image_raw)
print(f"{thumbnail_path} is saved.")
else:
print(response.status_code)
    def _fetch_episode_information(self) -> None:
        # Hook: implemented by the site-specific subclasses.
        pass
    def _get_unobtained_episodes(self) -> list[int]:
        downloaded_episodes = []
        for directory in self.webtoon_path.glob('*'):
            if directory.is_dir():
                downloaded_episodes.append(int(directory.name.split('.')[0]))
        if self.title in WEBTOON_18_BONUS:
            # For titles in WEBTOON_18_BONUS, take the newest missing episodes
            # by count instead of a set difference.
            count = len(self.readabilities_index_list) - len(downloaded_episodes)
            episodes = self.readabilities_index_list[-count:] if count > 0 else []
        else:
            difference = set(self.readabilities_index_list) - set(downloaded_episodes)
            episodes = list(difference)
        print(f"{self.title} unobtained episodes: {episodes}")
        return episodes
    async def _download_episodes(self, episode_index_list: list[int]) -> None:
        async with self.client:
            for episode_index in episode_index_list:
                episode_name = self.episode_titles[episode_index]
                episode_title = self._get_safe_file_name(episode_index, episode_name)
                print(episode_title)
                episode_path = self.webtoon_path / episode_title
                episode_path.mkdir(parents=True, exist_ok=True)
                # Throttle requests without blocking the event loop.
                await asyncio.sleep(2)
                is_download_successful = await self._download_episode(episode_index, episode_path)
                if is_download_successful:
                    self.lately_downloaded_episode.append(episode_path)
                    print(f"Download {episode_name} successful.")
                else:
                    print(f"Error _download_episode: {episode_name}")
                    break
async def _download_episode(self, episode_index: int, episode_path: Path) -> bool:
episode_images_url = self._get_episode_image_urls(episode_index)
if not episode_images_url:
print(f"Failed get image url for: {episode_path}")
return False
try:
await asyncio.gather(
*(
self._download_image(episode_path, element, i)
for i, element in enumerate(episode_images_url)
)
)
        except Exception:
            # Remove the partially downloaded episode folder so it is retried next run.
            shutil.rmtree(episode_path)
            raise
return True
    def _get_episode_image_urls(self, episode_index: int) -> list[str] | None:
        # Hook: implemented by the site-specific subclasses.
        pass

    async def _download_image(self, episode_path: Path, url: str, image_no: int) -> None:
        # Hook: implemented by the site-specific subclasses.
        pass
def _get_safe_file_name(self, episode_index: int, episode_name: str) -> str:
if self.title == '全知讀者視角':
episode_name = f"Ep{episode_name.split('.')[2]}"
episode_name = episode_name.replace("", " (")
episode_name = episode_name.replace("", ")")
elif self.title == '怪力亂神':
episode_name = episode_name.replace('話. ', '')
episode_title = f"{episode_index}.{episode_name}"
return pf.convert(html.unescape(episode_title))
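
Downloader is a template-method base class: download_webtoon drives the fixed pipeline (fetch info, save metadata, diff against already-downloaded folders, download what is missing) while subclasses supply the site-specific hooks. A hypothetical subclass sketch showing that contract; every name and URL here is invented for illustration:

from pathlib import Path

from downloaders.downloader import Downloader

class ExampleSite(Downloader):
    def _fetch_information(self, url) -> None:
        # Must set everything download_webtoon() and _save_information() read.
        self.title = "Example Title"
        self.author = "Example Author"
        self.tag = "action"
        self.description = "An example series."
        self.thumbnail_url = ""  # empty string makes download_webtoon() skip the thumbnail
        self.thumbnail_name = f"{self.webtoon_id}.jpg"

    def _fetch_episode_information(self) -> None:
        # Must set the episode titles and the indices of readable episodes.
        self.episode_titles = ["Ep 1", "Ep 2"]
        self.readabilities_index_list = [0, 1]

    def _get_episode_image_urls(self, episode_index: int) -> list[str]:
        return [f"https://example.com/{episode_index}/{i}.jpg" for i in range(3)]

    async def _download_image(self, episode_path: Path, url: str, image_no: int) -> None:
        image_raw = (await self.client.get(url)).content
        (episode_path / f"{image_no:03d}.jpg").write_bytes(image_raw)

ExampleSite(webtoon_id=1).download_webtoon("https://example.com/series/1", Path("downloads"))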

downloaders/kakao_webtoon.py Normal file

@@ -0,0 +1,141 @@
from pathlib import Path
import random
import sys
import time

from bs4 import BeautifulSoup
import httpx
import requests

from data.kakao_cookie import Cookie
from data.kakao_request import KakaoRequest
from downloaders.decrypt import Decrypt
from downloaders.downloader import Downloader


class KakaoWebtoon(Downloader):
    def __init__(self, webtoon_id: int, cookie: Cookie):
        super().__init__(webtoon_id)
        self._timestamp = int(time.time() * 1000)
        # Build a ten-character nonce from the digits 0-9 and lowercase a-z.
        chars = [*range(0x30, 0x3A), *range(0x61, 0x7B)]
        self._nonce = "".join(chr(i) for i in random.choices(chars, k=10))
        self.kakao_request = KakaoRequest(self._timestamp, self._nonce)
        self.cookie = cookie
        self.episode_headers = self.kakao_request.get_episode_headers(self.cookie.ant)
        self.post_headers = self.kakao_request.get_post_headers(self.cookie.ant)
    def verify_cookie(self) -> bool:
        # A 200 from the episode list endpoint means the session cookie is still valid.
        url = f"https://gateway.tw.kakaowebtoon.com/episode/v2/views/content-home/contents/{self.webtoon_id}/episodes?sort=-NO&offset=0&limit=30"
        res = requests.get(url, headers=self.episode_headers)
        return res.status_code == 200
def _fetch_information(self, url):
res = requests.get(url, headers=self.episode_headers)
if res.status_code == 200:
soup = BeautifulSoup(res.content, 'html.parser')
description = soup.find('meta', attrs={'name': 'description'})
if description:
self.description = description.get('content')
thumbnail_url = soup.find('meta', attrs={'property': 'og:image'})
if thumbnail_url:
self.thumbnail_url = thumbnail_url.get('content')
            # The first three <p> tags on the page hold title, author, and genre.
            all_p = soup.find_all('p')
            self.title = all_p[0].get_text()
            self.author = all_p[1].get_text()
            self.tag = all_p[2].get_text()
            self.thumbnail_name = f"{self.webtoon_id}.{self.thumbnail_url.split('.')[-1]}"
def _fetch_episode_information(self):
offset = 0
limit = 30
is_last: bool = False
webtoon_episodes_data = []
while not is_last:
url = f"https://gateway.tw.kakaowebtoon.com/episode/v2/views/content-home/contents/{self.webtoon_id}/episodes?sort=-NO&offset={offset}&limit={limit}"
res = requests.get(url, headers=self.episode_headers)
if res.status_code == 200:
json_data = res.json()
webtoon_episodes_data += json_data["data"]["episodes"]
offset += limit
is_last = json_data["meta"]["pagination"]["last"]
            else:
                print(f"_fetch_episode_information failed for {self.cookie.name}: {res.status_code}")
                sys.exit()
        episode_ids: list[int] = []
        seo_ids: list[str] = []
        numbers: list[int] = []
        episode_titles: list[str] = []
        readabilities: list[bool] = []
        # The API returns newest-first; reverse so index 0 is the first episode.
        for information in reversed(webtoon_episodes_data):
            episode_ids.append(information["id"])
            seo_ids.append(information["seoId"])
            numbers.append(information["no"])
            episode_titles.append(information["title"])
            readabilities.append(information["readable"])
        self.episode_ids = episode_ids
        self.seo_ids = seo_ids
        self.episode_titles = episode_titles
        self.readabilities_index_list = [index for index, value in enumerate(readabilities) if value]
    def _get_episode_image_urls(self, episode_index) -> list[tuple[str, bytes, bytes]] | None:
        episode_id = self.episode_ids[episode_index]
        url = f"https://gateway.tw.kakaowebtoon.com/episode/v1/views/viewer/episodes/{episode_id}/media-resources"
        payload = self.kakao_request.get_payload(episode_id)
        res = requests.post(url, headers=self.post_headers, json=payload)
        data = res.json()["data"]
        # aid/zid carry the encrypted AES key and IV for this episode's images.
        aid = data["media"]["aid"]
        zid = data["media"]["zid"]
        self.decrypt = Decrypt(aid, episode_id, self._timestamp, self._nonce, self.cookie.userID, zid)
        key, iv = self.decrypt.get_decrypt_informations()
        return [(i["url"], key, iv) for i in data["media"]["files"]]
    async def _download_image(
        self,
        episode_path: Path,
        url: tuple[str, bytes, bytes],
        image_no: int
    ) -> None:
        real_url, key, iv = url
        file_name = f"{image_no:03d}.webp"
        file_path = episode_path / file_name
        try:
            response = await self.client.get(real_url, headers=self.episode_headers)
            response.raise_for_status()
            image_raw: bytes = response.content
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e}")
            return
        except httpx.RequestError as e:
            # Covers timeouts, unsupported protocols, and other transport errors.
            print(f"An error occurred while requesting {real_url}: {e}")
            return
        decrypted_data = self.decrypt._decrypt(image_raw, key, iv)
        file_path.write_bytes(decrypted_data)
async def close(self):
await self.client.aclose()
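
Putting the pieces together, a hedged usage sketch. The Cookie constructor arguments below are assumptions; from the code above, the object must at least expose name, ant, and userID:

from pathlib import Path

from data.kakao_cookie import Cookie
from downloaders.kakao_webtoon import KakaoWebtoon

cookie = Cookie(name="main-account", ant="<ant token>", userID="<user id>")  # hypothetical constructor

downloader = KakaoWebtoon(webtoon_id=1234, cookie=cookie)
if downloader.verify_cookie():
    downloader.download_webtoon(
        "https://www.kakaowebtoon.com/content/example/1234",  # placeholder URL
        Path("downloads"),
    )
else:
    print("Session cookie rejected; refresh the ant token first.")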

downloaders/webtoon_com.py Normal file

@@ -0,0 +1,122 @@
from pathlib import Path
from typing import TYPE_CHECKING

from bs4 import BeautifulSoup
import httpx
import requests

from data.webtoon_request import get_webtoon_headers
from downloaders.downloader import Downloader


class Webtoon(Downloader):
    def __init__(self, webtoon_id: int):
        super().__init__(webtoon_id)
        self.headers = get_webtoon_headers()
        # All episode/viewer URLs are built from this series' base path.
        self.base_url = "https://www.webtoons.com/en/action/jungle-juice"
def _fetch_information(self, url):
res = requests.get(url, headers=self.headers)
if res.status_code == 200:
soup = BeautifulSoup(res.content, 'html.parser')
title = soup.find('meta', attrs={'property': 'og:title'})
if title:
self.title = title.get('content')
description = soup.find('meta', attrs={'property': 'og:description'})
if description:
self.description = description.get('content')
thumbnail_url = soup.find('meta', attrs={'property': 'og:image'})
if thumbnail_url:
self.thumbnail_url = thumbnail_url.get('content')
            # Author names are rendered as <h3> elements; join them into one string.
            author_list = soup.find_all('h3')
            h3_texts = [h3.get_text().strip() for h3 in author_list]
            self.author = ', '.join(h3_texts)
            self.tag = soup.find('h2', class_='genre').get_text()
            seo_id = url.split('/')[-2]
            thumbnail_type = 'png' if 'png' in self.thumbnail_url else 'jpg'
            self.thumbnail_name = f"{seo_id}.{thumbnail_type}"
            self.latest_title_no = soup.find('li', class_='_episodeItem').get('data-episode-no')
else:
print(f"fetch_information: {res.status_code}")
    def _fetch_episode_information(self):
        # The viewer page of any episode lists every episode in its sidebar.
        url = f"{self.base_url}/prologue/viewer?title_no={self.webtoon_id}&episode_no={self.latest_title_no}"
        res = requests.get(url, headers=self.headers)
        if res.status_code == 200:
            soup = BeautifulSoup(res.content, 'html.parser')
            li_tags = soup.find('div', class_='episode_cont').find_all('li', attrs={'data-episode-no': True})
            self.episode_titles = [li.find('span', class_='subj').get_text() for li in li_tags if li.find('span', class_='subj')]
            self.episode_urls = [li.find('a')['href'] for li in li_tags]
            self.episode_ids = [int(li.get('data-episode-no')) for li in li_tags]  # episode numbers start at 1
            self.readabilities_index_list = [episode_id - 1 for episode_id in self.episode_ids]
        else:
            print(f"fetch_episode_information: {res.status_code}")
    def _get_episode_image_urls(self, episode_index) -> list[str]:
        episode_id = self.episode_ids[episode_index]
        url = f"{self.base_url}/prologue/viewer?title_no={self.webtoon_id}&episode_no={episode_id}"
        episode_image_urls = []
        res = requests.get(url, headers=self.headers)
        if res.status_code == 200:
            soup = BeautifulSoup(res.content, 'html.parser')
            img_tags = soup.select("#_imageList > img")
            # The full-resolution image URL sits in each tag's data-url attribute.
            episode_image_urls = [element["data-url"] for element in img_tags]
            if TYPE_CHECKING:
                # Runtime no-op; narrows the attribute values to str for the type checker.
                episode_image_urls = [
                    episode_image_url for episode_image_url in episode_image_urls if isinstance(episode_image_url, str)
                ]
        else:
            print(f"get_episode_image_urls: {res.status_code}")
        return episode_image_urls
    async def _download_image(
        self,
        episode_path: Path,
        url: str,
        image_no: int
    ) -> None:
        file_name = f"{image_no:03d}.jpg"
        file_path = episode_path / file_name
        try:
            response = await self.client.get(url, headers=self.headers)
            response.raise_for_status()  # raises HTTPStatusError for 4xx/5xx responses
            image_raw: bytes = response.content
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e}")
            return
        except httpx.RequestError as e:
            # Covers timeouts, unsupported protocols, and other transport errors.
            print(f"An error occurred while requesting {url}: {e}")
            return
        file_path.write_bytes(image_raw)
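
Usage mirrors the other sites through the shared download_webtoon entry point. A hedged sketch; the title_no and list-page URL are placeholders, though the URL shape matters because _fetch_information derives the seo_id from the second-to-last path segment:

from pathlib import Path

from downloaders.webtoon_com import Webtoon

downloader = Webtoon(webtoon_id=1234)  # placeholder title_no
downloader.download_webtoon(
    # Placeholder list-page URL; url.split('/')[-2] must yield the seo id.
    "https://www.webtoons.com/en/action/jungle-juice/list?title_no=1234",
    Path("downloads"),
)
# Newly downloaded episode folders accumulate in downloader.lately_downloaded_episode.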