my-webtoon/downloaders/webtoon_com.py

from pathlib import Path
import time
from typing import TYPE_CHECKING
from bs4 import BeautifulSoup
from httpx import AsyncClient, HTTPStatusError, RequestError
import httpx
import requests
from data.path_constant import DOWNLOAD_DIR
from data.webtoon_request import get_webtoon_headers
from downloaders.downloader import Downloader
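
# Downloader for webtoons.com: series metadata is read from the list page's Open
# Graph tags, the episode index from the viewer's episode sidebar, and episode
# images are fetched through self.client, assumed to be an async httpx client
# provided by the Downloader base class.
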
class Webtoon(Downloader):
    def __init__(self, webtoon_id: int):
        super().__init__(webtoon_id)
        self.headers = get_webtoon_headers()
        # NOTE: the base URL is hard-coded to one series; only webtoon_id is parameterised.
        self.base_url = "https://www.webtoons.com/en/action/jungle-juice"
    def _fetch_information(self, url):
        res = requests.get(url, headers=self.headers)
        if res.status_code == 200:
            soup = BeautifulSoup(res.content, 'html.parser')
            # Series metadata is exposed through Open Graph meta tags.
            title = soup.find('meta', attrs={'property': 'og:title'})
            if title:
                self.title = title.get('content')
            description = soup.find('meta', attrs={'property': 'og:description'})
            if description:
                self.description = description.get('content')
            thumbnail_url = soup.find('meta', attrs={'property': 'og:image'})
            if thumbnail_url:
                self.thumbnail_url = thumbnail_url.get('content')
            # Author names are rendered as <h3> elements on the list page.
            author_list = soup.find_all('h3')
            h3_texts = [h3.get_text().strip() for h3 in author_list]
            author = ', '.join(h3_texts)
            tag = soup.find('h2', class_='genre').get_text()
            self.author = author
            self.tag = tag
            # The second-to-last URL segment is expected to be the series' SEO slug.
            seo_id = url.split('/')[-2]
            thumbnail_type = 'png' if 'png' in self.thumbnail_url else 'jpg'
            self.thumbnail_name = seo_id + '.' + thumbnail_type
            # The newest episode is the first item in the episode list.
            self.latest_title_no = soup.find('li', class_='_episodeItem').get('data-episode-no')
        else:
            print(f"fetch_information: {res.status_code}")
    def _fetch_episode_information(self):
        # The viewer page of the latest episode lists every episode in its sidebar.
        url = f"{self.base_url}/prologue/viewer?title_no={self.webtoon_id}&episode_no={self.latest_title_no}"
        res = requests.get(url, headers=self.headers)
        if res.status_code == 200:
            soup = BeautifulSoup(res.content, 'html.parser')
            li_tags = soup.find('div', class_='episode_cont').find_all('li', attrs={'data-episode-no': True})
            self.episode_titles = [li.find('span', class_='subj').get_text() for li in li_tags if li.find('span', class_='subj')]
            self.episode_urls = [li.find('a')['href'] for li in li_tags]
            self.episode_ids = [int(li.get('data-episode-no')) for li in li_tags]  # episode numbers start at 1, not 0
            self.readablities_index_list = [id - 1 for id in self.episode_ids]
        else:
            print(f"fetch_episode_information: {res.status_code}")
    def _get_episode_image_urls(self, episode_index) -> list[str]:
        # url = self.episode_urls[episode_index]
        episode_id = self.episode_ids[episode_index]
        url = f"{self.base_url}/prologue/viewer?title_no={self.webtoon_id}&episode_no={episode_id}"
        episode_image_urls = []
        res = requests.get(url, headers=self.headers)
        if res.status_code == 200:
            soup = BeautifulSoup(res.content, 'html.parser')
            # Image URLs live in the data-url attribute of each <img> in the viewer.
            img_tags = soup.select("#_imageList > img")
            episode_image_urls = [element["data-url"] for element in img_tags]
            if TYPE_CHECKING:
                # Evaluated only by the type checker (TYPE_CHECKING is False at
                # runtime); narrows the element type to str.
                episode_image_urls = [
                    episode_image_url for episode_image_url in episode_image_urls if isinstance(episode_image_url, str)
                ]
        else:
            print(f"get_episode_image_urls: {res.status_code}")
        return episode_image_urls
    async def _download_image(
        self,
        episode_path: Path,
        url: str,
        image_no: int
    ) -> None:
        file_extension = 'jpg'
        file_name = f"{image_no:03d}.{file_extension}"
        file_path = episode_path / file_name
        try:
            response = await self.client.get(url, headers=self.headers)
            response.raise_for_status()  # raises HTTPStatusError for 4xx/5xx responses
            image_raw: bytes = response.content
        except httpx.TimeoutException as e:
            # Specific transport errors must be caught before RequestError,
            # otherwise these handlers are unreachable (they are subclasses of it).
            print(f"Timeout error occurred: {e}")
            return
        except httpx.UnsupportedProtocol as e:
            print(f"Unsupported protocol error occurred: {e}")
            return
        except httpx.RequestError as e:
            print(f"An error occurred while requesting {url}: {e}")
            return
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e}")
            return
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return
        # Only write the file on success; without the early returns above,
        # image_raw would be unbound here after a failed request.
        file_path.write_bytes(image_raw)
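
# --- Hypothetical usage sketch (not part of the original file) ---
# The public entry point presumably lives in the Downloader base class; this
# sketch drives the private hooks directly for illustration. The title_no, the
# list-page URL, and the episode_path layout are placeholders/assumptions.
#
# import asyncio
#
# async def main():
#     webtoon = Webtoon(webtoon_id=1234)  # placeholder title_no
#     webtoon._fetch_information("https://www.webtoons.com/en/action/jungle-juice/list?title_no=1234")
#     webtoon._fetch_episode_information()
#     image_urls = webtoon._get_episode_image_urls(0)  # first episode
#     episode_path = DOWNLOAD_DIR / "jungle-juice" / "001"  # assumed directory layout
#     episode_path.mkdir(parents=True, exist_ok=True)
#     await asyncio.gather(
#         *(webtoon._download_image(episode_path, url, i)
#           for i, url in enumerate(image_urls, start=1))
#     )
#
# asyncio.run(main())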