from pathlib import Path
import time
from typing import TYPE_CHECKING

from bs4 import BeautifulSoup
from httpx import AsyncClient, HTTPStatusError, RequestError
import httpx
import requests

from data.path_constant import DOWNLOAD_DIR
from data.webtoon_request import get_webtoon_headers
from downloaders.downloader import Downloader


class Webtoon(Downloader):
    """Downloader for a Webtoons.com series.

    Scrapes series metadata and the episode list with ``requests`` +
    BeautifulSoup, then fetches episode images asynchronously via the
    ``httpx`` client provided by the ``Downloader`` base class.
    """

    def __init__(self, webtoon_id: int):
        super().__init__(webtoon_id)
        self.headers = get_webtoon_headers()
        # NOTE(review): base_url is hard-coded to one series; presumably it
        # should be derived from webtoon_id — confirm against callers.
        self.base_url = "https://www.webtoons.com/en/action/jungle-juice"

    def _fetch_information(self, url):
        """Scrape series-level metadata from the series landing page.

        Populates title, description, thumbnail_url, author, tag,
        thumbnail_name and latest_title_no from the page's OpenGraph
        meta tags and episode list. Prints the status code on failure.
        """
        res = requests.get(url, headers=self.headers)
        if res.status_code == 200:
            soup = BeautifulSoup(res.content, 'html.parser')
            title = soup.find('meta', attrs={'property': 'og:title'})
            if title:
                self.title = title.get('content')
            description = soup.find('meta', attrs={'property': 'og:description'})
            if description:
                self.description = description.get('content')
            thumbnail_url = soup.find('meta', attrs={'property': 'og:image'})
            if thumbnail_url:
                self.thumbnail_url = thumbnail_url.get('content')
            # Author names are rendered as <h3> headings on this page.
            author_list = soup.find_all('h3')
            h3_texts = [h3.get_text().strip() for h3 in author_list]
            author = ', '.join(h3_texts)
            # NOTE(review): assumes <h2 class="genre"> and the og:image meta
            # always exist; missing elements would raise AttributeError here.
            tag = soup.find('h2', class_='genre').get_text()
            self.author = author
            self.tag = tag
            # The SEO slug is the second-to-last path segment of the URL.
            seo_id = url.split('/')[-2]
            thumbnail_type = 'png' if 'png' in self.thumbnail_url else 'jpg'
            self.thumbnail_name = seo_id + '.' + thumbnail_type
            self.latest_title_no = soup.find('li', class_='_episodeItem').get('data-episode-no')
        else:
            print(f"fetch_information: {res.status_code}")

    def _fetch_episode_information(self):
        """Scrape the episode list (titles, URLs, numeric ids) from the viewer page."""
        url = f"{self.base_url}/prologue/viewer?title_no={self.webtoon_id}&episode_no={self.latest_title_no}"
        res = requests.get(url, headers=self.headers)
        if res.status_code == 200:
            self.episode_titles = []
            soup = BeautifulSoup(res.content, 'html.parser')
            li_tags = soup.find('div', class_='episode_cont').find_all(
                'li', attrs={'data-episode-no': True}
            )
            self.episode_titles = [
                li.find('span', class_='subj').get_text()
                for li in li_tags
                if li.find('span', class_='subj')
            ]
            self.episode_urls = [li.find('a')['href'] for li in li_tags]
            self.episode_ids = [int(li.get('data-episode-no')) for li in li_tags]
            # Episode numbers start at 1, so shift by one for 0-based indices.
            self.readablities_index_list = [id - 1 for id in self.episode_ids]
        else:
            print(f"fetch_episode_information: {res.status_code}")

    def _get_episode_image_urls(self, episode_index) -> list[str]:
        """Return the image URLs for one episode (by 0-based index).

        Returns an empty list and prints the status code on HTTP failure.
        """
        episode_id = self.episode_ids[episode_index]
        url = f"{self.base_url}/prologue/viewer?title_no={self.webtoon_id}&episode_no={episode_id}"
        episode_image_urls = []
        res = requests.get(url, headers=self.headers)
        if res.status_code == 200:
            soup = BeautifulSoup(res.content, 'html.parser')
            img_tags = soup.select("#_imageList > img")
            episode_image_urls = [element["data-url"] for element in img_tags]
            # Type-narrowing only: this branch never runs at runtime, it just
            # convinces the type checker the list contains plain strings.
            if TYPE_CHECKING:
                episode_image_urls = [
                    episode_image_url
                    for episode_image_url in episode_image_urls
                    if isinstance(episode_image_url, str)
                ]
        else:
            print(f"get_episode_image_urls: {res.status_code}")
        return episode_image_urls

    async def _download_image(
        self, episode_path: Path, url: str, image_no: int
    ) -> None:
        """Fetch one image and write it to episode_path as a zero-padded .jpg.

        On any request/HTTP error the error is printed and nothing is
        written (the original code raised NameError on the unbound
        ``image_raw`` in that case).
        """
        file_extension = 'jpg'
        file_name = f"{image_no:03d}.{file_extension}"
        file_path = episode_path / file_name
        try:
            response = await self.client.get(url, headers=self.headers)
            response.raise_for_status()  # Raises HTTPStatusError for 4xx/5xx responses
        # Most specific httpx errors first: TimeoutException and
        # UnsupportedProtocol are subclasses of RequestError, so listing
        # RequestError first made these branches unreachable.
        except httpx.TimeoutException as e:
            print(f"Timeout error occurred: {e}")
        except httpx.UnsupportedProtocol as e:
            print(f"Unsupported protocol error occurred: {e}")
        except httpx.RequestError as e:
            print(f"An error occurred while requesting {url}: {e}")
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e}")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
        else:
            # Only write the file when the request actually succeeded.
            image_raw: bytes = response.content
            file_path.write_bytes(image_raw)