from pathlib import Path
import time
from typing import TYPE_CHECKING

from bs4 import BeautifulSoup
from httpx import AsyncClient, HTTPStatusError, RequestError
import httpx
import requests

from data.path_constant import DOWNLOAD_DIR
from data.webtoon_request import get_webtoon_headers
from downloaders.downloader import Downloader

class Webtoon(Downloader):
    """Downloader for a series hosted on webtoons.com.

    Scrapes series-level metadata (title, description, thumbnail, author,
    genre) and the episode list from the site's HTML, then downloads episode
    images asynchronously through the inherited httpx client.
    """

    def __init__(self, webtoon_id: int):
        """Initialize the downloader for the given webtoons.com ``title_no``."""
        super().__init__(webtoon_id)

        # Headers required by webtoons.com (referer etc.) — see get_webtoon_headers.
        self.headers = get_webtoon_headers()
        # NOTE(review): the SEO path is hard-coded to one series; webtoons.com
        # routes viewer pages by title_no, so this appears to work regardless,
        # but confirm before reusing for other series.
        self.base_url = "https://www.webtoons.com/en/action/jungle-juice"

    def _fetch_information(self, url):
        """Scrape series metadata from the series list page at *url*.

        Populates: title, description, thumbnail_url, author, tag,
        thumbnail_name, latest_title_no. On a non-200 response, logs the
        status code and leaves the attributes untouched.
        """
        res = requests.get(url, headers=self.headers)

        # Guard clause instead of nesting the whole body under `if 200`.
        if res.status_code != 200:
            print(f"fetch_information: {res.status_code}")
            return

        soup = BeautifulSoup(res.content, 'html.parser')

        title = soup.find('meta', attrs={'property': 'og:title'})
        if title:
            self.title = title.get('content')

        description = soup.find('meta', attrs={'property': 'og:description'})
        if description:
            self.description = description.get('content')

        thumbnail_url = soup.find('meta', attrs={'property': 'og:image'})
        if thumbnail_url:
            self.thumbnail_url = thumbnail_url.get('content')

        # Author names are rendered as <h3> headings on the series page;
        # join them into one comma-separated string.
        h3_texts = [h3.get_text().strip() for h3 in soup.find_all('h3')]
        self.author = ', '.join(h3_texts)

        # Genre heading. Guarded: the original crashed with AttributeError
        # when the element was missing.
        genre = soup.find('h2', class_='genre')
        self.tag = genre.get_text() if genre else ''

        # The SEO slug is the second-to-last path segment of the list URL.
        seo_id = url.split('/')[-2]
        thumbnail_type = 'png' if 'png' in self.thumbnail_url else 'jpg'
        self.thumbnail_name = seo_id + '.' + thumbnail_type

        # Newest episode number; used later to request the full episode list.
        latest_item = soup.find('li', class_='_episodeItem')
        if latest_item:
            self.latest_title_no = latest_item.get('data-episode-no')

    def _fetch_episode_information(self):
        """Fetch the full episode list from the latest episode's viewer page.

        Populates: episode_titles, episode_urls, episode_ids,
        readablities_index_list. On a non-200 response, logs the status
        code and leaves the attributes untouched.
        """
        url = (
            f"{self.base_url}/prologue/viewer"
            f"?title_no={self.webtoon_id}&episode_no={self.latest_title_no}"
        )
        res = requests.get(url, headers=self.headers)

        if res.status_code != 200:
            print(f"fetch_episode_information: {res.status_code}")
            return

        soup = BeautifulSoup(res.content, 'html.parser')
        li_tags = soup.find('div', class_='episode_cont').find_all(
            'li', attrs={'data-episode-no': True}
        )

        self.episode_titles = [
            li.find('span', class_='subj').get_text()
            for li in li_tags
            if li.find('span', class_='subj')
        ]
        self.episode_urls = [li.find('a')['href'] for li in li_tags]
        # Episode numbers on the site start at 1, not 0.
        self.episode_ids = [int(li.get('data-episode-no')) for li in li_tags]
        # NOTE(review): name keeps the original spelling ("readablities")
        # because callers outside this file may reference it.
        self.readablities_index_list = [
            episode_id - 1 for episode_id in self.episode_ids
        ]

    def _get_episode_image_urls(self, episode_index) -> list[str]:
        """Return the image URLs for the episode at *episode_index*.

        Returns an empty list (after logging the status code) when the
        viewer page cannot be fetched.
        """
        episode_id = self.episode_ids[episode_index]
        url = (
            f"{self.base_url}/prologue/viewer"
            f"?title_no={self.webtoon_id}&episode_no={episode_id}"
        )

        res = requests.get(url, headers=self.headers)
        if res.status_code != 200:
            print(f"get_episode_image_urls: {res.status_code}")
            return []

        soup = BeautifulSoup(res.content, 'html.parser')
        img_tags = soup.select("#_imageList > img")
        # BUG FIX: the original filtered non-str values inside
        # `if TYPE_CHECKING:`, which is False at runtime, so the filter
        # never ran. Filter unconditionally instead.
        return [
            img["data-url"]
            for img in img_tags
            if isinstance(img.get("data-url"), str)
        ]

    async def _download_image(
        self,
        episode_path: Path,
        url: str,
        image_no: int
    ) -> None:
        """Download one episode image to ``episode_path/NNN.jpg``.

        On any request failure the error is logged and nothing is written.
        (BUG FIX: the original fell through to write_bytes with `image_raw`
        unbound after an exception, raising NameError.)
        """
        file_extension = 'jpg'
        file_name = f"{image_no:03d}.{file_extension}"
        file_path = episode_path / file_name

        try:
            response = await self.client.get(url, headers=self.headers)
            response.raise_for_status()  # Raises HTTPStatusError for 4xx/5xx responses
        # BUG FIX: most specific handlers first — TimeoutException and
        # UnsupportedProtocol are subclasses of RequestError, so they were
        # unreachable when listed after it.
        except httpx.TimeoutException as e:
            print(f"Timeout error occurred: {e}")
            return
        except httpx.UnsupportedProtocol as e:
            print(f"Unsupported protocol error occurred: {e}")
            return
        except httpx.RequestError as e:
            print(f"An error occurred while requesting {url}: {e}")
            return
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e}")
            return
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return

        file_path.write_bytes(response.content)