from pathlib import Path
import time
from typing import TYPE_CHECKING

from bs4 import BeautifulSoup
from httpx import AsyncClient, HTTPStatusError, RequestError
import httpx
import requests

from data.path_constant import DOWNLOAD_DIR
from data.webtoon_request import get_webtoon_headers
from downloaders.downloader import Downloader

class Webtoon(Downloader):
    """Downloader for a series hosted on webtoons.com.

    Scrapes series-level metadata (title, description, thumbnail, author,
    genre) and the episode list from the site's HTML, then downloads episode
    images asynchronously through the inherited httpx client.
    """

    def __init__(self, webtoon_id: int):
        """Initialize the downloader for the given webtoons.com ``title_no``."""
        super().__init__(webtoon_id)

        # Headers required by webtoons.com (referer etc.) — see get_webtoon_headers.
        self.headers = get_webtoon_headers()
        # NOTE(review): the SEO path is hard-coded to one series; webtoons.com
        # routes viewer pages by title_no, so this appears to work regardless,
        # but confirm before reusing for other series.
        self.base_url = "https://www.webtoons.com/en/action/jungle-juice"

    def _fetch_information(self, url):
        """Scrape series metadata from the series list page at *url*.

        Populates: title, description, thumbnail_url, author, tag,
        thumbnail_name, latest_title_no. On a non-200 response, logs the
        status code and leaves the attributes untouched.
        """
        res = requests.get(url, headers=self.headers)

        # Guard clause instead of nesting the whole body under `if 200`.
        if res.status_code != 200:
            print(f"fetch_information: {res.status_code}")
            return

        soup = BeautifulSoup(res.content, 'html.parser')

        title = soup.find('meta', attrs={'property': 'og:title'})
        if title:
            self.title = title.get('content')

        description = soup.find('meta', attrs={'property': 'og:description'})
        if description:
            self.description = description.get('content')

        thumbnail_url = soup.find('meta', attrs={'property': 'og:image'})
        if thumbnail_url:
            self.thumbnail_url = thumbnail_url.get('content')

        # Author names are rendered as <h3> headings on the series page;
        # join them into one comma-separated string.
        h3_texts = [h3.get_text().strip() for h3 in soup.find_all('h3')]
        self.author = ', '.join(h3_texts)

        # Genre heading. Guarded: the original crashed with AttributeError
        # when the element was missing.
        genre = soup.find('h2', class_='genre')
        self.tag = genre.get_text() if genre else ''

        # The SEO slug is the second-to-last path segment of the list URL.
        seo_id = url.split('/')[-2]
        thumbnail_type = 'png' if 'png' in self.thumbnail_url else 'jpg'
        self.thumbnail_name = seo_id + '.' + thumbnail_type

        # Newest episode number; used later to request the full episode list.
        latest_item = soup.find('li', class_='_episodeItem')
        if latest_item:
            self.latest_title_no = latest_item.get('data-episode-no')

    def _fetch_episode_information(self):
        """Fetch the full episode list from the latest episode's viewer page.

        Populates: episode_titles, episode_urls, episode_ids,
        readablities_index_list. On a non-200 response, logs the status
        code and leaves the attributes untouched.
        """
        url = (
            f"{self.base_url}/prologue/viewer"
            f"?title_no={self.webtoon_id}&episode_no={self.latest_title_no}"
        )
        res = requests.get(url, headers=self.headers)

        if res.status_code != 200:
            print(f"fetch_episode_information: {res.status_code}")
            return

        soup = BeautifulSoup(res.content, 'html.parser')
        li_tags = soup.find('div', class_='episode_cont').find_all(
            'li', attrs={'data-episode-no': True}
        )

        self.episode_titles = [
            li.find('span', class_='subj').get_text()
            for li in li_tags
            if li.find('span', class_='subj')
        ]
        self.episode_urls = [li.find('a')['href'] for li in li_tags]
        # Episode numbers on the site start at 1, not 0.
        self.episode_ids = [int(li.get('data-episode-no')) for li in li_tags]
        # NOTE(review): name keeps the original spelling ("readablities")
        # because callers outside this file may reference it.
        self.readablities_index_list = [
            episode_id - 1 for episode_id in self.episode_ids
        ]

    def _get_episode_image_urls(self, episode_index) -> list[str]:
        """Return the image URLs for the episode at *episode_index*.

        Returns an empty list (after logging the status code) when the
        viewer page cannot be fetched.
        """
        episode_id = self.episode_ids[episode_index]
        url = (
            f"{self.base_url}/prologue/viewer"
            f"?title_no={self.webtoon_id}&episode_no={episode_id}"
        )

        res = requests.get(url, headers=self.headers)
        if res.status_code != 200:
            print(f"get_episode_image_urls: {res.status_code}")
            return []

        soup = BeautifulSoup(res.content, 'html.parser')
        img_tags = soup.select("#_imageList > img")
        # BUG FIX: the original filtered non-str values inside
        # `if TYPE_CHECKING:`, which is False at runtime, so the filter
        # never ran. Filter unconditionally instead.
        return [
            img["data-url"]
            for img in img_tags
            if isinstance(img.get("data-url"), str)
        ]

    async def _download_image(
        self,
        episode_path: Path,
        url: str,
        image_no: int
    ) -> None:
        """Download one episode image to ``episode_path/NNN.jpg``.

        On any request failure the error is logged and nothing is written.
        (BUG FIX: the original fell through to write_bytes with `image_raw`
        unbound after an exception, raising NameError.)
        """
        file_extension = 'jpg'
        file_name = f"{image_no:03d}.{file_extension}"
        file_path = episode_path / file_name

        try:
            response = await self.client.get(url, headers=self.headers)
            response.raise_for_status()  # Raises HTTPStatusError for 4xx/5xx responses
        # BUG FIX: most specific handlers first — TimeoutException and
        # UnsupportedProtocol are subclasses of RequestError, so they were
        # unreachable when listed after it.
        except httpx.TimeoutException as e:
            print(f"Timeout error occurred: {e}")
            return
        except httpx.UnsupportedProtocol as e:
            print(f"Unsupported protocol error occurred: {e}")
            return
        except httpx.RequestError as e:
            print(f"An error occurred while requesting {url}: {e}")
            return
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e}")
            return
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return

        file_path.write_bytes(response.content)