summaryrefslogtreecommitdiffstats
path: root/webtoons-download-weekly.bash
blob: f28a7384c289866794ce7f79bbe511455df5f10a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/bin/bash
# Copyright © 2024 DiffieHellman
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

# Download weekly webtoons via scraping
# Requires a list of webtoons in the format: https://www.webtoons.com/en/<type>/<name>/list?title_no=<number> to be placed in a SUBSCRIBED file
# Dependencies: GNU coreutils, GNU wget, GNU sed, GNU find, GNU parallel, gallery-dl

# Exit immediately if a command exits with a non-zero status (-e);
# treat expansion of unset variables as an error (-u).
set -eu

DOWNLOAD_DIR=~/Downloads
TMP_FILE=$(mktemp)
# Remove the temp file on ANY exit path (error or success), not only when
# the script reaches the explicit rm at the bottom.
trap 'rm -f -- "$TMP_FILE"' EXIT
SCRIPT_DIR="$(dirname "$(realpath "$0")")"
SUBSCRIBED="$SCRIPT_DIR/webtoons-list-of-webtoons.txt"
PREVIOUSLY_DLD="$SCRIPT_DIR/webtoons-previously-downloaded.txt"
# Make sure the history file exists so the first run's grep doesn't complain.
touch "$PREVIOUSLY_DLD"

# Each line of SUBSCRIBED is a series "list" URL to check for a new episode.
# IFS= and -r keep the URL byte-for-byte (no whitespace trimming, no
# backslash interpretation).
while IFS= read -r to_extract ; do
	# Scrape the list page for the newest episode viewer URL:
	#   sed  - extract the …?title_no=N&episode_no=N URL from the HTML
	#   head - keep only the first (most recent) episode
	to_download=$(wget -qO- "$to_extract" | sed -n 's/.*\(https:.*?title_no=[[:digit:]]\+&episode_no=[[:digit:]]\+\).*/\1/p' |head -n 1)

	# Nothing matched (layout change, network hiccup, …): skip instead of
	# recording an empty line and handing gallery-dl an empty argument.
	if [ -z "$to_download" ]; then
		echo -e "\e[1;33m# no episode URL found at $to_extract\e[0m" >&2
		continue
	fi

	# -F: the URL contains '?' and '.', so compare as a fixed string, not a
	# regex; -x: whole line; -q: quiet. A missing history file = "not seen".
	if [ -f "$PREVIOUSLY_DLD" ] && grep -xqF "$to_download" "$PREVIOUSLY_DLD"
		then
		#Previously downloaded this one - either it's not next week yet, is on hiatus, or has ended
		echo -e "\e[1;31m# $to_download previously downloaded\e[0m"
		continue

		else
		#Schedule download, add to previous list and print to stdout
		echo "$to_download" | tee --append "$TMP_FILE" "$PREVIOUSLY_DLD"
	fi

done < "$SUBSCRIBED"

# Fan the queued episode URLs out to eight concurrent gallery-dl workers.
# Feeding the list via stdin redirection sidesteps the kernel's
# "argument list too long" limit on very long queues.
parallel --jobs 8 gallery-dl -d "$DOWNLOAD_DIR" {} < "$TMP_FILE"

rm -- "$TMP_FILE"

# Normalise downloaded filenames so a plain string sort matches natural sort.
# Target format: <chapter, zero-padded>-<image, zero-padded>.<extension>
# Processing one file at a time avoids "argument list too long".
# TODO: fix the assumption that >999 episodes or images per episode won't occur
# NUL-delimited read (-d '') with IFS= and -r handles every legal filename,
# including ones with spaces or backslashes.
find "$DOWNLOAD_DIR/webtoons" -type f -print0 | while IFS= read -r -d '' file
do
	base_path="${file%-*}"                       # strip "-<image>.<ext>"; leaves …/<chapter>
	chapter_number="${base_path##*[![:digit:]]}" # trailing digits = chapter number
	base_path=$(dirname "$base_path")            # directory part only

	filenumber="${file##*-}"       # "<image>.<ext>"
	filenumber="${filenumber%%.*}" # image number only
	extension="${file##*.}"        # extension only

	# 10# forces base-10 so zero-padded numbers aren't parsed as octal.
	new_file=$(printf '%s/%03d-%03d.%s' "$base_path" "$((10#$chapter_number))" "$((10#$filenumber))" "$extension")
	if [ "$file" != "$new_file" ]
		then
		mv -- "$file" "$new_file"
		fi

done