From 56b2a66c78eef13616e8e43ada9a83acc17ba415 Mon Sep 17 00:00:00 2001
From: Joachim
Date: Sun, 20 Apr 2025 10:58:36 +0200
Subject: [PATCH] Upload files to "/"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 main.py | 195 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100644 main.py

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..bc52a7e
--- /dev/null
+++ b/main.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+# Title: RadioFrance Podcast Downloader
+# Author: Rey Joachim, @_@mamot.fr
+# "A complex solution to a dumb problem"
+import requests
+import argparse
+from pathlib import Path
+from typing import Any
+
+# --- SETTINGS ---
+
+parser = argparse.ArgumentParser(prog="RadioFrance Podcast Downloader")
+parser.add_argument(
+    "-u",
+    "--url",
+    required=True,
+    help="The podcast's URL, might work with other media, e.g.: https://www.radiofrance.fr/franceinter/podcasts/[name]",
+)
+parser.add_argument(
+    "-p",
+    "--page",
+    required=True,
+    help="The pages you want to download, formatted like a-b (e.g. for pages 1 to 30: 0-30)",
+)
+parser.add_argument("--noLimit", action="store_true")
+args = parser.parse_args()
+
+assert (
+    len(args.page.split("-")) == 2
+), "Wrong page formatting, should be formatted like a-b (e.g. for pages 1 to 30: 0-30)"
+
+# "remove" the recursion limit if needed
+if args.noLimit:
+    from sys import setrecursionlimit
+
+    setrecursionlimit(10**6)
+
+# define the URL
+if args.url[-1] == "/":
+    BASE_URL = args.url
+else:
+    BASE_URL = args.url + "/"
+URL_END = "__data.json"
+
+PATH = str(Path().resolve()) + "/"
+TOTAL = 0
+SUCCESS = 0
+WARN = 0
+ERR = 0
+
+# let the user override the download path
+user_path = str(input(f"Where should the files be downloaded? [default: {PATH}] "))
+if len(user_path) > 0:
+    if user_path[-1] != "/":
+        PATH = user_path + "/"
+    else:
+        PATH = user_path
+
+# --- FUNCTIONS ---
+
+
+def getPage(p: int) -> dict:
+    """
+    Fetches the JSON of one page.
+    In: p -> page (int)
+    Out: (dict)
+    """
+    # only requests from p=2 to p=N are valid, and [URL]__data.json seems to return p=1 for some reason (but not always...)
+    if p == 1:
+        api_aws = requests.get(BASE_URL + URL_END).json()["nodes"][3]["data"]
+    else:
+        api_aws = requests.get(BASE_URL + URL_END + "?p=" + str(p)).json()["nodes"][3][
+            "data"
+        ]
+    return lookForChilds(api_aws[0], api_aws)
+
+
+def lookForChilds(item: Any, un_list: list) -> Any:
+    """
+    Recursively reconstructs the JSON.
+    In: item(Any), un_list(list)
+    Out: (Any)
+    """
+
+    if type(item) is dict:
+        tmp = {}
+        for key in item.keys():
+            tmp[key] = lookForChilds(item[key], un_list)
+        return tmp
+
+    if type(item) is list:
+        tmp = []
+        for elem in item:
+            tmp.append(lookForChilds(elem, un_list))
+        return tmp
+
+    if type(item) is int and 0 <= item < len(un_list):
+        tmp = un_list[item]
+        # "break" the link, prevents an infinite loop but at the cost of information loss (a better method is needed)
+        un_list[item] = str(un_list[item])
+        return lookForChilds(tmp, un_list)
+
+    return item
+
+
+def extractData(a: int, b: int) -> dict:
+    """
+    Builds a dictionary in the {title: URL} format from the reconstructed JSON of pages a to b.
+    If an object has no title, then title = NameError id + index of the object.
+    If an object has no source, then its URL is None.
+    In: a(int), b(int)
+    Out: (dict)
+    """
+
+    extracted = {}
+
+    for p in range(int(a), int(b)):
+        try:  # e.g., if the page does not exist (out of range)
+            page = getPage(p)["metadata"]["pagination"]["items"]
+            for i in range(len(page)):
+                # sometimes the name or the source is missing
+                url = None  # default, also covers an empty source list
+                try:
+                    for source in page[i]["playerInfo"]["media"]["sources"]:
+                        url = source["url"]
+                        if source["url"].split(".")[-1] == "mp3":
+                            break
+                except TypeError:  # "media" or "sources" is missing
+                    pass
+                try:
+                    name = f'{page[0]["playerInfo"]["playerMetadata"]["firstLine"]} - {page[i]["playerInfo"]["playerMetadata"]["secondLine"]}'
+                except TypeError:
+                    name = f'{page[0]["playerInfo"]["playerMetadata"]["firstLine"]} - NameError P{p}L{i} '
+                extracted[name] = url
+            print(f"[INFO] page {p} loaded")
+        except Exception:
+            print(f"[WARN] page {p} did not return valid data, exiting...")
+            break
+    return extracted
+
+
+def downloadPodcast(info: dict) -> int:
+    """
+    Downloads the files listed in a {title: URL} dictionary into PATH.
+    In: info(dict)
+    Out: (int)
+    """
+    global TOTAL, SUCCESS, WARN, ERR  # preserved if the function crashes
+    rem_k = ""
+
+    try:
+        for key in info.copy():
+            rem_k = key
+            if info[key] is not None:
+                aws = requests.get(info[key], stream=True)  # stream, do not load the whole file in memory
+                fname = f"{key}.{info[key].split('.')[-1]}".replace("/", "-")  # "/" would be read as a directory
+                with open(PATH + fname, "wb") as fd:
+                    for chunk in aws.iter_content(chunk_size=128):
+                        fd.write(chunk)
+                print(f"[INFO] {key} downloaded")
+                SUCCESS += 1
+            else:
+                print(f"[WARN] {key} does not have a source ({info[key]})")
+                WARN += 1
+            info.pop(key)
+            TOTAL += 1
+        return 0
+    except Exception as e:
+        print(f"[ERROR] {e}")
+        info.pop(rem_k)  # remove the problematic entry
+        TOTAL += 1
+        ERR += 1
+        return 1
+
+
+# --- CODE ---
+
+# fetch info
+print("[INFO] Generating download queue")
+podcasts = extractData(*args.page.split("-"))
+
+# download files
+print(f"[INFO] Downloading files to {PATH}")
+status = 1
+while status != 0:
+    status = downloadPodcast(podcasts)
+
+# end
+print("-- RESULTS --")
+print(f"Tried to download {TOTAL} files")
+print(f"x{ERR} errors")
+print(f"x{WARN} warnings")
+print(f"x{SUCCESS} successes")
+TOTAL = max(TOTAL, 1)  # avoid a division by zero if there was no content
+print(f"{int(SUCCESS/TOTAL*100)}% success, {int((WARN + ERR)/TOTAL*100)}% problem")
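-- 
Usage, as a minimal sketch (the bracketed show slug is the placeholder from the
--url help text, not a real feed):

  python3 main.py -u "https://www.radiofrance.fr/franceinter/podcasts/[name]" -p 0-30

A toy illustration of the indexed JSON that lookForChilds() resolves (the values
are made up; only the shape follows the code, where an integer is an index into
the flat list returned under nodes[3]["data"]):

  [{"metadata": 1}, {"pagination": 2}, {"items": 3}, [4], {"playerInfo": 5}, "..."]

Starting from element 0, the resolver expands each index in place, yielding
{"metadata": {"pagination": {"items": [{"playerInfo": "..."}]}}}, which is the
nesting extractData() then walks.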