# canvas-student-data-export/kaltura_downloader.py
# (Removed GitHub web-UI chrome accidentally captured in the paste:
#  "Files", "Raw Blame History", line/size counters, and the ambiguous-Unicode
#  banner. They are not part of the source and made the file invalid Python.)
# built in
import json
import os
import itertools
import re
import string
import unicodedata
import argparse
import sys
# external
from bs4 import BeautifulSoup
from canvasapi import Canvas
from canvasapi.exceptions import ResourceDoesNotExist, Unauthorized, Forbidden, InvalidAccessToken, CanvasException
import dateutil.parser
import jsonpickle
import requests
import yaml
# Kaltura
import time, urllib
from urllib.parse import urljoin, urlparse
import yt_dlp
import pickle
# On-disk resume cache: a pickled list of course IDs already downloaded,
# written after each course finishes (see savePickle calls in __main__).
pickle_file = "./cache.pickle"
# Course IDs finished so far; reloaded from the pickle cache in __main__ so an
# interrupted run can resume where it left off.
classes_done = []
def loadPickle(filename):
    """Deserialize and return the object pickled in *filename*.

    NOTE(review): pickle.load can execute arbitrary code on untrusted input;
    this cache file is assumed to be produced locally by savePickle.
    """
    with open(filename, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload
def savePickle(filename, data):
    """Pickle *data* out to *filename*, overwriting any existing file."""
    with open(filename, mode='wb') as handle:
        pickle.dump(data, handle)
class courseView():
    """Lightweight container for the course attributes this exporter needs.

    Plain data only, so instances serialize cleanly (e.g. with jsonpickle).
    BUG FIX: the original also declared mutable lists (assignments, etc.) as
    class-level attributes — the classic shared-mutable-class-attribute
    footgun. They were redundant with __init__, so all state now lives on the
    instance.
    """

    def __init__(self):
        # Canvas numeric course id (0 when unknown).
        self.course_id = 0
        # Term name (sanitized by the caller via makeValidFilename).
        self.term = ""
        # Short course code (also sanitized by the caller).
        self.course_code = ""
        # Human-readable course name (NOT sanitized — display only).
        self.name = ""
        # Per-instance collections; never shared between instances.
        self.assignments = []
        self.announcements = []
        self.discussions = []
        self.modules = []
def makeValidFilename(input_str):
    """Sanitize *input_str* into a string safe to use as a file/folder name.

    Returns the input unchanged when it is falsy (None or "").
    Keeps only ASCII letters, digits, and "-_.() "; collapses whitespace,
    maps Canvas-style '+' to space and ':'/'/' to '-', and strips leading/
    trailing whitespace and trailing periods (invalid on Windows).
    """
    if not input_str:
        return input_str
    # Normalize Unicode (NFKC folds compatibility characters) and whitespace.
    input_str = unicodedata.normalize('NFKC', input_str)
    input_str = input_str.replace("\u00A0", " ")  # NBSP to plain space
    input_str = re.sub(r"\s+", " ", input_str)    # collapse whitespace runs
    # Characters allowed in the result.
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    input_str = input_str.replace("+", " ")  # Canvas default for spaces
    input_str = input_str.replace(":", "-")
    input_str = input_str.replace("/", "-")
    input_str = "".join(c for c in input_str if c in valid_chars)
    # IDIOM FIX: strip() replaces the redundant lstrip().rstrip() chain.
    input_str = input_str.strip()
    # Windows disallows trailing periods in folder names.
    input_str = input_str.rstrip(".")
    return input_str
def _load_credentials(path: str) -> dict:
"""Return a dict with API_URL, API_KEY, USER_ID, COOKIES_PATH or empty dict if file missing."""
try:
with open(path, "r", encoding="utf-8") as f:
return yaml.full_load(f) or {}
except FileNotFoundError:
return {}
# Placeholder globals will be overwritten in __main__ once we have parsed CLI args.
API_URL = ""        # Canvas base URL, e.g. https://school.instructure.com
API_KEY = ""        # Canvas API access token
USER_ID = 0         # Canvas numeric user id (compared against the logged-in user)
COOKIES_PATH = ""   # Path to a cookies file handed to yt-dlp
# Directory in which to download course information to (will be created if not
# present)
DL_LOCATION = "./output"
# List of Course IDs that should be skipped
COURSES_TO_SKIP = []
# strftime-style date template.
# NOTE(review): appears unused within this file — may be referenced by code
# outside this view; confirm before removing.
DATE_TEMPLATE = "%B %d, %Y %I:%M %p"
# Max PATH length is 260 characters on Windows. 70 is just an estimate for a reasonable max folder name to prevent the chance of reaching the limit
# Applies to modules, assignments, announcements, and discussions
# If a folder exceeds this limit, a "-" will be added to the end to indicate it was shortened ("..." not valid)
# NOTE(review): also appears unused within this file.
MAX_FOLDER_NAME_SIZE = 70
def getCourseView(course):
    """Build a courseView summary from a canvasapi Course object.

    Missing attributes on *course* fall back to benign defaults so partially
    populated Course objects never raise.
    """
    course_view = courseView()
    # IDIOM FIX: getattr with a default replaces the hasattr/ternary chains.
    course_view.course_id = getattr(course, "id", 0)
    # Term may be absent entirely, or present but lacking a .name attribute.
    term_obj = getattr(course, "term", None)
    course_view.term = makeValidFilename(getattr(term_obj, "name", ""))
    course_view.course_code = makeValidFilename(getattr(course, "course_code", ""))
    # Course name is intentionally left unsanitized (used for display only).
    course_view.name = getattr(course, "name", "")
    return course_view
## -------- Kaltura Functions -------------
# Base URL of the Kaltura (KAF) media-gallery channel endpoint.
# NOTE(review): hard-coded to UCSD's instance — other schools would need a
# different host.
base_url = "https://canvaskaf.ucsd.edu/channel/"
# Query string for the ajax listing; the page number is appended after "page="
# by getVideoList.
params = "?sortBy=createdAtAsc&format=ajax&page="
# Session cookie required by KAF; the placeholder value is replaced at startup
# from the KAF_COOKIE field of credentials.yaml (see __main__).
kaf_cookies = {
    "kms_ctamuls": "SEE CREDENTIALS.YAML"
}
def getVideoList(classID):
    """Scrape every media-entry URL from a course's Kaltura Media Gallery.

    Pages through the KAF ajax endpoint until the sentinel text
    "There are no more media items." appears. Returns a de-duplicated list of
    absolute video page URLs, or [] on HTTP 403 (no gallery access).
    """
    full_url = base_url + str(classID) + params
    page_num = 1
    video_links = []
    while True:
        print("Page: " + str(page_num))
        # Be polite to the KAF server between page fetches.
        time.sleep(3)
        req_url = full_url + str(page_num)
        print(req_url)
        page = requests.get(req_url, cookies=kaf_cookies)
        if page.status_code == 403:
            print("Video unavailable, skipping")
            return []
        # Sentinel string marks the page past the last media item.
        if "There are no more media items." in page.text:
            break
        # The ajax response wraps an HTML fragment inside JSON.
        json_data = page.json()
        content = json_data["content"][0]["content"]
        # IDIOM FIX: dropped the original's unused `lines = content.split(...)`
        # local — bs4 parses the whole fragment directly.
        soup = BeautifulSoup(content, "html.parser")
        for link in soup.find_all("a"):
            href = link.attrs.get("href")
            # Only anchors pointing at media entries are video links.
            if href and "media" in href:
                # Re-anchor the (possibly relative) href on the KAF host.
                vid_url = urljoin(base_url, urlparse(href).path)
                if vid_url not in video_links:
                    video_links.append(vid_url)
        print(f"Page {page_num} done")
        page_num += 1
    return video_links
if __name__ == "__main__":
    print("Welcome to the Kaltura Canvas Student Data Export Tool\n")
    parser = argparse.ArgumentParser(description="Export nearly all of a student's Kaltura (Media Gallery) from classes.")
    parser.add_argument("-c", "--config", default="credentials.yaml", help="Path to YAML credentials file (default: credentials.yaml)")
    parser.add_argument("-o", "--output", default="./output", help="Directory to store exported data (default: ./output)")
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose output for debugging.")
    parser.add_argument("--version", action="version", version="Kaltura Canvas Export Tool 1.0")
    args = parser.parse_args()

    # Load and validate credentials from the YAML config. COOKIES_PATH is
    # required just like the other three fields, so validate them together.
    creds = _load_credentials(args.config)
    required = ["API_URL", "API_KEY", "USER_ID", "COOKIES_PATH"]
    missing = [k for k in required if not creds.get(k)]
    if missing:
        print(f"Error: {args.config} is missing required field(s): {', '.join(missing)}.")
        print("Please create the YAML file with the following structure:\n"
              "API_URL: https://<your>.instructure.com\n"
              "API_KEY: <your key>\n"
              "USER_ID: 123456\n"
              "COOKIES_PATH: path/to/cookies.txt\n")
        sys.exit(1)

    # Populate globals expected throughout the script.
    API_URL = creds["API_URL"].strip().rstrip('/')
    API_KEY = creds["API_KEY"].strip()  # strip whitespace — a common paste error
    USER_ID = creds["USER_ID"]
    COOKIES_PATH = creds.get("COOKIES_PATH", "")
    COURSES_TO_SKIP = creds.get("COURSES_TO_SKIP", [])

    # The KAF session cookie is mandatory for the Media Gallery scraping.
    test_KAF = creds.get("KAF_COOKIE")
    if test_KAF:
        kaf_cookies['kms_ctamuls'] = test_KAF
    else:
        print("kms_ctamuls flag required, see credentials.yaml for information.")
        raise SystemExit(1)

    chrome_path_override = creds.get("CHROME_PATH")
    if chrome_path_override:
        # BUG FIX: the original called override_chrome_path(), a function that
        # is not defined anywhere in this script — it raised NameError whenever
        # CHROME_PATH was set. Warn and continue instead.
        print("Warning: CHROME_PATH is set but overriding the Chrome path is not supported; ignoring.")

    # Update output directory from CLI args.
    DL_LOCATION = args.output

    print("\nConnecting to Canvas…\n")
    canvas = Canvas(API_URL, API_KEY)

    # Test the connection and API key.
    try:
        user = canvas.get_current_user()
        print(f"Successfully authenticated as: {user.name} (ID: {user.id})")
        if user.id != USER_ID:
            print(f"Warning: Authenticated user ID ({user.id}) does not match configured USER_ID ({USER_ID})")
    except (InvalidAccessToken, Unauthorized, Forbidden) as e:
        # BUG FIX: the original delegated to CanvasErrorHandler, a class that is
        # undefined in this file and itself raised NameError. Bad credentials
        # are fatal; report and exit.
        print(f"FATAL: Canvas authentication failed: {e}")
        sys.exit(1)
    except CanvasException as e:
        # Other API errors are treated as non-fatal, matching the original's
        # intent of logging and continuing.
        print(f"Warning: Canvas authentication check failed: {e}")
        if args.verbose:
            print(repr(e))

    print(f"Creating output directory: {DL_LOCATION}\n")
    os.makedirs(DL_LOCATION, exist_ok=True)

    print("Getting list of all courses\n")
    # Two paginated lists: current enrollments and completed ones.
    courses_list = [
        canvas.get_courses(enrollment_state="active", include="term"),
        canvas.get_courses(enrollment_state="completed", include="term")
    ]

    # Resume support: the pickle file records course IDs already downloaded.
    if os.path.exists(pickle_file):
        print("Pickle file found, loading")
        classes_done = loadPickle(pickle_file)
    else:
        print("No pickle file found, starting from scratch")
        classes_done = []

    skip = set(COURSES_TO_SKIP)
    for courses in courses_list:
        for course in courses:
            # Skip explicitly excluded, malformed, or already-finished courses.
            if course.id in skip or not hasattr(course, "name") or not hasattr(course, "term") or course.id in classes_done:
                print("Skipping course: " + str(course.id))
                continue
            course_view = getCourseView(course)
            dl_dir = os.path.join(DL_LOCATION, course_view.term, course_view.course_code, "Lectures/Kaltura")
            print(dl_dir)
            yt_dlp_options = {
                'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
                'outtmpl': '%(title)s [%(id)s].%(ext)s',
                'merge_output_format': 'mp4',
                'quiet': False,
                'postprocessors': [{
                    'key': 'FFmpegMetadata',
                    'add_metadata': True,
                }],
                'writesubtitles': True,
                'writeautomaticsub': True,
                'subtitlesformat': 'vtt',
                # NOTE(review): 'embedsubs'/'embedthumbnail' look like CLI flag
                # names rather than YoutubeDL option keys — confirm against the
                # yt-dlp embedded-options docs.
                'embedsubs': True,
                'embedthumbnail': True,
                'paths': {
                    'home': dl_dir,
                    'temp': "",
                },
                # Shared archive file lets yt-dlp skip already-downloaded IDs.
                'download_archive': 'kaltura-ytdl-history.txt',
                # BUG FIX: the YoutubeDL option is 'cookiefile'; the original
                # key 'cookies' is not a recognized option, so the cookies file
                # was silently never loaded.
                'cookiefile': COOKIES_PATH,
            }
            kaltura_video_list = getVideoList(course.id)
            # For each video in the media gallery, download using yt-dlp.
            for url in kaltura_video_list:
                with yt_dlp.YoutubeDL(yt_dlp_options) as ydl:
                    try:
                        ydl.download([url])
                    except yt_dlp.DownloadError:
                        # Probably a failed subtitle download; record and move
                        # on. (The redundant f.close() inside the original
                        # `with` block was removed — `with` closes the file.)
                        with open("failed.txt", "a") as f:
                            f.write(dl_dir + ": " + url + "\n")
            classes_done.append(course.id)
            # Persist progress after every course so a crash can resume.
            savePickle(pickle_file, classes_done)

    # BUG FIX: report courses actually downloaded; the original printed
    # len(courses_list), which is always 2 (the two paginated lists).
    print(f'Finished, {len(classes_done)} classes downloaded')
    # BUG FIX: the cache file never exists when zero courses were processed,
    # and an unguarded os.remove would raise FileNotFoundError.
    if os.path.exists(pickle_file):
        os.remove(pickle_file)