# canvas-student-data-export/kaltura_downloader.py
# (Removed GitHub web-UI chrome accidentally captured in the paste:
#  "Files", "Raw Blame History", line/size counters, and the ambiguous-Unicode
#  banner. They are not part of the source and made the file invalid Python.)
# built in
import json
import os
import itertools
import re
import string
import unicodedata
import argparse
import sys
# external
from bs4 import BeautifulSoup
from canvasapi import Canvas
from canvasapi.exceptions import ResourceDoesNotExist, Unauthorized, Forbidden, InvalidAccessToken, CanvasException
import dateutil.parser
import jsonpickle
import requests
import yaml
# Kaltura
import time, urllib
from urllib.parse import urljoin, urlparse
import yt_dlp
import pickle
# On-disk resume cache: a pickled list of course IDs already downloaded,
# written after each course finishes (see savePickle calls in __main__).
pickle_file = "./cache.pickle"
# Course IDs finished so far; reloaded from the pickle cache in __main__ so an
# interrupted run can resume where it left off.
classes_done = []
def loadPickle(filename):
    """Deserialize and return the object pickled in *filename*.

    NOTE(review): pickle.load can execute arbitrary code on untrusted input;
    this cache file is assumed to be produced locally by savePickle.
    """
    with open(filename, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload
def savePickle(filename, data):
    """Pickle *data* out to *filename*, overwriting any existing file."""
    with open(filename, mode='wb') as handle:
        pickle.dump(data, handle)
class courseView():
    """Lightweight container for the course attributes this exporter needs.

    Plain data only, so instances serialize cleanly (e.g. with jsonpickle).
    BUG FIX: the original also declared mutable lists (assignments, etc.) as
    class-level attributes — the classic shared-mutable-class-attribute
    footgun. They were redundant with __init__, so all state now lives on the
    instance.
    """

    def __init__(self):
        # Canvas numeric course id (0 when unknown).
        self.course_id = 0
        # Term name (sanitized by the caller via makeValidFilename).
        self.term = ""
        # Short course code (also sanitized by the caller).
        self.course_code = ""
        # Human-readable course name (NOT sanitized — display only).
        self.name = ""
        # Per-instance collections; never shared between instances.
        self.assignments = []
        self.announcements = []
        self.discussions = []
        self.modules = []
def makeValidFilename(input_str):
    """Sanitize *input_str* into a string safe to use as a file/folder name.

    Returns the input unchanged when it is falsy (None or "").
    Keeps only ASCII letters, digits, and "-_.() "; collapses whitespace,
    maps Canvas-style '+' to space and ':'/'/' to '-', and strips leading/
    trailing whitespace and trailing periods (invalid on Windows).
    """
    if not input_str:
        return input_str
    # Normalize Unicode (NFKC folds compatibility characters) and whitespace.
    input_str = unicodedata.normalize('NFKC', input_str)
    input_str = input_str.replace("\u00A0", " ")  # NBSP to plain space
    input_str = re.sub(r"\s+", " ", input_str)    # collapse whitespace runs
    # Characters allowed in the result.
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    input_str = input_str.replace("+", " ")  # Canvas default for spaces
    input_str = input_str.replace(":", "-")
    input_str = input_str.replace("/", "-")
    input_str = "".join(c for c in input_str if c in valid_chars)
    # IDIOM FIX: strip() replaces the redundant lstrip().rstrip() chain.
    input_str = input_str.strip()
    # Windows disallows trailing periods in folder names.
    input_str = input_str.rstrip(".")
    return input_str
def _load_credentials(path: str) -> dict:
"""Return a dict with API_URL, API_KEY, USER_ID, COOKIES_PATH or empty dict if file missing."""
try:
with open(path, "r", encoding="utf-8") as f:
return yaml.full_load(f) or {}
except FileNotFoundError:
return {}
# Placeholder globals will be overwritten in __main__ once we have parsed CLI args.
API_URL = ""        # Canvas base URL, e.g. https://school.instructure.com
API_KEY = ""        # Canvas API access token
USER_ID = 0         # Canvas numeric user id (compared against the logged-in user)
COOKIES_PATH = ""   # Path to a cookies file handed to yt-dlp
# Directory in which to download course information to (will be created if not
# present)
DL_LOCATION = "./output"
# List of Course IDs that should be skipped
COURSES_TO_SKIP = []
# strftime-style date template.
# NOTE(review): appears unused within this file — may be referenced by code
# outside this view; confirm before removing.
DATE_TEMPLATE = "%B %d, %Y %I:%M %p"
# Max PATH length is 260 characters on Windows. 70 is just an estimate for a reasonable max folder name to prevent the chance of reaching the limit
# Applies to modules, assignments, announcements, and discussions
# If a folder exceeds this limit, a "-" will be added to the end to indicate it was shortened ("..." not valid)
# NOTE(review): also appears unused within this file.
MAX_FOLDER_NAME_SIZE = 70
def getCourseView(course):
    """Build a courseView summary from a canvasapi Course object.

    Missing attributes on *course* fall back to benign defaults so partially
    populated Course objects never raise.
    """
    course_view = courseView()
    # IDIOM FIX: getattr with a default replaces the hasattr/ternary chains.
    course_view.course_id = getattr(course, "id", 0)
    # Term may be absent entirely, or present but lacking a .name attribute.
    term_obj = getattr(course, "term", None)
    course_view.term = makeValidFilename(getattr(term_obj, "name", ""))
    course_view.course_code = makeValidFilename(getattr(course, "course_code", ""))
    # Course name is intentionally left unsanitized (used for display only).
    course_view.name = getattr(course, "name", "")
    return course_view
## -------- Kaltura Functions -------------
# Base URL of the Kaltura (KAF) media-gallery channel endpoint.
# NOTE(review): hard-coded to UCSD's instance — other schools would need a
# different host.
base_url = "https://canvaskaf.ucsd.edu/channel/"
# Query string for the ajax listing; the page number is appended after "page="
# by getVideoList.
params = "?sortBy=createdAtAsc&format=ajax&page="
# Session cookie required by KAF; the placeholder value is replaced at startup
# from the KAF_COOKIE field of credentials.yaml (see __main__).
kaf_cookies = {
    "kms_ctamuls": "SEE CREDENTIALS.YAML"
}
def getVideoList(classID):
    """Scrape every media-entry URL from a course's Kaltura Media Gallery.

    Pages through the KAF ajax endpoint until the sentinel text
    "There are no more media items." appears. Returns a de-duplicated list of
    absolute video page URLs, or [] on HTTP 403 (no gallery access).
    """
    full_url = base_url + str(classID) + params
    page_num = 1
    video_links = []
    while True:
        print("Page: " + str(page_num))
        # Be polite to the KAF server between page fetches.
        time.sleep(3)
        req_url = full_url + str(page_num)
        print(req_url)
        page = requests.get(req_url, cookies=kaf_cookies)
        if page.status_code == 403:
            print("Video unavailable, skipping")
            return []
        # Sentinel string marks the page past the last media item.
        if "There are no more media items." in page.text:
            break
        # The ajax response wraps an HTML fragment inside JSON.
        json_data = page.json()
        content = json_data["content"][0]["content"]
        # IDIOM FIX: dropped the original's unused `lines = content.split(...)`
        # local — bs4 parses the whole fragment directly.
        soup = BeautifulSoup(content, "html.parser")
        for link in soup.find_all("a"):
            href = link.attrs.get("href")
            # Only anchors pointing at media entries are video links.
            if href and "media" in href:
                # Re-anchor the (possibly relative) href on the KAF host.
                vid_url = urljoin(base_url, urlparse(href).path)
                if vid_url not in video_links:
                    video_links.append(vid_url)
        print(f"Page {page_num} done")
        page_num += 1
    return video_links
if __name__ == "__main__":
    print("Welcome to the Kaltura Canvas Student Data Export Tool\n")
    parser = argparse.ArgumentParser(description="Export nearly all of a student's Kaltura (Media Gallery) from classes.")
    parser.add_argument("-c", "--config", default="credentials.yaml", help="Path to YAML credentials file (default: credentials.yaml)")
    parser.add_argument("-o", "--output", default="./output", help="Directory to store exported data (default: ./output)")
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose output for debugging.")
    parser.add_argument("--version", action="version", version="Kaltura Canvas Export Tool 1.0")
    args = parser.parse_args()

    # Load and validate credentials from the YAML config. COOKIES_PATH is
    # required just like the other three fields, so validate them together.
    creds = _load_credentials(args.config)
    required = ["API_URL", "API_KEY", "USER_ID", "COOKIES_PATH"]
    missing = [k for k in required if not creds.get(k)]
    if missing:
        print(f"Error: {args.config} is missing required field(s): {', '.join(missing)}.")
        print("Please create the YAML file with the following structure:\n"
              "API_URL: https://<your>.instructure.com\n"
              "API_KEY: <your key>\n"
              "USER_ID: 123456\n"
              "COOKIES_PATH: path/to/cookies.txt\n")
        sys.exit(1)

    # Populate globals expected throughout the script.
    API_URL = creds["API_URL"].strip().rstrip('/')
    API_KEY = creds["API_KEY"].strip()  # strip whitespace — a common paste error
    USER_ID = creds["USER_ID"]
    COOKIES_PATH = creds.get("COOKIES_PATH", "")
    COURSES_TO_SKIP = creds.get("COURSES_TO_SKIP", [])

    # The KAF session cookie is mandatory for the Media Gallery scraping.
    test_KAF = creds.get("KAF_COOKIE")
    if test_KAF:
        kaf_cookies['kms_ctamuls'] = test_KAF
    else:
        print("kms_ctamuls flag required, see credentials.yaml for information.")
        raise SystemExit(1)

    chrome_path_override = creds.get("CHROME_PATH")
    if chrome_path_override:
        # BUG FIX: the original called override_chrome_path(), a function that
        # is not defined anywhere in this script — it raised NameError whenever
        # CHROME_PATH was set. Warn and continue instead.
        print("Warning: CHROME_PATH is set but overriding the Chrome path is not supported; ignoring.")

    # Update output directory from CLI args.
    DL_LOCATION = args.output

    print("\nConnecting to Canvas…\n")
    canvas = Canvas(API_URL, API_KEY)

    # Test the connection and API key.
    try:
        user = canvas.get_current_user()
        print(f"Successfully authenticated as: {user.name} (ID: {user.id})")
        if user.id != USER_ID:
            print(f"Warning: Authenticated user ID ({user.id}) does not match configured USER_ID ({USER_ID})")
    except (InvalidAccessToken, Unauthorized, Forbidden) as e:
        # BUG FIX: the original delegated to CanvasErrorHandler, a class that is
        # undefined in this file and itself raised NameError. Bad credentials
        # are fatal; report and exit.
        print(f"FATAL: Canvas authentication failed: {e}")
        sys.exit(1)
    except CanvasException as e:
        # Other API errors are treated as non-fatal, matching the original's
        # intent of logging and continuing.
        print(f"Warning: Canvas authentication check failed: {e}")
        if args.verbose:
            print(repr(e))

    print(f"Creating output directory: {DL_LOCATION}\n")
    os.makedirs(DL_LOCATION, exist_ok=True)

    print("Getting list of all courses\n")
    # Two paginated lists: current enrollments and completed ones.
    courses_list = [
        canvas.get_courses(enrollment_state="active", include="term"),
        canvas.get_courses(enrollment_state="completed", include="term")
    ]

    # Resume support: the pickle file records course IDs already downloaded.
    if os.path.exists(pickle_file):
        print("Pickle file found, loading")
        classes_done = loadPickle(pickle_file)
    else:
        print("No pickle file found, starting from scratch")
        classes_done = []

    skip = set(COURSES_TO_SKIP)
    for courses in courses_list:
        for course in courses:
            # Skip explicitly excluded, malformed, or already-finished courses.
            if course.id in skip or not hasattr(course, "name") or not hasattr(course, "term") or course.id in classes_done:
                print("Skipping course: " + str(course.id))
                continue
            course_view = getCourseView(course)
            dl_dir = os.path.join(DL_LOCATION, course_view.term, course_view.course_code, "Lectures/Kaltura")
            print(dl_dir)
            yt_dlp_options = {
                'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
                'outtmpl': '%(title)s [%(id)s].%(ext)s',
                'merge_output_format': 'mp4',
                'quiet': False,
                'postprocessors': [{
                    'key': 'FFmpegMetadata',
                    'add_metadata': True,
                }],
                'writesubtitles': True,
                'writeautomaticsub': True,
                'subtitlesformat': 'vtt',
                # NOTE(review): 'embedsubs'/'embedthumbnail' look like CLI flag
                # names rather than YoutubeDL option keys — confirm against the
                # yt-dlp embedded-options docs.
                'embedsubs': True,
                'embedthumbnail': True,
                'paths': {
                    'home': dl_dir,
                    'temp': "",
                },
                # Shared archive file lets yt-dlp skip already-downloaded IDs.
                'download_archive': 'kaltura-ytdl-history.txt',
                # BUG FIX: the YoutubeDL option is 'cookiefile'; the original
                # key 'cookies' is not a recognized option, so the cookies file
                # was silently never loaded.
                'cookiefile': COOKIES_PATH,
            }
            kaltura_video_list = getVideoList(course.id)
            # For each video in the media gallery, download using yt-dlp.
            for url in kaltura_video_list:
                with yt_dlp.YoutubeDL(yt_dlp_options) as ydl:
                    try:
                        ydl.download([url])
                    except yt_dlp.DownloadError:
                        # Probably a failed subtitle download; record and move
                        # on. (The redundant f.close() inside the original
                        # `with` block was removed — `with` closes the file.)
                        with open("failed.txt", "a") as f:
                            f.write(dl_dir + ": " + url + "\n")
            classes_done.append(course.id)
            # Persist progress after every course so a crash can resume.
            savePickle(pickle_file, classes_done)

    # BUG FIX: report courses actually downloaded; the original printed
    # len(courses_list), which is always 2 (the two paginated lists).
    print(f'Finished, {len(classes_done)} classes downloaded')
    # BUG FIX: the cache file never exists when zero courses were processed,
    # and an unguarded os.remove would raise FileNotFoundError.
    if os.path.exists(pickle_file):
        os.remove(pickle_file)