351 lines
11 KiB
Python
351 lines
11 KiB
Python
# built in
|
||
import json
|
||
import os
|
||
import itertools
|
||
import re
|
||
import string
|
||
import unicodedata
|
||
import argparse
|
||
import sys
|
||
|
||
# external
|
||
from bs4 import BeautifulSoup
|
||
from canvasapi import Canvas
|
||
from canvasapi.exceptions import ResourceDoesNotExist, Unauthorized, Forbidden, InvalidAccessToken, CanvasException
|
||
import dateutil.parser
|
||
import jsonpickle
|
||
import requests
|
||
import yaml
|
||
|
||
# Kaltura
|
||
import time, urllib
|
||
from urllib.parse import urljoin, urlparse
|
||
import yt_dlp
|
||
|
||
import pickle
|
||
|
||
# Resume-support cache: pickled list of course ids that have already been
# exported, so an interrupted run can pick up where it left off.
pickle_file = "./cache.pickle"
# Populated in __main__ (loaded from the cache file when it exists).
classes_done = []
|
||
|
||
def loadPickle(filename):
    """Deserialize and return the object stored in *filename*."""
    with open(filename, 'rb') as handle:
        return pickle.load(handle)
|
||
|
||
def savePickle(filename, data):
    """Serialize *data* into *filename* with pickle (overwrites the file)."""
    with open(filename, 'wb') as handle:
        pickle.dump(data, handle)
|
||
|
||
class courseView():
    """Lightweight container for the per-course data this exporter collects."""

    # Class-level defaults are kept for backward compatibility with any code
    # that reads them off the class itself (e.g. courseView.course_id).
    course_id = 0
    term = ""
    course_code = ""
    name = ""
    assignments = []
    announcements = []
    discussions = []
    modules = []

    def __init__(self):
        # Bind every attribute on the instance. The list attributes in
        # particular MUST be fresh objects here — relying on the class-level
        # lists above would make all instances share one mutable list.
        # (Previously only the lists were instance-initialized; the scalars
        # silently fell through to the class attributes.)
        self.course_id = 0
        self.term = ""
        self.course_code = ""
        self.name = ""
        self.assignments = []
        self.announcements = []
        self.discussions = []
        self.modules = []
|
||
|
||
def makeValidFilename(input_str):
    """Sanitize *input_str* into a cross-platform-safe file/folder name.

    Falsy input (None, "") is returned unchanged.
    """
    if not input_str:
        return input_str

    # Normalize Unicode, map NBSPs to plain spaces, and collapse any run of
    # whitespace down to a single space.
    cleaned = unicodedata.normalize('NFKC', input_str)
    cleaned = cleaned.replace("\u00A0", " ")
    cleaned = re.sub(r"\s+", " ", cleaned)

    # Canvas encodes spaces as '+'; map path-hostile separators to '-'.
    for old, new in (("+", " "), (":", "-"), ("/", "-")):
        cleaned = cleaned.replace(old, new)

    # Drop everything outside the whitelist of filename-safe characters.
    allowed = "-_.() %s%s" % (string.ascii_letters, string.digits)
    cleaned = "".join(ch for ch in cleaned if ch in allowed)

    # Trim surrounding whitespace, then trailing periods (invalid on Windows).
    return cleaned.strip().rstrip(".")
|
||
|
||
def _load_credentials(path: str) -> dict:
    """Load the YAML credentials file at *path*.

    Returns a dict with API_URL, API_KEY, USER_ID, COOKIES_PATH, or an
    empty dict when the file is missing or parses to nothing.
    """
    try:
        with open(path, "r", encoding="utf-8") as cred_file:
            # NOTE(review): yaml.safe_load would suffice for this flat
            # key/value schema and is the recommended loader — confirm no
            # custom tags are needed before switching.
            parsed = yaml.full_load(cred_file)
    except FileNotFoundError:
        return {}
    return parsed or {}
|
||
|
||
# Placeholder globals – will be overwritten in __main__ once we have parsed CLI args.
API_URL = ""       # Canvas instance base URL, e.g. https://school.instructure.com
API_KEY = ""       # Canvas API access token
USER_ID = 0        # Canvas numeric user id (checked against the authenticated user)
COOKIES_PATH = ""  # Path to a cookies.txt file handed to yt-dlp

# Directory in which to download course information to (will be created if not
# present)
DL_LOCATION = "./output"
# List of Course IDs that should be skipped
COURSES_TO_SKIP = []

# strftime template for rendering Canvas timestamps.
DATE_TEMPLATE = "%B %d, %Y %I:%M %p"

# Max PATH length is 260 characters on Windows. 70 is just an estimate for a reasonable max folder name to prevent the chance of reaching the limit
# Applies to modules, assignments, announcements, and discussions
# If a folder exceeds this limit, a "-" will be added to the end to indicate it was shortened ("..." not valid)
MAX_FOLDER_NAME_SIZE = 70
|
||
|
||
def getCourseView(course):
    """Build a courseView populated from a canvasapi Course object.

    Missing attributes on *course* fall back to harmless defaults so a
    partially-populated Course never raises here.
    """
    view = courseView()

    # Course ID
    view.course_id = getattr(course, "id", 0)

    # Course term — the term object itself may be absent or lack a name.
    term_obj = getattr(course, "term", None)
    view.term = makeValidFilename(getattr(term_obj, "name", ""))

    # Course code
    view.course_code = makeValidFilename(getattr(course, "course_code", ""))

    # Course name (kept verbatim; only used for display)
    view.name = getattr(course, "name", "")

    return view
|
||
|
||
## -------- Kaltura Functions -------------

# Base URL of the institution's Kaltura (KAF) media-gallery channel pages.
base_url = "https://canvaskaf.ucsd.edu/channel/"
# Query string: sort oldest-first, request the AJAX (JSON) fragment, paginate.
params = "?sortBy=createdAtAsc&format=ajax&page="

# Session cookie for the KAF host; the real value is read from the
# KAF_COOKIE field of credentials.yaml in __main__.
kaf_cookies = {
    "kms_ctamuls": "SEE CREDENTIALS.YAML"
}
|
||
|
||
def getVideoList(classID):
    """Scrape the Kaltura media-gallery channel for course *classID*.

    Walks the paginated AJAX endpoint until the "no more media" marker
    appears and returns a de-duplicated list of absolute video-page URLs.
    Returns an empty list when the gallery is not accessible (HTTP 403).
    """
    # Build the full url; the page number is appended per iteration.
    full_url = base_url + str(classID) + params

    page_num = 1
    video_links = []

    # Loop through the pages until the end-of-media sentinel appears.
    while True:
        print("Page: " + str(page_num))
        # Be polite to the server between page fetches.
        time.sleep(3)

        req_url = full_url + str(page_num)
        print(req_url)
        # Timeout prevents the whole export from hanging forever on a
        # stalled connection (previously there was none).
        page = requests.get(req_url, cookies=kaf_cookies, timeout=60)

        if page.status_code == 403:
            print("Video unavailable, skipping")
            return []

        # The gallery signals the end of pagination with this sentinel text.
        if "There are no more media items." in page.text:
            break

        # The AJAX response wraps an HTML fragment inside a JSON envelope.
        json_data = page.json()
        content = json_data["content"][0]["content"]

        # Collect every anchor that points at a media page.
        soup = BeautifulSoup(content, "html.parser")
        for link in soup.find_all("a", href=True):
            href = link.attrs["href"]
            if "media" not in href:
                continue

            # Re-anchor the relative href on the KAF host, dropping any
            # query string / fragment via urlparse().path.
            vid_url = urljoin(base_url, urlparse(href).path)
            if vid_url not in video_links:
                video_links.append(vid_url)

        print(f"Page {page_num} done")
        page_num += 1

    return video_links
|
||
|
||
if __name__ == "__main__":
    print("Welcome to the Kaltura Canvas Student Data Export Tool\n")

    parser = argparse.ArgumentParser(description="Export nearly all of a student's Kaltura (Media Gallery) from classes.")
    parser.add_argument("-c", "--config", default="credentials.yaml", help="Path to YAML credentials file (default: credentials.yaml)")
    parser.add_argument("-o", "--output", default="./output", help="Directory to store exported data (default: ./output)")
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose output for debugging.")
    parser.add_argument("--version", action="version", version="Kaltura Canvas Export Tool 1.0")

    args = parser.parse_args()

    # Load credentials from YAML
    creds = _load_credentials(args.config)

    # Validate credentials: all four fields must be present and non-empty.
    # (COOKIES_PATH folded into the single required list — it was checked
    # separately before, with identical effect.)
    required = ["API_URL", "API_KEY", "USER_ID", "COOKIES_PATH"]
    missing = [k for k in required if not creds.get(k)]

    if missing:
        print(f"Error: {args.config} is missing required field(s): {', '.join(missing)}.")
        print("Please create the YAML file with the following structure:\n"
              "API_URL: https://<your>.instructure.com\n"
              "API_KEY: <your key>\n"
              "USER_ID: 123456\n"
              "COOKIES_PATH: path/to/cookies.txt\n")
        sys.exit(1)

    # Populate globals expected throughout the script
    API_URL = creds["API_URL"].strip().rstrip('/')
    API_KEY = creds["API_KEY"].strip()  # stray whitespace around tokens is a common support issue
    USER_ID = creds["USER_ID"]
    COOKIES_PATH = creds.get("COOKIES_PATH", "")
    COURSES_TO_SKIP = creds.get("COURSES_TO_SKIP", [])

    # The Kaltura session cookie is mandatory for the media-gallery scrape.
    test_KAF = creds.get("KAF_COOKIE")
    if test_KAF:
        kaf_cookies['kms_ctamuls'] = test_KAF
    else:
        print("kms_ctamuls flag required, see credentials.yaml for information.")
        raise SystemExit(1)

    # NOTE(review): override_chrome_path is not defined in this file —
    # presumably provided elsewhere; confirm before relying on CHROME_PATH.
    chrome_path_override = creds.get("CHROME_PATH")
    if chrome_path_override:
        override_chrome_path(chrome_path_override)

    # Update output directory
    DL_LOCATION = args.output

    print("\nConnecting to Canvas…\n")

    # Initialize a new Canvas object
    canvas = Canvas(API_URL, API_KEY)

    # Test the connection and API key before doing any real work.
    try:
        user = canvas.get_current_user()
        print(f"Successfully authenticated as: {user.name} (ID: {user.id})")
        if user.id != USER_ID:
            print(f"Warning: Authenticated user ID ({user.id}) does not match configured USER_ID ({USER_ID})")
    except Exception as e:
        # NOTE(review): CanvasErrorHandler is not defined in this file —
        # presumably imported elsewhere; confirm it exists, otherwise this
        # handler itself raises NameError.
        error_type, message = CanvasErrorHandler.handle_canvas_exception(
            e, "Canvas authentication"
        )
        if CanvasErrorHandler.is_fatal_error(error_type):
            print(f"FATAL: {message}")
            sys.exit(1)
        else:
            CanvasErrorHandler.log_error(error_type, message, verbose=args.verbose)

    print(f"Creating output directory: {DL_LOCATION}\n")
    os.makedirs(DL_LOCATION, exist_ok=True)

    all_courses_views = []

    print("Getting list of all courses\n")
    # Active and completed enrollments come back as two paginated lists.
    courses_list = [
        canvas.get_courses(enrollment_state="active", include="term"),
        canvas.get_courses(enrollment_state="completed", include="term")
    ]

    # Resume support: the pickle file records course ids already exported.
    if os.path.exists(pickle_file):
        print("Pickle file found, loading")
        classes_done = loadPickle(pickle_file)
    else:
        print("No pickle file found, starting from scratch")
        classes_done = []

    skip = set(COURSES_TO_SKIP)
    downloaded_count = 0  # courses actually processed during this run

    for courses in courses_list:
        for course in courses:
            # Skip explicitly excluded, malformed, or already-exported courses.
            if course.id in skip or not hasattr(course, "name") or not hasattr(course, "term") or course.id in classes_done:
                print("Skipping course: " + str(course.id))
                continue

            course_view = getCourseView(course)

            dl_dir = os.path.join(DL_LOCATION, course_view.term, course_view.course_code, "Lectures/Kaltura")
            print(dl_dir)

            yt_dlp_options = {
                'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
                'outtmpl': '%(title)s [%(id)s].%(ext)s',
                'merge_output_format': 'mp4',
                'quiet': False,
                'postprocessors': [{
                    'key': 'FFmpegMetadata',
                    'add_metadata': True,
                }],
                'writesubtitles': True,
                'writeautomaticsub': True,
                'subtitlesformat': 'vtt',
                'embedsubs': True,
                'embedthumbnail': True,
                'paths': {
                    'home': dl_dir,
                    'temp': "",
                },
                # Archive lets yt-dlp skip videos downloaded in past runs.
                'download_archive': 'kaltura-ytdl-history.txt',
                'cookies': COOKIES_PATH,
            }

            kaltura_video_list = getVideoList(course.id)

            # For each video in the media gallery, download using yt-dlp
            for url in kaltura_video_list:
                with yt_dlp.YoutubeDL(yt_dlp_options) as ydl:
                    try:
                        ydl.download([url])
                    except yt_dlp.DownloadError:
                        # Probably a failed subtitle download; record and move
                        # on. (The with-block closes the file — the old manual
                        # f.close() inside it was redundant.)
                        with open("failed.txt", "a") as f:
                            f.write(dl_dir + ": " + url + "\n")

            # Persist progress after every course so a crash can resume.
            classes_done.append(course.id)
            savePickle(pickle_file, classes_done)
            downloaded_count += 1

    # BUG FIX: the old message printed len(courses_list), which is always 2
    # (the two enrollment-state queries), not the number of classes handled.
    print(f'Finished, {downloaded_count} classes downloaded')
    # The cache only exists if at least one course was processed; guard the
    # removal so an empty run doesn't crash with FileNotFoundError.
    if os.path.exists(pickle_file):
        os.remove(pickle_file)
|