Add scraper to download files from HTML autoindex (directory-listing) pages

This commit is contained in:
ReSummit
2025-03-14 14:55:37 -07:00
parent a03ce35490
commit 4614b32ab7

View File

@@ -0,0 +1,192 @@
# Program to archive any http file link directory
import os, sys, random
import requests, time, re
from bs4 import BeautifulSoup
import urllib.parse as urlparse
# Track progress of scrape
import pickle, bz2
import functools
import pathlib
import shutil
from tqdm.auto import tqdm
# Multi-thread downloader
import threading
from queue import Queue
"""
Global variable definition
"""
# Configuration vars
# Time to wait between **download** requests
waitTime = 4
# How many threads to open
agents = 1
# Name of file to store how far download has gotten
fileName = "cache.pickle"
# Website to scrape and regex to check if link is a directory or file
urlbase = "<WEBSITE TO SCRAPE>"
dirCmp = re.compile(r".*\/$")
fileCmp = re.compile(r".*\..*")
# Required scraper vars
header = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.107 Safari/537.36",
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}
# Locking queue
lock = threading.Lock()
# Signal to clean-up threads
closeThread = False
"""
Function definition
"""
# Saving and loading scraper status
def savePickle(fName, scrapeState):
pFile = open(fName, 'wb')
pickle.dump(scrapeState, pFile)
pFile.close()
while True:
try:
dirs = loadPickle(fileName)
break
except:
pass
def loadPickle(fName):
try:
pFile = open(fName, 'rb')
ret = pickle.load(pFile)
pFile.close()
return ret
except:
print("Error: scraper history failed to load")
return
def clearPickle(fileName):
os.remove(fileName)
def download(url, filename, params=None):
print(filename)
r = requests.get(url, stream=True, allow_redirects=True, headers=header, params=params)
if r.status_code != 200:
r.raise_for_status() # Will only raise for 4xx codes, so...
raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
file_size = int(r.headers.get('Content-Length', 0))
path = pathlib.Path(filename).expanduser().resolve()
path.parent.mkdir(parents=True, exist_ok=True)
desc = "(Unknown total file size)" if file_size == 0 else ""
r.raw.read = functools.partial(r.raw.read, decode_content=True) # Decompress if needed
with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
with path.open("wb") as f:
shutil.copyfileobj(r_raw, f)
return path
def threadDownloadLink(history, queue):
while not closeThread:
getTry = 0
cur = None
lock.acquire()
if not queue.empty():
cur = queue.get()
lock.release()
if cur is not None:
siteLocation = cur[0]
directory = cur[1]
filename = cur[2]
time.sleep(waitTime + random.randint(-2, 10))
download(siteLocation, directory + filename)
def dirRecurse(site, directory, depth, toProcess):
"""
Scan through contents in link and download to directory.
Two cases:
1. If the current link within `link` is a file, download it to `directory`
2. If the current link within `link` is a directory, create a new folder in `directory`
and call this function on the current link and directory `directory/link`.
Find files first, then recurse in directories like a tree.
"""
response = ses.get(url=site, headers=header)
parse = BeautifulSoup(response.text, features="lxml")
if not os.path.exists(directory):
os.makedirs(directory)
for link in parse.find_all('a', attrs={'href': fileCmp}):
siteLocation = urlparse.urljoin( site, link.get('href') )
endLoc = urlparse.unquote(siteLocation.rsplit('/', 1)[1])
if not os.path.exists(directory + endLoc):
#time.sleep(3)
if toProcess.full():
print("Queue size:", toProcess.qsize())
while toProcess.full():
pass
toProcess.put([siteLocation, directory, endLoc, str('-' * depth + ' ' + endLoc)])
for link in parse.find_all('a', attrs={'href': dirCmp}):
if (link.get_text() not in "Parent Directory"):
siteLocation = urlparse.urljoin( site, link.get('href') )
endDir = urlparse.unquote(siteLocation.rsplit('/', 2)[1])
# time.sleep(5 + random.randint(-2, 3))
print(str('-' * depth + ' ' + directory + endDir + '/' ))
dirRecurse(siteLocation, directory + endDir + '/', depth + 1, toProcess)
if __name__ == '__main__':
# Check if argument exists
try:
dir_in = sys.argv[1]
if os.path.exists(dir_in) and os.path.isdir(dir_in):
dir_in = os.path.abspath(dir_in) + '/'
else:
print("Invalid directory")
exit(1)
except IndexError:
dir_in = os.getcwd() + "/downloaded_files/"
ses = requests.Session()
# Check if scraper was interrupted and had a pickle history
cache = loadPickle(fileName)
if cache:
dirs = cache
else:
dirs = [[], []]
# Start threads for downloading
threads = list()
toProcess = Queue(maxsize=agents)
for index in range(agents):
x = threading.Thread(target=threadDownloadLink, args=(dirs, toProcess), daemon=True)
threads.append(x)
x.start()
# Base case: starting directory
try:
dirRecurse(urlbase, dir_in, 0, toProcess)
finally:
closeThread = True
for x in threads:
x.join()
print("Safely exited")