Add scraper to download files from HTML autoindex (directory-listing) pages

This commit is contained in:
ReSummit
2025-03-14 14:55:37 -07:00
parent a03ce35490
commit 4614b32ab7

View File

@@ -0,0 +1,192 @@
# Program to archive any http file link directory
import os, sys, random
import requests, time, re
from bs4 import BeautifulSoup
import urllib.parse as urlparse
# Track progress of scrape
import pickle, bz2
import functools
import pathlib
import shutil
from tqdm.auto import tqdm
# Multi-thread downloader
import threading
from queue import Queue
"""
Global variable definition
"""
# Configuration vars
# Time to wait between **download** requests
waitTime = 4
# How many threads to open
agents = 1
# Name of file to store how far download has gotten
fileName = "cache.pickle"
# Website to scrape and regex to check if link is a directory or file
urlbase = "<WEBSITE TO SCRAPE>"
dirCmp = re.compile(r".*\/$")
fileCmp = re.compile(r".*\..*")
# Required scraper vars
header = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.107 Safari/537.36",
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}
# Locking queue
lock = threading.Lock()
# Signal to clean-up threads
closeThread = False
"""
Function definition
"""
# Saving and loading scraper status
def savePickle(fName, scrapeState):
pFile = open(fName, 'wb')
pickle.dump(scrapeState, pFile)
pFile.close()
while True:
try:
dirs = loadPickle(fileName)
break
except:
pass
def loadPickle(fName):
try:
pFile = open(fName, 'rb')
ret = pickle.load(pFile)
pFile.close()
return ret
except:
print("Error: scraper history failed to load")
return
def clearPickle(fileName):
os.remove(fileName)
def download(url, filename, params=None):
print(filename)
r = requests.get(url, stream=True, allow_redirects=True, headers=header, params=params)
if r.status_code != 200:
r.raise_for_status() # Will only raise for 4xx codes, so...
raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
file_size = int(r.headers.get('Content-Length', 0))
path = pathlib.Path(filename).expanduser().resolve()
path.parent.mkdir(parents=True, exist_ok=True)
desc = "(Unknown total file size)" if file_size == 0 else ""
r.raw.read = functools.partial(r.raw.read, decode_content=True) # Decompress if needed
with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
with path.open("wb") as f:
shutil.copyfileobj(r_raw, f)
return path
def threadDownloadLink(history, queue):
while not closeThread:
getTry = 0
cur = None
lock.acquire()
if not queue.empty():
cur = queue.get()
lock.release()
if cur is not None:
siteLocation = cur[0]
directory = cur[1]
filename = cur[2]
time.sleep(waitTime + random.randint(-2, 10))
download(siteLocation, directory + filename)
def dirRecurse(site, directory, depth, toProcess):
"""
Scan through contents in link and download to directory.
Two cases:
1. If the current link within `link` is a file, download it to `directory`
2. If the current link within `link` is a directory, create a new folder in `directory`
and call this function on the current link and directory `directory/link`.
Find files first, then recurse in directories like a tree.
"""
response = ses.get(url=site, headers=header)
parse = BeautifulSoup(response.text, features="lxml")
if not os.path.exists(directory):
os.makedirs(directory)
for link in parse.find_all('a', attrs={'href': fileCmp}):
siteLocation = urlparse.urljoin( site, link.get('href') )
endLoc = urlparse.unquote(siteLocation.rsplit('/', 1)[1])
if not os.path.exists(directory + endLoc):
#time.sleep(3)
if toProcess.full():
print("Queue size:", toProcess.qsize())
while toProcess.full():
pass
toProcess.put([siteLocation, directory, endLoc, str('-' * depth + ' ' + endLoc)])
for link in parse.find_all('a', attrs={'href': dirCmp}):
if (link.get_text() not in "Parent Directory"):
siteLocation = urlparse.urljoin( site, link.get('href') )
endDir = urlparse.unquote(siteLocation.rsplit('/', 2)[1])
# time.sleep(5 + random.randint(-2, 3))
print(str('-' * depth + ' ' + directory + endDir + '/' ))
dirRecurse(siteLocation, directory + endDir + '/', depth + 1, toProcess)
if __name__ == '__main__':
# Check if argument exists
try:
dir_in = sys.argv[1]
if os.path.exists(dir_in) and os.path.isdir(dir_in):
dir_in = os.path.abspath(dir_in) + '/'
else:
print("Invalid directory")
exit(1)
except IndexError:
dir_in = os.getcwd() + "/downloaded_files/"
ses = requests.Session()
# Check if scraper was interrupted and had a pickle history
cache = loadPickle(fileName)
if cache:
dirs = cache
else:
dirs = [[], []]
# Start threads for downloading
threads = list()
toProcess = Queue(maxsize=agents)
for index in range(agents):
x = threading.Thread(target=threadDownloadLink, args=(dirs, toProcess), daemon=True)
threads.append(x)
x.start()
# Base case: starting directory
try:
dirRecurse(urlbase, dir_in, 0, toProcess)
finally:
closeThread = True
for x in threads:
x.join()
print("Safely exited")