Add scraper for sites that serve files through HTML autoindex listings
This commit is contained in:
192
html_file_index_scrape/main.py
Normal file
192
html_file_index_scrape/main.py
Normal file
@@ -0,0 +1,192 @@
|
||||
# Program to archive any http file link directory
# (walks an HTML autoindex tree, queueing every file for download)

import os, sys, random
import requests, time, re
from bs4 import BeautifulSoup
import urllib.parse as urlparse

# Track progress of scrape
import pickle, bz2

import functools
import pathlib
import shutil
from tqdm.auto import tqdm

# Multi-thread downloader
import threading
from queue import Queue

"""
Global variable definition
"""

# Configuration vars

# Time to wait between **download** requests, in seconds
# (each worker adds random jitter of -2..+10s on top of this)
waitTime = 4

# How many downloader threads to open (also bounds the work queue size)
agents = 1

# Name of file to store how far download has gotten
fileName = "cache.pickle"

# Website to scrape and regex to check if link is a directory or file
urlbase = "<WEBSITE TO SCRAPE>"
dirCmp = re.compile(r".*\/$")    # hrefs ending in '/' are treated as directories
fileCmp = re.compile(r".*\..*")  # hrefs containing a '.' are treated as files

# Required scraper vars

# Browser-like request headers so autoindex servers don't reject the scraper
header = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.107 Safari/537.36",
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}

# Locking queue (guards the shared download queue across worker threads)
lock = threading.Lock()

# Signal to clean-up threads (set True in __main__ to stop workers)
closeThread = False
|
||||
|
||||
# Saving and loading scraper status

def savePickle(fName, scrapeState):
    """Serialize the scraper's progress state to *fName* with pickle.

    fName -- path of the cache file to write
    scrapeState -- any picklable object describing scrape progress
    """
    # Context manager guarantees the handle is closed even if
    # pickle.dump raises (the original leaked the handle on error).
    with open(fName, 'wb') as pFile:
        pickle.dump(scrapeState, pFile)
    # NOTE(review): the original followed the write with a
    # `while True: dirs = loadPickle(fileName)` retry loop whose result
    # was discarded — dead code that could spin forever — so it was removed.
||||
|
||||
def loadPickle(fName):
    """Load pickled scraper state from *fName*.

    Returns the unpickled object, or None (after printing an error)
    when the file is missing, truncated, or not a valid pickle.
    """
    try:
        # `with` closes the handle even if pickle.load raises;
        # the original leaked the handle on a failed load.
        with open(fName, 'rb') as pFile:
            return pickle.load(pFile)
    except (OSError, pickle.UnpicklingError, EOFError, AttributeError):
        # Narrowed from a bare `except:` so real bugs (KeyboardInterrupt,
        # SystemExit, typos) are no longer silently swallowed.
        print("Error: scraper history failed to load")
        return None
||||
|
||||
def clearPickle(fileName):
    """Delete the scrape-progress cache file at *fileName*.

    Raises FileNotFoundError if the file does not exist.
    """
    pathlib.Path(fileName).unlink()
||||
|
||||
def download(url, filename, params=None):
    """Stream *url* to *filename* with a tqdm progress bar.

    url -- direct link to the file to fetch
    filename -- destination path; parent directories are created as needed
    params -- optional query parameters forwarded to requests.get
    Returns the resolved pathlib.Path of the written file.
    Raises requests.HTTPError for 4xx/5xx responses, or RuntimeError for
    any other non-200 status.
    """
    print(filename)
    # `with` closes the response (and underlying socket) deterministically;
    # the original leaked the connection until garbage collection.
    with requests.get(url, stream=True, allow_redirects=True, headers=header, params=params) as r:
        if r.status_code != 200:
            r.raise_for_status()  # raises for 4xx and 5xx, so...
            raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
        file_size = int(r.headers.get('Content-Length', 0))

        path = pathlib.Path(filename).expanduser().resolve()
        path.parent.mkdir(parents=True, exist_ok=True)

        desc = "(Unknown total file size)" if file_size == 0 else ""
        r.raw.read = functools.partial(r.raw.read, decode_content=True)  # Decompress if needed
        with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
            with path.open("wb") as f:
                shutil.copyfileobj(r_raw, f)

    return path
||||
|
||||
def threadDownloadLink(history, queue):
    """Worker loop: pop [url, directory, filename, label] entries off
    *queue* and download each, until the global closeThread flag is set.

    history -- shared scrape-state object (currently unused by the worker)
    queue -- queue.Queue of pending downloads fed by dirRecurse
    """
    while not closeThread:
        cur = None
        # Hold the lock only for the empty-check + get pair.
        with lock:
            if not queue.empty():
                cur = queue.get()

        if cur is None:
            # Original spun at 100% CPU when idle; sleep briefly instead.
            time.sleep(0.1)
            continue

        siteLocation = cur[0]
        directory = cur[1]
        filename = cur[2]

        # Polite delay with jitter between download requests.
        time.sleep(waitTime + random.randint(-2, 10))
        try:
            download(siteLocation, directory + filename)
        except Exception as e:
            # Original let any download error silently kill the worker
            # thread; report it and keep serving the queue instead.
            print(f"Download failed for {siteLocation}: {e}")
||||
|
||||
def dirRecurse(site, directory, depth, toProcess):
    """
    Scan through contents in link and download to directory.
    Two cases:
    1. If the current link within `link` is a file, queue it for download
       into `directory`
    2. If the current link within `link` is a directory, create a new folder
       in `directory` and recurse into `directory/link`.
    Find files first, then recurse in directories like a tree.

    site -- URL of the autoindex page to scan
    directory -- local directory ('/'-terminated) mirroring this page
    depth -- recursion depth, used only for indented progress output
    toProcess -- bounded queue consumed by the downloader threads
    """
    response = ses.get(url=site, headers=header)
    parse = BeautifulSoup(response.text, features="lxml")

    # exist_ok avoids the check-then-create race of the original.
    os.makedirs(directory, exist_ok=True)

    # Case 1: hrefs matching fileCmp are files — queue any not yet on disk.
    for link in parse.find_all('a', attrs={'href': fileCmp}):
        siteLocation = urlparse.urljoin(site, link.get('href'))
        endLoc = urlparse.unquote(siteLocation.rsplit('/', 1)[1])
        if not os.path.exists(directory + endLoc):
            if toProcess.full():
                print("Queue size:", toProcess.qsize())
            # Queue.put blocks until a slot frees up, so the original
            # `while toProcess.full(): pass` busy-wait was redundant (and
            # burned CPU); a plain blocking put does the same job.
            toProcess.put([siteLocation, directory, endLoc, str('-' * depth + ' ' + endLoc)])

    # Case 2: hrefs matching dirCmp are subdirectories — recurse into each.
    for link in parse.find_all('a', attrs={'href': dirCmp}):
        # Original used `not in "Parent Directory"`, a substring test that
        # wrongly skipped any directory whose name is a substring of it
        # (e.g. "Parent"); compare for equality instead.
        if link.get_text().strip() != "Parent Directory":
            siteLocation = urlparse.urljoin(site, link.get('href'))
            endDir = urlparse.unquote(siteLocation.rsplit('/', 2)[1])

            print(str('-' * depth + ' ' + directory + endDir + '/'))
            dirRecurse(siteLocation, directory + endDir + '/', depth + 1, toProcess)
|
||||
if __name__ == '__main__':
    # Resolve target directory: argv[1] if given and valid,
    # else ./downloaded_files/ under the current working directory.
    try:
        dir_in = sys.argv[1]
        if os.path.exists(dir_in) and os.path.isdir(dir_in):
            dir_in = os.path.abspath(dir_in) + '/'
        else:
            print("Invalid directory")
            # sys.exit, not the builtin `exit` (a site/REPL helper that
            # may be absent when run with -S or frozen).
            sys.exit(1)
    except IndexError:
        dir_in = os.getcwd() + "/downloaded_files/"

    ses = requests.Session()

    # Check if scraper was interrupted and had a pickle history
    cache = loadPickle(fileName)
    if cache:
        dirs = cache
    else:
        dirs = [[], []]

    # Start threads for downloading
    threads = list()
    toProcess = Queue(maxsize=agents)
    for index in range(agents):
        x = threading.Thread(target=threadDownloadLink, args=(dirs, toProcess), daemon=True)
        threads.append(x)
        x.start()

    # Base case: starting directory
    try:
        dirRecurse(urlbase, dir_in, 0, toProcess)
        # Let workers drain the queue before signaling shutdown; the
        # original set closeThread immediately, dropping queued downloads.
        while not toProcess.empty():
            time.sleep(1)
    finally:
        closeThread = True
        for x in threads:
            x.join()
        print("Safely exited")
||||
Reference in New Issue
Block a user