"""
A simple script to scrape the WHM
onion site using Selenium and the
Tor browser driver
"""
#!/bin/python3
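# Assumed environment (editor's note): the 'tbselenium' and 'selenium' Python
# packages and a local Tor Browser installation are required; install the
# packages with e.g. pip install tbselenium selenium.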
import sys
import os
import time
import datetime
import logging
import random
import urllib.request
from os.path import isfile
from tbselenium.tbdriver import TorBrowserDriver
from selenium.webdriver.common.by import By
# Configure logging
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(message)s", datefmt="%Y/%m/%d %I:%M:%S %p")
logging.info("Logging started.")
# Get user input
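# A small guard (editor's addition, not in the original): all four positional
# arguments are required, so fail early with a usage hint if any are missing.
if len(sys.argv) != 5:
    sys.exit("Usage: %s <url> <username> <password> <tor_browser_path>" % sys.argv[0])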
url = sys.argv[1]
username = sys.argv[2]
password = sys.argv[3]
tor_browser_path = sys.argv[4]
def scrape(url):
    # Run with the Tor browser
    with TorBrowserDriver(tor_browser_path) as driver:
        # Save the scraped page with a timestamp and a .html extension
        file_name = "onion_site_scraped-" + str(datetime.datetime.now()) + ".html"
        # Download the URL entered when starting the program
        f = open(file_name, "w")
        driver.get(url)
        # Tailored for White House Market: fill in the username field
        elem = driver.find_element(By.NAME, "username")
        elem.clear()
        elem.send_keys(username)
        # Wait a randomized sub-second interval to look less bot-like
        time.sleep((random.random() / 1.5) + (random.randint(0, 1) / 5))
        # Continue filling in the password
        elem = driver.find_element(By.NAME, "password")
        elem.clear()
        elem.send_keys(password)
        # Wait, then fill in the CAPTCHA manually
        time.sleep((random.random() / 5) + (random.randint(0, 1) / 5))
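        # Editor's addition: the sub-second pause above is too short for manual
        # CAPTCHA entry, so block here until the CAPTCHA has been solved.
        input("Solve the CAPTCHA in the browser window, then press Enter...")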
        elem.submit()
        # Wait
        time.sleep((random.random() / 1.5) + (random.randint(0, 1) / 5))
        # Print status
        print("Title: \n\n", driver.title, "\n\n", driver.current_url, "\n\nDone.\n")
        # Wait
        time.sleep(4)
        # Save the screenshot with a timestamp and a .png extension
        screenshot_name = "onion_site_screenshot-" + str(datetime.datetime.now()) + ".png"
        # Get screenshot
        driver.get_screenshot_as_file(screenshot_name)
        # Wait
        time.sleep(6)
        # Get all other links
        all_links = driver.find_elements(By.TAG_NAME, 'a')
        logging.debug("All links: %s", all_links)
        # Save images
        images = driver.find_elements(By.TAG_NAME, 'img')
        time.sleep(10)
        # Write image data to a local directory
        output_dir = "/tmp/" + url[7:] + str(datetime.datetime.now())
        os.makedirs(output_dir, 0o755)
        image_count = 0
        for i in images:
            logging.debug("Image: %s", i)
            # Note: urllib does not go through Tor, so this fails for .onion
            # hosts; see the fetch_via_tor() sketch below for an alternative.
            urllib.request.urlretrieve(i.get_attribute('src'), os.path.join(output_dir, str(image_count) + '.jpg'))
            image_count += 1
        # Write the page source HTML to the file opened above
        f.write(driver.page_source)
        f.close()
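# A minimal sketch (editor's addition, not in the original script) of fetching
# a URL through the Tor SOCKS proxy instead of plain urllib, which bypasses
# Tor entirely and cannot resolve .onion hosts. It assumes the Tor Browser's
# default SOCKS port 9150 and the 'requests' package installed with SOCKS
# support (pip install requests[socks]); the port and timeout are illustrative.
def fetch_via_tor(src, dest, socks_port=9150):
    import requests
    proxies = {"http": "socks5h://127.0.0.1:%d" % socks_port,
               "https": "socks5h://127.0.0.1:%d" % socks_port}
    resp = requests.get(src, proxies=proxies, timeout=60)
    resp.raise_for_status()
    with open(dest, "wb") as out:
        out.write(resp.content)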
# Check whether the argument is a file listing URLs (one per line) or a single URL
if isfile(url):
    with open(url) as url_file:
        for line in url_file:
            scrape(line.strip())
else:
    scrape(url)
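# Example invocation (all values illustrative):
#   python3 scrape.py http://example.onion alice s3cret ~/tor-browser_en-US
# The first argument may also point to a local file with one URL per line.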