#!/bin/python3
"""A simple script to scrape the WHM onion site using Selenium and the Tor browser driver."""
# --- Imports: stdlib first, then third-party (tbselenium / selenium) ---
import sys
import os
import time
import datetime
import logging
import random
import urllib.request
from os.path import isfile

import tbselenium.common as cm
from tbselenium.tbdriver import TorBrowserDriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configure root logging: DEBUG level, timestamped messages.
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(message)s", datefmt="%Y/%m/%d %I:%M:%S %p")
logging.warning('Logging started.')

# Validate and read the CLI arguments (sys.argv entries are already str,
# so no str() conversion is needed). Exit with a usage line instead of
# crashing with a bare IndexError when arguments are missing.
if len(sys.argv) < 5:
    sys.exit("Usage: scraper.py <url-or-url-list-file> <username> <password> <tor-browser-path>")

url = sys.argv[1]
username = sys.argv[2]
password = sys.argv[3]
tor_browser_path = sys.argv[4]
def scrape(url):
    """Log in to a White House Market onion page and scrape it.

    Drives a Tor Browser session: loads *url*, fills in the login form
    with the module-level ``username``/``password``, waits for the
    operator to solve the CAPTCHA manually, then saves a screenshot,
    the images found on the page, and the page source HTML.

    Args:
        url: The onion-site URL to scrape.

    Side effects:
        Writes a timestamped ``.html`` and ``.png`` file to the current
        directory and downloaded images under ``/tmp``.
    """
    # Run inside a managed Tor Browser session (closed automatically).
    with TorBrowserDriver(tor_browser_path) as driver:
        # Timestamped name for the scraped-HTML output file.
        file_name = "onion_site_scraped-" + str(datetime.datetime.now()) + ".html"

        # Load the URL entered when starting the program.
        driver.get(url)

        # Tailored for White House Market: fill in the username field.
        # (Selenium 4 removed find_element_by_name; use By.NAME instead.)
        elem = driver.find_element(By.NAME, "username")
        elem.clear()
        elem.send_keys(username)

        # Short randomized pause to look less bot-like.
        time.sleep((random.random() / 1.5) + (random.randint(0, 1) / 5))

        # Continue filling in the password.
        elem = driver.find_element(By.NAME, "password")
        elem.clear()
        elem.send_keys(password)

        # Wait so the operator can fill in the CAPTCHA manually, then submit.
        time.sleep((random.random() / 5) + (random.randint(0, 1) / 5))
        elem.submit()

        # Randomized pause after submitting.
        time.sleep((random.random() / 1.5) + (random.randint(0, 1) / 5))

        # Print status for the operator.
        print("Title: \n\n", driver.title, "\n\n", driver.current_url, "\n\nDone.\n")

        time.sleep(4)

        # Timestamped name for the screenshot, then capture it.
        screenshot_name = "onion_site_screenshot-" + str(datetime.datetime.now()) + ".png"
        driver.get_screenshot_as_file(screenshot_name)

        time.sleep(6)

        # Collect all links on the page. Lazy %-formatting: the original
        # passed the list as an unused argument and never rendered it.
        all_links = driver.find_elements(By.TAG_NAME, 'a')
        logging.debug("All links: %s", all_links)

        # Collect the page's images.
        images = driver.find_elements(By.TAG_NAME, 'img')
        time.sleep(10)

        # Per-run output directory under /tmp. url[7:] presumably strips a
        # "http://" scheme prefix — TODO confirm for other scheme lengths.
        output_dir = "/tmp/" + url[7:] + str(datetime.datetime.now())
        # Bug fix: the mode must be octal 0o755 (the original passed the
        # decimal literal 755, i.e. mode 0o1363).
        os.makedirs(output_dir, mode=0o755)

        # Download each image into the directory created above (the original
        # wrote "<dir>-N.jpg" siblings and never used the directory).
        for image_count, image in enumerate(images):
            logging.debug("Image: %s", image)
            # NOTE(review): urllib.request does NOT route through Tor — .onion
            # image URLs will fail and clearnet ones bypass the Tor circuit;
            # consider fetching via a SOCKS proxy on the Tor port instead.
            urllib.request.urlretrieve(image.get_attribute('src'),
                                       os.path.join(output_dir, str(image_count) + '.jpg'))

        # Write the page source HTML to the output file. Bug fix: the original
        # opened the file at the start but left the write commented out, so it
        # only ever produced an empty file.
        with open(file_name, "w") as f:
            f.write(driver.page_source)
# Dispatch: the first argument may be either a file listing one URL per
# line or a single URL. Bug fix: the original did `for line in url:`,
# which iterates the URL *string* character by character — it must open
# the file and iterate its lines instead.
if isfile(url):
    with open(url) as url_file:
        for line in url_file:
            line = line.strip()
            if line:  # skip blank lines
                scrape(line)
else:
    scrape(url)
|