"""
A simple script to scrape the WHM
onion site using Selenium and the
Tor browser driver
"""
#!/bin/python3
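# Assumed environment (editor's note): the 'tbselenium' and 'selenium' Python
# packages and a local Tor Browser installation are required; install the
# packages with e.g. pip install tbselenium selenium.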
import sys
import os
import time
import datetime
import logging
import random
import urllib.request
from os.path import isfile
from tbselenium.tbdriver import TorBrowserDriver
from selenium.webdriver.common.by import By
# Configure logging
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(message)s", datefmt="%Y/%m/%d %I:%M:%S %p")
logging.info("Logging started.")
# Get user input
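# A small guard (editor's addition, not in the original): all four positional
# arguments are required, so fail early with a usage hint if any are missing.
if len(sys.argv) != 5:
    sys.exit("Usage: %s <url> <username> <password> <tor_browser_path>" % sys.argv[0])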
url = sys.argv[1]
username = sys.argv[2]
password = sys.argv[3]
tor_browser_path = sys.argv[4]
def scrape(url):
    # Run with the Tor browser
    with TorBrowserDriver(tor_browser_path) as driver:
        # Save the scraped page with a timestamp and a .html extension
        file_name = "onion_site_scraped-" + str(datetime.datetime.now()) + ".html"
        # Download the URL entered when starting the program
        f = open(file_name, "w")
        driver.get(url)
        # Tailored for White House Market: fill in the username field
        elem = driver.find_element(By.NAME, "username")
        elem.clear()
        elem.send_keys(username)
        # Wait a randomized sub-second interval to look less bot-like
        time.sleep((random.random() / 1.5) + (random.randint(0, 1) / 5))
        # Continue filling in the password
        elem = driver.find_element(By.NAME, "password")
        elem.clear()
        elem.send_keys(password)
        # Wait, then fill in the CAPTCHA manually
        time.sleep((random.random() / 5) + (random.randint(0, 1) / 5))
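        # Editor's addition: the sub-second pause above is too short for manual
        # CAPTCHA entry, so block here until the CAPTCHA has been solved.
        input("Solve the CAPTCHA in the browser window, then press Enter...")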
        elem.submit()
        # Wait
        time.sleep((random.random() / 1.5) + (random.randint(0, 1) / 5))
        # Print status
        print("Title: \n\n", driver.title, "\n\n", driver.current_url, "\n\nDone.\n")
        # Wait
        time.sleep(4)
        # Save the screenshot with a timestamp and a .png extension
        screenshot_name = "onion_site_screenshot-" + str(datetime.datetime.now()) + ".png"
        # Get screenshot
        driver.get_screenshot_as_file(screenshot_name)
        # Wait
        time.sleep(6)
        # Get all other links
        all_links = driver.find_elements(By.TAG_NAME, 'a')
        logging.debug("All links: %s", all_links)
        # Save images
        images = driver.find_elements(By.TAG_NAME, 'img')
        time.sleep(10)
        # Write image data to a local directory
        output_dir = "/tmp/" + url[7:] + str(datetime.datetime.now())
        os.makedirs(output_dir, 0o755)
        image_count = 0
        for i in images:
            logging.debug("Image: %s", i)
            # Note: urllib does not go through Tor, so this fails for .onion
            # hosts; see the fetch_via_tor() sketch below for an alternative.
            urllib.request.urlretrieve(i.get_attribute('src'), os.path.join(output_dir, str(image_count) + '.jpg'))
            image_count += 1
        # Write the page source HTML to the file opened above
        f.write(driver.page_source)
        f.close()
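# A minimal sketch (editor's addition, not in the original script) of fetching
# a URL through the Tor SOCKS proxy instead of plain urllib, which bypasses
# Tor entirely and cannot resolve .onion hosts. It assumes the Tor Browser's
# default SOCKS port 9150 and the 'requests' package installed with SOCKS
# support (pip install requests[socks]); the port and timeout are illustrative.
def fetch_via_tor(src, dest, socks_port=9150):
    import requests
    proxies = {"http": "socks5h://127.0.0.1:%d" % socks_port,
               "https": "socks5h://127.0.0.1:%d" % socks_port}
    resp = requests.get(src, proxies=proxies, timeout=60)
    resp.raise_for_status()
    with open(dest, "wb") as out:
        out.write(resp.content)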
# Check whether the argument is a file listing URLs (one per line) or a single URL
if isfile(url):
    with open(url) as url_file:
        for line in url_file:
            scrape(line.strip())
else:
    scrape(url)
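# Example invocation (all values illustrative):
#   python3 scrape.py http://example.onion alice s3cret ~/tor-browser_en-US
# The first argument may also point to a local file with one URL per line.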