""" A simple script to scrape an onion site using Selenium and the Tor browser driver GPG v3 """ #!/bin/python3 import sys import os import time import datetime import logging import random import random import urllib.request from os.path import isfile import tbselenium.common as cm from tbselenium.tbdriver import TorBrowserDriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC # Get user input, find domain TOR_BROWSER_PATH = str(sys.argv[2]) # T = os.path.isdir url = str(sys.argv[1]) domain = "" destination = str(sys.argv[3]) # Find domain (?) domain = url.replace("//", "", 10) domain = domain.replace(":", "", 10) # Configure logging logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s", filename=str("/tmp/"+ domain + "_site_scrape.log"), filemode="w") logging.warning('Logging started.') # File stream to write page source to file_name = str(url + "_"+ str(datetime.datetime.now()) + ".html") file_name = file_name.replace("https://", "") file_name = file_name.replace("http://", "") f = open(str("/tmp/" + file_name), 'w') robotstxt = "" def scrape(url): print("|--- Scraping takes place soon.\n|----- URL is: ", url) logging.info("Entered scraping function.") # Run with the Tor browser with TorBrowserDriver(TOR_BROWSER_PATH) as crawler: print("\n\nScraping takes place.") logging.info("Tor Browser Selenium started.") # Get Robots.txt #robotstxt_url = str(url + "/robots.txt") #crawler.get(robotstxt_url) #f_robot = open(str("/tmp/" + domain + "_robots.txt"), 'w') #f_robot.write(str(crawler.page_source)) #print("|---- Robots.txt URL: ", robotstxt_url) #f_robot.close() # Get main pages crawler.get(url) print("Title: \n\n", crawler.title, "\n\n", crawler.current_url, "\n\nDone.\n") f.write(str(crawler.page_source)) # Get all other links all_links = crawler.find_elements(By.TAG_NAME, 'a') link_list = [] logging.info("Retrieving URLs from main URL.\n\n") # Use a counter to increment taken file names file_name_extender = 0 for al in all_links: retrieved_urls = al.get_attribute("href") link_list.append(retrieved_urls) print("Links: ", retrieved_urls) logging.info(retrieved_urls) for link in link_list: if(link == None): pass elif(link == ''): pass else: output_file_name = crawler.title output_file_name = output_file_name.replace("/", "") output_file_name = output_file_name.replace(":", "") if len(output_file_name) > 100: output_file_name = output_file_name[:50] try: if output_file_name[0] == ".": logging.info(f'{output_file_name} removed .') output_file_name = output_file_name.replace(".", "", 1) # If file exits, rename it if(isfile(output_file_name)): output_file_name = str(output_file_name + file_name_extender) file_name_extender += 1 print("Output file name already exists: ", output_file_name, " Added ", file_name_extender) # Write source code to file output = open(str(destination + output_file_name + ".html"), 'w') print("|---- Identified link on page: ", link) except: print("|------ Output file name . and too short") # Check if external link (HTTP://) if(link.find("http")) == False: print("http ", link) logging.info("External link") logging.info(link) else: link = str(url + "/" + link) print("|------ Internal link. Rewriting link: ", link) try: #WebDriverWait(crawler, 3).until(EC.staleness_of((al))) os.chdir(destination) crawler.implicitly_wait(2) time.sleep(1.5) crawler.get(link) # Save output.write(str(crawler.page_source)) logging.info(link) logging.info("Successful download.") print("|------ Title: ", crawler.title, crawler.current_url, " downloaded") all_images = crawler.find_elements(By.TAG_NAME, 'img') image_counter = 0 os.chdir(destination) try: for image in all_images: urllib.request.urlretrieve(image.get_attribute('src'),str(destination + "/image_" + str(image_counter) + '.jpg')) image_counter += 1 print("Image saved: ", image) except Exception as _e: print("Image not saved. ", _e) except Exception as e: logging.error(e) print("|----- Error getting ", link, " current URL:" , crawler.current_url) print(e) print("|---- Done crawling.") # Save images #images = driver.find_elements_by_tag_name('img') f.close() print("|-- Source code written to file. Exiting.") #Close browser crawler.quit() if(isfile(url)): for line in url: scrape(line) else: scrape(url)