""" A simple script to scrape an onion site using Selenium and the Tor browser driver GPG v3 """ #!/bin/python3 import sys import os import time import datetime import logging import random import urllib.request from os.path import isfile from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC # Get user input url = str(sys.argv[1]) selenium_driver_path = str(sys.argv[2]) destination = str(sys.argv[3]) domain = url.replace("/", "") domain = domain.replace(":","") # Configure logging logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s", filename=str("/tmp/" + domain + "_site_scrape.log"), filemode="w") logging.info('Logging started.') # Output files file_name = str(url + "_"+ str(datetime.datetime.now()) + ".html") file_name = file_name.replace("http://", "") file_name = file_name.replace("https://", "") file_name = file_name.replace("/", "") f = open(str("/tmp/" + file_name), 'w') robotstxt = "" #class Scraper: # def __init__(self, url): # self.url = url # domain = str(url[7:]) # save_screenshot = yes|no|index_only # output_file = str("websites_dw/" + domain) def scrape(url): print("|- Scraping takes place soon.\n|-- URL is: ",url) logging.info("Entered scraping function.") # Run with the Firefox browser using Gecko print("|--- Clear web scraping takes place.") logging.info("Firefox Selenium started.") crawler = webdriver.Firefox() # Get robots.txt robotstxt_url = str(url + "/robots.txt") crawler.get(robotstxt_url) f_robot = open(str(destination + "robots.txt"), 'w') f_robot.write(str(crawler.page_source)) print("|--- Robots.txt URL: ", robotstxt_url) f_robot.close() # Get home page crawler.get(url) print("Title: \n\n", crawler.title, "\n\n", crawler.current_url, "\n\nDone.\n") # Write index-html to file f.write(str(crawler.page_source)) # Get all other links all_links = crawler.find_elements(By.TAG_NAME, 'a') link_list = [] logging.info("Retrieving URLs from main URL.") file_name_extender = 0 print("|---- Numer of links: ", len(all_links)) logging.info("Number of links:") logging.info(len(all_links)) # Get all links on homepage for al in all_links: retrieved_url = al.get_dom_attribute("href") link_list.append(retrieved_url) logging.info(str(retrieved_url)) print(link_list) for link in link_list: if(link == None): pass elif(link == ""): pass else: output_file_name = crawler.title output_file_name = output_file_name.replace("/", "") output_file_name = output_file_name.replace(":", "") # Check if file name is too long if len(output_file_name) > 100: output_file_name = output_file_name[:50] try: if output_file_name[0] == ".": print("|----- Link starts with . " , output_file_name) output_file_name = output_file_name.replace(".", "", 1) if(isfile(output_file_name)): output_file_name = str(output_file_name + file_name_extender) file_name_extender += 1 print("output file name occupied") output = open(str(destination + output_file_name + ".html"), 'w') print("|---- Identified link on page: ", link) except: print("|------ file name . too short") # Check if external link (HTTP://) if link.find("http") == False: print("http ", link) else: if(link[0] == "/"): link = (str(url + link)) else: link = str(url + link) print("|------ Internal link. Rewriting link to: ", link) try: #WebDriverWait(crawler, 3).until(EC.staleness_of((al))) crawler.implicitly_wait(2) time.sleep(1.5) crawler.get(link) logging.info("Successful download") logging.info(link) output.write(str(crawler.page_source)) print("|------- Title: ", crawler.title, crawler.current_url, " downloaded.") # Save images image_counter =0 all_images = crawler.find_elements(By.TAG_NAME, 'img') os.chdir(destination) try: for image in all_images: urllib.request.urlretrieve(image.get_attribute('src'), str(destination + "/image_" + str(image_counter) + '.jpg')) image_counter += 1 print("Image saved: ", image) except Exception as _e: print("Image not fetched:", _e) except Exception as e: logging.error(e) print("|----- Error getting ", link, " current URL:" , crawler.current_url) print(e) print("|--- Moving on to saving source.") # Save images #images = driver.find_elements_by_tag_name('img') # Write and close print("|-- Source code written to file.\n|- Exiting.") f.close() crawler.quit() if(isfile(url)): for line in url: scrape(line) else: scrape(url)