DIDECT2S/D3-Collector/site_scrape_dw.py

"""
A simple script to scrape an
onion site using Selenium and the
Tor browser driver
GPG v3
"""
#!/bin/python3
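# Usage sketch (an assumption inferred from the sys.argv reads below; the exact
# invocation is not documented in the script itself):
#   python3 site_scrape_dw.py <url-or-file-of-urls> <tor-browser-dir> <destination-dir>
# Hypothetical example values:
#   python3 site_scrape_dw.py http://someexamplesite.onion /opt/tor-browser_en-US /tmp/scraped/
# Note: the destination is concatenated directly with file names, so it should end with "/".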
import sys
import os
import time
import datetime
import logging
import random
import urllib.request
from os.path import isfile
import tbselenium.common as cm
from tbselenium.tbdriver import TorBrowserDriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
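# Runtime assumptions (not stated in the script): the tbselenium package and a
# local Tor Browser installation are required, and Selenium's geckodriver is
# expected to be available for tbselenium to drive the browser.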
# Command-line arguments: URL (or file of URLs), Tor Browser directory, output directory
url = str(sys.argv[1])
TOR_BROWSER_PATH = str(sys.argv[2])
destination = str(sys.argv[3])
# Build a filesystem-safe name from the URL for the log file name
domain = url.replace("//", "").replace(":", "")
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s", filename=str("/tmp/"+ domain + "_site_scrape.log"), filemode="w")
logging.info('Logging started.')
# File stream to write the main page source to
file_name = str(url + "_" + str(datetime.datetime.now()) + ".html")
file_name = file_name.replace("https://", "")
file_name = file_name.replace("http://", "")
f = open(str("/tmp/" + file_name), 'w')
def scrape(url):
    print("|--- Scraping takes place soon.\n|----- URL is: ", url)
    logging.info("Entered scraping function.")
    # Run with the Tor Browser
    with TorBrowserDriver(TOR_BROWSER_PATH) as crawler:
        print("\n\nScraping takes place.")
        logging.info("Tor Browser Selenium started.")
        # Get robots.txt
        #robotstxt_url = str(url + "/robots.txt")
        #crawler.get(robotstxt_url)
        #f_robot = open(str("/tmp/" + domain + "_robots.txt"), 'w')
        #f_robot.write(str(crawler.page_source))
        #print("|---- Robots.txt URL: ", robotstxt_url)
        #f_robot.close()
        # Get the main page and save its source
        crawler.get(url)
        print("Title: \n\n", crawler.title, "\n\n", crawler.current_url, "\n\nDone.\n")
        f.write(str(crawler.page_source))
        # Collect all links on the main page
        all_links = crawler.find_elements(By.TAG_NAME, 'a')
        link_list = []
        logging.info("Retrieving URLs from main URL.\n\n")
        # Counter used to rename output files whose names are already taken
        file_name_extender = 0
        for al in all_links:
            retrieved_urls = al.get_attribute("href")
            link_list.append(retrieved_urls)
            print("Links: ", retrieved_urls)
            logging.info(retrieved_urls)
        # Visit every collected link and save its page source
        for link in link_list:
            if link is None or link == '':
                continue
            # Build an output file name from the current page title
            output_file_name = crawler.title
            output_file_name = output_file_name.replace("/", "")
            output_file_name = output_file_name.replace(":", "")
            if len(output_file_name) > 100:
                output_file_name = output_file_name[:50]
            try:
                if output_file_name[0] == ".":
                    logging.info(f'{output_file_name} removed .')
                    output_file_name = output_file_name.replace(".", "", 1)
                # If the file already exists, rename it with the counter
                if isfile(str(destination + output_file_name + ".html")):
                    output_file_name = str(output_file_name + str(file_name_extender))
                    file_name_extender += 1
                    print("Output file name already exists: ", output_file_name, " Added ", file_name_extender)
                # Open the file the page source will be written to
                output = open(str(destination + output_file_name + ".html"), 'w')
                print("|---- Identified link on page: ", link)
            except Exception:
                print("|------ Output file name empty or too short")
                continue
            # Check whether the link is absolute (starts with http) or relative
            if link.startswith("http"):
                print("http ", link)
                logging.info("External link")
                logging.info(link)
            else:
                link = str(url + "/" + link)
                print("|------ Internal link. Rewriting link: ", link)
            try:
                #WebDriverWait(crawler, 3).until(EC.staleness_of(al))
                os.chdir(destination)
                crawler.implicitly_wait(2)
                time.sleep(1.5)
                crawler.get(link)
                # Save the page source
                output.write(str(crawler.page_source))
                logging.info(link)
                logging.info("Successful download.")
                print("|------ Title: ", crawler.title, crawler.current_url, " downloaded")
                # Save the images on the page; note that urlretrieve fetches
                # directly, not through the Tor proxy
                all_images = crawler.find_elements(By.TAG_NAME, 'img')
                image_counter = 0
                os.chdir(destination)
                try:
                    for image in all_images:
                        # Prefix with the page's output file name so images from
                        # different pages do not overwrite each other
                        urllib.request.urlretrieve(image.get_attribute('src'), str(destination + "/" + output_file_name + "_image_" + str(image_counter) + '.jpg'))
                        image_counter += 1
                        print("Image saved: ", image)
                except Exception as _e:
                    print("Image not saved. ", _e)
            except Exception as e:
                logging.error(e)
                print("|----- Error getting ", link, " current URL:", crawler.current_url)
                print(e)
            finally:
                output.close()
        print("|---- Done crawling.")
        # The with-block closes the Tor Browser automatically on exit

# If the first argument is a file, scrape every URL listed in it (one per line);
# otherwise treat it as a single URL
if isfile(url):
    with open(url) as url_file:
        for line in url_file:
            line = line.strip()
            if line:
                scrape(line)
else:
    scrape(url)
f.close()
print("|-- Source code written to file. Exiting.")
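# A URL-list file for the isfile() branch above is assumed to contain one onion
# URL per line, e.g. (hypothetical contents of a file such as urls.txt):
#   http://someexamplesiteone.onion
#   http://someexamplesitetwo.onion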