"""
A simple script to scrape an
onion site using Selenium and the
Tor browser driver
GPG v3
"""
#!/bin/python3
import sys
import os
import time
import datetime
import logging
import random
import urllib.request
from os.path import isfile
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.service import Service
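# Minimal argument check (a sketch, not in the original script): scraping
# expects exactly three positional arguments and would otherwise crash
# with an IndexError below.
if len(sys.argv) != 4:
    sys.exit("Usage: site_scrape_cw.py <url> <selenium_driver_path> <destination>")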
# Get user input
url = str(sys.argv[1])
selenium_driver_path = str(sys.argv[2])
destination = str(sys.argv[3])
# Derive a filesystem-safe name for the log file from the URL
domain = url.replace("/", "").replace(":", "")
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s",
                    filename="/tmp/" + domain + "_site_scrape.log", filemode="w")
logging.info('Logging started.')
# Output file for the index page
file_name = url + "_" + str(datetime.datetime.now()) + ".html"
file_name = file_name.replace("http://", "").replace("https://", "").replace("/", "")
f = open("/tmp/" + file_name, 'w')
#class Scraper:
# def __init__(self, url):
# self.url = url
# domain = str(url[7:])
# save_screenshot = yes|no|index_only
# output_file = str("websites_dw/" + domain)
def scrape(url):
    print("|- Scraping takes place soon.\n|-- URL is: ", url)
    logging.info("Entered scraping function.")
    # Run with the Firefox browser using Gecko
    print("|--- Clear web scraping takes place.")
    logging.info("Firefox Selenium started.")
    # Use the geckodriver path passed on the command line (Selenium 4 Service API)
    crawler = webdriver.Firefox(service=Service(executable_path=selenium_driver_path))
    # Get robots.txt
    robotstxt_url = url + "/robots.txt"
    crawler.get(robotstxt_url)
    f_robot = open(destination + "robots.txt", 'w')
    f_robot.write(str(crawler.page_source))
    print("|--- Robots.txt URL: ", robotstxt_url)
    f_robot.close()
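    # Optional sketch (not in the original flow): parse the fetched robots.txt
    # with the stdlib parser and log whether the start URL may be crawled.
    # Purely informative; the script does not enforce robots.txt rules.
    import urllib.robotparser
    robot_parser = urllib.robotparser.RobotFileParser(robotstxt_url)
    try:
        robot_parser.read()
        logging.info("robots.txt allows start URL: %s", robot_parser.can_fetch("*", url))
    except Exception as robots_error:
        logging.warning("Could not parse robots.txt: %s", robots_error)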
    # Get home page
    crawler.get(url)
    print("Title: \n\n", crawler.title, "\n\n", crawler.current_url, "\n\nDone.\n")
    # Write index HTML to file
    f.write(str(crawler.page_source))
    # Get all other links
    all_links = crawler.find_elements(By.TAG_NAME, 'a')
    link_list = []
    logging.info("Retrieving URLs from main URL.")
    file_name_extender = 0
    print("|---- Number of links: ", len(all_links))
    logging.info("Number of links: %d", len(all_links))
    # Collect the href of every link on the home page
    for al in all_links:
        retrieved_url = al.get_dom_attribute("href")
        link_list.append(retrieved_url)
        logging.info(str(retrieved_url))
    print(link_list)
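    # Optional sketch (not in the original flow): deduplicate the collected
    # links while preserving order, so the same page is not fetched twice.
    link_list = list(dict.fromkeys(link_list))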
    for link in link_list:
        # Skip empty or missing hrefs
        if link is None or link == "":
            continue
        output_file_name = crawler.title.replace("/", "").replace(":", "")
        # Truncate over-long file names
        if len(output_file_name) > 100:
            output_file_name = output_file_name[:50]
        try:
            if output_file_name[0] == ".":
                print("|----- Link starts with . ", output_file_name)
                output_file_name = output_file_name.replace(".", "", 1)
            if isfile(output_file_name):
                # Append a counter to avoid overwriting an existing file
                output_file_name = output_file_name + str(file_name_extender)
                file_name_extender += 1
                print("output file name occupied")
            output = open(destination + output_file_name + ".html", 'w')
            print("|---- Identified link on page: ", link)
        except IndexError:
            print("|------ file name too short")
        # Check whether the link is external (absolute http/https URL)
        if link.startswith("http"):
            print("http ", link)
        else:
            # Relative link: rebuild an absolute URL against the start page
            if link[0] == "/":
                link = url + link
            else:
                link = url + "/" + link
            print("|------ Internal link. Rewriting link to: ", link)
        try:
            # WebDriverWait(crawler, 3).until(EC.staleness_of(al))
            crawler.implicitly_wait(2)
            time.sleep(1.5)
            crawler.get(link)
            logging.info("Successful download")
            logging.info(link)
            output.write(str(crawler.page_source))
            print("|------- Title: ", crawler.title, crawler.current_url, " downloaded.")
            # Save images found on the page
            image_counter = 0
            all_images = crawler.find_elements(By.TAG_NAME, 'img')
            os.chdir(destination)
            try:
                for image in all_images:
                    urllib.request.urlretrieve(image.get_attribute('src'),
                                               destination + "/image_" + str(image_counter) + ".jpg")
                    image_counter += 1
                    print("Image saved: ", image)
            except Exception as _e:
                print("Image not fetched:", _e)
        except Exception as e:
            logging.error(e)
            print("|----- Error getting ", link, " current URL:", crawler.current_url)
            print(e)
    print("|--- Moving on to saving source.")
    # Write and close
    print("|-- Source code written to file.\n|- Exiting.")
    f.close()
    crawler.quit()

# If the first argument is a file, treat it as a list of URLs, one per line
if isfile(url):
    with open(url) as url_file:
        for line in url_file:
            scrape(line.strip())
else:
    scrape(url)
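
# Example invocation (paths are illustrative; note the trailing slash on the
# destination, since the script concatenates it directly with file names):
#   python3 site_scrape_cw.py "http://example.com" /usr/local/bin/geckodriver /tmp/scrape_out/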