"""
|
|
A simple script to scrape an
|
|
onion site using Selenium and the
|
|
Tor browser driver
|
|
|
|
GPG v3
|
|
"""
|
|
|
|
#!/bin/python3
|
|

import sys
import os
import time
import datetime
import logging
import random
import urllib.request
from os.path import isfile
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Get user input
url = str(sys.argv[1])
selenium_driver_path = str(sys.argv[2])  # note: not referenced anywhere below
destination = str(sys.argv[3])  # used as a plain string prefix, so it should end with "/"
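
# Example invocation (a sketch; the script name and paths below are
# illustrative, not taken from the repository):
#
#   python3 onion_scrape.py http://exampleaddress.onion /usr/local/bin/geckodriver /tmp/scraped/
#
# argv[1] may also be a path to a local file holding one URL per line
# (see the dispatch at the bottom of the script).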

# Derive a file-system-safe name from the URL for the log-file name
domain = url.replace("/", "")
domain = domain.replace(":", "")

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(message)s",
                    filename=str("/tmp/" + domain + "_site_scrape.log"),
                    filemode="w")
logging.info('Logging started.')

# Output files
file_name = str(url + "_" + str(datetime.datetime.now()) + ".html")
file_name = file_name.replace("http://", "")
file_name = file_name.replace("https://", "")
file_name = file_name.replace("/", "")
f = open(str("/tmp/" + file_name), 'w')
robotstxt = ""

#class Scraper:
#    def __init__(self, url):
#        self.url = url
#        domain = str(url[7:])
#        save_screenshot = yes|no|index_only
#        output_file = str("websites_dw/" + domain)

def scrape(url):
    print("|- Scraping takes place soon.\n|-- URL is: ", url)
    logging.info("Entered scraping function.")

    # Run with the Firefox browser using Gecko
    print("|--- Clear web scraping takes place.")
    logging.info("Firefox Selenium started.")
    crawler = webdriver.Firefox()
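
    # A minimal sketch of routing the session through Tor (an assumption, not
    # part of the original script, which starts a plain Firefox session; it
    # presumes a local Tor SOCKS proxy on 127.0.0.1:9050):
    #
    #   options = webdriver.FirefoxOptions()
    #   options.set_preference("network.proxy.type", 1)
    #   options.set_preference("network.proxy.socks", "127.0.0.1")
    #   options.set_preference("network.proxy.socks_port", 9050)
    #   options.set_preference("network.proxy.socks_remote_dns", True)
    #   crawler = webdriver.Firefox(options=options)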

    # Get robots.txt
    robotstxt_url = str(url + "/robots.txt")
    crawler.get(robotstxt_url)
    f_robot = open(str(destination + "robots.txt"), 'w')
    f_robot.write(str(crawler.page_source))
    print("|--- Robots.txt URL: ", robotstxt_url)
    f_robot.close()

    # Get home page
    crawler.get(url)
    print("Title: \n\n", crawler.title, "\n\n", crawler.current_url, "\n\nDone.\n")

    # Write index-html to file
    f.write(str(crawler.page_source))

    # Get all other links
    all_links = crawler.find_elements(By.TAG_NAME, 'a')
    link_list = []
    logging.info("Retrieving URLs from main URL.")

    file_name_extender = 0
    print("|---- Number of links: ", len(all_links))
    logging.info("Number of links:")
    logging.info(len(all_links))

    # Get all links on homepage
    for al in all_links:
        retrieved_url = al.get_dom_attribute("href")
        link_list.append(retrieved_url)
        logging.info(str(retrieved_url))

    print(link_list)

    for link in link_list:
        if link is None:
            pass
        elif link == "":
            pass
        else:
            # Note: crawler.title still refers to the previously loaded page
            # at this point; it is only used to build an output file name.
            output_file_name = crawler.title
            output_file_name = output_file_name.replace("/", "")
            output_file_name = output_file_name.replace(":", "")

            # Check if file name is too long
            if len(output_file_name) > 100:
                output_file_name = output_file_name[:50]
            try:
                if output_file_name[0] == ".":
                    print("|----- Link starts with . ", output_file_name)
                    output_file_name = output_file_name.replace(".", "", 1)

                if isfile(destination + output_file_name + ".html"):
                    output_file_name = output_file_name + str(file_name_extender)
                    file_name_extender += 1
                    print("output file name occupied")

                output = open(str(destination + output_file_name + ".html"), 'w')
                print("|---- Identified link on page: ", link)
            except Exception:
                print("|------ file name . too short")

            # Check if external link (starts with http://)
            if link.startswith("http"):
                print("http ", link)
            else:
                # Relative link: rebuild an absolute URL from the base URL
                if link[0] == "/":
                    link = str(url + link)
                else:
                    link = str(url + "/" + link)
                print("|------ Internal link. Rewriting link to: ", link)

            try:
                #WebDriverWait(crawler, 3).until(EC.staleness_of((al)))
                crawler.implicitly_wait(2)
                time.sleep(1.5)
                crawler.get(link)
                logging.info("Successful download")
                logging.info(link)
                output.write(str(crawler.page_source))
                print("|------- Title: ", crawler.title, crawler.current_url, " downloaded.")

                # Save images
                image_counter = 0
                all_images = crawler.find_elements(By.TAG_NAME, 'img')
                os.chdir(destination)
                try:
                    for image in all_images:
                        # urllib opens its own direct connection here, outside
                        # the Selenium browser session
                        urllib.request.urlretrieve(image.get_attribute('src'),
                                                   str(destination + "/image_" + str(image_counter) + '.jpg'))
                        image_counter += 1
                        print("Image saved: ", image)

                except Exception as _e:
                    print("Image not fetched:", _e)
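
                # A possible alternative for the image download above (an
                # assumption, not part of the original script): with the
                # "requests" package and its SOCKS support (PySocks) installed,
                # the urlretrieve() call in the loop could be routed through
                # the same local Tor proxy, e.g.:
                #
                #   import requests
                #   proxies = {"http": "socks5h://127.0.0.1:9050",
                #              "https": "socks5h://127.0.0.1:9050"}
                #   response = requests.get(image.get_attribute("src"),
                #                           proxies=proxies, timeout=30)
                #   with open("image_%d.jpg" % image_counter, "wb") as img_file:
                #       img_file.write(response.content)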

            except Exception as e:
                logging.error(e)
                print("|----- Error getting ", link, " current URL:", crawler.current_url)
                print(e)

    print("|--- Moving on to saving source.")

    # Save images
    #images = driver.find_elements_by_tag_name('img')

    # Write and close
    print("|-- Source code written to file.\n|- Exiting.")
    f.close()
    crawler.quit()

# If the first argument is a path to an existing file, treat it as a list of
# URLs (one per line); otherwise scrape the single URL from the command line.
if isfile(url):
    with open(url) as url_file:
        for line in url_file:
            scrape(line.strip())
else:
    scrape(url)