"""
|
|
A simple script to scrape an
|
|
onion site using Selenium and the
|
|
Tor browser driver
|
|
|
|
GPG v3
|
|
"""
|
|
|
|
#!/bin/python3
|
|

import sys
import os
import time
import datetime
import logging
import urllib.parse
import urllib.request
from os.path import isfile

from tbselenium.tbdriver import TorBrowserDriver
from selenium.webdriver.common.by import By


# Read the command-line arguments: target URL, Tor Browser path, output directory
url = str(sys.argv[1])
TOR_BROWSER_PATH = str(sys.argv[2])
destination = str(sys.argv[3])
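
# Example invocation (script name, address, and paths are illustrative only):
#   python3 scrape.py http://example.onion /opt/tor-browser/ /tmp/output/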

# Extract the host part of the URL for log and file naming
domain = urllib.parse.urlparse(url).netloc
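# e.g. urlparse("http://example.onion/index.html").netloc -> "example.onion"
# (hypothetical address, for illustration only)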

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    filename="/tmp/" + domain + "_site_scrape.log",
    filemode="w",
)
logging.info("Logging started.")

# File to write the main page's source to
file_name = url + "_" + str(datetime.datetime.now()) + ".html"
file_name = file_name.replace("https://", "")
file_name = file_name.replace("http://", "")
f = open("/tmp/" + file_name, 'w')


def scrape(url):
    print("|--- Scraping takes place soon.\n|----- URL is: ", url)
    logging.info("Entered scraping function.")

    # Run with the Tor Browser
    with TorBrowserDriver(TOR_BROWSER_PATH) as crawler:
        print("\n\nScraping takes place.")
        logging.info("Tor Browser Selenium started.")
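
        # Note: the with-statement starts Tor Browser and quits it again on
        # exit; whether it attaches to an already running Tor or launches one
        # (e.g. via Stem) depends on the tbselenium configuration.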

        # Get robots.txt (currently disabled)
        #robotstxt_url = url + "/robots.txt"
        #crawler.get(robotstxt_url)
        #f_robot = open("/tmp/" + domain + "_robots.txt", 'w')
        #f_robot.write(str(crawler.page_source))
        #print("|---- Robots.txt URL: ", robotstxt_url)
        #f_robot.close()

        # Get the main page
        crawler.get(url)
        print("Title: \n\n", crawler.title, "\n\n", crawler.current_url, "\n\nDone.\n")
        f.write(str(crawler.page_source))

        # Collect every link (anchor element) on the page
        all_links = crawler.find_elements(By.TAG_NAME, 'a')
        link_list = []
        logging.info("Retrieving URLs from main URL.")

        # Counter appended to output file names that are already taken
        file_name_extender = 0

        for al in all_links:
            retrieved_url = al.get_attribute("href")
            link_list.append(retrieved_url)
            print("Links: ", retrieved_url)
            logging.info(retrieved_url)

        for link in link_list:
            # Skip anchors without a usable href
            if not link:
                continue

            # Derive an output file name from the page title
            output_file_name = crawler.title
            output_file_name = output_file_name.replace("/", "")
            output_file_name = output_file_name.replace(":", "")
            if len(output_file_name) > 100:
                output_file_name = output_file_name[:50]
            try:
                if output_file_name[0] == ".":
                    logging.info(f'{output_file_name} removed leading dot')
                    output_file_name = output_file_name.replace(".", "", 1)
                # If the file already exists, append the counter to the name
                if isfile(output_file_name):
                    output_file_name = output_file_name + str(file_name_extender)
                    file_name_extender += 1
                    print("Output file name already exists: ", output_file_name, " Added ", file_name_extender)

                # Open the output file for this page's source code
                output = open(destination + output_file_name + ".html", 'w')
                print("|---- Identified link on page: ", link)
            except IndexError:
                # An empty title leaves no usable file name; skip this link
                print("|------ Output file name empty; skipping link.")
                continue
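
            # Illustration (hypothetical title): a page titled
            # "Example: Hidden/Wiki" is saved as "Example HiddenWiki.html",
            # and a clashing name gets the counter appended, e.g.
            # "Example HiddenWiki0.html".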

            # Links starting with "http" are logged as external; anything else
            # is treated as internal and made absolute against the start URL
            if link.startswith("http"):
                print("http ", link)
                logging.info("External link")
                logging.info(link)
            else:
                link = url + "/" + link
                print("|------ Internal link. Rewriting link: ", link)
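
            # Illustration (hypothetical values): with url "http://example.onion",
            # a bare href "page2.html" would be rewritten to
            # "http://example.onion/page2.html"; absolute links are fetched as-is.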

            try:
                #WebDriverWait(crawler, 3).until(EC.staleness_of(al))
                os.chdir(destination)
                crawler.implicitly_wait(2)
                time.sleep(1.5)
                crawler.get(link)

                # Save the page source
                output.write(str(crawler.page_source))
                logging.info(link)
                logging.info("Successful download.")
                print("|------ Title: ", crawler.title, crawler.current_url, " downloaded")

                # Download every image on the page
                all_images = crawler.find_elements(By.TAG_NAME, 'img')
                image_counter = 0
                os.chdir(destination)
                try:
                    for image in all_images:
                        urllib.request.urlretrieve(
                            image.get_attribute('src'),
                            destination + "/image_" + str(image_counter) + ".jpg")
                        image_counter += 1
                        print("Image saved: ", image)
                except Exception as _e:
                    print("Image not saved. ", _e)
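
                # Note: urllib.request above connects directly, not through the
                # Tor Browser's SOCKS proxy, so image downloads bypass Tor
                # unless a proxy is configured for urllib as well.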
            except Exception as e:
                logging.error(e)
                print("|----- Error getting ", link, " current URL:", crawler.current_url)
                print(e)

            output.close()

        print("|---- Done crawling.")

    print("|-- Source code written to file. Exiting.")
    # Leaving the with-block above already quits the browser, so no explicit
    # crawler.quit() is needed here.


# If the first argument is an existing file, treat it as a list of URLs
# (one per line); otherwise treat it as a single URL.
if isfile(url):
    with open(url) as url_file:
        for line in url_file:
            line = line.strip()
            if line:
                scrape(line)
else:
    scrape(url)

f.close()
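
# Example URL list file (one URL per line; addresses are illustrative):
#   http://example1.onion
#   http://example2.onion/index.html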