DIDECT2S/D3-Collector/site_scrape_dw.py

"""
A simple script to scrape an
onion site using Selenium and the
Tor browser driver
GPG v3
"""
#!/bin/python3
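# Usage sketch (an assumption inferred from the sys.argv reads below; the exact
# invocation is not documented in the script itself):
#   python3 site_scrape_dw.py <url-or-file-of-urls> <tor-browser-dir> <destination-dir>
# Hypothetical example values:
#   python3 site_scrape_dw.py http://someexamplesite.onion /opt/tor-browser_en-US /tmp/scraped/
# Note: the destination is concatenated directly with file names, so it should end with "/".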
import sys
import os
import time
import datetime
import logging
import random
import urllib.request
from os.path import isfile
import tbselenium.common as cm
from tbselenium.tbdriver import TorBrowserDriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
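# Runtime assumptions (not stated in the script): the tbselenium package and a
# local Tor Browser installation are required, and Selenium's geckodriver is
# expected to be available for tbselenium to drive the browser.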
# Command-line arguments: URL (or file of URLs), Tor Browser directory, output directory
url = str(sys.argv[1])
TOR_BROWSER_PATH = str(sys.argv[2])
destination = str(sys.argv[3])
# Build a filesystem-safe name from the URL for the log file name
domain = url.replace("//", "").replace(":", "")
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s", filename=str("/tmp/"+ domain + "_site_scrape.log"), filemode="w")
logging.info('Logging started.')
# File stream to write the main page source to
file_name = str(url + "_" + str(datetime.datetime.now()) + ".html")
file_name = file_name.replace("https://", "")
file_name = file_name.replace("http://", "")
f = open(str("/tmp/" + file_name), 'w')
def scrape(url):
    print("|--- Scraping takes place soon.\n|----- URL is: ", url)
    logging.info("Entered scraping function.")
    # Run with the Tor Browser
    with TorBrowserDriver(TOR_BROWSER_PATH) as crawler:
        print("\n\nScraping takes place.")
        logging.info("Tor Browser Selenium started.")
        # Get robots.txt
        #robotstxt_url = str(url + "/robots.txt")
        #crawler.get(robotstxt_url)
        #f_robot = open(str("/tmp/" + domain + "_robots.txt"), 'w')
        #f_robot.write(str(crawler.page_source))
        #print("|---- Robots.txt URL: ", robotstxt_url)
        #f_robot.close()
        # Get the main page and save its source
        crawler.get(url)
        print("Title: \n\n", crawler.title, "\n\n", crawler.current_url, "\n\nDone.\n")
        f.write(str(crawler.page_source))
        # Collect all links on the main page
        all_links = crawler.find_elements(By.TAG_NAME, 'a')
        link_list = []
        logging.info("Retrieving URLs from main URL.\n\n")
        # Counter used to rename output files whose names are already taken
        file_name_extender = 0
        for al in all_links:
            retrieved_urls = al.get_attribute("href")
            link_list.append(retrieved_urls)
            print("Links: ", retrieved_urls)
            logging.info(retrieved_urls)
        # Visit every collected link and save its page source
        for link in link_list:
            if link is None or link == '':
                continue
            # Build an output file name from the current page title
            output_file_name = crawler.title
            output_file_name = output_file_name.replace("/", "")
            output_file_name = output_file_name.replace(":", "")
            if len(output_file_name) > 100:
                output_file_name = output_file_name[:50]
            try:
                if output_file_name[0] == ".":
                    logging.info(f'{output_file_name} removed .')
                    output_file_name = output_file_name.replace(".", "", 1)
                # If the file already exists, rename it with the counter
                if isfile(str(destination + output_file_name + ".html")):
                    output_file_name = str(output_file_name + str(file_name_extender))
                    file_name_extender += 1
                    print("Output file name already exists: ", output_file_name, " Added ", file_name_extender)
                # Open the file the page source will be written to
                output = open(str(destination + output_file_name + ".html"), 'w')
                print("|---- Identified link on page: ", link)
            except Exception:
                print("|------ Output file name empty or too short")
                continue
            # Check whether the link is absolute (starts with http) or relative
            if link.startswith("http"):
                print("http ", link)
                logging.info("External link")
                logging.info(link)
            else:
                link = str(url + "/" + link)
                print("|------ Internal link. Rewriting link: ", link)
            try:
                #WebDriverWait(crawler, 3).until(EC.staleness_of(al))
                os.chdir(destination)
                crawler.implicitly_wait(2)
                time.sleep(1.5)
                crawler.get(link)
                # Save the page source
                output.write(str(crawler.page_source))
                logging.info(link)
                logging.info("Successful download.")
                print("|------ Title: ", crawler.title, crawler.current_url, " downloaded")
                # Save the images on the page; note that urlretrieve fetches
                # directly, not through the Tor proxy
                all_images = crawler.find_elements(By.TAG_NAME, 'img')
                image_counter = 0
                os.chdir(destination)
                try:
                    for image in all_images:
                        # Prefix with the page's output file name so images from
                        # different pages do not overwrite each other
                        urllib.request.urlretrieve(image.get_attribute('src'), str(destination + "/" + output_file_name + "_image_" + str(image_counter) + '.jpg'))
                        image_counter += 1
                        print("Image saved: ", image)
                except Exception as _e:
                    print("Image not saved. ", _e)
            except Exception as e:
                logging.error(e)
                print("|----- Error getting ", link, " current URL:", crawler.current_url)
                print(e)
            finally:
                output.close()
        print("|---- Done crawling.")
        # The with-block closes the Tor Browser automatically on exit

# If the first argument is a file, scrape every URL listed in it (one per line);
# otherwise treat it as a single URL
if isfile(url):
    with open(url) as url_file:
        for line in url_file:
            line = line.strip()
            if line:
                scrape(line)
else:
    scrape(url)
f.close()
print("|-- Source code written to file. Exiting.")
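# A URL-list file for the isfile() branch above is assumed to contain one onion
# URL per line, e.g. (hypothetical contents of a file such as urls.txt):
#   http://someexamplesiteone.onion
#   http://someexamplesitetwo.onion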