"""
A simple script to scrape an
onion site using Selenium and the
Tor browser driver
GPG v3
"""
#!/bin/python3
import sys
import os
import time
import datetime
import logging
import random
import urllib.request
from os.path import isfile
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.service import Service
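# Minimal argument check (a sketch, not in the original script): scraping
# expects exactly three positional arguments and would otherwise crash
# with an IndexError below.
if len(sys.argv) != 4:
    sys.exit("Usage: site_scrape_cw.py <url> <selenium_driver_path> <destination>")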
# Get user input
url = str(sys.argv[1])
selenium_driver_path = str(sys.argv[2])
destination = str(sys.argv[3])
# Derive a filesystem-safe name for the log file from the URL
domain = url.replace("/", "").replace(":", "")
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s",
                    filename="/tmp/" + domain + "_site_scrape.log", filemode="w")
logging.info('Logging started.')
# Output file for the index page
file_name = url + "_" + str(datetime.datetime.now()) + ".html"
file_name = file_name.replace("http://", "").replace("https://", "").replace("/", "")
f = open("/tmp/" + file_name, 'w')
#class Scraper:
# def __init__(self, url):
# self.url = url
# domain = str(url[7:])
# save_screenshot = yes|no|index_only
# output_file = str("websites_dw/" + domain)
def scrape(url):
    print("|- Scraping takes place soon.\n|-- URL is: ", url)
    logging.info("Entered scraping function.")
    # Run with the Firefox browser using Gecko
    print("|--- Clear web scraping takes place.")
    logging.info("Firefox Selenium started.")
    # Use the geckodriver path passed on the command line (Selenium 4 Service API)
    crawler = webdriver.Firefox(service=Service(executable_path=selenium_driver_path))
    # Get robots.txt
    robotstxt_url = url + "/robots.txt"
    crawler.get(robotstxt_url)
    f_robot = open(destination + "robots.txt", 'w')
    f_robot.write(str(crawler.page_source))
    print("|--- Robots.txt URL: ", robotstxt_url)
    f_robot.close()
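    # Optional sketch (not in the original flow): parse the fetched robots.txt
    # with the stdlib parser and log whether the start URL may be crawled.
    # Purely informative; the script does not enforce robots.txt rules.
    import urllib.robotparser
    robot_parser = urllib.robotparser.RobotFileParser(robotstxt_url)
    try:
        robot_parser.read()
        logging.info("robots.txt allows start URL: %s", robot_parser.can_fetch("*", url))
    except Exception as robots_error:
        logging.warning("Could not parse robots.txt: %s", robots_error)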
    # Get home page
    crawler.get(url)
    print("Title: \n\n", crawler.title, "\n\n", crawler.current_url, "\n\nDone.\n")
    # Write index HTML to file
    f.write(str(crawler.page_source))
    # Get all other links
    all_links = crawler.find_elements(By.TAG_NAME, 'a')
    link_list = []
    logging.info("Retrieving URLs from main URL.")
    file_name_extender = 0
    print("|---- Number of links: ", len(all_links))
    logging.info("Number of links: %d", len(all_links))
    # Collect the href of every link on the home page
    for al in all_links:
        retrieved_url = al.get_dom_attribute("href")
        link_list.append(retrieved_url)
        logging.info(str(retrieved_url))
    print(link_list)
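    # Optional sketch (not in the original flow): deduplicate the collected
    # links while preserving order, so the same page is not fetched twice.
    link_list = list(dict.fromkeys(link_list))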
    for link in link_list:
        # Skip empty or missing hrefs
        if link is None or link == "":
            continue
        output_file_name = crawler.title.replace("/", "").replace(":", "")
        # Truncate over-long file names
        if len(output_file_name) > 100:
            output_file_name = output_file_name[:50]
        try:
            if output_file_name[0] == ".":
                print("|----- Link starts with . ", output_file_name)
                output_file_name = output_file_name.replace(".", "", 1)
            if isfile(output_file_name):
                # Append a counter to avoid overwriting an existing file
                output_file_name = output_file_name + str(file_name_extender)
                file_name_extender += 1
                print("output file name occupied")
            output = open(destination + output_file_name + ".html", 'w')
            print("|---- Identified link on page: ", link)
        except IndexError:
            print("|------ file name too short")
        # Check whether the link is external (absolute http/https URL)
        if link.startswith("http"):
            print("http ", link)
        else:
            # Relative link: rebuild an absolute URL against the start page
            if link[0] == "/":
                link = url + link
            else:
                link = url + "/" + link
            print("|------ Internal link. Rewriting link to: ", link)
        try:
            # WebDriverWait(crawler, 3).until(EC.staleness_of(al))
            crawler.implicitly_wait(2)
            time.sleep(1.5)
            crawler.get(link)
            logging.info("Successful download")
            logging.info(link)
            output.write(str(crawler.page_source))
            print("|------- Title: ", crawler.title, crawler.current_url, " downloaded.")
            # Save images found on the page
            image_counter = 0
            all_images = crawler.find_elements(By.TAG_NAME, 'img')
            os.chdir(destination)
            try:
                for image in all_images:
                    urllib.request.urlretrieve(image.get_attribute('src'),
                                               destination + "/image_" + str(image_counter) + ".jpg")
                    image_counter += 1
                    print("Image saved: ", image)
            except Exception as _e:
                print("Image not fetched:", _e)
        except Exception as e:
            logging.error(e)
            print("|----- Error getting ", link, " current URL:", crawler.current_url)
            print(e)
    print("|--- Moving on to saving source.")
    # Write and close
    print("|-- Source code written to file.\n|- Exiting.")
    f.close()
    crawler.quit()

# If the first argument is a file, treat it as a list of URLs, one per line
if isfile(url):
    with open(url) as url_file:
        for line in url_file:
            scrape(line.strip())
else:
    scrape(url)
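
# Example invocation (paths are illustrative; note the trailing slash on the
# destination, since the script concatenates it directly with file names):
#   python3 site_scrape_cw.py "http://example.com" /usr/local/bin/geckodriver /tmp/scrape_out/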