#!/usr/bin/env python3

import sys
import os
import time
import datetime
import random
import hashlib
import json
import urllib.request
from os.path import isfile

import tbselenium.common as cm
from tbselenium.tbdriver import TorBrowserDriver
from tbselenium.utils import launch_tbb_tor_with_stem
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Get user input
url = str(sys.argv[1])
username = str(sys.argv[2])
password = str(sys.argv[3])
# tbd = str(sys.argv[4])

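# Hedged usage sketch; the script's filename here is an assumption:
#   python3 scrape_onion.py http://<market>.onion <username> <password>
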
def scrape(url):
    # Run with the Tor Browser
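    # Hedged sketch, not part of the original flow: TorBrowserDriver expects
    # a Tor process it can talk to. If system Tor is not already running, the
    # stem helper imported above can launch the bundle's own Tor first:
    #   tor_process = launch_tbb_tor_with_stem(tbb_path="/home/amoros/downloads/tor-browser_en-US/")
    #   ... TorBrowserDriver(tbb_path, tor_cfg=cm.USE_STEM) ...
    #   tor_process.kill()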
    with TorBrowserDriver("/home/amoros/downloads/tor-browser_en-US/") as driver:
        # Name the scraped file with a timestamp and a .html ending
        file_name = "onion_site_scraped-" + str(datetime.datetime.now()) + ".html"

        # Download the URL entered when starting the program
        driver.get(url)

        # Tailored for White House Market: fill in the username field
        elem = driver.find_element(By.NAME, "username")
        elem.clear()
        elem.send_keys(username)

        # Wait a randomized moment between fields
        time.sleep((random.random() / 1.5) + (random.randint(0, 1) / 5))

        # Continue filling in the password
        elem = driver.find_element(By.NAME, "password")
        elem.clear()
        elem.send_keys(password)

        # Pause briefly, fill in the CAPTCHA manually, then submit the form
        time.sleep((random.random() / 5) + (random.randint(0, 1) / 5))
        elem.submit()

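        # Hedged sketch using the imported-but-unused wait helpers: block
        # until the post-login page shows some known element. The element
        # name "logout" is an assumption for illustration:
        #   WebDriverWait(driver, 60).until(
        #       EC.presence_of_element_located((By.NAME, "logout")))
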
        # Wait
        time.sleep((random.random() / 1.5) + (random.randint(0, 1) / 5))

        # Print status
        print("Title: \n\n", driver.title, "\n\n", driver.current_url, "\n\nDone.\n")

        # Wait
        time.sleep(4)

        # Name the screenshot file with a timestamp and a .png ending
        screenshot_name = "onion_site_screenshot-" + str(datetime.datetime.now()) + ".png"

        # Get screenshot
        driver.get_screenshot_as_file(screenshot_name)

        # Wait
        time.sleep(6)

        # Get all other links and print their targets rather than the raw
        # WebElement objects
        all_links = driver.find_elements(By.TAG_NAME, 'a')
        print("All links: ", [a.get_attribute('href') for a in all_links])

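        # Hedged sketch using the imported-but-unused json module to persist
        # the harvested links; the output filename is an assumption:
        #   with open("onion_site_links.json", "w") as link_file:
        #       json.dump([a.get_attribute('href') for a in all_links],
        #                 link_file)
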
        # Save images. Example product-image XPaths kept for reference:
        # /html/body/div[3]/div/div/div[5]/div[2]/div[1]/div/div[1]/div[1]/a/img
        # /html/body/div[3]/div/div/div[5]/div[2]/div[2]/div/div[1]/div[1]/a/img
        # images = driver.find_elements(By.XPATH, "/html/body/div[3]/div/div/div[5]/div[2]/div[2]/div/div[1]/div[1]/a/img")
        images = driver.find_elements(By.TAG_NAME, 'img')
        time.sleep(10)

print("Pictures: ", len(images))
|
|
output_dir = str("/tmp/" + url+ str(datetime.datetime.now()))
|
|
os.makedirs(output_dir, 755)
|
|
image_count=0
|
|
for i in images:
|
|
print("Image: ", i)
|
|
urllib.request.urlretrieve(i.get_attribute('src'), output_dir+'/'+str(image_count)+'.jpg')
|
|
image_count += 1
|
|
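            # Hedged note, not in the original: urllib.request connects over
            # the clearnet and will not reach .onion hosts unless pointed at
            # Tor's SOCKS proxy. A sketch that stays inside the browser
            # session instead is to save the rendered element directly:
            #   i.screenshot(output_dir + '/' + str(image_count) + '.png')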
            # image_md5 = hashlib.md5(b'image.jpg')
            # f.write(images_result)
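            # Hedged sketch, assuming the line above meant to hash the
            # downloaded file's bytes rather than the literal b'image.jpg':
            #   with open(output_dir + '/' + str(image_count - 1) + '.jpg', 'rb') as img:
            #       image_md5 = hashlib.md5(img.read()).hexdigest()
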
        # Write the page source to file
        with open(file_name, "w") as f:
            f.write(driver.page_source)

# Check whether the argument is a file listing URLs or a single URL
if isfile(url):
    with open(url) as url_file:
        for line in url_file:
            scrape(line.strip())
else:
    scrape(url)