#!/usr/bin/python
# -*- coding: utf-8 -*-

import sqlite3 as sqlite
import sys
import logging
import traceback
import datetime
#from d3_classifier import d3_predict

from urllib.request import urlopen
from urllib.error import URLError
from urllib.error import HTTPError
import urllib.request as request
from bs4 import BeautifulSoup

# Global variable (currently unused).
X_predict = []

logging.basicConfig(filename='d3-collector.log', level=logging.DEBUG)

# Open the database - on event, get new URLs and fetch them, then provide
# some descriptive statistics.

def main():
    counter = 0
    db_connection = None
    print("D3-Collector\n\n"
          "A very simple web scraping script that fetches URLs from an "
          "annotation database and then tries to download that resource.\n\n"
          "Usage:\n\nd3-collector.py -d database.db")
    try:
        # NOTE: the database path is hard-coded for now; the -d option shown
        # in the usage message is not parsed yet.
        db_connection = sqlite.connect('../D3-Centraliser/100_test.db')
        db_cursor = db_connection.cursor()
        db_cursor.execute("SELECT DISTINCT url, sha256 FROM webpage")
        onion_sites = db_cursor.fetchall()
        for url, sha256 in onion_sites:
            # Parameterised query avoids SQL injection via the URL string.
            db_cursor.execute("SELECT url FROM rawpage WHERE url = ?;", (url,))
            exists = db_cursor.fetchall()
            if exists:
                print(url, "already exists in DB. Skipping", exists)
            else:
                rawpage = wget(url)
                print(url, "and", sha256, "\n")
                if len(rawpage) == 2:
                    db_cursor.execute(
                        '''INSERT INTO rawpage(sha256, url, html, content, timestamp)
                           VALUES (?, ?, ?, ?, ?);''',
                        (sha256, url, rawpage[0], rawpage[1],
                         datetime.datetime.now()))
                    db_connection.commit()
                    logging.info(url)
                    logging.info("Inserted to rawpage table in the database.")
                    counter += 1
                    print("|-", counter)
    except sqlite.Error as er:
        print('SQLite error: %s' % ' '.join(er.args))
        print("Exception class is:", er.__class__)
        print('SQLite traceback:')
        traceback.print_exc()
    finally:
        if db_connection:
            db_connection.close()

def wget(url):
    req = request.Request(
        url,
        data=None,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:78.0) '
                               'Gecko/20100101 Firefox/78.0'})
    print("|- Getting URL:", url)
    arr = []
    try:
        response = urlopen(req, timeout=300)
        # Read the body exactly once: parsing the response object directly
        # and then calling read() again would return an empty string.
        raw = response.read().decode('utf-8')
        bs = BeautifulSoup(raw, 'html.parser')
        parsed = bs.get_text()
        arr = [raw, parsed]
        print("Raw page retrieval successful.")
        sys.stdout.flush()
    # HTTPError is a subclass of URLError, so it must be caught first or
    # its handler is unreachable.
    except HTTPError as e:
        logging.debug("Could not download. HTTP error.")
        print("HTTP error.", e)
    except URLError as e:
        logging.debug("Could not download. URL error.")
        print("URL error.", e)
    return arr

def svm_score(varX):
    # Stub: depends on the classifier import commented out above;
    # build_classifier is not defined in this script.
    score = build_classifier(varX)
    print(score)

if __name__ == "__main__":
    main()
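
# ---------------------------------------------------------------------------
# Schema sketch (assumption): the two tables below are inferred from the
# queries in main() and are not taken from the D3-Centraliser project itself;
# the column types are guesses. The helper init_test_db() is likewise
# hypothetical, useful only for creating an empty database to run this
# script against, e.g. init_test_db('../D3-Centraliser/100_test.db').
# ---------------------------------------------------------------------------
SCHEMA_SKETCH = """
CREATE TABLE IF NOT EXISTS webpage (
    url    TEXT NOT NULL,
    sha256 TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS rawpage (
    sha256    TEXT,
    url       TEXT,
    html      TEXT,
    content   TEXT,
    timestamp TEXT
);
"""

def init_test_db(path):
    # Create an empty database with the assumed schema above.
    connection = sqlite.connect(path)
    connection.executescript(SCHEMA_SKETCH)
    connection.commit()
    connection.close()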