D3/D3-Collector/d3-collector.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sqlite3 as sqlite
import sys
import logging
import traceback
from datetime import datetime
#from d3_classifier import d3_predict
from urllib.request import urlopen
from urllib.error import URLError
from urllib.error import HTTPError
import urllib.request as request
from bs4 import BeautifulSoup

# Global variable
X_predict = []
logging.basicConfig(filename='d3-collector.log', level=logging.DEBUG)

# Open database - on event, get new URLs and fetch them, then provide some
# descriptive statistics.
db_connection = None
def main():
    counter = 0
    print("D3-Collector\n\n"
          "A very simple web scraping script that fetches URLs from an annotation "
          "database and then tries to download each resource.\n\n"
          "Usage:\n\nd3-collector.py -d database.db")
    # NOTE: the -d option shown above is not actually parsed; the database
    # path below is hardcoded.
    db_connection = None
    try:
        db_connection = sqlite.connect('../D3-Centraliser/100_test.db')
        db_cursor = db_connection.cursor()
        db_cursor.execute("SELECT DISTINCT url, sha256 FROM webpage")
        onion_sites = db_cursor.fetchall()
        for tpl in onion_sites:
            # Parameterised query rather than string concatenation,
            # to avoid SQL injection via stored URLs.
            db_cursor.execute("SELECT url FROM rawpage WHERE url = ?", (tpl[0],))
            exists = db_cursor.fetchall()
            if exists:
                print(tpl[0], " already exists in DB. Skipping ", exists)
            else:
                rawpage = wget(tpl[0])
                print(tpl[0], " and ", tpl[1], "\n")
                if len(rawpage) == 2:
                    db_cursor.execute(
                        '''INSERT INTO rawpage(sha256, url, html, content, timestamp)
                           VALUES (?, ?, ?, ?, ?);''',
                        (tpl[1], tpl[0], rawpage[0], rawpage[1], datetime.now()))
                    db_connection.commit()
                    logging.info(tpl[0])
                    logging.info("Inserted to rawpage table in the database.")
                    counter += 1
                    print("|-", counter)
    except sqlite.Error as er:
        print('SQLite error: %s' % (' '.join(er.args)))
        print("Exception class is: ", er.__class__)
        print('SQLite traceback: ')
        traceback.print_exc()
    finally:
        if db_connection:
            db_connection.close()
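
# A minimal sketch of the SQLite schema that the queries in main() assume:
# webpage(url, sha256) is read and rawpage(sha256, url, html, content,
# timestamp) is written. This helper is not part of the original file, and
# the column types are assumptions.
def init_schema(db_path='../D3-Centraliser/100_test.db'):
    conn = sqlite.connect(db_path)
    conn.executescript('''
        CREATE TABLE IF NOT EXISTS webpage (url TEXT, sha256 TEXT);
        CREATE TABLE IF NOT EXISTS rawpage (
            sha256 TEXT, url TEXT, html TEXT, content TEXT, timestamp TEXT
        );
    ''')
    conn.commit()
    conn.close()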
def wget(url):
    req = request.Request(
        url,
        data=None,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0'}
    )
    print("|- Getting URL:", url)
    arr = []
    try:
        html = urlopen(req, timeout=300)
        # Read the response exactly once, then parse the resulting string.
        # (Handing the response object to BeautifulSoup first would consume
        # the stream and leave read() with nothing.)
        raw = html.read().decode('utf-8')
        bs = BeautifulSoup(raw, 'html.parser')
        parsed = bs.get_text()
        arr = [raw, parsed]
        print("Raw page retrieval successful.")
        sys.stdout.flush()
    # HTTPError is a subclass of URLError, so it has to be caught first.
    except HTTPError as e:
        logging.debug("Could not download. HTTP error.")
        print("HTTP error.", e)
    except URLError as e:
        logging.debug("Could not download. URL error.")
        print("URL error.", e)
    return arr
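
# Sketch (an assumption, not in the original file): the name onion_sites in
# main() suggests these are .onion URLs, which plain urllib cannot reach
# directly. With the PySocks package installed and a local Tor daemon
# listening on 127.0.0.1:9050, all sockets can be routed through Tor before
# wget() is called:
def enable_tor_proxy(host="127.0.0.1", port=9050):
    import socks
    import socket
    socks.set_default_proxy(socks.SOCKS5, host, port)
    socket.socket = socks.socksocket  # urlopen() now tunnels through Tor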
def svm_score(varX):
    # Relies on the d3_classifier import that is commented out above;
    # build_classifier is not defined anywhere in this file.
    score = build_classifier(varX)
    print(score)

if __name__ == "__main__":
    main()