#!/usr/bin/python
# -*- coding: utf-8 -*-
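"""D3-Collector.

A very simple web scraping script that fetches URLs from an annotation
database and then tries to download each resource (see the usage message
printed in main()).

Usage:
    d3-collector.py -d database.db
"""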

import sqlite3 as sqlite
import sys
import logging
from datetime import datetime

# from d3_classifier import d3_predict

from urllib.request import urlopen
import urllib.request as request
from urllib.error import URLError
from urllib.error import HTTPError

from bs4 import BeautifulSoup

# Global variables
X_predict = []

logging.basicConfig(filename='d3-collector.log', level=logging.DEBUG)

# Open database - on event, get new URLs and fetch them, then provide some
# descriptive statistics.
db_connection = None
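
# The queries below assume a SQLite database with (at least) a `webpage` table
# holding annotated URLs and a `rawpage` table holding downloaded content.
# A minimal sketch of that assumed schema, inferred from the SELECT/INSERT
# statements in main(); the column types are guesses, not taken from the source:
#
#   CREATE TABLE IF NOT EXISTS webpage (
#       url    TEXT,
#       sha256 TEXT
#   );
#   CREATE TABLE IF NOT EXISTS rawpage (
#       sha256    TEXT,
#       url       TEXT,
#       html      TEXT,
#       content   TEXT,
#       timestamp TEXT
#   );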


def main():
    counter = 0
    # Initialise locally so the `finally` block below never sees an unbound
    # name if sqlite.connect() fails.
    db_connection = None

    print("D3-Collector \n\nA very simple web scraping script that fetches URLs "
          "from an annotation database and then tries to download that resource. "
          "\n\nUsage: \n\nd3-collector.py -d database.db")

    try:
        db_connection = sqlite.connect('../D3-Centraliser/100_test.db')
        db_cursor = db_connection.cursor()
        db_cursor.execute("SELECT DISTINCT url, sha256 FROM webpage")
        onion_sites = db_cursor.fetchall()

        for tpl in onion_sites:
            # Parameterised query instead of string concatenation, which was
            # vulnerable to quoting and injection problems.
            db_cursor.execute("SELECT url FROM rawpage WHERE url = ?;", (tpl[0],))
            exists = db_cursor.fetchall()

            if exists:
                print(tpl[0], " already exists in DB. Skipping ", exists)
            else:
                rawpage = wget(tpl[0])
                print(tpl[0], " and ", tpl[1], "\n")

                if len(rawpage) == 2:
                    db_cursor.execute(
                        '''INSERT INTO rawpage(sha256, url, html, content, timestamp)
                           VALUES(?,?,?,?,?);''',
                        (tpl[1], tpl[0], rawpage[0], rawpage[1], datetime.now()))
                    db_connection.commit()
                    logging.info(tpl[0])
                    logging.info("Inserted into rawpage table in the database.")
                    counter += 1
                    print("|-", counter)

    except sqlite.Error as er:
        print('SQLite error: %s' % (' '.join(er.args)))
        print("Exception class is: ", er.__class__)
        print('SQLite traceback: ')

    finally:
        if db_connection:
            db_connection.close()


def wget(url):
    """Download `url` and return [raw_html, extracted_text], or [] on failure."""
    req = request.Request(
        url,
        data=None,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0'}
    )

    print("|- Getting URL:", url)
    arr = []
    try:
        html = urlopen(req, timeout=300)
        # Read the response once and parse that string; parsing the response
        # object first would consume it and leave nothing for read().
        raw = html.read().decode('utf-8')
        bs = BeautifulSoup(raw, 'html.parser')
        parsed = bs.get_text()

        arr = [raw, parsed]
        print("Raw page retrieval successful.")
        sys.stdout.flush()
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be caught before
        # URLError or this handler would be unreachable.
        logging.debug("Could not download. HTTP error.")
        print("HTTP error.", e)
    except URLError as e:
        logging.debug("Could not download. URL error.")
        print("URL error.", e)

    return arr


def svm_score(varX):
    # build_classifier is not defined in this script; presumably it comes from
    # the d3_classifier module whose import is commented out above.
    score = build_classifier(varX)

    print(score)


if __name__ == "__main__":
    main()