D3/D3-Collector/d3-collector.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sqlite3 as sqlite
import sys
import logging
import traceback
from datetime import datetime
#from d3_classifier import d3_predict
from urllib.request import urlopen
from urllib.error import URLError
from urllib.error import HTTPError
import urllib.request as request
from bs4 import BeautifulSoup

# Global variable
X_predict = []
logging.basicConfig(filename='d3-collector.log', level=logging.DEBUG)

# Open database - on event, get new URLs and fetch them, then provide some
# descriptive statistics.
db_connection = None
def main():
    counter = 0
    print("D3-Collector\n\n"
          "A very simple web scraping script that fetches URLs from an annotation "
          "database and then tries to download each resource.\n\n"
          "Usage:\n\nd3-collector.py -d database.db")
    # NOTE: the -d option shown above is not actually parsed; the database
    # path below is hardcoded.
    db_connection = None
    try:
        db_connection = sqlite.connect('../D3-Centraliser/100_test.db')
        db_cursor = db_connection.cursor()
        db_cursor.execute("SELECT DISTINCT url, sha256 FROM webpage")
        onion_sites = db_cursor.fetchall()
        for tpl in onion_sites:
            # Parameterised query rather than string concatenation,
            # to avoid SQL injection via stored URLs.
            db_cursor.execute("SELECT url FROM rawpage WHERE url = ?", (tpl[0],))
            exists = db_cursor.fetchall()
            if exists:
                print(tpl[0], " already exists in DB. Skipping ", exists)
            else:
                rawpage = wget(tpl[0])
                print(tpl[0], " and ", tpl[1], "\n")
                if len(rawpage) == 2:
                    db_cursor.execute(
                        '''INSERT INTO rawpage(sha256, url, html, content, timestamp)
                           VALUES (?, ?, ?, ?, ?);''',
                        (tpl[1], tpl[0], rawpage[0], rawpage[1], datetime.now()))
                    db_connection.commit()
                    logging.info(tpl[0])
                    logging.info("Inserted to rawpage table in the database.")
                    counter += 1
                    print("|-", counter)
    except sqlite.Error as er:
        print('SQLite error: %s' % (' '.join(er.args)))
        print("Exception class is: ", er.__class__)
        print('SQLite traceback: ')
        traceback.print_exc()
    finally:
        if db_connection:
            db_connection.close()
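
# A minimal sketch of the SQLite schema that the queries in main() assume:
# webpage(url, sha256) is read and rawpage(sha256, url, html, content,
# timestamp) is written. This helper is not part of the original file, and
# the column types are assumptions.
def init_schema(db_path='../D3-Centraliser/100_test.db'):
    conn = sqlite.connect(db_path)
    conn.executescript('''
        CREATE TABLE IF NOT EXISTS webpage (url TEXT, sha256 TEXT);
        CREATE TABLE IF NOT EXISTS rawpage (
            sha256 TEXT, url TEXT, html TEXT, content TEXT, timestamp TEXT
        );
    ''')
    conn.commit()
    conn.close()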
def wget(url):
    req = request.Request(
        url,
        data=None,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0'}
    )
    print("|- Getting URL:", url)
    arr = []
    try:
        html = urlopen(req, timeout=300)
        # Read the response exactly once, then parse the resulting string.
        # (Handing the response object to BeautifulSoup first would consume
        # the stream and leave read() with nothing.)
        raw = html.read().decode('utf-8')
        bs = BeautifulSoup(raw, 'html.parser')
        parsed = bs.get_text()
        arr = [raw, parsed]
        print("Raw page retrieval successful.")
        sys.stdout.flush()
    # HTTPError is a subclass of URLError, so it has to be caught first.
    except HTTPError as e:
        logging.debug("Could not download. HTTP error.")
        print("HTTP error.", e)
    except URLError as e:
        logging.debug("Could not download. URL error.")
        print("URL error.", e)
    return arr
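
# Sketch (an assumption, not in the original file): the name onion_sites in
# main() suggests these are .onion URLs, which plain urllib cannot reach
# directly. With the PySocks package installed and a local Tor daemon
# listening on 127.0.0.1:9050, all sockets can be routed through Tor before
# wget() is called:
def enable_tor_proxy(host="127.0.0.1", port=9050):
    import socks
    import socket
    socks.set_default_proxy(socks.SOCKS5, host, port)
    socket.socket = socks.socksocket  # urlopen() now tunnels through Tor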
def svm_score(varX):
    # Relies on the d3_classifier import that is commented out above;
    # build_classifier is not defined anywhere in this file.
    score = build_classifier(varX)
    print(score)

if __name__ == "__main__":
    main()