#!/usr/bin/python
#####################################################
# Title: D3-Analyser
# Author: Jesper Bergman (jesperbe@dsv.su.se)
# Licence: MIT Licence
#####################################################

"""
|
|
------------------------------------------------------------------------
|
|
D3-Analyser is an annotation driven machine learning program
|
|
for analsying and classifying text from, for example, a darknet source,
|
|
but also other web or text content sources.
|
|
------------------------------------------------------------------------
|
|
|
|
Usage:
|
|
d3-analyser.py (-d | --database) <database>
|
|
d3-analyser.py (-d | --database) <database> [--training | --new]
|
|
d3-analyser.py (-h | --help)
|
|
|
|
Examples:
|
|
d3-analyser.py --database annotations.db --training
|
|
d3-analyser.py --database annotations.db --new --urls <onion_links.txt>
|
|
|
|
Options:
|
|
-h --help Show this message
|
|
-m --model Algorithm to use for the clssification model. Default: ALL. Valid options are: svm, rf, lr.
|
|
-c --classify Takes the file and uses it as comparison with --file=FILE.
|
|
-d --database Tries to connect to the SQLite data and parse data from it. Required.
|
|
-n --new Classify new/unseen content
|
|
-t --training Classify training/testing data set
|
|
-u --urls New URLs to classify (must be in database). Could be in separate file.
|
|
-q --quit Quit this program
|
|
|
|
"""

# Load datasets and algorithms
from docopt import docopt
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
#import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from pandas.io import sql

# Load libraries
import optparse
import argparse
import sqlite3
import os
import time
import datetime
import pandas as pd
import numpy as np
import sys

# Module-level connection to the SQLite annotation database (path in the D3-Centraliser directory)
db_connection = sqlite3.connect('../D3-Centraliser/100_test.db')
cursor = db_connection.cursor()
df = []

def main(arguments):
    # Classify unseen samples.
    new_classification = arguments['--new']

    # Classify only the training data set. Includes user annotations and highlighted texts.
    training_classification = arguments['--training']

    # The SQL query differs depending on whether the data is new or annotated.
    sql_query = False

    try:
        # Fetch the table to analyse
        if(training_classification is True):
            sql_query = "SELECT DISTINCT webpage.url, notes.note, highlightedText.highlightedText, categories.category FROM webpage, notes, highlightedText, categories WHERE categories.sha256 = webpage.sha256 AND webpage.sha256 = highlightedText.sha256 AND webpage.sha256 = notes.sha256"
            print("Classifying the annotated training data set")
        elif(new_classification is True):
            sql_query = "SELECT DISTINCT webpage.url, highlightedText.highlightedText, rawpage.content, categories.category FROM webpage, rawpage, highlightedText, categories WHERE categories.sha256 = webpage.sha256 AND webpage.sha256 = highlightedText.sha256 AND webpage.sha256 = rawpage.sha256"
            print("Classifying new/unseen content")
        else:
            print("Please specify --training or --new.")
            return

        # Insert the result into a Pandas DataFrame
        df = pd.read_sql_query(sql_query, db_connection)

        print(df.head(), training_classification)

        # Build classifiers based on the DataFrame
        build_classifiers(df, training_classification)

        # Persist any pending changes
        db_connection.commit()

    except sqlite3.Error as err:
        print("Sqlite error:", err)
    finally:
        # Close DB connection
        db_connection.close()

'''
ADD CLASSIFICATION SCORES TO DATABASE
'''
def add_to_database(item, value):
    if(item == "svm_score"):
        db_connection.execute("INSERT INTO categories(svm_score) VALUES(?)", [value])
    if(item == "lr_score"):
        db_connection.execute("INSERT INTO categories(lr_score) VALUES(?)", [value])

    db_connection.commit()
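
# Illustrative (hypothetical) use of add_to_database(), e.g. with the mean
# cross-validation score computed inside build_classifiers() below:
#   add_to_database("svm_score", float(np.mean(svm_cross_validation_score)))
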
'''
BUILD SVM AND LR MODELS
'''
def build_classifiers(df, include_annotations, *unknown_samples):

    # Specify the target (class) label
    y = df["category"]
    corpus = []

    print(y.head())

    # Annotated training data: use the analyst's notes and highlighted text as features.
    if(include_annotations is True):
        corpus = df['note'].values.astype('U') + df['highlightedText'].astype('U')
    # New/unseen data: use the raw page content instead.
    else:
        corpus = df['content'].values.astype('U')  # + df['highlightedText'].astype('U')

    #print(corpus.head(n=15), "\n\n", corpus.shape)

    # Transform the document strings into a numeric bag-of-words matrix
    vectoriser = CountVectorizer()
    X = vectoriser.fit_transform(corpus)
    print("X features: ", X.max(), X.min(), type(X), X, "Voc size: ", len(vectoriser.get_feature_names()))

    # Train/test split, 70/30, shuffled
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

"""
|
|
Support vector machine classifier
|
|
"""
|
|
svm_clf = svm.SVC(C=1, kernel="linear").fit(X_train, y_train);
|
|
#svm_clf = OneVsRestClassifier(svm.LinearSVC()).fit(X_train, y_train)
|
|
svm_prediction = svm_clf.predict(X_test);
|
|
svm_cross_validation_score = cross_val_score(svm_clf, X, y, cv=5);
|
|
#svm_precision, svm_recall, svm_threshold = precision_recall_curve(y_test, svm_prediction)
|
|
#svm_precision, svm_recall, svm_threshold = precision_recall_curve(y_test, svm_clf.decision_function(X_test))
|
|
|
|
# Calculate the precision/recall
|
|
print(classification_report(y_test, svm_prediction));
|
|
print("---- \n Support vector machine \n----")
|
|
print("\n Accuracy score: ", accuracy_score(y_test, svm_prediction));
|
|
print("\n Balanced accuracy score: ", balanced_accuracy_score(y_test, svm_prediction))
|
|
print("\n Balanced accuracy score: ", balanced_accuracy_score(y_test, svm_prediction));
|
|
print("SVM cross validation mean score:{}".format(np.mean(svm_cross_validation_score)));
|
|
print("SVM predictions: ", svm_prediction)
|
|
|
|
"""
|
|
Logistic regression classifier
|
|
"""
|
|
# 4 fold C-V score using logistic regression classifier
|
|
lr_clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)
|
|
lr_score = cross_val_score(lr_clf, X, y, cv=5)
|
|
lr_prediction = lr_clf.predict(X_test)
|
|
lr_accuracy = accuracy_score(y_test, lr_prediction)
|
|
#lr_precision, lr_recall, lr_threshold = precision_recall_curve(y_test, lr_prediction)
|
|
#lr_precision, lr_recall, lr_threshold = precision_recall_curve(y_test, lr_clf.decision_function(X_test))
|
|
|
|
print("---- \n Logistic regression \n----")
|
|
print(classification_report(y_test, lr_prediction))
|
|
print("LR accuracy score: ", lr_accuracy)
|
|
print("\n Balanced accuracy score: ", balanced_accuracy_score(y_test, lr_prediction));
|
|
print("LR cross validation scores: ", lr_score)
|
|
print("\n Balanced accuracy score: ", balanced_accuracy_score(y_test, lr_prediction))
|
|
print("LR cross validation mean score:{}".format(np.mean(lr_score)))
|
|
|
|
|
|
"""
|
|
Random Forest Classifier
|
|
"""
|
|
rf_clf = RandomForestClassifier(max_depth=5, random_state=0).fit(X_train, y_train)
|
|
rf_prediction = rf_clf.predict(X_test)
|
|
rf_accuracy = cross_val_score(rf_clf, X, y, cv=5)
|
|
rf_score = accuracy_score(y_test, rf_prediction)
|
|
#rf_precision, rf_recall, rf_threshold = precision_recall_curve(y_test, rf_clf.predict(y_test, rf_clf.predict(X_test)))
|
|
|
|
print("---- \n Random Forest \n----")
|
|
print(classification_report(y_test, rf_prediction))
|
|
print("RF accuracy score: ", rf_accuracy)
|
|
print("\n Balanced accuracy score: ", balanced_accuracy_score(y_test, rf_prediction))
|
|
print("RF cross validation scores: ", rf_score)
|
|
print("RF cross validation mean score:{}".format(np.mean(rf_score)))
|
|
|
|
"""
|
|
Naive Bayes classifier
|
|
"""
|
|
nb_clf = MultinomialNB().fit(X_train, y_train)
|
|
nb_prediction = nb_clf.predict(X_test)
|
|
nb_score = cross_val_score(nb_clf, X, y, cv=5)
|
|
nb_clf = MultinomialNB()
|
|
nb_clf.fit(X_train, y_train)
|
|
nb_score = cross_val_score(nb_clf, X, y, cv=5)
|
|
nb_prediction = nb_clf.predict(X_test)
|
|
nb_accuracy = accuracy_score(y_test, nb_prediction)
|
|
#nb_precision, nb_recall, nb_threshold = precision_recall_curve(y_test, nb_clf.predict(X_test))
|
|
|
|
print("---- \n Naive Bayes \n----")
|
|
print(classification_report(y_test, nb_prediction))
|
|
print("NB accuracy score: ", nb_accuracy)
|
|
print("NB balanced accuracy score: ", balanced_accuracy_score(y_test, nb_prediction))
|
|
print("NB cross validation scores: ", nb_score)
|
|
print("NB cross validation mean score:{}".format(np.mean(nb_score)))
|
|
print("NB predictions: ", nb_prediction)
|
|
"""
|
|
Plot precision/recall to graph
|
|
"""
|
|
#plot_precision_recall(lr_precision, lr_recall, svm_precision, svm_recall, rf_precision, rf_recall, nb_precision, nb_recall)
|
|
|
|
return svm_cross_validation_score
|
|
|
|
def d3_predict(prediction_candidate):
    # Stub: rebuilds the classifiers on the module-level DataFrame without annotations.
    build_classifiers(df, False)

def plot_precision_recall(lr_precision, lr_recall, svm_precision, svm_recall, rf_precision, rf_recall, nb_precision, nb_recall):
    # Imported locally so matplotlib is only required when plotting is used.
    import matplotlib.pyplot as plt

    plt.plot(lr_precision, lr_recall, label="Logistic regression")
    plt.plot(svm_precision, svm_recall, label="Support vector machine")
    plt.plot(nb_precision, nb_recall, label="Naive Bayes")
    plt.plot(rf_precision, rf_recall, label="Random forest")
    plt.xlabel("Precision")
    plt.ylabel("Recall")
    plt.legend(loc="best")
    pr_file = str(datetime.datetime.now()) + "PR.png"
    plt.savefig(pr_file, format="png")

# Entry point
if __name__ == "__main__":
    arguments = docopt(__doc__, version='d3-analyser 0.1')
    main(arguments)