DIDECT2S/D3-Analyser/d3-analyser.py
2021-12-03 17:58:48 +01:00

246 lines
10 KiB
Python

#####################################################
# Title: D3-Analyser
# Author: Jesper Bergman (jesperbe@dsv.su.se)
# Licence: MIT Licence
#####################################################
#!/usr/bin/python
"""
------------------------------------------------------------------------
D3-Analyser is an annotation driven machine learning program
for analysing and classifying text from, for example, a darknet source,
but also other web or text content sources.
------------------------------------------------------------------------
Usage:
d3-analyser.py (-d | --database) <database>
d3-analyser.py (-d | --database) <database> [--training | --new]
d3-analyser.py (-h | --help)
Examples:
d3-analyser.py --database annotations.db --training
d3-analyser.py --database annotations.db --new --urls <onion_links.txt>
Options:
-h --help Show this message
-m --model Algorithm to use for the classification model. Default: ALL. Valid options are: svm, rf, lr.
-c --classify Takes the file and uses it as comparison with --file=FILE.
-d --database Tries to connect to the SQLite data and parse data from it. Required.
-n --new Classify new/unseen content
-t --training Classify training/testing data set
-u --urls New URLs to classify (must be in database). Could be in separate file.
-q --quit Quit this program
"""
# Load datasets and algorithms
from docopt import docopt
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_score;
from sklearn.metrics import classification_report;
from sklearn.metrics import accuracy_score;
from sklearn.metrics import balanced_accuracy_score;
from sklearn.metrics import recall_score;
from sklearn.metrics import precision_recall_curve;
from sklearn.model_selection import train_test_split;
from sklearn.feature_extraction.text import TfidfVectorizer;
from sklearn.feature_extraction.text import TfidfTransformer;
from sklearn.feature_extraction.text import CountVectorizer;
from sklearn.naive_bayes import MultinomialNB;
from sklearn.ensemble import RandomForestClassifier;
from sklearn.linear_model import LogisticRegression;
#import matplotlib.pyplot as plt
from sklearn import preprocessing;
from sklearn.preprocessing import StandardScaler
from pandas.io import sql;
# Load libraries
import optparse;
import argparse;
import sqlite3;
import os;
import time;
import pandas as pd;
import numpy as np;
import sys;
# Shared SQLite connection, opened at import time and used by main() and
# add_to_database().
# NOTE(review): the path is hard-coded; nothing visible in this file uses the
# <database> command-line argument to choose the file that is opened here.
db_connection = sqlite3.connect('../D3-Centraliser/100_test.db')
cursor = db_connection.cursor()

# Module-level placeholder for the data set; starts out as an empty list.
df = []
def main(arguments):
    """Load the selected data set from SQLite and build the classifiers.

    Parameters
    ----------
    arguments : dict
        Parsed docopt arguments. Only the ``--training`` and ``--new`` flags
        are read here.

    Raises
    ------
    SystemExit
        If neither ``--training`` nor ``--new`` is set, because there is then
        no query to run.

    Notes
    -----
    The module-level ``db_connection`` is used for the query and is always
    closed before this function returns.

    NOTE(review): ``arguments['<database>']`` is not consulted; the database
    file is whatever the module-level connection was opened on.
    """
    # Classify unseen samples.
    new_classification = arguments['--new']
    # Classify only the training data set, including the users' annotations
    # and highlighted texts.
    training_classification = arguments['--training']

    try:
        # The SQL query differs depending on new/old data.
        if training_classification is True:
            sql_query = "SELECT DISTINCT webpage.url, notes.note, highlightedText.highlightedText, categories.category FROM webpage, notes, highlightedText, categories WHERE categories.sha256 = webpage.sha256 AND webpage.sha256 = highlightedText.sha256 AND webpage.sha256 = notes.sha256"
            print("JA")
        elif new_classification is True:
            sql_query = "SELECT DISTINCT webpage.url, highlightedText.highlightedText, rawpage.content, categories.category FROM webpage, rawpage, highlightedText, categories WHERE categories.sha256 = webpage.sha256 AND webpage.sha256 = highlightedText.sha256 AND webpage.sha256 = rawpage.sha256"
            print("EJ")
        else:
            # Without a mode there is no query; fail with a clear message
            # instead of handing a non-query value to pandas.
            sys.exit("Nothing to classify: pass either --training or --new.")

        # Insert into a Pandas DataFrame
        frame = pd.read_sql_query(sql_query, db_connection)
        # head() must be called; printing the bare attribute only shows the
        # bound-method repr.
        print(frame.head(), training_classification)

        # Build classifiers based on the DataFrame
        build_classifiers(frame, training_classification)
    except sqlite3.Error as err:
        print("Sqlite error:", err)
    finally:
        # Single close point; runs on success, on error and on SystemExit.
        db_connection.close()
def add_to_database(item, value):
    """Add a classification score to the database.

    Inserts ``value`` into the ``categories`` table in the column named by
    ``item``. Only "svm_score" and "lr_score" are recognised; any other
    ``item`` inserts nothing. The module-level connection is committed
    afterwards.
    """
    insert_statements = (
        ("svm_score", "INSERT INTO categories(svm_score) VALUES(?)"),
        ("lr_score", "INSERT INTO categories(lr_score) VALUES(?)"),
    )
    for column, statement in insert_statements:
        if item == column:
            db_connection.execute(statement, [value])
    db_connection.commit()
def build_classifiers(df, include_annotations, *unknown_samples):
    """Build and evaluate SVM, LR, random forest and naive Bayes models.

    The text of ``df`` is converted to bag-of-words counts and split 70/30
    into a train and a test part. Each classifier is fitted on the train
    part, and its classification report, accuracy, balanced accuracy and
    5-fold cross-validation scores are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``category`` column (the class label), plus ``note``
        and ``highlightedText`` when ``include_annotations`` is True, or
        ``content`` otherwise.
    include_annotations : bool
        When exactly ``True``, the corpus is the note text followed by the
        highlighted text; otherwise it is the ``content`` column.
    *unknown_samples
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    numpy.ndarray
        The 5-fold cross-validation scores of the SVM classifier.
    """
    # Specify target (class) label
    y = df["category"]
    print(y.head())

    if include_annotations is True:
        notes = df['note'].values.astype('U')
        highlights = df['highlightedText'].values.astype('U')
        # Join with a space so the last word of the note and the first word
        # of the highlighted text do not fuse into one bogus token.
        corpus = [f"{note} {highlight}" for note, highlight in zip(notes, highlights)]
    else:
        corpus = df['content'].values.astype('U')

    # Transform the strings into numeric count vectors
    vectoriser = CountVectorizer()
    X = vectoriser.fit_transform(corpus)
    # vocabulary_ has one entry per feature and exists in every scikit-learn
    # release, unlike get_feature_names(), which newer releases removed.
    print("X features: ", X.max(), X.min(), type(X), X, "Voc size: ", len(vectoriser.vocabulary_))

    # Train and test split 70/30
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

    # ---- Support vector machine classifier ----
    svm_clf = svm.SVC(C=1, kernel="linear").fit(X_train, y_train)
    svm_prediction = svm_clf.predict(X_test)
    svm_cross_validation_score = cross_val_score(svm_clf, X, y, cv=5)
    print(classification_report(y_test, svm_prediction))
    print("---- \n Support vector machine \n----")
    print("\n Accuracy score: ", accuracy_score(y_test, svm_prediction))
    print("\n Balanced accuracy score: ", balanced_accuracy_score(y_test, svm_prediction))
    print("SVM cross validation mean score:{}".format(np.mean(svm_cross_validation_score)))
    print("SVM predictions: ", svm_prediction)

    # ---- Logistic regression classifier ----
    lr_clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)
    # 5-fold cross-validation score
    lr_score = cross_val_score(lr_clf, X, y, cv=5)
    lr_prediction = lr_clf.predict(X_test)
    lr_accuracy = accuracy_score(y_test, lr_prediction)
    print("---- \n Logistic regression \n----")
    print(classification_report(y_test, lr_prediction))
    print("LR accuracy score: ", lr_accuracy)
    print("\n Balanced accuracy score: ", balanced_accuracy_score(y_test, lr_prediction))
    print("LR cross validation scores: ", lr_score)
    print("LR cross validation mean score:{}".format(np.mean(lr_score)))

    # ---- Random forest classifier ----
    rf_clf = RandomForestClassifier(max_depth=5, random_state=0).fit(X_train, y_train)
    rf_prediction = rf_clf.predict(X_test)
    # rf_score holds the cross-validation scores and rf_accuracy the test-set
    # accuracy, so each value is printed under the label that describes it.
    rf_score = cross_val_score(rf_clf, X, y, cv=5)
    rf_accuracy = accuracy_score(y_test, rf_prediction)
    print("---- \n Random Forest \n----")
    print(classification_report(y_test, rf_prediction))
    print("RF accuracy score: ", rf_accuracy)
    print("\n Balanced accuracy score: ", balanced_accuracy_score(y_test, rf_prediction))
    print("RF cross validation scores: ", rf_score)
    print("RF cross validation mean score:{}".format(np.mean(rf_score)))

    # ---- Naive Bayes classifier ----
    nb_clf = MultinomialNB().fit(X_train, y_train)
    nb_score = cross_val_score(nb_clf, X, y, cv=5)
    nb_prediction = nb_clf.predict(X_test)
    nb_accuracy = accuracy_score(y_test, nb_prediction)
    print("---- \n Naive Bayes \n----")
    print(classification_report(y_test, nb_prediction))
    print("NB accuracy score: ", nb_accuracy)
    print("NB balanced accuracy score: ", balanced_accuracy_score(y_test, nb_prediction))
    print("NB cross validation scores: ", nb_score)
    print("NB cross validation mean score:{}".format(np.mean(nb_score)))
    print("NB predictions: ", nb_prediction)

    # Precision/recall plotting is not wired up: no precision/recall curves
    # are computed in this function.
    return svm_cross_validation_score
def d3_predict(prediction_candidate):
    """Run the classifier pipeline on the module-level data set.

    Parameters
    ----------
    prediction_candidate
        Sample to classify. Currently unused by this function.

    Returns
    -------
    The value returned by ``build_classifiers``.
    """
    # build_classifiers() requires include_annotations; omitting it raised a
    # TypeError on every call.
    # NOTE(review): False selects the raw ``content`` column, as in --new
    # mode — confirm this is the intended mode for prediction.
    # NOTE(review): the module-level ``df`` starts out as an empty list, so it
    # must be replaced with a DataFrame before this function can succeed.
    return build_classifiers(df, False)
def plot_precision_recall(lr_precision, lr_recall, svm_precision, svm_recall, rf_precision, rf_recall, nb_precision, nb_recall):
    """Plot precision against recall for the four classifiers and save a PNG.

    Each ``*_precision`` / ``*_recall`` pair is drawn as one labelled line,
    with precision on the x-axis and recall on the y-axis. The figure is
    written to the current working directory under a name made of the
    current timestamp followed by "PR.png".

    Parameters
    ----------
    lr_precision, lr_recall
        Precision and recall values for logistic regression.
    svm_precision, svm_recall
        Precision and recall values for the support vector machine.
    rf_precision, rf_recall
        Precision and recall values for the random forest.
    nb_precision, nb_recall
        Precision and recall values for naive Bayes.
    """
    # Imported locally: the module-level matplotlib import is commented out
    # and datetime is not imported at file level, so neither name would
    # otherwise be defined when this function runs.
    import datetime
    import matplotlib.pyplot as plt

    curves = (
        (lr_precision, lr_recall, "Logistic regression"),
        (svm_precision, svm_recall, "Support vector machine"),
        (nb_precision, nb_recall, "Naive Bayes"),
        (rf_precision, rf_recall, "Random forest"),
    )
    for precision, recall, label in curves:
        plt.plot(precision, recall, label=label)

    plt.xlabel("Precision")
    plt.ylabel("Recall")
    plt.legend(loc="best")

    pr_file = str(datetime.datetime.now()) + "PR.png"
    plt.savefig(pr_file, format="png")
# Script entry point: parse the command line against the module docstring
# and hand the resulting arguments to main().
if __name__ == "__main__":
    arguments = docopt(__doc__, version='d3-analyser 0.1')
    main(arguments)