#!/usr/bin/python
#####################################################
# Title: D3-Analyser
# Author: Jesper Bergman (jesperbe@dsv.su.se)
# Licence: MIT Licence
#####################################################

"""
------------------------------------------------------------------------
D3-Analyser is an annotation-driven machine learning program for
analysing and classifying text from, for example, a darknet source,
but also from other web or text content sources.
------------------------------------------------------------------------

Usage:
  d3-analyser.py (-d | --database)
  d3-analyser.py (-d | --database) [--training | --new]
  d3-analyser.py (-h | --help)

Examples:
  d3-analyser.py --database annotations.db --training
  d3-analyser.py --database annotations.db --new --urls

Options:
  -h --help      Show this message.
  -m --model     Algorithm to use for the classification model. Default: ALL. Valid options are: svm, rf, lr.
  -c --classify  Takes the file and uses it as comparison with --file=FILE.
  -d --database  Tries to connect to the SQLite database and parse data from it. Required.
  -n --new       Classify new/unseen content.
  -t --training  Classify training/testing data set.
  -u --urls      New URLs to classify (must be in database). Could be in separate file.
  -q --quit      Quit this program.
"""

# Load datasets and algorithms
from docopt import docopt
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
#import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from pandas.io import sql

# Load libraries
import sqlite3
import os
import time
import datetime
import pandas as pd
import numpy as np
import sys

# NOTE: the database path is currently hard-coded; the --database option is not yet wired up.
db_connection = sqlite3.connect('../D3-Centraliser/100_test.db')
cursor = db_connection.cursor()
df = []


def main(arguments):
    # Classify unseen samples.
    new_classification = arguments['--new']

    # Classify only the training dataset. Include users' annotations and highlighted texts.
    training_classification = arguments['--training']

    # SQL query will differ depending on new/old data.
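    # Expected schema (inferred from the two queries below, hypothetical beyond that):
    # the annotation database links the webpage, notes, highlightedText, rawpage and
    # categories tables through a shared sha256 column. Training mode joins the
    # analyst's notes and highlighted text; new-content mode joins the raw page text.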
    sql_query = None
    try:
        # Fetch the table to analyse; the query differs for training vs. new data.
        if training_classification is True:
            sql_query = ("SELECT DISTINCT webpage.url, notes.note, highlightedText.highlightedText, categories.category "
                         "FROM webpage, notes, highlightedText, categories "
                         "WHERE categories.sha256 = webpage.sha256 AND webpage.sha256 = highlightedText.sha256 AND webpage.sha256 = notes.sha256")
            print("Using training data (annotations and highlighted text).")
        elif new_classification is True:
            sql_query = ("SELECT DISTINCT webpage.url, highlightedText.highlightedText, rawpage.content, categories.category "
                         "FROM webpage, rawpage, highlightedText, categories "
                         "WHERE categories.sha256 = webpage.sha256 AND webpage.sha256 = highlightedText.sha256 AND webpage.sha256 = rawpage.sha256")
            print("Using new/unseen content (raw page text).")

        if sql_query is None:
            print("Nothing to do: pass --training or --new.")
            return

        # Insert into a Pandas DataFrame
        df = pd.read_sql_query(sql_query, db_connection)
        print(df.head(), training_classification)

        # Build classifiers based on the DataFrame
        build_classifiers(df, training_classification)

        db_connection.commit()
    except sqlite3.Error as err:
        print("Sqlite error:", err)
    finally:
        # Close DB connection
        db_connection.close()


def add_to_database(item, value):
    """Add classification scores to the categories table."""
    if item == "svm_score":
        db_connection.execute("INSERT INTO categories(svm_score) VALUES(?)", [value])
    if item == "lr_score":
        db_connection.execute("INSERT INTO categories(lr_score) VALUES(?)", [value])
    db_connection.commit()


def build_classifiers(df, include_annotations, *unknown_samples):
    """Build and evaluate SVM, logistic regression, random forest and naive Bayes models."""
    # Specify target (class) label
    y = df["category"]
    print(y.head())

    # Training data uses the analyst's notes and highlighted text; new data uses the raw page content.
    if include_annotations is True:
        corpus = df['note'].values.astype('U') + df['highlightedText'].astype('U')
    else:
        corpus = df['content'].values.astype('U')  # + df['highlightedText'].astype('U')
    #print(corpus.head(n=15), "\n\n", corpus.shape)

    # Transform the document strings into a numeric bag-of-words matrix
    vectoriser = CountVectorizer()
    X = vectoriser.fit_transform(corpus)
    print("X features: ", X.max(), X.min(), type(X), "Voc size: ", len(vectoriser.get_feature_names()))

    # Train/test split 70/30
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

    """ Support vector machine classifier """
    svm_clf = svm.SVC(C=1, kernel="linear").fit(X_train, y_train)
    #svm_clf = OneVsRestClassifier(svm.LinearSVC()).fit(X_train, y_train)
    svm_prediction = svm_clf.predict(X_test)
    svm_cross_validation_score = cross_val_score(svm_clf, X, y, cv=5)
    #svm_precision, svm_recall, svm_threshold = precision_recall_curve(y_test, svm_clf.decision_function(X_test))

    # Report precision/recall and accuracy
    print("---- \n Support vector machine \n----")
    print(classification_report(y_test, svm_prediction))
    print("\n Accuracy score: ", accuracy_score(y_test, svm_prediction))
    print("\n Balanced accuracy score: ", balanced_accuracy_score(y_test, svm_prediction))
    print("SVM cross validation mean score: {}".format(np.mean(svm_cross_validation_score)))
    print("SVM predictions: ", svm_prediction)

    """ Logistic regression classifier """
    # 5-fold cross-validation score using a logistic regression classifier
    lr_clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)
    lr_score = cross_val_score(lr_clf, X, y, cv=5)
    lr_prediction = lr_clf.predict(X_test)
    lr_accuracy = accuracy_score(y_test, lr_prediction)
    #lr_precision, lr_recall, lr_threshold = precision_recall_curve(y_test, lr_prediction)
    #lr_precision, lr_recall, lr_threshold = precision_recall_curve(y_test, lr_clf.decision_function(X_test))

    print("---- \n Logistic regression \n----")
    print(classification_report(y_test, lr_prediction))
    print("LR accuracy score: ", lr_accuracy)
    print("\n Balanced accuracy score: ", balanced_accuracy_score(y_test, lr_prediction))
    print("LR cross validation scores: ", lr_score)
    print("LR cross validation mean score: {}".format(np.mean(lr_score)))

    """ Random forest classifier """
    rf_clf = RandomForestClassifier(max_depth=5, random_state=0).fit(X_train, y_train)
    rf_prediction = rf_clf.predict(X_test)
    rf_cross_validation_score = cross_val_score(rf_clf, X, y, cv=5)
    rf_accuracy = accuracy_score(y_test, rf_prediction)
    #rf_precision, rf_recall, rf_threshold = precision_recall_curve(y_test, rf_clf.predict(X_test))

    print("---- \n Random Forest \n----")
    print(classification_report(y_test, rf_prediction))
    print("RF accuracy score: ", rf_accuracy)
    print("\n Balanced accuracy score: ", balanced_accuracy_score(y_test, rf_prediction))
    print("RF cross validation scores: ", rf_cross_validation_score)
    print("RF cross validation mean score: {}".format(np.mean(rf_cross_validation_score)))

    """ Naive Bayes classifier """
    nb_clf = MultinomialNB().fit(X_train, y_train)
    nb_prediction = nb_clf.predict(X_test)
    nb_score = cross_val_score(nb_clf, X, y, cv=5)
    nb_accuracy = accuracy_score(y_test, nb_prediction)
    #nb_precision, nb_recall, nb_threshold = precision_recall_curve(y_test, nb_clf.predict(X_test))

    print("---- \n Naive Bayes \n----")
    print(classification_report(y_test, nb_prediction))
    print("NB accuracy score: ", nb_accuracy)
    print("NB balanced accuracy score: ", balanced_accuracy_score(y_test, nb_prediction))
    print("NB cross validation scores: ", nb_score)
    print("NB cross validation mean score: {}".format(np.mean(nb_score)))
    print("NB predictions: ", nb_prediction)

    """ Plot precision/recall to graph """
    #plot_precision_recall(lr_precision, lr_recall, svm_precision, svm_recall, rf_precision, rf_recall, nb_precision, nb_recall)

    return svm_cross_validation_score


def d3_predict(prediction_candidate):
    # Placeholder: currently just retrains on the module-level DataFrame;
    # prediction of a single candidate is not implemented yet.
    build_classifiers(df)


def plot_precision_recall(lr_precision, lr_recall, svm_precision, svm_recall, rf_precision, rf_recall, nb_precision, nb_recall):
    # Imported here so the rest of the script runs without matplotlib installed.
    import matplotlib.pyplot as plt

    plt.plot(lr_precision, lr_recall, label="Logistic regression")
    plt.plot(svm_precision, svm_recall, label="Support vector machine")
    plt.plot(nb_precision, nb_recall, label="Naive Bayes")
    plt.plot(rf_precision, rf_recall, label="Random forest")
    plt.xlabel("Precision")
    plt.ylabel("Recall")
    plt.legend(loc="best")
    pr_file = str(datetime.datetime.now()) + "PR.png"
    plt.savefig(pr_file, format="png")


# Entry point
if __name__ == "__main__":
    arguments = docopt(__doc__, version='d3-analyser 0.1')
    main(arguments)
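# A minimal usage sketch (commented out, not part of the original workflow): to score
# unseen pages with an already fitted model, the CountVectorizer and classifier built
# in build_classifiers() would first have to be returned or persisted. The names
# `vectoriser` and `clf` below are assumptions, not objects this script currently exposes.
#
#   X_new = vectoriser.transform(["raw text of a newly crawled page"])
#   print(clf.predict(X_new))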