#####################################################
# Title: D3-Analyser - Inter-annotator Agreement Calculator
# Author: Jesper Bergman (jesperbe@dsv.su.se)
# Licence: GPLv2
#####################################################

# Load libraries (standard library)
import sqlite3
import sys
import time

# Load datasets and algorithms (third-party)
import numpy as np
import pandas as pd
from pandas.io import sql
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from statsmodels.stats.inter_rater import aggregate_raters  # https://www.statsmodels.org/stable/_modules/statsmodels/stats/inter_rater.html#fleiss_kappa
from statsmodels.stats.inter_rater import fleiss_kappa  # https://www.statsmodels.org/stable/_modules/statsmodels/stats/inter_rater.html#fleiss_kappa

# Local
from fleiss import fleissKappa

|
|
|
def db_fetch():
|
|
|
|
sql_y1 = "SELECT DISTINCT sha256,category FROM categories WHERE uuid=\"ndmdiHk8\" ORDER BY category"
|
|
sql_y2 = "SELECT DISTINCT sha256,category FROM categories WHERE uuid=\"08puj3X3\" ORDER BY category"
|
|
|
|
y1 = cursor.execute(sql_y1).fetchall()
|
|
y2 = cursor.execute(sql_y2).fetchall()
|
|
|
|
y1_dict = {}
|
|
y2_dict = {}
|
|
left = []
|
|
right = []
|
|
for y1_iter in y1:
|
|
for y2_iter in y2:
|
|
if(y2_iter[0] in y1_iter[0]):
|
|
left.append(y1_iter[1])
|
|
right.append(y2_iter[1])
|
|
val1 = y1_dict.get(y1_iter[1])
|
|
val2 = y2_dict.get(y2_iter[1])
|
|
cat1 = y1_iter[1]
|
|
cat2 = y2_iter[1]
|
|
if(val1 and cat1):
|
|
val1 += [y1_iter[0]]
|
|
y1_dict.update({cat1 : val1})
|
|
print(cat1, " inserted. ", val1, "in 1")
|
|
val1 = ""
|
|
cat1 = ""
|
|
|
|
if(val2 and cat2):
|
|
val2 += [y2_iter[0]]
|
|
y2_dict.update({cat2 : val2})
|
|
print(cat2, " inserted ", val2, "in 2")
|
|
val2 = ""
|
|
cat2 = ""
|
|
else:
|
|
y1_dict.update({cat1 : [y1_iter[0]]})
|
|
y2_dict.update({cat2 : [y2_iter[0]]})
|
|
sorted(y1_dict)
|
|
sorted(y2_dict)
|
|
print("Generic CK: ", cohen_kappa_score(left, right), "\n\n")
|
|
|
|
k1 = y1_dict.keys()
|
|
k2 = y2_dict.keys()
|
|
|
|
for i in k1:
|
|
for ii in k2:
|
|
if(y1_dict.get(i) != None):
|
|
ck1 = y1_dict.get(i)
|
|
if(y2_dict.get(ii) != None):
|
|
ck2 = y2_dict.get(ii)
|
|
if(i == ii):
|
|
# Cut down to equal size for kappa calculations
|
|
if(len(ck1) > len(ck2)):
|
|
ck1 = ck1[:len(ck2)]
|
|
elif(len(ck1) < len(ck2)):
|
|
ck2 = ck2[:len(ck1)]
|
|
|
|
cks = cohen_kappa_score(sorted(ck1), sorted(ck2))
|
|
print("Cohen's kappa score for ", i, ":", ii, ck1, ck2, "\nScore: ", cks)
|
|
#db_insert(ck1, ck2, i, cks, "cohen")
|
|
|
|
# Convert to DF table
|
|
#table = pd.pivot_table(y1, index=['sha256'], columns=['category'], aggfunc=np.sum, fill_value=0)
|
|
#table2 = pd.pivot_table(y2, index=['sha256'], columns=['category'], aggfunc=np.sum, fill_value=0)
|
|
|
|
return cks
|
|
|
|
def db_insert(sha256_1, sha256_2, category, score, score_type):
|
|
if(score_type == "cohen"):
|
|
if(sha256_1 != str):
|
|
for i in sha256_1:
|
|
sql = "UPDATE categories SET cohen_kappa_score=" + str(score) + " WHERE sha256=\"" + i + "\";"
|
|
cursor.execute(sql)
|
|
|
|
if(sha256_2 != str):
|
|
for i in sha256_2:
|
|
sql = "UPDATE categories SET cohen_kappa_score=" + str(score) + " WHERE sha256=\"" + i + "\";"
|
|
cursor.execute(sql)
|
|
|
|
db_connection.commit()
|
|
|
|
'''
COHEN'S KAPPA (inter-annotator agreement between only two annotators)
y1: Labels assigned by the first annotator.
y2: Labels assigned by the second annotator.
'''

def strip_categories(df1, df2):
|
|
# Get indexes
|
|
b = df1.columns
|
|
c = df2.columns
|
|
|
|
# Convert to arrays
|
|
b = b.get_level_values(1).tolist()
|
|
c = c.get_level_values(1).tolist()
|
|
|
|
# Add matching categories to an array
|
|
matches=[]
|
|
for row1 in b:
|
|
for row2 in c:
|
|
if(row1 == row2):
|
|
if(row1 not in matches):
|
|
matches.append(row1)
|
|
print(matches)
|
|
# Fetch one category at a time and calculate the kappa for it
|
|
for category in matches:
|
|
# Resize the biggest
|
|
df1_max_size = int(df1.index.size)
|
|
df2_max_size = int(df2.index.size)
|
|
|
|
if(df1_max_size > df2_max_size):
|
|
df1 = df1.head(df2_max_size)
|
|
if(df2_max_size > df1_max_size):
|
|
df2 = df2.head(df1_max_size)
|
|
print(df1_max_size, df2_max_size)
|
|
|
|
# Singlify categories
|
|
y1 = df1.get(("uuid", category))
|
|
y2 = df2.get(("uuid", category))
|
|
|
|
print(y1, "\n\n", y2, "--------\n")
|
|
|
|
#db_entry = [sha256,]
|
|
# Calculate Kappa
|
|
fk, ck = calculate_kappas(y1, y2)
|
|
print("FLEISS IS TYPE: " , type(fk))
|
|
|
|
# Do not insert NaN
|
|
if(np.isnan(fk) == False):
|
|
print("FLEISS IS OK: ", fk)
|
|
|
|
for index,value in y1.items():
|
|
if(value == 1):
|
|
db_insert(index, category, fk, ck)
|
|
print("Index: ", index, " value ", value, " in category ", category, " Fleiss ", fk, " Cohen's: ", ck, " inserted to DB.")
|
|
|
|
for index2,value2 in y2.items():
|
|
if(value2 == 1):
|
|
db_insert(index, category, fk, ck)
|
|
print("Index: ", index2, " value ", value2, " in category ", category, " Fleiss ", fk, " Cohen's: ", ck, " inserted to DB.")
|
|
|
|
#db_insert(sha256, category,fk, ck)
|
|
|
|
'''
CALCULATE KAPPA
'''

def calculate_kappas(a1, a2):
|
|
# Verify shape in some way.
|
|
if(a1.shape != a2.shape):
|
|
print("Sorry. Incompatible shapes.")
|
|
|
|
# Fleiss
|
|
frames = [a1, a2]
|
|
table3 = pd.concat(frames)
|
|
inputKappa = table3.to_numpy()
|
|
|
|
# Calculate Kappas
|
|
#c_kappa = cohen_kappa_score(a1, a2)
|
|
f_kappa = fleiss_kappa(frames, method='fleiss')
|
|
print("Fleiss: ", f_kappa)
|
|
return f_kappa
|
|
|
|
|
|
'''
|
|
MAIN
|
|
'''
|
|
db_connection = "";
|
|
cursor = "";
|
|
|
|
# Connect to database
|
|
try:
|
|
db_connection = sqlite3.connect('../D3-Centraliser/annotations.db')
|
|
cursor = db_connection.cursor()
|
|
|
|
db_fetch()
|
|
#strip_categories(table, table2)
|
|
|
|
# Close DB connection
|
|
db_connection.commit()
|
|
db_connection.close()
|
|
|
|
except sqlite3.Error as err:
|
|
print("Sqlite error:", err)
|
|
finally:
|
|
db_connection.close()
|
|
|
|
|