2021-12-03 17:58:48 +01:00

209 lines
6.5 KiB
Python

#####################################################
# Title: D3-Analyser - Inter-annotator Agreement Calculator
# Author: Jesper Bergman (jesperbe@dsv.su.se)
# Licence: GPLv2
#####################################################
# Load datasets and algorithms
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import recall_score
from statsmodels.stats.inter_rater import fleiss_kappa # https://www.statsmodels.org/stable/_modules/statsmodels/stats/inter_rater.html#fleiss_kappa
from statsmodels.stats.inter_rater import aggregate_raters # https://www.statsmodels.org/stable/_modules/statsmodels/stats/inter_rater.html#fleiss_kappa
from pandas.io import sql
from fleiss import fleissKappa
# Load libraries
import sqlite3
import time
import pandas as pd
import numpy as np
import sys
def db_fetch(uuid_1="ndmdiHk8", uuid_2="08puj3X3"):
    """Print Cohen's kappa scores for two annotators stored in ``categories``.

    Reads the DISTINCT ``(sha256, category)`` rows of both annotators through
    the module-level ``cursor``, pairs the rows that share a ``sha256``, and
    prints:

    * the overall Cohen's kappa over all paired category labels, and
    * one score per category that both annotators used.

    Args:
        uuid_1: ``uuid`` value of the first annotator. Defaults to the
            identifier that was previously hard-coded in the query.
        uuid_2: ``uuid`` value of the second annotator. Defaults to the
            identifier that was previously hard-coded in the query.

    Returns:
        The last per-category score that was computed, or ``None`` when the
        annotators share no item or no category.
    """
    # Bind the annotator id as a parameter instead of quoting it inline.
    query = "SELECT DISTINCT sha256,category FROM categories WHERE uuid=? ORDER BY category"
    y1 = cursor.execute(query, (uuid_1,)).fetchall()
    y2 = cursor.execute(query, (uuid_2,)).fetchall()

    # Index the second annotator's rows by hash so that pairing is a lookup
    # rather than a nested scan. Hashes are matched on equality; the original
    # substring test is equivalent for fixed-length digests.
    y2_categories = {}
    for sha256, category in y2:
        y2_categories.setdefault(sha256, []).append(category)

    y1_dict = {}  # category -> hashes the first annotator put in it
    y2_dict = {}  # category -> hashes the second annotator put in it
    left = []     # first annotator's label for every paired item
    right = []    # second annotator's label for every paired item
    for sha256, cat1 in y1:
        for cat2 in y2_categories.get(sha256, ()):
            left.append(cat1)
            right.append(cat2)
            # Every pair is recorded for both annotators; no pair is dropped
            # or filed under an empty key.
            y1_dict.setdefault(cat1, []).append(sha256)
            y2_dict.setdefault(cat2, []).append(sha256)

    if not left:
        print("No items were annotated by both annotators; nothing to compare.")
        return None

    print("Generic CK: ", cohen_kappa_score(left, right), "\n\n")

    # NOTE(review): the per-category score below keeps the original method
    # (hash lists truncated to equal length, sorted, then compared position
    # by position). Confirm that this is the intended statistic.
    cks = None
    for category, ck1 in y1_dict.items():
        ck2 = y2_dict.get(category)
        if ck2 is None:
            continue
        # Cut down to equal size for kappa calculations
        size = min(len(ck1), len(ck2))
        ck1 = ck1[:size]
        ck2 = ck2[:size]
        cks = cohen_kappa_score(sorted(ck1), sorted(ck2))
        print("Cohen's kappa score for ", category, ":", category, ck1, ck2, "\nScore: ", cks)
    return cks
def db_insert(sha256_1, sha256_2, category, score, score_type):
    """Store a kappa score on every ``categories`` row matching the hashes.

    Uses the module-level ``cursor`` and ``db_connection``.

    Args:
        sha256_1: A single hash (``str``) or an iterable of hashes.
        sha256_2: A single hash (``str``) or an iterable of hashes.
        category: Accepted for interface compatibility; not used by the
            update statement.
        score: Value written to the ``cohen_kappa_score`` column.
        score_type: Only ``"cohen"`` triggers an update; any other value
            leaves the database untouched.
    """
    if score_type != "cohen":
        return

    hashes = []
    for group in (sha256_1, sha256_2):
        if isinstance(group, str):
            # A bare string is one hash, not a sequence of characters.
            hashes.append(group)
        elif group is not None:
            hashes.extend(group)

    # Plain float so the value binds cleanly even for NumPy scalar types.
    value = None if score is None else float(score)

    # Parameter binding keeps hash and score values out of the SQL text.
    cursor.executemany(
        "UPDATE categories SET cohen_kappa_score=? WHERE sha256=?",
        [(value, sha256) for sha256 in hashes],
    )
    db_connection.commit()
'''
COHEN'S KAPPA (inter-annotator agreement between exactly two annotators)
y1: Labels assigned by the first annotator.
y2: Labels assigned by the second annotator.
'''
def strip_categories(df1, df2):
    """Compute and store kappa scores for every category both frames share.

    Category names are read from the second level of each frame's column
    index, and one column per category is looked up under ``("uuid",
    category)``.

    NOTE(review): assumes each frame is indexed by sha256 and holds 0/1
    membership flags per category (as a sha256-by-category pivot of the
    annotations would) -- confirm against the caller.

    For each shared category the Fleiss kappa comes from
    ``calculate_kappas`` and the Cohen's kappa from ``cohen_kappa_score``.
    Only the Cohen score is written, through ``db_insert``, because that is
    the only score type ``db_insert`` handles.

    Args:
        df1: Frame holding the first annotator's labels.
        df2: Frame holding the second annotator's labels.
    """
    # Categories present in both frames, in df1's order, without duplicates.
    categories_1 = df1.columns.get_level_values(1).tolist()
    categories_2 = set(df2.columns.get_level_values(1))
    matches = [name for name in dict.fromkeys(categories_1) if name in categories_2]
    print(matches)

    # Resize the biggest frame once; the sizes do not change per category.
    df1_max_size = int(df1.index.size)
    df2_max_size = int(df2.index.size)
    common_size = min(df1_max_size, df2_max_size)
    df1 = df1.head(common_size)
    df2 = df2.head(common_size)
    print(df1_max_size, df2_max_size)

    # Fetch one category at a time and calculate the kappa for it
    for category in matches:
        y1 = df1.get(("uuid", category))
        y2 = df2.get(("uuid", category))
        print(y1, "\n\n", y2, "--------\n")
        if y1 is None or y2 is None:
            # The category is not stored under the "uuid" column group.
            continue

        # calculate_kappas returns the Fleiss kappa only.
        fk = calculate_kappas(y1, y2)
        ck = cohen_kappa_score(y1, y2)
        print("FLEISS IS TYPE: " , type(fk))

        # Do not insert NaN
        if np.isnan(fk) or np.isnan(ck):
            continue
        print("FLEISS IS OK: ", fk)

        flagged_1 = [(index, value) for index, value in y1.items() if value == 1]
        flagged_2 = [(index2, value2) for index2, value2 in y2.items() if value2 == 1]
        # db_insert takes both hash collections plus the score and its type.
        db_insert(
            [index for index, _ in flagged_1],
            [index2 for index2, _ in flagged_2],
            category,
            ck,
            "cohen",
        )
        for index, value in flagged_1:
            print("Index: ", index, " value ", value, " in category ", category, " Fleiss ", fk, " Cohen's: ", ck, " inserted to DB.")
        for index2, value2 in flagged_2:
            print("Index: ", index2, " value ", value2, " in category ", category, " Fleiss ", fk, " Cohen's: ", ck, " inserted to DB.")
'''
CALCULATE KAPPA
'''
def calculate_kappas(a1, a2):
    """Return the Fleiss kappa for two raters' labels over the same subjects.

    Args:
        a1: Labels given by the first rater, one entry per subject.
        a2: Labels given by the second rater, in the same subject order.

    Returns:
        The Fleiss kappa as a float, or NaN when the inputs are empty or
        their shapes differ (a message is printed for a shape mismatch).
    """
    a1 = np.asarray(a1)
    a2 = np.asarray(a2)

    # Verify shape: the raters must have labelled the same number of subjects.
    if a1.shape != a2.shape:
        print("Sorry. Incompatible shapes.")
        return float("nan")
    if a1.size == 0:
        return float("nan")

    # fleiss_kappa expects a subjects x categories table of counts, so build
    # a subjects x raters array and let aggregate_raters tabulate it.
    ratings = np.column_stack((a1.ravel(), a2.ravel()))
    table, _ = aggregate_raters(ratings)

    f_kappa = fleiss_kappa(table, method='fleiss')
    print("Fleiss: ", f_kappa)
    return f_kappa
'''
MAIN
'''
# Module-level handles shared by the functions above. They stay None until
# the connection is opened, so cleanup can tell whether there is anything
# to close.
db_connection = None
cursor = None

# Connect to database
try:
    db_connection = sqlite3.connect('../D3-Centraliser/annotations.db')
    cursor = db_connection.cursor()
    db_fetch()
    # strip_categories(table, table2) is currently disabled.
    db_connection.commit()
except sqlite3.Error as err:
    print("Sqlite error:", err)
finally:
    # Close DB connection exactly once, and only if connecting succeeded.
    if db_connection is not None:
        db_connection.close()