# EXTREMUM_web/base/handlers/ajaxTrainHandler.py

import json
import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from django.shortcuts import HttpResponse
from sklearn.preprocessing import LabelEncoder

from dict_and_html import *

import base.pipeline as pipeline
from .. import methods
from ..methods import PIPELINE_PATH
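

# AJAX handler used by the training views. Depending on the "action" value it
# either trains a new model ("train"), removes an already trained model and its
# artefacts ("delete_pre_trained"), or discards the model created in the current
# session ("discard_model"). The resulting context is returned as JSON.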
def handler(action, request):
    status = 200
    if action == "train":
        # train a new model
        # parameters sent via ajax
        model_name = request.POST.get("model_name")
        df_name = request.session.get("df_name")
        # dataframe name
        if df_name == "upload":
            df_name = request.session.get("df_name_upload_base_name")
        request.session["model_name"] = model_name

        test_set_ratio = ""
        if "test_set_ratio" in request.POST:
            test_set_ratio = request.POST.get("test_set_ratio")

        datasets_types_PipelineJSON_path = os.path.join(
            PIPELINE_PATH + "/dataset_types_pipeline.json"
        )
        jsonFile = pipeline.PipelineJSON(datasets_types_PipelineJSON_path)
        dataset_type = jsonFile.read_from_json([df_name])
        if type(dataset_type) is list:
            dataset_type = dataset_type[0]

        # default when no preprocessing steps were selected in the request
        array_preprocessing = ""
        if "array_preprocessing" in request.POST:
            array_preprocessing = request.POST.get("array_preprocessing")
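
        # Build the preprocessing metadata for the chosen model. This dict is
        # later written to the dataset's pipeline.json under "classifier" and
        # echoed back to the client.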
        if dataset_type == "tabular":
            class_label = request.POST.get("class_label")
            preprocessing_info = {
                "preprocessing": array_preprocessing,
                "test_set_ratio": test_set_ratio,
                "explainability": {"technique": "dice"},
                "class_label": class_label,
            }
        elif dataset_type == "timeseries":
            if model_name != "glacier":
                preprocessing_info = {
                    "preprocessing": array_preprocessing,
                    "test_set_ratio": test_set_ratio,
                    "explainability": {"technique": model_name},
                }
            else:
                autoencoder = request.POST.get("autoencoder")
                preprocessing_info = {
                    "autoencoder": autoencoder,
                    "explainability": {"technique": model_name},
                }
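
        # Resolve the per-dataset paths. The layout implied by the joins below is
        # roughly:
        #   <PIPELINE_PATH><df_name>/<df_name>.csv                  raw dataset
        #   <PIPELINE_PATH><df_name>/<df_name>_preprocessed.csv     preprocessed copy
        #   <PIPELINE_PATH><df_name>/pipeline.json                  pipeline metadata
        #   <PIPELINE_PATH><df_name>/trained_models/<model_name>/   model artefacts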
        # absolute excel_file_name_path
        excel_file_name_path = os.path.join(
            PIPELINE_PATH + f"{df_name}" + "/" + df_name + ".csv"
        )
        # absolute excel_file_name_preprocessed_path
        excel_file_name_preprocessed_path = os.path.join(
            PIPELINE_PATH,
            f"{df_name}" + "/" + df_name + "_preprocessed" + ".csv",
        )
        json_path = os.path.join(PIPELINE_PATH + f"{df_name}" + "/pipeline.json")
        jsonFile = pipeline.PipelineJSON(json_path)

        # folder for the trained model and its plots:
        # pipelines/<dataset name>/trained_models/<model_name>/
        model_name_path = os.path.join(
            PIPELINE_PATH + f"{df_name}" + "/trained_models/" + model_name
        )
        model_name_dir_path = os.path.join(PIPELINE_PATH + f"{df_name}")

        if os.path.exists(excel_file_name_preprocessed_path):
            # if a preprocessed file already exists, delete it and preprocess again;
            # this could be optimized for the case where the requested preprocessing
            # matches the one applied to the existing file
            os.remove(excel_file_name_preprocessed_path)

        # generate the preprocessed file name, e.g. "<df_name>_preprocessed.csv"
        excel_file_name_preprocessed = df_name + "_preprocessed.csv"
        request.session["excel_file_name_preprocessed"] = excel_file_name_preprocessed

        # load the raw dataset that will be preprocessed
        preprocess_df = pd.read_csv(excel_file_name_path)

        # make the model directory
        if not os.path.exists(model_name_path):
            os.makedirs(model_name_path)
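
        # Everything below runs inside a try/except: on any failure the half-built
        # model directory is removed and an error context with status 400 is returned.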
        try:
            if dataset_type == "tabular":
                le = LabelEncoder()
                preprocess_df[class_label] = le.fit_transform(
                    preprocess_df[class_label]
                )
                if "array_preprocessing" in request.POST:
                    preprocess_df = methods.preprocess(
                        preprocess_df,
                        array_preprocessing,
                        excel_file_name_path,
                        dataset_type,
                        model_name_path,
                        class_label,
                    )
            elif dataset_type == "timeseries":
                pos = jsonFile.read_from_json(["pos"])
                neg = jsonFile.read_from_json(["neg"])
                pos_label, neg_label = 1, 0
                if pos != pos_label:
                    preprocess_df.iloc[:, -1] = preprocess_df.iloc[:, -1].apply(
                        lambda x: pos_label if x == int(pos) else x
                    )
                if neg != neg_label:
                    preprocess_df.iloc[:, -1] = preprocess_df.iloc[:, -1].apply(
                        lambda x: neg_label if x == int(neg) else x
                    )
                if "array_preprocessing" in request.POST:
                    preprocess_df = methods.preprocess(
                        preprocess_df,
                        array_preprocessing,
                        excel_file_name_path,
                        dataset_type,
                        model_name_path,
                    )
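
            # Dimensionality-reduction plots for the response: PCA always, plus a
            # t-SNE embedding whose projections are also saved for later reuse.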
            pca = methods.generatePCA(preprocess_df)
            # TSNE
            if dataset_type == "tabular":
                tsne, projections = methods.generateTSNE(
                    preprocess_df, dataset_type, class_label
                )
            else:
                tsne, projections = methods.generateTSNE(preprocess_df, dataset_type)
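
            # Train the classifier. For tabular data methods.training is expected to
            # return the feature-importance plot, the classification report and a
            # per-feature importance dict; for time series it returns only the report.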
            if dataset_type == "tabular":
                # training
                feature_importance, classification_report, importance_dict = (
                    methods.training(
                        preprocess_df,
                        model_name,
                        float(test_set_ratio),
                        class_label,
                        dataset_type,
                        df_name,
                        model_name_path,
                    )
                )
                # feature importance on the original categorical columns (if they exist)
                df = pd.read_csv(excel_file_name_path)
                df = df.drop(class_label, axis=1)
                categorical_columns = methods.get_categorical_features(df)
                if categorical_columns != []:
                    # initialize a dictionary to hold the aggregated feature importances
                    aggregated_importance = {}
                    encoded_columns = methods.update_column_list_with_one_hot_columns(
                        df, preprocess_df, df.columns
                    )
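
                    # Map every one-hot encoded column back to the categorical column
                    # it came from (prefix match on "<original>_"); columns that were
                    # not one-hot encoded map to themselves.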
                    feature_mapping = defaultdict(list)
                    for col in encoded_columns:
                        for original_col in categorical_columns:
                            if col.startswith(original_col + "_"):
                                feature_mapping[original_col].append(col)
                                break
                        else:
                            # no categorical column matched: map the column to itself
                            feature_mapping[col].append(col)

                    # Aggregate the feature importances
                    for original_feature, encoded_cols in feature_mapping.items():
                        if encoded_cols:  # check that the list is not empty
                            if original_feature not in encoded_cols:
                                aggregated_importance[original_feature] = np.sum(
                                    [importance_dict.get(col, 0) for col in encoded_cols]
                                )
                            else:
                                aggregated_importance[original_feature] = (
                                    importance_dict.get(original_feature, 0)
                                )

                    importance_df = pd.DataFrame(
                        {
                            "feature": list(aggregated_importance.keys()),
                            "importance": list(aggregated_importance.values()),
                        }
                    )
                    importance_df.to_csv(
                        model_name_path + "/feature_importance_df.csv", index=None
                    )
                else:
                    # if there are no categorical columns, combine the feature names
                    # with their respective importance values directly
                    feature_importance_df = pd.DataFrame(
                        {
                            "feature": importance_dict.keys(),
                            "importance": importance_dict.values(),
                        }
                    )
                    feature_importance_df.to_csv(
                        model_name_path + "/feature_importance_df.csv", index=None
                    )

                # save the classification report, feature importance and label encoder
                pickle.dump(
                    classification_report,
                    open(model_name_path + "/classification_report.sav", "wb"),
                )
                pickle.dump(
                    feature_importance,
                    open(model_name_path + "/feature_importance.sav", "wb"),
                )
                pickle.dump(le, open(model_name_path + "/label_encoder.sav", "wb"))

                context = {
                    "dataset_type": dataset_type,
                    "pca": pca.to_html(),
                    "class_report": classification_report.to_html(),
                    "feature_importance": feature_importance.to_html(),
                }
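
            # Time-series branch: look up the dataset's entry in glacier_experiments.txt;
            # its stored arguments (if any) are passed to glacier during training.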
            elif dataset_type == "timeseries":
                path = model_name_path
                dataset_camel = methods.convert_to_camel_case(df_name)
                if "Ecg" in dataset_camel:
                    dataset_camel = dataset_camel.replace("Ecg", "ECG")
                experiment = methods.fetch_line_by_dataset(
                    PIPELINE_PATH + "/glacier_experiments.txt",
                    dataset_camel,
                )
                if experiment is not None:
                    stripped_arguments = methods.extract_arguments_from_line(experiment)

                if model_name == "glacier":
                    classification_report = methods.training(
                        preprocess_df,
                        model_name,
                        float(test_set_ratio) if test_set_ratio != "" else 0,
                        "",
                        dataset_type,
                        df_name,
                        path,
                        autoencoder,
                        stripped_arguments,
                    )
                else:
                    classification_report = methods.training(
                        preprocess_df,
                        model_name,
                        float(test_set_ratio) if test_set_ratio != "" else 0,
                        "",
                        dataset_type,
                        df_name,
                        path,
                    )

                pickle.dump(
                    classification_report,
                    open(path + "/classification_report.sav", "wb"),
                )
                context = {
                    "dataset_type": dataset_type,
                    "pca": pca.to_html(),
                    "tsne": tsne.to_html(),
                    "class_report": classification_report.to_html(),
                }

            # save the plots
            pickle.dump(tsne, open(model_name_path + "/tsne.sav", "wb"))
            pickle.dump(pca, open(model_name_path + "/pca.sav", "wb"))
            # save the projections file for future use
            with open(model_name_path + "/tsne_projections.json", "w") as f:
                json.dump(projections.tolist(), f, indent=2)
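
            # Record the trained model and its preprocessing settings in pipeline.json,
            # then return them to the client as an HTML-rendered dict.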
            if jsonFile.key_exists("classifier"):
                temp_json = {model_name: preprocessing_info}
                jsonFile.update_json(["classifier"], temp_json)
            else:
                temp_json = {
                    "preprocessed_name": df_name + "_preprocessed.csv",
                    "classifier": {model_name: preprocessing_info},
                }
                jsonFile.append_to_json(temp_json)

            classifier_data = jsonFile.read_from_json(["classifier", model_name])
            classifier_data_html = dict_and_html(classifier_data)
            context.update({"classifier_data": classifier_data_html})

            preprocess_df.to_csv(excel_file_name_preprocessed_path, index=False)
            status = 200
        except FileNotFoundError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "File error. Please check if all required files are available."
            )
            status = 400
        except PermissionError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "Permission error. Ensure appropriate file permissions."
            )
            status = 400
        except KeyError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e,
                f"Key error. Missing expected key {str(e)}. Verify dataset and configuration settings.",
            )
            status = 400
        except ValueError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "Data error. Please verify the data format and preprocessing steps."
            )
            status = 400
        except TypeError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "Type error. Check for data type compatibility in operations."
            )
            status = 400
        except Exception as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "An unexpected error occurred. Please review the code and data."
            )
            status = 400
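
    # Remove a previously trained model: delete its preprocessed CSV, its pipeline.json
    # entry and its model directory, then report which trained models remain.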
    elif action == "delete_pre_trained":
        df_name = request.session["df_name"]
        model_name = request.POST.get("model_name")
        model_name_path = os.path.join(
            PIPELINE_PATH + f"{df_name}" + "/trained_models/" + model_name
        )
        print(model_name_path)

        # absolute excel_file_name_preprocessed_path
        excel_file_name_preprocessed_path = os.path.join(
            PIPELINE_PATH,
            f"{df_name}" + "/" + df_name + "_preprocessed" + ".csv",
        )
        try:
            # delete the preprocessed file if it exists
            if os.path.exists(excel_file_name_preprocessed_path):
                os.remove(excel_file_name_preprocessed_path)
            else:
                print(f"File '{excel_file_name_preprocessed_path}' does not exist.")
        except Exception as e:
            print(f"An error occurred while deleting the file: {e}")

        json_path = os.path.join(PIPELINE_PATH + f"{df_name}" + "/pipeline.json")
        jsonFile = pipeline.PipelineJSON(json_path)
        jsonFile.delete_key(["classifier", model_name])
        methods.remove_dir_and_empty_parent(model_name_path)

        if not jsonFile.key_exists("classifier"):
            # no pre-trained models are left; check whether the dataset directory
            # still exists
            df_dir = os.path.join(PIPELINE_PATH + f"{df_name}")
            if not os.path.exists(df_dir):
                df_name = None
            context = {
                "df_name": df_name,
                "available_pretrained_models_info": [],
            }
        else:
            # other trained models remain: list them from the "classifier" section
            available_pretrained_models = jsonFile.read_from_json(
                ["classifier"]
            ).keys()
            available_pretrained_models_info = (
                methods.create_tuple_of_models_text_value(
                    available_pretrained_models
                )
            )
            context = {
                "df_name": df_name,
                "available_pretrained_models_info": available_pretrained_models_info,
            }
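
    # Discard the model trained in the current session: remove its folder, its
    # preprocessed CSV and its classifier entry from pipeline.json.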
    elif action == "discard_model":
        name = request.session["df_name"]
        model_name = request.session["model_name"]
        model_name_path = os.path.join(
            PIPELINE_PATH + f"{name}" + "/trained_models/" + model_name
        )
        methods.remove_dir_and_empty_parent(model_name_path)

        # absolute excel_file_name_preprocessed_path
        excel_file_name_preprocessed_path = os.path.join(
            PIPELINE_PATH,
            f"{name}" + "/" + name + "_preprocessed" + ".csv",
        )
        try:
            # delete the preprocessed file if it exists
            if os.path.exists(excel_file_name_preprocessed_path):
                os.remove(excel_file_name_preprocessed_path)
            else:
                print(f"File '{excel_file_name_preprocessed_path}' does not exist.")
        except Exception as e:
            print(f"An error occurred while deleting the file: {e}")

        json_path = os.path.join(PIPELINE_PATH + f"{name}" + "/pipeline.json")
        jsonFile = pipeline.PipelineJSON(json_path)
        jsonFile.delete_key(["classifier", model_name])
        context = {}

    return HttpResponse(json.dumps(context), status=status)