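"""AJAX handler for training and managing models in the pipeline app.

Dispatches the "train", "delete_pre_trained" and "discard_model" actions,
reading the dataset and model parameters from the Django request and
persisting artefacts under PIPELINE_PATH.
"""
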
import json
import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from django.http import HttpResponse
from sklearn.preprocessing import LabelEncoder

import base.pipeline as pipeline
from dict_and_html import *

from .. import methods
from ..methods import PIPELINE_PATH
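
# Expected on-disk layout, as implied by the path handling below (this comment
# is descriptive only; the layout itself is defined elsewhere in the project):
#
#   PIPELINE_PATH/
#       dataset_types_pipeline.json
#       glacier_experiments.txt
#       <df_name>/
#           pipeline.json
#           <df_name>.csv
#           <df_name>_preprocessed.csv
#           trained_models/<model_name>/    (plots, reports, *.sav artefacts)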


def handler(action, request):
    """Handle the model training and management AJAX actions.

    Supported actions:
      * "train": preprocess the selected dataset, train a model and persist
        its artefacts (plots, reports, feature importance, pipeline.json).
      * "delete_pre_trained": delete a previously trained model.
      * "discard_model": discard the model trained in the current session.
    """
    status = 200
    context = {}  # default response body for unknown actions

    if action == "train":
        # train a new model
        # parameters sent via ajax
        model_name = request.POST.get("model_name")
        df_name = request.session.get("df_name")

        # dataframe name
        if df_name == "upload":
            df_name = request.session.get("df_name_upload_base_name")

        request.session["model_name"] = model_name
        test_set_ratio = ""
        if "test_set_ratio" in request.POST:
            test_set_ratio = request.POST.get("test_set_ratio")

        datasets_types_PipelineJSON_path = os.path.join(
            PIPELINE_PATH, "dataset_types_pipeline.json"
        )
        jsonFile = pipeline.PipelineJSON(datasets_types_PipelineJSON_path)
        dataset_type = jsonFile.read_from_json([df_name])

        if isinstance(dataset_type, list):
            dataset_type = dataset_type[0]

        # preprocessing steps selected in the UI; default to None so the
        # branches below never hit an undefined name when the field is absent
        array_preprocessing = None
        if "array_preprocessing" in request.POST:
            array_preprocessing = request.POST.get("array_preprocessing")
if dataset_type == "tabular":
|
|
class_label = request.POST.get("class_label")
|
|
preprocessing_info = {
|
|
"preprocessing": array_preprocessing,
|
|
"test_set_ratio": test_set_ratio,
|
|
"explainability": {"technique": "dice"},
|
|
"class_label": class_label,
|
|
}
|
|
elif dataset_type == "timeseries":
|
|
if model_name != "glacier":
|
|
preprocessing_info = {
|
|
"preprocessing": array_preprocessing,
|
|
"test_set_ratio": test_set_ratio,
|
|
"explainability": {"technique": model_name},
|
|
}
|
|
else:
|
|
# Path to the Bash script
|
|
autoencoder = request.POST.get("autoencoder")
|
|
preprocessing_info = {
|
|
"autoencoder": autoencoder,
|
|
"explainability": {"technique": model_name},
|
|
}
|
|
|
|

        # absolute path of the raw dataset csv
        excel_file_name_path = os.path.join(PIPELINE_PATH, df_name, df_name + ".csv")

        # absolute path of the preprocessed csv
        excel_file_name_preprocessed_path = os.path.join(
            PIPELINE_PATH, df_name, df_name + "_preprocessed.csv"
        )

        json_path = os.path.join(PIPELINE_PATH, df_name, "pipeline.json")
        jsonFile = pipeline.PipelineJSON(json_path)

        # folder for the trained model's artefacts:
        # pipelines/<dataset name>/trained_models/<model_name>/
        model_name_path = os.path.join(
            PIPELINE_PATH, df_name, "trained_models", model_name
        )

        model_name_dir_path = os.path.join(PIPELINE_PATH, df_name)

        if os.path.exists(excel_file_name_preprocessed_path):
            # if a preprocessed file already exists, delete it and preprocess
            # again; this could be optimised for the case where the requested
            # preprocessing is the same as the one applied to the existing file
            os.remove(excel_file_name_preprocessed_path)

        # generate the preprocessed file name, e.g. <df_name>_preprocessed.csv
        excel_file_name_preprocessed = df_name + "_preprocessed.csv"

        # load the raw csv for preprocessing
        preprocess_df = pd.read_csv(excel_file_name_path)
        request.session["excel_file_name_preprocessed"] = excel_file_name_preprocessed

        # make the model's artefact directory
        if not os.path.exists(model_name_path):
            os.makedirs(model_name_path)

        try:
            if dataset_type == "tabular":
                le = LabelEncoder()
                preprocess_df[class_label] = le.fit_transform(
                    preprocess_df[class_label]
                )

                if "array_preprocessing" in request.POST:
                    preprocess_df = methods.preprocess(
                        preprocess_df,
                        array_preprocessing,
                        excel_file_name_path,
                        dataset_type,
                        model_name_path,
                        class_label,
                    )
            elif dataset_type == "timeseries":
                # map the positive/negative class labels stored in pipeline.json
                # onto 1/0
                pos = jsonFile.read_from_json(["pos"])
                neg = jsonFile.read_from_json(["neg"])
                pos_label, neg_label = 1, 0

                if pos != pos_label:
                    preprocess_df.iloc[:, -1] = preprocess_df.iloc[:, -1].apply(
                        lambda x: pos_label if x == int(pos) else x
                    )
                if neg != neg_label:
                    preprocess_df.iloc[:, -1] = preprocess_df.iloc[:, -1].apply(
                        lambda x: neg_label if x == int(neg) else x
                    )
                if "array_preprocessing" in request.POST:
                    preprocess_df = methods.preprocess(
                        preprocess_df,
                        array_preprocessing,
                        excel_file_name_path,
                        dataset_type,
                        model_name_path,
                    )

            pca = methods.generatePCA(preprocess_df)

            # TSNE
            if dataset_type == "tabular":
                tsne, projections = methods.generateTSNE(
                    preprocess_df, dataset_type, class_label
                )
            else:
                tsne, projections = methods.generateTSNE(preprocess_df, dataset_type)
if dataset_type == "tabular":
|
|
# training
|
|
feature_importance, classification_report, importance_dict = (
|
|
methods.training(
|
|
preprocess_df,
|
|
model_name,
|
|
float(test_set_ratio),
|
|
class_label,
|
|
dataset_type,
|
|
df_name,
|
|
model_name_path,
|
|
)
|
|
)
|
|
|
|

                # feature importance on the original categorical columns (if they exist)
                df = pd.read_csv(excel_file_name_path)
                df = df.drop(class_label, axis=1)

                # aggregate one-hot encoded importances back onto the original columns
                categorical_columns = methods.get_categorical_features(df)

                if categorical_columns != []:
                    aggregated_importance = {}
                    encoded_columns = methods.update_column_list_with_one_hot_columns(
                        df, preprocess_df, df.columns
                    )

                    # map every (possibly one-hot encoded) column back to its
                    # original feature
                    feature_mapping = defaultdict(list)
                    for col in encoded_columns:
                        for original_col in categorical_columns:
                            if col.startswith(original_col + "_"):
                                feature_mapping[original_col].append(col)
                                break
                        else:
                            # no match: map the column to itself
                            feature_mapping[col].append(col)

                    # aggregate the feature importances
                    for original_feature, mapped_columns in feature_mapping.items():
                        if mapped_columns:  # skip empty mappings
                            if original_feature not in mapped_columns:
                                aggregated_importance[original_feature] = np.sum(
                                    [
                                        importance_dict.get(col, 0)
                                        for col in mapped_columns
                                    ]
                                )
                            else:
                                aggregated_importance[original_feature] = (
                                    importance_dict.get(original_feature, 0)
                                )

                    importance_df = pd.DataFrame(
                        {
                            "feature": list(aggregated_importance.keys()),
                            "importance": list(aggregated_importance.values()),
                        }
                    )

                    importance_df.to_csv(
                        model_name_path + "/feature_importance_df.csv", index=False
                    )
                else:
                    # no categorical columns: combine feature names with their
                    # respective importance values directly
                    feature_importance_df = pd.DataFrame(
                        {
                            "feature": list(importance_dict.keys()),
                            "importance": list(importance_dict.values()),
                        }
                    )

                    feature_importance_df.to_csv(
                        model_name_path + "/feature_importance_df.csv", index=False
                    )

                # persist the tabular artefacts (report, importances, label encoder)
                pickle.dump(
                    classification_report,
                    open(model_name_path + "/classification_report.sav", "wb"),
                )
                pickle.dump(
                    feature_importance,
                    open(model_name_path + "/feature_importance.sav", "wb"),
                )
                pickle.dump(le, open(model_name_path + "/label_encoder.sav", "wb"))

                context = {
                    "dataset_type": dataset_type,
                    "pca": pca.to_html(),
                    "class_report": classification_report.to_html(),
                    "feature_importance": feature_importance.to_html(),
                }
elif dataset_type == "timeseries":
|
|
|
|
path = model_name_path
|
|
dataset_camel = methods.convert_to_camel_case(df_name)
|
|
if "Ecg" in dataset_camel:
|
|
dataset_camel = dataset_camel.replace("Ecg", "ECG")
|
|
|
|
experiment = methods.fetch_line_by_dataset(
|
|
PIPELINE_PATH + "/glacier_experiments.txt",
|
|
dataset_camel,
|
|
)
|
|
|
|
if experiment is not None:
|
|
stripped_arguments = methods.extract_arguments_from_line(experiment)
|
|
|
|
if model_name == "glacier":
|
|
classification_report = methods.training(
|
|
preprocess_df,
|
|
model_name,
|
|
float(test_set_ratio) if test_set_ratio != "" else 0,
|
|
"",
|
|
dataset_type,
|
|
df_name,
|
|
path,
|
|
autoencoder,
|
|
stripped_arguments,
|
|
)
|
|
else:
|
|
classification_report = methods.training(
|
|
preprocess_df,
|
|
model_name,
|
|
float(test_set_ratio) if test_set_ratio != "" else 0,
|
|
"",
|
|
dataset_type,
|
|
df_name,
|
|
path,
|
|
)
|
|
|
|
pickle.dump(
|
|
classification_report,
|
|
open(path + "/classification_report.sav", "wb"),
|
|
)
|
|
|
|
context = {
|
|
"dataset_type": dataset_type,
|
|
"pca": pca.to_html(),
|
|
"tsne": tsne.to_html(),
|
|
"class_report": classification_report.to_html(),
|
|
}
|
|
|
|

            # save the plots
            pickle.dump(tsne, open(model_name_path + "/tsne.sav", "wb"))
            pickle.dump(pca, open(model_name_path + "/pca.sav", "wb"))

            # save the TSNE projections for future use
            with open(model_name_path + "/tsne_projections.json", "w") as f:
                json.dump(projections.tolist(), f, indent=2)

            # record the classifier and its preprocessing settings in pipeline.json
            if jsonFile.key_exists("classifier"):
                temp_json = {model_name: preprocessing_info}
                jsonFile.update_json(["classifier"], temp_json)
            else:
                temp_json = {
                    "preprocessed_name": df_name + "_preprocessed.csv",
                    "classifier": {model_name: preprocessing_info},
                }
                jsonFile.append_to_json(temp_json)
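
            # Illustrative shape of the data recorded above (the exact structure
            # is defined by pipeline.PipelineJSON, not by this handler):
            #
            #   {
            #     "preprocessed_name": "<df_name>_preprocessed.csv",
            #     "classifier": {
            #       "<model_name>": {
            #         "preprocessing": ...,
            #         "test_set_ratio": ...,
            #         "explainability": {"technique": ...}
            #       }
            #     }
            #   }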
classifier_data = jsonFile.read_from_json(["classifier", model_name])
|
|
classifier_data_html = dict_and_html(classifier_data)
|
|
context.update({"classifier_data": classifier_data_html})
|
|
preprocess_df.to_csv(excel_file_name_preprocessed_path, index=False)
|
|
status = 200
|
|
|
|
        except FileNotFoundError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "File error. Please check if all required files are available."
            )
            status = 400

        except PermissionError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "Permission error. Ensure appropriate file permissions."
            )
            status = 400

        except KeyError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e,
                f"Key error. Missing expected key {str(e)}. "
                "Verify dataset and configuration settings.",
            )
            status = 400

        except ValueError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "Data error. Please verify the data format and preprocessing steps."
            )
            status = 400

        except TypeError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "Type error. Check for data type compatibility in operations."
            )
            status = 400

        except Exception as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "An unexpected error occurred. Please review the code and data."
            )
            status = 400
elif action == "delete_pre_trained":
|
|
|
|
df_name = request.session["df_name"]
|
|
model_name = request.POST.get("model_name")
|
|
model_name_path = os.path.join(
|
|
PIPELINE_PATH + f"{df_name}" + "/trained_models/" + model_name
|
|
)
|
|
|
|
print(model_name_path)
|
|
|
|
excel_file_name_preprocessed_path = os.path.join(
|
|
PIPELINE_PATH,
|
|
f"{df_name}" + "/" + df_name + "_preprocessed" + ".csv",
|
|
)
|
|
try:
|
|
# Check if the file exists
|
|
if os.path.exists(excel_file_name_preprocessed_path):
|
|
# Delete the file
|
|
os.remove(excel_file_name_preprocessed_path)
|
|
# print(f"File '{excel_file_name_preprocessed_path}' has been deleted successfully.")
|
|
else:
|
|
print(f"File '{excel_file_name_preprocessed_path}' does not exist.")
|
|
except Exception as e:
|
|
print(f"An error occurred while deleting the file: {e}")
|
|
|
|
json_path = os.path.join(PIPELINE_PATH + f"{df_name}" + "/pipeline.json")
|
|
jsonFile = pipeline.PipelineJSON(json_path)
|
|
jsonFile.delete_key(["classifier", model_name])
|
|
|
|
methods.remove_dir_and_empty_parent(model_name_path)
|
|
# load paths
|
|
# absolute excel_file_preprocessed_path
|
|
|
|
if not jsonFile.key_exists("classifier"):
|
|
# pre trained models do not exist
|
|
# check if dataset directory exists
|
|
df_dir = os.path.join(PIPELINE_PATH + f"{df_name}")
|
|
if not os.path.exists(df_dir):
|
|
df_name = None
|
|
|
|
context = {
|
|
"df_name": df_name,
|
|
"available_pretrained_models_info": [],
|
|
}
|
|
else:
|
|
# if it exists
|
|
# check the section of "classifiers"
|
|
# folder path
|
|
available_pretrained_models = jsonFile.read_from_json(
|
|
["classifier"]
|
|
).keys()
|
|
|
|
available_pretrained_models_info = (
|
|
methods.create_tuple_of_models_text_value(
|
|
available_pretrained_models
|
|
)
|
|
)
|
|
context = {
|
|
"df_name": df_name,
|
|
"available_pretrained_models_info": available_pretrained_models_info,
|
|
}
|
|
elif action == "discard_model":
|
|
name = request.session["df_name"]
|
|
model_name = request.session["model_name"]
|
|
model_name_path = os.path.join(
|
|
PIPELINE_PATH + f"{name}" + "/trained_models/" + model_name
|
|
)
|
|
# should delete model folder
|
|
# should delete classifier from json
|
|
# should delete preprocessed path too
|
|
methods.remove_dir_and_empty_parent(model_name_path)
|
|
# load paths
|
|
# absolute excel_file_preprocessed_path
|
|
excel_file_name_preprocessed_path = os.path.join(
|
|
PIPELINE_PATH,
|
|
f"{name}" + "/" + name + "_preprocessed" + ".csv",
|
|
)
|
|
try:
|
|
# Check if the file exists
|
|
if os.path.exists(excel_file_name_preprocessed_path):
|
|
# Delete the file
|
|
os.remove(excel_file_name_preprocessed_path)
|
|
# print(f"File '{excel_file_name_preprocessed_path}' has been deleted successfully.")
|
|
else:
|
|
print(f"File '{excel_file_name_preprocessed_path}' does not exist.")
|
|
except Exception as e:
|
|
print(f"An error occurred while deleting the file: {e}")
|
|
|
|
json_path = os.path.join(PIPELINE_PATH + f"{name}" + "/pipeline.json")
|
|
jsonFile = pipeline.PipelineJSON(json_path)
|
|
jsonFile.delete_key(["classifier",model_name])
|
|
|
|
context = {}
|
|
|
|
return HttpResponse(json.dumps(context), status=status) |
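

# Example wiring (a sketch only; the view and URL names below are assumptions,
# not part of this module):
#
#   # views.py
#   from django.views.decorators.http import require_POST
#
#   @require_POST
#   def train_model_view(request):
#       return handler("train", request)
#
#   # urls.py
#   # urlpatterns = [path("train-model/", train_model_view), ...]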