# EXTREMUM_web/base/handlers/ajaxTrainHandler.py

import json
import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from django.shortcuts import HttpResponse
from sklearn.preprocessing import LabelEncoder

from dict_and_html import *

import base.pipeline as pipeline
from .. import methods
from ..methods import PIPELINE_PATH
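

# AJAX handler used by the training views. Depending on the "action" value it
# either trains a new model ("train"), removes an already trained model and its
# artefacts ("delete_pre_trained"), or discards the model created in the current
# session ("discard_model"). The resulting context is returned as JSON.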
def handler(action, request):
    status = 200
    if action == "train":
        # train a new model
        # parameters sent via ajax
        model_name = request.POST.get("model_name")
        df_name = request.session.get("df_name")
        # dataframe name
        if df_name == "upload":
            df_name = request.session.get("df_name_upload_base_name")
        request.session["model_name"] = model_name

        test_set_ratio = ""
        if "test_set_ratio" in request.POST:
            test_set_ratio = request.POST.get("test_set_ratio")

        datasets_types_PipelineJSON_path = os.path.join(
            PIPELINE_PATH + "/dataset_types_pipeline.json"
        )
        jsonFile = pipeline.PipelineJSON(datasets_types_PipelineJSON_path)
        dataset_type = jsonFile.read_from_json([df_name])
        if type(dataset_type) is list:
            dataset_type = dataset_type[0]

        # default when no preprocessing steps were selected in the request
        array_preprocessing = ""
        if "array_preprocessing" in request.POST:
            array_preprocessing = request.POST.get("array_preprocessing")
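
        # Build the preprocessing metadata for the chosen model. This dict is
        # later written to the dataset's pipeline.json under "classifier" and
        # echoed back to the client.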
        if dataset_type == "tabular":
            class_label = request.POST.get("class_label")
            preprocessing_info = {
                "preprocessing": array_preprocessing,
                "test_set_ratio": test_set_ratio,
                "explainability": {"technique": "dice"},
                "class_label": class_label,
            }
        elif dataset_type == "timeseries":
            if model_name != "glacier":
                preprocessing_info = {
                    "preprocessing": array_preprocessing,
                    "test_set_ratio": test_set_ratio,
                    "explainability": {"technique": model_name},
                }
            else:
                autoencoder = request.POST.get("autoencoder")
                preprocessing_info = {
                    "autoencoder": autoencoder,
                    "explainability": {"technique": model_name},
                }
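
        # Resolve the per-dataset paths. The layout implied by the joins below is
        # roughly:
        #   <PIPELINE_PATH><df_name>/<df_name>.csv                  raw dataset
        #   <PIPELINE_PATH><df_name>/<df_name>_preprocessed.csv     preprocessed copy
        #   <PIPELINE_PATH><df_name>/pipeline.json                  pipeline metadata
        #   <PIPELINE_PATH><df_name>/trained_models/<model_name>/   model artefacts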
        # absolute excel_file_name_path
        excel_file_name_path = os.path.join(
            PIPELINE_PATH + f"{df_name}" + "/" + df_name + ".csv"
        )
        # absolute excel_file_name_preprocessed_path
        excel_file_name_preprocessed_path = os.path.join(
            PIPELINE_PATH,
            f"{df_name}" + "/" + df_name + "_preprocessed" + ".csv",
        )
        json_path = os.path.join(PIPELINE_PATH + f"{df_name}" + "/pipeline.json")
        jsonFile = pipeline.PipelineJSON(json_path)

        # folder for the trained model and its plots:
        # pipelines/<dataset name>/trained_models/<model_name>/
        model_name_path = os.path.join(
            PIPELINE_PATH + f"{df_name}" + "/trained_models/" + model_name
        )
        model_name_dir_path = os.path.join(PIPELINE_PATH + f"{df_name}")

        if os.path.exists(excel_file_name_preprocessed_path):
            # if a preprocessed file already exists, delete it and preprocess again;
            # this could be optimized for the case where the requested preprocessing
            # matches the one applied to the existing file
            os.remove(excel_file_name_preprocessed_path)

        # generate the preprocessed file name, e.g. "<df_name>_preprocessed.csv"
        excel_file_name_preprocessed = df_name + "_preprocessed.csv"
        request.session["excel_file_name_preprocessed"] = excel_file_name_preprocessed

        # load the raw dataset that will be preprocessed
        preprocess_df = pd.read_csv(excel_file_name_path)

        # make the model directory
        if not os.path.exists(model_name_path):
            os.makedirs(model_name_path)
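
        # Everything below runs inside a try/except: on any failure the half-built
        # model directory is removed and an error context with status 400 is returned.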
        try:
            if dataset_type == "tabular":
                le = LabelEncoder()
                preprocess_df[class_label] = le.fit_transform(
                    preprocess_df[class_label]
                )
                if "array_preprocessing" in request.POST:
                    preprocess_df = methods.preprocess(
                        preprocess_df,
                        array_preprocessing,
                        excel_file_name_path,
                        dataset_type,
                        model_name_path,
                        class_label,
                    )
            elif dataset_type == "timeseries":
                pos = jsonFile.read_from_json(["pos"])
                neg = jsonFile.read_from_json(["neg"])
                pos_label, neg_label = 1, 0
                if pos != pos_label:
                    preprocess_df.iloc[:, -1] = preprocess_df.iloc[:, -1].apply(
                        lambda x: pos_label if x == int(pos) else x
                    )
                if neg != neg_label:
                    preprocess_df.iloc[:, -1] = preprocess_df.iloc[:, -1].apply(
                        lambda x: neg_label if x == int(neg) else x
                    )
                if "array_preprocessing" in request.POST:
                    preprocess_df = methods.preprocess(
                        preprocess_df,
                        array_preprocessing,
                        excel_file_name_path,
                        dataset_type,
                        model_name_path,
                    )
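
            # Dimensionality-reduction plots for the response: PCA always, plus a
            # t-SNE embedding whose projections are also saved for later reuse.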
            pca = methods.generatePCA(preprocess_df)
            # TSNE
            if dataset_type == "tabular":
                tsne, projections = methods.generateTSNE(
                    preprocess_df, dataset_type, class_label
                )
            else:
                tsne, projections = methods.generateTSNE(preprocess_df, dataset_type)
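
            # Train the classifier. For tabular data methods.training is expected to
            # return the feature-importance plot, the classification report and a
            # per-feature importance dict; for time series it returns only the report.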
            if dataset_type == "tabular":
                # training
                feature_importance, classification_report, importance_dict = (
                    methods.training(
                        preprocess_df,
                        model_name,
                        float(test_set_ratio),
                        class_label,
                        dataset_type,
                        df_name,
                        model_name_path,
                    )
                )
                # feature importance on the original categorical columns (if they exist)
                df = pd.read_csv(excel_file_name_path)
                df = df.drop(class_label, axis=1)
                categorical_columns = methods.get_categorical_features(df)
                if categorical_columns != []:
                    # initialize a dictionary to hold the aggregated feature importances
                    aggregated_importance = {}
                    encoded_columns = methods.update_column_list_with_one_hot_columns(
                        df, preprocess_df, df.columns
                    )
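
                    # Map every one-hot encoded column back to the categorical column
                    # it came from (prefix match on "<original>_"); columns that were
                    # not one-hot encoded map to themselves.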
                    feature_mapping = defaultdict(list)
                    for col in encoded_columns:
                        for original_col in categorical_columns:
                            if col.startswith(original_col + "_"):
                                feature_mapping[original_col].append(col)
                                break
                        else:
                            # no categorical column matched: map the column to itself
                            feature_mapping[col].append(col)

                    # Aggregate the feature importances
                    for original_feature, encoded_cols in feature_mapping.items():
                        if encoded_cols:  # check that the list is not empty
                            if original_feature not in encoded_cols:
                                aggregated_importance[original_feature] = np.sum(
                                    [importance_dict.get(col, 0) for col in encoded_cols]
                                )
                            else:
                                aggregated_importance[original_feature] = (
                                    importance_dict.get(original_feature, 0)
                                )

                    importance_df = pd.DataFrame(
                        {
                            "feature": list(aggregated_importance.keys()),
                            "importance": list(aggregated_importance.values()),
                        }
                    )
                    importance_df.to_csv(
                        model_name_path + "/feature_importance_df.csv", index=None
                    )
                else:
                    # if there are no categorical columns, combine the feature names
                    # with their respective importance values directly
                    feature_importance_df = pd.DataFrame(
                        {
                            "feature": importance_dict.keys(),
                            "importance": importance_dict.values(),
                        }
                    )
                    feature_importance_df.to_csv(
                        model_name_path + "/feature_importance_df.csv", index=None
                    )

                # save the classification report, feature importance and label encoder
                pickle.dump(
                    classification_report,
                    open(model_name_path + "/classification_report.sav", "wb"),
                )
                pickle.dump(
                    feature_importance,
                    open(model_name_path + "/feature_importance.sav", "wb"),
                )
                pickle.dump(le, open(model_name_path + "/label_encoder.sav", "wb"))

                context = {
                    "dataset_type": dataset_type,
                    "pca": pca.to_html(),
                    "class_report": classification_report.to_html(),
                    "feature_importance": feature_importance.to_html(),
                }
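
            # Time-series branch: look up the dataset's entry in glacier_experiments.txt;
            # its stored arguments (if any) are passed to glacier during training.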
            elif dataset_type == "timeseries":
                path = model_name_path
                dataset_camel = methods.convert_to_camel_case(df_name)
                if "Ecg" in dataset_camel:
                    dataset_camel = dataset_camel.replace("Ecg", "ECG")
                experiment = methods.fetch_line_by_dataset(
                    PIPELINE_PATH + "/glacier_experiments.txt",
                    dataset_camel,
                )
                if experiment is not None:
                    stripped_arguments = methods.extract_arguments_from_line(experiment)

                if model_name == "glacier":
                    classification_report = methods.training(
                        preprocess_df,
                        model_name,
                        float(test_set_ratio) if test_set_ratio != "" else 0,
                        "",
                        dataset_type,
                        df_name,
                        path,
                        autoencoder,
                        stripped_arguments,
                    )
                else:
                    classification_report = methods.training(
                        preprocess_df,
                        model_name,
                        float(test_set_ratio) if test_set_ratio != "" else 0,
                        "",
                        dataset_type,
                        df_name,
                        path,
                    )

                pickle.dump(
                    classification_report,
                    open(path + "/classification_report.sav", "wb"),
                )
                context = {
                    "dataset_type": dataset_type,
                    "pca": pca.to_html(),
                    "tsne": tsne.to_html(),
                    "class_report": classification_report.to_html(),
                }

            # save the plots
            pickle.dump(tsne, open(model_name_path + "/tsne.sav", "wb"))
            pickle.dump(pca, open(model_name_path + "/pca.sav", "wb"))
            # save the projections file for future use
            with open(model_name_path + "/tsne_projections.json", "w") as f:
                json.dump(projections.tolist(), f, indent=2)
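
            # Record the trained model and its preprocessing settings in pipeline.json,
            # then return them to the client as an HTML-rendered dict.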
            if jsonFile.key_exists("classifier"):
                temp_json = {model_name: preprocessing_info}
                jsonFile.update_json(["classifier"], temp_json)
            else:
                temp_json = {
                    "preprocessed_name": df_name + "_preprocessed.csv",
                    "classifier": {model_name: preprocessing_info},
                }
                jsonFile.append_to_json(temp_json)

            classifier_data = jsonFile.read_from_json(["classifier", model_name])
            classifier_data_html = dict_and_html(classifier_data)
            context.update({"classifier_data": classifier_data_html})

            preprocess_df.to_csv(excel_file_name_preprocessed_path, index=False)
            status = 200
        except FileNotFoundError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "File error. Please check if all required files are available."
            )
            status = 400
        except PermissionError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "Permission error. Ensure appropriate file permissions."
            )
            status = 400
        except KeyError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e,
                f"Key error. Missing expected key {str(e)}. Verify dataset and configuration settings.",
            )
            status = 400
        except ValueError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "Data error. Please verify the data format and preprocessing steps."
            )
            status = 400
        except TypeError as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "Type error. Check for data type compatibility in operations."
            )
            status = 400
        except Exception as e:
            methods.remove_dir_and_empty_parent(model_name_path)
            context = methods.format_error_context(
                e, "An unexpected error occurred. Please review the code and data."
            )
            status = 400
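
    # Remove a previously trained model: delete its preprocessed CSV, its pipeline.json
    # entry and its model directory, then report which trained models remain.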
    elif action == "delete_pre_trained":
        df_name = request.session["df_name"]
        model_name = request.POST.get("model_name")
        model_name_path = os.path.join(
            PIPELINE_PATH + f"{df_name}" + "/trained_models/" + model_name
        )
        print(model_name_path)

        # absolute excel_file_name_preprocessed_path
        excel_file_name_preprocessed_path = os.path.join(
            PIPELINE_PATH,
            f"{df_name}" + "/" + df_name + "_preprocessed" + ".csv",
        )
        try:
            # delete the preprocessed file if it exists
            if os.path.exists(excel_file_name_preprocessed_path):
                os.remove(excel_file_name_preprocessed_path)
            else:
                print(f"File '{excel_file_name_preprocessed_path}' does not exist.")
        except Exception as e:
            print(f"An error occurred while deleting the file: {e}")

        json_path = os.path.join(PIPELINE_PATH + f"{df_name}" + "/pipeline.json")
        jsonFile = pipeline.PipelineJSON(json_path)
        jsonFile.delete_key(["classifier", model_name])
        methods.remove_dir_and_empty_parent(model_name_path)

        if not jsonFile.key_exists("classifier"):
            # no pre-trained models are left; check whether the dataset directory
            # still exists
            df_dir = os.path.join(PIPELINE_PATH + f"{df_name}")
            if not os.path.exists(df_dir):
                df_name = None
            context = {
                "df_name": df_name,
                "available_pretrained_models_info": [],
            }
        else:
            # other trained models remain: list them from the "classifier" section
            available_pretrained_models = jsonFile.read_from_json(
                ["classifier"]
            ).keys()
            available_pretrained_models_info = (
                methods.create_tuple_of_models_text_value(
                    available_pretrained_models
                )
            )
            context = {
                "df_name": df_name,
                "available_pretrained_models_info": available_pretrained_models_info,
            }
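
    # Discard the model trained in the current session: remove its folder, its
    # preprocessed CSV and its classifier entry from pipeline.json.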
    elif action == "discard_model":
        name = request.session["df_name"]
        model_name = request.session["model_name"]
        model_name_path = os.path.join(
            PIPELINE_PATH + f"{name}" + "/trained_models/" + model_name
        )
        methods.remove_dir_and_empty_parent(model_name_path)

        # absolute excel_file_name_preprocessed_path
        excel_file_name_preprocessed_path = os.path.join(
            PIPELINE_PATH,
            f"{name}" + "/" + name + "_preprocessed" + ".csv",
        )
        try:
            # delete the preprocessed file if it exists
            if os.path.exists(excel_file_name_preprocessed_path):
                os.remove(excel_file_name_preprocessed_path)
            else:
                print(f"File '{excel_file_name_preprocessed_path}' does not exist.")
        except Exception as e:
            print(f"An error occurred while deleting the file: {e}")

        json_path = os.path.join(PIPELINE_PATH + f"{name}" + "/pipeline.json")
        jsonFile = pipeline.PipelineJSON(json_path)
        jsonFile.delete_key(["classifier", model_name])
        context = {}

    return HttpResponse(json.dumps(context), status=status)