import json
import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from dict_and_html import *
from django.http import HttpResponse
from sklearn.preprocessing import LabelEncoder

import base.pipeline as pipeline

from .. import methods
from ..methods import PIPELINE_PATH


def handler(action, request):
    status = 200
    if action == "train":
        # train a new model
        # parameters sent via ajax
        model_name = request.POST.get("model_name")
        df_name = request.session.get("df_name")  # dataframe name
        if df_name == "upload":
            df_name = request.session.get("df_name_upload_base_name")
        request.session["model_name"] = model_name

        test_set_ratio = ""
        if "test_set_ratio" in request.POST:
            test_set_ratio = request.POST.get("test_set_ratio")

        datasets_types_PipelineJSON_path = os.path.join(
            PIPELINE_PATH + "/dataset_types_pipeline.json"
        )
        jsonFile = pipeline.PipelineJSON(datasets_types_PipelineJSON_path)
        dataset_type = jsonFile.read_from_json([df_name])
        if isinstance(dataset_type, list):
            dataset_type = dataset_type[0]

        if "array_preprocessing" in request.POST:
            array_preprocessing = request.POST.get("array_preprocessing")

        if dataset_type == "tabular":
            class_label = request.POST.get("class_label")
            preprocessing_info = {
                "preprocessing": array_preprocessing,
                "test_set_ratio": test_set_ratio,
                "explainability": {"technique": "dice"},
                "class_label": class_label,
            }
        elif dataset_type == "timeseries":
            if model_name != "glacier":
                preprocessing_info = {
                    "preprocessing": array_preprocessing,
                    "test_set_ratio": test_set_ratio,
                    "explainability": {"technique": model_name},
                }
            else:
                # Path to the Bash script
                autoencoder = request.POST.get("autoencoder")
                preprocessing_info = {
                    "autoencoder": autoencoder,
                    "explainability": {"technique": model_name},
                }

        # absolute excel_file_name_path
        excel_file_name_path = os.path.join(
            PIPELINE_PATH + f"{df_name}" + "/" + df_name + ".csv"
        )

        # load paths
        # absolute excel_file_preprocessed_path
        excel_file_name_preprocessed_path = os.path.join(
            PIPELINE_PATH,
            f"{df_name}" + "/" + df_name + "_preprocessed" + ".csv",
        )

        json_path = os.path.join(PIPELINE_PATH + f"{df_name}" + "/pipeline.json")
        jsonFile = pipeline.PipelineJSON(json_path)

        # save the plots for future use
        # folder path: pipelines/<df_name>/trained_models/<model_name>
        model_name_path = os.path.join(
            PIPELINE_PATH + f"{df_name}" + "/trained_models/" + model_name
        )
        model_name_dir_path = os.path.join(PIPELINE_PATH + f"{df_name}")

        if os.path.exists(excel_file_name_preprocessed_path):
            # If a preprocessed file already exists, delete it and run the
            # preprocessing again. This could be optimized for the case where
            # the requested preprocessing is the same as the one applied to
            # the existing file.
            os.remove(excel_file_name_preprocessed_path)

        # generate the preprocessed filename (insert "_preprocessed" before the extension)
        base_name, extension = os.path.splitext(os.path.basename(excel_file_name_path))
        excel_file_name_preprocessed = base_name + "_preprocessed" + extension

        # load the file for preprocessing
        preprocess_df = pd.read_csv(excel_file_name_path)
        request.session["excel_file_name_preprocessed"] = excel_file_name_preprocessed

        # make the model directory
        if not os.path.exists(model_name_path):
            os.makedirs(model_name_path)

        try:
            if dataset_type == "tabular":
                le = LabelEncoder()
                preprocess_df[class_label] = le.fit_transform(
                    preprocess_df[class_label]
                )
                if "array_preprocessing" in request.POST:
                    preprocess_df = methods.preprocess(
                        preprocess_df,
                        array_preprocessing,
                        excel_file_name_path,
                        dataset_type,
                        model_name_path,
                        class_label,
                    )
            elif dataset_type == "timeseries":
                pos = jsonFile.read_from_json(["pos"])
                neg = jsonFile.read_from_json(["neg"])
                pos_label, neg_label = 1, 0
                if pos != pos_label:
                    preprocess_df.iloc[:, -1] = preprocess_df.iloc[:, -1].apply(
                        lambda x: pos_label if x == int(pos) else x
                    )
                if neg != neg_label:
                    preprocess_df.iloc[:, -1] = preprocess_df.iloc[:, -1].apply(
                        lambda x: neg_label if x == int(neg) else x
                    )
                if "array_preprocessing" in request.POST:
                    preprocess_df = methods.preprocess(
                        preprocess_df,
                        array_preprocessing,
                        excel_file_name_path,
                        dataset_type,
                        model_name_path,
                    )

            pca = methods.generatePCA(preprocess_df)

            # TSNE
            if dataset_type == "tabular":
                tsne, projections = methods.generateTSNE(
                    preprocess_df, dataset_type, class_label
                )
            else:
                tsne, projections = methods.generateTSNE(preprocess_df, dataset_type)

            if dataset_type == "tabular":
                # training
                feature_importance, classification_report, importance_dict = (
                    methods.training(
                        preprocess_df,
                        model_name,
                        float(test_set_ratio),
                        class_label,
                        dataset_type,
                        df_name,
                        model_name_path,
                    )
                )

                # feature importance on the original categorical columns (if they exist)
                df = pd.read_csv(excel_file_name_path)
                df = df.drop(class_label, axis=1)

                # Initialize a dictionary to hold aggregated feature importances
                categorical_columns = methods.get_categorical_features(df)
                if categorical_columns != []:
                    aggregated_importance = {}
                    encoded_columns = methods.update_column_list_with_one_hot_columns(
                        df, preprocess_df, df.columns
                    )

                    # Map each one-hot-encoded column back to its original feature
                    feature_mapping = defaultdict(list)
                    for col in encoded_columns:
                        for original_col in categorical_columns:
                            if col.startswith(original_col + "_"):
                                feature_mapping[original_col].append(col)
                                break
                        else:
                            feature_mapping[col].append(col)  # If no match, map to itself

                    # Aggregate the feature importances
                    for original_feature, encoded_columns in feature_mapping.items():
                        if encoded_columns:  # Check if encoded_columns is not empty
                            if original_feature not in encoded_columns:
                                aggregated_importance[original_feature] = np.sum(
                                    [
                                        importance_dict.get(col, 0)
                                        for col in encoded_columns
                                    ]
                                )
                            else:
                                aggregated_importance[original_feature] = (
                                    importance_dict.get(original_feature, 0)
                                )

                    importance_df = pd.DataFrame(
                        {
                            "feature": list(aggregated_importance.keys()),
                            "importance": list(aggregated_importance.values()),
                        }
                    )
                    importance_df.to_csv(
                        model_name_path + "/feature_importance_df.csv", index=None
                    )
                else:
                    # if no categorical columns:
                    # combine feature names with their respective importance values
                    feature_importance_df = pd.DataFrame(
                        {
                            "feature": importance_dict.keys(),
                            "importance": importance_dict.values(),
                        }
                    )
                    feature_importance_df.to_csv(
                        model_name_path + "/feature_importance_df.csv", index=None
                    )

                # save some files
                pickle.dump(
                    classification_report,
                    open(model_name_path + "/classification_report.sav", "wb"),
                )
                pickle.dump(
                    feature_importance,
                    open(model_name_path + "/feature_importance.sav", "wb"),
                )
                pickle.dump(le, open(model_name_path + "/label_encoder.sav", "wb"))

                context = {
                    "dataset_type": dataset_type,
                    "pca": pca.to_html(),
                    "class_report": classification_report.to_html(),
                    "feature_importance": feature_importance.to_html(),
                }
            elif dataset_type == "timeseries":
                path = model_name_path
                dataset_camel = methods.convert_to_camel_case(df_name)
                if "Ecg" in dataset_camel:
                    dataset_camel = dataset_camel.replace("Ecg", "ECG")
                experiment = methods.fetch_line_by_dataset(
                    PIPELINE_PATH + "/glacier_experiments.txt",
                    dataset_camel,
                )
                if experiment is not None:
                    stripped_arguments = methods.extract_arguments_from_line(experiment)

                if model_name == "glacier":
"" else 0, "", dataset_type, df_name, path, autoencoder, stripped_arguments, ) else: classification_report = methods.training( preprocess_df, model_name, float(test_set_ratio) if test_set_ratio != "" else 0, "", dataset_type, df_name, path, ) pickle.dump( classification_report, open(path + "/classification_report.sav", "wb"), ) context = { "dataset_type": dataset_type, "pca": pca.to_html(), "tsne": tsne.to_html(), "class_report": classification_report.to_html(), } # save the plots pickle.dump(tsne, open(model_name_path + "/tsne.sav", "wb")) pickle.dump(pca, open(model_name_path + "/pca.sav", "wb")) # save projections file for future use with open(model_name_path + "/tsne_projections.json", "w") as f: json.dump(projections.tolist(), f, indent=2) if jsonFile.key_exists("classifier"): temp_json = {model_name: preprocessing_info} jsonFile.update_json(["classifier"], temp_json) else: temp_jason = { "preprocessed_name": df_name + "_preprocessed.csv", "classifier": {model_name: preprocessing_info}, } jsonFile.append_to_json(temp_jason) classifier_data = jsonFile.read_from_json(["classifier", model_name]) classifier_data_html = dict_and_html(classifier_data) context.update({"classifier_data": classifier_data_html}) preprocess_df.to_csv(excel_file_name_preprocessed_path, index=False) status = 200 except FileNotFoundError as e: methods.remove_dir_and_empty_parent(model_name_path) context = methods.format_error_context( e, "File error. Please check if all required files are available." ) status = 400 except PermissionError as e: methods.remove_dir_and_empty_parent(model_name_path) context = methods.format_error_context( e, "Permission error. Ensure appropriate file permissions." ) status = 400 except KeyError as e: methods.remove_dir_and_empty_parent(model_name_path) context = methods.format_error_context( e, f"Key error. Missing expected key {str(e)}. Verify dataset and configuration settings." ) status = 400 except ValueError as e: methods.remove_dir_and_empty_parent(model_name_path) context = methods.format_error_context( e, "Data error. Please verify the data format and preprocessing steps." ) status = 400 except TypeError as e: methods.remove_dir_and_empty_parent(model_name_path) context = methods.format_error_context( e, "Type error. Check for data type compatibility in operations." ) status = 400 except Exception as e: methods.remove_dir_and_empty_parent(model_name_path) context = methods.format_error_context( e, "An unexpected error occurred. Please review the code and data." 
            status = 400
    elif action == "delete_pre_trained":
        df_name = request.session["df_name"]
        model_name = request.POST.get("model_name")
        model_name_path = os.path.join(
            PIPELINE_PATH + f"{df_name}" + "/trained_models/" + model_name
        )
        print(model_name_path)

        excel_file_name_preprocessed_path = os.path.join(
            PIPELINE_PATH,
            f"{df_name}" + "/" + df_name + "_preprocessed" + ".csv",
        )
        try:
            # check if the preprocessed file exists and delete it
            if os.path.exists(excel_file_name_preprocessed_path):
                os.remove(excel_file_name_preprocessed_path)
                # print(f"File '{excel_file_name_preprocessed_path}' has been deleted successfully.")
            else:
                print(f"File '{excel_file_name_preprocessed_path}' does not exist.")
        except Exception as e:
            print(f"An error occurred while deleting the file: {e}")

        json_path = os.path.join(PIPELINE_PATH + f"{df_name}" + "/pipeline.json")
        jsonFile = pipeline.PipelineJSON(json_path)
        jsonFile.delete_key(["classifier", model_name])
        methods.remove_dir_and_empty_parent(model_name_path)

        # load paths
        # absolute excel_file_preprocessed_path
        if not jsonFile.key_exists("classifier"):
            # pre-trained models do not exist;
            # check if the dataset directory exists
            df_dir = os.path.join(PIPELINE_PATH + f"{df_name}")
            if not os.path.exists(df_dir):
                df_name = None
            context = {
                "df_name": df_name,
                "available_pretrained_models_info": [],
            }
        else:
            # if it exists, read the "classifier" section
            available_pretrained_models = jsonFile.read_from_json(
                ["classifier"]
            ).keys()
            available_pretrained_models_info = (
                methods.create_tuple_of_models_text_value(
                    available_pretrained_models
                )
            )
            context = {
                "df_name": df_name,
                "available_pretrained_models_info": available_pretrained_models_info,
            }
    elif action == "discard_model":
        name = request.session["df_name"]
        model_name = request.session["model_name"]
        model_name_path = os.path.join(
            PIPELINE_PATH + f"{name}" + "/trained_models/" + model_name
        )

        # should delete the model folder,
        # the classifier entry in the pipeline JSON,
        # and the preprocessed file
        methods.remove_dir_and_empty_parent(model_name_path)

        # load paths
        # absolute excel_file_preprocessed_path
        excel_file_name_preprocessed_path = os.path.join(
            PIPELINE_PATH,
            f"{name}" + "/" + name + "_preprocessed" + ".csv",
        )
        try:
            # check if the preprocessed file exists and delete it
            if os.path.exists(excel_file_name_preprocessed_path):
                os.remove(excel_file_name_preprocessed_path)
                # print(f"File '{excel_file_name_preprocessed_path}' has been deleted successfully.")
            else:
                print(f"File '{excel_file_name_preprocessed_path}' does not exist.")
        except Exception as e:
            print(f"An error occurred while deleting the file: {e}")

        json_path = os.path.join(PIPELINE_PATH + f"{name}" + "/pipeline.json")
        jsonFile = pipeline.PipelineJSON(json_path)
        jsonFile.delete_key(["classifier", model_name])

        context = {}

    return HttpResponse(json.dumps(context), status=status)
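
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): one way this handler might be wired into a
# Django view that dispatches AJAX requests by action name. The view function,
# module name, and URL pattern below are assumptions for illustration; the
# actual routing for this project lives elsewhere.
#
#   # views.py (hypothetical)
#   from . import train_handler  # this module
#
#   def train_view(request, action):
#       # `action` would be one of "train", "delete_pre_trained", "discard_model"
#       return train_handler.handler(action, request)
#
#   # urls.py (hypothetical)
#   # path("train/<str:action>/", views.train_view, name="train")
# ---------------------------------------------------------------------------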