# CoPAL/CoPAL_Share.py

"""
Author Zahra Kharazian, zahra.kharazian@dsv.su.se

This code implements the CoPAL algorithm that employs
conformal prediction in active learning for regression tasks.
Suitable for multi-variate time series data
"""


import random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from crepes import WrapRegressor
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from crepes import ConformalRegressor, ConformalPredictiveSystem
from crepes.extras import binning, DifficultyEstimator
from sklearn.metrics import mean_absolute_error, mean_squared_error
import sys


# ---------------------------------------------------------------------------
# Experiment configuration (module-level; the functions below read these
# names as globals rather than receiving them as arguments).
# ---------------------------------------------------------------------------

# Seed used for the numpy RNG, the regressors and the pandas sampling calls.
# The main loop overwrites it with `round + 1` at the start of every round.
random_state = 1
# Dataset tag; not referenced anywhere else in the visible code.
data = 'componentX'

# Initial learner instance. The main loop builds a fresh regressor before
# every fit, so this object only acts as a placeholder.
model = xgb.XGBRegressor(random_state=random_state)
# model = RandomForestRegressor(random_state=random_state)

# Conformal variant passed as `regressor` to conformal_prediction().
CP = 'norm_Mondrian_CPS'
# CP = 'std_Mondrian_CPS'
# CP = 'Mondrian_regressor'

# Metric called by evaluate_regressor() (which requests its square root).
evaluation_metric = mean_squared_error
# Number of vehicles moved from the pool to the train set per iteration.
num_vehicle_selection = 226
# Number of selection iterations per round (one extra evaluation follows).
al_iterations_count = 6
# Fraction of the time-to-event table kept by join_op_tte_prep2().
data_fraction = 1
# Number of independent rounds; each one re-splits the data.
num_rounds = 5
# Fractions of the unique vehicles assigned to each part by split_df().
# NOTE: split_df() gives the calibration part whatever remains after the
# pool, train and test parts, so cal_percentage does not size anything there.
train_percentage = 0.01
test_percentage = 0.25
cal_percentage = 0.14
pool_percentage = 0.6


# Seed numpy's global RNG; presumably this is what makes the un-seeded
# `sample()` call in join_op_tte_prep2() reproducible -- TODO confirm.
np.random.seed(random_state)

# If all the readouts are needed for training
def join_op_tte_prep2(data_op, data_tte):
    """Join operational readouts with time-to-event data and add an ``RUL`` column.

    Steps, in order:
      1. sample ``data_fraction`` (module global) of the rows of ``data_tte``;
      2. keep only the sampled rows whose ``in_study_repair`` equals 1;
      3. keep only the readouts in ``data_op`` belonging to those vehicles;
      4. left-merge readouts with the time-to-event rows on ``vehicle_id``;
      5. set ``RUL = length_of_study_time_step - time_step``
         (presumably "remaining useful life" -- confirm against the dataset);
      6. drop every row that contains a missing value.

    Returns the merged DataFrame.
    """
    sampled_tte = data_tte.sample(frac=data_fraction)
    repaired_tte = sampled_tte[sampled_tte['in_study_repair'] == 1]

    # Readouts of vehicles that survived the filtering above.
    in_study = data_op['vehicle_id'].isin(repaired_tte['vehicle_id'])
    joined = pd.merge(data_op[in_study], repaired_tte, on=['vehicle_id'], how='left')

    joined['RUL'] = joined['length_of_study_time_step'] - joined['time_step']
    return joined.dropna()

def X_y_split(df):
    """Split ``df`` into a feature frame and a one-column target frame.

    The target is the ``RUL`` column (returned as a DataFrame, not a Series).
    The features are all remaining columns except the identifier/bookkeeping
    columns listed below, all of which must be present in ``df``.
    """
    non_feature_columns = [
        'RUL',
        'vehicle_id',
        'length_of_study_time_step',
        'time_step',
        'in_study_repair',
    ]
    target = df[['RUL']]
    features = df.drop(columns=non_feature_columns)
    return features, target


def conformal_prediction(model, df_train, df_pool, regressor='Mondrian_regressor'):
    """Fit ``model`` on the train set and return prediction intervals.

    The function reads three module-level globals that the main loop sets for
    every round: ``X_cal`` / ``y_cal`` (calibration data) and ``X_test``.

    Parameters
    ----------
    model : regressor with ``fit``/``predict``
        Wrapped in a crepes ``WrapRegressor`` and fitted here; callers rely on
        ``model`` being fitted afterwards (see ``evaluate_regressor``).
    df_train, df_pool : DataFrame
        Frames accepted by ``X_y_split``.
    regressor : str
        ``'Mondrian_regressor'``: Mondrian conformal regressor with 10 bins
        built from the difficulty estimates; intervals come from
        ``predict_int`` at the wrapper's default confidence level.
        ``'norm_Mondrian_CPS'``: Mondrian conformal predictive system with 5
        bins built from the calibration predictions, fitted with difficulty
        estimates; intervals are the 2.5th and 97.5th percentiles.
        ``'std_Mondrian_CPS'``: same as above but fitted without difficulty
        estimates.

    Returns
    -------
    (pool_intervals, test_intervals)
        One row per pool/test sample; callers treat column 0 as the lower and
        column 1 as the upper bound.

    Raises
    ------
    ValueError
        If ``regressor`` is not one of the three supported names.

    NOTE(review): the original author remarked "This only works with RF but
    not with other models", yet the script calls it with an XGBRegressor --
    confirm which statement is current.
    """
    supported = ('Mondrian_regressor', 'norm_Mondrian_CPS', 'std_Mondrian_CPS')
    if regressor not in supported:
        # Fail before the (expensive) fit instead of hitting an
        # UnboundLocalError at the return statement.
        raise ValueError(
            f"Unknown regressor {regressor!r}; expected one of {supported}")

    X_train, y_train = X_y_split(df_train)
    X_train = X_train.values.astype(float)
    y_train = y_train.values.ravel()

    # Only the pool features are needed; the pool targets are never used.
    X_pool, _ = X_y_split(df_pool)
    X_pool = X_pool.values.astype(float)

    mdl = WrapRegressor(model)
    mdl.fit(X_train, y_train)
    de = DifficultyEstimator()
    de.fit(X_train, y=y_train)

    sigmas_cal = de.apply(X_cal)
    sigmas_pool = de.apply(X_pool)
    sigmas_test = de.apply(X_test)
    y_hat_cal = mdl.predict(X_cal)
    residuals_cal = y_cal - y_hat_cal
    y_hat_pool = mdl.predict(X_pool)
    y_hat_test = mdl.predict(X_test)

    if regressor == 'Mondrian_regressor':
        # Mondrian categories come from the difficulty estimates.
        bins_cal, bin_thresholds = binning(sigmas_cal, bins=10)
        mdl.calibrate(X_cal, y_cal, bins=bins_cal)

        bins_pool = binning(sigmas_pool, bins=bin_thresholds)
        bins_test = binning(sigmas_test, bins=bin_thresholds)
        pool_intervals = mdl.predict_int(X_pool, bins=bins_pool)
        test_intervals = mdl.predict_int(X_test, bins=bins_test)
        return pool_intervals, test_intervals

    # Both CPS variants: Mondrian categories come from the point predictions.
    bins_cal, bin_thresholds = binning(y_hat_cal, bins=5)
    # NOTE(review): the intervals below come from the separately fitted
    # ConformalPredictiveSystem, not from this wrapper calibration; the call
    # is kept to stay in step with the original implementation.
    mdl.calibrate(X_cal, y_cal, sigmas=sigmas_cal, bins=bins_cal, cps=True)

    # The normalized variant is the only one fitted with difficulty estimates.
    fit_kwargs = {'sigmas': sigmas_cal} if regressor == 'norm_Mondrian_CPS' else {}
    cps = ConformalPredictiveSystem().fit(residuals_cal, bins=bins_cal, **fit_kwargs)

    # Reuse the predictions computed above instead of predicting again.
    bins_pool = binning(y_hat_pool, bins=bin_thresholds)
    bins_test = binning(y_hat_test, bins=bin_thresholds)

    pool_intervals = cps.predict(y_hat_pool,
                                 sigmas=sigmas_pool,
                                 bins=bins_pool, lower_percentiles=2.5,
                                 higher_percentiles=97.5)
    test_intervals = cps.predict(y_hat_test,
                                 sigmas=sigmas_test,
                                 bins=bins_test, lower_percentiles=2.5,
                                 higher_percentiles=97.5)

    return pool_intervals, test_intervals


def sample_selection(pred_intervals_cps, df_pool, df_train, policy = 'most_uncertain'):
    """Move ``num_vehicle_selection`` vehicles from the pool to the train set.

    Reads the module globals ``num_vehicle_selection`` and ``random_state``.

    Parameters
    ----------
    pred_intervals_cps : array-like with two columns, or None
        Lower/upper interval bounds, one row per row of ``df_pool`` (in the
        same order). Ignored -- and may be ``None`` -- for ``policy='random'``.
    df_pool, df_train : DataFrame
        Must contain a ``vehicle_id`` column. Neither frame is modified.
    policy : str
        ``'most_uncertain'``  : vehicles with the widest mean interval.
        ``'most_certain'``    : vehicles with the narrowest mean interval.
        ``'most_uncertain_roulette'`` : weighted sampling, weight proportional
        to the mean interval width.
        ``'most_certain_roulette'``   : weighted sampling, weight proportional
        to ``max_width + 1 - width``.
        ``'random'``          : uniform sampling of vehicles.

    Returns
    -------
    (train_new_df, pool_new_df)
        ``train_new_df`` is the selected pool rows followed by ``df_train``
        (original index labels are kept); ``pool_new_df`` is the pool without
        the selected vehicles.

    Raises
    ------
    ValueError
        If ``policy`` is not one of the names above.
    """
    interval_policies = ('most_uncertain', 'most_certain',
                         'most_uncertain_roulette', 'most_certain_roulette')

    if policy == 'random':
        # No interval information is needed, so skip the per-vehicle
        # aggregation entirely.
        unique_vehicles = pd.DataFrame({'vehicle_id': df_pool.vehicle_id.unique()})
        selected_vehicles = unique_vehicles.sample(
            n=num_vehicle_selection, random_state=random_state)['vehicle_id']

    elif policy in interval_policies:
        bounds = pd.DataFrame(pred_intervals_cps, columns=['min_int', 'max_int'])
        # Pair every pool row with its interval width by position; only the
        # two columns needed for the aggregation are materialised.
        widths = pd.DataFrame({
            'vehicle_id': df_pool['vehicle_id'].reset_index(drop=True),
            'diff_int': bounds['max_int'] - bounds['min_int'],
        })
        merged_ave_df = (widths.groupby('vehicle_id')
                         .agg(Ave_intervals=('diff_int', 'mean'))
                         .reset_index())

        if policy in ('most_uncertain', 'most_certain'):
            widest_first = policy == 'most_uncertain'
            ranked = merged_ave_df.sort_values(by='Ave_intervals',
                                               ascending=not widest_first)
            selected_vehicles = ranked.head(num_vehicle_selection)['vehicle_id']
        else:
            if policy == 'most_uncertain_roulette':
                weight = merged_ave_df['Ave_intervals']
            else:
                # Flip the widths so that narrow intervals get large weights;
                # the +1 keeps the widest vehicle's weight above zero.
                weight = merged_ave_df['Ave_intervals'].max() + 1 - merged_ave_df['Ave_intervals']
                merged_ave_df['flipped_Ave_intervals'] = weight
            merged_ave_df['probability'] = weight / weight.sum()
            selected_vehicles = merged_ave_df.sample(
                n=num_vehicle_selection, weights='probability',
                random_state=random_state)['vehicle_id']

    else:
        raise ValueError(
            f"Unknown policy {policy!r}; expected 'random' or one of {interval_policies}")

    is_selected = df_pool['vehicle_id'].isin(selected_vehicles)

    # Remove the selected vehicles from the pool ...
    pool_new_df = df_pool[~is_selected]
    # ... and put all of their rows in front of the current train set.
    train_new_df = pd.concat([df_pool[is_selected], df_train])

    return train_new_df, pool_new_df

def evaluate_regressor(model):
    """Score an already fitted ``model`` on the module-level test set.

    Reads the globals ``X_test``, ``y_test`` and ``evaluation_metric``.
    With the configured ``mean_squared_error`` the result is the RMSE, exactly
    as before. Any other metric is returned as-is (previously such a metric
    failed on the ``squared`` keyword).
    """
    pred = model.predict(X_test)
    score = evaluation_metric(y_test, pred)
    if evaluation_metric is mean_squared_error:
        # The `squared=False` keyword was deprecated in scikit-learn 1.4 and
        # removed in 1.6, so take the root explicitly.
        score = np.sqrt(score)
    return score


def split_df(df):
    """Split ``df`` by vehicle into pool, train, test and calibration frames.

    Rows are shuffled with the module-level ``random_state`` (which the main
    loop changes every round), so the order in which vehicles first appear is
    random. The unique vehicle ids are then cut, in that order, into:

    * pool        -- ``int(pool_percentage * n_vehicles)`` vehicles,
    * train       -- ``int(train_percentage * n_vehicles)`` vehicles,
    * test        -- ``int(test_percentage * n_vehicles)`` vehicles,
    * calibration -- every remaining vehicle.

    Note that the calibration part is the remainder: ``cal_percentage`` is not
    used here, and truncation leftovers from the other three parts also end up
    in calibration.

    Returns the tuple ``(pool_df, train_df, test_df, cal_df)``; all rows of a
    vehicle land in the same frame.
    """
    shuffled = df.sample(frac=1, random_state=random_state)
    vehicle_ids = shuffled['vehicle_id'].unique()
    n_vehicles = len(vehicle_ids)

    # Cumulative cut points into the shuffled vehicle list.
    pool_end = int(pool_percentage * n_vehicles)
    train_end = pool_end + int(train_percentage * n_vehicles)
    test_end = train_end + int(test_percentage * n_vehicles)

    id_groups = (
        vehicle_ids[:pool_end],
        vehicle_ids[pool_end:train_end],
        vehicle_ids[train_end:test_end],
        vehicle_ids[test_end:],
    )
    pool_df, train_df, test_df, cal_df = (
        shuffled[shuffled['vehicle_id'].isin(ids)] for ids in id_groups
    )
    return pool_df, train_df, test_df, cal_df

"""Download the data here:
https://snd.se/en/catalogue/dataset/2024-34
"""

# Folder that holds the three downloaded CSV files; replace the placeholder
# with the local path before running the script.
data_dir = "your address here"

train_op = pd.read_csv(data_dir + "/train_operational_readouts.csv")
train_tte = pd.read_csv(data_dir + "/train_tte.csv")
# Loaded for completeness; the visible code does not use the specifications.
train_spec = pd.read_csv(data_dir + "/train_specification.csv")

# All readouts of every repaired vehicle, with the RUL target attached.
data_op_tte = join_op_tte_prep2(train_op, train_tte)
######################## One random readout for each vehicle ######################
# Optional alternative: keep a single random readout per vehicle by grouping
# on 'vehicle_id' and sampling one row from each group.
# data_op_tte = data_op_tte.groupby('vehicle_id').apply(lambda x: x.sample(n=1)).reset_index(drop=True)


# Cross-round accumulators. Every round appends one list holding the value
# measured at each evaluation step.
# Suffixes: mu = most uncertain, mc = most certain, mur / mcr = their
# roulette (weighted sampling) variants, rnd = random selection.

# Test-set error curves.
results_mu, results_mc, results_mur, results_mcr, results_rnd = (
    [] for _ in range(5)
)

# Mean test-set interval width curves.
intervals_mu, intervals_mc, intervals_mur, intervals_mcr, intervals_rnd = (
    [] for _ in range(5)
)

# Running count of the figures created so far, plus one.
figure_counter = 1

# Evaluation order inside every iteration (this is also the console order):
# (result key, sample_selection policy, console label, suffix that the
# console output appends to "X_train"/"X_pool" for this policy).
_POLICY_RUNS = (
    ('rnd', 'random', 'RMSE_rand: ', '_rand'),
    ('mu', 'most_uncertain', 'RMSE_most_uncertain: ', ''),
    ('mur', 'most_uncertain_roulette', 'RMSE_most_uncertain_roulette: ', ''),
    ('mc', 'most_certain', 'RMSE_most_certain: ', ''),
    ('mcr', 'most_certain_roulette', 'RMSE_most_certain_roulette: ', ''),
)

# Drawing order of the per-round curves (it determines which colour of the
# default cycle each curve gets):
# (result key, legend in the RMSE figure, legend in the interval figure,
#  extra positional arguments for plt.plot).
_PLOT_CURVES = (
    ('mu', 'least confident', 'most uncertain', ()),
    ('mur', 'least confident roulette', 'most uncertain roulette', ()),
    ('mc', 'most confident', 'most certain', ()),
    ('mcr', 'most confident roulette', 'most certain roulette', ()),
    ('rnd', 'random selection', 'random selection', ('-.',)),
)

_ITER_BANNER = ' #####################################################################################################################'


def _fit_and_score(console_label, suffix, train_df, pool_df):
    """Fit a fresh regressor, print a progress line and return the scores.

    Returns ``(pool_intervals, mean_test_interval_width, test_score)`` where
    the score is whatever ``evaluate_regressor`` returns (RMSE by default).
    """
    fresh_model = xgb.XGBRegressor(random_state=random_state)
    # fresh_model = RandomForestRegressor(random_state=random_state)

    pool_intervals, test_intervals = conformal_prediction(
        fresh_model, train_df, pool_df, regressor=CP)
    mean_width = (test_intervals[:, 1] - test_intervals[:, 0]).mean()
    score = evaluate_regressor(fresh_model)

    print(console_label, score,
          ', X_train' + suffix + '.shape[0]:', train_df.shape[0],
          ', vehicles in X_train' + suffix + ':', train_df.vehicle_id.nunique(),
          ', X_pool' + suffix + '.shape[0]:', pool_df.shape[0],
          'vehicles in X_pool:', pool_df.vehicle_id.nunique())
    return pool_intervals, mean_width, score


def _plot_round_curves(curves, legend_column, y_label, train_sizes, round_idx):
    """Draw one figure with the five policy curves of a single round.

    ``curves`` maps a result key to its list of values; ``legend_column``
    selects which legend text of ``_PLOT_CURVES`` to use; ``train_sizes``
    labels the secondary x-axis.
    """
    steps = range(al_iterations_count + 1)
    plt.figure()
    for spec in _PLOT_CURVES:
        plt.plot(steps, curves[spec[0]], *spec[3], label=spec[legend_column])

    # Secondary x-axis showing the train-set size at every step.
    ax = plt.gca()
    ax2 = ax.twiny()
    ax2.set_xlim(ax.get_xlim())
    ax2.set_xticks(steps)
    ax2.set_xticklabels(train_sizes)

    ax.set_xlabel("Active Learning Iterations")
    ax2.set_xlabel("Number of vehicles in the train set")
    ax.set_ylabel(y_label)
    plt.title('Active Learning round ' + str(round_idx))
    ax.legend()


for round_idx in range(num_rounds):
    # split_df(), sample_selection() and the regressors read this global seed.
    random_state = round_idx + 1
    print('\n\n\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> round: ', round_idx)

    # Flush the output
    sys.stdout.flush()

    df_pool, df_train, df_test, df_cal = split_df(data_op_tte)

    # conformal_prediction() and evaluate_regressor() read these four names
    # as module globals, so they must keep exactly these names.
    X_test, y_test = X_y_split(df_test)
    X_cal, y_cal = X_y_split(df_cal)

    X_cal = X_cal.values.astype(float)
    y_cal = y_cal.values.ravel()

    # Every policy starts from the same split. No copies are needed because
    # neither conformal_prediction() nor sample_selection() modifies its
    # input frames; both build new objects.
    policy_frames = {key: (df_train, df_pool) for key, *_ in _POLICY_RUNS}
    score_curves = {key: [] for key, *_ in _POLICY_RUNS}
    width_curves = {key: [] for key, *_ in _POLICY_RUNS}
    train_sizes = []

    # al_iterations_count selection steps plus one closing evaluation.
    for step in range(al_iterations_count + 1):
        is_last = step == al_iterations_count
        if is_last:
            print('iter: Last', _ITER_BANNER)
        else:
            print('iter: ', step, _ITER_BANNER)

        # Flush the output
        sys.stdout.flush()

        for key, policy, console_label, suffix in _POLICY_RUNS:
            train_df, pool_df = policy_frames[key]
            pool_intervals, mean_width, score = _fit_and_score(
                console_label, suffix, train_df, pool_df)
            width_curves[key].append(mean_width)
            score_curves[key].append(score)

            if key == 'rnd':
                # Actual train-set size at this step; used to label the
                # secondary x-axis. Recording it here always yields exactly
                # one label per evaluation step.
                train_sizes.append(train_df.vehicle_id.nunique())

            if not is_last:
                # The random policy ignores the intervals.
                intervals_arg = None if policy == 'random' else pool_intervals
                policy_frames[key] = sample_selection(
                    intervals_arg, pool_df, train_df, policy=policy)

    sizes_array = np.array(train_sizes)

    # Error of every policy over the iterations of this round.
    _plot_round_curves(score_curves, 1, "RMSE", sizes_array, round_idx)
    figure_counter = figure_counter + 1

    # Mean test interval width of every policy over the iterations.
    _plot_round_curves(width_curves, 2, "Average Test Intervals", sizes_array, round_idx)
    figure_counter = figure_counter + 1

    results_mu.append(score_curves['mu'])
    results_mur.append(score_curves['mur'])
    results_mc.append(score_curves['mc'])
    results_mcr.append(score_curves['mcr'])
    results_rnd.append(score_curves['rnd'])

    intervals_mu.append(width_curves['mu'])
    intervals_mur.append(width_curves['mur'])
    intervals_mc.append(width_curves['mc'])
    intervals_mcr.append(width_curves['mcr'])
    intervals_rnd.append(width_curves['rnd'])

# One column per evaluation step: itr0 ... itr<al_iterations_count>.
iteration_columns = ['itr' + str(i) for i in range(al_iterations_count + 1)]
policy_keys = ('mu', 'mur', 'mc', 'mcr', 'rnd')

# One row per round; every cell holds that round's list of scores.
final_res = pd.DataFrame({
    'mu': results_mu,
    'mur': results_mur,
    'mc': results_mc,
    'mcr': results_mcr,
    'rnd': results_rnd,
})

# final_res = pd.DataFrame({'mu':results_mu, 'mur':results_mur, 'rnd':results_rnd})

# Expand every policy into a rounds x iterations frame.
mu_df, mur_df, mc_df, mcr_df, rnd_df = (
    pd.DataFrame(final_res[key].to_list(), columns=iteration_columns)
    for key in policy_keys
)

# Same layout for the mean test interval widths.
final_intervals = pd.DataFrame({
    'mu': intervals_mu,
    'mur': intervals_mur,
    'mc': intervals_mc,
    'mcr': intervals_mcr,
    'rnd': intervals_rnd,
})

mu_interval_df, mur_interval_df, mc_interval_df, mcr_interval_df, rnd_interval_df = (
    pd.DataFrame(final_intervals[key].to_list(), columns=iteration_columns)
    for key in policy_keys
)

# x positions shared by the summary curves; the figure opened here receives
# the first set of summary curves.
x = range(al_iterations_count + 1)
plt.figure()

def plot_one_curve(df, label, color):
    """Plot the column-wise mean of ``df`` with a +/- one std band.

    ``df`` has one row per round and one column per evaluation step. The
    curve is drawn on the current matplotlib figure at x positions
    0 .. n_columns - 1. The line and its shaded band both use ``color``
    (previously the line took the next default-cycle colour, so line and band
    did not match).
    """
    mean_values = np.array(df.mean().to_list())
    deviation = np.array(df.std().to_list())
    # Derive the x positions from the data instead of a module-level `x`.
    steps = range(len(mean_values))
    plt.plot(steps, mean_values, label=label, color=color)
    plt.fill_between(steps, mean_values - deviation, mean_values + deviation,
                     color=color, alpha=0.1)

# Legend text and band colour of the five policies, in drawing order.
curve_styles = (
    ('most uncertain', 'blue'),
    ('most uncertain roulette', 'red'),
    ('most certain', 'green'),
    ('most certain roulette', 'magenta'),
    ('random selection', 'purple'),
)

# The two summary figures: the frames to draw and the y-axis label.
summary_figures = (
    ((mu_df, mur_df, mc_df, mcr_df, rnd_df), "RMSE"),
    ((mu_interval_df, mur_interval_df, mc_interval_df, mcr_interval_df,
      rnd_interval_df), "Prediction Interval for Test set"),
)

for figure_index, (policy_dfs, y_axis_label) in enumerate(summary_figures):
    if figure_index > 0:
        # The first figure has already been opened before this block.
        x = range(al_iterations_count + 1)
        plt.figure()

    for policy_df, (legend_text, band_color) in zip(policy_dfs, curve_styles):
        plot_one_curve(policy_df, legend_text, band_color)

    # Add dataset size labels on the secondary x-axis
    ax = plt.gca()
    ax2 = ax.twiny()
    ax2.set_xlim(ax.get_xlim())
    ax2.set_xticks(range(al_iterations_count + 1))
    ax2.set_xticklabels(sizes_array)

    # Set labels for both x-axes
    ax.set_xlabel("Active Learning Iterations")
    ax2.set_xlabel("Number of vehicles in the train set")
    ax.set_ylabel(y_axis_label)
    plt.title('Active Learning')
    ax.legend()

    figure_counter = figure_counter + 1

plt.show()