# Snapshot metadata (paste residue, kept as a comment so the file parses):
# saved 2024-05-27 19:16:14 +03:00 — 143 lines, 5.5 KiB, Python.
from django.shortcuts import render, redirect
import pandas as pd
from django.core.files.storage import FileSystemStorage
import pickle, os
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
from . import utils
fig = None
excel_file_name_preprocessed = "dataset_preprocessed.csv"
excel_file_name = "dataset.csv"
def home(request):
    """Render the main page and dispatch the user's action.

    POST buttons are dispatched by the key present in request.POST:
    'csv' (upload a dataset), 'std' / 'onehot' / 'imp' (preprocessing
    steps applied to the working copy), 'plot' (scatter of two features),
    'logit' / 'xgb' (model training). A GET renders a default plot.

    Returns the rendered 'base/home.html' response.
    """
    global fig
    global excel_file_name
    global excel_file_name_preprocessed
    # Create the working copy used by the preprocessing steps if missing.
    if not os.path.exists(excel_file_name_preprocessed):
        df = pd.read_csv(excel_file_name)
        df.to_csv(excel_file_name_preprocessed)
    if request.method == 'POST':
        # May be None when the submitting form has no feature selectors.
        feature1 = request.POST.get('feature1')
        feature2 = request.POST.get('feature2')
        if 'csv' in request.POST:
            # Save the uploaded file, plus a second copy under the
            # "<name>_preprocessed.<ext>" name that preprocessing mutates.
            excel_file = request.FILES["excel_file"]
            excel_file_name = request.FILES["excel_file"].name
            df = pd.read_csv(excel_file)
            fs = FileSystemStorage()  # defaults to MEDIA_ROOT
            request.session['excel_file_name'] = excel_file_name
            fs.save(excel_file_name, excel_file)
            idx = excel_file_name.index('.')
            excel_file_name_preprocessed = (
                excel_file_name[:idx] + "_preprocessed" + excel_file_name[idx:]
            )
            fs = FileSystemStorage()  # defaults to MEDIA_ROOT
            # NOTE(review): this key ('..._preprocess') differs from the
            # '..._preprocessed' key written below — confirm which one the
            # templates/other views actually read before unifying.
            request.session['excel_file_name_preprocess'] = excel_file_name_preprocessed
            fs.save(excel_file_name_preprocessed, excel_file)
        if 'std' in request.POST:
            preprocess(excel_file_name_preprocessed, 'std')
        if 'onehot' in request.POST:
            preprocess(excel_file_name_preprocessed, 'onehot')
        if 'imp' in request.POST:
            preprocess(excel_file_name_preprocessed, 'imp')
        if 'plot' in request.POST:
            fig = stats(excel_file_name, feature1, feature2)
        if 'logit' in request.POST:
            training(excel_file_name_preprocessed, 'logit')
        if 'xgb' in request.POST:
            training(excel_file_name_preprocessed, 'xgb')
    else:
        # Initial page load or refresh: plot two default features.
        feature1 = 'MonthlyCharges'
        feature2 = 'tenure'
        fig = stats(excel_file_name, feature1, feature2)
    # Collect the data to render: first five rows as an HTML table.
    df = pd.read_csv(excel_file_name)
    data_to_display = df[:5].to_html()
    request.session['data_to_display'] = data_to_display
    request.session['excel_file_name'] = excel_file_name
    request.session['excel_file_name_preprocessed'] = excel_file_name_preprocessed
    labels = df.columns
    context = {
        'data_to_display': data_to_display,
        'excel_file': excel_file_name,
        'labels': labels,
        'fig': fig,
        'feature1': feature1,
        'feature2': feature2,
    }
    return render(request, 'base/home.html', context)
def stats(name, feature1, feature2):
    """Return an embeddable HTML fragment with a plotly scatter plot.

    Reads the CSV at *name* and plots *feature1* against *feature2*,
    colouring points by the 'Churn' column (assumed present — the default
    dataset has it; TODO confirm for arbitrary uploads).

    Fix: dropped the needless `global fig` — the function already returns
    the HTML and every caller assigns the result itself, so mutating the
    module global here was a redundant hidden side effect.
    """
    df = pd.read_csv(name)
    # Imported lazily so the module can be loaded without plotly installed.
    import plotly.express as px
    figure = px.scatter(df, x=feature1, y=feature2, color='Churn')
    # full_html=False yields a fragment suitable for template embedding.
    return figure.to_html(full_html=False)
def preprocess(name, type):
    """Apply one preprocessing step to the CSV at *name* and write it back.

    type: 'std'    -> standard-scale the numeric columns
          'onehot' -> one-hot encode the known categorical columns
          'imp'    -> mean-impute missing values in the numeric columns

    Drops bookkeeping/label columns ('No', 'customerID', 'Churn') when all
    three are present.

    Fixes: the original read from *name* but wrote the result to the module
    global `excel_file_name_preprocessed` (a read/write mismatch — it only
    worked because every caller passed that same global); it also called
    os.remove first, which raised FileNotFoundError if the file was missing
    even though to_csv overwrites anyway; and it imported StandardScaler
    unconditionally, making every path depend on sklearn.
    """
    data = pd.read_csv(name)
    if set(['No', 'customerID', 'Churn']).issubset(data.columns):
        data.drop(['No', 'customerID', 'Churn'], axis=1, inplace=True)
    if type == 'std':
        # sklearn imported only on the path that needs it.
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        # Scale only the numeric columns, in place.
        num_d = data.select_dtypes(exclude=['object'])
        data[num_d.columns] = scaler.fit_transform(num_d)
    if type == 'onehot':
        data = pd.get_dummies(
            data,
            columns=['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                     'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                     'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
                     'PaperlessBilling', 'PaymentMethod'],
            drop_first=True)
    if type == 'imp':
        data_numeric = data.select_dtypes(exclude=['object'])
        data_categorical = data.select_dtypes(exclude=['number'])
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        # Only apply the imputer to numeric columns; categoricals pass through.
        data_numeric = pd.DataFrame(imp.fit_transform(data_numeric),
                                    columns=data_numeric.columns,
                                    index=data_numeric.index)
        data = pd.concat([data_numeric, data_categorical], axis=1)
    # to_csv overwrites the target, so no explicit os.remove is needed.
    data.to_csv(name)
    return
def training(name, type):
    """Train a churn classifier on the preprocessed CSV at *name*.

    type: 'logit' -> LogisticRegression, pickled to 'lg.sav'
          'xgb'   -> XGBClassifier,      pickled to 'xgb.sav'

    Side effects: writes X_train.csv / X_test.csv / y_train.csv / y_test.csv
    and the pickled model file.

    Fix: the original passed the full `data` frame — which still contains
    the 'Churn' label — as the feature matrix, leaking the target into X.
    The label is now dropped from the features before splitting.
    """
    data = pd.read_csv(name)
    y = data['Churn']
    y = y.replace({"Yes": 1, "No": 0})
    # Exclude the target from the feature matrix (target-leakage fix).
    X = data.drop('Churn', axis=1)
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    X_train.to_csv("X_train.csv")
    X_test.to_csv("X_test.csv")
    y_train.to_csv("y_train.csv")
    y_test.to_csv("y_test.csv")
    if 'logit' == type:
        from sklearn.linear_model import LogisticRegression
        clf = LogisticRegression(random_state=0).fit(X_train, y_train)
        with open('lg.sav', 'wb') as f:
            pickle.dump(clf, f)
    if 'xgb' == type:
        from xgboost import XGBClassifier
        xgb = XGBClassifier(learning_rate=0.01, n_estimators=1000).fit(X_train, y_train)
        with open('xgb.sav', 'wb') as f:
            pickle.dump(xgb, f)
    return