# Snapshot metadata (paste residue, kept as a comment so the file parses):
# saved 2024-05-27 19:16:14 +03:00 — 143 lines, 5.5 KiB, Python.
from django.shortcuts import render, redirect
import pandas as pd
from django.core.files.storage import FileSystemStorage
import pickle, os
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
from . import utils
fig = None
excel_file_name_preprocessed = "dataset_preprocessed.csv"
excel_file_name = "dataset.csv"
def home(request):
    """Render the main page and dispatch the user's action.

    POST buttons are dispatched by the key present in request.POST:
    'csv' (upload a dataset), 'std' / 'onehot' / 'imp' (preprocessing
    steps applied to the working copy), 'plot' (scatter of two features),
    'logit' / 'xgb' (model training). A GET renders a default plot.

    Returns the rendered 'base/home.html' response.
    """
    global fig
    global excel_file_name
    global excel_file_name_preprocessed
    # Create the working copy used by the preprocessing steps if missing.
    if not os.path.exists(excel_file_name_preprocessed):
        df = pd.read_csv(excel_file_name)
        df.to_csv(excel_file_name_preprocessed)
    if request.method == 'POST':
        # May be None when the submitting form has no feature selectors.
        feature1 = request.POST.get('feature1')
        feature2 = request.POST.get('feature2')
        if 'csv' in request.POST:
            # Save the uploaded file, plus a second copy under the
            # "<name>_preprocessed.<ext>" name that preprocessing mutates.
            excel_file = request.FILES["excel_file"]
            excel_file_name = request.FILES["excel_file"].name
            df = pd.read_csv(excel_file)
            fs = FileSystemStorage()  # defaults to MEDIA_ROOT
            request.session['excel_file_name'] = excel_file_name
            fs.save(excel_file_name, excel_file)
            idx = excel_file_name.index('.')
            excel_file_name_preprocessed = (
                excel_file_name[:idx] + "_preprocessed" + excel_file_name[idx:]
            )
            fs = FileSystemStorage()  # defaults to MEDIA_ROOT
            # NOTE(review): this key ('..._preprocess') differs from the
            # '..._preprocessed' key written below — confirm which one the
            # templates/other views actually read before unifying.
            request.session['excel_file_name_preprocess'] = excel_file_name_preprocessed
            fs.save(excel_file_name_preprocessed, excel_file)
        if 'std' in request.POST:
            preprocess(excel_file_name_preprocessed, 'std')
        if 'onehot' in request.POST:
            preprocess(excel_file_name_preprocessed, 'onehot')
        if 'imp' in request.POST:
            preprocess(excel_file_name_preprocessed, 'imp')
        if 'plot' in request.POST:
            fig = stats(excel_file_name, feature1, feature2)
        if 'logit' in request.POST:
            training(excel_file_name_preprocessed, 'logit')
        if 'xgb' in request.POST:
            training(excel_file_name_preprocessed, 'xgb')
    else:
        # Initial page load or refresh: plot two default features.
        feature1 = 'MonthlyCharges'
        feature2 = 'tenure'
        fig = stats(excel_file_name, feature1, feature2)
    # Collect the data to render: first five rows as an HTML table.
    df = pd.read_csv(excel_file_name)
    data_to_display = df[:5].to_html()
    request.session['data_to_display'] = data_to_display
    request.session['excel_file_name'] = excel_file_name
    request.session['excel_file_name_preprocessed'] = excel_file_name_preprocessed
    labels = df.columns
    context = {
        'data_to_display': data_to_display,
        'excel_file': excel_file_name,
        'labels': labels,
        'fig': fig,
        'feature1': feature1,
        'feature2': feature2,
    }
    return render(request, 'base/home.html', context)
def stats(name, feature1, feature2):
    """Return an embeddable HTML fragment with a plotly scatter plot.

    Reads the CSV at *name* and plots *feature1* against *feature2*,
    colouring points by the 'Churn' column (assumed present — the default
    dataset has it; TODO confirm for arbitrary uploads).

    Fix: dropped the needless `global fig` — the function already returns
    the HTML and every caller assigns the result itself, so mutating the
    module global here was a redundant hidden side effect.
    """
    df = pd.read_csv(name)
    # Imported lazily so the module can be loaded without plotly installed.
    import plotly.express as px
    figure = px.scatter(df, x=feature1, y=feature2, color='Churn')
    # full_html=False yields a fragment suitable for template embedding.
    return figure.to_html(full_html=False)
def preprocess(name, type):
    """Apply one preprocessing step to the CSV at *name* and write it back.

    type: 'std'    -> standard-scale the numeric columns
          'onehot' -> one-hot encode the known categorical columns
          'imp'    -> mean-impute missing values in the numeric columns

    Drops bookkeeping/label columns ('No', 'customerID', 'Churn') when all
    three are present.

    Fixes: the original read from *name* but wrote the result to the module
    global `excel_file_name_preprocessed` (a read/write mismatch — it only
    worked because every caller passed that same global); it also called
    os.remove first, which raised FileNotFoundError if the file was missing
    even though to_csv overwrites anyway; and it imported StandardScaler
    unconditionally, making every path depend on sklearn.
    """
    data = pd.read_csv(name)
    if set(['No', 'customerID', 'Churn']).issubset(data.columns):
        data.drop(['No', 'customerID', 'Churn'], axis=1, inplace=True)
    if type == 'std':
        # sklearn imported only on the path that needs it.
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        # Scale only the numeric columns, in place.
        num_d = data.select_dtypes(exclude=['object'])
        data[num_d.columns] = scaler.fit_transform(num_d)
    if type == 'onehot':
        data = pd.get_dummies(
            data,
            columns=['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                     'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                     'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
                     'PaperlessBilling', 'PaymentMethod'],
            drop_first=True)
    if type == 'imp':
        data_numeric = data.select_dtypes(exclude=['object'])
        data_categorical = data.select_dtypes(exclude=['number'])
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        # Only apply the imputer to numeric columns; categoricals pass through.
        data_numeric = pd.DataFrame(imp.fit_transform(data_numeric),
                                    columns=data_numeric.columns,
                                    index=data_numeric.index)
        data = pd.concat([data_numeric, data_categorical], axis=1)
    # to_csv overwrites the target, so no explicit os.remove is needed.
    data.to_csv(name)
    return
def training(name, type):
    """Train a churn classifier on the preprocessed CSV at *name*.

    type: 'logit' -> LogisticRegression, pickled to 'lg.sav'
          'xgb'   -> XGBClassifier,      pickled to 'xgb.sav'

    Side effects: writes X_train.csv / X_test.csv / y_train.csv / y_test.csv
    and the pickled model file.

    Fix: the original passed the full `data` frame — which still contains
    the 'Churn' label — as the feature matrix, leaking the target into X.
    The label is now dropped from the features before splitting.
    """
    data = pd.read_csv(name)
    y = data['Churn']
    y = y.replace({"Yes": 1, "No": 0})
    # Exclude the target from the feature matrix (target-leakage fix).
    X = data.drop('Churn', axis=1)
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    X_train.to_csv("X_train.csv")
    X_test.to_csv("X_test.csv")
    y_train.to_csv("y_train.csv")
    y_test.to_csv("y_test.csv")
    if 'logit' == type:
        from sklearn.linear_model import LogisticRegression
        clf = LogisticRegression(random_state=0).fit(X_train, y_train)
        with open('lg.sav', 'wb') as f:
            pickle.dump(clf, f)
    if 'xgb' == type:
        from xgboost import XGBClassifier
        xgb = XGBClassifier(learning_rate=0.01, n_estimators=1000).fit(X_train, y_train)
        with open('xgb.sav', 'wb') as f:
            pickle.dump(xgb, f)
    return