Evaluation
4. Evaluation Metrics for Classification
In the previous session we trained a model for predicting churn. How do we know if it’s good?
4.1 Evaluation metrics: session overview
- Dataset: https://www.kaggle.com/blastchar/telco-customer-churn
- https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv
Metric - a function that compares the predictions with the actual values and outputs a single number that tells us how good the predictions are
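Accuracy is one such metric. A minimal sketch of the idea (the function name is just for illustration; it assumes numpy-style arrays):

def metric(y_true, y_pred):
    # a metric takes the actual values and the predictions and returns one number;
    # here: accuracy, the fraction of predictions that match the actual values
    return (y_true == y_pred).mean()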
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
data_url = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(data_url)
df.columns = df.columns.str.lower().str.replace(' ', '_')
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')
df.totalcharges = df.totalcharges.fillna(0)
df.churn = (df.churn == 'yes').astype(int)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values
del df_train['churn']
del df_val['churn']
del df_test['churn']
numerical = ['tenure', 'monthlycharges', 'totalcharges']
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
model = LogisticRegression()
model.fit(X_train, y_train)
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)
y_pred = model.predict_proba(X_val)[:, 1]
churn_decision = (y_pred >= 0.5)
(y_val == churn_decision).mean()
4.2 Accuracy and dummy model
- Evaluate the model on different thresholds
- Check the accuracy of dummy baselines
len(y_val)
1409
(y_val == churn_decision).mean()
0.8034066713981547
1132 / 1409
0.8034066713981547
from sklearn.metrics import accuracy_score
accuracy_score(y_val, y_pred >= 0.5)
0.8034066713981547
thresholds = np.linspace(0, 1, 21)
scores = []
for t in thresholds:
    score = accuracy_score(y_val, y_pred >= t)
    print('%.2f %.3f' % (t, score))
    scores.append(score)
0.00 0.274
0.05 0.509
0.10 0.591
0.15 0.666
0.20 0.710
0.25 0.739
0.30 0.760
0.35 0.772
0.40 0.785
0.45 0.793
0.50 0.803
0.55 0.801
0.60 0.795
0.65 0.786
0.70 0.766
0.75 0.744
0.80 0.735
0.85 0.726
0.90 0.726
0.95 0.726
1.00 0.726
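Accuracy peaks at about 0.803 near the 0.5 threshold, so the default cutoff is already close to the best one here.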
plt.plot(thresholds, scores)
from collections import Counter
Counter(y_pred >= 1.0)
Counter({False: 1409})
1 - y_val.mean()
0.7260468417317246
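A dummy model that always predicts "no churn" gets about 72.6% accuracy, so our 80.3% is less impressive than it first looks. scikit-learn's DummyClassifier gives the same baseline; a quick cross-check, reusing X_train and X_val from above:

from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy='most_frequent')  # always predicts the majority class
dummy.fit(X_train, y_train)
dummy.score(X_val, y_val)  # accuracy ~0.726, matching 1 - y_val.mean()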
4.3 Confusion table
- Different types of errors and correct decisions
- Arranging them in a table
actual_positive = (y_val == 1)
actual_negative = (y_val == 0)
t = 0.5
predict_positive = (y_pred >= t)
predict_negative = (y_pred < t)
tp = (predict_positive & actual_positive).sum()
tn = (predict_negative & actual_negative).sum()
fp = (predict_positive & actual_negative).sum()
fn = (predict_negative & actual_positive).sum()
confusion_matrix = np.array([
    [tn, fp],
    [fn, tp]
])
confusion_matrix
array([[922, 101],
       [176, 210]])
(confusion_matrix / confusion_matrix.sum()).round(2)
array([[0.65, 0.07],
       [0.12, 0.15]])
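As a sanity check, scikit-learn's confusion_matrix produces the same table; it uses the same layout (rows: actual, columns: predicted, negatives first). It is aliased here so it does not shadow our confusion_matrix array:

from sklearn.metrics import confusion_matrix as sk_confusion_matrix

sk_confusion_matrix(y_val, (y_pred >= 0.5).astype(int))
# should match the array above: [[tn, fp], [fn, tp]]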
4.4 Precision and Recall
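Precision is the fraction of positive predictions that are correct: P = TP / (TP + FP). Recall is the fraction of actual positive examples the model identifies: R = TP / (TP + FN).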
p = tp / (tp + fp)
p
0.6752411575562701
r = tp / (tp + fn)
r
0.5440414507772021
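scikit-learn computes the same values; a quick cross-check at the 0.5 threshold:

from sklearn.metrics import precision_score, recall_score

precision_score(y_val, y_pred >= 0.5)  # ~0.675: of the customers we flag, about 68% actually churn
recall_score(y_val, y_pred >= 0.5)     # ~0.544: we catch only about 54% of the churning customers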
4.5 ROC Curves
- TPR and FPR
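The true positive rate (TPR) is the same as recall; the false positive rate (FPR) is the fraction of negatives we incorrectly flag: TPR = TP / (TP + FN), FPR = FP / (FP + TN). A good model has high TPR and low FPR.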
tpr = tp / (tp + fn)
tpr
0.5440414507772021
fpr = fp / (fp + tn)
fpr
0.09872922776148582
scores = []
thresholds = np.linspace(0, 1, 101)
for t in thresholds:
    actual_positive = (y_val == 1)
    actual_negative = (y_val == 0)

    predict_positive = (y_pred >= t)
    predict_negative = (y_pred < t)

    tp = (predict_positive & actual_positive).sum()
    tn = (predict_negative & actual_negative).sum()

    fp = (predict_positive & actual_negative).sum()
    fn = (predict_negative & actual_positive).sum()

    scores.append((t, tp, fp, fn, tn))
columns = ['threshold', 'tp', 'fp', 'fn', 'tn']
df_scores = pd.DataFrame(scores, columns=columns)
df_scores['tpr'] = df_scores.tp / (df_scores.tp + df_scores.fn)
df_scores['fpr'] = df_scores.fp / (df_scores.fp + df_scores.tn)
plt.plot(df_scores.threshold, df_scores['tpr'], label='TPR')
plt.plot(df_scores.threshold, df_scores['fpr'], label='FPR')
plt.legend()
Random model
np.random.seed(1)
y_rand = np.random.uniform(0, 1, size=len(y_val))
((y_rand >= 0.5) == y_val).mean()
0.5017743080198722
def tpr_fpr_dataframe(y_val, y_pred):
    scores = []

    thresholds = np.linspace(0, 1, 101)

    for t in thresholds:
        actual_positive = (y_val == 1)
        actual_negative = (y_val == 0)

        predict_positive = (y_pred >= t)
        predict_negative = (y_pred < t)

        tp = (predict_positive & actual_positive).sum()
        tn = (predict_negative & actual_negative).sum()

        fp = (predict_positive & actual_negative).sum()
        fn = (predict_negative & actual_positive).sum()

        scores.append((t, tp, fp, fn, tn))

    columns = ['threshold', 'tp', 'fp', 'fn', 'tn']
    df_scores = pd.DataFrame(scores, columns=columns)

    df_scores['tpr'] = df_scores.tp / (df_scores.tp + df_scores.fn)
    df_scores['fpr'] = df_scores.fp / (df_scores.fp + df_scores.tn)

    return df_scores
df_rand = tpr_fpr_dataframe(y_val, y_rand)
plt.plot(df_rand.threshold, df_rand['tpr'], label='TPR')
plt.plot(df_rand.threshold, df_rand['fpr'], label='FPR')
plt.legend()
Ideal model
num_neg = (y_val == 0).sum()
num_pos = (y_val == 1).sum()
num_neg, num_pos
(1023, 386)
y_ideal = np.repeat([0, 1], [num_neg, num_pos])
y_ideal
y_ideal_pred = np.linspace(0, 1, len(y_val))
1 - y_val.mean()
0.7260468417317246
accuracy_score(y_ideal, y_ideal_pred >= 0.726)
1.0
df_ideal = tpr_fpr_dataframe(y_ideal, y_ideal_pred)
df_ideal[::10]
|     | threshold | tp  | fp   | fn  | tn   | tpr      | fpr      |
|-----|-----------|-----|------|-----|------|----------|----------|
| 0   | 0.0       | 386 | 1023 | 0   | 0    | 1.000000 | 1.000000 |
| 10  | 0.1       | 386 | 882  | 0   | 141  | 1.000000 | 0.862170 |
| 20  | 0.2       | 386 | 741  | 0   | 282  | 1.000000 | 0.724340 |
| 30  | 0.3       | 386 | 600  | 0   | 423  | 1.000000 | 0.586510 |
| 40  | 0.4       | 386 | 459  | 0   | 564  | 1.000000 | 0.448680 |
| 50  | 0.5       | 386 | 319  | 0   | 704  | 1.000000 | 0.311828 |
| 60  | 0.6       | 386 | 178  | 0   | 845  | 1.000000 | 0.173998 |
| 70  | 0.7       | 386 | 37   | 0   | 986  | 1.000000 | 0.036168 |
| 80  | 0.8       | 282 | 0    | 104 | 1023 | 0.730570 | 0.000000 |
| 90  | 0.9       | 141 | 0    | 245 | 1023 | 0.365285 | 0.000000 |
| 100 | 1.0       | 1   | 0    | 385 | 1023 | 0.002591 | 0.000000 |
plt.plot(df_ideal.threshold, df_ideal['tpr'], label='TPR')
plt.plot(df_ideal.threshold, df_ideal['fpr'], label='FPR')
plt.legend()
Putting everything together
plt.plot(df_scores.threshold, df_scores['tpr'], label='TPR', color='black')
plt.plot(df_scores.threshold, df_scores['fpr'], label='FPR', color='blue')
plt.plot(df_ideal.threshold, df_ideal['tpr'], label='TPR ideal')
plt.plot(df_ideal.threshold, df_ideal['fpr'], label='FPR ideal')
# plt.plot(df_rand.threshold, df_rand['tpr'], label='TPR random', color='grey')
# plt.plot(df_rand.threshold, df_rand['fpr'], label='FPR random', color='grey')
plt.legend()
plt.figure(figsize=(5, 5))
plt.plot(df_scores.fpr, df_scores.tpr, label='Model')
plt.plot([0, 1], [0, 1], label='Random', linestyle='--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_val, y_pred)
plt.figure(figsize=(5, 5))
plt.plot(fpr, tpr, label='Model')
plt.plot([0, 1], [0, 1], label='Random', linestyle='--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
4.6 ROC AUC
- Area under the ROC curve: a useful single-number metric
- Interpretation of AUC
from sklearn.metrics import auc
auc(fpr, tpr)
0.843850505725819
auc(df_scores.fpr, df_scores.tpr)
0.8438796286447967
auc(df_ideal.fpr, df_ideal.tpr)
0.9999430203759136
fpr, tpr, thresholds = roc_curve(y_val, y_pred)
auc(fpr, tpr)
0.843850505725819
from sklearn.metrics import roc_auc_score
roc_auc_score(y_val, y_pred)
0.843850505725819
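AUC has a useful probabilistic interpretation: it is the probability that a randomly chosen positive example gets a higher score than a randomly chosen negative one. We can verify this by sampling pairs: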
neg = y_pred[y_val == 0]
pos = y_pred[y_val == 1]
import random
n = 100000
success = 0
for i in range(n):
    pos_ind = random.randint(0, len(pos) - 1)
    neg_ind = random.randint(0, len(neg) - 1)

    if pos[pos_ind] > neg[neg_ind]:
        success = success + 1
success / n
0.8434
n = 50000
np.random.seed(1)
pos_ind = np.random.randint(0, len(pos), size=n)
neg_ind = np.random.randint(0, len(neg), size=n)
(pos[pos_ind] > neg[neg_ind]).mean()
0.84646
4.7 Cross-Validation
- Evaluating the same model on different subsets of data
- Getting the average score and the spread across the folds
def train(df_train, y_train, C=1.0):
    dicts = df_train[categorical + numerical].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(X_train, y_train)

    return dv, model
dv, model = train(df_train, y_train, C=0.001)
def predict(df, dv, model):
    dicts = df[categorical + numerical].to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]

    return y_pred
y_pred = predict(df_val, dv, model)
from sklearn.model_selection import KFold
!pip install tqdm
from tqdm.auto import tqdm
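We use K-fold cross-validation to tune C, the inverse regularization strength of LogisticRegression (smaller C means stronger regularization), reporting the mean AUC and its spread across the folds: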
n_splits = 5
for C in tqdm([0.001, 0.01, 0.1, 0.5, 1, 5, 10]):
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

    scores = []

    for train_idx, val_idx in kfold.split(df_full_train):
        df_train = df_full_train.iloc[train_idx]
        df_val = df_full_train.iloc[val_idx]

        y_train = df_train.churn.values
        y_val = df_val.churn.values

        dv, model = train(df_train, y_train, C=C)
        y_pred = predict(df_val, dv, model)

        auc = roc_auc_score(y_val, y_pred)
        scores.append(auc)

    print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))
C=0.001 0.825 +- 0.009
C=0.01 0.840 +- 0.009
C=0.1 0.841 +- 0.008
C=0.5 0.840 +- 0.007
C=1 0.841 +- 0.008
C=5 0.841 +- 0.008
C=10 0.841 +- 0.008
scores
[0.8419433083969826,
0.8458047775129122,
0.8325145494681918,
0.8325466042079682,
0.8525462018763139]
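The manual loop above can also be written with scikit-learn's cross_val_score. A minimal sketch, assuming a pipeline that starts with the DictVectorizer so the feature records can be passed as a list of dicts; note that the default splitter does not shuffle, so the scores will differ slightly from the loop above:

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

pipeline = make_pipeline(
    DictVectorizer(sparse=False),
    LogisticRegression(C=1.0, max_iter=1000),
)
records = df_full_train[categorical + numerical].to_dict(orient='records')
# 5-fold ROC AUC, comparable to the KFold loop above
cross_val_score(pipeline, records, df_full_train.churn.values, cv=5, scoring='roc_auc')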
dv, model = train(df_full_train, df_full_train.churn.values, C=1.0)
y_pred = predict(df_test, dv, model)
auc = roc_auc_score(y_test, y_pred)
auc
0.8572386167896259