

Python SciKit-Learn Cheat Sheet

  • Simple and efficient tools for predictive data analysis
  • Accessible to everybody, and reusable in various contexts
  • Built on NumPy, SciPy, and matplotlib
  • Open source, commercially usable - BSD license

SciKit-Learn Cheat Sheet

Image Source: SciKit Learn User Guide

Regressions ++ Classifications ++ Clustering ++ Dimensionality Reduction ++ Model Selection ++ Pre-processing

Github Repository

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from mpl_toolkits import mplot3d
import numpy as np
import pandas as pd
import plotly.express as px
from scipy.cluster import hierarchy
import seaborn as sns
from sklearn import svm
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.datasets import load_iris, load_wine, fetch_20newsgroups, fetch_openml
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import (
RandomForestClassifier,
RandomForestRegressor,
GradientBoostingRegressor,
AdaBoostRegressor,
GradientBoostingClassifier,
AdaBoostClassifier
)
from sklearn.feature_extraction.text import (
CountVectorizer,
TfidfTransformer,
TfidfVectorizer
)
from sklearn.linear_model import (
LinearRegression,
LogisticRegression,
Ridge,
ElasticNet
)
from sklearn.metrics import (
mean_absolute_error,
mean_squared_error,
classification_report,
confusion_matrix,
ConfusionMatrixDisplay,
accuracy_score
)
from sklearn.model_selection import (
train_test_split,
GridSearchCV,
cross_val_score,
cross_validate
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
MinMaxScaler,
StandardScaler,
OrdinalEncoder,
LabelEncoder,
OneHotEncoder,
PolynomialFeatures
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

Working with Missing Values

X_missing = pd.DataFrame(
np.array([5,2,3,np.NaN,np.NaN,4,-3,2,1,8,np.NaN,4,10,np.NaN,5]).reshape(5,3)
)
X_missing.columns = ['f1','f2','f3']

X_missing
     f1   f2   f3
0   5.0  2.0  3.0
1   NaN  NaN  4.0
2  -3.0  2.0  1.0
3   8.0  NaN  4.0
4  10.0  NaN  5.0
X_missing.isnull().sum()

# f1 1
# f2 3
# f3 0
# dtype: int64

Missing Indicator

indicator = MissingIndicator(missing_values=np.NaN)
indicator = indicator.fit_transform(X_missing)
indicator = pd.DataFrame(indicator, columns=['a1', 'a2'])
indicator
      a1     a2
0  False  False
1   True   True
2  False  False
3  False   True
4  False   True

Simple Imputer

imputer_mean = SimpleImputer(missing_values=np.NaN, strategy='mean')
X_filled_mean = pd.DataFrame(imputer_mean.fit_transform(X_missing))
X_filled_mean.columns = ['f1','f2','f3']
X_filled_mean
     f1   f2   f3
0   5.0  2.0  3.0
1   5.0  2.0  4.0
2  -3.0  2.0  1.0
3   8.0  2.0  4.0
4  10.0  2.0  5.0
imputer_median = SimpleImputer(missing_values=np.NaN, strategy='median')
X_filled_median = pd.DataFrame(imputer_median.fit_transform(X_missing))
X_filled_median.columns = ['f1','f2','f3']
X_filled_median
     f1   f2   f3
0   5.0  2.0  3.0
1   6.5  2.0  4.0
2  -3.0  2.0  1.0
3   8.0  2.0  4.0
4  10.0  2.0  5.0
imputer_median = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')
X_filled_median = pd.DataFrame(imputer_median.fit_transform(X_missing))
X_filled_median.columns = ['f1','f2','f3']
X_filled_median
     f1   f2   f3
0   5.0  2.0  3.0
1  -3.0  2.0  4.0
2  -3.0  2.0  1.0
3   8.0  2.0  4.0
4  10.0  2.0  5.0

Drop Missing Data

X_missing_dropped = X_missing.dropna(axis=1)
X_missing_dropped
    f3
0  3.0
1  4.0
2  1.0
3  4.0
4  5.0
X_missing_dropped = X_missing.dropna(axis=0).reset_index()
X_missing_dropped
    f1   f2   f3
0  5.0  2.0  3.0
1 -3.0  2.0  1.0

Categorical Data Preprocessing

X_cat_df = pd.DataFrame(
np.array([
['M', 'O-', 'medium'],
['M', 'O-', 'high'],
['F', 'O+', 'high'],
['F', 'AB', 'low'],
['F', 'B+', 'medium']
])
)

X_cat_df.columns = ['f1','f2','f3']

X_cat_df
  f1  f2      f3
0  M  O-  medium
1  M  O-    high
2  F  O+    high
3  F  AB     low
4  F  B+  medium

Ordinal Encoder

encoder_ord = OrdinalEncoder(dtype='int')

X_cat_df.f3 = encoder_ord.fit_transform(X_cat_df.f3.values.reshape(-1, 1))
X_cat_df
  f1  f2  f3
0  M  O-   2
1  M  O-   0
2  F  O+   0
3  F  AB   1
4  F  B+   2

Label Encoder

encoder_lab = LabelEncoder()
X_cat_df['f2'] = encoder_lab.fit_transform(X_cat_df['f2'])
X_cat_df
  f1  f2  f3
0  M   3   2
1  M   3   0
2  F   2   0
3  F   0   1
4  F   1   2

OneHot Encoder

encoder_oh = OneHotEncoder(dtype='int')

onehot_df = pd.DataFrame(
encoder_oh.fit_transform(X_cat_df[['f1']])
.toarray(),
columns=['F', 'M']
)

onehot_df['f2'] = X_cat_df.f2
onehot_df['f3'] = X_cat_df.f3
onehot_df
   F  M  f2  f3
0  0  1   3   2
1  0  1   3   0
2  1  0   2   0
3  1  0   0   1
4  1  0   1   2

Loading SK Datasets

Toy Datasets

load_iris(*[, return_X_y, as_frame])                classification            Load and return the iris dataset.
load_diabetes(*[, return_X_y, as_frame, scaled])    regression                Load and return the diabetes dataset.
load_digits(*[, n_class, return_X_y, as_frame])     classification            Load and return the digits dataset.
load_linnerud(*[, return_X_y, as_frame])            multi-output regression   Load and return the physical exercise Linnerud dataset.
load_wine(*[, return_X_y, as_frame])                classification            Load and return the wine dataset.
load_breast_cancer(*[, return_X_y, as_frame])       classification            Load and return the breast cancer wisconsin dataset.
iris_ds = load_iris()
iris_data = iris_ds.data
col_names = iris_ds.feature_names
target_names = iris_ds.target_names

print(
'Iris Dataset',
'\n * Data array: ',
iris_data.shape,
'\n * Column names: ',
col_names,
'\n * Target names: ',
target_names
)

# Iris Dataset
# * Data array: (150, 4)
# * Column names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
# * Target names: ['setosa' 'versicolor' 'virginica']
iris_df = pd.DataFrame(data=iris_data, columns=col_names)

iris_df.head()
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

Real World Datasets

fetch_olivetti_faces(*[, data_home, ...])           classification   Load the Olivetti faces data-set from AT&T.
fetch_20newsgroups(*[, data_home, subset, ...])     classification   Load the filenames and data from the 20 newsgroups dataset.
fetch_20newsgroups_vectorized(*[, subset, ...])     classification   Load and vectorize the 20 newsgroups dataset.
fetch_lfw_people(*[, data_home, funneled, ...])     classification   Load the Labeled Faces in the Wild (LFW) people dataset.
fetch_lfw_pairs(*[, subset, data_home, ...])        classification   Load the Labeled Faces in the Wild (LFW) pairs dataset.
fetch_covtype(*[, data_home, ...])                  classification   Load the covertype dataset.
fetch_rcv1(*[, data_home, subset, ...])             classification   Load the RCV1 multilabel dataset.
fetch_kddcup99(*[, subset, data_home, ...])         classification   Load the kddcup99 dataset.
fetch_california_housing(*[, data_home, ...])       regression       Load the California housing dataset.
newsgroups_train = fetch_20newsgroups(subset='train')
train_data = newsgroups_train.data
col_names = newsgroups_train.filenames.shape
target_names = newsgroups_train.target.shape

print(
'Newsgroup - Train Subset',
'\n * Data array: ',
len(train_data),
'\n * Column names: ',
col_names,
'\n * Target names: ',
target_names
)

# Newsgroup - Train Subset
# * Data array: 11314
# * Column names: (11314,)
# * Target names: (11314,)
print('Target Names: ', newsgroups_train.target_names)

# Target Names: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

OpenML Datasets

mice_ds = fetch_openml(name='miceprotein', version=4, parser="auto")
print(
'Mice Protein Dataset',
'\n * Data Shape: ',
mice_ds.data.shape,
'\n * Target Shape: ',
mice_ds.target.shape,
'\n * Target Names: ',
np.unique(mice_ds.target)
)

# Mice Protein Dataset
# * Data Shape: (1080, 77)
# * Target Shape: (1080,)
# * Target Names: ['c-CS-m' 'c-CS-s' 'c-SC-m' 'c-SC-s' 't-CS-m' 't-CS-s' 't-SC-m' 't-SC-s']

print(mice_ds.DESCR)

Supervised Learning - Regression Models

Simple Linear Regression

iris_df.plot(
figsize=(12,5),
kind='scatter',
x='sepal length (cm)',
y='sepal width (cm)',
title='Iris Dataset :: Sepal Width&Height'
)

print(iris_df.corr())

The sepal width shows very little correlation with the other features, while the remaining three correlate strongly with each other:

                   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
sepal length (cm)           1.000000         -0.117570           0.871754          0.817941
sepal width (cm)           -0.117570          1.000000          -0.428440         -0.366126
petal length (cm)           0.871754         -0.428440           1.000000          0.962865
petal width (cm)            0.817941         -0.366126           0.962865          1.000000

scikit-learn - Machine Learning in Python

scikit-learn - Machine Learning in Python
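
A quick way to visualize the correlation matrix is an annotated heatmap (a small sketch using the seaborn import from the top of the page, not part of the original figures):

# visualize the feature correlations as a heatmap
plt.figure(figsize=(8,6))
sns.heatmap(iris_df.corr(), annot=True, cmap='viridis')
plt.title('Iris Dataset :: Feature Correlation')
plt.show()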

Data Pre-processing

iris_df['petal length (cm)'][:1]
# 0 1.4
# Name: petal length (cm), dtype: float64
iris_df['petal length (cm)'].values.reshape(-1,1)[:1]
# array([[1.4]])
# scikit-learn expects a 2D input => reshape to (n_samples, 1)
X = iris_df['petal length (cm)'].values.reshape(-1,1)
y = iris_df['petal width (cm)'].values.reshape(-1,1)
# train/test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
print(X_train.shape, X_test.shape)
# (120, 1) (30, 1) 80:20 split

Model Training

regressor = LinearRegression()
regressor.fit(X_train,y_train)

intercept = regressor.intercept_
slope = regressor.coef_

print(' Intercept: ', intercept, '\n Slope: ', slope)
# Intercept: [-0.35135666]
# Slope: [[0.41310505]]

Predictions

y_pred = regressor.predict([X_test[0]])
print(' Prediction: ', y_pred, '\n True Value: ', y_test[0])
# Prediction: [[0.22699041]]
# True Value: [0.2]
def predict(value):
    return (slope*value + intercept)[0][0]
print('Prediction: ', predict(X_test[0]))
# Prediction: [[0.22699041]]
iris_df['petal width (cm) prediction'] = iris_df['petal length (cm)'].apply(predict)
print(' Prediction: ', iris_df['petal width (cm) prediction'][0], '\n True Value: ', iris_df['petal width (cm)'][0])
# Prediction: 0.22699041280334376
# True Value: 0.2
iris_df.head(10)
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  petal width (cm) prediction
0                5.1               3.5                1.4               0.2                     0.226990
1                4.9               3.0                1.4               0.2                     0.226990
2                4.7               3.2                1.3               0.2                     0.185680
3                4.6               3.1                1.5               0.2                     0.268301
4                5.0               3.6                1.4               0.2                     0.226990
5                5.4               3.9                1.7               0.4                     0.350922
6                4.6               3.4                1.4               0.3                     0.226990
7                5.0               3.4                1.5               0.2                     0.268301
8                4.4               2.9                1.4               0.2                     0.226990
9                4.9               3.1                1.5               0.1                     0.268301
iris_df.plot(
figsize=(12,5),
kind='scatter',
x='petal width (cm)',
y='petal width (cm) prediction',
# no value in colorizing..just looks pretty
c='petal width (cm) prediction',
colormap='summer',
title='Iris Dataset - Sepal Width True vs Prediction'
)

scikit-learn - Machine Learning in Python

Model Evaluation

mae = mean_absolute_error(
iris_df['petal width (cm)'],
iris_df['petal width (cm) prediction']
)

mse = mean_squared_error(
iris_df['petal width (cm)'],
iris_df['petal width (cm) prediction']
)

rmse = np.sqrt(mse)

print(' MAE: ', mae, '\n MSE: ', mse, '\n RMSE: ', rmse)

# MAE: 0.1569441318761155
# MSE: 0.04209214667485277
# RMSE: 0.2051637070118708

ElasticNet Regression

Dataset

!wget https://raw.githubusercontent.com/Satish-Vennapu/DataScience/main/AMES_Final_DF.csv -P datasets
ames_df = pd.read_csv('datasets/AMES_Final_DF.csv')
ames_df.head(5).transpose()
                              0        1        2        3        4
Lot Frontage              141.0     80.0     81.0     93.0     74.0
Lot Area                31770.0  11622.0  14267.0  11160.0  13830.0
Overall Qual                6.0      5.0      6.0      7.0      5.0
Overall Cond                5.0      6.0      6.0      5.0      5.0
Year Built               1960.0   1961.0   1958.0   1968.0   1997.0
...                         ...      ...      ...      ...      ...
Sale Condition_AdjLand      0.0      0.0      0.0      0.0      0.0
Sale Condition_Alloca       0.0      0.0      0.0      0.0      0.0
Sale Condition_Family       0.0      0.0      0.0      0.0      0.0
Sale Condition_Normal       1.0      1.0      1.0      1.0      1.0
Sale Condition_Partial      0.0      0.0      0.0      0.0      0.0

274 rows × 5 columns
# the target value is:
ames_df['SalePrice']
0       215000
1       105000
2       172000
3       244000
4       189900
         ...
2920    142500
2921    131000
2922    132000
2923    170000
2924    188000
Name: SalePrice, Length: 2925, dtype: int64

Preprocessing

# remove target column from training dataset
X_ames = ames_df.drop('SalePrice', axis=1)
y_ames = ames_df['SalePrice']

print(X_ames.shape, y_ames.shape)
# (2925, 273) (2925,)
# train/test split
X_ames_train, X_ames_test, y_ames_train, y_ames_test = train_test_split(
X_ames,
y_ames,
test_size=0.1,
random_state=101
)

print(X_ames_train.shape, X_ames_test.shape)
# (2632, 273) (293, 273)
# normalize feature set
scaler = StandardScaler()
X_ames_train_scaled = scaler.fit_transform(X_ames_train)

X_ames_test_scaled = scaler.transform(X_ames_test)

Grid Search for Hyperparameters

base_ames_elastic_net_model = ElasticNet(max_iter=int(1e4))
param_grid = {
    'alpha': [50, 75, 100, 125, 150],
    'l1_ratio': [0.2, 0.4, 0.6, 0.8, 1.0]
}
grid_ames_model = GridSearchCV(
estimator=base_ames_elastic_net_model,
param_grid=param_grid,
scoring='neg_mean_squared_error',
cv=5, verbose=1
)

grid_ames_model.fit(X_ames_train_scaled, y_ames_train)

print(
'Results:\nBest Estimator: ',
grid_ames_model.best_estimator_,
'\nBest Hyperparameter: ',
grid_ames_model.best_params_
)

Results:

  • Best Estimator: ElasticNet(alpha=125, l1_ratio=1.0, max_iter=10000)
  • Best Hyperparameter: {'alpha': 125, 'l1_ratio': 1.0}

Model Evaluation

y_ames_pred = grid_ames_model.predict(X_ames_test_scaled)

print(
'MAE: ',
mean_absolute_error(y_ames_test, y_ames_pred),
'MSE: ',
mean_squared_error(y_ames_test, y_ames_pred),
'RMSE: ',
np.sqrt(mean_squared_error(y_ames_test, y_ames_pred))
)

# MAE: 14185.506207185055 MSE: 422714457.5190704 RMSE: 20560.020854052418
# average SalePrice
np.mean(ames_df['SalePrice'])
# 180815.53743589742

rel_error_avg = mean_absolute_error(y_ames_test, y_ames_pred) * 100 / np.mean(ames_df['SalePrice'])
print('Predictions are on average off by: ', rel_error_avg.round(2), '%')
# Predictions are on average off by: 7.85 %
plt.figure(figsize=(10,4))

plt.scatter(y_ames_test,y_ames_pred, c='mediumspringgreen', s=3)
plt.axline((0, 0), slope=1, color='dodgerblue', linestyle=(':'))

plt.title('Prediction Accuracy :: MAE:'+ str(mean_absolute_error(y_ames_test, y_ames_pred).round(2)) + 'US$')
plt.xlabel('True Sales Price')
plt.ylabel('Predicted Sales Price')
plt.savefig('assets/Scikit_Learn_11.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Multiple Linear Regression

Above I used the petal length to predict the petal width with a simple linear regression. As the correlation matrix shows, the sepal length can be added as a second feature (only the sepal width does not show a linear correlation):

print(iris_df.corr())
                   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
sepal length (cm)           1.000000         -0.117570           0.871754          0.817941
sepal width (cm)           -0.117570          1.000000          -0.428440         -0.366126
petal length (cm)           0.871754         -0.428440           1.000000          0.962865
petal width (cm)            0.817941         -0.366126           0.962865          1.000000
X_multi = iris_df[['petal length (cm)', 'sepal length (cm)']]
y = iris_df['petal width (cm)']
regressor_multi = LinearRegression()
regressor_multi.fit(X_multi, y)

intercept_multi = regressor_multi.intercept_
slope_multi = regressor_multi.coef_

print(' Intercept: ', intercept_multi, '\n Slope: ', slope_multi)

# Intercept: -0.00899597269816943
# Slope: [ 0.44937611 -0.08221782]
def predict_multi(petal_length, sepal_length):
    return (slope_multi[0]*petal_length + slope_multi[1]*sepal_length + intercept_multi)
y_pred = predict_multi(
iris_df['petal length (cm)'][0],
iris_df['sepal length (cm)'][0]
)

print(' Prediction: ', y_pred, '\n True value: ', iris_df['petal width (cm)'][0])
# Prediction: 0.20081970121763193
# True value: 0.2
iris_df['petal width (cm) prediction (multi)'] = (
(
slope_multi[0] * iris_df['petal length (cm)']
) + (
slope_multi[1] * iris_df['sepal length (cm)']
) + (
intercept_multi
)
)
iris_df.head(10)
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  petal width (cm) prediction  petal width (cm) prediction (multi)
0                5.1               3.5                1.4               0.2                     0.226990                              0.200820
1                4.9               3.0                1.4               0.2                     0.226990                              0.217263
2                4.7               3.2                1.3               0.2                     0.185680                              0.188769
3                4.6               3.1                1.5               0.2                     0.268301                              0.286866
4                5.0               3.6                1.4               0.2                     0.226990                              0.209041
5                5.4               3.9                1.7               0.4                     0.350922                              0.310967
6                4.6               3.4                1.4               0.3                     0.226990                              0.241929
7                5.0               3.4                1.5               0.2                     0.268301                              0.253979
8                4.4               2.9                1.4               0.2                     0.226990                              0.258372
9                4.9               3.1                1.5               0.1                     0.268301                              0.262201
iris_df.plot(
figsize=(12,5),
kind='scatter',
x='petal width (cm)',
y='petal width (cm) prediction (multi)',
c='petal width (cm) prediction',
colormap='summer',
title='Iris Dataset - Sepal Width True vs Prediction (multi)'
)

scikit-learn - Machine Learning in Python

mae_multi = mean_absolute_error(
iris_df['petal width (cm)'],
iris_df['petal width (cm) prediction (multi)']
)

mse_multi = mean_squared_error(
iris_df['petal width (cm)'],
iris_df['petal width (cm) prediction (multi)']
)

rmse_multi = np.sqrt(mse_multi)

print(' MAE_Multi: ', mae_multi,' MAE: ', mae, '\n MSE_Multi: ', mse_multi, ' MSE: ', mse, '\n RMSE_Multi: ', rmse_multi, ' RMSE: ', rmse)

The accuracy of the model improved by adding a second, correlated feature:

                          Multi Regression     Single Regression
Mean Absolute Error       0.15562108079300102  0.1569441318761155
Mean Squared Error        0.04096208526408982  0.04209214667485277
Root Mean Squared Error   0.20239092189149646  0.2051637070118708
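
The comparison above can be assembled directly from the metric variables computed earlier (a small sketch reusing mae/mse/rmse and their *_multi counterparts):

# collect both runs in a comparison DataFrame
comparison_df = pd.DataFrame(
    {
        'Multi Regression': [mae_multi, mse_multi, rmse_multi],
        'Single Regression': [mae, mse, rmse]
    },
    index=['Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error']
)
print(comparison_df)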

Supervised Learning - Logistic Regression Model

Binary Logistic Regression

Dataset

np.random.seed(666)

# generate 10 index values between 0-10
x_data_logistic_binary = np.random.randint(10, size=(10)).reshape(-1, 1)
# generate binary category for values above
y_data_logistic_binary = np.random.randint(2, size=10)

Model Fitting

logistic_binary_model = LogisticRegression(
solver='liblinear',
C=10.0,
random_state=0
)

logistic_binary_model.fit(x_data_logistic_binary, y_data_logistic_binary)

intercept_logistic_binary = logistic_binary_model.intercept_
slope_logistic_binary = logistic_binary_model.coef_

print(' Intercept: ', intercept_logistic_binary, '\n Slope: ', slope_logistic_binary)

# Intercept: [-0.4832956]
# Slope: [[0.11180522]]

Model Predictions

prob_pred_logistic_binary = logistic_binary_model.predict_proba(x_data_logistic_binary)
y_pred_logistic_binary = logistic_binary_model.predict(x_data_logistic_binary)


print('Prediction Probabilities: ', prob_pred_logistic_binary[:1])

unique, counts = np.unique(y_pred_logistic_binary, return_counts=True)
print('Classes: ', unique, '| Number of Class Instances: ', counts)

# probabilities e.g. below -> 58% certainty that the first element is class 0

# Prediction Probabilities: [[0.58097284 0.41902716]]
# Classes: [0 1] | Number of Class Instances: [5 5]

Model Evaluation

conf_mtx = confusion_matrix(y_data_logistic_binary, y_pred_logistic_binary)
conf_mtx

# [2, 3] [TN, FP]
# [3, 2] [FN, TP]
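
scikit-learn arranges the confusion matrix with true labels as rows and predicted labels as columns, so the four counts can also be unpacked explicitly (a short sketch):

# unpack the 2x2 matrix: [[TN, FP], [FN, TP]]
tn, fp, fn, tp = conf_mtx.ravel()
print('TN:', tn, 'FP:', fp, 'FN:', fn, 'TP:', tp)
# TN: 2 FP: 3 FN: 3 TP: 2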

scikit-learn - Machine Learning in Python

report = classification_report(y_data_logistic_binary, y_pred_logistic_binary)
print(report)
              precision    recall  f1-score   support

           0       0.40      0.40      0.40         5
           1       0.40      0.40      0.40         5

    accuracy                           0.40        10
   macro avg       0.40      0.40      0.40        10
weighted avg       0.40      0.40      0.40        10

Logistic Regression Pipelines

Dataset Preprocessing

iris_ds = load_iris()

# train/test split
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
iris_ds.data,
iris_ds.target,
test_size=0.2,
random_state=42
)
print(X_train_iris.shape, X_test_iris.shape)
# (120, 4) (30, 4)

Pipeline

pipe_iris = Pipeline([
('minmax', MinMaxScaler()),
('log_reg', LogisticRegression()),
])

pipe_iris.fit(X_train_iris, y_train_iris)
iris_score = pipe_iris.score(X_test_iris, y_test_iris)
print('Prediction Accuracy: ', iris_score.round(4)*100, '%')
# Prediction Accuracy: 96.67 %
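
Pipeline steps can be tuned together with GridSearchCV by prefixing the parameter name with the step name (a sketch based on the pipe_iris pipeline above; the C values are only an example):

# parameters of a pipeline step are addressed as '<step name>__<parameter>'
param_grid_iris = {
    'log_reg__C': [0.01, 0.1, 1.0, 10.0]
}

grid_pipe_iris = GridSearchCV(pipe_iris, param_grid_iris, cv=5)
grid_pipe_iris.fit(X_train_iris, y_train_iris)

print('Best Parameter: ', grid_pipe_iris.best_params_)
print('Test Accuracy: ', grid_pipe_iris.score(X_test_iris, y_test_iris))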

Cross Validation

Train | Test Split
!wget https://raw.githubusercontent.com/reisanar/datasets/master/Advertising.csv -P datasets
adv_df = pd.read_csv('datasets/Advertising.csv')
adv_df.head(5)
      TV  Radio  Newspaper  Sales
0  230.1   37.8       69.2   22.1
1   44.5   39.3       45.1   10.4
2   17.2   45.9       69.3    9.3
3  151.5   41.3       58.5   18.5
4  180.8   10.8       58.4   12.9
# Split ds into features and targets
X_adv = adv_df.drop('Sales', axis=1)
y_adv = adv_df['Sales']
# 70:30 train/test split
X_adv_train, X_adv_test, y_adv_train, y_adv_test = train_test_split(
X_adv, y_adv, test_size=0.3, random_state=666
)

print(X_adv_train.shape, y_adv_train.shape)
# (140, 3) (140,)
# normalize features
scaler_adv = StandardScaler()
scaler_adv.fit(X_adv_train)

X_adv_train = scaler_adv.transform(X_adv_train)
X_adv_test = scaler_adv.transform(X_adv_test)
Model Fitting
model_adv1 = Ridge(
alpha=100.0
)

model_adv1.fit(X_adv_train, y_adv_train)
Model Evaluation
y_adv_pred = model_adv1.predict(X_adv_test)

mean_squared_error(y_adv_test, y_adv_pred)
# 6.528575771818745
Adjusting Hyper Parameter
model_adv2 = Ridge(
alpha=1.0
)

model_adv2.fit(X_adv_train, y_adv_train)
y_adv_pred2 = model_adv2.predict(X_adv_test)
mean_squared_error(y_adv_test, y_adv_pred2)
# 2.3319016551123535

Train | Validation | Test Split

# 70:30 train/temp split
X_adv_train, X_adv_temp, y_adv_train, y_adv_temp = train_test_split(
X_adv, y_adv, test_size=0.3, random_state=666
)

# 50:50 test/val split
X_adv_test, X_adv_val, y_adv_test, y_adv_val = train_test_split(
X_adv_temp, y_adv_temp, test_size=0.5, random_state=666
)

print(X_adv_train.shape, X_adv_test.shape, X_adv_val.shape)
# (140, 3) (30, 3) (30, 3)
# normalize features
scaler_adv = StandardScaler()
scaler_adv.fit(X_adv_train)

X_adv_train = scaler_adv.transform(X_adv_train)
X_adv_test = scaler_adv.transform(X_adv_test)
X_adv_val = scaler_adv.transform(X_adv_val)
Model Fitting and Evaluation
model_adv3 = Ridge(
alpha=100.0
)

model_adv3.fit(X_adv_train, y_adv_train)
# do evaluation with the validation set
y_adv_pred3 = model_adv3.predict(X_adv_val)
mean_squared_error(y_adv_val, y_adv_pred3)
# 7.136230975501291
Adjusting Hyper Parameter
model_adv4 = Ridge(
alpha=1.0
)

model_adv4.fit(X_adv_train, y_adv_train)

y_adv_pred4 = model_adv4.predict(X_adv_val)
mean_squared_error(y_adv_val, y_adv_pred4)
# 2.6393803874124435
# only once you are certain that you have the best performance
# do a final evaluation with the test set
y_adv4_final_pred = model_adv4.predict(X_adv_test)
mean_squared_error(y_adv_test, y_adv4_final_pred)
# 2.024422922812264

k-fold Cross Validation

Do a train/test split, segment the training set into k folds (e.g. 5-10), and use each fold once as the validation set while training on the remaining folds. The resulting error is the average of all k errors.
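
The same procedure can be written out manually with KFold (a sketch, reusing the scaled X_adv_train / y_adv_train split and the Ridge model from the previous section):

from sklearn.model_selection import KFold

kfold = KFold(n_splits=5, shuffle=True, random_state=666)
fold_errors = []

for train_idx, val_idx in kfold.split(X_adv_train):
    # train on k-1 folds, validate on the held-out fold
    model = Ridge(alpha=100.0)
    model.fit(X_adv_train[train_idx], y_adv_train.iloc[train_idx])
    y_val_pred = model.predict(X_adv_train[val_idx])
    fold_errors.append(mean_squared_error(y_adv_train.iloc[val_idx], y_val_pred))

# the cross-validation error is the average over all k folds
print('Mean MSE over 5 folds: ', np.mean(fold_errors))

The cross_val_score helper used below does exactly this internally.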

Train-Test Split
# 70:30 train/temp split
X_adv_train, X_adv_test, y_adv_train, y_adv_test = train_test_split(
X_adv, y_adv, test_size=0.3, random_state=666
)
# normalize features
scaler_adv = StandardScaler()
scaler_adv.fit(X_adv_train)

X_adv_train = scaler_adv.transform(X_adv_train)
X_adv_test = scaler_adv.transform(X_adv_test)
Model Scoring
model_adv5 = Ridge(
alpha=100.0
)
# do a 5-fold cross-eval
scores = cross_val_score(
estimator=model_adv5,
X=X_adv_train,
y=y_adv_train,
scoring='neg_mean_squared_error',
cv=5
)

# take the mean of all five neg. error values
abs(scores.mean())
# 8.688107513529168
Adjusting Hyper Parameter
model_adv6 = Ridge(
alpha=1.0
)
# do a 5-fold cross-eval
scores = cross_val_score(
estimator=model_adv6,
X=X_adv_train,
y=y_adv_train,
scoring='neg_mean_squared_error',
cv=5
)

# take the mean of all five neg. error values
abs(scores.mean())
# 3.3419582340688576
Model Fitting and Final Evaluation
model_adv6.fit(X_adv_train, y_adv_train)

y_adv6_final_pred = model_adv6.predict(X_adv_test)
mean_squared_error(y_adv_test, y_adv6_final_pred)
# 2.3319016551123535

Cross Validate

Dataset (re-import)
adv_df = pd.read_csv('datasets/Advertising.csv')
X_adv = adv_df.drop('Sales', axis=1)
y_adv = adv_df['Sales']
# 70:30 train/test split
X_adv_train, X_adv_test, y_adv_train, y_adv_test = train_test_split(
X_adv, y_adv, test_size=0.3, random_state=666
)
# normalize features
scaler_adv = StandardScaler()
scaler_adv.fit(X_adv_train)

X_adv_train = scaler_adv.transform(X_adv_train)
X_adv_test = scaler_adv.transform(X_adv_test)
Model Scoring
model_adv7 = Ridge(
alpha=100.0
)
scores = cross_validate(
model_adv7,
X_adv_train,
y_adv_train,
scoring=[
'neg_mean_squared_error',
'neg_mean_absolute_error'
],
cv=10
)
scores_df = pd.DataFrame(scores)
scores_df
   fit_time  score_time  test_neg_mean_squared_error  test_neg_mean_absolute_error
0  0.016399    0.000749                   -12.539147                     -2.851864
1  0.000684    0.000452                    -2.806466                     -1.423516
2  0.000937    0.000782                   -11.142227                     -2.740332
3  0.001060    0.000633                    -7.237347                     -2.196963
4  0.001045    0.000738                   -11.313985                     -2.690813
5  0.000650    0.000510                    -3.169169                     -1.526568
6  0.000698    0.000429                    -6.578249                     -1.727616
7  0.000600    0.000423                    -5.740245                     -1.640964
8  0.000565    0.000463                   -10.268075                     -2.415688
9  0.000562    0.000487                   -10.641669                     -1.974407
abs(scores_df.mean())
fit_time                        0.002320
score_time                      0.000566
test_neg_mean_squared_error     8.143658
test_neg_mean_absolute_error    2.118873
dtype: float64
Adjusting Hyper Parameter
model_adv8 = Ridge(
alpha=1.0
)
scores = cross_validate(
model_adv8,
X_adv_train,
y_adv_train,
scoring=[
'neg_mean_squared_error',
'neg_mean_absolute_error'
],
cv=10
)

abs(pd.DataFrame(scores).mean())
fit_time                        0.001141
score_time                      0.000777
test_neg_mean_squared_error     3.272673
test_neg_mean_absolute_error    1.345709
dtype: float64
Model Fitting and Final Evaluation
model_adv8.fit(X_adv_train, y_adv_train)

y_adv8_final_pred = model_adv8.predict(X_adv_test)
mean_squared_error(y_adv_test, y_adv8_final_pred)
# 2.3319016551123535

Grid Search

Loop through a set of hyperparameter combinations to find the optimum.

base_elastic_net_model = ElasticNet()
param_grid = {
    'alpha': [0.1, 1, 5, 10, 50, 100],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
}
grid_model = GridSearchCV(
estimator=base_elastic_net_model,
param_grid=param_grid,
scoring='neg_mean_squared_error',
cv=5, verbose=2
)

grid_model.fit(X_adv_train, y_adv_train)

print(
'Results:\nBest Estimator: ',
grid_model.best_estimator_,
'\nBest Hyperparameter: ',
grid_model.best_params_
)

Results:

  • Best Estimator: ElasticNet(alpha=0.1, l1_ratio=1.0)
  • Best Hyperparameter: {'alpha': 0.1, 'l1_ratio': 1.0}
gridcv_results = pd.DataFrame(grid_model.cv_results_)
mean_fit_timestd_fit_timemean_score_timestd_score_timeparam_alphaparam_l1_ratioparamssplit0_test_scoresplit1_test_scoresplit2_test_scoresplit3_test_scoresplit4_test_scoremean_test_scorestd_test_scorerank_test_score
00.0011560.0001600.0004490.0000380.10.1{'alpha': 0.1, 'l1_ratio': 0.1}-1.924119-3.384152-3.588444-3.703040-5.091974-3.5383461.0072646
10.0011440.0001810.0004070.0000910.10.3{'alpha': 0.1, 'l1_ratio': 0.3}-1.867117-3.304382-3.561106-3.623188-5.061781-3.4835151.0160005
20.0006230.0000260.0002720.0000520.10.5{'alpha': 0.1, 'l1_ratio': 0.5}-1.812633-3.220727-3.539711-3.547572-5.043259-3.4327801.0284064
30.0009320.0001650.0003210.0000600.10.7{'alpha': 0.1, 'l1_ratio': 0.7}-1.750153-3.144120-3.525226-3.477228-5.034008-3.3861471.0467223
40.0007250.0001060.0002590.0000240.10.9{'alpha': 0.1, 'l1_ratio': 0.9}-1.693440-3.075686-3.518777-3.413393-5.029683-3.3461961.0651952
50.0006540.0000530.0002740.0000260.11.0{'alpha': 0.1, 'l1_ratio': 1.0}-1.667506-3.044928-3.518866-3.384363-5.031297-3.3293921.0750061
60.0005950.0000160.0002440.00000210.1{'alpha': 1, 'l1_ratio': 0.1}-8.575470-11.021534-8.212152-6.808719-10.792072-9.0819901.60419212
70.0005910.0000180.0002440.00000210.3{'alpha': 1, 'l1_ratio': 0.3}-8.131855-10.448423-7.774620-6.179358-10.071728-8.5211971.56917311
80.0006280.0000490.0002660.00002310.5{'alpha': 1, 'l1_ratio': 0.5}-7.519809-9.562473-7.261824-5.453399-9.213320-7.8021651.48178510
90.0005940.0000150.0002430.00000210.7{'alpha': 1, 'l1_ratio': 0.7}-6.614835-8.351711-6.702104-4.698977-8.230616-6.9196491.3297419
100.0007140.0001080.0002680.00003310.9{'alpha': 1, 'l1_ratio': 0.9}-5.537250-6.887828-6.148400-4.106124-7.101573-5.9562351.0784308
110.0006490.0000670.0002630.00002811.0{'alpha': 1, 'l1_ratio': 1.0}-4.932027-6.058207-5.892529-3.798441-6.472871-5.4308150.9598047
120.0006450.0000420.0002640.00004050.1{'alpha': 5, 'l1_ratio': 0.1}-21.863798-25.767488-18.768865-12.608680-23.207907-20.4433474.52090413
130.0006170.0000300.0002810.00003850.3{'alpha': 5, 'l1_ratio': 0.3}-23.626694-27.439028-20.266203-12.788078-24.609195-21.7458405.03149314
140.0005990.0000110.0002490.00001350.5{'alpha': 5, 'l1_ratio': 0.5}-26.202964-29.867138-22.527913-13.423857-26.835934-23.7715615.67591115
150.0005880.0000130.0002760.00003550.7{'alpha': 5, 'l1_ratio': 0.7}-27.768946-33.428462-23.506474-14.599984-29.112276-25.6832286.38237917
160.0005800.0000030.0002710.00000150.9{'alpha': 5, 'l1_ratio': 0.9}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
170.0005910.0000110.0002590.00002151.0{'alpha': 5, 'l1_ratio': 1.0}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
180.0006320.0000280.0002500.000012100.1{'alpha': 10, 'l1_ratio': 0.1}-26.179546-30.396420-22.386698-14.596498-27.292337-24.1703005.42932216
190.0005930.0000200.0002390.000001100.3{'alpha': 10, 'l1_ratio': 0.3}-28.704426-33.379967-24.561645-15.634153-29.883725-26.4327836.09006218
200.0005950.0000360.0002450.000013100.5{'alpha': 10, 'l1_ratio': 0.5}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
210.0006100.0000530.0002580.000015100.7{'alpha': 10, 'l1_ratio': 0.7}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
220.0005970.0000220.0002480.000015100.9{'alpha': 10, 'l1_ratio': 0.9}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
230.0006230.0000570.0003050.000076101.0{'alpha': 10, 'l1_ratio': 1.0}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
240.0006020.0000160.0002520.000013500.1{'alpha': 50, 'l1_ratio': 0.1}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
250.0005770.0000090.0002380.000001500.3{'alpha': 50, 'l1_ratio': 0.3}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
260.0006070.0000460.0002450.000010500.5{'alpha': 50, 'l1_ratio': 0.5}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
270.0005690.0000040.0002590.000012500.7{'alpha': 50, 'l1_ratio': 0.7}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
280.0005820.0000220.0002440.000011500.9{'alpha': 50, 'l1_ratio': 0.9}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
290.0006030.0000410.0002510.000015501.0{'alpha': 50, 'l1_ratio': 1.0}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
300.0006700.0001060.0002510.0000131000.1{'alpha': 100, 'l1_ratio': 0.1}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
310.0007640.0001790.0003430.0000541000.3{'alpha': 100, 'l1_ratio': 0.3}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
320.0006230.0000770.0002440.0000071000.5{'alpha': 100, 'l1_ratio': 0.5}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
330.0008170.0001560.0003290.0000761000.7{'alpha': 100, 'l1_ratio': 0.7}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
340.0005900.0000170.0002420.0000041000.9{'alpha': 100, 'l1_ratio': 0.9}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
350.0005950.0000270.0002420.0000071001.0{'alpha': 100, 'l1_ratio': 1.0}-29.868949-34.423737-25.623955-16.750237-31.056181-27.5446126.08709319
gridcv_results[
[
'param_alpha',
'param_l1_ratio'
]
].plot(title='Grid Search Hyperparameter :: Parameter', figsize=(12,8))

scikit-learn - Machine Learning in Python

gridcv_results[
[
'mean_fit_time',
'std_fit_time',
'mean_score_time'
]
].plot(title='Grid Search Hyperparameter :: Timing', figsize=(12,8))

scikit-learn - Machine Learning in Python

gridcv_results[
[
'split0_test_score',
'split1_test_score',
'split2_test_score',
'split3_test_score',
'split4_test_score',
'mean_test_score',
'std_test_score',
'rank_test_score'
]
].plot(title='Grid Search Hyperparameter :: Parameter', figsize=(12,8))

scikit-learn - Machine Learning in Python

Model Evaluation
y_grid_pred = grid_model.predict(X_adv_test)

mean_squared_error(y_adv_test, y_grid_pred)
# 2.380865536033581

Supervised Learning - KNN Algorithm

Dataset

wine = load_wine()
print(wine.data.shape)
print(wine.feature_names)
print(wine.data[:1])

# (178, 13)
# ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
# [[1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00
# 2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]]
wine_df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
wine_df.head(2).T
                                    0        1
alcohol                         14.23    13.20
malic_acid                       1.71     1.78
ash                              2.43     2.14
alcalinity_of_ash               15.60    11.20
magnesium                      127.00   100.00
total_phenols                    2.80     2.65
flavanoids                       3.06     2.76
nonflavanoid_phenols             0.28     0.26
proanthocyanins                  2.29     1.28
color_intensity                  5.64     4.38
hue                              1.04     1.05
od280/od315_of_diluted_wines     3.92     3.40
proline                       1065.00  1050.00

Data Pre-processing

# normalization
scaler = MinMaxScaler()
scaler.fit(wine.data)
wine_norm = scaler.fit_transform(wine.data)
# train/test split
X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(
wine_norm,
wine.target,
test_size=0.3
)

print(X_train_wine.shape, X_test_wine.shape)
# (124, 13) (54, 13)

Model Fitting

# model for k=3
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_wine, y_train_wine)

y_pred_wine_knn3 = knn.predict(X_test_wine)
print('Accuracy Score: ', (accuracy_score(y_test_wine, y_pred_wine_knn3)*100).round(2), '%')
# Accuracy Score: 98.15 %
# model for k=5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_wine, y_train_wine)

y_pred_wine_knn5 = knn.predict(X_test_wine)
print('Accuracy Score: ', (accuracy_score(y_test_wine, y_pred_wine_knn5)*100).round(2), '%')
# Accuracy Score: 98.15 %
# model for k=7
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train_wine, y_train_wine)

y_pred_wine_knn7 = knn.predict(X_test_wine)
print('Accuracy Score: ', (accuracy_score(y_test_wine, y_pred_wine_knn7)*100).round(2), '%')
# Accuracy Score: 96.3 %
# model for k=9
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train_wine, y_train_wine)

y_pred_wine_knn9 = knn.predict(X_test_wine)
print('Accuracy Score: ', (accuracy_score(y_test_wine, y_pred_wine_knn9)*100).round(2), '%')
# Accuracy Score: 96.3 %
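
Instead of re-fitting the model by hand for every k, the same comparison can be written as a loop (a short sketch reusing the wine train/test split from above):

# scan a range of k values and compare the test accuracy
for k in [3, 5, 7, 9, 11, 13]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_wine, y_train_wine)
    acc = accuracy_score(y_test_wine, knn.predict(X_test_wine))
    print('k =', k, '=> Accuracy Score: ', (acc*100).round(2), '%')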

Supervised Learning - Decision Tree Classifier

  • Does not require normalization
  • Is not sensitive to missing values

Dataset

!wget https://gist.githubusercontent.com/Dviejopomata/ea5869ba4dcff84f8c294dc7402cd4a9/raw/4671f90b8b04ba4db9d67acafaa4c0827cd233c2/bill_authentication.csv -P datasets
bill_auth_df = pd.read_csv('datasets/bill_authentication.csv')
bill_auth_df.head(3)
   Variance  Skewness  Curtosis  Entropy  Class
0    3.6216    8.6661   -2.8073 -0.44699      0
1    4.5459    8.1674   -2.4586 -1.46210      0
2    3.8660   -2.6383    1.9242  0.10645      0

Preprocessing

# remove target feature from training set
X_bill = bill_auth_df.drop('Class', axis=1)
y_bill = bill_auth_df['Class']
X_train_bill, X_test_bill, y_train_bill, y_test_bill = train_test_split(X_bill, y_bill, test_size=0.2)

Model Fitting

tree_classifier = DecisionTreeClassifier()

tree_classifier.fit(X_train_bill, y_train_bill)

Evaluation

y_pred_bill = tree_classifier.predict(X_test_bill)
conf_mtx_bill = confusion_matrix(y_test_bill, y_pred_bill)
conf_mtx_bill

# array([[150, 2],
# [ 4, 119]])
conf_mtx_bill_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_bill,
display_labels=[False,True]
)

conf_mtx_bill_plot.plot()
plt.show()

scikit-learn - Machine Learning in Python

report_bill = classification_report(
y_test_bill, y_pred_bill
)
print(report_bill)
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       152
           1       0.98      0.97      0.98       123

    accuracy                           0.98       275
   macro avg       0.98      0.98      0.98       275
weighted avg       0.98      0.98      0.98       275
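
The fitted tree itself can also be inspected (a small sketch using sklearn.tree.plot_tree, which is not imported in the block at the top; limited to the first two levels for readability):

from sklearn.tree import plot_tree

# draw the top levels of the fitted decision tree
plt.figure(figsize=(12,6))
plot_tree(
    tree_classifier,
    max_depth=2,
    feature_names=list(X_bill.columns),
    class_names=['0', '1'],
    filled=True
)
plt.show()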

Supervised Learning - Random Forest Classifier

  • Does not require normalization
  • Is not sensitive to missing values
  • Low risk of overfitting
  • Efficient with large datasets
  • High accuracy

Dataset

!wget https://raw.githubusercontent.com/xjcjiacheng/data-analysis/master/heart%20disease%20UCI/heart.csv -P datasets
heart_df = pd.read_csv('datasets/heart.csv')
heart_df.head(5)
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  ca  thal  target
0   63    1   3       145   233    1        0      150      0      2.3      0   0     1       1
1   37    1   2       130   250    0        1      187      0      3.5      0   0     2       1
2   41    0   1       130   204    0        0      172      0      1.4      2   0     2       1
3   56    1   1       120   236    0        1      178      0      0.8      2   0     2       1
4   57    0   0       120   354    0        1      163      1      0.6      2   0     2       1

Preprocessing

# remove target feature from training set
X_heart = heart_df.drop('target', axis=1)
y_heart = heart_df['target']
X_train_heart, X_test_heart, y_train_heart, y_test_heart = train_test_split(
X_heart,
y_heart,
test_size=0.2,
random_state=0
)

Model Fitting

forest_classifier = RandomForestClassifier(n_estimators=10, criterion='entropy')

forest_classifier.fit(X_train_heart, y_train_heart)

Evaluation

y_pred_heart = forest_classifier.predict(X_test_heart)
conf_mtx_heart = confusion_matrix(y_test_heart, y_pred_heart)
conf_mtx_heart

# array([[24, 3],
# [ 5, 29]])
conf_mtx_heart_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_heart,
display_labels=[False,True]
)

conf_mtx_heart_plot.plot()
plt.show()

scikit-learn - Machine Learning in Python

report_heart = classification_report(
y_test_heart, y_pred_heart
)
print(report_heart)
              precision    recall  f1-score   support

           0       0.83      0.89      0.86        27
           1       0.91      0.85      0.88        34

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61

Random Forest Hyperparameter Tuning

Testing Hyperparameters

rdnfor_classifier = RandomForestClassifier(
n_estimators=2,
min_samples_split=2,
min_samples_leaf=1,
criterion='entropy'
)
rdnfor_classifier.fit(X_train_heart, y_train_heart)
rdnfor_pred = rdnfor_classifier.predict(X_test_heart)
print('Accuracy Score: ', accuracy_score(y_test_heart, rdnfor_pred).round(4)*100, '%')

# Accuracy Score: 73.77 %

Grid-Search Cross-Validation

Try a set of values for selected hyperparameters to find the optimal configuration.

param_grid = {
    'n_estimators': [5, 25, 50, 75, 100, 125],
    'min_samples_split': [1,2,3],
    'min_samples_leaf': [1,2,3],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(
estimator = rdnfor_classifier,
param_grid = param_grid
)

grid_search.fit(X_train_heart, y_train_heart)
print('Best Parameter: ', grid_search.best_params_)
# Best Parameter: {
#     'criterion': 'entropy',
#     'max_features': 'sqrt',
#     'min_samples_leaf': 2,
#     'min_samples_split': 1,
#     'n_estimators': 25
# }
rdnfor_classifier_optimized = RandomForestClassifier(
n_estimators=25,
min_samples_split=1,
min_samples_leaf=2,
criterion='entropy',
max_features='sqrt'
)

rdnfor_classifier_optimized.fit(X_train_heart, y_train_heart)
rdnfor_pred_optimized = rdnfor_classifier_optimized.predict(X_test_heart)
print('Accuracy Score: ', accuracy_score(y_test_heart, rdnfor_pred_optimized).round(4)*100, '%')

# Accuracy Score: 85.25 %

Random Forest Classifier 1 - Penguins

!wget https://github.com/remijul/dataset/raw/master/penguins_size.csv -P datasets
peng_df = pd.read_csv('datasets/penguins_size.csv')
peng_df = peng_df.dropna()
peng_df.head(5)
  species     island  culmen_length_mm  culmen_depth_mm  flipper_length_mm  body_mass_g     sex
0  Adelie  Torgersen              39.1             18.7              181.0       3750.0    MALE
1  Adelie  Torgersen              39.5             17.4              186.0       3800.0  FEMALE
2  Adelie  Torgersen              40.3             18.0              195.0       3250.0  FEMALE
4  Adelie  Torgersen              36.7             19.3              193.0       3450.0  FEMALE
5  Adelie  Torgersen              39.3             20.6              190.0       3650.0    MALE
# drop labels and encode string values
X_peng = pd.get_dummies(peng_df.drop('species', axis=1),drop_first=True)
y_peng = peng_df['species']
# train/test split
X_peng_train, X_peng_test, y_peng_train, y_peng_test = train_test_split(
X_peng,
y_peng,
test_size=0.3,
random_state=42
)
# creating the model
rfc_peng = RandomForestClassifier(
n_estimators=10,
max_features='sqrt',
random_state=42
)
# model training and running predictions
rfc_peng.fit(X_peng_train, y_peng_train)
peng_pred = rfc_peng.predict(X_peng_test)
print('Accuracy Score: ',accuracy_score(y_peng_test, peng_pred, normalize=True).round(4)*100, '%')
# Accuracy Score: 98.02 %

Feature Importance

# feature importance for classification
peng_index = ['importance']
peng_data_columns = pd.Series(X_peng.columns)
peng_importance_array = rfc_peng.feature_importances_
peng_importance_df = pd.DataFrame(peng_importance_array, peng_data_columns, peng_index)
peng_importance_df
                   importance
culmen_length_mm     0.288928
culmen_depth_mm      0.111021
flipper_length_mm    0.357994
body_mass_g          0.025477
island_Dream         0.178498
island_Torgersen     0.031042
sex_FEMALE           0.004716
sex_MALE             0.002324
peng_importance_df.sort_values(
by='importance',
ascending=False
).plot(
kind='barh',
title='Feature Importance for Species Classification',
figsize=(12,4)
)

scikit-learn - Machine Learning in Python

Model Evaluation

report_peng = classification_report(y_peng_test, peng_pred)
print(report_peng)
              precision    recall  f1-score   support

      Adelie       0.98      0.98      0.98        49
   Chinstrap       0.94      0.94      0.94        18
      Gentoo       1.00      1.00      1.00        34

    accuracy                           0.98       101
   macro avg       0.97      0.97      0.97       101
weighted avg       0.98      0.98      0.98       101
conf_mtx_peng = confusion_matrix(y_peng_test, peng_pred)

conf_mtx_peng_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_peng
)

conf_mtx_peng_plot.plot(cmap='plasma')

scikit-learn - Machine Learning in Python

Random Forest Classifier - Banknote Authentication

!wget https://github.com/jbrownlee/Datasets/raw/master/banknote_authentication.csv -P datasets
money_df = pd.read_csv('datasets/data-banknote-authentication.csv')
money_df.head(5)
   Variance_Wavelet  Skewness_Wavelet  Curtosis_Wavelet  Image_Entropy  Class
0           3.62160            8.6661           -2.8073       -0.44699      0
1           4.54590            8.1674           -2.4586       -1.46210      0
2           3.86600           -2.6383            1.9242        0.10645      0
3           3.45660            9.5228           -4.0112       -3.59440      0
4           0.32924           -4.4552            4.5718       -0.98880      0
sns.pairplot(money_df, hue='Class', palette='winter')

scikit-learn - Machine Learning in Python

# drop label for training
X_money = money_df.drop('Class', axis=1)
y_money = money_df['Class']
print(X_money.shape, y_money.shape)
X_money_train, X_money_test, y_money_train, y_money_test = train_test_split(
X_money,
y_money,
test_size=0.15,
random_state=42
)

Grid Search for Hyperparameters

rfc_money_base = RandomForestClassifier(oob_score=True)
param_grid = {
    'n_estimators': [64, 96, 128, 160, 192],
    'max_features': [2,3,4],
    'bootstrap': [True, False]
}
grid_money = GridSearchCV(rfc_money_base, param_grid) 
grid_money.fit(X_money_train, y_money_train)
grid_money.best_params_
# {'bootstrap': True, 'max_features': 2, 'n_estimators': 96}

Model Training and Evaluation

rfc_money = RandomForestClassifier(
bootstrap=True,
max_features=2,
n_estimators=96,
oob_score=True
)
rfc_money.fit(X_money_train, y_money_train)
print('Out-of-Bag Score: ', rfc_money.oob_score_.round(4)*100, '%')
# Out-of-Bag Score: 99.14 %
money_pred = rfc_money.predict(X_money_test)
money_report = classification_report(y_money_test, money_pred)
print(money_report)
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       111
           1       1.00      0.99      0.99        95

    accuracy                           1.00       206
   macro avg       1.00      0.99      1.00       206
weighted avg       1.00      1.00      1.00       206
conf_mtx_money = confusion_matrix(y_money_test, money_pred)

conf_mtx_money_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_money
)

conf_mtx_money_plot.plot(cmap='plasma')

scikit-learn - Machine Learning in Python

Optimizations

# verify number of estimators found by grid search
errors = []
missclassifications = []

for n in range(1,200):
    rfc = RandomForestClassifier(n_estimators=n, max_features=2)
    rfc.fit(X_money_train, y_money_train)
    preds = rfc.predict(X_money_test)

    err = 1 - accuracy_score(y_money_test, preds)
    errors.append(err)

    n_missed = np.sum(preds != y_money_test)
    missclassifications.append(n_missed)
plt.figure(figsize=(12,4))
plt.title('Errors as a Function of n_estimators')
plt.xlabel('Estimators')
plt.ylabel('Error Score')
plt.plot(range(1,200), errors)
# there is no noticeable improvement above ~10 estimators

scikit-learn - Machine Learning in Python

plt.figure(figsize=(12,4))
plt.title('Misclassifications as a Function of n_estimators')
plt.xlabel('Estimators')
plt.ylabel('Misclassifications')
plt.plot(range(1,200), missclassifications)
# and the same for misclassifications

scikit-learn - Machine Learning in Python

Random Forest Regressor

Comparing different regression models to a random forest regression model.

# dataset
!wget https://github.com/vineetsingh028/Rock_Density_Prediction/raw/master/rock_density_xray.csv -P datasets
rock_df = pd.read_csv('datasets/rock_density_xray.csv')
rock_df.columns = ['Signal', 'Density']
rock_df.head(5)
      Signal   Density
0  72.945124  2.456548
1  14.229877  2.601719
2  36.597334  1.967004
3   9.578899  2.300439
4  21.765897  2.452374
plt.figure(figsize=(12,5))
plt.title('X-Ray Bounce Signal Strength vs Rock Density')
sns.scatterplot(data=rock_df, x='Signal', y='Density')
# the signal vs density plot follows a sine wave - spoiler alert: simpler algorithms
# will fail trying to fit this dataset...

scikit-learn - Machine Learning in Python

# train-test split
X_rock = rock_df['Signal'].values.reshape(-1,1)
y_rock = rock_df['Density']

X_rock_train, X_rock_test, y_rock_train, y_rock_test = train_test_split(
X_rock,
y_rock,
test_size=0.1,
random_state=42
)
# normalization
scaler = StandardScaler()
X_rock_train_scaled = scaler.fit_transform(X_rock_train)
X_rock_test_scaled = scaler.transform(X_rock_test)

vs Linear Regression

lr_rock = LinearRegression()
lr_rock.fit(X_rock_train_scaled, y_rock_train)
lr_rock_preds = lr_rock.predict(X_rock_test_scaled)

mae = mean_absolute_error(y_rock_test, lr_rock_preds)
rmse = np.sqrt(mean_squared_error(y_rock_test, lr_rock_preds))
mean_abs = y_rock_test.mean()
avg_error = mae * 100 / mean_abs

print('MAE: ', mae.round(2), 'RMSE: ', rmse.round(2), 'Relative Avg. Error: ', avg_error.round(2), '%')
# MAE: 0.24 RMSE: 0.3 Relative Avg. Error: 10.93 %
# visualize predictions
plt.figure(figsize=(12,5))
plt.plot(X_rock_test, lr_rock_preds, c='mediumspringgreen')
sns.scatterplot(data=rock_df, x='Signal', y='Density', c='dodgerblue')
plt.title('Linear Regression Predictions')
plt.show()
# the error appears small because the linear regression predicts close to the average,
# but a straight line cannot follow the contours of the underlying sine-wave shaped function

scikit-learn - Machine Learning in Python

vs Polynomial Regression

# helper function
def run_model(model, X_train, y_train, X_test, y_test, df):

    # FIT MODEL
    model.fit(X_train, y_train)

    # EVALUATE
    y_preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_preds)
    rmse = np.sqrt(mean_squared_error(y_test, y_preds))
    mean_abs = y_test.mean()
    avg_error = mae * 100 / mean_abs
    print('MAE: ', mae.round(2), 'RMSE: ', rmse.round(2), 'Relative Avg. Error: ', avg_error.round(2), '%')

    # PLOT RESULTS
    signal_range = np.arange(0,100)
    output = model.predict(signal_range.reshape(-1,1))

    plt.figure(figsize=(12,5))
    sns.scatterplot(data=df, x='Signal', y='Density', c='dodgerblue')
    plt.plot(signal_range, output, c='mediumspringgreen')
    plt.title('Regression Predictions')
    plt.show()
# test helper on previous linear regression
run_model(
model=lr_rock,
X_train=X_rock_train,
y_train=y_rock_train,
X_test=X_rock_test,
y_test=y_rock_test,
df=rock_df
)

MAE: 0.24 RMSE: 0.3 Relative Avg. Error: 10.93 %

scikit-learn - Machine Learning in Python

# build polynomial model
pipe_poly = make_pipeline(
PolynomialFeatures(degree=6),
LinearRegression()
)
# run model
run_model(
model=pipe_poly,
X_train=X_rock_train,
y_train=y_rock_train,
X_test=X_rock_test,
y_test=y_rock_test,
df=rock_df
)
# with a HARD LIMIT of 0-100 for the x-ray signal a 6th degree polynomial is a good fit

MAE: 0.13 RMSE: 0.14 Relative Avg. Error: 5.7 %

scikit-learn - Machine Learning in Python

vs KNeighbors Regression

# build KNN regression models for several k values
k_values = [1,5,10,25]

for k in k_values:
    model = KNeighborsRegressor(n_neighbors=k)
    print(model)

    # run model
    run_model(
        model,
        X_train=X_rock_train,
        y_train=y_rock_train,
        X_test=X_rock_test,
        y_test=y_rock_test,
        df=rock_df
    )

KNeighborsRegressor(n_neighbors=1)

MAE: 0.12 RMSE: 0.17 Relative Avg. Error: 5.47 %

scikit-learn - Machine Learning in Python

KNeighborsRegressor()

MAE: 0.13 RMSE: 0.15 Relative Avg. Error: 5.9 %

scikit-learn - Machine Learning in Python

KNeighborsRegressor(n_neighbors=10)

MAE: 0.12 RMSE: 0.14 Relative Avg. Error: 5.44 %

scikit-learn - Machine Learning in Python

KNeighborsRegressor(n_neighbors=25)

MAE: 0.14 RMSE: 0.16 Relative Avg. Error: 6.18 %

scikit-learn - Machine Learning in Python

vs Decision Tree Regression

tree_model = DecisionTreeRegressor()

# run model
run_model(
model=tree_model,
X_train=X_rock_train,
y_train=y_rock_train,
X_test=X_rock_test,
y_test=y_rock_test,
df=rock_df
)

MAE: 0.12 RMSE: 0.17 Relative Avg. Error: 5.47 %

scikit-learn - Machine Learning in Python

vs Support Vector Regression

svr_rock = svm.SVR()

param_grid = {
    'C': [0.01, 0.1, 1, 5, 10, 100, 1000],
    'gamma': ['auto', 'scale']
}

rock_grid = GridSearchCV(svr_rock, param_grid)
# run model
run_model(
model=rock_grid,
X_train=X_rock_train,
y_train=y_rock_train,
X_test=X_rock_test,
y_test=y_rock_test,
df=rock_df
)

MAE: 0.13 RMSE: 0.14 Relative Avg. Error: 5.75 %

scikit-learn - Machine Learning in Python

vs Gradient Boosting Regression

gbr_rock = GradientBoostingRegressor()

# run model
run_model(
model=gbr_rock,
X_train=X_rock_train,
y_train=y_rock_train,
X_test=X_rock_test,
y_test=y_rock_test,
df=rock_df
)

MAE: 0.13 RMSE: 0.15 Relative Avg. Error: 5.76 %

scikit-learn - Machine Learning in Python

vs Ada Boosting Regression

abr_rock = AdaBoostRegressor()

# run model
run_model(
model=abr_rock,
X_train=X_rock_train,
y_train=y_rock_train,
X_test=X_rock_test,
y_test=y_rock_test,
df=rock_df
)

MAE: 0.13 RMSE: 0.14 Relative Avg. Error: 5.67 %

scikit-learn - Machine Learning in Python

Finally, Random Forest Regression

rfr_rock = RandomForestRegressor(n_estimators=10)

# run model
run_model(
model=rfr_rock,
X_train=X_rock_train,
y_train=y_rock_train,
X_test=X_rock_test,
y_test=y_rock_test,
df=rock_df
)

MAE: 0.11 RMSE: 0.14 Relative Avg. Error: 5.1 %

scikit-learn - Machine Learning in Python

Supervised Learning - SVC Model

Support Vector Machines (SVMs) are a set of supervised learning methods used for classification, regression and outlier detection. A minimal toy example follows the list below.

  • Effective in high dimensional spaces.
  • Still effective in cases where number of dimensions is greater than the number of samples.
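
A small sketch of the core idea (synthetic points, not part of the wheat dataset below) - only the samples closest to the decision boundary end up as support vectors:

# two tiny, linearly separable point clouds
X_toy = np.array([[1.0, 1.0], [1.5, 2.0], [2.0, 1.5], [5.0, 5.0], [5.5, 6.0], [6.0, 5.5]])
y_toy = np.array([0, 0, 0, 1, 1, 1])

clf_toy = svm.SVC(kernel='linear', C=1.0)
clf_toy.fit(X_toy, y_toy)

# the decision boundary is defined only by the points closest to it
print('Support vectors:\n', clf_toy.support_vectors_)
print('Support vectors per class: ', clf_toy.n_support_)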

Dataset

Measurements of geometrical properties of kernels belonging to three different varieties of wheat:

  • A: Area
  • P: Perimeter
  • C = 4πA/P²: Compactness (re-derived as a quick check below the table)
  • LK: Length of kernel
  • WK: Width of kernel
  • A_Coef: Asymmetry coefficient
  • LKG: Length of kernel groove
!wget https://raw.githubusercontent.com/prasertcbs/basic-dataset/master/Seed_Data.csv -P datasets
wheat_df = pd.read_csv('datasets/Seed_Data.csv')
wheat_df.head(5)
       A      P       C     LK     WK  A_Coef    LKG  target
0  15.26  14.84  0.8710  5.763  3.312   2.221  5.220       0
1  14.88  14.57  0.8811  5.554  3.333   1.018  4.956       0
2  14.29  14.09  0.9050  5.291  3.337   2.699  4.825       0
3  13.84  13.94  0.8955  5.324  3.379   2.259  4.805       0
4  16.14  14.99  0.9034  5.658  3.562   1.355  5.175       0
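
As a quick sanity check (a sketch, not part of the original notebook), the compactness column can be re-derived from area and perimeter with the formula from the feature list above:

# C = 4*pi*A / P^2 - compare the recomputed value against the C column
compactness_check = 4 * np.pi * wheat_df['A'] / wheat_df['P']**2
print(pd.concat([wheat_df['C'], compactness_check.rename('C recomputed')], axis=1).head())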
wheat_df.info()

# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 210 entries, 0 to 209
# Data columns (total 8 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 A 210 non-null float64
# 1 P 210 non-null float64
# 2 C 210 non-null float64
# 3 LK 210 non-null float64
# 4 WK 210 non-null float64
# 5 A_Coef 210 non-null float64
# 6 LKG 210 non-null float64
# 7 target 210 non-null int64
# dtypes: float64(7), int64(1)
# memory usage: 13.2 KB

Preprocessing

# remove target feature from training set
X_wheat = wheat_df.drop('target', axis=1)
y_wheat = wheat_df['target']

print(X_wheat.shape, y_wheat.shape)
# (210, 7) (210,)
# train/test split
X_train_wheat, X_test_wheat, y_train_wheat, y_test_wheat = train_test_split(
X_wheat,
y_wheat,
test_size=0.2,
random_state=42
)
# normalization - fit the scaler on the training set only
sc_wheat = StandardScaler()
X_train_wheat = sc_wheat.fit_transform(X_train_wheat)
X_test_wheat = sc_wheat.transform(X_test_wheat)

Model Training

# SVM classifier fitting
clf_wheat = svm.SVC()
clf_wheat.fit(X_train_wheat, y_train_wheat)

Model Evaluation

# Predictions
y_wheat_pred = clf_wheat.predict(X_test_wheat)
print(
'Accuracy Score: ',
accuracy_score(y_test_wheat, y_wheat_pred, normalize=True).round(4)*100, '%'
)
# Accuracy Score: 90.48 %
report_wheat = classification_report(
y_test_wheat, y_wheat_pred
)
print(report_wheat)
              precision    recall  f1-score   support

           0       0.82      0.82      0.82        11
           1       1.00      0.93      0.96        14
           2       0.89      0.94      0.91        17

    accuracy                           0.90        42
   macro avg       0.90      0.90      0.90        42
weighted avg       0.91      0.90      0.91        42
conf_mtx_wheat = confusion_matrix(y_test_wheat, y_wheat_pred)
conf_mtx_wheat

# array([[ 9, 0, 2],
# [ 1, 13, 0],
# [ 1, 0, 16]])
conf_mtx_wheat_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_wheat
)

conf_mtx_wheat_plot.plot()
plt.show()

scikit-learn - Machine Learning in Python

Margin Plots for Support Vector Classifier

# get dataset
!wget https://github.com/alpeshraj/mouse_viral_study/raw/main/mouse_viral_study.csv -P datasets
mice_df = pd.read_csv('datasets/mouse_viral_study.csv')
mice_df.head(5)
   Med_1_mL  Med_2_mL  Virus Present
0  6.508231  8.582531              0
1  4.126116  3.073459              1
2  6.427870  6.369758              0
3  3.672953  4.905215              1
4  1.580321  2.440562              1
sns.scatterplot(data=mice_df, x='Med_1_mL',y='Med_2_mL',hue='Virus Present', palette='winter')

scikit-learn - Machine Learning in Python

# visualizing a hyperplane to separate the two features
sns.scatterplot(data=mice_df, x='Med_1_mL',y='Med_2_mL',hue='Virus Present', palette='winter')

x = np.linspace(0,10,100)
m = -1
b = 11
y = m*x + b

plt.plot(x,y,c='fuchsia')

scikit-learn - Machine Learning in Python

SVC with a Linear Kernel

# using a support vector classifier to maximize the margin between both classes

y_vir = mice_df['Virus Present']
X_vir = mice_df.drop('Virus Present', axis=1)

# kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}
# the smaller the C value the more feature vectors will be inside the margin
model_vir = svm.SVC(kernel='linear', C=1000)

model_vir.fit(X_vir, y_vir)
# import helper function
from helper.svm_margin_plot import plot_svm_boundary
plot_svm_boundary(model_vir, X_vir, y_vir)

scikit-learn - Machine Learning in Python

# the smaller the C value the more feature vectors will be inside the margin
model_vir_low_reg = svm.SVC(kernel='linear', C=0.005)
model_vir_low_reg.fit(X_vir, y_vir)
plot_svm_boundary(model_vir_low_reg, X_vir, y_vir)

scikit-learn - Machine Learning in Python

SVC with a Radial Basis Function Kernel

model_vir_rbf = svm.SVC(kernel='rbf', C=1)
model_vir_rbf.fit(X_vir, y_vir)
plot_svm_boundary(model_vir_rbf, X_vir, y_vir)

scikit-learn - Machine Learning in Python

# gamma : {'scale', 'auto'} or float, default='scale'
# - if gamma='scale' (default) is passed then it uses 1 / (n_features * X.var()) as value of gamma
# - if 'auto', uses 1 / n_features
# - if float, must be non-negative
model_vir_rbf_auto_gamma = svm.SVC(kernel='rbf', C=1, gamma='auto')
model_vir_rbf_auto_gamma.fit(X_vir, y_vir)
plot_svm_boundary(model_vir_rbf_auto_gamma, X_vir, y_vir)

scikit-learn - Machine Learning in Python
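
For reference, the value that gamma='scale' resolves to can be computed by hand (a small sketch following the formula quoted in the comments above; X_vir is the feature DataFrame defined earlier):

# gamma='scale' corresponds to 1 / (n_features * X.var())
gamma_scale = 1 / (X_vir.shape[1] * X_vir.to_numpy().var())
print('gamma (scale): ', gamma_scale)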

SVC with a Sigmoid Kernel

model_vir_sigmoid = svm.SVC(kernel='sigmoid', gamma='scale')
model_vir_sigmoid.fit(X_vir, y_vir)
plot_svm_boundary(model_vir_sigmoid, X_vir, y_vir)

scikit-learn - Machine Learning in Python

SVC with a Polynomial Kernel

model_vir_poly = svm.SVC(kernel='poly', C=1, degree=2)
model_vir_poly.fit(X_vir, y_vir)
plot_svm_boundary(model_vir_poly, X_vir, y_vir)

scikit-learn - Machine Learning in Python

Grid Search for Support Vector Classifier

svm_base_model = svm.SVC()

param_grid = {
    'C': [0.01, 0.1, 1],
    'kernel': ['linear', 'rbf']
}
grid = GridSearchCV(svm_base_model, param_grid) 
grid.fit(X_vir, y_vir)
grid.best_params_
# {'C': 0.01, 'kernel': 'linear'}

Support Vector Regression

# dataset
!wget https://github.com/fsdhakan/ML/raw/main/cement_slump.csv -P datasets
cement_df = pd.read_csv('datasets/cement_slump.csv')
cement_df.head(5)
   Cement   Slag  Fly ash  Water    SP  Coarse Aggr.  Fine Aggr.  SLUMP(cm)  FLOW(cm)  Compressive Strength (28-day)(Mpa)
0   273.0   82.0    105.0  210.0   9.0         904.0       680.0       23.0      62.0                                34.99
1   163.0  149.0    191.0  180.0  12.0         843.0       746.0        0.0      20.0                                41.14
2   162.0  148.0    191.0  179.0  16.0         840.0       743.0        1.0      20.0                                41.81
3   162.0  148.0    190.0  179.0  19.0         838.0       741.0        3.0      21.5                                42.08
4   154.0  112.0    144.0  220.0  10.0         923.0       658.0       20.0      64.0                                26.82
plt.figure(figsize=(8,8))
sns.heatmap(cement_df.corr(), annot=True, cmap='viridis')

scikit-learn - Machine Learning in Python

# drop labels
X_cement = cement_df.drop('Compressive Strength (28-day)(Mpa)', axis=1)
y_cement = cement_df['Compressive Strength (28-day)(Mpa)']
 # train/test split
X_train_cement, X_test_cement, y_train_cement, y_test_cement = train_test_split(
X_cement,
y_cement,
test_size=0.3,
random_state=42
)
# normalize
scaler = StandardScaler()
X_train_cement_scaled = scaler.fit_transform(X_train_cement)
X_test_cement_scaled = scaler.transform(X_test_cement)

Base Model Run

base_model_cement = svm.SVR()
base_model_cement.fit(X_train_cement_scaled, y_train_cement)

base_model_predictions = base_model_cement.predict(X_test_cement_scaled)
mae = mean_absolute_error(y_test_cement, base_model_predictions)
mse = mean_squared_error(y_test_cement, base_model_predictions)
mean_abs = y_test_cement.mean()
avg_error = mae * 100 / mean_abs

print('MAE: ', mae.round(2), 'MSE: ', mse.round(2), 'Relative Avg. Error: ', avg_error.round(2), '%')

# MAE:  4.68  MSE:  36.95  Relative Avg. Error:  12.75 %

Grid Search for better Hyperparameter

param_grid = {
    'C': [0.001, 0.01, 0.1, 0.5, 1],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
    'degree': [2,3,4],
    'epsilon': [0, 0.01, 0.1, 0.5, 1, 2]
}
cement_grid = GridSearchCV(base_model_cement, param_grid)
cement_grid.fit(X_train_cement_scaled, y_train_cement)
cement_grid.best_params_
# {'C': 1, 'degree': 2, 'epsilon': 2, 'gamma': 'scale', 'kernel': 'linear'}
cement_grid_predictions = cement_grid.predict(X_test_cement_scaled)
mae_grid = mean_absolute_error(y_test_cement, cement_grid_predictions)
rmse_grid = mean_squared_error(y_test_cement, cement_grid_predictions)
mean_abs = y_test_cement.mean()
avg_error_grid = mae_grid * 100 / mean_abs

print('MAE: ', mae_grid.round(2), 'RMSE: ', rmse_grid.round(2), 'Relative Avg. Error: ', avg_error_grid.round(2), '%')
MAE:  1.85  RMSE:  5.2  Relative Avg. Error:  5.05 %
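To see how the other hyperparameter combinations performed, the fitted grid's cv_results_ attribute can be inspected (a short sketch; the selected columns are standard cv_results_ keys):

cv_results = pd.DataFrame(cement_grid.cv_results_)
# rank_test_score == 1 marks the best parameter combination
cv_results[['params', 'mean_test_score', 'rank_test_score']].sort_values('rank_test_score').head(5)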

Example Task - Wine Fraud

Data Exploration

# dataset
!wget https://github.com/CAPGAGA/Fraud-in-Wine/raw/main/wine_fraud.csv -P datasets
wine_df = pd.read_csv('datasets/wine_fraud.csv')
wine_df.head(5)
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualitytype
07.40.700.001.90.07611.034.00.99783.510.569.4Legitred
17.80.880.002.60.09825.067.00.99683.200.689.8Legitred
27.80.760.042.30.09215.054.00.99703.260.659.8Legitred
311.20.280.561.90.07517.060.00.99803.160.589.8Legitred
47.40.700.001.90.07611.034.00.99783.510.569.4Legitred
wine_df.value_counts('quality')
quality
Legit    6251
Fraud     246
dtype: int64
wine_df['quality'].value_counts().plot(
kind='bar',
figsize=(10,5),
title='Wine - Quality distribution')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(10, 5))
plt.title('Wine - Quality distribution by Type')

sns.countplot(
data=wine_df,
x='quality',
hue='type',
palette='winter'
)

plt.savefig('assets/Scikit_Learn_22.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

wine_df_white = wine_df[wine_df['type'] == 'white']
wine_df_red = wine_df[wine_df['type'] == 'red']
# fraud percentage by wine type
legit_white_wines = wine_df_white.value_counts('quality')['Legit']
fraud_white_wines = wine_df_white.value_counts('quality')['Fraud']
white_fraud_percentage = fraud_white_wines * 100 / (legit_white_wines + fraud_white_wines)

legit_red_wines = wine_df_red.value_counts('quality')['Legit']
fraud_red_wines = wine_df_red.value_counts('quality')['Fraud']
red_fraud_percentage = fraud_red_wines * 100 / (legit_red_wines + fraud_red_wines)

print(
'Fraud Percentage: \nWhite Wines: ',
white_fraud_percentage.round(2),
'% \nRed Wines: ',
red_fraud_percentage.round(2),
'%'
)
Fraud Percentage:
White Wines:3.74 %
Red Wines:3.94 %
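The same percentages can also be obtained directly with a normalized value count per wine type (an alternative sketch):

# share of Legit/Fraud labels within each wine type (in percent)
wine_df.groupby('type')['quality'].value_counts(normalize=True) * 100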
# make features numeric
feature_map = {
    'Legit': 0,
    'Fraud': 1,
    'red': 0,
    'white': 1
}

wine_df['quality_enc'] = wine_df['quality'].map(feature_map)
wine_df['type_enc'] = wine_df['type'].map(feature_map)
wine_df[['quality', 'quality_enc', 'type', 'type_enc']]
qualityquality_enctypetype_enc
0Legit0red0
1Legit0red0
2Legit0red0
3Legit0red0
4Legit0red0
...
6492Legit0white1
6493Legit0white1
6494Legit0white1
6495Legit0white1
6496Legit0white1
6497 rows × 4 columns
# find correlations
wine_df.corr(numeric_only=True)
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality_enctype_enc
fixed acidity1.0000000.2190080.324436-0.1119810.298195-0.282735-0.3290540.458910-0.2527000.299568-0.0954520.021794-0.486740
volatile acidity0.2190081.000000-0.377981-0.1960110.377124-0.352557-0.4144760.2712960.2614540.225984-0.0376400.151228-0.653036
citric acid0.324436-0.3779811.0000000.1424510.0389980.1331260.1952420.096154-0.3298080.056197-0.010493-0.0617890.187397
residual sugar-0.111981-0.1960110.1424511.000000-0.1289400.4028710.4954820.552517-0.267320-0.185927-0.359415-0.0487560.348821
chlorides0.2981950.3771240.038998-0.1289401.000000-0.195045-0.2796300.3626150.0447080.395593-0.2569160.034499-0.512678
free sulfur dioxide-0.282735-0.3525570.1331260.402871-0.1950451.0000000.7209340.025717-0.145854-0.188457-0.179838-0.0852040.471644
total sulfur dioxide-0.329054-0.4144760.1952420.495482-0.2796300.7209341.0000000.032395-0.238413-0.275727-0.265740-0.0352520.700357
density0.4589100.2712960.0961540.5525170.3626150.0257170.0323951.0000000.0116860.259478-0.6867450.016351-0.390645
pH-0.2527000.261454-0.329808-0.2673200.044708-0.145854-0.2384130.0116861.0000000.1921230.1212480.020107-0.329129
sulphates0.2995680.2259840.056197-0.1859270.395593-0.188457-0.2757270.2594780.1921231.000000-0.003029-0.034046-0.487218
alcohol-0.095452-0.037640-0.010493-0.359415-0.256916-0.179838-0.265740-0.6867450.121248-0.0030291.000000-0.0511410.032970
quality_enc0.0217940.151228-0.061789-0.0487560.034499-0.085204-0.0352520.0163510.020107-0.034046-0.0511411.000000-0.004598
type_enc-0.486740-0.6530360.1873970.348821-0.5126780.4716440.700357-0.390645-0.329129-0.4872180.032970-0.0045981.000000
plt.figure(figsize=(12,8))
sns.heatmap(wine_df.corr(numeric_only=True), annot=True, cmap='viridis')

scikit-learn - Machine Learning in Python

# how does the quality correlate to measurements
wine_df.corr(numeric_only=True)['quality_enc']
Quality Correlation
fixed acidity0.021794
volatile acidity0.151228
citric acid-0.061789
residual sugar-0.048756
chlorides0.034499
free sulfur dioxide-0.085204
total sulfur dioxide-0.035252
density0.016351
pH0.020107
sulphates-0.034046
alcohol-0.051141
quality_enc1.000000
type_enc-0.004598
Name: quality_enc, dtype: float64
wine_df.corr(numeric_only=True)['quality_enc'][:-2].sort_values().plot(
figsize=(12,5),
kind='bar',
title='Correlation of Measurements to Quality'
)

scikit-learn - Machine Learning in Python

Regression Model

# separate target + remove string values
X_wine = wine_df.drop(['quality_enc', 'quality', 'type'], axis=1)
y_wine = wine_df['quality']

print(X_wine.shape, y_wine.shape)
# train-test split
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(
X_wine,
y_wine,
test_size=0.1,
random_state=42
)
# normalization
scaler = StandardScaler()
X_wine_train_scaled = scaler.fit_transform(X_wine_train)
X_wine_test_scaled = scaler.transform(X_wine_test)
# create the SVC model using class_weight='balanced' to compensate for the
# dataset leaning heavily towards non-fraud samples
svc_wine_base = svm.SVC(
kernel='rbf',
class_weight='balanced'
)
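To see what class_weight='balanced' translates into, the weights can be computed explicitly with scikit-learn's compute_class_weight helper (a sketch; requires one extra import):

from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights: n_samples / (n_classes * class_count)
weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_wine_train),
    y=y_wine_train
)
print(dict(zip(np.unique(y_wine_train), weights)))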
# grid search
param_grid = {
    'C': [0.5, 1, 1.5, 2, 2.5],
    'gamma': ['scale', 'auto']
}

wine_grid = GridSearchCV(svc_wine_base, param_grid)
wine_grid.fit(X_wine_train_scaled, y_wine_train)
print('Best Params: ', wine_grid.best_params_)
# Best Params: {'C': 2.5, 'gamma': 'auto'}
y_wine_pred = wine_grid.predict(X_wine_test_scaled)
print(
'Accuracy Score: ',
accuracy_score(y_wine_test, y_wine_pred, normalize=True).round(4)*100, '%'
)
# Accuracy Score: 84.77 %
report_wine = classification_report(
y_wine_test, y_wine_pred
)
print(report_wine)
              precision    recall  f1-score   support

       Fraud       0.16      0.68      0.26        25
       Legit       0.99      0.85      0.92       625

    accuracy                           0.85       650
   macro avg       0.57      0.77      0.59       650
weighted avg       0.95      0.85      0.89       650
conf_mtx_wine = confusion_matrix(y_wine_test, y_wine_pred)
conf_mtx_wine

# array([[ 17, 8],
# [ 91, 534]])
conf_mtx_wine_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_wine
)

conf_mtx_wine_plot.plot(cmap='plasma')

scikit-learn - Machine Learning in Python

# expand grid search
param_grid = {
    'C': [1000, 1050, 1100, 1050, 1200],
    'gamma': ['scale', 'auto']
}

wine_grid = GridSearchCV(svc_wine_base, param_grid)
wine_grid.fit(X_wine_train_scaled, y_wine_train)
print('Best Params: ', wine_grid.best_params_)
# Best Params: {'C': 1100, 'gamma': 'scale'}
y_wine_pred = wine_grid.predict(X_wine_test_scaled)
print('Accuracy Score: ',accuracy_score(y_wine_test, y_wine_pred, normalize=True).round(4)*100, '%')
# Accuracy Score: 94.31 %
report_wine = classification_report(y_wine_test, y_wine_pred)
print(report_wine)
conf_mtx_wine = confusion_matrix(y_wine_test, y_wine_pred)

conf_mtx_wine_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_wine
)

conf_mtx_wine_plot.plot(cmap='plasma')
              precision    recall  f1-score   support

       Fraud       0.29      0.32      0.30        25
       Legit       0.97      0.97      0.97       625

    accuracy                           0.94       650
   macro avg       0.63      0.64      0.64       650
weighted avg       0.95      0.94      0.94       650

scikit-learn - Machine Learning in Python

Supervised Learning - Boosting Methods

# dataset - label mushrooms as poisonous or edible
!wget https://github.com/semnan-university-ai/Mushroom/raw/main/Mushroom.csv -P datasets

Dataset Exploration

shroom_df = pd.read_csv('datasets/Mushroom.csv')
shroom_df.head(5).transpose()

Mushroom Data Set

  1. cap-shape: bell = b, conical = c, convex = x, flat = f, knobbed = k, sunken = s
  2. cap-surface: fibrous = f, grooves = g, scaly = y, smooth = s
  3. cap-color: brown = n, buff = b, cinnamon = c, gray = g, green = r, pink = p, purple = u, red = e, white = w, yellow = y
  4. bruises?: bruises = t, no = f
  5. odor: almond = a, anise = l, creosote = c, fishy = y, foul = f, musty = m, none = n, pungent = p, spicy = s
  6. gill-attachment: attached = a, descending = d, free = f, notched = n
  7. gill-spacing: close = c, crowded = w, distant = d
  8. gill-size: broad = b, narrow = n
  9. gill-color: black = k, brown = n, buff = b, chocolate = h, gray = g, green = r, orange = o, pink = p, purple = u, red = e, white = w, yellow = y
  10. stalk-shape: enlarging = e, tapering = t
  11. stalk-root: bulbous = b, club = c, cup = u, equal = e, rhizomorphs = z, rooted = r, missing = ?
  12. stalk-surface-above-ring: fibrous = f, scaly = y, silky = k, smooth = s
  13. stalk-surface-below-ring: fibrous = f, scaly = y, silky = k, smooth = s
  14. stalk-color-above-ring: brown = n, buff = b, cinnamon = c, gray = g, orange = o, pink = p, red = e, white = w, yellow = y
  15. stalk-color-below-ring: brown = n, buff = b, cinnamon = c, gray = g, orange = o, pink = p, red = e, white = w, yellow = y
  16. veil-type: partial = p, universal = u
  17. veil-color: brown = n, orange = o, white = w, yellow = y
  18. ring-number: none = n, one = o, two = t
  19. ring-type: cobwebby = c, evanescent = e, flaring = f, large = l, none = n, pendant = p, sheathing = s, zone = z
  20. spore-print-color: black = k, brown = n, buff = b, chocolate = h, green = r, orange = o, purple = u, white = w, yellow = y
  21. population: abundant = a, clustered = c, numerous = n, scattered = s, several = v, solitary = y
  22. habitat: grasses = g, leaves = l, meadows = m, paths = p, urban = u, waste = w, woods = d
01234
classpeepe
cap-shapexxbxx
cap-surfacesssys
cap-colornywwg
bruisesttttf
odorpalpn
gill-attachmentfffff
gill-spacingccccw
gill-sizenbbnb
gill-colorkknnk
stalk-shapeeeeet
stalk-rooteccee
stalk-surface-above-ringsssss
stalk-surface-below-ringsssss
stalk-color-above-ringwwwww
stalk-color-below-ringwwwww
veil-typeppppp
veil-colorwwwww
ring-numberooooo
ring-typeppppe
spore-print-colorknnkn
populationsnnsa
habitatugmug
shroom_df.isnull().sum()
class0
cap-shape0
cap-surface0
cap-color0
bruises0
odor0
gill-attachment0
gill-spacing0
gill-size0
gill-color0
stalk-shape0
stalk-root0
stalk-surface-above-ring0
stalk-surface-below-ring0
stalk-color-above-ring0
stalk-color-below-ring0
veil-type0
veil-color0
ring-number0
ring-type0
spore-print-color0
population0
habitat0
dtype: int64
feature_df = shroom_df.describe().transpose().reset_index(
names=['feature']
).sort_values(
'unique', ascending=False
)
featurecountuniquetopfreq
9gill-color812412b1728
3cap-color812410n2284
20spore-print-color81249w2388
5odor81249n3528
15stalk-color-below-ring81249w4384
14stalk-color-above-ring81249w4464
22habitat81247d3148
1cap-shape81246x3656
21population81246v4040
19ring-type81245p3968
11stalk-root81245b3776
12stalk-surface-above-ring81244s5176
13stalk-surface-below-ring81244s4936
17veil-color81244w7924
2cap-surface81244y3244
18ring-number81243o7488
10stalk-shape81242t4608
8gill-size81242b5612
7gill-spacing81242c6812
6gill-attachment81242f7914
4bruises81242f4748
0class81242e4208
16veil-type81241p8124
plt.figure(figsize=(12,8))
plt.title('Mushroom Features :: Number of unique Features')
sns.barplot(data=feature_df, y='feature', x='unique', orient='h', palette='summer_r')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(10,4))
plt.title('Mushroom Count :: Edible vs Poisonous')
sns.countplot(data=shroom_df, x='class', palette='seismic_r')

scikit-learn - Machine Learning in Python

Adaptive Boosting

# remove the label column 'class'
X_shroom = shroom_df.drop('class', axis=1)
# make all values numeric
X_shroom = pd.get_dummies(X_shroom, drop_first=True)

y_shroom = shroom_df['class']
# train/test split
X_shroom_train, X_shroom_test, y_shroom_train, y_shroom_test = train_test_split(
X_shroom,
y_shroom,
test_size=0.15,
random_state=42
)

Feature Exploration

# don't try to fit a perfect model - with a single estimator (a decision stump)
# AdaBoost effectively just returns the most important feature for the classification
abc_shroom = AdaBoostClassifier(estimator=None, n_estimators=1)
abc_shroom.fit(X_shroom_train,y_shroom_train)
shroom_preds = abc_shroom.predict(X_shroom_test)

print('Accuracy Score: ',accuracy_score(y_shroom_test, shroom_preds, normalize=True).round(4)*100, '%')
# Accuracy Score: 88.35 %
report_shroom = classification_report(y_shroom_test, shroom_preds)
print(report_shroom)
              precision    recall  f1-score   support

           e       0.97      0.80      0.88       637
           p       0.82      0.97      0.89       582

    accuracy                           0.88      1219
   macro avg       0.89      0.89      0.88      1219
weighted avg       0.90      0.88      0.88      1219
conf_mtx_shroom = confusion_matrix(y_shroom_test, shroom_preds)

conf_mtx_shroom_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_shroom
)

conf_mtx_shroom_plot.plot(cmap='winter_r')

scikit-learn - Machine Learning in Python

# the model was fit on a single decision stump (i.e. a single feature) and still
# performed pretty well. Let's find out which feature was chosen for the classification.

shroom_index = ['importance']
shroom_data_columns = pd.Series(X_shroom.columns)
shroom_importance_array = abc_shroom.feature_importances_
shroom_importance_df = pd.DataFrame(shroom_importance_array, shroom_data_columns, shroom_index)
shroom_importance_df.value_counts()
importance    count
0.0              94
1.0               1
dtype: int64
# plot a slice of the dataframe to find the feature
shroom_importance_df_sorted = shroom_importance_df.sort_values(
by='importance',
ascending=True
)

shroom_importance_df_sorted[-5:].plot(
kind='barh',
title='Feature Importance for Mushroom Classification',
figsize=(8,4)
)

The most important feature (as determined by the model) is the odor - in this case an odor of none ('n') is the strongest single indicator when separating poisonous from edible mushrooms:

odor: almond = a, anise = l, creosote = c, fishy = y, foul = f, musty = m, none = n, pungent = p, spicy = s

scikit-learn - Machine Learning in Python

# the majority of poisonous mushrooms do have an odor,
# making the lack of one a good indicator for an edible variety
plt.figure(figsize=(12,4))
plt.title('Mushroom Odor vs Class')
sns.countplot(data=shroom_df, x='odor', hue='class', palette='summer')

scikit-learn - Machine Learning in Python

Optimizing Hyperparameters

# each boosting stump picks a single feature - sweep n_estimators from 1 up to
# the number of dummy features (95) to find out how many are needed for a better fit

error_rates = []

for estimators in range(1,96):
    model = AdaBoostClassifier(n_estimators=estimators)
    model.fit(X_shroom_train, y_shroom_train)
    preds = model.predict(X_shroom_test)

    err = 1 - accuracy_score(y_shroom_test, preds)
    error_rates.append(err)
x_range=range(1,96)
plt.figure(figsize=(10,4))
plt.title('Adaboost Error Rate vs n_estimators')
plt.xlabel('n_estimators')
plt.ylabel('Error Rate')
plt.xticks(np.arange(min(x_range), max(x_range)+1, 3.0))
plt.plot(x_range, error_rates)

scikit-learn - Machine Learning in Python

# already after 16 estimators there is no
# visible improvement in the error rate
abc_shroom2 = AdaBoostClassifier(estimator=None, n_estimators=16)
abc_shroom2.fit(X_shroom_train,y_shroom_train)

shroom_preds2 = abc_shroom2.predict(X_shroom_test)

print('Accuracy Score: ',accuracy_score(y_shroom_test, shroom_preds2, normalize=True).round(4)*100, '%')
# Accuracy Score: 99.92 %

report_shroom2 = classification_report(y_shroom_test, shroom_preds2)
print(report_shroom2)
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       637
           p       1.00      1.00      1.00       582

    accuracy                           1.00      1219
   macro avg       1.00      1.00      1.00      1219
weighted avg       1.00      1.00      1.00      1219
conf_mtx_shroom2 = confusion_matrix(y_shroom_test, shroom_preds2)

conf_mtx_shroom_plot2 = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_shroom2
)

conf_mtx_shroom_plot2.plot(cmap='winter_r')

scikit-learn - Machine Learning in Python

shroom_index = ['importance']
shroom_data_columns = pd.Series(X_shroom.columns)
shroom_importance_array = abc_shroom2.feature_importances_
shroom_importance_df = pd.DataFrame(shroom_importance_array, shroom_data_columns, shroom_index)
shroom_importance_df.value_counts()

# there are 12 features now that are deemed important
importance    count
0.0000           83
0.0625            9
0.1250            2
0.1875            1
dtype: int64
shroom_importance_df_sorted = shroom_importance_df.sort_values(
by='importance',
ascending=True
).tail(13)
importance
gill-size_n0.1875
population_v0.1250
odor_n0.1250
odor_c0.0625
stalk-shape_t0.0625
spore-print-color_w0.0625
population_c0.0625
ring-type_p0.0625
spore-print-color_r0.0625
stalk-surface-above-ring_k0.0625
gill-spacing_w0.0625
odor_f0.0625
stalk-color-below-ring_w0.0000
plt.figure(figsize=(10,6))
plt.title('Features important to classify poisonous Mushrooms')

sns.barplot(
data=shroom_importance_df_sorted.tail(13),
y=shroom_importance_df_sorted.tail(13).index,
x='importance',
orient='h',
palette='summer'
)

scikit-learn - Machine Learning in Python

Gradient Boosting

Grid Search for best Hyperparameters

gb_shroom = GradientBoostingClassifier()
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [2, 3, 4, 5]
}
shroom_grid = GridSearchCV(gb_shroom, param_grid)
shroom_grid.fit(X_shroom_train, y_shroom_train)
shroom_grid.best_params_
# {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 150}
shroom_grid_preds = shroom_grid.predict(X_shroom_test)

print('Accuracy Score: ',accuracy_score(y_shroom_test, shroom_grid_preds, normalize=True).round(4)*100, '%')
# Accuracy Score: 100.0 %

report_shroom_grid_preds = classification_report(y_shroom_test, shroom_grid_preds)
print(report_shroom_grid_preds)
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       637
           p       1.00      1.00      1.00       582

    accuracy                           1.00      1219
   macro avg       1.00      1.00      1.00      1219
weighted avg       1.00      1.00      1.00      1219
conf_mtx_shroom_grid = confusion_matrix(y_shroom_test, shroom_grid_preds)

conf_mtx_shroom_grid_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx_shroom_grid
)

conf_mtx_shroom_grid_plot.plot(cmap='winter_r')

scikit-learn - Machine Learning in Python
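Gradient boosting also exposes staged predictions, which makes it possible to check how the test error evolves over the boosting iterations of the best estimator (a sketch using the staged_predict() generator):

# error rate on the test set after each boosting iteration
staged_errors = [
    1 - accuracy_score(y_shroom_test, stage_preds)
    for stage_preds in shroom_grid.best_estimator_.staged_predict(X_shroom_test)
]

plt.figure(figsize=(10,4))
plt.title('Gradient Boosting Test Error vs Boosting Iterations')
plt.xlabel('n_estimators')
plt.ylabel('Error Rate')
plt.plot(range(1, len(staged_errors) + 1), staged_errors)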

Feature Importance

shroom_feature_importance = shroom_grid.best_estimator_.feature_importances_
feature_importance_df = pd.DataFrame(
index = X_shroom.columns,
data = shroom_feature_importance,
columns = ['importance']
)

# kick all features that have zero importance and sort by importance
feature_importance_df = feature_importance_df[
feature_importance_df['importance'] > 3e-03
].sort_values(
by='importance',
ascending=False
)
plt.figure(figsize=(10,6))
plt.title('Features important to classify poisonous Mushrooms')

sns.barplot(
data=feature_importance_df,
y=feature_importance_df.index,
x='importance',
orient='h',
palette='summer'
)

scikit-learn - Machine Learning in Python

Supervised Learning - Naive Bayes NLP

Feature Extraction

text = [
'This is a dataset for binary sentiment classification',
'containing substantially more data than previous benchmark datasets',
'We provide a set of 25,000 highly polar movie reviews for training',
'And 25,000 for testing',
'There is additional unlabeled data for use as well',
'Raw text and already processed bag of words formats are provided'
]

CountVectorizer & TfidfTransformer

cv = CountVectorizer(stop_words='english')
cv_sparse_matrix = cv.fit_transform(text)
# <6x30 sparse matrix of type '<class 'numpy.int64'>'
# with 33 stored elements in Compressed Sparse Row format>
print(cv_sparse_matrix.todense())
# [[0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
# [0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
# [1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0]
# [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
# [0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0]
# [0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1]]
print(cv.vocabulary_)
# {'dataset': 9, 'binary': 5, 'sentiment': 21, 'classification': 6, 'containing': 7, 'substantially': 23, 'data': 8, 'previous': 15, 'benchmark': 4, 'datasets': 10, 'provide': 17, 'set': 22, '25': 1, '000': 0, 'highly': 12, 'polar': 14, 'movie': 13, 'reviews': 20, 'training': 26, 'testing': 24, 'additional': 2, 'unlabeled': 27, 'use': 28, 'raw': 19, 'text': 25, 'processed': 16, 'bag': 3, 'words': 29, 'formats': 11, 'provided': 18}
tfidf_trans = TfidfTransformer()
tfidf_trans_results = tfidf_trans.fit_transform(cv_sparse_matrix)
print(tfidf_trans_results.todense())
# [[0. 0. 0. 0. 0. 0.5
# 0.5 0. 0. 0.5 0. 0.
# 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0.5 0. 0.
# 0. 0. 0. 0. 0. 0. ]
# [0. 0. 0. 0. 0.4198708 0.
# 0. 0.4198708 0.34430007 0. 0.4198708 0.
# 0. 0. 0. 0.4198708 0. 0.
# 0. 0. 0. 0. 0. 0.4198708
# 0. 0. 0. 0. 0. 0. ]
# [0.28386526 0.28386526 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0.
# 0.3461711 0.3461711 0.3461711 0. 0. 0.3461711
# 0. 0. 0.3461711 0. 0.3461711 0.
# 0. 0. 0.3461711 0. 0. 0. ]
# [0.5355058 0.5355058 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0.
# 0.65304446 0. 0. 0. 0. 0. ]
# [0. 0. 0.52182349 0. 0. 0.
# 0. 0. 0.42790272 0. 0. 0.
# 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0.52182349 0.52182349 0. ]
# [0. 0. 0. 0.37796447 0. 0.
# 0. 0. 0. 0. 0. 0.37796447
# 0. 0. 0. 0. 0.37796447 0.
# 0.37796447 0.37796447 0. 0. 0. 0.
# 0. 0.37796447 0. 0. 0. 0.37796447]]

TfidfVectorizer

tfidf_vec = TfidfVectorizer(
lowercase=True,
analyzer='word',
stop_words='english'
)

tfidf_vec_results = tfidf_vec.fit_transform(text)
# <6x30 sparse matrix of type '<class 'numpy.float64'>'
# with 33 stored elements in Compressed Sparse Row format>
# comparing two sparse matrices element-wise returns another sparse matrix,
# so check that there are no differing entries instead:
print((tfidf_trans_results != tfidf_vec_results).nnz == 0)
# True
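The same result can also be expressed as a two-step pipeline of CountVectorizer and TfidfTransformer (a sketch using make_pipeline, which is already imported above; with matching parameters the output should equal the TfidfVectorizer result):

cv_tfidf_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer()
)

pipe_results = cv_tfidf_pipe.fit_transform(text)
# check that there are no differing entries between the two sparse matrices
print((pipe_results != tfidf_vec_results).nnz == 0)
# expected: True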

Dataset Exploration

!wget https://raw.githubusercontent.com/kunal-lalwani/Twitter-US-Airlines-Sentiment-Analysis/master/Tweets.csv -P datasets
tweet_df = pd.read_csv('datasets/Tweets.csv')
tweet_df.head(3).transpose()
012
tweet_id570306133677760513570301130888122368570301083672813571
airline_sentimentneutralpositiveneutral
airline_sentiment_confidence1.00.34860.6837
negativereasonNaNNaNNaN
negativereason_confidenceNaN0.0NaN
airlineVirgin AmericaVirgin AmericaVirgin America
airline_sentiment_goldNaNNaNNaN
namecairdinjnardinoyvonnalynn
negativereason_goldNaNNaNNaN
retweet_count000
text@VirginAmerica What @dhepburn said.@VirginAmerica plus you've added commercials t...@VirginAmerica I didn't today... Must mean I n...
tweet_coordNaNNaNNaN
tweet_created2015-02-24 11:35:52 -08002015-02-24 11:15:59 -08002015-02-24 11:15:48 -0800
tweet_locationNaNNaNLets Play
user_timezoneEastern Time (US & Canada)Pacific Time (US & Canada)Central Time (US & Canada)
plt.figure(figsize=(12,5))
plt.title('Tweet Sentiment Classification by Airline')
sns.countplot(
data=tweet_df,
x='airline',
hue='airline_sentiment',
palette='cool'
)

plt.savefig('assets/Scikit_Learn_56.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(12,6))
plt.title('Tweet Sentiment Classification with negative Reason')
sns.countplot(
data=tweet_df,
x='airline',
hue='negativereason',
palette='cool'
)

plt.savefig('assets/Scikit_Learn_57.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Data Preprocessing

tweet_data = tweet_df[['airline_sentiment', 'text']]
X_tweet = tweet_data['text']
y_tweet = tweet_data['airline_sentiment']
# train/ test split
X_tweet_train, X_tweet_test, y_tweet_train, y_tweet_test = train_test_split(
X_tweet,
y_tweet,
test_size=0.2,
random_state=42
)

TFIDF Vectorizer

tfidf_tweet_vec = TfidfVectorizer(
lowercase=True,
analyzer='word',
stop_words='english'
)

X_tweet_tfidf_train = tfidf_tweet_vec.fit_transform(X_tweet_train)
# <11712x12987 sparse matrix of type '<class 'numpy.float64'>'
# with 106745 stored elements in Compressed Sparse Row format>
X_tweet_tfidf_test = tfidf_tweet_vec.transform(X_tweet_test)

Model Comparison

# report helper function
def report(model):
    preds = model.predict(X_tweet_tfidf_test)

    print(classification_report(y_tweet_test, preds))

    conf_mtx = confusion_matrix(y_tweet_test, preds)
    conf_mtx_plot = ConfusionMatrixDisplay(
        confusion_matrix=conf_mtx
    )
    conf_mtx_plot.plot(cmap='plasma')
logreg_tweet = LogisticRegression(max_iter=1000)
logreg_tweet.fit(X_tweet_tfidf_train, y_tweet_train)
report(logreg_tweet)
              precision    recall  f1-score   support

    negative       0.82      0.93      0.88      1889
     neutral       0.66      0.48      0.56       580
    positive       0.79      0.63      0.70       459

    accuracy                           0.80      2928
   macro avg       0.76      0.68      0.71      2928
weighted avg       0.79      0.80      0.78      2928

scikit-learn - Machine Learning in Python

rbf_svc_tweet = svm.SVC()
rbf_svc_tweet.fit(X_tweet_tfidf_train, y_tweet_train)
report(rbf_svc_tweet)
              precision    recall  f1-score   support

    negative       0.81      0.95      0.87      1889
     neutral       0.68      0.42      0.52       580
    positive       0.80      0.61      0.69       459

    accuracy                           0.79      2928
   macro avg       0.76      0.66      0.69      2928
weighted avg       0.78      0.79      0.77      2928

scikit-learn - Machine Learning in Python

linear_svc_tweet = svm.LinearSVC()
linear_svc_tweet.fit(X_tweet_tfidf_train, y_tweet_train)
report(linear_svc_tweet)
              precision    recall  f1-score   support

    negative       0.85      0.91      0.88      1889
     neutral       0.64      0.54      0.58       580
    positive       0.76      0.67      0.71       459

    accuracy                           0.80      2928
   macro avg       0.75      0.71      0.72      2928
weighted avg       0.79      0.80      0.79      2928

scikit-learn - Machine Learning in Python

nb_tweets = MultinomialNB()
nb_tweets.fit(X_tweet_tfidf_train, y_tweet_train)
report(nb_tweets)
# The Naive Bayes classifier labels almost all tweets as negative -
# it does well at finding negative tweets
# but misclassifies many neutral and positive tweets as negative
              precision    recall  f1-score   support

    negative       0.69      0.99      0.81      1889
     neutral       0.75      0.15      0.25       580
    positive       0.94      0.18      0.31       459

    accuracy                           0.70      2928
   macro avg       0.79      0.44      0.46      2928
weighted avg       0.74      0.70      0.62      2928

scikit-learn - Machine Learning in Python

Model Deployment

# building a pipeline to ingest new tweets with the best performing model
pipe = Pipeline(
[
('tfidf', TfidfVectorizer()),
('svc', svm.SVC())
]
)
# before deployment retrain on entire dataset
pipe.fit(X_tweet, y_tweet)
# test prediction
print(pipe.predict([
'good flight',
'terrible service',
'too late',
'ok flight',
'Thank you'
]))
# ['positive' 'negative' 'negative' 'neutral' 'positive']
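A common next step is to persist the fitted pipeline so it can be loaded by the serving code; a minimal sketch using joblib (the filename is just an assumption):

import joblib

# serialize the fitted pipeline to disk ...
joblib.dump(pipe, 'tweet_sentiment_pipe.joblib')

# ... and restore it later for predictions
restored_pipe = joblib.load('tweet_sentiment_pipe.joblib')
print(restored_pipe.predict(['the flight was delayed for hours']))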

Text Classification

IMDB Dataset of 50K Movie Reviews https://ai.stanford.edu/~amaas/data/sentiment/

Data Exploration

imdb_df = pd.read_csv('datasets/moviereviews.csv')
imdb_df.head()
labelreview
0neghow do films like mouse hunt get into theatres...
1negsome talented actresses are blessed with a dem...
2posthis has been an extraordinary year for austra...
3posaccording to hollywood movies made in last few...
4negmy first press screening of 1998 and already i...
imdb_df.info()

# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 2000 entries, 0 to 1999
# Data columns (total 2 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 label 2000 non-null object
# 1 review 1965 non-null object
# dtypes: object(2)
# memory usage: 31.4+ KB
# find missing
imdb_df.isnull().sum()
# label 0
# review 35
# dtype: int64
# drop missing
imdb_df = imdb_df.dropna(axis=0)
imdb_df.isnull().sum()
# label 0
# review 0
# dtype: int64
# make sure there are no reviews that consist only of whitespace
# (imdb_df['review'] == ' ').sum()
imdb_df['review'].str.isspace().sum()
# 27
# remove empty string reviews
imdb_df = imdb_df[~imdb_df['review'].str.isspace()]
imdb_df = imdb_df[imdb_df['review'] != '']
imdb_df['review'].str.isspace().sum()
# 0
# is the dataset balanced
imdb_df['label'].value_counts()
# neg 969
# pos 969
# Name: label, dtype: int64

Top 30 Features by Label

# find the top 30 words in negative reviews
imdb_neg_df = imdb_df[imdb_df['label'] == 'neg']

count_vectorizer = CountVectorizer(analyzer='word', stop_words='english')
bag_of_words = count_vectorizer.fit_transform(imdb_neg_df['review'])
sum_words = bag_of_words.sum(axis=0)
words_freq = [
(word, sum_words[0, idx]) for word, idx in count_vectorizer.vocabulary_.items()
]

words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
x, y = zip(*words_freq[:30])

plt.figure(figsize=(12,5))
plt.bar(x,y)
plt.xticks(rotation=90)
plt.title('Top30 Words used in Negative Reviews')

plt.savefig('assets/Scikit_Learn_62.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# find the top 30 words in positive reviews
imdb_pos_df = imdb_df[imdb_df['label'] != 'neg']

count_vectorizer = CountVectorizer(analyzer='word', stop_words='english')
bag_of_words = count_vectorizer.fit_transform(imdb_pos_df['review'])
sum_words = bag_of_words.sum(axis=0)
words_freq = [
(word, sum_words[0, idx]) for word, idx in count_vectorizer.vocabulary_.items()
]

words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
x, y = zip(*words_freq[:30])

plt.figure(figsize=(12,5))
plt.bar(x,y)
plt.xticks(rotation=90)
plt.title('Top30 Words used in Positive Reviews')

plt.savefig('assets/Scikit_Learn_63.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Data Preprocessing

X_rev = imdb_df['review']
y_rev = imdb_df['label']
# train/ test split
X_rev_train, X_rev_test, y_rev_train, y_rev_test = train_test_split(
X_rev,
y_rev,
test_size=0.2,
random_state=42
)
tfidf_rev_vec = TfidfVectorizer(
lowercase=True,
analyzer='word',
stop_words='english'
)

X_rev_tfidf_train = tfidf_rev_vec.fit_transform(X_rev_train)
X_rev_tfidf_test = tfidf_rev_vec.transform(X_rev_test)

Model Training

nb_rev = MultinomialNB()
nb_rev.fit(X_rev_tfidf_train, y_rev_train)
preds = nb_rev.predict(X_rev_tfidf_test)
print(classification_report(y_rev_test, preds))
              precision    recall  f1-score   support

         neg       0.79      0.88      0.83       188
         pos       0.87      0.78      0.82       200

    accuracy                           0.82       388
   macro avg       0.83      0.83      0.82       388
weighted avg       0.83      0.82      0.82       388
conf_mtx = confusion_matrix(y_rev_test, preds)
conf_mtx_plot = ConfusionMatrixDisplay(
confusion_matrix=conf_mtx
)
conf_mtx_plot.plot(cmap='plasma')

scikit-learn - Machine Learning in Python

Unsupervised Learning - KMeans Clustering

Dataset Exploration

!wget https://github.com/selva86/datasets/raw/master/bank-full.csv -P datasets
bank_df = pd.read_csv('datasets/bank-full.csv', sep=';')
bank_df.head(5).transpose()
01234
age5657374056
jobhousemaidservicesservicesadmin.services
maritalmarriedmarriedmarriedmarriedmarried
educationbasic.4yhigh.schoolhigh.schoolbasic.6yhigh.school
defaultnounknownnonono
housingnonoyesnono
loannonononoyes
contacttelephonetelephonetelephonetelephonetelephone
monthmaymaymaymaymay
day_of_weekmonmonmonmonmon
duration261149226151307
campaign11111
pdays999999999999999
previous00000
poutcomenonexistentnonexistentnonexistentnonexistentnonexistent
emp.var.rate1.11.11.11.11.1
cons.price.idx93.99493.99493.99493.99493.994
cons.conf.idx-36.4-36.4-36.4-36.4-36.4
euribor3m4.8574.8574.8574.8574.857
nr.employed5191.05191.05191.05191.05191.0
ynonononono
bank_df.describe()
agedurationcampaignpdayspreviousemp.var.ratecons.price.idxcons.conf.idxeuribor3mnr.employed
count41188.0000041188.00000041188.00000041188.00000041188.00000041188.00000041188.00000041188.00000041188.00000041188.000000
mean40.02406258.2850102.567593962.4754540.1729630.08188693.575664-40.5026003.6212915167.035911
std10.42125259.2792492.770014186.9109070.4949011.5709600.5788404.6281981.73444772.251528
min17.000000.0000001.0000000.0000000.000000-3.40000092.201000-50.8000000.6340004963.600000
25%32.00000102.0000001.000000999.0000000.000000-1.80000093.075000-42.7000001.3440005099.100000
50%38.00000180.0000002.000000999.0000000.0000001.10000093.749000-41.8000004.8570005191.000000
75%47.00000319.0000003.000000999.0000000.0000001.40000093.994000-36.4000004.9610005228.100000
max98.000004918.00000056.000000999.0000007.0000001.40000094.767000-26.9000005.0450005228.100000
plt.figure(figsize=(12, 5))
plt.title('Age Distribution by Marital Status')

sns.histplot(
data=bank_df,
x='age',
bins=50,
hue='marital',
palette='winter',
kde=True
)

plt.savefig('assets/Scikit_Learn_65.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(12, 5))
plt.title('Age Distribution by Loan Status')

sns.histplot(
data=bank_df,
x='age',
bins=50,
hue='loan',
palette='winter',
kde=True
)

plt.savefig('assets/Scikit_Learn_66.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# for this plot, filter out rows with pdays = 999 (a placeholder for 'never contacted')
plt.figure(figsize=(12, 5))
plt.title('Distribution of Days Since Last Contacted by Loan Status')

sns.histplot(
data=bank_df[bank_df['pdays'] != 999],
x='pdays',
hue='loan',
palette='winter',
kde=True
)

plt.savefig('assets/Scikit_Learn_67.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# Create call duration in minutes column
bank_df['duration_minutes'] = bank_df['duration'].apply(lambda x: x/60).round(1)

plt.figure(figsize=(12, 5))
plt.title('Distribution Contact Duration by Contact Type')
plt.xlim(0,20)
sns.histplot(
data=bank_df,
x='duration_minutes',
hue='contact',
palette='winter',
kde=True
)

plt.savefig('assets/Scikit_Learn_68.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(16, 5))
plt.title('Customer Jobs Countplot by Loan Defaults')
sns.countplot(
data=bank_df,
x='job',
order=bank_df['job'].value_counts().index,
palette='winter',
hue='default'
)

plt.savefig('assets/Scikit_Learn_69.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(16, 5))
plt.title('Customer Education Countplot by Loan Defaults')
sns.countplot(
data=bank_df,
x='education',
order=bank_df['education'].value_counts().index,
palette='winter',
hue='default'
)

plt.savefig('assets/Scikit_Learn_70.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

sns.pairplot(
data=bank_df,
hue='marital',
palette='winter'
)

plt.savefig('assets/Scikit_Learn_71.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Dataset Preprocessing

# encode categorical features
X_bank = pd.get_dummies(bank_df)
# normalize data
bank_scaler = StandardScaler()

X_bank_scaled = bank_scaler.fit_transform(X_bank)

Model Training

bank_model = KMeans(
n_clusters=2,
n_init='auto',
random_state=42
)
# fit to find cluster centers and predict what center every feature belongs to
bank_cluster_labels = bank_model.fit_predict(X_bank_scaled)
# add predicted label to source dataframe
X_bank['Cluster'] = bank_cluster_labels
X_bank['Cluster'].value_counts()
# 0 26871
# 1 14317
# Name: Cluster, dtype: int64
# How do the features correlate with the predicted labels?
label_corr = X_bank.corr()['Cluster']
print(label_corr.iloc[:-1].sort_values())
plt.figure(figsize=(10,14))
label_corr.iloc[:-1].sort_values().plot(kind='barh')
plt.title('Feature Importance')

plt.savefig('assets/Scikit_Learn_72.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Choosing a K Value

# visualize the sum of squared distances of the datapoints to their
# assigned cluster centers as a function of the number of clusters
sum_squared_distance = []

for k in range(2,20):
    model = KMeans(n_clusters=k, n_init='auto')
    model.fit(X_bank_scaled)

    sum_squared_distance.append(model.inertia_)
plt.figure(figsize=(10,5))
plt.title('SSD as a Function of Number of Cluster')
plt.plot(range(2,20), sum_squared_distance, 'o--')

plt.savefig('assets/Scikit_Learn_73.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(10,5))
plt.title('Difference in SSD as a Function of Number of Clusters')
pd.Series(sum_squared_distance).diff().plot(kind='bar')

plt.savefig('assets/Scikit_Learn_74.webp', bbox_inches='tight')

There are two 'elbows' - one between k=5 and k=6 and a second one between k=14 and k=15 (mind that the bar plot of the differences is 0-indexed while k starts at 2). Both are potentially good values for the number of clusters k.

scikit-learn - Machine Learning in Python
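Besides the elbow heuristic, the silhouette score gives another hint for choosing k; a sketch using sklearn.metrics.silhouette_score on a random subsample (the candidate k values and the sample_size are assumptions to keep the computation manageable on ~41k rows):

from sklearn.metrics import silhouette_score

for k in [2, 6, 15]:
    model = KMeans(n_clusters=k, n_init='auto', random_state=42)
    labels = model.fit_predict(X_bank_scaled)
    # score on a 5000-row subsample; higher values indicate better-separated clusters
    score = silhouette_score(X_bank_scaled, labels, sample_size=5000, random_state=42)
    print(f'k={k}: silhouette score = {score:.3f}')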

Re-fitting the Model

bank_model = KMeans(
n_clusters=6,
n_init='auto',
random_state=42
)
# fit to find cluster centers and predict what center every feature belongs to
bank_cluster_labels = bank_model.fit_predict(X_bank_scaled)
# add predicted label to source dataframe
X_bank['Cluster'] = bank_cluster_labels
X_bank['Cluster'].value_counts()
# 5 10713
# 0 10663
# 1 8164
# 3 5566
# 4 3322
# 2 2760
# Name: Cluster, dtype: int64

Example 1 : Color Quantization

img_array = mpimg.imread('assets/gz.jpg')
img_array.shape
# (325, 640, 3)
plt.imshow(img_array)
plt.title('Original Image')
plt.savefig('assets/Scikit_Learn_75.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# flatten the image from 3 to 2 dimensions
(height, width, colour) = img_array.shape
img_array2d = img_array.reshape(height*width,colour)
img_array2d.shape
# (208000, 3)
# reduce colour space to 6 clusters
colour_model = KMeans(n_clusters=6, n_init='auto')
colour_labels = colour_model.fit_predict(img_array2d)
# get rgb value for each of the 6 cluster centers
rgb_colours = colour_model.cluster_centers_.round(0).astype(int)
rgb_colours
# array([[186, 111, 58],
# [ 31, 11, 16],
# [135, 72, 46],
# [236, 157, 73],
# [ 81, 40, 34],
# [252, 199, 125]])
# assign these rgb values to each pixel within the cluster
# and reshape to original 3d array
quantized_image = np.reshape(rgb_colours[colour_labels],(height,width,colour))
plt.imshow(quantized_image)
plt.title('Quantized Image')
plt.savefig('assets/Scikit_Learn_76.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python
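To verify the effect of the quantization, the number of distinct colours before and after can be compared (a short sketch; counting unique pixel rows with np.unique can be slow on large images):

original_colours = np.unique(img_array2d, axis=0).shape[0]
quantized_colours = np.unique(quantized_image.reshape(-1, 3), axis=0).shape[0]
print('colours before:', original_colours, '- colours after:', quantized_colours)
# the quantized image contains at most the 6 cluster-center colours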

Example 2 : Country Clustering

Dataset Exploration

!wget https://github.com/priyansh21112002/CIA-Country-Description/raw/main/CIA_Country_Facts.csv -P datasets
country_df = pd.read_csv('datasets/CIA_Country_Facts.csv')
country_df.head(5).transpose()
01234
CountryAfghanistanAlbaniaAlgeriaAmerican SamoaAndorra
RegionASIA (EX. NEAR EAST)EASTERN EUROPENORTHERN AFRICAOCEANIAWESTERN EUROPE
Population310569973581655329300915779471201
Area (sq. mi.)647500287482381740199468
Pop. Density (per sq. mi.)48.0124.613.8290.4152.1
Coastline (coast/area ratio)0.01.260.0458.290.0
Net migration23.06-4.93-0.39-20.716.6
Infant mortality (per 1000 births)163.0721.5231.09.274.05
GDP ($ per capita)700.04500.06000.08000.019000.0
Literacy (%)36.086.570.097.0100.0
Phones (per 1000)3.271.278.1259.5497.2
Arable (%)12.1321.093.2210.02.22
Crops (%)0.224.420.2515.00.0
Other (%)87.6574.4996.5375.097.78
Climate1.03.01.02.03.0
Birthrate46.615.1117.1422.468.71
Deathrate20.345.224.613.276.25
Agriculture0.380.2320.101NaNNaN
Industry0.240.1880.6NaNNaN
Service0.380.5790.298NaNNaN
fig, axes = plt.subplots(figsize=(10,5), nrows=1, ncols=2)
plt.suptitle('Country Population Histogram')

axes[0].set_xlabel('Population')
axes[0].set_ylabel('Frequency')

axes[0].hist(
x=country_df['Population'],
range=None,
density=True,
histtype='bar',
orientation='vertical',
color='dodgerblue'
)

axes[1].set_xlabel('Population (<100Mio)')
axes[1].set_ylabel('Frequency')

axes[1].hist(
x=country_df['Population'],
range=[0, 1e8],
density=True,
histtype='bar',
orientation='vertical',
color='fuchsia'
)

plt.savefig('assets/Scikit_Learn_77.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(12, 5))
plt.title('GDP ($ per capita) by Region')

sns.barplot(
data=country_df,
y='Region',
x='GDP ($ per capita)',
estimator=np.mean,
errorbar='sd',
orient='h',
palette='cool'
)

plt.savefig('assets/Scikit_Learn_78.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(10, 6))

sns.scatterplot(
y='Phones (per 1000)',
x='GDP ($ per capita)',
data=country_df,
hue='Region',
palette='cool',
).set_title('GDP ($ per capita) vs. Phones (per 1000)')

plt.savefig('assets/Scikit_Learn_79.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(10, 6))

sns.scatterplot(
y='Literacy (%)',
x='GDP ($ per capita)',
data=country_df,
hue='Region',
palette='cool',
).set_title('GDP ($ per capita) vs. Literacy (%)')

plt.savefig('assets/Scikit_Learn_80.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(20, 12), dpi=200)
plt.title('Correlation Heatmap CIA Country Dataset')

sns.heatmap(
country_df.corr(numeric_only=True),
linewidth=0.5,
cmap='seismic',
annot=True
)

plt.savefig('assets/Scikit_Learn_81.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(20, 12), dpi=200)
sns.clustermap(
country_df.corr(numeric_only=True),
linewidth=0.5,
cmap='seismic',
annot=False,
col_cluster=False
)

plt.savefig('assets/Scikit_Learn_82.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Dataset Preprocessing

# find columns with missing values
country_df.isnull().sum()
Country0
Region0
Population0
Area (sq. mi.)0
Pop. Density (per sq. mi.)0
Coastline (coast/area ratio)0
Net migration3
Infant mortality (per 1000 births)3
GDP ($ per capita)1
Literacy (%)18
Phones (per 1000)4
Arable (%)2
Crops (%)2
Other (%)2
Climate22
Birthrate3
Deathrate4
Agriculture15
Industry16
Service15
dtype: int64
# what countries don't have an agriculture value
country_df[pd.isnull(country_df['Agriculture'])]['Country']
# the countries missing agriculture data are small territories that will not have
# a whole lot of agricultural output. The same is true for 'Industry'
# and 'Service', so these values can be set to zero:
3American Samoa
4Andorra
78Gibraltar
80Greenland
83Guam
134Mayotte
140Montserrat
144Nauru
153N. Mariana Islands
171Saint Helena
174St Pierre & Miquelon
177San Marino
208Turks & Caicos Is
221Wallis and Futuna
223Western Sahara
Name: Country, dtype: object
# set missing values to zero for Agriculture, Industry and Service
# define what default values you want to fill
values = {
    "Agriculture": 0,
    "Industry": 0,
    "Service": 0,
}
# and replace missing with values
country_df = country_df.fillna(value=values)
# another datapoint that is often missing is climate
# the climate can be estimated by countries in the same Region
country_df[pd.isnull(country_df['Climate'])][['Country', 'Region', 'Climate']]
CountryRegionClimate
5AngolaSUB-SAHARAN AFRICANaN
36CanadaNORTHERN AMERICANaN
50CroatiaEASTERN EUROPENaN
66Faroe IslandsWESTERN EUROPENaN
78GibraltarWESTERN EUROPENaN
101ItalyWESTERN EUROPENaN
115LebanonNEAR EASTNaN
118LibyaNORTHERN AFRICANaN
120LithuaniaBALTICSNaN
121LuxembourgWESTERN EUROPENaN
129MaltaWESTERN EUROPENaN
137MoldovaC.W. OF IND. STATESNaN
138MonacoWESTERN EUROPENaN
141MoroccoNORTHERN AFRICANaN
145NepalASIA (EX. NEAR EAST)NaN
169RussiaC.W. OF IND. STATESNaN
171Saint HelenaSUB-SAHARAN AFRICANaN
174St Pierre & MiquelonNORTHERN AMERICANaN
177San MarinoWESTERN EUROPENaN
181SerbiaEASTERN EUROPENaN
186SloveniaEASTERN EUROPENaN
200TanzaniaSUB-SAHARAN AFRICANaN
country_df[pd.isnull(country_df['Climate'])]['Region'].value_counts()
WESTERN EUROPE7
SUB-SAHARAN AFRICA3
EASTERN EUROPE3
NORTHERN AMERICA2
NORTHERN AFRICA2
C.W. OF IND. STATES2
NEAR EAST1
BALTICS1
ASIA (EX. NEAR EAST)1
Name: Region, dtype: int64
# the Region value has annoying whitespaces that need to be stripped
country_df['Region'] = country_df['Region'].apply(lambda x: x.strip())
# climate zones in western europe
country_df[country_df['Region'] == 'WESTERN EUROPE']['Climate'].value_counts()
# climate zones in SUB-SAHARAN AFRICA
country_df[country_df['Region'] == 'SUB-SAHARAN AFRICA']['Climate'].value_counts()
# climate zones in EASTERN EUROPE
country_df[country_df['Region'] == 'EASTERN EUROPE']['Climate'].value_counts()
# climate zones in NORTHERN AMERICA
country_df[country_df['Region'] == 'NORTHERN AMERICA']['Climate'].value_counts()
# climate zones in NORTHERN AFRICA
country_df[country_df['Region'] == 'NORTHERN AFRICA']['Climate'].value_counts()
# climate zones in C.W. OF IND. STATES
country_df[country_df['Region'] == 'C.W. OF IND. STATES']['Climate'].value_counts()
# climate zones in NEAR EAST
country_df[country_df['Region'] == 'NEAR EAST']['Climate'].value_counts()
# climate zones in BALTICS
country_df[country_df['Region'] == 'BALTICS']['Climate'].value_counts()
# climate zones in ASIA (EX. NEAR EAST)
country_df[country_df['Region'] == 'ASIA (EX. NEAR EAST)']['Climate'].value_counts()
# we can either use the top value to fill missing climate data points
# or use a mean value:
country_df['Climate'] = country_df['Climate'].fillna(country_df.groupby('Region')['Climate'].transform('mean'))
# there are more missing values, e.g. literacy:
country_df[pd.isnull(country_df['Literacy (%)'])][['Country', 'Region', 'Literacy (%)']]
CountryRegionLiteracy (%)
25Bosnia & HerzegovinaEASTERN EUROPENaN
66Faroe IslandsWESTERN EUROPENaN
74Gaza StripNEAR EASTNaN
78GibraltarWESTERN EUROPENaN
80GreenlandNORTHERN AMERICANaN
85GuernseyWESTERN EUROPENaN
99Isle of ManWESTERN EUROPENaN
104JerseyWESTERN EUROPENaN
108KiribatiOCEANIANaN
123MacedoniaEASTERN EUROPENaN
134MayotteSUB-SAHARAN AFRICANaN
144NauruOCEANIANaN
185SlovakiaEASTERN EUROPENaN
187Solomon IslandsOCEANIANaN
209TuvaluOCEANIANaN
220Virgin IslandsLATIN AMER. & CARIBNaN
222West BankNEAR EASTNaN
223Western SaharaNORTHERN AFRICANaN
# here we can also fill with mean values:
country_df['Literacy (%)'] = country_df['Literacy (%)'].fillna(country_df.groupby('Region')['Literacy (%)'].transform('mean'))
# the remaining rows with missing values can be dropped for now
country_df = country_df.dropna(axis=0)
country_df.isnull().sum()
Country0
Region0
Population0
Area (sq. mi.)0
Pop. Density (per sq. mi.)0
Coastline (coast/area ratio)0
Net migration0
Infant mortality (per 1000 births)0
GDP ($ per capita)0
Literacy (%)0
Phones (per 1000)0
Arable (%)0
Crops (%)0
Other (%)0
Climate0
Birthrate0
Deathrate0
Agriculture0
Industry0
Service0
dtype: int64
# drop the Country column as it is a unique
# identifier that will not help with clustering
country_df_dropped = country_df.drop(['Country'], axis=1)
# the region column is useful but needs to be encoded
country_df_dropped = pd.get_dummies(country_df_dropped)
country_df_dropped.head(5).transpose()
01234
Population31056997.003581655.0003.293009e+0757794.0071201.00
Area (sq. mi.)647500.0028748.0002.381740e+06199.00468.00
Pop. Density (per sq. mi.)48.00124.6001.380000e+01290.40152.10
Coastline (coast/area ratio)0.001.2604.000000e-0258.290.00
Net migration23.06-4.930-3.900000e-01-20.716.60
Infant mortality (per 1000 births)163.0721.5203.100000e+019.274.05
GDP ($ per capita)700.004500.0006.000000e+038000.0019000.00
Literacy (%)36.0086.5007.000000e+0197.00100.00
Phones (per 1000)3.2071.2007.810000e+01259.50497.20
Arable (%)12.1321.0903.220000e+0010.002.22
Crops (%)0.224.4202.500000e-0115.000.00
Other (%)87.6574.4909.653000e+0175.0097.78
Climate1.003.0001.000000e+002.003.00
Birthrate46.6015.1101.714000e+0122.468.71
Deathrate20.345.2204.610000e+003.276.25
Agriculture0.380.2321.010000e-010.000.00
Industry0.240.1886.000000e-010.000.00
Service0.380.5792.980000e-010.000.00
Region_ASIA (EX. NEAR EAST)1.000.0000.000000e+000.000.00
Region_BALTICS0.000.0000.000000e+000.000.00
Region_C.W. OF IND. STATES0.000.0000.000000e+000.000.00
Region_EASTERN EUROPE0.001.0000.000000e+000.000.00
Region_LATIN AMER. & CARIB0.000.0000.000000e+000.000.00
Region_NEAR EAST0.000.0000.000000e+000.000.00
Region_NORTHERN AFRICA0.000.0001.000000e+000.000.00
Region_NORTHERN AMERICA0.000.0000.000000e+000.000.00
Region_OCEANIA0.000.0000.000000e+001.000.00
Region_SUB-SAHARAN AFRICA0.000.0000.000000e+000.000.00
Region_WESTERN EUROPE0.000.0000.000000e+000.001.00
# to be able to compare all datapoints they need to be normalized
country_scaler = StandardScaler()
country_df_scaled = country_scaler.fit_transform(country_df_dropped)

Model Training

# finding a good k-value for number of cluster
ssd_country = []

for k in range(2,30):
    model = KMeans(n_clusters=k, n_init='auto')
    model.fit(country_df_scaled)

    ssd_country.append(model.inertia_)
plt.figure(figsize=(10,5))
plt.title('SSD as a Function of Number of Cluster')
plt.plot(range(2,30), ssd_country, 'o--')

plt.savefig('assets/Scikit_Learn_83.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(10,5))
plt.title('Difference in SSD as a Function of Number of Clusters')
pd.Series(ssd_country).diff().plot(kind='bar')

plt.savefig('assets/Scikit_Learn_84.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

country_model = KMeans(
n_clusters=14,
n_init='auto',
random_state=42
)
# fit to find cluster centers and predict what center every feature belongs to
country_cluster_labels = country_model.fit_predict(country_df_scaled)

Model Evaluation

# add predicted label to source dataframe
country_df['Cluster14'] = country_cluster_labels
country_df['Cluster14'].value_counts()
plt.figure(figsize=(10, 7))
sns.set(style='darkgrid')

# hue/style by categorical column
sns.scatterplot(
x='GDP ($ per capita)',
y='Literacy (%)',
data=country_df,
s=40,
alpha=0.6,
hue='Cluster14',
palette='cool',
style='Region'
).set_title('Country Clusters with k=14')

plt.savefig('assets/Scikit_Learn_85.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# repeat but only with 3 cluster
country_model2 = KMeans(
n_clusters=3,
n_init='auto',
random_state=42
)
# fit to find cluster centers and predict what center every feature belongs to
country_cluster_labels2 = country_model2.fit_predict(country_df_scaled)

# add predicted label to source dataframe
country_df['Cluster3'] = country_cluster_labels2

plt.figure(figsize=(10, 7))
sns.set(style='darkgrid')

# hue/style by categorical column
sns.scatterplot(
x='GDP ($ per capita)',
y='Literacy (%)',
data=country_df,
s=40,
alpha=0.6,
hue='Cluster3',
palette='cool',
style='Region'
).set_title('Country Clusters with k=3')

plt.savefig('assets/Scikit_Learn_86.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# How do the features correlate with the predicted labels?
country_label_corr = country_df.corr()['Cluster3']
print(country_label_corr.iloc[:-1].sort_values())

Feature Correlation

Literacy (%)-0.413704
Crops (%)-0.152936
Coastline (coast/area ratio)-0.132610
Service-0.070495
Area (sq. mi.)-0.062183
Phones (per 1000)-0.037538
Population-0.024969
Industry0.008487
Arable (%)0.034891
Climate0.049659
Other (%)0.050444
Pop. Density (per sq. mi.)0.101062
GDP ($ per capita)0.122206
Agriculture0.250750
Net migration0.316226
Birthrate0.369940
Infant mortality (per 1000 births)0.412365
Deathrate0.575814
Name: Cluster, dtype: float64
plt.figure(figsize=(10,6))
country_label_corr.iloc[:-1].sort_values().plot(kind='barh')
plt.title('Feature Importance')

plt.savefig('assets/Scikit_Learn_87.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Plotly Choropleth Map

iso_codes = pd.read_csv('datasets/country-iso-codes.csv')
iso_map = iso_codes.set_index('Country')['ISO Code'].to_dict()
country_df['ISO Code'] = country_df['Country'].map(iso_map)
country_df[['Country','ISO Code']].head(5)
CountryISO Code
0AfghanistanAFG
1AlbaniaALB
2AlgeriaDZA
3American SamoaASM
4AndorraAND
fig = px.choropleth(
country_df,
locations='ISO Code',
color='Cluster3',
hover_name='Country',
color_continuous_scale=px.colors.sequential.Plasma
)

fig.show()

scikit-learn - Machine Learning in Python

fig = px.choropleth(
country_df,
locations='ISO Code',
color='Cluster14',
hover_name='Country',
color_continuous_scale=px.colors.sequential.Plasma
)

fig.show()

scikit-learn - Machine Learning in Python

Unsupervised Learning - Agglomerative Clustering

Dataset Preprocessing

autompg_data: The Auto-MPG dataset for regression Revised from CMU StatLib library, data concerns city-cycle fuel consumption

autoMPG_df = pd.read_csv('datasets/auto-mpg.csv')
autoMPG_df.head(5)
mpgcylindersdisplacementhorsepowerweightaccelerationmodel_yearoriginname
018.08307.0130.0350412.070usachevrolet chevelle malibu
115.08350.0165.0369311.570usabuick skylark 320
218.08318.0150.0343611.070usaplymouth satellite
316.08304.0150.0343312.070usaamc rebel sst
417.08302.0140.0344910.570usaford torino
autoMPG_df['origin'].value_counts()
# there are only 3 countries of origin - can be turned into a dummy variable
autoMPG_dummy_df = pd.get_dummies(autoMPG_df.drop('name', axis=1))
autoMPG_dummy_df.head(5)
mpgcylindersdisplacementhorsepowerweightaccelerationmodel_yearorigin_europeorigin_japanorigin_usa
018.08307.0130.0350412.070FalseFalseTrue
115.08350.0165.0369311.570FalseFalseTrue
218.08318.0150.0343611.070FalseFalseTrue
316.08304.0150.0343312.070FalseFalseTrue
417.08302.0140.0344910.570FalseFalseTrue
# normalize dataset
scaler = MinMaxScaler()
autoMPG_scaled = pd.DataFrame(
scaler.fit_transform(autoMPG_dummy_df), columns=autoMPG_dummy_df.columns
)
autoMPG_scaled.describe()
mpgcylindersdisplacementhorsepowerweightaccelerationmodel_yearorigin_europeorigin_japanorigin_usa
count392.000000392.000000392.000000392.000000392.000000392.000000392.000000392.000000392.000000392.000000
mean0.3842000.4943880.3266460.3177680.3868970.4488880.4982990.1734690.2015310.625000
std0.2075800.3411570.2703980.2091910.2408290.1642180.3069780.3791360.4016560.484742
min0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%0.2127660.2000000.0956070.1576090.1735890.3437500.2500000.0000000.0000000.000000
50%0.3656910.2000000.2144700.2581520.3375390.4464290.5000000.0000000.0000001.000000
75%0.5319151.0000000.5368220.4347830.5675500.5372020.7500000.0000000.0000001.000000
max1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
plt.figure(figsize=(12,10))

sns.heatmap(autoMPG_scaled, annot=False, cmap='viridis')

plt.savefig('assets/Scikit_Learn_90.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

sns.clustermap(
autoMPG_scaled.corr(numeric_only=True),
linewidth=0.5,
cmap='seismic',
annot=True,
col_cluster=False
)

plt.savefig('assets/Scikit_Learn_91.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Assigning Cluster Labels

Known Number of Clusters

# there are ~ 4 clusters visible - let's try to agglomerate them
autoMPG_model = AgglomerativeClustering(n_clusters=4)
cluster_labels = autoMPG_model.fit_predict(autoMPG_scaled)
autoMPG_df['label'] = cluster_labels
autoMPG_df.head(5)
mpgcylindersdisplacementhorsepowerweightaccelerationmodel_yearoriginnamelabel
018.08307.0130.0350412.070usachevrolet chevelle malibu2
115.08350.0165.0369311.570usabuick skylark 3202
218.08318.0150.0343611.070usaplymouth satellite2
316.08304.0150.0343312.070usaamc rebel sst2
417.08302.0140.0344910.570usaford torino2
plt.figure(figsize=(12,5))
sns.scatterplot(
x='mpg',
y='horsepower',
data=autoMPG_df,
hue='label',
palette='cool_r',
style='origin'
).set_title('Horsepower as a function of Miles-per-gallon')

plt.savefig('assets/Scikit_Learn_92.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(12,5))
sns.scatterplot(
x='model_year',
y='mpg',
data=autoMPG_df,
hue='label',
palette='cool_r',
style='origin'
).set_title('Model Year as a function of Miles-per-gallon')
plt.legend(bbox_to_anchor=(1.01,1.01))

plt.savefig('assets/Scikit_Learn_93.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

figure, axes = plt.subplots(1, 3, sharex=True,figsize=(15, 5))
figure.suptitle('Country of Origin')


sns.scatterplot(
x='horsepower',
y='mpg',
data=autoMPG_df[autoMPG_df['origin'] == 'europe'],
hue='label',
palette='cool_r',
style='model_year',
ax=axes[0]
).set_title('Europe')


sns.scatterplot(
x='horsepower',
y='mpg',
data=autoMPG_df[autoMPG_df['origin'] == 'japan'],
hue='label',
palette='cool_r',
style='model_year',
ax=axes[1]
).set_title('Japan')


sns.scatterplot(
x='horsepower',
y='mpg',
data=autoMPG_df[autoMPG_df['origin'] == 'usa'],
hue='label',
palette='cool_r',
style='model_year',
ax=axes[2]
).set_title('USA')
plt.legend(bbox_to_anchor=(1.01,1.01))

plt.savefig('assets/Scikit_Learn_94.webp', bbox_inches='tight')
# nice... perfect separation by country!

scikit-learn - Machine Learning in Python

Unknown Number of Clusters

The clustermap created above allowed us to estimate the number of clusters needed to accurately label the dataset, based on the dendrogram displayed on its left side. If we do not know how many clusters are present in the dataset, we can instead define a maximum linkage distance above which clusters are no longer merged with their neighbours. Setting this threshold to zero results in a number of clusters equal to the number of datapoints.

autoMPG_model_auto = AgglomerativeClustering(
n_clusters=None,
metric='euclidean',
distance_threshold=0
)
cluster_labels_auto = autoMPG_model_auto.fit_predict(autoMPG_scaled)
len(np.unique(cluster_labels_auto))
# threshold of zero leads to 392 clusters == number of rows in our dataset
# find out a good distance threshold
linkage_matrix = hierarchy.linkage(autoMPG_model_auto.children_)
linkage_matrix
# [`cluster[i]`, `cluster[j]`, `distance between`, `number of members`]
# to display this matrix we can use the above mentioned dendrogram
plt.figure(figsize=(20,10))
plt.title('Hierarchy Dendrogram for 8 Classes')
dendro = hierarchy.dendrogram(linkage_matrix, truncate_mode='lastp', p=9)

plt.savefig('assets/Scikit_Learn_95.webp', bbox_inches='tight')
# The higher the y-value the larger the distance between the connected clusters

scikit-learn - Machine Learning in Python
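
Side note: hierarchy.linkage() above treats the children_ array itself as observations, which produces a usable but only approximate dendrogram. A more faithful linkage matrix can be assembled from the fitted model's own attributes - a sketch following the scikit-learn documentation example, assuming the model was fitted with a distance_threshold so that distances_ is populated:

def build_linkage_matrix(model):
    # count how many original samples sit under each internal node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    # each row: [cluster i, cluster j, merge distance, member count]
    return np.column_stack([model.children_, model.distances_, counts]).astype(float)

# linkage_matrix = build_linkage_matrix(autoMPG_model_auto)
# hierarchy.dendrogram(linkage_matrix, truncate_mode='lastp', p=9)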

# since miles-per-gallon is a good indicator for the label,
# check the distance between the cars with the highest and the lowest mpg:
car_max_mpg = autoMPG_scaled.iloc[autoMPG_scaled['mpg'].idxmax()]
car_min_mpg = autoMPG_scaled.iloc[autoMPG_scaled['mpg'].idxmin()]

np.linalg.norm(car_max_mpg - car_min_mpg)
# 3.1128158766165406
# if the max distance is ~3 the threshold should be < 3
autoMPG_model_auto = AgglomerativeClustering(
n_clusters=None,
metric='euclidean',
distance_threshold=2
)
cluster_labels_auto = autoMPG_model_auto.fit_predict(autoMPG_scaled)
len(np.unique(cluster_labels_auto))
# threshold of two leads to 11 clusters
autoMPG_model_auto = AgglomerativeClustering(
n_clusters=None,
metric='euclidean',
distance_threshold=3
)
cluster_labels_auto = autoMPG_model_auto.fit_predict(autoMPG_scaled)
len(np.unique(cluster_labels_auto))
# threshold of three leads to 9 clusters
autoMPG_df['label_auto'] = cluster_labels_auto
figure, axes = plt.subplots(1, 3, sharex=True,figsize=(15, 6))
figure.suptitle('Country of Origin')

sns.scatterplot(
x='horsepower',
y='mpg',
data=autoMPG_df[autoMPG_df['origin'] == 'europe'],
hue='label_auto',
palette='cool_r',
style='model_year',
ax=axes[0]
).set_title('Europe')

sns.scatterplot(
x='horsepower',
y='mpg',
data=autoMPG_df[autoMPG_df['origin'] == 'japan'],
hue='label_auto',
palette='cool_r',
style='model_year',
ax=axes[1]
).set_title('Japan')

sns.scatterplot(
x='horsepower',
y='mpg',
data=autoMPG_df[autoMPG_df['origin'] == 'usa'],
hue='label_auto',
palette='cool_r',
style='model_year',
ax=axes[2]
).set_title('USA')
plt.legend(bbox_to_anchor=(1.01,1.01))

plt.savefig('assets/Scikit_Learn_96.webp', bbox_inches='tight')
# the division by country is still there, but we now also get
# sub-classes within each country - which might be important depending on your goal

scikit-learn - Machine Learning in Python

Unsupervised Learning - Density-based Spatial Clustering (DBSCAN)

DBSCAN vs KMeans

blobs_df = pd.read_csv('datasets/blobs.csv')
blobs_df.tail(2)
            X1        X2
1498  5.454552  6.461246
1499 -7.769230  7.014384
plt.figure(figsize=(12,5))
plt.title('Blobs Dataset')
sns.scatterplot(data=blobs_df, x='X1', y='X2')

plt.savefig('assets/Scikit_Learn_97.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

moons_df = pd.read_csv('datasets/moons.csv')
moons_df.tail(2)
            X1        X2
1498  1.803858 -0.154705
1499  0.203305  0.079049
plt.figure(figsize=(12,5))
plt.title('Moons Dataset')
sns.scatterplot(data=moons_df, x='X1', y='X2')

plt.savefig('assets/Scikit_Learn_98.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

circles_df = pd.read_csv('datasets/circles.csv')
circles_df.tail(2)
            X1        X2
1498  0.027432 -0.264891
1499 -0.216732  0.183006
plt.figure(figsize=(12,5))
plt.title('Circles Dataset')
sns.scatterplot(data=circles_df, x='X1', y='X2')

plt.savefig('assets/Scikit_Learn_99.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python
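
The three CSV files come from the repository linked at the top. If they are not at hand, comparable toy datasets can be generated directly with scikit-learn - a sketch where the noise levels and random_state are assumptions, not the values behind the CSV files:

from sklearn.datasets import make_blobs, make_circles, make_moons

X_blobs, _ = make_blobs(n_samples=1500, centers=3, random_state=42)
X_moons, _ = make_moons(n_samples=1500, noise=0.05, random_state=42)
X_circles, _ = make_circles(n_samples=1500, noise=0.05, factor=0.5, random_state=42)

# wrap the arrays in dataframes with the same column names as the CSV files
blobs_df = pd.DataFrame(X_blobs, columns=['X1', 'X2'])
moons_df = pd.DataFrame(X_moons, columns=['X1', 'X2'])
circles_df = pd.DataFrame(X_circles, columns=['X1', 'X2'])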

def display_categories(model, data, axis):
    labels = model.fit_predict(data)
    sns.scatterplot(data=data, x='X1', y='X2', hue=labels, palette='cool', ax=axis)
km_model_blobs = KMeans(n_clusters=3, init='random', n_init='auto')
db_model_blobs = DBSCAN(eps=0.5, min_samples=5)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('3 Blobs Dataset')

axes[0].set_title('KMeans Clustering')
display_categories(km_model_blobs, blobs_df, axes[0])

axes[1].set_title('DBSCAN Clustering')
display_categories(db_model_blobs, blobs_df, axes[1])

plt.savefig('assets/Scikit_Learn_100.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

km_model_moons = KMeans(n_clusters=2, init='random', n_init='auto')
db_model_moons = DBSCAN(eps=0.2, min_samples=5)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Moons Dataset')

axes[0].set_title('KMeans Clustering')
display_categories(km_model_moons, moons_df, axes[0])

axes[1].set_title('DBSCAN Clustering')
display_categories(db_model_moons, moons_df, axes[1])

plt.savefig('assets/Scikit_Learn_101.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

km_model_circles = KMeans(n_clusters=2, init='random', n_init='auto')
db_model_circles = DBSCAN(eps=0.2, min_samples=5)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Circles Dataset')

axes[0].set_title('KMeans Clustering')
display_categories(km_model_circles, circles_df, axes[0])

axes[1].set_title('DBSCAN Clustering')
display_categories(db_model_circles, circles_df, axes[1])

plt.savefig('assets/Scikit_Learn_102.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

DBSCAN Hyperparameter Tuning

two_blobs_df = pd.read_csv('datasets/two-blobs.csv')
two_blobs_otl_df = pd.read_csv('datasets/two-blobs-outliers.csv')
# default hyperparameter
db_model_base = DBSCAN(eps=0.5, min_samples=5)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Blobs Dataset - Default Hyperparameter')

axes[0].set_title('DBSCAN Clustering w/o Outliers')
display_categories(db_model_base, two_blobs_df, axes[0])

axes[1].set_title('DBSCAN Clustering with Outliers')
display_categories(db_model_base, two_blobs_otl_df, axes[1])

plt.savefig('assets/Scikit_Learn_103.webp', bbox_inches='tight')
# points around cluster 1 are assigned as outliers

scikit-learn - Machine Learning in Python

# reducing epsilon reduces the maximum distance two points may have
# and still be counted as neighbours within the same cluster
db_model_dec = DBSCAN(eps=0.001, min_samples=5)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Blobs Dataset - Reduced Epsilon')

axes[0].set_title('DBSCAN Clustering w/o Outliers')
display_categories(db_model_dec, two_blobs_df, axes[0])

axes[1].set_title('DBSCAN Clustering with Outliers')
display_categories(db_model_dec, two_blobs_otl_df, axes[1])

plt.savefig('assets/Scikit_Learn_104.webp', bbox_inches='tight')
# the distance is too small - no point has enough neighbours, so every point is labelled an outlier (-1)

scikit-learn - Machine Learning in Python

# increasing epsilon increases the maximum distance two points may have
# and still be counted as neighbours within the same cluster
db_model_inc = DBSCAN(eps=10, min_samples=5)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Blobs Dataset - Increased Epsilon')

axes[0].set_title('DBSCAN Clustering w/o Outliers')
display_categories(db_model_inc, two_blobs_df, axes[0])

axes[1].set_title('DBSCAN Clustering with Outliers')
display_categories(db_model_inc, two_blobs_otl_df, axes[1])

plt.savefig('assets/Scikit_Learn_105.webp', bbox_inches='tight')
# the distance is too big - every point becomes part of the same cluster

scikit-learn - Machine Learning in Python

Elbow Plot

epsilon_value_range = np.linspace(0.0001, 1, 100)

n_outliers = []
perc_outlier = []
n_clusters = []

for epsilon in epsilon_value_range:
    dbscan_model = DBSCAN(eps=epsilon)
    dbscan_model.fit(two_blobs_otl_df)

    # total number of outliers
    n_outliers.append(np.sum(dbscan_model.labels_ == -1))
    # percentage of outliers
    perc_outlier.append(
        100 * np.sum(dbscan_model.labels_ == -1) / len(dbscan_model.labels_)
    )
    # number of clusters
    n_clusters.append(len(np.unique(dbscan_model.labels_)))
plt.figure(figsize=(12,5))
plt.title('Elbow Plot - DBSCAN Hyperparameter')
plt.xlabel('Epsilon (Max Distance between Points)')
plt.ylabel('Number of Outliers')
plt.ylim(0,10)
# we expect 3 outliers
plt.hlines(y=3, xmin=0, xmax=0.7, color='fuchsia')
# 3 outliers are reached somewhere around eps=0.7
plt.vlines(x=0.7, ymin=0, ymax=3, color='fuchsia')
sns.lineplot(x=epsilon_value_range, y=n_outliers)

plt.savefig('assets/Scikit_Learn_107.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(12,5))
plt.title('Number of Clusters by Epsilon Range')
plt.xlabel('Epsilon (Max Distance between Points)')
plt.ylabel('Number of Clusters')
# we expect 2 clusters + outliers
plt.hlines(y=3, xmin=0, xmax=1, color='fuchsia')
plt.ylim(0,50)
plt.xlim(0,1)
sns.lineplot(x=epsilon_value_range, y=n_clusters)

plt.savefig('assets/Scikit_Learn_108.webp', bbox_inches='tight')
# we already reach 3 clusters with an epsilon of 0.2,
# but as seen above we need an epsilon of ~0.7 to reduce
# the number of outliers to 3

scikit-learn - Machine Learning in Python
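
Another common way to pick epsilon - not used in this notebook, so treat it as an optional sketch - is the k-distance graph: sort every point's distance to its k-th nearest neighbour and look for the knee of the curve.

from sklearn.neighbors import NearestNeighbors

k = 2 * two_blobs_otl_df.shape[1]  # same rule of thumb as for min_samples
neighbours = NearestNeighbors(n_neighbors=k).fit(two_blobs_otl_df)
distances, _ = neighbours.kneighbors(two_blobs_otl_df)

# distance to the k-th nearest neighbour, sorted ascending
k_distances = np.sort(distances[:, -1])

plt.figure(figsize=(12,5))
plt.title('k-Distance Graph - the knee suggests a good epsilon')
plt.xlabel('Points sorted by distance')
plt.ylabel('Distance to kth nearest neighbour')
sns.lineplot(x=np.arange(len(k_distances)), y=k_distances)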

# find the optimum
# rule of thumb for min_samples = 2*n_dim
n_dim = two_blobs_otl_df.shape[1]
db_model_opt = DBSCAN(eps=0.7, min_samples=2*n_dim)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Blobs Dataset - Optimal Epsilon')

axes[0].set_title('DBSCAN Clustering w/o Outliers')
display_categories(db_model_opt, two_blobs_df, axes[0])

axes[1].set_title('DBSCAN Clustering with Outliers')
display_categories(db_model_opt, two_blobs_otl_df, axes[1])

plt.savefig('assets/Scikit_Learn_106.webp', bbox_inches='tight')
# the 3 outliers are labelled as such and every other point is assigned to one of the two clusters

scikit-learn - Machine Learning in Python

# find number of outliers
print('Number of Outliers', np.sum(db_model_opt.labels_ == -1))
# Number of Outliers 3
# get outlier percentage
print('Percentage of Outliers', (100 * np.sum(db_model_opt.labels_ == -1) / len(db_model_opt.labels_)).round(2),'%')
# Percentage of Outliers 0.3 %
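
As an extra sanity check (not part of the original notebook) the silhouette score can be computed on the clustered points, excluding the noise label -1:

from sklearn.metrics import silhouette_score

labels = db_model_opt.labels_
mask = labels != -1  # the silhouette score is only defined for clustered points

print('Silhouette Score', silhouette_score(two_blobs_otl_df[mask], labels[mask]).round(3))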

Real-World Dataset

Wholesale Customers: the dataset refers to clients of a wholesale distributor. It includes the annual spending in monetary units (m.u.) on diverse product categories.

Additional Information

  1. FRESH: annual spending (m.u.) on fresh products (Continuous)
  2. MILK: annual spending (m.u.) on milk products (Continuous)
  3. GROCERY: annual spending (m.u.) on grocery products (Continuous)
  4. FROZEN: annual spending (m.u.) on frozen products (Continuous)
  5. DETERGENTS_PAPER: annual spending (m.u.) on detergents and paper products (Continuous)
  6. DELICATESSEN: annual spending (m.u.) on delicatessen products (Continuous)
  7. CHANNEL: customer channel - Horeca (Hotel/Restaurant/Café) or Retail channel (Nominal)
  8. REGION: customer region - Lisbon, Oporto or Other (Nominal)

Dataset Exploration

wholesale_df = pd.read_csv('datasets/wholesome-customers-data.csv')
wholesale_df.head(5)
   Channel  Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen
0        2       3  12669  9656     7561     214              2674        1338
1        2       3   7057  9810     9568    1762              3293        1776
2        2       3   6353  8808     7684    2405              3516        7844
3        1       3  13265  1196     4221    6404               507        1788
4        2       3  22615  5410     7198    3915              1777        5185

wholesale_df.info()
plt.figure(figsize=(12,5))
plt.title('Whole Sale: Milk Products vs Groceries')
sns.scatterplot(
data=wholesale_df,
x='Milk', y='Grocery',
hue='Channel', style='Region',
palette='winter'
)

plt.savefig('assets/Scikit_Learn_109.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(10, 5))
plt.title('Whole Sale: Milk Products by Distribution Channel')

sns.histplot(
data=wholesale_df,
x='Milk',
bins=50,
hue='Channel',
palette='winter',
kde=True
)

plt.savefig('assets/Scikit_Learn_110.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

sns.clustermap(
wholesale_df.corr(),
linewidth=0.5,
cmap='winter',
annot=True,
col_cluster=False
)

plt.savefig('assets/Scikit_Learn_111.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

sns.pairplot(
data=wholesale_df,
hue='Region',
palette='winter'
)

plt.savefig('assets/Scikit_Learn_112.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Data Preprocessing

# normalize feature set
scaler = StandardScaler()
wholesale_scaled = pd.DataFrame(
scaler.fit_transform(wholesale_df), columns=wholesale_df.columns
)
wholesale_scaled.describe()
            Channel        Region         Fresh        Milk       Grocery        Frozen  Detergents_Paper    Delicassen
count  4.400000e+02  4.400000e+02  4.400000e+02  440.000000  4.400000e+02  4.400000e+02      4.400000e+02  4.400000e+02
mean   1.614870e-17  3.552714e-16 -3.431598e-17    0.000000 -4.037175e-17  3.633457e-17      2.422305e-17 -8.074349e-18
std    1.001138e+00  1.001138e+00  1.001138e+00    1.001138  1.001138e+00  1.001138e+00      1.001138e+00  1.001138e+00
min   -6.902971e-01 -1.995342e+00 -9.496831e-01   -0.778795 -8.373344e-01 -6.283430e-01     -6.044165e-01 -5.402644e-01
25%   -6.902971e-01 -7.023369e-01 -7.023339e-01   -0.578306 -6.108364e-01 -4.804306e-01     -5.511349e-01 -3.964005e-01
50%   -6.902971e-01  5.906683e-01 -2.767602e-01   -0.294258 -3.366684e-01 -3.188045e-01     -4.336004e-01 -1.985766e-01
75%    1.448652e+00  5.906683e-01  3.905226e-01    0.189092  2.849105e-01  9.946441e-02      2.184822e-01  1.048598e-01
max    1.448652e+00  5.906683e-01  7.927738e+00    9.183650  8.936528e+00  1.191900e+01      7.967672e+00  1.647845e+01

Model Hyperparameter Tuning

epsilon_value_range = np.linspace(0.001, 3, 100)
n_dim = wholesale_scaled.shape[1]

n_outliers = []
perc_outlier = []
n_clusters = []

for epsilon in epsilon_value_range:
    dbscan_model = DBSCAN(eps=epsilon, min_samples=2*n_dim)
    dbscan_model.fit(wholesale_scaled)

    # total number of outliers
    n_outliers.append(np.sum(dbscan_model.labels_ == -1))
    # percentage of outliers
    perc_outlier.append(
        100 * np.sum(dbscan_model.labels_ == -1) / len(dbscan_model.labels_)
    )
    # number of clusters
    n_clusters.append(len(np.unique(dbscan_model.labels_)))
plt.figure(figsize=(12,5))
plt.title('Elbow Plot - DBSCAN Hyperparameter')
plt.xlabel('Epsilon (Max Distance between Points)')
plt.ylabel('Number of Outliers')
plt.hlines(y=25, xmin=0, xmax=2, color='fuchsia')
plt.vlines(x=2, ymin=0, ymax=25, color='fuchsia')
sns.lineplot(x=epsilon_value_range, y=n_outliers)

plt.savefig('assets/Scikit_Learn_113.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(12,5))
plt.title('Number of Clusters by Epsilon Range')
plt.xlabel('Epsilon (Max Distance between Points)')
plt.ylabel('Number of Clusters')
plt.hlines(y=3, xmin=0, xmax=2, color='fuchsia')
plt.vlines(x=2, ymin=0, ymax=3, color='fuchsia')
sns.lineplot(x=epsilon_value_range, y=n_clusters)

plt.savefig('assets/Scikit_Learn_114.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

def wholesale_categories(model, data, x, y, axis):
    labels = model.fit_predict(data)
    sns.scatterplot(data=data, x=x, y=y, hue=labels, palette='cool', ax=axis)
db_model_opt = DBSCAN(eps=2.0, min_samples=2*n_dim)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('Whole Sale Dataset - DBSCAN Cluster (Normalized)')

axes[0].set_title('DBSCAN Clustering Milk Products vs Groceries')
wholesale_categories(
model=db_model_opt,
data=wholesale_scaled,
x='Milk', y='Grocery',
axis=axes[0]
)

axes[1].set_title('DBSCAN Clustering Milk Products vs Delicassen')
wholesale_categories(
model=db_model_opt,
data=wholesale_scaled,
x='Milk', y='Delicassen',
axis=axes[1]
)

plt.savefig('assets/Scikit_Learn_115a.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# add labels to original dataframe
wholesale_df['Label'] = db_model_opt.fit_predict(wholesale_scaled)
wholesale_df['Label'].head(5)
# remove outliers
wholesale_df_wo_otl = wholesale_df[wholesale_df['Label'] != -1]
db_model_opt = DBSCAN(eps=3.0, min_samples=2*n_dim)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('Whole Sale Dataset - DBSCAN Cluster (w/o Outliers)')

axes[0].set_title('DBSCAN Clustering Milk Products vs Groceries')
sns.scatterplot(
data=wholesale_df_wo_otl,
x='Milk', y='Grocery',
hue='Label',
palette='cool',
ax=axes[0]
)

axes[1].set_title('DBSCAN Clustering Milk Products vs Delicassen')
sns.scatterplot(
data=wholesale_df_wo_otl,
x='Milk', y='Delicassen',
hue='Label',
palette='cool',
ax=axes[1]
)

plt.savefig('assets/Scikit_Learn_115b.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# see if the mean values of each cluster differ from each other
grouped_df = wholesale_df.groupby('Label').mean()
        Channel    Region         Fresh          Milk       Grocery        Frozen  Detergents_Paper   Delicassen
Label
-1         1.52  2.480000  27729.920000  22966.960000  26609.600000  11289.640000      11173.560000  6707.160000
 0         2.00  2.620155   8227.666667   8615.852713  13859.674419   1447.759690       5969.581395  1498.457364
 1         1.00  2.513986  12326.972028   3023.559441   3655.328671   3086.181818        763.783217  1083.786713
scaler = MinMaxScaler()
grouped_scaler = pd.DataFrame(
scaler.fit_transform(grouped_df), columns=grouped_df.columns, index=['Outlier', 'Cluster 1', 'Cluster 2']
)
grouped_scaler.head()
           Channel    Region     Fresh      Milk   Grocery    Frozen  Detergents_Paper  Delicassen
Outlier       0.52  0.000000  1.000000  1.000000  1.000000  1.000000          1.000000    1.000000
Cluster 1     1.00  1.000000  0.000000  0.280408  0.444551  0.000000          0.500087    0.073741
Cluster 2     0.00  0.242489  0.210196  0.000000  0.000000  0.166475          0.000000    0.000000
plt.figure(figsize=(12, 3))
plt.title('Scaled Cluster / Outliers Comparison (Normalized)')

sns.heatmap(
grouped_scaler,
linewidth=0.5,
cmap='coolwarm',
annot=True
)

plt.savefig('assets/Scikit_Learn_116.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# 'Label' is already the index after the groupby, so there is no extra column to drop
# remove outlier
wholesale_clusters = grouped_df.drop(-1, axis=0)
wholesale_clusters.head()
       Channel    Region         Fresh         Milk       Grocery       Frozen  Detergents_Paper   Delicassen
Label
0          2.0  2.620155   8227.666667  8615.852713  13859.674419  1447.759690       5969.581395  1498.457364
1          1.0  2.513986  12326.972028  3023.559441   3655.328671  3086.181818        763.783217  1083.786713
plt.figure(figsize=(12, 3))
plt.title('Mean Spending Values for Cluster 1 and 2')

sns.heatmap(
wholesale_clusters,
linewidth=0.5,
cmap='coolwarm',
annot=True
)

plt.savefig('assets/Scikit_Learn_117.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Dimensionality Reduction - Principal Component Analysis (PCA)

Dataset Preprocessing

Breast cancer wisconsin (diagnostic) dataset.

  • Attribute Information:
    • radius (mean of distances from center to points on the perimeter)
    • texture (standard deviation of gray-scale values)
    • perimeter
    • area
    • smoothness (local variation in radius lengths)
    • compactness (perimeter^2 / area - 1.0)
    • concavity (severity of concave portions of the contour)
    • concave points (number of concave portions of the contour)
    • symmetry
    • fractal dimension ("coastline approximation" - 1)

The mean, standard error, and "worst" or largest (mean of the three worst/largest values) of these features were computed for each image, resulting in 30 features. For instance, field 0 is Mean Radius, field 10 is Radius SE, field 20 is Worst Radius.

  • class:
    • WDBC-Malignant
    • WDBC-Benign
tumor_df = pd.read_csv('datasets/cancer-tumor-data-features.csv')
tumor_df.head(5).transpose()
                                  0            1            2           3            4
mean radius               17.990000    20.570000    19.690000   11.420000    20.290000
mean texture              10.380000    17.770000    21.250000   20.380000    14.340000
mean perimeter           122.800000   132.900000   130.000000   77.580000   135.100000
mean area               1001.000000  1326.000000  1203.000000  386.100000  1297.000000
mean smoothness            0.118400     0.084740     0.109600    0.142500     0.100300
mean compactness           0.277600     0.078640     0.159900    0.283900     0.132800
mean concavity             0.300100     0.086900     0.197400    0.241400     0.198000
mean concave points        0.147100     0.070170     0.127900    0.105200     0.104300
mean symmetry              0.241900     0.181200     0.206900    0.259700     0.180900
mean fractal dimension     0.078710     0.056670     0.059990    0.097440     0.058830
radius error               1.095000     0.543500     0.745600    0.495600     0.757200
texture error              0.905300     0.733900     0.786900    1.156000     0.781300
perimeter error            8.589000     3.398000     4.585000    3.445000     5.438000
area error               153.400000    74.080000    94.030000   27.230000    94.440000
smoothness error           0.006399     0.005225     0.006150    0.009110     0.011490
compactness error          0.049040     0.013080     0.040060    0.074580     0.024610
concavity error            0.053730     0.018600     0.038320    0.056610     0.056880
concave points error       0.015870     0.013400     0.020580    0.018670     0.018850
symmetry error             0.030030     0.013890     0.022500    0.059630     0.017560
fractal dimension error    0.006193     0.003532     0.004571    0.009208     0.005115
worst radius              25.380000    24.990000    23.570000   14.910000    22.540000
worst texture             17.330000    23.410000    25.530000   26.500000    16.670000
worst perimeter          184.600000   158.800000   152.500000   98.870000   152.200000
worst area              2019.000000  1956.000000  1709.000000  567.700000  1575.000000
worst smoothness           0.162200     0.123800     0.144400    0.209800     0.137400
worst compactness          0.665600     0.186600     0.424500    0.866300     0.205000
worst concavity            0.711900     0.241600     0.450400    0.686900     0.400000
worst concave points       0.265400     0.186000     0.243000    0.257500     0.162500
worst symmetry             0.460100     0.275000     0.361300    0.663800     0.236400
worst fractal dimension    0.118900     0.089020     0.087580    0.173000     0.076780
# normalizing data
scaler = StandardScaler()
tumor_scaled_arr = scaler.fit_transform(tumor_df)
tumor_scaled_df = pd.DataFrame(
tumor_scaled_arr, columns=tumor_df.columns
)
tumor_scaled_df.head(5).transpose()
                                0         1         2         3         4
mean radius              1.097064  1.829821  1.579888 -0.768909  1.750297
mean texture            -2.073335 -0.353632  0.456187  0.253732 -1.151816
mean perimeter           1.269934  1.685955  1.566503 -0.592687  1.776573
mean area                0.984375  1.908708  1.558884 -0.764464  1.826229
mean smoothness          1.568466 -0.826962  0.942210  3.283553  0.280372
mean compactness         3.283515 -0.487072  1.052926  3.402909  0.539340
mean concavity           2.652874 -0.023846  1.363478  1.915897  1.371011
mean concave points      2.532475  0.548144  2.037231  1.451707  1.428493
mean symmetry            2.217515  0.001392  0.939685  2.867383 -0.009560
mean fractal dimension   2.255747 -0.868652 -0.398008  4.910919 -0.562450
radius error             2.489734  0.499255  1.228676  0.326373  1.270543
texture error           -0.565265 -0.876244 -0.780083 -0.110409 -0.790244
perimeter error          2.833031  0.263327  0.850928  0.286593  1.273189
area error               2.487578  0.742402  1.181336 -0.288378  1.190357
smoothness error        -0.214002 -0.605351 -0.297005  0.689702  1.483067
compactness error        1.316862 -0.692926  0.814974  2.744280 -0.048520
concavity error          0.724026 -0.440780  0.213076  0.819518  0.828471
concave points error     0.660820  0.260162  1.424827  1.115007  1.144205
symmetry error           1.148757 -0.805450  0.237036  4.732680 -0.361092
fractal dimension error  0.907083 -0.099444  0.293559  2.047511  0.499328
worst radius             1.886690  1.805927  1.511870 -0.281464  1.298575
worst texture           -1.359293 -0.369203 -0.023974  0.133984 -1.466770
worst perimeter          2.303601  1.535126  1.347475 -0.249939  1.338539
worst area               2.001237  1.890489  1.456285 -0.550021  1.220724
worst smoothness         1.307686 -0.375612  0.527407  3.394275  0.220556
worst compactness        2.616665 -0.430444  1.082932  3.893397 -0.313395
worst concavity          2.109526 -0.146749  0.854974  1.989588  0.613179
worst concave points     2.296076  1.087084  1.955000  2.175786  0.729259
worst symmetry           2.750622 -0.243890  1.152255  6.046041 -0.868353
worst fractal dimension  1.937015  0.281190  0.201391  4.935010 -0.397100

Model Fitting

pca_model = PCA(n_components=2)
pca_results = pca_model.fit_transform(tumor_scaled_df)
print(pca_model.explained_variance_ratio_)
print(np.sum(pca_model.explained_variance_ratio_))
# the two principal components are able to describe
# 63% of the variance in the dataset
# [0.44272026 0.18971182]
# 0.6324320765155945
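
To see which of the 30 original features drive each principal component, the loadings stored in pca_model.components_ can be visualized - a short sketch added for illustration, not part of the original notebook:

loadings = pd.DataFrame(
    pca_model.components_,
    columns=tumor_scaled_df.columns,
    index=['PC1', 'PC2']
)

plt.figure(figsize=(16, 3))
plt.title('PCA Loadings - Cancer Tumor Dataset')
sns.heatmap(loadings, linewidth=0.5, cmap='coolwarm', annot=False)
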
# adding components to original dataframe
tumor_df[['PC1','PC2']] = pca_results
tumor_df[['PC1','PC2']].head(5).transpose()
            0         1         2          3         4
PC1  9.192837  2.387802  5.733896   7.122953  3.935302
PC2  1.948583 -3.768172 -1.075174  10.275589 -1.948072
plt.figure(figsize=(12,5))
plt.title('Principal Component Analysis - Cancer Tumor Dataset')
sns.scatterplot(
data=tumor_df,
x='PC1', y='PC2'
)

plt.savefig('assets/Scikit_Learn_118.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# get the label data from the original dataset to confirm that we still have
# separable clusters after reducing the dimensions to 2
from sklearn.datasets import load_breast_cancer
tumor_dataset = load_breast_cancer()
tumor_dataset.keys()
# dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
tumor_dataset['target']
plt.figure(figsize=(12,5))
plt.title('PCA Cancer Tumor Dataset - Coloured by Labels')
sns.scatterplot(
data=tumor_df,
x='PC1', y='PC2',
hue=tumor_dataset['target'],
palette='winter'
)

plt.savefig('assets/Scikit_Learn_119.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# as shown above we get around 63% of the variance explained by using 2 principal components
# since the dataset has 30 features, 30 principal components will explain 100% of the variance

explained_variance = []

for n in range(1,31):
    pca = PCA(n_components=n)
    pca.fit(tumor_scaled_df)

    explained_variance.append(np.sum(pca.explained_variance_ratio_))
plt.figure(figsize=(10, 5))
plt.title('Explained Variance by Number of Principal Components')
plt.xlabel('Principal Components')
sns.set(style='darkgrid')
sns.barplot(
data=pd.DataFrame(explained_variance, columns=['Explained Variance']),
x=np.arange(1,31),
y='Explained Variance'
)

plt.savefig('assets/Scikit_Learn_120.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python
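
Instead of reading the number of components off the bar plot, PCA can pick it for a target variance when n_components is passed as a float between 0 and 1 - a short sketch, with 0.95 as an arbitrary example target:

pca_95 = PCA(n_components=0.95)
pca_95.fit(tumor_scaled_df)

# number of components needed for at least 95% explained variance
print(pca_95.n_components_)
print(np.sum(pca_95.explained_variance_ratio_).round(4))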

Dataset 2

Which handwritten digits are the hardest to tell apart for an ML model?

digits_df = pd.read_csv('datasets/digits.csv')
digits_df.head(5).transpose()
                 0     1     2     3     4
pixel_0_0      0.0   0.0   0.0   0.0   0.0
pixel_0_1      0.0   0.0   0.0   0.0   0.0
pixel_0_2      5.0   0.0   0.0   7.0   0.0
pixel_0_3     13.0  12.0   4.0  15.0   1.0
pixel_0_4      9.0  13.0  15.0  13.0  11.0
...            ...   ...   ...   ...   ...
pixel_7_4     10.0  16.0  11.0  13.0  16.0
pixel_7_5      0.0  10.0  16.0   9.0   4.0
pixel_7_6      0.0   0.0   9.0   0.0   0.0
pixel_7_7      0.0   0.0   0.0   0.0   0.0
number_label   0.0   1.0   2.0   3.0   4.0
# drop label column
X_digits = digits_df.drop('number_label', axis=1)
digits_labels = digits_df['number_label']
# select a single image
img_idx = 333
Single_Digit = np.array(X_digits.iloc[img_idx])
Single_Digit.shape
# the images inside the dataset are flattened
# (64,)
# need to be turned back into their 8x8 pixel format
Single_Digit = Single_Digit.reshape((8, 8))
Single_Digit.shape
# (8, 8)
# Display the Image
plt.figure(figsize=(4,4))
plt.imshow(Single_Digit, interpolation='nearest', cmap='plasma')
plt.title('Digit Label: %d' % digits_labels[img_idx])
plt.show()

scikit-learn - Machine Learning in Python

plt.figure(figsize=(8,6))
plt.title('Digit Label: %d' % digits_labels[img_idx])

sns.heatmap(
Single_Digit,
linewidth=0.5,
cmap='plasma_r',
annot=True
)

plt.savefig('assets/Scikit_Learn_122.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python
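
To get a feeling for the variety of the handwriting, a small grid of the first ten images can be drawn (purely illustrative, not part of the original notebook):

figure, axes = plt.subplots(2, 5, figsize=(10, 4))

for ax, idx in zip(axes.ravel(), range(10)):
    ax.imshow(np.array(X_digits.iloc[idx]).reshape((8, 8)), cmap='plasma')
    ax.set_title('Label: %d' % digits_labels[idx])
    ax.axis('off')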

Dataset 2 Preprocessing

# normalize data
scaler = StandardScaler()
digits_scaled = pd.DataFrame(
scaler.fit_transform(X_digits), columns=X_digits.columns
)
digits_scaled.head(5).transpose()
                  0         1         2         3         4
pixel_0_0  0.000000  0.000000  0.000000  0.000000  0.000000
pixel_0_1 -0.335016 -0.335016 -0.335016 -0.335016 -0.335016
pixel_0_2 -0.043081 -1.094937 -1.094937  0.377661 -1.094937
pixel_0_3  0.274072  0.038648 -1.844742  0.744919 -2.551014
pixel_0_4 -0.664478  0.268751  0.735366  0.268751 -0.197863
...             ...       ...       ...       ...       ...
pixel_7_3  0.208293 -0.249010 -2.078218  0.208293 -2.306869
pixel_7_4 -0.366771  0.849632 -0.164037  0.241430  0.849632
pixel_7_5 -1.146647  0.548561  1.565686  0.379040 -0.468564
pixel_7_6 -0.505670 -0.505670  1.695137 -0.505670 -0.505670
pixel_7_7 -0.196008 -0.196008 -0.196008 -0.196008 -0.196008

Model Fitting

pca_model2 = PCA(n_components=2)
pca_results2 = pca_model2.fit_transform(digits_scaled)
print(np.sum(pca_model2.explained_variance_ratio_))
# reducing the number of dimensions from 64 -> 2 leads to 22% explained variance
X_digits[['PC1','PC2']] = pca_results2
X_digits[['PC1','PC2']].head(5).transpose()
            0         1         2         3         4
PC1  1.914264  0.588997  1.302144 -3.020847  4.528854
PC2 -0.954564  0.924622 -0.317291 -0.868696 -1.093369
plt.figure(figsize=(12,5))
plt.title('PCA Digits Dataset - Coloured by Labels')
sns.scatterplot(
data=X_digits,
x='PC1', y='PC2',
hue=digits_labels,
palette='tab20'
)
plt.legend(bbox_to_anchor=(1.01,1.01))

plt.savefig('assets/Scikit_Learn_123.webp', bbox_inches='tight')
# numbers 4 and 7 are very distinct. There is some overlap between 6 and 0 and between 2 and 3
# but you can still get some separation. All the numbers in the middle are 'problematic' and
# probably need a larger amount of training data.

scikit-learn - Machine Learning in Python
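
A visual way to judge how much information two components retain - a sketch, not part of the original notebook - is to map the compressed representation back to pixel space with inverse_transform and undo the scaling:

# 2 principal components -> 64 standardized pixel values -> pixel intensities
reconstructed_scaled = pca_model2.inverse_transform(pca_results2)
reconstructed_pixels = scaler.inverse_transform(reconstructed_scaled)

plt.figure(figsize=(4,4))
plt.title('Digit %d reconstructed from 2 components' % digits_labels[img_idx])
plt.imshow(reconstructed_pixels[img_idx].reshape((8, 8)), interpolation='nearest', cmap='plasma')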

# how many components would we have to add to reach 80% explained variance
explained_variance = []

for n in range(1,65):
    pca = PCA(n_components=n)
    pca.fit(digits_scaled)

    explained_variance.append(np.sum(pca.explained_variance_ratio_))
plt.figure(figsize=(16, 5))
plt.title('Explained Variance by Number of Principal Components')
plt.xlabel('Principal Components')
sns.set(style='darkgrid')
sns.barplot(
data=pd.DataFrame(explained_variance, columns=['Explained Variance']),
x=np.arange(1,65),
y='Explained Variance'
)

plt.savefig('assets/Scikit_Learn_124.webp', bbox_inches='tight')
# we need more than 20 of the 64 principal components to reach 80% explained variance:

scikit-learn - Machine Learning in Python

# rerun the training for 3 components for ~30% explained variance
pca_model3 = PCA(n_components=3)
pca_results3 = pca_model3.fit_transform(digits_scaled)
print(np.sum(pca_model3.explained_variance_ratio_))
# reducing the number of dimensions from 64 -> 3 leads to 30% explained variance
X_digits[['PC1','PC2','PC3']] = pca_results3
X_digits[['PC1','PC2','PC3']].head(5).transpose()
            0         1         2         3         4
PC1  1.914213  0.588981  1.302030 -3.020765  4.528946
PC2 -0.954510  0.924646 -0.317199 -0.868788 -1.093498
PC3 -3.945982  3.924713  3.023435 -0.801779  0.973213
%matplotlib notebook
fig = plt.figure(figsize=(8,8))
ax = plt.axes(projection='3d')
ax.scatter3D(
xs=X_digits['PC1'],
ys=X_digits['PC2'],
zs=X_digits['PC3'],
c=digits_labels,
cmap='tab20'
)
ax.set_title('PCA Digits Dataset - Coloured by Labels')
ax.set(
xticklabels=[],
yticklabels=[],
zticklabels=[],
xlabel='PC1',
ylabel='PC2',
zlabel='PC3',
)

# plt.savefig('assets/Scikit_Learn_125.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python