Skip to main content

TST, HongKong

Serving your SciKit Learn Model as a Prediction API

Github Repository

Image Data Preprocessing

import collections
from glob import glob
import matplotlib.pyplot as plt
from matplotlib import patches
import numpy as np
import os
import pandas as pd
import pickle
import re
from scipy import ndimage
from skimage import (
io,
color,
exposure,
transform,
feature
)
import seaborn as sns
from sklearn.metrics import (
classification_report,
confusion_matrix,
ConfusionMatrixDisplay)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
SEED = 42

Image Dataset Preparation

Get Image Dataset from Local Directory

# get list of class sub-folders - https://paperswithcode.com/dataset/food-101
# (directory layout: dataset/animals/<label>/<image>.jpg)
data_dir = os.listdir('dataset/animals')
print(data_dir)
# ['bear', 'cat', 'chicken', 'cow', 'deer', 'dog', 'duck', 'eagle', 'elephant', 'human', 'lion', 'monkey', 'mouse', 'panda', 'pigeon', 'rabbit', 'sheep', 'tiger', 'wolf']

# example: all wolf images
glob('./dataset/animals/{}/*.jpg'.format('wolf'))
# ['./dataset/animals/wolf/wolffrontal0001.jpg',
# './dataset/animals/wolf/wolffrontal0002.jpg',
# './dataset/animals/wolf/wolffrontal0003.jpg',
# ...

# collect every .jpg across all class sub-folders
all_files = []
for subfolder in data_dir:
    all_files += glob('./dataset/animals/{}/*.jpg'.format(subfolder))

len(all_files)
# 2015

# lazy-loading image collection keyed by file path
data_collection = io.ImageCollection(all_files)
data_collection.files
# ['./dataset/animals/bear/bearfrontal0001.jpg',
# './dataset/animals/bear/bearfrontal0002.jpg',
# ...

Get Image Labels from Folder Structure

# The class label is the name of the image's parent folder - extract it
# with a regular expression (dots escaped so they match literally). Example:
re.search(
    r'\./dataset/animals/(.*?)/',
    './dataset/animals/bear/bearfrontal0001.jpg'
).group(1)
# label extracted: 'bear'


def extract_labels(location):
    """Return the class label (parent-folder name) for an image path.

    Expects paths of the form './dataset/animals/<label>/<file>.jpg';
    raises AttributeError if the path does not match that layout.
    """
    label = re.search(
        r'\./dataset/animals/(.*?)/', location
    ).group(1)
    return label
# apply the extractor to every file path in the collection
labels = [extract_labels(path) for path in data_collection.files]

# the distinct classes present in the dataset
list(set(labels))
# ['human', 'cat', 'lion', 'sheep', 'cow', 'mouse', 'pigeon', 'tiger',
#  'rabbit', 'elephant', 'deer', 'eagle', 'dog', 'wolf', 'panda',
#  'monkey', 'duck', 'chicken', 'bear']

Dataset Export

def buffer(item):
    # identity helper - mapping it over the lazy ImageCollection forces
    # every image to actually be loaded into memory
    return item

# dataset_arrs = np.array(list(map(buffer,dataset)))
dataset_list = list(map(buffer, data_collection))
dataset_array = np.asarray(dataset_list)
dataset_array.shape
# (2015, 80, 80, 3)

# bundle data + labels into a single serializable dict
data_dict = dict()
data_dict['description'] = '2015 80x80 RGB images of 19 classes.'
data_dict['data'] = dataset_array
data_dict['target'] = labels
data_dict['labels'] = set(labels)

# class frequencies for the target distribution
label_distribution = collections.Counter(data_dict['target'])

plt.figure(figsize=(16, 5))
plt.title('Target Distribution in Dataset')

sns.countplot(
    data=data_dict,
    x='target'
)

plt.savefig('assets/Scikit_Image_Model_Deployment_01.webp', bbox_inches='tight')

Scikit-Image Introduction

# plot 12 random images with their ground-truth labels
ran_gen = np.random.default_rng()

plt.figure(figsize=(12, 12))

for i in range(12):
    ax = plt.subplot(4, 4, i + 1)
    # draw one random index from the 2015 images
    random_index = ran_gen.integers(low=0, high=2015, size=1)
    plt.imshow(data_dict['data'][random_index[0]])
    plt.title(data_dict['target'][random_index[0]])
    plt.axis(False)

plt.savefig('assets/Scikit_Image_Model_Deployment_02.webp', bbox_inches='tight')

Scikit-Image Introduction

# save the dataset; the context manager guarantees the file is
# flushed and closed (the original handle was never closed)
with open('dataset/animals.pkl', 'wb') as output:
    pickle.dump(data_dict, output)

Data Preprocessing

# load the dataset pickle (a trusted, locally created file - pickle is
# unsafe on untrusted input); close the handle via context manager
with open('dataset/animals.pkl', 'rb') as fp:
    dataset = pickle.load(fp)
dataset['description']

X = dataset['data']
y = dataset['target']

# stratified 80/20 train-test-split, reproducible via SEED
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)
print(X_train.shape, X_test.shape)
# (1612, 80, 80, 3) (403, 80, 80, 3)

Feature Extraction

Histogram of Oriented Gradients (HOG)

# demo: HOG feature descriptor on a single example image
testimg = io.imread('assets/lion.jpg')
feature_vector, hog_image = feature.hog(
    testimg,
    orientations=8,
    pixels_per_cell=(16, 16),
    cells_per_block=(1, 1),
    visualize=True,
    channel_axis=-1,
)

# Rescale histogram for better display
hog_image_rescaled = exposure.rescale_intensity(hog_image, in_range=(0, 5))

# side-by-side: original image vs. its HOG visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)

ax1.axis('off')
ax1.imshow(testimg, cmap=plt.cm.gray)
ax1.set_title('Input image')

ax2.axis('off')
ax2.imshow(hog_image_rescaled, cmap=plt.cm.gray)
ax2.set_title('Histogram of Oriented Gradients')
plt.show()

Scikit-Image Introduction

class hog_transformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns RGB images into HOG feature vectors.

    Parameters mirror skimage.feature.hog; the transformer is stateless,
    so fit() is a no-op and the class is safe to clone in GridSearchCV.
    """

    def __init__(
        self,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(3, 3)
    ):
        self.orientations = orientations
        self.pixels_per_cell = pixels_per_cell
        self.cells_per_block = cells_per_block

    def fit(self, X, y=None):
        # nothing to learn - HOG is a fixed feature descriptor
        return self

    def transform(self, X, y=None):
        """Return a list of 1-D HOG descriptors, one per image in X."""
        def local_hog(img):
            # channel_axis=-1: images are H x W x RGB
            feature_vector = feature.hog(
                img,
                orientations=self.orientations,
                pixels_per_cell=self.pixels_per_cell,
                cells_per_block=self.cells_per_block,
                channel_axis=-1
            )
            return feature_vector

        return [local_hog(x) for x in X]

Model Training

SGD Classifier Training Pipeline

  1. Feature Extraction
  2. Normalization
  3. Model Fitting
# manual pipeline: 1. HOG features  2. standardization  3. linear SVM via SGD
feature_extractor = hog_transformer()
scaler = StandardScaler()
model_sgd = SGDClassifier(
    loss='hinge',
    learning_rate='adaptive',
    eta0=0.1,
    early_stopping=True,
)

# fit on train, only transform on test (no leakage)
X_train_fv = feature_extractor.fit_transform(X_train)
X_test_fv = feature_extractor.transform(X_test)
X_train_fv_scaled = scaler.fit_transform(X_train_fv)
X_test_fv_scaled = scaler.transform(X_test_fv)

model_sgd.fit(X_train_fv_scaled, y_train)

Model Evaluation

# predictions on the hold-out set
y_pred = model_sgd.predict(X_test_fv_scaled)

# plot 12 random test images with their predicted labels
ran_gen = np.random.default_rng()

plt.figure(figsize=(12, 12))

for i in range(12):
    ax = plt.subplot(4, 4, i + 1)
    # 403 = size of the test split
    random_index = ran_gen.integers(low=0, high=403, size=1)
    plt.imshow(X_test[random_index[0]])
    plt.title(y_pred[random_index[0]])
    plt.axis(False)

plt.savefig('assets/Scikit_Image_Model_Deployment_04.webp', bbox_inches='tight')

Scikit-Image Introduction

# confusion matrix - label the axes with the sorted class names, which is
# the label order confusion_matrix uses internally; the original passed
# display_labels=[False], leaving the axes effectively unlabeled
conf_mtx = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, y_pred),
    display_labels=sorted(set(y_test))
)

conf_mtx.plot(cmap='plasma')

# per-class precision / recall / f1 as a dataframe
eval_report = classification_report(y_test, y_pred, output_dict=True)
eval_df = pd.DataFrame(eval_report)
eval_df.transpose()
              precision    recall  f1-score   support
bear0.4285710.4500000.43902420.000000
cat0.6000000.5625000.58064532.000000
chicken0.6111110.5500000.57894720.000000
cow0.5000000.6000000.54545520.000000
deer0.6956520.8000000.74418620.000000
dog0.5172410.5769230.54545526.000000
duck0.7777780.7000000.73684220.000000
eagle0.4666670.7368420.57142919.000000
elephant0.5416670.6500000.59090920.000000
human0.8260870.9500000.88372120.000000
lion0.3750000.3000000.33333320.000000
monkey0.4666670.3500000.40000020.000000
mouse0.4166670.2500000.31250020.000000
panda0.6785710.8260870.74509823.000000
pigeon0.6315790.6000000.61538520.000000
rabbit0.8000000.6000000.68571420.000000
sheep0.5500000.5500000.55000020.000000
tiger0.8000000.6956520.74418623.000000
wolf0.4444440.4000000.42105320.000000
accuracy0.5880890.5880890.5880890.588089
macro avg0.5856690.5867370.580204403.000000
weighted avg0.5876590.5880890.582170403.000000

Hyperparameter Tuning

Training Pipeline

# same three steps as the manual run, packaged as one Pipeline object
train_pipeline = Pipeline([
    ('feature_extraction', hog_transformer(
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(3, 3))
    ),
    ('normalization', StandardScaler()),
    ('model_training', SGDClassifier(
        loss='hinge', eta0=0.1,
        learning_rate='adaptive',
        early_stopping=True)
    )
])

train_pipeline.fit(X_train, y_train)
y_pred_pipe = train_pipeline.predict(X_test)

# per-class metrics for the pipeline model
eval_report = classification_report(y_test, y_pred_pipe, output_dict=True)
eval_df = pd.DataFrame(eval_report)
eval_df.transpose()
              precision    recall  f1-score   support
bear0.3684210.3500000.35897420.000000
cat0.6071430.5312500.56666732.000000
chicken0.5714290.6000000.58536620.000000
cow0.6818180.7500000.71428620.000000
deer0.7727270.8500000.80952420.000000
dog0.3548390.4230770.38596526.000000
duck0.5000000.5500000.52381020.000000
eagle0.4736840.4736840.47368419.000000
elephant0.4444440.6000000.51063820.000000
human0.6800000.8500000.75555620.000000
lion0.5000000.3500000.41176520.000000
monkey0.5000000.4000000.44444420.000000
mouse0.3076920.2000000.24242420.000000
panda0.7727270.7391300.75555623.000000
pigeon0.6086960.7000000.65116320.000000
rabbit0.6000000.6000000.60000020.000000
sheep0.5000000.5500000.52381020.000000
tiger0.7222220.5652170.63414623.000000
wolf0.7894740.7500000.76923120.000000
accuracy0.5682380.5682380.5682380.568238
macro avg0.5660690.5701240.564053403.000000
weighted avg0.5670780.5682380.563651403.000000

GridSearch

# pipeline skeleton whose hyperparameters the grid search will tune
estimator = Pipeline([
    ('feature_extraction', hog_transformer()),
    ('normalization', StandardScaler()),
    ('model_training', SGDClassifier())
])

# search space: 5 * 3 * 3 * 3 * 3 * 2 = 810 candidate configurations
param_grid = [
    {
        'feature_extraction__orientations': [7, 8, 9, 10, 11],
        'feature_extraction__pixels_per_cell': [(7, 7), (8, 8), (9, 9)],
        'feature_extraction__cells_per_block': [(2, 2), (3, 3), (4, 4)],
        'model_training__loss': ['hinge', 'squared_hinge', 'perceptron'],
        'model_training__eta0': [0.001, 0.1, 1],
        'model_training__learning_rate': ['optimal', 'adaptive']
    }
]

model_grid = GridSearchCV(
    estimator,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2
)

model_grid.fit(X_train, y_train)
# Fitting 3 folds for each of 810 candidates, totalling 2430 fits
# time for a coffee break :)

Best Model Evaluation

# inspect the winning configuration and its cross-validated accuracy
model_grid.best_params_
model_grid.best_score_

# evaluate the refitted best pipeline on the hold-out set
model_best = model_grid.best_estimator_
y_pred_best = model_best.predict(X_test)
eval_report_best = classification_report(
    y_test, y_pred_best, output_dict=True
)
eval_best_df = pd.DataFrame(eval_report_best)
eval_best_df.transpose()
              precision    recall  f1-score   support
bear0.6800000.8500000.75555620.000000
cat0.7500000.6562500.70000032.000000
chicken0.8095240.8500000.82926820.000000
cow0.8095240.8500000.82926820.000000
deer0.8695651.0000000.93023320.000000
dog0.6071430.6538460.62963026.000000
duck0.7894740.7500000.76923120.000000
eagle0.6470590.5789470.61111119.000000
elephant0.7826090.9000000.83720920.000000
human1.0000000.9000000.94736820.000000
lion0.6923080.4500000.54545520.000000
monkey0.6315790.6000000.61538520.000000
mouse0.6842110.6500000.66666720.000000
panda0.9166670.9565220.93617023.000000
pigeon0.6666670.8000000.72727320.000000
rabbit0.8666670.6500000.74285720.000000
sheep0.7500000.7500000.75000020.000000
tiger0.8000000.8695650.83333323.000000
wolf0.8095240.8500000.82926820.000000
accuracy0.7642680.7642680.7642680.764268
macro avg0.7664480.7665860.762383403.000000
weighted avg0.7652510.7642680.760746403.000000
# confusion matrix for the tuned model - label the axes with the sorted
# class names (the order confusion_matrix uses) instead of the
# placeholder [False] the original passed
conf_mtx_best = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, y_pred_best),
    display_labels=sorted(set(y_test))
)

conf_mtx_best.plot(cmap='plasma')

Scikit-Image Introduction

# plot 12 random test images with the tuned model's predicted labels
ran_gen = np.random.default_rng()

plt.figure(figsize=(12, 12))

for i in range(12):
    ax = plt.subplot(4, 4, i + 1)
    # 403 = size of the test split
    random_index = ran_gen.integers(low=0, high=403, size=1)
    plt.imshow(X_test[random_index[0]])
    plt.title(y_pred_best[random_index[0]])
    plt.axis(False)

plt.savefig('assets/Scikit_Image_Model_Deployment_07.webp', bbox_inches='tight')

Scikit-Image Introduction

Trained Model Export

# save the best grid-search model; the context manager guarantees the
# file handle is flushed and closed (the original never closed it)
with open('model/animal_model_best.pkl', 'wb') as output:
    pickle.dump(model_best, output)

Deployment Model

Model Training

# best hyperparameters found by the grid search, hard-coded here so the
# deployment model can be retrained without re-running the search
best_params = {'feature_extraction__cells_per_block': (2, 2),
               'feature_extraction__orientations': 11,
               'feature_extraction__pixels_per_cell': (8, 8),
               'model_training__eta0': 0.1,
               'model_training__learning_rate': 'optimal',
               'model_training__loss': 'perceptron'}

# keep feature extraction, scaling and classification as separate,
# individually exportable steps for deployment
feature_extractor_pipe = make_pipeline(
    hog_transformer(
        orientations=11,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2))
)
feature_vectors = feature_extractor_pipe.fit_transform(X_train)

normalizer = StandardScaler()
feature_normed = normalizer.fit_transform(feature_vectors)

classifier = SGDClassifier(
    loss='perceptron', eta0=0.1,
    learning_rate='optimal',
    early_stopping=True
)
classifier.fit(feature_normed, y_train)

Model Evaluation

# transform the hold-out set with the already-fitted extractor and scaler,
# then score the deployment classifier
feature_vectors_test = feature_extractor_pipe.transform(X_test)
feature_normed_test = normalizer.transform(feature_vectors_test)
y_pred_deploy = classifier.predict(feature_normed_test)

eval_report_best = classification_report(
    y_test, y_pred_deploy, output_dict=True
)
eval_best_df = pd.DataFrame(eval_report_best)
eval_best_df.transpose()
              precision    recall  f1-score   support
bear0.7000000.7000000.70000020.00000
cat0.8620690.7812500.81967232.00000
chicken0.9375000.7500000.83333320.00000
cow0.6250000.7500000.68181820.00000
deer0.8095240.8500000.82926820.00000
dog0.5882350.7692310.66666726.00000
duck0.7500000.7500000.75000020.00000
eagle0.5909090.6842110.63414619.00000
elephant0.7619050.8000000.78048820.00000
human1.0000000.9500000.97435920.00000
lion0.6470590.5500000.59459520.00000
monkey0.8750000.7000000.77777820.00000
mouse0.6470590.5500000.59459520.00000
panda0.9565220.9565220.95652223.00000
pigeon0.6666670.8000000.72727320.00000
rabbit0.7692310.5000000.60606120.00000
sheep0.6190480.6500000.63414620.00000
tiger0.8000000.8695650.83333323.00000
wolf0.7619050.8000000.78048820.00000
accuracy0.7493800.7493800.7493800.74938
macro avg0.7561910.7453040.746028403.00000
weighted avg0.7590710.7493800.749534403.00000

Model Export

# save the trained classifier - context managers close each file handle
# (the original opened both files without ever closing them)
with open('model/animal_model_deployment.pkl', 'wb') as output:
    pickle.dump(classifier, output)

# save the fitted normalizer - needed to scale features identically
# at inference time
with open('model/animal_model_deployment_scaler.pkl', 'wb') as output:
    pickle.dump(normalizer, output)