Skip to main content

Guangzhou, China

MLFlow Docker

This is just an experiment to see if I can use MLFlow inside my pytorch-jupyter Docker container. To do this I added MLFlow to the Dockerfile:

FROM pytorch/pytorch:latest

# Set environment variables
# Keep apt from prompting for input during the image build
ENV DEBIAN_FRONTEND=noninteractive

# Install system dependencies
# (the apt package lists are removed afterwards to keep the layer small)
RUN apt-get update && \
    apt-get install -y \
        git \
        tini \
        python3-pip \
        python3-dev \
        python3-opencv \
        libglib2.0-0 && \
    rm -rf /var/lib/apt/lists/*

# install optional python deps
RUN python -m pip install --upgrade pip
# jupyter notebooks
RUN pip install jupyter
# fastdup https://github.com/visual-layer/fastdup
RUN pip install fastdup
RUN pip install opencv-python
RUN pip install matplotlib matplotlib-inline pandas
RUN pip install pillow
RUN pip install pyyaml
# YOLO 8.1
# The version specifiers must be quoted: unquoted, the shell parses ">" as an
# output redirection, so pip never sees the constraint and a stray file named
# "=0.29.32" is written instead.
RUN pip install ultralytics "Cython>=0.29.32" "lapx>=0.5.5"
# MLFlow 2.10
RUN pip install mlflow pytorch_lightning



# Set the working directory
WORKDIR /opt/app

# Start the notebook
# tini runs as PID 1 and launches the command given in CMD
RUN chmod +x /usr/bin/tini
ENTRYPOINT ["/usr/bin/tini", "--"]
CMD ["jupyter", "notebook", "--port=8888", "--no-browser", "--ip=0.0.0.0", "--allow-root"]

Let's build this custom image with:

docker build -t pytorch-jupyter . -f Dockerfile

I can now create the container and mount my working directory into the container WORKDIR to get started:

# Mount the current directory into the container WORKDIR and publish the
# Jupyter (8888) and MLFlow (5000) ports. "$(pwd)" is quoted so that a working
# directory containing spaces is still passed as a single bind-mount argument.
docker run --ipc=host --gpus all -ti --rm \
    -v "$(pwd)":/opt/app -p 8888:8888 -p 5000:5000 \
    --name pytorch-jupyter \
    pytorch-jupyter:latest

This will start Jupyter on Port 8888 and I can start MLFlow manually:

docker exec -ti pytorch-jupyter mlflow ui --host 0.0.0.0

The MLFlow UI is now available on localhost:5000 on my host system:

MLFlow in Docker

Just to be sure I stop MLFlow and try to run it directly from a Jupyter Notebook:

get_ipython().system_raw("mlflow ui --port 5000 --host 0.0.0.0 &")

And the UI is still available on localhost:5000 - nice:

MLFlow in Docker

Create an MLFlow Experiment

experiment_name = "emnist_classifier_dnn"
# set_experiment() activates the experiment and creates it first when it does
# not exist yet. A separate create_experiment() call raises as soon as the
# experiment already exists, which made this cell fail on every re-run.
mlflow.set_experiment(experiment_name)

MLFlow in Docker

Dataset

Starting a classification run on the EMNIST Handwritten Character Set:

# The CSV files carry no header row: column 0 is the label, the remaining
# columns are the flattened pixel values.
emnist_train_data = pd.read_csv("./datasets/emnist-letters-train.csv", header=None)

# Load the test split and shuffle its rows (frac=1 keeps every row).
emnist_test_data = pd.read_csv(
    "./datasets/emnist-letters-test.csv", header=None
).sample(frac=1)

# Split the training table into labels (first column) and pixels (the rest).
train_values = emnist_train_data.values
train_labels = train_values[:, 0]

# Bring the flat pixel rows back to the original 28x28 image shape.
train_images = train_values[:, 1:].reshape(-1, 28, 28)

MLFlow in Docker

dataset_info = """
The EMNIST dataset is a set of handwritten character digits derived from the NIST Special Database 19 and converted to a 28x28 pixel image format and dataset structure that directly matches the MNIST dataset . Further information on the dataset contents and conversion process can be found in the paper available at https://arxiv.org/abs/1702.05373v1
"""

# Write the description to disk so it can be attached to the run as an
# artifact below. The encoding is explicit so the file does not depend on the
# platform's locale default.
with open("dataset_info.txt", "w", encoding="utf-8") as f:
    f.write(dataset_info)

# Demo run that exercises each logging call once; the metric and parameter
# values are hard-coded placeholders, not results of a real training.
with mlflow.start_run(run_name='emnist_classifier_dnn_test_run') as current_run:
    mlflow.log_metric('Accuracy', 0.67)

    params = {
        'num_nn_layers': 4
    }

    mlflow.log_params(params)
    # `fig` is the sample-image figure created in an earlier notebook cell
    mlflow.log_figure(fig, 'sample_images.jpg')
    mlflow.log_artifact('dataset_info.txt')
    mlflow.set_tag('EMNIST', 'Character Classification')

MLFlow in Docker

Perform a train/test/val split and prepare the dataloaders needed for the model training:

MLFlow in Docker

# Training batches are reshuffled every epoch and the last incomplete batch is
# dropped; validation and test batches keep the dataset order.
train_dl = DataLoader(
    train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=1,
)
val_dl = DataLoader(val, batch_size=BATCH_SIZE, num_workers=1)
test_dl = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=1)

# Pull one training batch for inspection.
dataiter = iter(train_dl)
batch_images, batch_labels = next(dataiter)

Model Training

class EmnistModel(pl.LightningModule):
    """Fully connected classifier for the EMNIST letters dataset.

    Four linear layers (INPUT_SIZE -> 512 -> 128 -> 32 -> OUTPUT_SIZE) with
    ReLU activations in between, trained with cross-entropy loss and Adam.
    The dataset labels start at 1 while the class indices start at 0, so every
    train/val/test step shifts the labels down by one before computing loss
    and accuracy.
    """

    def __init__(self):
        super().__init__()
        self.criterion = nn.CrossEntropyLoss()

        self.linear1 = nn.Linear(INPUT_SIZE, 512)
        self.linear2 = nn.Linear(512, 128)
        self.linear3 = nn.Linear(128, 32)
        self.linear4 = nn.Linear(32, OUTPUT_SIZE)

    def forward(self, xb):
        """Return the raw class scores (logits) for a batch of inputs."""
        out = F.relu(self.linear1(xb))
        out = F.relu(self.linear2(out))
        out = F.relu(self.linear3(out))
        # no activation on the last layer: CrossEntropyLoss expects logits
        return self.linear4(out)

    def configure_optimizers(self):
        """Return the Adam optimizer over all model parameters."""
        return optim.Adam(self.parameters(), lr=0.0001)

    def _shared_step(self, batch, stage):
        """Compute and log loss and accuracy for one batch.

        Args:
            batch: An (images, labels) pair; labels are 1-based.
            stage: Metric name prefix ("train", "val" or "test").

        Returns:
            A (loss, accuracy) tuple.
        """
        x, y = batch
        # labels start at 1 but the classes at 0; shift out-of-place so the
        # label tensor handed in by the caller is not modified
        y = y - 1

        y_hat = self(x)
        loss = self.criterion(y_hat, y.long())
        pred = y_hat.argmax(dim=1)

        acc = accuracy(pred, y, task="multiclass", num_classes=26)

        self.log(f"{stage}_loss", loss, on_epoch=True, prog_bar=True)
        self.log(f"{stage}_acc", acc, on_epoch=True, prog_bar=True)

        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log train_loss/train_acc and return the loss to optimise."""
        loss, _ = self._shared_step(batch, "train")
        return loss

    def validation_step(self, batch, batch_idx):
        """Log val_loss/val_acc and return the batch accuracy."""
        _, acc = self._shared_step(batch, "val")
        return acc

    def test_step(self, batch, batch_idx):
        """Log test_loss/test_acc and return the batch accuracy."""
        _, acc = self._shared_step(batch, "test")
        return acc

    # NOTE(review): Lightning's hook names this argument `dataloader_idx`;
    # `dataloaders_idx` looks like a typo and would presumably break with
    # several predict dataloaders. Kept as-is to leave the signature unchanged.
    def predict_step(self, batch, batch_idx, dataloaders_idx=0):
        """Return the logits for the images of a batch; labels are ignored."""
        x, _ = batch
        return self(x)
emnist_model = EmnistModel()

# Metrics are additionally written to CSV so they can be read back and
# plotted after the training.
logger = CSVLogger("logs", name="emnist_classifier_dnn")
trainer = pl.Trainer(max_epochs=10, logger=logger)

# Autologging is enabled without model logging: the model is logged
# explicitly at the end of the run, together with its signature.
mlflow.pytorch.autolog(log_models=False)

with mlflow.start_run() as run:
    trainer.fit(emnist_model, train_dl, val_dl)
    trainer.test(dataloaders=test_dl)

    # Signature: float32 batches of 784 values in, float32 batches of
    # 26 class scores out (-1 marks the variable batch dimension).
    input_schema = Schema([TensorSpec(np.dtype(np.float32), (-1, 784))])
    output_schema = Schema([TensorSpec(np.dtype(np.float32), (-1, 26))])
    signature = ModelSignature(inputs=input_schema, outputs=output_schema)

    mlflow.pytorch.log_model(
        emnist_model,
        "emnist_classifier_dnn",
        signature=signature,
    )

# ───────────────────────────────────────────────
# Test metric DataLoader 0
# ───────────────────────────────────────────────
# test_acc 0.8519594669342041
# test_loss 0.4712047874927521
# ───────────────────────────────────────────────

MLFlow in Docker

# Read the metrics file written by the trainer's logger, index it by epoch
# and drop the per-step and test columns so only per-epoch curves are plotted.
metrics_path = f"{trainer.logger.log_dir}/metrics.csv"
dropped_columns = ['step', 'train_loss_step',
                   'train_acc_step', 'test_acc', 'test_loss']
metrics = (
    pd.read_csv(metrics_path)
    .set_index("epoch")
    .drop(columns=dropped_columns)
)
sns.lineplot(data=metrics)

MLFlow in Docker

Model Prediction

Get a sample set of test images:

# Pull the first batch of images and labels from the test dataloader
test_dataiter = iter(test_dl)
test_images, test_labels = next(test_dataiter)
# Last expression of the notebook cell: displays the shapes of both tensors
test_images.shape, test_labels.shape

Run them through the trained model to return predictions:

# Address the model logged under "emnist_classifier_dnn" by the training run.
# The run id is taken from the `run` object of the training `with` block, so
# this cell does not rely on a separately assigned `run_id` variable.
model_path = f"runs:/{run.info.run_id}/emnist_classifier_dnn"
loaded_model = mlflow.pyfunc.load_model(model_path)

# The pyfunc model is fed a NumPy array, so convert the tensor batch first
predictions = loaded_model.predict(test_images.numpy())

To display the prediction we first need to get the images back into shape:

# reshape each image of the batch back to the original 28x28 shape
test_images_reshaped = test_images.reshape(-1, 28, 28)
# draw 16 random indices into the batch (with replacement, so an image may
# be picked more than once)
test_samples = np.random.randint(0, len(test_images_reshaped), 16)

Now we can plot them using Matplotlib:

fig = plt.figure(figsize=(8, 8))

# Draw the 16 sampled images in a 4x4 grid, titled with true and predicted class
for i, idx in enumerate(test_samples):
    # labels start at 1, the `classes` list at index 0
    true_label = classes[int(test_labels[idx].item()) - 1]
    pred_label = classes[np.argmax(predictions[idx])]

    plt.subplot(4, 4, i + 1)
    # imshow needs a 2-D image: plot the 28x28 view instead of the flat
    # row of `test_images` that was fed to the model
    plt.imshow(test_images_reshaped[idx] / 255.0, cmap="gray")
    plt.title(f"True: {true_label} || Pred: {pred_label}")
    plt.axis('off')

plt.tight_layout()
plt.show()

MLFlow in Docker

Verify the overall performance by creating a confusion matrix over all test predictions:

# Collect predicted and true class indices over the whole test set.
# (The prediction list was initialised as `x_pred` but filled as `y_pred`,
# which raised a NameError on the first batch.)
y_pred = []
y_true = []

for inputs, labels in test_dl:
    output = loaded_model.predict(inputs.numpy())

    # class index with the highest score for every sample of the batch
    output = np.argmax(output, axis=1).astype('float64').tolist()
    y_pred.extend(output)

    # labels start at 1 but the classes at 0
    labels = [x - 1 for x in labels.tolist()]
    y_true.extend(labels)

cm = confusion_matrix(y_true, y_pred)
confusion = ConfusionMatrixDisplay(cm, display_labels=classes)

fig, ax = plt.subplots(figsize=(12, 8))
confusion.plot(ax=ax)

MLFlow in Docker