Skip to main content

TST, HongKong

AutoML with AutoGluon for Tabular Data

AutoGluon automates machine learning tasks enabling you to easily achieve strong predictive performance in your applications. With just a few lines of code, you can train and deploy high-accuracy machine learning and deep learning models on image, text, time series, and tabular data.

Github Repository

Installation

Installing AutoGluon with GPU support:

pip install -U pip
pip install -U setuptools wheel
pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
pip install autogluon
# for visualizations
pip install bokeh==2.0.1

Tabular Data Classification

# get dataset
!wget https://github.com/mpolinowski/hotel-booking-dataset/raw/master/datasets/hotel_bookings.csv -P dataset
from autogluon.tabular import TabularDataset, TabularPredictor
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
SEED = 42
MODEL_PATH = 'model'

Classification Model

Data Preprocessing

data = TabularDataset('dataset/hotel_bookings.csv')
data.head(5).transpose()
01234
hotelResort HotelResort HotelResort HotelResort HotelResort Hotel
is_canceled00000
lead_time34273771314
arrival_date_year20152015201520152015
arrival_date_monthJulyJulyJulyJulyJuly
arrival_date_week_number2727272727
arrival_date_day_of_month11111
stays_in_weekend_nights00000
stays_in_week_nights00112
adults22112
children0.00.00.00.00.0
babiesModel TestingypeNo DepositNo DepositNo DepositNo Deposit
agentNaNNaNNaN304.0240.0
companyNaNNaNNaNNaNNaN
days_in_waiting_list00000
customer_typeTransientTransientTransientTransientTransient
adr0.00.075.075.098.0
required_car_parking_spaces00000
total_of_special_requests00001
reservation_statusCheck-OutCheck-OutCheck-OutCheck-OutCheck-Out
reservation_status_date01-07-1501-07-1502-07-1502-07-1503-07-15
# there are two columns that carry the label: is_canceled and reservation_status
# only keep one and make it the true label
data = data.drop(['is_canceled'], axis=1)
data = data.drop(['reservation_status_date'], axis=1)
data.info()
# <class 'autogluon.core.dataset.TabularDataset'>
# RangeIndex: 119390 entries, 0 to 119389
# Data columns (total 30 columns)
# take small random sample to get started => 10%
data_sample = data.sample(frac=0.1 , random_state=SEED)
data_sample.describe()

| | lead_time | arrival_date_year | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | is_repeated_guest | previous_cancellations | pr_Reg | min | 0.000000 | 2015.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 9.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | | 25% | 19.000000 | 2016.000000 | 16.000000 | 8.000000 | 0.000000 | 1.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 67.000000 | 0.000000 | 69.000000 | 0.000000 | 0.000000 | | 50% | 70.000000 | 2016.000000 | 27.000000 | 16.000000 | 1.000000 | 2.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 14.000000 | 178.000000 | 0.000000 | 95.000000 | 0.000000 | 0.000000 | | 75% | 162.000000 | 2017.000000 | 38.000000 | 24.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 229.000000 | 254.000000 | 0.000000 | 126.000000 | 0.000000 | 1.000000 | | max | 629.000000 | 2017.000000 | 53.000000 | 31.000000 | 12.000000 | 30.000000 | 55.000000 | 3.000000 | 9.000000 | 1.000000 | 26.000000 | 56.000000 | 14.000000 | 531.000000 | 525.000000 | 391.000000 | 451.500000 | 3.000000 | 5.000000 |

# train/test split
print(len(data_sample)*0.8)
# 9551.2
train_size = 9550
train_data = data_sample.sample(n=train_size, random_state=SEED)
test_data = data_sample.drop(train_data.index)
print(len(train_data), len(test_data))
# 9550 2389

Model Training

# train a classification model on the reservation status (AutoGluon infers a 3-class 'multiclass' problem, see log below)
predictor = TabularPredictor(label='reservation_status', path=MODEL_PATH)
predictor.fit(train_data)

# AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
# 3 unique label values:  ['Check-Out', 'Canceled', 'No-Show']
    
# AutoGluon training complete, total runtime = 56.25s ... Best model: "WeightedEnsemble_L2"
predictor.fit_summary()

Estimated performance of each model:

modelscore_valpred_time_valfit_timepred_time_val_marginalfit_time_marginalstack_levelcan_inferfit_order
0WeightedEnsemble_L20.8753930.04783717.4480670.0005080.3990062True14
1XGBoost0.8743460.0280963.3229330.0280963.3229331True11
2LightGBMLarge0.8691100.0363324.9472900.0363324.9472901True13
3LightGBM0.8659690.0372772.2221200.0372772.2221201True5
4LightGBMXT0.8638740.0550512.5916140.0550512.5916141True4
5RandomForestGini0.8575920.0619620.8404760.0619620.8404761True6
6CatBoost0.8565450.01119414.6616690.01119414.6616691True8
7RandomForestEntr0.8544500.0620010.9052560.0620010.9052561True7
8ExtraTreesEntr0.8502620.0627870.7298180.0627870.7298181True10
9ExtraTreesGini0.8492150.0634420.7990200.0634420.7990201True9
10NeuralNetTorch0.8471200.01923313.7261290.01923313.7261291True12
11NeuralNetFastAI0.8387430.0188307.6193300.0188307.6193301True3
12KNeighborsDist0.7078530.0139230.0079540.0139230.0079541True2
13KNeighborsUnif0.6890050.0345560.0096810.0345560.0096811True1

Number of models trained: 14 Types of models trained:

WeightedEnsembleModel
leaderboard=pd.DataFrame(predictor.leaderboard())

plt.figure(figsize=(8, 7))

sns.set(style='darkgrid')

sns.scatterplot(
    x='pred_time_val',
    y='score_val',
    data=leaderboard,
    s=300,
    alpha=0.5,
    hue='model',
    palette='tab20',
    style='fit_time'
)

plt.title('Prediction Time vs Accuracy Score')
plt.xlabel('Average Time for Predictions')
plt.ylabel('Accuracy Score')
plt.legend(bbox_to_anchor=(1.01,1.01))

plt.savefig('assets/AutoML_with_AutoGluon_01.webp', bbox_inches='tight')

AutoML with AutoGluon for Tabular Data

Model Loading

# load best model
predictor = TabularPredictor.load("model/")

Model Testing

X_test = test_data.drop(columns=['reservation_status'] )
y_test = test_data['reservation_status']
y_pred = predictor.predict(X_test)
eval_metrics = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
array = np.array(list(eval_metrics.items()))
df = pd.DataFrame(array, columns = ['metric','value']).sort_values(by='value')

plt.figure(figsize=(12,5))
plt.bar(df['metric'], df['value'])
plt.title('Evaluation Metrics')

plt.savefig('assets/AutoML_with_AutoGluon_02.webp', bbox_inches='tight')

AutoML with AutoGluon for Tabular Data

Feature Interpretability

Get feature importance by conducting a Permutation-Shuffling test in AutoGluon: shuffle one feature column and test how this affects prediction accuracy. The larger the effect, the higher the feature's / column's importance. Negative values indicate features that confuse the model and decrease the prediction accuracy; these should be removed.

# pass the test dataset - it must include the label column (reservation_status):
predictor.feature_importance(test_data)
importancestddevp_valuenp99_highp99_low
country0.0615320.0030767.468850e-0750.0678650.055199
deposit_type0.0581830.0088156.133847e-0550.0763340.040033
total_of_special_requests0.0456260.0065324.905663e-0550.0590750.032177
lead_time0.0386770.0070451.264772e-0450.0531840.024171
agent0.0313940.0033621.553978e-0550.0383160.024472
previous_cancellations0.0302220.0028919.926338e-0650.0361740.024269
assigned_room_type0.0292170.0030101.332357e-0550.0354140.023020
market_segment0.0282130.0051061.233146e-0450.0387260.017699
reserved_room_type0.0230220.0025291.719338e-0550.0282290.017815
required_car_parking_spaces0.0114690.0019098.880361e-0550.0154000.007538
customer_type0.0101300.0035931.619134e-0350.0175290.002731
arrival_date_year0.0070320.0027032.173517e-0350.0125980.001467
booking_changes0.0061950.0017116.319167e-0450.0097170.002673
adr0.0046040.0024767.086297e-0350.009703-0.000494
previous_bookings_not_canceled0.0037670.0017514.290459e-0350.0073730.000162
company0.0036840.0017614.733895e-0350.0073100.000058
children0.0030140.0005461.235932e-0450.0041380.001890
arrival_date_week_number0.0022600.0027066.760071e-0250.007833-0.003312
meal0.0010050.0010084.488794e-0250.003080-0.001071
stays_in_weekend_nights0.0009210.0016591.411190e-0150.004336-0.002494
arrival_date_month0.0006700.0015551.950213e-0150.003871-0.002532
is_repeated_guest0.0004190.0011842.367137e-0150.002856-0.002019
distribution_channel0.0003350.0010422.561126e-0150.002481-0.001811
days_in_waiting_list-0.0003350.0003509.503497e-0150.000386-0.001056
babies-0.0003350.0004599.110961e-0150.000609-0.001279
adults-0.0007530.0021637.602366e-0150.003700-0.005207
stays_in_week_nights-0.0010050.0029397.563611e-0150.005047-0.007056
arrival_date_day_of_month-0.0010050.0023977.991062e-0150.003931-0.005941
hotel-0.0010880.0021478.398613e-0150.003332-0.005508
importance_df = predictor.feature_importance(test_data).reset_index()
plt.figure(figsize=(8,8))
plt.title('Feature Importance')
sns.set(style='darkgrid')
sns.barplot(
    data=importance_df,
    y='index',
    x='importance',
    orient='horizontal'
).set_ylabel('Feature')

plt.savefig('assets/AutoML_with_AutoGluon_03.webp', bbox_inches='tight')

AutoML with AutoGluon for Tabular Data

Running Predictions

test_booking = {
  "hotel": "City Hotel",
  "is_canceled": 0,
  "lead_time": 214,
  "arrival_date_year": 2017,
  "arrival_date_month": "June",
  "arrival_date_week_number": 23,
  "arrival_date_day_of_month": 9,
  "stays_in_weekend_nights": 1,
  "stays_in_week_nights": 2,
  "adults": 2,
  "children": 0,
  "babies": 0,
  "meal": "BB",
  "country": "GBR",
  "market_segment": "Groups",
  "distribution_channel": "Direct",
  "is_repeated_guest": 0,
  "previous_cancellations": 0,
  "previous_bookings_not_canceled": 0,
  "reserved_room_type": "D",
  "assigned_room_type": "D",
  "booking_changes": 1,
  "deposit_type": "No Deposit",
  "agent": 28,
  "company": 153,
  "days_in_waiting_list": 0,
  "customer_type": "Transient",
  "adr": 118.13,
  "required_car_parking_spaces": 0,
  "total_of_special_requests": 0,
  "reservation_status": "Check-Out",
  "reservation_status_date": "12-06-17"
}
# load booking into dataset
test_booking_df = TabularDataset.from_dict([test_booking])
test_booking_from_csv_df = TabularDataset('dataset/test_booking.csv')
predictor.predict(test_booking_df)
# 0    Check-Out <- not cancelled
# Name: reservation_status, dtype: object
predictor.predict(test_booking_from_csv_df)
# 0    Check-Out <- not cancelled
# Name: reservation_status, dtype: object

Tabular Data Regression

# get dataset
! wget https://raw.githubusercontent.com/mgrafals/Uber-Data-Engineering-Project/main/uber_data.csv -P dataset
from autogluon.tabular import TabularDataset, TabularPredictor
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
SEED = 42
MODEL_PATH = 'model'

Data Preprocessing

data = TabularDataset('dataset/uber_data.csv')
data.head().transpose()
01234
VendorID11222
tpep_pickup_datetime2016-03-01 00:00:002016-03-01 00:00:002016-03-01 00:00:002016-03-01 00:00:002016-03-01 00:00:00
tpep_dropoff_datetime2016-03-01 00:07:552016-03-01 00:11:062016-03-01 00:31:062016-03-01 00:00:002016-03-01 00:00:00
passenger_count11235
trip_distance2.52.919.9810.7830.43
pickup_longitude-73.976746-73.983482-73.782021-73.863419-73.971741
pickup_latitude40.76515240.76792540.6448140.76981440.792183
RatecodeID11113
store_and_fwd_flagNNNNN
dropoff_longitude-74.004265-74.005943-73.974541-73.96965-74.17717
dropoff_latitude40.74612840.73316640.6757740.75776740.695053
payment_type11111
fare_amount9.011.054.531.598.0
extra0.50.50.50.00.0
mta_tax0.50.50.50.50.0
tip_amount2.053.058.03.780.0
tolls_amount0.00.00.05.5415.5
improvement_surcharge0.30.30.30.30.3
total_amount12.3515.3563.841.62113.8
# there are two columns that directly reflect the passenger fare:
# fare_amount (used as the label) and total_amount -> drop the latter
data = data.drop('total_amount', axis=1)
data.info()
# RangeIndex: 100000 entries, 0 to 99999
# Data columns (total 18 columns):
data_sample = data.sample(frac=0.5 , random_state=SEED)
data_sample.describe()
VendorIDpassenger_counttrip_distancepickup_longitudepickup_latitudeRatecodeIDdropoff_longitudedropoff_latitudepayment_typefare_amountextramta_taxtip_amounttolls_amountimprovement_surchargetotal_amount
count50000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.000000
mean1.8831601.9263603.039138-73.25277240.3554001.040780-73.27421140.3675441.33742013.2875110.1017300.4969801.8813040.3679850.29948416.434978
std0.3212331.5862123.8516447.2682264.0024370.2844627.1565343.9409440.48242312.1042650.2026740.0429062.5820701.5217070.01674914.779261
min1.0000000.0000000.000000-121.9331510.0000001.000000-121.9333270.0000001.000000-7.000000-0.500000-0.500000-2.7000000.000000-0.300000-10.140000
25%2.0000001.0000001.000000-73.99092140.7389331.000000-73.99041040.7387761.0000006.5000000.0000000.5000000.0000000.0000000.3000008.300000
50%2.0000001.0000001.670000-73.98016440.7554281.000000-73.97840940.7552491.0000009.5000000.0000000.5000001.3600000.0000000.30000011.800000
75%2.0000002.0000003.200000-73.96414240.7690901.000000-73.96209740.7680022.00000015.0000000.0000000.5000002.4600000.0000000.30000018.300000
max2.0000006.000000160.8000000.00000041.2045486.0000000.00000042.6668934.000000819.5000004.5000000.50000047.56000022.0400000.300000832.800000
# 80:20 train test split
train_data = data_sample.sample(n=40000, random_state=SEED)
test_data = data_sample.drop(train_data.index)

Model Training

predictor = TabularPredictor(label='fare_amount', path=MODEL_PATH)
predictor.fit(train_data)

# AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
# Label info (max, min, mean, stddev): (819.5, -7.0, 13.23572, 11.96267)

# AutoGluon training complete, total runtime = 89.72s ... Best model: "WeightedEnsemble_L2"
leaderboard=pd.DataFrame(predictor.leaderboard())

plt.figure(figsize=(8, 7))

sns.set(style='darkgrid')

sns.scatterplot(
    x='pred_time_val',
    y='score_val',
    data=leaderboard,
    s=300,
    alpha=0.5,
    hue='model',
    palette='tab20',
    style='fit_time'
)

plt.title('Prediction Time vs Accuracy Score')
plt.xlabel('Average Time for Predictions')
plt.ylabel('Validation Score (-RMSE)')
plt.legend(bbox_to_anchor=(1.01,1.01))

plt.savefig('assets/AutoML_with_AutoGluon_01.webp', bbox_inches='tight')

AutoML with AutoGluon for Tabular Data

Model Loading

# load best model
predictor = TabularPredictor.load("model/")

Model Testing

X_test = test_data.drop(columns=['fare_amount'] )
y_test = test_data['fare_amount']
y_pred = predictor.predict(X_test)
eval_metrics = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
array = np.array(list(eval_metrics.items()))
df = pd.DataFrame(array, columns = ['metric','value']).sort_values(by='value')

plt.figure(figsize=(15,5))
plt.bar(df['metric'], df['value'])
plt.title('Evaluation Metrics')

plt.savefig('assets/AutoML_with_AutoGluon_02.webp', bbox_inches='tight')

AutoML with AutoGluon for Tabular Data

Feature Interpretability

# pass the test dataset - it must include the label column (fare_amount):
importance_df = predictor.feature_importance(test_data).reset_index()
plt.figure(figsize=(8,8))
plt.title('Feature Importance')
sns.set(style='darkgrid')
sns.barplot(
    data=importance_df,
    y='index',
    x='importance',
    orient='horizontal'
).set_ylabel('Feature')

plt.savefig('assets/AutoML_with_AutoGluon_03.webp', bbox_inches='tight')

AutoML with AutoGluon for Tabular Data

Running Predictions

test_drive = {
  "VendorID": 2,
  "tpep_pickup_datetime": "2016-03-01 01:12:39",
  "tpep_dropoff_datetime": "2016-03-01 01:16:48",
  "passenger_count": 5,
  "trip_distance": 1.28,
  "pickup_longitude": -73.97952270507811,
  "pickup_latitude": 40.76089096069336,
  "RatecodeID": 1,
  "store_and_fwd_flag": "N",
  "dropoff_longitude": -73.99040985107422,
  "dropoff_latitude": 40.77185821533203,
  "payment_type": 1,
  "fare_amount": 5.5,
  "extra": 0.5,
  "mta_tax": 0.5,
  "tip_amount": 0.0,
  "tolls_amount": 0.0,
  "improvement_surcharge": 0.3,
  "total_amount": 6.8
}
# load trip into dataset
test_drive_df = TabularDataset.from_dict([test_drive])
test_drive_from_csv_df = TabularDataset('dataset/test_data.csv')
predictor.predict(test_drive_df)
# 0    6.403378
# Name: fare_amount, dtype: float32
predictor.predict(test_drive_from_csv_df)
# 0    20.392935
# Name: fare_amount, dtype: float32

Customizations

leaderboard_results = predictor.leaderboard(test_data)
results_df = leaderboard_results[['model', 'score_test', 'score_val', 'pred_time_test', 'fit_time']]
results_df
modelscore_testscore_valpred_time_testfit_time
0ExtraTreesMSE-3.424482-2.9672950.3361134.152885
1RandomForestMSE-3.496445-3.4212890.63579114.147954
2LightGBMLarge-3.711858-3.3386910.1149171.717049
3WeightedEnsemble_L2-3.869480-2.7664000.87055456.528128
4CatBoost-3.875734-3.2466650.01213326.307805
5XGBoost-4.003582-3.2111020.0265920.405672
6LightGBM-4.547570-3.2450280.0131850.341916
7NeuralNetFastAI-4.767402-3.3003410.16098325.395540
8LightGBMXT-5.297554-3.4303230.0165570.359552
9NeuralNetTorch-7.383115-5.6494170.05052910.426494
10KNeighborsUnif-7.793495-6.1597390.3308120.019789
11KNeighborsDist-8.238958-7.2029820.2760460.018064

Inference Constraints

The WeightedEnsemble_L2 model - which is an ensemble of different models used by AutoGluon - has the best validation score (score_val) but also takes by far the longest to return predictions. The 'non-ensembled' model ExtraTreesMSE is not that far off in validation score - in this run it even scores better on the test set (score_test) - and is more than twice as fast with predictions. We can set a constraint on how much inference time is acceptable to remove 'slow' models from the created weighted ensemble model.

fast_predictor = TabularPredictor(label='fare_amount', path='model_fast')
fast_predictor.fit(train_data, time_limit=30, infer_limit=0.000004, infer_limit_batch_size=10000)

# Removing 5/6 base models to satisfy inference constraint (constraint=1.651μs) ...
# 	0.092ms	-> 0.058ms	(KNeighborsUnif)
# 	0.058ms	-> 0.022ms	(KNeighborsDist)
# 	0.022ms	-> 3.321μs	(RandomForestMSE)
# 	3.321μs	-> 1.871μs	(LightGBMXT)
# 	1.871μs	-> 1.177μs	(CatBoost)
eval_metrics_fast = fast_predictor.fit_summary()
modelscore_valpred_time_valfit_timepred_time_val_marginalfit_time_marginalstack_levelcan_inferfit_order
0LightGBM-3.2450280.0038340.5883950.0038340.5883951True4
1WeightedEnsemble_L2-3.2450280.0042050.5948060.0003710.0064112True7
2CatBoost-3.2669190.0030058.2122420.0030058.2122421True6
3RandomForestMSE-3.4212890.07342915.4123170.07342915.4123171True5
4LightGBMXT-3.4303230.0045970.6672010.0045970.6672011True3
5KNeighborsUnif-6.1597390.1349670.3652080.1349670.3652081True1
6KNeighborsDist-7.2029820.1204730.3943910.1204730.3943911True2

Number of models trained: 7 Types of models trained:

CatBoostModel
eval_metrics = predictor.fit_summary()
modelscore_valpred_time_valfit_timepred_time_val_marginalfit_time_marginalstack_levelcan_inferfit_order
0WeightedEnsemble_L2-2.7664000.24469556.5281280.0003080.2464372True12
1ExtraTreesMSE-2.9672950.0602204.1528850.0602204.1528851True7
2XGBoost-3.2111020.0052350.4056720.0052350.4056721True9
3LightGBM-3.2450280.0032920.3419160.0032920.3419161True4
4CatBoost-3.2466650.00418526.3078050.00418526.3078051True6
5NeuralNetFastAI-3.3003410.03188725.3955400.03188725.3955401True8
6LightGBMLarge-3.3386910.0247501.7170490.0247501.7170491True11
7RandomForestMSE-3.4212890.05990914.1479540.05990914.1479541True5
8LightGBMXT-3.4303230.0038570.3595520.0038570.3595521True3
9NeuralNetTorch-5.6494170.02028310.4264940.02028310.4264941True10
10KNeighborsUnif-6.1597390.1428600.0197890.1428600.0197891True1
11KNeighborsDist-7.2029820.1131720.0180640.1131720.0180641True2

Number of models trained: 12 Types of models trained:

TabularNeuralNetTorchModel

Result: Much faster but less accurate

  • WeightedEnsemble_L2:
    • score_val: -2.766400 -> -3.245028
    • pred_time_val: 0.244695s -> 0.004205s
    • fit_time: 56.528128s -> 0.594806s

Hyperparameter Tuning

hyperparameters_NN_Torch = {"num_epochs": 1, "learning_rate": 0.5}
hyperparameters_XGB = {} # use XGBoost with default values
hyperparameters = {"NN_TORCH": hyperparameters_NN_Torch, "XGB": hyperparameters_XGB}
tuned_predictor = TabularPredictor(label='fare_amount', path='model_tuned')
tuned_predictor.fit(
        train_data=train_data,
        hyperparameters=hyperparameters
    )
tuned_predictor.fit_summary()
# AutoGluon only trained the two configured models (NeuralNetTorch for a single epoch, XGBoost with defaults)
modelscore_valpred_time_valfit_timepred_time_val_marginalfit_time_marginalstack_levelcan_inferfit_order
0XGBoost-3.2111020.0039930.3639990.0039930.3639991True1
1WeightedEnsemble_L2-3.2111020.0043040.4344710.0003110.0704722True3
2NeuralNetTorch-23.7699870.0198700.9353910.0198700.9353911True2

AutoGluon Presets

Available Presets: [best_quality, high_quality, good_quality, medium_quality, optimize_for_deployment, interpretable, ignore_text]

presets = ['medium_quality'] # accept lower accuracy for speed
preset_predictor = TabularPredictor(label='fare_amount', path='preset_model')
preset_predictor.fit(
        train_data=train_data,
        presets=presets,
        included_model_types=['GBM', 'FASTAI', 'NN_TORCH'], # only use those models
        # excluded_model_types=['KNN', 'NN', 'XT', 'RF', 'FASTAI'] # use all but those models
    )
preset_predictor.fit_summary()
modelscore_valpred_time_valfit_timepred_time_val_marginalfit_time_marginalstack_levelcan_inferfit_order
0WeightedEnsemble_L2-3.0542430.08391542.9061330.0003010.1335712True6
1LightGBM-3.2450280.0032620.5071830.0032620.5071831True2
2NeuralNetFastAI-3.3003410.02794626.1121540.02794626.1121541True3
3LightGBMLarge-3.3386910.0260101.9536290.0260101.9536291True5
4LightGBMXT-3.4303230.0042760.4589920.0042760.4589921True1
5NeuralNetTorch-5.6494170.02212013.7406050.02212013.7406051True4

Model Deployment

predictor = TabularPredictor.load("model/")
# retrain model on train + validation dataset
predictor.refit_full()
# strip all models not part of the `WeightedEnsemble_L2_FULL`
predictor.clone_for_deployment('production_model')
# Clone: Keeping minimum set of models required to predict with best model 'WeightedEnsemble_L2_FULL'...
# To load the cloned predictor: predictor_clone = TabularPredictor.load(path="production_model")
predictor_clone = TabularPredictor.load(path="production_model")
test_data = TabularDataset('dataset/test_data.csv')
predictor_clone.predict(test_data)

# 0    20.425285
# Name: fare_amount, dtype: float32