
AutoML with AutoGluon for Timeseries Forecasts

AutoGluon automates machine learning tasks, enabling you to easily achieve strong predictive performance in your applications. With just a few lines of code, you can train and deploy high-accuracy machine learning and deep learning models on image, text, time series, and tabular data.

GitHub Repository
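
Before the full walkthroughs below, here is the basic pattern as a minimal, hypothetical sketch (file and column names are placeholders, not from this article's datasets):

from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

# long-format data with item_id, timestamp and target columns (hypothetical file)
data = TimeSeriesDataFrame('dataset/example_series.csv')
# fit an ensemble of statistical, tree-based and deep learning forecasters
predictor = TimeSeriesPredictor(prediction_length=7, target='target').fit(data)
# probabilistic forecast: mean plus quantiles for each future step
forecast = predictor.predict(data)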

Installation

Installing AutoGluon with GPU support:

pip install -U pip
pip install -U setuptools wheel
pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
pip install autogluon
# for visualizations
pip install bokeh==2.0.1
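
A quick sanity check (not part of the original instructions) that the CUDA build of PyTorch is actually active before training on GPU:

import torch

print(torch.__version__)          # should report a +cu117 build
print(torch.cuda.is_available())  # True if AutoGluon can use the GPU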

Univariate Forecasting

# get dataset
!wget https://raw.githubusercontent.com/databricks/Spark-The-Definitive-Guide/master/data/retail-data/all/online-retail-dataset.csv -P dataset
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
import matplotlib.pyplot as plt
from datetime import datetime as dt
import pandas as pd
import seaborn as sns
SEED = 42
MODEL_PATH = 'model'

Data Preprocessing

df = pd.read_csv('dataset/online-retail-dataset.csv')
df.head(5)
  InvoiceNo StockCode                          Description  Quantity     InvoiceDate  UnitPrice  CustomerID         Country
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6  12/1/2010 8:26       2.55     17850.0  United Kingdom
1    536365     71053                  WHITE METAL LANTERN         6  12/1/2010 8:26       3.39     17850.0  United Kingdom
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8  12/1/2010 8:26       2.75     17850.0  United Kingdom
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6  12/1/2010 8:26       3.39     17850.0  United Kingdom
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6  12/1/2010 8:26       3.39     17850.0  United Kingdom
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 541909 entries, 0 to 541908
# Data columns (total 8 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 InvoiceNo 541909 non-null object
# 1 StockCode 541909 non-null object
# 2 Description 540455 non-null object
# 3 Quantity 541909 non-null int64
# 4 InvoiceDate 541909 non-null object
# 5 UnitPrice 541909 non-null float64
# 6 CustomerID 406829 non-null float64
# 7 Country 541909 non-null object
# dtypes: float64(2), int64(1), object(5)
# memory usage: 33.1+ MB
# only sample the last 10,000 items
# df_sample = df.iloc[-10000:]
# take all items
df_sample = df.copy()

# renaming columns
df_sample.rename(columns={'InvoiceNo': 'item_id', 'InvoiceDate': 'timestamp'}, inplace=True)
# create sale total price
df_sample['target'] = df_sample['Quantity'] * df_sample['UnitPrice']
df_sample['item_id'] = 'online_sales'
# create a univariate timeseries
df_sample.drop(
    ['StockCode', 'Description', 'CustomerID', 'Country', 'Quantity', 'UnitPrice'],
    axis=1, inplace=True)
df_sample.head(5)
        item_id       timestamp  target
0  online_sales  12/1/2010 8:26   20.40
1  online_sales  12/1/2010 8:26   27.80
2  online_sales  12/1/2010 8:26    2.60
3  online_sales  12/1/2010 8:26    5.85
4  online_sales  12/1/2010 8:26   19.90
# reformat timestamp to remove time from date
df_sample['timestamp'] = pd.to_datetime(df_sample['timestamp']).dt.strftime('%m/%d/%Y')
df_sample.head(5)
        item_id  target   timestamp
0  online_sales    16.6  12/23/2010
1  online_sales     8.5  12/23/2010
2  online_sales    20.8  12/23/2010
3  online_sales    20.8  12/23/2010
4  online_sales    20.8  12/23/2010
# group by date and sum() up the sales
df_sample = df_sample.groupby(['item_id', 'timestamp']).sum()

print(df_sample.info())
# MultiIndex: 305 entries, ('online_sales', '01/04/2011') to ('online_sales', '12/23/2010')
df_sample.head(5)
                           target
item_id      timestamp
online_sales 01/04/2011  15584.29
             01/05/2011  75076.22
             01/07/2011  81417.78
             01/09/2011  32131.53
df_sample.loc['online_sales']['target'].plot(
    title='Sales Volume by Date',
    figsize=(10, 5),
    rot=45,
    legend=True
)
plt.savefig('assets/AutoGluon_AutoML_TimeSeries_01.webp', bbox_inches='tight')

(Figure: Sales Volume by Date)

df_sample.to_csv('dataset/single_variant_ts.csv', index=True)

Model Training

ValueError: Frequency not provided and cannot be inferred. This is often due to the time index of the data being irregularly sampled. Please ensure that the data set used has a uniform time index, or create the TimeSeriesPredictor setting ignore_time_index=True.

AutoGluon does not like irregular timeseries AT ALL... I manually fixed the timestamp column to a regular, daily interval. The docs also recommend filling in missing data before model training.
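
What I did by hand conceptually looks like the following sketch: reindex to an unbroken daily range and fill the gaps. The fill strategy (zero sales on days without invoices) is my choice here, not prescribed by AutoGluon:

# build an unbroken daily index and fill the days with no sales
ts = pd.read_csv('dataset/single_variant_ts.csv', parse_dates=['timestamp'])
full_range = pd.date_range(ts['timestamp'].min(), ts['timestamp'].max(), freq='D')
ts = ts.set_index('timestamp').reindex(full_range).rename_axis('timestamp').reset_index()
ts['item_id'] = 'online_sales'         # restore the constant item_id
ts['target'] = ts['target'].fillna(0)  # no invoices on that day => zero sales
ts.to_csv('dataset/single_variant_ts.csv', index=False)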

train_data = TimeSeriesDataFrame('dataset/single_variant_ts.csv')
train_data.describe()
              target
count     305.000000
mean    31959.829292
std     17414.261664
min     -1566.230000
25%     20728.140000
50%     27978.410000
75%     42912.400000
max    112141.110000
# create a predictor for a 30-day forecast (30 rows in the dataset)
sv_predictor = TimeSeriesPredictor(
    prediction_length=30,
    path=MODEL_PATH,
    target='target',
    eval_metric='sMAPE'
)
sv_predictor.fit(
    train_data,
    time_limit=800,
    presets="medium_quality"
)

# Training complete. Models trained: ['Naive', 'SeasonalNaive', 'Theta', 'AutoETS', 'RecursiveTabular', 'DeepAR', 'WeightedEnsemble']
# Total runtime: 146.36 s
# Best model: WeightedEnsemble
# Best model score: -0.2301
sv_predictor.fit_summary()

Estimated performance of each model:

              model  score_val  pred_time_val  fit_time_marginal  fit_order
0  WeightedEnsemble  -0.321595       1.042651           1.881647          7
1  RecursiveTabular  -0.321595       1.042651           0.757291          5
2            DeepAR  -0.384756       0.095033          69.751811          6
3           AutoETS  -0.385364      22.865800           0.012004          4
4             Theta  -0.397785      24.269135           0.009619          3
5     SeasonalNaive  -0.403544       5.162711           0.010179          2
6             Naive  -0.403544       5.572433           0.009085          1
Number of models trained: 7
Types of models trained:
{'MultiWindowBacktestingModel', 'TimeSeriesGreedyEnsemble'}
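
fit_summary() prints the backtest results; the same numbers can also be pulled as a plain DataFrame for further inspection. A short sketch, assuming the leaderboard() method of recent AutoGluon releases:

# leaderboard of all trained models as a DataFrame
lb = sv_predictor.leaderboard(train_data)
print(lb.sort_values('score_val', ascending=False).head())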

Model Evaluation

# return a 1-month forecast on the training data
sv_predictions = sv_predictor.predict(train_data, random_seed=SEED)
sv_predictions
                                 mean           0.1           0.2           0.3           0.4           0.5           0.6           0.7            0.8            0.9
item_id      timestamp
online_sales 2011-10-16  35231.549892  14821.287291  21889.080854  26885.810628  31139.108555  35389.176845  39525.311786  43747.076754   48715.574839   55619.827997
             2011-10-17  37319.098400   9315.489680  18992.518673  25927.428089  31800.990269  37256.964611  42685.962390  48685.278476   55741.918605   65278.986905
             2011-10-18  38623.633612   5142.371285  16610.764052  24909.974641  32018.253338  38569.692694  45201.922577  52390.855571   60646.930906   72387.928409
             2011-10-19  40741.301758   1946.154973  15539.068760  25137.953113  33223.044572  40765.606934  48463.165389  56628.990736   66077.743741   79722.836483
             2011-10-20  49296.101707   6303.232915  21458.815514  31910.941266  40964.792632  49394.712059  57908.461563  66841.601504   77474.686240   92186.915812
             2011-10-21  42399.179004  -4222.418692  11966.114749  23324.147218  33287.622759  42457.754199  51587.049946  61400.403661   72842.296970   88931.093472
             2011-10-22  33619.926637 -17087.154419    364.144617  12901.404480  23491.862364  33662.238893  43520.884464  54164.964907   66630.573647   84037.471194
             2011-10-23  39042.384772 -14703.540432   3853.552955  17218.430626  28519.710014  39090.598639  49763.392538  60939.668264   74324.111121   92676.374673
             2011-10-24  37314.733017 -19270.824930    233.744114  14092.680263  26011.109933  37258.663681  48286.522116  60254.494754   74152.628169   93725.394967
             2011-10-25  40035.277581 -19730.031575    823.364529  15754.378083  28379.185360  40095.125237  51768.504080  64369.746415   78969.545369   99392.125068
             2011-10-26  43809.551647 -18581.300915   2831.155428  18233.929143  31493.221895  43799.325572  56059.960500  69262.074053   84846.687611  106247.592685
             2011-10-27  40978.233969 -24604.018712  -2246.682396  14204.335705  28124.915072  41120.107016  53865.685435  67632.214997   83850.112737  106498.851884
             2011-10-28  41743.192227 -26024.978536  -2645.385307  14166.811130  28273.284258  41561.178311  54943.112404  69180.700278   86004.451402  109515.329421
             2011-10-29  38315.939169 -32037.733749  -7961.047361   9530.292590  24433.420394  38430.213583  52199.831968  67014.321833   84580.597330  108781.710477
             2011-10-30  40790.730787 -31714.294692  -6632.032918  11250.135692  26624.395493  40830.333814  55083.397136  70254.854690   88116.212266  113018.777994
             2011-10-31  39601.428364 -35269.325656  -9299.692907   9073.426874  24982.993094  39702.658833  54402.247423  70171.210127   88614.257154  114229.722423
             2011-11-01  43321.091336 -33495.290752  -7238.416761  11718.451027  27982.345746  43267.875515  58529.011730  74809.030961   93805.173651  120492.849400
             2011-11-02  39873.310897 -39638.237188 -12259.488831   7259.635658  24107.774345  39944.400252  55573.550817  72172.974172   91871.915068  119270.200045
             2011-11-03  38897.212691 -42509.220460 -14725.370733   5465.492686  22784.085743  38756.254195  54814.426553  72198.679766   92621.915745  120534.229094
             2011-11-04  45310.748490 -37919.694783  -9357.164960  11175.946162  28815.300995  45152.634721  61626.068300  79239.016639   99980.063552  128811.104239
             2011-11-05  40524.113111 -45095.463685 -15760.925828   5668.926472  23679.488279  40503.121411  57398.500638  75497.467474   96633.997752  126117.225307
             2011-11-06  40806.692620 -46676.736613 -16544.120931   5068.030397  23563.217104  40845.487542  58063.833664  76565.798203   98222.894111  128465.220975
             2011-11-07  43503.676450 -46255.703438 -15315.077993   6965.648074  25902.252457  43336.311215  61053.512552  80111.716545  102295.131332  133194.560977
             2011-11-08  39830.233893 -51545.615185 -20027.924158   2662.824619  21906.346012  39980.482428  58006.295602  77399.156839   99888.163672  131151.589197
             2011-11-09  36990.513044 -56523.526649 -24495.447993  -1066.945972  18732.001915  37151.279157  55520.419680  75331.859267   98382.970785  130468.262942
             2011-11-10  42656.625332 -52222.084277 -19714.514788   3802.223665  23887.046793  42525.441250  61225.057142  81332.468161  104820.044505  137808.305522
             2011-11-11  44756.329828 -52548.031615 -18656.197064   5288.754038  25734.360637  44828.240961  63986.351020  84378.816506  108611.618683  142140.108263
             2011-11-12  37905.655743 -60991.102805 -27099.347751  -2388.499412  18511.776221  38050.389629  57546.974229  78416.261580  102719.966853  136415.925032
             2011-11-13  44715.800505 -55633.323514 -20782.285358   3913.667987  24980.451031  44809.798552  64605.937576  85704.112222  110433.920792  144866.205041
             2011-11-14  38863.458282 -62877.590928 -28234.112477  -2844.261356  18802.531926  38951.660430  59170.908563  80678.533877  105966.185779  141098.465744

Visualization

def plot_predictions(train_data, predictions, item_id, target_column, title, ylabel):
    plt.figure(figsize=(12, 5))
    plt.title(title)
    plt.xlabel('Date')
    plt.ylabel(ylabel)
    # timeseries data
    y_train = train_data.loc[item_id][target_column]
    plt.plot(y_train, label="Timeseries Data")
    # forecast data
    y_pred = predictions.loc[item_id]
    plt.plot(y_pred['mean'], label="Mean Forecast")
    # confidence intervals
    plt.fill_between(
        y_pred.index, y_pred['0.1'], y_pred['0.9'],
        color='red', alpha=0.1, label='10%-90% Confidence Range'
    )
    # render the legend for the labels defined above
    plt.legend()

plot_predictions(
    train_data, sv_predictions,
    item_id='online_sales', target_column='target',
    title='30 Days Sales Prediction with Confidence Interval',
    ylabel='Sales Volume'
)
plt.savefig('assets/AutoGluon_AutoML_TimeSeries_02.webp', bbox_inches='tight')

(Figure: 30 Days Sales Prediction with Confidence Interval)

Multivariate Forecasting - Future Covariates

Add known factors that affected your time series in the past to the future prediction - e.g. the effect of holidays on restaurant revenue.
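
The dataset below already ships with a holiday indicator. For your own data you would usually build such a 0/1 flag yourself; a minimal hypothetical sketch (the holiday dates are made up for illustration):

import pandas as pd

# flag known future events as a 0/1 covariate column
holidays = pd.to_datetime(['2017-05-29', '2017-07-04'])
df['holiday'] = pd.to_datetime(df['date']).isin(holidays).astype(int)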

# get dataset
!wget https://github.com/DaviRolim/datasets/raw/master/RestaurantVisitors.csv -P dataset

Data Preprocessing

df = pd.read_csv('dataset/RestaurantVisitors.csv')
df.tail(5)
# dataset contains unknowns -> will be used for prediction
          date    weekday  holiday  holiday_name  rest1  rest2  rest3  rest4  total
512  5/27/2017   Saturday        0            na    NaN    NaN    NaN    NaN    NaN
513  5/28/2017     Sunday        0            na    NaN    NaN    NaN    NaN    NaN
514  5/29/2017     Monday        1  Memorial Day    NaN    NaN    NaN    NaN    NaN
515  5/30/2017    Tuesday        0            na    NaN    NaN    NaN    NaN    NaN
516  5/31/2017  Wednesday        0            na    NaN    NaN    NaN    NaN    NaN
df.info()
# there are `517` entries but only `478` have a total
# #  Column        Non-Null Count  Dtype
# 0  date          517 non-null    object
# 1  weekday       517 non-null    object
# 2  holiday       517 non-null    int64
# 3  holiday_name  517 non-null    object
# 4  rest1         478 non-null    float64
# 5  rest2         478 non-null    float64
# 6  rest3         478 non-null    float64
# 7  rest4         478 non-null    float64
# 8  total         478 non-null    float64
df_sample = df.copy()

# renaming columns
df_sample.rename(columns={'total': 'target', 'date': 'timestamp'}, inplace=True)
df_sample['item_id'] = 'restaurant_visitors'

# get numeric representation of weekday from timestamp
datetimes = pd.to_datetime(df_sample['timestamp'])
df_sample['timestamp'] = datetimes
df_sample['weekday'] = datetimes.dt.day_of_week

# drop columns that are not needed
df_sample.drop(
    ['rest1', 'rest2', 'rest3', 'rest4', 'holiday_name'],
    axis=1, inplace=True)

df_sample.tail(5)
     timestamp  weekday  holiday  target              item_id
512 2017-05-27        5        0     NaN  restaurant_visitors
513 2017-05-28        6        0     NaN  restaurant_visitors
514 2017-05-29        0        1     NaN  restaurant_visitors
515 2017-05-30        1        0     NaN  restaurant_visitors
516 2017-05-31        2        0     NaN  restaurant_visitors
# split off the rows with missing targets for prediction
df_sample.iloc[:478].to_csv('dataset/mv_known_series.csv', index=False)
df_sample.iloc[478:].drop('target', axis=1).to_csv('dataset/mv_unknown_series.csv', index=False)

Model Training

train_data = TimeSeriesDataFrame('dataset/mv_known_series.csv')
train_data.head(5)
                                weekday  holiday  target
item_id             timestamp
restaurant_visitors 2016-01-01        4        1   296.0
                    2016-01-02        5        0   191.0
                    2016-01-03        6        0   202.0
                    2016-01-04        0        0   105.0
                    2016-01-05        1        0    98.0
# create a predictor for the length of the unknown series
mv_predictor = TimeSeriesPredictor(
    prediction_length=len(df_sample.iloc[478:]),
    path=MODEL_PATH,
    target='target',
    known_covariates_names=['weekday', 'holiday'],
    eval_metric='sMAPE'
)
mv_predictor.fit(
    train_data,
    time_limit=800,
    presets="high_quality"
)

# Training complete. Models trained: ['Naive', 'SeasonalNaive', 'Theta', 'AutoETS', 'RecursiveTabular', 'DeepAR', 'TemporalFusionTransformer', 'PatchTST', 'AutoARIMA', 'WeightedEnsemble']
# Total runtime: 470.02 s
# Best model: WeightedEnsemble
# Best model score: -0.1501

Model Predictions

future_series = TimeSeriesDataFrame('dataset/mv_unknown_series.csv')
future_series.head(5)
                                weekday  holiday
item_id             timestamp
restaurant_visitors 2017-04-23        6        0
                    2017-04-24        0        0
                    2017-04-25        1        0
                    2017-04-26        2        0
                    2017-04-27        3        0
mv_predictions = mv_predictor.predict(train_data, known_covariates=future_series, random_seed=SEED)

Visualization

plot_predictions(
    train_data, mv_predictions,
    item_id='restaurant_visitors', target_column='target',
    title='Restaurant Visitors 39 Days Predictions with Confidence Interval',
    ylabel='Restaurant Visitors'
)
plt.savefig('assets/AutoGluon_AutoML_TimeSeries_03.webp', bbox_inches='tight')

(Figure: Restaurant Visitors 39 Days Predictions with Confidence Interval)

Multivariate Forecasting - Past Covariates

The Air Quality dataset reports on the weather and the level of pollution each hour for five years at the US embassy in Beijing, China. The data includes the date-time, the PM2.5 concentration, and the weather information including dew point, temperature, pressure, wind direction, wind speed and the cumulative number of hours of snow and rain.
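
Unlike the previous section, the predictor below declares no known_covariates_names. As I read the AutoGluon docs, all extra columns in the TimeSeriesDataFrame are then treated as past covariates: usable as model inputs up to the forecast start, with no future values required at prediction time. The difference in configuration, as a side-by-side sketch:

# past covariates: keep the extra columns in the data, declare nothing
predictor_past = TimeSeriesPredictor(prediction_length=24*30, target='target')

# known/future covariates: declare the columns and pass their future
# values to predict() via the known_covariates argument
predictor_known = TimeSeriesPredictor(
    prediction_length=24*30,
    target='target',
    known_covariates_names=['weekday', 'holiday']
)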

# get dataset
!wget https://raw.githubusercontent.com/jyoti0225/Air-Pollution-Forecasting/master/AirPollution.csv -P dataset

Data Preprocessing

# the datetime is split across 4 columns => combine them while reading
def parse(x):
    return dt.strptime(x, '%Y %m %d %H')

df = pd.read_csv(
    'dataset/AirPollution.csv',
    date_parser=parse,
    parse_dates=[['year', 'month', 'day', 'hour']]
)

df.head(5)
  year_month_day_hour  No  pm2.5  DEWP  TEMP    PRES cbwd    Iws  Is  Ir
0 2010-01-01 00:00:00   1    NaN   -21 -11.0  1021.0   NW   1.79   0   0
1 2010-01-01 01:00:00   2    NaN   -21 -12.0  1020.0   NW   4.92   0   0
2 2010-01-01 02:00:00   3    NaN   -21 -11.0  1019.0   NW   6.71   0   0
3 2010-01-01 03:00:00   4    NaN   -21 -14.0  1019.0   NW   9.84   0   0
4 2010-01-01 04:00:00   5    NaN   -20 -12.0  1018.0   NW  12.97   0   0
df.info()
# dataset contains missing pm2.5 values

# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 year_month_day_hour 43824 non-null datetime64[ns]
# 1 No 43824 non-null int64
# 2 pm2.5 41757 non-null float64
# 3 DEWP 43824 non-null int64
# 4 TEMP 43824 non-null float64
# 5 PRES 43824 non-null float64
# 6 cbwd 43824 non-null object
# 7 Iws 43824 non-null float64
# 8 Is 43824 non-null int64
# 9 Ir 43824 non-null int64
df_sample = df.copy()

# one-hot encode wind direction
one_hot_wind = pd.get_dummies(df['cbwd'], drop_first=True)
df_sample = pd.concat([df, one_hot_wind], axis=1, join="inner")

# renaming columns
df_sample.rename(columns={
    'year_month_day_hour': 'timestamp',
    'pm2.5': 'target',
    'DEWP': 'dew_point',
    'TEMP': 'temperature',
    'PRES': 'pressure',
    'NW': 'wind_direction_nw',
    'SE': 'wind_direction_se',
    'cv': 'wind_direction_cv',
    'Iws': 'wind_speed',
    'Is': 'snow',
    'Ir': 'rain'}, inplace=True)

# add item_id
df_sample['item_id'] = 'pm2_pollution'

# fill missing targets with mean()
df_sample['target'] = df_sample['target'].fillna(df_sample['target'].mean())

# make datetime object
datetimes = pd.to_datetime(df_sample['timestamp'])
df_sample['timestamp'] = datetimes
df_sample['weekday'] = datetimes.dt.day_of_week

# drop columns that are not needed
df_sample.drop(['No', 'cbwd'], axis=1, inplace=True)

df_sample.head(5)
            timestamp     target  dew_point  temperature  pressure  wind_speed  snow  rain  wind_direction_nw  wind_direction_se  wind_direction_cv        item_id  weekday
0 2010-01-01 00:00:00  98.613215        -21        -11.0    1021.0        1.79     0     0                  1                  0                  0  pm2_pollution        4
1 2010-01-01 01:00:00  98.613215        -21        -12.0    1020.0        4.92     0     0                  1                  0                  0  pm2_pollution        4
2 2010-01-01 02:00:00  98.613215        -21        -11.0    1019.0        6.71     0     0                  1                  0                  0  pm2_pollution        4
3 2010-01-01 03:00:00  98.613215        -21        -14.0    1019.0        9.84     0     0                  1                  0                  0  pm2_pollution        4
4 2010-01-01 04:00:00  98.613215        -20        -12.0    1018.0       12.97     0     0                  1                  0                  0  pm2_pollution        4
df_sample.to_csv('dataset/bj_airpollution.csv', index=False)

Model Training

train_data = TimeSeriesDataFrame('dataset/bj_airpollution.csv')
train_data.head(5)
                                      target  dew_point  temperature  pressure  wind_speed  snow  rain  wind_direction_nw  wind_direction_se  wind_direction_cv  weekday
item_id       timestamp
pm2_pollution 2010-01-01 00:00:00  98.613215        -21        -11.0    1021.0        1.79     0     0                  1                  0                  0        4
              2010-01-01 01:00:00  98.613215        -21        -12.0    1020.0        4.92     0     0                  1                  0                  0        4
              2010-01-01 02:00:00  98.613215        -21        -11.0    1019.0        6.71     0     0                  1                  0                  0        4
              2010-01-01 03:00:00  98.613215        -21        -14.0    1019.0        9.84     0     0                  1                  0                  0        4
              2010-01-01 04:00:00  98.613215        -20        -12.0    1018.0       12.97     0     0                  1                  0                  0        4
# create a predictor for a 30-day forecast (24 * 30 hourly steps)
bj_predictor = TimeSeriesPredictor(
    prediction_length=24*30,
    path=MODEL_PATH,
    target='target',
    eval_metric='sMAPE'
)
bj_predictor.fit(
    train_data,
    presets="high_quality"
)

# Fitting simple weighted ensemble.
# -0.8465 = Validation score (-sMAPE)
# 3.16 s = Training runtime
# 40.20 s = Validation (prediction) runtime
# Training complete. Models trained: ['Naive', 'SeasonalNaive', 'Theta', 'AutoETS', 'RecursiveTabular', 'DeepAR', 'PatchTST', 'AutoARIMA', 'WeightedEnsemble']
# Total runtime: 693.53 s
# Best model: WeightedEnsemble
# Best model score: -0.8465

Model Predictions

bj_predictions = bj_predictor.predict(train_data, random_seed=SEED)

Visualization

plot_predictions(
    train_data, bj_predictions,
    item_id='pm2_pollution', target_column='target',
    title='Beijing PM2.5 Air Pollution 30 Days Predictions with Confidence Interval',
    ylabel='PM2.5 Concentration'
)
plt.savefig('assets/AutoGluon_AutoML_TimeSeries_04.webp', bbox_inches='tight')

(Figure: Beijing PM2.5 Air Pollution 30 Days Predictions with Confidence Interval)