Sham Shui Po, Hong Kong

Python Asserts Cheat Sheet

Python Asserts Cheat Sheet

Datasets

Validating and Verifying Data

TLC Trip Record Data

Yellow and green taxi trip records include fields capturing pick-up and drop-off dates/times, pick-up and drop-off locations, trip distances, itemized fares, rate types, payment types, and driver-reported passenger counts. The data used in the attached datasets were collected and provided to the NYC Taxi and Limousine Commission (TLC) by technology providers authorized under the Taxicab & Livery Passenger Enhancement Programs (TPEP/LPEP). The trip data was not created by the TLC, and TLC makes no representations as to the accuracy of these data.

!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet -P dataset

import csv
import math
import numpy as np
import numpy.testing as npt
import pandas as pd
import pandas.testing as pdt

# Load the January 2023 yellow-taxi trip records from the downloaded parquet file.
yellow_tripdata_df = pd.read_parquet(
    'dataset/yellow_tripdata_2023-01.parquet'
)

# Write a CSV copy so the same data can also be read with the plain `csv` module.
# NOTE: to_csv writes the DataFrame index as an extra, unnamed first column by default.
yellow_tripdata_df.to_csv('dataset/yellow_tripdata_2023-01.csv')

# Re-load only the first 1000 rows of the CSV and parse both timestamp
# columns as datetimes (they are plain strings in the CSV).
yellow_tripdata_df = pd.read_csv(
    'dataset/yellow_tripdata_2023-01.csv',
    parse_dates=['tpep_pickup_datetime','tpep_dropoff_datetime'],
    nrows=1000
)
yellow_tripdata_df.head(5)

# Second dataset, source:
# https://www.kaggle.com/datasets/neomatrix369/nyc-taxi-trip-duration-extended
trip_ext_df = pd.read_csv('dataset/nyc_trip_duration_extended.csv')
trip_ext_df.head(5)

Introduction to Asserts

Asserts in Python

# simple assert: a str never compares equal to an int, so this assertion fails
x = 'five'
assert x == 5
# AssertionError

# Assert that the values are in ascending order by comparing every element
# with its successor. 6 > 2, so the assertion fails (intended demonstration).
# The data is bound to `numbers` rather than `list` so the builtin `list`
# is not shadowed for the rest of the session.
numbers = [6,2,3,4,5]
assert all(left <= right for left, right in zip(numbers, numbers[1:]))
# AssertionError

def add(a,b):
    """Return the result of ``a + b``."""
    total = a + b
    return total

assert add(2,3) < 5
# AssertionError

trip_ext_df.columns

# Index(['name', 'district', 'neighbourhood', 'latitude', 'longitude',
#        'geonumber'],
#       dtype='object')

# Verify that the CSV header matches the expected column list exactly
# (list equality: same names in the same order).
with open('dataset/nyc_trip_duration_extended.csv') as f:
    reader = csv.DictReader(f)
    
    # 'missing_column' is deliberately added so that the assertion below fails
    expected_columns = ['name', 'district', 'neighbourhood', 'latitude', 'longitude',
       'geonumber', 'missing_column']
    
    # DictReader.fieldnames is the header row as a list of column names
    assert reader.fieldnames == expected_columns, f"Expected columns: {expected_columns}, but got {reader.fieldnames}"
    
# AssertionError:
# Expected columns: ['name', 'district', 'neighbourhood', 'latitude', 'longitude', 'geonumber', 'missing_column'],
# but got ['name', 'district', 'neighbourhood', 'latitude', 'longitude', 'geonumber']

# Row-by-row validation of the raw CSV: every trip must report more than
# zero passengers. The loop stops at the first offending row.
with open('dataset/yellow_tripdata_2023-01.csv') as f:
    reader = csv.DictReader(f)
    
    for row in reader:
        # DictReader yields strings, so convert to float before checking
        # that the passenger count is strictly positive
        assert float(row['passenger_count']) > 0., f"ERROR :: Invalid Passenger Count: {row['passenger_count']}"
        
# AssertionError: ERROR :: Invalid Passenger Count: 0.0

# how many trips were without passengers
trips_without_passengers = 0

# Parse the file with csv.DictReader instead of splitting lines by hand:
# the column is looked up by name rather than by position (the position
# shifts depending on whether the DataFrame index was written to the CSV),
# and quoted fields are handled correctly. newline='' is what the csv
# module expects for files it reads.
with open('dataset/yellow_tripdata_2023-01.csv', newline='') as f:
    for row in csv.DictReader(f):
        # values are raw strings here; a zero count is stored as '0.0'
        if row['passenger_count'] == '0.0':
            trips_without_passengers += 1

print(f'Trips without passengers: {trips_without_passengers}')
# Trips without passengers: 51164

# Share of zero-passenger trips among the rows loaded into the DataFrame.
# `zero_trips` is not defined earlier, so select those rows here.
zero_trips = yellow_tripdata_df[yellow_tripdata_df['passenger_count'] == 0]
perct_zero_trips = len(zero_trips) * 100 / len(yellow_tripdata_df)
print("%.2f" % perct_zero_trips + ' %')
# 1.67 %
# NOTE(review): the figure above was presumably produced with the full
# dataset loaded; with only nrows=1000 in memory the result may differ.

Asserts in Pandas

Indices

# Same length, but the third label on the right is a string, so the two
# indices differ and the assertion fails.
index1 = pd.Index([1,2,3])
index2 = pd.Index([1,2,'three'])

pdt.assert_index_equal(index1, index2)

# Index classes are different
# [left]:  Int64Index([1, 2, 3], dtype='int64')
# [right]: Index([1, 2, 'three'], dtype='object')

# Same labels in reverse order; with check_order=True the order is significant.
index1 = pd.Index([1,2,3])
index2 = pd.Index([3,2,1])

pdt.assert_index_equal(index1, index2, check_order=True)
# Index values are different (66.66667 %)
# [left]:  Int64Index([1, 2, 3], dtype='int64')
# [right]: Int64Index([3, 2, 1], dtype='int64')

# Float labels compared with an absolute tolerance (atol) instead of exactly.
index1 = pd.Index([1.0,2.0,3.0])
index2 = pd.Index([1.0,2.0,3.1])

pdt.assert_index_equal(index1, index2, check_exact=False, atol=0.1)

# Index values are different (33.33333 %)
# [left]:  Float64Index([1.0, 2.0, 3.0], dtype='float64')
# [right]: Float64Index([1.0, 2.0, 3.1], dtype='float64')

# Index holding the calendar date of each pickup (.dt.date drops the time of day)
index1 = pd.Index(yellow_tripdata_df['tpep_pickup_datetime'].dt.date)
index1[:3]
# Index([2023-01-01, 2023-01-01, 2023-01-01], dtype='object', name='tpep_pickup_datetime')

# Index holding the calendar date of each drop-off
index2 = pd.Index(yellow_tripdata_df['tpep_dropoff_datetime'].dt.date)
index2[:3]
# Index([2023-01-01, 2023-01-01, 2023-01-01], dtype='object', name='tpep_dropoff_datetime')

# Compare pickup and drop-off dates position by position;
# check_names=False ignores the differing Index names.
pdt.assert_index_equal(index1, index2, check_exact=True, check_names=False)

# Index values are different (0.3 %)
# [left]:  Index([2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        ...
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01],
#       dtype='object', name='tpep_pickup_datetime', length=1000)
# [right]: Index([2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        ...
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01,
#        2023-01-01, 2023-01-01, 2023-01-01, 2023-01-01],
#       dtype='object', name='tpep_dropoff_datetime', length=1000)

# show difference: `index1 != index2` is an element-wise boolean mask that
# selects the rows whose pickup and drop-off dates differ; only the two
# timestamp columns are kept for inspection
index_diff = yellow_tripdata_df[
    index1 != index2
][
    ['tpep_pickup_datetime','tpep_dropoff_datetime']
]

index_diff.head()

#     tpep_pickup_datetime tpep_dropoff_datetime
# 383  2023-01-01 00:36:07   2023-01-02 00:17:13
# 567  2022-12-31 23:59:37   2023-01-01 00:07:28
# 761  2022-12-31 23:58:27   2023-01-01 00:02:21

Series

# Identical values but different names; check_names=True also compares the
# `name` attribute, so the assertion fails.
s1 = pd.Series([1,2,3], name='series1')
s2 = pd.Series([1,2,3], name='series2')

pdt.assert_series_equal(s1,s2,check_names=True)

# Attribute "name" are different
# [left]:  series1
# [right]: series2

# Integers versus their string representations: names and dtypes are
# ignored here, but the values themselves are still compared.
s1 = pd.Series([1,2,3], name='series1')
s2 = pd.Series(['1','2','3'], name='series2')

pdt.assert_series_equal(s1,s2,check_names=False, check_dtype=False)

# Series values are different (100.0 %)
# [index]: [0, 1, 2]
# [left]:  [1, 2, 3]
# [right]: [1, 2, 3]

# Calendar date of every pickup (.dt.date drops the time of day)
pickup_series = yellow_tripdata_df['tpep_pickup_datetime'].dt.date
pickup_series.head(2)

# 0    2023-01-01
# 1    2023-01-01
# Name: tpep_pickup_datetime, dtype: object

# Calendar date of every drop-off; preview this series rather than the
# pickup series a second time
dropoff_series = yellow_tripdata_df['tpep_dropoff_datetime'].dt.date
dropoff_series.head(2)

# 0    2023-01-01
# 1    2023-01-01
# Name: tpep_dropoff_datetime, dtype: object

# Fails: a few trips end on a different calendar day than they started.
# check_names=False ignores the differing Series names.
pdt.assert_series_equal(pickup_series, dropoff_series, check_exact=True, check_names=False)

# AssertionError: Series are different
# Series values are different (0.3 %)

# drop values that don't fit: collect the row labels where pickup and
# drop-off dates differ and remove those rows (re-numbering the index)
index_diff = pickup_series.index[pickup_series != dropoff_series]
yellow_tripdata_df_drop = yellow_tripdata_df.drop(index_diff).reset_index(drop=True)
# rebuild both series from the cleaned DataFrame
pickup_series = yellow_tripdata_df_drop['tpep_pickup_datetime'].dt.date
dropoff_series = yellow_tripdata_df_drop['tpep_dropoff_datetime'].dt.date
# re-check - this time it works
pdt.assert_series_equal(pickup_series, dropoff_series, check_exact=True, check_names=False)

DataFrames

# Same columns and data, but the columns are declared in a different order.
# With check_like=False the column order matters, so the assertion fails.
df1 = pd.DataFrame({'A': [1,2,3], 'B': [3,2,1]})
df2 = pd.DataFrame({'B': [3,2,1], 'A': [1,2,3]})

pdt.assert_frame_equal(df1,df2,check_like=False)

# DataFrame.columns values are different (100.0 %)
# [left]:  Index(['A', 'B'], dtype='object')
# [right]: Index(['B', 'A'], dtype='object')

# Work on two independent copies so the original DataFrame stays untouched
pickup_df = yellow_tripdata_df.copy()
dropoff_df = yellow_tripdata_df.copy()

# Both frames get a 'date' column: pickup date in one, drop-off date in the other
pickup_df['date'] = yellow_tripdata_df['tpep_pickup_datetime'].dt.date
dropoff_df['date'] = yellow_tripdata_df['tpep_dropoff_datetime'].dt.date

# Compare only the 'date' columns (double brackets keep them as DataFrames)
pdt.assert_frame_equal(
    pickup_df[['date']],
    dropoff_df[['date']],
    check_exact=True,
    check_names=False
)

# AssertionError: DataFrame.iloc[:, 0] (column name="date") are different
# DataFrame.iloc[:, 0] (column name="date") values are different (0.3 %)

# get index of mismatched rows (.ne is the element-wise "not equal")
index_diff = pickup_df.index[
    pickup_df['date'].ne(dropoff_df['date'])
]

index_diff
# Int64Index([383, 567, 761], dtype='int64')

# drop rows at those indices from both frames so they stay aligned
pickup_df_drop = pickup_df.drop(index_diff)
dropoff_df_drop = dropoff_df.drop(index_diff)

# verify that assert now works
pdt.assert_frame_equal(
    pickup_df_drop[['date']],
    dropoff_df_drop[['date']],
    check_exact=True,
    check_names=False
)

Asserts in Numpy

# Same elements in reverse order: element-wise equality fails at positions 0 and 2
a = np.array([1,2,3])
b = np.array([3,2,1])

npt.assert_array_equal(a,b)

# AssertionError: Arrays are not equal
# Mismatched elements: 2 / 3 (66.7%)
# Max absolute difference: 2
# Max relative difference: 2.
#  x: array([1, 2, 3])
#  y: array([3, 2, 1])

# The two strings differ only in letter case; they are not equal,
# so this raises an AssertionError
string1 = 'string'
string2 = 'STRING'

npt.assert_string_equal(string1,string2)

# Every element is off by 0.02, which exceeds the absolute tolerance of 0.01
a = np.array([1,2,3])
b = np.array([0.98,2.02,2.98])

npt.assert_allclose(a,b,atol=0.01)

# AssertionError: Not equal to tolerance rtol=1e-07, atol=0.01
# Mismatched elements: 3 / 3 (100%)
# Max absolute difference: 0.02
# Max relative difference: 0.02040816
# x: array([1, 2, 3])
# y: array([0.98, 2.02, 2.98])

# Every element of `a` is smaller than the element of `b` at the same position
a = np.array([1,2,3])
b = np.array([2,3,4])

# passes: a < b holds element-wise
npt.assert_array_less(a,b)

# fails: with the arguments swapped no element satisfies b < a
npt.assert_array_less(b,a)

# AssertionError: Arrays are not less-ordered
# Mismatched elements: 3 / 3 (100%)
# Max absolute difference: 1
# Max relative difference: 1.
#  x: array([2, 3, 4])
#  y: array([1, 2, 3])

Assert-based Testing

Quantitative Tests

def test_for_missing_data(df):
    # count all missing values and assert number to be zero
    assert df.isnull().sum().sum() == 0, 'ERROR :: DataFrame contains missing data!'
    return True

assert test_for_missing_data(trip_ext_df)
# True

def test_non_numerical_data_types(df, columns):
    for col in columns:
        assert df[col].dtype == 'int64' or df[col].dtype =='float64', f'ERROR :: {col} has a non-numerical dType'
    return True

# 'neighbourhood' is included to trigger the failure shown below.
# The second argument is a sub-DataFrame; the function only iterates over it
# and iterating a DataFrame yields its column labels.
test_columns = ['neighbourhood','latitude','longitude','geonumber']
assert test_non_numerical_data_types(trip_ext_df, trip_ext_df[test_columns])
# AssertionError: ERROR :: neighbourhood has a non-numerical dType

def test_for_out_of_range(df, columns):
    for col in columns:
        assert df[col].dtype == 'int64' or df[col].dtype == 'float64', f'ERROR :: {col} has a non-numerical dType'
        assert df[col].max() <= math.inf, f'ERROR :: {col} contains infinite values'
        assert df[col].min() >= -math.inf, f'ERROR :: {col} contains infinite values'
        assert not np.isnan(df[col]).any(), f'ERROR :: {col} contains NaN values'
        assert not np.isinf(df[col]).any(), f'ERROR :: {col} contains infinite values'
    return True

# Run the out-of-range check (not the dtype check again) on the numerical
# columns, passing the column labels themselves.
test_columns = ['latitude','longitude','geonumber']
assert test_for_out_of_range(trip_ext_df, test_columns)
# True

Logical Tests

def test_for_logical_errors(df):
    # all dropoffs AFTER pickups
    assert all(df['tpep_dropoff_datetime'] > df['tpep_pickup_datetime']), 'ERROR :: Drop-off time before pickup'
    # no negative trip distances
    assert (df['trip_distance'] >= 0).all(), 'ERROR :: Negative trip distances'
    # no negative passenger count
    assert (df['passenger_count'] >= 0).all(), 'ERROR :: Negative passenger count'
    
    return True

assert test_for_logical_errors(yellow_tripdata_df)
# True

Python Asserts Cheat Sheet

Datasets

Introduction to Asserts

Asserts in Python

Asserts in Pandas

Indices

Series

DataFrames

Asserts in Numpy

Assert-based Testing

Quantitative Tests

Logical Tests