

Pandas Cheat Sheet 2023

Github Repository

import numpy as np
import pandas as pd
SEED = 42

Pandas Series

Numeric Index

data = [1642, 1776, 1867, 1821, 1909]

series_with_numeric_index = pd.Series(data)
series_with_numeric_index
0    1642
1    1776
2    1867
3    1821
4    1909

Labelled Index

index = ['Aliqua', 'Ad ut Nulla', 'Nisi est Pastrami', 'Jowl Magna', 'Anim Bacon Doner']

series_with_labelled_index = pd.Series(data, index)
series_with_labelled_index
Aliqua               1642
Ad ut Nulla          1776
Nisi est Pastrami    1867
Jowl Magna           1821
Anim Bacon Doner     1909
labelled_data = {
'Aliqua' : 1642,
'Ad ut Nulla' : 1776,
'Nisi est Pastrami' : 1867,
'Jowl Magna' : 1821,
'Anim Bacon Doner' : 1909
}

series_with_labelled_index2 = pd.Series(labelled_data)
series_with_labelled_index2
Aliqua               1642
Ad ut Nulla          1776
Nisi est Pastrami    1867
Jowl Magna           1821
Anim Bacon Doner     1909

Fill Missing Data

before = {
'Aliqua' : 1642,
'Ad ut Nulla' : 1776,
'Nisi est Pastrami' : 1867,
'Jowl Magna' : 1821,
'Anim Bacon Doner' : 1909
}

after = {
'Voluptate Boudin' : 1643,
'Ad ut Nulla' : 1777,
'Nisi est Pastrami' : 1868,
'Jowl Magna' : 1822,
'Anim Bacon Doner' : 1910
}
before_series = pd.Series(before)
after_series = pd.Series(after)

all_jowl_magna = before_series['Jowl Magna'] + after_series['Jowl Magna']
print(all_jowl_magna)
# 3643
# no fill
before_series + after_series
Ad ut Nulla          3553.0
Aliqua                  NaN
Anim Bacon Doner     3819.0
Jowl Magna           3643.0
Nisi est Pastrami    3735.0
Voluptate Boudin        NaN
# fill missing data with 0
before_series.add(after_series, fill_value=0)
Ad ut Nulla          3553.0
Aliqua               1642.0
Anim Bacon Doner     3819.0
Jowl Magna           3643.0
Nisi est Pastrami    3735.0
Voluptate Boudin     1643.0
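The other arithmetic methods (sub, mul, div) accept the same fill_value argument; a quick sketch:

# year-over-year difference, treating a missing side as 0
after_series.sub(before_series, fill_value=0)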

Pandas Dataframes

Creating a Dataframe from a Numpy Array

np.random.seed(SEED)

random_data_array = np.random.randint(0,103,(4,3))
random_data_array

# array([[102, 51, 92],
# [ 14, 71, 60],
# [ 20, 102, 82],
# [ 86, 74, 74]])
dataframe = pd.DataFrame(random_data_array)
dataframe
     0    1   2
0  102   51  92
1   14   71  60
2   20  102  82
3   86   74  74
# get column labels
dataframe.columns
# RangeIndex(start=0, stop=3, step=1)
# get index labels
dataframe.index
# RangeIndex(start=0, stop=4, step=1)
# get data types
dataframe.dtypes
0    int64
1    int64
2    int64
dtype: object
# get number of rows
len(dataframe)
# 4
data_index = ['Ad ut Nulla', 'Nisi est Pastrami', 'Jowl Magna', 'Anim Bacon Doner']

data_columns = ['Q1', 'Q2', 'Q3']
dataframe_with_labels = pd.DataFrame(random_data_array, data_index, data_columns)
dataframe_with_labels
                    Q1   Q2  Q3
Ad ut Nulla        102   51  92
Nisi est Pastrami   14   71  60
Jowl Magna          20  102  82
Anim Bacon Doner    86   74  74
dataframe_with_labels.info()
# <class 'pandas.core.frame.DataFrame'>
# Index: 4 entries, Ad ut Nulla to Anim Bacon Doner
# Data columns (total 3 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 Q1 4 non-null int64
# 1 Q2 4 non-null int64
# 2 Q3 4 non-null int64
# dtypes: int64(3)
# memory usage: 128.0+ bytes
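
A DataFrame can also be built directly from a dictionary of columns; a quick sketch using the same values and row labels as above (the dictionary name is just for illustration):

quarterly_data = {
    'Q1': [102, 14, 20, 86],
    'Q2': [51, 71, 102, 74],
    'Q3': [92, 60, 82, 74]
}
# column names come from the dictionary keys, row labels from the index argument
dataframe_from_dict = pd.DataFrame(quarterly_data, index=data_index)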

Creating a Dataframe from a CSV File

wine_dataset = pd.read_csv('datasets/wine-quality.csv')
wine_dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 4898 non-null float64
1 volatile acidity 4898 non-null float64
2 citric acid 4898 non-null float64
3 residual sugar 4898 non-null float64
4 chlorides 4898 non-null float64
5 free sulfur dioxide 4898 non-null float64
6 total sulfur dioxide 4898 non-null float64
7 density 4898 non-null float64
8 pH 4898 non-null float64
9 sulphates 4898 non-null float64
10 alcohol 4898 non-null float64
11 quality 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB

Creating a Dataframe from an Excel File

wine_dataset = pd.read_excel('datasets/wine-quality.xlsx', sheet_name='Sheet1')

# dependency - without `openpyxl` installed, read_excel raises:
# ImportError: Missing optional dependency 'openpyxl'. Use pip or conda to install openpyxl.

# read_excel already returns a DataFrame - from_dict simply copies it into a new one here
wine_dataset_excel = pd.DataFrame.from_dict(wine_dataset)

wine_dataset_excel.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 4898 non-null float64
1 volatile acidity 4898 non-null float64
2 citric acid 4898 non-null float64
3 residual sugar 4898 non-null float64
4 chlorides 4898 non-null float64
5 free sulfur dioxide 4898 non-null float64
6 total sulfur dioxide 4898 non-null float64
7 density 4898 non-null float64
8 pH 4898 non-null float64
9 sulphates 4898 non-null float64
10 alcohol 4898 non-null float64
11 quality 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
wine_dataset_excel.head(2)
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  density   pH  sulphates  alcohol  quality
0            7.0              0.27         0.36            20.7      0.045                 45.0                 170.0    1.001  3.0       0.45      8.8        6
1            6.3              0.30         0.34             1.6      0.049                 14.0                 132.0    0.994  3.3       0.49      9.5        6

Describe Dataframe

wine_dataset_excel.describe()
       fixed acidity  volatile acidity  citric acid  residual sugar    chlorides  free sulfur dioxide  total sulfur dioxide      density           pH    sulphates      alcohol      quality
count    4898.000000       4898.000000  4898.000000     4898.000000  4898.000000          4898.000000           4898.000000  4898.000000  4898.000000  4898.000000  4898.000000  4898.000000
mean        6.854788          0.278241     0.334192        6.391415     0.045772            35.308085            138.360657     0.994027     3.188267     0.489847    10.514267     5.877909
std         0.843868          0.100795     0.121020        5.072058     0.021848            17.007137             42.498065     0.002991     0.151001     0.114126     1.230621     0.885639
min         3.800000          0.080000     0.000000        0.600000     0.009000             2.000000              9.000000     0.987110     2.720000     0.220000     8.000000     3.000000
25%         6.300000          0.210000     0.270000        1.700000     0.036000            23.000000            108.000000     0.991723     3.090000     0.410000     9.500000     5.000000
50%         6.800000          0.260000     0.320000        5.200000     0.043000            34.000000            134.000000     0.993740     3.180000     0.470000    10.400000     6.000000
75%         7.300000          0.320000     0.390000        9.900000     0.050000            46.000000            167.000000     0.996100     3.280000     0.550000    11.400000     6.000000
max        14.200000          1.100000     1.660000       65.800000     0.346000           289.000000            440.000000     1.038980     3.820000     1.080000    14.200000     9.000000
  • DataFrame.count: Count of non-NA/null observations.
  • DataFrame.max: Maximum of the values in the object.
  • DataFrame.min: Minimum of the values in the object.
  • DataFrame.mean: Mean of the values.
  • DataFrame.std: Standard deviation of the observations.
  • 25%: Lower percentile - 75% of the data lies above this value (see the sketch below for requesting other percentiles).
  • 75%: Upper percentile - 25% of the data lies above this value.
  • 50%: The 50th percentile is the same as the median.
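
describe() can also report custom percentiles, and quantile() returns a single cut-off; a quick sketch:

# request additional percentiles from describe()
wine_dataset_excel.describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])
# or compute a single quantile for one column
wine_dataset_excel['alcohol'].quantile(0.9)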

Dataframe Correlation Matrix

Compute pairwise correlation of columns, excluding NA/null values. The result contains values between -1 and 1 that represent the strength of the negative or positive correlation between two columns:

wine_dataset_excel.corr()
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
fixed acidity1.000000-0.0226970.2891810.0890210.023086-0.0493960.0910700.265331-0.425858-0.017143-0.120881-0.113663
volatile acidity-0.0226971.000000-0.1494720.0642860.070512-0.0970120.0892610.027114-0.031915-0.0357280.067718-0.194723
citric acid0.289181-0.1494721.0000000.0942120.1143640.0940770.1211310.149503-0.1637480.062331-0.075729-0.009209
residual sugar0.0890210.0642860.0942121.0000000.0886850.2990980.4014390.838966-0.194133-0.026664-0.450631-0.097577
chlorides0.0230860.0705120.1143640.0886851.0000000.1013920.1989100.257211-0.0904390.016763-0.360189-0.209934
free sulfur dioxide-0.049396-0.0970120.0940770.2990980.1013921.0000000.6155010.294210-0.0006180.059217-0.2501040.008158
total sulfur dioxide0.0910700.0892610.1211310.4014390.1989100.6155011.0000000.5298810.0023210.134562-0.448892-0.174737
density0.2653310.0271140.1495030.8389660.2572110.2942100.5298811.000000-0.0935910.074493-0.780138-0.307123
pH-0.425858-0.031915-0.163748-0.194133-0.090439-0.0006180.002321-0.0935911.0000000.1559510.1214320.099427
sulphates-0.017143-0.0357280.062331-0.0266640.0167630.0592170.1345620.0744930.1559511.000000-0.0174330.053678
alcohol-0.1208810.067718-0.075729-0.450631-0.360189-0.250104-0.448892-0.7801380.121432-0.0174331.0000000.435575
quality-0.113663-0.194723-0.009209-0.097577-0.2099340.008158-0.174737-0.3071230.0994270.0536780.4355751.000000
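
To make the matrix easier to scan it can be rendered as a heatmap; a minimal sketch, assuming matplotlib and seaborn are installed (both are used in the visualization section further below):

import matplotlib.pyplot as plt
import seaborn as sns

# colour-code the pairwise correlations and print the value in each cell
plt.figure(figsize=(12, 8))
sns.heatmap(wine_dataset_excel.corr(), annot=True, cmap='coolwarm')
plt.show()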

Dataframes Columns

Value Counts for Categorical Values

Count the number of entries in a categorical column for each category:

wine_dataset_excel['quality'].value_counts()
# there are 5 wines with quality 9 but 2198 with quality 6
6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64
wine_dataset_excel['quality'].value_counts().plot.bar()

Wine Quality Distribution

Unique Entries in a Column

print(wine_dataset_excel['quality'].unique())
# array([6, 5, 7, 8, 4, 3, 9])
# we don't have any wines with quality 1,2 or 10
print(wine_dataset_excel['quality'].nunique())
# 7
# in total there are 7 classes

Selecting Columns

# selecting a single column returns a pandas Series
type(wine_dataset_excel['pH'])
wine_dataset_excel['pH']
0       3.00
1       3.30
2       3.26
3       3.19
4       3.19
        ... 
4893    3.27
4894    3.15
4895    2.99
4896    3.34
4897    3.26
Name: pH, Length: 4898, dtype: float64

# select only wines with a pH of 2.72
wine_dataset_excel[wine_dataset_excel['pH'] == 2.72]
      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  alcohol  quality
1900           10.0              0.23         0.27            14.1      0.033                 45.0                 166.0   0.9988  2.72       0.43      9.7        6
# select multiple columns
columns = ['pH', 'alcohol']
type(wine_dataset_excel[columns])
wine_dataset_excel[columns]
        pH  alcohol
0     3.00      8.8
1     3.30      9.5
2     3.26     10.1
3     3.19      9.9
4     3.19      9.9
...    ...      ...
4893  3.27     11.2
4894  3.15      9.6
4895  2.99      9.4
4896  3.34     12.8
4897  3.26     11.8

4898 rows × 2 columns

plot = wine_dataset_excel[columns].plot.scatter(
figsize=(12,8),
x='pH',
y='alcohol')

Scatter Plot multiple columns

Renaming Categorical Values

wine_dataset_excel['quality'] = wine_dataset_excel['quality'].replace(
[10,9,8,7,6,5,4,3,2,1],
['A','B','C','D','E','F','G','H','I','J']
)

wine_dataset_excel['quality']
0       E
1       E
2       E
3       E
4       E
       ..
4893    E
4894    F
4895    E
4896    D
4897    E
Name: quality, Length: 4898, dtype: object

Value Mapping

feature_map = {
'A':10,
'B': 9,
'C': 8,
'D': 7,
'E': 6,
'F': 5,
'G': 4,
'H': 3,
'I': 2,
'J': 1
}

# note: map() returns a new Series - without assigning it back the column stays unchanged
wine_dataset_excel['quality'].map(feature_map)
wine_dataset_excel['quality']
0       E
1       E
2       E
3       E
4       E
       ..
4893    E
4894    F
4895    E
4896    D
4897    E
Name: quality, Length: 4898, dtype: object
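
To actually persist the mapping, assign the result back to the column; a quick sketch (not executed above, which is why the printed column still shows the letter classes):

# overwrite the column with the mapped numeric values
wine_dataset_excel['quality'] = wine_dataset_excel['quality'].map(feature_map)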

Sorting by Columns

wine_dataset_excel.sort_values(by='quality', ascending=False, na_position='last')
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
8277.40.240.362.000.03127.0139.00.990553.280.4812.59
16057.10.260.492.200.03231.0113.00.990303.370.4212.99
8766.90.360.344.200.01857.0119.00.989803.280.3612.79
7749.10.270.4510.600.03528.0124.00.997003.200.4610.49
8206.60.360.291.600.02124.085.00.989653.410.6112.49
...
14847.50.320.244.600.0538.0134.00.995803.140.509.13
23737.60.480.371.200.0345.057.00.992563.050.5410.43
2518.50.260.2116.200.07441.0197.00.998003.020.509.83
16886.70.250.261.550.041118.5216.00.994903.550.639.43
2535.80.240.443.500.0295.0109.00.991303.530.4311.73
wine_dataset_excel.sort_values(by=['alcohol', 'residual sugar'], ascending=False, na_position='first')
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
39186.40.3500.281.600.03731.0113.00.987793.120.4014.207
45035.80.6100.018.400.04131.0104.00.990903.260.7214.057
31505.80.3900.477.500.02712.088.00.990703.380.4514.006
10995.80.2900.212.600.02512.0120.00.989403.390.7914.007
39045.00.4550.181.900.03633.0106.00.987463.210.8314.007
...
40206.40.3700.125.900.0566.091.00.995363.060.468.404
38356.20.3100.233.300.05234.0113.00.994293.160.488.405
38396.20.3100.233.300.05234.0113.00.994293.160.488.405
32654.20.2150.235.100.04164.0157.00.996883.420.448.003
26254.50.1900.210.950.03389.0159.00.993323.340.428.005

Find Min/Max Value in Column

# return max value in column
wine_dataset_excel['total sulfur dioxide'].max()
# 440.0
# return max value location in column
max_value_location = wine_dataset_excel['total sulfur dioxide'].idxmax()
print(max_value_location)
# 4745
wine_dataset_excel.iloc[max_value_location]
fixed acidity             6.10000
volatile acidity          0.26000
citric acid               0.25000
residual sugar            2.90000
chlorides                 0.04700
free sulfur dioxide     289.00000
total sulfur dioxide    440.00000
density                   0.99314
pH                        3.44000
sulphates                 0.64000
alcohol                  10.50000
quality                   3.00000
Name: 4745, dtype: float64
# return min value location in column
min_value_location = wine_dataset_excel['total sulfur dioxide'].idxmin()
print(min_value_location)
# 3710
wine_dataset_excel.iloc[min_value_location]
fixed acidity            4.70000
volatile acidity         0.67000
citric acid              0.09000
residual sugar           1.00000
chlorides                0.02000
free sulfur dioxide      5.00000
total sulfur dioxide     9.00000
density                  0.98722
pH                       3.30000
sulphates                0.34000
alcohol                 13.60000
quality                  5.00000
Name: 3710, dtype: float64

Adding Columns

wine_dataset_excel['total acidity'] = wine_dataset_excel['fixed acidity'] + wine_dataset_excel['volatile acidity']
wine_dataset_excel
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualitytotal acidity
07.00.270.3620.70.04545.0170.01.001003.000.458.867.27
16.30.300.341.60.04914.0132.00.994003.300.499.566.60
28.10.280.406.90.05030.097.00.995103.260.4410.168.38
37.20.230.328.50.05847.0186.00.995603.190.409.967.43
47.20.230.328.50.05847.0186.00.995603.190.409.967.43
...
48936.20.210.291.60.03924.092.00.991143.270.5011.266.41
48946.60.320.368.00.04757.0168.00.994903.150.469.656.92
48956.50.240.191.20.04130.0111.00.992542.990.469.466.74
48965.50.290.301.10.02220.0110.00.988693.340.3812.875.79
48976.00.210.380.80.02022.098.00.989413.260.3211.866.21

Removing Columns

wine_dataset_excel_dropped = wine_dataset_excel.drop(['total acidity'], axis=1)
wine_dataset_excel_dropped
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
07.00.270.3620.70.04545.0170.01.001003.000.458.86
16.30.300.341.60.04914.0132.00.994003.300.499.56
28.10.280.406.90.05030.097.00.995103.260.4410.16
37.20.230.328.50.05847.0186.00.995603.190.409.96
47.20.230.328.50.05847.0186.00.995603.190.409.96
...
48936.20.210.291.60.03924.092.00.991143.270.5011.26
48946.60.320.368.00.04757.0168.00.994903.150.469.65
48956.50.240.191.20.04130.0111.00.992542.990.469.46
48965.50.290.301.10.02220.0110.00.988693.340.3812.87
48976.00.210.380.80.02022.098.00.989413.260.3211.86

Dataframes Rows

Setting an Index Column

# adding an index column
wine_dataset_excel_dropped['index'] = range(1, len(wine_dataset_excel_dropped) + 1)
wine_dataset_excel_dropped.head(2)
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualityindex
07.00.270.3620.70.04545.0170.01.001003.000.458.861
16.30.300.341.60.04914.0132.00.994003.300.499.562
wine_dataset_excel_indexed = wine_dataset_excel_dropped.set_index('index')
wine_dataset_excel_indexed.head(2)
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
17.00.270.3620.70.04545.0170.01.001003.000.458.86
26.30.300.341.60.04914.0132.00.994003.300.499.56
# resetting the index turns index into a regular column again
wine_dataset_excel_indexed.reset_index()

Selecting Rows

# select the first row by integer location
wine_dataset_excel_indexed.iloc[0]
fixed acidity             7.000
volatile acidity          0.270
citric acid               0.360
residual sugar           20.700
chlorides                 0.045
free sulfur dioxide      45.000
total sulfur dioxide    170.000
density                   1.001
pH                        3.000
sulphates                 0.450
alcohol                   8.800
quality                   6.000
# select first row by labelled index
wine_dataset_excel_indexed.loc[1]
fixed acidity             7.000
volatile acidity          0.270
citric acid               0.360
residual sugar           20.700
chlorides                 0.045
free sulfur dioxide      45.000
total sulfur dioxide    170.000
density                   1.001
pH                        3.000
sulphates                 0.450
alcohol                   8.800
quality                   6.000
# select multiple rows by iloc
wine_dataset_excel_indexed.iloc[:3]
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
17.00.270.3620.70.04545.0170.01.00103.000.458.86
26.30.300.341.60.04914.0132.00.99403.300.499.56
38.10.280.406.90.05030.097.00.99513.260.4410.16
# select multiple rows by loc
wine_dataset_excel_indexed.loc[[111,222,333]]
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
1116.50.1700.548.50.08264.0163.00.99592.890.398.86
2227.20.6850.219.50.07033.0172.00.99713.000.559.16
3336.30.2300.301.80.03316.091.00.99063.280.4011.86

Removing Rows

wine_dataset_excel_indexed.drop(1, axis=0)
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
26.30.300.341.60.04914.0132.00.99403.300.499.56
38.10.280.406.90.05030.097.00.99513.260.4410.16
...

Inserting Rows

# copy a row
row_copy = wine_dataset_excel_indexed.iloc[0]
wine_dataset_excel_indexed.append(row_copy)
# FutureWarning: The frame.append method is deprecated and will be removed
# from pandas in a future version. Use pandas.concat instead.
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
17.00.270.3620.70.04545.0170.01.001003.000.458.86.0
26.30.300.341.60.04914.0132.00.994003.300.499.56.0
...
48986.00.210.380.80.02022.098.00.989413.260.3211.86.0
17.00.270.3620.70.04545.0170.01.001003.000.458.86.0
pd.concat([wine_dataset_excel_indexed, row_copy.to_frame().T], ignore_index=False)
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
17.00.270.3620.70.04545.0170.01.001003.000.458.86.0
26.30.300.341.60.04914.0132.00.994003.300.499.56.0
...
48986.00.210.380.80.02022.098.00.989413.260.3211.86.0
17.00.270.3620.70.04545.0170.01.001003.000.458.86.0

Removing Duplicated Rows

# test for duplicated rows
wine_dataset_excel.duplicated()

The row at index 4 duplicates the row above it:

0       False
1       False
2       False
3       False
4        True
        ...  
4893    False
4894    False
4895    False
4896    False
4897    False
Length: 4898, dtype: bool
# remove duplicates
wine_dataset_excel_deduped = wine_dataset_excel.drop_duplicates()
wine_dataset_excel_deduped.shape
# (3961, 12)
# We are down from 4898 to 3961 dropping 937 rows!
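
To see how many rows would be affected before dropping anything, the boolean mask from duplicated() can simply be summed; a quick sketch:

# count duplicated rows without removing them
wine_dataset_excel.duplicated().sum()
# 937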

Selecting n-largest / n-smallest

top_10_wines = wine_dataset_excel.nlargest(10, 'quality')
top_10_wines
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
7749.10.270.4510.60.03528.0124.00.997003.200.4610.49
8206.60.360.291.60.02124.085.00.989653.410.6112.49
8277.40.240.362.00.03127.0139.00.990553.280.4812.59
8766.90.360.344.20.01857.0119.00.989803.280.3612.79
16057.10.260.492.20.03231.0113.00.990303.370.4212.99
176.20.660.481.20.02929.075.00.989203.330.3912.88
206.20.660.481.20.02929.075.00.989203.330.3912.88
226.80.260.421.70.04941.0122.00.993003.470.4810.58
686.70.230.312.10.04630.096.00.992603.330.6410.78
746.70.230.312.10.04630.096.00.992603.330.6410.78
worst_10_wines = wine_dataset_excel.nsmallest(10, 'quality')
worst_10_wines

Selecting Random Sample

# sample a fixed number - 2
two_random_wines = wine_dataset_excel.sample(n=2)
two_random_wines
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
35797.40.250.287.250.02814.078.00.992382.940.3711.57
20697.10.330.251.600.03025.0126.00.990103.220.3412.17
# sample a percentage - 0.1%
random_wines_01_percent = wine_dataset_excel.sample(frac=0.001)
random_wines_01_percent
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
366.50.390.235.40.05125.0149.00.993403.240.3510.05
2715.20.600.077.00.04433.0147.00.994403.330.589.75
35555.80.140.156.10.04227.0123.00.993623.060.609.96
13158.10.200.369.70.04463.0162.00.997003.100.4610.06
2226.20.250.251.40.03035.0105.00.991203.300.4411.17
random_wines_01_percent.plot.bar(figsize=(12,8), rot=0)

Plot Random Sample

Shuffle Dataset

# Use the randomizer in sample() to shuffle the entire dataset
random_wines_shuffle = wine_dataset_excel.sample(frac=1)
random_wines_shuffle
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualityreally sourtotal acidity
22037.80.2400.382.10.05814.0167.00.994003.210.559.95False8.040
42947.40.2200.289.00.04622.0121.00.994683.100.5510.85False7.620
13636.90.3200.161.40.05115.096.00.994003.220.389.54False7.220
33106.30.3000.292.10.04833.0142.00.989563.220.4612.97False6.600
978.60.2650.361.20.03415.080.00.991302.950.3611.47True8.865
...
22947.00.3200.316.40.03138.0115.00.992353.380.5812.27False7.320
4056.80.2700.121.30.04087.0168.00.992003.180.4110.05False7.070
7197.40.2900.501.80.04235.0127.00.993703.450.5010.27False7.690
27856.40.2400.2520.20.08335.0157.00.999763.170.509.15False6.640
16667.80.4450.561.00.0408.084.00.993803.250.4310.85False8.245
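
For a reproducible shuffle, pass the seed defined at the top and rebuild a clean index afterwards; a minimal sketch:

# shuffle reproducibly and reset to a fresh RangeIndex
random_wines_shuffle = wine_dataset_excel.sample(frac=1, random_state=SEED).reset_index(drop=True)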

Conditional Filtering

# mark all wines with a pH below 3 as sour
wine_dataset_excel_indexed['really sour'] = wine_dataset_excel_indexed['pH'] < 3.
wine_dataset_excel_indexed.head(5)
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualityreally sour
17.00.270.3620.70.04545.0170.01.00103.000.458.86False
26.30.300.341.60.04914.0132.00.99403.300.499.56False
38.10.280.406.90.05030.097.00.99513.260.4410.16False
47.20.230.328.50.05847.0186.00.99563.190.409.96False
57.20.230.328.50.05847.0186.00.99563.190.409.96False

Filter by Value

# select only rows with a pH value below 3
sour_wines = wine_dataset_excel_indexed[wine_dataset_excel_indexed['pH'] < 3.]
print(sour_wines.shape)
# there are 437 sour wines - (437, 13)
sour_wines.head(5)
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualityreally sour
118.10.2700.411.450.03311.063.00.99082.990.5612.05True
158.30.4200.6219.250.04041.0172.01.00022.980.679.75True
748.60.2300.461.000.0549.072.00.99412.950.499.16True
797.40.1800.308.800.06426.0103.00.99612.940.569.35True
988.60.2650.361.200.03415.080.00.99132.950.3611.47True
# select only rows that are marked as not sour
not_so_sour_wines = wine_dataset_excel_indexed[wine_dataset_excel_indexed['really sour'] == False]
print(not_so_sour_wines.shape)
# there are 4461 not so sour wines - (4461, 13)
not_so_sour_wines.head(5)
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualityreally sour
17.00.270.3620.70.04545.0170.01.00103.000.458.86False
26.30.300.341.60.04914.0132.00.99403.300.499.56False
38.10.280.406.90.05030.097.00.99513.260.4410.16False
47.20.230.328.50.05847.0186.00.99563.190.409.96False
57.20.230.328.50.05847.0186.00.99563.190.409.96False

Filter by Multiple Values

# select only rows that are marked as not sour AND have a high sugar level
sweet_wines = wine_dataset_excel_indexed[
(
wine_dataset_excel_indexed['really sour'] == False
) & (
wine_dataset_excel_indexed['residual sugar'] > 20
)
]
print(sweet_wines.shape)
# there are 15 sweet wines - (15, 13)
sweet_wines.head(5)
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualityreally sour
17.00.270.3620.70.04545.0170.01.00103.000.458.86False
87.00.270.3620.70.04545.0170.01.00103.000.458.86False
4456.90.240.3620.80.03140.0139.00.99753.200.3311.06False
16547.90.330.2831.60.05335.0176.01.01033.150.388.86False
16647.90.330.2831.60.05335.0176.01.01033.150.388.86False
# select only rows that have extreme total SO2 values OR sugar levels
selected_wines = wine_dataset_excel_indexed[
(
wine_dataset_excel_indexed['total sulfur dioxide'] > 250.
) | (
wine_dataset_excel_indexed['residual sugar'] > 30.
)
]
print(selected_wines.shape)
# (27, 13)
selected_wines.head(5)
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualityreally sour
1127.20.270.4618.750.05245.0255.01.00003.040.528.95False
2287.10.250.3210.300.04166.0272.00.99693.170.529.16False
3267.50.270.315.800.057131.0313.00.99463.180.5910.55False
3886.30.390.355.900.04082.5260.00.99413.120.6610.15False
4047.10.270.3118.200.04655.0252.01.00003.070.568.75False

Comparing Columns with Crosstab

wine_dataset_excel['really sour'] = wine_dataset_excel['pH'] < 3.

# how many wines that are 'really sour' are in each quality class
pd.crosstab(wine_dataset_excel['quality'], wine_dataset_excel['really sour'])

really sour  False  True
quality                 
3               16     4
4              149    14
5             1321   136
6             2003   195
7              808    72
8              159    16
9                5     0
pd.crosstab(wine_dataset_excel['quality'], wine_dataset_excel['really sour']).plot.bar()

Pandas Crosstab
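
crosstab() can also return shares instead of raw counts via its normalize argument; a quick sketch:

# fraction of 'really sour' wines within each quality class
pd.crosstab(wine_dataset_excel['quality'], wine_dataset_excel['really sour'], normalize='index')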

Filter with isin()

# select wines with a density of 1.0010 OR 0.9956 inside the dataset
options = [1.0010, 0.9956]

selected_wines_with_selected_density = wine_dataset_excel_indexed[
wine_dataset_excel_indexed['density'].isin(options)
]
print(selected_wines_with_selected_density.shape)
# (46, 13)
selected_wines_with_selected_density.head(5)
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualityreally sour
17.00.270.3620.70.04545.0170.01.00103.000.458.86False
47.20.230.328.50.05847.0186.00.99563.190.409.96False
57.20.230.328.50.05847.0186.00.99563.190.409.96False
87.00.270.3620.70.04545.0170.01.00103.000.458.86False
716.20.270.437.80.05648.0244.00.99563.100.519.06False
# select wines that DO NOT have a density of 1.0010 OR 0.9956 inside the dataset
options = [1.0010, 0.9956]

selected_wines_with_selected_density = wine_dataset_excel_indexed[
~wine_dataset_excel_indexed['density'].isin(options)
]
print(selected_wines_with_selected_density.shape)
# (4852, 13)
selected_wines_with_selected_density.head(5)
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualityreally sour
26.30.300.341.60.04914.0132.00.99403.300.499.56False
38.10.280.406.90.05030.097.00.99513.260.4410.16False
68.10.280.406.90.05030.097.00.99513.260.4410.16False
76.20.320.167.00.04530.0136.00.99493.180.479.66False
96.30.300.341.60.04914.0132.00.99403.300.499.56False

Filter with between()

# select wines that have a density between 0.9949 AND 0.9951
selected_wines_with_selected_density = wine_dataset_excel_indexed[
wine_dataset_excel_indexed['density'].between(0.9949, 0.9951, inclusive='both')
]
print(selected_wines_with_selected_density.shape)
# (105, 13)
selected_wines_with_selected_density.head(5)
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualityreally sour
38.10.280.406.90.05030.097.00.99513.260.4410.16False
68.10.280.406.90.05030.097.00.99513.260.4410.16False
76.20.320.167.00.04530.0136.00.99493.180.479.66False
256.60.270.411.30.05216.0142.00.99513.420.4710.06False
506.90.190.355.00.06732.0150.00.99503.360.489.85False

Apply

Apply a Custom Method to a Single Column (Pandas Series)

# custom function to round a value to 3 decimal places
def round_it_up(num):
    return np.around(num, decimals=3)
wine_dataset_excel_indexed['density'].apply(round_it_up)
index
1       1.001
2       0.994
3       0.995
4       0.996
5       0.996
        ...  
4894    0.991
4895    0.995
4896    0.993
4897    0.989
4898    0.989
Name: density, Length: 4898, dtype: float64
# flag each value as above (↑) or below (↓) the column mean
wine_dataset_excel_indexed['density'].mean()
# 0.9940273764801959
def rel_average(density):
    if density > 0.9940273764801959:
        return '↑'
    else:
        return '↓'

wine_dataset_excel_indexed['density average'] = wine_dataset_excel_indexed['density'].apply(rel_average) 
wine_dataset_excel_indexed
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualitydensity average
17.00.270.3620.70.04545.0170.01.001003.000.458.86
26.30.300.341.60.04914.0132.00.994003.300.499.56
38.10.280.406.90.05030.097.00.995103.260.4410.16
47.20.230.328.50.05847.0186.00.995603.190.409.96
57.20.230.328.50.05847.0186.00.995603.190.409.96
...
48946.20.210.291.60.03924.092.00.991143.270.5011.26
48956.60.320.368.00.04757.0168.00.994903.150.469.65
48966.50.240.191.20.04130.0111.00.992542.990.469.46
48975.50.290.301.10.02220.0110.00.988693.340.3812.87
48986.00.210.380.80.02022.098.00.989413.260.3211.86

Apply a Custom Method to Multiple Columns (Pandas Series)

Lambda Expression
# using a lambda expression
## function
def times_two(num):
    return num*2

# lambda
lambda num: num*2
wine_dataset_excel_indexed['quality'].apply(times_two)
index
1       12
2       12
3       12
4       12
5       12
        ..
4894    12
4895    10
4896    12
4897    14
4898    12
wine_dataset_excel_indexed['quality'].apply(lambda num: num*2)
index
1       12
2       12
3       12
4       12
5       12
        ..
4894    12
4895    10
4896    12
4897    14
4898    12
def sweetness(fixed_acidity, volatile_acidity, citric_acid, residual_sugar):
    if (fixed_acidity+volatile_acidity+citric_acid)/residual_sugar > 0.38:
        return "sweet"
    else:
        return "dry"
wine_dataset_excel_indexed['sweetness'] = wine_dataset_excel_indexed[[
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar'
]].apply(
    lambda row: sweetness(
        row['fixed acidity'],
        row['volatile acidity'],
        row['citric acid'],
        row['residual sugar']
    ), axis=1
)

wine_dataset_excel_indexed
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualitydensity averagesweetness
17.00.270.3620.70.04545.0170.01.001003.000.458.86dry
26.30.300.341.60.04914.0132.00.994003.300.499.56sweet
38.10.280.406.90.05030.097.00.995103.260.4410.16sweet
47.20.230.328.50.05847.0186.00.995603.190.409.96sweet
57.20.230.328.50.05847.0186.00.995603.190.409.96sweet
...
48946.20.210.291.60.03924.092.00.991143.270.5011.26sweet
48956.60.320.368.00.04757.0168.00.994903.150.469.65sweet
48966.50.240.191.20.04130.0111.00.992542.990.469.46sweet
48975.50.290.301.10.02220.0110.00.988693.340.3812.87sweet
48986.00.210.380.80.02022.098.00.989413.260.3211.86sweet
Numpy Vectorize

Using Numpy Vectorize to speed things up:

wine_dataset_excel_indexed['sweetness'] = np.vectorize(sweetness)(
wine_dataset_excel_indexed['fixed acidity'],
wine_dataset_excel_indexed['volatile acidity'],
wine_dataset_excel_indexed['citric acid'],
wine_dataset_excel_indexed['residual sugar']
)

wine_dataset_excel_indexed.head(5)
indexfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholqualitydensity averagesweetness
17.00.270.3620.70.04545.0170.01.001003.000.458.86dry
26.30.300.341.60.04914.0132.00.994003.300.499.56sweet
38.10.280.406.90.05030.097.00.995103.260.4410.16sweet
47.20.230.328.50.05847.0186.00.995603.190.409.96sweet
57.20.230.328.50.05847.0186.00.995603.190.409.96sweet

Detect Missing Data

print(pd.NA is pd.NA)
print(np.nan is np.nan)
# True
# True
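
Keep in mind that NaN never compares equal to itself, which is exactly why the isna()/notna() helpers below are used instead of ==; a quick sketch:

print(np.nan == np.nan)
# False
print(pd.isna(np.nan))
# True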
df_missing = pd.read_csv('datasets/wine-quality-missing.csv')
df_missing
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
07.00.270.3620.70.04545.0170.01.001003.000.458.86.0
16.30.300.341.60.04914.0132.00.994003.300.499.56.0
28.1NaN0.406.90.05030.097.00.995103.260.4410.16.0
37.20.230.328.50.05847.0186.00.995603.190.409.96.0
47.20.230.328.50.05847.0186.00.995603.190.409.96.0
...
48946.60.320.368.00.04757.0168.00.994903.150.469.65.0
48956.50.240.191.20.04130.0111.00.992542.990.469.46.0
48965.50.290.301.10.02220.0110.00.988693.340.3812.87.0
48976.00.210.380.80.02022.098.00.989413.260.3211.86.0
4898NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
df_missing.isna()
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
0FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
2FalseTrueFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
3FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
4FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
...
4894FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
4895FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
4896FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
4897FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
4898TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
df_missing.notna()
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
0TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
1TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
2TrueFalseTrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
3TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
...
4894TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4895TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4896TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4897TrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4898FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
# only show wines that do not have a quality score
df_missing[df_missing['quality'].isna()]
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
1346.80.2700.228.10.03455.0203.00.996103.190.528.9NaN
1456.30.2550.371.10.04037.0114.00.990503.000.3910.9NaN
1936.60.1500.345.10.05534.0125.00.994203.360.429.6NaN
2968.30.3900.7010.60.04533.0169.00.997603.090.579.4NaN
9326.50.2600.2812.50.04680.0225.00.996853.180.4110.0NaN
4898NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
# search for rows where multiple values are missing
df_missing[(df_missing['free sulfur dioxide'].isna()) & (df_missing['total sulfur dioxide'].isna())]
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
366.5NaNNaNNaNNaNNaNNaNNaN3.240.3510.05.0
4898NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN

Drop Missing Data

# drop all rows that have less than 7 non-null values
df_missing.dropna(axis='rows', thresh=7)
# drop only rows that miss ALL data points
df_missing.dropna(how='all')
# only drop rows that have missing values in specified columns
df_missing.dropna(subset=['pH', 'alcohol'])

Fill Missing Data

# fill all missing values with `0`
df_missing.fillna(value=0)
# fill all missing values with `0` in a specified column
df_missing['pH'] = df_missing['pH'].fillna(value=0)
# use different fills per column
values = {
"fixed acidity": 0,
"volatile acidity": 0,
"citric acid": 0,
"residual sugar": 0,
"chlorides": 0,
"free sulfur dioxide": 0,
"total sulfur dioxide": 0,
"density": 0,
"pH": 0,
"sulphates": 0,
"alcohol": 0,
"quality": 'Not Evaluated',
}
df_missing.fillna(value=values)
# fill missing data with average values
df_missing['chlorides'] = df_missing['chlorides'].fillna(df_missing['chlorides'].mean())
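
For time-ordered data it is often better to carry the previous or next valid observation instead of a constant; a minimal sketch:

# propagate the last valid value forward, then fill any remaining leading gaps backward
df_missing_filled = df_missing.ffill().bfill()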

GroupBy

# group all wines with the same quality
# and display the mean values
wine_dataset.groupby('quality').mean()
qualityfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcohol
37.6000000.3332500.3360006.3925000.05430053.325000170.6000000.9948843.1875000.47450010.345000
47.1294480.3812270.3042334.6282210.05009823.358896125.2791410.9942773.1828830.47613510.152454
56.9339740.3020110.3376537.3349690.05154636.432052150.9045980.9952633.1688330.4822039.808840
66.8376710.2605640.3380256.4416060.04521735.650591137.0473160.9939613.1885990.49110610.575372
76.7347160.2627670.3256255.1864770.03819134.125568125.1147730.9924523.2138980.50310211.367936
86.6571430.2774000.3265145.6714290.03831436.720000126.1657140.9922363.2186860.48622911.636000
97.4200000.2980000.3860004.1200000.02740033.400000116.0000000.9914603.3080000.46600012.180000
# only return the mean values of one column as a pandas Series
wine_dataset.groupby('quality').mean()['total sulfur dioxide']
quality
3    170.600000
4    125.279141
5    150.904598
6    137.047316
7    125.114773
8    126.165714
9    116.000000
Name: total sulfur dioxide, dtype: float64

GroupBy Multi-Index

# group by multiple features
quality_tsd = wine_dataset.groupby(['quality','total sulfur dioxide']).mean()
quality_tsd
qualitytotal sulfur dioxidefixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidedensitypHsulphatesalcohol
319.06.90.390.404.600.0225.00.991503.310.3712.6
33.010.30.170.471.400.0375.00.993902.890.289.6
57.07.60.480.371.200.0345.00.992563.050.5410.4
66.07.10.320.3211.000.03816.00.993703.240.4011.5
96.08.30.330.421.150.03318.00.991103.200.3212.4
...
985.06.60.360.291.600.02124.00.989653.410.6112.4
113.07.10.260.492.200.03231.00.990303.370.4212.9
119.06.90.360.344.200.01857.00.989803.280.3612.7
124.09.10.270.4510.600.03528.00.997003.200.4610.4
139.07.40.240.362.000.03127.00.990553.280.4812.5
quality_tsd.swaplevel()
total sulfur dioxidequalityfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidedensitypHsulphatesalcohol
19.036.90.390.404.600.0225.00.991503.310.3712.6
33.0310.30.170.471.400.0375.00.993902.890.289.6
57.037.60.480.371.200.0345.00.992563.050.5410.4
66.037.10.320.3211.000.03816.00.993703.240.4011.5
96.038.30.330.421.150.03318.00.991103.200.3212.4
...
85.096.60.360.291.600.02124.00.989653.410.6112.4
113.097.10.260.492.200.03231.00.990303.370.4212.9
119.096.90.360.344.200.01857.00.989803.280.3612.7
124.099.10.270.4510.600.03528.00.997003.200.4610.4
139.097.40.240.362.000.03127.00.990553.280.4812.5
quality_tsd.swaplevel().sort_index(level='total sulfur dioxide', ascending=False)
total sulfur dioxidequalityfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidedensitypHsulphatesalcohol
440.036.10.260.252.900.047289.00.993143.440.6410.5
366.538.60.550.3515.550.05735.51.000103.040.6311.0
344.059.10.330.381.700.06250.50.995803.100.709.5
313.057.50.270.315.800.057131.00.994603.180.5910.5
307.537.10.490.222.000.047146.50.992403.240.3711.0
...
21.055.90.190.370.800.0273.00.989703.090.3110.8
19.036.90.390.404.600.0225.00.991503.310.3712.6
18.069.70.240.494.900.0323.00.993682.850.5410.0
10.044.80.650.121.100.0134.00.992463.320.3613.5
9.054.70.670.091.000.0205.00.987223.300.3413.6
wine_dataset.groupby(['quality','total sulfur dioxide']).describe().transpose()
# show hierarchical indices [quality, total sulfur dioxide]
quality_tsd.index.levels
# FrozenList([[3, 4, 5, 6, 7, 8, 9], [9.0, 10.0, 18.0, 19.0, 21.0, 24.0, 25.0, 26.0, 28.0, 29.0, 30.0, 31.0, 33.0, 34.0, 37.0, 40.0, 41.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 115.5, 116.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, ...]])
# only show where quality is 9
quality_tsd.loc[9]
total sulfur dioxidefixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidedensitypHsulphatesalcohol
85.06.60.360.291.60.02124.00.989653.410.6112.4
113.07.10.260.492.20.03231.00.990303.370.4212.9
119.06.90.360.344.20.01857.00.989803.280.3612.7
124.09.10.270.4510.60.03528.00.997003.200.4610.4
139.07.40.240.362.00.03127.00.990553.280.4812.5
# only show where quality is 3 or 4
quality_tsd.loc[[3, 4]]
qualitytotal sulfur dioxidefixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidedensitypHsulphatesalcohol
319.06.90.3900.404.600.0225.00.991503.310.3712.6
33.010.30.1700.471.400.0375.00.993902.890.289.6
57.07.60.4800.371.200.0345.00.992563.050.5410.4
66.07.10.3200.3211.000.03816.00.993703.240.4011.5
96.08.30.3300.421.150.03318.00.991103.200.3212.4
...
4225.09.80.2500.7410.000.05636.00.997703.060.4310.0
233.08.00.6600.7217.550.04262.00.999902.920.689.4
234.56.80.2900.161.400.038122.50.992203.150.4710.0
245.06.30.6000.4411.000.05050.00.997203.190.579.3
272.06.20.2550.241.700.039138.50.994523.530.539.6
quality_tsd.index
MultiIndex([(3,  19.0),
(3, 33.0),
(3, 57.0),
(3, 66.0),
(3, 96.0),
(3, 109.0),
(3, 111.0),
(3, 123.0),
(3, 134.0),
(3, 157.0),
...
(8, 179.0),
(8, 180.0),
(8, 186.0),
(8, 188.0),
(8, 212.5),
(9, 85.0),
(9, 113.0),
(9, 119.0),
(9, 124.0),
(9, 139.0)],
names=['quality', 'total sulfur dioxide'], length=777)
# only show averages for wines with a quality of 3 and a total sulfur dioxide of 111.0
quality_tsd.loc[(3, 111.)]

fixed acidity           6.2000
volatile acidity        0.2300
citric acid             0.3500
residual sugar          0.7000
chlorides               0.0510
free sulfur dioxide    24.0000
density                 0.9916
pH                      3.3700
sulphates               0.4300
alcohol                11.0000
Name: (3, 111.0), dtype: float64

Multi-Index Cross-Section

# cross-section - get all wines with quality 9
quality_tsd.xs(key=9, level='quality')
total sulfur dioxidefixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidedensitypHsulphatesalcohol
85.06.60.360.291.60.02124.00.989653.410.6112.4
113.07.10.260.492.20.03231.00.990303.370.4212.9
119.06.90.360.344.20.01857.00.989803.280.3612.7
124.09.10.270.4510.60.03528.00.997003.200.4610.4
139.07.40.240.362.00.03127.00.990553.280.4812.5
# cross-section - get all wines with tsd = 124
quality_tsd.xs(key=124., level='total sulfur dioxide')
qualityfixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidedensitypHsulphatesalcohol
48.4500000.3300000.5500003.7000000.04350012.5000000.9938903.0450000.44000010.550000
56.6615380.2746150.3092319.6384620.04476923.5384620.9957723.1615380.49692310.015385
66.8260870.2847830.3278265.9173910.04382628.5652170.9937083.1634780.44652210.534783
76.6166670.2850000.2958335.3333330.03750036.7500000.9917353.1658330.51750011.841667
86.9000000.2900000.3650004.8000000.03600034.5000000.9910153.1350000.37500012.300000
99.1000000.2700000.45000010.6000000.03500028.0000000.9970003.2000000.46000010.400000
# pre-select only wines with a pH value between 3.0-3.1
# and then group by quality and tsd
wine_dataset[wine_dataset['pH'].isin([3.0, 3.1])].groupby(['quality','total sulfur dioxide']).mean()
qualitytotal sulfur dioxidefixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidedensitypHsulphatesalcohol
495.06.0000000.5900000.0000000.800.03730.0000000.9903203.1000000.4010.900000
102.07.0333330.3933330.2233335.000.03712.6666670.9930133.0333330.7611.466667
106.08.5000000.2000000.4000001.100.04631.0000000.9919403.0000000.3510.500000
107.09.2000000.1600000.4900002.000.04418.0000000.9951403.1000000.5310.200000
111.06.5000000.2900000.2500002.500.1428.0000000.9927003.0000000.449.900000
...
7171.06.8000000.1800000.30000012.800.06219.0000000.9980803.0000000.529.000000
174.08.1000000.3000000.4900008.100.03726.0000000.9943003.1000000.3011.200000
189.06.9000000.3600000.28000013.550.04851.0000000.9978203.0000000.609.500000
193.07.6000000.1900000.32000018.750.04732.0000001.0001403.1000000.509.300000
8125.06.9000000.3600000.3500008.600.03837.0000000.9916003.0000000.3212.400000

Aggregation

# return std deviation minimum/maximum values
wine_dataset.agg(['std', 'min', 'max'], axis='rows')
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
std0.8438680.1007950.121025.0720580.02184817.00713742.4980650.0029910.1510010.1141261.2306210.885639
min3.8000000.0800000.000000.6000000.0090002.0000009.0000000.9871102.7200000.2200008.0000003.000000
max14.2000001.1000001.6600065.8000000.346000289.000000440.0000001.0389803.8200001.08000014.2000009.000000
# return std deviation, mean and minimum/maximum values for selected columns
wine_dataset.agg({'fixed acidity' : ['std', 'min'], 'citric acid' : ['std', 'mean']}, axis='rows')
      fixed acidity  citric acid
std        0.843868     0.121020
min        3.800000          NaN
mean            NaN     0.334192

Combining Dataframes

Concatenation

By Columns (axis=1)

data_one = { 'A': ['A0', 'A1', 'A2', 'A3', 'A4'], 'B': ['B0', 'B1', 'B2', 'B3', 'B4']}
data_two = { 'C': ['C0', 'C1', 'C2', 'C3', 'C4'], 'D': ['D0', 'D1', 'D2', 'D3', 'D4']}
one_df = pd.DataFrame(data_one)
two_df = pd.DataFrame(data_two)
pd.concat([one_df, two_df], axis=1)
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3
4  A4  B4  C4  D4

By Rows (axis=0)

data_three = { 'A': ['A0', 'A1', 'A2', 'A3', 'A4'], 'B': ['B0', 'B1', 'B2', 'B3', 'B4']}
data_four = { 'A': ['A5', 'A6', 'A7', 'A8', 'A9'], 'B': ['B5', 'B6', 'B7', 'B8', 'B9']}
three_df = pd.DataFrame(data_three)
four_df = pd.DataFrame(data_four)
pd.concat([three_df, four_df], axis=0, ignore_index=True)
    A   B
0  A0  B0
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4
5  A5  B5
6  A6  B6
7  A7  B7
8  A8  B8
9  A9  B9

Merge

Inner Join

registration_data = {
'reg_id': ['1', '2', '3', '4'],
'name': ['Manfred', 'Kurt', 'Vasili', 'Cassandra']
}

login_data = {
'log_id': ['1', '2', '3', '4'],
'name': ['Chantal', 'Manfred', 'Cassandra', 'Andrew']
}
reg_df = pd.DataFrame(registration_data)
log_df = pd.DataFrame(login_data)
# only select names that are present in both tables and merge
pd.merge(reg_df, log_df, how='inner', on='name')
  reg_id       name log_id
0      1    Manfred      2
1      4  Cassandra      3

Outer Join

# merge all
pd.merge(reg_df, log_df, how='outer', on='name')
  reg_id       name log_id
0      1    Manfred      2
1      2       Kurt    NaN
2      3     Vasili    NaN
3      4  Cassandra      3
4    NaN    Chantal      1
5    NaN     Andrew      4

Left Join

# merge left
pd.merge(reg_df, log_df, how='left', on='name')
  reg_id       name log_id
0      1    Manfred      2
1      2       Kurt    NaN
2      3     Vasili    NaN
3      4  Cassandra      3
# merge left / swapped df input
pd.merge(log_df, reg_df, how='left', on='name')
  log_id       name reg_id
0      1    Chantal    NaN
1      2    Manfred      1
2      3  Cassandra      4
3      4     Andrew    NaN

Right Join

# merge right
pd.merge(reg_df, log_df, how='right', on='name')
  reg_id       name log_id
0    NaN    Chantal      1
1      1    Manfred      2
2      4  Cassandra      3
3    NaN     Andrew      4
# merge right / swapped df input
pd.merge(log_df, reg_df, how='right', on='name')
  log_id       name reg_id
0      2    Manfred      1
1    NaN       Kurt      2
2    NaN     Vasili      3
3      3  Cassandra      4

Join by left_on / right_on

registration_data = {
'reg_id': ['1', '2', '3', '4'],
'name': ['Manfred', 'Kurt', 'Vasili', 'Cassandra']
}

login_data = {
'log_id': ['1', '2', '3', '4'],
'user': ['Chantal', 'Manfred', 'Cassandra', 'Andrew']
}
reg_df = pd.DataFrame(registration_data)
log_df = pd.DataFrame(login_data)
# merge on different columns
pd.merge(reg_df, log_df, how='inner', left_on='name', right_on='user').set_index('user')
          reg_id log_id
user                   
Manfred        1      2
Cassandra      4      3

Join Suffixes

registration_data = {
'id': ['1', '2', '3', '4'],
'name': ['Manfred', 'Kurt', 'Vasili', 'Cassandra']
}

login_data = {
'id': ['1', '2', '3', '4'],
'name': ['Chantal', 'Manfred', 'Cassandra', 'Andrew']
}
reg_df = pd.DataFrame(registration_data)
log_df = pd.DataFrame(login_data)
pd.merge(reg_df, log_df , how='inner', on='name', suffixes=('_reg', '_log'))
  id_reg       name id_log
0      1    Manfred      2
1      4  Cassandra      3

String Method

email = 'jl-picard@starfleet.com'
email.split('@')[0]
# 'jl-picard'

Pandas Str()

names = pd.Series(['Data', 'Geordie', 'Deanna', 'Worf', 'Riker', 'Beverly', '7'])
names.str.isdigit()
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
messy_api_response = [
'Burgdoggen, Tri-tip, leberkas, aute',
'T-bone, sint, dolor, consequat',
'Mollit, magna, proident, kielbasa'
]
pd.Series(messy_api_response).str.split(',').str[0]
0    Burgdoggen
1        T-bone
2        Mollit
dtype: object
pd.Series(messy_api_response).str.split(',', expand=True)
            0         1          2           3
0  Burgdoggen   Tri-tip   leberkas        aute
1      T-bone      sint      dolor   consequat
2      Mollit     magna   proident    kielbasa
messy_api_response_2 = [
'burgdoggen %',
't-bone%',
' mollit %'
]

pd.Series(messy_api_response_2).str.replace('%', '').str.strip()
0    burgdoggen
1        t-bone
2        mollit
dtype: object
# alternatively use apply()
def cleanup_strings(name):
    name = name.replace('%', '')
    name = name.strip()
    name = name.capitalize()
    return name

pd.Series(messy_api_response_2).apply(cleanup_strings)
0    Burgdoggen
1        T-bone
2        Mollit
dtype: object
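
The same cleanup can also stay entirely inside the .str accessor by chaining the calls; a quick sketch:

# strip the percent sign and whitespace, then capitalize - all via the .str accessor
pd.Series(messy_api_response_2).str.replace('%', '').str.strip().str.capitalize()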

Date-Time Method

Date-Time Object

# example datetime object
from datetime import datetime

year = 2023
month = 10
day = 5
hour = 4
minute = 44
seconds = 7

date = datetime(year, month, day, hour, minute, seconds)
date
# datetime.datetime(2023, 10, 5, 4, 44, 7)
# working with US datetime object
date_series = pd.Series([
'Aug 27, 1989',
'2021-04-01',
'2020-10-25 02:00 +0800',
'19th of Mar 2003'
])

# generate datetime objects
date_series = pd.to_datetime(date_series)
date_series
0          1989-08-27 00:00:00
1          2021-04-01 00:00:00
2    2020-10-25 02:00:00+08:00
3          2003-03-19 00:00:00
dtype: datetime64[ns]
date_series[1].month
# 4
# working with European datetime object
date_series_european = pd.Series(['31-03-2005', '01-04-2005', '28-04-2005'])
date_series_european = pd.to_datetime(date_series_european, dayfirst=True)
date_series_european
0   2005-03-31
1   2005-04-01
2   2005-04-28
dtype: datetime64[ns]
# formatting time strings
time_string = '06--Dec--2012'

pd.to_datetime(time_string, format='%d--%b--%Y')
# Timestamp('2012-12-06 00:00:00')
# import from csv dataset
sales = pd.read_csv('datasets/RetailSales_BeerWineLiquor.csv')
sales
           DATE  MRTSSM4453USN
0    1992-01-01           1509
1    1992-02-01           1541
2    1992-03-01           1597
3    1992-04-01           1675
4    1992-05-01           1822
..          ...            ...
335  2019-12-01           6630
336  2020-01-01           4388
337  2020-02-01           4533
338  2020-03-01           5562
339  2020-04-01           5207
340 rows × 2 columns
sales['DATE'] = pd.to_datetime(sales['DATE'])
print(sales['DATE'][0].year)
# 1992
sales['DATE']
0     1992-01-01
1     1992-02-01
2     1992-03-01
3     1992-04-01
4     1992-05-01
         ...    
335   2019-12-01
336   2020-01-01
337   2020-02-01
338   2020-03-01
339   2020-04-01
Name: DATE, Length: 340, dtype: datetime64[ns]
# import from csv dataset + automatically parse datetime object
sales_autoparse = pd.read_csv('datasets/RetailSales_BeerWineLiquor.csv', parse_dates=[0])
sales_autoparse['DATE'].dt.year
0      1992
1      1992
2      1992
3      1992
4      1992
       ... 
335    2019
336    2020
337    2020
338    2020
339    2020
Name: DATE, Length: 340, dtype: int64
# working with time indices
sales_time_index = sales_autoparse.set_index('DATE')

# group data by years and calculate the mean value
sales_time_index.resample('A').mean()
            MRTSSM4453USN
DATE                     
1992-12-31    1807.250000
1993-12-31    1794.833333
1994-12-31    1841.750000
1995-12-31    1833.916667
1996-12-31    1929.750000
1997-12-31    2006.750000
1998-12-31    2115.166667
1999-12-31    2206.333333
2000-12-31    2375.583333
2001-12-31    2468.416667
2002-12-31    2491.166667
2003-12-31    2539.083333
2004-12-31    2682.416667
2005-12-31    2797.250000
2006-12-31    3001.333333
2007-12-31    3177.333333
2008-12-31    3292.000000
2009-12-31    3353.750000
2010-12-31    3450.083333
2011-12-31    3532.666667
2012-12-31    3697.083333
2013-12-31    3839.666667
2014-12-31    4023.833333
2015-12-31    4212.500000
2016-12-31    4434.416667
2017-12-31    4602.666667
2018-12-31    4830.666667
2019-12-31    4972.750000
2020-12-31    4922.500000
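
The resampled result plots directly like any other dataframe; a minimal sketch (the title string is just illustrative):

# plot the yearly mean of the time-indexed sales data
sales_time_index.resample('A').mean().plot(title='Yearly Average Sales', figsize=(12,8))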

Pandas I/O

CSV

# read_csv
df_csv = pd.read_csv('datasets/RetailSales_BeerWineLiquor.csv', index_col=0)
df_csv
            MRTSSM4453USN
DATE                     
1992-01-01           1509
1992-02-01           1541
1992-03-01           1597
1992-04-01           1675
1992-05-01           1822
...                   ...
2019-12-01           6630
2020-01-01           4388
2020-02-01           4533
2020-03-01           5562
2020-04-01           5207
340 rows × 1 columns
# write to_csv
df_csv.to_csv('datasets/csv-test.csv')

Excel

dependency: pip install openpyxl

workbook = pd.ExcelFile('datasets/wine-quality.xlsx')
workbook.sheet_names
# ['Sheet1']
# read_excel
df_excel = pd.read_excel('datasets/wine-quality.xlsx', sheet_name='Sheet1')
# get all worksheets
df_excel_dictionary = pd.read_excel('datasets/wine-quality.xlsx', sheet_name=None)
# show all sheet keys
print(df_excel_dictionary.keys())
# dict_keys(['Sheet1'])

# select sheet by key
df_excel_dictionary['Sheet1']
# write to excel
df_excel.to_excel('datasets/test.xlsx', index=False)

HTML

dependency: pip install lxml

url = 'https://en.wikipedia.org/wiki/Star_Trek:_Deep_Space_Nine'
# read in an HTML table
tables = pd.read_html(url)
ds9_actors = tables[1]
ds9_actors
ActorCharacterPositionAppearancesCharacter's speciesRank
0Avery BrooksBenjamin SiskoCommanding OfficerSeasons 1–7HumanCommander (Seasons 1–3)Captain (Seasons 3–7)
1Avery BrooksBenjamin Sisko is the Starfleet officer placed...Benjamin Sisko is the Starfleet officer placed...Benjamin Sisko is the Starfleet officer placed...Benjamin Sisko is the Starfleet officer placed...Benjamin Sisko is the Starfleet officer placed...
2René AuberjonoisOdoChief of SecuritySeasons 1–7ChangelingConstable (unofficial)
3René AuberjonoisConstable Odo is the station's chief of securi...Constable Odo is the station's chief of securi...Constable Odo is the station's chief of securi...Constable Odo is the station's chief of securi...Constable Odo is the station's chief of securi...
4Alexander SiddigJulian BashirChief Medical OfficerSeasons 1–7HumanLieutenant, junior grade (Seasons 1–3) Lieuten...
5Alexander SiddigJulian Bashir is the station's chief medical o...Julian Bashir is the station's chief medical o...Julian Bashir is the station's chief medical o...Julian Bashir is the station's chief medical o...Julian Bashir is the station's chief medical o...
6Terry FarrellJadzia DaxChief Science OfficerSeasons 1–6TrillLieutenant (Seasons 1–3)Lieutenant commander (...
7Terry FarrellJadzia Dax is the station's science officer. A...Jadzia Dax is the station's science officer. A...Jadzia Dax is the station's science officer. A...Jadzia Dax is the station's science officer. A...Jadzia Dax is the station's science officer. A...
8Cirroc LoftonJake SiskoStudent (Seasons 1–5)Journalist (Seasons 5–7)Seasons 1–7HumanCivilian
9Cirroc LoftonJake is Benjamin Sisko's son. He at first rese...Jake is Benjamin Sisko's son. He at first rese...Jake is Benjamin Sisko's son. He at first rese...Jake is Benjamin Sisko's son. He at first rese...Jake is Benjamin Sisko's son. He at first rese...
10Colm MeaneyMiles O'BrienChief Operations OfficerSeasons 1–7HumanSenior chief petty officer
11Colm MeaneyMiles O'Brien is the Chief of Operations, resp...Miles O'Brien is the Chief of Operations, resp...Miles O'Brien is the Chief of Operations, resp...Miles O'Brien is the Chief of Operations, resp...Miles O'Brien is the Chief of Operations, resp...
12Armin ShimermanQuarkBar ownerSeasons 1–7FerengiCivilian
13Armin ShimermanQuark is the proprietor of a bar on Deep Space...Quark is the proprietor of a bar on Deep Space...Quark is the proprietor of a bar on Deep Space...Quark is the proprietor of a bar on Deep Space...Quark is the proprietor of a bar on Deep Space...
14Nana VisitorKira NerysFirst OfficerSeasons 1–7BajoranMajor (Seasons 1–6)Colonel (Season 7)Commander...
15Nana VisitorKira Nerys is the Bajoran military's liaison t...Kira Nerys is the Bajoran military's liaison t...Kira Nerys is the Bajoran military's liaison t...Kira Nerys is the Bajoran military's liaison t...Kira Nerys is the Bajoran military's liaison t...
16Michael DornWorfStrategic Operations OfficerFirst Officer, USS...Seasons 4–7KlingonLieutenant commander
17Michael DornThe fourth season saw the addition of Dorn to ...The fourth season saw the addition of Dorn to ...The fourth season saw the addition of Dorn to ...The fourth season saw the addition of Dorn to ...The fourth season saw the addition of Dorn to ...
18Nicole de BoerEzri DaxCounselorSeason 7TrillEnsign (Season 7)Lieutenant, junior grade (Sea...
19Nicole de BoerAfter the abrupt departure of Terry Farrell, E...After the abrupt departure of Terry Farrell, E...After the abrupt departure of Terry Farrell, E...After the abrupt departure of Terry Farrell, E...After the abrupt departure of Terry Farrell, E...
# write dataframe to HTML
ds9_actors.to_html('datasets/test.html', index=False)

SQL

Create an In-Memory SQL Database

Requirement: pip install sqlalchemy

from sqlalchemy import create_engine
temp_db = create_engine('sqlite:///:memory:')
np.random.seed(SEED)
random_df = pd.DataFrame(
data=np.random.randint(
low=0,
high=100,
size=(4,4)
), columns=['a','b','c','d']
)

random_df
    a   b   c   d
0  51  92  14  71
1  60  20  82  86
2  74  74  87  99
3  23   2  21  52
random_df.to_sql(name='random_table', con=temp_db, if_exists='fail')

Read SQL Data

sql_df = pd.read_sql(sql='random_table', con=temp_db)
sql_df
   index   a   b   c   d
0      0  51  92  14  71
1      1  60  20  82  86
2      2  74  74  87  99
3      3  23   2  21  52
random_df.to_sql(name='random_table', con=temp_db, if_exists='replace', index=False)
sql_df = pd.read_sql(sql='random_table', con=temp_db)
sql_df
    a   b   c   d
0  51  92  14  71
1  60  20  82  86
2  74  74  87  99
3  23   2  21  52

SQL Queries

sql_df = pd.read_sql(sql='SELECT a,d FROM random_table', con=temp_db)
sql_df
    a   d
0  14  71
1  82  86
2  87  99
3  21  52

Visualizations Overview

Pandas Dataframes and Series with Matplotlib and Seaborn

import matplotlib.pyplot as plt
import seaborn as sns
from numpy.random import randn, randint, uniform, sample
date_values_df = pd.DataFrame(
randn(1000),
index = pd.date_range(
'2019-10-15',
periods=1000
),
columns=['value']
)

timeseries = pd.Series(
randn(1000),
index = pd.date_range(
'2019-10-15',
periods=1000
)
)

Line Plot

date_values_df['value'] = date_values_df['value'].cumsum()
date_values_df.plot(title='Dataframe Cumulative Sum', figsize=(12,8))

Pandas Visualizations

timeseries = timeseries.cumsum()
timeseries.plot(title='Timeseries Cumulative Sum', figsize=(12,8))

Pandas Visualizations

Subplots

# complete dataset
iris_ds = sns.load_dataset('iris')
iris_ds.plot(
title='Iris Dataset',
figsize=(10,5),
legend=True,
logy=True
)

Pandas Visualizations

iris_ds.plot(
title='Iris Dataset',
subplots=True,
figsize=(10,5),
legend=True,
sharex=True,
logy=False
)

Pandas Visualizations

iris_ds.plot(
title='Iris Dataset',
subplots=True,
figsize=(10,5),
legend=True,
sharex=False,
layout=(2,2)
)

plt.tight_layout()

Pandas Visualizations

iris_dropped_width = iris_ds.drop(labels=['sepal_width', 'petal_width'], axis=1)
iris_dropped_width.head()
iris_dropped_length = iris_ds[['sepal_width', 'petal_width']]
iris_dropped_length.head()
# twin axes plot
ax = iris_dropped_width.plot(legend=True)
ax.legend(loc='upper left')
iris_dropped_length.plot(
title='Iris Dataset',
figsize=(10,5),
secondary_y = True,
ax = ax
)

Pandas Visualizations

Bar Plot

iris_dropped_species = iris_ds.drop(['species'], axis=1)
iris_dropped_species.iloc[0].plot.bar(
figsize=(10,5),
title='Iris Dataset'
)

Pandas Visualizations

Histogram

titanic_ds = sns.load_dataset('titanic')
titanic_ds.head()
survivedpclasssexagesibspparchfareembarkedclasswhoadult_maledeckembark_townalivealone
003male22.0107.2500SThirdmanTrueNaNSouthamptonnoFalse
111female38.01071.2833CFirstwomanFalseCCherbourgyesFalse
213female26.0007.9250SThirdwomanFalseNaNSouthamptonyesTrue
311female35.01053.1000SFirstwomanFalseCSouthamptonyesFalse
403male35.0008.0500SThirdmanTrueNaNSouthamptonnoTrue
ax = titanic_ds['pclass'].plot.hist(
figsize=(10,5),
title='Titanic - Passenger by Class Histogram'
)

Pandas Visualizations

iris_ds.plot.hist(
title="Iris Dataset - Petal Width Distribution by Species",
figsize=(10,5),
bins=20,
column=["petal_width"],
by="species"
)
plt.tight_layout()

Pandas Visualizations

iris_ds.plot.hist(
title="Iris Dataset - Histogram",
figsize=(10,5),
bins=20,
orientation='horizontal'
)

Pandas Visualizations

iris_ds.plot.hist(
title="Iris Dataset - Stacked Histogram",
figsize=(10,5),
bins=50,
stacked=True
)

Pandas Visualizations

iris_ds.hist(
figsize=(12,8),
bins=20,
by="species",
legend=True
)

Pandas Visualizations

iris_ds.hist(
figsize=(12,8),
bins=20,
color='mediumseagreen'
)

Pandas Visualizations

Stacked Bar Plot

rand_df = pd.DataFrame(randn(10,4), columns=['a', 'b', 'c', 'd'])
rand_df.head()
          a         b         c         d
0 -0.085012  1.394898  1.790166  0.167911
1  1.536115 -1.053770  0.000237 -1.468094
2 -1.479892 -0.480497 -0.685922 -1.975383
3 -1.217567 -1.199857  0.930655  0.478314
4  0.119575 -0.480929 -0.202524  0.728837
# compare distribution of a-d per row
rand_df.plot.bar(
stacked=True,
figsize=(10,5)
)

Pandas Visualizations

rand_df.plot.barh(
stacked=True,
figsize=(10,5)
)

Pandas Visualizations

Box Plot

iris_ds.plot.box(
figsize=(12,5),
by='species',
vert=True
)

Pandas Visualizations

iris_ds.describe()
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000

Area Plot

iris_ds.plot.area(
figsize=(12,5),
stacked=True
)

Pandas Visualizations

Scatter Plot

iris_ds.plot.scatter(
figsize=(12,5),
x='sepal_width',
y='sepal_length',
c='petal_width',
colormap='winter',
title='Iris Dataset - Sepal Width vs Sepal Length'
)

Pandas Visualizations

ax = iris_ds.plot.scatter(
figsize=(12,5),
x='sepal_width',
y='petal_width',
label='width',
c='dodgerblue'
)

iris_ds.plot.scatter(
figsize=(12,5),
x='sepal_length',
y='petal_length',
label='length',
c='fuchsia',
ax=ax
)

Pandas Visualizations

ax = iris_ds.plot.scatter(
figsize=(12,5),
x='sepal_length',
y='petal_length',
label='length',
c='fuchsia'
)

iris_ds.plot.scatter(
figsize=(12,5),
x='sepal_width',
y='petal_width',
label='width',
c='dodgerblue',
ax=ax
)

Pandas Visualizations

Hex Plot

iris_ds.plot.hexbin(
figsize=(10,5),
x = 'petal_length',
y = 'sepal_length',
gridsize=10,
C='sepal_width',
cmap="plasma",
title='Sepal & Petal Length vs Sepal Width'
)

Pandas Visualizations

Pie Plot

iris_dropped_species = iris_ds.drop(['species'], axis=1)
colors = plt.get_cmap('cool')(np.linspace(0.8, 0.1, 4))
iris_dropped_species.iloc[0].plot.pie(
figsize=(8,8),
colors=colors,
startangle=20,
autopct='%.2f',
title='Iris Dataset',
shadow=True
)

Pandas Visualizations

Subplots

colors2 = plt.get_cmap('magma')(np.linspace(0.9, 0.1, 4))

sample_df = iris_dropped_species.head(3).T
sample_df
                0    1    2
sepal_length  5.1  4.9  4.7
sepal_width   3.5  3.0  3.2
petal_length  1.4  1.4  1.3
petal_width   0.2  0.2  0.2
sample_df.plot.pie(
subplots=True,
colors=colors2,
figsize=(25,25),
fontsize=16
)

Pandas Visualizations

Scatter Matrix

from pandas.plotting import scatter_matrix
scatter_matrix(
iris_dropped_species,
figsize=(8,8),
diagonal='kde',
color='dodgerblue'
)

Pandas Visualizations