Audio Classification with Computer Vision

The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings suitable for benchmarking methods of environmental sound classification.

The dataset consists of 5-second-long recordings organized into 50 semantic classes (with 40 examples per class), loosely arranged into 5 major categories: animals; natural soundscapes and water sounds; human (non-speech) sounds; interior/domestic sounds; and exterior/urban noises.

import numpy as np
from matplotlib import pyplot as plt
from numpy.lib import stride_tricks
import os
import pandas as pd
import scipy.io.wavfile as wav

Visualize the Dataset

esc50_df = pd.read_csv('dataset/ESC-50/esc50.csv')
esc50_df.head()
   filename           fold  target  category        esc10  src_file  take
0  1-100032-A-0.wav      1       0  dog              True    100032     A
1  1-100038-A-14.wav     1      14  chirping_birds  False    100038     A
2  1-100210-A-36.wav     1      36  vacuum_cleaner  False    100210     A
3  1-100210-B-36.wav     1      36  vacuum_cleaner  False    100210     B
4  1-101296-A-19.wav     1      19  thunderstorm    False    101296     A
esc50_df['category'].value_counts()

dog                 40
glass_breaking      40
drinking_sipping    40
rain                40
insects             40
laughing            40
hen                 40
engine              40
breathing           40
crying_baby         40
hand_saw            40
coughing            40
snoring             40
chirping_birds      40
toilet_flush        40
pig                 40
washing_machine     40
clock_tick          40
sneezing            40
rooster             40
sea_waves           40
siren               40
cat                 40
door_wood_creaks    40
helicopter          40
crackling_fire      40
car_horn            40
brushing_teeth      40
vacuum_cleaner      40
thunderstorm        40
door_wood_knock     40
can_opening         40
crow                40
clapping            40
fireworks           40
chainsaw            40
airplane            40
mouse_click         40
pouring_water       40
train               40
sheep               40
water_drops         40
church_bells        40
clock_alarm         40
keyboard_typing     40
wind                40
footsteps           40
frog                40
cow                 40
crickets            40
Name: category, dtype: int64
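
Every class holds exactly 40 clips. The filename itself encodes the metadata columns; assuming the documented {FOLD}-{CLIP_ID}-{TAKE}-{TARGET}.wav naming scheme of ESC-50, a quick cross-check of the first row:

fname = esc50_df['filename'].iloc[0]              # '1-100032-A-0.wav'
fold, clip_id, take, target = fname[:-4].split('-')
print(fold, clip_id, take, target)                # 1 100032 A 0 -> matches fold, src_file, take and target above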

def fourier_transformation(sig, frameSize, overlapFac=0.5, window=np.hanning):
    # short-time Fourier transform: split the signal into overlapping,
    # windowed frames and apply a real FFT to each frame
    win = window(frameSize)
    hopSize = int(frameSize - np.floor(overlapFac * frameSize))

    # zeros at beginning (thus center of 1st window should be for sample nr. 0)
    samples = np.append(np.zeros(int(np.floor(frameSize/2.0))), sig)
    # number of frames needed to cover the signal
    cols = int(np.ceil((len(samples) - frameSize) / float(hopSize))) + 1
    # zeros at end (thus samples can be fully covered by frames)
    samples = np.append(samples, np.zeros(frameSize))

    frames = stride_tricks.as_strided(samples, shape=(cols, frameSize),
                                      strides=(samples.strides[0]*hopSize, samples.strides[0])).copy()
    frames *= win

    return np.fft.rfft(frames)
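
With the default 50% overlap the hop size is frameSize/2, and the real FFT of a frame of length frameSize yields frameSize/2 + 1 frequency bins. A quick shape check on synthetic input (an illustrative sketch, not part of the original notebook):

sig = np.random.randn(5 * 44100)           # 5 seconds of noise at 44.1 kHz
spec = fourier_transformation(sig, 2**10)  # 1024-sample frames, 512-sample hop
print(spec.shape)                          # (~431 frames, 513 frequency bins)
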
def make_logscale(spec, sr=44100, factor=20.):
    # merge FFT bins onto a power-law frequency axis; factor=1.0 keeps the
    # axis linear, larger factors compress high frequencies into fewer bins
    timebins, freqbins = np.shape(spec)

    scale = np.linspace(0, 1, freqbins) ** factor
    scale *= (freqbins-1)/max(scale)
    scale = np.unique(np.round(scale))

    # create spectrogram with new freq bins
    newspec = np.zeros([timebins, len(scale)], dtype=np.complex128)
    for i in range(0, len(scale)):        
        if i == len(scale)-1:
            newspec[:,i] = np.sum(spec[:,int(scale[i]):], axis=1)
        else:        
            newspec[:,i] = np.sum(spec[:,int(scale[i]):int(scale[i+1])], axis=1)

    # list center freq of bins
    allfreqs = np.abs(np.fft.fftfreq(freqbins*2, 1./sr)[:freqbins+1])
    freqs = []
    for i in range(0, len(scale)):
        if i == len(scale)-1:
            freqs += [np.mean(allfreqs[int(scale[i]):])]
        else:
            freqs += [np.mean(allfreqs[int(scale[i]):int(scale[i+1])])]

    return newspec, freqs
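
The factor argument controls how aggressively high frequencies are merged; with factor=1.0 (as used below) the axis stays essentially linear. A quick comparison on synthetic data (again a hypothetical check, not from the notebook):

spec = fourier_transformation(np.random.randn(44100), 2**10)
_, lin_freqs = make_logscale(spec, sr=44100, factor=1.0)
_, log_freqs = make_logscale(spec, sr=44100, factor=20.)
print(len(lin_freqs), len(log_freqs))  # 513 linear bins vs. far fewer log-spaced bins
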
def plot_spectrogram(location, category, plotpath=None, binsize=2**10, colormap="jet"):
    samplerate, samples = wav.read(location)

    s = fourier_transformation(samples, binsize)

    sshow, freq = make_logscale(s, factor=1.0, sr=samplerate)

    with np.errstate(divide='ignore'):
        ims = 20.*np.log10(np.abs(sshow)/10e-6) # amplitude to decibel

    timebins, freqbins = np.shape(ims)

    print("timebins: ", timebins)
    print("freqbins: ", freqbins)

    plt.figure(figsize=(15, 7.5))
    plt.title('Class Label: ' + category)
    plt.imshow(np.transpose(ims), origin="lower", aspect="auto", cmap=colormap, interpolation="none")
    plt.colorbar()

    plt.xlabel("time (s)")
    plt.ylabel("frequency (Hz)")
    plt.xlim([0, timebins-1])
    plt.ylim([0, freqbins])

    xlocs = np.float32(np.linspace(0, timebins-1, 5))
    plt.xticks(xlocs, ["%.02f" % l for l in ((xlocs*len(samples)/timebins)+(0.5*binsize))/samplerate])
    ylocs = np.int16(np.round(np.linspace(0, freqbins-1, 10)))
    plt.yticks(ylocs, ["%.02f" % freq[i] for i in ylocs])

    if plotpath:
        plt.savefig(plotpath, bbox_inches="tight")
    else:
        plt.show()

    plt.clf()

    return ims
plot = plot_spectrogram('dataset/ESC-50/audio/' + esc50_df[esc50_df['category'] == 'crow']['filename'].iloc[0], category='Crow')

[Figure: spectrogram of the crow recording]

plot = plot_spectrogram('dataset/ESC-50/audio/' + esc50_df[esc50_df['category'] == 'toilet_flush']['filename'].iloc[0], category='Toilet Flush')

[Figure: spectrogram of the toilet-flush recording]

Data Preprocessing

def audio_vis(location, filepath, binsize=2**10, colormap="jet"):
    # same pipeline as plot_spectrogram, but renders the spectrogram without
    # axes, title, or colorbar and saves it as a training image
    samplerate, samples = wav.read(location)

    s = fourier_transformation(samples, binsize)

    sshow, freq = make_logscale(s, factor=1.0, sr=samplerate)

    with np.errstate(divide='ignore'):
        ims = 20.*np.log10(np.abs(sshow)/10e-6) # amplitude to decibel

    timebins, freqbins = np.shape(ims)

    plt.figure(figsize=(15, 7.5))
    plt.imshow(np.transpose(ims), origin="lower", aspect="auto", cmap=colormap, interpolation="none")

    plt.axis('off')
    plt.xlim([0, timebins-1])
    plt.ylim([0, freqbins])
    
    plt.savefig(filepath, bbox_inches="tight")
    plt.close()

    return
conversion = []

for i in range(len(esc50_df.index)):
    
    filename = esc50_df['filename'].iloc[i]
    location = 'dataset/ESC-50/audio/' + filename
    category = esc50_df['category'].iloc[i]
    catpath = 'dataset/ESC-50/spectrogram/' + category
    filepath = catpath + '/' + filename[:-4] + '.jpg'

    conversion.append((location, filepath))  # store as a (source wav, target jpg) tuple; a set would lose the ordering
conversion[0]
('dataset/ESC-50/audio/1-100032-A-0.wav', 'dataset/ESC-50/spectrogram/dog/1-100032-A-0.jpg')
for i in range(len(esc50_df.index)):
    
    filename = esc50_df['filename'].iloc[i]
    location = 'dataset/ESC-50/audio/' + filename
    category = esc50_df['category'].iloc[i]
    catpath = 'dataset/ESC-50/spectrogram/' + category
    filepath = catpath + '/' + filename[:-4] + '.jpg'

    os.makedirs(catpath, exist_ok=True)
    
    audio_vis(location, filepath)
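
Converting all 2,000 clips takes a while; once it finishes, a quick count (a sanity check added here, not part of the original run) confirms one spectrogram per recording:

import glob
print(len(glob.glob('dataset/ESC-50/spectrogram/*/*.jpg')))  # expect 2000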

Train-Test Split

!pip install split-folders
import splitfolders
input_folder = 'dataset/ESC-50/spectrogram'
output = 'data'

splitfolders.ratio(input_folder, output=output, seed=42, ratio=(.8, .2))
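
Because only two ratios are given, split-folders creates train and val subfolders while keeping the per-category structure. A quick count (a hypothetical check, not from the original notebook) verifies the roughly 80/20 split:

import glob
print(len(glob.glob('data/train/*/*.jpg')))  # ~1600 images (32 per class)
print(len(glob.glob('data/val/*/*.jpg')))    # ~400 images (8 per class)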

Prepare Validation Data

testing = [
    'data/test/helicopter.wav',
    'data/test/cat.wav'
]
The spectrogram conversion for these clips is identical to the training preprocessing, so audio_vis can be reused directly instead of defining a duplicate function:

audio_vis(testing[0], filepath='data/test/helicopter.jpg')
audio_vis(testing[1], filepath='data/test/cat.jpg')

Model Training

from ultralytics import YOLO

model = YOLO('yolov8n-cls.pt')    # nano-sized classification checkpoint
# model = YOLO('yolov8s-cls.pt')  # small variant, if more capacity is wanted

results = model.train(data='./data', epochs=20, imgsz=640)
metrics = model.val()
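
Ultralytics writes checkpoints and plots under runs/classify/train/ by default (assuming an unchanged project/name configuration), so the best-performing weights can be reloaded later:

best_model = YOLO('runs/classify/train/weights/best.pt')  # Ultralytics default path; adjust if configured differently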


print(metrics.top1)
print(metrics.top5)

0.7824999690055847
0.9524999856948853

Model Predictions

# Predict with the model
pred = model('data/test/helicopter.jpg')

# helicopter 0.56, crying_baby 0.09, crickets 0.08, sea_waves 0.07, snoring 0.07, 3.5ms
# Speed: 4.9ms preprocess, 3.5ms inference, 0.1ms postprocess per image at shape (1, 3, 640, 640)
# Predict with the model
pred = model('data/test/cat.jpg')

# cat 0.55, rooster 0.23, crying_baby 0.22, laughing 0.00, siren 0.00, 3.5ms
# Speed: 13.0ms preprocess, 3.5ms inference, 0.1ms postprocess per image at shape (1, 3, 640, 640)
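
To use a prediction programmatically rather than reading the logged summary, the class index and confidence can be pulled from the result's probs object; a minimal sketch, assuming the standard Ultralytics classification Results API:

res = pred[0]                            # results for the single input image
top1 = int(res.probs.top1)               # index of the most probable class
label = res.names[top1]                  # class-index -> name mapping
print(label, float(res.probs.top1conf))  # e.g. cat 0.55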