BBC Sound Effects Sound Clip Generation
In [6]:
%run '/Users/edward/Documents/Scripts/Python/init_py.py' #macosx
# exec(open("D:/Edward/Documents/Assignments/Scripts/Python/init_py.py").read())
addpythonpkg('ReadNWrite')
addpythonpkg('generic')
from MATLAB import *
import os
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns
import re
import numpy as np
from scipy import signal
from IPython import display
import time
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from pdb import set_trace
Preliminary data cleaning¶
Keep only the labeled data for now¶
In [5]:
df = pd.read_csv('./BBCSoundEffects.csv')
df = df.loc[~df['category'].isnull(), :]
df.to_csv('./BBCSoundEffect_labeld.csv', index=False)
df.head()
Out[5]:
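As a quick sanity check (a minimal sketch, not part of the original pipeline), the share of rows that actually carry a category label can be computed before filtering:
In [ ]:
# Quick check: what fraction of the full catalogue carries a category label?
df_all = pd.read_csv('./BBCSoundEffects.csv')
labeled_fraction = df_all['category'].notnull().mean()
print('{:.1%} of {} rows are labeled'.format(labeled_fraction, len(df_all)))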
Getting animal sounds¶
In [24]:
Summary.shape
Out[24]:
In [ ]:
df = pd.read_csv('./BBCSoundEffect_labeld.csv')
df['SimpleCategory'] = df['category'].apply(lambda x: x.split(":")[0].strip())
gp = df.groupby(by=['SimpleCategory', 'CDName'], as_index=False, sort=True)
Summary = gp.count()
Summary2 = gp.sum()
Summary['secs'] = Summary2['secs']
Summary = Summary.sort_values('secs', ascending=False)
Summary
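To pull out the animal-related sounds specifically, the summary can be filtered on SimpleCategory. This is a minimal sketch; 'Animal' is an assumed substring of the category label and should be adjusted to whatever actually appears in the CSV.
In [ ]:
# Filter the summary down to animal-related categories.
# 'Animal' is an assumed substring of SimpleCategory; adjust it to the real label.
animal_summary = Summary.loc[Summary['SimpleCategory'].str.contains('Animal', case=False), :]
animal_summary.head()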
Text clustering of the descriptions¶
In [6]:
import string
import collections
import nltk
nltk.data.path.append("/Volumes/SD/nltk_data")
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
# from gensim.models import Word2Vec
from pprint import pprint
def process_text(text, stem=True, stop_words=None, lower=False, pos=None):
""" Tokenize text and stem words removing punctuation """
text = text.translate(str.maketrans('','',string.punctuation))
tokens = word_tokenize(text)
if stop_words is not None:
        tokens = [w for w in tokens if w not in stop_words]
if lower:
tokens = [w.lower() for w in tokens]
if pos is not None: # checking part of speech
for i, w in enumerate(tokens):
tmp_pos = set([k.pos() for k in wn.synsets(w)])
if not tmp_pos or not tmp_pos <= pos:
tokens[i] = None
tokens = [w for w in tokens if w is not None] # remove None
if stem:
stemmer = PorterStemmer()
tokens = [stemmer.stem(t) for t in tokens]
return tokens
def cluster_texts(texts, clusters=3):
""" Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
vectorizer = TfidfVectorizer(tokenizer=process_text,
stop_words=stopwords.words('english'),
lowercase=True)
text_model = vectorizer.fit_transform(texts)
km_model = KMeans(n_clusters=clusters, random_state=42)
km_model.fit(text_model)
clustering = collections.defaultdict(list)
for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)  # collect the row indices belonging to each cluster
return text_model, clustering
df = pd.read_csv('./BBCSoundEffects.csv')
articles = df['description'].tolist()
tfidf_model, clusters = cluster_texts(articles, 7)
# Relabel the data
df['clusterLabel'] = np.nan
for k in clusters.keys():
df.loc[clusters[k], 'clusterLabel'] = k
X = tfidf_model.todense()
#pprint(dict(clusters))
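The choice of 7 clusters is somewhat arbitrary. A quick elbow plot of the K-Means inertia over the TF-IDF matrix (a minimal sketch reusing tfidf_model from the cell above) is one way to sanity-check it:
In [ ]:
# Elbow check: K-Means inertia for a range of cluster counts on the TF-IDF matrix.
inertias = []
ks = range(2, 12)
for k in ks:
    km = KMeans(n_clusters=k, random_state=42).fit(tfidf_model)
    inertias.append(km.inertia_)
plt.plot(list(ks), inertias, 'o-')
plt.xlabel('Number of clusters')
plt.ylabel('K-Means inertia')
plt.show()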
In [7]:
# What is the topic of each cluster?
from nltk.corpus import wordnet as wn
def get_key_word(index = 0, show_df=False):
df_sub11 = df.loc[df['clusterLabel']==index,['description', 'category', 'CDName']]
if show_df:
display.display(df_sub11)
A = df_sub11.drop_duplicates().dropna().values.flatten()
B = [process_text(a) for a in A]
    B = [item for sublist in B for item in sublist]  # flatten the list of token lists
fdist = nltk.FreqDist(B)
return fdist.most_common(10)
nouns = wn.all_synsets('n')
num_cat = len(df['clusterLabel'].unique())
key_words = [[]] * num_cat
for i in range(num_cat):
key_word = get_key_word(i, False)
print(key_word)
key_words[i] = key_word[0][0]
#word_list = [kk for kk, cc in key_word]
#wn.synsets(w)[0].pos()
print('=================================')
PCA of clusters¶
In [8]:
from sklearn.decomposition import PCA
key_words[0] = 'comedi'  # manually set the label for cluster 0
pca = PCA(n_components=2).fit(X)
data2D = pca.transform(X)
labels = key_words
df['clusterName'] = np.nan
for n, l in enumerate(labels):
    df.loc[df['clusterLabel']==n, 'clusterName'] = l
df_plot = df.copy()
df_plot['x'] = data2D[:,0]
df_plot['y'] = data2D[:,1]
sns.scatterplot(x='x', y='y', hue='clusterName', data=df_plot)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.gcf().savefig('PCA_clustering.png', bbox_inches='tight', dpi=300)
plt.show()
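Two components capture only a slice of the high-dimensional TF-IDF space, so it is worth checking how much variance the 2-D projection actually explains (a minimal sketch using the pca object fitted above):
In [ ]:
# How much of the TF-IDF variance survives the 2-D PCA projection?
print(pca.explained_variance_ratio_)
print('Total explained variance: {:.1%}'.format(pca.explained_variance_ratio_.sum()))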
In [9]:
df.to_csv('./BBC_Sound_Effect_clustered.csv', index=False)
np.savez_compressed('./clustering.npz', tfidf_model=tfidf_model, clusters=clusters)
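np.savez_compressed will pickle these two Python objects rather than store them as plain arrays. An alternative (a minimal sketch; the file names here are made up) is to save the sparse TF-IDF matrix with scipy.sparse.save_npz and the cluster assignments with pickle:
In [ ]:
# Alternative serialisation: keep the TF-IDF matrix sparse and the clusters as a plain dict.
# './tfidf_model.npz' and './clusters.pkl' are made-up file names.
import pickle
from scipy import sparse
sparse.save_npz('./tfidf_model.npz', tfidf_model)  # sparse TF-IDF matrix
with open('./clusters.pkl', 'wb') as f:
    pickle.dump(dict(clusters), f)  # cluster label -> row indices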
In [10]:
df.head()
Out[10]:
TSNE¶
In [6]:
#from sklearn.manifold import TSNE
from MulticoreTSNE import MulticoreTSNE as TSNE
tsne = TSNE(n_jobs=4, n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(X)
plt.scatter(tsne_results[:, 0], tsne_results[:, 1])
plt.show()
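The bare scatter hides the cluster structure. Coloring the t-SNE embedding by the K-Means labels (a minimal sketch, assuming df still carries the clusterName column from the PCA section) makes it easier to compare against the PCA view:
In [ ]:
# Color the t-SNE embedding by cluster, mirroring the PCA plot above.
df_tsne = df.copy()
df_tsne['x'] = tsne_results[:, 0]
df_tsne['y'] = tsne_results[:, 1]
sns.scatterplot(x='x', y='y', hue='clusterName', data=df_tsne)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()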
Download some data from the clusters¶
In [11]:
df = pd.read_csv('./BBC_Sound_Effect_clustered.csv')
gp = df.groupby(by=['clusterLabel', 'clusterName'], sort=True, as_index=False)
Summary = gp.count()
Summary2 = gp.sum()
Summary['secs'] = Summary2['secs']
Summary
Out[11]:
In [12]:
df2 = df.sort_values(by=['clusterLabel'])
df2 = df2.loc[df2['clusterLabel'].isin([3, 5]), :]  # keep only the two clusters of interest (clock and footstep)
df2.to_csv('./clock_footstep_sorted.csv', index=False)
Download the sound data to be explored¶
In [13]:
import urllib3
import shutil
import time
url_format = 'http://bbcsfx.acropolis.org.uk/assets/{}'
path_format = './wav/{}'
http = urllib3.PoolManager()
def download_file(url, path):
if not os.path.isfile(path): # not already downloaded
with http.request('GET', url, preload_content=False) as r, open(path, 'wb') as out_file:
shutil.copyfileobj(r, out_file)
df2 = df2.reset_index()
num_downloads = df2.shape[0]
printProgressBar(0,num_downloads,prefix='Progress:',suffix='Complete',length=50,mode="counts")
for i in df2.index:
printProgressBar(i+1,num_downloads,prefix='Progress:',suffix='Complete',length=50,mode="counts")
url = url_format.format(df2.loc[i, 'location'])
path = path_format.format(df2.loc[i, 'location'])
if os.path.isfile(path):
time.sleep(0.001)
continue # skip ones already downloaded
download_file(url, path)
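Downloads occasionally fail or come back truncated. A quick pass over the expected files (a minimal sketch; the 1 KB threshold is an arbitrary choice) flags anything missing or suspiciously small:
In [ ]:
# Flag downloads that are missing or suspiciously small (< 1 KB).
bad_files = [loc for loc in df2['location']
             if not os.path.isfile(path_format.format(loc))
             or os.path.getsize(path_format.format(loc)) < 1024]
print('{} files missing or truncated'.format(len(bad_files)))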
Spectrogram¶
In [140]:
# Spectrogram
import librosa
import audioread
def make_spectrogram(wav_files, labels=['clock', 'footstep'], sr=44100):
fig, ax = plt.subplots(3, 2)
for i, wav in enumerate(wav_files):
current_path = "./wav/{}".format(wav)
        tmp = audioread.audio_open(current_path)  # get the file's sample rate
        data_out, sr = librosa.load(current_path, sr=tmp.samplerate, dtype=np.float32)  # use the native rate throughout
cmap = 'CMRmap'#'nipy_spectral'
ax[0,i].plot(np.linspace(0, len(data_out)/sr,len(data_out)), data_out, c=(174/255, 199/255, 232/255))
ax[0,i].set_title(labels[i])
frequencies, times, spectrogram = signal.spectrogram(data_out, sr)
ax[1,i].pcolormesh(times, frequencies, spectrogram, cmap=cmap)
        ax[1,i].set_title('Original Spectrogram')
ax[2,i].pcolormesh(times, frequencies, spectrogram, cmap=cmap)
ax[2,i].set_ylim(0, 5000)
ax[2,i].set_title('Zoom Original Spectrogram')
[a.set_xlabel('Time (sec)') for a in ax.flatten()]
[a.set_ylabel('Frequency (Hz)') for a in ax[1:,:].flatten()]
fig.subplots_adjust(wspace=0.25, hspace=0.5)
fig.set_size_inches(10,8)
return fig
clock = '07016099.wav'
footstep = '07004099.wav'
fig = make_spectrogram([clock, footstep])
fig.savefig('./Example_audio_spectrogram.png')
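The raw power spectrogram compresses most of the detail into the low end. A dB-scaled view (a minimal sketch, recomputed for the clock example) often reads better:
In [ ]:
# dB-scaled spectrogram of the clock example, for comparison with the linear view above.
data_db, sr_db = librosa.load('./wav/07016099.wav', sr=None, dtype=np.float32)
frequencies, times, spec = signal.spectrogram(data_db, sr_db)
plt.pcolormesh(times, frequencies, 10 * np.log10(spec + 1e-12), cmap='CMRmap')
plt.ylim(0, 5000)
plt.xlabel('Time (sec)')
plt.ylabel('Frequency (Hz)')
plt.colorbar(label='Power (dB)')
plt.show()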
Periodicity of the sounds¶
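Clock ticks and footsteps are roughly periodic, so a rough estimate of the repetition period can be read off the autocorrelation of the amplitude envelope. This is a hedged sketch, not part of the original analysis; the Hilbert envelope and the 0.1 s minimum lag are assumptions.
In [ ]:
# Rough period estimate: autocorrelation of the amplitude envelope.
def estimate_period(path, min_lag_sec=0.1):
    data, sr = librosa.load(path, sr=None, dtype=np.float32)
    envelope = np.abs(signal.hilbert(data))  # amplitude envelope
    envelope = envelope - envelope.mean()
    acorr = signal.correlate(envelope, envelope, mode='full', method='fft')
    acorr = acorr[len(envelope) - 1:]        # keep non-negative lags
    min_lag = int(min_lag_sec * sr)          # skip the zero-lag peak
    best_lag = min_lag + np.argmax(acorr[min_lag:])
    return best_lag / sr

print('clock period ~ {:.2f} s'.format(estimate_period('./wav/07016099.wav')))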
Audio Segmentation¶
In [128]:
from auditok.cmdline import file_to_audio_source, seconds_to_str_fromatter, TokenizerWorker, LogWorker
from auditok.core import StreamTokenizer
from auditok.io import PyAudioSource, BufferAudioSource, StdinAudioSource, player_for
from auditok.util import ADSFactory, AudioEnergyValidator
import sys
import logging
import librosa
import audioread
def audio_segmentation(wavefile, analysis_window=0.01, max_time=None, energy_threshold=55, min_duration=0.2,
max_duration=5, max_silence=0.3, quiet=True, printf="{id} {start} {end}",
debug=False, time_format='%S', drop_trailing_silence=False):
# load the data
asource = file_to_audio_source(filename=wavefile, uc=1)
# Logging for segmentation
LOGGER_NAME = "AUDITOK_LOGGER"
logger = logging.getLogger(LOGGER_NAME)
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.CRITICAL)
logger.addHandler(handler)
record = True
# Dropping trailing tokenizer
mode = StreamTokenizer.DROP_TRAILING_SILENCE if drop_trailing_silence else 0
ads = ADSFactory.ads(audio_source=asource, block_dur=analysis_window, max_time=max_time,
record=record)
validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=energy_threshold)
# Segmentation parameters
analysis_window_per_second = 1. / analysis_window
tokenizer = StreamTokenizer(validator=validator, min_length=min_duration * analysis_window_per_second,
max_length=int(max_duration * analysis_window_per_second),
max_continuous_silence=max_silence * analysis_window_per_second,
mode = mode)
observers = []
tokenizer_worker = None
oformat = printf.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "\r")
converter = seconds_to_str_fromatter(time_format)
log_worker = LogWorker(print_detections = not quiet, output_format=oformat,
time_formatter=converter, logger=logger, debug=debug)
observers.append(log_worker)
# Do the segmentation
tokenizer_worker = TokenizerWorker(ads, tokenizer, analysis_window, observers)
# start observer threads
for obs in observers:
obs.start()
# start tokenization thread
tokenizer_worker.start()
    time.sleep(1)  # give the worker threads a moment to finish; long files may need more time
tokenizer_worker = None
return log_worker.detections
In [301]:
clock = '07016099.wav'
footstep = '07004099.wav'
# Plot
plt.style.use('dark_background')
fig, ax = plt.subplots(1, 2)
titles = ['clock', 'footstep']
energy_threshold=[55, 55]
for i, w in enumerate([clock, footstep]):
current_path = "./wav/{}".format(w)
# Segment
detections = audio_segmentation(current_path, energy_threshold=energy_threshold[i], max_duration=4)
# Plot
    tmp = audioread.audio_open(current_path)  # get the sample rate
sr = tmp.samplerate
data_out, _ = librosa.load(current_path, sr=sr, dtype=np.float32)
time_vec = np.linspace(0, len(data_out)/sr,len(data_out))
ax[i].plot(time_vec, data_out)
ylevel = ax[i].get_ylim()
for _, _, _, start_time, end_time in detections:
ax[i].fill_betweenx(ylevel, start_time, end_time, alpha=0.35, color='yellow', edgecolor='w', lw=5)
ax[i].set_title(titles[i])
fig.set_size_inches(15, 3)
In [338]:
clock = '07016230.wav'
footstep = '07064103.wav'
# Plot
plt.style.use('dark_background')
fig, ax = plt.subplots(1, 2)
titles = ['clock', 'footstep']
energy_threshold=[72, 55]
for i, w in enumerate([clock, footstep]):
current_path = "./wav/{}".format(w)
# Segment
detections = audio_segmentation(current_path, energy_threshold=energy_threshold[i], max_duration=30)
# Plot
    tmp = audioread.audio_open(current_path)  # get the sample rate
sr = tmp.samplerate
data_out, _ = librosa.load(current_path, sr=sr, dtype=np.float32)
time_vec = np.linspace(0, len(data_out)/sr,len(data_out))
ax[i].plot(time_vec, data_out)
ylevel = ax[i].get_ylim()
for _, _, _, start_time, end_time in detections:
ax[i].fill_betweenx(ylevel, start_time, end_time, alpha=0.35, color='yellow', edgecolor='w', lw=5)
ax[i].set_title(titles[i])
fig.set_size_inches(15, 3)
In [334]:
display.Audio('./wav/{}'.format(clock))
Out[334]:
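Since the goal is to generate sound clips, the detections can be turned into individual snippets. Below is a minimal sketch that slices the waveform from the last loop iteration above (the footstep file) by the detected start/end times and plays the first segment:
In [ ]:
# Slice the detected segments out of the waveform from the last loop iteration (footstep file).
segments = [data_out[int(start * sr):int(end * sr)]
            for _, _, _, start, end in detections]
print('{} segments extracted'.format(len(segments)))
display.Audio(segments[0], rate=sr)  # listen to the first detected clip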