BBC Sound Effects Sound Clip Generation
In [6]:
%run '/Users/edward/Documents/Scripts/Python/init_py.py' #macosx
# exec(open("D:/Edward/Documents/Assignments/Scripts/Python/init_py.py").read())
addpythonpkg('ReadNWrite')
addpythonpkg('generic')
from MATLAB import *
import os
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns
import re
import numpy as np
from scipy import signal
from IPython import display
import time
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from pdb import set_trace
Preliminary data cleaning¶
Keep only the labeled data for now¶
In [5]:
df = pd.read_csv('./BBCSoundEffects.csv')
df = df.loc[~df['category'].isnull(), :]
df.to_csv('./BBCSoundEffect_labeld.csv', index=False)
df.head()
Out[5]:
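As a quick sanity check (a minimal sketch, not part of the original pipeline), the share of rows that actually carry a category label can be computed before filtering:
In [ ]:
# Quick check: what fraction of the full catalogue carries a category label?
df_all = pd.read_csv('./BBCSoundEffects.csv')
labeled_fraction = df_all['category'].notnull().mean()
print('{:.1%} of {} rows are labeled'.format(labeled_fraction, len(df_all)))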
Getting animal sounds¶
In [24]:
Summary.shape
Out[24]:
In [ ]:
df = pd.read_csv('./BBCSoundEffect_labeld.csv')
df['SimpleCategory'] = df['category'].apply(lambda x: x.split(":")[0].strip())
gp = df.groupby(by=['SimpleCategory', 'CDName'], as_index=False, sort=True)
Summary = gp.count()
Summary2 = gp.sum()
Summary['secs'] = Summary2['secs']
Summary = Summary.sort_values('secs', ascending=False)
Summary
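To pull out the animal-related sounds specifically, the summary can be filtered on SimpleCategory. This is a minimal sketch; 'Animal' is an assumed substring of the category label and should be adjusted to whatever actually appears in the CSV.
In [ ]:
# Filter the summary down to animal-related categories.
# 'Animal' is an assumed substring of SimpleCategory; adjust it to the real label.
animal_summary = Summary.loc[Summary['SimpleCategory'].str.contains('Animal', case=False), :]
animal_summary.head()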
Text clustering of the descriptions¶
In [6]:
import string
import collections
import nltk
nltk.data.path.append("/Volumes/SD/nltk_data")
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
# from gensim.models import Word2Vec
from pprint import pprint
def process_text(text, stem=True, stop_words=None, lower=False, pos=None):
""" Tokenize text and stem words removing punctuation """
text = text.translate(str.maketrans('','',string.punctuation))
tokens = word_tokenize(text)
if stop_words is not None:
        tokens = [w for w in tokens if w not in stop_words]
if lower:
tokens = [w.lower() for w in tokens]
if pos is not None: # checking part of speech
for i, w in enumerate(tokens):
tmp_pos = set([k.pos() for k in wn.synsets(w)])
if not tmp_pos or not tmp_pos <= pos:
tokens[i] = None
tokens = [w for w in tokens if w is not None] # remove None
if stem:
stemmer = PorterStemmer()
tokens = [stemmer.stem(t) for t in tokens]
return tokens
def cluster_texts(texts, clusters=3):
""" Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
vectorizer = TfidfVectorizer(tokenizer=process_text,
stop_words=stopwords.words('english'),
lowercase=True)
text_model = vectorizer.fit_transform(texts)
km_model = KMeans(n_clusters=clusters, random_state=42)
km_model.fit(text_model)
clustering = collections.defaultdict(list)
for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)  # collect the row indices belonging to each cluster
return text_model, clustering
df = pd.read_csv('./BBCSoundEffects.csv')
articles = df['description'].tolist()
tfidf_model, clusters = cluster_texts(articles, 7)
# Relabel the data
df['clusterLabel'] = np.nan
for k in clusters.keys():
df.loc[clusters[k], 'clusterLabel'] = k
X = tfidf_model.todense()
#pprint(dict(clusters))
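The choice of 7 clusters is somewhat arbitrary. A quick elbow plot of the K-Means inertia over the TF-IDF matrix (a minimal sketch reusing tfidf_model from the cell above) is one way to sanity-check it:
In [ ]:
# Elbow check: K-Means inertia for a range of cluster counts on the TF-IDF matrix.
inertias = []
ks = range(2, 12)
for k in ks:
    km = KMeans(n_clusters=k, random_state=42).fit(tfidf_model)
    inertias.append(km.inertia_)
plt.plot(list(ks), inertias, 'o-')
plt.xlabel('Number of clusters')
plt.ylabel('K-Means inertia')
plt.show()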
In [7]:
# What is the topic of each cluster?
from nltk.corpus import wordnet as wn
def get_key_word(index = 0, show_df=False):
df_sub11 = df.loc[df['clusterLabel']==index,['description', 'category', 'CDName']]
if show_df:
display.display(df_sub11)
A = df_sub11.drop_duplicates().dropna().values.flatten()
B = [process_text(a) for a in A]
    B = [item for sublist in B for item in sublist]  # flatten the list of token lists
fdist = nltk.FreqDist(B)
return fdist.most_common(10)
nouns = wn.all_synsets('n')
num_cat = len(df['clusterLabel'].unique())
key_words = [[]] * num_cat
for i in range(num_cat):
key_word = get_key_word(i, False)
print(key_word)
key_words[i] = key_word[0][0]
#word_list = [kk for kk, cc in key_word]
#wn.synsets(w)[0].pos()
print('=================================')
PCA of clusters¶
In [8]:
from sklearn.decomposition import PCA
key_words[0] = 'comedi'  # manually set the label for cluster 0
pca = PCA(n_components=2).fit(X)
data2D = pca.transform(X)
labels = key_words
df['clusterName'] = np.nan
for n, l in enumerate(labels):
    df.loc[df['clusterLabel']==n, 'clusterName'] = l
df_plot = df.copy()
df_plot['x'] = data2D[:,0]
df_plot['y'] = data2D[:,1]
sns.scatterplot(x='x', y='y', hue='clusterName', data=df_plot)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.gcf().savefig('PCA_clustering.png', bbox_inches='tight', dpi=300)
plt.show()
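Two components capture only a slice of the high-dimensional TF-IDF space, so it is worth checking how much variance the 2-D projection actually explains (a minimal sketch using the pca object fitted above):
In [ ]:
# How much of the TF-IDF variance survives the 2-D PCA projection?
print(pca.explained_variance_ratio_)
print('Total explained variance: {:.1%}'.format(pca.explained_variance_ratio_.sum()))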
In [9]:
df.to_csv('./BBC_Sound_Effect_clustered.csv', index=False)
np.savez_compressed('./clustering.npz', tfidf_model=tfidf_model, clusters=clusters)
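np.savez_compressed will pickle these two Python objects rather than store them as plain arrays. An alternative (a minimal sketch; the file names here are made up) is to save the sparse TF-IDF matrix with scipy.sparse.save_npz and the cluster assignments with pickle:
In [ ]:
# Alternative serialisation: keep the TF-IDF matrix sparse and the clusters as a plain dict.
# './tfidf_model.npz' and './clusters.pkl' are made-up file names.
import pickle
from scipy import sparse
sparse.save_npz('./tfidf_model.npz', tfidf_model)  # sparse TF-IDF matrix
with open('./clusters.pkl', 'wb') as f:
    pickle.dump(dict(clusters), f)  # cluster label -> row indices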
In [10]:
df.head()
Out[10]:
TSNE¶
In [6]:
#from sklearn.manifold import TSNE
from MulticoreTSNE import MulticoreTSNE as TSNE
tsne = TSNE(n_jobs=4, n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(X)
plt.scatter(tsne_results[:, 0], tsne_results[:, 1])
plt.show()
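The bare scatter hides the cluster structure. Coloring the t-SNE embedding by the K-Means labels (a minimal sketch, assuming df still carries the clusterName column from the PCA section) makes it easier to compare against the PCA view:
In [ ]:
# Color the t-SNE embedding by cluster, mirroring the PCA plot above.
df_tsne = df.copy()
df_tsne['x'] = tsne_results[:, 0]
df_tsne['y'] = tsne_results[:, 1]
sns.scatterplot(x='x', y='y', hue='clusterName', data=df_tsne)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()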
Download some data from the clusters¶
In [11]:
df = pd.read_csv('./BBC_Sound_Effect_clustered.csv')
gp = df.groupby(by=['clusterLabel', 'clusterName'], sort=True, as_index=False)
Summary = gp.count()
Summary2 = gp.sum()
Summary['secs'] = Summary2['secs']
Summary
Out[11]:
In [12]:
df2 = df.sort_values(by=['clusterLabel'])
df2 = df2.loc[df2['clusterLabel'].isin([3, 5]), :]  # keep only the two clusters of interest (clock and footstep)
df2.to_csv('./clock_footstep_sorted.csv', index=False)
Download the sound data to be explored¶
In [13]:
import urllib3
import shutil
import time
url_format = 'http://bbcsfx.acropolis.org.uk/assets/{}'
path_format = './wav/{}'
http = urllib3.PoolManager()
def download_file(url, path):
if not os.path.isfile(path): # not already downloaded
with http.request('GET', url, preload_content=False) as r, open(path, 'wb') as out_file:
shutil.copyfileobj(r, out_file)
df2 = df2.reset_index()
num_downloads = df2.shape[0]
printProgressBar(0,num_downloads,prefix='Progress:',suffix='Complete',length=50,mode="counts")
for i in df2.index:
printProgressBar(i+1,num_downloads,prefix='Progress:',suffix='Complete',length=50,mode="counts")
url = url_format.format(df2.loc[i, 'location'])
path = path_format.format(df2.loc[i, 'location'])
if os.path.isfile(path):
time.sleep(0.001)
continue # skip ones already downloaded
download_file(url, path)
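Downloads occasionally fail or come back truncated. A quick pass over the expected files (a minimal sketch; the 1 KB threshold is an arbitrary choice) flags anything missing or suspiciously small:
In [ ]:
# Flag downloads that are missing or suspiciously small (< 1 KB).
bad_files = [loc for loc in df2['location']
             if not os.path.isfile(path_format.format(loc))
             or os.path.getsize(path_format.format(loc)) < 1024]
print('{} files missing or truncated'.format(len(bad_files)))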
Spectrogram¶
In [140]:
# Spectrogram
import librosa
import audioread
def make_spectrogram(wav_files, labels=['clock', 'footstep'], sr=44100):
fig, ax = plt.subplots(3, 2)
for i, wav in enumerate(wav_files):
current_path = "./wav/{}".format(wav)
        tmp = audioread.audio_open(current_path)  # get the file's sample rate
        data_out, sr = librosa.load(current_path, sr=tmp.samplerate, dtype=np.float32)  # use the native rate throughout
cmap = 'CMRmap'#'nipy_spectral'
ax[0,i].plot(np.linspace(0, len(data_out)/sr,len(data_out)), data_out, c=(174/255, 199/255, 232/255))
ax[0,i].set_title(labels[i])
frequencies, times, spectrogram = signal.spectrogram(data_out, sr)
ax[1,i].pcolormesh(times, frequencies, spectrogram, cmap=cmap)
        ax[1,i].set_title('Original Spectrogram')
ax[2,i].pcolormesh(times, frequencies, spectrogram, cmap=cmap)
ax[2,i].set_ylim(0, 5000)
ax[2,i].set_title('Zoom Original Spectrogram')
[a.set_xlabel('Time (sec)') for a in ax.flatten()]
[a.set_ylabel('Frequency (Hz)') for a in ax[1:,:].flatten()]
fig.subplots_adjust(wspace=0.25, hspace=0.5)
fig.set_size_inches(10,8)
return fig
clock = '07016099.wav'
footstep = '07004099.wav'
fig = make_spectrogram([clock, footstep])
fig.savefig('./Example_audio_spectrogram.png')
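The raw power spectrogram compresses most of the detail into the low end. A dB-scaled view (a minimal sketch, recomputed for the clock example) often reads better:
In [ ]:
# dB-scaled spectrogram of the clock example, for comparison with the linear view above.
data_db, sr_db = librosa.load('./wav/07016099.wav', sr=None, dtype=np.float32)
frequencies, times, spec = signal.spectrogram(data_db, sr_db)
plt.pcolormesh(times, frequencies, 10 * np.log10(spec + 1e-12), cmap='CMRmap')
plt.ylim(0, 5000)
plt.xlabel('Time (sec)')
plt.ylabel('Frequency (Hz)')
plt.colorbar(label='Power (dB)')
plt.show()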
Periodicity of the sounds¶
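Clock ticks and footsteps are roughly periodic, so a rough estimate of the repetition period can be read off the autocorrelation of the amplitude envelope. This is a hedged sketch, not part of the original analysis; the Hilbert envelope and the 0.1 s minimum lag are assumptions.
In [ ]:
# Rough period estimate: autocorrelation of the amplitude envelope.
def estimate_period(path, min_lag_sec=0.1):
    data, sr = librosa.load(path, sr=None, dtype=np.float32)
    envelope = np.abs(signal.hilbert(data))  # amplitude envelope
    envelope = envelope - envelope.mean()
    acorr = signal.correlate(envelope, envelope, mode='full', method='fft')
    acorr = acorr[len(envelope) - 1:]        # keep non-negative lags
    min_lag = int(min_lag_sec * sr)          # skip the zero-lag peak
    best_lag = min_lag + np.argmax(acorr[min_lag:])
    return best_lag / sr

print('clock period ~ {:.2f} s'.format(estimate_period('./wav/07016099.wav')))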
Audio Segmentation¶
In [128]:
from auditok.cmdline import file_to_audio_source, seconds_to_str_fromatter, TokenizerWorker, LogWorker
from auditok.core import StreamTokenizer
from auditok.io import PyAudioSource, BufferAudioSource, StdinAudioSource, player_for
from auditok.util import ADSFactory, AudioEnergyValidator
import sys
import logging
import librosa
import audioread
def audio_segmentation(wavefile, analysis_window=0.01, max_time=None, energy_threshold=55, min_duration=0.2,
max_duration=5, max_silence=0.3, quiet=True, printf="{id} {start} {end}",
debug=False, time_format='%S', drop_trailing_silence=False):
# load the data
asource = file_to_audio_source(filename=wavefile, uc=1)
# Logging for segmentation
LOGGER_NAME = "AUDITOK_LOGGER"
logger = logging.getLogger(LOGGER_NAME)
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.CRITICAL)
logger.addHandler(handler)
record = True
# Dropping trailing tokenizer
mode = StreamTokenizer.DROP_TRAILING_SILENCE if drop_trailing_silence else 0
ads = ADSFactory.ads(audio_source=asource, block_dur=analysis_window, max_time=max_time,
record=record)
validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=energy_threshold)
# Segmentation parameters
analysis_window_per_second = 1. / analysis_window
tokenizer = StreamTokenizer(validator=validator, min_length=min_duration * analysis_window_per_second,
max_length=int(max_duration * analysis_window_per_second),
max_continuous_silence=max_silence * analysis_window_per_second,
mode = mode)
observers = []
tokenizer_worker = None
oformat = printf.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "\r")
converter = seconds_to_str_fromatter(time_format)
log_worker = LogWorker(print_detections = not quiet, output_format=oformat,
time_formatter=converter, logger=logger, debug=debug)
observers.append(log_worker)
# Do the segmentation
tokenizer_worker = TokenizerWorker(ads, tokenizer, analysis_window, observers)
# start observer threads
for obs in observers:
obs.start()
# start tokenization thread
tokenizer_worker.start()
    time.sleep(1)  # give the worker threads a moment to finish; long files may need more time
tokenizer_worker = None
return log_worker.detections
In [301]:
clock = '07016099.wav'
footstep = '07004099.wav'
# Plot
plt.style.use('dark_background')
fig, ax = plt.subplots(1, 2)
titles = ['clock', 'footstep']
energy_threshold=[55, 55]
for i, w in enumerate([clock, footstep]):
current_path = "./wav/{}".format(w)
# Segment
detections = audio_segmentation(current_path, energy_threshold=energy_threshold[i], max_duration=4)
# Plot
    tmp = audioread.audio_open(current_path)  # get the sample rate
sr = tmp.samplerate
data_out, _ = librosa.load(current_path, sr=sr, dtype=np.float32)
time_vec = np.linspace(0, len(data_out)/sr,len(data_out))
ax[i].plot(time_vec, data_out)
ylevel = ax[i].get_ylim()
for _, _, _, start_time, end_time in detections:
ax[i].fill_betweenx(ylevel, start_time, end_time, alpha=0.35, color='yellow', edgecolor='w', lw=5)
ax[i].set_title(titles[i])
fig.set_size_inches(15, 3)
In [338]:
clock = '07016230.wav'
footstep = '07064103.wav'
# Plot
plt.style.use('dark_background')
fig, ax = plt.subplots(1, 2)
titles = ['clock', 'footstep']
energy_threshold=[72, 55]
for i, w in enumerate([clock, footstep]):
current_path = "./wav/{}".format(w)
# Segment
detections = audio_segmentation(current_path, energy_threshold=energy_threshold[i], max_duration=30)
# Plot
    tmp = audioread.audio_open(current_path)  # get the sample rate
sr = tmp.samplerate
data_out, _ = librosa.load(current_path, sr=sr, dtype=np.float32)
time_vec = np.linspace(0, len(data_out)/sr,len(data_out))
ax[i].plot(time_vec, data_out)
ylevel = ax[i].get_ylim()
for _, _, _, start_time, end_time in detections:
ax[i].fill_betweenx(ylevel, start_time, end_time, alpha=0.35, color='yellow', edgecolor='w', lw=5)
ax[i].set_title(titles[i])
fig.set_size_inches(15, 3)
In [334]:
display.Audio('./wav/{}'.format(clock))
Out[334]:
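Since the goal is to generate sound clips, the detections can be turned into individual snippets. Below is a minimal sketch that slices the waveform from the last loop iteration above (the footstep file) by the detected start/end times and plays the first segment:
In [ ]:
# Slice the detected segments out of the waveform from the last loop iteration (footstep file).
segments = [data_out[int(start * sr):int(end * sr)]
            for _, _, _, start, end in detections]
print('{} segments extracted'.format(len(segments)))
display.Audio(segments[0], rate=sr)  # listen to the first detected clip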