How to get a mel-spectrogram peaks array in Python? - python

I want to make an audio fingerprint, so I need to get an array of spectrogram peaks. I've tried to find a solution on the internet, but found nothing.
Here is the spectrogram example
import librosa, librosa.display
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
from FFT import FFT
def MEL_SPECTOGRAM(signal, sr, fileName):
    """Draw the log-amplitude STFT spectrogram of ``signal`` and return it.

    Args:
        signal: audio samples handed to ``librosa.stft``.
        sr: sampling rate of ``signal``; used for playback and axis scaling.
        fileName: not used by this function; kept so existing calls still work.

    Returns:
        The dB-scaled magnitude spectrogram that was plotted. Earlier versions
        returned ``None``; returning the array lets callers run peak
        detection on it.
    """
    # Audio(...) only builds a widget; it has to be displayed explicitly when
    # it is not the last expression of a notebook cell.
    ipd.display(ipd.Audio(signal, rate=sr))

    # this is the number of samples in a window per fft
    n_fft = 2048
    # The amount of samples we are shifting after each fft
    hop_length = 512

    audio_stft = librosa.stft(signal, hop_length=hop_length, n_fft=n_fft)
    spectrogram = np.abs(audio_stft)
    log_spectro = librosa.amplitude_to_db(spectrogram)

    # NOTE: the previous code called librosa.util.normalize(log_spectro) and
    # threw the result away (normalize returns a new array), so the plot was
    # never normalized. The dead call is dropped to keep the same picture.
    librosa.display.specshow(
        log_spectro,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        cmap='magma',
        win_length=n_fft,
    )
    plt.show()
    return log_spectro
[mel-spectagram example]
(https://i.stack.imgur.com/u0zKd.png)
The best solution I found was this video, but unfortunately it is written in the Wolfram Language, so I can't use it
https://www.youtube.com/watch?v=oCHeGesfJe8&ab_channel=Wolfram

Related

How to find series of highest peaks of a repeating pattern using find_peaks() in Python?

I'm trying to determine the highest peaks of the pattern blocks in the following waveform:
Basically, I need to detect the following peaks only (highlighted):
If I use scipy.find_peaks(), it's unable to detect the appropriate peaks:
# find_peaks returns (peak_indices, properties); only the indices are needed.
indices, _ = find_peaks(my_waveform, prominence=1)
It ends up detecting all of the following points, which is not what I am looking for:
I can't provide the input arguments of distance or height thresholds to scipy.find_peaks() since there are many of the desired peaks on either extremes which are lower in height than the non-desired peaks in the middle.
Note: I had de-trended the waveform in order to aid this above problem too as you can see in the above snapshot, but it still doesn't give the right results.
So can anyone help with a correct way to tackle this?
Here's the code to fully reproduce the dataset I've shown ("autocorr" is the final waveform of interest)
import json
import sys, os
import numpy as np
import pandas as pd
import glob
import pickle
from statsmodels.tsa.stattools import adfuller, acf, pacf
from scipy.signal import find_peaks, square
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
#GENERATION OF A FUNCTION WITH DUAL SEASONALITY & NOISE
def white_noise(mu, sigma, num_pts):
    """Generate Gaussian (normal) noise.

    Args:
        mu: mean value of the distribution.
        sigma: standard deviation of the distribution.
        num_pts: number of points to draw.

    Returns:
        NumPy array holding ``num_pts`` samples drawn with
        ``np.random.normal(mu, sigma, num_pts)``.
    """
    return np.random.normal(mu, sigma, num_pts)
def signal_line_plot(input_signal: pd.Series, title: str = "", y_label: str = "Signal") -> None:
    """Plot a time series signal and show the figure.

    Args:
        input_signal: time series signal that you want to plot.
        title: title on plot.
        y_label: label of the signal being plotted (y axis).

    Returns:
        None. The plot is displayed with ``plt.show()``.
    """
    plt.plot(input_signal)
    plt.title(title)
    plt.ylabel(y_label)
    plt.show()
# Build a square-wave signal with a daily period (T) and a weekday/weekend
# amplitude change, add Gaussian noise, then autocorrelate it.
T = 96  # Time Period
t_week = np.linspace(1, 480, 480)
t_weekend = np.linspace(1, 192, 192)

# Noise is drawn for the weekday part first, then the weekend part.
weekday_wave = 10 * square(2 * np.pi * t_week / T, duty=0.7) + 10
x_weekday = weekday_wave + white_noise(0, 1, 480)
weekend_wave = 2 * square(2 * np.pi * t_weekend / T, duty=0.7) + 2
x_weekend = weekend_wave + white_noise(0, 1, 192)

# One "week", repeated ten times back to back.
x_daily_weekly = np.concatenate((x_weekday, x_weekend))
x_daily_weekly_long = np.tile(x_daily_weekly, 10)

signal_line_plot(x_daily_weekly_long)
signal_line_plot(x_daily_weekly_long[0:1000])

# x_daily_weekly_long is the final waveform on which I'm carrying out Autocorrelation
# PERFORMING AUTOCORRELATION:
import scipy.signal as signal

n_samples = len(x_daily_weekly_long)
autocorr = signal.correlate(x_daily_weekly_long, x_daily_weekly_long, mode="same")
lags = signal.correlation_lags(n_samples, n_samples, mode="same")

# VISUALIZATION:
f = plt.figure()
f.set_size_inches(40, 10)
plt.plot(lags, autocorr)
As you have some kind of double (or even triple) signal, I would attempt a double smoothing.
One to remove the overall trend, and one to remove the sharp noise.
A picture is probably better than a long explanation:
from scipy.signal import find_peaks
import pandas as pd
import numpy as np
def smooth(s, win):
    """Return the centered rolling mean of ``s`` as a pandas Series.

    Args:
        s: 1-D sequence of values; it is wrapped in ``pd.Series``.
        win: rolling window length in samples.

    Returns:
        Series of the same length as ``s``. The NaN values the centered
        window leaves at both ends are filled from the nearest valid value
        (forward fill first, then backward fill).
    """
    rolling_mean = pd.Series(s).rolling(window=win, center=True).mean()
    return rolling_mean.ffill().bfill()
WINDOW = 100  # needs to be determined empirically
# and so are the multipliers below

plt.plot(lags, autocorr, label='data')

# double smoothing difference + clipping:
# short-window mean minus long-window mean, negative part cut off at zero
short_trend = smooth(autocorr, 2 * WINDOW)
long_trend = smooth(autocorr, 10 * WINDOW)
ddiff = np.clip(short_trend - long_trend, 0, np.inf)
plt.plot(lags, ddiff, label='smooth+clip')

# peaks of the smoothed curve, marked on both the raw and the smoothed data
peaks, _ = find_peaks(ddiff, width=WINDOW)
for series in (autocorr, ddiff):
    plt.plot(lags[peaks], series[peaks], marker='o', ls='')
plt.legend()
output:
smoothing the original signal
As often in data analysis, the earlier you perform a transformation might be the better. You could also clean your original signal before running the autocorrelation. Here is a quick example (using the smooth function defined above):
from scipy.signal import find_peaks
# Smooth the raw waveform first, then autocorrelate the cleaned signal.
x2 = smooth(x_daily_weekly_long, 100)
autocorr2 = signal.correlate(x2, x2, mode="same")
plt.plot(lags, autocorr2)

# Mark every local maximum of the cleaned autocorrelation.
idx, _ = find_peaks(autocorr2)
plt.plot(lags[idx], autocorr2[idx], marker='o', ls='')
cleaned signal:
For testing purposes i used a rough reconstruction of your signal.
import numpy as np
from scipy.signal import find_peaks, square
import matplotlib.pyplot as plt
# Rough stand-in for the asker's waveform: clipped slow sine bumps, a
# triangular envelope and a fast sine, all summed.
x = np.linspace(3, 103, 10000)
sin = np.clip(np.sin(0.6 * x) - 0.5, 0, 10)
rising = np.linspace(0, 0.3, 5000)
falling = np.linspace(0.3, 0, 5000)
tri = np.concatenate((rising, falling), axis=0)
sig = np.sin(6 * x - 1.2)
full = sin + tri + sig
peak run #1
# First pass: every local maximum of the raw signal.
peaks, _ = find_peaks(full)
plt.plot(full)
plt.scatter(peaks, full[peaks], color='red', s=5)
plt.show()
peak run #2 + index reextraction (this needs the actual values from your signal)
# Second pass: peaks among the first-pass peak heights, then map those
# positions back to sample indices of the original signal.
peaks2, _ = find_peaks(full[peaks])
index = peaks[peaks2]
plt.plot(full)
plt.scatter(index, full[index], color='red', s=5)
plt.show()
If you know the period you can do this:
w = T
f = plt.figure()
f.set_figwidth(40)
f.set_figheight(10)
plt.plot(lags, autocorr)

# Moving average over one period, then mark the peaks of the averaged curve
# on the original autocorrelation.
box_kernel = np.ones(w) / w
averaged = signal.convolve(autocorr, box_kernel, mode="same")
peak_idx = signal.find_peaks(averaged)[0]
plt.scatter(lags[peak_idx], autocorr[peak_idx], color="r")
Result:
I don't know if it works in other cases.
EDIT:
Another approach is to find the maximum in a sliding window, but in this case too you must determine the window size empirically.
# `filters` was never imported in this snippet, and the scipy.ndimage.filters
# namespace is deprecated; import the function directly.
from scipy.ndimage import maximum_filter

w = 900  # sliding-window size in samples; must be chosen empirically
f = plt.figure()
f.set_figwidth(40)
f.set_figheight(10)
plt.plot(lags, autocorr)

# A sample is kept when it equals the maximum of the window centered on it.
# (The original referenced an undefined name `W` instead of `w`.)
is_window_max = maximum_filter(autocorr, size=w) == autocorr
plt.scatter(lags[is_window_max], autocorr[is_window_max], color="r")
Result:

librosa mel spectrogram Hz scaling issue

I am having some odd vertical scaling issues with librosa.feature.melspectrogram(). It seems that when I use librosa.load() with sr=None, the Hz scale doesn't coincide with the intended spectrographic features. To investigate this further, I looked at a pure 1,000Hz tone which I got from https://www.mediacollege.com/audio/tone/download/
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
filename = '1kHz_44100Hz_16bit_05sec.wav'
y1, sr1 = librosa.load(filename, sr=None)  # sr=None: keep the file's own sampling rate
y2, sr2 = librosa.load(filename)  # default: librosa resamples on load
fig, ax = plt.subplots(1, 2)

# The audio must be passed as y=...: librosa 0.10+ made melspectrogram's
# arguments keyword-only, so the positional form raises TypeError there.
S = librosa.feature.melspectrogram(y=y1, sr=sr1, n_mels=128)
S_DB = librosa.power_to_db(S, ref=np.max)
librosa.display.specshow(S_DB, sr=sr1, y_axis='mel', ax=ax[0])
ax[0].title.set_text(f"sr1={sr1}\nload(filename,sr=None)")

S = librosa.feature.melspectrogram(y=y2, sr=sr2, n_mels=128)
S_DB = librosa.power_to_db(S, ref=np.max)
librosa.display.specshow(S_DB, sr=sr2, y_axis='mel', ax=ax[1])
ax[1].title.set_text(f"sr2={sr2}\nload(filename)")

plt.tight_layout()
I'm not sure why the 1kHz tone is not lining up in both spectrograms. I would suspect the one with sr=None to be the more accurate as it is using the actual samplerate from the file. Would anyone know why there is a difference? The feature in the left plot is obviously not at 1kHz, but more like 800Hz or so. Thanks.

Python Audio Analysis, Spectrogram: Which spectrogram should I use and why?

I am doing my final project at university: pitch estimation from song recording using convolutional neural network (CNN). I want to retrieve pitches existed in a song recording. For CNN input, I am using a spectrogram.
I am using MIR-QBSH dataset with pitch vectors as data label. Before processing the audio to CNN (each audio has 8 sec duration in .wav files of 8 KHz, 8 bit, mono), I need to pre-process the audio into a spectrogram representation.
I have found 3 ways to generate a spectrogram, the code are listed below.
Audio example I am using in this code is available here.
Imports:
import librosa
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
from numpy.fft import *
import math
import wave
import struct
from scipy.io import wavfile
Spectrogram A
# Spectrogram A: STFT with an explicit Hann window, amplitude scaled by the
# window sum, shown in dB on a log-frequency axis and saved to a PNG.
x, sr = librosa.load('audio/00020_2003_person1.wav', sr=None)

window_size = 1024
hop_length = 512
n_mels = 128
time_steps = 384

window = np.hanning(window_size)
stft = librosa.core.spectrum.stft(
    x, n_fft=window_size, hop_length=hop_length, window=window
)
magnitude = np.abs(stft)
out = 2 * magnitude / np.sum(window)

plt.figure(figsize=(12, 4))
ax = plt.axes()
plt.set_cmap('hot')
out_db = librosa.amplitude_to_db(out, ref=np.max)
librosa.display.specshow(out_db, y_axis='log', x_axis='time', sr=sr)
plt.savefig(
    'spectrogramA.png', bbox_inches='tight', transparent=True, pad_inches=0.0
)
Spectrogram B
# Spectrogram B: default-parameter STFT, magnitude in dB, linear Hz axis.
x, sr = librosa.load('audio/00020_2003_person1.wav', sr=None)
X = librosa.stft(x)
magnitude = abs(X)
Xdb = librosa.amplitude_to_db(magnitude)
# plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
Spectrogram C
# Read the wav file (mono)
# Spectrogram C: read the wav file (mono) with SciPy and let matplotlib
# compute and draw the spectrogram.
samplingFrequency, signalData = wavfile.read('audio/00020_2003_person1.wav')
for value in (samplingFrequency, signalData):
    print(value)

# Plot the spectrogram of the signal read from the wav file
plt.subplot(111)
plt.specgram(signalData, Fs=samplingFrequency)
plt.xlabel('Time')
plt.ylabel('Frequency')
Spectrogram results are displayed below:
My question is, from the 3 spectrograms I have listed above, which spectrogram is best to use for input to CNN and why should I use that spectrogram type? I am currently having difficulty to find their differences, as well as their pros and cons.

Plot signal spectrogram image for each time step in python

I want to print a spectrogram image for each time step
This is what i have tried
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
NFFT = 256  # the length of the windowing segments
Fs = 64  # the sampling rate

for x in range(1, 20):
    # Start from an empty figure so earlier spectrograms do not pile up in
    # the saved image.
    plt.clf()
    plt.subplot(212)  # don't share the axis
    # Pass the NFFT variable: the original passed the literal 150, so every
    # iteration produced the same image even though NFFT was incremented.
    Pxx, freqs, bins, im = plt.specgram(
        a, NFFT=NFFT, Fs=Fs, noverlap=50, cmap=plt.cm.gist_heat
    )
    plt.savefig("spectrogram{x}.png".format(x=x))
    NFFT = NFFT + 50
But the output image for the first iteration is as same as the image in the last iteration

How to plot a wav file

I have just read a wav file with scipy and now I want to plot the file using matplotlib: on the y axis I want to see the amplitude, and on the x axis I want to see the frame number!
Any help how can I do this??
Thank you!
from scipy.io.wavfile import read
import numpy as np
from numpy import*
import matplotlib.pyplot as plt
# scipy.io.wavfile.read returns a (sample_rate, data) tuple.
a = read("C:/Users/Martinez/Desktop/impulso.wav")
# print is a function in Python 3; the statement form `print a` is a SyntaxError.
print(a)
You can call wave lib to read an audio file.
To plot the waveform, use the "plot" function from matplotlib
import matplotlib.pyplot as plt
import numpy as np
import wave
import sys
# Open with a context manager so the file is closed once the samples are read.
with wave.open("wavfile.wav", "r") as spf:
    n_channels = spf.getnchannels()
    # Extract Raw Audio from Wav File
    signal = spf.readframes(-1)

# np.fromstring on binary data and the "Int16" type code are deprecated;
# frombuffer with an explicit dtype is the supported form.
signal = np.frombuffer(signal, dtype=np.int16)

# If Stereo
if n_channels == 2:
    print("Just mono files")
    sys.exit(0)

plt.figure(1)
plt.title("Signal Wave...")
plt.plot(signal)
plt.show()
you will have something like:
To plot the x-axis in seconds, get the frame rate and divide the number of samples in your signal by it to obtain the duration. Then use NumPy's linspace function to create a linearly spaced time vector with the same length as the signal, and finally call plot again as plt.plot(Time, signal)
import matplotlib.pyplot as plt
import numpy as np
import wave
import sys
# Open with a context manager so the file is closed once the samples are read.
with wave.open("Animal_cut.wav", "r") as spf:
    n_channels = spf.getnchannels()
    fs = spf.getframerate()
    # Extract Raw Audio from Wav File
    signal = spf.readframes(-1)

# np.fromstring on binary data and the "Int16" type code are deprecated;
# frombuffer with an explicit dtype is the supported form.
signal = np.frombuffer(signal, dtype=np.int16)

# If Stereo
if n_channels == 2:
    print("Just mono files")
    sys.exit(0)

# One time stamp per sample, running from 0 to the duration in seconds.
Time = np.linspace(0, len(signal) / fs, num=len(signal))

plt.figure(1)
plt.title("Signal Wave...")
plt.plot(Time, signal)
plt.show()
New plot x-axis in seconds:
Alternatively, if you want to use SciPy, you may also do the following:
from scipy.io.wavfile import read
import matplotlib.pyplot as plt
# read audio samples; read() gives back (sample_rate, samples)
input_data = read("Sample.wav")
_, audio = input_data

# plot the first 1024 samples
plt.plot(audio[:1024])

# title and axis labels
plt.title("Sample Wav")
plt.xlabel("Time")
plt.ylabel("Amplitude")

# display the plot
plt.show()
Here's a version that will also handle stereo inputs, based on the answer by #ederwander
import matplotlib.pyplot as plt
import numpy as np
import wave
file = 'test.wav'

with wave.open(file, 'r') as wav_file:
    num_channels = wav_file.getnchannels()
    fs = wav_file.getframerate()
    # Extract Raw Audio from Wav File (np.fromstring / 'Int16' are deprecated)
    signal = np.frombuffer(
        wav_file.readframes(wav_file.getnframes()), dtype=np.int16
    )

# Split the data into channels: samples are interleaved, so channel k is
# every num_channels-th sample starting at offset k.
channels = [signal[offset::num_channels] for offset in range(num_channels)]

# Get time from indices. linspace needs an integer `num`, so use floor
# division (the original passed a float, which NumPy rejects).
frames_per_channel = len(signal) // num_channels
Time = np.linspace(0, frames_per_channel / fs, num=frames_per_channel)

# Plot
plt.figure(1)
plt.title('Signal Wave...')
for channel in channels:
    plt.plot(Time, channel)
plt.show()
Here is the code to draw a waveform and a frequency spectrum of a wavefile
import wave
import numpy as np
import matplotlib.pyplot as plt
# Read the sample rate from the file instead of hard-coding 16000, and read
# every frame: the original read only `sample_rate` frames (one second),
# while the later snippets slice up to sample 32000 and call it "the whole
# segment".
with wave.open('voice.wav', 'r') as signal_wave:
    sample_rate = signal_wave.getframerate()
    sig = np.frombuffer(
        signal_wave.readframes(signal_wave.getnframes()), dtype=np.int16
    )
For the whole segment of the wave file
# Full slice: keeps every sample of sig.
sig = sig[:]
For partial segment of the wave file
# Keep samples 25000 up to (not including) 32000.
sig = sig[25000:32000]
Separating stereo channels
# Stereo samples are interleaved: even indices are one channel, odd indices
# the other. The sample array in this answer is `sig`; `data` was never
# defined.
left, right = sig[0::2], sig[1::2]
Plot the waveform (plot_a) and the frequency spectrum (plot_b)
plt.figure(1)

# Top panel: raw waveform.
plot_a = plt.subplot(211)
plot_a.plot(sig)
plot_a.set_xlabel('sample rate * time')
plot_a.set_ylabel('energy')

# Bottom panel: spectrogram of the same samples.
plot_b = plt.subplot(212)
plot_b.specgram(sig, NFFT=1024, Fs=sample_rate, noverlap=900)
plot_b.set_xlabel('Time')
plot_b.set_ylabel('Frequency')

plt.show()
Just an observation (I cannot add comment).
You will receive the following message:
DeprecationWarning: Numeric-style type codes are deprecated and will
result in an error in the future.
Do not use np.fromstring with binaries. Instead of signal = np.fromstring(signal, 'Int16'), it's preferred to use signal = np.frombuffer(signal, dtype='int16').
Here is a version that handles mono/stereo and 8-bit/16-bit PCM.
import matplotlib.pyplot as plt
import numpy as np
import wave
file = 'test.wav'

with wave.open(file, 'r') as wav_file:
    num_channels = wav_file.getnchannels()
    sample_width = wav_file.getsampwidth()
    fs = wav_file.getframerate()
    # Extract Raw Audio from Wav File
    signal = wav_file.readframes(wav_file.getnframes())

# 'UInt8' / 'Int8' / 'Int16' are not valid dtype names in current NumPy;
# use the np.uint8 / np.int8 / np.int16 types instead.
if sample_width == 1:
    # 8-bit WAV is unsigned with a 128 offset; widen before subtracting so
    # the shift to signed values does not rely on uint8 wrap-around.
    unsigned = np.frombuffer(signal, dtype=np.uint8)
    signal = (unsigned.astype(np.int16) - 128).astype(np.int8)
elif sample_width == 2:
    signal = np.frombuffer(signal, dtype=np.int16)
else:
    raise RuntimeError("Unsupported sample width")

# http://schlameel.com/2017/06/09/interleaving-and-de-interleaving-data-with-python/
deinterleaved = [signal[idx::num_channels] for idx in range(num_channels)]

# Get time from indices. linspace needs an integer `num`, hence the floor
# division (the original passed a float).
frames_per_channel = len(signal) // num_channels
Time = np.linspace(0, frames_per_channel / fs, num=frames_per_channel)

# Plot
plt.figure(1)
plt.title('Signal Wave...')
for channel in deinterleaved:
    plt.plot(Time, channel)
plt.show()
I suppose I could've put this in a comment, but building slightly on the answers from both #ederwander and #TimSC, I wanted to make something more fine (as in detailed) and aesthetically pleasing. The code below creates what I think is a very nice waveform of a stereo or mono wave file (I didn't need a title so I just commented that out, nor did I need the show method - just needed to save the image file).
Here's an example of a stereo wav rendered:
And the code, with the differences I mentioned:
import matplotlib.pyplot as plt
import numpy as np
import wave
file = '/Path/to/my/audio/file/DeadMenTellNoTales.wav'

with wave.open(file, 'r') as wav_file:
    num_channels = wav_file.getnchannels()
    sample_width = wav_file.getsampwidth()
    fs = wav_file.getframerate()
    # Extract Raw Audio from Wav File
    signal = wav_file.readframes(wav_file.getnframes())

# 'UInt8' / 'Int8' / 'Int16' are not valid dtype names in current NumPy;
# use the np.uint8 / np.int8 / np.int16 types instead.
if sample_width == 1:
    # 8-bit WAV is unsigned with a 128 offset; widen before subtracting so
    # the shift to signed values does not rely on uint8 wrap-around.
    unsigned = np.frombuffer(signal, dtype=np.uint8)
    signal = (unsigned.astype(np.int16) - 128).astype(np.int8)
elif sample_width == 2:
    signal = np.frombuffer(signal, dtype=np.int16)
else:
    raise RuntimeError("Unsupported sample width")

# http://schlameel.com/2017/06/09/interleaving-and-de-interleaving-data-with-python/
deinterleaved = [signal[idx::num_channels] for idx in range(num_channels)]

# Get time from indices. linspace needs an integer `num`, hence the floor
# division (the original passed a float).
frames_per_channel = len(signal) // num_channels
Time = np.linspace(0, frames_per_channel / fs, num=frames_per_channel)

# Plot on one wide figure. The original created the sized figure and then
# selected figure number 1, which is only the same figure when no other
# figure already exists.
plt.figure(figsize=(50, 3))
# don't care for title
# plt.title('Signal Wave...')
for channel in deinterleaved:
    plt.plot(Time, channel, linewidth=.125)
# don't need to show, just save
# plt.show()
plt.savefig('/testing_folder/deadmentellnotales2d.png', dpi=72)
I came up with a solution that's more flexible and more performant:
Downsampling is used to achieve two samples per second. This is achieved by calculating the average of absolute values for each window. The result looks like the waveforms from streaming sites like SoundCloud.
Multi-channel is supported (thanks #Alter)
Numpy is used for each operation, which is much more performant than looping through the array.
The file is processed in batches to support very large files.
import matplotlib.pyplot as plt
import numpy as np
import wave
import math
file = 'audiofile.wav'

with wave.open(file, 'r') as wav_file:
    num_channels = wav_file.getnchannels()
    frame_rate = wav_file.getframerate()
    downsample = math.ceil(frame_rate * num_channels / 2)  # Get two samples per second!
    # Read whole seconds at a time so only the final chunk needs padding.
    process_chunk_size = 600000 - (600000 % frame_rate)

    # Collect per-chunk results and join once at the end, instead of
    # re-copying the growing array on every iteration.
    pieces = []
    while True:
        raw = wav_file.readframes(process_chunk_size)
        if not raw:
            break
        # Work in float: NaN padding cannot be stored in an int16 array, and
        # abs(-32768) overflows in int16.
        signal = np.absolute(np.frombuffer(raw, dtype=np.int16).astype(np.float64))
        # Pad the tail with NaN up to a multiple of `downsample`; nanmean
        # ignores the padding. (np.NaN was removed in NumPy 2.0; use np.nan.)
        pad_width = -signal.size % downsample
        padded = np.pad(signal, (0, pad_width), mode='constant', constant_values=np.nan)
        # Take mean of absolute values per 0.5 seconds
        pieces.append(np.nanmean(padded.reshape(-1, downsample), axis=1))

waveform = np.concatenate(pieces) if pieces else np.array([])

# Plot
plt.figure(1)
plt.title('Waveform')
plt.plot(waveform)
plt.show()

Categories