normalizing mel spectrogram to unit peak amplitude? - python

I am new to both python and librosa. I am trying to follow this method for a speech recognizer: acoustic front end
My code:
import librosa
import librosa.display
import numpy as np
y, sr = librosa.load('test.wav', sr = None)
normalizedy = librosa.util.normalize(y)
stft = librosa.core.stft(normalizedy, n_fft = 256, hop_length=16)
mel = librosa.feature.melspectrogram(S=stft, n_mels=32)
melnormalized = librosa.util.normalize(mel)
mellog = np.log(melnormalized) - np.log(10**-5)
The problem is that when I apply librosa.util.normalize to variable mel, I expect values to be between 1 and -1, which they aren't. What am I missing?

If you want your output to be log-scaled and normalized to between -1 and +1, you should log-scale first, then normalize:
import librosa
import librosa.display
import numpy as np
y, sr = librosa.load('test.wav', sr = None)
normalizedy = librosa.util.normalize(y)
stft = librosa.core.stft(normalizedy, n_fft = 256, hop_length=16)
mel = librosa.feature.melspectrogram(S=stft, n_mels=32)
mellog = np.log(mel + 1e-9)
melnormalized = librosa.util.normalize(mellog)
# use melnormalized

Related

scipy and numpy not error with as_matrix function

This is a "working example" that does not work. Why does this not run? scipy seems to not work.
i get this error:
File "display_map.py", line 35, in
rot_cw = R.from_quat(keyframe["rot_cw"]).as_matrix()
AttributeError: 'Rotation' object has no attribute 'as_matrix'
please can someone help me me change it. I tried reducing the version of scipy
import msgpack
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from numpy.linalg import inv
from scipy.spatial.transform import Rotation as R
import open3d as o3d
import sys
if len(sys.argv) < 2:
print(
"ERROR: Please provide path to .msg file. Example usage is; python3 visualize_openvslam_map.py path_to.msg"
)
exit()
with open(sys.argv[1], "rb") as f:
upacked_msg = msgpack.Unpacker(f)
packed_msg = upacked_msg.unpack()
keyfarmes = packed_msg["keyframes"]
landmarks = packed_msg["landmarks"]
# FILL IN KEYFRAME POINTS(ODOMETRY) TO ARRAY
keyframe_points = []
keyframe_points_color = []
for keyframe in keyfarmes.values():
# get conversion from camera to world
trans_cw = np.matrix(keyframe["trans_cw"]).T
rot_cw = R.from_quat(keyframe["rot_cw"]).as_matrix()
# compute conversion from world to camera
rot_wc = rot_cw.T
trans_wc = -rot_wc * trans_cw
keyframe_points.append((trans_wc[0, 0], trans_wc[1, 0], trans_wc[2, 0]))
keyframe_points = np.array(keyframe_points)
keyframe_points_color = np.repeat(np.array([[0., 1., 0.]]),
keyframe_points.shape[0],
axis=0)
# FILL IN LANDMARK POINTS TO ARRAY
landmark_points = []
landmark_points_color = []
for lm in landmarks.values():
landmark_points.append(lm["pos_w"])
landmark_points_color.append([
abs(lm["pos_w"][1]) * 4,
abs(lm["pos_w"][1]) * 2,
abs(lm["pos_w"][1]) * 3
])
landmark_points = np.array(landmark_points)
landmark_points_color = np.array(landmark_points_color)
# CONSTRUCT KEYFRAME(ODOMETRY) FOR VISUALIZTION
keyframe_points_pointcloud = o3d.geometry.PointCloud()
keyframe_points_pointcloud.points = o3d.utility.Vector3dVector(keyframe_points)
keyframe_points_pointcloud.colors = o3d.utility.Vector3dVector(
keyframe_points_color)
# CONSTRUCT LANDMARK POINTCLOUD FOR VISUALIZTION
landmark_points_pointcloud = o3d.geometry.PointCloud()
landmark_points_pointcloud.points = o3d.utility.Vector3dVector(landmark_points)
landmark_points_pointcloud.colors = o3d.utility.Vector3dVector(
landmark_points_color)
# VISULIZE MAP
o3d.visualization.draw_geometries([
keyframe_points_pointcloud, landmark_points_pointcloud,
o3d.geometry.TriangleMesh.create_coordinate_frame()
])
In scipy.spatial.Rotation methods from_dcm, as_dcm were renamed to from_matrix, as_matrix respectively.

Iterate over an audio file with Python's librosa

I was trying to use a voice emotion detecton model on github HERE. Based on their examples, I was able to implement the following code to predict the final emotion of an audio file as a single prediction. Looks like it makes sub-predictions for each 0.4s window in the audio file, and then takes the maximum occurance as the final output (here is the sample file I used).
How can I change it to print a prediction for every 1s chunk of the audio file (as opposed to a single value for the whole file)?
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Conv2D, MaxPool2D, Flatten, Dropout, Dense
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
# Create a configuration class to help if I want to change parameters later
class Config:
def __init__(self, n_mfcc = 26, n_feat = 13, n_fft = 552, sr = 22050, window = 0.4, test_shift = 0.1):
self.n_mfcc = n_mfcc
self.n_feat = n_feat
self.n_fft = n_fft
self.sr = sr
self.window = window
self.step = int(sr * window)
self.test_shift = test_shift
self.shift = int(sr * test_shift)
config = Config()
model = pickle.load(open('cnn_ep25_mfccOnly_moreData.pkl', 'rb'))
wav, sr = librosa.load('YAF_chain_angry.wav')
all_results = []
# Initialize a local results list
local_results = []
# Initialize min and max values for each file for scaling
_min, _max = float('inf'), -float('inf')
# Load the file
# Create an array to hold features for each window
X = []
# Iterate over sliding 0.4s windows of the audio file
for i in range(int((wav.shape[0]/sr-config.window)/config.test_shift)):
X_sample = wav[i*config.shift: i*config.shift + config.step] # slice out 0.4s window
X_mfccs = librosa.feature.mfcc(X_sample, sr, n_mfcc = config.n_mfcc, n_fft = config.n_fft,
hop_length = config.n_fft)[1:config.n_feat + 1] # generate mfccs from sample
_min = min(np.amin(X_mfccs), _min)
_max = max(np.amax(X_mfccs), _max) # check min and max values
X.append(X_mfccs) # add features of window to X
# Put window data into array, scale, then reshape
X = np.array(X)
X = (X - _min) / (_max - _min)
X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
# Feed data for each window into model for prediction
for i in range(X.shape[0]):
window = X[i].reshape(1, X.shape[1], X.shape[2], 1)
local_results.append(model.predict(window))
# Aggregate predictions for file into one then append to all_results
local_results = (np.sum(np.array(local_results), axis = 0)/len(local_results))[0]
local_results = list(local_results)
prediction = np.argmax(local_results)
# Turn all results into a dataframe
df_cols = ['neutral', 'happy', 'sad', 'angry', 'fearful', 'disgusted', 'surprised']
print(df_cols)
print(local_results)
print("Prediction: "+ df_cols[prediction])

To fix an error : could not broadcast input array from shape (5,5) into shape (5,5,4)

I want to preprocess such an image dataset using an unsupervised wiener algorithm. But it doesn't work properly. when I run the code, it shows me a value attribute error. For convenience, my code is given below -
import cv2
import glob
from matplotlib import pyplot as plt
from skimage import io, restoration, img_as_float
import scipy.stats as st
import numpy as np
dataset = glob.glob('input/train/*.png')
directory = 'output/train/'
for img_id, img_path in enumerate(dataset):
img = img_as_float(io.imread(img_path))
def gkern(kernlen=21, nsig=2): #Returns a 2D Gaussian kernel.
lim = kernlen//2 + (kernlen % 2)/2
x = np.linspace(-lim, lim, kernlen+1)
kern1d = np.diff(st.norm.cdf(x))
kern2d = np.outer(kern1d, kern1d)
return kern2d/kern2d.sum()
psf = gkern(5,3) #Kernel length and sigma
deconvolved, _ = restoration.unsupervised_wiener(img, psf)
cl2 = cv2.resize(deconvolved, (512,512), interpolation = cv2.INTER_CUBIC)
plt.imsave(f"output/unsupervised_{img_id}.png", cl2, cmap='gray')
I am getting the error :
File "C:\Users\Junaed\.spyder-py3\unsupervised_wiener.py", line 33, in <module>
deconvolved, _ = restoration.unsupervised_wiener(img, psf)
ValueError: could not broadcast input array from shape (5,5) into shape (5,5,4)
How could I fix this issue, Can someone help me here?

How to convert a .wav file to a spectrogram in python3

I am trying to create a spectrogram from a .wav file in python3.
I want the final saved image to look similar to this image:
I have tried the following:
This stack overflow post:
Spectrogram of a wave file
This post worked, somewhat. After running it, I got
However, This graph does not contain the colors that I need. I need a spectrogram that has colors. I tried to tinker with this code to try and add the colors however after spending significant time and effort on this, I couldn't figure it out!
I then tried this tutorial.
This code crashed(on line 17) when I tried to run it with the error TypeError: 'numpy.float64' object cannot be interpreted as an integer.
line 17:
samples = np.append(np.zeros(np.floor(frameSize/2.0)), sig)
I tried to fix it by casting
samples = int(np.append(np.zeros(np.floor(frameSize/2.0)), sig))
and I also tried
samples = np.append(np.zeros(int(np.floor(frameSize/2.0)), sig))
However neither of these worked in the end.
I would really like to know how to convert my .wav files to spectrograms with color so that I can analyze them! Any help would be appreciated!!!!!
Please tell me if you want me to provide any more information about my version of python, what I tried, or what I want to achieve.
Use scipy.signal.spectrogram.
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile
sample_rate, samples = wavfile.read('path-to-mono-audio-file.wav')
frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)
plt.pcolormesh(times, frequencies, spectrogram)
plt.imshow(spectrogram)
plt.ylabel('Frequency [Hz]')
plt.xlabel('Time [sec]')
plt.show()
Be sure that your wav file is mono (single channel) and not stereo (dual channel) before trying to do this. I highly recommend reading the scipy documentation at https://docs.scipy.org/doc/scipy-
0.19.0/reference/generated/scipy.signal.spectrogram.html.
Putting plt.pcolormesh before plt.imshow seems to fix some issues, as pointed out by #Davidjb, and if unpacking error occurs, follow the steps by #cgnorthcutt below.
I have fixed the errors you are facing for http://www.frank-zalkow.de/en/code-snippets/create-audio-spectrograms-with-python.html
This implementation is better because you can change the binsize (e.g. binsize=2**8)
import numpy as np
from matplotlib import pyplot as plt
import scipy.io.wavfile as wav
from numpy.lib import stride_tricks
""" short time fourier transform of audio signal """
def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
win = window(frameSize)
hopSize = int(frameSize - np.floor(overlapFac * frameSize))
# zeros at beginning (thus center of 1st window should be for sample nr. 0)
samples = np.append(np.zeros(int(np.floor(frameSize/2.0))), sig)
# cols for windowing
cols = np.ceil( (len(samples) - frameSize) / float(hopSize)) + 1
# zeros at end (thus samples can be fully covered by frames)
samples = np.append(samples, np.zeros(frameSize))
frames = stride_tricks.as_strided(samples, shape=(int(cols), frameSize), strides=(samples.strides[0]*hopSize, samples.strides[0])).copy()
frames *= win
return np.fft.rfft(frames)
""" scale frequency axis logarithmically """
def logscale_spec(spec, sr=44100, factor=20.):
timebins, freqbins = np.shape(spec)
scale = np.linspace(0, 1, freqbins) ** factor
scale *= (freqbins-1)/max(scale)
scale = np.unique(np.round(scale))
# create spectrogram with new freq bins
newspec = np.complex128(np.zeros([timebins, len(scale)]))
for i in range(0, len(scale)):
if i == len(scale)-1:
newspec[:,i] = np.sum(spec[:,int(scale[i]):], axis=1)
else:
newspec[:,i] = np.sum(spec[:,int(scale[i]):int(scale[i+1])], axis=1)
# list center freq of bins
allfreqs = np.abs(np.fft.fftfreq(freqbins*2, 1./sr)[:freqbins+1])
freqs = []
for i in range(0, len(scale)):
if i == len(scale)-1:
freqs += [np.mean(allfreqs[int(scale[i]):])]
else:
freqs += [np.mean(allfreqs[int(scale[i]):int(scale[i+1])])]
return newspec, freqs
""" plot spectrogram"""
def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
samplerate, samples = wav.read(audiopath)
s = stft(samples, binsize)
sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
ims = 20.*np.log10(np.abs(sshow)/10e-6) # amplitude to decibel
timebins, freqbins = np.shape(ims)
print("timebins: ", timebins)
print("freqbins: ", freqbins)
plt.figure(figsize=(15, 7.5))
plt.imshow(np.transpose(ims), origin="lower", aspect="auto", cmap=colormap, interpolation="none")
plt.colorbar()
plt.xlabel("time (s)")
plt.ylabel("frequency (hz)")
plt.xlim([0, timebins-1])
plt.ylim([0, freqbins])
xlocs = np.float32(np.linspace(0, timebins-1, 5))
plt.xticks(xlocs, ["%.02f" % l for l in ((xlocs*len(samples)/timebins)+(0.5*binsize))/samplerate])
ylocs = np.int16(np.round(np.linspace(0, freqbins-1, 10)))
plt.yticks(ylocs, ["%.02f" % freq[i] for i in ylocs])
if plotpath:
plt.savefig(plotpath, bbox_inches="tight")
else:
plt.show()
plt.clf()
return ims
ims = plotstft(filepath)
import os
import wave
import pylab
def graph_spectrogram(wav_file):
sound_info, frame_rate = get_wav_info(wav_file)
pylab.figure(num=None, figsize=(19, 12))
pylab.subplot(111)
pylab.title('spectrogram of %r' % wav_file)
pylab.specgram(sound_info, Fs=frame_rate)
pylab.savefig('spectrogram.png')
def get_wav_info(wav_file):
wav = wave.open(wav_file, 'r')
frames = wav.readframes(-1)
sound_info = pylab.fromstring(frames, 'int16')
frame_rate = wav.getframerate()
wav.close()
return sound_info, frame_rate
for A Capella Science - Bohemian Gravity! this gives:
Use graph_spectrogram(path_to_your_wav_file).
I don't remember the blog from where I took this snippet. I will add the link whenever I see it again.
Beginner's answer above is excellent. I dont have 50 rep so I can't comment on it, but if you want the correct amplitude in the frequency domain the stft function should look like this:
import numpy as np
from matplotlib import pyplot as plt
import scipy.io.wavfile as wav
from numpy.lib import stride_tricks
""" short time fourier transform of audio signal """
def stft(sig, frameSize, overlapFac=0, window=np.hanning):
win = window(frameSize)
hopSize = int(frameSize - np.floor(overlapFac * frameSize))
# zeros at beginning (thus center of 1st window should be for sample nr. 0)
samples = np.append(np.zeros(int(np.floor(frameSize/2.0))), sig)
# cols for windowing
cols = np.ceil( (len(samples) - frameSize) / float(hopSize)) + 1
# zeros at end (thus samples can be fully covered by frames)
samples = np.append(samples, np.zeros(frameSize))
frames = stride_tricks.as_strided(samples, shape=(int(cols), frameSize), strides=(samples.strides[0]*hopSize, samples.strides[0])).copy()
frames *= win
fftResults = np.fft.rfft(frames)
windowCorrection = 1/(np.sum(np.hanning(frameSize))/frameSize) #This is amplitude correct (1/mean(window)). Energy correction is 1/rms(window)
FFTcorrection = 2/frameSize
scaledFftResults = fftResults*windowCorrection*FFTcorrection
return scaledFftResults
You can use librosa for your mp3 spectogram needs. Here is some code I found, thanks to Parul Pandey from medium. The code I used is this,
# Method described here https://stackoverflow.com/questions/15311853/plot-spectogram-from-mp3
import librosa
import librosa.display
from pydub import AudioSegment
import matplotlib.pyplot as plt
from scipy.io import wavfile
from tempfile import mktemp
def plot_mp3_matplot(filename):
"""
plot_mp3_matplot -- using matplotlib to simply plot time vs amplitude waveplot
Arguments:
filename -- filepath to the file that you want to see the waveplot for
Returns -- None
"""
# sr is for 'sampling rate'
# Feel free to adjust it
x, sr = librosa.load(filename, sr=44100)
plt.figure(figsize=(14, 5))
librosa.display.waveplot(x, sr=sr)
def convert_audio_to_spectogram(filename):
"""
convert_audio_to_spectogram -- using librosa to simply plot a spectogram
Arguments:
filename -- filepath to the file that you want to see the waveplot for
Returns -- None
"""
# sr == sampling rate
x, sr = librosa.load(filename, sr=44100)
# stft is short time fourier transform
X = librosa.stft(x)
# convert the slices to amplitude
Xdb = librosa.amplitude_to_db(abs(X))
# ... and plot, magic!
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr = sr, x_axis = 'time', y_axis = 'hz')
plt.colorbar()
# same as above, just changed the y_axis from hz to log in the display func
def convert_audio_to_spectogram_log(filename):
x, sr = librosa.load(filename, sr=44100)
X = librosa.stft(x)
Xdb = librosa.amplitude_to_db(abs(X))
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr = sr, x_axis = 'time', y_axis = 'log')
plt.colorbar()
Cheers!

Is that the expected result on ndimage.gaussian_filter?

I'm trying to calculate the discrete derivative using gaussian_filter from scipy.ndimage and so the output is presenting some strange behavior with boundary conditions. The code is below:
from scipy import ndimage
import numpy as np
import matplotlib.pyplot as plt
y = np.linspace(0.2*np.pi,0.7*np.pi,100)
U = np.sin(y)
sg = 1
Uy = ndimage.gaussian_filter1d(U, sigma=sg,order=1,mode='constant',cval=0)
Uy2 = ndimage.gaussian_filter1d(U, sigma=sg,order=1,mode='nearest')
Uy3 = ndimage.gaussian_filter1d(U, sigma=sg,order=1,mode='reflect')
Uy4 = ndimage.gaussian_filter1d(U, sigma=sg,order=1,mode='mirror')
Uy5 = ndimage.gaussian_filter1d(U, sigma=sg,order=1,mode='wrap')
fig,(a1,a2) = plt.subplots(1,2)
a1.plot(U , y,label='data')
a2.plot(Uy, y,label='constant')
a2.plot(Uy2,y,label='nearest')
a2.plot(Uy3,y,label='reflect')
a2.plot(Uy4,y,label='mirror')
a2.plot(Uy5,y,label='wrap')
a1.legend(loc='best')
a2.legend(loc='best')
What happened? Constant mode should result cval on boudary? Is that the expected result?

Categories