Resampling audio file does not produce desired result - python

I want to resample a recording from 32000 Hz (32 kHz) to 16000 Hz (16 kHz). I have done this with the code below, but the output audio is somewhat messed up.
You can find the original audio as well as the output in the following folder:
https://drive.google.com/drive/folders/1vr-ib8zvZagH_QeE4JSUtAUpp3EG75va
Any ideas what I am doing wrong?
import os
import librosa
import soundfile as sf
folder_name = "trial_sess"
os.chdir(os.path.join("process",folder_name))
for file in os.listdir():
if file.endswith(".m4a") or file.endswith(".mp4") or file.endswith(".mp3"):
nm,ext = file.split(".")
sr = librosa.get_samplerate(file)
y, sr = librosa.load(file, sr = sr)
sf.write(os.path.join(zoom_loc,"sessions",folder_name,"output_resampled_audio" + "." + "wav"), data = y, samplerate=16000)

There is no resampling performed in your code, and the incorrect samplerate is passed to write.
librosa.load will resample on-demand if the sr argument is different from that of the original file. So the code should be something like:
# Rate, in Hz, that the written file should have.
target_sr = 16000

# Asking librosa.load for a specific `sr` makes it resample while loading,
# so the rate it reports back should already match the target.
y, sr = librosa.load(file, sr=target_sr)
assert sr == target_sr  # check that librosa did resample

# Write the resampled samples, declaring the same rate they were resampled to.
out_name = "output_resampled_audio" + "." + "wav"
out_path = os.path.join(zoom_loc, "sessions", folder_name, out_name)
sf.write(out_path, data=y, samplerate=target_sr)

Related

Librosa Split .wav file into 15s intervals

I'm new to working with audio files. I have several 60 second long files that I want to split into 15 second files (or any length). I'm able to split files into 1 second long files (so 60 files) but can't seem to get 15 second intervals to work. How can I create the intervals I'm looking for?
import os
import numpy as np
import librosa
import librosa.display
audio_dir = r'data\acoustics\recordings'
out_dir = r'data\acoustics\splits'
os.makedirs(out_dir, exist_ok=True)
audio_file = os.path.join(audio_dir, 'rec_20220729T160547Z.wav')
wave, sr = librosa.load(audio_file, sr=None)
# Number of sections of `sr` samples each (i.e. one second per section),
# rounded up so a trailing partial section is kept.
num_sections = int(np.ceil(len(wave) / sr))

# Cut the waveform into consecutive chunks of `sr` samples.
split = []
for i in range(num_sections):
    t = wave[i * sr : i * sr + sr]
    split.append(t)

# Save each chunk as "<recording>_<index>.wav" in the output directory.
for i in range(num_sections):
    recording_name = os.path.basename(audio_file[:-4])
    out_file = f"{recording_name}_{str(i)}.wav"
    sf.write(os.path.join(out_dir, out_file), split[i], sr)
What you have done is mostly correct. It just needs minor changes.
First is getting the data which you have done correctly.
import os
import numpy as np
import librosa
import librosa.display
import soundfile as sf # Missing import
audio_dir = r'data\acoustics\recordings'
out_dir = r'data\acoustics\splits'
os.makedirs(out_dir, exist_ok=True)
audio_file = os.path.join(audio_dir, 'rec_20220729T160547Z.wav')
wave, sr = librosa.load(audio_file, sr=None)
Calculate the length of segment:
segment_dur_secs = 15
segment_length = sr * segment_dur_secs
Breaking up the data and saving to file:
num_sections = int(np.ceil(len(wave) / segment_length))
split = []
for i in range(num_sections):
t = wave[i * segment_length: (i + 1) * segment_length]
split.append(t)
for i in range(num_sections):
recording_name = os.path.basename(audio_file[:-4])
out_file = f"{recording_name}_{str(i)}.wav"
sf.write(os.path.join(out_dir, out_file), split[i], sr)
Alternatively:
split = []
for s in range(0, len(wave), segment_length):
t = wave[s: s + segment_length]
split.append(t)
recording_name = os.path.basename(audio_file[:-4])
for i, segment in enumerate(split):
out_file = f"{recording_name}_{i}.wav"
sf.write(os.path.join(out_dir, out_file), segment, sr)
Edit: There is an issue with the code here because sf is not defined. (Fixed the import)

How to convert a numpy array to a mp3 file

I am using the soundcard library to record my microphone input, it records in a NumPy array and I want to grab that audio and save it as an mp3 file.
Code:
import soundcard as sc
import numpy
import threading
speakers = sc.all_speakers() # Gets a list of the systems speakers
default_speaker = sc.default_speaker() # Gets the default speaker
mics = sc.all_microphones() # Gets a list of all the microphones
default_mic = sc.get_microphone('Headset Microphone (Arctis 7 Chat)') # Gets the default microphone
# Records the default microphone
def record_mic():
print('Recording...')
with default_mic.recorder(samplerate=48000) as mic, default_speaker.player(samplerate=48000) as sp:
for _ in range(1000000000000):
data = mic.record(numframes=None) # 'None' creates zero latency
sp.play(data)
# Save the mp3 file here
recordThread = threading.Thread(target=record_mic)
recordThread.start()
With Scipy (to wav file)
You can easily convert to wav and then separately convert wav to mp3. More details here.
import numpy as np
from scipy.io.wavfile import write

samplerate = 44100  # samples per second of the generated file
fs = 100  # frequency of the generated sine tone in Hz (not a sampling rate)

# Time axis from 0 to 1 with one point per sample.
t = np.linspace(0., 1., samplerate)
# Use the largest int16 value as peak amplitude so the tone is full scale.
amplitude = np.iinfo(np.int16).max
data = amplitude * np.sin(2. * np.pi * fs * t)
# Cast to 16-bit integers before writing.
write("example.wav", samplerate, data.astype(np.int16))
With pydub (to mp3)
Try this function from this excellent thread -
import pydub
import numpy as np
def write(f, sr, x, normalized=False):
    """Encode a NumPy array as an MP3 file using pydub.

    Args:
        f: Output path or file object, passed to ``AudioSegment.export``.
        sr: Sample rate of ``x`` in Hz.
        x: Samples as a NumPy array; an ``(n, 2)`` array is written as
            stereo, anything else as a single channel.
        normalized: If true, ``x`` holds floats in [-1, 1) and is scaled by
            2**15; otherwise ``x`` is taken to be in int16 units already.
    """
    channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    scaled = x * 2 ** 15 if normalized else x
    # Saturate before the cast: without this, values outside the int16 range
    # wrap around (a full-scale 1.0 would become -32768) and the result is
    # heavily distorted audio.
    limits = np.iinfo(np.int16)
    y = np.clip(scaled, limits.min, limits.max).astype(np.int16)
    # sample_width=2 matches the two bytes per int16 sample produced above.
    song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
    song.export(f, format="mp3", bitrate="320k")
#[[-225 707]
# [-234 782]
# [-205 755]
# ...,
# [ 303 89]
# [ 337 69]
# [ 274 89]]
write('out2.mp3', sr, x)
Note: Output MP3 will of course be 16-bit, because MP3s are always 16 bit. However, you can set sample_width=3 as suggested by @Arty for 24-bit input.
As of now the accepted answer produces extremely distorted sound, at least in my case, so here is an improved version:
#librosa read
y,sr=librosa.load(dir+file,sr=None)
y=librosa.util.normalize(y)
#pydub read
sound=AudioSegment.from_file(dir+file)
channel_sounds = sound.split_to_mono()
samples = [s.get_array_of_samples() for s in channel_sounds]
fp_arr = np.array(samples).T.astype(np.float32)
fp_arr /= np.iinfo(samples[0].typecode).max
fp_arr=np.array([x[0] for x in fp_arr])
#i normalize the pydub waveform with librosa for comparison purposes
fp_arr=librosa.util.normalize(fp_arr)
So you can read the audio file with any library to obtain a waveform, and then export it to any pydub-supported codec with the code below. I also tried it with the waveform read by librosa and it works perfectly.
import io

import scipy.io.wavfile
from pydub import AudioSegment

# `waveform` and `sample_rate` are the sample array and its rate, as obtained
# from whichever reader was used to load the audio.

# Serialize the waveform as a WAV file held in memory instead of on disk.
wav_io = io.BytesIO()
scipy.io.wavfile.write(wav_io, sample_rate, waveform)
wav_io.seek(0)  # rewind so pydub reads the buffer from the start

# Let pydub parse the in-memory WAV and re-encode it as MP3.
sound = AudioSegment.from_wav(wav_io)
with open("file_exported_by_pydub.mp3", 'wb') as af:
    sound.export(
        af,
        format='mp3',
        codec='mp3',
        bitrate='160000',
    )

Can I convert spectrograms generated with librosa back to audio?

I converted some audio files to spectrograms and saved them to files using the following code:
import os
from matplotlib import pyplot as plt
import librosa
import librosa.display
import IPython.display as ipd
audio_fpath = "./audios/"
spectrograms_path = "./spectrograms/"
audio_clips = os.listdir(audio_fpath)
def generate_spectrogram(x, sr, save_name):
X = librosa.stft(x)
Xdb = librosa.amplitude_to_db(abs(X))
fig = plt.figure(figsize=(20, 20), dpi=1000, frameon=False)
ax = fig.add_axes([0, 0, 1, 1], frameon=False)
ax.axis('off')
librosa.display.specshow(Xdb, sr=sr, cmap='gray', x_axis='time', y_axis='hz')
plt.savefig(save_name, quality=100, bbox_inches=0, pad_inches=0)
librosa.cache.clear()
for i in audio_clips:
audio_fpath = "./audios/"
spectrograms_path = "./spectrograms/"
audio_length = librosa.get_duration(filename=audio_fpath + i)
j=60
while j < audio_length:
x, sr = librosa.load(audio_fpath + i, offset=j-60, duration=60)
save_name = spectrograms_path + i + str(j) + ".jpg"
generate_spectrogram(x, sr, save_name)
j += 60
if j >= audio_length:
j = audio_length
x, sr = librosa.load(audio_fpath + i, offset=j-60, duration=60)
save_name = spectrograms_path + i + str(j) + ".jpg"
generate_spectrogram(x, sr, save_name)
I wanted to keep the most detail and quality from the audios, so that i could turn them back to audio without too much loss (They are 80MB each).
Is it possible to turn them back to audio files? How can I do it?
I tried using librosa.feature.inverse.mel_to_audio, but it didn't work, and I don't think it applies.
I now have 1300 spectrogram files and want to train a Generative Adversarial Network with them, so that I can generate new audios, but I don't want to do it if i wont be able to listen to the results later.
Yes, it is possible to recover most of the signal and estimate the phase with e.g. Griffin-Lim Algorithm (GLA). Its "fast" implementation for Python can be found in librosa. Here's how you can use it:
import numpy as np
import librosa
y, sr = librosa.load(librosa.util.example_audio_file(), duration=10)
S = np.abs(librosa.stft(y))
y_inv = librosa.griffinlim(S)
And this is what the original and the reconstruction look like:
The algorithm by default randomly initialises the phases and then iterates forward and inverse STFT operations to estimate the phases.
Looking at your code, to reconstruct the signal, you'd just need to do:
import numpy as np
X_inv = librosa.griffinlim(np.abs(X))
It's just an example, of course. As pointed out by @PaulR, in your case you'd need to load the data from the JPEG (which is lossy!) and then apply the inverse of amplitude_to_db first.
The algorithm, especially the phase estimation, can be further improved thanks to advances in artificial neural networks. Here is one paper that discusses some enhancements.

How to change audio playback speed using Pydub?

I am a new learner of the audio editing library Pydub. I want to change the playback speed of some audio files (say .wav/.mp3 files) using Pydub, but I don't know how to do it. The only function I saw that could possibly deal with this problem is speedup in effects.py. However, there is no explanation of how I am supposed to call it.
Could anyone kindly explain how to do this task in Pydub? Many thanks!
(A related question: Pydub - How to change frame rate without changing playback speed, but what I want to do is to change the playback speed without changing the audio quality.)
This can be done using pyrubberband package which requires rubberband library that can stretch audio while keeping the pitch and high quality. I was able to install the library on MacOS using brew, and same on Ubuntu with apt install. For extreme stretching, look into PaulStretch
brew install rubberband
This works simply with librosa package
import librosa
import pyrubberband
import soundfile as sf
y, sr = librosa.load(filepath, sr=None)
y_stretched = pyrubberband.time_stretch(y, sr, 1.5)
sf.write(analyzed_filepath, y_stretched, sr, format='wav')
To make pyrubberband work directly with AudioSegment from pydub without librosa I fiddled this function:
def change_audioseg_tempo(audiosegment, tempo, new_tempo):
    """Time-stretch a pydub ``AudioSegment`` from ``tempo`` to ``new_tempo``.

    Args:
        audiosegment: Source segment; its ``channels`` and ``frame_rate``
            attributes are read. Two-channel input is handled as stereo.
        tempo: Current tempo of the audio.
        new_tempo: Desired tempo; the stretch ratio is ``new_tempo / tempo``.

    Returns:
        A new ``AudioSegment`` with 2-byte samples at the original frame rate.
    """
    # Imported here so the function works on its own, whatever aliases the
    # surrounding script happens to use.
    import numpy as np
    import pydub
    import pyrubberband as pyrb

    y = np.array(audiosegment.get_array_of_samples())
    if audiosegment.channels == 2:
        # Turn the flat sample array into one row per frame, one column per channel.
        y = y.reshape((-1, 2))
    sample_rate = audiosegment.frame_rate
    tempo_ratio = new_tempo / tempo
    print(tempo_ratio)
    y_fast = pyrb.time_stretch(y, sample_rate, tempo_ratio)
    channels = 2 if (y_fast.ndim == 2 and y_fast.shape[1] == 2) else 1
    # y_fast is presumably float audio in [-1, 1) (hence the 2**15 scaling).
    # Saturate before the cast so any overshoot clips instead of wrapping
    # around to the opposite sign.
    y = np.clip(y_fast * 2 ** 15, -32768, 32767).astype(np.int16)
    new_seg = pydub.AudioSegment(y.tobytes(), frame_rate=sample_rate, sample_width=2, channels=channels)
    return new_seg
from pydub import AudioSegment, effects

root = r'audio.wav'
velocidad_X = 1.5  # Must not be below 1.0

# Load the source file and speed it up with pydub's built-in effect.
sound = AudioSegment.from_file(root)
so = sound.speedup(playback_speed=velocidad_X, chunk_size=150, crossfade=25)

# Drop the 4-character extension from the input name and append the new suffix.
out_path = root[:-4] + '_Out.mp3'
so.export(out_path, format='mp3')
I know it's late but I wrote a program to convert mp3 to different playback speed.
First, convert the .mp3 to .wav, because PyRubberBand supports only the .wav format. Then time-stretch the audio; Rubber Band changes the duration while keeping the pitch, which avoids the chipmunk effect.
import sys
import wave

import pyrubberband as pyrb
import soundfile as sf
from pydub import AudioSegment

# (time-stretch rate, output file stem) for every variant that is produced.
# The stems are kept exactly as in the original script, including "X105" for
# the 1.5x variant.
_VARIANTS = (
    (0.5, "analyzed_filepathX5"),        # extra low speed
    (0.75, "analyzed_filepathX75"),      # low speed
    (1.5, "analyzed_filepathX105"),      # 1.5x speed
    (1, "analyzed_filepathXnormal"),     # same speed
)


def _export_stretched(y, sr, rate, stem):
    """Time-stretch ``y`` by ``rate`` and save it as ``<stem>.wav`` and ``<stem>.mp3``."""
    # Only time_stretch is needed: the pitch_shift results computed by the
    # earlier version of this script were never written anywhere, so the
    # output files were always just the time-stretched audio.
    y_stretch = pyrb.time_stretch(y, sr, rate)
    sf.write(stem + ".wav", y_stretch, sr, format='wav')
    # Re-read the WAV with pydub to get an MP3 next to it.
    sound = AudioSegment.from_wav(stem + ".wav")
    sound.export(stem + ".mp3", format="mp3")


def main():
    """Convert the MP3 named on the command line into several speed variants."""
    # The MP3 is first converted to "file.wav", which is then read back as samples.
    sound = AudioSegment.from_mp3(sys.argv[1])
    sound.export("file.wav", format="wav")
    print(sys.argv[1])

    y, sr = sf.read("file.wav")
    for rate, stem in _VARIANTS:
        _export_stretched(y, sr, rate, stem)


if __name__ == "__main__":
    main()
**Make sure to install:**
Wave, AudioSegment, FFmpeg, PYRubberBand, Soundfile
To use this Run,
python3 filename.py mp3filename.mp3
To change the speed of the audio without changing the pitch (i.e. without creating a chipmunk effect), you can use the code below.
from pydub import AudioSegment
from pydub.effects import speedup

# The input path is given as a string.
audio = AudioSegment.from_mp3("song.mp3")
# 1.5 is the playback speed and 150 the chunk size.
new_file = speedup(audio, playback_speed=1.5, chunk_size=150)
new_file.export("file.mp3", format="mp3")

How to change audio speed without changing pitch?

I need to apply audio to a video at a certain time with a certain duration, but some audio durations are bigger (or smaller) than needed. How can I change the speed of the audio without changing the pitch? I tried to change the fps (by multiplying it by the ratio of the needed duration to the audio duration), but it does not work as I want.
original = VideoFileClip("orig.mp4")
# Start with the video's own audio track, attenuated to 30 % volume.
clips = [original.audio.volumex(0.3)]
subs = [] #some array
i = 0
for sub in subs:
    # One numbered temp file per subtitle: \temp0.mp3, \temp1.mp3, ...
    clip = AudioFileClip("\\temp{}.mp3".format(i))
    # Ratio of the clip's duration to the slot it has to fit into; the small
    # offset keeps the ratio from being exactly zero.
    mult = clip.duration / (sub.end - sub.start) + 0.00001
    # Sample the clip at a scaled fps but declare the array as 24000 fps,
    # which changes its playback duration (and with it the pitch).
    clip = AudioArrayClip(clip.to_soundarray(buffersize=500, fps=24000/mult), fps=24000).set_start(sub.start).set_end(sub.end)
    clips.append(clip)
    i += 1
final = CompositeAudioClip(clips)
final.write_audiofile("final.mp3")
You can use the librosa module:
import librosa
import numpy as np
from scipy.io import wavfile

song, fs = librosa.load("song.wav")
# `rate` is passed by keyword: recent librosa releases no longer accept it
# positionally, and the keyword form also works on older ones.
song_2_times_faster = librosa.effects.time_stretch(song, rate=2)
# Only `wavfile` is imported from scipy.io, so call it through that name.
wavfile.write("song_2_times_faster.wav", fs, song_2_times_faster)  # save the song
Using wave: Change the sampling rate
import wave

Change_RATE = 2  # factor applied to the source file's frame rate


def change_rate(src_path, dst_path, factor):
    """Copy a WAV file, multiplying its declared frame rate by ``factor``.

    The sample data is copied unchanged; only the frame rate stored in the
    header differs. The same samples played at a different rate change speed
    and pitch together.

    Args:
        src_path: Path of the WAV file to read.
        dst_path: Path of the WAV file to write.
        factor: Multiplier for the frame rate (2 doubles it).
    """
    # Read channel count and sample width from the source instead of assuming
    # mono 16-bit, so stereo or 8/24-bit input is not written out corrupted.
    with wave.open(src_path, 'rb') as spf:
        n_channels = spf.getnchannels()
        sample_width = spf.getsampwidth()
        rate = spf.getframerate()
        signal = spf.readframes(spf.getnframes())

    # The context managers close both files even if a call above or below fails.
    with wave.open(dst_path, 'wb') as wf:
        wf.setnchannels(n_channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(int(round(rate * factor)))
        wf.writeframes(signal)


if __name__ == "__main__":
    change_rate('VOZ.wav', 'changed.wav', Change_RATE)

Categories