I'm using the script found on this blog, Google speech recognition with python (all credit to the author):
import sys
import pyaudio, speex
import numpy as np # just for doing a standard deviation for audio level checks
import urllib2
import wave
e = speex.Encoder()
e.initialize(speex.SPEEX_MODEID_WB)
d = speex.Decoder()
d.initialize(speex.SPEEX_MODEID_WB)
chunk = 320 # tried other numbers... some don't work
FORMAT = pyaudio.paInt16
bytespersample=2
CHANNELS = 1
RATE = 16000 # "wideband" mode for speex. May work with 8000. Haven't tried it.
p = pyaudio.PyAudio()
# Start the stream to record the audio
stream = p.open(format = FORMAT,
                channels = CHANNELS,
                rate = RATE,
                input = True,
                output = True,
                frames_per_buffer = chunk)
print "Listening. Recording will start when some sound is heard."
threshold = 200 # Adjust this to be slightly above the noise level of your recordings.
nquit = 40 # number of silent frames before terminating the program
nover = 0
keepgoing = True
spxlist=[] # list of the encoded speex packets/frames
while keepgoing:
    data = stream.read(chunk) # grab 320 samples from the microphone
    spxdata = e.encode(data) # encode using the speex dll
    print "Length encoded: %d"%len(spxdata) # print the length, after encoding. Can't exceed 255!
    spxlist.append(spxdata)
    a=np.frombuffer(data,np.int16) # convert to numpy array to check for silence or audio
    audiolevel=np.std(a)
    if audiolevel < threshold: # too quiet
        nover+=1
    else:
        nover=0
    if nover >= nquit:
        keepgoing=False
    print '%2.1f (%d%% quiet)'%(audiolevel, nover*100/nquit)
print "Too quiet. I'm stopping now."
stream.stop_stream()
stream.close()
p.terminate()
fullspx=''.join(spxlist) # make a string of all the header-ized speex packets
out_file = open("test.spx","wb")
out_file.write(fullspx)
out_file.close()
As you can see, I slightly modified the script to make it write an output file in .spx, but it doesn't work.
Any advice?
Thanks for your help.
Edit:
I'm running this script on an Ubuntu Linux machine.
I'm trying to do beat detection using a PC microphone and then, with the timestamps of the beats, calculate the distance between multiple successive beats. I have chosen Python because there is plenty of material available and it's quick to develop. By searching the internet I have come up with this simple code (no advanced peak detection or anything yet; that comes later if need be):
import pyaudio
import struct
import math
import time
SHORT_NORMALIZE = (1.0/32768.0)
def get_rms(block):
    # RMS amplitude is defined as the square root of the
    # mean over time of the square of the amplitude.
    # so we need to convert this string of bytes into
    # a string of 16-bit samples...
    # we will get one short out for each
    # two chars in the string.
    count = len(block)/2
    format = "%dh" % (count)
    shorts = struct.unpack(format, block)
    # iterate over the block.
    sum_squares = 0.0
    for sample in shorts:
        # sample is a signed short in +/- 32768.
        # normalize it to 1.0
        n = sample * SHORT_NORMALIZE
        sum_squares += n*n
    return math.sqrt(sum_squares / count)
CHUNK = 32
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)
elapsed_time = 0
prev_detect_time = 0
while True:
    data = stream.read(CHUNK)
    amplitude = get_rms(data)
    if amplitude > 0.05: # value set by observing graphed data captured from mic
        elapsed_time = time.perf_counter() - prev_detect_time
        if elapsed_time > 0.1: # guard against multiple spikes at beat point
            print(elapsed_time)
            prev_detect_time = time.perf_counter()

def close_stream():
    stream.stop_stream()
    stream.close()
    p.terminate()
The code works pretty well in silence, and I was pretty satisfied the first couple of times I ran it, but then I tested how accurate it was and I was a little less satisfied. To test this I used two methods: a phone with a metronome set to 60 bpm (emitting tic-toc sounds into the microphone), and an Arduino hooked to a beeper, triggered at a 1 Hz rate by an accurate Chronodot RTC. The beeper beeps into the microphone, triggering a detection. With both methods the results look similar (the numbers represent the distance between two beat detections, in seconds):
0.9956681643835616
1.0056331689497717
0.9956100091324198
1.0058207853881278
0.9953449497716891
1.0052103013698623
1.0049350136986295
0.9859074337899543
1.004996383561644
0.9954095342465745
1.0061518904109583
0.9953025753424658
1.0051235068493156
1.0057199634703196
0.984839305936072
1.00610396347032
0.9951862648401821
1.0053146301369864
0.9960100821917806
1.0053391780821919
0.9947373881278523
1.0058608219178105
1.0056580091324214
0.9852110319634697
1.0054473059360731
0.9950465753424638
1.0058237077625556
0.995704694063928
1.0054566575342463
0.9851026118721435
1.0059882374429243
1.0052523835616398
0.9956161461187207
1.0050863926940607
0.9955758173515932
1.0058052968036577
0.9953960913242028
1.0048014611872205
1.006336876712325
0.9847434520547935
1.0059712876712297
Now I'm pretty confident that at least the Arduino is accurate to 1 ms (which is the targeted accuracy). The results tend to be off by ±5 ms, but now and then even 15 ms, which is unacceptable. Is there a way to achieve greater accuracy, or is this a limitation of Python / the soundcard / something else? Thank you!
EDIT:
After incorporating tom10's and barny's suggestions, the code looks like this:
import pyaudio
import struct
import math
import psutil
import os
def set_high_priority():
    p = psutil.Process(os.getpid())
    p.nice(psutil.HIGH_PRIORITY_CLASS)
SHORT_NORMALIZE = (1.0/32768.0)
def get_rms(block):
    # RMS amplitude is defined as the square root of the
    # mean over time of the square of the amplitude.
    # so we need to convert this string of bytes into
    # a string of 16-bit samples...
    # we will get one short out for each
    # two chars in the string.
    count = len(block)/2
    format = "%dh" % (count)
    shorts = struct.unpack(format, block)
    # iterate over the block.
    sum_squares = 0.0
    for sample in shorts:
        # sample is a signed short in +/- 32768.
        # normalize it to 1.0
        n = sample * SHORT_NORMALIZE
        sum_squares += n*n
    return math.sqrt(sum_squares / count)
CHUNK = 4096
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
RUNTIME_SECONDS = 10
set_high_priority()
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)
elapsed_time = 0
prev_detect_time = 0
amplitudes = []  # per-group RMS values
TIME_PER_CHUNK = 1000 / RATE * CHUNK
SAMPLE_GROUP_SIZE = 32 # 1 sample = 2 bytes, group is closest to 1 msec elapsing
TIME_PER_GROUP = 1000 / RATE * SAMPLE_GROUP_SIZE
for i in range(0, int(RATE / CHUNK * RUNTIME_SECONDS)):
    data = stream.read(CHUNK)
    time_in_chunk = 0
    group_index = 0
    for j in range(0, len(data), (SAMPLE_GROUP_SIZE * 2)):
        group = data[j:(j + (SAMPLE_GROUP_SIZE * 2))]
        amplitude = get_rms(group)
        amplitudes.append(amplitude)
        if amplitude > 0.02:
            current_time = (elapsed_time + time_in_chunk)
            time_since_last_beat = current_time - prev_detect_time
            if time_since_last_beat > 500:
                print(time_since_last_beat)
                prev_detect_time = current_time
        time_in_chunk = (group_index+1) * TIME_PER_GROUP
        group_index += 1
    elapsed_time = (i+1) * TIME_PER_CHUNK
stream.stop_stream()
stream.close()
p.terminate()
With this code I achieved the following results (this time the units are milliseconds instead of seconds):
999.909297052154
999.9092970521542
999.9092970521542
999.9092970521542
999.9092970521542
1000.6349206349205
999.9092970521551
999.9092970521524
999.9092970521542
999.909297052156
999.9092970521542
999.9092970521542
999.9092970521524
999.9092970521542
Which, unless I made a mistake somewhere, looks a lot better than before and reaches sub-millisecond accuracy. I thank tom10 and barny for their help.
The reason you're not getting the right timing for the beats is that you're missing chunks of the audio data. That is, the chunks are being read by the soundcard, but you're not collecting the data before it's overwritten with the next chunk.
First, though, for this problem you need to distinguish between the ideas of timing accuracy and real-time response.
The timing accuracy of a sound card should be very good, much better than a ms, and you should be able to capture all of this accuracy in the data you read from the soundcard. The real-time responsiveness of your computer's OS, on the other hand, will be quite poor, much worse than a ms. That is, you should easily be able to identify audio events (such as beats) to within a ms, but not identify them at the time they happen (instead, 30-200 ms later depending on your system). This arrangement usually works for computers because general human perception of the timing of events is much coarser than a ms (except for rare specialized perceptual systems, like comparing the timing of auditory events between the two ears, etc).
The specific problem with your code is that CHUNK is much too small for the OS to keep up. You have it at 32, so at 44100 Hz the OS needs to get to the sound card every 0.7 ms, which is too short a time for a computer that's tasked with doing many other things. If your OS doesn't get the chunk before the next one comes in, the original chunk is overwritten and lost.
To get this working so it's consistent with the constraints above, make CHUNK much larger than 32, more like 1024 (as in the PyAudio examples). Depending on your computer and what it's doing, even that may not be long enough.
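For illustration, here is a minimal sketch of that change, assuming the same mono, 16-bit, 44100 Hz setup as the question; the timestamp of each chunk is derived from the number of samples read rather than from the wall clock, which is what preserves the soundcard's timing accuracy:
import pyaudio

RATE = 44100
CHUNK = 1024  # large enough that the OS can service the buffer in time

p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=RATE,
                input=True, frames_per_buffer=CHUNK)

samples_read = 0
for _ in range(int(RATE / CHUNK * 10)):      # roughly 10 seconds
    data = stream.read(CHUNK)                # blocks until a full chunk arrives
    chunk_start_time = samples_read / RATE   # seconds of audio, not wall-clock time
    samples_read += CHUNK
    # ... scan `data` for the beat and add the in-chunk sample offset to chunk_start_time

stream.stop_stream()
stream.close()
p.terminate()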
If this type of approach won't work for you, you will probably need a dedicated real-time system like an Arduino. (Generally, though, this isn't necessary, so think twice before you decide you need the Arduino. Usually, when I've seen people need true real-time, it's when trying to do something quantitative and interactive with a human, like flash a light, have the person tap a button, flash another light, have the person tap another button, etc., to measure response times.)
I'm looking to detect the tempo of an audio file in Python 3.6, but I don't really understand the aubio documentation. Could someone please indicate how to extract the tempo with aubio or another library?
Updated
This command will give you the tempo estimate of the entire file (available in 0.4.5):
aubio tempo foo.wav
There is a simple demo in aubio's python/demos: demo_bpm_extract.py.
The most important part is the following two lines, which compute the periods between consecutive beats (np.diff), convert these periods to bpm (60./), and take the median (np.median) as the most likely bpm candidate for this series of beats:
#!/usr/bin/env python
import numpy as np
bpms = 60./np.diff(beats)
median_bpm = np.median(bpms)
Note how the median is better suited than the mean here, since it will always give an estimate which exists in the original population bpms.
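As a toy illustration (the beat times below are made up, not taken from a real file):
import numpy as np

beats = np.array([0.50, 1.01, 1.50, 2.02, 2.50])  # hypothetical beat times in seconds
bpms = 60. / np.diff(beats)   # one bpm value per inter-beat interval
print(np.median(bpms))        # ~120 bpm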
I found this code by Paul Brossier that could help you, here it is:
#! /usr/bin/env python
from aubio import source, tempo
from numpy import median, diff
def get_file_bpm(path, params = None):
    """ Calculate the beats per minute (bpm) of a given file.
        path: path to the file
        param: dictionary of parameters
    """
    if params is None:
        params = {}
    try:
        win_s = params['win_s']
        samplerate = params['samplerate']
        hop_s = params['hop_s']
    except KeyError:
        """
        # super fast
        samplerate, win_s, hop_s = 4000, 128, 64
        # fast
        samplerate, win_s, hop_s = 8000, 512, 128
        """
        # default:
        samplerate, win_s, hop_s = 44100, 1024, 512

    s = source(path, samplerate, hop_s)
    samplerate = s.samplerate
    o = tempo("specdiff", win_s, hop_s, samplerate)
    # List of beats, in samples
    beats = []
    # Total number of frames read
    total_frames = 0

    while True:
        samples, read = s()
        is_beat = o(samples)
        if is_beat:
            this_beat = o.get_last_s()
            beats.append(this_beat)
            #if o.get_confidence() > .2 and len(beats) > 2.:
            #    break
        total_frames += read
        if read < hop_s:
            break

    # Convert to periods and to bpm
    if len(beats) > 1:
        if len(beats) < 4:
            print("few beats found in {:s}".format(path))
        bpms = 60./diff(beats)
        b = median(bpms)
    else:
        b = 0
        print("not enough beats found in {:s}".format(path))
    return b

if __name__ == '__main__':
    import sys
    for f in sys.argv[1:]:
        bpm = get_file_bpm(f)
        print("{:6s} {:s}".format("{:2f}".format(bpm), f))
this is the key part:
bpms = 60./np.diff(beats)
median_bpm = np.median(bpms)
On the client side, I am sending a blob audio (wav) file. On the server side, I am trying to convert the blob file to an audio wav file. I did the following:
blob = request.FILES['file']
name = "TEST.wav"
audio = wave.open(name, 'wb')
audio.setnchannels(1)
audio.writeframes(blob.read())
I thought that converting the blob would be similar to converting a blob image to a jpeg file, but I was very incorrect in that assumption. That didn't work; I get an error: "Error: sample width not specified." I then used setsampwidth() and tossed in an arbitrary number between 1 and 4 (after looking at the wave.py source file... I don't know why the bytes have to be between 1 and 4). After that, another error is thrown: "Error: sampling rate not specified." How do I specify the sampling rate?
What do the setnchannels() and setsampwidth() methods do? Is there an "easy" way to generate the wav file from the blob?
I had never done this before, but in my test the script below worked for me (although the audio output isn't the same as the original file).
>>> nchannels = 2
>>> sampwidth = 2
>>> framerate = 8000
>>> nframes = 100
>>>
>>> import wave
>>>
>>> name = 'output.wav'
>>> audio = wave.open(name, 'wb')
>>> audio.setnchannels(nchannels)
>>> audio.setsampwidth(sampwidth)
>>> audio.setframerate(framerate)
>>> audio.setnframes(nframes)
>>>
>>> blob = open("original.wav").read() # such as `blob.read()`
>>> audio.writeframes(blob)
>>>
I found this method at https://stackoverflow.com/a/3637480/6396981
Finally, by changing the values of nchannels and sampwidth to 1, I got audio that matched the original file:
nchannels = 1
sampwidth = 1
framerate = 8000
nframes = 1
Tested under Python 2; under Python 3 it raises UnicodeDecodeError: 'utf-8' codec can't decode byte 0x95 in position 4: invalid start byte (the original file is opened in text mode there; opening it with "rb" reads the raw bytes instead).
I encountered the same problem as well. My problem was a low-pitched output compared to the original. I managed to reverse engineer the original audio to get nframes, samplerate, and sampwidth using getnframes(), getframerate(), and getsampwidth() respectively. In the end I managed to tweak the sample frequency / frame rate to get the right tone.
The tweaking became perfect at a certain frequency offset from the original. Mine worked fine with an offset of one sixteenth of the original samplerate added to it.
i.e.
OffsetFrequency = OriginalFrequency/16
Frequency = OriginalFrequency + OffsetFrequency
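A small sketch of that reverse-engineering step, assuming the original recording is available on disk to inspect (the file names here are placeholders):
import wave

src = wave.open("original.wav", "rb")
nchannels = src.getnchannels()
sampwidth = src.getsampwidth()
framerate = src.getframerate()
frames = src.readframes(src.getnframes())
src.close()

# reuse the recovered parameters when writing, so pitch and speed match
dst = wave.open("rebuilt.wav", "wb")
dst.setnchannels(nchannels)
dst.setsampwidth(sampwidth)
dst.setframerate(framerate)
dst.writeframes(frames)
dst.close()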
I have a python script that receives chunks of binary raw audio data and I would like to change the sample rate of those chunks to 16000 and then pipe them to another component.
I tried my luck with audiotools but without success:
# f is a filelike FIFO buffer
reader = PCMFileReader(f, 44100, 1, 1, 16)
conv = PCMConverter(reader, 16000, 1, 1, 16)
Then I just write to the buffer anytime, I get a new chunk:
f.write(msg)
And read from the buffer in another thread:
while not reader.file.closed:
fl = conv.read(10)
chunk = fl.to_bytes(False, True)
The problem is that I get this value error, which seems to come from a "samplerate.c" library:
ValueError: SRC_DATA->data_out is NULL
This error only occurs with resampling. If I turn off that step, then everything works fine and I get playable audio.
Therefore my question: what would be a good tool for this task? And if audiotools turns out to be the right answer, how do I do it correctly?
Here is a simplified resampler. dataFormat is the number of bytes per sample in the stream (e.g. stereo 16-bit would be 4), original_samples is the source sample count, desired_samples is the desired sample count (e.g. for 16 kHz -> 44.1 kHz, original = 160 and desired = 441), pcm is the source binary string, and the return value is the resampled binary string:
import itertools

def resampleSimplified(pcm, desired_samples, original_samples, dataFormat):
    samples_to_pad = desired_samples - original_samples

    q, r = divmod(desired_samples, original_samples)
    times_to_pad_up = q + int(bool(r))
    times_to_pad_down = q

    # split the pcm string into one entry per sample
    pcmList = [pcm[i:i+dataFormat] for i in range(0, len(pcm), dataFormat)]

    if samples_to_pad > 0:
        # extending pcm times_to_pad times
        pcmListPadded = list(itertools.chain.from_iterable(
            itertools.repeat(x, times_to_pad_up) for x in pcmList)
        )
    else:
        # shrinking pcm times_to_pad times
        if times_to_pad_down > 0:
            pcmListPadded = pcmList[::(times_to_pad_down)]
        else:
            pcmListPadded = pcmList

    padded_pcm = ''.join(pcmListPadded[:desired_samples])
    return padded_pcm
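A hypothetical call for the 16 kHz -> 44.1 kHz case mentioned above (mono 16-bit, so dataFormat = 2; Python 2 byte strings, which is what the ''.join in the function expects):
# 10 ms of silent mono 16-bit audio at 16 kHz = 160 samples = 320 bytes
chunk_16k = "\x00\x00" * 160
chunk_44k1 = resampleSimplified(chunk_16k, desired_samples=441,
                                original_samples=160, dataFormat=2)
print(len(chunk_44k1))  # 882 bytes = 441 samples * 2 bytes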
I want to know how to get samples out of a .wav file in order to perform a windowed join of two .wav files.
Can anyone please tell me how to do this?
The wave module of the standard library is the key: after of course import wave at the top of your code, wave.open('the.wav', 'r') returns a "wave read" object from which you can read frames with the .readframes method, which returns a string of bytes which are the samples... in whatever format the wave file has them (you can determine the two parameters relevant to decomposing frames into samples with the .getnchannels method for the number of channels, and .getsampwidth for the number of bytes per sample).
The best way to turn the string of bytes into a sequence of numeric values is with the array module, with a type code of (respectively) 'B', 'H', 'L' for 1, 2, 4 bytes per sample (on a 32-bit build of Python; you can use the itemsize value of your array object to double-check this). If you have sample widths that array can't handle, you'll need to slice up the byte string (padding each little slice appropriately with zero bytes) and use the struct module instead (but that's clunkier and slower, so use array when you can).
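As a rough sketch of that approach for a 16-bit mono file (note that 16-bit PCM samples are signed, so the signed type code 'h' is used here rather than 'H'):
import wave
import array

w = wave.open('the.wav', 'rb')
print(w.getnchannels(), w.getsampwidth())  # e.g. 1 channel, 2 bytes per sample
frames = w.readframes(w.getnframes())      # raw bytes
w.close()

samples = array.array('h')                 # one signed 16-bit value per sample
samples.frombytes(frames)                  # use .fromstring(frames) on Python 2
print(len(samples), samples[:10])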
You can use the wave module. First you should read the metadata, such as the sample size or the number of channels. Using the readframes() method, you can read samples, but only as a byte string. Based on the sample format, you have to convert them to numeric values using struct.unpack().
Alternatively, if you want the samples as an array of floating-point numbers, you can use SciPy's io.wavfile module.
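For example, a minimal sketch of the SciPy route (wavfile.read returns the sample rate and a NumPy array whose dtype depends on the file, int16 for 16-bit PCM), with the normalization done by hand:
import numpy as np
from scipy.io import wavfile

rate, data = wavfile.read("input.wav")        # data has shape (nframes,) or (nframes, nchannels)
samples = data.astype(np.float32) / 32768.0   # scale 16-bit integers to roughly -1.0 .. 1.0
print(rate, samples.shape)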
Here's a function to read samples from a wave file (tested with mono & stereo):
def read_samples(wave_file, nb_frames):
    frame_data = wave_file.readframes(nb_frames)
    if frame_data:
        sample_width = wave_file.getsampwidth()
        nb_samples = len(frame_data) // sample_width
        format = {1:"%db", 2:"<%dh", 4:"<%dl"}[sample_width] % nb_samples
        return struct.unpack(format, frame_data)
    else:
        return ()
And here's the full script that does windowed mixing or concatenation of multiple .wav files. All input files need to have the same params (number of channels, sample width, and frame rate).
import argparse
import itertools
import struct
import sys
import wave
def _struct_format(sample_width, nb_samples):
    return {1:"%db", 2:"<%dh", 4:"<%dl"}[sample_width] % nb_samples

def _mix_samples(samples):
    return sum(samples)//len(samples)

def read_samples(wave_file, nb_frames):
    frame_data = wave_file.readframes(nb_frames)
    if frame_data:
        sample_width = wave_file.getsampwidth()
        nb_samples = len(frame_data) // sample_width
        format = _struct_format(sample_width, nb_samples)
        return struct.unpack(format, frame_data)
    else:
        return ()

def write_samples(wave_file, samples, sample_width):
    format = _struct_format(sample_width, len(samples))
    frame_data = struct.pack(format, *samples)
    wave_file.writeframes(frame_data)

def compatible_input_wave_files(input_wave_files):
    nchannels, sampwidth, framerate, nframes, comptype, compname = input_wave_files[0].getparams()
    for input_wave_file in input_wave_files[1:]:
        nc,sw,fr,nf,ct,cn = input_wave_file.getparams()
        if (nc,sw,fr,ct,cn) != (nchannels, sampwidth, framerate, comptype, compname):
            return False
    return True

def mix_wave_files(output_wave_file, input_wave_files, buffer_size):
    output_wave_file.setparams(input_wave_files[0].getparams())
    sampwidth = input_wave_files[0].getsampwidth()
    max_nb_frames = max([input_wave_file.getnframes() for input_wave_file in input_wave_files])
    for frame_window in xrange(max_nb_frames // buffer_size + 1):
        all_samples = [read_samples(wave_file, buffer_size) for wave_file in input_wave_files]
        mixed_samples = [_mix_samples(samples) for samples in itertools.izip_longest(*all_samples, fillvalue=0)]
        write_samples(output_wave_file, mixed_samples, sampwidth)

def concatenate_wave_files(output_wave_file, input_wave_files, buffer_size):
    output_wave_file.setparams(input_wave_files[0].getparams())
    sampwidth = input_wave_files[0].getsampwidth()
    for input_wave_file in input_wave_files:
        nb_frames = input_wave_file.getnframes()
        for frame_window in xrange(nb_frames // buffer_size + 1):
            samples = read_samples(input_wave_file, buffer_size)
            if samples:
                write_samples(output_wave_file, samples, sampwidth)

def argument_parser():
    parser = argparse.ArgumentParser(description='Mix or concatenate multiple .wav files')
    parser.add_argument('command', choices = ("mix", "concat"), help='command')
    parser.add_argument('output_file', help='output .wav file')
    parser.add_argument('input_files', metavar="input_file", help='input .wav files', nargs="+")
    parser.add_argument('--buffer_size', type=int, help='nb of frames to read per iteration', default=1000)
    return parser

if __name__ == '__main__':
    args = argument_parser().parse_args()

    input_wave_files = [wave.open(name,"rb") for name in args.input_files]
    if not compatible_input_wave_files(input_wave_files):
        print "ERROR: mixed wave files must have the same params."
        sys.exit(2)

    output_wave_file = wave.open(args.output_file, "wb")
    if args.command == "mix":
        mix_wave_files(output_wave_file, input_wave_files, args.buffer_size)
    elif args.command == "concat":
        concatenate_wave_files(output_wave_file, input_wave_files, args.buffer_size)

    output_wave_file.close()
    for input_wave_file in input_wave_files:
        input_wave_file.close()
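A hypothetical invocation, assuming the script is saved as wavemix.py and the inputs share the same params:
python wavemix.py mix mixed.wav drums.wav bass.wav
python wavemix.py concat joined.wav part1.wav part2.wav --buffer_size 2000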
After reading the samples (for example with the wave module, more details here) you may want to have the values scaled between -1 and 1 (this is the convention for audio signals).
In this case, you can add:
# scale to -1.0 -- 1.0
max_nb_bit = float(2**(nb_bits-1))
samples = signal_int / (max_nb_bit + 1.0)
where nb_bits is the bit depth and signal_int holds the integer sample values.
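Putting it together, a small sketch assuming a 16-bit mono file read with the wave module (so nb_bits = 16):
import wave
import struct

w = wave.open("the.wav", "rb")
nb_bits = w.getsampwidth() * 8                      # 16 for 16-bit PCM
raw = w.readframes(w.getnframes())
w.close()

signal_int = struct.unpack("<%dh" % (len(raw) // 2), raw)

max_nb_bit = float(2 ** (nb_bits - 1))              # 32768.0
samples = [s / (max_nb_bit + 1.0) for s in signal_int]  # each value now in -1.0 .. 1.0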