How to get accurate timing using microphone in python - python

I'm trying to make beat detection using PC microphone and then with timestamp of beat calculate distance between multiple successive beats. I have chosen python because there is plenty of material available and it's quick to develop. By searching the internet I have come up with this simple code (no advanced peak detection or anything yet, this comes later if need be):
import pyaudio
import struct
import math
import time
SHORT_NORMALIZE = (1.0/32768.0)
def get_rms(block):
# RMS amplitude is defined as the square root of the
# mean over time of the square of the amplitude.
# so we need to convert this string of bytes into
# a string of 16-bit samples...
# we will get one short out for each
# two chars in the string.
count = len(block)/2
format = "%dh" % (count)
shorts = struct.unpack(format, block)
# iterate over the block.
sum_squares = 0.0
for sample in shorts:
# sample is a signed short in +/- 32768.
# normalize it to 1.0
n = sample * SHORT_NORMALIZE
sum_squares += n*n
return math.sqrt(sum_squares / count)
CHUNK = 32
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK)
elapsed_time = 0
prev_detect_time = 0
while True:
data = stream.read(CHUNK)
amplitude = get_rms(data)
if amplitude > 0.05: # value set by observing graphed data captured from mic
elapsed_time = time.perf_counter() - prev_detect_time
if elapsed_time > 0.1: # guard against multiple spikes at beat point
print(elapsed_time)
prev_detect_time = time.perf_counter()
def close_stream():
stream.stop_stream()
stream.close()
p.terminate()
The code works pretty good in silence, and I have been pretty satisfied the first two moments I ran it, but then I tried how accurate it was and I was a little bit less satisfied. To test this I used two methods: phone with metronome set to 60bpm (emits tic toc sounds into microphone) and an Arduino hooked to a beeper, which is triggered at 1Hz rate by accurate Chronodot RTC. The beeper beeps into microphone, triggering a detection. With both methods results look similar (numbers represent distance between two beat detections in seconds):
0.9956681643835616
1.0056331689497717
0.9956100091324198
1.0058207853881278
0.9953449497716891
1.0052103013698623
1.0049350136986295
0.9859074337899543
1.004996383561644
0.9954095342465745
1.0061518904109583
0.9953025753424658
1.0051235068493156
1.0057199634703196
0.984839305936072
1.00610396347032
0.9951862648401821
1.0053146301369864
0.9960100821917806
1.0053391780821919
0.9947373881278523
1.0058608219178105
1.0056580091324214
0.9852110319634697
1.0054473059360731
0.9950465753424638
1.0058237077625556
0.995704694063928
1.0054566575342463
0.9851026118721435
1.0059882374429243
1.0052523835616398
0.9956161461187207
1.0050863926940607
0.9955758173515932
1.0058052968036577
0.9953960913242028
1.0048014611872205
1.006336876712325
0.9847434520547935
1.0059712876712297
Now I'm pretty confident that at least Arduino is accurate to 1 msec (which is targeted accuracy). The results tend to be off by +- 5msec, but now and then even 15ms, which is unacceptable. Is there a way to achieve greater accuracy or is this limitation of python / soundcard / something else? Thank you!
EDIT:
After incorporating tom10 and barny's suggestions into the code, the code looks like this:
import pyaudio
import struct
import math
import psutil
import os
def set_high_priority():
p = psutil.Process(os.getpid())
p.nice(psutil.HIGH_PRIORITY_CLASS)
SHORT_NORMALIZE = (1.0/32768.0)
def get_rms(block):
# RMS amplitude is defined as the square root of the
# mean over time of the square of the amplitude.
# so we need to convert this string of bytes into
# a string of 16-bit samples...
# we will get one short out for each
# two chars in the string.
count = len(block)/2
format = "%dh" % (count)
shorts = struct.unpack(format, block)
# iterate over the block.
sum_squares = 0.0
for sample in shorts:
# sample is a signed short in +/- 32768.
# normalize it to 1.0
n = sample * SHORT_NORMALIZE
sum_squares += n*n
return math.sqrt(sum_squares / count)
CHUNK = 4096
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
RUNTIME_SECONDS = 10
set_high_priority()
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK)
elapsed_time = 0
prev_detect_time = 0
TIME_PER_CHUNK = 1000 / RATE * CHUNK
SAMPLE_GROUP_SIZE = 32 # 1 sample = 2 bytes, group is closest to 1 msec elapsing
TIME_PER_GROUP = 1000 / RATE * SAMPLE_GROUP_SIZE
for i in range(0, int(RATE / CHUNK * RUNTIME_SECONDS)):
data = stream.read(CHUNK)
time_in_chunk = 0
group_index = 0
for j in range(0, len(data), (SAMPLE_GROUP_SIZE * 2)):
group = data[j:(j + (SAMPLE_GROUP_SIZE * 2))]
amplitude = get_rms(group)
amplitudes.append(amplitude)
if amplitude > 0.02:
current_time = (elapsed_time + time_in_chunk)
time_since_last_beat = current_time - prev_detect_time
if time_since_last_beat > 500:
print(time_since_last_beat)
prev_detect_time = current_time
time_in_chunk = (group_index+1) * TIME_PER_GROUP
group_index += 1
elapsed_time = (i+1) * TIME_PER_CHUNK
stream.stop_stream()
stream.close()
p.terminate()
With this code I achieved the following results (units are this time milliseconds instead of seconds):
999.909297052154
999.9092970521542
999.9092970521542
999.9092970521542
999.9092970521542
1000.6349206349205
999.9092970521551
999.9092970521524
999.9092970521542
999.909297052156
999.9092970521542
999.9092970521542
999.9092970521524
999.9092970521542
Which, if I didn't make any mistake, looks a lot better than before and has achieved sub-millisecond accuracy. I thank tom10 and barny for their help.

The reason you're not getting the right timing for the beats is that you're missing chunks of the audio data. That is, the chunks are being read by the soundcard, but you're not collecting the data before it's overwritten with the next chunk.
First, though, for this problem you need to distinguish between the ideas of timing accuracy and real-time response.
The timing accuracy of a sound card should be very good, much better than a ms, and you should be able to capture all of this accuracy in the data you read from the soundcard. The real-time responsiveness of your computer's OS should be very bad, much worse than a ms. That is, you should easily be able to identify audio events (such as beats) to within a ms, but not identify them at the time they happen (instead, 30-200ms later depending on your system). This arrangement usually works for computers because general human perception of the timing of events is much greater than a ms (except for rare specialized percepetual systems, like comparing auditory events between the two ears, etc).
The specific problem with your code is that CHUNKS is much too small for the OS to query the sound card at each sample. You have it at 32, so at 44100Hz, the OS needs to get to the sound card every 0.7ms, which is too short of a time for a computer that's tasked with doing many other things. If you OS doesn't get the chunk before the next one comes in, the original chunk is overwritten and lost.
To get this working so it's consistent with the constraints above, make CHUNKS much larger than 32, and more like 1024 (as in the PyAudio examples). Depending on your computer and what it's doing, even that my not be long enough.
If this type of approach won't work for you, you will probably need a dedicated real-time system like an Arduino. (Generally, though, this isn't necessary, so think twice before you decide that you need to use the Arduino. Usually, when I've seen people need true real-time it's when trying to do something very quantitave interactive with the human, like flash a light, have the person tap a button, flash another light, have the person tap another button, etc, to measure response times.)

Related

Numba Python - how to exploit parallelism effectively?

I have been trying to exploit Numba to speed up large array calculations. I have been measuring the calculation speed in GFLOPS, and it consistently falls far short of my expectations for my CPU.
My processor is i9-9900k, which according to float32 benchmarks should be capable of over 200 GFLOPS. In my tests I have never exceeded about 50 GFLOPS. This is running on all 8 cores.
On a single core I achieve about 17 GFLOPS, which (I believe) is 50% of the theoretical performance. I'm not sure if this is improvable, but the fact that it doesn't extend well to multi-core is a problem.
I am trying to learn this because I am planning to write some image processing code that desperately needs every speed boost possible. I also feel I should understand this first, before I dip my toes into GPU computing.
Here is some example code with a few of my attempts at writing fast functions. The operation I am testing, is multiplying an array by a float32 then summing the whole array, i.e. a MAC operation.
How can I get better results?
import os
# os.environ["NUMBA_ENABLE_AVX"] = "1"
import numpy as np
import timeit
from timeit import default_timer as timer
import numba
# numba.config.NUMBA_ENABLE_AVX = 1
# numba.config.LOOP_VECTORIZE = 1
# numba.config.DUMP_ASSEMBLY = 1
from numba import float32, float64
from numba import jit, njit, prange
from numba import vectorize
from numba import cuda
lengthY = 16 # 2D array Y axis
lengthX = 2**16 # X axis
totalops = lengthY * lengthX * 2 # MAC operation has 2 operations
iters = 100
doParallel = True
#njit(fastmath=True, parallel=doParallel)
def MAC_numpy(testarray):
output = (float)(0.0)
multconst = (float)(.99)
output = np.sum(np.multiply(testarray, multconst))
return output
#njit(fastmath=True, parallel=doParallel)
def MAC_01(testarray):
lengthX = testarray.shape[1]
lengthY = testarray.shape[0]
output = (float)(0.0)
multconst = (float)(.99)
for y in prange(lengthY):
for x in prange(lengthX):
output += multconst*testarray[y,x]
return output
#njit(fastmath=True, parallel=doParallel)
def MAC_04(testarray):
lengthX = testarray.shape[1]
lengthY = testarray.shape[0]
output = (float)(0.0)
multconst = (float)(.99)
for y in prange(lengthY):
for x in prange(int(lengthX/4)):
xn = x*4
output += multconst*testarray[y,xn] + multconst*testarray[y,xn+1] + multconst*testarray[y,xn+2] + multconst*testarray[y,xn+3]
return output
# ======================================= TESTS =======================================
testarray = np.random.rand(lengthY, lengthX)
# ==== MAC_numpy ====
time = 1000
for n in range(iters):
start = timer()
output = MAC_numpy(testarray)
end = timer()
if((end-start) < time): #get shortest time
time = end-start
print("\nMAC_numpy")
print("output = %f" % (output))
print(type(output))
print("fastest time = %16.10f us" % (time*10**6))
print("Compute Rate = %f GFLOPS" % ((totalops/time)/10**9))
# ==== MAC_01 ====
time = 1000
lengthX = testarray.shape[1]
lengthY = testarray.shape[0]
for n in range(iters):
start = timer()
output = MAC_01(testarray)
end = timer()
if((end-start) < time): #get shortest time
time = end-start
print("\nMAC_01")
print("output = %f" % (output))
print(type(output))
print("fastest time = %16.10f us" % (time*10**6))
print("Compute Rate = %f GFLOPS" % ((totalops/time)/10**9))
# ==== MAC_04 ====
time = 1000
for n in range(iters):
start = timer()
output = MAC_04(testarray)
end = timer()
if((end-start) < time): #get shortest time
time = end-start
print("\nMAC_04")
print("output = %f" % (output))
print(type(output))
print("fastest time = %16.10f us" % (time*10**6))
print("Compute Rate = %f GFLOPS" % ((totalops/time)/10**9))
Q : How can I get better results?
1st : Learn how to avoid doing useless work - you can straight eliminate HALF of the FLOP-s not speaking about also the half of all the RAM-I/O-s avoided, each one being at a cost of +100~350 [ns] per writeback
Due to the distributive nature of MUL and ADD ( a.C + b.C ) == ( a + b ).C, better first np.sum( A ) and only after that then MUL the sum by the (float) constant.
#utput = np.sum(np.multiply(testarray, multconst)) # AWFULLY INEFFICIENT
output = np.sum( testarray)*multconst #######################
2nd : Learn how to best align data along the order of processing ( cache-line reuses get you ~100x faster re-use of pre-fetched data. Not aligning vectorised-code along these already pre-fetched data side-effects just let your code pay many times the RAM-access latencies, instead of smart re-using the already paid for data-blocks. Designing work-units aligned according to this principle means a few SLOCs more, but the rewards are worth that - who gets ~100x faster CPUs+RAMs for free and right now or about a ~100x speedup for free, just from not writing a badly or naively designed looping iterators?
3rd : Learn how to efficiently harness vectorised (block-directed) operations inside numpy or numba code-blocks and avoid pressing numba to spend time on auto-analysing the call-signatures ( you pay an extra time for this auto-analyses per call, while you have designed the code and knew exactly what data-types are going to go there, so why to pay an extra time for auto-analysis each time a numba-block gets called???)
4th : Learn where the extended Amdahl's Law, having all the relevant add-on costs and processing atomicity put into the game, supports your wish to get speedups, not to ever pay way more than you will get back (to at least justify the add-on costs... ) - paying extra costs for not getting any reward is possible, yet has no beneficial impact on your code's performance ( rather the opposite )
5th : Learn when and how the manually created inline(s) may save your code, once the steps 1-4 are well learnt and routinely excersised with proper craftmanship ( Using popular COTS frameworks is fine, yet these may deliver results after a few days of work, while a hand-crafted single purpose smart designed assembly code was able to get the same results in about 12 minutes(!), not several days without any GPU/CPU tricks etc - yes, that faster - just by not doing a single step more than what was needed for the numerical processing of the large matrix data )
Did I mention float32 may surprise at being processed slower on small scales than float64, while on larger data-scales ~ n [GB] the RAM I/O-times grow slower for more efficient float32 pre-fetches? This never happens here, as float64 array gets processed here. Sure, unless one explicitly instructs the constructor(s) to downconvert the default data type, like this: np.random.rand( lengthY, lengthX ).astype( dtype = np.float32 )>>> np.random.rand( 10, 2 ).dtypedtype('float64')Avoiding extensive memory allocations is another performance trick, supported in numpy call-signatures. Using this option for large arrays will save you a lot of extra time wasted on mem-allocs for large interim arrays. Reusing already pre-allocated memory-zones and wisely controlled gc-policing are another signs of a professional, focused on low-latency & design-for-performance

Sampling in RasPi3 and ADS1115 using Sample count or time.time()

I'm using Raspberry Pi 3 and ADS1115, my project requires me to get evenly spaced samples so as to plot and analyse. The other posts were about achieving 10k and 50k sps but I only require 500SPS and that isn't working either. Is there a way to run my code for 120 seconds with 500 sps and get 60,000 samples in the end from both A0 and A1 channel at the same time? I have attached the code for reference. Thanks in advance
from Adafruit_ADS1x15 import ADS1x15
import time
import numpy as np
pga = 2/3 # Set full-scale range of programmable gain
# amplifier (page 13 of data sheet), change
#depending on the input voltage range
ADS1115 = 0x01 # Specify that the device being used is the
# ADS1115, for the ADS1015 used 0x00
adc = Adafruit_ADS1x15.ADS1015() # Create instance of the class ADS1x15
# Function to print sampled values to the terminal
def logdata():
print "sps value should be one of: 8, 16, 32, 64, 128, 250, 475, 860,
otherwise the value will default to 250"
frequency = input("Input sampling frequency (Hz): ") # Get
#sampling frequency from user
sps = input("Input sps (Hz) : ") # Get
# ads1115 sps value from the user
time1 = input("Input sample time (seconds): ") # Get how
#long to sample for from the user
period = 1.0 / frequency # Calculate sampling period
datapoints = int(time1*frequency) # Datapoints is the total number
#of samples to take, which must be an integer
startTime=time.time() # Time of first sample
t1=startTime # T1 is last sample time
t2=t1 # T2 is current time
for x in range (0,datapoints) : # Loop in which data is sampled
while (t2-t1 < period) : # Check if t2-t1 is less then
#sample period, if it is then update t2
t2=time.time() # and check again
t1+=period # Update last sample time by the
# sampling period
print adc.read_adc(0, pga, data_rate=sps), "mV ", ("%.2f" %
(t2-startTime)) , "s" # Print sampled value and time to the terminal
# Call to logdata function
logdata()
1) Are you using the ADS1115 ???
adc = Adafruit_ADS1x15.ADS1015() # should be adc = Adafruit_ADS1x15.ADS1115()
2) you can't read two or more single-ended channels at the same time. In differential mode one can compare two channels to yield one value.
To read a value from channel 1, besides channel 0 you would have to add another call in your loop:
print adc.read_adc(0, pga, data_rate=sps) ..... # original call for channel 0
print adc.read_adc(1, pga, data_rate=sps) ..... # new call for channel 1
3) before a value can be read, the ADS must be configured with several parameters, like channel, rate, gain etc. After some time, which is needed for doing an analog/digital conversion, values can be read, once, or over and over again, when in continuous mode.
In the original ADAFruit library this time is calculated from the data rate (insecure) and in the recent port to CircuitPython this time will most often be around 0.01 s, because the conversion will most probably not have finished directly after configuration (check the _read method).
4) reading at 500SPS is about the fastest the ADS1115 can read. The reference states that conversion at 860SPS takes 1.2 ms. Adding time for configuration and reading you will not be able to read two or more values continuosly every 0.002s, even if you were receiving notifications for conversion, like I described on my homepage, instead of waiting for a fixed period of time.
5) I think the closest you can get with Python is to run 2 daisy-chained ADS1115 in continuous mode with GPIO notifications, but I have no experience with that.

Implement realtime signal processing in Python - how to capture audio continuously?

I'm planning to implement a "DSP-like" signal processor in Python. It should capture small fragments of audio via ALSA, process them, then play them back via ALSA.
To get things started, I wrote the following (very simple) code.
import alsaaudio
inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL)
inp.setchannels(1)
inp.setrate(96000)
inp.setformat(alsaaudio.PCM_FORMAT_U32_LE)
inp.setperiodsize(1920)
outp = alsaaudio.PCM(alsaaudio.PCM_PLAYBACK, alsaaudio.PCM_NORMAL)
outp.setchannels(1)
outp.setrate(96000)
outp.setformat(alsaaudio.PCM_FORMAT_U32_LE)
outp.setperiodsize(1920)
while True:
l, data = inp.read()
# TODO: Perform some processing.
outp.write(data)
The problem is, that the audio "stutters" and is not gapless. I tried experimenting with the PCM mode, setting it to either PCM_ASYNC or PCM_NONBLOCK, but the problem remains. I think the problem is that samples "between" two subsequent calls to "inp.read()" are lost.
Is there a way to capture audio "continuously" in Python (preferably without the need for too "specific"/"non-standard" libraries)? I'd like the signal to always get captured "in the background" into some buffer, from which I can read some "momentary state", while audio is further being captured into the buffer even during the time, when I perform my read operations. How can I achieve this?
Even if I use a dedicated process/thread to capture the audio, this process/thread will always at least have to (1) read audio from the source, (2) then put it into some buffer (from which the "signal processing" process/thread then reads). These two operations will therefore still be sequential in time and thus samples will get lost. How do I avoid this?
Thanks a lot for your advice!
EDIT 2: Now I have it running.
import alsaaudio
from multiprocessing import Process, Queue
import numpy as np
import struct
"""
A class implementing buffered audio I/O.
"""
class Audio:
"""
Initialize the audio buffer.
"""
def __init__(self):
#self.__rate = 96000
self.__rate = 8000
self.__stride = 4
self.__pre_post = 4
self.__read_queue = Queue()
self.__write_queue = Queue()
"""
Reads audio from an ALSA audio device into the read queue.
Supposed to run in its own process.
"""
def __read(self):
inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL)
inp.setchannels(1)
inp.setrate(self.__rate)
inp.setformat(alsaaudio.PCM_FORMAT_U32_BE)
inp.setperiodsize(self.__rate / 50)
while True:
_, data = inp.read()
self.__read_queue.put(data)
"""
Writes audio to an ALSA audio device from the write queue.
Supposed to run in its own process.
"""
def __write(self):
outp = alsaaudio.PCM(alsaaudio.PCM_PLAYBACK, alsaaudio.PCM_NORMAL)
outp.setchannels(1)
outp.setrate(self.__rate)
outp.setformat(alsaaudio.PCM_FORMAT_U32_BE)
outp.setperiodsize(self.__rate / 50)
while True:
data = self.__write_queue.get()
outp.write(data)
"""
Pre-post data into the output buffer to avoid buffer underrun.
"""
def __pre_post_data(self):
zeros = np.zeros(self.__rate / 50, dtype = np.uint32)
for i in range(0, self.__pre_post):
self.__write_queue.put(zeros)
"""
Runs the read and write processes.
"""
def run(self):
self.__pre_post_data()
read_process = Process(target = self.__read)
write_process = Process(target = self.__write)
read_process.start()
write_process.start()
"""
Reads audio samples from the queue captured from the reading thread.
"""
def read(self):
return self.__read_queue.get()
"""
Writes audio samples to the queue to be played by the writing thread.
"""
def write(self, data):
self.__write_queue.put(data)
"""
Pseudonymize the audio samples from a binary string into an array of integers.
"""
def pseudonymize(self, s):
return struct.unpack(">" + ("I" * (len(s) / self.__stride)), s)
"""
Depseudonymize the audio samples from an array of integers into a binary string.
"""
def depseudonymize(self, a):
s = ""
for elem in a:
s += struct.pack(">I", elem)
return s
"""
Normalize the audio samples from an array of integers into an array of floats with unity level.
"""
def normalize(self, data, max_val):
data = np.array(data)
bias = int(0.5 * max_val)
fac = 1.0 / (0.5 * max_val)
data = fac * (data - bias)
return data
"""
Denormalize the data from an array of floats with unity level into an array of integers.
"""
def denormalize(self, data, max_val):
bias = int(0.5 * max_val)
fac = 0.5 * max_val
data = np.array(data)
data = (fac * data).astype(np.int64) + bias
return data
debug = True
audio = Audio()
audio.run()
while True:
data = audio.read()
pdata = audio.pseudonymize(data)
if debug:
print "[PRE-PSEUDONYMIZED] Min: " + str(np.min(pdata)) + ", Max: " + str(np.max(pdata))
ndata = audio.normalize(pdata, 0xffffffff)
if debug:
print "[PRE-NORMALIZED] Min: " + str(np.min(ndata)) + ", Max: " + str(np.max(ndata))
print "[PRE-NORMALIZED] Level: " + str(int(10.0 * np.log10(np.max(np.absolute(ndata)))))
#ndata += 0.01 # When I comment in this line, it wreaks complete havoc!
if debug:
print "[POST-NORMALIZED] Level: " + str(int(10.0 * np.log10(np.max(np.absolute(ndata)))))
print "[POST-NORMALIZED] Min: " + str(np.min(ndata)) + ", Max: " + str(np.max(ndata))
pdata = audio.denormalize(ndata, 0xffffffff)
if debug:
print "[POST-PSEUDONYMIZED] Min: " + str(np.min(pdata)) + ", Max: " + str(np.max(pdata))
print ""
data = audio.depseudonymize(pdata)
audio.write(data)
However, when I even perform the slightest modification to the audio data (e. g. comment that line in), I get a lot of noise and extreme distortion at the output. It seems like I don't handle the PCM data correctly. The strange thing is that the output of the "level meter", etc. all appears to make sense. However, the output is completely distorted (but continuous) when I offset it just slightly.
EDIT 3: I just found out that my algorithms (not included here) work when I apply them to wave files. So the problem really appears to actually boil down to the ALSA API.
EDIT 4: I finally found the problems. They were the following.
1st - ALSA quietly "fell back" to PCM_FORMAT_U8_LE upon requesting PCM_FORMAT_U32_LE, thus I interpreted the data incorrectly by assuming that each sample was 4 bytes wide. It works when I request PCM_FORMAT_S32_LE.
2nd - The ALSA output seems to expect period size in bytes, even though they explicitely state that it is expected in frames in the specification. So you have to set the period size four times as high for output if you use 32 bit sample depth.
3rd - Even in Python (where there is a "global interpreter lock"), processes are slow compared to Threads. You can get latency down a lot by changing to threads, since the I/O threads basically don't do anything that's computationally intensive.
When you
read one chunk of data,
write one chunk of data,
then wait for the second chunk of data to be read,
then the buffer of the output device will become empty if the second chunk is not shorter than the first chunk.
You should fill up the output device's buffer with silence before starting the actual processing. Then small delays in either the input or output processing will not matter.
You can do that all manually, as #CL recommend in his/her answer, but I'd recommend just using
GNU Radio instead:
It's a framework that takes care of doing all the "getting small chunks of samples in and out your algorithm"; it scales very well, and you can write your signal processing either in Python or C++.
In fact, it comes with an Audio Source and an Audio Sink that directly talk to ALSA and just give/take continuous samples. I'd recommend reading through GNU Radio's Guided Tutorials; they explain exactly what is necessary to do your signal processing for an audio application.
A really minimal flow graph would look like:
You can substitute the high pass filter for your own signal processing block, or use any combination of the existing blocks.
There's helpful things like file and wav file sinks and sources, filters, resamplers, amplifiers (ok, multipliers), …
I finally found the problems. They were the following.
1st - ALSA quietly "fell back" to PCM_FORMAT_U8_LE upon requesting PCM_FORMAT_U32_LE, thus I interpreted the data incorrectly by assuming that each sample was 4 bytes wide. It works when I request PCM_FORMAT_S32_LE.
2nd - The ALSA output seems to expect period size in bytes, even though they explicitely state that it is expected in frames in the specification. So you have to set the period size four times as high for output if you use 32 bit sample depth.
3rd - Even in Python (where there is a "global interpreter lock"), processes are slow compared to Threads. You can get latency down a lot by changing to threads, since the I/O threads basically don't do anything that's computationally intensive.
Audio is gapless and undistorted now, but latency is far too high.

High-speed alternatives to replace byte array processing bottlenecks

>> See EDIT below <<
I am working on processing data from a special pixelated CCD camera over serial, using FTDI D2xx drivers via pyUSB.
The camera can operate at high bandwidth to the PC, up to 80 frames/sec. I would love that speed, but know that it isn't feasible with Python, due to it being a scripted language, but would like to know how close I can get - whether it be some optimizations that I missed in my code, threading, or using some other approach. I immediately think that breaking-out the most time consuming loops and putting them in C code, but I don't have much experience with C code and not sure the best way to get Python to interact inline with it, if that's possible. I have complex algorithms heavily developed in Python with SciPy/Numpy, which are already optimized and have acceptable performance, so I would need a way to just speed-up the acquisition of the data to feed-back to Python, if that's the best approach.
The difficulty, and the reason I used Python, and not some other language, is due to the need to be able to easily run it cross-platform (I develop in Windows, but am putting the code on an embedded Linux board, making a stand-alone system). If you suggest that I use another code, like C, how would I be able to work cross-platform? I have never worked with compiling a lower-level language like C between Windows and Linux, so I would want to be sure of that process - I would have to compile it for each system, right? What do you suggest?
Here are my functions, with current execution times:
ReadStream: 'RXcount' is 114733 for a device read, formatting from string to byte equivalent
Returns a list of bytes (0-255), representing binary values
Current execution time: 0.037 sec
def ReadStream(RXcount):
global ftdi
RXdata = ftdi.read(RXcount)
RXdata = list(struct.unpack(str(len(RXdata)) + 'B', RXdata))
return RXdata
ProcessRawData: To reshape the byte list into an array that matches the pixel orientations
Results in a 3584x32 array, after trimming off some un-needed bytes.
Data is unique in that every block of 14 rows represents 14-bits of one row of pixels on the device (with 32 bytes across # 8 bits/byte = 256 bits across), which is 256x256 pixels. The processed array has 32 columns of bytes because each byte, in binary, represents 8 pixels (32 bytes * 8 bits = 256 pixels). Still working on how to do that one... I have already posted a question for that previously
Current execution time: 0.01 sec ... not bad, it's just Numpy
def ProcessRawData(RawData):
if len(RawData) == 114733:
ProcessedMatrix = np.ndarray((1, 114733), dtype=int)
np.copyto(ProcessedMatrix, RawData)
ProcessedMatrix = ProcessedMatrix[:, 1:-44]
ProcessedMatrix = np.reshape(ProcessedMatrix, (-1, 32))
return ProcessedMatrix
else:
return None
Finally,
GetFrame: The device has a mode where it just outputs whether a pixel detected anything or not, using the lowest bit of the array (every 14th row) - Get that data and convert to int for each pixel
Results in 256x256 array, after processing every 14th row, which are bytes to be read as binary (32 bytes across ... 32 bytes * 8 bits = 256 pixels across)
Current execution time: 0.04 sec
def GetFrame(ProcessedMatrix):
if np.shape(ProcessedMatrix) == (3584, 32):
FrameArray = np.zeros((256, 256), dtype='B')
DataRows = ProcessedMatrix[13::14]
for i in range(256):
RowData = ""
for j in range(32):
RowData = RowData + "{:08b}".format(DataRows[i, j])
FrameArray[i] = [int(RowData[b:b+1], 2) for b in range(256)]
return FrameArray
else:
return False
Goal:
I would like to target a total execution time of ~0.02 secs/frame by whatever suggestions you make (currently it's 0.25 secs/frame with the GetFrame function being the weakest). The device I/O is not the limiting factor, as that outputs a data packet every 0.0125 secs. If I get the execution time down, then can I just run the acquisition and processing in parallel with some threading?
Let me know what you suggest as the best path forward - Thank you for the help!
EDIT, thanks to #Jaime:
Functions are now:
def ReadStream(RXcount):
global ftdi
return np.frombuffer(ftdi.read(RXcount), dtype=np.uint8)
... time 0.013 sec
def ProcessRawData(RawData):
if len(RawData) == 114733:
return RawData[1:-44].reshape(-1, 32)
return None
... time 0.000007 sec!
def GetFrame(ProcessedMatrix):
if ProcessedMatrix.shape == (3584, 32):
return np.unpackbits(ProcessedMatrix[13::14]).reshape(256, 256)
return False
... time 0.00006 sec!
So, with pure Python, I am now able to acquire the data at the desired frame rate! After a few tweaks to the D2xx USB buffers and latency timing, I just clocked it at 47.6 FPS!
Last step is if there is any way to make this run in parallel with my processing algorithms? Need some way to pass the result of GetFrame to another loop running in parallel.
There are several places where you can speed things up significantly. Perhaps the most obvious is rewriting GetFrame:
def GetFrame(ProcessedMatrix):
if ProcessedMatrix.shape == (3584, 32):
return np.unpackbits(ProcessedMatrix[13::14]).reshape(256, 256)
return False
This requires that ProcessedMatrix be an ndarray of type np.uint8, but other than that, on my systems it runs 1000x faster.
With your other two functions, I think that in ReadStream you should do something like:
def ReadStream(RXcount):
global ftdi
return np.frombuffer(ftdi.read(RXcount), dtype=np.uint8)
Even if it doesn't speed up that function much, because it is the reading taking up most of the time, it will already give you a numpy array of bytes to work on. With that, you can then go on to ProcessRawData and try:
def ProcessRawData(RawData):
if len(RawData) == 114733:
return RawData[1:-44].reshape(-1, 32)
return None
Which is 10x faster than your version.

Output speex encode in a file using python

I'm using the script found on this blog Google speech recognition with python (I give any credit to author).
import sys
import pyaudio, speex
import numpy as np # just for doing a standard deviation for audio level checks
import urllib2
import wave
e = speex.Encoder()
e.initialize(speex.SPEEX_MODEID_WB)
d = speex.Decoder()
d.initialize(speex.SPEEX_MODEID_WB)
chunk = 320 # tried other numbers... some don't work
FORMAT = pyaudio.paInt16
bytespersample=2
CHANNELS = 1
RATE = 16000 # "wideband" mode for speex. May work with 8000. Haven't tried it.
p = pyaudio.PyAudio()
# Start the stream to record the audio
stream = p.open(format = FORMAT,
channels = CHANNELS,
rate = RATE,
input = True,
output = True,
frames_per_buffer = chunk)
print "Listening. Recording will start when some sound is heard."
threshold = 200 # Adjust this to be slightly above the noise level of your recordings.
nquit = 40 # number of silent frames before terminating the program
nover = 0
keepgoing = True
spxlist=[] # list of the encoded speex packets/frames
while keepgoing:
data = stream.read(chunk) # grab 320 samples from the microphone
spxdata = e.encode(data) # encode using the speex dll
print "Length encoded: %d"%len(spxdata) # print the length, after encoding. Can't exceed 255!
spxlist.append(spxdata)
a=np.frombuffer(data,np.int16) # convert to numpy array to check for silence or audio
audiolevel=np.std(a)
if audiolevel < threshold: # too quiet
nover+=1
else:
nover=0
if nover >= nquit:
keepgoing=False
print '%2.1f (%d%% quiet)'%(audiolevel, nover*100/nquit)
print "Too quiet. I'm stopping now."
stream.stop_stream()
stream.close()
p.terminate()
fullspx=''.join(spxlist) # make a string of all the header-ized speex packets
out_file = open("test.spx","wb")
out_file.write(fullspx)
out_file.close()
As you can see I slightly modify the script to make it write and output file in .spx, but it dosen't work.
Any advice?
Thanks for your help.
Edit:
I'm running this script under an Ubuntu-linux machine.

Categories