Why am I bleeding memory? - python

heres a script to detect and record sound that ive been hacking on for a while. it works great, except that its memory usage grows after every successful detection. i fixed a similar issue that occurred during prolonged periods of silence (if num_listening > 4096:...), but this one has me stumped.
from sys import byteorder
from array import array
from struct import pack
from datetime import datetime
import pyaudio
import wave
import os
import time
THRESHOLD = 6348
MAX_SILENCE = 500
CHUNK_SIZE = 1024
FORMAT = pyaudio.paInt16
RATE = 44100
MAX_LENGTH = 1024
def is_silent(snd_data):
"Returns 'True' if below the 'silent' threshold"
return max(snd_data) < THRESHOLD
def normalize(snd_data):
"Average the volume out"
MAXIMUM = 16384
times = float(MAXIMUM)/max(abs(i) for i in snd_data)
r = array('h')
for i in snd_data:
r.append(int(i*times))
return r
def trim(snd_data):
"Trim the blank spots at the start and end"
def _trim(snd_data):
snd_started = False
r = array('h')
for i in snd_data:
if not snd_started and abs(i) > THRESHOLD:
snd_started = True
r.append(i)
elif snd_started:
r.append(i)
return r
# Trim to the left
snd_data = _trim(snd_data)
# Trim to the right
snd_data.reverse()
snd_data = _trim(snd_data)
snd_data.reverse()
return snd_data
def add_silence(snd_data, seconds):
"Add silence to the start and end of 'snd_data' of length 'seconds' (float)"
silence = [0] * int(seconds * RATE)
r = array('h', silence)
r.extend(snd_data)
r.extend(silence)
return r
def record():
"""
Record a word or words from the microphone and
return the data as an array of signed shorts.
Normalizes the audio, trims silence from the
start and end, and pads with 0.5 seconds of
blank sound to make sure VLC et al can play
it without getting chopped off.
"""
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT, channels=1, rate=RATE,
input=True, output=True,
frames_per_buffer=CHUNK_SIZE)
num_silent = 0
num_snd = 0
num_listening = 0
snd_started = False
r = array('h')
while num_snd < MAX_LENGTH:
# little endian, signed short
snd_data = array('h', stream.read(CHUNK_SIZE, exception_on_overflow = False))
if byteorder == 'big':
snd_data.byteswap()
r.extend(snd_data)
silent = is_silent(snd_data)
if not silent and not snd_started:
snd_started = True
if snd_started:
num_snd += 1
if num_silent > MAX_SILENCE:
break
if silent:
if snd_started:
num_silent += 1
if not snd_started:
num_listening += 1
if num_listening > 4096:
del r[:]
num_listening = 0
sample_width = p.get_sample_size(FORMAT)
stream.stop_stream()
stream.close()
p.terminate()
del r[0:8000]
r = normalize(r)
r = trim(r)
r = add_silence(r, 0.5)
return sample_width, r
def record_to_file(path):
"Records from the microphone and outputs the resulting data to 'path'"
sample_width, data = record()
data = pack('<' + ('h'*len(data)), *data)
wf = wave.open(path, 'wb')
wf.setnchannels(1)
wf.setsampwidth(sample_width)
wf.setframerate(RATE)
wf.writeframes(data)
wf.close()
if __name__ == '__main__':
while True:
print("Ready!")
recorded = datetime.now()
recorded = "testpi1_" + recorded.strftime("%Y-%m-%d--%H-%M-%S") + ".wav"
record_to_file("/motion/" + recorded)
os.system("./convert-audio.py " + recorded)

forking the record_to_file function with multiprocessing.Process solved the issue.
add
import multiprocessing
adjust
if __name__ == '__main__':
while True:
print("Ready!")
recorded = datetime.now()
recorded = "testpi1_" + recorded.strftime("%Y-%m-%d--%H-%M-%S") + ".wav"
record_to_file("/motion/" + recorded)
os.system("./convert-audio.py " + recorded)
to
if __name__ == '__main__':
while True:
print("Ready!")
recorded = datetime.now()
recorded = "testpi1_" + recorded.strftime("%Y-%m-%d--%H-%M-%S") + ".wav"
p1 = multiprocessing.Process(target=record_to_file,args=("/motion/" + recorded,))
p1.start()
p1.join()
os.system("./convert-audio.py " + recorded)

Related

I cant use pyaudio (record) when i run ubuntu sudo root

I wrote a code to detect if the audio is above a certain threshold. I am doing this for a bot and I need to import some data from some files and use the keyboard.is pressed but it is a root library so I tried to run it on the root.
THIS IS THE MAIN SOURCE CODE NOT THE BOT
import pyaudio as pa
import math
import struct
from sqlalchemy import false
Threshold = 10000
CHUNK = 2048
FORMAT = pa.paInt16
CHANNELS = 1
RATE = 44100 # in Hz
swidth = 2
SHORT_NORMALIZE = (1.0/32768.0)
class Recorder:
#staticmethod
def rms(frame):
count = len(frame) / swidth
format = "%dh" % (count)
shorts = struct.unpack(format, frame)
sum_squares = 0.0
for sample in shorts:
n = sample * SHORT_NORMALIZE
sum_squares += n * n
rms = math.pow(sum_squares / count, 0.5)
return rms * 1000
def __init__(self):
self.p = pa.PyAudio()
self.stream =self.p.open(
format = FORMAT,
channels = CHANNELS,
rate = RATE,
input=True,
output=True,
frames_per_buffer=CHUNK
)
def listen(self):
print('Listening beginning')
while True:
print(self.stream)
input = self.stream.read(CHUNK,exception_on_overflow=False)
rms_val = self.rms(input)
print(rms_val)
if rms_val > Threshold:
break
a = Recorder()
a.listen()

Real time audio processing with python and AudioSegment

Well for some reason i want to split some selected mp3 files to chunk-time: ~28msec.
I have quality problem for slicing<1sec.
from av import AudioFrame
from pydub import AudioSegment
import av
#open an mp3 file
sound1 = AudioSegment.from_file(r"ΑΓΙΑ ΣΚΕΠΗ.mp3")
codec = av.CodecContext.create('pcm_s16le', 'r')
codec.sample_rate = 44100
codec.channels = 2
#split the file each part 10 second
#slices = sound1[::10000]
#split the file each part 2 second
#slices = sound1[::2000]
#split the file each part 1 second
#slices = sound1[::1000] #ok quality 1 tick every 1 second
#split the file each part 10 millisecond
slices = sound1[::10] #bad quality
pieces = AudioSegment.silent()
'''
for slice in slices:
pieces = pieces+slice
pieces.export("remaked.mp3",format="mp3")
#remaked works well
'''
for slice in slices:
#qualty loss (why?)
packet = av.Packet(slice.raw_data)
frame = codec.decode(packet)[0]
#remake AudioSegment from Av.AudioFrame
for p in frame.planes:
data = p.to_bytes()
data_segment = AudioSegment(data, sample_width=2, channels=2, frame_rate=44100)
pieces = pieces+data_segment
pieces.export("remaked.mp3",format="mp3")
How can i fix the quality problem?
Note that i use av.AudioFrame (frame = codec.decode(packet)[0]) because i want to send some real time audio data with aiortc
Edit:
from av import AudioFrame
from pydub import AudioSegment
import pyaudio
import av
import fractions
from aiortc.mediastreams import MediaStreamTrack
class RadioTelephoneTrack(MediaStreamTrack):
kind = "audio"
def __init__(self):
super().__init__() # don't forget this!
self.sample_rate = 8000
self.AUDIO_PTIME = 0.020 # 20ms audio packetization
self.samples = int(self.AUDIO_PTIME * self.sample_rate)
self.FORMAT = pyaudio.paInt16
self.CHANNELS = 2
self.RATE = self.sample_rate
#self.RATE = 44100
self.CHUNK = int(8000*0.020)
#self.CHUNK = 1024
self.p = pyaudio.PyAudio()
self.mic_stream = self.p.open(format=self.FORMAT, channels=1,rate=self.RATE, input=True,frames_per_buffer=self.CHUNK)
self.codec = av.CodecContext.create('pcm_s16le', 'r')
self.codec.sample_rate = self.RATE
#self.codec.sample_fmt = AV_SAMPLE_FMT_S16
self.codec.channels = 2
#self.codec.channel_layout = "mono";
self.sound1 = AudioSegment.from_file(r"ΑΓΙΑ ΣΚΕΠΗ.mp3").set_frame_rate(self.sample_rate)
print("Frame rate: "+str(self.sound1.frame_rate))
#self.sound1_channels = self.sound1.split_to_mono()
#self.sound1 = self.sound1_channels[0].overlay(self.sound1_channels[1])
self.audio_samples = 0
self.chunk_number = 0
#self.sound1 = self.sound1 - 30 # make sound1 quiter 30dB
async def recv(self):
mic_data = self.mic_stream.read(self.CHUNK)
mic_sound = AudioSegment(mic_data, sample_width=2, channels=1, frame_rate=self.RATE)
mic_sound = AudioSegment.from_mono_audiosegments(mic_sound, mic_sound)
mic_sound_duration = len(mic_sound)
#print("Mic sound duration: "+str(mic_sound_duration))
mp3_slice_duration = mic_sound_duration
if(len(self.sound1)>(self.chunk_number+1)*mp3_slice_duration):
sound1_part = self.sound1[self.chunk_number*mp3_slice_duration:(self.chunk_number+1)*mp3_slice_duration]
elif(len(self.sound1)>(self.chunk_number)*mp3_slice_duration):
sound1_part = self.sound1[self.chunk_number*mp3_slice_duration:]
else:
#replay
times_played_1 = int((self.chunk_number)*mp3_slice_duration/len(self.sound1))
times_played_2 = int((self.chunk_number+1)*mp3_slice_duration/len(self.sound1))
if(times_played_1==times_played_2):
time_start = ((self.chunk_number)*mp3_slice_duration)-(times_played_1*len(self.sound1))
time_end = ((self.chunk_number+1)*mp3_slice_duration)-(times_played_1*len(self.sound1))
sound1_part = self.sound1[time_start:time_end]
else:
time_start_1 = ((self.chunk_number)*mp3_slice_duration)-(times_played_1*len(self.sound1))
sound1_part1 = self.sound1[time_start_1:]
time_end_1 = ((self.chunk_number+1)*mp3_slice_duration)-(times_played_2*len(self.sound1))
sound1_part2 = self.sound1[0:time_end_1]
sound1_part = sound1_part1.append(sound1_part2, crossfade=5)
#sound1_part = AudioSegment.silent()
#self.mix_sound = sound1_part.overlay(mic_sound)
self.mix_sound = sound1_part
packet = av.Packet(self.mix_sound.raw_data)
frame = self.codec.decode(packet)[0]
frame.pts = self.audio_samples
self.audio_samples += frame.samples
self.chunk_number = self.chunk_number+1
return frame
The above code works (pretty better).
The main problems now are:
The sound sounds in depth.
There is click noise every time the sound re-starts (starts from the beginning).

creating an infinitely long sine tone in Python

I'm trying to create an infinitely long pure sine tone in Python (so I can later add realtime on/off events) and while I can get a tone playing, it is choppy and clipped. My assumption is that it's either from the next chunk not starting in the same place in the wave's cycle as the last chunk ended, or that there is a delay in calculating the next chunk, and I have no idea which it is.
Are either of those things occurring, or have I made some other error? Moreover, is there a better approach to take without sacrificing the ability to alter the incoming data in real time?
import time
import numpy
import pyaudio
import math
CHUNK = 4096
RATE = 44100
def sine(current_time, frequency=440):
length = CHUNK
factor = float(frequency) * (math.pi * 2) / RATE
this_chunk = numpy.arange(length) + current_time
return numpy.sin(this_chunk * factor)
def get_chunk():
data = sine(time.time())
return data * 0.1
def callback(in_data, frame_count, time_info, status):
chunk = get_chunk() * 0.25
data = chunk.astype(numpy.float32).tostring()
return (data, pyaudio.paContinue)
p = pyaudio.PyAudio()
stream = p.open(format = pyaudio.paFloat32,
channels = 2,
rate = RATE,
output = True,
stream_callback = callback)
stream.start_stream()
while stream.is_active():
time.sleep(0.1)
stream.stop_stream()
stream.close()
Looks like you need some optimization see,
https://wiki.python.org/moin/PythonSpeed/PerformanceTips
try:
import time
import numpy
import pyaudio
import math
CHUNK = 4096
RATE = 44100
MP = math.pi
NA = numpy.arange
NS = numpy.sin
TT = time.time
NF32 = numpy.float32
p = pyaudio.PyAudio()
PO = p.open
PC = pyaudio.paContinue
PF = pyaudio.paFloat32
TS = time.sleep
def sine(current_time, frequency=440):
length = CHUNK
factor = float(frequency) * (MP * 2) / RATE
this_chunk = NA(length) + current_time
return NS(this_chunk * factor)* 0.025
def callback(in_data, frame_count, time_info, status):
chunk = sine(TT())
data = chunk.astype(NF32).tostring()
return (data, PC)
stream = PO(format = PF,
channels = 2,
rate = RATE,
output = True,
stream_callback = callback)
stream.start_stream()
while stream.is_active():
TS(0.1)
stream.stop_stream()
stream.close()

Synchronizing audio and video with OpenCV and PyAudio

I have gotten both OpenCV and PyAudio working however I am not sure how I would sync them together. I am unable to get a framerate from OpenCV and measuring the call time for a frame changes from moment to moment. However with PyAudio it's basis is grabbing a certain sample rate. How would I sync them to be at the same rate. I assume there is some standard or some way codecs do it. (I've tried google all I got was information on lip syncing :/).
OpenCV Frame rate
from __future__ import division
import time
import math
import cv2, cv
vc = cv2.VideoCapture(0)
# get the frame
while True:
before_read = time.time()
rval, frame = vc.read()
after_read = time.time()
if frame is not None:
print len(frame)
print math.ceil((1.0 / (after_read - before_read)))
cv2.imshow("preview", frame)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
else:
print "None..."
cv2.waitKey(1)
# display the frame
while True:
cv2.imshow("preview", frame)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
Grabbing and saving audio
from sys import byteorder
from array import array
from struct import pack
import pyaudio
import wave
THRESHOLD = 500
CHUNK_SIZE = 1024
FORMAT = pyaudio.paInt16
RATE = 44100
def is_silent(snd_data):
"Returns 'True' if below the 'silent' threshold"
print "\n\n\n\n\n\n\n\n"
print max(snd_data)
print "\n\n\n\n\n\n\n\n"
return max(snd_data) < THRESHOLD
def normalize(snd_data):
"Average the volume out"
MAXIMUM = 16384
times = float(MAXIMUM)/max(abs(i) for i in snd_data)
r = array('h')
for i in snd_data:
r.append(int(i*times))
return r
def trim(snd_data):
"Trim the blank spots at the start and end"
def _trim(snd_data):
snd_started = False
r = array('h')
for i in snd_data:
if not snd_started and abs(i)>THRESHOLD:
snd_started = True
r.append(i)
elif snd_started:
r.append(i)
return r
# Trim to the left
snd_data = _trim(snd_data)
# Trim to the right
snd_data.reverse()
snd_data = _trim(snd_data)
snd_data.reverse()
return snd_data
def add_silence(snd_data, seconds):
"Add silence to the start and end of 'snd_data' of length 'seconds' (float)"
r = array('h', [0 for i in xrange(int(seconds*RATE))])
r.extend(snd_data)
r.extend([0 for i in xrange(int(seconds*RATE))])
return r
def record():
"""
Record a word or words from the microphone and
return the data as an array of signed shorts.
Normalizes the audio, trims silence from the
start and end, and pads with 0.5 seconds of
blank sound to make sure VLC et al can play
it without getting chopped off.
"""
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT, channels=1, rate=RATE,
input=True, output=True,
frames_per_buffer=CHUNK_SIZE)
num_silent = 0
snd_started = False
r = array('h')
while 1:
# little endian, signed short
snd_data = array('h', stream.read(1024))
if byteorder == 'big':
snd_data.byteswap()
print "\n\n\n\n\n\n"
print len(snd_data)
print snd_data
r.extend(snd_data)
silent = is_silent(snd_data)
if silent and snd_started:
num_silent += 1
elif not silent and not snd_started:
snd_started = True
if snd_started and num_silent > 1:
break
sample_width = p.get_sample_size(FORMAT)
stream.stop_stream()
stream.close()
p.terminate()
r = normalize(r)
r = trim(r)
r = add_silence(r, 0.5)
return sample_width, r
def record_to_file(path):
"Records from the microphone and outputs the resulting data to 'path'"
sample_width, data = record()
data = pack('<' + ('h'*len(data)), *data)
wf = wave.open(path, 'wb')
wf.setnchannels(1)
wf.setsampwidth(sample_width)
wf.setframerate(RATE)
wf.writeframes(data)
wf.close()
if __name__ == '__main__':
print("please speak a word into the microphone")
record_to_file('demo.wav')
print("done - result written to demo.wav")
I think you'd be better off using either GSreamer or ffmpeg, or if you're on Windows, DirectShow. These libs can handle both audio and video, and should have some kind of a Multiplexer to allow you to mix video and audio properly.
But if you really want to do this using Opencv, you should be able to use VideoCapture to get the frame rate, have you tried using this?
fps = cv.GetCaptureProperty(vc, CV_CAP_PROP_FPS)
Another way would be to estimate fps as number of frames divided by duration:
nFrames = cv.GetCaptureProperty(vc, CV_CAP_PROP_FRAME_COUNT)
cv.SetCaptureProperty(vc, CV_CAP_PROP_POS_AVI_RATIO, 1)
duration = cv.GetCaptureProperty(vc, CV_CAP_PROP_POS_MSEC)
fps = 1000 * nFrames / duration;
I'm not sure I understand what you were trying to do here:
before_read = time.time()
rval, frame = vc.read()
after_read = time.time()
It seems to me that doing after_read - before_read only measures how long it took for OpenCV to load the next frame, it doesn't measure the fps. OpenCV is not trying to do playback, it's only loading frames and it'll try to do so the fastest it can and I think there's no way to configure that. I think that putting a waitKey(1/fps) after displaying each frame will achieve what you're looking for.
You could have 2 counters 1 for audio and one for video.
The video counter will become +(1/fps) when showing an image and audio +sec where sec the seconds of audio you are writing to the stream each time. Then on audio part of the code you can do something like
While audiosec-videosec>=0.05: # Audio is ahead
time.sleep(0.05)
And on video part
While videosec-audiosec>=0.2:# video is ahead
time.sleep(0.2)
You can play with the numbers
This is how i achieve some sort of synchronization on my own video player project using pyaudio recently ffmpeg instead of cv2.
personally i used threading for this.
import concurrent.futures
import pyaudio
import cv2
class Aud_Vid():
def __init__(self, arg):
self.video = cv2.VideoCapture(0)
self.CHUNK = 1470
self.FORMAT = pyaudio.paInt16
self.CHANNELS = 2
self.RATE = 44100
self.audio = pyaudio.PyAudio()
self.instream = self.audio.open(format=self.FORMAT,channels=self.CHANNELS,rate=self.RATE,input=True,frames_per_buffer=self.CHUNK)
self.outstream = self.audio.open(format=self.FORMAT,channels=self.CHANNELS,rate=self.RATE,output=True,frames_per_buffer=self.CHUNK)
def sync(self):
with concurrent.futures.ThreadPoolExecutor() as executor:
tv = executor.submit(self.video.read)
ta = executor.submit(self.instream.read,1470)
vid = tv.result()
aud = ta.result()
return(vid[1].tobytes(),aud)

Live recognition with Python and Pocketsphinx

I have recently been working with pocket sphinx in python. I have successfully got the
example below to work recognising a recorded wav.
#!/usr/bin/env python
import sys,os
def decodeSpeech(hmmd,lmdir,dictp,wavfile):
"""
Decodes a speech file
"""
try:
import pocketsphinx as ps
import sphinxbase
except:
print """Pocket sphinx and sphixbase is not installed
in your system. Please install it with package manager.
"""
speechRec = ps.Decoder(hmm = hmmd, lm = lmdir, dict = dictp)
wavFile = file(wavfile,'rb')
wavFile.seek(44)
speechRec.decode_raw(wavFile)
result = speechRec.get_hyp()
return result[0]
if __name__ == "__main__":
hmdir = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/hmm/wsj1"
lmd = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/lm/wsj/wlist5o.3e-7.vp.tg.lm.DMP"
dictd = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/lm/wsj/wlist5o.dic"
wavfile = "/home/jaganadhg/Desktop/Docs_New/kgisl/sa1.wav"
recognised = decodeSpeech(hmdir,lmd,dictd,wavfile)
print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
print recognised
print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
The problem is how can I do real time speech recognition from a microphone? In
a while loop with a if statement so that if a set word is recognised from the microphone
a function can be called?
The code for realtime recognition looks like this:
config = Decoder.default_config()
config.set_string('-hmm', path.join(MODELDIR, 'en-us/en-us'))
config.set_string('-lm', path.join(MODELDIR, 'en-us/en-us.lm.bin'))
config.set_string('-dict', path.join(MODELDIR, 'en-us/cmudict-en-us.dict'))
config.set_string('-logfn', '/dev/null')
decoder = Decoder(config)
import pyaudio
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
stream.start_stream()
in_speech_bf = False
decoder.start_utt()
while True:
buf = stream.read(1024)
if buf:
decoder.process_raw(buf, False, False)
if decoder.get_in_speech() != in_speech_bf:
in_speech_bf = decoder.get_in_speech()
if not in_speech_bf:
decoder.end_utt()
print 'Result:', decoder.hyp().hypstr
decoder.start_utt()
else:
break
decoder.end_utt()
You can also use gstreamer python bindings in pocketsphinx, check livedemo.py
Try this. Pocketsphinx is now a GStreamer plugin.
This is the code I see on the internet and I've modified a few things to really listen to the words very bad and slow
You can help me modify it for good. It is built on ubuntu 16.04 LTS
I do not know much about programming
Looking forward to help
# -*- encoding: utf-8 -*-
#!/usr/bin/env python
from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *
import os
import pyaudio
import wave
import audioop
from collections import deque
import time
import math;import Mic
"""
Written by Sophie Li, 2016
http://blog.justsophie.com/python-speech-to-text-with-pocketsphinx/
"""
class SpeechDetector:
def __init__(self):
# Microphone stream config.
self.CHUNK = 1024 # CHUNKS of bytes to read each time from mic
self.FORMAT = pyaudio.paInt16
self.CHANNELS = 1
self.RATE = 16000
self.SILENCE_LIMIT = 1 # Silence limit in seconds. The max ammount of seconds where
# only silence is recorded. When this time passes the
# recording finishes and the file is decoded
self.PREV_AUDIO = 0.5 # Previous audio (in seconds) to prepend. When noise
# is detected, how much of previously recorded audio is
# prepended. This helps to prevent chopping the beginning
# of the phrase.
self.THRESHOLD = 4500
self.num_phrases = -1
# These will need to be modified according to where the pocketsphinx folder is
MODELDIR = "/home/l/Desktop/pocketsphinx/model/en-us"
# Create a decoder with certain model
config = Decoder.default_config()
config.set_string('-hmm', os.path.join(MODELDIR, '/home/l/Desktop/pocketsphinx/model/en-us/en-us/'))
config.set_string('-lm', os.path.join(MODELDIR, '/home/l/Desktop/pocketsphinx/model/en-us/en-us.lm.bin'))
config.set_string('-dict', os.path.join(MODELDIR, '/home/l/Desktop/pocketsphinx/model/en-us/cmudict-en-us.dict'))
config.set_string('-keyphrase', 'no one')
config.set_float('-kws_threshold', 1e+20)
# Creaders decoder object for streaming data.
self.decoder = Decoder(config)
def setup_mic(self, num_samples=50):
""" Gets average audio intensity of your mic sound. You can use it to get
average intensities while you're talking and/or silent. The average
is the avg of the .2 of the largest intensities recorded.
"""
#print "Getting intensity values from mic."
p = pyaudio.PyAudio()
stream = p.open(format=self.FORMAT,
channels=self.CHANNELS,
rate=self.RATE,
input=True,
frames_per_buffer=self.CHUNK)
values = [math.sqrt(abs(audioop.avg(stream.read(self.CHUNK), 4)))
for x in range(num_samples)]
values = sorted(values, reverse=True)
r = sum(values[:int(num_samples * 0.2)]) / int(num_samples * 0.2)
#print " Finished "
#print " Average audio intensity is ", r
stream.close()
p.terminate()
if r < 3000:
self.THRESHOLD = 3500
else:
self.THRESHOLD = r + 100
def save_speech(self, data, p):
"""
Saves mic data to temporary WAV file. Returns filename of saved
file
"""
filename = 'output_'+str(int(time.time()))
# writes data to WAV file
data = ''.join(data)
wf = wave.open(filename + '.wav', 'wb')
wf.setnchannels(1)
wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
wf.setframerate(16000) # TODO make this value a function parameter?
wf.writeframes(data)
wf.close()
return filename + '.wav'
def decode_phrase(self, wav_file):
self.decoder.start_utt()
stream = open(wav_file, "rb")
while True:
buf = stream.read(1024)
if buf:
self.decoder.process_raw(buf, False, False)
else:
break
self.decoder.end_utt()
words = []
[words.append(seg.word) for seg in self.decoder.seg()]
return words
def run(self):
"""
Listens to Microphone, extracts phrases from it and calls pocketsphinx
to decode the sound
"""
self.setup_mic()
#Open stream
p = pyaudio.PyAudio()
stream = p.open(format=self.FORMAT,
channels=self.CHANNELS,
rate=self.RATE,
input=True,
frames_per_buffer=self.CHUNK)
audio2send = []
cur_data = '' # current chunk of audio data
rel = self.RATE/self.CHUNK
slid_win = deque(maxlen=self.SILENCE_LIMIT * rel)
#Prepend audio from 0.5 seconds before noise was detected
prev_audio = deque(maxlen=self.PREV_AUDIO * rel)
started = False
while True:
cur_data = stream.read(self.CHUNK)
slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))
if sum([x > self.THRESHOLD for x in slid_win]) > 0:
if started == False:
print "Bắt đầu ghi âm"
started = True
audio2send.append(cur_data)
elif started:
print "Hoàn thành ghi âm"
filename = self.save_speech(list(prev_audio) + audio2send, p)
r = self.decode_phrase(filename)
print "RESULT: ", r
# hot word for me " no one" if r.count('one') and r.count("no") > 0 the end programs
if r.count("one") > 0 and r.count("no") > 0:
Mic.playaudiofromAudio().play("/home/l/Desktop/PROJECT/Audio/beep_hi.wav")
os.remove(filename)
return
# Removes temp audio file
os.remove(filename)
# Reset all
started = False
slid_win = deque(maxlen=self.SILENCE_LIMIT * rel)
prev_audio = deque(maxlen= 0.5 * rel)
audio2send = []
print "Chế độ nghe ..."
else:
prev_audio.append(cur_data)
print "* Hoàn thành nghe"
stream.close()
p.terminate()

Categories