Live recognition with Python and Pocketsphinx

Live recognition with Python and Pocketsphinx - python

I have recently been working with pocket sphinx in python. I have successfully got the
example below to work recognising a recorded wav.
#!/usr/bin/env python
import sys,os
def decodeSpeech(hmmd,lmdir,dictp,wavfile):
"""
Decodes a speech file
"""
try:
import pocketsphinx as ps
import sphinxbase
except:
print """Pocket sphinx and sphixbase is not installed
in your system. Please install it with package manager.
"""
speechRec = ps.Decoder(hmm = hmmd, lm = lmdir, dict = dictp)
wavFile = file(wavfile,'rb')
wavFile.seek(44)
speechRec.decode_raw(wavFile)
result = speechRec.get_hyp()
return result[0]
if __name__ == "__main__":
hmdir = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/hmm/wsj1"
lmd = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/lm/wsj/wlist5o.3e-7.vp.tg.lm.DMP"
dictd = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/lm/wsj/wlist5o.dic"
wavfile = "/home/jaganadhg/Desktop/Docs_New/kgisl/sa1.wav"
recognised = decodeSpeech(hmdir,lmd,dictd,wavfile)
print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
print recognised
print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
The problem is how can I do real time speech recognition from a microphone? In
a while loop with a if statement so that if a set word is recognised from the microphone
a function can be called?

The code for realtime recognition looks like this:
config = Decoder.default_config()
config.set_string('-hmm', path.join(MODELDIR, 'en-us/en-us'))
config.set_string('-lm', path.join(MODELDIR, 'en-us/en-us.lm.bin'))
config.set_string('-dict', path.join(MODELDIR, 'en-us/cmudict-en-us.dict'))
config.set_string('-logfn', '/dev/null')
decoder = Decoder(config)
import pyaudio
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
stream.start_stream()
in_speech_bf = False
decoder.start_utt()
while True:
buf = stream.read(1024)
if buf:
decoder.process_raw(buf, False, False)
if decoder.get_in_speech() != in_speech_bf:
in_speech_bf = decoder.get_in_speech()
if not in_speech_bf:
decoder.end_utt()
print 'Result:', decoder.hyp().hypstr
decoder.start_utt()
else:
break
decoder.end_utt()
You can also use gstreamer python bindings in pocketsphinx, check livedemo.py

Try this. Pocketsphinx is now a GStreamer plugin.

This is the code I see on the internet and I've modified a few things to really listen to the words very bad and slow
You can help me modify it for good. It is built on ubuntu 16.04 LTS
I do not know much about programming
Looking forward to help
# -*- encoding: utf-8 -*-
#!/usr/bin/env python
from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *
import os
import pyaudio
import wave
import audioop
from collections import deque
import time
import math;import Mic
"""
Written by Sophie Li, 2016
http://blog.justsophie.com/python-speech-to-text-with-pocketsphinx/
"""
class SpeechDetector:
def __init__(self):
# Microphone stream config.
self.CHUNK = 1024 # CHUNKS of bytes to read each time from mic
self.FORMAT = pyaudio.paInt16
self.CHANNELS = 1
self.RATE = 16000
self.SILENCE_LIMIT = 1 # Silence limit in seconds. The max ammount of seconds where
# only silence is recorded. When this time passes the
# recording finishes and the file is decoded
self.PREV_AUDIO = 0.5 # Previous audio (in seconds) to prepend. When noise
# is detected, how much of previously recorded audio is
# prepended. This helps to prevent chopping the beginning
# of the phrase.
self.THRESHOLD = 4500
self.num_phrases = -1
# These will need to be modified according to where the pocketsphinx folder is
MODELDIR = "/home/l/Desktop/pocketsphinx/model/en-us"
# Create a decoder with certain model
config = Decoder.default_config()
config.set_string('-hmm', os.path.join(MODELDIR, '/home/l/Desktop/pocketsphinx/model/en-us/en-us/'))
config.set_string('-lm', os.path.join(MODELDIR, '/home/l/Desktop/pocketsphinx/model/en-us/en-us.lm.bin'))
config.set_string('-dict', os.path.join(MODELDIR, '/home/l/Desktop/pocketsphinx/model/en-us/cmudict-en-us.dict'))
config.set_string('-keyphrase', 'no one')
config.set_float('-kws_threshold', 1e+20)
# Creaders decoder object for streaming data.
self.decoder = Decoder(config)
def setup_mic(self, num_samples=50):
""" Gets average audio intensity of your mic sound. You can use it to get
average intensities while you're talking and/or silent. The average
is the avg of the .2 of the largest intensities recorded.
"""
#print "Getting intensity values from mic."
p = pyaudio.PyAudio()
stream = p.open(format=self.FORMAT,
channels=self.CHANNELS,
rate=self.RATE,
input=True,
frames_per_buffer=self.CHUNK)
values = [math.sqrt(abs(audioop.avg(stream.read(self.CHUNK), 4)))
for x in range(num_samples)]
values = sorted(values, reverse=True)
r = sum(values[:int(num_samples * 0.2)]) / int(num_samples * 0.2)
#print " Finished "
#print " Average audio intensity is ", r
stream.close()
p.terminate()
if r < 3000:
self.THRESHOLD = 3500
else:
self.THRESHOLD = r + 100
def save_speech(self, data, p):
"""
Saves mic data to temporary WAV file. Returns filename of saved
file
"""
filename = 'output_'+str(int(time.time()))
# writes data to WAV file
data = ''.join(data)
wf = wave.open(filename + '.wav', 'wb')
wf.setnchannels(1)
wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
wf.setframerate(16000) # TODO make this value a function parameter?
wf.writeframes(data)
wf.close()
return filename + '.wav'
def decode_phrase(self, wav_file):
self.decoder.start_utt()
stream = open(wav_file, "rb")
while True:
buf = stream.read(1024)
if buf:
self.decoder.process_raw(buf, False, False)
else:
break
self.decoder.end_utt()
words = []
[words.append(seg.word) for seg in self.decoder.seg()]
return words
def run(self):
"""
Listens to Microphone, extracts phrases from it and calls pocketsphinx
to decode the sound
"""
self.setup_mic()
#Open stream
p = pyaudio.PyAudio()
stream = p.open(format=self.FORMAT,
channels=self.CHANNELS,
rate=self.RATE,
input=True,
frames_per_buffer=self.CHUNK)
audio2send = []
cur_data = '' # current chunk of audio data
rel = self.RATE/self.CHUNK
slid_win = deque(maxlen=self.SILENCE_LIMIT * rel)
#Prepend audio from 0.5 seconds before noise was detected
prev_audio = deque(maxlen=self.PREV_AUDIO * rel)
started = False
while True:
cur_data = stream.read(self.CHUNK)
slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))
if sum([x > self.THRESHOLD for x in slid_win]) > 0:
if started == False:
print "Bắt đầu ghi âm"
started = True
audio2send.append(cur_data)
elif started:
print "Hoàn thành ghi âm"
filename = self.save_speech(list(prev_audio) + audio2send, p)
r = self.decode_phrase(filename)
print "RESULT: ", r
# hot word for me " no one" if r.count('one') and r.count("no") > 0 the end programs
if r.count("one") > 0 and r.count("no") > 0:
Mic.playaudiofromAudio().play("/home/l/Desktop/PROJECT/Audio/beep_hi.wav")
os.remove(filename)
return
# Removes temp audio file
os.remove(filename)
# Reset all
started = False
slid_win = deque(maxlen=self.SILENCE_LIMIT * rel)
prev_audio = deque(maxlen= 0.5 * rel)
audio2send = []
print "Chế độ nghe ..."
else:
prev_audio.append(cur_data)
print "* Hoàn thành nghe"
stream.close()
p.terminate()

Related

Deepspeech realtime speech to text

How can I do real-time speech to text using deep speech and a microphone?
I tried running this script I found on GitHub, but when I run it and I do not say anything for a while, it starts printing random text.
import pyaudio
import deepspeech
import numpy as np
from queue import SimpleQueue
BUFFERS_PER_SECOND = 10
SAMPLE_WIDTH = 2
BEAM_WIDTH = 512
#switch between tensorflow and tensorflow light model
#MODEL_PATH = 'deepspeech-0.8.1-models.tflite'
MODEL_PATH = 'models\DeepSpeech\deepspeech-0.9.3-models.pbmm'
SCORER_PATH = 'models\DeepSpeech\deepspeech-0.9.3-models.scorer'
buffer_queue = SimpleQueue()
def audio_callback(in_data, frame_count, time_info, status_flags):
buffer_queue.put(np.frombuffer(in_data, dtype='int16'))
return (None, pyaudio.paContinue)
def find_device(pyaudio, device_name):
''' find specific device or return default input device'''
default = pyaudio.get_default_input_device_info()
for i in range(pyaudio.get_device_count()):
name = pyaudio.get_device_info_by_index(i)['name']
if name == device_name:
return (i, name)
return (default['index'], default['name'])
def main():
model = deepspeech.Model(MODEL_PATH)
model.setBeamWidth(BEAM_WIDTH)
model.enableExternalScorer(SCORER_PATH)
stream = model.createStream()
audio = pyaudio.PyAudio()
index, name = find_device(audio, 'pulse')
print(f'select device {name}')
buffer_size = model.sampleRate() // BUFFERS_PER_SECOND
audio_stream = audio.open(rate=model.sampleRate(),
channels=1,
format=audio.get_format_from_width(
SAMPLE_WIDTH, unsigned=False),
input_device_index=index,
input=True,
frames_per_buffer=buffer_size,
stream_callback=audio_callback)
num_iterations = BUFFERS_PER_SECOND * 2
i = 0
while audio_stream.is_active():
stream.feedAudioContent(buffer_queue.get())
if i % num_iterations == 0:
text = stream.intermediateDecode()
if text.find('stop') >= 0:
break
print(text)
i += 1
print(stream.finishStream())
audio_stream.close()
if __name__ == '__main__':
main()
#find_device()
I know there are other options, but every option I found was either free trial or instantly paying. So if someone could help me work with DeepSpeech, or if somebody knows a free alternative, I would really appreciate it

Using Python on Raspberry Pi, I'm unable to record audio with PyAudio via Thonny/terminal

Using the example from: https://realpython.com/playing-and-recording-sound-python/#python-sounddevice_1 , I'm getting the following error when using Thonny: "Backend terminated or disconnected. Use 'Stop/Restart' to restart.". When I run the program in terminal, I get this as the error: "OSError: [Errno -9981] Input overflowed". The example code (which isn't threaded) can work in both terminal and Thonny if I modify it to not throw an exception on overflow "data = stream.read(chunk, exception_on_overflow = False)" but does not work when in a new thread. I've also tried changing the chunk size to be larger and smaller to no avail. When I have the overflow exception for the threaded version, I get a different error in terminal: "Segmentation fault". I'm running Raspbian 10 Buster and Python 3.7.3, if someone could test/see if it works, thanks.
import time
import board
import busio
import digitalio
from adafruit_mcp230xx.mcp23017 import MCP23017
from adafruit_debouncer import Debouncer
i2c = busio.I2C(board.SCL, board.SDA)
mcp = MCP23017(i2c)
import threading
import pyaudio
import wave
import sys
import subprocess
record = False
def background_audio_recording():
#chunk = 1024 # Record in chunks of 1024 samples
chunk = 1024
sample_format = pyaudio.paInt16 # 16 bits per sample
channels = 2
fs = 44100 # Record at 44100 samples per second
#seconds = 3
filename = "output.wav"
p = pyaudio.PyAudio() # Create an interface to PortAudio
stream = p.open(format=sample_format,
channels=channels,
rate=fs,
frames_per_buffer=chunk,
#input_device_index = 2,
input=True)
frames = [] # Initialize array to store frames
while record:
#data = stream.read(chunk, exception_on_overflow = False)
data = stream.read(chunk)
frames.append(data)
'''
for i in range(0, int(fs / chunk * seconds)):
data = stream.read(chunk)
frames.append(data)
'''
# Stop and close the stream
stream.stop_stream()
stream.close()
# Terminate the PortAudio interface
p.terminate()
print('Finished recording')
# Save the recorded data as a WAV file
wf = wave.open(filename, 'wb')
wf.setnchannels(channels)
wf.setsampwidth(p.get_sample_size(sample_format))
wf.setframerate(fs)
wf.writeframes(b''.join(frames))
wf.close()
print("File Saved")
return
button1PinSetup = mcp.get_pin(0) # GPA0
button1PinSetup.direction = digitalio.Direction.INPUT
button1PinSetup.pull = digitalio.Pull.UP
button1Pin = Debouncer(button1PinSetup)
button2PinSetup = mcp.get_pin(1) # GPA1
button2PinSetup.direction = digitalio.Direction.INPUT
button2PinSetup.pull = digitalio.Pull.UP
button2Pin = Debouncer(button2PinSetup)
while True:
button1Pin.update()
button2Pin.update()
if button1Pin.fell:
print("Record")
if record == True:
print("Already Recording")
else:
record = True
threading.Thread(target=background_audio_recording).start()
if button2Pin.fell:
print("Recording Stopped")
record = False

It seems as though, I needed to create the PyAudio object as a global, as well as within the thread (for each additional recording attempt). I got it working with the following code
import pyaudio
import wave
import time
import threading
import board
import busio
import digitalio
from adafruit_mcp230xx.mcp23017 import MCP23017
from adafruit_debouncer import Debouncer
i2c = busio.I2C(board.SCL, board.SDA)
mcp = MCP23017(i2c)
form_1 = pyaudio.paInt16 # 16-bit resolution
chans = 2 # 1 channel
samp_rate = 44100 # 44.1kHz sampling rate
chunk = 4096 # 2^12 samples for buffer
#record_secs = 3 # seconds to record
dev_index = 2 # device index found by p.get_device_info_by_index(ii)
wav_output_filename = 'test1.wav' # name of .wav file
audio = pyaudio.PyAudio() # create pyaudio instantiation
record = False
def background_audio_recording():
audio = pyaudio.PyAudio() # create pyaudio instantiation
# create pyaudio stream
stream = audio.open(format = form_1,rate = samp_rate,channels = chans, \
input_device_index = dev_index,input = True, \
frames_per_buffer=chunk)
print("recording")
frames = []
# loop through stream and append audio chunks to frame array
#for ii in range(0,int((samp_rate/chunk)*record_secs)):
while record == True:
data = stream.read(chunk)
frames.append(data)
print("finished recording")
# stop the stream, close it, and terminate the pyaudio instantiation
stream.stop_stream()
stream.close()
audio.terminate()
# save the audio frames as .wav file
wavefile = wave.open(wav_output_filename,'wb')
wavefile.setnchannels(chans)
wavefile.setsampwidth(audio.get_sample_size(form_1))
wavefile.setframerate(samp_rate)
wavefile.writeframes(b''.join(frames))
wavefile.close()
return
button1PinSetup = mcp.get_pin(0) # GPA0
button1PinSetup.direction = digitalio.Direction.INPUT
button1PinSetup.pull = digitalio.Pull.UP
button1Pin = Debouncer(button1PinSetup)
button2PinSetup = mcp.get_pin(1) # GPA1
button2PinSetup.direction = digitalio.Direction.INPUT
button2PinSetup.pull = digitalio.Pull.UP
button2Pin = Debouncer(button2PinSetup)
while True:
button1Pin.update()
button2Pin.update()
if button1Pin.fell:
print("Record")
if record == True:
print("Already Recording")
else:
record = True
threading.Thread(target=background_audio_recording).start()
if button2Pin.fell:
print("Recording Stopped")
record = False

Synchronizing audio and video with OpenCV and PyAudio

I have gotten both OpenCV and PyAudio working however I am not sure how I would sync them together. I am unable to get a framerate from OpenCV and measuring the call time for a frame changes from moment to moment. However with PyAudio it's basis is grabbing a certain sample rate. How would I sync them to be at the same rate. I assume there is some standard or some way codecs do it. (I've tried google all I got was information on lip syncing :/).
OpenCV Frame rate
from __future__ import division
import time
import math
import cv2, cv
vc = cv2.VideoCapture(0)
# get the frame
while True:
before_read = time.time()
rval, frame = vc.read()
after_read = time.time()
if frame is not None:
print len(frame)
print math.ceil((1.0 / (after_read - before_read)))
cv2.imshow("preview", frame)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
else:
print "None..."
cv2.waitKey(1)
# display the frame
while True:
cv2.imshow("preview", frame)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
Grabbing and saving audio
from sys import byteorder
from array import array
from struct import pack
import pyaudio
import wave
THRESHOLD = 500
CHUNK_SIZE = 1024
FORMAT = pyaudio.paInt16
RATE = 44100
def is_silent(snd_data):
"Returns 'True' if below the 'silent' threshold"
print "\n\n\n\n\n\n\n\n"
print max(snd_data)
print "\n\n\n\n\n\n\n\n"
return max(snd_data) < THRESHOLD
def normalize(snd_data):
"Average the volume out"
MAXIMUM = 16384
times = float(MAXIMUM)/max(abs(i) for i in snd_data)
r = array('h')
for i in snd_data:
r.append(int(i*times))
return r
def trim(snd_data):
"Trim the blank spots at the start and end"
def _trim(snd_data):
snd_started = False
r = array('h')
for i in snd_data:
if not snd_started and abs(i)>THRESHOLD:
snd_started = True
r.append(i)
elif snd_started:
r.append(i)
return r
# Trim to the left
snd_data = _trim(snd_data)
# Trim to the right
snd_data.reverse()
snd_data = _trim(snd_data)
snd_data.reverse()
return snd_data
def add_silence(snd_data, seconds):
"Add silence to the start and end of 'snd_data' of length 'seconds' (float)"
r = array('h', [0 for i in xrange(int(seconds*RATE))])
r.extend(snd_data)
r.extend([0 for i in xrange(int(seconds*RATE))])
return r
def record():
"""
Record a word or words from the microphone and
return the data as an array of signed shorts.
Normalizes the audio, trims silence from the
start and end, and pads with 0.5 seconds of
blank sound to make sure VLC et al can play
it without getting chopped off.
"""
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT, channels=1, rate=RATE,
input=True, output=True,
frames_per_buffer=CHUNK_SIZE)
num_silent = 0
snd_started = False
r = array('h')
while 1:
# little endian, signed short
snd_data = array('h', stream.read(1024))
if byteorder == 'big':
snd_data.byteswap()
print "\n\n\n\n\n\n"
print len(snd_data)
print snd_data
r.extend(snd_data)
silent = is_silent(snd_data)
if silent and snd_started:
num_silent += 1
elif not silent and not snd_started:
snd_started = True
if snd_started and num_silent > 1:
break
sample_width = p.get_sample_size(FORMAT)
stream.stop_stream()
stream.close()
p.terminate()
r = normalize(r)
r = trim(r)
r = add_silence(r, 0.5)
return sample_width, r
def record_to_file(path):
"Records from the microphone and outputs the resulting data to 'path'"
sample_width, data = record()
data = pack('<' + ('h'*len(data)), *data)
wf = wave.open(path, 'wb')
wf.setnchannels(1)
wf.setsampwidth(sample_width)
wf.setframerate(RATE)
wf.writeframes(data)
wf.close()
if __name__ == '__main__':
print("please speak a word into the microphone")
record_to_file('demo.wav')
print("done - result written to demo.wav")

I think you'd be better off using either GSreamer or ffmpeg, or if you're on Windows, DirectShow. These libs can handle both audio and video, and should have some kind of a Multiplexer to allow you to mix video and audio properly.
But if you really want to do this using Opencv, you should be able to use VideoCapture to get the frame rate, have you tried using this?
fps = cv.GetCaptureProperty(vc, CV_CAP_PROP_FPS)
Another way would be to estimate fps as number of frames divided by duration:
nFrames = cv.GetCaptureProperty(vc, CV_CAP_PROP_FRAME_COUNT)
cv.SetCaptureProperty(vc, CV_CAP_PROP_POS_AVI_RATIO, 1)
duration = cv.GetCaptureProperty(vc, CV_CAP_PROP_POS_MSEC)
fps = 1000 * nFrames / duration;
I'm not sure I understand what you were trying to do here:
before_read = time.time()
rval, frame = vc.read()
after_read = time.time()
It seems to me that doing after_read - before_read only measures how long it took for OpenCV to load the next frame, it doesn't measure the fps. OpenCV is not trying to do playback, it's only loading frames and it'll try to do so the fastest it can and I think there's no way to configure that. I think that putting a waitKey(1/fps) after displaying each frame will achieve what you're looking for.

You could have 2 counters 1 for audio and one for video.
The video counter will become +(1/fps) when showing an image and audio +sec where sec the seconds of audio you are writing to the stream each time. Then on audio part of the code you can do something like
While audiosec-videosec>=0.05: # Audio is ahead
time.sleep(0.05)
And on video part
While videosec-audiosec>=0.2:# video is ahead
time.sleep(0.2)
You can play with the numbers
This is how i achieve some sort of synchronization on my own video player project using pyaudio recently ffmpeg instead of cv2.

personally i used threading for this.
import concurrent.futures
import pyaudio
import cv2
class Aud_Vid():
def __init__(self, arg):
self.video = cv2.VideoCapture(0)
self.CHUNK = 1470
self.FORMAT = pyaudio.paInt16
self.CHANNELS = 2
self.RATE = 44100
self.audio = pyaudio.PyAudio()
self.instream = self.audio.open(format=self.FORMAT,channels=self.CHANNELS,rate=self.RATE,input=True,frames_per_buffer=self.CHUNK)
self.outstream = self.audio.open(format=self.FORMAT,channels=self.CHANNELS,rate=self.RATE,output=True,frames_per_buffer=self.CHUNK)
def sync(self):
with concurrent.futures.ThreadPoolExecutor() as executor:
tv = executor.submit(self.video.read)
ta = executor.submit(self.instream.read,1470)
vid = tv.result()
aud = ta.result()
return(vid[1].tobytes(),aud)

How to capture a video (AND audio) in python, from a camera (or webcam)

i'm looking for a solution, either in linux or in windows, that allows me to
record video (+audio) from my webcam & microphone, simultaneously.
save it as a file.AVI (or mpg or whatever)
display the video on the screen while recording it
Compression is NOT an issue in my case, and i actually prefer to capture RAW and compress it later.
So far i've done it with an ActiveX component in VB which took care of everything, and i'd like to progress with python (the VB solution is unstable, unreliable).
so far i've seen code that captures VIDEO only, or individual frames...
I've looked so far at
OpenCV - couldn't find audio capture there
PyGame - no simultaneous audio capture (AFAIK)
VideoCapture - provide only single frames.
SimpleCV - no audio
VLC - binding to VideoLAN program into wxPthon - hopefully it will do (still investigating this option)
kivy - just heard about it, didn't manage to get it working under windows SO FAR.
The question - is there a video & audio capture library for python?
or - what are the other options if any?

Answer: No. There is no single library/solution in python to do video/audio recording simultaneously. You have to implement both separately and merge the audio and video signal in a smart way to end up with a video/audio file.
I got a solution for the problem you present. My code addresses your three issues:
Records video + audio from webcam and microphone simultaneously.
It saves the final video/audio file as .AVI
Un-commenting lines 76, 77 and 78 will make the video to be displayed to screen while recording.
My solution uses pyaudio for audio recording, opencv for video recording, and ffmpeg for muxing the two signals. To be able to record both simultaneously, I use multithreading. One thread records video, and a second one the audio. I have uploaded my code to github and also have included all the essential parts it here.
https://github.com/JRodrigoF/AVrecordeR
Note: opencv is not able to control the fps at which the webcamera does the recording. It is only able to specify in the encoding of the file the desired final fps, but the webcamera usually behaves differently according to specifications and light conditions (I found). So the fps have to be controlled at the level of the code.
import cv2
import pyaudio
import wave
import threading
import time
import subprocess
import os
class VideoRecorder():
# Video class based on openCV
def __init__(self):
self.open = True
self.device_index = 0
self.fps = 6 # fps should be the minimum constant rate at which the camera can
self.fourcc = "MJPG" # capture images (with no decrease in speed over time; testing is required)
self.frameSize = (640,480) # video formats and sizes also depend and vary according to the camera used
self.video_filename = "temp_video.avi"
self.video_cap = cv2.VideoCapture(self.device_index)
self.video_writer = cv2.VideoWriter_fourcc(*self.fourcc)
self.video_out = cv2.VideoWriter(self.video_filename, self.video_writer, self.fps, self.frameSize)
self.frame_counts = 1
self.start_time = time.time()
# Video starts being recorded
def record(self):
# counter = 1
timer_start = time.time()
timer_current = 0
while(self.open==True):
ret, video_frame = self.video_cap.read()
if (ret==True):
self.video_out.write(video_frame)
# print str(counter) + " " + str(self.frame_counts) + " frames written " + str(timer_current)
self.frame_counts += 1
# counter += 1
# timer_current = time.time() - timer_start
time.sleep(0.16)
# gray = cv2.cvtColor(video_frame, cv2.COLOR_BGR2GRAY)
# cv2.imshow('video_frame', gray)
# cv2.waitKey(1)
else:
break
# 0.16 delay -> 6 fps
#
# Finishes the video recording therefore the thread too
def stop(self):
if self.open==True:
self.open=False
self.video_out.release()
self.video_cap.release()
cv2.destroyAllWindows()
else:
pass
# Launches the video recording function using a thread
def start(self):
video_thread = threading.Thread(target=self.record)
video_thread.start()
class AudioRecorder():
# Audio class based on pyAudio and Wave
def __init__(self):
self.open = True
self.rate = 44100
self.frames_per_buffer = 1024
self.channels = 2
self.format = pyaudio.paInt16
self.audio_filename = "temp_audio.wav"
self.audio = pyaudio.PyAudio()
self.stream = self.audio.open(format=self.format,
channels=self.channels,
rate=self.rate,
input=True,
frames_per_buffer = self.frames_per_buffer)
self.audio_frames = []
# Audio starts being recorded
def record(self):
self.stream.start_stream()
while(self.open == True):
data = self.stream.read(self.frames_per_buffer)
self.audio_frames.append(data)
if self.open==False:
break
# Finishes the audio recording therefore the thread too
def stop(self):
if self.open==True:
self.open = False
self.stream.stop_stream()
self.stream.close()
self.audio.terminate()
waveFile = wave.open(self.audio_filename, 'wb')
waveFile.setnchannels(self.channels)
waveFile.setsampwidth(self.audio.get_sample_size(self.format))
waveFile.setframerate(self.rate)
waveFile.writeframes(b''.join(self.audio_frames))
waveFile.close()
pass
# Launches the audio recording function using a thread
def start(self):
audio_thread = threading.Thread(target=self.record)
audio_thread.start()
def start_AVrecording(filename):
global video_thread
global audio_thread
video_thread = VideoRecorder()
audio_thread = AudioRecorder()
audio_thread.start()
video_thread.start()
return filename
def start_video_recording(filename):
global video_thread
video_thread = VideoRecorder()
video_thread.start()
return filename
def start_audio_recording(filename):
global audio_thread
audio_thread = AudioRecorder()
audio_thread.start()
return filename
def stop_AVrecording(filename):
audio_thread.stop()
frame_counts = video_thread.frame_counts
elapsed_time = time.time() - video_thread.start_time
recorded_fps = frame_counts / elapsed_time
print "total frames " + str(frame_counts)
print "elapsed time " + str(elapsed_time)
print "recorded fps " + str(recorded_fps)
video_thread.stop()
# Makes sure the threads have finished
while threading.active_count() > 1:
time.sleep(1)
# Merging audio and video signal
if abs(recorded_fps - 6) >= 0.01: # If the fps rate was higher/lower than expected, re-encode it to the expected
print "Re-encoding"
cmd = "ffmpeg -r " + str(recorded_fps) + " -i temp_video.avi -pix_fmt yuv420p -r 6 temp_video2.avi"
subprocess.call(cmd, shell=True)
print "Muxing"
cmd = "ffmpeg -ac 2 -channel_layout stereo -i temp_audio.wav -i temp_video2.avi -pix_fmt yuv420p " + filename + ".avi"
subprocess.call(cmd, shell=True)
else:
print "Normal recording\nMuxing"
cmd = "ffmpeg -ac 2 -channel_layout stereo -i temp_audio.wav -i temp_video.avi -pix_fmt yuv420p " + filename + ".avi"
subprocess.call(cmd, shell=True)
print ".."
# Required and wanted processing of final files
def file_manager(filename):
local_path = os.getcwd()
if os.path.exists(str(local_path) + "/temp_audio.wav"):
os.remove(str(local_path) + "/temp_audio.wav")
if os.path.exists(str(local_path) + "/temp_video.avi"):
os.remove(str(local_path) + "/temp_video.avi")
if os.path.exists(str(local_path) + "/temp_video2.avi"):
os.remove(str(local_path) + "/temp_video2.avi")
if os.path.exists(str(local_path) + "/" + filename + ".avi"):
os.remove(str(local_path) + "/" + filename + ".avi")

To the questions asked above: Yes the code should also works under Python3. I adjusted it a little bit and now works for python2 and python3 (tested it on windows7 with 2.7 and 3.6, though you need to have ffmpeg installed or the executable ffmpeg.exe at least in the same directory, you can get it here: https://www.ffmpeg.org/download.html ). Of course you also need all the other libraries cv2, numpy, pyaudio, installed like herewith:
pip install opencv-python numpy pyaudio
You can now run the code directly:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# VideoRecorder.py
from __future__ import print_function, division
import numpy as np
import cv2
import pyaudio
import wave
import threading
import time
import subprocess
import os
class VideoRecorder():
"Video class based on openCV"
def __init__(self, name="temp_video.avi", fourcc="MJPG", sizex=640, sizey=480, camindex=0, fps=30):
self.open = True
self.device_index = camindex
self.fps = fps # fps should be the minimum constant rate at which the camera can
self.fourcc = fourcc # capture images (with no decrease in speed over time; testing is required)
self.frameSize = (sizex, sizey) # video formats and sizes also depend and vary according to the camera used
self.video_filename = name
self.video_cap = cv2.VideoCapture(self.device_index)
self.video_writer = cv2.VideoWriter_fourcc(*self.fourcc)
self.video_out = cv2.VideoWriter(self.video_filename, self.video_writer, self.fps, self.frameSize)
self.frame_counts = 1
self.start_time = time.time()
def record(self):
"Video starts being recorded"
# counter = 1
timer_start = time.time()
timer_current = 0
while self.open:
ret, video_frame = self.video_cap.read()
if ret:
self.video_out.write(video_frame)
# print(str(counter) + " " + str(self.frame_counts) + " frames written " + str(timer_current))
self.frame_counts += 1
# counter += 1
# timer_current = time.time() - timer_start
time.sleep(1/self.fps)
# gray = cv2.cvtColor(video_frame, cv2.COLOR_BGR2GRAY)
# cv2.imshow('video_frame', gray)
# cv2.waitKey(1)
else:
break
def stop(self):
"Finishes the video recording therefore the thread too"
if self.open:
self.open=False
self.video_out.release()
self.video_cap.release()
cv2.destroyAllWindows()
def start(self):
"Launches the video recording function using a thread"
video_thread = threading.Thread(target=self.record)
video_thread.start()
class AudioRecorder():
"Audio class based on pyAudio and Wave"
def __init__(self, filename="temp_audio.wav", rate=44100, fpb=1024, channels=2):
self.open = True
self.rate = rate
self.frames_per_buffer = fpb
self.channels = channels
self.format = pyaudio.paInt16
self.audio_filename = filename
self.audio = pyaudio.PyAudio()
self.stream = self.audio.open(format=self.format,
channels=self.channels,
rate=self.rate,
input=True,
frames_per_buffer = self.frames_per_buffer)
self.audio_frames = []
def record(self):
"Audio starts being recorded"
self.stream.start_stream()
while self.open:
data = self.stream.read(self.frames_per_buffer)
self.audio_frames.append(data)
if not self.open:
break
def stop(self):
"Finishes the audio recording therefore the thread too"
if self.open:
self.open = False
self.stream.stop_stream()
self.stream.close()
self.audio.terminate()
waveFile = wave.open(self.audio_filename, 'wb')
waveFile.setnchannels(self.channels)
waveFile.setsampwidth(self.audio.get_sample_size(self.format))
waveFile.setframerate(self.rate)
waveFile.writeframes(b''.join(self.audio_frames))
waveFile.close()
def start(self):
"Launches the audio recording function using a thread"
audio_thread = threading.Thread(target=self.record)
audio_thread.start()
def start_AVrecording(filename="test"):
global video_thread
global audio_thread
video_thread = VideoRecorder()
audio_thread = AudioRecorder()
audio_thread.start()
video_thread.start()
return filename
def start_video_recording(filename="test"):
global video_thread
video_thread = VideoRecorder()
video_thread.start()
return filename
def start_audio_recording(filename="test"):
global audio_thread
audio_thread = AudioRecorder()
audio_thread.start()
return filename
def stop_AVrecording(filename="test"):
audio_thread.stop()
frame_counts = video_thread.frame_counts
elapsed_time = time.time() - video_thread.start_time
recorded_fps = frame_counts / elapsed_time
print("total frames " + str(frame_counts))
print("elapsed time " + str(elapsed_time))
print("recorded fps " + str(recorded_fps))
video_thread.stop()
# Makes sure the threads have finished
while threading.active_count() > 1:
time.sleep(1)
# Merging audio and video signal
if abs(recorded_fps - 6) >= 0.01: # If the fps rate was higher/lower than expected, re-encode it to the expected
print("Re-encoding")
cmd = "ffmpeg -r " + str(recorded_fps) + " -i temp_video.avi -pix_fmt yuv420p -r 6 temp_video2.avi"
subprocess.call(cmd, shell=True)
print("Muxing")
cmd = "ffmpeg -y -ac 2 -channel_layout stereo -i temp_audio.wav -i temp_video2.avi -pix_fmt yuv420p " + filename + ".avi"
subprocess.call(cmd, shell=True)
else:
print("Normal recording\nMuxing")
cmd = "ffmpeg -y -ac 2 -channel_layout stereo -i temp_audio.wav -i temp_video.avi -pix_fmt yuv420p " + filename + ".avi"
subprocess.call(cmd, shell=True)
print("..")
def file_manager(filename="test"):
"Required and wanted processing of final files"
local_path = os.getcwd()
if os.path.exists(str(local_path) + "/temp_audio.wav"):
os.remove(str(local_path) + "/temp_audio.wav")
if os.path.exists(str(local_path) + "/temp_video.avi"):
os.remove(str(local_path) + "/temp_video.avi")
if os.path.exists(str(local_path) + "/temp_video2.avi"):
os.remove(str(local_path) + "/temp_video2.avi")
# if os.path.exists(str(local_path) + "/" + filename + ".avi"):
# os.remove(str(local_path) + "/" + filename + ".avi")
if __name__ == '__main__':
start_AVrecording()
time.sleep(5)
stop_AVrecording()
file_manager()

I would recommend ffmpeg. There is a python wrapper.
http://code.google.com/p/pyffmpeg/

I've been looking around for a good answer to this, and I think it is GStreamer...
The documentation for the python bindings is extremely light, and most of it seemed centered around the old 0.10 version of GStreamer instead of the new 1.X versions, but GStreamer is a extremely powerful, cross-platform multimedia framework that can stream, mux, transcode, and display just about anything.

I used JRodrigoF's script for a while on a project. However, I noticed that sometimes the threads would hang and it would cause the program to crash. Another issue is that openCV does not capture video frames at a reliable rate and ffmpeg would distort my video when re-encoding.
I came up with a new solution that records much more reliably and with much higher quality for my application. It presently only works for Windows because it uses pywinauto and the built-in Windows Camera app. The last bit of the script does some error-checking to confirm the video successfully recorded by checking the timestamp of the name of the video.
https://gist.github.com/mjdargen/956cc968864f38bfc4e20c9798c7d670
import pywinauto
import time
import subprocess
import os
import datetime
def win_record(duration):
subprocess.run('start microsoft.windows.camera:', shell=True) # open camera app
# focus window by getting handle using title and class name
# subprocess call opens camera and gets focus, but this provides alternate way
# t, c = 'Camera', 'ApplicationFrameWindow'
# handle = pywinauto.findwindows.find_windows(title=t, class_name=c)[0]
# # get app and window
# app = pywinauto.application.Application().connect(handle=handle)
# window = app.window(handle=handle)
# window.set_focus() # set focus
time.sleep(2) # have to sleep
# take control of camera window to take video
desktop = pywinauto.Desktop(backend="uia")
cam = desktop['Camera']
# cam.print_control_identifiers()
# make sure in video mode
if cam.child_window(title="Switch to Video mode", auto_id="CaptureButton_1", control_type="Button").exists():
cam.child_window(title="Switch to Video mode", auto_id="CaptureButton_1", control_type="Button").click()
time.sleep(1)
# start then stop video
cam.child_window(title="Take Video", auto_id="CaptureButton_1", control_type="Button").click()
time.sleep(duration+2)
cam.child_window(title="Stop taking Video", auto_id="CaptureButton_1", control_type="Button").click()
# retrieve vids from camera roll and sort
dir = 'C:/Users/m/Pictures/Camera Roll'
all_contents = list(os.listdir(dir))
vids = [f for f in all_contents if "_Pro.mp4" in f]
vids.sort()
vid = vids[-1] # get last vid
# compute time difference
vid_time = vid.replace('WIN_', '').replace('_Pro.mp4', '')
vid_time = datetime.datetime.strptime(vid_time, '%Y%m%d_%H_%M_%S')
now = datetime.datetime.now()
diff = now - vid_time
# time different greater than 2 minutes, assume something wrong & quit
if diff.seconds > 120:
quit()
subprocess.run('Taskkill /IM WindowsCamera.exe /F', shell=True) # close camera app
print('Recorded successfully!')
win_record(2)

If you notice misalignment between video and audio by the code above, please see my solution below
I think the most rated answer above does a great job. However, it did not work perfectly when I was using it,especially when you use a low fps rate (say 10). The main issue is with video recording. In order to properly synchronize video and audio recording with ffmpeg, one has to make sure that cv2.VideoCapture() and cv2.VideoWriter() share exact same FPS, because the recorded video time length is solely determined by fps rate and the number of frames.
Following is my suggested update:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# VideoRecorder.py
from __future__ import print_function, division
import numpy as np
import cv2
import pyaudio
import wave
import threading
import time
import subprocess
import os
import ffmpeg
class VideoRecorder():
"Video class based on openCV"
def __init__(self, name="temp_video.avi", fourcc="MJPG", sizex=640, sizey=480, camindex=0, fps=30):
self.open = True
self.device_index = camindex
self.fps = fps # fps should be the minimum constant rate at which the camera can
self.fourcc = fourcc # capture images (with no decrease in speed over time; testing is required)
self.frameSize = (sizex, sizey) # video formats and sizes also depend and vary according to the camera used
self.video_filename = name
self.video_cap = cv2.VideoCapture(self.device_index)
self.video_cap.set(cv2.CAP_PROP_FPS, self.fps)
self.video_writer = cv2.VideoWriter_fourcc(*self.fourcc)
self.video_out = cv2.VideoWriter(self.video_filename, self.video_writer, self.fps, self.frameSize)
self.frame_counts = 1
self.start_time = time.time()
def record(self):
"Video starts being recorded"
# counter = 1
timer_start = time.time()
timer_current = 0
while self.open:
ret, video_frame = self.video_cap.read()
if ret:
self.video_out.write(video_frame)
# print(str(counter) + " " + str(self.frame_counts) + " frames written " + str(timer_current))
self.frame_counts += 1
# print(self.frame_counts)
# counter += 1
# timer_current = time.time() - timer_start
# time.sleep(1/self.fps)
# gray = cv2.cvtColor(video_frame, cv2.COLOR_BGR2GRAY)
# cv2.imshow('video_frame', gray)
# cv2.waitKey(1)
else:
break
def stop(self):
"Finishes the video recording therefore the thread too"
if self.open:
self.open=False
self.video_out.release()
self.video_cap.release()
cv2.destroyAllWindows()
def start(self):
"Launches the video recording function using a thread"
video_thread = threading.Thread(target=self.record)
video_thread.start()
class AudioRecorder():
"Audio class based on pyAudio and Wave"
def __init__(self, filename="temp_audio.wav", rate=44100, fpb=1024, channels=2):
self.open = True
self.rate = rate
self.frames_per_buffer = fpb
self.channels = channels
self.format = pyaudio.paInt16
self.audio_filename = filename
self.audio = pyaudio.PyAudio()
self.stream = self.audio.open(format=self.format,
channels=self.channels,
rate=self.rate,
input=True,
frames_per_buffer = self.frames_per_buffer)
self.audio_frames = []
def record(self):
"Audio starts being recorded"
self.stream.start_stream()
while self.open:
data = self.stream.read(self.frames_per_buffer)
self.audio_frames.append(data)
if not self.open:
break
def stop(self):
"Finishes the audio recording therefore the thread too"
if self.open:
self.open = False
self.stream.stop_stream()
self.stream.close()
self.audio.terminate()
waveFile = wave.open(self.audio_filename, 'wb')
waveFile.setnchannels(self.channels)
waveFile.setsampwidth(self.audio.get_sample_size(self.format))
waveFile.setframerate(self.rate)
waveFile.writeframes(b''.join(self.audio_frames))
waveFile.close()
def start(self):
"Launches the audio recording function using a thread"
audio_thread = threading.Thread(target=self.record)
audio_thread.start()
def start_AVrecording(filename="test"):
global video_thread
global audio_thread
video_thread = VideoRecorder()
audio_thread = AudioRecorder()
audio_thread.start()
video_thread.start()
return filename
def start_video_recording(filename="test"):
global video_thread
video_thread = VideoRecorder()
video_thread.start()
return filename
def start_audio_recording(filename="test"):
global audio_thread
audio_thread = AudioRecorder()
audio_thread.start()
return filename
def stop_AVrecording(filename="test"):
audio_thread.stop()
frame_counts = video_thread.frame_counts
elapsed_time = time.time() - video_thread.start_time
recorded_fps = frame_counts / elapsed_time
print("total frames " + str(frame_counts))
print("elapsed time " + str(elapsed_time))
print("recorded fps " + str(recorded_fps))
video_thread.stop()
# Makes sure the threads have finished
while threading.active_count() > 1:
time.sleep(1)
video_stream = ffmpeg.input(video_thread.video_filename)
audio_stream = ffmpeg.input(audio_thread.audio_filename)
ffmpeg.output(audio_stream, video_stream, 'out.mp4').run(overwrite_output=True)
# # Merging audio and video signal
# if abs(recorded_fps - 6) >= 0.01: # If the fps rate was higher/lower than expected, re-encode it to the expected
# print("Re-encoding")
# cmd = "ffmpeg -r " + str(recorded_fps) + " -i temp_video.avi -pix_fmt yuv420p -r 6 temp_video2.avi"
# subprocess.call(cmd, shell=True)
# print("Muxing")
# cmd = "ffmpeg -y -ac 2 -channel_layout stereo -i temp_audio.wav -i temp_video2.avi -pix_fmt yuv420p " + filename + ".avi"
# subprocess.call(cmd, shell=True)
# else:
# print("Normal recording\nMuxing")
# cmd = "ffmpeg -y -ac 2 -channel_layout stereo -i temp_audio.wav -i temp_video.avi -pix_fmt yuv420p " + filename + ".avi"
# subprocess.call(cmd, shell=True)
# print("..")
def file_manager(filename="test"):
"Required and wanted processing of final files"
local_path = os.getcwd()
if os.path.exists(str(local_path) + "/temp_audio.wav"):
os.remove(str(local_path) + "/temp_audio.wav")
if os.path.exists(str(local_path) + "/temp_video.avi"):
os.remove(str(local_path) + "/temp_video.avi")
if os.path.exists(str(local_path) + "/temp_video2.avi"):
os.remove(str(local_path) + "/temp_video2.avi")
# if os.path.exists(str(local_path) + "/" + filename + ".avi"):
# os.remove(str(local_path) + "/" + filename + ".avi")
if __name__ == '__main__':
start_AVrecording()
# try:
# while True:
# pass
# except KeyboardInterrupt:
# stop_AVrecording()
time.sleep(10)
stop_AVrecording()
print("finishing recording")
file_manager()

Using everyone's contributions and following the suggestion of Paul
I was able to come up with the following code:
Recorder.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# VideoRecorder.py
from __future__ import print_function, division
import numpy as np
import sys
import cv2
import pyaudio
import wave
import threading
import time
import subprocess
import os
import ffmpeg
REC_FOLDER = "recordings/"
class Recorder():
def __init__(self, filename):
self.filename = filename
self.video_thread = self.VideoRecorder(self, REC_FOLDER + filename)
self.audio_thread = self.AudioRecorder(self, REC_FOLDER + filename)
def startRecording(self):
self.video_thread.start()
self.audio_thread.start()
def stopRecording(self):
self.video_thread.stop()
self.audio_thread.stop()
def saveRecording(self):
#Save audio / Show video resume
self.audio_thread.saveAudio()
self.video_thread.showFramesResume()
#Merges both streams and writes
video_stream = ffmpeg.input(self.video_thread.video_filename)
audio_stream = ffmpeg.input(self.audio_thread.audio_filename)
while (not os.path.exists(self.audio_thread.audio_filename)):
print("waiting for audio file to exit...")
stream = ffmpeg.output(video_stream, audio_stream, REC_FOLDER + self.filename +".mp4")
try:
ffmpeg.run(stream, capture_stdout=True, capture_stderr=True, overwrite_output=True)
except ffmpeg.Error as e:
print(e.stdout, file=sys.stderr)
print(e.stderr, file=sys.stderr)
class VideoRecorder():
"Video class based on openCV"
def __init__(self, recorder, name, fourcc="MJPG", frameSize=(640,480), camindex=0, fps=15):
self.recorder = recorder
self.open = True
self.duration = 0
self.device_index = camindex
self.fps = fps # fps should be the minimum constant rate at which the camera can
self.fourcc = fourcc # capture images (with no decrease in speed over time; testing is required)
self.video_filename = name + ".avi" # video formats and sizes also depend and vary according to the camera used
self.video_cap = cv2.VideoCapture(self.device_index, cv2.CAP_DSHOW)
self.video_writer = cv2.VideoWriter_fourcc(*fourcc)
self.video_out = cv2.VideoWriter(self.video_filename, self.video_writer, self.fps, frameSize)
self.frame_counts = 1
self.start_time = time.time()
def record(self):
"Video starts being recorded"
counter = 1
while self.open:
ret, video_frame = self.video_cap.read()
if ret:
self.video_out.write(video_frame)
self.frame_counts += 1
counter += 1
self.duration += 1/self.fps
if (video_frame is None): print("I WAS NONEEEEEEEEEEEEEEEEEEEEEE")
gray = cv2.cvtColor(video_frame, cv2.COLOR_BGR2GRAY)
cv2.imshow('video_frame', gray)
cv2.waitKey(1)
while(self.duration - self.recorder.audio_thread.duration >= 0.2 and self.recorder.audio_thread.open):
time.sleep(0.2)
else:
break
#Release Video
self.video_out.release()
self.video_cap.release()
cv2.destroyAllWindows()
self.video_out = None
def stop(self):
"Finishes the video recording therefore the thread too"
self.open=False
def start(self):
"Launches the video recording function using a thread"
self.thread = threading.Thread(target=self.record)
self.thread.start()
def showFramesResume(self):
#Only stop of video has all frames
frame_counts = self.frame_counts
elapsed_time = time.time() - self.start_time
recorded_fps = self.frame_counts / elapsed_time
print("total frames " + str(frame_counts))
print("elapsed time " + str(elapsed_time))
print("recorded fps " + str(recorded_fps))
class AudioRecorder():
"Audio class based on pyAudio and Wave"
def __init__(self, recorder, filename, rate=44100, fpb=1024, channels=1, audio_index=0):
self.recorder = recorder
self.open = True
self.rate = rate
self.duration = 0
self.frames_per_buffer = fpb
self.channels = channels
self.format = pyaudio.paInt16
self.audio_filename = filename + ".wav"
self.audio = pyaudio.PyAudio()
self.stream = self.audio.open(format=self.format,
channels=self.channels,
rate=self.rate,
input=True,
input_device_index=audio_index,
frames_per_buffer = self.frames_per_buffer)
self.audio_frames = []
def record(self):
"Audio starts being recorded"
self.stream.start_stream()
t_start = time.time_ns()
while self.open:
try:
self.duration += self.frames_per_buffer / self.rate
data = self.stream.read(self.frames_per_buffer)
self.audio_frames.append(data)
except Exception as e:
print('\n' + '*'*80)
print('PyAudio read exception at %.1fms\n' % ((time.time_ns() - t_start)/10**6))
print(e)
print('*'*80 + '\n')
while(self.duration - self.recorder.video_thread.duration >= 0.5):
time.sleep(0.5)
#Closes audio stream
self.stream.stop_stream()
self.stream.close()
self.audio.terminate()
def stop(self):
"Finishes the audio recording therefore the thread too"
self.open = False
def start(self):
"Launches the audio recording function using a thread"
self.thread = threading.Thread(target=self.record)
self.thread.start()
def saveAudio(self):
#Save Audio File
waveFile = wave.open(self.audio_filename, 'wb')
waveFile.setnchannels(self.channels)
waveFile.setsampwidth(self.audio.get_sample_size(self.format))
waveFile.setframerate(self.rate)
waveFile.writeframes(b''.join(self.audio_frames))
waveFile.close()
Main.py
from recorder import Recorder
import time
recorder = Recorder("test1")
recorder.startRecording()
time.sleep(240)
recorder.stopRecording()
recorder.saveRecording()
With this solution, the camera and the audio will wait for each other.
I also tried the FFmpeg Re-encoding and Muxing and even though it was able to synchronize the audio with video, the video had a massive quality drop.

You can do offline html,js code to do video with audio recording. Using python lib python webview open that page. It should work fine.

I was randomly getting "[Errno -9999] Unanticipated host error" while using JRodrigoF's solution and found that it's due to a race condition where an audio stream can be closed just before being read for the last time inside record() of AudioRecorder class.
I modified slightly so that all the closing procedures are done after the while loop and added a function list_audio_devices() that shows the list of audio devices to select from. I also added an audio device index as a parameter to choose an audio device.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# VideoRecorder.py
from __future__ import print_function, division
import numpy as np
import cv2
import pyaudio
import wave
import threading
import time
import subprocess
import os
class VideoRecorder():
"Video class based on openCV"
def __init__(self, name="temp_video.avi", fourcc="MJPG", sizex=640, sizey=480, camindex=0, fps=30):
self.open = True
self.device_index = camindex
self.fps = fps # fps should be the minimum constant rate at which the camera can
self.fourcc = fourcc # capture images (with no decrease in speed over time; testing is required)
self.frameSize = (sizex, sizey) # video formats and sizes also depend and vary according to the camera used
self.video_filename = name
self.video_cap = cv2.VideoCapture(self.device_index)
self.video_writer = cv2.VideoWriter_fourcc(*self.fourcc)
self.video_out = cv2.VideoWriter(self.video_filename, self.video_writer, self.fps, self.frameSize)
self.frame_counts = 1
self.start_time = time.time()
def record(self):
"Video starts being recorded"
# counter = 1
timer_start = time.time()
timer_current = 0
while self.open:
ret, video_frame = self.video_cap.read()
if ret:
self.video_out.write(video_frame)
# print(str(counter) + " " + str(self.frame_counts) + " frames written " + str(timer_current))
self.frame_counts += 1
# counter += 1
# timer_current = time.time() - timer_start
time.sleep(1/self.fps)
# gray = cv2.cvtColor(video_frame, cv2.COLOR_BGR2GRAY)
# cv2.imshow('video_frame', gray)
# cv2.waitKey(1)
else:
break
def stop(self):
"Finishes the video recording therefore the thread too"
if self.open:
self.open=False
self.video_out.release()
self.video_cap.release()
cv2.destroyAllWindows()
def start(self):
"Launches the video recording function using a thread"
video_thread = threading.Thread(target=self.record)
video_thread.start()
class AudioRecorder():
"Audio class based on pyAudio and Wave"
def __init__(self, filename="temp_audio.wav", rate=44100, fpb=2**12, channels=1, audio_index=0):
self.open = True
self.rate = rate
self.frames_per_buffer = fpb
self.channels = channels
self.format = pyaudio.paInt16
self.audio_filename = filename
self.audio = pyaudio.PyAudio()
self.stream = self.audio.open(format=self.format,
channels=self.channels,
rate=self.rate,
input=True,
input_device_index=audio_index,
frames_per_buffer = self.frames_per_buffer)
self.audio_frames = []
def record(self):
"Audio starts being recorded"
self.stream.start_stream()
t_start = time.time_ns()
while self.open:
try:
data = self.stream.read(self.frames_per_buffer)
self.audio_frames.append(data)
except Exception as e:
print('\n' + '*'*80)
print('PyAudio read exception at %.1fms\n' % ((time.time_ns() - t_start)/10**6))
print(e)
print('*'*80 + '\n')
time.sleep(0.01)
self.stream.stop_stream()
self.stream.close()
self.audio.terminate()
waveFile = wave.open(self.audio_filename, 'wb')
waveFile.setnchannels(self.channels)
waveFile.setsampwidth(self.audio.get_sample_size(self.format))
waveFile.setframerate(self.rate)
waveFile.writeframes(b''.join(self.audio_frames))
waveFile.close()
def stop(self):
"Finishes the audio recording therefore the thread too"
if self.open:
self.open = False
def start(self):
"Launches the audio recording function using a thread"
audio_thread = threading.Thread(target=self.record)
audio_thread.start()
def start_AVrecording(filename="test", audio_index=0, sample_rate=44100):
global video_thread
global audio_thread
video_thread = VideoRecorder()
audio_thread = AudioRecorder(audio_index=audio_index, rate=sample_rate)
audio_thread.start()
video_thread.start()
return filename
def start_video_recording(filename="test"):
global video_thread
video_thread = VideoRecorder()
video_thread.start()
return filename
def start_audio_recording(filename="test", audio_index=0, sample_rate=44100):
global audio_thread
audio_thread = AudioRecorder(audio_index=audio_index, rate=sample_rate)
audio_thread.start()
return filename
def stop_AVrecording(filename="test"):
audio_thread.stop()
frame_counts = video_thread.frame_counts
elapsed_time = time.time() - video_thread.start_time
recorded_fps = frame_counts / elapsed_time
print("total frames " + str(frame_counts))
print("elapsed time " + str(elapsed_time))
print("recorded fps " + str(recorded_fps))
video_thread.stop()
# Makes sure the threads have finished
while threading.active_count() > 1:
time.sleep(1)
# Merging audio and video signal
if abs(recorded_fps - 6) >= 0.01: # If the fps rate was higher/lower than expected, re-encode it to the expected
print("Re-encoding")
cmd = "ffmpeg -r " + str(recorded_fps) + " -i temp_video.avi -pix_fmt yuv420p -r 6 temp_video2.avi"
subprocess.call(cmd, shell=True)
print("Muxing")
cmd = "ffmpeg -y -ac 2 -channel_layout stereo -i temp_audio.wav -i temp_video2.avi -pix_fmt yuv420p " + filename + ".avi"
subprocess.call(cmd, shell=True)
else:
print("Normal recording\nMuxing")
cmd = "ffmpeg -y -ac 2 -channel_layout stereo -i temp_audio.wav -i temp_video.avi -pix_fmt yuv420p " + filename + ".avi"
subprocess.call(cmd, shell=True)
print("..")
def file_manager(filename="test"):
"Required and wanted processing of final files"
local_path = os.getcwd()
if os.path.exists(str(local_path) + "/temp_audio.wav"):
os.remove(str(local_path) + "/temp_audio.wav")
if os.path.exists(str(local_path) + "/temp_video.avi"):
os.remove(str(local_path) + "/temp_video.avi")
if os.path.exists(str(local_path) + "/temp_video2.avi"):
os.remove(str(local_path) + "/temp_video2.avi")
# if os.path.exists(str(local_path) + "/" + filename + ".avi"):
# os.remove(str(local_path) + "/" + filename + ".avi")
def list_audio_devices(name_filter=None):
pa = pyaudio.PyAudio()
device_index = None
sample_rate = None
for x in range(pa.get_device_count()):
info = pa.get_device_info_by_index(x)
print(pa.get_device_info_by_index(x))
if name_filter is not None and name_filter in info['name']:
device_index = info['index']
sample_rate = int(info['defaultSampleRate'])
break
return device_index, sample_rate
if __name__ == '__main__':
start_AVrecording()
time.sleep(5)
stop_AVrecording()
file_manager()

How to play an audiofile with pyaudio?

I do not understand the example material for pyaudio. It seems they had written an entire small program and it threw me off.
How do I just play a single audio file?
Format is not an issue, I just want to know the bare minimum code I need to play an audio file.

May be this small wrapper (warning: created on knees) of their example will help you to understand the meaning of code they wrote.
import pyaudio
import wave
import sys
class AudioFile:
chunk = 1024
def __init__(self, file):
""" Init audio stream """
self.wf = wave.open(file, 'rb')
self.p = pyaudio.PyAudio()
self.stream = self.p.open(
format = self.p.get_format_from_width(self.wf.getsampwidth()),
channels = self.wf.getnchannels(),
rate = self.wf.getframerate(),
output = True
)
def play(self):
""" Play entire file """
data = self.wf.readframes(self.chunk)
while data != b'':
self.stream.write(data)
data = self.wf.readframes(self.chunk)
def close(self):
""" Graceful shutdown """
self.stream.close()
self.p.terminate()
# Usage example for pyaudio
a = AudioFile("1.wav")
a.play()
a.close()

The example seems pretty clear to me. You simply save the example as playwav.py call:
python playwav.py my_fav_wav.wav
The wave example with some extra comments:
import pyaudio
import wave
import sys
# length of data to read.
chunk = 1024
# validation. If a wave file hasn't been specified, exit.
if len(sys.argv) < 2:
print "Plays a wave file.\n\n" +\
"Usage: %s filename.wav" % sys.argv[0]
sys.exit(-1)
'''
************************************************************************
This is the start of the "minimum needed to read a wave"
************************************************************************
'''
# open the file for reading.
wf = wave.open(sys.argv[1], 'rb')
# create an audio object
p = pyaudio.PyAudio()
# open stream based on the wave object which has been input.
stream = p.open(format =
p.get_format_from_width(wf.getsampwidth()),
channels = wf.getnchannels(),
rate = wf.getframerate(),
output = True)
# read data (based on the chunk size)
data = wf.readframes(chunk)
# play stream (looping from beginning of file to the end)
while data:
# writing to the stream is what *actually* plays the sound.
stream.write(data)
data = wf.readframes(chunk)
# cleanup stuff.
wf.close()
stream.close()
p.terminate()

This way requires ffmpeg for pydub, but can play not only wave files:
import pyaudio
import sys
from pydub import AudioSegment
if len(sys.argv) <= 1:
print('No File Name!')
sys.exit(1)
chunk = 1024
fn = ' '.join(sys.argv[1:])
pd = AudioSegment.from_file(fn)
p = pyaudio.PyAudio()
stream = p.open(format =
p.get_format_from_width(pd.sample_width),
channels = pd.channels,
rate = pd.frame_rate,
output = True)
i = 0
data = pd[:chunk]._data
while data:
stream.write(data)
i += chunk
data = pd[i:i + chunk]._data
stream.close()
p.terminate()
sys.exit(0)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Live recognition with Python and Pocketsphinx - python

Try this. Pocketsphinx is now a GStreamer plugin.

Related

Deepspeech realtime speech to text

Using Python on Raspberry Pi, I'm unable to record audio with PyAudio via Thonny/terminal

Synchronizing audio and video with OpenCV and PyAudio

How to capture a video (AND audio) in python, from a camera (or webcam)

How to play an audiofile with pyaudio?

Categories

Resources