I have gotten both OpenCV and PyAudio working, however I am not sure how I would sync them together. I am unable to get a frame rate from OpenCV, and measuring the call time for a frame changes from moment to moment, whereas PyAudio's basis is grabbing a certain sample rate. How would I sync them to run at the same rate? I assume there is some standard, or some way codecs do it. (I've tried Google; all I got was information on lip syncing :/)
OpenCV Frame rate
from __future__ import division
import time
import math
import cv2, cv

vc = cv2.VideoCapture(0)

# get the frame
while True:
    before_read = time.time()
    rval, frame = vc.read()
    after_read = time.time()
    if frame is not None:
        print len(frame)
        print math.ceil((1.0 / (after_read - before_read)))
        cv2.imshow("preview", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    else:
        print "None..."
        cv2.waitKey(1)

# display the frame
while True:
    cv2.imshow("preview", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
Grabbing and saving audio
from sys import byteorder
from array import array
from struct import pack

import pyaudio
import wave

THRESHOLD = 500
CHUNK_SIZE = 1024
FORMAT = pyaudio.paInt16
RATE = 44100

def is_silent(snd_data):
    "Returns 'True' if below the 'silent' threshold"
    print "\n\n\n\n\n\n\n\n"
    print max(snd_data)
    print "\n\n\n\n\n\n\n\n"
    return max(snd_data) < THRESHOLD

def normalize(snd_data):
    "Average the volume out"
    MAXIMUM = 16384
    times = float(MAXIMUM) / max(abs(i) for i in snd_data)

    r = array('h')
    for i in snd_data:
        r.append(int(i * times))
    return r

def trim(snd_data):
    "Trim the blank spots at the start and end"
    def _trim(snd_data):
        snd_started = False
        r = array('h')

        for i in snd_data:
            if not snd_started and abs(i) > THRESHOLD:
                snd_started = True
                r.append(i)
            elif snd_started:
                r.append(i)
        return r

    # Trim to the left
    snd_data = _trim(snd_data)

    # Trim to the right
    snd_data.reverse()
    snd_data = _trim(snd_data)
    snd_data.reverse()
    return snd_data

def add_silence(snd_data, seconds):
    "Add silence to the start and end of 'snd_data' of length 'seconds' (float)"
    r = array('h', [0 for i in xrange(int(seconds * RATE))])
    r.extend(snd_data)
    r.extend([0 for i in xrange(int(seconds * RATE))])
    return r

def record():
    """
    Record a word or words from the microphone and
    return the data as an array of signed shorts.

    Normalizes the audio, trims silence from the
    start and end, and pads with 0.5 seconds of
    blank sound to make sure VLC et al can play
    it without getting chopped off.
    """
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=1, rate=RATE,
                    input=True, output=True,
                    frames_per_buffer=CHUNK_SIZE)

    num_silent = 0
    snd_started = False

    r = array('h')

    while 1:
        # little endian, signed short
        snd_data = array('h', stream.read(1024))
        if byteorder == 'big':
            snd_data.byteswap()
        print "\n\n\n\n\n\n"
        print len(snd_data)
        print snd_data
        r.extend(snd_data)

        silent = is_silent(snd_data)

        if silent and snd_started:
            num_silent += 1
        elif not silent and not snd_started:
            snd_started = True

        if snd_started and num_silent > 1:
            break

    sample_width = p.get_sample_size(FORMAT)
    stream.stop_stream()
    stream.close()
    p.terminate()

    r = normalize(r)
    r = trim(r)
    r = add_silence(r, 0.5)
    return sample_width, r

def record_to_file(path):
    "Records from the microphone and outputs the resulting data to 'path'"
    sample_width, data = record()
    data = pack('<' + ('h' * len(data)), *data)

    wf = wave.open(path, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(data)
    wf.close()

if __name__ == '__main__':
    print("please speak a word into the microphone")
    record_to_file('demo.wav')
    print("done - result written to demo.wav")
I think you'd be better off using either GStreamer or ffmpeg, or if you're on Windows, DirectShow. These libs can handle both audio and video, and should have some kind of multiplexer to allow you to mix video and audio properly.
But if you really want to do this using OpenCV, you should be able to use VideoCapture to get the frame rate. Have you tried using this?

fps = cv.GetCaptureProperty(vc, cv.CV_CAP_PROP_FPS)
Another way would be to estimate fps as number of frames divided by duration:
nFrames = cv.GetCaptureProperty(vc, cv.CV_CAP_PROP_FRAME_COUNT)
cv.SetCaptureProperty(vc, cv.CV_CAP_PROP_POS_AVI_RATIO, 1)
duration = cv.GetCaptureProperty(vc, cv.CV_CAP_PROP_POS_MSEC)
fps = 1000 * nFrames / duration
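(With the cv2 API the same property is read straight off the capture object; a minimal equivalent, with the caveat that many webcams simply report 0 for FPS:)

fps = vc.get(cv2.cv.CV_CAP_PROP_FPS)   # OpenCV 2.x
# fps = vc.get(cv2.CAP_PROP_FPS)       # OpenCV 3.x and later
# Note: many webcams report 0 here, in which case you have to measure the rate yourself.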
I'm not sure I understand what you were trying to do here:
before_read = time.time()
rval, frame = vc.read()
after_read = time.time()
It seems to me that doing after_read - before_read only measures how long it took OpenCV to load the next frame; it doesn't measure the fps. OpenCV is not trying to do playback, it's only loading frames, and it'll try to do so as fast as it can; I think there's no way to configure that. I think that putting a waitKey(1000/fps) after displaying each frame (waitKey takes milliseconds) will achieve what you're looking for, as sketched below.
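For instance, a minimal pacing sketch (assuming fps was obtained as above):

delay_ms = max(1, int(1000 / fps))  # one frame period, in milliseconds
while True:
    rval, frame = vc.read()
    if not rval:
        break
    cv2.imshow("preview", frame)
    # hold each frame for roughly one frame period
    if cv2.waitKey(delay_ms) & 0xFF == ord('q'):
        break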
You could keep two counters, one for audio and one for video.
The video counter increases by 1/fps when showing a frame, and the audio counter by sec, where sec is the seconds of audio you write to the stream each time. Then in the audio part of the code you can do something like:

while audiosec - videosec >= 0.05:  # audio is ahead
    time.sleep(0.05)

And in the video part:

while videosec - audiosec >= 0.2:  # video is ahead
    time.sleep(0.2)

You can play with the numbers.
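A minimal sketch of that idea (names are hypothetical; fps and the chunk duration sec are assumed known from your video and audio code):

import time

audiosec = 0.0  # seconds of audio written so far
videosec = 0.0  # seconds of video shown so far

def after_audio_write(sec):
    # call this each time 'sec' seconds of audio were written to the stream
    global audiosec
    audiosec += sec
    while audiosec - videosec >= 0.05:  # audio is ahead
        time.sleep(0.05)

def after_video_show(fps):
    # call this each time a frame was shown
    global videosec
    videosec += 1.0 / fps
    while videosec - audiosec >= 0.2:   # video is ahead
        time.sleep(0.2)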
This is how I achieved some sort of synchronization on my own video player project recently, using PyAudio with ffmpeg instead of cv2.
Personally, I used threading for this.
import concurrent.futures
import pyaudio
import cv2

class Aud_Vid():

    def __init__(self, arg):
        self.video = cv2.VideoCapture(0)
        self.CHUNK = 1470
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 2
        self.RATE = 44100
        self.audio = pyaudio.PyAudio()
        self.instream = self.audio.open(format=self.FORMAT, channels=self.CHANNELS,
                                        rate=self.RATE, input=True,
                                        frames_per_buffer=self.CHUNK)
        self.outstream = self.audio.open(format=self.FORMAT, channels=self.CHANNELS,
                                         rate=self.RATE, output=True,
                                         frames_per_buffer=self.CHUNK)

    def sync(self):
        with concurrent.futures.ThreadPoolExecutor() as executor:
            tv = executor.submit(self.video.read)
            ta = executor.submit(self.instream.read, 1470)
            vid = tv.result()
            aud = ta.result()
            return (vid[1].tobytes(), aud)
Related
I need to extract specific frames of an online video to work on an algorithm, but I don't want to download the whole video because that would be highly inefficient.
For starters, I tried working with YouTube videos. I can download the whole video using youtube-dl this way:
ydl_opts = {'outtmpl': r'OUTPUT_DIRECTORY_HERE'}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download([url])
And then I can capture individual frames.
I need to avoid downloading the whole video. After some research, I have found that ffmpeg might help me do this. I found no way to download just the frames, so if that is not possible, the second option is to download specific portions of the video. One such example on Linux is here, but I couldn't find any solution for Python; the sketch below is as far as I got.
What is a good way to download just the frames, or portions of videos (in Python), without downloading the entire thing?
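For reference, the Linux example boils down to an ffmpeg seek plus a duration; here is a rough sketch of the same call from Python (untested; assumes ffmpeg is on PATH and that stream_url is a direct media URL such as the ones youtube-dl exposes in info_dict['formats']):

import subprocess

stream_url = "..."  # direct media URL, e.g. f.get('url') from youtube-dl's info_dict
subprocess.run([
    "ffmpeg",
    "-ss", "00:01:00",   # seek to 1:00 before reading the input
    "-i", stream_url,
    "-t", "10",          # keep only 10 seconds
    "-c", "copy",        # no re-encoding
    "clip.mp4",
])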
Just to add on to the current answer, performance can further be enhanced using multiprocessing. For example, if you wanted to split up the video into frames and process them independently in num_cpu processes:
import os
from functools import partial
from multiprocessing.pool import Pool

import cv2
import youtube_dl

def process_video_parallel(url, skip_frames, process_number):
    cap = cv2.VideoCapture(url)
    num_processes = os.cpu_count()
    frames_per_process = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) // num_processes
    cap.set(cv2.CAP_PROP_POS_FRAMES, frames_per_process * process_number)
    x = 0
    count = 0
    while x < 10 and count < frames_per_process:
        ret, frame = cap.read()
        if not ret:
            break
        filename = r"PATH\shot" + str(x) + ".png"
        x += 1
        cv2.imwrite(filename.format(count), frame)
        count += skip_frames  # skip 300 frames, i.e. 10 seconds at 30 fps
        cap.set(cv2.CAP_PROP_POS_FRAMES, count)
    cap.release()


video_url = "..."  # The Youtube URL
ydl_opts = {}
ydl = youtube_dl.YoutubeDL(ydl_opts)
info_dict = ydl.extract_info(video_url, download=False)
formats = info_dict.get('formats', None)

print("Obtaining frames")
for f in formats:
    if f.get('format_note', None) == '144p':
        url = f.get('url', None)
        cpu_count = os.cpu_count()
        with Pool(cpu_count) as pool:
            pool.map(partial(process_video_parallel, url, 300), range(cpu_count))
For the purposes of this application, since images are just being saved from the video, this may not result in a huge improvement (maybe a few seconds), but if additional algorithms needed to be applied on the frames, it could be beneficial.
I tried what @AyeshaKhan shared in the comments.
After importing cv2, numpy, and youtube_dl:
video_url = saved_url  # The Youtube URL
ydl_opts = {}
ydl = youtube_dl.YoutubeDL(ydl_opts)
info_dict = ydl.extract_info(video_url, download=False)
formats = info_dict.get('formats', None)

print("Obtaining frames")
for f in formats:
    if f.get('format_note', None) == '144p':
        url = f.get('url', None)
        cap = cv2.VideoCapture(url)
        x = 0
        count = 0
        while x < 10:
            ret, frame = cap.read()
            if not ret:
                break
            filename = r"PATH\shot" + str(x) + ".png"
            x += 1
            cv2.imwrite(filename.format(count), frame)
            count += 300  # skip 300 frames, i.e. 10 seconds at 30 fps
            cap.set(1, count)
            if cv2.waitKey(30) & 0xFF == ord('q'):
                break
        cap.release()
The answer in the comments was downloading all of the frames, so the count I added in .format() ensured that I skipped frames as per my requirement.
Additionally, x here limits the number of saved frames to 10.
I am still not sure whether this method actually captures only the specified frames, or whether it captures all of them and just saves the specified ones to my local storage. I needed the former.
But this is still fast enough and works for me!
An alternative to @danielcahall's answer:
This method uses Ray for parallelization instead of Pool.
Note: the first run might take more time to initialize the Ray components; after that, it will be fine.
from timeit import default_timer as timer
import os, ray, shutil
import cv2
import youtube_dl

try:
    os.makedirs('test_fol')
except:
    shutil.rmtree('test_fol')
    os.makedirs('test_fol')

ray.init()

@ray.remote
def process_video_parallel(url, total_frames, process_number):
    cap = cv2.VideoCapture(url)
    num_processes = os.cpu_count()
    frames_per_process = int(total_frames) // num_processes
    cap.set(cv2.CAP_PROP_POS_FRAMES, frames_per_process * process_number)
    count = frames_per_process * process_number
    while count < frames_per_process * (process_number + 1):
        ret, frame = cap.read()
        if not ret:
            break
        filename = f"test_fol/{count}.jpg"
        cv2.imwrite(filename, frame)
        count += 1
    cap.release()


t1 = timer()
video_url = "..."  # The Youtube URL
ydl_opts = {}
ydl = youtube_dl.YoutubeDL(ydl_opts)
info_dict = ydl.extract_info(video_url, download=False)
duration = info_dict['duration']
formats = info_dict.get('formats', None)
for f in formats:
    if f.get('format_note', None) == '144p':
        url = f.get('url', None)
        cpu_count = os.cpu_count()
        data = ray.get([process_video_parallel.remote(url, int(duration * 31), x)
                        for x in range(cpu_count)])
        break
print("Total Time", timer() - t1)
I am working on a speech interface with Python. I am having trouble with audio playback.
What do you use to play back simple mp3 files on the Raspberry Pi?
I need to play audio, and 2 seconds before the end of the playback I need to start another task (opening the stream of the microphone).
How can I achieve this? My problem is that I haven't found a way to read the current second of the playback yet. If I could read that, I would just start a new thread when the current time reaches audiolength - 2 seconds.
I hope you can help me or have any experience with this.
I found a solution to this.
PyAudio provides a way to play audio chunk by chunk. Through that you can count how many chunks you have read so far and compare that to the overall size of the audio.
import pyaudio
import wave

class AudioPlayer():
    """AudioPlayer class"""
    def __init__(self):
        self.chunk = 1024
        self.audio = pyaudio.PyAudio()
        self._running = True

    def play(self, audiopath):
        self._running = True
        # storing how much we have read already
        self.chunktotal = 0

        wf = wave.open(audiopath, 'rb')
        stream = self.audio.open(format=self.audio.get_format_from_width(wf.getsampwidth()),
                                 channels=wf.getnchannels(),
                                 rate=wf.getframerate(),
                                 output=True)
        print(wf.getframerate())

        # read data (based on the chunk size)
        data = wf.readframes(self.chunk)

        # THIS IS THE TOTAL LENGTH OF THE AUDIO
        audiolength = wf.getnframes() / float(wf.getframerate())

        while self._running:
            if data != b'':
                stream.write(data)
                self.chunktotal = self.chunktotal + self.chunk
                # calculating the percentage
                percentage = (self.chunktotal / wf.getnframes()) * 100
                # calculating the current seconds
                current_seconds = self.chunktotal / float(wf.getframerate())
                data = wf.readframes(self.chunk)
            if data == b'':
                break

        # cleanup stream
        stream.close()

    def stop(self):
        self._running = False
Hope it helps someone,
Alex
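A hypothetical usage sketch: run play() in a background thread, so another thread can watch the position via chunktotal (which the loop above keeps updated) and fire the microphone task at audiolength - 2:

import threading

player = AudioPlayer()
t = threading.Thread(target=player.play, args=('demo.wav',))
t.start()

# elsewhere, derive the position from chunktotal; 44100 here assumes the wav's frame rate
# current_seconds = player.chunktotal / 44100.0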
Try just_playback. It's a wrapper I wrote around miniaudio that provides playback control functionality like pausing, resuming, seeking, getting the current playback position and setting the playback volume.
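A hedged sketch of what that looks like (method and attribute names as documented in the just_playback README; verify against the version you install):

from just_playback import Playback

playback = Playback()
playback.load_file('demo.mp3')
playback.play()

# poll the position and kick off the second task 2 s before the end
while playback.active:
    if playback.duration - playback.curr_pos <= 2.0:
        start_microphone_task()  # hypothetical: whatever opens your mic stream
        break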
So I have been working on making an equalizer, and the problem I am facing is that the PyAudio stream is streaming much faster than the speed at which the EQ finds the bass component of the audio file. I will briefly outline the implementation:
I have created two extra threads and used tkinter for the GUI. Thread 1 computes the bass component (the bass() function) of the sound in chunks of 50 ms of data.
Thread 2 plots it by creating a rectangle in tkinter with a varying top-left coordinate.
flag2 keeps the main thread running, while flag synchronizes the bass() and plot() functions. The last part of the code ensures that the display doesn't run faster than the song itself (however, the exact opposite is the concern right now).
I am attaching the code here:
import numpy as np
from scipy.io import wavfile
from numpy import fft as fft
import time
import tkinter as tk
import threading
import pyaudio
import wave

CHUNK = 1024
wf = wave.open("test3.wav", 'rb')
p = pyaudio.PyAudio()

###
def callback(in_data, frame_count, time_info, status):
    data = wf.readframes(frame_count)
    return (data, pyaudio.paContinue)

stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True,
                stream_callback=callback)
####

rate, audData = wavfile.read("test3.wav")
print("Rate " + str(rate))
print("Length of wav file(in s) = " + str(audData.shape[0] / rate))
ch1 = audData[:]
tim = 0.050
pt = int(tim * rate)

flag2 = True
flag = False
cnt = 0
value = 0

def bass():
    global pt
    global cnt
    global audData
    global value
    global flag2
    global flag
    cnt += 1
    fourier = fft.fft(ch1[((cnt - 1) * pt):(cnt * pt)])
    fourier = abs(fourier) / float(pt)
    fourier = fourier[0:25]
    fourier = fourier ** 2
    if (cnt + 1) * pt > len(audData[:]):
        flag2 = False
    value = (np.sum(fourier)) / pt
    flag = True
    return

def plot():
    global value
    global flag
    root = tk.Tk()
    canvas = tk.Canvas(root, width=200, height=500)
    canvas.pack()
    while True:
        if flag:
            canvas.delete("all")
            flag = False
            greenbox = canvas.create_rectangle(50, 500 - (value / 80), 150, 500, fill="green")
            print(value / 80)  # to check whether it exceeds 500
            root.update_idletasks()
            root.update()
    return

def sound():
    global data
    global stream
    global wf
    global CHUNK
    stream.start_stream()
    while stream.is_active():
        time.sleep(0.1)
    stream.stop_stream()
    stream.close()
    wf.close()
    p.terminate()

bass()
t1 = threading.Thread(target=plot, name='t_1')
t2 = threading.Thread(target=sound, name='t_2')
t1.start()
t2.start()

while flag2:
    a = time.time()
    bass()
    b = time.time()
    while (b - a) < tim:
        time.sleep(0.015)
        b = time.time()
To overcome this processing speed problem, I tried to process 1 in every 3 chunks:

cnt += 1
fourier = fft.fft(ch1[((3 * cnt - 3) * pt):((3 * cnt - 2) * pt)])
fourier = abs(fourier) / float(pt)
fourier = fourier[0:25]
fourier = fourier ** 2
if (3 * cnt + 1) * pt > len(audData[:]):
    flag2 = False

#######

while (b - a) < 3 * tim:
    time.sleep(0.015)
    b = time.time()
But even this is not up to the mark. The lag is visible after a few seconds. Any ideas on how to improve this?
Instead of efficiency, a more realistic solution might be delay matching. If you can determine the latency of your FFT and display (etc.) processes, then you can either delay sound output (using a FIFO of some number of audio samples), or have the visualization process look ahead in the playback file by the equivalent number of samples.
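A minimal sketch of the first option, delaying audio output with a FIFO of chunks (N is a made-up number; you would tune it to the measured FFT/display latency):

from collections import deque

N = 4                # e.g. 4 chunks of 50 ms each = 200 ms of delay
audio_fifo = deque()

def delayed_write(stream, chunk):
    # Buffer the newest chunk and write the one queued N chunks ago,
    # so the sound leaves the speakers after the display has caught up.
    audio_fifo.append(chunk)
    if len(audio_fifo) > N:
        stream.write(audio_fifo.popleft())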
I'm using OpenCV and dlib to execute facial recognition with landmarks, live from the webcam stream. The language is Python. It works fine on my MacBook laptop, but I need it to run from a desktop computer 24/7. The computer is a PC with an Intel® Core™2 Quad CPU Q6600 @ 2.40GHz (32-bit) running Debian Jessie. The drop in performance is drastic: there is a 10-second delay due to processing!
I therefore looked into multi-threading to gain performance:
I first tried the sample code by OpenCV, and the result is great! All four cores hit 100%, and the performance is much better.
I then replaced the frame-processing code with my code, and it doesn't improve performance at all! Only one core hits 100%; the other ones stay very low. I even think it's worse with multi-threading on.
I got the facial landmark code from the dlib sample code. I know it can probably be optimized, but I want to understand why I am not able to use my (old) computer's full power with multi-threading?
I'll drop my code below, thanks a lot for reading :)
from __future__ import print_function

import numpy as np
import cv2
import dlib
from multiprocessing.pool import ThreadPool
from collections import deque
from common import clock, draw_str, StatValue
import video

class DummyTask:
    def __init__(self, data):
        self.data = data
    def ready(self):
        return True
    def get(self):
        return self.data

if __name__ == '__main__':
    import sys

    print(__doc__)

    try:
        fn = sys.argv[1]
    except:
        fn = 0
    cap = video.create_capture(fn)

    # Face detector
    detector = dlib.get_frontal_face_detector()

    # Landmarks shape predictor
    predictor = dlib.shape_predictor("landmarks/shape_predictor_68_face_landmarks.dat")

    # This is where the facial detection takes place
    def process_frame(frame, t0, detector, predictor):
        # some intensive computation...
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        clahe_image = clahe.apply(gray)
        detections = detector(clahe_image, 1)
        for k, d in enumerate(detections):
            shape = predictor(clahe_image, d)
            for i in range(1, 68):  # there are 68 landmark points on each face
                cv2.circle(frame, (shape.part(i).x, shape.part(i).y), 1, (0, 0, 255), thickness=2)
        return frame, t0

    threadn = cv2.getNumberOfCPUs()
    pool = ThreadPool(processes=threadn)
    pending = deque()

    threaded_mode = True

    latency = StatValue()
    frame_interval = StatValue()
    last_frame_time = clock()
    while True:
        while len(pending) > 0 and pending[0].ready():
            res, t0 = pending.popleft().get()
            latency.update(clock() - t0)
            draw_str(res, (20, 20), "threaded : " + str(threaded_mode))
            draw_str(res, (20, 40), "latency : %.1f ms" % (latency.value * 1000))
            draw_str(res, (20, 60), "frame interval : %.1f ms" % (frame_interval.value * 1000))
            cv2.imshow('threaded video', res)
        if len(pending) < threadn:
            ret, frame = cap.read()
            t = clock()
            frame_interval.update(t - last_frame_time)
            last_frame_time = t
            if threaded_mode:
                task = pool.apply_async(process_frame, (frame.copy(), t, detector, predictor))
            else:
                task = DummyTask(process_frame(frame, t, detector, predictor))
            pending.append(task)
        ch = cv2.waitKey(1)
        if ch == ord(' '):
            threaded_mode = not threaded_mode
        if ch == 27:
            break
    cv2.destroyAllWindows()
The performance issue was due to a bad compilation of dlib. Do not use pip install dlib, which for some reason runs very, very slowly compared to a proper compilation. I went from almost 10 seconds of lag to about 2 seconds this way. So in the end I didn't need multi-threading/processing, but I'm working on it to enhance the speed even more. Thanks for the help :)
I tried a simplified approach like the one P.Ro mentions in his answer, with processes writing to an output queue, but somehow the queue got locked most of the time because all the processes wrote to it at the same time (just my guess; I probably did something wrong).
In the end I ended up using pipes.
The code is nasty, but if I were me a few hours ago, I would still be glad to find an example that actually runs without effort.
from multiprocessing import Process, Queue, Manager, Pipe
import multiprocessing
import face_recognition as fik
import cv2
import time

video_input = 0

obama_image = fik.load_image_file("obama.png")
obama_face_encoding = fik.face_encodings(obama_image)[0]

quality = 0.7

def f(id, fi, fl):
    import face_recognition as fok

    while True:
        small_frame = fi.get()
        print("running thread" + str(id))
        face_locations = fok.face_locations(small_frame)

        if len(face_locations) > 0:
            print(face_locations)
            for (top7, right7, bottom7, left7) in face_locations:
                small_frame_c = small_frame[top7:bottom7, left7:right7]
                fl.send(small_frame_c)

fps_var = 0

if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')

    # global megaman
    with Manager() as manager:
        video_capture = cv2.VideoCapture(video_input)
        fi = Queue(maxsize=14)

        threads = 8
        proc = []
        parent_p = []
        thread_p = []
        # procids = range(0, threads)

        for t in range(0, threads):
            p_t, c_t = Pipe()
            parent_p.append(p_t)
            thread_p.append(c_t)
            print(t)
            proc.append(Process(target=f, args=(t, fi, thread_p[t])))
            proc[t].start()

        useframe = False

        frame_id = 0
        while True:
            # Grab a single frame of video
            ret, frame = video_capture.read()
            effheight, effwidth = frame.shape[:2]
            if effwidth < 20:
                break

            # Resize frame of video to 1/4 size for faster face recognition processing
            xxx = 930
            yyy = 10 / 16  # 0.4234375
            small_frame = cv2.resize(frame, (xxx, int(xxx * yyy)))
            if frame_id % 2 == 0:
                if not fi.full():
                    fi.put(small_frame)
                    print(frame_id)

                    cv2.imshow('Video', small_frame)

                    print("FPS: ", int(1.0 / (time.time() - fps_var)))
                    fps_var = time.time()

            # GET ALL DETECTIONS
            for t in range(0, threads):
                if parent_p[t].poll():
                    small_frame_c = parent_p[t].recv()
                    cv2.imshow('recc', small_frame_c)
                    height34, width34 = small_frame_c.shape[:2]
                    # print fsizeee
                    if width34 < 20:
                        print("face 2 small")
                        print(width34)
                        break
                    face_encodings_cam = fik.face_encodings(small_frame_c, [(0, width34, height34, 0)])

                    match = fik.compare_faces([obama_face_encoding], face_encodings_cam[0])
                    name = "Unknown"

                    if match[0]:
                        name = "Barack"

                    print(name)
                    break

            frame_id += 1

            # Hit 'q' on the keyboard to quit!
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
I do not have much experience with using ThreadPool, but I always just use Process as shown below. You should be able to easily edit this code to fit your needs. I wrote this with your implementation in mind.
This code will get the number of cores and start however many worker processes are needed, all implementing the desired function in parallel. They all share a Queue of frames for input, and all put to the same output Queue for the main process to get and show. Each Queue has a maximum size, in this case 5. This ensures that, despite the CPU time it takes to process, the output will always be relatively live.
import numpy as np
import cv2
from multiprocessing import Process, Queue
import time

#from common import clock, draw_str, StatValue
#import video

class Canny_Process(Process):

    def __init__(self, frame_queue, output_queue):
        Process.__init__(self)
        self.frame_queue = frame_queue
        self.output_queue = output_queue
        self.stop = False
        # Initialize your face detectors here

    def get_frame(self):
        if not self.frame_queue.empty():
            return True, self.frame_queue.get()
        else:
            return False, None

    def stopProcess(self):
        self.stop = True

    def canny_frame(self, frame):
        # some intensive computation...
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray, 50, 100)

        # To simulate CPU Time
        #############################
        for i in range(1000000):
            x = 546 * 546
            res = x / (i + 1)
        #############################
        'REPLACE WITH FACE DETECT CODE HERE'

        if self.output_queue.full():
            self.output_queue.get_nowait()
        self.output_queue.put(edges)

    def run(self):
        while not self.stop:
            ret, frame = self.get_frame()
            if ret:
                self.canny_frame(frame)


if __name__ == '__main__':

    frame_sum = 0
    init_time = time.time()

    def put_frame(frame):
        if Input_Queue.full():
            Input_Queue.get_nowait()
        Input_Queue.put(frame)

    def cap_read(cv2_cap):
        ret, frame = cv2_cap.read()
        if ret:
            put_frame(frame)

    cap = cv2.VideoCapture(0)

    threadn = cv2.getNumberOfCPUs()
    threaded_mode = True

    process_list = []
    Input_Queue = Queue(maxsize=5)
    Output_Queue = Queue(maxsize=5)

    for x in range(threadn - 1):
        canny_process = Canny_Process(frame_queue=Input_Queue, output_queue=Output_Queue)
        canny_process.daemon = True
        canny_process.start()
        process_list.append(canny_process)

    ch = cv2.waitKey(1)
    cv2.namedWindow('Threaded Video', cv2.WINDOW_NORMAL)
    while True:
        cap_read(cap)

        if not Output_Queue.empty():
            result = Output_Queue.get()
            cv2.imshow('Threaded Video', result)
            ch = cv2.waitKey(5)

        if ch == ord(' '):
            threaded_mode = not threaded_mode
        if ch == 27:
            break
    cv2.destroyAllWindows()
This should do the trick; just change my canny function to your face detection. I wrote this from your code and compared the two. This is significantly faster. I am using multiprocessing.Process here. In Python, processes are truly parallel while threads are not quite, because of the GIL. I am using 2 queues to send data back and forth between the main process and the workers. Queues are both thread and process safe.
You may use this, multithreaded:

from imutils.video import VideoStream

# Initialize the threaded video stream.
videostream = "rtsp://192.168.x.y/user=admin=xxxxxxx_channel=vvvv=1.sdp?params"
vs = VideoStream(src=videostream, resolution=frameSize,
                 framerate=32).start()

frame = vs.read()
I have recently been working with pocketsphinx in Python. I have successfully got the example below to work, recognising a recorded wav.
#!/usr/bin/env python

import sys, os

def decodeSpeech(hmmd, lmdir, dictp, wavfile):
    """
    Decodes a speech file
    """
    try:
        import pocketsphinx as ps
        import sphinxbase
    except:
        print """Pocketsphinx and sphinxbase are not installed
        on your system. Please install them with your package manager.
        """
    speechRec = ps.Decoder(hmm=hmmd, lm=lmdir, dict=dictp)
    wavFile = file(wavfile, 'rb')
    wavFile.seek(44)
    speechRec.decode_raw(wavFile)
    result = speechRec.get_hyp()
    return result[0]

if __name__ == "__main__":
    hmdir = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/hmm/wsj1"
    lmd = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/lm/wsj/wlist5o.3e-7.vp.tg.lm.DMP"
    dictd = "/home/jaganadhg/Desktop/Docs_New/kgisl/model/lm/wsj/wlist5o.dic"
    wavfile = "/home/jaganadhg/Desktop/Docs_New/kgisl/sa1.wav"
    recognised = decodeSpeech(hmdir, lmd, dictd, wavfile)
    print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
    print recognised
    print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
The problem is: how can I do real-time speech recognition from a microphone, in a while loop with an if statement, so that if a set word is recognised from the microphone a function can be called?
The code for realtime recognition looks like this:
config = Decoder.default_config()
config.set_string('-hmm', path.join(MODELDIR, 'en-us/en-us'))
config.set_string('-lm', path.join(MODELDIR, 'en-us/en-us.lm.bin'))
config.set_string('-dict', path.join(MODELDIR, 'en-us/cmudict-en-us.dict'))
config.set_string('-logfn', '/dev/null')
decoder = Decoder(config)

import pyaudio

p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
stream.start_stream()

in_speech_bf = False
decoder.start_utt()
while True:
    buf = stream.read(1024)
    if buf:
        decoder.process_raw(buf, False, False)
        if decoder.get_in_speech() != in_speech_bf:
            in_speech_bf = decoder.get_in_speech()
            if not in_speech_bf:
                decoder.end_utt()
                print 'Result:', decoder.hyp().hypstr
                decoder.start_utt()
    else:
        break
decoder.end_utt()
You can also use gstreamer python bindings in pocketsphinx, check livedemo.py
Try this. Pocketsphinx is now a GStreamer plugin.
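A hedged sketch of the GStreamer route (assumes the GStreamer 1.x Python bindings and the pocketsphinx plugin are installed; the message structure fields follow the CMU Sphinx docs, but verify against your version):

import gi
gi.require_version('Gst', '1.0')
from gi.repository import GLib, Gst

Gst.init(None)
pipeline = Gst.parse_launch(
    'autoaudiosrc ! audioconvert ! audioresample ! pocketsphinx ! fakesink')

def on_element_message(bus, msg):
    # The pocketsphinx element posts element messages carrying the hypothesis.
    struct = msg.get_structure()
    if struct and struct.get_name() == 'pocketsphinx' and struct.get_value('final'):
        print('Result:', struct.get_value('hypothesis'))

bus = pipeline.get_bus()
bus.add_signal_watch()
bus.connect('message::element', on_element_message)

pipeline.set_state(Gst.State.PLAYING)
GLib.MainLoop().run()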
This is code I found on the internet, and I've modified a few things; right now it listens for the words very badly and slowly.
Can you help me modify it for the better? It is built on Ubuntu 16.04 LTS.
I do not know much about programming.
Looking forward to your help.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *

import os
import pyaudio
import wave
import audioop
from collections import deque
import time
import math
import Mic

"""
Written by Sophie Li, 2016
http://blog.justsophie.com/python-speech-to-text-with-pocketsphinx/
"""

class SpeechDetector:
    def __init__(self):
        # Microphone stream config.
        self.CHUNK = 1024  # CHUNKS of bytes to read each time from mic
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 16000

        self.SILENCE_LIMIT = 1  # Silence limit in seconds. The max amount of seconds where
                                # only silence is recorded. When this time passes the
                                # recording finishes and the file is decoded

        self.PREV_AUDIO = 0.5   # Previous audio (in seconds) to prepend. When noise
                                # is detected, how much of previously recorded audio is
                                # prepended. This helps to prevent chopping the beginning
                                # of the phrase.

        self.THRESHOLD = 4500
        self.num_phrases = -1

        # These will need to be modified according to where the pocketsphinx folder is
        MODELDIR = "/home/l/Desktop/pocketsphinx/model/en-us"

        # Create a decoder with certain model
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(MODELDIR, '/home/l/Desktop/pocketsphinx/model/en-us/en-us/'))
        config.set_string('-lm', os.path.join(MODELDIR, '/home/l/Desktop/pocketsphinx/model/en-us/en-us.lm.bin'))
        config.set_string('-dict', os.path.join(MODELDIR, '/home/l/Desktop/pocketsphinx/model/en-us/cmudict-en-us.dict'))
        config.set_string('-keyphrase', 'no one')
        config.set_float('-kws_threshold', 1e+20)

        # Creates decoder object for streaming data.
        self.decoder = Decoder(config)

    def setup_mic(self, num_samples=50):
        """ Gets average audio intensity of your mic sound. You can use it to get
        average intensities while you're talking and/or silent. The average
        is the avg of the .2 of the largest intensities recorded.
        """
        #print "Getting intensity values from mic."
        p = pyaudio.PyAudio()
        stream = p.open(format=self.FORMAT,
                        channels=self.CHANNELS,
                        rate=self.RATE,
                        input=True,
                        frames_per_buffer=self.CHUNK)

        values = [math.sqrt(abs(audioop.avg(stream.read(self.CHUNK), 4)))
                  for x in range(num_samples)]
        values = sorted(values, reverse=True)
        r = sum(values[:int(num_samples * 0.2)]) / int(num_samples * 0.2)
        #print " Finished "
        #print " Average audio intensity is ", r
        stream.close()
        p.terminate()

        if r < 3000:
            self.THRESHOLD = 3500
        else:
            self.THRESHOLD = r + 100

    def save_speech(self, data, p):
        """
        Saves mic data to temporary WAV file. Returns filename of saved
        file
        """
        filename = 'output_' + str(int(time.time()))
        # writes data to WAV file
        data = ''.join(data)
        wf = wave.open(filename + '.wav', 'wb')
        wf.setnchannels(1)
        wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
        wf.setframerate(16000)  # TODO make this value a function parameter?
        wf.writeframes(data)
        wf.close()
        return filename + '.wav'

    def decode_phrase(self, wav_file):
        self.decoder.start_utt()
        stream = open(wav_file, "rb")
        while True:
            buf = stream.read(1024)
            if buf:
                self.decoder.process_raw(buf, False, False)
            else:
                break
        self.decoder.end_utt()
        words = [seg.word for seg in self.decoder.seg()]
        return words

    def run(self):
        """
        Listens to Microphone, extracts phrases from it and calls pocketsphinx
        to decode the sound
        """
        self.setup_mic()

        # Open stream
        p = pyaudio.PyAudio()
        stream = p.open(format=self.FORMAT,
                        channels=self.CHANNELS,
                        rate=self.RATE,
                        input=True,
                        frames_per_buffer=self.CHUNK)
        audio2send = []
        cur_data = ''  # current chunk of audio data
        rel = self.RATE / self.CHUNK
        slid_win = deque(maxlen=self.SILENCE_LIMIT * rel)

        # Prepend audio from 0.5 seconds before noise was detected
        # (int() because deque's maxlen must be an integer)
        prev_audio = deque(maxlen=int(self.PREV_AUDIO * rel))
        started = False

        while True:
            cur_data = stream.read(self.CHUNK)
            slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))

            if sum([x > self.THRESHOLD for x in slid_win]) > 0:
                if started == False:
                    print "Started recording"
                    started = True
                audio2send.append(cur_data)

            elif started:
                print "Finished recording"
                filename = self.save_speech(list(prev_audio) + audio2send, p)
                r = self.decode_phrase(filename)
                print "RESULT: ", r
                # hot word for me is "no one"; if r contains both "no" and "one", end the program
                if r.count("one") > 0 and r.count("no") > 0:
                    Mic.playaudiofromAudio().play("/home/l/Desktop/PROJECT/Audio/beep_hi.wav")
                    os.remove(filename)
                    return

                # Removes temp audio file
                os.remove(filename)
                # Reset all
                started = False
                slid_win = deque(maxlen=self.SILENCE_LIMIT * rel)
                prev_audio = deque(maxlen=int(0.5 * rel))
                audio2send = []
                print "Listening mode ..."

            else:
                prev_audio.append(cur_data)

        print "* Finished listening"
        stream.close()
        p.terminate()