How to pass live audio url to Google Speech to Text API - python

I have a url to live audio recording that I'm trying to transcribe using Google Speech to Text API. I am using an example code from the Cloud Speech to Text API. However, the problem is that when I pass the live url I do not receive any output. Below is the relevant portion of my code. Any help would be greatly appreciated!
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
import io
import os
import time
import requests
import numpy as np
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from urllib.request import urlopen
from datetime import datetime
from datetime import timedelta
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= "app_creds.json"
def get_stream():
stream = urlopen('streamurl')
duration = 60
begin = datetime.now()
duration = timedelta(seconds=duration)
while datetime.now() - begin < duration:
data = stream.read(8000)
return data
def transcribe_streaming():
"""Streams transcription of the given audio file."""
client = speech.SpeechClient()
content = get_stream()
# In practice, stream should be a generator yielding chunks of audio data.
stream = [content]
requests = (types.StreamingRecognizeRequest(audio_content=chunk)
for chunk in stream)
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
language_code='en-US')
streaming_config = types.StreamingRecognitionConfig(config=config)
# streaming_recognize returns a generator.
responses = client.streaming_recognize(streaming_config, requests)
for response in responses:
# Once the transcription has settled, the first result will contain the
# is_final result. The other results will be for subsequent portions of
# the audio.
for result in response.results:
print('Finished: {}'.format(result.is_final))
print('Stability: {}'.format(result.stability))
alternatives = result.alternatives
# The alternatives are ordered from most likely to least.
for alternative in alternatives:
print('Confidence: {}'.format(alternative.confidence))
print(u'Transcript: {}'.format(alternative.transcript))

When sending audio to the Google Speech service, make sure that the service object setup matches the audio encoding. In your particular case
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
language_code='en-US')
corresponds to single channel, 16KHz, linear 16 bit PCM encoding. See the list of other supported encodings if you need to transcribe audio in different formats.

A part of my code I used a while back, I don't know if that may help:
def live_recognize_loop(self):
client = self.client
def is_running():
return self.recording
while self.recording:
with MicrophoneStream(RATE, CHUNK) as stream:
audio_generator = stream.generator(is_running)
requests = (types.StreamingRecognizeRequest(audio_content=content) for content in audio_generator)
responses = client.streaming_recognize(client.custom_streaming_config, requests)
responses_iterator = iter(responses)
while self.recording:
try:
response = next(responses_iterator)
except StopIteration:
break
except OutOfRange:
# Exception 400 - Exceeded maximum allowed stream duration of 65 seconds.
self.user_display(self.intermediateFrame.GetMessageText())
break # Start over
except ServiceUnavailable as e:
# Exception 503 - Getting metadata from plugin failed
self.log("{0} - NOT RECOGNIZED - {1}\n".format(self.getDate(), e))
break
except ResourceExhausted as e:
break
except GoogleAPICallError as e:
break
if response.results:
result = response.results[0]
if result.alternatives:
transcript = result.alternatives[0].transcript
self.intermediateFrame.SetMessageText(transcript)
if not result.is_final:
self.intermediateFrame.Display()
# print(transcript)
else:
self.user_display(transcript)
self.intermediateFrame.Display(False)
self.intermediateFrame.SetMessageText("")
#print("\t\t FINAL: %s" % transcript)
break # Start over
MicrophoneStream class
from __future__ import division
import pyaudio
from six.moves import queue
class MicrophoneStream(object):
"""Opens a recording stream as a generator yielding the audio chunks."""
def __init__(self, rate, chunk):
self._rate = rate
self._chunk = chunk
# Create a thread-safe buffer of audio data
self._buff = queue.Queue()
self.closed = True
def __enter__(self):
self._audio_interface = pyaudio.PyAudio()
self._audio_stream = self._audio_interface.open(
format=pyaudio.paInt16,
# The API currently only supports 1-channel (mono) audio
channels=1, rate=self._rate,
input=True, frames_per_buffer=self._chunk,
# Run the audio stream asynchronously to fill the buffer object.
# This is necessary so that the input device's buffer doesn't
# overflow while the calling thread makes network requests, etc.
stream_callback=self._fill_buffer,
)
self.closed = False
return self
def __exit__(self, type, value, traceback):
self._audio_stream.stop_stream()
self._audio_stream.close()
self.closed = True
# Signal the generator to terminate so that the client's
# streaming_recognize method will not block the process termination.
self._buff.put(None)
self._audio_interface.terminate()
def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
"""Continuously collect data from the audio stream, into the buffer."""
self._buff.put(in_data)
return None, pyaudio.paContinue
def generator(self, is_running=None):
while not self.closed:
# Use a blocking get() to ensure there's at least one chunk of
# data, and stop iteration if the chunk is None, indicating the
# end of the audio stream.
chunk = self._buff.get()
if callable(is_running) and not is_running():
return
if chunk is None:
return
data = [chunk]
# Now consume whatever other data's still buffered.
while True:
try:
chunk = self._buff.get(block=False)
if chunk is None:
return
data.append(chunk)
except queue.Empty:
break
yield b''.join(data)

Try using:
import urllib
urllib.urlretrieve ("http://www.example.com/songs/mp3.mp3", "mp3.mp3")
(for Python 3+ use import urllib.request and urllib.request.urlretrieve)

Related

Azure Text-To-Spech to PyAudio Stream

I am trying to stream the output of an Azure text-to-speech instance to my speaker with PyAudio using Microsoft's sample code
I tried to write to PyAudio's stream inside Azure's callback function def write, but it gives me this error:
`my_stream.write(audio_buffer)
File "/opt/homebrew/lib/python3.10/site-packages/pyaudio.py", line 589
, in write pa.write_stream(self._stream, frames, num_frames,
TypeError: argument 2 must be read-only bytes-like object, not memoryview`
How do I handle Azure's output so that the PyAudio stream accepts it as audio data?
Full code:
`import azure.cognitiveservices.speech as speechsdk
import os, sys, pyaudio
pa = pyaudio.PyAudio()
my_text = "My emotional experiences are varied, but mostly involve trying to find a balance between understanding others’ feelings and managing my own. I also explore the intersection of emotion and technology through affective computing and related research."
voc_data = {
'channels': 1 if sys.platform == 'darwin' else 2,
'rate': 44100,
'width': pa.get_sample_size(pyaudio.paInt16),
'format': pyaudio.paInt16,
'frames': []
}
my_stream = pa.open(format=voc_data['format'],
channels=voc_data['channels'],
rate=voc_data['rate'],
output=True)
speech_key = os.getenv('SPEECH_KEY')
service_region = os.getenv('SPEECH_REGION')
def speech_synthesis_to_push_audio_output_stream():
"""performs speech synthesis and push audio output to a stream"""
class PushAudioOutputStreamSampleCallback(speechsdk.audio.PushAudioOutputStreamCallback):
"""
Example class that implements the PushAudioOutputStreamCallback, which is used to show
how to push output audio to a stream
"""
def __init__(self) -> None:
super().__init__()
self._audio_data = bytes(0)
self._closed = False
def write(self, audio_buffer: memoryview) -> int:
"""
The callback function which is invoked when the synthesizer has an output audio chunk
to write out
"""
self._audio_data += audio_buffer
my_stream.write(audio_buffer)
print("{} bytes received.".format(audio_buffer.nbytes))
return audio_buffer.nbytes
def close(self) -> None:
"""
The callback function which is invoked when the synthesizer is about to close the
stream.
"""
self._closed = True
print("Push audio output stream closed.")
def get_audio_data(self) -> bytes:
return self._audio_data
def get_audio_size(self) -> int:
return len(self._audio_data)
# Creates an instance of a speech config with specified subscription key and service region.
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
# Creates customized instance of PushAudioOutputStreamCallback
stream_callback = PushAudioOutputStreamSampleCallback()
# Creates audio output stream from the callback
push_stream = speechsdk.audio.PushAudioOutputStream(stream_callback)
# Creates a speech synthesizer using push stream as audio output.
stream_config = speechsdk.audio.AudioOutputConfig(stream=push_stream)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=stream_config)
# Receives a text from console input and synthesizes it to stream output.
while True:
# print("Enter some text that you want to synthesize, Ctrl-Z to exit")
# try:
# text = input()
# except EOFError:
# break
result = speech_synthesizer.speak_text_async(my_text).get()
# Check result
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
print("Speech synthesized for text [{}], and the audio was written to output stream.".format(text))
elif result.reason == speechsdk.ResultReason.Canceled:
cancellation_details = result.cancellation_details
print("Speech synthesis canceled: {}".format(cancellation_details.reason))
if cancellation_details.reason == speechsdk.CancellationReason.Error:
print("Error details: {}".format(cancellation_details.error_details))
# Destroys result which is necessary for destroying speech synthesizer
del result
# Destroys the synthesizer in order to close the output stream.
del speech_synthesizer
print("Totally {} bytes received.".format(stream_callback.get_audio_size()))
speech_synthesis_to_push_audio_output_stream()`
Here , I have a work around where instead of the using stream use a file. Where the audio will be stored in the file and then we simply read the file and play it using Py audio.
# Dependencies
import os
import azure.cognitiveservices.speech as speechsdk
import pyaudio
import wave
speech_config = speechsdk.SpeechConfig(subscription="<Key>", region="<Region>")
# Audio Config
audio_config = speechsdk.audio.AudioOutputConfig(filename="background.wav")
speech_config.speech_synthesis_voice_name='en-US-JennyNeural'
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
print("Enter the Text:- ")
text = input()
speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()
print("Conversion is Complete")
filename = 'background.wav' # Same is in audio config
chunk = 1024
file = wave.open(filename, 'rb')
p = pyaudio.PyAudio()
stream = p.open(format = p.get_format_from_width(file.getsampwidth()),
channels = file.getnchannels(),
rate = file.getframerate(),
output = True)
data = file.readframes(chunk)
print("Starting Audio")
while data != '':
stream.write(data)
data = file.readframes(chunk)
stream.stop_stream()
stream.close()
p.terminate()
Here It will take more space and more time as we are reading the file.

Capture Sound from Mirophone and speakers

We are using amazon transcribe to make speach to text but we need to capture sound from both microphone and speakers. Do you think that this can be done with sounddevice or should we use something else?
amazon-transcribe-streaming-sdk
Mic Function
async def mic_stream():
# This function wraps the raw input stream from the microphone forwarding
# the blocks to an asyncio.Queue.
loop = asyncio.get_event_loop()
input_queue = asyncio.Queue()
def callback(indata, outdata, frame_count, time_info, status):
'''
if status:
print(status)
'''
#indata[:] = outdata
loop.call_soon_threadsafe(input_queue.put_nowait, (bytes(indata), status))
# Be sure to use the correct parameters for the audio stream that matches
# the audio formats described for the source language you'll be using:
# https://docs.aws.amazon.com/transcribe/latest/dg/streaming.html
stream = sounddevice.RawStream(
#device=3,
channels=1,
samplerate=16000,
callback=callback,
blocksize=1024 * 2,
dtype="int16",
)
# Initiate the audio stream and asynchronously yield the audio chunks
# as they become available.
with stream:
while True:
indata, status = await input_queue.get()
yield indata, status

How to multithread a program and return results to main.py?

I am using Windows and python. The objective is simple, running the main.py starts a speech recognition and once it recognizes what was said returns the text to main.py. The speech recognition program recognizes without any issues, the problem is at multithreading and getting the result back to main.py.
Here is the main.py:
import threading
from speechEngine.recognize import *
from SpeechSynthesis.speech import *
from core.AI import *
spch= "default"
newthread=threading.Thread(target=speechrec())
newthread.start()
while(True):
if(spch == "default"):
print("at default")
continue
else:
print(spch)
result=process(spch)
speak(result)
spch="default"
And here is speech recognition which is called as a new thread:
import argparse
import os
import queue
from typing import Text
import sounddevice as sd
import vosk
import sys
import json
#from vosk import SetLogLevel
#SetLogLevel(-1)
def speechrec():
q = queue.Queue()
"a lot of argument lines have been deleted to increase readability"
try:
if args.model is None:
args.model = "model"
if args.samplerate is None:
device_info = sd.query_devices(args.device, 'input')
args.samplerate = int(device_info['default_samplerate'])
model = vosk.Model(args.model)
with sd.RawInputStream(samplerate=args.samplerate, blocksize = 8000, device=args.device, dtype='int16', channels=1, callback=callback):
rec = vosk.KaldiRecognizer(model, args.samplerate)
while True:
data = q.get()
if rec.AcceptWaveform(data):
vc=rec.FinalResult() #produces raw output of what the user said
vc=json.loads(vc)
vc=vc['text'] #converts the user speech to text format
if vc != '':
global spch
spch = vc
except KeyboardInterrupt:
parser.exit(0)

How can I make my audio loop with pyaudio?

First of all I'm pretty new to this library and I don't really understand everything. With the help of the internet I managed to get this code snippet working. This code basically plays an audio file(.wav to be specific). The problem is that it only plays once; I want the audio file to loop until I set the is_looping variable to False.
import pyaudio
import wave
class AudioFile:
chunk = 1024
def __init__(self, file_dir):
""" Init audio stream """
self.wf = wave.open(file_dir, 'rb')
self.p = pyaudio.PyAudio()
self.stream = self.p.open(
format=self.p.get_format_from_width(self.wf.getsampwidth()),
channels=self.wf.getnchannels(),
rate=self.wf.getframerate(),
output=True
)
def play(self):
""" Play entire file """
data = self.wf.readframes(self.chunk)
while data != '':
self.stream.write(data)
data = self.wf.readframes(self.chunk)
def close(self):
""" Graceful shutdown """
self.stream.close()
self.p.terminate()
is_looping = True
audio = AudioFile("___.wav")
audio.play()
audio.close()
I tried doing something like this, but it still didn't work:
is_looping = True
audio = AudioFile("___.wav")
while is_looping:
audio.play()
audio.close()
I couldn't find a way to loop the audio using my code, but I found a code in the internet that does exactly what I wanted it to do. Here's the link: https://gist.github.com/THeK3nger/3624478
And here is the code from that link:
import os
import wave
import threading
import sys
# PyAudio Library
import pyaudio
class WavePlayerLoop(threading.Thread):
CHUNK = 1024
def __init__(self, filepath, loop=True):
"""
Initialize `WavePlayerLoop` class.
PARAM:
-- filepath (String) : File Path to wave file.
-- loop (boolean) : True if you want loop playback.
False otherwise.
"""
super(WavePlayerLoop, self).__init__()
self.filepath = os.path.abspath(filepath)
self.loop = loop
def run(self):
# Open Wave File and start play!
wf = wave.open(self.filepath, 'rb')
player = pyaudio.PyAudio()
# Open Output Stream (based on PyAudio tutorial)
stream = player.open(format=player.get_format_from_width(wf.getsampwidth()),
channels=wf.getnchannels(),
rate=wf.getframerate(),
output=True)
# PLAYBACK LOOP
data = wf.readframes(self.CHUNK)
while self.loop:
stream.write(data)
data = wf.readframes(self.CHUNK)
if data == b'': # If file is over then rewind.
wf.rewind()
data = wf.readframes(self.CHUNK)
stream.close()
player.terminate()
def play(self):
"""
Just another name for self.start()
"""
self.start()
def stop(self):
"""
Stop playback.
"""
self.loop = False
You just need to add something like this outside the class and it should work:
player = WavePlayerLoop("sounds/1.wav")
player.play()

How to play an audiofile with pyaudio?

I do not understand the example material for pyaudio. It seems they had written an entire small program and it threw me off.
How do I just play a single audio file?
Format is not an issue, I just want to know the bare minimum code I need to play an audio file.
May be this small wrapper (warning: created on knees) of their example will help you to understand the meaning of code they wrote.
import pyaudio
import wave
import sys
class AudioFile:
chunk = 1024
def __init__(self, file):
""" Init audio stream """
self.wf = wave.open(file, 'rb')
self.p = pyaudio.PyAudio()
self.stream = self.p.open(
format = self.p.get_format_from_width(self.wf.getsampwidth()),
channels = self.wf.getnchannels(),
rate = self.wf.getframerate(),
output = True
)
def play(self):
""" Play entire file """
data = self.wf.readframes(self.chunk)
while data != b'':
self.stream.write(data)
data = self.wf.readframes(self.chunk)
def close(self):
""" Graceful shutdown """
self.stream.close()
self.p.terminate()
# Usage example for pyaudio
a = AudioFile("1.wav")
a.play()
a.close()
The example seems pretty clear to me. You simply save the example as playwav.py call:
python playwav.py my_fav_wav.wav
The wave example with some extra comments:
import pyaudio
import wave
import sys
# length of data to read.
chunk = 1024
# validation. If a wave file hasn't been specified, exit.
if len(sys.argv) < 2:
print "Plays a wave file.\n\n" +\
"Usage: %s filename.wav" % sys.argv[0]
sys.exit(-1)
'''
************************************************************************
This is the start of the "minimum needed to read a wave"
************************************************************************
'''
# open the file for reading.
wf = wave.open(sys.argv[1], 'rb')
# create an audio object
p = pyaudio.PyAudio()
# open stream based on the wave object which has been input.
stream = p.open(format =
p.get_format_from_width(wf.getsampwidth()),
channels = wf.getnchannels(),
rate = wf.getframerate(),
output = True)
# read data (based on the chunk size)
data = wf.readframes(chunk)
# play stream (looping from beginning of file to the end)
while data:
# writing to the stream is what *actually* plays the sound.
stream.write(data)
data = wf.readframes(chunk)
# cleanup stuff.
wf.close()
stream.close()
p.terminate()
This way requires ffmpeg for pydub, but can play not only wave files:
import pyaudio
import sys
from pydub import AudioSegment
if len(sys.argv) <= 1:
print('No File Name!')
sys.exit(1)
chunk = 1024
fn = ' '.join(sys.argv[1:])
pd = AudioSegment.from_file(fn)
p = pyaudio.PyAudio()
stream = p.open(format =
p.get_format_from_width(pd.sample_width),
channels = pd.channels,
rate = pd.frame_rate,
output = True)
i = 0
data = pd[:chunk]._data
while data:
stream.write(data)
i += chunk
data = pd[i:i + chunk]._data
stream.close()
p.terminate()
sys.exit(0)

Categories