how to get Recognized text only? - python

I used Azure speech to text in python
import azure.cognitiveservices.speech as speechsdk
var = lambda evt: print('ss: {}'.format(evt))
speech_recognizer.recognizing.connect(var)
then after trying to get result actual recognizer text it end with this:
ss: SpeechRecognitionEventArgs(session_id=0aea5e8b80e544b48414f2d27585b6c4, result=SpeechRecognitionResult(result_id=86c7de30436f4db1b064121bd617f24b, text="Hello.", reason=ResultReason.RecognizedSpeech))
I want to just print Hello ?

To get the text from the event:
import azure.cognitiveservices.speech as speechsdk
var = lambda evt: print('ss: {}'.format(evt.result.text))
speech_recognizer.recognizing.connect(var)

If you are using simple mic to recognize the text, here is something which you can use to get the text:
def speech_recognize_once_from_mic():
"""performs one-shot speech recognition from the default microphone"""
# <SpeechRecognitionWithMicrophone>
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
# Creates a speech recognizer using microphone as audio input.
# The default language is "en-us".
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config)
# Starts speech recognition, and returns after a single utterance is recognized. The end of a
# single utterance is determined by listening for silence at the end or until a maximum of 15
# seconds of audio is processed. It returns the recognition text as result.
# Note: Since recognize_once() returns only a single utterance, it is suitable only for single
# shot recognition like command or query.
# For long-running multi-utterance recognition, use start_continuous_recognition() instead.
result = speech_recognizer.recognize_once()
# Check the result
if result.reason == speechsdk.ResultReason.RecognizedSpeech:
print("Recognized: {}".format(result.text))
elif result.reason == speechsdk.ResultReason.NoMatch:
print("No speech could be recognized")
elif result.reason == speechsdk.ResultReason.Canceled:
cancellation_details = result.cancellation_details
print("Speech Recognition canceled: {}".format(cancellation_details.reason))
if cancellation_details.reason == speechsdk.CancellationReason.Error:
print("Error details: {}".format(cancellation_details.error_details))
# </SpeechRecognitionWithMicrophone>
Check this repo for further reference. Hope it helps.

Related

Language Translator Using Google API in Python

I have used this code from geeksforgeeks (https://www.geeksforgeeks.org/language-translator-using-google-api-in-python/), I am trying to run it and it runs without any error, and it prints out:
Speak 'hello' to initiate the Translation !
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
but when i say "hello" it does not recognize it and do not start listening for translation.
I have imported all the modules, tried updating every one of them, and also Im using a macbook m1 pro.
And heres the code:
import speech_recognition as spr
from googletrans import Translator
from gtts import gTTS
import os
# Creating Recogniser() class object
recog1 = spr.Recognizer()
# Creating microphone instance
mc = spr.Microphone()
# Capture Voice
with mc as source:
print("Speak 'hello' to initiate the Translation !")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
recog1.adjust_for_ambient_noise(source, duration=0.2)
audio = recog1.listen(source)
MyText = recog1.recognize_google(audio)
MyText = MyText.lower()
# Here initialising the recorder with
# hello, whatever after that hello it
# will recognise it.
if 'hello' in MyText:
# Translator method for translation
translator = Translator()
# short form of english in which
# you will speak
from_lang = 'en'
# In which we want to convert, short
# form of hindi
to_lang = 'hi'
with mc as source:
print("Speak a stentence...")
recog1.adjust_for_ambient_noise(source, duration=0.2)
# Storing the speech into audio variable
audio = recog1.listen(source)
# Using recognize.google() method to
# convert audio into text
get_sentence = recog1.recognize_google(audio)
# Using try and except block to improve
# its efficiency.
try:
# Printing Speech which need to
# be translated.
print("Phase to be Translated :"+ get_sentence)
# Using translate() method which requires
# three arguments, 1st the sentence which
# needs to be translated 2nd source language
# and 3rd to which we need to translate in
text_to_translate = translator.translate(get_sentence,
src= from_lang,
dest= to_lang)
# Storing the translated text in text
# variable
text = text_to_translate.text
# Using Google-Text-to-Speech ie, gTTS() method
# to speak the translated text into the
# destination language which is stored in to_lang.
# Also, we have given 3rd argument as False because
# by default it speaks very slowly
speak = gTTS(text=text, lang=to_lang, slow= False)
# Using save() method to save the translated
# speech in capture_voice.mp3
speak.save("captured_voice.mp3")
# Using OS module to run the translated voice.
os.system("start captured_voice.mp3")
# Here we are using except block for UnknownValue
# and Request Error and printing the same to
# provide better service to the user.
except spr.UnknownValueError:
print("Unable to Understand the Input")
except spr.RequestError as e:
print("Unable to provide Required Output".format(e))
from gtts import gTTS
from io import BytesIO
from pygame import mixer
import time
def speak():
mp3_fp = BytesIO()
tts = gTTS('KGF is a Great movie to watch', lang='en')
tts.write_to_fp(mp3_fp)
tts.save("Audio.mp3")
return mp3_fp
mixer.init()
sound = speak()
sound.seek(0)
mixer.music.load(sound, "mp3")
mixer.music.play()

Python Program Freezes Without Error After a Few Hours of Running Continuously

I'm writing a simple python program to control my LED lights with voice commands and am running into a problem: after a few hours of continuous running, it freezes without any kind of error. I checked task manager every time it was frozen and it didn't show any thing significant, everything was well within normal parameters. I believe it may have something to do with the SpeechRecognition module I'm using, as it always freezes in the "recordAudio()" function below.
(Excuse the iffy code, I'm new to programming)
from BLEDevices import *
import speech_recognition as sr
import winsound
import wolframalpha
import pyttsx3
app_id = 'V3R5HG-H7U9VH6YR4'
client = wolframalpha.Client(app_id)
engine = pyttsx3.init()
def recordAudio(phrase):
# obtain audio from the microphone
r = sr.Recognizer()
with sr.Microphone() as source:
print(phrase)
audio = r.listen(source)
# recognize speech using Google Speech Recognition
try:
data = r.recognize_google(audio)
print("Speech Recognition thinks you said " + data)
except sr.UnknownValueError:
return "failed"
except sr.RequestError as e:
print("Could not request results from Speech Recognition service; {0}".format(e))
return "failed"
return data
def wolfram(text):
try:
res = client.query(text)
result = next(res.results).text
except:
result = 'Unable to Answer Query'
print(result)
engine.say(result)
engine.runAndWait()
def processAudio(text):
if ('color' in text or 'light' in text or 'led' in text or 'brightness' in text or 'work mode' in text or 'relax mode' in text):
led_control(text)
else:
wolfram(text)
while True:
text = recordAudio('Listening for "Controller"...').lower()
if 'controller' in text:
winsound.Beep(500, 100)
text = recordAudio('Listening...').lower()
processAudio(text)
else:
print('\b', end='')

Edit Azure Python code to clean up Speech-to-Text output

I'm using Microsoft Azure's speech-to-text API and it's working well but the output is cumbersome and I'd like to clean it up so that only the recognized speech is displayed.
this is what the output looks like
The python snippet that azure provides is:
try:
import azure.cognitiveservices.speech as speechsdk
import sys
sys.exit(1)
speech_key, service_region = "***", "***"
weatherfilename = os.path.join(
os.path.dirname(__file__),
'orf_audio_2',
'716_anton.wav')
# def speech_recognize_once_from_file():
"""performs one-shot speech recognition with input from an audio file"""
# <SpeechRecognitionWithFile>
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
audio_config = speechsdk.audio.AudioConfig(filename=weatherfilename)
# Creates a speech recognizer using a file as audio input.
# The default language is "en-us".
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
start_continuous_recognition() instead.
result = speech_recognizer.recognize_once()
# Check the result
if result.reason == speechsdk.ResultReason.RecognizedSpeech:
print("Recognized: {}".format(result.text))
elif result.reason == speechsdk.ResultReason.NoMatch:
print("No speech could be recognized: {}".format(result.no_match_details))
elif result.reason == speechsdk.ResultReason.Canceled:
cancellation_details = result.cancellation_details
print("Speech Recognition canceled: {}".format(cancellation_details.reason))
if cancellation_details.reason == speechsdk.CancellationReason.Error:
print("Error details: {}".format(cancellation_details.error_details))
# </SpeechRecognitionWithFile>
result.text in the sample code is the simplest output of recognized speech.
My test with default microphone:
Please refer to below fragment of code which works for me.
import azure.cognitiveservices.speech as speechsdk
import time
# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and service region (e.g., "westus").
speech_key, service_region = "***", "***"
weatherfilename = "D:\\whatstheweatherlike.wav"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
audio_config = speechsdk.audio.AudioConfig(filename=weatherfilename)
# Creates a recognizer with the given settings
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
speech_recognizer.session_stopped.connect(lambda evt: print('\nSESSION STOPPED {}'.format(evt)))
speech_recognizer.recognized.connect(lambda evt: print('\n{}'.format(evt.result.text)))
# print('Say a few words\n\n')
speech_recognizer.start_continuous_recognition()
time.sleep(10)
speech_recognizer.stop_continuous_recognition()
speech_recognizer.session_started.disconnect_all()
speech_recognizer.recognized.disconnect_all()
speech_recognizer.session_stopped.disconnect_all()
And the output looks like:

Why google speech_v1p1beta1 output only shows the last word?

I am using the code below to transcribe an audio file. When the process is completed, I only get the last word.
I have tried both flac and wav files and made sure the files are in my bucket.
Also verified service account is google is working fine. But can't figure out why I am only getting the last word.
#!/usr/bin/env python
"""Google Cloud Speech API sample that demonstrates enhanced models
and recognition metadata.
Example usage:
python diarization.py
"""
import argparse
import io
def transcribe_file_with_diarization():
"""Transcribe the given audio file synchronously with diarization."""
# [START speech_transcribe_diarization_beta]
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()
audio = speech.types.RecognitionAudio(uri="gs://MYBUCKET/MYAudiofile")
config = speech.types.RecognitionConfig(
encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=8000,
language_code='en-US',
enable_speaker_diarization=True,
diarization_speaker_count=2)
print('Waiting for operation to complete...')
response = client.recognize(config, audio)
# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result:
result = response.results[-1]
words_info = result.alternatives[0].words
# Printing out the output:
for word_info in words_info:
print("word: '{}', speaker_tag: {}".format(word_info.word,
word_info.speaker_tag))
# [END speech_transcribe_diarization_beta]
if __name__ == '__main__':
transcribe_file_with_diarization()
RESULTS is shown here after running the code:
python diarazation.py
Waiting for operation to complete...
word: 'bye', speaker_tag: 0

Is it possible to always have a google cloud speech recognition/api listening for a keyword python

I am just starting out with python and the google speech api/speech recognition. I was wondering if it is possible to always have the speech api listening for a keyword and when it hears the keyword to process commands. Since there is a limit on how much free audio the google speech api can process, is this possible? So far I have code that looks like this, but once the api does not hear any speech for a certain amount of seconds(I think 4), it throws an error. In a final project, I'd like to get this to work on a raspberry pi 3.
import speech_recognition as sr
import speak
from time import ctime
import time
import sys
r = sr.Recognizer()
lang = 'en'
data = ''
nameCalled = 0
# Enable Microphone and use for audio input
# Speech recognition using Google Speech Recognition
def spk(text, lang):
speak.tts(text, lang)
def audioRecord():
try:
with sr.Microphone() as source:
#r.energy_threshold = 500
# Increase for less sensitivity, decrease for more
print('Listening...')
audio = r.listen(source)
#r.adjust_for_ambient_noise(source)
data = r.recognize_google(audio)
print('You said ' + data)
return data
except sr.UnknownValueError:
print('Google could not understand audio!')
except sr.RequestError as e:
print('Could not request results for GSR')
def brain(data):
global nameCalled
#^^Keep track to see if amber was called^^
global lang
#If amber was said, than the next command heard can be executed
if nameCalled == 0:
if 'Amber' in data:
nameCalled = 1
spk('Yes?', lang)
elif 'nothing' in data:
spk('Okay', lang)
sys.exit()
else:
return 'null'
#Once we hear amber, the next command spoken can be executed,
# if something goes wrong, just set the nameCalled variable to 0
#and restart the process
elif nameCalled == 1:
if 'what time is it' in data:
spk(ctime(), lang)
if 'nothing' in data:
spk('Okay', lang)
sys.exit()
nameCalled = 0
else:
nameCalled = 0
# initialization
spk('hello nick, what can I do for you today', lang)
while 1:
data = audioRecord()
brain(data)
Kitt.ai provides 'Snowboy', a hotword detection engine which serves that purpose. You may trigger the speech recognition after the hotword is detected and it's pretty accurate too and it exactly fits this use-case.
Best of all, it runs offline.
You can set your code to run, after getting triggered by the hotword.
Check it out:
https://snowboy.kitt.ai

Categories