I'm using Microsoft Azure's speech-to-text API and it's working well, but the output is cumbersome and I'd like to clean it up so that only the recognized speech is displayed.
This is what the output looks like:
The Python snippet that Azure provides is:
import os

try:
    import azure.cognitiveservices.speech as speechsdk
except ImportError:
    import sys
    sys.exit(1)

speech_key, service_region = "***", "***"
weatherfilename = os.path.join(
    os.path.dirname(__file__),
    'orf_audio_2',
    '716_anton.wav')

# def speech_recognize_once_from_file():
"""performs one-shot speech recognition with input from an audio file"""
# <SpeechRecognitionWithFile>
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
audio_config = speechsdk.audio.AudioConfig(filename=weatherfilename)

# Creates a speech recognizer using a file as audio input.
# The default language is "en-us".
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

# For long-running multi-utterance recognition, use start_continuous_recognition() instead.
result = speech_recognizer.recognize_once()

# Check the result
if result.reason == speechsdk.ResultReason.RecognizedSpeech:
    print("Recognized: {}".format(result.text))
elif result.reason == speechsdk.ResultReason.NoMatch:
    print("No speech could be recognized: {}".format(result.no_match_details))
elif result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = result.cancellation_details
    print("Speech Recognition canceled: {}".format(cancellation_details.reason))
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        print("Error details: {}".format(cancellation_details.error_details))
# </SpeechRecognitionWithFile>
result.text in the sample code is the simplest output of recognized speech.
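For a single short file, a minimal sketch that prints nothing but the recognized text (the key, region and WAV filename below are placeholders taken from the question, not values from a working setup):

import azure.cognitiveservices.speech as speechsdk

# Placeholder credentials and audio path; substitute your own.
speech_config = speechsdk.SpeechConfig(subscription="***", region="***")
audio_config = speechsdk.audio.AudioConfig(filename="716_anton.wav")
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

# recognize_once() returns after the first utterance; only its text is printed.
result = speech_recognizer.recognize_once()
if result.reason == speechsdk.ResultReason.RecognizedSpeech:
    print(result.text)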
My test with the default microphone:
Please refer to the fragment of code below, which works for me.
import azure.cognitiveservices.speech as speechsdk
import time
# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and service region (e.g., "westus").
speech_key, service_region = "***", "***"
weatherfilename = "D:\\whatstheweatherlike.wav"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
audio_config = speechsdk.audio.AudioConfig(filename=weatherfilename)
# Creates a recognizer with the given settings
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
speech_recognizer.session_stopped.connect(lambda evt: print('\nSESSION STOPPED {}'.format(evt)))
speech_recognizer.recognized.connect(lambda evt: print('\n{}'.format(evt.result.text)))
# print('Say a few words\n\n')
speech_recognizer.start_continuous_recognition()
time.sleep(10)
speech_recognizer.stop_continuous_recognition()
speech_recognizer.session_started.disconnect_all()
speech_recognizer.recognized.disconnect_all()
speech_recognizer.session_stopped.disconnect_all()
And the output looks like:
I have used this code from GeeksforGeeks (https://www.geeksforgeeks.org/language-translator-using-google-api-in-python/). I am trying to run it; it runs without any error and prints out:
Speak 'hello' to initiate the Translation !
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
but when I say "hello" it does not recognize it and does not start listening for the translation.
I have imported all the modules and tried updating every one of them. I'm using a MacBook M1 Pro.
And here's the code:
import speech_recognition as spr
from googletrans import Translator
from gtts import gTTS
import os

# Creating Recognizer() class object
recog1 = spr.Recognizer()

# Creating microphone instance
mc = spr.Microphone()

# Capture Voice
with mc as source:
    print("Speak 'hello' to initiate the Translation !")
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    recog1.adjust_for_ambient_noise(source, duration=0.2)
    audio = recog1.listen(source)
    MyText = recog1.recognize_google(audio)
    MyText = MyText.lower()

# Here the recognizer is initialised with
# 'hello'; whatever comes after that hello
# will be recognised.
if 'hello' in MyText:

    # Translator method for translation
    translator = Translator()

    # short form of English, the language in which
    # you will speak
    from_lang = 'en'

    # language we want to convert to, short
    # form of Hindi
    to_lang = 'hi'

    with mc as source:
        print("Speak a sentence...")
        recog1.adjust_for_ambient_noise(source, duration=0.2)

        # Storing the speech into the audio variable
        audio = recog1.listen(source)

        # Using the recognize_google() method to
        # convert audio into text
        get_sentence = recog1.recognize_google(audio)

        # Using a try and except block to handle
        # recognition errors gracefully.
        try:
            # Printing the speech which needs to
            # be translated.
            print("Phrase to be Translated: " + get_sentence)

            # Using the translate() method which requires
            # three arguments: 1st the sentence which
            # needs to be translated, 2nd the source language
            # and 3rd the language to translate into
            text_to_translate = translator.translate(get_sentence,
                                                     src=from_lang,
                                                     dest=to_lang)

            # Storing the translated text in the text
            # variable
            text = text_to_translate.text

            # Using Google Text-to-Speech, i.e. the gTTS() method,
            # to speak the translated text in the
            # destination language stored in to_lang.
            # The slow argument is set to False because
            # by default it speaks very slowly.
            speak = gTTS(text=text, lang=to_lang, slow=False)

            # Using the save() method to save the translated
            # speech in captured_voice.mp3
            speak.save("captured_voice.mp3")

            # Using the os module to play the translated voice.
            os.system("start captured_voice.mp3")

        # Except blocks for UnknownValueError
        # and RequestError, printing a message to
        # provide better feedback to the user.
        except spr.UnknownValueError:
            print("Unable to Understand the Input")
        except spr.RequestError as e:
            print("Unable to provide Required Output: {}".format(e))
from gtts import gTTS
from io import BytesIO
from pygame import mixer
import time

def speak():
    mp3_fp = BytesIO()
    tts = gTTS('KGF is a Great movie to watch', lang='en')
    tts.write_to_fp(mp3_fp)
    tts.save("Audio.mp3")
    return mp3_fp

mixer.init()
sound = speak()
sound.seek(0)
mixer.music.load(sound, "mp3")
mixer.music.play()
I'm having trouble running an image from a URL through the Vision API's Safe Search/Explicit Content Detection. Python Samples can be found here:
https://github.com/googleapis/python-vision/blob/HEAD/samples/snippets/detect/detect.py
If I were to save the code below in a Python file, what is the best way to run it? I tried !python detect.py safe-search-uri http://www.photos-public-domain.com/wp-content/uploads/2011/01/old-vw-bug-and-van.jpg but it's not working. Maybe I'm missing some of the code or running it the wrong way?
Sample code from the GitHub link above:
from google.cloud import vision
client = vision.ImageAnnotatorClient()
image = vision.Image()
image.source.image_uri = uri
response = client.safe_search_detection(image=image)
safe = response.safe_search_annotation
# Names of likelihood from google.cloud.vision.enums
likelihood_name = ('UNKNOWN', 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE',
'LIKELY', 'VERY_LIKELY')
print('Safe search:')
print('adult: {}'.format(likelihood_name[safe.adult]))
print('medical: {}'.format(likelihood_name[safe.medical]))
print('spoofed: {}'.format(likelihood_name[safe.spoof]))
print('violence: {}'.format(likelihood_name[safe.violence]))
print('racy: {}'.format(likelihood_name[safe.racy]))
if response.error.message:
    raise Exception(
        '{}\nFor more info on error messages, check: '
        'https://cloud.google.com/apis/design/errors'.format(
            response.error.message))
If you just executed the code snippet you included in your question, you are not passing the uri value to the code properly. You need to parse the arguments you pass on your Python command line. You can do this by adding argparse.
from google.cloud import vision
import argparse

# Parse the options in this part #
parser = argparse.ArgumentParser(description='Safe search')
parser.add_argument(
    '--safe-search-uri',
    dest='uri'
)
args = parser.parse_args()
uri = args.uri

# [START vision_safe_search_detection_gcs]
def detect_safe_search_uri(uri):
    """Detects unsafe features in the file located in Google Cloud Storage or
    on the Web."""
    from google.cloud import vision
    client = vision.ImageAnnotatorClient()
    image = vision.Image()
    image.source.image_uri = uri

    response = client.safe_search_detection(image=image)
    safe = response.safe_search_annotation

    # Names of likelihood from google.cloud.vision.enums
    likelihood_name = ('UNKNOWN', 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE',
                       'LIKELY', 'VERY_LIKELY')
    print('Safe search:')
    print('adult: {}'.format(likelihood_name[safe.adult]))
    print('medical: {}'.format(likelihood_name[safe.medical]))
    print('spoofed: {}'.format(likelihood_name[safe.spoof]))
    print('violence: {}'.format(likelihood_name[safe.violence]))
    print('racy: {}'.format(likelihood_name[safe.racy]))

    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))
# [END vision_safe_search_detection_gcs]

detect_safe_search_uri(uri)
Testing using the command !python detect.py --safe-search-uri http://www.photos-public-domain.com/wp-content/uploads/2011/01/old-vw-bug-and-van.jpg:
I'm writing a simple Python program to control my LED lights with voice commands and am running into a problem: after a few hours of continuous running, it freezes without any kind of error. I checked Task Manager every time it froze and it didn't show anything significant; everything was well within normal parameters. I believe it may have something to do with the SpeechRecognition module I'm using, as it always freezes in the recordAudio() function below.
(Excuse the iffy code, I'm new to programming.)
from BLEDevices import *
import speech_recognition as sr
import winsound
import wolframalpha
import pyttsx3

app_id = 'V3R5HG-H7U9VH6YR4'
client = wolframalpha.Client(app_id)
engine = pyttsx3.init()

def recordAudio(phrase):
    # obtain audio from the microphone
    r = sr.Recognizer()
    with sr.Microphone() as source:
        print(phrase)
        audio = r.listen(source)

    # recognize speech using Google Speech Recognition
    try:
        data = r.recognize_google(audio)
        print("Speech Recognition thinks you said " + data)
    except sr.UnknownValueError:
        return "failed"
    except sr.RequestError as e:
        print("Could not request results from Speech Recognition service; {0}".format(e))
        return "failed"
    return data

def wolfram(text):
    try:
        res = client.query(text)
        result = next(res.results).text
    except:
        result = 'Unable to Answer Query'
    print(result)
    engine.say(result)
    engine.runAndWait()

def processAudio(text):
    if ('color' in text or 'light' in text or 'led' in text or 'brightness' in text or 'work mode' in text or 'relax mode' in text):
        led_control(text)
    else:
        wolfram(text)

while True:
    text = recordAudio('Listening for "Controller"...').lower()
    if 'controller' in text:
        winsound.Beep(500, 100)
        text = recordAudio('Listening...').lower()
        processAudio(text)
    else:
        print('\b', end='')
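One hedged observation, not a confirmed fix: speech_recognition's Recognizer.listen() blocks until it detects a phrase, and it accepts optional timeout and phrase_time_limit arguments, raising sr.WaitTimeoutError if speech never starts. A sketch of recordAudio() with those guards added, purely to test whether the hang happens inside listen() (the timeout values are assumptions, not tuned for this setup):

import speech_recognition as sr

def recordAudio(phrase, timeout=10, phrase_time_limit=15):
    # Obtain audio from the microphone, but never block indefinitely:
    # listen() gives up if speech does not start within `timeout` seconds
    # and stops recording after `phrase_time_limit` seconds of audio.
    r = sr.Recognizer()
    with sr.Microphone() as source:
        print(phrase)
        try:
            audio = r.listen(source, timeout=timeout, phrase_time_limit=phrase_time_limit)
        except sr.WaitTimeoutError:
            return "failed"

    # recognize speech using Google Speech Recognition
    try:
        return r.recognize_google(audio)
    except (sr.UnknownValueError, sr.RequestError):
        return "failed"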
I'm not too familiar with Python; apologies if this is too trivial a question.
I have a script that gets an audio file from a URL. I need to convert the file from .ogg to .wav.
Then I want to pass the converted, loaded file to a function that takes a file path string as an argument.
Below is my code:
import os
import pydub
import glob
import time
from io import BytesIO
import pandas as pd
from urllib.request import Request, urlopen
import urllib.error
import azure.cognitiveservices.speech as speechsdk

#%%
audio_file = "https://url.whatever.com.co/audio_file.ogg"
req = Request(audio_file)
try:
    response = urlopen(req).read()
except urllib.error.URLError as e:
    print(e.reason)

sound = pydub.AudioSegment.from_ogg(BytesIO(response))
sound_wav = sound.export(format = "wav")

speech_key, service_region = "XXXXXXXXXXXXXXXX", "eastus"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
speech_config.speech_recognition_language="es-ES"

audio_filename = r"C:\some_file_path\3AC3844337F7E5CEAE95.wav"
#audio_config = speechsdk.audio.AudioConfig(sound_wav)
audio_config = speechsdk.audio.AudioConfig(audio_filename = audio_filename)

speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

done = False

def stop_cb(evt):
    """callback that stops continuous recognition upon receiving an event `evt`"""
    print('CLOSING on {}'.format(evt))
    speech_recognizer.stop_continuous_recognition()
    global done
    done = True

all_results = []

def handle_final_result(evt):
    all_results.append(evt.result.text)

speech_recognizer.recognized.connect(handle_final_result)

# Connect callbacks to the events fired by the speech recognizer
speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))

# stop continuous recognition on either session stopped or canceled events
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)

# Start continuous speech recognition
speech_recognizer.start_continuous_recognition()
while not done:
    time.sleep(.5)

print("Printing all results:")
print(all_results)
When I run my code with this line:
audio_config = speechsdk.audio.AudioConfig(audio_filename = audio_filename)
It works correctly...
However when I run it with this line:
audio_config = speechsdk.audio.AudioConfig(sound_wav)
I get this error:
ValueError: use_default_microphone must be a bool, is "tempfile._TemporaryFileWrapper object at 0x0000020EC4297668"
The error message you got suggests that sound_wav is a temporary filename. Then, as seen in the documentation, it looks like audio_config = speechsdk.audio.AudioConfig(filename = sound_wav) is what you need.
As you used audio_filename as a parameter, it may be that a different SDK version uses that name. You can try using that instead.
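If passing sound_wav directly still raises an error, a minimal sketch (the temporary-file handling here is an assumption, not something from the original code) is to export the converted audio to a concrete .wav path and hand that path to AudioConfig through the filename keyword:

import tempfile
import azure.cognitiveservices.speech as speechsdk

# Assumes `sound` is the pydub.AudioSegment built from the downloaded .ogg above.
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
tmp.close()
sound.export(tmp.name, format="wav")

# AudioConfig now receives a plain path string rather than a file object.
audio_config = speechsdk.audio.AudioConfig(filename=tmp.name)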
I used Azure Speech to Text in Python:
import azure.cognitiveservices.speech as speechsdk
var = lambda evt: print('ss: {}'.format(evt))
speech_recognizer.recognizing.connect(var)
Then, after trying to get the actual recognized text, it ends with this:
ss: SpeechRecognitionEventArgs(session_id=0aea5e8b80e544b48414f2d27585b6c4, result=SpeechRecognitionResult(result_id=86c7de30436f4db1b064121bd617f24b, text="Hello.", reason=ResultReason.RecognizedSpeech))
I want to just print "Hello". How can I do that?
To get the text from the event:
import azure.cognitiveservices.speech as speechsdk
var = lambda evt: print('ss: {}'.format(evt.result.text))
speech_recognizer.recognizing.connect(var)
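As a small follow-up (assuming speech_recognizer is created the same way as in the samples above): the recognizing event fires repeatedly with interim hypotheses while you speak, so if you only want each finished utterance printed once, connect to the recognized event instead:

# recognized fires once per completed utterance with the final text.
speech_recognizer.recognized.connect(lambda evt: print(evt.result.text))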
If you are using a simple mic to recognize the text, here is something you can use to get the text:
def speech_recognize_once_from_mic():
    """performs one-shot speech recognition from the default microphone"""
    # <SpeechRecognitionWithMicrophone>
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # Creates a speech recognizer using microphone as audio input.
    # The default language is "en-us".
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config)

    # Starts speech recognition, and returns after a single utterance is recognized. The end of a
    # single utterance is determined by listening for silence at the end or until a maximum of 15
    # seconds of audio is processed. It returns the recognition text as result.
    # Note: Since recognize_once() returns only a single utterance, it is suitable only for single
    # shot recognition like command or query.
    # For long-running multi-utterance recognition, use start_continuous_recognition() instead.
    result = speech_recognizer.recognize_once()

    # Check the result
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(result.text))
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
    # </SpeechRecognitionWithMicrophone>
Check this repo for further reference. Hope it helps.