The purpose of my code is to listen to the user speak and return what he said. I am using the Whisper and the Speach Recognition Libraries for this. When I run the code, it returns a blank list {'text': '', 'segments': [], 'language': 'en'}
Code:
import unicodedata
import speech_recognition as sr
import pyttsx3
import whisper
import numpy as np
r = sr.Recognizer()
with sr.Microphone() as source:
print('Speak now!')
audiosource = r.listen(source)
raw_bytes = audiosource.frame_data
something = np.frombuffer(raw_bytes, np.int16).flatten().astype(np.float32) / 32768.0
model = whisper.load_model("tiny")
result = model.transcribe(something, fp16=False)
print(result)
I tried looking at the type of data the the transcribe function takes in, which is np. array. The something variable is also an np.array. I don't understand why the transcribe function doesn't decode the array.
Related
I have used this code from geeksforgeeks (https://www.geeksforgeeks.org/language-translator-using-google-api-in-python/), I am trying to run it and it runs without any error, and it prints out:
Speak 'hello' to initiate the Translation !
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
but when i say "hello" it does not recognize it and do not start listening for translation.
I have imported all the modules, tried updating every one of them, and also Im using a macbook m1 pro.
And heres the code:
import speech_recognition as spr
from googletrans import Translator
from gtts import gTTS
import os
# Creating Recogniser() class object
recog1 = spr.Recognizer()
# Creating microphone instance
mc = spr.Microphone()
# Capture Voice
with mc as source:
print("Speak 'hello' to initiate the Translation !")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
recog1.adjust_for_ambient_noise(source, duration=0.2)
audio = recog1.listen(source)
MyText = recog1.recognize_google(audio)
MyText = MyText.lower()
# Here initialising the recorder with
# hello, whatever after that hello it
# will recognise it.
if 'hello' in MyText:
# Translator method for translation
translator = Translator()
# short form of english in which
# you will speak
from_lang = 'en'
# In which we want to convert, short
# form of hindi
to_lang = 'hi'
with mc as source:
print("Speak a stentence...")
recog1.adjust_for_ambient_noise(source, duration=0.2)
# Storing the speech into audio variable
audio = recog1.listen(source)
# Using recognize.google() method to
# convert audio into text
get_sentence = recog1.recognize_google(audio)
# Using try and except block to improve
# its efficiency.
try:
# Printing Speech which need to
# be translated.
print("Phase to be Translated :"+ get_sentence)
# Using translate() method which requires
# three arguments, 1st the sentence which
# needs to be translated 2nd source language
# and 3rd to which we need to translate in
text_to_translate = translator.translate(get_sentence,
src= from_lang,
dest= to_lang)
# Storing the translated text in text
# variable
text = text_to_translate.text
# Using Google-Text-to-Speech ie, gTTS() method
# to speak the translated text into the
# destination language which is stored in to_lang.
# Also, we have given 3rd argument as False because
# by default it speaks very slowly
speak = gTTS(text=text, lang=to_lang, slow= False)
# Using save() method to save the translated
# speech in capture_voice.mp3
speak.save("captured_voice.mp3")
# Using OS module to run the translated voice.
os.system("start captured_voice.mp3")
# Here we are using except block for UnknownValue
# and Request Error and printing the same to
# provide better service to the user.
except spr.UnknownValueError:
print("Unable to Understand the Input")
except spr.RequestError as e:
print("Unable to provide Required Output".format(e))
from gtts import gTTS
from io import BytesIO
from pygame import mixer
import time
def speak():
mp3_fp = BytesIO()
tts = gTTS('KGF is a Great movie to watch', lang='en')
tts.write_to_fp(mp3_fp)
tts.save("Audio.mp3")
return mp3_fp
mixer.init()
sound = speak()
sound.seek(0)
mixer.music.load(sound, "mp3")
mixer.music.play()
I'm trying to adjust the pitch of IBM Watson but I can't seem to find any documentation on this whatsoever.
If you visit this link then you can see that there is an option to adjust the pitch/speed.
The code I have is very simply this:
from ibm_watson import TextToSpeechV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
authenticator = IAMAuthenticator('api_key')
text_to_speech = TextToSpeechV1(
authenticator=authenticator
)
text_to_speech.set_service_url('service_url')
sample = "insert what you want to say here"
with open('test.wav', 'wb') as audio_file:
audio_file.write(
text_to_speech.synthesize(
sample,
voice='en-GB_JamesV3Voice',
accept='audio/wav'
).get_result().content)
I have literally no idea what parameters to adjust in order to make the voice low. Thank you so much!
What you are looking for is the prosody element. Neural voices (V3) only use the pitch and rate attribute.
Using your example:
sample = 'Here is a <prosody pitch="150Hz"> modified pitch </prosody> example.'
sample = 'Here is a <prosody rate="x-slow"> modified rate </prosody> example.'
And here is a link to the docs about the prosody element:
https://cloud.ibm.com/docs/text-to-speech?topic=text-to-speech-elements#prosody_element
I'm writing a simple python program to control my LED lights with voice commands and am running into a problem: after a few hours of continuous running, it freezes without any kind of error. I checked task manager every time it was frozen and it didn't show any thing significant, everything was well within normal parameters. I believe it may have something to do with the SpeechRecognition module I'm using, as it always freezes in the "recordAudio()" function below.
(Excuse the iffy code, I'm new to programming)
from BLEDevices import *
import speech_recognition as sr
import winsound
import wolframalpha
import pyttsx3
app_id = 'V3R5HG-H7U9VH6YR4'
client = wolframalpha.Client(app_id)
engine = pyttsx3.init()
def recordAudio(phrase):
# obtain audio from the microphone
r = sr.Recognizer()
with sr.Microphone() as source:
print(phrase)
audio = r.listen(source)
# recognize speech using Google Speech Recognition
try:
data = r.recognize_google(audio)
print("Speech Recognition thinks you said " + data)
except sr.UnknownValueError:
return "failed"
except sr.RequestError as e:
print("Could not request results from Speech Recognition service; {0}".format(e))
return "failed"
return data
def wolfram(text):
try:
res = client.query(text)
result = next(res.results).text
except:
result = 'Unable to Answer Query'
print(result)
engine.say(result)
engine.runAndWait()
def processAudio(text):
if ('color' in text or 'light' in text or 'led' in text or 'brightness' in text or 'work mode' in text or 'relax mode' in text):
led_control(text)
else:
wolfram(text)
while True:
text = recordAudio('Listening for "Controller"...').lower()
if 'controller' in text:
winsound.Beep(500, 100)
text = recordAudio('Listening...').lower()
processAudio(text)
else:
print('\b', end='')
so i'm creating a speech to search script in python for youtube using selenium and speech recognition module and it's working perfectly except for one thing and its that it listens to my voice for too long or waits for my input for too long so is there a way to put a timer on the input? here's my code:
import speech_recognition as sr
from selenium import webdriver
r=sr.Recognizer()
drive=webdriver.Chrome()
def gotosite():
drive.get("https://youtube.com")
with sr.Microphone() as source:
print("speak:")
audio = r.listen(source)
text = r.recognize_google(audio)
print("You said: {}".format(text))
searchbox = drive.find_element_by_xpath(
"/html/body/ytd-app/div/div/ytd-masthead/div[3]/div[2]/ytd-searchbox/form/div/div[1]/input")
searchbox.send_keys(text)
searchbutton = drive.find_element_by_xpath(
"/html/body/ytd-app/div/div/ytd-masthead/div[3]/div[2]/ytd-searchbox/form/button")
searchbutton.click()
gotosite()
use audio = r.listen(source, timeout=3) in place of audio = r.listen(source)
I have searched and tried to implement solutions suggested here:
Errno 13 Permission denied: 'file.mp3' Python
Error while re-opening sound file in python
But there doesn't seem to be any good solutions to this. Here is my code, can anyone tell me what I am doing wrong here:
#!/usr/bin/env python3
# Requires PyAudio and PySpeech.
import time, os
import speech_recognition as sr
from gtts import gTTS
import pygame as pg
import mutagen.mp3
#Find out what input sound device is default (use if you have issues with microphone)
#import pyaudio
#sdev= pyaudio.pa.get_default_input_device()
def play_music(sound_file, volume=0.8):
'''
stream music with mixer.music module in a blocking manner
this will stream the sound from disk while playing
'''
# set up the mixer, this will set it up according to your sound file
mp3 = mutagen.mp3.MP3(sound_file)
pg.mixer.init(frequency=mp3.info.sample_rate)
pg.mixer.music.set_volume(volume)
try:
pg.mixer.music.load(sound_file)
print("HoBo Sound file {} loaded!".format(sound_file))
except pg.error:
print("HoBo Sound file {} not found! ({})".format(sound_file, pg.get_error()))
return
pg.mixer.music.play()
while pg.mixer.music.get_busy() == True:
continue
pg.mixer.quit()
sound_file.close()
def speak(audioString):
print(audioString)
tts = gTTS(text=audioString, lang='en')
tts.save("audio.mp3")
# pick a mp3 file in folder or give full path
sound_file = "audio.mp3"
# optional volume 0 to 1.0
volume = 0.6
play_music(sound_file, volume)
def audioIn():
# Record Audio from Microphone
r = sr.Recognizer()
with sr.Microphone() as source:
print("Say something!")
audio = r.listen(source)
# Google Speech Recognition
try:
# for testing purposes, we're just using the default API key
# to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
# instead of `r.recognize_google(audio)`
data = r.recognize_google(audio)
print("You said: ", data)
except sr.UnknownValueError:
print("Google Speech Recognition could not understand audio")
except sr.RequestError as e:
print("Could not request results from Google Speech Recognition service; {0}".format(e))
return data
def hobo(data):
if "how are you" in data:
speak("I am fine")
if "what time is it" in data:
speak(time.ctime())
if "where is" in data:
data = data.split(" ")
location = data[2]
speak("Hold on Sir, I will show you where " + location + " is.")
os.system("chromium-browser https://www.google.nl/maps/place/" + location + "/&")
# Starts the program
#time.sleep(2)
speak("Testing")
while(data != "stop"):
data = audioIn()
hobo(data)
else:
quit
So I found the fix in one of the original threads I already went over. The fix was to implement a delete() function like so:
def delete():
time.sleep(2)
pg.mixer.init()
pg.mixer.music.load("somefilehere.mp3")
os.remove("audio.mp3")
and changing the play_music() function so it includes the delete() function in the end (and I removed the sound_file.close() statement of course).
Follow below method
import time
from gtts import gTTS
import pygame
def Text_to_speech():
Message = "hey there"
speech = gTTS(text=Message)
speech.save('textToSpeech.mp3')
pygame.mixer.init()
pygame.mixer.music.load("textToSpeech.mp3")
pygame.mixer.music.play()
time.sleep(3)
pygame.mixer.music.unload()