I'm using Google Speech API and since i'm using LongRunning functions for wav files, and they're all in pt-BR language, they're returning with content such as "voc\303\252 hoje boa noite cart\303\243o".
How can I convert this back to UTF-8?
I already tried .encode function, and already tried to check if there's any parameter to send, but I cannot find anything.
# [START def_transcribe_gcs]
def transcribe_gcs(gcs_uri):
"""Asynchronously transcribes the audio file specified by the gcs_uri."""
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
client = speech.SpeechClient()
audio = types.RecognitionAudio(uri=gcs_uri)
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=8000,
language_code='pt-BR')
operation = client.long_running_recognize(config, audio)
print('Waiting for operation to complete...')
response = operation.result(timeout=300)
# Print the first alternative of all the consecutive results.
for result in response.results:
print('Transcript: {}'.format(result.alternatives[0].transcript))
print('Confidence: {}'.format(result.alternatives[0].confidence))
##This part is mine, the rest of the code belongs to Google
file = open("Test.txt", "wb")
file.write(str(response.results))
file.close()
# [END def_transcribe_gcs]
Related
I would like to know if it is possible to get all the possible transcripts that google can generate from a given audio file, as you can see it is only giving the transcript that has the higher matching result.
from google.cloud import speech
import os
import io
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = ''
# Creates google client
client = speech.SpeechClient()
# Full path of the audio file, Replace with your file name
file_name = os.path.join(os.path.dirname(__file__),"test2.wav")
#Loads the audio file into memory
with io.open(file_name, "rb") as audio_file:
content = audio_file.read()
audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
audio_channel_count=1,
language_code="en-gb"
)
# Sends the request to google to transcribe the audio
response = client.recognize(request={"config": config, "audio": audio})
print(response.results)
# Reads the response
for result in response.results:
print("Transcript: {}".format(result.alternatives[0].transcript))
On your RecognitionConfig(), set a value to max_alternatives. When this is set greater than 1, it will show the other possible transcriptions.
max_alternatives int
Maximum number of recognition hypotheses to be returned. Specifically,
the maximum number of SpeechRecognitionAlternative messages within
each SpeechRecognitionResult. The server may return fewer than
max_alternatives. Valid values are 0-30. A value of 0
or 1 will return a maximum of one. If omitted, will return a
maximum of one.
Update your RecognitionConfig() to the code below:
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
audio_channel_count=1,
language_code="en-gb",
max_alternatives=10 # place a value between 0 - 30
)
I tested this using the sample audio from the github repo of Speech API. I used code below for testing:
from google.cloud import speech
import os
import io
# Creates google client
client = speech.SpeechClient()
# Full path of the audio file, Replace with your file name
file_name = os.path.join(os.path.dirname(__file__),"audio.raw")
#Loads the audio file into memory
with io.open(file_name, "rb") as audio_file:
content = audio_file.read()
audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
audio_channel_count=1,
language_code="en-us",
max_alternatives=10 # used 10 for testing
)
# Sends the request to google to transcribe the audio
response = client.recognize(request={"config": config, "audio": audio})
for result in response.results:
print(result.alternatives)
Output:
I'm currently working on a project where I request a phone call (Mp3) and have to make an automatic transcript through a python script.
I'm using the Azure Speech to text services and got that all working, but that service only supports a Wav. file and I am still stuck at that part.
import azure.cognitiveservices.speech as speechsdk
import time
from os import path
from pydub import AudioSegment
import requests
import hashlib
OID = ***
string = f"***"
encoded = string.encode()
result = hashlib.sha256(encoded)
resultHash = (result.hexdigest())
r = requests.get(f"***", headers={f"***":f"{***}"})
Telefoongesprek = r
# converts audio file (mp3 to Wav.)
#src = Telefoongesprek
#dst = "Telefoongesprek #****.wav"
#sound = AudioSegment.from_mp3(src)
#sound.export(dst, format="wav")
def speech_recognize_continuous_from_file():
speech_config = speechsdk.SpeechConfig(subscription="***", region="***")
speech_config.speech_recognition_language = "nl-NL"
audio_config = speechsdk.audio.AudioConfig(filename="Telefoongesprek #****.wav")
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
done = False
def stop_cb(evt):
print('CLOSING on {}'.format(evt))
nonlocal done
done = True
all_results = []
def handle_final_result(evt):
all_results.append(evt.result.text)
#speech_recognizer.recognizing.connect(handle_final_result)
speech_recognizer.recognized.connect(handle_final_result)
speech_recognizer.session_started.connect(handle_final_result)
speech_recognizer.session_stopped.connect(handle_final_result)
speech_recognizer.canceled.connect(handle_final_result)
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)
speech_recognizer.start_continuous_recognition()
while not done:
time.sleep(.5)
speech_recognizer.stop_continuous_recognition()
print(all_results)
speech_recognize_continuous_from_file()
Thats the code im using without all the keys and encryption, and everthing works apart from the convert from MP3 to Wav.
is there any way I can save the requested file locally in this script and pass it through in:
audio_config = speechsdk.audio.AudioConfig(filename="Telefoongesprek #****.wav"). or do I have to save it to the pc and do it another way.
I have been stuck on this problem for over a week and have tried many different ways.
Thanks in advance!
Beau van der Meer
You should be able to save the response data ( you can access the raw bytes with r.content) to a .mp3 file locally and then pass that file path to pydub.
with open('path/to/local/file.mp3', 'wb') as f:
f.write(r.content)
Another option is to use the module io.BytesIO from the standard library.
If you pass it raw bytes, e g import io; f = io.BytesIO(r.content), it will give you a object that behaves like an open filehandle back, which you can pass to functions accepting files. I didn't check that pydub method you are trying to use accepts filehandles or only paths, so you have to check that first.
I am using Cloud speech to text api to convert audio file to text file. I am executing it using python, Below is code.
import io
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="D:\\Sentiment_Analysis\\My Project 59503-717155d6fb4a.json"
# Imports the Google Cloud client library
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
# Instantiates a client
client = speech.SpeechClient()
# The name of the audio file to transcribe
file_name = os.path.join(os.path.dirname('D:\CallADoc_VoiceImplementation\audioclip154173607416598.amr'),'CallADoc_VoiceImplementation','audioclip154173607416598.amr')
# Loads the audio into memory
with io.open(file_name, 'rb') as audio_file: content = audio_file.read()
audio = types.RecognitionAudio(content=content)
config = types.RecognitionConfig(encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,sample_rate_hertz=16000,language_code='en-IN')
# Detects speech in the audio file
response = client.recognize(config, audio)
for result in response.results: print('Transcript: {}'.format(result.alternatives[0].transcript))
When i execute the sample/tested audio file in the name "audio.raw", the audio is converting and result is like below.
runfile('C:/Users/sandesh.p/CallADoc/GoogleSpeechtoText.py', wdir='C:/Users/sandesh.p/CallADoc')
Transcript: how old is the Brooklyn Bridge
But for same code, i am recording a audio and try to convert, it is giving empty result like below:
runfile('C:/Users/sandesh.p/CallADoc/GoogleSpeechtoText.py', wdir='C:/Users/sandesh.p/CallADoc')
I am trying to fix this from past 2 days and please help me to resolve this.
Try following the troubleshooting steps to have your audio with the appropriate settings.
For instance, your audio file will have the following settings, which are required to have better results:
Encoding: FLAC
Channels: 1 # 16-bit
Sampleratehertz: 16000Hz
The following is my code (I made some slight changes to the original example code):
import io
import os
# Imports the Google Cloud client library
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
# Instantiates a client
client = speech.SpeechClient()
# The name of the audio file to transcribe
file_name = os.path.join(
os.path.dirname(__file__),
'C:\\Users\\louie\\Desktop',
'TOEFL2.mp3')
# Loads the audio into memory
with io.open(file_name, 'rb') as audio_file:
content = audio_file.read()
audio = types.RecognitionAudio(content=content)
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
language_code='en-US')
# Detects speech in the audio file
response = client.recognize(config, audio)
for result in response.results:
print('Transcript: {}'.format(result.alternatives[0].transcript))
text_file = open("C:\\Users\\louie\\Desktop\\Output.txt", "w")
text_file.write('Transcript: {}'.format(result.alternatives[0].transcript))
text_file.close()
I can only directly run this code in my windows prompt command since otherwise, the system cannot know the GOOGLE_APPLICATION_CREDENTIALS. However, when I run the code, nothing happened. I followed all the steps and I could see the request traffic changed on my console. But I cannot see any transcript. Could someone help me out?
You are trying to decode TOEFL2.mp3 file encoded as MP3 while you specify LINEAR audio encoding with
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16
You have to convert mp3 to wav first, see information about AudioEncoding
I'm starting to know how to use google APIs modifying the python example code of the texttospeech API I found an issue, when I use ssml languaje in a txt file to pass the text to the API the resultant mp3 audio changed the character 'é' with the sentence 'derechos de autor' and the character 'á' with a silence. That only happens when I read the text from file, if i provide the ssml sentence direct to the applicacion by argunment when calling it this change doesn't happens.
I searched for this issue and I didn't find it, colud anyone give a hint of that is going on here?
This is the function that takes the ssml texto from the console, and creates the correct mp3 audio file:
def synthesize_ssml(ssml, output):
from google.cloud import texttospeech as texttospeech
client = texttospeech.TextToSpeechClient()
input_text = texttospeech.types.SynthesisInput(ssml=ssml)
voice = texttospeech.types.VoiceSelectionParams(language_code='es-ES')
audio_config = texttospeech.types.AudioConfig(
audio_encoding=texttospeech.enums.AudioEncoding.MP3)
response = client.synthesize_speech(input_text, voice, audio_config)
with open(output, 'wb') as out:
out.write(response.audio_content)
print('Audio content written to file "%s"' % output)
And this is the function that takes the ssml from a file, the same text, produce different audio files:
def synthesize_ssml_file(input, output):
from google.cloud import texttospeech as texttospeech
with open(input,'r') as inp:
input_text=texttospeech.types.SynthesisInput(ssml=str(inp.read()))
client = texttospeech.TextToSpeechClient()
voice = texttospeech.types.VoiceSelectionParams(language_code='es-ES')
audio_config = texttospeech.types.AudioConfig(
audio_encoding=texttospeech.enums.AudioEncoding.MP3)
response = client.synthesize_speech(input_text, voice, audio_config)
with open(output, 'wb') as out:
out.write(response.audio_content)
print('Audio content written to file "%s"' % output)