I have been using google text to speech API because of how great the voices are. The only problem is been trying to find how to make it user friendly. The biggest thing is google text to speech can only accept text files with 5000 or fewer characters. The main issue that, I have been finding is that currently all I can do is use a single text file copy and paste my stuff on there before saving. Does anyone know how can I upload a folder filled with text files to make it quicker? Plus also saving the mp3 instead of overwriting them?
# [START tts_ssml_address_imports]
from google.cloud import texttospeech
import os
import html
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] =
# [END tts_ssml_address_imports]
# [START tts_ssml_address_audio]
def ssml_to_audio(ssml_text, outfile):
# Generates SSML text from plaintext.
#
# Given a string of SSML text and an output file name, this function
# calls the Text-to-Speech API. The API returns a synthetic audio
# version of the text, formatted according to the SSML commands. This
# function saves the synthetic audio to the designated output file.
#
# Args:
# ssml_text: string of SSML text
# outfile: string name of file under which to save audio output
#
# Returns:
# nothing
# Instantiates a client
client = texttospeech.TextToSpeechClient()
# Sets the text input to be synthesized
synthesis_input = texttospeech.types.SynthesisInput(text=ssml_text)
# Builds the voice request, selects the language code ("en-US") and
# the SSML voice gender ("MALE")
voice = texttospeech.types.VoiceSelectionParams(language_code='en-US',
name="en-US-Wavenet-D",
ssml_gender=texttospeech.enums.SsmlVoiceGender.MALE))
# Selects the type of audio file to return
audio_config = texttospeech.types.AudioConfig(audio_encoding="LINEAR16", pitch = 0, speaking_rate = 0.9)
# Performs the text-to-speech request on the text input with the selected
# voice parameters and audio file type
response = client.synthesize_speech(synthesis_input, voice, audio_config)
# Writes the synthetic audio to the output file.
with open(outfile, 'wb') as out:
out.write(response.audio_content)
print('Audio content written to file ' + outfile)
# [END tts_ssml_address_audio]
def main():
# test example address file
file = 'input_text.txt'
with open(file, 'r') as f:
text = f.read()
ssml_text = text
ssml_to_audio(ssml_text, 'file_output_speech.mp3')
# [END tts_ssml_address_test]
if __name__ == '__main__':
main()
Related
I would like to know if it is possible to get all the possible transcripts that google can generate from a given audio file, as you can see it is only giving the transcript that has the higher matching result.
from google.cloud import speech
import os
import io
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = ''
# Creates google client
client = speech.SpeechClient()
# Full path of the audio file, Replace with your file name
file_name = os.path.join(os.path.dirname(__file__),"test2.wav")
#Loads the audio file into memory
with io.open(file_name, "rb") as audio_file:
content = audio_file.read()
audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
audio_channel_count=1,
language_code="en-gb"
)
# Sends the request to google to transcribe the audio
response = client.recognize(request={"config": config, "audio": audio})
print(response.results)
# Reads the response
for result in response.results:
print("Transcript: {}".format(result.alternatives[0].transcript))
On your RecognitionConfig(), set a value to max_alternatives. When this is set greater than 1, it will show the other possible transcriptions.
max_alternatives int
Maximum number of recognition hypotheses to be returned. Specifically,
the maximum number of SpeechRecognitionAlternative messages within
each SpeechRecognitionResult. The server may return fewer than
max_alternatives. Valid values are 0-30. A value of 0
or 1 will return a maximum of one. If omitted, will return a
maximum of one.
Update your RecognitionConfig() to the code below:
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
audio_channel_count=1,
language_code="en-gb",
max_alternatives=10 # place a value between 0 - 30
)
I tested this using the sample audio from the github repo of Speech API. I used code below for testing:
from google.cloud import speech
import os
import io
# Creates google client
client = speech.SpeechClient()
# Full path of the audio file, Replace with your file name
file_name = os.path.join(os.path.dirname(__file__),"audio.raw")
#Loads the audio file into memory
with io.open(file_name, "rb") as audio_file:
content = audio_file.read()
audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
audio_channel_count=1,
language_code="en-us",
max_alternatives=10 # used 10 for testing
)
# Sends the request to google to transcribe the audio
response = client.recognize(request={"config": config, "audio": audio})
for result in response.results:
print(result.alternatives)
Output:
The following is my code (I made some slight changes to the original example code):
import io
import os
# Imports the Google Cloud client library
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
# Instantiates a client
client = speech.SpeechClient()
# The name of the audio file to transcribe
file_name = os.path.join(
os.path.dirname(__file__),
'C:\\Users\\louie\\Desktop',
'TOEFL2.mp3')
# Loads the audio into memory
with io.open(file_name, 'rb') as audio_file:
content = audio_file.read()
audio = types.RecognitionAudio(content=content)
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
language_code='en-US')
# Detects speech in the audio file
response = client.recognize(config, audio)
for result in response.results:
print('Transcript: {}'.format(result.alternatives[0].transcript))
text_file = open("C:\\Users\\louie\\Desktop\\Output.txt", "w")
text_file.write('Transcript: {}'.format(result.alternatives[0].transcript))
text_file.close()
I can only directly run this code in my windows prompt command since otherwise, the system cannot know the GOOGLE_APPLICATION_CREDENTIALS. However, when I run the code, nothing happened. I followed all the steps and I could see the request traffic changed on my console. But I cannot see any transcript. Could someone help me out?
You are trying to decode TOEFL2.mp3 file encoded as MP3 while you specify LINEAR audio encoding with
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16
You have to convert mp3 to wav first, see information about AudioEncoding
I'm starting to know how to use google APIs modifying the python example code of the texttospeech API I found an issue, when I use ssml languaje in a txt file to pass the text to the API the resultant mp3 audio changed the character 'é' with the sentence 'derechos de autor' and the character 'á' with a silence. That only happens when I read the text from file, if i provide the ssml sentence direct to the applicacion by argunment when calling it this change doesn't happens.
I searched for this issue and I didn't find it, colud anyone give a hint of that is going on here?
This is the function that takes the ssml texto from the console, and creates the correct mp3 audio file:
def synthesize_ssml(ssml, output):
from google.cloud import texttospeech as texttospeech
client = texttospeech.TextToSpeechClient()
input_text = texttospeech.types.SynthesisInput(ssml=ssml)
voice = texttospeech.types.VoiceSelectionParams(language_code='es-ES')
audio_config = texttospeech.types.AudioConfig(
audio_encoding=texttospeech.enums.AudioEncoding.MP3)
response = client.synthesize_speech(input_text, voice, audio_config)
with open(output, 'wb') as out:
out.write(response.audio_content)
print('Audio content written to file "%s"' % output)
And this is the function that takes the ssml from a file, the same text, produce different audio files:
def synthesize_ssml_file(input, output):
from google.cloud import texttospeech as texttospeech
with open(input,'r') as inp:
input_text=texttospeech.types.SynthesisInput(ssml=str(inp.read()))
client = texttospeech.TextToSpeechClient()
voice = texttospeech.types.VoiceSelectionParams(language_code='es-ES')
audio_config = texttospeech.types.AudioConfig(
audio_encoding=texttospeech.enums.AudioEncoding.MP3)
response = client.synthesize_speech(input_text, voice, audio_config)
with open(output, 'wb') as out:
out.write(response.audio_content)
print('Audio content written to file "%s"' % output)
I am using the Google Cloud Vision API for Python on a small program I'm using. The function is working and I get the OCR results, but I need to format these before being able to work with them.
This is the function:
# Call to OCR API
def detect_text_uri(uri):
"""Detects text in the file located in Google Cloud Storage or on the Web.
"""
client = vision.ImageAnnotatorClient()
image = types.Image()
image.source.image_uri = uri
response = client.text_detection(image=image)
texts = response.text_annotations
for text in texts:
textdescription = (" "+ text.description )
return textdescription
I specifically need to slice the text line by line and add four spaces in the beginning and a line break in the end, but at this moment this is only working for the first line, and the rest is returned as a single line blob.
I've been checking the official documentation but didn't really find out about the format of the response of the API.
You are almost right there. As you want to slice the text line by line, instead of looping the text annotations, try to get the direct 'description' from google vision's response as shown below.
def parse_image(image_path=None):
"""
Parse the image using Google Cloud Vision API, Detects "document" features in an image
:param image_path: path of the image
:return: text content
:rtype: str
"""
client = vision.ImageAnnotatorClient()
response = client.text_detection(image=open(image_path, 'rb'))
text = response.text_annotations
del response # to clean-up the system memory
return text[0].description
The above function returns a string with the content in the image, with the lines separated by "\n"
Now, you can add prefix & suffix as you need to each line.
image_content = parse_image(image_path="path\to\image")
my_formatted_text = ""
for line in image_content.split("\n"):
my_formatted_text += " " + line + "\n"
my_formatted_text is the text you need.
I'm using Google Speech API and since i'm using LongRunning functions for wav files, and they're all in pt-BR language, they're returning with content such as "voc\303\252 hoje boa noite cart\303\243o".
How can I convert this back to UTF-8?
I already tried .encode function, and already tried to check if there's any parameter to send, but I cannot find anything.
# [START def_transcribe_gcs]
def transcribe_gcs(gcs_uri):
"""Asynchronously transcribes the audio file specified by the gcs_uri."""
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
client = speech.SpeechClient()
audio = types.RecognitionAudio(uri=gcs_uri)
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=8000,
language_code='pt-BR')
operation = client.long_running_recognize(config, audio)
print('Waiting for operation to complete...')
response = operation.result(timeout=300)
# Print the first alternative of all the consecutive results.
for result in response.results:
print('Transcript: {}'.format(result.alternatives[0].transcript))
print('Confidence: {}'.format(result.alternatives[0].confidence))
##This part is mine, the rest of the code belongs to Google
file = open("Test.txt", "wb")
file.write(str(response.results))
file.close()
# [END def_transcribe_gcs]