Efficient speaker diarization - python

I am running a VM instance on google cloud. My goal is to apply speaker diarization to several .wav files stored on cloud buckets.
I have tried the following alternatives, each with its own problems:
Speaker diarization with Google's Speech-to-Text API. This runs fast, but the results make no sense at all. I have already seen similar issues reported, and I opened a thread myself but got no answer. The output returns at most two speakers, with seemingly random labels. Here is the code I tried in Python:
from google.cloud import speech_v1p1beta1 as speech
from google.cloud import storage
import os
import json
import sys

storage_client = storage.Client()
client = speech.SpeechClient()

if "--channel" in sys.argv:
    index = sys.argv.index("--channel") + 1
    if index < len(sys.argv):
        channel = sys.argv[index]
        print("Channel:", channel)
    else:
        print("--channel option requires a value")

audio_folder = f'audio_{channel}'
# channel='tve'
transcript_folder = f'transcript_output'
bucket = storage_client.bucket(audio_folder)
bucket2 = storage_client.bucket(transcript_folder)
wav_files = [i.name for i in bucket.list_blobs()]
json_files = [i.name.split(f'{channel}/')[-1] for i in bucket2.list_blobs(prefix=channel)]

for file in wav_files:
    if not file.endswith('.wav'):
        continue
    transcript_name = file.replace('.wav', '.json')
    if transcript_name in json_files:
        continue
    gcs_uri = f"gs://{audio_folder}/{file}"
    # gcs_uri = f"gs://{audio_folder}/out2.wav"
    audio = speech.RecognitionAudio(uri=gcs_uri)
    diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        # max_speaker_count=10,
    )
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        # sample_rate_hertz=8000,
        language_code="es-ES",
        diarization_config=diarization_config,
        # audio_channel_count=2,
    )
    print("Waiting for operation to complete...")
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result()
    result = response.results[-1]
    # print(result)
    # print(type(result))
    with open(transcript_name, 'w') as f:
        json.dump(str(result), f)
    # transcript_name=file.replace('.wav','.txt')
    # result = response.results[-1]
    # with open(transcript_name,'w') as f:
    #     f.write(result)
    os.system(f'gsutil cp {transcript_name} gs://transcript_output/{channel}')
    os.remove(transcript_name)
    print(f'File {file} processed.')
No matter how max_speaker_count or min_speaker_count are changed, the results stay the same.
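For what it's worth, with the v1p1beta1 API the speaker tags are only attached to the word list of the last result's first alternative, not to the per-result transcripts; a minimal sketch of pulling them out, reusing the response object from the loop above:

words = response.results[-1].alternatives[0].words
for w in words:
    # each WordInfo carries the recognized word and the speaker it was assigned to
    print(f"speaker {w.speaker_tag}: {w.word}")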
pyannote:
As the above did not work, I decided to try pyannote. Its output quality is very good, but there is one problem: it is extremely slow. For a 30-minute wav file it takes more than 3 hours to finish the diarization.
Here is my code:
# import packages
import os
from datetime import datetime
import pandas as pd
from pyannote.audio import Pipeline
from pyannote.audio import Model
from pyannote.core.json import dump
from pyannote.core.json import load
from pyannote.core.json import loads
from pyannote.core.json import load_from
import subprocess
from pyannote.database.util import load_rttm
from google.cloud import speech_v1p1beta1 as speech
from google.cloud import storage
import sys

# channel='a3'
storage_client = storage.Client()

if "--channel" in sys.argv:
    index = sys.argv.index("--channel") + 1
    if index < len(sys.argv):
        channel = sys.argv[index]
        print("Channel:", channel)
    else:
        print("--channel option requires a value")

audio_folder = f'audio_{channel}'
transcript_folder = f'transcript_{channel}'
bucket = storage_client.bucket(audio_folder)
bucket2 = storage_client.bucket(transcript_folder)
wav_files = [i.name for i in bucket.list_blobs()]
rttm_files = [i.name for i in bucket2.list_blobs()]

token = "XXX"
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization#2.1",
                                    use_auth_token=token)
# this loads the model
model = Model.from_pretrained("pyannote/segmentation",
                              use_auth_token=token)

for file in wav_files:
    if not file.endswith('.wav'):
        continue
    rttm_name = file.replace('.wav', '.rttm')
    if rttm_name in rttm_files:
        continue
    if '2023' not in file:
        continue
    print(f'Doing file {file}')
    gcs_uri = f"gs://{audio_folder}/{file}"
    os.system(f'gsutil cp {gcs_uri} {file}')
    diarization = pipeline(file)
    with open(rttm_name, "w") as rttm:
        diarization.write_rttm(rttm)
    os.system(f'gsutil cp {rttm_name} gs://transcript_{channel}/{rttm_name}')
    os.remove(file)
    os.remove(rttm_name)
I am running this with Python 3.9 on a VM instance with an NVIDIA T4 GPU.
Is this normal? I've seen that pyannote.audio is somewhat slow, around a 1x real-time factor, but this is far slower than that, given that, in theory, it should be running on a dedicated GPU.
Are there any faster alternatives? Is there any way to improve the code or size the VM so that it runs faster?
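Before redesigning anything, it may be worth confirming that the pipeline actually runs on the T4; a minimal sketch, assuming a recent pyannote.audio release that exposes Pipeline.to() and reusing the pipeline object from the code above:

import torch

print(torch.cuda.is_available())      # should print True on the T4 instance
print(torch.cuda.get_device_name(0))  # e.g. "Tesla T4"

# Move the whole pipeline to the GPU explicitly; if it was silently running
# on the CPU, this alone can bring the runtime back to roughly real time.
pipeline.to(torch.device("cuda"))
diarization = pipeline(file)

If torch.cuda.is_available() returns False, the bottleneck is the CUDA/driver setup on the VM rather than pyannote itself.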

Related

Reading multiple files from Google Storage using Python client asynchronously

I am trying to read a list of files uploaded to a Google Storage bucket and load them into a file/buffer so that I can perform some aggregation on them.
So far, I am able to read the contents of all the files in a serial manner (one blob object at a time from the iterator that lists all the files in the bucket). However, there are thousands of files uploaded to Google Cloud Storage, and even just reading these files takes a considerable amount of time.
from google.cloud import storage
import json
import time
import multiprocessing
from multiprocessing import Pool, Manager

cpu_count = multiprocessing.cpu_count()
manager = Manager()
finalized_list = manager.list()

# Explicitly use service account credentials by specifying the private key file.
storage_client = storage.Client.from_service_account_json('.serviceAccountCredentials.json')
bucket_name = "bucket-name"

def list_blobs():
    blobs = storage_client.list_blobs(bucket_name)
    return blobs

def read_blob(blob):
    bucket = storage_client.bucket(bucket_name)
    blob_object = bucket.blob(blob)
    with blob_object.open("r") as f:
        converted_string = f.read()
        print(converted_string)
        finalized_list.append(converted_string)

def main():
    start_time = time.time()
    print("Start time: ", start_time)
    pool = Pool(processes=cpu_count)
    blobs = list_blobs()
    pool.map(read_blob, [blob for blob in blobs])
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Time taken: ", elapsed_time, " seconds")

if __name__ == "__main__":
    main()
As in the above code snippet, I thought of using multiprocessing in Python to read each blob object in the bucket. However, since the blob objects returned by Google Cloud Storage are not standard iterator/list objects, I am getting an error that says Pickling client objects is not explicitly supported.
Is there any other way I could use to fetch and read thousands of files from Cloud Storage quickly with a Python script?
Here is a solution I did a while ago with concurrent.futures.ProcessPoolExecutor (I had a CPU-heavy task; you can just as well use concurrent.futures.ThreadPoolExecutor if you're mostly waiting for a return).
from google.cloud import storage
# multi CPU
import concurrent.futures
# progress bar
from tqdm import tqdm

bucket_name = 'your_bucket'
path_to_folder = 'your_path_to_the_files'
file_ending = '.pkl'

kwargs_bucket = {
    'bucket_or_name': bucket_name,
    #'max_results': 60,  # comment if you want to run it on all files
    'prefix': path_to_folder
}
kwargs_process_pool = {
    #'max_workers': 1  # comment if you want full speed
}

# a list to store the output
results = []

# connect to the bucket
client = storage.Client()
bucket = client.get_bucket(bucket_name)

# multi CPU OCR
futures = []
# progress bar
with tqdm(total=sum(1 for blob in client.list_blobs(**kwargs_bucket) if blob.name.endswith(file_ending)), position=0, leave=True) as pbar:
    # ProcessPoolExecutor
    with concurrent.futures.ProcessPoolExecutor(**kwargs_process_pool) as executor:
        # getting all the files from the bucket
        for blob in client.list_blobs(**kwargs_bucket):
            # skip the folder
            if not blob.name.endswith(file_ending):
                continue
            # calling your_function with the ProcessPoolExecutor
            futures.append(executor.submit(your_function, blob.name))
        # updating the progress bar and checking the return
        for future in concurrent.futures.as_completed(futures):
            pbar.update(1)
            if future.result() != '':
                results.append(future.result())
I figured out the hard way that you should only pass plain values, not client objects, to your_function through the executor; that's why I'm passing blob.name.
Hope that helps
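Since the answer above mentions ThreadPoolExecutor for I/O-bound work, here is a minimal thread-based sketch under the assumption that the blobs are small text/JSON files (the bucket name is a placeholder); threads avoid the pickling problem entirely because nothing has to be sent to a child process:

from concurrent.futures import ThreadPoolExecutor
from google.cloud import storage

client = storage.Client()
bucket = client.bucket("bucket-name")  # placeholder bucket name

def read_blob(name):
    # download the whole object into memory and decode it
    return bucket.blob(name).download_as_bytes().decode("utf-8")

names = [b.name for b in client.list_blobs("bucket-name")]
with ThreadPoolExecutor(max_workers=32) as pool:
    contents = list(pool.map(read_blob, names))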

Python Lambda function using boto3 uploads 0 bytes image file to s3

My use case is that I'm trying to take a screenshot of a view in Tableau, and save that screenshot in a bucket in s3. This is done through a Lambda function written in Python. The Lambda is assigned full access rights to s3 and is connected to the Internet.
Everything essentially works: there are no issues with access rights to s3, a connection to the Tableau account can be established, and a file is uploaded to s3. No errors are thrown when the code is tested. There is one issue though: the saved file is an empty 0-byte file.
Here's the code:
import logging
import traceback
import os
import requests
from datetime import datetime, timezone
import pytz
import json
from dotenv import load_dotenv
import tableauserverclient as TSC
from slack.web.client import WebClient
from slack.errors import SlackApiError
import boto3
import nest_asyncio

nest_asyncio.apply()

def lambda_handler(event, context):
    def Tableau2Slack():
        try:
            # Tableau environment variables
            tabAccount=os.environ['tabAccount'],
            tabPass=os.environ['tabPass'],
            tabDomain=os.environ['tabDomain'],
            tabServer=os.environ['tabServer'],
            tabView1=os.environ['tabView1'],
            tabPath1=os.environ['tabPath1']

            s3 = boto3.client('s3')
            bucket = os.environ['bucket']

            # Let's connect to Tableau
            print("Talking to Tableau...\n")
            tableau_auth = TSC.TableauAuth(tabAccount, tabPass, tabDomain)
            server = TSC.Server(tabServer)

            # Searching Tableau Online account for View1
            with server.auth.sign_in(tableau_auth):
                server.use_server_version()
                req_option = TSC.RequestOptions()
                req_option.filter.add(TSC.Filter(TSC.RequestOptions.Field.Name,
                                                 TSC.RequestOptions.Operator.Equals, tabView1))
                all_views, pagination_item = server.views.get(req_option)

                # Error catching for bad View names
                if not all_views:
                    raise LookupError("View with the specified name was not found.")

                view_item = all_views[0]
                image_req_option = TSC.ImageRequestOptions(imageresolution=TSC.ImageRequestOptions.Resolution.High, maxage=1)
                server.views.populate_image(view_item, image_req_option)
                print("Image saved in temporary folder...\n")

                date = datetime.utcnow().strftime('%Y_%m_%d')

                # Save bytes as image
                with open('/tmp' + tabPath1, "wb") as image_file1:
                    s3.upload_file('/tmp' + tabPath1, bucket, date + '_' + tabPath1)
                print("Tableau image successfully saved to s3 as {0}".format(tabPath1), '\n')

        # Tableau try statement error handling
        except:
            traceback.print_exc()

    Tableau2Slack()
    return print('Success!')
I suspect that something goes wrong where the file is opened and then uploaded to s3, but I can't figure out what.
Running the same code locally, but instead of...
with open('/tmp/' + tabPath1, "wb") as image_file1:
    s3.upload_file('/tmp/' + tabPath1, bucket, date + '_' + tabPath1)
...replacing it with...
with open(tabPath1, "wb") as image_file1:
    image_file1.write(view_item.image)
...saves a proper file of about 250 kb.
Any idea what could be going on? I'm out of ideas...
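For what it's worth, in the Lambda version the temp file is opened but nothing is ever written to it before the upload, and '/tmp' + tabPath1 is also missing the path separator that the local version has. A fix along these lines seems likely; a sketch, reusing the names from the code above and assuming view_item.image holds the PNG bytes after populate_image():

tmp_path = '/tmp/' + tabPath1  # matches the local version; '/tmp' + tabPath1 drops the separator

# write the image bytes first, then upload the now non-empty file
with open(tmp_path, "wb") as image_file1:
    image_file1.write(view_item.image)
s3.upload_file(tmp_path, bucket, date + '_' + tabPath1)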

Get all transcript results using the google Speech-to-text API

I would like to know if it is possible to get all the possible transcripts that Google can generate from a given audio file. As you can see, it currently only returns the transcript with the highest matching score.
from google.cloud import speech
import os
import io

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = ''

# Creates google client
client = speech.SpeechClient()

# Full path of the audio file, Replace with your file name
file_name = os.path.join(os.path.dirname(__file__), "test2.wav")

# Loads the audio file into memory
with io.open(file_name, "rb") as audio_file:
    content = audio_file.read()

audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    audio_channel_count=1,
    language_code="en-gb"
)

# Sends the request to google to transcribe the audio
response = client.recognize(request={"config": config, "audio": audio})
print(response.results)

# Reads the response
for result in response.results:
    print("Transcript: {}".format(result.alternatives[0].transcript))
On your RecognitionConfig(), set a value to max_alternatives. When this is set greater than 1, it will show the other possible transcriptions.
max_alternatives int
Maximum number of recognition hypotheses to be returned. Specifically,
the maximum number of SpeechRecognitionAlternative messages within
each SpeechRecognitionResult. The server may return fewer than
max_alternatives. Valid values are 0-30. A value of 0
or 1 will return a maximum of one. If omitted, will return a
maximum of one.
Update your RecognitionConfig() to the code below:
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    audio_channel_count=1,
    language_code="en-gb",
    max_alternatives=10  # place a value between 0 - 30
)
I tested this using the sample audio from the github repo of Speech API. I used code below for testing:
from google.cloud import speech
import os
import io

# Creates google client
client = speech.SpeechClient()

# Full path of the audio file, Replace with your file name
file_name = os.path.join(os.path.dirname(__file__), "audio.raw")

# Loads the audio file into memory
with io.open(file_name, "rb") as audio_file:
    content = audio_file.read()

audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    audio_channel_count=1,
    language_code="en-us",
    max_alternatives=10  # used 10 for testing
)

# Sends the request to google to transcribe the audio
response = client.recognize(request={"config": config, "audio": audio})

for result in response.results:
    print(result.alternatives)
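If you also want to see how the alternatives rank, each SpeechRecognitionAlternative carries a transcript and a confidence value (confidence may only be populated for the top alternative); a short sketch of printing both:

for result in response.results:
    for i, alternative in enumerate(result.alternatives):
        # confidence can be 0.0 for alternatives other than the first one
        print(f"#{i}: {alternative.transcript} (confidence: {alternative.confidence})")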

How can I run a python file inside another python file?

I'm having a difficult time trying to run a python file within another python file as a module. The program I am trying to run inside the other python file works fine on its own. However, when I import it as a module it does not do anything and does not even give me an error code.
This is the first code. The file name is speech2text.py
def mainprogram():
    import os
    import sys
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\Users\taiki\PycharmProjects\startup\stelarvision-280712-c709366612cc.json"
    filepath = "./"
    output_filepath = "./"
    bucketname = "stelarvision2020"
    sys.path.append("/users/taiki/appdata/local/packages/pythonsoftwarefoundation.python.3.8_qbz5n2kfra8p0/localcache/local-packages/python38/site-packages")
    sys.path.append("/Users/taiki/AppData/Local/Programs/Python/Python38-32/Lib/site-packages")
    from pydub import AudioSegment
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    import wave
    from google.cloud import storage

    def stereo_to_mono(audio_file_name):
        sound = AudioSegment.from_wav(audio_file_name)
        sound = sound.set_channels(1)
        sound.export(audio_file_name, format="wav")

    def frame_rate_channel(audio_file_name):
        with wave.open(audio_file_name, "rb") as wave_file:
            frame_rate = wave_file.getframerate()
            channels = wave_file.getnchannels()
            return frame_rate, channels

    def upload_blob(bucket_name, source_file_name, destination_blob_name):
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(bucket_name)
        blob = bucket.blob(destination_blob_name)
        blob.upload_from_filename(source_file_name)

    def delete_blob(bucket_name, blob_name):
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.delete()

    def google_transcribe(audio_file_name):
        file_name = filepath + audio_file_name
        frame_rate, channels = frame_rate_channel(file_name)
        if channels > 1:
            stereo_to_mono(file_name)
        bucket_name = bucketname
        source_file_name = filepath + audio_file_name
        destination_blob_name = audio_file_name
        upload_blob(bucket_name, source_file_name, destination_blob_name)
        gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
        transcript = ''
        client = speech.SpeechClient()
        audio = types.RecognitionAudio(uri=gcs_uri)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=frame_rate,
            language_code='en-US',
            enable_automatic_punctuation=True)
        operation = client.long_running_recognize(config, audio)
        response = operation.result(timeout=10000)
        for result in response.results:
            transcript += result.alternatives[0].transcript
        delete_blob(bucket_name, destination_blob_name)
        return transcript

    def write_transcripts(transcript_filename, transcript):
        f = open(output_filepath + transcript_filename, "w+")
        f.write(transcript)
        f.close()

    if __name__ == "__main__":
        audio_file_name = "sample_music.wav"
        transcript = google_transcribe(audio_file_name)
        transcript_filename = audio_file_name.split('.')[0] + '.txt'
        write_transcripts(transcript_filename, transcript)

mainprogram()
This is the second python file
def run():
    import speech2text
    speech2text.mainprogram()

run()
When I execute the second file, it doesn't do anything.
The part if __name__ == "__main__": is only executed when you execute that script directly, i.e. python speech2text.py.
To make your program work, delete that if-statement.
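As the answer above says, simply deleting that guard works; for reference, the more common layout keeps the guard at module level instead of inside the function, roughly like this (hypothetical file contents):

# speech2text.py
def mainprogram():
    ...  # transcription logic goes here

if __name__ == "__main__":
    # runs only for `python speech2text.py`, never on import
    mainprogram()

# second file
import speech2text

speech2text.mainprogram()  # runs no matter how this file is started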
The structure of your Python files is a little strange. It's better to import sys and os at the top of the file rather than inside a function.
Also, your indentation has problems. In my opinion, you'd be better off using an IDE like PyCharm to automatically check for syntax errors.
First, you should put the imports outside the function, as that is more conventional. You should also check that the two files are in the same folder.
Second, this part will not work when it is called from outside:
if __name__ == "__main__":
    audio_file_name = "sample_music.wav"
    transcript = google_transcribe(audio_file_name)
    transcript_filename = audio_file_name.split('.')[0] + '.txt'
    write_transcripts(transcript_filename, transcript)
That guard exists precisely so the block does not run automatically when the file is called from another program, which is basically what you are doing now.
Third, about the last line, mainprogram(), in the first file: I am not sure what you mean here, since you already have a call to that function in the second program, speech2text.mainprogram(), after importing.
Maybe only one of these three points is the actual problem, not all of them, so please do try and test each one!
Suppose your second .py file is named two and the first one is named one. To run two inside one, add the line import two to one, and call its function with two.run(). Your file two should look like this.
To learn basic Python, visit my YouTube channel TechieBaar.

How to load a model saved in joblib file from Google Cloud Storage bucket

I want to load a model which is saved as a joblib file from Google Cloud Storage bucket. When it is in local path, we can load it as follows (considering model_file is the full path in system):
loaded_model = joblib.load(model_file)
How can we do the same task with Google Cloud Storage?
For anyone googling around for an answer to this: here are two more options besides the obvious one, which is to use the Google AI Platform for model hosting (and online predictions).
Option 1 is to use TemporaryFile like this:
from google.cloud import storage
from sklearn.externals import joblib
from tempfile import TemporaryFile

storage_client = storage.Client()
bucket_name = <bucket name>
model_bucket = 'model.joblib'
bucket = storage_client.get_bucket(bucket_name)
# select bucket file
blob = bucket.blob(model_bucket)
with TemporaryFile() as temp_file:
    # download blob into temp file
    blob.download_to_file(temp_file)
    temp_file.seek(0)
    # load into joblib
    model = joblib.load(temp_file)
    # use the model
    model.predict(...)
Option 2 is to use BytesIO like this:
from google.cloud import storage
from sklearn.externals import joblib
from io import BytesIO

storage_client = storage.Client()
bucket_name = <bucket name>
model_bucket = 'model.joblib'
bucket = storage_client.get_bucket(bucket_name)
# select bucket file
blob = bucket.blob(model_bucket)
# download blob into an in-memory file object
model_file = BytesIO()
blob.download_to_file(model_file)
model_file.seek(0)
# load into joblib
model = joblib.load(model_file)
An alternative answer as of 2020, using TF2:
import joblib
import tensorflow as tf
gcs_path = 'gs://yourpathtofile'
loaded_model = joblib.load(tf.io.gfile.GFile(gcs_path, 'rb'))
I found using gcsfs to be the fastest (and most compact) method to use:
import gcsfs
import joblib

def load_joblib(bucket_name, file_name):
    fs = gcsfs.GCSFileSystem()
    with fs.open(f'{bucket_name}/{file_name}') as f:
        return joblib.load(f)
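Called, for example, like this (the bucket, file, and data names are placeholders):

model = load_joblib("my-bucket", "model.joblib")
predictions = model.predict(X_test)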
I don't think that's possible, at least not in a direct way. I thought about a workaround, but it might not be as efficient as you want.
By using the Google Cloud Storage client libraries [1] you can download the model file first, load it, and, when your program ends, delete it. Of course, this means you need to download the file every time you run the code. Here is a snippet:
from google.cloud import storage
from sklearn.externals import joblib

storage_client = storage.Client()
bucket_name = <bucket name>
model_bucket = 'model.joblib'
model_local = 'local.joblib'
bucket = storage_client.get_bucket(bucket_name)
# select bucket file
blob = bucket.blob(model_bucket)
# download that file and name it 'local.joblib'
blob.download_to_filename(model_local)
# load that file from local file
job = joblib.load(model_local)
For folks who are Googling around with this problem - here's another option. The open source modelstore library is a wrapper that deals with the process of saving, uploading, and downloading models from Google Cloud Storage.
Under the hood, it saves scikit-learn models using joblib, creates a tar archive with the files, and up/downloads them from a Google Cloud Storage bucket using blob.upload_from_file() and blob.download_to_filename().
In practice it looks a bit like this (a full example is here):
# Create modelstore instance
import os
from modelstore import ModelStore

modelstore = ModelStore.from_gcloud(
    os.environ["GCP_PROJECT_ID"],   # Your GCP project ID
    os.environ["GCP_BUCKET_NAME"],  # Your Cloud Storage bucket name
)

# Train and upload a model (this currently works with 9 different ML frameworks)
model = train()  # Replace with your code to train a model
meta_data = modelstore.sklearn.upload("my-model-domain", model=model)

# ... and later when you want to download it
model_path = modelstore.download(
    local_path="/path/to/a/directory",
    domain="my-model-domain",
    model_id=meta_data["model"]["model_id"],
)
The full documentation is here.
This is the shortest way I found so far:
import joblib
from google.cloud import storage

client = storage.Client()
bucket = client.get_bucket("my-gcs-bucket")
blob = bucket.blob("model.joblib")
with blob.open(mode="rb") as file:
    model = joblib.load(file)
