I have the code below, which has three functions: zip_extract, get_file_path and data_restructure.
The code should first execute zip_extract, which extracts any zip files found in the GCP bucket and does not return anything.
Next it should execute get_file_path, which traverses the entire bucket, collects the paths of all the files present in it into a list, and returns that list to data_restructure.
data_restructure takes each file path in that list and checks whether the file is DICOM or not: if the file is DICOM it is stored under one structure in the destination bucket, and if it is not DICOM it is stored under a different hierarchy in the destination bucket.
I wrote a dataflow pipeline for this code as below:
with beam.Pipeline(options=pipeline_options) as p:
file_paths = (p | "Get File Paths" >> beam.Create(get_file_path()))
file_paths | "Data Restructure" >> beam.Map(lambda x: data_restructure(x))
but this produces the following error message in the Dataflow log:
"The Dataflow job appears to be stuck because no worker activity has been seen in the last 1h. Please check the worker logs in Stackdriver Logging. You can also get help with Cloud Dataflow at https://cloud.google.com/dataflow/support."
Main code:
def zip_extract():
    '''
    Function to unzip every ".zip" archive found under data_folder in the
    landing bucket.

    Each archive member is uploaded to the same bucket under a folder named
    after the archive without its ".zip" suffix. Blobs that are not zip
    archives are left untouched. A failure on one archive is logged and does
    not stop the remaining archives from being processed. Returns nothing.
    '''
    from google.cloud import storage
    client = storage.Client()
    try:
        bucket = client.bucket(landing_bucket)
        # Snapshot the listing first: the loop uploads new objects under the
        # same prefix and those must not be picked up while iterating.
        blobs_specific = list(bucket.list_blobs(prefix=data_folder))
        for archive_blob in blobs_specific:
            if pathlib.Path(archive_blob.name).suffix != ".zip":
                continue
            try:
                zipbytes = io.BytesIO(archive_blob.download_as_string())
                if not is_zipfile(zipbytes):
                    logging.info('Skipping : %s is not a valid zip archive.', archive_blob.name)
                    continue
                # Strip only the trailing suffix; str.replace would also alter
                # a ".zip" that appears earlier in the object path.
                target_folder = archive_blob.name[:-len(".zip")]
                with ZipFile(zipbytes, 'r') as myzip:
                    for contentfilename in myzip.namelist():
                        if contentfilename.endswith("/"):
                            # Directory entry: it has no content to upload.
                            continue
                        member_blob = bucket.blob(f'{target_folder}/{contentfilename}')
                        member_blob.upload_from_string(myzip.read(contentfilename))
                logging.info("Unzip completed")
            except Exception:
                # Best effort per archive: record the real error and move on.
                logging.exception('Skipping : could not extract %s', archive_blob.name)
                continue
    finally:
        client.close()
def get_file_path():
    '''
    Function to store all the file paths present in landing bucket into a list.

    Runs zip_extract() first so that archive contents are included, then
    lists every object under data_folder and returns the objects as
    "gs://<landing_bucket>/<object name>" strings. Object names ending in
    "/" are skipped because they name a folder placeholder, not a file.
    Listing errors are logged and re-raised.
    '''
    zip_extract()
    from google.cloud import storage
    client = storage.Client()
    try:
        file_paths = [
            "gs://{}/".format(landing_bucket) + blob.name
            for blob in client.bucket(landing_bucket).list_blobs(prefix=data_folder)
            if not blob.name.endswith("/")
        ]
    except Exception as err:
        logging.error("Error while appending data to list : {}".format(err))
        raise
    finally:
        client.close()
    logging.info("List is ready with data")
    return file_paths
def data_restructure(line):
    '''
    params line: String which has the file path ("gs://<bucket>/<object>")
    Function to read each file and check if it is a DICOM file or not, if yes,
    store it in Modality/Study/Series/SOP hierarchy else store it in the
    Descriptive or Reports folder in Intermediate bucket.

    The object is downloaded into a private temporary folder, so concurrent
    calls on the same worker cannot pick up each other's files, and the folder
    is removed again when the call finishes. Returns nothing.
    '''
    import tempfile
    from google.cloud import storage

    # "gs://bucket/path/to/file" -> ("bucket", "path/to/file")
    object_path = line[len("gs://"):] if line.startswith("gs://") else line
    source_bucket_name, _, source_blob_name = object_path.partition("/")
    file_name = source_blob_name.split("/")[-1]
    if not file_name:
        # A name ending in "/" is a folder placeholder with nothing to copy.
        logging.info("Skipping {} : not a file".format(line))
        return

    client = storage.Client()
    try:
        destination_bucket = client.bucket(intermediate_bucket)
        with tempfile.TemporaryDirectory() as local_folder:
            local_path = os.path.join(local_folder, file_name)
            source_blob = client.bucket(source_bucket_name).blob(source_blob_name)
            source_blob.download_to_filename(local_path)

            # Only the DICOM parsing is guarded: anything dcmread rejects, or
            # that lacks one of the four identifying elements, is treated as
            # a non-DICOM file. Upload errors are no longer mistaken for that.
            try:
                dicom_data = dcmread(local_path)
                logging.info("Started reading Dicom file")
                InstanceUID = {}
                for element in dicom_data:
                    if element.name in ("Study Instance UID", "Series Instance UID", "SOP Instance UID", "Modality"):
                        InstanceUID[element.name] = element.value
                destination_name = 'Client/Test/DICOM/{}/{}/{}/{}.dcm'.format(
                    InstanceUID["Modality"],
                    InstanceUID["Study Instance UID"],
                    InstanceUID["Series Instance UID"],
                    InstanceUID["SOP Instance UID"])
                done_message = "DICOM file {} uploaded into Intermediate Bucket".format(file_name)
            except Exception:
                file_extension = file_name.split(".")[-1]
                if file_extension != "zip" and "report" not in file_name and file_extension != "":
                    destination_name = 'Test/Descriptive/{}'.format(file_name)
                    done_message = "Stored file into Descriptive folder"
                else:
                    destination_name = 'Test/Reports/{}'.format(file_name)
                    done_message = "Stored Report file into Reports folder"

            destination_bucket.blob(destination_name).upload_from_filename(local_path)
            logging.info(done_message)
    finally:
        client.close()
def call_main():
    """Configure the Dataflow pipeline options and run the restructuring pipeline.

    Unknown command-line arguments are forwarded to Beam as pipeline options.
    The pipeline creates one element per path returned by get_file_path() and
    applies data_restructure to each element.
    """
    cli_parser = argparse.ArgumentParser()
    path_args, pipeline_args = cli_parser.parse_known_args()
    pipeline_options = PipelineOptions(pipeline_args)

    # Packaging of this module and its dependencies for the workers.
    setup_view = pipeline_options.view_as(SetupOptions)
    setup_view.setup_file = './setup.py'
    setup_view.save_main_session = True

    # Project, job identity and storage locations.
    gcp_view = pipeline_options.view_as(GoogleCloudOptions)
    gcp_view.project = project_id
    timestamp_suffix = re.sub("[^0-9]+", "-", str(datetime.datetime.now()))
    gcp_view.job_name = "dataflow" + timestamp_suffix
    gcp_view.service_account_email = "service_email"
    gcp_view.staging_location = config["staging_location"]
    gcp_view.temp_location = config["temp_location"]
    gcp_view.region = config["region"]

    pipeline_options.view_as(StandardOptions).runner = "DataflowRunner"

    # Worker pool sizing and networking.
    worker_view = pipeline_options.view_as(WorkerOptions)
    worker_view.num_workers = 2
    worker_view.machine_type = "n1-standard-2"
    worker_view.disk_size_gb = 1024
    worker_view.network = vpc_name
    worker_view.subnetwork = f'regions/{config["region"]}/subnetworks/{subnet_name}'
    worker_view.use_public_ips = False

    with beam.Pipeline(options=pipeline_options) as p:
        (
            p
            | "Get File Paths" >> beam.Create(get_file_path())
            | "Data Restructure" >> beam.Map(lambda x: data_restructure(x))
        )


if __name__ == '__main__':
    call_main()
setup.py file:
import setuptools

# Packages installed on the workers at start-up.
# - apache-beam is not listed: the job already ships the SDK version used at
#   submission time, and pinning a different one here can break worker start-up.
# - uuid is not listed: it is part of the Python standard library, so the
#   PyPI package of the same name is not needed.
REQUIRED_PACKAGES = [
    'google-cloud-datastore==1.15.3',
    'google-cloud-storage==1.16.1',
    'google-api-core==1.33.2',
    'google-cloud-core==1.7.3',
    'google-cloud-logging==1.15.1',
    'pydicom==2.3.1',
    'google-cloud-secret-manager',
    'psycopg2-binary',
]

setuptools.setup(
    name='Installing Packages',
    version='1.0.0',
    install_requires=REQUIRED_PACKAGES,
    packages=setuptools.find_packages())
I'm new to apache_beam and dataflow. Please help me with this.
I tried other ways of writing the dataflow pipeline but nothing worked.
Please correct me If I had done anything wrong here.
Kindly tell me if the way I wrote transformations are right or not. If not, please help me the right way.I'm stuck with this not able to progress.
Thanks in advance
This error
The Dataflow job appears to be stuck because no worker activity has
been seen in the last 1h. Please check the worker logs in Stackdriver
Logging. You can also get help with Cloud Dataflow at
https://cloud.google.com/dataflow/support."
usually indicates a problem with dependency installation on the workers, not a problem with the transforms.
You can debug this by looking at the worker startup logs in Cloud Logging. You are likely to see pip errors while installing the dependencies.
You can try other forms of dependency management (https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/); custom containers are less error-prone.
As a side note, there is no need to pin the Beam SDK version. It is picked up automatically, and pinning one version while using a different version locally can cause errors.
I am running a VM instance on google cloud. My goal is to apply speaker diarization to several .wav files stored on cloud buckets.
I have tried the following alternatives with the subsequent problems:
Speaker diarization with Google's API. This runs fast, but the results make no sense at all. I have seen similar issues reported and opened a thread myself, but got no answer. The output only ever contains a maximum of two speakers, with seemingly random labels. Here is the Python code I tried:
from google.cloud import speech_v1p1beta1 as speech
from google.cloud import storage
import os
import json
import sys
storage_client = storage.Client()
client = speech.SpeechClient()

# The channel name selects the audio bucket and the transcript prefix, so the
# script cannot continue without it.
if "--channel" not in sys.argv:
    sys.exit("--channel option is required")
index = sys.argv.index("--channel") + 1
if index >= len(sys.argv):
    sys.exit("--channel option requires a value")
channel = sys.argv[index]
print("Channel:", channel)

audio_folder = f'audio_{channel}'
transcript_folder = 'transcript_output'
bucket = storage_client.bucket(audio_folder)
bucket2 = storage_client.bucket(transcript_folder)
wav_files = [i.name for i in bucket.list_blobs()]
# Transcripts already present under "<channel>/"; a set gives O(1) lookups.
json_files = {i.name.split(f'{channel}/')[-1] for i in bucket2.list_blobs(prefix=channel)}

for file in wav_files:
    if not file.endswith('.wav'):
        continue
    transcript_name = file.replace('.wav', '.json')
    if transcript_name in json_files:
        continue
    gcs_uri = f"gs://{audio_folder}/{file}"
    audio = speech.RecognitionAudio(uri=gcs_uri)
    diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        #max_speaker_count=10,
    )
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        #sample_rate_hertz=8000,
        language_code="es-ES",
        diarization_config=diarization_config,
        #audio_channel_count = 2,
    )
    print("Waiting for operation to complete...")
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result()
    if not response.results:
        # Nothing was recognised (e.g. silence); there is no last result.
        print(f'File {file} produced no results, skipped. ')
        continue
    result = response.results[-1]
    # Upload to an explicit "<channel>/<name>" object. Copying to the bare
    # "gs://transcript_output/<channel>" path only lands inside that prefix
    # when the prefix already exists.
    transcript_blob = bucket2.blob(f'{channel}/{transcript_name}')
    transcript_blob.upload_from_string(json.dumps(str(result)), content_type='application/json')
    print(f'File {file} processed. ')
No matter how max_speaker_count or min_speaker_count are changed, the results are the same.
pyannote:
As the above did not work, I decided to try pyannote. Its diarization quality is very good, but there is one problem: it is extremely slow. For a 30-minute wav file it takes more than 3 hours to finish the diarization.
Here is my code:
#import packages
import os
from datetime import datetime
import pandas as pd
from pyannote.audio import Pipeline
from pyannote.audio import Model
from pyannote.core.json import dump
from pyannote.core.json import load
from pyannote.core.json import loads
from pyannote.core.json import load_from
import subprocess
from pyannote.database.util import load_rttm
from google.cloud import speech_v1p1beta1 as speech
from google.cloud import storage
import sys
# channel='a3'
storage_client = storage.Client()

# The channel name selects both buckets, so the script cannot continue
# without it.
if "--channel" not in sys.argv:
    sys.exit("--channel option is required")
index = sys.argv.index("--channel") + 1
if index >= len(sys.argv):
    sys.exit("--channel option requires a value")
channel = sys.argv[index]
print("Channel:", channel)

audio_folder = f'audio_{channel}'
transcript_folder = f'transcript_{channel}'
bucket = storage_client.bucket(audio_folder)
bucket2 = storage_client.bucket(transcript_folder)
wav_files = [i.name for i in bucket.list_blobs()]
rttm_files = {i.name for i in bucket2.list_blobs()}
token = "XXX"
# The pipeline version is selected with "@<version>" after the model id.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                    use_auth_token=token)
# NOTE(review): `model` is not used by the loop below - confirm whether this
# extra load is needed at all.
model = Model.from_pretrained("pyannote/segmentation",
                              use_auth_token=token)
# NOTE(review): depending on the installed pyannote.audio version the pipeline
# may have to be moved to the GPU explicitly - confirm it really runs on the T4.

for file in wav_files:
    if not file.endswith('.wav'):
        continue
    rttm_name = file.replace('.wav', '.rttm')
    if rttm_name in rttm_files:
        continue
    if '2023' not in file:
        continue
    print(f'Doing file {file}')
    gcs_uri = f"gs://{audio_folder}/{file}"
    try:
        # Argument lists avoid shell interpretation of object names, and
        # check=True stops on a failed copy instead of diarizing a missing file.
        subprocess.run(['gsutil', 'cp', gcs_uri, file], check=True)
        diarization = pipeline(file)
        with open(rttm_name, "w") as rttm:
            diarization.write_rttm(rttm)
        subprocess.run(['gsutil', 'cp', rttm_name, f'gs://transcript_{channel}/{rttm_name}'], check=True)
    finally:
        # Remove the local copies even when a step above failed.
        for leftover in (file, rttm_name):
            if os.path.exists(leftover):
                os.remove(leftover)
I am running this with python3.9 on a VM instance with GPU NVIDIA-T4.
Is this normal? I have read that pyannote.audio is fairly slow, running at roughly 1x real time, but what I see is far slower than that, even though in theory it should be running on a dedicated GPU.
Are there any faster alternatives? Is there any way to improve the code, or to configure the VM, that might increase the speed?
I already tried this code to convert my large wav file to text
import speech_recognition as sr
# Read the whole of hello_world.wav into memory and pass it to
# recognize_google; any failure is printed instead of raised.
r = sr.Recognizer()
with sr.AudioFile('hello_world.wav') as source:
    audio = r.record(source)
try:
    print("Text: " + r.recognize_google(audio))
except Exception as e:
    print("Exception: " + str(e))
But it does not convert the audio accurately; I suspect the reason is the 'US' accent.
Please tell me how I can convert a whole large wav file accurately.
Google's speech to text is very effective, try the below link,
https://cloud.google.com/speech-to-text/
You can choose the language (English US in your case) and also upload files.
As @bigdataolddriver commented, 100% accuracy is not possible yet, and would be worth millions.
Google Speech-to-Text has three types of API:
synchronous, asynchronous and streaming. The asynchronous API can convert up to ~480 minutes of audio, while the others only handle ~1 minute. The following sample code does the conversion.
filepath = "~/audio_wav/" #Input audio file path
output_filepath = "~/Transcripts/" #Final transcript path
bucketname = "callsaudiofiles" #Name of the bucket created in the step before
# Import libraries
from pydub import AudioSegment
import io
import os
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
import wave
from google.cloud import storage
Speech-to-Text supports wav files with LINEAR16 or MULAW encoded audio.
Below is the code to get the frame rate and the number of channels.
def frame_rate_channel(audio_file_name):
    """Return ``(frame_rate, channels)`` read from the header of a WAV file."""
    wav_reader = wave.open(audio_file_name, "rb")
    try:
        return wav_reader.getframerate(), wav_reader.getnchannels()
    finally:
        wav_reader.close()
and the code below does the asynchronous conversion.
def google_transcribe(audio_file_name):
    """Transcribe one WAV file from ``filepath`` with the asynchronous API.

    The file is uploaded to ``bucketname`` with upload_blob, recognised with
    long_running_recognize as en-US LINEAR16 audio at the file's own frame
    rate, and the uploaded copy is deleted again with delete_blob, also when
    recognition fails. When the file has more than one channel,
    stereo_to_mono is called on it first.

    Returns the concatenation of the top alternative of every result.
    """
    # "~" is expanded here because neither wave.open nor the upload does it.
    file_name = os.path.expanduser(filepath + audio_file_name)

    frame_rate, channels = frame_rate_channel(file_name)
    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    destination_blob_name = audio_file_name
    upload_blob(bucket_name, file_name, destination_blob_name)
    try:
        gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
        client = speech.SpeechClient()
        audio = types.RecognitionAudio(uri=gcs_uri)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=frame_rate,
            language_code='en-US')
        # Detects speech in the audio file
        operation = client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=10000)
        transcript = ''.join(
            result.alternatives[0].transcript for result in response.results)
    finally:
        # Never leave the temporary upload behind in the bucket.
        delete_blob(bucket_name, destination_blob_name)
    return transcript
and this is how you write them to file
def write_transcripts(transcript_filename, transcript):
    """Write *transcript* to ``output_filepath + transcript_filename``.

    An existing file is overwritten. "~" in the path is expanded, and the
    file is closed even if the write fails.
    """
    target_path = os.path.expanduser(output_filepath + transcript_filename)
    with open(target_path, "w") as f:
        f.write(transcript)
Kindly let me know if you need any further clarifications.
I have created a Python program (see below) that moves files to AWS S3. It expects an input value. It works well from PyCharm, but when I call the script from the command prompt it does not pick up the input value. Here is my code:
import os
import sys
import boto3
from botocore.client import Config
import configparser
import re
import os.path
## Initialize the Parameters
def initconfig(input):
    """Load the upload settings from CONFIG_AIRBILLING.conf into module globals.

    ``input`` is the name of the config section holding SRC_DIR, FILENAME and
    TARGET_DIR; the credentials and bucket name come from the ACCESS section.
    configparser raises if a section or option is missing.
    """
    global REMOTE_DIR,ACCESS_KEY_ID,ACCESS_SECRET_KEY,BUCKET_NAME,TARGET_DIR,FILENAME,SRC_DIR,data,File
    config = configparser.ConfigParser()
    # Read through a context manager so the file handle is closed again.
    with open('CONFIG_AIRBILLING.conf') as config_file:
        config.read_file(config_file)
    print('Code Name is :' + input)
    ACCESS_KEY_ID = config.get('ACCESS', 'ACCESS_KEY_ID')
    print('ACCESS_ID_IS:' + ACCESS_KEY_ID)
    ACCESS_SECRET_KEY = config.get('ACCESS', 'ACCESS_SECRET_KEY')
    BUCKET_NAME = config.get('ACCESS', 'BUCKET_NAME')
    SRC_DIR = config.get(input, 'SRC_DIR')
    FILENAME = config.get(input, 'FILENAME')
    TARGET_DIR = config.get(input, 'TARGET_DIR')
    # NOTE(review): the uploaded file is hard-coded here even though FILENAME
    # and SRC_DIR are read from the config - confirm which one is intended.
    File = 'demo.txt'
## This function will make sure file exist in Source directory
def readstatus():
    """Check that ``File`` can be opened for reading, then upload it.

    If the file cannot be opened, an error line is printed and the OSError
    is re-raised. Only the open call is guarded, so a failure inside
    movefiles is not reported as a missing file.
    """
    try:
        with open(File, 'r'):
            pass
    except OSError:
        print('***Error:File Not Found or Accessible***')
        raise
    movefiles('True')
## This function will move the files to AWS S3 bucket
def movefiles(result):
    """Upload ``File`` to ``BUCKET_NAME`` under ``TARGET_DIR`` when *result* is 'True'.

    Any other value of *result* makes the call a no-op.
    """
    if result == 'True':
        s3 = boto3.resource(
            's3',
            aws_access_key_id=ACCESS_KEY_ID,
            aws_secret_access_key=ACCESS_SECRET_KEY,
            config=Config(signature_version='s3v4')
        )
        # Send the file's bytes: passing the name itself as Body would store
        # the file name string as the object's content.
        with open(File, 'rb') as body:
            s3.Bucket(BUCKET_NAME).put_object(Key=TARGET_DIR + '/' + File, Body=body)
        print('***File Moved***')
print("Done")
initconfig("ABC")
readstatus()
The code above runs fine in PyCharm because I can edit the argument passed to initconfig there. But when I run it from the command prompt, it does not pick up the parameter value I pass. Here is how I am passing the value; please help me fix this.
From Command Prompt
python move_files_to_S3 "TEST"
You want sys.argv, which is a list with all parameters passed through the command line (but notice sys.argv[0] is the name of the script itself).
You'd also do good to check for __name__ == '__main__' to distinguish between when your code is called from the python interpreter through the command line (as in your example at the end of your post) and when it is imported from another module:
if __name__ == '__main__':
    # sys.argv[0] is the script name; the config section is the first real argument.
    if len(sys.argv) < 2:
        sys.exit('usage: python move_files_to_S3 <CONFIG_SECTION>')
    initconfig(sys.argv[1])
    readstatus()
I want to copy a file in s3 bucket using python.
Example: I have a bucket named "test", and in the bucket I have two folders named "dump" and "input". Now I want to copy a file from a local directory to the S3 "dump" folder using Python. Can anyone help me?
NOTE: This answer uses boto. See the other answer that uses boto3, which is newer.
Try this...
import boto
import boto.s3
import sys
from boto.s3.key import Key
AWS_ACCESS_KEY_ID = ''
AWS_SECRET_ACCESS_KEY = ''

bucket_name = AWS_ACCESS_KEY_ID.lower() + '-dump'
conn = boto.connect_s3(AWS_ACCESS_KEY_ID,
                       AWS_SECRET_ACCESS_KEY)
bucket = conn.create_bucket(bucket_name,
                            location=boto.s3.connection.Location.DEFAULT)

testfile = "replace this with an actual filename"
# Function-call form of print is valid on both Python 2 and Python 3.
print('Uploading %s to Amazon S3 bucket %s' % (testfile, bucket_name))


def percent_cb(complete, total):
    """Progress callback: write one dot per invocation."""
    sys.stdout.write('.')
    sys.stdout.flush()


k = Key(bucket)
k.key = 'my test file'
k.set_contents_from_filename(testfile,
                             cb=percent_cb, num_cb=10)
[UPDATE]
I am not a pythonist, so thanks for the heads up about the import statements.
Also, I'd not recommend placing credentials inside your own source code. If you are running this inside AWS use IAM Credentials with Instance Profiles (http://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles.html), and to keep the same behaviour in your Dev/Test environment, use something like Hologram from AdRoll (https://github.com/AdRoll/hologram)
import boto3
# Upload a local file to the key "dump/file" of bucket "test".
BUCKET = "test"
s3 = boto3.resource('s3')
target_bucket = s3.Bucket(BUCKET)
target_bucket.upload_file(Filename="your/local/file", Key="dump/file")
No need to make it that complicated:
s3_connection = boto.connect_s3()
bucket = s3_connection.get_bucket('your bucket name')
key = boto.s3.key.Key(bucket, 'some_file.zip')
# Binary mode: a zip archive must be sent byte-for-byte, not decoded as text.
with open('some_file.zip', 'rb') as f:
    key.send_file(f)
Upload file to s3 within a session with credentials.
import boto3
# Create a session with explicit credentials and upload one file through
# the low-level client that backs the S3 resource.
session = boto3.Session(
    aws_access_key_id='AWS_ACCESS_KEY_ID',
    aws_secret_access_key='AWS_SECRET_ACCESS_KEY',
)
s3 = session.resource('s3')
upload_arguments = {
    'Filename': 'input_file_path',  # local file to upload
    'Bucket': 'bucket_name',        # bucket to upload to
    'Key': 's3_output_key',         # S3 object name (can contain subdirectories)
}
s3.meta.client.upload_file(**upload_arguments)
I used this and it is very simple to implement
import tinys3
conn = tinys3.Connection('S3_ACCESS_KEY', 'S3_SECRET_KEY', tls=True)
# The context manager closes the file once the upload call returns.
with open('some_file.zip', 'rb') as f:
    conn.upload('some_file.zip', f, 'my_bucket')
https://www.smore.com/labs/tinys3/
from boto3.s3.transfer import S3Transfer
import boto3
#have all the variables populated which are required below
# access_key, secret_key, filepath, bucket_name, folder_name and filename
# must already be populated before this runs.
client = boto3.client(
    's3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
)
transfer = S3Transfer(client)
object_key = folder_name + "/" + filename
transfer.upload_file(filepath, bucket_name, object_key)
This is a three liner. Just follow the instructions on the boto3 documentation.
import boto3
# Upload one local file through the client behind the S3 resource.
s3 = boto3.resource(service_name='s3')
s3_client = s3.meta.client
s3_client.upload_file(
    Filename='C:/foo/bar/baz.filetype',
    Bucket='yourbucketname',
    Key='baz.filetype',
)
Some important arguments are:
Parameters:
Filename (str) -- The path to the file to upload.
Bucket (str) -- The name of the bucket to upload to.
Key (str) -- The name that you want to assign to your file in your S3 bucket. This could be the same as the name of the file or a different name of your choice, but the file type should remain the same.
Note: I assume that you have saved your credentials in a ~\.aws folder as suggested in the best configuration practices in the boto3 documentation.
This will also work:
import os
import boto
import boto.s3.connection
from boto.s3.key import Key
try:
    conn = boto.s3.connect_to_region(
        'us-east-1',
        aws_access_key_id='AWS-Access-Key',
        aws_secret_access_key='AWS-Secrete-Key',
        # host = 's3-website-us-east-1.amazonaws.com',
        # is_secure=False,  # uncomment if you are not using ssl
        calling_format=boto.s3.connection.OrdinaryCallingFormat(),
    )
    bucket = conn.get_bucket('YourBucketName')
    key_name = 'FileToUpload'
    path = 'images/holiday'  # Directory under which the file should be uploaded
    # S3 keys always use "/"; os.path.join would insert "\" on Windows.
    full_key_name = path + '/' + key_name
    k = bucket.new_key(full_key_name)
    k.set_contents_from_filename(key_name)
except Exception as e:
    # "except ... as" and print() are valid on both Python 2.6+ and Python 3.
    print(str(e))
    print("error")
Using boto3
import logging
import boto3
from botocore.exceptions import ClientError
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """
    # Fall back to the local file name when no object name was given.
    target_key = file_name if object_name is None else object_name

    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(file_name, bucket, target_key)
    except ClientError as e:
        logging.error(e)
        return False
    else:
        return True
For more:-
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html
import boto
from boto.s3.key import Key
AWS_ACCESS_KEY_ID = ''
AWS_SECRET_ACCESS_KEY = ''
END_POINT = ''                          # eg. us-east-1
S3_HOST = ''                            # eg. s3.us-east-1.amazonaws.com
BUCKET_NAME = 'test'
FILENAME = 'upload.txt'
# The key may include folders; they do not have to exist beforehand.
UPLOADED_FILENAME = 'dumps/upload.txt'

connection_settings = {
    'aws_access_key_id': AWS_ACCESS_KEY_ID,
    'aws_secret_access_key': AWS_SECRET_ACCESS_KEY,
    'host': S3_HOST,
}
s3 = boto.s3.connect_to_region(END_POINT, **connection_settings)
bucket = s3.get_bucket(BUCKET_NAME)
k = Key(bucket)
k.key = UPLOADED_FILENAME
k.set_contents_from_filename(FILENAME)
To upload a whole folder, see the following example code (and the picture of the resulting S3 folder):
import boto
import boto.s3
import boto.s3.connection
import os.path
import sys
# Fill in info on data to upload
# destination bucket name
bucket_name = 'willie20181121'
# source directory
sourceDir = '/home/willie/Desktop/x/' #Linux Path
# destination directory name (on s3)
destDir = '/test1/' #S3 Path
#max size in bytes before uploading in parts. between 1 and 5 GB recommended
MAX_SIZE = 20 * 1000 * 1000
#size of parts when uploading in parts
PART_SIZE = 6 * 1000 * 1000
access_key = 'MPBVAQ*******IT****'
secret_key = '11t63yDV***********HgUcgMOSN*****'
conn = boto.connect_s3(
    aws_access_key_id = access_key,
    aws_secret_access_key = secret_key,
    host = '******.org.tw',
    is_secure=False,               # remove this line if you are using ssl
    calling_format = boto.s3.connection.OrdinaryCallingFormat(),
    )
bucket = conn.create_bucket(bucket_name,
        location=boto.s3.connection.Location.DEFAULT)

# Only the files directly inside sourceDir are uploaded (no recursion);
# a missing directory yields an empty list.
_, _, uploadFileNames = next(os.walk(sourceDir), (sourceDir, [], []))


def percent_cb(complete, total):
    """Progress callback: write one dot per invocation."""
    sys.stdout.write('.')
    sys.stdout.flush()


for filename in uploadFileNames:
    sourcepath = os.path.join(sourceDir, filename)
    # S3 keys always use "/", whatever the local path separator is.
    destpath = destDir.rstrip('/') + '/' + filename
    print ('Uploading %s to Amazon S3 bucket %s' % \
           (sourcepath, bucket_name))
    filesize = os.path.getsize(sourcepath)
    if filesize > MAX_SIZE:
        print ("multipart upload")
        mp = bucket.initiate_multipart_upload(destpath)
        try:
            with open(sourcepath, 'rb') as fp:
                fp_num = 0
                while fp.tell() < filesize:
                    fp_num += 1
                    print ("uploading part %i" % fp_num)
                    mp.upload_part_from_file(fp, fp_num, cb=percent_cb, num_cb=10, size=PART_SIZE)
        except Exception:
            # Abort the upload so the parts already sent are not left behind.
            mp.cancel_upload()
            raise
        mp.complete_upload()
    else:
        print ("singlepart upload")
        k = boto.s3.key.Key(bucket)
        k.key = destpath
        k.set_contents_from_filename(sourcepath,
                cb=percent_cb, num_cb=10)
PS: For more reference URL
If you have the aws command line interface installed on your system you can make use of pythons subprocess library.
For example:
import subprocess
def copy_file_to_s3(source: str, target: str, bucket: str):
    """Copy the local file *source* to ``s3://{bucket}/{target}`` with the AWS CLI.

    Returns the ``subprocess.CompletedProcess`` of the ``aws s3 cp`` call so
    the caller can check ``returncode``; a failed copy does not raise.
    The arguments are passed as a list, so no shell quoting is involved.
    """
    return subprocess.run(["aws", "s3", "cp", source, f"s3://{bucket}/{target}"])
Similarly, you can use the same approach for all sorts of AWS client operations, such as downloading or listing files. It is also possible to get return values. This way there is no need to import boto3. I guess the CLI is not intended to be used that way, but in practice I find it quite convenient. You also get the status of the upload displayed in your console, for example:
Completed 3.5 GiB/3.5 GiB (242.8 MiB/s) with 1 file(s) remaining
To modify the method to your wishes I recommend having a look into the subprocess reference as well as to the AWS Cli reference.
Note: This is a copy of my answer to a similar question.
I have something that seems to me has a bit more order:
import boto3
from pprint import pprint
from botocore.exceptions import NoCredentialsError
class S3(object):
    """Small wrapper that uploads files into a fixed folder of one S3 bucket."""

    # Name of the target bucket.
    BUCKET = "test"
    # boto3 S3 resource, or None when it could not be created.
    connection = None

    def __init__(self):
        """Create the S3 resource; on any failure print it and leave connection as None."""
        try:
            credentials = get_s3_credentials("aws")
            # NOTE(review): assumes get_s3_credentials returns a mapping with
            # these two keys - confirm against its definition. The keys must
            # be passed by keyword: the 2nd and 3rd positional parameters of
            # boto3.resource are region_name and api_version.
            self.connection = boto3.resource(
                's3',
                aws_access_key_id=credentials['aws_access_key_id'],
                aws_secret_access_key=credentials['aws_secret_access_key'],
            )
        except Exception as error:
            print(error)
            self.connection = None

    def upload_file(self, file_to_upload_path, file_name):
        """Upload a local file to ``your-folder-inside-s3/<file_name>``.

        :param file_to_upload_path: path of the local file to upload
        :param file_name: object name to use inside the S3 folder
        :return: True when the upload succeeded; False when an argument is
            None, no connection is available, the local file does not exist
            or no credentials were found
        """
        if file_to_upload_path is None or file_name is None:
            return False
        if self.connection is None:
            # __init__ failed to build the resource; nothing can be uploaded.
            print("Credentials not available")
            return False
        try:
            pprint(file_to_upload_path)
            file_name = "your-folder-inside-s3/{0}".format(file_name)
            self.connection.Bucket(self.BUCKET).upload_file(file_to_upload_path,
                                                            file_name)
            print("Upload Successful")
            return True
        except FileNotFoundError:
            print("The file was not found")
            return False
        except NoCredentialsError:
            print("Credentials not available")
            return False
There are three important names here: the BUCKET constant, file_to_upload_path and file_name.
BUCKET: the name of your S3 bucket
file_to_upload_path: the path of the local file you want to upload
file_name: the resulting file name and path in your bucket (this is where you add folders or whatever else you need)
There's many ways but you can reuse this code in another script like this
import S3
def some_function():
    """Upload ``path_to_file`` through a fresh S3 helper as ``final_file_name``."""
    uploader = S3.S3()
    uploader.upload_file(path_to_file, final_file_name)
You should set the content type as well, to avoid problems when the file is accessed.
import os
image = 'fly.png'
s3_filestore_path = 'images/fly.png'
filename, file_extension = os.path.splitext(image)
content_type_dict = {".png": "image/png", ".html": "text/html",
                     ".css": "text/css", ".js": "application/javascript",
                     ".jpg": "image/jpeg", ".gif": "image/gif",
                     ".jpeg": "image/jpeg"}
# Extensions are matched case-insensitively; unknown ones fall back to the
# generic binary type instead of raising KeyError.
content_type = content_type_dict.get(file_extension.lower(), "application/octet-stream")
s3 = boto3.client('s3', config=boto3.session.Config(signature_version='s3v4'),
                  region_name='ap-south-1',
                  aws_access_key_id=S3_KEY,
                  aws_secret_access_key=S3_SECRET)
# Send the file's bytes: passing the file name as Body would store the name
# string as the object's content.
with open(image, 'rb') as image_file:
    s3.put_object(Body=image_file, Bucket=S3_BUCKET, Key=s3_filestore_path, ContentType=content_type)
# NOTE(review): xmlstr is built but the upload below sends "SAMPLE TEXT" -
# confirm which content is meant to be stored.
xmlstr = etree.tostring(listings, encoding='utf8', method='xml')
conn = boto.connect_s3(
    aws_access_key_id = access_key,
    aws_secret_access_key = secret_key,
    # host = '<bucketName>.s3.amazonaws.com',
    host = 'bycket.s3.amazonaws.com',
    #is_secure=False,               # uncomment if you are not using ssl
    calling_format = boto.s3.connection.OrdinaryCallingFormat(),
    )
conn.auth_region_name = 'us-west-1'
bucket = conn.get_bucket('resources', validate=False)
# new_key builds the key object locally; get_key returns None when the object
# does not exist yet, which would make the next line fail.
key = bucket.new_key('filename.txt')
key.set_contents_from_string("SAMPLE TEXT")
key.set_canned_acl('public-read')
A lot of the existing answers here are pretty complex. A simple approach is to use cloudpathlib, which wraps boto3.
First, be sure to be authenticated properly with an ~/.aws/credentials file or environment variables set. See more options in the cloudpathlib docs.
This is how you would upload a file:
from pathlib import Path
from cloudpathlib import CloudPath
# write a local file that we will upload:
Path("test_file.txt").write_text("hello")
#> 5
# upload that file to S3 (same bucket and key that are read back below)
CloudPath("s3://mybucket/testsfile.txt").upload_from("test_file.txt")
#> S3Path('s3://mybucket/testsfile.txt')
# read it back from s3
CloudPath("s3://mybucket/testsfile.txt").read_text()
#> 'hello'
Note, that you could write to the cloud path directly using the normal write_text, write_bytes, or open methods as well.
I modified your example slightly, dropping some imports and the progress to get what I needed for a boto example.
import boto.s3
from boto.s3.key import Key
AWS_ACCESS_KEY_ID = 'your-access-key-id'
AWS_SECRET_ACCESS_KEY = 'your-secret-access-key'

# The bucket name is derived from the access key id.
bucket_name = '{}-form13'.format(AWS_ACCESS_KEY_ID.lower())
conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
bucket = conn.create_bucket(
    bucket_name,
    location=boto.s3.connection.Location.DEFAULT,
)

# The local file is stored under a key equal to its file name.
filename = 'embedding.csv'
k = Key(bucket)
k.key = filename
k.set_contents_from_filename(filename)
Here's a boto3 example as well:
import boto3
ACCESS_KEY = 'your-access-key'
SECRET_KEY = 'your-secret-key'

# The object is stored under the same name as the local file.
file_name = 'embedding.csv'
object_name = file_name
bucket_name = f"{ACCESS_KEY.lower()}-form13"

s3 = boto3.client(
    's3',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)
s3.create_bucket(Bucket=bucket_name)
s3.upload_file(Filename=file_name, Bucket=bucket_name, Key=object_name)