Amazon S3 upload fails using boto + Python

Hi, I am unable to upload a file to S3 using boto. It fails with the error message below. Can someone help me? I am new to Python and boto.
from boto.s3 import connect_to_region
from boto.s3.connection import Location
from boto.s3.key import Key
import boto
import gzip
import os

AWS_KEY = ''
AWS_SECRET_KEY = ''
BUCKET_NAME = 'mybucketname'

conn = connect_to_region(Location.USWest2,
                         aws_access_key_id=AWS_KEY,
                         aws_secret_access_key=AWS_SECRET_KEY,
                         is_secure=False, debug=2)
bucket = conn.lookup(BUCKET_NAME)
bucket2 = conn.lookup('unzipped-data')
rs = bucket.list()
rs2 = bucket2.list()
compressed_files = []
all_files = []
files_to_download = []
downloaded_files = []
path = "~/tmp/"  # note: open() does not expand '~'; this is a literal relative path

# Check if the file has already been decompressed
def filecheck():
    for filename in bucket.list():
        all_files.append(filename.name)
    for n in rs2:
        compressed_files.append(n.name)
    for file_name in all_files:
        # str.strip('.gz') strips any of the characters '.', 'g', 'z' from
        # both ends, so drop the extension with a slice instead
        if file_name.endswith('.gz') and file_name[:-3] in compressed_files:
            pass
        elif '.gz' in file_name and 'indeed' in file_name:
            files_to_download.append(file_name)

# Download necessary files
def download_files():
    for name in rs:
        if name.name in files_to_download:
            file_name = name.name.split('/')
            print('Downloading: ' + name.name)
            name.get_contents_to_filename(path + file_name[-1])
            print(' - Completed')
            # Decompress the file
            print('Decompressing: ' + name.name)
            inF = gzip.open(path + file_name[-1], 'rb')
            outF = open(path + file_name[-1][:-3], 'wb')
            for line in inF:
                outF.write(line)
            inF.close()
            outF.close()
            print(' - Completed')
            # Upload the decompressed file
            print('Uploading: ' + name.name)
            full_key_name = name.name[:-3]
            k = Key(bucket2)
            k.key = full_key_name
            k.set_contents_from_filename(path + file_name[-1][:-3])
            print('Completed')
            # Clean up
            d_list = os.listdir(path)
            for d in d_list:
                os.remove(path + d)

# Function calls
filecheck()
download_files()
Error message:
Traceback (most recent call last):
File "C:\Users\Siddartha.Reddy\workspace\boto-test\com\salesify\sid\decompress_s3.py", line 86, in <module>
download_files()
File "C:\Users\Siddartha.Reddy\workspace\boto-test\com\salesify\sid\decompress_s3.py", line 75, in download_files
k.set_contents_from_filename(path+file_name[-1].strip('.gz'))
File "C:\Python27\lib\site-packages\boto\s3\key.py", line 1362, in set_contents_from_filename
encrypt_key=encrypt_key)
File "C:\Python27\lib\site-packages\boto\s3\key.py", line 1293, in set_contents_from_file
chunked_transfer=chunked_transfer, size=size)
File "C:\Python27\lib\site-packages\boto\s3\key.py", line 750, in send_file
chunked_transfer=chunked_transfer, size=size)
File "C:\Python27\lib\site-packages\boto\s3\key.py", line 951, in _send_file_internal
query_args=query_args
File "C:\Python27\lib\site-packages\boto\s3\connection.py", line 664, in make_request
retry_handler=retry_handler
File "C:\Python27\lib\site-packages\boto\connection.py", line 1070, in make_request
retry_handler=retry_handler)
File "C:\Python27\lib\site-packages\boto\connection.py", line 1029, in _mexe
raise ex
socket.error: [Errno 10053] An established connection was aborted by the software in your host machine
I have no problem downloading the files, but the upload fails for some weird reason.

If the problem is the size of the files (> 5 GB), you should use multipart upload:
http://docs.aws.amazon.com/AmazonS3/latest/dev/mpuoverview.html
Search for multipart_upload in the docs:
http://boto.readthedocs.org/en/latest/ref/s3.html#module-boto.s3.multipart
Also, see this question for a related issue:
How can I copy files bigger than 5 GB in Amazon S3?
The process is a little non-intuitive. You need to (see the sketch after this list):
run initiate_multipart_upload(), storing the returned object
split the file into chunks (either on disk, or read from memory using cStringIO)
feed the parts sequentially into upload_part_from_file()
run complete_upload() on the stored object
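Putting those four steps together, a minimal sketch with boto (untested; the function name, the 50 MB chunk size, and the use of io.BytesIO rather than cStringIO are illustrative; every part except the last must be at least 5 MB):

from io import BytesIO

def upload_multipart(bucket, key_name, file_path, chunk_size=50 * 1024 * 1024):
    # Step 1: initiate the upload, storing the returned object
    mp = bucket.initiate_multipart_upload(key_name)
    try:
        with open(file_path, 'rb') as f:
            part_num = 0
            while True:
                # Step 2: split the file into chunks, read into memory
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                part_num += 1
                # Step 3: feed the parts sequentially into upload_part_from_file()
                mp.upload_part_from_file(BytesIO(chunk), part_num)
        # Step 4: complete the upload on the stored object
        mp.complete_upload()
    except Exception:
        mp.cancel_upload()
        raise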

Related

How do I download file from S3 bucket by parsing the object key as a parameter?

I managed to list the .txt files in an S3 bucket under a specific subfolder prefix. However, when I pass the object key as a variable to download the file, I get an error.
from boto3.session import Session
import botocore
import boto3
import os

filepath = os.path.join("./Documents/AWS")
from subprocess import check_output

# Read the access key file
with open(os.path.join(filepath, "accessKeys.txt"), 'r', encoding='utf-8') as f:
    line = f.readline().strip()
    access_key = line.split(':')[0]
    secret_key = line.split(':')[1]

session = Session(aws_access_key_id=access_key,
                  aws_secret_access_key=secret_key,
                  region_name='eu-west-1')
downloadpath = os.path.join("./Downloads")

# Download file from S3 bucket
s3 = boto3.resource('s3')
# Bucket
bucket = s3.Bucket('terraform-state-181213')
# List objects within the given prefix
objs = list(bucket.objects.filter(Prefix='terraform/'))
obj_key = []
for i in range(0, len(objs)):
    print(objs[i].key)
for file in objs:
    if file.key.endswith('.txt'):
        obj_key.append(file.key)
obj_key = str(obj_key).strip('[]')
print(obj_key)
# Output: 'terraform/oasis_descriptor.txt'

# Download the file, passing the bucket and obj_key as parameters
session.resource('s3').Bucket(bucket).download_file(obj_key, os.path.join(downloadpath, 'test.txt'))
Error:
Traceback (most recent call last):
File "/Users/vinitgohil/Documents/AWS/open_read_s3file.py", line 47, in <module>
s3.Bucket(bucket).download_file(obj_key, os.path.join(downloadpath,'test.txt'))
File "/Users/vinitgohil/Library/Python/3.8/lib/python/site-packages/boto3/s3/inject.py", line 244, in bucket_download_file
return self.meta.client.download_file(
File "/Users/vinitgohil/Library/Python/3.8/lib/python/site-packages/boto3/s3/inject.py", line 170, in download_file
return transfer.download_file(
File "/Users/vinitgohil/Library/Python/3.8/lib/python/site-packages/boto3/s3/transfer.py", line 307, in download_file
future.result()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/s3transfer/futures.py", line 106, in result
return self._coordinator.result()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/s3transfer/futures.py", line 265, in result
raise self._exception
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/s3transfer/tasks.py", line 255, in _main
self._submit(transfer_future=transfer_future, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/s3transfer/download.py", line 340, in _submit
response = client.head_object(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/botocore/client.py", line 316, in _api_call
return self._make_api_call(operation_name, kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/botocore/client.py", line 598, in _make_api_call
request_dict = self._convert_to_request_dict(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/botocore/client.py", line 644, in _convert_to_request_dict
api_params = self._emit_api_params(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/botocore/client.py", line 673, in _emit_api_params
self.meta.events.emit(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/botocore/hooks.py", line 356, in emit
return self._emitter.emit(aliased_event_name, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/botocore/hooks.py", line 228, in emit
return self._emit(event_name, kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/botocore/hooks.py", line 211, in _emit
response = handler(**kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/botocore/handlers.py", line 223, in validate_bucket_name
if not VALID_BUCKET.search(bucket) and not VALID_S3_ARN.search(bucket):
TypeError: expected string or bytes-like object
Here is sample code for downloading an Amazon S3 object.
Resource method:
import boto3
s3_resource = boto3.resource('s3')
# Bucket, key, destination
s3_resource.Object('mybucket', 'hello.txt').download_file('/tmp/hello.txt')
Client method:
import boto3
s3_client = boto3.client('s3')
# Provide bucket name, key, destination
s3_client.download_file('mybucket', 'hello.txt', '/tmp/hello.txt')
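For the question's code specifically, the TypeError comes from passing the Bucket object itself where a bucket name string is expected (validate_bucket_name in the traceback receives a non-string). A minimal sketch of the fix, reusing the question's variables:

# bucket is already a boto3 Bucket resource, so pass its name (a string):
session.resource('s3').Bucket(bucket.name).download_file(
    obj_key, os.path.join(downloadpath, 'test.txt'))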

TypeError: data type not understood when using transient EMR cluster

I am using the following very simple code, which reads csv or parquet files from an S3 bucket and copies them to another S3 bucket.
import os

import boto3
import pandas as pd
import pyarrow.parquet as pq
import s3fs

s3_client = boto3.client('s3')  # assumed; the snippet references s3_client

def read_s3_file_as_raw(bucket_name, path_to_file):
    object = s3_client.get_object(Bucket=bucket_name, Key=path_to_file)
    response_body = object['Body'].read().decode(encoding="utf-8", errors="ignore")
    file_name = os.path.basename(path_to_file)
    print(file_name + ' read from S3 successfully.')
    return response_body

def read_s3_file_as_dataframe(bucket_name, path_to_file):
    '''
    Read a single csv or parquet file on S3 as a dataframe.
    '''
    file_name = os.path.basename(path_to_file)
    if path_to_file.endswith('.csv'):
        object = s3_client.get_object(Bucket=bucket_name, Key=path_to_file)
        df = pd.read_csv(object['Body'])
        print(file_name + ' read from S3 successfully.')
        return df
    elif path_to_file.endswith('.parquet'):
        fs = s3fs.S3FileSystem()
        p_dataset = pq.ParquetDataset(f"s3://{bucket_name}/{path_to_file}", filesystem=fs)
        df = p_dataset.read().to_pandas()
        print(file_name + ' read from S3 successfully.')
        return df
When I run the code on a permanent cluster, EC2, or even my local machine it works perfectly (for both csv and parquet), but when I try to run it through a transient EMR cluster I get the following error (for parquet only; there is no problem with csv files):
File "pyarrow/array.pxi", line 559, in pyarrow.lib._PandasConvertible.to_pandas
File "pyarrow/table.pxi", line 1367, in pyarrow.lib.Table._to_pandas
File "/usr/local/lib64/python3.6/site-packages/pyarrow/pandas_compat.py", line 769, in table_to_blockmanager
return BlockManager(blocks, axes)
File "/usr/local/lib64/python3.6/site-packages/pandas/core/internals/managers.py", line 141, in __init__
self._consolidate_check()
File "/usr/local/lib64/python3.6/site-packages/pandas/core/internals/managers.py", line 656, in _consolidate_check
ftypes = [blk.ftype for blk in self.blocks]
File "/usr/local/lib64/python3.6/site-packages/pandas/core/internals/managers.py", line 656, in <listcomp>
ftypes = [blk.ftype for blk in self.blocks]
File "/usr/local/lib64/python3.6/site-packages/pandas/core/internals/blocks.py", line 349, in ftype
return f"{dtype}:{self._ftype}"
File "/usr/local/lib64/python3.6/site-packages/numpy/core/_dtype.py", line 54, in __str__
return dtype.name
File "/usr/local/lib64/python3.6/site-packages/numpy/core/_dtype.py", line 347, in _name_get
if _name_includes_bit_suffix(dtype):
File "/usr/local/lib64/python3.6/site-packages/numpy/core/_dtype.py", line 326, in _name_includes_bit_suffix
elif np.issubdtype(dtype, np.flexible) and _isunsized(dtype):
File "/usr/local/lib64/python3.6/site-packages/numpy/core/numerictypes.py", line 726, in issubdtype
arg1 = dtype(arg1).type
TypeError: data type not understood
Command exiting with ret '1'
I am using the following command to execute it:
aws emr create-cluster --applications Name=Hadoop Name=Spark \
--bootstrap-actions '[{"Path":"s3://propic-nonprod-datalake-force-transient/bootstrap3.sh","Name":"cluster_setup"}]' \
--service-role EMR_DefaultRole \
--release-label emr-5.20.0 \
--log-uri 's3n://propic-nonprod-datalake-logs/logs/emrtransientcluster/development/' \
--name 'emrtransientcluster-dataload-development' \
--instance-type m1.large --instance-count 1 \
--auto-terminate \
--steps Type=CUSTOM_JAR,Name=CustomJAR,ActionOnFailure=CONTINUE,Jar=s3://ap-southeast-2.elasticmapreduce/libs/script-runner/script-runner.jar,Args=["s3://propic-nonprod-datalake-force-transient/s3_file_transfer5.py"]
If you're not performing any transformation on the data, I'd suggest using the built-in s3-dist-cp instead of writing your own code from scratch just for copying data between buckets. Details on how to add it as a step to a running cluster can be found here. In short, you'd need to change the last line of your command to something like this:
--steps Type=CUSTOM_JAR, Name="S3DistCp step", ActionOnFailure=CONTINUE, Jar="command-runner.jar", Args=["s3-dist-cp", "--s3Endpoint=s3.amazonaws.com", "--src=s3://src-bucket/dir/", "--dest=s3://dest-bucket/dir/"]

Python Pillow - ValueError: Decompressed Data Too Large

I use the Pillow lib to create thumbnails. I have to create a lot of them, actually more than 10,000.
The program works fine, but after processing roughly 1,500 of them, I get the following error:
Traceback (most recent call last):
File "thumb.py", line 15, in <module>
im = Image.open('/Users/Marcel/images/07032017/' + infile)
File "/Users/Marcel/product-/PIL/Image.py", line 2339, in open
im = _open_core(fp, filename, prefix)
File "/Users/Marcel/product-/PIL/Image.py", line 2329, in _open_core
im = factory(fp, filename)
File "/Users/Marcel/product-/PIL/ImageFile.py", line 97, in __init__
self._open()
File "/Users/Marcel/product-/PIL/PngImagePlugin.py", line 538, in _open
s = self.png.call(cid, pos, length)
File "/Users/Marcel/product-/PIL/PngImagePlugin.py", line 136, in call
return getattr(self, "chunk_" + cid.decode('ascii'))(pos, length)
File "/Users/Marcel/product-/PIL/PngImagePlugin.py", line 319, in chunk_iCCP
icc_profile = _safe_zlib_decompress(s[i+2:])
File "/Users/Marcel/product-/PIL/PngImagePlugin.py", line 90, in _safe_zlib_decompress
raise ValueError("Decompressed Data Too Large")
ValueError: Decompressed Data Too Large
My program is very straight forward:
import os, sys
import PIL
from PIL import Image

size = 235, 210
reviewedProductsList = open('products.txt', 'r')
reviewedProducts = reviewedProductsList.readlines()
t = map(lambda s: s.strip(), reviewedProducts)
print "Thumbs to create: '%s'" % len(reviewedProducts)
for infile in t:
    outfile = infile
    try:
        im = Image.open('/Users/Marcel/images/07032017/' + infile)
        im.thumbnail(size, Image.ANTIALIAS)
        print "thumb created"
        im.save('/Users/Marcel/product-/thumbs/' + outfile, "JPEG")
    except IOError, e:
        print "cannot create thumbnail for '%s'" % infile
        print "error: '%s'" % e
I am performing this operation locally on my MacBook Pro.
This is to protect against a potential DoS attack on servers running Pillow caused by decompression bombs. It occurs when a decompressed image is found to have too-large metadata. See http://pillow.readthedocs.io/en/4.0.x/handbook/image-file-formats.html?highlight=decompression#png
Here's the CVE report: https://www.cvedetails.com/cve/CVE-2014-9601/
From a recent issue:
If you set ImageFile.LOAD_TRUNCATED_IMAGES to true, it will suppress the error (but still not read the large metadata). Alternately, you can change the values here: https://github.com/python-pillow/Pillow/blob/master/PIL/PngImagePlugin.py#L74
https://github.com/python-pillow/Pillow/issues/2445
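For the first option, a minimal sketch (LOAD_TRUNCATED_IMAGES is a module-level flag, so set it once before opening any images):

from PIL import ImageFile

# Suppress the error; the oversized metadata chunk is still skipped, not read.
ImageFile.LOAD_TRUNCATED_IMAGES = True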
The following code should help you apply what the accepted answer describes.
from PIL import PngImagePlugin
LARGE_ENOUGH_NUMBER = 100
PngImagePlugin.MAX_TEXT_CHUNK = LARGE_ENOUGH_NUMBER * (1024**2)
It's not documented how to set this value. I hope people find this useful.

Pyglet unable to play .wav files

def morse_audio(item):
    from pyglet import media
    import pyglet
    import time
    import glob
    import os
    import wave
    from contextlib import closing

    files = []
    audios = []
    for file in glob.glob('C:\\Users\\MQ\'s Virual World\\Downloads\\Morse\\*.wav'):
        ass = str(os.path.join('C:\\Users\MQ\'s Virual World\\Downloads\\Morse', file))
        print(ass)
        files.append(ass)
    #audio = media.load(files[1])
    #audio.play()
    #print(len(files))
    one = list(item)
    str_list = [x.strip(' ') for x in one]
    str_list = [x.strip('/') for x in str_list]
    for s in str_list[0]:
        if s != "-" and s != ".":
            list(item)
            # letterlst and morse are defined elsewhere in morsecode.py
            for letter in item:
                for i in range(0, 51):
                    if letter == " ":
                        time.sleep(1.5)
                        audios.append("noise3.wav")
                        break
                    if letter != letterlst[i] and letter != letterlst[i].lower():
                        continue
                    else:
                        print(files[i])
                        audio = media.load(files[i])
                        audio.play()
                        audios.append(files[i])
                        audios.append("noise2.wav")
                        time.sleep(1)
        else:
            lst = item.split()
            print(' '.join(lst))
            for code in lst:
                for i in range(0, 51):
                    if code == "/":
                        time.sleep(1.5)
                        audios.append("noise3.wav")
                        break
                    if code != morse[i]:
                        continue
                    else:
                        print(files[i])
                        audio = media.load(files[i])
                        audio.play()
                        audios.append(files[i])
                        audios.append("noise2.wav")
                        time.sleep(1)
                        break
    outfile = "sounds.wav"
    data = []
    for file in audios:
        w = wave.open(file, 'rb')
        lol = w.getparams()
        print(lol)
        data.append([w.getparams(), w.readframes(w.getnframes())])
        w.close()
    with closing(wave.open(outfile, 'wb')) as output:
        # find sample rate from first file
        with closing(wave.open(audios[0])) as w:
            output.setparams(w.getparams())
        # write each file to output
        for audioo in audios:
            with closing(wave.open(audioo)) as w:
                output.writeframes(w.readframes(w.getnframes()))
This code previously worked, but I wanted to use file types other than .wav; because that worked so poorly, I went back to .wav. These are different .wav files, but even the ones that worked before now give the same error message, which is:
Traceback (most recent call last):
File "C:\Users\MQ's Virual World\AppData\Local\Programs\Python\Python35-32\morsecode.py", line 187, in <module>
morse_audio("0123456789ÁÄ#&':,$=!-().+?;/_")
File "C:\Users\MQ's Virual World\AppData\Local\Programs\Python\Python35-32\morsecode.py", line 96, in morse_audio
audio.play()
File "C:\Users\MQ's Virual World\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pyglet\media\__init__.py", line 473, in play
player.play()
File "C:\Users\MQ's Virual World\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pyglet\media\__init__.py", line 1012, in play
self._set_playing(True)
File "C:\Users\MQ's Virual World\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pyglet\media\__init__.py", line 993, in _set_playing
self._create_audio_player()
File "C:\Users\MQ's Virual World\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pyglet\media\__init__.py", line 1083, in _create_audio_player
self._audio_player = audio_driver.create_audio_player(group, self)
File "C:\Users\MQ's Virual World\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pyglet\media\drivers\directsound\__init__.py", line 502, in create_audio_player
return DirectSoundAudioPlayer(source_group, player)
File "C:\Users\MQ's Virual World\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pyglet\media\drivers\directsound\__init__.py", line 184, in __init__
None)
File "C:\Users\MQ's Virual World\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pyglet\com.py", line 125, in <lambda>
self.method.get_field()(self.i, self.name)(obj, *args)
File "_ctypes/callproc.c", line 920, in GetResult
OSError: [WinError -2147024809] The parameter is incorrect
I've tried .wav files that used to work. It works when I use a .ogg file, and it also works with mp3s. Only .wav files seem to be giving it issues, very suddenly and randomly.

Problem running a python script (pypdf/hex errors)

I am trying to create a Python script using the pyPdf module. The script takes the 'Root' folder (the folder which contains the split PDFs), merges all the PDFs in it, and outputs the merged PDF in an 'Output' folder, renamed to 'Root.pdf'. It then does the same with the sub-directories, naming each final output after its sub-directory.
I'm stuck when it comes to processing the sub-directories: I get an error related to some hex values (it seems to be reading a null value which is not valid hex).
Here is the error generated:
Traceback (most recent call last):
File "C:\Documents and Settings\student3\Desktop\Test\pdfMergerV1.py", line 76, in <module>
files_recursively(path)
File "C:\Documents and Settings\student3\Desktop\Test\pdfMergerV1.py", line 74, in files_recursively
os.path.walk(path, process_file, ())
File "C:\Python27\lib\ntpath.py", line 263, in walk
walk(name, func, arg)
File "C:\Python27\lib\ntpath.py", line 259, in walk
func(arg, top, names)
File "C:\Documents and Settings\student3\Desktop\Test\pdfMergerV1.py", line 38, in process_file
pdf = PdfFileReader(file( filename, "rb"))
File "C:\Python27\lib\site-packages\pyPdf\pdf.py", line 374, in __init__
self.read(stream)
File "C:\Python27\lib\site-packages\pyPdf\pdf.py", line 775, in read
newTrailer = readObject(stream, self)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 67, in readObject
return DictionaryObject.readFromStream(stream, pdf)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 531, in readFromStream
value = readObject(stream, pdf)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 58, in readObject
return ArrayObject.readFromStream(stream, pdf)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 153, in readFromStream
arr.append(readObject(stream, pdf))
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 69, in readObject
return readHexStringFromStream(stream)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 276, in readHexStringFromStream
txt += chr(int(x, base=16))
ValueError: invalid literal for int() with base 16: '\x00\x00'
This is the source code for the script:
#----------------------------------------------------------------------------------------------
# Name: pdfMerger
# Purpose: Automatic merging of all PDF files in a directory and its sub-directories and
#          rename them according to the folder itself. Requires the pyPdf module.
#
# Current: Processes all the PDF files in the current directory
# To-Do: Process the sub-directories.
#
# Version: 1.0
# Author: Brian Livori
#
# Created: 03/08/2011
# Copyright: (c) Brian Livori 2011
# Licence: Open-Source
#---------------------------------------------------------------------------------------------
#!/usr/bin/env python

import os
import glob
import sys
import fnmatch
from pyPdf import PdfFileReader, PdfFileWriter

output = PdfFileWriter()
path = str(os.getcwd())
x = 0

def process_file(_, path, filelist):
    for filename in filelist:
        if filename.endswith('.pdf'):
            filename = os.path.join(path, filename)
            print "Merging " + filename
            pdf = PdfFileReader(file(filename, "rb"))
            x = pdf.getNumPages()
            i = 0
            while (i != x):
                output.addPage(pdf.getPage(i))
                print "Merging page: " + str(i+1) + "/" + str(x)
                i += 1
            output_dir = "\Output\\"
            ext = ".pdf"
            dir = os.path.basename(path)
            outputpath = str(os.getcwd()) + output_dir
            final_output = outputpath
            if os.path.exists(final_output) != True:
                os.mkdir(final_output)
                outputStream = file(final_output + dir + ext, "wb")
                output.write(outputStream)
                outputStream.close()
            else:
                outputStream = file(final_output + dir + ext, "wb")
                output.write(outputStream)
                outputStream.close()

def files_recursively(topdir):
    os.path.walk(path, process_file, ())

files_recursively(path)
It looks like the PDF files you are reading are not valid PDF files, or they are more exotic than PyPDF is prepared for. Are you sure you have good PDF files to read?
Also, there are a few odd things in your code, but this one might really matter:
output_dir = "\Output\\"
You have a \O escape sequence there which isn't what you want.
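A safer way to build that path, as a sketch, is to let os.path.join supply the separators (reusing dir and ext from the question's code):

# Build the output path without hand-written backslash escapes:
output_dir = os.path.join(os.getcwd(), "Output")
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
outputStream = open(os.path.join(output_dir, dir + ext), "wb")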
