I am unable to upload a tar.gz file from my local directory to an S3 bucket location. I've had no issues running the function below to upload csv files, but I am getting the error "Fileobj must implement read". I am using Boto3 and Python.
The tar_file is the file on my local drive that I want to upload to the S3 bucket location:
import csv
import glob
import os
import tarfile
from datetime import date
from typing import Optional, Set
from io import BytesIO
import psycopg2
import boto3
from constants import (
    ARTIFACT_STORE,
    DB_HOST,
    DB_PASSWORD,
    DB_USER,
    EXCLUDED_TABLES,
    NIPR_DB_NAME,
    S3_ACCESS_KEY_ID,
    S3_SECRET_ACCESS_KEY,
    S3_ENDPOINT_URL,
    BUCKET_NAME,
)
def upload_s3_file():
    tar_file = f"{ARTIFACT_STORE}/{date.today()}_cds.tar.gz"
    s3 = boto3.client('s3', endpoint_url=S3_ENDPOINT_URL, aws_access_key_id=S3_ACCESS_KEY_ID, aws_secret_access_key=S3_SECRET_ACCESS_KEY)
    with tarfile.open(tar_file, 'r:gz') as tar:
        s3.upload_fileobj(tar, BUCKET_NAME, tar_file)
When I run the below function on a csv generated file to the S3 bucket, I have no issues:
s3 = boto3.client('s3', endpoint_url=S3_ENDPOINT_URL, aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
with open("test.csv", "rb") as f:
    s3.upload_fileobj(f, BUCKET_NAME, "test")
The problem is that you're supposed to pass a file object to upload_fileobj, not a tarfile.TarFile object. Open the archive as a plain binary file instead:
with open(tar_file, 'rb') as tar:
    s3.upload_fileobj(tar, BUCKET_NAME, tar_file)
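Note that the third argument is the object key, and here tar_file includes the {ARTIFACT_STORE} path prefix, so the key will mirror your local path. If you only want the archive's file name as the key, one option (using os, which your script already imports) is:

s3.upload_fileobj(tar, BUCKET_NAME, os.path.basename(tar_file))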
Hi and thank you for reading.
I'm new to GCP, and I still can't find a solution to my problem.
I've searched many topics, but no solution has helped me move forward.
INPUT INFORMATION
I have files stored in my bucket in Cloud Storage.
These files could have any extension, but I need to select only the .zip files.
I want to write a Python script in App Engine that will find these zip files and unzip them in the same directory in Cloud Storage.
Below is a version of the script, which doesn't work:
from google.cloud import storage
from zipfile import ZipFile
def list_blobs(bucket_name):
    storage_client = storage.Client()
    blobs = storage_client.list_blobs(bucket_name)
    for blob in blobs:
        try:
            with ZipFile(f'{blob.name}', 'r') as zipObj:
                zipObj.extractall()
        except:
            print(f'{blob.name} not supported unzipping')

list_blobs('test_bucket_for_me_05_08_2021')
Output
proxy.txt not supported unzipping
test.zip not supported unzipping
I found a solution; the code below will unzip the zips in the bucket:
from google.cloud import storage
from zipfile import ZipFile
from zipfile import is_zipfile
import io
storage_client = storage.Client()
def unzip_files(bucketname):
    bucket = storage_client.get_bucket(bucketname)
    blobs = storage_client.list_blobs(bucketname)
    for blob in blobs:
        file = bucket.blob(blob.name)
        try:
            zipbytes = io.BytesIO(file.download_as_string())
            if is_zipfile(zipbytes):
                with ZipFile(zipbytes, 'r') as selected_zip:
                    for files_in_zip in selected_zip.namelist():
                        file_in_zip = selected_zip.read(files_in_zip)
                        blob_new = bucket.blob(files_in_zip)
                        blob_new.upload_from_string(file_in_zip)
        except:
            print(f'{blob.name} not supported')

unzip_files("test_bucket_for_me_05_08_2021")
Of course, I will modify this code, but this solution works.
Thanks to all for your time and effort.
I'm just wondering whether there is a way to extract a password-protected zip file from Azure Storage.
I tried using a Python Azure Function to no avail, as I had a problem reading the location of the file.
Would the file have to be stored in a shared location temporarily in order to achieve this?
I'm just looking for a bit of direction here; am I missing a step, maybe?
Regards,
James
Azure Blob Storage provides storage functionality only; there is no runtime environment in it to perform an unzip operation. So basically, we should download the .zip file to the Azure Function, unzip it, and upload the files inside the .zip one by one.
For a quick test, I wrote an HTTP-triggered Azure Function demo that unzips a password-protected zip file; it works for me locally:
import azure.functions as func
import uuid
import os
import shutil
from azure.storage.blob import ContainerClient
from zipfile import ZipFile
storageAccountConnstr = '<storage account conn str>'
container = '<container name>'
# define local temp paths; on Azure, the path is recommended to be under /home
tempPathRoot = 'd:/temp/'
unZipTempPathRoot = 'd:/unZipTemp/'
def main(req: func.HttpRequest) -> func.HttpResponse:
    reqBody = req.get_json()
    fileName = reqBody['fileName']
    zipPass = reqBody['password']
    container_client = ContainerClient.from_connection_string(storageAccountConnstr, container)

    # download zip file
    zipFilePath = tempPathRoot + fileName
    with open(zipFilePath, "wb") as my_blob:
        download_stream = container_client.get_blob_client(fileName).download_blob()
        my_blob.write(download_stream.readall())

    # unzip to temp folder
    unZipTempPath = unZipTempPathRoot + str(uuid.uuid4())
    with ZipFile(zipFilePath) as zf:
        zf.extractall(path=unZipTempPath, pwd=bytes(zipPass, 'utf8'))

    # upload all files in temp folder
    for root, dirs, files in os.walk(unZipTempPath):
        for file in files:
            filePath = os.path.join(root, file)
            destBlobClient = container_client.get_blob_client(fileName + filePath.replace(unZipTempPath, ''))
            with open(filePath, "rb") as data:
                destBlobClient.upload_blob(data, overwrite=True)

    # remove all temp files
    shutil.rmtree(unZipTempPath)
    os.remove(zipFilePath)

    return func.HttpResponse("done")
(Screenshots of the files in my container and of the result are omitted here.)
Using a blob trigger would be better for this, as an HTTP trigger will hit time-out errors if your zip file is huge. Anyway, this is only a demo that shows you how to do this.
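For reference, a blob-triggered variant of the same idea could look roughly like the sketch below. The container name zips is an assumption, the binding lives in function.json (type blobTrigger, path "zips/{name}", connection "AzureWebJobsStorage"), and unzip_and_upload stands in for the download/unzip/upload logic from the demo above:

import azure.functions as func

def main(myblob: func.InputStream):
    # myblob.name is "zips/<blob name>"; read() returns the zip file's bytes
    zip_bytes = myblob.read()
    # hypothetical helper that reuses the extract/upload logic shown above
    unzip_and_upload(myblob.name, zip_bytes)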
I am currently working on a script to send various files to an S3 bucket. The files are sent to the bucket with no issue, but when I open them they do not contain any data. It seems only the file names (e.g. file1.csv) were uploaded as strings, rather than the actual .csv data.
An example below is a directory containing files that I sent to my s3:
/home/user/Desktop/
file1.csv
file2.csv
file3.csv
file4.csv
How can I update my code to send the actual file contents?
import fnmatch
import os
import glob
import boto3
from botocore.client import Config
ACCESS_KEY_ID = 'some_key'
ACCESS_SECRET_KEY = 'some_key'
BUCKET_NAME = 'isome_bucket'
s3 = boto3.client(
    's3',
    aws_access_key_id=ACCESS_KEY_ID,
    aws_secret_access_key=ACCESS_SECRET_KEY,
    config=Config(signature_version='s3v4')
)
for file in os.listdir('/home/user/Desktop/'):
    if fnmatch.fnmatch(file, '*.csv'):
        key = "folder_1/" + file
        s3.put_object(Bucket=BUCKET_NAME, Key=key, Body=file)

print("Complete")
You are simply passing the name of your file to S3. I think the upload_file API will serve you better here than put_object: you can pass the full path to upload_file and it will read and upload the file for you, instead of you having to open the file and read the content yourself. Replace your put_object call with this and it should work:
s3.upload_file('/home/user/Desktop/'+file, BUCKET_NAME, key)
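Put together with your existing loop, the upload would look roughly like this (a sketch reusing your names and the folder_1/ key prefix):

for file in os.listdir('/home/user/Desktop/'):
    if fnmatch.fnmatch(file, '*.csv'):
        key = "folder_1/" + file
        # upload_file opens the local file at this path and uploads its contents
        s3.upload_file('/home/user/Desktop/' + file, BUCKET_NAME, key)

print("Complete")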
I am creating a zip file on my local machine and would like to write files from S3 into it. So far I'm unable to do it. Here's what I have in the meantime.
import os
import zipfile
from fs import open_fs
fs = open_fs(os.getenv('s3_sample_folder'))
file_names = file_names() #list of file names
with zipfile.ZipFile('zipfile.zip', mode='w') as zf:
    for file in file_names:
        with fs.open('/' + file, 'rb') as remote_file:
            content = remote_file.read()
            zf.write(content, basename(content))
The ZipFile.write method accepts a file name, not file content. You should use the ZipFile.writestr method instead to write file content to the zip file:
zf.writestr(file, content)
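In the context of your loop, that would look roughly like this (a sketch; writestr takes the archive member name first, so the file name from your list is used rather than basename(content)):

with zipfile.ZipFile('zipfile.zip', mode='w') as zf:
    for file in file_names:
        with fs.open('/' + file, 'rb') as remote_file:
            content = remote_file.read()
            # archive member name first, then the bytes to store
            zf.writestr(file, content)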
Since you are using PyFilesystem, you can open an S3 filesystem and a Zip filesystem, then use copy_file to copy between them.
Something like the following should work:
import os

from fs import open_fs
from fs.copy import copy_file

with open_fs(os.getenv('s3_sample_folder')) as s3_fs:
    with open_fs('zip://zipfile.zip', create=True) as zip_fs:
        for filename in file_names():
            copy_file(s3_fs, filename, zip_fs, filename)
I am trying to download all the sample files from my S3 bucket in Amazon EC2 to my local hard drive via a boto3 and Python 3 script.
Here is the script.
#downloading files from s3
import boto3
import sys
import os
import argparse
import threading
from botocore.client import Config
instance_id = "i-03e7f6391a0f523ee"
action = 'ON'
ec2 = boto3.client('ec2')
s3=boto3.resource('s3')
for bucket in s3.buckets.all():
    print(bucket.name)

my_bucket = s3.Bucket('tkbucket32')
print("listing all files in bucket")
for s3_file in my_bucket.objects.all():
    print(s3_file.key)

s3.meta.client.download_file('tkbucket32', 'hello.txt', '/tmp/hello.txt')
Currently, there are the following files in the bucket:
hello.txt myhomepage.html
Now I want them to be downloaded to
D:\folder1\folder2\folder3
I am running scripts in Windows 7.
My problem is with this line:
s3.meta.client.download_file('tkbucket32', 'hello.txt', '/tmp/hello.txt')
Where do I specify the path on my local hard drive where I want to download these files?
Change this line:
s3.meta.client.download_file('tkbucket32', 'hello.txt', '/tmp/hello.txt')
to:
s3.meta.client.download_file('tkbucket32', 'hello.txt', 'D:\\folder1\\folder2\\folder3\\hello.txt')
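If you want every object in the bucket downloaded into that folder rather than just hello.txt, a loop along these lines should work (a sketch reusing your my_bucket resource; it assumes flat keys like hello.txt, with no folder prefixes):

import os

download_dir = 'D:\\folder1\\folder2\\folder3'
for s3_file in my_bucket.objects.all():
    # use the object key as the local file name
    local_path = os.path.join(download_dir, s3_file.key)
    my_bucket.download_file(s3_file.key, local_path)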