I am trying to unzip a password-protected file in GCS but get an error with the code below. The code works fine with normal .gz files but fails to unzip password-protected files.
from google.cloud import storage
from zipfile import ZipFile, is_zipfile
import io

storage_client = storage.Client()
source_bucket = 'bucket'
source_bucket1 = storage_client.bucket(source_bucket)

blob = source_bucket1.blob("path/filename.gz")
zipbytes = io.BytesIO(blob.download_as_string())
print(zipbytes)

if is_zipfile(zipbytes):
    with ZipFile(zipbytes, 'r') as myzip:
        for contentfilename in myzip.namelist():
            contentfile = myzip.read(contentfilename)   # fails here for password-protected archives
            contentfilename = contentfilename[:-3]
            blob1 = source_bucket1.blob(contentfilename)
            blob1.upload_from_string(contentfile)
            print(f'File decompressed from {blob.name} to {contentfilename}')
blob.delete()
You can use Python, e.g. from a Cloud Function:
from google.cloud import storage
from zipfile import ZipFile
from zipfile import is_zipfile
import io
def zipextract(bucketname, zipfilename_with_path):

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucketname)

    destination_blob_pathname = zipfilename_with_path

    blob = bucket.blob(destination_blob_pathname)
    zipbytes = io.BytesIO(blob.download_as_string())

    if is_zipfile(zipbytes):
        with ZipFile(zipbytes, 'r') as myzip:
            for contentfilename in myzip.namelist():
                contentfile = myzip.read(contentfilename)
                blob = bucket.blob(zipfilename_with_path + "/" + contentfilename)
                blob.upload_from_string(contentfile)

zipextract("mybucket", "path/file.zip")  # if the file is gs://mybucket/path/file.zip
I am able to read a password-protected .csv.gz file using the logic below. All of this is done in memory, so it has performance issues if the file is huge, but it works fine (a streaming variant is sketched after this snippet).
storage_client = storage.Client()
source_bucket = '<bucket-name>'
source_bucket1 = storage_client.bucket(source_bucket)
bucket_folder = '/unzip'

blob = source_bucket1.blob(path)  # `path` and `destination_file_path` are defined elsewhere
zipbytes = io.BytesIO(blob.download_as_string())

with ZipFile(zipbytes, 'r') as myzip:
    print("Inside the zipfiles loop")
    with myzip.open('filename.csv', pwd=b'password') as myfile:
        print("Inside zip 2 loop")
        contentfile = myfile.read()
        contentfilename = bucket_folder + destination_file_path
        blob1 = source_bucket1.blob(contentfilename)
        blob1.upload_from_string(contentfile)
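For larger archives, one way to reduce the memory footprint is to stream the decrypted member straight into the destination blob instead of calling read() on it. This is a minimal sketch, assuming the same bucket object, member name, and password as above; note that the archive itself is still downloaded into memory first, and the destination path is a placeholder:

# Hedged sketch: stream the decrypted member into GCS rather than buffering it.
with ZipFile(zipbytes, 'r') as myzip:
    with myzip.open('filename.csv', pwd=b'password') as member:
        blob1 = source_bucket1.blob('/unzip/filename.csv')  # hypothetical destination path
        # upload_from_file reads from the file-like object in chunks,
        # so the whole CSV never has to sit in memory at once.
        blob1.upload_from_file(member)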
import pandas as pd
from io import BytesIO
import zipfile
from google.cloud import storage
import datetime

class ConstructedFile:
    def __init__(self, data):
        self.raw_data = data
        self.columns = list(data.keys())
        self.final_list = dict()
        self.memory_file = None

    def generate_csv(self):
        df = pd.DataFrame(self.raw_data, columns=self.columns)
        self.final_list = {"Name": self.columns, "data": df.to_csv(index=False)}
        print(f"***** {self.final_list} *****")

    def compress_to_zip(self, file_name):
        self.memory_file = BytesIO()
        zf = zipfile.ZipFile(self.memory_file, 'w')
        data = zipfile.ZipInfo(f"{file_name}.csv")
        data.compress_type = zipfile.ZIP_DEFLATED
        zf.writestr(data, self.final_list['data'])

    def upload_blob_from_memory(self, destination_blob_name):
        bucket_name = "Example_Bucket"
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(destination_blob_name)
        blob.upload_from_string(self.memory_file, content_type='application/zip')

    def generate_download_signed_url_v4(self, blob_name):
        bucket_name = "Example_Bucket"
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)
        url = blob.generate_signed_url(version="v4", expiration=datetime.timedelta(minutes=15), method="GET")
        return url

raw_data = {"Fruits": ["apple", "orange", "grapes"], "Quantity": [3, 5, 6], "Rate": [50, 60, 70]}
file_name = "fruits"

obj = ConstructedFile(raw_data)
obj.generate_csv()
obj.compress_to_zip(file_name)
obj.upload_blob_from_memory(f"export/{file_name}.zip")
signed_url = obj.generate_download_signed_url_v4(f"export/{file_name}.zip")
This is just a sample of my program's flow, as the original code is implemented.
The program flow is:
Fetch raw_data from the database.
The fetched raw_data then goes through the following steps:
generate the CSV ==> obj.generate_csv()
compress it to a zip blob ==> obj.compress_to_zip(file_name)
upload the blob to the GCS bucket ==> obj.upload_blob_from_memory(f"export/{file_name}.zip")
generate the signed_url from the GCS bucket for the uploaded blob ==> obj.generate_download_signed_url_v4(f"export/{file_name}.zip")
Here is the problem I am facing:
When I download the zip file (fruits.zip) using the signed_url, the file usually works and opens fine. But sometimes the zip file downloaded via the signed_url does not open and shows an error message; it seems to be corrupted.
Here is an image of the error message shown while opening the zip file.
I have also tried to open the file downloaded directly from the GCS bucket, and it shows the same error message.
I don't know whether my guess is correct or wrong, but I think the mistake happens while constructing the zip file in memory.
I am stuck with this; please help me out.
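One thing that may be worth checking, purely as a guess: in the sample above, compress_to_zip never closes the ZipFile, so the archive's central directory may never be written, and upload_blob_from_memory passes the BytesIO object itself to upload_from_string rather than its bytes. A minimal sketch of those two methods with the archive finalized before upload, reusing the same class and placeholder bucket name, might look like this:

def compress_to_zip(self, file_name):
    self.memory_file = BytesIO()
    # Closing the ZipFile (here via the context manager) writes the central
    # directory; without it the archive can appear truncated or corrupted.
    with zipfile.ZipFile(self.memory_file, 'w', zipfile.ZIP_DEFLATED) as zf:
        zf.writestr(f"{file_name}.csv", self.final_list['data'])

def upload_blob_from_memory(self, destination_blob_name):
    bucket_name = "Example_Bucket"
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    # Upload the finished bytes, not the BytesIO wrapper.
    blob.upload_from_string(self.memory_file.getvalue(), content_type='application/zip')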
I am trying to append to a blob if it already exists, but with the code below I can only create a file; I can't append to an existing blob.
filename = x + '.csv'
file_system_client = service_client.get_file_system_client(file_system=date_time+"9")
file_client = file_system_client.create_file(filename)
local_file = open(filename, 'r') # Change the Path over here !!!
file_contents = local_file.read()
file_client.append_data(data=file_contents, offset=0, length=len(file_contents))
file_client.flush_data(len(file_contents))
I have tried to append using the code below, but I think I am using the wrong syntax for Azure:
file_system_client = service_client.get_file_system_client(file_system="test-data")

# Get the blob names from the container
container_client = blob_service_client.get_container_client("test-data")
blobs_list = container_client.list_blobs()

# Check whether the blob name is present or not
flag = False
for blob in blobs_list:
    if blob.name == sourceid + ".csv":
        flag = True
        break

if flag:
    file_client = file_system_client.get_file_client(sourceid + ".csv")
else:
    file_client = file_system_client.create_file(sourceid + ".csv")

local_file = gzip.open(filename, 'r')  # Change the path over here !!!
file_contents = local_file.read()

file_client.append_data(data=file_contents, offset=0, length=len(file_contents))
file_client.flush_data(len(file_contents))
Issue solved by the following code snippet. I finally got the syntax to append CSV blobs with Python:
flag = False

blob_service_client = BlobServiceClient.from_connection_string(
    "DefaultEndpointsProtocol=https;AccountName=***********;AccountKey=*************;EndpointSuffix=core.windows.net")

service_client = DataLakeServiceClient(
    account_url="{}://{}.dfs.core.windows.net".format("https", "********"),
    credential="************")

file_system_client = service_client.get_file_system_client(file_system="test-data")

# Get the blob names from the container
container_client = blob_service_client.get_container_client("test-data")
blobs_list = container_client.list_blobs()

# Check whether the blob name is present or not
for blob in blobs_list:
    if blob.name == sourceid + ".csv":
        flag = True
        break

if flag:
    file_client = file_system_client.get_file_client(sourceid + ".csv")
    filesize_previous = file_client.get_file_properties().size

    local_file = gzip.open(filename, 'r')  # Change the path over here !!!
    file_contents = local_file.read()

    file_client.append_data(data=file_contents, offset=filesize_previous, length=len(file_contents))
    file_client.flush_data(filesize_previous + len(file_contents))
else:
    file_client = file_system_client.create_file(sourceid + ".csv")

    local_file = gzip.open(filename, 'r')  # Change the path over here !!!
    file_contents = local_file.read()

    file_client.append_data(data=file_contents, offset=0, length=len(file_contents))
    file_client.flush_data(len(file_contents))
There is an option to set the blob type to AppendBlob while uploading a blob; with its help we can append data to the blob.
I did a repro of this scenario where the CSV blob in my Azure container has 50 contacts and my local CSV file has 100 contacts.
With the code below we can append the data, and after uploading, the source file gets deleted.
import os
# AppendBlobService comes from the legacy azure-storage-blob v2.x SDK.
from azure.storage.blob import AppendBlobService

account_name = "ACCOUNT_NAME"  # add your account name
account_key = "ACCOUNT_KEY"    # add your account key

append_blob_service = AppendBlobService(account_name=account_name, account_key=account_key)

def generate_progress_callback():
    def progress_callback(current, total):
        print('({}, {})'.format(current, total))
    return progress_callback

append_blob_service.append_blob_from_path(container_name="container0805", blob_name="50-contacts.csv",
                                          file_path=r"C:\Users\saikarri\Python\100-contacts.csv",
                                          progress_callback=generate_progress_callback())
print("hello")

path = r"C:\Users\saikarri\Python\100-contacts.csv"
if os.path.exists(path):
    os.remove(path)
    print("delete file")
else:
    print("no such file")
I have tried to use a Lambda function to write a file to S3; the test shows "succeeded", but nothing appeared in my S3 bucket. What happened? Can anyone give me some advice or a solution? Thanks a lot. Here's my code:
import json
import boto3
def lambda_handler(event, context):
    string = "dfghj"
    file_name = "hello.txt"
    lambda_path = "/tmp/" + file_name
    s3_path = "/100001/20180223/" + file_name

    with open(lambda_path, 'w+') as file:
        file.write(string)
        file.close()

    s3 = boto3.resource('s3')
    s3.meta.client.upload_file(lambda_path, 's3bucket', s3_path)
I've had success streaming data to S3; it has to be encoded to do this:
import boto3
def lambda_handler(event, context):
    string = "dfghj"
    encoded_string = string.encode("utf-8")

    bucket_name = "s3bucket"
    file_name = "hello.txt"
    s3_path = "100001/20180223/" + file_name

    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=s3_path, Body=encoded_string)
If the data is in a file, you can read this file and send it up:
with open(filename) as f:
    string = f.read()
encoded_string = string.encode("utf-8")
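For binary files there is no need to decode and re-encode at all; a small sketch (the local file, bucket, and key names here are just placeholders) is to hand boto3 the open file object directly:

import boto3

s3_client = boto3.client("s3")

# Stream a local binary file to S3 without reading it into a string first.
with open("report.pdf", "rb") as f:  # placeholder local file
    s3_client.upload_fileobj(f, "s3bucket", "100001/20180223/report.pdf")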
My response is very similar to Tim B's, but the most important part is:
1. Go to S3 and create the bucket you want to write to.
2. Follow the steps below, otherwise your Lambda will fail due to permission/access issues. I've copied the linked content here for you in case they change the URL or move it to another page.
a. Open the roles page in the IAM console.
b. Choose Create role.
c. Create a role with the following properties.
-Trusted entity – AWS Lambda.
-Permissions – AWSLambdaExecute.
-Role name – lambda-s3-role.
The AWSLambdaExecute policy has the permissions that the function needs to manage objects in Amazon S3 and write logs to CloudWatch Logs.
Copy and paste this into your Lambda Python function:
import json, boto3, os

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    # put the bucket name you created in step 1
    bucket_name = "my_buck_name"
    file_name = "my_test_file.csv"
    lambda_path = "/tmp/" + file_name
    s3_path = "output/" + file_name

    # write a small test file into Lambda's /tmp directory
    os.system('echo testing... >' + lambda_path)

    # upload it under the "output/" prefix so the key matches the path in the response
    s3 = boto3.resource("s3")
    s3.meta.client.upload_file(lambda_path, bucket_name, s3_path)

    return {
        'statusCode': 200,
        'body': json.dumps('file is created in: ' + s3_path)
    }
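Since the original problem was that nothing appeared in the bucket, it can also help to verify the upload from the same function. Here is a small sketch, using the same placeholder bucket and key as above, based on head_object, which raises a ClientError if the object does not exist:

import boto3
from botocore.exceptions import ClientError

s3_client = boto3.client("s3")

def object_exists(bucket_name, key):
    """Return True if the object is really there, False if S3 reports 404."""
    try:
        s3_client.head_object(Bucket=bucket_name, Key=key)
        return True
    except ClientError as e:
        if e.response["Error"]["Code"] == "404":
            return False
        raise

# e.g. after the upload in lambda_handler:
# print(object_exists("my_buck_name", "output/my_test_file.csv"))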
from os import path
import json, boto3
import requests

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    # Fetch an image over HTTP and upload it to S3 under a name derived from the URL.
    bucket_name = "mybucket"
    url = "https://i.imgur.com/ExdKOOz.png"

    response = requests.get(url)
    filename = get_filename(url)
    img = response.content

    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=filename, Body=img)

    return {'statusCode': 200, 'body': json.dumps('file is created in:')}

def get_filename(url):
    # Strip the fragment, query string, and scheme, then take the basename.
    fragment_removed = url.split("#")[0]
    query_string_removed = fragment_removed.split("?")[0]
    scheme_removed = query_string_removed.split("://")[-1].split(":")[-1]
    if scheme_removed.find("/") == -1:
        return ""
    return path.basename(scheme_removed)
I am trying to upload files from a local directory to a folder in S3. I am able to upload files to the S3 bucket, but I am unable to upload them to a folder within the bucket.
Could anyone help? What am I doing wrong here?
Here is the code:
import os
import sys
import boto3
import fnmatch
import pprint
import re
import hashlib

SOURCE_DIR = '/home/user/Downloads/tracks/'
BUCKET_NAME = 'mybucket'
S3_FOLDER = 'mybucket/folder1/'

client = boto3.client('s3')
s3 = boto3.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)

def get_md5(filename):
    f = open(filename, 'rb')
    m = hashlib.md5()
    while True:
        data = f.read(10240)
        if len(data) == 0:
            break
        m.update(data)
    return m.hexdigest()

def get_etag(filebase, filepath):
    for item in bucket.objects.all():
        keyfile = S3_FOLDER + filebase
        if keyfile == item.key:
            md5 = get_md5(filepath)
            etag = item.e_tag.strip('"').strip("'")
            if etag != md5:
                print(filebase + ": " + md5 + " != " + etag)
                return(files_to_upload.append(filepath))
        else:
            return(files_to_upload.append(filepath))

files_to_upload = []
for root, dirnames, filenames in os.walk(SOURCE_DIR):
    for filename in filenames:
        filepath = os.path.join(root, filename)
        get_etag(filename, filepath)

for f in files_to_upload:
    client.put_object(Bucket=BUCKET_NAME, Key=f)
Folders don't really exist in S3. You can prefix the file name (object key) with something that looks like a folder path.
It's not entirely clear to me what your code is doing with the file paths, but your code needs to be changed to something like this:
for f in files_to_upload:
    key = "my/s3/folder/name/" + f
    with open(f, 'rb') as data:
        client.put_object(Bucket=BUCKET_NAME, Key=key, Body=data)
Note: You weren't passing a Body parameter, so I think your code was just creating empty objects in S3.
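As a side note, an arguably simpler variant of the same idea (reusing client, BUCKET_NAME, and files_to_upload from above) is upload_file, which streams the file and handles multipart uploads for you; the folder prefix is still just a placeholder:

import os

for f in files_to_upload:
    # Keep only the file's basename and place it under the "folder" prefix.
    key = "my/s3/folder/name/" + os.path.basename(f)
    client.upload_file(f, BUCKET_NAME, key)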
Here is the link I have used (Download files from Amazon S3 with Django). Using this, I'm able to download a single file.
Code:
s3_template_path = queryset.values('file')
filename = 'test.pdf'

conn = boto.connect_s3('<aws access key>', '<aws secret key>')
bucket = conn.get_bucket('your_bucket')
s3_file_path = bucket.get_key(s3_template_path)

response_headers = {
    'response-content-type': 'application/force-download',
    'response-content-disposition': 'attachment;filename="%s"' % filename
}

url = s3_file_path.generate_url(60, 'GET',
                                response_headers=response_headers,
                                force_http=True)

return HttpResponseRedirect(url)
I need to download multiple files from S3; packaging them as a zip would be better. Can the method above be modified for this? If not, please suggest another method.
Okay, here is a possible solution: it basically downloads each file, zips them into a folder, and then returns this to the user.
I'm not sure whether s3_template_path is the same for each file, but change this if necessary.
# python 3
import io
import os
import zipfile
import requests
# boto, queryset, and HttpResponse come from the Django view this runs in

file_names = ['test.pdf', 'test2.pdf', 'test3.pdf']

# set up zip folder
zip_subdir = "download_folder"
zip_filename = zip_subdir + ".zip"

byte_stream = io.BytesIO()
zf = zipfile.ZipFile(byte_stream, "w")

for filename in file_names:
    s3_template_path = queryset.values('file')
    conn = boto.connect_s3('<aws access key>', '<aws secret key>')
    bucket = conn.get_bucket('your_bucket')
    s3_file_path = bucket.get_key(s3_template_path)
    response_headers = {
        'response-content-type': 'application/force-download',
        'response-content-disposition': 'attachment;filename="%s"' % filename
    }
    url = s3_file_path.generate_url(60, 'GET',
                                    response_headers=response_headers,
                                    force_http=True)

    # download the file
    file_response = requests.get(url)

    if file_response.status_code == 200:
        # create a local copy of the file
        f1 = open(filename, 'wb')
        f1.write(file_response.content)
        f1.close()

        # write the file to the zip folder
        fdir, fname = os.path.split(filename)
        zip_path = os.path.join(zip_subdir, fname)
        zf.write(filename, zip_path)

# close the zip folder and return
zf.close()

response = HttpResponse(byte_stream.getvalue(), content_type="application/x-zip-compressed")
response['Content-Disposition'] = 'attachment; filename=%s' % zip_filename
return response
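A leaner variant, under the assumption that you can use boto3 with configured credentials, is to skip the signed URLs and temporary local copies entirely and stream each object straight into the in-memory zip; the bucket name and key list here are placeholders:

import io
import zipfile
import boto3

s3 = boto3.client("s3")
bucket_name = "your_bucket"                    # placeholder
keys = ["test.pdf", "test2.pdf", "test3.pdf"]  # placeholder object keys

byte_stream = io.BytesIO()
with zipfile.ZipFile(byte_stream, "w", zipfile.ZIP_DEFLATED) as zf:
    for key in keys:
        # Fetch the object body and write it into the archive without touching disk.
        obj = s3.get_object(Bucket=bucket_name, Key=key)
        zf.writestr(key, obj["Body"].read())

# byte_stream.getvalue() can then be returned in an HttpResponse as above.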