Convert csv from UTF16 to UTF8 inside a Google Cloud function - Python

I am trying to convert a csv file from UTF-16-LE to UTF-8 inside a Google Cloud Function in Python 3.9.
The UTF-16 file is in a bucket, but my code can't find it.
from google.cloud import storage
import codecs
import shutil

def utf_convert(blob_name):
    bucket_name = "test_bucket"
    blob_name = "Testutf16.csv"
    new_file = "Testutf8.csv"
    storage_client = storage.Client()
    source_bucket = storage_client.bucket(bucket_name)
    source_blob = source_bucket.blob(blob_name)
    with codecs.open(source_blob, encoding="utf-16-le") as input_file:
        with codecs.open(new_file, "w", encoding="utf-8") as output_file:
            shutil.copyfileobj(input_file, output_file)
I receive the following error:
TypeError: expected str, bytes or os.PathLike object, not Blob
If I instead try to pass the file name or the URI directly to codecs.open,
with codecs.open("gs://test_bucket/Testutf16.csv", encoding="utf-16-le")
I receive the following error:
FileNotFoundError: [Errno 2] No such file or directory
How can my file be found?
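One possible fix, sketched below: codecs.open only accepts local paths, so the blob has to be read through the client library instead. This is a minimal sketch assuming google-cloud-storage 2.x, where Blob.open() accepts text mode with an encoding argument; the bucket and file names are taken from the question, and the (event, context) signature is just the background-trigger convention, not something stated in the question.
from google.cloud import storage

def utf_convert(event, context):  # assumed background-trigger signature
    bucket_name = "test_bucket"
    blob_name = "Testutf16.csv"
    new_blob_name = "Testutf8.csv"

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Read the UTF-16-LE blob as text through the client library.
    with bucket.blob(blob_name).open("r", encoding="utf-16-le") as input_file:
        text = input_file.read()

    # Write the same content back to the bucket encoded as UTF-8.
    bucket.blob(new_blob_name).upload_from_string(
        text.encode("utf-8"), content_type="text/csv"
    )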

Related

Cannot open binary file from jupyter notebook in cloud storage bucket GCP

I am having trouble opening a binary file from a Jupyter notebook in GCP. I have tried the two following methods, but I always get an error.
Code 1
with open('gs://cloud-ai-platform-fcf9f6d9-ccf6-4e8b-bdf2-6cc69a369809/rank_table.bin', 'rb') as file:
    binary_content = file.read()

rank_table_file = binary_content
self.rank_table = np.fromfile(rank_table_file, dtype=np.int32)
Error 1 (I checked, and the file path is correct)
FileNotFoundError: [Errno 2] No such file or directory: 'gs://cloud-ai-platform-fcf9f6d9-ccf6-4e8b-bdf2-6cc69a369809/rank_table.bin'
The second attempt used the blob method, as indicated in the GCP documentation:
from google.cloud import storage

storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob('rank_table.bin')
with blob.open("rb") as f:
    binary_content = f.read()

rank_table_file = binary_content
self.rank_table = np.fromfile(rank_table_file, dtype=np.int32)
But instead, I got the following error message:
io.UnsupportedOperation: fileno
I understand that this error comes from the fact that my file does not support the method applied, but I can't figure out why.
Do you have any suggestions?
Cheers
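A possible fix, sketched under the assumption that the whole file fits in memory: np.fromfile wants an object backed by a real file descriptor (it calls fileno()), which the file-like object returned by blob.open("rb") does not provide. Downloading the bytes and parsing them with np.frombuffer sidesteps that requirement; the bucket and file names below are the ones from the question.
import numpy as np
from google.cloud import storage

storage_client = storage.Client()
bucket = storage_client.bucket('cloud-ai-platform-fcf9f6d9-ccf6-4e8b-bdf2-6cc69a369809')
blob = bucket.blob('rank_table.bin')

# Fetch the raw bytes and parse them in memory instead of with np.fromfile,
# which needs a real OS-level file.
binary_content = blob.download_as_bytes()
rank_table = np.frombuffer(binary_content, dtype=np.int32)  # assign to self.rank_table in your class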

Stream Files to Zip File in Azure Blob Storage using Python?

I have the following problem in Python:
I am looking to create a zip file in Blob Storage consisting of files from an array of URLs, but I don't want to build the entire zip file in memory and then upload it. Ideally I want to stream the files into the zip file in Blob Storage. I found this write-up for C#: https://andrewstevens.dev/posts/stream-files-to-zip-file-in-azure-blob-storage/
as well as this answer, also in C#: https://stackoverflow.com/a/54767264/10550055
I haven't been able to find equivalent functionality in the Python Azure Blob SDK and the Python zipfile library.
Try this:
from zipfile import ZipFile
from azure.storage.blob import BlobServiceClient
import os, requests

tempPath = '<temp path>'
if not os.path.isdir(tempPath):
    os.mkdir(tempPath)

zipFileName = 'test.zip'
storageConnstr = ''
container = ''
blob = BlobServiceClient.from_connection_string(storageConnstr).get_container_client(container).get_blob_client(zipFileName)

fileURLs = {'https://cdn.pixabay.com/photo/2015/04/23/22/00/tree-736885__480.jpg',
            'http://1812.img.pp.sohu.com.cn/images/blog/2009/11/18/18/8/125b6560a6ag214.jpg',
            'http://513.img.pp.sohu.com.cn/images/blog/2009/11/18/18/27/125b6541abcg215.jpg'}

def download_url(url, save_path, chunk_size=128):
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)

zipObj = ZipFile(tempPath + zipFileName, 'w')

# download each file and write it into the zip
for url in fileURLs:
    localFilePath = tempPath + os.path.basename(url)
    download_url(url, localFilePath)
    zipObj.write(localFilePath)
zipObj.close()

# upload the zip
with open(tempPath + zipFileName, 'rb') as stream:
    blob.upload_blob(stream)
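Note that this answer stages the downloads and the finished zip on local disk rather than truly streaming them; it avoids holding everything in memory, but the files stay behind in tempPath. If that matters, a small variation (sketched below, reusing blob, fileURLs and download_url from the answer above) runs the same flow inside a tempfile.TemporaryDirectory so the staged files are cleaned up automatically after the upload.
import os
import tempfile
from zipfile import ZipFile

# Sketch: stage the downloads and the zip in a temporary directory that is
# removed automatically; blob, fileURLs and download_url are the objects
# defined in the answer above.
with tempfile.TemporaryDirectory() as tempdir:
    zipPath = os.path.join(tempdir, 'test.zip')
    with ZipFile(zipPath, 'w') as zipObj:
        for url in fileURLs:
            localFilePath = os.path.join(tempdir, os.path.basename(url))
            download_url(url, localFilePath)
            zipObj.write(localFilePath, arcname=os.path.basename(url))
    with open(zipPath, 'rb') as stream:
        blob.upload_blob(stream, overwrite=True)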

Invalid base64-encoded string when trying to upload BufferedReader to Azure Blob Storage in Python

When trying to upload the CSVs in a folder to the blob, it throws on the first CSV:
Exception:
Invalid base64-encoded string: number of data characters (85) cannot be 1 more than a multiple of 4
Thanks!
import os
import io
import glob
from base64 import b64decode, b64encode
from azure.storage.blob import BlobClient, BlobServiceClient, ContainerClient

path = "D:/Git projects/csvs"
extension = 'csv'
os.chdir(path)

# gets files that are from TODAY only
result = glob.glob("*" + str(today) + ".{}".format(extension))

try:
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
    for csv_file in result:
        # Create a blob client using the local file name as the name for the blob
        blob_client = blob_service_client.get_blob_client(container=, blob=blob_name)
        # Upload the created file
        with open(csv_file, "rb") as data:
            #data = data.read().decode("utf-8")
            #data = data.encode("utf-8")
            blob_client.upload_blob(data, overwrite=True)
except Exception as ex:
    print('Exception:')
    print(ex)
OUTPUT: Exception: Invalid base64-encoded string: number of data characters (85) cannot be 1 more than a multiple of 4
You can check the encoding of the CSV data you are trying to upload:
import chardet

with open(csv_file, "rb") as data:
    print(chardet.detect(data.read()))
The output of chardet will be something like:
{'encoding': 'EUC-JP', 'confidence': 0.99}
The library used to detect the encoding is https://github.com/chardet/chardet. Then you can work on converting that encoding and try to upload again.
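As a hedged sketch of that conversion step (assuming chardet reports a non-UTF-8 encoding such as the EUC-JP example above, and that re-encoding is indeed what the upload needs), you could decode with the detected encoding and re-encode as UTF-8 before uploading; container_name below is a placeholder for the container name omitted in the question.
import chardet
from azure.storage.blob import BlobServiceClient

blob_service_client = BlobServiceClient.from_connection_string(connect_str)
blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)

with open(csv_file, "rb") as f:
    raw = f.read()

# Decode using the detected encoding, then re-encode as UTF-8 before uploading.
detected = chardet.detect(raw)['encoding'] or 'utf-8'
utf8_bytes = raw.decode(detected).encode('utf-8')
blob_client.upload_blob(utf8_bytes, overwrite=True)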

aws lambda python append to file from S3 object

I am trying to write the contents read from an S3 object to a file. I am getting an error while doing so.
object = s3.get_object(Bucket=bucket_name, Key="toollib/{0}/{1}/stages/{0}.groovy".format(tool, platform))
print(object)
jenkinsfile = object['Body'].read()
print(jenkinsfile)

basepath = '/mnt/efs/{0}/{1}/{2}/'.format(orderid, platform, technology)
filename = basepath + fileName
print(filename)

#file1=open(filename, "a")
with open(filename, 'a') as file:
    file.write(jenkinsfile)
Error: "errorMessage": "write() argument must be str, not bytes"
Opening the file in binary mode should do the trick:
with open(filename, 'ab') as file:
    file.write(jenkinsfile)
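An alternative sketch, if the file needs to stay in text mode: decode the S3 body before writing, assuming the Groovy file is UTF-8 encoded.
# Decode the bytes returned by S3 so they can be appended in text mode
# (assumes the object is UTF-8 text).
jenkinsfile = object['Body'].read().decode('utf-8')
with open(filename, 'a') as file:
    file.write(jenkinsfile)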

Read h5 file using AWS boto3

I am trying to read an h5 file from AWS S3 using boto3.
client = boto3.client('s3', key='key')
result = client.get_object(Bucket='bucket', Key='file')
with h5py.File(result['Body'], 'r') as f:
    data = f
TypeError: expected str, bytes or os.PathLike object, not StreamingBody
Any idea?
h5py version is 2.10, boto3 version is 1.7.58
The same question was here, but no answer...
The h5py.File() command is expecting a path to a local file on disk. However, you are passing it the data in memory.
You can download the file with:
import boto3

s3_client = boto3.client('s3')
s3_client.download_file('bucket', 'key', 'filename')

with h5py.File('filename', 'r') as f:
    data = f
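If the file is small enough to hold in memory, a hedged alternative is to buffer the object body in a seekable io.BytesIO and hand that to h5py; h5py can open Python file-like objects (since 2.9, so the 2.10 in the question should qualify), but the StreamingBody returned by boto3 is not seekable, which is why passing it directly fails. A minimal sketch using the bucket and key names from the question:
import io
import boto3
import h5py

client = boto3.client('s3')
result = client.get_object(Bucket='bucket', Key='file')

# Buffer the streaming body in memory; BytesIO is seekable, so h5py can
# open it as a file-like object without a local download.
buffer = io.BytesIO(result['Body'].read())
with h5py.File(buffer, 'r') as f:
    print(list(f.keys()))  # list the top-level datasets; replace with your own reads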
A working solution using tempfile for temporary storage. This streams the model data from your S3 bucket into temporary storage and loads it into a variable.
import tempfile
from keras import models
import boto3

# Creating the low level functional client
client = boto3.client(
    's3',
    aws_access_key_id = 'ACCESS_KEY_ID',
    aws_secret_access_key = 'ACCESS_SECRET_KEY',
    region_name = 'us-east-1'
)

# Create the S3 object
response_data = client.get_object(
    Bucket = 'bucket-name',
    Key = 'model/model.h5'
)

model_name = 'model.h5'
response_data = response_data['Body']
response_data = response_data.read()

# save byte file to temp storage
with tempfile.TemporaryDirectory() as tempdir:
    with open(f"{tempdir}/{model_name}", 'wb') as my_data_file:
        my_data_file.write(response_data)
    # load byte file from temp storage into variable
    gotten_model = models.load_model(f"{tempdir}/{model_name}")
    print(gotten_model.summary())
