I read the minio docs and I see two methods to upload data:
put_object(), which needs an io stream
fput_object(), which reads a file on disk
I want to test minio and upload some data I just created with numpy.random.bytes().
How do I upload data which is stored in a variable in the python interpreter?
Take a look at io.BytesIO. It lets you wrap a bytes object in a stream which you can give to minio.
For example:
import io
from minio import Minio
value = "Some text I want to upload"
value_as_bytes = value.encode('utf-8')
value_as_a_stream = io.BytesIO(value_as_bytes)
client = Minio("my-url-here", ...) # Edit this bit to connect to your Minio server
client.put_object("my_bucket", "my_key", value_as_a_stream , length=len(value_as_bytes))
I was in a similar situation: trying to store a pandas DataFrame as a feather file into minio.
I needed to store bytes directly using the Minio client. In the end the code looked like this:
from io import BytesIO

import pandas as pd
import numpy as np
import minio

# Create the client
client = minio.Minio(
    endpoint="localhost:9000",
    access_key="access_key",
    secret_key="secret_key",
    secure=False
)

# Create sample dataset
df = pd.DataFrame({
    "a": np.random.random(size=1000),
})
# Create a BytesIO instance that will behave like a file opened in binary mode
feather_output = BytesIO()
# Write the feather file into the in-memory buffer
df.to_feather(feather_output)
# Get the number of bytes written
nb_bytes = feather_output.tell()
# Go back to the start of the buffer
feather_output.seek(0)
# Put the object into minio
client.put_object(
    bucket_name="datasets",
    object_name="demo.feather",
    length=nb_bytes,
    data=feather_output
)
I had to use .seek(0) so that minio reads the correct number of bytes from the start of the buffer.
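As a usage follow-up, here is a sketch of reading the object back into a DataFrame, assuming the same client and bucket as above (get_object returns a urllib3 response whose bytes can be wrapped in a BytesIO for pandas):
# Read the feather file back from minio (sketch, same client as above)
response = client.get_object(bucket_name="datasets", object_name="demo.feather")
try:
    df_roundtrip = pd.read_feather(BytesIO(response.read()))
finally:
    response.close()
    response.release_conn()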
@gcharbon: this solution did not work for me. client.put_object() only accepts bytes-like objects.
Here is my solution:
from minio import Minio
import pandas as pd
import io
# df is an existing pandas DataFrame; a string with csv data can be used here as well
csv_bytes = df.to_csv().encode('utf-8')
csv_buffer = io.BytesIO(csv_bytes)
# Create the client
client = Minio(
    endpoint="localhost:9000",
    access_key="access_key",
    secret_key="secret_key",
    secure=False
)
client.put_object("bucketname",
                  "objectname",
                  data=csv_buffer,
                  length=len(csv_bytes),
                  content_type='application/csv')
I'm trying to download a blob file & store it locally on my machine. The file format is HDF5 (a format with which I have limited/no experience so far).
So far I've been successful in downloading something using the scripts below. The key issue is that it doesn't seem to be the full file: when downloading the file directly from storage explorer it is circa 4,000 KB, while the HDF5 file I save is 2 KB.
What am I doing wrong? Am I missing a readall() somewhere?
This is my first time working with blob storage & HDF5 files, so I'm a little stuck right now. A lot of the old questions seem to use deprecated commands, as the azure.storage.blob module has been updated.
from azure.storage.blob import BlobServiceClient
from io import StringIO, BytesIO
import h5py
# Initialise client
blob_service_client = BlobServiceClient.from_connection_string("my_conn_str")
# Initialise container
blob_container_client = blob_service_client.get_container_client("container_name")
# Get blob
blob_client = blob_container_client.get_blob_client("file_path")
# Download
download_stream = blob_client.download_blob()
# Create empty stream
stream = BytesIO()
# Read downloaded blob into stream
download_stream.readinto(stream)
# Create new empty hdf5 file
hf = h5py.File('data.hdf5', 'w')
# Write stream into empty HDF5
hf.create_dataset('dataset_1',stream)
# Close Blob (& save)
hf.close()
I tried to reproduce the scenario in my environment and faced the same issue with the code you tried.
So I tried another solution: read the HDF5 file as a stream and write its contents into another HDF5 file.
Try this solution. I used a test file containing some dummy data.
from azure.storage.blob import BlobServiceClient
from io import BytesIO
import h5py

# Initialise client
blob_service_client = BlobServiceClient.from_connection_string("Connection String")
# Initialise container ("test" is the container name)
blob_container_client = blob_service_client.get_container_client("test")
# Get blob ("test.hdf5" is the blob name)
blob_client = blob_container_client.get_blob_client("test.hdf5")

# Download the entire blob into an in-memory stream
# (note: the whole file is held in memory, which can be a problem
# for files that are many gigabytes large)
downloader = blob_client.download_blob()
stream = BytesIO()
downloader.readinto(stream)
print("downloaded the blob")

# Rewind the stream before handing it to h5py
stream.seek(0)

# Open the stream as an HDF5 file and read the data
with h5py.File(stream, "r") as f:
    # List all groups
    print("Keys: %s" % f.keys())
    a_group_key = list(f.keys())[0]
    # Get the data
    data_matrix = list(f[a_group_key])
    print(data_matrix)

# Write the data into a new HDF5 file
with h5py.File("file1.hdf5", "w") as data_file:
    data_file.create_dataset("group_name", data=data_matrix)
My requirement is to export the data from BQ to GCS in a particular sorted order, which I am not able to get using the automatic export, so I am trying to write a manual export for this.
File format is like below:
HDR001||5378473972abc||20101|182082||
DTL001||436282798101|
DTL002||QS
DTL005||3733|8
DTL002||QA
DTL005||3733|8
DTL002||QP
DTL005||3733|8
DTL001||436282798111|
DTL002||QS
DTL005||3133|2
DTL002||QA
DTL005||3133|8
DTL002||QP
DTL005||3133|0
I am very new to this; I am able to write the file to local disk, but I am not sure how I can write it to GCS. I tried to use write_to_file but I seem to be missing something.
import pandas as pd
import pickle as pkl
import tempfile
from google.colab import auth
from google.cloud import bigquery, storage
#import cloudstorage as gcs
auth.authenticate_user()

df = pd.DataFrame(data=job)
sc = storage.Client(project='temp-project')

with tempfile.NamedTemporaryFile(mode='w+b', buffering=-1, prefix='test', suffix='temp') as fh:
    with open(fh.name, 'w+', newline='') as f:
        dfAsString = df.to_string(header=" ", index=False)
        fh.name = fh.write(dfAsString)
    fh.close()

bucket = sc.get_bucket('my-bucket')
target_fn = 'test.csv'
source_fn = fh.name
destination_blob_name = bucket.blob('test.csv')
bucket.blob(destination_blob_name).upload_from_file(source_fn)
Can someone please help?
Thank You.
I would suggest uploading the object through a Cloud Storage bucket. Instead of upload_from_file, you need to use upload_from_filename. Your code should look like this:
bucket.blob(destination_blob_name).upload_from_filename(source_fn)
See the documentation on how to upload an object to a Cloud Storage bucket, and the client library docs.
EDIT:
The reason you're getting that error is that somewhere in your code you're passing a Blob object rather than a String. Currently your destination variable is a Blob object; change it to a String instead:
destination_blob_name = bucket.blob('test.csv')
to
destination_blob_name = 'test.csv'
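Putting both fixes together, a minimal sketch of the corrected upload (the bucket name and local path are placeholders):
from google.cloud import storage

sc = storage.Client(project='temp-project')
bucket = sc.get_bucket('my-bucket')

destination_blob_name = 'test.csv'  # a plain string, not a Blob object
source_fn = 'test_local.csv'        # placeholder path to the local file

bucket.blob(destination_blob_name).upload_from_filename(source_fn)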
I want to download a CSV file stored in Azure storage into a stream and use it directly in my python script, but after I did this with help from Thomas, I cannot use the pandas read_csv method. The error message is: pandas.io.common.EmptyDataError: No columns to parse from file. Thus I assumed the downloaded CSV stream is actually empty, but after checking in the storage account, the CSV file is fine with all the data inside it. What is the problem here? Below is the code from Thomas:
from azure.storage.blob import BlockBlobService
import io
from io import BytesIO, StringIO
import pandas as pd
from shutil import copyfileobj
with BytesIO() as input_blob:
    with BytesIO() as output_blob:
        block_blob_service = BlockBlobService(account_name='my account', account_key='mykey')
        block_blob_service.get_blob_to_stream('my counter', 'datatest1.csv', input_blob)
        df = pd.read_csv(input_blob)
        print(df)
        copyfileobj(input_blob, output_blob)
        #print(output_blob)
        # Create the new blob
        block_blob_service.create_blob_from_stream('my counter', 'datatest2.csv', output_blob)
If I don't execute the read_csv code, create_blob_from_stream will create an empty file, but if I execute the read_csv code, I get this error:
pandas.parser.TextReader.cinit (pandas\parser.c:6171)
pandas.io.common.EmptyDataError: No columns to parse from file
The downloaded file is stored fine in blob storage, with all the data in it.
I finally figured it out, after spending so much time on this!
You have to execute:
input_blob.seek(0)
to use the stream after saving the stream to input_blob.
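For reference, a minimal sketch of where the seek(0) call goes, using the same names as the code above:
from io import BytesIO
import pandas as pd
from azure.storage.blob import BlockBlobService

block_blob_service = BlockBlobService(account_name='my account', account_key='mykey')

with BytesIO() as input_blob:
    block_blob_service.get_blob_to_stream('my counter', 'datatest1.csv', input_blob)
    # The download leaves the stream position at the end; rewind before reading
    input_blob.seek(0)
    df = pd.read_csv(input_blob)
    print(df)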
I feel kind of stupid right now. I have been reading numerous documentation pages and Stack Overflow questions but I can't get it right.
I have a file on Google Cloud Storage. It is in a bucket 'test_bucket'. Inside this bucket there is a folder, 'temp_files_folder', which contains two files, one .txt file named 'test.txt' and one .csv file named 'test.csv'. There are two files simply because I tried using both, but the result is the same either way.
The content in the files is
hej
san
and I am hoping to read it into python the same way I would locally with
textfile = open("/file_path/test.txt", 'r')
times = textfile.read().splitlines()
textfile.close()
print(times)
which gives
['hej', 'san']
I have tried using
from google.cloud import storage
client = storage.Client()
bucket = client.get_bucket('test_bucket')
blob = bucket.get_blob('temp_files_folder/test.txt')
print(blob.download_as_string)
but it gives the output
<bound method Blob.download_as_string of <Blob: test_bucket, temp_files_folder/test.txt>>
How can I get the actual string(s) in the file?
download_as_string is a method; you need to call it.
print(blob.download_as_string())
More likely, you want to assign it to a variable so that you download it once and can then print it and do whatever else you want with it:
downloaded_blob = blob.download_as_string()
print(downloaded_blob)
do_something_else(downloaded_blob)
The method download_as_string() will read in the content as bytes.
Find below an example to process a .csv file.
import csv
from io import StringIO
from google.cloud import storage

storage_client = storage.Client()
bucket = storage_client.get_bucket(YOUR_BUCKET_NAME)
blob = bucket.blob(YOUR_FILE_NAME)

blob = blob.download_as_string()  # download the content as bytes
blob = blob.decode('utf-8')       # transform the bytes to a string here
blob = StringIO(blob)             # wrap the string in a file-like object

names = csv.reader(blob)          # then use the csv library to read the content
for name in names:
    print(f"First Name: {name[0]}")
According to the documentation (https://googleapis.dev/python/storage/latest/blobs.html), as of the time of writing (2021/08) the download_as_string method is a deprecated alias for the download_as_bytes method which, as suggested by the name, returns a bytes object.
You can instead use the download_as_text method to return a str object.
For instance, to download the file MYFILE from bucket MYBUCKET and store it as a utf-8 encoded string:
from google.cloud.storage import Client
client = Client()
bucket = client.get_bucket(MYBUCKET)
blob = bucket.get_blob(MYFILE)
downloaded_file = blob.download_as_text(encoding="utf-8")
You can then also use this to read different file formats. For json, replace the last line with:
import json
downloaded_json_file = json.loads(blob.download_as_text(encoding="utf-8"))
For yaml files, replace the last line with:
import yaml
downloaded_yaml_file = yaml.safe_load(blob.download_as_text(encoding="utf-8"))
DON'T USE: blob.download_as_string()
USE: blob.download_as_text()
blob.download_as_text() does indeed return a string.
blob.download_as_string() is deprecated and returns a bytes object instead of a string object.
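A quick sketch of the difference (the bucket and blob names are placeholders):
from google.cloud import storage

client = storage.Client()
blob = client.get_bucket("my-bucket").get_blob("my-file.txt")  # placeholder names

data_bytes = blob.download_as_bytes()  # bytes object (download_as_string is a deprecated alias)
data_text = blob.download_as_text()    # str object
print(type(data_bytes), type(data_text))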
This works when reading a docx / text file:
from google.cloud import storage
# create storage client
storage_client = storage.Client.from_service_account_json('**PATH OF JSON FILE**')
bucket = storage_client.get_bucket('**BUCKET NAME**')
# get bucket data as blob
blob = bucket.blob('**SPECIFYING THE DOCX FILENAME**')
downloaded_blob = blob.download_as_string()
downloaded_blob = downloaded_blob.decode("utf-8")
print(downloaded_blob)
I have zip files uploaded to S3. I'd like to download them for processing. I don't need to permanently store them, but I need to temporarily process them. How would I go about doing this?
Because working software > comprehensive documentation:
Boto2
import zipfile
import io

import boto
from boto.s3.key import Key

# Connect to s3
# This will need your s3 credentials to be set up
# with `aws configure` using the aws CLI.
#
# See: https://aws.amazon.com/cli/
conn = boto.connect_s3()

# Get hold of the bucket
bucket = conn.get_bucket("my_bucket_name")

# Get hold of a given file
key = Key(bucket)
key.key = "my_s3_object_key"

# Create an in-memory bytes IO buffer
with io.BytesIO() as b:
    # Read the file into it
    key.get_file(b)
    # Reset the file pointer to the beginning
    b.seek(0)
    # Read the file as a zipfile and process the members
    with zipfile.ZipFile(b, mode='r') as zipf:
        for subfile in zipf.namelist():
            do_stuff_with_subfile()
Boto3
import zipfile
import io

import boto3

# This is just for demo purposes. Real use should use the config
# environment variables or config file.
#
# See: http://boto3.readthedocs.org/en/latest/guide/configuration.html
session = boto3.session.Session(
    aws_access_key_id="ACCESSKEY",
    aws_secret_access_key="SECRETKEY"
)

s3 = session.resource("s3")
bucket = s3.Bucket('stackoverflow-brice-test')
obj = bucket.Object('smsspamcollection.zip')

with io.BytesIO(obj.get()["Body"].read()) as tf:
    # Rewind the file pointer
    tf.seek(0)
    # Read the file as a zipfile and process the members
    with zipfile.ZipFile(tf, mode='r') as zipf:
        for subfile in zipf.namelist():
            print(subfile)
Tested on MacOSX with Python3.
If speed is a concern, a good approach would be to choose an EC2 instance fairly close to your S3 bucket (in the same region) and use that instance to unzip/process your zipped files.
This will reduce latency and allow you to process the files fairly efficiently. You can remove each extracted file after finishing your work.
Note: This will only work if you are fine using EC2 instances.
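A minimal sketch of that approach, run on the EC2 instance itself (the bucket and key names are placeholders):
import os
import tempfile
import zipfile

import boto3

s3 = boto3.client('s3')

# Download the archive to the instance's local disk, extract, then clean up
with tempfile.TemporaryDirectory() as workdir:
    local_zip = os.path.join(workdir, 'archive.zip')
    s3.download_file('my-bucket', 'my-archive.zip', local_zip)
    with zipfile.ZipFile(local_zip, 'r') as zf:
        zf.extractall(workdir)
    # ... process the extracted files in workdir here ...
# The temporary directory and everything in it are removed automatically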
Pandas provides a shortcut for this, which removes most of the code from the top answer, and allows you to be agnostic about whether your file path is on s3, gcp, or your local machine.
import io
import zipfile

import pandas as pd

obj = pd.io.parsers.get_filepath_or_buffer(file_path)[0]
with io.BytesIO(obj.read()) as byte_stream:
    # Use your byte stream to, for example, print the member file names
    with zipfile.ZipFile(byte_stream, mode='r') as zipf:
        for subfile in zipf.namelist():
            print(subfile)
I believe you have heard of boto, which is the Python interface to Amazon Web Services.
You can fetch the key from s3 into a local file.
import os
import boto
from zipfile import ZipFile

s3 = boto.connect_s3()               # connect
bucket = s3.get_bucket(bucket_name)  # get bucket
key = bucket.get_key(key_name)       # get key (the file in s3)
key.get_file(local_name)             # save it to a temporary local file

with ZipFile(local_name, 'r') as myzip:
    # do something with myzip
    pass

os.unlink(local_name)                # delete the local file
You can also use tempfile. For more detail, see create & read from tempfile
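For instance, a sketch of the same flow with tempfile.NamedTemporaryFile, which deletes the file for you when the context exits (bucket_name and key_name are placeholders, as above):
import tempfile
import boto
from zipfile import ZipFile

s3 = boto.connect_s3()               # connect
bucket = s3.get_bucket(bucket_name)  # get bucket
key = bucket.get_key(key_name)       # get key (the file in s3)

# delete=True (the default) removes the file when the with-block exits,
# so there is no need for os.unlink()
with tempfile.NamedTemporaryFile(suffix='.zip') as tmp:
    key.get_file(tmp)  # download into the temporary file
    tmp.seek(0)
    with ZipFile(tmp.name, 'r') as myzip:
        print(myzip.namelist())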
Reading a certain file from a zip file in an S3 bucket:
import boto3
import os
import zipfile
import io

'''
When you configure awscli, you'll set up a credentials file located at
~/.aws/credentials. By default, this file will be used by Boto3 to authenticate.
'''
os.environ['AWS_PROFILE'] = "<profile_name>"
os.environ['AWS_DEFAULT_REGION'] = "<region_name>"

# Let's use Amazon S3
s3_name = "<bucket_name>"
zip_file_name = "<zip_file_name>"
file_to_open = "<file_to_open>"

s3 = boto3.resource('s3')
obj = s3.Object(s3_name, zip_file_name)

with io.BytesIO(obj.get()["Body"].read()) as tf:
    # Rewind the file pointer
    tf.seek(0)
    # Read the file as a zipfile and process the members
    with zipfile.ZipFile(tf, mode='r') as zipf:
        file_contents = zipf.read(file_to_open).decode("utf-8")
        print(file_contents)
Reference: @brice's answer.
Adding on to @brice's answer:
Here is the code if you want to read the data inside the file line by line:
with zipfile.ZipFile(tf, mode='r') as zipf:
    for line in zipf.read("xyz.csv").split(b"\n"):
        print(line)
        break  # to break off after the first line
Hope this helps!