My use case is that I'm trying to take a screenshot of a view in Tableau and save that screenshot in an S3 bucket. This is done through a Lambda function written in Python. The Lambda is assigned full access rights to S3 and is connected to the Internet.
Everything essentially works: there are no issues with access rights to S3, a connection to the Tableau account can be established, and a file is uploaded to S3. No errors are thrown when the code is tested. There is one issue though: the saved file is an empty 0-byte file.
Here's the code:
import logging
import traceback
import os
import requests
from datetime import datetime, timezone
import pytz
import json
from dotenv import load_dotenv
import tableauserverclient as TSC
from slack.web.client import WebClient
from slack.errors import SlackApiError
import boto3
import nest_asyncio
nest_asyncio.apply()
def lambda_handler(event, context):

    def Tableau2Slack():
        try:
            # Tableau environment variables
            tabAccount=os.environ['tabAccount'],
            tabPass=os.environ['tabPass'],
            tabDomain=os.environ['tabDomain'],
            tabServer=os.environ['tabServer'],
            tabView1=os.environ['tabView1'],
            tabPath1=os.environ['tabPath1']

            s3 = boto3.client('s3')
            bucket=os.environ['bucket']

            # Let's connect to Tableau
            print("Talking to Tableau...\n")
            tableau_auth = TSC.TableauAuth(tabAccount, tabPass, tabDomain)
            server = TSC.Server(tabServer)

            # Searching Tableau Online account for View1
            with server.auth.sign_in(tableau_auth):
                server.use_server_version()
                req_option = TSC.RequestOptions()
                req_option.filter.add(TSC.Filter(TSC.RequestOptions.Field.Name,
                                                 TSC.RequestOptions.Operator.Equals, tabView1))
                all_views, pagination_item = server.views.get(req_option)

                # Error catching for bad View names
                if not all_views:
                    raise LookupError("View with the specified name was not found.")
                view_item = all_views[0]

                image_req_option = TSC.ImageRequestOptions(imageresolution=TSC.ImageRequestOptions.Resolution.High, maxage=1)
                server.views.populate_image(view_item, image_req_option)
                print("Image saved in temporary folder...\n")

                date = datetime.utcnow().strftime('%Y_%m_%d')

                # Save bytes as image
                with open('/tmp' + tabPath1, "wb") as image_file1:
                    s3.upload_file('/tmp' + tabPath1, bucket, date + '_' + tabPath1)

                print("Tableau image successfully saved to s3 as {0}".format(tabPath1), '\n')

        # Tableau try statement error handling
        except:
            traceback.print_exc()

    Tableau2Slack()
    return print('Success!')
I suspect that something goes wrong where the file is opened and then uploaded to S3, but I can't figure out what.
Running the same code locally, but instead of...
with open('/tmp/' + tabPath1, "wb") as image_file1:
s3.upload_file('/tmp/' + tabPath1, bucket, date + '_' + tabPath1)
...replacing it with...
with open(tabPath1, "wb") as image_file1:
image_file1.write(view_item.image)
...saves a proper file of about 250 KB.
Any idea what could be going on? I'm out of ideas...
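For reference, a minimal sketch that combines the two snippets above (the '/tmp/' path plus the local write that produces a proper file), in case the missing write call is the culprit. It reuses the same variable names and is only an illustration of the suspected missing step:

# Sketch only: write the downloaded image bytes to /tmp first, then upload that file to S3
tmp_file = '/tmp/' + tabPath1
with open(tmp_file, "wb") as image_file1:
    image_file1.write(view_item.image)
s3.upload_file(tmp_file, bucket, date + '_' + tabPath1)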
Related
I've modified a piece of code for merging two or more XML files into one. I got it working locally, without using or storing files on Google Cloud Storage.
I'd like to use it via Cloud Functions, which seems to work mostly fine, apart from uploading the final XML file to Google Cloud Storage.
import os
import wget
import logging
from io import BytesIO
from google.cloud import storage
from xml.etree import ElementTree as ET
def merge(event, context):
    client = storage.Client()
    bucket = client.get_bucket('mybucket')

    test1 = bucket.blob("xml-file1.xml")
    inputxml1 = test1.download_as_string()
    root1 = ET.fromstring(inputxml1)

    test2 = bucket.blob("xml-file2.xml")
    inputxml2 = test2.download_as_string()
    root2 = ET.fromstring(inputxml2)

    copy_files = [e for e in root1.findall('./SHOPITEM')]
    src_files = set([e.find('./SHOPITEM') for e in copy_files])
    copy_files.extend([e for e in root2.findall('./SHOPITEM') if e.find('./CODE').text not in src_files])

    files = ET.Element('SHOP')
    files.extend(copy_files)

    blob = bucket.blob("test.xml")
    blob.upload_from_string(files)
I've tried the functions .write and .tostring, but without success.
Sorry for the incomplete question. I've already found a solution, and I can't recall the error message I got.
Here is my solution:
blob.upload_from_string(ET.tostring(files, encoding='UTF-8',xml_declaration=True, method='xml').decode('UTF-8'),content_type='application/xml')
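For context, upload_from_string expects a str or bytes payload rather than an ElementTree Element, which is why the merged element has to be serialized first. An equivalent sketch that passes the serialized bytes directly (same files and blob variables as above):

# ET.tostring returns bytes, which upload_from_string also accepts
xml_bytes = ET.tostring(files, encoding='UTF-8', xml_declaration=True, method='xml')
blob.upload_from_string(xml_bytes, content_type='application/xml')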
My requirement is to export data from BigQuery to GCS in a particular sorted order, which I am not able to get using the automatic export, so I am trying to write a manual export for this.
File format is like below:
HDR001||5378473972abc||20101|182082||
DTL001||436282798101|
DTL002||QS
DTL005||3733|8
DTL002||QA
DTL005||3733|8
DTL002||QP
DTL005||3733|8
DTL001||436282798111|
DTL002||QS
DTL005||3133|2
DTL002||QA
DTL005||3133|8
DTL002||QP
DTL005||3133|0
I am very new to this. I am able to write the file to local disk, but I am not sure how to write it to GCS. I tried to use write_to_file, but I seem to be missing something.
import pandas as pd
import pickle as pkl
import tempfile
from google.colab import auth
from google.cloud import bigquery, storage
#import cloudstorage as gcs

auth.authenticate_user()

df = pd.DataFrame(data=job)

sc = storage.Client(project='temp-project')
with tempfile.NamedTemporaryFile(mode='w+b', buffering=-1, prefix='test', suffix='temp') as fh:
    with open(fh.name, 'w+', newline='') as f:
        dfAsString = df.to_string(header=" ", index=False)
        fh.name = fh.write(dfAsString)
        fh.close()

bucket = sc.get_bucket('my-bucket')
target_fn = 'test.csv'
source_fn = fh.name
destination_blob_name = bucket.blob('test.csv')

bucket.blob(destination_blob_name).upload_from_file(source_fn)
Can someone please help?
Thank You.
I would suggest uploading the object to the Cloud Storage bucket with upload_from_filename instead of upload_from_file. Your code should look like this:
bucket.blob(destination_blob_name).upload_from_filename(source_fn)
Here are links to the documentation on how to upload an object to a Cloud Storage bucket and to the client library docs.
EDIT:
The reason you're getting that error is that somewhere in your code you're passing a Blob object rather than a string. Currently your destination variable is a Blob object; change it to a string instead:
destination_blob_name = bucket.blob('test.csv')
to
destination_blob_name = 'test.csv'
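Putting both changes together, the upload portion of the script would look roughly like this (a sketch reusing the variable names from the question):

bucket = sc.get_bucket('my-bucket')
source_fn = fh.name                  # path of the temp file written above
destination_blob_name = 'test.csv'   # a plain string, not a Blob object

# upload_from_filename takes a path to a file on local disk
bucket.blob(destination_blob_name).upload_from_filename(source_fn)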
I'm trying to download my CIFAR-10 data, which is in S3, to train on it in AWS SageMaker.
I'm using this code to load the data:
import pickle
import s3fs

fs = s3fs.S3FileSystem()

def unpickle(file):
    dict = pickle.load(file, encoding='bytes')
    return dict

with fs.open(f's3://bucket_name/data_batch_1') as f:
    data = unpickle(f)
I'm getting the error "EOFError: Ran out of input" on the unpickle function. I assume the "file" is empty, but I tried different ways to get the data from my bucket, and can't seem to get it right.
Unless you have granted the appropriate IAM permissions for the user to access the S3 bucket, the easiest fix is to grant public access, i.e. make sure all of the bucket's "Block public access" settings are unchecked.
Then, using boto3 is an option for importing the dataset from S3 into SageMaker. Here is an example:
import boto3
import botocore
import pandas as pd
from sagemaker import get_execution_role
role = get_execution_role()
bucket = 'databucketname'
data_key = 'datasetname.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)
train_df = pd.read_csv(data_location)
Hope this helps.
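For the pickled CIFAR-10 batch from the question, a similar boto3-based sketch (the bucket and key names below are placeholders) could look like:

import pickle
import boto3

s3 = boto3.client('s3')
# Placeholder bucket/key; the CIFAR-10 batches are pickled dicts
obj = s3.get_object(Bucket='bucket_name', Key='data_batch_1')
data = pickle.loads(obj['Body'].read(), encoding='bytes')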
I am attempting to pull a file from AWS S3, using Boto3, directly into a BytesIO object. This will eventually be used to manipulate the downloaded data, but for now I'm just trying to serve that file directly to a user via Flask. As I understand it, the code below should work, but it does not: the browser simply displays nothing (and shows that only a few bytes of data were downloaded).
(In this example, my sample file is a png)
from flask import Flask, send_from_directory, abort, Response, send_file, make_response
import boto3, botocore
import os
import io

AWS_ACCESS_KEY = os.environ['AWS_ACCESS_KEY'].rstrip()
AWS_SECRET_KEY = os.environ['AWS_SECRET_KEY'].rstrip()
S3_BUCKET = "static1"

app = Flask(__name__, static_url_path='/tmp')

@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')
def catch_all(path):
    s3 = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY,)
    file = io.BytesIO()
    metadata = s3.head_object(Bucket=S3_BUCKET, Key=path)
    conf = boto3.s3.transfer.TransferConfig(use_threads=False)
    s3.download_fileobj(S3_BUCKET, path, file)
    return send_file(file, mimetype=metadata['ContentType'])

if __name__ == '__main__':
    app.run(debug=True, port=3000, host='0.0.0.0')
If I modify that core routine to write the BytesIO object to disk and then read it back into a new BytesIO object, it works fine, as below:
def catch_all(path):
    s3 = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY,)
    file = io.BytesIO()
    metadata = s3.head_object(Bucket=S3_BUCKET, Key=path)
    conf = boto3.s3.transfer.TransferConfig(use_threads=False)
    s3.download_fileobj(S3_BUCKET, path, file)
    print(file.getvalue())
    fh = open("/tmp/test1.png", "wb")
    fh.write(file.getvalue())
    fh.close()
    fh = open("/tmp/test1.png", "rb")
    f2 = io.BytesIO(fh.read())
    fh.close()
    print(f2.getvalue())
    return send_file(f2, mimetype=metadata['ContentType'])
I've been going around in circles with this for a few days. It's clear that I'm missing something, but I'm not sure what. The script is being run inside a Python 3.8 Docker container with the latest copies of boto3/flask/etc.
Rewinding your BytesIO object should do the trick, with file.seek(0) just before send_file(...).
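Applied to the route in the question, that change would look roughly like this (same names and setup as above):

@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')
def catch_all(path):
    s3 = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY)
    file = io.BytesIO()
    metadata = s3.head_object(Bucket=S3_BUCKET, Key=path)
    s3.download_fileobj(S3_BUCKET, path, file)
    file.seek(0)  # rewind so send_file reads the buffer from the start
    return send_file(file, mimetype=metadata['ContentType'])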
For the record, I'm not sure your boto3/botocore calls follow "best practices"; to try your use case I ended up with:
from boto3.session import Session

session = Session(
    aws_access_key_id=KEY_ID, aws_secret_access_key=ACCESS_KEY, region_name=REGION_NAME
)
s3 = session.resource("s3")

@base_bp.route("/test-stuff")
def test_stuff():
    a_file = io.BytesIO()
    s3_object = s3.Object(BUCKET, PATH)
    s3_object.download_fileobj(a_file)
    a_file.seek(0)
    return send_file(a_file, mimetype=s3_object.content_type)
It works when reading the file from disk because you instantiate your BytesIO with the full content of the file, so it is fully populated and still at position 0.
I want to upload an image on Google Cloud Storage from a python script. This is my code:
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient import discovery

scopes = ['https://www.googleapis.com/auth/devstorage.full_control']
credentials = ServiceAccountCredentials.from_json_keyfile_name('serviceAccount.json', scopes)
service = discovery.build('storage', 'v1', credentials=credentials)

body = {'name': 'my_image.jpg'}
req = service.objects().insert(
    bucket='my_bucket', body=body,
    media_body=googleapiclient.http.MediaIoBaseUpload(
        gcs_image, 'application/octet-stream'))
resp = req.execute()
If gcs_image = open('img.jpg', 'r'), the code works and correctly saves my image to Cloud Storage. How can I directly upload a bytes image (for example, from an OpenCV/NumPy array: gcs_image = cv2.imread('img.jpg'))?
In my case, I wanted to upload a PDF document to Cloud Storage from bytes.
When I tried the below, it created a text file with my byte string in it.
blob.upload_from_string(bytedata)
In order to create an actual PDF file using the byte string I had to do:
blob.upload_from_string(bytedata, content_type='application/pdf')
My byte data was b64-encoded, so I also had to b64decode it first.
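Put together, a minimal sketch of that (assuming bytedata holds the base64-encoded PDF and blob is already a bucket blob):

import base64

# bytedata is assumed to be base64-encoded PDF data
pdf_bytes = base64.b64decode(bytedata)
blob.upload_from_string(pdf_bytes, content_type='application/pdf')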
If you want to upload your image from a file:
import os
from google.cloud import storage
def upload_file_to_gcs(bucket_name, local_path, local_file_name, target_key):
    try:
        client = storage.Client()
        bucket = client.bucket(bucket_name)
        full_file_path = os.path.join(local_path, local_file_name)
        bucket.blob(target_key).upload_from_filename(full_file_path)
        return bucket.blob(target_key).public_url
    except Exception as e:
        print(e)
        return None
but if you want to upload bytes directly:
import os
from google.cloud import storage
def upload_data_to_gcs(bucket_name, data, target_key):
    try:
        client = storage.Client()
        bucket = client.bucket(bucket_name)
        bucket.blob(target_key).upload_from_string(data)
        return bucket.blob(target_key).public_url
    except Exception as e:
        print(e)
        return None
Note that target_key is the prefix plus the name of the uploaded file.
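For example (a sketch; the bucket, paths, and keys are placeholders):

# Upload an image that already exists on disk
url1 = upload_file_to_gcs('my_bucket', '/tmp', 'img.jpg', 'images/img.jpg')

# Upload bytes held in memory
with open('img.jpg', 'rb') as f:
    url2 = upload_data_to_gcs('my_bucket', f.read(), 'images/img_copy.jpg')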
MediaIoBaseUpload expects an io.Base-like object and raises the following error upon receiving an ndarray object:
'numpy.ndarray' object has no attribute 'seek'
To solve it, I am using TemporaryFile and numpy.ndarray.tofile():
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient import discovery
import googleapiclient
import numpy as np
import cv2
from tempfile import TemporaryFile
scopes = ['https://www.googleapis.com/auth/devstorage.full_control']
credentials = ServiceAccountCredentials.from_json_keyfile_name('serviceAccount.json', scopes)
service = discovery.build('storage','v1',credentials = credentials)
body = {'name':'my_image.jpg'}
with TemporaryFile() as gcs_image:
    cv2.imread('img.jpg').tofile(gcs_image)
    req = service.objects().insert(
        bucket='my_bucket', body=body,
        media_body=googleapiclient.http.MediaIoBaseUpload(
            gcs_image, 'application/octet-stream'))
    resp = req.execute()
Be aware that googleapiclient is non-idiomatic and in maintenance mode only (it's not being developed anymore). I would recommend using the idiomatic google-cloud-storage client instead.
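A rough sketch of the same upload with the idiomatic client library, encoding the OpenCV array to JPEG bytes in memory instead of going through a temporary file (the bucket and object names are placeholders):

import cv2
from google.cloud import storage

client = storage.Client()
bucket = client.bucket('my_bucket')

# Encode the in-memory image to JPEG bytes, then upload them directly
ok, buf = cv2.imencode('.jpg', cv2.imread('img.jpg'))
if ok:
    bucket.blob('my_image.jpg').upload_from_string(buf.tobytes(), content_type='image/jpeg')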
Here is how to directly upload a PIL Image from memory:
from google.cloud import storage
import io
from PIL import Image
# Define variables
bucket_name = XXXXX
destination_blob_filename = XXXXX
# Configure bucket and blob
client = storage.Client()
bucket = client.bucket(bucket_name)
blob = bucket.blob(destination_blob_filename)

im = Image.open("test.jpg")
bs = io.BytesIO()
im.save(bs, "jpeg")
blob.upload_from_string(bs.getvalue(), content_type="image/jpeg")
In addition to that, here is how to download blob files directly to memory as PIL Images:
blob = bucket.blob(destination_blob_filename)
downloaded_im_data = blob.download_as_bytes()
downloaded_im = Image.open(io.BytesIO(downloaded_im_data))