How to convert CSV files in S3 to JSON - Python

I need to process the CSV files in a folder in an S3 bucket:
List all the files inside the folder
Convert each CSV file to JSON and save it back to the same bucket
There are many CSV files in the bucket, each like the one below:
emp_id,Name,Company
10,Aka,TCS
11,VeI,TCS
My code is below:
import boto3
import pandas as pd

def lambda_handler(event, context):
    s3 = boto3.resource('s3')
    my_bucket = s3.Bucket('testfolder')
    for file in my_bucket.objects.all():
        print(file.key)
        for csv_f in file.key:
            with open(f'{csv_f.replace(".csv", ".json")}', "w") as f:
                pd.read_csv(csv_f).to_json(f, orient='index')
It is not able to save; if I remove the bucket name it saves to the local folder. How do I save the JSON back to the bucket?

You can check the following code:
from io import StringIO
import boto3
import pandas as pd

s3 = boto3.resource('s3')

def lambda_handler(event, context):
    s3 = boto3.resource('s3')
    input_bucket = 'bucket-with-csv-file-44244'
    my_bucket = s3.Bucket(input_bucket)
    for file in my_bucket.objects.all():
        if file.key.endswith(".csv"):
            csv_f = f"s3://{input_bucket}/{file.key}"
            print(csv_f)
            json_file = file.key.replace(".csv", ".json")
            print(json_file)
            json_buffer = StringIO()
            df = pd.read_csv(csv_f)
            df.to_json(json_buffer, orient='index')
            s3.Object(input_bucket, json_file).put(Body=json_buffer.getvalue())
Your lambda layer will need to have:
fsspec
pandas
s3fs
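If adding the fsspec/s3fs layer is a problem, a rough alternative is to read each object's body with boto3 and parse it in memory, so only pandas is needed in the layer. A sketch using the same placeholder bucket name (untested):
from io import BytesIO, StringIO
import boto3
import pandas as pd

s3 = boto3.resource('s3')

def lambda_handler(event, context):
    input_bucket = 'bucket-with-csv-file-44244'
    my_bucket = s3.Bucket(input_bucket)
    for file in my_bucket.objects.all():
        if file.key.endswith(".csv"):
            # Read the object body and parse the CSV in memory
            body = file.get()["Body"].read()
            df = pd.read_csv(BytesIO(body))
            json_buffer = StringIO()
            df.to_json(json_buffer, orient='index')
            json_key = file.key.replace(".csv", ".json")
            s3.Object(input_bucket, json_key).put(Body=json_buffer.getvalue())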

Related

Sometimes the zip file does not open and shows an error message. The zip file is constructed with the zipfile Python library.

import pandas as pd
from io import BytesIO
import zipfile
from google.cloud import storage
import datetime

class ConstructedFile:
    def __init__(self, data):
        self.raw_data = data
        self.columns = list(self.raw_data.keys())
        self.final_list = dict()
        self.memory_file = None

    def generate_csv(self):
        df = pd.DataFrame(self.raw_data, self.columns)
        self.final_list = {"Name": self.columns, "data": df.to_csv(index=False)}
        print(f"***** {self.final_list} *****")

    def compress_to_zip(self, file_name):
        self.memory_file = BytesIO()
        zf = zipfile.ZipFile(self.memory_file, 'w')
        data = zipfile.ZipInfo(f"{file_name}.csv")
        data.compress_type = zipfile.ZIP_DEFLATED
        zf.writestr(data, self.final_list['data'])

    def upload_blob_from_memory(self, destination_blob_name):
        bucket_name = "Example_Bucket"
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(destination_blob_name)
        blob.upload_from_string(self.memory_file, content_type='application/zip')

    def generate_download_signed_url_v4(self, blob_name):
        bucket_name = "Example_Bucket"
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)
        url = blob.generate_signed_url(version="v4", expiration=datetime.timedelta(minutes=15), method="GET")
        return url


raw_data = {"Fruits": ["apple", "orange", "grapes"], "Quantity": [3, 5, 6], "Rate": [50, 60, 70]}
file_name = "fruits"
obj = ConstructedFile(raw_data)
obj.generate_csv()
obj.compress_to_zip(file_name)
obj.upload_blob_from_memory(f"export/{file_name}.zip")
signed_url = obj.generate_download_signed_url_v4(f"export/{file_name}.zip")
The above is a sample of the flow of my program, as the original code is implemented.
The program flow is:
Fetch raw_data from the database.
The fetched raw_data is then used in the following steps:
generate the CSV ==> obj.generate_csv()
compress it to a zip blob ==> obj.compress_to_zip(file_name)
upload the blob to the GCS bucket ==> obj.upload_blob_from_memory(f"export/{file_name}")
generate the signed_url for the uploaded blob ==> obj.generate_download_signed_url_v4(f"export/{file_name}")
Here is the problem I am facing.
Most of the time, when I download the zip file (fruits.zip) using the signed_url, the file opens fine. But sometimes the downloaded zip file does not open and shows an error message; it seems to be corrupted.
(Image: the error message shown when opening the zip file.)
I have also tried opening the file downloaded directly from the GCS bucket, and it shows the same error message.
I don't know whether my guess is correct or not, but I think the mistake happens while constructing the zip file in memory?
I am stuck with this. Please help me out.
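One plausible cause (a guess, since only sample code is shown): zipfile only writes the archive's central directory when the ZipFile is closed. In compress_to_zip the ZipFile is never closed explicitly, so whether the zip is finalized before the upload depends on when Python happens to garbage-collect it, which would explain the intermittent corruption. A minimal sketch of the two methods with an explicit close and the buffer's bytes passed to upload_from_string (same names as above, untested against the original program):
# inside class ConstructedFile:
def compress_to_zip(self, file_name):
    self.memory_file = BytesIO()
    # The context manager closes the ZipFile, which writes the central
    # directory; without it the archive may be left unfinished.
    with zipfile.ZipFile(self.memory_file, 'w', zipfile.ZIP_DEFLATED) as zf:
        zf.writestr(f"{file_name}.csv", self.final_list['data'])

def upload_blob_from_memory(self, destination_blob_name):
    bucket_name = "Example_Bucket"
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    # Pass the buffer's bytes, not the BytesIO object itself.
    blob.upload_from_string(self.memory_file.getvalue(), content_type='application/zip')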

Read Excel file in AWS

I wanted to read an excel file in S3 from Glue.
Here's what I've done so far.
import pandas as pd
import awswrangler as wr
import boto3
import io
ad_request_path = 's3://bucketname/key.xlsx'
df = wr.s3.read_excel(ad_request_path)
OR
bucket_name = 'bucketname'
object_key = 'key.xlsx'
s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket_name, Key=object_key)
data = obj['Body'].read()
workbook = open_workbook_xls(file_contents=data)
df = pd.read_excel(io.BytesIO(data))
print(df)
I got this error message:
XLRDError: Excel xlsx file; not supported
Managed to make it work. Just add engine='openpyxl':
import awswrangler as wr
import openpyxl
ad_request_path = 's3://bucketname/key.xlsx'
df = wr.s3.read_excel(ad_request_path, engine='openpyxl')
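The boto3 get_object approach from the question should also work for .xlsx once pandas is told to use openpyxl; a minimal sketch, assuming the same bucket and key placeholders:
import io
import boto3
import pandas as pd

bucket_name = 'bucketname'
object_key = 'key.xlsx'

s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket_name, Key=object_key)
data = obj['Body'].read()

# openpyxl handles the .xlsx format; xlrd 2.x only supports legacy .xls
df = pd.read_excel(io.BytesIO(data), engine='openpyxl')
print(df)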

How to write a pandas dataframe to_json() to s3 in json format

I have an AWS Lambda function which creates a data frame. I need to write this data frame to an S3 bucket as a JSON file.
import pandas as pd
import boto3
import io
import datetime

# code to get the df
destination = "output_" + str(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.json'
df.to_json(destination)  # this file should be written to the S3 bucket
The following code runs in AWS Lambda and uploads the JSON file to S3.
The Lambda role should have S3 access permissions.
import pandas as pd
import boto3
import io
import datetime

# code to get the df
destination = "output_" + str(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.json'
json_buffer = io.StringIO()
df.to_json(json_buffer)
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('my-bucket-name')
my_bucket.put_object(Key=destination, Body=json_buffer.getvalue())
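Optionally, you can also pass a content type so S3 stores the object as JSON (not required for the upload itself):
my_bucket.put_object(Key=destination, Body=json_buffer.getvalue(), ContentType='application/json')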
You can use the following code as well.
import io
import boto3
from pyspark.sql.functions import lit

# Create a session using Boto3 with explicit credentials
session = boto3.Session(
    aws_access_key_id='<key ID>',
    aws_secret_access_key='<secret_key>'
)

# Create an s3 resource from the session
s3 = session.resource('s3')

json_buffer = io.StringIO()

# Create a Spark dataframe and convert it to pandas
# (spark is assumed to be an existing SparkSession)
df = spark.range(4).withColumn("organisation", lit("stackoverflow"))
df_p = df.toPandas()
df_p.to_json(json_buffer, orient='records')

# Create the s3 object
obj = s3.Object('<bucket-name>', '<JSON file name>')

# Put the object into the bucket
result = obj.put(Body=json_buffer.getvalue())
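If awswrangler is available (it appears in the Excel answer above), it can write a dataframe to S3 in one call; a minimal sketch with placeholder bucket and key names, assuming wr.s3.to_json forwards orient to pandas:
import awswrangler as wr
import pandas as pd

df = pd.DataFrame({"organisation": ["stackoverflow"] * 4})
# Writes the dataframe as JSON straight to S3 (placeholder path)
wr.s3.to_json(df=df, path="s3://<bucket-name>/<JSON file name>", orient='records')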

Boto3, read gzip from s3 and print content

I'm trying to read a gzip file from S3 - the "native" format of the file is a csv. Ultimately, after uncompressing the file, I'd like to be able to "see" the content so I can read the number of lines in the csv and keep count of it.
My "basic" attempts are here - still just trying to print the contents of the file. This attempt just tells me that there is no such file or directory...
I know I'm also probably erroneously thinking the unzipped csv file will be in json format - but that's the next "issue" once I get to read the unzipped contents...
[Errno 2] No such file or directory: 'SMSUsageReports/eu-west-1/2018/01/02/001.csv.gz'
import gzip
import boto3
import json

s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
bucket = s3.Bucket('snssmsreports')
for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    json_object = s3_client.get_object(Bucket=bucket.name, Key=obj.key)
    file_name = obj.key
    obj = bucket.Object(file_name)
    file_body = obj.get()["Body"].read()
    # gzip stuff here
    f = gzip.open(file_name, 'rb')
    file_content = f.read()
    #print file_content
    #jsonFileReader = json_object['Body'].read()
    jsonDict = json.loads(file_content)
    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
    print('{0}:{1}'.format(bucket.name, obj.key))
    print(jsonDict)
OK, so I updated my code as follows:
import zipfile
import gzip
import boto3
import io
import json
import pandas as pd

s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
bucket = s3.Bucket('snssmsreports')
for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    json_object = s3_client.get_object(Bucket=bucket.name, Key=obj.key)
    file_name = obj.key
    obj = bucket.Object(file_name)
    s3_client.download_file(bucket.name, file_name, '../../tmp/file.gz')
    gzip_name = '../../tmp/file.gz'
    # gzip stuff here
    with gzip.open(gzip_name, 'rb') as f:
        file_content = f.read()
    str_file = str(file_content)
    csvfile = open('../../tmp/testfile.csv', 'w')
    csvfile.write(str_file)
    csvfile.close()
    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
    #pandas csv reader
    df1 = pd.read_csv('../../tmp/testfile.csv')
    print(df1)
    #print('{0}:{1}'.format(bucket.name, obj.key))
    #print(file_content)
    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
This does not throw any errors anymore, but the output only has one row and 135 columns, so pandas does not like the actual content of the csv, or my conversion to str() is not the right way to do it?
OK, the issue was how the file was opened for writing: to write bytes I had to open the file as 'wb'...
csvfile = open('../../tmp/testfile.csv','wb')
csvfile.write(file_content)
csvfile.close()
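For what it's worth, the temporary files can probably be avoided altogether by decompressing the body in memory and handing it straight to pandas; a sketch using the same bucket and prefix (untested):
import gzip
import io
import boto3
import pandas as pd

s3 = boto3.resource('s3')
bucket = s3.Bucket('snssmsreports')
for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    body = obj.get()["Body"].read()
    # Decompress the gzip bytes in memory and parse the CSV directly
    with gzip.GzipFile(fileobj=io.BytesIO(body)) as gz:
        df = pd.read_csv(gz)
    print('{0}: {1} rows'.format(obj.key, len(df)))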

How could I use aws lambda to write file to s3 (python)?

I have tried to use a Lambda function to write a file to S3. The test shows "succeeded", but nothing appears in my S3 bucket. What happened? Can anyone give me some advice or a solution? Thanks a lot. Here's my code.
import json
import boto3

def lambda_handler(event, context):
    string = "dfghj"
    file_name = "hello.txt"
    lambda_path = "/tmp/" + file_name
    s3_path = "/100001/20180223/" + file_name
    with open(lambda_path, 'w+') as file:
        file.write(string)
        file.close()
    s3 = boto3.resource('s3')
    s3.meta.client.upload_file(lambda_path, 's3bucket', s3_path)
I've had success streaming data to S3; it has to be encoded to do this:
import boto3

def lambda_handler(event, context):
    string = "dfghj"
    encoded_string = string.encode("utf-8")

    bucket_name = "s3bucket"
    file_name = "hello.txt"
    s3_path = "100001/20180223/" + file_name

    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=s3_path, Body=encoded_string)
If the data is in a file, you can read this file and send it up:
with open(filename) as f:
    string = f.read()
encoded_string = string.encode("utf-8")
My response is very similar to Tim B's, but the most important part is:
1. Go to S3 and create the bucket you want to write to.
2. Follow the steps below, otherwise your Lambda will fail due to permission/access issues. I've copied the linked content here for you too, just in case the URL changes or the page moves.
a. Open the roles page in the IAM console.
b. Choose Create role.
c. Create a role with the following properties.
-Trusted entity – AWS Lambda.
-Permissions – AWSLambdaExecute.
-Role name – lambda-s3-role.
The AWSLambdaExecute policy has the permissions that the function needs to manage objects in Amazon S3 and write logs to CloudWatch Logs.
Copy and paste this into your Lambda Python function:
import json, boto3, os, sys, uuid
from urllib.parse import unquote_plus

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    some_text = "test"
    # put the bucket name you created in step 1
    bucket_name = "my_buck_name"
    file_name = "my_test_file.csv"
    lambda_path = "/tmp/" + file_name
    s3_path = "output/" + file_name
    os.system('echo testing... >' + lambda_path)
    s3 = boto3.resource("s3")
    s3.meta.client.upload_file(lambda_path, bucket_name, s3_path)
    return {
        'statusCode': 200,
        'body': json.dumps('file is created in:' + s3_path)
    }
from os import path
import json, boto3, sys, uuid
import requests

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    bucket_name = "mybucket"
    url = "https://i.imgur.com/ExdKOOz.png"
    response = requests.get(url)
    filename = get_filename(url)
    img = response.content
    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=filename, Body=img)
    return {'statusCode': 200, 'body': json.dumps('file is created in:')}

def get_filename(url):
    fragment_removed = url.split("#")[0]
    query_string_removed = fragment_removed.split("?")[0]
    scheme_removed = query_string_removed.split("://")[-1].split(":")[-1]
    if scheme_removed.find("/") == -1:
        return ""
    return path.basename(scheme_removed)
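For reference, a quick illustration of what get_filename returns for a few URLs (these example URLs are mine, not from the original answer):
print(get_filename("https://i.imgur.com/ExdKOOz.png"))              # ExdKOOz.png
print(get_filename("https://example.com/a/b/report.csv?x=1#top"))   # report.csv
print(get_filename("https://example.com"))                          # "" (no path component)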
