error in reading csv file content on S3 using boto - python

I am using boto to read a csv file and parse its contents. This is the code I wrote:
import boto
from boto.s3.key import Key
import pandas as pd
import io
conn = boto.connect_s3(keyId, sKeyId)
bucket = conn.get_bucket(bucketName)
# Get the Key object of the given key, in the bucket
k = Key(bucket, srcFileName)
content = k.get_contents_as_string()
reader = pd.read_csv(io.StringIO(content))
for row in reader:
    print(row)
But I am getting an error at the read_csv line:
TypeError: initial_value must be str or None, not bytes
How can I resolve this error and parse the contents of the csv file present on S3?
UPDATE: if I use BytesIO instead of StringIO then the print(row) line only prints the first row of the csv. How do I loop over it?
This is my current code:
import boto3
s3 = boto3.resource('s3',aws_access_key_id = keyId, aws_secret_access_key = sKeyId)
obj = s3.Object(bucketName, srcFileName)
content = obj.get_contents_as_string()
reader = pd.read_csv(io.BytesIO(content), header=None)
count = 0
for index, row in reader.iterrows():
    print(row[1])
When I execute this I get the error AttributeError: 's3.Object' object has no attribute 'get_contents_as_string'.
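For reference, a minimal sketch of the boto3 way to do this (reusing the question's keyId, sKeyId, bucketName and srcFileName placeholders): the resource API has no get_contents_as_string, but Object.get()['Body'].read() returns the same bytes, and iterrows() then walks every row rather than just the first:
import io
import boto3
import pandas as pd

s3 = boto3.resource('s3', aws_access_key_id=keyId, aws_secret_access_key=sKeyId)
obj = s3.Object(bucketName, srcFileName)
# get() returns a dict whose 'Body' is a StreamingBody; read() gives bytes
content = obj.get()['Body'].read()
reader = pd.read_csv(io.BytesIO(content), header=None)
for index, row in reader.iterrows():
    print(row[1])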

Related

how to convert json file to csv with "success":true

I have a problem converting a JSON file to a CSV file in Python. I think the issue is that the JSON is nested, but I don't know how to handle it!
import csv, json, requests
url = requests.get("https://####/api/food_orders")
text = url.text
data = json.loads(text)
order_data = data['data']
# now we will open a file for writing
data_file = open('ordersJsonToCsv.csv', 'w', newline='')
# create the csv writer object
csv_writer = csv.writer(data_file)
# Counter variable used for writing
# headers to the CSV file
count = 0
for ord in order_data:
    if count == 0:
        # Writing headers of CSV file
        header = ord.keys()
        csv_writer.writerow(header)
        count += 1
    # Writing data of CSV file
    csv_writer.writerow(ord.values())
data_file.close()
And the JSON file looks like this:
This code will solve the problem of getting the data only:
import pandas as pd
import json, requests
url = requests.get("https://##/api/orders?")
text = url.text
info = json.loads(text)
df = pd.json_normalize(info['data'])
df.to_csv("samplecsv.csv")
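If the nested parts of the payload still need flattening, pd.json_normalize can also expand nested dicts, and record_path can explode nested lists; a small hedged sketch with made-up field names, since the real API structure isn't shown in the question:
import pandas as pd

# Hypothetical nested payload, shaped like data['data'] in the question
orders = [
    {"id": 1, "customer": {"name": "A", "city": "X"},
     "items": [{"sku": "p1", "qty": 2}, {"sku": "p2", "qty": 1}]},
]

# Nested dicts become dotted columns: customer.name, customer.city
df = pd.json_normalize(orders)

# Nested lists can be exploded into their own rows, keeping parent fields
items = pd.json_normalize(orders, record_path="items", meta=["id"])
items.to_csv("items.csv", index=False)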

Read Excel file in AWS

I wanted to read an excel file in S3 from Glue.
Here's what I've done so far.
import pandas as pd
import awswrangler as wr
import io
ad_request_path = 's3://bucketname/key.xlsx'
df = wr.s3.read_excel(ad_request_path)
OR
import boto3
from xlrd.book import open_workbook_xls

bucket_name = 'bucketname'
object_key = 'key.xlsx'
s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket_name, Key=object_key)
data = obj['Body'].read()
workbook = open_workbook_xls(file_contents=data)
df = pd.read_excel(io.BytesIO(data))
print(df)
I got this error message:
XLRDError: Excel xlsx file; not supported
Managed to make it work. Just add engine = 'openpyxl'
import awswrangler as wr
import openpyxl
ad_request_path = 's3://bucketname/key.xlsx'
df = wr.s3.read_excel(ad_request_path, engine='openpyxl')
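The same engine switch should also work on the plain boto3 path from the question, assuming openpyxl is installed; a hedged sketch:
import io
import boto3
import pandas as pd

s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket='bucketname', Key='key.xlsx')
data = obj['Body'].read()
# xlrd 2.x only reads legacy .xls files, so point pandas at openpyxl instead
df = pd.read_excel(io.BytesIO(data), engine='openpyxl')
print(df)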

Python create list of dictionaries from csv on S3

I am trying to take a CSV and create a list of dictionaries in python with the CSV coming from S3. Code is as follows:
import os
import boto3
import csv
import json
from io import StringIO
import logging
import time
s3 = boto3.resource('s3')
s3Client = boto3.client('s3','us-east-1')
bucket = 'some-bucket'
key = 'some-key'
obj = s3Client.get_object(Bucket = bucket, Key = key)
lines = obj['Body'].read().decode('utf-8').splitlines(True)
newl = []
for line in csv.reader(lines, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True, escapechar="\\"):
    newl.append(line)
fieldnames = newl[0]
newl1 = newl[1:]
reader = csv.DictReader(newl1,fieldnames)
out = json.dumps([row for row in reader])
jlist1 = json.loads(out)
but this gives me the error:
iterator should return strings, not list (did you open the file in text mode?)
if I alter the for loop to this:
for line in csv.reader(lines, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True, escapechar="\\"):
    newl.append(','.join(line))
then it works, however there are some fields that have commas in them so this completely screws up the schema and shifts the data. For example:
|address1    |address2   |state|
--------------------------------
|123 Main st |APT 3, Fl1 |TX   |
becomes:
|address1    |address2 |state|null|
-----------------------------------
|123 Main st |APT 3    |Fl1  |TX  |
Where am I going wrong?
The problem is that you are building a list of lists here:
newl.append(line)
and as the error says: iterator should return strings, not list
so try to cast line as a string:
newl.append(str(line))
Hope this helps :)
I ended up changing the code to this:
obj = s3Client.get_object(Bucket = bucket, Key = key)
lines1 = obj['Body'].read().decode('utf-8').split('\n')
fieldnames = lines1[0].replace('"','').split(',')
testls = [row for row in csv.DictReader(lines1[1:], fieldnames)]
out = json.dumps([row for row in testls])
jlist1 = json.loads(out)
And got the desired result, since csv.DictReader parses the quoting itself and so keeps commas inside quoted fields intact.

Boto3, read gzip from s3 and print content

I'm trying to read a gzip file from S3 - the "native" format of the file is a csv. Ultimately, after uncompressing the file, I'd like to be able to "see" the content so I can read the number of lines in the csv and keep count of it.
My "basic" attempts are here - still just trying to print the contents of the file. This attempt just tells me that there is no such file or directory...
I know I'm also probably erroneously thinking the unzipped csv file will be in json format - but that's the next "issue" once I get to read the unzipped contents...
[Errno 2] No such file or directory: 'SMSUsageReports/eu-west-1/2018/01/02/001.csv.gz'
import gzip
import boto3
import json
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
bucket = s3.Bucket('snssmsreports')
for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    json_object = s3_client.get_object(Bucket=bucket.name, Key=obj.key)
    file_name = obj.key
    obj = bucket.Object(file_name)
    file_body = obj.get()["Body"].read()
    # gzip stuff here
    f = gzip.open(file_name, 'rb')
    file_content = f.read()
    #print file_content
    #jsonFileReader = json_object['Body'].read()
    jsonDict = json.loads(file_content)
    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
    print('{0}:{1}'.format(bucket.name, obj.key))
    print(jsonDict)
OK, so I updated my code as follows:
import zipfile
import gzip
import boto3
import io
import json
import pandas as pd
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
bucket = s3.Bucket('snssmsreports')
for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    json_object = s3_client.get_object(Bucket=bucket.name, Key=obj.key)
    file_name = obj.key
    obj = bucket.Object(file_name)
    s3_client.download_file(bucket.name, file_name, '../../tmp/file.gz')
    gzip_name = '../../tmp/file.gz'
    # gzip stuff here
    with gzip.open(gzip_name, 'rb') as f:
        file_content = f.read()
    str_file = str(file_content)
    csvfile = open('../../tmp/testfile.csv', 'w')
    csvfile.write(str_file)
    csvfile.close()
    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
    #pandas csv reader
    df1 = pd.read_csv('../../tmp/testfile.csv')
    print(df1)
    #print('{0}:{1}'.format(bucket.name, obj.key))
    #print(file_content)
    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
This does not throw any errors anymore, but the output only has one row and 135 columns, so pandas does not like the actual content of the csv, or my conversion to str() is not the right way to do it?
OK, the issue was opening the file for writing - to write bytes I had to open the file as 'wb'...
csvfile = open('../../tmp/testfile.csv','wb')
csvfile.write(file_content)
csvfile.close()
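For what it's worth, the temporary files can probably be skipped entirely, since pandas can decompress gzip from an in-memory buffer; a sketch against the same bucket and prefix, which also gives the row count the question is after:
import io
import boto3
import pandas as pd

s3 = boto3.resource('s3')
bucket = s3.Bucket('snssmsreports')
for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    body = obj.get()['Body'].read()  # still gzip-compressed bytes
    df = pd.read_csv(io.BytesIO(body), compression='gzip')
    print('{0}: {1} rows'.format(obj.key, len(df)))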

Boto3 read a file content from S3 key line by line

With boto3, you can read a file content from a location in S3, given a bucket name and the key, as per (this assumes a preliminary import boto3)
s3 = boto3.resource('s3')
content = s3.Object(BUCKET_NAME, S3_KEY).get()['Body'].read()
This returns a string type. The specific file I need to fetch happens to be a collection of dictionary-like objects, one per line. So it is not a JSON format. Instead of reading it as a string, I'd like to stream it as a file object and read it line by line; I cannot find a way to do this other than downloading the file locally first as
s3 = boto3.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)
filename = 'my-file'
bucket.download_file(S3_KEY, filename)
f = open('my-file')
What I'm asking is whether it's possible to have this kind of control over the file without having to download it locally first?
I found .splitlines() worked for me...
txt_file = s3.Object(bucket, file).get()['Body'].read().decode('utf-8').splitlines()
Without the .splitlines() the whole blob of text was returned, and trying to iterate each line resulted in each character being iterated. With .splitlines(), iteration by line was achievable.
In my example here I iterate through each line and compile it into a dict.
txt_file = s3.Object(bucket, file).get()['Body'].read().decode('utf-8').splitlines()
for line in txt_file:
    arr = line.split()
    print(arr)
You also can take advantage of StreamingBody's iter_lines method:
for line in s3.Object(bucket, file).get()['Body'].iter_lines():
    decoded_line = line.decode('utf-8')  # if decoding is needed
That consumes less memory than reading the whole body at once and then splitting it.
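If the lines are CSV rows, the decoded iter_lines stream can be fed straight to csv.reader, which accepts any iterable of strings; a small sketch assuming a UTF-8 encoded CSV:
import csv
import boto3

s3 = boto3.resource('s3')
body = s3.Object(bucket, file).get()['Body']
# csv.reader consumes the stream line by line, so the whole object
# is never held in memory at once
for row in csv.reader(line.decode('utf-8') for line in body.iter_lines()):
    print(row)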
The following comment from kooshiwoosh to a similar question provides a nice answer:
from io import TextIOWrapper
from gzip import GzipFile
...
# get StreamingBody from botocore.response
response = s3.get_object(Bucket=bucket, Key=key)
# if gzipped
gzipped = GzipFile(None, 'rb', fileobj=response['Body'])
data = TextIOWrapper(gzipped)
for line in data:
    ...  # process line
This will do the work:
bytes_to_read = 512
content = s3.Object(BUCKET_NAME, S3_KEY).get()['Body'].read(bytes_to_read)
This works for me:
json_object = s3.get_object(Bucket = bucket, Key = json_file_name)
json_file_reader = json_object['Body'].read()
content = json.loads(json_file_reader)
As of now, you can use the download_fileobj function. Here is an example for a CSV file:
import boto3
import csv
bucket = 'my_bucket'
file_key = 'my_key/file.csv'
output_file_path = 'output.csv'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket)
# Dump binary in append mode
with open(output_file_path, 'ab') as file_object:
    bucket.download_fileobj(
        Key=file_key,
        Fileobj=file_object,
    )
# Read your file as usual
with open(output_file_path, 'r') as csvfile:
    lines = csv.reader(csvfile)
    for line in lines:
        doWhatEver(line[0])
