I'm trying to read a gzip file from S3 - the "native" format of the file is a CSV. Ultimately, after uncompressing the file, I'd like to be able to "see" the content so I can read the number of lines in the CSV and keep a count of it.
My "basic" attempts are here - still just trying to print the contents of the file. This attempt just tells me that there is no such file or directory...
I know I'm also probably erroneously thinking the unzipped csv file will be in json format - but that's the next "issue" once I get to read the unzipped contents...
[Errno 2] No such file or directory: 'SMSUsageReports/eu-west-1/2018/01/02/001.csv.gz'
import gzip
import boto3
import json
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
bucket = s3.Bucket('snssmsreports')
for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    json_object = s3_client.get_object(Bucket=bucket.name, Key=obj.key)
    file_name = obj.key
    obj = bucket.Object(file_name)
    file_body = obj.get()["Body"].read()
    # gzip stuff here
    f = gzip.open(file_name, 'rb')
    file_content = f.read()
    #print file_content
    #jsonFileReader = json_object['Body'].read()
    jsonDict = json.loads(file_content)
    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
    print('{0}:{1}'.format(bucket.name, obj.key))
    print(jsonDict)
OK, so I updated my code as follows:
import zipfile
import gzip
import boto3
import io
import json
import pandas as pd
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
bucket = s3.Bucket('snssmsreports')
for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    json_object = s3_client.get_object(Bucket=bucket.name, Key=obj.key)
    file_name = obj.key
    obj = bucket.Object(file_name)
    s3_client.download_file(bucket.name, file_name, '../../tmp/file.gz')
    gzip_name = '../../tmp/file.gz'
    # gzip stuff here
    with gzip.open(gzip_name, 'rb') as f:
        file_content = f.read()
    str_file = str(file_content)
    csvfile = open('../../tmp/testfile.csv', 'w')
    csvfile.write(str_file)
    csvfile.close()
    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
    #pandas csv reader
    df1 = pd.read_csv('../../tmp/testfile.csv')
    print(df1)
    #print('{0}:{1}'.format(bucket.name, obj.key))
    #print(file_content)
    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
This does not throw any errors anymore, but the output only has one row and 135 columns, so pandas is not liking the actual content of the CSV - or is my conversion with str() not the right way to do it?
OK, the issue was how I opened the file for writing - to write bytes I had to open the file as 'wb'...
csvfile = open('../../tmp/testfile.csv','wb')
csvfile.write(file_content)
csvfile.close()
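For the original goal of counting lines, here is a minimal sketch (assuming the same bucket and prefix as above) that skips the temporary files entirely: it decompresses each object in memory and hands the bytes to pandas, and the row count is just the length of the resulting DataFrame.
import gzip
import io
import boto3
import pandas as pd

s3 = boto3.resource('s3')
bucket = s3.Bucket('snssmsreports')

for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    # read the compressed object into memory and decompress it
    compressed = obj.get()['Body'].read()
    decompressed = gzip.GzipFile(fileobj=io.BytesIO(compressed)).read()
    # let pandas parse the CSV straight from the decompressed bytes
    df = pd.read_csv(io.BytesIO(decompressed))
    print('{0}: {1} data rows'.format(obj.key, len(df)))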
I'm having a problem with the BytesIO library in Python. I want to take a PDF file that I have retrieved from an S3 bucket and convert it into a dataframe using a custom function convert_bytes_to_df. The first PDF file converts to a CSV fine; however, subsequent CSVs look like they have been appended to each other. I have tried to reset the IO with seek and truncate but it doesn't seem to work. What am I doing wrong?
import json
import logging

import boto3
from io import BytesIO, StringIO

LOGGER = logging.getLogger(__name__)
logging.basicConfig(level=logging.ERROR)
logging.getLogger(__name__).setLevel(logging.DEBUG)

session = boto3.Session()
s3 = session.resource('s3')
src_bucket = s3.Bucket('input-bucket')
dest_bucket = s3.Bucket('output-bucket')
csv_buffer = StringIO()

def lambda_handler(event, context):
    msg = event['Records'][0]['Sns']['Message']
    pdf_files = json.loads(msg)['pdf_files']
    location = json.loads(msg)['location']
    total_files = len(pdf_files)
    LOGGER.info('Processing: {}'.format(json.dumps(pdf_files)))
    for pdf_file in pdf_files:
        file_name = pdf_file['key']
        obj = s3.Object(src_bucket.name, file_name)
        fs = BytesIO(obj.get()['Body'].read())
        df = convert_bytes_to_df(fs)
        df.to_csv(csv_buffer, index=False)
        s3.Object(dest_bucket.name, location + "/" + file_name.split('.')[0] + ".csv").put(Body=csv_buffer.getvalue())
        fs.seek(0)
        fs.truncate(0)
        LOGGER.info('Processed: {} in {}'.format(file_name, location))
    LOGGER.info('Converted {} files: {}'.format(total_files, json.dumps(pdf_files)))
    src_bucket.objects.all().delete()
    LOGGER.info('Deleted all files from {}'.format(src_bucket.name))
Move
csv_buffer = StringIO()
inside the for loop.
csv_buffer is initialized only once; you need it inside the for loop so that it is re-initialized for each element in the loop.
e.g.:
for pdf_file in pdf_files:
    csv_buffer = StringIO()
    file_name = pdf_file['key']
    obj = s3.Object(src_bucket.name, file_name)
    fs = BytesIO(obj.get()['Body'].read())
    df = convert_bytes_to_df(fs)
    df.to_csv(csv_buffer, index=False)
    s3.Object(dest_bucket.name, location + "/" + file_name.split('.')[0] + ".csv").put(Body=csv_buffer.getvalue())
    fs.seek(0)
    fs.truncate(0)
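Alternatively - and this is just a sketch of the same idea - you can keep the single shared buffer and reset it at the top of each iteration instead of allocating a new StringIO per file:
for pdf_file in pdf_files:
    # empty the shared buffer before writing the next CSV into it
    csv_buffer.seek(0)
    csv_buffer.truncate(0)
    file_name = pdf_file['key']
    obj = s3.Object(src_bucket.name, file_name)
    fs = BytesIO(obj.get()['Body'].read())
    df = convert_bytes_to_df(fs)
    df.to_csv(csv_buffer, index=False)
    s3.Object(dest_bucket.name, location + "/" + file_name.split('.')[0] + ".csv").put(Body=csv_buffer.getvalue())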
I'm iterating through an Excel file that I'm pulling from S3. I want to append this data into one file. The data isn't enough to exceed Lambda memory limits, so I'm saving it into a variable and then converting the string into a CSV file that I'm looking to upload to S3. When I run a variation of this code locally it works perfectly; I'm not sure what's going wrong when I move it to AWS.
import csv
import io
import os
import tempfile

import boto3
import openpyxl
import urllib3

s3 = boto3.client('s3')
bucket = os.environ['S3_BUCKET']
http = urllib3.PoolManager()

def lambda_handler(event, context):
    file = readS3('example.xlsx')  # load file with Boto3
    latest_scan = openpyxl.load_workbook(io.BytesIO(file), data_only=True)
    sh = latest_scan.active
    a = []
    for row in sh['A']:
        r5 = http.request(
            'GET',
            'https://example.com/api/' + str(row.value),
            headers={
                'Accept': 'text/csv'
            }
        )
        a.append(r5.data.decode('utf-8'))
    s = ''.join(a)
    temp = tempfile.TemporaryFile(mode='w+', suffix='.csv')
    with open(temp, 'w', encoding="utf-8") as f:
        for line in s:
            f.write(line)
    temp.seek(0)
    s3.put_object(temp, Bucket=bucket, Key='test.csv')
    temp.close()
I'm getting:
"errorMessage": "expected str, bytes or os.PathLike object, not _io.TextIOWrapper",
"errorType": "TypeError",
"stackTrace": [
" File \"/var/task/lambda_function.py\", line in lambda_handler\n with open(temp,
'w', encoding=\"utf-8\") as f:\n"
]
tempfile.TemporaryFile() opens the file; it doesn't return a filename. So just assign its return value to f:
with tempfile.TemporaryFile(mode='w+', suffix='.csv', encoding="utf-8") as f:
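A sketch of how the end of the handler might look with that change - this assumes s, bucket and s3 are defined as in the question, and passes the file's contents to put_object via the Body keyword (boto3 client methods only accept keyword arguments):
with tempfile.TemporaryFile(mode='w+', suffix='.csv', encoding="utf-8") as f:
    # write the accumulated CSV text into the temporary file
    f.write(s)
    # rewind so the read below starts from the beginning
    f.seek(0)
    s3.put_object(Bucket=bucket, Key='test.csv', Body=f.read())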
I need to find the CSV files in a folder:
List all the files inside the folder
Convert the files to JSON and save them in the same bucket
There are many CSV files like the one below:
emp_id,Name,Company
10,Aka,TCS
11,VeI,TCS
Code is below
import boto3
import pandas as pd

def lambda_handler(event, context):
    s3 = boto3.resource('s3')
    my_bucket = s3.Bucket('testfolder')
    for file in my_bucket.objects.all():
        print(file.key)
        for csv_f in file.key:
            with open(f'{csv_f.replace(".csv", ".json")}', "w") as f:
                pd.read_csv(csv_f).to_json(f, orient='index')
I'm not able to save back to the bucket; if I remove the bucket name it saves to the local folder. How do I save the output back to the bucket?
You can check the following code:
from io import StringIO

import boto3
import pandas as pd

s3 = boto3.resource('s3')

def lambda_handler(event, context):
    s3 = boto3.resource('s3')
    input_bucket = 'bucket-with-csv-file-44244'
    my_bucket = s3.Bucket(input_bucket)
    for file in my_bucket.objects.all():
        if file.key.endswith(".csv"):
            csv_f = f"s3://{input_bucket}/{file.key}"
            print(csv_f)
            json_file = file.key.replace(".csv", ".json")
            print(json_file)
            json_buffer = StringIO()
            df = pd.read_csv(csv_f)
            df.to_json(json_buffer, orient='index')
            s3.Object(input_bucket, json_file).put(Body=json_buffer.getvalue())
Your lambda layer will need to have:
fsspec
pandas
s3fs
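If you'd rather not add the fsspec/s3fs dependencies, here is a sketch of the same loop that reads each object's body through boto3 and hands the bytes to pandas directly (same bucket name assumed as above):
from io import BytesIO, StringIO

import boto3
import pandas as pd

def lambda_handler(event, context):
    s3 = boto3.resource('s3')
    input_bucket = 'bucket-with-csv-file-44244'
    my_bucket = s3.Bucket(input_bucket)
    for file in my_bucket.objects.all():
        if file.key.endswith(".csv"):
            # read the object body via boto3 instead of the s3:// URL
            body = file.get()['Body'].read()
            df = pd.read_csv(BytesIO(body))
            json_buffer = StringIO()
            df.to_json(json_buffer, orient='index')
            json_file = file.key.replace(".csv", ".json")
            s3.Object(input_bucket, json_file).put(Body=json_buffer.getvalue())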
With boto3, you can read a file's content from a location in S3, given a bucket name and the key, as follows (this assumes a preliminary import boto3):
s3 = boto3.resource('s3')
content = s3.Object(BUCKET_NAME, S3_KEY).get()['Body'].read()
This returns a string type. The specific file I need to fetch happens to be a collection of dictionary-like objects, one per line, so it is not in JSON format. Instead of reading it as a string, I'd like to stream it as a file object and read it line by line; I cannot find a way to do this other than downloading the file locally first:
s3 = boto3.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)
filename = 'my-file'
bucket.download_file(S3_KEY, filename)
f = open('my-file')
What I'm asking is whether it's possible to have this type of control over the file without having to download it locally first.
I found .splitlines() worked for me...
txt_file = s3.Object(bucket, file).get()['Body'].read().decode('utf-8').splitlines()
Without .splitlines(), the whole blob of text was returned and trying to iterate over it resulted in each character being iterated. With .splitlines(), iteration by line was achievable.
In my example here I iterate through each line and compile it into a dict.
txt_file = s3.Object(bucket, file).get()['Body'].read().decode('utf-8').splitlines()

for line in txt_file:
    arr = line.split()
    print(arr)
You can also take advantage of StreamingBody's iter_lines method:
for line in s3.Object(bucket, file).get()['Body'].iter_lines():
    decoded_line = line.decode('utf-8')  # if decoding is needed
That consumes less memory than reading the whole file at once and then splitting it.
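For the "dictionary-like objects, one per line" case described in the question, a small sketch of how the streamed lines might be parsed - this assumes each line is a Python-literal dict (not JSON), which is why ast.literal_eval is used rather than json.loads:
import ast

import boto3

s3 = boto3.resource('s3')
records = []
for line in s3.Object(BUCKET_NAME, S3_KEY).get()['Body'].iter_lines():
    decoded_line = line.decode('utf-8')
    if decoded_line:
        # parse each dictionary-like line into an actual dict
        records.append(ast.literal_eval(decoded_line))
print(len(records), 'records read')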
The following comment from kooshiwoosh to a similar question provides a nice answer:
from io import TextIOWrapper
from gzip import GzipFile
...
# get StreamingBody from botocore.response
response = s3.get_object(Bucket=bucket, Key=key)
# if gzipped
gzipped = GzipFile(None, 'rb', fileobj=response['Body'])
data = TextIOWrapper(gzipped)
for line in data:
    # process line
This will do the work:
bytes_to_read = 512
content = s3.Object(BUCKET_NAME, S3_KEY).get()['Body'].read(bytes_to_read)
This works for me:
json_object = s3.get_object(Bucket = bucket, Key = json_file_name)
json_file_reader = json_object['Body'].read()
content = json.loads(json_file_reader)
As of now, you can use the download_fileobj function. Here is an example for a CSV file:
import boto3
import csv
bucket = 'my_bucket'
file_key = 'my_key/file.csv'
output_file_path = 'output.csv'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket)
# Dump binary in append mode
with open(output_file_path, 'ab') as file_object:
    bucket.download_fileobj(
        Key=file_key,
        Fileobj=file_object,
    )

# Read your file as usual
with open(output_file_path, 'r') as csvfile:
    lines = csv.reader(csvfile)
    for line in lines:
        doWhatEver(line[0])
I am using boto to read a CSV file and parse its contents. This is the code I wrote:
import boto
from boto.s3.key import Key
import pandas as pd
import io
conn = boto.connect_s3(keyId, sKeyId)
bucket = conn.get_bucket(bucketName)
# Get the Key object of the given key, in the bucket
k = Key(bucket, srcFileName)
content = k.get_contents_as_string()
reader = pd.read_csv(io.StringIO(content))
for row in reader:
    print(row)
But I am getting an error at the read_csv line:
TypeError: initial_value must be str or None, not bytes
How can I resolve this error and parse the contents of the CSV file present on S3?
UPDATE: if I use BytesIO instead of StringIO then the print(row) line only prints the 1st row of the CSV. How do I loop over all of it?
This is my current code:
import boto3
s3 = boto3.resource('s3',aws_access_key_id = keyId, aws_secret_access_key = sKeyId)
obj = s3.Object(bucketName, srcFileName)
content = obj.get_contents_as_string()
reader = pd.read_csv(io.BytesIO(content), header=None)
count = 0
for index, row in reader.iterrows():
    print(row[1])
When I execute this I get an AttributeError: 's3.Object' object has no attribute 'get_contents_as_string' error.
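get_contents_as_string is a boto 2 Key method; a boto3 Object exposes its body through get() instead. Here is a sketch of the boto3 version, reusing the keyId/sKeyId, bucketName and srcFileName names from the question and iterating the DataFrame with iterrows():
import io

import boto3
import pandas as pd

s3 = boto3.resource('s3', aws_access_key_id=keyId, aws_secret_access_key=sKeyId)
obj = s3.Object(bucketName, srcFileName)

# read the object body as bytes and let pandas parse it
content = obj.get()['Body'].read()
reader = pd.read_csv(io.BytesIO(content), header=None)

for index, row in reader.iterrows():
    print(row[1])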