I'm having a problem with the BytesIO/StringIO classes from Python's io module. I retrieve a PDF file from an S3 bucket and convert it into a dataframe using a custom function convert_bytes_to_df. The first PDF converts to a CSV fine, but each subsequent CSV looks like it has been appended to the previous one. I have tried resetting the IO object with seek and truncate, but it doesn't seem to work. What am I doing wrong?
import json
import logging

import boto3
from io import BytesIO, StringIO

LOGGER = logging.getLogger(__name__)
logging.basicConfig(level=logging.ERROR)
logging.getLogger(__name__).setLevel(logging.DEBUG)

session = boto3.Session()
s3 = session.resource('s3')
src_bucket = s3.Bucket('input-bucket')
dest_bucket = s3.Bucket('output-bucket')
csv_buffer = StringIO()

def lambda_handler(event, context):
    msg = event['Records'][0]['Sns']['Message']
    pdf_files = json.loads(msg)['pdf_files']
    location = json.loads(msg)['location']
    total_files = len(pdf_files)
    LOGGER.info('Processing: {}'.format(json.dumps(pdf_files)))
    for pdf_file in pdf_files:
        file_name = pdf_file['key']
        obj = s3.Object(src_bucket.name, file_name)
        fs = BytesIO(obj.get()['Body'].read())
        df = convert_bytes_to_df(fs)
        df.to_csv(csv_buffer, index=False)
        s3.Object(dest_bucket.name, location + "/" + file_name.split('.')[0] + ".csv").put(Body=csv_buffer.getvalue())
        fs.seek(0)
        fs.truncate(0)
        LOGGER.info('Processed: {} in {}'.format(file_name, location))
    LOGGER.info('Converted {} files: {}'.format(total_files, json.dumps(pdf_files)))
    src_bucket.objects.all().delete()
    LOGGER.info('Deleted all files from {}'.format(src_bucket.name))
Move csv_buffer = StringIO() inside the for loop.
csv_buffer is initialized only once, so every call to df.to_csv() appends to the same buffer.
Initializing it inside the for loop gives you a fresh buffer for each element of the loop.
For example:
for pdf_file in pdf_files:
    csv_buffer = StringIO()
    file_name = pdf_file['key']
    obj = s3.Object(src_bucket.name, file_name)
    fs = BytesIO(obj.get()['Body'].read())
    df = convert_bytes_to_df(fs)
    df.to_csv(csv_buffer, index=False)
    s3.Object(dest_bucket.name, location + "/" + file_name.split('.')[0] + ".csv").put(Body=csv_buffer.getvalue())
    fs.seek(0)
    fs.truncate(0)
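If you would rather keep a single buffer, a minimal alternative sketch (reusing the names from the code above) is to clear csv_buffer itself at the top of each iteration; resetting fs alone never empties the CSV text already written:

for pdf_file in pdf_files:
    # clear any CSV text left over from the previous file
    csv_buffer.seek(0)
    csv_buffer.truncate(0)
    file_name = pdf_file['key']
    obj = s3.Object(src_bucket.name, file_name)
    fs = BytesIO(obj.get()['Body'].read())
    df = convert_bytes_to_df(fs)
    df.to_csv(csv_buffer, index=False)
    s3.Object(dest_bucket.name, location + "/" + file_name.split('.')[0] + ".csv").put(Body=csv_buffer.getvalue())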
Trying to achieve the functionality below:
Uploading multiple files to an S3 bucket.
Non-PDF files need to be converted to PDF and then merged into a single PDF file.
The folder structure is folder1/2/3/4; the files get uploaded under folder 4.
Below is my code (an AWS Lambda function), but the issue is that some of the files are merged before all of the files have been converted. The convert-to-PDF step has to complete successfully before the merging starts.
import os
import io
from io import BytesIO
import tarfile
import boto3
import subprocess
import brotli
from PyPDF2 import PdfMerger
from time import sleep

# Directory where the LibreOffice open source s/w will be saved (Lambda tmp directory)
LIBRE_OFFICE_INSTALL_DIR = '/tmp/instdir'

s3_bucket = boto3.resource("s3").Bucket("bucketname")

def load_libre_office():
    if os.path.exists(LIBRE_OFFICE_INSTALL_DIR) and os.path.isdir(LIBRE_OFFICE_INSTALL_DIR):
        print("Have a cached copy of LibreOffice, Skipping Extraction")
    else:
        print("No Cached copy of Libre Office exists, extracting tar stream from brotli file")
        buffer = BytesIO()
        with open('/opt/lo.tar.br', 'rb') as brotli_file:
            decompressor = brotli.Decompressor()
            while True:
                chunk = brotli_file.read(1024)
                buffer.write(decompressor.decompress(chunk))
                if len(chunk) < 1024:
                    break
        buffer.seek(0)
        print('Extracting tar stream to /tmp for caching')
        with tarfile.open(fileobj=buffer) as tar:
            print('opening tar file')
            tar.extractall('/tmp')
        print('LibreOffice caching done!')
    return f'{LIBRE_OFFICE_INSTALL_DIR}/program/soffice.bin'

def convert_word_to_pdf(soffice_path, word_file_path, output_dir):
    conv_cmd = f"{soffice_path} --headless --norestore --invisible --nodefault --nofirststartwizard --nolockcheck --nologo --convert-to pdf:writer_pdf_Export --outdir {output_dir} {word_file_path}"
    response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if response.returncode != 0:
        # Retry once before giving up
        response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if response.returncode != 0:
            return False
    return True

def download_from_s3(bucket, key, download_path):
    s3 = boto3.client('s3')
    s3.download_file(bucket, key, download_path)

def upload_to_s3(file_pathn, bucket, key):
    s3 = boto3.client('s3')
    s3.upload_file(file_pathn, bucket, key)

# Create an S3 client
s3 = boto3.client('s3')
bucket = 'bucketname'
prefix = 'media/files/'

def merge_pdfs(bucket, prefix):
    # Get a list of all subdirectories in the specified prefix
    result = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, Delimiter='/')
    subdirectories = [prefix + obj['Prefix'].split('/')[-2] + '/' for obj in result.get('CommonPrefixes', [])]
    # Loop through all subdirectories
    for subdirectory in subdirectories:
        # Get a list of all inner subdirectories in the subdirectory
        inner_result = s3.list_objects_v2(Bucket=bucket, Prefix=subdirectory, Delimiter='/')
        inner_subdirectories = [subdirectory + obj['Prefix'].split('/')[-2] + '/' for obj in inner_result.get('CommonPrefixes', [])]
        for inner_subdirectory in inner_subdirectories:
            # Get a list of all PDF object keys in the inner subdirectory
            obj_list = s3.list_objects_v2(Bucket=bucket, Prefix=inner_subdirectory)
            keys = [obj['Key'] for obj in obj_list['Contents']]
            # Create a PDF merger object
            pdf_merger = PdfMerger()
            newS3 = boto3.resource('s3')
            bucket1 = newS3.Bucket(bucket)
            # Check whether the merged file already exists
            obj = list(bucket1.objects.filter(Prefix=inner_subdirectory + 'newmerged.pdf'))
            if len(obj) > 0:
                print("Exists")
            else:
                print("Not Exists")
            # Loop through all PDF objects in the inner subdirectory
            print(len(keys))
            for key in keys:
                if key.endswith('.pdf'):
                    obj = s3.get_object(Bucket=bucket, Key=key)
                    pdf_content = obj['Body'].read()
                    pdf_merger.append(io.BytesIO(pdf_content))
            y = io.BytesIO()
            pdf_merger.write(y)
            y.seek(0)
            s3.put_object(Bucket=bucket, Key=inner_subdirectory + "newmerged.pdf", Body=y)

def lambda_handler(event, context):
    print(event)
    key = event['Records'][0]['s3']['object']['key']
    key_prefix, base_name = os.path.split(key)
    download_path = f"/tmp/{base_name}"
    output_dir = "/tmp"
    soffice_path = load_libre_office()
    if not key.endswith('.pdf'):
        download_from_s3(bucket, key, download_path)
        is_converted = convert_word_to_pdf(soffice_path, download_path, output_dir)
        print('isconverting')
        if is_converted:
            file_name, _ = os.path.splitext(base_name)
            upload_to_s3(f"{output_dir}/{file_name}.pdf", bucket, f"{key_prefix}/{file_name}.pdf")
            print('uploaded')
    #sleep(100)
    merge_pdfs(bucket, prefix)
I am using the Python 3 shell code below to read from an S3 bucket, extract data, and write it to a new file in the same bucket. But the write operation is not working and Medicaid_Provider_ID_.txt is populated with zero rows. Any clue?
import logging
import boto3

s3 = boto3.client("s3")
data = s3.get_object(Bucket='mmis.request.file', Key='MEIPASS_FISCAL_TRANS_ONE_RECORD.TXT')
file_lines = data['Body'].iter_lines()
next(file_lines)
new = []
id = 1
for line in file_lines:
    line_split = line.decode().split(',')
    MEDICAID_PROVIDER_ID = line_split[0]
    REASON_CODE = line_split[1]
    with open("Medicaid_Provider_ID_.txt", "w") as f:
        f.writelines(MEDICAID_PROVIDER_ID)
    f.close()
    id += 1
new = s3.put_object(Bucket='mmis.request.file', Key='Medicaid_Provider_ID_.txt')
This line of code recreates (and empties) your output file on every iteration of the loop:
with open("Medicaid_Provider_ID_.txt","w") as f:
You should open/create the file once, then iterate over all the rows in the file, then close the file when you are done. Like so:
import logging
import boto3

s3 = boto3.client("s3")
data = s3.get_object(Bucket='mmis.request.file', Key='MEIPASS_FISCAL_TRANS_ONE_RECORD.TXT')
file_lines = data['Body'].iter_lines()
next(file_lines)  # skip the header line
new = []
id = 1
# Open the file once
with open("Medicaid_Provider_ID_.txt", "w") as f:
    # Write each line of the file
    for line in file_lines:
        line_split = line.decode().split(',')
        MEDICAID_PROVIDER_ID = line_split[0]
        REASON_CODE = line_split[1]
        f.writelines(MEDICAID_PROVIDER_ID)
        id += 1
# Close the file (the with block does this automatically on exit)
new = s3.put_object(Bucket='mmis.request.file', Key='Medicaid_Provider_ID_.txt')
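One more note, as an aside to the original answer: put_object is called without a Body, so the object created in S3 is empty no matter what was written locally, which would explain the zero rows. A minimal sketch of uploading the file that was just written, reusing the same bucket and key:

# Upload the locally written file; without Body (or upload_file) put_object creates an empty object
with open("Medicaid_Provider_ID_.txt", "rb") as f:
    s3.put_object(Bucket='mmis.request.file', Key='Medicaid_Provider_ID_.txt', Body=f)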
I'm trying the following, but when I overwrite the file that invoked the Lambda, the function ends up triggering itself in a loop. Can anyone please help me? I have also pasted below the piece of code I am using for the Lambda.
Task
Read a file in a folder called 'Folder A' when it is uploaded to this folder
Then replace a particular column which has more than 10 characters
Then upload this file back to the same folder, but unfortunately it goes into a loop because of the Lambda invocation
I tried moving it to a different folder called TrimmedFile and then it works fine without any loops.
Can someone tell me how to read, edit, and save the file in the same folder that triggered the invocation?
import json
import urllib.parse
import boto3
import os
import csv

print('Loading function')
s3 = boto3.client('s3')

def lambda_handler(event, context):
    # Get the object from the event and show its content type
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    try:
        #print("CONTENT TYPE: " + key['ContentType'])
        #for record in event['Records']:
        print("file name " + key)
        #bucket = record['s3']['bucket']['name']
        #file_key = urllib.parse.unquote_plus(record['s3']['object']['key'], encoding='utf-8')
        file_key = key
        csvfile = s3.get_object(Bucket=bucket, Key=file_key)
        csvcontent = csvfile["Body"].read().decode("utf-8")
        file = csvcontent.split("\n")
        csv_reader = csv.reader(file)
        line_count = 0
        colindex = ''
        content = []
        contentstring = ''
        s33 = boto3.resource('s3')
        copy_source = {
            'Bucket': bucket,
            'Key': file_key
        }
        new_bucket = s33.Bucket(bucket)
        print(file_key)
        print(bucket)
        src_folder = "FolderA/"
        new_filekey = file_key.replace(src_folder, "")
        print(new_filekey)
        new_bucket.copy(copy_source, 'BKP/' + new_filekey)
        for row in csv_reader:
            if row:
                row = list(map(str.strip, row))
                if line_count == 0:
                    if 'ColToTruncate' in row:
                        colindex = row.index('ColToTruncate')
                        line_count += 1
                    else:
                        print('No ColToTruncate column found in ' + file_key)
                        return 'No ColToTruncate column found in ' + file_key
                else:
                    if len(row[colindex]) >= 10:
                        row[colindex] = row[colindex][0:2]
                    line_count += 1
                content.append(row)
                contentstring += ', '.join(row)
                contentstring = contentstring + '\n'
        #print(contentstring)
        #filename = file_key + '.csv'
        uploadByteStream = bytes(contentstring.encode('utf-8'))
        #new_key = 'TrimmedFiles/' + new_filekey
        s3.put_object(Bucket=bucket, Key=file_key, Body=uploadByteStream)
        return True
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise e
I believe you have created an event trigger on S3 and associated it with the Lambda, so when you replace the file the Lambda is triggered again and it becomes a loop.
There are two ways to handle it:
1. Configure a PUT or POST event type (whichever suits your case) to trigger the Lambda. Then save the updated file at another location and copy it to the original one. Doing this, S3 will generate an "s3:ObjectCreated:Copy" event, which will not invoke the Lambda again.
# Copying file from secondary location to original location
copy_sr = {
    "Bucket": bucket,
    "Key": file_key_copy
}
s3_resource.meta.client.copy(copy_sr, final_bucket, file_key_copy)

# Deleting the file from the secondary location
s3_client.delete_object(Bucket=bucket, Key=file_key_copy)
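For the event-type restriction in option 1, here is a hedged sketch of what the notification configuration could look like via boto3 (the bucket name, Lambda ARN, and prefix below are placeholders; the same settings can be applied from the S3 console, and note that this call replaces the bucket's existing notification configuration):

import boto3

s3_client = boto3.client('s3')
# Only Put/Post uploads will trigger the function, so the later CopyObject
# (s3:ObjectCreated:Copy) does not re-invoke it
s3_client.put_bucket_notification_configuration(
    Bucket='my-bucket',
    NotificationConfiguration={
        'LambdaFunctionConfigurations': [
            {
                'LambdaFunctionArn': 'arn:aws:lambda:eu-west-1:123456789012:function:my-func',
                'Events': ['s3:ObjectCreated:Put', 's3:ObjectCreated:Post'],
                'Filter': {
                    'Key': {
                        'FilterRules': [
                            {'Name': 'prefix', 'Value': 'FolderA/'}
                        ]
                    }
                }
            }
        ]
    }
)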
2. Use an SQS queue and configure it not to process any message received twice within a specified period of time (depending on how frequently the file gets updated).
The following demonstrates how to read a file and replace it after editing. It can act as skeleton code.
import boto3
import base64
import json
import io

client = boto3.client('s3')
res = boto3.resource('s3')

def lambda_handler(event, context):
    file_key = event['file_key']
    file_obj = res.Object("bucket_name", file_key)
    content_obj = file_obj.get()['Body'].read().decode('utf-8')  # fetching the data
    res.Object("bucket_name", file_key).delete()  # here you are deleting the old file

    ###### Perform your operation and save the result in the new_data variable ######
    new_file = io.BytesIO(new_data.encode())
    client.upload_fileobj(new_file, "bucket_name", file_key)  # uploading the file at the exact same location
I have a script which pulls data in from a CSV file, does some manipulation to it, and creates an output Excel file. But it's a tedious process as I need to do it for multiple files.
Question: Is there a way for me to run this script across multiple CSV files together and create a separate Excel output file for each input file?
I'm not sure what to try here. I've read that I need to use a module called glob, but I'm not sure how to go about it.
This script works for a single file:
# Import libraries
import pandas as pd
import xlsxwriter
# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
INPUT_FILE = 'rawData.csv'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'
OUTPUT_FILE = 'rawDataOutput.xlsx'
# Get data
df = pd.read_csv(INPUT_PATH + INPUT_FILE)
# Clean data
cleanedData = df[['State','Campaigns','Type','Start date','Impressions','Clicks','Spend(INR)',
'Orders','Sales(INR)','NTB orders','NTB sales']]
cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
ascending= False).reset_index()
cleanedData.loc['Total'] = cleanedData.select_dtypes(pd.np.number).sum()
cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
cleanedData['Impressions']).astype(float).map("{:.2%}".format)
cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
cleanedData['Orders']).astype(float).map("{:.2%}".format)
cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData = cleanedData[['State','Campaigns','Type','Start date','Impressions','Clicks','CTR(%)',
'Spend(INR)','CPC(INR)','Orders','Sales(INR)','ACOS(%)',
'NTB orders','% of orders NTB','NTB sales','% of sales NTB']]
# Create summary
summaryData = cleanedData.groupby(['Type'])[['Spend(INR)','Sales(INR)']].agg('sum')
summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(pd.np.number).sum()
summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']
# Push to excel
writer = pd.ExcelWriter(OUTPUT_PATH + OUTPUT_FILE, engine='xlsxwriter')
summaryData.to_excel(writer, sheet_name='Summary')
cleanedData.to_excel(writer, sheet_name='Overall Report')
writer.save()
I've never tried anything like this before and I would appreciate your help in figuring this out.
You can use Python's glob.glob() to get all of the CSV files from a given folder. For each filename that is returned, you could derive a suitable output filename. The file processing could be moved into a function as follows:
# Import libraries
import pandas as pd
import xlsxwriter
import glob
import os

def process_csv(input_filename, output_filename):
    # Get data
    df = pd.read_csv(input_filename)

    # Clean data
    cleanedData = df[['State','Campaigns','Type','Start date','Impressions','Clicks','Spend(INR)',
                      'Orders','Sales(INR)','NTB orders','NTB sales']]
    cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
                      ascending=False).reset_index()
    cleanedData.loc['Total'] = cleanedData.select_dtypes(pd.np.number).sum()
    cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
                      cleanedData['Impressions']).astype(float).map("{:.2%}".format)
    cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
    cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
                      cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
                      cleanedData['Orders']).astype(float).map("{:.2%}".format)
    cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
                      cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    cleanedData = cleanedData[['State','Campaigns','Type','Start date','Impressions','Clicks','CTR(%)',
                      'Spend(INR)','CPC(INR)','Orders','Sales(INR)','ACOS(%)',
                      'NTB orders','% of orders NTB','NTB sales','% of sales NTB']]

    # Create summary
    summaryData = cleanedData.groupby(['Type'])[['Spend(INR)','Sales(INR)']].agg('sum')
    summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(pd.np.number).sum()
    summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']

    # Push to excel
    writer = pd.ExcelWriter(output_filename, engine='xlsxwriter')
    summaryData.to_excel(writer, sheet_name='Summary')
    cleanedData.to_excel(writer, sheet_name='Overall Report')
    writer.save()

# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'

for csv_filename in glob.glob(os.path.join(INPUT_PATH, "*.csv")):
    name, ext = os.path.splitext(os.path.basename(csv_filename))
    # Create an output filename based on the input filename
    output_filename = os.path.join(OUTPUT_PATH, f"{name}Output.xlsx")
    process_csv(csv_filename, output_filename)
os.path.join() can be used as a safer way to join file paths together.
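For instance (the paths here are just illustrative):

import os

# Manual concatenation relies on the separators already being in the strings
print('SystemPath//Downloads//Output//' + 'rawDataOutput.xlsx')

# os.path.join inserts the platform's separator where one is needed
print(os.path.join('SystemPath', 'Downloads', 'Output', 'rawDataOutput.xlsx'))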
Something like:
import os
import glob
import pandas as pd

os.chdir(r'path\to\folder')  # changes folder path to working dir
filelist = glob.glob('*.csv')  # creates a list of all csv files
for file in filelist:  # loops through the files
    df = pd.read_csv(file, ...)
    # Do something and create a final_df
    final_df.to_excel(file[:-4] + '_output.xlsx', index=False)  # excel with same name + _output
You can run this script inside a for loop:
for file in os.listdir(INPUT_PATH):
    if file.endswith('.csv') or file.endswith('.CSV'):
        INPUT_FILE = INPUT_PATH + '/' + file
        OUTPUT_FILE = INPUT_PATH + '/Outputs/' + file[:-4] + '.xlsx'
Try this:
import glob

files = glob.glob(INPUT_PATH + "*.csv")
for file in files:
    # Get data
    df = pd.read_csv(file)

    # Clean data
    # ...your cleaning code...

    # Push to excel
    writer = pd.ExcelWriter(OUTPUT_PATH + file.split("/")[-1].replace(".csv", "_OUTPUT.xlsx"),
                            engine='xlsxwriter')
I'm trying to read a gzip file from S3 - the "native" format of the file is a csv. Ultimately, after uncompressing the file, I'd like to be able to "see" the content so I can read the number of lines in the csv and keep count of it.
My "basic" attempts are here - still just trying to print the contents of the file. This attempt just tells me that there is no such file or directory...
I know I'm also probably erroneously thinking the unzipped csv file will be in json format - but that's the next "issue" once I get to read the unzipped contents...
[Errno 2] No such file or directory: 'SMSUsageReports/eu-west-1/2018/01/02/001.csv.gz'
import gzip
import boto3
import json

s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
bucket = s3.Bucket('snssmsreports')

for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    json_object = s3_client.get_object(Bucket=bucket.name, Key=obj.key)
    file_name = obj.key
    obj = bucket.Object(file_name)
    file_body = obj.get()["Body"].read()
    # gzip stuff here
    f = gzip.open(file_name, 'rb')
    file_content = f.read()
    #print file_content
    #jsonFileReader = json_object['Body'].read()
    jsonDict = json.loads(file_content)
    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
    print('{0}:{1}'.format(bucket.name, obj.key))
    print(jsonDict)
OK, so I updated my code as follows:
import zipfile
import gzip
import boto3
import io
import json
import pandas as pd

s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
bucket = s3.Bucket('snssmsreports')

for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    json_object = s3_client.get_object(Bucket=bucket.name, Key=obj.key)
    file_name = obj.key
    obj = bucket.Object(file_name)
    s3_client.download_file(bucket.name, file_name, '../../tmp/file.gz')
    gzip_name = '../../tmp/file.gz'
    # gzip stuff here
    with gzip.open(gzip_name, 'rb') as f:
        file_content = f.read()
    str_file = str(file_content)
    csvfile = open('../../tmp/testfile.csv', 'w')
    csvfile.write(str_file)
    csvfile.close()
    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
    # pandas csv reader
    df1 = pd.read_csv('../../tmp/testfile.csv')
    print(df1)
    #print('{0}:{1}'.format(bucket.name, obj.key))
    #print(file_content)
    #table = dynamodb.Table('SNS')
    #table.put_item(Item=jsonDict)
This does not throw any errors anymore, but the output only has one row and 135 columns, so pandas is not happy with the actual content of the csv - or is my conversion to str() not the right way to do it?
OK, the issue was how I opened the file for writing - to write bytes I had to open the file as 'wb'...
csvfile = open('../../tmp/testfile.csv','wb')
csvfile.write(file_content)
csvfile.close()
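For what it's worth, here is a minimal sketch (assuming the same bucket and prefix as above) that avoids the temporary files entirely: pandas can decompress and parse the gzipped CSV straight from the object body, which also gives the row count you were after.

import io
import boto3
import pandas as pd

s3 = boto3.resource('s3')
bucket = s3.Bucket('snssmsreports')

for obj in bucket.objects.filter(Prefix='SMSUsageReports/eu-west-1/2018/01/02'):
    body = obj.get()['Body'].read()
    # read_csv can gunzip the bytes itself when compression='gzip'
    df = pd.read_csv(io.BytesIO(body), compression='gzip')
    print('{0}: {1} rows'.format(obj.key, len(df)))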