Python Pandas DataFrame adds index column when appending - python

I am looping through CSV files and appending them to a DataFrame table, but it seems that every time I loop and append, an index column is added to the table. This is very confusing and I am very stuck; any help would be great.
My code:
import sqlite3 as sql
import pandas as pd
import hashlib
import os
import csv
from pandas import ExcelWriter
def obtain_data(filename, connect, type):
    writer = ExcelWriter('path\\new_excel_sheets\\' + filename + '.xlsx')
    table = ExcelWriter('path\\new_excel_sheets\\hash_table.xlsx')
    if type == True:
        print(filename)
        df = pd.DataFrame.from_csv('path' + filename, index_col=None)
    else:
        workbook = pd.ExcelFile('path' + filename)
        df = workbook.parse('Sheet1')
    df = df.rename(columns={'INDEX': 'INDX'})
    df = df.rename(columns={'Index': 'INDXS'})
    headers = df.dtypes.index
    header_list = str(headers.tolist())
    header_list = ''.join(header_list)
    hash_t = str(hashlib.md5(header_list.encode('utf-8')).hexdigest())
    c = connect.cursor()
    print(filename)
    print(hash_t)
    if hash_t == 'd22db04a2f009f222da57e91acdce21b':
        next_open = df['DATE'][1]
        next_open_value = df['DATE'][2]
        df.insert(3, next_open, next_open_value)
        headers = df.dtypes.index
        header_list = str(headers.tolist())
        header_list = ''.join(header_list)
        new_hash_t = str(hashlib.md5(header_list.encode('utf-8')).hexdigest())
        df = df.drop(df.index[1:])
        hashing = {str(new_hash_t): str(filename)}
        df2 = pd.DataFrame.from_dict(hashing, orient='index')
        try:
            df2.to_sql(name='Hash Table', con=connect, if_exists='append')
            df.to_sql(name=new_hash_t, con=connect, if_exists='append')
        except:
            raise IndexError('Could not transform ' + str(filename) + ' into database.')
    elif hash_t == '484fbe4de83acb41480dd935d82d7fbe':
        next_open = df['DATE'][1]
        next_open_value = df['DATE'][2]
        df.insert(3, next_open, next_open_value)
        headers = df.dtypes.index
        header_list = str(headers.tolist())
        header_list = ''.join(header_list)
        new_hash_t = str(hashlib.md5(header_list.encode('utf-8')).hexdigest())
        df = df.drop(df.index[2])
        df['DATE'][1] = df['DATE'][0]
        hashing = {new_hash_t: filename}
        df2 = pd.DataFrame.from_dict(hashing, orient='index')
        try:
            df2.to_sql(name='Hash Table', con=connect, if_exists='append')
            df.to_sql(name=new_hash_t, con=connect, if_exists='append')
        except:
            raise IndexError('Could not transform ' + str(filename) + ' into database.')
    else:
        hashing = {hash_t: filename}
        df2 = pd.DataFrame.from_dict(hashing, orient='index')
        try:
            df2.to_sql(name='Hash Table', con=connect, if_exists='append', index=False)
            df.to_sql(name=hash_t, con=connect, if_exists='append', index=True)
        except:
            raise IndexError('Could not transform ' + str(filename) + ' into database.')
    df.to_excel(writer)
    print(filename + ' has been completed succesfully.')
    final_results = {'df': df, 'hash_t': hash_t}
    return final_results

csv_files = []
usable_files = []
for filename in os.listdir(filepath):
    if filename.endswith(".xlsx"):
        print('Found an XLSX file ' + str(filename))
        usable_files.append(filename)
    elif filename.endswith('.CSV'):
        print('Found a CSV File ' + filename)
        csv_files.append(filename)
    else:
        print('Found an unusable file ' + str(filename))

for file in usable_files:
    connect = sql.connect(SQLite3 connection)
    obtain_data(file, connect, False)

for file in csv_files:
    connect = sql.connect(SQLite3 connection)
    obtain_data(file, connect, True)

print('All files have been made into Tables')
The SQLite3 database does everything right, but when I append to it, it adds an index column. I am not sure how to handle index columns here (feel free to teach me), so bear with me. The table goes from looking like this:
rowid, 0, 1, 2, etc.
0, value, value, value, etc.
1, value, value, value, etc.
but when I loop through (say, 4 times), it changes to this:
rowid, index, 0, 1, 2, etc.
0, 0, 0, 0, 0, value
0, 0, 0, 0, 0, value
This is a very weird problem, so any help would be appreciated, thanks!

Simply set the index parameter to False in all to_sql() calls (by default it is set to True):
df2.to_sql(name='Hash Table', con=connect, if_exists='append', index=False)
And do the same for any flat-file outputs:
df.to_excel(writer, index=False)
df.to_csv(filename, index=False)
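For reference, here is a minimal, self-contained sketch (using an in-memory SQLite database, just to illustrate the behaviour) showing that to_sql() writes the DataFrame index as an extra column by default, which is exactly the column that keeps reappearing on each append, and that index=False prevents it:
import sqlite3
import pandas as pd

con = sqlite3.connect(':memory:')
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})

# Default behaviour: the DataFrame index becomes its own column in the table
df.to_sql(name='with_index', con=con, if_exists='append')
# With index=False the table only contains the DataFrame's own columns
df.to_sql(name='without_index', con=con, if_exists='append', index=False)

print(pd.read_sql('SELECT * FROM with_index', con).columns.tolist())     # ['index', 'a', 'b']
print(pd.read_sql('SELECT * FROM without_index', con).columns.tolist())  # ['a', 'b']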

Related

How to improve the performance of a Python script extracting a big (3-4 GB) Oracle table

I'm connecting to an Oracle database using a Python script and extracting around 10 tables. One table has 3 GB of data and took around 4 hours to extract and upload to S3 with the code below. How can I improve the performance of this script?
Would a different file format than CSV, such as Parquet, improve the performance?
Any suggestions or solutions will be highly appreciated.
Below is the code I tried:
def extract_handler():
    # Parameters defined in cloudwatch event
    env = os.environ['Environment'] if 'Environment' in os.environ else 'sit'

    # FTP parameters
    host = f"/{env}/connet_HOSTNAME"
    username = f"/{env}/connect_USERNAME"
    password = f"/{env}/connect_PASSWORD"
    host = get_parameters(host)
    username = get_parameters(username)
    password = get_parameters(password)

    today = date.today()
    current_date = today.strftime("%Y%m%d")

    con = None
    cur = None
    tables = ["table1", "table2", "table3"........."table10"]
    bucket = "bucket_name"

    for table in tables:
        try:
            con = cx_Oracle.connect(username, password, host, encoding="UTF-8")
            cur = con.cursor()
            logging.info('Successfully established the connection to Oracle db')

            table_name = table.split(".")[1]
            logging.info("######## Table name:" + table + " ###### ")
            logging.info("****** PROCESSING:" + table_name + " *********")

            cur.execute("SELECT count(*) FROM {}".format(table))
            count = cur.fetchone()[0]
            logging.info("Count:", count)

            if count > 0:
                cur1 = con.cursor()
                # Define the desired timestamp format
                timestamp_format = '%Y/%m/%d %H:%M:%S'
                # Execute a query to read a table
                cur1.execute("select * from {} where TRUNC(DWH_CREATED_ON)=TRUNC(SYSDATE)-1".format(table))
                batch_size = 10000
                rows = cur1.fetchmany(batch_size)

                csv_file = f"/tmp/{table_name}.csv"
                with open(csv_file, "w", newline="") as f:
                    # Add file_date column as the first column
                    writer = csv.DictWriter(f, fieldnames=['file_date'] + [col[0] for col in cur1.description],
                                            delimiter='\t')
                    writer.writeheader()
                logging.info("Header added to the table:" + table + "######")

                while rows:
                    for row in rows:
                        row_dict = {'file_date': current_date}
                        for i, col in enumerate(cur1.description):
                            if col[1] == cx_Oracle.DATETIME:
                                if row[i] is not None:
                                    row_dict[col[0]] = row[i].strftime(timestamp_format)
                                else:
                                    row_dict[col[0]] = ""
                            else:
                                row_dict[col[0]] = row[i]

                        with open(csv_file, "a", newline="") as f:
                            # Add file_date column as the first column
                            writer = csv.DictWriter(f, fieldnames=['file_date'] + [col[0] for col in cur1.description],
                                                    delimiter='\t')
                            writer.writerow(row_dict)

                    # Fetch the next batch of rows
                    rows = cur1.fetchmany(batch_size)

                logging.info("Records written to the temp file for the table :" + table + "######")

                s3_path = "NorthernRegion" + '/' + table_name + '/' + current_date + '/' + table_name + '.csv'
                s3_client = boto3.client('s3', region_name='region-central-1')
                s3_client.upload_file('/tmp/' + table_name + '.csv', bucket, s3_path)
                logging.info(table + "File uploaded to S3 ######")
            else:
                logging.info('Table not having data')
                return 'Data is not refreshed yet, Hence quitting..'

            if cur1:
                cur1.close()
        except Exception as err:
            # Handle or log other exceptions such as bucket doesn't exist
            logging.error(err)
        finally:
            if cur:
                cur.close()
            if con:
                con.close()

    return "Successfully processed"

Read different JSON files in sequence

I have different JSON files in my local directory and I read all of them with this code
path_to_json = 'C:/Users/../Desktop/NewData'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

def func(s):
    try:
        return eval(s)
    except:
        return dict()

list_of_df = []
for i in range(len(json_files)):
    try:
        file_name = json_files[i]
        df = pd.read_json(file_name, lines=True)
        df = df[['something']]
        df = df['something'].apply(func)
        df = pd.json_normalize(df)
        df = pd.DataFrame(df[["something", "something1"]])
        df['Index'] = 'weather5' + str(6+i)
    except:
        pass
    list_of_df.append(df)

df = pd.concat(list_of_df)
df = df[['Index', 'something', 'something1']]
df.head()
The names of the JSON files I read are weather56, weather57, weather58, weather59, weather60, and weather61.
I am using the line df['Index'] = 'weather5' + str(6+i) to label each file and fit it into the dataframe. However, it seems I do not label them properly, as they now appear in the dataframe as:
Index
weather56
weather57
weather58
weather59
weather510
weather511
How can I adjust the line df['Index'] = 'weather5' + str(6+i) so the labels match the JSON file names?
df['Index'] = 'weather5' + str(6+i)
As i goes from 0 to 5, the corresponding values generated are going to be
weather56   # '5' + str(6 + 0)
weather57
weather58
weather59
weather510  # '5' + str(6 + 4) := '5' + '10'
weather511
If you change the line to
df['Index'] = 'weather' + str(56+i)
it should appear as -
weather56
weather57
weather58
weather59
weather60
weather61
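As a side note (an alternative sketch, not required by the fix above): if the goal is simply to tag each dataframe with the file it came from, the label can be taken from the filename itself, which avoids the arithmetic entirely. A minimal sketch, assuming json_files holds names like 'weather56.json':
import os

for file_name in json_files:
    df = pd.read_json(file_name, lines=True)
    # Use the file's own base name (e.g. 'weather56') as the label
    df['Index'] = os.path.splitext(os.path.basename(file_name))[0]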

looping through a list of dataframes, writing each element of that list to a new .csv file on disk

I have a list of dataframes and am attempting to export each one to a folder on disk using the pandas DataFrame.to_csv method. However, only the last item in the list of dataframes is being written to disk as a .csv.
Please see code below:
import pandas as pd
import os
import datetime
from pathlib import Path

CSV_Folder = Path('C:\PA_Boundaries\Tests')
Output = r'C:/PA_Boundaries/test_output'

today = datetime.date.today()
date = today.strftime('%Y%m%d')

try:
    dfs = []
    for file in os.listdir(CSV_Folder):
        df = pd.read_csv(CSV_Folder / file)
        dfs.append(df)

    new_dfs = []
    for df in dfs:
        new_df = pd.DataFrame()
        new_df['Original Addr string'] = df['StreetConc']
        new_df['Addr #'] = df['AddNum']
        new_df['Prefix'] = df['StPreDir']
        new_df['Street Name'] = df['StName']
        new_df['StreetType'] = df['StType']
        new_df['Suffix'] = df['StDir']
        new_df['Multi-Unit'] = ''
        new_df['City'] = df['City']
        new_df['Zip Code'] = df['PostCode']
        new_df['4'] = df['PostalExt']
        new_df['County'] = df['CountyID']
        new_df['Addr Type'] = ''
        new_df['Precint Part Name'] = ''
        new_df['Lat'] = df['X']
        new_df['Long'] = df['Y']

        replaced_address_names = []
        for index, row in new_df.iterrows():
            new_row = row['Original Addr string'].replace(',', ' ')
            replaced_address_names.append(new_row)
        new_df['Original Addr string'] = replaced_address_names

        county_id = df.iloc[0, 37]
        new_dfs.append(new_df)

    for i in range(len(new_dfs)):
        new_dfs[i].to_csv(f'{Output}\ADDR_{county_id}_{date}.csv', index=False)

except FileNotFoundError:
    print(f'{file} not found in {CSV_Folder}')
except PermissionError:
    print('Check syntax of paths')
else:
    print('Process Complete')
new_dfs contains the correct number of dataframes. However, when looping through the new list of dataframes and calling .to_csv on each item in the list, only the last item in the list is written to the disk.
The problem lies in the way in which you name your exported file.
After running through the loop, county_id will be equal to the last county_id, or the county_id of the last iterated df.
Since the name of your exported dataframe is {Output}\ADDR_{county_id}_{date}.csv, all the exported files are named with the same county_id and date; in other words, they keep overwriting each other.
To avoid this, you can create a new list called county_ids and then use the last loop to change the name of the saved file. This would be your resulting code:
import pandas as pd
import os
import datetime
from pathlib import Path

CSV_Folder = Path('C:\PA_Boundaries\Tests')
Output = r'C:/PA_Boundaries/test_output'

today = datetime.date.today()
date = today.strftime('%Y%m%d')

try:
    dfs = []
    for file in os.listdir(CSV_Folder):
        df = pd.read_csv(CSV_Folder / file)
        dfs.append(df)

    new_dfs, county_ids = [], []
    for df in dfs:
        new_df = pd.DataFrame()
        new_df['Original Addr string'] = df['StreetConc']
        new_df['Addr #'] = df['AddNum']
        new_df['Prefix'] = df['StPreDir']
        new_df['Street Name'] = df['StName']
        new_df['StreetType'] = df['StType']
        new_df['Suffix'] = df['StDir']
        new_df['Multi-Unit'] = ''
        new_df['City'] = df['City']
        new_df['Zip Code'] = df['PostCode']
        new_df['4'] = df['PostalExt']
        new_df['County'] = df['CountyID']
        new_df['Addr Type'] = ''
        new_df['Precint Part Name'] = ''
        new_df['Lat'] = df['X']
        new_df['Long'] = df['Y']

        replaced_address_names = []
        for index, row in new_df.iterrows():
            new_row = row['Original Addr string'].replace(',', ' ')
            replaced_address_names.append(new_row)
        new_df['Original Addr string'] = replaced_address_names

        county_ids.append(df.iloc[0, 37])
        new_dfs.append(new_df)

    # Use the county_id collected for each dataframe so every file gets a unique name
    for i in range(len(new_dfs)):
        new_dfs[i].to_csv(f'{Output}\ADDR_{county_ids[i]}_{date}.csv', index=False)

except FileNotFoundError:
    print(f'{file} not found in {CSV_Folder}')
except PermissionError:
    print('Check syntax of paths')
else:
    print('Process Complete')
Obviously I cannot test this - if you do run it, there may be lines that need tweaking. However, I'd write the code something like the below. Basically I'd call a function to do the replacement as I'm opening each file, and write out immediately.
If you can get it working, it will probably be faster and read slightly better, as there are fewer lines.
Example:
import pandas as pd
import os
import datetime
from pathlib import Path

CSV_Folder = Path(r'C:/PA_Boundaries/Tests')
Output = r'C:/PA_Boundaries/test_output/'

today = datetime.date.today()
date = today.strftime('%Y%m%d')

def updateFrame(f):
    new_df = pd.DataFrame()
    new_df['Original Addr string'] = f['StreetConc']
    new_df['Addr #'] = f['AddNum']
    new_df['Prefix'] = f['StPreDir']
    new_df['Street Name'] = f['StName']
    new_df['StreetType'] = f['StType']
    new_df['Suffix'] = f['StDir']
    new_df['Multi-Unit'] = ''
    new_df['City'] = f['City']
    new_df['Zip Code'] = f['PostCode']
    new_df['4'] = f['PostalExt']
    new_df['County'] = f['CountyID']
    new_df['Addr Type'] = ''
    new_df['Precint Part Name'] = ''
    new_df['Lat'] = f['X']
    new_df['Long'] = f['Y']
    # better way to replace without looping the rows...
    new_df['Original Addr string'] = new_df['Original Addr string'].str.replace(',', ' ')
    return new_df

for file in os.listdir(CSV_Folder):
    working_file = str(CSV_Folder) + '/' + file
    if working_file.endswith('.csv'):
        try:
            df = pd.read_csv(working_file)
            county_id = str(df.iloc[0, 37])
            # the function returns a frame so you can treat it as such...
            updateFrame(df).to_csv(f'{Output}ADDR_{county_id}_{date}.csv', index=False)
        except FileNotFoundError:
            print(f'{file} not found in {CSV_Folder}')
        except PermissionError:
            print('Check syntax of paths')
        else:
            print('Process Complete')

Rename current file with variable value

I am simply trying to get the value of column 'ID' where two conditions meet (i.e. where Name and Zip are on the same row); then, with that row number, I get the column 'ID' value to rename the file with. I don't think I'm writing the '.rename' correctly, for one.. any pointers appreciated. Currently there is no error, but no output either.
import PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileReader
import re
import config
import xlrd
import numpy as np
import pandas as pd
import math
import os

for filename in os.listdir(config.Total):
    if filename.endswith(".pdf"):
        First_Name, Last_Name, Zip = filename.replace(".pdf", '').split()
        Name = First_Name + " " + Last_Name
        print(Name)
        print(Zip)

        data1 = pd.read_excel(config.Excel1)
        data2 = pd.read_excel(config.Excel2)

        df = pd.DataFrame(data1)
        header = df.iloc[0]
        df2 = pd.DataFrame(data2)
        header2 = df2.iloc[0]

        df = df[1:]
        df.rename(columns=header)
        df2 = df2[1:]
        df2.rename(columns=header2)

        row_numberd1 = df[df['Member Name'].str.contains(Name)].index.min()
        row_numberd12 = df[df['Member Address Line 3'].str.contains(Zip)].index.min()
        if row_numberd1 == row_numberd12:  # When rows match of NameUp and Zip var in DF1
            rowMatched = row_numberd1
            print("Match Found")
            print(rowMatched)
            MemberID = df['ID'][rowMatched]
            MemberI = str(MemberID)
            os.rename(config.ID + "/" + MemberI)

        row_numberd2 = df2[df2['Member Name'].str.contains(Name)].index.min()
        row_numberd22 = df2[df2['Member Address Line 3'].str.contains(Zip)].index.min()
        if row_numberd2 == row_numberd22:  # When rows match of NameUp and Zip var in DF2
            rowMatched2 = row_numberd2
            print("No Match Found")
            print(rowMatched2)
            MemberID = df2['ID'][rowMatched2]
            MemberI = str(MemberID)
            os.rename(config.ID + "/" + MemberI)
Maybe?
os.rename(config.Total, + MemberI, config.ID)
Since the code runs without an error, it must never get into the body of the if statement, and hence never renames. Maybe there is no match! You can find out with an explicit rename (note that combining two boolean Series requires & rather than and):
row = df[df['Member Name'].str.contains(Name) & df['Member Address Line 3'].str.contains(Zip)].index.min()
os.rename("old_filename", str(df['ID'][row]))

how to import state of pandas dataframe to second .py file

So, toward the end of my first file, which we'll call file.py:
def get_excel_data(self):
    """Places excel data into pandas dataframe"""
    # excel_data = pandas.read_excel(self.find_file())
    for extracted_archive in self.find_file():
        excel_data = pandas.read_excel(extracted_archive)
        # print(excel_data)
        columns = pandas.DataFrame(columns=excel_data.columns.tolist())
        excel_data = pandas.concat([excel_data, columns])
        excel_data.columns = excel_data.columns.str.strip()
        excel_data.columns = excel_data.columns.str.replace("/", "_")
        excel_data.columns = excel_data.columns.str.replace(" ", "_")

        total_records = 0
        num_valid_records = 0
        num_invalid_records = 0
        for row in excel_data.itertuples():
            mrn = row.MRN
            total_records += 1
            if mrn in ("", " ", "N/A", "NaT", "NaN", None) or math.isnan(mrn):
                # print(f"Invalid record: {row}")
                num_invalid_records += 1
                # total_invalid = num_invalid_records + dup_count
                excel_data = excel_data.drop(excel_data.index[row.Index])
                # continue
            else:
                # print(mrn)  # outputs all MRN ids
                for row in excel_data.itertuples():
                    num_valid_records += 1
                    continue

        with open("./logs/metrics.csv", "a", newline="\n") as f:
            csv_writer = DictWriter(f, ['date', 'total_records', 'processed', 'skipped', 'success_rate'])
            # csv_writer.writeheader()
            currentDT = datetime.datetime.now()
            success_rate = num_valid_records / total_records * 100
            csv_writer.writerow(dict(date=currentDT,
                                     total_records=total_records,
                                     processed=num_valid_records,
                                     skipped=num_invalid_records,
                                     success_rate=num_valid_records / total_records * 100))

    return self.clean_data_frame(excel_data)

def clean_data_frame(self, data_frame):
    """Cleans up dataframes"""
    for col in data_frame.columns:
        if "date" in col.lower():
            data_frame[col] = pandas.to_datetime(data_frame[col],
                                                 errors='coerce', infer_datetime_format=True)
            data_frame[col] = data_frame[col].dt.date
    data_frame['MRN'] = data_frame['MRN'].astype(int).astype(str)
    return data_frame

def get_mapping_data(self):
    map_data = pandas.read_excel(config.MAPPING_DOC, sheet_name='main')
    columns = pandas.DataFrame(columns=map_data.columns.tolist())
    return pandas.concat([map_data, columns])
In my second file, second_file.py, I would like to keep that end state and do another iteration, for instance:
def process_records(self, records, map_data, completed=None, errors=None):
    """Code to execute after webdriver initialization."""
    series_not_null = False
    try:
        num_attempt = 0
        for record in data_frame.itertuples():  # not working
            print(record)
            series_not_null = True
            mrn = record.MRN
            self.navigate_to_search(num_attempt)
            self.navigate_to_member(mrn)
            self.navigate_to_assessment()
            self.add_assessment(record, map_data)
            self.driver.switch_to.parent_frame()  # not working
            sleep(.5)
            error_flag = self.close_member_tab(self.driver, mrn, error_flag)
    except Exception as exc:
        if series_not_null:
            errors = self.process_series_error(exc)
    return completed, error
Both files have import pandas.
You can save your dataframe in a pickle file like this. It is also worth noting that you can store almost anything in a pickle file. Here is a link to some info: pickle info
import pandas as pd
import pickle

x = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})

# this will create a file called pickledata.p that will store the data frame
with open('pickledata.p', 'wb') as fh:  # notice that you need the 'wb' for the dump
    pickle.dump(x, fh)

# to load the file do this
with open('pickledata.p', 'rb') as fh:  # you need to use 'rb' to read
    df = pickle.load(fh)

# you can now use df like a normal dataframe
print(df)
You don't actually need the '.p' extension for a pickle file, I just like it.
So you save your dataframe at the end of script one, and then load it in at the start of script two.
Use DataFrame.to_pickle and pandas.read_pickle:
To persist
df.to_pickle('./dataframe.pkl')
To load
df = pd.read_pickle('./dataframe.pkl')
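Tying this back to the question (the integration points below are assumptions about where it would go, not something stated in the answers): persist the cleaned frame at the end of file.py and load it at the start of second_file.py before iterating:
# end of file.py (hypothetical integration point)
cleaned = self.clean_data_frame(excel_data)
cleaned.to_pickle('./dataframe.pkl')

# start of second_file.py (hypothetical integration point)
import pandas as pd
data_frame = pd.read_pickle('./dataframe.pkl')
for record in data_frame.itertuples():
    ...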
