How to insert a data frame as an object attribute - python

This is most likely a pretty basic question, but I am still learning about classes/objects/constructors/etc. and I am trying to apply some of these concepts to my current workflow.
I am trying to create a class that automatically saves my data frame as a CSV or xlsx file, depending on what I specify, to a given folder. However, I don't believe that I am correctly passing my data frame as an object attribute. This is my code as it stands:
award_date_change = merged_df.loc[merged_df['award_date_change'] == 'yes'] #this is my data frame

class uploading_to_GC:
    def __init__(self, file_name, file_type, already_exists): #constructor where I want to pass my data frame, the file type to save to, and whether the file already exists in my folder
        self.file_name = file_name
        self.file_type = file_type
        self.already_exists = already_exists

    def print_file_name(self):
        self.file_name.head(5)

    def private_workspace(self):
        commonPath = os.path.expanduser(r"~\path")
        GCdocs = commonPath + '384593683' + '\\'
        path = GCdocs + "" + file_name
        if len(self.file_name) != 0 and self.already_exists == True: #if a file already exists in Gfolder
            if self.file_type == "csv": #for csv files
                GC_old = pd.read_csv(path)
                GC_new = GC_old.append(self.file_name, ignore_index=True)
                GC_new.to_csv(path, index=False)
                print("csv file is updated to private workspace in GCdocs")
            elif self.file_type == "xlsx": #for xlsx files
                GC_old = pd.read_csv(path)
                GC_new = GC_old.append(self.file_name, ignore_index=True)
                GC_new.to_excel(path, index=False)
                print("excel file is updated to private workspace in GCdocs")
            else:
                print("unrecognized file type")
        elif len(self.file_name) != 0 and self.already_exists == False: #if a file does NOT already exist in folder
            if self.file_type == "csv":
                self.file_name.to_csv(path, index=False)
            if self.file_type == "xlsx":
                self.file_name.to_excel(path, index=False)
            else:
                print("unrecognized file type")
        else:
            print("there is no data to upload")

award_date_change = uploading_to_GC(award_date_change, "csv", False)
award_date_change.private_workspace
I am aware that I don't need to use a class to do this, but I wanted to challenge myself to start using classes more often. Any help would be appreciated

You can pass and store a df in a class as a data member very simply:
import pandas as pd

class Foo:
    def __init__(self, df: pd.DataFrame):
        self.df = df
        # or, if you want to be sure you don't modify the original df
        self.df = df.copy()

df = pd.DataFrame()
foo_obj = Foo(df)
Edit: the : pd.DataFrame is a type hint. It does not affect how the code runs; it merely tells the reader that we expect a pd.DataFrame as input. Good IDEs will also warn you if you don't pass a DataFrame.
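Applied to your example, a minimal sketch of the same idea (the UploadingToGC name, the df attribute, and the separate file_name argument are illustrative choices, not your original API) could look like this:
import os
import pandas as pd

class UploadingToGC:
    def __init__(self, df: pd.DataFrame, file_type, already_exists):
        # store the dataframe itself as an attribute
        self.df = df
        self.file_type = file_type
        self.already_exists = already_exists

    def private_workspace(self, file_name):
        # build the target path from a plain file name, not from the dataframe
        common_path = os.path.expanduser(r"~\path")
        path = os.path.join(common_path, '384593683', file_name)
        if self.df.empty:
            print("there is no data to upload")
        elif self.file_type not in ("csv", "xlsx"):
            print("unrecognized file type")
        else:
            out = self.df
            if self.already_exists:
                old = pd.read_csv(path) if self.file_type == "csv" else pd.read_excel(path)
                out = pd.concat([old, self.df], ignore_index=True)
            if self.file_type == "csv":
                out.to_csv(path, index=False)
            else:
                out.to_excel(path, index=False)

uploader = UploadingToGC(award_date_change, "csv", False)
uploader.private_workspace("award_date_change.csv")
Storing the frame as self.df and passing the target file name separately keeps the dataframe and the path it is written to from being conflated.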

Related

excel pickling with openpyxl

I have written this code, but on the line _ = ws.cell(column == get_column_letter(cll[0][0]), row == (cll[0][1]), value == cll[1]) it returns an error that I do not understand!
Here's the code:
from openpyxl import Workbook as Wb
from openpyxl.utils import get_column_letter
import dill

def unpickle_exl(file_name, unpkld_file_name):
    """
    unpickles a pickled excel file.
    for file_name (current file name) suffix the name with .dll
    likewise, for unpkld_file_name (file name to be made after unpickling) suffix your name with .xlsx
    failure to do so will result in an error.
    please do not leave the second argument empty: please enter '' instead
    """
    if file_name[len(file_name)-4:] != '.dll':
        raise SyntaxError("file_name does not end with suffix .dll")
    if unpkld_file_name != '' and unpkld_file_name[len(unpkld_file_name)-5:] != '.xlsx':
        raise SyntaxError("unpkld_file_name does not end with suffix .xlsx")
    if unpkld_file_name == '':
        unpkld_file_name == unpkld_file_name.replace(".dll", ".xlsx")
    try:
        with open(file_name, 'rb') as d:
            pkld_sprdsht = dill.load(d)
    except OSError:
        raise ReferenceError("File " + str(filename) + "does not exist.")
    print(pkld_sprdsht)
    wb = Wb()
    for obj in pkld_sprdsht:
        ws = wb.create_sheet()
        for sht in obj:
            for cll in sht:
                _ = ws.cell(column == get_column_letter(cll[0][0]), row == (cll[0][1]), value == cll[1])
    wb.save(filename = unpkld_file_name)

def test():
    unpickle_exl('xlsx_to_dll test sprdsht.dll', 'xlsx_to_dll test sprdsht_copy.xlsx')
I'm trying to save the cells one by one (or maybe everything at once), but I don't really understand the _ = ... bit.
NOTE:
If you need it, I can add the pickling code if it helps.
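For reference, ws.cell() takes row, column and value as keyword arguments, with integer row/column indices, so the == comparisons in that line are the likely source of the error. A minimal sketch of the inner loop, assuming each cll is ((column_index, row_index), value) as in your pickling code:
for obj in pkld_sprdsht:
    ws = wb.create_sheet()
    for sht in obj:
        for cll in sht:
            # keyword arguments, not comparisons; cell() wants integer indices,
            # so get_column_letter() is not needed here
            ws.cell(row=cll[0][1], column=cll[0][0], value=cll[1])
The _ = in the original just discards the Cell object that cell() returns; it is not required.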

Is there a way to read either a psv, csv or excel file into a dataframe depending on the file type of the input?

I want the user of a script to enter the file path and name which will be read into a data frame.
The user can enter something like directory1\test1.csv or directory1\test1.psv or directory1\test1.xlsx
Regardless of whether they enter a psv, csv or xlsx, I want to read it into a dataframe with something like following logic:
If file name ends with .psv then df = pd.read_psv(), elif file name ends with .csv then df = read_csv(), elif file name ends with .xlsx then df = pd.read_excel().
Is there a way to do this?
Sure there is.
import os
import pandas as pd

# Get the file extension
ext = os.path.splitext(in_file)[1]

if ext == '.psv':
    # pandas has no read_psv; a pipe-separated file can be read with read_csv and sep='|'
    df = pd.read_csv(in_file, sep='|')
elif ext == '.csv':
    df = pd.read_csv(in_file)
elif ext == '.xlsx':
    df = pd.read_excel(in_file)
else:
    raise RuntimeError('File extension not recognized')
This is what I have tried:
#Specify the file directory where it is located
file_path = input("Enter file directory path:")
#Specify the file name where it is located
file_name = input("Enter file name:")
#number of rows to skip to read the column names (converted to int because input() returns a string)
skip_rows = int(input("Skip n rows:"))

input_file = file_path + file_name

if file_name.endswith('.csv'):
    df = pd.read_csv(input_file, skiprows=skip_rows)
elif file_name.endswith('.xlsx'):
    df = pd.read_excel(input_file, skiprows=skip_rows)
elif file_name.endswith('.psv'):
    df = pd.read_csv(input_file, sep="|", skiprows=skip_rows)
else:
    print('file format not supported')
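As a side note, the same dispatch can also be written as a mapping from extension to reader function, which avoids growing the if/elif chain as formats are added; a minimal sketch (the read_frame name and READERS mapping are illustrative):
import os
import pandas as pd
from functools import partial

# map each supported extension to the pandas reader that handles it
READERS = {
    '.csv': pd.read_csv,
    '.psv': partial(pd.read_csv, sep='|'),
    '.xlsx': pd.read_excel,
}

def read_frame(path, **kwargs):
    ext = os.path.splitext(path)[1].lower()
    try:
        reader = READERS[ext]
    except KeyError:
        raise ValueError(f'File extension not recognized: {ext}')
    return reader(path, **kwargs)

df = read_frame(input_file, skiprows=skip_rows)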

Having trouble getting function to execute after panda preprocessing

I am doing some work with pandas that requires some preprocessing so that I can graph as intended. Right now I am looping through column names and deleting the ones I do not need. After I have done that I do a merge with another panda df so that I can execute the next function call. The code looks something like:
def makePlotFile(df, asg, dueDate, path, outputFile, gradesPath=None):
    print(gradesPath)
    vizData(df, asg, dueDate, path)
    vizAttempts(df, asg, dueDate, path)
    vizFirstAttempt(df, asg, dueDate, path)
    graph1 = path + "/output1.pdf"
    graph2 = path + "/output2.pdf"
    graph3 = path + "/output3.pdf"
    ready = False
    if gradesPath != None:
        print("Will include grade information")
        grades = pd.read_csv(gradesPath, error_bad_lines=False)
        for column_name, _ in grades.iteritems():
            if asg not in column_name:
                if column_name != 'Email':
                    del grades[column_name]
            if asg in column_name:
                grade = column_name
                ready = True
                print('GOT COLUMN NAME')
        #await asyncio.wait(grade)
        if ready:
            print('HIT IT')
            pd.merge(df, grades, on='Email')
            vizGradesFirstAttempt(df, asg, dueDate, path, grade)
            graph4 = path + "/output4.pdf"
            pdfs = [graph1, graph2, graph3, graph4]
            merger = PdfFileMerger()
            for pdf in pdfs:
                merger.append(pdf)
            merger.write(outputFile)
            merger.close()
    if gradesPath == None:
        print("No grade information given")
        pdfs = [graph1, graph2, graph3]
        merger = PdfFileMerger()
        for pdf in pdfs:
            merger.append(pdf)
        merger.write(outputFile)
        merger.close()
I have added some print statements to help me debug. The statement "Will include grade information" prints, but execution never seems to reach the other print statements. I am not sure if it is a synchronization issue or something else. I would appreciate some guidance; I am assuming I am missing something small, but I am not sure what.
The function calls to:
vizAttempts(df, asg, dueDate, path)
vizFirstAttempt(df, asg, dueDate, path)
all work as expected. They are functions that create graphs using my dataframe. I then merge them into a single PDF; this is what graph1, graph2, and graph3 help me with.
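One detail worth checking in the grades branch: pd.merge returns a new DataFrame rather than modifying df in place, so the merged result has to be assigned before it is used. A minimal sketch of that step, keeping your variable names:
# merge returns a new frame; assign it so the grade column is actually available downstream
merged = pd.merge(df, grades, on='Email')
vizGradesFirstAttempt(merged, asg, dueDate, path, grade)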

iterate over multiple files in my directory

Currently I am grabbing an Excel file from a folder with Python just fine in the code below, and pushing it to a web form via Selenium.
However, I am trying to modify this so it continues through a directory of multiple files (there will be many Excel files in my 'directory' or 'folder').
main.py
from data.find_pending_records import FindPendingRecords
from vital.vital_entry import VitalEntry

if __name__ == "__main__":
    try:
        #Instantiates FindPendingRecords then gets records to process
        PENDING_RECORDS = FindPendingRecords().get_excel_data()
        #Reads excel to map data from excel to vital
        MAP_DATA = FindPendingRecords().get_mapping_data()
        #Configures Driver for vital
        VITAL_ENTRY = VitalEntry()
        #Start chrome and navigate to vital website
        VITAL_ENTRY.instantiate_chrome()
        #Begin processing Records
        VITAL_ENTRY.process_records(PENDING_RECORDS, MAP_DATA)
        print("All done, Bill")
    except Exception as exc:
        print(exc)
config.py
FILE_LOCATION = r"C:\Zip\2019.02.12 Data Docs.zip"
UNZIP_LOCATION = r"C:\Zip\Pending"
VITAL_URL = 'http://boringdatabasewebsite:8080/Horrible'
HEADLESS = False
PROCESSORS = 4
MAPPING_DOC = ".//map/mapping.xlsx"
find_pending_records.py
"""Module used to find records that need to be inserted into Horrible website"""
from zipfile import ZipFile
import math
import pandas
import config
class FindPendingRecords:
"""Class used to find records that need to be inserted into Site"""
#classmethod
def find_file(cls):
""""Finds the excel file to process"""
archive = ZipFile(config.FILE_LOCATION)
for file in archive.filelist:
if file.filename.__contains__('Horrible Data Log '):
return archive.extract(file.filename, config.UNZIP_LOCATION)
return FileNotFoundError
def get_excel_data(self):
"""Places excel data into pandas dataframe"""
excel_data = pandas.read_excel(self.find_file())
columns = pandas.DataFrame(columns=excel_data.columns.tolist())
excel_data = pandas.concat([excel_data, columns])
excel_data.columns = excel_data.columns.str.strip()
excel_data.columns = excel_data.columns.str.replace("/", "_")
excel_data.columns = excel_data.columns.str.replace(" ", "_")
num_valid_records = 0
for row in excel_data.itertuples():
person = row.PERSON
if person in ("", " ", None) or math.isnan(mrn):
print(f"Invalid record: {row}")
excel_data = excel_data.drop(excel_data.index[row.Index])
else:
num_valid_records += 1
print(f"Processing #{num_valid_records} records")
return self.clean_data_frame(excel_data)
def clean_data_frame(self, data_frame):
"""Cleans up dataframes"""
for col in data_frame.columns:
if "date" in col.lower():
data_frame[col] = pandas.to_datetime(data_frame[col],
errors='coerce', infer_datetime_format=True)
data_frame[col] = data_frame[col].dt.date
data_frame['PERSON'] = data_frame['PERSON'].astype(int).astype(str)
return data_frame
def get_mapping_data(self):
map_data = pandas.read_excel(config.MAPPING_DOC, sheet_name='main')
columns = pandas.DataFrame(columns=map_data.columns.tolist())
return pandas.concat([map_data, columns])
One way is as below (pseudocode)
class FindPendingRecords:
    @classmethod
    def find_file(cls):
        return ["file1", "file2", "file3"]

    def __init__(self):
        self.files = self.find_file()

    def get_excel_data(self):
        for excel_data in self.files:
            # process your excel_data
            yield excel_data
Your main should be
if __name__ == "__main__":
    try:
        for PENDING_RECORDS in FindPendingRecords().get_excel_data():
            # Do operations on PENDING_RECORDS
            print(PENDING_RECORDS)
        print("All done, Bill")
    except Exception as exc:
        print(exc)
Your find_file method will be
@classmethod
def find_file(cls):
    """Finds the excel files to process"""
    all_files = list()
    archive = ZipFile(config.FILE_LOCATION)
    for file in archive.filelist:
        if file.filename.__contains__('Horrible Data Log '):
            all_files.append(archive.extract(file.filename, config.UNZIP_LOCATION))
    return all_files
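To make the "# process your excel_data" step concrete, get_excel_data could read each extracted file into a dataframe and yield it; a minimal sketch, assuming the cleanup from your original get_excel_data stays factored into clean_data_frame:
def get_excel_data(self):
    """Yields one pandas dataframe per extracted excel file"""
    for file_path in self.files:
        excel_data = pandas.read_excel(file_path)
        excel_data.columns = excel_data.columns.str.strip()
        # reuse the existing per-frame cleanup
        yield self.clean_data_frame(excel_data)
Because get_excel_data is now a generator, only one file's dataframe is held in memory at a time, and the for loop in main drives the iteration.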

Taking Same Worksheet from a Folder of xlsm Files with Python

I'm new to pandas/python and I've come up with the following code to extract data from a specific part of a worksheet.
import openpyxl as xl
import pandas as pd

rows_with_data = [34,37,38,39,44,45,46,47,48,49,50,54,55,57,58,59,60,62,63,64,65,66,70,71,72,76,77,78,79,80,81,82,83,84,88,89,90,91,92]

path = r'XXX'
xpath = input('XXX')
file = r'**.xlsm'
xfile = input('Change file name, current is ' + file + ' :')
sheetname = r'Summary'

wb = xl.load_workbook(filename = xpath + '\\' + file, data_only = True)
sheet = wb.get_sheet_by_name(sheetname)
rows = len(rows_with_data)

line_items = []
for i in range(rows):
    line_items.append(sheet.cell(row = rows_with_data[i], column = 13).value)

period = []
for col in range(17,35):
    period.append(sheet.cell(row = 20, column = col).value)
print(line_items)

vals = []
x = []
for i in range(rows):
    if i != 0:
        vals.append(x)
        x = []
    for col in range(17,35):
        x.append(sheet.cell(row = rows_with_data[i], column = col).value)
vals.append(x)

all_values = {}
all_values['Period'] = period
for i in range(rows):
    print(line_items[i])
    all_values[line_items[i]] = vals[i]
print(all_values)

period_review = input('Enter a period (i.e. 2002): ')
item = input('Enter a period (i.e. XXX): ')
time = period.index(period_review)
display_item = str(all_values[item][time])
print(item + ' for ' + period_review + " is " + display_item)

Summary_Dataframe = pd.DataFrame(all_values)

writer = pd.ExcelWriter(xpath + '\\' + 'values.xlsx')
Summary_Dataframe.to_excel(writer, 'Sheet1')
writer.save()
writer.close()
I have the same worksheet (summary results) across a library of 60 xlsm files, and I'm having a hard time figuring out how to iterate this across the entire folder of files. I also want to change this from extracting specific rows to taking the entire "Summary" worksheet, pasting it into the new file, and naming the worksheet after its source file ("Experiment_A") when pasted into the new excel file. Any advice?
I had a hard time reading your code and understanding what you want to do in the end, so this is just advice, not a full solution. You can iterate through all files in the folder using os, read the files into one dataframe, then save the single big data frame to CSV. I usually avoid Excel, but I guess you need the Excel conversion. In the example below I read all txt files from a directory, put them into a list of dataframes, then store the combined data frame as JSON. You can also store it as Excel/CSV.
import os
import pandas as pd

def process_data():
    # input file path in 2 parts in case it is very long
    input_path_1 = r'\\path\to\the\folder'
    input_path_2 = r'\second\part\of\the\path'
    # building the full file path
    file_path = input_path_1 + input_path_2
    # listing all files in the folder
    file_list = os.listdir(os.path.join(file_path))
    # selecting only the .txt files into a list object
    file_list = [file_name for file_name in file_list if '.txt' in file_name]
    # selecting the fields we need (sent_date is included so the date transformation below works)
    field_names = ['country', 'ticket_id', 'sent_date']
    # defining a list to collect all the dataframes
    pd_list = []
    inserted_files = []
    # looping over txt files and storing each one in the list
    for file_name in file_list:
        # creating the file path to read the file
        file_path_ = file_path + '\\' + file_name
        df_ = pd.read_csv(os.path.join(file_path_), sep='\t', usecols=field_names)
        # converting the datetime to date
        # a few internal data transformation examples before writing
        df_['sent_date'] = pd.to_datetime(df_['sent_date'])
        df_['sent_date'] = df_['sent_date'].values.astype('datetime64[M]')
        # adding each dataframe to the list
        pd_list.append(df_)
        # adding the file name to the inserted list to print later
        inserted_files.append(file_name)
    print(inserted_files)
    # sql-like union all dataframes to create a single data source
    df_ = pd.concat(pd_list)
    output_path_1 = r'\\path\to\output'
    output_path_2 = r'\path\to\output'
    output_path = output_path_1 + output_path_2
    # put the file name
    file_name = 'xyz.json'
    # adding the day the file was processed
    df_['etl_run_time'] = pd.to_datetime('today').strftime('%Y-%m-%d')
    # write file to json
    df_.to_json(os.path.join(output_path, file_name), orient='records')
    return print('Data Stored as json successfully')

process_data()
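Adapting that pattern to your case, a minimal sketch that loops over the .xlsm files in a folder, reads each file's "Summary" sheet with pandas, and writes each one to a tab named after the source file (the folder path and output file name are illustrative, not from your setup):
import os
import pandas as pd

folder = r'C:\path\to\xlsm_folder'  # illustrative path
output = os.path.join(folder, 'combined_summaries.xlsx')  # illustrative output name

with pd.ExcelWriter(output) as writer:
    for file_name in os.listdir(folder):
        if not file_name.endswith('.xlsm'):
            continue
        # read the whole Summary sheet from this workbook
        summary = pd.read_excel(os.path.join(folder, file_name), sheet_name='Summary')
        # name the output tab after the source file, e.g. "Experiment_A"
        sheet_name = os.path.splitext(file_name)[0][:31]  # Excel limits sheet names to 31 chars
        summary.to_excel(writer, sheet_name=sheet_name, index=False)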
