Extending pandas generates warning about column creation - python

I have a class that extends the pandas DataFrame:
class teste(pd.DataFrame):
    """DataFrame subclass that stores three extra values as instance attributes.

    NOTE(review): pandas' guide on subclassing describes declaring custom
    attribute names in a class-level ``_metadata`` list; presumably that is
    the supported way to add attributes like these -- confirm against the docs.
    """

    # NOTE(review): `atrib_2 = []` is a mutable default, so the same list
    # object is shared by every call that does not pass `atrib_2`.
    def __init__(self, data=None, index=None, columns=None, dtype=None,
                 copy=False, atrib_0 = '', atrib_1 = None, atrib_2 = []):
        # Forward the standard DataFrame constructor arguments unchanged.
        super(teste,self).__init__(data=data, index=index, columns=columns, dtype=dtype, copy=copy)
        # Plain attribute assignments on the DataFrame instance; the question
        # reports that the list-valued one (atrib_2) triggers the UserWarning.
        self.atrib_0 = atrib_0
        self.atrib_1 = atrib_1
        self.atrib_2 = atrib_2
        return
I created an instance of that class using the following code:
t = teste(pandas_df,
atrib_0 = 'NAME',
atrib_1 = 'D',
atrib_2 = ['A','B','C','D'],
)
But doing that generates a UserWarning for atrib_2, saying that pandas doesn't allow columns to be created via a new attribute name.
I am not creating a new column; I am only setting an attribute on that instance of my class. I believe pandas gets confused because existing columns can be assigned through attribute syntax (e.g. df.existing_column = [...]), so any new attribute that is given a list generates that warning.
Does anybody know how to get rid of it? What am I doing wrong? Any help is much appreciated.

Related

Hand over settings in a class to a pd.read_csv() function

Hi, I am pretty new to Python. I developed the following class:
import pandas as pd
import os
class Class1:
    """Read a CSV file into ``self.raw`` using a dictionary of read_csv settings."""

    # NOTE(review): the default `settings` dict is a mutable default argument,
    # so one dict object is shared by every call that omits `settings`.
    def __init__(self, path, cols = None, settings = {"sep" : ";", "encoding" : "unicode_escape", "header" : "infer", "decimal" :"."
                 , "skiprows" : None, "names" : None, "skipfooter" : 0, "engine" : "python"} ):
        # The file is loaded immediately, at construction time.
        self.raw = self._load_raw(path = path, s = settings, cols = cols)

    def _load_raw(self, path, s, cols = None):
        """Call pd.read_csv on `path`, taking each option from the dict `s`.

        Every key indexed below must be present in `s`; a missing key raises
        KeyError. `cols` is passed through as `usecols`.
        """
        df = pd.read_csv(path, sep = s["sep"], encoding = s["encoding"], decimal = s["decimal"], skiprows = s["skiprows"], skipfooter = s["skipfooter"]
                         , engine = s["engine"], header = s["header"], names = s["names"], usecols = cols)
        return df
Inside the class is a function which reads a CSV file into a pd.DataFrame. I am wondering if there is a smart way of designing the class so that I do not have to hand over such a settings dictionary when creating an object. Let's suppose the CSV file is much simpler and needs just one argument, e.g. "sep", and none of the others; the class still needs to be able to read CSV files that require more arguments. Is there a Pythonic way to hand over only as many arguments as required?
For example, one object needs just "sep", and another object needs all of the settings parameters defined in the docs of pd.read_csv, but both can be created with the same class.

Class that returns a transformed dataframe

I'm trying to create a class that takes the path and name of the CSV file, converts it to a dataframe, deletes some columns, converts another one to datetime, as in the code
import os
from pathlib import Path
import pandas as pd
import datetime
class Plans:
    """Load a ';'-separated CSV file into ``self.df`` and drop a fixed set of columns.

    NOTE(review): the nesting below was reconstructed; the original
    indentation was not preserved.
    """

    def __init__(self, file , path):
        self.file = file
        self.path = path
        # NOTE(review): spelled `Dataframe` here; the quoted AttributeError
        # ("module 'pandas' has no attribute ...") is raised by this lookup.
        self.df = pd.Dataframe()

    def get_dataframe(self):
        """Read ``self.file`` from ``self.path`` and hand off to the cleaning step."""
        # Changes the process-wide working directory so the bare file name resolves.
        os.chdir(self.path)
        self.df = pd.read_csv(self.file, encoding="latin-1", low_memory=False, sep=';')
        # NOTE(review): bare `df` is not defined in this method; only
        # `self.df` is assigned above.
        if 'data' in df.columns:
            self.tipo = 'sales'
            self.df['data'] = pd.to_datetime(df['data'])
        # NOTE(review): called without `self.`, so this looks up a
        # module-level name rather than the method defined below.
        return clean_unused_data()

    def clean_unused_data(self):
        """Delete the listed columns from ``self.df`` and return it."""
        columns = ['id', 'docs', 'sequence','data_in','received', 'banc', 'return', 'status', 'return_cod',
                   'bank_account_return', 'id_transcript', 'id_tx','type_order']
        for item in columns:
            # Deleting a column that is not present raises an exception.
            del self.df[item]
        # Empties the local list in place; it is not used afterwards.
        del columns[:]
        return self.df
When I use an object of the class, I get an error involving the clean_unused_data function.
It returns the following error:
__getattr__ raise AttributeError(f"module 'pandas' has no attribute '{name}'")
Also, I would like to do more dataframe transformations in the Plans class. but since this first one failed, I was a little lost.
Thanks for the help, and I apologize for my lack of familiarity with Python.
I think the error refers to calling an attribute that does not exist in Pandas. From what I can see you wrote pd.DataFrame as pd.Dataframe. Notice the capitalization.
Try the following:
def __init__(self, file , path):
    self.file = file
    self.path = path
    # Corrected spelling: the class is `DataFrame` (capital F).
    self.df = pd.DataFrame()
In addition, one of the columns you are trying to delete is probably not actually in your file. You can handle the exception or remove that column label from your list.

TypeError: __init__() missing 1 required positional argument: 'df'

I want to call df["ID"] in the dataset_csv function and then call the dataset_csv function using dataset = RawToCSV.dataset_csv(input_path). df["ID"] was defined in the raw_file_processing function.
My code raised the error TypeError: __init__() missing 1 required positional argument: 'df'.
import re
import pandas as pd
import os
import numpy as np
input_path = "../input_data"
class RawToCSV:
    """Read raw CSV files found under a folder and assemble a dataset CSV.

    NOTE(review): the nesting below was reconstructed; the original
    indentation was not preserved.
    """

    # NOTE(review): `df` is a required positional parameter, while the call
    # `RawToCSV(input_path)` passes only the path -- that mismatch is the
    # TypeError shown in the traceback.
    def __init__(self, path_, df):
        self.measurement_df = None
        self.cls = None
        self.path_ = path_
        self.df = df

    def raw_file_processing(self, path_):
        """Walk `path_`, read each file into ``self.df`` and derive ID, class and measurement data.

        Returns the 3-tuple ``(self.df["ID"], cls, self.measurement_df)``.
        ``self.df`` is overwritten for every file, so the returned values
        come from the last file processed.
        """
        # Open all the subfolders within path
        for root, dirs, files in os.walk(path_):
            for file in files:
                with open(os.path.join(root, file), "r") as data:
                    self.df = pd.read_csv(data)
                    # 'Class' refers to the independent variable
                    cls_info = self.df.iloc[2]
                    # Dummy-code the classes
                    # NOTE(review): `cls` is a local; `self.cls` is never
                    # assigned in this method and stays None from __init__.
                    cls = pd.get_dummies(cls_info)
                    # Create the ID series by concatenating columns 1-3
                    self.df = self.df.assign(
                        ID=self.df[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
                            lambda row: '_'.join([str(each) for each in row]), axis=1))
                    self.df = self.df.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])
                    # Obtain measurement info
                    # Normalize data against blank/empty columns
                    # log-transform the data
                    # NOTE(review): `col` is passed to re.findall and also has
                    # .readline() called on it -- confirm what iterating
                    # `self.df[9:]` is expected to yield.
                    for col in self.df[9:]:
                        if re.findall(r"Blank|Empty", col):
                            background = col
                        else:
                            line = col.readline()
                            for dat in line:
                                norm_data = dat / background
                                self.measurement_df = np.log2(norm_data)
        return self.df["ID"], cls, self.measurement_df

    def dataset_csv(self):
        """Col 1: ID
        Col 2: class
        Col 3-n: measurements"""
        ids = self.df["ID"]
        id_col = ids.to_frame()
        # Reads `self.cls`, which only __init__ assigns (to None).
        cls_col = self.cls.to_frame()
        frames = [id_col, cls_col, self.measurement_df]
        dataset_df = pd.concat(frames)
        # Writes the combined frame to a fixed path and returns to_csv's result.
        data_csv = dataset_df.to_csv("../input_data/dataset.csv")
        return data_csv
raw = RawToCSV(input_path)
three_tuple = raw.raw_file_processing(input_path)
dataset = raw.data_csv()
Traceback:
> --------------------------------------------------------------------------- TypeError Traceback (most recent call
> last) /tmp/ipykernel_136/323215226.py in <module>
> ----> 1 raw = RawToCSV(input_path)
> 2 three_tuple = raw.raw_file_processing(input_path)
>
> TypeError: __init__() missing 1 required positional argument: 'df'
In this part of code:
dataset = RawToCSV.dataset_csv(input_path)
Here you are calling the method on the class itself; you should first create an instance of the class RawToCSV, like this:
# Create an instance first, then call the method on that instance.
# The class is named RawToCSV and the method is dataset_csv.
rawToCSV = RawToCSV(input_path)
dataset = rawToCSV.dataset_csv()
But you still have another mistake, too. In the constructor of the class, __init__, you assign self.df to itself, and self.df has not been defined yet.
Therefore this part of the code raises another error (AttributeError: 'RawToCSV' object has no attribute 'df'):
def __init__(self, path_):
self.measurement_df = None
self.cls = None
self.path_ = path_
self.df = self.df # <-----
On this line:
dataset = RawToCSV.dataset_csv(input_path)
you're calling dataset_csv as if it were a static method (calling it on the class not an instance). You are passing in input_path, which I assume is a string. Since you're calling the method as if it were static, it is not invisibly adding the actual self value into the call (you have to have an object to even be sent as self).
This means that your one parameter of dataset_csv, which you named self, is receiving the (string) value of input_path.
The error message is telling you that the string input_path has no member .df because it doesn't.
With the way your class and its methods are currently set up, you'll need your entry point code at the bottom to be something like this:
raw = RawToCSV(input_path)
three_tuple = raw.raw_file_processing(input_path)
dataset = raw.dataset_csv()
Though, you may want to restructure your class and its methods

Create multiple dataframes as properties of an instance of a class within if loop

I have a class, myClass, that I wish to add several dataframes to. At first the class requires a name and a list of filepaths for an instance to be created:
class myClass:
    """Holds a name and the list of file paths to be read later."""

    def __init__(self, name, filepathlist):
        self.name = name
        self.filepathlist = filepathlist
The data that is pulled into the instance is not in the desired format. As such I have created a method of the class to format the data and create a property of the class for each file that is read:
def formatData(self):
    """Read one file from ``self.filepathlist``, split its packed column and store the frame.

    NOTE(review): the nesting below was reconstructed; the original
    indentation was not preserved.
    """
    i = 0
    # NOTE(review): this is an `if`, not a loop, so the body runs at most
    # once (for i == 0); `i = i + 1` at the end has no further effect.
    if i < (len(self.filepathlist) - 1):
        DFRAW = pd.read_csv(self.filepathlist[i], header = 9) #Row 9 is the row that is not blank (all blank auto-skipped)
        # Peel one ';'-separated field off the front at each step.
        DFRAW['DateTime'], DFRAW['dummycol1'] = DFRAW[' ;W;W;W;W'].str.split(';', 1).str
        DFRAW['Col1'], DFRAW['dummycol2'] = DFRAW['dummycol1'].str.split(';', 1).str
        DFRAW['Col2'], DFRAW['dummycol3'] = DFRAW['dummycol2'].str.split(';', 1).str
        DFRAW['Col3'], DFRAW['Col4'] = DFRAW['dummycol3'].str.split(';', 1).str
        DFRAW= DFRAW.drop([' ;W;W;W;W', 'dummycol1', 'dummycol2', 'dummycol3'], axis = 1)
        #There appears to be an issue with these two lines.
        # NOTE(review): the local string `processedfilename` is never used;
        # the next line sets an attribute literally named `processedfilename`.
        processedfilename = "MYDFNAME" + str(i)
        self.processedfilename = DFRAW
        i = i + 1
I have run the formatting lines of code, those that start with DFRAW, outside of the class and believe these are working correctly.
Somewhere in the script there is an issue with assigning the dataframes as properties of the class; I create a list of filepaths and an instance of the class:
filepathlist = [r"file1.csv",r"file2.csv"]
myINST = myClass("MyInstName", filepathlist )
Then run the formatting method:
myINST.formatData()
Now running the following to check that the instance of the class, myINST, has the properties correctly assigned;
vars(myINST)
But this returns the filepathlist, name and roughly 8000 lines of rows of data from the dataframe. I was expecting the following:
filepathlist, name, MYDFNAME0, MYDFNAME1
What is the error in my code or my approach?
vars returns all the attributes of an instance. Since your myClass instance has three attributes (name, filepathlist and processedfilename, the last of which is really a dataframe), it returns all of them, including the full contents of the dataframe.
If you only want the filepathlist, you can access it through instance_object.field_name.
For example, myINST.filepathlist will return [r"file1.csv",r"file2.csv"].
Also, this part is probably not doing what you intend:
processedfilename = "MYDFNAME" + str(i)
self.processedfilename = DFRAW
i = i + 1
(1) You are storing dataframe object in a field called processedfilename, which is weird. (2) You are not appending values but rather replacing, thus after the loop, this will only return you the latest data frame in your filepathlist.
You should store your dataframe in a better format: list, dictionary, etc.
Actually you can access your dataframe(s) in vars() if you incorporate it into the __init__ method. Below builds a dictionary of dataframes with keys being original csv file names.
class myClass:
    """Load several raw CSV files and keep the formatted frames in a dict.

    After construction, ``self.mydataframedict`` maps each entry of
    ``filepathlist`` to its formatted DataFrame.
    """

    def __init__(self, name, filepathlist):
        self.name = name
        self.filepathlist = filepathlist
        self.mydataframedict = self.formatData()

    def formatData(self):
        """Return a dict mapping each file path to its formatted DataFrame.

        Each file is read with ``header=9``. Its packed ' ;W;W;W;W' column is
        split on ';' into DateTime, Col1, Col2, Col3 and Col4 (anything after
        the fourth ';' stays in Col4), and the packed column is dropped.
        """
        packed = ' ;W;W;W;W'
        targets = ['DateTime', 'Col1', 'Col2', 'Col3', 'Col4']
        frames = {}
        for f in self.filepathlist:
            raw = pd.read_csv(f, header=9)
            # One split with n=4 replaces the four chained n=1 splits. It
            # uses keyword arguments and does not iterate the `.str`
            # accessor, both of which newer pandas versions require.
            parts = raw[packed].str.split(';', n=4, expand=True)
            # Guarantee all five positions exist even when no row has that
            # many fields; the missing ones become NaN columns.
            parts = parts.reindex(columns=range(len(targets)))
            for position, column in enumerate(targets):
                raw[column] = parts[position]
            frames[f] = raw.drop(columns=[packed])
        return frames
filepathlist = [r"file1.csv", r"file2.csv"]
myINST = myClass("MyInstName", filepathlist )
# __init__ has already stored the result of formatData() in
# myINST.mydataframedict; calling formatData() again here re-reads the files
# and binds a second, separate dict to the local name new_dict.
new_dict = myINST.formatData() # LOCAL VARIABLE (ALSO ACCESSIBLE IN VARS)
print(vars(myINST))
# {'name': 'MyInstName', 'mydataframedict': {'file1.csv': ..., 'file2.csv': ...},
# 'filepathlist': ['file1.csv', 'file2.csv']}

TypeError: 'type' object has no attribute '__getitem__' in pandas DataFrame

I was trying to analyse the play-by-play data of a basketball team
What I did was to read a csv file into a DataFrame object.
I want to preserve the functionality of the DataFrame object while adding in new attributes to the existing object. Thus I wrote a class called Basketball:
from data_math import *
import pandas as pd
class Basketball(pd.DataFrame):
    """DataFrame subclass that computes shooting statistics as attributes at construction."""

    def __init__(self,*args,**kargs):
        pd.DataFrame.__init__(self,*args,**kargs)
        # NOTE(review): every helper below is given the class `pd.DataFrame`
        # rather than this instance; the traceback's TypeError is raised when
        # the helper indexes that class object.
        self.FGM = calculate_FGM(pd.DataFrame)
        # NOTE(review): `pd.DateFrame` is spelled differently from the
        # `pd.DataFrame` used on the other lines.
        self.FGA = calculate_FGA(pd.DateFrame)
        self.FGP = self.FGM / self.FGA
        self.M3 = calculate_3M(pd.DataFrame)
        self.A3 = calcualte_3A(pd.DataFrame)
        self.P3 = self.M3 / self.A3
        # NOTE(review): called as `calcualte_FTM` / `calculate_FTA`, while
        # data_math.py defines `calculate_FTM` / `calcualte_FTA` -- the
        # spellings do not match.
        self.FTM = calcualte_FTM(pd.DataFrame)
        self.FTA = calculate_FTA(pd.DataFrame)
        self.FTP = self.FTM / self.FTA
        # self.P = score_calculate(pd.DataFrame)
I wrote another data_math.py file to help calculate the different attributes I wanted to include into the Basketball class.
from pandas import DataFrame
def score_calculate(df):
    """Return the summed 'points' of made shots plus one per made free throw.

    Reads the 'etype', 'result' and 'points' columns of `df`.
    """
    made = df['result'] == 'made'
    made_shots = (df['etype'] == 'shot') & made
    made_free_throws = (df['etype'] == 'free throw') & made
    # Free throws contribute their row count, not their 'points' value.
    shot_points = df.loc[made_shots, 'points'].sum()
    return shot_points + int(made_free_throws.sum())
def calculate_FGM(df):
    """Return the number of rows matching either of the two 'made' conditions below.

    NOTE(review): this function compares 'etype' against 'shots' and
    'freethrow' and reads a 'results' column, whereas the other functions in
    this module use 'shot', 'free throw' and a 'result' column -- confirm
    which spellings the data actually uses.
    """
    cond_pt = (df['etype']=='shots') & (df['results']=='made')
    cond_ft = (df['etype']=='freethrow') & (df['results']=='made')
    return len(df[cond_pt].index)+len(df[cond_ft].index)
def calculate_FGA(df):
    """Count the rows whose 'etype' is 'shot' plus the rows whose 'etype' is 'free throw'."""
    event = df['etype']
    shots = int((event == 'shot').sum())
    free_throws = int((event == 'free throw').sum())
    return shots + free_throws
def calculate_3M(df):
    """Count the rows that are a 'shot' of type '3pt' with result 'made'."""
    is_shot = df['etype'] == 'shot'
    is_three = df['type'] == '3pt'
    is_made = df['result'] == 'made'
    return int((is_shot & is_three & is_made).sum())
def calcualte_3A(df):
    """Count the rows that are a 'shot' of type '3pt', whatever the result."""
    three_point_shots = (df['etype'] == 'shot') & (df['type'] == '3pt')
    return int(three_point_shots.sum())
def calculate_FTM(df):
    """Count the rows whose 'etype' is 'free throw' and whose 'result' is 'made'."""
    is_free_throw = df['etype'] == 'free throw'
    is_made = df['result'] == 'made'
    return int((is_free_throw & is_made).sum())
def calcualte_FTA(df):
    """Count the rows whose 'etype' is 'free throw'."""
    free_throws = df['etype'] == 'free throw'
    return int(free_throws.sum())
In the end I start my program from main.py which I hope would give me the correct output. However while executing on this line:
team1= Basketball(tm1)
I received the following Traceback
Traceback (most recent call last):
File "/Users/luoyicheng/Developer/STAR-Research/data_analysis/source code/main.py", line 20, in <module>
team1= Basketball(tm1)
File "/Users/luoyicheng/Developer/STAR-Research/data_analysis/source code/Basketball.py", line 6, in __init__
self.FGM = calculate_FGM(pd.DataFrame)
File "/Users/luoyicheng/Developer/STAR-Research/data_analysis/source code/data_math.py", line 9, in calculate_FGM
cond_pt = (df['etype']=='shots') & (df['results']=='made')
TypeError: 'type' object has no attribute '__getitem__'
I am new to python programming and could not figure out why this error has occurred. To my understanding, this error means I am unable to use indexing feature of the DataFrame. However, if I try to code in my main function similar things I am able to get the output I want. I am also not clear of how to extend the existing DataFrame class so that I can still access the methods in the DataFrame class while extending the team1 object to have attributes such as FGM, FGA, etc.
The idea of extending this class is to allow me to pass any DataFrame object in the Basketball() so that I can have an object with extending attributes and methods. I think I also lack an understanding of the use of init and self.
Please forgive me if I have not described the problem clearly, as I am not familiar with all the terminology in OOP.
Thank you so much!
You're passing pd.DataFrame to each function; that is the class itself, which is of type type:
In [11]: type(pd.DataFrame)
Out[11]: type
Hence the exception message.
You meant to pass self (which is an instance of DataFrame), so this line:
self.FGM = calculate_FGM(pd.DataFrame)
...
should read:
self.FGM = calculate_FGM(self)
...

Categories