TypeError: __init__() missing 1 required positional argument: 'df' - python

I want to call df["ID"] in the dataset_csv function and then call the dataset_csv function using dataset = RawToCSV.dataset_csv(input_path). df["ID"] was defined in the raw_file_processing function.
My code raised TypeError: __init__() missing 1 required positional argument: 'df' error.
import re
import pandas as pd
import os
import numpy as np
input_path = "../input_data"
class RawToCSV:
def __init__(self, path_, df):
self.measurement_df = None
self.cls = None
self.path_ = path_
self.df = df
def raw_file_processing(self, path_):
# Open all the subfolders within path
for root, dirs, files in os.walk(path_):
for file in files:
with open(os.path.join(root, file), "r") as data:
self.df = pd.read_csv(data)
# 'Class' refers to the independent variable
cls_info = self.df.iloc[2]
# Dummy-code the classes
cls = pd.get_dummies(cls_info)
# Create the ID series by concatenating columns 1-3
self.df = self.df.assign(
ID=self.df[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
lambda row: '_'.join([str(each) for each in row]), axis=1))
self.df = self.df.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])
# Obtain measurement info
# Normalize data against blank/empty columns
# log-transform the data
for col in self.df[9:]:
if re.findall(r"Blank|Empty", col):
background = col
else:
line = col.readline()
for dat in line:
norm_data = dat / background
self.measurement_df = np.log2(norm_data)
return self.df["ID"], cls, self.measurement_df
def dataset_csv(self):
"""Col 1: ID
Col 2: class
Col 3-n: measurements"""
ids = self.df["ID"]
id_col = ids.to_frame()
cls_col = self.cls.to_frame()
frames = [id_col, cls_col, self.measurement_df]
dataset_df = pd.concat(frames)
data_csv = dataset_df.to_csv("../input_data/dataset.csv")
return data_csv
raw = RawToCSV(input_path)
three_tuple = raw.raw_file_processing(input_path)
dataset = raw.data_csv()
Traceback:
> --------------------------------------------------------------------------- TypeError Traceback (most recent call
> last) /tmp/ipykernel_136/323215226.py in <module>
> ----> 1 raw = RawToCSV(input_path)
> 2 three_tuple = raw.raw_file_processing(input_path)
>
> TypeError: __init__() missing 1 required positional argument: 'df'

In this part of code:
dataset = RawToCSV.dataset_csv(input_path)
You are using the class itself, however you should first instantiate from the class RawToCSV, like this:
rawToCSV = RawTOCSV(input_path)
dataset = rawToCSV.data_csv()
But still you have another mistake ,too. In the constructor of the class , __init__ you've initiated the self.df with self.df, which the latter one hasn't been defined ,yet.
Therefore in this part of code, you'll get another error (AttributeError: 'RawToCSV' object has no attribute 'df'):
def __init__(self, path_):
self.measurement_df = None
self.cls = None
self.path_ = path_
self.df = self.df # <-----

On this line:
dataset = RawToCSV.dataset_csv(input_path)
you're calling dataset_csv as if it were a static method (calling it on the class not an instance). You are passing in input_path, which I assume is a string. Since you're calling the method as if it were static, it is not invisibly adding the actual self value into the call (you have to have an object to even be sent as self).
This means that your one parameter of dataset_csv, which you named self, is receiving the (string) value of input_path.
The error message is telling you that the string input_path has no member .df because it doesn't.
With the way your class and its methods are currently set up, you'll need your entry point code at the bottom to be something like this:
raw = RawToCSV(input_path)
three_tuple = raw.raw_file_processing(input_path)
dataset = raw.dataset_csv()
Though, you may want to restructure your class and its methods

Related

Hand over settings in a class to a pd.read_csv() function

Hi i am pretty new to python. I developed the following class:
import pandas as pd
import os
class Class1:
def __init__(self, path, cols = None, settings = {"sep" : ";", "encoding" : "unicode_escape", "header" : "infer", "decimal" :"."
, "skiprows" : None, "names" : None, "skipfooter" : 0, "engine" : "python"} ):
self.raw = self._load_raw(path = path, s = settings, cols = cols)
def _load_raw(self, path, s, cols = None):
df = pd.read_csv(path, sep = s["sep"], encoding = s["encoding"], decimal = s["decimal"], skiprows = s["skiprows"], skipfooter = s["skipfooter"]
, engine = s["engine"], header = s["header"], names = s["names"], usecols = cols)
return df
Inside of the class is a function which reads a csv file into a pd.DataFrame. I am wondering if there is a smart way of developing the class without handing over such a setting dictionary to read the dataframe later on when creating an object. Lets suppose the csv file is much more easy and just need 1 argument e.g. "sep" and not all the other arguments, but then the class needs also to be able to read csv files which require more arguments. Is there a pythonic way to just hand over as many as required ?
for example 1 object just needs "sep", and another object neeeds all of the settings parameters defined in the docs of pd.read_csv for example, but they can be both created with the same class

Class that returns a transformed dataframe

I'm trying to create a class that takes the path and name of the CSV file, converts it to a dataframe, deletes some columns, converts another one to datetime, as in the code
import os
from pathlib import Path
import pandas as pd
import datetime
class Plans:
def __init__(self, file , path):
self.file = file
self.path = path
self.df = pd.Dataframe()
def get_dataframe(self):
os.chdir(self.path)
self.df = pd.read_csv(self.file, encoding="latin-1", low_memory=False, sep=';')
if 'data' in df.columns:
self.tipo = 'sales'
self.df['data'] = pd.to_datetime(df['data'])
return clean_unused_data()
def clean_unused_data(self):
columns = ['id', 'docs', 'sequence','data_in','received', 'banc', 'return', 'status', 'return_cod',
'bank_account_return', 'id_transcript', 'id_tx','type_order']
for item in columns:
del self.df[item]
del columns[:]
return self.df
When I call an object of the class it gives an error with the clean_unused_data function
returns the following error:
__getattr__ raise AttributeError(f"module 'pandas' has no attribute '{name}'")
Also, I would like to do more dataframe transformations in the Plans class. but since this first one failed, I was a little lost.
Thanks for the help and I apologize for the lack of intimacy with python
I think the error refers to calling an attribute that does not exist in Pandas. From what I can see you wrote pd.DataFrame as pd.Dataframe. Notice the capitalization.
Try the following:
def __init__(self, file , path):
self.file = file
self.path = path
self.df = pd.DataFrame()
Probably one of the columns you are trying to delete is not actually in your file. You can handle the exception or remove this column label from your array.

Create multiple dataframes as properties of an instance of a class within if loop

I have a class, myClass, that I wish to add several dataframes too. At first the class requires a name, and a list of filepaths for an instance to be created:
class myClass:
def __init__(self, name, filepathlist):
self.name = name
self.filepathlist = filepathlist
The data that is pulled into the instance is not in the desired format. As such I have created a method of the class to format the data and create a property of the class for each file that is read:
def formatData(self):
i = 0
if i < (len(self.filepathlist) - 1):
DFRAW = pd.read_csv(self.filepathlist[i], header = 9) #Row 9 is the row that is not blank (all blank auto-skipped)
DFRAW['DateTime'], DFRAW['dummycol1'] = DFRAW[' ;W;W;W;W'].str.split(';', 1).str
DFRAW['Col1'], DFRAW['dummycol2'] = DFRAW['dummycol1'].str.split(';', 1).str
DFRAW['Col2'], DFRAW['dummycol3'] = DFRAW['dummycol2'].str.split(';', 1).str
DFRAW['Col3'], DFRAW['Col4'] = DFRAW['dummycol3'].str.split(';', 1).str
DFRAW= DFRAW.drop([' ;W;W;W;W', 'dummycol1', 'dummycol2', 'dummycol3'], axis = 1)
#There appears to be an issue with these two lines.
processedfilename = "MYDFNAME" + str(i)
self.processedfilename = DFRAW
i = i + 1
I have run the formatting lines of code, those that start with DFRAW, outside of the class and believe these are working correctly.
Somewhere in the script there is an issue with assigning the dataframes as properties of the class; I create a list of filepaths and an instance of the class:
filepathlist = [r"file1.csv",r"file2.csv"]
myINST = myClass("MyInstName", filepathlist )
Then run the formatting method:
myINST.formatData()
Now running the following to check that the instance of the class, myINST, has the properties correctly assigned;
vars(myINST)
But this returns the filepathlist, name and roughly 8000 lines of rows of data from the dataframe. I was expecting the following:
filepathlist, name, MYDFNAME0, MYDFNAME1
What is the error in my code or my approach?
vars will return all the values of an instance, and since myClass have three values: name, filepathlist and processedfilename (which should really be a dataframe), so it will return all.
If you only want the filepathlist, you can access it through instance_object.field_name.
myINST.filepathlist and this will return [r"file1.csv",r"file2.csv"].
Also, you are probably not doing correct here:
processedfilename = "MYDFNAME" + str(i)
self.processedfilename = DFRAW
i = i + 1
(1) You are storing dataframe object in a field called processedfilename, which is weird. (2) You are not appending values but rather replacing, thus after the loop, this will only return you the latest data frame in your filepathlist.
You should store your dataframe in a better format: list, dictionary, etc.
Actually you can access your dataframe(s) in vars() if you incorporate it into the __init__ method. Below builds a dictionary of dataframes with keys being original csv file names.
class myClass:
def __init__(self, name, filepathlist):
self.name = name
self.filepathlist = filepathlist
self.mydataframedict = self.formatData()
def formatData(self):
tmp_dict = {}
for f in self.filepathlist:
DFRAW = pd.read_csv(f, header = 9)
DFRAW['DateTime'], DFRAW['dummycol1'] = DFRAW[' ;W;W;W;W'].str.split(';', 1).str
DFRAW['Col1'], DFRAW['dummycol2'] = DFRAW['dummycol1'].str.split(';', 1).str
DFRAW['Col2'], DFRAW['dummycol3'] = DFRAW['dummycol2'].str.split(';', 1).str
DFRAW['Col3'], DFRAW['Col4'] = DFRAW['dummycol3'].str.split(';', 1).str
DFRAW = DFRAW.drop([' ;W;W;W;W', 'dummycol1', 'dummycol2', 'dummycol3'], axis = 1)
tmp_dict[f] = DFRAW
return tmp_dict
filepathlist = [r"file1.csv", r"file2.csv"]
myINST = myClass("MyInstName", filepathlist )
new_dict = myINST.formatData() # LOCAL VARIABLE (ALSO ACCESSIBLE IN VARS)
print(vars(myINST))
# {'name': 'MyInstName', 'mydataframedict': {'file1.csv': ..., 'file2.csv': ...},
# 'filepathlist': ['file1.csv', 'file2.csv']}

How do I create a class that merges and returns a dataframe, but inherits from another class?

I have two classes, I need the first class to take an input path which contains multiple files; and based off the file type, it inherits specific qualities such as a file provider name, group, and skiprow and footer values derived from provider name. Then I need to pass those values into the merging class so it can merge into one giant concatenated DataFrame, which then can be exported to csv.
import os
import pandas as pd
import logging
import glob
from pandas import Series, DataFrame
class Provider(object):
def __init__(self, path, group, type1_or_type2):
self.group = group.lower()
self.path = path
self.type1_or_type2 = type1_or_type2
self.skipfooter = 1
if self.group == 'AXEL'.lower():
self.skiprows = 3
else:
self.skiprows = 0
class Datamerge(Provider):
def __init__(self, dataframe):
self.dataframe = DataFrame
#classmethod
def massread(cls):
"""
Select your path and index provider to merge the data into one large dataframe
"""
allfiles = glob.glob(self.path +"/*.*")
list_ = []
for file_ in allfiles:
df = pd.read_csv(file_, skiprows = self.skiprows, skipfooter = self.skipfooter, engine = 'python')
list_.append(df)
frame = pd.concat(list_)
return cls(frame)
s = Provider('F:\Desktop/FNMRA','AXEL','type1')
s.frame.to_csv('F:\Desktop/test.csv')
The error that comes up is Traceback (most recent call last):
File "F:\Desktop\Python Scripts\massload.py", line 47, in <module>
s.DataMerge
AttributeError: 'Provider' object has no attribute 'DataMerge'
[Finished in 3.1s with exit code 1]

Populating a model from csv: __init__ arguments error

I have a table with large number of columns which needs to be populated from a csv file. I have the following __init__ code inside the model definition. [1]
class Table
column1 = ............
column2 = .............
.......
def __init__(self, **kwargs):
self.__dict__.update(kwargs)
The code to read from csv file is (load_csv.py)
data_file = "data.csv"
csv_file = csv.DictReader(open(data_file, 'rU'), delimiter=',')
for row in csv_file:
table_entries = {}
for key, value in row.items():
table_entries[key] = value
table_row = Table(table_entries)
db.session.add(table_row)
db.session.commit()
I get the following error on executing load_csv.py
table_row = Table(table_entries)
TypeError: __init__() takes exactly 1 argument (2 given)
I read that this is because it is using the default __init__ but I am not able to see why it is missing __init__ function I have defined in code. Any help in resolving this problem would be much appreciated.
You want to apply the dictionary as keyword arguments:
table_row = Table(**table_entries)
or change your Table() class to receive one argument:
class Table
def __init__(self, row):
self.__dict__.update(row)

Categories