Class that returns a transformed dataframe - python

I'm trying to create a class that takes the path and name of the CSV file, converts it to a dataframe, deletes some columns, converts another one to datetime, as in the code
import os
from pathlib import Path
import pandas as pd
import datetime
# Loads a ';'-separated CSV into self.df, converts the 'data' column to
# datetime when present, and deletes a fixed list of columns.
class Plans:
def __init__(self, file , path):
self.file = file
self.path = path
# NOTE(review): pandas spells the class DataFrame (capital F); pd.Dataframe
# raises the "module 'pandas' has no attribute" error quoted in this post.
self.df = pd.Dataframe()
def get_dataframe(self):
# Changes the working directory of the whole process so that self.file is
# resolved relative to self.path.
os.chdir(self.path)
self.df = pd.read_csv(self.file, encoding="latin-1", low_memory=False, sep=';')
# NOTE(review): bare df is never assigned in this method; self.df looks
# intended here and in the to_datetime call below - confirm.
if 'data' in df.columns:
self.tipo = 'sales'
self.df['data'] = pd.to_datetime(df['data'])
# NOTE(review): clean_unused_data is a method of this class, so a bare call
# needs self. - self.clean_unused_data() looks intended.
return clean_unused_data()
def clean_unused_data(self):
columns = ['id', 'docs', 'sequence','data_in','received', 'banc', 'return', 'status', 'return_cod',
'bank_account_return', 'id_transcript', 'id_tx','type_order']
for item in columns:
# del raises KeyError when the column is not present in the loaded file.
del self.df[item]
# Empties the local list. NOTE(review): the original indentation was lost; if
# this line sits inside the loop, iteration ends after the first column.
del columns[:]
return self.df
When I create an object of the class and use it, I get an error related to the clean_unused_data function.
It returns the following error:
__getattr__ raise AttributeError(f"module 'pandas' has no attribute '{name}'")
Also, I would like to do more dataframe transformations in the Plans class. but since this first one failed, I was a little lost.
Thanks for the help, and I apologize for my lack of familiarity with Python.

I think the error refers to calling an attribute that does not exist in Pandas. From what I can see you wrote pd.DataFrame as pd.Dataframe. Notice the capitalization.
Try the following:
def __init__(self, file, path):
    """Store the file name and path and start with an empty DataFrame.

    Args:
        file: Value stored on ``self.file``.
        path: Value stored on ``self.path``.
    """
    self.file = file
    self.path = path
    # pandas spells the class ``DataFrame``; ``pd.Dataframe`` does not exist.
    self.df = pd.DataFrame()

Probably one of the columns you are trying to delete is not actually in your file. You can handle the exception or remove this column label from your array.

Related

Call function and parameters based on condition in Python

I have written a function in Python that reads either a .csv or an .xls file and returns it as a pandas DataFrame.
Based on the passed file_type, the function uses either pandas.read_csv() or pandas.read_excel(), with just one slight difference in the parameters.
It works without an issue, but it's obviously repeated code that I would like to reduce.
So how could I best:
Have just one function call that is dynamically changed to the specific one defined by the file_type variable
Dynamically change the parameters of the then called function based on the same variable?
Here is my current code.
Thanks for your help.
def file_to_df(file_name, fields=None, file_type=None, encoding=None):
    """Read stock level from csv or xls file. Filter SKU and Qty. Return dataframe.

    Args:
        file_name: Path or buffer handed to the pandas reader.
        fields: Columns to keep; forwarded to the reader as ``usecols``.
        file_type: ``'csv'`` or ``'xls'``; selects the pandas reader.
        encoding: Text encoding; only used for csv input.

    Returns:
        The loaded DataFrame with every row that contains an empty or missing
        value removed, or ``None`` (after printing a message) when
        ``file_type`` is neither ``'csv'`` nor ``'xls'``.
    """
    # Options shared by both readers.
    read_options = {
        'converters': {'Barcode': str, 'Qty': int},
        'usecols': fields,
    }
    if file_type == 'csv':
        data_frame = pd.read_csv(file_name, encoding=encoding, **read_options)
    elif file_type == 'xls':
        data_frame = pd.read_excel(file_name, **read_options)
    else:
        # The former check ``file_type == 'csv' or 'xls'`` was always true
        # ('xls' is a non-empty string), so this branch was unreachable and an
        # unsupported type failed later on an unbound ``data_frame``.
        print('no csv or xls filetype was handed to file_to_df')
        return None

    # Remove empty rows
    # np_nan is not defined in this snippet; presumably numpy's NaN - confirm
    # the import at the call site.
    data_frame.replace('', np_nan, inplace=True)
    data_frame.dropna(axis=0, how='any', subset=None, inplace=True)
    return data_frame
For the parameters i tried using two tuples that are put into the function call.
You can modify your function signature and use keyword arguments (see also PEP 3102 on keyword-only arguments). After that, create a dict of parameters, add your fixed parameters (converters), rename some parameters (fields -> usecols), and pass the other parameters through as is:
import pandas as pd
import pathlib
def file_to_df(file_name, **kwargs):
    """Load a csv, xls or xlsx file with the pandas reader matching its extension.

    Args:
        file_name: Path of the file to read.
        **kwargs: Extra reader options. ``fields`` is renamed to ``usecols``;
            everything else is forwarded unchanged and may override the
            fixed defaults.

    Returns:
        The DataFrame produced by ``pd.read_csv`` or ``pd.read_excel``.

    Raises:
        RuntimeError: If the file extension is not supported.
    """
    xfile = pathlib.Path(file_name)
    params = {
        'converters': {'Barcode': str, 'Qty': int},  # add fixed parameters
        'usecols': kwargs.pop('fields', None),  # convert fields to usecols
    } | kwargs  # pass all other parameters as is

    # determine the right function according to the extension
    funcs = {
        '.csv': pd.read_csv,
        '.xls': pd.read_excel,
        '.xlsx': pd.read_excel,
    }
    # Look the reader up before calling it, so that a KeyError raised while
    # reading is not mistaken for an unsupported extension. The suffix is
    # lower-cased so that 'DATA.CSV' is accepted as well.
    reader = funcs.get(xfile.suffix.lower())
    if reader is None:
        raise RuntimeError('no csv or xls filetype was handed to file_to_df')
    return reader(xfile, **params)
Don't pass a string that has to be mapped to a particular function; just pass the correct function.
def file_to_df(file_name, fields=None, *, converter, **kwargs):
    """Read stock level from csv or xlsx file. Filter SKU and Qty. Return dataframe.

    Args:
        file_name: Path or buffer handed to ``converter``.
        fields: Columns to keep; forwarded as ``usecols``.
        converter: Reader callable, e.g. ``pd.read_csv`` or ``pd.read_excel``.
        **kwargs: Extra options forwarded unchanged to ``converter``.

    Returns:
        The DataFrame returned by ``converter`` with every row that contains
        an empty or missing value removed.
    """
    # The stray empty argument (``file_name, , converters=...``) was a syntax error.
    data_frame = converter(
        file_name,
        converters={'Barcode': str, 'Qty': int},
        usecols=fields,
        **kwargs,
    )
    # np_nan is not defined in this snippet; presumably numpy's NaN - confirm
    # the import at the call site.
    data_frame.replace('', np_nan, inplace=True)
    data_frame.dropna(axis=0, how='any', subset=None, inplace=True)
    return data_frame
# ``encoding`` is a read_csv option; read_excel does not accept it, so it
# belongs on the csv call.
df1 = file_to_df('foo.csv', converter=pd.read_csv, encoding='...')
df2 = file_to_df('foo.xlsx', converter=pd.read_excel)

Hand over settings in a class to a pd.read_csv() function

Hi, I am pretty new to Python. I developed the following class:
import pandas as pd
import os
class Class1:
    """Load a CSV file into ``self.raw`` with configurable ``pd.read_csv`` options."""

    # Baseline read_csv options; caller-supplied settings override them key by key.
    _DEFAULT_SETTINGS = {
        "sep": ";",
        "encoding": "unicode_escape",
        "header": "infer",
        "decimal": ".",
        "skiprows": None,
        "names": None,
        "skipfooter": 0,
        "engine": "python",
    }

    def __init__(self, path, cols=None, settings=None):
        """Read ``path`` and keep the result on ``self.raw``.

        Args:
            path: File path or buffer handed to ``pd.read_csv``.
            cols: Columns to load; forwarded as ``usecols``.
            settings: Optional dict of ``pd.read_csv`` keyword arguments. Only
                the keys that differ from the defaults need to be given.
        """
        # Build a fresh dict per call: a dict literal as the default argument
        # would be one object shared by every instance.
        merged = {**self._DEFAULT_SETTINGS, **(settings or {})}
        self.raw = self._load_raw(path=path, s=merged, cols=cols)

    def _load_raw(self, path, s, cols=None):
        """Return ``pd.read_csv(path, usecols=cols, **s)``.

        Args:
            path: File path or buffer to read.
            s: Dict of ``pd.read_csv`` keyword arguments.
            cols: Columns to load; forwarded as ``usecols``.
        """
        return pd.read_csv(path, usecols=cols, **s)
Inside the class is a function which reads a CSV file into a pd.DataFrame. I am wondering if there is a smart way of developing the class without having to hand over such a settings dictionary when creating an object. Let's suppose the CSV file is much simpler and needs just one argument, e.g. "sep", and none of the others; the class still needs to be able to read CSV files which require more arguments. Is there a Pythonic way to hand over only as many arguments as required?
For example, one object needs just "sep", and another object needs all of the settings parameters defined in the docs of pd.read_csv, but both can be created with the same class.

TypeError: __init__() missing 1 required positional argument: 'df'

I want to call df["ID"] in the dataset_csv function and then call the dataset_csv function using dataset = RawToCSV.dataset_csv(input_path). df["ID"] was defined in the raw_file_processing function.
My code raised TypeError: __init__() missing 1 required positional argument: 'df' error.
import re
import pandas as pd
import os
import numpy as np
input_path = "../input_data"
# Walks a directory tree of raw CSV files, derives an ID column, class dummies
# and log2-normalised measurements, and writes the combined data to a CSV file.
class RawToCSV:
# NOTE(review): df is a required positional argument, so RawToCSV(input_path)
# at the bottom of this snippet raises the TypeError shown in the traceback.
def __init__(self, path_, df):
self.measurement_df = None
# NOTE(review): self.cls is never reassigned in this class
# (raw_file_processing only binds a local cls), so dataset_csv would call
# .to_frame() on None.
self.cls = None
self.path_ = path_
self.df = df
def raw_file_processing(self, path_):
# Open all the subfolders within path
for root, dirs, files in os.walk(path_):
for file in files:
with open(os.path.join(root, file), "r") as data:
# Each file read replaces self.df. NOTE(review): the indentation was lost, so
# it is unclear how much of the code below runs once per file.
self.df = pd.read_csv(data)
# 'Class' refers to the independent variable
# NOTE(review): iloc[2] selects the row at position 2, not a column - confirm.
cls_info = self.df.iloc[2]
# Dummy-code the classes
cls = pd.get_dummies(cls_info)
# Create the ID series by concatenating columns 1-3
self.df = self.df.assign(
ID=self.df[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
lambda row: '_'.join([str(each) for each in row]), axis=1))
self.df = self.df.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])
# Obtain measurement info
# Normalize data against blank/empty columns
# log-transform the data
# NOTE(review): self.df[9:] slices rows, and iterating a DataFrame yields its
# column labels, so col is a label here rather than column data.
for col in self.df[9:]:
if re.findall(r"Blank|Empty", col):
background = col
else:
# NOTE(review): readline() is a file-object method; col is a column label, and
# background is unbound until a Blank/Empty label has been seen - confirm intent.
line = col.readline()
for dat in line:
norm_data = dat / background
self.measurement_df = np.log2(norm_data)
return self.df["ID"], cls, self.measurement_df
def dataset_csv(self):
"""Col 1: ID
Col 2: class
Col 3-n: measurements"""
ids = self.df["ID"]
id_col = ids.to_frame()
cls_col = self.cls.to_frame()
frames = [id_col, cls_col, self.measurement_df]
# NOTE(review): pd.concat defaults to axis=0 (stacks rows); the docstring
# describes columns - confirm whether axis=1 is wanted.
dataset_df = pd.concat(frames)
# to_csv returns None when it is given a path, so data_csv is None.
data_csv = dataset_df.to_csv("../input_data/dataset.csv")
return data_csv
raw = RawToCSV(input_path)
three_tuple = raw.raw_file_processing(input_path)
# NOTE(review): the method defined above is dataset_csv; data_csv is not an
# attribute of the class.
dataset = raw.data_csv()
Traceback:
> --------------------------------------------------------------------------- TypeError Traceback (most recent call
> last) /tmp/ipykernel_136/323215226.py in <module>
> ----> 1 raw = RawToCSV(input_path)
> 2 three_tuple = raw.raw_file_processing(input_path)
>
> TypeError: __init__() missing 1 required positional argument: 'df'
In this part of code:
dataset = RawToCSV.dataset_csv(input_path)
You are using the class itself, however you should first instantiate from the class RawToCSV, like this:
# Create an instance first, then call the method on that instance. The class
# is named RawToCSV and the method dataset_csv.
rawToCSV = RawToCSV(input_path)
dataset = rawToCSV.dataset_csv()
But you still have another mistake. In the class constructor, __init__, you've initialized self.df with self.df, and the latter hasn't been defined yet.
Therefore, in this part of the code, you'll get another error (AttributeError: 'RawToCSV' object has no attribute 'df'):
# NOTE(review): illustration by the answerer. The __init__ shown in the question
# takes df as a parameter and assigns self.df = df, which differs from the
# flagged line below.
def __init__(self, path_):
self.measurement_df = None
self.cls = None
self.path_ = path_
self.df = self.df # <-----
On this line:
dataset = RawToCSV.dataset_csv(input_path)
you're calling dataset_csv as if it were a static method (calling it on the class not an instance). You are passing in input_path, which I assume is a string. Since you're calling the method as if it were static, it is not invisibly adding the actual self value into the call (you have to have an object to even be sent as self).
This means that your one parameter of dataset_csv, which you named self, is receiving the (string) value of input_path.
The error message is telling you that the string input_path has no member .df because it doesn't.
With the way your class and its methods are currently set up, you'll need your entry point code at the bottom to be something like this:
# Instantiate the class, then call both methods on the instance.
# NOTE(review): the question's __init__(self, path_, df) also requires a df
# argument, so this constructor call still needs one.
raw = RawToCSV(input_path)
three_tuple = raw.raw_file_processing(input_path)
dataset = raw.dataset_csv()
Though, you may want to restructure your class and its methods

Extending pandas generates warning about column creation

I have a class that extends pandas
class teste(pd.DataFrame):
    """DataFrame subclass that carries three extra instance attributes."""

    def __init__(self, data=None, index=None, columns=None, dtype=None,
                 copy=False, atrib_0='', atrib_1=None, atrib_2=None):
        """Build the frame and attach the extra attributes.

        Args:
            data, index, columns, dtype, copy: Forwarded to
                ``pd.DataFrame.__init__``.
            atrib_0: Stored on ``self.atrib_0``.
            atrib_1: Stored on ``self.atrib_1``.
            atrib_2: Stored on ``self.atrib_2``; a new empty list when omitted.
        """
        super().__init__(data=data, index=index, columns=columns, dtype=dtype, copy=copy)
        self.atrib_0 = atrib_0
        self.atrib_1 = atrib_1
        # A fresh list per instance: a ``[]`` default argument is created once
        # and would be shared by every instance built without atrib_2.
        # NOTE(review): assigning a list-like to a new attribute name is what
        # triggers pandas' UserWarning; pandas documents the class attribute
        # ``_metadata`` for declaring custom attributes on subclasses.
        self.atrib_2 = [] if atrib_2 is None else atrib_2
I created an instance of that class using the following code:
# Builds a teste instance from pandas_df (defined elsewhere) and sets the three
# extra attributes; atrib_2 receives a list.
t = teste(pandas_df,
atrib_0 = 'NAME',
atrib_1 = 'D',
atrib_2 = ['A','B','C','D'],
)
But doing that generates a UserWarning for the atrib_2, saying Pandas doesn't allow columns to be created via a new attribute name.
Since I am not creating a new column, but attributing a property to that instance of my class, I believe it gets confused because it's possible to access existing columns using the code df.new_column = []. Any new attribute that gets a list generates that warning.
Does anybody know how to get rid of it? What am I doing wrong? Any help is much appreciated.

How do I create a class that merges and returns a dataframe, but inherits from another class?

I have two classes, I need the first class to take an input path which contains multiple files; and based off the file type, it inherits specific qualities such as a file provider name, group, and skiprow and footer values derived from provider name. Then I need to pass those values into the merging class so it can merge into one giant concatenated DataFrame, which then can be exported to csv.
import os
import pandas as pd
import logging
import glob
from pandas import Series, DataFrame
class Provider(object):
    """Hold a provider's input path, group and the row offsets used when reading."""

    def __init__(self, path, group, type1_or_type2):
        """Store the arguments and derive ``skiprows`` from the group.

        Args:
            path: Stored on ``self.path``.
            group: Provider group name; stored lower-cased on ``self.group``.
            type1_or_type2: Stored unchanged on ``self.type1_or_type2``.
        """
        self.group = group.lower()
        self.path = path
        self.type1_or_type2 = type1_or_type2
        self.skipfooter = 1
        # Only the 'axel' group (matched case-insensitively through the
        # lower-cased name) skips three leading rows.
        self.skiprows = 3 if self.group == 'axel' else 0
# Subclass of Provider meant to read every file under a path and concatenate
# the results into one DataFrame.
class Datamerge(Provider):
# NOTE(review): Provider.__init__ is not called here, so path, skiprows and
# skipfooter are never set on a Datamerge instance.
def __init__(self, dataframe):
# NOTE(review): this assigns the DataFrame class itself and ignores the
# dataframe argument - self.dataframe = dataframe looks intended.
self.dataframe = DataFrame
# NOTE(review): the next line is only a comment (possibly a mangled @classmethod
# decorator); as written massread is a plain method whose first parameter is
# named cls.
#classmethod
def massread(cls):
"""
Select your path and index provider to merge the data into one large dataframe
"""
# NOTE(review): self is not defined inside massread; its only parameter is cls.
allfiles = glob.glob(self.path +"/*.*")
list_ = []
for file_ in allfiles:
df = pd.read_csv(file_, skiprows = self.skiprows, skipfooter = self.skipfooter, engine = 'python')
list_.append(df)
frame = pd.concat(list_)
return cls(frame)
# NOTE(review): '\D' is not a recognised escape sequence; a raw string or
# forward slashes would avoid the invalid-escape warning.
s = Provider('F:\Desktop/FNMRA','AXEL','type1')
# NOTE(review): Provider defines no frame attribute and massread is never
# called, so this line cannot work as written.
s.frame.to_csv('F:\Desktop/test.csv')
The error that comes up is Traceback (most recent call last):
File "F:\Desktop\Python Scripts\massload.py", line 47, in <module>
s.DataMerge
AttributeError: 'Provider' object has no attribute 'DataMerge'
[Finished in 3.1s with exit code 1]

Categories