I am trying to create a pandas pipeline that creates dummy variables and appends the columns to the existing DataFrame. Unfortunately I can't get the appended columns to stick once the pipeline is finished.
Example:
def function(df):
    pass

def create_dummy(df):
    a = pd.get_dummies(df['col'])
    b = df.append(a)
    return b

def mah_pipe(df):
    (df.pipe(function)
       .pipe(create_dummy)
       .pipe(print))
    return df
print(mah_pipe(df))
First - I have no idea if this is good practice.
What's weird is that the .pipe(print) prints the dataframe with appended columns. Yay.
But the statement print(mah_pipe(df)) does not. I thought they would behave the same way.
I have tried to read the documentation for DataFrame.pipe but I couldn't figure it out.
Hoping someone could help shed some light on what's going on.
This is because print in Python returns None. Since you are not making a copy of df in your pipes, your df dies after print.
Pipes in pandas
In pandas we expect a pipe chain to flow (df) -> [pipe1] -> (df_1) -> [pipe2] -> (df_2) -> ... -> [pipeN] -> (df_N), i.e. every pipe takes a DataFrame and returns one. By having print as the last pipe, the output is None.
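You can see this directly (a minimal illustration, not from the original answer):

result = print(df)  # prints the frame to the console...
print(result)       # ...but print() itself returns None

So the value of a chain that ends in .pipe(print) is None, even though the frame was displayed along the way.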
Solution
...
def start_pipe(dataf):
    # make a copy to avoid modifying the original
    dataf = dataf.copy()
    return dataf

def create_dummies(dataf, column_name):
    dummies = pd.get_dummies(dataf[column_name])
    dataf[dummies.columns] = dummies
    return dataf

def print_dataf(dataf, n_rows=5):
    print(dataf.head(n_rows))
    return dataf  # this is important
# usage
...
dt = (df
      .pipe(start_pipe)
      .pipe(create_dummies, column_name='a')
      .pipe(print_dataf, n_rows=10)
)

def mah_pipe(df):
    df = (df
          .pipe(start_pipe)
          .pipe(create_dummies, column_name='a')
          .pipe(print_dataf, n_rows=10)
    )
    return df
print(mah_pipe(df))
I have several functions that need to be sent to a pipeline for an assignment, for example:
def Android_iOs_device_os_cange(df: pd.DataFrame) -> pd.DataFrame:
    import numpy as np
    df = df.copy()

    def foung_android_list(df):
        list_for_android = list(df[df['device_os'] == 'Android'].device_brand.unique())
        list_for_android.remove('(not set)')
        return list_for_android

    def foung_iOS_list(df):
        list_for_iOS = list(df[df['device_os'] == 'iOS'].device_brand.unique())
        list_for_iOS.remove('(not set)')
        return list_for_iOS

    df.loc[:, 'device_os'] = np.where(df['device_brand'].isin(foung_iOS_list(df)) & (df['device_os'].isnull()), 'iOS',
                                      df['device_os'])
    df.loc[:, 'device_os'] = np.where(df['device_brand'].isin(foung_android_list(df)) & (df['device_os'].isnull()), 'Android',
                                      df['device_os'])
    df.loc[:, 'device_os'] = np.where(df['device_os'].isnull(), '(not set)', df['device_os'])
    print(df)
    return df
This function fills all empty values in the device_os column with Android or iOS, depending on which phone brand the client specified, or leaves '(not set)' if the device_brand entry is empty. As a function in Jupyter Lab the code runs fine, but when I inserted it into the pipeline, the code gives me an error about 'device_brand', i.e. it does not find such a column in the DataFrame.
Because this is a data science task, I have a data preprocessing function that I run outside of the pipeline, since the target variable is needed for X and y and I get it from another DataFrame. In theory I could shove everything into the preprocessing function and run the code that way, but the task is the task: if I move Android_iOs_device_os_cange into the preprocessing function, then the from_float_to_int function follows:
def from_float_to_int(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for i in df.columns:
        if df[i].dtype == 'float64':
            df[i] = df[i].astype(int)
    print(df)
    return df
As is easy to guess, it changes the data type from float to int. I have an idea to use a ColumnSelector to replace the types instead, but I am not sure about the correctness of this strategy, and at the end there are functions that cannot be replaced so easily.
Then I encounter the problem that the pipeline does not perceive the columns attribute, and without it I will not be able to perform the three probably most important functions: prepare_for_ohe, make_standard_scatter and Labelencoder_select. The first function drops, from every column except one, the values that occur fewer than 80 times; the second converts certain numeric columns with a StandardScaler; and the last applies a LabelEncoder to the categorical data. All of these functions rely on df.columns, and I do not know what to replace that with, because if I already meet this problem in from_float_to_int, it would be stupid to think it won't appear in the following functions.
def make_standard_scatter(df: pd.DataFrame) -> pd.DataFrame:
    from sklearn.preprocessing import StandardScaler
    df = df.copy()
    data_1 = df[['count_of_action', 'hit_time']]
    std_scaler = StandardScaler()
    std_scaler.fit(data_1)
    std_scaled = std_scaler.transform(data_1)
    list1 = ['count_of_action', 'hit_time']
    list2 = []
    for name in list1:
        std_name = name + '_std'
        list2.append(std_name)
    df[list2] = std_scaled
    print(df)
    return df
def prepare_for_ohe(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df[[i for i in df.columns if i != 'hit_time']] = df[[i for i in df.columns if i != 'hit_time']].apply(
        lambda x: x.where(x.map(x.value_counts()) > 80))
    df = df.dropna()
    return df
def Labelencoder_select(df: pd.DataFrame) -> pd.DataFrame:
    from sklearn.preprocessing import LabelEncoder
    df = df.copy()
    list1 = [i for i in df.columns if (i.split('_')[0] in 'utm') or (i.split('_')[0] in 'device') or (i.split('_')[0] in 'geo')]
    df[list1] = df[list1].apply(LabelEncoder().fit_transform)
    print(df)
    return df
What I am asking, in short, is how to write these functions so that column access (df.columns) and row-by-row data changes are handled correctly by the pipeline. One possible direction is sketched below.
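A sketch of one option (an assumption, not a confirmed fix for this exact code): since every function above takes a DataFrame and returns one, scikit-learn's FunctionTransformer can wrap each of them so they chain inside a Pipeline. With validate=False (the default), the DataFrame, including its columns attribute, is passed through untouched.

# Minimal sketch; assumes the functions above are already defined.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

preprocessing = Pipeline(steps=[
    # each FunctionTransformer just calls its function on the incoming X
    ('fix_device_os', FunctionTransformer(Android_iOs_device_os_cange)),
    ('float_to_int', FunctionTransformer(from_float_to_int)),
    ('rare_values', FunctionTransformer(prepare_for_ohe)),
    ('scaling', FunctionTransformer(make_standard_scatter)),
    ('label_encoding', FunctionTransformer(Labelencoder_select)),
])

df_preprocessed = preprocessing.fit_transform(df)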
I have a problem with getting data.
I have this DataFrame:
I need to filter by 'fabricante' == 'Kellogs' and get the 'calorias' column. I did this:
I need the second column (calorias) to feed into this function:
def valor_medio_intervalo(fabricante, variable, confianza):
    subconjunto = None  # Select only the data (fabricante, variable) from 'cereal_df'
    inicio, final = None, None  # put the statistical function here
    return inicio, final
And this is my code for the last part:
def valor_medio_intervalo(fabricante, variable, confianza):
    subconjunto = cereal_df.loc[cereal_df['fabricante'] == fabricante][variable]
    inicio, final = sm.stats.DescrStatsW(variable).tconfint_mean(alpha=1 - confianza)
    return inicio, final
The error:
I'm gonna be so appreciative if you can help me
You called DescrStatsW('calorias').
But surely you wanted DescrStatsW(subconjunto), right?
I'm just reading https://www.statsmodels.org/stable/generated/statsmodels.stats.weightstats.DescrStatsW.html
which explains you should pass in
a 1- or 2-column numpy array or dataframe.
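A sketch of the corrected version (same function, just passing the filtered data instead of the string name):

def valor_medio_intervalo(fabricante, variable, confianza):
    # subconjunto holds the 'variable' values for the chosen fabricante
    subconjunto = cereal_df.loc[cereal_df['fabricante'] == fabricante, variable]
    # pass the data itself to DescrStatsW, not the column name
    inicio, final = sm.stats.DescrStatsW(subconjunto).tconfint_mean(alpha=1 - confianza)
    return inicio, final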
I am able to change the sequence of columns using the code below, which I found on Stack Overflow. Now I am trying to convert it into a function for regular use, but it doesn't seem to do anything. PyCharm says the local variable df_name value is not used in the last line of my function.
Working Code
columnsPosition = list(df.columns)
F, H = columnsPosition.index('F'), columnsPosition.index('H')
columnsPosition[F], columnsPosition[H] = columnsPosition[H], columnsPosition[F]
df = df[columnsPosition]
My Function - doesn't work, need to make this work
def change_col_seq(df_name, old_col_position, new_col_position):
    columnsPosition = list(df_name.columns)
    F, H = columnsPosition.index(old_col_position), columnsPosition.index(new_col_position)
    columnsPosition[F], columnsPosition[H] = columnsPosition[H], columnsPosition[F]
    df_name = df_name[columnsPosition]  # PyCharm has an issue with this line
I have tried adding return on last statement of function but I am unable to make it work.
To re-order the Columns
To change the position of 2 columns:
def change_col_seq(df_name: pd.DataFrame, old_col_position: str, new_col_position: str):
    # swap the two columns' values, then swap their labels back
    df_name[new_col_position], df_name[old_col_position] = df_name[old_col_position].copy(), df_name[new_col_position].copy()
    df = df_name.rename(columns={old_col_position: new_col_position, new_col_position: old_col_position})
    return df
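Note that rename returns a new DataFrame rather than modifying df_name in place, so the result must be assigned back at the call site, which is also why the original function appeared to do nothing. A hypothetical usage, assuming a frame with columns 'F' and 'H':

df = change_col_seq(df, 'F', 'H')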
To Rename the Columns
You can use the rename method (Documentation)
If you want to change the name of just one column:
def change_col_name(df_name, old_col_name: str, new_col_name: str):
    df = df_name.rename(columns={old_col_name: new_col_name})
    return df
If you want to change the names of multiple columns:
def change_col_name(df_name, old_col_name: list, new_col_name: list):
    df = df_name.rename(columns=dict(zip(old_col_name, new_col_name)))
    return df
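For example (hypothetical column names, assuming the list version above):

df = change_col_name(df, ['col_a', 'col_b'], ['first', 'second'])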
I'm looking for the name of the procedure where the output of one function is handled by several others (trying to find better words for my problem). Some pseudo/actual code would be really helpful.
I have written the following code:
def read_data():
    read data from a file
    create df
    return df

def parse_data():
    sorted_df = read_data()
    count lines
    sort by date
    return sorted_df

def add_new_column():
    new_column_df = parse_data()
    add new column
    return new_column_df

def create_plot():
    plot_data = add_new_column()
    create a plot
    display chart
What I'm trying to understand is how to skip a function, e.g. to create the chain read_data() -> parse_data() -> create_plot().
As the code looks right now (due to all the return values and how they are passed between functions), doing that requires me to change the input data in the last function, create_plot().
I suspect that I'm creating logically incorrect code.
Any thoughts?
Original code:
import pandas as pd
import matplotlib.pyplot as plt
# Read csv files into a data frame
def read_data():
    raw_data = pd.read_csv('C:/testdata.csv', sep=',', engine='python', encoding='utf-8-sig').replace({'{': '', '}': '', '"': '', ',': ' '}, regex=True)
    return raw_data

def parse_data(raw_data):
    ...
    # Convert CreationDate column into datetime
    raw_data['CreationDate'] = pd.to_datetime(raw_data['CreationDate'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    raw_data.sort_values(by=['CreationDate'], inplace=True, ascending=True)
    parsed_data = raw_data
    return parsed_data

raw_data = read_data()
parsed = parse_data(raw_data)
Pass the data in instead of just effectively "nesting" everything. Any data that a function requires should ideally be passed in to the function as a parameter:
def read_data():
    read data from a file
    create df
    return df

def parse_data(sorted_df):
    count lines
    sort by date
    return sorted_df

def add_new_column(new_column_df):
    add new column
    return new_column_df

def create_plot(plot_data):
    create a plot
    display chart

df = read_data()
parsed = parse_data(df)
added = add_new_column(parsed)
create_plot(added)
Try to make sure functions are only handling what they're directly responsible for. It isn't parse_data's job to know where the data is coming from or to produce the data, so it shouldn't be worrying about that. Let the caller handle that.
The way I have things set up here is often referred to as "piping" or "threading". Information "flows" from one function into the next. In a language like Clojure, this could be written as:
(-> (read-data)
(parse-data)
(add-new-column)
(create-plot))
Using the threading macro -> which frees you up from manually needing to handle data passing. Unfortunately, Python doesn't have anything built in to do this, although it can be achieved using external modules.
Also note that since dataframes are mutable, you don't actually need to return the altered frames from the functions. If you're just mutating the argument directly, you could pass the same data frame to each of the functions in order instead of placing it in intermediate variables like parsed and added. The way I'm showing here is a general way to set things up, but it can be altered depending on your exact use case.
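Since the data here is a pandas DataFrame, one way to get this threading style without external modules is pandas' own DataFrame.pipe. A sketch, assuming the refactored functions above (each taking the frame as its first argument and returning it):

(read_data()
    .pipe(parse_data)       # each step receives the previous step's return value
    .pipe(add_new_column)
    .pipe(create_plot))     # create_plot is last, so its lack of a return value doesn't matter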
Use a class to contain your code
class DataManipulation:
    def __init__(self, path):
        self.df = pd.DataFrame()
        self.read_data(path)

    @staticmethod
    def new(file_path):
        return DataManipulation(file_path)

    def read_data(self, path):
        read data from a file
        self.df = create df

    def parse_data(self):
        use self.df
        count lines
        sort by date
        return self

    def add_new_column(self):
        use self.df
        add new column
        return self

    def create_plot(self):
        use self.df
        create a plot
        display chart
        return self
And then,
d = DataManipulation.new(filepath).parse_data().add_new_column().create_plot()
Inside a function, I can't use an argument to stand in for the DataFrame name in df.to_csv().
I have a long script to pull apart and understand. To do so I want to save the different dataframes it uses and store them in order. I created a function to do this, which adds the order number (number_of_interim_exports, e.g. 01) to the name (taken from an argument).
My problem is that I need to use this for multiple dataframe names, but the df.to_csv part won't accept an argument in place of df...
def print_interim_results_any(name, num_exports, df_name):
    global number_of_interim_exports
    global print_interim_outputs
    if print_interim_outputs == 1:
        csvName = str(number_of_interim_exports).zfill(2) + "_" + name
        interimFileName = "interim_export_" + csvName + ".csv"
        df.to_csv(interimFileName, sep=';', encoding='utf-8', index=False)
        number_of_interim_exports += 1
I think I just screwed something else up: this works fine:
import pandas as pd

df = pd.DataFrame({1: [1, 2, 3]})

def f(frame):
    frame.to_csv("interimFileName.csv")

f(df)
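Applying the same idea to the original function, a sketch: call to_csv on the df_name parameter instead of the undefined name df:

def print_interim_results_any(name, num_exports, df_name):
    global number_of_interim_exports
    global print_interim_outputs
    if print_interim_outputs == 1:
        csvName = str(number_of_interim_exports).zfill(2) + "_" + name
        interimFileName = "interim_export_" + csvName + ".csv"
        # the fix: use the DataFrame passed in as df_name, not a hard-coded df
        df_name.to_csv(interimFileName, sep=';', encoding='utf-8', index=False)
        number_of_interim_exports += 1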