Pandas SettingWithCopyWarning only when copied outside of scope - python

Why does calling df.copy() prevent the warning only when it is done inside the function scope, while the warning still appears if the copy is made outside of the scope?
Example 1 - Gives Warning:
def do_something(df):
    """Keep the rows whose ``x`` is 1, 2 or 2698 and set their ``y`` to 0.

    The ``.loc`` assignment is made on a frame selected from the caller's
    ``df``; this is the variant that gives the warning.
    """
    df = df.loc[df['x'].isin([1, 2, 2698])]
    df.loc[:, "y"] = 0
    return df


df = pd.DataFrame({"x": [1, 2, 4], "y": ["a", "b", "c"]})
do_something(df)
Example 2 - Does not give warning
def do_something(df):
    """Copy ``df``, keep the rows whose ``x`` is 1, 2 or 2698, set ``y`` to 0.

    Same steps as Example 1, but performed on an explicit copy of the
    caller's frame; this variant does not give the warning.
    """
    df = df.copy() # <---------------------- This fixes the warning but is not desirable as it creates a copy (which can take a lot of memory)
    df = df.loc[df['x'].isin([1, 2, 2698])]
    df.loc[:, "y"] = 0
    return df


df = pd.DataFrame({"x": [1, 2, 4], "y": ["a", "b", "c"]})
do_something(df)

Solution - change the function to use df.loc[:] = instead of df =
def do_something(df):
    """Write the filtered rows back into ``df`` and set ``y`` to 0.

    Assigning through ``df.loc[:]`` keeps working on the frame that was
    passed in instead of rebinding ``df`` to a filtered selection.
    """
    # df = df.loc[df['x'].isin([1, 2, 2698])] # <------ Replace this with the line below
    # NOTE(review): this assigns into the existing rows, so the caller's
    # frame is modified and no rows are removed; presumably rows that fail
    # the filter are left as NaN after alignment -- confirm this is intended.
    df.loc[:] = df.loc[df['x'].isin([1, 2, 2698])]
    # ^^^^^^^ <--- add this
    df.loc[:, "y"] = 0
    return df

Related

Python dataframe merge on condition

I have two data frames, and 3 conditions to create new data frame
1)df1["Product"]==df2["Product"] and df2["Date"] >= df1["Date"]
2) Now loop over df2["Product"], summing df2["Count"] and checking on each iteration whether df2["Count"] == df1["Count"]
Example
df1["Product"][2] = "147326.A" and df1["Date"][2] = "1/03/22" and df1["Count"][2] = 4,
Now we check df2 for a match: df2["Product"][1] == df1["Product"][2] and df2["Date"][1] >= df1["Date"][2]. Once the first condition is met, we need to sum() df2["Count"] and, on each iteration, compare it to df1["Count"]; if df1["Count"] == df2["Count"], add the row to the new data frame.
# Sample input frames: one row per (Date, Product) with its Count.
df1 = pd.DataFrame(
    {
        "Date": ["11/01/22", "1/02/22", "1/03/22", "1/04/22", "2/02/22"],
        "Product": ["315114.A", "147326.A", "147326.A", "91106.A", "283214.A"],
        "Count": [3, 1, 4, 1, 2],
    }
)
# NOTE(review): the second Product value below carries a trailing space
# ("147326.A "); it is reproduced as written -- confirm whether intentional.
df2 = pd.DataFrame(
    {
        "Date": ["15/01/22", "4/02/22", "7/03/22", "1/04/22",
                 "2/02/22", "15/01/22", "1/06/22", "1/06/22"],
        "Product": ["315114.A", "147326.A ", "147326.A", "91106.A",
                    "283214.A", "315114.A", "147326.A", "147326.A"],
        "Count": [1, 1, 2, 1, 2, 2, 1, 1],
    }
)
The following data should be a match:
# Second sample: a single df1 row and six df2 rows for the same product.
df1 = pd.DataFrame(
    {
        "Date": ["01/03/2022"],
        "Product": ["91106.A"],
        "Count": [2],
    }
)
df2 = pd.DataFrame(
    {
        "Date": ["01/03/2022", "7/03/2022", "7/03/2022",
                 "7/03/2022", "7/03/2022", "7/03/2022"],
        "Product": ["91106.A", "91106.A", "91106.A",
                    "91106.A", "91106.A", "91106.A"],
        "Count": [1, 1, 1, 1, 1, 1],
    }
)
You could solve this in a list comprehension (within a pd.DataFrame):
# Parse the day-first date strings once so that ">=" compares dates rather
# than text (as strings, "1/04/22" >= "15/01/22" is False).
df1_dates = pd.to_datetime(df1["Date"], dayfirst=True)
df2_dates = pd.to_datetime(df2["Date"], dayfirst=True)
# Keep each df1 row whose Count equals the summed Count of the df2 rows that
# have the same Product and a Date on or after the df1 row's Date.
df3 = pd.DataFrame([
    j.to_dict()
    for (i, j), start in zip(df1.iterrows(), df1_dates)
    if j["Count"] == df2.loc[(df2["Product"] == j["Product"]) &
                             (df2_dates >= start), "Count"].sum()
])
Splitting this up into lots of lines would look like this:
# Parse the day-first date strings once so that ">=" compares dates rather
# than text (as strings, "1/04/22" >= "15/01/22" is False).
df1_dates = pd.to_datetime(df1["Date"], dayfirst=True)
df2_dates = pd.to_datetime(df2["Date"], dayfirst=True)
l = []
for (i, j), start in zip(df1.iterrows(), df1_dates):
    # df2 rows with the same Product and a Date on or after this row's Date.
    matching = (df2["Product"] == j["Product"]) & (df2_dates >= start)
    if j["Count"] == df2.loc[matching, "Count"].sum():
        x = j.to_dict()
        l.append(x)
df3 = pd.DataFrame(l)

maintaining pandas df index with selection & groupby (python)

I am having an issue with returning the original df index of a row given a groupby condition after subselecting some of the df. It's easier to understand through code.
So if we start with a toy dataframe:
# Toy frame: column 'a' is 0 for the first half of the rows and 1 for the
# second half; column 'b' runs 2, 4, 2, 4 in quarter-sized stretches.
headers = ['a', 'b']
nrows = 8
half, quarter = nrows // 2, nrows // 4
df = pd.DataFrame(columns=headers)
df['a'] = [0] * half + [1] * half
df['b'] = ([2] * quarter + [4] * quarter) * 2
print(df)
Then I select the subset of data I am interested in and check that the index is retained:
# Boolean mask for the rows that belong to group 1 (indices 4-7).
in_group_one = df['a'] == 1
sub_df = df[in_group_one]
# The selection still carries the original index at this point.
print(sub_df.index)
sub_df.index returns
Int64Index([4, 5, 6, 7], dtype='int64')
Which seems great! I would like to group data from that subset and extract the original df index and that is where the issue occurs:
For example:
# Group the subset by column 'b' and print the GroupBy's `indices` mapping.
g_df = sub_df.groupby('b')
g_df_idx = g_df.indices
# NOTE(review): presumably `indices` holds positions within sub_df rather than
# the original index labels, which would explain the unwanted output -- confirm.
print(g_df_idx) ## bad!
when I print(g_df_idx) I want it to return:
{2: array([4,5]), 4: array([6,7])}
Due to the way I will be using this code I can't just groupby(['a','b'])
I'm going nuts with this thing. Here are some of the many solutions I have tried:
## 1 -- the same groupby, written as a single expression
e1_idx = sub_df.groupby('b').indices
# print(e1_idx) ## issue persists
## 2 -- pass as_index explicitly
e2 = sub_df.groupby('b', as_index = True) ## also tried as_index = False
e2_idx = e2.indices
# print(e2_idx) ## issue persists
## 3 -- reset the index of the subset before grouping
e3 = sub_df.reset_index()
e3_idx = e3.groupby('b').indices
# print(e3_idx) ## issue persists
I'm sure there must be some simple solution I'm just overlooking. Would be very grateful for any advice.
You can do it like this:
# `groups` maps each group key to the index labels of that group's rows, so
# no per-group apply is needed; the labels are converted to plain arrays to
# match the format asked for in the question.
g_df_idx = {key: labels.to_numpy() for key, labels in g_df.groups.items()}
print(g_df_idx)
# e.g. {2: array([4, 5]), 4: array([6, 7])}

How to run variable of one function to another in python?

I am a beginner. I have two functions:
The 1st creates dataframes and has some print statements.
The 2nd downloads the dataframes to csv in colab.
I want to download all dataframes by the df_name.
code:
def fun1():
    """Build the two sample frames, print their info, and return them."""
    import pandas as pd
    d = {'col1': [1, 2], 'col2': [3, 4]}
    d2 = {'col1': [-5, -6], 'col2': [-7, -8]}
    df = pd.DataFrame(data=d)
    df2 = pd.DataFrame(data=d2)
    print('info', df.info())
    print('info', df2.info())
    return df, df2


def fun2(df):
    """Write the frames to CSV files and download them through Colab.

    NOTE(review): ``df2`` is neither a parameter nor a local here, so the
    ``df2.to_csv`` call depends on a global ``df2`` -- presumably part of
    what the question is asking how to wire up.
    """
    from google.colab import files
    name1 = 'positive.csv'
    name2 = 'negative.csv'
    df.to_csv(name1)
    df2.to_csv(name2)
    files.download(name1)
    files.download(name2)


fun2(df) #looking something like this that download my df, func2 should read my df and df2 from fun1()
I tried:
class tom:
    """Attempt from the question: both steps wrapped as methods of one class."""

    def fun1(self):
        """Build the two sample frames, print their info, store and return them."""
        import pandas as pd
        d = {'col1': [1, 2], 'col2': [3, 4]}
        d2 = {'col1': [-5, -6], 'col2': [-7, -8]}
        df = pd.DataFrame(data=d)
        df2 = pd.DataFrame(data=d2)
        print('info', df.info())
        print('info', df2.info())
        self.df= df
        self.df2 = df2
        return df, df2

    def fun2(self):
        """Rebuild the frames via fun1, write them to CSV and download both."""
        # Call the method on this instance; a bare fun1() would only resolve
        # if a separate module-level function of that name existed.
        df,df2 = self.fun1()
        from google.colab import files
        name1 = 'positive.csv'
        name2 = 'negative.csv'
        df.to_csv(name1)
        df2.to_csv(name2)
        return files.download(name1) ,files.download(name2)


tom().fun2() #it download files but shows print of fun1 as well which I don't want.
I am looking for something like:
tom().fun2(dataframe_name) #it just download the files nothing else
Set permanent variables directly in the class if they are not going to change, and
define functions just for actions.
class s:
    """Namespace holding the two sample frames and their CSV file names.

    The frames are built and written to CSV once, when the class body runs;
    the two functions below only print or download.
    """
    import pandas as pd
    d = {'col1': [1, 2], 'col2': [3, 4]}
    d2 = {'col1': [-5, -6], 'col2': [-7, -8]}
    df = pd.DataFrame(data=d)
    df2 = pd.DataFrame(data=d2)
    name1 = 'positive.csv'
    name2 = 'negative.csv'
    df.to_csv(name1)
    df2.to_csv(name2)

    @classmethod
    def f(cls):
        """Print the info report of both frames."""
        # Names bound in the class body are not visible as bare names inside
        # a function body, so the frames are reached through the class.
        # info() writes its report itself, so it is called directly instead
        # of being passed to print().
        print('info')
        cls.df.info()
        print('info')
        cls.df2.info()

    @staticmethod
    def fun(x):
        """Download the file named ``x`` through Colab and return the result."""
        from google.colab import files
        return files.download(x)
Run:
s.f()  # it will only print the values
s.fun(s.name1)  # it will just download the file
Maybe you can save the data you need in a class variable, or create another function that keeps the value you need from the first function (let's call it A), and then pass A to the second function as an argument.

How to merge Dynamic Dataframes

I'm looking for help adding two dynamically generated dataframes.
Both DataFrames have a column computed from the input of an IntSlider ipywidget.
The third DataFrame should update dynamically whenever either of the above DataFrames changes.
import pandas as pd
from ipywidgets import interact
@interact(x=(0,1000,10))
def df_draw_one(x):
    """Build the first frame with B = A * x and print it."""
    data = {"A":[1,2,3,4,5]}
    df_one = pd.DataFrame(data)
    df_one['B'] = df_one['A']*x
    print(df_one)


@interact(x=(0,1000,10))
def df_draw_two(x):
    """Build the second frame with B = A * x and print it."""
    data = {"A":[6,7,8,9,10]}
    df_two = pd.DataFrame(data)
    df_two['B'] = df_two['A']*x
    print(df_two)


# df_one and df_two only exist inside the functions above, so this line is
# where the NameError described in the question is raised.
df_res = df_one+df_two
I understand that with the current code, df_one and df_two are local and hence result in:
NameError: name 'df_one' is not defined
but I'm at a loss on how to make them accessible.
Any pointers would be appreciated.
You can have your functions return the two dataframes by adding a return statement.
import pandas as pd
from ipywidgets import interact
@interact(x=(0, 1000, 10))
def df_draw_one(x):
    """Build the first frame with B = A * x, print it, and return it."""
    data = {"A": [1, 2, 3, 4, 5]}
    df_one = pd.DataFrame(data)
    df_one['B'] = df_one['A'] * x
    print(df_one)
    return df_one


@interact(x=(0, 1000, 10))
def df_draw_two(x):
    """Build the second frame with B = A * x, print it, and return it."""
    data = {"A": [6, 7, 8, 9, 10]}
    df_two = pd.DataFrame(data)
    df_two['B'] = df_two['A'] * x
    print(df_two)
    return df_two


# The frames are now available to the caller through the return values.
df_one = df_draw_one(1)
df_two = df_draw_two(1)
df_res = df_one + df_two
print(df_res)
Another way is to have df_one and df_two as global variables, but it's dirty and not really necessary.
Update
One idea could be to have both widgets generated in the same function; then everything becomes easier to handle.
import pandas as pd
from ipywidgets import interact
@interact()
def df_draw_one(x=(0, 1000, 10), y=(0, 1000, 10)):
    """Build both frames from ``x`` and ``y`` and display them with their sum.

    NOTE(review): presumably ``interact`` turns the tuple defaults into the
    two slider widgets -- confirm against the ipywidgets documentation.
    """
    data = {"A": [1, 2, 3, 4, 5]}
    df_one = pd.DataFrame(data)
    df_one['B'] = df_one['A'] * x
    data2 = {"A": [6, 7, 8, 9, 10]}
    df_two = pd.DataFrame(data2)
    df_two['B'] = df_two['A'] * y
    display(df_one)
    display(df_two)
    # Both frames live in the same scope here, so they can be added directly.
    df_res = df_one + df_two
    display(df_res)
Here my result:

How to speed up the matching of columns in pandas dataframe [duplicate]

This question already has answers here:
How do I select rows from a DataFrame based on column values?
(16 answers)
Closed 3 years ago.
I'm trying to find matching values in a pandas dataframe. Once a match is found I want to perform some operations on the row of the dataframe.
Currently I'm using this Code:
import pandas as pd
d = {'child_id': [1, 2,5,4], 'parent_id': [3, 4,2,3], 'content': ["a","b","c","d"]}
df = pd.DataFrame(data=d)
# Compare every row's parent_id (outer loop, i) with every row's child_id
# (inner loop, j) and print the content of row i for each match.
for i in range(len(df)):
    for j in range(len(df)):
        if str(df['child_id'][j]) == str(df['parent_id'][i]):
            print(df.content[i])
        else:
            pass
It works fine, but is rather slow. Since I'm dealing with a dataset with millions of rows, it would take months. Is there a faster way to do this?
Edit: To clarify, what I want to create is a dataframe which contains the content of the matches.
import pandas as pd
d = {'child_id': [1,2,5,4],
     'parent_id': [3,4,2,3],
     'content': ["a","b","c","d"]}
df = pd.DataFrame(data=d)
# DataFrame.append is no longer available in current pandas, so the matched
# rows are collected in a list and the result frame is built once at the end.
rows = []
for i in range(len(df)):
    for j in range(len(df)):
        # Row i's parent_id equals row j's child_id: pair up their contents.
        if str(df['child_id'][j]) == str(df['parent_id'][i]):
            content_child = str(df["content"][i])
            content_parent = str(df["content"][j])
            rows.append({"content_child": content_child,
                         "content_parent": content_parent})
        else:
            pass
# Passing the columns explicitly keeps both headers even when nothing matched.
df2 = pd.DataFrame(rows, columns=["content_child", "content_parent"])
print(df2)
The fastest way is to use the features of numpy:
import pandas as pd
import numpy as np

d = {
    'child_id': [1, 2, 5, 4],
    'parent_id': [3, 4, 2, 3],
    'content': ["a", "b", "c", "d"]
}
df = pd.DataFrame(data=d)
content = df['content'].values
# Position-by-position comparisons: same row, child_id reversed, and
# parent_id reversed.
# NOTE(review): these masks only compare values at matching positions; they
# do not test every child_id against every parent_id as the nested loops in
# the question do -- a self-merge on parent_id/child_id would. Confirm intent.
comp1 = df['child_id'].values == df['parent_id'].values
comp2 = df['child_id'].values[::-1] == df['parent_id'].values
comp3 = df['child_id'].values == df['parent_id'].values[::-1]
# As in the original branches, each later mask is only consulted when the
# preceding ones had a hit.
matched = []
if comp1.any():
    matched.append(content[comp1])
    if comp2.any():
        matched.append(content[comp2])
        if comp3.any():
            matched.append(content[comp3])
# With no hit at all the result is an empty array (printed as []) instead of
# an unbound name; the gathered content is printed directly because it holds
# strings and cannot be used as an index.
comp = np.concatenate(matched) if matched else content[:0]
print(comp)
Which outputs:
[]

Categories