I am a beginner. I have two function:
1st creating dataframes and some print statement
2nd is downloading the dataframes to csv in colab.
I want to download all dataframes by the df_name.
code:
def fun1():
import pandas as pd
d = {'col1': [1, 2], 'col2': [3, 4]}
d2 = {'col1': [-5, -6], 'col2': [-7, -8]}
df = pd.DataFrame(data=d)
df2 = pd.DataFrame(data=d2)
print('info', df.info())
print('info', df2.info())
return df, df2
def fun2(df):
from google.colab import files
name1 = 'positive.csv'
name2 = 'negative.csv'
df.to_csv(name1)
df2.to_csv(name2)
files.download(name1)
files.download(name2)
fun2(df) #looking something like this that download my df, func2 should read my df and df2 from fun1()
I tried:
class tom:
def fun1(self):
import pandas as pd
d = {'col1': [1, 2], 'col2': [3, 4]}
d2 = {'col1': [-5, -6], 'col2': [-7, -8]}
df = pd.DataFrame(data=d)
df2 = pd.DataFrame(data=d2)
print('info', df.info())
print('info', df2.info())
self.df= df
self.df2 = df2
return df, df2
def fun2(self):
df,df2 = fun1()
from google.colab import files
name1 = 'positive.csv'
name2 = 'negative.csv'
df.to_csv(name1)
df2.to_csv(name2)
return files.download(name1) ,files.download(name2)
tom().fun2() #it download files but shows print of fun1 as well which I don't want.
looking for something like
tom().fun2(dataframe_name) #it just download the files nothing else
set permanent variables directly in the class if its not gonna change and
define fun just for actions.
class s:
import pandas as pd
d = {'col1': [1, 2], 'col2': [3, 4]}
d2 = {'col1': [-5, -6], 'col2': [-7, -8]}
df = pd.DataFrame(data=d)
df2 = pd.DataFrame(data=d2)
name1 = 'positive.csv'
name2 = 'negative.csv'
df.to_csv(name1)
df2.to_csv(name2)
def f():
print('info', df.info())
print('info', df2.info())
def fun(x):
from google.colab import files
return files.download(x)
run
s.f() --it will print value only
s.fun(s.name1) --it will just download the file
Maybe you can save the data you need in a class variable or create another function, that keeps the data from the first function you need the value (lets call it A) and then pass A to the second function as an argument.
Related
Why does doing df.copy() not give a warning only if done inside the function scope, but errors if used outside of the scope?
Example 1 - Gives Warning:
def do_something(df):
df = df.loc[df['x'].isin([1, 2, 2698])]
df.loc[:, "y"] = 0
return df
df = pd.DataFrame({"x": [1, 2, 4], "y": ["a", "b", "c"]})
do_something(df)
Example 2 - Does not give warning
def do_something(df):
df = df.copy() # <---------------------- This fixes the warning but is not desirable as it creates a copy (which can take a lot of memory)
df = df.loc[df['x'].isin([1, 2, 2698])]
df.loc[:, "y"] = 0
return df
df = pd.DataFrame({"x": [1, 2, 4], "y": ["a", "b", "c"]})
do_something(df)
Solution - change the function to use df.loc[:] = instead of df =
def do_something(df):
# df = df.loc[df['x'].isin([1, 2, 2698])] # <------ Replace this with the line below
df.loc[:] = df.loc[df['x'].isin([1, 2, 2698])]
# ^^^^^^ <--- add this
df.loc[:, "y"] = 0
return df
I have a df as below:
I would like to group by id and flag and create a new column in the df which is the result of: [sum(value1)/sum(value2)] * 12. Therefore I will need the result to be:
I have created a function:
`def calculation (value1, value2):
result = (value1/value2) * 12
return(result)`
Could you advise which is the best way to apply this function along with the grouping, in order to get the desired output?
Many thanks
The following code should work.
import pandas as pd
df = pd.DataFrame({"id" : [1,1,2,2],"flag":["A","B","B","A"],"value1":[520,200,400,410],"value2":[12,5,11,2]})
def calculation(value1, value2):
result = (value1/value2) * 12
return(result)
df.groupby(['id','flag']).apply(lambda x: calculation(x['value1'],x['value2'])).astype(int)
You just have to use the following for groupby and apply.
df.groupby(['id','flag']).apply(lambda x: calculation(x['value1'],x['value2'])).astype(int)
Here's a solution using apply and a lambda function.
import pandas as pd
df = pd.DataFrame([
[1, 'A', 520, 12],
[1, 'B', 200, 5],
[2, 'B', 400, 11],
[2, 'A', 410, 2]],
columns=['id', 'flag', 'value1', 'value2'])
df.groupby(['id', 'flag']).apply(lambda x: (12 * x['value1']) / x['value2'])
If you want to use the function calculation above then just call the apply method like this.
df.groupby(['id', 'flag']).apply(lambda x: calculation(x['value1'], x['value2']))
I'm looking for help to add two dynamically generated dataframes.
Both DataFrames have a column computed on input from an intslider ipywidget.
the third Dataframe should update dynamically on changes of any of above Dataframes
import pandas as pd
from ipywidgets import interact
#interact(x=(0,1000,10))
def df_draw_one(x):
data = {"A":[1,2,3,4,5]}
df_one = pd.DataFrame(data)
df_one['B'] = df_one['A']*x
print(df_one)
#interact(x=(0,1000,10))
def df_draw_two(x):
data = {"A":[6,7,8,9,10]}
df_two = pd.DataFrame(data)
df_two['B'] = df_two['A']*x
print(df_two)
df_res = df_one+df_two
I understand with the current code, df_one and two are local and hence result in:
NameError: name 'df_one' is not defined
but I'm at loss on how to make them accessible.
Any pointer would be appreciated
You can have your functions return the two dataframe adding a return statement.
import pandas as pd
from ipywidgets import interact
#interact(x=(0, 1000, 10))
def df_draw_one(x):
data = {"A": [1, 2, 3, 4, 5]}
df_one = pd.DataFrame(data)
df_one['B'] = df_one['A'] * x
print(df_one)
return df_one
#interact(x=(0, 1000, 10))
def df_draw_two(x):
data = {"A": [6, 7, 8, 9, 10]}
df_two = pd.DataFrame(data)
df_two['B'] = df_two['A'] * x
print(df_two)
return df_two
df_one = df_draw_one(1)
df_two = df_draw_two(1)
df_res = df_one + df_two
print(df_res)
Another way is to have df_one and df_two as global variables, but it's dirty and not really necessary.
Update
One idea could be to have both widget generated in the same function, then everything becomes more easy to handle.
import pandas as pd
from ipywidgets import interact
#interact()
def df_draw_one(x=(0, 1000, 10), y=(0, 1000, 10)):
data = {"A": [1, 2, 3, 4, 5]}
df_one = pd.DataFrame(data)
df_one['B'] = df_one['A'] * x
data2 = {"A": [6, 7, 8, 9, 10]}
df_two = pd.DataFrame(data2)
df_two['B'] = df_two['A'] * y
display(df_one)
display(df_two)
df_res = df_one + df_two
display(df_res)
Here my result:
This question already has answers here:
How do I select rows from a DataFrame based on column values?
(16 answers)
Closed 3 years ago.
I'm trying to find matching values in a pandas dataframe. Once a match is found I want to perform some operations on the row of the dataframe.
Currently I'm using this Code:
import pandas as pd
d = {'child_id': [1, 2,5,4], 'parent_id': [3, 4,2,3], 'content': ["a","b","c","d"]}
df = pd.DataFrame(data=d)
for i in range(len(df)):
for j in range(len(df)):
if str(df['child_id'][j]) == str(df['parent_id'][i]):
print(df.content[i])
else:
pass
It works fine, but is rather slow. Since I'm dealing with a dataset with millions of rows, I would take months. Is there a faster way to do this?
Edit: To clarify what, I want to create is a dataframe, which contains the Content of Matches.
import pandas as pd
d = {'child_id': [1,2,5,4],
'parent_id': [3,4,2,3],
'content': ["a","b","c","d"]}
df = pd.DataFrame(data=d)
df2 = pd.DataFrame(columns = ("content_child", "content_parent"))
for i in range(len(df)):
for j in range(len(df)):
if str(df['child_id'][j]) == str(df['parent_id'][i]):
content_child = str(df["content"][i])
content_parent = str(df["content"][j])
s = pd.Series([content_child, content_parent], index=['content_child', 'content_parent'])
df2 = df2.append(s, ignore_index=True)
else:
pass
print(df2)
The fastest way is to use the features of numpy:
import pandas as pd
d = {
'child_id': [1, 2, 5, 4],
'parent_id': [3, 4, 2, 3],
'content': ["a", "b", "c", "d"]
}
df = pd.DataFrame(data=d)
comp1 = df['child_id'].values == df['parent_id'].values
comp2 = df['child_id'].values[::-1] == df['parent_id'].values
comp3 = df['child_id'].values == df['parent_id'].values[::-1]
if comp1.any() and not comp2.any() and not comp3.any():
comp = np.c_[ df['content'].values[comp1] ]
elif comp1.any() and comp2.any() and not comp3.any():
comp = np.c_[ df['content'].values[comp1], df['content'].values[comp2] ]
elif comp1.any() and comp2.any() and comp3.any():
comp = np.c_[ df['content'].values[comp1], df['content'].values[comp2], df['content'].values[comp3] ]
print( df['content'].values[comp] )
Which outputs:
[]
I have the following pandas Dataframe:
dict1 = {'file': ['filename2', 'filename2', 'filename3', 'filename4', 'filename4', 'filename3'], 'amount': [3, 4, 5, 1, 2, 1], 'front':[21889611, 36357723, 196312, 11, 42, 1992], 'back':[21973805, 36403870, 277500, 19, 120, 3210]}
df1 = pd.DataFrame(dict1)
print(df1)
file amount front back
0 filename2 3 21889611 21973805
1 filename2 4 36357723 36403870
2 filename3 5 196312 277500
3 filename4 1 11 19
4 filename4 2 42 120
5 filename3 1 1992 3210
My task is to take N random draws between front and back, whereby N is equal to the value in amount. Parse this into a dictionary.
To do this on an row-by-row basis is easy for me to understand:
e.g. row 1
import numpy as np
random_draws = np.random.choice(np.arange(21889611, 21973805+1), 3)
e.g. row 2
random_draws = np.random.choice(np.arange(36357723, 36403870+1), 4)
Normally with pandas, users could define this as a function and use something like
def func(front, back, amount):
return np.random.choice(np.arange(front, back+1), amount)
df["new_column"].apply(func)
but the result of my function is an array of varying size.
My second problem is that I would like the output to be a dictionary, of the format
{file: [random_draw_results], file: [random_draw_results], file: [random_draw_results], ...}
For the above example df1, the function should output this dictionary (given the draws):
final_dict = {"filename2": [21927457, 21966814, 21898538, 36392840, 36375560, 36384078, 36366833],
"filename3": 212143, 239725, 240959, 197359, 276948, 3199],
"filename4": [100, 83, 15]}
We can pass axis=1 to operate over rows when using apply.
We then need to tell what columns to use and we return a list.
We then either perform some form of groupby or we could use defaultdict as shown below:
dict1 = {'file': ['filename2', 'filename2', 'filename3', 'filename4', 'filename4', 'filename3'], 'amount': [3, 4, 5, 1, 2, 1], 'front':[21889611, 36357723, 196312, 11, 42, 1992], 'back':[21973805, 36403870, 277500, 19, 120, 3210]}
import numpy as np
import pandas as pd
def func(x):
return np.random.choice(np.arange(x.front, x.back+1), x.amount).tolist()
df1 = pd.DataFrame(dict1)
df1["new_column"] = df1.apply(func, axis=1)
df1.groupby('file')['new_column'].apply(sum).to_dict()
Returns:
{'filename2': [21891765,
21904680,
21914414,
36398355,
36358161,
36387670,
36369443],
'filename3': [240766, 217580, 217581, 274396, 241413, 2488],
'filename4': [18, 96, 107]}
Alt2 would be to use (and by some small timings I ran it looks like it runs as fast):
from collections import defaultdict
d = defaultdict(list)
for k,v in df1.set_index('file')['new_column'].items():
d[k].extend(v)