I have the following two tables:
>>> df1 = pd.DataFrame(data={'1': ['john', '10', 'john'],
... '2': ['mike', '30', 'ana'],
... '3': ['ana', '20', 'mike'],
... '4': ['eve', 'eve', 'eve'],
... '5': ['10', np.NaN, '10'],
... '6': [np.NaN, np.NaN, '20']},
... index=pd.Series(['ind1', 'ind2', 'ind3'], name='index'))
>>> df1
1 2 3 4 5 6
index
ind1 john mike ana eve 10 NaN
ind2 10 30 20 eve NaN NaN
ind3 john ana mike eve 10 20
>>> df2 = pd.DataFrame(data={'first_n': [4, 4, 3]},
...                    index=pd.Series(['ind1', 'ind2', 'ind3'], name='index'))
>>> df2
first_n
index
ind1 4
ind2 4
ind3 3
I also have the following function that reverses a list and gets the first n non-NA elements:
def get_rev_first_n(row, top_n):
    rev_row = [x for x in row[::-1] if x == x]
    return rev_row[:top_n]
>>> get_rev_first_n(['john', 'mike', 'ana', 'eve', '10', np.NaN], 4)
['10', 'eve', 'ana', 'mike']
How would I apply this function to the two tables so that it takes in both df1 and df2 and outputs either a list or columns?
df = pd.concat([df1, df2], axis=1)
df.drop(columns='first_n').apply(get_rev_first_n, axis=1, args=[4])  # args is forwarded as top_n
axis=1 applies the function to each row; the default, axis=0, would run it on each column instead.
args=[4] is passed as the second argument (top_n) of get_rev_first_n; first_n is dropped first so the function only sees the name columns.
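As a quick sanity check (a sketch reusing the question's df1 and get_rev_first_n), a direct call and the apply call agree for any one row:
row = df1.loc['ind1']
print(get_rev_first_n(row, 4))                                   # ['10', 'eve', 'ana', 'mike']
print(df1.apply(get_rev_first_n, axis=1, args=[4]).loc['ind1'])  # same list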
You can try apply with a lambda on each row of the data frame: I concatenated the two DataFrames using concat and applied your method to each row of the resulting frame, passing that row's first_n value as top_n.
Full Code:
import pandas as pd
import numpy as np
def get_rev_first_n(row, top_n):
    rev_row = [x for x in row[::-1] if x == x]
    return rev_row[:top_n]
df1 = pd.DataFrame(data={'1': ['john', '10', 'john'],
                         '2': ['mike', '30', 'ana'],
                         '3': ['ana', '20', 'mike'],
                         '4': ['eve', 'eve', 'eve'],
                         '5': ['10', np.NaN, '10'],
                         '6': [np.NaN, np.NaN, '20']},
                   index=pd.Series(['ind1', 'ind2', 'ind3'], name='index'))
df2 = pd.DataFrame(data={'first_n': [4, 4, 3]},
                   index=pd.Series(['ind1', 'ind2', 'ind3'], name='index'))
df3 = pd.concat([df1, df2.reindex(df1.index)], axis=1)
df = df3.apply(lambda row: get_rev_first_n(row.drop('first_n'), row['first_n']), axis=1)
print(df)
Output:
index
ind1    [10, eve, ana, mike]
ind2       [eve, 20, 30, 10]
ind3           [20, 10, eve]
dtype: object
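The question also asks for columns as an alternative to lists; a sketch for that variant is to expand the resulting Series of lists (ragged rows are padded with None):
cols = pd.DataFrame(df.tolist(), index=df.index)
print(cols)  # one column per position: 0, 1, 2, 3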
I have a dataframe like this:
df = pd.DataFrame({
'ID': ['1', '4', '4', '3', '3', '3'],
'club': ['arts', 'math', 'theatre', 'poetry', 'dance', 'cricket']
})
and I have a dictionary named tag_dict:
{'1': {'Granted'},
'3': {'Granted'}}
The keys of the dictionary match some IDs in the ID column of the dataframe.
Now, I want to create a new column "Tag" in the dataframe such that
if a value in the ID column matches a key of the dictionary, we place that key's dictionary value in the field, and otherwise place '-'.
The output should look like this:
df = pd.DataFrame({
'ID': ['1', '4', '4', '3', '3', '3'],
'club': ['arts', 'math', 'theatre', 'poetry', 'dance', 'cricket'],
'tag':['Granted','-','-','Granted','Granted','Granted']
})
import pandas as pd
df = pd.DataFrame({
'ID': ['1', '4', '4', '3', '3', '3'],
'club': ['arts', 'math', 'theatre', 'poetry', 'dance', 'cricket']})
# I've removed the {} around your items. Feel free to add more key:value pairs
my_dict = {'1': 'Granted', '3': 'Granted'}
# use .map() to match your keys to their values
df['Tag'] = df['ID'].map(my_dict)
# if required, fill in NaN values with '-'
nan_rows = df['Tag'].isna()
df.loc[nan_rows, 'Tag'] = '-'
df
End result:
  ID     club      Tag
0  1     arts  Granted
1  4     math        -
2  4  theatre        -
3  3   poetry  Granted
4  3    dance  Granted
5  3  cricket  Granted
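As a side note, the two NaN-handling lines can be folded into the mapping itself:
df['Tag'] = df['ID'].map(my_dict).fillna('-')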
I'm not sure what the purpose of the curly brackets around Granted is, but you could use apply (here with the values unwrapped from their sets):
df = pd.DataFrame({
'ID': ['1', '4', '4', '3', '3', '3'],
'club': ['arts', 'math', 'theatre', 'poetry', 'dance', 'cricket']
})
tag_dict = {'1': 'Granted',
'3': 'Granted'}
df['tag'] = df['ID'].apply(lambda x: tag_dict.get(x, '-'))
print(df)
Output:
ID club tag
0 1 arts Granted
1 4 math -
2 4 theatre -
3 3 poetry Granted
4 3 dance Granted
5 3 cricket Granted
Solution with .map, keeping the set values from the original tag_dict ([*x][0] extracts the single element of each set):
df["tag"] = df["ID"].map(tag_dict).apply(lambda x: "-" if pd.isna(x) else [*x][0])
print(df)
Prints:
ID club tag
0 1 arts Granted
1 4 math -
2 4 theatre -
3 3 poetry Granted
4 3 dance Granted
5 3 cricket Granted
I have a dataframe:
values
NaN
NaN
[1,2,5]
[2]
[5]
And a dictionary
{nan: nan,
'1': '10',
'2': '11',
'5': '12',}
The dataframe contains keys from the dictionary.
How can I replace these keys with the corresponding values from the same dictionary?
Output:
values
NaN
NaN
[10,11,12]
[11]
[12]
I have tried
so_df['values'].replace(my_dictionary, inplace=True)
so_df.head()
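(Series.replace compares whole cell values, so the string keys never match the list cells and the frame is left unchanged.)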
You can use the apply() method of a pandas DataFrame. Check the implementation below:
import pandas as pd
import numpy as np
# build the column from a dict so the lists stay as single cell values
df = pd.DataFrame({'values': [np.nan,
                              np.nan,
                              ['1', '2', '5'],
                              ['2'],
                              ['5']]})
my_dict = {np.nan: np.nan,
           '1': '10',
           '2': '11',
           '5': '12'}
def update(row):
    if isinstance(row['values'], list):
        row['values'] = [my_dict.get(val) for val in row['values']]
    else:
        row['values'] = my_dict.get(row['values'])
    return row
df = df.apply(lambda row: update(row), axis=1)
Simple implementation. Just make sure that if your dataframe contains strings, your dictionary keys are strings as well.
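A tiny illustration of that caveat, with a hypothetical int-keyed dict:
mismatched = {1: '10', 2: '11', 5: '12'}
print(mismatched.get('1'))  # None: the string '1' does not match the int key 1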
Try:
df['values'] = pd.to_numeric(df['values'].explode().astype(str).map(my_dict), errors='coerce').groupby(level=0).agg(list)
Setup
import numpy as np
df = pd.DataFrame({'values': [np.nan, np.nan, [1, 2, 5], [2], [5]]})
my_dict = {np.nan: np.nan, '1': '10', '2': '11', '5': '12'}
Use Series.explode with Series.map
df['values'] = (df['values'].explode()
                            .astype(str)
                            .map(my_dict)
                            .dropna()
                            .astype(int)
                            .groupby(level=0)
                            .agg(list))
If there are other strings in your values column, you need pd.to_numeric with errors='coerce'; to keep the original values you should do:
df['values'] = (pd.to_numeric(df['values'].explode()
                                          .astype(str)
                                          .replace(my_dict),
                              errors='coerce')
                .dropna()
                .groupby(level=0)
                .agg(list)
                .fillna(df['values']))
Output
values
0 NaN
1 NaN
2 [10, 11, 12]
3 [11]
4 [12]
UPDATE
Solution without explode:
df['values'] = (pd.to_numeric(df['values'].apply(pd.Series)
                                          .stack()
                                          .reset_index(level=1, drop=True)
                                          .astype(str)
                                          .replace(my_dict),
                              errors='coerce')
                .dropna()
                .groupby(level=0)
                .agg(list)
                .fillna(df['values']))
I have the following pandas DataFrame with mixed data types: string and integer values. I want to sort the values of this DataFrame in descending order using multiple columns: Price and Name. The string values (i.e. Name) should be sorted in alphabetical order, or can actually be ignored altogether, because the most important ones are the numerical values.
The problem is that the list of target columns can contain both string and integer columns, e.g. target_columns = ["Price","Name"]
d = {'1': ['25', 'AAA', 2], '2': ['30', 'BBB', 3], '3': ['5', 'CCC', 2], \
'4': ['300', 'DDD', 2], '5': ['30', 'DDD', 3], '6': ['100', 'AAA', 3]}
columns=['Price', 'Name', 'Class']
target_columns = ['Price', 'Name']
order_per_cols = [False] * len(target_columns)
df = pd.DataFrame.from_dict(data=d, orient='index')
df.columns = columns
df.sort_values(list(target_columns), ascending=order_per_cols, inplace=True)
Currently, this code fails with the following message:
TypeError: '<' not supported between instances of 'str' and 'int'
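The error boils down to Python refusing to order a string against an integer, e.g.:
>>> '300' < 3
TypeError: '<' not supported between instances of 'str' and 'int'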
The expected output:
Price Name Class
300 DDD 2
100 AAA 3
30 DDD 3
30 BBB 3
25 AAA 2
5 CCC 2
If I understand you correctly, you want a generic way that excludes the object columns from your selection.
We can use DataFrame.select_dtypes for this, then sort on the numeric columns:
# Price holds strings, so convert it to numeric first
df['Price'] = pd.to_numeric(df['Price'])
numeric = df[target_columns].select_dtypes('number').columns.tolist()
df = df.sort_values(numeric, ascending=[False]*len(numeric))
Price Name Class
4 300 DDD 2
6 100 AAA 3
2 30 BBB 3
5 30 DDD 3
1 25 AAA 2
3 5 CCC 2
One more solution could be -
Using 'by' parameter in sort_values function
d = ({'1': ['25', 'AAA', 2], '2': ['30', 'BBB', 3], '3': ['5', 'CCC', 2], \
'4': ['300', 'DDD', 2], '5': ['30', 'DDD', 3], '6': ['100', 'AAA', 3]})
df = pd.DataFrame.from_dict(data=d,columns=['Price','Name','Class'],orient='index')
df['Price'] = pd.to_numeric(df['Price'])
df.sort_values(by=['Price', 'Name'], ascending=False)
I'm still relatively new to Pandas and I can't tell which of the functions I'm best off using to get to my answer. I have looked at pivot, pivot_table, groupby and aggregate, but I can't seem to get them to do what I require. Quite possibly user error, for which I apologise!
I have data like this:
Code to create df:
import pandas as pd
df = pd.DataFrame([
['1', '1', 'A', 3, 7],
['1', '1', 'B', 2, 9],
['1', '1', 'C', 2, 9],
['1', '2', 'A', 4, 10],
['1', '2', 'B', 4, 0],
['1', '2', 'C', 9, 8],
['2', '1', 'A', 3, 8],
['2', '1', 'B', 10, 4],
['2', '1', 'C', 0, 1],
['2', '2', 'A', 1, 6],
['2', '2', 'B', 10, 2],
['2', '2', 'C', 10, 3]
], columns = ['Field1', 'Field2', 'Type', 'Price1', 'Price2'])
print(df)
I am trying to get one row per Field1/Field2 pair, with the Type values spread into columns (see the first answer's output below for the shape).
Although my end goal will be to end up with one column for A, one for B and one for C, as A will use Price1 and B & C will use Price2.
I don't necessarily want to get the max, min, average or sum of the Price, as theoretically (although unlikely) there could be two different Price1's for the same Fields & Type.
What's the best function to use in Pandas to get to what I need?
Use DataFrame.set_index with DataFrame.unstack to reshape. The output has a MultiIndex in the columns, so sort its second level with DataFrame.sort_index, flatten the column names, and finally turn the Field levels back into columns:
df1 = (df.set_index(['Field1','Field2', 'Type'])
.unstack(fill_value=0)
.sort_index(axis=1, level=1))
df1.columns = [f'{b}-{a}' for a, b in df1.columns]
df1 = df1.reset_index()
print (df1)
Field1 Field2 A-Price1 A-Price2 B-Price1 B-Price2 C-Price1 C-Price2
0 1 1 3 7 2 9 2 9
1 1 2 4 10 4 0 9 8
2 2 1 3 8 10 4 0 1
3 2 2 1 6 10 2 10 3
A solution with DataFrame.pivot_table is also possible, but it aggregates values when the first three columns contain duplicates, with mean as the default function:
df2 = (df.pivot_table(index=['Field1','Field2'],
columns='Type',
values=['Price1', 'Price2'],
aggfunc='mean')
.sort_index(axis=1, level=1))
df2.columns = [f'{b}-{a}' for a, b in df2.columns]
df2 = df2.reset_index()
print (df2)
Use pivot_table:
pd.pivot_table(df, values=['Price1', 'Price2'], index=['Field1', 'Field2'], columns='Type').reset_index()
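Note that this leaves a MultiIndex on the columns (Price1/Price2 over A/B/C); flatten it as in the first answer if you need plain column names.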
I need to convert numeric values of a column (pandas data frame) to float, but they are in string format.
d = {'col1': ['1', '2.1', '3.1'],
'col2': ['yes', '4', '6'],
'col3': ['1', '4', 'not']}
Expected:
{'col1': [1, 2.1, 3.1],
'col2': ['yes', 4, 6],
'col3': [1, 4, 'not']}
It is possible, but not recommended, because with mixed values in a column some functions may fail:
d = {'col1': ['1', '2.1', '3.1'],
'col2': ['yes', '4', '6'],
'col3': ['1', '4', 'not']}
df = pd.DataFrame(d)
def func(x):
    try:
        return float(x)
    except Exception:
        return x
df = df.applymap(func)
print (df)
col1 col2 col3
0 1.0 yes 1
1 2.1 4 4
2 3.1 6 not
print (df.to_dict('l'))
{'col1': [1.0, 2.1, 3.1], 'col2': ['yes', 4.0, 6.0], 'col3': [1.0, 4.0, 'not']}
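To see why it's not recommended, a minimal sketch: after the conversion col2 mixes the string 'yes' with the floats 4.0 and 6.0, so numeric operations on that column raise, e.g.:
df['col2'].sum()  # raises TypeError, since 'yes' + 4.0 is undefined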
Another solution:
df = df.apply(lambda x: pd.to_numeric(x, errors='coerce')).fillna(df)
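This should give the same frame as the applymap approach above: pd.to_numeric coerces each column, turning unconvertible entries into NaN, and fillna(df) then restores the original strings in those positions.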