Appending a dictionary to a dataframe as a new column - python

I'm very new to Python and was hoping to get some help. I am following an online example where the author creates a dictionary, adds some data to it and then appends this to his original dataframe.
When I follow the code, the data in the dictionary doesn't get appended to the dataframe, and as such I can't continue with the example.
The author's code is as follows:
from collections import defaultdict
won_last = defaultdict(int)
for index, row in data.iterrows():
    home_team = row['HomeTeam']
    visitor_team = row['AwayTeam']
    row['HomeLastWin'] = won_last[home_team]
    row['VisitorLastWin'] = won_last[visitor_team]
    results.ix[index] = row
    won_last[home_team] = row['HomeWin']
    won_last[visitor_team] = not row['HomeWin']
When I run this code I get the following error message (note that the name of the dataframe is different, but apart from that nothing has changed):
AttributeError Traceback (most recent call last)
<ipython-input-46-d31706a5f745> in <module>
4 row['HomeLastWin'] = won_last[home_team]
5 row['VisitorLastWin'] = won_last[visitor_team]
----> 6 data.ix[index]=row
7 won_last[home_team] = row['HomeWin']
8 won_last[visitor_team] = not row['HomeWin']
~\anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
5137 if self._info_axis._can_hold_identifiers_and_holds_name(name):
5138 return self[name]
-> 5139 return object.__getattribute__(self, name)
5140
5141 def __setattr__(self, name: str, value) -> None:
AttributeError: 'DataFrame' object has no attribute 'ix'
If I change the line data.ix[index]=row to data.loc[index]=row, the code runs OK but nothing happens to my dataframe.
Below is an example of the dataset I am working with
Div Date Time HomeTeam AwayTeam FTHG FTAG FTR HomeWin
E0 12/09/2020 12:30 Fulham Arsenal 0 3 A FALSE
E0 12/09/2020 15:00 Crystal Palace Southampton 1 0 H FALSE
E0 12/09/2020 17:30 Liverpool Leeds 4 3 H TRUE
E0 12/09/2020 20:00 West Ham Newcastle 0 2 A TRUE
E0 13/09/2020 14:00 West Brom Leicester 0 3 A FALSE
and below is the dataset of the example I am working through with the columns added
Date Visitor Team VisitorPts Home Team HomePts HomeWin HomeLastWin VisitorLastWin
20 01/11/2013 Milwaukee 105 Boston 98 FALSE FALSE FALSE
21 01/11/2013 Miami Heat 100 Brooklyn 101 TRUE FALSE FALSE
22 01/11/2013 Clevland 84 Charlotte 90 TRUE FALSE TRUE
23 01/11/2013 Portland 113 Denver 98 FALSE FALSE FALSE
24 01/11/2013 Dallas 91 Houston 113 TRUE TRUE TRUE
Thanks
Jon

Could you please try this?
Data used as dataset_stack.csv:
from collections import defaultdict
# Load the Pandas library with alias 'pd'
import pandas as pd

won_last = defaultdict(int)

# Read data from file 'dataset_stack.csv'
# (in the same directory that your python process is based)
# Control delimiters, rows, column names with read_csv (see later)
data = pd.read_csv("dataset_stack.csv")
results = pd.DataFrame(data=data)
#print(results)
# Preview the first 5 lines of the loaded data
#data.head()

for index, row in data.iterrows():
    home_team = row['HomeTeam']
    visitor_team = row['VisitorTeam']
    #results.ix[index]=row
    #results.loc[index]=row
    # Write the new values directly into the results dataframe instead of
    # adding them to row & appending: iterrows() hands you a copy, so
    # assigning to `row` never reaches the dataframe.
    results.loc[index, 'HomeLastWin'] = won_last[home_team]
    results.loc[index, 'VisitorLastWin'] = won_last[visitor_team]
    won_last[home_team] = row['HomeWin']
    won_last[visitor_team] = not row['HomeWin']
print(results)
Output:
Date VisitorTeam VisitorPts HomeTeam HomePts HomeWin \
0 1/11/2013 Milwaukee 105 Boston 98 False
1 1/11/2013 Miami Heat 100 Brooklyn 101 True
2 1/11/2013 Clevland 84 Charlotte 90 True
3 1/11/2013 Portland 113 Denver 98 False
4 1/11/2013 Dallas 91 Houston 113 True
HomeLastWin VisitorLastWin
0 0 0
1 0 0
2 0 0
3 0 0
4 0 0
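A note on why the original loop silently did nothing: iterrows() yields a copy of each row, so assigning into row never reaches the dataframe, while writing through results.loc[index, ...] does. Below is a minimal self-contained sketch of that pattern on a made-up three-game dataset (the team names and results are hypothetical, not from the real CSV):
from collections import defaultdict
import pandas as pd

# Hypothetical mini-dataset standing in for dataset_stack.csv
results = pd.DataFrame({
    'HomeTeam':    ['Boston', 'Brooklyn', 'Boston'],
    'VisitorTeam': ['Milwaukee', 'Miami Heat', 'Brooklyn'],
    'HomeWin':     [False, True, True],
})

won_last = defaultdict(int)
for index, row in results.iterrows():
    # Write through .loc so the assignment reaches the dataframe;
    # mutating `row` itself would only change a throwaway copy.
    results.loc[index, 'HomeLastWin'] = won_last[row['HomeTeam']]
    results.loc[index, 'VisitorLastWin'] = won_last[row['VisitorTeam']]
    # Store 0/1 so the new columns stay numeric
    won_last[row['HomeTeam']] = int(row['HomeWin'])
    won_last[row['VisitorTeam']] = int(not row['HomeWin'])
print(results)
# Game 3 (Boston vs Brooklyn): HomeLastWin is 0 because Boston lost game 1,
# and VisitorLastWin is 1 because Brooklyn won game 2.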

Related

Adding Spotify Data from zenodo in a DataFrame

I want to add all the data from charts.zip from https://doi.org/10.5281/zenodo.4778562 in a single DataFrame. The data consists of one file per year, each containing multiple CSVs. I made the following code:
import glob
import re
from datetime import datetime
import pandas as pd

header = 0
dfs = []
for file in glob.glob('Charts/*/201?/*.csv'):
    region = file.split('/')[1]
    dates = re.findall(r'\d{4}-\d{2}-\d{2}', file.split('/')[-1])
    weekly_chart = pd.read_csv(file, header=header, sep='\t')
    weekly_chart['week_start'] = datetime.strptime(dates[0], '%Y-%m-%d')
    weekly_chart['week_end'] = datetime.strptime(dates[1], '%Y-%m-%d')
    weekly_chart['region'] = region
    dfs.append(weekly_chart)
all_charts = pd.concat(dfs)
But, when I run it, python returns:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_12886/3473678833.py in <module>
9 weekly_chart['region'] = region
10 dfs.append(weekly_chart)
---> 11 all_charts = pd.concat(dfs)
~/Downloads/enter/lib/python3.9/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~/Downloads/enter/lib/python3.9/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
344 ValueError: Indexes have overlapping values: ['a']
345 """
--> 346 op = _Concatenator(
347 objs,
348 axis=axis,
~/Downloads/enter/lib/python3.9/site-packages/pandas/core/reshape/concat.py in __init__(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)
401
402 if len(objs) == 0:
--> 403 raise ValueError("No objects to concatenate")
404
405 if keys is None:
ValueError: No objects to concatenate
How can I fix it?
I think the glob.glob might just be overcomplicating things: the ValueError means the pattern matched no files at all, so dfs stayed empty. The following works perfectly for me.
# Gives you a list of EVERY file in the Charts directory
# and subdirectories that is a CSV
import os
import re
import pandas as pd

file_list = []
for path, subdirs, files in os.walk("Charts"):
    file_list.extend([os.path.join(path, x) for x in files if x.endswith('.csv')])

dfs = []
for file in file_list:
    region = file.split('/')[1]
    dates = re.findall(r'\d{4}-\d{2}-\d{2}', file.split('/')[-1])
    df = pd.read_csv(file, sep='\t')
    df['week_start'] = dates[0]
    df['week_end'] = dates[1]
    df['region'] = region
    dfs.append(df)
all_charts = pd.concat(dfs, ignore_index=True)
print(all_charts)
Output:
position song_id song_name artist streams ... peak_position position_status week_start week_end region
0 1 7wGoVu4Dady5GV0Sv4UIsx rockstar Post Malone 17532665 ... 1 0 2017-10-20 2017-10-27 us
1 2 75ZvA4QfFiZvzhj2xkaWAh I Fall Apart Post Malone 8350785 ... 2 0 2017-10-20 2017-10-27 us
2 3 2fQrGHiQOvpL9UgPvtYy6G Bank Account 21 Savage 7589124 ... 3 1 2017-10-20 2017-10-27 us
3 4 43ZyHQITOjhciSUUNPVRHc Gucci Gang Lil Pump 7584237 ... 4 1 2017-10-20 2017-10-27 us
4 5 5tz69p7tJuGPeMGwNTxYuV 1-800-273-8255 Logic 7527770 ... 1 -2 2017-10-20 2017-10-27 us
... ... ... ... ... ... ... ... ... ... ... ...
273595 196 6kex4EBAj0WHXDKZMEJaaF Swalla (feat. Nicki Minaj & Ty Dolla $ign) Jason Derulo 3747830 ... 8 -5 2018-03-02 2018-03-09 global
273596 197 0CokSRCu5hZgPxcZBaEzVE Glorious (feat. Skylar Grey) Macklemore 3725286 ... 14 -8 2018-03-02 2018-03-09 global
273597 198 7oK9VyNzrYvRFo7nQEYkWN Mr. Brightside The Killers 3717326 ... 148 -3 2018-03-02 2018-03-09 global
273598 199 7EUfNvyCVxQV3oN5ScA2Lb Next To Me Imagine Dragons 3681739 ... 122 -77 2018-03-02 2018-03-09 global
273599 200 6u0EAxf1OJTLS7CvInuNd7 Vai malandra (feat. Tropkillaz & DJ Yuri Martins) Anitta 3676542 ... 30 -23 2018-03-02 2018-03-09 global
If you really want the dates to be dates, you can run this on those two columns at the end.
all_charts['week_start'] = pd.to_datetime(all_charts['week_start'])
all_charts['week_end'] = pd.to_datetime(all_charts['week_end'])
Personally, I'd also do the following:
all_charts['week_start'] = pd.to_datetime(all_charts['week_start'])
all_charts['week_end'] = pd.to_datetime(all_charts['week_end'])
all_charts['region'] = all_charts['region'].astype('category')
all_charts['artist'] = all_charts['artist'].astype('category')
all_charts['song_name'] = all_charts['song_name'].astype('category')
all_charts['song_id'] = all_charts['song_id'].astype('category')
all_charts.set_index(['region', 'week_start', 'week_end', 'position'], inplace=True)
all_charts.position_status = pd.to_numeric(all_charts.position_status, errors='coerce')
print(all_charts.head(10))
Giving:
song_id song_name artist streams last_week_position weeks_on_chart peak_position position_status
region week_start week_end position
us 2017-10-20 2017-10-27 1 7wGoVu4Dady5GV0Sv4UIsx rockstar Post Malone 17532665 1.0 3 1 0.0
2 75ZvA4QfFiZvzhj2xkaWAh I Fall Apart Post Malone 8350785 2.0 6 2 0.0
3 2fQrGHiQOvpL9UgPvtYy6G Bank Account 21 Savage 7589124 4.0 5 3 1.0
4 43ZyHQITOjhciSUUNPVRHc Gucci Gang Lil Pump 7584237 5.0 3 4 1.0
5 5tz69p7tJuGPeMGwNTxYuV 1-800-273-8255 Logic 7527770 3.0 26 1 -2.0
6 5Gd19NupVe5X8bAqxf9Iaz Gorgeous Taylor Swift 6940802 NaN 1 6 NaN
7 0ofbQMrRDsUaVKq2mGLEAb Havana Camila Cabello 6623184 10.0 12 7 3.0
8 2771LMNxwf62FTAdpJMQfM Bodak Yellow Cardi B 6472727 6.0 14 3 -2.0
9 5Z3GHaZ6ec9bsiI5BenrbY Young Dumb & Broke Khalid 5982108 9.0 29 6 0.0
10 7GX5flRQZVHRAGd6B4TmDO XO Tour Llif3 Lil Uzi Vert 5822583 8.0 9 2 -2.0
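A side note on the file-discovery design choice: pathlib can do the same recursive walk in one line, and it is indifferent to the exact nesting depth that tripped up the original glob pattern. A hedged sketch, assuming the same Charts/ layout:
from pathlib import Path

# Recursively collect every CSV under Charts/, whatever the nesting depth.
# Note: the code above splits paths on '/', so on Windows prefer p.parts.
file_list = [str(p) for p in Path("Charts").rglob("*.csv")]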

Replace blank value in dataframe based on another column condition

I have many blanks in a merged data set and I want to fill them with a condition.
My current code looks like this:
import pandas as pd
import csv
import numpy as np
pd.set_option('display.max_columns', 500)
# Read all files into pandas dataframes
Jan = pd.read_csv(r'C:\~\Documents\Jan.csv')
Feb = pd.read_csv(r'C:\~\Documents\Feb.csv')
Mar = pd.read_csv(r'C:\~\Documents\Mar.csv')
Jan=pd.DataFrame({'Department':['52','5','56','70','7'],'Item':['2515','254','818','','']})
Feb=pd.DataFrame({'Department':['52','56','765','7','40'],'Item':['2515','818','524','','']})
Mar=pd.DataFrame({'Department':['7','70','5','8','52'],'Item':['45','','818','','']})
all_df_list = [Jan, Feb, Mar]
appended_df = pd.concat(all_df_list)
df = appended_df
df.to_csv(r"C:\~\Documents\SallesDS.csv", index=False)
Data set:
df
Department Item
52 2515
5 254
56 818
70
7 50
52 2515
56 818
765 524
7
40
7 45
70
5 818
8
52
What I want is to fill the empty cells in Item with the corresponding value for that Department.
So if Department is 52 and Item is empty, it should be filled with 2515;
if Department is 7 and Item is empty, fill it with 45,
and the result should look like this
df
Department Item
52 2515
5 254
56 818
70
7 50
52 2515
56 818
765 524
7 45
40
7 45
70
5 818
8
52 2515
I tried the following methods but none of them worked.
1
df.loc[(df['Item'].isna()) & (df['Department'].str.contains(52)), 'Item'] = 2515
df.loc[(df['Item'].isna()) & (df['Department'].str.contains(7)), 'Item'] = 45
2
df["Item"] = df["Item"].fillna(df["Department"])
df = df.replace({"Item":{"52":"2515", "7":"45"}})
both either return an error or do not work.
Answer:
Hi, I have used the below code and it worked:
b = [52]
df.Item=np.where(df.Department.isin(b),df.Item.fillna(2515),df.Item)
a = [7]
df.Item=np.where(df.Department.isin(a),df.Item.fillna(45),df.Item)
Hope it helps someone who faces the same issue.
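A hedged caveat on the approach above: fillna only fills real NaN values. In the sample frames built earlier the blanks are empty strings, so a conversion step like this (my addition, not part of the original answer) is needed first:
import numpy as np

# Make the empty strings visible to fillna
df['Item'] = df['Item'].replace('', np.nan)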
The following solution first creates a map of each department and its maximum corresponding item (assuming there is one), and then matches that item to any department with a blank item. Note that in your data frame, the empty items are an empty string ("") and not NaN.
Create a map:
values = df.groupby('Department').max()
values['Item'] = values['Item'].apply(lambda x: np.nan if x == "" else x)
values = values.dropna().reset_index()
Department Item
0 5 818
1 52 2515
2 56 818
3 7 45
4 765 524
Then use df.apply():
df['Item'] = df.apply(lambda x: values[values['Department'] == x['Department']]['Item'].values if x['Item'] == "" else x['Item'], axis=1)
In this case, the new values will have brackets around them. They can be removed with str.replace():
df['Item'] = df['Item'].astype(str).str.replace(r'\[|\'|\'|\]', "", regex=True)
The result:
Department Item
0 52 2515
1 5 254
2 56 818
3 70
4 7 45
0 52 2515
1 56 818
2 765 524
3 7 45
4 40
0 7 45
1 70
2 5 818
3 8
4 52 2515
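For completeness, a sketch of a third approach that avoids both apply and the bracket cleanup: build a Department-to-Item lookup once and fill the gaps with map. This is my own variant (it keeps the first non-blank Item per Department rather than the maximum), so treat it as a sketch:
import numpy as np

# Where are the blanks? (the sample uses "" rather than NaN)
blank = df['Item'].isna() | (df['Item'] == '')

# First non-blank Item observed for each Department
known = df.loc[~blank, ['Department', 'Item']].drop_duplicates('Department')
lookup = dict(zip(known['Department'], known['Item']))

# Fill blanks from the lookup; departments with no known Item stay blank
df['Item'] = np.where(blank, df['Department'].map(lookup).fillna(''), df['Item'])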

Pandas can only convert an array of size 1 to a Python scalar

I have this dataframe, df_pm:
Player GameWeek Minutes \
PlayerMatchesDetailID
1 Alisson 1 90
2 Virgil van Dijk 1 90
3 Joseph Gomez 1 90
ForTeam AgainstTeam \
1 Liverpool Norwich City
2 Liverpool Norwich City
3 Liverpool Norwich City
Goals ShotsOnTarget ShotsInBox CloseShots \
1 0 0 0 0
2 1 1 1 1
3 0 0 0 0
TotalShots Headers GoalAssists ShotOnTargetCreated \
1 0 0 0 0
2 1 1 0 0
3 0 0 0 0
ShotInBoxCreated CloseShotCreated TotalShotCreated \
1 0 0 0
2 0 0 0
3 0 0 1
HeadersCreated
1 0
2 0
3 0
this second dataframe, df_melt:
MatchID GameWeek Date Team Home \
0 46605 1 2019-08-09 Liverpool Home
1 46605 1 2019-08-09 Norwich City Away
2 46606 1 2019-08-10 AFC Bournemouth Home
AgainstTeam
0 Norwich City
1 Liverpool
2 Sheffield United
3 AFC Bournemouth
...
575 Sheffield United
576 Newcastle United
577 Southampton
and this snippet, which uses both:
match_ids = []
home_away = []
dates = []
# For each row in the player matches dataframe...
for row in df_pm.itertuples():
    # Look up the match id from the team matches dataframe
    team = row.ForTeam
    againstteam = row.AgainstTeam
    gameweek = row.GameWeek
    print(team, againstteam, gameweek)
    match_id = df_melt.loc[(df_melt['GameWeek'] == gameweek)
                           & (df_melt['Team'] == team)
                           & (df_melt['AgainstTeam'] == againstteam),
                           'MatchID'].item()
    date = df_melt.loc[(df_melt['GameWeek'] == gameweek)
                       & (df_melt['Team'] == team)
                       & (df_melt['AgainstTeam'] == againstteam),
                       'Date'].item()
    home = df_melt.loc[(df_melt['GameWeek'] == gameweek)
                       & (df_melt['Team'] == team)
                       & (df_melt['AgainstTeam'] == againstteam),
                       'Home'].item()
    match_ids.append(match_id)
    home_away.append(home)
    dates.append(date)
At the first iteration, this prints:
Liverpool Norwich City 1
But I'm getting the error:
Traceback (most recent call last):
File "tableau_data_generation.py", line 166, in <module>
'MatchID'].item()
File "/Users/me/anaconda2/envs/data_science/lib/python3.7/site-packages/pandas/core/base.py", line 652, in item
return self.values.item()
ValueError: can only convert an array of size 1 to a Python scalar
printing the whole df_melt dataframe, I see that these four datetime values are flawed:
540 46875 28 TBC Aston Villa Home
541 46875 28 TBC Sheffield United Away
...
548 46879 28 TBC Manchester City Home
549 46879 28 TBC Arsenal Away
How do I fix this?
When you use item() on a Series you should actually have received:
FutureWarning: `item` has been deprecated and will be removed in a future version
Since item() was deprecated in version 0.25.0, it looks like you are using an
outdated version of Pandas, and you should probably start by upgrading it.
Even in newer versions of Pandas you can still use item(), but on a NumPy
array (which, at least for now, is not deprecated).
So change your code to:
df_melt.loc[...].values.item()
Another option is to use iloc[0], so you can also change your code to:
df_melt.loc[...].iloc[0]
Edit
The above solution can still raise an exception (IndexError) if df_melt
does not find any row meeting the given criteria.
To make your code resistant to such cases (and return some default value)
you can add a function getting the given attribute (attr, actually a
column) from the first row meeting the criteria given (gameweek, team,
and againstteam):
def getAttr(gameweek, team, againstteam, attr, default=None):
    xx = df_melt.loc[(df_melt['GameWeek'] == gameweek)
                     & (df_melt['Team'] == team)
                     & (df_melt['AgainstTeam'] == againstteam)]
    return default if xx.empty else xx.iloc[0].loc[attr]
Then, instead of all three ... = df_melt.loc[...].item() instructions, run:
match_id = getAttr(gameweek, team, againstteam, 'MatchID', default=-1)
date = getAttr(gameweek, team, againstteam, 'Date')
home = getAttr(gameweek, team, againstteam, 'Home', default='????')
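For context, a tiny hedged reproduction of the underlying error on a toy frame (not the question's data): NumPy's item() only succeeds when the selection holds exactly one element, and both an empty and a multi-row selection raise the same ValueError:
import pandas as pd

df = pd.DataFrame({'Team': ['Arsenal', 'Arsenal'],
                   'MatchID': [46875, 46879]})

sel = df.loc[df['Team'] == 'Arsenal', 'MatchID']   # two matching rows
print(sel.iloc[0])        # 46875 — positional access works on any non-empty selection
print(sel.values.item())  # ValueError: can only convert an array of size 1 to a Python scalar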

Blank quotes usage in dataframe

I am trying to combine OR | with df.loc to extract data. The code I have written extracts everything in the csv file. Here is the original csv file: https://drive.google.com/open?id=16eo29mF0pn_qNw-BGpZyVM9PBxv2aN1G
import pandas as pd
df = pd.read_csv("yelp_business.csv")
df = df.loc[(df['categories'].str.contains('chinese', case = False)) | (df['name'].str.contains('subway', case = False)) | (df['categories'].str.contains('', case = False)) | (df['address'].str.contains('', case = False))]
print df
It looks like the blank quotes '' are not working in str.contains or the OR | doesn't work in df.loc. Instead of just returning rows with chinese restaurants (which are 4171 in number) and the row with the restaurant name subway, it returns all the 174,568 rows.
EDITED
The output I want should be all the rows of category chinese and all the rows with name subway, while taking into consideration that the address might not have any assigned value or might be null.
import pandas as pd
df = pd.read_csv("yelp_business.csv")
cusine = 'chinese'
name = 'subway'
address #address has no assigned value or is NULL
df = df.loc[(df['categories'].str.contains(cusine, case = False)) |
(df['name'].str.contains(name, case = False)) |
(df['address'].str.contains(address, case = False))]
print df
This code gives me an error NameError: name 'address' is not defined.
I think it is possible here to chain conditions by | for the categories column; for finding an empty string use ^""$, which matches the start and end of a string containing only quotes:
df = pd.read_csv("yelp_business.csv")
df1 = df.loc[(df['categories'].str.contains('chinese|^""$', case = False)) |
(df['name'].str.contains('subway', case = False)) |
(df['address'].str.contains('^""$', case = False))]
print (len(df1))
11320
print (df1.head())
business_id name neighborhood \
9 TGWhGNusxyMaA4kQVBNeew "Detailing Gone Mobile" NaN
53 4srfPk1s8nlm1YusyDUbjg "Subway" Southeast
57 spDZkD6cp0JUUm6ghIWHzA "Kitchen M" Unionville
63 r6Jw8oRCeumxu7Y1WRxT7A "D&D Cleaning" NaN
88 YhV93k9uiMdr3FlV4FHjwA "Caviness Studio" NaN
address city state postal_code latitude \
9 "" Henderson NV 89014 36.055825
53 "6889 S Eastern Ave, Ste 101" Las Vegas NV 89119 36.064652
57 "8515 McCowan Road" Markham ON L3P 5E5 43.867918
63 "" Urbana IL 61802 40.110588
88 "" Phoenix AZ 85001 33.449967
longitude stars review_count is_open \
9 -115.046350 5.0 7 1
53 -115.118954 2.5 6 1
57 -79.283687 3.0 80 1
63 -88.207270 5.0 4 0
88 -112.070223 5.0 4 1
categories
9 Automotive;Auto Detailing
53 Fast Food;Restaurants;Sandwiches
57 Restaurants;Chinese
63 Home Cleaning;Home Services;Window Washing
88 Marketing;Men's Clothing;Restaurants;Graphic D...
EDIT: If you need to filter out empty and NaN values:
df2 = df.loc[((df['categories'].str.contains('chinese', case = False)) |
              (df['name'].str.contains('subway', case = False))) &
             ~((df['address'] == '""') | (df['categories'] == '""'))]
print (df2.head())
business_id name neighborhood \
53 4srfPk1s8nlm1YusyDUbjg "Subway" Southeast
57 spDZkD6cp0JUUm6ghIWHzA "Kitchen M" Unionville
96 dTWfATVrBfKj7Vdn0qWVWg "Flavor Cuisine" Scarborough
126 WUiDaFQRZ8wKYGLvmjFjAw "China Buffet" University City
145 vzx1WdVivFsaN4QYrez2rw "Subway" NaN
address city state postal_code \
53 "6889 S Eastern Ave, Ste 101" Las Vegas NV 89119
57 "8515 McCowan Road" Markham ON L3P 5E5
96 "8 Glen Watford Drive" Toronto ON M1S 2C1
126 "8630 University Executive Park Dr" Charlotte NC 28262
145 "5111 Boulder Hwy" Las Vegas NV 89122
latitude longitude stars review_count is_open \
53 36.064652 -115.118954 2.5 6 1
57 43.867918 -79.283687 3.0 80 1
96 43.787061 -79.276166 3.0 6 1
126 35.306173 -80.752672 3.5 76 1
145 36.112895 -115.062353 3.0 3 1
categories
53 Fast Food;Restaurants;Sandwiches
57 Restaurants;Chinese
96 Restaurants;Chinese;Food Court
126 Buffets;Restaurants;Sushi Bars;Chinese
145 Sandwiches;Restaurants;Fast Food
Find detailed information about contains at
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.contains.html
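One extra hedged note on str.contains: if a column holds real NaN values (rather than the literal "" quotes in this dataset), contains propagates NaN into the mask and boolean indexing then fails; the na= parameter pins those rows to a definite boolean. A minimal sketch:
import pandas as pd

s = pd.Series(['Restaurants;Chinese', None, 'Sandwiches'])
mask = s.str.contains('chinese', case=False, na=False)  # NaN rows become False
print(mask.tolist())  # [True, False, False]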

Pandas - Count the number of rows that would be true for a function - for each input row

I have a dataframe that needs a column added to it. That column needs to be a count of all the other rows in the table that meet a certain condition; that condition needs to take input both from the "input" row and the "output" row.
For example, if it was a dataframe describing people, and I wanted to make a column that counted how many people were taller than the current row and lighter.
I'd want the height and weight of the row, as well as the height and weight of the other rows in a function, so I can do something like:
def example_function(height1, weight1, height2, weight2):
    if height1 > height2 and weight1 < weight2:
        return True
    else:
        return False
And it would just sum up all the True's and give that sum in the column.
Is something like this possible?
Thanks in advance for any ideas!
Edit: Sample input:
id name height weight country
0 Adam 70 180 USA
1 Bill 65 190 CANADA
2 Chris 71 150 GERMANY
3 Eric 72 210 USA
4 Fred 74 160 FRANCE
5 Gary 75 220 MEXICO
6 Henry 61 230 SPAIN
The result would need to be:
id name height weight country new_column
0 Adam 70 180 USA 1
1 Bill 65 190 CANADA 1
2 Chris 71 150 GERMANY 3
3 Eric 72 210 USA 1
4 Fred 74 160 FRANCE 4
5 Gary 75 220 MEXICO 1
6 Henry 61 230 SPAIN 0
I believe it will need to be some sort of function, as the actual logic I need to use is more complicated.
edit 2:fixed typo
You can add booleans, like this:
count = ((df.height1 > df.height2) & (df.weight1 < df.weight2)).sum()
EDIT:
I tested it a bit and then changed the conditions with a custom function:
def f(x):
    #check boolean mask
    #print ((df.height > x.height) & (df.weight < x.weight))
    return ((df.height < x.height) & (df.weight > x.weight)).sum()

df['new_column'] = df.apply(f, axis=1)
print (df)
id name height weight country new_column
0 0 Adam 70 180 USA 2
1 1 Bill 65 190 CANADA 1
2 2 Chris 71 150 GERMANY 3
3 3 Eric 72 210 USA 1
4 4 Fred 74 160 FRANCE 4
5 5 Gary 75 220 MEXICO 1
6 6 Henry 61 230 SPAIN 0
Explanation:
For each row, compare the values against every other row; the count is simply the sum of the True values.
For example, if it was a dataframe describing people, and I wanted to make a column that counted how many people were taller than the current row and lighter.
As far as I understand, you want to assign to a new column something like
df['num_heigher_and_leighter'] = df.apply(lambda r: ((df.height > r.height) & (df.weight < r.weight)).sum(), axis=1)
However, your text description doesn't seem to match the outcome, which is:
0 2
1 3
2 0
3 1
4 0
5 0
6 6
dtype: int64
Edit
As in any other case, you can use a named function instead of a lambda:
df = ...
def foo(r):
    return ((df.height > r.height) & (df.weight < r.weight)).sum()
df['num_heigher_and_leighter'] = df.apply(foo, axis=1)
I'm assuming you had a typo and want to compare heights with heights and weights with weights. If so, you could count the number of persons both taller and heavier like so:
>>> for i, height, weight in zip(df.index, df.height, df.weight):
...     cnt = df.loc[((df.height>height) & (df.weight>weight)), 'height'].count()
...     df.loc[i,'thing'] = cnt
...
>>> df
name height weight country thing
0 Adam 70 180 USA 2.0
1 Bill 65 190 CANADA 2.0
2 Chris 71 150 GERMANY 3.0
3 Eric 72 210 USA 1.0
4 Fred 74 160 FRANCE 1.0
5 Gary 75 220 MEXICO 0.0
6 Henry 61 230 SPAIN 0.0
Here for instance, no person is Heavier than Henry, and no person is taller than Gary. If that's not what you intended, it should be easy to modify the & above to a | instead or switching out the > to a <.
When you're more accustomed to Pandas, I suggest you use Ami Tavory's excellent answer instead.
PS. For the love of god, use the Metric system for representing weight and height, and convert to whatever for presentation. These numbers are totally nonsensical for the world population at large. :)
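As a closing footnote, a hedged sketch of the same pairwise count without apply, using NumPy broadcasting; it mirrors the comparison directions of the custom function in the edited answer above (count the rows that are shorter and heavier than each row), so adjust the inequalities to taste:
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'name':   ['Adam', 'Bill', 'Chris', 'Eric', 'Fred', 'Gary', 'Henry'],
    'height': [70, 65, 71, 72, 74, 75, 61],
    'weight': [180, 190, 150, 210, 160, 220, 230],
})

h = df['height'].to_numpy()
w = df['weight'].to_numpy()
# mask[i, j] is True when row j is shorter than row i and heavier than row i
mask = (h[:, None] > h) & (w[:, None] < w)
df['new_column'] = mask.sum(axis=1)
print(df)  # same counts as df.apply(f, axis=1), in one vectorized pass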
