Python Filter Dataframe with Dynamic arguments - python

Hi, I want to filter a dataframe dynamically, using conditions passed in as command-line arguments.
This is my idea so far:
tr=pd.read_csv("sales.csv")
def filtr(*arg2):
fltr = tr.loc[(tr[arg2[0]] arg2[1] arg2[2]) arg2[3] ....]
print(fltr)
filtr(*sys.argv[1:])
## python test.py "Unit Cost" "==" 4 & .......
My idea was to treat (tr[arg2[0]] arg2[1] arg2[2]) as a template for a single condition and build one such condition per group of arguments in a loop, but I don't know how to do that.
edit: Data Example:
{'Region': {0: 'Sub-Saharan Africa', 1: 'Europe', 2: 'Middle East and North Africa', 3: 'Sub-Saharan Africa', 4: 'Europe', 5: 'Sub-Saharan Africa', 6: 'Asia', 7: 'Asia', 8: 'Sub-Saharan Africa', 9: 'Central America and the Caribbean', 10: 'Sub-Saharan Africa', 11: 'Europe', 12: 'Europe', 13: 'Asia', 14: 'Middle East and North Africa', 15: 'Australia and Oceania', 16: 'Central America and the Caribbean', 17: 'Europe', 18: 'Middle East and North Africa', 19: 'Europe'}, 'Country': {0: 'Chad', 1: 'Latvia', 2: 'Pakistan', 3: 'Democratic Republic of the Congo', 4: 'Czech Republic', 5: 'South Africa', 6: 'Laos', 7: 'China', 8: 'Eritrea', 9: 'Haiti', 10: 'Zambia', 11: 'Bosnia and Herzegovina', 12: 'Germany', 13: 'India', 14: 'Algeria', 15: 'Palau', 16: 'Cuba', 17: 'Vatican City', 18: 'Lebanon', 19: 'Lithuania'}, 'Item Type': {0: 'Office Supplies', 1: 'Beverages', 2: 'Vegetables', 3: 'Household', 4: 'Beverages', 5: 'Beverages', 6: 'Vegetables', 7: 'Baby Food', 8: 'Meat', 9: 'Office Supplies', 10: 'Cereal', 11: 'Baby Food', 12: 'Office Supplies', 13: 'Household', 14: 'Clothes', 15: 'Snacks', 16: 'Beverages', 17: 'Beverages', 18: 'Personal Care', 19: 'Snacks'}, 'Sales Channel': {0: 'Online', 1: 'Online', 2: 'Offline', 3: 'Online', 4: 'Online', 5: 'Offline', 6: 'Online', 7: 'Online', 8: 'Online', 9: 'Online', 10: 'Offline', 11: 'Offline', 12: 'Online', 13: 'Online', 14: 'Offline', 15: 'Offline', 16: 'Online', 17: 'Online', 18: 'Offline', 19: 'Offline'}, 'Order Priority': {0: 'L', 1: 'C', 2: 'C', 3: 'C', 4: 'C', 5: 'H', 6: 'L', 7: 'C', 8: 'L', 9: 'C', 10: 'M', 11: 'M', 12: 'C', 13: 'C', 14: 'C', 15: 'L', 16: 'H', 17: 'L', 18: 'H', 19: 'H'}, 'Order Date': {0: '1/27/2011', 1: '12/28/2015', 2: '1/13/2011', 3: '9/11/2012', 4: '10/27/2015', 5: '7/10/2012', 6: '2/20/2011', 7: '4/10/2017', 8: '11/21/2014', 9: '7/4/2015', 10: '7/26/2016', 11: '10/20/2012', 12: '2/22/2015', 13: '8/27/2016', 14: '6/21/2011', 15: '9/19/2013', 16: '11/15/2015', 17: '4/6/2015', 18: '4/12/2010', 19: 
'9/26/2011'}, 'Order ID': {0: 292494523, 1: 361825549, 2: 141515767, 3: 500364005, 4: 127481591, 5: 482292354, 6: 844532620, 7: 564251220, 8: 411809480, 9: 327881228, 10: 773452794, 11: 479823005, 12: 498603188, 13: 151717174, 14: 181401288, 15: 500204360, 16: 640987718, 17: 206925189, 18: 221503102, 19: 878520286}, 'Ship Date': {0: '2/12/2011', 1: '1/23/2016', 2: '2/1/2011', 3: '10/6/2012', 4: '12/5/2015', 5: '8/21/2012', 6: '3/20/2011', 7: '5/12/2017', 8: '1/10/2015', 9: '7/20/2015', 10: '8/24/2016', 11: '11/15/2012', 12: '2/27/2015', 13: '9/2/2016', 14: '7/21/2011', 15: '10/4/2013', 16: '11/30/2015', 17: '4/27/2015', 18: '5/19/2010', 19: '10/2/2011'}, 'Units Sold': {0: 4484, 1: 1075, 2: 6515, 3: 7683, 4: 3491, 5: 9880, 6: 4825, 7: 3330, 8: 2431, 9: 6197, 10: 724, 11: 9145, 12: 6618, 13: 5338, 14: 9527, 15: 441, 16: 1365, 17: 2617, 18: 6545, 19: 2530}, 'Unit Price': {0: 651.21, 1: 47.45, 2: 154.06, 3: 668.27, 4: 47.45, 5: 47.45, 6: 154.06, 7: 255.28, 8: 421.89, 9: 651.21, 10: 205.7, 11: 255.28, 12: 651.21, 13: 668.27, 14: 109.28, 15: 152.58, 16: 47.45, 17: 47.45, 18: 81.73, 19: 152.58}, 'Unit Cost': {0: 524.96, 1: 31.79, 2: 90.93, 3: 502.54, 4: 31.79, 5: 31.79, 6: 90.93, 7: 159.42, 8: 364.69, 9: 524.96, 10: 117.11, 11: 159.42, 12: 524.96, 13: 502.54, 14: 35.84, 15: 97.44, 16: 31.79, 17: 31.79, 18: 56.67, 19: 97.44}, 'Total Revenue': {0: 2920025.64, 1: 51008.75, 2: 1003700.9, 3: 5134318.41, 4: 165647.95, 5: 468806.0, 6: 743339.5, 7: 850082.4, 8: 1025614.59, 9: 4035548.37, 10: 148926.8, 11: 2334535.6, 12: 4309707.78, 13: 3567225.26, 14: 1041110.56, 15: 67287.78, 16: 64769.25, 17: 124176.65, 18: 534922.85, 19: 386027.4}, 'Total Cost': {0: 2353920.64, 1: 34174.25, 2: 592408.95, 3: 3861014.82, 4: 110978.89, 5: 314085.2, 6: 438737.25, 7: 530868.6, 8: 886561.39, 9: 3253177.12, 10: 84787.64, 11: 1457895.9, 12: 3474185.28, 13: 2682558.52, 14: 341447.68, 15: 42971.04, 16: 43393.35, 17: 83194.43, 18: 370905.15, 19: 246523.2}, 'Total Profit': {0: 566105.0, 1: 16834.5, 2: 
411291.95, 3: 1273303.59, 4: 54669.06, 5: 154720.8, 6: 304602.25, 7: 319213.8, 8: 139053.2, 9: 782371.25, 10: 64139.16, 11: 876639.7, 12: 835522.5, 13: 884666.74, 14: 699662.88, 15: 24316.74, 16: 21375.9, 17: 40982.22, 18: 164017.7, 19: 139504.2}}

Just use eval() (only with trusted input, since eval() executes arbitrary code). Here is the code:
import pandas as pd
def filter_df(df, args_list):
    """Return the rows of ``df`` that satisfy every constraint in ``args_list``.

    Each constraint is a ``[column, operator, value]`` sequence of strings,
    e.g. ``["COL_A", "<=", "4"]`` or ``["COL_C", "==", "'xxx'"]``.  The value
    is spliced into the expression verbatim, so a string literal must carry
    its own quotes.  All constraints are combined with ``&`` (logical AND).

    Columns are addressed as ``df['name']`` rather than ``df.name`` so that
    names which are not valid identifiers (such as ``"Unit Cost"``) work too.

    An empty ``args_list`` returns an unfiltered copy of ``df``.

    WARNING: the assembled expression is executed with ``eval()``.  Operator
    and value come straight from the caller, so never feed this function
    input you do not trust (for example raw command-line or web input).
    """
    if not args_list:
        # Nothing to evaluate: eval("") would raise SyntaxError.
        return df.copy()
    constraints = [
        # {!r} quotes the column name so spaces and punctuation are safe.
        "(df[{!r}] {} {})".format(a[0], a[1], a[2])
        for a in args_list
    ]
    filter_str = " & ".join(constraints)
    # SECURITY: eval() runs arbitrary Python; see the warning in the docstring.
    return df[eval(filter_str)]
# Small demo frame: two integer columns and one string column.
data = {
"COL_A": [1,2,3,2,4,6],
"COL_B": [1,10,100,20,20,40],
"COL_C": ["aaa", "bbb", "zzz", "xxx", "xxx", "xxx"]
}
df = pd.DataFrame(data)
# Each constraint is [column, operator, value]; every part is a string, so
# the string literal 'xxx' has to carry its own quotes.
args_list = [["COL_A", "<=", "4"], ["COL_C", "==", "'xxx'"]]
# Keep the rows where COL_A <= 4 and COL_C == 'xxx'.
df2 = filter_df(df, args_list)
This is df:
After filter COL_A <= 4 & COL_C == 'xxx', this is df2:

How about this ?
def filter(df, **args):
    """Keep only the rows whose listed columns all exceed their thresholds.

    The conditions are passed as one keyword argument named ``args``: a
    mapping of column name to lower bound, for example
    ``filter(df, args={"Unit Cost": 500, "Unit Price": 500})``.  A row is
    kept only when ``df[column] > bound`` holds for every entry.

    Calling without ``args`` (or with an empty mapping) returns ``df``
    unchanged instead of raising ``KeyError``.

    Note: the name shadows the built-in ``filter``; it is kept so existing
    callers continue to work.
    """
    conditions = args.get("args") or {}
    for key, value in conditions.items():
        # Narrow the frame one condition at a time (strictly greater than).
        df = df[df[key] > value]
    return df
Invoke using
df = filter(df, args={"Unit Cost": 500, "Unit Price": 500})
Result:
print(df.shape)
(5,14)
Note: This approach only works when every condition uses the > comparison. If you need to mix different operators, you will need a different approach.

def filter_df(arg2):
    """Return the rows of the global frame ``tr`` matching one comparison.

    ``arg2`` is a ``(column, operator, value)`` sequence such as
    ``('Unit Cost', '>', '500')``.  Supported operators are ``>``, ``<`` and
    ``=`` (``==`` is accepted as a synonym).  A string value is parsed as an
    integer when possible and as a float otherwise, so ``'31.79'`` works;
    non-string values are compared as given.

    Raises:
        ValueError: if the operator is not supported, or if a string value
            is not a number.
    """
    import operator

    comparisons = {
        ">": operator.gt,
        "<": operator.lt,
        "=": operator.eq,
        "==": operator.eq,
    }
    column, symbol, raw_value = arg2[0], arg2[1], arg2[2]
    if symbol not in comparisons:
        raise ValueError("invalid comparison: %s" % symbol)
    if isinstance(raw_value, str):
        try:
            value = int(raw_value)
        except ValueError:
            # Not an integer literal: fall back to float so decimals work.
            value = float(raw_value)
    else:
        value = raw_value
    return tr.loc[comparisons[symbol](tr[column], value)]
filter_df(arg2)
Now if, for example, arg2 = ('Unit Cost', '>', '500'), the function returns only the rows with Unit Cost > 500.
Passing multiple conditions at once is more complicated; my suggestion is to apply them one at a time, filtering the result of the previous step.

Related

Is there a way of creating boxplots using the exact boxplot values?

I am trying to create boxplots for 24 hours, each hour already having the maxValue, quartile75, mean, quartile25 and minValue. Those values are stored in a dataframe - I put them into a dict.
{'hour': {0: 0,
1: 1,
2: 2,
3: 3,
4: 4,
5: 5,
6: 6,
7: 7,
8: 8,
9: 9,
10: 10,
11: 11,
12: 12,
13: 13,
14: 14,
15: 15,
16: 16,
17: 17,
18: 18,
19: 19,
20: 20,
21: 21,
22: 22,
23: 23},
'minValue': {0: -491.69,
1: -669.49,
2: -551.22,
3: -514.2,
4: -506.94,
5: -665.7,
6: -484.89,
7: -488.99,
8: -524.22,
9: -851.9,
10: -610.0,
11: -998.8,
12: -580.57,
13: -737.22,
14: -895.2,
15: -500.0,
16: -852.0,
17: -610.0,
18: -500.0,
19: -610.0,
20: -1000.0,
21: -674.0,
22: -1005.0,
23: -499.33},
'quartile25': {0: 114.94,
1: 119.29,
2: 128.8,
3: 139.8,
4: 151.48,
5: 146.75,
6: 139.1,
7: 125.02,
8: 110.0,
9: 105.0,
10: 94.9,
11: 92.81,
12: 107.62,
13: 134.5,
14: 150.8,
15: 168.51,
16: 175.71,
17: 163.0,
18: 142.57,
19: 139.3,
20: 139.45,
21: 120.68,
22: 116.89,
23: 112.84},
'median': {0: 188.53,
1: 193.2,
2: 206.6,
3: 222.2,
4: 234.58,
5: 227.68,
6: 218.32,
7: 200.93,
8: 190.92,
9: 182.6,
10: 175.01,
11: 176.87,
12: 192.33,
13: 210.38,
14: 227.0,
15: 243.87,
16: 252.1,
17: 245.45,
18: 226.86,
19: 219.6,
20: 209.09,
21: 192.32,
22: 187.4,
23: 184.94},
'quartile75': {0: 292.1,
1: 295.33,
2: 316.62,
3: 340.8,
4: 357.0,
5: 345.3,
6: 330.4,
7: 305.28,
8: 290.4,
9: 280.1,
10: 268.23,
11: 270.99,
12: 301.84,
13: 321.04,
14: 345.61,
15: 373.84,
16: 393.39,
17: 382.79,
18: 359.89,
19: 341.55,
20: 325.5,
21: 292.1,
22: 287.2,
23: 285.96},
'maxValue': {0: 2420.3,
1: 1450.0,
2: 2852.0,
3: 7300.0,
4: 3967.0,
5: 3412.1,
6: 6999.99,
7: 2999.99,
8: 6000.0,
9: 3000.0,
10: 8885.9,
11: 9999.0,
12: 6254.0,
13: 2300.0,
14: 2057.58,
15: 2860.0,
16: 5000.0,
17: 4151.01,
18: 7000.0,
19: 3000.0,
20: 6000.0,
21: 3000.5,
22: 2000.0,
23: 2500.0}}
When I used a normal time series data set I plotted like this:
# Number of boxes to draw.
N=24
# One HSL colour string per box, with hues evenly spaced from 0 to 360.
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, N)]
# One Box trace per entry of hour_dataframes, built from that entry's
# 'hour' (x) and 'priceNum' (y) columns.
# NOTE(review): presumably `go` is plotly.graph_objects and hour_dataframes
# holds one dataframe per hour - neither is defined in this snippet.
fig = go.Figure(data=[go.Box(
x=hour_dataframes[i]['hour'],
y=hour_dataframes[i]['priceNum'],
marker_color=c[i]
) for i in range(int(N))])
# Fixed-size figure (autosize off) on a light grey background.
fig.update_layout(
xaxis=dict(showgrid=True, zeroline=True, showticklabels=True),
yaxis=dict(zeroline=True, gridcolor='white'),
paper_bgcolor='rgb(233,233,233)',
plot_bgcolor='rgb(233,233,233)',
autosize=False,
width=1500,
height=1000,
)
fig.show()
It worked fine, but the data set became too big and JupyterLab started crashing, so I pulled aggregated data instead. Now I don't know how to plot multiple boxes (like the code above does) using the exact, precomputed box plot values.

networkx and pyvis: how to add color to nodes based on a dimension

I'm using networkx and pyvis together. I'm trying to figure out how to change the color of the nodes in pyvis based on an attribute in my data frame.
My dataframe looks like this:
I want to show the connections between user_one and user_two but color their nodes based on their license type.
Here is what I have thus far:
import pandas as pd
import networkx as nx
from pyvis.network import Network
df_dict = {'PROJECT': {0: 'Finance Project', 1: 'Finance Project', 2: 'Finance Project', 3: 'Finance Project', 4: 'Finance Project', 5: 'Finance Project', 6: 'Finance Project', 7: 'Finance Project', 8: 'Finance Project', 9: 'Finance Project', 10: 'Finance Project', 11: 'Finance Project', 12: 'HR Project', 13: 'Finance Project', 14: 'HR Project', 15: 'Finance Project'},
'PLAN': {0: 'COMPANY', 1: 'COMPANY', 2: 'COMPANY', 3: 'COMPANY', 4: 'COMPANY', 5: 'COMPANY', 6: 'COMPANY', 7: 'COMPANY', 8: 'COMPANY', 9: 'COMPANY', 10: 'COMPANY', 11: 'COMPANY', 12: 'COMPANY', 13: 'COMPANY', 14: 'COMPANY', 15: 'COMPANY'},
'USER_ONE': {0: 'Mike Jones', 1: 'Eminem', 2: 'Mike Jones', 3: 'Mike Jones', 4: 'Michael Jordan', 5: 'Eminem', 6: 'Michael Jordan', 7: 'Michael Jordan', 8: 'Mike Jones', 9: 'Kobe Bryant', 10: 'Eminem', 11: 'Elon Musk', 12: 'Bill Gates', 13: 'Elon Musk', 14: 'Mark Zuckerberg', 15: 'Elon Musk'},
'USER_ONE_LICENSE': {0: 'FULL', 1: 'FULL', 2: 'FULL', 3: 'FULL', 4: 'FULL', 5: 'FULL', 6: 'FULL', 7: 'FULL', 8: 'FULL', 9: 'OCCASIONAL', 10: 'FULL', 11: 'FULL', 12: 'FULL', 13: 'FULL', 14: 'FULL', 15: 'FULL'},
'USER_ONE_LICENSE_COLOR': {0: 'lightgreen', 1: 'lightgreen', 2: 'lightgreen', 3: 'lightgreen', 4: 'lightgreen', 5: 'lightgreen', 6: 'lightgreen', 7: 'lightgreen', 8: 'lightgreen', 9: 'gray', 10: 'lightgreen', 11: 'lightgreen', 12: 'lightgreen', 13: 'lightgreen', 14: 'lightgreen', 15: 'lightgreen'},
'USER_ONE_DAYS_COLLAB': {0: 88, 1: 55, 2: 67, 3: 1, 4: 70, 5: 54, 6: 2, 7: 114, 8: 4, 9: 1, 10: 10, 11: 19, 12: 5, 13: 11, 14: 100, 15: 13},
'USER_TWO': {0: 'Michael Jordan', 1: 'Mike Jones', 2: 'Eminem', 3: 'Kobe Bryant', 4: 'Eminem', 5: 'Michael Jordan', 6: 'Elon Musk', 7: 'Mike Jones', 8: 'Elon Musk', 9: 'Mike Jones', 10: 'Elon Musk', 11: 'Eminem', 12: 'Mark Zuckerberg', 13: 'Michael Jordan', 14: 'Bill Gates', 15: 'Mike Jones'},
'USER_TWO_LICENSE': {0: 'FULL', 1: 'FULL', 2: 'FULL', 3: 'OCCASIONAL', 4: 'FULL', 5: 'FULL', 6: 'FULL', 7: 'FULL', 8: 'FULL', 9: 'FULL', 10: 'FULL', 11: 'FULL', 12: 'FULL', 13: 'FULL', 14: 'FULL', 15: 'FULL'},
'USER_TWO_LICENSE_COLOR': {0: 'lightgreen', 1: 'lightgreen', 2: 'lightgreen', 3: 'gray', 4: 'lightgreen', 5: 'lightgreen', 6: 'lightgreen', 7: 'lightgreen', 8: 'lightgreen', 9: 'lightgreen', 10: 'lightgreen', 11: 'lightgreen', 12: 'lightgreen', 13: 'lightgreen', 14: 'lightgreen', 15: 'lightgreen'},
'USER_TWO_DAYS_COLLAB': {0: 114, 1: 67, 2: 55, 3: 1, 4: 54, 5: 70, 6: 11, 7: 88, 8: 13, 9: 1, 10: 19, 11: 10, 12: 100, 13: 2, 14: 5, 15: 4}
, 'TOTAL_COLLABS': {0: 202, 1: 122, 2: 122, 3: 2, 4: 124, 5: 124, 6: 13, 7: 202, 8: 17, 9: 2, 10: 29, 11: 29, 12: 105, 13: 13, 14: 105, 15: 17}}
# Build the dataframe from the df_dict literal.
df = pd.DataFrame(df_dict)
# Build a graph whose edges connect each row's USER_ONE to its USER_TWO.
# No node attributes (such as the licence colour) are passed here.
G = nx.from_pandas_edgelist(df
,source='USER_ONE'
,target='USER_TWO'
)
# Create a pyvis network with notebook=True and load the networkx graph.
net = Network(notebook=True)
net.from_nx(G)
# NOTE(review): presumably enables the interactive option widgets - confirm
# against the pyvis documentation.
net.show_buttons(filter_=True)
# Render the network to example4.html.
net.show('example4.html')
I'm trying to follow this as a template: Assign color to NetworkX node based on column name
#code from stackoverflow
# Collect the known names once so each node lookup is a cheap set test
# instead of a scan over the whole column.
known_people = set(df["person"].values)
# One colour per node, in the graph's iteration order.
colors = [
    "lightblue" if node in known_people else "lightgreen"
    for node in G
]
#my attempt:
# One row per distinct USER_ONE with its licence colour; max() picks the
# lexicographically largest colour string if a user has more than one.
colors_df = df.groupby(by='USER_ONE')['USER_ONE_LICENSE_COLOR'].max().reset_index()
# Node names in the order the graph iterates them.
nodes_list_ordered = list(G.nodes)
# Give USER_ONE a categorical ordering that follows the node order, then
# sort by it so the colour list lines up with the nodes.
colors_df['ORDER'] = pd.Categorical(colors_df.USER_ONE, categories = nodes_list_ordered, ordered = True)
colors_df.sort_values(by='ORDER', inplace=True)
# NOTE(review): only USER_ONE is grouped, so a user who appears solely in
# USER_TWO gets no colour and the list would be shorter than the node list -
# confirm against the data.
colors = colors_df['USER_ONE_LICENSE_COLOR'].to_list()
Am I doing this right? I now have a list of colors in the right order, but how do I apply it in pyvis?
I did all this only to realize that from_pandas_edgelist() does not have a 'node_color' argument.

How to split a column and add additional rows from the split values in pandas?

I have a dataframe as:
{'last_name': {0: 'Acosta-Arriola',
1: 'Afragola',
2: 'Bertolini',
3: 'Coyle',
4: 'Davis',
10: 'Duntz',
11: 'Eastman',
12: 'Fitzgerald',
13: 'Fitzgerald',
14: 'Freeman',
15: 'Freeman',
16: 'Gambardella',
17: 'Kelleher',
18: 'King',
19: 'Looney',
20: 'Mccann',
21: 'Murray',
22: 'Palmeri',
23: 'Powers',
24: 'Vitelli',
25: 'Wyzykowski'},
'first_name_or_initial': {0: 'Jose',
1: 'Sarah',
2: 'Peter',
3: 'James',
4: 'Albert',
10: 'Shawn',
11: 'Bryan',
12: 'Richard',
13: 'Richard',
14: 'Matthew',
15: 'Matthew',
16: 'Vincent',
17: 'Robert',
18: 'Thomas',
19: 'Ray',
20: 'Joseph',
21: 'Joshua',
22: 'Randy',
23: 'Dennis',
24: 'Robert',
25: 'John'},'middle_name_or_initial': {0: 'Lusi;Luis',
1: 'R.;B.',
2: 'M.;Mario',
3: 'M.;Michael',
4: 'Chadbourne;C.',
10: 'R.;Richard',
11: 'J.;James',
12: 'M.;J.;Micha',
13: 'M.;Michael',
14: 'Christopher;Robert',
15: 'Christopher;C.',
16: 'A.;Anthony',
17: 'S.;Steven',
18: 'E.;Emory',
19: 'S.;Scott',
20: 'M.;Michael',
21: 'M.;P.',
22: 'T.;Thomas',
23: 'E.;Edward',
24: 'J.;D.',
25: 'J.;James'},
'Suffix': {0: '',
1: '',
2: '',
3: '',
4: 'Jr.',
10: '',
11: '',
12: '',
13: '',
14: '',
15: '',
16: '',
17: '',
18: 'Jr.',
19: 'Jr.',
20: '',
21: '',
22: '',
23: 'Jr.',
24: '',
25: ''},
'address_1': {0: '',
1: '51 Indigo Trail',
2: '90 Cherry Street;1295 Great Hill Road;90 Cherry Street',
3: '51 Canary Court;51 Canary Court;687 Main Street',
4: '39 Hemenway Street',
10: '118 Brookside Avenue;9886 171 Street Place',
11: '616 East Main Street;989 Boston Post Road;38 Mallard Court;1421 Naugatuck Avenue',
12: '',
13: '18 Fox Ridge Lane;18 Fox Ridge;18 Fox Ridge Road',
14: '',
15: '',
16: '45 Jakobs Landing',
17: '171 Williams Road;181 Knob Hill Road',
18: '31 Millwood Drive;31 Millwood Drive;41 Waverly Park Road;31 Millwood Drive;25 Crouch Road',
19: '',
20: '17 Pheasant Run;25 Mcdermott Road;PO Box 510;17 Pheasant Run',
21: '42 Seymour Street;42 Seymour Stt',
22: '205 Mccall Road',
23: '204 Milton Avenue;187 Milton Avenue',
24: '16 Montgomery Drive',
25: '457 Hill Street;139 County Line Road'}}
Here I would like to split the column middle_name_or_initial on the semicolon delimiter ';'.
After splitting, I would like one additional row for each of the resulting values.
for example:
Duntz Shawn R.;Richard 118 Brookside Avenue;9886 171 Street Place
should be
1. Duntz - Shawn - R. - 118 Brookside Avenue;9886 171 Street Place
2. Duntz - Shawn - Richard - 118 Brookside Avenue;9886 171 Street Place
# Replace each ';'-delimited string with the list of its parts.
df['middle_name_or_initial'] = df['middle_name_or_initial'].str.split(';')
# Emit one row per list element; the remaining columns are repeated.
df_new = df.explode('middle_name_or_initial')
here is the documentation of df.explode()
doc

How to Use Melt to Tidy Dataframe in Pandas?

dt = {'Ind': {0: 'Ind1',
1: 'Ind2',
2: 'Ind3',
3: 'Ind4',
4: 'Ind5',
5: 'Ind6',
6: 'Ind7',
7: 'Ind8',
8: 'Ind9',
9: 'Ind10',
10: 'Ind1',
11: 'Ind2',
12: 'Ind3',
13: 'Ind4',
14: 'Ind5',
15: 'Ind6',
16: 'Ind7',
17: 'Ind8',
18: 'Ind9',
19: 'Ind10'},
'Treatment': {0: 'Treat',
1: 'Treat',
2: 'Treat',
3: 'Treat',
4: 'Treat',
5: 'Treat',
6: 'Treat',
7: 'Treat',
8: 'Treat',
9: 'Treat',
10: 'Cont',
11: 'Cont',
12: 'Cont',
13: 'Cont',
14: 'Cont',
15: 'Cont',
16: 'Cont',
17: 'Cont',
18: 'Cont',
19: 'Cont'},
'value': {0: 4.5,
1: 8.3,
2: 6.2,
3: 4.2,
4: 7.1,
5: 7.5,
6: 7.9,
7: 5.1,
8: 5.8,
9: 6.0,
10: 11.3,
11: 11.6,
12: 13.3,
13: 12.2,
14: 13.4,
15: 11.7,
16: 12.1,
17: 12.0,
18: 14.0,
19: 13.8}}
# Build the frame with an explicit column order.
mydt = pd.DataFrame(dt, columns=['Ind', 'Treatment', 'value'])
How can I tidy up my dataframe to make it look like?
Desired Output
You can use DataFrame.from_dict
pd.DataFrame.from_dict(data, orient='index')

TypeError: unsupported operand type(s) for &: 'str' and 'bool'

All,
I have the pandas dataframe below, and I am trying to filter it so that the output shows the country name together with the y1989 column, for rows where y1989 is greater than 1000000. For this I am using the code below, but it returns the error shown.
{'Country': {0: 'Austria', 1: 'Belgium', 2: 'Denmark', 3: 'Finland', 4: 'France', 5: 'Germany', 6: 'Iceland', 7: 'Ireland', 8: 'Italy', 9: 'Luxemburg', 10: 'Netherland', 11: 'Norway', 12: 'Portugal', 13: 'Spain', 14: 'Sweden', 15: 'Switzerland', 16: 'United Kingdom'}, 'y1989': {0: 7602431, 1: 9927600, 2: 5129800, 3: 4954359, 4: 56269800, 5: 61715000, 6: 253500, 7: 3526600, 8: 57504700, 9: 374900, 10: 14805240, 11: 4226901, 12: 10304700, 13: 38851900, 14: 8458890, 15: 6619973, 16: 57236200}, 'y1990': {0: 7660345.0, 1: 9947800.0, 2: 5135400.0, 3: 4974383.0, 4: 0.0, 5: 62678000.0, 6: 255708.0, 7: 3505500.0, 8: 57576400.0, 9: 379300.0, 10: 14892574.0, 11: 4241473.0, 12: 0.0, 13: 38924500.0, 14: 8527040.0, 15: 6673850.0, 16: 57410600.0}, 'y1991': {0: 7790957, 1: 9987000, 2: 5146500, 3: 4998478, 4: 56893000, 5: 79753000, 6: 259577, 7: 3519000, 8: 57746200, 9: 384400, 10: 15010445, 11: 4261930, 12: 9858500, 13: 38993800, 14: 8590630, 15: 6750693, 16: 57649200}, 'y1992': {0: 7860800, 1: 10068319, 2: 5162100, 3: 5029300, 4: 57217500, 5: 80238000, 6: 262193, 7: 3542000, 8: 57788200, 9: 389800, 10: 15129200, 11: 4273634, 12: 9846000, 13: 39055900, 14: 8644100, 15: 6831900, 16: 58888800}, 'y1993': {0: 7909575, 1: 10100631, 2: 5180614, 3: 5054982, 4: 57529577, 5: 81338000, 6: 264922, 7: 3559985, 8: 57114161, 9: 395200, 10: 15354000, 11: 4324577, 12: 9987500, 13: 39790955, 14: 8700000, 15: 6871500, 16: 58191230}, 'y1994': {0: 7943652, 1: 10130574, 2: 5191000, 3: 5098754, 4: 57847000, 5: 81353000, 6: 266783, 7: 3570700, 8: 57201800, 9: 400000, 10: 15341553, 11: 4348410, 12: 9776000, 13: 39177400, 14: 8749000, 15: 7021200, 16: 58380000}, 'y1995': {0: 8054800, 1: 10143047, 2: 5251027, 3: 5116800, 4: 58265400, 5: 81845000, 6: 267806, 7: 3591200, 8: 57268578, 9: 412800, 10: 15492800, 11: 4370000, 12: 9920800, 13: 39241900, 14: 8837000, 15: 7060400, 16: 58684000}}
My code
df[(df.Country)& (df.y1989>1000000)]
Error:
TypeError: unsupported operand type(s) for &: 'str' and 'bool'
I am not sure what could be the reason, being a newbie to python if you could provide explanation for the error that will be greatly appreciated.
Thanks in advance,
'Country' doesn't form part of your filtering criteria, so don't use it to form your Boolean indexer. Instead, use the loc accessor to give a Boolean condition and specify necessary columns separately:
res = df.loc[df['y1989'] > 1000000, ['Country','y1989']]
Under no circumstances use chained indexing, e.g. via df[df['y1989']>1000000][['Country','y1989']], as this is ambiguous and explicitly discouraged in the docs.

Categories