Select row data if other row data is matched - python

I have dataframe:
import pandas as pd
df = pd.read_csv('data.csv')
df.head()
title poster
0 Toy Story https://images-na.ssl-images-amazon.com/images...
1 Jumanji https://images-na.ssl-images-amazon.com/images...
I want to create a function which will take movie title as the input and return an return the poster link as output. I tried the following, but it is not working:
def function_to_return_link(movie_name):
if df['title'].str.contains(movie_name).any():
print('Movie present in df')
out = df.loc[df['title'] == movie_name]
print(out)
else:
print('Movie is not present')
It showing the output as:
function_to_return_link('Toy Story')
Movie present in df
Empty DataFrame
Columns: [title, poster]
Index: []

df.loc[..., 'poster'] returns a pd.Series with your selected movie(s). Then, use pd.Series.iat to get the first value in the selection (by index). If the movie isn't present, then it raises an IndexError.
def function_to_return_link(movie_name):
posters = df.loc[df['title'].str.contains(movie_name), 'poster']
try:
link = posters.iat[0]
except IndexError:
print('Movie is not present')
else:
return link
Note that this doesn't account for duplicate entries (multiple entries). To deal with that, you could do the below (though it's arguably/perhaps less pythonic than try/except).
def function_to_return_link(movie_name):
posters = df.loc[df['title'].str.contains(movie_name), 'poster']
if len(posters) > 1:
print('Multiple hits')
elif len(posters) == 0:
print('Movie is not present')
else:
return posters.iat[0]

Here's a way you could do:
def function_to_return_link(movie_name):
if movie_name in pos['title']:
return pos.query("title == #movie_name")['poster']
else:
print('Movie is not present')

Related

Xlookup with panda in Python

I am very new to Python and i would like to use xlookup in order to look for values in different columns (column"Debt", column "Liquidity" etc..) in a database
And fill the value in the cells (C17, C18, c19....) of a number of destination files which have the same format
path_source= r"C:\Test source.xlsx"
destination_file= r"C:Stress Test Q4 2022\test.xlsx"
df1 = pd.read_excel(path_source)
df2= pd.read_excel(destination_file)
def xlookup(lookup_value, lookup_array, return_array, if_not_found:str = ''):
match_value = return_array.loc[lookup_array == lookup_value]
if match_value.empty:
return f'"{lookup_value}" not found!' if if_not_found == '' else if_not_found
else:
return match_value.tolist()[0]
df2.iloc[2,17]= df1["debt"].apply(xlookup, args = (main_df1["Fund name"],main_df1["fund_A"] ))
NameError: name 'main_df1' is not defined
can anyone help correct the code ?
thanks a lot

adding multiple columns to a dataframe using df.apply and a lambda function

I am trying to add multiple columns to an existing dataframe with df.apply and a lambda function. I am able to add columns one by one but not able to do it for all the columns together.
My code
def get_player_stats(player_name):
print(player_name)
resp = requests.get(player_id_api + player_name)
if resp.status_code != 200:
# This means something went wrong.
print('Error {}'.format(resp.status_code))
result = resp.json()
player_id = result['data'][0]['pid']
resp_data = requests.get(player_data_api + str(player_id))
if resp_data.status_code != 200:
# This means something went wrong.
print('Error {}'.format(resp_data.status_code))
result_data = resp_data.json()
check1 = len(result_data.get('data',None).get('batting',None))
# print(check1)
check2 = len(result_data.get('data',{}).get('batting',{}).get('ODIs',{}))
# check2 = result_data.get(['data']['batting']['ODIs'],None)
# print(check2)
if check1 > 0 and check2 > 0:
total_6s = result_data['data']['batting']['ODIs']['6s']
total_4s = result_data['data']['batting']['ODIs']['4s']
average = result_data['data']['batting']['ODIs']['Ave']
total_innings = result_data['data']['batting']['ODIs']['Inns']
total_catches = result_data['data']['batting']['ODIs']['Ct']
total_stumps = result_data['data']['batting']['ODIs']['St']
total_wickets = result_data['data']['bowling']['ODIs']['Wkts']
print(average,total_innings,total_4s,total_6s,total_catches,total_stumps,total_wickets)
return np.array([average,total_innings,total_4s,total_6s,total_catches,total_stumps,total_wickets])
else:
print('No data for player')
return '','','','','','',''
cols = ['Avg','tot_inns','tot_4s','tot_6s','tot_cts','tot_sts','tot_wkts']
for col in cols:
players_available[col] = ''
players_available[cols] = players_available.apply(lambda x: get_player_stats(x['playerName']) , axis =1)
I have tried adding columns explicitly to the dataframe but still i am getting an error
ValueError: Must have equal len keys and value when setting with an iterable
Can someone help me with this?
It's tricky, since in pandas the apply method evolve through versions.
In my version (0.25.3) and also the other recent versions, if the function returns pd.Series object then it works.
In your code, you could try to change the return value in the function:
return pd.Series([average,total_innings,total_4s,total_6s,
total_catches,total_stumps,total_wickets])
return pd.Series(['','','','','','',''])

Unexpected KeyError with for loop but not when manual

I have written a function that manually creates separate dataframes for each participant in the main dataframe. However, I'm trying to write it so that it's more automated as participants will be added to the dataframe in the future.
My original function:
def separate_participants(main_df):
S001 = main_df[main_df['participant'] == 'S001']
S001.name = "S001"
S002 = main_df[main_df['participant'] == 'S002']
S002.name = "S002"
S003 = main_df[main_df['participant'] == 'S003']
S003.name = "S003"
S004 = main_df[main_df['participant'] == 'S004']
S004.name = "S004"
S005 = main_df[main_df['participant'] == 'S005']
S005.name = "S005"
S006 = main_df[main_df['participant'] == 'S006']
S006.name = "S006"
S007 = main_df[main_df['participant'] == 'S007']
S007.name = "S007"
participants = (S001, S002, S003, S004, S005, S006, S007)
participant_names = (S001.name, S002.name, S003.name, S004.name, S005.name, S006.name, S007.name)
return participants, participant_names
However, when I try and change this I get a KeyError for the name of the participant in the main_df. The code is as follows:
def separate_participants(main_df):
participant_list = list(main_df.participant.unique())
participants = []
for participant in participant_list:
name = participant
temp_df = main_df[main_df[participant] == participant]
name = temp_df
participants.append(name)
return participants
The error I get: KeyError: 'S001'
I can't seem to figure out what I'm doing wrong, that means it works in the old function but not the new one. The length of the objects in the dataframe and the list are the same (4) so there are no extra characters.
Any help/pointers would be greatly appreciated!
Thanks #Iguananaut for the answer:
Your DataFrame has a column named 'participant' but you're indexing it with the value of the variable participant which is presumably not a column in your DataFrame. You probably wanted main_df['participant']. Most likely the KeyError came with a "traceback" leading back to the line temp_df = main_df[main_df[participant] == participant] which suggests you should examine it closely.

Using .apply() on dataframe re-orders my columns into alphabetical order? Strange behavior

I am seeing some strange behaviour with an .apply() function.
I am calling an API, and editing the row with the response.
I can't upload the dataset for privacy reasons, but this is basically the function and simple example DataFrame:
df = pd.DataFrame({'my_customers':['John','Foo'],'email':['email#gmail.com','othermail#yahoo.com']})
print(df)
my_customers email
0 John email#gmail.com
1 Foo othermail#yahoo.com
And the api call:
def api_func(row):
name=row['my_customers']
email=row['email']
# send api request
response = api(name, email)
# if there is data in the response
if 'data' in response.keys():
# append our data to row
row['api_status'] = 'Data Found'
row['api_response']= response
return row
# otherwise flag no data found
else:
row['api_status'] = 'No Data Found'
return row
And the DataFrame after this apply() function. The columns have been re-ordered into alphabetical order:
df = df.apply(api_func,axis=1)
api_status api_response email my_customers
0 data found xyz email#gmail.com John
1 data found abc othermail#yahoo.com Foo
I have read the documentation and can't find any clues as to why this is happening. And i have never seen this happen before.
Any idea?
edit: full code:
def api_append(row):
# set up variables
api=get_api()
firstname=row['firstname']
lastname=row['lastname']
email=row['email']
phone=row['phone_only']
countrycode=row['country_code']
# send request
request = SearchAPIRequest(email=email,minimum_match=0.8,first_name=firstname,
last_name=lastname,phone=phone,country_code=countrycode,api_key=api)
# if there is an error, record it to the dataframe
try:
response = request.send()
response = response.to_dict()
except (ValueError, AttributeError) as e:
# append our error data to the row
row['Pipl_Api_Status'] = 'check error code'
row['Pipl_Api_response']= str(e)
return row
# if there is data in the response
if 'data' in response.keys():
# append our data to row
row['Api_Status'] = 'Data Found'
row['Api_response'] = response['data']
return row
# otherwise flag no data found
else:
row['Api_Status'] = 'No Data Found'
return row
df_sample = df_sample.apply(api_append,axis=1)

Loop and add function component as index

I would like to change the index of the following code. Instead of having 'close' as the index, I want to have the corresponding x from the function. As sometimes like in this example even if i provide 4 curr only 3 are available. Meaning that I cannot add the list as the index after looping as the size changes. Thank you for your help. I should add that even with the set_index(x) the index remain 'close'.
The function daily_price_historical retrieve prices from a public API . There are exactly 7 columns from which I select the the first one (close).
The function:
def daily_price_historical(symbol, comparison_symbol, all_data=False, limit=1, aggregate=1, exchange=''):
url = 'https://min-api.cryptocompare.com/data/histoday?fsym={}&tsym={}&limit={}&aggregate={}'\
.format(symbol.upper(), comparison_symbol.upper(), limit, aggregate)
if exchange:
url += '&e={}'.format(exchange)
if all_data:
url += '&allData=true'
page = requests.get(url)
data = page.json()['Data']
df = pd.DataFrame(data)
df.drop(df.index[-1], inplace=True)
return df
The code:
curr = ['1WO', 'ABX','ADH', 'ALX']
d_price = []
for x in curr:
try:
close = daily_price_historical(x, 'JPY', exchange='CCCAGG').close
d_price.append(close).set_index(x)
except:
pass
d_price = pd.concat(d_price, axis=1)
d_price = d_price.transpose()
print(d_price)
The output:
0
close 2.6100
close 0.3360
close 0.4843
The function daily_price_historical returns a dataframe, so daily_price_historical(x, 'JPY', exchange='CCCAGG').close is a pandas Series. The title of a Series is its name, but you can change it with rename. So you want:
...
close = daily_price_historical(x, 'JPY', exchange='CCCAGG').close
d_price.append(close.rename(x))
...
In your original code, d_price.append(close).set_index(x) raised a AttributeError: 'NoneType' object has no attribute 'set_index' exception because append on a list returns None but the exception was raised after the append and was silently swallowed by the catchall except: pass.
What to remember from that: never use the very dangerous :
try:
...
except:
pass
which hides any error.
Try this small code
import pandas as pd
import requests
curr = ['1WO', 'ABX','ADH', 'ALX']
def daily_price_historical(symbol, comparison_symbol, all_data=False, limit=1, aggregate=1, exchange=''):
url = 'https://min-api.cryptocompare.com/data/histoday?fsym={}&tsym={}&limit={}&aggregate={}'\
.format(symbol.upper(), comparison_symbol.upper(), limit, aggregate)
if exchange:
url += '&e={}'.format(exchange)
if all_data:
url += '&allData=true'
page = requests.get(url)
data = page.json()['Data']
df = pd.DataFrame(data)
df.drop(df.index[-1], inplace=True)
return df
d_price = []
lables_ind = []
for idx, x in enumerate(curr):
try:
close = daily_price_historical(x, 'JPY', exchange='CCCAGG').close
d_price.append(close[0])
lables_ind.append(x)
except:
pass
d_price = pd.DataFrame(d_price,columns=["0"])
d_price.index = lables_ind
print(d_price)
Output
0
1WO 2.6100
ADH 0.3360
ALX 0.4843

Categories