I have a reproducible example, toy dataframe:
df = pd.DataFrame({'my_customers':['John','Foo'],'email':['email#gmail.com','othermail#yahoo.com'],'other_column':['yes','no']})
print(df)
my_customers email other_column
0 John email#gmail.com yes
1 Foo othermail#yahoo.com no
And I apply() a function to the rows, creating a new column inside the function:
def func(row):
# if this column is 'yes'
if row['other_column'] == 'yes':
# create a new column with 'Hello' in it
row['new_column'] = 'Hello'
# return to df
return row
# otherwise
else:
# just return the row
return row
I then apply the function to the df, and we can see that the order has been changed. The columns are now in alphabetical order. Is there any way to avoid this? I would like to keep it in the original order.
df = df.apply(func, axis = 1)
print(df)
email my_customers new_column other_column
0 email#gmail.com John Hello yes
1 othermail#yahoo.com Foo NaN no
Edited for clarification - the above code was too simple
input
df = pd.DataFrame({'my_customers':['John','Foo'],
'email':['email#gmail.com','othermail#yahoo.com'],
'api_status':['data found','no data found'],
'api_response':['huge json','huge json']})
my_customers email api_status api_response
0 John email#gmail.com data found huge json
1 Foo othermail#yahoo.com no data found huge json
Parsing the api_response. I need to create many new rows in the DF:
def api_parse(row):
# if we have response data
if row['api_response'] == huge json:
# get response for parsing
response_data = row['api_response']
"""Let's get associated URLS first"""
# if there's a URL section in the response
if 'urls' in response_data .keys():
# get all associated URLS into a list
urls = extract_values(response_data ['urls'], 'url')
row['Associated_Urls'] = urls
"""Get a list of jobs"""
if 'jobs' in response_data .keys():
# get all associated jobs and organizations into a list
titles = extract_values(person_data['jobs'], 'title')
organizations = extract_values(person_data['jobs'], 'organization')
counter = 1
# create a new column for each job
for pair in zip(titles,organizations):
row['Job'+'_'+str(counter)] = f'Title: {pair[0]}, Organization: {pair[1]}'
counter +=1
"""Get a list of education"""
if 'educations' in response_data .keys():
# get all degrees into list
degrees = extract_values(response_data ['educations'], 'display')
counter = 1
# create a new column for each degree
for edu in degrees:
row['education'+'_'+str(counter)] = edu
counter +=1
"""Get a list of social profiles from URLS we parsed earlier"""
facebook = [i for i in urls if 'facebook' in i] or [np.nan]
instagram = [i for i in urls if 'instagram' in i] or [np.nan]
linkedin = [i for i in urls if 'linkedin' in i] or [np.nan]
twitter = [i for i in urls if 'twitter' in i] or [np.nan]
amazon = [i for i in urls if 'amazon' in i] or [np.nan]
row['facebook'] = facebook
row['instagram'] = instagram
row['linkedin'] = linkedin
row['twitter'] = twitter
row['amazon'] = amazon
return row
elif row['api_Status'] == 'No Data Found':
# do nothing
return row
expected output:
my_customers email api_status api_response job_1 job_2 \
0 John email#gmail.com data found huge json xyz xyz2
1 Foo othermail#yahoo.com no data found huge json nan nan
education_1 facebook other api info
0 foo profile1 etc
1 nan nan nan
You could adjust the order of columns in your DataFrame after running the apply function. For example:
df = df.apply(func, axis = 1)
df = df[['my_customers', 'email', 'other_column', 'new_column']]
To reduce the amount of duplication (i.e. by having to retype all column names), you could get the existing set of columns before calling the apply function:
columns = list(df.columns)
df = df.apply(func, axis = 1)
df = df[columns + ['new_column']]
Update based on the author's edits to the original question. Whilst I'm not sure if the data structure chosen (storing API results in a Data Frame) is the best option, one simple solution could be to extract the new columns after calling the apply functions.
# Store the existing columns before calling apply
existing_columns = list(df.columns)
df = df.apply(func, axis = 1)
all_columns = list(df.columns)
new_columns = [column for column in all_columns if column not in existing_columns]
df = df[columns + new_columns]
For performance optimisations, you could store the existing columns in a set instead of a list which will yield lookups in constant time due to the hashed nature of a set data structure in Python. This would change existing_columns = list(df.columns) to existing_columns = set(df.columns).
Finally, as #Parfait very kindly points out in their comment, the code above may raise some depreciation warnings. Using pandas.DataFrame.reindex instead of df = df[columns + new_columns] will make the warnings disappear:
new_columns_order = [columns + new_columns]
df = df.reindex(columns=new_columns_order)
That occurs because you don't assign a value to the new column if row["other_column"] != 'yes'. Just try this:
def func(row):
if row['other_column'] == 'yes':
row['new_column'] = 'Hello'
return row
else:
row['new_column'] = ''
return row
df.apply(func, axis = 1)
You can choose the value for row["new_column"] == 'no' to be whatever. I just left it blank.
Related
I am trying to output a pandas dataframe and I am only getting one column (PADD 5) instead of (PADD 1 through PADD 5). In addition, I cannot get the index to format in YYYY-MM-DD. I would appreciate if anyone knew how to output these two things. Thanks much!
# API Key from EIA
api_key = 'xxxxxxxxxxx'
# api_key = os.getenv("EIA_API_KEY")
# PADD Names to Label Columns
# Change to whatever column labels you want to use.
PADD_NAMES = ['PADD 1','PADD 2','PADD 3','PADD 4','PADD 5']
# Enter all your Series IDs here separated by commas
PADD_KEY = ['PET.MCRRIP12.M',
'PET.MCRRIP22.M',
'PET.MCRRIP32.M',
'PET.MCRRIP42.M',
'PET.MCRRIP52.M']
# Initialize list - this is the final list that you will store all the data from the json pull. Then you will use this list to concat into a pandas dataframe.
final_data = []
# Choose start and end dates
startDate = '2009-01-01'
endDate = '2023-01-01'
for i in range(len(PADD_KEY)):
url = 'https://api.eia.gov/series/?api_key=' + api_key + '&series_id=' + PADD_KEY[i]
r = requests.get(url)
json_data = r.json()
if r.status_code == 200:
print('Success!')
else:
print('Error')
print(json_data)
df = pd.DataFrame(json_data.get('series')[0].get('data'),
columns = ['Date', PADD_NAMES[i]])
df.set_index('Date', drop=True, inplace=True)
final_data.append(df)
# Combine all the data into one dataframe
crude = pd.concat(final_data, axis=1)
# Create date as datetype datatype
crude['Year'] = crude.index.astype(str).str[:4]
crude['Month'] = crude.index.astype(str).str[4:]
crude['Day'] = 1
crude['Date'] = pd.to_datetime(crude[['Year','Month','Day']])
crude.set_index('Date',drop=True,inplace=True)
crude.sort_index(inplace=True)
crude = crude[startDate:endDate]
crude = crude.iloc[:,:5]
df.head()
PADD 5
Date
202201 1996
202112 2071
202111 2125
202110 2128
202109 2232
I am trying to solve this problem
1.Create a function 'wbURL' that takes an indicator, a country code, and begin and end years to creates a URL of the following format:
http://api.worldbank.org/countries/COUNTRYCODE/indicators/INDICATORformat=json&date=BEGIN_DATE:END_DATE
Create another function 'wbDF' that takes the same inputs and returns a dataframe constructed from the response (the response will come as a JSON list, and the 1st (i.e., index 1) element of this list contains the relevant data. Extract this element, which is a list of dictionaries---this is what you want to construct the dataframe out of.Drop all columns except indicator country date and value. For the country and value columns, notice that the data is itself a dictionary: use apply to extract the value out of these dictionaries.
This is the code I wrote out:
def wbURL(contcode, ind, begin, end):
return f'http://api.worldbank.org/countries{contcode}/indicators/{ind}?format=json&date={begin}:{end}'
def wbDF(contcode, ind, begin, end):
url = wbURL(contcode, ind, begin, end)
response = requests.get(url)
wb_raw = response.content
wb = json.loads(wb_raw)
data = wb[1]
df = pd.DataFrame(data)
df = df.drop(columns=['countryiso3code', 'unit', 'obs_status', 'decimal'])
df['indicator'] = [d['id'] for d in [d['indicator'] for d in data]]
df['country'] = [d['value'] for d in [d['country'] for d in data]]
df['date'] = [d['date'] for d in data]
df['value'] = [d['value'] for d in data]
return df
test = wbDF('GBR', 'SP.DYN.LE00.IN', 2000, 2019)
print(test)
When I ren it, i got an error:
IndexError: list index out of range
I would like someone to have a look at the code I have written and give me some advice on how I should change it to make ir working
I have a code which iterate through excel and extract values from excel columns as loaded as list in dataframe. When I write dataframe to excel, I am seeing data with in [] and quotes for string ['']. How can I remove [''] when I write to excel.
Also I want to write only first value in product ID column to excel. how can I do that?
result = pd.DataFrame.from_dict(result) # result has list of data
df_t = result.T
writer = pd.ExcelWriter(path)
df_t.to_excel(writer, 'data')
writer.save()
My output to excel
I am expecting output as below and Product_ID column should only have first value in list
I tried below and getting error
path = path to excel
df = pd.read_excel(path, engine="openpyxl")
def data_clean(x):
for index, data in enumerate(x.values):
item = eval(data)
if len(item):
x.values[index] = item[0]
else:
x.values[index] = ""
return x
new_df = df.apply(data_clean, axis=1)
new_df.to_excel(path)
I am getting below error:
item = eval(data)
TypeError: eval() arg 1 must be a string, bytes or code object
df_t['id'] = df_t['id'].str[0] # this is a shortcut for if you only want the 0th index
df_t['other_columns'] = df_t['other_columns'].apply(lambda x: " ".join(x)) # this is to "unlist" the lists of lists which you have fed into a pandas column
This should be the effect you want, but you have to make sure that the data in each cell is ['', ...] form, and if it's different you can modify the way it's handled in the data_clean function:
import pandas as pd
df = pd.read_excel("1.xlsx", engine="openpyxl")
def data_clean(x):
for index, data in enumerate(x.values):
item = eval(data)
if len(item):
x.values[index] = item[0]
else:
x.values[index] = ""
return x
new_df = df.apply(data_clean, axis=1)
new_df.to_excel("new.xlsx")
The following is an example of df and modified new_df(Some randomly generated data):
# df
name Product_ID xxx yyy
0 ['Allen'] ['AF124', 'AC12414'] [124124] [222]
1 ['Aaszflen'] ['DF124', 'AC12415'] [234125] [22124124,124125]
2 ['Allen'] ['CF1sdv24', 'AC12416'] [123544126] [33542124124,124126]
3 ['Azdxven'] ['BF124', 'AC12417'] [35127] [333]
4 ['Allen'] ['MF124', 'AC12418'] [3528] [12352324124,124128]
5 ['Allen'] ['AF124', 'AC12419'] [122359] [12352324124,124129]
# new_df
name Product_ID xxx yyy
0 Allen AF124 124124 222
1 Aaszflen DF124 234125 22124124
2 Allen CF1sdv24 123544126 33542124124
3 Azdxven BF124 35127 333
4 Allen MF124 3528 12352324124
5 Allen AF124 122359 12352324124
I have few static key columns EmployeeId,type and few columns coming from first FOR loop.
While in the second FOR loop if i have a specific key then only values should be appended to the existing data frame columns else whatever the columns getting fetched from first for loop should remain same.
First For Loop Output:
"EmployeeId","type","KeyColumn","Start","End","Country","Target","CountryId","TargetId"
"Emp1","Metal","1212121212","2000-06-17","9999-12-31","","","",""
After Second For Loop i have below output:
"EmployeeId","type","KeyColumn","Start","End","Country","Target","CountryId","TargetId"
"Emp1","Metal","1212121212","2000-06-17","9999-12-31","","AMAZON","1",""
"Emp1","Metal","1212121212","2000-06-17","9999-12-31","","FLIPKART","2",""
As per code if i have Employee tag available , i have got above 2 records but i may have few json files without Employee tag then output should remain same as per First Loop Output.
But i am getting 0 records as per my code. Please help me if my way of coding is wrong.
Really sorry -- If the way of asking question is not clear , as i am new to python . Please find the code in the below hyper link
Please find below code
for i in range(len(json_file['enty'])):
temp = {}
temp['EmployeeId'] = json_file['enty'][i]['id']
temp['type'] = json_file['enty'][i]['type']
for key in json_file['enty'][i]['data']['attributes'].keys():
try:
temp[key] = json_file['enty'][i]['data']['attributes'][key]['values'][0]['value']
except:
temp[key] = None
for key in json_file['enty'][i]['data']['attributes'].keys():
if(key == 'Employee'):
for j in range(len(json_file['enty'][i]['data']['attributes']['Employee']['group'])):
for key in json_file['enty'][i]['data']['attributes']['Employee']['group'][j].keys():
try:
temp[key] = json_file['enty'][i]['data']['attributes']['Employee']['group'][j][key]['values'][0]['value']
except:
temp[key] = None
temp_df = pd.DataFrame([temp])
df = pd.concat([df, temp_df], sort=True)
# Rearranging columns
df = df[['EmployeeId', 'type'] + [col for col in df.columns if col not in ['EmployeeId', 'type']]]
# Writing the dataset
df[columns_list].to_csv("Test22.csv", index=False, quotechar='"', quoting=1)
If Employee Tag is not available i am getting 0 records as output but i am expecting 1 record as per output of first FOR loop. If the "Employee tag" is available then i am expecting 2 records along with my static columns "EmployeeId","type","KeyColumn","Start","End", else if the tag is not available then all the static columns "EmployeeId","type","KeyColumn","Start","End", and remaining columns as blanks
enter link description here
A long solution with modifying your code, so adding one more loop, changing indexes, as well as modifying the range parameters:
df = pd.DataFrame()
num = max([len(v) for k,v in json_file['data'][0]['data1'].items()])
for i in range(num):
temp = {}
temp['Empid'] = json_file['data'][0]['Empid']
temp['Empname'] = json_file['data'][0]['Empname']
for key in json_file['data'][0]['data1'].keys():
if key not in temp:
temp[key] = []
try:
for j in range(len(json_file['data'][0]['data1'][key])):
temp[key].append(json_file['data'][0]['data1'][key][j]['relative']['id'])
except:
temp[key] = None
temp_df = pd.DataFrame([temp])
df = pd.concat([df, temp_df],ignore_index=True)
for i in json_file['data'][0]['data1'].keys():
df[i] = pd.Series([x for y in df[i].tolist() for x in y]).drop_duplicates()
And now:
print(df)
Is:
Empid Empname XXXX YYYYY
0 1234 ABC Naveen Kumar
1 1234 ABC NaN Rajesh
I have a pandas matrix df of the form:
user_id time url
4 20140502 'w.lejournal.fr/actualite/politique/sarkozy-terminator_1557749.html',
7 20140307 'w.lejournal.fr/palmares/palmares-immobilier/'
10 20140604 'w.lejournal.fr/actualite/societe/adeline-hazan-devient-la-nouvelle-controleuse-des-lieux-de-privation-de-liberte_1558176.html'
etc...
I want to use the groupby function to group by user, then to make some statistics of the words appearing in the urls of each user, for example, get how many times there is the world 'actualite' in a user urls.
For now, my code is:
def my_stat_function(temp_set):
res = 0
for (u,t) in temp_set:
if 'actualite' in u and t > 20140101:
res += 1
return res
group_user = df.groupby('user_id')
output_list = []
for (i,group) in group_user:
dfg = pandas.DataFrame(group)
temp_set = [tuple(x) for x in dfg[['url','time']].values]
temp_var = my_stat_function(temp_set)
output_list.append([i]+[temp_var])
outputDf = pandas.DataFrame(data = output_list, columns = ['user_id','stat'])
My question is: can I avoid to iterate group by group to apply my_stat_function, and is there exist something faster, maybe applying the function apply? I would really like something more "pandas-ish' and faster.
Thank you for your help.