I'm trying to split a dataframe with a certain logic.
Here's my attempt:
def split_df(df: pd.DataFrame):
    """Split *df* into (train, valid).

    Every processed row goes to ``train``; the first row seen for each
    ``category_id`` also goes to ``valid``.

    Fixes to the original:
      * ``DataFrame.append`` returns a new frame (and was removed in
        pandas 2.0) — the results were silently discarded, which is why
        both frames came back empty.  Rows are collected in plain lists
        and materialised once at the end instead (also avoids the
        quadratic repeated-append pattern).
      * The one-row frame built via ``pd.DataFrame(columns=..., data=Series)``
        was misaligned; collecting the row Series directly is correct.
      * The ``tqdm`` wrapper was dropped (third-party, display-only).
    """
    train_rows = []
    valid_rows = []
    seen_categories = set()  # category_ids already represented in valid
    for i, (_, row) in enumerate(df.iterrows()):
        if i > 10:  # debug cap from the question; remove to process the whole frame
            break
        train_rows.append(row)
        # First occurrence of a category also goes to the validation set.
        if row['category_id'] not in seen_categories:
            seen_categories.add(row['category_id'])
            valid_rows.append(row)
    train = pd.DataFrame(train_rows, columns=df.columns).reset_index(drop=True)
    valid = pd.DataFrame(valid_rows, columns=df.columns).reset_index(drop=True)
    return (train, valid)
When I run this, I get a tuple of empty dataframes.
the i<10 is just for me to check outputs.
The splitting logic may be wrong, but it's not important for now.
I also try to avoid for loops, so if there's a better approach to this problem, I'll be glad to read about it.
Append does not modify the dataframe in place, so you need to reassign your variable to keep changes:
train = train.append(tmp,ignore_index=True)
valid = valid.append(tmp,ignore_index=True)
Related
community,
I have a sorted pandas dataframe that looks as following:
I want to merge rows that have overlapping values in the start and end columns. Meaning that if the end value of a row is bigger than the start value of the next row (or any other subsequent row), they should be merged into one row. Examples are rows 3, 4 and 5. The output I would expect is:
To do so, I am trying to implement a recursive function that would loop over the dataframe until the condition works, and then return a number that would be used to find the location of the end row.
However, the function I am trying to implement returns an empty dataframe. Could you please help me see where I should pay attention, or what alternative I can build if recursion is not a solution?
def row_merger(pd_df):
    """Attempt to merge rows with overlapping [q.start, q.end] ranges.

    NOTE(review): this is the broken version the question describes — it
    returns an empty DataFrame.  Kept as-is for reference; the defects are:
      * ``recursion_inside`` is defined on every loop pass but never
        called, so ``counter`` stays 0;
      * inside it, ``counter = 0`` discards the passed-in argument, and
        the recursive call's return value is thrown away (the recursive
        branch implicitly returns None);
      * ``new_df.append(...)`` does not modify ``new_df`` in place — the
        result must be reassigned (``new_df = new_df.append(...)``),
        which is why ``new_df`` stays empty;
      * the merged row's "q.end" is taken from ``q.start`` of the later
        row, not its ``q.end``.
    See the working rewrite below.
    """
    counter = 0
    new_df = pd.DataFrame(columns=pd_df.columns)
    for i in range(len(pd_df) - 1):
        def recursion_inside(pd_df, counter = 0):
            counter = 0
            if pd_df.iloc[i + 1 + counter]["q.start"] <= pd_df.iloc[i]["q.end"]:
                counter = counter+1
                recursion_inside(pd_df, counter)
            else:
                return counter
        new_row = {"name": pd_df["name"][i], "q.start": pd_df.iloc[i]
                   ["q.start"], "q.end": pd_df.iloc[i+counter]["q.start"]}
        new_df.append(new_row, ignore_index=True)
    return new_df
I don't see the benefit of using recursion here, so I would just iterate over the rows instead, building up the rows for the output dataframe one by one, e.g. like this:
def row_merger(df_in):
    """Collapse consecutive rows whose intervals overlap.

    Assumes ``df_in`` is sorted by its start column and that columns are
    positional: [0]=name, [1]=start, [2]=end.  A row is merged into the
    running interval when its start is <= the running end; the running
    end is extended to the larger of the two ends.  Returns a new
    DataFrame with the same columns.
    """
    if len(df_in) <= 1:
        return df_in
    merged = []
    current = list(df_in.iloc[0])
    for pos in range(1, len(df_in)):
        candidate = list(df_in.iloc[pos])
        if candidate[1] > current[2]:
            # Disjoint: close out the running interval and start a new one.
            merged.append(current)
            current = candidate
        else:
            # Overlap: extend the running interval's end.
            current[2] = max(current[2], candidate[2])
    merged.append(current)
    return pd.DataFrame(merged, columns=df_in.columns)
I have some strings in a column and I want to explode the words out only if they are not within brackets. The column looks like this
pd.DataFrame(data={'a': ['first,string','(second,string)','third,string (another,string,here)']})
and I want the output to look like this
pd.DataFrame(data={'a': ['first','string','(second,string)','third','string','(another,string,here)']})
This sort of works, but i would like to not have to put the row number in each time
re.split(r',(?![^()]*\))', x['a'][0])
re.split(r',(?![^()]*\))', x['a'][1])
re.split(r',(?![^()]*\))', x['a'][2])
I thought I could do this with a lambda function, but I cannot get it to work. Thanks for checking this out.
x['a'].apply(lambda i: re.split(r',(?![^()]*\))', i))
It is not clear to me if the elements in your DataFrame may have multiple groups between brackets. Given that doubt, I have implemented the following:
import pandas as pd
import re
# Sample data from the question.
df = pd.DataFrame(data={'a': ['first,string','(second,string)','third,string (another,string,here)']})

# Three groups: text before a bracket group, one optional '(...)' group,
# and the remainder of the string (handled by recursion in findall()).
# Raw string avoids invalid-escape warnings.  The bracket group is written
# as \([^)]*\) (instead of the original fully-greedy \(?.*\)?) so that a
# remainder can actually be left over when a cell contains text after a
# bracket group — otherwise group 3 was always empty and the recursion
# was dead code.
pattern = re.compile(r"([^(]*)(\([^)]*\))?(.*)", re.IGNORECASE)

def findall(ar, res=None):
    """Split *ar* on commas, keeping '(...)'-bracketed chunks whole.

    Appends the pieces to *res* (a fresh list when None) and returns it.
    Fixes the original bug where the recursive call used ``ar[2]`` — the
    third *character* of the input — instead of the unmatched remainder
    ``m[2]``.
    """
    if res is None:
        res = []
    m = pattern.findall(ar)[0]
    if len(m[0]) > 0:
        res.extend(m[0].split(","))
    if len(m[1]) > 0:
        res.append(m[1])
    if len(m[2]) > 0:
        if m[2] == ar:
            # No progress possible (e.g. an unbalanced '(') — keep the
            # remainder verbatim instead of recursing forever.
            res.append(m[2])
            return res
        return findall(m[2], res=res)
    return res

# Explode every cell and collect the pieces into one flat column.
res = []
for x in df["a"]:
    res.extend(findall(x))
print(pd.DataFrame(data={"a": res}))
Essentially, you recursively scan the last part of the match until you find no more words between strings. If order was not an issue, the solution is easier.
I am trying to fill records one column based on some condition but I am not getting the result. Can you please help me how to do this?
Example:
df:
applied_sql_function1 and_or_not_oprtor_pre comb_fld_order_1
CASE WHEN
WHEN AND
WHEN AND
WHEN
WHEN AND
WHEN OR
WHEN
WHEN dummy
WHEN dummy
WHEN
Expected Output:
applied_sql_function1 and_or_not_oprtor_pre comb_fld_order_1 new
CASE WHEN CASE WHEN
WHEN AND
WHEN AND
WHEN WHEN
WHEN AND
WHEN OR
WHEN WHEN
WHEN dummy
WHEN dummy
WHEN WHEN
I have written some logic for this but it is not working:
# Initialise the output column, then try to fill it row by row.
df_main1['new'] =''
# NOTE(review): iterrows() yields a *copy* of each row, so the
# `row['new'] = new` assignment at the bottom never writes back into
# df_main1 — which is why the 'new' column stays empty.  The vectorised
# answers below (loc with a mask / np.where) are the fix.
for index,row in df_main1.iterrows():
    new = ''
    # Both operator columns blank -> carry the SQL function name over.
    if((str(row['applied_sql_function1']) != '') and (str(row['and_or_not_oprtor_pre']) == '') and (str(row['comb_fld_order_1']) == '')):
        new += str(row['applied_sql_function1'])
        print(new)
    # Operator present -> leave 'new' empty (both branches append '').
    if(str(row['applied_sql_function1']) != '') and (str(row['and_or_not_oprtor_pre']) != ''):
        new += ''
        print(new)
    else:
        new += ''
    row['new'] = new  # writes to the copy, not the frame (see note above)
print(df_main1['new'])
Using `loc`:
# Boolean mask: rows where both operator columns are blank.  fillna("")
# first, so NaN and '' are treated the same way.
mask = df.and_or_not_oprtor_pre.fillna("").eq("") \
    & df.comb_fld_order_1.fillna("").eq("")
# Copy the function name into 'new' only on the masked rows.
df.loc[mask, 'new'] = df.loc[mask, 'applied_sql_function1']
try this one, it would work in a quick way
# Row labels where both operator columns are missing, then fill 'new' for
# just those rows.  NOTE(review): isna() matches real NaN only — blanks
# stored as '' will NOT match; confirm how missing values are encoded.
indexes = df.index[(df['and_or_not_oprtor_pre'].isna()) & (df['comb_fld_order_1'].isna())]
df.loc[indexes, 'new'] = df.loc[indexes, 'applied_sql_function1']
Go with np.where all the way! It's easy to understand and vectorized, so the performance is good on really large datasets.
import pandas as pd, numpy as np
# Vectorised: where both operator columns are empty strings, copy
# applied_sql_function1 into 'new'; otherwise keep the '' default.
# (Assumes blanks are '' rather than NaN — TODO confirm; fillna('')
# first if the frame holds NaN.)
df['new'] = ''
df['new'] = np.where((df['and_or_not_oprtor_pre'] == '') & (df['comb_fld_order_1'] == ''), df['applied_sql_function1'], df['new'])
df  # displays the frame when run in a notebook/REPL
I have a list of DataFrames i.e data = [df1,df2,df3.....dfn] . I am trying to iterate function maxloc through list of data and appending new values to new_max. It gives me following error
TypeError: 'int' object is not iterable. How can I fix it?
def max(data):
    """Flag local maxima of data['value'] in a new 'loc_max' column.

    A point is flagged (1.0) when it is >= both neighbours; the first and
    last points are never flagged.  Mutates *data* and returns it.

    NOTE(review): this shadows the builtin ``max()``; the name is kept
    only to preserve the original interface.

    Fixes to the original:
      * ``data['loc_max'][i] = 1`` is chained indexing and may write to a
        temporary copy (SettingWithCopy) — the flags are built in a plain
        numpy array and assigned once instead;
      * ``data['value'][i]`` is *label* indexing and silently assumes a
        RangeIndex — positional access via a numpy array is used instead.
    """
    values = data['value'].to_numpy()
    flags = np.zeros(len(data))
    for i in range(1, len(data) - 1):
        if values[i] >= values[i - 1] and values[i] >= values[i + 1]:
            flags[i] = 1
    data['loc_max'] = flags
    return data
def maxloc(data):
    """Flag strict local maxima of data['values'] in a new 'loc_max' column.

    Uses scipy's ``argrelextrema`` with ``np.greater`` (strictly greater
    than both neighbours).  Mutates *data* and returns it.

    Fixes the original bug of reading the *global* ``df`` instead of the
    ``data`` argument — every call silently operated on whatever ``df``
    happened to be in scope rather than the frame passed in.
    """
    loc_opt_ind = argrelextrema(data['values'].to_numpy(), np.greater)
    loc_max = np.zeros(len(data))
    loc_max[loc_opt_ind] = 1
    data['loc_max'] = loc_max
    return data
new_max= []
# BUG(review): range(len(data)) yields the integers 0..n-1, so `df` below
# is an int, not a DataFrame — maxloc(df) then fails with the reported
# TypeError.  Iterate the list itself (`for df in data:`), as the answer
# below explains.
for df in range(len(data)):
    max_values = maxloc(df).loc_max
    new_max.append(max_values)
When you use:
for df in range(len(data)):
# your loop
your `df` is just an integer, so you should use this loop instead:
for df in data:
# your loop
So here is my code updating many column values based on a condition of split values of the column 'location'. The code works fine, but as its iterating by row it's not efficient enough. Can anyone help me to make this code work faster please?
# NOTE(review): Python 2 code (`print index`).  Besides being slow, every
# `df[index, 'col'] = ...` below is a bug: indexing a DataFrame with a
# tuple creates a new COLUMN literally named (index, 'col') instead of
# setting one cell — df.loc[index, 'col'] is what was intended.  The
# map()-based rewrite below is the accepted fix for the county column.
for index, row in df.iterrows():
    print index
    location_split =row['location'].split(':')
    after_county=False
    after_province=False
    for l in location_split:
        if l.strip().endswith('ED'):
            df[index, 'electoral_district'] = l
        elif l.strip().startswith('County'):
            df[index, 'county'] = l
            after_county = True
        elif after_province ==True:
            # Only Dublin segments that are not the trailing 'Ireland'.
            if l.strip()!='Ireland':
                df[index, 'dublin_postal_district'] = l
        elif after_county==True:
            df[index, 'province'] = l.strip()
            after_province = True
'map' was what I needed :)
def fill_county(column):
    """Return the first ':'-separated segment of *column* that starts with
    'County' (whitespace-stripped), or '' when no segment matches."""
    for segment in column.split(':'):
        stripped = segment.strip()
        if stripped.startswith('County'):
            return stripped
    return ''
df['county'] = map(fill_county, df['location'])