Recursively merging rows of a pandas DataFrame based on a condition - Python

Community,
I have a sorted pandas DataFrame that looks like the following:
I want to merge rows that have overlapping values in the start and end columns. That is, if the end value of a row is greater than the start value of the next row (or of any later row), they should be merged into one row. Examples are rows 3, 4 and 5. The output I would expect is:
To do so, I am trying to implement a recursive function that loops over the DataFrame until the condition holds, and then returns a number I can use to locate the end row.
However, the function I am trying to implement returns an empty DataFrame. Could you please point out what I should pay attention to, or what alternative I could build if recursion is not the solution?
def row_merger(pd_df):
    counter = 0
    new_df = pd.DataFrame(columns=pd_df.columns)
    for i in range(len(pd_df) - 1):
        def recursion_inside(pd_df, counter=0):
            counter = 0
            if pd_df.iloc[i + 1 + counter]["q.start"] <= pd_df.iloc[i]["q.end"]:
                counter = counter + 1
                recursion_inside(pd_df, counter)
            else:
                return counter
        new_row = {"name": pd_df["name"][i], "q.start": pd_df.iloc[i]["q.start"],
                   "q.end": pd_df.iloc[i + counter]["q.start"]}
        new_df.append(new_row, ignore_index=True)
    return new_df

I don't see the benefit of using recursion here, so I would just iterate over the rows instead, building up the rows for the output dataframe one by one, e.g. like this:
def row_merger(df_in):
    if len(df_in) <= 1:
        return df_in
    rows_out = []
    current_row = df_in.iloc[0].values
    for next_row in df_in.iloc[1:].values:
        if next_row[1] > current_row[2]:
            # no overlap: emit the row built so far and start a new one
            rows_out.append(current_row)
            current_row = next_row
        else:
            # overlap: extend the current row's end to cover the next row
            current_row[2] = max(current_row[2], next_row[2])
    rows_out.append(current_row)  # don't forget the final row
    return pd.DataFrame(rows_out, columns=df_in.columns)
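For reference, here is a quick check on a small made-up frame using the column names from the question (the data itself is my assumption, since the original sample did not survive the post):

import pandas as pd

# hypothetical sample; the last three rows overlap pairwise
df = pd.DataFrame({
    "name": ["a", "a", "a", "a", "a"],
    "q.start": [1, 10, 20, 25, 28],
    "q.end": [5, 15, 27, 30, 40],
})
print(row_merger(df))
# rows (20, 27), (25, 30) and (28, 40) collapse into a single (20, 40) row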

Related

split string row wise with condition in python

I have some strings in a column and I want to explode the words out only if they are not within brackets. The column looks like this
pd.DataFrame(data={'a': ['first,string','(second,string)','third,string (another,string,here)']})
and I want the output to look like this
pd.DataFrame(data={'a': ['first','string','(second,string)','third','string','(another,string,here)']})
This sort of works, but I would like not to have to put the row number in each time:
re.split(r',(?![^()]*\))', x['a'][0])
re.split(r',(?![^()]*\))', x['a'][1])
re.split(r',(?![^()]*\))', x['a'][2])
I thought I could do it with a lambda function, but I cannot get it to work. Thanks for checking this out:
x['a'].apply(lambda i: re.split(r',(?![^()]*\))', i))
It is not clear to me whether the elements in your DataFrame can contain multiple bracketed groups. To cover that case, I have implemented the following:
import pandas as pd
import re

df = pd.DataFrame(data={'a': ['first,string', '(second,string)', 'third,string (another,string,here)']})
pattern = re.compile(r"([^\(]*)([\(]?.*[\)]?)(.*)", re.IGNORECASE)

def findall(ar, res=None):
    if res is None:
        res = []
    m = pattern.findall(ar)[0]
    if len(m[0]) > 0:
        # text before any bracketed group: split it on commas
        res.extend(m[0].split(","))
    if len(m[1]) > 0:
        # a bracketed group is kept as a single element
        res.append(m[1])
    if len(m[2]) > 0:
        # recurse on the remainder of the match
        return findall(m[2], res=res)
    else:
        return res

res = []
for x in df["a"]:
    res.extend(findall(x))
print(pd.DataFrame(data={"a": res}))
Essentially, you recursively scan the remainder of the match until no text is left after the bracketed group. If order were not an issue, the solution would be easier.
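As a side note, the regex from the question already does the job row-wise; combining it with apply and Series.explode gives a non-recursive alternative. A sketch, assuming pandas >= 0.25 (where explode was introduced); it reproduces exactly the splits the asker's re.split calls produce:

import re
import pandas as pd

df = pd.DataFrame(data={'a': ['first,string', '(second,string)', 'third,string (another,string,here)']})
# split on commas that are not inside parentheses, then flatten the lists
out = df['a'].apply(lambda s: re.split(r',(?![^()]*\))', s)).explode().reset_index(drop=True)
print(out)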

Create a new data frame by subsetting all the data frames in a dictionary using a loop

I have a dictionary of data frames:
two_season_bucket_suffixes = {'two_season_bucket_year1_racer_bio':two_season_bucket_year1_racer_bio,
'two_season_bucket_year1_spring_rate':two_season_bucket_year1_spring_rate,
'two_season_bucket_year1_neaps_rate':two_season_bucket_year1_neaps_rate,
'two_season_bucket_year1_spring_raw':two_season_bucket_year1_spring_raw,
'two_season_bucket_year1_neap_raw':two_season_bucket_year1_neap_raw,
'two_season_bucket_year1_opposing_team':two_season_bucket_year1_opposing_team,
'two_season_bucket_year1_opposing_team_distribution':two_season_bucket_year1_opposing_team_distribution,
'two_season_bucket_year1_stern_score':two_season_bucket_year1_stern_score,
'two_season_bucket_year1_bow_score':two_season_bucket_year1_bow_score,
'two_season_bucket_year1_team_score':two_season_bucket_year1_team_score,
'two_season_bucket_year2_racer_bio':two_season_bucket_year2_racer_bio,
'two_season_bucket_year2_spring_rate':two_season_bucket_year2_spring_rate,
'two_season_bucket_year2_neaps_rate':two_season_bucket_year2_neaps_rate,
'two_season_bucket_year2_spring_raw':two_season_bucket_year2_spring_raw,
'two_season_bucket_year2_neap_raw':two_season_bucket_year2_neap_raw,
'two_season_bucket_year2_opposing_team':two_season_bucket_year2_opposing_team,
'two_season_bucket_year2_opposing_team_distribution':two_season_bucket_year2_opposing_team_distribution,
'two_season_bucket_year2_stern_score':two_season_bucket_year2_stern_score,
'two_season_bucket_year2_bow_score':two_season_bucket_year2_bow_score,
'two_season_bucket_year2_team_score':two_season_bucket_year2_team_score}
They all have different columns, but each of them has at least one column whose name starts with the word 'prediction'.
I would like to create a new DataFrame out of every column that starts with 'prediction':
two_season_bucket_prediction = pd.DataFrame()
counter = 0
for key, val in two_season_bucket_suffixes.items():
    if counter == 0:
        two_season_bucket_prediction = val[val.columns[pd.Series(val.columns).str.startswith('prediction')]]
    else:
        two_season_bucket_prediction = two_season_bucket_prediction.join(val[val.columns[pd.Series(val.columns).str.startswith('prediction')]])
        counter += 1
Every time I just end up with a DataFrame containing a single prediction column, even though I am sure none of the columns is named just 'prediction'.
You are incrementing the counter in a spot that is never reached: it sits inside the else branch, but since counter stays 0, the if branch is taken on every iteration. Move the increment into the loop body itself:
two_season_bucket_prediction = pd.DataFrame()
counter = 0
for key, val in two_season_bucket_suffixes.items():
    if counter == 0:
        two_season_bucket_prediction = val[val.columns[pd.Series(val.columns).str.startswith('prediction')]]
    else:
        two_season_bucket_prediction = two_season_bucket_prediction.join(val[val.columns[pd.Series(val.columns).str.startswith('prediction')]])
    counter += 1
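For what it's worth, the counter bookkeeping can be dropped entirely. A sketch using pd.concat, assuming the frames share a compatible index (which the join-based loop already requires):

import pandas as pd

# pull the 'prediction*' columns from every frame and join them side by side
two_season_bucket_prediction = pd.concat(
    [val.loc[:, val.columns.str.startswith('prediction')]
     for val in two_season_bucket_suffixes.values()],
    axis=1,
)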

Pandas append series from tuple into empty dataframe

I'm trying to split a dataframe with a certain logic.
Here's my attempt:
def split_df(df: pd.DataFrame):
    train = pd.DataFrame(columns=df.columns)
    valid = pd.DataFrame(columns=df.columns)
    i = 0
    for data in tqdm(df.iterrows()):
        if i > 10:
            break
        if len(valid[valid['category_id'] == data[1]['category_id']]) > 0:
            tmp = pd.DataFrame(columns=df.columns, data=pd.Series(data[1]))
            train.append(tmp, ignore_index=True)
            i = i + 1
        else:
            tmp = pd.DataFrame(columns=df.columns, data=pd.Series(data[1]))
            train.append(tmp, ignore_index=True)
            valid.append(tmp, ignore_index=True)
            i = i + 1
    return (train, valid)
When I run this, I get a tuple of empty dataframes.
The i > 10 break is just there so I can check the outputs on a few rows.
The splitting logic may be wrong, but it's not important for now.
I also try to avoid for loops, so if there's a better approach to this problem, I'll be glad to read about it.
Append does not modify the dataframe in place, so you need to reassign your variable to keep changes:
train = train.append(tmp,ignore_index=True)
valid = valid.append(tmp,ignore_index=True)
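Also worth noting: DataFrame.append is deprecated (and removed in pandas 2.0), and appending row by row copies the frame on every call. A sketch of the same first-occurrence split that collects rows in plain lists and builds each frame once at the end:

import pandas as pd

def split_df(df: pd.DataFrame):
    train_rows, valid_rows = [], []
    seen = set()
    for _, row in df.iterrows():
        if row['category_id'] in seen:
            # category already represented in valid: row goes to train only
            train_rows.append(row)
        else:
            # first occurrence of this category: row goes to both splits
            seen.add(row['category_id'])
            train_rows.append(row)
            valid_rows.append(row)
    return pd.DataFrame(train_rows, columns=df.columns), pd.DataFrame(valid_rows, columns=df.columns)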

Create a function that uses a list as an argument to extract rows from a CSV

I'm trying to pass a list as an argument to a function that grabs a row from a CSV if the row contains a string from the list provided. I can't get the index on itemA to change; it only prints the last item of the list!
GAS = [
    "SUNOCO",
    "CUMBERLAND",
    "MOBIL",
]
gasLength = len(GAS)
print(gasLength)
def parseData(csvToParse=transactionsCSV, itemA="", itemB=""):
    # for loop to append to CSV
    for row in csvToParse:
        if itemA in row[3]:
            csv_personA.writerow([row[0], row[1], row[2], row[3], row[4], row[5]])
            print(row[3])
            print(itemA)
        elif itemB in row[3]:
            csv_personB.writerow([row[0], row[1], row[2], row[3], row[4], row[5]])

# This was suggested but still only returns the GAS index of 0
for counter, _ in enumerate(range(gasLength)):
    parseData(csvToParse=transactionsCSV, itemA=GAS[counter], itemB="")

for _ in range(gasLength):
    x = gasLength - 1
    parseData(csvToParse=transactionsCSV, itemA=GAS[x], itemB="")
# My first attempt is below!!!
# Get gas purchases
def parseGasStations():
    x = 0
    itemsToCheck = row_count * gasLength
    print(itemsToCheck)
    # while x is less than the total number of rows in the main csv
    # times the number of items in the gas list
    while x < itemsToCheck:
        a = 0
        y = 0
        # while a is less than the total number of rows in the main csv
        while a < row_count:
            print(GAS[y])
            for _ in range(gasLength):
                parseData(csvToParse=transactionsCSV, itemA=GAS[gasLength - 1], itemB="")
            if y != gasLength - 1:
                y += 1
            elif y == gasLength - 1:
                y = 0
            a += 1
        x += 1

parseGasStations()
[screenshot: csv output]
The output is only appending the MOBIL stations to the CSV and not indexing through the list like I thought it would.
So, if you want to use a numeric iteration counter with your iteration, you could do the following.
for counter, _ in enumerate(range(gasLength)):
    parseData(csvToParse=transactionsCSV, itemA=GAS[counter], itemB="")
Enumerate returns a tuple containing a counter and the element itself.
Thanks to Fluxens I was able to figure this out!
Here's a function that takes a list as a parameter and indexes through all the items!
GAS = (
    "SUNOCO",
    "CUMBERLAND",
    "MOBIL",
    "BESTWAY",
    "AMORE FUEL",
)
gasLength = len(GAS)

def parseData(csvToParse="", catagory=(), catagorySize=""):
    # for loop to check each row in the master csv
    for row in csvToParse:
        # for loop to index through catagory items to look for in each row
        for counter, _ in enumerate(range(catagorySize)):
            if catagory[counter] in row[3]:
                csv_mark.writerow([row[0], row[1], row[2], row[3], row[4], row[5]])
                print(row[3])
                print(catagory)

parseData(csvToParse=transactionsCSV, catagory=GAS, catagorySize=gasLength)
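A small simplification, for what it's worth: the counter/enumerate(range(...)) indirection isn't needed, since you can iterate the category tuple directly. A sketch under the same assumptions as the snippet above (transactionsCSV is a csv reader, csv_mark a csv writer); checking every category inside a single pass over the rows also avoids re-reading transactionsCSV, which, if it is a csv.reader, is exhausted after the first full pass (likely why only one category ever matched):

def parse_data(csv_to_parse, categories):
    # check each row of the master csv against every category string
    for row in csv_to_parse:
        for category in categories:
            if category in row[3]:
                csv_mark.writerow(row[:6])
                break  # one match per row is enough

parse_data(transactionsCSV, GAS)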

Efficiently update columns based on one of the columns split value

So here is my code, which updates a number of column values based on conditions over the split values of the 'location' column. The code works fine, but since it iterates row by row it is not efficient enough. Can anyone help me make this code faster, please?
for index, row in df.iterrows():
    print(index)
    location_split = row['location'].split(':')
    after_county = False
    after_province = False
    for l in location_split:
        if l.strip().endswith('ED'):
            df.loc[index, 'electoral_district'] = l
        elif l.strip().startswith('County'):
            df.loc[index, 'county'] = l
            after_county = True
        elif after_province == True:
            if l.strip() != 'Ireland':
                df.loc[index, 'dublin_postal_district'] = l
        elif after_county == True:
            df.loc[index, 'province'] = l.strip()
            after_province = True
'map' was what I needed :)
def fill_county(column):
    res = ''
    location_split = column.split(':')
    for l in location_split:
        if l.strip().startswith('County'):
            res = l.strip()
            break
    return res

df['county'] = map(fill_county, df['location'])
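One caveat: in Python 3, map() returns a lazy iterator rather than a list, so the assignment above only behaves as intended under Python 2. The pandas-idiomatic equivalent is Series.apply, sketched below; the str.extract variant additionally assumes the 'County ...' part is always its own colon-separated segment:

# Python 3 / pandas equivalent of the map() call above
df['county'] = df['location'].apply(fill_county)

# fully vectorised alternative (assumption: county is a ':'-separated segment)
df['county'] = df['location'].str.extract(r'(County[^:]*)', expand=False).str.strip().fillna('')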
