How to split dataframe by specific string in rows - python

I have a dataframe like this:
df = pd.DataFrame({"a":["x1", 12, 14, "x2", 32, 9]})
df
Out[10]:
a
0 x1
1 12
2 14
3 x2
4 32
5 9
I would like to split it into multiple dataframes (two, in this case) whenever a row begins with "x", and that row should then become the column name. Maybe split these dataframes and put them inside a dictionary?
The output should be like this:
x1
Out[12]:
x1
0 12
1 14
x2
Out[13]:
x2
0 32
1 9
Could anyone help me?

You can try cumsum on str.startswith then groupby on that:
for k, d in df.groupby(df['a'].str.startswith('x').fillna(0).cumsum()):
    # manipulate data to get desired output
    sub_df = pd.DataFrame(d.iloc[1:].to_numpy(), columns=d.iloc[0].to_numpy())
    # do something with it
    print(sub_df)
    print('-' * 10)
Output:
x1
0 12
1 14
----------
x2
0 32
1 9
----------
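The question also asked for the pieces in a dictionary; you can collect them inside the same loop, keyed by the header value. A minimal sketch (dfs is just an illustrative name):
dfs = {}
for k, d in df.groupby(df['a'].str.startswith('x').fillna(0).cumsum()):
    header = d.iloc[0, 0]  # the "x1" / "x2" marker row
    dfs[header] = pd.DataFrame(d.iloc[1:].to_numpy(), columns=[header])

print(dfs['x1'])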

Something like this should work:
import pandas as pd

df = pd.DataFrame({"a": ["x1", 12, 14, "x2", 32, 9]})

## Get the row indices of values starting with "x"
ixs = []
for j in df.index:
    if isinstance(df.loc[j, 'a'], str) and df.loc[j, 'a'].startswith('x'):
        ixs.append(j)

dicto = {}
for i, start_ix in enumerate(ixs):
    if i == len(ixs) - 1:
        end_ix = df.index[-1]
    else:
        end_ix = ixs[i + 1] - 1
    # select with a list so we keep a DataFrame (a plain 'a' would give a Series,
    # which has no columns to reassign)
    new_df = df.loc[start_ix:end_ix, ['a']].reset_index(drop=True)
    new_df.columns = new_df.iloc[0]  # promote the "x" row to the header
    new_df = new_df.drop(new_df.index[0]).reset_index(drop=True)
    dicto[i] = new_df
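Each piece is then available from dicto; note it is keyed by position (0, 1, ...), so you can inspect the headers like this:
for key, sub in dicto.items():
    print(key, '->', list(sub.columns))  # e.g. 0 -> ['x1']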

A groupby is like a dictionary, so we can explicitly make it one:
dfs = {f'x{k}':d for k, d in df.groupby(df['a'].str.startswith('x').fillna(False).cumsum())}
for k in dfs:
    dfs[k].columns = dfs[k].iloc[0].values  # make the x row the header
    dfs[k] = dfs[k].iloc[1:]                # drop the x row
    print(dfs[k], '\n')
Output:
x1
1 12
2 14
x2
4 32
5 9

Related

Pandas dataframe, in a row, to find the max in selected column, and find value of another column based on that

I have a dataframe like this:
>>>import pandas as pd
>>>df = pd.DataFrame({'x1':[20,25],'y1':[5,8],'x2':[22,27],'y2':[10,2]})
>>>df
x1 y1 x2 y2
0 20 5 22 10
1 25 8 27 2
>>>
X and Y pair together. I need to compare y1 and y2, take the max in every row, and find the corresponding x. Hence the max of row 0 is y2 (=10), and the corresponding x is x2 (=22). The second row gives y1 (=8) and x1 (=25).
Expected result, new columns x and y:
x1 y1 x2 y2 x y
0 20 5 22 10 22 10
1 25 8 27 2 25 8
This is a simple dataframe I made to elaborate on the question. X and Y pairs, in my case, can be 30 pairs.
import numpy as np
import pandas as pd

# get a hold of the "y*" columns
y_cols = df.filter(like="y")
# get the suffixes of the per-row maximal y-columns, then prepend "x" to them
max_x_vals = y_cols.idxmax(axis=1).str.extract(r"(\d+)$", expand=False).radd("x")
# get the positional locations of those x* columns
max_x_ids = df.columns.get_indexer(max_x_vals)
# now we have the indexes of x*'s in the columns; NumPy's indexing
# helps to get a cross section
df["max_xs"] = df.to_numpy()[np.arange(len(df)), max_x_ids]
# for y*'s, it's directly the maximum per row
df["max_ys"] = y_cols.max(axis=1)
to get
>>> df
x1 y1 x2 y2 max_xs max_ys
0 20 5 22 10 22 10
1 25 8 27 2 25 8
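The NumPy cross-section trick used above, shown in isolation on a small array (a minimal illustration, not part of the answer's data):
import numpy as np

arr = np.array([[10, 11, 12],
                [20, 21, 22]])
cols = [2, 0]  # pick column 2 from row 0, column 0 from row 1
print(arr[np.arange(2), cols])  # [12 20]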
You can do it with the help of the .apply function.
import pandas as pd
import numpy as np

df = pd.DataFrame({'x1':[20,25],'y1':[5,8],'x2':[22,27],'y2':[10,2]})
y_cols = [col for col in df.columns if col[0] == 'y']
x_cols = [col for col in df.columns if col[0] == 'x']

def find_corresponding_x(row):
    max_y_index = np.argmax(row[y_cols])
    return row[x_cols[max_y_index]]

df['corresponding_x'] = df.apply(find_corresponding_x, axis=1)
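If you also want the maximum y next to the corresponding x, one extra line on top of the above should do (max_y is just an illustrative column name):
df['max_y'] = df[y_cols].max(axis=1)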
You can use the function below. Remember to import pandas and numpy as in this code, then import your data set and call the Max_number function.
import pandas as pd
import numpy as np

df = pd.DataFrame({'x1':[20,25],'y1':[5,8],'x2':[22,27],'y2':[10,2]})

def Max_number(df):
    columns = list(df.columns)
    rows = df.shape[0]
    max_value = []
    column_name = []
    for i in range(rows):
        row_array = list(np.array(df[i:i+1])[0])
        maximum = max(row_array)
        max_value.append(maximum)
        index = row_array.index(maximum)
        column_name.append(columns[index])
    return pd.DataFrame({"column": column_name, "max_value": max_value})
returns this:
row index  column  max_value
0          x2      22
1          x2      27
If the x1 column comes first, then y1, then x2, y2 and so on, you can just try:
a = df.columns.get_indexer(y_cols.idxmax(axis=1))
df[['y', 'x']] = df.to_numpy()[np.arange(len(df)), [a, a - 1]].T
This is one solution:
a = df[df['y1'] < df['y2']].drop(columns=['y1','x1']).rename(columns={'y2':'y', 'x2':'x'})
b = df[df['y1'] >= df['y2']].drop(columns=['y2','x2']).rename(columns={'y1':'y', 'x1':'x'})
result = pd.concat([a,b])
If you need to keep the original order, you can add another column holding the original index and sort by it after concatenation, or simply sort by the index, as sketched below.
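Since a and b keep their original row index, sorting by the index after concatenation is enough; a minimal sketch:
result = pd.concat([a, b]).sort_index()
print(result)
#     x   y
# 0  22  10
# 1  25   8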
I hope this works for your solution:
import pandas as pd
df = pd.DataFrame({'x1':[20,25],'y1':[5,8],'x2':[22,27],'y2':[10,2]})
df['x_max'] = df[['x1', 'x2']].max(axis=1)
df['y_max'] = df[['y1', 'y2']].max(axis=1)
df

How to filter dataframe based on varying thresholds for indexes

I have a data frame and a dictionary like this:
thresholds = {'column':{'A':10,'B':11,'C':9}}
df:
Column
A 13
A 7
A 11
B 12
B 14
B 14
C 7
C 8
C 11
For every index group, I want to count how many values are less than the threshold and how many are greater than the threshold.
So my output looks like this:
df:
Values<Thr Values>Thr
A 1 2
B 0 3
C 2 1
Can anyone help me with this?
You can use:
import numpy as np
import pandas as pd

t = df.index.to_series().map(thresholds['column'])
out = (pd.crosstab(df.index, np.where(df['Column'].gt(t), 'Values>Thr', 'Values≤Thr'))
         .rename_axis(index=None, columns=None)
       )
Output:
Values>Thr Values≤Thr
A 2 1
B 3 0
C 1 2
A syntax variant:
out = (pd.crosstab(df.index, df['Column'].gt(t))
         .rename_axis(index=None, columns=None)
         .rename(columns={False: 'Values≤Thr', True: 'Values>Thr'})
       )
To apply this to many columns, based on the keys in the dictionary:
def count(s):
    t = s.index.to_series().map(thresholds.get(s.name, {}))
    return (pd.crosstab(s.index, s.gt(t))
              .rename_axis(index=None, columns=None)
              .rename(columns={False: 'Values≤Thr', True: 'Values>Thr'})
            )

out = pd.concat({c: count(df[c]) for c in df})
NB. The key of the dictionary must match exactly. I changed the case for the demo.
Output:
Values≤Thr Values>Thr
Column A 1 2
B 0 3
C 2 1
Here is another option:
import pandas as pd
df = pd.DataFrame({'Column': [13, 7, 11, 12, 14, 14, 7, 8, 11]})
df.index = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C']
thresholds = {'column':{'A':10,'B':11,'C':9}}
df['smaller'] = df['Column'].groupby(df.index).transform(lambda x: x < thresholds['column'][x.name]).astype(int)
df['greater'] = df['Column'].groupby(df.index).transform(lambda x: x > thresholds['column'][x.name]).astype(int)
df.drop(columns=['Column'], inplace=True)
# group by index summing the greater and smaller columns
sums = df.groupby(df.index).sum()
sums
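For the sample data above, sums should come out as:
   smaller  greater
A        1        2
B        0        3
C        2        1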

Element-wise Comparison of Two Pandas Dataframes

I am trying to compare two columns in pandas. I know I can do:
# either using Pandas' equals()
df1[col].equals(df2[col])
# or this
df1[col] == df2[col]
However, what I am looking for is to compare these columns element-wise and, when they do not match, print out both values. I have tried:
if df1[col] != df2[col]:
    print(df1[col])
    print(df2[col])
where I get the error 'The truth value of a Series is ambiguous'.
I believe this is because the column is treated as a series of boolean values for the comparison which causes the ambiguity. I also tried various forms of for loops which did not resolve the issue.
Can anyone point me to how I should go about doing what I described?
This might work for you:
import pandas as pd
df1 = pd.DataFrame({'col1': [1, 2, 3, 4, 5]})
df2 = pd.DataFrame({'col1': [1, 2, 9, 4, 7]})
if not df2[df2['col1'] != df1['col1']].empty:
    print(df1[df1['col1'] != df2['col1']])
    print(df2[df2['col1'] != df1['col1']])
Output:
col1
2 3
4 5
col1
2 9
4 7
You need to get hold of the index where the column values are not matching. Once you have that index then you can query the individual DFs to get the values.
Please try the following and see if this helps:
for ind in df1.loc[df1['col1'] != df2['col1']].index:
    x = df1.loc[df1.index == ind, 'col1'].values[0]
    y = df2.loc[df2.index == ind, 'col1'].values[0]
    print(x, y)
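As a side note, if your pandas version is 1.1 or newer, Series.compare returns the mismatching pairs directly ('self' is the calling Series, 'other' the argument):
diff = df1['col1'].compare(df2['col1'])
print(diff)
#    self  other
# 2     3      9
# 4     5      7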
Solution
Try this. You could use any of the following one-line solutions.
# Option-1
df.loc[df.apply(lambda row: row[col1] != row[col2], axis=1), [col1, col2]]
# Option-2
df.loc[df[col1]!=df[col2], [col1, col2]]
Logic:
Option-1: We use pandas.DataFrame.apply() to evaluate the target columns row by row and pass the returned indices to df.loc[indices, [col1, col2]] and that returns the required set of rows where col1 != col2.
Option-2: We get the indices with df[col1] != df[col2] and the rest of the logic is the same as Option-1.
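Of the two, Option-2 is generally preferable: the vectorized comparison avoids the Python-level row loop that .apply(..., axis=1) incurs, which matters on large frames.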
Dummy Data
I made the dummy data such that for indices 2, 6, 8 we will find columns 'a' and 'c' to be different. Thus, we want only those rows returned by the solution.
import numpy as np
import pandas as pd
a = np.arange(10)
c = a.copy()
c[[2,6,8]] = [0,20,40]
df = pd.DataFrame({'a': a, 'b': a**2, 'c': c})
print(df)
Output:
a b c
0 0 0 0
1 1 1 1
2 2 4 0
3 3 9 3
4 4 16 4
5 5 25 5
6 6 36 20
7 7 49 7
8 8 64 40
9 9 81 9
Applying the solution to the dummy data
We see that the solution proposed returns the result as expected.
col1, col2 = 'a', 'c'
result = df.loc[df.apply(lambda row: row[col1] != row[col2], axis=1), [col1, col2]]
print(result)
Output:
a c
2 2 0
6 6 20
8 8 40

Adding a list to a column when iterating over a dataframe row

I have the following dataframe:
import numpy as np
import pandas as pd
import random
df = pd.DataFrame(np.random.randint(0,20,size=(2, 2)), columns=list('AB'))
df
A B
0 13 4
1 16 17
Then I create another dataframe in a loop where the columns of the dataframe are lists. There is a post here (Pandas split column of lists into multiple columns) that shows how to split the columns.
tmp_lst_1 = []
for index, row in df.iterrows():
    tmp_lst_2 = []
    for r in range(len(row)):
        tmp_lst_2.insert(r, random.sample(range(1, 50), 2))
    tmp_lst_1.insert(index, tmp_lst_2)

df1 = pd.DataFrame(tmp_lst_1)
df1
0 1
0 [21, 5] [6, 42]
1 [49, 40] [8, 45]
but I was wondering if there is a more efficient way to create this dataframe without needing to split all the columns individually? I am looking to get something like this:
df1
C D E F
0 21 5 6 42
1 49 40 8 45
I think a loop with DataFrame.iterrows is not necessary here; you can use a nested list comprehension that flattens the lists:
df = pd.DataFrame(np.random.randint(0, 20, size=(2, 2)), columns=list('AB'))

tmp_lst_1 = [[x for r in range(len(df.columns))
                for x in random.sample(range(1, 50), 2)]
             for i in range(len(df))]

df1 = pd.DataFrame(tmp_lst_1, index=df.index)
print(df1)
0 1 2 3
0 23 24 42 48
1 26 43 24 5
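If you want the lettered headers from the question rather than the default 0-3, you can rename afterwards (assuming the C/D/E/F names shown in the question):
df1.columns = list('CDEF')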
Alternative without list comprehension:
tmp_lst_1 = []
for i in range(len(df)):
    flat_list = []
    for r in range(len(df.columns)):
        for x in random.sample(range(1, 50), 2):
            flat_list.append(x)
    tmp_lst_1.append(flat_list)

How to move data in numpy array from column/row to another based on value in third column

I'm trying to rearrange this data (the before/after screenshots are omitted here).
Basically, I'm trying to compress 5 rows of data, each with 1 ID and 2 values, into 1 row of data with 1 ID and 10 values. My data is approx. 6 million rows long. One thing to note: not every group has 5 (X,Y) coordinate values. Some only have 4.
I could not figure out how to do this by indexing alone, so I wrote a for loop, which doesn't work very well. It will handle the first 10,000 rows OK (but ends with an error), and it takes forever.
coords = pd.read_csv('IDQQCoords.csv')
coords = coords.to_numpy()  # as_matrix() was removed in newer pandas

# creates an empty array the same length as coords
mpty = np.zeros((len(coords), 8), dtype=float)
# adds the 8 empty columns to make space for values from subsequent rows
coords = np.append(coords, mpty, axis=1)

cnt = 0
lth = coords.shape[0]
for counter in range(1, lth):
    if coords[cnt+1, 0] == coords[cnt, 0]:
        coords[cnt, 3:5] = coords[cnt+1, 1:3]
        coords = np.delete(coords, cnt+1, axis=0)
    if coords[cnt+1, 0] == coords[cnt, 0]:
        coords[cnt, 5:7] = coords[cnt+1, 1:3]
        coords = np.delete(coords, cnt+1, axis=0)
    if coords[cnt+1, 0] == coords[cnt, 0]:
        coords[cnt, 7:9] = coords[cnt+1, 1:3]
        coords = np.delete(coords, cnt+1, axis=0)
    if coords[cnt+1, 0] == coords[cnt, 0]:
        coords[cnt, 9:11] = coords[cnt+1, 1:3]
        coords = np.delete(coords, cnt+1, axis=0)
    cnt = cnt + 1
Can someone help me, either with an index or a better loop?
Thanks a ton
Assuming that
coords = pd.read_csv('IDQQCoords.csv')
implies that you are using Pandas, then the easiest way to produce the desired result is to use DataFrame.pivot:
import pandas as pd
import numpy as np
np.random.seed(2016)
df = pd.DataFrame({'shapeid': [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2],
                   'x': np.random.random(14),
                   'y': np.random.random(14)})
# shapeid x y
# 0 0 0.896705 0.603638
# 1 0 0.730239 0.588791
# 2 0 0.783276 0.069347
# 3 0 0.741652 0.942829
# 4 0 0.462090 0.372599
# 5 1 0.642565 0.451989
# 6 1 0.224864 0.450841
# 7 1 0.708547 0.033112
# 8 1 0.747126 0.169423
# 9 2 0.625107 0.180155
# 10 2 0.579956 0.352746
# 11 2 0.242640 0.342806
# 12 2 0.131956 0.277638
# 13 2 0.143948 0.375779
df['col'] = df.groupby('shapeid').cumcount()
df = df.pivot(index='shapeid', columns='col')
df = df.sort_index(axis=1, level=1)
df.columns = ['{}{}'.format(col, num) for col,num in df.columns]
print(df)
yields
x0 y0 x1 y1 x2 y2 x3 \
shapeid
0 0.896705 0.603638 0.730239 0.588791 0.783276 0.069347 0.741652
1 0.642565 0.451989 0.224864 0.450841 0.708547 0.033112 0.747126
2 0.625107 0.180155 0.579956 0.352746 0.242640 0.342806 0.131956
y3 x4 y4
shapeid
0 0.942829 0.462090 0.372599
1 0.169423 NaN NaN
2 0.277638 0.143948 0.375779
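Since shapeid 1 has only four points, its x4/y4 come out as NaN. If incomplete shapes need to be dropped or padded afterwards, the usual tools apply; a couple of sketches:
complete = df.dropna()   # keep only shapes with all 5 points
padded = df.fillna(0.0)  # or pad the missing coordinates with zeros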
