Select rows from pandas data frame based on range from another - python

I have two dataframes.
First dataframe: df_json
+------------+-----------------+-----------+------------+
| chromosome | ensembl_id | gene_end | gene_start |
+------------+-----------------+-----------+------------+
| 7 | ENSG00000122543 | 5886362 | 5879827 |
| 12 | ENSG00000111325 | 122980043 | 122974580 |
| 17 | ENSG00000181396 | 82418637 | 82389223 |
| 6 | ENSG00000119900 | 71308950 | 71288803 |
| 9 | ENSG00000106809 | 92404696 | 92383967 |
+------------+-----------------+-----------+------------+
Second dataframe: df
+------------+-----------------+-----------+------------+
| rs_id | variant | gene_id | chromosome |
+------------+-----------------+-----------+------------+
| rs13184706 | 5:43888254:C:T  | 43888254  | 5          |
| rs58824264 | 5:43888493:C:T  | 43888493  | 5          |
+------------+-----------------+-----------+------------+
I want to iterate through df_json and for each row in df_json, select the rows from df whose gene_id is in range (gene_start, gene_end) and df['chromosome'] == df_json['chromosome']. Also, I need to create a new column in the resulting dataframe which has the ensembl_id from df_json.
I am able to achieve this using the code below, but it is very slow. I need a faster way to do it, as I need to execute this on millions of rows.
result_df = []
for row in df_json.itertuples():
    gene_end, gene_start = row[3], row[4]
    # note: pandas >= 1.3 spells this inclusive='both'; older versions used inclusive=True
    gene = df.loc[(df['gene_id'].between(gene_start, gene_end, inclusive='both')) & (df['chromosome'] == row[1])]
    gene['ensembl_id'] = row[2]
    result_df.append(gene)
print(result_df[0])

You should avoid iterating pandas dataframe rows where possible, as this is inefficient and less readable.
You can implement your logic using pd.DataFrame.merge and pd.Series.between. I have changed the data in your example to make it more interesting.
import pandas as pd

df_json = pd.DataFrame({'chromosome': [7, 12, 17, 6, 9],
                        'ensembl_id': ['ENSG00000122543', 'ENSG00000111325', 'ENSG00000181396',
                                       'ENSG00000119900', 'ENSG00000106809'],
                        'gene_end': [5886362, 122980043, 82418637, 71308950, 92404696],
                        'gene_start': [5879827, 122974580, 82389223, 71288803, 92383967]})
df = pd.DataFrame({'rs_id': ['rs13184706', 'rs58824264'],
                   'variant': ['5:43888254:C:T', '5:43888493:C:T'],
                   'gene_id': [5880000, 43888493],
                   'chromosome': [7, 9]})
res = df_json.merge(df, how='left', on='chromosome')
res = res[res['gene_id'].between(res['gene_start'], res['gene_end'])]
print(res)
# chromosome ensembl_id gene_end gene_start gene_id rs_id \
# 0 7 ENSG00000122543 5886362 5879827 5880000.0 rs13184706
# variant
# 0 5:43888254:C:T
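One caveat: the left merge on chromosome alone first pairs every gene with every variant on that chromosome, which can get very large with millions of rows. A hedged alternative sketch (assuming genes on the same chromosome never overlap, since pd.IntervalIndex.get_indexer raises on overlapping intervals) looks positions up per chromosome instead:
import pandas as pd

parts = []
for chrom, genes in df_json.groupby('chromosome'):
    hits = df[df['chromosome'] == chrom].copy()
    if hits.empty:
        continue
    # one closed interval per gene on this chromosome
    idx = pd.IntervalIndex.from_arrays(genes['gene_start'], genes['gene_end'],
                                       closed='both')
    pos = idx.get_indexer(hits['gene_id'])   # -1 where no gene contains the position
    hits = hits[pos >= 0]
    hits['ensembl_id'] = genes['ensembl_id'].to_numpy()[pos[pos >= 0]]
    parts.append(hits)

result = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()
print(result)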

Use pyranges for large datasets. It is very efficient and fast:
import pyranges as pr
c = """Chromosome ensembl_id End Start
7 ENSG00000122543 5886362 5879827
12 ENSG00000111325 122980043 122974580
17 ENSG00000181396 82418637 82389223
5 MadeUp 43889000 43888253
6 ENSG00000119900 71308950 71288803
9 ENSG00000106809 92404696 92383967"""
c2 = """rs_id variant Start End Chromosome
rs13184706 5:43888254:C:T 43888254 43888256 5
rs58824264 5:43888493:C:T 43888493 43888494 5"""
gr = pr.from_string(c)
gr2 = pr.from_string(c2)
j = gr.join(gr2)
# +--------------+--------------+-----------+-----------+------------+----------------+-----------+-----------+
# | Chromosome | ensembl_id | End | Start | rs_id | variant | Start_b | End_b |
# | (category) | (object) | (int32) | (int32) | (object) | (object) | (int32) | (int32) |
# |--------------+--------------+-----------+-----------+------------+----------------+-----------+-----------|
# | 5 | MadeUp | 43889000 | 43888253 | rs13184706 | 5:43888254:C:T | 43888254 | 43888256 |
# | 5 | MadeUp | 43889000 | 43888253 | rs58824264 | 5:43888493:C:T | 43888493 | 43888494 |
# +--------------+--------------+-----------+-----------+------------+----------------+-----------+-----------+
# Unstranded PyRanges object has 2 rows and 8 columns from 1 chromosomes.
# For printing, the PyRanges was sorted on Chromosome.
df = j.df # as pandas df
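For reference, a hedged sketch of building the PyRanges objects straight from the question's original dataframes instead of from strings (pyranges expects Chromosome/Start/End column names, so the renames below are assumptions based on the question's columns):
import pyranges as pr

gr = pr.PyRanges(df_json.rename(columns={'chromosome': 'Chromosome',
                                         'gene_start': 'Start',
                                         'gene_end': 'End'}))
# represent each variant position as a 1-bp interval
gr2 = pr.PyRanges(df.assign(Start=df['gene_id'], End=df['gene_id'] + 1)
                    .rename(columns={'chromosome': 'Chromosome'}))
j = gr.join(gr2)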

Related

Python Pandas Extracting specific data from column

I have one column (object dtype) that contains multiple values separated by ( | ).
I would like to extract only the customer order number, which starts with
( 44 ). Sometimes the order number is at the beginning, sometimes in the middle, sometimes at the end,
and sometimes it is duplicated.
44019541285_P_002 | 0317209757 | 87186978110350851 | 387186978103840544 |
87186978110202440 | 44019119315 | 87186978110202440 | 44019119315
87186978110326832 | 44019453624 | 87186978110326832 | 44019453624
44019406029 | 0317196878 | 87186978110313085 | 387186978120481881|
44019480564 | 0317202711 | 87186978110335810 | 387186978103844160 |
Desired result:
44019541285
44019119315
44019453624
44019406029
44019480564
My code:
import pandas as pd
from io import StringIO
data = '''
Order_Numbers
44019541285_P_002 | 0317209757 | 87186978110350851 | 387186978103840544 | 0652569032
87186978110202440 | 44019119315 | 87186978110202440 | 44019119315
87186978110326832 | 44019453624 | 87186978110326832 | 44019453624
44019406029 | 0317196878 | 87186978110313085 | 387186978120481881|
44019480564 | 0317202711 | 87186978110335810 | 387186978103844160 | 630552498
'''
df = pd.read_csv(StringIO(data.replace(' ','')))
df
'''
Order_Numbers
0 44019541285_P_002|0317209757|87186978110350851...
1 87186978110202440|44019119315|8718697811020244...
2 87186978110326832|44019453624|8718697811032683...
3 44019406029|0317196878|87186978110313085|38718...
4 44019480564|0317202711|87186978110335810|38718...
'''
Final code:
(
    df.Order_Numbers.str.split('|', expand=True)
      .astype(str)
      # applymap is deprecated since pandas 2.1; DataFrame.map is the replacement
      .where(lambda x: x.applymap(lambda y: y[:2] == '44'))
      .bfill(axis=1)[0]
      .str.split('_').str.get(0)
)
0 44019541285
1 44019119315
2 44019453624
3 44019406029
4 44019480564
Name: 0, dtype: object
import pandas as pd

df = pd.DataFrame({
    'order_number': [
        '44019541285_P_002 | 0317209757 | 87186978110350851 | 387186978103840544 | 0652569032',
        '87186978110202440 | 44019119315 | 87186978110202440 | 44019119315',
        '87186978110326832 | 44019453624 | 87186978110326832 | 44019453624',
        '44019406029 | 0317196878 | 87186978110313085 | 387186978120481881|',
        '44019480564 | 0317202711 | 87186978110335810 | 387186978103844160 | 630552498'
    ]
})

def extract_customer_order(order_number):
    order_number = order_number.replace(' ', '')  # remove all spaces, e.g. '44019541285_P_002 | 0317209757' -> '44019541285_P_002|0317209757'
    order_number_list = order_number.split('|')   # split on every '|', e.g. '44019541285_P_002|0317209757' -> ['44019541285_P_002', '0317209757']
    result = []
    for order in order_number_list:
        order = order.split('_')[0]       # drop suffixes such as '_P_002'
        if order.startswith('44'):        # keep only order numbers starting with '44'
            if order not in result:       # prevent duplicate order numbers
                result.append(order)
    # if you want the result as a string separated by '|', uncomment the line below
    # result = '|'.join(result)
    return result

df['customer_order'] = df['order_number'].apply(extract_customer_order)
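For completeness, a hedged one-line alternative using a single regex extraction (it assumes an order number always starts with '44' right after a '|' separator or at the start of the string, and it keeps only the first match per row):
# expand=False returns a Series; the digit-only capture group stops before
# suffixes such as '_P_002'
df['customer_order'] = df['order_number'].str.extract(r'(?:^|\|)\s*(44\d+)', expand=False)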

Pyspark: Reorder only a subset of rows among themselves

my data frame:
+-----+--------+-------+
| val | id | reRnk |
+-----+--------+-------+
| 2 | a | yes |
| 1 | b | no |
| 3 | c | no |
| 8 | d | yes |
| 7 | e | yes |
| 9 | f | no |
+-----+--------+-------+
In my desired output I want to re-rank only the rows where reRnk == yes; the ranking is done based on "val".
I don't want to move the rows where reRnk = no. For example, at id = b we have reRnk = no, and I want to keep that row in position 2.
my desired output will look like this:
+-----+--------+-------+
| val | id | reRnk |
+-----+--------+-------+
| 8 | d | yes |
| 1 | b | no |
| 3 | c | no |
| 7 | e | yes |
| 2 | a | yes |
| 9 | f | no |
+-----+--------+-------+
From what I'm reading, pyspark DataFrames do not have an index by default, so you might need to add one.
I do not know the exact syntax for pyspark, but since it has many similarities with pandas, this might lead you in a certain direction:
df.loc[df.reRnk == 'yes', ['val', 'id']] = (
    df.loc[df.reRnk == 'yes', ['val', 'id']]
      .sort_values('val', ascending=False)
      .set_index(df.loc[df.reRnk == 'yes', ['val', 'id']].index)
)
Basically what we do here is isolate the rows with reRnk == 'yes', sort them by val descending, and set their index back to the original index, so that when we assign them back the re-ranked values land in the original 'yes' row positions.
for .loc, https://spark.apache.org/docs/3.2.0/api/python/reference/pyspark.pandas/api/pyspark.pandas.DataFrame.loc.html might be worth a try.
for .sort_values see: https://sparkbyexamples.com/pyspark/pyspark-orderby-and-sort-explained/
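To stay within PySpark itself, here is a hedged sketch of the same idea using window functions. It assumes the current row order is the one to preserve, which it pins down explicitly with monotonically_increasing_id; the single-partition windows are fine for modest data but will not scale:
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(2, 'a', 'yes'), (1, 'b', 'no'), (3, 'c', 'no'),
     (8, 'd', 'yes'), (7, 'e', 'yes'), (9, 'f', 'no')],
    ['val', 'id', 'reRnk'])

# pin down the current row order, since Spark DataFrames have no index
df = df.withColumn('pos', F.monotonically_increasing_id())

# rank the 'yes' positions in document order, and the 'yes' vals descending;
# joining the two rankings reshuffles the vals among the 'yes' positions only
yes = df.filter(F.col('reRnk') == 'yes')
pos_rank = yes.select('pos', F.row_number().over(Window.orderBy('pos')).alias('slot'))
val_rank = yes.select('val', 'id', 'reRnk',
                      F.row_number().over(Window.orderBy(F.col('val').desc())).alias('slot'))
reranked = pos_rank.join(val_rank, 'slot').drop('slot')

result = (reranked
          .unionByName(df.filter(F.col('reRnk') == 'no'))
          .orderBy('pos')
          .drop('pos'))
result.show()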

Best way to compare 2 dfs, get the name of different col & before + after vals?

What is the best way to compare 2 dataframes with the same column names, row by row, and, if a cell differs, capture the before and after values and which column is different in that dataframe?
I know this question has been asked a lot, but none of the existing answers fit my use case. Speed is important. There is a package called datacompy, but it is not good if I have to compare dataframes in a loop (I'm only comparing 2 at a time, but around 10,000 total, so 5,000 comparisons).
I don't want to join the dataframes on a column. I want to compare them row by row: row 1 with row 1, etc. If a column in row 1 differs, I only need to know the column name, the before value, and the after value. Perhaps if it is numeric I could also add a column with the absolute value of the difference.
The problem is that there is sometimes an edge case where rows are out of order (only by 1 entry), and I don't want these to come up as false positives.
Example:
These dataframes would be created when I pass in race # (there are 5,000 race numbers)
df1
+-----+-------+------+----------+-------------+
| Id  | Speed | Name | Distance | Location    |
+-----+-------+------+----------+-------------+
| 181 | 10.3  | Joe  | 2        | New York    |
| 192 | 9.1   | Rob  | 1        | Chicago     |
| 910 | 1.0   | Fred | 5        | Los Angeles |
| 97  | 1.8   | Bob  | 8        | New York    |
| 88  | 1.2   | Ken  | 7        | Miami       |
| 99  | 1.1   | Mark | 6        | Austin      |
+-----+-------+------+----------+-------------+
df2:
+-----+-------+------+----------+-------------+
| Id  | Speed | Name | Distance | Location    |
+-----+-------+------+----------+-------------+
| 181 | 10.3  | Joe  | 2        | New York    |
| 192 | 9.4   | Rob  | 1        | Chicago     |
| 910 | 1.0   | Fred | 5        | Los Angeles |
| 97  | 1.5   | Bob  | 8        | New York    |
| 99  | 1.1   | Mark | 6        | Austin      |
| 88  | 1.2   | Ken  | 7        | Miami       |
+-----+-------+------+----------+-------------+
diff:
+-------+----------+--------+-------+
| Race# | Diff_col | Before | After |
+-------+----------+--------+-------+
| 123   | Speed    | 9.1    | 9.4   |
| 123   | Speed    | 1.8    | 1.5   |
+-------+----------+--------+-------+
An example of a false positive is with the last 2 rows, Ken + Mark.
I could summarize the differences in one line per race, but if the dataframe has 3,000 records and there are 1,000 differences (unlikely, but possible) then I will have tons of columns. I figured this format was easier: I can export to Excel and then sort by race # to see all the differences, or by diff_col to see which columns differ.
def DiffCol2(df1, df2, race_num):
    is_diff = False
    diff_cols_list = []
    row_coords, col_coords = np.where(df1 != df2)
    diffDf = []
    alldiffDf = []
    for y in set(col_coords):
        col_df1 = df1.iloc[:, y].name
        col_df2 = df2.iloc[:, y].name
        for index, row in df1.iterrows():
            if df1.loc[index, col_df1] != df2.loc[index, col_df2]:
                col_name = col_df1
                if col_df1 != col_df2:
                    col_name = (col_df1, col_df2)
                # record every raw difference before edge-case filtering
                alldiffDf.append({'Race #': race_num, 'Column Name': col_name,
                                  'Before': df2.loc[index, col_df2],
                                  'After': df1.loc[index, col_df1]})
                try:
                    check_edge_case = df1.loc[index, col_df1] == df2.loc[index + 1, col_df1]
                except KeyError:
                    check_edge_case = False
                try:
                    check_edge_case_two = df1.loc[index, col_df1] == df2.loc[index - 1, col_df1]
                except KeyError:
                    check_edge_case_two = False
                if not (check_edge_case or check_edge_case_two):
                    col_name = col_df1
                    if col_df1 != col_df2:
                        # if for some reason the column names differ (should never
                        # happen, but just in case) keep both names
                        col_name = (col_df1, col_df2)
                    is_diff = True
                    diffDf.append({'Race #': race_num, 'Column Name': col_name,
                                   'Before': df2.loc[index, col_df2],
                                   'After': df1.loc[index, col_df1]})
    return diffDf, alldiffDf, is_diff
[apologies in advance for weirdly formatted tables, i did my best given how annoying pasting tables into s/o is]
The code below works if the dataframes have the same number and names of columns and the same number of rows, i.e. it compares only the values in the tables.
Not sure where you want to get Race# from.
import numpy as np
import pandas as pd

df1 = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
df2 = df1.copy(deep=True)
df2.loc[5, 'B'] = 100  # creating a difference
df2.loc[6, 'C'] = 100  # creating a difference

dif = []
for col in df1.columns:
    for bef, aft in zip(df1[col], df2[col]):
        if bef != aft:
            dif.append([col, bef, aft])
print(dif)
dif then holds one [column, before, after] entry per changed cell.
Alternative solution without loops:
df = df1.melt()
df.columns = ['Column', 'Before']
df.insert(2, 'After', df2.melt().value)
df[df.Before != df.After]
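If you are on pandas >= 1.1, DataFrame.compare gives the differing cells directly. A hedged sketch; like the loops above it requires identically labelled frames, and it does not handle the out-of-order edge case from the question:
# MultiIndex columns: (column name, 'self'/'other'), one row per changed row
diff = df1.compare(df2)
diff = diff.rename(columns={'self': 'Before', 'other': 'After'}, level=-1)
print(diff)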

pyspark - how can I remove all duplicate rows (ignoring certain columns) and not leaving any dupe pairs behind?

I have the following table:
df = spark.createDataFrame([(2, 'john', 1, '131434234342'),
                            (2, 'john', 1, '10-22-2018'),
                            (3, 'pete', 8, '10-22-2018'),
                            (3, 'pete', 8, '3258958304'),
                            (5, 'steve', 9, '124324234')],
                           ['id', 'name', 'value', 'date'])
df.show()
+----+-------+-------+--------------+
| id | name | value | date |
+----+-------+-------+--------------+
| 2 | john | 1 | 131434234342 |
| 2 | john | 1 | 10-22-2018 |
| 3 | pete | 8 | 10-22-2018 |
| 3 | pete | 8 | 3258958304 |
| 5 | steve | 9 | 124324234 |
+----+-------+-------+--------------+
I want to remove all duplicated pairs of rows (rows that are duplicated in id, name, and value, but NOT date), without keeping either copy, so that I end up with:
+----+-------+-------+-----------+
| id | name | value | date |
+----+-------+-------+-----------+
| 5 | steve | 9 | 124324234 |
+----+-------+-------+-----------+
How can I do this in PySpark?
You could groupBy id, name and value and filter on the count column:
df = df.groupBy('id','name','value').count().where('count = 1')
df.show()
+---+-----+-----+-----+
| id| name|value|count|
+---+-----+-----+-----+
| 5|steve| 9| 1|
+---+-----+-----+-----+
You can then drop the count column if needed.
Do a groupBy on the columns you want, count, filter where the count equals 1, and then drop the count column, like below:
import pyspark.sql.functions as f
df = df.groupBy("id", "name", "value").agg(f.count("*").alias('cnt')).where('cnt = 1').drop('cnt')
You can add the date column to the groupBy condition if you want.
Hope this helps you.
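Note that both answers above drop the date column in the groupBy. A hedged sketch for reproducing the desired output including date is to semi-join the surviving keys back onto the original DataFrame:
import pyspark.sql.functions as f

keep = (df.groupBy('id', 'name', 'value')
          .agg(f.count('*').alias('cnt'))
          .where('cnt = 1')
          .drop('cnt'))
# a left semi join keeps every original column, including date
df.join(keep, ['id', 'name', 'value'], 'leftsemi').show()
# expected:
# +---+-----+-----+---------+
# | id| name|value|     date|
# +---+-----+-----+---------+
# |  5|steve|    9|124324234|
# +---+-----+-----+---------+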

Interval intersection in pandas

Update 5:
This feature has been released as part of pandas 0.20.1 (on my birthday :] )
Update 4:
PR has been merged!
Update 3:
The PR has moved here
Update 2:
It seems like this question may have contributed to re-opening the PR for IntervalIndex in pandas.
Update:
I no longer have this problem, since I'm actually now querying for overlapping ranges from A and B, not points from B which fall within ranges in A, which is a full interval tree problem. I won't delete the question though, because I think it's still a valid question, and I don't have a good answer.
Problem statement
I have two dataframes.
In dataframe A, two of the integer columns taken together represent an interval.
In dataframe B, one integer column represents a position.
I'd like to do a sort of join, such that points are assigned to each interval they fall within.
Intervals are rarely but occasionally overlapping. If a point falls within that overlap, it should be assigned to both intervals. About half of points won't fall within an interval, but nearly every interval will have at least one point within its range.
What I've been thinking
I was initially going to dump my data out of pandas and use intervaltree, banyan, or maybe bx-python, but then I came across this gist. It turns out that the ideas shoyer has in there never made it into pandas, but it got me thinking: it might be possible to do this within pandas, and since I want this code to be as fast as Python can possibly go, I'd rather not dump my data out of pandas until the very end. I also get the feeling that this is possible with bins and the pandas cut function, but I'm a total newbie to pandas, so I could use some guidance! Thanks!
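For concreteness, here is a rough sketch of what I imagine the IntervalIndex idea could look like (toy column names of my own; it assumes non-overlapping intervals, since IntervalIndex.get_indexer raises on overlaps, which is exactly where an interval tree would be needed instead):
import pandas as pd

# toy stand-ins for A (intervals) and B (points)
A = pd.DataFrame({'start': [1, 10, 20], 'end': [5, 15, 25], 'name': ['x', 'y', 'z']})
B = pd.DataFrame({'pos': [3, 12, 17, 24]})

idx = pd.IntervalIndex.from_arrays(A['start'], A['end'], closed='both')
i = idx.get_indexer(B['pos'])   # -1 where a point falls in no interval
B['interval_name'] = pd.Series(A['name'].to_numpy()[i], index=B.index).where(i >= 0)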
Notes
Potentially related? Pandas DataFrame groupby overlapping intervals of variable length
This feature was released as part of pandas 0.20.1.
Answer using pyranges, which is basically pandas sprinkled with bioinformatics sugar.
Setup:
import numpy as np
np.random.seed(0)
import pyranges as pr
a = pr.random(int(1e6))
# +--------------+-----------+-----------+--------------+
# | Chromosome | Start | End | Strand |
# | (category) | (int32) | (int32) | (category) |
# |--------------+-----------+-----------+--------------|
# | chr1 | 8830650 | 8830750 | + |
# | chr1 | 9564361 | 9564461 | + |
# | chr1 | 44977425 | 44977525 | + |
# | chr1 | 239741543 | 239741643 | + |
# | ... | ... | ... | ... |
# | chrY | 29437476 | 29437576 | - |
# | chrY | 49995298 | 49995398 | - |
# | chrY | 50840129 | 50840229 | - |
# | chrY | 38069647 | 38069747 | - |
# +--------------+-----------+-----------+--------------+
# Stranded PyRanges object has 1,000,000 rows and 4 columns from 25 chromosomes.
# For printing, the PyRanges was sorted on Chromosome and Strand.
b = pr.random(int(1e6), length=1)
# +--------------+-----------+-----------+--------------+
# | Chromosome | Start | End | Strand |
# | (category) | (int32) | (int32) | (category) |
# |--------------+-----------+-----------+--------------|
# | chr1 | 52110394 | 52110395 | + |
# | chr1 | 122640219 | 122640220 | + |
# | chr1 | 162690565 | 162690566 | + |
# | chr1 | 117198743 | 117198744 | + |
# | ... | ... | ... | ... |
# | chrY | 45169886 | 45169887 | - |
# | chrY | 38863683 | 38863684 | - |
# | chrY | 28592193 | 28592194 | - |
# | chrY | 29441949 | 29441950 | - |
# +--------------+-----------+-----------+--------------+
# Stranded PyRanges object has 1,000,000 rows and 4 columns from 25 chromosomes.
# For printing, the PyRanges was sorted on Chromosome and Strand.
Execution:
result = a.join(b, strandedness="same")
# +--------------+-----------+-----------+--------------+-----------+-----------+--------------+
# | Chromosome | Start | End | Strand | Start_b | End_b | Strand_b |
# | (category) | (int32) | (int32) | (category) | (int32) | (int32) | (category) |
# |--------------+-----------+-----------+--------------+-----------+-----------+--------------|
# | chr1 | 227348436 | 227348536 | + | 227348516 | 227348517 | + |
# | chr1 | 18901135 | 18901235 | + | 18901191 | 18901192 | + |
# | chr1 | 230131576 | 230131676 | + | 230131636 | 230131637 | + |
# | chr1 | 84829850 | 84829950 | + | 84829903 | 84829904 | + |
# | ... | ... | ... | ... | ... | ... | ... |
# | chrY | 44139791 | 44139891 | - | 44139821 | 44139822 | - |
# | chrY | 51689785 | 51689885 | - | 51689859 | 51689860 | - |
# | chrY | 45379140 | 45379240 | - | 45379215 | 45379216 | - |
# | chrY | 37469479 | 37469579 | - | 37469576 | 37469577 | - |
# +--------------+-----------+-----------+--------------+-----------+-----------+--------------+
# Stranded PyRanges object has 16,153 rows and 7 columns from 24 chromosomes.
# For printing, the PyRanges was sorted on Chromosome and Strand.
df = result.df
# Chromosome Start End Strand Start_b End_b Strand_b
# 0 chr1 227348436 227348536 + 227348516 227348517 +
# 1 chr1 18901135 18901235 + 18901191 18901192 +
# 2 chr1 230131576 230131676 + 230131636 230131637 +
# 3 chr1 84829850 84829950 + 84829903 84829904 +
# 4 chr1 189088140 189088240 + 189088163 189088164 +
# ... ... ... ... ... ... ... ...
# 16148 chrY 38968068 38968168 - 38968124 38968125 -
# 16149 chrY 44139791 44139891 - 44139821 44139822 -
# 16150 chrY 51689785 51689885 - 51689859 51689860 -
# 16151 chrY 45379140 45379240 - 45379215 45379216 -
# 16152 chrY 37469479 37469579 - 37469576 37469577 -
#
# [16153 rows x 7 columns]
