Sorting and arranging a list using pandas - python

I have an input file (shown below) whose records need to be printed with their keys in ascending numeric order, while keys that are missing from a record should be printed last.
I am getting the data arranged in the required format but the order is missing.
I have tried using sort() method but it shows "list has no attribute sort".
Please suggest solution and also suggest if any modifications required.
Input file:
3=1388|4=1388|5=IBM|8=157.75|9=88929|1021=1500|854=n|388=157.75|394=157.75|474=157.75|1584=88929|444=20160713|459=93000546718000|461=7|55=93000552181000|22=89020|400=157.75|361=0.73|981=0|16=1468416600.6006|18=1468416600.6006|362=0.46
3=1388|4=1388|5=IBM|8=157.73|9=100|1021=0|854=p|394=157.73|474=157.749977558|1584=89029|444=20160713|459=93001362639104|461=26142|55=93001362849000|22=89120|361=0.71|981=0|16=1468416601.372|18=1468416601.372|362=0.45
3=1388|4=1388|5=IBM|8=157.69|9=100|1021=600|854=p|394=157.69|474=157.749910415|1584=89129|444=20160713|459=93004178882560|461=27052|55=93004179085000|22=89328|361=0.67|981=1|16=1468416604.1916|18=1468416604.1916|362=0.43
Code i tried:
import pandas as pd
import numpy as np
# Read every input line as a single text column, then split it into
# 'tag=value' tokens.
df = pd.read_csv('inputfile', index_col=None, names=['text'])
s = df.text.str.split('|')
# One {tag: value} dict per record.
# NOTE: the tags stay strings here, so wherever they get sorted they compare
# lexicographically ('1021' < '16' < '3'), not numerically.
ds = [dict(w.split('=', 1) for w in x) for x in s]
p = pd.DataFrame.from_records(ds)
# Tags missing from a record show up as NaN; display them as 'n/a'.
p1 = p.fillna('n/a')
# p1 no longer contains NaN, so a plain stack() keeps every (row, tag) pair.
st = p1.stack()
dfs = [g for _, g in st.groupby(level=0)]
for i, record in enumerate(dfs):
    # header for each input record
    print('\nindex[%d]' % i)
    for (_, k), v in record.items():
        print(k, '\t', v)
output getting:
index[0]
1021 1500
1584 88929
16 1468416600.6006
18 1468416600.6006
22 89020
3 1388
361 0.73
362 0.46
388 157.75
394 157.75
4 1388
400 157.75
444 20160713
459 93000546718000
461 7
474 157.75
5 IBM
55 93000552181000
8 157.75
854 n
9 88929
981 0
index[1]
1021 0
1584 89029
16 1468416601.372
18 1468416601.372
22 89120
3 1388
361 0.71
362 0.45
388 n/a
394 157.73
4 1388
400 n/a
444 20160713
459 93001362639104
461 26142
474 157.749977558
5 IBM
55 93001362849000
8 157.73
854 p
9 100
981 0
Expected output:
index[0]
3 1388
4 1388
5 IBM
8 157.75
9 88929
16 1468416600.6006
18 1468416600.6006
22 89020
55 93000552181000
361 0.73
362 0.46
388 157.75
394 157.75
400 157.75
444 20160713
459 93000546718000
461 7
474 157.75
854 n
981 0
1021 1500
1584 88929
index[1]
3 1388
4 1388
5 IBM
8 157.75
9 88929
16 1468416600.6006
18 1468416600.6006
22 89020
55 93000552181000
361 0.73
362 0.46
394 157.75
444 20160713
459 93000546718000
461 7
474 157.75
854 n
981 0
1021 1500
1584 88929
388 n/a
400 n/a

Replace your ds line with
ds = [{int(pair[0]): pair[1] for pair in [w.split('=', 1) for w in x]} for x in s]
To convert the index to an integer so it will be sorted numerically
To output the n/a values at the end, you could use the pandas selection to output the nonnull values first, then the null values, e.g:
for ix, series in p.iterrows():
    print('\nindex[%d]' % ix)
    # Tags that are present come first, then the missing ones shown as 'n/a'.
    output_series(ix, series[pd.notnull])
    output_series(ix, series[pd.isnull].fillna('n/a'))
btw, you can also simplify your stack, groupby, print to:
for ix, series in p1.iterrows():
    print('\nindex[%d]' % ix)
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for tag, value in series.items():
        print(tag, '\t', value)
So the whole script becomes:
def output_series(ix, series):
    """Print each (tag, value) pair of *series* on its own line.

    ix is accepted for symmetry with the caller's loop but is not used.
    series is any object exposing ``.items()`` (e.g. a pandas Series).
    """
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for tag, value in series.items():
        print(tag, '\t', value)
df = pd.read_csv('inputfile', index_col=None, names=['text'])
s = df.text.str.split('|')
# Integer tags so that ordering is numeric rather than lexicographic.
ds = [{int(tag): value for tag, value in (w.split('=', 1) for w in x)} for x in s]
# Sort the columns explicitly: the frame otherwise keeps the order in which
# the tags were first seen in the input.
p = pd.DataFrame.from_records(ds).sort_index(axis=1)
for ix, series in p.iterrows():
    print('\nindex[%d]' % ix)
    # Tags that are present come first, then the missing ones shown as 'n/a'.
    output_series(ix, series[series.notnull()])
    output_series(ix, series[series.isnull()].fillna('n/a'))

Here:
import pandas as pd
import numpy as np
df = pd.read_csv('inputfile', index_col=None, names=['text'])
s = df.text.str.split('|')
# Integer tags: with string tags sort_index() would order '1021' before '3'.
ds = [{int(tag): value for tag, value in (w.split('=', 1) for w in x)} for x in s]
p1 = pd.DataFrame.from_records(ds).fillna('n/a')
# After fillna() nothing is missing, so a plain stack() keeps every pair.
st = p1.stack()
for k, v in st.groupby(level=0):
    print(k, v.sort_index())

Related

How to simplify data.table logic and make it doable in pandas?

I have a dataframe with several numeric columns. I want to add a new column that compares the values of some of those columns row by row and stores, as a label, the name of the column that holds the largest value. I already understand the logic in R, but I am wondering how to do this easily in Python. Can anyone point out how to add such a column, i.e. compare the values of multiple columns and assign the name of the column that has the max value?
reproducible example
this is 100% working reproducible example in R:
library(data.table)
# cnt is sampled with size 5 and recycled by data.frame() to fill the 10 rows
df <- data.frame(a = sample(seq(1:10), size=10), b = sample(LETTERS[1:10], size=10), cnt=sample(seq(1:100), size=5),
RECENT_MOV= sample(seq(1:1000), size = 10),
RETIRED= sample(seq(1:200), size = 10),
SERV_EMPL= sample(seq(1:500), size = 10),
SUB_BUS=sample(seq(1:2000), size = 10),
WORK_HOME=sample(seq(1:1200), size = 10)
)
dt <- as.data.table(df)
write.csv(dt, "sample.csv")
label = c("RECENT_MOV", "RETIRED", "SERV_EMPL", "SUB_BUS","WORK_HOME")
# `:=` is data.table syntax, so the new columns are added to dt, not to the
# plain data.frame df
dt[, category := NA_character_]
dt[, row_ind := .I]
# For rows with cnt > 2, store the name of the label column holding the
# row-wise maximum; grouping by row_ind makes .SD a single row
dt[cnt > 2, category := names(which.max(unlist(.SD))), by = row_ind, .SDcols = label]
current output is:
> dput(dt)
structure(list(a = c(5L, 10L, 1L, 6L, 7L, 3L, 2L, 8L, 4L, 9L),
b = c("E", "A", "D", "H", "J", "F", "G", "I", "C", "B"),
cnt = c(13L, 88L, 45L, 92L, 70L, 13L, 88L, 45L, 92L, 70L),
RECENT_MOV = c(70L, 195L, 620L, 572L, 354L, 648L, 798L, 657L,
233L, 672L), RETIRED = c(189L, 195L, 191L, 88L, 148L, 186L,
39L, 78L, 158L, 55L), SERV_EMPL = c(65L, 151L, 415L, 383L,
255L, 207L, 210L, 470L, 181L, 188L), SUB_BUS = c(894L, 829L,
1798L, 502L, 897L, 1461L, 744L, 1991L, 260L, 1697L), WORK_HOME = c(553L,
739L, 454L, 137L, 435L, 1042L, 316L, 697L, 517L, 1158L),
category = c("SUB_BUS", "SUB_BUS", "SUB_BUS", "RECENT_MOV",
"SUB_BUS", "SUB_BUS", "RECENT_MOV", "SUB_BUS", "WORK_HOME",
"SUB_BUS"), row_ind = 1:10), row.names = c(NA, -10L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000015a64b61ef0>)
my current python attempt
import pandas as pd
df = pd.read_csv("sample.csv", index_col=None, header=0)
label = ["RECENT_MOV", "RETIRED", "SERV_EMPL", "SUB_BUS", "WORK_HOME"]
df['category'] = pd.NA
# 1-based row number. range() excludes its end point, so it must run to
# len(df) + 1 to produce exactly one value per row.
df['row_ind'] = range(1, len(df) + 1)
however, I have trouble to make this line in pythonic way:
df[cnt > 2, category := names(which.max(.SD[, label, with = FALSE])), by = row_ind]
Basically, this line says: create a new column called category by comparing the columns listed in label and, for each row, assigning the name of whichever column has the max value. How can I do this easily in Python?
logic translation:
df[cnt > 2, category := names(which.max(.SD[, label, with = FALSE])), by = row_ind]
this line telling us that first do filter by cnt column where cnt > 2, then compare columns values of df[["RECENT_MOV", "RETIRED", "SERV_EMPL", "SUB_BUS","WORK_HOME"]] and pick the column with highest value by row-wise and assign that name of that column as value to df['category']=col_name_with_highest_value_in_each_row.
desirable output
this is desirable output that I want to produce in python:
a b cnt RECENT_MOV RETIRED SERV_EMPL SUB_BUS WORK_HOME category row_ind
1 5 E 13 70 189 65 894 553 SUB_BUS 1
2 10 A 88 195 195 151 829 739 SUB_BUS 2
3 1 D 45 620 191 415 1798 454 SUB_BUS 3
4 6 H 92 572 88 383 502 137 RECENT_MOV 4
5 7 J 70 354 148 255 897 435 SUB_BUS 5
6 3 F 13 648 186 207 1461 1042 SUB_BUS 6
7 2 G 88 798 39 210 744 316 RECENT_MOV 7
8 8 I 45 657 78 470 1991 697 SUB_BUS 8
9 4 C 92 233 158 181 260 517 WORK_HOME 9
10 9 B 70 672 55 188 1697 1158 SUB_BUS 10
This is actually really simple with pandas. Have a list of the columns to search in, and then use idxmax with axis=1:
# Filter out rows where `cnt` is less than or equal to 2.  Take a copy so the
# column assignments below modify an independent frame rather than a slice
# of the original.
df = df[df['cnt'] > 2].copy()
# Determine category for each row: the name of the column with the largest value
search_cols = ['RECENT_MOV', 'RETIRED', 'SERV_EMPL', 'SUB_BUS', 'WORK_HOME']
df['category'] = df[search_cols].idxmax(axis=1)
# Assign row indexes
df['row_ind'] = df.index
Output:
>>> df
a b cnt RECENT_MOV RETIRED SERV_EMPL SUB_BUS WORK_HOME category row_ind
1 1 C 76 452 62 55 115 247 RECENT_MOV 1
2 7 E 14 50 165 337 1165 810 SUB_BUS 2
3 2 A 46 523 167 423 784 707 SUB_BUS 3
4 3 H 3 38 144 473 745 437 SUB_BUS 4
5 5 I 59 743 127 261 351 190 RECENT_MOV 5
6 8 J 76 143 49 470 1612 935 SUB_BUS 6
7 4 D 14 818 101 418 1919 314 SUB_BUS 7
8 6 F 46 714 9 446 1432 938 SUB_BUS 8
9 10 B 3 585 160 14 107 489 RECENT_MOV 9
10 9 G 59 814 73 449 937 287 SUB_BUS 10

Sampling from static data set to create dataframe, ignore index in Python

I am trying to create some random samples (of a given size) from a static dataframe. The goal is to create multiple columns for each sample (and each sample drawn is the same size). I'm expecting to see multiple columns of the same length (i.e. sample size) in the fully sampled dataframe, but maybe append isn't the right way to go. Here is the code:
# create sample dataframe
target_df = pd.DataFrame(np.arange(1000))
target_df.columns = ['pl']
# create the sampler:
sample_num = 5
sample_len = 10
df_max_row = len(target_df) - sample_len
rows = []
for i in range(sample_num):
    rndm_start = np.random.choice(df_max_row, 1)[0]
    rndm_end = rndm_start + sample_len
    slicer = target_df.iloc[rndm_start:rndm_end]['pl']
    rows.append(slicer)
# One row per sample (DataFrame.append was removed in pandas 2.0).
# NOTE: every slice still carries its original row labels, so the samples are
# aligned on those labels instead of on their position 0..sample_len-1.
sampled_df = pd.DataFrame(rows).reset_index(drop=True)
sampled_df = sampled_df.T
The output of this is shown in the pic below - The red line shows the index I want remove.
The desired output is shown below that. How do I make this happen?
Thanks!
I would create new column using
sampled_df[i] = slicer.reset_index(drop=True)
Eventually I would use str(i) for column name because later it is simpler to select column using string than number
import pandas as pd
import random
target_df = pd.DataFrame({'pl': range(1000)})
# create the sampler:
sample_num = 5
sample_len = 10
# last start position that still leaves room for a full sample
df_max_row = len(target_df) - sample_len
sampled_df = pd.DataFrame()
for i in range(1, sample_num + 1):
    start = random.randint(0, df_max_row)
    end = start + sample_len
    slicer = target_df[start:end]['pl']
    # drop the original row labels so every sample lines up on 0..sample_len-1
    sampled_df[str(i)] = slicer.reset_index(drop=True)
# number the rows from 1 for display
sampled_df.index += 1
print(sampled_df)
Result:
1 2 3 4 5
1 735 396 646 534 769
2 736 397 647 535 770
3 737 398 648 536 771
4 738 399 649 537 772
5 739 400 650 538 773
6 740 401 651 539 774
7 741 402 652 540 775
8 742 403 653 541 776
9 743 404 654 542 777
10 744 405 655 543 778
But to create really random values then I would first shuffle values
np.random.shuffle(target_df['pl'])
and then I don't have to use random to select start
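shuffle changes the original column in place and returns None, so its result cannot be assigned to a new variable.
Because the shuffled column is then cut into non-overlapping slices, no value is repeated across the samples.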
import pandas as pd
#import numpy as np
import random
target_df = pd.DataFrame({'pl': range(1000)})
# create the sampler:
sample_num = 5
sample_len = 10
sampled_df = pd.DataFrame()
# Shuffle the column and write it back explicitly; shuffling the Series
# returned by target_df['pl'] in place is not guaranteed to update target_df.
target_df['pl'] = target_df['pl'].sample(frac=1).to_numpy()
for i in range(1, sample_num + 1):
    # consecutive, non-overlapping chunks starting at row 0
    start = (i - 1) * sample_len
    end = start + sample_len
    slicer = target_df[start:end]['pl']
    sampled_df[str(i)] = slicer.reset_index(drop=True)
# number the rows from 1 for display
sampled_df.index += 1
print(sampled_df)
Result:
1 2 3 4 5
1 638 331 171 989 170
2 22 643 47 136 764
3 969 455 211 763 194
4 859 384 174 552 566
5 221 829 62 926 414
6 4 895 951 967 381
7 758 688 594 876 873
8 757 691 825 693 707
9 235 353 34 699 121
10 447 81 36 682 251
If values can repeat then you could use
sampled_df[str(i)] = target_df['pl'].sample(n=sample_len, ignore_index=True)
import pandas as pd
target_df = pd.DataFrame({'pl': range(1000)})
# create the sampler:
sample_num = 5
sample_len = 10
sampled_df = pd.DataFrame()
for i in range(1, sample_num + 1):
    # each call draws independently, so a value may appear in several samples
    sampled_df[str(i)] = target_df['pl'].sample(n=sample_len, ignore_index=True)
# number the rows from 1 for display
sampled_df.index += 1
print(sampled_df)
EDIT
You may also get shuffled values as numpy array and use reshape - and later convert back to DataFrame with many columns. And later you can get some columns.
import pandas as pd
import random
target_df = pd.DataFrame({'pl': range(1000)})
# create the sampler:
sample_num = 5
sample_len = 10
# Shuffle the column and write it back explicitly; shuffling the Series
# returned by target_df['pl'] in place is not guaranteed to update target_df.
target_df['pl'] = target_df['pl'].sample(frac=1).to_numpy()
# sample_len rows; -1 lets numpy work out the number of columns
sampled_df = pd.DataFrame(target_df['pl'].values.reshape([sample_len, -1]))
# keep only the first sample_num columns
sampled_df = sampled_df.iloc[:, 0:sample_num]
sampled_df.index += 1
print(sampled_df)

How to compare two values at a specific location in a loop, and append data in a range of values in Pandas Dataframe

I have a dataframe, from where I extracted some sample data:
Time Val
0 70000 -322
1 70500 -439
2 71000 -528
3 71500 -606
4 72000 -642
5 72500 -663
6 73000 -620
7 73500 -561
8 74000 -592
9 74500 -614
10 75000 -630
11 75500 -719
12 80000 -613
13 80500 -127
14 81000 -235
15 81500 -186
16 82000 -82
17 82500 836
18 83000 1137
183 70000 -106
184 70500 -117
185 71000 -626
186 71500 -810
187 72000 -822
188 72500 -676
189 73000 -639
190 73500 -664
191 74000 -708
192 74500 -515
193 75000 -61
194 75500 -121
195 80000 -145
196 80500 -57
197 81000 -133
198 81500 101
199 82000 235
200 82500 585
201 83000 550
366 70000 18
367 70500 138
368 71000 22
369 71500 -68
370 72000 -146
371 72500 -163
372 73000 -251
373 73500 -230
374 74000 -218
375 74500 -137
376 75000 -126
Now I would like to compare the value from 'Val' at time 73000 with the value [i-3].
If the value is less, then append the continuous values to the list until Time has reached 80000.
I wrote this loop but the problem is that 'Val' compares ALL values [i-3] between 73000 and 80000. I want that the comparison happens ONLY at 73000, and if the condition is true, write the data to the list (until Time 80000)
box = []
for i in df.index:
    # NOTE: the Val[i] < Val[i-3] test runs on every row whose Time lies in
    # [73000, 80000], not only on the row at 73000.
    if df.Time[i] >= 73000 and df.Time[i] <= 80000 and df.Val[i] < df.Val[i-3]:
        box.append(
            {
                'Time': df.Time[i],
                'newVAL': df.Val[i],
            }
        )
box = pd.DataFrame(box, columns=['Time', 'newVAL'])
How could I change the code in order to achieve this?
You need to remember the result of the comparison in another variable, and reset it whenever you encounter a time value outside your desired interval. The code would look like this.
box = []
# True while we are inside a window whose 73000 row passed the comparison
writeToList = False
for i in df.index:
    in_window = 73000 <= df.Time[i] <= 80000
    if not in_window:
        # leaving the window resets the decision
        writeToList = False
    # Decide only at the 73000 row; skip the test when there is no row
    # labelled i-3 to compare against.
    if df.Time[i] == 73000 and (i - 3) in df.index and df.Val[i] < df.Val[i - 3]:
        writeToList = True
    if writeToList and in_window:
        box.append(
            {
                'Time': df.Time[i],
                'newVAL': df.Val[i],
            }
        )
box = pd.DataFrame(box, columns=['Time', 'newVAL'])
Hope this helps.

I want to compare values in a dataframe column and report the index for the value that satisfy a conditional argument?

Unnamed: 4 GDP in billions of chained 2009 dollars.1
214 2000q1 12359.1
215 2000q2 12592.5
216 2000q3 12607.7
217 2000q4 12679.3
218 2001q1 12643.3
219 2001q2 12710.3
220 2001q3 12670.1
221 2001q4 12705.3
222 2002q1 12822.3
223 2002q2 12893.0
224 2002q3 12955.8
225 2002q4 12964.0
226 2003q1 13031.2
227 2003q2 13152.1
228 2003q3 13372.4
229 2003q4 13528.7
230 2004q1 13606.5
231 2004q2 13706.2
232 2004q3 13830.8
233 2004q4 13950.4
234 2005q1 14099.1
235 2005q2 14172.7
236 2005q3 14291.8
237 2005q4 14373.4
238 2006q1 14546.1
239 2006q2 14589.6
240 2006q3 14602.6
241 2006q4 14716.9
242 2007q1 14726.0
243 2007q2 14838.7
... ... ...
250 2009q1 14375.0
251 2009q2 14355.6
252 2009q3 14402.5
253 2009q4 14541.9
254 2010q1 14604.8
255 2010q2 14745.9
256 2010q3 14845.5
257 2010q4 14939.0
258 2011q1 14881.3
259 2011q2 14989.6
260 2011q3 15021.1
261 2011q4 15190.3
262 2012q1 15291.0
263 2012q2 15362.4
264 2012q3 15380.8
265 2012q4 15384.3
266 2013q1 15491.9
267 2013q2 15521.6
268 2013q3 15641.3
269 2013q4 15793.9
270 2014q1 15747.0
271 2014q2 15900.8
272 2014q3 16094.5
273 2014q4 16186.7
274 2015q1 16269.0
275 2015q2 16374.2
276 2015q3 16454.9
277 2015q4 16490.7
278 2016q1 16525.0
279 2016q2 16583.1
I have the above dataframe. I want to compare the values in the column GDP in billions of chained 2009 dollars.1 and report the index and value of the row for which the value of the column is consecutively less for two values above it. I am using the following code but i am not getting the result
datan = pd.read_excel('gdplev.xls', skiprows = 5)
datan.drop(datan.iloc[0:230, 0:4], inplace = True, axis = 1)
datan = datan[214:]
datan = datan.drop(['GDP in billions of current dollars.1', 'Unnamed: 7'], axis = 1)
datan
for item in datan['GDP in billions of chained 2009 dollars.1']:
    # NOTE: item+1 and item+2 add to the value itself rather than looking at
    # the following rows, so this condition can never be true for a number.
    if item > item+1 and item+1 > item+2:
        print(item+2)
Please help
I suggest the following:
# First I reproduce a similar DataFrame than yours
import pandas as pd
import numpy as np
np.random.seed(123)
# Ten consecutive quarters labelled like the original data ('2000q1', ...).
# A PeriodIndex avoids the deprecated "Q" offset alias of date_range.
quarters = pd.period_range("2000Q1", periods=10, freq="Q")
df = pd.DataFrame({"quarter": ["%dq%d" % (q.year, q.quarter) for q in quarters],
                   "gdp": np.random.rand(10)*10000})
# Then I create two columns that are the lags of gdp
df["gdpN_1"] = df["gdp"].shift()
df["gdpN_2"] = df["gdp"].shift(2)
# I create a top when gdp is below gdp at past quarter and the quarter before that
# (comparisons against the NaN lags of the first two rows are False)
df["top"] = (df["gdp"] < df["gdpN_1"]) & (df["gdp"] < df["gdpN_2"])
# I only select rows for which top is True
new_df = df.loc[df["top"], ["quarter", "gdp"]]
And the result for new_df is :
quarter gdp
2 2000q3 2268.514536
5 2001q2 4231.064601
8 2002q1 4809.319015
9 2002q2 3921.175182

Python selecting items by comparing values in a table using dictionary

I have a table with 12 columns and want to select the items in the first column (qseqid) based on the second column (sseqid). That is, the values in the second column (sseqid) repeat, with different values in the 11th and 12th columns, which are evalue and bitscore, respectively.
The ones that I would like to get are those having the lowest evalue and the highest bitscore (when evalues are the same; the rest of the columns can be ignored, and the data is down below).
So, I have written a short piece of code which uses the second column as a key for the dictionary. I can get five different items from the second column with lists of qseqid + evalue and qseqid + bitscore.
Here is the code:
#!/usr/bin/python
filename = "data.txt"
# sseqid -> list of [qseqid, evalue] and [qseqid, bitscore] pairs (as strings)
d = dict()
# the with-block closes the file even if a line fails to parse
with open(filename, "r") as readfile:
    # NOTE: the header row is not skipped, so it is stored under the key 'sseqid'
    for line in readfile:
        fields = line.strip().split("\t")
        d.setdefault(fields[1], []).append([fields[0], fields[10]])
        d.setdefault(fields[1], []).append([fields[0], fields[11]])
for x in d:
    print(x, d[x])
But, I am struggling to get the qseqid with the lowest evalue and the highest bitscore for each sseqid.
Is there any good logic to solve the problem?
The data.txt file (including the header row and with » representing tab characters)
qseqid»sseqid»pident»length»mismatch»gapopen»qstart»qend»sstart»send»evalue»bitscore
ACLA_022040»TBB»32.71»431»258»8»39»468»24»423»2.00E-76»240
ACLA_024600»TBB»80»435»87»0»1»435»1»435»0»729
ACLA_031860»TBB»39.74»453»251»3»1»447»1»437»1.00E-121»357
ACLA_046030»TBB»75.81»434»105»0»1»434»1»434»0»704
ACLA_072490»TBB»41.7»446»245»3»4»447»3»435»2.00E-120»353
ACLA_010400»EF1A»27.31»249»127»8»69»286»9»234»3.00E-13»61.6
ACLA_015630»EF1A»22»491»255»17»186»602»3»439»8.00E-19»78.2
ACLA_016510»EF1A»26.23»122»61»4»21»127»9»116»2.00E-08»46.2
ACLA_023300»EF1A»29.31»447»249»12»48»437»3»439»2.00E-45»155
ACLA_028450»EF1A»85.55»443»63»1»1»443»1»442»0»801
ACLA_074730»CALM»23.13»147»101»4»6»143»2»145»7.00E-08»41.2
ACLA_096170»CALM»29.33»150»96»4»34»179»2»145»1.00E-13»55.1
ACLA_016630»CALM»23.9»159»106»5»58»216»4»147»5.00E-12»51.2
ACLA_031930»RPB2»36.87»1226»633»24»121»1237»26»1219»0»734
ACLA_065630»RPB2»65.79»1257»386»14»1»1252»4»1221»0»1691
ACLA_082370»RPB2»27.69»1228»667»37»31»1132»35»1167»7.00E-110»365
ACLA_061960»ACT»28.57»147»95»5»146»284»69»213»3.00E-12»57.4
ACLA_068200»ACT»28.73»463»231»13»16»471»4»374»1.00E-53»176
ACLA_069960»ACT»24.11»141»97»4»581»718»242»375»9.00E-09»46.2
ACLA_095800»ACT»91.73»375»31»0»1»375»1»375»0»732
And here's a little more readable version of the table's contents:
0 1 2 3 4 5 6 7 8 9 10 11
qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
ACLA_022040 TBB 32.71 431 258 8 39 468 24 423 2.00E-76 240
ACLA_024600 TBB 80 435 87 0 1 435 1 435 0 729
ACLA_031860 TBB 39.74 453 251 3 1 447 1 437 1.00E-121 357
ACLA_046030 TBB 75.81 434 105 0 1 434 1 434 0 704
ACLA_072490 TBB 41.7 446 245 3 4 447 3 435 2.00E-120 353
ACLA_010400 EF1A 27.31 249 127 8 69 286 9 234 3.00E-13 61.6
ACLA_015630 EF1A 22 491 255 17 186 602 3 439 8.00E-19 78.2
ACLA_016510 EF1A 26.23 122 61 4 21 127 9 116 2.00E-08 46.2
ACLA_023300 EF1A 29.31 447 249 12 48 437 3 439 2.00E-45 155
ACLA_028450 EF1A 85.55 443 63 1 1 443 1 442 0 801
ACLA_074730 CALM 23.13 147 101 4 6 143 2 145 7.00E-08 41.2
ACLA_096170 CALM 29.33 150 96 4 34 179 2 145 1.00E-13 55.1
ACLA_016630 CALM 23.9 159 106 5 58 216 4 147 5.00E-12 51.2
ACLA_031930 RPB2 36.87 1226 633 24 121 1237 26 1219 0 734
ACLA_065630 RPB2 65.79 1257 386 14 1 1252 4 1221 0 1691
ACLA_082370 RPB2 27.69 1228 667 37 31 1132 35 1167 7.00E-110 365
ACLA_061960 ACT 28.57 147 95 5 146 284 69 213 3.00E-12 57.4
ACLA_068200 ACT 28.73 463 231 13 16 471 4 374 1.00E-53 176
ACLA_069960 ACT 24.11 141 97 4 581 718 242 375 9.00E-09 46.2
ACLA_095800 ACT 91.73 375 31 0 1 375 1 375 0 732
Since you're a Python newbie I'm glad that there are several examples of how to do this manually, but for comparison I'll show how it can be done using the pandas library, which makes working with tabular data much simpler.
Since you didn't provide example output, I'm assuming that by "with the lowest evalue and the highest bitscore for each sseqid" you mean "the highest bitscore among the lowest evalues" for a given sseqid; if you want those separately, that's trivial too.
import pandas as pd
df = pd.read_csv("acla1.dat", sep="\t")
# DataFrame.sort() no longer exists; sort_values() is its replacement.
# Lowest evalue first, and within equal evalues the highest bitscore first.
df = df.sort_values(["evalue", "bitscore"], ascending=[True, False])
# The first row of each sseqid group is therefore the best hit.
df_new = df.groupby("sseqid", as_index=False).first()
which produces
>>> df_new
sseqid qseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
0 ACT ACLA_095800 91.73 375 31 0 1 375 1 375 0.000000e+00 732.0
1 CALM ACLA_096170 29.33 150 96 4 34 179 2 145 1.000000e-13 55.1
2 EF1A ACLA_028450 85.55 443 63 1 1 443 1 442 0.000000e+00 801.0
3 RPB2 ACLA_065630 65.79 1257 386 14 1 1252 4 1221 0.000000e+00 1691.0
4 TBB ACLA_024600 80.00 435 87 0 1 435 1 435 0.000000e+00 729.0
Basically, first we read the data file into an object called a DataFrame, which is kind of like an Excel worksheet. Then we sort by evalue ascending (so that lower evalues come first) and by bitscore descending (so that higher bitscores come first). Then we can use groupby to collect the data in groups of equal sseqid, and take the first one in each group, which because of the sorting will be the one we want.
#!usr/bin/python
import csv
DATA = "data.txt"
class Sequence:
    """One row of the 12-column hit table, with numeric fields converted.

    row is an indexable sequence of 12 strings in column order:
    qseqid, sseqid, pident, length, mismatch, gapopen, qstart, qend,
    sstart, send, evalue, bitscore.

    Raises IndexError if row is too short and ValueError if a numeric
    field cannot be converted.
    """

    # Output order used by __str__; matches the input column order.
    _FIELDS = (
        "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
        "qstart", "qend", "sstart", "send", "evalue", "bitscore",
    )

    def __init__(self, row):
        self.qseqid = row[0]
        self.sseqid = row[1]
        self.pident = float(row[2])
        self.length = int(row[3])
        self.mismatch = int(row[4])
        self.gapopen = int(row[5])
        self.qstart = int(row[6])
        self.qend = int(row[7])
        self.sstart = int(row[8])
        self.send = int(row[9])
        self.evalue = float(row[10])
        self.bitscore = float(row[11])

    def __str__(self):
        """Return the twelve fields joined by tab characters."""
        return "\t".join(str(getattr(self, name)) for name in self._FIELDS)
def entries(fname, header_rows=1, dtype=list, **kwargs):
    """Yield the data rows of a delimited text file.

    fname: path of the file to read.
    header_rows: number of leading rows to skip.
    dtype: callable applied to each row (a list of strings) before yielding.
    kwargs: passed through to csv.reader (e.g. delimiter="\\t").
    """
    # newline="" lets the csv module do its own line-ending handling
    with open(fname, newline="") as inf:
        incsv = csv.reader(inf, **kwargs)
        # Skip header rows.  The default stops a file with fewer rows than
        # header_rows from raising StopIteration inside the generator, which
        # Python would turn into a RuntimeError.
        for _ in range(header_rows):
            next(incsv, None)
        for row in incsv:
            yield dtype(row)
def main():
    """Print, for each sseqid, the row with the lowest evalue.

    Ties on evalue are broken by the highest bitscore.  Results are printed
    in sorted sseqid order.
    """
    bestseq = {}
    for seq in entries(DATA, dtype=Sequence, delimiter="\t"):
        # best sequence seen so far for the same sseqid, if any
        prev = bestseq.get(seq.sseqid)
        # tuple comparison: lower evalue wins, then higher bitscore
        if prev is None or (seq.evalue, -seq.bitscore) < (prev.evalue, -prev.bitscore):
            bestseq[seq.sseqid] = seq
    # display selected sequences
    for key in sorted(bestseq):
        print(bestseq[key])


if __name__ == "__main__":
    main()
which results in
ACLA_095800 ACT 91.73 375 31 0 1 375 1 375 0.0 732.0
ACLA_096170 CALM 29.33 150 96 4 34 179 2 145 1e-13 55.1
ACLA_028450 EF1A 85.55 443 63 1 1 443 1 442 0.0 801.0
ACLA_065630 RPB2 65.79 1257 386 14 1 1252 4 1221 0.0 1691.0
ACLA_024600 TBB 80.0 435 87 0 1 435 1 435 0.0 729.0
While not nearly as elegant and concise as using the pandas library, it's quite possible to do what you want without resorting to third-party modules. The following uses the collections.defaultdict class to facilitate creation of dictionaries of variable-length lists of records. The use of the AttrDict class is optional, but it makes accessing the fields of each dictionary-based record easier and is less awkward-looking than the usual dict['fieldname'] syntax otherwise required.
import csv
from collections import defaultdict, namedtuple
from functools import cmp_to_key
from operator import itemgetter

data_file_name = 'data.txt'
DELIMITER = '\t'
# sseqid -> list of records sharing that sseqid
ssqeid_dict = defaultdict(list)


# from http://stackoverflow.com/a/1144405/355230
def multikeysort(items, columns):
    """Return items sorted by several mapping keys.

    columns is a list of key names; a leading '-' on a name sorts that key
    in descending order.  Earlier columns take precedence over later ones.
    """
    comparers = [((itemgetter(col[1:].strip()), -1) if col.startswith('-') else
                  (itemgetter(col.strip()), 1)) for col in columns]

    def comparer(left, right):
        for fn, mult in comparers:
            a, b = fn(left), fn(right)
            # three-way comparison (the cmp() builtin is Python 2 only)
            result = (a > b) - (a < b)
            if result:
                return mult * result
        # every column compared equal
        return 0

    # sorted() lost its cmp= argument in Python 3; adapt via cmp_to_key
    return sorted(items, key=cmp_to_key(comparer))


# from http://stackoverflow.com/a/15109345/355230
class AttrDict(dict):
    """A dict whose items are also readable and writable as attributes."""

    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        # share storage between items and attributes
        self.__dict__ = self


def main():
    """Print the best record (lowest evalue, then highest bitscore) per sseqid."""
    # csv wants a text-mode file opened with newline='' on Python 3
    with open(data_file_name, newline='') as data_file:
        reader = csv.DictReader(data_file, delimiter=DELIMITER)
        format_spec = '\t'.join('{%s}' % field for field in reader.fieldnames)
        for rec in (AttrDict(r) for r in reader):
            # Convert the two sort fields to numeric values for proper ordering.
            rec.evalue, rec.bitscore = map(float, (rec.evalue, rec.bitscore))
            ssqeid_dict[rec.sseqid].append(rec)

    for ssqeid in sorted(ssqeid_dict):
        # Sort each group of recs with same ssqeid. The first record after sorting
        # will be the one sought that has the lowest evalue and highest bitscore.
        selected = multikeysort(ssqeid_dict[ssqeid], ['evalue', '-bitscore'])[0]
        print(format_spec.format(**selected))


if __name__ == '__main__':
    main()
Output (»represents tabs):
ACLA_095800» ACT» 91.73» 375» 31» 0» 1» 375» 1» 375» 0.0» 732.0
ACLA_096170» CALM» 29.33» 150» 96» 4» 34» 179» 2» 145» 1e-13» 55.1
ACLA_028450» EF1A» 85.55» 443» 63» 1» 1» 443» 1» 442» 0.0» 801.0
ACLA_065630» RPB2» 65.79» 1257» 386» 14» 1» 1252» 4» 1221» 0.0» 1691.0
ACLA_024600» TBB» 80» 435» 87» 0» 1» 435» 1» 435» 0.0» 729.0
filename = 'data.txt'
# Read the table: split every non-blank line on whitespace and drop the
# header row so it is not mistaken for data.  The with-block closes the file.
with open(filename, 'r') as readfile:
    lines = [fields for fields in (raw.split() for raw in readfile) if fields][1:]

sseqid = [fields[1] for fields in lines]
sorted_sseqid = sorted(set(sseqid))
# sseqid -> [list of qseqid, list of evalue, list of bitscore]
sdqDict = {}
for key in sorted_sseqid:
    # match on the sseqid column only, not on any field of the line
    group = [fields for fields in lines if fields[1] == key]
    qseid = [fields[0] for fields in group]
    # numeric conversion so that min()/max() compare numbers, not text
    evalue = [float(fields[10]) for fields in group]
    bitscore = [float(fields[11]) for fields in group]
    sdqDict[key] = [qseid, evalue, bitscore]
print(sdqDict)
print('TBB LOWEST EVALUE' + '---->' + str(min(sdqDict['TBB'][1])))
##I think you can do the list manipulation below to find out the qseqid

Categories