How to scrape tbody from a collapsible table using BeautifulSoup library? - python

Recently I did a project based on a COVID-19 dashboard, where I scrape data from this website, which has a collapsible table. Everything was OK until recently, when the Heroku app started showing some errors. So I reran my code on my local machine, and the error occurred while scraping tbody. I then figured out that the site I scrape has changed or updated the way the table looks, and now my code is not able to grab it. I tried viewing the page source and I am not able to find the table (tbody) that is on this page. But I am able to find tbody and all the data if I inspect a row of the table — I just can't find it in the page source. How can I scrape the table now?
My code:
The table i have to grab:

The data you see on the page is loaded from external URL via Ajax. You can use requests/json module to load it:
# The dashboard table is filled in by Ajax; fetch the same JSON endpoint directly.
import json
import requests

URL = 'https://www.mohfw.gov.in/data/datanew.json'
records = requests.get(URL).json()

# Uncomment to dump the raw payload:
# print(json.dumps(records, indent=4))

# Print a fixed-width summary of each state's figures.
for record in records:
    print('{:<30} {:<10} {:<10} {:<10} {:<10}'.format(
        record['state_name'], record['active'], record['positive'],
        record['cured'], record['death']))
Prints:
Andaman and Nicobar Islands 329 548 214 5
Andhra Pradesh 75720 140933 63864 1349
Arunachal Pradesh 670 1591 918 3
Assam 9814 40269 30357 98
Bihar 17579 51233 33358 296
Chandigarh 369 1051 667 15
Chhattisgarh 2803 9086 6230 53
... and so on.

Try:
import json
import requests
import pandas as pd

# Fetch the per-state records from the JSON endpoint the dashboard uses.
r = requests.get('https://www.mohfw.gov.in/data/datanew.json')
j = json.loads(r.text)

# One list of cell values per JSON object; column order follows the record keys.
data = [[record[key] for key in record] for record in j]
columns = list(j[0])
df = pd.DataFrame(data, columns=columns)

# BUG FIX: the original assigned pd.to_numeric(...).reset_index() — a DataFrame —
# back onto the 'sno' column, corrupting it. Keep just the numeric Series so
# the sort below orders rows correctly.
df['sno'] = pd.to_numeric(df['sno'], errors='coerce')
df = df.sort_values('sno')
print(df.to_string())
prints:
sno state_name active positive cured death new_active new_positive new_cured new_death state_code
0 0 Andaman and Nicobar Islands 329 548 214 5 403 636 226 7 35
1 1 Andhra Pradesh 75720 140933 63864 1349 72188 150209 76614 1407 28
2 2 Arunachal Pradesh 670 1591 918 3 701 1673 969 3 12
3 3 Assam 9814 40269 30357 98 10183 41726 31442 101 18
4 4 Bihar 17579 51233 33358 296 18937 54240 34994 309 10
5 5 Chandigarh 369 1051 667 15 378 1079 683 18 04
6 6 Chhattisgarh 2803 9086 6230 53 2720 9385 6610 55 22
7 7 Dadra and Nagar Haveli and Daman and Diu 412 1100 686 2 418 1145 725 2 26
8 8 Delhi 10705 135598 120930 3963 10596 136716 122131 3989 07
9 9 Goa 1657 5913 4211 45 1707 6193 4438 48 30
10 10 Gujarat 14090 61438 44907 2441 14300 62463 45699 2464 24
and so on...

Related

How to scrape the table of states?

I am trying to scrape the table from:
https://worldpopulationreview.com/states
My code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

URL = 'https://worldpopulationreview.com/states'
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'lxml')

# NOTE(review): this lookup returns None — the table is rendered client-side
# by JavaScript and is absent from the raw HTML — so find_all() below raises.
table = soup.find('table', {'class': 'jsx-a3119e4553b2cac7 table is-striped is-hoverable is-fullwidth tp-table-body is-narrow'})

# Header cells become the DataFrame columns.
headers = [cell.text.strip() for cell in table.find_all('th')]
df = pd.DataFrame(columns=headers)

# Append each body row (skipping the header row) to the frame.
for tr in table.find_all('tr')[1:]:
    df.loc[len(df)] = [td.text.strip() for td in tr.find_all('td')]
df
Currently returns
'NoneType' object has no attribute 'find_all'
Clearly the error is because the table variable is returning nothing, but I believe I have the table tag correct.
The table data is dynamically loaded by JavaScript and bs4 can't render JS, but you can do the job with bs4 plus an automation tool such as Selenium, and grab the table using a pandas DataFrame.
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service

# Point Service at your local chromedriver binary.
chrome_service = Service("./chromedriver")
browser = webdriver.Chrome(service=chrome_service)
browser.get('https://worldpopulationreview.com/states')
browser.maximize_window()
time.sleep(8)  # give the JS-rendered table time to appear

# Hand the fully rendered HTML to pandas; the first table on the page is ours.
rendered = BeautifulSoup(browser.page_source, "lxml")
df = pd.read_html(str(rendered))[0]
print(df)

# Alternatively, select the table explicitly before parsing:
# table = rendered.select_one('table[class="jsx-a3119e4553b2cac7 table is-striped is-hoverable is-fullwidth tp-table-body is-narrow"]')
# df = pd.read_html(str(table))[0]
# print(df)
Output:
Rank State 2022 Population Growth Rate ... 2010 Population Growth Since 2010 % of US Density (/mi²)
0 1 California 39995077 0.57% ... 37253956 7.36% 11.93% 257
1 2 Texas 29945493 1.35% ... 25145561 19.09% 8.93% 115
2 3 Florida 22085563 1.25% ... 18801310 17.47% 6.59% 412
3 4 New York 20365879 0.41% ... 19378102 5.10% 6.07% 432
4 5 Pennsylvania 13062764 0.23% ... 12702379 2.84% 3.90% 292
5 6 Illinois 12808884 -0.01% ... 12830632 -0.17% 3.82% 231
6 7 Ohio 11852036 0.22% ... 11536504 2.74% 3.53% 290
7 8 Georgia 10916760 0.95% ... 9687653 12.69% 3.26% 190
8 9 North Carolina 10620168 0.86% ... 9535483 11.38% 3.17% 218
9 10 Michigan 10116069 0.19% ... 9883640 2.35% 3.02% 179
10 11 New Jersey 9388414 0.53% ... 8791894 6.78% 2.80% 1277
11 12 Virginia 8757467 0.73% ... 8001024 9.45% 2.61% 222
12 13 Washington 7901429 1.26% ... 6724540 17.50% 2.36% 119
13 14 Arizona 7303398 1.05% ... 6392017 14.26% 2.18% 64
14 15 Massachusetts 7126375 0.68% ... 6547629 8.84% 2.13% 914
15 16 Tennessee 7023788 0.81% ... 6346105 10.68% 2.09% 170
16 17 Indiana 6845874 0.44% ... 6483802 5.58% 2.04% 191
17 18 Maryland 6257958 0.65% ... 5773552 8.39% 1.87% 645
18 19 Missouri 6188111 0.27% ... 5988927 3.33% 1.85% 90
19 20 Wisconsin 5935064 0.35% ... 5686986 4.36% 1.77% 110
20 21 Colorado 5922618 1.27% ... 5029196 17.76% 1.77% 57
21 22 Minnesota 5787008 0.70% ... 5303925 9.11% 1.73% 73
22 23 South Carolina 5217037 0.95% ... 4625364 12.79% 1.56% 174
23 24 Alabama 5073187 0.48% ... 4779736 6.14% 1.51% 100
24 25 Louisiana 4682633 0.27% ... 4533372 3.29% 1.40% 108
25 26 Kentucky 4539130 0.37% ... 4339367 4.60% 1.35% 115
26 27 Oregon 4318492 0.95% ... 3831074 12.72% 1.29% 45
27 28 Oklahoma 4000953 0.52% ... 3751351 6.65% 1.19% 58
28 29 Connecticut 3612314 0.09% ... 3574097 1.07% 1.08% 746
29 30 Utah 3373162 1.53% ... 2763885 22.04% 1.01% 41
30 31 Iowa 3219171 0.45% ... 3046355 5.67% 0.96% 58
31 32 Nevada 3185426 1.28% ... 2700551 17.95% 0.95% 29
32 33 Arkansas 3030646 0.32% ... 2915918 3.93% 0.90% 58
33 34 Mississippi 2960075 -0.02% ... 2967297 -0.24% 0.88% 63
34 35 Kansas 2954832 0.29% ... 2853118 3.57% 0.88% 36
35 36 New Mexico 2129190 0.27% ... 2059179 3.40% 0.64% 18
36 37 Nebraska 1988536 0.68% ... 1826341 8.88% 0.59% 26
37 38 Idaho 1893410 1.45% ... 1567582 20.79% 0.56% 23
38 39 West Virginia 1781860 -0.33% ... 1852994 -3.84% 0.53% 74
39 40 Hawaii 1474265 0.65% ... 1360301 8.38% 0.44% 230
40 41 New Hampshire 1389741 0.44% ... 1316470 5.57% 0.41% 155
41 42 Maine 1369159 0.25% ... 1328361 3.07% 0.41% 44
42 43 Rhode Island 1106341 0.41% ... 1052567 5.11% 0.33% 1070
43 44 Montana 1103187 0.87% ... 989415 11.50% 0.33%
8
44 45 Delaware 1008350 0.92% ... 897934 12.30% 0.30% 517
45 46 South Dakota 901165 0.81% ... 814180 10.68% 0.27% 12
46 47 North Dakota 800394 1.35% ... 672591 19.00% 0.24% 12
47 48 Alaska 738023 0.31% ... 710231 3.91% 0.22%
1
48 49 Vermont 646545 0.27% ... 625741 3.32% 0.19% 70
49 50 Wyoming 579495 0.23% ... 563626 2.82% 0.17%
6
[50 rows x 9 columns]
Table is rendered dynamically from JSON that is placed at the end of the source code, so it does not need selenium — simply extract the tag and load the JSON. This also includes all additional information from the page:
# Parse the page once, then pull the embedded dataset out of the Next.js
# "__NEXT_DATA__" script tag (no browser automation needed).
soup = BeautifulSoup(requests.get('https://worldpopulationreview.com/states').text)
json.loads(soup.select_one('#__NEXT_DATA__').text)['props']['pageProps']['data']
Example
import requests, json
import pandas as pd
from bs4 import BeautifulSoup

# The page ships its full dataset as JSON inside the #__NEXT_DATA__ script tag,
# so we can parse the static HTML and load that JSON directly.
html = BeautifulSoup(requests.get('https://worldpopulationreview.com/states').text)
payload = json.loads(html.select_one('#__NEXT_DATA__').text)
pd.DataFrame(
    payload['props']['pageProps']['data']
)
Example
Cause there are also additional information, that is used for the map, simply choose columns you need by header.
fips
state
densityMi
pop2022
pop2021
pop2020
pop2019
pop2010
growthRate
growth
growthSince2010
area
fill
Name
rank
0
6
California
256.742
39995077
39766650
39538223
39309799
37253956
0.00574419
228427
0.0735793
155779
#084594
California
1
1
48
Texas
114.632
29945493
29545499
29145505
28745507
25145561
0.0135382
399994
0.190886
261232
#084594
Texas
2
2
12
Florida
411.852
22085563
21811875
21538187
21264502
18801310
0.0125477
273688
0.174682
53625
#084594
Florida
3
3
36
New York
432.158
20365879
20283564
20201249
20118937
19378102
0.00405821
82315
0.0509739
47126
#084594
New York
4
4
42
Pennsylvania
291.951
13062764
13032732
13002700
12972667
12702379
0.00230435
30032
0.0283715
44743
#2171b5
Pennsylvania
5
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
45
46
South Dakota
11.887
901165
893916
886667
879421
814180
0.00810926
7249
0.106838
75811
#c6dbef
South Dakota
46
46
38
North Dakota
11.5997
800394
789744
779094
768441
672591
0.0134854
10650
0.190016
69001
#c6dbef
North Dakota
47
47
2
Alaska
1.29332
738023
735707
733391
731075
710231
0.00314799
2316
0.0391309
570641
#c6dbef
Alaska
48
48
50
Vermont
70.147
646545
644811
643077
641347
625741
0.00268916
1734
0.033247
9217
#c6dbef
Vermont
49
49
56
Wyoming
5.96845
579495
578173
576851
575524
563626
0.00228651
1322
0.0281552
97093
#c6dbef
Wyoming
50

Is there a way to have insertion of tags or labels with respect to a certain value range?

I have this data and would like to insert a new column titled 'Level'. I understand
'insert' is a mode of entering in a new column. I tried an 'if' for the argument 'value', but this is not yielding anything.
Data:
Active Discharged Deaths
State/UTs
Andaman and Nicobar 6 7437 129
Andhra Pradesh 14550 1993589 13925
Arunachal Pradesh 634 52507 267
Assam 6415 580491 5710
Bihar 55 716048 9656
Chandigarh 35 64273 814
Chhattisgarh 354 990757 13557
Dadra and Nagar Haveli and Daman and Diu 2 10659 4
Delhi 367 1412542 25082
Goa 885 170391 3210
Gujarat 152 815275 10082
Haryana 617 760271 9685
Himachal Pradesh 1699 209420 3613
Jammu and Kashmir 1286 320337 4410
Jharkhand 126 342716 5133
Karnataka 17412 2901299 37426
Kerala 239338 3966557 21631
Ladakh 54 20327 207
Lakshadweep 9 10288 51
Madhya Pradesh 125 781629 10516
Maharashtra 51234 6300755 137811
Manipur 3180 110602 1802
Meghalaya 2104 73711 1329
Mizoram 11414 54056 226
Nagaland 712 29045 631
Odisha 6322 997790 8055
Puducherry 914 121452 1818
Punjab 326 584079 16444
Rajasthan 86 945097 8954
Sikkim 913 28968 375
Tamil Nadu 16256 2572942 35036
Telengana 5505 650453 3886
Tripura 691 81866 803
Uttar Pradesh 227 1686369 22861
Uttarakhand 379 335358 7388
West Bengal 8480 1525581 18515
code:
# Read the CSV with datascience.Table; the drop() chain was commented out.
data = Table.read_table('IndiaStatus.csv')#.drop('Discharged', 'Discharge Ratio (%)','Total Cases','Active','Deaths')
# NOTE(review): `data2` is used below but never defined in this snippet —
# presumably derived from `data` in code not shown; confirm with the author.
data2.info()
data3 = data2.set_index("State/UTs")
data3 = data3[["Active","Discharged","Deaths"]]
print(data3)
# This line raises SyntaxError: a conditional expression requires an `else`
# clause (and a Series cannot be used as a bare if-condition anyway).
data3.insert(1, column = "Level", value = "Severe" if data3["Active"] > 91874)
output:
line 49
data3.insert(1, column = "Level", value = "Severe" if data3["Active"] > 91874)
^
SyntaxError: invalid syntax
The SyntaxError is because you need an else condition, so something like value = "Severe" if data3["Active"] > 91874 else 'OTHER' would remove the error. That said, it won't work in this case and would return another error about using a Series - in this case data3["Active"] > 91874 - in an if statement.
I believe you can use np.where here
# np.where maps the boolean mask to labels element-wise, which sidesteps the
# "truth value of a Series is ambiguous" error a plain `if` would raise.
# FIX: the original snippet was missing the closing parenthesis of insert().
data3.insert(1, column="Level",
             value=np.where(data3["Active"] > 91874, "Severe", "OTHER"))
Replace OTHER in the above code by any value you want to assign in the column when the condition data3["Active"] > 91874 is not met

in dataframe , how to merge two rows

In a dataframe, how can I merge two rows — e.g. merge row 148 with row 142 into a new line and drop both of them?
title collectionsCount subscribersCount entriesCount viewsCount
148 Android 697977 100213 6803 10610138
142 Java 103821 65303 1493 1590201
161 iOS 163137 65896 3601 3739843
177 JavaScript 222100 88872 2412 3548736
16 Python 45234 45100 1007 930588
162 Swift 28498 30317 1180 928488
20 PHP 15376 25143 375 329720
62 Go 5321 12881 179 145851
41 C++ 3495 18404 101 75019
17 C 2213 14870 50 52019
63 Ruby 1543 6711 40 45162
You can use the method pandas.Series.replace to replace Android to Java then use pandas.DataFrame.groupby to aggregate the data.
This should work:
# Rename 'Android' to 'Java' so the two rows share a key, then aggregate them.
rules = {'Android':'Java'}
# Assign the result instead of calling replace(..., inplace=True) on a column
# view: chained in-place modification is deprecated in modern pandas and can
# silently fail under copy-on-write.
df['title'] = df['title'].replace(rules)
# Summing within each title collapses the two renamed rows into one.
df = df.groupby('title').sum().reset_index()
print(df)
Output:
title collectionsCount subscribersCount entriesCount viewsCount
0 C 2213 14870 50 52019
1 C++ 3495 18404 101 75019
2 Go 5321 12881 179 145851
3 Java 801798 165516 8296 12200339
4 JavaScript 222100 88872 2412 3548736
5 PHP 15376 25143 375 329720
6 Python 45234 45100 1007 930588
7 Ruby 1543 6711 40 45162
8 Swift 28498 30317 1180 928488
9 iOS 163137 65896 3601 3739843

Pivot tables using pandas

I have the following dataframe:
df1= df[['rsa_units','regions','ssno','veteran','pos_off_ttl','occ_ser','grade','gender','ethnicity','age','age_category','service_time','type_appt','disabled','actn_dt','nat_actn_2_3','csc_auth_12','fy']]
this will produce 1.4 mil records. I've taken the first 12.
Eastern Region (R9),Eastern Region (R9),123456789,Non Vet,LBRER,3502,3,Male,White,43.0,Older Gen X'ers,5.0,Temporary,,2009-05-18 00:00:00,115,BDN,2009
Northern Region (R1),Northern Region (R1),234567891,Non Vet,FRSTRY TECHNCN,0462,4,Male,White,37.0,Younger Gen X'ers,7.0,Temporary,,2007-05-27 00:00:00,115,BDN,2007
Northern Region (R1),Northern Region (R1),345678912,Non Vet,FRSTRY AID,0462,3,Male,White,33.0,Younger Gen X'ers,8.0,Temporary,,2006-06-05 00:00:00,115,BDN,2006
Northern Research Station (NRS),Research & Development(RES),456789123,Non Vet,FRSTRY TECHNCN,0462,7,Male,White,37.0,Younger Gen X'ers,10.0,Term,,2006-11-26 00:00:00,702,N6M,2007
Intermountain Region (R4),Intermountain Region (R4),5678912345,Non Vet,BIOLCL SCI TECHNCN,0404,5,Male,White,45.0,Older Gen X'ers,6.0,Temporary,,2008-05-18 00:00:00,115,BWA,2008
Intermountain Region (R4),Intermountain Region (R4),678912345,Non Vet,FRSTRY AID (FIRE),0462,3,Female,White,31.0,Younger Gen X'ers,5.0,Temporary,,2009-05-10 00:00:00,115,BDN,2009
Pacific Southwest Region (R5),Pacific Southwest Region (R5),789123456,Non Vet,FRSTRY AID (FIRE),0462,3,Male,White,31.0,Younger Gen X'ers,3.0,Temporary,,2012-05-06 00:00:00,115,NAM,2012
Pacific Southwest Region (R5),Pacific Southwest Region (R5),891234567,Non Vet,FRSTRY AID (FIRE),0462,3,Male,White,31.0,Younger Gen X'ers,3.0,Temporary,,2011-06-05 00:00:00,115,BDN,2011
Intermountain Region (R4),Intermountain Region (R4),912345678,Non Vet,FRSTRY TECHNCN,0462,5,Male,White,37.0,Younger Gen X'ers,11.0,Temporary,,2006-04-30 00:00:00,115,BDN,2006
Northern Region (R1),Northern Region (R1),987654321,Non Vet,FRSTRY TECHNCN,0462,4,Male,White,37.0,Younger Gen X'ers,11.0,Temporary,,2005-04-11 00:00:00,115,BDN,2005
Southwest Region (R3),Southwest Region (R3),876543219,Non Vet,FRSTRY TECHNCN (HOTSHOT/HANDCREW),0462,4,Male,White,30.0,Gen Y Millennial,4.0,Temporary,,2013-03-24 00:00:00,115,NAM,2013
Southwest Region (R3),Southwest Region (R3),765432198,Non Vet,FRSTRY TECHNCN (RECR),0462,4,Male,White,30.0,Gen Y Millennial,5.0,Temporary,,2010-11-21 00:00:00,115,BDN,2011
I then filter on ['nat_actn_2_3'] for the certain hiring codes.
# Keep only rows whose action code is one of the hiring codes of interest.
h1 = df1[df1['nat_actn_2_3'].isin(['100','101','108','170','171','115','130','140','141','190','702','703'])]
# FIX: DataFrame.sort() was removed in pandas 0.20; sort_values is the
# modern equivalent.
h2 = h1.sort_values('ssno')
# One record per (ssno, actn_dt) pair.
h3 = h2.drop_duplicates(['ssno','actn_dt'])
and can look at value_counts() to see total hires by region.
# Count hires per region (value_counts sorts descending by count).
total_newhires = h3['regions'].value_counts()
total_newhires
produces:
Out[38]:
Pacific Southwest Region (R5) 42255
Pacific Northwest Region (R6) 32081
Intermountain Region (R4) 24045
Northern Region (R1) 22822
Rocky Mountain Region (R2) 17481
Southwest Region (R3) 17305
Eastern Region (R9) 11034
Research & Development(RES) 7337
Southern Region (R8) 7288
Albuquerque Service Center(ASC) 7032
Washington Office(WO) 4837
Alaska Region (R10) 4210
Job Corps(JC) 4010
nda 438
I'd like to do something like in excel where I can have the ['regions'] as my row and the ['fy'] as the columns to give me a total count of numbers based off the ['ssno'] for each ['fy']. It would also be nice to eventually do calculations based off the numbers too, like averages and sums.
Along with looking at examples in the url: http://pandas.pydata.org/pandas-docs/stable/reshaping.html, I've also tried:
hirestable = pivot_table(h3, values=['ethnicity', 'veteran'], rows=['regions'], cols=['fy'])
I'm wondering if groupby may be what I'm looking for?
Any help is appreciated. I've spent 3 days on this and can't seem to put it together.
So based off the answer below I did a pivot using the following code:
h3.pivot_table(values=['ssno'], rows=['nat_actn_2_3'], cols=['fy'], aggfunc=len).
Which produced a somewhat decent result. When I used 'ethnicity' or 'veteran' as a value my results came out really strange and didn't match my value counts numbers. Not sure if the pivot eliminates duplicates or what, but it did not come out correctly.
ssno
fy 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
nat_actn_2_3
100 34 20 25 18 38 43 45 14 19 25 10
101 510 453 725 795 1029 1293 957 383 470 605 145
108 170 132 112 85 123 127 84 43 40 29 10
115 9203 8972 7946 9038 10139 10480 9211 8735 10482 11258 339
130 299 313 431 324 291 325 336 202 230 436 112
140 62 74 71 75 132 125 82 42 45 74 18
141 20 16 23 17 20 14 10 9 13 17 7
170 202 433 226 278 336 386 284 265 121 118 49
171 4771 4627 4234 4196 4470 4472 3270 3145 354 341 34
190 1 1 NaN NaN NaN 1 NaN NaN NaN NaN NaN
702 3141 3099 3429 3030 3758 3952 3813 2902 2329 2375 650
703 2280 2354 2225 2050 2260 2328 2172 2503 2649 2856 726
Try it like this:
# Modern pandas keyword names: index=/columns= (the old rows=/cols= were removed).
h3.pivot_table(values=['ethnicity', 'veteran'], index=['regions'], columns=['fy'], aggfunc=len, fill_value=0)
To get counts use the aggfunc = len
Also your isin references a list of strings, but the data you provide for columns 'nat_actn_2_3' are int
Try:
# Legacy keyword names rows=/cols= — only valid on very old pandas versions.
h3.pivot_table(values=['ethnicity', 'veteran'], rows=['regions'], cols=['fy'], aggfunc=len, fill_value=0)
if you have an older version of pandas

Python selecting items by comparing values in a table using dictionary

I have a table with 12 columns and want to select the items in the first column (qseqid) based on the second column (sseqid). Meaning that the second column (sseqid) is repeating with different values in the 11th and 12th columns, which are evalue and bitscore, respectively.
The ones that I would like to get are the ones having the lowest evalue and the highest bitscore (when evalues are the same; the rest of the columns can be ignored, and the data is down below).
So, I have made a short code which uses the second column as a key for the dictionary. I can get five different items from the second column with lists of qseqid+evalue and qseqid+bitscore.
Here is the code:
#!usr/bin/python
# Group (qseqid, evalue) and (qseqid, bitscore) pairs under each sseqid key.
filename = "data.txt"
readfile = open(filename, "r")
d = dict()
for raw_line in readfile.readlines():
    fields = raw_line.strip().split("\t")
    # Column 1 is sseqid; columns 10 and 11 are evalue and bitscore.
    d.setdefault(fields[1], []).append([fields[0], fields[10]])
    d.setdefault(fields[1], []).append([fields[0], fields[11]])
for key in d:
    print(key, d[key])
readfile.close()
But, I am struggling to get the qseqid with the lowest evalue and the highest bitscore for each sseqid.
Is there any good logic to solve the problem?
The data.txt file (including the header row and with » representing tab characters)
qseqid»sseqid»pident»length»mismatch»gapopen»qstart»qend»sstart»send»evalue»bitscore
ACLA_022040»TBB»32.71»431»258»8»39»468»24»423»2.00E-76»240
ACLA_024600»TBB»80»435»87»0»1»435»1»435»0»729
ACLA_031860»TBB»39.74»453»251»3»1»447»1»437»1.00E-121»357
ACLA_046030»TBB»75.81»434»105»0»1»434»1»434»0»704
ACLA_072490»TBB»41.7»446»245»3»4»447»3»435»2.00E-120»353
ACLA_010400»EF1A»27.31»249»127»8»69»286»9»234»3.00E-13»61.6
ACLA_015630»EF1A»22»491»255»17»186»602»3»439»8.00E-19»78.2
ACLA_016510»EF1A»26.23»122»61»4»21»127»9»116»2.00E-08»46.2
ACLA_023300»EF1A»29.31»447»249»12»48»437»3»439»2.00E-45»155
ACLA_028450»EF1A»85.55»443»63»1»1»443»1»442»0»801
ACLA_074730»CALM»23.13»147»101»4»6»143»2»145»7.00E-08»41.2
ACLA_096170»CALM»29.33»150»96»4»34»179»2»145»1.00E-13»55.1
ACLA_016630»CALM»23.9»159»106»5»58»216»4»147»5.00E-12»51.2
ACLA_031930»RPB2»36.87»1226»633»24»121»1237»26»1219»0»734
ACLA_065630»RPB2»65.79»1257»386»14»1»1252»4»1221»0»1691
ACLA_082370»RPB2»27.69»1228»667»37»31»1132»35»1167»7.00E-110»365
ACLA_061960»ACT»28.57»147»95»5»146»284»69»213»3.00E-12»57.4
ACLA_068200»ACT»28.73»463»231»13»16»471»4»374»1.00E-53»176
ACLA_069960»ACT»24.11»141»97»4»581»718»242»375»9.00E-09»46.2
ACLA_095800»ACT»91.73»375»31»0»1»375»1»375»0»732
And here's a little more readable version of the table's contents:
0 1 2 3 4 5 6 7 8 9 10 11
qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
ACLA_022040 TBB 32.71 431 258 8 39 468 24 423 2.00E-76 240
ACLA_024600 TBB 80 435 87 0 1 435 1 435 0 729
ACLA_031860 TBB 39.74 453 251 3 1 447 1 437 1.00E-121 357
ACLA_046030 TBB 75.81 434 105 0 1 434 1 434 0 704
ACLA_072490 TBB 41.7 446 245 3 4 447 3 435 2.00E-120 353
ACLA_010400 EF1A 27.31 249 127 8 69 286 9 234 3.00E-13 61.6
ACLA_015630 EF1A 22 491 255 17 186 602 3 439 8.00E-19 78.2
ACLA_016510 EF1A 26.23 122 61 4 21 127 9 116 2.00E-08 46.2
ACLA_023300 EF1A 29.31 447 249 12 48 437 3 439 2.00E-45 155
ACLA_028450 EF1A 85.55 443 63 1 1 443 1 442 0 801
ACLA_074730 CALM 23.13 147 101 4 6 143 2 145 7.00E-08 41.2
ACLA_096170 CALM 29.33 150 96 4 34 179 2 145 1.00E-13 55.1
ACLA_016630 CALM 23.9 159 106 5 58 216 4 147 5.00E-12 51.2
ACLA_031930 RPB2 36.87 1226 633 24 121 1237 26 1219 0 734
ACLA_065630 RPB2 65.79 1257 386 14 1 1252 4 1221 0 1691
ACLA_082370 RPB2 27.69 1228 667 37 31 1132 35 1167 7.00E-110 365
ACLA_061960 ACT 28.57 147 95 5 146 284 69 213 3.00E-12 57.4
ACLA_068200 ACT 28.73 463 231 13 16 471 4 374 1.00E-53 176
ACLA_069960 ACT 24.11 141 97 4 581 718 242 375 9.00E-09 46.2
ACLA_095800 ACT 91.73 375 31 0 1 375 1 375 0 732
Since you're a Python newbie I'm glad that there are several examples of how to do this manually, but for comparison I'll show how it can be done using the pandas library which makes working with tabular data much simpler.
Since you didn't provide example output, I'm assuming that by "with the lowest evalue and the highest bitscore for each sseqid" you mean "the highest bitscore among the lowest evalues" for a given sseqid; if you want those separately, that's trivial too.
import pandas as pd

# Read the tab-separated hit table.
df = pd.read_csv("acla1.dat", sep="\t")
# FIX: DataFrame.sort() was removed in pandas 0.20; sort_values replaces it.
# Order so the best hit per sseqid comes first: lowest evalue, then (among
# ties) highest bitscore.
df = df.sort_values(["evalue", "bitscore"], ascending=[True, False])
# Keep the first (best) row of each sseqid group.
df_new = df.groupby("sseqid", as_index=False).first()
which produces
>>> df_new
sseqid qseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
0 ACT ACLA_095800 91.73 375 31 0 1 375 1 375 0.000000e+00 732.0
1 CALM ACLA_096170 29.33 150 96 4 34 179 2 145 1.000000e-13 55.1
2 EF1A ACLA_028450 85.55 443 63 1 1 443 1 442 0.000000e+00 801.0
3 RPB2 ACLA_065630 65.79 1257 386 14 1 1252 4 1221 0.000000e+00 1691.0
4 TBB ACLA_024600 80.00 435 87 0 1 435 1 435 0.000000e+00 729.0
Basically, first we read the data file into an object called a DataFrame, which is kind of like an Excel worksheet. Then we sort by evalue ascending (so that lower evalues come first) and by bitscore descending (so that higher bitscores come first). Then we can use groupby to collect the data in groups of equal sseqid, and take the first one in each group, which because of the sorting will be the one we want.
#!usr/bin/python
import csv
DATA = "data.txt"
class Sequence:
    """One row of a BLAST-style tab-separated hit table."""

    def __init__(self, row):
        # Positional columns of the input row; numeric fields are coerced so
        # evalue/bitscore comparisons work numerically, not lexically.
        self.qseqid = row[0]
        self.sseqid = row[1]
        self.pident = float(row[2])
        self.length = int(row[3])
        self.mismatch = int(row[4])
        self.gapopen = int(row[5])
        self.qstart = int(row[6])
        self.qend = int(row[7])
        self.sstart = int(row[8])
        self.send = int(row[9])
        self.evalue = float(row[10])
        self.bitscore = float(row[11])

    def __str__(self):
        # Re-emit the record as a tab-separated line in input column order.
        names = ("qseqid", "sseqid", "pident", "length", "mismatch",
                 "gapopen", "qstart", "qend", "sstart", "send",
                 "evalue", "bitscore")
        return "\t".join("{}".format(getattr(self, name)) for name in names)
def entries(fname, header_rows=1, dtype=list, **kwargs):
    """Yield each data row of a delimited file as dtype(row).

    The first ``header_rows`` lines are discarded; any extra keyword
    arguments (e.g. ``delimiter``) are forwarded to ``csv.reader``.
    """
    with open(fname) as handle:
        reader = csv.reader(handle, **kwargs)
        for _ in range(header_rows):
            next(reader)  # drop header line(s)
        for record in reader:
            yield dtype(record)
def main():
    """Print, per sseqid, the hit with the lowest evalue (ties: highest bitscore)."""
    bestseq = {}
    for seq in entries(DATA, dtype=Sequence, delimiter="\t"):
        # Current best record for this sseqid, if any.
        incumbent = bestseq.get(seq.sseqid)
        is_better = (
            incumbent is None
            or seq.evalue < incumbent.evalue
            or (seq.evalue == incumbent.evalue and seq.bitscore > incumbent.bitscore)
        )
        if is_better:
            bestseq[seq.sseqid] = seq
    # Report the winners in sseqid order.
    for sseqid in sorted(bestseq):
        print(bestseq[sseqid])


if __name__ == "__main__":
    main()
which results in
ACLA_095800 ACT 91.73 375 31 0 1 375 1 375 0.0 732.0
ACLA_096170 CALM 29.33 150 96 4 34 179 2 145 1e-13 55.1
ACLA_028450 EF1A 85.55 443 63 1 1 443 1 442 0.0 801.0
ACLA_065630 RPB2 65.79 1257 386 14 1 1252 4 1221 0.0 1691.0
ACLA_024600 TBB 80.0 435 87 0 1 435 1 435 0.0 729.0
While not nearly as elegant and concise as using the pandas library, it's quite possible to do what you want without resorting to third-party modules. The following uses the collections.defaultdict class to facilitate creation of dictionaries of variable-length lists of records. The use of the AttrDict class is optional, but it makes accessing the fields of each dictionary-based record easier and is less awkward-looking than the usual dict['fieldname'] syntax otherwise required.
import csv
from collections import defaultdict, namedtuple
from itertools import imap
from operator import itemgetter
data_file_name = 'data.txt'
DELIMITER = '\t'
ssqeid_dict = defaultdict(list)
# from http://stackoverflow.com/a/1144405/355230
def multikeysort(items, columns):
    """Sort a list of dict-like records on multiple keys.

    A leading '-' on a column name means descending order for that key.
    (Python 2 only: relies on the built-in cmp() and sorted(cmp=...).)
    """
    comparers = [
        (itemgetter(col[1:].strip()), -1) if col.startswith('-')
        else (itemgetter(col.strip()), 1)
        for col in columns
    ]

    def comparer(left, right):
        # The first key that differs decides the ordering; records equal on
        # every key compare as 0.
        for key_fn, direction in comparers:
            outcome = cmp(key_fn(left), key_fn(right))
            if outcome:
                return direction * outcome
        return 0

    return sorted(items, cmp=comparer)
# from http://stackoverflow.com/a/15109345/355230
class AttrDict(dict):
    """Dict whose entries are also reachable as attributes (d.key == d['key'])."""

    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        # Aliasing __dict__ to the mapping itself makes attribute access and
        # item access share one storage.
        self.__dict__ = self
with open(data_file_name, 'rb') as data_file:
reader = csv.DictReader(data_file, delimiter=DELIMITER)
format_spec = '\t'.join([('{%s}' % field) for field in reader.fieldnames])
for rec in (AttrDict(r) for r in reader):
# Convert the two sort fields to numeric values for proper ordering.
rec.evalue, rec.bitscore = map(float, (rec.evalue, rec.bitscore))
ssqeid_dict[rec.sseqid].append(rec)
for ssqeid in sorted(ssqeid_dict):
# Sort each group of recs with same ssqeid. The first record after sorting
# will be the one sought that has the lowest evalue and highest bitscore.
selected = multikeysort(ssqeid_dict[ssqeid], ['evalue', '-bitscore'])[0]
print format_spec.format(**selected)
Output (»represents tabs):
ACLA_095800» ACT» 91.73» 375» 31» 0» 1» 375» 1» 375» 0.0» 732.0
ACLA_096170» CALM» 29.33» 150» 96» 4» 34» 179» 2» 145» 1e-13» 55.1
ACLA_028450» EF1A» 85.55» 443» 63» 1» 1» 443» 1» 442» 0.0» 801.0
ACLA_065630» RPB2» 65.79» 1257» 386» 14» 1» 1252» 4» 1221» 0.0» 1691.0
ACLA_024600» TBB» 80» 435» 87» 0» 1» 435» 1» 435» 0.0» 729.0
filename = 'data.txt'
readfile = open(filename,'r')
d = dict()
sseqid=[]
lines=[]
for i in readfile.readlines():
sseqid.append(i.rsplit()[1])
lines.append(i.rsplit())
sorted_sseqid = sorted(set(sseqid))
sdqDict={}
key =None
for sorted_ssqd in sorted_sseqid:
key=sorted_ssqd
evalue=[]
bitscore=[]
qseid=[]
for line in lines:
if key in line:
evalue.append(line[10])
bitscore.append(line[11])
qseid.append(line[0])
sdqDict[key]=[qseid,evalue,bitscore]
print sdqDict
print 'TBB LOWEST EVALUE' + '---->' + min(sdqDict['TBB'][1])
##I think you can do the list manipulation below to find out the qseqid
readfile.close()

Categories