How to scrape tbody from a collapsible table using the BeautifulSoup library? - python
Recently I did a project based on a COVID-19 dashboard, where I scrape data from this website, which has a collapsible table. Everything was fine until recently, when the Heroku app started showing some errors. So I re-ran my code on my local machine, and the error occurred while scraping tbody. Then I figured out that the site I scrape data from has changed or updated the way the table looks, and now my code is not able to grab it. I tried viewing the page source and I cannot find the table (tbody) that is shown on the page. But I can find the tbody and all the data if I inspect a row of the table; I just can't find it in the page source. How can I scrape the table now?
My code:
The table I have to grab:
The data you see on the page is loaded from an external URL via Ajax. You can use the requests and json modules to load it:
import json
import requests
url = 'https://www.mohfw.gov.in/data/datanew.json'
data = requests.get(url).json()
# uncomment to print all data:
# print(json.dumps(data, indent=4))
# print some data on screen:
for d in data:
    print('{:<30} {:<10} {:<10} {:<10} {:<10}'.format(d['state_name'], d['active'], d['positive'], d['cured'], d['death']))
Prints:
Andaman and Nicobar Islands 329 548 214 5
Andhra Pradesh 75720 140933 63864 1349
Arunachal Pradesh 670 1591 918 3
Assam 9814 40269 30357 98
Bihar 17579 51233 33358 296
Chandigarh 369 1051 667 15
Chhattisgarh 2803 9086 6230 53
... and so on.
Try:
import json
import requests
import pandas as pd
data = []
row = []
r = requests.get('https://www.mohfw.gov.in/data/datanew.json')
j = json.loads(r.text)
for i in j:
    for k in i:
        row.append(i[k])
    data.append(row)
    row = []
columns = [i for i in j[0]]
df = pd.DataFrame(data, columns=columns)
df.sno = pd.to_numeric(df.sno, errors='coerce')
df = df.sort_values('sno').reset_index(drop=True)
print(df.to_string())
Prints:
sno state_name active positive cured death new_active new_positive new_cured new_death state_code
0 0 Andaman and Nicobar Islands 329 548 214 5 403 636 226 7 35
1 1 Andhra Pradesh 75720 140933 63864 1349 72188 150209 76614 1407 28
2 2 Arunachal Pradesh 670 1591 918 3 701 1673 969 3 12
3 3 Assam 9814 40269 30357 98 10183 41726 31442 101 18
4 4 Bihar 17579 51233 33358 296 18937 54240 34994 309 10
5 5 Chandigarh 369 1051 667 15 378 1079 683 18 04
6 6 Chhattisgarh 2803 9086 6230 53 2720 9385 6610 55 22
7 7 Dadra and Nagar Haveli and Daman and Diu 412 1100 686 2 418 1145 725 2 26
8 8 Delhi 10705 135598 120930 3963 10596 136716 122131 3989 07
9 9 Goa 1657 5913 4211 45 1707 6193 4438 48 30
10 10 Gujarat 14090 61438 44907 2441 14300 62463 45699 2464 24
and so on...
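Since each entry in datanew.json is already a flat dict, pandas can also build the frame directly, without the manual row/column loops above. A minimal sketch of that shortcut (same URL as above; column handling assumed to follow the JSON keys):

import requests
import pandas as pd

# Fetch the same JSON feed used above and let pandas infer the columns from the dict keys
data = requests.get('https://www.mohfw.gov.in/data/datanew.json').json()
df = pd.DataFrame(data)

# 'sno' arrives as a string; coerce it so the sort is numeric
df['sno'] = pd.to_numeric(df['sno'], errors='coerce')
df = df.sort_values('sno').reset_index(drop=True)
print(df.head())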
Related
How to scrape the table of states?
I am trying to scrape the table from: https://worldpopulationreview.com/states

My code:

from bs4 import BeautifulSoup
import requests
import pandas as pd

url = 'https://worldpopulationreview.com/states'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table = soup.find('table', {'class': 'jsx-a3119e4553b2cac7 table is-striped is-hoverable is-fullwidth tp-table-body is-narrow'})

headers = []
for i in table.find_all('th'):
    title = i.text.strip()
    headers.append(title)

df = pd.DataFrame(columns=headers)
for row in table.find_all('tr')[1:]:
    data = row.find_all('td')
    row_data = [td.text.strip() for td in data]
    length = len(df)
    df.loc[length] = row_data
df

Currently this returns:

'NoneType' object has no attribute 'find_all'

Clearly the error is because the table variable is returning nothing, but I believe I have the table tag correct.
The table data is dynamically loaded by JavaScript, and bs4 can't render JS, but you can still do the job with bs4 plus an automation tool such as selenium, then grab the table as a pandas DataFrame.

from selenium import webdriver
import time
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service

webdriver_service = Service("./chromedriver")  # Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service)
driver.get('https://worldpopulationreview.com/states')
driver.maximize_window()
time.sleep(8)

soup = BeautifulSoup(driver.page_source, "lxml")

# You can pull the table directly from the web page
df = pd.read_html(str(soup))[0]
print(df)

# OR
# table = soup.select_one('table[class="jsx-a3119e4553b2cac7 table is-striped is-hoverable is-fullwidth tp-table-body is-narrow"]')
# df = pd.read_html(str(table))[0]
# print(df)

Output:

    Rank           State 2022 Population Growth Rate  ...  2010 Population Growth Since 2010 % of US Density (/mi²)
0      1      California        39995077       0.57%  ...         37253956             7.36%  11.93%            257
1      2           Texas        29945493       1.35%  ...         25145561            19.09%   8.93%            115
2      3         Florida        22085563       1.25%  ...         18801310            17.47%   6.59%            412
3      4        New York        20365879       0.41%  ...         19378102             5.10%   6.07%            432
4      5    Pennsylvania        13062764       0.23%  ...         12702379             2.84%   3.90%            292
5      6        Illinois        12808884      -0.01%  ...         12830632            -0.17%   3.82%            231
6      7            Ohio        11852036       0.22%  ...         11536504             2.74%   3.53%            290
7      8         Georgia        10916760       0.95%  ...          9687653            12.69%   3.26%            190
8      9  North Carolina        10620168       0.86%  ...          9535483            11.38%   3.17%            218
9     10        Michigan        10116069       0.19%  ...          9883640             2.35%   3.02%            179
10    11      New Jersey         9388414       0.53%  ...          8791894             6.78%   2.80%           1277
11    12        Virginia         8757467       0.73%  ...          8001024             9.45%   2.61%            222
12    13      Washington         7901429       1.26%  ...          6724540            17.50%   2.36%            119
13    14         Arizona         7303398       1.05%  ...          6392017            14.26%   2.18%             64
14    15   Massachusetts         7126375       0.68%  ...          6547629             8.84%   2.13%            914
15    16       Tennessee         7023788       0.81%  ...          6346105            10.68%   2.09%            170
16    17         Indiana         6845874       0.44%  ...          6483802             5.58%   2.04%            191
17    18        Maryland         6257958       0.65%  ...          5773552             8.39%   1.87%            645
18    19        Missouri         6188111       0.27%  ...          5988927             3.33%   1.85%             90
19    20       Wisconsin         5935064       0.35%  ...          5686986             4.36%   1.77%            110
20    21        Colorado         5922618       1.27%  ...          5029196            17.76%   1.77%             57
21    22       Minnesota         5787008       0.70%  ...          5303925             9.11%   1.73%             73
22    23  South Carolina         5217037       0.95%  ...          4625364            12.79%   1.56%            174
23    24         Alabama         5073187       0.48%  ...          4779736             6.14%   1.51%            100
24    25       Louisiana         4682633       0.27%  ...          4533372             3.29%   1.40%            108
25    26        Kentucky         4539130       0.37%  ...          4339367             4.60%   1.35%            115
26    27          Oregon         4318492       0.95%  ...          3831074            12.72%   1.29%             45
27    28        Oklahoma         4000953       0.52%  ...          3751351             6.65%   1.19%             58
28    29     Connecticut         3612314       0.09%  ...          3574097             1.07%   1.08%            746
29    30            Utah         3373162       1.53%  ...          2763885            22.04%   1.01%             41
30    31            Iowa         3219171       0.45%  ...          3046355             5.67%   0.96%             58
31    32          Nevada         3185426       1.28%  ...          2700551            17.95%   0.95%             29
32    33        Arkansas         3030646       0.32%  ...          2915918             3.93%   0.90%             58
33    34     Mississippi         2960075      -0.02%  ...          2967297            -0.24%   0.88%             63
34    35          Kansas         2954832       0.29%  ...          2853118             3.57%   0.88%             36
35    36      New Mexico         2129190       0.27%  ...          2059179             3.40%   0.64%             18
36    37        Nebraska         1988536       0.68%  ...          1826341             8.88%   0.59%             26
37    38           Idaho         1893410       1.45%  ...          1567582            20.79%   0.56%             23
38    39   West Virginia         1781860      -0.33%  ...          1852994            -3.84%   0.53%             74
39    40          Hawaii         1474265       0.65%  ...          1360301             8.38%   0.44%            230
40    41   New Hampshire         1389741       0.44%  ...          1316470             5.57%   0.41%            155
41    42           Maine         1369159       0.25%  ...          1328361             3.07%   0.41%             44
42    43    Rhode Island         1106341       0.41%  ...          1052567             5.11%   0.33%           1070
43    44         Montana         1103187       0.87%  ...           989415            11.50%   0.33%              8
44    45        Delaware         1008350       0.92%  ...           897934            12.30%   0.30%            517
45    46    South Dakota          901165       0.81%  ...           814180            10.68%   0.27%             12
46    47    North Dakota          800394       1.35%  ...           672591            19.00%   0.24%             12
47    48          Alaska          738023       0.31%  ...           710231             3.91%   0.22%              1
48    49         Vermont          646545       0.27%  ...           625741             3.32%   0.19%             70
49    50         Wyoming          579495       0.23%  ...           563626             2.82%   0.17%              6

[50 rows x 9 columns]
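One robustness note on the selenium approach above: the fixed time.sleep(8) can be replaced by an explicit wait, so the script continues as soon as the table is actually present. A small sketch using selenium's WebDriverWait (the generic "table" CSS selector is an assumption):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 15 seconds for a table element to appear instead of sleeping a fixed time
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "table"))
)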
The table is rendered dynamically from JSON that is placed at the end of the source code, so it does not need selenium; simply extract the tag and load the JSON. This also includes all the additional information from the page:

soup = BeautifulSoup(requests.get('https://worldpopulationreview.com/states').text)
json.loads(soup.select_one('#__NEXT_DATA__').text)['props']['pageProps']['data']

Example

import requests, json
import pandas as pd
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get('https://worldpopulationreview.com/states').text)
pd.DataFrame(
    json.loads(soup.select_one('#__NEXT_DATA__').text)['props']['pageProps']['data']
)

Example

Because there is also additional information that is used for the map, simply choose the columns you need by header.

    fips          state  densityMi   pop2022   pop2021   pop2020   pop2019   pop2010  growthRate  growth  growthSince2010    area     fill          Name  rank
0      6     California    256.742  39995077  39766650  39538223  39309799  37253956  0.00574419  228427        0.0735793  155779  #084594    California     1
1     48          Texas     114.632  29945493  29545499  29145505  28745507  25145561   0.0135382  399994         0.190886  261232  #084594         Texas     2
2     12        Florida     411.852  22085563  21811875  21538187  21264502  18801310   0.0125477  273688         0.174682   53625  #084594       Florida     3
3     36       New York     432.158  20365879  20283564  20201249  20118937  19378102  0.00405821   82315        0.0509739   47126  #084594      New York     4
4     42   Pennsylvania     291.951  13062764  13032732  13002700  12972667  12702379  0.00230435   30032        0.0283715   44743  #2171b5  Pennsylvania     5
..   ...            ...        ...       ...       ...       ...       ...       ...         ...     ...              ...     ...      ...           ...   ...
45    46   South Dakota     11.887    901165    893916    886667    879421    814180  0.00810926    7249         0.106838   75811  #c6dbef  South Dakota    46
46    38   North Dakota    11.5997    800394    789744    779094    768441    672591   0.0134854   10650         0.190016   69001  #c6dbef  North Dakota    47
47     2         Alaska    1.29332    738023    735707    733391    731075    710231  0.00314799    2316        0.0391309  570641  #c6dbef        Alaska    48
48    50        Vermont     70.147    646545    644811    643077    641347    625741  0.00268916    1734         0.033247    9217  #c6dbef       Vermont    49
49    56        Wyoming    5.96845    579495    578173    576851    575524    563626  0.00228651    1322        0.0281552   97093  #c6dbef       Wyoming    50
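As a follow-up, once that JSON is in a DataFrame you can keep only the headers you care about; a small sketch, with the column names (state, pop2022, growthRate, densityMi) taken from the output shown above:

import json
import requests
import pandas as pd
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get('https://worldpopulationreview.com/states').text, 'html.parser')
data = json.loads(soup.select_one('#__NEXT_DATA__').text)['props']['pageProps']['data']
df = pd.DataFrame(data)

# Keep just the columns of interest, named as in the headers printed above
print(df[['state', 'pop2022', 'growthRate', 'densityMi']].head())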
Is there a way to have insertion of tags or labels with respect to a certain value range?
I have this data and would like to insert a new column titled 'Level'. I understand insert is a way of adding a new column. I tried an if expression for the value argument, but this is not yielding anything.

Data:

                                           Active  Discharged  Deaths
State/UTs
Andaman and Nicobar                             6        7437     129
Andhra Pradesh                              14550     1993589   13925
Arunachal Pradesh                             634       52507     267
Assam                                        6415      580491    5710
Bihar                                          55      716048    9656
Chandigarh                                     35       64273     814
Chhattisgarh                                  354      990757   13557
Dadra and Nagar Haveli and Daman and Diu        2       10659       4
Delhi                                         367     1412542   25082
Goa                                           885      170391    3210
Gujarat                                       152      815275   10082
Haryana                                       617      760271    9685
Himachal Pradesh                             1699      209420    3613
Jammu and Kashmir                            1286      320337    4410
Jharkhand                                     126      342716    5133
Karnataka                                   17412     2901299   37426
Kerala                                     239338     3966557   21631
Ladakh                                         54       20327     207
Lakshadweep                                     9       10288      51
Madhya Pradesh                                125      781629   10516
Maharashtra                                 51234     6300755  137811
Manipur                                      3180      110602    1802
Meghalaya                                    2104       73711    1329
Mizoram                                     11414       54056     226
Nagaland                                      712       29045     631
Odisha                                       6322      997790    8055
Puducherry                                    914      121452    1818
Punjab                                        326      584079   16444
Rajasthan                                      86      945097    8954
Sikkim                                        913       28968     375
Tamil Nadu                                  16256     2572942   35036
Telengana                                    5505      650453    3886
Tripura                                       691       81866     803
Uttar Pradesh                                 227     1686369   22861
Uttarakhand                                   379      335358    7388
West Bengal                                  8480     1525581   18515

Code:

data = Table.read_table('IndiaStatus.csv')  #.drop('Discharged', 'Discharge Ratio (%)','Total Cases','Active','Deaths')
data2.info()
data3 = data2.set_index("State/UTs")
data3 = data3[["Active","Discharged","Deaths"]]
print(data3)
data3.insert(1, column = "Level", value = "Severe" if data3["Active"] > 91874)

Output:

line 49
    data3.insert(1, column = "Level", value = "Severe" if data3["Active"] > 91874)
                                                                                  ^
SyntaxError: invalid syntax
The SyntaxError is because you need an else clause, so something like value = "Severe" if data3["Active"] > 91874 else 'OTHER' would remove the error. That said, it still won't work in this case: it will raise another error, because using a Series - here data3["Active"] > 91874 - in an if/else expression is ambiguous. I believe you can use np.where here:

data3.insert(1, column = "Level", value = np.where(data3["Active"] > 91874, "Severe", 'OTHER'))

Replace OTHER in the above code with whatever value you want to assign in the column when the condition data3["Active"] > 91874 is not met.
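If more than two levels are needed, the same idea extends to several ranges; one option is pd.cut, which bins a numeric column into labelled intervals. A minimal sketch, where the bin edges and labels are made up for illustration (only the 91874 cut-off comes from the question):

import pandas as pd

# A few rows of the question's data, just enough to demonstrate the binning
df = pd.DataFrame(
    {"Active": [6, 14550, 239338, 51234]},
    index=["Andaman and Nicobar", "Andhra Pradesh", "Kerala", "Maharashtra"])

# Bin Active counts into labelled ranges; edges other than 91874 are illustrative only
df["Level"] = pd.cut(df["Active"],
                     bins=[0, 10000, 91874, float("inf")],
                     labels=["Low", "Moderate", "Severe"])
print(df)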
In a DataFrame, how to merge two rows
In a DataFrame, how can I merge two rows - for example, merge row 148 into row 142 so they become a single new row, and then drop the original two?

          title  collectionsCount  subscribersCount  entriesCount  viewsCount
148     Android            697977            100213          6803    10610138
142        Java            103821             65303          1493     1590201
161         iOS            163137             65896          3601     3739843
177  JavaScript            222100             88872          2412     3548736
16       Python             45234             45100          1007      930588
162       Swift             28498             30317          1180      928488
20          PHP             15376             25143           375      329720
62           Go              5321             12881           179      145851
41          C++              3495             18404           101       75019
17            C              2213             14870            50       52019
63         Ruby              1543              6711            40       45162
You can use the method pandas.Series.replace to replace Android with Java, then use pandas.DataFrame.groupby to aggregate the data. This should work:

rules = {'Android': 'Java'}
df['title'].replace(rules, inplace=True)
df = df.groupby('title').sum().reset_index()
print(df)

Output:

        title  collectionsCount  subscribersCount  entriesCount  viewsCount
0           C              2213             14870            50       52019
1         C++              3495             18404           101       75019
2          Go              5321             12881           179      145851
3        Java            801798            165516          8296    12200339
4  JavaScript            222100             88872          2412     3548736
5         PHP             15376             25143           375      329720
6      Python             45234             45100          1007      930588
7        Ruby              1543              6711            40       45162
8       Swift             28498             30317          1180      928488
9         iOS            163137             65896          3601     3739843
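An alternative that avoids rewriting the title column is to add one row's numeric values into the other with .loc and then drop it; a rough sketch, rebuilding only the two rows involved (index labels 148 and 142 as in the question):

import pandas as pd

# The two rows to be merged, reconstructed from the question's data
df = pd.DataFrame(
    {'title': ['Android', 'Java'],
     'collectionsCount': [697977, 103821],
     'subscribersCount': [100213, 65303],
     'entriesCount': [6803, 1493],
     'viewsCount': [10610138, 1590201]},
    index=[148, 142])

# Add row 148's numeric columns into row 142, then drop row 148
num_cols = ['collectionsCount', 'subscribersCount', 'entriesCount', 'viewsCount']
df.loc[142, num_cols] = df.loc[142, num_cols] + df.loc[148, num_cols]
df = df.drop(index=148)
print(df)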
Pivot tables using pandas
I have the following dataframe:

df1 = df[['rsa_units','regions','ssno','veteran','pos_off_ttl','occ_ser','grade','gender','ethnicity','age','age_category','service_time','type_appt','disabled','actn_dt','nat_actn_2_3','csc_auth_12','fy']]

This produces 1.4 million records. I've taken the first 12:

Eastern Region (R9),Eastern Region (R9),123456789,Non Vet,LBRER,3502,3,Male,White,43.0,Older Gen X'ers,5.0,Temporary,,2009-05-18 00:00:00,115,BDN,2009
Northern Region (R1),Northern Region (R1),234567891,Non Vet,FRSTRY TECHNCN,0462,4,Male,White,37.0,Younger Gen X'ers,7.0,Temporary,,2007-05-27 00:00:00,115,BDN,2007
Northern Region (R1),Northern Region (R1),345678912,Non Vet,FRSTRY AID,0462,3,Male,White,33.0,Younger Gen X'ers,8.0,Temporary,,2006-06-05 00:00:00,115,BDN,2006
Northern Research Station (NRS),Research & Development(RES),456789123,Non Vet,FRSTRY TECHNCN,0462,7,Male,White,37.0,Younger Gen X'ers,10.0,Term,,2006-11-26 00:00:00,702,N6M,2007
Intermountain Region (R4),Intermountain Region (R4),5678912345,Non Vet,BIOLCL SCI TECHNCN,0404,5,Male,White,45.0,Older Gen X'ers,6.0,Temporary,,2008-05-18 00:00:00,115,BWA,2008
Intermountain Region (R4),Intermountain Region (R4),678912345,Non Vet,FRSTRY AID (FIRE),0462,3,Female,White,31.0,Younger Gen X'ers,5.0,Temporary,,2009-05-10 00:00:00,115,BDN,2009
Pacific Southwest Region (R5),Pacific Southwest Region (R5),789123456,Non Vet,FRSTRY AID (FIRE),0462,3,Male,White,31.0,Younger Gen X'ers,3.0,Temporary,,2012-05-06 00:00:00,115,NAM,2012
Pacific Southwest Region (R5),Pacific Southwest Region (R5),891234567,Non Vet,FRSTRY AID (FIRE),0462,3,Male,White,31.0,Younger Gen X'ers,3.0,Temporary,,2011-06-05 00:00:00,115,BDN,2011
Intermountain Region (R4),Intermountain Region (R4),912345678,Non Vet,FRSTRY TECHNCN,0462,5,Male,White,37.0,Younger Gen X'ers,11.0,Temporary,,2006-04-30 00:00:00,115,BDN,2006
Northern Region (R1),Northern Region (R1),987654321,Non Vet,FRSTRY TECHNCN,0462,4,Male,White,37.0,Younger Gen X'ers,11.0,Temporary,,2005-04-11 00:00:00,115,BDN,2005
Southwest Region (R3),Southwest Region (R3),876543219,Non Vet,FRSTRY TECHNCN (HOTSHOT/HANDCREW),0462,4,Male,White,30.0,Gen Y Millennial,4.0,Temporary,,2013-03-24 00:00:00,115,NAM,2013
Southwest Region (R3),Southwest Region (R3),765432198,Non Vet,FRSTRY TECHNCN (RECR),0462,4,Male,White,30.0,Gen Y Millennial,5.0,Temporary,,2010-11-21 00:00:00,115,BDN,2011

I then filter on ['nat_actn_2_3'] for certain hiring codes:

h1 = df1[df1['nat_actn_2_3'].isin(['100','101','108','170','171','115','130','140','141','190','702','703'])]
h2 = h1.sort('ssno')
h3 = h2.drop_duplicates(['ssno','actn_dt'])

and can look at value_counts() to see total hires by region:

total_newhires = h3['regions'].value_counts()
total_newhires

produces:

Out[38]:
Pacific Southwest Region (R5)      42255
Pacific Northwest Region (R6)      32081
Intermountain Region (R4)          24045
Northern Region (R1)               22822
Rocky Mountain Region (R2)         17481
Southwest Region (R3)              17305
Eastern Region (R9)                11034
Research & Development(RES)         7337
Southern Region (R8)                7288
Albuquerque Service Center(ASC)     7032
Washington Office(WO)               4837
Alaska Region (R10)                 4210
Job Corps(JC)                       4010
nda                                  438

I'd like to do something like in Excel, where I can have ['regions'] as my rows and ['fy'] as the columns, giving me a total count based off ['ssno'] for each ['fy']. It would also be nice to eventually do calculations based off the numbers too, like averages and sums.

Along with looking at the examples at http://pandas.pydata.org/pandas-docs/stable/reshaping.html, I've also tried:

hirestable = pivot_table(h3, values=['ethnicity', 'veteran'], rows=['regions'], cols=['fy'])

I'm wondering if groupby may be what I'm looking for? Any help is appreciated. I've spent 3 days on this and can't seem to put it together.

Based off the answer below, I did a pivot using the following code:

h3.pivot_table(values=['ssno'], rows=['nat_actn_2_3'], cols=['fy'], aggfunc=len)

which produced a somewhat decent result. When I used 'ethnicity' or 'veteran' as a value, my results came out really strange and didn't match my value_counts() numbers. Not sure if the pivot eliminates duplicates or what, but it did not come out correctly.

              ssno
fy            2005  2006  2007  2008   2009   2010  2011  2012   2013   2014  2015
nat_actn_2_3
100             34    20    25    18     38     43    45    14     19     25    10
101            510   453   725   795   1029   1293   957   383    470    605   145
108            170   132   112    85    123    127    84    43     40     29    10
115           9203  8972  7946  9038  10139  10480  9211  8735  10482  11258   339
130            299   313   431   324    291    325   336   202    230    436   112
140             62    74    71    75    132    125    82    42     45     74    18
141             20    16    23    17     20     14    10     9     13     17     7
170            202   433   226   278    336    386   284   265    121    118    49
171           4771  4627  4234  4196   4470   4472  3270  3145    354    341    34
190              1     1   NaN   NaN    NaN      1   NaN   NaN    NaN    NaN   NaN
702           3141  3099  3429  3030   3758   3952  3813  2902   2329   2375   650
703           2280  2354  2225  2050   2260   2328  2172  2503   2649   2856   726
Try it like this:

h3.pivot_table(values=['ethnicity', 'veteran'], index=['regions'], columns=['fy'], aggfunc=len, fill_value=0)

To get counts, use aggfunc=len.

Also, your isin references a list of strings, but the data you provide for the column 'nat_actn_2_3' are ints.

Try:

h3.pivot_table(values=['ethnicity', 'veteran'], rows=['regions'], cols=['fy'], aggfunc=len, fill_value=0)

if you have an older version of pandas.
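Since the goal is a count of distinct ssno values per region and fiscal year (and, later, averages and sums), a groupby with an explicit aggregation is another way to get there in current pandas; a rough sketch, assuming h3 has the columns named in the question:

# Count unique ssno values per region and fiscal year, shaped like an Excel pivot
hires_by_fy = (h3.groupby(['regions', 'fy'])['ssno']
                 .nunique()
                 .unstack('fy', fill_value=0))
print(hires_by_fy)

# Other aggregations follow the same pattern, e.g. average service_time per region and fy
avg_service = h3.groupby(['regions', 'fy'])['service_time'].mean().unstack('fy')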
Python selecting items by comparing values in a table using a dictionary
I have a table with 12 columns and want to select the items in the first column (qseqid) based on the second column (sseqid). The second column (sseqid) repeats with different values in the 11th and 12th columns, which are evalue and bitscore, respectively. The ones that I would like to get are those having the lowest evalue and the highest bitscore (when evalues are the same, the rest of the columns can be ignored; the data is down below).

So, I have made a short piece of code which uses the second column as a key for the dictionary. I can get five different items from the second column with lists of qseqid+evalue and qseqid+bitscore. Here is the code:

#!usr/bin/python
filename = "data.txt"
readfile = open(filename, "r")
d = dict()
for i in readfile.readlines():
    i = i.strip()
    i = i.split("\t")
    d.setdefault(i[1], []).append([i[0], i[10]])
    d.setdefault(i[1], []).append([i[0], i[11]])
for x in d:
    print(x, d[x])
readfile.close()

But I am struggling to get the qseqid with the lowest evalue and the highest bitscore for each sseqid. Is there any good logic to solve the problem?

The data.txt file (including the header row and with » representing tab characters):

qseqid»sseqid»pident»length»mismatch»gapopen»qstart»qend»sstart»send»evalue»bitscore
ACLA_022040»TBB»32.71»431»258»8»39»468»24»423»2.00E-76»240
ACLA_024600»TBB»80»435»87»0»1»435»1»435»0»729
ACLA_031860»TBB»39.74»453»251»3»1»447»1»437»1.00E-121»357
ACLA_046030»TBB»75.81»434»105»0»1»434»1»434»0»704
ACLA_072490»TBB»41.7»446»245»3»4»447»3»435»2.00E-120»353
ACLA_010400»EF1A»27.31»249»127»8»69»286»9»234»3.00E-13»61.6
ACLA_015630»EF1A»22»491»255»17»186»602»3»439»8.00E-19»78.2
ACLA_016510»EF1A»26.23»122»61»4»21»127»9»116»2.00E-08»46.2
ACLA_023300»EF1A»29.31»447»249»12»48»437»3»439»2.00E-45»155
ACLA_028450»EF1A»85.55»443»63»1»1»443»1»442»0»801
ACLA_074730»CALM»23.13»147»101»4»6»143»2»145»7.00E-08»41.2
ACLA_096170»CALM»29.33»150»96»4»34»179»2»145»1.00E-13»55.1
ACLA_016630»CALM»23.9»159»106»5»58»216»4»147»5.00E-12»51.2
ACLA_031930»RPB2»36.87»1226»633»24»121»1237»26»1219»0»734
ACLA_065630»RPB2»65.79»1257»386»14»1»1252»4»1221»0»1691
ACLA_082370»RPB2»27.69»1228»667»37»31»1132»35»1167»7.00E-110»365
ACLA_061960»ACT»28.57»147»95»5»146»284»69»213»3.00E-12»57.4
ACLA_068200»ACT»28.73»463»231»13»16»471»4»374»1.00E-53»176
ACLA_069960»ACT»24.11»141»97»4»581»718»242»375»9.00E-09»46.2
ACLA_095800»ACT»91.73»375»31»0»1»375»1»375»0»732

And here's a little more readable version of the table's contents:

0            1       2       3       4         5        6       7     8       9     10         11
qseqid       sseqid  pident  length  mismatch  gapopen  qstart  qend  sstart  send  evalue     bitscore
ACLA_022040  TBB     32.71   431     258       8        39      468   24      423   2.00E-76   240
ACLA_024600  TBB     80      435     87        0        1       435   1       435   0          729
ACLA_031860  TBB     39.74   453     251       3        1       447   1       437   1.00E-121  357
ACLA_046030  TBB     75.81   434     105       0        1       434   1       434   0          704
ACLA_072490  TBB     41.7    446     245       3        4       447   3       435   2.00E-120  353
ACLA_010400  EF1A    27.31   249     127       8        69      286   9       234   3.00E-13   61.6
ACLA_015630  EF1A    22      491     255       17       186     602   3       439   8.00E-19   78.2
ACLA_016510  EF1A    26.23   122     61        4        21      127   9       116   2.00E-08   46.2
ACLA_023300  EF1A    29.31   447     249       12       48      437   3       439   2.00E-45   155
ACLA_028450  EF1A    85.55   443     63        1        1       443   1       442   0          801
ACLA_074730  CALM    23.13   147     101       4        6       143   2       145   7.00E-08   41.2
ACLA_096170  CALM    29.33   150     96        4        34      179   2       145   1.00E-13   55.1
ACLA_016630  CALM    23.9    159     106       5        58      216   4       147   5.00E-12   51.2
ACLA_031930  RPB2    36.87   1226    633       24       121     1237  26      1219  0          734
ACLA_065630  RPB2    65.79   1257    386       14       1       1252  4       1221  0          1691
ACLA_082370  RPB2    27.69   1228    667       37       31      1132  35      1167  7.00E-110  365
ACLA_061960  ACT     28.57   147     95        5        146     284   69      213   3.00E-12   57.4
ACLA_068200  ACT     28.73   463     231       13       16      471   4       374   1.00E-53   176
ACLA_069960  ACT     24.11   141     97        4        581     718   242     375   9.00E-09   46.2
ACLA_095800  ACT     91.73   375     31        0        1       375   1       375   0          732
Since you're a Python newbie I'm glad that there are several examples of how to do this manually, but for comparison I'll show how it can be done using the pandas library, which makes working with tabular data much simpler. Since you didn't provide example output, I'm assuming that by "with the lowest evalue and the highest bitscore for each sseqid" you mean "the highest bitscore among the lowest evalues" for a given sseqid; if you want those separately, that's trivial too.

import pandas as pd

df = pd.read_csv("acla1.dat", sep="\t")
df = df.sort(["evalue", "bitscore"], ascending=[True, False])
df_new = df.groupby("sseqid", as_index=False).first()

which produces

>>> df_new
  sseqid       qseqid  pident  length  mismatch  gapopen  qstart  qend  sstart  send        evalue  bitscore
0    ACT  ACLA_095800   91.73     375        31        0       1   375       1   375  0.000000e+00     732.0
1   CALM  ACLA_096170   29.33     150        96        4      34   179       2   145  1.000000e-13      55.1
2   EF1A  ACLA_028450   85.55     443        63        1       1   443       1   442  0.000000e+00     801.0
3   RPB2  ACLA_065630   65.79    1257       386       14       1  1252       4  1221  0.000000e+00    1691.0
4    TBB  ACLA_024600   80.00     435        87        0       1   435       1   435  0.000000e+00     729.0

Basically, first we read the data file into an object called a DataFrame, which is kind of like an Excel worksheet. Then we sort by evalue ascending (so that lower evalues come first) and by bitscore descending (so that higher bitscores come first). Then we can use groupby to collect the data in groups of equal sseqid, and take the first one in each group, which because of the sorting will be the one we want.
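Note that DataFrame.sort was removed in later pandas releases; the same idea with current APIs might look like the sketch below (the tab-separated data.txt from the question is assumed):

import pandas as pd

df = pd.read_csv("data.txt", sep="\t")

# Lowest evalue first, highest bitscore first, then keep the first row per sseqid
best = (df.sort_values(["evalue", "bitscore"], ascending=[True, False])
          .drop_duplicates("sseqid", keep="first")
          .sort_values("sseqid"))
print(best[["sseqid", "qseqid", "evalue", "bitscore"]])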
#!usr/bin/python
import csv

DATA = "data.txt"

class Sequence:
    def __init__(self, row):
        self.qseqid = row[0]
        self.sseqid = row[1]
        self.pident = float(row[2])
        self.length = int(row[3])
        self.mismatch = int(row[4])
        self.gapopen = int(row[5])
        self.qstart = int(row[6])
        self.qend = int(row[7])
        self.sstart = int(row[8])
        self.send = int(row[9])
        self.evalue = float(row[10])
        self.bitscore = float(row[11])

    def __str__(self):
        return (
            "{qseqid}\t"
            "{sseqid}\t"
            "{pident}\t"
            "{length}\t"
            "{mismatch}\t"
            "{gapopen}\t"
            "{qstart}\t"
            "{qend}\t"
            "{sstart}\t"
            "{send}\t"
            "{evalue}\t"
            "{bitscore}"
        ).format(**self.__dict__)

def entries(fname, header_rows=1, dtype=list, **kwargs):
    with open(fname) as inf:
        incsv = csv.reader(inf, **kwargs)
        # skip header rows
        for i in range(header_rows):
            next(incsv)
        for row in incsv:
            yield dtype(row)

def main():
    bestseq = {}
    for seq in entries(DATA, dtype=Sequence, delimiter="\t"):
        # see if a sequence with the same sseqid already exists
        prev = bestseq.get(seq.sseqid, None)
        if (
            prev is None
            or seq.evalue < prev.evalue
            or (seq.evalue == prev.evalue and seq.bitscore > prev.bitscore)
        ):
            bestseq[seq.sseqid] = seq

    # display selected sequences
    keys = sorted(bestseq)
    for key in keys:
        print(bestseq[key])

if __name__ == "__main__":
    main()

which results in

ACLA_095800  ACT   91.73  375   31   0   1  375  1  375   0.0    732.0
ACLA_096170  CALM  29.33  150   96   4   34 179  2  145   1e-13  55.1
ACLA_028450  EF1A  85.55  443   63   1   1  443  1  442   0.0    801.0
ACLA_065630  RPB2  65.79  1257  386  14  1  1252 4  1221  0.0    1691.0
ACLA_024600  TBB   80.0   435   87   0   1  435  1  435   0.0    729.0
While not nearly as elegant and concise as using the pandas library, it's quite possible to do what you want without resorting to third-party modules. The following uses the collections.defaultdict class to facilitate creation of dictionaries of variable-length lists of records. The use of the AttrDict class is optional, but it makes accessing the fields of each dictionary-based record easier and is less awkward-looking than the usual dict['fieldname'] syntax otherwise required. (Note that this is Python 2 code: it uses the print statement and the cmp built-in.)

import csv
from collections import defaultdict, namedtuple
from itertools import imap
from operator import itemgetter

data_file_name = 'data.txt'
DELIMITER = '\t'
ssqeid_dict = defaultdict(list)

# from http://stackoverflow.com/a/1144405/355230
def multikeysort(items, columns):
    comparers = [((itemgetter(col[1:].strip()), -1) if col.startswith('-') else
                  (itemgetter(col.strip()), 1)) for col in columns]
    def comparer(left, right):
        for fn, mult in comparers:
            result = cmp(fn(left), fn(right))
            if result:
                return mult * result
        else:
            return 0
    return sorted(items, cmp=comparer)

# from http://stackoverflow.com/a/15109345/355230
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

with open(data_file_name, 'rb') as data_file:
    reader = csv.DictReader(data_file, delimiter=DELIMITER)
    format_spec = '\t'.join([('{%s}' % field) for field in reader.fieldnames])

    for rec in (AttrDict(r) for r in reader):
        # Convert the two sort fields to numeric values for proper ordering.
        rec.evalue, rec.bitscore = map(float, (rec.evalue, rec.bitscore))
        ssqeid_dict[rec.sseqid].append(rec)

for ssqeid in sorted(ssqeid_dict):
    # Sort each group of recs with same ssqeid. The first record after sorting
    # will be the one sought that has the lowest evalue and highest bitscore.
    selected = multikeysort(ssqeid_dict[ssqeid], ['evalue', '-bitscore'])[0]
    print format_spec.format(**selected)

Output (» represents tabs):

ACLA_095800» ACT» 91.73» 375» 31» 0» 1» 375» 1» 375» 0.0» 732.0
ACLA_096170» CALM» 29.33» 150» 96» 4» 34» 179» 2» 145» 1e-13» 55.1
ACLA_028450» EF1A» 85.55» 443» 63» 1» 1» 443» 1» 442» 0.0» 801.0
ACLA_065630» RPB2» 65.79» 1257» 386» 14» 1» 1252» 4» 1221» 0.0» 1691.0
ACLA_024600» TBB» 80» 435» 87» 0» 1» 435» 1» 435» 0.0» 729.0
filename = 'data.txt'
readfile = open(filename, 'r')
d = dict()
sseqid = []
lines = []
for i in readfile.readlines():
    sseqid.append(i.rsplit()[1])
    lines.append(i.rsplit())
sorted_sseqid = sorted(set(sseqid))

sdqDict = {}
key = None
for sorted_ssqd in sorted_sseqid:
    key = sorted_ssqd
    evalue = []
    bitscore = []
    qseid = []
    for line in lines:
        if key in line:
            evalue.append(line[10])
            bitscore.append(line[11])
            qseid.append(line[0])
    sdqDict[key] = [qseid, evalue, bitscore]

print sdqDict

print 'TBB LOWEST EVALUE' + '---->' + min(sdqDict['TBB'][1])
## I think you can do the list manipulation below to find out the qseqid

readfile.close()