Using a table id with BeautifulSoup to extract data in Python

I tried to use the following code but it doesn't find the table, despite this having worked on other web pages.
from bs4 import BeautifulSoup
from selenium import webdriver
chromedriver = (r'C:\Users\c\chromedriver.exe')
driver = webdriver.Chrome(chromedriver)
driver.get("https://isodzz.nafta.sk/yCapacity/#/?nav=ss.od.nom.c&lng=EN")
html = driver.page_source
soup = BeautifulSoup(html, "lxml")
table = soup.find_all('table', {'id':'nominations_point_data_c'})
print(table)

Do it like this. First, you need to wait for the table to appear; this site is awfully slow to load. Since the data lives in a real HTML table element, we can use pandas for a neat printout.
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
driver = webdriver.Chrome(executable_path='C:/bin/chromedriver.exe')
driver.get("https://isodzz.nafta.sk/yCapacity/#/?nav=ss.od.nom.c&lng=EN")
element = WebDriverWait(driver, 25).until(EC.visibility_of_element_located((By.CLASS_NAME, "MobileOverflow"))) #Element is present now
page = driver.page_source #Get the HTML of the page
df = pd.read_html(page) #Make pandas read the HTML
table = df[0] #Get the first table on the page
print(table)
Output:
Date: Confirmed Nomination
Date: Injection [MWh] Withdrawal [MWh]
0 01.11.2020 13 410.490 11 626.856
1 02.11.2020 11 874.096 12 227.510
2 03.11.2020 0.000 0.000
3 04.11.2020 0.000 0.000
4 05.11.2020 0.000 0.000
5 06.11.2020 0.000 0.000
6 07.11.2020 0.000 0.000
7 08.11.2020 0.000 0.000
8 09.11.2020 0.000 0.000
9 10.11.2020 0.000 0.000
10 11.11.2020 34 201.032 37 624.672
11 12.11.2020 54 427.560 27 940.872
12 13.11.2020 49 069.584 21 538.372
13 14.11.2020 54 361.138 15 312.000
14 15.11.2020 57 592.332 15 804.000
15 16.11.2020 57 515.424 20 280.000
16 17.11.2020 53 315.328 29 432.000
17 18.11.2020 48 960.672 26 192.000
18 19.11.2020 46 716.561 33 873.233
19 20.11.2020 43 852.200 43 806.382
20 21.11.2020 29 639.328 33 888.000
21 22.11.2020 0.000 0.000
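A variant worth knowing, in case the target table is not the first one on the page: feed pandas only the HTML of the element you waited for. A sketch, reusing the driver and the MobileOverflow wrapper class from above:
# Sketch: parse just the waited-for element, so the table's index on the
# full page no longer matters. Assumes the same driver setup as above.
element = WebDriverWait(driver, 25).until(
    EC.visibility_of_element_located((By.CLASS_NAME, "MobileOverflow")))
df = pd.read_html(element.get_attribute("outerHTML"))[0]
print(df)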

Related

Extracting a table from a website after clicking a button fails

Below is my code to extract the table from a website after clicking a button, but it fails (see picture). I hope someone who knows can help me solve this problem. Thanks.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd
driver = webdriver.Chrome()
driver.get("https://www.klsescreener.com/v2/")
button = driver.find_element("xpath", '/html/body/div[1]/div[1]/div[3]/div/div[1]/div[1]/div/form/div[29]/div[2]/input')
button.click()
# Wait for the table to appear
wait = WebDriverWait(driver, 10)
table = wait.until(EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/div[1]/div[3]/div/div[2]/div[3]/table")))
# Extract the table data using BeautifulSoup
soup = BeautifulSoup(driver.page_source, "html.parser")
table = soup.find(class_="table-responsive")  # "class" is a Python keyword; BeautifulSoup uses "class_"
# Extract the header row and all other rows
header = [th.text.strip() for th in table.find("tr").find_all("th")]
data = [[td.text.strip() for td in tr.find_all("td")] for tr in table.find_all("tr")[1:]]
# Create a pandas dataframe from the header and data
df = pd.DataFrame(data, columns=header)
# Save the dataframe to an Excel file
df.to_excel("table.xlsx", index=False)
# Wait for 10 additional seconds
time.sleep(10)
# Close the browser window
driver.quit()
This is my first time posting a question here. I hope anyone who knows the problem and its solution can help me. Thanks all.
You do not need Selenium. Simply use the API (check the browser dev tools, XHR tab) to get the HTML and use pandas.read_html() to parse the table. You do need requests in this case, because the site checks for a user-agent in the headers.
Example
import pandas as pd
import requests

pd.read_html(
    requests.get(
        'https://www.klsescreener.com/v2/screener/quote_results',
        headers={'user-agent': 'some agent'}
    ).text
)[0]
Output:
      Name         Code   Category                                      Price  Change%  52week         Volume   EPS     DPS  NTA    PE     DY    ROE       PTBV   MCap.(M)  Indicators         Unnamed: 16
0     TOPBLDS [s]  5268   Construction, Main Market                     0.02   0%       0.010-0.050    0        21.3    0    0.001  0.09   0     21302.3   20     14.12     QoQ YoY RYoY       nan
1     PGB [s]      0091   Energy, Main Market                           0.2    -2.4%    0.080-1.000    1190     84.43   0    0.129  0.24   0     652.47    1.55   120.87    QoQ YoY            nan
2     EATECH [s]   5259   Transportation & Logistics, Main Market       0.34   1.5%     0.025-0.365    228712   19.49   0    0.03   1.74   0     649.7     11.33  180.37    YoY                nan
3     TECHNAX [s]  2739   Energy, Main Market                           0.03   20%      0.000-0.000    11037    11.07   0    0.044  0.27   0     251.5     0.68   66.44     QoQ YoY RQoQ RYoY  nan
4     CARLSBG      2836   Consumer Products & Services, Main Market     24.1   -0.4%    19.320-24.300  1802     107.39  63   0.49   22.44  2.61  219.16    49.18  7368.53   QoQ YoY RQoQ RYoY  nan
...
1050  GPP [s]      03029  Industrial Products & Services, Leap Market   0.295  0%       0.295-0.295    0        -9.84   0    0.02   -3     0     -492.14   14.75  45.76     RQoQ               nan
1051  DIGISTA      0029   Technology, Main Market                       0.095  0%       0.060-0.195    18082    -1.14   0    0.002  -8.34  0     -670.35   55.88  43.11     YoY RYoY RTopQ     nan
1052  MCOM [s]     03022  Technology, Leap Market                       0.2    0%       0.050-0.200    0        -7.31   0    0.01   -2.74  0     -745.44   20.41  37.71     nan                nan
1053  SAPNRG [s]   5218   Energy, Main Market                           0.045  -10%     0.030-0.100    1201122  -40.76  0    0.02   -0.11  0     -2038     2.25   719.06    QoQ RConQ          nan
1054  KANGER [s]   0170   Consumer Products & Services, Ace Market      0.04   -11.1%   0.030-0.300    278584   -18.29  0    0.005  -0.22  0     -3732.31  8.16   25.99     QoQ YoY            nan
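If you still want the Excel file the original code was producing, the same DataFrame can be written out directly (a small sketch; assumes an Excel writer such as openpyxl is installed):
import pandas as pd
import requests

df = pd.read_html(
    requests.get(
        'https://www.klsescreener.com/v2/screener/quote_results',
        headers={'user-agent': 'some agent'}
    ).text
)[0]
df.to_excel("table.xlsx", index=False)  # needs openpyxl or another Excel engine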

How to get a table and its elements with Python/Selenium

I'm trying to get all the prices in the table at this URL:
https://www.skyscanner.it/trasporti/voli/bud/rome/?adults=1&adultsv2=1&cabinclass=economy&children=0&childrenv2=&destinationentityid=27539793&inboundaltsenabled=true&infants=0&iym=2208&originentityid=27539604&outboundaltsenabled=true&oym=2208&preferdirects=false&ref=home&rtn=1&selectedoday=01&selectediday=01
The table's elements are the days with their related prices.
This is what I'm trying to do to get the table:
#Attempt 1
week = table.find_element(By.CLASS_NAME, "BpkCalendarGrid_bpk-calendar-grid__NzBmM month-view-grid--data-loaded")
#Attempt 2
table = driver.find_element(by=By.XPATH, value="XPath copied using Chrome inspector")
However I cannot get it.
What is the correct way to extract all the price from this table? Thanks!
You can grab the table data, meaning all the prices, using Selenium with a pandas DataFrame. There are two tables on the page that contain the price data.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
option = webdriver.ChromeOptions()
option.add_argument("start-maximized")
#chrome to stay open
option.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=option)
driver.get('https://www.skyscanner.it/trasporti/voli/bud/rome/?adults=1&adultsv2=1&cabinclass=economy&children=0&childrenv2=&destinationentityid=27539793&inboundaltsenabled=true&infants=0&iym=2208&originentityid=27539604&outboundaltsenabled=true&oym=2208&preferdirects=false&ref=home&rtn=1&selectedoday=01&selectediday=01')
table = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, '(//table)[1]'))).get_attribute("outerHTML")
table_2 = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, '(//table)[2]'))).get_attribute("outerHTML")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="acceptCookieButton"]'))).click()
df1 = pd.read_html(table)[0]
print(df1)
df2 = pd.read_html(table_2)[0]
print(df2)
Output:
lun mar mer gio ven sab dom
0 1€ 40 2€ 28 3€ 32 4€ 37 5€ 34 6€ 35 7€ 34
1 8€ 34 9€ 28 10€ 27 11€ 26 12€ 26 13€ 46 14€ 35
2 15€ 35 16€ 40 17€ 36 18€ 51 19€ 28 20€ 33 21€ 36
3 22€ 38 23€ 38 24€ 30 25€ 50 26€ 43 27€ 50 28€ 51
4 29€ 38 30€ 36 31€ 58 1- 2- 3- 4-
5 5- 6- 7- 8- 9- 10- 11-
lun mar mer gio ven sab dom
0 1€ 40 2€ 28 3€ 32 4€ 37 5€ 34 6€ 35 7€ 34
1 8€ 34 9€ 28 10€ 27 11€ 26 12€ 26 13€ 46 14€ 35
2 15€ 35 16€ 40 17€ 36 18€ 51 19€ 28 20€ 33 21€ 36
3 22€ 38 23€ 38 24€ 30 25€ 50 26€ 43 27€ 50 28€ 51
4 29€ 38 30€ 36 31€ 58 1- 2- 3- 4-
5 5- 6- 7- 8- 9- 10- 11-
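If you want plain numbers out of df1 rather than cells like "1€ 40", a small post-processing sketch (the helper and the regex are my own additions, not part of the page's markup):
import re

def cell_price(cell):
    # Pull the price that follows the euro sign; None where a cell has no price.
    m = re.search(r'€\s*(\d+)', str(cell))
    return int(m.group(1)) if m else None

prices = df1.applymap(cell_price)  # same calendar shape, prices only
print(prices)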
Alternative solution (Table 1). The same way, you can extract the prices from table two as well.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
option = webdriver.ChromeOptions()
option.add_argument("start-maximized")
#chrome to stay open
option.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=option)
driver.get('https://www.skyscanner.it/trasporti/voli/bud/rome/?adults=1&adultsv2=1&cabinclass=economy&children=0&childrenv2=&destinationentityid=27539793&inboundaltsenabled=true&infants=0&iym=2208&originentityid=27539604&outboundaltsenabled=true&oym=2208&preferdirects=false&ref=home&rtn=1&selectedoday=01&selectediday=01')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="acceptCookieButton"]'))).click()
table = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, '(//table)[1]/tbody/tr/td')))
for i in table:
    price = i.find_element(By.XPATH, './/div[@class="price"]').text.replace('€', '').strip()
    print(price)
Output:
39
30
32
37
34
35
34
34
28
27
26
26
46
35
35
40
36
52
29
34
37
39
39
30
50
44
50
52
38
36
58

DataFrame max() does not return the max

Real beginner question here, but it is so simple, I'm genuinely stumped. Python/DataFrame newbie.
I've loaded a DataFrame from a Google Sheet; however, any graphing or attempted calculations generate bogus results. Loading code:
# Setup
!pip install --upgrade -q gspread
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())
worksheet = gc.open('Linear Regression - Brain vs. Body Predictor').worksheet("Raw Data")
rows = worksheet.get_all_values()
# Convert to a DataFrame and render.
import pandas as pd
df = pd.DataFrame.from_records(rows)
This seems to work fine, and the data looks to be correctly loaded when I print out the DataFrame, but running max() returns obviously false results. For example:
print(df[0])
print(df[0].max())
Will output:
0 3.385
1 0.48
2 1.35
3 465
4 36.33
5 27.66
6 14.83
7 1.04
8 4.19
9 0.425
10 0.101
11 0.92
12 1
13 0.005
14 0.06
15 3.5
16 2
17 1.7
18 2547
19 0.023
20 187.1
21 521
22 0.785
23 10
24 3.3
25 0.2
26 1.41
27 529
28 207
29 85
...
32 6654
33 3.5
34 6.8
35 35
36 4.05
37 0.12
38 0.023
39 0.01
40 1.4
41 250
42 2.5
43 55.5
44 100
45 52.16
46 10.55
47 0.55
48 60
49 3.6
50 4.288
51 0.28
52 0.075
53 0.122
54 0.048
55 192
56 3
57 160
58 0.9
59 1.62
60 0.104
61 4.235
Name: 0, Length: 62, dtype: object
Max: 85
Obviously, the maximum value is way out -- it should be 6654, not 85.
What on earth am I doing wrong?
First StackOverflow post, so thanks in advance.
If you check, you'll see at the end of your print() that dtype=object. You'll also notice your pandas Series holds "int" values alongside "float" values (e.g. you have 6654 and 3.5 in the same Series).
These are good hints that you have a Series of strings, and that max() here is comparing them as strings (lexicographically). What you want is a Series of numbers (specifically floats), compared numerically.
Check the following reproducible example:
>>> df = pd.DataFrame({'col': ['0.02', '9', '85']}, dtype=object)
>>> df.col.max()
'9'
You can check that because
>>> '9' > '85'
True
You want these values to be considered floats instead. Use pd.to_numeric
>>> df['col'] = pd.to_numeric(df.col)
>>> df.col.max()
85
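Applied to the DataFrame from the question, a minimal sketch (errors='coerce' is my choice here; it turns any unparseable cell into NaN rather than raising):
>>> df[0] = pd.to_numeric(df[0], errors='coerce')  # non-numeric cells become NaN
>>> df[0].max()
6654.0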
For more on str and int comparison, check this question

Turn an HTML table into a CSV file

How do I turn a table like this--batting gamelogs table--into a CSV file using Python and BeautifulSoup?
I want the first header where it says Rk, Gcar, Gtm, etc. and not any of the other headers within the table (the ones for each month of the season).
Here is the code I have so far:
from bs4 import BeautifulSoup
from urllib2 import urlopen
import csv
def stir_the_soup():
    player_links = open('player_links.txt', 'r')
    player_ID_nums = open('player_ID_nums.txt', 'r')
    id_nums = [x.rstrip('\n') for x in player_ID_nums]
    idx = 0
    for url in player_links:
        print url
        soup = BeautifulSoup(urlopen(url), "lxml")
        p_type = ""
        if url[-12] == 'p':
            p_type = "pitching"
        elif url[-12] == 'b':
            p_type = "batting"
        table = soup.find(lambda tag: tag.name == 'table' and tag.has_attr('id') and tag['id'] == (p_type + "_gamelogs"))
        header = [[val.text.encode('utf8') for val in table.find_all('thead')]]
        rows = []
        for row in table.find_all('tr'):
            rows.append([val.text.encode('utf8') for val in row.find_all('th')])
            rows.append([val.text.encode('utf8') for val in row.find_all('td')])
        with open("%s.csv" % id_nums[idx], 'wb') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerows(row for row in rows if row)
        idx += 1
    player_links.close()

if __name__ == "__main__":
    stir_the_soup()
The id_nums list contains all of the id numbers for each player to use as the names for the separate CSV files.
For each row, the leftmost cell is a <th> tag and the rest of the row is <td> tags. In addition to the header, how do I put each of those into one row?
This code gets you the big table of stats, which is what I think you want.
Make sure you have lxml, beautifulsoup4, and pandas installed.
import pandas as pd

df = pd.read_html(r'https://www.baseball-reference.com/players/gl.fcgi?id=abreuto01&t=b&year=2010')
print(df[4])
Here is the output of the first 5 rows. You may need to clean it slightly, as I don't know what your exact end goal is:
df[4].head(5)
Rk Gcar Gtm Date Tm Unnamed: 5 Opp Rslt Inngs PA ... CS BA OBP SLG OPS BOP aLI WPA RE24 Pos
0 1 66 2 (1) Apr 6 ARI NaN SDP L,3-6 7-8 1 ... 0 1.000 1.000 1.000 2.000 9 .94 0.041 0.51 PH
1 2 67 3 Apr 7 ARI NaN SDP W,5-3 7-8 1 ... 0 .500 .500 .500 1.000 9 1.16 -0.062 -0.79 PH
2 3 68 4 Apr 9 ARI NaN PIT W,9-1 8-GF 1 ... 0 .667 .667 .667 1.333 2 .00 0.000 0.13 PH SS
3 4 69 5 Apr 10 ARI NaN PIT L,3-6 CG 4 ... 0 .500 .429 .500 .929 2 1.30 -0.040 -0.37 SS
4 5 70 7 (1) Apr 13 ARI # LAD L,5-9 6-6 1 ... 0 .429 .375 .429 .804 9 1.52 -0.034 -0.46 PH
To select certain columns within this DataFrame: df[4]['COLUMN_NAME_HERE'].head(5)
Example: df[4]['Gcar']
Also, if typing df[4] gets annoying, you can always assign it to another DataFrame: df2 = df[4]
import pandas as pd
from bs4 import BeautifulSoup
import urllib2

url = 'https://www.baseball-reference.com/players/gl.fcgi?id=abreuto01&t=b&year=2010'
html = urllib2.urlopen(url)
bs = BeautifulSoup(html, 'lxml')
table = str(bs.find('table', {'id': 'batting_gamelogs'}))
dfs = pd.read_html(table)
This uses Pandas, which is pretty useful for stuff like this. It also puts it in a pretty reasonable format to do other operations on.
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_html.html
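Since the original goal was a CSV file, the DataFrame from either snippet can then be written straight out; the filename here is just an example:
df[4].to_csv('abreuto01_2010_batting.csv', index=False)  # or dfs[0].to_csv(...) for the second snippet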

Python: Reading and writing a file with a complex, repeating format

To begin with, sorry for my poor English.
I have a file with a repeating format, such as:
326 Iteration: 0 #Bonds: 10
1 6 7 14 54 70 77 0 0 0 0 0 1 0.693 0.632 0.847 0.750 0.644 0.000 0.000 0.000 0.000 0.000 3.566 0.000 0.028
2 6 3 6 15 55 0 0 0 0 0 0 1 0.925 0.920 0.909 0.892 0.000 0.000 0.000 0.000 0.000 0.000 3.645 0.000 -0.040
3 6 2 8 10 52 0 0 0 0 0 0 1 0.925 0.910 0.920 0.898 0.000 0.000 0.000 0.000 0.000 0.000 3.653 0.000 0.000
...
324 8 323 0 0 0 0 0 0 0 0 0 100 0.871 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.871 3.000 -0.493
325 2 326 0 0 0 0 0 0 0 0 0 101 0.930 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.930 0.000 0.334
326 8 325 0 0 0 0 0 0 0 0 0 101 0.930 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.930 3.000 -0.611
637.916060425841 306.094529423257 1250.10511927236
6.782126993565285E-006
326 (repeating from here) Iteration: 100 #Bonds: 10
1 6 7 14 54 64 70 77 0 0 0 0 1 0.885 0.580 0.819 0.335 0.784 0.709 0.000 0.000 0.000 0.000 4.111 0.000 0.025
2 6 3 6 15 55 0 0 0 0 0 0 1 0.812 0.992 0.869 0.966 0.000 0.000 0.000 0.000 0.000 0.000 3.639 0.000 -0.034
3 6 2 8 10 52 0 0 0 0 0 0 1 0.812 0.966 0.989 0.926 0.000 0.000 0.000 0.000 0.000 0.000 3.692 0.000 0.004
As you can see, the first line is a header, lines 2-327 are the data I want to analyze, and lines 328-329 hold some numbers I don't need. The next "frame" starts at line 330 with exactly the same format, and this pattern repeats more than 200,000 times.
From each frame I want to use columns 1-13 of the data lines (lines 2-327), plus the first number of the header.
Specifically, for columns 3-12 of each data line in every frame, I want to print the count of zeros and the count of non-zeros, along with columns 1, 2, and 13. So the expected output file looks like:
326
1
1 6 5 5 1
2 6 4 6 1
...
325 2 1 9 101
326 8 1 9 101
326 (Next frame starts from here)
2
1 6 5 5 1
2 6 4 6 1
...
326
3
1 6 5 5 1
2 6 4 6 1
...
First line: the first number of the input's first line.
Second line: the frame number.
Lines 3-328: column 1 of the input, column 2 of the input, the number of non-zeros among columns 3-12, the number of zeros among columns 3-12, and column 13 of the input.
From line 4 on: the same per-line format, repeating.
So the result file has 2 header lines plus 326 lines of analyzed data, 328 lines in total per frame, and the same format repeats for the next frame. Keeping that fixed-width layout (5 characters per field) is recommended so the file can be used for other purposes.
My approach is to create 13 arrays for the 13 columns, then store the data using nested for loops over the frames and over the 326 lines of each frame. But I have no idea how to handle the output.
Below is my trial code (unfinished, it only reads the input), and it has a lot of problems: linecache reads the whole line, not just the first number of each frame's first line, and although every frame has 326 + 3 = 329 lines, my frame-wise indexing does not seem to work properly. I welcome any help with analyzing this data. Thank you very much in advance.
# Read the file
filename = raw_input("Enter the file name \n")
file = open(filename, 'r')

# Read the number of atoms from the header
import linecache
nnn = linecache.getline(filename, 1)
natoms = int(nnn.split()[0])  # take only the first number of the header line
singleframe = natoms + 3

# Get the number of frames
nlines = 0
for i1 in file:
    nlines = nlines + 1
file.close()
nframes = nlines / singleframe
print 'no of lines are: ', nlines
print 'no of frames are: ', nframes
print 'no of atoms are:', natoms

# Create a 1d string array
nrange = range(nlines)
data_lines = [None] * nlines

# Store the whole input file in the string array
file = open(filename, 'r')
for i1 in nrange:
    data_lines[i1] = file.readline()
file.close()

# Create 1d arrays to store the atomic data
at_index = [None] * natoms
at_type = [None] * natoms
n1 = [None] * natoms
n2 = [None] * natoms
n3 = [None] * natoms
n4 = [None] * natoms
n5 = [None] * natoms
n6 = [None] * natoms
n7 = [None] * natoms
n8 = [None] * natoms
n9 = [None] * natoms
n10 = [None] * natoms
molnr = [None] * natoms
nrange1 = range(natoms)
nframe = range(nframes)
file = open('output_force', 'w')
print data_lines[9]
for j1 in nframe:
    start = j1 * (natoms + 3) + 1  # skip the single header line of each frame
    for i1 in nrange1:
        line = data_lines[i1 + start].split()  # split each line on whitespace
        at_index[i1] = int(line[0])
        at_type[i1] = int(line[1])
        n1[i1] = int(line[2])
        n2[i1] = int(line[3])
        n3[i1] = int(line[4])
        n4[i1] = int(line[5])
        n5[i1] = int(line[6])
        n6[i1] = int(line[7])
        n7[i1] = int(line[8])
        n8[i1] = int(line[9])
        n9[i1] = int(line[10])
        n10[i1] = int(line[11])
        molnr[i1] = int(line[12])
When working with CSV-like files, you should look into the csv module. I wrote some code that should do the trick.
This code assumes "good data". If your data set may contain errors (such as fewer than 13 columns, or fewer than 326 data rows), some alterations will be needed.
(Changed to comply with Python 2.6.6.)
import csv

with open('mydata.csv') as in_file:
    with open('outfile.csv', 'wb') as out_file:
        csv_reader = csv.reader(in_file, delimiter=' ', skipinitialspace=True)
        csv_writer = csv.writer(out_file, delimiter='\t')
        # Iterate over all rows in the file
        for i, header in enumerate(csv_reader):
            # Get the header data
            num = header[0]
            csv_writer.writerow([num])
            # Write frame number, starting with 1 (hence the +1 part)
            csv_writer.writerow([i + 1])
            # Iterate over all data rows
            for _ in xrange(326):
                # Call next(csv_reader) to get the next row.
                # Put inside a try ... except to avoid a StopIteration exception
                # if the end of file is found before reaching 326 lines
                try:
                    row = next(csv_reader)
                except StopIteration:
                    break
                # Use a list comprehension to count the zeros in columns 3-12
                zeros = sum([1 for x in row[2:12] if x.strip() == '0'])
                not_zeros = 10 - zeros
                # Write the data to the output file
                out = [row[0].strip(), row[1].strip(), not_zeros, zeros, row[12].strip()]
                csv_writer.writerow(out)
            # If the inner loop finished all 326 rows without hitting
            # end-of-file, skip the two trailer lines of the frame
            else:
                next(csv_reader)
                next(csv_reader)
For the first three lines, this yields:
326
1
1 6 5 5 1
2 6 4 6 1
3 6 4 6 1
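If the fixed-width layout the question asks for (five characters per field) matters downstream, the final csv_writer.writerow(out) call can be swapped for a plain formatted write, for example:
# Sketch: emit one analyzed row as fixed-width fields instead of tab-separated.
out_file.write(''.join('%5s' % x for x in out) + '\n')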
