HTML file writing from a for loop's output - Python
I have written a Python web scraper like this:
from selenium import webdriver
from BeautifulSoup import BeautifulSoup
wd = webdriver.Firefox()
wd.get('http://www.nseindia.com/live_market/dynaContent/live_analysis/top_gainers_losers.htm?cat=G&utm_campaign=website&utm_source=sendgrid.com&utm_medium=email')
html_page = wd.page_source
wd.quit()
soup = BeautifulSoup(html_page)
table = soup.find("table", attrs = {"id":"topGainers"})
print "success"
#print table
for row in table.findAll('tr')[1:]:
    cols = row.findAll('td')
    #print cols
    #break
    some = [cols[0], cols[5], cols[6], cols[9]]
    #print some
    #break
    for td in some:
        if td.find(text = True):
            text = ''.join(td.find(text = True))
            print text + "|"
        else:
            continue
Now I want my output (text) to be written to an HTML file in table format. How could I do that?
# Okay, first, if you want the table to have headers above each column, you should save the heading names in a list like so:
listofheaders = ['header1', 'header2', 'header3']

# For each row in the table, save the data included in the row in a list of lists, something like this:
listofrows = [['a', 'b', 'c'], ['a', 'b', 'c'], ['a', 'b', 'c']]

# Now create a string with the following:
htmlstuff = '<!DOCTYPE html>\n<html>\n<head>\n<style>\ntable,th,td\n{\nborder:1px solid black;\nborder-collapse:collapse;\n}\nth,td\n{\npadding:5px;\n}\n</style>\n</head>\n\n<body>\n<table style="width:300px">\n<tr>\n '

# Now add the COLUMN HEADERS to the string:
for header in listofheaders:
    htmlstuff = htmlstuff + '<th>' + str(header) + '</th>\n'
htmlstuff += '</tr>\n'  # close the header row

# Then you can populate the table row by row:
for row in listofrows:
    htmlstuff += ' <tr>\n'
    for item in row:
        htmlstuff = htmlstuff + '  <td>' + str(item) + '</td>\n'
    htmlstuff += ' </tr>\n'

# Finish off the HTML:
htmlstuff += '</table>\n</body>\n\n</html>'

# Now create the HTML page and write the data:
f = open('webpage.html', 'w')
f.write(htmlstuff)
f.close()
You can even use webbrowser to automatically open the page for you.
import webbrowser
webbrowser.open('webpage.html')
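To tie this back to the scraper in the question: the cell texts collected in the loop can be gathered into listofrows before the HTML is built. A minimal sketch of that glue code; the header labels are placeholders, not taken from the page, so adjust them to whatever the four selected columns actually represent:

listofheaders = ['Symbol', 'LTP', '% Change', 'Volume']  # placeholder labels
listofrows = []
for row in table.findAll('tr')[1:]:
    cols = row.findAll('td')
    some = [cols[0], cols[5], cols[6], cols[9]]
    # keep only the visible text of each selected cell
    listofrows.append([''.join(td.findAll(text=True)).strip() for td in some])
# listofheaders and listofrows can now be fed into the HTML-building code above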
Related
How can I export a table from a webpage to csv? [duplicate]
I want to convert a HTML table as obtained from the script below into a CSV file, but got type error as follows: TypeError: sequence item 0: expected string, Tag found from bs4 import BeautifulSoup import urllib2 url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1' html = urllib2.urlopen(url).read() soup = BeautifulSoup(html) table = soup.find_all('table', class_='data2_s') rows = table[0].find_all('tr') How is the easiest way to convert it into a CSV file? I tried as: fo = open('fo.txt','w') for r in rows: fo.write(str(r.txt) + '\n') fo.close() but it wrote 'none' The HTML is like this: <table class="data2_s"><caption class="m">WAKKANAI   WMO Station ID:47401 Lat 45<sup>o</sup>24.9'N  Lon 141<sup>o</sup>40.7'E</caption><tr><th scope="col">Year</th><th scope="col">Jan</th><th scope="col">Feb</th><th scope="col">Mar</th><th scope="col">Apr</th><th scope="col">May</th><th scope="col">Jun</th><th scope="col">Jul</th><th scope="col">Aug</th><th scope="col">Sep</th><th scope="col">Oct</th><th scope="col">Nov</th><th scope="col">Dec</th><th scope="col">Annual</th></tr><tr class="mtx" style="text-align:right;"><td style="text-align:center">1938</td><td class="data_0_0_0_0">-5.2</td><td class="data_0_0_0_0">-4.9</td><td class="data_0_0_0_0">-0.6</td><td class="data_0_0_0_0">4.7</td><td class="data_0_0_0_0">9.5</td><td class="data_0_0_0_0">11.6</td><td class="data_0_0_0_0">17.9</td><td class="data_0_0_0_0">22.2</td><td class="data_0_0_0_0">16.5</td><td class="data_0_0_0_0">10.7</td><td class="data_0_0_0_0">3.3</td><td class="data_0_0_0_0">-4.7</td><td class="data_0_0_0_0">6.8</td></tr> <tr class="mtx" style="text-align:right;"><td style="text-align:center">1939</td><td class="data_0_0_0_0">-7.5</td><td class="data_0_0_0_0">-6.6</td><td class="data_0_0_0_0">-1.4</td><td class="data_0_0_0_0">4.0</td><td class="data_0_0_0_0">7.5</td><td class="data_0_0_0_0">13.0</td><td class="data_0_0_0_0">17.4</td><td class="data_0_0_0_0">20.0</td><td class="data_0_0_0_0">17.4</td><td class="data_0_0_0_0">9.7</td><td class="data_0_0_0_0">3.0</td><td class="data_0_0_0_0">-2.5</td><td class="data_0_0_0_0">6.2</td></tr>
This is a job for the csv lib, getting each td inside each row and extracting the text, it will handle where there are missing values in each row: from bs4 import BeautifulSoup import urllib2 import csv url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1' html = urllib2.urlopen(url).read() soup = BeautifulSoup(html) table = soup.select_one("table.data2_s") # python3 just use th.text headers = [th.text.encode("utf-8") for th in table.select("tr th")] with open("out.csv", "w") as f: wr = csv.writer(f) wr.writerow(headers) wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")] for row in table.select("tr + tr")]) Which matches the table exactly as you see on the page: :~$ cat out.csv Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual 1938,-5.2,-4.9,-0.6,4.7,9.5,11.6,17.9,22.2,16.5,10.7,3.3,-4.7,6.8 1939,-7.5,-6.6,-1.4,4.0,7.5,13.0,17.4,20.0,17.4,9.7,3.0,-2.5,6.2 1940,-6.0,-5.7,-0.5,3.5,8.5,11.0,16.6,19.7,15.6,10.4,3.7,-1.0,6.3 1941,-6.5,-5.8,-2.6,3.6,8.1,11.4,12.7,16.5,16.0,10.0,4.0,-2.9,5.4 1942,-7.8,-8.2,-0.8,3.5,7.1,12.0,17.4,18.4,15.7,10.5,2.5,-2.9,5.6 1943,-4.1,-6.1,-1.1,3.5,6.9,12.9,19.3,21.5,17.5,11.7,1.2,-3.6,6.6 1944,-7.7,-7.9,-2.2,1.7,8.9,13.7,19.0,21.3,16.6,10.8,1.3,-6.0,5.8 1945,-7.8,-6.9,-1.8,3.9,5.5,11.0,13.6,18.7,16.8,11.0,3.9,-4.8,5.3 1946,-6.5,-6.0,-3.3,4.5,7.6,14.9,18.2,22.2,16.9,11.5,4.4,-2.5,6.8 1947,-4.9,-5.5,-2.3,3.7,9.0,11.2,17.1,19.3,15.1,10.6,2.4,-4.6,5.9 1948,-2.7,-4.4,-0.2,6.0,10.7,12.2,16.2,22.0,16.9,11.1,4.2,-0.6,7.6 1949,-2.6,-2.8,-3.4,2.0,9.4,11.8,16.9,20.8,17.8,10.8,3.1,-3.8,6.7 1950,-5.7,-4.8,-1.3,4.0,9.2,14.6,19.3,22.6,16.8,9.0,3.0,-2.9,7.0 1951,-6.7,-6.5,-2.2,3.7,9.5,12.3,16.7,22.3,15.6,10.1,3.7,-0.3,6.5 1952,-5.7,-7.1,-2.4,3.8,8.3,13.1,16.4,19.7,17.0,11.3,0.9,-7.1,5.7 1953,-7.7,-7.3,-0.9,3.6,6.9,11.1,16.8,19.2,17.6,11.2,-0.6,-2.6,5.6 1954,-6.7,-4.1,-2.5,4.0,7.5,11.0,13.7,17.0,17.2,9.5,3.2,-1.8,5.7 1955,-6.4,-4.8,-1.3,4.7,7.0,12.7,20.3,19.5,15.5,10.6,3.6,-0.4,6.8 1956,-6.1,-4.6,-2.0,5.1,10.8,11.2,13.8,16.3,17.2,12.3,2.8,-2.6,6.2 1957,-3.9,-5.5,-2.9,4.4,9.3,10.9,17.1,18.2,15.5,11.1,5.4,-1.1,6.5 1958,-4.9,-4.9,-2.3,4.4,8.5,12.6,17.5,18.3,16.8,10.6,4.5,-0.5,6.7 1959,-7.3,-2.8,0.8,6.4,9.4,12.7,17.1,18.5,16.2,11.6,2.9,-3.9,6.8 1960,-7.2,-5.2,-1.4,3.5,7.7,10.8,15.9,20.8,18.1,9.7,3.3,-3.9,6.0 1961,-7.7,-5.3,-1.4,5.5,8.7,14.7,19.5,20.0,18.9,10.4,4.1,-1.3,7.2 1962,-4.2,-5.4,-2.5,6.7,10.0,12.9,16.8,17.7,16.6,9.9,2.6,-1.5,6.6 1963,-3.6,-3.7,0.1,5.0,10.4,12.4,16.8,17.1,15.6,10.7,4.3,-1.7,7.0 1964,-4.5,-7.7,-1.3,3.7,9.9,11.9,15.3,17.7,14.9,10.0,3.6,-1.9,6.0 1965,-4.1,-5.7,-2.8,3.2,9.1,13.3,15.2,18.8,15.8,11.4,2.1,-2.6,6.1 1966,-5.0,-5.5,-1.0,3.2,8.1,12.2,15.3,17.5,15.4,11.6,4.1,-4.4,6.0 1967,-6.8,-5.9,-0.7,4.5,10.0,11.4,16.4,20.5,15.5,11.0,1.8,-1.5,6.4 1968,-4.2,-4.7,1.9,5.7,8.9,14.5,17.3,18.1,15.9,9.1,5.3,-0.7,7.3 1969,-7.3,-7.5,-2.5,3.9,7.2,10.6,17.0,16.5,16.1,9.4,2.2,-5.4,5.0 1970,-6.6,-6.0,-4.2,4.6,10.4,12.9,17.4,19.2,16.8,10.5,4.3,-3.3,6.3 1971,-6.3,-6.4,-1.7,4.1,7.6,11.6,15.8,17.2,15.2,11.5,3.4,-2.2,5.8 1972,-5.3,-5.0,-0.6,5.9,9.4,12.8,16.8,20.4,15.7,10.9,1.9,-1.4,6.8 1973,-4.2,-5.3,-2.9,4.2,8.4,12.8,17.0,20.9,17.1,10.4,3.5,-1.9,6.7 1974,-2.6,-4.6,-2.1,4.0,8.4,11.8,16.8,18.8,16.5,10.1,1.9,-5.7,6.1 1975,-4.1,-6.1,-1.5,4.3,8.4,13.7,16.1,20.6,17.3,10.4,3.8,-3.8,6.6 1976,-4.6,-3.5,-1.4,4.0,8.9,11.9,17.5,17.6,15.7,10.2,1.3,-2.0,6.3 1977,-8.3,-7.1,-1.0,3.6,8.0,11.9,18.2,19.1,17.4,11.4,4.5,-1.8,6.3 1978,-6.7,-9.2,-1.6,4.3,9.2,13.5,20.6,21.3,17.4,9.6,3.4,-2.1,6.6 
1979,-6.9,-4.5,-2.5,2.7,7.8,13.2,15.8,20.3,16.9,11.3,2.9,-0.1,6.4 1980,-5.4,-7.1,-1.9,1.9,7.8,12.9,15.9,16.5,16.0,10.0,4.3,-0.6,5.9 1981,-5.4,-6.3,-2.6,5.6,8.1,11.8,17.1,18.7,16.0,10.5,0.8,-0.6,6.1 1982,-5.6,-5.3,-0.6,3.7,9.0,11.9,16.9,21.0,17.5,11.4,4.3,-1.0,6.9 1983,-4.2,-7.6,-1.9,6.8,8.2,8.5,14.5,18.9,15.8,8.9,4.8,-2.1,5.9 1984,-4.9,-6.6,-3.3,2.9,7.9,15.5,19.5,20.5,16.6,9.2,2.3,-3.6,6.3 1985,-8.7,-4.8,-1.4,4.9,8.6,11.7,16.6,21.1,15.7,10.3,2.7,-4.2,6.0 1986,-7.2,-6.5,-2.4,4.6,8.4,11.2,14.4,19.6,16.8,9.1,2.1,-1.9,5.7 1987,-6.4,-5.6,-1.4,4.2,8.6,12.6,17.5,18.0,16.4,11.1,2.0,-3.1,6.2 1988,-4.8,-6.3,-1.8,4.1,8.0,12.6,14.1,20.4,16.1,10.4,2.0,-1.5,6.1 1989,-2.6,-2.4,0.8,4.0,8.2,10.7,18.4,20.4,16.8,10.8,4.8,-1.3,7.4 1990,-5.7,-2.4,1.4,5.7,9.3,13.4,18.9,20.3,17.1,13.3,6.2,1.2,8.2 1991,-1.6,-3.6,-1.5,4.8,10.1,14.3,16.2,19.0,16.6,11.8,3.5,-2.3,7.3 1992,-3.6,-3.6,-0.4,3.7,8.1,12.1,17.6,18.0,14.9,11.1,3.2,-1.2,6.7 1993,-2.7,-3.3,-0.2,3.1,8.6,10.7,15.6,17.6,16.3,11.1,3.7,-1.6,6.6 1994,-6.1,-2.7,-1.3,4.4,10.0,12.8,17.4,21.7,17.5,11.8,4.3,-2.9,7.2 1995,-4.0,-4.0,-0.8,4.8,11.0,12.7,18.4,19.3,16.3,12.3,5.2,-0.6,7.6 1996,-4.6,-4.5,-1.0,3.5,6.9,12.0,15.9,18.7,16.8,10.4,2.3,-2.4,6.2 1997,-3.0,-3.3,-1.5,4.3,7.3,11.7,17.4,17.2,16.1,10.3,6.4,-0.7,6.9 1998,-6.9,-5.1,0.3,5.3,10.1,12.9,15.5,18.1,17.2,12.5,2.0,-2.4,6.6 1999,-4.1,-5.6,-2.6,4.2,8.4,14.5,16.6,21.0,18.3,11.2,3.8,-1.9,7.0 2000,-4.2,-5.6,-2.1,3.5,9.3,12.8,18.9,21.5,17.7,10.6,1.5,-4.1,6.7 2001,-6.3,-7.7,-2.4,4.7,8.5,13.0,17.4,18.7,15.6,10.8,4.0,-4.2,6.0 2002,-3.6,-1.0,0.5,6.8,11.1,12.1,15.7,17.1,17.0,10.8,2.3,-4.4,7.0 2003,-4.7,-5.6,-0.7,5.3,10.1,13.9,14.3,18.4,16.6,11.3,4.5,-1.4,6.8 2004,-3.9,-3.0,-0.5,4.4,10.6,14.6,16.8,19.7,17.8,11.8,5.9,-2.0,7.7 2005,-4.6,-5.7,-1.0,3.9,7.0,14.3,16.7,21.0,17.9,12.6,4.9,-2.3,7.1 2006,-5.5,-4.7,-0.9,2.1,9.3,11.9,18.4,21.6,17.7,11.0,4.5,-1.8,7.0 2007,-3.7,-3.2,-0.7,3.5,7.6,14.3,16.7,20.4,17.0,10.9,3.0,-1.7,7.0 2008,-6.0,-4.8,0.6,6.0,8.3,11.9,17.9,18.8,17.9,11.5,3.8,-0.4,7.1 2009,-2.4,-4.4,0.0,4.5,10.0,12.3,14.8,18.6,16.9,11.4,3.1,-2.2,6.9 2010,-3.4,-4.9,-1.4,3.5,7.3,15.0,18.1,22.4,18.4,11.4,4.8,-1.1,7.5 2011,-5.1,-2.2,-0.6,4.4,6.5,12.8,17.5 ),21.5,18.3,12.1,4.9,-2.3,7.3 2012,-5.4,-6.4,-2.4,4.6,8.9,12.6,17.2,20.4,19.4,11.8,3.8,-3.0,6.8 2013,-5.8,-5.1,-1.3,4.5,7.2,14.0,18.9,20.2,17.6,11.8,5.5,-0.2,7.3 2014,-5.3,-4.2,-1.2,3.9,8.7,13.9,19.2,20.0,16.7,11.0,4.8,-2.3,7.1 2015,-2.9,-1.7,2.3,5.9,9.9,12.1,17.6,19.0,17.3,10.4,3.7,-0.2,7.8 2016,-5.2,-4.7,0.5,4.3,11.4,12.5,17.4,21.8 ], , , , ,5.2 ] if you want the caption use table.select_one("caption.m").text: with open("out.csv", "w") as f: wr = csv.writer(f) wr.writerow([table.select_one("caption.m").text.encode("utf-8")]) wr.writerow(headers) wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")] for row in table.select("tr + tr")]) but it might be an idea to use that as the name of the file as opposed to adding it to the csv. 
If you really want to do it without the csv module, use the same logic with str.join:

table = soup.select_one("table.data2_s")
headers = [th.text.encode("utf-8") for th in table.select("tr th")]

with open("out.csv", "w") as f:
    f.write(",".join(headers) + "\n")
    f.writelines(",".join([td.text.encode("utf-8") for td in row.find_all("td")]) + "\n"
                 for row in table.select("tr + tr"))

If you want to replace the empty cells with N/A:

with open("out.csv", "w") as f:
    f.write(",".join(headers) + "\n")
    f.writelines(",".join([td.text.encode("utf-8").strip('\xe3\x80\x80') or "N/A"
                           for td in row.find_all("td")]) + "\n"
                 for row in table.select("tr + tr"))

Which will change the last row to:

2016,-5.2,-4.7,0.5,4.3,11.4,12.5,17.4,21.8 ],N/A,N/A,N/A,N/A,5.2 ]

The spaces for missing values are Unicode ideographic space characters (u"\u3000" in Python), which become '\xe3\x80\x80' when encoded to UTF-8. We strip that, and if that leaves an empty string we just use "N/A":

In [7]: print u"\u3000"

In [8]: u"\u3000".encode("utf-8")
Out[8]: '\xe3\x80\x80'

In [9]: u"\u3000".encode("utf-8").strip('\xe3\x80\x80')
Out[9]: ''
Use the csv module from Python to do this. You can obviously write more columns if you want, but the idea is that you're writing a list to the CSV file. There are also other options you can pass to csv.writer() if you'd like to quote things, escape things, etc.

import csv

with open('your_csv_name.csv', 'w') as o:
    w = csv.writer(o)
    # Headers
    w.writerow(['tr_content'])
    # Write the tr text
    for r in rows:
        w.writerow([r])
Here is another way, without using the csv module:

fp = open('data.csv', 'w')
for row in rows[:-1]:  # skip the last row, as its empty cells cause an error (this can also be resolved if needed)
    fp.write(row.get_text(',') + '\n')
fp.close()

You can open data.csv directly. The station details can be obtained with:

>>> table = soup.find_all('table', class_='data2_s')
>>> print table[0].find_all('caption')[0].get_text().encode('ascii', 'ignore')
WAKKANAI WMO Station ID:47401 Lat 45o24.9'N Lon 141o40.7'E

Hope this helps.
import csv
from bs4 import BeautifulSoup
import pandas as pd

html = open('test.html').read()
soup = BeautifulSoup(html, features='lxml')

# Specify the table which you want to read.
# Example: <table class="queryResults" border="0" cellspacing="1">
table = soup.select_one('table.queryResults')

def get_all_tables(soup):
    return soup.find_all("table")

tbls = get_all_tables(soup)
for i, tablen in enumerate(tbls, start=1):
    print(i)
    print(tablen)

def get_table_headers(table):
    headers = []
    for th in table.find("tr").find_all("th"):
        headers.append(th.text.strip())
    return headers

head = get_table_headers(table)
#print(head)

def get_table_rows(table):
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = []
        # grab all td tags in this table row
        tds = tr.find_all("td")
        if len(tds) == 0:
            # if no td tags, search for th tags
            # (can be found especially in Wikipedia tables below the table)
            ths = tr.find_all("th")
            for th in ths:
                cells.append(th.text.strip())
        else:
            # use regular td tags
            for td in tds:
                cells.append(td.text.strip())
        rows.append(cells)
    return rows

table_rows = get_table_rows(table)
#print(table_rows)

def save_as_csv(table_name, headers, rows):
    pd.DataFrame(rows, columns=headers).to_csv(f"{table_name}.csv")

save_as_csv("Test_table", head, table_rows)
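If pandas is already available (as in the last snippet), the whole table-to-CSV conversion can often be collapsed into pandas.read_html. A rough sketch, assuming lxml is installed and that the class name narrows the match down to the one table of interest:

import requests
import pandas as pd

url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1'
html = requests.get(url).text
# read_html returns one DataFrame per matching <table>; attrs filters by attribute
df = pd.read_html(html, attrs={'class': 'data2_s'})[0]
df.to_csv('out.csv', index=False)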
Unable to write data into Excel file (multiple tabs) using Python
I am not very familiar with writing data in Excel format using Python and need some help writing my output into a single .xlsx (Excel) file with multiple tabs. My code is given here:

import time
import requests
import random
from lxml import html
from bs4 import BeautifulSoup
import xlsxwriter

def write_to_file(file, mode, data, newline=None, with_tab=None):
    with open(file, mode, encoding='utf-8') as l:
        if with_tab == True:
            data = '\t'.join(data)
        if newline == True:
            data = data + '\n'
        l.write(data)

# This builds the URLs, one per value of var.
link = ["http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=&registryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber="+str(var)+"&nextList=Next%C2%A0%3E&selectedPeriods=" for var in range(17500)]

start = 1
end = 20
for pagenum, links in enumerate(link[start:end]):
    print(links)
    r = requests.get(links)
    time.sleep(random.randint(2, 5))
    soup = BeautifulSoup(r.content, "lxml")

    # Table 2
    for items in soup.find(id="tblAccountContactInfo").find_all("tr")[:]:
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        print(dataset)
        write_to_file('Table3.tsv', 'a', dataset, with_tab=True, newline=True)
        write_to_file('Table3.tsv', 'a', links)

    # Table 3
    for items in soup.find(id="tblChildDetails").find("table").find_all("tr"):
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        print(dataset)
        write_to_file('Table3.tsv', 'a', dataset, with_tab=True, newline=True)
        write_to_file('Table3.tsv', 'a', links)

    #workbook = xlsxwriter.Workbook('Table3.xlsx')
    #worksheet = workbook.add_worksheet("Table 3")
    #worksheet.write(dataset)
    #workbook.close()

I need the output as an .xlsx Excel workbook with multiple tabs, like a Table 1 tab and a Table 2 tab; currently I am fetching the data in .tsv format. I have tried xlsxwriter but was unable to get results, so I commented those lines out. Please help.
You need to first create two worksheets and keep track of the current row to be used for each worksheet. An append_row() function can then add one row of data to the required sheet.

import time
import requests
import random
from lxml import html
from bs4 import BeautifulSoup
import xlsxwriter

def append_row(ws, row):
    for col, value in enumerate(row):
        ws.write_string(ws.cur_row, col, value)
    ws.cur_row += 1

workbook = xlsxwriter.Workbook('output.xlsx')
ws_2 = workbook.add_worksheet("Table 2")
ws_3 = workbook.add_worksheet("Table 3")

# Keep track of the row to use in each worksheet
ws_2.cur_row = 0
ws_3.cur_row = 0

start = 1
end = 3

link = "http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=&registryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber={}&nextList=Next%C2%A0%3E&selectedPeriods="

for page_number in range(start, end):
    print("Page {}".format(page_number))
    url = link.format(page_number)
    r = requests.get(url)
    time.sleep(random.randint(2, 5))
    soup = BeautifulSoup(r.content, "lxml")

    # Table 2
    for items in soup.find(id="tblAccountContactInfo").find_all("tr")[:]:
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        append_row(ws_2, [url] + dataset)

    # Table 3
    for items in soup.find(id="tblChildDetails").find("table").find_all("tr"):
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        append_row(ws_3, [url] + dataset)

workbook.close()
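If you are not tied to xlsxwriter, note that openpyxl's Worksheet.append() writes to the next free row automatically, so the manual cur_row bookkeeping disappears. A small sketch of the same multi-tab idea; the row contents here are placeholders, not data from the site:

from openpyxl import Workbook

wb = Workbook()
ws_2 = wb.create_sheet("Table 2")
ws_3 = wb.create_sheet("Table 3")
# append() always writes to the row after the last one used on that sheet
ws_2.append(['url goes here', 'cell 1', 'cell 2'])
ws_3.append(['url goes here', 'cell 3', 'cell 4'])
wb.save('output.xlsx')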
Data is being written three times instead of once using requests and openpyxl?
Here is the code I am using:

from Sid.drivercommand import *
from Stocks.lot_size import symbol_list
from bs4 import BeautifulSoup
import datetime
from openpyxl import Workbook

p1_url = 'https://www.nseindia.com/marketinfo/companyTracker/mtOptionKeys.jsp?companySymbol='
p3_url = '&indexSymbol=NIFTY&series=EQ&instrument=OPTSTK&date=-'
symbol_list = ['ACC', 'SBIN', 'PNB']

loc = r"C:\Users\OneDrive\Stock Study"
loc_opt_data = r"C:\Users\OneDrive\Stock Study\Opt Data"
os.chdir(loc_opt_data)
date = 'Test_Data_' + str(datetime.date.today()) + ".xlsx"
datewise_data = Workbook(date)
os.chdir(loc)

for symbol in symbol_list[0:1]:
    url = p1_url + symbol + p3_url
    raw_page = requests.get(url).text
    soup = BeautifulSoup(raw_page, "lxml")
    if len(raw_page) > 0:
        datewise_data_sheet = datewise_data.create_sheet(symbol)
        for table in soup.findAll('table'):
            '# Fields: ' + ','.join([tr.text for tr in table.findAll('th')])
            for row in table.findAll('tr'):
                # ws1.append(([tr.text for tr in row.findAll('td')]))
                datewise_data_sheet.append(([tr.text for tr in row.findAll('td')]))
                # test.append(([tr.text for tr in row.findAll('td')]))
    #print(symbol)
    raw_page = 0

datewise_data.save(date)

The file generated from this has the data, but each symbol has three sets of data while I only want the first set/table.
You have tables within tables on the web page, so your code, as it is, finds the tr elements three times: they are in the first table, the same tr elements are in the second table, and row.findAll('td') finds them a third time because the first table's td contains the second table. You just need to find the data once, so use:

table = soup.findAll('table')[1]
for row in table.findAll('tr'):

Instead of:

for table in soup.findAll('table'):
    for row in table.findAll('tr'):

This will go straight to the table within the table and give you one set of results.
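If the page layout ever changes, hard-coding index 1 may silently pick the wrong table, so it can be worth a quick check first. A purely illustrative sketch that prints each table's position and row count so you can confirm which index holds the data you want:

for i, t in enumerate(soup.findAll('table')):
    # a nested table shows up here as its own entry
    print(i, len(t.findAll('tr')))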
How do I web scrape the sub-headers from this link?
I've made a web scraper that scrapes data from pages that look like this (it scrapes the tables): https://www.techpowerup.com/gpudb/2/

The problem is that my program, for some reason, is only scraping the values and not the subheaders. For instance (click on the link), it only scrapes the "R420", "130nm", "160 million", etc. but not the "GPU Name", "Process Size", "Transistors" etc. What do I add to the code to get it to scrape the subheaders? Here's my code:

import csv
import requests
import bs4

url = "https://www.techpowerup.com/gpudb/2"

#obtain HTML and parse through it
response = requests.get(url)
html = response.content

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

soup = bs4.BeautifulSoup(html, "lxml")
tables = soup.findAll("table")

#reading every value in every row in each table and making a matrix
tableMatrix = []
for table in tables:
    list_of_rows = []
    for row in table.findAll('tr'):
        list_of_cells = []
        for cell in row.findAll('td'):
            text = cell.text.replace(' ', '')
            list_of_cells.append(text)
        list_of_rows.append(list_of_cells)
    tableMatrix.append((list_of_rows, list_of_cells))

#(YOU CAN PROBABLY IGNORE THIS) placeHolder used to avoid duplicate data from appearing in list
placeHolder = 0
excelTable = []
for table in tableMatrix:
    for row in table:
        if placeHolder == 0:
            for entry in row:
                excelTable.append(entry)
            placeHolder = 1
        else:
            placeHolder = 0
    excelTable.append('\n')

for value in excelTable:
    print value
    print '\n'

#create excel file and write the values into a csv
fl = open(str(count) + '.csv', 'w')
writer = csv.writer(fl)
for values in excelTable:
    writer.writerow(values)
fl.close()
If you check the page source, those cells are header cells, so they are not using TD tags but TH tags. You may want to update your loop to include TH cells alongside TD cells.
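BeautifulSoup's findAll accepts a list of tag names, so only the inner loop needs to change. A short sketch of the adjusted loop, reusing the variable names from the question (the cell clean-up is simplified here):

for row in table.findAll('tr'):
    list_of_cells = []
    # th cells hold the sub-header labels ("GPU Name", "Process Size", ...),
    # td cells hold the values; collect both in document order
    for cell in row.findAll(['th', 'td']):
        list_of_cells.append(cell.text.strip())
    list_of_rows.append(list_of_cells)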
Web table scraping: how do I find the column number of a cell in Excel using Python
I have an excel file with many Chinese names in the first row like this: enter image description here And what I am doing is to scrape some more Chinese names from a web table and the names are all at the 2nd col in each row (tr). I want to see if the names being scraped is already in my excel file. So I use a boolean have to keep track. It should return True if found. And I want to know the exact position (column number) of the found name, so I use name_position to keep track. from lxml import html from bs4 import BeautifulSoup import requests import openpyxl from openpyxl.workbook import Workbook wb=openpyxl.load_workbook('hehe.xlsx') ws1=wb.get_sheet_by_name('Taocan') page = requests.get(url) tree = html.fromstring(page.text) web = page.text soup = BeautifulSoup(web, 'lxml') table = soup.find('table', {'class': "tc_table"}) trs = table.find_all('tr') for tr in trs: ls = [] for td in tr.find_all('td'): ls.append(td.text) ls = [x.encode('utf-8') for x in ls] try: name = ls[1] have = False name_position = 1 for cell in ws1[1]: if name == cell: have = True break else: name_position += 1 except IndexError: print("there is an index error") However, my code doesn't seem to work, and I think the problem is from the comparison of the names: if name == cell I changed to: if name == cell.value it still doesn't work. Can anyone help me with this? thanks/: Just to add on: the web page Im scraping is also in Chinese. So when I print(ls) it gives a list like this ['1', '\xe4\xb8\x80\xe8\x88\xac\xe6\xa3\x80\xe6\x9f\xa5', '\xe8\xba\xab\xe9\xab\x98\xe3\x80\x81\xe4\xbd\x93\xe9\x87\x8d\xe3\x80\x81\xe4\xbd\x93\xe9\x87\x8d\xe6\x8c\x87\xe6\x95\xb0\xe3\x80\x81\xe8\x85\xb0\xe5\x9b\xb4\xe3\x80\x81\xe8\x88\x92\xe5\xbc\xa0\xe5\x8e\x8b\xe3\x80\x81\xe6\x94\xb6\xe7\xbc\xa9\xe5\x8e\x8b\xe3\x80\x81\xe8\xa1\x80\xe5\x8e\x8b\xe6\x8c\x87\xe6\x95\xb0', '\xe9\x80\x9a\xe8\xbf\x87\xe4\xbb\xaa\xe5\x99\xa8\xe6\xb5\x8b\xe9\x87\x8f\xe4\xba\xba\xe4\xbd\x93\xe8\xba\xab\xe9\xab\x98\xe3\x80\x81\xe4\xbd\x93\xe9\x87\x8d\xe3\x80\x81\xe4\xbd\x93\xe8\x84\x82\xe8\x82\xaa\xe7\x8e\x87\xe5\x8f\x8a\xe8\xa1\x80\xe5\x8e\x8b\xef\xbc\x8c\xe7\xa7\x91\xe5\xad\xa6\xe5\x88\xa4\xe6\x96\xad\xe4\xbd\x93\xe9\x87\x8d\xe6\x98\xaf\xe5\x90\xa6\xe6\xa0\x87\xe5\x87\x86\xe3\x80\x81\xe8\xa1\x80\xe5\x8e\x8b\xe6\x98\xaf\xe5\x90\xa6\xe6\xad\xa3\xe5\xb8\xb8\xe3\x80\x81\xe4\xbd\x93\xe8\x84\x82\xe8\x82\xaa\xe6\x98\xaf\xe5\x90\xa6\xe8\xb6\x85\xe6\xa0\x87\xe3\x80\x82'] but if I print(ls[1]) it gives Chinese name like "广州"
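One likely culprit, though this is an assumption since the workbook contents are not shown, is a bytes-versus-unicode mismatch in Python 2: the scraped names are encoded to UTF-8 byte strings by the encode() call, while openpyxl returns unicode cell values, so the equality test never succeeds. A minimal sketch of the lookup that keeps both sides as unicode:

name = tr.find_all('td')[1].text.strip()   # note: no .encode('utf-8') here
have = False
name_position = 1
for cell in ws1[1]:
    if cell.value is not None and cell.value == name:
        have = True
        break
    name_position += 1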