How do I web scrape the sub-headers from this link? - python
I've made a web scraper that scrapes data from pages that look like this (it scrapes the tables): https://www.techpowerup.com/gpudb/2/
The problem is that my program, for some reason, only scrapes the values and not the subheaders. For instance, on the linked page it only scrapes "R420", "130nm", "160 million", etc., but not the labels "GPU Name", "Process Size", "Transistors", etc.
What do I add to the code to get it to scrape the subheaders? Here's my code:
import csv
import requests
import bs4

url = "https://www.techpowerup.com/gpudb/2"

# obtain HTML and parse through it
response = requests.get(url)
html = response.content

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

soup = bs4.BeautifulSoup(html, "lxml")
tables = soup.findAll("table")

# read every value in every row in each table and make a matrix
tableMatrix = []
for table in tables:
    list_of_rows = []
    for row in table.findAll('tr'):
        list_of_cells = []
        for cell in row.findAll('td'):
            text = cell.text.replace(' ', '')
            list_of_cells.append(text)
        list_of_rows.append(list_of_cells)
    tableMatrix.append((list_of_rows, list_of_cells))

# (you can probably ignore this) placeHolder is used to avoid duplicate data appearing in the list
placeHolder = 0
excelTable = []
for table in tableMatrix:
    for row in table:
        if placeHolder == 0:
            for entry in row:
                excelTable.append(entry)
            placeHolder = 1
        else:
            placeHolder = 0
    excelTable.append('\n')

for value in excelTable:
    print value
    print '\n'

# create excel file and write the values into a csv
fl = open(str(count) + '.csv', 'w')
writer = csv.writer(fl)
for values in excelTable:
    writer.writerow(values)
fl.close()
If you check the page source, those cells are header cells, so they use TH tags rather than TD tags. You may want to update your loop to include TH cells alongside TD cells.
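For example, a minimal sketch of that change, reusing the question's variable names (BeautifulSoup's findAll accepts a list of tag names, so one pass can pick up both):

for row in table.findAll('tr'):
    list_of_cells = []
    # search for both header (th) and data (td) cells in one pass
    for cell in row.findAll(['th', 'td']):
        text = cell.text.replace(' ', '')
        list_of_cells.append(text)
    list_of_rows.append(list_of_cells)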
Related
How can I export a table from a webpage to csv? [duplicate]
I want to convert an HTML table obtained from the script below into a CSV file, but I get a type error: TypeError: sequence item 0: expected string, Tag found

from bs4 import BeautifulSoup
import urllib2

url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1'
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html)
table = soup.find_all('table', class_='data2_s')
rows = table[0].find_all('tr')

What is the easiest way to convert it into a CSV file? I tried:

fo = open('fo.txt','w')
for r in rows:
    fo.write(str(r.txt) + '\n')
fo.close()

but it wrote 'None'. The HTML is like this:

<table class="data2_s">
<caption class="m">WAKKANAI WMO Station ID:47401 Lat 45<sup>o</sup>24.9'N Lon 141<sup>o</sup>40.7'E</caption>
<tr><th scope="col">Year</th><th scope="col">Jan</th><th scope="col">Feb</th><th scope="col">Mar</th><th scope="col">Apr</th><th scope="col">May</th><th scope="col">Jun</th><th scope="col">Jul</th><th scope="col">Aug</th><th scope="col">Sep</th><th scope="col">Oct</th><th scope="col">Nov</th><th scope="col">Dec</th><th scope="col">Annual</th></tr>
<tr class="mtx" style="text-align:right;"><td style="text-align:center">1938</td><td class="data_0_0_0_0">-5.2</td><td class="data_0_0_0_0">-4.9</td><td class="data_0_0_0_0">-0.6</td><td class="data_0_0_0_0">4.7</td><td class="data_0_0_0_0">9.5</td><td class="data_0_0_0_0">11.6</td><td class="data_0_0_0_0">17.9</td><td class="data_0_0_0_0">22.2</td><td class="data_0_0_0_0">16.5</td><td class="data_0_0_0_0">10.7</td><td class="data_0_0_0_0">3.3</td><td class="data_0_0_0_0">-4.7</td><td class="data_0_0_0_0">6.8</td></tr>
<tr class="mtx" style="text-align:right;"><td style="text-align:center">1939</td><td class="data_0_0_0_0">-7.5</td><td class="data_0_0_0_0">-6.6</td><td class="data_0_0_0_0">-1.4</td><td class="data_0_0_0_0">4.0</td><td class="data_0_0_0_0">7.5</td><td class="data_0_0_0_0">13.0</td><td class="data_0_0_0_0">17.4</td><td class="data_0_0_0_0">20.0</td><td class="data_0_0_0_0">17.4</td><td class="data_0_0_0_0">9.7</td><td class="data_0_0_0_0">3.0</td><td class="data_0_0_0_0">-2.5</td><td class="data_0_0_0_0">6.2</td></tr>
This is a job for the csv lib: get each td inside each row and extract the text. It will handle rows where there are missing values:

from bs4 import BeautifulSoup
import urllib2
import csv

url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1'
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html)

table = soup.select_one("table.data2_s")
# python3 just use th.text
headers = [th.text.encode("utf-8") for th in table.select("tr th")]

with open("out.csv", "w") as f:
    wr = csv.writer(f)
    wr.writerow(headers)
    wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")]
                  for row in table.select("tr + tr")])

Which matches the table exactly as you see it on the page:

:~$ cat out.csv
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
1938,-5.2,-4.9,-0.6,4.7,9.5,11.6,17.9,22.2,16.5,10.7,3.3,-4.7,6.8
1939,-7.5,-6.6,-1.4,4.0,7.5,13.0,17.4,20.0,17.4,9.7,3.0,-2.5,6.2
1940,-6.0,-5.7,-0.5,3.5,8.5,11.0,16.6,19.7,15.6,10.4,3.7,-1.0,6.3
1941,-6.5,-5.8,-2.6,3.6,8.1,11.4,12.7,16.5,16.0,10.0,4.0,-2.9,5.4
1942,-7.8,-8.2,-0.8,3.5,7.1,12.0,17.4,18.4,15.7,10.5,2.5,-2.9,5.6
1943,-4.1,-6.1,-1.1,3.5,6.9,12.9,19.3,21.5,17.5,11.7,1.2,-3.6,6.6
1944,-7.7,-7.9,-2.2,1.7,8.9,13.7,19.0,21.3,16.6,10.8,1.3,-6.0,5.8
1945,-7.8,-6.9,-1.8,3.9,5.5,11.0,13.6,18.7,16.8,11.0,3.9,-4.8,5.3
1946,-6.5,-6.0,-3.3,4.5,7.6,14.9,18.2,22.2,16.9,11.5,4.4,-2.5,6.8
1947,-4.9,-5.5,-2.3,3.7,9.0,11.2,17.1,19.3,15.1,10.6,2.4,-4.6,5.9
1948,-2.7,-4.4,-0.2,6.0,10.7,12.2,16.2,22.0,16.9,11.1,4.2,-0.6,7.6
1949,-2.6,-2.8,-3.4,2.0,9.4,11.8,16.9,20.8,17.8,10.8,3.1,-3.8,6.7
1950,-5.7,-4.8,-1.3,4.0,9.2,14.6,19.3,22.6,16.8,9.0,3.0,-2.9,7.0
1951,-6.7,-6.5,-2.2,3.7,9.5,12.3,16.7,22.3,15.6,10.1,3.7,-0.3,6.5
1952,-5.7,-7.1,-2.4,3.8,8.3,13.1,16.4,19.7,17.0,11.3,0.9,-7.1,5.7
1953,-7.7,-7.3,-0.9,3.6,6.9,11.1,16.8,19.2,17.6,11.2,-0.6,-2.6,5.6
1954,-6.7,-4.1,-2.5,4.0,7.5,11.0,13.7,17.0,17.2,9.5,3.2,-1.8,5.7
1955,-6.4,-4.8,-1.3,4.7,7.0,12.7,20.3,19.5,15.5,10.6,3.6,-0.4,6.8
1956,-6.1,-4.6,-2.0,5.1,10.8,11.2,13.8,16.3,17.2,12.3,2.8,-2.6,6.2
1957,-3.9,-5.5,-2.9,4.4,9.3,10.9,17.1,18.2,15.5,11.1,5.4,-1.1,6.5
1958,-4.9,-4.9,-2.3,4.4,8.5,12.6,17.5,18.3,16.8,10.6,4.5,-0.5,6.7
1959,-7.3,-2.8,0.8,6.4,9.4,12.7,17.1,18.5,16.2,11.6,2.9,-3.9,6.8
1960,-7.2,-5.2,-1.4,3.5,7.7,10.8,15.9,20.8,18.1,9.7,3.3,-3.9,6.0
1961,-7.7,-5.3,-1.4,5.5,8.7,14.7,19.5,20.0,18.9,10.4,4.1,-1.3,7.2
1962,-4.2,-5.4,-2.5,6.7,10.0,12.9,16.8,17.7,16.6,9.9,2.6,-1.5,6.6
1963,-3.6,-3.7,0.1,5.0,10.4,12.4,16.8,17.1,15.6,10.7,4.3,-1.7,7.0
1964,-4.5,-7.7,-1.3,3.7,9.9,11.9,15.3,17.7,14.9,10.0,3.6,-1.9,6.0
1965,-4.1,-5.7,-2.8,3.2,9.1,13.3,15.2,18.8,15.8,11.4,2.1,-2.6,6.1
1966,-5.0,-5.5,-1.0,3.2,8.1,12.2,15.3,17.5,15.4,11.6,4.1,-4.4,6.0
1967,-6.8,-5.9,-0.7,4.5,10.0,11.4,16.4,20.5,15.5,11.0,1.8,-1.5,6.4
1968,-4.2,-4.7,1.9,5.7,8.9,14.5,17.3,18.1,15.9,9.1,5.3,-0.7,7.3
1969,-7.3,-7.5,-2.5,3.9,7.2,10.6,17.0,16.5,16.1,9.4,2.2,-5.4,5.0
1970,-6.6,-6.0,-4.2,4.6,10.4,12.9,17.4,19.2,16.8,10.5,4.3,-3.3,6.3
1971,-6.3,-6.4,-1.7,4.1,7.6,11.6,15.8,17.2,15.2,11.5,3.4,-2.2,5.8
1972,-5.3,-5.0,-0.6,5.9,9.4,12.8,16.8,20.4,15.7,10.9,1.9,-1.4,6.8
1973,-4.2,-5.3,-2.9,4.2,8.4,12.8,17.0,20.9,17.1,10.4,3.5,-1.9,6.7
1974,-2.6,-4.6,-2.1,4.0,8.4,11.8,16.8,18.8,16.5,10.1,1.9,-5.7,6.1
1975,-4.1,-6.1,-1.5,4.3,8.4,13.7,16.1,20.6,17.3,10.4,3.8,-3.8,6.6
1976,-4.6,-3.5,-1.4,4.0,8.9,11.9,17.5,17.6,15.7,10.2,1.3,-2.0,6.3
1977,-8.3,-7.1,-1.0,3.6,8.0,11.9,18.2,19.1,17.4,11.4,4.5,-1.8,6.3
1978,-6.7,-9.2,-1.6,4.3,9.2,13.5,20.6,21.3,17.4,9.6,3.4,-2.1,6.6
1979,-6.9,-4.5,-2.5,2.7,7.8,13.2,15.8,20.3,16.9,11.3,2.9,-0.1,6.4
1980,-5.4,-7.1,-1.9,1.9,7.8,12.9,15.9,16.5,16.0,10.0,4.3,-0.6,5.9
1981,-5.4,-6.3,-2.6,5.6,8.1,11.8,17.1,18.7,16.0,10.5,0.8,-0.6,6.1
1982,-5.6,-5.3,-0.6,3.7,9.0,11.9,16.9,21.0,17.5,11.4,4.3,-1.0,6.9
1983,-4.2,-7.6,-1.9,6.8,8.2,8.5,14.5,18.9,15.8,8.9,4.8,-2.1,5.9
1984,-4.9,-6.6,-3.3,2.9,7.9,15.5,19.5,20.5,16.6,9.2,2.3,-3.6,6.3
1985,-8.7,-4.8,-1.4,4.9,8.6,11.7,16.6,21.1,15.7,10.3,2.7,-4.2,6.0
1986,-7.2,-6.5,-2.4,4.6,8.4,11.2,14.4,19.6,16.8,9.1,2.1,-1.9,5.7
1987,-6.4,-5.6,-1.4,4.2,8.6,12.6,17.5,18.0,16.4,11.1,2.0,-3.1,6.2
1988,-4.8,-6.3,-1.8,4.1,8.0,12.6,14.1,20.4,16.1,10.4,2.0,-1.5,6.1
1989,-2.6,-2.4,0.8,4.0,8.2,10.7,18.4,20.4,16.8,10.8,4.8,-1.3,7.4
1990,-5.7,-2.4,1.4,5.7,9.3,13.4,18.9,20.3,17.1,13.3,6.2,1.2,8.2
1991,-1.6,-3.6,-1.5,4.8,10.1,14.3,16.2,19.0,16.6,11.8,3.5,-2.3,7.3
1992,-3.6,-3.6,-0.4,3.7,8.1,12.1,17.6,18.0,14.9,11.1,3.2,-1.2,6.7
1993,-2.7,-3.3,-0.2,3.1,8.6,10.7,15.6,17.6,16.3,11.1,3.7,-1.6,6.6
1994,-6.1,-2.7,-1.3,4.4,10.0,12.8,17.4,21.7,17.5,11.8,4.3,-2.9,7.2
1995,-4.0,-4.0,-0.8,4.8,11.0,12.7,18.4,19.3,16.3,12.3,5.2,-0.6,7.6
1996,-4.6,-4.5,-1.0,3.5,6.9,12.0,15.9,18.7,16.8,10.4,2.3,-2.4,6.2
1997,-3.0,-3.3,-1.5,4.3,7.3,11.7,17.4,17.2,16.1,10.3,6.4,-0.7,6.9
1998,-6.9,-5.1,0.3,5.3,10.1,12.9,15.5,18.1,17.2,12.5,2.0,-2.4,6.6
1999,-4.1,-5.6,-2.6,4.2,8.4,14.5,16.6,21.0,18.3,11.2,3.8,-1.9,7.0
2000,-4.2,-5.6,-2.1,3.5,9.3,12.8,18.9,21.5,17.7,10.6,1.5,-4.1,6.7
2001,-6.3,-7.7,-2.4,4.7,8.5,13.0,17.4,18.7,15.6,10.8,4.0,-4.2,6.0
2002,-3.6,-1.0,0.5,6.8,11.1,12.1,15.7,17.1,17.0,10.8,2.3,-4.4,7.0
2003,-4.7,-5.6,-0.7,5.3,10.1,13.9,14.3,18.4,16.6,11.3,4.5,-1.4,6.8
2004,-3.9,-3.0,-0.5,4.4,10.6,14.6,16.8,19.7,17.8,11.8,5.9,-2.0,7.7
2005,-4.6,-5.7,-1.0,3.9,7.0,14.3,16.7,21.0,17.9,12.6,4.9,-2.3,7.1
2006,-5.5,-4.7,-0.9,2.1,9.3,11.9,18.4,21.6,17.7,11.0,4.5,-1.8,7.0
2007,-3.7,-3.2,-0.7,3.5,7.6,14.3,16.7,20.4,17.0,10.9,3.0,-1.7,7.0
2008,-6.0,-4.8,0.6,6.0,8.3,11.9,17.9,18.8,17.9,11.5,3.8,-0.4,7.1
2009,-2.4,-4.4,0.0,4.5,10.0,12.3,14.8,18.6,16.9,11.4,3.1,-2.2,6.9
2010,-3.4,-4.9,-1.4,3.5,7.3,15.0,18.1,22.4,18.4,11.4,4.8,-1.1,7.5
2011,-5.1,-2.2,-0.6,4.4,6.5,12.8,17.5 ),21.5,18.3,12.1,4.9,-2.3,7.3
2012,-5.4,-6.4,-2.4,4.6,8.9,12.6,17.2,20.4,19.4,11.8,3.8,-3.0,6.8
2013,-5.8,-5.1,-1.3,4.5,7.2,14.0,18.9,20.2,17.6,11.8,5.5,-0.2,7.3
2014,-5.3,-4.2,-1.2,3.9,8.7,13.9,19.2,20.0,16.7,11.0,4.8,-2.3,7.1
2015,-2.9,-1.7,2.3,5.9,9.9,12.1,17.6,19.0,17.3,10.4,3.7,-0.2,7.8
2016,-5.2,-4.7,0.5,4.3,11.4,12.5,17.4,21.8 ], , , , ,5.2 ]

If you want the caption, use table.select_one("caption.m").text:

with open("out.csv", "w") as f:
    wr = csv.writer(f)
    wr.writerow([table.select_one("caption.m").text.encode("utf-8")])
    wr.writerow(headers)
    wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")]
                  for row in table.select("tr + tr")])

but it might be a better idea to use that as the name of the file, as opposed to adding it to the csv.
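If you go the filename route, a minimal sketch (assuming the same table, headers, and csv import as above; the "/" replacement is only an illustrative guard against filename-unsafe characters):

# sketch: use the caption as the csv filename instead of a header row
caption = table.select_one("caption.m").text.encode("utf-8")
with open(caption.replace("/", "-") + ".csv", "w") as f:
    wr = csv.writer(f)
    wr.writerow(headers)
    wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")]
                  for row in table.select("tr + tr")])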
If you really want to do it without the csv module, use the same logic with str.join:

table = soup.select_one("table.data2_s")
headers = [th.text.encode("utf-8") for th in table.select("tr th")]

with open("out.csv", "w") as f:
    f.write(",".join(headers) + "\n")
    f.writelines(",".join([td.text.encode("utf-8") for td in row.find_all("td")]) + "\n"
                 for row in table.select("tr + tr"))

If you want to replace the empty cells with N/A:

with open("out.csv", "w") as f:
    f.write(",".join(headers) + "\n")
    f.writelines(",".join([td.text.encode("utf-8").strip('\xe3\x80\x80') or "N/A"
                           for td in row.find_all("td")]) + "\n"
                 for row in table.select("tr + tr"))

Which will change the last row to:

2016,-5.2,-4.7,0.5,4.3,11.4,12.5,17.4,21.8 ],N/A,N/A,N/A,N/A,5.2 ]

The spaces for the missing values are unicode ideographic space characters (u"\u3000" in Python), which become '\xe3\x80\x80' when encoded to utf-8. We strip that, and if stripping leaves an empty string we substitute "N/A":

In [7]: print u"\u3000"

In [8]: u"\u3000".encode("utf-8")
Out[8]: '\xe3\x80\x80'

In [9]: u"\u3000".encode("utf-8").strip('\xe3\x80\x80')
Out[9]: ''
Use the csv module from Python to do this. You can obviously write more columns if you want, but the idea is that you're writing a list to the csv file. There are other options you can specify in the writer() method if you'd like to quote things, escape things, etc.

import csv

with open('your_csv_name.csv', 'w') as o:
    w = csv.writer(o)
    # Headers
    w.writerow(['tr_content'])
    # Write the tr text
    for r in rows:
        w.writerow([r])
Here is another way, without using the csv module:

fp = open('data.csv', 'w')
# the last row is removed as it has empty cells that give an error; this can also be resolved if needed
for row in rows[:-1]:
    fp.write(row.get_text(',') + '\n')
fp.close()

You can open data.csv directly. The station details can be obtained with the command below:

>>> table = soup.find_all('table', class_='data2_s')
>>> print table[0].find_all('caption')[0].get_text().encode('ascii', 'ignore')
WAKKANAI WMO Station ID:47401 Lat 45o24.9'N Lon 141o40.7'E

Hope this helps.
import csv
from bs4 import BeautifulSoup
import pandas as pd

html = open('test.html').read()
soup = BeautifulSoup(html, features='lxml')

# Specify the table you want to read.
# Example: <table class="queryResults" border="0" cellspacing="1">
table = soup.select_one('table.queryResults')

def get_all_tables(soup):
    return soup.find_all("table")

tbls = get_all_tables(soup)
for i, tablen in enumerate(tbls, start=1):
    print(i)
    print(tablen)

def get_table_headers(table):
    headers = []
    for th in table.find("tr").find_all("th"):
        headers.append(th.text.strip())
    return headers

head = get_table_headers(table)
#print(head)

def get_table_rows(table):
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = []
        # grab all td tags in this table row
        tds = tr.find_all("td")
        if len(tds) == 0:
            # if no td tags, search for th tags
            # (found especially in wikipedia tables, below the table)
            ths = tr.find_all("th")
            for th in ths:
                cells.append(th.text.strip())
        else:
            # use regular td tags
            for td in tds:
                cells.append(td.text.strip())
        rows.append(cells)
    return rows

table_rows = get_table_rows(table)
#print(table_rows)

def save_as_csv(table_name, headers, rows):
    pd.DataFrame(rows, columns=headers).to_csv(f"{table_name}.csv")

save_as_csv("Test_table", head, table_rows)
Data is being written three times instead of once using requests and openpyxl?
Here is the code I am using:

from Sid.drivercommand import *
from Stocks.lot_size import symbol_list
from bs4 import BeautifulSoup
import datetime
from openpyxl import Workbook

p1_url = 'https://www.nseindia.com/marketinfo/companyTracker/mtOptionKeys.jsp?companySymbol='
p3_url = '&indexSymbol=NIFTY&series=EQ&instrument=OPTSTK&date=-'
symbol_list = ['ACC', 'SBIN', 'PNB']

loc = r"C:\Users\OneDrive\Stock Study"
loc_opt_data = r"C:\Users\OneDrive\Stock Study\Opt Data"
os.chdir(loc_opt_data)
date = 'Test_Data_' + str(datetime.date.today()) + ".xlsx"
datewise_data = Workbook(date)
os.chdir(loc)

for symbol in symbol_list[0:1]:
    url = p1_url + symbol + p3_url
    raw_page = requests.get(url).text
    soup = BeautifulSoup(raw_page, "lxml")
    if len(raw_page) > 0:
        datewise_data_sheet = datewise_data.create_sheet(symbol)
        for table in soup.findAll('table'):
            '# Fields: ' + ','.join([tr.text for tr in table.findAll('th')])
            for row in table.findAll('tr'):
                # ws1.append(([tr.text for tr in row.findAll('td')]))
                datewise_data_sheet.append(([tr.text for tr in row.findAll('td')]))
                # test.append(([tr.text for tr in row.findAll('td')]))
    #print(symbol)
    raw_page = 0

datewise_data.save(date)

The file generated from this has the data, but each symbol has three sets of data while I only want the first set/table.
You have tables within tables on the web page, so your code as written finds the tr elements three times: once in the outer table, once in the inner table (which contains the same tr elements), and a third time because the outer table's td contains the inner table, so row.findAll('td') picks them up again. You just need to find the data once, so use:

table = soup.findAll('table')[1]
for row in table.findAll('tr'):

instead of:

for table in soup.findAll('table'):
    for row in table.findAll('tr'):

This will go straight to the table within the table and give you one set of results.
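A minimal sketch of the fixed loop, assuming the question's variable names and that the inner table is the second table element on the page:

# each row is now appended to the worksheet exactly once
table = soup.findAll('table')[1]
for row in table.findAll('tr'):
    datewise_data_sheet.append([td.text for td in row.findAll('td')])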
Web table scraping: how do I find the column number of a cell in excel using python
I have an excel file with many Chinese names in the first row. What I am doing is scraping some more Chinese names from a web table; the names are all in the 2nd column of each row (tr). I want to see if a scraped name is already in my excel file, so I use a boolean have to keep track; it should be True if the name is found. I also want to know the exact position (column number) of the found name, so I use name_position to keep track.

from lxml import html
from bs4 import BeautifulSoup
import requests
import openpyxl
from openpyxl.workbook import Workbook

wb = openpyxl.load_workbook('hehe.xlsx')
ws1 = wb.get_sheet_by_name('Taocan')

page = requests.get(url)
tree = html.fromstring(page.text)
web = page.text
soup = BeautifulSoup(web, 'lxml')
table = soup.find('table', {'class': "tc_table"})
trs = table.find_all('tr')

for tr in trs:
    ls = []
    for td in tr.find_all('td'):
        ls.append(td.text)
    ls = [x.encode('utf-8') for x in ls]
    try:
        name = ls[1]
        have = False
        name_position = 1
        for cell in ws1[1]:
            if name == cell:
                have = True
                break
            else:
                name_position += 1
    except IndexError:
        print("there is an index error")

However, my code doesn't seem to work, and I think the problem is the comparison of the names: if name == cell. I changed it to if name == cell.value and it still doesn't work. Can anyone help me with this? Thanks.

Just to add on: the web page I'm scraping is also in Chinese, so print(ls) gives a list like this:

['1', '\xe4\xb8\x80\xe8\x88\xac\xe6\xa3\x80\xe6\x9f\xa5', '\xe8\xba\xab\xe9\xab\x98\xe3\x80\x81\xe4\xbd\x93\xe9\x87\x8d\xe3\x80\x81\xe4\xbd\x93\xe9\x87\x8d\xe6\x8c\x87\xe6\x95\xb0\xe3\x80\x81\xe8\x85\xb0\xe5\x9b\xb4\xe3\x80\x81\xe8\x88\x92\xe5\xbc\xa0\xe5\x8e\x8b\xe3\x80\x81\xe6\x94\xb6\xe7\xbc\xa9\xe5\x8e\x8b\xe3\x80\x81\xe8\xa1\x80\xe5\x8e\x8b\xe6\x8c\x87\xe6\x95\xb0', '\xe9\x80\x9a\xe8\xbf\x87\xe4\xbb\xaa\xe5\x99\xa8\xe6\xb5\x8b\xe9\x87\x8f\xe4\xba\xba\xe4\xbd\x93\xe8\xba\xab\xe9\xab\x98\xe3\x80\x81\xe4\xbd\x93\xe9\x87\x8d\xe3\x80\x81\xe4\xbd\x93\xe8\x84\x82\xe8\x82\xaa\xe7\x8e\x87\xe5\x8f\x8a\xe8\xa1\x80\xe5\x8e\x8b\xef\xbc\x8c\xe7\xa7\x91\xe5\xad\xa6\xe5\x88\xa4\xe6\x96\xad\xe4\xbd\x93\xe9\x87\x8d\xe6\x98\xaf\xe5\x90\xa6\xe6\xa0\x87\xe5\x87\x86\xe3\x80\x81\xe8\xa1\x80\xe5\x8e\x8b\xe6\x98\xaf\xe5\x90\xa6\xe6\xad\xa3\xe5\xb8\xb8\xe3\x80\x81\xe4\xbd\x93\xe8\x84\x82\xe8\x82\xaa\xe6\x98\xaf\xe5\x90\xa6\xe8\xb6\x85\xe6\xa0\x87\xe3\x80\x82']

but print(ls[1]) gives a Chinese name like "广州".
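One likely culprit, sketched under the assumption that this is Python 2 (as the byte strings suggest): ls holds UTF-8 encoded byte strings, while openpyxl returns unicode from cell.value, so the two never compare equal. Decoding before comparing lines the types up:

# sketch, assuming Python 2 and the question's variable names
name = ls[1].decode('utf-8')    # back to unicode so it can match cell.value
have = False
name_position = 1
for cell in ws1[1]:
    if name == cell.value:      # compare unicode against unicode
        have = True
        break
    name_position += 1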
How to save the string, one word per column in Python?
I'm scraping the names of massage therapists along with their addresses from a directory. The addresses are all being saved into the CSV in one column as a single string, but the title/name of each therapist is being saved one word per column, spread over 2 or 3 columns. What do I need to do to get the extracted string to save in one column, the way the addresses are saved? (The first two lines below are example HTML from the page; the rest is the extract from the script targeting this element.)

<span class="name">
<img src="/images/famt-placeholder-sm.jpg" class="thumb" alt="Tiffani D Abraham"> Tiffani D Abraham</span>

import mechanize
from lxml import html
import csv
import io
from time import sleep

def save_products(products, writer):
    for product in products:
        for price in product['prices']:
            writer.writerow([product["title"].encode('utf-8')])
            writer.writerow([price["contact"].encode('utf-8')])
            writer.writerow([price["services"].encode('utf-8')])

f_out = open('mtResult.csv', 'wb')
writer = csv.writer(f_out)

links = ["https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY",
         "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=2&PageSize=10",
         "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=3&PageSize=10",
         "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=4&PageSize=10",
         "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=5&PageSize=10",
         "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=6&PageSize=10",
         "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=7&PageSize=10",
         "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=8&PageSize=10",
         "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=9&PageSize=10",
         "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=10&PageSize=10"]

br = mechanize.Browser()
for link in links:
    print(link)
    r = br.open(link)
    content = r.read()
    products = []
    tree = html.fromstring(content)
    product_nodes = tree.xpath('//ul[@class="famt-results"]/li')
    for product_node in product_nodes:
        product = {}
        price_nodes = product_node.xpath('.//a')
        product['prices'] = []
        for price_node in price_nodes:
            price = {}
            try:
                product['title'] = product_node.xpath('.//span[1]/text()')[0]
            except:
                product['title'] = ""
            try:
                price['services'] = price_node.xpath('./span[2]/text()')[0]
            except:
                price['services'] = ""
            try:
                price['contact'] = price_node.xpath('./span[3]/text()')[0]
            except:
                price['contact'] = ""
            product['prices'].append(price)
        products.append(product)
    save_products(products, writer)

f_out.close()
I'm not positive this solves the issue you were having, but either way there are a few improvements and modifications you might be interested in. For example, since each link varies only by a page index, you can loop through the links rather than copying all 50 into a list. Each therapist per page also has their own index, so you can loop through the XPaths for each therapist's information as well.

# import modules
import mechanize
from lxml import html
import csv
import io

# open browser
br = mechanize.Browser()

# create file headers
titles = ["NAME"]
services = ["TECHNIQUE(S)"]
contacts = ["CONTACT INFO"]

# loop through all 50 webpages for therapist data
for link_index in range(1, 50):
    link = "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=" + str(link_index) + "&PageSize=10"
    r = br.open(link)
    page = r.read()
    tree = html.fromstring(page)

    # loop through therapist data for each therapist per page
    for therapist_index in range(1, 10):
        # store names
        title = tree.xpath('//*[@id="content"]/div[2]/ul[1]/li[' + str(therapist_index) + ']/a/span[1]/text()')
        titles.append(" ".join(title))

        # store techniques and convert to unicode
        service = tree.xpath('//*[@id="content"]/div[2]/ul[1]/li[' + str(therapist_index) + ']/a/span[2]/text()')
        try:
            services.append(service[0].encode("utf-8"))
        except:
            services.append(" ")

        # store contact info and convert to unicode
        contact = tree.xpath('//*[@id="content"]/div[2]/ul[1]/li[' + str(therapist_index) + ']/a/span[3]/text()')
        try:
            contacts.append(contact[0].encode("utf-8"))
        except:
            contacts.append(" ")

# open file to write to
f_out = open('mtResult.csv', 'wb')
writer = csv.writer(f_out)

# get rows in correct format
rows = zip(titles, services, contacts)

# write csv line by line
for row in rows:
    writer.writerow(row)
f_out.close()

The script loops through all 50 result pages and seems to scrape all relevant information for each therapist where provided. Finally, it writes all the data to a CSV, stored under the respective columns 'Name', 'Technique(s)', and 'Contact Info', if that is what you were originally struggling with. Hope this helps!
HTML file writing from a for loop's output
I have written a Python web scraper like this:

from selenium import webdriver
from BeautifulSoup import BeautifulSoup

wd = webdriver.Firefox()
wd.get('http://www.nseindia.com/live_market/dynaContent/live_analysis/top_gainers_losers.htm?cat=G&utm_campaign=website&utm_source=sendgrid.com&utm_medium=email')
html_page = wd.page_source
wd.quit()

soup = BeautifulSoup(html_page)
table = soup.find("table", attrs={"id": "topGainers"})
print "success"
#print table

for row in table.findAll('tr')[1:]:
    cols = row.findAll('td')
    #print cols
    #break
    some = [cols[0], cols[5], cols[6], cols[9]]
    #print some
    #break
    for td in some:
        if td.find(text=True):
            text = ''.join(td.find(text=True))
            print text + "|"
        else:
            continue

Now I want my output (text) to be in an HTML file, in a table format. How could I do that?
Okay, first, if you want the table to have headers above each column, you should save the heading names in a list like so:

listofheaders = ['header1', 'header2', 'header3']

For each row in the table, save the data included in the row in a list of lists, something like this:

listofrows = [['a', 'b', 'c'], ['a', 'b', 'c'], ['a', 'b', 'c']]

Now create a string with the following:

htmlstuff = '<!DOCTYPE html>\n<html>\n<head>\n<style>\ntable,th,td\n{\nborder:1px solid black;\nborder-collapse:collapse;\n}\nth,td\n{\npadding:5px;\n}\n</style>\n</head>\n\n<body>\n<table style="width:300px">\n<tr>\n '

Now add the column headers to the string:

for header in listofheaders:
    htmlstuff = htmlstuff + '<th>' + str(header) + '</th>\n'

Then you can populate the table row by row:

for row in listofrows:
    htmlstuff += ' <tr>\n'
    for item in row:
        htmlstuff = htmlstuff + ' <td>' + str(item) + '</td>\n'
    htmlstuff += ' </tr>\n'

Finish off the HTML coding:

htmlstuff += '</table>\n</body>\n\n</html>'

Now create the HTML page and write the data:

f = open('webpage.html', 'w')
f.write(htmlstuff)
f.close()

You can even use webbrowser to automatically open the page for you:

import webbrowser
webbrowser.open('webpage.html')