How do I web scrape the sub-headers from this link? - python

I've made a web scraper that scrapes data from pages that look like this (it scrapes the tables): https://www.techpowerup.com/gpudb/2/
The problem is that my program, for some reason, only scrapes the values and not the subheaders. For instance, on the linked page it only scrapes "R420", "130nm", "160 million", etc., but not the labels "GPU Name", "Process Size", "Transistors", etc.
What do I add to the code to get it to scrape the subheaders? Here's my code:
import csv
import requests
import bs4

url = "https://www.techpowerup.com/gpudb/2"

#obtain HTML and parse through it
response = requests.get(url)
html = response.content

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

soup = bs4.BeautifulSoup(html, "lxml")
tables = soup.findAll("table")

#reading every value in every row in each table and making a matrix
tableMatrix = []
for table in tables:
    list_of_rows = []
    for row in table.findAll('tr'):
        list_of_cells = []
        for cell in row.findAll('td'):
            text = cell.text.replace(' ', '')
            list_of_cells.append(text)
        list_of_rows.append(list_of_cells)
    tableMatrix.append((list_of_rows, list_of_cells))

#(YOU CAN PROBABLY IGNORE THIS) placeHolder used to avoid duplicate data from appearing in list
placeHolder = 0
excelTable = []
for table in tableMatrix:
    for row in table:
        if placeHolder == 0:
            for entry in row:
                excelTable.append(entry)
            placeHolder = 1
        else:
            placeHolder = 0
    excelTable.append('\n')

for value in excelTable:
    print value
    print '\n'

#create excel file and write the values into a csv
fl = open(str(count) + '.csv', 'w')
writer = csv.writer(fl)
for values in excelTable:
    writer.writerow(values)
fl.close()

If you check the page source, those cells are header cells, so they use TH tags rather than TD tags. You may want to update your loop to include TH cells alongside TD cells.
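For example, a minimal change to the question's inner loop (a sketch; findAll accepts a list of tag names, so header and data cells come back together in document order):
# grab both header (th) and data (td) cells in one pass
for cell in row.findAll(['th', 'td']):
    text = cell.text.replace(' ', '')
    list_of_cells.append(text)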

Related

How can I export a table from a webpage to csv? [duplicate]

I want to convert an HTML table obtained from the script below into a CSV file, but I get a TypeError as follows:
TypeError: sequence item 0: expected string, Tag found
from bs4 import BeautifulSoup
import urllib2
url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1'
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html)
table = soup.find_all('table', class_='data2_s')
rows = table[0].find_all('tr')
What is the easiest way to convert it into a CSV file?
I tried:
fo = open('fo.txt', 'w')
for r in rows:
    fo.write(str(r.txt) + '\n')
fo.close()
but it wrote 'None' for every row.
The HTML is like this:
<table class="data2_s"><caption class="m">WAKKANAI   WMO Station ID:47401 Lat 45<sup>o</sup>24.9'N  Lon 141<sup>o</sup>40.7'E</caption><tr><th scope="col">Year</th><th scope="col">Jan</th><th scope="col">Feb</th><th scope="col">Mar</th><th scope="col">Apr</th><th scope="col">May</th><th scope="col">Jun</th><th scope="col">Jul</th><th scope="col">Aug</th><th scope="col">Sep</th><th scope="col">Oct</th><th scope="col">Nov</th><th scope="col">Dec</th><th scope="col">Annual</th></tr><tr class="mtx" style="text-align:right;"><td style="text-align:center">1938</td><td class="data_0_0_0_0">-5.2</td><td class="data_0_0_0_0">-4.9</td><td class="data_0_0_0_0">-0.6</td><td class="data_0_0_0_0">4.7</td><td class="data_0_0_0_0">9.5</td><td class="data_0_0_0_0">11.6</td><td class="data_0_0_0_0">17.9</td><td class="data_0_0_0_0">22.2</td><td class="data_0_0_0_0">16.5</td><td class="data_0_0_0_0">10.7</td><td class="data_0_0_0_0">3.3</td><td class="data_0_0_0_0">-4.7</td><td class="data_0_0_0_0">6.8</td></tr>
<tr class="mtx" style="text-align:right;"><td style="text-align:center">1939</td><td class="data_0_0_0_0">-7.5</td><td class="data_0_0_0_0">-6.6</td><td class="data_0_0_0_0">-1.4</td><td class="data_0_0_0_0">4.0</td><td class="data_0_0_0_0">7.5</td><td class="data_0_0_0_0">13.0</td><td class="data_0_0_0_0">17.4</td><td class="data_0_0_0_0">20.0</td><td class="data_0_0_0_0">17.4</td><td class="data_0_0_0_0">9.7</td><td class="data_0_0_0_0">3.0</td><td class="data_0_0_0_0">-2.5</td><td class="data_0_0_0_0">6.2</td></tr>
This is a job for the csv lib: get each td inside each row and extract the text. It will handle rows where values are missing:
from bs4 import BeautifulSoup
import urllib2
import csv

url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1'
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html)
table = soup.select_one("table.data2_s")

# python3 just use th.text
headers = [th.text.encode("utf-8") for th in table.select("tr th")]

with open("out.csv", "w") as f:
    wr = csv.writer(f)
    wr.writerow(headers)
    wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")]
                  for row in table.select("tr + tr")])
Which matches the table exactly as you see on the page:
:~$ cat out.csv
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
1938,-5.2,-4.9,-0.6,4.7,9.5,11.6,17.9,22.2,16.5,10.7,3.3,-4.7,6.8
1939,-7.5,-6.6,-1.4,4.0,7.5,13.0,17.4,20.0,17.4,9.7,3.0,-2.5,6.2
1940,-6.0,-5.7,-0.5,3.5,8.5,11.0,16.6,19.7,15.6,10.4,3.7,-1.0,6.3
1941,-6.5,-5.8,-2.6,3.6,8.1,11.4,12.7,16.5,16.0,10.0,4.0,-2.9,5.4
1942,-7.8,-8.2,-0.8,3.5,7.1,12.0,17.4,18.4,15.7,10.5,2.5,-2.9,5.6
1943,-4.1,-6.1,-1.1,3.5,6.9,12.9,19.3,21.5,17.5,11.7,1.2,-3.6,6.6
1944,-7.7,-7.9,-2.2,1.7,8.9,13.7,19.0,21.3,16.6,10.8,1.3,-6.0,5.8
1945,-7.8,-6.9,-1.8,3.9,5.5,11.0,13.6,18.7,16.8,11.0,3.9,-4.8,5.3
1946,-6.5,-6.0,-3.3,4.5,7.6,14.9,18.2,22.2,16.9,11.5,4.4,-2.5,6.8
1947,-4.9,-5.5,-2.3,3.7,9.0,11.2,17.1,19.3,15.1,10.6,2.4,-4.6,5.9
1948,-2.7,-4.4,-0.2,6.0,10.7,12.2,16.2,22.0,16.9,11.1,4.2,-0.6,7.6
1949,-2.6,-2.8,-3.4,2.0,9.4,11.8,16.9,20.8,17.8,10.8,3.1,-3.8,6.7
1950,-5.7,-4.8,-1.3,4.0,9.2,14.6,19.3,22.6,16.8,9.0,3.0,-2.9,7.0
1951,-6.7,-6.5,-2.2,3.7,9.5,12.3,16.7,22.3,15.6,10.1,3.7,-0.3,6.5
1952,-5.7,-7.1,-2.4,3.8,8.3,13.1,16.4,19.7,17.0,11.3,0.9,-7.1,5.7
1953,-7.7,-7.3,-0.9,3.6,6.9,11.1,16.8,19.2,17.6,11.2,-0.6,-2.6,5.6
1954,-6.7,-4.1,-2.5,4.0,7.5,11.0,13.7,17.0,17.2,9.5,3.2,-1.8,5.7
1955,-6.4,-4.8,-1.3,4.7,7.0,12.7,20.3,19.5,15.5,10.6,3.6,-0.4,6.8
1956,-6.1,-4.6,-2.0,5.1,10.8,11.2,13.8,16.3,17.2,12.3,2.8,-2.6,6.2
1957,-3.9,-5.5,-2.9,4.4,9.3,10.9,17.1,18.2,15.5,11.1,5.4,-1.1,6.5
1958,-4.9,-4.9,-2.3,4.4,8.5,12.6,17.5,18.3,16.8,10.6,4.5,-0.5,6.7
1959,-7.3,-2.8,0.8,6.4,9.4,12.7,17.1,18.5,16.2,11.6,2.9,-3.9,6.8
1960,-7.2,-5.2,-1.4,3.5,7.7,10.8,15.9,20.8,18.1,9.7,3.3,-3.9,6.0
1961,-7.7,-5.3,-1.4,5.5,8.7,14.7,19.5,20.0,18.9,10.4,4.1,-1.3,7.2
1962,-4.2,-5.4,-2.5,6.7,10.0,12.9,16.8,17.7,16.6,9.9,2.6,-1.5,6.6
1963,-3.6,-3.7,0.1,5.0,10.4,12.4,16.8,17.1,15.6,10.7,4.3,-1.7,7.0
1964,-4.5,-7.7,-1.3,3.7,9.9,11.9,15.3,17.7,14.9,10.0,3.6,-1.9,6.0
1965,-4.1,-5.7,-2.8,3.2,9.1,13.3,15.2,18.8,15.8,11.4,2.1,-2.6,6.1
1966,-5.0,-5.5,-1.0,3.2,8.1,12.2,15.3,17.5,15.4,11.6,4.1,-4.4,6.0
1967,-6.8,-5.9,-0.7,4.5,10.0,11.4,16.4,20.5,15.5,11.0,1.8,-1.5,6.4
1968,-4.2,-4.7,1.9,5.7,8.9,14.5,17.3,18.1,15.9,9.1,5.3,-0.7,7.3
1969,-7.3,-7.5,-2.5,3.9,7.2,10.6,17.0,16.5,16.1,9.4,2.2,-5.4,5.0
1970,-6.6,-6.0,-4.2,4.6,10.4,12.9,17.4,19.2,16.8,10.5,4.3,-3.3,6.3
1971,-6.3,-6.4,-1.7,4.1,7.6,11.6,15.8,17.2,15.2,11.5,3.4,-2.2,5.8
1972,-5.3,-5.0,-0.6,5.9,9.4,12.8,16.8,20.4,15.7,10.9,1.9,-1.4,6.8
1973,-4.2,-5.3,-2.9,4.2,8.4,12.8,17.0,20.9,17.1,10.4,3.5,-1.9,6.7
1974,-2.6,-4.6,-2.1,4.0,8.4,11.8,16.8,18.8,16.5,10.1,1.9,-5.7,6.1
1975,-4.1,-6.1,-1.5,4.3,8.4,13.7,16.1,20.6,17.3,10.4,3.8,-3.8,6.6
1976,-4.6,-3.5,-1.4,4.0,8.9,11.9,17.5,17.6,15.7,10.2,1.3,-2.0,6.3
1977,-8.3,-7.1,-1.0,3.6,8.0,11.9,18.2,19.1,17.4,11.4,4.5,-1.8,6.3
1978,-6.7,-9.2,-1.6,4.3,9.2,13.5,20.6,21.3,17.4,9.6,3.4,-2.1,6.6
1979,-6.9,-4.5,-2.5,2.7,7.8,13.2,15.8,20.3,16.9,11.3,2.9,-0.1,6.4
1980,-5.4,-7.1,-1.9,1.9,7.8,12.9,15.9,16.5,16.0,10.0,4.3,-0.6,5.9
1981,-5.4,-6.3,-2.6,5.6,8.1,11.8,17.1,18.7,16.0,10.5,0.8,-0.6,6.1
1982,-5.6,-5.3,-0.6,3.7,9.0,11.9,16.9,21.0,17.5,11.4,4.3,-1.0,6.9
1983,-4.2,-7.6,-1.9,6.8,8.2,8.5,14.5,18.9,15.8,8.9,4.8,-2.1,5.9
1984,-4.9,-6.6,-3.3,2.9,7.9,15.5,19.5,20.5,16.6,9.2,2.3,-3.6,6.3
1985,-8.7,-4.8,-1.4,4.9,8.6,11.7,16.6,21.1,15.7,10.3,2.7,-4.2,6.0
1986,-7.2,-6.5,-2.4,4.6,8.4,11.2,14.4,19.6,16.8,9.1,2.1,-1.9,5.7
1987,-6.4,-5.6,-1.4,4.2,8.6,12.6,17.5,18.0,16.4,11.1,2.0,-3.1,6.2
1988,-4.8,-6.3,-1.8,4.1,8.0,12.6,14.1,20.4,16.1,10.4,2.0,-1.5,6.1
1989,-2.6,-2.4,0.8,4.0,8.2,10.7,18.4,20.4,16.8,10.8,4.8,-1.3,7.4
1990,-5.7,-2.4,1.4,5.7,9.3,13.4,18.9,20.3,17.1,13.3,6.2,1.2,8.2
1991,-1.6,-3.6,-1.5,4.8,10.1,14.3,16.2,19.0,16.6,11.8,3.5,-2.3,7.3
1992,-3.6,-3.6,-0.4,3.7,8.1,12.1,17.6,18.0,14.9,11.1,3.2,-1.2,6.7
1993,-2.7,-3.3,-0.2,3.1,8.6,10.7,15.6,17.6,16.3,11.1,3.7,-1.6,6.6
1994,-6.1,-2.7,-1.3,4.4,10.0,12.8,17.4,21.7,17.5,11.8,4.3,-2.9,7.2
1995,-4.0,-4.0,-0.8,4.8,11.0,12.7,18.4,19.3,16.3,12.3,5.2,-0.6,7.6
1996,-4.6,-4.5,-1.0,3.5,6.9,12.0,15.9,18.7,16.8,10.4,2.3,-2.4,6.2
1997,-3.0,-3.3,-1.5,4.3,7.3,11.7,17.4,17.2,16.1,10.3,6.4,-0.7,6.9
1998,-6.9,-5.1,0.3,5.3,10.1,12.9,15.5,18.1,17.2,12.5,2.0,-2.4,6.6
1999,-4.1,-5.6,-2.6,4.2,8.4,14.5,16.6,21.0,18.3,11.2,3.8,-1.9,7.0
2000,-4.2,-5.6,-2.1,3.5,9.3,12.8,18.9,21.5,17.7,10.6,1.5,-4.1,6.7
2001,-6.3,-7.7,-2.4,4.7,8.5,13.0,17.4,18.7,15.6,10.8,4.0,-4.2,6.0
2002,-3.6,-1.0,0.5,6.8,11.1,12.1,15.7,17.1,17.0,10.8,2.3,-4.4,7.0
2003,-4.7,-5.6,-0.7,5.3,10.1,13.9,14.3,18.4,16.6,11.3,4.5,-1.4,6.8
2004,-3.9,-3.0,-0.5,4.4,10.6,14.6,16.8,19.7,17.8,11.8,5.9,-2.0,7.7
2005,-4.6,-5.7,-1.0,3.9,7.0,14.3,16.7,21.0,17.9,12.6,4.9,-2.3,7.1
2006,-5.5,-4.7,-0.9,2.1,9.3,11.9,18.4,21.6,17.7,11.0,4.5,-1.8,7.0
2007,-3.7,-3.2,-0.7,3.5,7.6,14.3,16.7,20.4,17.0,10.9,3.0,-1.7,7.0
2008,-6.0,-4.8,0.6,6.0,8.3,11.9,17.9,18.8,17.9,11.5,3.8,-0.4,7.1
2009,-2.4,-4.4,0.0,4.5,10.0,12.3,14.8,18.6,16.9,11.4,3.1,-2.2,6.9
2010,-3.4,-4.9,-1.4,3.5,7.3,15.0,18.1,22.4,18.4,11.4,4.8,-1.1,7.5
2011,-5.1,-2.2,-0.6,4.4,6.5,12.8,17.5 ),21.5,18.3,12.1,4.9,-2.3,7.3
2012,-5.4,-6.4,-2.4,4.6,8.9,12.6,17.2,20.4,19.4,11.8,3.8,-3.0,6.8
2013,-5.8,-5.1,-1.3,4.5,7.2,14.0,18.9,20.2,17.6,11.8,5.5,-0.2,7.3
2014,-5.3,-4.2,-1.2,3.9,8.7,13.9,19.2,20.0,16.7,11.0,4.8,-2.3,7.1
2015,-2.9,-1.7,2.3,5.9,9.9,12.1,17.6,19.0,17.3,10.4,3.7,-0.2,7.8
2016,-5.2,-4.7,0.5,4.3,11.4,12.5,17.4,21.8 ], , , , ,5.2 ]
If you want the caption, use table.select_one("caption.m").text:
with open("out.csv", "w") as f:
    wr = csv.writer(f)
    wr.writerow([table.select_one("caption.m").text.encode("utf-8")])
    wr.writerow(headers)
    wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")]
                  for row in table.select("tr + tr")])
But it might be a better idea to use the caption as the name of the file rather than adding it to the csv.
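For instance, a sketch of that idea (hypothetical; a real script would probably sanitise the caption more carefully before using it as a filename):
# use the table caption as the csv filename instead of a row in the file
caption = table.select_one("caption.m").text.encode("utf-8")
filename = caption.replace("/", "_").replace(":", "_") + ".csv"
with open(filename, "w") as f:
    wr = csv.writer(f)
    wr.writerow(headers)
    wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")]
                  for row in table.select("tr + tr")])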
If you really want to do it without the csv module, use the same logic with str.join:
table = soup.select_one("table.data2_s")
headers = [th.text.encode("utf-8") for th in table.select("tr th")]

with open("out.csv", "w") as f:
    f.write(",".join(headers) + "\n")
    f.writelines(",".join([td.text.encode("utf-8") for td in row.find_all("td")]) + "\n"
                 for row in table.select("tr + tr"))
If you want to replace the empty cells with N/A:
with open("out.csv", "w") as f:
    f.write(",".join(headers) + "\n")
    f.writelines(",".join([td.text.encode("utf-8").strip('\xe3\x80\x80') or "N/A" for td in row.find_all("td")]) + "\n"
                 for row in table.select("tr + tr"))
Which will change the last row to:
2016,-5.2,-4.7,0.5,4.3,11.4,12.5,17.4,21.8 ],N/A,N/A,N/A,N/A,5.2 ]
The spaces for missing values are unicode ideographic space characters (u"\u3000" in python), which when encoded to utf-8 become '\xe3\x80\x80'. We strip that, and if stripping leaves an empty string we just use "N/A":
In [7]: print u"\u3000"
 
In [8]: u"\u3000".encode("utf-8")
Out[8]: '\xe3\x80\x80'
In [9]: u"\u3000".encode("utf-8").strip('\xe3\x80\x80')
Out[9]: ''
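As an aside, in Python 3 (a sketch, assuming the same page structure) the encode step disappears and the ideographic space can be stripped directly:
# Python 3 version of the same idea: text is already unicode
headers = [th.text for th in table.select("tr th")]
with open("out.csv", "w", newline="") as f:
    wr = csv.writer(f)
    wr.writerow(headers)
    wr.writerows([[td.text.strip("\u3000") or "N/A" for td in row.find_all("td")]
                  for row in table.select("tr + tr")])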
Use the csv module from Python to do this. You can obviously write more columns if you want, but the idea is that you're writing a list to the csv file. There are other options that you can specify in the writer() method if you'd like to quote things, escape things, etc.
import csv

with open('your_csv_name.csv', 'w') as o:
    w = csv.writer(o)
    # Headers
    w.writerow(['tr_content'])
    # Write the tr text
    for r in rows:
        w.writerow([r])
Here is another way, without using the csv module:
fp = open('data.csv', 'w')
for row in rows[:-1]:  # last row dropped: its empty cells give an error (could also be handled if needed)
    fp.write(row.get_text(',') + '\n')
fp.close()
You can open data.csv directly.
Station details can be obtained with the commands below:
>>> table = soup.find_all('table', class_='data2_s')
>>> print table[0].find_all('caption')[0].get_text().encode('ascii', 'ignore')
WAKKANAI WMO Station ID:47401 Lat 45o24.9'N Lon 141o40.7'E
Hope this helps.
import csv
from bs4 import BeautifulSoup
import pandas as pd

html = open('test.html').read()
soup = BeautifulSoup(html, features='lxml')

#Specify table name which you want to read.
#Example: <table class="queryResults" border="0" cellspacing="1">
table = soup.select_one('table.queryResults')

def get_all_tables(soup):
    return soup.find_all("table")

tbls = get_all_tables(soup)
for i, tablen in enumerate(tbls, start=1):
    print(i)
    print(tablen)

def get_table_headers(table):
    headers = []
    for th in table.find("tr").find_all("th"):
        headers.append(th.text.strip())
    return headers

head = get_table_headers(table)
#print(head)

def get_table_rows(table):
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = []
        # grab all td tags in this table row
        tds = tr.find_all("td")
        if len(tds) == 0:
            # if no td tags, search for th tags
            # can be found especially in wikipedia tables below the table
            ths = tr.find_all("th")
            for th in ths:
                cells.append(th.text.strip())
        else:
            # use regular td tags
            for td in tds:
                cells.append(td.text.strip())
        rows.append(cells)
    return rows

table_rows = get_table_rows(table)
#print(table_rows)

def save_as_csv(table_name, headers, rows):
    pd.DataFrame(rows, columns=headers).to_csv(f"{table_name}.csv")

save_as_csv("Test_table", head, table_rows)

Data is being written three times instead of once using requests and openpyxl?

Here is the code I am using:
from Sid.drivercommand import *
from Stocks.lot_size import symbol_list
from bs4 import BeautifulSoup
import datetime
from openpyxl import Workbook

p1_url = 'https://www.nseindia.com/marketinfo/companyTracker/mtOptionKeys.jsp?companySymbol='
p3_url = '&indexSymbol=NIFTY&series=EQ&instrument=OPTSTK&date=-'
symbol_list = ['ACC', 'SBIN', 'PNB']
loc = r"C:\Users\OneDrive\Stock Study"
loc_opt_data = r"C:\Users\OneDrive\Stock Study\Opt Data"
os.chdir(loc_opt_data)
date = 'Test_Data_' + str(datetime.date.today()) + ".xlsx"
datewise_data = Workbook(date)
os.chdir(loc)

for symbol in symbol_list[0:1]:
    url = p1_url + symbol + p3_url
    raw_page = requests.get(url).text
    soup = BeautifulSoup(raw_page, "lxml")
    if len(raw_page) > 0:
        datewise_data_sheet = datewise_data.create_sheet(symbol)
        for table in soup.findAll('table'):
            '# Fields: ' + ','.join([tr.text for tr in table.findAll('th')])
            for row in table.findAll('tr'):
                # ws1.append(([tr.text for tr in row.findAll('td')]))
                datewise_data_sheet.append(([tr.text for tr in row.findAll('td')]))
                # test.append(([tr.text for tr in row.findAll('td')]))
        #print(symbol)
    raw_page = 0
datewise_data.save(date)
The file generated from this has the data, but each symbol has three sets of data while I only want the first set/table.
You have tables within tables on the web page, so your code as written finds the tr elements three times: once in the outer table, again because the same tr elements belong to the inner table, and row.findAll('td') picks them up a third time since the outer table's td contains the inner table.
You just need to find the data once, so use:
table = soup.findAll('table')[1]
for row in table.findAll('tr'):
Instead of:
for table in soup.findAll('table'):
    for row in table.findAll('tr'):
This will go straight to the table within the table and give you one set of results.
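Slotted into the posted script, the fix might look like this (a sketch that keeps the rest of the code unchanged and assumes the inner table is always the second one found):
for symbol in symbol_list[0:1]:
    url = p1_url + symbol + p3_url
    raw_page = requests.get(url).text
    soup = BeautifulSoup(raw_page, "lxml")
    if len(raw_page) > 0:
        datewise_data_sheet = datewise_data.create_sheet(symbol)
        # take only the inner (second) table instead of iterating over every table
        table = soup.findAll('table')[1]
        for row in table.findAll('tr'):
            datewise_data_sheet.append([td.text for td in row.findAll('td')])
datewise_data.save(date)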

Web table scraping: how do I find the column number of a cell in excel using python

I have an Excel file with many Chinese names in the first row.
What I am doing is scraping some more Chinese names from a web table; the names are all in the 2nd column of each row (tr). I want to see if a scraped name is already in my Excel file, so I use a boolean have to keep track; it should return True if the name is found. I also want to know the exact position (column number) of the found name, so I use name_position to track that.
from lxml import html
from bs4 import BeautifulSoup
import requests
import openpyxl
from openpyxl.workbook import Workbook

wb = openpyxl.load_workbook('hehe.xlsx')
ws1 = wb.get_sheet_by_name('Taocan')

page = requests.get(url)
tree = html.fromstring(page.text)
web = page.text
soup = BeautifulSoup(web, 'lxml')
table = soup.find('table', {'class': "tc_table"})
trs = table.find_all('tr')

for tr in trs:
    ls = []
    for td in tr.find_all('td'):
        ls.append(td.text)
    ls = [x.encode('utf-8') for x in ls]
    try:
        name = ls[1]
        have = False
        name_position = 1
        for cell in ws1[1]:
            if name == cell:
                have = True
                break
            else:
                name_position += 1
    except IndexError:
        print("there is an index error")
However, my code doesn't seem to work, and I think the problem is the comparison of the names:
if name == cell
I changed it to:
if name == cell.value
but it still doesn't work.
Can anyone help me with this? Thanks!
Just to add on: the web page I'm scraping is also in Chinese. So when I
print(ls)
it gives a list like this:
['1', '\xe4\xb8\x80\xe8\x88\xac\xe6\xa3\x80\xe6\x9f\xa5', '\xe8\xba\xab\xe9\xab\x98\xe3\x80\x81\xe4\xbd\x93\xe9\x87\x8d\xe3\x80\x81\xe4\xbd\x93\xe9\x87\x8d\xe6\x8c\x87\xe6\x95\xb0\xe3\x80\x81\xe8\x85\xb0\xe5\x9b\xb4\xe3\x80\x81\xe8\x88\x92\xe5\xbc\xa0\xe5\x8e\x8b\xe3\x80\x81\xe6\x94\xb6\xe7\xbc\xa9\xe5\x8e\x8b\xe3\x80\x81\xe8\xa1\x80\xe5\x8e\x8b\xe6\x8c\x87\xe6\x95\xb0', '\xe9\x80\x9a\xe8\xbf\x87\xe4\xbb\xaa\xe5\x99\xa8\xe6\xb5\x8b\xe9\x87\x8f\xe4\xba\xba\xe4\xbd\x93\xe8\xba\xab\xe9\xab\x98\xe3\x80\x81\xe4\xbd\x93\xe9\x87\x8d\xe3\x80\x81\xe4\xbd\x93\xe8\x84\x82\xe8\x82\xaa\xe7\x8e\x87\xe5\x8f\x8a\xe8\xa1\x80\xe5\x8e\x8b\xef\xbc\x8c\xe7\xa7\x91\xe5\xad\xa6\xe5\x88\xa4\xe6\x96\xad\xe4\xbd\x93\xe9\x87\x8d\xe6\x98\xaf\xe5\x90\xa6\xe6\xa0\x87\xe5\x87\x86\xe3\x80\x81\xe8\xa1\x80\xe5\x8e\x8b\xe6\x98\xaf\xe5\x90\xa6\xe6\xad\xa3\xe5\xb8\xb8\xe3\x80\x81\xe4\xbd\x93\xe8\x84\x82\xe8\x82\xaa\xe6\x98\xaf\xe5\x90\xa6\xe8\xb6\x85\xe6\xa0\x87\xe3\x80\x82']
but if I
print(ls[1])
it gives a Chinese name like "广州"
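One likely culprit (an assumption, not confirmed in the thread): after ls = [x.encode('utf-8') for x in ls] the names are utf-8 byte strings, while openpyxl's cell.value is unicode, and in Python 2 comparing non-ASCII bytes to unicode always evaluates False. A sketch that compares unicode to unicode instead:
# hypothetical fix: skip the .encode('utf-8') step so name stays unicode
name = ls[1]
have = False
for name_position, cell in enumerate(ws1[1], start=1):
    if cell.value == name:
        have = True
        break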

How to save the string, one word per column in Python?

I'm scraping the names of massage therapists along with their addresses from a directory. The addresses are all being saved into the CSV in one column for the whole string, but the title/name of each therapist is being saved one word per column over 2 or 3 columns.
What do I need to do to get the extracted string saved in one column, like the addresses? (The first snippet is example HTML from the page; the second is the extract from the script targeting this element.)
<span class="name">
<img src="/images/famt-placeholder-sm.jpg" class="thumb" alt="Tiffani D Abraham"> Tiffani D Abraham</span>
import mechanize
from lxml import html
import csv
import io
from time import sleep

def save_products(products, writer):
    for product in products:
        for price in product['prices']:
            writer.writerow([product["title"].encode('utf-8')])
            writer.writerow([price["contact"].encode('utf-8')])
            writer.writerow([price["services"].encode('utf-8')])

f_out = open('mtResult.csv', 'wb')
writer = csv.writer(f_out)

links = ["https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY","https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=2&PageSize=10","https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=3&PageSize=10","https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=4&PageSize=10","https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=5&PageSize=10","https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=6&PageSize=10","https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=7&PageSize=10", "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=8&PageSize=10", "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=9&PageSize=10", "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=10&PageSize=10" ]

br = mechanize.Browser()
for link in links:
    print(link)
    r = br.open(link)
    content = r.read()
    products = []
    tree = html.fromstring(content)
    product_nodes = tree.xpath('//ul[@class="famt-results"]/li')
    for product_node in product_nodes:
        product = {}
        price_nodes = product_node.xpath('.//a')
        product['prices'] = []
        for price_node in price_nodes:
            price = {}
            try:
                product['title'] = product_node.xpath('.//span[1]/text()')[0]
            except:
                product['title'] = ""
            try:
                price['services'] = price_node.xpath('./span[2]/text()')[0]
            except:
                price['services'] = ""
            try:
                price['contact'] = price_node.xpath('./span[3]/text()')[0]
            except:
                price['contact'] = ""
            product['prices'].append(price)
        products.append(product)
    save_products(products, writer)
f_out.close()
I'm not positive this solves the issue you were having, but either way there are a few improvements and modifications you might be interested in.
For example, since each link varies only by a page index, you can loop through the links easily rather than copying all 50 into a list. Each therapist per page also has their own index, so you can also loop through the XPaths for each therapist's information.
#import modules
import mechanize
from lxml import html
import csv
import io

#open browser
br = mechanize.Browser()

#create file headers
titles = ["NAME"]
services = ["TECHNIQUE(S)"]
contacts = ["CONTACT INFO"]

#loop through all 50 webpages for therapist data
for link_index in range(1, 51):  # range's end is exclusive, so 51 covers pages 1-50
    link = "https://www.amtamassage.org/findamassage/results.html?match=exact&l=NY&PageIndex=" + str(link_index) + "&PageSize=10"
    r = br.open(link)
    page = r.read()
    tree = html.fromstring(page)
    #loop through therapist data for each therapist per page
    for therapist_index in range(1, 11):  # 10 therapists per page (PageSize=10)
        #store names
        title = tree.xpath('//*[@id="content"]/div[2]/ul[1]/li[' + str(therapist_index) + ']/a/span[1]/text()')
        titles.append(" ".join(title))
        #store techniques and convert to unicode
        service = tree.xpath('//*[@id="content"]/div[2]/ul[1]/li[' + str(therapist_index) + ']/a/span[2]/text()')
        try:
            services.append(service[0].encode("utf-8"))
        except:
            services.append(" ")
        #store contact info and convert to unicode
        contact = tree.xpath('//*[@id="content"]/div[2]/ul[1]/li[' + str(therapist_index) + ']/a/span[3]/text()')
        try:
            contacts.append(contact[0].encode("utf-8"))
        except:
            contacts.append(" ")

#open file to write to
f_out = open('mtResult.csv', 'wb')
writer = csv.writer(f_out)

#get rows in correct format
rows = zip(titles, services, contacts)

#write csv line by line
for row in rows:
    writer.writerow(row)
f_out.close()
The script loops through all 50 result pages and seems to scrape all the relevant information for each therapist where provided. Finally, it writes the data to a csv with everything stored under the respective columns 'Name', 'Technique(s)', and 'Contact Info', if this is what you were originally struggling with.
Hope this helps!

Html file writing from a for loop's output

I have written a Python web scraper like this:
from selenium import webdriver
from BeautifulSoup import BeautifulSoup

wd = webdriver.Firefox()
wd.get('http://www.nseindia.com/live_market/dynaContent/live_analysis/top_gainers_losers.htm?cat=G&utm_campaign=website&utm_source=sendgrid.com&utm_medium=email')
html_page = wd.page_source
wd.quit()

soup = BeautifulSoup(html_page)
table = soup.find("table", attrs={"id": "topGainers"})
print "success"
#print table

for row in table.findAll('tr')[1:]:
    cols = row.findAll('td')
    #print cols
    #break
    some = [cols[0], cols[5], cols[6], cols[9]]
    #print some
    #break
    for td in some:
        if td.find(text=True):
            text = ''.join(td.find(text=True))
            print text + "|"
        else:
            continue
Now I want my output (text) to be in an HTML file in a table format. How could I do that?
#Okay first if you want the table to have HEADERS above each column you should save the heading names in a list like so...
listofheaders = ['header1', 'header2', 'header3']
#for each row in the table save the data included in the row in a list of lists something like this:
listofrows = [['a', 'b', 'c'], ['a', 'b', 'c'], ['a', 'b', 'c']]
#now create a string with the following:
htmlstuff = '<!DOCTYPE html>\n<html>\n<head>\n<style>\ntable,th,td\n{\nborder:1px solid black;\nborder-collapse:collapse;\n}\nth,td\n{\npadding:5px;\n}\n</style>\n</head>\n\n<body>\n<table style="width:300px">\n<tr>\n '
#now you would add the COLUMN HEADERS to the list...
for header in listofheaders:
    htmlstuff = htmlstuff + '<th>' + str(header) + '</th>\n'
#then you can populate the table row by row...
for row in listofrows:
    htmlstuff += ' <tr>\n'
    for item in row:
        htmlstuff = htmlstuff + '    <td>' + str(item) + '</td>\n'
    htmlstuff += ' </tr>\n'
#finish off the html coding...
htmlstuff += '</table>\n</body>\n\n</html>'
#now create the html page and write the data...
f = open('webpage.html', 'w')
f.write(htmlstuff)
f.close()
You can even use webbrowser to automatically open the page for you.
import webbrowser
webbrowser.open('webpage.html')
