Data I'm trying to scrape cut short - python
I'm trying to use Python to scrape the play-by-play table from this basketball-reference example into a CSV file.
When I run this code, the table is cut short and many cells are missing. I'm a programming n00b and any help would be appreciated.
from bs4 import BeautifulSoup
from urllib2 import urlopen
import csv

bref = "http://www.basketball-reference.com"

print "Enter game code:"
game = raw_input("> ")

def make_soup(url):
    return BeautifulSoup(urlopen(url), "lxml")

def get_pbp(pbp):
    soup = make_soup(bref + "/boxscores/pbp/" + game + ".html")
    table = soup.find("table", "no_highlight stats_table")
    rows = [row.find_all("td") for row in table.find_all("tr")]
    data = []
    for row in rows:
        values = []
        for value in row:
            if value.string is None:
                values.append(u"")
            else:
                values.append(value.string.replace(u"\xa0", u""))
        data.append(values)
    return data

if __name__ == '__main__':
    print "Writing data for game " + game
    with open(game + '.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerows(get_pbp(game))
    print game + " has been successfully scraped."
You need to skip empty cells:

table = soup.find("table", class_="no_highlight stats_table")
rows = [[cell.text.replace(u"\xa0", u"").strip() for cell in row.find_all("td") if cell.text.strip()]
        for row in table.find_all("tr")[2:]]

with open(game + '.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(rows)
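For anyone on Python 3, here is a rough equivalent of the fixed scraper — a sketch, untested against the live site; the URL pattern and table class come from the question, and basketball-reference's markup may have changed since:

# Python 3 sketch of the same scraper; urllib2 was split into urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

BREF = "http://www.basketball-reference.com"
game = input("> ")

soup = BeautifulSoup(urlopen(BREF + "/boxscores/pbp/" + game + ".html"), "lxml")
table = soup.find("table", class_="no_highlight stats_table")  # class taken from the question

rows = [[cell.get_text(strip=True).replace(u"\xa0", "")
         for cell in row.find_all("td") if cell.get_text(strip=True)]
        for row in table.find_all("tr")[2:]]

# newline="" stops the csv module from writing blank lines between rows on Windows
with open(game + ".csv", "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerows(rows)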
Related
How can I export a table from a webpage to csv? [duplicate]
I want to convert an HTML table as obtained from the script below into a CSV file, but I get a type error as follows: TypeError: sequence item 0: expected string, Tag found

from bs4 import BeautifulSoup
import urllib2

url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1'
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html)
table = soup.find_all('table', class_='data2_s')
rows = table[0].find_all('tr')

What is the easiest way to convert it into a CSV file? I tried:

fo = open('fo.txt','w')
for r in rows:
    fo.write(str(r.txt) + '\n')
fo.close()

but it wrote 'none'. The HTML is like this:

<table class="data2_s"><caption class="m">WAKKANAI   WMO Station ID:47401 Lat 45<sup>o</sup>24.9'N  Lon 141<sup>o</sup>40.7'E</caption>
<tr><th scope="col">Year</th><th scope="col">Jan</th><th scope="col">Feb</th><th scope="col">Mar</th><th scope="col">Apr</th><th scope="col">May</th><th scope="col">Jun</th><th scope="col">Jul</th><th scope="col">Aug</th><th scope="col">Sep</th><th scope="col">Oct</th><th scope="col">Nov</th><th scope="col">Dec</th><th scope="col">Annual</th></tr>
<tr class="mtx" style="text-align:right;"><td style="text-align:center">1938</td><td class="data_0_0_0_0">-5.2</td><td class="data_0_0_0_0">-4.9</td><td class="data_0_0_0_0">-0.6</td><td class="data_0_0_0_0">4.7</td><td class="data_0_0_0_0">9.5</td><td class="data_0_0_0_0">11.6</td><td class="data_0_0_0_0">17.9</td><td class="data_0_0_0_0">22.2</td><td class="data_0_0_0_0">16.5</td><td class="data_0_0_0_0">10.7</td><td class="data_0_0_0_0">3.3</td><td class="data_0_0_0_0">-4.7</td><td class="data_0_0_0_0">6.8</td></tr>
<tr class="mtx" style="text-align:right;"><td style="text-align:center">1939</td><td class="data_0_0_0_0">-7.5</td><td class="data_0_0_0_0">-6.6</td><td class="data_0_0_0_0">-1.4</td><td class="data_0_0_0_0">4.0</td><td class="data_0_0_0_0">7.5</td><td class="data_0_0_0_0">13.0</td><td class="data_0_0_0_0">17.4</td><td class="data_0_0_0_0">20.0</td><td class="data_0_0_0_0">17.4</td><td class="data_0_0_0_0">9.7</td><td class="data_0_0_0_0">3.0</td><td class="data_0_0_0_0">-2.5</td><td class="data_0_0_0_0">6.2</td></tr>
This is a job for the csv lib, getting each td inside each row and extracting the text; it will handle rows where there are missing values:

from bs4 import BeautifulSoup
import urllib2
import csv

url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1'
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html)

table = soup.select_one("table.data2_s")
# python3 just use th.text
headers = [th.text.encode("utf-8") for th in table.select("tr th")]

with open("out.csv", "w") as f:
    wr = csv.writer(f)
    wr.writerow(headers)
    wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")]
                  for row in table.select("tr + tr")])

Which matches the table exactly as you see it on the page:

$ cat out.csv
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
1938,-5.2,-4.9,-0.6,4.7,9.5,11.6,17.9,22.2,16.5,10.7,3.3,-4.7,6.8
1939,-7.5,-6.6,-1.4,4.0,7.5,13.0,17.4,20.0,17.4,9.7,3.0,-2.5,6.2
1940,-6.0,-5.7,-0.5,3.5,8.5,11.0,16.6,19.7,15.6,10.4,3.7,-1.0,6.3
1941,-6.5,-5.8,-2.6,3.6,8.1,11.4,12.7,16.5,16.0,10.0,4.0,-2.9,5.4
1942,-7.8,-8.2,-0.8,3.5,7.1,12.0,17.4,18.4,15.7,10.5,2.5,-2.9,5.6
1943,-4.1,-6.1,-1.1,3.5,6.9,12.9,19.3,21.5,17.5,11.7,1.2,-3.6,6.6
1944,-7.7,-7.9,-2.2,1.7,8.9,13.7,19.0,21.3,16.6,10.8,1.3,-6.0,5.8
1945,-7.8,-6.9,-1.8,3.9,5.5,11.0,13.6,18.7,16.8,11.0,3.9,-4.8,5.3
1946,-6.5,-6.0,-3.3,4.5,7.6,14.9,18.2,22.2,16.9,11.5,4.4,-2.5,6.8
1947,-4.9,-5.5,-2.3,3.7,9.0,11.2,17.1,19.3,15.1,10.6,2.4,-4.6,5.9
1948,-2.7,-4.4,-0.2,6.0,10.7,12.2,16.2,22.0,16.9,11.1,4.2,-0.6,7.6
1949,-2.6,-2.8,-3.4,2.0,9.4,11.8,16.9,20.8,17.8,10.8,3.1,-3.8,6.7
1950,-5.7,-4.8,-1.3,4.0,9.2,14.6,19.3,22.6,16.8,9.0,3.0,-2.9,7.0
1951,-6.7,-6.5,-2.2,3.7,9.5,12.3,16.7,22.3,15.6,10.1,3.7,-0.3,6.5
1952,-5.7,-7.1,-2.4,3.8,8.3,13.1,16.4,19.7,17.0,11.3,0.9,-7.1,5.7
1953,-7.7,-7.3,-0.9,3.6,6.9,11.1,16.8,19.2,17.6,11.2,-0.6,-2.6,5.6
1954,-6.7,-4.1,-2.5,4.0,7.5,11.0,13.7,17.0,17.2,9.5,3.2,-1.8,5.7
1955,-6.4,-4.8,-1.3,4.7,7.0,12.7,20.3,19.5,15.5,10.6,3.6,-0.4,6.8
1956,-6.1,-4.6,-2.0,5.1,10.8,11.2,13.8,16.3,17.2,12.3,2.8,-2.6,6.2
1957,-3.9,-5.5,-2.9,4.4,9.3,10.9,17.1,18.2,15.5,11.1,5.4,-1.1,6.5
1958,-4.9,-4.9,-2.3,4.4,8.5,12.6,17.5,18.3,16.8,10.6,4.5,-0.5,6.7
1959,-7.3,-2.8,0.8,6.4,9.4,12.7,17.1,18.5,16.2,11.6,2.9,-3.9,6.8
1960,-7.2,-5.2,-1.4,3.5,7.7,10.8,15.9,20.8,18.1,9.7,3.3,-3.9,6.0
1961,-7.7,-5.3,-1.4,5.5,8.7,14.7,19.5,20.0,18.9,10.4,4.1,-1.3,7.2
1962,-4.2,-5.4,-2.5,6.7,10.0,12.9,16.8,17.7,16.6,9.9,2.6,-1.5,6.6
1963,-3.6,-3.7,0.1,5.0,10.4,12.4,16.8,17.1,15.6,10.7,4.3,-1.7,7.0
1964,-4.5,-7.7,-1.3,3.7,9.9,11.9,15.3,17.7,14.9,10.0,3.6,-1.9,6.0
1965,-4.1,-5.7,-2.8,3.2,9.1,13.3,15.2,18.8,15.8,11.4,2.1,-2.6,6.1
1966,-5.0,-5.5,-1.0,3.2,8.1,12.2,15.3,17.5,15.4,11.6,4.1,-4.4,6.0
1967,-6.8,-5.9,-0.7,4.5,10.0,11.4,16.4,20.5,15.5,11.0,1.8,-1.5,6.4
1968,-4.2,-4.7,1.9,5.7,8.9,14.5,17.3,18.1,15.9,9.1,5.3,-0.7,7.3
1969,-7.3,-7.5,-2.5,3.9,7.2,10.6,17.0,16.5,16.1,9.4,2.2,-5.4,5.0
1970,-6.6,-6.0,-4.2,4.6,10.4,12.9,17.4,19.2,16.8,10.5,4.3,-3.3,6.3
1971,-6.3,-6.4,-1.7,4.1,7.6,11.6,15.8,17.2,15.2,11.5,3.4,-2.2,5.8
1972,-5.3,-5.0,-0.6,5.9,9.4,12.8,16.8,20.4,15.7,10.9,1.9,-1.4,6.8
1973,-4.2,-5.3,-2.9,4.2,8.4,12.8,17.0,20.9,17.1,10.4,3.5,-1.9,6.7
1974,-2.6,-4.6,-2.1,4.0,8.4,11.8,16.8,18.8,16.5,10.1,1.9,-5.7,6.1
1975,-4.1,-6.1,-1.5,4.3,8.4,13.7,16.1,20.6,17.3,10.4,3.8,-3.8,6.6
1976,-4.6,-3.5,-1.4,4.0,8.9,11.9,17.5,17.6,15.7,10.2,1.3,-2.0,6.3
1977,-8.3,-7.1,-1.0,3.6,8.0,11.9,18.2,19.1,17.4,11.4,4.5,-1.8,6.3
1978,-6.7,-9.2,-1.6,4.3,9.2,13.5,20.6,21.3,17.4,9.6,3.4,-2.1,6.6
1979,-6.9,-4.5,-2.5,2.7,7.8,13.2,15.8,20.3,16.9,11.3,2.9,-0.1,6.4
1980,-5.4,-7.1,-1.9,1.9,7.8,12.9,15.9,16.5,16.0,10.0,4.3,-0.6,5.9
1981,-5.4,-6.3,-2.6,5.6,8.1,11.8,17.1,18.7,16.0,10.5,0.8,-0.6,6.1
1982,-5.6,-5.3,-0.6,3.7,9.0,11.9,16.9,21.0,17.5,11.4,4.3,-1.0,6.9
1983,-4.2,-7.6,-1.9,6.8,8.2,8.5,14.5,18.9,15.8,8.9,4.8,-2.1,5.9
1984,-4.9,-6.6,-3.3,2.9,7.9,15.5,19.5,20.5,16.6,9.2,2.3,-3.6,6.3
1985,-8.7,-4.8,-1.4,4.9,8.6,11.7,16.6,21.1,15.7,10.3,2.7,-4.2,6.0
1986,-7.2,-6.5,-2.4,4.6,8.4,11.2,14.4,19.6,16.8,9.1,2.1,-1.9,5.7
1987,-6.4,-5.6,-1.4,4.2,8.6,12.6,17.5,18.0,16.4,11.1,2.0,-3.1,6.2
1988,-4.8,-6.3,-1.8,4.1,8.0,12.6,14.1,20.4,16.1,10.4,2.0,-1.5,6.1
1989,-2.6,-2.4,0.8,4.0,8.2,10.7,18.4,20.4,16.8,10.8,4.8,-1.3,7.4
1990,-5.7,-2.4,1.4,5.7,9.3,13.4,18.9,20.3,17.1,13.3,6.2,1.2,8.2
1991,-1.6,-3.6,-1.5,4.8,10.1,14.3,16.2,19.0,16.6,11.8,3.5,-2.3,7.3
1992,-3.6,-3.6,-0.4,3.7,8.1,12.1,17.6,18.0,14.9,11.1,3.2,-1.2,6.7
1993,-2.7,-3.3,-0.2,3.1,8.6,10.7,15.6,17.6,16.3,11.1,3.7,-1.6,6.6
1994,-6.1,-2.7,-1.3,4.4,10.0,12.8,17.4,21.7,17.5,11.8,4.3,-2.9,7.2
1995,-4.0,-4.0,-0.8,4.8,11.0,12.7,18.4,19.3,16.3,12.3,5.2,-0.6,7.6
1996,-4.6,-4.5,-1.0,3.5,6.9,12.0,15.9,18.7,16.8,10.4,2.3,-2.4,6.2
1997,-3.0,-3.3,-1.5,4.3,7.3,11.7,17.4,17.2,16.1,10.3,6.4,-0.7,6.9
1998,-6.9,-5.1,0.3,5.3,10.1,12.9,15.5,18.1,17.2,12.5,2.0,-2.4,6.6
1999,-4.1,-5.6,-2.6,4.2,8.4,14.5,16.6,21.0,18.3,11.2,3.8,-1.9,7.0
2000,-4.2,-5.6,-2.1,3.5,9.3,12.8,18.9,21.5,17.7,10.6,1.5,-4.1,6.7
2001,-6.3,-7.7,-2.4,4.7,8.5,13.0,17.4,18.7,15.6,10.8,4.0,-4.2,6.0
2002,-3.6,-1.0,0.5,6.8,11.1,12.1,15.7,17.1,17.0,10.8,2.3,-4.4,7.0
2003,-4.7,-5.6,-0.7,5.3,10.1,13.9,14.3,18.4,16.6,11.3,4.5,-1.4,6.8
2004,-3.9,-3.0,-0.5,4.4,10.6,14.6,16.8,19.7,17.8,11.8,5.9,-2.0,7.7
2005,-4.6,-5.7,-1.0,3.9,7.0,14.3,16.7,21.0,17.9,12.6,4.9,-2.3,7.1
2006,-5.5,-4.7,-0.9,2.1,9.3,11.9,18.4,21.6,17.7,11.0,4.5,-1.8,7.0
2007,-3.7,-3.2,-0.7,3.5,7.6,14.3,16.7,20.4,17.0,10.9,3.0,-1.7,7.0
2008,-6.0,-4.8,0.6,6.0,8.3,11.9,17.9,18.8,17.9,11.5,3.8,-0.4,7.1
2009,-2.4,-4.4,0.0,4.5,10.0,12.3,14.8,18.6,16.9,11.4,3.1,-2.2,6.9
2010,-3.4,-4.9,-1.4,3.5,7.3,15.0,18.1,22.4,18.4,11.4,4.8,-1.1,7.5
2011,-5.1,-2.2,-0.6,4.4,6.5,12.8,17.5 ),21.5,18.3,12.1,4.9,-2.3,7.3
2012,-5.4,-6.4,-2.4,4.6,8.9,12.6,17.2,20.4,19.4,11.8,3.8,-3.0,6.8
2013,-5.8,-5.1,-1.3,4.5,7.2,14.0,18.9,20.2,17.6,11.8,5.5,-0.2,7.3
2014,-5.3,-4.2,-1.2,3.9,8.7,13.9,19.2,20.0,16.7,11.0,4.8,-2.3,7.1
2015,-2.9,-1.7,2.3,5.9,9.9,12.1,17.6,19.0,17.3,10.4,3.7,-0.2,7.8
2016,-5.2,-4.7,0.5,4.3,11.4,12.5,17.4,21.8 ], , , , ,5.2 ]

If you want the caption, use table.select_one("caption.m").text:

with open("out.csv", "w") as f:
    wr = csv.writer(f)
    wr.writerow([table.select_one("caption.m").text.encode("utf-8")])
    wr.writerow(headers)
    wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")]
                  for row in table.select("tr + tr")])

but it might be a better idea to use that as the name of the file as opposed to adding it to the csv.
If you really want to do it without the csv module, use the same logic with str.join:

table = soup.select_one("table.data2_s")
headers = [th.text.encode("utf-8") for th in table.select("tr th")]

with open("out.csv", "w") as f:
    f.write(",".join(headers) + "\n")
    f.writelines(",".join([td.text.encode("utf-8") for td in row.find_all("td")]) + "\n"
                 for row in table.select("tr + tr"))

If you want to replace the empty cells with N/A:

with open("out.csv", "w") as f:
    f.write(",".join(headers) + "\n")
    f.writelines(",".join([td.text.encode("utf-8").strip('\xe3\x80\x80') or "N/A"
                           for td in row.find_all("td")]) + "\n"
                 for row in table.select("tr + tr"))

Which will change the last row to:

2016,-5.2,-4.7,0.5,4.3,11.4,12.5,17.4,21.8 ],N/A,N/A,N/A,N/A,5.2 ]

The spaces for the missing values are unicode ideographic space characters (u"\u3000" in python), which when encoded to utf-8 become '\xe3\x80\x80'. We strip those, and if that leaves an empty string then we just use "N/A":

In [7]: print u"\u3000"

In [8]: u"\u3000".encode("utf-8")
Out[8]: '\xe3\x80\x80'

In [9]: u"\u3000".encode("utf-8").strip('\xe3\x80\x80')
Out[9]: ''
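If pandas is available, the whole download-parse-write pipeline collapses to a few lines with pandas.read_html — a minimal sketch, assuming pandas and lxml are installed and the JMA page still serves the same table. read_html returns a list of DataFrames, one per matching <table>:

import pandas as pd

url = "http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1"
# match only the stats table by its class attribute
tables = pd.read_html(url, attrs={"class": "data2_s"})
tables[0].to_csv("out.csv", index=False)  # index=False drops the synthetic row index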
Use the csv module from Python to do this. You can obviously write more columns if you want, but the idea is that you're writing a list to the csv file. There are other options that you can specify in the writer() method if you'd like to quote things, escape things, etc.

import csv

with open('your_csv_name.csv', 'w') as o:
    w = csv.writer(o)
    # Headers
    w.writerow(['tr_content'])
    # Write the tr text
    for r in rows:
        w.writerow([r])
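If you want one column per table cell rather than the whole row in a single column, you can split each tr into its td texts first — a sketch building on the rows list from the question (on Python 2 you would also .encode('utf-8') each cell, as in the earlier answer):

import csv

with open('your_csv_name.csv', 'w') as o:
    w = csv.writer(o)
    for r in rows:
        # one list entry per <td> gives one CSV column per cell
        w.writerow([td.get_text(strip=True) for td in r.find_all('td')])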
Here is another way without using the csv module:

fp = open('data.csv', 'w')
for row in rows[:-1]:  # removed the last row, as it has empty cells that give an error (this can be resolved too if needed)
    fp.write(row.get_text(',') + '\n')
fp.close()

You can open data.csv directly. The station details can be fetched with the commands below:

>>> table = soup.find_all('table', class_='data2_s')
>>> print table[0].find_all('caption')[0].get_text().encode('ascii', 'ignore')
WAKKANAI WMO Station ID:47401 Lat 45o24.9'N Lon 141o40.7'E

Hope this helps.
import csv
from bs4 import BeautifulSoup
import pandas as pd

html = open('test.html').read()
soup = BeautifulSoup(html, features='lxml')

# Specify the table you want to read.
# Example: <table class="queryResults" border="0" cellspacing="1">
table = soup.select_one('table.queryResults')

def get_all_tables(soup):
    return soup.find_all("table")

tbls = get_all_tables(soup)
for i, tablen in enumerate(tbls, start=1):
    print(i)
    print(tablen)

def get_table_headers(table):
    headers = []
    for th in table.find("tr").find_all("th"):
        headers.append(th.text.strip())
    return headers

head = get_table_headers(table)
#print(head)

def get_table_rows(table):
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = []
        # grab all td tags in this table row
        tds = tr.find_all("td")
        if len(tds) == 0:
            # if no td tags, search for th tags
            # (found especially in wikipedia tables below the table)
            ths = tr.find_all("th")
            for th in ths:
                cells.append(th.text.strip())
        else:
            # use regular td tags
            for td in tds:
                cells.append(td.text.strip())
        rows.append(cells)
    return rows

table_rows = get_table_rows(table)
#print(table_rows)

def save_as_csv(table_name, headers, rows):
    pd.DataFrame(rows, columns=headers).to_csv(f"{table_name}.csv")

save_as_csv("Test_table", head, table_rows)
Separate Python web scraped data in different columns (Excel)
Dear Stackoverflow community,

Recently I started playing around with Python. I learned a lot watching YouTube videos and browsing this platform, but I can't solve my problem. I hope you guys can help me out.

I tried to scrape information from websites using Python (Anaconda) and put this information in a CSV file. I tried to separate the columns by adding "," in my script, but when I open my CSV file all the data is put together in one column (A). Instead I want the data to be separated into different columns (A and B (and C, D, E, F etc. when I want to add info)). What do I have to add to this code?

filename = "brands.csv"
f = open(filename, "w")
headers = "brand, shipping\n"
f.write(headers)

for container in containers:
    brand_container = container.findAll("h2", {"class": "product-name"})
    brand = brand_container[0].a.text
    shipping_container = container.findAll("p", {"class": "availability in-stock"})
    shipping = shipping_container[0].text.strip()
    print("brand: " + brand)
    print("shipping: " + shipping)
    f.write(brand + "," + shipping + "," + "\n")

f.close()

Thank you for helping out!

Kind regards,

Complete script after Game0ver's suggestion:

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.scraped-website.com'

# opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each product
containers = page_soup.findAll("li", {"class": "item last"})
container = containers[0]

import csv

filename = "brands.csv"
with open(filename, 'w') as csvfile:
    fieldnames = ['brand', 'shipping']
    # define your delimiter
    writer = csv.DictWriter(csvfile, delimiter=',', fieldnames=fieldnames)
    writer.writeheader()
    for container in containers:
        brand_container = container.findAll("h2", {"class": "product-name"})
        brand = brand_container[0].a.text
        shipping_container = container.findAll("p", {"class": "availability in-stock"})
        shipping = shipping_container[0].text.strip()
        print("brand: " + brand)
        print("shipping: " + shipping)

As I mentioned, this code didn't work. I must have done something wrong?
You better use Python's csv module to do that:

import csv

filename = "brands.csv"
with open(filename, 'w') as csvfile:
    fieldnames = ['brand', 'shipping']
    # define your delimiter
    writer = csv.DictWriter(csvfile, delimiter=',', fieldnames=fieldnames)
    writer.writeheader()
    # write rows...
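Note that the loop in the asker's "complete script" builds brand and shipping but never writes them, which is why nothing lands in the file. A sketch of the missing loop body (it goes inside the with block, after writer.writeheader(); the variable names are taken from the question):

for container in containers:
    brand = container.findAll("h2", {"class": "product-name"})[0].a.text
    shipping = container.findAll("p", {"class": "availability in-stock"})[0].text.strip()
    # DictWriter expects a dict keyed by the fieldnames
    writer.writerow({'brand': brand, 'shipping': shipping})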
Try enclosing your values in double quotes, like:

f.write('"' + brand + '","' + shipping + '"\n')

Although there are better ways to handle this generic task and this functionality.
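One of those better ways is to let the csv module do the quoting for you — a sketch using csv.QUOTE_ALL, which quotes every field so embedded commas can't break the columns (brand and shipping are assumed to come from the question's loop):

import csv

with open("brands.csv", "w", newline="") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerow(["brand", "shipping"])
    writer.writerow([brand, shipping])  # one call per scraped product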
You can choose either of the ways I've shown below. As the url available in your script is unreachable, I've provided a working one.

import csv
import requests
from bs4 import BeautifulSoup

url = "https://yts.am/browse-movies"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')

with open("movieinfo.csv", 'w', newline="") as f:
    writer = csv.DictWriter(f, ['name', 'year'])
    writer.writeheader()
    for row in soup.select(".browse-movie-bottom"):
        d = {}
        d['name'] = row.select_one(".browse-movie-title").text
        d['year'] = row.select_one(".browse-movie-year").text
        writer.writerow(d)

Or you can try like the following:

soup = BeautifulSoup(response.content, 'lxml')

with open("movieinfo.csv", 'w', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'year'])
    for row in soup.select(".browse-movie-bottom"):
        name = row.select_one(".browse-movie-title").text
        year = row.select_one(".browse-movie-year").text
        writer.writerow([name, year])
Python: give column names and write values in separate columns as a table
My code:

from lxml import html
import requests
import csv

# encoding=utf8
import sys
reload(sys)
sys.setdefaultencoding('utf8')

# example site
page = requests.get('http://www.wintergreenfund.com/reports/top-ten/')
tree = html.fromstring(page.text)

# This will create a list of services:
tname = tree.xpath('//*[@id="colLeft"]//table//tr/td[1]/text()')
tvalue = tree.xpath('//table//tr/td[2]/text()')
print tname
print tvalue

print 'Input the csv file'
csvfile = raw_input("> ")

res = tname, tvalue

# Assuming res is a list of lists
with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(res)

My output in csv:

Reynolds American Inc.,Consolidated-Tomoka Land Co.,British American Tobacco,...
8.30%,7.50%,7.10%,6.60%,6.40%,5.90%,5.30%,4.80%,4.70%,4.10%

The required output is the same as on the website, with column names. Ref: http://www.wintergreenfund.com/reports/top-ten/

Also, unicode is not working; I need help with this too.

My new code:

from lxml import html
import requests
import csv

page = requests.get('http://www.wintergreenfund.com/reports/top-ten/')
tree = html.fromstring(page.text)

csvrows = []
for rows in tree.xpath('//*[@id="colLeft"]//table//tr'):
    csvrows.append([rows.xpath('./td[1]/text()'), rows.xpath('./td[2]/text()')])
print csvrows

print 'Input the csv file'
csvfile = raw_input("> ")

with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(['Name','Value'])  # substitute as appropriate.
    writer.writerows(csvrows)

I am getting values with [' '] in them and also empty [ ].
First thing: if you want to combine two lists at each corresponding index, you should use zip(). Currently you are creating a tuple of two lists in the line res = tname,tvalue and then writing it as-is to the csv. Secondly, you should first use xpath to get each row in the table, and then use xpath to get each required td element from it, rather than using two separate xpaths as you are doing currently. Example -

from lxml import html
import requests
import csv

page = requests.get('http://www.wintergreenfund.com/reports/top-ten/')
tree = html.fromstring(page.text)

csvrows = []
for rows in tree.xpath('//*[@id="colLeft"]//table//tr'):
    row1text = rows.xpath('./td[1]/text()')
    row2text = rows.xpath('./td[2]/text()')
    if row1text and row2text:
        csvrows.append([row1text[0], row2text[0]])

print(csvrows)

print('Input the csv file')
csvfile = input("> ")

with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(['Name','Value'])  # substitute as appropriate.
    writer.writerows(csvrows)
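For reference, here is what the zip() suggestion looks like on the asker's original two lists — a sketch; it assumes tname and tvalue line up one-to-one, which the per-row xpath above guarantees and the two separate xpath queries do not:

# zip pairs items at the same index:
# [('Reynolds American Inc.', '8.30%'), ('Consolidated-Tomoka Land Co.', '7.50%'), ...]
res = list(zip(tname, tvalue))
writer.writerows(res)  # each pair becomes one two-column csv row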
Python + BeautifulSoup Exporting to CSV
I'm having a bit of trouble automatically scraping data in a table from a Wikipedia article. First I was getting an encoding error. I specified UTF-8 and the error went away, but the scraped data doesn't display a lot of the characters correctly. You will be able to tell from the code that I am a complete newbie:

from bs4 import BeautifulSoup
import urllib2

wiki = "http://en.wikipedia.org/wiki/Anderson_Silva"
header = {'User-Agent': 'Mozilla/5.0'}  # Needed to prevent 403 error on Wikipedia
req = urllib2.Request(wiki, headers=header)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)

Result = ""
Record = ""
Opponent = ""
Method = ""
Event = ""
Date = ""
Round = ""
Time = ""
Location = ""
Notes = ""

table = soup.find("table", { "class" : "wikitable sortable" })

f = open('output.csv', 'w')

for row in table.findAll("tr"):
    cells = row.findAll("td")
    # For each "tr", assign each "td" to a variable.
    if len(cells) == 10:
        Result = cells[0].find(text=True)
        Record = cells[1].find(text=True)
        Opponent = cells[2].find(text=True)
        Method = cells[3].find(text=True)
        Event = cells[4].find(text=True)
        Date = cells[5].find(text=True)
        Round = cells[6].find(text=True)
        Time = cells[7].find(text=True)
        Location = cells[8].find(text=True)
        Notes = cells[9].find(text=True)

        write_to_file = Result + "," + Record + "," + Opponent + "," + Method + "," + Event + "," + Date + "," + Round + "," + Time + "," + Location + "\n"
        write_to_unicode = write_to_file.encode('utf-8')
        print write_to_unicode
        f.write(write_to_unicode)

f.close()
As pswaminathan pointed out, using the csv module will help greatly. Here is how I do it:

table = soup.find('table', {'class': 'wikitable sortable'})

with open('out2.csv', 'w') as f:
    csvwriter = csv.writer(f)
    for row in table.findAll('tr'):
        cells = [c.text.encode('utf-8') for c in row.findAll('td')]
        if len(cells) == 10:
            csvwriter.writerow(cells)

Discussion

Using the csv module, I created a csvwriter object connected to my output file. By using the with command, I don't need to worry about closing the output file after I'm done: it will be closed after the with block.

In my code, cells is a list of UTF8-encoded text extracted from the td tags within a tr tag. I used the construct c.text, which is more concise than c.find(text=True).
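On Python 3 the encode step is unnecessary (and would make csv.writer emit b'...' literals); a sketch of the same loop for Python 3, where the encoding is set on the file instead — it reuses the soup object from the question:

import csv

table = soup.find('table', {'class': 'wikitable sortable'})

# newline='' prevents blank lines between rows on Windows
with open('out2.csv', 'w', newline='', encoding='utf-8') as f:
    csvwriter = csv.writer(f)
    for row in table.find_all('tr'):
        cells = [c.get_text(strip=True) for c in row.find_all('td')]
        if len(cells) == 10:
            csvwriter.writerow(cells)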
Parsing a table with BeautifulSoup and writing it to a text file
I need the data from the table in a text file (output.txt) in this format:

data1;data2;data3;data4;.....

Celkova podlahova plocha bytu;33m;Vytah;Ano;Nadzemne podlazie;Prizemne podlazie;.....;Forma vlastnictva;Osobne

All in "one line", with ";" as the separator (for a later export to a csv file). I'm a beginner. Help, thanks.

from BeautifulSoup import BeautifulSoup
import urllib2
import codecs

response = urllib2.urlopen('http://www.reality.sk/zakazka/0747-003578/predaj/1-izb-byt/kosice-mestska-cast-sever-sladkovicova-kosice-sever/art-real-1-izb-byt-sladkovicova-ul-kosice-sever')
html = response.read()
soup = BeautifulSoup(html)

tabulka = soup.find("table", {"class" : "detail-char"})
for row in tabulka.findAll('tr'):
    col = row.findAll('td')
    prvy = col[0].string.strip()
    druhy = col[1].string.strip()
    record = ([prvy], [druhy])

fl = codecs.open('output.txt', 'wb', 'utf8')
for rec in record:
    line = ''
    for val in rec:
        line += val + u';'
    fl.write(line + u'\r\n')
fl.close()
You are not keeping each record as you read it in. Try this, which stores the records in records:

from BeautifulSoup import BeautifulSoup
import urllib2
import codecs

response = urllib2.urlopen('http://www.reality.sk/zakazka/0747-003578/predaj/1-izb-byt/kosice-mestska-cast-sever-sladkovicova-kosice-sever/art-real-1-izb-byt-sladkovicova-ul-kosice-sever')
html = response.read()
soup = BeautifulSoup(html)

tabulka = soup.find("table", {"class" : "detail-char"})

records = []  # store all of the records in this list
for row in tabulka.findAll('tr'):
    col = row.findAll('td')
    prvy = col[0].string.strip()
    druhy = col[1].string.strip()
    record = '%s;%s' % (prvy, druhy)  # store the record with a ';' between prvy and druhy
    records.append(record)

fl = codecs.open('output.txt', 'wb', 'utf8')
line = ';'.join(records)
fl.write(line + u'\r\n')
fl.close()

This could be cleaned up more, but I think it's what you are wanting.
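Since the plan is a later csv export anyway, the same single-line output can also come straight from the csv module with a ';' delimiter — a sketch reusing tabulka from above; it builds one flat cell list so everything stays on one line:

import csv

cells = []
for row in tabulka.findAll('tr'):
    col = row.findAll('td')
    cells.extend([col[0].string.strip(), col[1].string.strip()])

# the csv module on Python 2 wants encoded bytes, hence 'wb' mode and the encode
with open('output.txt', 'wb') as f:
    writer = csv.writer(f, delimiter=';')
    writer.writerow([c.encode('utf8') for c in cells])  # one record = one line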
Here's an alternative non-BS way, just for your task:

import urllib2  # needed for this snippet to run on its own

store = []  # to store your results
url = """http://www.reality.sk/zakazka/0747-003578/predaj/1-izb-byt/kosice-mestska-cast-sever-sladkovicova-kosice-sever/art-real-1-izb-byt-sladkovicova-ul-kosice-sever"""

page = urllib2.urlopen(url)
data = page.read()

for table in data.split("</table>"):
    if "<table" in table and 'class="detail-char' in table:
        for item in table.split("</td>"):
            if "<td" in item:
                store.append(item.split(">")[-1].strip())

print ','.join(store)

Output:

$ ./python.py
Celková podlahová plocha bytu,33 m2,Výťah,Áno,Nadzemné podlažie,Prízemné podlažie,Stav,Čiastočná rekonštrukcia,Konštrukcia bytu,tehlová,Forma vlastníctva,osobné