Parsing a table with BeautifulSoup and writing it to a text file - Python
I need the data from a table written to a text file (output.txt) in this format:
data1;data2;data3;data4;.....
Celkova podlahova plocha bytu;33m;Vytah;Ano;Nadzemne podlazie;Prizemne podlazie;.....;Forma vlastnictva;Osobne
All in "one line", separator is ";" (later export in csv-file).
I´m beginner.. Help, thanks.
from BeautifulSoup import BeautifulSoup
import urllib2
import codecs

response = urllib2.urlopen('http://www.reality.sk/zakazka/0747-003578/predaj/1-izb-byt/kosice-mestska-cast-sever-sladkovicova-kosice-sever/art-real-1-izb-byt-sladkovicova-ul-kosice-sever')
html = response.read()
soup = BeautifulSoup(html)
tabulka = soup.find("table", {"class" : "detail-char"})

for row in tabulka.findAll('tr'):
    col = row.findAll('td')
    prvy = col[0].string.strip()
    druhy = col[1].string.strip()
    record = ([prvy], [druhy])

fl = codecs.open('output.txt', 'wb', 'utf8')
for rec in record:
    line = ''
    for val in rec:
        line += val + u';'
    fl.write(line + u'\r\n')
fl.close()
You are not keeping each record as you read it in; record is overwritten on every pass through the loop, so only the last row survives. Try this, which stores all of the records in a records list:
from BeautifulSoup import BeautifulSoup
import urllib2
import codecs

response = urllib2.urlopen('http://www.reality.sk/zakazka/0747-003578/predaj/1-izb-byt/kosice-mestska-cast-sever-sladkovicova-kosice-sever/art-real-1-izb-byt-sladkovicova-ul-kosice-sever')
html = response.read()
soup = BeautifulSoup(html)
tabulka = soup.find("table", {"class" : "detail-char"})

records = []  # store all of the records in this list
for row in tabulka.findAll('tr'):
    col = row.findAll('td')
    prvy = col[0].string.strip()
    druhy = col[1].string.strip()
    record = '%s;%s' % (prvy, druhy)  # store the record with a ';' between prvy and druhy
    records.append(record)

fl = codecs.open('output.txt', 'wb', 'utf8')
line = ';'.join(records)
fl.write(line + u'\r\n')
fl.close()
This could be cleaned up more, but I think it's what you want.
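Since you mention exporting to a CSV file later, here is a minimal sketch of that step with Python 2's csv module, reusing the records list built above; the file name output.csv and the ';' delimiter are assumptions taken from your question:

import csv

# sketch: write every value as one ';'-separated row (Python 2, so open the
# file in binary mode and encode the unicode values by hand)
with open('output.csv', 'wb') as f:
    writer = csv.writer(f, delimiter=';')
    # each record is already "prvy;druhy", so split it back into single values
    values = [v.encode('utf8') for rec in records for v in rec.split(';')]
    writer.writerow(values)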
Here's an alternative way without BeautifulSoup, just for your task:
import urllib2

store = []  # to store your results

url = """http://www.reality.sk/zakazka/0747-003578/predaj/1-izb-byt/kosice-mestska-cast-sever-sladkovicova-kosice-sever/art-real-1-izb-byt-sladkovicova-ul-kosice-sever"""
page = urllib2.urlopen(url)
data = page.read()

for table in data.split("</table>"):
    if "<table" in table and 'class="detail-char' in table:
        for item in table.split("</td>"):
            if "<td" in item:
                store.append(item.split(">")[-1].strip())

print ','.join(store)
Output:
$ ./python.py
Celková podlahová plocha bytu,33 m2,Výťah,Áno,Nadzemné podlažie,Prízemné podlažie,Stav,Čiastočná rekonštrukcia,Konštrukcia bytu,tehlová,Forma vlastníctva,osobné
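The question asked for ';' separators written to output.txt rather than a printed ','-joined line; a small follow-up sketch under that assumption (plain byte strings are fine here, because store holds the raw utf-8 fragments from page.read()):

# write the collected cells to output.txt as one ';'-separated line
with open('output.txt', 'wb') as fl:
    fl.write(';'.join(store) + '\r\n')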
Related
Want to scrape each category individually, but it either scrapes the data one character at a time or as one whole paragraph
I want to extract Name & Position, Education, Contact number, and Email into separate columns of a CSV, but when I extract it I get either a single cell per character or a single column per paragraph (if I list it). Here is the code:

import requests
from bs4 import BeautifulSoup
from csv import writer

url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'lxml')
page = soup.find_all('p')

for i in page:
    i = i.text
    with open('page.csv', 'a', encoding='utf8', newline='') as f:
        thewriter = writer(f)
        thewriter.writerow(i)
You can use regex to pull out what you need:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
content = soup.find('div', {'id': 'divContent'})
p_list = content.find_all('p')

rows = []
for p in p_list:
    string = p.text
    text = re.search(r'(^.*) (Education: )(.*)( Contact).*(\d{3}-\d{3}-\d{4})\s*([a-zA-z1-9].*#[\w].*\.[\w].*)', string).groups()
    name = text[0]
    edu = text[2]
    phone = text[4]
    email = text[5]
    row = {'name': name, 'education': edu, 'phone': phone, 'email': email}
    rows.append(row)

df = pd.DataFrame(rows)
df.to_csv('page.csv', index=False)
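To see what the six captured groups hold, it can help to run the pattern against a made-up line shaped like the page's <p> text (the name, school, number, and address below are all hypothetical):

import re

# hypothetical staff line, mimicking the structure the regex expects
sample = ('Jane Doe Principal Education: B.A. Example University '
          'Contact Information: 703-555-0100 jdoe#example.edu')
m = re.search(r'(^.*) (Education: )(.*)( Contact).*(\d{3}-\d{3}-\d{4})\s*([a-zA-z1-9].*#[\w].*\.[\w].*)', sample)
print(m.groups())
# ('Jane Doe Principal', 'Education: ', 'B.A. Example University',
#  ' Contact', '703-555-0100', 'jdoe#example.edu')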
How can I export a table from a webpage to csv? [duplicate]
I want to convert an HTML table obtained with the script below into a CSV file, but I get this type error:

TypeError: sequence item 0: expected string, Tag found

from bs4 import BeautifulSoup
import urllib2

url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1'
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html)
table = soup.find_all('table', class_='data2_s')
rows = table[0].find_all('tr')

What is the easiest way to convert it into a CSV file? I tried:

fo = open('fo.txt', 'w')
for r in rows:
    fo.write(str(r.txt) + '\n')
fo.close()

but it wrote 'none'. The HTML is like this:

<table class="data2_s"><caption class="m">WAKKANAI   WMO Station ID:47401 Lat 45<sup>o</sup>24.9'N  Lon 141<sup>o</sup>40.7'E</caption>
<tr><th scope="col">Year</th><th scope="col">Jan</th><th scope="col">Feb</th><th scope="col">Mar</th><th scope="col">Apr</th><th scope="col">May</th><th scope="col">Jun</th><th scope="col">Jul</th><th scope="col">Aug</th><th scope="col">Sep</th><th scope="col">Oct</th><th scope="col">Nov</th><th scope="col">Dec</th><th scope="col">Annual</th></tr>
<tr class="mtx" style="text-align:right;"><td style="text-align:center">1938</td><td class="data_0_0_0_0">-5.2</td><td class="data_0_0_0_0">-4.9</td><td class="data_0_0_0_0">-0.6</td><td class="data_0_0_0_0">4.7</td><td class="data_0_0_0_0">9.5</td><td class="data_0_0_0_0">11.6</td><td class="data_0_0_0_0">17.9</td><td class="data_0_0_0_0">22.2</td><td class="data_0_0_0_0">16.5</td><td class="data_0_0_0_0">10.7</td><td class="data_0_0_0_0">3.3</td><td class="data_0_0_0_0">-4.7</td><td class="data_0_0_0_0">6.8</td></tr>
<tr class="mtx" style="text-align:right;"><td style="text-align:center">1939</td><td class="data_0_0_0_0">-7.5</td><td class="data_0_0_0_0">-6.6</td><td class="data_0_0_0_0">-1.4</td><td class="data_0_0_0_0">4.0</td><td class="data_0_0_0_0">7.5</td><td class="data_0_0_0_0">13.0</td><td class="data_0_0_0_0">17.4</td><td class="data_0_0_0_0">20.0</td><td class="data_0_0_0_0">17.4</td><td class="data_0_0_0_0">9.7</td><td class="data_0_0_0_0">3.0</td><td class="data_0_0_0_0">-2.5</td><td class="data_0_0_0_0">6.2</td></tr>
This is a job for the csv lib. Get each td inside each row and extract the text; it will handle rows where values are missing:

from bs4 import BeautifulSoup
import urllib2
import csv

url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1'
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html)
table = soup.select_one("table.data2_s")

# python3 just use th.text
headers = [th.text.encode("utf-8") for th in table.select("tr th")]

with open("out.csv", "w") as f:
    wr = csv.writer(f)
    wr.writerow(headers)
    wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")]
                  for row in table.select("tr + tr")])

Which matches the table exactly as you see it on the page:

:~$ cat out.csv
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
1938,-5.2,-4.9,-0.6,4.7,9.5,11.6,17.9,22.2,16.5,10.7,3.3,-4.7,6.8
1939,-7.5,-6.6,-1.4,4.0,7.5,13.0,17.4,20.0,17.4,9.7,3.0,-2.5,6.2
1940,-6.0,-5.7,-0.5,3.5,8.5,11.0,16.6,19.7,15.6,10.4,3.7,-1.0,6.3
1941,-6.5,-5.8,-2.6,3.6,8.1,11.4,12.7,16.5,16.0,10.0,4.0,-2.9,5.4
1942,-7.8,-8.2,-0.8,3.5,7.1,12.0,17.4,18.4,15.7,10.5,2.5,-2.9,5.6
1943,-4.1,-6.1,-1.1,3.5,6.9,12.9,19.3,21.5,17.5,11.7,1.2,-3.6,6.6
1944,-7.7,-7.9,-2.2,1.7,8.9,13.7,19.0,21.3,16.6,10.8,1.3,-6.0,5.8
1945,-7.8,-6.9,-1.8,3.9,5.5,11.0,13.6,18.7,16.8,11.0,3.9,-4.8,5.3
1946,-6.5,-6.0,-3.3,4.5,7.6,14.9,18.2,22.2,16.9,11.5,4.4,-2.5,6.8
1947,-4.9,-5.5,-2.3,3.7,9.0,11.2,17.1,19.3,15.1,10.6,2.4,-4.6,5.9
1948,-2.7,-4.4,-0.2,6.0,10.7,12.2,16.2,22.0,16.9,11.1,4.2,-0.6,7.6
1949,-2.6,-2.8,-3.4,2.0,9.4,11.8,16.9,20.8,17.8,10.8,3.1,-3.8,6.7
1950,-5.7,-4.8,-1.3,4.0,9.2,14.6,19.3,22.6,16.8,9.0,3.0,-2.9,7.0
1951,-6.7,-6.5,-2.2,3.7,9.5,12.3,16.7,22.3,15.6,10.1,3.7,-0.3,6.5
1952,-5.7,-7.1,-2.4,3.8,8.3,13.1,16.4,19.7,17.0,11.3,0.9,-7.1,5.7
1953,-7.7,-7.3,-0.9,3.6,6.9,11.1,16.8,19.2,17.6,11.2,-0.6,-2.6,5.6
1954,-6.7,-4.1,-2.5,4.0,7.5,11.0,13.7,17.0,17.2,9.5,3.2,-1.8,5.7
1955,-6.4,-4.8,-1.3,4.7,7.0,12.7,20.3,19.5,15.5,10.6,3.6,-0.4,6.8
1956,-6.1,-4.6,-2.0,5.1,10.8,11.2,13.8,16.3,17.2,12.3,2.8,-2.6,6.2
1957,-3.9,-5.5,-2.9,4.4,9.3,10.9,17.1,18.2,15.5,11.1,5.4,-1.1,6.5
1958,-4.9,-4.9,-2.3,4.4,8.5,12.6,17.5,18.3,16.8,10.6,4.5,-0.5,6.7
1959,-7.3,-2.8,0.8,6.4,9.4,12.7,17.1,18.5,16.2,11.6,2.9,-3.9,6.8
1960,-7.2,-5.2,-1.4,3.5,7.7,10.8,15.9,20.8,18.1,9.7,3.3,-3.9,6.0
1961,-7.7,-5.3,-1.4,5.5,8.7,14.7,19.5,20.0,18.9,10.4,4.1,-1.3,7.2
1962,-4.2,-5.4,-2.5,6.7,10.0,12.9,16.8,17.7,16.6,9.9,2.6,-1.5,6.6
1963,-3.6,-3.7,0.1,5.0,10.4,12.4,16.8,17.1,15.6,10.7,4.3,-1.7,7.0
1964,-4.5,-7.7,-1.3,3.7,9.9,11.9,15.3,17.7,14.9,10.0,3.6,-1.9,6.0
1965,-4.1,-5.7,-2.8,3.2,9.1,13.3,15.2,18.8,15.8,11.4,2.1,-2.6,6.1
1966,-5.0,-5.5,-1.0,3.2,8.1,12.2,15.3,17.5,15.4,11.6,4.1,-4.4,6.0
1967,-6.8,-5.9,-0.7,4.5,10.0,11.4,16.4,20.5,15.5,11.0,1.8,-1.5,6.4
1968,-4.2,-4.7,1.9,5.7,8.9,14.5,17.3,18.1,15.9,9.1,5.3,-0.7,7.3
1969,-7.3,-7.5,-2.5,3.9,7.2,10.6,17.0,16.5,16.1,9.4,2.2,-5.4,5.0
1970,-6.6,-6.0,-4.2,4.6,10.4,12.9,17.4,19.2,16.8,10.5,4.3,-3.3,6.3
1971,-6.3,-6.4,-1.7,4.1,7.6,11.6,15.8,17.2,15.2,11.5,3.4,-2.2,5.8
1972,-5.3,-5.0,-0.6,5.9,9.4,12.8,16.8,20.4,15.7,10.9,1.9,-1.4,6.8
1973,-4.2,-5.3,-2.9,4.2,8.4,12.8,17.0,20.9,17.1,10.4,3.5,-1.9,6.7
1974,-2.6,-4.6,-2.1,4.0,8.4,11.8,16.8,18.8,16.5,10.1,1.9,-5.7,6.1
1975,-4.1,-6.1,-1.5,4.3,8.4,13.7,16.1,20.6,17.3,10.4,3.8,-3.8,6.6
1976,-4.6,-3.5,-1.4,4.0,8.9,11.9,17.5,17.6,15.7,10.2,1.3,-2.0,6.3
1977,-8.3,-7.1,-1.0,3.6,8.0,11.9,18.2,19.1,17.4,11.4,4.5,-1.8,6.3
1978,-6.7,-9.2,-1.6,4.3,9.2,13.5,20.6,21.3,17.4,9.6,3.4,-2.1,6.6
1979,-6.9,-4.5,-2.5,2.7,7.8,13.2,15.8,20.3,16.9,11.3,2.9,-0.1,6.4
1980,-5.4,-7.1,-1.9,1.9,7.8,12.9,15.9,16.5,16.0,10.0,4.3,-0.6,5.9
1981,-5.4,-6.3,-2.6,5.6,8.1,11.8,17.1,18.7,16.0,10.5,0.8,-0.6,6.1
1982,-5.6,-5.3,-0.6,3.7,9.0,11.9,16.9,21.0,17.5,11.4,4.3,-1.0,6.9
1983,-4.2,-7.6,-1.9,6.8,8.2,8.5,14.5,18.9,15.8,8.9,4.8,-2.1,5.9
1984,-4.9,-6.6,-3.3,2.9,7.9,15.5,19.5,20.5,16.6,9.2,2.3,-3.6,6.3
1985,-8.7,-4.8,-1.4,4.9,8.6,11.7,16.6,21.1,15.7,10.3,2.7,-4.2,6.0
1986,-7.2,-6.5,-2.4,4.6,8.4,11.2,14.4,19.6,16.8,9.1,2.1,-1.9,5.7
1987,-6.4,-5.6,-1.4,4.2,8.6,12.6,17.5,18.0,16.4,11.1,2.0,-3.1,6.2
1988,-4.8,-6.3,-1.8,4.1,8.0,12.6,14.1,20.4,16.1,10.4,2.0,-1.5,6.1
1989,-2.6,-2.4,0.8,4.0,8.2,10.7,18.4,20.4,16.8,10.8,4.8,-1.3,7.4
1990,-5.7,-2.4,1.4,5.7,9.3,13.4,18.9,20.3,17.1,13.3,6.2,1.2,8.2
1991,-1.6,-3.6,-1.5,4.8,10.1,14.3,16.2,19.0,16.6,11.8,3.5,-2.3,7.3
1992,-3.6,-3.6,-0.4,3.7,8.1,12.1,17.6,18.0,14.9,11.1,3.2,-1.2,6.7
1993,-2.7,-3.3,-0.2,3.1,8.6,10.7,15.6,17.6,16.3,11.1,3.7,-1.6,6.6
1994,-6.1,-2.7,-1.3,4.4,10.0,12.8,17.4,21.7,17.5,11.8,4.3,-2.9,7.2
1995,-4.0,-4.0,-0.8,4.8,11.0,12.7,18.4,19.3,16.3,12.3,5.2,-0.6,7.6
1996,-4.6,-4.5,-1.0,3.5,6.9,12.0,15.9,18.7,16.8,10.4,2.3,-2.4,6.2
1997,-3.0,-3.3,-1.5,4.3,7.3,11.7,17.4,17.2,16.1,10.3,6.4,-0.7,6.9
1998,-6.9,-5.1,0.3,5.3,10.1,12.9,15.5,18.1,17.2,12.5,2.0,-2.4,6.6
1999,-4.1,-5.6,-2.6,4.2,8.4,14.5,16.6,21.0,18.3,11.2,3.8,-1.9,7.0
2000,-4.2,-5.6,-2.1,3.5,9.3,12.8,18.9,21.5,17.7,10.6,1.5,-4.1,6.7
2001,-6.3,-7.7,-2.4,4.7,8.5,13.0,17.4,18.7,15.6,10.8,4.0,-4.2,6.0
2002,-3.6,-1.0,0.5,6.8,11.1,12.1,15.7,17.1,17.0,10.8,2.3,-4.4,7.0
2003,-4.7,-5.6,-0.7,5.3,10.1,13.9,14.3,18.4,16.6,11.3,4.5,-1.4,6.8
2004,-3.9,-3.0,-0.5,4.4,10.6,14.6,16.8,19.7,17.8,11.8,5.9,-2.0,7.7
2005,-4.6,-5.7,-1.0,3.9,7.0,14.3,16.7,21.0,17.9,12.6,4.9,-2.3,7.1
2006,-5.5,-4.7,-0.9,2.1,9.3,11.9,18.4,21.6,17.7,11.0,4.5,-1.8,7.0
2007,-3.7,-3.2,-0.7,3.5,7.6,14.3,16.7,20.4,17.0,10.9,3.0,-1.7,7.0
2008,-6.0,-4.8,0.6,6.0,8.3,11.9,17.9,18.8,17.9,11.5,3.8,-0.4,7.1
2009,-2.4,-4.4,0.0,4.5,10.0,12.3,14.8,18.6,16.9,11.4,3.1,-2.2,6.9
2010,-3.4,-4.9,-1.4,3.5,7.3,15.0,18.1,22.4,18.4,11.4,4.8,-1.1,7.5
2011,-5.1,-2.2,-0.6,4.4,6.5,12.8,17.5 ),21.5,18.3,12.1,4.9,-2.3,7.3
2012,-5.4,-6.4,-2.4,4.6,8.9,12.6,17.2,20.4,19.4,11.8,3.8,-3.0,6.8
2013,-5.8,-5.1,-1.3,4.5,7.2,14.0,18.9,20.2,17.6,11.8,5.5,-0.2,7.3
2014,-5.3,-4.2,-1.2,3.9,8.7,13.9,19.2,20.0,16.7,11.0,4.8,-2.3,7.1
2015,-2.9,-1.7,2.3,5.9,9.9,12.1,17.6,19.0,17.3,10.4,3.7,-0.2,7.8
2016,-5.2,-4.7,0.5,4.3,11.4,12.5,17.4,21.8 ], , , , ,5.2 ]

If you want the caption, use table.select_one("caption.m").text:

with open("out.csv", "w") as f:
    wr = csv.writer(f)
    wr.writerow([table.select_one("caption.m").text.encode("utf-8")])
    wr.writerow(headers)
    wr.writerows([[td.text.encode("utf-8") for td in row.find_all("td")]
                  for row in table.select("tr + tr")])

but it might be a better idea to use that as the name of the file, as opposed to adding it to the csv.

If you really want to do it without the csv lib, use the same logic with str.join:

table = soup.select_one("table.data2_s")
headers = [th.text.encode("utf-8") for th in table.select("tr th")]

with open("out.csv", "w") as f:
    f.write(",".join(headers) + "\n")
    f.writelines(",".join([td.text.encode("utf-8") for td in row.find_all("td")]) + "\n"
                 for row in table.select("tr + tr"))

If you want to replace the empty cells with N/A:

with open("out.csv", "w") as f:
    f.write(",".join(headers) + "\n")
    f.writelines(",".join([td.text.encode("utf-8").strip('\xe3\x80\x80') or "N/A"
                           for td in row.find_all("td")]) + "\n"
                 for row in table.select("tr + tr"))

Which will change the last row to:

2016,-5.2,-4.7,0.5,4.3,11.4,12.5,17.4,21.8 ],N/A,N/A,N/A,N/A,5.2 ]

The spaces for the missing values are unicode ideographic space characters (u"\u3000" in Python), which become '\xe3\x80\x80' when encoded to utf-8; we strip those, and if that leaves an empty string we substitute "N/A":

In [7]: print u"\u3000"

In [8]: u"\u3000".encode("utf-8")
Out[8]: '\xe3\x80\x80'

In [9]: u"\u3000".encode("utf-8").strip('\xe3\x80\x80')
Out[9]: ''
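For reference, a rough Python 3 adaptation of the same approach, as a sketch rather than tested code: urllib2 becomes urllib.request, the manual utf-8 encoding goes away, and the csv file is opened with newline='':

import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = 'http://www.data.jma.go.jp/obd/stats/etrn/view/monthly_s3_en.php?block_no=47401&view=1'
soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
table = soup.select_one('table.data2_s')

# in Python 3 the text is already str, so no .encode() calls are needed
headers = [th.text for th in table.select('tr th')]
with open('out.csv', 'w', newline='', encoding='utf-8') as f:
    wr = csv.writer(f)
    wr.writerow(headers)
    wr.writerows([td.text for td in row.find_all('td')]
                 for row in table.select('tr + tr'))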
Use the csv module from Python to do this. You can obviously write more columns if you want, but the idea is that you're writing a list to the csv file. There are other options you can specify in the writer() method if you'd like to quote things, escape things, etc.

import csv

with open('your_csv_name.csv', 'w') as o:
    w = csv.writer(o)
    # Headers
    w.writerow(['tr_content'])
    # Write the tr text
    for r in rows:
        w.writerow([r])
Here is another way, without using the csv module:

fp = open('data.csv', 'w')
# the last row is dropped because it has empty cells that raise an error
# (this can also be handled properly if needed)
for row in rows[:-1]:
    fp.write(row.get_text(',') + '\n')
fp.close()

You can open data.csv directly. The station details can be fetched with the commands below:

>>> table = soup.find_all('table', class_='data2_s')
>>> print table[0].find_all('caption')[0].get_text().encode('ascii', 'ignore')
WAKKANAI WMO Station ID:47401 Lat 45o24.9'N Lon 141o40.7'E

Hope this helps.
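get_text(',') is what does the work here: it joins every text node inside the row with a comma. A tiny illustration on a fragment of the question's own markup:

from bs4 import BeautifulSoup

# one table row, parsed standalone; html.parser keeps orphan tr/td tags
row = BeautifulSoup('<tr><td>1938</td><td>-5.2</td><td>-4.9</td></tr>', 'html.parser')
print(row.get_text(','))  # prints: 1938,-5.2,-4.9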
from bs4 import BeautifulSoup
import pandas as pd

html = open('test.html').read()
soup = BeautifulSoup(html, features='lxml')

# Specify the table name which you want to read.
# Example: <table class="queryResults" border="0" cellspacing="1">
table = soup.select_one('table.queryResults')

def get_all_tables(soup):
    return soup.find_all("table")

tbls = get_all_tables(soup)
for i, tablen in enumerate(tbls, start=1):
    print(i)
    print(tablen)

def get_table_headers(table):
    headers = []
    for th in table.find("tr").find_all("th"):
        headers.append(th.text.strip())
    return headers

head = get_table_headers(table)
#print(head)

def get_table_rows(table):
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = []
        # grab all td tags in this table row
        tds = tr.find_all("td")
        if len(tds) == 0:
            # if no td tags, search for th tags
            # (can be found especially in wikipedia tables below the table)
            ths = tr.find_all("th")
            for th in ths:
                cells.append(th.text.strip())
        else:
            # use regular td tags
            for td in tds:
                cells.append(td.text.strip())
        rows.append(cells)
    return rows

table_rows = get_table_rows(table)
#print(table_rows)

def save_as_csv(table_name, headers, rows):
    pd.DataFrame(rows, columns=headers).to_csv(f"{table_name}.csv")

save_as_csv("Test_table", head, table_rows)
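Since this answer already depends on pandas, it's worth noting that pandas.read_html can often replace all of the above in a couple of lines; a sketch, assuming the tables in test.html parse cleanly (an HTML parser such as lxml is still required):

import pandas as pd

# read_html returns a list of DataFrames, one per <table> it finds
tables = pd.read_html('test.html')
tables[0].to_csv('Test_table.csv', index=False)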
How to web scrape the wiki tables of multiple companies
I am trying to web scrape the wiki tables of multiple companies like Samsung, Alibaba, etc., but I am not able to. Below is my code:

import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

csvFile = open('Information.csv', 'wt+')
writer = csv.writer(csvFile)

lst = ['Samsung', 'Facebook', 'Google', 'Tata_Consultancy_Services', 'Wipro', 'IBM', 'Alibaba_Group', 'Baidu', 'Yahoo!', 'Oracle_Corporation']
for a in lst:
    html = urlopen("https://en.wikipedia.org/wiki/a")
    bs = BeautifulSoup(html, 'html.parser')
    table = bs.findAll('table')
    for tr in table:
        rows = tr.findAll('tr')
        for row in rows:
            csvRow = []
            for cell in row.findAll(['td', 'th']):
                csvRow.append(cell.get_text())
            print(csvRow)
            writer.writerow(csvRow)
You are passing a as a string itself, not a reference to one of the items in the list. Here is the corrected code:

import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

csvFile = open('Information.csv', 'wt+')
writer = csv.writer(csvFile)

lst = ['Samsung', 'Facebook', 'Google', 'Tata_Consultancy_Services', 'Wipro', 'IBM', 'Alibaba_Group', 'Baidu', 'Yahoo!', 'Oracle_Corporation']
for a in lst:
    html = urlopen("https://en.wikipedia.org/wiki/{}".format(a))
    bs = BeautifulSoup(html, 'html.parser')
    table = bs.findAll('table')
    for tr in table:
        rows = tr.findAll('tr')
        for row in rows:
            csvRow = []
            for cell in row.findAll(['td', 'th']):
                csvRow.append(cell.get_text())
            print(csvRow)
            writer.writerow(csvRow)
html = urlopen("https://en.wikipedia.org/wiki/a") is where the problem is. You're looping through lst to get the URL for each company, but you passed a string literal to the urlopen method instead. To solve this, replace that line with any one of the following:

html = urlopen("https://en.wikipedia.org/wiki/" + a)
html = urlopen(f"https://en.wikipedia.org/wiki/{a}")  # requires Python 3.6+
html = urlopen("https://en.wikipedia.org/wiki/{}".format(a))
Python 2.7: How to check if a row already exists?
I am trying to check whether a row already exists; if it doesn't, something has to be written to it. My CSV file stays empty.

# import libraries
import csv
import urllib2
from bs4 import BeautifulSoup

# integer for first article id
articleid = 4449
articles = 4459

while articleid < articles:
    # specify the url and article id
    url = 'http://www.bkfrem.dk/default.asp?vis=nyheder&id=' + str(articleid)
    articleid += 1

    # query the website and return the html to the variable
    page = urllib2.urlopen(url)

    # parse the html using beautiful soup and store in variable soup
    soup = BeautifulSoup(page, 'html.parser')

    # create CSV file
    csvfile = csv.writer(open('news.csv', 'a'))

    # take out the <div> of name and get its value and text
    title_box = soup.find('h1', attrs={'style': 'margin-bottom:0px'})
    title = title_box.text.encode('utf-8').strip()

    date_box = soup.find('div', attrs={'style': 'font-style:italic; padding-bottom:10px'})
    date = date_box.text.encode('utf-8').strip()

    articleText_box = soup.find('div', attrs={'class': 'news'})
    articleText = articleText_box.text.encode('utf-8').strip()

    # print the data (encoded) to the CSV file
    with open('news.csv', 'rb') as csvfileO:
        f_reader = csv.reader(csvfileO, delimiter=',')
        for row in f_reader:
            if articleText not in row:
                csvfile.writerow(["Title", "Date", "Text"])
                csvfile.writerow((title, date, articleText))

What am I doing wrong, given that the file stays empty?
Look at this part:

for row in f_reader:
    if articleText not in
        csvfile.writerow(["Title", "Date", "Text"])
        csvfile.writerow((title, date, articleText))

You have "if articleText not in". Not in what? You should have it pointing at something concrete to validate against:

if articleText not in "Something":
    csvfile.writerow(["Title", "Date", "Text"])
    csvfile.writerow((title, date, articleText))
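For the deduplication the question is actually after, one workable pattern is to read the existing article texts once into a set before deciding what to append; a minimal sketch, assuming Python 2.7 and the three-column layout used above (an illustration, not the original answer's code):

import csv
import os

# collect the article texts already present in news.csv (third column)
seen = set()
if os.path.exists('news.csv'):
    with open('news.csv', 'rb') as f:
        for row in csv.reader(f):
            if len(row) == 3:
                seen.add(row[2])

# append only articles that have not been stored yet
if articleText not in seen:
    with open('news.csv', 'ab') as f:
        csv.writer(f).writerow((title, date, articleText))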
writing info to csv in python
import requests
from bs4 import BeautifulSoup
import csv
from urlparse import urljoin
import urllib2

base_url = 'http://www.baseball-reference.com/'  # base url for concatenation
data = requests.get("http://www.baseball-reference.com/teams/BAL/2014-schedule-scores.shtml")  # website for scraping
soup = BeautifulSoup(data.content)
b = 5

for link in soup.find_all('a'):
    if not link.has_attr('href'):
        continue
    if link.get_text() != 'boxscore':
        continue
    url = base_url + link['href']
    response = requests.get(url)
    html = response.content
    soup = BeautifulSoup(html)

    # Scores
    table = soup.find('table', attrs={'id': 'BaltimoreOriolespitching'})
    for row in table.findAll('tr'):
        list_of_cells = []
        for cell in row.findAll('td'):
            text = cell.text.replace(' ', '')
            list_of_cells.append(text)
        for list in list_of_cells:
            with open('test1.csv', 'w', newline='') as fp:
                a = csv.writer(fp, delimiter=',')
                a.writerows(list)

I am trying to write the scraped info to a CSV so that each piece of information has its own cell. The more I play with the code, the more I get either an indentation error or only the first row printed to the CSV:

IndentationError: expected an indented block
I think the first thing to consider is moving the file opening and the creation of the CSV writer outside the loop. You're overwriting the CSV file ('w') on each pass through the for loop. So try this:

with open('test1.csv', 'w', newline='') as fp:
    csvw = csv.writer(fp, delimiter=',')
    for link in soup.find_all('a'):
        if not link.has_attr('href'):
            continue
        if link.get_text() != 'boxscore':
            continue
        url = base_url + link['href']
        response = requests.get(url)
        html = response.content
        soup = BeautifulSoup(html)

        # Scores
        table = soup.find('table', attrs={'id': 'BaltimoreOriolespitching'})
        for row in table.findAll('tr'):
            list_of_cells = []
            for cell in row.findAll('td'):
                text = cell.text.replace(' ', '')
                list_of_cells.append(text)
            for list in list_of_cells:
                csvw.writerows(list)
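Two further things worth flagging, as assumptions about what is intended rather than fixes from the original answer. First, the question's code mixes Python 2 imports (urllib2, urlparse) with the Python 3-only open(..., newline='') signature; if this is actually running under Python 2.7, the file would be opened with open('test1.csv', 'wb') instead. Second, csvw.writerows(list) on a single string writes one character per row; writing the whole row at once is likely what's wanted:

# probable intent: one table row per CSV row, one cell per column
csvw.writerow(list_of_cells)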