Html parser to dataframe - python

I am struggling to convert an HTML table to a dataframe. I would like to write the table to a CSV file.
from requests import session
import sys
import csv
from bs4 import BeautifulSoup
c = session()
outfile = open("Weather2017.csv", 'wb')
response = c.get('http://www.wunderground.com/history/airport/EGLL/2017/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2017&req_city=NA&req_state=NA&req_statename=NA')
soup = BeautifulSoup(response.text, "html.parser")
soup = soup.find(id="obsTable").text.replace('\n','',1)
outfile.write(soup.replace('\n',',London\n'))
The type error is the following:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-11-1e149d844e15> in <module>()
11 soup = BeautifulSoup(response.text, "html.parser")
12 soup = soup.find(id="obsTable").text.replace('\n','',1)
---> 13 outfile.write(soup.replace('\n',',London\n'))
14
15
TypeError: a bytes-like object is required, not 'str'
This is the table I want to convert into a CSV file.
Can anyone help me?
Thanks in advance!

How about this,
from requests import session
import sys
import csv
from bs4 import BeautifulSoup
c = session()
response = c.get('http://www.wunderground.com/history/airport/EGLL/2017/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2017&req_city=NA&req_state=NA&req_statename=NA')
soup = BeautifulSoup(response.text, "html.parser")
table = soup.find(id="obsTable")
headers = [header.text.strip() for header in table.find_all('th')]
rows = []
for row in table.find_all('tr'):
    rows.append([val.text.strip() for val in row.find_all('td')])
del rows[0] # Remove header row. Added as empty.
# open in text mode ('wb' plus a str is what raised the TypeError in Python 3)
with open('Weather2017.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(row for row in rows if row)
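For reference, the TypeError in the original snippet comes from opening the file in binary mode ('wb') and then writing a str. Assuming Python 3, a minimal fix to the original write-the-raw-text approach (keeping close to the asker's variable names) would be:
# minimal sketch, assuming Python 3: open in text mode so a str can be written
outfile = open("Weather2017.csv", 'w', encoding='utf-8')
text = soup.find(id="obsTable").text.replace('\n', '', 1)
outfile.write(text.replace('\n', ',London\n'))
outfile.close()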

The tags causing a problem in your code when applying BeautifulSoup() are <tbody>, </tbody>, <thead>, and </thead>. If you get rid of them, everything will work well!
Here is a solution using pandas, regex and some other libs :)
#needed imports
import pandas as pd
import numpy as numpy
from bs4 import BeautifulSoup
import requests
import re
# get page html code
url = 'https://www.wunderground.com/history/airport/EGLL/2017/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2017&req_city=NA&req_state=NA&req_statename=NA'
req = requests.get(url)
html = req.text
soup = (BeautifulSoup(html, 'html.parser'))
#removing tags that cause problems using re library
patterns = ['<tbody>','</tbody>','<thead>','</thead>']
cleaned_html= soup.prettify()
for pat in patterns:
    cleaned_html = re.sub(pat, '', cleaned_html)
df = pd.read_html(cleaned_html, attrs={'id':'obsTable'})[0]
df.head()
# build a hierarchical columns
df.columns = [['2017',
'Temp. (°C)','Temp. (°C)','Temp. (°C)',
'Dew Point (°C)','Dew Point (°C)','Dew Point (°C)',
'Humidity (%)','Humidity (%)','Humidity (%)',
'Sea Level Press. (hPa)','Sea Level Press. (hPa)','Sea Level Press. (hPa)',
'Visibility (km)','Visibility (km)','Visibility (km)',
'Wind (km/h)', 'Wind (km/h)','Wind (km/h)',
'Precip. (mm)', 'Events'],
['Jan',
'high','avg','low',
'high','avg','low',
'high','avg','low',
'high','avg','low',
'high','avg','low',
'high','avg','high',
'sum',
'nan']]
df.head()
#removing the first un-needed rows
df = df.drop([0,1], axis=0)
df.reset_index(inplace=True, drop=True)
df.head()
#save the result to CSV file
df.to_csv('weather.csv')

Related

Can't get all the data of one column using BeautifulSoup

I'm using BeautifulSoup to extract some data off of a wiki, but I can only get the first entry of a specific column. If my understanding of for loops is correct, it should loop through everything in the table. I tested this by printing "t" to the console, and it shows all the data in HTML format. Is there a reason why this is happening?
from bs4 import BeautifulSoup
import requests, csv
import pandas as pd
wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
table_id = "wikitable"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')
#table = soup.find('table', class_="wikitable")
table = soup.find_all('table', class_="wikitable")
with open('chinesewords.csv', 'w', encoding='utf8', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified, pinyin"])
    for t in table:
        simplified = t.find('span', class_="Hans").text
        print(simplified)
The output:
一
(I apologize in advance if I didn't follow the rules of StackOverflow posting, as this is my first time posting a question)
Make your life easier and try pandas.read_html().
Here's an example:
import requests
import pandas as pd
table = (
pd
.read_html(
requests
.get(
"https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
).text,
flavor="lxml"
)[0]
)
table.to_csv("mandarin_frequency_lists.csv", index=False)
Output:
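If you only need the simplified and pinyin columns the question mentions, you can sub-select them before saving. A hedged sketch (the exact column labels depend on how read_html parses the wiki table, so check table.columns first):
# assumption: the parsed labels include "Simplified" and "Pinyin"; inspect table.columns to confirm
cols = [c for c in table.columns if str(c).strip().lower() in ("simplified", "pinyin")]
table[cols].to_csv("mandarin_simplified_pinyin.csv", index=False)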
If you mean data from one column of the table, the following code is enough. I hope it helps:
from bs4 import BeautifulSoup
import requests, csv
import pandas as pd
wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')
table_column = soup.find_all('span', class_="Hans")
with open('chinesewords.csv', 'w', encoding='utf32', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified, pinyin"])
    for t in table_column:
        simplified = t.text
        print(simplified)
        writer.writerow([simplified])  # wrap in a list so each word lands in one cell

Finding <caption class="table-title">

So I have written a script that scrapes tables from a website and saves them to an Excel sheet:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import ExcelWriter
import os.path
path = "C:...."
url= 'https://zoek.officielebekendmakingen.nl/kst-35570-2.html'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
tables_df = pd.read_html(url, attrs = {'class': 'kio2 portrait'})
tables = soup.find_all('table', class_="kio2 portrait")
titles = []
for table in tables:
    print(table)
    title = table.find_all("caption", class_="table-title")
    titles.append(title)
titles = []
writer = pd.ExcelWriter('output.xlsx')
for i, df in enumerate(tables_df, 1):
    df.to_excel(writer, index=True, sheet_name=f'sheetName_{i}')
writer.save()
This works, but now I want to find all the titles of these tables so I can give each sheet its title. For example, the first table has the following text which I am interested in:
<table cellpadding="0" cellspacing="0" class="kio2 portrait" summary="Tabel 1.1 Budgettaire kerngegevens"><caption class="table-title">Tabel 1.1 Budgettaire kerngegevens</caption>
Now I want to scrape the part between <caption class="table-title"> and </caption>. Or, which is also a possibility, use the summary attribute. How can I achieve this? I have tried it within the code above but have not found anything yet.
Try:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pandas import ExcelWriter
url = "https://zoek.officielebekendmakingen.nl/kst-35570-2.html"
soup = BeautifulSoup(requests.get(url).text, "html.parser")
writer = pd.ExcelWriter("output.xlsx")
for i, table in enumerate(soup.find_all("table", class_="kio2 portrait"), 1):
    df = pd.read_html(str(table))[0]
    caption = table.get("summary", "").replace(":", "").strip()
    # some tables don't contain a summary, so make a generic sheet name:
    if not caption:
        caption = f"table {i}"
    df.to_excel(writer, sheet_name=caption)
writer.save()
This creates output.xlsx with 185 sheets (at least when opening it in my LibreOffice).
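If you would rather use the <caption class="table-title"> element the question mentions instead of the summary attribute, a hedged variation of the loop body could look like this (keep in mind Excel sheet names are limited to 31 characters):
# sketch: prefer the caption text, fall back to summary, then to a generic name
caption_tag = table.find("caption", class_="table-title")
caption = caption_tag.get_text(strip=True) if caption_tag else table.get("summary", "")
caption = caption.replace(":", "").strip()[:31] or f"table {i}"
df.to_excel(writer, sheet_name=caption)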

BeautifulSoup: Scraping CSV list of URLs

I have been trying to download data from different URLs and then save it to a CSV file.
The idea is to extract the highlighted data from: https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
So far I built the following piece of code:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
url_is = 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow'
read_data = ur.urlopen(url_is).read()
soup_is=BeautifulSoup(read_data, 'lxml')
row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
data=[cell.text for cell in row.parent.select('td') if cell.text!='']
df=pd.DataFrame(data)
print(df.T)
I get as an output:
All good so far.
Now my idea is to extract specific classes from multiple URLs, keep the same headers from the website and export it to a .csv.
The tags and classes stay the same.
Sample URLs:
https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow
Code (I wanted to try with 2 columns: 2015 and 2016)
As desired output, I would like something like this:
I wrote the following code, but it is giving me issues; any help or advice is welcome:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
import numpy as np
import requests
links = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow', 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
container = pd.DataFrame(columns=['Name', 'Name2'])
pos=0
for l in links:
    read_data = ur.urlopen(l).read()
    soup_is = BeautifulSoup(read_data, 'lxml')
    row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
    results = [cell.text for cell in row.parent.select('td') if cell.text != '']
    records = []
    for result in results:
        records = []
        Name = result.find('span', attrs={'itemprop':'2015'}).text if result.find('span', attrs={'itemprop':'2015'}) is not None else ''
        Name2 = result.find('span', attrs={'itemprop':'2016'}).text if result.find('span', attrs={'itemprop':'2016'}) is not None else ''
        records.append(Name)
        records.append(Name2)
    container.loc[pos] = records
    pos += 1
import requests
import pandas as pd
urls = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow',
'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
def main(urls):
    with requests.Session() as req:
        goal = []
        for url in urls:
            r = req.get(url)
            df = pd.read_html(
                r.content, match="Cash Dividends Paid - Total")[0].iloc[[0], 0:3]
            goal.append(df)
        new = pd.concat(goal)
        print(new)

main(urls)
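The question also asks to save the result to a CSV file. Assuming one file covering all tickers is wanted, a small addition at the end of main() could be (the file name here is just an example):
new.to_csv("cash_dividends.csv", index=False)  # hypothetical output file name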

BeautifulSoup with Table

I'm web scraping with Beautiful Soup and I am getting an error on line 13: for row in table.findAll('tr').
It's throwing an error in the cmd. Hope someone can help.
import csv
import requests
from bs4 import BeautifulSoup
url='http://www.dublincity.ie/dublintraffic/carparks.htm'
response = requests.get(url)
html= response.content
soup=BeautifulSoup(html)
table=soup.find('tbody', attrs={'id' :'itemsBody'})
list_of_rows=[]
for row in table.findAll('tr'):
    list_of_cells = []
    for cell in row.findAll('td'):
        text = cell.text.replace(' ', '')
        list_of_cells.append(text)
    list_of_rows.append(list_of_cells)
outfile= open("./carpark.csv", "wb")
writer=csv.writer(outfile)
writer.writerows(["location","spaces"])
writer.writerows(list_of_rows)
If you want to stick to BeautifulSoup, you can fetch and write the content using its xml parser along with csv.DictWriter(). Check out the implementation:
import csv
import requests
from bs4 import BeautifulSoup
url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'
res = requests.get(url)
soup = BeautifulSoup(res.content,"xml")
data = []
for item in soup.select("carpark"):
    ditem = {}
    ditem['Name'] = item.get("name")
    ditem['Spaces'] = item.get("spaces")
    data.append(ditem)

with open("xmldocs.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, ["Name", "Spaces"])
    writer.writeheader()
    for info in data:
        writer.writerow(info)
You could retrieve the data as an XML document and then parse it. This is just an example of part of the process, which you could tailor.
import requests
from xml.etree import ElementTree
import pandas as pd
url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'
xml_data = requests.get(url).content
tree = ElementTree.fromstring(xml_data)
parking = []
for child in tree:
    for nextChild in child:
        parking.append([child.tag, nextChild.attrib['name'], nextChild.attrib['spaces']])
df = pd.DataFrame(parking)
print(df)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8',index = False )
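If you want named columns rather than the default 0/1/2 headers, you could label the three appended fields when building the frame. A hedged tweak (the labels are only an assumption about what each field holds):
# hypothetical column labels for the three fields appended above
df = pd.DataFrame(parking, columns=['area', 'carpark', 'spaces'])
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8', index=False)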

Scraping tweets from wapo gone awry

Two issues.
Goal: ETL a 3-column CSV with column headings of date, time, and tweet.
1. My attempts at extracting the span text/time out of the li result in duplicating the span info inside the time and tweet columns. It's my first week working with Python; I've tried to replace() the tweet column's 'time' with "", but I end up removing both 'time' instances.
2. Combining the columns together in order, i.e. correctly mixing the data columns together as they appear. The code I write results in either 30,000 or 1,000 lines; the correct CSV file should be around 520 lines.
import bs4 as bs
import requests, urllib.request, csv
from urllib.request import urlopen
sauce = urllib.request.urlopen('https://www.washingtonpost.com/graphics/politics/100-days-of-trump-tweets/?utm_term=.0c2052f6d858').read()
soup = bs.BeautifulSoup(sauce, 'html.parser')
lists = soup.find_all('li', class_='visible')
dates = soup.find_all("li", attrs={"data-date": True})
tweet_data = ['date, time, tweets']
for li in dates[1:]:
    date = li['data-date']
    tweet_data.append([date])

for list in lists[1:]:
    time = list.find_all('span', {"class": "gray"})[0].text
    tweets = list.text
    tweet_data.append([time, tweets])

with open('tweets_attempt_8.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(tweet_data)
Here is code that produces the output you need.
I hope you are satisfied with this answer.
import bs4 as bs
import csv
import urllib.request

url = 'https://www.washingtonpost.com/graphics/politics/100-days-of-trump-tweets/?utm_term=.0c2052f6d858'
sauce = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
con = urllib.request.urlopen(sauce)
data = con.read()
soup = bs.BeautifulSoup(data, 'html.parser')
lists = soup.find_all('li', class_='visible')
dates = soup.find_all("li", attrs={"data-date": True})
tweet_data = [['date', 'time', 'tweets']]  # header as a list so csv writes three columns
for li, list in zip(dates[1:], lists[1:]):
    date = li['data-date']
    time = list.find_all('span', {"class": "gray"})[0].text
    tweets = list.text
    tweet_data.append([date, time, tweets])

with open('/tmp/tweets_attempt_8.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(tweet_data)
This produces the output you want.
Try this. There are 504 items on that page which you want to parse; you will get all of them in the CSV output.
import csv
import requests
from bs4 import BeautifulSoup

with open('tweets_attempt_8.csv', 'w', newline='', encoding='utf8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['date', 'time', 'tweets'])
    sauce = requests.get('https://www.washingtonpost.com/graphics/politics/100-days-of-trump-tweets/?utm_term=.0c2052f6d858', headers={"User-Agent": "Existed"}).text
    soup = BeautifulSoup(sauce, "html.parser")
    for item in soup.select("li.pg-excerpt.visible"):
        date = item.get('data-date')
        time = item.select("span.gray")[0].text
        title = item.text.strip()
        print(date, time, title[10:])
        writer.writerow([date, time, title[10:]])
