Can't get all the data of one column using BeautifulSoup - python

I'm using BeautifulSoup to extract some data from a wiki, but I can only get the first entry of a specific column. If my understanding of for loops is correct, the loop should go through everything in the table. I tested this by printing "t" to the console, and it shows all the data in HTML format. Is there a reason why this is happening?
from bs4 import BeautifulSoup
import requests, csv
import pandas as pd

wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
table_id = "wikitable"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')

#table = soup.find('table', class_="wikitable")
table = soup.find_all('table', class_="wikitable")

with open('chinesewords.csv', 'w', encoding='utf8', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified, pinyin"])
    for t in table:
        simplified = t.find('span', class_="Hans").text
        print(simplified)
The output:
一
(I apologize in advance if I didn't follow the rules of StackOverflow posting, as this is my first time posting a question)

Make your life easier and try pandas.read_html().
Here's an example:
import requests
import pandas as pd
table = (
    pd
    .read_html(
        requests
        .get(
            "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
        ).text,
        flavor="lxml"
    )[0]
)
table.to_csv("mandarin_frequency_lists.csv", index=False)
Output:
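If you only need the simplified characters and the pinyin rather than the whole table, you can trim the frame before writing it out. A minimal sketch, assuming the parsed headers include columns literally named "Simplified" and "Pinyin" (print table.columns first to check the exact labels on your run):

subset = table[["Simplified", "Pinyin"]]   # column labels are an assumption -- confirm them with print(table.columns)
subset.to_csv("mandarin_simplified_pinyin.csv", index=False)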

If you mean the data from one column of the table, the following code is enough. The reason your loop printed only one value is that t.find() returns just the first matching element inside each table, so iterating over the tables gives you one span per table; search for all the spans instead. I hope I helped:
from bs4 import BeautifulSoup
import requests, csv
import pandas as pd

wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')

table_column = soup.find_all('span', class_="Hans")

with open('chinesewords.csv', 'w', encoding='utf32', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified", "pinyin"])
    for t in table_column:
        simplified = t.text
        print(simplified)
        writer.writerow([simplified])  # wrap the string in a list so csv doesn't split it into individual characters
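If you want the pinyin paired with each simplified form rather than a single column, iterating over the table rows keeps the values aligned. A rough sketch under the assumption that each data row carries the simplified form in a span with class "Hans" and the pinyin in a span with class "Latn" (verify those class names in the page's markup before relying on them):

from bs4 import BeautifulSoup
import requests, csv

wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
soup = BeautifulSoup(requests.get(wiki_url).text, 'html.parser')
table = soup.find('table', class_="wikitable")

with open('chinesewords.csv', 'w', encoding='utf8', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified", "pinyin"])
    for row in table.find_all('tr'):
        hans = row.find('span', class_="Hans")    # simplified character cell
        latn = row.find('span', class_="Latn")    # assumed class for the pinyin span
        if hans and latn:                         # skip header rows and rows missing either value
            writer.writerow([hans.text, latn.text])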

Related

Finding <caption class="table-title">

So I have written a script that scrapes tables from a website and saves them to an Excel sheet:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import ExcelWriter
import os.path

path = "C:...."
url = 'https://zoek.officielebekendmakingen.nl/kst-35570-2.html'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

tables_df = pd.read_html(url, attrs={'class': 'kio2 portrait'})
tables = soup.find_all('table', class_="kio2 portrait")

titles = []
for table in tables:
    print(table)
    title = table.find_all("caption", class_="table-title")
    titles.append(title)
titles = []

writer = pd.ExcelWriter('output.xlsx')
for i, df in enumerate(tables_df, 1):
    df.to_excel(writer, index=True, sheet_name=f'sheetName_{i}')
writer.save()
This works, but now I want to find all the titles of these tables so I can give each sheet its title. For example, the first table has the following text that I am interested in:
<table cellpadding="0" cellspacing="0" class="kio2 portrait" summary="Tabel 1.1 Budgettaire kerngegevens"><caption class="table-title">Tabel 1.1 Budgettaire kerngegevens</caption>
Now I want to scrape the part between <caption class="table-title"> and </caption>. Or, which is also a possibility, use the summary attribute. How can I achieve this? I have tried it within the code, but I haven't found anything yet.
Try:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pandas import ExcelWriter

url = "https://zoek.officielebekendmakingen.nl/kst-35570-2.html"
soup = BeautifulSoup(requests.get(url).text, "html.parser")

writer = pd.ExcelWriter("output.xlsx")
for i, table in enumerate(soup.find_all("table", class_="kio2 portrait"), 1):
    df = pd.read_html(str(table))[0]
    caption = table.get("summary", "").replace(":", "").strip()
    # some tables don't contain a summary, so make a generic sheet name:
    if not caption:
        caption = f"table {i}"
    df.to_excel(writer, sheet_name=caption)
writer.save()
This creates output.xlsx with 185 sheets (at least when opening it in my LibreOffice):
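If you want the visible caption text instead of the summary attribute (as asked in the question), each table's <caption class="table-title"> can be read directly. A small variant of the same loop, as a sketch (the [:31] slice is my own addition to respect Excel's 31-character sheet-name limit):

for i, table in enumerate(soup.find_all("table", class_="kio2 portrait"), 1):
    df = pd.read_html(str(table))[0]
    caption_tag = table.find("caption", class_="table-title")
    # fall back to a generic name when a table has no caption
    caption = caption_tag.get_text(strip=True).replace(":", "") if caption_tag else f"table {i}"
    df.to_excel(writer, sheet_name=caption[:31])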

How to scrape the website properly and getting all td texts from website

I am new to Python. Does anyone know, in {sum(int(td.text) for td in soup.select('td:last-child')[1:])}, what the use of [1:] is, or of [0] or [1]? I have seen it in many scraping examples, usually below a for-in loop. Also, as I was practicing I built the code below and I am not able to scrape all the data into a CSV file. Thanks in advance, and sorry for asking two questions at once.
import requests
from bs4 import BeautifulSoup
import csv

url = "https://iplt20.com/stats/2020/most-runs"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')

lst = []
table = soup.find('div', attrs={'class': 'js-table'})

#for row in table.findAll('div', attrs={'class': 'top-players__player-name'}):
#    score = {}
#    score['Player'] = row.a.text.strip()
#    lst.append(score)

for row in table.findAll(class_='top-players__m top-players__padded '):
    score = {}
    score['Matches'] = int(row.td.text)
    lst.append(score)

filename = 'iplStat.csv'
with open(filename, 'w', newline='') as f:
    w = csv.DictWriter(f, ['Player', 'Matches'])
    w.writeheader()
    for score in lst:
        w.writerow(score)
print(lst)
All of this is not even needed. Just use pandas:
import requests
import pandas as pd

url = "https://iplt20.com/stats/2020/most-runs"
r = requests.get(url)
df = pd.read_html(r.content)[0]
df.to_csv("iplStats.csv", index=False)
Screenshot of csv file:
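As for the first question: [0], [1] and [1:] are plain Python indexing and slicing applied to the list of tags that select() or find_all() returns. [0] takes the first match, [1] the second, and [1:] drops the first match (often a header cell) and keeps the rest. A quick illustration on an ordinary list:

cells = ["Matches", "10", "20", "30"]      # imagine these are the td texts in one column
print(cells[0])                            # 'Matches' -> the first element
print(cells[1])                            # '10'      -> the second element
print(cells[1:])                           # ['10', '20', '30'] -> everything except the first element
print(sum(int(c) for c in cells[1:]))      # 60 -> skip the header, then sum the numeric cells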

BeautifulSoup: Scraping CSV list of URLs

I have been trying to download data from different urls and then save it to a csv file.
The idea is to extract the highlighted data from: https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
So far I built the following piece of code:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
url_is = 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow'
read_data = ur.urlopen(url_is).read()
soup_is = BeautifulSoup(read_data, 'lxml')
row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
data = [cell.text for cell in row.parent.select('td') if cell.text != '']
df = pd.DataFrame(data)
print(df.T)
I get as an output:
All good so far.
Now my idea is to extract specific classes from multiple URLs, keep the same headers from the website and export it to a .csv.
The tags and classes stay the same
Sample URLs:
https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow
Code (I wanted to try with 2 columns: 2015 and 2016)
As desired output I would like something like:
I wrote the following code, but it is giving me issues; any help or advice is welcome:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
import numpy as np
import requests

links = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow', 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']

container = pd.DataFrame(columns=['Name', 'Name2'])
pos = 0
for l in links:
    read_data = ur.urlopen(l).read()
    soup_is = BeautifulSoup(read_data, 'lxml')
    row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
    results = [cell.text for cell in row.parent.select('td') if cell.text != '']
    records = []
    for result in results:
        records = []
        Name = result.find('span', attrs={'itemprop':'2015'}).text if result.find('span', attrs={'itemprop':'2015'}) is not None else ''
        Name2 = result.find('span', attrs={'itemprop':'2016'}).text if result.find('span', attrs={'itemprop':'2016'}) is not None else ''
        records.append(Name)
        records.append(Name2)
        container.loc[pos] = records
        pos += 1
import requests
import pandas as pd

urls = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow',
        'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']

def main(urls):
    with requests.Session() as req:
        goal = []
        for url in urls:
            r = req.get(url)
            df = pd.read_html(
                r.content, match="Cash Dividends Paid - Total")[0].iloc[[0], 0:3]
            goal.append(df)
        new = pd.concat(goal)
        print(new)

main(urls)
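Since the goal was a .csv rather than just printed output, the combined frame can be written out directly. A one-line, hedged addition inside main(), right after new = pd.concat(goal) (the file name here is only an example):

new.to_csv("cash_dividends.csv", index=False)  # write the combined rows to a CSV file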

Wikitables to CSV

I feel like I'm close to getting there, but I can't think of a way to turn these lists into a .csv. Could anybody help?
import requests
from bs4 import BeautifulSoup
import pandas as pd
wiki = requests.get('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States')
soup = BeautifulSoup(wiki.content, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable')
column_names = [item.get_text() for item in tables[0].find_all('th')]
content = [item.get_text() for item in tables[0].find_all('td')]
df = pd.DataFrame(columns=column_names)
Try with this:
import requests
from bs4 import BeautifulSoup
import pandas as pd

wiki = requests.get('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States')
soup = BeautifulSoup(wiki.content, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable')
#column_names = [item.get_text() for item in tables[0].find_all('th')]

frames = []
for x in tables:
    df = pd.read_html(str(x))[0]   # read_html returns a list of DataFrames, one per table
    frames.append(df)
# DataFrame.append has been removed from recent pandas, so concatenate the collected frames instead
alltables = pd.concat(frames, ignore_index=True)
print(alltables)
alltables.to_csv('test.csv')

Html parser to dataframe

I am struggling to convert an HTML table to a dataframe. I would like to write the table to a CSV file.
from requests import session
import sys
import csv
from bs4 import BeautifulSoup
c = session()
outfile = open("Weather2017.csv", 'wb')
response = c.get('http://www.wunderground.com/history/airport/EGLL/2017/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2017&req_city=NA&req_state=NA&req_statename=NA')
soup = BeautifulSoup(response.text, "html.parser")
soup = soup.find(id="obsTable").text.replace('\n','',1)
outfile.write(soup.replace('\n',',London\n'))
The TypeError is the following:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-11-1e149d844e15> in <module>()
11 soup = BeautifulSoup(response.text, "html.parser")
12 soup = soup.find(id="obsTable").text.replace('\n','',1)
---> 13 outfile.write(soup.replace('\n',',London\n'))
14
15
TypeError: a bytes-like object is required, not 'str'
This is the table I want to convert into a CSV file.
Can anyone help me?
Thanks in advance!
The TypeError happens because the file is opened in binary mode ('wb') while the csv writer hands it str data. How about this,
from requests import session
import sys
import csv
from bs4 import BeautifulSoup

c = session()
response = c.get('http://www.wunderground.com/history/airport/EGLL/2017/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2017&req_city=NA&req_state=NA&req_statename=NA')
soup = BeautifulSoup(response.text, "html.parser")

table = soup.find(id="obsTable")
headers = [header.text.strip() for header in table.find_all('th')]

rows = []
for row in table.find_all('tr'):
    rows.append([val.text.strip() for val in row.find_all('td')])
del rows[0]  # Remove header row. Added as empty.

# open in text mode with an explicit encoding so the csv writer can write str rows on Python 3
with open('Weather2017.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(row for row in rows if row)
The things causing a problem in your code when applying BeautifulSoup() are these tags: tbody, /tbody, thead, /thead. If you get rid of them, everything will work well!
Here is a solution using pandas, regex and some other libs :)
# needed imports
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re

# get page html code
url = 'https://www.wunderground.com/history/airport/EGLL/2017/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=2017&req_city=NA&req_state=NA&req_statename=NA'
req = requests.get(url)
html = req.text
soup = BeautifulSoup(html, 'html.parser')

# removing tags that cause problems using the re library
patterns = ['<tbody>', '</tbody>', '<thead>', '</thead>']
cleaned_html = soup.prettify()
for pat in patterns:
    cleaned_html = re.sub(pat, '', cleaned_html)

df = pd.read_html(cleaned_html, attrs={'id': 'obsTable'})[0]
df.head()

# build hierarchical columns
df.columns = [['2017',
               'Temp. (°C)', 'Temp. (°C)', 'Temp. (°C)',
               'Dew Point (°C)', 'Dew Point (°C)', 'Dew Point (°C)',
               'Humidity (%)', 'Humidity (%)', 'Humidity (%)',
               'Sea Level Press. (hPa)', 'Sea Level Press. (hPa)', 'Sea Level Press. (hPa)',
               'Visibility (km)', 'Visibility (km)', 'Visibility (km)',
               'Wind (km/h)', 'Wind (km/h)', 'Wind (km/h)',
               'Precip. (mm)', 'Events'],
              ['Jan',
               'high', 'avg', 'low',
               'high', 'avg', 'low',
               'high', 'avg', 'low',
               'high', 'avg', 'low',
               'high', 'avg', 'low',
               'high', 'avg', 'high',
               'sum',
               'nan']]
df.head()

# removing the first un-needed rows
df = df.drop([0, 1], axis=0)
df.reset_index(inplace=True, drop=True)
df.head()

# save the result to a CSV file
df.to_csv('weather.csv')
