Finding <caption class="table-title"> - python

So I have written a script that scrapes tables from a website and saves them to an Excel sheet:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import ExcelWriter
import os.path

path = "C:...."
url = 'https://zoek.officielebekendmakingen.nl/kst-35570-2.html'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

tables_df = pd.read_html(url, attrs={'class': 'kio2 portrait'})
tables = soup.find_all('table', class_="kio2 portrait")

titles = []
for table in tables:
    print(table)
    title = table.find_all("caption", class_="table-title")
    titles.append(title)
titles = []

writer = pd.ExcelWriter('output.xlsx')
for i, df in enumerate(tables_df, 1):
    df.to_excel(writer, index=True, sheet_name=f'sheetName_{i}')
writer.save()
This works, but now I want to find the titles of these tables so I can give each sheet its title. For example, this is the part of the first table I am interested in:
<table cellpadding="0" cellspacing="0" class="kio2 portrait" summary="Tabel 1.1 Budgettaire kerngegevens"><caption class="table-title">Tabel 1.1 Budgettaire kerngegevens</caption>
Now I want to scrape the part between <caption class="table-title"> and </caption>. Alternatively, I could use the summary attribute. How can I achieve this? I have tried it in the code above, but I haven't found anything yet.

Try:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pandas import ExcelWriter

url = "https://zoek.officielebekendmakingen.nl/kst-35570-2.html"
soup = BeautifulSoup(requests.get(url).text, "html.parser")

writer = pd.ExcelWriter("output.xlsx")
for i, table in enumerate(soup.find_all("table", class_="kio2 portrait"), 1):
    df = pd.read_html(str(table))[0]
    caption = table.get("summary", "").replace(":", "").strip()
    # some tables don't contain a summary, so make a generic sheet name:
    if not caption:
        caption = f"table {i}"
    df.to_excel(writer, sheet_name=caption)
writer.save()
This creates output.xlsx with 185 sheets (at least when I open it in LibreOffice).
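If you want the caption text rather than the summary attribute, here is a sketch of the same loop (assuming every table keeps the caption/table-title markup shown in the question; note that Excel limits sheet names to 31 characters and disallows characters such as : \ / ? * [ ]):

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://zoek.officielebekendmakingen.nl/kst-35570-2.html"
soup = BeautifulSoup(requests.get(url).text, "html.parser")

with pd.ExcelWriter("output.xlsx") as writer:
    for i, table in enumerate(soup.find_all("table", class_="kio2 portrait"), 1):
        df = pd.read_html(str(table))[0]
        # take the text inside <caption class="table-title">, if present
        caption = table.find("caption", class_="table-title")
        title = caption.get_text(strip=True) if caption else f"table {i}"
        # trim the title so it is a valid Excel sheet name
        title = title.replace(":", "")[:31]
        df.to_excel(writer, sheet_name=title)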

Related

Can't get all the data of one column using BeautifulSoup

I'm using BeautifulSoup to extract some data from a wiki, but I can only get the first entry of a specific column. If my understanding of for loops is correct, it should loop through everything in the table. I tested this by printing "t" to the console and it shows all the data in HTML format. Is there a reason why this is happening?
from bs4 import BeautifulSoup
import requests, csv
import pandas as pd

wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
table_id = "wikitable"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')

#table = soup.find('table', class_="wikitable")
table = soup.find_all('table', class_="wikitable")

with open('chinesewords.csv', 'w', encoding='utf8', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified, pinyin"])

    for t in table:
        simplified = t.find('span', class_="Hans").text
        print(simplified)
The output:
一
(I apologize in advance if I didn't follow the rules of StackOverflow posting, as this is my first time posting a question)
Make your life easier and try pandas.read_html().
Here's an example:
import requests
import pandas as pd

table = (
    pd.read_html(
        requests.get(
            "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
        ).text,
        flavor="lxml",
    )[0]
)
table.to_csv("mandarin_frequency_lists.csv", index=False)
If you mean data from one column of the table, the following code is enough. I hope it helps:
from bs4 import BeautifulSoup
import requests, csv
import pandas as pd

wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')

table_column = soup.find_all('span', class_="Hans")

with open('chinesewords.csv', 'w', encoding='utf32', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified, pinyin"])
    for t in table_column:
        simplified = t.text
        print(simplified)
        # wrap the string in a list so it is written as one cell, not split per character
        writer.writerow([simplified])
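For reference, the reason the original loop printed only one value is that t.find('span', class_="Hans") returns just the first matching span in each table, not one per row. A minimal sketch that stays close to the original code, using find_all inside each table (assuming the "Hans" spans are the cells you want):

from bs4 import BeautifulSoup
import requests, csv

wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
soup = BeautifulSoup(requests.get(wiki_url).text, 'html.parser')

with open('chinesewords.csv', 'w', encoding='utf8', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified"])
    for table in soup.find_all('table', class_="wikitable"):
        # find_all returns every matching span in the table, not just the first
        for span in table.find_all('span', class_="Hans"):
            writer.writerow([span.text])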

Issue using BeautifulSoup and reading target URLs from a CSV

Everything works as expected when I use a single URL for the URL variable to scrape, but I'm not getting any results when attempting to read the links from a CSV. Any help is appreciated.
Info about the CSV:
One column with a header called "Links"
300 rows of links with no space, comma, semicolon, or other characters before/after the links
One link in each row
import requests # required to make request
from bs4 import BeautifulSoup # required to parse html
import pandas as pd
import csv

with open("urls.csv") as infile:
    reader = csv.DictReader(infile)
    for link in reader:
        res = requests.get(link['Links'])
        #print(res.url)

url = res
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

email_elm0 = soup.find_all(class_= "app-support-list__item")[0].text.strip()
email_elm1 = soup.find_all(class_= "app-support-list__item")[1].text.strip()
email_elm2 = soup.find_all(class_= "app-support-list__item")[2].text.strip()
email_elm3 = soup.find_all(class_= "app-support-list__item")[3].text.strip()

final_email_elm = (email_elm0,email_elm1,email_elm2,email_elm3)
print(final_email_elm)

df = pd.DataFrame(final_email_elm)
#getting an output in csv format for the dataframe we created
#df.to_csv('draft_part2_scrape.csv')
The problem lies in this part of the code:
with open("urls.csv") as infile:
    reader = csv.DictReader(infile)
    for link in reader:
        res = requests.get(link['Links'])
...
After the loop finishes, res only holds the response for the last link, so this program only scrapes that last page.
To solve this, store all the links in a list and iterate over that list to scrape each link. You can store each scraped result in a separate dataframe and concatenate them at the end into a single file:
import requests # required to make request
from bs4 import BeautifulSoup # required to parse html
import pandas as pd
import csv

links = []
with open("urls.csv") as infile:
    reader = csv.DictReader(infile)
    for link in reader:
        links.append(link['Links'])

dfs = []
for url in links:
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')

    email_elm0 = soup.find_all(class_="app-support-list__item")[0].text.strip()
    email_elm1 = soup.find_all(class_="app-support-list__item")[1].text.strip()
    email_elm2 = soup.find_all(class_="app-support-list__item")[2].text.strip()
    email_elm3 = soup.find_all(class_="app-support-list__item")[3].text.strip()

    final_email_elm = (email_elm0, email_elm1, email_elm2, email_elm3)
    print(final_email_elm)

    dfs.append(pd.DataFrame(final_email_elm))

#getting an output in csv format for the dataframe we created
df = pd.concat(dfs)
df.to_csv('draft_part2_scrape.csv')
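Since pandas is already imported, a slightly shorter way to load the links (a sketch, assuming the same urls.csv with a "Links" header) is:

import pandas as pd

# read the Links column straight into a list of URLs
links = pd.read_csv("urls.csv")["Links"].tolist()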

Wikitables to CSV

I feel like I'm close to getting there, but I can't think of a way to turn these lists into a .csv. Could anybody help?
import requests
from bs4 import BeautifulSoup
import pandas as pd
wiki = requests.get('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States')
soup = BeautifulSoup(wiki.content, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable')
column_names = [item.get_text() for item in tables[0].find_all('th')]
content = [item.get_text() for item in tables[0].find_all('td')]
df = pd.DataFrame(columns=column_names)
Try with this:
import requests
from bs4 import BeautifulSoup
import pandas as pd

wiki = requests.get('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States')
soup = BeautifulSoup(wiki.content, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable')
#column_names = [item.get_text() for item in tables[0].find_all('th')]

alltables = pd.DataFrame()
for x in tables:
    # read_html returns a list of DataFrames; take the first (and only) one
    df = pd.read_html(str(x))[0]
    # note: DataFrame.append was removed in pandas 2.0; see the pd.concat variant below
    alltables = alltables.append(df, ignore_index=True)
print(alltables)

#appended_data = pd.concat(appended_data)
alltables.to_csv('test.csv')
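On newer pandas (2.0 and later) DataFrame.append no longer exists, so here is a sketch of the same idea with pd.concat, reusing the tables found with BeautifulSoup:

import requests
import pandas as pd
from bs4 import BeautifulSoup

wiki = requests.get('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States')
soup = BeautifulSoup(wiki.content, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable')

# read each table into a DataFrame, then stack them into one frame
dfs = [pd.read_html(str(t))[0] for t in tables]
alltables = pd.concat(dfs, ignore_index=True)
alltables.to_csv('test.csv', index=False)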

"How to fix 'AttributeError: 'NoneType' object has no attribute 'tbody'' error in Python?

I expected a CSV file to be created in my desktop directory.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://basketball.realgm.com/ncaa/conferences/Big-12-Conference/3/Kansas/54/nba-players"

# get permission
response = requests.get(url)
# access html files
soup = BeautifulSoup(response.text, 'html.parser')

# creating data frame
columns = ['Player', 'Position', 'Height', 'Weight', 'Draft Year', 'NBA Teams', 'Years',
           'Games Played', 'Points Per Game', 'Rebounds Per Game', 'Assists Per Game']
df = pd.DataFrame(columns=columns)

table = soup.find(name='table', attrs={'class': 'tablesaw', 'data-tablesaw-mode': 'swipe', 'id': 'table-6615'}).tbody
trs = table.find('tr')

# rewording html
for tr in trs:
    tds = tr.find_all('td')
    row = [td.text.replace('\n', '') for td in tds]
    df = df.append(pd.Series(row, index=columns), ignore_index=True)

df.to_csv('kansas_player', index=False)
It looks like your soup.find(...) call cannot find the table, and that might be why you get a None type returned. Here is my change; you can tailor it to your CSV export needs:
from bs4 import BeautifulSoup
import urllib.request
url = "https://basketball.realgm.com/ncaa/conferences/Big-12-Conference/3/Kansas/54/nba-players"
# get permission
response = urllib.request.urlopen(url)
# access html files
html = response.read()
soup = BeautifulSoup(html)
table = soup.find("table", {"class": "tablesaw"})
At this point, table holds the full table content.
From there on, you can easily extract the table row information, for example:
for tr in table.findAll('tr'):
    tds = tr.find_all('td')
    row = [td.text.replace('\n', '') for td in tds]
    .....
Now each row is a list of the cell texts.
Finally, you can write each row into the CSV, with or without pandas; your call.
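To round that off, a minimal sketch of the csv-module route (the output filename is an assumption), reusing the table found above:

import csv

with open('kansas_player.csv', 'w', newline='', encoding='utf8') as f:
    writer = csv.writer(f)
    for tr in table.findAll('tr'):
        tds = tr.find_all('td')
        row = [td.text.replace('\n', '') for td in tds]
        # header rows contain <th> rather than <td>, so skip rows with no <td> cells
        if row:
            writer.writerow(row)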

How to webscrape wiki tables of multiple Companies

I am trying to web scrape the wiki tables of multiple companies like Samsung, Alibaba, etc., but I am not able to do so. Below is my code:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

csvFile = open('Information.csv', 'wt+')
writer = csv.writer(csvFile)

lst = ['Samsung','Facebook','Google','Tata_Consultancy_Services','Wipro','IBM','Alibaba_Group','Baidu','Yahoo!','Oracle_Corporation']

for a in lst:
    html = urlopen("https://en.wikipedia.org/wiki/a")
    bs = BeautifulSoup(html, 'html.parser')
    table = bs.findAll('table')
    for tr in table:
        rows = tr.findAll('tr')
        for row in rows:
            csvRow = []
            for cell in row.findAll(['td', 'th']):
                csvRow.append(cell.get_text())
            print(csvRow)
            writer.writerow(csvRow)
You are passing the literal string "a" inside the URL instead of the value of the loop variable a. Here is the corrected code:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

csvFile = open('Information.csv', 'wt+')
writer = csv.writer(csvFile)

lst = ['Samsung','Facebook','Google','Tata_Consultancy_Services','Wipro','IBM','Alibaba_Group','Baidu','Yahoo!','Oracle_Corporation']

for a in lst:
    html = urlopen("https://en.wikipedia.org/wiki/{}".format(a))
    bs = BeautifulSoup(html, 'html.parser')
    table = bs.findAll('table')
    for tr in table:
        rows = tr.findAll('tr')
        for row in rows:
            csvRow = []
            for cell in row.findAll(['td', 'th']):
                csvRow.append(cell.get_text())
            print(csvRow)
            writer.writerow(csvRow)
html = urlopen("https://en.wikipedia.org/wiki/a") is where the problem is.
You're looping through lst to get the URL for each company, but the urlopen call uses a string literal instead of the loop variable.
The way to solve this is to replace html = urlopen("https://en.wikipedia.org/wiki/a") with any one of the following:
html = urlopen("https://en.wikipedia.org/wiki/" + a)
html = urlopen(f"https://en.wikipedia.org/wiki/{a}") #requires python 3.6+
html = urlopen("https://en.wikipedia.org/wiki/{}".format(a))
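If you mainly want the tables as data rather than raw rows, here is a sketch using pandas.read_html in the same loop (the per-company output filenames are an assumption):

import pandas as pd

lst = ['Samsung','Facebook','Google','Tata_Consultancy_Services','Wipro','IBM','Alibaba_Group','Baidu','Yahoo!','Oracle_Corporation']

for a in lst:
    # read_html fetches the page and returns one DataFrame per <table>
    tables = pd.read_html("https://en.wikipedia.org/wiki/{}".format(a))
    combined = pd.concat(tables, ignore_index=True)
    combined.to_csv("{}_tables.csv".format(a), index=False)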
