I am having trouble finding a table while web scraping using Python/Beautiful Soup.
import requests
from bs4 import BeautifulSoup
url = 'https://www.espn.com/nba/player/gamelog/_/id/3907387/ben-simmons'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
t = soup.find_all('table' , class_='Table Table--align-right')
This is returning an empty list.
You can use pandas read_html to read all of the page's tables into a list and pick out the one you need, as below:
import requests
import pandas as pd
url = 'https://www.espn.com/nba/player/gamelog/_/id/3907387/ben-simmons'
html = requests.get(url).content
df_list = pd.read_html(html)  # every <table> on the page, as DataFrames
df = df_list[3]  # the game log was the fourth table when this was written
print(df)
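A note on the empty result: when class_ is given a string containing a space, Beautiful Soup matches only tags whose class attribute is exactly that string, so any extra class on the table makes it miss; a CSS selector matches on both classes regardless of extras. A minimal sketch (the browser-like User-Agent is a precaution, since some sites reject the default python-requests one):
import requests
from bs4 import BeautifulSoup
url = 'https://www.espn.com/nba/player/gamelog/_/id/3907387/ben-simmons'
# Some sites reject the default python-requests User-Agent; a browser-like
# value is often enough to get the real page back.
headers = {'User-Agent': 'Mozilla/5.0'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
# .Table.Table--align-right matches tables that carry both classes, even if
# the class attribute lists others as well.
tables = soup.select('table.Table.Table--align-right')
print(len(tables))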
I'm trying to use BeautifulSoup4 in Orange to scrape data from a list of URLs scraped from that same website.
I have managed to scrape the data from a single page when I set the URL manually.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import csv
import re
url = "https://data.ushja.org/awards-standings/zone-points.aspx?year=2021&zone=1§ion=1901"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
rank = soup.find("table", class_="table-standings-body")
for child in rank.children:
    print(url, child)
and I have been able to scrape the list of URLs I need
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import csv
import re
url = "https://data.ushja.org/awards-standings/zones.aspx?year=2021&zone=1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
rank = soup.find("table", class_="table-standings-body")
link = soup.find('div',class_='contentSection')
url_list = link.find('a').get('href')
for url_list in link.find_all('a'):
    print(url_list.get('href'))
But so far I haven't been able to combine the two to scrape the data from each URL in that list. Can I do that by nesting for loops, and if so, how? Or is there a better way?
I am sorry if this is a basic question, but I only started with Python and web scraping yesterday and I have not been able to figure this out by consulting similar topics.
Try:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://data.ushja.org/awards-standings/zones.aspx?year=2021&zone=1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
# get all links
url_list = []
for a in soup.find("div", class_="contentSection").find_all("a"):
    # html.parser expands the bare &sect entity in the raw hrefs to "§",
    # so put "&sect" back to rebuild a URL the server understands
    url_list.append(a["href"].replace("§", "&sect"))
# get all data from URLs
all_data = []
for url in url_list:
    print(url)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "html.parser")
    h2 = soup.h2
    sub = h2.find_next("p")
    for tr in soup.select("tr:has(td)"):
        all_data.append(
            [
                h2.get_text(strip=True),
                sub.get_text(strip=True),
                *[td.get_text(strip=True) for td in tr.select("td")],
            ]
        )
# save data to CSV
df = pd.DataFrame(
    all_data,
    columns=[
        "title",
        "sub_title",
        "Rank",
        "Horse / Owner",
        "Points",
        "Total Comps",
    ],
)
print(df)
df.to_csv("data.csv", index=None)
This traverses all the URLs and saves all the data to data.csv.
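The .replace call above is needed because the raw hrefs on the page contain &section=..., and html.parser expands the bare &sect entity into the § character; putting &sect back yields a URL the server understands. A quick sketch of the same expansion, using the standard library:
from html import unescape
raw_href = "zone-points.aspx?year=2021&zone=1&section=1901"  # as written in the page source
print(unescape(raw_href))  # the bare &sect entity expands: ...&zone=1§ion=1901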
How can I parse the table from https://ege.hse.ru/rating/2019/81031971/all/?rlist=&ptype=0&vuz-abiturients-budget-order=ge&vuz-abiturients-budget-val=10 with BeautifulSoup and turn it into a pandas DataFrame?
My code:
import requests
from bs4 import BeautifulSoup
url = 'https://ege.hse.ru/rating/2019/81031971/all/?rlist=&ptype=0&vuz-abiturients-budget-order=ge&vuz-abiturients-budget-val=10'
page = requests.get(url)
soup = BeautifulSoup(page.content,"html.parser")
table = soup.find_all("table")
for each_table in table:
    for row in each_table.find_all('tr'):
        for cell in row.find_all("td"):
            print(cell.text)
I tried this:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://ege.hse.ru/rating/2019/81031971/all/?rlist=&ptype=0&vuz-abiturients-budget-order=ge&vuz-abiturients-budget-val=10"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
tbl = soup.find("table", {"id": "MainContent_dataGridView1"})
data_frame = pd.read_html(str(tbl))[0]
print(data_frame)
But it says:
"ValueError: No tables found"
I see only a table with id="transparence_t"
So:
tbl = soup.find("table", {"id": "transparence_t"})
data_frame = pd.read_html(str(tbl))[0]
print(data_frame)
It returns a 698×6 DataFrame.
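If you are unsure which id to target, it can help to list every table's id and size first; a short sketch:
import requests
from bs4 import BeautifulSoup
url = "https://ege.hse.ru/rating/2019/81031971/all/?rlist=&ptype=0&vuz-abiturients-budget-order=ge&vuz-abiturients-budget-val=10"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
# Print each table's id (if any) and its row count to pick the right target.
for t in soup.find_all("table"):
    print(t.get("id"), len(t.find_all("tr")))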
I tested my code in a Jupyter notebook with this code
...
rname = soup.find('p', 'con_tx')
#rnamelis = rname.findAll('p')
rname
from urllib.parse import urljoin
story=[]
#review_text = lis[0].find('p').getText()
#list_soup =soup.find_all('p', 'con_tx')
story=rname.getText()
story
and it worked well.
(result) '전 여친에 ...'
But when I tried to scrape multiple pages
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urljoin
import pandas as pd
import numpy as np
import requests
base_url = 'https://movie.naver.com/movie/bi/mi/basic.nhn?code='
pages =['177374','164102']
url = base_url + pages[0]
story = []
for n in pages:
    # Create url
    url = base_url + n
    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    #print(soup.find('p', 'con_tx'))
    rname = soup.find('p', 'con_tx')
    story = rname.getText()
    data = {story}
    df = pd.DataFrame(data)
df.head()
df.to_csv('./moviestory.csv', sep=',', encoding='EUC-KR')
An error message came out:
ValueError: DataFrame constructor not properly called!
How do I fix my code?
Not sure what you are trying to do, but one thing I notice is that you are overwriting your DataFrame each time through the loop. I also don't know why you initialise story as a list and then wrap it in a set literal ({story}) inside the loop; passing a set to the DataFrame constructor is what raises that ValueError.
from bs4 import BeautifulSoup
import pandas as pd
import requests
base_url = 'https://movie.naver.com/movie/bi/mi/basic.nhn?code='
pages =['177374','164102']
df = pd.DataFrame()
for n in pages:
    # Create url
    url = base_url + n
    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    rname = soup.find('p', 'con_tx')
    story = rname.getText()
    data = [story]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    df = pd.concat([df, pd.DataFrame(data)]).reset_index(drop=True)
df.to_csv('./moviestory.csv', sep=',', encoding='EUC-KR')
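An alternative that avoids growing the DataFrame inside the loop is to collect the rows in a plain list and build the frame once at the end; a sketch, assuming the page still serves the synopsis in p.con_tx:
from bs4 import BeautifulSoup
import pandas as pd
import requests
base_url = 'https://movie.naver.com/movie/bi/mi/basic.nhn?code='
pages = ['177374', '164102']
rows = []
for n in pages:
    res = requests.get(base_url + n)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    # the synopsis paragraph, as in the question
    story = soup.find('p', 'con_tx').get_text()
    rows.append({'code': n, 'story': story})
# Build the frame once, after the loop.
df = pd.DataFrame(rows)
df.to_csv('./moviestory.csv', index=False, encoding='EUC-KR')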
How can I scrape the Yahoo earnings calendar to pull out the dates?
This is for Python 3.
from bs4 import BeautifulSoup as soup
import urllib.request
url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html,'lxml')
table = page_soup.find('p')
print(table)
The output is None.
Beautiful Soup has several find functions you can use to inspect the DOM; please refer to the documentation.
from bs4 import BeautifulSoup as soup
import urllib.request
url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html,'lxml')
table = page_soup.find_all('td')
Dates = []
for something in table:
    try:
        if something['aria-label'] == "Earnings Date":
            Dates.append(something.text)
    except KeyError:
        # cells without an aria-label attribute raise KeyError; skip them
        pass
print(Dates)
Might be off-topic, but since you want to get a table from a webpage, you might consider pandas, which does it in two lines:
import pandas as pd
earnings = pd.read_html('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm')[0]
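From there the dates can be pulled straight out of the frame; a sketch, assuming the column is labelled 'Earnings Date' as in the page's table header:
import pandas as pd
earnings = pd.read_html('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm')[0]
# The column labels come from the page's table header row.
print(earnings['Earnings Date'].tolist())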
Here are two succinct ways:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm&guccounter=1')
soup = bs(r.content, 'lxml')
# using an attribute = value selector
dates = [td.text for td in soup.select('[aria-label="Earnings Date"]')]
# using nth-of-type to get the column
dates = [td.text for td in soup.select('#cal-res-table td:nth-of-type(3)')]
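Of the two, the attribute selector is the more robust: it still matches if the column order ever changes, whereas td:nth-of-type(3) silently picks up whatever lands in the third column.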
As per the subject, I am trying to fetch the table from the page below using BeautifulSoup.
http://www.hkjc.com/english/racing/Horse.asp?HorseNo=T421
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import lxml
import xlrd
HorseNo = ["T421"]
driver = webdriver.PhantomJS(r'D:\Program Files\Python\Path\PhantomJS\bin\phantomjs.exe')
#driver = webdriver.Chrome(r'D:\Program Files\Python\Path\chromedriver.exe')
url = "http://www.hkjc.com/english/racing/horse.asp?HorseNo=" + str(HorseNo)
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, "lxml")
table = soup.find("table", {"class" :"bigborder", "width":"970"}).findAll("tr")
print(table)
for row in table:
    cells = row.findAll("td")
    print(cells)
The print(table) result is fine, but print(cells) does not return every td in the table. Could somebody advise me further? Thanks.
Try the below, using requests instead of Selenium:
from bs4 import BeautifulSoup
import requests
HorseNo = ["T421"]
# use the list element itself; str(HorseNo) would embed the brackets and quotes in the URL
url = "http://www.hkjc.com/english/racing/horse.asp?HorseNo=" + HorseNo[0]
html = requests.get(url).text
soup = BeautifulSoup(html, "lxml")
table = soup.find("table", {"class" :"bigborder", "width":"970"}).findAll("tr")
cells = []
for row in table:
    cell = row.findAll("td")
    cells.append(cell)
print(cells)
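If you want the cell text rather than Tag objects, stripping each td in the same pass keeps it tidy; a small sketch continuing from the snippet above:
# Plain-text rows instead of lists of Tag objects.
rows = [[td.get_text(strip=True) for td in row.findAll("td")] for row in table]
print(rows[:3])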