I am trying to retrieve information from a table at this link: https://ski-resort-stats.com/ski-resorts-in-europe/
The page has a drop-down menu, which I must act on first so that all the entries are on the page and can be selected.
But when I then retrieve the info I am looking for, it does not cover the whole table... I tried adding a sleep between the two actions in case it was linked to that, but nothing changes. Could someone help me with that?
Here is my code below:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome("path/chromedriver")
driver.get("https://ski-resort-stats.com/ski-resorts-in-europe/")
content = driver.page_source
soup = BeautifulSoup(content, "html.parser")

# Select "All" in the drop-down menu to select all the ski resorts
menu = driver.find_element_by_id("table_1_length")
for option in menu.find_elements_by_tag_name('option'):
    if option.text == 'All':
        option.click()
        break

time.sleep(10)

mydivs = soup.find_all("td", {"class": "column-resort-name"})
print(mydivs)
So the last element printed from mydivs is not the last element of the table...
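Note that soup above is parsed from page_source captured before the "All" option was clicked, so it can never reflect the expanded table. A minimal sketch of the fix, re-parsing after the wait and reusing the Selenium setup from the question:

# Re-parse the page AFTER the "All" option was clicked and the table expanded
content = driver.page_source
soup = BeautifulSoup(content, "html.parser")
mydivs = soup.find_all("td", {"class": "column-resort-name"})
print(mydivs[-1])  # should now be the last resort in the table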
All the data is already in the static page, inside the <table>, so requests alone is enough:
import requests
from bs4 import BeautifulSoup

url = "https://ski-resort-stats.com/ski-resorts-in-europe/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

# print some data from the rows
for row in soup.select("#table_1 tbody tr"):
    r = [td.get_text(strip=True) for td in row.select("td")]
    print(r[1])
Prints:
Hemsedal
Geilosiden Geilo
Golm
Hafjell
Voss
Hochschwarzeck
Rossfeld - Berchtesgaden - Oberau
...
Puigmal
Kranzberg-Mittenwald
Wetterstein lifts-Wettersteinbahnen-– Ehrwald
Stuhleck-Spital am Semmering
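If you only need the table as structured data, pandas can also parse it straight from the static HTML. A minimal sketch, assuming pandas and lxml are installed and that the resort table is the first <table> on the page:

import pandas as pd

# read_html returns a list of DataFrames, one per <table> on the page
tables = pd.read_html("https://ski-resort-stats.com/ski-resorts-in-europe/")
df = tables[0]  # assumption: the resort table is the first one on the page
print(df.head())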
I am trying to scrape this site, https://franchisedisclosure.gov.au/Register, with Playwright, and the URL doesn't change after you click on the Next button. How do I solve this pagination problem?
Here's my code:
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright

url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=50)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')
    page.is_visible('tbody')
    html = page.inner_html('table.table.table-hover')
    soup = bs(html, 'html.parser')
    table = soup.find('tbody')
    rows = table.findAll('tr')

    names = []
    industry = []
    Locations = []
    for row in rows:
        info = row.findAll('td')
        name = info[0].text.strip()
        industry = info[1].text.strip()
        Locations = info[2].text.strip()
I've checked online, and every solution I see involves the URL changing. And for some reason you can't just make requests to the site's API; Postman said something about the parameters not being sent.
With some small adjustments you can get it. Let's try this:
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright
import time

url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=100)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')

    names = []
    industry = []
    Locations = []

    # When you click to the next page, an element with the text "Loading..."
    # appears on the screen, so we save a locator for that element
    loading_icon = "//strong[text()='Loading...']"
    # This is the "next page" button
    next_page_locator = "//ul[@class='pagination']/li[3]"

    # Select the option of 50 elements per page
    page.select_option("#perPageCount", value="50")

    # Wait for the loading icon to become visible and then hidden,
    # which means the new list is fully loaded
    page.wait_for_selector(loading_icon, state="visible")
    page.wait_for_selector(loading_icon, state="hidden")
    time.sleep(1)

    # Loop until the "Next page" button is disabled,
    # which means there are no more pages to paginate
    while "disabled" not in page.get_attribute(selector=next_page_locator, name="class"):
        # Grab the info you wanted
        page.is_visible('tbody')
        html = page.inner_html('table.table.table-hover')
        soup = bs(html, 'html.parser')
        table = soup.find('tbody')
        rows = table.findAll('tr')
        for row in rows:
            info = row.findAll('td')
            names.append(info[0].text.strip())
            industry.append(info[1].text.strip())
            Locations.append(info[2].text.strip())
        # Once we have the info, click to the next page and wait for the
        # loading element to become visible and then hidden again
        page.click(next_page_locator)
        page.wait_for_selector(loading_icon, state="visible")
        page.wait_for_selector(loading_icon, state="hidden")
        time.sleep(1)
Thanks for the great question... and answer. In addition to (or instead of) the loading_icon, you could also wait for a "networkidle" state, expanding on @Jaky Ruby's answer by adding page.wait_for_load_state(state="networkidle"). I often use the networkidle option to check that the next page has finished loading; I've read somewhere that it isn't necessarily best practice, but it works quite often.
from bs4 import BeautifulSoup as bs
from playwright.sync_api import sync_playwright
import time

url = 'https://franchisedisclosure.gov.au/Register'

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False, slow_mo=100)
    page = browser.new_page()
    page.goto(url)
    page.locator("text=I agree to the terms of use").click()
    page.locator("text=Continue").click()
    page.wait_for_load_state('domcontentloaded')

    names = []
    industry = []
    Locations = []

    # When you click to the next page, an element with the text "Loading..."
    # appears on the screen, so we save a locator for that element
    loading_icon = "//strong[text()='Loading...']"
    # This is the "next page" button
    next_page_locator = "//ul[@class='pagination']/li[3]"

    # Select the option of 50 elements per page
    page.select_option("#perPageCount", value="50")

    # Wait for the loading icon to become visible and then hidden,
    # then additionally wait for the network to go idle
    page.wait_for_selector(loading_icon, state="visible")
    page.wait_for_selector(loading_icon, state="hidden")
    page.wait_for_load_state(state="networkidle")
    time.sleep(1)

    # Loop until the "Next page" button is disabled,
    # which means there are no more pages to paginate
    while "disabled" not in page.get_attribute(selector=next_page_locator, name="class"):
        # Grab the info you wanted
        page.is_visible('tbody')
        html = page.inner_html('table.table.table-hover')
        soup = bs(html, 'html.parser')
        table = soup.find('tbody')
        rows = table.findAll('tr')
        for row in rows:
            info = row.findAll('td')
            names.append(info[0].text.strip())
            industry.append(info[1].text.strip())
            Locations.append(info[2].text.strip())
        # Once we have the info, click to the next page and wait for the
        # loading element to become visible and then hidden again
        page.click(next_page_locator)
        page.wait_for_selector(loading_icon, state="visible")
        page.wait_for_selector(loading_icon, state="hidden")
        time.sleep(1)
import requests
from bs4 import BeautifulSoup

results = requests.get("https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists")
src = results.content
soup = BeautifulSoup(src, 'lxml')

trs = soup.find_all("tr")
for tr in trs:
    print(tr.text)
This is the code I wrote for scraping a table from the page https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists
If I am only targeting the table in the section "List of most Olympic gold medals over career", how can I specify the table I need? There are two sortable jquery-tablesorter tables, so I cannot use the class attribute to select the one I need.
One more question: if I know that the page I am scraping contains a lot of tables and the one I need always has 10 td elements per row, can I have something like

if len(td) == 10:
    print(tr)

to extract the data I want?
Update on code:
import requests
from bs4 import BeautifulSoup

results = requests.get("https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists")
src = results.content
soup = BeautifulSoup(src, 'lxml')

tbs = soup.find("tbody")
trs = tbs.find_all("tr")
for tr in trs:
    print(tr.text)
I have one solution, not a good one, that just extracts the first table from the page, which happens to be the one I need. Any suggestions/improvements are welcome!
Thank you.
To get only the first table you can use the CSS selector nth-of-type(1):
import requests
from bs4 import BeautifulSoup

URL = "https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists"
soup = BeautifulSoup(requests.get(URL).content, "html.parser")

table = soup.select_one("table.wikitable:nth-of-type(1)")
trs = table.find_all("tr")
for tr in trs:
    print(tr.text)
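For the second part of the question, the len(td) == 10 idea works too. A minimal sketch of that filter, assuming the target table's rows really do have exactly ten td cells:

import requests
from bs4 import BeautifulSoup

URL = "https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists"
soup = BeautifulSoup(requests.get(URL).content, "html.parser")

# keep only the rows that have exactly ten <td> cells
for tr in soup.find_all("tr"):
    tds = tr.find_all("td")
    if len(tds) == 10:
        print([td.get_text(strip=True) for td in tds])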
I want to extract the text from the th tags in a table so I can print a list of metro stations from a Wikipedia page. I only need text from one particular table (there are two of them on the page).
import urllib.request
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_London_Underground_stations"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "lxml")

stations_table = soup.find("table", class_="wikitable sortable plainrowheaders")

for i in soup.find_all('th', stations_table):
    print(i.text)
I can get the table stored in the stations_table variable, but I cannot print only the station text from the th tags within the wikitable sortable plainrowheaders table. While it does print the station names, it also prints the headers:
Station
Local authority
Zone(s)[†]
Opened[4]
Main lineopened
Usage[5]
How can I filter those out?
It shows all th in the table - not only the stations but also the column headers. To skip them, I search for all tr, skip the first row, and then search for the th in every row:

for i in stations_table.find_all('tr')[1:]:
    print(i.find('th').text.strip())
Full code:

import urllib.request
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_London_Underground_stations"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "html.parser")

stations_table = soup.find("table", class_="wikitable sortable plainrowheaders")

for i in stations_table.find_all('tr')[1:]:
    print(i.find('th').text.strip())
    #print(i.th.text.strip())
for i in soup.find_all('th', stations_table):

searches all the table headings, including the header row. What can be done here is to extract all the rows and start printing from the second row, ignoring the title row, as below:

for i in stations_table.find_all('tr')[1:]:
    print(i.find('th').text)
I am trying to scrape the bitcoin price from Coinbase and cannot find the proper syntax. When I run the program (without the line with the question marks) I get the block of HTML that I need, but I don't know how to narrow it down and retrieve the price itself. Any help appreciated, thanks.
import requests
from bs4 import BeautifulSoup

url = 'https://www.coinbase.com/charts'
data = requests.get(url)
nicedata = data.text
soup = BeautifulSoup(nicedata, 'html.parser')
prettysoup = soup.prettify()

bitcoin = soup.find('h4', {'class': 'Header__StyledHeader-sc-1q6y56a-0 hZxUBM TextElement__Spacer-sc-18l8wi5-0 hpeTzd'})
price = bitcoin.find('???')
print(price)
The attached image contains the HTML.
To get the text from an item:

price = bitcoin.text

But this page has many <h4> items with this class, and find() gets only the first one, which has the text Bitcoin, not the price from your image. You may need find_all() to get a list with all the items, and then you can use an index [index] or slicing [start:end] to get some of them, or you can use a for-loop to work with every item on the list.
import requests
from bs4 import BeautifulSoup

url = 'https://www.coinbase.com/charts'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

all_h4 = soup.find_all('h4', {'class': 'Header__StyledHeader-sc-1q6y56a-0 hZxUBM TextElement__Spacer-sc-18l8wi5-0 hpeTzd'})
for h4 in all_h4:
    print(h4.text)
It can be easier to work with the data if you keep it in a list of lists, an array, or a DataFrame. To create a list of lists, it is easier to find the rows <tr> and search for <h4> inside every row:
import requests
from bs4 import BeautifulSoup

url = 'https://www.coinbase.com/charts'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

all_tr = soup.find_all('tr')

data = []
for tr in all_tr:
    row = []
    for h4 in tr.find_all('h4'):
        row.append(h4.text)
    if row:  # skip empty rows
        data.append(row)

for row in data:
    print(row)
This way it doesn't need the class to get all the h4 elements.
BTW: this page uses JavaScript to append new rows when you scroll, but requests and BeautifulSoup can't run JavaScript - so if you need all the rows, you may need Selenium to control a web browser, which does run JavaScript. A sketch of that follows.
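A minimal sketch of that Selenium approach, assuming chromedriver is available and that five scrolls are enough to load the rows you need:

import time
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.coinbase.com/charts")

# scroll to the bottom a few times so JavaScript can append more rows
for _ in range(5):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the new rows time to load

# then parse the rendered HTML the same way as above
soup = BeautifulSoup(driver.page_source, "html.parser")
for tr in soup.find_all('tr'):
    row = [h4.text for h4 in tr.find_all('h4')]
    if row:
        print(row)

driver.quit()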
I am currently running the following Python script:
import requests
from bs4 import BeautifulSoup

origin = ["USD", "GBP", "EUR"]

i = 0
while i < len(origin):
    page = requests.get("https://www.x-rates.com/table/?from=" + origin[i] + "&amount=1")
    soup = BeautifulSoup(page.content, "html.parser")
    tables = soup.findChildren('table')
    my_table = tables[0]
    rows = my_table.findChildren(['td'])
    i = i + 1
    for rows in rows:
        cells = rows.findChildren('a')
        for cell in cells:
            value = cell.string
            print(value)
To scrape data from this HTML:
https://i.stack.imgur.com/DkX83.png
The problem I have is that I'm struggling to scrape only the first column without also scraping the second one, because they are both inside <a> tags and in the same table row as each other. The href is the only thing which differentiates the two tags, and I have tried filtering on it, but it doesn't seem to work and returns a blank value. Also, when I try to sort the data manually, the output comes out vertically and not horizontally. I am new to coding, so any help would be appreciated :)
There is another way you might want to try as well to achieve the same:
import requests
from bs4 import BeautifulSoup

keywords = ["USD", "GBP", "EUR"]

for keyword in keywords:
    page = requests.get("https://www.x-rates.com/table/?from={}&amount=1".format(keyword))
    soup = BeautifulSoup(page.content, "html.parser")
    for items in soup.select_one(".ratesTable tbody").find_all("tr"):
        # [1:2] keeps only the second <td> in each row, i.e. the first rate column
        data = [item.text for item in items.find_all("td")[1:2]]
        print(data)
It is easier to follow what happens when you print every item you get, starting from the top - in this case from the table item. The idea is to go one element at a time so you can follow along:
import requests
from bs4 import BeautifulSoup

origin = ["USD", "GBP", "EUR"]

i = 0
while i < len(origin):
    page = requests.get("https://www.x-rates.com/table/?from=" + origin[i] + "&amount=1")
    soup = BeautifulSoup(page.content, "html.parser")
    tables = soup.findChildren('table')
    my_table = tables[0]
    i = i + 1
    rows = my_table.findChildren('tr')
    for row in rows:
        cells = row.findAll('td', class_='rtRates')
        if len(cells) > 0:  # skip rows with no td.rtRates cells, e.g. the header
            first_item = cells[0].find('a')
            value = first_item.string
            print(value)