I'm trying to get data from flashscore.com for a project I'm doing as part of my self-taught Python study:
import requests
from bs4 import BeautifulSoup
res = requests.get("https://www.flashscore.com/")
soup = BeautifulSoup(res.text, "lxml")
games = soup.find_all("div", {'class':['event__match', 'event__match--scheduled', 'event__match--twoLine']})
print(games)
When I run this, it returns an empty list: []. Why?
When find_all() returns an empty list, it means the elements you specified could not be found in the HTML that was downloaded.
Make sure that what you are trying to scrape isn't added dynamically after the page loads, for example via JavaScript or inside an iframe.
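A quick way to verify this is to check whether the expected class name appears anywhere in the raw HTML that requests receives; if it doesn't, the content is rendered client-side. A minimal sketch, reusing the URL and class name from the question:
import requests

res = requests.get("https://www.flashscore.com/")
# If the class name never occurs in the raw HTML, the match rows are
# rendered client-side by JavaScript, and requests alone cannot see them.
print("event__match" in res.text)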
The failure occurs because the website uses a set of Ajax technologies; specifically, the content is added dynamically by JavaScript, a client-side scripting language. Client-side code is executed in the browser itself, not at the web server level, so its success depends on the browser's ability to interpret and run it correctly. With the BeautifulSoup library, the program you wrote only inspects the static HTML code. JavaScript-rendered content can be accessed, for example, with the help of the Selenium library: https://www.selenium.dev/. Below is the full code for the data that I suppose you are interested in:
# crawler_her_sel.py
import time
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd
def firefoxdriver():
    """
    Prepare a headless Firefox browser for the work.
    """
    options = Options()
    options.add_argument("--headless")
    driver = Firefox(options=options)
    return driver
def scrapingitems(driver, my_list, my_xpath):
    """
    Append the text of the element located by my_xpath to my_list,
    or an empty string if the element cannot be found, so that the
    lists for pandas all stay the same length.
    """
    try:
        elem_to_scrap = driver.find_element(By.XPATH, my_xpath).text
    except NoSuchElementException:
        elem_to_scrap = ""
    my_list.append(elem_to_scrap)
# Variable with the URL of the website.
my_url = "https://www.flashscore.com/"
# Prepare the headless Firefox browser for the work.
driver = firefoxdriver()
# Load the website in the browser.
driver.get(my_url)
# Prepare the blank dictionary to fill in for pandas.
matches = {}
# Preparation of lists with scraped data.
countries = []
leagues = []
home_teams = []
scores_home = []
scores_away = []
away_teams = []
# Wait for page to fully render
try:
element = WebDriverWait(driver, 25).until(
EC.presence_of_element_located((By.CLASS_NAME, "adsclick")))
except TimeoutException:
print("Loading took too much time!. Please rerun the script.")
except Exception as e:
print(str(e))
else:
# Loads the website code as the BeautifulSoup object.
pageSource = driver.page_source
bsObj = BeautifulSoup(pageSource, "lxml")
# Determining the number of the football matches with the help of
# the BeautifulSoup.
games_1 = bsObj.find_all(
"div", {"class":
"event__participant event__participant--home"})
games_2 = bsObj.find_all(
"div", {"class":
"event__participant event__participant--home fontBold"})
games_3 = bsObj.find_all(
"div", {"class":
"event__participant event__participant--away"})
games_4 = bsObj.find_all(
"div", {"class":
"event__participant event__participant--away fontBold"})
# Determining the number of the countries for the given football
# matches.
all_countries = driver.find_elements(By.CLASS_NAME, "event__title--type")
    # Determine the number of loop iterations.
    sum_to_iterate = (len(all_countries) + len(games_1) + len(games_2)
                      + len(games_3) + len(games_4))
    for ind in range(1, (sum_to_iterate+1)):
        # Scraping of the country names.
        xpath_countries = ('//div[@class="sportName soccer"]/div['+str(ind)
                           +']/div[2]/div/span[1]')
        scrapingitems(driver, countries, xpath_countries)
        # Scraping of the league names.
        xpath_leagues = ('//div[@class="sportName soccer"]/div['+str(ind)
                         +']/div[2]/div/span[2]')
        scrapingitems(driver, leagues, xpath_leagues)
        # Scraping of the home team names.
        xpath_home_teams = ('//div[@class="sportName soccer"]/div['+str(ind)
                            +']/div[3]')
        scrapingitems(driver, home_teams, xpath_home_teams)
        # Scraping of the home team scores.
        xpath_scores_home = ('//div[@class="sportName soccer"]/div['+str(ind)
                             +']/div[5]')
        scrapingitems(driver, scores_home, xpath_scores_home)
        # Scraping of the away team scores.
        xpath_scores_away = ('//div[@class="sportName soccer"]/div['+str(ind)
                             +']/div[6]')
        scrapingitems(driver, scores_away, xpath_scores_away)
        # Scraping of the away team names.
        xpath_away_teams = ('//div[@class="sportName soccer"]/div['+str(ind)
                            +']/div[4]')
        scrapingitems(driver, away_teams, xpath_away_teams)
# Add lists with the scraped data to the dictionary in the correct
# order.
matches["Countries"] = countries
matches["Leagues"] = leagues
matches["Home_teams"] = home_teams
matches["Scores_for_home_teams"] = scores_home
matches["Scores_for_away_teams"] = scores_away
matches["Away_teams"] = away_teams
# Creating of the frame for the data with the help of the pandas
# package.
df_res = pd.DataFrame(matches)
# Saving of the properly formatted data to the csv file. The date
# and the time of the scraping are hidden in the file name.
name_of_file = lambda: "flashscore{}.csv".format(time.strftime(
"%Y%m%d-%H.%M.%S"))
df_res.to_csv(name_of_file(), encoding="utf-8")
finally:
driver.quit()
The result of the script is a CSV file which, when loaded into Excel, gives a table of the scraped matches.
It is worth mentioning here that you need to download the appropriate driver for your browser: https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/.
In addition, here are links to two other interesting scripts that relate to scraping the https://www.flashscore.com/ portal: How can i scrape a football results from flashscore using python and Scraping stats with Selenium.
I would also like to raise the legal issues here. The robots.txt file at https://www.flashscore.com/robots.txt indicates that scraping the home page is allowed. However, the "General Terms of Use" say, quoting: "Without prior authorisation in writing from the Provider, Visitors are not authorised to copy, modify, tamper with, distribute, transmit, display, reproduce, transfer, upload, download or otherwise use or alter any of the content of the App."
This unfortunately introduces ambiguity, and ultimately it is not clear what the owner really wants. I therefore recommend that you do not run this script constantly, and certainly not for commercial purposes, and I ask the same of other visitors to this website. I wrote this script purely to learn scraping and do not intend to use it at all.
The finished script can be downloaded from my GitHub.
I am only a hobbyist with Python, so please bear with me. I am trying to run this script to collect Trip Advisor reviews and write them to Excel.
But once it opens the website, it throws this error: NoSuchElementException: no such element: Unable to locate element: {"method":"xpath","selector":".//q[@class='IRsGHoPm']"}
Has anyone got any ideas on what is going wrong?
import csv #This package lets us save data to a csv file
from selenium import webdriver #The Selenium package we'll need
from selenium.webdriver.support import expected_conditions
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import time #This package lets us pause execution for a bit
path_to_file = "E:Desktop/Data/Reviews.csv"
pages_to_scrape = 3
url = "https://www.tripadvisor.com/Hotel_Review-g60982-d209422-Reviews-Hilton_Waikiki_Beach-Honolulu_Oahu_Hawaii.html"
# import the webdriver
driver = webdriver.Chrome()
driver.get(url)
# open the file to save the review
csvFile = open(path_to_file, 'a', encoding="utf-8")
csvWriter = csv.writer(csvFile)
# change the value inside the range to save the number of reviews we're going to grab
for i in range(0, pages_to_scrape):
    # give the DOM time to load
    time.sleep(5)
    # Click the "expand review" link to reveal the entire review.
    driver.find_element_by_xpath(".//div[contains(@data-test-target, 'expand-review')]").click()
    # Now we'll ask Selenium to look for elements in the page and save them to a variable. First lets define a container that will hold all the reviews on the page. In a moment we'll parse these and save them:
    container = driver.find_elements_by_xpath("//div[@data-reviewid]")
    # Next we'll grab the date of the review:
    dates = driver.find_elements_by_xpath(".//div[@class='_2fxQ4TOx']")
    # Now we'll look at the reviews in the container and parse them out
    for j in range(len(container)): # A loop defined by the number of reviews
        # Grab the rating
        rating = container[j].find_element_by_xpath(".//span[contains(@class, 'ui_bubble_rating bubble_')]").get_attribute("class").split("_")[3]
        # Grab the title
        title = container[j].find_element_by_xpath(".//div[contains(@data-test-target, 'review-title')]").text
        # Grab the review
        review = container[j].find_element_by_xpath(".//q[@class='IRsGHoPm']").text.replace("\n", " ")
        # Grab the date
        date = " ".join(dates[j].text.split(" ")[-2:])
        # Save that data in the csv and then continue to process the next review
        csvWriter.writerow([date, rating, title, review])
    # When all the reviews in the container have been processed, change the page and repeat
    driver.find_element_by_xpath('.//a[@class="ui_button nav next primary "]').click()
# When all pages have been processed, quit the driver
driver.quit()
I tried to find the "_2fxQ4TOx" class name in the page and it came up with 0 results. These class names appear to be auto-generated by the site's build tooling (for CSS minification), so they can change at any time.
Instead, try to find elements by attributes that actually exist in the frontend, for example:
container = driver.find_element_by_css_selector('div[data-test-target="reviews-tab"]')
reviews = container.find_elements_by_css_selector('div[data-test-target="HR_CC_CARD"]')
for review in reviews:
# get date
# get descriptions
# ...
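For instance, a minimal sketch of the loop body, reusing the review-title attribute that already appears in the question's own XPath (whether it still exists on the live page is an assumption):
for review in reviews:
    # 'review-title' is taken from the question's original XPath; treat it
    # as an assumption that may break when the site is redeployed.
    title = review.find_element_by_css_selector('div[data-test-target="review-title"]').text
    print(title)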
you can also refer to this link:
Is there a way to find an element by attributes in Python Selenium?
I am struggling to retrieve some results from a simple form submission. This is what I have so far:
import mechanicalsoup
browser = mechanicalsoup.StatefulBrowser()
browser.set_verbose(2)
url = "https://www.dermcoll.edu.au/find-a-derm/"
browser.open(url)
form = browser.select_form("#find-derm-form")
browser["postcode"] = 3000
browser.submit_selected()
form.print_summary()
Where do these results end up...?
Many thanks
As per the MechanicalSoup FAQ, you shouldn't use this library when dealing with a dynamic JavaScript-enabled form, which seems to be the case for the website in your example.
Instead, you can use Selenium in combination with BeautifulSoup (and a little bit of help from webdriver-manager) to achieve your desired result. A short example would look like this:
from selenium import webdriver
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
# set up the Chrome driver instance using webdriver_manager
driver = webdriver.Chrome(ChromeDriverManager().install())
# navigate to the page
driver.get("https://www.dermcoll.edu.au/find-a-derm/")
# find the postcode input and enter your desired value
postcode_input = driver.find_element_by_name("postcode")
postcode_input.send_keys("3000")
# find the search button and perform the search
search_button = driver.find_element_by_class_name("search-btn.location_derm_search_icon")
search_button.click()
# get all search results and load them into a BeautifulSoup object for parsing
search_results = driver.find_element_by_id("search_result")
search_results = search_results.get_attribute('innerHTML')
search_results = BeautifulSoup(search_results, "html.parser")
# get individual result cards
search_results = search_results.find_all("div", {"class": "address_sec_contents"})
# now you can parse for whatever information you need
[x.find("h4") for x in search_results] # names
[x.find("p", {"class": "qualification"}) for x in search_results] # qualifications
[x.find("address") for x in search_results] # addresses
While this way may seem more involved, it's a lot more robust and can be easily repurposed for many more situations where MechanicalSoup falls short.
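One caveat: the search results are loaded asynchronously, so on a slow connection the find_element_by_id call may run before they exist. A minimal sketch of an explicit wait, assuming the same search_result id as above:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block for up to 10 seconds until the results container is present.
search_results = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "search_result")))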
I am attempting to pull the title and link of each so-called raffle in the list on this website. However, when I try to scrape this data, it can't seem to be found.
I have tried scraping all links on the page, but I think these "boxes" may be loaded via JavaScript.
The results I am receiving are a few links unrelated to what I want. There should be 40+ links showing up in this list, but the majority are missing. Any help would be great; I've been stuck on this for a while.
For some reason, this link and many others aren't showing up when I scrape.
my code:
def raffle_page_collection():
chrome_driver()
page = requests.get('https://www.soleretriever.com/yeezy-boost-350-v2-black/')
soup = BeautifulSoup(page.text,'html.parser')
product_header = soup.find('h1').text
product_colorway = soup.find('h2').text
product_sku_and_release_date_and_price = soup.find('h3').text
container = soup.find(class_='main-container')
raffles = container.find_all('a')
raffle_list = []
for items in raffles:
raffle_list.append(items.get('href'))
print(raffle_list)
You should try the Selenium browser-automation library; it allows you to scrape pages whose content is rendered dynamically (via JS or Ajax).
Try this:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
browser = webdriver.Chrome()
browser.get('https://www.soleretriever.com/yeezy-boost-350-v2-black/')
time.sleep(3)
soup = BeautifulSoup(browser.page_source,'html.parser')
product_header = soup.find('h1').text
product_colorway = soup.find('h2').text
product_sku_and_release_date_and_price = soup.find('h3').text
container = soup.find(class_='main-container')
raffles = container.find("div",{"class":"vc_pageable-slide-wrapper vc_clearfix"})
raffle_list = []
for items in raffles.find_all("a",href=True):
raffle_list.append(items.get('href'))
print(product_header)
print(product_colorway)
print(product_sku_and_release_date_and_price)
print(raffle_list)
Output:
Yeezy Boost 350 v2 Black
Black/ Black/ Black
FU9006 | 07/06/19 | $220
['https://www.43einhalb.com/en/adidas-yeezy-boost-350-v2-black-328672#SR', 'https://www.adidas.co.uk/yeezy#SR', 'https://www.allikestore.com/default/adidas-yeezy-boost-350-v2-static-black-black-fu9006-105308.html#SR', 'https://archive1820.com/en/forms/6/yeezy-raffle#SR', 'https://drops.blackboxstore.com/blackbox_launches_view/catalog/product/view/id/22296/s/yeezy-boost-350-v2#SR', 'https://woobox.com/4szm9v#SR', 'https://launches.endclothing.com/product/yeezy-boost-350-v2-fu9006#SR', 'https://www.instagram.com/p/ByEIHSHDSY6/', 'https://www.instagram.com/p/ByFG1G0lWf7/', 'https://releases.footshop.com/adidas-yeezy-boost-350-v2-agqn6WoBJZ9y4RSnzw9G#SR', 'https://launches.goodhoodstore.com/launches/yeezy-boost-350-v2-black-33#SR', 'https://www.hervia.com/launches/yeezy-350#SR', 'https://www.hibbett.com/adidas-yeezy-350-v2-black-mens-shoe/M0991.html#SR', 'https://reporting.jdsports.co.uk/cgi-bin/msite?yeezy_comp+a+0+0+0+0+0&utm_source=RedEye&utm_medium=Email&utm_campaign=Yeezy%20Boost%20351%20Clay&utm_content=0905%20Yeezy%20Clay#SR', 'https://www.instagram.com/p/ByDnK6uH6kE/', 'https://www.nakedcph.com/yeezy-boost-v2-350-static-black-raffle/s/635#SR', 'https://www.instagram.com/p/ByIXT8zHvYz/', 'https://launches.sevenstore.com/launch/yeezy-boost-350-v2-black-4033024#SR', 'https://shelta.eu/news/adidas-yeezy-boost-350-v2-black-fu9006x#SR', 'https://www.instagram.com/p/ByDI_6JAfty/', 'https://www.sneakersnstuff.com/en/product/38889/adidas-yeezy-350-v2#SR', 'https://www.instagram.com/p/ByHtt3HFkE0/', 'https://www.instagram.com/p/ByCaKR7Cde1/', 'https://tres-bien.com/adidas-yeezy-boost-350-v2-black-fu9006-fw19#SR', 'https://yeezysupply.com/products/yeezy-boost-350-v2-black-june-7-2019#SR']
Download ChromeDriver for the Chrome browser here:
http://chromedriver.chromium.org/downloads
Instructions for installing the web driver for the Chrome browser:
https://christopher.su/2015/selenium-chromedriver-ubuntu/
Selenium tutorial:
https://selenium-python.readthedocs.io/
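As a side note, the time.sleep(3) above is a fixed guess; an explicit wait is usually more reliable. A minimal sketch, assuming the same main-container class used above:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the page container to appear before parsing.
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "main-container")))
soup = BeautifulSoup(browser.page_source, 'html.parser')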
I'm practicing web scraping with Python at the moment and I ran into a problem: I wanted to scrape a website that has a list of anime I watched before, but when I try to scrape it (via requests or Selenium) it only gets around 30 of the 110 anime names on the page.
Here is my code with selenium:
from selenium import webdriver
from bs4 import BeautifulSoup
browser = webdriver.Firefox()
browser.get("https://anilist.co/user/Agusmaris/animelist/Completed")
data = BeautifulSoup(browser.page_source, 'lxml')
for title in data.find_all(class_="title"):
print(title.getText())
And when I run it, the page source only goes up to an anime called 'Golden time', even though there are 70 or more remaining on the page.
Thanks
Edit: Code that works now thanks to 'supputuri':
from selenium import webdriver
from bs4 import BeautifulSoup
import time
driver = webdriver.Firefox()
driver.get("https://anilist.co/user/Agusmaris/animelist/Completed")
time.sleep(3)
footer = driver.find_element_by_css_selector("div.footer")
preY = 0
print(str(footer))
while footer.rect['y'] != preY:
preY = footer.rect['y']
footer.location_once_scrolled_into_view
print('loading')
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
for title in soup.find_all(class_="title"):
print(title.getText())
driver.close()
driver.quit()
ret = input()
Here is the solution.
Make sure to add import time
driver.get("https://anilist.co/user/Agusmaris/animelist/Completed")
time.sleep(3)
footer =driver.find_element_by_css_selector("div.footer")
preY =0
while footer.rect['y']!=preY:
preY = footer.rect['y']
footer.location_once_scrolled_into_view
time.sleep(1)
print(str(driver.page_source))
This will iterate until all the anime is loaded and then gets the page source.
Let us know if this was helpful.
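An alternative sketch scrolls with JavaScript until the page height stops growing; same idea, different mechanism:
import time

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)  # give the lazy loader time to fetch the next batch
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height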
So, this is the gist of what I get when I load the page source:
AniListwindow.al_token = 'E1lPa1kzYco5hbdwT3GAMg3OG0rj47Gy5kF0PUmH';Sorry, AniList requires Javascript.Please enable Javascript or http://outdatedbrowser.com>upgrade to a modern web browser.Sorry, AniList requires a modern browser.Please http://outdatedbrowser.com>upgrade to a newer web browser.
Since I know damn well that Javascript is enabled and my Chrome version is fully up to date, and the URL listed takes one to a nonsecure website to "download" a new version of your browser, I think this is a spam site. Not sure if you were aware of that when posting so I won't flag as such, but I wanted you and others who come across this to be aware.
I am learning Python scraping techniques, but I am stuck on the problem of scraping an Ajax page like this one.
I want to scrape all the medicine names and details that appear on the page. I have read most of the related answers on Stack Overflow, but I am not getting the right data after scraping. I also tried to scrape it using Selenium, and to forge the POST request, but both failed.
So please help me with this Ajax scraping topic, especially this page, since the Ajax request is triggered by selecting an option from the dropdown.
Also, please point me to some resources on scraping Ajax pages.
//using selenium
from selenium import webdriver
import bs4 as bs
import lxml
import requests
path_to_chrome = '/home/brutal/Desktop/chromedriver'
browser = webdriver.Chrome(executable_path = path_to_chrome)
url = 'https://www.gianteagle.com/Pharmacy/Savings/4-10-Dollar-Drug-Program/Generic-Drug-Program/'
browser.get(url)
browser.find_element_by_xpath('//*[@id="ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList"]/option[contains(text(), "Ohio")]').click()
new_url = browser.current_url
r = requests.get(new_url)
print(r.content)
You can download ChromeDriver here.
normalize-space() is used in order to remove junk from the scraped text, such as stray whitespace characters.
from time import sleep
from selenium import webdriver
from lxml.html import fromstring
data = {}
driver = webdriver.Chrome('PATH TO YOUR DRIVER/chromedriver') # i.e '/home/superman/www/myproject/chromedriver'
driver.get('https://www.gianteagle.com/Pharmacy/Savings/4-10-Dollar-Drug-Program/Generic-Drug-Program/')
# Loop states
for i in range(2, 7):
    dropdown_state = driver.find_element(by='id', value='ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList')
    # open dropdown
    dropdown_state.click()
    # click state
    driver.find_element_by_xpath('//*[@id="ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList"]/option['+str(i)+']').click()
    # let the page finish loading
    sleep(3)
    # prepare HTML
    page_content = driver.page_source
    tree = fromstring(page_content)
    state = tree.xpath('//*[@id="ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList"]/option['+str(i)+']/text()')[0]
    data[state] = []
    # Loop products inside the state
    for line in tree.xpath('//*[@id="ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_gridSearchResults"]/tbody/tr[@style]'):
        med_type = line.xpath('normalize-space(.//td[@class="medication-type"])')
        generic_name = line.xpath('normalize-space(.//td[@class="generic-name"])')
        brand_name = line.xpath('normalize-space(.//td[@class="brand-name hidden-xs"])')
        strength = line.xpath('normalize-space(.//td[@class="strength"])')
        form = line.xpath('normalize-space(.//td[@class="form"])')
        qty_30_day = line.xpath('normalize-space(.//td[@class="30-qty"])')
        price_30_day = line.xpath('normalize-space(.//td[@class="30-price"])')
        qty_90_day = line.xpath('normalize-space(.//td[@class="90-qty hidden-xs"])')
        price_90_day = line.xpath('normalize-space(.//td[@class="90-price hidden-xs"])')
        data[state].append(dict(med_type=med_type,
                                generic_name=generic_name,
                                brand_name=brand_name,
                                strength=strength,
                                form=form,
                                qty_30_day=qty_30_day,
                                price_30_day=price_30_day,
                                qty_90_day=qty_90_day,
                                price_90_day=price_90_day))
print('data:', data)
driver.quit()
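As a design note, Selenium also ships a Select helper for <select> elements, which is a bit more robust than clicking option nodes by XPath. A minimal sketch, assuming the same element id as above:
from selenium.webdriver.support.ui import Select

# Choose the state by its visible label instead of its option index.
dropdown = Select(driver.find_element(by='id',
    value='ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList'))
dropdown.select_by_visible_text('Ohio')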