Scrape information from FlashScore.ro live - python

I am trying to scrape information from this website https://www.flashscore.ro/baschet/ from the live tab. I want to receive an email every time something happens.
but my problem is with scraping
the code I have until now returns None . I want for now to get the name of the home team.
I am kinda new to this scraping with python thing
import requests
from bs4 import BeautifulSoup

URL = 'https://www.flashscore.ro/baschet/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}


def find_price():
    """Fetch the live basketball page and return the first home-team div.

    NOTE(review): the match list is injected by JavaScript after page load,
    so on the plain requests response this find() comes back None.
    """
    response = requests.get(URL, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.html.find('div', {'class': 'event__participant event__participant--home'})


print(find_price())

The website renders its content with JavaScript, which requests cannot execute, so we can use Selenium as an alternative to scrape the page.
Install it with: pip install selenium.
Download the correct ChromeDriver from here.
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep

URL = "https://www.flashscore.ro/baschet/"

# Drive a real browser so the JavaScript-built match list actually renders.
driver = webdriver.Chrome(r"C:\path\to\chromedriver.exe")
driver.get(URL)
sleep(5)  # give the page time to finish rendering

# Hand the rendered HTML to BeautifulSoup and print every home-team name.
soup = BeautifulSoup(driver.page_source, "html.parser")
home_divs = soup.find_all("div", {"class": "event__participant event__participant--home"})
for home in home_divs:
    print(home.text)

driver.quit()
Output:
Lyon-Villeurbanne
Fortitudo Bologna
Virtus Roma
Treviso
Trieste
Trento
Unicaja
Gran Canaria
Galatasaray
Horizont Minsk 2 F
...And on

Related

I am trying to get the full HTML code of a website, specifically "The North Face", using Python, and using requests does not give me the entire HTML code

I am trying to scrape "the north face" website for a group project and I am looking for a faster way to get the output. Is there any faster way without opening a Chrome window every time I get the HTML of a page? I can't use requests because it doesn't give me the FULL source code. Thanks for the help.
This is what I have:
import requests
from bs4 import BeautifulSoup
from helium import *
import time

# Identify as a regular Chrome user to the API.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}

# Open Chrome in the background (headless) on the men's jackets listing.
browser = start_chrome("https://www.thenorthface.com/shop/mens-jackets-vests-en-ca#facet=&beginIndex=0", headless=True)

# Keep clicking "LOAD MORE" until every product is on the page.
while Text("LOAD MORE").exists():
    click("LOAD MORE")
    time.sleep(2.0)

# Grab the fully rendered HTML, then shut the browser down.
html = browser.page_source
kill_browser()

soup = BeautifulSoup(html, "html.parser")

products_names = []
products_links = []
products_prices = []

# One card per product on the listing page; pull name + link from each.
products_cards = soup.find_all("div", {"class": "product-block-info info info-js"})
for card in products_cards:
    for name in card.find_all("div", {"class": "product-block-name name name-js"}):
        for link in name.find_all("a", class_="product-block-name-link"):
            products_names.append(link.get("title"))
            products_links.append(link.get("href"))

# Visit each product page (first three only) and collect its offer price.
for jacket_url in products_links[:3]:
    browser = start_chrome(jacket_url, headless=True)
    html = browser.page_source
    kill_browser()
    product_soup = BeautifulSoup(html, "html.parser")
    price_info = product_soup.find_all("div", class_="product-content-info-price product-price product-price-js")
    for info in price_info:
        for price in info.find("span", "product-content-info-offer-price offer-price offer-price-js product-price-amount-js"):
            products_prices.append(price)

print(len(products_prices))
print(len(products_names))
print(len(products_links))

NoneType Object has no attribute “get_text” — Python

I was doing some web scraping from amazon and I came across this error (mentioned in title).
This is my code:
import requests
from bs4 import BeautifulSoup
import smtplib

URL = 'https://www.amazon.co.uk/UGREEN-Adapter-Samsung-Oneplus- Blackview/dp/B072V9CNTK/ref=sr_1_2_sspa?keywords=otg+cable&qid=1578610622&sr=8-2-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzRzRRUUdaR05RVlRJJmVuY3J5cHRlZElkPUEwNjExNjM4MVI4NVZaTFlYTlhGSCZlbmNyeXB0ZWRBZElkPUEwMjg1MTU0OEhROERWQTBSRFAzJndpZGdldE5hbWU9c3BfYXRmJmFjdGlvbj1jbGlja1JlZGlyZWN0JmRvTm90TG9nQ2xpY2s9dHJ1ZQ=='
headers = {
    "User Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}

page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')

# NOTE(review): soup.find() returns None when the id is missing from the
# requests response (the page is rendered by JavaScript), which is what
# triggers the "NoneType has no attribute get_text" error asked about.
title = soup.find(id="productTitle").get_text()
price = soup.find(id="priceblock_ourprice").get_text()
converted_price = float(price[0:3])


def check_price():
    # Print the current price, re-parse it, and mail when below £7.00.
    # NOTE(review): send_mail is not defined anywhere in this snippet.
    print(soup.find(id="priceblock_ourprice").get_text())
    converted_price = float(price[0:3])
    if converted_price < 7.00:
        send_mail()
It is because the page is dynamically loaded using javascript. You can use selenium to get the html code of the website, like this:
from selenium import webdriver
import time  # FIX: time.sleep() is used below but time was never imported

URL = 'https://www.amazon.co.uk/UGREEN-Adapter-Samsung-Oneplus- Blackview/dp/B072V9CNTK/ref=sr_1_2_sspa?keywords=otg+cable&qid=1578610622&sr=8-2-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzRzRRUUdaR05RVlRJJmVuY3J5cHRlZElkPUEwNjExNjM4MVI4NVZaTFlYTlhGSCZlbmNyeXB0ZWRBZElkPUEwMjg1MTU0OEhROERWQTBSRFAzJndpZGdldE5hbWU9c3BfYXRmJmFjdGlvbj1jbGlja1JlZGlyZWN0JmRvTm90TG9nQ2xpY2s9dHJ1ZQ=='

# Let the browser render the JavaScript-built page, then take its HTML.
driver = webdriver.Chrome()
driver.get(URL)
time.sleep(5)  # wait for dynamic content to load
page = driver.page_source
driver.close()
Thus, here is the full code:
from bs4 import BeautifulSoup
from selenium import webdriver
import time

URL = 'https://www.amazon.co.uk/UGREEN-Adapter-Samsung-Oneplus- Blackview/dp/B072V9CNTK/ref=sr_1_2_sspa?keywords=otg+cable&qid=1578610622&sr=8-2-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzRzRRUUdaR05RVlRJJmVuY3J5cHRlZElkPUEwNjExNjM4MVI4NVZaTFlYTlhGSCZlbmNyeXB0ZWRBZElkPUEwMjg1MTU0OEhROERWQTBSRFAzJndpZGdldE5hbWU9c3BfYXRmJmFjdGlvbj1jbGlja1JlZGlyZWN0JmRvTm90TG9nQ2xpY2s9dHJ1ZQ=='

# Render the JavaScript page in a real browser, then grab the HTML.
driver = webdriver.Chrome()
driver.get(URL)
time.sleep(5)  # wait for dynamic content to load
page = driver.page_source
driver.quit()  # FIX: quit() ends the whole session; close() leaves chromedriver running

soup = BeautifulSoup(page, 'html5lib')
title = soup.find(id="productTitle")
price = soup.find(id="priceblock_ourprice")
# FIX: reuse the element already found instead of searching the tree again.
print(price.get_text())
Output:
£6.99

How to do scraping from a page with BeautifulSoup

The question asked is very simple, but for me, it doesn't work and I don't know!
I want to scrape the rating beer from this page https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone with BeautifulSoup, but it doesn't work.
This is my code:
import requests
import bs4
from bs4 import BeautifulSoup

url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'

# Download the recipe page and parse it with the lxml tree builder.
page_text = requests.get(url).text
soup = BeautifulSoup(page_text, "lxml")

# Collect every rating span (find_all is the modern spelling of findAll).
rating = soup.find_all("span", class_="ratingValue")
rating
When I run it, it doesn't work, but if I do the same thing with another page it works... I don't know why. Can someone help me? The expected result for the rating is 4.58.
Thanks everybody!
If you print the test_html, you'll find you get a 403 forbidden response.
You should add a header (at least a user-agent : ) ) to your GET request.
import requests
from bs4 import BeautifulSoup

# A browser-like user-agent keeps the server from answering 403 Forbidden.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'

html_text = requests.get(url, headers=headers).text
soup = BeautifulSoup(html_text, 'html5lib')

# The rating lives in a span tagged with itemprop="ratingValue".
rating = soup.find('span', {'itemprop': 'ratingValue'})
print(rating.text)
# 4.58
You are getting a forbidden status code (HTTP error 403), which means the server will not fulfill your request even though it understood it. You will definitely get this error if you try to scrape many of the more popular websites, which have security features to prevent bots. So you need to disguise your request!
For that you need use Headers.
Also you need correct your tag attribute whose data you're trying to get i.e. itemprop
use lxml as your tree builder, or any other of your choice
import requests
from bs4 import BeautifulSoup

url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'

# Disguise the request as a real browser so the server doesn't return 403.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
page_text = requests.get(url, headers=headers).text

# Parse with lxml and read the itemprop="ratingValue" span.
soup = BeautifulSoup(page_text, 'lxml')
rating = soup.find('span', {'itemprop': 'ratingValue'})
print(rating.text)
The page you are requesting responds with 403 Forbidden, so you might not get an error, but it will give you a blank result such as []. To avoid this situation we add a user agent, and this code will get you the desired result.
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone"

# Send the GET request through urllib with a browser user agent attached.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers = {'User-Agent': user_agent}
request = urllib.request.Request(url, None, headers)  # the assembled request
response = urllib.request.urlopen(request)

# BeautifulSoup can read the response object directly.
soup = BeautifulSoup(response, "lxml")
rating = soup.find('span', {'itemprop': 'ratingValue'})
rating.text
import requests
from bs4 import BeautifulSoup

# Browser-like user agent so the server does not reply 403 Forbidden.
# (Reconstructed: the pasted snippet had these string literals broken
# across several lines.)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'

test_html = requests.get(url, headers=headers).text
soup = BeautifulSoup(test_html, 'html5lib')
rating = soup.find('span', {'itemprop': 'ratingValue'})
print(rating.text)
you are facing this error because some websites can't be scraped by beautiful soup. So for these kinds of websites, you have to use selenium
download latest chrome driver from this link according to your operating system
install selenium driver by this command "pip install selenium"
# Required modules.
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup
import time, os

# Chromedriver is expected to sit in the current working directory;
# build its path from there (swap '/' for '\\' on Windows).
current_dir = os.getcwd()
print(current_dir)
driver = webdriver.Chrome(current_dir + '/chromedriver')

# Open the recipe page in the browser and give it a moment to render.
driver.get('https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone')
time.sleep(1)

# Pull the rendered HTML out of the driver and parse it.
super_html = driver.page_source
soup = BeautifulSoup(super_html, "html.parser")
rating = soup.findAll("span", itemprop="ratingValue")
rating[0].text

How can I use Beautiful Soup to parse the batter's names?

I have had no issues grabbing three stats: hits, runs and rbi's. Here is the code I have been working with so far:
# Import modules.
from bs4 import BeautifulSoup
import requests, os
from selenium import webdriver

# Start the webdriver (from C:\webdrivers) with a maximized window.
# (Reconstructed: the pasted snippet had its string literals broken
# across several lines.)
os.chdir('C:\webdrivers')
header = {'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(chrome_options=options)
driver.get('https://www.baseball-reference.com/leagues/MLB/2018-standard-batting.shtml')

# Grab the rendered HTML and shut the browser down.
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

# Parse three stats: RBIs, runs and hits.
hits = [cell.text for cell in soup.find_all('td', {'data-stat': 'H'})]
runs = [cell.text for cell in soup.find_all('td', {'data-stat': 'R'})]
rbi = [cell.text for cell in soup.find_all('td', {'data-stat': 'RBI'})]

# Print data.
print(hits, runs, rbi)
The code above works great. When I try to grab the batter's names, however, I run into some problems. The batter's names are not parsed correctly. I would like just their first and last name if possible.
Here is what I tried:
print(soup.find_all('td', {'data-stat': 'player'}))
The batter's names are in the code but there is a lot of extra data. Also, my computer slowed down a lot when I tried this line of code. Any suggestions? Thanks in advance for any help you may offer!
The data is not in source page, please refer to this link:
https://d3k2oh6evki4b7.cloudfront.net/short/inc/players_search_list.csv
this is the csv file you can directly download this file or you can fetch desired data with code as well.
How to get batter's names:
just request the player data directly, I found this url when I watch the page load, get player name from this url will very easy:
https://d3k2oh6evki4b7.cloudfront.net/short/inc/players_search_list.csv
How to speed up your code:
First: using Selenium to load the webdriver takes most of the time in your code.
For your target, I suggest you use requests directly instead of Selenium.
Second: the lxml parser is faster than the html parser, but you should install it if you have never used it; just running "pip install lxml" will do.
installing-a-parser and summarizes the advantages and disadvantages of each parser library
for example:
import requests
from bs4 import BeautifulSoup

# Request the standard-batting page directly — no Selenium needed here.
target_url = 'https://www.baseball-reference.com/leagues/MLB/2018-standard-batting.shtml'
headers = {'User-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
page_source = requests.get(target_url, headers=headers).text

# Parse with lxml, which is faster than the built-in html parser.
soup = BeautifulSoup(page_source, 'lxml')

# Pull the three stats out of the table cells by their data-stat attribute.
hits = [cell.text for cell in soup.find_all('td', {'data-stat': 'H'})]
runs = [cell.text for cell in soup.find_all('td', {'data-stat': 'R'})]
rbi = [cell.text for cell in soup.find_all('td', {'data-stat': 'RBI'})]

print(hits, runs, rbi)

Using python to scrape push data?

I'm trying to scrape the left side of this news site (= SENESTE NYT):
https://www.dr.dk/nyheder/
But it seems the data isn't anywhere to be found? Neither in the html or related api/json etc. Is it some kind of push data?
Using Chrome's Network console I've found this api but it doesn't contain the news items on the left side:
https://www.dr.dk/tjenester/newsapp-content/teasers?reqoffset=0&reqlimit=100
Can anyone help me? How do I scrape "SENESTE NYT"?
I first loaded the page with selenium and then processed with BeautifulSoup.
from selenium import webdriver
from bs4 import BeautifulSoup

url = "https://www.dr.dk/nyheder"

# Render the page in a real browser, then hand the HTML to BeautifulSoup.
driver = webdriver.Chrome()
driver.get(url)
page_source = driver.page_source
soup = BeautifulSoup(page_source, "lxml")

# The "SENESTE NYT" headlines sit in h3 tags inside the timeline container.
timeline = soup.find("div", {"class":"timeline-container"})
headlines = timeline.find_all("h3")
print(headlines)
And it seems to find the headlines:
[<h3>Puigdemont: Debatterede spørgsmål af interesse for hele Europa</h3>,
<h3>Afblæser tsunami-varsel for Hawaii</h3>,
<h3>56.000 flygter fra vulkan i udbrud </h3>,
<h3>Pence: USA offentliggør snart plan for ambassadeflytning </h3>,
<h3>Østjysk motorvej genåbnet </h3>]
Not sure if this is what you wanted.
-----EDITED----
More efficient way would be to create request with some custom headers (already confirmed this is not working)
import requests

# Custom headers that mimic a normal browser visit coming from the news page.
headers = {
    "Accept": "*/*",
    "Host": "www.dr.dk",
    "Referer": "https://www.dr.dk/nyheder",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
}

# Hit the teasers API directly and decode its JSON payload.
response = requests.get(url="https://www.dr.dk/tjenester/newsapp-content/teasers?reqoffset=0&reqlimit=100", headers=headers)
response.json()

Categories