Unable to scrape date/time info using BeautifulSoup - Python

I am trying to scrape an upcoming event date from reuters.com using Python and the BeautifulSoup package.
Unfortunately, extracting the upcoming earnings event date and time from the HTML is proving harder than expected.
I do not understand why the script below produces no visible output, even though I can see the value when inspecting the target URL in the browser. Does anybody know why? Is there a viable work-around?
import requests
from bs4 import BeautifulSoup

header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:70.0) Gecko/20100101 Firefox/70.0'}
URL = 'https://www.reuters.com/companies/SAPG.DE/events'
page = requests.get(URL, headers=header)

soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='__next')
job_elems = results.find_all('section', class_='Events-section-2YwsJ')

for job_elem in job_elems:
    event_type = job_elem.find('h3').text
    if event_type.find('Events') != -1:
        print(job_elem.find('h3').text)
        items = job_elem.find_all('div', class_='EventList-event-Veu-f')
        for item in items:
            title = item.find('span').text
            earnings_time = item.find('time').get_text()
            if title.find('Earnings Release') != -1:
                print(earnings_time)
The class attribute of the element in question is EventList-date-cLNT9, which is a kind of class name I have never seen before.

This happens because the time tag is loaded by JavaScript, while bs4 only parses the static HTML. You have two options:
one is to use Selenium, the other is to use their API.
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
URL = 'https://www.reuters.com/companies/SAPG.DE/events'
driver.get(URL)

soup = BeautifulSoup(driver.page_source, 'html.parser')
results = soup.find(id='__next')
job_elems = results.find_all('section', class_='Events-section-2YwsJ')

for job_elem in job_elems:
    event_type = job_elem.find('h3').text
    if event_type.find('Events') != -1:
        print(job_elem.find('h3').text)
        items = job_elem.find_all('div', class_='EventList-event-Veu-f')
        for item in items:
            title = item.find('span').text
            time = item.find('time').text
            print(f"Title: {title}, Time: {time}")

driver.quit()
Output:
Upcoming Events
Title: SAP SE at Morgan Stanley Technology, Media and Telecom Conference (Virtual), Time: 1 Mar 2021 / 6PM EET
Title: Q1 2021 SAP SE Earnings Release, Time: 22 Apr 2021 / 8AM EET
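If you want to run the Selenium variant on a machine without a display (a server or CI box), you can start Chrome in headless mode. A minimal sketch, assuming the same scraping code as above:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')              # no visible browser window
options.add_argument('--window-size=1920,1080') # some sites lay out differently at tiny sizes
driver = webdriver.Chrome(options=options)
# ... same soup/find_all logic as above, then:
driver.quit()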

The reason is that those events are added dynamically by JavaScript, which means they are not present in the HTML you get back.
However, there's an API you can query to get the events.
Here's how:
import requests

api_url = "https://www.reuters.com/companies/api/getFetchCompanyEvents/SAPG.DE"
response = requests.get(api_url).json()

for event in response["market_data"]["upcoming_event"]:
    print(f"{event['name']} - {event['time']}")
Output:
SAP SE at Morgan Stanley Technology, Media and Telecom Conference (Virtual) - 2021-03-01T16:45:00Z
Q1 2021 SAP SE Earnings Release - 2021-04-22T06:30:00Z
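If you need those timestamps as timezone-aware datetime objects (for example to convert to your local timezone), you can parse the ISO 8601 strings. A minimal sketch, assuming the API keeps returning times in the Zulu format shown above:

from datetime import datetime

raw = "2021-04-22T06:30:00Z"
# fromisoformat() before Python 3.11 does not accept a trailing "Z",
# so replace it with an explicit UTC offset first.
dt_utc = datetime.fromisoformat(raw.replace("Z", "+00:00"))
print(dt_utc.astimezone())  # converted to the machine's local timezone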

Related

Web scraping multiple pages in python

So I'm trying to scrape a website that has around 500 pages of used cars, with around 22 cars per page. I managed to extract the first 22 cars from the first page, but how can I make my code iterate through all the pages so I can get all the cars? (I'm a beginner, so sorry if my code is not well structured.)
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

website = 'https://ksa.yallamotor.com/used-cars/search'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0'
}
response = requests.get(website, headers=headers)

links = []
car_name = []
model_year = []
cars = []

soup = BeautifulSoup(response.text, 'lxml')
cars = soup.find_all('div', class_='singleSearchCard m24t p12 bg-w border-gray border8')
for c in cars:
    l = "https://ksa.yallamotor.com/" + c.find('a', class_='black-link')['href']
    links.append(l)

for i in range(0, 22):
    url = links[i]
    session_object = requests.Session()
    result = session_object.get(url, headers=headers)
    soup = BeautifulSoup(result.text, 'lxml')
    name = soup.find('h1', class_="font24")
    car_name.append(name.text)
    y = soup.find_all('div', class_="font14 text-center font-b m2t")[0]
    model_year.append(y.text)
The website is behind Cloudflare protection, so you need something like cloudscraper (pip install cloudscraper). The following code will get you your data (you can then analyse each car further, get the details you need, etc.):
import cloudscraper
from bs4 import BeautifulSoup

scraper = cloudscraper.create_scraper()

for x in range(1, 501):
    r = scraper.get(f'https://ksa.yallamotor.com/used-cars/search?page={x}&sort=updated_desc')
    soup = BeautifulSoup(r.text, 'html.parser')
    cars = soup.select('.singleSearchCard')
    for car in cars:
        url = car.select_one('a.black-link')
        print(url.get_text(strip=True), url['href'])
Result printed in terminal:
Used BMW 7 Series 730Li 2018 /used-cars/bmw/7-series/2018/used-bmw-7-series-2018-jeddah-1294758
Used Infiniti QX80 5.6L Luxe (8 Seats) 2020 /used-cars/infiniti/qx80/2020/used-infiniti-qx80-2020-jeddah-1295458
Used Chevrolet Suburban 5.3L LS 2WD 2018 /used-cars/chevrolet/suburban/2018/used-chevrolet-suburban-2018-jeddah-1302084
Used Chevrolet Silverado 2016 /used-cars/chevrolet/silverado/2016/used-chevrolet-silverado-2016-jeddah-1297430
Used GMC Yukon 5.3L SLE (2WD) 2018 /used-cars/gmc/yukon/2018/used-gmc-yukon-2018-jeddah-1304469
Used GMC Yukon 5.3L SLE (2WD) 2018 /used-cars/gmc/yukon/2018/used-gmc-yukon-2018-jeddah-1304481
Used Chevrolet Impala 3.6L LS 2018 /used-cars/chevrolet/impala/2018/used-chevrolet-impala-2018-jeddah-1297427
Used Infiniti Q70 3.7L Luxe 2019 /used-cars/infiniti/q70/2019/used-infiniti-q70-2019-jeddah-1295235
Used Chevrolet Tahoe LS 2WD 2018 /used-cars/chevrolet/tahoe/2018/used-chevrolet-tahoe-2018-jeddah-1305486
Used Mercedes-Benz 450 SEL 2018 /used-cars/mercedes-benz/450-sel/2018/used-mercedes-benz-450-sel-2018-jeddah-1295830
[...]
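To get the per-car details the original code was after (name and model year), you can follow each relative link with the same cloudscraper session. A minimal sketch reusing the selectors from the question, which I'm assuming still match the detail pages:

import cloudscraper
from bs4 import BeautifulSoup

scraper = cloudscraper.create_scraper()
car_name, model_year = [], []

r = scraper.get('https://ksa.yallamotor.com/used-cars/search?page=1&sort=updated_desc')
soup = BeautifulSoup(r.text, 'html.parser')
for car in soup.select('.singleSearchCard'):
    href = car.select_one('a.black-link')['href']
    detail = BeautifulSoup(scraper.get('https://ksa.yallamotor.com' + href).text, 'html.parser')
    name = detail.find('h1', class_='font24')                       # selector taken from the question
    year = detail.find('div', class_='font14 text-center font-b m2t')
    if name and year:
        car_name.append(name.get_text(strip=True))
        model_year.append(year.get_text(strip=True))

print(list(zip(car_name, model_year))[:5])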

Beautiful Soup not working on this website

I want to scrape the URLs of all the items in the table, but when I try, nothing comes up. The code is quite basic, so I can see why it might not work. However, even when trying to scrape the title of this website, nothing comes up. I at least expected the h1 tag, as it's outside the table...
Website: https://www.vanguard.com.au/personal/products/en/overview
import requests
from bs4 import BeautifulSoup

lists = []
url = 'https://www.vanguard.com.au/personal/products/en/overview'

r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

title = soup.find_all('h1', class_='heading2 gbs-font-vanguard-red')
for links in soup.find_all('a', style='padding-bottom: 1px;'):
    link_text = links['href']
    lists.append(link_text)

print(title)
print(lists)
If the problem is caused by a JavaScript event listener, I would suggest using BeautifulSoup along with Selenium to scrape this website: let Selenium send the request and return the page source, then use BeautifulSoup to parse it.
In addition, you should use title = soup.find() instead of title = soup.find_all() in order to get only one title.
An example using Firefox:
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup

url = 'https://www.vanguard.com.au/personal/products/en/overview'

browser = webdriver.Firefox(executable_path=GeckoDriverManager().install())
browser.get(url)
soup = BeautifulSoup(browser.page_source, 'html.parser')
browser.close()

lists = []
title = soup.find('h1', class_='heading2 gbs-font-vanguard-red')
for links in soup.find_all('a', style='padding-bottom: 1px;'):
    link_text = links['href']
    lists.append(link_text)

print(title)
print(lists)
Output:
<h1 class="heading2 gbs-font-vanguard-red">Investment products</h1>
['/personal/products/en/detail/8132', '/personal/products/en/detail/8219', '/personal/products/en/detail/8121',...,'/personal/products/en/detail/8217']
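One caveat with grabbing browser.page_source immediately after browser.get(): on a slow connection the JavaScript-rendered content may not be there yet. A more robust variant (a sketch, assuming the h1 selector used above is what you want to wait for) is to wait explicitly before parsing:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

browser = webdriver.Firefox()  # set up the driver however you did above
browser.get('https://www.vanguard.com.au/personal/products/en/overview')

# Block for up to 15 seconds until the heading has been rendered by the page's JavaScript.
WebDriverWait(browser, 15).until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'h1.heading2.gbs-font-vanguard-red'))
)

soup = BeautifulSoup(browser.page_source, 'html.parser')
browser.quit()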
The most common problem with many modern pages: the page uses JavaScript to add elements, but requests/BeautifulSoup can't run JavaScript.
You may need to use Selenium to control a real web browser, which can run JavaScript.
This example uses only Selenium, without BeautifulSoup.
I use XPath, but you may also use CSS selectors.
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://www.vanguard.com.au/personal/products/en/overview'
lists = []

#driver = webdriver.Chrome(executable_path="/path/to/chromedriver.exe")
driver = webdriver.Firefox(executable_path="/path/to/geckodriver.exe")
driver.get(url)

title = driver.find_element(By.XPATH, '//h1[@class="heading2 gbs-font-vanguard-red"]')
print(title.text)

all_items = driver.find_elements(By.XPATH, '//a[@style="padding-bottom: 1px;"]')
for links in all_items:
    link_text = links.get_attribute('href')
    print(link_text)
    lists.append(link_text)
ChromeDriver (for Chrome)
GeckoDriver (for Firefox)
It's always more efficient to get the data from the source rather than going through Selenium. It looks like the links are built from the portId.
import pandas as pd
import requests

url = 'https://www3.vanguard.com.au/personal/products/funds.json'
payload = {
    'context': '/personal/products/',
    'countryCode': 'au.ret',
    'paths': "[[['funds','legacyFunds'],'AU']]",
    'method': 'get'}

jsonData = requests.get(url, params=payload).json()
results = jsonData['jsonGraph']['funds']['AU']['value']

df1 = pd.json_normalize(results, record_path=['children'])
df2 = pd.json_normalize(results, record_path=['listings'])
df = pd.concat([df1, df2], axis=0)

df['url_link'] = 'https://www.vanguard.com.au/personal/products/en/detail/' + df['portId'] + '/Overview'
Output:
print(df[['fundName', 'url_link']])
fundName url_link
0 Vanguard Active Emerging Market Equity Fund https://www.vanguard.com.au/personal/products/...
1 Vanguard Active Global Credit Bond Fund https://www.vanguard.com.au/personal/products/...
2 Vanguard Active Global Growth Fund https://www.vanguard.com.au/personal/products/...
3 Vanguard Australian Corporate Fixed Interest I... https://www.vanguard.com.au/personal/products/...
4 Vanguard Australian Fixed Interest Index Fund https://www.vanguard.com.au/personal/products/...
.. ... ...
23 Vanguard MSCI Australian Small Companies Index... https://www.vanguard.com.au/personal/products/...
24 Vanguard MSCI Index International Shares (Hedg... https://www.vanguard.com.au/personal/products/...
25 Vanguard MSCI Index International Shares ETF https://www.vanguard.com.au/personal/products/...
26 Vanguard MSCI International Small Companies In... https://www.vanguard.com.au/personal/products/...
27 Vanguard International Credit Securities Hedge... https://www.vanguard.com.au/personal/products/...
[66 rows x 2 columns]
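From there you can persist the result or visit each product page in turn. A small follow-up sketch (the CSV filename is just an example):

import requests

# Save the fund list for later use.
df.to_csv('vanguard_funds.csv', index=False)

# Or fetch each product page, reusing the url_link column built above.
for link in df['url_link'].dropna().head(3):
    resp = requests.get(link)
    print(link, resp.status_code)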

Extracting the required information from a script tag of a scraped webpage using BeautifulSoup

I'm a web-scraping novice and I am looking for pointers on what to do next, or potentially a working solution, to scrape the following webpage: https://www.capology.com/club/leicester/salaries/2019-2020/
I would like to extract the following for each row (player) of the table:
Player Name i.e. Jamie Vardy
Weekly Gross Base Salary (in GBP) i.e. £140,000
Annual Gross Base Salary (in GBP) i.e. £7,280,000
Position i.e. F
Age i.e. 33
Country i.e. England
The following code creates the 'soup' for the JavaScript table of information I want:
import requests
from bs4 import BeautifulSoup
import json
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'}
url = 'https://www.capology.com/club/leicester/salaries/2019-2020/'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
script = soup.find_all('script')[11].string # 11th script tag in the webpage
I can see that the 'soup' assigned to the script variable has all the information I need; however, I am struggling to extract that information as a pandas DataFrame.
I would subsequently like to set this up for pagination, to scrape each team in the 'Big 5' European leagues (Premier League, Serie A, La Liga, Bundesliga, and Ligue 1) for the 17-18, 18-19, 19-20, and 20-21 (current) seasons. However, that's the final-stage solution, and I am happy to go away and try to do that myself if it's a time-consuming request.
A working solution would be fantastic, but just some pointers so that I can go away and learn this stuff myself as efficiently as possible would be great.
Thanks very much!
This is a task best suited for a tool like Selenium, as the site uses the script to populate the page with the table after it loads, and it is not trivial to parse the values from the script source:
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import urllib.parse, collections, re

d = webdriver.Chrome('/path/to/chromedriver')
d.get((url := 'https://www.capology.com/club/leicester/salaries/2019-2020/'))

league_teams = d.execute_script("""
   var results = [];
   for (var i of Array.from(document.querySelectorAll('li.green-subheader + li')).slice(0, 5)){
      results.push({league:i.querySelector('.league-title').textContent,
                    teams:Array.from(i.querySelectorAll('select:nth-of-type(1).team-menu option')).map(x => [x.getAttribute('value'), x.textContent]).slice(1),
                    years:Array.from(i.querySelectorAll('select:nth-of-type(2).team-menu option')).map(x => [x.getAttribute('value'), x.textContent]).slice(2)})
   }
   return results;
""")

vals = collections.defaultdict(dict)
for i in league_teams:
    for y, full_year in [[re.sub('\d{4}\-\d{4}', '2020-2021', i['years'][0][0]), '2020-21'], *i['years']][:4]:
        for t, team in i['teams']:
            d.get(urllib.parse.urljoin(url, t) + (y1 := re.findall('/\d{4}\-\d{4}/', y)[0][1:]))
            hvals = [x.get_text(strip=True) for x in soup(d.page_source, 'html.parser').select('#table thead tr:nth-of-type(3) th')]
            tvals = soup(d.page_source, 'html.parser').select('#table tbody tr')
            full_table = [dict(zip(hvals, [j.get_text(strip=True) for j in k.select('td')])) for k in tvals]
            if team not in vals[i['league']]:
                vals[i['league']][team] = {full_year: None}
            vals[i['league']][team][full_year] = full_table
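Since the original goal was a pandas DataFrame, the nested vals structure can be flattened afterwards. A minimal sketch (the per-row column names depend on whatever table headers the site returns, so treat them as placeholders):

import pandas as pd

records = []
for league, teams in vals.items():
    for team, seasons in teams.items():
        for season, rows in seasons.items():
            for row in rows or []:
                # Each row is already a header->cell dict built above.
                records.append({'league': league, 'team': team, 'season': season, **row})

df = pd.DataFrame(records)
print(df.head())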

How to webscrape reviews from external links with bs4?

I would like to extract at least 20 user reviews for each movie, but I don't know how to loop through the IMDb title pages and then into the user reviews with BeautifulSoup.
start link = "https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250";
title_link(1) = "https://www.imdb.com/title/tt7131622/?ref_=adv_li_tt";
user_reviews_link_movie1 = "https://www.imdb.com/title/tt7131622/reviews?ref_=tt_ov_rt" ;
I am able to extract the titles, years, ratings and metascores of each movie in the list from the static page.
# Import packages and set urls
from requests import get
url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250'
response = get(url)
print(response.text[:500])

from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
print(type(movie_containers))
print(len(movie_containers))

# Lists to store the scraped data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Extract data from individual movie container
for container in movie_containers:
    # If the movie has Metascore, then extract:
    if container.find('div', class_ = 'ratings-metascore') is not None:
        # The name
        name = container.h3.a.text
        names.append(name)
        # The year
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years.append(year)
        # The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        # The Metascore
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))

import pandas as pd
test_df = pd.DataFrame({'movie': names, 'year': years, 'imdb': imdb_ratings, 'metascore': metascores})
test_df
Actual results :
movie year imdb metascore
Once Upon a Time... in Hollywood (2019) (8.1) (83)
Scary Stories (2019) (6.5) (61)
Fast & Furious: Hobbs & Shaw (2019) (6.8) (60)
Avengers: Endgame (2019) (8.6) (78)
Expected :
movie1 year1 imdb1 metascore1 review1
movie1 year1 imdb1 metascore1 review2
...
movie1 year1 imdb1 metascore1 review20
movie2 year2 imdb2 metascore2 review1
...
movie2 year2 imdb2 metascore2 review20
...
movie250 year250 imdb250 metascore250 review20
Assuming that the answer to my question in the comments is "yes", below is a solution to your initial request.
There's a check whether a particular film really has 20 reviews; if there are fewer, all available ones are gathered.
Technically the parsing process is correct - I checked it with movie_containers = movie_containers[:3]. Gathering all the data will take some time.
UPDATE: I just finished collecting info on all 250 films - everything was scraped without errors, so the block after the solution itself is just FYI.
Also, if you want to go further with your parsing - I mean collect data for the next 250 films and so on - you can add one more looping level to this parser (see the pagination sketch after the code below). The process is similar to the one in the "Reviews extracting" section.
# Import packages and set urls
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

base_url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250'
url_header_for_reviews = 'https://www.imdb.com'
url_tail_for_reviews = 'reviews?ref_=tt_urv'

base_response = get(base_url)
html_soup = BeautifulSoup(base_response.text, 'html.parser')
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')

result_df = pd.DataFrame()

# Extract data from individual movie container
for container in movie_containers:
    # If the movie has Metascore, then extract:
    if container.find('div', class_ = 'ratings-metascore') is not None:
        # Reviews extracting
        num_reviews = 20
        # Getting last piece of link puzzle for a movie reviews` link
        url_middle_for_reviews = container.find('a')['href']
        # Opening reviews page of a concrete movie
        response_reviews = get(url_header_for_reviews + url_middle_for_reviews + url_tail_for_reviews)
        reviews_soup = BeautifulSoup(response_reviews.text, 'html.parser')
        # Searching all reviews
        reviews_containers = reviews_soup.find_all('div', class_ = 'imdb-user-review')
        # Check if actual number of reviews is less than target one
        if len(reviews_containers) < num_reviews:
            num_reviews = len(reviews_containers)
        # Looping through each review and extracting title and body
        reviews_titles = []
        reviews_bodies = []
        for review_index in range(num_reviews):
            review_container = reviews_containers[review_index]
            review_title = review_container.find('a', class_ = 'title').text.strip()
            review_body = review_container.find('div', class_ = 'text').text.strip()
            reviews_titles.append(review_title)
            reviews_bodies.append(review_body)
        # The name
        name = container.h3.a.text
        names = [name for i in range(num_reviews)]
        # The year
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years = [year for i in range(num_reviews)]
        # The IMDB rating
        imdb_rating = float(container.strong.text)
        imdb_ratings = [imdb_rating for i in range(num_reviews)]
        # The Metascore
        metascore = container.find('span', class_ = 'metascore').text
        metascores = [metascore for i in range(num_reviews)]
        # Gathering up scraped data into result_df
        if result_df.empty:
            result_df = pd.DataFrame({'movie': names, 'year': years, 'imdb': imdb_ratings, 'metascore': metascores, 'review_title': reviews_titles, 'review_body': reviews_bodies})
        elif num_reviews > 0:
            result_df = result_df.append(pd.DataFrame({'movie': names, 'year': years, 'imdb': imdb_ratings, 'metascore': metascores, 'review_title': reviews_titles, 'review_body': reviews_bodies}))
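As for the extra looping level over further result pages mentioned above, here is a minimal sketch. It assumes the search URL accepts a start offset (the classic &start= parameter), which you should verify against the current site:

from requests import get
from bs4 import BeautifulSoup

base_url = ('https://www.imdb.com/search/title/'
            '?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250')

for start in range(1, 1001, 250):           # offsets 1, 251, 501, 751
    page_url = f'{base_url}&start={start}'  # assumed pagination parameter
    html_soup = BeautifulSoup(get(page_url).text, 'html.parser')
    movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')
    print(start, len(movie_containers))
    # ...run the same per-container extraction as in the solution above...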
Btw, I'm not sure that IMDB will let you gather data for all films in a loop as is. There's a possibility that you'll get a captcha or a redirect to some other page. If these issues appear, I'd go with a simple solution - pauses in scraping and/or changing user-agents.
Pause (sleep) can be implemented as follows:
import time
import numpy as np
time.sleep((30-5)*np.random.random()+5) #from 5 to 30 seconds
Inserting a user-agent into a request can be done as follows:
import requests
from bs4 import BeautifulSoup
url = ('http://www.link_you_want_to_make_request_on.com/bla_bla')
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
Google for some other user-agent variants, make a list of them, and rotate them from time to time in subsequent requests (see the sketch below). Watch out which user-agents you use, though - some of them indicate mobile or tablet devices, and for those a site (not only IMDB) can serve response pages in a format that differs from the PC one: different markup, different design, etc. So in general the above algorithm only works for the PC version of the pages.
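A minimal sketch of that rotation; the user-agent strings below are just desktop examples, swap in whatever list you settle on:

import random
import requests

desktop_user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
]

def fetch(url):
    # Pick a different desktop user-agent for each request.
    headers = {'User-Agent': random.choice(desktop_user_agents)}
    return requests.get(url, headers=headers)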

How to use beautiful soup to extract elements from collapsible section

I'm developing a Python scraper using BeautifulSoup 4 and I'm having difficulty scraping the information in a collapsible section on this page: https://www.redfin.com/CA/Los-Angeles/1366-W-22nd-St-90007/home/6896268.
The collapsible section I want to scrape is "Property History for 1366 West 22nd St". The information I'm trying to get is the "date" column and the "price" column.
import ssl
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.redfin.com/CA/Los-Angeles/1366-W-22nd-St-90007/home/6896268"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
req = urllib.request.Request(url, headers=headers)
res = urllib.request.urlopen(req, context=ssl.SSLContext())
soup = BeautifulSoup(res, 'html.parser')
dates = [td.text for td in soup.find_all('td', {"class": "date-col nowrap"})]
However, the dates I scraped from the date column are only Oct 29, 2018, Aug 24, 2018 and Aug 24, 2018, because soup.find_all('td', {"class": "date-col nowrap"}) cannot find the rest of the dates. The remaining dates are collapsed, and the "See all property history" button has to be clicked to unfold them. Is there any way to scrape the collapsed dates using Selenium?
Here's the code, which should work; it returns the table as a dictionary of tuples.
import selenium
from selenium import webdriver
import time

url = "https://www.redfin.com/CA/Los-Angeles/1366-W-22nd-St-90007/home/6896268"

def browser():
    driver = webdriver.Chrome()
    driver.get(url)
    return driver

def main():
    driver = browser()
    el = driver.find_element_by_xpath('//span[contains(text(), "See all property history")]')
    el.click()
    # should expand quite quickly, otherwise might need to wait, e.g. time.sleep(5)
    row_arg = "//tr[@class=' PropertyHistoryEventRow']"  # take note of the space before 'Property'
    rows = driver.find_elements_by_xpath(row_arg)
    tbl = {}
    for i, row in enumerate(rows):
        date = row.find_element_by_xpath('.//td[@class="date-col nowrap"]').text
        event = row.find_element_by_xpath('.//td[@class="event-col"]').text
        price = row.find_element_by_xpath('.//td[@class="price-col number"]').text
        appre = row.find_element_by_xpath('.//td[@class="appreciation-col number empty"]').text
        tbl[i] = (date, event, price, appre)
    for k, v in tbl.items():
        print(k, v)
    return tbl
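Instead of the time.sleep(5) fallback mentioned in the comment, you could wait explicitly for the expanded rows to appear before reading them. A small sketch using Selenium's explicit waits (the XPath is the same one used above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_rows(driver, timeout=10):
    # Block until at least one property-history row is present in the DOM.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, "//tr[@class=' PropertyHistoryEventRow']"))
    )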
