Web scraping BeautifulSoup find_all doesn't work on APEC - python

The source code of the page is as shown in the picture.
I want to find_all the div with class container-result, but it doesn't work: I get an empty list.
My code:
from bs4 import BeautifulSoup, NavigableString, Tag
import requests
import urllib.request

url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi?page="
for page in range(0, 10, 1):
    r = requests.get(url + str(page))
    soup = BeautifulSoup(r.content, "html.parser")
    ancher = soup.find_all('div', attrs={'class': 'container-result'})
    print(ancher)

As the web page is rendered by JavaScript, requests / BeautifulSoup cannot retrieve the DOM elements you need: they are only added after the page has been rendered in a browser. You could use Selenium for this purpose; here is an example:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# delay for selenium web driver wait
DELAY = 30

# create selenium driver
chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--headless')
#chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome('<<PATH TO chromedriver>>', options=chrome_options)

# iterate over pages
for page in range(0, 10, 1):
    # open web page
    driver.get(f'https://www.apec.fr/candidat/recherche-emploi.html/emploi?page={page}')
    # wait for element with class 'container-result' to be added
    container_result = WebDriverWait(driver, DELAY).until(
        EC.presence_of_element_located((By.CLASS_NAME, "container-result")))
    # scroll to container-result
    driver.execute_script("arguments[0].scrollIntoView();", container_result)
    # get source HTML of the container-result element
    source = container_result.get_attribute('innerHTML')
    # print source
    print(source)
    # here you can continue working with the source variable, either with the Selenium API
    # or with the BeautifulSoup API:
    # soup = BeautifulSoup(source, "html.parser")

# quit webdriver
driver.quit()
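If you want to continue with the BeautifulSoup route mentioned in the loop's last comment, a minimal sketch of what could replace the print(source) line is shown below. The link-based markup it assumes is a guess at apec.fr's offer structure, not something confirmed in the question, so adjust the selectors as needed:

from bs4 import BeautifulSoup

# parse the innerHTML captured from the container-result element
soup = BeautifulSoup(source, "html.parser")

# hypothetical: list every link inside the container; the real offer markup
# on apec.fr may use different tags or classes
for link in soup.find_all('a', href=True):
    print(link.get_text(strip=True), '->', link['href'])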

Related

How to scrape text from a hidden element?

I am trying to scrape the text of the Swiss constitution from Link and convert it to Markdown. However, the page source is different from what I see in the inspector: the source only contains noscript warnings in various languages, with the element "app-root" hidden.
The inspector shows a .html file served from here with which I am able to get the desired result. However, using this file directly would not allow me to scrape the subsequent revisions of the law automatically. Is there a way to extract the page source with the element "app-root" displayed?
This code returns "None" but works with the URL set to the .html file:
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver import FirefoxOptions
from bs4 import BeautifulSoup
from markdownify import markdownify
url = "https://www.fedlex.admin.ch/eli/cc/1999/404/en"
opts = FirefoxOptions()
opts.add_argument("--headless")
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(), options=opts)
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
div = soup.find("div", {"id": "lawcontent"})
content = markdownify(str(div))
print(content[:200])
Any help is much appreciated.
In your code, you're not giving the driver any time to render the contents, so you end up with incomplete source code.
Waits can be used to wait for required elements to be visible/present. The code below waits for the div with id lawcontent to be visible and then parses the page source.
Code snippet-
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

url = "https://www.fedlex.admin.ch/eli/cc/1999/404/en"
driver.get(url)
try:
    delay = 20  # 20-second delay
    WebDriverWait(driver, delay).until(EC.visibility_of_element_located((By.ID, 'lawcontent')))
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find("div", {"id": "lawcontent"})
    content = markdownify(str(div))
    print(content[:200])
# a TimeoutException is raised if the element is not visible within the delay
except TimeoutException:
    print("Timeout!!!")

Python web crawling result is less than expected

I tried using Selenium to crawl the web data. The site loads all 346 products after clicking the load-more button a few times in the browser; however, my scrape only shows 96 / 346 products instead of 346 / 346. Any idea how to fix it? I have already put the crawling code right after the while True loop that clicks the load-more button.
screen capture of the result
from urllib.request import urlopen
import requests
import ast
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

driver = webdriver.Chrome('e:/Users/fungc1/Documents/chromedriver.exe')
options = Options()

url = "https://www.toysrus.com.sg/lego"
#data = soup.findAll('div', attrs={'class': 'card-image-wrapper'})
#toc = soup.find_all('div', attrs={'class': 'result-count text-center'})

driver.get(url)
driver.maximize_window()
time.sleep(5)
driver.find_element_by_link_text("STAY ON THE SINGAPORE SITE").click()

while True:
    try:
        driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
        wait = WebDriverWait(driver, 10)
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.btn[data-url*='www.toysrus.com']"))).click()
        time.sleep(5)
    except Exception as e:
        print(e)
        break

time.sleep(5)
response = requests.get(url)
response_text = response.text
soup = BeautifulSoup(response_text, 'lxml')
text = urlopen(url).read()
soup = BeautifulSoup(text)
data = soup.findAll('div', attrs={'class': 'card-image-wrapper'})
toc = soup.find_all('div', attrs={'class': 'result-count text-center'})

emptylist2 = []
for item in toc:
    print(item.text.strip()[:-1])
for div in data:
    links = div.findAll('a')
    for a in links:
        catalogueresult = ast.literal_eval("" + a['href'][1:-5][-7:])
        emptylist2.append(catalogueresult)
print(emptylist2)
You are mixing a few things.
You opened the browser with Selenium and loaded all the items by clicking the load-more button. But after that you use the requests library to fetch the HTML again from the URL, which has nothing to do with Selenium. You are doing two separate things; in your case, even if you removed the Selenium code you would get the same result, because you are not using Selenium after loading all the products.
Now, what you need to do is ask Selenium to return the HTML with all 346 products loaded, so that you can hand it to BeautifulSoup for further parsing.
To do that, you don't need the requests and urlopen lines after your while loop ends. Do something like this:
html = driver.page_source  # returns the HTML with all products loaded
soup = BeautifulSoup(html, 'lxml')

With this you will get all 346 products.
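For completeness, here is a minimal sketch of how the tail of the script could look once the requests/urlopen lines are dropped. It reuses the selectors from the question, which are assumed to still match the site's markup:

# ... after the while True loop that clicks the load-more button has finished ...
html = driver.page_source               # HTML with all products loaded
soup = BeautifulSoup(html, 'lxml')

data = soup.find_all('div', attrs={'class': 'card-image-wrapper'})
toc = soup.find_all('div', attrs={'class': 'result-count text-center'})

for item in toc:
    print(item.text.strip()[:-1])       # should now report 346 / 346

product_links = []
for div in data:
    for a in div.find_all('a'):
        product_links.append(a['href'])
print(len(product_links), "product links collected")

driver.quit()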

How can I scrape career path job titles from this javascript page using Python

How can I scrape career path job titles from this javascript page using Python?
'https://www.dice.com/career-paths?title=PHP%2BDeveloper&location=San%2BDiego,%2BCalifornia,%2BUs,%2BCA&experience=0&sortBy=mostProbableTransition'
This is my code snippet; the returned soup doesn't have any of the text data I need!
import requests
from bs4 import BeautifulSoup
import json
import re
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# get BeautifulSoup object
def get_soup(url):
    """
    This function returns the BeautifulSoup object.

    Parameters:
        url: the link to get soup object for
    Returns:
        soup: BeautifulSoup object
    """
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    return soup

# get selenium driver object
def get_selenium_driver():
    """
    This function returns the selenium driver object.

    Parameters:
        None
    Returns:
        driver: selenium driver object
    """
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(executable_path=r"geckodriver", firefox_options=options)
    return driver

# get soup obj using selenium
def get_soup_using_selenium(url):
    """
    Given the url of a page, this function returns the soup object.

    Parameters:
        url: the link to get soup object for
    Returns:
        soup: soup object
    """
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(executable_path=r"geckodriver", firefox_options=options)
    driver.get(url)
    driver.implicitly_wait(3)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    driver.close()
    return soup

title = "PHP%2BDeveloper"
location = "San%2BDiego,%2BCalifornia,%2BUs,%2BCA"
years_of_experience = "0"
sort_by_filter = "mostProbableTransition"

url = "https://www.dice.com/career-paths?title={}&location={}&experience={}&sortBy={}".format(title, location, years_of_experience, sort_by_filter)

career_paths_page_soup = get_soup(url)
Like another user has mentioned in the comments, requests won't work for you here. But using Selenium, you can scrape the page contents, using WebDriverWait to ensure all page content has loaded and element.text to fetch the text of each element.
The following code snippet will print the career path strings on the left side of the page:
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# navigate to the page
driver = get_selenium_driver()
driver.get(url)

# wait for loading indicator to be hidden
WebDriverWait(driver, 10).until(EC.invisibility_of_element((By.XPATH, "//*[contains(text(), 'Loading data')]")))

# wait for content to load
career_path_elements = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='abcd']/ul/li")))

# print out career paths
for element in career_path_elements:
    # get title attribute that usually contains the career path text
    title = element.get_attribute("title")
    # sometimes the career path is in a span below this element
    if not title:
        # find the span and print its text
        span_element = element.find_element_by_xpath("span[not(contains(@class, 'currentJobHead'))]")
        print(span_element.text)
    # print title in other cases
    else:
        print(title)
This prints the following:
PHP Developer
Drupal Developer
Web Developer
Full Stack Developer
Back-End Developer
Full Stack PHP Developer
IT Director
Software Development Manager
There are a few items of interest here. The main one is the JavaScript loading on this page: upon first opening the page, a "Loading data..." indicator appears. We must wait with EC.invisibility_of_element for this item to ensure it has disappeared before we attempt to locate any page content.
After that, we invoke WebDriverWait once again, but this time on the "Career path" elements on the right hand side of the page. This WebDriverWait call returns a list of elements, stored in career_path_elements. We can loop through this list of elements to print the career path of each item.
Each career path element contains the career path text in the title attribute, so we call element.get_attribute("title") to fetch that text. However, there is a special case for the 'Current Job Title' item where the career path text is contained in a span one level lower. We handle cases where title is empty by calling element.find_element_by_xpath() to locate the span instead. This ensures we can print every career path item on the page.

Source data doesn't match actual content when scraping dynamic content with Beautiful Soup + Selenium

I'm trying to teach myself how to scrape data and found a nice dynamic website to test this on (releases.com in this case).
Since it's dynamic, I figured I'd have to use selenium to fetch its data.
However, the retrieved page source still only contains the initial HTML and its JS, not the actual elements shown in the browser.
Why does that happen?
I'm assuming it's because I'm fetching the page source, but what other option is there?
My code looks like this:
from bs4 import BeautifulSoup as soup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
import chromedriver_binary
import time
#constants
my_url = "https://www.releases.com/l/Games/2018/1/"
# Start the WebDriver and load the page
wd = webdriver.Chrome()
wd.get(my_url)
# Wait for the elements to appear
time.sleep(10)
# And grab the page HTML source
html_page = wd.page_source
wd.quit()
#Make soup
pageSoup = soup(html_page, "html.parser")
#Get data
print(pageSoup.text)
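One option, in line with the other answers above, is to replace the fixed time.sleep with an explicit wait on an element that only exists once the JavaScript has rendered. A minimal sketch follows; the class name used in the wait is a placeholder, since the actual markup of releases.com is not shown here:

from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

my_url = "https://www.releases.com/l/Games/2018/1/"

wd = webdriver.Chrome()
wd.get(my_url)

# hypothetical selector: wait until at least one rendered release entry is present;
# replace 'calendar-item' with whatever class the rendered items actually use
WebDriverWait(wd, 20).until(
    EC.presence_of_element_located((By.CLASS_NAME, "calendar-item")))

html_page = wd.page_source
wd.quit()

pageSoup = soup(html_page, "html.parser")
print(pageSoup.text[:500])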

python - web scraping an ajax website using BeautifulSoup

I am trying to scrape an e-commerce site that uses AJAX calls to load its next pages.
I am able to scrape the data present on page 1, but page 2 loads automatically through an AJAX call when I scroll page 1 to the bottom.
My code :
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq

my_url = 'http://www.shopclues.com/mobiles-smartphones.html'
page = ureq(my_url).read()
page_soup = soup(page, "html.parser")
containers = page_soup.findAll("div", {"class": "column col3"})
for container in containers:
    name = container.h3.text
    price = container.find("span", {'class': 'p_price'}).text
    print("Name : " + name.replace(",", " "))
    print("Price : " + price)

for i in range(2, 7):
    my_url = "http://www.shopclues.com/ajaxCall/moreProducts?catId=1431&filters=&pageType=c&brandName=&start=" + str(36*(i-1)) + "&columns=4&fl_cal=1&page=" + str(i)
    page = ureq(my_url).read()
    print(page)
    page_soup = soup(page, "html.parser")
    containers = page_soup.findAll("div", {"class": "column col3"})
    for container in containers:
        name = container.h3.text
        price = container.find("span", {'class': 'p_price'}).text
        print("Name : " + name.replace(",", " "))
        print("Price : " + price)
I printed the AJAX page read by ureq to check whether I am able to open the AJAX page at all, but the output of print(page) is just an empty byte string, b' '.
Please provide me a solution to scrape the remaining data.
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as soup
import random
import time

chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)

# A randomizer for the delay
seconds = 5 + (random.random() * 5)

# create a new Chrome session
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.implicitly_wait(30)
# driver.maximize_window()

# navigate to the application home page
driver.get("http://www.shopclues.com/mobiles-smartphones.html")
time.sleep(seconds)
time.sleep(seconds)

# Add more to range for more phones
for i in range(1):
    element = driver.find_element_by_id("moreProduct")
    driver.execute_script("arguments[0].click();", element)
    time.sleep(seconds)
    time.sleep(seconds)

html = driver.page_source
page_soup = soup(html, "html.parser")
containers = page_soup.findAll("div", {"class": "column col3"})

for container in containers:
    # Add error handling
    try:
        name = container.h3.text
        price = container.find("span", {'class': 'p_price'}).text
        print("Name : " + name.replace(",", " "))
        print("Price : " + price)
    except AttributeError:
        continue

driver.quit()
I used Selenium to load the website and click the button to load more results, then took the resulting HTML and fed it into your parsing code.
