Is this website scrape-able with BeautifulSoup? - python

I would like to scrape this website: https://www.projets-environnement.gouv.fr/pages/home/
More precisely, I would like to collect the table in the div with id = table-wrapper.
The trouble is that I can't catch it with BeautifulSoup.
Here is my code:
import requests
from bs4 import BeautifulSoup

url = 'https://www.projets-environnement.gouv.fr/pages/home/'
html = requests.get(url).text
soup = BeautifulSoup(html, "html5lib")
div_table = soup.findAll('div', id_='table-wrapper')
But div_table comes back empty.
Is Selenium the solution?

I think you should use selenium:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
url = 'https://www.projets-environnement.gouv.fr/pages/home/'
options = Options()
options.add_argument("-headless")  # run Firefox without a visible window
driver = webdriver.Firefox(options=options)
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
mytable = soup.find('div', id='table-wrapper')
and you get that table.
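If the wrapper does contain a regular HTML table once JavaScript has run, you could then walk its rows; a minimal sketch, assuming standard <tr>/<td> markup:

# Sketch: iterate the rendered table's rows (assumes standard <tr>/<td> markup).
if mytable is not None:
    for row in mytable.find_all('tr'):
        cells = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
        print(cells)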

The correct way to call it is:
soup.find("div", {"id": "table-wrapper"})
id_ is not a keyword BeautifulSoup recognizes; only class gets the trailing underscore (class_), because class is a reserved word in Python. That is why the original call matched nothing.

Related

Get 'None' from website by using beautifulSoup

I am a new joiner doing self-study on crawling. I tried to get information from the Disneyland site:
https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j
I tried to crawl the price from the website, but it returns None; the result should be HK$639.
import requests
from bs4 import BeautifulSoup

url5 = 'https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j'
r = requests.get(url5)
sp = BeautifulSoup(r.content, 'html.parser')
door = sp.find('div', class_='container')
price = door.find('p', class_='price')
print(price)
My mental model of BeautifulSoup: it parses the page's HTML, and I can use find/find_all to look up information by tag ('div', 'p') and by class. Please correct me if this is wrong, thank you.
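That model does work on static HTML; for instance, this inline snippet (made up to mirror the structure I'm after) behaves exactly as expected:

from bs4 import BeautifulSoup

html = '<div class="container"><p class="price">HK$639</p></div>'
soup = BeautifulSoup(html, 'html.parser')

door = soup.find('div', class_='container')
print(door.find('p', class_='price').text)  # HK$639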
The page is loaded by JavaScript, so to pull out the desired data you can use Selenium together with bs4:
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--no-sandbox")
options.add_argument("start-maximized")
options.add_experimental_option("detach", True)
webdriver_service = Service("./chromedriver") #Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service,options=options)
url = 'https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j'
driver.get(url)
#driver.maximize_window()
time.sleep(10)
soup=BeautifulSoup(driver.page_source, 'lxml')
price = soup.select_one('p:-soup-contains("General Admission:") > strong').text
print(price)
Output:
HK$639
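As a side note, the fixed time.sleep(10) could be swapped for an explicit wait, so the script proceeds as soon as the element exists; a sketch of that variant (the XPath mirrors the text the selector above matches on):

# Sketch: wait for the price element explicitly instead of sleeping a fixed 10s.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

price_el = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located(
        (By.XPATH, '//p[contains(., "General Admission:")]/strong')
    )
)
print(price_el.text)  # expected: HK$639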

How to scrape this using bs4

I have to get <a class="last" aria-label="Last Page" href="https://webtoon-tr.com/webtoon/page/122/">Son »</a> from this site: https://webtoon-tr.com/webtoon/
But when I try to scrape it with this code:
from bs4 import BeautifulSoup
import requests
url = "https://webtoon-tr.com/webtoon/"
html = requests.get(url).content
soup = BeautifulSoup(html,"html.parser")
last = soup.find_all("a",{"class":"last"})
print(last)
It just returns an empty list, and when I try to scrape all "a" tags it only returns 2, which are completely different elements.
Can somebody help me with this? I'd really appreciate it.
Try using the requests_html library.
from bs4 import BeautifulSoup
import requests_html
url = "https://webtoon-tr.com/webtoon/"
s = requests_html.HTMLSession()
html = s.get(url)
soup = BeautifulSoup(html.content, "lxml")
last = soup.find_all("a", {"class": "last"})
print(last)
[<a aria-label="Last Page" class="last" href="https://webtoon-tr.com/webtoon/page/122/">Son »</a>]
The website is protected by Cloudflare. requests, cloudscraper and requests_html don't work for me, only Selenium:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
browser.get("https://webtoon-tr.com/webtoon/")
soup = BeautifulSoup(browser.page_source, 'html5lib')
browser.quit()
link = soup.select_one('a.last')
print(link)
This returns
<a aria-label="Last Page" class="last" href="https://webtoon-tr.com/webtoon/page/122/">Son »</a>

How to scrape Google Hot Trends

I am trying to scrape Google Hot Trends. I tried to run Chrome developer tools to capture all requests, but it seems there are no requests in or out. So I tried Selenium, but I could not get the data, since it is variable and changes constantly. Here is the code I tried:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

options = options.Options()
options.headless = True
options.add_argument("--headless")
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"

def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        driver.get(target)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("tile"):
            titles = [title for title in soup.find("div", class_="tile")]
            if len(titles) > 0:
                print(titles)

HeadlessBrowserHttpRequest(url)
Your code looks correct.
The only point I see you are missing: you have to extract the text from the elements you get here.
Also, I'd prefer printing the texts one by one, not the whole array at once.
As follows:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

options = options.Options()
options.headless = True
options.add_argument("--headless")
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"

def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        driver.get(target)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("tile"):
            titles = [title for title in soup.find("div", class_="tile")]
            if len(titles) > 0:
                for title in titles:
                    print(title.text)

HeadlessBrowserHttpRequest(url)
I managed to solve the problem with the following code:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

options = options.Options()
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"

def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    driver.get(target)
    while True:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("div", {"class": "card current done-typing"}):
            titles = [title for title in soup.find("div", class_="card current done-typing")]
            if len(titles) > 0:
                for title in titles:
                    print(title.text)

HeadlessBrowserHttpRequest(url)
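If a single snapshot is enough (rather than a continuous feed), the polling loop could also be replaced with an explicit wait; a sketch under that assumption, reusing the same class name:

# Sketch: wait once for a fully typed card instead of polling page_source forever.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

card = WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.card.current.done-typing'))
)
print(card.text)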

BeautifulSoup4 Scrape Pages Which URL Does Not Change [Python]

So I want to scrape all the dates from Clash of Stats. There are multiple pages, and when you turn the page the URL does not change. How do I scrape all the dates on which the player has joined a new clan?
The website:
https://www.clashofstats.com/players/pink-panther-VL029CJ2/history/log
My code now:
from emoji import UNICODE_EMOJI
import requests
from bs4 import BeautifulSoup

name = "Pink Panther"  # defined so the emoji-stripping loop below works
link = f'https://www.clashofstats.com/players/{name}-{str("#VL029CJ2").replace("#", "")}/history/log'
link = link.replace("#", "%2523")
link = link.replace("#", "%2540")
link = link.replace(" ", "-")
print(link)

for i in name:
    if i in UNICODE_EMOJI:
        link = link.replace(i, "")

page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
dates = soup.find_all(class_="start date")
print(dates)
You should use Selenium:
pip install selenium
and download the Chrome driver, for example.
Then the code will look something like this:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException

options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome('chromedriver.exe', options=options)

link = 'https://www.clashofstats.com/players/pink-panther-VL029CJ2/history/log'
driver.get(link)

while True:
    try:
        # parse what the browser currently shows; the URL never changes,
        # so re-requesting it with requests would always return page 1
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        dates = soup.find_all(class_="start date")
        print(dates)
        # next_page_link = driver.find_element_by_xpath('path_to_element')
        # next_page_link = driver.find_elements_by_class_name('class_name')
        print(next_page_link)
        next_page_link.click()
        print("Navigating to Next Page")
    except (TimeoutException, WebDriverException) as e:
        print("Last page reached")
        break

driver.quit()
But you need to find the XPath of the element, or locate the element via its class name.
Locate elements in Selenium
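For reference, current Selenium versions have dropped the find_element_by_* helpers in favour of the unified API; a sketch with a hypothetical selector for the next-page control:

from selenium.webdriver.common.by import By

# Sketch: locate the next-page control with the unified find_element API.
# 'button.next' is a hypothetical selector; inspect the page for the real one.
next_page_link = driver.find_element(By.CSS_SELECTOR, 'button.next')
next_page_link.click()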

Incomplete HTML-response on some sites using Requests & BeautifulSoup or Selenium

I'm trying to scrape information from some URLs using Requests and BeautifulSoup in Python, but some sites only return a partial HTML response that is missing the content of the page.
This is the code that is not working:
import requests
from bs4 import BeautifulSoup
url = "http://www.exampleurl.com"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
Here is the incomplete response: [screenshot of the truncated HTML]
I tried to use Selenium with Chrome Webdriver instead, but ended up with the same issue.
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)
browser.get(url)
html = browser.page_source
Any ideas?
What happens
You do not get the expected HTML because the content sits in an iframe.
Try to get the src of the iframe with soup.find('iframe')['src'] and request that URL again.
Example
import requests
from bs4 import BeautifulSoup
url = "http://www.ingenieur-jobs.de/jobangebote/3075/"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
iframe = requests.get(soup.find('iframe')['src'])
soup = BeautifulSoup(iframe.content, 'html.parser')
soup
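One caveat worth adding: an iframe src can be relative, in which case it has to be resolved against the page URL before requesting it; a sketch of the same example with that guard:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = "http://www.ingenieur-jobs.de/jobangebote/3075/"
page = BeautifulSoup(requests.get(url).content, 'html.parser')

# urljoin leaves absolute src values untouched and resolves relative ones.
src = page.find('iframe')['src']
soup = BeautifulSoup(requests.get(urljoin(url, src)).content, 'html.parser')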
