I am a new joiner doing self-study on web crawling. I tried to get information from the Hong Kong Disneyland site:
https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j
I tried to crawl the price from the page, but it returns "None"; the result should be HK$639.
import requests
from bs4 import BeautifulSoup

url5 = 'https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j'
r = requests.get(url5)
sp = BeautifulSoup(r.content, 'html.parser')
door = sp.find('div', class_='container')
price = door.find('p', class_='price')
print(price)
My understanding of BeautifulSoup: it parses the page's HTML, and I can use find/find_all to locate information by tag ('div', 'p') and by its class. Please correct me if this is wrong, thank you.
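That mental model is right for static HTML. A minimal sketch against a made-up snippet (the markup below is for illustration, not the real page):

from bs4 import BeautifulSoup

html = '<div class="container"><p class="price">HK$639</p></div>'  # toy markup, not the real page
sp = BeautifulSoup(html, 'html.parser')
print(sp.find('p', class_='price').text)  # HK$639

On markup like this, find() behaves exactly as described; the Disneyland page fails for a different reason, as the answer below explains.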
The page is loaded by JavaScript, so to pull out the desired data you can use Selenium with bs4:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--no-sandbox")
options.add_argument("start-maximized")
options.add_experimental_option("detach", True)
webdriver_service = Service("./chromedriver") #Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service, options=options)
url = 'https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j'
driver.get(url)
#driver.maximize_window()
time.sleep(10)
soup = BeautifulSoup(driver.page_source, 'lxml')
price = soup.select_one('p:-soup-contains("General Admission:") > strong').text
print(price)
Output:
HK$639
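As a side note, the fixed time.sleep(10) can be swapped for an explicit wait, so the script continues as soon as the element shows up. A sketch, assuming the rendered page really contains the p.price element the question's selector targets:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 15 seconds for a price paragraph to be rendered
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "p.price"))
)
soup = BeautifulSoup(driver.page_source, 'lxml')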
I am trying to crawl this site "https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/feedback_en?p_id=24212003"
but I am getting only the header and a few body elements; I am unable to get the full paragraph content and the links of the pages.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.headless = True
options.add_argument("--window-size=1920,1200")
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)  # DRIVER_PATH: path to your chromedriver
driver.get("https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/feedback_en?p_id=24212003&page=1")
print(driver.page_source)
driver.quit()
So the response has none of the href attributes and tags that I need.
(Screenshots of "what I need from the site" and the result output were attached to the original post.)
Pass some experimental options and scrape the data:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# set chrome options and run headless
chrome_options = Options()
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation",
'disable-component-update',
'ignore-certificate-errors'])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options, executable_path=DRIVER_PATH)
driver.get("https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/feedback_en?p_id=24212003&page=1")
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
# find all <a> tags and get the href using dict comprehension
d = {x.text: x['href'] for x in soup.find_all('a', href=True)}
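If the render worked, d maps each link's text to its URL. A quick sanity check (the slice size is arbitrary):

# print the first few scraped links to verify the page rendered
for text, href in list(d.items())[:5]:
    print(text, '->', href)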
I have to get <a class="last" aria-label="Last Page" href="https://webtoon-tr.com/webtoon/page/122/">Son »</a>.
From this site: https://webtoon-tr.com/webtoon/
But when I try to scrape it with this code:
from bs4 import BeautifulSoup
import requests
url = "https://webtoon-tr.com/webtoon/"
html = requests.get(url).content
soup = BeautifulSoup(html,"html.parser")
last = soup.find_all("a",{"class":"last"})
print(last)
It just returns an empty list, and when I try to scrape all "a" tags it only returns two, which are completely different things.
Can somebody help me with this? I'd really appreciate it.
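Before reaching for a browser, it is worth inspecting what the server actually returns; a diagnostic sketch using plain requests:

import requests

r = requests.get("https://webtoon-tr.com/webtoon/")
print(r.status_code)   # Cloudflare challenges often answer 403 or 503
print(len(r.content))  # a tiny body usually means a challenge page
print(r.text[:300])    # peek at the returned markup

If this shows a challenge page instead of the real site, no choice of parser will help, which is what the answers below run into.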
Try using the requests_html library:
from bs4 import BeautifulSoup
import requests_html
url = "https://webtoon-tr.com/webtoon/"
s = requests_html.HTMLSession()
html = s.get(url)
soup = BeautifulSoup(html.content, "lxml")
last = soup.find_all("a", {"class": "last"})
print(last)
[<a aria-label="Last Page" class="last" href="https://webtoon-tr.com/webtoon/page/122/">Son »</a>]
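To get just the URL out of that result, index into the list and read the attribute:

# the href attribute of the first (and only) match holds the last-page URL
print(last[0]["href"])  # https://webtoon-tr.com/webtoon/page/122/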
The website is protected by Cloudflare. requests, cloudscraper, and requests_html don't work for me, only Selenium:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
browser.get("https://webtoon-tr.com/webtoon/")
soup = BeautifulSoup(browser.page_source, 'html5lib')
browser.quit()
link = soup.select_one('a.last')
print(link)
This returns
<a aria-label="Last Page" class="last" href="https://webtoon-tr.com/webtoon/page/122/">Son »</a>
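If plain Selenium ever stops getting through the protection, undetected-chromedriver is a commonly used alternative; a sketch, assuming the package is installed and can find a local Chrome (untested against this particular site):

import undetected_chromedriver as uc
from bs4 import BeautifulSoup

driver = uc.Chrome()  # patched ChromeDriver that hides common automation fingerprints
driver.get("https://webtoon-tr.com/webtoon/")
soup = BeautifulSoup(driver.page_source, "html5lib")
driver.quit()
print(soup.select_one("a.last"))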
The images I am trying to get are inside an <img> tag, and I want the 'srcset' images.
I found this code but it doesn't seem to work.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from time import sleep
from cookie_accepter3 import load_and_accept_cookies
driver = webdriver.Safari()
def getdata(url):
    r = requests.get(url)
    return r.text
URL = 'https://www.autotrader.co.uk/car-details/202205215960809?sort=relevance&advertising-location=at_cars&radius=1501&make=SEAT&postcode=cv326ja&model=Ibiza&onesearchad=New&onesearchad=Nearly%20New&onesearchad=Used&include-delivery-option=on&page=1'
driver.get(URL)
sleep(3)
load_and_accept_cookies(URL, driver)
htmldata = getdata("https://www.autotrader.co.uk/car-details/202205215960809?sort=relevance&advertising-location=at_cars&radius=1501&make=SEAT&postcode=cv326ja&model=Ibiza&onesearchad=New&onesearchad=Nearly%20New&onesearchad=Used&include-delivery-option=on&page=1")
soup = BeautifulSoup(htmldata, 'html.parser')
for item in soup.find_all('img'):
    print(item['src'])
Any help would be greatly appreciated, thank you.
There is only one img element on this webpage; you can easily select it with a Selenium CSS selector, so you don't need bs4.
Working code -
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
options = webdriver.ChromeOptions()
# options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)
URL = "https://www.autotrader.co.uk/car-details/202205215960809?sort=relevance&advertising-location=at_cars&radius=1501&make=SEAT&postcode=cv326ja&model=Ibiza&onesearchad=New&onesearchad=Nearly%20New&onesearchad=Used&include-delivery-option=on&page="
with chrome_driver as driver:
    driver.implicitly_wait(15)
    driver.get(URL)
    time.sleep(3)
    img_url = driver.find_element(By.CSS_SELECTOR, "img").get_attribute("srcset")
    time.sleep(0.3)
    print(img_url)
Output -
https://m.atcdn.co.uk/a/media/w300h225/afdc5f1656624e178f8af72d7632b92d.jpg 320w, https://m.atcdn.co.uk/a/media/w480h360/afdc5f1656624e178f8af72d7632b92d.jpg 480w, https://m.atcdn.co.uk/a/media/w600h450/afdc5f1656624e178f8af72d7632b92d.jpg 600w, https://m.atcdn.co.uk/a/media/w720h540/afdc5f1656624e178f8af72d7632b92d.jpg 720w, https://m.atcdn.co.uk/a/media/w800h600/afdc5f1656624e178f8af72d7632b92d.jpg 800w
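If a single URL is wanted rather than the whole srcset string, the candidates are comma-separated "URL width" pairs and can be split with plain string handling; a minimal sketch that keeps the widest one:

# parse srcset into (url, width) pairs and pick the largest width
candidates = []
for part in img_url.split(","):
    url_part, width_part = part.strip().rsplit(" ", 1)
    candidates.append((url_part, int(width_part.rstrip("w"))))
print(max(candidates, key=lambda c: c[1])[0])  # the 800w variant, given the output above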
I am trying to scrape Google Hot Trends. I ran Chrome developer tools to capture all requests, but it seems there are no requests in or out. So I tried to use Selenium, but I could not get the data, for several reasons: the data is variable and changes constantly. Here is the code I tried:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup
options = options.Options()
options.headless = True
options.add_argument("--headless")
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"
def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        driver.get(target)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("tile"):
            titles = [title for title in soup.find("div", class_="tile")]
            if len(titles) > 0:
                print(titles)
HeadlessBrowserHttpRequest(url)
Your code looks correct.
The only point you are missing is that you have to extract the text from the web elements you get here.
Also, I'd prefer printing the texts one by one, not the whole array at once.
As follows:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup
options = options.Options()
options.headless = True
options.add_argument("--headless")
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"
def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        driver.get(target)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("tile"):
            titles = [title for title in soup.find("div", class_="tile")]
            if len(titles) > 0:
                for title in titles:
                    print(title.text)
HeadlessBrowserHttpRequest(url)
I managed to solve the problem with the following code:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup
options = options.Options()
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"
def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    driver.get(target)
    while True:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("div", {"class": "card current done-typing"}):
            titles = [title for title in soup.find("div", class_="card current done-typing")]
            if len(titles) > 0:
                for title in titles:
                    print(title.text)
HeadlessBrowserHttpRequest(url)
I'm trying to scrape the price of a flight from the Google Flights website using Selenium, but the element does not show up anywhere, not even when scraping the whole page. I've read that it might be because it is in a different frame, but how would I know which frame it is in?
Here is the website: https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o
The price I'm looking for is: 32 €
And here is my code:
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
d = webdriver.Chrome('/Users/davidgarciaballester/Desktop/chromedriver', options=chrome_options)
url='https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o'
d.get(url)
precios = soup(d.page_source, 'html.parser').findAll('jsl',{'jstcache':'9322'})
print(precios)
d.quit()
Am I missing something? Thanks in advance.
EDIT 1: the jstcache value changed to 9322.
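On the frame question itself: you can enumerate a page's iframes and switch into one before searching, using standard Selenium calls; a sketch reusing the d driver from above:

from selenium.webdriver.common.by import By

frames = d.find_elements(By.TAG_NAME, "iframe")
print(len(frames))                 # how many frames the page embeds
if frames:
    d.switch_to.frame(frames[0])   # search inside the first frame
    # ... look for the element here ...
    d.switch_to.default_content()  # switch back to the main document

As the answers below suggest, the price here turns out to be reachable from the main document, so no frame switch is needed.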
You can use the following CSS selector combination:
from selenium import webdriver
d = webdriver.Chrome()
d.get("https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o")
item = d.execute_script("return document.querySelector('.flt-subhead1.gws-flights-results__price.gws-flights-results__cheapest-price span + jsl')")
print(item.text)
d.quit()
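The same lookup also works through Selenium's element API instead of execute_script; a sketch using a By-style locator:

from selenium.webdriver.common.by import By

item = d.find_element(By.CSS_SELECTOR,
    ".flt-subhead1.gws-flights-results__price"
    ".gws-flights-results__cheapest-price span + jsl")
print(item.text)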
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
d = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
url='https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o'
d.get(url)
page = soup(d.page_source, 'html.parser')
precios = page.findAll('jsl',{'jstcache':'9322'})
print(precios)
d.quit()
This worked for me:
print(precios[0].text)
which gave me €32.
OK, I figured out what was going on: I wasn't giving the driver enough time to load the page. I fixed this by waiting a few seconds after loading the page.
Working code:
from bs4 import BeautifulSoup as soup
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
d = webdriver.Chrome('C:/Users/David/Desktop/chromedriver.exe')
url='https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o'
d.get(url)
time.sleep(5)
page = soup(d.page_source, 'html.parser')
precios = page.findAll('jsl',{'jstcache':'9322'})
print(precios)
d.quit()
EDIT 1: As Idlehands pointed out, the jstcache number is probably dynamic and changes over time, so this approach was not well thought out. Instead I'm now using the CSS selector combination QHarr suggested. Working code:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ 0123456789')
chrome_options = Options()
chrome_options.add_argument("--headless")
d = webdriver.Chrome('C:/Users/David/Desktop/chromedriver.exe', options=chrome_options)
url='https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o'
d.get(url)
time.sleep(2)
precio = d.execute_script("return document.querySelector('.flt-subhead1.gws-flights-results__price.gws-flights-results__cheapest-price span + jsl')").text
precio = ''.join(filter(whitelist.__contains__, precio))
print(precio)
d.quit()
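Since the whitelist filter also strips the € sign, a regular expression is a more direct way to pull just the number out of the element text; a sketch that would replace the whitelist filtering above:

import re

# extract the first run of digits from the raw price text
m = re.search(r"\d+", precio)
print(m.group() if m else "no price found")  # 32

(Here precio would be the unfiltered .text value from the execute_script call.)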