I'm trying to scrape the price of a flight from the Google Flights website using Selenium, but the element doesn't show up anywhere, not even when I scrape the whole page. I've read that it might be because the element lives in a different frame, but how would I know which frame it is in?
Here is the website: https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o
The price I'm looking for is: 32 €
And here is my code:
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
d = webdriver.Chrome('/Users/davidgarciaballester/Desktop/chromedriver', options=chrome_options)
url='https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o'
d.get(url)
precios = soup(d.page_source, 'html.parser').findAll('jsl',{'jstcache':'9322'})
print(precios)
d.quit()
Am I missing something? Thanks in advance.
EDIT 1: the jstcache value has changed to 9322.
You can use the following CSS selector combination:
from selenium import webdriver
d = webdriver.Chrome()
d.get("https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o")
item = d.execute_script("return document.querySelector('.flt-subhead1.gws-flights-results__price.gws-flights-results__cheapest-price span + jsl')")
print(item.text)
d.quit()
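As a side note, the same selector combination should also work through Selenium's own element lookup instead of execute_script; a minimal sketch of that variant, which would replace the execute_script line above:
from selenium.webdriver.common.by import By
# same CSS combination, resolved by Selenium rather than document.querySelector
item = d.find_element(By.CSS_SELECTOR, ".flt-subhead1.gws-flights-results__price.gws-flights-results__cheapest-price span + jsl")
print(item.text)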
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
d = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
url='https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o'
d.get(url)
page = soup(d.page_source, 'html.parser')
precios = page.findAll('jsl',{'jstcache':'9322'})
print(precios)
d.quit()
This worked for me:
print(precios[0].text)
which gave me €32.
OK, I figured out what was going on: I wasn't giving the driver enough time to load the page. I fixed this by waiting a few seconds after loading the page.
Working code:
from bs4 import BeautifulSoup as soup
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
d = webdriver.Chrome('C:/Users/David/Desktop/chromedriver.exe')
url='https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o'
d.get(url)
time.sleep(5)
page = soup(d.page_source, 'html.parser')
precios = page.findAll('jsl',{'jstcache':'9322'})
print(precios)
d.quit()
EDIT 1: As Idlehands pointed out, the jstcache number is probably dynamic and changes over time, so this approach was not well thought out. Instead I'm now using the CSS selector combination QHarr suggested. Working code:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ 0123456789')
chrome_options = Options()
chrome_options.add_argument("--headless")
d = webdriver.Chrome('C:/Users/David/Desktop/chromedriver.exe', options=chrome_options)
url='https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o'
d.get(url)
time.sleep(2)
precio = d.execute_script("return document.querySelector('.flt-subhead1.gws-flights-results__price.gws-flights-results__cheapest-price span + jsl')").text
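# keep only whitelisted characters, which strips the € sign and leaves just the number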
precio = ''.join(filter(whitelist.__contains__, precio))
print(precio)
d.quit()
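A fixed time.sleep can still fail on a slow load; an explicit wait is usually more reliable because it polls until the element exists. A minimal sketch of that alternative, using the same selector (it would replace the time.sleep(2) above; timeout chosen arbitrarily):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# wait up to 10 seconds for the price element instead of sleeping a fixed time
precio_elem = WebDriverWait(d, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR,
        '.flt-subhead1.gws-flights-results__price.gws-flights-results__cheapest-price span + jsl')))
print(precio_elem.text)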
Related
I am scraping the real estate portal <www.immobiliare.it>.
Specifically, I am retrieving some information from the search page, which contains 25 properties per page. I have managed to retrieve almost everything, but I am having trouble retrieving the src of the map image that each property has. This map comes after a CSS selector.
The HTML structure is the following:
I have been able to get this data with selenium:
https://stackoverflow.com/a/75020969/14461986
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options, service=Service(ChromeDriverManager().install()))
url = 'https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=dataModifica&ordine=desc&page=3'
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
data = []
# Each property is contained in an li.in-realEstateResults__item
for property in soup.select('li.in-realEstateResults__item'):
    data.append({
        'id': property.get('id'),
        'MapUrl': property.select_one('[alt="mappa"]').get('src') if property.select_one('[alt="mappa"]') else None
    })
print(data)
However, after the 4th image the MapUrl comes back empty. The properties are correctly loaded, as I have checked the ids, and the HTML for the rest of the images is the same, but for a reason I do not understand the MapUrl is not retrieved. I would also welcome any advice on how to make this script more efficient.
However, the issue here is lazy loading, so you have to interact with the website and scroll down to force the content to load.
You may have to accept / close some popups (optional):
driver.find_element(By.CSS_SELECTOR,'#didomi-notice-agree-button').click()
driver.find_element(By.CSS_SELECTOR,'.nd-dialogFrame__close').click()
driver.find_element(By.CSS_SELECTOR,'section h1').click()
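Since these popups do not always appear, it may be safer to wrap each click in a try/except so a missing element does not abort the run; a small sketch of that idea:
from selenium.common.exceptions import NoSuchElementException
# each popup is optional, so skip it if the element is not there
for sel in ('#didomi-notice-agree-button', '.nd-dialogFrame__close', 'section h1'):
    try:
        driver.find_element(By.CSS_SELECTOR, sel).click()
    except NoSuchElementException:
        pass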
Now we can start scrolling (a simple but working solution that could be improved):
for i in range(30):
    driver.find_element(By.CSS_SELECTOR,'body').send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)
Example
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=dataModifica&ordine=desc'
driver.get(url)

# accept / close popups
driver.find_element(By.CSS_SELECTOR,'#didomi-notice-agree-button').click()
driver.find_element(By.CSS_SELECTOR,'.nd-dialogFrame__close').click()
driver.find_element(By.CSS_SELECTOR,'section h1').click()

# scroll down to force the lazy loading
for i in range(30):
    driver.find_element(By.CSS_SELECTOR,'body').send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)

soup = BeautifulSoup(driver.page_source, 'html.parser')
data = []
for e in soup.select('li.in-realEstateResults__item'):
    data.append({
        'title': e.a.get('title'),
        'imgUrls': [i.get('src') for i in e.select('.nd-list__item img')],
        'imgMapInfo': e.select_one('[alt="mappa"]').get('src') if e.select_one('[alt="mappa"]') else None
    })
data
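The fixed 30 PAGE_DOWN presses are the "simple but working" part mentioned above. One possible refinement, sketched here but not tested against the live page, is to stop scrolling as soon as every property's map image has received its src:
# step down the page until every card's map image has a src, with an upper bound as a safety net
for _ in range(60):
    driver.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)
    maps = driver.find_elements(By.CSS_SELECTOR, 'li.in-realEstateResults__item [alt="mappa"]')
    if maps and all(m.get_attribute('src') for m in maps):
        break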
I am a new joiner and doing self-study on crawling. I tried to get the ticket information from Disneyland:
https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j
I tried to crawl the price from the website, but it returns "None"; the result should be HK$639.
import requests
from bs4 import BeautifulSoup

url5 = 'https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j'
r = requests.get(url5)
sp = BeautifulSoup(r.content, 'html.parser')
door = sp.find('div', class_='container')
price = door.find('p', class_='price')
print(price)
My understanding of BeautifulSoup: it parses the website into HTML, and I can use find/find_all to locate information by tag ('div', 'p') and by class. Please correct me if this is wrong, thank you.
The page is loaded by JavaScript, so to pull out the desired data you can use Selenium together with bs4:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--no-sandbox")
options.add_argument("start-maximized")
options.add_experimental_option("detach", True)
webdriver_service = Service("./chromedriver") #Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service,options=options)
url = 'https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j'
driver.get(url)
#driver.maximize_window()
time.sleep(10)
soup=BeautifulSoup(driver.page_source, 'lxml')
price = soup.select_one('p:-soup-contains("General Admission:") > strong').text
print(price)
Output:
HK$639
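To confirm that the price really is injected by JavaScript, and therefore invisible to plain requests, you can compare the static HTML with the browser-rendered DOM; a quick hedged check along these lines:
import requests

url = 'https://www.hongkongdisneyland.com/book/general-tickets/1day-tickets-j'
static_html = requests.get(url).text
# the price string should only appear in the rendered DOM, not in the raw HTML response
print('HK$639' in static_html)         # expected False
print('HK$639' in driver.page_source)  # expected True, after the sleep above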
The images I am trying to get are inside an <img> tag, and I want the 'srcset' images.
I have found this code here but it doesn't seem to work.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from time import sleep
from cookie_accepter3 import load_and_accept_cookies
driver = webdriver.Safari()
def getdata(url):
    r = requests.get(url)
    return r.text
URL = 'https://www.autotrader.co.uk/car-details/202205215960809?sort=relevance&advertising-location=at_cars&radius=1501&make=SEAT&postcode=cv326ja&model=Ibiza&onesearchad=New&onesearchad=Nearly%20New&onesearchad=Used&include-delivery-option=on&page=1'
driver.get(URL)
sleep(3)
load_and_accept_cookies(URL, driver)
htmldata = getdata("https://www.autotrader.co.uk/car-details/202205215960809?sort=relevance&advertising-location=at_cars&radius=1501&make=SEAT&postcode=cv326ja&model=Ibiza&onesearchad=New&onesearchad=Nearly%20New&onesearchad=Used&include-delivery-option=on&page=1")
soup = BeautifulSoup(htmldata, 'html.parser')
for item in soup.find_all('img'):
    print(item['src'])
Any help would be greatly appreciated, thank you.
There is only one img element on this webpage, so you can easily select it with a Selenium CSS selector; you don't need bs4 for this.
Working code -
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
options = webdriver.ChromeOptions()
# options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
chrome_driver = webdriver.Chrome(
service=Service(ChromeDriverManager().install()),
options=options
)
URL = "https://www.autotrader.co.uk/car-details/202205215960809?sort=relevance&advertising-location=at_cars&radius=1501&make=SEAT&postcode=cv326ja&model=Ibiza&onesearchad=New&onesearchad=Nearly%20New&onesearchad=Used&include-delivery-option=on&page="
with chrome_driver as driver:
    driver.implicitly_wait(15)
    driver.get(URL)
    time.sleep(3)
    img_url = driver.find_element(By.CSS_SELECTOR, "img").get_attribute("srcset")
    time.sleep(0.3)
    print(img_url)
Output -
https://m.atcdn.co.uk/a/media/w300h225/afdc5f1656624e178f8af72d7632b92d.jpg 320w, https://m.atcdn.co.uk/a/media/w480h360/afdc5f1656624e178f8af72d7632b92d.jpg 480w, https://m.atcdn.co.uk/a/media/w600h450/afdc5f1656624e178f8af72d7632b92d.jpg 600w, https://m.atcdn.co.uk/a/media/w720h540/afdc5f1656624e178f8af72d7632b92d.jpg 720w, https://m.atcdn.co.uk/a/media/w800h600/afdc5f1656624e178f8af72d7632b92d.jpg 800w
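If you then need a single URL out of that srcset string (for example the widest variant), note that it is just a comma-separated list of "url widthw" pairs; a small parsing sketch:
# split the srcset into (width, url) pairs and keep the widest image
candidates = []
for part in img_url.split(','):
    src, width = part.strip().rsplit(' ', 1)
    candidates.append((int(width.rstrip('w')), src))
print(max(candidates)[1])  # e.g. the w800h600 variant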
I am working on a project analyzing the Supercluster Astronaut Database. I posted a StackOverflow question a few weeks ago about scraping the data from the website and got the code below from one of the helpful posters.
My only remaining issue with this process is that when I run the code, a browser window pops open pointed at the data source I am trying to scrape. I've tried tinkering with the code to stop the browser window from popping up by commenting out a few lines here and there, but nothing I've tried seems to work properly. Can someone help point me in the right direction to modify the code below so that no browser window pops up?
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
data = []
url = 'https://www.supercluster.com/astronauts?ascending=false&limit=300&list=true&sort=launch%20order'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
time.sleep(5)
driver.get(url)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.close()
tags = soup.select('.astronaut_cell.x')
for item in tags:
    name = item.select_one('.bau.astronaut_cell__title.bold.mr05').get_text()
    #print(name.text)
    country = item.select_one('.mouseover__contents.rel.py05.px075.bau.caps.small.ac')
    if country:
        country = country.get_text()
    #print(country)
    data.append([name, country])
cols=['name','country']
df = pd.DataFrame(data,columns=cols)
print(df)
I think you're looking to run your code in headless mode. You can add the --headless argument via the Options class to achieve this.
Code :-
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
data = []
url = 'https://www.supercluster.com/astronauts?ascending=false&limit=300&list=true&sort=launch%20order'
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(),options=options)
driver.maximize_window()
driver.get(url)
time.sleep(10)
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.close()
tags = soup.select('.astronaut_cell.x')
for item in tags:
    name = item.select_one('.bau.astronaut_cell__title.bold.mr05').get_text()
    #print(name.text)
    country = item.select_one('.mouseover__contents.rel.py05.px075.bau.caps.small.ac')
    if country:
        country = country.get_text()
    #print(country)
    data.append([name, country])
cols=['name','country']
df = pd.DataFrame(data,columns=cols)
print(df)
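One thing to note about headless mode: driver.maximize_window() may have no visible effect because there is no real window, so if the page layout matters you can set an explicit viewport size instead; a small, optional tweak to the options above:
options = Options()
options.add_argument("--headless")
# in headless mode there is no window to maximize, so fix the viewport size explicitly
options.add_argument("--window-size=1920,1080")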
I am trying to web scrape AliExpress using Selenium and Python. I'm following a YouTube tutorial and have followed every step, but I just can't seem to get it to work.
I tried requests and BeautifulSoup as well, but it seems like AliExpress uses lazy loading on its product listings. I tried using the window-scroll script, but that didn't work; the content doesn't seem to load until I scroll over it myself.
This is the URL of the page I would like to scrape:
https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=dog+supplies&ltype=wholesale&SortType=default&g=n
This is the code I have currently. It doesn't return anything in the output; I think that's because it tries to go through all the product listings but can't find any, since they haven't loaded...
Any suggestions/help would be greatly appreciated, sorry for the bad formatting and the bad code in advance.
Thank you!
"""
To do
HOT PRODUCT FINDER Enter: Keyword, to generate a url
Product Name
Product Image
Product Link
Sales Number
Price
Create an excel file that contains these data
Sort the list by top selling orders
Develop an algorithm for the velocity of the product (total sales increased / time?)
Scrape site every day """
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import requests
import lxml
#Starting Up the web driver
driver = webdriver.Chrome()
# grab Keywords
search_term = input('Keywords: ')
# url generator
def get_url(search_term):
    """Generate a url link using the search term provided"""
    url_template = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText={}&ltype=wholesale&SortType=default&g=n'
    search_term = search_term.replace(" ", "+")
    return url_template.format(search_term)
url = get_url(search_term)
driver.get(url)
#scrolling down to the end of the page
time.sleep(2)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
#Extracting the Collection
r = requests.get(url)
soup = BeautifulSoup(r.content,'lxml')
productlist = soup.find_all('div', class_='list product-card')
print(productlist)
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import requests
import lxml
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("disable-infobars")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
driver = webdriver.Chrome(executable_path = 'chromedriver.exe',options = chrome_options)
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
# grab Keywords
search_term = input('Keywords: ')
# url generator
driver.get('https://www.aliexpress.com')
driver.implicitly_wait(10)
p = driver.find_element_by_name('SearchText')
p.send_keys(search_term)
p.send_keys(Keys.ENTER)
productlist = []
product = driver.find_element_by_xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]/ul')
height = driver.execute_script("return document.body.scrollHeight")
for scrol in range(100, height - 1800, 100):
    driver.execute_script(f"window.scrollTo(0,{scrol})")
    time.sleep(0.5)
# driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
div = []
list_i = []
item_title = []
a = []
for z in range(1, 16):
    div.append(product.find_element_by_xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]/ul/div' + str([z])))
for pr in div:
    list_i.append(pr.find_elements_by_class_name('list-item'))
for pc in list_i:
    for p in pc:
        item_title.append(p.find_element_by_class_name('item-title-wrap'))
for pt in item_title:
    a.append(pt.find_element_by_tag_name('a'))
for prt in a:
    productlist.append(prt.text)
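Since WebDriverWait is already imported above, the fixed sleeps and numbered XPaths could in principle be replaced by an explicit wait on the result list; a rough sketch of that idea (the selectors are assumptions and may need adjusting to the live page):
# wait for the result list to be present, then collect the item links inside it
wait = WebDriverWait(driver, 15)
product_list = wait.until(EC.presence_of_element_located(
    (By.XPATH, '//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]/ul')))
productlist = [a.text for a in product_list.find_elements(By.CSS_SELECTOR, '.item-title-wrap a')]
print(productlist)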