I am doing web scraping to the real estate portal <www.immobiliare.it>
Specifically, I am retrieving some information from the search page, which contains 25 properties per page. I have managed to retrieve almost everything, but I am having trouble retrieving the src of a map image that each property has. This map is located after a CSS selector.
The HTML structure is the following:
I have been able to get this data with selenium:
https://stackoverflow.com/a/75020969/14461986
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
# Configure a headless Chrome session. The instance gets its own lowercase
# name so that the imported ``Options`` class is not overwritten.
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options, service=Service(ChromeDriverManager().install()))
url = 'https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=dataModifica&ordine=desc&page=3'
driver.get(url)
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(driver.page_source, 'html.parser')
data = []
# Each property is contained under each li in-realEstateResults__item
for listing in soup.select('li.in-realEstateResults__item'):
    # Look the map image up once; it is None when the listing has no such tag.
    # NOTE: the page is never scrolled here, so images that the site loads
    # lazily may be missing from the captured HTML.
    map_img = listing.select_one('[alt="mappa"]')
    data.append({
        'id': listing.get('id'),
        'MapUrl': map_img.get('src') if map_img else None,
    })
print(data)
However, after the 4th image the MapUrl comes back empty. The properties are correctly loaded, as I have checked the Ids, and the HTML for the rest of the images is the same, but for a reason I do not understand the MapUrl is not retrieved. I would also welcome any advice on how to make this script more efficient.
The issue here is lazy loading, so you have to interact with the website and scroll down to force the images to load.
You may have to accept / close some popups (optional):
# Click, in order: the consent button, the dialog's close control, and the
# page heading.
for selector in ('#didomi-notice-agree-button', '.nd-dialogFrame__close', 'section h1'):
    driver.find_element(By.CSS_SELECTOR, selector).click()
now we can start scrolling (simple but working solution, could be improved):
# Send PAGE_DOWN to the body 30 times, pausing 0.3 s after each key press.
for _ in range(30):
    page_body = driver.find_element(By.CSS_SELECTOR, 'body')
    page_body.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)
Example
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Collect title, photo URLs and map-image URL for every listing on one
# search-results page.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=dataModifica&ordine=desc'
driver.get(url)

# Accept the consent notice, close the dialog and click the page heading
# before scrolling starts.
driver.find_element(By.CSS_SELECTOR,'#didomi-notice-agree-button').click()
driver.find_element(By.CSS_SELECTOR,'.nd-dialogFrame__close').click()
driver.find_element(By.CSS_SELECTOR,'section h1').click()

# Page down step by step with a short pause after each key press, so that
# the lazily loaded images are requested before the HTML is captured.
for _ in range(30):
    driver.find_element(By.CSS_SELECTOR,'body').send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(driver.page_source, 'html.parser')

data = []
for e in soup.select('li.in-realEstateResults__item'):
    # Look the map image up once; None when the listing has no such tag.
    map_img = e.select_one('[alt="mappa"]')
    data.append({
        'title': e.a.get('title'),
        'imgUrls': [img.get('src') for img in e.select('.nd-list__item img')],
        'imgMapInfo': map_img.get('src') if map_img else None,
    })

# Print explicitly: a bare ``data`` expression only shows output in a
# notebook or REPL.
print(data)
Related
I want to get and print all the Urls of the products in one of the AliExpress pages, it worked but just the first 12 urls that show and not all the urls on the page, normally it should be 60 urls because every page in AliExpress contains 60 products. How can I extract all the 60 urls. Any help is highly appreciated.
This is my code :
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from time import sleep
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# The query string separates SearchText and ltype with "&".
driver.get("https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=smart+lock&ltype=wholesale&SortType=total_tranpro_desc")
# Read the href of every product link present in the DOM right now; nothing
# is scrolled, so only what the page has rendered so far is seen.
product_links = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/item/"]')
urls = [a.get_attribute('href') for a in product_links]
for url in urls:
    print(url)
The web page does not present all the products at once. It uses a loading strategy that renders only the currently visible products.
To grab all the products you will have to scroll the page collecting the presented products during the scrolling. The URLs should be stored in set to avoid duplicates.
I tried the following simple solution and it worked for me:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import time
options = Options()
options.add_argument("start-maximized")
# Raw string: the backslashes in the Windows path must stay literal instead
# of being read as (invalid) escape sequences.
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
# The query string separates SearchText and ltype with "&".
url = "https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=smart+lock&ltype=wholesale&SortType=total_tranpro_desc"
driver.get(url)
# A set drops the links that are seen again at the next scroll position.
urls = set()
for _ in range(5):
    # Harvest the links rendered at the current position, then scroll 600 px
    # further and give the page 2 s to render the next batch.
    block = [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR, 'a[href*="/item/"]')]
    urls.update(block)
    driver.execute_script("window.scrollBy(0, arguments[0]);", 600)
    time.sleep(2)
for url in urls:
    print(url)
The output is:
https://he.aliexpress.com/item/1005004505011076.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-33&pdp_ext_f=%7B%22sku_id%22%3A%2212000029397314035%22%7D&pdp_npi=2%40dis%21ILS%21680.47%21238.16%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000029397314035%21sea&curPageLogUid=eOPom12i1YdL
https://he.aliexpress.com/item/1005003118129589.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-3&pdp_ext_f=%7B%22sku_id%22%3A%2212000024195825216%22%7D&pdp_npi=2%40dis%21ILS%2117.75%2114.38%21%21%210.0%21%21%400b0a0ac216625452616837499e4d53%2112000024195825216%21sea&curPageLogUid=b4KiLx28AEal
https://he.aliexpress.com/item/1005004314575646.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-27&pdp_ext_f=%7B%22sku_id%22%3A%2212000028723076890%22%7D&pdp_npi=2%40dis%21ILS%21626.53%21307.0%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000028723076890%21sea&curPageLogUid=iOoGBaB9fk1i
https://he.aliexpress.com/item/4000154297870.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-4&pdp_ext_f=%7B%22sku_id%22%3A%2212000017734780587%22%7D&pdp_npi=2%40dis%21ILS%2176.86%2139.18%21%21%217.4%21%21%400b0a0ac216625452616837499e4d53%2112000017734780587%21sea&curPageLogUid=iMbrWCxxlNRK
https://he.aliexpress.com/item/1005002868522660.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-28&pdp_ext_f=%7B%22sku_id%22%3A%2212000022534558777%22%7D&pdp_npi=2%40dis%21ILS%2163.46%2138.69%21%21%2121.15%21%21%400b0a0ac216625452616837499e4d53%2112000022534558777%21sea&curPageLogUid=kzYKVU82QxVZ
https://he.aliexpress.com/item/1005004495398206.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-44&pdp_ext_f=%7B%22sku_id%22%3A%2212000029361338239%22%7D&pdp_npi=2%40dis%21ILS%21303.25%21239.56%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000029361338239%21sea&curPageLogUid=ZCNQVDFjiW24
https://he.aliexpress.com/item/1005002843575207.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-30&pdp_ext_f=%7B%22sku_id%22%3A%2212000026607998519%22%7D&pdp_npi=2%40dis%21ILS%21294.41%21232.57%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000026607998519%21sea&curPageLogUid=dKGPq84ciu6A
https://he.aliexpress.com/item/4001166182060.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-14&pdp_ext_f=%7B%22sku_id%22%3A%2210000014983376216%22%7D&pdp_npi=2%40dis%21ILS%2183.98%2167.17%21%21%213.51%21%21%400b0a0ac216625452616837499e4d53%2110000014983376216%21sea&curPageLogUid=5P64jj4BsxMb
https://he.aliexpress.com/item/1005002864846136.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-31&pdp_ext_f=%7B%22sku_id%22%3A%2212000027311675115%22%7D&pdp_npi=2%40dis%21ILS%21321.74%21254.18%21%21%2130.24%21%21%400b0a0ac216625452616837499e4d53%2112000027311675115%21sea&curPageLogUid=wIquG44FDWvS
https://he.aliexpress.com/item/1005004407267255.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-25&pdp_ext_f=%7B%22sku_id%22%3A%2212000029260939933%22%7D&pdp_npi=2%40dis%21ILS%211117.23%21893.79%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000029260939933%21sea&curPageLogUid=L9QVMI7Dkc9L
https://he.aliexpress.com/item/1005003202983418.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-13&pdp_ext_f=%7B%22sku_id%22%3A%2212000024642176626%22%7D&pdp_npi=2%40dis%21ILS%2176.75%2127.75%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000024642176626%21sea&curPageLogUid=w934YlWjZpSk
https://he.aliexpress.com/item/10000345027263.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-40&pdp_ext_f=%7B%22sku_id%22%3A%2220000000178547076%22%7D&pdp_npi=2%40dis%21ILS%2173.66%2147.88%21%21%216.31%21%21%400b0a0ac216625452616837499e4d53%2120000000178547076%21sea&curPageLogUid=1CXW2eZmYyoz
https://he.aliexpress.com/item/1005003782715703.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-41&pdp_ext_f=%7B%22sku_id%22%3A%2212000028520788363%22%7D&pdp_npi=2%40dis%21ILS%21733.13%21403.22%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000028520788363%21sea&curPageLogUid=DzR0GUWN6DqQ
https://he.aliexpress.com/item/1005003093642821.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-11&pdp_ext_f=%7B%22sku_id%22%3A%2212000025801875061%22%7D&pdp_npi=2%40dis%21ILS%21823.63%21271.78%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000025801875061%21sea&curPageLogUid=yVO5hwgoHmV1
https://he.aliexpress.com/item/4000918150378.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-23&pdp_ext_f=%7B%22sku_id%22%3A%2212000021385410206%22%7D&pdp_npi=2%40dis%21ILS%21178.69%21146.52%21%21%2186.4%21%21%400b0a0ac216625452616837499e4d53%2112000021385410206%21sea&curPageLogUid=C6dkvTMogJ62
https://he.aliexpress.com/item/1005001900549701.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-16&pdp_ext_f=%7B%22sku_id%22%3A%2212000024274381461%22%7D&pdp_npi=2%40dis%21ILS%21188.58%21105.62%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000024274381461%21sea&curPageLogUid=d1dAHgBFOsqF
https://he.aliexpress.com/item/1005003257440167.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-29&pdp_ext_f=%7B%22sku_id%22%3A%2212000029883679176%22%7D&pdp_npi=2%40dis%21ILS%21891.4%21410.06%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000029883679176%21sea&curPageLogUid=D3UOiF7Eswwj
https://he.aliexpress.com/item/33023975249.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-19&pdp_ext_f=%7B%22sku_id%22%3A%2212000026754718473%22%7D&pdp_npi=2%40dis%21ILS%21132.1%2147.57%21%21%2112.0%21%21%400b0a0ac216625452616837499e4d53%2112000026754718473%21sea&curPageLogUid=iVmMqv3Hv2dw
https://he.aliexpress.com/item/1005004105323148.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-35&pdp_ext_f=%7B%22sku_id%22%3A%2212000028159854219%22%7D&pdp_npi=2%40dis%21ILS%21497.87%21388.34%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000028159854219%21sea&curPageLogUid=ScGffoZSAzHu
https://he.aliexpress.com/item/1005003076385676.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-38&pdp_ext_f=%7B%22sku_id%22%3A%2212000023907005619%22%7D&pdp_npi=2%40dis%21ILS%2186.71%2124.94%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000023907005619%21sea&curPageLogUid=QArTgWyf3MS7
https://he.aliexpress.com/item/1005003731084162.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-2&pdp_ext_f=%7B%22sku_id%22%3A%2212000029315337088%22%7D&pdp_npi=2%40dis%21ILS%21743.72%21327.24%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000029315337088%21sea&curPageLogUid=aMsEw0Z36uSN
https://he.aliexpress.com/item/4001010240435.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-43&pdp_ext_f=%7B%22sku_id%22%3A%2210000013511866229%22%7D&pdp_npi=2%40dis%21ILS%2129.36%213.02%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2110000013511866229%21sea&curPageLogUid=z1vD7tD9W5MZ
https://he.aliexpress.com/item/4000154478484.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000021350825852%22%7D&pdp_npi=2%40dis%21ILS%2178.64%2135.39%21%21%217.4%21%21%400b0a0ac216625452616837499e4d53%2112000021350825852%21sea&curPageLogUid=yRWy6gHc4b1E
https://he.aliexpress.com/item/1005003346086758.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-32&pdp_ext_f=%7B%22sku_id%22%3A%2212000025331814334%22%7D&pdp_npi=2%40dis%21ILS%2195.31%2158.12%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000025331814334%21sea&curPageLogUid=iGvJbXPDD3cg
https://he.aliexpress.com/item/4000065467682.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-6&pdp_ext_f=%7B%22sku_id%22%3A%2210000015061976915%22%7D&pdp_npi=2%40dis%21ILS%21281.64%21191.53%21%21%21162.48%21%21%400b0a0ac216625452616837499e4d53%2110000015061976915%21sea&curPageLogUid=1qiD9mBYf6KM
https://he.aliexpress.com/item/1005002676337744.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-1&pdp_ext_f=%7B%22sku_id%22%3A%2212000021669614390%22%7D&pdp_npi=2%40dis%21ILS%21561.28%21392.91%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000021669614390%21sea&curPageLogUid=Y1Mvi9ey1Hwl
https://he.aliexpress.com/item/4000834334480.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-34&pdp_ext_f=%7B%22sku_id%22%3A%2212000028610539908%22%7D&pdp_npi=2%40dis%21ILS%21162.83%21138.42%21%21%2168.96%21%21%400b0a0ac216625452616837499e4d53%2112000028610539908%21sea&curPageLogUid=6CijCm642BQh
https://he.aliexpress.com/item/4001031327478.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-7&pdp_ext_f=%7B%22sku_id%22%3A%2212000015965032899%22%7D&pdp_npi=2%40dis%21ILS%211037.26%21383.79%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000015965032899%21sea&curPageLogUid=fLjjoL5HUrxi
https://he.aliexpress.com/item/1005004262179786.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-12&pdp_ext_f=%7B%22sku_id%22%3A%2212000028573420101%22%7D&pdp_npi=2%40dis%21ILS%21679.22%21319.24%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000028573420101%21sea&curPageLogUid=n4io2MIKAEsP
https://he.aliexpress.com/item/1005003515525914.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-0&pdp_ext_f=%7B%22sku_id%22%3A%2212000026134836061%22%7D&pdp_npi=2%40dis%21ILS%21493.37%21360.15%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000026134836061%21sea&curPageLogUid=qY8tyv2zwcdK
https://he.aliexpress.com/item/1005003230965691.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-26&pdp_ext_f=%7B%22sku_id%22%3A%2212000024764885745%22%7D&pdp_npi=2%40dis%21ILS%21188.3%21133.68%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000024764885745%21sea&curPageLogUid=CJHjTgOCDRJ3
https://he.aliexpress.com/item/4000975407287.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-10&pdp_ext_f=%7B%22sku_id%22%3A%2210000015021478939%22%7D&pdp_npi=2%40dis%21ILS%21680.62%21299.47%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2110000015021478939%21sea&curPageLogUid=WDdJT8cOU94O
https://he.aliexpress.com/item/1005003187314155.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-42&pdp_ext_f=%7B%22sku_id%22%3A%2212000024567051659%22%7D&pdp_npi=2%40dis%21ILS%21157.54%2194.54%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000024567051659%21sea&curPageLogUid=4qQvpAeygSFw
https://he.aliexpress.com/item/32954893736.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-39&pdp_ext_f=%7B%22sku_id%22%3A%2210000002371991125%22%7D&pdp_npi=2%40dis%21ILS%21303.58%21100.18%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2110000002371991125%21sea&curPageLogUid=yDxMTsGMA1m1
https://he.aliexpress.com/item/1005002548126691.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-15&pdp_ext_f=%7B%22sku_id%22%3A%2212000021089170892%22%7D&pdp_npi=2%40dis%21ILS%2171.49%2118.2%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000021089170892%21sea&curPageLogUid=xfzmTqNWeUEV
https://he.aliexpress.com/item/4000735115535.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-17&pdp_ext_f=%7B%22sku_id%22%3A%2212000027051383811%22%7D&pdp_npi=2%40dis%21ILS%21223.41%21176.51%21%21%2128.2%21%21%400b0a0ac216625452616837499e4d53%2112000027051383811%21sea&curPageLogUid=FasdXFlpMCuE
https://he.aliexpress.com/item/1005002659060326.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-20&pdp_ext_f=%7B%22sku_id%22%3A%2212000021594037009%22%7D&pdp_npi=2%40dis%21ILS%211144.03%21800.83%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000021594037009%21sea&curPageLogUid=qWxA5NkkHX7u
https://he.aliexpress.com/item/1005003050882550.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-21&pdp_ext_f=%7B%22sku_id%22%3A%2212000023509149328%22%7D&pdp_npi=2%40dis%21ILS%2178.01%2135.5%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000023509149328%21sea&curPageLogUid=eazJyQ2xYlTa
https://he.aliexpress.com/item/1005001429138105.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-5&pdp_ext_f=%7B%22sku_id%22%3A%2212000016106374836%22%7D&pdp_npi=2%40dis%21ILS%21908.29%21272.49%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000016106374836%21sea&curPageLogUid=UTqfAcwKjxt3
https://he.aliexpress.com/item/1005003575980819.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-9&pdp_ext_f=%7B%22sku_id%22%3A%2212000026346773711%22%7D&pdp_npi=2%40dis%21ILS%2156.48%2137.29%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000026346773711%21sea&curPageLogUid=t6N9K9ie8MII
https://he.aliexpress.com/item/1005004406234563.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-24&pdp_ext_f=%7B%22sku_id%22%3A%2212000029068345270%22%7D&pdp_npi=2%40dis%21ILS%21334.62%21167.31%21%21%2119.93%21%21%400b0a0ac216625452616837499e4d53%2112000029068345270%21sea&curPageLogUid=ERz9XojcigBs
https://he.aliexpress.com/item/1005003208283080.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-18&pdp_ext_f=%7B%22sku_id%22%3A%2212000024667248681%22%7D&pdp_npi=2%40dis%21ILS%21164.38%21100.25%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000024667248681%21sea&curPageLogUid=Z5qyCqcRmY48
https://he.aliexpress.com/item/1005002983524272.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-37&pdp_ext_f=%7B%22sku_id%22%3A%2212000023060725911%22%7D&pdp_npi=2%40dis%21ILS%21164.45%21146.35%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000023060725911%21sea&curPageLogUid=LPz7kSEHNr3P
https://he.aliexpress.com/item/32964505971.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-22&pdp_ext_f=%7B%22sku_id%22%3A%2212000020850078768%22%7D&pdp_npi=2%40dis%21ILS%21861.0%21266.91%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000020850078768%21sea&curPageLogUid=ehbKLbbMGsal
https://he.aliexpress.com/item/1005004243027957.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-36&pdp_ext_f=%7B%22sku_id%22%3A%2212000029030797848%22%7D&pdp_npi=2%40dis%21ILS%21723.76%21325.69%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000029030797848%21sea&curPageLogUid=SAk2CRsHhKDu
Process finished with exit code 0
I'm trying to collect all (5) of the social media links from the artist in this example. Currently, my output is only the LAST (fifth) social media link. I'm using selenium; I understand this may not be the best option for collecting this data, but it's all I know at this time.
Note, I've only included relevant code for my question. Thank you in advance for any help/insight.
from cgitb import text
from os import link
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
from random import randint
import pandas as pd
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('disable-infobars')
chrome_options.add_argument('--disable-extensions')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
# ``options=`` is the supported keyword; ``chrome_options=`` is its
# deprecated spelling.
driver = webdriver.Chrome(options=chrome_options)
# ``urls`` is defined in a part of the script that is not shown here.
for url in urls:
    driver.get("https://soundcloud.com/flux-pavilion")
    # Random 3-4 s pause after navigation before querying the page.
    time.sleep(randint(3,4))
    # Pre-bind so the dict below can be built even when no link is found.
    socialmedia = None
    try:
        links = driver.find_elements(By.XPATH, '//*[@id="content"]/div/div[4]/div[2]/div/article[1]/div[2]/ul/li//a[@href]')
        for elem in links:
            # Rebound on every pass, so only the last link's href survives
            # the loop.
            socialmedia = elem.get_attribute("href")
    except Exception:
        links = "none"
    artist = {
        'socialmedia': socialmedia,
    }
    print(artist)
The problem is not with your XPath-expression, but rather with the (non-existent) list processing of your output code.
Your code printed only the last item of the list returned by the XPath query. That is why you received only one link (the last one).
So change the output part of your code to
[...]
# driver.get() returns None, so its result is not bound to a name.
driver.get("https://soundcloud.com/flux-pavilion")
time.sleep(randint(3,4))
# Accumulate every href instead of overwriting a single variable.
artist = []
try:
    # Requires ``from selenium.webdriver.common.by import By`` in the part of
    # the script that is elided here.
    links = driver.find_elements(By.XPATH, '//*[@id="content"]/div/div[4]/div[2]/div/article[1]/div[2]/ul/li//a[@href]')
    for elem in links:
        artist.append(elem.get_attribute("href"))
except Exception:
    links = "none"
for link in artist:
    print(link)
And the output will contain all of the values(links) you desire:
driver = webdriver.Chrome(chrome_options=chrome_options)
https://gate.sc/?url=https%3A%2F%2Ftwitter.com%2FFluxpavilion&token=da4a8d-1-1653430570528
https://gate.sc/?url=https%3A%2F%2Finstagram.com%2FFluxpavilion&token=277ea0-1-1653430570529
https://gate.sc/?url=https%3A%2F%2Ffacebook.com%2FFluxpavilion&token=4c773c-1-1653430570530
https://gate.sc/?url=https%3A%2F%2Fyoutube.com%2FFluxpavilion&token=1353f7-1-1653430570531
https://gate.sc/?url=https%3A%2F%2Fopen.spotify.com%2Fartist%2F7muzHifhMdnfN1xncRLOqk%3Fsi%3DbK9XeoW5RxyMlA-W9uVwPw&token=bc2936-1-1653430570532
I am working on a project analyzing the Supercluster Astronaut Database. I posted a StackOverflow question a few weeks ago about scraping the data from the website and got the code below from one of the helpful posters.
My only remaining issue with this process is that when I load the code, a browser window pops open linked to the data source I am trying to scrape. I've tried tinkering with this code to get the browser window to not pop up by commenting out a few lines here and there, but nothing I've tried seems to work properly. Can someone help point me in the right direction to modify the code below to not have a browser pop up?
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
# Open the astronaut list in Chrome, capture the rendered HTML, and tabulate
# each astronaut's name and country.
url = 'https://www.supercluster.com/astronauts?ascending=false&limit=300&list=true&sort=launch%20order'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
time.sleep(5)
driver.get(url)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.close()

data = []
for cell in soup.select('.astronaut_cell.x'):
    name = cell.select_one('.bau.astronaut_cell__title.bold.mr05').get_text()
    # The country element may be absent; None is stored in that case.
    country_tag = cell.select_one('.mouseover__contents.rel.py05.px075.bau.caps.small.ac')
    country = country_tag.get_text() if country_tag else country_tag
    data.append([name, country])

df = pd.DataFrame(data, columns=['name', 'country'])
print(df)
I think you're looking to run your code in headless mode. You can add a headless argument in the Options() class to achieve this.
Code :-
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
data = []
url = 'https://www.supercluster.com/astronauts?ascending=false&limit=300&list=true&sort=launch%20order'
# Headless mode keeps Chrome from opening a visible window.
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(),options=options)
try:
    driver.maximize_window()
    driver.get(url)
    # Fixed 10 s pause before the rendered HTML is captured.
    time.sleep(10)
    soup = BeautifulSoup(driver.page_source, 'lxml')
finally:
    # quit() ends the whole WebDriver session, and the finally clause makes
    # sure that happens even when loading the page fails.
    driver.quit()
tags = soup.select('.astronaut_cell.x')
for item in tags:
    name = item.select_one('.bau.astronaut_cell__title.bold.mr05').get_text()
    # The country element may be absent; None is stored in that case.
    country = item.select_one('.mouseover__contents.rel.py05.px075.bau.caps.small.ac')
    if country:
        country = country.get_text()
    data.append([name, country])
cols = ['name', 'country']
df = pd.DataFrame(data, columns=cols)
print(df)
I am trying to scrape NASDAQ's website for real time stock quotes. When I use chrome developer tools, I can see the span I want to target is (for example with Alphabet as of writing this) <span class="symbol-page-header__pricing-price">$2952.77</span>. I want to extract the $2952.77. My python code is:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s)


def get_last_price(ticker):
    """Open the NASDAQ page for ``ticker`` and print the located element's ``text`` attribute."""
    page_url = f"https://www.nasdaq.com/market-activity/stocks/{ticker}"
    driver.get(page_url)
    price = driver.find_element(By.CLASS_NAME, "symbol-page-header__pricing-last-price")
    text_attr = price.get_attribute('text')
    print(text_attr)
    # p = price.get_attribute('innerHTML')


get_last_price('googl')
The above code returns 'None'. If you uncomment the line defining p and print its output, it shows that Selenium thinks the span is empty.
<span class="symbol-page-header__pricing-price"></span>
I don't understand why this is happening. My thought is that it has something to do with the fact that it's probably being rendered dynamically with Javascript, but I thought that was an advantage of Selenium say as opposed to BeautifulSoup... there shouldn't be an issue right?
If you look into the HTML DOM of NASDAQ's Coinbase Global page, your Locator Strategy selects 2 nodes, and the one you don't want is:
Solution
To print the price information you can use the following Locator Strategy:
Using XPATH and text attribute:
driver.get("https://www.nasdaq.com/market-activity/stocks/coin")
# Wait up to 20 s for the price span that actually contains a text node,
# then print its visible text. XPath selects attributes with "@".
price_span = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable(
        (By.XPATH, "//span[@class='symbol-page-header__pricing-price' and text()]")
    )
)
print(price_span.text)
Using XPATH and get_attribute("innerHTML"):
driver.get("https://www.nasdaq.com/market-activity/stocks/coin")
# Wait up to 20 s for the price span that actually contains a text node,
# then print its innerHTML. XPath selects attributes with "@".
price_span = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable(
        (By.XPATH, "//span[@class='symbol-page-header__pricing-price' and text()]")
    )
)
print(price_span.get_attribute("innerHTML"))
Console Output:
$263.91
Note : You have to add the following imports :
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
There are 2 nodes with the class symbol-page-header__pricing-price. The node that you want is under
<div class="symbol-page-header__pricing-details symbol-page-header__pricing-details--current symbol-page-header__pricing-details--decrease"></div>
So, you need to get inside this div first to ensure you scrape the right one.
Anyways, I'd recommend you to use BeautifulSoup to scrape the HTML text after you’ve finished interacting with the dynamic website with selenium. This will save your time and your memory. It has no need to keep running the browser, so, it would be better if you terminate it (i.e. driver.close()) and use BeautifulSoup to explore the static HTML text.
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s)


def get_last_price(ticker):
    """Print the current-price ``<span>`` and its text for ``ticker``.

    The page is loaded with Selenium, its HTML is captured, the browser
    window is closed, and the captured HTML is parsed with BeautifulSoup.
    Because the shared ``driver`` is closed here, the function can be called
    only once per driver.

    Raises:
        ValueError: if the price element is not found in the captured HTML.
    """
    driver.get(f"https://www.nasdaq.com/market-activity/stocks/{ticker}")
    time.sleep(1)
    # Capture the HTML while the window is still open; it can no longer be
    # read from the driver once the window has been closed.
    html = driver.page_source
    driver.close()
    soup = BeautifulSoup(html, "lxml")
    # Match the "--current" details container by class rather than by its
    # full class string, which also carries a "--decrease" modifier that is
    # presumably not always present.
    price = soup.select_one(
        'div.symbol-page-header__pricing-details--current '
        'span.symbol-page-header__pricing-price'
    )
    if price is None:
        raise ValueError(f"price element not found for ticker {ticker!r}")
    print(price)
    print(price.text)


get_last_price('googl')
Output:
>>> <span class="symbol-page-header__pricing-price">$2952.77</span>
>>> $2952.77
Hello I want to web scrape data from a site with an age verification pop-up using python 3.x and beautifulsoup. I can't get to the underlying text and images without clicking "yes" for "are you over 21". Thanks for any support.
EDIT: Thanks, with some help from a comment I see that I can use the cookies but am not sure how to manage/store/call cookies with the requests package.
So with some help from another user I am using selenium package so that it will work also in case it's a graphical overlay (I think?). Having trouble getting it to work with the gecko driver but will keep trying! Thanks for all the advice again, everyone.
EDIT 3: OK I have made progress and I can get the browser window to open, using the gecko driver!~ Unfortunately it doesn't like that link specification so I'm posting again. The link to click "yes" on the age verification is buried on that page as something called mlink...
EDIT 4: Made some progress, updated code is below. I managed to find the element in the XML code, now I just need to manage to click the link.
#
import time
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
driver = webdriver.Firefox(executable_path=r'/Users/jeff/Documents/geckodriver') # Optional argument, if not specified will search path.
# Load the page once.
url = 'https://www.shopharborside.com/oakland/#/shop/412'
driver.get(url)
# click() takes no arguments.
# NOTE(review): whether clicking the modal body confirms the age prompt is
# unverified; the confirmation control itself may need to be targeted.
driver.find_element_by_class_name('hhc_modal-body').click()
# Wait one second before reading the page.
time.sleep(1)
pagesource = driver.page_source
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(pagesource, 'html.parser')
print(soup.prettify())
Edit new: Stuck again, here is the current code. I seem to have isolated the element "mBtnYes" but I get an error when running the code :
ElementClickInterceptedException: Message: Element is not clickable at point (625,278.5500030517578) because another element obscures it
import time
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
driver = webdriver.Firefox(executable_path=r'/Users/jeff/Documents/geckodriver') # Optional argument, if not specified will search path.
# Load the page once.
url = 'https://www.shopharborside.com/oakland/#/shop/412'
driver.get(url)
# NOTE(review): this click is reported to raise
# ElementClickInterceptedException when another element obscures the button.
driver.find_element_by_id('myBtnYes').click()
# Wait one second before reading the page.
time.sleep(1)
pagesource = driver.page_source
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(pagesource, 'html.parser')
print(soup.prettify())
if your aim is to click the verification get to selenium:
ps install selenium && get geckodriver(firefox) or chromedriver(chrome)
#Mossein~King(hi i'm here to help)
import time
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

# Run Firefox headless so that no browser window is shown.
options = Options()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)
# For a visible browser window instead (geckodriver is needed for Firefox):
# driver = webdriver.Firefox()
url = 'your-url'
driver.get(url)
# Click the link that confirms the prompt, e.g. <a class='MosseinKing'>.
# XPath selects attributes with "@".
driver.find_element(By.XPATH, "//a[@class='MosseinKing']").click()
# Wait one second in case of transitions.
time.sleep(1)
pagesource = driver.page_source
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(pagesource, 'html.parser')
# The parsed page is now available as ``soup``.
print(soup.prettify())