How to collect all hrefs using XPath? Selenium - Python

I'm trying to collect all five of the social media links from the artist in this example. Currently, my output is only the LAST (fifth) social media link. I'm using Selenium; I understand this may not be the best option for collecting this data, but it's all I know at this time.
Note, I've only included relevant code for my question. Thank you in advance for any help/insight.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
from random import randint
import pandas as pd

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('disable-infobars')
chrome_options.add_argument('--disable-extensions')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])

driver = webdriver.Chrome(chrome_options=chrome_options)

for url in urls:
    driver.get('https://soundcloud.com/flux-pavilion')
    time.sleep(randint(3, 4))
    try:
        links = driver.find_elements_by_xpath('//*[@id="content"]/div/div[4]/div[2]/div/article[1]/div[2]/ul/li//a[@href]')
        for elem in links:
            socialmedia = (elem.get_attribute("href"))
    except:
        links = "none"
    artist = {
        'socialmedia': socialmedia,
    }
    print(artist)

The problem is not with your XPath expression but with the (non-existent) list processing in your output code.
Your code keeps only the last item of the resulting list: each loop iteration overwrites socialmedia, so you only ever see the final link.
So change the output part of your code to:
[...]
driver.get("https://soundcloud.com/flux-pavilion")
time.sleep(randint(3, 4))
artist = []
try:
    links = driver.find_elements_by_xpath('//*[@id="content"]/div/div[4]/div[2]/div/article[1]/div[2]/ul/li//a[@href]')
    for elem in links:
        artist.append(elem.get_attribute("href"))
except:
    links = "none"
for link in artist:
    print(link)
And the output will contain all of the values (links) you desire:
https://gate.sc/?url=https%3A%2F%2Ftwitter.com%2FFluxpavilion&token=da4a8d-1-1653430570528
https://gate.sc/?url=https%3A%2F%2Finstagram.com%2FFluxpavilion&token=277ea0-1-1653430570529
https://gate.sc/?url=https%3A%2F%2Ffacebook.com%2FFluxpavilion&token=4c773c-1-1653430570530
https://gate.sc/?url=https%3A%2F%2Fyoutube.com%2FFluxpavilion&token=1353f7-1-1653430570531
https://gate.sc/?url=https%3A%2F%2Fopen.spotify.com%2Fartist%2F7muzHifhMdnfN1xncRLOqk%3Fsi%3DbK9XeoW5RxyMlA-W9uVwPw&token=bc2936-1-1653430570532
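Side note: recent Selenium releases drop the find_elements_by_* helpers, so on Selenium 4 the same lookup is written with By.XPATH. A minimal sketch of the list-building part in that style (same XPath as above):

from selenium.webdriver.common.by import By

# Selenium 4 style: find_elements(By.XPATH, ...) replaces find_elements_by_xpath(...)
links = driver.find_elements(By.XPATH, '//*[@id="content"]/div/div[4]/div[2]/div/article[1]/div[2]/ul/li//a[@href]')
artist = [elem.get_attribute("href") for elem in links]
for link in artist:
    print(link)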

Related

Selenium not able to find all elements in HTML page

I am scraping the real estate portal www.immobiliare.it.
Specifically, I am retrieving some information from the search page, which contains 25 properties per page. I have managed to retrieve almost everything, but I am having trouble retrieving the src of the map image that each property has; the map can be located with a CSS selector.
The HTML structure is the following:
I have been able to get this data with selenium:
https://stackoverflow.com/a/75020969/14461986
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options, service=Service(ChromeDriverManager().install()))
url = 'https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=dataModifica&ordine=desc&page=3'
driver.get(url)
soup = BeautifulSoup(driver.page_source)
data = []
# Each property is contained under each li in-realEstateResults__item
for property in soup.select('li.in-realEstateResults__item'):
    data.append({
        'id': property.get('id'),
        'MapUrl': property.select_one('[alt="mappa"]').get('src') if property.select_one('[alt="mappa"]') else None
    })
print(data)
However, after the 4th image the MapUrl comes back empty. The properties are correctly loaded, as I have checked the ids, and the HTML for the rest of the images is the same, but for a reason I do not understand the MapUrl is not retrieved. I would also welcome any advice on how to make this script more efficient.
The issue here is lazy loading, so you have to interact with the website and scroll down to force the content to load.
You may have to accept / close some popups (optional):
driver.find_element(By.CSS_SELECTOR,'#didomi-notice-agree-button').click()
driver.find_element(By.CSS_SELECTOR,'.nd-dialogFrame__close').click()
driver.find_element(By.CSS_SELECTOR,'section h1').click()
Now we can start scrolling (a simple but working solution that could be improved):
for i in range(30):
    driver.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)
Example
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=dataModifica&ordine=desc'
driver.get(url)
driver.find_element(By.CSS_SELECTOR,'#didomi-notice-agree-button').click()
driver.find_element(By.CSS_SELECTOR,'.nd-dialogFrame__close').click()
driver.find_element(By.CSS_SELECTOR,'section h1').click()
for i in range(30):
    driver.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)
soup = BeautifulSoup(driver.page_source)
data = []
for e in soup.select('li.in-realEstateResults__item'):
    data.append({
        'title': e.a.get('title'),
        'imgUrls': [i.get('src') for i in e.select('.nd-list__item img')],
        'imgMapInfo': e.select_one('[alt="mappa"]').get('src') if e.select_one('[alt="mappa"]') else None
    })
print(data)
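The fixed 30x PAGE_DOWN loop works, but as noted it could be improved; one common refinement (just a sketch, not part of the original answer) is to scroll until the document height stops growing, so you only wait as long as the lazy loading actually needs:

import time

# Scroll until the page height stops changing, i.e. lazy loading has finished.
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)  # give lazy-loaded images a moment to appear
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height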

Retrieving specific matches from a list in python

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from time import sleep
from datetime import datetime
import pandas as pd
import warnings
import os
os.chdir('C:/Users/paulc/Documents/Medium Football')
warnings.filterwarnings('ignore')
base_url = 'https://www.sportingindex.com/spread-betting/football/international-world-cup'
option = Options()
option.headless = False
driver = webdriver.Chrome("C:/Users/paulc/Documents/Medium Football/chromedriver.exe",options=option)
driver.get(base_url)
links = [elem.get_attribute("href") for elem in driver.find_elements(By.TAG_NAME,"a")]
This code retrieves all the href links on the page. I want to search the links list and return only the matches that contain 'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a'.
However, I get AttributeError: 'NoneType' object has no attribute 'startswith'
when using:
import re
[x for x in links if x.startswith('https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]
Any help is appreciated.
Instead of collecting all a elements on the page, where there will be a lot of irrelevant results, you can use a more precise locator.
So, instead of
driver.find_elements(By.TAG_NAME,"a")
Use this:
driver.find_elements(By.XPATH, "//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")
This will give you desired elements only.
And this
links = [elem.get_attribute("href") for elem in driver.find_elements(By.XPATH, "//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")]
will directly give you the wanted links only.
UPD
In case this gives you an empty list, you are possibly missing a delay. You can simply add a pause before that line, like time.sleep(2), but it's better to use WebDriverWait expected_conditions explicit waits for that.
I can't check it since my computer blocks that link due to company policy (it is a gambling site), but normally something like this should work:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
links = [elem.get_attribute("href") for elem in wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")))]
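As for the original AttributeError: some a elements on the page have no href, so get_attribute("href") returns None for them. If you keep the broad By.TAG_NAME lookup, a small guard in the comprehension avoids calling startswith on None (a sketch of that workaround):

prefix = 'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a'
links = [elem.get_attribute("href") for elem in driver.find_elements(By.TAG_NAME, "a")]
# Skip None values before calling startswith on each href.
group_a_links = [x for x in links if x is not None and x.startswith(prefix)]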
The following code filters the anchors to grab the right links:
import time
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
webdriver_service = Service("./chromedriver") #Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service)
driver.get('https://www.sportingindex.com/spread-betting/football/international-world-cup')
driver.maximize_window()
time.sleep(8)
soup = BeautifulSoup(driver.page_source,"lxml")
for u in soup.select('a[class="gatracking"]'):
    link = 'https://www.sportingindex.com' + u.get('href')
    if '-v-' in link:
        print(link)
Output:
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.dd7a995d-7478-45f8-af27-9f234d37cc76/ecuador-v-senegal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.92232207-0f1e-4bb1-bacd-1332ef6b9007/netherlands-v-qatar
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.b913620e-69c7-4606-a153-7b48589b7c94/iran-v-usa
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7a4a18fb-d4ee-4880-849f-f1afdea33cd5/wales-v-england
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.20c098b4-4e97-4fd1-97b0-f42d84424361/australia-v-denmark
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5a7476e2-8d35-4a8e-8065-b4339e79f395/tunisia-v-france
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.8a869f02-9dd0-49c5-91bd-209ee224fc2a/poland-v-argentina
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.6379b787-f246-4ba4-a896-28a97396d02f/saudi-arabia-v-mexico
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.52737cfd-da19-42dd-b15b-c16c3e8e9a86/canada-v-morocco
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.168fab1f-8360-4e87-ba84-bfbd11a4a207/croatia-v-belgium
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.9fb541f0-43a4-409c-8e54-e34a43965714/costa-rica-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7379c8a7-ab5d-4653-b487-22bf7ff8eefe/japan-v-spain
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e7e4c6be-98b7-4258-ba40-74c54a790fe1/ghana-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e4c18c81-565e-47ce-b08d-9aed62c88a5d/south-korea-v-portugal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.18f44028-e23d-48d4-970b-e75c164589bd/cameroon-v-brazil
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.526f9b1b-6d95-4f44-abce-e0a6a30acfd4/serbia-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
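The output above contains duplicates because the same fixture is linked from several places on the page. If that matters, collect the links into a list first and deduplicate while preserving order (a small post-processing sketch):

links = []
for u in soup.select('a[class="gatracking"]'):
    link = 'https://www.sportingindex.com' + u.get('href')
    if '-v-' in link:
        links.append(link)

# dict.fromkeys keeps the first occurrence of each link and preserves order
for link in dict.fromkeys(links):
    print(link)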

Get the total number of reviews of a permanently closed place in Google Maps without the API

I am learning Python as well as web scraping, and I want to get the number of reviews from Google Maps for a permanently closed restaurant, but I cannot do that. Would you please help? Thank you.
from bs4 import BeautifulSoup
url = 'https://www.google.com/maps?q=asia+halal+restaurant+aichi+japan+open+date&safe=strict&rlz=1C1GCEA_enID892ID892&sxsrf=ALeKk01NqaBLM8bXeVVS6M6tv9kAy0G6qQ:1616997971678&gs_lcp=Cgdnd3Mtd2l6EAM6BwgjELADECc6BQghEKABOgQIIRAVOgcIIRAKEKABUIUIWKojYOckaABwAHgAgAHHAogB7RGSAQcxLjUuNC4ymAEAoAEBqgEHZ3dzLXdpesgBAcABAQ&uact=5&um=1&ie=UTF-8&sa=X&ved=2ahUKEwjbhef-7NTvAhWa93MBHaFHCzYQ_AUoAXoECAEQAw'
import requests
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
ps = soup.find_all(string = 'クチコミ')
ps
I also tried to find by 'class' and 'span aria-label' based on the Chrome developer tools (see below), but still cannot do it.
browser picture for html class
#ps = soup.find_all(class_='h0ySl-wcwwM-E70qVe-list')
#ps = soup.find_all('span aria-label')
#total_rev = ps.get_text()
#total_rev
Here is the code that I tried using Selenium:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
driver = webdriver.Chrome('F:/Download/SW/chromedriver_win32/chromedriver.exe')
url = 'https://www.google.com/maps/place/%E3%82%A2%E3%83%83%E3%83%90%E3%82%B7+%E3%82%B9%E3%82%A4%E3%83%BC%E3%83%84/@35.0903185,136.8551766,17z/data=!3m1!4b1!4m5!3m4!1s0x600378381c4bb1f7:0x8e9d356b9ded5bcc!8m2!3d35.0903185!4d136.8573653'
driver.get(url)
I have tried to get the number of reviews using this code on a "still operating" restaurant, but when it comes to a permanently closed one I cannot get the number of reviews.
span_review = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "section-star")]'))).click()
#Find the total number of reviews
total_number_of_reviews = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[2]/div[2]/div/div[2]/div[2]').text.split(" ")[0]
total_number_of_reviews = int(total_number_of_reviews.replace(',','')) if ',' in total_number_of_reviews else int(total_number_of_reviews)
#Find scroll layout
total_reviews = driver.find_element_by_class_name("h0ySl-wcwwM-E70qVe-list")
total_reviews #= driver.get('aria-label')
total_reviews = total_reviews.get_text('aria-label')
total_reviews
total_number_of_reviews = total_reviews.text[0:]
total_number_of_reviews
Hopefully I can learn
Thanks!
I can't find your XPath in the HTML. There is no <button> with the text section-star, only <li class="section-star">.
And aria-label is not text but an attribute, so you have to use .get_attribute('aria-label').
But I found another XPath, //button[@jsaction="pane.rating.moreReviews"], and it works for me for both the permanently closed and the still operating place.
Tested on Firefox and Chrome, Linux.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
driver = webdriver.Chrome('F:/Download/SW/chromedriver_win32/chromedriver.exe')
#driver = webdriver.Chrome()
#driver = webdriver.Firefox()
all_urls = [
    # permanently closed
    'https://www.google.com/maps/place/%E3%82%A2%E3%83%83%E3%83%90%E3%82%B7+%E3%82%B9%E3%82%A4%E3%83%BC%E3%83%84/@35.0903185,136.8551766,17z/data=!3m1!4b1!4m5!3m4!1s0x600378381c4bb1f7:0x8e9d356b9ded5bcc!8m2!3d35.0903185!4d136.8573653',
    # still operating
    'https://www.google.com/maps/place/Seaside+Restaurant+Higashiyama+Garden+-+Port+Bldg./@35.0841323,136.8474088,14z/data=!3m1!5s0x6003790a61e056e7:0x7f307de064680a96!4m9!1m2!2m1!1srestaurants!3m5!1s0x600379a07cd9fcc7:0x89f84cc9f0422e30!8m2!3d35.0895485!4d136.8809243!15sCgtyZXN0YXVyYW50c1oNIgtyZXN0YXVyYW50c5IBCnJlc3RhdXJhbnQ',
]

for url in all_urls:
    driver.get(url)
    total_reviews = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//button[@jsaction="pane.rating.moreReviews"]')))
    total_reviews = total_reviews.get_attribute('aria-label')
    print(total_reviews)
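The aria-label comes back as a human-readable string rather than a bare number (something like "1,234 reviews"; the exact wording depends on the interface language, so treat that as an assumption). A small sketch to pull the integer out of it:

import re

# total_reviews holds the button's aria-label string, e.g. "1,234 reviews" (locale-dependent)
match = re.search(r'[\d,.]+', total_reviews)
if match:
    review_count = int(re.sub(r'[^\d]', '', match.group()))
    print(review_count)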

web scraping with selenium returns empty list

I have worked a little bit on web scraping before, but I have no idea about JavaScript. I want to scrape the "Company Name" and "description of the company" from https://www.ces.tech/Show-Floor/Exhibitor-Directory.aspx. I am using Selenium for scraping, but I don't want to use a browser in the background. I wrote some code here:
from selenium.webdriver.common.by import By
from selenium import webdriver
import os
op = webdriver.ChromeOptions()
op.add_argument('headless')
driver = webdriver.Chrome(options=op)
driver.get('https://www.ces.tech/Show-Floor/Exhibitor-Directory.aspx')
company = []
items = driver.find_elements(By.CLASS_NAME, "exhibitorCardModal")
for item in items:
    comp = item.find_elements(By.CLASS_NAME, "company-name")
    desc = item.find_elements(By.CLASS_NAME, "description")
    result_dict = {
        "company": comp.text,
        "description": desc.text
    }
    company.append(result_dict)
But I got an empty list. Can someone tell me what is wrong here? I also tried to use their API https://www.ces.tech/api/Exhibitors?searchTerm=&sortBy=alpha&alpha=&state=&country=&venue=&exhibitorType=&pageNo=1&pageSize=30 but got this error:
{"error":{"code":"ApiVersionUnspecified","message":"An API version is required, but was not specified."}}
You have to add a wait / delay before accessing the elements, to let the page load completely before you try to access them.
You should also use find_element instead of find_elements for the commands inside the loop:
comp=item.find_elements(By.CLASS_NAME, "company-name")
desc = item.find_elements(By.CLASS_NAME, "description")
So your code should be something like this:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
import os
import time
op = webdriver.ChromeOptions()
op.add_argument('headless')
driver = webdriver.Chrome(options=op)
wait = WebDriverWait(driver, 20)
driver.get('https://www.ces.tech/Show-Floor/Exhibitor-Directory.aspx')
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "exhibitorCardModal")))
time.sleep(0.5)
company = []
items = driver.find_elements(By.CLASS_NAME, "exhibitorCardModal")
for item in items:
    comp = item.find_element(By.CLASS_NAME, "company-name")
    desc = item.find_element(By.CLASS_NAME, "description")
    result_dict = {
        "company": comp.text,
        "description": desc.text
    }
    company.append(result_dict)
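If you want to keep the scraped results, a short follow-up turns the list of dicts into a pandas DataFrame and writes it to disk (just a sketch; the file name is arbitrary):

import pandas as pd

# company is the list of {"company": ..., "description": ...} dicts built above
df = pd.DataFrame(company)
df.to_csv("ces_exhibitors.csv", index=False)
print(df.head())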

Unable to grab certain links from dynamic content

I've written a script in Python in combination with Selenium to scrape the links of the different properties located in the area to the right of the map on its landing page.
Link to the landing page
When I click on each block manually in Chrome, I see links containing this /for_sale/ portion in a new tab, whereas what my script fetches contains /homedetails/.
How can I get the number of results (such as 153 homes for sale) along with the right links to the properties?
My try so far:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
link = "https://www.zillow.com/homes/33155_rb/"
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(link)
itemcount = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"#map-result-count-message h2")))
print(itemcount.text)
for item in wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".zsg-photo-card-overlay-link"))):
    print(item.get_attribute("href"))
driver.quit()
One of the current output:
https://www.zillow.com/homedetails/6860-SW-48th-Ter-Miami-FL-33155/44206318_zpid/
One of such expected output:
https://www.zillow.com/homes/for_sale/Miami-FL-33155/house_type/44184455_zpid/72458_rid/globalrelevanceex_sort/25.776783,-80.256072,25.695446,-80.364905_rect/12_zm/0_mmm/
While analyzing /homedetails/ and /for_sale/ links, I found that /homedetails/ link usually contains some sort of code like this:
44206318_zpid
That code acts as a unique identifier for the ad post. I extracted it and appended it to:
https://www.zillow.com/homes/for_sale/
so the final link for the ad post will be like this:
https://www.zillow.com/homes/for_sale/44206318_zpid
It's a valid link and takes you to the ad post.
Here is the final script:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
link = "https://www.zillow.com/homes/33155_rb/"
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(link)
itemcount = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"#map-result-count-message h2")))
print(itemcount.text)
for item in wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".zsg-photo-card-overlay-link"))):
    link = item.get_attribute("href")
    if "zpid" in link:
        print("https://www.zillow.com/homes/for_sale/{}".format(link.split('/')[-2]))
I hope this helps.
You can loop over the pagination divs and keep a running counter of the number of homes displayed on each page. To parse the HTML, this answer uses BeautifulSoup:
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import re, time
def home_num(_d: soup) -> int:
    return len(_d.find_all('a', {'href': re.compile('^/homedetails/')}))
d = webdriver.Chrome('/Users/jamespetullo/Downloads/chromedriver')
d.get('https://www.zillow.com/homes/33155_rb/')
homecount, _links = home_num(soup(d.page_source, 'html.parser')), []
_seen_links, _result_links = [], []
_start = [i for i in d.find_elements_by_tag_name('a') if isinstance(i.get_attribute("href"), str) and re.findall('/homes/for_sale/', i.get_attribute("href")) and i.get_attribute("href") not in _seen_links]
while _start:
    _new_start = _start[0]
    try:
        _new_start.send_keys('\n')
        time.sleep(5)
        _start = [i for i in d.find_elements_by_tag_name('a') if isinstance(i.get_attribute("href"), str) and re.findall('/homes/for_sale/', i.get_attribute("href")) and i.get_attribute("href") not in _seen_links]
    except:
        _seen_links.append(_new_start.get_attribute('href'))
        _start = [i for i in d.find_elements_by_tag_name('a') if isinstance(i.get_attribute("href"), str) and re.findall('/homes/for_sale/', i.get_attribute("href")) and i.get_attribute("href") not in _seen_links]
    else:
        _seen_links.append(_new_start.get_attribute('href'))
        _result_links.append(_new_start.get_attribute('href'))
        homecount += home_num(soup(d.page_source, 'html.parser'))
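To see what the loop collected, a quick inspection step (not part of the original answer):

# homecount accumulates the per-page /homedetails/ counts,
# _result_links holds the pagination links that were successfully followed
print("total homes counted:", homecount)
for url in _result_links:
    print(url)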
If you inspect those images on the right-hand side of the page, you will see "homedetails", not "forsale".
Just try to open the link in a new tab and observe that the actual link is "homedetails".
