Unable to grab certain links from dynamic content - python

I've written a script in Python with Selenium to scrape the links of the different properties listed in the right-hand area next to the map on the landing page.
Link to the landing page
When I click each block manually in Chrome, the link that opens in a new tab contains /for_sale/, whereas the links my script fetches contain /homedetails/.
How can I get the number of results (such as 153 homes for sale) along with right links to the properties?
My attempt so far:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
link = "https://www.zillow.com/homes/33155_rb/"
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(link)
itemcount = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"#map-result-count-message h2")))
print(itemcount.text)
for item in wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,".zsg-photo-card-overlay-link"))):
    print(item.get_attribute("href"))
driver.quit()
One of the current outputs:
https://www.zillow.com/homedetails/6860-SW-48th-Ter-Miami-FL-33155/44206318_zpid/
One of the expected outputs:
https://www.zillow.com/homes/for_sale/Miami-FL-33155/house_type/44184455_zpid/72458_rid/globalrelevanceex_sort/25.776783,-80.256072,25.695446,-80.364905_rect/12_zm/0_mmm/

While analyzing the /homedetails/ and /for_sale/ links, I found that a /homedetails/ link usually contains a code like this:
44206318_zpid
That code acts as a unique identifier for the listing. I extracted it and appended it to:
https://www.zillow.com/homes/for_sale/
so the final link for the listing looks like this:
https://www.zillow.com/homes/for_sale/44206318_zpid
It's a valid link and takes you to the listing.
Here is the final script:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
link = "https://www.zillow.com/homes/33155_rb/"
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(link)
itemcount = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"#map-result-count-message h2")))
print(itemcount.text)
for item in wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,".zsg-photo-card-overlay-link"))):
    link = item.get_attribute("href")
    if "zpid" in link:
        print("https://www.zillow.com/homes/for_sale/{}".format(link.split('/')[-2]))
I hope this helps.

You can loop over the pagination divs and keep a running counter of the number of homes displayed on each page. To parse the HTML, this answer uses BeautifulSoup:
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import re, time
def home_num(_d:soup) -> int:
    return len(_d.find_all('a', {'href':re.compile('^/homedetails/')}))

d = webdriver.Chrome('/Users/jamespetullo/Downloads/chromedriver')
d.get('https://www.zillow.com/homes/33155_rb/')
homecount, _links = home_num(soup(d.page_source, 'html.parser')), []
_seen_links, _result_links = [], []
_start = [i for i in d.find_elements_by_tag_name('a') if isinstance(i.get_attribute("href"), str) and re.findall('/homes/for_sale/', i.get_attribute("href")) and i.get_attribute("href") not in _seen_links]
while _start:
    _new_start = _start[0]
    try:
        _new_start.send_keys('\n')
        time.sleep(5)
        _start = [i for i in d.find_elements_by_tag_name('a') if isinstance(i.get_attribute("href"), str) and re.findall('/homes/for_sale/', i.get_attribute("href")) and i.get_attribute("href") not in _seen_links]
    except:
        _seen_links.append(_new_start.get_attribute('href'))
        _start = [i for i in d.find_elements_by_tag_name('a') if isinstance(i.get_attribute("href"), str) and re.findall('/homes/for_sale/', i.get_attribute("href")) and i.get_attribute("href") not in _seen_links]
    else:
        _seen_links.append(_new_start.get_attribute('href'))
        _result_links.append(_new_start.get_attribute('href'))
        homecount += home_num(soup(d.page_source, 'html.parser'))

If you inspect those images on the right-hand side of the page, you will see "homedetails", not "for_sale".
Just try opening a link in a new tab and you will see that the actual link is a "homedetails" one.
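If you still want the /for_sale/ form of the link, here is a minimal sketch (borrowing the zpid idea from the answer above, and assuming every /homedetails/ href carries a "<digits>_zpid" token):
import re

# Example href taken from the question's current output above.
href = "https://www.zillow.com/homedetails/6860-SW-48th-Ter-Miami-FL-33155/44206318_zpid/"
match = re.search(r"\d+_zpid", href)
if match:
    # Rebuild the /for_sale/ style link from the extracted zpid token.
    print("https://www.zillow.com/homes/for_sale/{}".format(match.group()))
    # -> https://www.zillow.com/homes/for_sale/44206318_zpid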

Related

Scraping information from Booking.com

I am trying to scrape some information from booking.com. I have already handled things like pagination, extracting the title, etc.
I am now trying to extract the number of guests.
This is my code:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()
test_url = 'https://www.booking.com/hotel/gr/diamandi-20.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaFyIAQGYAQm4ARjIAQzYAQPoAQGIAgGoAgS4ApGp7ZgGwAIB0gIkZTBjOTA2MTQtYTc0MC00YWUwLTk5ZWEtMWNiYzg3NThiNGQ12AIE4AIB&sid=47583bd8c0122ee70cdd7bb0b06b0944&aid=304142&ucfs=1&arphpl=1&checkin=2022-10-24&checkout=2022-10-30&dest_id=-829252&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=2&hapos=2&sr_order=popularity&srpvid=f0f16af3449102aa&srepoch=1662736362&all_sr_blocks=852390201_352617405_2_0_0&highlighted_blocks=852390201_352617405_2_0_0&matching_block_id=852390201_352617405_2_0_0&sr_pri_blocks=852390201_352617405_2_0_0__30000&from=searchresults#hotelTmpl'
driver.get(test_url)
time.sleep(3)
soup2 = BeautifulSoup(driver.page_source, 'lxml')
guests = soup2.select_one('span.xp__guests__count')
guests = guests.text if guests else None
amenities = soup2.select_one('div.hprt-facilities-block')
The result is this one '\n2 adults\n·\n\n0 children\n\n·\n\n1 room\n\n'
I know that with some regexp I can extract the information, but I would like to understand whether there is a way to extract the "2 adults" part directly.
Thanks.
This is one way to get that information, without using BeautifulSoup (why parse the page twice?):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
[...]
wait = WebDriverWait(browser, 20)
url = 'https://www.booking.com/hotel/gr/diamandi-20.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaFyIAQGYAQm4ARjIAQzYAQPoAQGIAgGoAgS4ApGp7ZgGwAIB0gIkZTBjOTA2MTQtYTc0MC00YWUwLTk5ZWEtMWNiYzg3NThiNGQ12AIE4AIB&sid=47583bd8c0122ee70cdd7bb0b06b0944&aid=304142&ucfs=1&arphpl=1&checkin=2022-10-24&checkout=2022-10-30&dest_id=-829252&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=2&hapos=2&sr_order=popularity&srpvid=f0f16af3449102aa&srepoch=1662736362&all_sr_blocks=852390201_352617405_2_0_0&highlighted_blocks=852390201_352617405_2_0_0&matching_block_id=852390201_352617405_2_0_0&sr_pri_blocks=852390201_352617405_2_0_0__30000&from=searchresults#hotelTmpl'
browser.get(url)
guest_count = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "span[class='xp__guests__count']"))).find_element(By.TAG_NAME, "span")
print(guest_count.text)
Result in terminal:
2 adults
Selenium docs can be found at https://www.selenium.dev/documentation/
I haven't used BeautifulSoup. I use Selenium. This is how I would do it in Selenium:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
driver.maximize_window()
test_url = 'https://www.booking.com/hotel/gr/diamandi-20.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaFyIAQGYAQm4ARjIAQzYAQPoAQGIAgGoAgS4ApGp7ZgGwAIB0gIkZTBjOTA2MTQtYTc0MC00YWUwLTk5ZWEtMWNiYzg3NThiNGQ12AIE4AIB&sid=47583bd8c0122ee70cdd7bb0b06b0944&aid=304142&ucfs=1&arphpl=1&checkin=2022-10-24&checkout=2022-10-30&dest_id=-829252&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=2&hapos=2&sr_order=popularity&srpvid=f0f16af3449102aa&srepoch=1662736362&all_sr_blocks=852390201_352617405_2_0_0&highlighted_blocks=852390201_352617405_2_0_0&matching_block_id=852390201_352617405_2_0_0&sr_pri_blocks=852390201_352617405_2_0_0__30000&from=searchresults#hotelTmpl'
driver.get(test_url)
time.sleep(3)
element = driver.find_element(By.XPATH, "//span[@class='xp__guests__count']")
adults = int(element.text.split(" adults")[0])
print(str(adults))
Basically, I find the span element that contains the text you are looking for. .text gives you all the inner text (in this case, "2 adults · 0 children · 1 room").
The next line takes only the part of the string that comes before " adults", then casts it as an int.
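If you later need the children and room counts too, here is a small sketch (assuming the text keeps the "2 adults · 0 children · 1 room" format shown in the question) using a regular expression:
import re

# e.g. element.text from the span located in the answer above
text = "2 adults · 0 children · 1 room"
counts = {label: int(num)
          for num, label in re.findall(r"(\d+)\s+(adults?|child(?:ren)?|rooms?)", text)}
print(counts)  # {'adults': 2, 'children': 0, 'room': 1}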

How to collect all hrefs using xpath? Selenium - Python

I'm trying to collect all five of the social media links from the artist in this example. Currently, my output is only the LAST (fifth) social media link. I'm using Selenium; I understand this may not be the best option for collecting this data, but it's all I know at this time.
Note, I've only included relevant code for my question. Thank you in advance for any help/insight.
from cgitb import text
from os import link
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
from random import randint
import pandas as pd
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('disable-infobars')
chrome_options.add_argument('--disable-extensions')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(chrome_options=chrome_options)
for url in urls:
    driver.get("https://soundcloud.com/flux-pavilion")
    time.sleep(randint(3,4))
    try:
        links = driver.find_elements_by_xpath('//*[@id="content"]/div/div[4]/div[2]/div/article[1]/div[2]/ul/li//a[@href]')
        for elem in links:
            socialmedia = (elem.get_attribute("href"))
    except:
        links = "none"
    artist = {
        'socialmedia': socialmedia,
    }
    print(artist)
The problem is not with your XPath expression, but rather with the (non-existent) list processing in your output code.
Your code output only the last item of the resulting XPath list. That is why you only received one link (it was the last one).
So change the output part of your code to
[...]
url = driver.get("https://soundcloud.com/flux-pavilion")
time.sleep(randint(3,4))
artist = []
try:
    links = driver.find_elements_by_xpath('//*[@id="content"]/div/div[4]/div[2]/div/article[1]/div[2]/ul/li//a[@href]')
    for elem in links:
        artist.append(elem.get_attribute("href"))
except:
    links = "none"
for link in artist:
    print(link)
And the output will contain all of the values (links) you desire:
https://gate.sc/?url=https%3A%2F%2Ftwitter.com%2FFluxpavilion&token=da4a8d-1-1653430570528
https://gate.sc/?url=https%3A%2F%2Finstagram.com%2FFluxpavilion&token=277ea0-1-1653430570529
https://gate.sc/?url=https%3A%2F%2Ffacebook.com%2FFluxpavilion&token=4c773c-1-1653430570530
https://gate.sc/?url=https%3A%2F%2Fyoutube.com%2FFluxpavilion&token=1353f7-1-1653430570531
https://gate.sc/?url=https%3A%2F%2Fopen.spotify.com%2Fartist%2F7muzHifhMdnfN1xncRLOqk%3Fsi%3DbK9XeoW5RxyMlA-W9uVwPw&token=bc2936-1-1653430570532
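Note that these hrefs are gate.sc redirect links; if you want the underlying profile URLs, here is a small sketch (assuming gate.sc keeps the real target in its url query parameter) using only the standard library:
from urllib.parse import urlparse, parse_qs

# Decode the percent-encoded target URL out of one of the gate.sc links above.
gate_link = "https://gate.sc/?url=https%3A%2F%2Ftwitter.com%2FFluxpavilion&token=da4a8d-1-1653430570528"
params = parse_qs(urlparse(gate_link).query)
print(params["url"][0])  # -> https://twitter.com/Fluxpavilion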

How can I web scrape information from a website that has all the tags in the <pre> (preformatted) tag section?

I am creating a Python crawler that scrapes information from the Interpol website. I was able to scrape information from the first page, such as names of people, date of birth, nationality, etc. To scrape information from the second page, I first got the URL from the tag and made my program click the link. When I went to that URL, I found that all the information (meaning all the tags) was inside the <pre> tag section. I am confused about why that is the case. So my question is: how can I get the information from inside the <pre> tag section where all the other tags are? I am trying to get people's names, birthdays, their corresponding links, etc. I am using Selenium, by the way. I will put the URL of the website below, along with the URL of the second page that I found in the tag. I hope that helps you understand what I am talking about.
Main Website:
https://www.interpol.int/en/How-we-work/Notices/View-Red-Notices
The second-page link I found in the tag:
https://ws-public.interpol.int/notices/v1/red?resultPerPage=20&page=2
The code for the problem I have so far will be posted down below:
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'https://www.interpol.int/en/How-we-work/Notices/View-Red-Notices'
driver = webdriver.Chrome(executable_path="c:\\SeliniumWebDrivers\\chromedriver.exe")
driver.get(url)   # go to the website
url = []          # to hold all the URLs of the people
names = []        # to hold the names of the people
age = []          # to hold the ages of the people
nationality = []  # to hold the nationalities of the people
newwindow = []    # to hold all the next-page links
y = 0
g = 1
try:
    driver.get(driver.current_url)
    main = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'noticesResultsItemList'))
    )
    links = main.find_elements_by_tag_name("a")
    years = main.find_elements_by_class_name("age")
    borns = main.find_elements_by_class_name("nationalities")
    for link in links:
        newurl = link.get_attribute('href')
        url.append(newurl)
        names.append(link.text)  # adding the names
        y += 1
    for year in years:
        age.append(year.text)  # adding the age to the list
    for nation in borns:
        nationality.append(nation.text)  # adding the nationality to the list
    driver.get(driver.current_url)
    driver.refresh()
    next = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.ID, 'paginationPanel'))
    )
    pages = next.find_elements_by_tag_name("a")
    for page in pages:
        newlink = page.get_attribute('href')
        newwindow.append(newlink)
    # to get to the next page
    print(newwindow[2])
    driver.get(newwindow[2])
except Exception as e:
    print(e)
You can use Selenium to click the next page instead of getting the URL. This is just a simple example; you may need to use a loop to extract the data and click the next page. I've used the variable browser instead of main. I've written a function and used a for loop to get the data from each page:
from selenium import webdriver
import time
from selenium.common.exceptions import NoSuchElementException,ElementNotInteractableException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
browser = webdriver.Chrome('/home/cam/Downloads/chromedriver')
url='https://www.interpol.int/en/How-we-work/Notices/View-Red-Notices'
browser.get(url)
def get_data():
    links = browser.find_elements_by_tag_name("a")
    years = browser.find_elements_by_class_name("age")
    borns = browser.find_elements_by_class_name("nationalities")

time.sleep(5)
try:
    browser.find_element_by_xpath('//*[@id="privacy-cookie-banner__privacy-accept"]').click()
except ElementNotInteractableException:
    pass
for i in range(1,9):
    print(i)
    get_data()
    print('//*[@id="paginationPanel"]/div/div/ul/li['+str(i+2)+']/a')
    b = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="paginationPanel"]/div/div/ul/li['+str(i+2)+']/a')))
    b.click()
    time.sleep(10)
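Alternatively, since the second-page URL from the question already returns JSON, you could request it directly instead of rendering the <pre> tag in a browser. Here is a minimal sketch (the response structure isn't shown in the question, so inspect it before extracting fields):
import requests

# The second-page URL from the question is a JSON API, so it can be fetched directly.
# The field names in the response are not shown in the question, so inspect `data`
# before extracting names, ages, nationalities, etc.
resp = requests.get(
    "https://ws-public.interpol.int/notices/v1/red",
    params={"resultPerPage": 20, "page": 2},
    timeout=10,
)
resp.raise_for_status()
data = resp.json()
print(list(data.keys()))  # explore the structure first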

How do I run through a list of links one by one and then scrape data using selenium(driver.get)?

I'm trying to loop through 2 sets of links. Starting with https://cuetracker.net/seasons > click through each season link (Last 5 seasons) and then click through each tournament link within each season link and scrape the match data from each tournament.
Using the code below, I have managed to get the list of season links I want, but when I try to grab the tournament links and put them into a list, it only gets the last season's tournament links instead of each season's.
I'd guess it's something to do with driver.get completing before the next lines of code run, and that I need to loop/iterate using indexes, but I'm a complete novice so I'm not too sure.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
Chrome_Path = r"C:\Users\George\Desktop\chromedriver.exe"
Browser = webdriver.Chrome(Chrome_Path)
Browser.get("https://cuetracker.net/seasons")
links = Browser.find_elements_by_css_selector("table.table.table-striped a")
hrefs=[]
for link in links:
    hrefs.append(link.get_attribute("href"))
hrefs = hrefs[1:5]
for href in hrefs:
    Browser.get(href)
    links2 = Browser.find_elements_by_partial_link_text("20")
    hrefs2 = []
    for link in links2:
        hrefs2.append(link.get_attribute("href"))
You are pretty close and you are right about "you just need to wait a bit".
You could wait for the page to load: wait_for_page_load checks the document readyState, and if everything is loaded then you are good to go. Check this thread for more. :)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import os
import re
import time
import pandas as pd
def wait_for_page_load():
    timer = 10
    start_time = time.time()
    page_state = None
    while page_state != 'complete':
        time.sleep(0.5)
        page_state = Browser.execute_script('return document.readyState;')
        if time.time() - start_time > timer:
            raise Exception('Timeout :(')

Chrome_Path = r"C:\Users\George\Desktop\chromedriver.exe"
Browser = webdriver.Chrome()
Browser.get("https://cuetracker.net/seasons")
links = Browser.find_elements_by_css_selector("table.table.table-striped a")
hrefs = []
for link in links:
    hrefs.append(link.get_attribute("href"))
hrefs = hrefs[1:5]
hrefs2 = {}
for href in hrefs:
    hrefs2[href] = []
    Browser.get(href)
    wait_for_page_load()
    links2 = Browser.find_elements_by_partial_link_text("20")
    for link in links2:
        hrefs2[href].append((link.get_attribute("href")))
A few notes if you don't mind:
Browser should be browser or driver; the same applies to Chrome_Path.
Check out XPath, it is awesome.
EDIT:
I've been sloppy for the first time so I've updated the answer to answer the question :D. Waiting for page load is still a good idea :)
The problem was that you re-defined hrefs2 in each cycle so it always contained the result of the last iteration.
About why XPath:
If you would like to load results from before 2000, your URL-collecting logic would break. You could still do this:
table = Browser.find_element_by_xpath('//*[@class="table table-striped"]')
all_urls = [x.get_attribute('href') for x in table.find_elements_by_xpath('.//tr/td[2]/a')]
Here you find the table by its class name, then collect the URLs from the second column of the table.
If you know the URL pattern you can even do this:
all_urls = [x.get_attribute('href') for x in Browser.find_elements_by_xpath('//td//a[contains(@href, "https://cuetracker.net/tournaments")]')]
The XPath above:
//td <- at any depth of the document tree, find td elements
//a <- within the collected td elements, get all a descendants (at any depth)
[contains(@href, "https://cuetracker.net/tournaments")] <- from the collected a elements, keep those whose href attribute contains the text "https://cuetracker.net/tournaments" (partial match)

My scraper fails to get all the items from a webpage

I've written some code in Python with Selenium to parse different product names from a webpage. A few load-more buttons appear as the browser scrolls down, and the webpage displays its full content only once it has been scrolled all the way down and there is no load-more button left to click. My scraper seems to be doing well, but I'm not getting all the results. There are around 200 products on that page but I'm only getting 90 of them. What change should I make to my scraper to get them all? Thanks in advance.
The webpage I'm dealing with: Page_Link
This is the script I'm trying with:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("put_above_url_here")
wait = WebDriverWait(driver, 10)
page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,".listing_item")))
for scroll in range(17):
    page.send_keys(Keys.PAGE_DOWN)
    time.sleep(2)
    try:
        load = driver.find_element_by_css_selector(".lm-btm")
        load.click()
    except Exception:
        pass
for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^=item_]"))):
    name = item.find_element_by_css_selector(".pro-name.el2").text
    print(name)
driver.quit()
Try the code below to get the required data:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("https://www.purplle.com/search?q=hair%20fall%20shamboo")
wait = WebDriverWait(driver, 10)
header = driver.find_element_by_tag_name("header")
driver.execute_script("arguments[0].style.display='none';", header)
while True:
    try:
        page = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".listing_item")))
        driver.execute_script("arguments[0].scrollIntoView();", page)
        page.send_keys(Keys.END)
        load = wait.until(EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "LOAD MORE")))
        driver.execute_script("arguments[0].scrollIntoView();", load)
        load.click()
        wait.until(EC.staleness_of(load))
    except:
        break
for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^=item_]"))):
    name = item.find_element_by_css_selector(".pro-name.el2").text
    print(name)
driver.quit()
You should only use Selenium as a last resort.
A quick look around the webpage revealed the API it calls to get your data.
It returns JSON output with all the details:
Link
You can now just loop over it and store it in a dataframe easily.
Very fast, with fewer errors than Selenium.
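Here is a minimal sketch of that pattern (the real endpoint is only referenced as "Link" above, so the URL below is a placeholder and the response shape is an assumption):
import pandas as pd
import requests

# Placeholder URL: substitute the actual search API spotted in the browser's
# network tab (it is only referenced as "Link" in the answer above).
api_url = "https://example.com/search-api?q=hair+fall+shampoo"
payload = requests.get(api_url, timeout=10).json()
# The JSON layout depends on the real response; flatten it once you know the keys.
df = pd.json_normalize(payload)
print(df.head())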
