Unable to collect all the shop names from a webpage - python

I've written a script in Python to parse some names from a webpage. The items on that webpage aren't all displayed at once; rather, it is necessary to scroll to the bottom to make the page release a few more items, and then a few more upon another scroll, and so on until all the items are visible. The problem is that the items are not located in the body, which is why the command driver.execute_script("return document.body.scrollHeight;") is not working (IMO). They are located in an area on the left side, like a sliding container. How can I reach the bottom of that container and parse the names from the webpage? I've written almost all the code except for the part controlling the lazy load. I'm attaching an image to give you an idea of what I mean by calling it a sliding container.
The link to that webpage: Link
This is what I've tried so far:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("replace_the_above_link")

check_height = driver.execute_script("return document.body.scrollHeight;")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    height = driver.execute_script("return document.body.scrollHeight;")
    if height == check_height:
        break
    check_height = height

for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".select_list h2 a"))):
    print(item.text)
driver.quit()
This is the image of the box which contains the items: Click Here
Currently my scraper only parses the items that are visible when the page is loaded.

The code below should let you keep firing the XHR requests by scrolling the container as many times as possible, and then scrape the required data:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("https://www.weedsta.com/dispensaries/in/california")

entries_count = len(wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "select_list"))))
while True:
    # send END to an element inside the list so its scrollable container scrolls to the bottom
    driver.find_element_by_class_name("tel").send_keys(Keys.END)
    try:
        # wait until more entries are loaded than before this scroll
        wait.until(lambda driver: entries_count < len(driver.find_elements_by_class_name("select_list")))
        entries_count = len(driver.find_elements_by_class_name("select_list"))
    except TimeoutException:
        break

for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".select_list h2 a"))):
    print(item.text)
driver.quit()
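If sending the END key doesn't trigger the lazy load, another option is to scroll the sliding container itself with JavaScript instead of the window. This is a minimal sketch; div.scrollable-panel is a hypothetical selector for the container, so check the real class in devtools first:
import time

# the container selector is an assumption: inspect the page to find
# the element that actually owns the scrollbar of the left-side panel
container = driver.find_element_by_css_selector("div.scrollable-panel")

last_count = len(driver.find_elements_by_class_name("select_list"))
while True:
    # scroll the container, not the window, to its current bottom
    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", container)
    time.sleep(2)
    new_count = len(driver.find_elements_by_class_name("select_list"))
    if new_count == last_count:   # nothing new appeared: the list is exhausted
        break
    last_count = new_count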

Related

Selenium is returning empty text for elements that definitely have text

I'm practicing by trying to scrape my university's course catalog. I have a few lines in Python that open the url in Chrome and click the search button to bring up the course catalog. When I go to extract the text using find_elements_by_xpath(), it comes back blank. When I use the dev tools in Chrome, there definitely is text there.
from selenium import webdriver
import time

driver = webdriver.Chrome()
url = 'https://courses.osu.edu/psp/csosuct/EMPLOYEE/PUB/c/COMMUNITY_ACCESS.OSR_CAT_SRCH.GBL?'
driver.get(url)
time.sleep(3)
iframe = driver.find_element_by_id('ptifrmtgtframe')
driver.switch_to.frame(iframe)
element = driver.find_element_by_xpath('//*[@id="OSR_CAT_SRCH_WK_BUTTON1"]')
element.click()
course = driver.find_elements_by_xpath('//*[@id="OSR_CAT_SRCH_OSR_CRSE_HEADER$0"]')
print(course)
I'm trying to extract the text from the element 'OSR_CAT_SRCH_OSR_CRSE_HEADER'. I don't understand why it's not returning the text values, especially when I can see that it contains text with the dev tools.
You are not using .text, and that is the reason you are not getting the text.
course = driver.find_element_by_xpath('//*[@id="OSR_CAT_SRCH_OSR_CRSE_HEADER$0"]').text
Try the above change in the second-to-last line.
Below is the full code after the changes:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

driver = webdriver.Chrome()
url = 'https://courses.osu.edu/psp/csosuct/EMPLOYEE/PUB/c/COMMUNITY_ACCESS.OSR_CAT_SRCH.GBL?'
driver.get(url)
time.sleep(3)
iframe = driver.find_element_by_id('ptifrmtgtframe')
driver.switch_to.frame(iframe)
element = driver.find_element_by_xpath('//*[@id="OSR_CAT_SRCH_WK_BUTTON1"]')
element.click()
# wait up to 10 seconds for the first course header to be present, then read its text
course = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="OSR_CAT_SRCH_OSR_CRSE_HEADER$0"]'))
).text
print(course)
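Note that the $0 suffix in that id addresses only the first row of the results grid. If you want every course header, a minimal sketch continuing from the script above would be to match the ids by prefix instead, assuming the grid keeps numbering its rows $0, $1, $2, and so on:
# grab every row whose id starts with the course-header prefix
courses = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, '//*[starts-with(@id, "OSR_CAT_SRCH_OSR_CRSE_HEADER$")]'))
)
for course in courses:
    print(course.text)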

Scroll with Keys.PAGE_DOWN in Selenium Python

Hello everyone, can anyone help me with scrolling https://www.grainger.com/category/black-pipe-fittings/pipe-fittings/pipe-tubing-and-fittings/plumbing/ecatalog/N-qu1?searchRedirect=products
I want to scroll this using
actions = ActionChains(browser)
actions.send_keys(Keys.PAGE_DOWN)
actions.perform()
until it reaches the bottom of the scroll, where it will find a "Load More" element:
loadMoreButton = browser.find_element_by_css_selector(
    ".btn.list-view__load-more.list-view__load-more--js")
loadMoreButton.click()
Once the Load More button is clicked, it has to perform the scroll action again, and then the load-more action again, until the Load More button is no longer available.
I have to use this page-down action because the element does not load until the page is scrolled down to it. If anyone could suggest a solution, it would be of great help.
This has worked for me with zero issues...
from selenium.webdriver.common.keys import Keys
driver.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
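If you want to drive the whole flow with PAGE_DOWN, you can wrap that line in a loop and reuse the "Load More" selector from the question. This is a minimal sketch, not a tested solution; the scroll count and the sleeps are assumptions you may need to tune:
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
import time

body = driver.find_element_by_tag_name('body')
while True:
    for _ in range(10):              # page down a few screens at a time
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(.3)
    try:
        loadMoreButton = driver.find_element_by_css_selector(
            ".btn.list-view__load-more.list-view__load-more--js")
        loadMoreButton.click()
        time.sleep(2)                # give the new products time to load
    except (NoSuchElementException, ElementNotInteractableException):
        # no clickable button; stop only once the page bottom is reached
        at_bottom = driver.execute_script(
            "return window.pageYOffset + window.innerHeight >= document.body.scrollHeight;")
        if at_bottom:
            break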
To scroll the page https://www.grainger.com/category/black-pipe-fittings/pipe-fittings/pipe-tubing-and-fittings/plumbing/ecatalog/N-qu1?searchRedirect=products until it reaches the bottom, find the element with the text View More, and click it repeatedly until it is no longer available, you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument('disable-infobars')
browser = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
browser.get("https://www.grainger.com/category/black-pipe-fittings/pipe-fittings/pipe-tubing-and-fittings/plumbing/ecatalog/N-qu1?searchRedirect=products")
while True:
    try:
        browser.execute_script("return arguments[0].scrollIntoView(true);", WebDriverWait(browser, 10).until(EC.visibility_of_element_located((By.XPATH, "//a[@class='btn list-view__load-more list-view__load-more--js' and normalize-space()='View More']"))))
        browser.execute_script("arguments[0].click();", WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, "//a[@class='btn list-view__load-more list-view__load-more--js' and normalize-space()='View More']"))))
        print("View More button clicked")
    except (TimeoutException, StaleElementReferenceException) as e:
        print("No more View More buttons")
        break
browser.quit()
Console Output:
View More button clicked
View More button clicked
No more View More buttons
@PedroLobito I am trying to retrieve the product links, can you help me with this?
No need for selenium in this case, just sniff the xhr requests via developer tools and go straight to the gold (json).
The url structure for products is as follows:
https://www.x.com/product/anything-Item#
Just add the Item # value in the json object at the end of the url, something like:
https://www.x.com/product/anything-5P540
https://www.x.com/product/anything-5P541
...
py3 example (for py2, just change the format syntax):
import json
import requests

main_cat = "WP7115916"
sub_cat = "4836"

x = requests.get(f"https://www.x.com/product/tableview/GRAINGER-APPROVED-Square-Head-Plugs-{main_cat}/_/N-qu1?searchRedirect=products&breadcrumbCatId={sub_cat}&s_pp=false").json()
for p in x['records']:
    for childs in p['children']:
        for item in json.loads(childs['collapseValues']):
            url = f"https://www.x.com/product/lol-{item['sku']}"
            print(url)
https://www.x.com/product/lol-5P540
https://www.x.com/product/lol-5P541
https://www.x.com/product/lol-5P542
https://www.x.com/product/lol-5P543
https://www.x.com/product/lol-5P544
https://www.x.com/product/lol-5P545
https://www.x.com/product/lol-5P546
https://www.x.com/product/lol-5P547
https://www.x.com/product/lol-5P548
...
One of the best methods for smooth scrolling...
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

html = driver.find_element(By.XPATH, '//body')
total_scrolled = 0
page_height = driver.execute_script("return document.body.scrollHeight")
while total_scrolled < page_height:
    html.send_keys(Keys.PAGE_DOWN)
    total_scrolled += 400
    time.sleep(.5)
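One caveat with the snippet above: page_height is read only once, so on a lazy-loading page the loop can stop before newly appended content is reached. A minimal sketch of the same idea that re-reads the height on every pass (the 400 px per PAGE_DOWN is the same assumption as above):
total_scrolled = 0
while True:
    html.send_keys(Keys.PAGE_DOWN)
    total_scrolled += 400
    time.sleep(.5)
    # re-read the height so content appended by lazy loading extends the loop
    page_height = driver.execute_script("return document.body.scrollHeight")
    if total_scrolled >= page_height:
        break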

Can't fetch the texts from a webpage

I've created a script using Python and Selenium to get all the text available in the following link. The webpage has a lazy-loading method active, which is why more content becomes visible upon each scroll. My script can handle that too.
However, the problem is that when my script makes the webpage exhaust its content by reaching the bottom, it gets stuck right there. Once it breaks out of the loop, I can fetch the content. How can I break out of the loop?
I know .LoadingDots is always there. And that is the only reason I can't find any logic to break the loop.
Link to that site
Here is what I've tried so far (couldn't get rid of the loop):
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("https://www.quora.com/topic/American-Football")
while True:
    try:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".LoadingDots")))
    except Exception:
        break

for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".ui_qtext_rendered_qtext .ui_qtext_para"))):
    print(item.text)
driver.quit()
I know I can solve the issue if I comply with the following:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("https://www.quora.com/topic/American-Football")
last_len = len(wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".ui_qtext_rendered_qtext .ui_qtext_para"))))
while True:
    for load_more in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a[id$='_more']"))):
        driver.execute_script("arguments[0].click();", load_more)
    try:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        wait.until(lambda driver: len(wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".ui_qtext_rendered_qtext .ui_qtext_para")))) > last_len)
        items = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".ui_qtext_rendered_qtext .ui_qtext_para")))
        last_len = len(items)
    except TimeoutException:
        break

for item in items:
    print(item.text)
driver.quit()
My question is: how can I fetch the content from that page, exhausting all the scrolls, the way I tried with my first script making use of .LoadingDots?
When the page is scrolled to the bottom, the element with classes .LoadingDots.regular remains the same, but its parent element gets a new class, hidden. You can check whether the class was added using the get_attribute function. You can also locate the parent directly by its class, spinner_display_area:
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    loading_dots = driver.find_element_by_class_name('spinner_display_area')
    if 'hidden' in loading_dots.get_attribute('class'):
        break
Your script doesn't work as expected because the (By.CSS_SELECTOR, ".LoadingDots") selector returns the element <div class="LoadingDots tiny">, which is always hidden, so your expectation of its invisibility always returns True and the loop cannot be broken.
You need to check the other element with the "LoadingDots" class name, <div class="LoadingDots regular">, and the logic should be the following:
Scroll the page down.
Wait for the loading dots to appear (loading more content has started).
Wait for the loading dots to disappear (loading more content is done).
If no dots are seen after the page is scrolled, break the loop.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 5)
driver.get("https://www.quora.com/topic/American-Football")
while True:
    try:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".LoadingDots.regular")))
        wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".LoadingDots.regular")))
    except Exception:
        # the dots never appeared after the scroll: no more content to load
        break

for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".ui_qtext_rendered_qtext .ui_qtext_para"))):
    print(item.text)
driver.quit()
BUT! Note that I've posted this script just to point out the reason why your script is not working... It's not really efficient: in case the content loads too fast (the possibility is quite low, but...) the script might not catch the moment when the loading dots appear, and you'll not get all the required content.
So @Guy's solution seems to be more reliable (+1).
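For completeness, a break condition based on the page height avoids depending on the dots at all. A minimal sketch that reuses the driver and wait from the scripts above, assuming document.body.scrollHeight grows as new answers are appended:
from selenium.common.exceptions import TimeoutException

last_height = driver.execute_script("return document.body.scrollHeight;")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        # a timeout here means the scroll loaded nothing new
        wait.until(lambda d: d.execute_script("return document.body.scrollHeight;") > last_height)
    except TimeoutException:
        break
    last_height = driver.execute_script("return document.body.scrollHeight;")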

Can't exhaust the load more button to unveil all the headlines

I've tried to keep clicking on the More button located at the bottom of a webpage (on its landing page) to unveil all the headlines. The thing is, when I execute my script, it only clicks once and then stops. How can I keep clicking on that button until there is no more option to click?
Link to that website
This is my script so far:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

link = "https://www.newsnow.co.uk/h/Sport/Football/Championship/Transfer+News"

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(link)
while True:
    try:
        loadmore = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "[class^='hl_more']")))
        driver.execute_script("arguments[0].scrollIntoView();", loadmore)
        loadmore.click()
    except Exception:
        break
driver.quit()
Try the code below to simulate the required behavior. The page header is fixed, so after scrollIntoView() it can overlap the button and intercept the click; switching its position to absolute avoids that, and the :not(.nfloading) selector skips the button while it is in its loading state:
header = driver.find_element_by_id("phead")
# un-fix the header so it cannot cover the button and swallow the click
driver.execute_script('arguments[0].style.position = "absolute";', header)
while True:
    try:
        loadmore = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a.hl_more.bg_tween:not(.nfloading)")))
        driver.execute_script("arguments[0].scrollIntoView();", loadmore)
        loadmore.click()
    except Exception:
        break
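If the clicks go through but the loop still exits early, it can also help to wait for new headlines after every click before hunting for the button again. A minimal sketch; the .hl__list-item headline selector is a hypothetical placeholder to replace with whatever the real headline markup uses:
from selenium.common.exceptions import TimeoutException

# ".hl__list-item" is a hypothetical headline selector: verify it in devtools
headline_count = len(driver.find_elements_by_css_selector(".hl__list-item"))
while True:
    try:
        loadmore = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a.hl_more.bg_tween:not(.nfloading)")))
        driver.execute_script("arguments[0].click();", loadmore)
        # wait until the click has actually added headlines before clicking again
        wait.until(lambda d: len(d.find_elements_by_css_selector(".hl__list-item")) > headline_count)
        headline_count = len(driver.find_elements_by_css_selector(".hl__list-item"))
    except TimeoutException:
        break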

My scraper fails to get all the items from a webpage

I've written some code in Python in combination with Selenium to parse different product names from a webpage. There are a few load-more buttons that become visible as the browser is scrolled downward. The webpage displays its full content only if it is scrolled all the way down, until there is no load-more button left to click. My scraper seems to be doing well, but I'm not getting all the results. There are around 200 products on that page but I'm getting 90 of them. What change should I make in my scraper to get them all? Thanks in advance.
The webpage I'm dealing with: Page_Link
This is the script I'm trying with:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("put_above_url_here")
wait = WebDriverWait(driver, 10)
page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".listing_item")))
for scroll in range(17):
    page.send_keys(Keys.PAGE_DOWN)
    time.sleep(2)
    try:
        load = driver.find_element_by_css_selector(".lm-btm")
        load.click()
    except Exception:
        pass

for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^=item_]"))):
    name = item.find_element_by_css_selector(".pro-name.el2").text
    print(name)
driver.quit()
Try the code below to get the required data:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.purplle.com/search?q=hair%20fall%20shamboo")
wait = WebDriverWait(driver, 10)

# hide the fixed header so it cannot intercept clicks on the button
header = driver.find_element_by_tag_name("header")
driver.execute_script("arguments[0].style.display='none';", header)

while True:
    try:
        page = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".listing_item")))
        driver.execute_script("arguments[0].scrollIntoView();", page)
        page.send_keys(Keys.END)
        load = wait.until(EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "LOAD MORE")))
        driver.execute_script("arguments[0].scrollIntoView();", load)
        load.click()
        wait.until(EC.staleness_of(load))
    except Exception:
        break

for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^=item_]"))):
    name = item.find_element_by_css_selector(".pro-name.el2").text
    print(name)
driver.quit()
You should only use Selenium as a last resort.
A simple look around the webpage showed the API it called to get your data.
It returns a JSON output with all the details:
Link
You can now just loop over it and store it in a dataframe easily.
Very fast, and fewer errors than Selenium.
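As a rough illustration of that approach, here is a minimal sketch; the endpoint URL and the items/name/price fields are placeholders for whatever the XHR request in the devtools Network tab actually shows:
import requests
import pandas as pd

# hypothetical endpoint and JSON shape: copy the real ones from the
# devtools Network tab (filter by XHR) while the page loads more items
api_url = "https://www.example.com/api/search?q=hair+fall+shampoo&page=1"
data = requests.get(api_url).json()

rows = [{"name": item["name"], "price": item["price"]} for item in data["items"]]
df = pd.DataFrame(rows)
print(df.head())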
