Helium or Selenium won't click a button or link - Python

I am scraping a sneaker website, Skechers. When the LOAD MORE link/button appears while scrolling down to load more products, my script clicks the element but nothing loads.
Here is my CODE:
from helium import *
import time
from bs4 import BeautifulSoup

# Open the product listing and give the page time to finish its initial load.
s = start_firefox("https://www.skechers.com/men/shoes/boots/?srule=price-low-to-high&start=0&sz=24")
time.sleep(5)

for x in range(1, 5):
    # Scroll further down the listing before looking for the button.
    scroll_down(num_pixels=1000)
    time.sleep(3)
    try:
        click(Button('LOAD MORE'))
        time.sleep(10)
    except Exception:
        # Best effort: a failed click is ignored and the next scroll is tried.
        continue

# Parse whatever the browser currently holds, then close it.
soup = BeautifulSoup(s.page_source, "lxml")
kill_browser()
I have also tried click("LOAD MORE") and click(Link("LOAD MORE")), but neither of them seems to work :(

from helium import *
import time
from bs4 import BeautifulSoup

# Open the product listing and give the page time to finish its initial load.
s = start_firefox("https://www.skechers.com/men/shoes/boots/?srule=price-low-to-high&start=0&sz=24")
time.sleep(5)

for x in range(1, 5):
    scroll_down(num_pixels=1000)
    time.sleep(3)
    try:
        # Locate the button through the underlying Selenium driver by XPath
        # instead of Helium's text-based Button(...) lookup.
        button = s.find_element_by_xpath('//button[contains(text(),"Load More")]')
        button.click()
        time.sleep(10)
    except Exception:
        # Best effort: if the button cannot be found or clicked, keep scrolling.
        continue

soup = BeautifulSoup(s.page_source, "lxml")
kill_browser()
Use XPath. For some reason that method does not work in Firefox, but it works well in Chrome.

Related

How to scrape website if it has load more button to load more content on the page?

from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path=r'C:\Users\gkhat\Downloads\chromedriver.exe')
driver.get('https://www.allrecipes.com/recipes/233/world-cuisine/asian/indian/')
card_titles = driver.find_elements_by_class_name('card__detailsContainer')
button = driver.find_element_by_id('category-page-list-related-load-more-button')

# Print the title of each card collected above, i.e. before the button is clicked.
for card_title in card_titles:
    rname = card_title.find_element_by_class_name('card__title').text
    print(rname)

time.sleep(3)
# Bring the button into view and trigger its click handler through JavaScript.
driver.execute_script("arguments[0].scrollIntoView(true);", button)
driver.execute_script("arguments[0].click();", button)
time.sleep(3)
driver.quit()
The website loads more food cards after the "Load More" button is clicked. The code above scrapes the recipe titles; I want it to keep scraping titles even after clicking the "Load More" button.
I tried going to the Network tab and clicking on XHR, but none of the requests shows the JSON. What should I do?
I tried below code for that. It works, but I am not sure if this is the best way to do it. FYI I handled those pop-ups for email manually. You need to find a way to handle them.
from selenium import webdriver
import time
from selenium.common.exceptions import StaleElementReferenceException

driver = webdriver.Chrome(executable_path="path")
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://www.allrecipes.com/recipes/233/world-cuisine/asian/indian/")

# Cards that are on the page before "Load More" is ever clicked.
receipes = driver.find_elements_by_class_name("card__detailsContainer")
for rec in receipes:
    name = rec.find_element_by_tag_name("h3").get_attribute("innerText")
    print(name)

loadmore = driver.find_element_by_id("category-page-list-related-load-more-button")
# Number of loaded cards that have already been printed.
j = 0
try:
    while loadmore.is_displayed():
        loadmore.click()
        time.sleep(5)
        # Cards added by "Load More" use a different class than the initial ones.
        lrec = driver.find_elements_by_class_name("recipeCard__detailsContainer")
        # Only print the cards that appeared since the previous click.
        for rec in lrec[j:]:
            name = rec.find_element_by_tag_name("h3").get_attribute("innerText")
            print(name)
        # Resume exactly after the last printed card; `len(lrec) + 1` would
        # skip the first card of every following batch.
        j = len(lrec)
        time.sleep(5)
except StaleElementReferenceException:
    # The button reference went stale; treated as the end of the listing.
    pass
finally:
    # Always close the browser, even if scraping failed part-way through.
    driver.quit()
Actually, there is a JSON endpoint that returns the data. However, the JSON carries the data as HTML, so you just need to parse that.
Note: You can change the chunk size so you can get more than 24 items per "page"
import requests
from bs4 import BeautifulSoup

# Number of items requested per page.
size = 24
page = 0
hasNext = True
while hasNext:
    page += 1
    print('\tPage: %s' % page)
    url = 'https://www.allrecipes.com/element-api/content-proxy/aggregate-load-more?sourceFilter%5B%5D=alrcom&id=cms%2Fonecms_posts_alrcom_2007692&excludeIds%5B%5D=cms%2Fallrecipes_recipe_alrcom_142967&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_231026&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_247233&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_246179&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_256599&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_247204&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_34591&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_245131&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_220560&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_212721&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_236563&excludeIds%5B%5D=cms%2Fallrecipes_recipe_alrcom_14565&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_8189766&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_8188886&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_8189135&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_2052087&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_7986932&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_2040338&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_280310&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_142967&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_14565&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_228957&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_46822&excludeIds%5B%5D=cms%2Fonecms_posts_alrcom_72349&page={page}&orderBy=Popularity30Days&docTypeFilter%5B%5D=content-type-recipe&docTypeFilter%5B%5D=content-type-gallery&size={size}&pagesize={size}&x-ssst=iTv629LHnNxfbQ1iVslBTZJTH69zVWEa&variant=food'.format(size=size, page=page)
    jsonData = requests.get(url).json()
    # The response's own 'hasNext' field decides whether another page is fetched.
    hasNext = jsonData['hasNext']
    # The cards arrive as an HTML fragment inside the JSON payload.
    soup = BeautifulSoup(jsonData['html'], 'html.parser')
    cardTitles = soup.find_all('h3', {'class': 'recipeCard__title'})
    for title in cardTitles:
        print(title.text.strip())

How to bypass disclaimer while scraping a website

I was able to scrape the following website before using "driver = webdriver.PhantomJS()" for work reason. What I was scraping were the price and the date.
https://www.cash.ch/fonds/swisscanto-ast-avant-bvg-portfolio-45-p-19225268/swc/chf
This stopped working some days ago due to a disclaimer page which I have to agree at first.
https://www.cash.ch/fonds-investor-disclaimer?redirect=fonds/swisscanto-ast-avant-bvg-portfolio-45-p-19225268/swc/chf
Once I agreed, I could see the real content in the browser; the driver, however, apparently cannot: the printout is [], so it must still be on the disclaimer URL.
Please see code below.
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import os

driver = webdriver.PhantomJS()
driver.set_window_size(1120, 550)

#Swisscanto
# URL written without the stray space after "bvg-" so it matches the fund page address.
driver.get("https://www.cash.ch/fonds/swisscanto-ast-avant-bvg-portfolio-45-p-19225268/swc/chf")
s_swisscanto = BeautifulSoup(driver.page_source, 'lxml')
# Price and date are read from <span> elements tagged with data-field-entry.
nav_sc = s_swisscanto.find_all('span', {"data-field-entry": "value"})
date_sc = s_swisscanto.find_all('span', {"data-field-entry": "datetime"})
print(nav_sc)
print(date_sc)
print("Done Swisscanton")
This should work (I think the button you want to click is "zustimmen"?)
driver = webdriver.PhantomJS()
driver.get("https://www.cash.ch/fonds/swisscanto-ast-avant-bvg-portfolio-45-p-19225268/swc/chf")
# Accept the disclaimer by clicking the link labelled "zustimmen".
accept_button = driver.find_element_by_link_text('zustimmen')
accept_button.click()
# Read the page source only after the disclaimer has been accepted.
content = driver.page_source
More details here
python selenium click on button

Checking the clickability of an element in selenium using python

I've been trying to write a script which will give me all the links to the episodes present on this page :- http://www.funimation.com/shows/assassination-classroom/videos/episodes
As you can see that the links can be seen in 'Outer HTML', I used selenium and PhantomJS with python.
Link Example: http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time
However, I can't seem to get my code right. I do have a basic Idea of what I want to do. Here's the process :-
1.) Copy the Outer HTML of the very first page and then save it as 'Source_html' file.
2.) Look for links inside this file.
3.) Move to the next page to see rest of the videos and their links.
4.) Repeat the step 2.
This is what my code looks like :
# NOTE: asker's non-working Python 2 draft, kept verbatim (including its
# syntax errors) because the text that follows discusses these exact lines.
from selenium import webdriver
from selenium import selenium
from bs4 import BeautifulSoup
import time
# ---------------------------------------------------------------------------------------------
driver = webdriver.PhantomJS()
driver.get('http://www.funimation.com/shows/assassination-classroom/videos/episodes')
# Grab the outer HTML of the first element matched by "//*" and save it to disk.
elem = driver.find_element_by_xpath("//*")
source_code = elem.get_attribute("outerHTML")
f = open('source_code.html', 'w')
f.write(source_code.encode('utf-8'))
f.close()
print 'Links On First Page Are : \n'
# Passes the file NAME as markup, not the file's contents (acknowledged by the asker).
soup = BeautifulSoup('source_code.html')
subtitles = soup.find_all('div',{'class':'popup-heading'})
# Placeholder value; it is rebound by the loop on the next line.
official = 'something'
for official in subtitles:
x = official.findAll('a')
for a in x:
print a['href']
# The trailing ':' makes this statement a syntax error.
sbtn = driver.find_element_by_link_text(">"):
print sbtn
print 'Entering The Loop Now'
# Not valid `for` syntax: the intent is to repeat while the ">" link can still be clicked.
for driver.find_element_by_link_text(">"):
sbtn.click()
time.sleep(3)
elem = driver.find_element_by_xpath("//*")
source_code = elem.get_attribute("outerHTML")
f = open('source_code1.html', 'w')
f.write(source_code.encode('utf-8'))
f.close()
Things I already know :-
soup = BeautifulSoup('source_code.html') won't work, because I need to open this file via python and feed it into BS after that. That I can manage.
That official variable isn't really doing anything. Just helping me start a loop.
for driver.find_element_by_link_text(">"):
Now, this is what I need to fix somehow. I'm not sure how to check if this thing is still clickable or not. If yes, then proceed to next page, get the links, click this again to go to page 3 and repeat the process.
Any help would be appreciated.
You don't need to use BeautifulSoup here at all. Just grab all the links via selenium. Proceed to next page only if the > link is visible. Here is the complete implementation including gathering the links, necessary waits. It should work for any page count:
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.PhantomJS()
driver.get("http://www.funimation.com/shows/assassination-classroom/videos/episodes")

# Shared explicit wait with a 10 second timeout.
wait = WebDriverWait(driver, 10)

links = []
while True:
    # wait for the page to load
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a.item-title")))

    # wait until the loading circle becomes invisible
    wait.until(EC.invisibility_of_element_located((By.ID, "loadingCircle")))

    # Collect the href of every title link on the current page.
    links.extend([link.get_attribute("href") for link in driver.find_elements_by_css_selector("a.item-title")])

    print("Parsing page number #" + driver.find_element_by_css_selector("a.jp-current").text)

    # click next; stop once the "next" link is no longer displayed
    next_link = driver.find_element_by_css_selector("a.next")
    if not next_link.is_displayed():
        break

    next_link.click()
    time.sleep(1)  # hardcoded delay

print(len(links))
print(links)
For the mentioned in the question URL, it prints:
Parsing page number #1
Parsing page number #2
93
['http://www.funimation.com/shows/assassination-classroom/videos/official/assassination-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/assassination-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/assassination-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/baseball-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/baseball-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/baseball-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/grown-up-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/grown-up-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/grown-up-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/assembly-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/assembly-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/assembly-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/test-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/test-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/test-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/school-trip-time1st-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/school-trip-time1st-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/school-trip-time1st-period', 
'http://www.funimation.com/shows/assassination-classroom/videos/official/school-trip-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/school-trip-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/school-trip-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/transfer-student-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/transfer-student-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/transfer-student-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/l-and-r-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/l-and-r-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/l-and-r-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/transfer-student-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/transfer-student-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/transfer-student-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/ball-game-tournament-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/ball-game-tournament-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/ball-game-tournament-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/talent-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/talent-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/talent-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/vision-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/vision-time', 
'http://www.funimation.com/shows/assassination-classroom/videos/official/vision-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/end-of-term-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/end-of-term-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/end-of-term-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/schools-out1st-term', 'http://www.funimation.com/shows/assassination-classroom/videos/official/schools-out1st-term', 'http://www.funimation.com/shows/assassination-classroom/videos/official/schools-out1st-term', 'http://www.funimation.com/shows/assassination-classroom/videos/official/island-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/island-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/island-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/action-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/action-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/action-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/pandemonium-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/pandemonium-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/pandemonium-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time2nd-period', 'http://www.funimation.com/shows/deadman-wonderland', 'http://www.funimation.com/shows/deadman-wonderland', 'http://www.funimation.com/shows/riddle-story-of-devil', 'http://www.funimation.com/shows/riddle-story-of-devil', 
'http://www.funimation.com/shows/soul-eater', 'http://www.funimation.com/shows/soul-eater', 'http://www.funimation.com/shows/assassination-classroom/videos/official/xx-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/xx-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/xx-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/nagisa-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/nagisa-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/nagisa-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/summer-festival-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/summer-festival-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/summer-festival-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/kaede-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/kaede-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/kaede-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/itona-horibe-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/itona-horibe-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/itona-horibe-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/spinning-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/spinning-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/spinning-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/leader-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/leader-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/leader-time', 
'http://www.funimation.com/shows/deadman-wonderland', 'http://www.funimation.com/shows/deadman-wonderland', 'http://www.funimation.com/shows/riddle-story-of-devil', 'http://www.funimation.com/shows/riddle-story-of-devil', 'http://www.funimation.com/shows/soul-eater', 'http://www.funimation.com/shows/soul-eater']
Basically, I use webelement.is_displayed() to check if it is clickable or not.
isLinkDisplay = driver.find_element_by_link_text(">").is_displayed()

Selenium not .click giving "object not callable" error

I am trying to automate my rental search. I can get the first page of results, but when I try to click the next button I get an error: "object not callable". I am new to Python and just thought this would be a fun project to learn with. Any help would be appreciated.
from selenium import webdriver
from bs4 import BeautifulSoup
import datetime
from datetime import timedelta
import time
import re

# Listing links that have already been printed.
pages = set()


def getLinks(url):
    """Open *url* in Firefox and print every new "/homedetails/" link on it.

    Links are de-duplicated through the module-level ``pages`` set. After
    the links are printed, the function looks up the pagination "next"
    anchor and tries to click it.
    """
    global pages
    # Open web browser and get url - 3 second time delay.
    driver = webdriver.Firefox()
    driver.get(url)
    time.sleep(3)
    pageSource = driver.page_source
    bsObj = BeautifulSoup(pageSource)
    for addr_link in bsObj.findAll("a", href=re.compile("^/homedetails/*")):
        if 'href' in addr_link.attrs:
            if addr_link['href'] not in pages:
                newPage = addr_link.attrs['href']
                pages.add(newPage)
                print(newPage)
    #if bsObj.find('li', {'class': "zsg-pagination-next"}) == True:
    next_page = bsObj.find('li', {'class': "zsg-pagination-next"}).find("a")
    #next_page.click()
    print(next_page)
    # next_page comes from the BeautifulSoup parse, not from the Selenium
    # driver; this is the call that fails with "object not callable".
    next_page.click()


getLinks("http://www.zillow.com/homes/for_rent/Jackson-County-MO/house,mobile_type/1804_rid/6m_days/39.198737,-93.6866,38.873394,-95.026932_rect/9_zm/")
To achieve above task, you don't need beautifulsoup, it can be done with webdriver. Try below code.
# Code to fetch all the link details through WebDriver.
# XPath attributes are addressed with '@', so the predicate tests @href.
addr_link = driver.find_elements_by_xpath("//a[contains(@href,'homedetails')]")
for link in addr_link:
    print(link.get_attribute("href"))

# Code to click on the Next button.
# find_element (singular) returns one element; the plural form returns a
# list, which has no click() method.
next_btn = driver.find_element_by_xpath("//a[text()='Next']")
next_btn.click()

Scrape spotify web interface

I'm trying to get the number of plays for the top songs from a number of artists on Spotify using python and splinter.
If you fill in the username and password below with yours, you should be able to run the code.
from splinter import Browser
import time
from bs4 import BeautifulSoup
# Start a browser session and open the web player.
browser = Browser()
url = 'http://play.spotify.com'
browser.visit(url)
time.sleep(2)
# Click the element with id 'has-account' before filling in credentials.
button = browser.find_by_id('has-account')
button.click()
time.sleep(1)
browser.fill('username', 'your_username')
browser.fill('password', 'your_password')
# Of all <button> elements, the last visible one is used as the login button.
buttons = browser.find_by_css('button')
visible_buttons = [button for button in buttons if button.visible]
login_button = visible_buttons[-1]
login_button.click()
time.sleep(1)
# Open the artist page and wait a fixed 10 seconds for it to render.
browser.visit('https://play.spotify.com/artist/5YGY8feqx7naU7z4HrwZM6')
time.sleep(10)
So far, so good. If you open up Firefox, you can see Miley Cyrus's artist page, including the number of plays for top tracks.
If you open up the Firefox Developer Tools Inspector and hover, you can see the name of the song in .tl-highlight elements, and the number of plays in .tl-listen-count elements. However, I've found it impossible (at least on my machine) to access these elements using splinter. Moreover, when I try to get the source for the entire page, the elements that I can see by hovering my mouse over them in Firefox don't show up in what is ostensibly the page source.
# Dump what splinter reports as the page source to a file for inspection.
html = browser.html
soup = BeautifulSoup(html)
output = soup.prettify()
with open('miley_cyrus_artist_page.html', 'w') as output_f:
    output_f.write(output)
browser.quit()
I don't think I know enough about web programming to know what the issue is here--Firefox sees all the DOM elements clearly, but splinter that is driving Firefox does not.
The key problem is that there is an iframe containing the artist's page with the list of tracks. You need to switch into its context before searching for elements:
# Find the iframe whose id starts with "browse-app-spotify" via the underlying Selenium driver.
frame = browser.driver.find_element_by_css_selector("iframe[id^=browse-app-spotify]")
# Make that iframe the context for subsequent element lookups.
browser.driver.switch_to.frame(frame)
Many thanks to @alecxe, the following code works to pull the information on the artist.
from splinter import Browser
import time
from bs4 import BeautifulSoup
import codecs

# Start a browser session and open the web player.
browser = Browser()
url = 'http://play.spotify.com'
browser.visit(url)
time.sleep(2)

# Click the element with id 'has-account', then submit the credentials.
button = browser.find_by_id('has-account')
button.click()
time.sleep(1)
browser.fill('username', 'your_username')
browser.fill('password', 'your_password')
buttons = browser.find_by_css('button')
# Of all <button> elements, the last visible one is used as the login button.
visible_buttons = [button for button in buttons if button.visible]
login_button = visible_buttons[-1]
login_button.click()
time.sleep(1)

browser.visit('https://play.spotify.com/artist/5YGY8feqx7naU7z4HrwZM6')
time.sleep(30)

# Index of the iframe that holds the artist page (value chosen by the author).
CORRECT_FRAME_INDEX = 6
with browser.get_iframe(CORRECT_FRAME_INDEX) as iframe:
    # Read the HTML from inside the iframe rather than from the top-level page.
    html = iframe.html

soup = BeautifulSoup(html)
output = soup.prettify()
# Write the prettified markup out as UTF-8.
with codecs.open('test.html', 'w', 'utf-8') as output_f:
    output_f.write(output)
browser.quit()

Categories