Using Selenium to interact with authors on Medium.com
I am trying to target a popup element with Selenium that appears when text is double-clicked. Once this popup is detected and clicked, it opens a message box to the right. My end goal is to insert (send_keys) text into that text box, but it is proving quite difficult.
[screenshot: the dynamic popup element; the element on the far right is the button that opens the chat box]
[screenshot: the message text box]
WHAT I HAVE TRIED:
import random
import time

from icecream import ic
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def interact_with_author(message):
    # variables
    target_class = "meteredContent"
    body_css = "body"
    header_xpath = "/html/head"
    second_article_css = "#root > div > div.s.n.t > div.ah.ay > div > div.n.p > div > div:nth-child(2)"  # the second article on the page
    first_par = "#\\39 993"   # CSS id escape (ids starting with a digit)
    second_par = "#\\35 f50"
    first_par_css = "#root > div > div.s > article > div > section > div > div > p"
    first_par_class = "ht hu dj hv b ei hw hx hy el hz ia ib ic id ie if ig ih ii ij ik il im in io db eg"
    wait_time = 5  # seconds to wait when sleep is called with the wait_time variable

    # code (assumes a global `driver` created elsewhere)
    text_box = driver.find_element_by_css_selector('body > div:nth-child(47) > div > div > div > div > div')
    action = ActionChains(driver)
    listing = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.TAG_NAME, "a")))
    articles = driver.find_elements_by_tag_name("a")
    an_article = driver.find_element_by_css_selector(second_article_css)
    an_article.click()
    time.sleep(wait_time)  # todo: change to sleep four seconds after the article has fully loaded
    listing = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.TAG_NAME, "p")))
    try:
        paragraphs = driver.find_elements_by_tag_name('p')
        driver.execute_script("document.body.style.zoom='250%'")
        try:
            first_par = ''
            for i in range(1, len(paragraphs)):
                first_par_commentable = None
                try:
                    first_par_commentable = driver.find_element_by_xpath(f"/html/body/div[1]/div/div[3]/article/div/section/div/div/p[{i}]")
                    driver.execute_script("document.body.style.zoom='200%'")
                except Exception as e:
                    ic(e)
                if first_par_commentable is not None:
                    break
        except Exception as f:
            ic(f)
        try:
            first_par_commentable.click()
            action.double_click(first_par_commentable).perform()
            time.sleep(random.randint(1, 3))
        except Exception as e:
            ic(e)
    except Exception as e:
        ic(e)
If anyone knows how to access this element quickly and in a scalable way, it would be appreciated.
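For comparison, here is a minimal sketch of the whole double-click-then-type flow using explicit waits. The popup and message-box selectors are assumptions (Medium's class names are generated and change between builds), so treat them as placeholders to replace after inspecting the live DOM:

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
driver.get("https://medium.com/@author/some-article")  # hypothetical article URL
wait = WebDriverWait(driver, 10)

# 1. Double-click a paragraph to trigger the popup toolbar.
paragraph = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "article p")))
ActionChains(driver).double_click(paragraph).perform()

# 2. Wait for the popup and click its right-most button (opens the chat box).
popup_button = wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, "div.highlightMenu button:last-child")))  # assumed selector
popup_button.click()

# 3. Wait for the message box on the right and type into it.
#    A contenteditable-based selector is more stable than a deep div chain.
text_box = wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, "div[contenteditable='true']")))  # assumed selector
text_box.send_keys("Hello!")

Because the class names are obfuscated, selectors built on structural attributes (contenteditable, role, aria-label) tend to scale better than nth-child chains like body > div:nth-child(47).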
Related
My process: there is a list of restaurants, and I want to click on each one, do some work, come back, and repeat with the next restaurant in the list using a loop, as you can see in the image below.
I followed this link but didn't get any clarification.
CODE
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(executable_path='../chromedriver.exe')
driver.get("https://www.zomato.com/kolkata/dine-out?dishv2_id=76487754bd59c594cd5218d3427e68e0_2&rating_range=4.0-5.0")

screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the web page
i = 1
count = 0
scroll_pause_time = 1
while True:
    # scroll one screen height each time
    driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
    i += 1
    time.sleep(scroll_pause_time)
    # update the scroll height after each scroll, as it can change while the page loads more content
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    # break the loop when the height we need to scroll to is larger than the total scroll height
    if screen_height * i > scroll_height:
        break

driver.execute_script("window.scrollTo(0, 0);")
res_list = []
titles = driver.find_elements_by_xpath('//a[@class="sc-dBAPYN kcrxQo"]')
# this is the block of code that is error prone
for i in titles:
    time.sleep(5)
    driver.execute_script("window.scrollTo(0, 0);")
    element = i.find_element_by_xpath('./div/h4')
    driver.execute_script("arguments[0].click();", element)
    time.sleep(3)
    name_of_rests = driver.find_element_by_css_selector('#root > div > main > div > section.sc-kxynE.jzTfFZ > section > section > div > div > div > h1').text
    res_list.append(name_of_rests)
    driver.back()
print(res_list)
driver.close()
You have to locate
driver.find_elements_by_xpath('//a[@class="sc-dBAPYN kcrxQo"]')
again inside the loop: after driver.back() the page is rebuilt, so the element references you collected earlier go stale.
Code:
res_list = []  # carried over from the question
number_of_titles = len(driver.find_elements_by_xpath('//a[@class="sc-dBAPYN kcrxQo"]'))
# this is the block of code that was error prone
for j in range(number_of_titles):
    time.sleep(5)
    # re-locate the title links on every iteration, since driver.back() invalidates them
    titles = driver.find_elements_by_xpath('//a[@class="sc-dBAPYN kcrxQo"]')
    driver.execute_script("window.scrollTo(0, 0);")
    element = titles[j].find_element_by_xpath('.//div//h4')
    driver.execute_script("arguments[0].click();", element)
    time.sleep(3)
    name_of_rests = driver.find_element_by_css_selector('#root > div > main > div > section.sc-kxynE.jzTfFZ > section > section > div > div > div > h1').text
    res_list.append(name_of_rests)
    driver.back()
I tried to follow along with some YouTube tutorials to make my code do what I want, but I still haven't found an answer anywhere on the internet...
Here I tried to make the script using BeautifulSoup:
import bs4
import requests

result = requests.get("https://www.instagram.com/kyliejenner/following/")
src = result.content
soup = bs4.BeautifulSoup(src, "lxml")

links = soup.find_all("a")
print(links)
print("\n")

for link in links:
    # note: this checks the link's visible text, but class names live in link.get("class");
    # also, Instagram renders the following list with JavaScript after login,
    # so requests only ever sees the unauthenticated page shell
    if "FPmhX notranslate _0imsa " in link.text:
        print(link)
And here I tried to do the same thing with Selenium, but the problem is that I don't know the next steps to make my code copy the usernames a user is following:
import time

from selenium import webdriver

PATH = r"C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)

driver.get("https://www.instagram.com/")
time.sleep(2)

username = driver.find_element_by_css_selector("#loginForm > div > div:nth-child(1) > div > label > input")
username.send_keys("my_username")
password = driver.find_element_by_css_selector("#loginForm > div > div:nth-child(2) > div > label > input")
password.send_keys("password")

loginButton = driver.find_element_by_css_selector("#loginForm > div > div:nth-child(3)")
loginButton.click()
time.sleep(3)

saveinfoButton = driver.find_element_by_css_selector("#react-root > section > main > div > div > div > section > div > button")
saveinfoButton.click()
time.sleep(3)

notnowButton = driver.find_element_by_css_selector("body > div.RnEpo.Yx5HN > div > div > div > div.mt3GC > button.aOOlW.HoLwm")
notnowButton.click()
I would really appreciate it if someone could solve this problem. Again, all I want my script to do is copy the usernames from the "following" section of someone's profile.
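A hedged sketch of the missing steps, continuing from the logged-in state above. All selectors here are assumptions (Instagram's class names are obfuscated and change often), so treat this as a pattern to adapt, not a working recipe: open the profile, click the "following" link, then repeatedly scroll the dialog and collect the anchor texts.

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

wait = WebDriverWait(driver, 10)
driver.get("https://www.instagram.com/kyliejenner/")

# open the "following" dialog; an href-based selector avoids obfuscated classes
following_link = wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, "a[href*='/following']")))  # assumed selector
following_link.click()

dialog = wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, "div[role='dialog']")))  # assumed selector

usernames = set()
for _ in range(20):  # scroll the dialog a fixed number of times; tune as needed
    for link in dialog.find_elements_by_css_selector("a"):
        name = link.text
        if name:
            usernames.add(name)
    # scroll the dialog to load more entries; the actual scrollable node
    # may be an inner container, so inspect the DOM if nothing new loads
    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", dialog)
    time.sleep(1)

print(usernames)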
I am scraping Banggood. The problem is that the driver opens just the first link and then doesn't go to the next link in the links list (the next product),
and I get this error at line 24:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
But when I tried to print the links outside the loop, I got all of them:
print( links[0].get_attribute('href') )
print( links[2].get_attribute('href') )
Main code:
import time
from selenium import webdriver  # THIS IS THE MAIN SCRIPT

driver = webdriver.Chrome(executable_path='C:\\Users\\Compu City\\Desktop\\chromedriver.exe')  # driver location
driver.get('https://usa.banggood.com/Deals_Electronics.html#dealscategories2')
driver.implicitly_wait(30)

links = driver.find_elements_by_css_selector('body > div.flashdeals-container.fixed > div.main > div.product-list.cf > ul > li > a.products_name.exclick')
# links has 25 links
product = 0
while product <= len(links):
    driver.get(links[product].get_attribute('href'))
    try:  # TITLE
        title = driver.find_element_by_css_selector('#centerCtrl > div.title_hd > h2 > strong')
        print(title.text)
    except:
        print('no title')
    try:  # NEW PRICE
        new_price = driver.find_element_by_css_selector('#centerCtrl > div.itemBox > div.item_price_box > div.item_now_price')
        print(new_price.text)
    except:
        print('no new price')
    try:  # OLD PRICE
        old_price = driver.find_element_by_css_selector('#centerCtrl > div.itemBox > div.item_price_box > div.item_old_price')
        print(old_price.text)
    except:
        print('no old price')
    try:  # IMAGE
        image = driver.find_element_by_css_selector('#landingImage').get_attribute('src')
        print(image)
    except:
        print('no image')
    product += 1
Try this: once you navigate away from the page, the elements in links go stale, so extract all the hrefs into a plain list first and iterate over that:
v = []
for x in links:
    v.append(x.get_attribute('href'))
print(len(v))

product = 0
while product < len(v):  # use <, not <=, so we don't index past the end of the list
    driver.get(v[product])
    product += 1
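To make the fix concrete end to end, the question's per-product scraping can be folded into that loop; a sketch reusing the question's selectors:

hrefs = [link.get_attribute('href') for link in links]  # plain strings never go stale

for href in hrefs:
    driver.get(href)
    try:  # TITLE (same selector as in the question)
        title = driver.find_element_by_css_selector('#centerCtrl > div.title_hd > h2 > strong')
        print(title.text)
    except Exception:
        print('no title')
    # ...repeat the same try/except pattern for the price and image selectors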
I'm trying to scrape Bet365 for its live soccer odds. I'm iterating over a list of the live matches available. For each match I need to click on it, and I'm directed to new content with all the detailed odds info. From there it crashes when I go back to continue my iteration.
It throws this error:
Traceback (most recent call last):
File "/Users/christian/Google Drev/Data Science/Bet365/main.py", line 32, in <module>
getScoreH = game.find_element_by_css_selector(scoreH).text
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/selenium/webdriver/remote/webelement.py", line 430, in find_element_by_css_selector
return self.find_element(by=By.CSS_SELECTOR, value=css_selector)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/selenium/webdriver/remote/webelement.py", line 654, in find_element
{"using": by, "value": value})['value']
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/selenium/webdriver/remote/webelement.py", line 628, in _execute
return self._parent.execute(command, params)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 320, in execute
self.error_handler.check_response(response)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=69.0.3497.100)
(Driver info: chromedriver=2.42.591059 (a3d9684d10d61aa0c45f6723b327283be1ebaad8),platform=Mac OS X 10.14.0 x86_64)
The error comes from the last lines of code in my main.py:
# HERE IT BREAKS!:
# Redirects to a game's detailed odds page
game.find_element_by_css_selector(oddsBtn).click()
time.sleep(5)
# Go back and click the next game's details
obj.find_element(overview).click()
time.sleep(5)
Below is my program. As I said, the problem comes from the last few lines of main.py, where I need to go back and resume the iteration. It seems like it doesn't remember where I left off.
cls_scraper.py:
"""
Class to find element(s) by css selector
"""
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import platform
import time
import os
class Scraper():
def __init__(self, driver):
self.driver = driver
def wait(self, element):
return WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, element)))
def element_exist_css(self, element):
try:
self.driver.find_element_by_css_selector(element)
except NoSuchElementException:
print('Element doesnt exist')
return False
return True
def element_css(self, element):
try:
time.sleep(2)
return WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable(
(By.CSS_SELECTOR, element)))
except StaleElementReferenceException:
print("XB: StaleElementReferenceException")
except WebDriverException:
print("XB: WebDriverException")
def find_elements(self, element):
time.sleep(2)
return self.driver.find_elements_by_css_selector(element)
def find_element(self, element):
time.sleep(2)
return self.driver.find_element_by_css_selector(element)
str_elements.py:
"""
String library to have relevant css selector elements in one place.
"""
""" BET 365 - Overview-page """
# Enter the page
enterPage = '#TopPromotionMainArea'
# Page with live odds
inPlay = 'body > div:nth-child(1) > div > div:nth-child(1) > div > div.hm-HeaderModule_Primary > div.hm-BigButtons > nav > a:nth-child(2)'
# Element containing relevant games and info about time, score etc.; see below.
games = 'div.ipo-FixtureRenderer.ipo-Competition_Container > div'
# For each game in games, these elements can be found:
teamH = 'div.ipo-TeamStack > div:nth-child(1)'
teamA = 'div.ipo-TeamStack > div:nth-child(2)'
scoreH = 'div.ipo-TeamPoints_TeamScore.ipo-TeamPoints_TeamScore-teamone'
scoreA = 'div.ipo-TeamPoints_TeamScore.ipo-TeamPoints_TeamScore-teamtwo'
gameTime = 'div.ipo-InPlayTimer'
# The redirection needed to get all kinds of odds for a match;
# the main overview page only shows a part of them.
oddsBtn = 'div.ipo-FixtureEventCountButton_EventCountWrapper'
# The overview tab to see all the live games
overview = 'div.ip-ControlBar > span.ip-ControlBar_ButtonBar > div:nth-child(1)'
# Choose english language
langTab = 'body > div:nth-child(1) > div > div:nth-child(1) > div > div.hm-HeaderModule_Secondary > div.hm-HeaderModule_Menus > div.hm-LanguageDropDownSelections.hm-DropDownSelections > a'
pickEng = 'body > div:nth-child(1) > div > div:nth-child(1) > div > div.hm-HeaderModule_Secondary > div.hm-HeaderModule_Menus > div.hm-LanguageDropDownSelections.hm-DropDownSelections.hm-DropDownSelections_Selected > div > div > a:nth-child(1)'
# Get a better overview
allMarkets = 'body > div:nth-child(1) > div > div.wc-PageView > div.wc-PageView_Main.wc-InPlayPage_MainContainer > div > div > div.ipo-OverViewView > div > div > div > div.ipo-OverViewDetail > div.ipo-OverViewDetail_Container.ipo-Classification > div.ipo-ClassificationHeader_Header.ipo-ClassificationHeader_Header-1.ipo-ClassificationHeader_Header-lightgreenborder.ipo-ClassificationHeader_Header-moremarkets > div.ipo-ClassificationHeader_MarketsButtonOuterWrapper > div > div.ipo-ClassificationHeader_MarketsButton.ipo-ClassificationHeader_MarketsButton-transparent'
""" BET 365 - Odds-page """
# Collect all the odds from the redirection page.
main.py:
""" Run program from here """
from str_elements import *
from cls_scraper import *
from browser.path import *
import time
if __name__ == '__main__':
print("Welcome \n")
# Open website
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(driver_path, chrome_options=options)
driver.get('https://www.bet365.dk/#/HO/')
""" Click relevant elements """
print("Bet365: Pressing buttons ...")
obj = Scraper(driver)
obj.element_css(enterPage).click() # Enters the bet365 main page
obj.element_css(inPlay).click() # Presses the in-play tab
obj.element_css(langTab).click() # Choose languages
obj.element_css(pickEng).click() # Choose english
obj.element_css(overview).click() # Shows all live games
obj.element_css(allMarkets).click() # Better overview
print("Bet365: Collecting game data ...")
# All live games
liveContainer = obj.find_elements(games) # Contains a list of running games
for game in liveContainer:
getScoreH = game.find_element_by_css_selector(scoreH).text
getScoreA = game.find_element_by_css_selector(scoreA).text
getTeamH = game.find_element_by_css_selector(teamH).text
getTeamA = game.find_element_by_css_selector(teamA).text
getTime = game.find_element_by_css_selector(gameTime).text
print("Score: ", getScoreH, "-", getScoreA)
print("GameTime:", getTime)
print("HomeTeam:", getTeamH)
print("AwayTeam:", getTeamA)
print("")
## HERE IT BREAKS!:
## Redirects to a games detailed odds page
# game.find_element_by_css_selector(oddsBtn).click()
# time.sleep(5)
## Go back and keep choose the click the next games details.
# obj.find_element(overview).click()
# time.sleep(5)
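One way around the crash, sketched along the same re-locate idea as the Zomato answer earlier on this page: count the games once, then re-find the list on every iteration instead of holding element references across navigations (all selectors come from str_elements.py above):

num_games = len(obj.find_elements(games))
for idx in range(num_games):
    # re-locate the live games after every navigation; old references go stale
    liveContainer = obj.find_elements(games)
    if idx >= len(liveContainer):
        break  # the live list shrank while we were on the odds page
    game = liveContainer[idx]

    # open the detailed odds page, scrape what you need, then return
    game.find_element_by_css_selector(oddsBtn).click()
    time.sleep(5)
    obj.find_element(overview).click()
    time.sleep(5)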
Why is it that when I add time.sleep(2) I get my desired output, but when I instead wait for a specific XPath I get fewer results?
Output with time.sleep(2) (also desired):
Adelaide Utd
Tottenham
Dundee Fc
...
Count: 145 names
Output with time.sleep removed:
Adelaide Utd
Tottenham
Dundee Fc
...
Count: 119 names
I have added:
clickMe = wait(driver, 13).until(EC.element_to_be_clickable((By.CSS_SELECTOR, ("#page-container > div:nth-child(4) > div > div.ubet-sports-section-page > div > div:nth-child(2) > div > div > div:nth-child(1) > div > div > div.page-title-new > h1"))))
as this element is present on all pages.
The count is significantly lower. How can I get around this issue?
Script:
import csv
import os
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait as wait

driver = webdriver.Chrome()
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get('https://ubet.com/sports/soccer')

clickMe = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//select[./option="Soccer"]/option')))
options = driver.find_elements_by_xpath('//select[./option="Soccer"]/option')
indexes = [index for index in range(len(options))]
for index in indexes:
    try:
        try:
            zz = wait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '(//select/optgroup/option)[%s]' % str(index + 1))))
            zz.click()
        except StaleElementReferenceException:
            pass
        clickMe = wait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#page-container > div:nth-child(4) > div > div.ubet-sports-section-page > div > div:nth-child(2) > div > div > div:nth-child(1) > div > div > div.page-title-new > h1")))
        langs0 = driver.find_elements_by_css_selector(
            "div > div > div > div > div > div > div > div > div.row.collapse > div > div > div:nth-child(2) > div > div > div > div > div > div.row.small-collapse.medium-collapse > div:nth-child(1) > div > div > div > div.lbl-offer > span")
        langs0_text = []
        for lang in langs0:
            try:
                langs0_text.append(lang.text)
            except StaleElementReferenceException:
                pass
        directory = 'C:\\A.csv'
        with open(directory, 'a', newline='', encoding="utf-8") as outfile:
            writer = csv.writer(outfile)
            for row in zip(langs0_text):
                writer.writerow(row)
    except StaleElementReferenceException:
        pass
If you cannot access the page, you may need a VPN.
Update:
Perhaps that element loads before the others, so I changed the wait target to the scraped data itself (though not all pages have data to be scraped).
Add:
try:
    clickMe = wait(driver, 13).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div > div > div > div > div > div > div > div > div.row.collapse > div > div > div:nth-child(2) > div > div > div > div > div > div.row.small-collapse.medium-collapse > div:nth-child(3) > div > div > div > div.lbl-offer > span")))
except TimeoutException as ex:
    pass
The same issue is still present.
Manual steps:
1. Load driver.get('https://ubet.com/sports/soccer')
2. Click the drop-down ((//select/optgroup/option))
3. Wait for the page elements so they can be scraped
4. Scrape:
div > div > div > div > div > div > div > div > div.row.collapse > div > div > div:nth-child(2) > div > div > div > div > div > div.row.small-collapse.medium-collapse > div:nth-child(1) > div > div > div > div.lbl-offer > span
5. Loop and repeat.
The website is built on AngularJS, so your best bet would be to wait until Angular has finished processing all pending AJAX requests (I won't go into the underlying mechanics, but there are plenty of materials on that topic throughout the web). For this, I usually define a custom expected condition to check while waiting:
class NgReady:
    js = ('return (window.angular !== undefined) && '
          '(angular.element(document).injector() !== undefined) && '
          '(angular.element(document).injector().get("$http").pendingRequests.length === 0)')

    def __call__(self, driver):
        return driver.execute_script(self.js)


# NgReady does not have any internal state, so one instance
# can be reused for waiting multiple times
ng_ready = NgReady()
Now use it to wait after zz.click():
zz.click()
wait(driver, 10).until(ng_ready)
Tests
Your original code, unmodified (without sleeping or waiting with ng_ready):
$ python so-47954604.py && wc -l out.csv && rm out.csv
86 out.csv
Using time.sleep(10) after zz.click():
$ python so-47954604.py && wc -l out.csv && rm out.csv
101 out.csv
Same result when using wait(driver, 10).until(ng_ready) after zz.click():
$ python so-47954604.py && wc -l out.csv && rm out.csv
101 out.csv
Credits
NgReady is not my invention; I just ported it to Python from the expected condition implemented in Java that I found here, so all credits go to the author of that answer.
@hoefling's idea is absolutely the correct one, but here is an addition to the "wait for Angular" part.
The logic inside NgReady only checks that angular is defined and that no pending requests are left to be processed. Even though it works for this website, it's not a definitive answer to the question of whether Angular is ready to work with.
If we look at what Protractor, the Angular end-to-end testing framework, does to "sync" with Angular, it uses the "Testability" API built into Angular.
There is also the pytractor package, which extends selenium webdriver instances with a WebDriverMixin that keeps the driver and angular in sync automatically on every interaction.
You can either use pytractor directly (though it has been abandoned as a package), or we can apply the ideas implemented there to keep our webdriver synced with Angular. For that, let's create this waitForAngular.js script (we use the Angular 1 and 2 support logic only; it can always be extended with the relevant Protractor client-side script):
try { return (function (rootSelector, callback) {
  var el = document.querySelector(rootSelector);
  try {
    if (!window.angular) {
      throw new Error('angular could not be found on the window');
    }
    if (angular.getTestability) {
      angular.getTestability(el).whenStable(callback);
    } else {
      if (!angular.element(el).injector()) {
        throw new Error('root element (' + rootSelector + ') has no injector.' +
          ' this may mean it is not inside ng-app.');
      }
      angular.element(el).injector().get('$browser').
        notifyWhenNoOutstandingRequests(callback);
    }
  } catch (err) {
    callback(err.message);
  }
}).apply(this, arguments); }
catch(e) { throw (e instanceof Error) ? e : new Error(e); }
Then, let's inherit from webdriver.Chrome and patch the execute() method, so that every time there is an interaction, we additionally check whether Angular is ready before performing it:
import csv
from selenium import webdriver
from selenium.webdriver.remote.command import Command
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
COMMANDS_NEEDING_WAIT = [
    Command.CLICK_ELEMENT,
    Command.SEND_KEYS_TO_ELEMENT,
    Command.GET_ELEMENT_TAG_NAME,
    Command.GET_ELEMENT_VALUE_OF_CSS_PROPERTY,
    Command.GET_ELEMENT_ATTRIBUTE,
    Command.GET_ELEMENT_TEXT,
    Command.GET_ELEMENT_SIZE,
    Command.GET_ELEMENT_LOCATION,
    Command.IS_ELEMENT_ENABLED,
    Command.IS_ELEMENT_SELECTED,
    Command.IS_ELEMENT_DISPLAYED,
    Command.SUBMIT_ELEMENT,
    Command.CLEAR_ELEMENT
]


class ChromeWithAngular(webdriver.Chrome):
    def __init__(self, root_element, *args, **kwargs):
        self.root_element = root_element
        with open("waitForAngular.js") as f:
            self.script = f.read()
        super(ChromeWithAngular, self).__init__(*args, **kwargs)

    def wait_for_angular(self):
        self.execute_async_script(self.script, self.root_element)

    def execute(self, driver_command, params=None):
        if driver_command in COMMANDS_NEEDING_WAIT:
            self.wait_for_angular()
        return super(ChromeWithAngular, self).execute(driver_command, params=params)


driver = ChromeWithAngular(root_element='body')
# the rest of the code as is with what you had
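For illustration, a short usage sketch under the assumptions above (waitForAngular.js sits next to the script; the XPath is the one from the question):

# continuing with the `driver` created above
driver.get('https://ubet.com/sports/soccer')

# every command listed in COMMANDS_NEEDING_WAIT now blocks until Angular
# reports no outstanding requests, so the manual sleeps become unnecessary
option = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '(//select/optgroup/option)[1]')))
option.click()  # wait_for_angular() runs automatically before the click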
Again, this is heavily inspired by the pytractor and protractor projects.