I've been using Selenium's ChromeDriver to parse some information. I found the elements I need by XPath, saved them into a list, and then wrote a for loop to click on each element. It worked perfectly when I had a small number of elements (fewer than 30), but now I'm facing a problem: the length of my list is shorter than the actual number of elements on the webpage (e.g. 30 out of 112, or 45 out of 210), even though all of these elements have the same XPath I located initially.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

driver = webdriver.Chrome()

# login function
def login(email, password):
    driver.get('https://crm.tender-win.ru/account/logon')
    driver.find_element_by_id('email').send_keys(email)
    driver.find_element_by_id('password').send_keys(password)
    driver.find_element_by_id('btnLogin').click()

df = []
login('somelogin', 'somepassword')
driver.get('https://crm.tender-win.ru/tenders/new')
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'MyTenderMenuTabsInfo')))

# cards have class 'panel card ' or 'panel card new'
tenders = [driver.find_elements_by_xpath("//div[@class='panel card ']")
           or driver.find_elements_by_xpath("//div[@class='panel card new']")][0]

for tender in tenders:
    tender.click()
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'tenderCurrencyCode')))
    data = driver \
        .find_element_by_xpath('//*[@id="infoScroll"]/div/div[3]/div[2]/table') \
        .get_attribute('outerHTML')
    df1 = pd.read_html(data)
    df.append(df1)

print(df)
P.S. I solved my issue by turning the for loop into a function and calling it until my list of elements is empty.
def clicker(tenders=None):
    for tender in tenders:
        tender.click()
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'tenderCurrencyCode')))
        data = driver \
            .find_element_by_xpath('//*[@id="infoScroll"]/div/div[3]/div[2]/table') \
            .get_attribute('outerHTML')
        df1 = pd.read_html(data)
        df.append(df1)
    print(df)

def scrape():
    tenders = [driver.find_elements_by_xpath("//div[@class='panel card ']")
               or driver.find_elements_by_xpath("//div[@class='panel card new']")][0]
    while len(tenders) > 0:
        clicker(tenders)
        tenders = [driver.find_elements_by_xpath("//div[@class='panel card ']")
                   or driver.find_elements_by_xpath("//div[@class='panel card new']")][0]
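A likely reason the list came up short is that the page lazy-loads more cards as you scroll, so find_elements only sees what has rendered so far. Below is a minimal, untested sketch of an alternative: scroll and re-query until the element count stops growing. The scrolling behavior is an assumption, and wait_for_stable_count is a hypothetical helper (the starts-with XPath covers both card classes from the question):

import time

def wait_for_stable_count(driver, xpath, timeout=15, poll=1.0):
    # Scroll to the bottom and re-query until the number of matches stops growing.
    last = -1
    deadline = time.time() + timeout
    while time.time() < deadline:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        elems = driver.find_elements_by_xpath(xpath)
        if len(elems) == last:
            return elems  # count has stabilized
        last = len(elems)
        time.sleep(poll)
    return driver.find_elements_by_xpath(xpath)

tenders = wait_for_stable_count(driver, "//div[starts-with(@class, 'panel card')]")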
In particular I am trying to scrape this table (https://whalewisdom.com/filer/berkshire-hathaway-inc#tabholdings_tab_link), but I would like to scrape only the first 50 rows via Python code.
For this reason I need to set the page-size option so that the first 50 rows per page are shown.
My current code is:
test = {}
dict_scr = {}
for ii in range(0, 12):
    options = webdriver.FirefoxOptions()
    options.binary_location = r'C:/Users/Mozilla Firefox/firefox.exe'
    driver = selenium.webdriver.Firefox(executable_path='C:/Users/geckodriver.exe', options=options)
    driver.execute("get", {'url': link_scr['Links'][ii]})
    Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[text()='50']"))))
    test[link_scr.index[ii]] = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "table#current_holdings_table"))).get_attribute("outerHTML")
    dict_scr[link_scr.index[ii]] = pd.read_html(test[link_scr.index[ii]])
    print(test[link_scr.index[ii]])
How can I modify this code in order to scrape the first 50 rows into a dataframe?
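For reference, here is a minimal Selenium-only sketch of one way to switch the page size and read the first 50 rows. The page-size control does not appear to be a native <select>, so Select is dropped in favor of direct clicks; the '25'/'50' locators are assumptions based on the code above and the answer below, and the table selector is taken from the question:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

wait = WebDriverWait(driver, 20)
# Open the rows-per-page control (assumed to show "25" initially) and pick "50".
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[text()='25']"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "//a[text()='50']"))).click()
# Read the re-rendered table and keep the first 50 rows.
html = wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, "table#current_holdings_table"))).get_attribute("outerHTML")
df = pd.read_html(html)[0].head(50)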
I wrote two samples; you can refer to GitHub.
Sample 1:
from time import sleep
from clicknium import clicknium as cc, locator

tab = cc.chrome.open("https://whalewisdom.com/filer/berkshire-hathaway-inc#tabholdings_tab_link")
tab.find_element(locator.chrome.whalewisdom.button_25).click()
tab.find_element(locator.chrome.whalewisdom.a_50).click()
sleep(3)  # wait for the table to load
elems_sector = tab.find_elements(locator.chrome.whalewisdom.td_informationtechnology)
elems_shares = tab.find_elements(locator.chrome.whalewisdom.td_890923410)
count = len(elems_sector)
for idx in range(count):
    sector = elems_sector[idx].get_text()
    shares = elems_shares[idx].get_text()
    print({'sector': sector, 'shares': shares})
Sample 2: don't change the page size; scrape two pages of data instead:
from time import sleep
from clicknium import clicknium as cc, locator

tab = cc.chrome.open("https://whalewisdom.com/filer/berkshire-hathaway-inc#tabholdings_tab_link")
i = 0
while True:
    elems_sector = tab.find_elements(locator.chrome.whalewisdom.td_informationtechnology)
    elems_shares = tab.find_elements(locator.chrome.whalewisdom.td_890923410)
    count = len(elems_sector)
    for idx in range(count):
        sector = elems_sector[idx].get_text()
        shares = elems_shares[idx].get_text()
        print({'sector': sector, 'shares': shares})
    i += 1
    if i > 1:
        break
    tab.find_element(locator.chrome.whalewisdom.a).click()  # go to the next page
    sleep(2)  # wait for the table to load
I'm fairly new to Selenium and I've been running a couple of very small web scraping projects.
When I try to click on this element through the .click() function I keep getting "Element not interactable".
The HTML section I'm trying to interact with is this:
<a class="hawk-iconBefore hawk-styleCheckbox hawk-styleList" data-options="{&quot;name&quot;:&quot;finish&quot;,&quot;value&quot;:&quot;Foil&quot;}" href="https://starcitygames.com/search/?card_name=Glimmervoid&finish=Foil" rel="nofollow"><span class="hawk-selectionInner">Foil <span class="hawk-facetCount">(5)</span></span></a>
And my python code looks like this:
from selenium import webdriver
from selenium.webdriver.common.by import By
url = 'https://starcitygames.com/'
card_name = 'Fatal Push'
expansion_name = 'Double Masters'
foil = True
card_price = 0
browser_options = webdriver.ChromeOptions()
browser_options.add_argument("headless")
browser = webdriver.Chrome(options=browser_options)
browser.get(url)
browser.implicitly_wait(0.2)
browser.maximize_window()
print(card_name)
def get_card_price():
global card_price
print("Finding card...")
browser.find_element(By.CSS_SELECTOR, "[name='search_query']").send_keys(card_name)
search_button = browser.find_element(By.CLASS_NAME, "search-submit")
search_button.click()
if foil:
print("Checking if Foil...")
foil_select = browser.find_element(By.XPATH, "/html/body/div/div[1]/main/aside/div[2]/div[2]/div/div[5]/div/ul/li[1]/a")
try:
foil_select.click()
print("It's Foil")
except:
print("Element not interactable")
cards = browser.find_elements(By.CLASS_NAME,"hawk-results-item")
for card in cards:
c = card.text
price = card.find_element(By.CSS_SELECTOR, "div[class='hawk-results-item__options-table-cell hawk-results-item__options-table-cell--price childAttributes']")
if expansion_name in c:
card_price = price.text
return card_price
get_card_price()
print("Fetching card price...")
print(card_price)
browser.quit()
Every other part sends the info I need, but when the condition foil is true it jumps to the exception because the element is not interactable.
I have tried accessing it with css_selector and with the regular XPath. I saw another answer suggesting that using the full XPath fixed the issue, but it didn't work for me.
What could I do?
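For reference, the usual remedies for "element not interactable" are to wait for the element, scroll it into view, and click it via JavaScript. A minimal, untested sketch (the XPath is the one used in the self-answer below):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait for the facet link, bring it into view, then click it with JavaScript,
# which ignores overlay/visibility issues a normal .click() trips over.
foil_select = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="hawkfacet_finish"]/li[1]/a')))
browser.execute_script("arguments[0].scrollIntoView(true);", foil_select)
browser.execute_script("arguments[0].click();", foil_select)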
So I figured out how to fetch the href for the element I wanted, and it was as simple as getting that and then telling my code to go to that page and execute the rest of the code.
That's how it looks now:
if foil:
    print("Checking if Foil...")
    try:
        foil_select = browser.find_element(By.XPATH, '//*[@id="hawkfacet_finish"]/li[1]/a')
        link = foil_select.get_attribute("href")
        print("It's Foil")
        browser.get(link)
    except:
        print("Element not interactable")
else:
    foil_select = browser.find_element(By.XPATH, '//*[@id="hawkfacet_finish"]/li[2]/a')
    link = foil_select.get_attribute("href")
    print("It's not foil")
    browser.get(link)
Now to move on with the next step. Thanks everyone!
This
browser_options.add_argument("headless")
should be
browser_options.add_argument("--headless")
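For context, a minimal sketch of the corrected setup (as an aside, recent Chrome builds also accept the newer "--headless=new" form):

browser_options = webdriver.ChromeOptions()
browser_options.add_argument("--headless")  # explicit switch form; "--headless=new" on recent Chrome
browser = webdriver.Chrome(options=browser_options)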
You need to scroll to each card first before grabbing the price.
Below is the sample code:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.maximize_window()
wait = WebDriverWait(driver, 20)

url = 'https://starcitygames.com/'
card_name = 'Fatal Push'
expansion_name = 'Double Masters'
foil = True
card_price = 0
# browser_options = webdriver.ChromeOptions()
# browser_options.add_argument("headless")
# browser = webdriver.Chrome(options=browser_options)
driver.get(url)
driver.implicitly_wait(0.2)
print(card_name)

def get_card_price():
    global card_price
    print("Finding card...")
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[name='search_query']"))).send_keys(card_name)
    search_button = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "search-submit")))
    search_button.click()
    if foil:
        print("Checking if Foil...")
        foil_select = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "ul#hawkfacet_rarity li a[data-options*='Rare']")))
        try:
            foil_select.click()
            print("It's Foil")
        except:
            print("Element not interactable")
    time.sleep(5)
    cards = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='hawk-results-item']")))
    for card in cards:
        # scroll each card into view before reading its text and price
        driver.execute_script("arguments[0].scrollIntoView(true);", card)
        c = card.get_attribute('innerText')
        print(c)
        price = card.find_element(By.XPATH, ".//descendant::div[contains(@class, 'price childAttributes')]")
        print(price.text)
        if expansion_name in c:
            card_price = price.text
    return card_price

get_card_price()
print("Fetching card price...")
print(card_price)
Output:
Fatal Push
Finding card...
Checking if Foil...
It's Foil
Fatal Push (Borderless)
Double Masters - Variants
Near Mint -
English
$14.99
QTY: 0
NOTIFY ME
$14.99
Fatal Push (Borderless)
Double Masters - Variants (Foil)
Near Mint -
English
$14.99
QTY: 3
Add to cart
$14.99
Fetching card price...
$14.99
Process finished with exit code 0
I want to navigate through all the continents/countries here and collect the tables into a pandas data frame, but sometimes the process clicks on the same link a couple of times before continuing to the next. This is my current implementation:
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
DRIVER_PATH = '/path/to/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=chrome_options)
driver.get('https://www.ertms.net/deployment-world-map/')

continents = driver.find_element(by='id', value='panel')
continent_names = continents.text.split()

# navigating through continent links
for i, cont in enumerate(continent_names):
    cont_buttons = driver.find_elements_by_class_name('accordion')
    continent_element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(cont_buttons[i + 1]))
    time.sleep(0.5)
    ActionChains(driver).move_to_element(continent_element).click().perform()
    time.sleep(3)
    child_buttons = driver.find_elements_by_class_name('accordion')
    # going through country links for each continent. Here is where the same link is sometimes clicked twice
    for j, country in enumerate(child_buttons):
        time.sleep(3)
        child_buttons = driver.find_elements_by_class_name('accordion')
        country_element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(child_buttons[j]))
        time.sleep(0.5)
        ActionChains(driver).move_to_element(country_element).click().perform()
        # going back to page with list of countries for current continent
        back_button = driver.find_element_by_class_name('go-back')
        driver.execute_script("arguments[0].click();", back_button)
        time.sleep(3)
    # going back to list of continents
    back_button = driver.find_element_by_class_name('go-back')
    driver.execute_script("arguments[0].click();", back_button)
    time.sleep(3)
I navigate around using EC.element_to_be_clickable and a combination of the By.LINK_TEXT and find_elements_by_class_name methods. Any advice on best practices would be appreciated.
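One pattern that may help with the double clicks (a sketch under assumptions, not a tested fix): capture the country names up front, then re-locate the matching button by its text on every pass instead of trusting a positional index into a re-rendered list, keeping the JavaScript click already used for the back button. click_by_text is a hypothetical helper; the 'accordion' class name is taken from the code above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_by_text(driver, text, timeout=10):
    # Re-locate the accordion buttons on every call and match on the label,
    # so a re-rendered list cannot shift the target under a stale index.
    for button in driver.find_elements_by_class_name('accordion'):
        if button.text.strip() == text:
            WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(button))
            driver.execute_script("arguments[0].click();", button)
            return
    raise LookupError("no accordion entry labelled " + repr(text))

country_names = [b.text.strip() for b in driver.find_elements_by_class_name('accordion')]
for name in country_names:
    click_by_text(driver, name)
    # ... scrape the table, then click the 'go-back' button as in the code above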
I'm trying to select the sport 'Football' in a sports drop-down, but it's impossible to click on it.
I tried with the Select() method:
driver = webdriver.Chrome()
url = "https://www.flashscore.com/"
driver.get(url)
Team = 'Paris SG'
Type = 'Teams'
sport = 'Football'
buttonSearch = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".header__button--search"))).click()
fill_search_bar = driver.find_element(By.CSS_SELECTOR, ".input___1NGxU3-")
fill_search_bar.clear()
fill_search_bar.send_keys(Team)
driver.find_element(By.CSS_SELECTOR, ".dropDown").click()
select_sport = Select(driver.find_element(By.XPATH, "//div[contains(@class, 'dropDown__list')]"))
select_sport.select_by_visible_text(sport)
This code returns this error: UnexpectedTagNameException: Message: Select only works on <select> elements, not on <div>.
Here is my second version:
fill_search_bar = driver.find_element(By.CSS_SELECTOR, ".input___1NGxU3-")
fill_search_bar.clear()
fill_search_bar.send_keys(Team)
driver.find_element(By.CSS_SELECTOR, ".dropDown").click()
select_sport = WebDriverWait(driver, timeout=10).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='dropDown__list']/div[contains(text(),'" + sport + "')]"))).click()
This code returns this error: selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//div[@class='dropDown__list']/div[contains(text(),'Football')]"}.
How can I solve this problem ?
I would suggest breaking the wait-until call down into two lines for simplicity. It's totally optional and wouldn't make much of a difference.
wait = WebDriverWait(driver, 300)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".header__button--search")))
element_to_be_clicked = driver.find_element_by_css_selector(".header__button--search")
element_to_be_clicked.click()
For the second part, try using the values of the options in the drop-down list:
fill_search_bar.clear()
fill_search_bar.send_keys(Team)
driver.find_element_by_xpath("//div[@class='dropDown__selectedValue dropDownValueSelected___3msxRQS']").click()
select_sport = Select(driver.find_element_by_css_selector(".dropDown__list.dropDownList___3V-ppVu"))
select_sport.select_by_value('1')  # football has value 1 in the list
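Note, though, that Select only works on native <select> elements, and the UnexpectedTagNameException in the question indicates this dropdown is built from <div>s. A minimal sketch that clicks the option directly instead (the XPath is reconstructed from the error message in the question):

sport = 'Football'
driver.find_element(By.CSS_SELECTOR, ".dropDown").click()
# Click the option <div> by its text; Select cannot drive a <div>-based dropdown.
WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
    (By.XPATH, "//div[@class='dropDown__list']/div[contains(text(),'" + sport + "')]"))).click()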
I am trying to scrape all job postings for the last 24 hours from Glassdoor and save them to a dictionary.
binary = FirefoxBinary('path_to_firefox_binary.exe')
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
driver = webdriver.Firefox(firefox_binary=binary, capabilities=cap, executable_path=GeckoDriverManager().install())

base_url = 'https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn' \
           '&typedKeyword=data+sc&sc.keyword=data+scientist&locT=C&locId=1154532&jobType= '
driver.get(url=base_url)
driver.implicitly_wait(20)
driver.maximize_window()

# open the date filter and pick "Last Day"
WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, "div#filter_fromAge>span"))).click()
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((
    By.XPATH, "//div[@id='PrimaryDropdown']/ul//li//span[@class='label' and contains(., 'Last Day')]"))).click()

# find job listing elements on web page
listings = driver.find_elements_by_class_name("jl")
n_listings = len(listings)

results = {}
for index in range(n_listings):
    driver.find_elements_by_class_name("jl")[index].click()  # runs into error
    print("clicked listing {}".format(index + 1))
    info = driver.find_element_by_class_name("empInfo.newDetails")
    emp = info.find_element_by_class_name("employerName")
    results[index] = {'title': title, 'company': emp_name, 'description': description}
I keep running into the error message
selenium.common.exceptions.StaleElementReferenceException: Message: The element reference of <element> is stale; either the element is no longer attached to the DOM, it is not in the current frame context, or the document has been refreshed
for the first line inside my for loop. Even when the for loop runs for some number of iterations, it eventually leads to this exception. I am new to Selenium and web scraping and would appreciate any help.
Every time a new post is selected, the clicked element is modified and the DOM is therefore refreshed. The change is slow, certainly in comparison to the actions in the loop, so you want to slow the loop down a little. Instead of using a fixed sleep, you can wait for the changes to occur.
Every time you select a posting, a new class, selected, is added to it and its style attribute loses its content. Wait for this to happen, get the information, and then click the next post:
wait = WebDriverWait(driver, 20)
for index in range(n_listings - 1):
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.selected:not([style="border-bottom:0"])')))
    print("clicked listing {}".format(index + 1))
    info = driver.find_element_by_class_name('empInfo.newDetails')
    emp = info.find_element_by_class_name('employerName')
    if index < n_listings - 1:
        driver.find_element_by_css_selector('.selected + .jl').click()
This error means the element you are trying to click on was not found; you have to first make sure the target element exists before calling click(), or wrap the call in a try/except block.
# ...
results = {}
for index in range(n_listings):
    try:
        driver.find_elements_by_class_name("jl")[index].click()  # runs into error
    except:
        print('Listing not found, retrying in 1 second ...')
        time.sleep(1)
        continue
    print("clicked listing {}".format(index + 1))
    info = driver.find_element_by_class_name("empInfo.newDetails")
    emp = info.find_element_by_class_name("employerName")
# ...