Job navigates to one page and does not count down pages - python

I'm navigating to all the links on this page, but the page counter is not working. Any idea why that is?
I've tried to adjust the xpath in the hopes that would fix the issue. Why is it doing this?
# Visit every football link on the Betfair exchange page, numbering the
# pages as we go.  NOTE: the markdown extraction garbled XPath's attribute
# marker: the axis is '@href' — '#href' is not valid XPath and matches nothing.
import collections
import operator  # NOTE(review): unused in this snippet; kept from the original
from random import shuffle

from selenium import webdriver


def page_counter():
    """Yield sequential page numbers 0..999."""
    for x in range(1000):
        yield x


count = page_counter()

driver = webdriver.Chrome()
driver.get('https://www.betfair.com.au/exchange/plus/football')

# Collect the href of every nav link whose href starts with 'football'.
elements = [x.get_attribute("href") for x in
            driver.find_elements_by_xpath("//nav//a[starts-with(@href, ('football'))]")]
shuffle(elements)

# Number the links 1..N and visit them in descending page order.
links = dict((next(count) + 1, e) for e in elements)
desc_links = collections.OrderedDict(sorted(links.items(), reverse=True))
for key, value in desc_links.items():
    driver.get(value)
    print('At Page: ' + str(key))

elements becomes an empty list and that is the reason your loop is not working.
You need to let your page load to be able to access the link elements which have href starting with football. One of the possible ways to do it would be using WebDriverWait to wait for the header element to be visible:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.betfair.com.au/exchange/plus/football')

# Let the page finish rendering before touching the link elements.
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".container .page-content header")))

# XPath's attribute axis is '@href' ('#href' is invalid syntax and matches nothing).
elements = [x.get_attribute("href") for x in
            driver.find_elements_by_xpath("//nav//a[starts-with(@href, ('football'))]")]

Related

I want to extract all the URLs from the first 10 pages in Python using Selenium

Below is my code. I want to collect multiple URLs from the search results, but not all the way to the end — the limit is 10 pages, and I only need the URLs from those 10 pages.
My Code :
`
# Collect result links from the first 10 Google search pages.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time

array = []  # all collected result URLs

driver = webdriver.Chrome()
driver.get('https://google.com')
driver.find_element("name", "q").send_keys("websites" + Keys.RETURN)

urls = "https://www.google.com/search?q=websites&sxsrf=ALiCzsZmfT1H8dxBOig9KvuRbtnQUtVTtQ%3A1668401426603&source=hp&ei=EslxY8GlIZyM4-EPjqOLoAk&iflsig=AJiK0e8AAAAAY3HXIiUkRUcpwQ84iKLerx9VqGixFmVk&ved=0ahUKEwjB9vzS76z7AhUcxjgGHY7RApQQ4dUDCAk&uact=5&oq=websites&gs_lcp=Cgdnd3Mtd2l6EAMyCAgAEIAEELEDMgsIABCABBCxAxCDATILCAAQgAQQsQMQgwEyBQgAEIAEMgsIABCABBCxAxCDATIFCAAQgAQyBQgAEIAEMgcIABCABBAKMgUIABCABDILCAAQgAQQsQMQgwE6BwgjEOoCECc6BwguEOoCECc6CwguEIMBELEDEIAEOggILhCDARCxAzoICAAQsQMQgwE6BAgjECc6DgguEIAEELEDEMcBENEDOhEILhCABBCxAxCDARDHARCvAVDUBFj1E2D0FWgBcAB4AIAB4QGIAeMJkgEFMC43LjGYAQCgAQGwAQo&sclient=gws-wiz"
driver.get(urls)

RESULT_LINKS_XPATH = "/html/body/div/div/div/div/div[2]/div[2]/div/div/div/div/div/div[1]/div/a"

# The questioner wants exactly 10 pages, so loop a fixed count instead of
# 'while True'.  BUG in the original: 'if link == 1' compared a *list* to an
# int (always False), so the loop never stopped.
for page in range(10):
    for elem in driver.find_elements("xpath", RESULT_LINKS_XPATH):
        # BUG in the original: 'array = elem.get_attribute(...)' rebound the
        # name each time instead of appending to the list.
        array.append(elem.get_attribute("href"))
    try:
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next'))).click()
    except Exception:
        print("No more pages left")
        break

print(array)
`

How can we loop through items that are coordinates, load each into a Google Maps page, and grab an element from each page?

I'm testing the code below, which should loop through items in a column of a dataframe, load each into Google Maps, and grab a 'title' from each page. I want to push the result into a column in a dataframe (df_fin['my_place'] = driver.title).
The problem that I am noticing is that the first record works perfectly fine, but no subsequent items work. I'm guessing something is wrong with the page refresh, or maybe the click event, but I don't know for sure what's going on here.
# For each coordinate URL in df_fin['place'], load it in Google Maps, click
# the map canvas, and record the resulting page title per row.
import pandas as pd
import time

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver

df_fin = pd.read_csv('C:\\Users\\df_fin.csv')

driver = webdriver.Chrome('C:\\Users\\TryMe\\chromedriver.exe')
driver.maximize_window()
driver.implicitly_wait(5)

titles = []  # one title (or 'OTHER') per row, in order
i = 1
for item in df_fin['place']:
    try:
        driver.get(item)
        wait = WebDriverWait(driver, 10)
        main_canvas = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[name()='canvas']")))
        size = main_canvas.size
        w, h = size['width'], size['height']
        # BUG in the original: it moved by (new_h, new_h) — the x offset
        # should be half the *width*.  NOTE(review): move_by_offset is
        # relative to the previous pointer position, so offsets may
        # accumulate across iterations — confirm this is intended.
        ActionChains(driver).move_by_offset(w / 2, h / 2).pause(5).perform()
        time.sleep(2)
        wait.until(EC.element_to_be_clickable((By.XPATH, "//*[name()='canvas']"))).click()
        print(driver.title)
        titles.append(driver.title)
    except Exception:
        # BUG in the original: 'df_fin['my_place'] = driver.title' assigned
        # the WHOLE column on every iteration, clobbering earlier results —
        # collect per-row values and assign the column once at the end.
        titles.append('OTHER')
    i = i + 1
    print(i)

df_fin['my_place'] = titles
df_fin.head(10)
Here is a sample of what's in my dataframe.
https://www.google.com/maps/@42.33988,-71.10409,18z
https://www.google.com/maps/@39.73914,-75.54937,18z
https://www.google.com/maps/@44.4995,-88.05496,18z
https://www.google.com/maps/@44.50235,-88.06322,18z
https://www.google.com/maps/@40.82265,-73.40959,18z
Finally, in the image below, you can see that 'title' is 'Rod Laver Arena'. That's what I'm trying to get.

Python Selenium Path not found

I have written the following code:
# Scrape the bonus-history table from a moneycontrol stock page.
import os
import time

from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--start-maximized')
options.page_load_strategy = 'eager'

driver = webdriver.Chrome(options=options)

url = "https://www.moneycontrol.com/india/stockpricequote/chemicals/tatachemicals/TC"
driver.get(url)

# NOTE(review): constructing WebDriverWait never raises by itself — it only
# waits when .until(...) is called, so the original try/except blocks were
# no-ops (and 'driver.send_keys' is not a WebDriver method anyway).
wait = WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Bonus"))).click()

# XPath's attribute axis is '@class' ('#class' is invalid), and HTML table
# rows are 1-indexed, so tr[0] can never match — start the range at 1.
for i in range(1, 51):
    bonus_month = driver.find_element_by_xpath(
        "//*[@class= 'mctable1.thborder.frtab']/tbody/tr[%s]/td[1]" % (i))
    print(bonus_month.text)
    # NOTE(review): this reads td[1] a second time; the Ex-Bonus / Ratio
    # cells are likely td[2] / td[3] — confirm against the page markup.
    bonus = driver.find_element_by_xpath(
        "//*[@class= 'mctable1.thborder.frtab']/tbody/tr[%s]/td[1]" % (i))
    print(bonus.text)
This gives me error
no such element: Unable to locate element: {"method":"xpath","selector":"//*[@class= 'mctable1.thborder.frtab']/tbody/tr[0]/td[1]"}
Element on the page:
Where I am making mistake in finding Exbonus and Ratio?
First use the clickable method from the expected conditions to check that the element is clickable within given time to just make sure it is operational.
Once the click action is performed on the bonus link, the table takes some time to finish loading. In the meantime Selenium tries to fetch the table content and fails to get it. So add another wait for the element to load, then grab the table using XPath and iterate over the rows of the table. -
# XPath attributes use '@id'/'@href' — the post's '#id'/'#href' are garbled.
# (The literal '#ca_bonus' VALUE is a real URL fragment and stays as-is.)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
    (By.XPATH, "//ul[@id='corporation_tab']//a[@href='#ca_bonus']")))
driver.find_element_by_link_text("Bonus").click()

# The table loads asynchronously after the click; wait for a row to exist
# before reading it.
WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
    (By.XPATH, "//tbody[@id='id_b']/tr")))

tableRows = driver.find_elements_by_xpath('//tbody[@id="id_b"]/tr')
print(tableRows)
for i in tableRows:
    # Relative XPath 'td[n]' reads the n-th cell of this row.
    AnnouncementDate = i.find_element_by_xpath('td[1]').text
    exbonus = i.find_element_by_xpath('td[2]').text
    ratio = i.find_element_by_xpath('td[3]').text
    print(AnnouncementDate + " \t\t " + exbonus + " \t\t " + ratio)
This returns me the output -
You will need following extra import -
from selenium.webdriver.support import expected_conditions as EC
This partially will solve your issue with locators:
1 To find Ex-Bonus use css selector: #id_b>tr>td:nth-of-type(2)
2 To find ratio use also css selector, #id_b>tr>td:nth-of-type(3)
To iterate, use:
#id_b>tr:nth-of-type(x)>td:nth-of-type(3)
where x is the number of row.
For example, #id_b>tr:nth-of-type(1)>td:nth-of-type(3) will give you text with ratio 3:5
If you avoid using #id_b, this locator will not be unique.
Instead of range function I'd use find_elements_by_css_selector. Try following:
# Walk the bonus-history rows directly rather than indexing with range().
for table_row in driver.find_elements_by_css_selector("#id_b>tr"):
    bonus = table_row.find_element_by_css_selector("td:nth-of-type(2)").text
    ratio = table_row.find_element_by_css_selector("td:nth-of-type(3)").text
There are only 5 of this elements on the page. I'll have to click See More.

Unable to parse data inside h4 tag: Python3

I am facing an issue while parsing data from the 'Literature ' tab from the third table. The steps I took to reach the table:
Go to ibl.mdanderson.org/fasmic/#!
Type and select AKT1 (3 mutations) (NOTE:'GO' button doesn't work, please click the option from the drop-down)
Click on the green button with the text 'MS', a new table will appear.
In this new table, there will be a tab called literature, I need the literature text and the PMID.
I tried the following code, but it gives an empty list:
litrature = []  # was never initialized in the original snippet

# Open the Literature tab.  XPath's attribute axis is '@class' — the post's
# '#class' is garbled and matches nothing.
driver.find_element_by_xpath("//*[contains(text(),'Literature')]").click()
for elements in driver.find_elements_by_xpath('//div[@class="tab-pane ng-scope active"]'):
    soup = BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find('div', attrs={'id': "literature_div"})
    rows = table.find_all('h4')
    for row in rows:
        # BUG in the original: row.find_all('h4') inside an <h4> is always
        # empty (an h4 holds no nested h4) — read the row's own text.
        litrature.append(row.text.strip())
    print("Data from COLUMN 1:")
    print(litrature)
How can I resolve this?
UPDATE
When I try to click on the 'Next ' button under the 'literature' table, I get the following error:
"Message: The element reference of is stale; either the element is no longer attached to the DOM, it is not in the current frame context, or the document has been refreshed "
Following is the line I added to click on the "NEXT" button: driver.find_element_by_xpath('//a[@ng-click="selectPage(page + 1, $event)"]').click()
How can I resolve this?
you need to wait 3 times
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://ibl.mdanderson.org/fasmic/#!/')

# Wait 1: the search box must exist before we type into it.
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '//input')))
search_box = driver.find_element_by_xpath("//input")  # renamed: 'input' shadowed the builtin
search_box.send_keys("AKT1\n")

# Wait 2: the 'MS' button appears after the gene is selected.
button = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, 'btn-tab-avail')))
button.click()

driver.find_element_by_xpath("//*[contains(text(),'Literature')]").click()

# Wait 3: the literature entries render inside #literature_div.
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#literature_div h4')))
rows = driver.find_elements_by_css_selector("#literature_div h4")

litrature = []
for item in rows:
    litrature.append(item.text)
print("Data from COLUMN 1:")
# BUG in the original: Python 2 'print item' in a Python 3 answer — print
# the collected list with function-call syntax.
print(litrature)
Like this? Someone with more knowledge of python waits can certainly improve on my wait lines.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC

url = "https://ibl.mdanderson.org/fasmic/#!/"
d = webdriver.Chrome()
wait = WebDriverWait(d, 10)
d.get(url)

# Type the gene into the search field, then submit with RETURN.
d.find_element_by_css_selector('[type=text]').send_keys('AKT1 (3 mutations)')
d.find_element_by_css_selector("input[type='text']").send_keys(Keys.RETURN)

# Wait for the 'MS' button, click it, then open the Literature tab.
ms_button = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".btn.btn-default.btn-tab-avail")))
ms_button.click()
d.find_element_by_css_selector("[heading=Literature]").click()

# Wait until the PMID text is present, then print every literature entry.
wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#literature_div [ng-repeat]"), "PMID"))
for entry in d.find_elements_by_css_selector("#literature_div [ng-repeat]"):
    print(entry.text, "\n")
d.quit()

Parsing a site where URL doesn't change with Selenium Python

I'm trying to scrape this site; its URL doesn't change when the next page is clicked. So I used Selenium to click through to the next page, but doing that doesn't help — my driver keeps getting the old page even after the next page is clicked. Is there any other way to get to the next page and scrape it?
from selenium import webdriver
from selenium.webdriver.common.by import By  # was missing, yet By.XPATH is used below
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

driver = webdriver.Safari()
store_pages = []

# 10306 is the total number of pages.
for i in range(10306):
    # NOTE(review): re-loading the same starting URL every iteration resets
    # the pagination — this is why the scraper keeps seeing the old page.
    Starting_url = 'site'
    driver.get(Starting_url)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    print(store_pages.append(i))  # NOTE(review): list.append returns None

    timeout = 20
    try:
        # XPath's attribute axis is '@id' — '#id' is invalid XPath.
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located(
            (By.XPATH, "//*[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_lblDisclaimerMsg']")))
    except TimeoutException:
        print("Timed out waiting for page to load")
        driver.quit()

    driver.find_element_by_name(
        "ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()

    timeout = 20
    # ('#...' here is a CSS *id selector*, which is correct syntax for CSS.)
    wait = WebDriverWait(driver, 10).until(EC.text_to_be_present_in_element_value(
        (By.CSS_SELECTOR,
         '#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a > div.act_search_results > div.act_search_header'),
        "206113 Record(s) | Page [2 of 10306]"))

    # NOTE(review): find_element_by_class_name takes a SINGLE class name;
    # a compound "faq-sub-content exempted-result" will not match — confirm
    # and switch to a CSS selector if needed.
    NGO_element = driver.find_element_by_class_name("faq-sub-content exempted-result")
    NGO_name = NGO_element.find_elements_by_tag_name("h1")
    NGO_name_pancard = driver.find_elements_by_class_name("pan-id")
    NGO_data = NGO_element.find_elements_by_tag_name("ul")
    NGO_sub_data = NGO_element.find_elements_by_tag_name("li")
    for i, p, t in zip(NGO_name, NGO_name_pancard, NGO_data):
        n_name = i.text.replace(p.text, '')
        n_data = t.text
        n_pan = p.text
        print("Name of NGO:", n_name, "Fields of NGO:", n_data, "Pancard number:", n_pan)

    driver.find_element_by_name(
        "ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
#timeout = 2
You need to make sure when you reach the next page, the content of the earlier page has become stale otherwise, you will have stale element error or get the same thing repeatedly. Try the below approach, it should get you there. The rest you can modify yourself.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("http://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")

while True:
    # Print every record on the current page.
    for elem in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^='arrowex']"))):
        print(elem.text)
    try:
        # Click Next, then wait until the last element of the old page goes
        # stale — that guarantees the next page has actually replaced it.
        wait.until(EC.presence_of_element_located(
            (By.ID, "ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_imgbtnNext"))).click()
        wait.until(EC.staleness_of(elem))
    except Exception:
        # Narrowed from a bare 'except:', which also swallowed
        # KeyboardInterrupt/SystemExit.  No Next button => last page reached.
        break

driver.quit()

Categories