Python web scraping using Selenium - iterate through href link

Python web scraping using Selenium - iterate through href link - python

I am trying to write a script that uses selenium to download many files which consist of different NHL players information; game-log. I want to download a file for each players in the following table: https://www.naturalstattrick.com/playerteams.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&score=all&stdoi=std&rate=y&team=ALL&pos=S&loc=B&toi=0.1&gpfilt=none&fd=&td=&tgp=410&lines=single
Once on that website, I wanted to click on all the players' name in the table. When a player's name is clicked through the href link, a new window opens. There are few drop-down menus at the top. I want to select "Rate" instead of "Counts" and also select " Game Log" instead of "Player Summary", and then click "Submit". Finally, I want to click on CSV(All) at the bottom to download a CSV file.
Here is my current code:
from selenium import webdriver
import csv
from selenium.webdriver.support.ui import Select
from datetime import date, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
chromedriver =("C:/Users/Michel/Desktop/python/package/chromedriver_win32/chromedriver.exe")
driver = webdriver.Chrome(chromedriver)
driver.get("https://www.naturalstattrick.com/playerteams.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&score=all&stdoi=std&rate=y&team=ALL&pos=S&loc=B&toi=0.1&gpfilt=none&fd=&td=&tgp=410&lines=single")
table = driver.find_element_by_xpath("//table[#class='indreg dataTable no-footer DTFC_Cloned']")
for row in table.find_elements_by_xpath("//tr[#role='row']")
links = driver.find_element_by_xpath('//a[#href]')
links.click()
select = Select(driver.find_element_by_name('rate'))
select.select_by_value("y")
select1 = Select(driver.find_element_by_name('v'))
select1.select_by_value("g")
select2 = Select(driver.find_element_by_type('submit'))
select2.select_by_value("submit")
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH , '//div[#class="dt-button button-csv button-htm15"]')))
CSVall = driver.find_element_by_xpath('//div[#class="dt-button button-csv button-htm15"]')
CSVall.click()
driver.close()
I try to change different things, but I always get an error. Where is the problem ?
Moreover, I think I should probably add a line to wait for the website to load because it takes a few seconds; after "driver.get". I do not know what should be the expected conditions to end the wait in this case.
Thanks

Rather than keep clicking through selections you could grab the playerIds from the first page and concantenate those, along with the strings representing the selections for Rate and Game Log into the queryString part of the new URL. Sure you can tidy up the following.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def getPlayerId(url):
id = url.split('playerid=')[1]
id = id.split('&')[0]
return id
def makeNewURL(playerId):
return 'https://www.naturalstattrick.com/playerreport.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&stdoi=oi&rate=y&v=g&playerid=' + playerId
#chromedriver =("C:/Users/Michel/Desktop/python/package/chromedriver_win32/chromedriver.exe")
driver = webdriver.Chrome()
driver.get("https://www.naturalstattrick.com/playerteams.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&score=all&stdoi=std&rate=y&team=ALL&pos=S&loc=B&toi=0.1&gpfilt=none&fd=&td=&tgp=410&lines=single")
links = driver.find_elements_by_css_selector('table.indreg.dataTable.no-footer.DTFC_Cloned [href*=playerid]')
newLinks = []
for link in links:
newLinks.append(link.get_attribute('href'))
for link in newLinks:
playerId = getPlayerId(link)
link = makeNewURL(playerId)
driver.get(link)
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH , '//a[#class="dt-button buttons-csv buttons-html5"][2]')))
CSVall = driver.find_element_by_xpath('//a[#class="dt-button buttons-csv buttons-html5"][2]')
CSVall.click()

you don't need to click each player link but save the URLs as list, also there are several error, you can see working code below
from selenium import webdriver
import csv
from selenium.webdriver.support.ui import Select
from datetime import date, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
chromedriver =("C:/Users/Michel/Desktop/python/package/chromedriver_win32/chromedriver.exe")
driver = webdriver.Chrome(chromedriver)
driver.get("https://www.naturalstattrick.com/playerteams.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&score=all&stdoi=std&rate=y&team=ALL&pos=S&loc=B&toi=0.1&gpfilt=none&fd=&td=&tgp=410&lines=single")
playerLinks = driver.find_elements_by_xpath("//table[#class='indreg dataTable no-footer DTFC_Cloned']//a")
playerLinks = [p.get_attribute('href') for p in playerLinks]
print(len(playerLinks))
for url in playerLinks:
driver.get(url)
select = Select(driver.find_element_by_name('rate'))
select.select_by_value("y")
select1 = Select(driver.find_element_by_name('v'))
select1.select_by_value("g")
driver.find_element_by_css_selector('input[type="submit"]').click()
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH , '//a[#class="dt-button buttons-csv buttons-html5"][2]')))
CSVall = driver.find_element_by_xpath('//a[#class="dt-button buttons-csv buttons-html5"][2]')
CSVall.click()
driver.close()

Related

Selenium not updating current window

Selenium seems to be looking for data in an old page and not the new one.
I'm trying to automate a search where I select from a dropdown menu and fill a box with some value
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
url = 'http://www.op.nysed.gov/opsearches.htm'
value = '60'
license_no = '084157'
driver = webdriver.Chrome()
driver.get(url)
select = Select(driver.find_element(By.XPATH, '//form[#id="licensee-num-form"]/center/select'))
select.select_by_value(value)
fill = driver.find_element(By.XPATH, '//form[#id="licensee-num-form"]/center/input')
fill.send_keys(license_no)
fill.send_keys(Keys.ENTER)
data = driver.find_element(By.XPATH, "//div[#id='content_column']")
However, when I print data.text, it prints data from the first page, not the second one. I tried using driver.refresh() to refresh the page but it did not work.

This happens because you getting the data = driver.find_element(By.XPATH, "//div[#id='content_column']") immediately after clicking the Enter. The page still not refreshed. You should add a short delay there and then wait for element visibility on the refreshed page.
Please try this:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
url = 'http://www.op.nysed.gov/opsearches.htm'
value = '60'
license_no = '084157'
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)
driver.get(url)
select = Select(driver.find_element(By.XPATH, '//form[#id="licensee-num-form"]/center/select'))
select.select_by_value(value)
fill = driver.find_element(By.XPATH, '//form[#id="licensee-num-form"]/center/input')
fill.send_keys(license_no)
fill.send_keys(Keys.ENTER)
time.sleep(0.5)
data = wait.until(EC.visibility_of_element_located((By.XPATH, "//div[#id='content_column']")))

selenium no way to use a visible select element python

I am working on a project that I first sign up instagram by given information and then scrape some data for the sign up form I have some problems with second form which is birthdate the Select class of selenium didn't work. Maybe it is because of visibility. when I tried to access the elements it says that there is no such elements but they are both visible graphically and in inspect page of chrome and I prefer to not use the pyautogui because it is kinda unflexible and hard to use.
here is my code
#information
driverSignUp = webdriver.Chrome()
while True:
while True:
try:
driverSignUp.get('https://www.instagram.com/accounts/emailsignup/')
break
except:
pass
try:
driverSignUp.find_element(By.CSS_SELECTOR, 'body.p-error.dialog-404')
driverSignUp.delete_all_cookies()
except:
break
driverSignUp.implicitly_wait(1)
driverSignUp.find_element(By.NAME, 'emailOrPhone').send_keys(email)
driverSignUp.find_element(By.NAME, 'fullName').send_keys(fullName)
driverSignUp.find_element(By.NAME, 'username').send_keys(username)
driverSignUp.find_element(By.NAME, 'password').send_keys(password)
element = WebDriverWait(driverSignUp, 3).until(
EC.all_of(EC.presence_of_element_located((By.CSS_SELECTOR, 'button.sqdOP.L3NKy.y3zKF')))
)
driverSignUp.find_element(By.CSS_SELECTOR, 'button.sqdOP.L3NKy.y3zKF').click()
#birthdate
actions = ActionChains(driverSignUp)
Month = driverSignUp.find_element(By.CSS_SELECTOR, 'select[title="Month:"]')
actions.move_to_element(Month).perform()
selectMonth = Select(Month)
selectMonth.select_by_visible_text('August')
Day = driverSignUp.find_element(By.CSS_SELECTOR, 'select[title="Day:"]')
actions.move_to_element(Day).perform()
selectDay = Select(Day)
selectDay.select_by_visible_text('1')
Year = driverSignUp.find_element(By.CSS_SELECTOR, 'select[title="Year:"]')
actions.move_to_element(Year).perform()
selectYear = Select(Year)
selectYear.select_by_visible_text('2000')
and the libraries I used:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
thanks for your answers

Selenium using WebDriverWait WebElement still doesn't print any text, recognizes string PYTHON

As title. Couldn't find a solution online. You'll see by the print statements that the program will get the text property from the WebElement object, but it is always an empty string even though I am using WebDriverWait.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
PATH = "C:\\Users\\Anthony\\Desktop\\chromedriver.exe"
driver = webdriver.Chrome(PATH)
website = "https://www.magicspoiler.com"
driver.get(website)
el_name = '//div[#class="set-card-2 pad5"]'
try:
main = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, el_name)))
articles = main.find_elements(By.XPATH, el_name)
for article in articles:
print(type(article.text))
print(article.text)
finally:
driver.quit()

I checked the HTML DOM and I don't see any text in the element you are looking for. They are simply images with anchor links and therefore you are getting blank responses.
I have given an alternate try, to extract the links and they were successful.
So, after refactoring your code, it looks like this:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
PATH = "C:\\Users\\Anthony\\Desktop\\chromedriver.exe"
driver = webdriver.Chrome(PATH)
website = "https://www.magicspoiler.com"
driver.get(website)
el_name = '//div[#class="set-card-2 pad5"]/a'
try:
main = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, el_name)))
articles = main.find_elements(By.XPATH, el_name)
for article in articles:
# print(type(article.text))
print(article.get_attribute("href"))
finally:
pass
driver.quit()
Here is the response I got:
So, please check if you want to get the links or really the text. If you want the text, then there is none that I see in DOM as I said. You may have to traverse through each of the link by clicking on it, and find if any you need fro the navigated page.

How do I retrieve the link of an image through Selenium

I'm trying to make my program fetch the link of an image and then store it as a string in a variable.
This is the xpath of the image. I need to do it through xpaths because the xpaths on the website are very similar bar the "/article[x]". This allow me to increase the number with a variable so that I can go through all the xpaths on the page.
/html/body/div[2]/div[2]/div[3]/div[2]/div[2]/div[1]/div/article[1]/div[2]/div[1]/a/img
Picture of the website that I'm trying to retrieve the links of the image
My code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import tkinter
import time
Anime = input("Enter Anime:")
driver = webdriver.Chrome(executable_path=r"C:\Users\amete\Documents\chromedriver.exe")
driver.get("https://myanimelist.net/search/all?q=one%20piece&cat=all")
search = driver.find_element_by_xpath('//input[#name="q"]')
wait = WebDriverWait(driver, 20)
wait.until(EC.element_to_be_clickable((By.XPATH, '//input[#name="q"]')))
# Clears the field
search.send_keys(Keys.CONTROL, 'a')
search.send_keys(Keys.DELETE)
# The field is now cleared and the program can type whatever it wants
search.send_keys(Anime)
search.send_keys(Keys.RETURN)
# Accept the cookies
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[#id="qc-cmp2-ui"]/div[2]/div/button[3]'))).click()
# Added this wait
wait.until(EC.element_to_be_clickable((By.XPATH,'//h2[#id="anime"]//ancestor::div[#class="content-left"]//article[1]/div[contains(#class, "list")][1]/div[contains(#class, "information")]/a[1]')))
link = driver.find_element_by_xpath('//h2[#id="anime"]//ancestor::div[#class="content-left"]//article[1]/div[contains(#class, "list")][1]/div[contains(#class, "information")]/a[1]').text
piclink = driver.('/html/body/div[2]/div[2]/div[3]/div[2]/div[2]/div[1]/div/article[1]/div[2]/div[1]/a/img')
print (piclink)

you can get it like this (specify the attribute)
piclink = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div[3]/div[2]/div[2]/div[1]/div/article[1]/div[2]/div[1]/a/img').get_attribute('src')
print(piclink)

My scraper fails to get all the items from a webpage

I've written some code in python in combination with selenium to parse different product names from a webpage. There are few load more buttons visible if the browser is made to scroll downward. The webpage displays it's full content if the page is made to scroll downmost until there is no load more button to click. My scraper seems to be doing good but I'm not getting all the results. There are around 200 products in that page but I'm getting 90 out of them. What change should I bring about in my scraper to get them all? Thanks in advance.
The webpage I'm dealing with: Page_Link
This is the script I'm trying with:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("put_above_url_here")
wait = WebDriverWait(driver, 10)
page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,".listing_item")))
for scroll in range(17):
page.send_keys(Keys.PAGE_DOWN)
time.sleep(2)
try:
load = driver.find_element_by_css_selector(".lm-btm")
load.click()
except Exception:
pass
for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^=item_]"))):
name = item.find_element_by_css_selector(".pro-name.el2").text
print(name)
driver.quit()

Try below code to get required data:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("https://www.purplle.com/search?q=hair%20fall%20shamboo")
wait = WebDriverWait(driver, 10)
header = driver.find_element_by_tag_name("header")
driver.execute_script("arguments[0].style.display='none';", header)
while True:
try:
page = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".listing_item")))
driver.execute_script("arguments[0].scrollIntoView();", page)
page.send_keys(Keys.END)
load = wait.until(EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "LOAD MORE")))
driver.execute_script("arguments[0].scrollIntoView();", load)
load.click()
wait.until(EC.staleness_of(load))
except:
break
for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^=item_]"))):
name = item.find_element_by_css_selector(".pro-name.el2").text
print(name)
driver.quit()

You should only Use Selenium as a last resort.
A simple look around in the webpage showed the API it called to get your data.
It returns a JSON output with all the details:
Link
You can now just loop over and store in a dataframe easily.
Very fast, fewer errors than selenium.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python web scraping using Selenium - iterate through href link - python

Related

Selenium not updating current window

selenium no way to use a visible select element python

Selenium using WebDriverWait WebElement still doesn't print any text, recognizes string PYTHON

How do I retrieve the link of an image through Selenium

My scraper fails to get all the items from a webpage

Categories

Resources