Scraping multiple URLs with Selenium - Python

I'm new to coding, but I wrote this code that scrapes the page fine. I want to scrape multiple of these URLs, maybe 200 of them. How do I do that?
from selenium import webdriver

chrome_path = r"C:\Users\lenovo\Downloads\chromedriver_win32 (5)\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get("https://www.kijijiautos.ca/vip/22442312")
driver.find_element_by_xpath('//div[@class="b1yLWE b3zFtQ"]').text
btn = driver.find_element_by_xpath('//button[@class="g1zAe-"]')
btn.click()
# find_elements returns a list, so take the text of each element
[el.text for el in driver.find_elements_by_xpath('//span[@class="A2jAym q2jAym"]')]
driver.find_element_by_xpath('//div[@class="b1yLWE b1zAe-"]').text
print(driver.current_url)

Something like below
from selenium import webdriver

chrome_path = r"C:\Users\lenovo\Downloads\chromedriver_win32 (5)\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)

def get_scraping(link):
    driver.get(link)
    driver.find_element_by_xpath('//div[@class="b1yLWE b3zFtQ"]').text
    btn = driver.find_element_by_xpath('//button[@class="g1zAe-"]')
    btn.click()
    # find_elements returns a list, so take the text of each element
    [el.text for el in driver.find_elements_by_xpath('//span[@class="A2jAym q2jAym"]')]
    driver.find_element_by_xpath('//div[@class="b1yLWE b1zAe-"]').text
    print(driver.current_url)
    return driver.current_url

links = ["https://www.kijijiautos.ca/vip/22442312", "other_urls"]
scrapings = []
for link in links:
    scrapings.append(get_scraping(link))

Just add a for loop:
from selenium import webdriver

chrome_path = r"C:\Users\lenovo\Downloads\chromedriver_win32 (5)\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)

for x in range(200):
    driver.get("https://www.kijijiautos.ca/vip/22442312")
    driver.find_element_by_xpath('//div[@class="b1yLWE b3zFtQ"]').text
    btn = driver.find_element_by_xpath('//button[@class="g1zAe-"]')
    btn.click()
    [el.text for el in driver.find_elements_by_xpath('//span[@class="A2jAym q2jAym"]')]
    driver.find_element_by_xpath('//div[@class="b1yLWE b1zAe-"]').text
    print(driver.current_url)
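Note that both snippets above use the old find_element_by_* helpers, which were removed in Selenium 4.3. On a current Selenium release the equivalent calls look roughly like this (same XPaths, just the newer locator API; title and specs are only illustrative variable names):
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # Selenium 4.6+ can locate the driver binary automatically

links = ["https://www.kijijiautos.ca/vip/22442312"]  # add your other URLs here
for link in links:
    driver.get(link)
    title = driver.find_element(By.XPATH, '//div[@class="b1yLWE b3zFtQ"]').text
    driver.find_element(By.XPATH, '//button[@class="g1zAe-"]').click()
    specs = [el.text for el in driver.find_elements(By.XPATH, '//span[@class="A2jAym q2jAym"]')]
    print(link, title, specs)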


Converting a Chrome WebDriver script to Edge WebDriver

I have written code that runs Selenium with ChromeDriver. Now I need to convert it so that it runs with msedgedriver. Can someone send the converted code so that it runs on the Edge driver, and tell me how they did it, so that I can write code the same way in the future?
Here is my code:
import time
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

'''locators'''
bank_cost_lt = (By.XPATH, "//*[@id='itembox-InstantBankDiscount']//a")
x_mark_lt = (By.XPATH, "//*[@id='twister-plus-dp-bg']/i")
partner_lt = (By.XPATH, "//*[@id='itembox-Partner']//a[@class='a-size-base a-link-emphasis vsx-offers-count']")

options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)
# driver = webdriver.Chrome(executable_path=chrome_exe_path, chrome_options=options)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=options)
driver.implicitly_wait(0.5)
# driver.maximize_window()

df = pd.DataFrame(columns=['url', 'price', 'seller', 'bank offers', 'partner offers'])
cnt = 0

def presence_of_element_click(by_locator, timeout):
    WebDriverWait(driver, timeout).until(EC.presence_of_element_located(by_locator)).click()

urls = ["https://www.amazon.in/JBL-Cancellation-Headphones-Playtime-Assistant/dp/B096FYLJ6M/ref=sr_1_8?crid=OHZVHJG9Q7HN&keywords=jbl%2Bheadphones&qid=1672115942&s=electronics&sprefix=jbl%2Bheadphone%2Celectronics%2C235&sr=1-8&th=1"]
# urls = ["https://www.amazon.in/dp/B0BGZN7FWV?th=1"]

for url in urls:
    print(url)
    driver.get(url)
    time.sleep(3)
    WebDriverWait(driver, 180).until(lambda driver: driver.execute_script('return document.readyState') == 'complete')
    elem = driver.find_element('xpath', '//*')
    source_code = elem.get_attribute("outerHTML")
    soup1 = BeautifulSoup(source_code, "html.parser")
    soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
    price = soup2.find('span', {'class': 'a-price aok-align-center reinventPricePriceToPayMargin priceToPay'}).span.text
    print(price)
    seller = soup2.find('div', {'id': 'merchant-info'}).find('a').find('span').text
    print(seller)
    presence_of_element_click(bank_cost_lt, 60)
    time.sleep(4)
    bank_ = driver.find_element('id', 'InstantBankDiscount-sideSheet')
    source_code = bank_.get_attribute("outerHTML")
    soup1 = BeautifulSoup(source_code, "html.parser")
    soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
    bank_offer_els = soup2.find_all('div', {'class': 'a-section vsx-offers-desktop-lv__item'})
    lst = []
    for val in bank_offer_els:
        lst.append(str(val.p.text).strip())
    presence_of_element_click(x_mark_lt, 60)
    time.sleep(2)
    presence_of_element_click(partner_lt, 60)
    time.sleep(2)
    bank_ = driver.find_element('id', 'Partner-single-offer')
    source_code = bank_.get_attribute("outerHTML")
    soup1 = BeautifulSoup(source_code, "html.parser")
    soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
    partner_els = soup2.find_all('div', {'class': 'a-section vsx-offers-desktop-dv__content aok-block'})
    par_lst = []
    for val in partner_els:
        par_lst.append(str(val.text).strip())
    presence_of_element_click(x_mark_lt, 60)
    df.loc[cnt, 'url'] = url
    df.loc[cnt, 'price'] = price.strip()
    df.loc[cnt, 'seller'] = seller.strip()
    if lst:
        df.loc[cnt, 'bank offers'] = ', '.join(lst)
    else:
        df.loc[cnt, 'bank offers'] = 'No offers'
    if par_lst:
        df.loc[cnt, 'partner offers'] = ', '.join(par_lst)
    else:
        df.loc[cnt, 'partner offers'] = 'No Offers'
    cnt += 1

df.to_excel('output1.xlsx', index=False)
If you're using Selenium 4, you only need to change the Chrome-related code to the Edge equivalents. The edited lines of code are below:
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager
options = webdriver.EdgeOptions()
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)
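For context, here is a minimal self-contained sketch of the Edge setup, keeping the same notification preference as the original Chrome code (the test URL is only a placeholder to verify the driver starts):
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager

options = webdriver.EdgeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)

driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)
driver.implicitly_wait(0.5)
driver.get("https://www.bing.com")  # placeholder URL, just to check Edge launches
print(driver.title)
driver.quit()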
If you want to learn more about Edge WebDriver automation, you can refer to this doc: Use WebDriver to automate Microsoft Edge.

Going to the site with a direct link, I can't get the element

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import chromedriver_autoinstaller
from selenium.webdriver.common.keys import Keys
import subprocess
import time
from selenium.webdriver.common.by import By
import functionmodules

### PATH
CHROME_DRIVER = r'C:\Users\SANGHYUN\Downloads\chromedriver_win32\chromedriver.exe'
url = 'https://cafe.naver.com/reply14/1'
#url = 'https://cafe.naver.com/reply14'
CHROME_PATH = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
searchpath = url

subprocess.Popen(r'C:\Program Files\Google\Chrome\Application\chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\chrometemp"')

option = Options()
option.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
chrome_ver = chromedriver_autoinstaller.get_chrome_version().split('.')[0]
try:
    driver = webdriver.Chrome(f'./{chrome_ver}/chromedriver.exe', options=option)
except:
    chromedriver_autoinstaller.install(True)
    driver = webdriver.Chrome(f'./{chrome_ver}/chromedriver.exe', options=option)
driver.get(searchpath)

def CallGoToArticleStep():
    # go to main
    driver.switch_to.parent_frame()
    driver.find_element(By.XPATH, '//*[@id="menuLink1"]').click()
    driver.switch_to.frame('cafe_main')
    # click article3
    time.sleep(2)
    firstarticle = '//*[@id="main-area"]/div[4]/table/tbody/tr[2]/td[1]/div[3]/div/a[1]'
    element3 = driver.find_element(By.XPATH, firstarticle)
    element3.send_keys('\n')

#CallGoToArticleStep()

# write reply, send reply
for i in range(1):
    time.sleep(4)
    print(i)
    replyString = '//*[@id="app"]/div/div/div[2]/div[2]/div[4]/div[2]/div[1]/textarea'
    replyElement = driver.find_element(By.XPATH, replyString)
    replyElement.send_keys('whisky life')
    replyClickString = '//*[@id="app"]/div/div/div[2]/div[2]/div[4]/div[2]/div[2]/div[2]/a'
    replyClickElement = driver.find_element(By.XPATH, replyClickString)
    replyClickElement.click()
time.sleep(1000)
In this code, when CallGoToArticleStep() is called I can get replyElement; when it isn't called I can't, even though the elements in the browser look the same.
Is there a way to get replyElement without calling the CallGoToArticleStep function?
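One thing worth noting (an untested guess based on the code above, not a confirmed fix): CallGoToArticleStep switches into the cafe_main iframe, and the reply textarea presumably lives inside that frame, so switching into it directly after loading the article URL may be enough, roughly:
# untested sketch: switch into the iframe the reply box presumably lives in,
# instead of clicking through the menu in CallGoToArticleStep()
driver.get(searchpath)
time.sleep(2)
driver.switch_to.frame('cafe_main')  # same frame CallGoToArticleStep switches into
replyElement = driver.find_element(
    By.XPATH, '//*[@id="app"]/div/div/div[2]/div[2]/div[4]/div[2]/div[1]/textarea')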

How to re-load page while looping over elements?

This is my code; it should be easy to reproduce:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

def main():
    # Setup chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Ensure GUI is off
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--window-size=1920x3500")
    # Set path to chromedriver as per your configuration
    webdriver_service = Service("/home/sumant/chromedriver/stable/chromedriver")
    # Choose Chrome Browser
    driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
    driver.maximize_window()
    # Get page
    url = "https://www.ibrance.com/"
    driver.get(url)
    time.sleep(2)
    ele = driver.find_elements_by_tag_name('a')
    for i, e in enumerate(ele):
        try:
            print(e.get_attribute('outerHTML'))
            e.click()
            time.sleep(2)
            driver.save_screenshot(f"/mnt/d/Work/ss{i}.png")
            driver.get(url)
            # driver.refresh()
        except:
            print("element not interactable")
    driver.close()
    driver.quit()

if __name__ == '__main__':
    main()
The idea is that I click on a link, take a screenshot, load the home page again, click on the next link, and so on.
After the first link, it is not able to find any other element on the reloaded page.
This is correct: after the refresh it is unable to find your required elements.
The elements need to be located again after each reload.
Do this:
ele = driver.find_elements_by_tag_name('a')
for i, e in enumerate(ele):
    try:
        print(e.get_attribute('outerHTML'))
        e.click()
        time.sleep(2)
        driver.save_screenshot(f"/mnt/d/Work/ss{i}.png")
        driver.get(url)
        driver.refresh()
        # reload elements after the page has been loaded again
        ele = driver.find_elements_by_tag_name('a')
    except:
        print("element not interactable")
So this worked (thanks YuMa, for the inspiration):
def main():
    # ...
    # Get page
    url = "https://www.ibrance.com/"
    driver.get(url)
    time.sleep(2)
    total_element = driver.find_elements_by_tag_name('a')
    total_clicks = len(total_element)

    def get_images(ele, i):
        try:
            ele[i].click()
            time.sleep(2)
            # driver.save_screenshot(f"/mnt/d/Work/ss{i}.png")
            print(driver.title)
            driver.get(url)
            time.sleep(2)
        except:
            print("")

    for i in range(0, total_clicks + 1):
        ele = driver.find_elements_by_tag_name('a')
        get_images(ele, i)
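Another way to sidestep the stale-element problem entirely (a sketch, not from the answers above): collect the href of every link up front and visit each one directly, so nothing has to be re-located after a reload. This assumes the links actually navigate via their href attribute; JavaScript-only links would still need clicking.
# sketch: gather all hrefs first, then visit them one by one
driver.get(url)
time.sleep(2)
hrefs = [a.get_attribute('href') for a in driver.find_elements_by_tag_name('a')
         if a.get_attribute('href')]
for i, href in enumerate(hrefs):
    driver.get(href)
    time.sleep(2)
    driver.save_screenshot(f"/mnt/d/Work/ss{i}.png")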

How to scroll correctly in a dynamically-loading webpage with Selenium?

Here's the link of the website: website
I would like to get all the links of the hotels in this location.
Here's my script:
import pandas as pd
import numpy as np
from selenium import webdriver
import time

PATH = r"driver\chromedriver.exe"
options = webdriver.ChromeOptions()
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1200,900")
options.add_argument('enable-logging')
driver = webdriver.Chrome(options=options, executable_path=PATH)
driver.get('https://fr.hotels.com/search.do?destination-id=10398359&q-check-in=2021-06-24&q-check-out=2021-06-25&q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER')

cookie = driver.find_element_by_xpath('//button[@class="uolsaJ"]')
try:
    cookie.click()
except:
    pass

for i in range(30):
    driver.execute_script("window.scrollBy(0, 1000)")
    time.sleep(5)

time.sleep(5)
my_elems = driver.find_elements_by_xpath('//a[@class="_61P-R0"]')
links = [my_elem.get_attribute("href") for my_elem in my_elems]
X = np.array(links)
print(X.shape)
#driver.close()
But I cannot find a way to tell the script: scroll down until there is nothing more to scroll.
I tried to change these parameters:
for i in range(30):
    driver.execute_script("window.scrollBy(0, 1000)")
    time.sleep(30)
I changed the time.sleep(), the number 1000 and so on, but the output keeps changing, and not in the right way.
[screenshot of the output]
As you can see, I get a different number of scraped links every run. How can I make my script scrape the same amount each time? Not necessarily every link, but at least a stable number.
It scrolls, and at some point it seems to get stuck and scrapes only the links it has at that moment. That's not what I want.
There are several issues here.
You are getting the elements and their links only AFTER you have finished scrolling, while you should do that inside the scrolling loop.
You should wait for the cookies alert to appear before closing it.
You can scroll until the footer element is present.
Something like this:
import pandas as pd
import numpy as np
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

PATH = r"driver\chromedriver.exe"
options = webdriver.ChromeOptions()
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1200,900")
options.add_argument('enable-logging')
driver = webdriver.Chrome(options=options, executable_path=PATH)
wait = WebDriverWait(driver, 20)
driver.get('https://fr.hotels.com/search.do?destination-id=10398359&q-check-in=2021-06-24&q-check-out=2021-06-25&q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER')
wait.until(EC.visibility_of_element_located((By.XPATH, '//button[@class="uolsaJ"]'))).click()

def is_element_visible(xpath):
    wait1 = WebDriverWait(driver, 2)
    try:
        wait1.until(EC.visibility_of_element_located((By.XPATH, xpath)))
        return True
    except Exception:
        return False

while not is_element_visible("//footer[@id='footer']"):
    my_elems = driver.find_elements_by_xpath('//a[@class="_61P-R0"]')
    links = [my_elem.get_attribute("href") for my_elem in my_elems]
    X = np.array(links)
    print(X.shape)
    driver.execute_script("window.scrollBy(0, 1000)")
    time.sleep(5)
#driver.close()
You can try this by directly querying the DOM and locating some element that appears only at the bottom of the page, using Selenium's .is_displayed() method, which returns True/False:
# https://stackoverflow.com/a/57076690/15164646
while True:
    # it keeps returning False until the element is located
    # ("#message" id = "No more results" at the bottom of a YouTube search)
    end_result = driver.find_element_by_css_selector('#message').is_displayed()
    driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
    # further code below
    # once the element is found it returns True; if so, break out of the while loop
    if end_result == True:
        break
I wrote a blog post where I used this method to scrape YouTube Search.
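A related pattern that is not tied to any particular element (a generic sketch, not from the original answer): keep scrolling until document.body.scrollHeight stops growing, which usually means the page has nothing more to load.
# generic sketch: scroll until the page height stops increasing
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)  # give lazy-loaded content time to appear
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height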

Scrape a table in a nested page that appears after a click, using Selenium

I want to scrape data inside nested tables on this page: https://www.wagertalk.com/freeOddsPage/page.html?sport=L5&date=2021-05-29&cb=0.01844398326591401
When you click on any cell, a new nested table appears; I want to scrape data from those nested tables.
I created a Python script that tries to use Selenium to click on each cell so the table shows up and I can scrape it, but either the Selenium browser didn't click or the nested tables didn't show:
u = 'https://www.wagertalk.com/freeOddsPage/page.html?sport=S8&date=2021-05-27&cb=0.6242232189793953'

import requests
import csv
import json
import datetime
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = Options()
# options.add_argument("--headless") #headless
#options.add_argument('--no-sandbox')
#options.add_argument('--ignore-certificate-errors')
#options.add_argument('--incognito')
driver = webdriver.Chrome(executable_path=r"C:/chromedriver.exe", options=options)
driver.get(u)
driver.maximize_window()
driver.implicitly_wait(60)  ## Wait for loading, in case of error
time.sleep(20)

soup = BeautifulSoup(driver.page_source, 'html.parser')
for i in soup.select('#schedule tbody tr[id^="g"]:has(.tennis_score_main)'):
    match_date = i.select_one('th:nth-of-type(1) div:nth-of-type(1)').text
    match_time = i.select_one('th:nth-of-type(1) div:nth-of-type(2)').text
    A_team = i.select_one('th:nth-of-type(3) div:nth-of-type(1) div:nth-of-type(1)').text if i.select_one('th:nth-of-type(3) div:nth-of-type(1) div:nth-of-type(1)') else i.select_one('th:nth-of-type(3) div:nth-of-type(1)').text
    H_team = i.select_one('th:nth-of-type(3) div:nth-of-type(2) div:nth-of-type(1)').text if i.select_one('th:nth-of-type(3) div:nth-of-type(2) div:nth-of-type(1)') else i.select_one('th:nth-of-type(3) div:nth-of-type(2)').text
    # I tried this:
    # WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#schedule tbody tr[id^="g"]:has(.scTD1):nth-of-type(1) .book.b10 div:nth-of-type(1)')))
    # and tried this:
    driver.execute_script("document.querySelector('#schedule tbody tr:has(.scTD1):nth-of-type(1) .book.b10 div:nth-of-type(1)').click()")
    # code to scrape the nested table here, but the table doesn't show

driver.quit()
I don't know what your problem was, because I could click a cell to open the popup window and later click the Close button to close it. But I didn't use BeautifulSoup, only Selenium.
I didn't check whether it works with all cells and rows.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
#from selenium.webdriver.firefox.options import Options
import time

url = 'https://www.wagertalk.com/freeOddsPage/page.html?sport=S8&date=2021-05-27&cb=0.6242232189793953'

options = Options()
#options.add_argument("--headless")
#options.add_argument('--no-sandbox')
#options.add_argument('--ignore-certificate-errors')
#options.add_argument('--incognito')

#driver = webdriver.Chrome(executable_path=r"C:/chromedriver.exe", options=options)
driver = webdriver.Chrome(options=options)
#driver = webdriver.Firefox(options=options)

driver.get(url)
driver.maximize_window()
driver.implicitly_wait(60)

for row in driver.find_elements_by_css_selector('tr[id^="g"]'):
    date_time = row.find_elements_by_css_selector('.time-started')
    match_date = date_time[0].text
    match_time = date_time[1].text
    print('date:', match_date, '| time:', match_time)

    teams = row.find_elements_by_css_selector('.team div')
    A_team = teams[0].text
    H_team = teams[1].text
    print('A_team:', A_team)
    print('H_team:', H_team)

    books = row.find_elements_by_css_selector('.book')
    for b in books:
        print('--- popup ---')
        # open .popupDiv
        b.click()
        time.sleep(1)
        # ... scrape table from .popupDiv ...
        tds = driver.find_elements_by_css_selector('.popupDiv table td')
        for t in tds:
            print(t.text)
        # close .popupDiv
        driver.find_element_by_css_selector('.popupDiv button').click()
    print('--- end row ---')

driver.quit()
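If you want to keep the popup data instead of just printing it, a small sketch using the csv module the question already imports (the filename and column layout are only placeholders):
import csv

rows = []  # one entry per popup: [match_date, match_time, A_team, H_team, *cell_texts]

# inside the `for b in books:` loop, after `tds` is read:
#     rows.append([match_date, match_time, A_team, H_team] + [t.text for t in tds])

# after the main loop finishes:
with open('odds.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(rows)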
