Why is scrolling interacting badly with my webscraping? - python

I'm trying to scrape all the corner betting odds for a given game at skybet, but it looks like scrolling is messing things up in my loop. When I print section.text it looks like it's doing what I want, but then it clicks the wrong element.
And when I don't scroll, it will only click on the first few odds sections before the code just freezes.
Any help would be really appreciated, thanks!
Also, I made odds_sections refresh itself at each iteration because I thought that might be the problem.
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Safari()
driver.get("https://m.skybet.com/football/competitions")
driver.maximize_window()

# click accept cookie
try:
    button_cookie = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, "//body/div[2]/div[1]/a[2]"))
    )
    button_cookie.click()
except:
    print("no cookie")

# find location of premier league
pl = driver.find_elements_by_class_name("split__title")
locate_pl = 0
link_name = pl[locate_pl].text
while link_name != "Premier League":
    locate_pl += 1
    link_name = pl[locate_pl].text
pl[locate_pl].click()
N = locate_pl + 1

# use N now to find pl matches
time.sleep(2)

# click on first match
button_match = driver.find_element_by_xpath("//div[@id='competitions']/ul[1]/li[{}]/div[1]/table[2]/tbody[1]/tr[2]/td[1]/a[1]".format(N))
teams = driver.find_element_by_xpath("//div[@id='competitions']/ul[1]/li[{}]/div[1]/table[2]/tbody[1]/tr[2]/td[1]/a[1]/b/span".format(N))
button_match.send_keys(Keys.ENTER)
time.sleep(2)

# find and click corners button
try:
    button_corners = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_1ouz2ki")))
    # button_corners = driver.find_elements_by_class_name("_1ouz2ki")
except:
    print("no corners")
n = 0
link_name = button_corners[n].text
while link_name != "Corners":
    n += 1
    link_name = button_corners[n].text
button_corners[n].click()

# Now we will scrape all corner odds for this game.
odds_sections = driver.find_elements_by_class_name('_t0tx82')
N_sections = len(odds_sections)
c = 0
scroll_to = 35
# the issue is within this loop
while c <= N_sections:
    odds_sections = driver.find_elements_by_class_name('_t0tx82')
    section = odds_sections[c]
    print(section.text)
    section.click()
    time.sleep(2)
    section.click()
    c += 1
    driver.execute_script("window.scrollTo(0,{})".format(scroll_to))
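For reference, a rough sketch of one common workaround, assuming the same _t0tx82 class name and not verified against the live skybet page: re-find the sections on every pass and scroll each element itself into view before clicking, rather than scrolling the window by a fixed offset.
N_sections = len(driver.find_elements_by_class_name('_t0tx82'))
for c in range(N_sections):
    # re-locate on every iteration so the reference can never go stale
    section = driver.find_elements_by_class_name('_t0tx82')[c]
    # scroll the element itself into view instead of a fixed pixel position
    driver.execute_script("arguments[0].scrollIntoView(true);", section)
    time.sleep(1)
    print(section.text)
    section.click()
    time.sleep(2)
    section.click()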

Related

How to scrape all the pages in the website

https://www.bestbuy.com/site/promo/health-fitness-deals
I want to loop through these 10 pages and scrape their names and hrefs
Below is my code which only scrapes the 1st page continuously 10 times:
def name():
    for i in range(1, 11):
        tag = driver.find_elements_by_xpath('/html/body/div[4]/main/div[9]/div/div/div/div/div/div/div[2]/div[2]/div[3]/div/div[5]/ol/li[3]/div/div/div/div/div/div[2]/div[1]/div[2]/div/h4')
        for a in tag:
            for name in a.find_elements_by_tag_name('a'):
                links = name.get_attribute("href")
                names = name.get_attribute('text')
                watches_name.append(names)
                watches_link.append(links)
        # print(watches_name)
        # print(watches_link)

name()
If you want to get elements from the next pages then you have to click() on the > link:
driver.find_element_by_css_selector('.sku-list-page-next').click()
Minimal working code with other changes.
I reduced the XPath to something much simpler. And I keep name, link as a pair because it is simpler to write to a CSV file or a database, or to filter and sort.
I had to use a longer sleep - sometimes my browser needs more time to update the elements on the page.
from selenium import webdriver
import time

url = 'https://www.bestbuy.com/site/promo/health-fitness-deals'

driver = webdriver.Firefox()
driver.get(url)
time.sleep(2)

# page "Hello! Choose a Country" - select the United States flag
driver.find_element_by_class_name('us-link').click()

items = []

for page in range(1, 11):
    print('\n[DEBUG] wait 15 seconds to update page\n')
    time.sleep(15)

    print('\n--- page', page, '---\n')

    all_links = driver.find_elements_by_css_selector('#main-results h4 a')
    for a in all_links:
        link = a.get_attribute("href")
        name = a.get_attribute('text')
        items.append( [name, link] )
        print(name)

    print('\n[DEBUG] click next\n')
    driver.find_element_by_css_selector('.sku-list-page-next').click()

#print(items)
BTW:
This method could be done with while True and some way to recognize whether the > link is still there - and exit the loop when it is not. That way it could work with any number of pages.
Other method:
When you manually visit a few pages you should see that the second page has a URL with ?cp=2, the third with ?cp=3, etc., so you could use that to load the pages directly:
driver.get(url + '?cp=' + str(page+1) )
Minimal working code.
from selenium import webdriver
import time

url = 'https://www.bestbuy.com/site/promo/health-fitness-deals'

driver = webdriver.Firefox()
driver.get(url)
time.sleep(2)

# page "Hello! Choose a Country" - select the United States flag
driver.find_element_by_class_name('us-link').click()

items = []

for page in range(1, 11):
    print('\n[DEBUG] wait 15 seconds to update page\n')
    time.sleep(15)

    print('\n--- page', page, '---\n')

    all_links = driver.find_elements_by_css_selector('#main-results h4 a')
    for a in all_links:
        link = a.get_attribute("href")
        name = a.get_attribute('text')
        items.append( [name, link] )
        print(name)

    print('\n[DEBUG] load next url\n')
    driver.get(url + '?cp=' + str(page+1) )

#print(items)
This method could also use while True and a page variable to handle any number of pages.
EDIT:
Versions with while True
from selenium import webdriver
import time

url = 'https://www.bestbuy.com/site/promo/health-fitness-deals'

driver = webdriver.Firefox()
driver.get(url)
time.sleep(2)

# page "Hello! Choose a Country" - select the United States flag
driver.find_element_by_class_name('us-link').click()

items = []
page = 1

while True:
    print('\n[DEBUG] wait 15 seconds to update page\n')
    time.sleep(15)

    print('\n--- page', page, '---\n')

    all_links = driver.find_elements_by_css_selector('#main-results h4 a')
    for a in all_links:
        link = a.get_attribute("href")
        name = a.get_attribute('text')
        items.append( [name, link] )
        print(name)

    page += 1
    print('\n[DEBUG] load next url\n')
    driver.get(url + '?cp=' + str(page) )

    if driver.title == 'Best Buy: Page Not Found':
        print('\n[DEBUG] exit loop\n')
        break

#print(items)
and
from selenium import webdriver
import time

url = 'https://www.bestbuy.com/site/promo/health-fitness-deals'

driver = webdriver.Firefox()
driver.get(url)
time.sleep(2)

# page "Hello! Choose a Country" - select the United States flag
driver.find_element_by_class_name('us-link').click()

items = []
page = 1

while True:
    print('\n[DEBUG] wait 15 seconds to update page\n')
    time.sleep(15)

    print('\n--- page', page, '---\n')

    all_links = driver.find_elements_by_css_selector('#main-results h4 a')
    for a in all_links:
        link = a.get_attribute("href")
        name = a.get_attribute('text')
        items.append( [name, link] )
        print(name)

    page += 1
    print('\n[DEBUG] click next\n')
    item = driver.find_element_by_css_selector('.sku-list-page-next')
    if item.get_attribute("href"):
        item.click()
    else:
        print('\n[DEBUG] exit loop\n')
        break

#print(items)
I guess if your code is working right, you will just need to click the pagination button. I found it can be located with the CSS selector '#Caret_Right_Line_Sm'. Try adding this line to your function:
def name():
    for i in range(1, 11):
        tag = driver.find_elements_by_xpath('/html/body/div[4]/main/div[9]/div/div/div/div/div/div/div[2]/div[2]/div[3]/div/div[5]/ol/li[3]/div/div/div/div/div/div[2]/div[1]/div[2]/div/h4')
        for a in tag:
            for name in a.find_elements_by_tag_name('a'):
                links = name.get_attribute("href")
                names = name.get_attribute('text')
                watches_name.append(names)
                watches_link.append(links)
        # print(watches_name)
        # print(watches_link)
        driver.find_elements_by_css_selector('#Caret_Right_Line_Sm')[1].click()

name()

having trouble looping through elements with selenium python

I've looked all through Stack Overflow to try to find the answer to this but couldn't. The problem with my code is that it clicks the first element and then gets the 'href' I want, but it stops right after that and throws errors like
box[x].click()
&
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
Here's the code
box = driver.find_elements_by_class_name("info-section.info-primary")
x = 0
# for x in range(0, len(box)):
while True:
    while x <= len(box):
        # if box[x].is_displayed():
        driver.implicitly_wait(2)
        # error is happening here
        box[x].click()
        x += 1
        try:
            website = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "primary-btn.website-link"))
            )
            print(website.get_attribute('href'))
            driver.back()
        except:
            driver.back()
    if not driver.find_element_by_class_name('ajax-page'):
        break
    else:
        driver.find_element_by_class_name('ajax-page').click()
You are getting the StaleElementReferenceException because you define box, navigate to another page, and then try to use the box variable again. The quickest way to resolve this is to re-locate the element on each loop instead of reusing the stored variable:
box = driver.find_elements_by_class_name("info-section.info-primary")
x = 0
# for x in range(0, len(box)):
while True:
    while x <= len(box):
        # if box[x].is_displayed():
        driver.implicitly_wait(2)
        # error is happening here
        driver.find_elements_by_class_name("info-section.info-primary")[x].click()
        x += 1
        try:
            website = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "primary-btn.website-link"))
            )
            print(website.get_attribute('href'))
            driver.back()
        except:
            driver.back()
    if not driver.find_element_by_class_name('ajax-page'):
        break
    else:
        driver.find_element_by_class_name('ajax-page').click()
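A different way to sidestep stale references entirely - assuming each result card exposes its detail-page URL in an <a href> (not verified for this site) - is to collect all the URLs first and only then navigate:
from selenium.common.exceptions import TimeoutException

# collect every detail-page URL up front, before any navigation happens
cards = driver.find_elements_by_css_selector(".info-section.info-primary a")
detail_urls = [a.get_attribute("href") for a in cards if a.get_attribute("href")]

for url in detail_urls:
    driver.get(url)  # no clicking on stored elements, so nothing can go stale
    try:
        website = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "primary-btn.website-link"))
        )
        print(website.get_attribute('href'))
    except TimeoutException:
        pass  # this profile has no website link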

Optimizing python web scraping script with Selenium

I'm having an issue with my web scraping script with Selenium
Normally, the script can run smoothly.
However, I usually get the following error within the inner for loop
(I believe the script runs too fast, before the elements become visible):
NoSuchElementException Traceback (most recent call last)
<ipython-input-6-470748a6674f> in <module>
66 item_brand.append(driver.find_element_by_xpath('.//*[@id="brand"]/a/span/bdi').get_attribute('textContent'))
67 item_prices.append(driver.find_element_by_css_selector('[id="price"]').text)
---> 68 item_names.append(driver1.find_element_by_css_selector('[class="nav-product-link-text"] span').text)
69 total_rate.append(driver1.find_element_by_class_name('css-i36p8g').text)
70 review_contents.append(containers.find_element_by_class_name('review-text').text)
......
"NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[class="nav-product-link-text"] span"}"
I had to add driver.implicitly_wait(3) within the for loop so it would wait until the elements are visible, but it didn't work (see the explicit-wait sketch after the script below).
Please check my script below:
driver = webdriver.Chrome(chrome_path)
driver1 = webdriver.Chrome(chrome_path)

# Create lists for the dataframe:
item_names = list()
item_description = list()
item_brand = list()
review_titles = list()
review_contents = list()
product_helpful = list()
product_not_helpful = list()
member_rating = list()
total_rate = list()
item_prices = list()
item_images = list()

URL = "https://ca.iherb.com/c/Vitamins?sr=2&noi=48&p="

for n in range(1, 2):
    driver.get(f"{URL}{n}")  # modify the page numbers to scrape the products information
    # driver.get(f"https://ca.iherb.com/c/Vitamins?sr=2&noi=48&p={n}".format(n+1))
    wait = WebDriverWait(driver, 10)

    # Store all the links in a list
    item_links = [item.get_attribute("href") for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".absolute-link-wrapper > a.product-link")))]

    # Iterate over the links
    for item_link in item_links:
        driver.get(item_link)

        # Locate and click on the `View All Reviews` link
        all_reviews_link = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "span.all-reviews-link > a")))
        time.sleep(2)
        x = all_reviews_link.get_attribute("href")

        MAX_PAGE_NUM = 60  # Scrape maximum 60 pages in the review section
        for i in range(1, MAX_PAGE_NUM + 1):
            page_num = str(i)
            url = x + '?&p=' + page_num
            print(url)
            driver1.get(url)

            review_containers = driver1.find_elements_by_class_name('review-row')
            for containers in review_containers:
                driver.implicitly_wait(3)  # waiting for the browser to see the website elements
                elements = ', '.join([item.text for item in driver.find_elements_by_css_selector("[itemprop='description'] > ul:nth-of-type(1) > li")])
                item_description.append(elements)
                item_images.append(driver.find_element_by_xpath('//*[@id="product-image"]/div[1]/a').get_attribute('href'))
                item_brand.append(driver.find_element_by_xpath('.//*[@id="brand"]/a/span/bdi').get_attribute('textContent'))
                item_prices.append(driver.find_element_by_css_selector('[id="price"]').text)
                item_names.append(driver1.find_element_by_css_selector('[class="nav-product-link-text"] span').text)
                total_rate.append(driver1.find_element_by_class_name('css-i36p8g').text)
                review_contents.append(containers.find_element_by_class_name('review-text').text)
                product_helpful.append(containers.find_element_by_css_selector('[title="Helpful"] span').text)
                product_not_helpful.append(containers.find_element_by_css_selector('[title="Unhelpful"] span').text)

                stars = containers.find_elements_by_class_name("css-172co2l")
                rating = 0
                for star in stars:
                    star_color = star.find_element_by_tag_name("path").get_attribute("fill")
                    if star_color != "transparent":
                        rating += 1
                member_rating.append(rating)

            time.sleep(2)  # Slow the script down

driver.quit()
Please help to check this issue for me. I really appreciate it.
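As a reference for the failing selector, here is a minimal explicit-wait sketch on driver1 (the selector is copied from the traceback above; whether it still matches the current iHerb markup is not verified):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

wait1 = WebDriverWait(driver1, 10)
try:
    # block until the product-name element is actually present on the review page
    name_el = wait1.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '[class="nav-product-link-text"] span')))
    item_names.append(name_el.text)
except TimeoutException:
    item_names.append(None)  # keep the lists aligned even when the element never appears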

How to continue a script even if there is a missing element on the current page?

I am working on a scraping project and am trying to scrape many different profiles. Not all of the profiles have the same information, so I want to skip that piece of data if the current profile does not have it. Here is my current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep

driver = webdriver.Chrome("MY DIRECTORY")
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))

body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

while len(profile_count) < count:   # Get links up to "count"
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

for link in profile_count:   # Calling up links
    temp = link.get_attribute('href')   # temp for
    driver.execute_script("window.open('');")   # open new tab
    driver.switch_to.window(driver.window_handles[1])   # focus new tab
    driver.get(temp)

    ##### SCRAPE CODE #####

    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div')
    IssuedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[1]/div[2]')
    CertificationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]')
    CertfiedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]')
    RecertificationCycle = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[3]/div[2]')
    Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]')
    AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a')

    print(Name.text + " : " + IssuedBy.text + " : " + CertificationNumber.text + " : " + CertfiedSince.text + " : " + RecertificationCycle.text + " : " + Expires.text + " : " + AccreditedBy.text)

    driver.close()
    driver.switch_to.window(driver.window_handles[0])

driver.close()
Please let me know how I would be able to skip an element if it is not present on the current profile.
According to the docs, find_element_by_xpath() raises a NoSuchElementException if the element you're looking for couldn't be found.
I suggest handling potential NoSuchElementExceptions accordingly. What proper exception handling looks like depends on what you're trying to achieve: you might want to log an error, assign default values, skip certain follow-up actions...
try:
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div')
except NoSuchElementException:
    Name = "Default Name"
You could even wrap multiple find_element_by_xpath() calls in your try block.
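For instance, a rough sketch using two of the XPaths from the question (the trade-off being that a single missing field sends the whole profile to the except branch):
from selenium.common.exceptions import NoSuchElementException

try:
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div')
    IssuedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[1]/div[2]')
    print(Name.text + " : " + IssuedBy.text)
except NoSuchElementException:
    # any missing field lands here, so the whole profile is skipped in one place
    print("profile skipped - missing field")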
That will fix the try: ... except: ... part, but you have some other errors too. I fixed them all.
Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep

driver = webdriver.Chrome('chromedriver')
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))

body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

c = 1
while c <= count:
    for link in profile_count:   # Calling up links
        temp = link.get_attribute('href')   # temp for
        driver.execute_script("window.open('');")   # open new tab
        driver.switch_to.window(driver.window_handles[1])   # focus new tab
        driver.get(temp)
        sleep(1)

        ##### SCRAPE CODE #####
        try:
            Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div')
            IssuedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[1]/div[2]')
            CertificationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]')
            CertfiedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]')
            RecertificationCycle = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[3]/div[2]')
        except:
            c -= 1

        driver.switch_to.window(driver.window_handles[0])
        c += 1
        if c > count:
            break

driver.quit()

Python Selenium: Scrolling not working

I am trying to automate this Instagram link. I need to keep scrolling and fetch all the links. I am trying the following, but it's not working.
def fetch_links_by_hashtag(hash_tag):
    url = 'https://www.instagram.com/explore/tags/marketing/'
    driver.get(url)
    driver.implicitly_wait(20)

    is_more = False
    try:
        elem_more = wait.until(EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "Load more")))
        elem_more.click()
        is_more = True
    except Exception as ex:
        print(str(ex))

    pop = driver.find_element_by_tag_name('footer')
    # pop = driver.find_element_by_link_text('About us')
    # pop = driver.find_element_by_class_name('_4gt3b')
    if pop is not None:
        for i in range(10):
            print('Calling scrolling script')
            # It scrolls till the end
            driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', pop)
            sleep(4)
            html = pop.get_attribute('innerHTML')
            print(html)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
How to scroll down to the bottom of a page?
In addition to 宏杰李's answer:
driver.execute_script("return arguments[0].scrollIntoView();", element_obj)
Also, if you want to make an extra scroll:
driver.execute_script("return arguments[0].parentNode.scrollTop = "
                      "arguments[0].parentNode.scrollTop + {extra_scroll}"
                      .format(extra_scroll=extra_scroll_pixels), element_obj)
My entire code:
def _scroll_to_element(driver, element, extra_scroll=None):
    # Scroll to element
    driver.execute_script("return arguments[0].scrollIntoView();", element)

    # Scroll parentNode with the extra pixels (If provided)
    if extra_scroll:
        driver.execute_script(
            "return arguments[0].parentNode.scrollTop = "
            "arguments[0].parentNode.scrollTop + {extra_scroll}".format(
                extra_scroll=str(extra_scroll)), element)
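A usage sketch tied back to the footer element from the question (the 500-pixel extra scroll is just an example value):
pop = driver.find_element_by_tag_name('footer')
# scroll the footer into view, then nudge its parent container another 500 px
_scroll_to_element(driver, pop, extra_scroll=500)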
