Optimizing a Python web scraping script with Selenium

I'm having an issue with my Selenium web scraping script.
Normally, the script runs smoothly.
However, I usually get this error inside the for loop
(I believe the script runs too fast, before the elements become visible):
NoSuchElementException Traceback (most recent call last)
<ipython-input-6-470748a6674f> in <module>
66 item_brand.append(driver.find_element_by_xpath('.//*[@id="brand"]/a/span/bdi').get_attribute('textContent'))
67 item_prices.append(driver.find_element_by_css_selector('[id="price"]').text)
---> 68 item_names.append(driver1.find_element_by_css_selector('[class="nav-product-link-text"] span').text)
69 total_rate.append(driver1.find_element_by_class_name('css-i36p8g').text)
70 review_contents.append(containers.find_element_by_class_name('review-text').text)
......
"NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[class="nav-product-link-text"] span"}"
I had to add driver.implicitly_wait(3) inside the for loop so it would wait until the elements are visible, but it didn't work.
Please help me check my script below:
driver = webdriver.Chrome(chrome_path)
driver1 = webdriver.Chrome(chrome_path)

# Create lists for the dataframe:
item_names = list()
item_description = list()
item_brand = list()
review_titles = list()
review_contents = list()
product_helpful = list()
product_not_helpful = list()
member_rating = list()
total_rate = list()
item_prices = list()
item_images = list()

URL = "https://ca.iherb.com/c/Vitamins?sr=2&noi=48&p="
for n in range(1, 2):
    driver.get(f"{URL}{n}")  # modify the page numbers to scrape the products information
    # driver.get(f"https://ca.iherb.com/c/Vitamins?sr=2&noi=48&p={n}".format(n+1))
    wait = WebDriverWait(driver, 10)
    # Store all the links in a list
    item_links = [item.get_attribute("href") for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".absolute-link-wrapper > a.product-link")))]
    # Iterate over the links
    for item_link in item_links:
        driver.get(item_link)
        # Locate and click on the `View All Reviews` link
        all_reviews_link = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "span.all-reviews-link > a")))
        time.sleep(2)
        x = all_reviews_link.get_attribute("href")
        MAX_PAGE_NUM = 60  # Scrape a maximum of 60 pages in the review section
        for i in range(1, MAX_PAGE_NUM + 1):
            page_num = str(i)
            url = x + '?&p=' + page_num
            print(url)
            driver1.get(url)
            review_containers = driver1.find_elements_by_class_name('review-row')
            for containers in review_containers:
                driver.implicitly_wait(3)  # waiting for the browser to see the website elements
                elements = ', '.join([item.text for item in driver.find_elements_by_css_selector("[itemprop='description'] > ul:nth-of-type(1) > li")])
                item_description.append(elements)
                item_images.append(driver.find_element_by_xpath('//*[@id="product-image"]/div[1]/a').get_attribute('href'))
                item_brand.append(driver.find_element_by_xpath('.//*[@id="brand"]/a/span/bdi').get_attribute('textContent'))
                item_prices.append(driver.find_element_by_css_selector('[id="price"]').text)
                item_names.append(driver1.find_element_by_css_selector('[class="nav-product-link-text"] span').text)
                total_rate.append(driver1.find_element_by_class_name('css-i36p8g').text)
                review_contents.append(containers.find_element_by_class_name('review-text').text)
                product_helpful.append(containers.find_element_by_css_selector('[title="Helpful"] span').text)
                product_not_helpful.append(containers.find_element_by_css_selector('[title="Unhelpful"] span').text)
                stars = containers.find_elements_by_class_name("css-172co2l")
                rating = 0
                for star in stars:
                    star_color = star.find_element_by_tag_name("path").get_attribute("fill")
                    if star_color != "transparent":
                        rating += 1
                member_rating.append(rating)
            time.sleep(2)  # Slow the script down
driver.quit()
Please help me check this issue. I really appreciate it.
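For what it's worth, a common fix for this kind of intermittent NoSuchElementException is to replace the implicit wait with an explicit wait on the specific element that fails. A minimal sketch of that idea, reusing the selectors above (untested against the live site):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait1 = WebDriverWait(driver1, 10)

# Block until the product name node actually exists before reading its text
name_el = wait1.until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, '[class="nav-product-link-text"] span')
    )
)
item_names.append(name_el.text)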

Related

How to scrape all the pages of a website

https://www.bestbuy.com/site/promo/health-fitness-deals
I want to loop through these 10 pages and scrape the names and hrefs.
Below is my code, which only scrapes the 1st page 10 times over:
def name():
    for i in range(1, 11):
        tag = driver.find_elements_by_xpath('/html/body/div[4]/main/div[9]/div/div/div/div/div/div/div[2]/div[2]/div[3]/div/div[5]/ol/li[3]/div/div/div/div/div/div[2]/div[1]/div[2]/div/h4')
        for a in tag:
            for name in a.find_elements_by_tag_name('a'):
                links = name.get_attribute("href")
                names = name.get_attribute('text')
                watches_name.append(names)
                watches_link.append(links)
        # print(watches_name)
        # print(watches_link)

name()
If you want to get elements from the next pages, then you have to click() on the > link:
driver.find_element_by_css_selector('.sku-list-page-next').click()
Minimal working code with other changes.
I reduced the XPath to something much simpler, and I keep name and link as a pair because that is simpler to write to a CSV file or a database, or to filter and sort.
I had to use a longer sleep - sometimes my browser needs more time to update the elements on the page.
from selenium import webdriver
import time

url = 'https://www.bestbuy.com/site/promo/health-fitness-deals'

driver = webdriver.Firefox()
driver.get(url)
time.sleep(2)

# page "Hello! Choose a Country" - select the United States flag
driver.find_element_by_class_name('us-link').click()

items = []

for page in range(1, 11):
    print('\n[DEBUG] wait 15 seconds to update page\n')
    time.sleep(15)

    print('\n--- page', page, '---\n')

    all_links = driver.find_elements_by_css_selector('#main-results h4 a')
    for a in all_links:
        link = a.get_attribute("href")
        name = a.get_attribute('text')
        items.append([name, link])
        print(name)

    print('\n[DEBUG] click next\n')
    driver.find_element_by_css_selector('.sku-list-page-next').click()

#print(items)
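Since the items are kept as [name, link] pairs, writing them out is straightforward; for example, a minimal sketch using the standard csv module (the items.csv filename is just an example):

import csv

# Save the collected [name, link] pairs to a CSV file
with open('items.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'link'])  # header row
    writer.writerows(items)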
BTW:
This method could be done with while True and some way to recognize whether the > link is still there - and exit the loop when there is no >. That way it could work with any number of pages.
Other method.
When you manually visit a few pages you should see that the second page has a URL with ?cp=2, the third with ?cp=3, etc., so you could use that to load the pages:
driver.get(url + '?cp=' + str(page+1))
Minimal working code.
from selenium import webdriver
import time

url = 'https://www.bestbuy.com/site/promo/health-fitness-deals'

driver = webdriver.Firefox()
driver.get(url)
time.sleep(2)

# page "Hello! Choose a Country" - select the United States flag
driver.find_element_by_class_name('us-link').click()

items = []

for page in range(1, 11):
    print('\n[DEBUG] wait 15 seconds to update page\n')
    time.sleep(15)

    print('\n--- page', page, '---\n')

    all_links = driver.find_elements_by_css_selector('#main-results h4 a')
    for a in all_links:
        link = a.get_attribute("href")
        name = a.get_attribute('text')
        items.append([name, link])
        print(name)

    print('\n[DEBUG] load next url\n')
    driver.get(url + '?cp=' + str(page+1))

#print(items)
This method could also use while True and a page variable to get any number of pages.
EDIT:
Versions with while True
from selenium import webdriver
import time

url = 'https://www.bestbuy.com/site/promo/health-fitness-deals'

driver = webdriver.Firefox()
driver.get(url)
time.sleep(2)

# page "Hello! Choose a Country" - select the United States flag
driver.find_element_by_class_name('us-link').click()

items = []
page = 1

while True:
    print('\n[DEBUG] wait 15 seconds to update page\n')
    time.sleep(15)

    print('\n--- page', page, '---\n')

    all_links = driver.find_elements_by_css_selector('#main-results h4 a')
    for a in all_links:
        link = a.get_attribute("href")
        name = a.get_attribute('text')
        items.append([name, link])
        print(name)

    page += 1
    print('\n[DEBUG] load next url\n')
    driver.get(url + '?cp=' + str(page))

    if driver.title == 'Best Buy: Page Not Found':
        print('\n[DEBUG] exit loop\n')
        break

#print(items)
and
from selenium import webdriver
import time

url = 'https://www.bestbuy.com/site/promo/health-fitness-deals'

driver = webdriver.Firefox()
driver.get(url)
time.sleep(2)

# page "Hello! Choose a Country" - select the United States flag
driver.find_element_by_class_name('us-link').click()

items = []
page = 1

while True:
    print('\n[DEBUG] wait 15 seconds to update page\n')
    time.sleep(15)

    print('\n--- page', page, '---\n')

    all_links = driver.find_elements_by_css_selector('#main-results h4 a')
    for a in all_links:
        link = a.get_attribute("href")
        name = a.get_attribute('text')
        items.append([name, link])
        print(name)

    page += 1
    print('\n[DEBUG] click next\n')
    item = driver.find_element_by_css_selector('.sku-list-page-next')
    if item.get_attribute("href"):
        item.click()
    else:
        print('\n[DEBUG] exit loop\n')
        break

#print(items)
I guess if your code is working right, you will just need to click the pagination button. I found it can be located with the CSS selector '#Caret_Right_Line_Sm'. Try adding this line to your function:
def name():
    for i in range(1, 11):
        tag = driver.find_elements_by_xpath('/html/body/div[4]/main/div[9]/div/div/div/div/div/div/div[2]/div[2]/div[3]/div/div[5]/ol/li[3]/div/div/div/div/div/div[2]/div[1]/div[2]/div/h4')
        for a in tag:
            for name in a.find_elements_by_tag_name('a'):
                links = name.get_attribute("href")
                names = name.get_attribute('text')
                watches_name.append(names)
                watches_link.append(links)
        # print(watches_name)
        # print(watches_link)
        driver.find_elements_by_css_selector('#Caret_Right_Line_Sm')[1].click()

name()

Why is scrolling interacting badly with my web scraping?

I'm trying to scrape all the corner betting odds for a given game on SkyBet, but it looks like scrolling is messing things up in my loop. When I print section.text it looks like it's doing what I want, but then it clicks the wrong thing.
And when I don't scroll, it will only click on the first few odds sections before the code just freezes.
Any help would be really appreciated, thanks!
Also, I made odds_sections refresh itself at each iteration because I thought that might be the problem.
driver = webdriver.Safari()
driver.get("https://m.skybet.com/football/competitions")
driver.maximize_window()

# click accept cookie
try:
    button_cookie = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, "//body/div[2]/div[1]/a[2]"))
    )
    button_cookie.click()
except:
    print("no cookie")

# find location of premier league
pl = driver.find_elements_by_class_name("split__title")
locate_pl = 0
link_name = pl[locate_pl].text
while link_name != "Premier League":
    locate_pl += 1
    link_name = pl[locate_pl].text
pl[locate_pl].click()
N = locate_pl + 1
# use N now to find pl matches

time.sleep(2)

# click on first match
button_match = driver.find_element_by_xpath("//div[@id='competitions']/ul[1]/li[{}]/div[1]/table[2]/tbody[1]/tr[2]/td[1]/a[1]".format(N))
teams = driver.find_element_by_xpath("//div[@id='competitions']/ul[1]/li[{}]/div[1]/table[2]/tbody[1]/tr[2]/td[1]/a[1]/b/span".format(N))
button_match.send_keys(Keys.ENTER)
time.sleep(2)

# find and click corners button
try:
    button_corners = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_1ouz2ki")))
    # button_corners = driver.find_elements_by_class_name("_1ouz2ki")
except:
    print("no corners")

n = 0
link_name = button_corners[n].text
while link_name != "Corners":
    n += 1
    link_name = button_corners[n].text
button_corners[n].click()

# Now we will scrape all corner odds for this game.
odds_sections = driver.find_elements_by_class_name('_t0tx82')
N_sections = len(odds_sections)

c = 0
scroll_to = 35
# the issue is within this loop
while c <= N_sections:
    odds_sections = driver.find_elements_by_class_name('_t0tx82')
    section = odds_sections[c]
    print(section.text)
    section.click()
    time.sleep(2)
    section.click()
    c += 1
    driver.execute_script("window.scrollTo(0,{})".format(scroll_to))
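One thing that often helps with this kind of wrong-element click is to scroll the specific section into view and re-locate it just before clicking, rather than scrolling the window to a fixed offset. A minimal sketch of that idea, assuming the same _t0tx82 class (untested against the live page):

for c in range(N_sections):
    # Re-find the sections on each iteration so we never click a stale element
    odds_sections = driver.find_elements_by_class_name('_t0tx82')
    section = odds_sections[c]
    # Scroll this particular section into the middle of the viewport before interacting
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", section)
    print(section.text)
    section.click()
    time.sleep(2)
    section.click()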

python selenium error: element is not attached to the page document

I am scraping Banggood. The problem is that the driver opens just the first link and then doesn't go to the next link in the links list (the next product),
and I get this error on line 24:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
But when I tried to print the links outside the loop, I got all of them:
print(links[0].get_attribute('href'))
print(links[2].get_attribute('href'))
main code:
import time
from selenium import webdriver  # THIS IS MAIN SCRIPT

driver = webdriver.Chrome(executable_path='C:\\Users\\Compu City\\Desktop\\chromedriver.exe')  # DRIVER LOCATION
driver.get('https://usa.banggood.com/Deals_Electronics.html#dealscategories2')
driver.implicitly_wait(30)

links = driver.find_elements_by_css_selector('body > div.flashdeals-container.fixed > div.main > div.product-list.cf > ul > li > a.products_name.exclick')
# links has 25 links

product = 0
while product <= len(links):
    driver.get(links[product].get_attribute('href'))
    try:  # TITLE
        title = driver.find_element_by_css_selector('#centerCtrl > div.title_hd > h2 > strong')
        print(title.text)
    except:
        print('no title')
    try:  # NEW PRICE
        new_price = driver.find_element_by_css_selector('#centerCtrl > div.itemBox > div.item_price_box > div.item_now_price')
        print(new_price.text)
    except:
        print('no new price')
    try:  # OLD PRICE
        old_price = driver.find_element_by_css_selector('#centerCtrl > div.itemBox > div.item_price_box > div.item_old_price')
        print(old_price.text)
    except:
        print('no old price')
    try:  # image
        image = driver.find_element_by_css_selector('#landingImage').get_attribute('src')
        print(image)
    except:
        print('no image')
    product += 1
Try this - collect the href strings into a plain list first, then navigate to them, so you never reuse an element after the page has changed:
v = []
for x in links:
    v.append(x.get_attribute('href'))  # store the plain href string, not the element
print(len(v))

# quick check that navigating by stored URL works
driver.get(v[1])
time.sleep(10)
driver.get(v[2])

product = 0
while product < len(v):
    driver.get(v[product])
    product += 1
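A slightly more idiomatic sketch of the same idea, folding the href collection into the original scraping loop (same selectors as above, untested):

# Collect plain URL strings first; strings cannot go stale the way WebElements do
hrefs = [link.get_attribute('href') for link in links]

for href in hrefs:
    driver.get(href)
    try:  # TITLE
        print(driver.find_element_by_css_selector('#centerCtrl > div.title_hd > h2 > strong').text)
    except:
        print('no title')
    # ...repeat for the price and image selectors as in the original script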

Python: print results which contain a specific string

I am trying to get the Google search result descriptions.
from selenium import webdriver
import re

chrome_path = r"C:\Users\xxxx\Downloads\Compressed\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get("https://www.google.co.in/search?q=stackoverflow")

posts = driver.find_elements_by_class_name("st")
for post in posts:
    print(post.text)
Here I'm getting the correct results,
but I only want to print the links from the descriptions,
and I want results from 5 Google search pages; here I am only getting them from 1 page.
I have tried using
print(post.get_attribute('href'))
but the description links are not clickable, so this returns None.
Try the below code:
for i in range(1, 6, 1):
    print("--------------------------------------------------------------------")
    print("Page "+str(i)+" Results : ")
    print("--------------------------------------------------------------------")

    staticLinks = driver.find_elements_by_xpath("//*[@class='st']")
    for desc in staticLinks:
        txt = desc.text + ''
        if txt.count('http://') > 0 or txt.count('https://') > 0:
            for c in txt.split():
                if c.startswith('http') or c.startswith('https'):
                    print(c)

    dynamicLinks = driver.find_elements_by_xpath("//*[@class='st']//a")
    for desc in dynamicLinks:
        link = desc.get_attribute('href')
        if link is not None:
            print(link)

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    nextPage = driver.find_element_by_xpath("//a[@aria-label='Page "+str(i+1)+"']")
    nextPage.click()
This fetches both the static and the dynamic links from the descriptions on Google's first 5 search result pages.
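If you prefer, the http/https extraction from the description text can also be done with a regular expression rather than splitting and checking prefixes; a minimal sketch (assuming the descriptions are plain text):

import re

URL_PATTERN = re.compile(r'https?://\S+')

for desc in staticLinks:
    # findall returns every URL-looking substring in the description text
    for match in URL_PATTERN.findall(desc.text):
        print(match)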

python selenium break off when access all 'a' tags

I can access all the 'a' tags, i.e. all the 'house detail' hyperlinks, using Python + Selenium on "https://www.zillow.com/homes/recently_sold/Culver-City-CA/house,condo,apartment_duplex,townhouse_type/51617_rid/12m_days/globalrelevanceex_sort/34.044908,-118.348417,33.961088,-118.468924_rect/12_zm/", but the script breaks off when I go into each page to crawl the information, and that's what troubles me. I want the code to crawl all the information across the 26 pages successfully (a possible workaround is sketched after the code below). Thank you!
# coding: utf-8
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.maximize_window()

def crawlHouseDetailForInvoke():
    try:
        driver.find_element_by_class_name("collapsible-header").click()  # price/tax history
        time.sleep(5)
        table = driver.find_element_by_xpath('//div[@id = "wrapper"]//div[@id = "detail-container-column"]//section[3]/div[@id = "tax-price-history"]/div[@id = "hdp-price-history"]/div/table')
        print(table.text)
    except Exception:
        print("Failed to read the data!")

def crawlRegion(url):
    driver.get(url)
    page_links = driver.find_elements_by_xpath('//div[@id = "search-pagination-wrapper"]/ol/li/a')
    print("Number of result pages: %d" % len(page_links))
    house_link_parent = driver.find_element_by_id('list-results')
    house_links = house_link_parent.find_elements_by_xpath('//div[@id = "search-results"]/ul/li/article/div/a')
    print("Listings per page: %d" % len(house_links))
    times = 0
    for j in range(len(house_links)):
        times = times + 1
        if times % 9 == 0:
            print("Elements reloaded...")
            house_link_parent = driver.find_element_by_id('list-results')
            house_links = house_link_parent.find_elements_by_xpath('//div[@id = "search-results"]/ul/li/article/div/a')
        print("Index: %d" % j)
        print("Link: %s" % house_links[j].get_attribute("href"))
        house_links[j].click()
        time.sleep(8)
        crawlHouseDetailForInvoke()
        driver.back()

if __name__ == "__main__":
    regionUrl = "https://www.zillow.com/homes/recently_sold/Culver-City-CA/house,condo,apartment_duplex,townhouse_type/51617_rid/12m_days/globalrelevanceex_sort/34.05529,-118.33211,33.956531,-118.485919_rect/12_zm/"
    print("crawler is started...")
    crawlRegion(regionUrl)
    driver.close()
    driver.quit()
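One possible way to make the detail-page loop more robust is to collect the href strings up front and navigate to them with driver.get(), so the list elements never go stale after driver.back(). A sketch only, not verified against Zillow's current markup, and the crawlRegionByHref name is just for illustration:

def crawlRegionByHref(url):
    driver.get(url)
    house_links = driver.find_elements_by_xpath('//div[@id = "search-results"]/ul/li/article/div/a')
    # Keep plain URL strings instead of WebElements, so nothing goes stale
    hrefs = [link.get_attribute("href") for link in house_links]
    for j, href in enumerate(hrefs):
        print("Index: %d, Link: %s" % (j, href))
        driver.get(href)  # navigate directly instead of click() + back()
        time.sleep(8)
        crawlHouseDetailForInvoke()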
