Python Selenium Football Odds Webscraping

Python Selenium Football Odds Webscraping - python

I am trying to scrape odds from https://en.stoiximan.gr/live. While my code is working, I get an error for having uneven lists in my final dataframe. Unfortunately, stoiximan seems to place 3-way odds together with over/under odds and suspended/locked matches (as in the picture).
What I am trying to do is to delete both home and away teams from their respective lists if their odds are over/under or locked. Any suggestions?
Here 's my code so far:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import openpyxl
import os
#launch chrome and keep window open
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(ChromeDriverManager().install(), options = chrome_options)
#visit en.stoiximan.gr and maximize window
driver.get("https://en.stoiximan.gr/live/")
driver.maximize_window()
#close modal window
try:
WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((
By.XPATH, "//button[#class='sb-modal__close__btn uk-modal-close-default uk-icon uk-
close']"
))).click()
except:
pass
#accept cookies
WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((
By.ID, "onetrust-accept-btn-handler"
))).click()
#Initialize storage for stoiximan
stoiximan_home_teams_list = []
stoiximan_away_teams_list = []
stoiximan_home_odds_list = []
stoiximan_draw_odds_list = []
stoiximan_away_odds_list = []
#grab all home/away teams and explicit odds
try:
stoiximan_home_teams = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located((
By.XPATH,
"//div[#class='live-events-event-row__container live-event live-events-event-
row__container--row']/div[1]/a/div[1]/div[1]/span"))
)
stoiximan_away_teams = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located((
By.XPATH,
"//div[#class='live-events-event-row__container live-event live-events-event-row__container--row']/div[1]/a/div[1]/div[2]/span"))
)
stoiximan_home_odds = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located((
By.XPATH,
"//div[#class='live-events-event-row__container live-event live-events-event-row__container--row']/div[2]/div/button[1]/span[2]"))
)
stoiximan_draw_odds = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located((
By.XPATH,
"//div[#class='live-events-event-row__container live-event live-events-event-row__container--row']/div[2]/div/button[2]/span[2]"))
)
stoiximan_away_odds = WebDriverWait(driver, 1).until(
EC.presence_of_all_elements_located((
By.XPATH,
"//div[#class='live-events-event-row__container live-event live-events-event-row__container--row']/div[2]/div/button[3]/span[2]"))
)
except:
driver.quit()
#loop each home team and append the lists
for stoiximan_home_team in stoiximan_home_teams:
stoiximan_home_teams_list.append(stoiximan_home_team.get_attribute('innerText'))
for stoiximan_away_team in stoiximan_away_teams:
stoiximan_away_teams_list.append(stoiximan_away_team.get_attribute('innerText'))
for stoiximan_home_odd in stoiximan_home_odds:
stoiximan_home_odds_list.append(stoiximan_home_odd.text)
for stoiximan_draw_odd in stoiximan_draw_odds:
stoiximan_draw_odds_list.append(stoiximan_draw_odd.text)
for stoiximan_away_odd in stoiximan_away_odds:
stoiximan_away_odds_list.append(stoiximan_away_odd.text)
print(stoiximan_home_teams_list)
print(len(stoiximan_home_teams_list))
print(stoiximan_away_teams_list)
print(len(stoiximan_away_teams_list))
print(stoiximan_home_odds_list)
print(len(stoiximan_home_odds_list))
print(stoiximan_draw_odds_list)
print(len(stoiximan_draw_odds_list))
print(stoiximan_away_odds_list)
print(len(stoiximan_away_odds_list))
#make str to float in odds lists
stoiximan_home_odds_list_float = [float(i) for i in stoiximan_home_odds_list]
stoiximan_draw_odds_list_float = [float(j) for j in stoiximan_draw_odds_list]
stoiximan_away_odds_list_float = [float(k) for k in stoiximan_away_odds_list]
#create dictionary for data
stoiximan_dict = {'Stoiximan Home Team': stoiximan_home_teams_list,
'Stoiximan Away Team': stoiximan_away_teams_list,
'Stoiximan Home Odd': stoiximan_home_odds_list_float,
'Stoiximan Draw Odd': stoiximan_draw_odds_list_float,
'Stoiximan Away Odd': stoiximan_away_odds_list_float
}
#create dataframe for data
df4 = pd.DataFrame(stoiximan_dict)
print(df4)
#write to excel file and open it
df4.to_excel(r'C:\Users\sweet_000\Desktop\data.xlsx', sheet_name="stoiximan", index=False)
os.system('start EXCEL.EXE "C:\\Users\\sweet_000\\Desktop\\data.xlsx"')
driver.quit()

Related

Reading weblink from dataframe throws "stale element reference: element is not attached to the page document" error

I got a dataframe that contains links to google reviews of two restaurants. I wanted to load all reviews of two restaurants (one by one) into the browser and then save them into a new data frame. I wrote a script that reads and load all reviews into the browser as follow:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
link_df = Link
0 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
1 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
i = 0
driver = webdriver.Chrome()
for index, i in link_df.iterrows():
base_url = i['Link'] #link_df['Link'][i]
driver.get(base_url)
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//div[./span[text()='Newest']]"))).click()
print('Restaurant number is ',index)
title = driver.find_element_by_xpath("//div[#class='P5Bobd']").text
address = driver.find_element_by_xpath("//div[#class='T6pBCe']").text
overall_rating = driver.find_element_by_xpath("//div[#class='review-score-container']//span[#class='Aq14fc']").text
total_reviews_text =driver.find_element_by_xpath("//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int (total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
time.sleep(2)
total_reviews = len(all_reviews)
while total_reviews < num_reviews:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
time.sleep(5)
all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
print(total_reviews)
total_reviews +=1
reviews_info = driver.find_elements_by_xpath("//div[#class='jxjCjc']")
review_information = pd.DataFrame(columns=["Restaurant title","Restaurant rating","Total reviews","Reviewer Name","Rating", "Review"])
name= ''
rating = ''
text = ''
for index,review_info in enumerate(reviews_info):
name = review_info.find_element_by_xpath("./div/div/a").text
rating = review_info.find_element_by_xpath(".//div[#class='PuaHbe']//g-review-stars//span").get_attribute('aria-label')
text = review_info.find_element_by_xpath(".//div[#class='Jtu6Td']//span").text
review_information.at[len(review_information)] = [title,overall_rating,num_reviews,name,rating,text]
filename = 'Google_reviews' + ' ' +pd.to_datetime("now").strftime("%Y_%m_%d")+'.csv'
files_present = glob.glob(filename)
if files_present:
review_information.to_csv(filename,index=False,mode='a',header=False)
else:
review_information.to_csv(filename,index=False)
driver.get('https:ww.google.com')
time.sleep(3)
The problem is that script throws an error when it reaches the following line.
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
It throws following error:
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=95.0.4638.69)
When I tried the same program without storing google links in dataframe (i.e. no for loop and instead of base_url = i['Link'], I wrote base_url = google review link) it works fine.
I am not sure where I am making the mistake. Any suggestion or help to fix the issue would be highly appreciated?

EDIT
you put the creation of driver outside the for loop
you cant launch the new url with gps data when the first popup is always in front, if you launch it, it stays in backdoor, the easier way is to launch a new url without gps data -> https:ww.google.com and wait 3 dec before to follow your loop:
your count is not good, i have changed your selector and change the total and set some lines in comment
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.options import Options
import time
link_df = ["https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binary = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
options = Options()
options.binary = binary
driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
# i have to launch one time to accept the cookies manually
#by setting a breakpoint after, but you dont have that i think
#driver.get(link_df[0])
print ("Headless Firefox Initialized")
print(link_df)
for url in link_df:
base_url = url # i['Link'] # link_df['Link'][i]
print(base_url)
driver.get(base_url)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Avis les plus récents']]"))).click()
title = driver.find_element_by_xpath("//div[#class='P5Bobd']").text
address = driver.find_element_by_xpath("//div[#class='T6pBCe']").text
overall_rating = driver.find_element_by_xpath("//div[#class='review-score-container']//span[#class='Aq14fc']").text
total_reviews_text = driver.find_element_by_xpath(
"//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int(total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
# time.sleep(2)
total_reviews = 0
while total_reviews < num_reviews:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
all_reviews = WebDriverWait(driver, 5).until(
EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
total_reviews = len(all_reviews)
print(total_reviews, len(all_reviews))
driver.get('https:ww.google.com') # or driver.close() if no bugs
time.sleep(3)
driver.close()
driver.quit()
it seems the solution for chrome needs some fixes:
org.openqa.selenium.StaleElementReferenceException: stale element reference: element is not attached to the page document
The literal meaning is about , The referenced element is out of date , No longer attached to the current page . Usually , This is because the page has been refreshed or skipped , The solution is , Reuse findElement or findElements Method to locate the element .
so its seems for chrome there is a problem of refreshing, so i suggest to load the number of record before to scroll, to have a fresh copy of DOM items, and i have to add a wait 1sec at the end of while loop
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
#from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
import time
link_df = [
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binaryfirefox = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
binarychrome = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
options = Options()
#cap = DesiredCapabilities().CHROME
#cap["marionette"] = True
#cap = DesiredCapabilities().FIREFOX
#options.binary = binaryfirefox
#driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
options.binary_location = binarychrome
driver = webdriver.Chrome(options=options, executable_path="E:\\Téléchargement\\chromedriver.exe" )
# same reason tha Firefox i have to load one time
# an url to accept manually the cookies
#driver.get(link_df[0])
print(link_df)
for url in link_df:
base_url = url # i['Link'] # link_df['Link'][i]
print(base_url)
driver.get(base_url)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()
title = driver.find_element_by_xpath("//div[#class='P5Bobd']").text
address = driver.find_element_by_xpath("//div[#class='T6pBCe']").text
overall_rating = driver.find_element_by_xpath("//div[#class='review-score-container']//span[#class='Aq14fc']").text
total_reviews_text = driver.find_element_by_xpath(
"//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int(total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
# time.sleep(2)
total_reviews = 0
while total_reviews < num_reviews:
#reload to avoid exception, or trap scroll with try/except but more expznsive
all_reviews = WebDriverWait(driver, 20).until(
EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
total_reviews = len(all_reviews)
print(total_reviews, len(all_reviews))
time.sleep(1)
driver.get('https:ww.google.com') # or driver.close() if no bugs
time.sleep(3)
driver.close()
driver.quit()

Selenium WebDriverWait returning exception

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
Path = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(Path)
driver.get("https://www.emag.ro/")
search_bar = driver.find_element_by_id("searchboxTrigger")
search_bar.send_keys("laptopuri")
search_bar.send_keys(Keys.RETURN)
main = None
try:
main = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, "main-container"))
)
print("Page loaded,main retrived succesfully")
except:
driver.quit()
items = main.find_element_by_id("card_grid")
products = items.find_elements_by_css_selector("div.card-item.js-product-data")
count = 0
for product in products:
raw_name = WebDriverWait(product, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "h2.card-body.product-title-zone"))
).text
raw_price = WebDriverWait(product, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, "product-new-price"))
)
#Parsing the product name
raw_name = raw_name.replace("Laptop", "")
raw_name = raw_name.strip()
if raw_name.startswith("Apple"):
sEnd = raw_name.find(",")
else:
sEnd = raw_name.find("cu") - 1
product_name = raw_name[:sEnd]
#Parsing the product price
raw_price = raw_price.text[:raw_price.text.find(" ")]
print(raw_price)
count += 1
print(f"{count} results returned")
driver.quit()
Code works perfectly fine sometimes,but sometimes i get the error:
Please note i am new at this,so an explanation would be very appreciated.I just learned how to use selenium and the reason i transitioned from beautifulsoup because of the lack of wait possibility,and now when trying to use that,i get this error SOMETIMES

See this :
driver = webdriver.Chrome(Path)
and how have you used it here :
try:
main = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, "main-container"))
)
print("Page loaded,main retrived succesfully")
If you pay attention you would see that, you are using WebDriverWait(driver, 10) and passing driver reference.
But here
raw_name = WebDriverWait(product, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "h2.card-body.product-title-zone"))
).text
you are passing product in WebDriverWait, which is wrong, you should pass driver reference here. like
raw_name = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "h2.card-body.product-title-zone"))
).text
This should help you past this issue for sure.
also, make changes here
raw_price = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, "product-new-price"))
)
This is what we have internally :
class WebDriverWait(object):
def __init__(self, driver, timeout, poll_frequency=POLL_FREQUENCY, ignored_exceptions=None):
"""Constructor, takes a WebDriver instance and timeout in seconds.
Update 1 :
driver = webdriver.Chrome(driver_path)
driver.maximize_window()
driver.implicitly_wait(50)
driver.get("https://www.emag.ro/")
wait = WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable((By.XPATH, "//i[contains(#class,'close')]/parent::button[#class='close']"))).click()
ActionChains(driver).move_to_element(wait.until(EC.visibility_of_element_located((By.XPATH, "//button[contains(#class,'js-accept')]")))).click().perform()
search_bar = driver.find_element_by_id("searchboxTrigger")
search_bar.send_keys("laptopuri")
search_bar.send_keys(Keys.RETURN)
main = None
try:
main = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "main-container")))
print("Page loaded,main retrived succesfully")
except:
driver.quit()
items = main.find_element_by_id("card_grid")
products = driver.find_elements_by_css_selector("div.card-item.js-product-data")
count = 0
for product in products:
raw_name = product.find_element_by_css_selector("h2.card-body.product-title-zone a").text
print(raw_name)
raw_price = product.find_element_by_css_selector("p.product-new-price").text
print(raw_price)
#Parsing the product name
# raw_name = raw_name.replace("Laptop", "").strip()
# if raw_name.startswith("Apple"):
# sEnd = raw_name.find(",")
# else:
# sEnd = raw_name.find("cu") - 1
# product_name = raw_name[:sEnd]
#Parsing the product price
# raw_price = raw_price[raw_price.find(" ")]
# print(raw_price)
# count += 1
#print(f"{count} results returned")
driver.quit()

Scraping Vocabulary using Selenium and parsing to DataFrame

There is this little program that goes to a vocabulary, print all the words from that page and then click at the button to go to the next page and print again all the vocabulary on that page.
I used a loop to repeat the process and loop through all the words spread on multiple pages.
#Create csv
outfile = open("Vocab.csv","w",newline='')
writer = csv.writer(outfile)
#Define the dataframe
df = pd.DataFrame(columns=['rating'])
PATH="C:\Program Files (x86)\chromedriver.exe"
driver= webdriver.Chrome(PATH)
driver.get("https://sq.m.wiktionary.org/w/index.php?title=Kategoria:Shqip&pagefrom=agall%C3%ABk#mw-pages")
for x in range(3):
rating_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "#mw-pages > div > div > div > ul"))
)
rating=rating_element.text
print(rating)
element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.LINK_TEXT, "faqja pasardhëse"))
)
element.click()
df2 = pd.DataFrame([rating],columns=['rating'])
df = df.append(df2,ignore_index=True)
The code itself works perfectly fine, however when I tried to implement the function of parsing all the data into a DataFrame, I only get an empty Csv File. I'm trying to have only one column with the thousands of words.

You can iterate over each word to append to the column:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import selenium.common.exceptions
import os
import pandas as pd
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--window-size=1920x1080")
# chrome_options.add_argument("--headless")
chrome_driver = os.getcwd() + "\\chromedriver.exe"
driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_driver)
# Define the dataframe
df = pd.DataFrame(columns=['rating'])
driver.get("https://sq.m.wiktionary.org/w/index.php?title=Kategoria:Shqip&pagefrom=agall%C3%ABk#mw-pages")
for x in range(200):
rating_element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "#mw-pages > div > div > div > ul"))
)
rating = rating_element.text
for word in rating.split('\n'):
df2 = pd.DataFrame([word], columns=['rating'])
df = df.append(df2, ignore_index=True)
try:
element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.LINK_TEXT, "faqja pasardhëse"))
)
element.click()
except selenium.common.exceptions.TimeoutException:
break
print(df)
df.to_csv('word_list.csv', encoding='utf-8', index=False)
Outputs
rating
0 agallëk
1 agar
2 agave
3 agde
4 ageshë
.. ...
595 ankim
596 ankimor
597 ankohem
598 ankoj
599 ankojë
[600 rows x 1 columns]
Edit
Added the option to write to a file.

Extracting reviews from Google play store app website

import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd
class FindByXpathCss():
# Declaring variables
Reviews = [] # List to store final set of reviews
reviewText = [] # List to store reviews extracted from XPath
reviewFullText = []
# Chromedriver path
driver = webdriver.Chrome(executable_path=r"F:\Chrome-webdriver\chromedriver.exe")
driver.maximize_window()
baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
driver.get(baseUrl)
# driver.execute_script("scrollBy(0,300);")
# Scrolling down
for i in range(20):
driver.find_element_by_xpath('//*[#id="yDmH0d"]').send_keys(Keys.ARROW_DOWN, i)
time.sleep(0.5)
# To click on Show more button
#btnShowMore = driver.find_element_by_xpath('//*[#id="fcxH9b"]/div[4]/c-wiz/div/div[2]''/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span').click()
# Scrolling to top
for j in range(10):
driver.find_element_by_xpath('//*[#id="yDmH0d"]').send_keys(Keys.ARROW_UP, j)
#for i in range(10):
review_btn = driver.find_elements_by_xpath("//button[contains(#class,'')][contains(text(),'Full Review')]")
single_review_btn = driver.find_element_by_xpath("//button[contains(#class,'')][contains(text(),'Full Review')]")
#time.sleep(1)
The div html tag having 2 tags, one is having jsname as 'fbQN7e' which is there for holding the bigger reviews and those reviews will have button called "Full Review". Another one span within the same div html tag is 'bN97Pc' which is there to hold smaller reviews which wont have 'Full review' button at the end of this review. I couldn't get reviews of both types of span. Here I tried to write reviewFullText list directly to dataframe, but getting only element datatype, not text. I don't know why this too happening.
for btn in review_btn:
btn.click()
reviewFullText = driver.find_elements_by_css_selector("span[jsname='fbQN7e']")
#if(single_review_btn.is_enabled()==False):
#reviewText = driver.find_elements_by_css_selector("span[jsname=\"bN97Pc\"]")
##else:
#pass
# Iterating each reviews and appending into list Reviews
for txtreview in reviewText:
reviewFullText.append(txtreview.text)
print(len(reviewFullText))
# Writing the list values into csv file
df = pd.DataFrame(reviewFullText)
#df = pd.DataFrame({'Reviews': 'Reviews'}) #'Sentiment': 'null'})
df.to_csv('Reviews.csv', index=True, encoding='utf-8')
driver.close()

I have modified your solution to retrieve all review from the page.
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
class FindByXpathCss():
driver = webdriver.Chrome(executable_path=r"C:\New folder\chromedriver.exe")
driver.maximize_window()
baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
driver.get(baseUrl)
scrolls = 3
while True:
scrolls -= 1
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
time.sleep(3)
if scrolls < 0:
break
buttonClick = WebDriverWait(driver, 30).until(
EC.visibility_of_all_elements_located((By.XPATH, "//button[contains(#class,'')][contains(text(),'Full Review')]")))
for element in buttonClick:
driver.execute_script("arguments[0].click();", element)
reviewText = WebDriverWait(driver, 30).until(
EC.presence_of_all_elements_located((By.XPATH, "//*[#class='UD7Dzf']")))
for textreview in reviewText:
print textreview.text
reviewText = WebDriverWait(driver, 30).until(
EC.presence_of_all_elements_located((By.XPATH, "//*[#class='UD7Dzf']")))
# reviewText = driver.find_elements_by_xpath("//*[#class='UD7Dzf']")
for textreview in reviewText:
print textreview.text
Output:

PYTHON : How to fix for loop stop?

I'm trying to extract over 24 product names from multiple pages but my for loop only return 1 product name. I need the script to go to a individual product page and extract product name and the go back to
page url list and repeat the same steps.
The script below on return the first product name and then stop.
from selenium import webdriver
import time
HROMEDRIVER_PATH = '/Users/reezalaq/PycharmProjects/wholesale/driver/chromedriver'
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=chrome_options)
chrome_options.accept_untrusted_certs = True
chrome_options.assume_untrusted_cert_issuer = True
chrome_options.headless = False
home = "https://www.blibli.com/c/4/beli--mukena/MU-1000008/54912?page=1&start=0&category=MU-1000008&sort=7&intent=false"
driver.get(home)
for number in range(1, 24):
elem = driver.find_elements_by_xpath('//*[#id="catalogProductListContentDiv"]/div[3]/div[' + str(number) + ']/div/div/a')
for link in elem:
producturl = link.get_attribute("href")
time.sleep(24)
driver.get(producturl)
getproductname = driver.find_element_by_class_name("product__name-text")
print(getproductname.text)
driver.close()

I do this by opening a new tab, then switching to the new tab.
First, wait until all your elements are visible.
Instead of time.sleep (24) you can use WebDriverWait.
This is the element you mean:
//div[#class="product__item"]/a
You can try the below code:
driver.get('https://www.blibli.com/c/4/beli--mukena/MU-1000008/54912?page=1&start=0&category=MU-1000008&sort=7&intent=false')
elements = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, '//div[#class="product__item"]/a')))
for element in elements:
url = element.get_attribute('href')
#open new tab with specific url
driver.execute_script("window.open('" +url +"');")
#switch to new tab
driver.switch_to.window(driver.window_handles[1])
getproductname = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CLASS_NAME, 'product__name-text')))
print(getproductname.text)
#close current tab
driver.close()
#back to first tab
driver.switch_to.window(driver.window_handles[0])
driver.quit()
Following import:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python Selenium Football Odds Webscraping - python

Related

Reading weblink from dataframe throws "stale element reference: element is not attached to the page document" error

Selenium WebDriverWait returning exception

Scraping Vocabulary using Selenium and parsing to DataFrame

Extracting reviews from Google play store app website

PYTHON : How to fix for loop stop?

Categories

Resources