Retrieving specific matches from a list in python - python

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from time import sleep
from datetime import datetime
import pandas as pd
import warnings
import os
os.chdir('C:/Users/paulc/Documents/Medium Football')
warnings.filterwarnings('ignore')
base_url = 'https://www.sportingindex.com/spread-betting/football/international-world-cup'
option = Options()
option.headless = False
driver = webdriver.Chrome("C:/Users/paulc/Documents/Medium Football/chromedriver.exe",options=option)
driver.get(base_url)
links = [elem.get_attribute("href") for elem in driver.find_elements(By.TAG_NAME,"a")]
this code retrieves all the href links on this page. I want to search the links list and return only the matches that contain 'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a'
however I get the AttributeError: 'NoneType' object has no attribute 'startswith'
using
import re
[x for x in links if x.startswith('https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]
help is appreciated.

Instead of collecting all a elements on the page where will be a lot of irrelevant results you can use more precise locator.
So, instead of
driver.find_elements(By.TAG_NAME,"a")
Use this:
driver.find_elements(By.XPATH,"//a[contains(#href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")
This will give you desired elements only.
And this
links = [elem.get_attribute("href") for elem in driver.find_elements(By.XPATH,"//a[contains(#href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")]
will directly give you the wanted links only.
UPD
In case this is giving you an empty list you possibly are missing a delay. So, you can simply add some pause before that line, like time.sleep(2) but it's better to use WebDriverWait expected_conditions explicit waits for that.
I can't check it since my computer is blocking that link due to my company policy since that is a gambling site, but normally something like this should work:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
links = [elem.get_attribute("href") for elem in wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//a[contains(#href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")))]

The following code is filtering to grab the right links
import time
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
webdriver_service = Service("./chromedriver") #Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service)
driver.get('https://www.sportingindex.com/spread-betting/football/international-world-cup')
driver.maximize_window()
time.sleep(8)
soup = BeautifulSoup(driver.page_source,"lxml")
for u in soup.select('a[class="gatracking"]'):
link = 'https://www.sportingindex.com' + u.get('href')
if '-v-' in link:
print(link)
Output:
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.dd7a995d-7478-45f8-af27-9f234d37cc76/ecuador-v-senegal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.92232207-0f1e-4bb1-bacd-1332ef6b9007/netherlands-v-qatar
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.b913620e-69c7-4606-a153-7b48589b7c94/iran-v-usa
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7a4a18fb-d4ee-4880-849f-f1afdea33cd5/wales-v-england
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.20c098b4-4e97-4fd1-97b0-f42d84424361/australia-v-denmark
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5a7476e2-8d35-4a8e-8065-b4339e79f395/tunisia-v-france
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.8a869f02-9dd0-49c5-91bd-209ee224fc2a/poland-v-argentina
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.6379b787-f246-4ba4-a896-28a97396d02f/saudi-arabia-v-mexico
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.52737cfd-da19-42dd-b15b-c16c3e8e9a86/canada-v-morocco
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.168fab1f-8360-4e87-ba84-bfbd11a4a207/croatia-v-belgium
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.9fb541f0-43a4-409c-8e54-e34a43965714/costa-rica-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7379c8a7-ab5d-4653-b487-22bf7ff8eefe/japan-v-spain
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e7e4c6be-98b7-4258-ba40-74c54a790fe1/ghana-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e4c18c81-565e-47ce-b08d-9aed62c88a5d/south-korea-v-portugal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.18f44028-e23d-48d4-970b-e75c164589bd/cameroon-v-brazil
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.526f9b1b-6d95-4f44-abce-e0a6a30acfd4/serbia-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay

Related

Extracting html table into panda DataFrame using Selenium

I'm trying to read an html table and then extract it into pd.DataFrame but instead getting something different. What am I doing wrong?
the error is: [<selenium.webdriver.remote.webelement.WebElement (session="38159852443c19167a9033a2b078fe45", element="ef6a42a1-2775-44c1-955f-5f01870bc758")>]
Here is my code:
import pandas as pd
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(executable_path = 'mypath/chromedriver.exe', options = options)
driver.get("https://ai.fmcsa.dot.gov/SMS")
wait = WebDriverWait(driver, 20)
wait.until(EC.element_to_be_clickable((By.XPATH, "//a[#title='Close']"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "(//input[#name='MCSearch'])[2]"))).send_keys('1818437')
wait.until(EC.element_to_be_clickable((By.XPATH, "(//input[#name='search'])[2]"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "//*[#id='BASICs']/p[2]/a"))).click()
tables=WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.XPATH,'//*[#id="BASICs"]/table/tbody/tr[2]')))
print(tables)
Disregard the bunch of extra imports as I've been trying to approach the problem in different way but keep failing
I might have solved it, actually.
I changed the XPATH of the table from
'//*[#id="BASICs"]/table/tbody/tr[2]'
to
"//tr[#class='valueRow sumData']"
and I just realized I was printing the element and not its content so I changed the last line from
print(tables) to print(tables.text)
Now it's printing the table but for some reason it's not printing the very last value (0.05 in this case). Any ideas why?

How to scrape the ratings and all the reviews from the website using selenium

I want to scrape the rating and all the reviews on the page .But not able to find the path .
enter code here
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time
chrome_path =r'C:/Users/91940/AppData/Local/Programs/Python/Python39/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.implicitly_wait(10)
driver.get("https://www.lazada.sg/products/samsung-galaxy-watch3-bt-45mm-titanium-i1156462257-
s4537770883.html?search=1&freeshipping=1")
product_name = driver.find_element_by_xpath('//*[#id="module_product_title_1"]/div/div/h1')
print(product_name.text)
rating = driver.find_element_by_xpath("//span[#class='score-average']")
print(rate.text)
review = driver .find_element_by_xpath('//*
[#id="module_product_review"]/div/div/div[3]/div[1]/div[1]')
print(review.text)
I believe print(product_name.text) is getting execute correct, right ?
There is an issue with driver.find_element_by_xpath("//span[#class='score-average']") I could not found score-average anywhere in HTML source.
so try this instead :
driver.find_element_by_css_selector("div.pdp-review-summary")
print(rate.text)
You can try the below code to get review :
wait = WebDriverWait(driver, 10)
driver.get("https://www.lazada.sg/products/samsung-galaxy-watch3-bt-45mm-titanium-i1156462257- s4537770883.html?search=1&freeshipping=1")
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[class$='pdp-review-summary__link']"))).click()
ActionChains(driver).move_to_element(wait.until(EC.visibility_of_element_located((By.XPATH, "//h2[contains(text(), 'Ratings & Reviews')]")))).perform()
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.item-content")))
for review in driver.find_elements(By.CSS_SELECTOR, "div.item-content"):
print(review.get_attribute('innerHTML'))
Imports :
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
Perhaps there is a problem with your path? (apologies I'm not on windows to test). From memory, Windows paths use \ characters instead of /. Additionally, you may need two backticks after the drive path (C:\\).
c:\\Users\91940\AppData\Local\...

Iterating through multiple urls

With your help, I was able to get a scraper running, but now I am stuck when it comes to iterating. Ultimately, I want to scraper to run through different URLs, but I'm getting confused in the syntax. I am using Selenium to open the web page and then BeautifulSoup to extract the data. I think I need to define the URLs and then use something like:
for url in urls
but I am not sure how to use this. Reading other answers and videos has left me scratching my head.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd
urls = ["https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=1","https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=2"]
driver = webdriver.Chrome()
driver.get(urls)
for url in urls:
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))
htmlStr = driver.page_source
soup_level1 = soup(htmlStr, 'html.parser')
race_soup = soup_level1.find('tbody',{'class':'f_fs13'}).find_parent('table')
results_soup = soup_level1.find('tbody',{'class':'f_fs12'}).find_parent('table')
df1 = pd.read_html(str(race_soup))[0]
print(df1)
df2 = pd.read_html(str(results_soup))[0]
print(df2)
print('good')
driver.close()

Web Scraping with Selenium on Python using Google Chrome

I am trying to scape a website to get some company information. If the search result is there and matches the search term I would like to continue, if not, I would like to move on to the next company.
Here is the code:
import pandas as pd
import numpy as np
from tqdm import notebook
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import time, sleep
import datetime
import sys
url = "https://register.fca.org.uk/s/"
search_box_path = '//*[#id="search-form-search-section-main-input"]'
firm_checkbox_path = '//*[#id="search-form-search-options-radio-group"]/span[1]/label/span[1]'
searchterm = 'XXX Company'
driver = webdriver.Chrome(executable_path=r'C:\Users\XXXX\Chrome Webdriver\chromedriver.exe')
driver.get(url)
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,firm_checkbox_path)))
driver.find_element_by_xpath(firm_checkbox_path).click()
driver.find_element_by_xpath(search_box_path).send_keys(searchterm)
driver.find_element_by_xpath(search_box_path).send_keys(Keys.RETURN)
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,'//*
[#id="maincontent"]/div[4]/div/div[2]/h1/span[2]')))
element = driver.find_element_by_xpath('//*[#id="maincontent"]/div[4]/div/div[2]/h1/span[2]')
check_result()
The issue is with the check_result function. In this function I am just comparing the searchterm against the element.text of the element from the website.
def check_result():
name= driver.find_element_by_xpath('//*[#id="maincontent"]/div[4]/div/div[2]/h1/span[2]')
return name.text == searchterm:
This logic on its own works fine, but along with the code it give me false even though I know that the text I provide is equal to the element.text.
Any help is much appreciated.

Condition to check if Selenium is done scrolling based on web element?

Currently I have a script that will go to TripAdvisor and try to scrape every image in that particular filter. I was wondering what conditional I should set my if statement to in order for it to break out of the while loop and then parse the list of urls to give me clear url links to each image. I am just confused at how I can tell if I have reached the end once I have reached the last web element. The if statement is right at the end before the last printing loop. Any help is greatly appreciated!
# import dependencies
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import re
import selenium
import io
import pandas as pd
import urllib.request
import urllib.parse
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
import time
from _datetime import datetime
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
options = webdriver.ChromeOptions()
options.headless=False
driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3")
driver.maximize_window()
prefs = {"profile.default_content_setting_values.notifications" : 2}
options.add_experimental_option("prefs", prefs)
#open up website
driver.get(
"https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")
image_url = []
end = False
while not(end):
#wait until element is found and then store all webelements into list
images = WebDriverWait(driver, 20).until(
EC.presence_of_all_elements_located(
(By.XPATH, '//*[#class="media-viewer-dt-root-GalleryImageWithOverlay__galleryImage--1Drp0"]')))
#iterate through visible images and acquire their url based on background image style
for index, image in enumerate(images):
image_url.append(images[index].value_of_css_property("background-image"))
#if you are at the end of the page then leave loop
# if(length == end_length):
# end = True
#move to next visible images in the array
driver.execute_script("arguments[0].scrollIntoView();", images[-1])
#wait one second
time.sleep(1)
if():
end = True
#clean the list to provide clear links
for i in range(len(image_url)):
start = image_url[i].find("url(\"") + len("url(\"")
end = image_url[i].find("\")")
print(image_url[i][start:end])
#print(image_url)

Categories