Web Scraping with Selenium on Python using Google Chrome - python

I am trying to scape a website to get some company information. If the search result is there and matches the search term I would like to continue, if not, I would like to move on to the next company.
Here is the code:
import pandas as pd
import numpy as np
from tqdm import notebook
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import time, sleep
import datetime
import sys
url = "https://register.fca.org.uk/s/"
search_box_path = '//*[#id="search-form-search-section-main-input"]'
firm_checkbox_path = '//*[#id="search-form-search-options-radio-group"]/span[1]/label/span[1]'
searchterm = 'XXX Company'
driver = webdriver.Chrome(executable_path=r'C:\Users\XXXX\Chrome Webdriver\chromedriver.exe')
driver.get(url)
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,firm_checkbox_path)))
driver.find_element_by_xpath(firm_checkbox_path).click()
driver.find_element_by_xpath(search_box_path).send_keys(searchterm)
driver.find_element_by_xpath(search_box_path).send_keys(Keys.RETURN)
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,'//*
[#id="maincontent"]/div[4]/div/div[2]/h1/span[2]')))
element = driver.find_element_by_xpath('//*[#id="maincontent"]/div[4]/div/div[2]/h1/span[2]')
check_result()
The issue is with the check_result function. In this function I am just comparing the searchterm against the element.text of the element from the website.
def check_result():
name= driver.find_element_by_xpath('//*[#id="maincontent"]/div[4]/div/div[2]/h1/span[2]')
return name.text == searchterm:
This logic on its own works fine, but along with the code it give me false even though I know that the text I provide is equal to the element.text.
Any help is much appreciated.

Related

Retrieving specific matches from a list in python

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from time import sleep
from datetime import datetime
import pandas as pd
import warnings
import os
os.chdir('C:/Users/paulc/Documents/Medium Football')
warnings.filterwarnings('ignore')
base_url = 'https://www.sportingindex.com/spread-betting/football/international-world-cup'
option = Options()
option.headless = False
driver = webdriver.Chrome("C:/Users/paulc/Documents/Medium Football/chromedriver.exe",options=option)
driver.get(base_url)
links = [elem.get_attribute("href") for elem in driver.find_elements(By.TAG_NAME,"a")]
this code retrieves all the href links on this page. I want to search the links list and return only the matches that contain 'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a'
however I get the AttributeError: 'NoneType' object has no attribute 'startswith'
using
import re
[x for x in links if x.startswith('https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]
help is appreciated.
Instead of collecting all a elements on the page where will be a lot of irrelevant results you can use more precise locator.
So, instead of
driver.find_elements(By.TAG_NAME,"a")
Use this:
driver.find_elements(By.XPATH,"//a[contains(#href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")
This will give you desired elements only.
And this
links = [elem.get_attribute("href") for elem in driver.find_elements(By.XPATH,"//a[contains(#href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")]
will directly give you the wanted links only.
UPD
In case this is giving you an empty list you possibly are missing a delay. So, you can simply add some pause before that line, like time.sleep(2) but it's better to use WebDriverWait expected_conditions explicit waits for that.
I can't check it since my computer is blocking that link due to my company policy since that is a gambling site, but normally something like this should work:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
links = [elem.get_attribute("href") for elem in wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//a[contains(#href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")))]
The following code is filtering to grab the right links
import time
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
webdriver_service = Service("./chromedriver") #Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service)
driver.get('https://www.sportingindex.com/spread-betting/football/international-world-cup')
driver.maximize_window()
time.sleep(8)
soup = BeautifulSoup(driver.page_source,"lxml")
for u in soup.select('a[class="gatracking"]'):
link = 'https://www.sportingindex.com' + u.get('href')
if '-v-' in link:
print(link)
Output:
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.dd7a995d-7478-45f8-af27-9f234d37cc76/ecuador-v-senegal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.92232207-0f1e-4bb1-bacd-1332ef6b9007/netherlands-v-qatar
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.b913620e-69c7-4606-a153-7b48589b7c94/iran-v-usa
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7a4a18fb-d4ee-4880-849f-f1afdea33cd5/wales-v-england
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.20c098b4-4e97-4fd1-97b0-f42d84424361/australia-v-denmark
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5a7476e2-8d35-4a8e-8065-b4339e79f395/tunisia-v-france
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.8a869f02-9dd0-49c5-91bd-209ee224fc2a/poland-v-argentina
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.6379b787-f246-4ba4-a896-28a97396d02f/saudi-arabia-v-mexico
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.52737cfd-da19-42dd-b15b-c16c3e8e9a86/canada-v-morocco
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.168fab1f-8360-4e87-ba84-bfbd11a4a207/croatia-v-belgium
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.9fb541f0-43a4-409c-8e54-e34a43965714/costa-rica-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7379c8a7-ab5d-4653-b487-22bf7ff8eefe/japan-v-spain
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e7e4c6be-98b7-4258-ba40-74c54a790fe1/ghana-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e4c18c81-565e-47ce-b08d-9aed62c88a5d/south-korea-v-portugal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.18f44028-e23d-48d4-970b-e75c164589bd/cameroon-v-brazil
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.526f9b1b-6d95-4f44-abce-e0a6a30acfd4/serbia-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay

Selenium Won't Iterate Through All Pages Using XPATH. Keep receiving "NoSuchElementException" error

Currently the below code gets to the second page and then I get a "NoSuchElementException" error. I have tried iterating using the HREF as well. The link text works slightly better but I encounter issues when I reach the "..." portion of the iteration, so would prefer to stick with the XPATH. Feel like I have tried everything.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://www.sec.state.ma.us/LobbyistPublicSearch/Default.aspx'
wait = WebDriverWait(driver,10)
driver.get(url)
driver.find_element('id','ContentPlaceHolder1_rdbSearchByType').click()
select = Select(driver.find_element(By.CLASS_NAME,'p3'))
select.select_by_value('2020')
driver.find_element('id','ContentPlaceHolder1_btnSearch').click()
find_table = driver.find_element(By.ID,'ContentPlaceHolder1_ucSearchResultByTypeAndCategory_grdvSearchResultByTypeAndCategory')
xpath_string = """//*[#id="ContentPlaceHolder1_ucSearchResultByTypeAndCategory_grdvSearchResultByTypeAndCategory"]/tbody/tr[98]/td/table/tbody/tr/td[1]/a"""
all_xpath = []
for i in range(1,all_pages):
pn = str(i + 1)
links_list_sliced = list(xpath_string)
links_list_sliced[131] = pn
joined_links = ''.join(links_list_sliced)
all_xpath.append(joined_links)
for l in all_xpath:
current_page_element = driver.find_element(By.XPATH, l)
wait.until(EC.element_to_be_clickable((current_page_element))).click()
#David Kahn, Seems the number of elements on each page are different. So in your xpath tr[98] is not always the case. You can do the following changes and it should work.
Your xpath_string should be the following. Where I have removed the dependency on position of the page numbers on 98th row of the table.
xpath_string = """//*[#id="ContentPlaceHolder1_ucSearchResultByTypeAndCategory_grdvSearchResultByTypeAndCategory"]/tbody/tr[#align="center"]/td/table/tbody/tr/td[1]/a"""
For this to work you need to change the following line in your for loop:
links_list_sliced[131] = pn
to:
links_list_sliced[144] = pn
Let me know if it works.

selenium no way to use a visible select element python

I am working on a project that I first sign up instagram by given information and then scrape some data for the sign up form I have some problems with second form which is birthdate the Select class of selenium didn't work. Maybe it is because of visibility. when I tried to access the elements it says that there is no such elements but they are both visible graphically and in inspect page of chrome and I prefer to not use the pyautogui because it is kinda unflexible and hard to use.
here is my code
#information
driverSignUp = webdriver.Chrome()
while True:
while True:
try:
driverSignUp.get('https://www.instagram.com/accounts/emailsignup/')
break
except:
pass
try:
driverSignUp.find_element(By.CSS_SELECTOR, 'body.p-error.dialog-404')
driverSignUp.delete_all_cookies()
except:
break
driverSignUp.implicitly_wait(1)
driverSignUp.find_element(By.NAME, 'emailOrPhone').send_keys(email)
driverSignUp.find_element(By.NAME, 'fullName').send_keys(fullName)
driverSignUp.find_element(By.NAME, 'username').send_keys(username)
driverSignUp.find_element(By.NAME, 'password').send_keys(password)
element = WebDriverWait(driverSignUp, 3).until(
EC.all_of(EC.presence_of_element_located((By.CSS_SELECTOR, 'button.sqdOP.L3NKy.y3zKF')))
)
driverSignUp.find_element(By.CSS_SELECTOR, 'button.sqdOP.L3NKy.y3zKF').click()
#birthdate
actions = ActionChains(driverSignUp)
Month = driverSignUp.find_element(By.CSS_SELECTOR, 'select[title="Month:"]')
actions.move_to_element(Month).perform()
selectMonth = Select(Month)
selectMonth.select_by_visible_text('August')
Day = driverSignUp.find_element(By.CSS_SELECTOR, 'select[title="Day:"]')
actions.move_to_element(Day).perform()
selectDay = Select(Day)
selectDay.select_by_visible_text('1')
Year = driverSignUp.find_element(By.CSS_SELECTOR, 'select[title="Year:"]')
actions.move_to_element(Year).perform()
selectYear = Select(Year)
selectYear.select_by_visible_text('2000')
and the libraries I used:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
thanks for your answers

How to scrape the ratings and all the reviews from the website using selenium

I want to scrape the rating and all the reviews on the page .But not able to find the path .
enter code here
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time
chrome_path =r'C:/Users/91940/AppData/Local/Programs/Python/Python39/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.implicitly_wait(10)
driver.get("https://www.lazada.sg/products/samsung-galaxy-watch3-bt-45mm-titanium-i1156462257-
s4537770883.html?search=1&freeshipping=1")
product_name = driver.find_element_by_xpath('//*[#id="module_product_title_1"]/div/div/h1')
print(product_name.text)
rating = driver.find_element_by_xpath("//span[#class='score-average']")
print(rate.text)
review = driver .find_element_by_xpath('//*
[#id="module_product_review"]/div/div/div[3]/div[1]/div[1]')
print(review.text)
I believe print(product_name.text) is getting execute correct, right ?
There is an issue with driver.find_element_by_xpath("//span[#class='score-average']") I could not found score-average anywhere in HTML source.
so try this instead :
driver.find_element_by_css_selector("div.pdp-review-summary")
print(rate.text)
You can try the below code to get review :
wait = WebDriverWait(driver, 10)
driver.get("https://www.lazada.sg/products/samsung-galaxy-watch3-bt-45mm-titanium-i1156462257- s4537770883.html?search=1&freeshipping=1")
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[class$='pdp-review-summary__link']"))).click()
ActionChains(driver).move_to_element(wait.until(EC.visibility_of_element_located((By.XPATH, "//h2[contains(text(), 'Ratings & Reviews')]")))).perform()
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.item-content")))
for review in driver.find_elements(By.CSS_SELECTOR, "div.item-content"):
print(review.get_attribute('innerHTML'))
Imports :
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
Perhaps there is a problem with your path? (apologies I'm not on windows to test). From memory, Windows paths use \ characters instead of /. Additionally, you may need two backticks after the drive path (C:\\).
c:\\Users\91940\AppData\Local\...

Condition to check if Selenium is done scrolling based on web element?

Currently I have a script that will go to TripAdvisor and try to scrape every image in that particular filter. I was wondering what conditional I should set my if statement to in order for it to break out of the while loop and then parse the list of urls to give me clear url links to each image. I am just confused at how I can tell if I have reached the end once I have reached the last web element. The if statement is right at the end before the last printing loop. Any help is greatly appreciated!
# import dependencies
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import re
import selenium
import io
import pandas as pd
import urllib.request
import urllib.parse
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
import time
from _datetime import datetime
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
options = webdriver.ChromeOptions()
options.headless=False
driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3")
driver.maximize_window()
prefs = {"profile.default_content_setting_values.notifications" : 2}
options.add_experimental_option("prefs", prefs)
#open up website
driver.get(
"https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")
image_url = []
end = False
while not(end):
#wait until element is found and then store all webelements into list
images = WebDriverWait(driver, 20).until(
EC.presence_of_all_elements_located(
(By.XPATH, '//*[#class="media-viewer-dt-root-GalleryImageWithOverlay__galleryImage--1Drp0"]')))
#iterate through visible images and acquire their url based on background image style
for index, image in enumerate(images):
image_url.append(images[index].value_of_css_property("background-image"))
#if you are at the end of the page then leave loop
# if(length == end_length):
# end = True
#move to next visible images in the array
driver.execute_script("arguments[0].scrollIntoView();", images[-1])
#wait one second
time.sleep(1)
if():
end = True
#clean the list to provide clear links
for i in range(len(image_url)):
start = image_url[i].find("url(\"") + len("url(\"")
end = image_url[i].find("\")")
print(image_url[i][start:end])
#print(image_url)

Categories