Xpath wrong using selenium - python

from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
URL = 'https://www.askgamblers.com/online-casinos/reviews/yukon-gold-casino-casino'
driver.get(URL)
data=driver.find_elements(By.XPATH,"//section[#class='review-text richtext']")
for row in data:
try:
para0= row.find_element(By.XPATH,"//h2[text()[contains(.,'Games')]]/following-sibling::p[following::h2[text()[contains(.,'Support')]]]").text
except:
pass
print(para0)
I want they collect the data of Games only but they also get the data of Virtual Games so how we restrict the contains method that get only data of Games only kindly recommend any solution for that these is page link https://www.askgamblers.com/online-casinos/reviews/yukon-gold-casino-casino
Want these only
do not get these text of virtual game

[contains(.,'Games')] will match both Games and Virtual Games.
What you can do here is:
Use equals instead of contains, like this:
"[text()='Games']"
or use starts-with:
"[starts-with(text(), 'Games')]"
So this line para0= row.find_element(By.XPATH,"//h2[text()[contains(.,'Games')]]/following-sibling::p[following::h2[text()[contains(.,'Support')]]]").text can be changed to
para0= row.find_element(By.XPATH,"//h2[text()='Games']/following-sibling::p[following::h2[contains(.,'Support')]]").text
or
para0= row.find_element(By.XPATH,"//h2[starts-with(text(), 'Games')]/following-sibling::p[following::h2[contains(.,'Support')]]").text

Related

how to solve captcha

from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import csv
import requests
from csv import writer
url="https://allegro.pl/oferta/lurlego-76900-speed-champions-koenigsegg-jesko-11096977887"
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(url)
try:
title=driver.find_element(By.XPATH,'//div[#class="msub_k4"]//h4').text
except:
pass
print(title)
How to solve that will scraping the data any solution for it....is there any possible solution for that kindly recommend me any solution...

Selenium XPATH returns limited number of results

I am trying to scrape Google results to get URLs of Linkedin accounts:
import variables
import csv
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from parsel import Selector
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.google.com/')
search_bar = driver.find_element(By.NAME, 'q')
sleep(3)
search_bar.send_keys(variables.query)
sleep(3)
search_bar.send_keys(Keys.ENTER)
linkedin_users_urls_list = driver.find_elements(By.XPATH, '//div[#class="yuRUbf"]/a[#href]')
profile_urls = []
// only 11 results
# get urls
[profile_urls.append(users.get_attribute("href")) for users in linkedin_users_urls_list]
print(profile_urls)
// only 11 results
This code works but for some reason I only get like 11 results inprofile_urls array.
Why is this happening if Google returns like several thousand results containing the required URL? I also have changed the Google settings to output maximum number of results per page.
Is there some kind of limitation with XPATH argument?

website search bar doesn't work for python selenium

I would like auto-click the website and search for the information, but somehow the website cannot search, and keep loading. Or just close quickly after it print the key in search bar.
I would like auto-click the website and search for the information, and I tried:
import selenium
import pandas as pd
import numpy as np
import platform
import time
import random
from os import getcwd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-notification")
options.add_argument("--disable-infobars")
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument("--remote-debugging-port=9230")
#options.add_argument("--headless")
url = 'https://vip.stock.finance.sina.com.cn/mkt/#hs_z'
driver.get(url)
w = WebDriverWait(driver, 10)
w.until(EC.presence_of_element_located((By.XPATH, '//*[#id="inputSuggest"]')))
driver.find_element('xpath', '//*[#id="inputSuggest"]').clear()
driver.find_element('xpath', '//*[#id="inputSuggest"]').send_keys('sz111973'))
driver.find_element('xpath', '//*[#id="SSForm"]/input[3]').click()
But somehow the website cannot search, and keep loading. Or just close quickly after it print the key in search bar.
Any help will be appreciated! Thanks.
There are several issues here:
to prevent site from very long loading you can use eager pageLoadStrategy.
I see redundant ) at the end of this line driver.find_element('xpath', '//*[#id="inputSuggest"]').send_keys('sz111973'))
The following code works perfect:
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
caps = DesiredCapabilities().CHROME
caps["pageLoadStrategy"] = "eager"
webdriver_service = Service('C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options, desired_capabilities=caps,)
url = 'https://vip.stock.finance.sina.com.cn/mkt/#hs_z'
driver.get(url)
wait = WebDriverWait(driver, 20)
input = wait.until(EC.element_to_be_clickable((By.ID, 'inputSuggest')))
input.clear()
input.send_keys('sz111973')
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#inputSuggest +input'))).click()

How to redirect output of a program to a dataframe

how to redirect output of a file to a dataframe.
this code opens browser and types the given data provided and gets the first data available.
code :
selenium pincodes
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import html5lib
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
pin=['mumbai','newyork']
for i in pin :
url = "https://www.google.com/"
chromedriver = r"C:\Users\me\chromedriver"
driver = webdriver.Chrome(chromedriver)
driver.implicitly_wait(30)
driver.get(url)
search = driver.find_element_by_name('q')
search.send_keys(i,'pincode')
search.send_keys(Keys.RETURN)
WebDriverWait(driver, 10).until(expected_conditions.visibility_of_element_located((By.XPATH, '//div[#class="IAznY"]//div[#class="title"]')))
elmts = driver.find_elements_by_xpath('//div[#class="IAznY"]//div[#class="title"]')
print(i,elmts[0].text)
time.sleep(3)
driver.quit()
this code outputs the following
newyork 10001
mumbai 230532
how to redirect this output into df like this
city pincode
newyork 10001
mumbai 230532
Declare two arrays and add into dataframe.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import html5lib
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
pin=['mumbai','newyork']
#Declare list here
City=[]
PinCode=[]
for i in pin :
url = "https://www.google.com/"
chromedriver = r"C:\Users\me\chromedriver"
driver = webdriver.Chrome(chromedriver)
driver.implicitly_wait(30)
driver.get(url)
search = driver.find_element_by_name('q')
search.send_keys(i,'pincode')
search.send_keys(Keys.RETURN)
WebDriverWait(driver, 10).until(expected_conditions.visibility_of_element_located((By.XPATH, '//div[#class="IAznY"]//div[#class="title"]')))
elmts = driver.find_elements_by_xpath('//div[#class="IAznY"]//div[#class="title"]')
#Append the data into list
City.append(i)
PinCode.append(elmts[0].text)
#added into dataframe
df=pd.DataFrame({"City":City,"PinCode":PinCode})
print(df)
time.sleep(3)
driver.quit()
Output:
City PinCode
0 mumbai 230532
1 newyork 10001
Like this?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import html5lib
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
pin=['mumbai','newyork']
df_output = pd.DataFrame(columns=["City", "pincode"])
for i in pin :
url = "https://www.google.com/"
chromedriver = r"C:\Users\me\chromedriver"
driver = webdriver.Chrome(chromedriver)
driver.implicitly_wait(30)
driver.get(url)
search = driver.find_element_by_name('q')
search.send_keys(i,'pincode')
search.send_keys(Keys.RETURN)
WebDriverWait(driver, 10).until(expected_conditions.visibility_of_element_located((By.XPATH, '//div[#class="IAznY"]//div[#class="title"]')))
elmts = driver.find_elements_by_xpath('//div[#class="IAznY"]//div[#class="title"]')
print(i,elmts[0].text)
df_output = df_output.append(pd.DataFrame(columns=["City", "pincode"], data=[[i,elmts[0].text]]))
time.sleep(3)
driver.quit()
print(df_output)
executing a script (as a subprocess) from another script and parsing the (redirected) output can technically be done, but that's definitly not the right approach (nor the easiest actually).
Instead, you want to refactor your scripts as functions in modules, and have one function working on the results of the other.

Each time I run Selenium to ping a specific site, my WebDriver ends up going to a completely different URL

I want to go to this site: https://www.pluginillinois.org/offers.aspx?said=2
Each time I have some code that looks like the following:
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
import pyautogui
driverpath = r"C:\Users\chromedriver.exe"
browserProfile = webdriver.ChromeOptions()
browserProfile.add_experimental_option('prefs', {'intl.accept_languages': 'en,en_US'})
browser = webdriver.Chrome(driverpath, options=browserProfile)
browser.get('https://www.pluginillinois.org/offers.aspx?said=2')
My Selenium browser will open and go to a completely different URL instead. The browser will instead go to https://www.pluginillinois.org/OffersBegin.aspx.
Is there a way to obfuscate this behavior?

Categories