I am trying to tab through an Instagram page given a user input. I am able to get to the page. The page loads, then the class is found, then the code breaks. Here is my code:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
import time
def get_posts(tag, user_count):
    '''Getting html of page to be scrape for
    hrefs that will get me the user names'''
    print("getting page")
    url = "https://www.instagram.com/explore/tags/" + tag + "/"
    try:
        driver = webdriver.Chrome()
        driver.get(url)
        print("successfully requested site")
    except:
        print("Unable to reach site")
        quit()

    browser = driver.find_element_by_class_name('_si7dy')
    actions = ActionChains(browser)
    for i in range(user_count):
        actions = actions.send_keys(Keys.TAB)
        time.sleep(0.5)
    actions.perform()

    soup = BeautifulSoup(driver.page_source, 'lxml')
    try:
        posts = soup.find_all("div", class_=["_mck9w", "_gvoze", "_f2mse"])
    except:
        print("No links found")
        quit()

    print("Length of posts: ", (len(posts)))
    print(len(posts))
    print(type(posts))
    print("All Done")
    driver.close()
    return posts
I keep getting this error:
packages\selenium\webdriver\common\action_chains.py", line 69, in __init__
if self._driver.w3c:
AttributeError: 'WebElement' object has no attribute 'w3c'
I have searched around but have not found any info on w3c. I have never tabbed down a page before and so am using the answer found here: Send multiple tab key presses with selenium.
ActionChains seems like the best way to tab multiple times down the page, but if anyone has a better method I am open to trying that.
ActionChains should receive a WebDriver instance, but you are passing a WebElement instead:
actions = ActionChains(driver)
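For reference, here is a minimal sketch of how the corrected chain could look, assuming driver and user_count are the same objects as in the question; the TAB presses go to whichever element currently has focus, so the chain no longer needs the '_si7dy' element at all:

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

# Build the chain on the driver (a WebDriver), not on a WebElement.
actions = ActionChains(driver)
for _ in range(user_count):
    # Queue one TAB per iteration; nothing is sent until perform().
    actions.send_keys(Keys.TAB)
actions.perform()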
I am currently trying to scrape news headlines with Python, Beautiful Soup and Selenium from a website with a "show more" button. I am able to successfully load the page with Selenium, click the button to bring up more headlines, and then print out the headlines, all with no error messages. My problem is that Beautiful Soup is not reading the contents of the driver after the "show more" button is clicked. It is only reading the headlines that are initially on the page before the button is clicked. How do I make it so that the headlines are read and printed out only after the "show more" button is clicked a certain number of times? I have a for loop rather than a while loop so that I can click the button n times.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup

s = Service('/Users/comp/Desktop/chromedriver')
driver = webdriver.Chrome(service=s)

url = 'https://www.foxnews.com/politics'
driver.get(url)

for x in range(10):
    try:
        loadMoreButton = driver.find_element(By.XPATH, "/html/body/div[2]/div/div/div/div[2]/div/main/section[4]/footer/div/a")
        time.sleep(3)
        loadMoreButton.click()
        time.sleep(3)
    except Exception as e:
        print(e)
        break

time.sleep(3)

soup = BeautifulSoup(driver.page_source, 'lxml')
headlines = soup.find('body').find_all('h4')
for x in headlines:
    print(x.text.strip())

time.sleep(3)
driver.quit()
Try the code below. It is working now:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup

s = Service('./chromedriver')
driver = webdriver.Chrome(service=s)

url = 'https://www.foxnews.com/politics'
driver.get(url)
time.sleep(3)

for x in range(10):
    try:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        headlines = soup.find('body').find_all('h4')
        for x in headlines:
            print(x.text.strip())
        loadMoreButton = driver.find_element(By.XPATH, "/html/body/div[2]/div/div/div/div[2]/div/main/section[4]/footer/div/a")
        if loadMoreButton:
            loadMoreButton.click()
            time.sleep(3)
    except Exception as e:
        print(e)
        break
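The key difference from the question's code is that driver.page_source is re-parsed inside the loop, after each click, so the headlines loaded by the "show more" button are picked up. If the fixed sleeps turn out to be flaky, one possible refinement (using the same XPath as above, which may of course break if the site's layout changes) is to wait for the button to become clickable instead:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

BUTTON_XPATH = "/html/body/div[2]/div/div/div/div[2]/div/main/section[4]/footer/div/a"

# Wait up to 10 seconds for the "show more" link to become clickable,
# then click it; this replaces the fixed time.sleep(3) calls.
loadMoreButton = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, BUTTON_XPATH))
)
loadMoreButton.click()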
Currently having quite the issue with selenium.
I am trying to get all the links on a page, click each one, obtain the data from that page and go back. Even when handling the StaleElementReferenceException, it completely breaks the loop, despite using driver.back() as is advised.
The code is as follows:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from datetime import datetime
from pymongo import MongoClient
from selenium.common.exceptions import StaleElementReferenceException

options = Options()
options.page_load_strategy = 'none'
# options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

url = "https://www.depop.com/purevintage_clothing/"
# driver = webdriver.Chrome()
driver.get(url)

for link in links:
    linkClass = link.get_attribute("class")
    try:
        if str(linkClass[:19]) == "styles__ProductCard":
            action = ActionChains(driver)
            action.move_to_element(link)
            action.click().perform()
            product = doSomethingFunction()
            if product != None:
                insertIntoDatabase(product)
            driver.back()
    except StaleElementReferenceException as e:
        print(e)
        driver.back()
I am aware the indentation is a bit dodgy here; I wrote this out manually, as I'm not sure the rest of the processing code, such as insertIntoDatabase, is relevant here (please let me know if you need all of it).
Whenever I do this I end up with the exception raised in a loop despite the driver.back(). I'm sure the answer is staring me in the face and I'm a bit too dense to see it, but any help is appreciated.
Every time you go back to the main page you need to get the links again, because the old references are not valid anymore once you have changed page; so you should do as follows:
links = driver.find_elements_by_xpath(path_to_elements)

for i in range(len(links)):
    link = driver.find_elements_by_xpath(path_to_elements)[i]
    linkClass = link.get_attribute("class")
    if str(linkClass[:19]) == "styles__ProductCard":
        action = ActionChains(driver)
        action.move_to_element(link)
        action.click().perform()
        product = doSomethingFunction()
        if product != None:
            insertIntoDatabase(product)
        driver.back()
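As a possible follow-up, if the listing takes a moment to re-render after driver.back(), a small helper along these lines (path_to_elements is the same placeholder XPath as above) waits for the product cards to be back in the DOM before returning fresh, non-stale references:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def get_links(driver, path_to_elements, timeout=10):
    # Wait until the product cards are present again (e.g. after driver.back())
    # and return fresh element references, avoiding StaleElementReferenceException.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.XPATH, path_to_elements))
    )

# Inside the loop above, use this in place of the direct find_elements call:
# link = get_links(driver, path_to_elements)[i]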
I'm trying to get a value that is given by the website after a click on a button.
Here is the website: https://www.4devs.com.br/gerador_de_cpf
You can see that there is a button called "Gerar CPF", this button provides a number that appears after the click.
My current script opens the browser and gets the value, but I'm getting the value from the page before the click, so the value is empty. I would like to know if it is possible to get the value after the click on the button.
from selenium import webdriver
from bs4 import BeautifulSoup
from requests import get

url = "https://www.4devs.com.br/gerador_de_cpf"

def open_browser():
    driver = webdriver.Chrome("/home/felipe/Downloads/chromedriver")
    driver.get(url)
    driver.find_element_by_id('bt_gerar_cpf').click()

def get_cpf():
    response = get(url)
    page_with_cpf = BeautifulSoup(response.text, 'html.parser')
    cpf = page_with_cpf.find("div", {"id": "texto_cpf"}).text
    print("The value is: " + cpf)

open_browser()
get_cpf()
open_browser and get_cpf are absolutely not related to each other: get_cpf downloads the page again with requests, so it never sees the click that happened in the Selenium browser.
Actually you don't need get_cpf at all. Just wait for text after clicking the button:
from selenium.webdriver.support.ui import WebDriverWait as wait

def open_browser():
    driver = webdriver.Chrome("/home/felipe/Downloads/chromedriver")
    driver.get(url)
    driver.find_element_by_id('bt_gerar_cpf').click()
    text_field = driver.find_element_by_id('texto_cpf')
    text = wait(driver, 10).until(lambda driver: not text_field.text == 'Gerando...' and text_field.text)
    return text

print(open_browser())
Update
The same with requests:
import requests
url = 'https://www.4devs.com.br/ferramentas_online.php'
data = {'acao': 'gerar_cpf', 'pontuacao': 'S'}
response = requests.post(url, data=data)
print(response.text)
You don't need to use requests and BeautifulSoup.
from selenium import webdriver
from time import sleep

url = "https://www.4devs.com.br/gerador_de_cpf"

def get_cpf():
    driver = webdriver.Chrome("/home/felipe/Downloads/chromedriver")
    driver.get(url)
    driver.find_element_by_id('bt_gerar_cpf').click()
    sleep(10)
    text = driver.find_element_by_id('texto_cpf').text
    print(text)

get_cpf()
Can you use a while loop that runs until the text changes?
from selenium import webdriver

url = "https://www.4devs.com.br/gerador_de_cpf"

def get_value():
    driver = webdriver.Chrome()
    driver.get(url)
    driver.find_element_by_id('bt_gerar_cpf').click()
    while driver.find_element_by_id('texto_cpf').text == 'Gerando...':
        continue
    val = driver.find_element_by_id('texto_cpf').text
    driver.quit()
    return val

print(get_value())
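One caveat with this approach: the bare continue busy-waits, polling the driver as fast as possible. A small tweak, assuming the same texto_cpf element id as above, is to sleep briefly between checks:

from time import sleep

# Poll every 200 ms instead of spinning at full speed.
while driver.find_element_by_id('texto_cpf').text == 'Gerando...':
    sleep(0.2)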
I recommend this website that does exactly the same thing.
https://4devs.net.br/gerador-cpf
But to get the "gerar cpf" action with selenium, you can inspect the HTML source code with a browser and click on "copy XPath for this element".
It is much simpler than manually searching for the elements in the page.
I am trying to access all href links from a website, the search results to be precise. My first intention is to get all the links and then look into them further. The problem is that I get some links from the website, but not the links of the search results. Here is one version of my code.
from selenium import webdriver
from htmldom import htmldom

dom = htmldom.HtmlDom("myWebsite")
dom = dom.createDom()

p_links = dom.find("a")
for link in p_links:
    print("URL: " + link.attr("href"))
Here is a screenshot of the HTML of that particular website. In the screenshot, I marked the href link I am trying to access. I am open to any help given, be it with Selenium, htmldom, BeautifulSoup, etc.
The data you are after is loaded with AJAX requests, so you can't scrape it directly from the initial page source. But the AJAX request is sent to this URL:
https://open.nrw/solr/collection1/select?q=*%3A*&fl=validated_data_dict%20title%20groups%20notes%20maintainer%20metadata_modified%20res_format%20author_email%20name%20extras_opennrw_spatial%20author%20extras_opennrw_groups%20extras_opennrw_format%20license_id&wt=json&fq=-type:harvest+&sort=title_string%20asc&indent=true&rows=20
which returns the data in JSON format. You can use the requests module to scrape this data.
import requests

BASE_URL = 'https://open.nrw/dataset/'

r = requests.get('https://open.nrw/solr/collection1/select?q=*%3A*&fl=validated_data_dict%20title%20groups%20notes%20maintainer%20metadata_modified%20res_format%20author_email%20name%20extras_opennrw_spatial%20author%20extras_opennrw_groups%20extras_opennrw_format%20license_id&wt=json&fq=-type:harvest+&sort=title_string%20asc&indent=true&rows=20')
data = r.json()

for item in data['response']['docs']:
    print(BASE_URL + item['name'])
Output:
https://open.nrw/dataset/mags-90-10-dezilsverhaeltnis-der-aequivalenzeinkommen-1512029759099
https://open.nrw/dataset/alkis-nutzungsarten-pro-baublock-wuppertal-w
https://open.nrw/dataset/allgemein-bildende-schulen-am-1510-nach-schulformen-schulen-schueler-und-lehrerbestand-w
https://open.nrw/dataset/altersgruppen-in-meerbusch-gesamt-meerb
https://open.nrw/dataset/amtliche-stadtkarte-wuppertal-raster-w
https://open.nrw/dataset/mais-anteil-abhaengig-erwerbstaetiger-mit-geringfuegiger-beschaeftigung-1477312040433
https://open.nrw/dataset/mags-anteil-der-stillen-reserve-nach-geschlecht-und-altersgruppen-1512033735012
https://open.nrw/dataset/mags-anteil-der-vermoegenslosen-in-nrw-nach-beruflicher-stellung-1512032087083
https://open.nrw/dataset/anzahl-kinderspielplatze-meerb
https://open.nrw/dataset/anzahl-der-sitzungen-von-rat-und-ausschussen-meerb
https://open.nrw/dataset/anzahl-medizinischer-anwendungen-den-oeffentlichen-baedern-duesseldorfs-seit-2006-d
https://open.nrw/dataset/arbeitslose-den-wohnquartieren-duesseldorf-d
https://open.nrw/dataset/arbeitsmarktstatistik-arbeitslose-gelsenkirchen-ge
https://open.nrw/dataset/arbeitsmarktstatistik-arbeitslose-nach-rechtskreisen-des-sgb-ge
https://open.nrw/dataset/arbeitsmarktstatistik-arbeitslose-nach-stadtteilen-gelsenkirchen-ge
https://open.nrw/dataset/arbeitsmarktstatistik-sgb-ii-rechtskreis-auf-stadtteilebene-gelsenkirchen-ge
https://open.nrw/dataset/arbeitsmarktstatistik-sozialversicherungspflichtige-auf-stadtteilebene-gelsenkirchen-ge
https://open.nrw/dataset/verkehrszentrale-arbeitsstellen-in-nordrhein-westfalen-1476688294843
https://open.nrw/dataset/mags-arbeitsvolumen-nach-wirtschaftssektoren-1512025235377
https://open.nrw/dataset/mais-armutsrisikoquoten-nach-geschlecht-und-migrationsstatus-der-personen-1477313317038
As you can see, this returned the first 20 URLs. When you first load the page, only 20 items are present; if you scroll down, more are loaded. To get more items, you can change the query string parameters in the URL: it ends with rows=20, and you can change this number to get the desired number of results.
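For readability, the same request can be written with a params dict so that rows is easy to adjust. This is just a sketch of the decoded form of the URL above, assuming the Solr endpoint accepts the parameters individually:

import requests

BASE_URL = 'https://open.nrw/dataset/'
SOLR_URL = 'https://open.nrw/solr/collection1/select'

params = {
    'q': '*:*',
    'fl': ('validated_data_dict title groups notes maintainer metadata_modified '
           'res_format author_email name extras_opennrw_spatial author '
           'extras_opennrw_groups extras_opennrw_format license_id'),
    'wt': 'json',
    'fq': '-type:harvest',
    'sort': 'title_string asc',
    'indent': 'true',
    'rows': 100,  # raise or lower this to control how many results come back
}

r = requests.get(SOLR_URL, params=params)
for item in r.json()['response']['docs']:
    print(BASE_URL + item['name'])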
Results appear after the initial page load due to the AJAX request.
I managed to get the links with Selenium; however, I had to wait for the .ckantitle a elements to be loaded (these are the links you want to get).
I should mention that the webdriver will wait for a page to load by default. It does not wait for loading inside frames or for AJAX requests. It means that when you use .get('url'), your browser will wait until the page is completely loaded and then go to the next command in the code. But when you are posting an AJAX request, webdriver does not wait, and it's your responsibility to wait an appropriate amount of time for the page, or a part of the page, to load; so there is a module named expected_conditions.
Code:
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

url = 'https://open.nrw/suche'
html = None

browser = webdriver.Chrome()
browser.get(url)

delay = 3  # seconds

try:
    WebDriverWait(browser, delay).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.ckantitle a'))
    )
    html = browser.page_source
except TimeoutException:
    print('Loading took too much time!')
finally:
    browser.quit()

if html:
    soup = BeautifulSoup(html, 'lxml')
    links = soup.select('.ckantitle a')
    for link in links:
        print(urljoin(url, link['href']))
You need to install selenium:
pip install selenium
and get a driver here.
I'm trying to visit the next page after searching. I'm getting the first page, but in order to go to the next page I need to scroll down to click the next-page element. I've tried different methods, as shown in the code, to scroll down the webpage, but despite all attempts I'm still getting an ElementNotVisibleException error. Can anyone tell me why the scrolling isn't working?
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
driver = selenium.webdriver.PhantomJS(executable_path=r'C:\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get('https://www.texasbar.com/am/Template.cfm;jsessionid=7EB4486736A022DC2AB99C24E9071D70.cfusion?Section=Find_A_Lawyer&template=/Customsource/MemberDirectory/Search_form_client_main.cfm&CFID=39868973&CFTOKEN=2f314a81f05a55c6-469AE4D3-91FD-AA7B-9D59C8F7DB39779F')
time.sleep(4)
elem = driver.find_element_by_id("Zip").send_keys("75001"+"\n")
time.sleep(6)
new = driver.find_element_by_css_selector("form[name=\"HiddenFormFields\"] > a.next-btn.btn")
driver.execute_script("window.scrollTo(0, 7664)")
#driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#driver.send_keys(Keys.END)
new.click()
time.sleep(4)
pagesource = driver.page_source
soup = BeautifulSoup(pagesource, 'html.parser')
print(soup)
Finally, I have solved the problem. Before getting the URL, I set the browser window size with driver.set_window_size(1124, 850) and that solved it.
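In context, a minimal sketch of where that call goes, keeping the rest of the question's script unchanged (the PhantomJS path is the one from the question; search_url is a placeholder for the long texasbar.com URL passed to driver.get there):

import selenium

driver = selenium.webdriver.PhantomJS(
    executable_path=r'C:\phantomjs-2.1.1-windows\bin\phantomjs.exe')

# Set a realistic viewport BEFORE loading the page, so the "next" button
# is laid out where it can be scrolled to and clicked.
driver.set_window_size(1124, 850)

driver.get(search_url)  # the texasbar.com search URL from the question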