I am very new to web scraping. I have the following url:
https://www.bloomberg.com/markets/symbolsearch
So, I use Selenium to enter a symbol into the Symbol textbox and press Find Symbols to get the details. This is the code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
driver = webdriver.Firefox()
driver.get("https://www.bloomberg.com/markets/symbolsearch/")
element = driver.find_element_by_id("query")
element.send_keys("WMT:US")
driver.find_element_by_name("commit").click()
It returns a table. How can I retrieve that? I am pretty clueless.
Second question: can I do this without Selenium, as it is slowing things down? Is there a way to find an API that returns JSON?
from selenium import webdriver
import time
from bs4 import BeautifulSoup

driver = webdriver.Firefox()
driver.get("https://www.bloomberg.com/markets/symbolsearch/")
element = driver.find_element_by_id("query")
element.send_keys("WMT:US")
driver.find_element_by_name("commit").click()
# give the results page time to load
time.sleep(5)
# parse the page the browser is already on; re-fetching driver.current_url
# with requests would start a fresh session without the browser's state
soup = BeautifulSoup(driver.page_source, 'html.parser')
a = soup.find_all("table", {"class": "dual_border_data_table"})
print(a)
Here is the complete code with which you can get the table you are looking for; do whatever you need with it after that. Hope it helps.
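A minimal sketch of pulling the rows out of the matched tables; the exact cell layout is an assumption, so adjust it to the markup you actually get back:

for table in a:
    for row in table.find_all("tr"):
        # collect header and data cells as plain text
        cells = [cell.get_text(strip=True) for cell in row.find_all(["th", "td"])]
        print(cells)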
The webpage I am trying to scrape can only be seen after login, so using a direct URL won't work. I need to scrape the data while I am logged in via my Chrome browser.
Then I need to get the value of the element from the page.
I have tried using the following code.
import requests
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())
lastdatadate=[]
lastprocesseddate=[]
# a plain requests call can't see the page because it requires a logged-in session
source = requests.get('webpage.com').text
# navigate the (logged-in) browser to the page before reading its source
driver.get('webpage.com')
content = driver.page_source
soup = bs(content, 'lxml')
#print(soup.prettify())
price = soup.find('span', attrs={'id':'calculatedMinRate'})
print(price.text)
You could still perform a login on the opened webdriver and fill in the input fields, as explained here: How to locate and insert a value in a text box (input) using Python Selenium?
Steps:
Fill in the input fields.
Find the submit button and trigger a click event.
Afterwards, add a sleep command; a few seconds should be enough.
Afterwards you should be able to get the data; a minimal sketch of the whole flow follows.
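In this sketch the field ids ('username', 'password'), the button name ('login'), and the URL are hypothetical placeholders, so swap in the real locators from your page:

import time
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://example.com/login')  # hypothetical login URL
driver.find_element_by_id('username').send_keys('my_user')  # fill in the input fields
driver.find_element_by_id('password').send_keys('my_pass')
driver.find_element_by_name('login').click()  # trigger the click event
time.sleep(3)  # a few seconds for the logged-in page to load
content = driver.page_source  # now contains the protected page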
Link that I am scraping: https://www.indusind.com/in/en/personal/cards/credit-card.html
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json, requests, re, sys
from selenium import webdriver

IndusInd_url = "https://www.indusind.com/in/en/personal/cards/credit-card.html"
html = requests.get(IndusInd_url)
soup = BeautifulSoup(html.content, 'lxml')
print(soup)
for x in soup.select("#display-product-cards .text-primary"):
    print(x.get_text())
Using the above code I am trying to scrape the titles of the cards, but unfortunately I am getting this output instead:
<html><body><p>This website is secured against online attacks. Your request was blocked due to suspicious behavior<br/>
<br/>
Client IP : 124.123.170.109<br/>
<br/>
Incident Time : 2021-02-24 06:28:10 UTC <br/>
<br/>
Incident ID : YDXx#m6g3nSFLvi5lGg4wgAAAf8<br/>
<br/>
If you feel it was a legitimate request, please contact the website owner for further investigation and remediation with a screenshot of this page.</p></body></html>
Is there any other alternative I can follow to scrape the details?
Any help is highly appreciated!
Please check this.
FYI: make sure you have the right driver (Firefox, Chrome, or whatever you use, with the right version).
from selenium import webdriver
from bs4 import BeautifulSoup
import time
url = 'https://www.indusind.com/in/en/personal/cards/credit-card.html'
# open the chrome driver
driver = webdriver.Chrome(executable_path='webdrivers/chromedriver.exe')
# pings the specified url
driver.get(url)
# sleep time to wait for t seconds to wait for page load
# replace 3 with any int value (int value in seconds)
time.sleep(3)
# gets the page source
pg = driver.page_source
# beautify with beautifulsoup
soup = BeautifulSoup(pg, 'html.parser')
# get the titles of the cards
for x in soup.select("#display-product-cards .text-primary"):
    print(x.get_text())
This can be achieved without BeautifulSoup.
I define the locator with the following XPath:
//div[@id='display-product-cards']//a[@class='card-title text-primary' and text()!='']
and use the expected condition .presence_of_all_elements_located.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome(executable_path='webdrivers/chromedriver.exe')
driver.get('https://www.indusind.com/in/en/personal/cards/credit-card.html')
wait = WebDriverWait(driver, 20)
elements = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@id='display-product-cards']//a[@class='card-title text-primary' and text()!='']")))
for element in elements:
    print(element.get_attribute('innerHTML'))
driver.quit()
I'm trying to scrape all of the images for a specific TripAdvisor page, but when using the find_elements_by_class_name function in Selenium, it gives me no values whatsoever. I am confused, as that is the exact class name for the values I want to iterate through and append to a list; here is the site. Any help would be greatly appreciated!
# importing dependencies
import re
import selenium
import io
import pandas as pd
import urllib.request
import urllib.parse
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
import time
from datetime import datetime
from selenium.webdriver.common.keys import Keys
#setup opening url window of website to be scraped
options = webdriver.ChromeOptions()
options.headless=False
prefs = {"profile.default_content_setting_values.notifications" : 2}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3") #possible issue by not including the file extension
driver.maximize_window()
time.sleep(5)
driver.get("""https://www.tripadvisor.com/""") #get the information from the page
#automate searching for hotels in specific city
driver.find_element_by_xpath('/html/body/div[2]/div/div[6]/div[1]/div/div/div/div/span[1]/div/div/div/a').click() #clicks on hotels option
driver.implicitly_wait(12) #allows xpath to be found
driver.find_element_by_xpath('//*[@id="BODY_BLOCK_JQUERY_REFLOW"]/div[12]/div/div/div[1]/div[1]/div/input').send_keys("Washington D.C.", Keys.ENTER) #change string to get certain city
time.sleep(8)
#now get current url
url = driver.current_url
response = requests.get(url)
response = response.text
data = BeautifulSoup(response, 'html.parser')
#get list of all hotels
hotels = driver.find_elements_by_class_name("prw_rup prw_meta_hsx_responsive_listing ui_section listItem")
print("Total Number of Hotels: ", len(hotels))
I would recommend that, if you use Selenium, you don't use BeautifulSoup alongside it, because you can get whatever you want using Selenium alone.
You can simply achieve your goal as follows:
driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3")
driver.maximize_window()
driver.get("https://www.tripadvisor.ca/Hotels")
time.sleep(1)
driver.implicitly_wait(12)
driver.find_element_by_xpath('//*[@class="typeahead_input"]').send_keys("Washington D.C.", Keys.ENTER)
time.sleep(1)
hotels = driver.find_elements_by_xpath('//*[@class="listing collapsed"]')
print("Total Number of Hotels: ", len(hotels))
Please note that using this code you would get the first 30 hotels (i.e., the first page). You would need to loop through all the pages of hotels for the specified city in order to get them all; a rough pagination sketch follows.
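In this sketch the 'next' button locator is an assumption and will likely need updating against the live page:

while True:
    hotels = driver.find_elements_by_xpath('//*[@class="listing collapsed"]')
    print("Hotels on this page: ", len(hotels))
    # hypothetical locator for the 'next page' arrow -- inspect the page for the real one
    next_buttons = driver.find_elements_by_xpath('//a[contains(@class, "next")]')
    if not next_buttons:
        break  # no further pages
    next_buttons[0].click()
    time.sleep(2)  # give the next page time to load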
Hope it helps.
Hello, I want to scrape data from a site with an age-verification pop-up using Python 3.x and BeautifulSoup. I can't get to the underlying text and images without clicking "yes" for "are you over 21". Thanks for any support.
EDIT: Thanks, with some help from a comment I see that I can use the cookies, but I am not sure how to manage/store/call cookies with the requests package.
EDIT 2: With some help from another user I am now using the selenium package so that it will also work in case it's a graphical overlay (I think?). I am having trouble getting it to work with the gecko driver but will keep trying! Thanks for all the advice again, everyone.
EDIT 3: OK, I have made progress and I can get the browser window to open using the gecko driver! Unfortunately it doesn't like that link specification so I'm posting again. The link to click "yes" on the age verification is buried on that page as something called mlink...
EDIT 4: Made some progress, updated code is below. I managed to find the element in the XML code; now I just need to manage to click the link.
#
import time
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
driver = webdriver.Firefox(executable_path=r'/Users/jeff/Documents/geckodriver') # Optional argument, if not specified will search path.
url = 'https://www.shopharborside.com/oakland/#/shop/412'
driver.get(url)
#
driver.find_element_by_class_name('hhc_modal-body').click(Yes)
#wait.1.second
time.sleep(1)
pagesource = driver.page_source
soup = BeautifulSoup(pagesource)
#you.can.now.enjoy.soup
print(soup.prettify())
Edit new: Stuck again; here is the current code. I seem to have isolated the element "myBtnYes", but I get an error when running the code:
ElementClickInterceptedException: Message: Element is not clickable at point (625,278.5500030517578) because another element obscures it
import time
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
driver = webdriver.Firefox(executable_path=r'/Users/jeff/Documents/geckodriver') # Optional argument, if not specified will search path.
url = 'https://www.shopharborside.com/oakland/#/shop/412'
driver.get(url)
#
driver.find_element_by_id('myBtnYes').click()
#wait.1.second
time.sleep(1)
pagesource = driver.page_source
soup = BeautifulSoup(pagesource)
#you.can.now.enjoy.soup
print(soup.prettify())
If your aim is to click through the verification, go with Selenium:
PS: install selenium (pip install selenium) and get geckodriver (Firefox) or chromedriver (Chrome).
# Mossein~King (hi, I'm here to help)
import time
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

# this is for headless mode -- this will save you a bunch of research time (trust me)
options = Options()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)
# for a graphical browser (you need geckodriver for Firefox)
# driver = webdriver.Firefox()

url = 'your-url'
driver.get(url)
# find the link to click
# example: if <a class='MosseinKing'>
driver.find_element_by_xpath("//a[@class='MosseinKing']").click()
# wait 1 second in case of transitions
time.sleep(1)
pagesource = driver.page_source
soup = BeautifulSoup(pagesource, 'html.parser')
# you can now enjoy the soup
print(soup.prettify())
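If the click is intercepted because an overlay obscures the button (the ElementClickInterceptedException from the question above), a common workaround is to click the element through JavaScript instead; a short sketch, reusing the 'myBtnYes' id from the question's code:

button = driver.find_element_by_id('myBtnYes')
# execute_script clicks the element directly, regardless of what is drawn on top of it
driver.execute_script("arguments[0].click();", button)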
I'm trying to scrape a JavaScript-enabled page using BS and Selenium.
I have the following code so far. Somehow it still doesn't pick up the JavaScript-rendered content (and returns a null value). In this case I'm trying to scrape the Facebook comments at the bottom. (Inspect element shows the class as postText.)
Thanks for the help!
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
import BeautifulSoup
browser = webdriver.Firefox()
browser.get('http://techcrunch.com/2012/05/15/facebook-lightbox/')
html_source = browser.page_source
browser.quit()
soup = BeautifulSoup.BeautifulSoup(html_source)
comments = soup("div", {"class":"postText"})
print comments
There are some mistakes in your code that are fixed below. However, the class "postText" must exist elsewhere, since it is not defined in the original source code.
My revised version of your code was tested and is working on multiple websites.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
browser = webdriver.Firefox()
browser.get('http://techcrunch.com/2012/05/15/facebook-lightbox/')
html_source = browser.page_source
browser.quit()
soup = BeautifulSoup(html_source, 'html.parser')
# the class "postText" is not present in the initial page source
comments = soup.findAll('div', {'class': 'postText'})
print(comments)
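If the element only appears after JavaScript runs, it can help to wait for it explicitly before grabbing page_source instead of quitting right away; a minimal sketch, assuming the postText class does eventually appear in the DOM:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

browser = webdriver.Firefox()
browser.get('http://techcrunch.com/2012/05/15/facebook-lightbox/')
# wait up to 15 seconds for at least one element with class "postText"
WebDriverWait(browser, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'postText')))
soup = BeautifulSoup(browser.page_source, 'html.parser')
browser.quit()
print(soup.findAll('div', {'class': 'postText'}))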