I'm trying to scrape the player statistics in the Totals table at this link: http://www.basketball-reference.com/players/j/jordami01.html. The table is much harder to scrape in the format it first loads in, but the site offers a 'CSV' button right above the table, and that format would be much easier to digest.
I'm having trouble getting Selenium to click that button. Here is my code:
import urllib2
from bs4 import BeautifulSoup
from selenium import webdriver
player_link = "http://www.basketball-reference.com/players/j/jordami01.html"
browser = webdriver.Firefox()
browser.get(player_link)
elem = browser.find_element_by_xpath("//span[@class='tooltip' and @onclick=\"table2csv('totals')\"]")
elem.click()
When I run this, a Firefox window pops up, but the code never changes the table from its original format to CSV. The CSV table only pops up in the source code after I click CSV (obviously). How can I get selenium to click that CSV button and then BS to scrape the data?
You don't need BeautifulSoup here. Click the CSV button with selenium, extract the contents of the pre element that appears with the CSV data, and parse it with the built-in csv module:
import csv
from StringIO import StringIO
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
player_link = "http://www.basketball-reference.com/players/j/jordami01.html"
browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)
browser.set_page_load_timeout(10)
# stop load after a timeout
try:
    browser.get(player_link)
except TimeoutException:
    browser.execute_script("window.stop();")
# click "CSV"
elem = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='table_heading']//span[. = 'CSV']")))
elem.click()
# get CSV data
csv_data = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "pre#csv_totals"))).text.encode("utf-8")
browser.close()
# read CSV
reader = csv.reader(StringIO(csv_data))
for line in reader:
    print(line)
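Note that the snippet above is Python 2 (the StringIO module, and urllib2 in the question). For Python 3, a minimal adjustment, sketched here assuming the rest of the flow stays the same, is to keep the element text as str (no .encode()) and import StringIO from io:
import csv
from io import StringIO

# keep the pre element's text as str -- no .encode() needed in Python 3
csv_data = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "pre#csv_totals"))).text
reader = csv.reader(StringIO(csv_data))
for line in reader:
    print(line)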
I'm trying to web scrape using Selenium, Python and Beautiful Soup. I am scraping this page, but I want to scrape information off the pop-up window that appears when you click on the 'i' (information) icons in the corner of each product. My code is as follows:
import requests
from bs4 import BeautifulSoup
import time
import selenium
import math
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import chromedriver_binary
import re
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome(ChromeDriverManager().install())
r = requests.get('https://dmarket.com/csgo-skins/product-card/ak-47-redline/field-tested')
driver.get('https://dmarket.com/csgo-skins/product-card/ak-47-redline/field-tested')
html_getter = BeautifulSoup(r.text, "html.parser")
data = html_getter.findAll(attrs={"class":"c-asset__priceNumber"})
dataskin = html_getter.findAll(attrs={"class" : "c-asset__exterior"})
time.sleep(2)
driver.find_element_by_id("onesignal-slidedown-cancel-button").click()
time.sleep(2)
driver.find_element_by_class_name("c-dialogHeader__close").click()
time.sleep(30)
driver.find_element_by_class_name("c-asset__action--info").click()
time.sleep(30)
price_element = driver.switch_to.active_element
print("<<<<<TEXT>>>>>")
print(price_element.text)
print("<<<<<END>>>>>")
driver.close()
However, when I run this, the only text that prints is "close." If you inspect the information pop-up, it should print out the price, data from the chart, etc. How can I get it to print this info? Specifically, I want the amount sold on the most recent day and the price listed on the chart for the most recent day (both seem to be accessible in Chrome DevTools). I don't think I'm looking at the wrong frame, as I switch to the active element, so I'm not sure how to fix this!
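A likely explanation is that after the click the keyboard focus lands on the pop-up's close button, so switch_to.active_element returns that button and its text is just "close". A sketch of a more direct approach, with a hypothetical dialog selector that you would need to replace with the real container class from DevTools, is to wait for the dialog itself and read its text:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)
# ".dialog-container" is an assumed selector -- inspect the pop-up in
# DevTools and substitute the container's actual tag or class name
dialog = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".dialog-container")))
print(dialog.text)  # full text of the pop-up, including price and chart labels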
Sup, I'm trying to extract some data tables from a website (https://www.anbima.com.br/pt_br/informar/curvas-de-juros-fechamento.htm), but as we can see the data is inside an iframe. Since I'm not an expert at web scraping, it took me a while to click the "Consultar" button to reach the page that I want. Basically, it loads the data (4 tables), which is inside an iframe too.
The problem is that I still haven't had any successful attempt to get the tables; maybe that's because of the iframe.
For example, I tried to use XPath on the first table without success:
driver.find_element_by_xpath('//*[@id="Parametros"]/table').text
Here's the code to reach the page that I mentioned:
from selenium import webdriver
import time
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as expectedCondition
from selenium.webdriver.chrome.options import Options
import pandas as pd
import numpy as np
#----------------------- INICIALIZAÇÃO DO SCRAPING -----------------------------#
want_to_scrape = True
if want_to_scrape:
    options = Options()
    #options.add_argument('--headless')
    driver = webdriver.Chrome("C:\\Users\\......\\chromedriver.exe", options=options)
    now = time.time()
    dataset_list = []
    url = 'https://www.anbima.com.br/pt_br/informar/curvas-de-juros-fechamento.htm'
    driver.get(url)
    #element = driver.find_element_by_class_name('full')
    #driver.switch_to.frame(element)
    driver.switch_to.frame(0)
    element = driver.find_elements_by_name('Consultar')
    element[0].click()
    time.sleep(1)
    try:
        alert = driver.switch_to.alert
        alert.accept()
        print("alert accepted")
    except:
        print("no alert")
    time.sleep(1)
    driver.switch_to.frame(0)
    driver.find_element_by_xpath
Try replacing your driver.switch_to.frame(0) line with this:
# Get the iframe element - note, may need to use more specialized selector here
iframe = driver.find_element_by_tag_name('iframe')
driver.switch_to.frame(iframe)
That will get your driver into the frame context so you can fetch the tables. You may need to use a different selector to get the iframe element. If you post the iframe HTML here, I can help you write a selector.
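Once the driver is inside the right frame, a convenient way to grab all four tables at once, sketched here assuming pandas is installed and the data sits in plain HTML table elements, is to hand the frame's source to pandas.read_html:
import pandas as pd

# run this after driver.switch_to.frame(...) has landed in the frame
# that contains the tables; read_html parses every <table> it finds
tables = pd.read_html(driver.page_source)
for i, df in enumerate(tables):
    print(i, df.shape)
If the tables live in a second iframe nested inside the first, switch frames again before reading page_source.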
I am scraping a web page, I have managed to fetch the data from the table into a csv file using selenium. What I am struggling with is to fetch information from anchor tags present on each row of the table.
I have tried clicking all the anchor tags of the table in order to get information from the corresponding URLs, but it stops after clicking the first one with an error: Message: stale element reference: element is not attached to the page document.
I am not sure that it is the right approach to this problem.
Here's my code for what I have tried so far. I'm sorry if the code is not properly formatted; I am new to Python and Stack Overflow.
import csv
import requests
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Chrome(executable_path=r"D:\jewel\chromedriver.exe")
browser.get(('https://e-sourcingni.bravosolution.co.uk/web/login.shtml'))
signInButton = browser.find_element_by_css_selector(".only")
signInButton.click()
time.sleep(5)
table = browser.find_element_by_css_selector(".list-table")
for a in browser.find_elements_by_css_selector(".detailLink"):
    a.click()
    time.sleep(2)
    browser.execute_script("window.history.go(-1)")
    time.sleep(2)
with open('output.csv', "w") as f:
    writer = csv.writer(f)
    writer.writerow(["S.No","Status","Organization","Project Title","First Publishing Date","Work Category","Listing Deadline"])
    for row in table.find_elements_by_css_selector('tr'):
        writer.writerow([d.text for d in row.find_elements_by_css_selector('td')])
browser.close()
What I need is to fetch the data from the href of the tags having class detailLink, but I cannot settle on a proper approach to carry this out.
I have used a normal for loop to iterate over the table instead of a for-each loop, and the links are re-fetched on every iteration so the references don't go stale after navigating back. Try this and let me know how it goes.
import csv
import time
from selenium import webdriver
browser = webdriver.Chrome('/usr/local/bin/chromedriver') # Optional argument, if not specified will search path.
browser.implicitly_wait(5)
browser.execute_script("window.open('about:blank','tab1');")
browser.switch_to.window("tab1")
browser.get(('https://e-sourcingni.bravosolution.co.uk/web/login.shtml'))
signInButton = browser.find_element_by_css_selector(".only")
signInButton.click()
time.sleep(5)
table = browser.find_element_by_css_selector(".list-table")
links=browser.find_elements_by_css_selector(".detailLink")
for i in range(len(links)):
    # re-fetch the links on every iteration: the old references go
    # stale after navigating away and back
    links = browser.find_elements_by_css_selector(".detailLink")
    links[i].click()
    time.sleep(2)
    browser.execute_script("window.history.go(-1)")
    time.sleep(2)
with open('output.csv', "w") as f:
    writer = csv.writer(f)
    writer.writerow(["S.No","Status","Organization","Project Title","First Publishing Date","Work Category","Listing Deadline"])
    table = browser.find_elements_by_xpath("//table[@class='list-table']//tr")
    for row in range(len(table)):
        x = []
        # XPath positions are 1-based, hence row + 1
        for d in browser.find_elements_by_xpath("//table[@class='list-table']//tr[" + str(row + 1) + "]//td"):
            x.append(d.text.encode('utf-8'))
        writer.writerow(x)
browser.close()
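A related pattern avoids re-finding elements altogether: read the attribute you need into a plain Python list before any navigation happens, since strings never go stale the way element references do. A minimal sketch, assuming the links expose a real URL in their href (on this site they use an onclick handler instead, so you may need to build the URL from that as in the other answer):
links = browser.find_elements_by_css_selector(".detailLink")
hrefs = [a.get_attribute("href") for a in links]  # plain strings, never stale
for href in hrefs:
    browser.get(href)
    time.sleep(2)  # or use an explicit wait for the detail page content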
Yes: since clicking moves you to the next page, Selenium can no longer find elements from the previous page, so their references become stale once the page changes.
You can try this
import csv
import requests
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Chrome(executable_path=r"D:\jewel\chromedriver.exe")
browser.execute_script("window.open('about:blank','tab1');")
browser.switch_to.window("tab1")
browser.get("https://e-sourcingni.bravosolution.co.uk/web/login.shtml")
signInButton = browser.find_element_by_css_selector(".only")
signInButton.click()
time.sleep(5)
table = browser.find_element_by_css_selector(".list-table")
for a in table.find_elements_by_tag_name("a"):
    try:
        if a.get_attribute("class") == "detailLink":
            id = a.get_attribute("onclick")
            id = id.replace("javascript:goToDetail('", "")
            id = id.replace("', '02260');stopEventPropagation(event);", "")
            a_href = a.get_attribute("href")
            browser.execute_script("window.open('about:blank','tab2');")
            browser.switch_to.window("tab2")
            browser.get("https://e-sourcingni.bravosolution.co.uk/esop/toolkit/opportunity/opportunityDetail.do?opportunityId=" + id + "&oppList=CURRENT")
            time.sleep(2)
            # wait for the element to load
            browser.switch_to.window("tab1")
            # print("in it ")
    except:
        print("detailLink is not present in the a tag class")
with open('output.csv', "w") as f:
    writer = csv.writer(f)
    writer.writerow(["S.No","Status","Organization","Project Title","First Publishing Date","Work Category","Listing Deadline"])
    for row in table.find_elements_by_css_selector('tr'):
        writer.writerow([d.text for d in row.find_elements_by_css_selector('td')])
browser.close()
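One caveat on the two replace() calls above: they hardcode the second goToDetail argument ('02260'), which may differ per listing. A slightly more general extraction, sketched here with the re module on the assumption that the onclick always has the form goToDetail('<id>', ...), is:
import re

onclick = a.get_attribute("onclick")
# assumes onclick looks like: javascript:goToDetail('12345', '02260');...
match = re.search(r"goToDetail\('(\d+)'", onclick)
if match:
    opportunity_id = match.group(1)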
I am trying to put together a web scraper to get locations by zip code entered by the user. As of right now I am able to navigate to the website, but I am not able to click on the drop-down button that allows you to enter a zip code. Here is what I have so far:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd
from selenium.webdriver.common.by import By
zipcode = input("What zip code would you like to search? ")
out_table = 'Oreilly_autp_parts_addresses_{}.csv'.format(zipcode)
#Using Selenium to navigate to website, search zipcode and get html data
driver = webdriver.Chrome() # requires chromedriver.exe
driver.get('https://www.oreillyauto.com/')
time.sleep(2)
driver.maximize_window()
el = driver.find_element_by_class_name("site-store")
time.sleep(2)
driver.execute_script("arguments[0].setAttribute('class','site-store site-nav_item dispatcher-trigger--active')", el)
It seems to be clicking on the correct element, but the drop-down that is supposed to show up isn't there.
Any help is much appreciated!
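Setting the class attribute with execute_script only changes the element's styling; it doesn't fire the site's click handlers, which is probably why the drop-down never opens. A minimal sketch of the usual alternative, assuming the clickable trigger is the site-store element itself (verify the selector in DevTools), is to wait until it is clickable and click it, falling back to a JavaScript click if an overlay intercepts:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)
el = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "site-store")))
try:
    el.click()
except Exception:
    # fallback: dispatch the click in JS if a banner/overlay intercepts it
    driver.execute_script("arguments[0].click();", el)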
I've written some code in Python in combination with Selenium to parse product names from a webpage. A few load-more buttons become visible as the browser scrolls downward, and the webpage only displays its full content once it has been scrolled to the bottom and there is no load-more button left to click. My scraper seems to be doing well, but I'm not getting all the results: there are around 200 products on that page and I'm getting 90 of them. What change should I make to my scraper to get them all? Thanks in advance.
The webpage I'm dealing with: Page_Link
This is the script I'm trying with:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("put_above_url_here")
wait = WebDriverWait(driver, 10)
page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,".listing_item")))
for scroll in range(17):
    page.send_keys(Keys.PAGE_DOWN)
    time.sleep(2)
    try:
        load = driver.find_element_by_css_selector(".lm-btm")
        load.click()
    except Exception:
        pass
for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^=item_]"))):
    name = item.find_element_by_css_selector(".pro-name.el2").text
    print(name)
driver.quit()
Try below code to get required data:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("https://www.purplle.com/search?q=hair%20fall%20shamboo")
wait = WebDriverWait(driver, 10)
header = driver.find_element_by_tag_name("header")
driver.execute_script("arguments[0].style.display='none';", header)
while True:
    try:
        page = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".listing_item")))
        driver.execute_script("arguments[0].scrollIntoView();", page)
        page.send_keys(Keys.END)
        load = wait.until(EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "LOAD MORE")))
        driver.execute_script("arguments[0].scrollIntoView();", load)
        load.click()
        wait.until(EC.staleness_of(load))
    except:
        break
for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^=item_]"))):
    name = item.find_element_by_css_selector(".pro-name.el2").text
    print(name)
driver.quit()
You should only use Selenium as a last resort.
A quick look around the webpage shows the API it calls to get your data.
It returns JSON output with all the details:
Link
You can now just loop over the response and store it in a dataframe easily.
Very fast, with fewer errors than Selenium.
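A minimal sketch of that approach, with a placeholder URL since the actual endpoint (visible in the browser's Network tab) isn't reproduced here:
import requests
import pandas as pd

# placeholder: substitute the real endpoint found in DevTools' Network tab
api_url = "https://example.com/api/search?q=..."

response = requests.get(api_url)
response.raise_for_status()
data = response.json()  # assumes the endpoint returns JSON records

# assumes the JSON is (or contains) a list of product dicts; adjust as needed
df = pd.DataFrame(data)
print(df.head())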