get the new html after .click() with selenium - python

I'm using Selenium to click a link, but I can't get the new table. What code do I use to retrieve the new page?
df_list = []
url = 'https://www.cartolafcbrasil.com.br/scouts/cartola-fc-2018/rodada-1' #+ str(i)
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
table = soup.find_all('table')[0]
df = pd.read_html(str(table), encoding="UTF-8")
driver = webdriver.PhantomJS(executable_path = 'C:\\Python27\\phantomjs-2.1.1-windows\\bin\\phantomjs')
driver.get('https://www.cartolafcbrasil.com.br/scouts/cartola-fc-2018/rodada-1')
driver.find_element_by_xpath("/html[1]/body[1]/form[1]/div[1]/div[2]/div[3]/div[1]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/div[1]/table[1]/tbody[1]/tr[52]/td[1]/table[1]/tbody[1]/tr[1]/td[2]/a[1]").click()
?????
table = soup.find_all('table')[0]
df = pd.read_html(str(table), encoding="UTF-8")

If I understand your question, it is "How do I get the HTML from my driver object for the new page I've loaded?" The answer is driver.page_source:
driver.find_element_by_xpath("Some crazy shenanigans of an xpath").click()
html_from_page = driver.page_source
soup = bs4.BeautifulSoup(html_from_page, 'html.parser')
# more stuff
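
If the click triggers a full page reload, page_source can be read before the new table has finished rendering. A minimal sketch with an explicit wait, assuming the click navigates to a new page (the xpath is still your own):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

link = driver.find_element_by_xpath("Some crazy shenanigans of an xpath")
link.click()
# wait up to 10 seconds for the old element to go stale, i.e. for the new page to load
WebDriverWait(driver, 10).until(EC.staleness_of(link))
html_from_page = driver.page_source
soup = bs4.BeautifulSoup(html_from_page, 'html.parser')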

Welcome to SO. Here is another approach, where the script iterates through all the pages and collects the data.
df_list = []
url = 'https://www.cartolafcbrasil.com.br/scouts/cartola-fc-2018/rodada-1' #+ str(i)
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
table = soup.find_all('table')[0]
df = pd.read_html(str(table), encoding="UTF-8")
df_list.append(df[0])  # keep page 1
driver = webdriver.PhantomJS(executable_path = 'C:\\Python27\\phantomjs-2.1.1-windows\\bin\\phantomjs')
driver.get('https://www.cartolafcbrasil.com.br/scouts/cartola-fc-2018/rodada-1')
# get the number of pages and iterate over each of them
numberOfPage = driver.find_element_by_xpath("(//tr[@class='tbpaging']//a)[last()]").text
for i in range(2, int(numberOfPage) + 1):  # +1 so the last page is included
    # click on each page link and then get the details
    driver.find_element_by_xpath("(//tr[@class='tbpaging']//a)[" + str(i) + "]").click()
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table), encoding="UTF-8")
    df_list.append(df[0])  # keep each page's table so nothing is overwritten
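
To end up with one frame for the whole round, you can combine everything collected in df_list afterwards; the output filename below is just an example:

# pd.read_html returns a list of DataFrames; df_list holds the first table from each page
all_pages = pd.concat(df_list, ignore_index=True)
all_pages.to_csv('rodada-1.csv', index=False)  # example filename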

Related

How to scrape an expandable table from investing.com using python beautiful soup or investpy?

https://www.investing.com/economic-calendar/initial-jobless-claims-294
As stated in the question, I tried to web scrape a data table from this link. However, I was only able to scrape the first few rows of data, up to the "Show more" button. Apart from web scraping, I've also tried investpy.economic_calendar(), but its filtering parameters are unclear, so I could not extract the jobless claims data directly. Could somebody please help me with this?
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.investing.com/economic-calendar/initial-jobless-claims-294'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
table1 = soup.find('table', id='eventHistoryTable294')
headers = []
for i in table1.find_all('th'):
    title = i.text
    headers.append(title)
mydata = pd.DataFrame(columns = headers)
table_rows = table1.find_all('tr')
#df_side = pd.DataFrame(mydata)
#x = df_side.head(100)
for j in table1.find_all('tr')[1:]:
    row_data = j.find_all('td')
    row = [i.text for i in row_data]
    length = len(mydata)
    mydata.loc[length] = row
print(mydata)
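
In the spirit of the Selenium answers above, one hedged sketch is to drive the page with a browser, keep clicking the "Show more" control until enough rows have loaded, and then parse the fully expanded table with the same BeautifulSoup code. The locator used for the button here is an assumption and may need adjusting to the real page markup:

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd
import time

driver = webdriver.Chrome()
driver.get('https://www.investing.com/economic-calendar/initial-jobless-claims-294')
for _ in range(20):  # load a few more batches of rows
    try:
        # assumed locator: a link whose visible text contains "Show more"
        driver.find_element_by_partial_link_text('Show more').click()
        time.sleep(1)  # crude wait for the extra rows to render
    except NoSuchElementException:
        break
soup = BeautifulSoup(driver.page_source, 'html.parser')
table1 = soup.find('table', id='eventHistoryTable294')
mydata = pd.read_html(str(table1))[0]
driver.quit()
print(mydata)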

Loop through different links on a website and scrape certain information

Good afternoon all, I'm hoping that somebody may help me with a problem relating to looping through multiple links on a website. Many thanks in anticipation of your help. The code below gets the info I need from the first link and creates the df I need to present it. But there are more than 600 further links on the website and I'm not sure how to go about them.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "https://auctions.royaltyexchange.com/auctions_overview/"
html = urlopen("https://auctions.royaltyexchange.com/auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview")
soup = BeautifulSoup(html, 'lxml')
type(soup)
# Get the title
title = soup.title
title = soup.find('h1', class_='title -auction-page -dark').text.strip()
title
data = {'Name':['Title',title]}
df_title = pd.DataFrame(data)
irr = soup.find('span',attrs={'id':'current-irr'}).text.strip()
irr
data = {'value' : ['theoretical IRR',irr]}
df_irr = pd.DataFrame(data)
table = soup.find('table', class_='es-overview-table')
table_rows = table.find_all('tr')
res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)
df_table = pd.DataFrame(pd.DataFrame(res).transpose())
df_final = pd.concat([df_title,df_irr ,df_table], axis=1, ignore_index = True)
df_final.head()
You can use this to collect all the auction links across every page first.
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup

raw_url = "https://auctions.royaltyexchange.com/"

def get_link(page_num):
    global raw_url
    link_ls = []
    for page in range(1, page_num+1):
        url = "https://auctions.royaltyexchange.com/auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        for link in bs.find('div', {'class': '-list'}).findAll('a', href=re.compile("^(/auctions/)")):
            print(link.attrs['href'])
            link_ls.append(raw_url + link.attrs['href'])
    return link_ls

link_list = get_link(55) # the last page number
link_list
['https://auctions.royaltyexchange.com//auctions/hip-hop-royalties-danileighs-lil-bebe/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/k-pop-publishing-featuring-exo-and-tvxq/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/film-royalties-classic-comedy-trading-places/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/ben-jerrys-cherry-garcia-trademark-royalties/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/the-doobie-brothers-black-water-more/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/dirty-dancings-ive-had-the-time-of-my-life/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/multi-platinum-hip-hop-collection/?origin=overview&filter_value=overview',
...
Then, for each page, extract the data you want (e.g. title, IRR, etc.) and decide how to shape it into a dataframe.
A slight refactor of @yganalyst's answer and your code:
import pandas as pd
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

def get_link(page_num, raw_url):
    link_ls = []
    for page in range(1, page_num+1):
        url = raw_url + "auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        pobj = re.compile("^(/auctions/)")
        for link in bs.find('div', {'class': '-list'}).findAll('a', href=pobj):
            link_ls.append(raw_url + link.attrs['href'])
    return link_ls

def extract_auction(url2):
    data = {}
    html = urlopen(url2)
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find('h1', class_='title -auction-page -dark').text.strip()
    data['Title'] = title
    irr = soup.find('span', attrs={'id': 'current-irr'}).text.strip()
    data['theoretical IRR'] = irr
    table = soup.find('table', class_='es-overview-table')
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [tr.text.strip() for tr in td if tr.text.strip()]
        if row:
            key = row[0].replace(':', '')
            data[key] = row[1]
    return data

base_url = "https://auctions.royaltyexchange.com/"
page_num = 1
link_list = get_link(page_num, base_url)

data = []
for ll in link_list:
    print(ll)
    data.append(extract_auction(ll))

df_final = pd.DataFrame(data)
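
From there you can scale back up to every page and export the result; the output filename is just an example:

# 55 was the last page number used in the earlier answer
link_list = get_link(55, base_url)
data = [extract_auction(ll) for ll in link_list]
pd.DataFrame(data).to_csv('auctions.csv', index=False)  # example filename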

Getting web links to all items in a table and then doing pagination

I am able to get all the links on a particular web page but am having trouble with the pagination.
I am doing the following:
import requests, bs4, re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
r = requests.get(start_url)
soup = BeautifulSoup(r.text,'html.parser')
a_tags = soup.find_all('a')
print(a_tags)
links = [urljoin(start_url, a['href'])for a in a_tags]
print(links)
As a toy example, I am using the following website:
start_url = 'https://www.opencodez.com/page/1'
I am able to get all the links this way. However, I am trying to automate it more by going to the next page and doing the same thing, and outputting all the links to a csv file.
I tried the following but get no output:
start_url = 'https://www.opencodez.com/'

with open('names.csv', mode='w') as csv_file:
    fieldnames = ['Name']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

article_link = []

def scraping(webpage, page_number):
    next_page = webpage + str(page_number)
    r = requests.get(str(next_page))
    soup = BeautifulSoup(r.text,'html.parser')
    a_tags = soup.find_all('a')
    print(a_tags)
    links = [urljoin(start_url, a['href']) for a in a_tags]
    print(links)
    for x in range(len(soup)):
        article_link.append(links)
    if page_number < 16:
        page_number = page_number + 1
        scraping(webpage, page_number)

scraping('https://www.opencodez.com/page/', 1)

#creating the data frame and populating its data into the csv file
data = { 'Name': article_link}
df = DataFrame(data, columns = ['Article_Link'])
df.to_csv(r'C:\Users\xxxxx\names.csv')
Could you please help me determine where I am going wrong?
I do not mind getting the links either in the output console or written to a csv file.
There were issues here and there with your code, but this worked for me:
import requests, bs4, re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

start_url = 'https://www.opencodez.com/'
r = requests.get(start_url) # first page scraping
soup = BeautifulSoup(r.text,'html.parser')
a_tags = soup.find_all('a')
article_link = []
links = [urljoin(start_url, a['href']) for a in a_tags]
article_link.append(links)

for page in range(2,19): # for every page after 1
    links = [] # resetting lists on every page just in case
    a_tags = []
    url = 'https://www.opencodez.com/page/'+str(page)
    r = requests.get(url)  # fetch the page URL, not start_url again
    soup = BeautifulSoup(r.text,'html.parser')
    a_tags = soup.find_all('a')
    links = [urljoin(start_url, a['href']) for a in a_tags]
    article_link.append(links)
print(article_link)
I basically just changed how you append to the list article_link. As it stands, article_link is a list of length 18 (one entry per page), and each list inside it holds the 136 links found on that page.
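
Since you also wanted the links in a csv file, you can flatten the nested list and write it out, for example:

import pandas as pd

# article_link is a list of per-page lists, so flatten it first
flat_links = [link for page_links in article_link for link in page_links]
pd.DataFrame({'Name': flat_links}).to_csv('names.csv', index=False)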

Get every href from the same div in python

The webpage has references to companies in a grid view (16 rows x 5 columns), and I want to retrieve each reference's URL and title. The problem is that all 5 references in each row sit inside a single class named row, so when I scrape the page I only get the first reference of every row instead of all 5. Here is my code so far:
url = 'http://www.slimstock.com/nl/referenties/'
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
info_block = soup.find_all("div", attrs={"class": "row"})
references = pd.DataFrame(columns=['Company Name', 'Web Page'])
for entry in info_block:
    try:
        title = entry.find('img').get('title')
        url = entry.a['href']
        urlcontent = BeautifulSoup(requests.get(url).content, "lxml")
        row = [{'Company Name': title, 'Web Page': url}]
        references = references.append(row, ignore_index=True)
    except:
        pass
Is there a way to fix this?
I think you should iterate over the "img" tags or over the "a" tags. You can write something like this:
for entry in info_block:
    try:
        for a in entry.find_all("a"):
            title = a.find('img').get('title')
            url = a.get('href')
            urlcontent = BeautifulSoup(requests.get(url).content, "lxml")
            row = [{'Company Name': title, 'Web Page': url}]
            references = references.append(row, ignore_index=True)
    except:
        pass
import pandas as pd
from bs4 import BeautifulSoup
import requests

url = 'http://www.slimstock.com/nl/referenties/'
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
info_block = soup.find_all("div", attrs={"class": "row"})
references = pd.DataFrame(columns=['Company Name', 'Web Page'])
for entry in info_block:
    anchors = entry.find_all("a")
    for a in anchors:
        try:
            title = a.find('img').get('title')
            url = a['href']
            # urlcontent = BeautifulSoup(requests.get(url).content, "lxml")
            row = [{'Company Name': title, 'Web Page': url}]
            references = references.append(row, ignore_index=True)
        except:
            pass
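
One caveat: DataFrame.append was deprecated and has since been removed in recent pandas releases, so on a newer pandas it is safer to collect plain dicts and build the frame once at the end, roughly like this:

rows = []
for entry in info_block:
    for a in entry.find_all("a"):
        img = a.find('img')
        if img is None or not a.get('href'):
            continue  # skip anchors without an image or link
        rows.append({'Company Name': img.get('title'), 'Web Page': a['href']})
references = pd.DataFrame(rows, columns=['Company Name', 'Web Page'])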

How to speed up my webscraping python code?

I've been working on Python code to pull data from stock-trading simulation software (I get the tickers for the securities I'm trading, e.g. VTI, AAPL, GOOG, etc.), then search Morningstar for each ticker and pull pricing info and whatever else I want from that site. I save the data I want into a .csv file for use in Excel. I'm using Selenium to run a webdriver (either Chrome to watch the process visually, or PhantomJS to run the program headless without a browser GUI) and BeautifulSoup to parse the HTML.
I have the program working decently, but it takes upwards of 120 seconds to run through a portfolio of only 11 securities, and I am hoping to expand this program to do more elaborate actions.
Is there anything in my coding style that could be changed to speed up the webscraping process? Are there any general methods of writing Python code to allow for fast execution?
Here's the code:
from selenium import webdriver
from bs4 import BeautifulSoup
import csv

#browser = webdriver.Chrome() #replace with .Firefox(), or with the browser of your choice
browser = webdriver.PhantomJS()

security_links_list = list()
equitysim_ticker_list = list()

url = ['https://www.equitysim.com/Home']
for item in url:
    browser.get(item) #navigate to page behind login
    username = browser.find_element_by_id('placeholderContent_txtUserName')
    username.send_keys('EDITED_FOR_SECURITY')
    password = browser.find_element_by_id('placeholderContent_txtPassword')
    password.send_keys('EDITED_FOR_SECURITY')
    form = browser.find_element_by_id("placeholderContent_LoginButton")
    form.click()

innerHTML = browser.execute_script("return document.body.innerHTML") #returns the inner HTML as a string
innerHTML = browser.page_source
html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')

table_a = soup.find('table', 'ba-tbl admintable')
for a in table_a.find_all('a', href=True):
    security_links_list.append(a['href'])

links_set = set(security_links_list)
links_set.remove('#')
print(links_set)

mystring = "https://www.equitysim.com"
links_set_revised = [mystring + link_text for link_text in links_set]
print(links_set_revised)

for item in links_set_revised:
    browser.get(item)
    innerHTML = browser.execute_script("return document.body.innerHTML") #returns the inner HTML as a string
    innerHTML = browser.page_source
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')
    title_element = soup.find("title")
    title = title_element.text
    ticker = title.split(':', 1)[0]
    ticker = ticker.replace('\n','')
    ticker = ticker.replace('\t','')
    equitysim_ticker_list.append(ticker)
print(equitysim_ticker_list)

morningstar_ticker_search = "http://quote.morningstar.com/TickerLookup.html"
uri_links = list()
for ticker in equitysim_ticker_list:
    browser.get(morningstar_ticker_search)
    enter_ticker = browser.find_element_by_xpath("//input[@value='Ticker']")
    enter_ticker.click()
    search_ticker = browser.find_element_by_class_name('F3')
    search_ticker.send_keys(ticker)
    go_box = browser.find_element_by_xpath("//input[@src='http://im.morningstar.com/im/go.gif']")
    go_box.click()
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')
    outer_div = soup.find('div', attrs={'id': 'quote_quicktake'})
    iframe = outer_div.find('iframe').get('src')
    full_url = 'https:' + iframe
    uri_links.append(full_url)
print(uri_links)

price_list = list()
ticker_list = list()
nav_list = list()

for item in uri_links:
    browser.get(item) #navigate to page behind login
    innerHTML = browser.execute_script("return document.body.innerHTML") #returns the inner HTML as a string
    innerHTML = browser.page_source
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')
    price_element = soup.find("div", attrs={"id": "lastPrice"})
    price = price_element.text # strip() is used to remove starting and trailing whitespace
    nav_element = soup.find("span", attrs={"id": "NAV"} or {"vkey": "NAV"})
    nav = nav_element.text
    nav_split1 = nav.split('\n \t\t', 1)[1]
    nav_split2 = nav_split1.split(' ', 1)[0]
    title_element = soup.find("title")
    title = title_element.text
    ticker = title.split(' ', 1)[0]
    price_list.append(price)
    nav_list.append(nav_split2)
    ticker_list.append(ticker)
    print(ticker)
    print(price)
    print(nav_split2)
#ticker =
print(ticker_list)
print(price_list)
print(nav_list)

csvfile = "C:\\Users\\USERNAME\\AppData\\Local\\Programs\\Python\\Python36\\personal\\exampleCsv.csv"
#Assuming res is a flat list
with open(csvfile, "w") as output:
    writer = csv.writer(output,lineterminator='')
    writer.writerow(ticker_list)
    writer.writerow('\n')
    writer.writerow(price_list)
    writer.writerow('\n')
    writer.writerow(nav_list)
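
Most of the 120 seconds is spent navigating a full browser to every page. One thing worth trying, assuming the Morningstar quote iframes collected in uri_links return their price markup without needing JavaScript (worth verifying first), is to fetch those URLs with plain requests, optionally in parallel; a hedged sketch:

import requests
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup

def fetch_quote(url):
    # plain HTTP fetch instead of browser.get(); assumes no JavaScript rendering is needed
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, 'html.parser')
    price_element = soup.find("div", attrs={"id": "lastPrice"})
    return price_element.text if price_element else None

with ThreadPoolExecutor(max_workers=8) as pool:
    prices = list(pool.map(fetch_quote, uri_links))
print(prices)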
