I am writing a small program to fetch stock exchange data using Python. The sample code below makes a request to a URL and it should return the appropriate data. Here is the resource that I am using:
https://python.plainenglish.io/4-python-libraries-to-help-you-make-money-from-webscraping-57ba6d8ce56d
from xml.dom.minidom import Element
from selenium import webdriver
from bs4 import BeautifulSoup
import logging
from selenium.webdriver.common.by import By
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
url = "http://eoddata.com/stocklist/NASDAQ/A.htm"
driver = webdriver.Chrome(executable_path="C:\Program Files\Chrome\chromedriver")
page = driver.get(url)
# TODO: find element by CSS selector
stock_symbol = driver.find_elements(by=By.CSS_SELECTOR, value='#ctl00_cph1_divSymbols')
soup = BeautifulSoup(driver.page_source, features="html.parser")
elements = []
table = soup.find('div', {'id','ct100_cph1_divSymbols'})
logging.info(f"{table}")
I've added a TODO comment at the point where I try to retrieve the element I'm after.
Expected:
The proper data should be returned.
Actual:
Nothing is returned.
The most common practice is to scrape tables with pandas.read_html(), which extracts the text of every table on the page, so I would also recommend that (a minimal sketch is shown below).
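For instance, something like the following (this assumes lxml or html5lib is installed, which read_html needs for parsing; which entry in the returned list holds the symbols can vary, so the largest table is taken here):

import requests
import pandas as pd

url = "https://eoddata.com/stocklist/NASDAQ/A.htm"
# read_html parses every <table> in the HTML and returns a list of DataFrames;
# fetching the page with requests first and passing the text avoids user-agent issues.
tables = pd.read_html(requests.get(url).text)
symbols = max(tables, key=len)  # the symbol listing is the largest table on the page
print(symbols.head())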
But to answer your question and follow your approach, make the selection of the <div> and its <table> more specific:
soup.select('#ctl00_cph1_divSymbols table')
To get and store the data, you could iterate over the rows and append the results to a list:
data = []
for row in soup.select('#ctl00_cph1_divSymbols table tr:has(td)'):
    d = dict(zip(soup.select_one('#ctl00_cph1_divSymbols table tr:has(th)').stripped_strings, row.stripped_strings))
    d.update({'url': 'https://eoddata.com' + row.a.get('href')})
    data.append(d)
Example
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://eoddata.com/stocklist/NASDAQ/A.htm"
res = requests.get(url)
soup = BeautifulSoup(res.text, 'html.parser')

data = []
for row in soup.select('#ctl00_cph1_divSymbols table tr:has(td)'):
    d = dict(zip(soup.select_one('#ctl00_cph1_divSymbols table tr:has(th)').stripped_strings, row.stripped_strings))
    d.update({'url': 'https://eoddata.com' + row.a.get('href')})
    data.append(d)

pd.DataFrame(data)
Output
    Code                             Name    High     Low   Close      Volume   Change                                              url
0   AACG        Ata Creativity Global ADR   1.390   1.360   1.380       8,900        0   https://eoddata.com/stockquote/NASDAQ/AACG.htm
1   AACI        Armada Acquisition Corp I   9.895   9.880   9.880       5,400   -0.001   https://eoddata.com/stockquote/NASDAQ/AACI.htm
2   AACIU       Armada Acquisition Corp I   9.960   9.960   9.960         300    -0.01   https://eoddata.com/stockquote/NASDAQ/AACIU.htm
3   AACIW    Armada Acquisition Corp I WT  0.1900  0.1699  0.1700      36,400  -0.0193   https://eoddata.com/stockquote/NASDAQ/AACIW.htm
4   AADI             Aadi Biosciences Inc   13.40   12.66   12.90      98,500    -0.05   https://eoddata.com/stockquote/NASDAQ/AADI.htm
5   AADR  Advisorshares Dorsey Wright ETF   47.49   46.82   47.49       1,100      0.3   https://eoddata.com/stockquote/NASDAQ/AADR.htm
6   AAL              American Airlines Gp   14.44   13.70   14.31  45,193,100    -0.46   https://eoddata.com/stockquote/NASDAQ/AAL.htm
...
I want to scrape the URLs of all the items in the table, but when I try, nothing comes up. The code is quite basic, so I can't see why it doesn't work. Even when I try to scrape the title of this website, nothing comes up; I at least expected the h1 tag, as it's outside the table.
Website: https://www.vanguard.com.au/personal/products/en/overview
import requests
from bs4 import BeautifulSoup

lists = []

url = 'https://www.vanguard.com.au/personal/products/en/overview'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

title = soup.find_all('h1', class_='heading2 gbs-font-vanguard-red')

for links in soup.find_all('a', style='padding-bottom: 1px;'):
    link_text = links['href']
    lists.append(link_text)

print(title)
print(lists)
If the problem is caused by a JavaScript event listener, I would suggest using BeautifulSoup together with Selenium to scrape this website: let Selenium send the request and return the rendered page source, then parse it with BeautifulSoup.
In addition, you should use title = soup.find() instead of title = soup.find_all() in order to get only one title.
An example using Firefox:
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup

url = 'https://www.vanguard.com.au/personal/products/en/overview'

browser = webdriver.Firefox(executable_path=GeckoDriverManager().install())
browser.get(url)
soup = BeautifulSoup(browser.page_source, 'html.parser')
browser.close()

lists = []
title = soup.find('h1', class_='heading2 gbs-font-vanguard-red')

for links in soup.find_all('a', style='padding-bottom: 1px;'):
    link_text = links['href']
    lists.append(link_text)

print(title)
print(lists)
Output:
<h1 class="heading2 gbs-font-vanguard-red">Investment products</h1>
['/personal/products/en/detail/8132', '/personal/products/en/detail/8219', '/personal/products/en/detail/8121',...,'/personal/products/en/detail/8217']
The most common problem (with many modern pages): the page uses JavaScript to add elements, but requests/BeautifulSoup can't run JavaScript.
You may need to use Selenium to control a real web browser, which can run JavaScript.
This example uses only Selenium, without BeautifulSoup.
I use XPath, but you may also use CSS selectors (a short variant is shown after the code).
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://www.vanguard.com.au/personal/products/en/overview'

lists = []

#driver = webdriver.Chrome(executable_path="/path/to/chromedriver.exe")
driver = webdriver.Firefox(executable_path="/path/to/geckodriver.exe")
driver.get(url)

title = driver.find_element(By.XPATH, '//h1[@class="heading2 gbs-font-vanguard-red"]')
print(title.text)

all_items = driver.find_elements(By.XPATH, '//a[@style="padding-bottom: 1px;"]')
for links in all_items:
    link_text = links.get_attribute('href')
    print(link_text)
    lists.append(link_text)
You will also need the matching driver binary: ChromeDriver (for Chrome) or GeckoDriver (for Firefox).
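If you prefer CSS selectors over XPath, the equivalent lookups would look roughly like this (same page, same class and style values as above):

title = driver.find_element(By.CSS_SELECTOR, 'h1.heading2.gbs-font-vanguard-red')
all_items = driver.find_elements(By.CSS_SELECTOR, 'a[style="padding-bottom: 1px;"]')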
It's always more efficient to get the data from the source (here, the site's JSON endpoint) than to go through Selenium. It looks like the links are built from the portId.
import pandas as pd
import requests

url = 'https://www3.vanguard.com.au/personal/products/funds.json'

payload = {
    'context': '/personal/products/',
    'countryCode': 'au.ret',
    'paths': "[[['funds','legacyFunds'],'AU']]",
    'method': 'get'}

jsonData = requests.get(url, params=payload).json()
results = jsonData['jsonGraph']['funds']['AU']['value']

df1 = pd.json_normalize(results, record_path=['children'])
df2 = pd.json_normalize(results, record_path=['listings'])
df = pd.concat([df1, df2], axis=0)

df['url_link'] = 'https://www.vanguard.com.au/personal/products/en/detail/' + df['portId'] + '/Overview'
Output:
print(df[['fundName', 'url_link']])
fundName url_link
0 Vanguard Active Emerging Market Equity Fund https://www.vanguard.com.au/personal/products/...
1 Vanguard Active Global Credit Bond Fund https://www.vanguard.com.au/personal/products/...
2 Vanguard Active Global Growth Fund https://www.vanguard.com.au/personal/products/...
3 Vanguard Australian Corporate Fixed Interest I... https://www.vanguard.com.au/personal/products/...
4 Vanguard Australian Fixed Interest Index Fund https://www.vanguard.com.au/personal/products/...
.. ... ...
23 Vanguard MSCI Australian Small Companies Index... https://www.vanguard.com.au/personal/products/...
24 Vanguard MSCI Index International Shares (Hedg... https://www.vanguard.com.au/personal/products/...
25 Vanguard MSCI Index International Shares ETF https://www.vanguard.com.au/personal/products/...
26 Vanguard MSCI International Small Companies In... https://www.vanguard.com.au/personal/products/...
27 Vanguard International Credit Securities Hedge... https://www.vanguard.com.au/personal/products/...
[66 rows x 2 columns]
I'm pretty new to Python, so I would like some guidance. I would like to pull the "Name, Protocol, APY, TVL" data from https://coindix.com/?sort=-tvl by scraping (as I believe there is no API), but I'm having some issues. When I execute the code below:
import requests
from bs4 import BeautifulSoup
url = "https://coindix.com/?sort=-tvl"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
the returned data does not include the information I would like to get. Could someone please help?
There is an API. Find the URL in Dev Tools -> Network -> XHR -> Headers.
import requests
import pandas as pd

url = 'https://apiv2.coindix.com/search'

payload = {
    'sort': '-tvl',
    'first': 'true',
    'screen': '1114'}

data = requests.get(url, params=payload).json()['data']
df = pd.DataFrame(data)
Output:
print(df.head(5).to_string())
id name icon chain protocol base reward rewards apy apy_7_day tvl risk link is_new
0 17419 UST https://apiv2.coindix.com/icons/UST.png Terra Anchor 0.193600 0.0000 {} 0.193600 0.19570 5977961341 2 https://apiv2.coindix.com/vault/17419/redirect False
1 17206 DAI-USDC-USDT https://apiv2.coindix.com/icons/DAI-USDC-USDT.png Ethereum Curve 0.002800 0.0087 {'CRV': 0.0087} 0.011500 0.01210 5952854016 1 https://apiv2.coindix.com/vault/17206/redirect False
2 17174 LUNA https://apiv2.coindix.com/icons/LUNA.png Terra Lido 0.079000 0.0000 {} 0.079000 0.07900 5534798290 1 https://apiv2.coindix.com/vault/17174/redirect False
3 15940 ETH https://apiv2.coindix.com/icons/ETH.png Ethereum Lido 0.047000 0.0000 {} 0.047000 0.04700 5347746431 1 https://apiv2.coindix.com/vault/15940/redirect False
4 13517 cUSD-cEUR https://apiv2.coindix.com/icons/cUSD-cEUR.png Celo Sushi 0.002466 0.0000 {} 0.002466 0.01058 4609514119 2 https://apiv2.coindix.com/vault/13517/redirect False
[100 rows x 14 columns]
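Since you only need Name, Protocol, APY and TVL, you can slice those columns out of the DataFrame (column names taken from the output above):

print(df[['name', 'protocol', 'apy', 'tvl']].head())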
I want to get all the products on this page:
nike.com.br/snkrs#estoque
My python code is this:
import requests
from bs4 import BeautifulSoup as bs4

produtos = []

def aviso():
    print("Started!")
    request = requests.get("https://www.nike.com.br/snkrs#estoque")
    soup = bs4(request.text, "html.parser")
    links = soup.find_all("a", class_="btn", text="Comprar")
    links_filtred = list(set(links))
    for link in links_filtred:
        if(produto not in produtos):
            request = requests.get(f"{link['href']}")
            soup = bs4(request.text, "html.parser")
            produto = soup.find("div", class_="nome-preco-produto").get_text()
            if(code_formated == ""):
                code_formated = "\u200b"
            print(f"Nome: {produto} Link: {link['href']}\n")
            produtos.append(link["href"])

aviso()
This code gets products from the page, but not all of them. I suspect the content is dynamic; how can I get all of the products with requests and BeautifulSoup? I don't want to use Selenium or an automation library, and I'd prefer not to change my code much because it's almost done.
Do not call requests.get() repeatedly when you are dealing with the same host.
Reason: a requests.Session reuses the underlying TCP connection (keep-alive), so repeated requests to the same host are faster and lighter on the server.
import requests
from bs4 import BeautifulSoup
import pandas as pd

def main(url):
    allin = []
    with requests.Session() as req:
        for page in range(1, 6):
            params = {
                'p': page,
                'demanda': 'true'
            }
            r = req.get(url, params=params)
            soup = BeautifulSoup(r.text, 'lxml')
            goal = [(x.find_next('h2').get_text(strip=True, separator=" "), x['href'])
                    for x in soup.select('.aspect-radio-box')]
            allin.extend(goal)
    df = pd.DataFrame(allin, columns=['Title', 'Url'])
    print(df)

main('https://www.nike.com.br/Snkrs/Feed')
Output:
Title Url
0 Dunk High x Fragment design Black https://www.nike.com.br/dunk-high-x-fragment-d...
1 Dunk Low Infantil (16-26) City Market https://www.nike.com.br/dunk-low-infantil-16-2...
2 ISPA Flow 2020 Desert Sand https://www.nike.com.br/ispa-flow-2020-153-169...
3 ISPA Flow 2020 Pure Platinum https://www.nike.com.br/ispa-flow-2020-153-169...
4 Nike iSPA Men's Lightweight Packable Jacket https://www.nike.com.br/nike-ispa-153-169-211-...
.. ... ...
115 Air Jordan 1 Mid Hyper Royal https://www.nike.com.br/air-jordan-1-mid-153-1...
116 Dunk High Orange Blaze https://www.nike.com.br/dunk-high-153-169-211-...
117 Air Jordan 5 Stealth https://www.nike.com.br/air-jordan-5-153-169-2...
118 Air Jordan 3 Midnight Navy https://www.nike.com.br/air-jordan-3-153-169-2...
119 Air Max 90 Bacon https://www.nike.com.br/air-max-90-153-169-211...
[120 rows x 2 columns]
To get the data you can send a request to:
https://www.nike.com.br/Snkrs/Estoque?p=<PAGE>&demanda=true
where you provide a page number from 1 to 5 as the p= parameter in the URL.
For example, to print the links, you can try:
import requests
from bs4 import BeautifulSoup

url = "https://www.nike.com.br/Snkrs/Estoque?p={page}&demanda=true"

for page in range(1, 6):
    response = requests.get(url.format(page=page))
    soup = BeautifulSoup(response.content, "html.parser")
    print(soup.find_all("a", class_="btn", text="Comprar"))
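If you want the product URLs rather than the whole <a> tags, you could collect just the href attributes instead; a small variation on the loop above:

import requests
from bs4 import BeautifulSoup

url = "https://www.nike.com.br/Snkrs/Estoque?p={page}&demanda=true"

links = []
for page in range(1, 6):
    soup = BeautifulSoup(requests.get(url.format(page=page)).content, "html.parser")
    # keep only the href of each "Comprar" button instead of the whole tag
    links += [a["href"] for a in soup.find_all("a", class_="btn", text="Comprar")]
print(links)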
I am trying to scrape all the text from a webpage that is embedded within the td tags that have class="calendar__cell calendar__currency currency". As of now my code only returns the first occurrence of this tag and class. How can I keep iterating through the source so that it returns all occurrences, one by one? The webpage is forexfactory.com.
from bs4 import BeautifulSoup
import requests
source = requests.get("https://www.forexfactory.com/#detail=108867").text
soup = BeautifulSoup(source, 'lxml')
body = soup.find("body")
article = body.find("table", class_="calendar__table")
actual = article.find("td", class_="calendar__cell calendar__actual actual")
forecast = article.find("td", class_="calendar__cell calendar__forecast forecast").text
currency = article.find("td", class_="calendar__cell calendar__currency currency")
Tcurrency = currency.text
Tactual = actual.text
print(Tcurrency)
You have to use find_all() to get all the elements, and then you can use a for loop to iterate over them.
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.forexfactory.com/#detail=108867")
soup = BeautifulSoup(r.text, 'lxml')

table = soup.find("table", class_="calendar__table")

for row in table.find_all('tr', class_='calendar__row--grey'):
    currency = row.find("td", class_="currency")
    #print(currency.prettify())  # before get_text()
    currency = currency.get_text(strip=True)

    actual = row.find("td", class_="actual")
    actual = actual.get_text(strip=True)

    forecast = row.find("td", class_="forecast")
    forecast = forecast.get_text(strip=True)

    print(currency, actual, forecast)
Result
CHF 96.4 94.6
EUR 0.8% 0.9%
GBP 43.7K 41.3K
EUR 1.35|1.3
USD -63.2B -69.2B
USD 0.0% 0.2%
USD 48.9 48.2
USD 1.2% 1.5%
BTW: I found that this page uses JavaScript to redirect, and in the browser I see a table with different values. But if I turn off JavaScript in the browser, it shows the same data that I get with the Python code. BeautifulSoup and requests can't run JavaScript; if you need the data as it appears in the browser, you may need Selenium to control a web browser that can run JavaScript.
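A minimal sketch of that Selenium route (assuming chromedriver is available on your PATH; the parsing part is the same as above):

from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get("https://www.forexfactory.com/#detail=108867")
soup = BeautifulSoup(driver.page_source, 'lxml')  # parse the JavaScript-rendered HTML
driver.quit()

table = soup.find("table", class_="calendar__table")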
The code below pulls information from all the numeric tags on the page. Can I use a filter to extract the numbers once per region?
For example, on https://opensignal.com/reports/2019/04/uk/mobile-network-experience I am interested only in the numbers under the Regional Analysis tab, for all regions.
import requests
from bs4 import BeautifulSoup

html = requests.get("https://opensignal.com/reports/2019/04/uk/mobile-network-experience").text
soup = BeautifulSoup(html, 'html.parser')

items = soup.find_all('div', class_='c-ru-graph__rect')
for item in items:
    provider = item.find('span', class_='c-ru-graph__label').text
    prodvalue = item.find_next_sibling('span').find('span', class_='c-ru-graph__number').text
    print(provider + " : " + prodvalue)
I want a table or df like the one below:

Eastern Region             o2   Vodaphone     3     EE
4G Availability            82        76.9  73.0   89.2
Upload Speed Experience   5.6         5.9   6.8    9.5

Any pointers that could help me get this result?
Here is how I would do it for all regions. It requires bs4 4.7.1+ (for :has() selector support). As far as I can see, you have to assume a consistent ordering of the companies.
import requests
from bs4 import BeautifulSoup
import pandas as pd

r = requests.get("https://opensignal.com/reports/2019/04/uk/mobile-network-experience")
soup = BeautifulSoup(r.content, 'lxml')  # 'html.parser' if lxml is not installed

metrics = ['4g-availability', 'video-experience', 'download-speed', 'upload-speed', 'latency']
headers = ['02', 'Vodaphone', '3', 'EE']
results = []

for region in soup.select('.s-regional-analysis__region'):
    for metric in metrics:
        providers = [item.text for item in region.select('.c-ru-chart:has([data-metric="' + metric + '"]) .c-ru-graph__number')]
        row = {headers[i]: providers[i] for i in range(len(providers))}
        row['data-metric'] = metric
        row['region'] = region['id']
        results.append(row)

df = pd.DataFrame(results, columns=['region', 'data-metric', '02', 'Vodaphone', '3', 'EE'])
print(df)
Assuming the order of the companies is fixed (it is, indeed), you can simply reduce the content you examine to only those divs containing the information you need.
import requests
from bs4 import BeautifulSoup
import pandas as pd

html = requests.get("https://opensignal.com/reports/2019/04/uk/mobile-network-experience").text
soup = BeautifulSoup(html, 'html.parser')

res = soup.find_all('div', {'id': 'eastern'})

aval = res[0].find_all('div', {'data-chart-name': '4g-availability'})
avalname = aval[0].find('span', {'class': 'js-metric-name'}).text

upload = res[0].find_all('div', {'data-chart-name': 'upload-speed'})
uploadname = upload[0].find('span', {'class': 'js-metric-name'}).text

companies = [i.text for i in aval[0].find_all('span', class_='c-ru-graph__label')]
row1 = [i.text for i in aval[0].find_all('span', class_='c-ru-graph__number')]
row2 = [i.text for i in upload[0].find_all('span', class_='c-ru-graph__number')]

df = pd.DataFrame({avalname: row1, uploadname: row2})
df.index = companies
df = df.T
Output

                           O2  Vodafone     3    EE
4G Availability          82.0      76.9  73.0  89.2
Upload Speed Experience   5.6       5.9   6.8   9.5