I am trying to get the price of the stock using the below code, it returns null for the current price. Please let me know where I am making a error
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
my_url = 'https://finance.yahoo.com/quote/MMM/key-statistics?p=MMM'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup =soup(page_html,"lxml")
uClient.close()
# I tried this option 1
currentPrice = page_soup.find('div',attrs={"span": "Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(ib)"})
print(currentPrice)
# I tried this option 2
for currentPrice in page_soup.find("div",{"class": "D(ib) Mend(20px)"}) :
print (page_soup.span)
You might want to have a look at yfinance
https://pypi.org/project/yfinance/
Related
I've been working on this web scraper for a while and trying to get the body content of different links of an online newsletter. Therefore, if I breakdown the code for the second loop and run it separately, it will return the correct results, however, if the same part is put inside a loop in the bigger script, it will return the error "IndexError: list index out of range".
This is the script that 2nd loop returns the error (UPDATED):
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd
def pages(myurl):
# opening up connection, grabbing the page
uClient = uReq(myurl)
page_html = uClient.read()
uClient.close()
# html parsing
page_soup = soup(page_html, "html.parser")
dt = []
ttl = []
name = []
body = []
source = []
# grabs each newsletter subjects
titular = page_soup.findAll("div",{"class":"col-md-9 col-sm-9 col-xs-9"})
titular[0]
tit1 = titular[0]
fixed = 'https://www.df.cl/noticias/site/tax/port/all'
for tit1 in titular:
date = tit1.span.text
dt.append(date)
title = tit1.h2.a.text
ttl.append(title)
link = tit1.h2.a["href"].strip()
source.append(fixed + link)
df = pd.DataFrame(dt, columns =['date'])
df['title'] = ttl
df['link'] = source
for link in df['link']:
new_link = fixed + link
page = uReq(new_link)
page_html_1 = page.read()
page.close()
page_soup = soup(page_html_1, "html.parser")
content = page_soup.findAll("div",{"class":"entry cuerpo-noticias CUERPO"})
content[0].text
cont1 = content[0].text
body.append(cont1)
df['content'] = body
print(df)
#df.to_csv(u'C:/Users/snpw9/Desktop/Scripts/sample_scrap.csv', mode='a', header=False, index=False)
pages('https://www.df.cl/noticias/site/tax/port/all/taxport_3_230__1.html') #Banca y Fintech
pages('https://www.df.cl/noticias/site/tax/port/all/taxport_3_20__1.html') #Bolsa y Monedas
pages('https://www.df.cl/noticias/site/tax/port/all/taxport_3_226__1.html') #Pensiones
pages('https://www.df.cl/noticias/site/tax/port/all/taxport_3_228__1.html') #Seguros
It would be very helpful to make this part work, hopefully, with your help!
The second script without the loop (which work properly):
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
myurl = 'https://www.df.cl/noticias/site/tax/port/all/noticias/mercados/banca-fintech/bancoestado-
destina-90-millones-para-los-gastos-de-internet-de-sus/2020-07-07/152240.html'
#def pages(myurl):
# opening up connection, grabbing the page
uClient = uReq(myurl)
page_html = uClient.read()
uClient.close()
#html parsing
page_soup = soup(page_html, "html.parser")
#grabs each newsletter subjects
content = page_soup.findAll("div",{"class":"entry cuerpo-noticias CUERPO"})
cont1 = content[0].text
print(cont1)
How can I access the second span element in a found div?
My code looks like this:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
#url website
url = 'https://www.gumtree.pl/a-samochody-osobowe/krakow/honda-civic-1-5-sport-mt-2019/1006295441720911232816609'
#opening up connection, grabbing page
uClient = uReq(url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')
#scraping div
product = page_soup.find('div', {'class': 'vip-details'})
for li in product.findAll('div', {'class': 'attribute'}):
print(li)
The result is something like this:
[...]
<div class="attribute"><span class="name">Rok</span><span class="value">2019</span></div>
<div class="attribute"><span class="name">Kilometry</span><span class="value">12</span></div>
[...]
How to get access to 2 span element? in my case to value 2019 and 12
I tried to use such solutions but without success:
for li in product.findAll('div', {'class': 'attribute'}):
print(li.span.span)
and
for li in product.findAll('div', {'class': 'attribute'}):
print(li.span[1])
How can I access the second span of an element in my loop?
You can use find_next()
for li in product.findAll('div', {'class': 'attribute'}):
print(li.find_next("span" , class_='value').text)
Or Use css selector.
for li in product.select('.attribute .value'):
print(li.text)
Code:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
#url website
url = 'https://www.gumtree.pl/a-samochody-osobowe/krakow/honda-civic-1-5-sport-mt-2019/1006295441720911232816609'
uClient = uReq(url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')
product = page_soup.select_one('.vip-details')
for li in product.select('.attribute .value'):
print(li.text)
I got my expected result by using two loops:
for li in product.findAll('div', {'class': 'attribute'}):
for value in li.findAll('span', {'class': 'value'}):
print(value.text)
if there is a simpler solution to the problem, I invite you to comment.
I am practicing here and my goal is to retrieve these data from the page in the url variable:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
url = "https://www.newegg.com/global/bg-en/PS4-Accessories/SubCategory/ID-3142"
# opening connection, grabing the page
uClient = uReq(url)
page_html = uClient.read()
uClient.close()
# html parser
page_soup = soup(page_html, "html.parser")
# grabs each product
containers = page_soup.findAll("div", {"class": "item-container"})
for container in containers:
brand = container.select("div.item-info")[0].a.img["title"]
name = container.findAll("a", {"class": "item-title"})[0].text.strip()
shipping = container.findAll("li", {"class": "price-ship"})[0].text.strip()
print("brand " + brand)
print("name " + name)
print("shipping " + shipping)
Nothing more I can say for it :) I just simple as that but I still can't get it why no data is retrieved. Will be thankful for every advice!
You are invoking the find_all method with wrong arguments.
You should use the argument "class_" properly, according to the documentation found here:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#searching-by-css-class
I suppose to get product information from source page, the data I want is in the HTML tag , but there is another tag in tag, so when I save the data to local storage, it looks very bad. I hope someone knows how to fix this problem.
Here is my code:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://list.jd.com/list.html?
cat=9987,653,655&ev=exbrand_15127&page=1'
#opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#html parsing
page_soup = soup(page_html, "html.parser")
filename = "params.csv"
f = open(filename,"w")
#grabs each product
li_containers = page_soup.findAll("li",{"class":"gl-item"})
for i in range(0,len(li_containers)):
p_name_div = li_containers[i].find("div",{"class":"p-name"})
p_name = p_name_div.a.em.text.strip()
print(p_name)
f.write(p_name)
f.close()
There is the some screenshots.
I wanted it to be like this:
But it ended up looking like this:
Without span tag
With span tag
Try this
my_url = 'https://list.jd.com/list.html?
cat=9987,653,655&ev=exbrand_15127&page=1'
#opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#html parsing
page_soup = soup(page_html, "html.parser")
filename = "params.csv"
f = open(filename,"w")
#grabs each product
li_containers = page_soup.findAll("li",{"class":"gl-item"})
for i in range(0,len(li_containers)):
p_name_div = li_containers[i].find("div",{"class":"p-name"})
p_name = p_name_div.a.em.text.strip()
print(p_name.strip(" "))
f.write(p_name.strip(" "))
f.close()
i've got a strange problem with my webscraper. I am trying to get the data from a website using BeautifulSoup.
My code works on 90% of all links i've tried out but on a few it does not read the page fully.
The text that intrests me is "1152x864"
When checking the soure code on my browser i clearly see the text:
<li class="x-block-grid-item">
<h3 style="margin: 0 0 0.35em;font-size: 1em;letter-spacing: 0.05em;line-height: 1">Resolution</h3>
<p class="man">1152x864</p>
</li>
But when I try to get the source via BeautifulSoup it only shows this:
<li class="x-block-grid-item">
<h3 style="margin: 0 0 0.35em;font-size: 1em;letter-spacing: 0.05em;line-height: 1">Resolution</h3>
</li>
This is my code:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://prosettings.net/counterstrike/fer/'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("li",{"class":"x-block-grid-item"})
cont_res = containers[8].p.text
print("Res: " + cont_res)
When I try a different link for example:
my_url = 'https://prosettings.net/counterstrike/fallen/'
Everything works fine.
Try this. It should not disappoint you:
from urllib.request import urlopen
from bs4 import BeautifulSoup
URL = 'https://prosettings.net/counterstrike/fer/'
res = urlopen(URL).read()
soup = BeautifulSoup(res, "lxml")
cont_res = ' '.join([item.find(class_="man").text for item in soup.find_all(class_="x-block-grid-item") if "Resolution" in item.text])
# or using .select()
# cont_res = ' '.join([item.select_one(".man").text for item in soup.select(".x-block-grid-item") if "Resolution" in item.text])
print("Res: " + cont_res)
Output:
Res: 1152x864
I'm used to BeautifulSoup.text printing out the text of every tag and its children, but there may be something funny going on with these <p>'s in particular. At any rate, you're not getting the right soup, so maybe try requests instead of urllib, and then dig straight for the <p> tags with bs4.
site = 'https://prosettings.net/counterstrike/fer/'
r = requests.get(site)
soup = BeautifulSoup(r.content, 'html.parser')
list2 = soup.find_all('p', class_='man')
for item in list2:
if item.find('p'):
print(item.text)
Gives me all the class="man" <p> tags' info:
400
2.50
1000
125
1.00
0
6
1
1152x864
4:3
stretched
240
It's literally just the Resolution tag. No idea why.