Access the second span in loop using Beautiful Soup - python

How can I access the second span element in a found div?
My code looks like this:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

# Gumtree car listing to scrape.
url = 'https://www.gumtree.pl/a-samochody-osobowe/krakow/honda-civic-1-5-sport-mt-2019/1006295441720911232816609'

# Open the connection, grab the page, and close the handle.
uClient = uReq(url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')

# The details container holds one <div class="attribute"> per spec row;
# print each raw row. (The loop body was flattened in the original
# transcript — it must be indented to be valid Python.)
product = page_soup.find('div', {'class': 'vip-details'})
for li in product.findAll('div', {'class': 'attribute'}):
    print(li)
The result is something like this:
[...]
<div class="attribute"><span class="name">Rok</span><span class="value">2019</span></div>
<div class="attribute"><span class="name">Kilometry</span><span class="value">12</span></div>
[...]
How can I access the second span element — in my case, the values 2019 and 12?
I tried to use such solutions but without success:
# Attempt 1: `li.span.span` looks for a <span> nested *inside* the first
# <span>; here the name/value spans are siblings, so this yields None.
for li in product.findAll('div', {'class': 'attribute'}):
print(li.span.span)
and
# Attempt 2: subscripting a Tag (`li.span[1]`) looks up an HTML *attribute*
# of that tag, not a second <span> element — presumably raises KeyError here.
for li in product.findAll('div', {'class': 'attribute'}):
print(li.span[1])
How can I access the second span of an element in my loop?

You can use find_next()
# find_next() continues the document search from the current tag, so this
# returns the first <span class="value"> after each attribute row.
for li in product.findAll('div', {'class': 'attribute'}):
print(li.find_next("span" , class_='value').text)
Or use a CSS selector.
# CSS selector: every .value span inside an .attribute row, in one pass.
for li in product.select('.attribute .value'):
print(li.text)
Code:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

# Gumtree car listing to scrape.
url = 'https://www.gumtree.pl/a-samochody-osobowe/krakow/honda-civic-1-5-sport-mt-2019/1006295441720911232816609'
uClient = uReq(url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')

# Select the details container, then print the text of every
# <span class="value"> inside an attribute row. (Loop body re-indented —
# the original transcript flattened it.)
product = page_soup.select_one('.vip-details')
for li in product.select('.attribute .value'):
    print(li.text)

I got my expected result by using two loops:
# Outer loop walks each attribute row; inner loop prints every
# <span class="value"> in that row. (Both bodies re-indented — the
# original transcript flattened the nesting. Assumes `product` is the
# vip-details div from the earlier snippet.)
for li in product.findAll('div', {'class': 'attribute'}):
    for value in li.findAll('span', {'class': 'value'}):
        print(value.text)
if there is a simpler solution to the problem, I invite you to comment.

Related

How do I make this web crawler print only the titles of the songs?

import requests
from bs4 import BeautifulSoup
url = 'https://www.officialcharts.com/charts/singles-chart'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
urls = []
# NOTE(review): this first pass prints the href of EVERY anchor on the
# page — that is the "entire page" output the asker complains about.
for link in soup.find_all('a'):
print(link.get('href'))
def chart_spider(max_pages):
page = 1
# BUG: with chart_spider(1), `page >= max_pages` is true on the first
# pass and stays true as page grows — an infinite loop, matching the
# "runs 30+ minutes" symptom. `page <= max_pages` was probably intended.
while page >= max_pages:
url = "https://www.officialcharts.com/charts/singles-chart"
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('a', {"class": "title"}):
# BUG: `link.title(href)` reads `href` before it is ever assigned and
# calls the tag's .title as a function — this line cannot work as
# written; `link.text` was presumably intended.
href = "BAD HABITS" + link.title(href)
print(href)
page += 1
chart_spider(1)
Wondering how to make this print just the titles of the songs instead of the entire page. I want it to go through the top 100 charts and print all the titles for now. Thanks
Here's is a possible solution, which modify your code as little as possible:
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup

URL = 'https://www.officialcharts.com/charts/singles-chart'


def chart_spider():
    """Fetch the singles chart and print one song title per line.

    Each song title sits in a <div class="title">; contents[1] skips the
    leading whitespace node to reach the element holding the title text.
    """
    source_code = requests.get(URL)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for title in soup.find_all('div', {"class": "title"}):
        print(title.contents[1].string)


chart_spider()
The result is a list of all the titles found in the page, one per line.
If all you want is the titles for each song on the top 100,
this code:
import requests
from bs4 import BeautifulSoup
# Chart page to scrape.
url='https://www.officialcharts.com/charts/singles-chart/'
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')
# One <div class="title"> per song; strip embedded newlines from each title.
titles = [i.text.replace('\n', '') for i in soup.find_all('div', class_="title")]
does what you are looking for.
You can do like this.
The Song title is present inside a <div> tag with class name as title.
Select all those <div> with .find_all(). This gives you a list of all <div> tags.
Iterate over the list and print the text of each div.
from bs4 import BeautifulSoup
import requests

# Chart page to scrape.
url = 'https://www.officialcharts.com/charts/singles-chart/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')

# One <div class="title"> per song; print each title with surrounding
# whitespace stripped. (Loop body re-indented — the original transcript
# flattened it.)
d = soup.find_all('div', class_='title')
for i in d:
    print(i.text.strip())
Sample Output:
BAD HABITS
STAY
REMEMBER
BLACK MAGIC
VISITING HOURS
HAPPIER THAN EVER
INDUSTRY BABY
WASTED
.
.
.

python stock price using BeautifulSoup

I am trying to get the price of the stock using the below code, but it returns null for the current price. Please let me know where I am making an error.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
my_url = 'https://finance.yahoo.com/quote/MMM/key-statistics?p=MMM'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup =soup(page_html,"lxml")
# NOTE(review): redundant — uClient was already closed above.
uClient.close()
# I tried this option 1
# NOTE(review): attrs={"span": ...} searches for a <div> carrying an HTML
# *attribute* named "span", not a nested <span class=...> — that is why
# this find returns None.
currentPrice = page_soup.find('div',attrs={"span": "Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(ib)"})
print(currentPrice)
# I tried this option 2
# NOTE(review): iterating a single Tag yields its children, and
# page_soup.span is the first <span> of the whole document, printed once
# per child — neither reaches the quote. The price is presumably rendered
# by JavaScript, so it may not be in the static HTML at all — verify.
for currentPrice in page_soup.find("div",{"class": "D(ib) Mend(20px)"}) :
print (page_soup.span)
You might want to have a look at yfinance
https://pypi.org/project/yfinance/

I'm using Python 3.7 and BS4 for web scraping; there is a problem I couldn't solve — I hope someone knows how to fix it.

I am supposed to get product information from the source page. The data I want is in an HTML tag, but there is another tag nested inside it, so when I save the data to local storage it looks very bad. I hope someone knows how to fix this problem.
Here is my code:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

# Listing page. (The URL literal was split across two lines in the
# original transcript, which is a SyntaxError; rejoined here.)
my_url = 'https://list.jd.com/list.html?cat=9987,653,655&ev=exbrand_15127&page=1'

# Opening up connection, grabbing the page.
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# HTML parsing.
page_soup = soup(page_html, "html.parser")

filename = "params.csv"
# `with` guarantees the file is closed even if scraping raises mid-loop.
with open(filename, "w") as f:
    # One <li class="gl-item"> per product.
    li_containers = page_soup.findAll("li", {"class": "gl-item"})
    for container in li_containers:
        # Product name lives at div.p-name > a > em.
        p_name_div = container.find("div", {"class": "p-name"})
        p_name = p_name_div.a.em.text.strip()
        print(p_name)
        f.write(p_name)
Here are some screenshots.
I wanted it to be like this:
But it ended up looking like this:
Without span tag
With span tag
Try this
# Listing page. (URL literal rejoined — it was split across two lines in
# the original transcript, which is a SyntaxError.)
my_url = 'https://list.jd.com/list.html?cat=9987,653,655&ev=exbrand_15127&page=1'
# Opening up connection, grabbing the page.
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# HTML parsing.
page_soup = soup(page_html, "html.parser")
filename = "params.csv"
f = open(filename, "w")
# Grabs each product; the answer's change is the extra .strip(" ") on
# output (kept as written). Loop bodies re-indented.
li_containers = page_soup.findAll("li", {"class": "gl-item"})
for i in range(0, len(li_containers)):
    p_name_div = li_containers[i].find("div", {"class": "p-name"})
    p_name = p_name_div.a.em.text.strip()
    print(p_name.strip(" "))
    f.write(p_name.strip(" "))
f.close()

Python: BeautifulSoup not always getting all text data

I've got a strange problem with my web scraper. I am trying to get the data from a website using BeautifulSoup.
My code works on 90% of all links i've tried out but on a few it does not read the page fully.
The text that intrests me is "1152x864"
When checking the source code in my browser I clearly see the text:
<li class="x-block-grid-item">
<h3 style="margin: 0 0 0.35em;font-size: 1em;letter-spacing: 0.05em;line-height: 1">Resolution</h3>
<p class="man">1152x864</p>
</li>
But when I try to get the source via BeautifulSoup it only shows this:
<li class="x-block-grid-item">
<h3 style="margin: 0 0 0.35em;font-size: 1em;letter-spacing: 0.05em;line-height: 1">Resolution</h3>
</li>
This is my code:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
# Player page whose "Resolution" value is missing from the parsed soup.
my_url = 'https://prosettings.net/counterstrike/fer/'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
# One <li class="x-block-grid-item"> per stat block.
containers = page_soup.findAll("li",{"class":"x-block-grid-item"})
# NOTE(review): hard-coding index 8 and assuming a <p> child is fragile —
# when the <p> is absent from the parsed tree (the reported symptom),
# `.p` is None and `.text` raises AttributeError.
cont_res = containers[8].p.text
print("Res: " + cont_res)
When I try a different link for example:
my_url = 'https://prosettings.net/counterstrike/fallen/'
Everything works fine.
Try this. It should not disappoint you:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch and parse the player page (lxml handles the markup more
# leniently than html.parser).
URL = 'https://prosettings.net/counterstrike/fer/'
markup = urlopen(URL).read()
parsed = BeautifulSoup(markup, "lxml")

# Collect the .man value of every grid item whose text mentions
# "Resolution", then join the matches with spaces.
matches = []
for block in parsed.find_all(class_="x-block-grid-item"):
    if "Resolution" in block.text:
        matches.append(block.find(class_="man").text)
cont_res = ' '.join(matches)
print("Res: " + cont_res)
Output:
Res: 1152x864
I'm used to BeautifulSoup.text printing out the text of every tag and its children, but there may be something funny going on with these <p>'s in particular. At any rate, you're not getting the right soup, so maybe try requests instead of urllib, and then dig straight for the <p> tags with bs4.
# Assumes `requests` and `BeautifulSoup` are already imported, as in the
# question's snippets.
site = 'https://prosettings.net/counterstrike/fer/'
r = requests.get(site)
soup = BeautifulSoup(r.content, 'html.parser')

# Every stat value on the page lives in a <p class="man">.
list2 = soup.find_all('p', class_='man')
for item in list2:
    # NOTE(review): the original guarded this print with
    # `if item.find('p'):`, but <p> elements cannot nest, so that test is
    # never true and nothing would print; dropping the guard matches the
    # output the author actually shows (400, 2.50, ..., 1152x864, ...).
    print(item.text)
Gives me all the class="man" <p> tags' info:
400
2.50
1000
125
1.00
0
6
1
1152x864
4:3
stretched
240
It's literally just the Resolution tag. No idea why.

No output in console python

from bs4 import BeautifulSoup
import requests
def imdb_spider():
url = 'http://www.imdb.com/chart/top'
source_code = requests.get(url)
plain_text = source_code.text
# NOTE(review): no parser argument — bs4 picks whichever parser is
# installed and emits a warning; pass 'html.parser' or 'lxml' explicitly.
soup = BeautifulSoup(plain_text)
# NOTE(review): 'secondaryInfo' is not the class of the <a> elements on
# this chart, so findAll matches nothing and the loop body never runs —
# which is why the console stays empty.
for link in soup.findAll('a', {'class': 'secondaryInfo' }):
href = link.get('href')
print(href)
imdb_spider()
I'm trying to get links of all top rated movies from imdb . I'm using pycharm . The code runs for more than 30 mins but i'm not getting any print in my console.
You're correct that there's an element with class secondaryInfo for every movie title, but that's not the a element. If you want to find that, you have to use a different selector. For example, the following selector will do the trick instead of using soup.findAll().
soup.select('td.titleColumn a')
The problem is that {'class': 'secondaryInfo' } is a parameter of <span> object.
So try this:
from bs4 import BeautifulSoup
import requests


def imdb_spider():
    """Print the href of each movie link in IMDb's top chart."""
    url = 'http://www.imdb.com/chart/top'
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "lxml")
    # The movie link lives inside the td.titleColumn cell — not on the
    # span.secondaryInfo element the question targeted.
    for td in soup.findAll('td', {'class': 'titleColumn'}):
        href = td.find('a').get('href')
        print(href)


imdb_spider()

Categories