Conditioning the soup selection on a web scrape.Python/BeautifulSoup - python

I have the following code for an item of a list of products:
<div class="nice_product_item">
<div class="npi_name">
<h2>
<a href="/solutii-mobile-telefoane-mobile/apple-telefon-mobil-apple-iphone-13-super-retina-xdr-oled-6.1-256gb-flash-camera-duala-12-12-mp-wi-fi-5g-ios-negru-3824456.html">
<span style="color:red">Stoc limitat!</span>
Telefon Mobil Apple iPhone 13, Super Retina XDR OLED 6.1", 256GB Flash, Camera Duala 12 + 12 MP, Wi-Fi, 5G, iOS (Negru)
</a>
</h2>
</div>
<div class="price_block_list">
<span class="old_price"> 999,00 Lei </span>
<span class="price_discount">-12%</span>
<span class="cheaper_by">mai ieftin cu 120,00 lei</span>
<span class="real_price">879,00 Lei</span>
<span class="evo-credit">evoCREDIT</span></div>
</div>
</div>
Some products have the price_discount span, while others don't:
<span class="price_discount">-12%</span>
I use the following code to scrape the names of products:
texts = []
for a in soup.select("div.npi_name a[href]"):
if a.span:
text = a.span.next_sibling
else:
text = a.string
texts.append(text.strip())
I don't know what condition I need in order to get only the names of the products that have a discount.
Note: It has to work for a list of products.

A way to process the data could be to select all items with discounts:
soup.select('div.nice_product_item:has(.price_discount):has(a[href])')
Iterate over the ResultSet, pick the information you need, and store it in a structured way, such as a list of dicts, so that you can process it later, e.g. load it into a DataFrame and save it to CSV, JSON, ...
Example
from bs4 import BeautifulSoup
import pandas as pd
html = '''
<div class="nice_product_item">
<div class="npi_name">
<h2>
<a href="/solutii-mobile-telefoane-mobile/apple-telefon-mobil-apple-iphone-13-super-retina-xdr-oled-6.1-256gb-flash-camera-duala-12-12-mp-wi-fi-5g-ios-negru-3824456.html">
<span style="color:red">Stoc limitat!</span>
Telefon Mobil Apple iPhone 13, Super Retina XDR OLED 6.1", 256GB Flash, Camera Duala 12 + 12 MP, Wi-Fi, 5G, iOS (Negru)
</a>
</h2>
</div>
<div class="price_block_list">
<span class="old_price"> 999,00 Lei </span>
<span class="price_discount">-12%</span>
<span class="cheaper_by">mai ieftin cu 120,00 lei</span>
<span class="real_price">879,00 Lei</span>
<span class="evo-credit">evoCREDIT</span></div>
</div>
</div>
'''
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed and emits a warning, so results can differ between machines.
soup = BeautifulSoup(html, 'html.parser')

data = []
# Keep only the product cards that contain both a discount badge and a link.
for e in soup.select('div.nice_product_item:has(.price_discount):has(a[href])'):
    # The product name is the last non-empty string inside the link; an
    # optional "Stoc limitat!" span may precede it.
    strings = list(e.a.stripped_strings)
    price = e.select_one('span.real_price')
    discount = e.select_one('span.price_discount')
    data.append({
        'url': e.a['href'],
        'label': strings[-1] if strings else None,
        'price': price.text if price else None,
        'discount': discount.text if discount else None,
        'other': 'edit for elements you need',
    })

pd.DataFrame(data)
Output
url
label
price
discount
other
/solutii-mobile-telefoane-mobile/apple-telefon-mobil-apple-iphone-13-super-retina-xdr-oled-6.1-256gb-flash-camera-duala-12-12-mp-wi-fi-5g-ios-negru-3824456.html
Telefon Mobil Apple iPhone 13, Super Retina XDR OLED 6.1", 256GB Flash, Camera Duala 12 + 12 MP, Wi-Fi, 5G, iOS (Negru)
879,00 Lei
-12%
edit for elements you need

Related

Scrape a line of text from a website inside a div

I don't know how to scrape this text
Telefon Mobil Apple iPhone 13, Super Retina XDR OLED 6.1", 256GB
Flash, Camera Duala 12 + 12 MP, Wi-Fi, 5G, iOS (Negru)
<div class="npi_name">
<h2>
<a href="/solutii-mobile-telefoane-mobile/apple-telefon-mobil-apple-iphone-13-super-retina-xdr-oled-6.1-256gb-flash-camera-duala-12-12-mp-wi-fi-5g-ios-negru-3824456.html">
<span style="color:red">Stoc limitat!</span>
Telefon Mobil Apple iPhone 13, Super Retina XDR OLED 6.1", 256GB Flash, Camera Duala 12 + 12 MP, Wi-Fi, 5G, iOS (Negru)
</a>
</h2>
</div>
What I've tried:
for n in j.find_all("div","npi_name"):
n2=n.find("a", href=True, text=True)
try:
n1=n2['href']
except:
n2=n.find("a")
n1=n2['href']
n3=n2.string
print(n3)
Output:
None
Try:
from bs4 import BeautifulSoup
html_doc = """
<div class="npi_name">
<h2>
<a href="/solutii-mobile-telefoane-mobile/apple-telefon-mobil-apple-iphone-13-super-retina-xdr-oled-6.1-256gb-flash-camera-duala-12-12-mp-wi-fi-5g-ios-negru-3824456.html">
<span style="color:red">Stoc limitat!</span>
Telefon Mobil Apple iPhone 13, Super Retina XDR OLED 6.1", 256GB Flash, Camera Duala 12 + 12 MP, Wi-Fi, 5G, iOS (Negru)
</a>
</h2>
</div>
"""
soup = BeautifulSoup(html_doc, "html.parser")
# Join only the <a> tag's own text nodes (recursive=False); this skips the
# text of the nested "Stoc limitat!" <span>.
# `string=` is the current name of the deprecated `text=` argument.
link = soup.select_one(".npi_name a")
t = "".join(link.find_all(string=True, recursive=False))
print(t.strip())
Prints:
Telefon Mobil Apple iPhone 13, Super Retina XDR OLED 6.1", 256GB Flash, Camera Duala 12 + 12 MP, Wi-Fi, 5G, iOS (Negru)
I've made a few assumptions but something like this should work:
for n in j.find_all("div", {"class": "npi_name"}):
    # For the HTML shown, the link's children are: whitespace text, the
    # <span>, then the product name, so the name is at index 2.
    print(n.find("a").contents[2].strip())
This is how I arrived at my answer (the HTML you provided was entered in to a.html):
from bs4 import BeautifulSoup
def main():
    """Print the product name found in each ``div.npi_name`` of a.html."""
    with open("a.html", "r") as file:
        html = file.read()

    soup = BeautifulSoup(html, "html.parser")
    divs = soup.find_all("div", {"class": "npi_name"})
    for div in divs:
        # For the HTML shown, index 2 of the link's children is the text
        # node that follows the "Stoc limitat!" <span>.
        a = div.find("a").contents[2].strip()
        # Testing
        print(a)


if __name__ == "__main__":
    main()
texts = []
for a in soup.select("div.npi_name a[href]"):
    # The product name is the last child node of the link, whether or not a
    # <span> precedes it.
    texts.append(a.contents[-1].strip())
Or more explicitly:
texts = []
for a in soup.select("div.npi_name a[href]"):
    if a.span:
        # A leading <span> (e.g. "Stoc limitat!") is present: the name is the
        # text node right after it.
        text = a.span.next_sibling
    else:
        # No nested tag: the link's only child is the name itself.
        text = a.string
    texts.append(text.strip())
Select your elements more specifically, e.g. with CSS selectors, and use stripped_strings to get the text, assuming it is always the last node in your element:
for e in soup.select('div.npi_name a[href]'):
    # stripped_strings yields every non-empty string inside the link with
    # surrounding whitespace removed; the product name is the last one.
    text = list(e.stripped_strings)[-1]
    print(text)
This way you could also process other information if needed, e.g. the href, the span text, ...
Example
Select multiple items, store information in list of dicts and convert it into a dataframe:
from bs4 import BeautifulSoup
import pandas as pd
html = '''
<div class="npi_name">
<h2>
<a href="/solutii-mobile-telefoane-mobile/apple-telefon-mobil-apple-iphone-13-super-retina-xdr-oled-6.1-256gb-flash-camera-duala-12-12-mp-wi-fi-5g-ios-negru-3824456.html">
<span style="color:red">Stoc limitat!</span>
Telefon Mobil Apple iPhone 13, Super Retina XDR OLED 6.1", 256GB Flash, Camera Duala 12 + 12 MP, Wi-Fi, 5G, iOS (Negru)
</a>
</h2>
</div>
'''
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed and emits a warning, so results can differ between machines.
soup = BeautifulSoup(html, 'html.parser')

data = []
for e in soup.select('div.npi_name a[href]'):
    # All non-empty strings inside the link; the product name is the last one.
    strings = list(e.stripped_strings)
    data.append({
        'url': e['href'],
        # The optional <span> carries the stock notice (e.g. "Stoc limitat!").
        'stock': e.span.text if e.span else None,
        # Guard against a link with no text instead of raising IndexError.
        'label': strings[-1] if strings else None,
    })

pd.DataFrame(data)
Output
url
stock
label
/solutii-mobile-telefoane-mobile/apple-telefon-mobil-apple-iphone-13-super-retina-xdr-oled-6.1-256gb-flash-camera-duala-12-12-mp-wi-fi-5g-ios-negru-3824456.html
Stoc limitat!
Telefon Mobil Apple iPhone 13, Super Retina XDR OLED 6.1", 256GB Flash, Camera Duala 12 + 12 MP, Wi-Fi, 5G, iOS (Negru)

how to scrape nested two elements with python

Hi, I would like to get the information inside the <del> and <ins> tags shown below, but I could not find a solution. Does anyone have an idea how to scrape these values?
this is my python code
import requests
import json
from bs4 import BeautifulSoup
header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'}
base_url = "https://www.n11.com/super-firsatlar"
r = requests.get(base_url,headers=header)
if r.status_code == 200:
soup = BeautifulSoup(r.text, 'html.parser')
books = soup.find_all('li',attrs={"class":"column"})
result=[]
for book in books:
title=book.find('h3').text
link=base_url +book.find('a')['href']
picture = base_url + book.find('img')['src']
price=book.find('p', {'class': 'del'})
single ={'title':title,'link':link,'picture':picture,'price':price}
result.append(single)
with open('book.json','w', encoding='utf-8') as f:
json.dump(result,f,indent=4,ensure_ascii=False)
else:
print(r.status_code)
and this is my HTML page
<li class="column">
<script type="text/javascript">
var customTextOptionMap = {};
</script>
<div id="p-457010862" class="columnContent ">
<div class="pro">
<a href="https://www.n11.com/urun/oppo-a73-128-gb-oppo-turkiye-garantili-1599155?magaza=gelecekbizde"
title="Oppo A73 128 GB (Oppo Türkiye Garantili)" class="plink" data-id="457010862">
<img data-original="https://n11scdn1.akamaized.net/a1/215/elektronik/cep-telefonu/oppo-a73-128-gb-oppo-turkiye-garantili__1298508275589871.jpg"
width="215" height="215"
src="https://n11scdn1.akamaized.net/a1/215/elektronik/cep-telefonu/oppo-a73-128-gb-oppo-turkiye-garantili__1298508275589871.jpg"
alt="Oppo A73 128 GB (Oppo Türkiye Garantili)" class="lazy" style="">
<h3 class="productName ">
Oppo A73 128 GB (Oppo Türkiye Garantili)</h3>
<span class="loading"></span>
</a>
</div>
<div class="proDetail">
<a href="https://www.n11.com/urun/oppo-a73-128-gb-oppo-turkiye-garantili-1599155?magaza=gelecekbizde"
class="oldPrice" title="Oppo A73 128 GB (Oppo Türkiye Garantili)">
<del>2.999, 00 TL</del>
</a> <a href="https://www.n11.com/urun/oppo-a73-128-gb-oppo-turkiye-garantili-1599155?magaza=gelecekbizde"
class="newPrice" title="Oppo A73 128 GB (Oppo Türkiye Garantili)">
<ins>2.899, 00<span content="TRY">TL</span></ins>
</a>
<div class="discount discountS">
<div>
<span class="percent">%</span>
<span class="ratio">3</span>
</div>
</div>
<span class="textImg freeShipping"></span>
<p class="catalogView-hover-separate"></p>
<div class="moreOpt">
<a title="Oppo A73 128 GB (Oppo Türkiye Garantili)" class="textImg moreOptBtn"
href="https://www.n11.com/urun/oppo-a73-128-gb-oppo-turkiye-garantili-1599155?magaza=gelecekbizde"></a>
</div>
</div>
</div>
</li>
Unless I am not understanding your question, it should be as simple as doing:
del_data = soup.find_all("del")
ins_data = soup.find_all("ins")
is this not what you're trying to achieve? If not please clarify your question
del and ins are not class names but tags. You can simply find them with Soup.find_all('del')
# <del> is a tag name, so it is passed as the first argument rather than as a
# class filter; .text gives the string inside each matched element.
price = book.find_all('del')
for p in price:
    print(p.text)
gives
2.999,00 TL
189,90 TL
8.308,44 TL
499,90 TL
6.999,00 TL
99,00 TL
18,00 TL
499,00 TL
169,99 TL
1.499,90 TL
3.010,00 TL
2.099,90 TL
......
which is what you want I guess. You have to access the text attribute here. So, the element is located. The way you want to serialize it is a different question.

How do I scrape nested data using selenium and Python>

I basically want to scrape "Feb 2016 – Present", which sits in the span right after <span class="visually-hidden">Dates Employed</span>, but I can't seem to get to it. Here's the HTML:
<div class="pv-entity__summary-info">
<h3 class="Sans-17px-black-85%-semibold">Litigation Paralegal</h3>
<h4>
<span class="visually-hidden">Company Name</span>
<span class="pv-entity__secondary-title Sans-15px-black-55%">Olswang</span>
</h4>
<div class="pv-entity__position-info detail-facet m0"><h4 class="pv-entity__date-range Sans-15px-black-55%">
<span class="visually-hidden">Dates Employed</span>
<span>Feb 2016 – Present</span>
</h4><h4 class="pv-entity__duration de Sans-15px-black-55% ml0">
<span class="visually-hidden">Employment Duration</span>
<span class="pv-entity__bullet-item">1 yr 2 mos</span>
</h4><h4 class="pv-entity__location detail-facet Sans-15px-black-55% inline-block">
<span class="visually-hidden">Location</span>
<span class="pv-entity__bullet-item">London, United Kingdom</span>
</h4></div>
</div>
And here is what I've been doing at the moment with selenium in my code:
# XPath tests attributes with "@" ("#class" is not valid XPath).
date = browser.find_element_by_xpath('.//div[@class = "pv-entity__duration de Sans-15px-black-55% ml0"]').text
print(date)
But this gives no results. How would I go about pulling the date?
There is no div with class="pv-entity__duration de Sans-15px-black-55% ml0"; that class belongs to an h4. If you want to get the text of the enclosing div, then try:
# XPath tests attributes with "@" ("#class" is not valid XPath).
# This div wraps the date-range, duration and location <h4> elements.
date = browser.find_element_by_xpath('.//div[@class = "pv-entity__position-info detail-facet m0"]').text
print(date)
If you want to get "Feb 2016 - Present", then try
# XPath tests attributes with "@" ("#class" is not valid XPath).
# span[2] is the second <span> of the date-range <h4>; the first one holds
# the visually hidden "Dates Employed" label.
date = browser.find_element_by_xpath('//h4[@class="pv-entity__date-range Sans-15px-black-55%"]/span[2]').text
print(date)
You can rewrite your xpath code something like this :
# -*- coding: utf-8 -*-
from lxml import html
import unicodedata
html_str = """
<div class="pv-entity__summary-info">
<h3 class="Sans-17px-black-85%-semibold">Litigation Paralegal</h3>
<h4>
<span class="visually-hidden">Company Name</span>
<span class="pv-entity__secondary-title Sans-15px-black-55%">Olswang</span>
</h4>
<div class="pv-entity__position-info detail-facet m0"><h4 class="pv-entity__date-range Sans-15px-black-55%">
<span class="visually-hidden">Dates Employed</span>
<span>Feb 2016 – Present</span>
</h4><h4 class="pv-entity__duration de Sans-15px-black-55% ml0">
<span class="visually-hidden">Employment Duration</span>
<span class="pv-entity__bullet-item">1 yr 2 mos</span>
</h4><h4 class="pv-entity__location detail-facet Sans-15px-black-55% inline-block">
<span class="visually-hidden">Location</span>
<span class="pv-entity__bullet-item">London, United Kingdom</span>
</h4></div>
</div>
"""
root = html.fromstring(html_str)

# XPath tests attributes with "@class". Each query returns the text nodes of
# the <h4>'s <span> children; index 0 is the visually hidden label and
# index 1 is the value we want.
# For fetching "Feb 2016 – Present":
txt = root.xpath('//h4[@class="pv-entity__date-range Sans-15px-black-55%"]/span/text()')[1]
# For fetching "1 yr 2 mos":
txt1 = root.xpath('//h4[@class="pv-entity__duration de Sans-15px-black-55% ml0"]/span/text()')[1]
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(txt)
print(txt1)
This will result in :
Feb 2016 – Present
1 yr 2 mos

Retrieve bbc weather data with identical span class and nested spans

I am trying to pull data form BBC weather with a view to use in a home automation dashboard.
The HTML code I can pull fine and I can pull one set of temps but it just pulls the first.
</li>
<li class="daily__day-tab day-20150418 ">
<a data-ajax-href="/weather/en/2646504/daily/2015-04-18?day=3" href="/weather/2646504?day=3" rel="nofollow">
<div class="daily__day-header">
<h3 class="daily__day-date">
<span aria-label="Saturday" class="day-name">Sat</span>
</h3>
</div>
<span class="weather-type-image weather-type-image-40" title="Sunny"><img alt="Sunny" src="http://static.bbci.co.uk/weather/0.5.327/images/icons/tab_sprites/40px/1.png"/></span>
<span class="max-temp max-temp-value"> <span class="units-values temperature-units-values"><span class="units-value temperature-value temperature-value-unit-c" data-unit="c">13<span class="unit">°C</span></span><span class="unit-types-separator"> </span><span class="units-value temperature-value temperature-value-unit-f" data-unit="f">55<span class="unit">°F</span></span></span></span>
<span class="min-temp min-temp-value"> <span class="units-values temperature-units-values"><span class="units-value temperature-value temperature-value-unit-c" data-unit="c">5<span class="unit">°C</span></span><span class="unit-types-separator"> </span><span class="units-value temperature-value temperature-value-unit-f" data-unit="f">41<span class="unit">°F</span></span></span></span>
<span class="wind wind-speed windrose-icon windrose-icon--average windrose-icon-40 windrose-icon-40--average wind-direction-ene" data-tooltip-kph="31 km/h, East North Easterly" data-tooltip-mph="19 mph, East North Easterly" title="19 mph, East North Easterly">
<span class="speed"> <span class="wind-speed__description wind-speed__description--average">Wind Speed</span>
<span class="units-values windspeed-units-values"><span class="units-value windspeed-value windspeed-value-unit-kph" data-unit="kph">31 <span class="unit">km/h</span></span><span class="unit-types-separator"> </span><span class="units-value windspeed-value windspeed-value-unit-mph" data-unit="mph">19 <span class="unit">mph</span></span></span></span>
<span class="description blq-hide">East North Easterly</span>
</span>
This is my code which isn’t working
import urllib2
import pprint
from bs4 import BeautifulSoup
htmlFile=urllib2.urlopen('http://www.bbc.co.uk/weather/2646504?day=1')
htmlData = htmlFile.read()
soup = BeautifulSoup(htmlData)
table=soup.find("div","daily-window")
temperatures=[str(tem.contents[0]) for tem in table.find_all("span",class_="units-value temperature-value temperature-value-unit-c")]
mintemp=[str(min.contents[0]) for min in table.find_("span",class_="min-temp min-temp-value")]
maxtemp=[str(min.contents[0]) for min in table.find_all("span",class_="max-temp max-temp-value")]
windspeeds=[str(speed.contents[0]) for speed in table.find_all("span",class_="units-value windspeed-value windspeed-value-unit-mph")]
pprint.pprint(zip(temperatures,temp2,windspeeds))
Your min and max temperature extraction is wrong. You select the whole min-temp span (which includes both the °C and °F values), so taking the first item of its contents gives you a whitespace-only string.
Also, the classes that identify the min-temp tag (class="min-temp min-temp-value") are not on the same element as the Celsius value (class="temperature-value-unit-c"), so I suggest you use a CSS selector.
E.g., finding all of your min-temp Celsius spans could be:
table.select('span.min-temp.min-temp-value span.temperature-value-unit-c')
This selects every span with class temperature-value-unit-c that is a descendant of a span with classes min-temp and min-temp-value.
Do the same for the other lists, such as max temperature and wind speed.

python parse html elements while scraping

well i have a website :
http://www.custojusto.pt/Lisboa?ca=14_s&th=1&q=macbook&cg=0&w=1
and I want to get the names of all the ads and the price of each item in an array. What I have right now is:
import urllib2
from BeautifulSoup import BeautifulSoup
import re
listofads = []
page = urllib2.urlopen("http://www.custojusto.pt/Lisboa?ca=14_s&th=1&q=macbook&cg=0&w=1").read()
soup = BeautifulSoup(page)
for a in soup.findAll("div", {"class":re.compile("lista")}):
for i in a:
c = soup.findAll('h2')
y = soup.findAll("span", {"class":re.compile("right")})
listofads.append(c)
listofads.append(y)
print listofads
what i get is something like this :
</h2>, <h2>
Procura: Macbook Pro i7, 15'
</h2>], [<span class="right">50 €</span>
which look very bad .... i want to get :
Macbook bla bla . price = 500
Macbook B . price = 600
and so on
The html of the site is like this :
<div class="listofads">
<div class="lista " style="cursor: pointer;">
<div class="lista " style="cursor: pointer;">
<div class="li_image">
<div class="li_desc">
<a href="http://www.custojusto.pt/Lisboa/Laptops/Macbook+pro+15-11018054.htm?xtcr=2&" name="11018054">
<h2> Macbook pro 15 </h2>
</a>
<div class="clear"></div>
<span class="li_date largedate listline"> Informática & Acessórios - Loures </span>
<span class="li_date largedate listline">
</div>
<div class="li_categoria">
<span class="li_price">
<ul>
<li>
<span class="right">1 199 €</span>
<div class="clear"></div>
</li>
<li class="excep"> </li>
</ul>
</span>
</div>
<div class="clear"></div>
</div>
As you can see i only want the H2 value ( text ) on the div with the class "li_desc" and the price from the span on the class "right" .
I don't know how to do it with BeautifulSoup as it doesn't support xpath, but here's how you could do it nicely with lxml:
import urllib2
from lxml import etree
from lxml.cssselect import CSSSelector
url = "http://www.custojusto.pt/Lisboa?ca=14_s&th=1&q=macbook&cg=0&w=1"
response = urllib2.urlopen(url)
htmlparser = etree.HTMLParser()
tree = etree.parse(response, htmlparser)
my_products = []

# Here, we harvest all the results into a list of dictionaries, containing
# the items we want.
for product_result in CSSSelector(u'div.lista')(tree):
    # Now, we can select the children elements of each div.lista. Every
    # xpath() call returns a list of matching elements, which may be empty.
    this_product = {
        u'name': product_result.xpath('div[2]/a/h2'),  # first h2 of the second child div
        u'category': product_result.xpath('div[2]/span[1]'),  # first span of the second child div
        u'price': product_result.xpath('div[3]/span/ul/li[1]/span'),  # third div, span, ul, first li, span tag
    }
    # Only print the name when one was found; indexing an empty result list
    # would raise IndexError.
    if this_product[u'name']:
        print(this_product[u'name'][0].text)
    my_products.append(this_product)

# Let's inspect a product result now:
for product in my_products:
    names = product.get(u'name')
    prices = product.get(u'price')
    # .text is None when an element has no leading text, hence the `or u''`.
    print(u'Product Name: "{0}", costs: "{1}"'.format(
        (names[0].text or u'').replace(u'Procura:', u'').strip() if names else 'NONAME!',
        (prices[0].text or u'').strip() if prices else u'NO PRICE!',
    ))
And, here's some output:
Product Name: "Macbook Pro", costs: "890 €"
Product Name: "Memoria para Macbook Pro", costs: "50 €"
Product Name: "Macbook pro 15", costs: "1 199 €"
Product Name: "Macbook Air 13", costs: "1 450 €"
Some items do not contain a price, so results need to be checked before outputting each one.

Categories