BeautifulSoup does not show all the elements on a website - Python

I am making a parser for the website https://edp.by/shop/womens-fragrances/
First I got all the links from the site to navigate through it:
import requests
from bs4 import BeautifulSoup

def get_html(url):
    r = requests.get(url,'lxml')
    return r.text

url='https://edp.by/'
html=get_html(url)
soup=BeautifulSoup(html, )
x = soup.findAll("div", {"class": "row mainmenu"})
#print(x)
links=[]
for i in x:
    z=i.find_all("ul", {"class": "nav navbar-nav"})[0].find_all("a", {"class": "dropdown-toggle"})
    print(233,z,len(z),type(z))
    for i in z:
        q=i["href"]
        links.append(url+str(q))
Then I am trying to get each product from the links:
url='https://edp.by/shop/womens-fragrances/'
html=get_html(url)
soup=BeautifulSoup(html, )
#x = soup.findAll("div", {"class": "row"})
#print()
action = soup.find('form').get('action')
print(action)
and the result is: /search/
But on the website I can see the whole structure via the Google Chrome inspector:
<form method="get" action="/shop/womens-fragrances/">
<div class="rr-widget" data-rr-widget-category-id="594" data-rr-widget-id="516e7cba0d422d00402a14b4" data-rr-widget-width="100%"></div>
<div class="shop_block">
<div class="shop_table">
<div class="col-md-4 col-sm-4 col-xs-12 product">
<div class="block">
<a href="/shop/womens-fragrances/43653/">
<img src="/images/no-image.png" class="text-center" alt="" title="">
<p class="fch"></p>
<p class="tch">0,00 руб. </p>
</a>
I want to get the link to the product, the image and the texts, but bs4 does not show them. What is the reason, and how could I get them?
I also tried MechanicalSoup, also with no result:
import mechanicalsoup
browser = mechanicalsoup.StatefulBrowser()
browser.open(links[0])
form = browser.select_form('form')
action = form.form.attrs['action']
print(action)  # prints: /search/

.find() will only get the first occurrence of that tag. There are 6 elements with the <form> tag. You can use .find_all() instead; when you iterate through the result, you'll see the form you want is at index 3 (0-based) in that list:
import requests
from bs4 import BeautifulSoup

def get_html(url):
    r = requests.get(url,'lxml')
    return r.text

url='https://edp.by/'
html=get_html(url)
soup=BeautifulSoup(html, )
x = soup.findAll("div", {"class": "row mainmenu"})
#print(x)
links=[]
for i in x:
    z=i.find_all("ul", {"class": "nav navbar-nav"})[0].find_all("a", {"class": "dropdown-toggle"})
    print(233,z,len(z),type(z))
    for i in z:
        q=i["href"]
        links.append(url+str(q))
url='https://edp.by/shop/womens-fragrances/'
html=get_html(url)
soup=BeautifulSoup(html, 'html.parser')
#x = soup.findAll("div", {"class": "row"})
#print()
actions = soup.find_all('form')
for action in actions:
    alpha = action.get('action')
    print(alpha)
Output:
/search/
/filter-ajax/
/filter-ajax/
/shop/womens-fragrances/
/shop/womens-fragrances/?lxml
/users/
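To actually get the product link, image and texts the question asks for, here is a minimal sketch (not part of the original answer, and assuming the markup shown in the question's snippet) that walks the product blocks inside that form:

from urllib.parse import urljoin

# Sketch: iterate the product blocks inside the listing form, assuming the
# structure from the question (<div class="... product"> containing an <a>,
# an <img>, <p class="fch"> and <p class="tch">).
products_form = soup.find('form', {'action': '/shop/womens-fragrances/'})
if products_form:
    for product in products_form.find_all('div', {'class': 'product'}):
        link = product.find('a')
        if link is None:
            continue
        img = product.find('img')
        name = product.find('p', {'class': 'fch'})
        price = product.find('p', {'class': 'tch'})
        print(urljoin(url, link['href']),                     # product page link
              urljoin(url, img['src']) if img else None,      # image URL
              name.get_text(strip=True) if name else '',      # name text
              price.get_text(strip=True) if price else '')    # price text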

Related

Python - BeautifulSoup - How to return two or more different elements with different attributes?

HTML Example
<html>
<div book="blue" return="abc">
<h4 class="link">www.example.com</h4>
<p class="author">RODRIGO</p>
</html>
Ex1:
import urllib.request
from bs4 import BeautifulSoup as soup  # 'soup' is used as an alias for the constructor below

url = urllib.request.urlopen(url)
page_soup = soup(url.read(), "html.parser")
res=page_soup.find_all(attrs={"class": ["author","link"]})
for each in res:
    print(each)
Result1:
www.example.com
RODRIGO
Ex2:
url = urllib.request.urlopen(url)
page_soup = soup(url.read(), "html.parser")
res=page_soup.find_all(attrs={"book": ["blue"]})
for each in res:
print(each["return")
Result 2:
abc
!!!puzzle!!!
The question I have is: how do I return the 3 results in a single query?
Result 3
www.example.com
RODRIGO
abc
The example HTML seems to be broken. Assuming the div wraps the other tags and may not be the only book, you can select all books:
for e in soup.find_all(attrs={"book": ["blue"]}):
    print(' '.join(e.stripped_strings), e.get('return'))
Example
from bs4 import BeautifulSoup
html = '''
<html>
<div book="blue" return="abc">
<h4 class="link">www.rodrigo.com</h4>
<p class="author">RODRIGO</p>
</html>
'''
soup = BeautifulSoup(html)
for e in soup.find_all(attrs={"book": ["blue"]}):
    print(' '.join(e.stripped_strings), e.get('return'))
Output
www.rodrigo.com RODRIGO abc
A more structured example could be:
data = []
for e in soup.select('[book="blue"]'):
    data.append({
        'link': e.h4.text,
        'author': e.select_one('.author').text,
        'return': e.get('return')
    })
data
Output:
[{'link': 'www.rodrigo.com', 'author': 'RODRIGO', 'return': 'abc'}]
For the case of matching one attribute against many values, a regex approach is suggested:
from bs4 import BeautifulSoup
import re
html = """<html>
<div book="blue" return="abc">
<h4 class="link">www.rodrigo.com</h4>
<p class="author">RODRIGO</p>
</html>"""
soup = BeautifulSoup(html, 'lxml')
by_clss = soup.find_all(class_=re.compile(r'link|author'))
print(by_clss)
For more flexibility, a custom query function can be passed to find or find_all:
from bs4 import BeautifulSoup

html = """<html>
<div book="blue" return="abc"></div> <!-- div needs a closing tag in an html doc -->
<h4 class="link">www.rodrigo.com</h4>
<p class="author">RODRIGO</p>
</html>"""

def query(tag):
    if tag.has_attr('class'):
        # tag['class'] is a list; check it only contains allowed classes
        return set(tag['class']) <= {'link', 'author'}
    if tag.has_attr('book'):
        return tag['book'] in {'blue'}
    return False

soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(query))
# [<div book="blue" return="abc"></div>, <h4 class="link">www.rodrigo.com</h4>, <p class="author">RODRIGO</p>]
Notice that your HTML sample has no closing div tag. In my second example I added it, otherwise the soup... will not taste good.
EDIT
To retrieve elements that match conditions on several attributes at once, the query could look like this:
def query_by_attrs(**tag_kwargs):
    # tag_kwargs: {attr: [val1, val2], ...}
    def wrapper(tag):
        for attr, values in tag_kwargs.items():
            if tag.has_attr(attr):
                # check if tag has multi-valued attributes (class, ...)
                if not isinstance((tag_attr:=tag[attr]), list):  # := for python >=3.8
                    tag_attr = (tag_attr,)  # as tuple
                return bool(set(tag_attr).intersection(values))  # false if empty set
    return wrapper

q_data = {'class': ['link', 'author'], 'book': ['blue']}
results = soup.find_all(query_by_attrs(**q_data))
print(results)
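With the sample HTML above, this returns the same three elements as the query function in the previous example: the div with book="blue", the h4 with class link and the p with class author.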
Extract all links from a website
import requests
from bs4 import BeautifulSoup
url = 'https://mixkit.co/free-stock-music/hip-hop/'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
urls = []
for link in soup.find_all('a'):
    href = link.get('href')
    urls.append(href)
    print(href)
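The hrefs collected this way can be relative; as a small follow-up (not part of the original snippet), urllib.parse.urljoin can resolve them against the page URL:

from urllib.parse import urljoin

# skip None/empty hrefs and make the links absolute
absolute_urls = [urljoin(url, h) for h in urls if h]
print(absolute_urls[:10])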

Trying to find all <a> elements without a *specific* class

I'm trying web scraping for the first time and I'm using BeautifulSoup to gather bits of information from a website. I'm trying to get all the elements that have one class but not another. For example:
from bs4 import BeautifulSoup
html = """
<a class="something">Information I want</a>
<a class="something somethingelse">Information I don't want</a>
"""
soup = BeautifulSoup(html)
In this example, I want to get all the elements with the something class. However, when finding all elements containing that class I also get the element containing the somethingelse class, and I do not want these.
The code I'm using to get it is:
results = soup.find_all("a", {"class": "something"})
Any help is appreciated! Thanks.
This will work fine:
from bs4 import BeautifulSoup
text = '''<a class="something">Information I want</a>
<a class="something somethingelse">Information I don't want</a>'''
soup = BeautifulSoup(text, 'html.parser')
r1 = soup.find_all("a", {"class": "something"})
r2 = soup.find_all("a", {"class": "somethingelse"})
for item in r2:
    if item in r1:
        r1.remove(item)
print(r1)
Output
[<a class="something">Information I want</a>]
For extracting the text present in the tags, just add these lines:
for item in r1:
    print(item.text)
Output
Information I want
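A CSS-selector alternative not shown in this answer (a small sketch against the same HTML): an attribute selector compares the literal value of the class attribute, so it only matches tags whose class is exactly "something":

exact = soup.select('a[class="something"]')  # does not match class="something somethingelse"
print(exact)
# [<a class="something">Information I want</a>]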
For this task, you can find elements using a lambda function, for example:
from bs4 import BeautifulSoup
html_doc = """<a class="something">Information I want</a>
<a class="something somethingelse">Information I don't want</a>
"""
soup = BeautifulSoup(html_doc, "html.parser")
a = soup.find(
    lambda tag: tag.name == "a" and tag.get("class", []) == ["something"]
)
print(a)
Prints:
<a class="something">Information I want</a>
Or: specify "class" as a list:
a = soup.find("a", {"class": ["something"]})
print(a)
Prints:
<a class="something">Information I want</a>
EDIT:
For filtering type-icon type-X:
from bs4 import BeautifulSoup
html_doc = """
<a class="type-icon type-1">Information I want 1</a>
<a class="type-icon type-1 type-cell type-abbr">Information I don't want</a>
<a class="type-icon type-2">Information I want 2</a>
<a class="type-icon type-2 type-cell type-abbr">Information I don't want</a>
"""
soup = BeautifulSoup(html_doc, "html.parser")
my_types = ["type-icon", "type-1", "type-2"]
def my_filter(tag):
    if tag.name != "a":
        return False
    c = tag.get("class", [])
    return "type-icon" in c and not set(c).difference(my_types)
a = soup.find_all(my_filter)
print(a)
Prints:
[<a class="type-icon type-1">Information I want 1</a>, <a class="type-icon type-2">Information I want 2</a>]
Or extract tags you don't want first:
soup = BeautifulSoup(html_doc, "html.parser")
# extract tags I don't want:
for t in soup.select(".type-cell.type-abbr"):
    t.extract()
print(soup.select(".type-icon.type-1, .type-icon.type-2"))
Prints:
[<a class="type-icon type-1">Information I want 1</a>, <a class="type-icon type-2">Information I want 2</a>]

Trying to get content of span in Python using BeautifulSoup

from bs4 import BeautifulSoup
url = 'C:\\Users\\Zandrio\\Documents\\Python-Selexion\\HTML-localhost\\Selexion.html'
page = open(url)
soup = BeautifulSoup(page.read(), features="lxml")
prettify = soup.prettify()
Model = "".join([div.text for div in soup.find_all('div', {'class' : 'title-options'})])
print(Model)
Output:
PS C:\Users\Zandrio> & C:/Users/Zandrio/AppData/Local/Programs/Python/Python38/python.exe c:/Users/Zandrio/Documents/Requests/selexion.py
SQQE55Q90R
Merk:
Samsung Afdrukken
HTML:
<div class="title-options">
<span>
SQQE55Q90R
</span>
<span>
Merk: Samsung
</span>
<span class="print"> Afdrukken
</span>
</div>
I just want the model number, which in this case is SQQE55Q90R. Please suggest a solution.
from bs4 import BeautifulSoup
url = 'C:\\Users\\Zandrio\\Documents\\Python-Selexion\\HTML-localhost\\Selexion.html'
page = open(url)
soup = BeautifulSoup(page.read(), features="lxml")
div = soup.body.find('div', attrs={'class': 'title-options'})
model_number = div.span.text.strip() # text of first span
print(model_number)
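An equivalent one-liner (a sketch, not part of the original answer) using a CSS selector for the first span inside the title div:

model_number = soup.select_one('div.title-options span').get_text(strip=True)
print(model_number)  # SQQE55Q90R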

Can't access a tweet id with beautiful soup

My goal is to retrieve the ids of tweets in a twitter search as they are being posted. My code so far looks like this:
import requests
from bs4 import BeautifulSoup
keys = some_key_words + " -filter:retweets AND -filter:replies"
query = "https://twitter.com/search?f=tweets&vertical=default&q=" + keys + "&src=typd&lang=es"
req = requests.get(query).text
soup = BeautifulSoup(req, "lxml")
for tweets in soup.findAll("li",{"class":"js-stream-item stream-item stream-item"}):
    print(tweets)
However, this doesn't return anything. Is there a problem with the code itself, or am I looking in the wrong place in the source code? I understand that the ids should be stored here:
<div class="stream">
<ol class="stream-items js-navigable-stream" id="stream-items-id">
<li class="js-stream-item stream-item stream-item" **data-item-id**="1210306781806833664" id="stream-item-tweet-1210306781806833664" data-item-type="tweet">
from bs4 import BeautifulSoup
data = """
<div class="stream">
<ol class="stream-items js-navigable-stream" id="stream-items-id">
<li class="js-stream-item stream-item stream-item
" **data-item-id**="1210306781806833664"
id="stream-item-tweet-1210306781806833664"
data-item-type="tweet"
>
...
"""
soup = BeautifulSoup(data, 'html.parser')
for item in soup.findAll("li", {'class': 'js-stream-item stream-item stream-item'}):
    print(item.get("data-item-id"))
Output:
1210306781806833664

How do I extract text after <i class> tag?

I am trying to print out the text 'Dealer' from a div class using BeautifulSoup, but I do not know how to extract it.
I tried to print the i class, but the text 'Dealer' did not come out.
import requests
from bs4 import BeautifulSoup

url = 'https://www.carlist.my/used-cars-for-sale/proton/malaysia'
response = requests.get(url, params={'page_number': 1})
soup = BeautifulSoup(response.text, 'lxml')
articles = soup.find_all('article')[:25]
seller_type = articles[4].find('div', class_ = 'item push-quarter--ends listing__spec--dealer')
seller_type_text = articles[4].find('i', class_ = 'icon icon--secondary muted valign--top push-quarter--right icon--user-formal')
print(seller_type.prettify())
print()
print(seller_type_text)
This is the output that I got:
<div class="item push-quarter--ends listing__spec--dealer">
<i class="icon icon--secondary muted valign--top push-quarter--right icon--user-formal">
</i>
Dealer
<span class="flyout listing__badge listing__badge--trusted-seller inline--block valign--top push-quarter--left">
<i class="icon icon--thumb-up">
</i>
<span class="flyout__content flyout__content--tip visuallyhidden--portable">
This 'Trusted Dealer' has a proven track record of upholding the best car selling practices certified by Carlist.my
</span>
</span>
<!-- used car -->
<!-- BMW -->
</div>
<i class="icon icon--secondary muted valign--top push-quarter--right icon--user-formal"></i>
How do I print the word 'Dealer' right after i class and before the span class?
Can someone please help me?
Thanks a lot!
There is a faster way: use one of the compound class names of the i tag along with next_sibling.
If you examine the HTML you can see that "Dealer" is part of the parent div of the i tag and follows the i tag; so you can grab the i tag and then use next_sibling:
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://www.carlist.my/used-cars-for-sale/proton/malaysia')
soup = bs(r.content, 'lxml')
print(soup.select_one('.icon--user-formal').next_sibling)
Take a look at the contents property of your seller_type. You'll see that Dealer is at seller_type.contents[2]. In other words,
import requests
from bs4 import BeautifulSoup
url = 'https://www.carlist.my/used-cars-for-sale/proton/malaysia?profile_type=Dealer'
response = requests.get(url, params={'page_number': 1})
soup = BeautifulSoup(response.text, 'lxml')
articles = soup.find_all('article')[:25]
seller_type = articles[4].find('div', class_ = 'item push-quarter--ends listing__spec--dealer')
print(seller_type.contents[2])
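Note that the extracted string keeps the surrounding whitespace from the HTML; stripping it (a small addition, not in the original answers) leaves just the word:

print(seller_type.contents[2].strip())  # Dealer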
