Remove specific result for requests_html - python

import datetime
from requests_html import HTMLSession
# Fetch the Apple Music "top-100-hong-kong" playlist page and print every
# <span> with the svelte-vyyb4r class found inside the selected container.
session = HTMLSession()
url = 'https://music.apple.com/us/playlist/top-100-hong-kong/pl.7f35cffa10b54b91aab128ccc547f6ef'
applemusic = session.get(url)
# render() runs the page's JavaScript before the HTML is queried.
applemusic.html.render(sleep=1, scrolldown=1)
# XPath attribute tests use "@" (the attribute axis): [@id="..."], not [#id="..."].
data = applemusic.html.xpath('//*[@id="scrollable-page"]/main/div/div[2]', first=True)
artist_list = data.find('span.svelte-vyyb4r')
for artist in artist_list:
    print(artist)
Hi guys, I am new to Python. I want to write a small function that scrapes information from an Apple Music playlist, but there is one row in the result that I want to remove from the output (the row marked with asterisks in the output below). How can I do that? I know this may be a simple question, but I would really appreciate any assistance.
<Element 'span' class=('svelte-vyyb4r',)>
<Element 'span' class=('svelte-vyyb4r',)>
*<Element 'span' class=('songs-list-row__badge', 'songs-list-row__badge--explicit', 'svelte-vyyb4r')>*
<Element 'span' class=('svelte-vyyb4r',)>
<Element 'span' class=('svelte-vyyb4r',)>
I have tried the remove() function, but it does not seem to work:
for artist in artist_list.remove("songs-list-row__badge"):
print(artist)
Output
ValueError: list.remove(x): x not in list

You can change your CSS selector to only get the span elements that have only svelte-vyyb4r as class:
# XPath attribute tests use "@" (the attribute axis): [@id="..."], not [#id="..."].
data = applemusic.html.xpath('//*[@id="scrollable-page"]/main/div/div[2]', first=True)
# [class="..."] compares the whole class attribute, so spans that carry
# additional classes (such as the explicit badge) are not selected.
artist_list = data.find('span[class="svelte-vyyb4r"]')
you can also loop through your list and check if it has the class you don't want and ignore it
# Iterate the list itself: list.remove() needs an argument and returns None,
# so it cannot be used as the loop's iterable.
for artist in artist_list:
    if "songs-list-row__badge" in artist.attrs["class"]:
        # skip element
        continue
    print(artist)

Related

How can I extract a specific item attribute from an ebay listing using BeautifulSoup?

def get_data(url):
    """Download *url* with requests and return it parsed by BeautifulSoup.

    Args:
        url: Address of the page to fetch.

    Returns:
        A BeautifulSoup object built from the response text with the
        'html.parser' parser.
    """
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html.parser')
current_data = get_data(link)
x = current_data.find_all(text="Style Code:")
I'm trying to get the style code of a shoe off ebay but the problem is that it doesn't have a specific class or any kind of unique identifier so I can't just use find() to get the data. Currently I searched by text to find 'Style Code:' but how can I get to the next div? An example of a shoe product page would be this.
soup.select_one('span.ux-textspans:-soup-contains("Style Code:")').find_next('span').get_text(strip=True)
Try this,
spans = soup.find_all('span', attrs={'class': 'ux-textspans'})
style_code = None
# Pair every span with the one that follows it; zip() stops one short of the
# end, so a trailing 'Style Code:' label cannot cause an IndexError.
for label_span, value_span in zip(spans, spans[1:]):
    if label_span.text == 'Style Code:':
        style_code = value_span.text
        break
print(style_code)
# 554724-371
Since there are lots of similar spans (all with the class 'ux-textspans'), you need to iterate through them and take the span that comes right after 'Style Code:'.

How do you get a text from a span tag using BeautifulSoup when there's no clear identification?

enter image description here
I am trying to extract the value from this span tag for Year Built using BeautifulSoup and the following code below, but I'm not getting the actual Year. Please help. Thanks :)
enter image description here
results = []
for url in All_product[:2]:
    link = url
    html = getAndParseURL(url)
    # Takes the fifth span (index 4) whose class attribute is exactly this
    # string; the class value must stay on one line to be a valid literal.
    YearBuilt = html.findAll(
        "span", {"class": "header font-color-gray-light inline-block"}
    )[4]
    results.append([YearBuilt])
The output shows
[[<span class="header font-color-gray-light inline-block">Year Built</span>],
[<span class="header font-color-gray-light inline-block">Community</span>]]
Try using the .next_sibling:
result = []
# A class value containing spaces is compared against the full class
# attribute string, so it has to match the markup exactly (no stray space
# inside "font-color-gray-light").
year_built = html.find_all(
    "span", {"class": "header font-color-gray-light inline-block"}
)
for elem in year_built:
    if elem.text.strip() == 'Year Built':
        # The value is expected in the node right after the label span.
        result.append(elem.next_sibling)
I'm not sure how the whole HTML looks, but something along these lines might help.
Note: There is surely a more specific solution for extracting all the attributes you may need for your results, but for that you should improve your question and add more details.
Using CSS selectors you can simply chain / combine your selection to be more strict. In this case you select the <span> that contains your string and use the adjacent sibling combinator (+) to get the next sibling <span>.
YearBuilt = e.text if (e := html.select_one('span.header:-soup-contains("Year Built") + span')) else None
It also avoids AttributeError: 'NoneType' object has no attribute 'text': if the element is not available, the expression checks that it exists before reading its text attribute and yields None instead.
soup = BeautifulSoup(html_doc, "html.parser")
results = []
for url in All_product[:2]:
    link = url
    html = getAndParseURL(url)
    # select_one() returns None when nothing matches, so the walrus check
    # yields None instead of raising AttributeError on a missing element.
    YearBuilt = e.text if (e := html.select_one('span.header:-soup-contains("Year Built") + span')) else None
    results.append([YearBuilt])

Python Selenium get text out of a class

Hey I want to have the text which is in this class and print it in the console.
How can I do this?
I have tried:
profileName = driver.find_element_by_class_name("._7UhW9.fKFbl.yUEEX.KV-D4.fDxYl").text()
print(profileName)
I would be very happy if you could help me.
Tom Rudolph
It is not text() in Python; it is .text.
Also, ._7UhW9.fKFbl.yUEEX.KV-D4.fDxYl looks dynamically generated, and it is a CSS selector, not a class name.
Code 1 :
profileName = driver.find_element_by_css_selector("._7UhW9.fKFbl.yUEEX.KV-D4.fDxYl").text
print(profileName)
Code 2 :
profileName = driver.find_element_by_css_selector("._7UhW9.fKFbl.yUEEX.KV-D4.fDxYl").get_attribute('innerHTML')
print(profileName)
In case that element has all these class names you just have to change from find_element_by_class_name to find_element_by_css_selector, as following:
# WebElement.text is a property, so it is read without parentheses;
# calling it (.text()) would try to call the returned string.
profileName = driver.find_element_by_css_selector("._7UhW9.fKFbl.yUEEX.KV-D4.fDxYl").text
print(profileName)
However I'm quite sure these class names are dynamically changing so this will not work.
Also, make sure you added some delay / wait before accessing the element.
Maybe try this; it uses the BeautifulSoup library.
From the beginning:
import requests
from bs4 import BeautifulSoup
url = 'my_site'
r = requests.get(url)
re1 = BeautifulSoup(r.text, "lxml")
# The dotted string is a CSS selector (one ".name" per class), so it is
# passed to select(); as a plain class value it would never match.
re2 = re1.body.select('div._7UhW9.fKFbl.yUEEX.KV-D4.fDxYl')
for syntax in re2:
    print(syntax)
where there is div you can put another tag where this class is (a, p, h)
where there is div you can put another tag where this class is (a, p, h)
And in Selenium, use 'elements' instead of 'element': when harvesting, it is better to collect everything :D
But when you want to click an element or type something (click() or send_keys()), you have to use the singular form, 'element', not 'elements'.
This is what I do with the Selenium webdriver:
data1 = driver.find_elements_by_class_name('class')
data2 = driver.find_elements_by_css_selector('css_selector')
# The find_elements_* calls return a list, so an empty result is simply falsy.
if data1:
    pass  # do something
if data2:
    pass  # do something
you can always add an else if you want to see how the error pops up, print (len (data1)) or data2 to see how much data you have collected etc.

Finding specific tag using BeautifulSoup

Here is the website that i'm parsing: http://uniapple.net/usaddress/address.php?address1=501+10th+ave&address2=&city=nyc&state=ny&zipcode=10036&country=US
I would like to be able to find the word that will be in line 39 between the td tags. That line tells me if the address is residential or commercial, which is what I need for my script.
Here's what I have, but i'm getting this error:
AttributeError: 'NoneType' object has no attribute 'find_next'
The code I'm using is:
from bs4 import BeautifulSoup
import urllib
page = "http://uniapple.net/usaddress/address.php?address1=501+10th+ave&address2=&city=nyc&state=ny&zipcode=10036&country=US"
z = urllib.urlopen(page).read()
thesoup = BeautifulSoup(z, "html.parser")
comres = (thesoup.find("th",text=" Residential or ").find_next("td").text)
print(str(comres))
The text argument would not work in this particular case. This is related to how the .string property of an element is calculated. Instead, I would use a search function where you can actually call get_text() and check the complete "text" of an element including the children nodes:
label = thesoup.find(lambda tag: tag and tag.name == "th" and \
"Residential" in tag.get_text())
comres = label.find_next("td").get_text()
print(str(comres))
Prints Commercial.
We can go a little bit further and make a reusable function to get a value by label:
soup = BeautifulSoup(z, "html.parser")
def get_value_by_label(soup, label):
    """Return the text of the <td> that follows the <th> containing *label*.

    Args:
        soup: Parsed document; only its ``find`` method is used, called with
            a predicate function.
        label: Substring to look for in the header cell's complete text.

    Returns:
        The whitespace-stripped text of the first ``td`` found after the
        matching ``th``, or ``None`` when no header cell contains *label* or
        no ``td`` follows it.
    """
    # get_text() covers the text of child nodes too, so the label is still
    # found when the header cell contains nested tags.
    header = soup.find(
        lambda tag: tag and tag.name == "th" and label in tag.get_text()
    )
    if header is None:
        return None
    cell = header.find_next("td")
    if cell is None:
        return None
    return cell.get_text(strip=True)
print(get_value_by_label(soup, "Residential"))
print(get_value_by_label(soup, "City"))
Prints:
Commercial
NYC
All you are missing is a bit of housekeeping:
comres = None  # stays None when no header cell mentions 'Residential or'
ths = thesoup.find_all("th")
for th in ths:
    # Substring test on the cell's full text, so surrounding whitespace or
    # extra words in the header do not prevent the match.
    if 'Residential or' in th.text:
        comres = th.find_next("td").text
print(str(comres))
>> Commercial
You'll need to use a regular expression as your text field, like re.compile('Residential or'), rather than a string.
This was working for me. I had to iterate over the results provided, though if you only expect a single result per page you could use find instead of find_all:
import re

# A compiled pattern matches any text node that contains the phrase, whereas
# a plain string must equal the node's text exactly.
for r in thesoup.find_all(text=re.compile('Residential or')):
    # Print the value; a bare expression is only echoed in an interactive shell.
    print(r.find_next('td').text)

Extending selection with BeautifulSoup

I am trying to get BeautifulSoup to do the following.
I have HTML files which I wish to modify. I am interested in two tags in particular, one which I will call TagA is
<div class ="A">...</div>
and one which I will call TagB
<p class = "B">...</p>
Both tags occur independently throughout the HTML and may themselves contain other tags and be nested inside other tags.
I want to place a marker tag around every TagA whenever it is not immediately followed by TagB so that
<p class="A"">...</p> becomes <marker><p class="A">...</p></marker>
But when TagA is followed immediately by TagB, I want the marker Tag to surround them both
so that
<p class="A">...</p><div class="B">...</div>
becomes
<marker><p class="A">...</p><div class="B">...</div></marker>
I can see how to select TagA and enclose it with the marker tag, but when it is followed by TagB I do not know if or how the BeautifulSoup 'selection' can be extended to include the NextSibling.
Any help appreciated.
BeautifulSoup does have a "next sibling" attribute: find all tags of class A and use a.next_sibling to check whether it is B.
look at the docs:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/#going-sideways
I think I was going about this the wrong way by trying to extend the 'selection' from one tag to the following one. Instead, I found that the following code, which inserts the outer 'Marker' tag and then moves the A and B tags into it, does the trick.
I am pretty new to Python so would appreciate advice regarding improvements or snags with the following.
def isTagB(tag):
    """Return True when *tag* is a ``<p class="B">`` element, else False.

    Anything without a tag name of ``'p'`` (plain strings such as the
    whitespace between tags, or ``None``) yields False.
    """
    # getattr with a default replaces the former bare except: objects that
    # have no ``name`` attribute are simply not TagB.
    if getattr(tag, 'name', None) != 'p':
        return False
    classes = tag.get('class') or []
    # NOTE(review): the class value is expected to be a list of names; a
    # plain string is split on whitespace so both forms are handled.
    if isinstance(classes, str):
        classes = classes.split()
    return 'B' in classes
from bs4 import BeautifulSoup
# Name the parser explicitly so the result does not depend on which parser
# libraries happen to be installed.
soup = BeautifulSoup("""<div class = "A"><p><i>more content</i></p></div><div class = "A"><p><i>hello content</i></p></div><p class="B">da <i>de</i> da </p><div class = "fred">not content</div>""", "html.parser")
for TagA in soup.find_all("div", "A"):
    # Look up the following node before TagA is moved into its wrapper.
    nexttag = TagA.next_sibling
    # skip over whitespace-only strings between the tags
    while str(nexttag).isspace():
        nexttag = nexttag.next_sibling
    # wrap() puts the new <Marker> where TagA was and moves TagA inside it.
    Marker = TagA.wrap(soup.new_tag('Marker'))
    if isTagB(nexttag):
        # append() moves TagB from its old position to right after TagA.
        Marker.append(nexttag)
print(soup)
import urllib
from BeautifulSoup import BeautifulSoup
html = urllib.urlopen("http://ursite.com") #gives html response
soup = BeautifulSoup(html)
all_div = soup.findAll("div",attrs={}) #use attrs as dict for attribute parsing
#exa- attrs={'class':"class","id":"1234"}
single_div = all_div[0]
#to find p tag inside single_div
p_tag_obj = single_div.find("p")
You can use obj.findNext(), obj.findAllNext(), obj.findAllPrevious(), obj.findPrevious();
to get an attribute you can use obj.get("href"), obj.get("title"), etc.

Categories