I'm trying to scrap Airbnb, indeed only three simples informations: description, city and price of each apartment in a country. However it is not working. All the time it apears the AttributeError: "ResultSet object has no attribute 'get_text'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"
How can I scrap these data properly?
Here is my code:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
'Accept-Language':'en-GB, en; q=0.5',
'Referer':'https://google.com',
'DNT':'1'}
url = 'https://www.airbnb.com.br/s/Italia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&query=Italia&place_id=ChIJA9KNRIL-1BIRb15jJFz1LOI&date_picker_type=calendar&checkin=2023-03-09&checkout=2023-04-09&adults=1&source=structured_search_input_header&search_type=autocomplete_click'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
features_dict = {}
descp = soup.find_all("div", {"class": "t1jojoys dir dir-ltr"}).get_text()
city = soup.find_all("div", {"class": "nquyp1l s1cjsi4j dir dir-ltr"}).get_text()
price = soup.find_all("div", {"class": "phbjkf1 dir dir-ltr"}).get_text()
features_dict['descrição'] = descp
features_dict['cidade'] = city
features_dict['preço'] = price
Well, I erased the ".get_text()" and the above error did not apper anymore, however all my lists of html elements are empty. When I to a brief check to see whether all html classes are there, I discover that the classes which I'm interested they don't appear.
class_list = set()
tags = {tag.name for tag in soup.find_all()}
for tag in tags:
for i in soup.find_all(tag):
if i.has_attr("class"):
if len(i['class']) != 0:
class_list.add(" ".join(i['class']))
classes = list(class_list)
What am I doing wrong?
Related
I'm trying to get all the category, sub category and sub sub category and so on of authors URL from dmoz website using BeautifulSoup.
I'm getting the following output:
# Missing the every 2nd option/URL in first step
/Arts/Literature/Authors/A
/Arts/Literature/Authors/C
/Arts/Literature/Authors/E
/Arts/Literature/Authors/G
/Arts/Literature/Authors/Horror
. . .
# Missing the every 1st option/URL in second step
/Arts/Literature/Authors/A/Abbey,_Lynn
/Top/Arts/Literature/Authors/A/Abe,_Kobo
In the above code 2nd element is missing in 1st step and 1st element in 2nd step.
Here is my code:
scrape_url = "http://dmoz.org/Arts/Literature/Authors"
page = session.get(scrape_url, headers={
"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
})
soup = bs(page.text, 'html.parser')
find_row = soup.find_all('div', attrs = {'class':'row'})[6:7]
# get all the root category author list
for test in find_row:
if test.find('div', attrs = {'class':'panel-body'}):
test_link = test.find_all('a')
for link in test_link:
sub_cat.append(link['href'])
# now get the sub or sub-sub category author URL list
for cat in sub_cat:
scrape_cat_url = "http://dmoz.org%s" % (cat)
print('scraping...', scrape_cat_url)
page = session.get(scrape_cat_url, headers={
"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
})
soup = bs(page.text, 'html.parser')
find_row = soup.find_all('div', attrs = {'class':'row'})[6:7]
# if sub category go next level or restart
for row in find_row:
if row.find('div', attrs = {'class':'panel-body'}):
test_link = row.find_all('a')
for link in test_link:
sub_cat.append(link['href'])
records.append(scrape_cat_url)
else:
records.append(scrape_cat_url)
# remove the category url from the sub_cat list
sub_cat.remove(cat)
Can anybody suggest a better way to get all the category, sub category and sub sub category URL of authors?
Try this streamlined version of your code:
from bs4 import BeautifulSoup
headers = {"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
scrape_url = "http://dmozlive.com/Top/Arts/Literature/Authors"
page = requests.get(scrape_url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
find_rows = soup.find_all('div', attrs = {'class':'row'})[6:7]
cats = []
for row in find_rows:
links = row.find_all('a')
for link in links:
cats.append(link['href'])
cats
Print out:
['/Top/Arts/Literature/Authors/A',
'/Top/Arts/Literature/Authors/B',
'/Top/Arts/Literature/Authors/C',
'/Top/Arts/Literature/Authors/D',
'/Top/Arts/Literature/Authors/E',
'/Top/Arts/Literature/Authors/F',
…
Now get the subcategories:
sub_cats = []
for cat in cats:
scrape_url = f"http://dmozlive.com{cat}"
page = requests.get(scrape_url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
find_rows = soup.find_all('div', attrs = {'class':'row'})[6:7]
for row in find_rows:
links = row.find_all('a')
for link in links:
sub_cats.append(link['href'])
subcats
Print out:
['/Top/Arts/Literature/Authors/A/Abbey,_Edward',
'/Top/Arts/Literature/Authors/A/Abbey,_Lynn',
'/Top/Arts/Literature/Authors/A/Abbott,_Edwin_A.',
'/Top/Arts/Literature/Authors/A/Abe,_Kobo',
'/Top/Arts/Literature/Authors/A/Achebe,_Chinua',
'/Top/Arts/Literature/Authors/A/Ackroyd,_Peter',
'/Top/Arts/Literature/Authors/A/Adams,_Douglas',
…
The following code may meet your expectation that's pull all the categories and sub-categories urls.
import requests
from bs4 import BeautifulSoup
url= 'http://dmozlive.com/Top/Arts/Literature/Authors'
headers= {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"}
req=requests.get(url,headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')
for cat_url in soup.select('.list-group.col-md-6 a'):
cat_url = 'http://dmozlive.com' + cat_url.get('href')
#print(cat_url)
req2=requests.get(cat_url,headers=headers)
soup2 = BeautifulSoup(req2.text, 'html.parser')
for author_url in soup2.select('.list-group-item'):
author_url= 'http://dmozlive.com' + str(author_url.get('href'))
print(author_url)
I am trying to split the links of the images
what is wrong in my code
mainURL = "https://w.cima4u.ws/category/%d8%a7%d9%81%d9%84%d8%a7%d9%85-
%d9%83%d8%b1%d8%aa%d9%88%d9%86-movies-anime/"
headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like
Gecko) Chrome/96.0.4664.93 Safari/537.36"}
s = requests.Session()
r = s.get(mainURL)
soup = BeautifulSoup(r.content, "html.parser")
for movie in soup.findAll('li', {'class':'MovieBlock'}):
movieLink = movie.find('a')
imageLink = movie.find('div', {'class':'Half1'})
imageLink = (['style'])
imageLink = imageLink.split("url(")[1][:-2]
print(imageLink)
since you didn't added the full stack trace, i suppose the error originating in this line
imageLink = imageLink.split("url(")[1][:-2]
split cannot be executed on a list, but on a string. in this case, imageLink is a list.
doc
I am having some problems with extracting tags from a websites:
r = req.get(web+"?pg=news&tf=G&page={}/".format(num))
soup = BeautifulSoup(r.content, 'html.parser')
results = [
(
x.select_one("h3.d-flex").text,
x.select_one("div.i").text,
x.select_one("div.a").a.text,
x.select_one("div.entry-content").p.text,
) for x in soup.findAll("section")
]
I need to scrape relevant information such as headlines, preview of content, date and link.
When I print the above tags, I get empty lists. Since I have no a lot of experience in selecting tags and I am not sure about classes I selected above, I would ask you if you could have a look and tell me which one(s) is wrong.
I hope this code help you. assume url http://gentedellarete.it/?pg=news&tf=G&page=1
import requests
from bs4 import BeautifulSoup
URL = "https://www.centrepointstores.com/sa/en/Women/Fashion-Accessories/Watches/CENTREPOINT-Citizen-Women%27s-Rose-Gold-Analog-Metal-Strap-Watch-EU-6039-86A/p/EU603986AGold"
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}
r = requests.get("http://www.gentedellarete.it/?pg=news&tf=G&page={}/".format(1), headers=HEADERS)
soup = BeautifulSoup(r.content, 'html.parser')
for x in soup.findAll('div', {'class',"text py-5 pl-md-5"}):
print('\n',x.select_one("div > a:nth-child(2) h3").text, sep='\n') #heading ok
print('\n', x.select_one('p').text) #under h3 ok
print('\n', x.select('p')[1].text) # body ok
print('\n', x.select('p')[1].text.split('(')[1].strip(')')) # date ok?
Any idea how can i retrieve the price (now 2917.99) from this source code view-https://www.emag.ro/televizor-led-smart-samsung-138-cm-55ru7402-4k-ultra-hd-ue55ru7402uxxh/pd/DTN2XZBBM/
If I call the class p.product-new-price i get None.
I have managed to get the title, but not the price.
What I have done so far:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.emag.ro/televizor-led-smart-samsung-138-cm-55ru7402-4k-ultra-hd-ue55ru7402uxxh/pd/DTN2XZBBM/'
headers = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
page = requests.get(URL, headers = headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find('title')
div = soup.find('div', {"class" : 'product-new-price'})
text = div.string
print(text)
The class looks like below and I want to extract the 2917 as int.
div class="product-highlight product-page-pricing"
p class="product-new-price"
2.917<sup>99</sup> <span>Lei</span>
Thank you very much!
Ok, with minor modifications:
It seems that the class product-new-price is on the p element for me!
I am assuming there will always be a <sup> tag after the main price
import requests
from bs4 import BeautifulSoup
URL = 'https://www.emag.ro/televizor-led-smart-samsung-138-cm-55ru7402-4k-ultra-hd-ue55ru7402uxxh/pd/DTN2XZBBM/'
headers = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
page = requests.get(URL, headers = headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find('title')
p = soup.find('p', {"class" : 'product-new-price'})
# Get the text before <sup> tag
value = p.find('sup').previousSibling.strip()
print("Value: {}".format(value))
# Keep only numbers
value = ''.join(c for c in value if c.isdigit())
price = int(value)
print("Price: {}".format(price))
The above prints:
$ python3 ./test.py
Value: 2.917
Price: 2917
Now, with small changes you can also add the missing .99 if this is required
New to programming and web scraping and having some trouble getting BeautifulSoup to pull only the text from a given page.
Here's what I'm working with right now:
import requests
from bs4 import BeautifulSoup
url = 'https://www.tsn.ca/panarin-tops-2019-free-agent-frenzy-class-1.1303592'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
players = soup.find_all('td').text
print(players)
Which returns the following:
Traceback (most recent call last):
File "tsn.py", line 10, in <module>
players = soup.find_all('td').text
File "/home/debian1/.local/lib/python3.5/site-packages/bs4/element.py", line 1620, in __getattr__
"ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
AttributeError: ResultSet object has no attribute 'text'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?
I have also seen .get_text() used in BS documentation but that returns the same error.
Your solution was correct. You get a list of values from the find_all() method. all you have to do is iterate it and get the required text. I have corrected the code and put it below.
import requests
from bs4 import BeautifulSoup
url = 'https://www.tsn.ca/panarin-tops-2019-free-agent-frenzy-class-1.1303592'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
# This is how you should have extracted the text from the ResultSet
players = [elem.text for elem in soup.find_all('td')]
print(players)
find_all() will return a list of all elements meeting your specifications. Even if only a single item, or no item is found it will return [item] or [] respectively. To get the text you will need to index to the item like:
players_list = soup.find_all('td')
for player in players_list:
print(player.text)
I use .getText() in my scripts, I'm not sure if .text works the same or not!
That error indicates that you should iterate over each item like this:
players = [item.text for item in soup.find_all('td')] # Iterate over every item and extract the text
print(players)
print("".join(players)) # If you want all the text in one string
Hope this helps!
This is a working script:
import requests
from bs4 import BeautifulSoup
url = 'https://www.tsn.ca/panarin-tops-2019-free-agent-frenzy-class-1.1303592'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
players = []
tbl = soup.find('table', attrs={'class':'stats-table-scrollable article-table'})
tbl_body = tbl.find('tbody')
rows = tbl_body.find_all('tr')
for row in rows:
columns = row.find_all('td')
columns = [c.text for c in columns]
players.append(columns[1])
print(players)
Result:
['Artemi Panarin', 'Erik Karlsson', 'Sergei Bobrovsky', 'Matt Duchene', 'Jeff Skinner', 'Anders Lee', 'Joe Pavelski', 'Brock Nelson', 'Tyler Myers', 'Mats Zuccarello', 'Alex Edler', 'Gustav Nyquist', 'Jordan Eberle', 'Micheal Ferland', 'Jake Gardiner', 'Ryan Dzingel', 'Kevin Hayes', 'Brett Connolly', 'Marcus Johansson', 'Braydon Coburn', 'Wayne Simmonds', 'Brandon Tanev', 'Joonas Donskoi', 'Colin Wilson', 'Ron Hainsey']