BeautifulSoup printing pagination along with names - python

Link: https://www.yelp.com/search?cflt=restaurants&find_loc=San+Francisco%2C+CA
I am scraping names from yelp.com, but it also prints the page numbers below them. How can I target only the names using BeautifulSoup? I am sharing a couple of screenshots. How can I target the name attribute shown in the inspect-element screenshot?
# Download the Yelp search page and print the text of every <a> element that
# matches the generated class string below.
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.yelp.com/search?cflt=restaurants&find_loc=San+Francisco%2C+CA"
yelp_r = requests.get(url)
yelp_soup = bs(yelp_r.text, "html.parser")
# print(yelp_soup.prettify())

link_classes = "lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE"
matching_anchors = yelp_soup.find_all("a", {"class": link_classes})
for anchor in matching_anchors:
    print(anchor.text)

# Print each restaurant name found inside the Yelp results <ul>, first as the
# anchor text and then as the anchor's "name" attribute.
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.yelp.com/search?cflt=restaurants&find_loc=San+Francisco%2C+CA"
yelp_r = requests.get(url)
yelp_soup = bs(yelp_r.text, "html.parser")

ul = yelp_soup.find('ul', {'class': 'lemon--ul__373c0__1_cxs undefined list__373c0__2G8oH'})
if ul is None:
    # find() returns None when nothing matches; stop with a clear message
    # instead of an AttributeError on ul.find_all below.
    raise SystemExit("Results list not found: the page markup or class names may have changed.")

for li in ul.find_all('li', {'class': 'lemon--li__373c0__1r9wz border-color--default__373c0__3-ifU'}):
    for a_tag in li.find_all("a", {'class': "lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE"}):
        print(a_tag.text)  # get the text
        print(a_tag.get('name'))  # get the name property of a tag
output
Boo Koo
Boo Koo
Fog Harbor Fish House
Fog Harbor Fish House
... some results remove
Gary Danko
Gary Danko
um.ma
um.ma
Note: I didn't investigate whether the class names are dynamic and change over time.

I would check the contents of each tag and print it only if it does not contain the keyword 'Page:'.
# Print the first child of every matching anchor, skipping those whose first
# child contains the text 'Page:'.
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.yelp.com/search?cflt=restaurants&find_loc=San+Francisco%2C+CA"
yelp_r = requests.get(url)
yelp_soup = bs(yelp_r.text, "html.parser")
#print(yelp_soup.prettify())

for name in yelp_soup.find_all("a", {"class": "lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE"}):
    # An anchor without children has an empty .contents list; skip it rather
    # than raise IndexError on contents[0].
    if not name.contents:
        continue
    first_child = name.contents[0]
    if 'Page:' not in str(first_child):
        print(first_child)
Result:
San Francisco, CA
Restaurants
Boo Koo
...
Palm House
Anchor Oyster Bar

Related

Conditions in loop to ensure python only scrapes single div

While attempting to scrape this website: https://dining.umich.edu/menus-locations/dining-halls/mosher-jordan/ I have located the food item names by doing the following:
# Print the text of every div with class "item-name" on the dining-hall page.
import requests
from bs4 import BeautifulSoup

url = "https://dining.umich.edu/menus-locations/dining-halls/mosher-jordan/"
# `headers` was used without being defined, and the second positional
# argument of requests.get() is `params`, not `headers`: define the dict and
# pass it by keyword so it is actually sent as request headers.
headers = {'User-Agent': 'Mozilla'}
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
foodLocation = soup.find_all('div', class_='item-name')
for singleFood in foodLocation:
    food = singleFood.text
    print(food)
The problem is, I only want to print the food inside of the "World Palate Maize" section seen in the Lunch portion of the link. In the HTML, there are multiple divs that all contain the foods within a certain type (World Palate Maize, Hot Cereal, MBakery etc.) I'm having trouble figuring out how to tell the loop to only print inside of a certain section (certain div?). This may require an if statement or condition in the for loop but I am unsure about how to format/what to use as a condition to ensure this loop only prints the content from one section.
One strategy could be to select more specifically by text, e.g. with CSS selectors:
soup.select('h3:-soup-contains("Lunch")+div h4:-soup-contains("World Palate Maize") + ul .item-name')
Example
# Print the item names selected by a text-based CSS selector: the
# "World Palate Maize" list inside the block that follows the "Lunch" heading.
import requests
from bs4 import BeautifulSoup

url = "https://dining.umich.edu/menus-locations/dining-halls/mosher-jordan/"
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')

selector = 'h3:-soup-contains("Lunch")+div h4:-soup-contains("World Palate Maize") + ul .item-name'
for item in soup.select(selector):
    print(item.text)
Output
Mojo Grilled Chicken
Italian White Bean Salad
Seems like "Lunch" would always be the second div, so you can probably do
# Print every item name in the lunch section of the dining-hall page.
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla'
}
url = "https://dining.umich.edu/menus-locations/dining-halls/mosher-jordan/"
# Pass the dict by keyword: the second positional argument of requests.get()
# is `params`, so `requests.get(url, headers)` would append the User-Agent as
# a query-string parameter instead of sending it as a header.
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')

# Unpacking requires the selector to match exactly three course blocks.
[breakfast, lunch, dinner] = soup.select('div#mdining-items div.courses')
foods = lunch.select('div.item-name')
for food in foods:
    print(food.text)

BS4 + html, b Tag issue

This question is about web scraping with bs4
this is the code I have written:
# Fetch the product page and print the whole <a class="score-lite"> element.
import requests
from bs4 import BeautifulSoup
import json
import csv

product_url = 'https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx'
response = requests.get(product_url)

# Create a BeautifulSoup object
soup = BeautifulSoup(response.text, 'html.parser')

# extract product score **(This is what I want to extract)**
score_selector = 'a[class="score-lite"]'
stars = soup.select_one(score_selector, namespaces=None, flags=0)
#score = json.loads(stars)
print('Stars', stars)
My outcome:
<a class="score-lite" data-spm-click="gostr=/details.index.reviewLevel;locaid=dreviewLevel" href="https://onuliss.en.alibaba.com/company_profile/feedback.html" target="_blank"><b>4.8 </b><img src="//img.alicdn.com/tfs/TB1MJPmiQL0gK0jSZFtXXXQCXXa-8-9.svg"/></a>
The outcome I want is just the 4.8 number between the 'b' tags
What do I have to do with the = soup.select_one() function?
Thank you very much :)
Try a more specific selector and call get_text(strip=True) on the match to get rid of any extra whitespace.
# Fetch the product page and print the stripped text of the <b> element that
# is a direct child of <a class="score-lite">.
import requests
from bs4 import BeautifulSoup
import json
import csv

product_url = 'https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx'
response = requests.get(product_url)

# Create a BeautifulSoup object
soup = BeautifulSoup(response.text, 'html.parser')

# extract product score **(This is what I want to extract)**
score_tag = soup.select_one('a[class="score-lite"] > b', namespaces=None, flags=0)
stars = score_tag.get_text(strip=True)
#score = json.loads(stars)
print('Stars', stars)
Stars 4.8
how about SimplifiedDoc
# Same extraction as the BeautifulSoup answers, using SimplifiedDoc from the
# third-party simplified_scrapy package instead.
import requests
from simplified_scrapy.simplified_doc import SimplifiedDoc
page = requests.get('https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
# Create a SimplifiedDoc object from the response body
doc = SimplifiedDoc(page.text)
# Look up the <a> element whose class attribute is "score-lite"
stars = doc.getElement('a','class',"score-lite")
# Prints the text of the anchor and of its <b> child; expected output per the
# original answer: Stars 4.8 4.8
print('Stars', stars.text, stars.b.text) # Stars 4.8 4.8
# Print the product score: the text of the <b> inside <a class="score-lite">.
import requests
from bs4 import BeautifulSoup

r = requests.get(
    'https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')

# Parse only after the request is known to have succeeded.
if r.status_code == 200:
    soup = BeautifulSoup(r.text, 'html.parser')
    score_link = soup.find('a', {'class': 'score-lite'})
    # find() returns None when nothing matches, so check each step instead of
    # chaining .find('b') onto a possible None.
    item = score_link.find('b') if score_link is not None else None
    if item is not None:
        print(item.get_text(strip=True))
output:
4.8

BeautifulSoup: get_text() returns empty string from bs4 Tag

I am trying to extract information from this news page.
First I parse the page:
# Download the article and parse its raw bytes with the built-in HTML parser.
import requests
from bs4 import BeautifulSoup

article_url = "https://www.theguardian.com/politics/2019/oct/20/boris-johnson-could-be-held-in-contempt-of-court-over-brexit-letter"
page = requests.get(article_url)
soup = BeautifulSoup(page.content, 'html.parser')
Then I start with the title:
title = soup.find('meta', property="og:title")
and if I print it, I get:
<meta content="Boris Johnson could be held in contempt of court over Brexit letter" property="og:title"/>
However, when I run title.get_text(), the outcome is an empty string: ''
Where is my mistake?
That's because there actually isn't any text defined by the tag. The "text" you are after in this case, is contained in the <meta> tag with the attribute content. So you need to pull out the value of content:
# Read the article title from the "content" attribute of the og:title <meta>
# tag; the tag has no text of its own.
import requests
from bs4 import BeautifulSoup

article_url = "https://www.theguardian.com/politics/2019/oct/20/boris-johnson-could-be-held-in-contempt-of-court-over-brexit-letter"
page = requests.get(article_url)
soup = BeautifulSoup(page.content, 'html.parser')

og_title_tag = soup.find('meta', property="og:title")
title = og_title_tag['content']
Output:
print (title)
Boris Johnson could be held in contempt of court over Brexit letter
You can get all the attributes and values by using .attrs. This will return a dictionary (key:value pairs) of the attribute and value within the given tag:
title = soup.find('meta', property="og:title")
print (title.attrs)
Output:
print (title.attrs)
{'property': 'og:title', 'content': 'Boris Johnson could be held in contempt of court over Brexit letter'}

Python not finding search terms in strings parsed by BeautifulSoup

In Python 3, when I want to return only strings with the term I am interested in, I can do this:
phrases = [
    "1. The cat was sleeping",
    "2. The dog jumped over the cat",
    "3. The cat was startled",
]

# Keep only the phrases that contain the search term, then print them in
# their original order.
dog_phrases = [text for text in phrases if "dog" in text]
for text in dog_phrases:
    print(text)
Which of course prints "2. The dog jumped over the cat"
Now what I'm trying to do is make the same concept work with parsed strings in BeautifulSoup. Craigslist, for example, has lots of A Tags, but only the A Tags that also have "hdrlnk" in them are of interest to us. So I:
# Collect every <a> tag on the Craigslist search page and print those for
# which the membership test below succeeds.
# NOTE: per the question, this prints nothing.
import requests
from bs4 import BeautifulSoup

url = "https://chicago.craigslist.org/search/apa"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

links = soup.find_all("a")
for link in links:
    if "hdrlnk" in link:
        print(link)
Problem is, instead of printing all the A Tags with "hdrlnk" inside, Python prints nothing. And I'm not sure what's going wrong.
"hdrlnk" is a class attribute on the links. As you say you are only interested in these links just find the links based on class like this:
# Print every <a> tag on the Craigslist search page that carries the
# "hdrlnk" class.
import requests
from bs4 import BeautifulSoup

url = "https://chicago.craigslist.org/search/apa"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

links = soup.find_all("a", class_="hdrlnk")
for link in links:
    print(link)
Outputs:
<a class="result-title hdrlnk" data-id="6293679332" href="/chc/apa/d/high-rise-2-bedroom-heated/6293679332.html">High-Rise 2 Bedroom Heated Pool Indoor Parking Fire Pit Pet Friendly!</a>
<a class="result-title hdrlnk" data-id="6285069993" href="/chc/apa/d/new-beautiful-studio-in/6285069993.html">NEW-Beautiful Studio in Uptown/free heat</a>
<a class="result-title hdrlnk" data-id="6293694090" href="/chc/apa/d/albany-park-2-bed-1-bath/6293694090.html">Albany Park 2 Bed 1 Bath Dishwasher W/D & Heat + Parking Incl Pets ok</a>
<a class="result-title hdrlnk" data-id="6282289498" href="/chc/apa/d/north-center-2-bed-1-bath/6282289498.html">NORTH CENTER: 2 BED 1 BATH HDWD AC UNITS PROVIDE W/D ON SITE PRK INCLU</a>
<a class="result-title hdrlnk" data-id="6266583119" href="/chc/apa/d/beautiful-2bed-1bath-in-the/6266583119.html">Beautiful 2bed/1bath in the heart of Wrigleyville</a>
<a class="result-title hdrlnk" data-id="6286352598" href="/chc/apa/d/newly-rehabbed-2-bedroom-unit/6286352598.html">Newly Rehabbed 2 Bedroom Unit! Section 8 OK! Pets OK! (NHQ)</a>
To get the link href or text use:
print(link["href"])
print(link.text)
Try:
for link in links:
    # "hdrlnk" is one of the anchor's CSS classes (class="result-title hdrlnk"
    # in the sample output), not part of its href, so test the class list.
    # .get() with a default avoids a KeyError on anchors lacking the attribute.
    if "hdrlnk" in link.get("class", []):
        print(link)
Just search for the term in the link's content; otherwise your code seems fine.
# Print every <a> tag whose first child contains the string "hdrlnk".
import requests
from bs4 import BeautifulSoup

url = "https://chicago.craigslist.org/search/apa"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
links = soup.find_all("a")
for link in links:
    # Anchors without children have an empty .contents list; check for that
    # first so contents[0] cannot raise IndexError.
    if link.contents and "hdrlnk" in link.contents[0]:
        print(link)
Or, if you want to search inside href or title, use link['href'] and link['title']
To get the required links, you can use selectors within your script to make the scraper robust and concise.
# Print the absolute URL of every element with the "hdrlnk" class.
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

base_link = "https://chicago.craigslist.org"
res = requests.get("https://chicago.craigslist.org/search/apa").text
soup = BeautifulSoup(res, "lxml")
for link in soup.select(".hdrlnk"):
    href = link.get("href")
    # .get() returns None when the attribute is missing; skip such elements
    # rather than fail while building the URL.
    if not href:
        continue
    # urljoin resolves relative hrefs against the base and leaves hrefs that
    # are already absolute untouched, which plain concatenation would corrupt.
    print(urljoin(base_link, href))

Unable to scrape name from google finance

I want to scrape the name, URL and description of companies as listed on Google Finance. So far I am successful in getting the description and URL but am unable to fetch the name. In the source code of myUrl, the name is 024 Pharma Inc. When I inspect the div, its class is named 'appbar-snippet-primary', but the code still doesn't find it. I am new to web scraping, so maybe I am missing something. Please guide me in this regard.
# Scrape the company name, description and website from a Google Finance page.
# NOTE(review): urllib.urlopen and the commented-out `print name` statements
# suggest this was written for Python 2 -- confirm before running.
from bs4 import BeautifulSoup
import urllib
import csv

myUrl = 'https://www.google.com/finance?q=OTCMKTS%3AEEIG'
page_html = urllib.urlopen(myUrl).read()
soup = BeautifulSoup(page_html, 'html.parser')

# Company name
name_box = soup.find('div', class_='appbar-snippet-primary') # !! This div is not found
#name = name_box.text
#print name

# Company description, with surrounding whitespace removed
desc = soup.find('div', class_='companySummary').text.strip()
#print desc

# Company website: text of the first div with class "item"
site = soup.find('div', class_='item').text
#print site
# Scrape the company name, description and website from a Google Finance page,
# taking the name from the page <title> instead of a div.
from bs4 import BeautifulSoup
import requests

myUrl = 'https://www.google.com/finance?q=OTCMKTS%3AEEIG'
page_html = requests.get(myUrl).content
soup = BeautifulSoup(page_html, 'html.parser')

# Company name: the part of the <title> text before the first ':'
title_text = soup.find('title').text
name = title_text.split(':')[0]

# Company description, with surrounding whitespace removed
desc = soup.find('div', class_='companySummary').text.strip()

# Company website: text of the first div with class "item"
site = soup.find('div', class_='item').text
write soup.find_all() instead of soup.find()

Categories