# Walk the first 10 review pages (10 reviews per page) and collect the
# aria-label text of every star-rating widget on each page.
rating = []
for page in range(10):
    page_url = "https://www.yelp.com/biz/snow-show-flushing?osq=ice%20cream%20shop&start=" + str(10 * page)
    ourUrl = urllib.request.urlopen(page_url)
    soup = BeautifulSoup(ourUrl, 'html.parser')
    # The first matching span is not a review rating, so skip it.
    for span in soup.find_all('span', {'class': "display--inline__373c0__1gaV4 border-color--default__373c0__1yxBb"})[1:]:
        rating.append(span.div.get('aria-label'))
I am trying to get the ratings for each page. There should be only 58 ratings in total, but my list also includes the ratings from the "You Might Also Consider" section. How can I fix it?
One possible solution would be to retrieve the total number of Reviews from yelp using BeautifulSoup. You can then trim your "rating"-list by the number of reviews.
# Find the total number of reviews: match the review-count <p> element by
# its generated CSS class name.
regex_count = re.compile('.*css-foyide.*')
count_paragraphs = soup.find_all("p", {"class": regex_count})
# The count is the first whitespace-separated token of the paragraph text.
Review_count = int(count_paragraphs[0].text.split()[0])
Related
I am trying to scrape the total number of customer reviews from the Amazon page, using python and beautiful soup library. The details like customer ratings, stars are getting scraped, but not the review count.
def get_total_review(soup):
    """Return the review-count text from an Amazon product page.

    soup: BeautifulSoup object for the parsed product page.
    Returns the stripped text of the count row (e.g. '3,432 global ratings'),
    or '' when any element in the chain is missing.
    """
    try:
        # Bug fix: the original called .string.strip() immediately, turning
        # the section into a plain str, and then tried to .find() on a
        # misspelled name (`sototal_review1`).  Locate the elements first,
        # and only take the text at the very end.
        section = soup.find("div", attrs={'data-hook': 'cr-filter-info-section'})
        row = section.find("div", attrs={'class': 'a-row a-spacing-base a-size-base'})
        total_review = row.text.strip()
    except AttributeError:
        # A missing element makes one of the lookups return None, and the
        # chained access raises AttributeError; fall back to empty string.
        total_review = ""
    return total_review
I tried adding every particular id, class but the review count isn't getting displayed ('3,432' in this case, as you can see in the image):
I am lost with making a loop to go through all of the pages on this book site. The url ends in 'all?page=' followed by the page number, so it should be easy I thought, but I'm stuck. All the info gathering works fine, I just don't know how to move to the next pages. Any help would be appreciated.
import requests
from bs4 import BeautifulSoup
# NOTE(review): indentation was lost when this snippet was pasted; the loop
# bodies below are logically nested.  As written the code also has bugs:
# `page` is used on the next line before it is ever assigned (NameError),
# and the URL is built only once, so every request would fetch the same page.
URL = 'https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page=' +str(page)
page = 1
page += 1
# NOTE(review): `max_pages` is never defined here, and iterating a bare int
# raises TypeError -- presumably `for page in range(max_pages):` was meant.
for page in max_pages:
html = requests.get(URL)
soup = BeautifulSoup(html.content, "html.parser")
# ^This part I need help with^
# results = all books present on page
# books = each individual book on the page
results = soup.find(class_='tab search')
books = results.find_all('div', class_='book-item')
for book in books:
title = book.h3.a
author = book.p.span
# in case there is no rating on a book
# NOTE(review): `len(...)` always returns an int and can never equal None,
# so this branch is dead; `== 0` (or a truthiness test) was intended.
if len(book.find('div','rating-wrap').findAll('span', 'full-star')) == None:
pass
else: rating = len(book.find('div','rating-wrap').findAll('span', 'full-star'))
publish_date = book.find(class_='published')
format = book.find(class_='format')
price = book.find('span', class_='sale-price').text.strip()
# if there is no discount
if book.find(class_='rrp') == None:
pass
else:
original_price = book.find(class_='rrp').text.strip()
if book.find(class_='price-save') == None:
pass
else:
discount = book.find(class_='price-save').text.strip()
# unneeded text removed such as 'US' before the price shown
price = price.replace('US', '')
# NOTE(review): `original_price` and `discount` stay unbound for books with
# no discount, so these lines would raise NameError in that case.
original_price = original_price.replace('US', '')
discount = discount.replace('Save US', '')
# .text.strip() gets text and rids of empty spaces
print(title.text.strip())
print(author.text.strip())
print(rating, 'stars')
print(publish_date.text.strip())
print(format.text.strip())
print(price)
print(original_price)
print(discount, 'in savings!')
What the code does is loop 5 times in this case, with page going up by one every single time.
# Visit pages 0..4, rebuilding the URL inside the loop so each request
# actually hits a different page.
max_pages = 5
for page in range(max_pages):
    URL = "https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page=" + str(page)
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")
I am using BeautifulSoup to scrape movies from the IMDb website. I was able to scrape the name, genre, duration, and rating of movies successfully, but I am not able to scrape the description of the movies. Looking at the page, the class is "text-muted", and this class appears multiple times holding other data such as rating, genre, and duration. Those values have inner classes, so it was easy to scrape them, but the description has no inner class of its own. So pulling data using only "text-muted" returns the other data as well. How do I get just the description of the movies?
Attaching the code and screenshot for reference:
The sample code which I used to scrape genre is as follows:
# Collect the genre label of every movie on the page.
genre_tags = data.select(".text-muted .genre")
genre = [g.get_text() for g in genre_tags]
# Strip surrounding whitespace and drop entries that are empty once
# stripped.  (Bug fix: the original condition `if str(genre)` tested the
# string form of the whole list, which is always truthy, so it filtered
# nothing.)
Genre = [item.strip() for item in genre if item.strip()]
print(Genre)
In general, lxml is much better than beautifulsoup.
import requests
from lxml import html  # fixed: the original split this import across two lines

url = "xxxx"
r = requests.get(url)
tree = html.fromstring(r.text)
# XPath selects attributes with '@'; '#' is not valid XPath syntax, so the
# original '//div[#class=...]' expressions would raise an XPath error.
rows = tree.xpath('//div[@class="lister-item mode-detail"]')
for row in rows:
    # The plot summary is the <p class="text-muted"> that immediately
    # follows the ratings bar inside each list item.
    description = row.xpath('.//div[@class="ratings-bar"]/following-sibling::p[@class="text-muted"]/text()')[0].strip()
You can use this :) — and if it helped you, please upvote my solution. Thanks!
from bs4 import BeautifulSoup
from requests_html import HTMLSession

# URL of the Most Popular Movies chart on IMDb.
URL = 'https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm'
chart_page = HTMLSession().get(URL)
chart_soup = BeautifulSoup(chart_page.html.html, 'html.parser')
# Table body holding the Most Popular Movies rows.
movie_tables = chart_soup.find_all("tbody", "lister-list")

for cell in movie_tables[0].find_all("td", "titleColumn"):
    anchor = list(cell)[1]
    movie_url = 'https://www.imdb.com' + anchor.get('href')  # link to the movie's own page
    detail_soup = BeautifulSoup(HTMLSession().get(movie_url).html.html, 'html.parser')
    heading = detail_soup.find_all('h1')[0]
    title = list(heading.stripped_strings)[0]  # first string is the title
    year = list(heading.stripped_strings)[2]   # third string is the year
    try:
        score = list(detail_soup.find_all('div', 'ratingValue')[0].stripped_strings)[0]
    except IndexError:
        score = '-'  # no rating available for this movie
    description = list(detail_soup.find_all('div', 'summary_text')[0].stripped_strings)[0]
    print(f'TITLE: {title} YEAR: {year} SCORE: {score}\nDESCRIPTION:{description}\n')
— Junior Saldanha (@UmSaldanha)
I would like to extract at least 20 user reviews for each movie, but I don't know how to loop from the IMDb search results into each movie's title page and then into its user reviews with BeautifulSoup.
start link = "https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250";
title_link(1) = "https://www.imdb.com/title/tt7131622/?ref_=adv_li_tt";
user_reviews_link_movie1 = "https://www.imdb.com/title/tt7131622/reviews?ref_=tt_ov_rt" ;
I am able to extract from a static page titles, years, ratings and metascores of each movie of the list.
# Import packages and set urls
from requests import get

url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250'
response = get(url)
print(response.text[:500])

from bs4 import BeautifulSoup

html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)
movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')
print(type(movie_containers))
print(len(movie_containers))

# Lists that accumulate the scraped fields, one entry per movie.
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Pull the fields out of each movie container.
for container in movie_containers:
    # Only movies that actually have a Metascore are kept.
    if container.find('div', class_='ratings-metascore') is not None:
        names.append(container.h3.a.text)
        years.append(container.h3.find('span', class_='lister-item-year').text)
        imdb_ratings.append(float(container.strong.text))
        metascores.append(int(container.find('span', class_='metascore').text))

import pandas as pd

test_df = pd.DataFrame({'movie': names, 'year': years, 'imdb': imdb_ratings, 'metascore': metascores})
test_df
Actual results :
movie year imdb metascore
Once Upon a Time... in Hollywood (2019) (8.1) (83)
Scary Stories (2019) (6.5) (61)
Fast & Furious: Hobbs & Shaw (2019) (6.8) (60)
Avengers: Endgame (2019) (8.6) (78)
Expected :
movie1 year1 imbd1 metascore1 review1
movie1 year1 imbd1 metascore1 review2
...
movie1 year1 imbd1 metascore1 review20
movie2 year2 imbd2 metascore2 review1
...
movie2 year2 imbd2 metascore2 review20
...
movie250 year250 imbd250 metascore250 review20
Assuming that answer on my question in comments is "yes".
Below is a solution to your initial request.
There's a check whether a particular film really has 20 reviews. If less, then gather all available ones.
Technically, the parsing process is correct; I verified it by assigning movie_containers = movie_containers[:3]. Gathering all the data will take some time.
UPDATE: just finished collecting info on all 250 films - everything is scraped without errors, so block after solution itself is just FYI.
Also if you want to go further with your parsing, I mean collect data for next 250 films and so on, you can add one more looping level to this parser. The process is similar to one in the "Reviews extracting" section.
# Import packages and set urls
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

base_url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250'
url_header_for_reviews = 'https://www.imdb.com'
url_tail_for_reviews = 'reviews?ref_=tt_urv'

base_response = get(base_url)
html_soup = BeautifulSoup(base_response.text, 'html.parser')
movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')

result_df = pd.DataFrame()
# Pull the fields out of each movie container.
for container in movie_containers:
    # Only movies that have a Metascore are processed.
    if container.find('div', class_='ratings-metascore') is not None:
        # --- Reviews extraction ---
        num_reviews = 20
        # The movie's relative link is the last piece of the reviews URL.
        url_middle_for_reviews = container.find('a')['href']
        # Open the reviews page for this particular movie.
        response_reviews = get(url_header_for_reviews + url_middle_for_reviews + url_tail_for_reviews)
        reviews_soup = BeautifulSoup(response_reviews.text, 'html.parser')
        reviews_containers = reviews_soup.find_all('div', class_='imdb-user-review')
        # Cap the target at however many reviews actually exist.
        if len(reviews_containers) < num_reviews:
            num_reviews = len(reviews_containers)
        # Collect title and body for each review.
        reviews_titles = []
        reviews_bodies = []
        for review_index in range(num_reviews):
            review_container = reviews_containers[review_index]
            reviews_titles.append(review_container.find('a', class_='title').text.strip())
            reviews_bodies.append(review_container.find('div', class_='text').text.strip())
        # Repeat the per-movie fields once per collected review so all
        # DataFrame columns have equal length.
        name = container.h3.a.text
        names = [name] * num_reviews
        year = container.h3.find('span', class_='lister-item-year').text
        years = [year] * num_reviews
        imdb_rating = float(container.strong.text)
        imdb_ratings = [imdb_rating] * num_reviews
        metascore = container.find('span', class_='metascore').text
        metascores = [metascore] * num_reviews
        # Gather the scraped rows into result_df.
        if result_df.empty:
            result_df = pd.DataFrame({'movie': names,'year': years,'imdb': imdb_ratings,'metascore': metascores,'review_title': reviews_titles,'review_body': reviews_bodies})
        elif num_reviews > 0:
            result_df = result_df.append(pd.DataFrame({'movie': names,'year': years,'imdb': imdb_ratings,'metascore': metascores,'review_title': reviews_titles,'review_body': reviews_bodies}))
Btw, I'm not sure that IMDb will let you gather data for all films in a loop as is. There's a possibility that you'll get a captcha or a redirection to some other page. If these issues appear, I'd go with a simple solution - pauses in scraping and/or changing user-agents.
Pause (sleep) can be implemented as follows:
import time
import numpy as np

# Pause for a uniformly random interval between 5 and 30 seconds
# (5 + 25 * U(0, 1) covers exactly that range).
time.sleep(5 + 25 * np.random.random())
Inserting a user-agent in request can be done as follows:
import requests
from bs4 import BeautifulSoup

url = 'http://www.link_you_want_to_make_request_on.com/bla_bla'
# Present a desktop-browser User-Agent instead of the default
# python-requests one, so the site serves the regular PC page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
Google some other variants of user-agents, make a list from them and change them from time to time in next requests. Watch out though which user-agents you use - some of them indicate mobile or tablet devices, and for them a site (not only IMDB) can give response pages in a format that differs from PC one - other markup, other design etc. So in general above algorithm works only for PC version of pages.
I have written code in Python using Beautiful Soup for extracting user names and their ratings from IMDb. But many users did not give a rating with their reviews, which makes it difficult to map the ratings exactly to their reviews. How can I handle this part?
http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt
In this url reviews are not assign rating.
# Fetch the review page and try to detect which reviews carry a star rating.
url1 ="http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt"
response = requests.get(url1, headers=headers)
page=response.content
soup=BeautifulSoup(page)
# NOTE(review): this loop is broken as written:
#  - `m` is never defined, so `m['class']` raises NameError on first use
#  - `k` is always a <div class="load-more-data">, so `k.name == 'span'`
#    can never be true
#  - `print blah()` / `print blah 1()` are placeholders, not valid calls
#    (Python 2 print statements)
#  - indentation was lost in this paste; the if/else presumably sits
#    inside the for body -- confirm against the original post
for k in soup.findAll('div',{"class":"load-more-data"}):
if k.name == 'span' and m['class'] == "rating-other-user-rating":
print blah()
else:
print blah 1()
This is my code to check whether the rating part exists within the review part, but it does not return anything — why?
The information you're looking for (username, rating) is located in 'div.review-container' tags.
About the tags that have no rating, you can just ignore them.
# Iterate over every review container; each one holds the reviewer's
# display name and, when the reviewer left one, a star rating.
for k in soup.find_all('div',{"class":"review-container"}):
# `rating` is None when the review has no star rating attached.
rating = k.find('span', class_='rating-other-user-rating')
if rating:
# The last two <span>s hold the two halves of the rating, e.g. '7' + '/10'.
rating = ''.join(i.text for i in rating.find_all('span')[-2:])
name = k.find('span', class_='display-name-link').text
# NOTE(review): indentation was lost in this paste -- the two lines above
# and the Python 2 print below presumably belong inside the for-loop body.
print name, rating
The information that shows when you press the Load More button is loaded via XHR requests.
You'll find the all data you need in order to preform the request in a 'div.load-more-data' tag.
# The "Load More" button fetches the next batch of reviews via XHR; the
# endpoint path and pagination key are stored as data attributes on this div.
load_more = soup.find('div', class_='load-more-data')
ajax_path = load_more['data-ajaxurl']
pagination_key = load_more['data-key']
url = 'http://www.imdb.com{}?paginationKey={}'.format(ajax_path, pagination_key)
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
Just repeat the above process until you have all the info.
import requests
from bs4 import BeautifulSoup
url = "http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt"
# Template for the XHR endpoint behind the "Load More" button; it is
# filled in with each page's `data-key` attribute.
ajax_url = url.split('?')[0] + "/_ajax?paginationKey={}"
reviews = []
# NOTE(review): indentation was lost in this paste -- the statements below
# logically nest: the fetch/parse and both following blocks belong inside
# the while, and the rating/name/append/print lines inside the for.
# (Python 2 print statement.)
while True:
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
# One container per review; the star rating is optional.
for k in soup.find_all('div',{"class":"review-container"}):
rating = k.find('span', class_='rating-other-user-rating')
if rating:
# Join the last two <span>s, i.e. the 'N' and '/10' halves of the rating.
rating = ''.join(i.text for i in rating.find_all('span')[-2:])
name = k.find('span', class_='display-name-link').text
reviews.append([name, rating])
print name, rating
# Stop when no further "load more" marker exists; otherwise follow the
# next pagination key.
load_more = soup.find('div', class_='load-more-data')
if not load_more:
break
url = ajax_url.format(load_more['data-key'])
I suggest you print to the console the content of <div class="review-container" ...> for every review, and then select the specific data you want to retrieve.