How do I get reviews from TripAdvisor?
This is my code using BeautifulSoup:
review_data = data.find_all('div', attrs={'class':'reviews-tab'})
for review in review_data:
    namareview = review.findNext('a', attrs={'class':'ui_header_link social-member-event-MemberEventOnObjectBlock__member--35-jC'})[0].text.strip()
    ratingreview = ...  # stuck here: how do I get the rating?
    tittlereview = data.find_all('a', attrs={'class':'location-review-review-list-parts-ReviewTitle__reviewTitleText--2tFRT'})[0].text.strip()
    print(namareview)
And how do I get the rating value from a bubble rating like this?
<span class="ui_bubble_rating bubble_30"></span>
This is my full code now:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

url = "https://www.tripadvisor.com/Attraction_Review-g297722-d6611509-Reviews-Pahawang_Island-Bandar_Lampung_Lampung_Sumatra.html"
response = requests.get(url)
data = BeautifulSoup(response.text, "html.parser")
print(data.title.text)

nama = data.find_all('h1', attrs={'class':'ui_header h1'})[0].text.strip()
print(nama)
category = data.find_all('div', attrs={'class':'attractions-attraction-review-header-AttractionLinks__detail--2-xvX'})[0].text.strip()
print(category)
location = data.find_all('div', attrs={'class':'attractions-contact-card-ContactCard__contactRow--3Ih6v'})[0].text.strip()
print(location)

review_data = data.find_all('div', attrs={'class':'reviews-tab'})
for review in review_data:
    namareview = review.findNext('a', attrs={'class':'ui_header_link social-member-event-MemberEventOnObjectBlock__member--35-jC'})[0].text.strip()
    bubblereview = ...  # stuck here: how do I get the bubble rating?
    tittlereview = data.find_all('a', attrs={'class':'location-review-review-list-parts-ReviewTitle__reviewTitleText--2tFRT'})[0].text.strip()
    print(namareview, bubblereview, tittlereview)
TripAdvisor is a tricky site to scrape, but not impossible. Not sure exactly what you are after, but you can parse the JSON embedded in the page's script tags:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import json
url = "https://www.tripadvisor.co.uk/Attraction_Review-g297722-d6611509-Reviews-Pahawang_Island-Bandar_Lampung_Lampung_Sumatra.html"
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
response = requests.get(url, headers=headers)
data = BeautifulSoup(response.text, "html.parser")
print(data.title.text)
nama = data.find_all('h1', attrs={'class':'ui_header h1'})[0].text.strip()
print(nama)
category = data.find_all('div', attrs={'class':'attractions-attraction-review-header-AttractionLinks__detail--2-xvX'})[0].text.strip()
print(category)
location = data.find_all('div', attrs={'class':'attractions-contact-card-ContactCard__contactRow--3Ih6v'})[0].text.strip()
print(location)
# Get total count of reviews
data = BeautifulSoup(response.text, "html.parser")
reviewDataIDs = []
scripts = data.find_all('script')
for script in scripts:
    if 'window.__WEB_CONTEXT__=' in script.text:
        jsonStr = script.text
        jsonStr = jsonStr.split('window.__WEB_CONTEXT__={pageManifest:')[-1]
        # The page manifest has trailing JS after the JSON; trim closing
        # braces from the right until the remainder parses as valid JSON.
        iterateJson = True
        while iterateJson == True:
            try:
                jsonData = json.loads(jsonStr + '}')
                iterateJson = False
            except:
                jsonStr = jsonStr.rsplit('}', 1)[0]

raiseError = True
for k, v in jsonData['urqlCache'].items():
    try:
        totalCount = jsonData['urqlCache'][k]['data']['locations'][0]['reviewListPage']['totalCount']
        raiseError = False
        reviewDataIDs.append(k)
        break
    except:
        pass
def getJsonData(reviewCount, reviewDataIDs, continueLoop):
    while continueLoop == True:
        url = "https://www.tripadvisor.co.uk/Attraction_Review-g297722-d6611509-Reviews-or%s-Pahawang_Island-Bandar_Lampung_Lampung_Sumatra.html#REVIEWS" % reviewCount
        response = requests.get(url, headers=headers)
        data = BeautifulSoup(response.text, "html.parser")
        scripts = data.find_all('script')
        for script in scripts:
            if 'window.__WEB_CONTEXT__=' in script.text:
                jsonStr = script.text
                jsonStr = jsonStr.split('window.__WEB_CONTEXT__={pageManifest:')[-1]
                iterateJson = True
                while iterateJson == True:
                    try:
                        jsonData = json.loads(jsonStr + '}')
                        iterateJson = False
                    except:
                        jsonStr = jsonStr.rsplit('}', 1)[0]

        raiseError = True
        for k, v in jsonData['urqlCache'].items():
            try:
                reviewData = jsonData['urqlCache'][k]['data']['locations'][0]['reviewListPage']['reviews']
                raiseError = False
                # Only stop once we hit a cache entry we haven't already seen
                if k not in reviewDataIDs:
                    continueLoop = False
                    reviewDataIDs.append(k)
                    break
            except:
                pass

        if raiseError == True:
            raise ValueError('Data could not be found.')

        if continueLoop == False:
            return reviewData, reviewDataIDs
# Get Reviews
for reviewCount in list(range(0, totalCount, 5)):
    reviewData, reviewDataIDs = getJsonData(reviewCount, reviewDataIDs, continueLoop=True)
    for each in reviewData:
        rating = each['rating']
        title = each['title']
        text = each['text']
        user = each['username']
        print('Name: %s\nTitle: %s\nRating: %s\nReview: %s\n' % (user, title, rating, text) + '-'*70 + '\n')
Output:
Name: Hamdan O
Title: Great for snorkelling and beach fun
Rating: 4
Review: Get a boat from Ketapang Jetty. There were 4 piers with lots of boats to choose from. Choose from traditional wooden boats which are cheaper but slow paced or higher priced fast fiberglass speed boats. We haggled for a fast speed boat to take us snorkelling and island hopping for half a day at 700K Rupiah. We got it from Pak Yayat at Pier 2. Pahawang is excellent for snorkelling. Just off shore the island the residents built platforms with small food/drink booths. They moored the boats there as bases for snorkelling. You can hop from one platform to another. Fantastic ideas to preserve the corals but unfortunately the inexperienced snorkellers ravaged through some of the patches closer to te beach. Great for an overnight trip as well at some of the local folks' homestays on the island.
----------------------------------------------------------------------
Name: PaulusKK
Title: he Trip is just So So
Rating: 3
Review: the boat trip to Pahawang island to me is a bit unsafe, it was a small wooden boat, and the journey was bumpy with high waves, and the island itself almost have no attraction, and the lunch provided there was not good, I only enjoy the fresh coconut water.
----------------------------------------------------------------------
Name: damarwianggo
Title: Pahawang is awesome
Rating: 5
Review: It was a story that Pahawang Island is great place to visit. Then, when I had a chance to accompany students from SMAK IPEKA Palembang to visit Pahawang Island in Lampung, Pahawang is truly exciting. Our one-day-trip to Pahawang was really extraordinary. Moreover, all the students were really excited to join all activities during the trip. The guide helped us to enjoy the trip.
----------------------------------------------------------------------
Name: deddy p
Title: Awesome
Rating: 5
Review: One word i can tell about Pahawang..... Superb. Clean water, beautiful corals. Hope you can help to take care this beautiful environment. Keep it clean.....stay away from plastic.
----------------------------------------------------------------------
Name: kristi0308
Title: Clean beach
Rating: 3
Review: I felt like in pulau pari seribu island for the view
The corals are dead but i saw lots of babies baracudas and a huge purple jellyfish and still got so many pretty little fish
Water are clean and people are not careless about environment as it was very clean when i swam in the island
Thanks to my boat man i paid him only 400k just for a day trip by myself
Paid boat parking every time i move like around 15-20k
And snorkel gear for 30k
----------------------------------------------------------------------
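Since pandas is imported but never used in that answer, the printing loop could instead collect rows into a DataFrame, a minimal sketch reusing totalCount, getJsonData, and reviewDataIDs from the code above:
rows = []
for reviewCount in list(range(0, totalCount, 5)):
    reviewData, reviewDataIDs = getJsonData(reviewCount, reviewDataIDs, continueLoop=True)
    for each in reviewData:
        # Same fields as the printed output, one row per review
        rows.append({'user': each['username'], 'title': each['title'],
                     'rating': each['rating'], 'review': each['text']})
df = pd.DataFrame(rows)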
The value of the bubble rating is represented as the number at the end of the class name. Each bubble has a value of 10, so ui_bubble_rating bubble_30 is a rating with 3 out of 5 bubbles filled. Likewise ui_bubble_rating bubble_45 will have 4.5 out of 5 bubbles filled. You can find all these instances with a regex since the number changes.
bubblereview = data.find_all('span', {'class': re.compile(r'ui_bubble_rating bubble_\d*')})
The resulting list:
[<span class="ui_bubble_rating bubble_45"></span>,
<span class="ui_bubble_rating bubble_45"></span>,
<span class="ui_bubble_rating bubble_40"></span>,
<span class="ui_bubble_rating bubble_30"></span>,
<span class="ui_bubble_rating bubble_50"></span>,
<span class="ui_bubble_rating bubble_50"></span>,
<span class="ui_bubble_rating bubble_30"></span>,
<span class="ui_bubble_rating bubble_40"></span>,
<span class="ui_bubble_rating bubble_35"></span>,
<span class="ui_bubble_rating bubble_40"></span>,
<span class="ui_bubble_rating bubble_40"></span>,
<span class="ui_bubble_rating bubble_45"></span>,
<span class="ui_bubble_rating bubble_40"></span>]
You can filter out the ratings like this:
ratings = re.findall(r'\d+', ''.join(map(str, bubblereview)))
# ['45', '45', '40', '30', '50', '50', '30', '40', '35', '40', '40', '45', '40']
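If you want the actual bubble values as numbers, divide by 10, a minimal sketch building on the ratings list above:
# Each class suffix is the rating times 10, e.g. '45' -> 4.5 bubbles
numeric_ratings = [int(r) / 10 for r in ratings]
# [4.5, 4.5, 4.0, 3.0, 5.0, 5.0, 3.0, 4.0, 3.5, 4.0, 4.0, 4.5, 4.0]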
Try this loop:
for review in data.select("div[class*='SingleReview']"):
    title = review.select_one(":scope a > span > span").get_text()
    bubble_tag = review.select_one(":scope span[class*='bubble']")
    rating = bubble_tag["class"][-1].split("_")[-1]
    print(f"({rating}){title}")
This also works, and matches only the 5 reviews needed per page:
bubblereview = soup.find_all('div', {'class': re.compile('nf9vGX55')})
Related
I'm trying to crawl movie titles from this website: https://www.the-numbers.com/market/2019/top-grossing-movies
But I keep getting truncated titles like "John Wick: Chapter 3 — ".
This is the code:
url = "https://www.the-numbers.com/market/" + "2019" + "/top-grossing-movies"
raw = requests.get(url,
headers={'User-Agent':'Mozilla/5.0'})
html = BeautifulSoup(raw.text, "html.parser")
movie_list = html.select("#page_filling_chart table tr > td > b > a") #"#page_filling_chart > table > tbody > tr > td > b"
for i in range(len(movie_list)):
print(movie_list[i].text)
And these are the outputs:
Avengers: Endgame
The Lion King
Frozen II
Toy Story 4
Captain Marvel
Star Wars: The Rise of Skyw…
Spider-Man: Far From Home
Aladdin
Joker
Jumanji: The Next Level
It: Chapter Two
Us
Fast & Furious Presents: Ho…
John Wick: Chapter 3 — Para…
How to Train Your Dragon: T…
The Secret Life of Pets 2
Pokémon: Detective Pikachu
Once Upon a Time…in Hollywo…
I want to know why I keep getting these broken words and how to fix this!
Because this page is server-rendered with the titles already truncated, you can request each movie's own page to recover the full title. (Also, extract the title with a regex if needed, because the heading on a movie's page contains the release year.)
Try the code below:
import requests
from bs4 import BeautifulSoup

url = "https://www.the-numbers.com/market/" + "2019" + "/top-grossing-movies"
raw = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
html = BeautifulSoup(raw.text, "html.parser")
movie_list = html.select("#page_filling_chart table tr > td > b > a")

# Follow each movie's own page, where the full title is rendered
for movie in movie_list:
    raw = requests.get("https://www.the-numbers.com" + movie.get("href"), headers={'User-Agent': 'Mozilla/5.0'})
    raw.encoding = 'utf-8'
    html = BeautifulSoup(raw.text, "html.parser")
    print(html.select_one("#main > div > h1").text)
That gives me:
Avengers: Endgame (2019)
The Lion King (2019)
Frozen II (2019)
Toy Story 4 (2019)
Captain Marvel (2019)
Star Wars: The Rise of Skywalker (2019)
Spider-Man: Far From Home (2019)
....
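To strip the year from those headings, as suggested above, a small regex cleanup works (a sketch; it assumes the heading always ends with a parenthesized year):
import re
full_title = html.select_one("#main > div > h1").text
print(re.sub(r'\s*\(\d{4}\)$', '', full_title))  # "Avengers: Endgame (2019)" -> "Avengers: Endgame"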
You need to normalize the strings; the solution code is:
import requests
import unicodedata
from bs4 import BeautifulSoup

url = "https://www.the-numbers.com/market/" + "2019" + "/top-grossing-movies"
raw = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
html = BeautifulSoup(raw.text, "lxml")
movie_list = html.select("#page_filling_chart table tr > td > b > a")

for i in range(len(movie_list)):
    movie_name = movie_list[i].text
    print(unicodedata.normalize('NFKD', movie_name).encode('ascii', 'ignore').decode())
The output is like this:
Avengers: Endgame
The Lion King
Frozen II
Toy Story 4
Captain Marvel
Star Wars: The Rise of Skyw...
Spider-Man: Far From Home
Aladdin
Joker
Jumanji: The Next Level
It: Chapter Two
Us
Fast & Furious Presents: Ho...
John Wick: Chapter 3 a Para...
How to Train Your Dragon: T...
The Secret Life of Pets 2
PokAmon: Detective Pikachu
Once Upon a Timeain Hollywo...
Shazam!
Aquaman
Knives Out
Dumbo
Maleficent: Mistress of Evil
.
.
Narcissister Organ Player
Chef Flynn
I am Not a Witch
Divide and Conquer: The Sto...
Senso
Never-Ending Man: Hayao Miy...
I'm trying to improve my Python by playing around with BeautifulSoup and the requests module. I've done a few tutorials and have successfully scraped data from various places, but I can't manage to get this one working. I know IMDb offers a ready-made product for accessing its data, but I like using the site to practise Python.
I'm trying to scrape the titles of each of the episodes on this page, but my code is just giving me an empty list.
import requests
from bs4 import BeautifulSoup

URL = 'https://www.imdb.com/title/tt0094525/episodes?season=5&ref_=tt_eps_sn_5'
headers = {"User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
page = requests.get(URL, headers=headers)
pageTree = requests.get(URL, headers=headers)
soup = BeautifulSoup(pageTree.content, 'html.parser')
print(soup)  # testing it's working
print(soup.title.string)

episodes_list = []
episodes = soup.find_all("a", class_="title")
for episode in episodes:
    episodeName = episodes.find("a").get_text()
    episodes_list.append(episodeName)
print(episodes_list)
Pointers would be greatly appreciated. I know the issue is with the episodes variable, but trial and error has not given me the answer.
You're looking for elements with class = title, but if you look at the HTML, the a elements you're looking for don't have a class attribute. For example:
<a href="/title/tt0676164/"
title="The Adventure of the Egyptian Tomb" itemprop="url">...</a>
There is a title attribute, but no class attribute. Reading through the BeautifulSoup documentation, it looks like you can use a regular expression as an attribute filter, so (after import re) we can probably do something like this:
episodes = soup.find_all("a", title=re.compile('.'))
That finds everything with a non-empty title attribute, which seems to be what you want:
>>> episodes = soup.find_all("a", title=re.compile('.'))
>>> [x.get('title') for x in episodes]
['The Adventure of the Egyptian Tomb', 'The Adventure of the Egyptian Tomb',
'The Underdog', 'The Underdog', 'The Yellow Iris', 'The Yellow Iris',
'The Case of the Missing Will', 'The Case of the Missing Will',
'The Adventure of the Italian Nobleman', 'The Adventure of the Italian Nobleman',
'The Chocolate Box', 'The Chocolate Box', "Dead Man's Mirror",
"Dead Man's Mirror", 'Jewel Robbery at the Grand Metropolitan',
'Jewel Robbery at the Grand Metropolitan', 'Share on Facebook',
'Share on Twitter', 'Share the page', 'Facebook', 'Instagram', 'Twitch',
'Twitter', 'YouTube']
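The tail of that list ('Share on Facebook', 'Instagram', and so on) is site navigation rather than episodes. Since the episode anchors in the HTML above carry itemprop="url", a hedged alternative is to filter on that attribute instead (a sketch; it assumes only episode links use that itemprop in the episodes section):
episodes = soup.find_all("a", attrs={"itemprop": "url"}, title=True)
titles = list(dict.fromkeys(a["title"] for a in episodes))  # drop the doubled entries, keep order
print(titles)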
You could try something like this. It will select only the titles of the episodes and put them into the episodes list.
import requests
from bs4 import BeautifulSoup

URL = 'https://www.imdb.com/title/tt0094525/episodes?season=5&ref_=tt_eps_sn_5'
headers = {"User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
pageTree = requests.get(URL, headers=headers)
soup = BeautifulSoup(pageTree.content, 'html.parser')

episodes_list = []
episodes = soup.find_all("div", {"class": "info"})
# Iterate over results and collect the first link's text from each info block
for episode in episodes:
    episodes_list.append(episode.a.text)
print(episodes_list)
The output will look like this:
['The Adventure of the Egyptian Tomb', 'The Underdog', 'The Yellow Iris', 'The Case of the Missing Will', 'The Adventure of the Italian Nobleman', 'The Chocolate Box', "Dead Man's Mirror", 'Jewel Robbery at the Grand Metropolitan']
I would like to extract at least 20 user reviews for each movie, but I don't know how to loop into each IMDb title page and then into its user reviews with BeautifulSoup.
start link = "https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250";
title_link(1) = "https://www.imdb.com/title/tt7131622/?ref_=adv_li_tt";
user_reviews_link_movie1 = "https://www.imdb.com/title/tt7131622/reviews?ref_=tt_ov_rt" ;
I am able to extract titles, years, ratings and metascores for each movie in the list from a static page.
# Import packages and set urls
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250'
response = get(url)
print(response.text[:500])

html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)

movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
print(type(movie_containers))
print(len(movie_containers))

# Lists to store the scraped data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Extract data from individual movie container
for container in movie_containers:
    # If the movie has a Metascore, then extract:
    if container.find('div', class_ = 'ratings-metascore') is not None:
        # The name
        name = container.h3.a.text
        names.append(name)
        # The year
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years.append(year)
        # The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        # The Metascore
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))

test_df = pd.DataFrame({'movie': names, 'year': years, 'imdb': imdb_ratings, 'metascore': metascores})
test_df
Actual results:
movie year imdb metascore
Once Upon a Time... in Hollywood (2019) (8.1) (83)
Scary Stories (2019) (6.5) (61)
Fast & Furious: Hobbs & Shaw (2019) (6.8) (60)
Avengers: Endgame (2019) (8.6) (78)
Expected:
movie1 year1 imbd1 metascore1 review1
movie1 year1 imbd1 metascore1 review2
...
movie1 year1 imbd1 metascore1 review20
movie2 year2 imbd2 metascore2 review1
...
movie2 year2 imbd2 metascore2 review20
...
movie250 year250 imbd250 metascore250 review20
Assuming that the answer to my question in the comments is "yes".
Below is a solution to your initial request.
There's a check whether a particular film really has 20 reviews; if fewer, it gathers all the available ones.
Technically the parsing process is correct; I checked it by assigning movie_containers = movie_containers[:3]. Gathering all the data will take some time.
UPDATE: I just finished collecting info on all 250 films. Everything was scraped without errors, so the block after the solution itself is just FYI.
Also, if you want to go further with your parsing, I mean collect data for the next 250 films and so on, you can add one more looping level to this parser, as sketched below. The process is similar to the one in the "Reviews extracting" section.
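A minimal sketch of that extra level, reusing base_url and get from the solution below (it assumes IMDb's advanced search accepts a 1-based start offset parameter, which is how its pagination links are built):
for start in range(1, 1001, 250):
    page_url = base_url + '&start={}'.format(start)
    page_soup = BeautifulSoup(get(page_url).text, 'html.parser')
    movie_containers = page_soup.find_all('div', class_='lister-item mode-advanced')
    # ...then run the same per-container extraction as in the solution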
# Import packages and set urls
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

base_url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250'
url_header_for_reviews = 'https://www.imdb.com'
url_tail_for_reviews = 'reviews?ref_=tt_urv'

base_response = get(base_url)
html_soup = BeautifulSoup(base_response.text, 'html.parser')
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')

result_df = pd.DataFrame()

# Extract data from individual movie container
for container in movie_containers:
    # If the movie has a Metascore, then extract:
    if container.find('div', class_ = 'ratings-metascore') is not None:

        # Reviews extracting
        num_reviews = 20
        # Getting the last piece of the link puzzle for a movie's reviews link
        url_middle_for_reviews = container.find('a')['href']
        # Opening the reviews page of a concrete movie
        response_reviews = get(url_header_for_reviews + url_middle_for_reviews + url_tail_for_reviews)
        reviews_soup = BeautifulSoup(response_reviews.text, 'html.parser')
        # Searching all reviews
        reviews_containers = reviews_soup.find_all('div', class_ = 'imdb-user-review')
        # Check if the actual number of reviews is less than the target one
        if len(reviews_containers) < num_reviews:
            num_reviews = len(reviews_containers)

        # Looping through each review and extracting title and body
        reviews_titles = []
        reviews_bodies = []
        for review_index in range(num_reviews):
            review_container = reviews_containers[review_index]
            review_title = review_container.find('a', class_ = 'title').text.strip()
            review_body = review_container.find('div', class_ = 'text').text.strip()
            reviews_titles.append(review_title)
            reviews_bodies.append(review_body)

        # The name
        name = container.h3.a.text
        names = [name for i in range(num_reviews)]
        # The year
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years = [year for i in range(num_reviews)]
        # The IMDB rating
        imdb_rating = float(container.strong.text)
        imdb_ratings = [imdb_rating for i in range(num_reviews)]
        # The Metascore
        metascore = container.find('span', class_ = 'metascore').text
        metascores = [metascore for i in range(num_reviews)]

        # Gathering up scraped data into result_df
        if result_df.empty:
            result_df = pd.DataFrame({'movie': names, 'year': years, 'imdb': imdb_ratings, 'metascore': metascores, 'review_title': reviews_titles, 'review_body': reviews_bodies})
        elif num_reviews > 0:
            result_df = result_df.append(pd.DataFrame({'movie': names, 'year': years, 'imdb': imdb_ratings, 'metascore': metascores, 'review_title': reviews_titles, 'review_body': reviews_bodies}))
By the way, I'm not sure that IMDb will let you gather data for all films in a loop as is. There's a possibility that you'll get a captcha or a redirect to some other page. If these issues appear, I'd go with a simple solution: pauses in scraping and/or changing user agents.
Pause (sleep) can be implemented as follows:
import time
import numpy as np
time.sleep((30-5)*np.random.random()+5) #from 5 to 30 seconds
Inserting a user agent into a request can be done as follows:
import requests
from bs4 import BeautifulSoup
url = ('http://www.link_you_want_to_make_request_on.com/bla_bla')
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
Google some other variants of user agents, make a list of them, and change them from time to time in subsequent requests, as in the sketch below. Watch out, though, which user agents you use: some of them indicate mobile or tablet devices, and for those a site (not only IMDb) can serve response pages in a format that differs from the PC one - other markup, other design, etc. So in general the above algorithm works only for the PC version of pages.
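A minimal sketch of that rotation; the two user-agent strings here are the desktop ones already used elsewhere in this thread:
import random
import requests

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
]
headers = {'User-Agent': random.choice(user_agents)}  # pick a different UA per request
response = requests.get(url, headers=headers)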
So far my code can scrape the number of items on sale in the Charms category, but I cannot make it print out the name of the category.
The site uses an infinite scroller, but I managed to identify the paged URLs, so the URL contains {} which is filled in by the while loop.
import requests
from bs4 import BeautifulSoup

url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"

def fetch_items(link, page):
    Total_items = 0
    while page <= 1000:
        #print("current page no: ", page)
        res = requests.get(link.format(page), headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(res.text, "lxml")
        list_total = soup.select('.grid-tile .price-standard')
        Total_items += len(list_total)
        #print(Total_items)
        page += 30
    category_tags = soup.select('span.breadcrumb-element')
    return Total_items
    return category_tags
if __name__ == '__main__':
    page = 0
    product_list = []
    total_items = fetch_items(url, page)
    # print number of items on sale
    print(total_items)
    print(category_tags)
Here's what I need:
I need to print out the category of the scraped items, which can be found using this line:
category_tags = soup.select('span.breadcrumb-element')
But I cannot make it print.
While we're at it, how can I make the code print out ALL the items and not just the items on sale?
Thank you.
EDIT:
So, building on one of the answers below, I ended up with this.
import requests
from bs4 import BeautifulSoup
import re
url1 = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"
url2 = "https://us.pandora.net/en/bracelets/?sz=30&start={}&format=page-element"
url3 = "https://us.pandora.net/en/rings/?sz=30&start={}&format=page-element"
url4 = "https://us.pandora.net/en/necklaces/?sz=30&start={}&format=page-element"
url5 = "https://us.pandora.net/en/earrings/?sz=30&start={}&format=page-element"
#res = requests.get(link.format(url1),headers={"User-Agent":"Mozilla/5.0"})
soup1 = BeautifulSoup(requests.get(url1.format(0)).text, 'lxml')
soup2 = BeautifulSoup(requests.get(url2.format(0)).text, 'lxml')
soup3 = BeautifulSoup(requests.get(url3.format(0)).text, 'lxml')
soup4 = BeautifulSoup(requests.get(url4.format(0)).text, 'lxml')
soup5 = BeautifulSoup(requests.get(url5.format(0)).text, 'lxml')
total_items1 = ''.join(re.findall(r'\d', soup1.select_one('span.products-count').text))
total_items2 = ''.join(re.findall(r'\d', soup2.select_one('span.products-count').text))
total_items3 = ''.join(re.findall(r'\d', soup3.select_one('span.products-count').text))
total_items4 = ''.join(re.findall(r'\d', soup4.select_one('span.products-count').text))
total_items5 = ''.join(re.findall(r'\d', soup5.select_one('span.products-count').text))
#categories = [tag['title'].strip() for tag in soup.select('.refinement-link[title]')
#total_items_sale1 = ''.join(re.findall(r'\d', soup1.select_one('.grid-tile .price-standard')))
#total_items_sale1
#total_items_sale1
#total_items_sale1
#total_items_sale1
#print('Categories:')
#for category in categories:
#print('\t{}'.format(category))
print('\nTotal Charms: {}'.format(total_items1))
print('\nTotal Bracelets: {}'.format(total_items2))
print('\nTotal Rings: {}'.format(total_items3))
print('\nTotal Necklaces: {}'.format(total_items4))
print('\nTotal Earrings: {}'.format(total_items5))
I know it looks horrible. How can we shorten it?
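One way to shorten it, since the five URLs differ only in the category segment, is to loop over the category names (a sketch, assuming each category page keeps the span.products-count element):
import re
import requests
from bs4 import BeautifulSoup

categories = ['charms', 'bracelets', 'rings', 'necklaces', 'earrings']
base = "https://us.pandora.net/en/{}/?sz=30&start=0&format=page-element"
for cat in categories:
    soup = BeautifulSoup(requests.get(base.format(cat), headers={"User-Agent": "Mozilla/5.0"}).text, 'lxml')
    count = ''.join(re.findall(r'\d', soup.select_one('span.products-count').text))
    print('Total {}: {}'.format(cat.capitalize(), count))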
Looking at the result from the server, you don't have to loop through all the pages; all the info you need is on one page:
import requests
from bs4 import BeautifulSoup

url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"
sale_url = "https://us.pandora.net/en/sale/sale-charms/?sz=30&start={}&format=page-element"

soup = BeautifulSoup(requests.get(url.format(0)).text, 'lxml')
sale_soup = BeautifulSoup(requests.get(sale_url.format(0)).text, 'lxml')

total_items = soup.select_one('#products_count')['value']
total_sale_items = sale_soup.select_one('#products_count')['value']

categories = [tag['title'].strip() for tag in soup.select('.refinement-link[title]')]

print('Categories:')
for category in categories:
    print('\t{}'.format(category))

print('\nTotal items: {}'.format(total_items))
print('Total sale items: {}'.format(total_sale_items))
Prints:
Categories:
Charms
New Arrivals
Best Sellers
Clips
Spacers
Dangles
Safety Chains
Alphabet & Symbols
Animals & Pets
Birthday
Touch of Color
Disney
Family
Holidays
Christmas
Inspirational
Symbols of Love
Nature
Passions
Vacation & Travel
Wedding & Anniversary
Last Chance
Pandora Reflexions™
$0 - $50
$50 - $100
$100 - $150
$150 & Over
Charms
New Arrivals
Best Sellers
Clips
Spacers
Dangles
Safety Chains
Alphabet & Symbols
Animals & Pets
Birthday
Touch of Color
Disney
Family
Holidays
Christmas
Inspirational
Symbols of Love
Nature
Passions
Vacation & Travel
Wedding & Anniversary
Last Chance
Pandora Reflexions™
Total items: 959
Total sale items: 376
You can't have two return statements there: the function stops after the first return, so if you want to return multiple objects, put them in one return statement. You also need to append the categories within the loop; you have that outside of your loop. Note, I changed the limit from 1000 to 300 just to test it.
Secondly, I think what you want is the tag's .text.
To print all the items, you'll need to get each item, not just the ones with 'price-standard':
import requests
from bs4 import BeautifulSoup

url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"

def fetch_items(link, page):
    Total_items = 0
    categories = []
    while page <= 300:
        #print("current page no: ", page)
        res = requests.get(link.format(page), headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(res.text, "lxml")
        list_total = soup.select('.grid-tile .price-standard')
        Total_items += len(list_total)
        #print(Total_items)
        page += 30
        print(page)
        category_tags = soup.select('span.breadcrumb-element')[0]
        try:
            categories.append(category_tags.text)
        except:
            categories.append('N/A')
    return Total_items, categories

page = 0
total_items = fetch_items(url, page)

# print number of items on sale
print(total_items[0])
print(total_items[1])
Here's how you can go about getting all the products:
def fetch_items(link, page):
    Total_items = 0
    names = []
    categories = []
    prices = []
    sales = []
    while page <= 300:
        res = requests.get(link.format(page), headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(res.text, "lxml")
        products = soup.find_all("li", class_=lambda value: value and value.startswith("grid-tile"))
        for each in products:
            Total_items += 1
            category = each.find('div', {'class': 'product-tile'})['data-cgid']
            name = each.find('div', {'class': 'product-name'}).text.strip()
            price = each.find('div', {'class': 'product-pricing'}).text.strip()
            sale_price = each.find('span', {'class': 'price-sales'}).text.strip()
            names.append(name)
            categories.append(category)
            prices.append(price)
            sales.append(sale_price)
        print(page)
        page += 30
    return Total_items, names, categories, prices, sales

results = fetch_items(url, page)
I'm not sure how you want those results, though. But you can dump them into a table if you'd like:
import pandas as pd

df = pd.DataFrame({
    'name': results[1],
    'category': results[2],
    'price': results[3],
    'sale': results[4]})
Output:
print (df.head(10).to_string())
name category price sale
0 American Icons Dangle Charm charms $60.00 $60.00
1 Disney Pixar, Toy Story, Buzz Lightyear Dangle... charms $70.00 $70.00
2 Disney Pixar, Toy Story, Woody Dangle Charm charms $60.00 $60.00
3 Spinning Globe Dangle Charm charms $60.00 $60.00
4 Elephant Charm charms $45.00 $45.00
5 Canada Dangle Charm, Pandora Rose™ charms $65.00 $65.00
6 Sparkling Monkey Charm charms $70.00 $70.00
7 Propeller Plane Dangle Charm charms $55.00 $55.00
8 Spotted Heart Charm charms $50.00 $50.00
9 Pink Travel Bag Charm charms $50.00 $50.00
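If you want to keep the table around, a one-line follow-up works (the filename here is arbitrary):
df.to_csv('pandora_products.csv', index=False)  # save the scraped table to CSV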
I am extracting review data from IMDb.
However, sometimes a review has no rank.
I want to treat the Rank as 0 for such data and still add it to the array.
I'm not sure how. Can you help me? Thank you very much!
When extracted like this, there are fewer Rank values than reviews:
for star in soup.select('span:has(~ .point-scale)'):
    Star.append(star.text.strip())

for title in soup.find_all('a', {'class': 'title'}):
    Title.append(title.text.strip())

for content in soup.find_all(True, {'class': ['text show-more__control', 'text show-more__control clickable']}):
    Content.append(content.text.strip())

print(range(len(Content)))
The lengths of the lists (Rank, Title, Content) don't match, so the elements from the site no longer line up across the lists.
Not all reviews will have ratings so you need to take this into account:
$ python3 test.py https://www.imdb.com/title/tt5113040/reviews
Got response: 200
Title: The Secret Life of Pets 2
# (8/10) Not as bad as some reviews on here
Let's get this straight it a film made for childre...
-----
ddriver385, 26 May 2019
# (7/10) A Good Film for the kids
This film is a good film to watch with the kids. C...
-----
xxharriet_hobbsxx, 27 May 2019
# (7/10) Worth a watch
Admittedly, it probably wasn't necessary to follow...
-----
MythoGenesis, 24 May 2019
# (No rating) Intense and entertaining
Narratively, the film is not without fault. In par...
-----
TheBigSick, 26 May 2019
...
test.py
import requests
import sys
import time
from bs4 import BeautifulSoup

def fetch(url):
    with requests.Session() as s:
        r = s.get(url, timeout=5)
        return r

def main(url):
    start_t = time.time()
    resp = fetch(url)
    print(f'Got response: {resp.status_code}')
    html = resp.content
    bs = BeautifulSoup(html, 'html.parser')

    title = bs.find('h3', attrs={'itemprop': 'name'})
    print(f'Title: {title.a.text}')

    reviews = bs.find_all('div', class_='review-container')
    for review in reviews:
        title = review.find('a', class_='title').text.strip()
        rating = review.find('span', class_='rating-other-user-rating')
        if rating:
            rating = ''.join(i.text for i in rating.find_all('span'))
        rating = rating if rating else 'No rating'
        user = review.find('span', class_='display-name-link').text
        date = review.find('span', class_='review-date').text
        content = review.find('div', class_='content').div.text
        print(
            f'# ({rating}) {title}\n'
            f'{content[:50]}...\n'
            f'{"-" * 5}\n'
            f'{user}, {date}\n'
        )

    end_t = time.time()
    elapsed_t = end_t - start_t
    r_time = resp.elapsed.total_seconds()
    print(f'Total: {elapsed_t:.2f}s, request: {r_time:.2f}s')

if __name__ == '__main__':
    if len(sys.argv) > 1:
        url = sys.argv[1]
        main(url)
    else:
        print('URL is required.')
        sys.exit(1)
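If you specifically want a missing rating treated as 0, as the question asked, a small tweak inside the review loop does it (a sketch building on the rating lookup above; the first inner span holds the numeric score):
rating_tag = review.find('span', class_='rating-other-user-rating')
rank = int(rating_tag.find('span').text) if rating_tag else 0  # no rating -> 0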