I'm trying to scrape the OpenTable site using Beautiful Soup. The code runs successfully, but the result I am getting has a lot of NA columns. Here is the code:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import re

def parse_html(html):
    data, item = pd.DataFrame(), {}
    soup = BeautifulSoup(html, 'lxml')
    for i, resto in enumerate(soup.find_all('div', class_='rest-row-info')):
        item['name'] = resto.find('span', class_='rest-row-name-text').text
        booking = resto.find('div', class_='booking')
        item['bookings'] = re.search(r'\d+', booking.text).group() if booking else 'NA'
        rating = resto.select('div.all-stars.filled')
        item['rating'] = int(re.search(r'\d+', rating[0].get('style')).group()) if rating else 'NA'
        reviews = resto.find('span', class_='star-rating-text--review-text')
        item['reviews'] = int(re.search(r'\d+', reviews.text).group()) if reviews else 'NA'
        item['price'] = int(resto.find('div', class_='rest-row-pricing').find('i').text.count('$'))
        item['cuisine'] = resto.find('span', class_='rest-row-meta--cuisine').text
        item['location'] = resto.find('span', class_='rest-row-meta--location').text
        data[i] = pd.Series(item)
    return data.T

restaurants = pd.DataFrame()
driver = webdriver.Chrome(ChromeDriverManager().install())
url = "https://www.opentable.com/new-york-restaurant-listings"
driver.get(url)

while True:
    sleep(1)
    new_data = parse_html(driver.page_source)
    if new_data.empty:
        break
    restaurants = pd.concat([restaurants, new_data], ignore_index=True)
    print(len(restaurants))
    # driver.find_element_by_link_text('Next').click()

driver.close()
restaurants.to_csv('results.csv', index=False)
print(restaurants)
and the results:
name bookings rating reviews price cuisine location
0 IL Carino Restaurant 1 NA NA 3 Upper East Side
1 French Roast Uptown 10 NA NA 3 Upper West Side
2 The Mermaid Inn Uptown 72 NA NA 3 Upper West Side
3 Cafe Du Soleil 101 NA NA 2 Upper West Side
4 The Leopard at des Artistes 24 NA NA 4 Upper West Side
Any recommendation or suggestion is appreciated.
I don't see this selector on the page:
rating = resto.select('div.all-stars.filled')
and the code can't find it either, so you get NA for rating.
But this gives me strings like "4.5 stars out of 5":
rating = resto.select('.star-rating .star-rating-score')
#print(rating)
item['rating'] = rating[0]['aria-label'] if rating else 'NA'
I don't see this selector on the page either:
resto.find('span', class_='star-rating-text--review-text')
and the code can't find it, so you get NA for reviews.
But this gives me strings like "Awesome" and "Exceptional":
reviews = resto.select('div.review-rating-text span')
#print(reviews)
item['reviews'] = reviews[0].text if reviews else 'NA'
There are two elements with class 'rest-row-meta--cuisine', and since find() returns the first one, you get $$$$:
item['cuisine'] = resto.find('span', class_='rest-row-meta--cuisine').text
but you should use find_all() to get both and then take the last one with [-1]:
item['cuisine'] = resto.find_all('span', class_='rest-row-meta--cuisine')[-1].text
and this gives me:
Pizzeria
Italian
Sushi
Steak
Contemporary Italian
Pizzeria
American
Italian
American
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
import re

def parse_html(html):
    data, item = pd.DataFrame(), {}
    soup = BeautifulSoup(html, 'lxml')
    for i, resto in enumerate(soup.find_all('div', class_='rest-row-info')):
        item['name'] = resto.find('span', class_='rest-row-name-text').text
        booking = resto.find('div', class_='booking')
        item['bookings'] = re.search(r'\d+', booking.text).group() if booking else 'NA'
        rating = resto.select('.star-rating .star-rating-score')
        #print(rating)
        item['rating'] = rating[0]['aria-label'] if rating else 'NA'
        #reviews = resto.find('span', class_='star-rating-text--review-text')  # old selector, no longer on the page
        reviews = resto.select('div.review-rating-text span')
        #print(reviews)
        item['reviews'] = reviews[0].text if reviews else 'NA'
        item['price'] = int(resto.find('div', class_='rest-row-pricing').find('i').text.count('$'))
        item['cuisine'] = resto.find_all('span', class_='rest-row-meta--cuisine')[-1].text
        #print(item['cuisine'])
        item['location'] = resto.find('span', class_='rest-row-meta--location').text
        data[i] = pd.Series(item)
    return data.T

restaurants = pd.DataFrame()

#driver = webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Chrome()

url = "https://www.opentable.com/new-york-restaurant-listings"
driver.get(url)

while True:
    sleep(1)
    new_data = parse_html(driver.page_source)
    if new_data.empty:
        break
    restaurants = pd.concat([restaurants, new_data], ignore_index=True)
    print(len(restaurants))
    # driver.find_element_by_link_text('Next').click()

#driver.close()
restaurants.to_csv('results.csv', index=False)
print(restaurants[['rating', 'reviews', 'cuisine']])
rating reviews cuisine
0 4.5 stars out of 5 Awesome Italian
1 4.5 stars out of 5 Awesome French American
2 4.7 stars out of 5 Exceptional Italian
3 4.8 stars out of 5 Exceptional Seafood
4 4.4 stars out of 5 Awesome French
.. ... ... ...
95 4.7 stars out of 5 Exceptional Contemporary Italian
96 4 stars out of 5 Excellent Pizzeria
97 NA NA American
98 4.7 stars out of 5 Exceptional Italian
99 4.4 stars out of 5 Awesome American
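If a numeric score is needed downstream, it can be extracted from those strings after the fact. A minimal sketch (rating_value is a name I made up; rows that stayed 'NA' simply become NaN):

# pull the leading number out of strings like "4.5 stars out of 5"
restaurants['rating_value'] = (
    restaurants['rating']
    .str.extract(r'([\d.]+)')[0]
    .astype(float)
)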
import requests as r
from bs4 import BeautifulSoup as bs

url = r.get("https://www.consumerreports.org/cro/coffee-makers.htm")
soup = bs(url.content, "html.parser")
product = soup.find('div', class_="row product-type-container")
pclass = product.find('div', class_="product-type-item col-xs-4")
pname = pclass.find('div', class_="product-type-info-container").h3.text
print(pname)
I am scraping all the product names and details, but I can only scrape one product at a time. How can I scrape all of them?
To get the titles of all products in all categories you can use the following example:
import requests
from bs4 import BeautifulSoup

def get_products(url):
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    out = []
    for title in soup.select(".crux-component-title"):
        out.append(title.get_text(strip=True))
    return out

url = "https://www.consumerreports.org/cro/coffee-makers.htm"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

all_data = []
for category_link in soup.select("h3.crux-product-title a"):
    u = "https://www.consumerreports.org" + category_link["href"]
    print("Getting {}".format(u))
    all_data.extend(get_products(u))

for i, title in enumerate(all_data, 1):
    print("{:<5} {}".format(i, title))
Prints:
1 Bella 14755 with Brew Strength Selector
2 Bella Pro Series 90061
3 Betty Crocker 12-cup Stainless Steel BC-2809CB
4 Black+Decker 12-cup Programmable CM1331S
5 Black+Decker 12-Cup Thermal Programmable CM2046S
6 Black+Decker CM2036S 12-cup Thermal
7 Black+Decker CM4000S
8 Black+Decker DLX1050B
9 Black+Decker Even Stream CM2035B
10 Black+Decker Honeycomb Collection CM1251W
11 Black+Decker Programmable CM1331BS (Walmart Exclusive)
12 Bonavita BV1901TS 8-Cup One-Touch
13 Braun Brew Sense KF7150BK
14 Braun BrewSense 12-cup Programmable KF7150
15 Braun BrewSense 12-cup Programmable KF7000BK
...and so on.
Why is that: find(...) returns only the first object that matches your criteria.
Solution: use the find_all(...) method instead.
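Applied to the classes from the question, a minimal sketch (assuming the page structure is unchanged; the exact-string class match works the same way it did with find()):

import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get("https://www.consumerreports.org/cro/coffee-makers.htm").content, "html.parser")
product = soup.find('div', class_="row product-type-container")
# find() stops at the first matching container; find_all() returns every match
for pclass in product.find_all('div', class_="product-type-item col-xs-4"):
    print(pclass.find('div', class_="product-type-info-container").h3.text)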
I'm trying to make a simple eBay scraper with Python; the problem is that I can't use the DataFrame I make.
My code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.ebay.es/sch/i.html?_from=R40&_nkw=iphone&_sacat=0&LH_TitleDesc=0&_fsrp=1&Modelo=Apple%2520iPhone%2520X&_dcat=9355"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

productslist = []
results = soup.find_all('div', {'class': 's-item__info clearfix'})
print(len(results))
for item in results:
    product = {
        'title': item.find('h3', {'class': 's-item__title'}),
        'soldprice': item.find('span', {'class': 's-item__price'})
    }
    productslist.append(product)

df = pd.DataFrame(productslist)
df
But the DataFrame I get looks like this:
[Dataframe screenshot]
I would like to work with the price numbers, but I can't use them; I imagine it's because of dtype: object. I would like to know how to convert, for example, [359,00 EUR] into 359,00 so I can make charts.
Thanks.
To remove the "price" from the [...] format, use the .text attribute. Also, make sure that the price is not None:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.ebay.es/sch/i.html?_from=R40&_nkw=iphone&_sacat=0&LH_TitleDesc=0&_fsrp=1&Modelo=Apple%2520iPhone%2520X&_dcat=9355"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

productslist = []
results = soup.find_all("div", {"class": "s-item__info clearfix"})
for item in results:
    title = item.find("h3", {"class": "s-item__title"})
    price = item.find("span", {"class": "s-item__price"})
    if price is None:
        continue
    productslist.append({"title": title.text, "soldprice": price.text})

df = pd.DataFrame(productslist)
print(df)
Output:
title soldprice
0 APPLE IPHONE X 64 GB A+LIBRE+FACTURA+8 ACCESOR... 359,00 EUR
1 Apple iPhone X - 64GB - Plata (Libre) 177,50 EUR
2 Apple iPhone X - 64GB - Blanco (Libre) 181,50 EUR
3 iphone x 64gb 240,50 EUR
4 Iphone x 256 gb 370,00 EUR
5 Apple iPhone X - 256GB - Space Gray (Libre) 400,00 EUR
6 Nuevo anuncioSMARTPHONE APPLE IPHONE X 64GB LI... 334,95 EUR
...
...
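To get from those strings to usable numbers (the asker's actual goal), the soldprice column can be cleaned up afterwards. A sketch, assuming single prices (no ranges) that end in " EUR" and use a decimal comma; price_num is a name I made up:

df["price_num"] = (
    df["soldprice"]
    .str.replace(" EUR", "", regex=False)
    .str.replace(".", "", regex=False)   # drop thousands separators
    .str.replace(",", ".", regex=False)  # decimal comma -> decimal point
    .astype(float)
)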
I am trying to extract the star rating of each review into a dataframe for sentiment analysis.
https://www.mouthshut.com/product-reviews/Kotak-811-Mobile-Banking-reviews-925917218
This is the webpage I am trying to scrape. I am fairly new to web scraping, so I prefer Beautiful Soup as it is easier to understand.
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = ""
Final = []
for x in range(0, 8):
    if x == 1:
        URL = "https://www.mouthshut.com/product-reviews/Kotak-811-Mobile-Banking-reviews-925917218"
    else:
        URL = "https://www.mouthshut.com/product-reviews/Kotak-811-Mobile-Banking-reviews-925917218-page-{}".format(x)
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html.parser')
    reviews = []  # a list to store reviews
    # Use a CSS selector to extract all the review containers
    review_divs = soup.select('div.col-10.review')
    for element in review_divs:
        review = {'Review_Title': element.a.text,
                  'URL': element.a['href'],
                  'Review': element.find('div', {'class': ['more', 'reviewdata']}).text.strip()}
        reviews.append(review)
    Final.extend(reviews)

df = pd.DataFrame(Final)
I would really appreciate the help.
Thank you.
You may add the following entry to your review dictionary to count the stars given under class="rating":
'Stars': len(element.find('div', "rating").findAll("i", "rated-star"))
Review_Title ... Stars
0 Why need permission for contact, gallery ... 1
1 Very dull marketing for open account ... 1
2 Worst bank ... 1
3 Good interface & can be easily accessible ... 3
4 Best digital Bank account ... 4
5 Better account for everyone ... 4
6 Feature full Mobile banking ... 5
7 Very good bank ... 4
8 Above average online banking experience ... 3
...
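For reference, here is how the entry slots into the scraping loop from the question (same selectors as above, nothing new besides the 'Stars' key):

for element in review_divs:
    review = {'Review_Title': element.a.text,
              'URL': element.a['href'],
              'Review': element.find('div', {'class': ['more', 'reviewdata']}).text.strip(),
              'Stars': len(element.find('div', "rating").findAll("i", "rated-star"))}
    reviews.append(review)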
I'm new to programming and Python. I'm adapting code (https://github.com/rileypredum/East-Bay-Housing-Web-Scrape/blob/master/EB_Room_Prices.ipynb) to scrape Craigslist. My goal is to retrieve and store all the automotive posts in Chicago. I am able to store the post title, post time, price, and neighborhood. My next goal is to create a new column with only the make of the vehicle, i.e. Toyota, Nissan, Honda, etc., by searching the post title. How do I do this?
I believe the place to add this logic is at "In [13]": a variable post_make that searches post_title.
#build out the loop
from time import sleep
from random import randint
from warnings import warn
from time import time
from requests import get          # `get` was not imported in the snippet as posted
from bs4 import BeautifulSoup     # needed for the parsing below
from IPython.core.display import clear_output
import numpy as np

#find the total number of posts to find the limit of the pagination
results_num = html_soup.find('div', class_= 'search-legend')  # html_soup comes from an earlier notebook cell
results_total = int(results_num.find('span', class_='totalcount').text)
pages = np.arange(0, results_total, 120)

iterations = 0
post_timing = []
post_hoods = []
post_title_texts = []
post_links = []
post_prices = []

for page in pages:
    #get request
    response = get("https://sfbay.craigslist.org/search/eby/roo?"
                   + "s="
                   + str(page)
                   + "&hasPic=1"
                   + "&availabilityMode=0")
    sleep(randint(1,5))

    #throw warning for status codes that are not 200
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(response.url, response.status_code))

    #define the html text
    page_html = BeautifulSoup(response.text, 'html.parser')

    #define the posts (the original used html_soup here, which re-parses the first page every time)
    posts = page_html.find_all('li', class_= 'result-row')

    #extract data item-wise
    for post in posts:
        if post.find('span', class_ = 'result-hood') is not None:
            #posting date
            #grab the datetime element 0 for date and 1 for time
            post_datetime = post.find('time', class_= 'result-date')['datetime']
            post_timing.append(post_datetime)

            #neighborhoods
            post_hood = post.find('span', class_= 'result-hood').text
            post_hoods.append(post_hood)

            #title text
            post_title = post.find('a', class_='result-title hdrlnk')
            post_title_text = post_title.text
            post_title_texts.append(post_title_text)

            #post link
            post_link = post_title['href']
            post_links.append(post_link)

            post_price = post.a.text
            post_prices.append(post_price)

    iterations += 1
    print("Finished iteration: " + str(iterations))
Trying to figure out how to show the output.
Current output in Excel is:
posted, neighborhood, post title, url, price
My goal is to add "post make" after the price.
I'm also looking for advice on how to show output from Jupyter notebooks here.
It's rather tricky to pull that out. I gave it a shot using another package, spaCy, to try to pull out the entities that are linked to organisations/car companies. It's not perfect, but it's a start:
Code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import spacy

nlp = spacy.load("en_core_web_sm")

req_url = 'https://chicago.craigslist.org/search/cta'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Mobile Safari/537.36'}
payload = {
    's': '0',
    'query': 'automotive',
    'sort': 'rel'}

response = requests.get(req_url, headers=headers, params=payload)
soup = BeautifulSoup(response.text, 'html.parser')

total_posts = int(soup.find('span', {'class':'totalcount'}).text)
pages = list(range(0, total_posts, 120))

iterations = 0
post_timing = []
post_hoods = []
post_title_texts = []
post_links = []
post_prices = []
post_makes = []
post_models = []

for page in pages:
    payload = {
        's': page,
        'query': 'automotive',
        'sort': 'rel'}

    response = requests.get(req_url, headers=headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    posts = soup.find_all('li', class_= 'result-row')

    #extract data item-wise
    for post in posts:
        if post.find('span', class_ = 'result-hood') is not None:
            #posting date
            #grab the datetime element 0 for date and 1 for time
            post_datetime = post.find('time', class_= 'result-date')['datetime']
            post_timing.append(post_datetime)

            #neighborhoods
            post_hood = post.find('span', class_= 'result-hood').text
            post_hoods.append(post_hood)

            #title text
            post_title = post.find('a', class_='result-title hdrlnk')
            post_title_text = post_title.text
            post_title_texts.append(post_title_text)

            #post link
            post_link = post_title['href']
            post_links.append(post_link)

            post_price = post.a.text.strip()
            post_prices.append(post_price)

            try:
                # Used spaCy and Named Entity Recognition (NER) to pull out makes/models within the title text
                post_title_text = post_title_text.replace('*', ' ')
                post_title_text = [each.strip() for each in post_title_text.split(' ') if each.strip() != '']
                post_title_text = ' '.join(post_title_text)

                doc = nlp(post_title_text)
                model = [ent.text for ent in doc.ents if ent.label_ == 'PRODUCT']

                make_model_list = [token.text for token in doc if token.tag_ == 'NNP']
                doc = nlp(' '.join(make_model_list))
                make = [ent.text for ent in doc.ents if ent.label_ == 'ORG']

                post_make = make[0]
                post_makes.append(post_make)

                post_model = model[0]
                post_models.append(post_model)
            except:
                post_makes.append('')
                post_models.append('')

    iterations += 1
    print("Finished iteration: " + str(iterations))

data = list(zip(post_timing, post_hoods, post_title_texts, post_links, post_prices, post_makes, post_models))
df = pd.DataFrame(list(zip(post_timing, post_hoods, post_title_texts, post_links, post_prices, post_makes, post_models)),
                  columns = ['time', 'hood', 'title', 'link', 'price', 'make', 'model'])
Output:
print (df.head(20).to_string())
time hood title link price make model
0 2019-10-03 07:12 (TEXT 855-976-4304 FOR CUSTOM PAYMENT) 2015 Ford Focus SE Sedan 4D sedan Dk. Gray - F... https://chicago.craigslist.org/chc/ctd/d/chica... $11500 Ford Focus SE
1 2019-10-03 06:03 (EVERYBODY DRIVES IN SOUTH ELGIN) $174/mo [][][] 2013 Hyundai Sonata BAD CREDIT OK https://chicago.craigslist.org/nwc/ctd/d/south... $174 Sonata BAD
2 2019-10-03 00:04 (EVERYBODY DRIVES IN SOUTH ELGIN) $658/mo [][][] 2016 Jeep Grand Cherokee BAD CR... https://chicago.craigslist.org/nwc/ctd/d/south... $658 Hyundai
3 2019-10-02 21:04 (EVERYBODY DRIVES IN SOUTH ELGIN) $203/mo [][][] 2010 Chevrolet Traverse BAD CRE... https://chicago.craigslist.org/nwc/ctd/d/south... $203 Jeep Grand Cherokee BAD Traverse BAD
4 2019-10-02 20:24 (DENVER) 2017 Jeep Cherokee Latitude 4x4 4dr SUV SKU:60... https://chicago.craigslist.org/chc/ctd/d/denve... $8995 Cherokee
5 2019-10-02 20:03 ( Buy Here Pay Here!) Good Credit, Bad Credit, NO Credit = NO Problem https://chicago.craigslist.org/nwc/ctd/d/chica... $0 Chevrolet
6 2019-10-02 20:03 ( Buy Here Pay Here!) Aceptamos Matricula!!! Te pagan en efectivo?? ... https://chicago.craigslist.org/wcl/ctd/d/chica... $0 Jeep
7 2019-10-02 20:02 ( Buy Here Pay Here!) Good Credit, Bad Credit, No Credit = No Problem https://chicago.craigslist.org/chc/ctd/d/vista... $0 Credit Bad Credit
8 2019-10-02 20:00 ( Buy Here Pay Here!) Good Credit, Bad Credit, No Credit= No Problem https://chicago.craigslist.org/sox/ctd/d/chica... $0
9 2019-10-02 19:15 (* CHRYSLER * TOWN AND COUNTRY * WWW.YOURCHOI... 2013*CHRYSLER*TOWN & COUNTRY*TOURING LEATHER K... https://chicago.craigslist.org/nwc/ctd/d/2013c... $9499
10 2019-10-02 19:09 (*CADILLAC* *DTS* WWW.YOURCHOICEAUTOS.COM) 2008*CADILLAC*DTS*1OWNER LEATHER SUNROOF NAVI ... https://chicago.craigslist.org/sox/ctd/d/2008c... $5999 Credit Bad Credit
11 2019-10-02 18:59 (WAUKEGANAUTOAUCTION.COM OPEN TO PUBLIC OVER ... 2001 *GMC**YUKON* XL DENALI AWD 6.0L V8 1OWNER... https://chicago.craigslist.org/nch/ctd/d/2001-... $1200
12 2019-10-02 18:47 (*GMC *SAVANA *CARGO* WWW.YOURCHOICEAUTOS.COM) 1999 *GMC *SAVANA *CARGO*G2500 SHELVES CABINET... https://chicago.craigslist.org/sox/ctd/d/1999-... $2999 Credit Bad Credit
13 2019-10-02 18:04 ( Buy Here Pay Here!) GoodCredit, Bad Credit, No credit = No Problem https://chicago.craigslist.org/nwc/ctd/d/chica... $0
14 2019-10-02 18:05 ( Buy Here Pay Here!) Rebuild your credit today!!! https://chicago.craigslist.org/sox/ctd/d/chica... $0 CHRYSLER
15 2019-10-02 18:03 ( Buy Here Pay Here!) Rebuild your credit today!!! Repo? No Problem!... https://chicago.craigslist.org/chc/ctd/d/vista... $0
16 2019-10-02 17:59 (* ACURA * TL * WWW.YOURCHOICEAUTOS.COM) 2006 *ACURA**TL* LEATHER SUNROOF CD KEYLES ALL... https://chicago.craigslist.org/sox/ctd/d/2006-... $4499
17 2019-10-02 18:00 ( Buy Here Pay Here!) Buy Here Pay Here!!! We Make it Happen!! Bad C... https://chicago.craigslist.org/wcl/ctd/d/chica... $0
18 2019-10-02 17:35 (ST JOHN) 2009 NISSAN VERSA https://chicago.craigslist.org/nwi/ctd/d/saint... $4995
19 2019-10-02 17:33 (DENVER) 2013 Scion tC Base 2dr Coupe 6M SKU:065744 Sci... https://chicago.craigslist.org/chc/ctd/d/denve... $5995 GoodCredit Bad Credit
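A simpler alternative, if NER turns out to be overkill: match the title against a hand-maintained list of makes. A sketch (the list below covers only makes seen in this output and would need extending; make_keyword is a column name I made up):

# case-insensitive search for the first known make in each title
KNOWN_MAKES = ['Toyota', 'Nissan', 'Honda', 'Ford', 'Hyundai', 'Jeep',
               'Chevrolet', 'GMC', 'Cadillac', 'Chrysler', 'Acura', 'Scion']

def find_make(title):
    title_upper = title.upper()
    for make in KNOWN_MAKES:
        if make.upper() in title_upper:
            return make
    return ''

df['make_keyword'] = df['title'].apply(find_make)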
So far my code can scrape the number of items on sale in the Charms category, but I cannot make it print out the name of the category.
The site uses an infinite scroller, but I managed to identify where the pages are, so the site URL contains {}, which is filled in by the while loop.
import requests
from bs4 import BeautifulSoup

url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"

def fetch_items(link,page):
    Total_items = 0
    while page<=1000:
        #print("current page no: ",page)
        res = requests.get(link.format(page),headers={"User-Agent":"Mozilla/5.0"})
        soup = BeautifulSoup(res.text,"lxml")
        list_total = soup.select('.grid-tile .price-standard')
        Total_items += len(list_total)
        #print(Total_items)
        page+=30
    category_tags = soup.select('span.breadcrumb-element')
    return Total_items
    return category_tags

if __name__ == '__main__':
    page = 0
    product_list = []
    total_items = fetch_items(url,page)
    #print number of items on sale
    print(total_items)
    print(category_tags)
Here's what I need:
I need to print out the category of the scraped items, which can be found using this line:
category_tags = soup.select('span.breadcrumb-element')
But I cannot get it to print.
While we're at it, how can I make the code print out ALL the items and not just the items on sale?
Thank you.
EDIT:
Building on one of the answers, I ended up with this:
import requests
from bs4 import BeautifulSoup
import re
url1 = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"
url2 = "https://us.pandora.net/en/bracelets/?sz=30&start={}&format=page-element"
url3 = "https://us.pandora.net/en/rings/?sz=30&start={}&format=page-element"
url4 = "https://us.pandora.net/en/necklaces/?sz=30&start={}&format=page-element"
url5 = "https://us.pandora.net/en/earrings/?sz=30&start={}&format=page-element"
#res = requests.get(link.format(url1),headers={"User-Agent":"Mozilla/5.0"})
soup1 = BeautifulSoup(requests.get(url1.format(0)).text, 'lxml')
soup2 = BeautifulSoup(requests.get(url2.format(0)).text, 'lxml')
soup3 = BeautifulSoup(requests.get(url3.format(0)).text, 'lxml')
soup4 = BeautifulSoup(requests.get(url4.format(0)).text, 'lxml')
soup5 = BeautifulSoup(requests.get(url5.format(0)).text, 'lxml')
total_items1 = ''.join(re.findall(r'\d', soup1.select_one('span.products-count').text))
total_items2 = ''.join(re.findall(r'\d', soup2.select_one('span.products-count').text))
total_items3 = ''.join(re.findall(r'\d', soup3.select_one('span.products-count').text))
total_items4 = ''.join(re.findall(r'\d', soup4.select_one('span.products-count').text))
total_items5 = ''.join(re.findall(r'\d', soup5.select_one('span.products-count').text))
#categories = [tag['title'].strip() for tag in soup.select('.refinement-link[title]')
#total_items_sale1 = ''.join(re.findall(r'\d', soup1.select_one('.grid-tile .price-standard')))
#total_items_sale1
#total_items_sale1
#total_items_sale1
#total_items_sale1
#print('Categories:')
#for category in categories:
#print('\t{}'.format(category))
print('\nTotal Charms: {}'.format(total_items1))
print('\nTotal Bracelets: {}'.format(total_items2))
print('\nTotal Rings: {}'.format(total_items3))
print('\nTotal Necklaces: {}'.format(total_items4))
print('\nTotal Earrings: {}'.format(total_items5))
I know it looks horrible. How can we shorten it?
Looking at the result from the server, you don't have to loop through all the pages; all the info you need is on one page:
import requests
from bs4 import BeautifulSoup

url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"
sale_url = "https://us.pandora.net/en/sale/sale-charms/?sz=30&start={}&format=page-element"

soup = BeautifulSoup(requests.get(url.format(0)).text, 'lxml')
sale_soup = BeautifulSoup(requests.get(sale_url.format(0)).text, 'lxml')

total_items = soup.select_one('#products_count')['value']
total_sale_items = sale_soup.select_one('#products_count')['value']

categories = [tag['title'].strip() for tag in soup.select('.refinement-link[title]')]

print('Categories:')
for category in categories:
    print('\t{}'.format(category))

print('\nTotal items: {}'.format(total_items))
print('Total sale items: {}'.format(total_sale_items))
Prints:
Categories:
Charms
New Arrivals
Best Sellers
Clips
Spacers
Dangles
Safety Chains
Alphabet & Symbols
Animals & Pets
Birthday
Touch of Color
Disney
Family
Holidays
Christmas
Inspirational
Symbols of Love
Nature
Passions
Vacation & Travel
Wedding & Anniversary
Last Chance
Pandora Reflexions™
$0 - $50
$50 - $100
$100 - $150
$150 & Over
Charms
New Arrivals
Best Sellers
Clips
Spacers
Dangles
Safety Chains
Alphabet & Symbols
Animals & Pets
Birthday
Touch of Color
Disney
Family
Holidays
Christmas
Inspirational
Symbols of Love
Nature
Passions
Vacation & Travel
Wedding & Anniversary
Last Chance
Pandora Reflexions™
Total items: 959
Total sale items: 376
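The repetition in the asker's EDIT can be collapsed the same way, with one loop over the category slugs from the URLs. A sketch reusing the #products_count element above (assuming every category page exposes it):

base = "https://us.pandora.net/en/{}/?sz=30&start=0&format=page-element"
for cat in ['charms', 'bracelets', 'rings', 'necklaces', 'earrings']:
    cat_soup = BeautifulSoup(requests.get(base.format(cat)).text, 'lxml')
    print('Total {}: {}'.format(cat.capitalize(), cat_soup.select_one('#products_count')['value']))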
You can't have two returns there. The function stops after the first return, so if you want to return multiple objects, put them in one return statement. You also need to append to the list within the loop; you have that outside of your loop. Note: I changed the limit from 1000 to 300 just to test.
Secondly, I think what you want is the text.
To print all the items, you'll need to get each item, not just the ones with 'price-standard':
import requests
from bs4 import BeautifulSoup

url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"

def fetch_items(link,page):
    Total_items = 0
    categories = []
    while page<=300:
        #print("current page no: ",page)
        res = requests.get(link.format(page),headers={"User-Agent":"Mozilla/5.0"})
        soup = BeautifulSoup(res.text,"lxml")
        list_total = soup.select('.grid-tile .price-standard')
        Total_items += len(list_total)
        #print(Total_items)
        page+=30
        print(page)
        category_tags = soup.select('span.breadcrumb-element')[0]
        try:
            categories.append(category_tags.text)
        except:
            categories.append('N/A')
    return Total_items, categories

page = 0
total_items = fetch_items(url,page)

#print number of items on sale
print(total_items[0])
print(total_items[1])
Here's how you can go about getting all the products:
def fetch_items(link,page):
    Total_items = 0
    names = []
    categories = []
    prices = []
    sales = []
    while page<=300:
        res = requests.get(link.format(page),headers={"User-Agent":"Mozilla/5.0"})
        soup = BeautifulSoup(res.text,"lxml")
        products = soup.find_all("li", class_=lambda value: value and value.startswith("grid-tile"))
        for each in products:
            Total_items += 1
            category = each.find('div', {'class':'product-tile'})['data-cgid']
            name = each.find('div', {'class':'product-name'}).text.strip()
            price = each.find('div', {'class':'product-pricing'}).text.strip()
            sale_price = each.find('span', {'class':'price-sales'}).text.strip()
            names.append(name)
            categories.append(category)
            prices.append(price)
            sales.append(sale_price)
        print(page)
        page+=30
    return Total_items, names, categories, prices, sales

results = fetch_items(url,page)
Not sure how you want those results, though, but you can dump them into a table if you'd like:
import pandas as pd

df = pd.DataFrame(
    {'name': results[1],
     'category': results[2],
     'price': results[3],
     'sale': results[4]})
Output:
print (df.head(10).to_string())
name category price sale
0 American Icons Dangle Charm charms $60.00 $60.00
1 Disney Pixar, Toy Story, Buzz Lightyear Dangle... charms $70.00 $70.00
2 Disney Pixar, Toy Story, Woody Dangle Charm charms $60.00 $60.00
3 Spinning Globe Dangle Charm charms $60.00 $60.00
4 Elephant Charm charms $45.00 $45.00
5 Canada Dangle Charm, Pandora Rose™ charms $65.00 $65.00
6 Sparkling Monkey Charm charms $70.00 $70.00
7 Propeller Plane Dangle Charm charms $55.00 $55.00
8 Spotted Heart Charm charms $50.00 $50.00
9 Pink Travel Bag Charm charms $50.00 $50.00