Why does my scraper stop at the 12th image? - python

So I have to scrape all the products from this website's shop (https://bewellstore.ro/shop/), but my code stops at the 12th photo. I have made a version for websites with multiple shop pages, where I go through them all in a for loop, but since this shop has only one page I thought that wasn't necessary.
Any idea why my code stops at the 12th product?
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

folder = 'beWell_images'
os.makedirs(folder, exist_ok=True)
root_folder = os.getcwd()

baseurl = 'https://bewellstore.ro/shop/'

# an array for all the product links
product_links = []

# going through all the pages of the shop
url = 'https://bewellstore.ro/shop/'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
product_list = soup.find_all('div', class_='loop-product-inner')
print(product_list)

# taking all the links to each product page
for item in product_list:
    for link in item.find_all('a', href=True, class_='woocommerce-LoopProduct-link woocommerce-loop-product__link'):
        product_links.append(link['href'])
# appending the links previously taken to the array
print(product_links)

product_items_list = []
i = 0
d = {}  # use as set()

os.chdir(folder)

for link_test in product_links:
    r = requests.get(link_test)
    soup = BeautifulSoup(r.content, 'lxml')

    title = soup.find('h1', class_='product_title').text.strip()
    price = soup.find('p', class_='price').text.strip()
    header = soup.find('div', class_='woocommerce-product-details__short-description').text.strip()
    sku = soup.find('span', class_='sku').text.strip()
    categories = soup.find('div', class_='posted_in').text.strip()
    description = soup.find('div', class_='cell large-6').text.strip()
    brand = soup.find('div', class_='tabs-panel').text.strip()

    images = soup.select('.wp-post-image')

    # --- before `for`-loop ---
    downloaded = []

    # --- `for`-loop ---
    for image in images:
        link = image['src']

        if link in d:
            name = d[link]
            downloaded.append(name)
        else:
            i += 1
            name = str(i) + 'img.jpg'
            d[link] = name

            print('link:', link)
            print('name:', name)
            print('---')

            # here i am adding the .jpg and saving the images
            with open(name, 'wb') as f:
                im = requests.get(link)
                # print("DEBUG FOLLOWS: {}".format(im))
                f.write(im.content)

            downloaded.append(name)

    # --- after `for`-loop ---
    # storing all the infos about this product
    img_str = ''
    if len(downloaded) > 1:
        for index, img in enumerate(downloaded):
            if index == len(downloaded) - 1:
                img_str = img_str + img
            else:
                img_str = img_str + img + '/'
    else:
        img_str = downloaded[0]

    product = {
        'sku': sku,
        'base_image': img_str,
        'small_image': img_str,
        'thumbnail_image': img_str,
        'additional_images': img_str,
        'product_type': 'simple',
        'attribute_set_code': 'Default',
        'categories': categories.replace('Categorii: ', '').replace(', ', '/'),
        'name': title,
        'description': description,
        'short_description': header,
        'price': price[0:5]
    }

    product_items_list.append(product)

os.chdir(root_folder)
# os.chdir('output')
df = pd.DataFrame(product_items_list)
print(df)
df.to_csv('beWell.csv', index=False)

That's because this webpage uses pagination (with 12 products per page) and each further batch of products is loaded only when you scroll. You would have to use Selenium to scroll the page (a minimal sketch of that approach is below).
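Sketch of the Selenium route, assuming selenium and a matching ChromeDriver are installed; the scroll count and delay are arbitrary guesses, not values from the original post:
import time
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get('https://bewellstore.ro/shop/')

# scroll to the bottom a few times so the lazy-loaded products get rendered
for _ in range(10):  # arbitrary number of scrolls; increase if products are still missing
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)  # give the next batch of products time to load

soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()

product_list = soup.find_all('div', class_='loop-product-inner')
print(len(product_list), 'products found')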
But if you only want to use BeautifulSoup, there is a workaround.
The URL for each page looks like this
https://bewellstore.ro/shop/page/<page_no>/
Example:
1st page: https://bewellstore.ro/shop/page/1/
2nd page: https://bewellstore.ro/shop/page/2/
You could make a request to each of the above URLs and scrape your data using BeautifulSoup, as sketched below.
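A minimal sketch of that idea; stopping on a non-200 response or an empty product list is an assumption about how the site behaves past the last page:
import requests
from bs4 import BeautifulSoup

product_links = []
page = 1
while True:
    r = requests.get(f'https://bewellstore.ro/shop/page/{page}/')
    if r.status_code != 200:
        break
    soup = BeautifulSoup(r.content, 'lxml')
    products = soup.find_all('div', class_='loop-product-inner')
    if not products:
        break
    for item in products:
        for link in item.find_all('a', href=True, class_='woocommerce-LoopProduct-link woocommerce-loop-product__link'):
            product_links.append(link['href'])
    page += 1

print(len(product_links), 'product links collected')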

You can try this for all the pages:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
from datetime import datetime

results = []
page_number = 1
product_links = []

headers = {
    'authority': 'bewellstore.ro',
    'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    'sec-ch-ua-platform': '"Linux"',
    'accept': '*/*',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://bewellstore.ro/shop/',
    'accept-language': 'en-US,en;q=0.9',
    'cookie': 'fp_session=new; mc_landing_site=https://bewellstore.ro/shop/; _omappvp=i5rIyW2xsMFKIu3uhQtmFj1TN9jw7aKjO8dgy3SVvWMhAj30NvKFrBXfJLe3dQK6ZdbB4FezbrwFWPGLdKrsj1A1vqN2PRLI; _omappvs=1634795539874; _clck=1f7zptk|1|evr|0; _ga=GA1.2.2117949575.1634795541; _gid=GA1.2.1155690725.1634795541; _fbp=fb.1.1634795541140.1266696245; PHPSESSID=94b6b1996b0b5e831d898c192b4bca06; _clsk=2489zg|1634795542054|1|1|e.clarity.ms/collect; yith_wcwl_session_d235bd7d63b3a120c05ba3c90256789a=%7B%22session_id%22%3A%222e40c31b1503902767c5327edd3cf926%22%2C%22session_expiration%22%3A1637387542%2C%22session_expiring%22%3A1637383942%2C%22cookie_hash%22%3A%2249a81940bd8d39b2f894021c16333e6f%22%7D; omSeen-dwf9rgtvzzrhqylccaag=1634795583943; om-dwf9rgtvzzrhqylccaag=1634795585931; _omra={"dwf9rgtvzzrhqylccaag":"click"}; cookie_notice_accepted=true; ls_smartpush=fdfbe0ffe7800007',
}

# collect product links from every shop page until a page returns a non-200 status
while True:
    response = requests.get(f'https://bewellstore.ro/shop/page/{page_number}/', headers=headers)
    print(response.status_code)
    print(response.url)
    if response.status_code != 200:
        break
    soup = BeautifulSoup(response.content, 'html.parser')
    product_list = soup.find_all('div', class_='loop-product-inner')
    # print(product_list)
    for item in product_list:
        for link in item.find_all('a', href=True, class_='woocommerce-LoopProduct-link woocommerce-loop-product__link'):
            product_links.append(link['href'])
            print('Added link in product_links list :', link['href'])
    page_number += 1

product_items_list = []
i = 0
d = {}

for link_test in product_links:
    r = requests.get(link_test)
    soup = BeautifulSoup(r.content, 'lxml')
    title = soup.find('h1', class_='product_title').text.strip()
    price = soup.find('p', class_='price').text.strip()
    header = soup.find('div', class_='woocommerce-product-details__short-description').text.strip()
    sku = soup.find('span', class_='sku').text.strip()
    categories = soup.find('div', class_='posted_in').text.strip()
    description = soup.find('div', class_='cell large-6').text.strip()
    brand = soup.find('div', class_='tabs-panel').text.strip()
    images = soup.select('.wp-post-image')

    downloaded = []
    for image in images:
        link = image['src']
        if link in d:
            name = d[link]
            downloaded.append(name)
        else:
            i += 1
            name = str(i) + 'img.jpg'
            d[link] = name
            print('link:', link)
            print('name:', name)
            print('---')
            # here i am adding the .jpg and saving the images
            with open(name, 'wb') as f:
                im = requests.get(link)
                # print("DEBUG FOLLOWS: {}".format(im))
                f.write(im.content)
            downloaded.append(name)

    img_str = ''
    if len(downloaded) > 1:
        for index, img in enumerate(downloaded):
            if index == len(downloaded) - 1:
                img_str = img_str + img
            else:
                img_str = img_str + img + '/'
    else:
        img_str = downloaded[0]

    product = {
        'sku': sku,
        'base_image': img_str,
        'small_image': img_str,
        'thumbnail_image': img_str,
        'additional_images': img_str,
        'product_type': 'simple',
        'attribute_set_code': 'Default',
        'categories': categories.replace('Categorii: ', '').replace(', ', '/'),
        'name': title,
        'description': description,
        'short_description': header,
        'price': price[0:5]
    }
    product_items_list.append(product)

df = pd.DataFrame(product_items_list)
print(df)
df.to_csv('beWell.csv', index=False)

Related

Pandas to_csv only writes the data from a certain page

I tried to scrape data from Tripadvisor across several pages, but when I try to export it to CSV it only shows 1 line of data and gives an error message like this:
AttributeError: 'NoneType' object has no attribute 'text'
This is my code:
import requests
import pandas as pd
from requests import get
from bs4 import BeautifulSoup

URL = 'https://www.tripadvisor.com/Attraction_Review-g469404-d3780963-Reviews-oa'

for offset in range(0, 30, 10):
    url = URL + str(offset) + '-Double_Six_Beach-Seminyak_Kuta_District_Bali.html'
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    container = soup.find_all('div', {'class': '_2rspOqPP'})
    for r in container:
        reviews = r.find_all('div', {'class': None})
        # the container that holds the elements I want to scrape has no attributes,
        # so I access the div with the _2rspOqPP class first, then the attribute-less divs inside it
        records = []
        for review in reviews:
            user = review.find('a', {'class': '_7c6GgQ6n _37QDe3gr WullykOU _3WoyIIcL'}).text
            country = review.find('div', {'class': 'DrjyGw-P _26S7gyB4 NGv7A1lw _2yS548m8 _2cnjB3re _1TAWSgm1 _1Z1zA2gh _2-K8UW3T _1dimhEoy'}).span.text
            date = review.find('div', {'class': '_3JxPDYSx'}).text
            content = review.find('div', {'class': 'DrjyGw-P _26S7gyB4 _2nPM5Opx'}).text
            records.append((user, country, date, content))
    df = pd.DataFrame(records, columns=['Name', 'Country', 'Date', 'Content'])
    df.to_csv('doublesix_.csv', index=False, encoding='utf-8')
Code updated
for r in container:
    reviews = r.find_all('div', {'class': None})
    records = []
    for review in reviews:
        try:
            user = review.find('a', {'class': '_7c6GgQ6n _37QDe3gr WullykOU _3WoyIIcL'}).text
            country = review.find('div', {'class': 'DrjyGw-P _26S7gyB4 NGv7A1lw _2yS548m8 _2cnjB3re _1TAWSgm1 _1Z1zA2gh _2-K8UW3T _1dimhEoy'}).span.text
            date = review.find('div', {'class': '_3JxPDYSx'}).text
            content = review.find('div', {'class': 'DrjyGw-P _26S7gyB4 _2nPM5Opx'}).text
            records.append((user, country, date, content))
        except:
            pass
    print(records)

df = pd.DataFrame(records, columns=['Name', 'Country', 'Date', 'Content'])
df.to_csv('doublesix_.csv', index=False, encoding='utf-8')
You should move the records out of the for loops and unindent the last few lines.
See this:
import pandas as pd
import requests
from bs4 import BeautifulSoup

main_url = 'https://www.tripadvisor.com/Attraction_Review-g469404-d3780963-Reviews-oa'
country_class = "DrjyGw-P _26S7gyB4 NGv7A1lw _2yS548m8 _2cnjB3re _1TAWSgm1 _1Z1zA2gh _2-K8UW3T _1dimhEoy"

records = []
for offset in range(0, 30, 10):
    url = main_url + str(offset) + '-Double_Six_Beach-Seminyak_Kuta_District_Bali.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    }
    soup = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
    container = soup.find_all('div', {'class': '_2rspOqPP'})
    for r in container:
        reviews = r.find_all('div', {'class': None})
        for review in reviews:
            try:
                user = review.find('a', {'class': '_7c6GgQ6n _37QDe3gr WullykOU _3WoyIIcL'}).text
                country = review.find('div', {'class': country_class}).span.text
                date = review.find('div', {'class': '_3JxPDYSx'}).text
                content = review.find('div', {'class': 'DrjyGw-P _26S7gyB4 _2nPM5Opx'}).text
                records.append((user, country, date, content))
            except AttributeError:
                pass

df = pd.DataFrame(records, columns=['Name', 'Country', 'Date', 'Content'])
df.to_csv('doublesix_.csv', index=False, encoding='utf-8')
Output from the .csv file:

beautifulsoup for loop extracts only first page data

I have a txt file with 2 URLs in it:
https://www.kununu.com/de/volkswagen/kommentare
https://www.kununu.com/de/audi/kommentare
I want to extract some data from all pages at those URLs with BeautifulSoup. The code below extracts that data, but only for the first page. I must be missing something; can you update the code so it extracts from all pages?
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

firma = []
lineList2 = [line.rstrip('\n') for line in open(r"C:/myfolder/555.txt")]
print(lineList2)

for url in lineList2:
    with requests.Session() as session:
        session.headers = {
            'x-requested-with': 'XMLHttpRequest'
        }
        page = 1
        while True:
            print(f"Processing page {page}..")
            url = f'{url}/{page}'
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            articles = soup.find_all('article')
            print("Number of articles: " + str(len(articles)))
            for article in articles:
                try:
                    firmaText = article.find('div', text=re.compile(r'Firma')).find_next('div').text.strip()
                    firma.append(firmaText)
                except:
                    firma.append('N/A')
            page += 1
            pagination = soup.find_all('div', {'class': 'paginationControl'})
            if not pagination:
                break

df = pd.DataFrame({
    'Company': firma
})
print(df)
The problem is that url = f'{url}/{page}' overwrites the URL read from the file on every pass, so from the second page on you are appending page numbers to an already modified address. Keep the original URL in its own variable (lurl below) and build each page URL from that:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

firma = []
lineList2 = []
with open('555.txt', 'r') as file:
    lines = file.readlines()
    for line in lines:
        lineList2.append(line.strip('\n'))
print(lineList2)

for lurl in lineList2:
    with requests.Session() as session:
        session.headers = {
            'x-requested-with': 'XMLHttpRequest'
        }
        page = 1
        while True:
            print("in while")
            print(f"Processing page {page}..")
            url = f'{lurl}/{page}'
            print(url)
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            articles = soup.find_all('article')
            print("Number of articles: " + str(len(articles)))
            for article in articles:
                try:
                    firmaText = article.find('div', text=re.compile(r'Firma')).find_next('div').text.strip()
                    firma.append(firmaText)
                except:
                    firma.append('N/A')
            page += 1
            pagination = soup.find_all('div', {'class': 'paginationControl'})
            if not pagination:
                break

df = pd.DataFrame({
    'Company': firma
})
print(df)

Cannot web scrape a page which includes pagination and products in grid layout using python

I want to web scrape the following webpage
https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10&pageNumber=1
But I keep getting only part of the URL links: the first 12 from each of the first 2 pages, not the 3rd page, and not the full set of links. I used the following:
initial_url = 'https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10'
caturl = 'https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10&pageNumber={}'

r = requests.get(initial_url)
if r.status_code == 200:
    Myhtml = r.text
    soup = BeautifulSoup(Myhtml, 'html.parser')

    # GETTING THE LAST PAGE
    last_page = soup.find('div', class_='pager').find('li', class_='next-page').a['href'].split('=')[1]

    # GETTING THE PAGE URL LINKS
    dept_page_url = [caturl.format(i) for i in range(1, int(last_page) + 1)]
    time.sleep(2)

for pageurl in dept_page_url:
    r = requests.get(pageurl)
    if r.status_code == 200:
        Myhtml = r.text
        soup = BeautifulSoup(Myhtml, 'html.parser')

        # GETTING THE PRODUCT LINKS
        productlist = soup.find('div', attrs={'class': 'item-grid'})
        atagslist = productlist.findAll('a', href=True)

        links_with_text = []
        final_links = []
        for a in atagslist:
            if a.text:
                mlink = a['href']
                if mlink != '#':
                    links_with_text.append(infodomain + mlink)

        # DELETE DUPLICATES
        links_with_text = list(dict.fromkeys(links_with_text))
        links_with_text.extend(links_with_text)
How can I get all the URL links?
You could mimic the POST request the page makes and use an exit condition based on whether the next-page element is present:
import requests
from bs4 import BeautifulSoup as bs

headers = {
    'user-agent': 'Mozilla/5.0',
    'content-type': 'application/json; charset=UTF-8',
    'authority': 'www.websupplies.gr',
    'x-requested-with': 'XMLHttpRequest'
}

links = []
page = 1

with requests.Session() as s:
    while True:
        data = ('{"categoryId":"405","manufacturerId":"0","vendorId":"0","priceRangeFilterModel7Spikes":{"CategoryId":"405","ManufacturerId":"0","VendorId":"0","SelectedPriceRange":{},"MinPrice":"204","MaxPrice":"3850"},"specificationFiltersModel7Spikes":{"CategoryId":"405","ManufacturerId":"0","VendorId":"0","SpecificationFilterGroups":[{"Id":"658","FilterItems":[{"Id":"4821","FilterItemState":"Unchecked"},{"Id":"1969","FilterItemState":"Unchecked"},{"Id":"4394","FilterItemState":"Unchecked"},{"Id":"1971","FilterItemState":"Unchecked"},{"Id":"5459","FilterItemState":"Unchecked"},{"Id":"1953","FilterItemState":"Unchecked"},{"Id":"1962","FilterItemState":"Unchecked"},{"Id":"1963","FilterItemState":"Unchecked"}]},{"Id":"900","FilterItems":[{"Id":"2503","FilterItemState":"Unchecked"},{"Id":"2504","FilterItemState":"Unchecked"},{"Id":"2505","FilterItemState":"Unchecked"}]},{"Id":"944","FilterItems":[{"Id":"2715","FilterItemState":"Unchecked"},{"Id":"2714","FilterItemState":"Unchecked"}]},{"Id":"980","FilterItems":[{"Id":"2994","FilterItemState":"Unchecked"},{"Id":"2835","FilterItemState":"Unchecked"},{"Id":"2836","FilterItemState":"Unchecked"},{"Id":"4381","FilterItemState":"Unchecked"}]},{"Id":"988","FilterItems":[{"Id":"2882","FilterItemState":"Unchecked"},{"Id":"2883","FilterItemState":"Unchecked"},{"Id":"2989","FilterItemState":"Unchecked"}]},{"Id":"901","FilterItems":[{"Id":"2520","FilterItemState":"Unchecked"},{"Id":"2521","FilterItemState":"Unchecked"},{"Id":"2512","FilterItemState":"Unchecked"},{"Id":"2611","FilterItemState":"Unchecked"},{"Id":"2513","FilterItemState":"Unchecked"},{"Id":"5995","FilterItemState":"Unchecked"},{"Id":"2970","FilterItemState":"Unchecked"},{"Id":"2530","FilterItemState":"Unchecked"},{"Id":"5996","FilterItemState":"Unchecked"}]},{"Id":"986","FilterItems":[{"Id":"2971","FilterItemState":"Unchecked"},{"Id":"2872","FilterItemState":"Unchecked"},{"Id":"2871","FilterItemState":"Unchecked"},{"Id":"4995","FilterItemState":"Unchecked"},{"Id":"5009","FilterItemState":"Unchecked"}]},{"Id":"761","FilterItems":[{"Id":"4358","FilterItemState":"Unchecked"},{"Id":"4359","FilterItemState":"Unchecked"},{"Id":"4361","FilterItemState":"Unchecked"},{"Id":"5460","FilterItemState":"Unchecked"},{"Id":"4362","FilterItemState":"Unchecked"},{"Id":"4822","FilterItemState":"Unchecked"},{"Id":"4371","FilterItemState":"Unchecked"}]},{"Id":"917","FilterItems":[{"Id":"4826","FilterItemState":"Unchecked"},{"Id":"4825","FilterItemState":"Unchecked"},{"Id":"5357","FilterItemState":"Unchecked"},{"Id":"4827","FilterItemState":"Unchecked"},{"Id":"5345","FilterItemState":"Unchecked"},{"Id":"4828","FilterItemState":"Unchecked"}]},{"Id":"911","FilterItems":[{"Id":"4843","FilterItemState":"Unchecked"},{"Id":"4845","FilterItemState":"Unchecked"},{"Id":"4850","FilterItemState":"Unchecked"},{"Id":"4851","FilterItemState":"Unchecked"},{"Id":"5891","FilterItemState":"Unchecked"},{"Id":"5892","FilterItemState":"Unchecked"},{"Id":"5291","FilterItemState":"Unchecked"},{"Id":"6011","FilterItemState":"Unchecked"},{"Id":"6552","FilterItemState":"Unchecked"},{"Id":"6949","FilterItemState":"Unchecked"}]}]},"attributeFiltersModel7Spikes":null,"manufacturerFiltersModel7Spikes":{"CategoryId":"405","ManufacturerFilterItems":[{"Id":"268","FilterItemState":"Unchecked"},{"Id":"63","FilterItemState":"Unchecked"},{"Id":"191","FilterItemState":"Unchecked"},{"Id":"9","FilterItemState":"Unchecked"},{"Id":"330","FilterItemState":"Unchecked"},{"Id":"5","FilterItemState":"Unchecked"}]},"vendorFiltersModel7Spikes":null,"pageNumber":"'
                + str(page) +
                '","orderby":"10","viewmode":"list","pagesize":"48","queryString":"","shouldNotStartFromFirstPage":true,"onSaleFilterModel":null,"keyword":"","searchCategoryId":"0","searchManufacturerId":"0","priceFrom":"","priceTo":"","includeSubcategories":"False","searchInProductDescriptions":"False","advancedSearch":"False","isOnSearchPage":"False"}')
        r = s.post('https://www.websupplies.gr/getFilteredProducts', headers=headers, data=data)
        soup = bs(r.content, 'lxml')
        links.append([item['href'] for item in soup.select('.product-title a')])
        page += 1
        if soup.select_one('.next-page') is None:
            break

base = 'https://www.websupplies.gr'
final_list = {base + item for i in links for item in i}

Newbie: Python "AttributeError: 'NoneType' object has no attribute 'text' " when scraping Tripadvisor Reviews

I am trying to scrape some Tripadvisor reviews as a complete newbie to this.
I'm using code from Susanli2016.
It worked (after removing the "language" attribute) for one link, but it doesn't work for any other link (for example).
I'm receiving the error:
> Traceback (most recent call last):
> File "<pyshell#27>", line 4, in <module>
> items = scrape(url)
> File "<pyshell#12>", line 11, in scrape
> items = parse(session, url + '?filterLang=' + lang)
> File "<pyshell#15>", line 12, in parse
> num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
> AttributeError: 'NoneType' object has no attribute 'text'
I'm attaching the code here with the changes I made in case someone can help me.
Thank you so much!
Silvia
--
I substituted the original:
num_reviews = soup.find('span', class_='reviews_header_count').text # get text
with
num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
With the original code I get the error
ValueError: invalid literal for int() with base 10: '5.695'
(where 5.695 is the number of reviews in the page)
--
Here is the complete code:
import requests
from bs4 import BeautifulSoup
import csv
import webbrowser
import io

def display(content, filename='output.html'):
    with open(filename, 'wb') as f:
        f.write(content)
    webbrowser.open(filename)

def get_soup(session, url, show=False):
    r = session.get(url)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200:  # not OK
        print('[get_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def post_soup(session, url, params, show=False):
    '''Read HTML from server and convert to Soup'''
    r = session.post(url, data=params)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200:  # not OK
        print('[post_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def scrape(url, lang='ALL'):
    # create session to keep all cookies (etc.) between requests
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    })
    items = parse(session, url + '?filterLang=' + lang)
    return items

def parse(session, url):
    '''Get number of reviews and start getting subpages with reviews'''
    print('[parse] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse] no soup:', url)
        return
    num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text  # get text
    num_reviews = num_reviews[1:-1]
    num_reviews = num_reviews.replace(',', '')
    num_reviews = int(num_reviews)  # convert text into integer
    print('[parse] num_reviews ALL:', num_reviews)
    url_template = url.replace('.html', '-or{}.html')
    print('[parse] url_template:', url_template)
    items = []
    offset = 0
    while(True):
        subpage_url = url_template.format(offset)
        subpage_items = parse_reviews(session, subpage_url)
        if not subpage_items:
            break
        items += subpage_items
        if len(subpage_items) < 5:
            break
        offset += 5
    return items

def get_reviews_ids(soup):
    items = soup.find_all('div', attrs={'data-reviewid': True})
    if items:
        reviews_ids = [x.attrs['data-reviewid'] for x in items][::2]
        print('[get_reviews_ids] data-reviewid:', reviews_ids)
        return reviews_ids

def get_more(session, reviews_ids):
    url = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'
    payload = {
        'reviews': ','.join(reviews_ids),  # ie. "577882734,577547902,577300887",
        #'contextChoice': 'DETAIL_HR', # ???
        'widgetChoice': 'EXPANDED_HOTEL_REVIEW_HSX',  # ???
        'haveJses': 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
        'haveCsses': 'apg-Hotel_Review-in',
        'Action': 'install',
    }
    soup = post_soup(session, url, payload)
    return soup

def parse_reviews(session, url):
    '''Get all reviews from one page'''
    print('[parse_reviews] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    hotel_name = soup.find('h1', id='HEADING').text
    reviews_ids = get_reviews_ids(soup)
    if not reviews_ids:
        return
    soup = get_more(session, reviews_ids)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    items = []
    for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):
        badgets = review.find_all('span', class_='badgetext')
        if len(badgets) > 0:
            contributions = badgets[0].text
        else:
            contributions = '0'
        if len(badgets) > 1:
            helpful_vote = badgets[1].text
        else:
            helpful_vote = '0'
        user_loc = review.select_one('div.userLoc strong')
        if user_loc:
            user_loc = user_loc.text
        else:
            user_loc = ''
        bubble_rating = review.select_one('span.ui_bubble_rating')['class']
        bubble_rating = bubble_rating[1].split('_')[-1]
        item = {
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='ratingDate')['title'],  # 'ratingDate' instead of 'relativeDate'
        }
        items.append(item)
        print('\n--- review ---\n')
        for key, val in item.items():
            print(' ', key, ':', val)
        print()
    return items

def write_in_csv(items, filename='results.csv',
                 headers=['hotel name', 'review title', 'review body',
                          'review date', 'contributions', 'helpful vote',
                          'user name', 'user location', 'rating'],
                 mode='w'):
    print('--- CSV ---')
    with io.open(filename, mode, encoding="utf-8") as csvfile:
        csv_file = csv.DictWriter(csvfile, headers)
        if mode == 'w':
            csv_file.writeheader()
        csv_file.writerows(items)

DB_COLUMN = 'review_body'
DB_COLUMN1 = 'review_date'

start_urls = [
    'https://www.tripadvisor.com/Restaurant_Review-g187823-d2101904-Reviews-Eataly_Genova-Genoa_Italian_Riviera_Liguria.html',
]

headers = [
    DB_COLUMN,
    DB_COLUMN1,
]

lang = 'it'

for url in start_urls:
    # get all reviews for 'url' and 'lang'
    items = scrape(url)
    if not items:
        print('No reviews')
    else:
        # write in CSV
        filename = url.split('Reviews-')[1][:-5]
        print('filename:', filename)
        write_in_csv(items, filename + '.csv', headers, mode='w')
I realized the problem lies in the page source.
hotel_name = soup.find('h1', id='HEADING').text
finds no element with that id on the target site, so I substituted it with:
hotel_name = soup.find('h1', class_='heading').text
I hope it helps others!

Scrape with Beautiful-Soup from site that uses AJAX using Python

I want to scrape the product name, price and image source from the page, but only a limited number of results is displayed. Here is the website I want to scrape: https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085
I also want to scrape the filter checkboxes, but I don't know how to get all the results; only 10 results are displayed. What should I do to scrape the complete result set? If I remove the headers, the complete list of names and prices is displayed, but the image sources are not scraped.
headers = {"Accept-Language": "en-US,en;q=0.5",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Referer": "http://thewebsite.com",
"Connection": "keep-alive"}
scrap = requests.get('https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085',headers=headers)
# Grab title-artist classes and store in recordList
content = BeautifulSoup(scrap.text, "html.parser")
if content.findAll("div", {"class": "search-result-gridview-item-wrapper"}) != None:
products = content.findAll("div", {"class": "search-result-gridview-item-wrapper"})
for product in products:
name = product.find("div", {"class": "search-result-product-title gridview"})
title = name.find('a').text
price = product.find("div", {"class": "search-result-productprice gridview enable-2price-2"})
p = price.text
image=product.find("div",{"class":"display-inline-block pull-left prod-ProductCard--Image"})
img = image.find("img", {"class": "Tile-img"})['src']
hreff = product.find("div", {"class": "display-inline-block pull-left prod-ProductCard--Image"})
href=hreff.find('a')['href']
if content.findAll("div", {"class": "search-result-listview-item clearfix"}) != None:
products = content.findAll("div", {"class": "search-result-listview-item clearfix"})
for product in products:
if product.find("span",{"class":"Price-group"}) !=None:
name = product.find("a", {"class": "product-title-link"}).text
price = product.find("span", {"class": "Price-group"}).text
image = product.find("div", {"class": "display-inline-block pull-left prod-ProductCard--Image"})
img = image.find("img", {"class": "Tile-img"})['src']
hreff = product.find("div", {"class": "display-inline-block pull-left prod-ProductCard--Image"})
href = hreff.find('a')['href']
}
Please see below the sample code to scrape data from this website. I have only added one interaction (paging to the next results), but this should give you the general idea. (You need to use your browser's inspect-element functionality to find the XPaths.)
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

browser = webdriver.Chrome("./chromedriver")  # download chromedriver
browser.get("https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085")  # open page in browser

outDF = pd.DataFrame(columns=['prodname', 'imageurl', 'minprice', 'maxprice', 'actualprice'])  # template of data

prices = browser.find_elements(By.XPATH, "//div[contains(@class, 'price-main-block')]")  # finding prices
product = browser.find_elements(By.XPATH, "//a[contains(@class, 'product-title-link')]")  # product name
images = browser.find_elements(By.XPATH, "//img[contains(@class, 'Tile-img')]")  # images

# getting actual prices/ranges
for i in range(len(product)):
    prodname = product[i].get_attribute("aria-label")
    imageurl = images[i].get_attribute("src")
    pricerange = prices[i].find_elements_by_xpath(".//span[contains(@class, 'Price-group')]")
    if len(pricerange) > 1:
        minprice = pricerange[0].get_attribute("title")
        maxprice = pricerange[1].get_attribute("title")
        actualprice = None
    else:
        minprice = None
        maxprice = None
        actualprice = pricerange[0].get_attribute("title")
    thisline = [prodname, imageurl, minprice, maxprice, actualprice]
    outDF.loc[outDF.shape[0]] = thisline

# Reading next pages
next = True
while next:
    try:
        # clicking next button
        browser.find_element(By.XPATH, "//button[contains(@class, 'paginator-btn paginator-btn-next')]").click()
        # repeating process
        prices = browser.find_elements(By.XPATH, "//div[contains(@class, 'price-main-block')]")
        product = browser.find_elements(By.XPATH, "//a[contains(@class, 'product-title-link')]")
        images = browser.find_elements(By.XPATH, "//img[contains(@class, 'Tile-img')]")
        for i in range(len(product)):
            prodname = product[i].get_attribute("aria-label")
            imageurl = images[i].get_attribute("src")
            pricerange = prices[i].find_elements_by_xpath(".//span[contains(@class, 'Price-group')]")
            if len(pricerange) > 1:
                minprice = pricerange[0].get_attribute("title")
                maxprice = pricerange[1].get_attribute("title")
                actualprice = None
            else:
                minprice = None
                maxprice = None
                actualprice = pricerange[0].get_attribute("title")
            thisline = [prodname, imageurl, minprice, maxprice, actualprice]
            outDF.loc[outDF.shape[0]] = thisline
    except:
        print("Something went wrong")
        next = False

browser.quit()
