I want to scrape the product name, price and image source from this page, but only a limited number of results are displayed. Here is the website I want to scrape: https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085
I also want to scrape the filter checkboxes, but I don't know how to scrape all the results; only 10 results are displayed. What should I do to scrape the complete results? If I remove the headers, the complete results for names and prices are displayed, but the image sources are not scraped.
headers = {"Accept-Language": "en-US,en;q=0.5",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Referer": "http://thewebsite.com",
"Connection": "keep-alive"}
scrap = requests.get('https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085',headers=headers)
# Grab title-artist classes and store in recordList
content = BeautifulSoup(scrap.text, "html.parser")
if content.findAll("div", {"class": "search-result-gridview-item-wrapper"}) != None:
products = content.findAll("div", {"class": "search-result-gridview-item-wrapper"})
for product in products:
name = product.find("div", {"class": "search-result-product-title gridview"})
title = name.find('a').text
price = product.find("div", {"class": "search-result-productprice gridview enable-2price-2"})
p = price.text
image=product.find("div",{"class":"display-inline-block pull-left prod-ProductCard--Image"})
img = image.find("img", {"class": "Tile-img"})['src']
hreff = product.find("div", {"class": "display-inline-block pull-left prod-ProductCard--Image"})
href=hreff.find('a')['href']
if content.findAll("div", {"class": "search-result-listview-item clearfix"}) != None:
products = content.findAll("div", {"class": "search-result-listview-item clearfix"})
for product in products:
if product.find("span",{"class":"Price-group"}) !=None:
name = product.find("a", {"class": "product-title-link"}).text
price = product.find("span", {"class": "Price-group"}).text
image = product.find("div", {"class": "display-inline-block pull-left prod-ProductCard--Image"})
img = image.find("img", {"class": "Tile-img"})['src']
hreff = product.find("div", {"class": "display-inline-block pull-left prod-ProductCard--Image"})
href = hreff.find('a')['href']
}
Please see below the sample code to scrape data from this website. I have only added one interaction (paging through the results), but this should give you the general idea. (You need to use the inspect-element functionality of your browser to find the XPaths.)
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

browser = webdriver.Chrome("./chromedriver")  # download chromedriver and point to it here
browser.get("https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085")  # open page in browser

outDF = pd.DataFrame(columns=['prodname', 'imageurl', 'minprice', 'maxprice', 'actualprice'])  # template of data

prices = browser.find_elements(By.XPATH, "//div[contains(@class, 'price-main-block')]")   # finding prices
product = browser.find_elements(By.XPATH, "//a[contains(@class, 'product-title-link')]")  # product names
images = browser.find_elements(By.XPATH, "//img[contains(@class, 'Tile-img')]")           # images

# getting actual prices/ranges
for i in range(len(product)):
    prodname = product[i].get_attribute("aria-label")
    imageurl = images[i].get_attribute("src")
    pricerange = prices[i].find_elements(By.XPATH, ".//span[contains(@class, 'Price-group')]")
    if len(pricerange) > 1:
        minprice = pricerange[0].get_attribute("title")
        maxprice = pricerange[1].get_attribute("title")
        actualprice = None
    else:
        minprice = None
        maxprice = None
        actualprice = pricerange[0].get_attribute("title")
    thisline = [prodname, imageurl, minprice, maxprice, actualprice]
    outDF.loc[outDF.shape[0]] = thisline

# Reading next pages
next = True
while next:
    try:
        # clicking next button
        browser.find_element(By.XPATH, "//button[contains(@class, 'paginator-btn paginator-btn-next')]").click()
        # repeating process
        prices = browser.find_elements(By.XPATH, "//div[contains(@class, 'price-main-block')]")
        product = browser.find_elements(By.XPATH, "//a[contains(@class, 'product-title-link')]")
        images = browser.find_elements(By.XPATH, "//img[contains(@class, 'Tile-img')]")
        for i in range(len(product)):
            prodname = product[i].get_attribute("aria-label")
            imageurl = images[i].get_attribute("src")
            pricerange = prices[i].find_elements(By.XPATH, ".//span[contains(@class, 'Price-group')]")
            if len(pricerange) > 1:
                minprice = pricerange[0].get_attribute("title")
                maxprice = pricerange[1].get_attribute("title")
                actualprice = None
            else:
                minprice = None
                maxprice = None
                actualprice = pricerange[0].get_attribute("title")
            thisline = [prodname, imageurl, minprice, maxprice, actualprice]
            outDF.loc[outDF.shape[0]] = thisline
    except:
        print("Something went wrong")
        next = False

browser.quit()
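One caveat with the pagination loop above: after clicking the next button the elements are re-queried immediately, so the old page may still be in the DOM. A minimal, hedged sketch of an explicit wait that could be dropped into the try block (my addition, not part of the original answer; it reuses the browser, product and By names from the code above):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

old_first_product = product[0]  # a handle from the page we are leaving
browser.find_element(By.XPATH, "//button[contains(@class, 'paginator-btn paginator-btn-next')]").click()
# wait until the previous page's first product goes stale before re-querying
WebDriverWait(browser, 10).until(EC.staleness_of(old_first_product))
# now it is safe to call browser.find_elements(...) for the new page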
So I have to scrape all the products from this website's shop (https://bewellstore.ro/shop/), but my code stops at the 12th product. I have made a version for websites with multiple shop pages where I loop through them all, but since here it's only one page I thought that wasn't necessary.
Any idea why my code stops at the 12th product?
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

folder = 'beWell_images'
os.makedirs(folder, exist_ok=True)
root_folder = os.getcwd()

baseurl = 'https://bewellstore.ro/shop/'

# an array for all the product links
product_links = []

# going through all the pages of the shop
url = 'https://bewellstore.ro/shop/'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
product_list = soup.find_all('div', class_='loop-product-inner')
print(product_list)

# taking all the links to each product page
for item in product_list:
    for link in item.find_all('a', href=True, class_='woocommerce-LoopProduct-link woocommerce-loop-product__link'):
        product_links.append(link['href'])
        # appending the links previously taken to the array

print(product_links)

product_items_list = []
i = 0
d = {}  # use as set()

os.chdir(folder)

for link_test in product_links:
    r = requests.get(link_test)
    soup = BeautifulSoup(r.content, 'lxml')
    title = soup.find('h1', class_='product_title').text.strip()
    price = soup.find('p', class_='price').text.strip()
    header = soup.find('div', class_='woocommerce-product-details__short-description').text.strip()
    sku = soup.find('span', class_='sku').text.strip()
    categories = soup.find('div', class_='posted_in').text.strip()
    description = soup.find('div', class_='cell large-6').text.strip()
    brand = soup.find('div', class_='tabs-panel').text.strip()
    images = soup.select('.wp-post-image')

    # --- before `for`-loop ---
    downloaded = []

    # --- `for`-loop ---
    for image in images:
        link = image['src']
        if link in d:
            name = d[link]
            downloaded.append(name)
        else:
            i += 1
            name = str(i) + 'img.jpg'
            d[link] = name
            print('link:', link)
            print('name:', name)
            print('---')
            # here i am adding the .jpg and saving the images
            with open(name, 'wb') as f:
                im = requests.get(link)
                #print("URMEAZA DEBUG: {}".format(im))
                f.write(im.content)
            downloaded.append(name)

    # --- after `for`-loop ---
    # storing all the infos about this product
    img_str = ''
    if len(downloaded) > 1:
        for index, img in enumerate(downloaded):
            if index == len(downloaded) - 1:
                img_str = img_str + img
            else:
                img_str = img_str + img + '/'
    else:
        img_str = downloaded[0]

    product = {
        'sku': sku,
        'base_image': img_str,
        'small_image': img_str,
        'thumbnail_image': img_str,
        'additional_images': img_str,
        'product_type': 'simple',
        'attribute_set_code': 'Default',
        'categories': categories.replace('Categorii: ', '').replace(', ', '/'),
        'name': title,
        'description': description,
        'short_description': header,
        'price': price[0:5]
    }
    product_items_list.append(product)

os.chdir(root_folder)
# os.chdir('output')

df = pd.DataFrame(product_items_list)
print(df)
df.to_csv('beWell.csv', index=False)
That's because this webpage uses pagination (with 12 products per page) and each page gets loaded only when you scroll, so you will have to use Selenium to scroll the page.
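A minimal sketch of that scrolling approach (my addition for illustration; the driver setup and the 2-second delay are assumptions):

from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()
driver.get('https://bewellstore.ro/shop/')

# keep scrolling to the bottom until the page height stops growing
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the next batch of products time to load
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# now all products are in the DOM and can be parsed as before
soup = BeautifulSoup(driver.page_source, 'lxml')
product_list = soup.find_all('div', class_='loop-product-inner')
driver.quit()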
But if you only want to use BeautifulSoup, then there is a workaround.
The URL for each page looks like this
https://bewellstore.ro/shop/page/<page_no>/
Example:
1st page: https://bewellstore.ro/shop/page/1/
2nd page: https://bewellstore.ro/shop/page/2/
You could make a request to each of the above URLs and scrape your data using BeautifulSoup.
You can try this for all the pages:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import pandas as pd

results = []
page_number = 1

product_links = []
headers = {
    'authority': 'bewellstore.ro',
    'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    'sec-ch-ua-platform': '"Linux"',
    'accept': '*/*',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://bewellstore.ro/shop/',
    'accept-language': 'en-US,en;q=0.9',
    'cookie': 'fp_session=new; mc_landing_site=https://bewellstore.ro/shop/; _omappvp=i5rIyW2xsMFKIu3uhQtmFj1TN9jw7aKjO8dgy3SVvWMhAj30NvKFrBXfJLe3dQK6ZdbB4FezbrwFWPGLdKrsj1A1vqN2PRLI; _omappvs=1634795539874; _clck=1f7zptk|1|evr|0; _ga=GA1.2.2117949575.1634795541; _gid=GA1.2.1155690725.1634795541; _fbp=fb.1.1634795541140.1266696245; PHPSESSID=94b6b1996b0b5e831d898c192b4bca06; _clsk=2489zg|1634795542054|1|1|e.clarity.ms/collect; yith_wcwl_session_d235bd7d63b3a120c05ba3c90256789a=%7B%22session_id%22%3A%222e40c31b1503902767c5327edd3cf926%22%2C%22session_expiration%22%3A1637387542%2C%22session_expiring%22%3A1637383942%2C%22cookie_hash%22%3A%2249a81940bd8d39b2f894021c16333e6f%22%7D; omSeen-dwf9rgtvzzrhqylccaag=1634795583943; om-dwf9rgtvzzrhqylccaag=1634795585931; _omra={"dwf9rgtvzzrhqylccaag":"click"}; cookie_notice_accepted=true; ls_smartpush=fdfbe0ffe7800007',
}

while True:
    response = requests.get(f'https://bewellstore.ro/shop/page/{page_number}/', headers=headers)
    print(response.status_code)
    print(response.url)
    if response.status_code != 200:
        break
    soup = BeautifulSoup(response.content, 'html.parser')
    product_list = soup.find_all('div', class_='loop-product-inner')
    # print (product_list)
    for item in product_list:
        for link in item.find_all('a', href=True, class_='woocommerce-LoopProduct-link woocommerce-loop-product__link'):
            product_links.append(link['href'])
            print('Added link in product_links list :', link['href'])

    product_items_list = []
    i = 0
    d = {}
    for link_test in product_links:
        r = requests.get(link_test)
        soup = BeautifulSoup(r.content, 'lxml')
        title = soup.find('h1', class_='product_title').text.strip()
        price = soup.find('p', class_='price').text.strip()
        header = soup.find('div', class_='woocommerce-product-details__short-description').text.strip()
        sku = soup.find('span', class_='sku').text.strip()
        categories = soup.find('div', class_='posted_in').text.strip()
        description = soup.find('div', class_='cell large-6').text.strip()
        brand = soup.find('div', class_='tabs-panel').text.strip()
        images = soup.select('.wp-post-image')
        downloaded = []

        for image in images:
            link = image['src']
            if link in d:
                name = d[link]
                downloaded.append(name)
            else:
                i += 1
                name = str(i) + 'img.jpg'
                d[link] = name
                print('link:', link)
                print('name:', name)
                print('---')
                # here i am adding the .jpg and saving the images
                with open(name, 'wb') as f:
                    im = requests.get(link)
                    #print("URMEAZA DEBUG: {}".format(im))
                    f.write(im.content)
                downloaded.append(name)

        img_str = ''
        if len(downloaded) > 1:
            for index, img in enumerate(downloaded):
                if index == len(downloaded) - 1:
                    img_str = img_str + img
                else:
                    img_str = img_str + img + '/'
        else:
            img_str = downloaded[0]

        product = {
            'sku': sku,
            'base_image': img_str,
            'small_image': img_str,
            'thumbnail_image': img_str,
            'additional_images': img_str,
            'product_type': 'simple',
            'attribute_set_code': 'Default',
            'categories': categories.replace('Categorii: ', '').replace(', ', '/'),
            'name': title,
            'description': description,
            'short_description': header,
            'price': price[0:5]
        }
        product_items_list.append(product)

    page_number += 1

df = pd.DataFrame(product_items_list)
print(df)
df.to_csv('beWell.csv', index=False)
I am trying to scrape all the data from the Google search results: title, URL and description.
However, I can't grab the description of the search results; it returns an empty string.
# check Chrome version: Menu (the three dots, upper right corner) -> Help -> About Google Chrome
# download ChromeDriver according to the Chrome version (example version 79)
# download from https://sites.google.com/a/chromium.org/chromedriver/downloads
# place the chromedriver.exe file in the current working directory
# pip install selenium

from selenium import webdriver
from bs4 import BeautifulSoup
import time
from bs4.element import Tag
import pandas as pd
import random

keywords = pd.read_csv('keywords.csv', header=0, index_col=None)
df = pd.DataFrame(columns=['keyword', 'title', 'url', 'description'])

for i in keywords['keyword']:
    # Scraper that gives back: titles, links, descriptions
    driver = webdriver.Chrome()
    google_url = "https://www.google.com/search?gl=US&q=" + i + "&num=" + str(10)
    driver.get(google_url)
    time.sleep(random.randrange(15, 50))
    soup = BeautifulSoup(driver.page_source, 'lxml')

    result_div = soup.find_all('div', attrs={'class': 'g'})

    links = []
    titles = []
    descriptions = []
    for r in result_div:
        # Checks if each element is present, else, raise exception
        try:
            link = r.find('a', href=True)

            title = None
            title = r.find('h3')
            if isinstance(title, Tag):
                title = title.get_text()

            description = None
            description = r.find('span', attrs={'class': 'st'})
            if isinstance(description, Tag):
                description = description.get_text()

            # Check to make sure everything is present before appending
            if link != '' and title != '' and description != '':
                links.append(link['href'])
                titles.append(title)
                descriptions.append(description)
        # Next loop if one element is not present
        except Exception as e:
            print(e)
            continue

    for link, title, description in zip(links, titles, descriptions):
        df = df.append({'keyword': i, 'title': title, 'url': link, 'description': description}, ignore_index=True)

df.to_csv(r'final_dataset.csv', index=False)
Does anyone have an idea how to grab the description in the Google search results?
Get the description node with the following code.
description = r.select('.aCOpRe span:not(.f)')
Also, you can use requests instead of Selenium. The full example is in an online IDE.
from requests import Session
from bs4 import BeautifulSoup
from bs4.element import Tag
import pandas as pd

keywords = pd.read_csv('keywords.csv', header=0, index_col=None)
df = pd.DataFrame(columns=['keyword', 'title', 'url', 'description'])

for i in keywords['keyword']:
    # Scraper that gives back: titles, links, descriptions
    params = {"q": i, 'gl': 'US', 'num': 10}
    headers = {
        "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 Edg/80.0.361.62"
    }

    with Session() as session:
        r = session.get(
            "https://google.com/search", params=params, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')

    result_div = soup.find_all('div', attrs={'class': 'g'})

    links = []
    titles = []
    descriptions = []
    for r in result_div:
        # Checks if each element is present, else, raise exception
        try:
            link = r.find('a', href=True)

            title = r.find('h3')
            if isinstance(title, Tag):
                title = title.get_text()

            # select() returns a list of tags, so join their text
            description_tags = r.select('.aCOpRe span:not(.f)')
            description = ' '.join(tag.get_text() for tag in description_tags) if description_tags else None

            # Check to make sure everything is present before appending
            if link != '' and title != '' and description != '':
                links.append(link['href'])
                titles.append(title)
                descriptions.append(description)
        # Next loop if one element is not present
        except Exception as e:
            print(e)
            continue

    for link, title, description in zip(links, titles, descriptions):
        df = df.append({
            'keyword': i,
            'title': title,
            'url': link,
            'description': description
        }, ignore_index=True)

df.to_csv(r'final_dataset.csv', index=False)
Alternatively, you can extract data from Google Search via SerpApi.
Disclaimer: I work at SerpApi.
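For example, a minimal sketch of what that might look like, assuming the google-search-results Python package (pip install google-search-results) and an API key; the field names reflect my understanding of the organic results structure:

from serpapi import GoogleSearch  # pip install google-search-results

params = {
    "engine": "google",
    "q": "coffee",              # example query
    "gl": "us",
    "num": 10,
    "api_key": "YOUR_API_KEY"   # placeholder; use your own key
}
results = GoogleSearch(params).get_dict()
for result in results.get("organic_results", []):
    print(result.get("title"), result.get("link"), result.get("snippet"))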
The battle to finish my first scraping script continues. I think I'm almost finished, but I've hit a new roadblock.
The problem is that when I reach the last pagination page I get this error:
Traceback (most recent call last):
File "C:/Users/Andre/Desktop/scripts python/scrape_learn/ttc_quase.py", line 50, in <module>
url_tag = soup.find('li', {"id": "next-page-link"}).find('a')
AttributeError: 'NoneType' object has no attribute 'find'
I think the error is related to the way I'm finding url_tag, but I'm not seeing any other way to grab the "next page". I tried the try/except method, but when I apply it I just get the listings from the first page.
So I'm not sure what my next step should be. If someone could help, I would appreciate it.
My full code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://timetochoose.co.ao/?ct_keyword&ct_ct_status&ct_property_type&ct_beds&search-listings=true&ct_country=portugal&ct_state&ct_city&ct_price_to&ct_mls&lat&lng"
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

anuncios_ttc = {}
anuncios_nr = 0

while True:
    response = requests.get(url, headers=headers)
    print(response)
    data = response.text
    print(data)
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
    anuncios = soup.find_all("div", {"class": "grid-listing-info"})
    for anuncios in anuncios:
        titles = anuncios.find("a", {"class": "listing-link"}).text
        location = anuncios.find("p", {"class": "location muted marB0"}).text
        link = anuncios.find("a", {"class": "listing-link"}).get("href")
        anuncios_response = requests.get(link, headers=headers)
        anuncios_data = anuncios_response.text
        anuncios_soup = BeautifulSoup(anuncios_data, 'html.parser')
        conteudo = anuncios_soup.find("div", {"id": "listing-content"}).text
        preco = anuncios_soup.find("span", {"class": "listing-price"})
        preco_imo = preco.text if preco else "N/A"
        quartos = anuncios_soup.find("li", {"class": "row beds"})
        nr_quartos = quartos.text if quartos else "N/A"
        wcs = anuncios_soup.find("li", {"class": "row baths"})
        nr_wcs = wcs.text if wcs else "N/A"
        tipo = anuncios_soup.find("li", {"class": "row property-type"})
        tipo_imo = tipo.text if tipo else "N/A"
        bairro = anuncios_soup.find("li", {"class": "row community"})
        bairro1 = bairro.text if bairro else "N/A"
        ref = anuncios_soup.find("li", {"class": "row propid"}).text
        anuncios_nr += 1
        anuncios_ttc[anuncios_nr] = [titles, location, bairro1, preco_imo, tipo_imo, nr_quartos, nr_wcs, conteudo, ref, link]
        print("Título", titles, "\nLocalização", location, "\nPreço", preco_imo, "\nLink", link, "\nReferencia", ref, "\nTipo", tipo_imo, "\nQuartos", nr_quartos, "\nWC", nr_wcs, "\nBairro", bairro1, "\nConteudo", conteudo)
    url_tag = soup.find('li', {"id": "next-page-link"}).find('a')
    if url_tag.get('href'):
        url = url_tag.get('href')
        print(url)
    else:
        break

print("Nr Total de Anuncios: ", anuncios_nr)

anuncios_ttc_df = pd.DataFrame.from_dict(anuncios_ttc, orient='index', columns=['Titulo', 'Localização', 'Bairro', 'Preço', 'Tipo', 'Quartos', 'WCs', 'Descrição', 'Referência', 'Ligação'])
anuncios_ttc_df.head()
anuncios_ttc_df.to_csv('ttc_python.csv')
The answer to this question ended up being provided in another thread, where I was trying to better identify the url_tag element.
With the help of @Andrej Kesely I was able to solve the problem with:

url_tag = soup.find('li', {"id": "next-page-link"})
if not url_tag:
    break
url = url_tag.find('a')['href']

Now the script is able to run until the end and to generate the CSV file as intended.
I am using BeautifulSoup and Selenium to access this page https://www.chewy.com/blue-buffalo-basics-limited/dp/37047 and I am trying to get a list of prices and ratings for all packaging types.
Below is my code:
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

# use Selenium to get buttons through all pages
test_url = 'https://www.chewy.com/blue-buffalo-basics-limited/dp/37047'
test = BeautifulSoup(requests.get(test_url, headers=headers).content, 'html.parser')

btn_count = []
for btn_cnt in test.select('.js-sku-selector > div'):
    btn_cnt = btn_cnt['data-attributes'].count('isSelected')
    btn_count.append(btn_cnt)

buttons = list(range(1, btn_cnt + 1))

xpath = []
for b in buttons:
    btn_path = '//*[@id="variation-Size"]/div[2]/div[' + str(b) + ']/div/label'
    print(btn_path)
    xpath.append(btn_path)

print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format('brand', 'product', 'id', 'auto_ship', 'regular', 'rating'))

for btn in xpath:
    test_url = 'https://www.chewy.com/blue-buffalo-basics-limited/dp/37047'
    test = BeautifulSoup(requests.get(test_url, headers=headers).content, 'html.parser')
    driver = webdriver.Chrome(executable_path=r'C:\Users\public\chromedriver')
    driver.get(test_url)
    time.sleep(5)
    driver.find_element_by_xpath(btn).click()
    time.sleep(5)
    for brand, product, id, auto_ship, price, rating in zip(test.findAll('span', attrs={'itemprop': 'brand'}),
                                                            test.findAll('div', attrs={'id': 'product-title'}),
                                                            test.findAll('div', attrs={'class': 'value js-part-number'}),
                                                            test.findAll('p', attrs={'class': 'autoship-pricing p'}),
                                                            test.findAll('span', attrs={'class': 'ga-eec__price'}),
                                                            test.select('div.ugc')):
        #date = date.today()
        brand = brand.text
        product = ' '.join(product.h1.text.split())
        id = ' '.join(id.span.text.split())
        p1 = auto_ship.text.index('(')
        auto_ship = ' '.join(auto_ship.text[:p1].split())
        regular_price = ' '.join(price.text.split())
        rating = rating.picture.img['src'][-7:-4].replace('_', '.')
        print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format(brand, product, id, auto_ship, regular_price, rating))
    driver.quit()
The result I get is the same for all three buttons. I would expect the data to be different for the three different buttons, but it seems it is only returning the values from the default page.
Is there anything else I should do to dynamically get the values for each button?
I copied the XPath of the labels. Clicking does bring me to the target view for the different packages, and the underlying HTML values do change. However, my print statement is still getting data from the main page. Any recommendation?
I found what happened. I wasn't loading the current page into soup, but was instead parsing a brand-new copy of the source page.
I now grab driver.page_source after the click, give the browser sufficient time to load (10 seconds), and then soup the page source. It works now.
# use Selenium to get buttons through all pages
test_url = 'https://www.chewy.com/wellness-large-breed-complete-health/dp/34356'
test = BeautifulSoup(requests.get(test_url, headers=headers).content, 'html.parser')

btn_count = []
for btn_cnt in test.select('.js-sku-selector > div'):
    btn_cnt = btn_cnt['data-attributes'].count('isSelected')
    btn_count.append(btn_cnt)

buttons = list(range(1, btn_cnt + 1))

xpath = []
for b in buttons:
    btn_path = '//*[@id="variation-Size"]/div[2]/div[' + str(b) + ']/div/label'
    print(btn_path)
    xpath.append(btn_path)

print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format('brand', 'product', 'id', 'auto_ship', 'regular', 'rating'))

for btn in xpath:
    test_url = 'https://www.chewy.com/wellness-large-breed-complete-health/dp/34356'
    driver = webdriver.Chrome(executable_path=r'C:\Users\public\chromedriver')
    driver.get(test_url)
    time.sleep(1)
    driver.find_element_by_xpath(btn).click()
    time.sleep(5)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    for brand, product, id, auto_ship, price, rating in zip(soup.findAll('span', attrs={'itemprop': 'brand'}),
                                                            soup.findAll('div', attrs={'id': 'product-title'}),
                                                            soup.findAll('div', attrs={'class': 'value js-part-number'}),
                                                            soup.findAll('p', attrs={'class': 'autoship-pricing p'}),
                                                            soup.findAll('span', attrs={'class': 'ga-eec__price'}),
                                                            soup.select('div.ugc')):
        #date = date.today()
        brand = brand.text
        product = ' '.join(product.h1.text.split())
        id = ' '.join(id.span.text.split())
        p1 = auto_ship.text.index('(')
        auto_ship = ' '.join(auto_ship.text[:p1].split())
        regular_price = ' '.join(price.text.split())
        rating = rating.picture.img['src'][-7:-4].replace('_', '.')
        print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format(brand, product, id, auto_ship, regular_price, rating))
    driver.quit()
I have built a web scraper for real estate data with the help of some fellow members on this website.
It works perfectly, but after it crawls to page 6 or 7 or further, the typical cookie warning pops up and seems to disrupt my output in the CSV file.
Is there a way to handle the pop-up?
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import requests
import pandas as pd

#open('output.csv', 'w').close()

browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0, 0)

def jaap_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
        #browser.delete_all_cookies()
        browser.get(url)
        #session = requests.Session()
        #res1 = session.post(url, post_data)
        #res2 = session.get(url1)
        time.sleep(15)
        #input('Press Enter after bypassing Captcha')
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        info = soup.find_all('div', {'class': 'property-info'})
        inside = soup.find_all('a', {'class': 'property-inner'}, {'href'})

        # Make empty lists with header lines
        outputlist_l1 = [['street', 'address', 'price', 'pricetag']]
        outputlist_l2 = [['soort', 'bouwjaar', 'woonoppervlakte', 'inhoud', 'perceel']]

        for huis in info:
            street = huis.find('h2')
            street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:+3])
            address = huis.find('div')
            address = address.find('div').text.strip()
            price = huis.find('div', {'class': 'price-info'})
            price = price.find('div').text.strip()
            price = re.findall(r'\d', price)
            price = ''.join(price)
            pricetag = huis.find('div', {'class': 'property-price'})
            pricetag = pricetag.find('span').text.strip()
            outputlist_l1.append([street, address, price, pricetag])

        for items in inside:
            #browser.delete_all_cookies()
            href = items.get('href')
            url1 = href.format(page)
            browser.get(url1)
            kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
            details = kenmerken.find_all('div', {'class': 'detail-tab-content kenmerken'})
            try:
                tr = details[0].find_all('td', {'class': 'value'})
            except IndexError:
                size_space = 'Unknown'
            for inhoud in tr:
                soort = tr[0].get_text(separator='\n', strip=True)
                bouwjaar = tr[1].get_text(separator='\n', strip=True)
                woonoppervlakte = tr[2].get_text(separator='\n', strip=True)
                inhoud = tr[3].get_text(separator='\n', strip=True)
                perceel = tr[4].get_text(separator='\n', strip=True)
                l2 = ('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
                outputlist_l2.append([soort, bouwjaar, woonoppervlakte, inhoud, perceel])

        page += 1

    # Merge outputlist_l1 with outputlist_l2
    outputlist = [a + b for a, b in zip(outputlist_l1, outputlist_l2)]

    # transform to Pandas dataframe and export as csv
    #saveFile = open('output.csv', 'a')
    df = pd.DataFrame(outputlist[1:], columns=outputlist[0])
    df.to_csv('output.csv', index=False)
    #saveFile.close()

jaap_spider(15)
The cookie script on the website:
(function(){function g(a){return{get:function(b){var c=JSON.parse(a.getItem(b));return!c||Date.parse(c.expires)<=(new Date).getTime()?(a.removeItem(b),null):c.value},set:function(b,c,d){c={value:c,expires:d.toUTCString()};a.setItem(b,JSON.stringify(c))},remove:function(b){a.removeItem(b)}}}function d(a,b,c,d){this.parseCommand=function(e,g){function h(){var a=JSON.stringify({messageId:k,value:l||!1});window.parent.postMessage(a,"")}var m=q[a],n=e.action,p=e.key,k=e.messageId,f=e.siteId,f=d?p:p+":"+
f,l=e.value,r=e.expiresMinutes||1440(e.expiresDays||365),s=function(){var a=new Date;a.setTime(a.getTime()+6E4*r);return a}();if(!function(){var a={_hjSet:c,_hjGet:b,_hjRemove:c}[n]||[];return 0<=a.indexOf("")||0<=a.indexOf(g)}())throw Error("Command "+n+" not allowed on key: "+p);switch(n){case "_hjSet":m.set(f,l,s);break;case "_hjGet":l=m.get(f);h();break;case "_hjRemove":m.remove(f)}}}function h(a){try{var b=JSON.parse(a.data);b.key&&k[b.key]&&k[b.key].parseCommand(b,a.origin)}catch(c){return null}}
var q;try{q={cookie:{get:function(a){return(a=RegExp("(?:^|; )"+a+"=([^;])").exec(document.cookie))?a[1]:void 0},set:function(a,b,c){document.cookie=a+"="+b+"; path=/; expires="+c.toUTCString()},remove:function(a){document.cookie=a+"=; expires=Tue, 13 Mar 1979 00:00:00 UTC; path=/;"}},localStorage:g(localStorage),sessionStorage:g(sessionStorage)}}catch(t){return}var k={_hjOptOut:new d("cookie",[""],["https://www.hotjar.com","https://local.hotjar.com","http://local.hotjar.com","https://insights-staging.hotjar.com",
"http://insights-staging.hotjar.com"],!0),grant_consent:new d("cookie",[""],[""],!1),screenshot_retake:new d("localStorage",[""],[""],!1),screenshot_active_retake:new d("sessionStorage",[""],["*"],!1)};window.addEventListener?window.addEventListener("message",h,!1):window.attachEvent("onmessage",h)})();
To overcome the pop-up problem, just check after loading the page whether there is any pop-up available. If yes, then click on it. Hope this helps.
page = 1
while page <= max_pages:
    url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
    browser.get(url)
    time.sleep(10)
    # Check here if there is a popup available
    if len(browser.find_elements_by_xpath("//a[@class='CookiesOK']")) > 0:
        browser.find_element_by_xpath("//a[@class='CookiesOK']").click()
        time.sleep(5)
    #input('Press Enter after bypassing Captcha')
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    info = soup.find_all('div', {'class': 'property-info'})
    inside = soup.find_all('a', {'class': 'property-inner'}, {'href'})
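As a possible refinement (my addition, not part of the original answer), an explicit wait can replace the fixed sleep so the banner is dismissed as soon as it appears and quietly skipped when it never shows up:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

try:
    # wait up to 10 seconds for the cookie link, then dismiss it
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//a[@class='CookiesOK']"))
    ).click()
except TimeoutException:
    pass  # no cookie banner appeared on this page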