I have build a webscraping for real estate data with the help of some fellowsmembers on this website.
It works perfectly, but after is crawls to page 6/7 or furhter, a cookie the typical cookie warning pop up, and seem to disrupt my output in my CSV file.
Is there a way to handle the pop up?
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import requests
import pandas as pd
#open('output.csv', 'w').close()
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0,0)
def jaap_spider(max_pages):
page = 1
while page <= max_pages:
url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
#browser.delete_all_cookies()
browser.get(url)
#session = requests.Session()
#res1 = session.post(url, post_data)
#res2 = session.get(url1)
time.sleep(15)
#input('Press Enter after bypassing Captcha')
soup = BeautifulSoup(browser.page_source, 'html.parser')
info = soup.find_all('div', {'class':'property-info'})
inside = soup.find_all('a', {'class': 'property-inner'},{'href'})
# Make empty lists with header lines
outputlist_l1 = [['street', 'address', 'price', 'pricetag']]
outputlist_l2 = [['soort', 'bouwjaar', 'woonoppervlakte', 'inhoud', 'perceel']]
for huis in info:
street = huis.find('h2')
street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:+3])
address = huis.find('div')
address = address.find('div').text.strip()
price = huis.find('div', {'class': 'price-info'})
price = price.find('div').text.strip()
price = re.findall(r'\d', price)
price = ''.join(price)
pricetag = huis.find('div', {'class': 'property-price'})
pricetag = pricetag.find('span').text.strip()
outputlist_l1.append([street, address, price, pricetag])
for items in inside:
#browser.delete_all_cookies()
href = items.get('href')
url1 = href.format(page)
browser.get(url1)
kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
details = kenmerken.find_all ('div', {'class':'detail-tab-content kenmerken'})
try:
tr = details[0].find_all ('td', {'class': 'value'})
except IndexError:
size_space = 'Unknown'
for inhoud in tr:
soort = tr[0].get_text(separator='\n', strip=True)
bouwjaar = tr[1].get_text(separator='\n', strip=True)
woonoppervlakte = tr[2].get_text(separator='\n', strip=True)
inhoud = tr[3].get_text(separator='\n', strip=True)
perceel = tr[4].get_text(separator='\n', strip=True)
l2 = ('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
outputlist_l2.append([soort, bouwjaar, woonoppervlakte, inhoud, perceel])
page += 1
# Merge outputlist_l1 with outputlist_l2
outputlist = [a + b for a, b in zip(outputlist_l1, outputlist_l2)]
# transform to Pandas dataframe and export as csv
#saveFile = open('output.csv', 'a')
df = pd.DataFrame(outputlist[1:], columns=outputlist[0])
df.to_csv('output.csv', index=False)
#saveFile.close()
jaap_spider(15)
THe cookie script in the website:
(function(){function g(a){return{get:function(b){var c=JSON.parse(a.getItem(b));return!c||Date.parse(c.expires)<=(new Date).getTime()?(a.removeItem(b),null):c.value},set:function(b,c,d){c={value:c,expires:d.toUTCString()};a.setItem(b,JSON.stringify(c))},remove:function(b){a.removeItem(b)}}}function d(a,b,c,d){this.parseCommand=function(e,g){function h(){var a=JSON.stringify({messageId:k,value:l||!1});window.parent.postMessage(a,"")}var m=q[a],n=e.action,p=e.key,k=e.messageId,f=e.siteId,f=d?p:p+":"+
f,l=e.value,r=e.expiresMinutes||1440(e.expiresDays||365),s=function(){var a=new Date;a.setTime(a.getTime()+6E4*r);return a}();if(!function(){var a={_hjSet:c,_hjGet:b,_hjRemove:c}[n]||[];return 0<=a.indexOf("")||0<=a.indexOf(g)}())throw Error("Command "+n+" not allowed on key: "+p);switch(n){case "_hjSet":m.set(f,l,s);break;case "_hjGet":l=m.get(f);h();break;case "_hjRemove":m.remove(f)}}}function h(a){try{var b=JSON.parse(a.data);b.key&&k[b.key]&&k[b.key].parseCommand(b,a.origin)}catch(c){return null}}
var q;try{q={cookie:{get:function(a){return(a=RegExp("(?:^|; )"+a+"=([^;])").exec(document.cookie))?a[1]:void 0},set:function(a,b,c){document.cookie=a+"="+b+"; path=/; expires="+c.toUTCString()},remove:function(a){document.cookie=a+"=; expires=Tue, 13 Mar 1979 00:00:00 UTC; path=/;"}},localStorage:g(localStorage),sessionStorage:g(sessionStorage)}}catch(t){return}var k={_hjOptOut:new d("cookie",[""],["https://www.hotjar.com","https://local.hotjar.com","http://local.hotjar.com","https://insights-staging.hotjar.com",
"http://insights-staging.hotjar.com"],!0),grant_consent:new d("cookie",[""],[""],!1),screenshot_retake:new d("localStorage",[""],[""],!1),screenshot_active_retake:new d("sessionStorage",[""],["*"],!1)};window.addEventListener?window.addEventListener("message",h,!1):window.attachEvent("onmessage",h)})();
To overcome the pop up problem just check after loading the page if there any pop up available. If yes,then click on that.Hope this help.
page = 1
while page <= max_pages:
url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
browser.get(url)
time.sleep(10)
#Check here if there popup available
if len(browser.find_elements_by_xpath("//a[#class='CookiesOK']"))>0:
browser.find_element_by_xpath("//a[#class='CookiesOK']").click()
time.sleep(5)
#input('Press Enter after bypassing Captcha')
soup = BeautifulSoup(browser.page_source, 'html.parser')
info = soup.find_all('div', {'class':'property-info'})
inside = soup.find_all('a', {'class': 'property-inner'},{'href'})
Related
I am using Soup and Selenium to access this page https://www.chewy.com/blue-buffalo-basics-limited/dp/37047 and trying to get a list of all packaging types' prices and ratings.
Below is my code:
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}
# use Selenium to get buttons through all pages
test_url = 'https://www.chewy.com/blue-buffalo-basics-limited/dp/37047'
test = BeautifulSoup(requests.get(test_url, headers=headers).content, 'html.parser')
btn_count = []
for btn_cnt in test.select('.js-sku-selector > div'):
btn_cnt = btn_cnt['data-attributes'].count('isSelected')
btn_count.append(btn_cnt)
buttons = list(range(1,btn_cnt+1))
xpath = []
for b in buttons:
btn_path = '//*[#id="variation-Size"]/div[2]/div[' + str(b) + ']/div/label'
print(btn_path)
xpath.append(btn_path)
print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format('brand', 'product', 'id','auto_ship', 'regular','rating'))
for btn in xpath:
test_url = 'https://www.chewy.com/blue-buffalo-basics-limited/dp/37047'
test = BeautifulSoup(requests.get(test_url, headers=headers).content, 'html.parser')
driver = webdriver.Chrome(executable_path=r'C:\Users\public\chromedriver')
driver.get(test_url)
time.sleep(5)
driver.find_element_by_xpath(btn).click()
time.sleep(5)
for brand, product, id, auto_ship, price, rating in zip(test.findAll('span', attrs={'itemprop': 'brand'}),
test.findAll('div', attrs={'id': 'product-title'}),
test.findAll('div', attrs={'class': 'value js-part-number'}),
test.findAll('p', attrs={'class': 'autoship-pricing p'}),
test.findAll('span', attrs={'class': 'ga-eec__price'}),
test.select('div.ugc')):
#date = date.today()
brand = brand.text
product = ' '.join(product.h1.text.split())
id = ' '.join(id.span.text.split())
p1 = auto_ship.text.index('(')
auto_ship = ' '.join(auto_ship.text[:p1].split())
regular_price = ' '.join(price.text.split())
rating = rating.picture.img['src'][-7:-4].replace('_', '.')
print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format(brand, product, id, auto_ship, regular_price, rating))
driver.quit()
The result I have is
I would expect the data to be different for the three different buttons, but it seems it is only returning the value from the default page.
Is there anything else I should do to dynamically insert values for each button?
The HTML looks like
I copied the xpath of labels. It does bring me to the target view for different packages and the underlying HTML values do change. However, my print statment is still getting it from the main page. Any recommendation?
I found what happened. I wasnt loading the current page to soup but was rather loading a brand new source page.
I added a driver.page_source after the click and gave the browser sufficient time to load (10 seconds) then souped the page source. It works now.
# use Selenium to get buttons through all pages
test_url = 'https://www.chewy.com/wellness-large-breed-complete-health/dp/34356'
test = BeautifulSoup(requests.get(test_url, headers=headers).content, 'html.parser')
btn_count = []
for btn_cnt in test.select('.js-sku-selector > div'):
btn_cnt = btn_cnt['data-attributes'].count('isSelected')
btn_count.append(btn_cnt)
buttons = list(range(1,btn_cnt+1))
xpath = []
for b in buttons:
btn_path = '//*[#id="variation-Size"]/div[2]/div[' + str(b) + ']/div/label'
print(btn_path)
xpath.append(btn_path)
print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format('brand', 'product', 'id','auto_ship', 'regular','rating'))
for btn in xpath:
test_url = 'https://www.chewy.com/wellness-large-breed-complete-health/dp/34356'
driver = webdriver.Chrome(executable_path=r'C:\Users\public\chromedriver')
driver.get(test_url)
time.sleep(1)
driver.find_element_by_xpath(btn).click()
time.sleep(5)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
for brand, product, id, auto_ship, price, rating in zip(soup.findAll('span', attrs={'itemprop': 'brand'}),
soup.findAll('div', attrs={'id': 'product-title'}),
soup.findAll('div', attrs={'class': 'value js-part-number'}),
soup.findAll('p', attrs={'class': 'autoship-pricing p'}),
soup.findAll('span', attrs={'class': 'ga-eec__price'}),
soup.select('div.ugc')):
#date = date.today()
brand = brand.text
product = ' '.join(product.h1.text.split())
id = ' '.join(id.span.text.split())
p1 = auto_ship.text.index('(')
auto_ship = ' '.join(auto_ship.text[:p1].split())
regular_price = ' '.join(price.text.split())
rating = rating.picture.img['src'][-7:-4].replace('_', '.')
print('{:<25}{:<100}{:<15}{:<15}{:<15}{:<15}'.format(brand, product, id, auto_ship, regular_price, rating))
driver.quit()
I am trying to extract each "Overall Rating" (number value in strong tags) from each product page
https://www.guitarguitar.co.uk/product/12082017334688--epiphone-les-paul-standard-plus-top-pro-translucent-blue
The structure goes as follows:
<div class="col-sm-12">
<h2 class="line-bottom"> Customer Reviews</h2>
<h4>
Overall Rating
<strong>5</strong>
<span></span>
</h4>
</div>
I am trying to extract only the strong values.
productsRating = soup.find("div", {"class": "col-sm-12"}.h4
This sometimes works, but the page makes use of same class for different elements so it extracts un-wanted html elements.
Is there any solution to only getting the products overall reviews?
EDITED!!
this is the whole loop for my program.
for page in range(1, 2):
guitarPage = requests.get('https://www.guitarguitar.co.uk/guitars/electric/page-{}'.format(page)).text
soup = BeautifulSoup(guitarPage, 'lxml')
guitars = soup.find_all(class_='col-xs-6 col-sm-4 col-md-4 col-lg-3')
for guitar in guitars:
title_text = guitar.h3.text.strip()
print('Guitar Name: ', title_text)
price = guitar.find(class_='price bold small').text.strip()
trim = re.compile(r'[^\d.,]+')
int_price = trim.sub('', price)
print('Guitar Price: ', int_price)
priceSave = guitar.find('span', {'class': 'price save'})
if priceSave is not None:
priceOf = priceSave.text
trim = re.compile(r'[^\d.,]+')
int_priceOff = trim.sub('', priceOf)
print('Save: ', int_priceOff)
else:
print("No discount!")
image = guitar.img.get('src')
print('Guitar Image: ', image)
productLink = guitar.find('a').get('href')
linkProd = url + productLink
print('Link of product', linkProd)
productsPage.append(linkProd)
for products in productsPage:
response = requests.get(products)
soup = BeautifulSoup(response.content, "lxml")
productsDetails = soup.find("div", {"class": "description-preview"})
if productsDetails is not None:
description = productsDetails.text
print('product detail: ', description)
else:
print('none')
time.sleep(0.2)
productsRating = soup.find_all('strong')[0].text
print(productsRating)
Review info is all in a script tag you can extract and load with json. Simply enough to see how to fit that in a loop.
import requests
from bs4 import BeautifulSoup as bs
import json
url = 'https://www.guitarguitar.co.uk/product/12082017334688--epiphone-les-paul-standard-plus-top-pro-translucent-blue'
r = requests.get(url)
soup = bs(r.content, 'lxml')
script = soup.select_one('[type="application/ld+json"]').text
data = json.loads(script.strip())
overall_rating = data['#graph'][2]['aggregateRating']['ratingValue']
reviews = [review for review in data['#graph'][2]['review']] #extract what you want
Output:
Explore json
To handle no reviews you could use a simply try except:
import requests
from bs4 import BeautifulSoup as bs
import json
url = 'https://www.guitarguitar.co.uk/product/190319340849008--gibson-les-paul-standard-60s-iced-tea'
r = requests.get(url)
soup = bs(r.content, 'lxml')
script = soup.select_one('[type="application/ld+json"]').text
data = json.loads(script.strip())
try:
overall_rating = data['#graph'][2]['aggregateRating']['ratingValue']
reviews = [review for review in data['#graph'][2]['review']] #extract what you want
except: #you might want to use except KeyError
overall_rating = "None"
reviews = ['None']
or, use an if statement:
if 'aggregateRating' in script:
overall_rating = data['#graph'][2]['aggregateRating']['ratingValue']
reviews = [review for review in data['#graph'][2]['review']] #extract what you want
else:
overall_rating = "None"
reviews = ['None']
Try:
import requests
from bs4 import BeautifulSoup
url = 'https://www.guitarguitar.co.uk/product/190319340849008--gibson-les-paul-standard-60s-iced-tea'
html = requests.get(url).text
soup = BeautifulSoup(html, "lxml")
try:
productsRating = soup.find('h2', string=lambda s: "Customer reviews" in s).find_next_siblings()[0].find('strong').text
except:
productsRating = None
print(productsRating)
I have this soup:
The webpage has references of companies in a grid view (16 rows x 5 columns) and I want to retrieve each reference's url and the title. The problem is that all 5 references in each row, are in one class named row and when I'm scraping the page, I can only see the first reference of every row, instead of all 5 of them. Here is my code so far:
url = 'http://www.slimstock.com/nl/referenties/'
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
info_block = soup.find_all("div", attrs={"class": "row"})
references = pd.DataFrame(columns=['Company Name', 'Web Page'])
for entry in info_block:
try:
title = entry.find('img').get('title')
url = entry.a['href']
urlcontent = BeautifulSoup(requests.get(url).content, "lxml")
row = [{'Company Name': title, 'Web Page': url}]
references = references.append(row, ignore_index=True)
except:
pass
Is there a way to fix this?
I think you should iterate over the "img" or over the "a".
You can write something like this:
for entry in info_block:
try:
for a in entry.find_all("a"):
title = a.find('img').get('title')
url = a.get('href')
urlcontent = BeautifulSoup(requests.get(url).content, "lxml")
row = [{'Company Name': title, 'Web Page': url}]
references = references.append(row, ignore_index=True)
except:
pass
import pandas as pd
from bs4 import BeautifulSoup
import requests
url = 'http://www.slimstock.com/nl/referenties/'
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
info_block = soup.find_all("div", attrs={"class": "row"})
references = pd.DataFrame(columns=['Company Name', 'Web Page'])
for entry in info_block:
anchors = entry.find_all("a")
for a in anchors:
try:
title = a.find('img').get('title')
url = a['href']
# urlcontent = BeautifulSoup(requests.get(url).content, "lxml")
row = [{'Company Name': title, 'Web Page': url}]
references = references.append(row, ignore_index=True)
except:
pass
I've made this script but I tried few options to save data, but I keep messing up the code. How to save extracted data into and csv, or into an excel file?
import requests
from bs4 import BeautifulSoup
base_url = "http://www.privredni-imenik.com/pretraga?abcd=&keyword=&cities_id=0&category_id=0&sub_category_id=0&page=1"
current_page = 1
while current_page < 200:
print(current_page)
url = base_url + str(current_page)
#current_page += 1
r = requests.get(url)
zute_soup = BeautifulSoup(r.text, 'html.parser')
firme = zute_soup.findAll('div', {'class': 'jobs-item'})
for title in firme:
title1 = title.findAll('h6')[0].text
print(title1)
adresa = title.findAll('div', {'class': 'description'})[0].text
print(adresa)
kontakt = title.findAll('div', {'class': 'description'})[1].text
print(kontakt)
print('\n')
page_line = "{title1}\n{adresa}\n{kontakt}".format(
title1=title1,
adresa=adresa,
kontakt=kontakt
)
current_page += 1
A simple way to get a CSV would be to print each line separated by commas, and then use the operating system's ">" to write to a file.
import csv
import requests
from bs4 import BeautifulSoup
base_url = "http://www.privredni-imenik.com/pretraga?abcd=&keyword=&cities_id=0&category_id=0&sub_category_id=0&page=1"
current_page = 1
with open('scrape_results.csv', 'w', newline='') as scrape_results:
csvwriter = csv.writer(scrape_results)
while current_page < 200:
url = base_url + str(current_page)
r = requests.get(url)
zute_soup = BeautifulSoup(r.text, 'html.parser')
firme = zute_soup.findAll('div', {'class': 'jobs-item'})
for title in firme:
title1 = title.findAll('h6')[0].text
adresa = title.findAll('div', {'class': 'description'})[0].text
kontakt = title.findAll('div', {'class': 'description'})[1].text
csvwriter.writerow([current_page, title1, adresa, kontakt])
current_page += 1
I've been working on Python code to pull data from stock-trading simulation software (I get the tickers for the securities I'm trading, e.g. VTI, AAPL, GOOG, etc.), then search Morningstar for the ticker and pull pricing info and whatever else I want from that site. I save the data I want from lists into a .csv file for use in Excel. I'm using Selenium to run a webdriver (I use either Chrome to see the process visually or PhantomJS to run the program headless without browser GUI), and beautifulsoup to access the websites and work with the HTML.
I have the program working decently, but it takes upwards of 120 seconds to run through a portfolio of only 11 securities, and I am hoping to expand this program to do more elaborate actions.
Is there anything in my coding style that could be changed to speed up the webscraping process? Are there any general methods of writing Python code to allow for fast execution?
Here's the code:
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
#browser = webdriver.Chrome() #replace with .Firefox(), or with the browser of your choice
browser = webdriver.PhantomJS()
security_links_list = list()
equitysim_ticker_list = list()
url = ['https://www.equitysim.com/Home'
]
for item in url:
browser.get(item) #navigate to page behind login
username = browser.find_element_by_id('placeholderContent_txtUserName')
username.send_keys('EDITED_FOR_SECURITY')
password = browser.find_element_by_id('placeholderContent_txtPassword')
password.send_keys('EDITED_FOR_SECURITY')
form = browser.find_element_by_id("placeholderContent_LoginButton")
form.click()
innerHTML = browser.execute_script("return document.body.innerHTML") #returns the inner HTML as a string
innerHTML = browser.page_source
html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')
table_a = soup.find('table', 'ba-tbl admintable')
for a in table_a.find_all('a', href=True):
security_links_list.append(a['href'])
links_set = set(security_links_list)
links_set.remove('#')
print(links_set)
mystring = "https://www.equitysim.com"
links_set_revised = [mystring + link_text for link_text in links_set]
print(links_set_revised)
for item in links_set_revised:
browser.get(item)
innerHTML = browser.execute_script("return document.body.innerHTML") #returns the inner HTML as a string
innerHTML = browser.page_source
html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')
title_element = soup.find("title")
title = title_element.text
ticker = title.split(':', 1)[0]
ticker = ticker.replace('\n','')
ticker = ticker.replace('\t','')
equitysim_ticker_list.append(ticker)
print(equitysim_ticker_list)
morningstar_ticker_search = "http://quote.morningstar.com/TickerLookup.html"
uri_links = list()
for ticker in equitysim_ticker_list:
browser.get(morningstar_ticker_search)
enter_ticker = browser.find_element_by_xpath("//input[#value='Ticker']")
enter_ticker.click()
search_ticker = browser.find_element_by_class_name('F3')
search_ticker.send_keys(ticker)
go_box = browser.find_element_by_xpath("//input[#src='http://im.morningstar.com/im/go.gif']")
go_box.click()
html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')
outer_div = soup.find('div', attrs={'id': 'quote_quicktake'})
iframe = outer_div.find('iframe').get('src')
full_url = 'https:' + iframe
uri_links.append(full_url)
print(uri_links)
price_list = list()
ticker_list = list()
nav_list = list()
for item in uri_links:
browser.get(item) #navigate to page behind login
innerHTML = browser.execute_script("return document.body.innerHTML") #returns the inner HTML as a string
innerHTML = browser.page_source
html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')
price_element = soup.find("div", attrs={"id": "lastPrice"})
price = price_element.text # strip() is used to remove starting and trailing
nav_element = soup.find("span", attrs={"id": "NAV"} or {"vkey": "NAV"})
nav = nav_element.text
nav_split1 = nav.split('\n \t\t', 1)[1]
nav_split2 = nav_split1.split(' ', 1)[0]
title_element = soup.find("title")
title = title_element.text
ticker = title.split(' ', 1)[0]
price_list.append(price)
nav_list.append(nav_split2)
ticker_list.append(ticker)
print(ticker)
print(price)
print(nav_split2)
#ticker =
print(ticker_list)
print(price_list)
print(nav_list)
csvfile = "C:\\Users\\USERNAME\\AppData\\Local\\Programs\\Python\\Python36\\personal\\exampleCsv.csv"
#Assuming res is a flat list
with open(csvfile, "w") as output:
writer = csv.writer(output,lineterminator='')
writer.writerow(ticker_list)
writer.writerow('\n')
writer.writerow(price_list)
writer.writerow('\n')
writer.writerow(nav_list)