Scrape all amenities and Calendar data from AirBnB - python

I manage to scrape a lot of information from AirBnB but i have to questions.
This is my code for scraping several information such as price, rating etc.
Imports
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import requests, re
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()
time.sleep(5)
Main code
url = 'https://www.airbnb.com/s/Thessaloniki--Greece/homes?tab_id=home_tab&flexible_trip_lengths%5B%5D=one_week&refinement_paths%5B%5D=%2Fhomes&place_id=ChIJ7eAoFPQ4qBQRqXTVuBXnugk&query=Thessaloniki%2C%20Greece&date_picker_type=calendar&search_type=user_map_move&price_filter_input_type=0&ne_lat=40.66256734970964&ne_lng=23.003752862853986&sw_lat=40.59051931897441&sw_lng=22.892087137145978&zoom=13&search_by_map=true&federated_search_session_id=1ed21e1c-0c5e-4529-ab84-267361eac02b&pagination_search=true&items_offset={offset}&section_offset=2'
data = []
for offset in range(0,40,20):
driver.get(url.format(offset=offset))
time.sleep(2)
soup=BeautifulSoup(driver.page_source, 'lxml')
detailed_pages = []
for card in soup.select('div[class="c4mnd7m dir dir-ltr"]'):
link = 'https://www.airbnb.com' + card.select_one('a[class="ln2bl2p dir dir-ltr"]')['href']
detailed_pages.append(link)
for page in detailed_pages:
driver.get(page)
time.sleep(3)
soup2=BeautifulSoup(driver.page_source, 'lxml')
room_type = soup2.select_one('div._tqmy57')
room_type = room_type.text if room_type else None
r= requests.get(page)
p_lat = re.compile(r'"lat":([-0-9.]+),')
p_lng = re.compile(r'"lng":([-0-9.]+),')
lat = p_lat.findall(r.text)[0]
lng = p_lng.findall(r.text)[0]
room_id = page[29: link.index("?")]
titles = soup2.select_one('span._1n81at5')
titles = titles.text if titles else None
price = soup2.select_one('span._tyxjp1')
price = price.text if price else None
rating= soup2.select_one('span._12si43g')
rating = rating.text if rating else None
Bedroom_area = soup2.select_one('div[class="_1a5glfg"]')
Bedroom_area = Bedroom_area.text if Bedroom_area else None
place_offers= ', '.join([x.get_text(strip=True) for x in soup2.select('[class="sewcpu6 dir dir-ltr"]+div:nth-of-type(3) > div')])
data.append({
'Room_ID':room_id,
'titles':titles,
'place_offers': place_offers,
'price':price,
'rating':rating,
'Bedroom_area': Bedroom_area,
'Room_Type': room_type,
'Latitude':lat,
'Longitude':lng
})
df=pd.DataFrame(data)
df
The first question is how can I click on buttons like amenities, description etc. and scrape them, since in the landing page we just have some information about this but not all the info.
I know that there is a function .click() in sellenium but i am trying the following code:
soup2.select_one('div.b6xigss dir dir-ltr').click()
but I am getting that error: 'NoneType' object has no attribute 'click' .
The second question is how can I scrape the calendar data and which dates are blocked or not ?

There are few problems:
click() works only with Selenium (driver.find_element()) but not with BeautifulSoup (soup2.select_one()) - so first you have to use different function
for some reasons it can't find 'div.b6xigss.dir.dir-ltr' but it finds 'div.b6xigss button' (To make sure I search button because div can be "unclickable")
there is message about cookies and it hides this element and selenium can't click. It would need to close this message (accept cookies), or it would need to scroll page to move button in visible place, or it needs to use JavaScript (driver.execute_script()) to click it.
This works for me
button = driver.find_element(By.CSS_SELECTOR, 'div.b6xigss button')
driver.execute_script('arguments[0].click()', button)
Miniamal working code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()
url = 'https://www.airbnb.com/s/Thessaloniki--Greece/homes?tab_id=home_tab&flexible_trip_lengths%5B%5D=one_week&refinement_paths%5B%5D=%2Fhomes&place_id=ChIJ7eAoFPQ4qBQRqXTVuBXnugk&query=Thessaloniki%2C%20Greece&date_picker_type=calendar&search_type=user_map_move&price_filter_input_type=0&ne_lat=40.66256734970964&ne_lng=23.003752862853986&sw_lat=40.59051931897441&sw_lng=22.892087137145978&zoom=13&search_by_map=true&federated_search_session_id=1ed21e1c-0c5e-4529-ab84-267361eac02b&pagination_search=true&items_offset={offset}&section_offset=2'
p_lat = re.compile(r'"lat":([-0-9.]+),')
p_lng = re.compile(r'"lng":([-0-9.]+),')
data = []
for offset in range(0, 40, 20):
print('offset:', offset)
driver.get(url.format(offset=offset))
time.sleep(2)
soup = BeautifulSoup(driver.page_source, 'lxml')
detailed_pages = []
for card in soup.select('div[class="c4mnd7m dir dir-ltr"] a[class="ln2bl2p dir dir-ltr"]'):
link = 'https://www.airbnb.com' + card['href']
detailed_pages.append(link)
print('len(detailed_pages):', len(detailed_pages))
for number, page in enumerate(detailed_pages, 1):
print(number, 'page:', page)
driver.get(page)
time.sleep(5)
soup2 = BeautifulSoup(driver.page_source, 'lxml')
room_type = soup2.select_one('div._tqmy57')
room_type = room_type.text if room_type else None
#r= requests.get(page).text
r = driver.page_source
lat = p_lat.findall(r)[0]
lng = p_lng.findall(r)[0]
room_id = page[29: link.index("?")]
titles = soup2.select_one('span._1n81at5')
titles = titles.text if titles else None
price = soup2.select_one('span._tyxjp1')
price = price.text if price else None
rating= soup2.select_one('span._12si43g')
rating = rating.text if rating else None
bedroom_area = soup2.select_one('div[class="_1a5glfg"]')
bedroom_area = bedroom_area.text if bedroom_area else None
place_offers= ', '.join([x.get_text(strip=True) for x in soup2.select('[class="sewcpu6 dir dir-ltr"]+div:nth-of-type(3) > div')])
try:
button = driver.find_element(By.CSS_SELECTOR, 'div.b6xigss button')
driver.execute_script('arguments[0].click()', button)
except Exception as ex:
print('Exception:', ex)
data.append({
'Room_ID': room_id,
'titles': titles,
'place_offers': place_offers,
'price': price,
'rating': rating,
'Bedroom_area': bedroom_area,
'Room_Type': room_type,
'Latitude': lat,
'Longitude': lng
})
df = pd.DataFrame(data)
df.to_csv('output.csv')
print(df)
EDIT:
As for calendar: every date has aria-disabled=True or aria-disabled=False and you can use aria-disabled to detect dates in calendar and later you can get value from aria-disabled like from any other attribute - item["aria-disabled"]
EDIT:
This works for me
for number, page in enumerate(detailed_pages, 1):
print(number, 'page:', page)
driver.get(page)
time.sleep(5)
# ... other code ...
xpath = '//div[#aria-label="Calendar"]//div[#data-testid]'
for item in driver.find_elements(By.XPATH, xpath):
date = item.get_attribute("data-testid")
blocked = item.get_attribute("data-is-day-blocked")
print(blocked, '|', date)
Result like this:
true | calendar-day-09/18/2022
true | calendar-day-09/19/2022
true | calendar-day-09/20/2022
false | calendar-day-09/21/2022
false | calendar-day-09/22/2022
false | calendar-day-09/23/2022

Related

Append data wrong in csv file

from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
URL = 'https://mergr.com/firms/search/employees?page=1&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'
driver.get(URL)
email=driver.find_element(By.CSS_SELECTOR,"input#username")
email.send_keys("timgr8#outlook.com")
password=driver.find_element(By.CSS_SELECTOR,"input#password")
password.send_keys("Cosmos1990$$$$$$$")
login=driver.find_element(By.CSS_SELECTOR,"button.btn").click()
urls=[]
product=[]
soup = BeautifulSoup(driver.page_source,"lxml")
details=soup.select("tbody tr")
for detail in details:
try:
t1 =detail.select_one("h5.profile-title a").text
except:
pass
wev={
'Name':t1,
}
product.append(wev)
page_links =driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
href=link.get_attribute("href")
urls.append(href)
for url in urls:
driver.get(url)
soup = BeautifulSoup(driver.page_source,"lxml")
try:
website=soup.select_one("p.adress-info a[target='_blank']").text
except:
website=''
data={
'website':website
}
product.append(data)
df=pd.DataFrame(product)
df.to_csv('firm.csv')
The data of the website will be down in to CSV file as shown in pic is I am appending the data in wrong way why is data moving down where I am wrong ...Kindly recommend where I am wrong there .......
I want output in these format Kindly suggest solution for these...I want output in these format as you shown below...
You can't append wev and data separately - you need website and name in the same dictionary for pandas to know that they belong to same row.
You could add the websites in a separate list like
sites = []
# for url in urls:
# driver.get...
# soup = ....
# try:....except:....
data={
'website':website
}
sites.append(data)
and then zip and combine:
for pi, dictPair in enumerate(zip(product, sites)):
product[pi].update(dictPair[1])
df = pd.DataFrame(product)
df.to_csv('firm.csv')
However, I don't think it's the best way to make sure the right Names and Websites are matched up.
You should just add to the same dictionary for each row from the start instead of zipping and merging.
added_urls = []
product = []
soup = BeautifulSoup(driver.page_source,"lxml")
details = soup.select("tbody tr")
for detail in details:
try:
t1 = detail.select_one("h5.profile-title a").text
except:
# pass # then you'll just be using the previous row's t1
# [also, if this happens in the first loop, it will raise an error]
t1 = 'MISSING' # '' #
wev = {
'Name':t1,
}
href = detail.select_one("h5.profile-title + p a[href]")
if href and href.get("href", '').startswith('http'):
wev['page_link'] = href.get("href")
added_urls.append(href.get("href"))
product.append(wev)
### IF YOU WANT ROWS THAT CAN'T BE CONNECTED TO NAMES ###
page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
if href in added_urls: continue # skip links that are already added
href = link.get_attribute("href")
# urls.append(href)
added_urls.append(href)
product.append({"page_link": href})
##########################################################
for pi, prod in enumerate(product):
if "page_link" not in prod or not prod["page_link"]: continue ## missing link
url = prod["page_link"]
driver.get(url)
soup = BeautifulSoup(driver.page_source,"lxml")
try:
website=soup.select_one("p.adress-info a[target='_blank']").text
except:
website=''
del product[pi]["page_link"] ## REMOVE this line IF you want a page_link column in csv
# data={'website':website}
# product.append(data)
product[pi]['website'] = website
df=pd.DataFrame(product)
df.to_csv('firm.csv')

Having an issue while scraping the print preview page using selenium webdriver in python

Having an issue to scrape the print preview page and export it into the CSV file. When clicking on the Print Preview url instead of providing only text data it is also giving the source code in it.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import os
from openpyxl.workbook import Workbook
from pandas import ExcelWriter
url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)
WebDriverWait(driver,
20).until(EC.element_to_be_clickable((By.XPATH,"//div[#class='search-
pro-details']//a[contains(.,'Search Project Details')]"))).click()
Registered_Project_radio= WebDriverWait(driver,
10).until(EC.element_to_be_clickable((By.ID,"Promoter")))
driver.execute_script("arguments[0].click();",Registered_Project_radio)
Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P50500000005")
Search = WebDriverWait(driver,
10).until(EC.element_to_be_clickable((By.ID,"btnSearch")))
driver.execute_script("arguments[0].click();",Search)
View = [item.get_attribute('href') for item in
driver.find_elements_by_tag_name("a")]
print(View)
print(View)
driver.get(View)
request = urllib.request.Request(View)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html , 'html.parser')
print(soup.text)
View = [item.get_attribute('href') for item in
driver.find_elements_by_tag_name("a")]
is list a list eg. ['https://maharerait.mahaonline.gov.in/PrintPreview/PrintPreview?q=MB0agrub1IW1Z63O5lldJdHpk6le6np5EB3HZ3jy8r7qPsLpYPdQwJzwE0S5LXG3fqQe%2fUe6HTGYmXstD%2bcYtATvmObra1R4xBa7L235mdTlmH0wHJPnps0ZXvbDMZxA0Hf9fxpOLM%2ba3Ad13hq9M1bp8Agvb%2bCLA3KOgpoYwr0%3d', None, None, None, None]
which is containt ulr and None element.
Replace your code after driver.execute_script("arguments[0].click();",Search) statement to below code:
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a") if
item.get_attribute('href') is not None]
for url in View:
request = urllib.request.Request(url)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')
divPInfo = soup.find("div", {"id": "DivPInfo"})
title = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find("div", {'class': 'x_title'}).find(
"h2").text.strip()
x_contentObject = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find_all("div", {'class': 'col-md-3'})
my_dict = {title: {x_contentObject[0].text.strip(): x_contentObject[1].text.strip()}}
print(my_dict)
O/P:
{'General Information': {'Information Type': 'Other Than Individual'}}
enter code here
driver.get(View)
div_2 = driver.find_element_by_xpath("//div[#id='fldFirm']").text
print(div_2)
table = pd.read_html(driver.page_source)
#print(table)
#df.to_csv("trial.csv", sep=',',index = False)
div_3 = driver.find_element_by_xpath("//div[#id='DivProject']").text
print(div_3)
file2 = open("MyFile2.txt","w")
file2.writelines(div_3)
From reference to above Code, I Want to Scrape the second heading (FSI Details)from the div of id = DivProject .But I am unable to get the Second heading.
enter code here
divPInfo2 = soup.find("div", {"id": "DivProject"})
Project_title1 = divPInfo2.find("div", {'class': 'x_panel'},
recursive=False).find("div", {'class': 'x_title'}).find(
"h2")[1].get_text(strip=True)
print(Project_title1)

how to scrapy information from one directory by selenium

scraping contact information from the directory site
I am scraping contact information from the directory site.
this is not a link
I need scrape by selenium. it needs 3 steps,
1. get the company url from website.
2. get all company url from next page/ all pages.
3. scrape all contact information such as company name, website, email. etc.
the code as below, but I face two problem.
# -*- coding: utf-8 -*-
from time import sleep
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
results = list()
driver = webdriver.Chrome('D:\chromedriver_win32\chromedriver.exe')
MAX_PAGE_NUM = 2
for i in range(1, MAX_PAGE_NUM):
page_num = str(i)
url ="http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control/" + page_num
driver.get(url)
sleep(5)
sel = Selector(text=driver.page_source)
companies = sel.xpath('//*[#id="categorypagehtml"]/div[1]/div[7]/ul/li/b//#href').extract()
for i in range(0, len(companies)):
print(companies[i])
results.append(companies[i])
print('---')
for result in results:
url1 = "http://www.arabianbusinesscommunity.com" +result
print(url1)
driver.get(url1)
sleep(5)
sel = Selector(text=driver.page_source)
name = sel.css('h2::text').extract_first()
country = sel.xpath('//*[#id="companypagehtml"]/div[1]/div[2]/ul[1]/li[1]/span[4]/text()').extract_first()
if country:
country = country.strip()
web = sel.xpath('//*[#id="companypagehtml"]/div[1]/div[2]/ul[1]/li[4]/a/#href').extract_first()
email = sel.xpath('//a[contains(#href, "mailto:")]/#href').extract_first()
records = []
records.append((web,email,country,name))
df = pd.DataFrame(records, columns=['web','email', 'country', 'name'])
I write the code as above, but I have two problem.
1. I only can get the last company information.
2.each time it is iteration from the loop, computer always click all urls that clicked before.
can anyone help solve the problem?
Here code to get all companies details from all pages:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
baseUrl = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control"
driver.get(baseUrl)
wait = WebDriverWait(driver, 5)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".search-result-list li")))
# Get last page number
lastPageHref = driver.find_element(By.CSS_SELECTOR, ".PagedList-skipToLast a").get_attribute("href")
hrefArray = lastPageHref.split("/")
lastPageNum = int(hrefArray[len(hrefArray) - 1])
# Get all URLs for the first page and save them in companyUrls list
js = 'return [...document.querySelectorAll(".search-result-list li b a")].map(e=>e.href)'
companyUrls = driver.execute_script(js)
# Iterate through all pages and get all companies URLs
for i in range(2, lastPageNum):
driver.get(baseUrl + "/" + str(i))
companyUrls.extend(driver.execute_script(js))
# Open each company page and get all details
companies = []
for url in companyUrls:
driver.get(url)
company = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#companypagehtml")))
name = company.find_element_by_css_selector("h2").text
email = driver.execute_script('var e = document.querySelector(".email"); if (e!=null) { return e.textContent;} return "";')
website = driver.execute_script('var e = document.querySelector(".website"); if (e!=null) { return e.textContent;} return "";')
phone = driver.execute_script('var e = document.querySelector(".phone"); if (e!=null) { return e.textContent;} return "";')
fax = driver.execute_script('var e = document.querySelector(".fax"); if (e!=null) { return e.textContent;} return "";')
country = company.find_element_by_xpath(".//li[#class='location']/span[last()]").text.replace(",", "").strip()
address = ''.join([e.text.strip() for e in company.find_elements_by_xpath(".//li[#class='location']/span[position() != last()]")])

How to scrape review data present in Read more in Flipkart reviews

I am trying to scrape Flipkart to extract reviews for a product using request and beautifulsoup package.how can take out data present in Read more click event present in those review.
from selenium import webdriver
from selenium.webdriver.common.by import By
from contextlib import closing
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import urllib2
import re
from bs4 import BeautifulSoup
import unicodedata
def remove_non_ascii_1(text):
return ''.join([i if ord(i) < 128 else ' ' for i in text])
with closing(Firefox()) as browser:
site = "https://www.flipkart.com/asus-zenfone-2-laser-ze550kl-black-16-gb/product-reviews/itme9j58yzyzqzgc?pid=MOBE9J587QGMXBB7"
browser.get(site)
file = open("review.txt", "w")
for count in range(1, 10):
nav_btns = browser.find_elements_by_class_name('_33m_Yg')
button = ""
for btn in nav_btns:
number = int(btn.text)
if(number==count):
button = btn
break
button.send_keys(Keys.RETURN)
WebDriverWait(browser, timeout=10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))
read_more_btns = browser.find_elements_by_class_name('_1EPkIx')
for rm in read_more_btns:
browser.execute_script("return arguments[0].scrollIntoView();", rm)
browser.execute_script("window.scrollBy(0, -150);")
rm.click()
page_source = browser.page_source
soup = BeautifulSoup(page_source, "lxml")
ans = soup.find_all("div", class_="_3DCdKt")
for tag in ans:
title = unicode(tag.find("p", class_="_2xg6Ul").string).replace(u"\u2018", "'").replace(u"\u2019", "'")
title = remove_non_ascii_1(title)
title.encode('ascii','ignore')
content = tag.find("div", class_="qwjRop").div.prettify().replace(u"\u2018", "'").replace(u"\u2019", "'")
content = remove_non_ascii_1(content)
content.encode('ascii','ignore')
content = content[15:-7]
votes = tag.find_all("span", class_="_1_BQL8")
upvotes = int(votes[0].string)
downvotes = int(votes[1].string)
file.write("Review Title : %s\n\n" % title )
file.write("Upvotes : " + str(upvotes) + "\n\nDownvotes : " + str(downvotes) + "\n\n")
file.write("Review Content :\n%s\n\n\n\n" % content )
file.close()
Usage:
Install the requirements by running pip install bs4 selenium.
Add geckodriver to the PATH. Follow these instructions.
Put the link of the product in site variable inside the script.
Run the script by running python scrape.py.
Reviews will be saved in the file review.txt.
Had some issues using #CSMaverick code while accessing the READ MORE link. Modified the code as per my requirement.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup as bs
def get_source_code(browser):
rm_btns = browser.find_elements_by_class_name('_1BWGvX')
for rm_btn in rm_btns:
rm_btn.click()
return browser.page_source
def collect_reviews_attributes(html):
soup_obj = bs(html, "html.parser")
text_tag_divs = soup_obj.find_all('div', attrs={"class", "t-ZTKy"})
heading_tag_divs = soup_obj.find_all('p', attrs={"class", "_2-N8zT"})
rating_tag_divs = soup_obj.find_all('div', attrs={"class", "_3LWZlK _1BLPMq"})
text_tags = [tag.text for tag in text_tag_divs]
heading_tags = [tag.text for tag in heading_tag_divs]
rating_tags = [tag.text for tag in rating_tag_divs]
return list(zip(heading_tags, text_tags, rating_tags))
collector_list = []
browser = webdriver.Firefox(executable_path=r"path to\geckodriver.exe")
url = "https://www.flipkart.com/samsung-253-l-frost-free-double-door-3-star-convertible-refrigerator/product-reviews/itmf75fa1554bad3?pid=RFRFNDEEJ28SNQPG&lid=LSTRFRFNDEEJ28SNQPGEJ3YHJ&sortOrder=MOST_HELPFUL&certifiedBuyer=false&aid=overall"
num_pages = 3 # get from the url dynamically or else give large number and try hitting until u get exception
browser.get(url) # open the url in the browser
for _ in range(num_pages):
page_source_code = get_source_code(browser)
collector_list.extend(collect_reviews_attributes(page_source_code))
next_page = browser.find_elements_by_class_name('_1LKTO3')[-1] # previous and next are under same class. Access last element
next_page.click()

Web scraping using selenium

My intention is to get the name, location, time of posting, title of the review and the whole review content from the web page (http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061).
My code :
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'
driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup = BeautifulSoup(driver.page_source,"lxml")
for link in soup.select(".profile"):
try:
profile = link.select("p:nth-of-type(1) a")[0]
profile1 = link.select("p:nth-of-type(2)")[0]
except:pass
print(profile.text,profile1.text)
driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup1 = BeautifulSoup(driver.page_source,"lxml")
for link in soup1.select(".col-10.review"):
try:
profile2 = link.select("small:nth-of-type(1)")[0]
profile3 = link.select("span:nth-of-type(3)")[0]
profile4 = link.select("a:nth-of-type(1)")[0]
except:pass
print(profile2.text,profile3.text,profile4.text)
driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup2 = BeautifulSoup(driver.page_source,"lxml")
for link in soup2.select(".more.review"):
try:
containers=page_soup.findAll("div",{"class":"more reviewdata"})
count=len(containers)
for index in range(count):
count1=len(containers[index].p)
for i in range(count1):
profile5 = link.select("p:nth-of-type(i)")[0]
except:pass
print(profile5.text)
driver.quit()
I am getting the output for name, location, time and title of the review but I am unable to get the full review of a user. I would be grateful, if anyone could help me in getting the output for the same, along with the optimization of my code (i.e) I want my code to extract the required data by loading the web page only once. Also, It would be very helpful for me if someone could help me in extracting all the customer reviews of Jio from all the webpages of the website.
You can achieve the same with few lines of code along with lesser pain. However, I've defined here three main categories, as in name, review_title, review_data and the rest of the fields you can twitch very easily.
This is how you can do alternatively:
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061")
wait = WebDriverWait(driver, 10)
for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
link = item.find_element_by_css_selector(".reviewdata a")
link.click()
time.sleep(2)
name = item.find_element_by_css_selector("p a").text
review_title = item.find_element_by_css_selector("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]").text
review_data = ' '.join([' '.join(items.text.split()) for items in item.find_elements_by_css_selector(".reviewdata")])
print("Name: {}\nReview_Title: {}\nReview_Data: {}\n".format(name, review_title, review_data))
driver.quit()
Or to do the same combinedly (selenium + bs4):
from bs4 import BeautifulSoup
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061")
wait = WebDriverWait(driver, 10)
for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
link = items.find_element_by_css_selector(".reviewdata a")
link.click()
time.sleep(2)
soup = BeautifulSoup(driver.page_source,"lxml")
for item in soup.select(".review-article"):
name = item.select("p a")[0].text
review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])
print("Name: {}\nReview_Title: {}\nReview_Data: {}\n".format(name, review_title, review_data))
driver.quit()

Categories