Selenium parsing Amazon (Python)

Hello, I'm trying to parse all of the star ratings as text (4, 1, 4, 2, etc.):
driver.get('https://www.amazon.com/gp/new-releases/kitchen/ref=zg_bs_tab_t_bsnr')
elements = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".a-icon-alt")))
for i in elements:
    i = i.get_attribute("innerHTML")
    i = i.split(' ')[0]
    list3.append(i)
I want to parse the review stars, and if a review star does not exist, print something else in its place.

Try:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.set_window_size(1024, 600)
driver.maximize_window()
url = "https://www.amazon.com/gp/new-releases/kitchen/ref=zg_bs_tab_t_bsnr"
driver.get(url)
time.sleep(2)
soup=bs(driver.page_source,'html.parser')
for card in soup.findAll('span', {'class': 'aok-inline-block zg-item'}):
    elem = card.find('span', {'class': 'a-icon-alt'})
    if elem:
        print(elem.text.split()[0])
    else:
        print("no")
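If you prefer to stay entirely in Selenium, a minimal sketch of the same idea (assuming the same zg-item card markup and class names used above) could look like this:

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

cards = driver.find_elements(By.CSS_SELECTOR, "span.aok-inline-block.zg-item")
for card in cards:
    try:
        stars = card.find_element(By.CSS_SELECTOR, "span.a-icon-alt")
        # innerHTML holds text like "4.5 out of 5 stars"; take the leading number
        print(stars.get_attribute("innerHTML").split()[0])
    except NoSuchElementException:
        print("no rating")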

Related

Selenium error: 'list' object has no attribute 'text'

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.suwon.go.kr/sw-www/www01/www01-01.jsp?q_tabs=open')
driver.implicitly_wait(3)
content = driver.find_element(By.TAG_NAME, 'iframe')
driver.switch_to.frame(content)
dropdown = Select(driver.find_element(By.XPATH, '//*[@id="dateType"]'))
dropdown.select_by_index(4)
driver.find_element(By.XPATH, '//*[@id="searchBtn"]').click()
complaint_list = []
contents_list = []
def complaint_Scraping():
    for i in range(1, 78):
        titles = driver.find_elements(By.CSS_SELECTOR, 'tbody > tr > td.left')
        for complaint in titles:
            name = BeautifulSoup(complaint.text, "html.parser")
            complaint_list.append(name)
        a = driver.find_elements(By.CSS_SELECTOR, 'tbody > tr > td.left > a')
        for content in a:
            content.click()
            time.sleep(2)
            ancient_html = driver.find_elements(By.XPATH, '//*[@id="txt"]/div[1]/div[1]/div/div[2]')
            content = BeautifulSoup(ancient_html.text, "html.parser")
            contents_list.append(content)
            driver.back()
complaint_Scraping()
I don't know what's wrong here. I can get all the titles, but it doesn't work when I try to get the contents behind the titles. The first page may work, but the rest don't. Please help me solve the problem.
ancient_html = driver.find_elements(By.XPATH, '//*[@id="txt"]/div[1]/div[1]/div/div[2]')
content = BeautifulSoup(ancient_html.text, "html.parser")
find_elements() returns a list of elements, so ancient_html is a list and has no .text attribute. Use find_element() (singular) to get a single element, or index/iterate the list, before calling .text.
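A minimal sketch of the fix, assuming the target div exists on each detail page:

# Either take a single element ...
ancient_html = driver.find_element(By.XPATH, '//*[@id="txt"]/div[1]/div[1]/div/div[2]')
content = BeautifulSoup(ancient_html.text, "html.parser")

# ... or keep find_elements() and iterate over the resulting list.
for el in driver.find_elements(By.XPATH, '//*[@id="txt"]/div[1]/div[1]/div/div[2]'):
    contents_list.append(BeautifulSoup(el.text, "html.parser"))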

Page takes 2 loads to complete

On this website, https://toptees.store/linux-funny-cloud-computing, I'm trying to scrape the "sold" span text, but the page takes two loads to render completely, which is why the data is not scraped.
My Code:
import requests
from bs4 import BeautifulSoup
url = "https://toptees.store/linux-funny-cloud-computing"
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'lxml')
sold = soup.find_all("span", class_='ng-binding')
print(sold)
I also tried Selenium together with BeautifulSoup:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install())) # ,options=options
filepath = 'urls.txt'
with open(filepath) as f:
    urls = [i.strip() for i in f.readlines()]
titles = []
for url in urls:
    driver.get(url)
    driver.maximize_window()
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    sold = soup.find('span', class_="ng-binding")
    print(sold)
The output comes out like this: []. How can I scrape this link with BeautifulSoup?
This is one way to get that information you're after:
import time as t
from bs4 import BeautifulSoup as bs
import undetected_chromedriver as uc
options = uc.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument('--disable-notifications')
options.add_argument("--window-size=1280,720")
# options.add_argument('--headless')
browser = uc.Chrome(options=options)
url = 'https://toptees.store/linux-funny-cloud-computing'
browser.get(url)
t.sleep(1)
browser.get(url)
t.sleep(7)
soup = bs(browser.page_source, 'html.parser')
sold = soup.select_one('days-available[any-sold="campaign.sold"]')
title = soup.select_one('h1.campaign-name-title')
print(title.text, '|', [x for x in sold.text.split(' ') if len(x.strip()) > 0][0])
Result printed in terminal:
Linux funny Cloud Computing | 43
For undetected_chromedriver, please see https://pypi.org/project/undetected-chromedriver/ [instructions on how to set it up, etc]
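If the fixed sleeps prove flaky, one possible refinement (a sketch, reusing the browser, url and selector from the answer above) is to wait explicitly for the "sold" element after the second load instead of sleeping for a fixed seven seconds:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser.get(url)
browser.get(url)  # the site needs a second load to render the Angular data
# wait up to 15 s for the element carrying the sold count to appear
WebDriverWait(browser, 15).until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'days-available[any-sold="campaign.sold"]')))
soup = bs(browser.page_source, 'html.parser')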

Scrape permalinks to answers posted under a certain question on Quora via Python-Selenium Web Driver

I am a beginner in Python-Selenium scraping. I want to scrape permalink of all the Quora answers posted under a question. So far I have created the following code snippet. But when I run it, it gives me only one link in the output. This is due to the fact that the page isn't loaded fully I guess. What should I do to get at least 100 permalinks to answers from the page source?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver_option = webdriver.ChromeOptions()
driver_option.add_argument("--incognito")
chromedriver_path = './chromedriver'
def create_webdriver():
    return webdriver.Chrome(executable_path=chromedriver_path, chrome_options=driver_option)

f = open('file_text.txt', 'w')
# Open the website
browser = create_webdriver()
browser.get("https://www.quora.com/How-do-I-prove-the-flat-earth-theory")
projects = browser.find_elements_by_xpath("//a[@class='answer_permalink']")
for proj in projects:
    anslink = proj.get_attribute('href')
    f.write(anslink)
f.close()
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.quora.com/How-do-I-prove-the-flat-earth-theory")
soup = BeautifulSoup(r.text, 'html.parser')
for item in soup.findAll("a", {'class': 'answer_permalink'}):
    print(item.get("href"))
Output:
/How-do-I-prove-the-flat-earth-theory/answer/Dave-Morgan-14
/How-do-I-prove-the-flat-earth-theory/answer/Ken-Natco
/How-do-I-prove-the-flat-earth-theory/answer/Matthew-Writer
/How-do-I-prove-the-flat-earth-theory/answer/Chance-Thompson-13
/How-do-I-prove-the-flat-earth-theory/answers/27223260
/How-do-I-prove-the-flat-earth-theory/answers/26836797
/How-do-I-prove-the-flat-earth-theory/answer/Frida-Schiess
/How-do-I-prove-the-flat-earth-theory/answer/Pierre-Ripplinger
/How-do-I-prove-the-flat-earth-theory/answer/Jacob-Fu
/How-do-I-prove-the-flat-earth-theory/answer/Mike-Howells-4
/How-do-I-prove-the-flat-earth-theory/answer/Mick-Stute
/How-do-I-prove-the-flat-earth-theory/answer/Jesse-Bridges-III
/How-do-I-prove-the-flat-earth-theory/answer/Renard-Leblanc
/How-do-I-prove-the-flat-earth-theory/answers/26831140
/How-do-I-prove-the-flat-earth-theory/answers/27158717
/How-do-I-prove-the-flat-earth-theory/answer/Chris-Lockwood-4
/How-do-I-prove-the-flat-earth-theory/answer/David-Minger
/How-do-I-prove-the-flat-earth-theory/answer/Rick-Brown-50
/How-do-I-prove-the-flat-earth-theory/answer/Jacques-Malan-4
/How-do-I-prove-the-flat-earth-theory/answer/Robert-Lent-1
/How-do-I-prove-the-flat-earth-theory/answers/79419339
/How-do-I-prove-the-flat-earth-theory/answer/Dave-Consiglio
/How-do-I-prove-the-flat-earth-theory/answers/65113366
/How-do-I-prove-the-flat-earth-theory/answer/Krishnabh-Medhi
Selenium Approach:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time

options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
driver.get(
    'https://www.quora.com/How-do-I-prove-the-flat-earth-theory')
lenOfPage = driver.execute_script(
    "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
match = False
while (match == False):
    lastCount = lenOfPage
    lenOfPage = driver.execute_script(
        "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    if lastCount >= 51000:
        break
soup = BeautifulSoup(driver.page_source, 'html.parser')
count = 0
for item in soup.findAll("a", {'class': 'answer_permalink'}):
    count += 1
    print(item.get("href"))
print(count)
driver.quit()
Output:
/How-do-I-prove-the-flat-earth-theory/answer/Dave-Morgan-14
/How-do-I-prove-the-flat-earth-theory/answer/Ken-Natco
/How-do-I-prove-the-flat-earth-theory/answer/Matthew-Writer
/How-do-I-prove-the-flat-earth-theory/answer/Chance-Thompson-13
/How-do-I-prove-the-flat-earth-theory/answers/27223260
/How-do-I-prove-the-flat-earth-theory/answers/26836797
/How-do-I-prove-the-flat-earth-theory/answer/Frida-Schiess
/How-do-I-prove-the-flat-earth-theory/answer/Pierre-Ripplinger
/How-do-I-prove-the-flat-earth-theory/answer/Jacob-Fu
/How-do-I-prove-the-flat-earth-theory/answer/Mike-Howells-4
/How-do-I-prove-the-flat-earth-theory/answer/Mick-Stute
/How-do-I-prove-the-flat-earth-theory/answer/Jesse-Bridges-III
/How-do-I-prove-the-flat-earth-theory/answer/Renard-Leblanc
/How-do-I-prove-the-flat-earth-theory/answers/26831140
/How-do-I-prove-the-flat-earth-theory/answer/Danya-Rose
/How-do-I-prove-the-flat-earth-theory/answer/Chris-Lockwood-4
/How-do-I-prove-the-flat-earth-theory/answer/David-Minger
/How-do-I-prove-the-flat-earth-theory/answer/Rick-Brown-50
/How-do-I-prove-the-flat-earth-theory/answer/Jacques-Malan-4
/How-do-I-prove-the-flat-earth-theory/answer/Robert-Lent-1
/How-do-I-prove-the-flat-earth-theory/answer/John-Lind-22
/How-do-I-prove-the-flat-earth-theory/answer/Dave-Consiglio
/How-do-I-prove-the-flat-earth-theory/answers/65113366
/How-do-I-prove-the-flat-earth-theory/answer/Krishnabh-Medhi
/How-do-I-prove-the-flat-earth-theory/answers/44569062
/How-do-I-prove-the-flat-earth-theory/answer/Abd-Ul-Rahman-Lomax
/How-do-I-prove-the-flat-earth-theory/answer/Helmut-Walle
/How-do-I-prove-the-flat-earth-theory/answer/Ed-Kohlwey-1
/How-do-I-prove-the-flat-earth-theory/answer/Jason-Ree-4
/How-do-I-prove-the-flat-earth-theory/answer/Drew-Curry
/How-do-I-prove-the-flat-earth-theory/answer/Darrel-Blakely-2
/How-do-I-prove-the-flat-earth-theory/answer/Alexander-Kunz-2
/How-do-I-prove-the-flat-earth-theory/answer/Michael-Greenberg-61
/How-do-I-prove-the-flat-earth-theory/answer/Matthew-Schenker
/How-do-I-prove-the-flat-earth-theory/answer/Gregory-Hart-8
/How-do-I-prove-the-flat-earth-theory/answer/Mark-Giammattei
/How-do-I-prove-the-flat-earth-theory/answer/Vernon-Bender
/How-do-I-prove-the-flat-earth-theory/answer/Brett-Evill
/How-do-I-prove-the-flat-earth-theory/answer/Kurt-Mager
/How-do-I-prove-the-flat-earth-theory/answer/Michael-Brenner-13
/How-do-I-prove-the-flat-earth-theory/answer/Luke-Anderson-87
/How-do-I-prove-the-flat-earth-theory/answer/Sassa-Neuf
/How-do-I-prove-the-flat-earth-theory/answer/Spandan-Mallick
/How-do-I-prove-the-flat-earth-theory/answers/58252346
/How-do-I-prove-the-flat-earth-theory/answer/Timothy-Lamothe
/How-do-I-prove-the-flat-earth-theory/answer/Eric-Schwertfeger
/How-do-I-prove-the-flat-earth-theory/answers/70843234
/How-do-I-prove-the-flat-earth-theory/answer/Tony-Flury
/How-do-I-prove-the-flat-earth-theory/answer/Aji-Jijo
/How-do-I-prove-the-flat-earth-theory/answer/Tia-Eastlake
/How-do-I-prove-the-flat-earth-theory/answer/Michael-Grace-53
/How-do-I-prove-the-flat-earth-theory/answer/Ray-Mason-30
/How-do-I-prove-the-flat-earth-theory/answer/Jimmy-May-2
/How-do-I-prove-the-flat-earth-theory/answer/Thomas-Edward-Samuel-Thomas
/How-do-I-prove-the-flat-earth-theory/answer/Alan-Atkinson-4
/How-do-I-prove-the-flat-earth-theory/answer/Joseph-Perkins-11
/How-do-I-prove-the-flat-earth-theory/answer/David-Ridlen
/How-do-I-prove-the-flat-earth-theory/answer/Charles-Li-86
/How-do-I-prove-the-flat-earth-theory/answers/140610748
/How-do-I-prove-the-flat-earth-theory/answer/Corentin-Oger
/How-do-I-prove-the-flat-earth-theory/answer/Jean-Pierre-Choisy
/How-do-I-prove-the-flat-earth-theory/answer/Tom-Kubin
/How-do-I-prove-the-flat-earth-theory/answers/120618033
/How-do-I-prove-the-flat-earth-theory/answer/Charles-Brenchley-1
/How-do-I-prove-the-flat-earth-theory/answer/Jonathan-Johnson-41
/How-do-I-prove-the-flat-earth-theory/answer/Edward-Teach-53
/How-do-I-prove-the-flat-earth-theory/answer/Tony-Price-50
/How-do-I-prove-the-flat-earth-theory/answer/Nathaniel-Day-8
/How-do-I-prove-the-flat-earth-theory/answer/Nuurussubchiy-Fikriy
/How-do-I-prove-the-flat-earth-theory/answers/150581075
/How-do-I-prove-the-flat-earth-theory/answers/87762707
/How-do-I-prove-the-flat-earth-theory/answer/Neil-219
/How-do-I-prove-the-flat-earth-theory/answer/Alex-Frantz-1
/How-do-I-prove-the-flat-earth-theory/answer/Andy-P-Zbinden
/How-do-I-prove-the-flat-earth-theory/answer/Uriel-Anderson-4
/How-do-I-prove-the-flat-earth-theory/answer/Chris-OLeary-19
/How-do-I-prove-the-flat-earth-theory/answer/Daniel-Gerber-7
/How-do-I-prove-the-flat-earth-theory/answer/Roy-Wilson-64
/How-do-I-prove-the-flat-earth-theory/answer/Randy-Wonsowicz-Jr
/How-do-I-prove-the-flat-earth-theory/answer/Leslie-Harrington-4
/How-do-I-prove-the-flat-earth-theory/answer/Eddie-Olsson
/How-do-I-prove-the-flat-earth-theory/answer/Vincent-Emery
/How-do-I-prove-the-flat-earth-theory/answer/Maxwell-Perry-3
/How-do-I-prove-the-flat-earth-theory/answer/Matthew-Granovsky
/How-do-I-prove-the-flat-earth-theory/answers/83259600
/How-do-I-prove-the-flat-earth-theory/answer/Benjamin-Dixon-17
/How-do-I-prove-the-flat-earth-theory/answer/John-Chambers-75
/How-do-I-prove-the-flat-earth-theory/answer/Ryne-Hanz
/How-do-I-prove-the-flat-earth-theory/answer/Eric-Rodriguez-137
/How-do-I-prove-the-flat-earth-theory/answer/Robert-Hopkins-90
/How-do-I-prove-the-flat-earth-theory/answer/Sasha-Maddah
/How-do-I-prove-the-flat-earth-theory/answer/Owen-Lee-126
/How-do-I-prove-the-flat-earth-theory/answer/David-Phillips-133
/How-do-I-prove-the-flat-earth-theory/answer/Hasan-Poonawala-1
/How-do-I-prove-the-flat-earth-theory/answer/Cristiano-Dal-Vi
/How-do-I-prove-the-flat-earth-theory/answer/Rex-Newborn
/How-do-I-prove-the-flat-earth-theory/answer/John-Neumann-9
/How-do-I-prove-the-flat-earth-theory/answer/Josh-D-Davis
/How-do-I-prove-the-flat-earth-theory/answer/Maruthi-Sreenath
/How-do-I-prove-the-flat-earth-theory/answer/Clint-Morgan-2
/How-do-I-prove-the-flat-earth-theory/answer/Nicholas-Volkmuth
/How-do-I-prove-the-flat-earth-theory/answer/Richard-Swim
/How-do-I-prove-the-flat-earth-theory/answers/143504277
/How-do-I-prove-the-flat-earth-theory/answer/Christer-Svanström
/How-do-I-prove-the-flat-earth-theory/answer/Steve-Schlackman-2
/How-do-I-prove-the-flat-earth-theory/answers/147597845
/How-do-I-prove-the-flat-earth-theory/answer/Rene-Dukundane-Felix
/How-do-I-prove-the-flat-earth-theory/answers/148753762
/How-do-I-prove-the-flat-earth-theory/answer/Henk-Schuring
/How-do-I-prove-the-flat-earth-theory/answers/135814117
/How-do-I-prove-the-flat-earth-theory/answer/Emilio-Trampuz
/How-do-I-prove-the-flat-earth-theory/answers/40529643
/How-do-I-prove-the-flat-earth-theory/answer/Karl-Sangree
/How-do-I-prove-the-flat-earth-theory/answer/Ted-Carriker
/How-do-I-prove-the-flat-earth-theory/answer/egi-syahban
/How-do-I-prove-the-flat-earth-theory/answer/Mayank-Dahiya-12
/How-do-I-prove-the-flat-earth-theory/answer/Robert-Jones-741
/How-do-I-prove-the-flat-earth-theory/answer/Jimmi-Carlsson-1
/How-do-I-prove-the-flat-earth-theory/answer/Cole-Johnson-24
/How-do-I-prove-the-flat-earth-theory/answer/Kram-Redarsh
/How-do-I-prove-the-flat-earth-theory/answers/64915389
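The 51000-pixel cut-off in the scroll loop above is arbitrary. A rough sketch of a more general stopping condition (my addition, not part of the original answer) is to keep scrolling until the page height stops growing:

import time

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the lazy-loaded answers a moment to render
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:  # no new content loaded, stop scrolling
        break
    last_height = new_height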

How to scrape review data present in Read more in Flipkart reviews

I am trying to scrape Flipkart to extract reviews for a product using the requests and BeautifulSoup packages. How can I get the data hidden behind the "Read more" click event in those reviews?
from selenium import webdriver
from selenium.webdriver.common.by import By
from contextlib import closing
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import urllib2
import re
from bs4 import BeautifulSoup
import unicodedata
def remove_non_ascii_1(text):
    return ''.join([i if ord(i) < 128 else ' ' for i in text])

with closing(Firefox()) as browser:
    site = "https://www.flipkart.com/asus-zenfone-2-laser-ze550kl-black-16-gb/product-reviews/itme9j58yzyzqzgc?pid=MOBE9J587QGMXBB7"
    browser.get(site)
    file = open("review.txt", "w")
    for count in range(1, 10):
        nav_btns = browser.find_elements_by_class_name('_33m_Yg')
        button = ""
        for btn in nav_btns:
            number = int(btn.text)
            if (number == count):
                button = btn
                break
        button.send_keys(Keys.RETURN)
        WebDriverWait(browser, timeout=10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))
        read_more_btns = browser.find_elements_by_class_name('_1EPkIx')
        for rm in read_more_btns:
            browser.execute_script("return arguments[0].scrollIntoView();", rm)
            browser.execute_script("window.scrollBy(0, -150);")
            rm.click()
        page_source = browser.page_source
        soup = BeautifulSoup(page_source, "lxml")
        ans = soup.find_all("div", class_="_3DCdKt")
        for tag in ans:
            title = unicode(tag.find("p", class_="_2xg6Ul").string).replace(u"\u2018", "'").replace(u"\u2019", "'")
            title = remove_non_ascii_1(title)
            title.encode('ascii', 'ignore')
            content = tag.find("div", class_="qwjRop").div.prettify().replace(u"\u2018", "'").replace(u"\u2019", "'")
            content = remove_non_ascii_1(content)
            content.encode('ascii', 'ignore')
            content = content[15:-7]
            votes = tag.find_all("span", class_="_1_BQL8")
            upvotes = int(votes[0].string)
            downvotes = int(votes[1].string)
            file.write("Review Title : %s\n\n" % title)
            file.write("Upvotes : " + str(upvotes) + "\n\nDownvotes : " + str(downvotes) + "\n\n")
            file.write("Review Content :\n%s\n\n\n\n" % content)
    file.close()
Usage:
Install the requirements by running pip install bs4 selenium.
Add geckodriver to the PATH. Follow these instructions.
Put the link of the product in site variable inside the script.
Run the script by running python scrape.py.
Reviews will be saved in the file review.txt.
Had some issues using @CSMaverick's code while accessing the READ MORE link. Modified the code as per my requirements.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup as bs
def get_source_code(browser):
    rm_btns = browser.find_elements_by_class_name('_1BWGvX')
    for rm_btn in rm_btns:
        rm_btn.click()
    return browser.page_source

def collect_reviews_attributes(html):
    soup_obj = bs(html, "html.parser")
    text_tag_divs = soup_obj.find_all('div', attrs={"class": "t-ZTKy"})
    heading_tag_divs = soup_obj.find_all('p', attrs={"class": "_2-N8zT"})
    rating_tag_divs = soup_obj.find_all('div', attrs={"class": "_3LWZlK _1BLPMq"})
    text_tags = [tag.text for tag in text_tag_divs]
    heading_tags = [tag.text for tag in heading_tag_divs]
    rating_tags = [tag.text for tag in rating_tag_divs]
    return list(zip(heading_tags, text_tags, rating_tags))

collector_list = []
browser = webdriver.Firefox(executable_path=r"path to\geckodriver.exe")
url = "https://www.flipkart.com/samsung-253-l-frost-free-double-door-3-star-convertible-refrigerator/product-reviews/itmf75fa1554bad3?pid=RFRFNDEEJ28SNQPG&lid=LSTRFRFNDEEJ28SNQPGEJ3YHJ&sortOrder=MOST_HELPFUL&certifiedBuyer=false&aid=overall"
num_pages = 3  # get from the url dynamically, or else give a large number and keep going until you hit an exception
browser.get(url)  # open the url in the browser
for _ in range(num_pages):
    page_source_code = get_source_code(browser)
    collector_list.extend(collect_reviews_attributes(page_source_code))
    next_page = browser.find_elements_by_class_name('_1LKTO3')[-1]  # previous and next are under the same class; access the last element
    next_page.click()
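The snippet above only accumulates the tuples in memory. As a small follow-up sketch (my addition, not part of the original answer), collector_list could be written out to a CSV file like this:

import csv

# collector_list holds (heading, review_text, rating) tuples collected above
with open("flipkart_reviews.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["heading", "review", "rating"])
    writer.writerows(collector_list)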

Web scraping using selenium

My intention is to get the name, location, time of posting, title of the review and the whole review content from the web page (http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061).
My code:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'
driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup = BeautifulSoup(driver.page_source,"lxml")
for link in soup.select(".profile"):
    try:
        profile = link.select("p:nth-of-type(1) a")[0]
        profile1 = link.select("p:nth-of-type(2)")[0]
    except:
        pass
    print(profile.text, profile1.text)

driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup1 = BeautifulSoup(driver.page_source, "lxml")
for link in soup1.select(".col-10.review"):
    try:
        profile2 = link.select("small:nth-of-type(1)")[0]
        profile3 = link.select("span:nth-of-type(3)")[0]
        profile4 = link.select("a:nth-of-type(1)")[0]
    except:
        pass
    print(profile2.text, profile3.text, profile4.text)

driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup2 = BeautifulSoup(driver.page_source, "lxml")
for link in soup2.select(".more.review"):
    try:
        containers = page_soup.findAll("div", {"class": "more reviewdata"})
        count = len(containers)
        for index in range(count):
            count1 = len(containers[index].p)
            for i in range(count1):
                profile5 = link.select("p:nth-of-type(i)")[0]
    except:
        pass
    print(profile5.text)
driver.quit()
I am getting the output for the name, location, time, and title of the review, but I am unable to get the full review text of a user. I would be grateful if anyone could help me get that output, and also help me optimize my code so that it extracts the required data by loading the web page only once. It would also be very helpful if someone could show me how to extract all of the customer reviews of Jio from every page of the website.
You can achieve the same with a few lines of code and much less pain. I've covered three main fields here, name, review_title, and review_data; the remaining fields you can tweak in very easily.
This is how you can do alternatively:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061")
wait = WebDriverWait(driver, 10)
for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
    link = item.find_element_by_css_selector(".reviewdata a")
    link.click()
    time.sleep(2)
    name = item.find_element_by_css_selector("p a").text
    review_title = item.find_element_by_css_selector("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]").text
    review_data = ' '.join([' '.join(items.text.split()) for items in item.find_elements_by_css_selector(".reviewdata")])
    print("Name: {}\nReview_Title: {}\nReview_Data: {}\n".format(name, review_title, review_data))
driver.quit()
Or, to do the same by combining Selenium and bs4:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061")
wait = WebDriverWait(driver, 10)
for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
    link = items.find_element_by_css_selector(".reviewdata a")
    link.click()
    time.sleep(2)
soup = BeautifulSoup(driver.page_source, "lxml")
for item in soup.select(".review-article"):
    name = item.select("p a")[0].text
    review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
    review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])
    print("Name: {}\nReview_Title: {}\nReview_Data: {}\n".format(name, review_title, review_data))
driver.quit()
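The asker also wanted reviews from every page. The answers above don't cover pagination; as a rough sketch (the selector for the "next" link is a hypothetical placeholder and would need to be checked against the live site), you could keep clicking the next-page control and repeat the extraction until it disappears:

from selenium.common.exceptions import NoSuchElementException

while True:
    # ... extract the reviews on the current page as shown above ...
    try:
        # hypothetical selector for the pager's "next" link; verify it on the real page
        next_link = driver.find_element_by_css_selector("ul.pagination li.next a")
        next_link.click()
        time.sleep(2)
    except NoSuchElementException:
        break  # no further pages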
