Web scraping using Selenium - Python

My intention is to get the name, location, time of posting, title of the review and the whole review content from the web page (http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061).
My code:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'

driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup = BeautifulSoup(driver.page_source, "lxml")
for link in soup.select(".profile"):
    try:
        profile = link.select("p:nth-of-type(1) a")[0]
        profile1 = link.select("p:nth-of-type(2)")[0]
    except: pass
    print(profile.text, profile1.text)

driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup1 = BeautifulSoup(driver.page_source, "lxml")
for link in soup1.select(".col-10.review"):
    try:
        profile2 = link.select("small:nth-of-type(1)")[0]
        profile3 = link.select("span:nth-of-type(3)")[0]
        profile4 = link.select("a:nth-of-type(1)")[0]
    except: pass
    print(profile2.text, profile3.text, profile4.text)

driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup2 = BeautifulSoup(driver.page_source, "lxml")
for link in soup2.select(".more.review"):
    try:
        containers = page_soup.findAll("div", {"class": "more reviewdata"})
        count = len(containers)
        for index in range(count):
            count1 = len(containers[index].p)
            for i in range(count1):
                profile5 = link.select("p:nth-of-type(i)")[0]
    except: pass
    print(profile5.text)
driver.quit()
I am getting the output for the name, location, time, and title of the review, but I am unable to get the full review text of a user. I would be grateful if anyone could help me get that output and also optimize my code so that it extracts all the required data while loading the web page only once. It would also be very helpful if someone could show me how to extract all the customer reviews of Jio from every page of the website.

You can achieve the same thing with fewer lines of code and much less pain. I've handled the three main fields here (name, review_title, and review_data); the selectors for the remaining fields are easy to tweak.
Here is an alternative approach:
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061")
wait = WebDriverWait(driver, 10)

for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
    link = item.find_element_by_css_selector(".reviewdata a")
    link.click()
    time.sleep(2)
    name = item.find_element_by_css_selector("p a").text
    review_title = item.find_element_by_css_selector("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]").text
    review_data = ' '.join([' '.join(items.text.split()) for items in item.find_elements_by_css_selector(".reviewdata")])
    print("Name: {}\nReview_Title: {}\nReview_Data: {}\n".format(name, review_title, review_data))
driver.quit()
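As for the follow-up about collecting reviews from all the pages: a minimal sketch of the usual pagination pattern, reusing driver, wait, By, EC, and time from the snippet above. The "a.next" selector is an assumption; inspect the site's real next-page link and adjust it.
# Sketch: click through the pager until no next-page link is found.
from selenium.common.exceptions import NoSuchElementException

while True:
    for item in wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, ".review-article"))):
        link = item.find_element_by_css_selector(".reviewdata a")
        link.click()
        time.sleep(2)
        # ... extract name / review_title / review_data exactly as above ...
    try:
        driver.find_element_by_css_selector("a.next").click()  # hypothetical pager selector
        time.sleep(2)  # let the next page render
    except NoSuchElementException:
        break  # no next-page link left, so this was the last page
driver.quit()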
Or, to do the same with Selenium and BeautifulSoup combined:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061")
wait = WebDriverWait(driver, 10)

# Expand every truncated review first, so the full text is in the page source.
for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
    link = items.find_element_by_css_selector(".reviewdata a")
    link.click()
    time.sleep(2)

soup = BeautifulSoup(driver.page_source, "lxml")
for item in soup.select(".review-article"):
    name = item.select("p a")[0].text
    review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
    review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])
    print("Name: {}\nReview_Title: {}\nReview_Data: {}\n".format(name, review_title, review_data))
driver.quit()

How to scrape each product page (comments and customer country)

I am trying to scrape each product page from the AliExpress website in order to get the number of comments, the number of photos published by the customer, and the customer's country, and put them into a dataframe.
I have written code that scrapes the customer country, but I don't know how to get the number of customer comments and the number of images.
This is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
wait = WebDriverWait(driver, 10)
driver.execute_script("arguments[0].scrollIntoView();", wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))))
driver.get(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))).get_attribute('src'))

data = []
while True:
    for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):
        try:
            country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
        except:
            country = None
        data.append({
            'country': country,
        })
    try:
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next'))).click()
    except:
        break

pd.DataFrame(data).to_csv('filename.csv', index=False)
I would appreciate any help. Thank you!
If you want the number of comments/reviews, you can just read the value with:
driver.find_element(By.XPATH, 'XPATH_OF_ELEMENT_TO_SCRAPE')
To do so in your example, add this outside your loop:
number_feedbacks = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]/div[1]')
number_images = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]//label[1]/em')
If you don't understand or don't know this function, please feel free to ask and I will explain where I found these XPaths. We can also use the find-by-id function.
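As a minimal sketch of that id-based lookup (reusing driver and By from above; the id "transction-feedback" is taken from the XPaths, not verified independently):
# Same section located by id instead of XPath.
feedback_section = driver.find_element(By.ID, "transction-feedback")
print(feedback_section.text)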
In your code it would be:
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
wait = WebDriverWait(driver, 10)
driver.execute_script("arguments[0].scrollIntoView();", wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))))
driver.get(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))).get_attribute('src'))

data = []
# .text gets the displayed value rather than the WebElement object.
number_feedbacks = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]/div[1]').text
number_images = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]//label[1]/em').text
print(f'number_feedbacks = {number_feedbacks}\nnumber_images = {number_images}')

while True:
    for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):
        try:
            country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
        except:
            country = None
        data.append({
            'country': country,
        })
    try:
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next'))).click()
    except:
        break

pd.DataFrame(data).to_csv('filename.csv', index=False)
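If you also want those two numbers in the CSV, one option (a sketch, reusing the variables from the code above) is to attach them as columns before saving:
# Sketch: add the page-level counts to every row of the dataframe.
df = pd.DataFrame(data)
df['number_feedbacks'] = number_feedbacks
df['number_images'] = number_images
df.to_csv('filename.csv', index=False)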

Reading weblink from dataframe throws "stale element reference: element is not attached to the page document" error

I have a dataframe that contains links to Google reviews of two restaurants. I wanted to load all reviews of the two restaurants (one by one) into the browser and then save them into a new dataframe. I wrote a script that reads and loads all reviews into the browser as follows:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
link_df (a dataframe with a single Link column):
0 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
1 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
import glob  # needed for glob.glob below

i = 0
driver = webdriver.Chrome()
for index, i in link_df.iterrows():
    base_url = i['Link']  # link_df['Link'][i]
    driver.get(base_url)
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()
    print('Restaurant number is ', index)
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text
    total_reviews_text = driver.find_element_by_xpath("//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
    time.sleep(2)
    total_reviews = len(all_reviews)
    while total_reviews < num_reviews:
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
        time.sleep(5)
        all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
        print(total_reviews)
        total_reviews += 1
    reviews_info = driver.find_elements_by_xpath("//div[@class='jxjCjc']")
    review_information = pd.DataFrame(columns=["Restaurant title", "Restaurant rating", "Total reviews", "Reviewer Name", "Rating", "Review"])
    name = ''
    rating = ''
    text = ''
    for index, review_info in enumerate(reviews_info):
        name = review_info.find_element_by_xpath("./div/div/a").text
        rating = review_info.find_element_by_xpath(".//div[@class='PuaHbe']//g-review-stars//span").get_attribute('aria-label')
        text = review_info.find_element_by_xpath(".//div[@class='Jtu6Td']//span").text
        review_information.at[len(review_information)] = [title, overall_rating, num_reviews, name, rating, text]
    filename = 'Google_reviews' + ' ' + pd.to_datetime("now").strftime("%Y_%m_%d") + '.csv'
    files_present = glob.glob(filename)
    if files_present:
        review_information.to_csv(filename, index=False, mode='a', header=False)
    else:
        review_information.to_csv(filename, index=False)
    driver.get('https://www.google.com')
    time.sleep(3)
The problem is that the script throws an error when it reaches the following line:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
It throws the following error:
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=95.0.4638.69)
When I tried the same program without storing the Google links in a dataframe (i.e., no for loop, and base_url set directly to a Google review link instead of base_url = i['Link']), it works fine.
I am not sure where I am making the mistake. Any suggestion or help to fix the issue would be highly appreciated.
EDIT
You put the creation of the driver outside the for loop.
You can't launch the new URL with GPS data while the first popup is still in front; if you do, it stays in the background. The easier way is to load a neutral URL without GPS data (https://www.google.com) and wait about 3 seconds before continuing your loop.
Your count logic is not right; I have changed your selector, changed how the total is computed, and commented out some lines.
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.options import Options
import time

link_df = ["https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
           "https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
           ]

i = 0
binary = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
options = Options()
options.binary = binary
driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
# I have to launch it one time to accept the cookies manually
# by setting a breakpoint after, but you don't have that I think
# driver.get(link_df[0])
print("Headless Firefox Initialized")
print(link_df)

for url in link_df:
    base_url = url  # i['Link'] # link_df['Link'][i]
    print(base_url)
    driver.get(base_url)
    # 'Avis les plus récents' is 'Newest' in my French locale
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Avis les plus récents']]"))).click()
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text
    total_reviews_text = driver.find_element_by_xpath(
        "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
    # time.sleep(2)
    total_reviews = 0
    while total_reviews < num_reviews:
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
        all_reviews = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
        total_reviews = len(all_reviews)
        print(total_reviews, len(all_reviews))
    driver.get('https://www.google.com')  # or driver.close() if no bugs
    time.sleep(3)

driver.close()
driver.quit()
It seems the solution for Chrome needs some fixes:
org.openqa.selenium.StaleElementReferenceException: stale element reference: element is not attached to the page document
The literal meaning: the referenced element is out of date and no longer attached to the current page, usually because the page has been refreshed or navigated. The solution is to reuse findElement or findElements to locate the element again.
So it seems Chrome has a refreshing problem. I suggest re-loading the list of elements before scrolling, so you have a fresh copy of the DOM items, and I had to add a one-second wait at the end of the while loop.
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.action_chains import ActionChains
#from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
import time

link_df = [
    "https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
    "https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]

i = 0
binaryfirefox = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
binarychrome = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'

options = Options()
#cap = DesiredCapabilities().CHROME
#cap["marionette"] = True
#cap = DesiredCapabilities().FIREFOX
#options.binary = binaryfirefox
#driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
options.binary_location = binarychrome
driver = webdriver.Chrome(options=options, executable_path="E:\\Téléchargement\\chromedriver.exe")
# Same reason as for Firefox: I have to load one URL first
# to accept the cookies manually
#driver.get(link_df[0])

print(link_df)
for url in link_df:
    base_url = url  # i['Link'] # link_df['Link'][i]
    print(base_url)
    driver.get(base_url)
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text
    total_reviews_text = driver.find_element_by_xpath(
        "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
    # time.sleep(2)
    total_reviews = 0
    while total_reviews < num_reviews:
        # Reload the element list to avoid the stale-element exception
        # (or trap the scroll with try/except, but that is more expensive)
        all_reviews = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        total_reviews = len(all_reviews)
        print(total_reviews, len(all_reviews))
        time.sleep(1)
    driver.get('https://www.google.com')  # or driver.close() if no bugs
    time.sleep(3)

driver.close()
driver.quit()
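The pattern behind both fixes is the same: whenever the DOM may have been re-rendered, re-locate elements before acting on them. A small hedged helper (a sketch, not part of the original answer) that retries an action when the element goes stale:
# Sketch: retry an action on a freshly located element when it goes stale.
from selenium.common.exceptions import StaleElementReferenceException

def retry_on_stale(locate, act, attempts=3):
    """locate() returns a fresh element; act(element) performs the action."""
    for _ in range(attempts):
        try:
            return act(locate())
        except StaleElementReferenceException:
            continue  # the DOM changed: locate again and retry
    raise StaleElementReferenceException("element kept going stale")

# Usage sketch with the selectors from the answer above:
# retry_on_stale(
#     lambda: driver.find_elements(By.CSS_SELECTOR,
#         '#reviewSort .gws-localreviews__google-review')[-1],
#     lambda el: driver.execute_script('arguments[0].scrollIntoView(true);', el))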

Scrape permalinks to answers posted under a certain question on Quora via Python-Selenium Web Driver

I am a beginner in Python/Selenium scraping. I want to scrape the permalinks of all the Quora answers posted under a question. So far I have created the following code snippet, but when I run it, it gives me only one link in the output. I guess this is because the page isn't fully loaded. What should I do to get at least 100 answer permalinks from the page source?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver_option = webdriver.ChromeOptions()
driver_option.add_argument("--incognito")
chromedriver_path = './chromedriver'

def create_webdriver():
    return webdriver.Chrome(executable_path=chromedriver_path, chrome_options=driver_option)

f = open('file_text.txt', 'w')

# Open the website
browser = create_webdriver()
browser.get("https://www.quora.com/How-do-I-prove-the-flat-earth-theory")
projects = browser.find_elements_by_xpath("//a[@class='answer_permalink']")
for proj in projects:
    anslink = proj.get_attribute('href')
    f.write(anslink)
f.close()
Requests Approach:
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.quora.com/How-do-I-prove-the-flat-earth-theory")
soup = BeautifulSoup(r.text, 'html.parser')
for item in soup.findAll("a", {'class': 'answer_permalink'}):
    print(item.get("href"))
Output:
/How-do-I-prove-the-flat-earth-theory/answer/Dave-Morgan-14
/How-do-I-prove-the-flat-earth-theory/answer/Ken-Natco
/How-do-I-prove-the-flat-earth-theory/answer/Matthew-Writer
/How-do-I-prove-the-flat-earth-theory/answer/Chance-Thompson-13
/How-do-I-prove-the-flat-earth-theory/answers/27223260
/How-do-I-prove-the-flat-earth-theory/answers/26836797
/How-do-I-prove-the-flat-earth-theory/answer/Frida-Schiess
/How-do-I-prove-the-flat-earth-theory/answer/Pierre-Ripplinger
/How-do-I-prove-the-flat-earth-theory/answer/Jacob-Fu
/How-do-I-prove-the-flat-earth-theory/answer/Mike-Howells-4
/How-do-I-prove-the-flat-earth-theory/answer/Mick-Stute
/How-do-I-prove-the-flat-earth-theory/answer/Jesse-Bridges-III
/How-do-I-prove-the-flat-earth-theory/answer/Renard-Leblanc
/How-do-I-prove-the-flat-earth-theory/answers/26831140
/How-do-I-prove-the-flat-earth-theory/answers/27158717
/How-do-I-prove-the-flat-earth-theory/answer/Chris-Lockwood-4
/How-do-I-prove-the-flat-earth-theory/answer/David-Minger
/How-do-I-prove-the-flat-earth-theory/answer/Rick-Brown-50
/How-do-I-prove-the-flat-earth-theory/answer/Jacques-Malan-4
/How-do-I-prove-the-flat-earth-theory/answer/Robert-Lent-1
/How-do-I-prove-the-flat-earth-theory/answers/79419339
/How-do-I-prove-the-flat-earth-theory/answer/Dave-Consiglio
/How-do-I-prove-the-flat-earth-theory/answers/65113366
/How-do-I-prove-the-flat-earth-theory/answer/Krishnabh-Medhi
Selenium Approach:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time

options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
driver.get('https://www.quora.com/How-do-I-prove-the-flat-earth-theory')

# Keep scrolling to the bottom so that more answers lazy-load.
lenOfPage = driver.execute_script(
    "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
match = False
while match == False:
    lastCount = lenOfPage
    lenOfPage = driver.execute_script(
        "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    if lastCount >= 51000:
        break

soup = BeautifulSoup(driver.page_source, 'html.parser')
count = 0
for item in soup.findAll("a", {'class': 'answer_permalink'}):
    count += 1
    print(item.get("href"))
print(count)
driver.quit()
Output:
/How-do-I-prove-the-flat-earth-theory/answer/Dave-Morgan-14
/How-do-I-prove-the-flat-earth-theory/answer/Ken-Natco
/How-do-I-prove-the-flat-earth-theory/answer/Matthew-Writer
/How-do-I-prove-the-flat-earth-theory/answer/Chance-Thompson-13
/How-do-I-prove-the-flat-earth-theory/answers/27223260
/How-do-I-prove-the-flat-earth-theory/answers/26836797
/How-do-I-prove-the-flat-earth-theory/answer/Frida-Schiess
/How-do-I-prove-the-flat-earth-theory/answer/Pierre-Ripplinger
/How-do-I-prove-the-flat-earth-theory/answer/Jacob-Fu
/How-do-I-prove-the-flat-earth-theory/answer/Mike-Howells-4
/How-do-I-prove-the-flat-earth-theory/answer/Mick-Stute
/How-do-I-prove-the-flat-earth-theory/answer/Jesse-Bridges-III
/How-do-I-prove-the-flat-earth-theory/answer/Renard-Leblanc
/How-do-I-prove-the-flat-earth-theory/answers/26831140
/How-do-I-prove-the-flat-earth-theory/answer/Danya-Rose
/How-do-I-prove-the-flat-earth-theory/answer/Chris-Lockwood-4
/How-do-I-prove-the-flat-earth-theory/answer/David-Minger
/How-do-I-prove-the-flat-earth-theory/answer/Rick-Brown-50
/How-do-I-prove-the-flat-earth-theory/answer/Jacques-Malan-4
/How-do-I-prove-the-flat-earth-theory/answer/Robert-Lent-1
/How-do-I-prove-the-flat-earth-theory/answer/John-Lind-22
/How-do-I-prove-the-flat-earth-theory/answer/Dave-Consiglio
/How-do-I-prove-the-flat-earth-theory/answers/65113366
/How-do-I-prove-the-flat-earth-theory/answer/Krishnabh-Medhi
/How-do-I-prove-the-flat-earth-theory/answers/44569062
/How-do-I-prove-the-flat-earth-theory/answer/Abd-Ul-Rahman-Lomax
/How-do-I-prove-the-flat-earth-theory/answer/Helmut-Walle
/How-do-I-prove-the-flat-earth-theory/answer/Ed-Kohlwey-1
/How-do-I-prove-the-flat-earth-theory/answer/Jason-Ree-4
/How-do-I-prove-the-flat-earth-theory/answer/Drew-Curry
/How-do-I-prove-the-flat-earth-theory/answer/Darrel-Blakely-2
/How-do-I-prove-the-flat-earth-theory/answer/Alexander-Kunz-2
/How-do-I-prove-the-flat-earth-theory/answer/Michael-Greenberg-61
/How-do-I-prove-the-flat-earth-theory/answer/Matthew-Schenker
/How-do-I-prove-the-flat-earth-theory/answer/Gregory-Hart-8
/How-do-I-prove-the-flat-earth-theory/answer/Mark-Giammattei
/How-do-I-prove-the-flat-earth-theory/answer/Vernon-Bender
/How-do-I-prove-the-flat-earth-theory/answer/Brett-Evill
/How-do-I-prove-the-flat-earth-theory/answer/Kurt-Mager
/How-do-I-prove-the-flat-earth-theory/answer/Michael-Brenner-13
/How-do-I-prove-the-flat-earth-theory/answer/Luke-Anderson-87
/How-do-I-prove-the-flat-earth-theory/answer/Sassa-Neuf
/How-do-I-prove-the-flat-earth-theory/answer/Spandan-Mallick
/How-do-I-prove-the-flat-earth-theory/answers/58252346
/How-do-I-prove-the-flat-earth-theory/answer/Timothy-Lamothe
/How-do-I-prove-the-flat-earth-theory/answer/Eric-Schwertfeger
/How-do-I-prove-the-flat-earth-theory/answers/70843234
/How-do-I-prove-the-flat-earth-theory/answer/Tony-Flury
/How-do-I-prove-the-flat-earth-theory/answer/Aji-Jijo
/How-do-I-prove-the-flat-earth-theory/answer/Tia-Eastlake
/How-do-I-prove-the-flat-earth-theory/answer/Michael-Grace-53
/How-do-I-prove-the-flat-earth-theory/answer/Ray-Mason-30
/How-do-I-prove-the-flat-earth-theory/answer/Jimmy-May-2
/How-do-I-prove-the-flat-earth-theory/answer/Thomas-Edward-Samuel-Thomas
/How-do-I-prove-the-flat-earth-theory/answer/Alan-Atkinson-4
/How-do-I-prove-the-flat-earth-theory/answer/Joseph-Perkins-11
/How-do-I-prove-the-flat-earth-theory/answer/David-Ridlen
/How-do-I-prove-the-flat-earth-theory/answer/Charles-Li-86
/How-do-I-prove-the-flat-earth-theory/answers/140610748
/How-do-I-prove-the-flat-earth-theory/answer/Corentin-Oger
/How-do-I-prove-the-flat-earth-theory/answer/Jean-Pierre-Choisy
/How-do-I-prove-the-flat-earth-theory/answer/Tom-Kubin
/How-do-I-prove-the-flat-earth-theory/answers/120618033
/How-do-I-prove-the-flat-earth-theory/answer/Charles-Brenchley-1
/How-do-I-prove-the-flat-earth-theory/answer/Jonathan-Johnson-41
/How-do-I-prove-the-flat-earth-theory/answer/Edward-Teach-53
/How-do-I-prove-the-flat-earth-theory/answer/Tony-Price-50
/How-do-I-prove-the-flat-earth-theory/answer/Nathaniel-Day-8
/How-do-I-prove-the-flat-earth-theory/answer/Nuurussubchiy-Fikriy
/How-do-I-prove-the-flat-earth-theory/answers/150581075
/How-do-I-prove-the-flat-earth-theory/answers/87762707
/How-do-I-prove-the-flat-earth-theory/answer/Neil-219
/How-do-I-prove-the-flat-earth-theory/answer/Alex-Frantz-1
/How-do-I-prove-the-flat-earth-theory/answer/Andy-P-Zbinden
/How-do-I-prove-the-flat-earth-theory/answer/Uriel-Anderson-4
/How-do-I-prove-the-flat-earth-theory/answer/Chris-OLeary-19
/How-do-I-prove-the-flat-earth-theory/answer/Daniel-Gerber-7
/How-do-I-prove-the-flat-earth-theory/answer/Roy-Wilson-64
/How-do-I-prove-the-flat-earth-theory/answer/Randy-Wonsowicz-Jr
/How-do-I-prove-the-flat-earth-theory/answer/Leslie-Harrington-4
/How-do-I-prove-the-flat-earth-theory/answer/Eddie-Olsson
/How-do-I-prove-the-flat-earth-theory/answer/Vincent-Emery
/How-do-I-prove-the-flat-earth-theory/answer/Maxwell-Perry-3
/How-do-I-prove-the-flat-earth-theory/answer/Matthew-Granovsky
/How-do-I-prove-the-flat-earth-theory/answers/83259600
/How-do-I-prove-the-flat-earth-theory/answer/Benjamin-Dixon-17
/How-do-I-prove-the-flat-earth-theory/answer/John-Chambers-75
/How-do-I-prove-the-flat-earth-theory/answer/Ryne-Hanz
/How-do-I-prove-the-flat-earth-theory/answer/Eric-Rodriguez-137
/How-do-I-prove-the-flat-earth-theory/answer/Robert-Hopkins-90
/How-do-I-prove-the-flat-earth-theory/answer/Sasha-Maddah
/How-do-I-prove-the-flat-earth-theory/answer/Owen-Lee-126
/How-do-I-prove-the-flat-earth-theory/answer/David-Phillips-133
/How-do-I-prove-the-flat-earth-theory/answer/Hasan-Poonawala-1
/How-do-I-prove-the-flat-earth-theory/answer/Cristiano-Dal-Vi
/How-do-I-prove-the-flat-earth-theory/answer/Rex-Newborn
/How-do-I-prove-the-flat-earth-theory/answer/John-Neumann-9
/How-do-I-prove-the-flat-earth-theory/answer/Josh-D-Davis
/How-do-I-prove-the-flat-earth-theory/answer/Maruthi-Sreenath
/How-do-I-prove-the-flat-earth-theory/answer/Clint-Morgan-2
/How-do-I-prove-the-flat-earth-theory/answer/Nicholas-Volkmuth
/How-do-I-prove-the-flat-earth-theory/answer/Richard-Swim
/How-do-I-prove-the-flat-earth-theory/answers/143504277
/How-do-I-prove-the-flat-earth-theory/answer/Christer-Svanström
/How-do-I-prove-the-flat-earth-theory/answer/Steve-Schlackman-2
/How-do-I-prove-the-flat-earth-theory/answers/147597845
/How-do-I-prove-the-flat-earth-theory/answer/Rene-Dukundane-Felix
/How-do-I-prove-the-flat-earth-theory/answers/148753762
/How-do-I-prove-the-flat-earth-theory/answer/Henk-Schuring
/How-do-I-prove-the-flat-earth-theory/answers/135814117
/How-do-I-prove-the-flat-earth-theory/answer/Emilio-Trampuz
/How-do-I-prove-the-flat-earth-theory/answers/40529643
/How-do-I-prove-the-flat-earth-theory/answer/Karl-Sangree
/How-do-I-prove-the-flat-earth-theory/answer/Ted-Carriker
/How-do-I-prove-the-flat-earth-theory/answer/egi-syahban
/How-do-I-prove-the-flat-earth-theory/answer/Mayank-Dahiya-12
/How-do-I-prove-the-flat-earth-theory/answer/Robert-Jones-741
/How-do-I-prove-the-flat-earth-theory/answer/Jimmi-Carlsson-1
/How-do-I-prove-the-flat-earth-theory/answer/Cole-Johnson-24
/How-do-I-prove-the-flat-earth-theory/answer/Kram-Redarsh
/How-do-I-prove-the-flat-earth-theory/answers/64915389
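Note that the scrolling loop above stops at a hard-coded page height (51000), which is specific to this question. A more portable stopping condition, as a sketch reusing driver and time from the snippet above, is to scroll until the height stops changing:
# Sketch: scroll until document.body.scrollHeight stops growing.
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give lazy-loaded answers time to render
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break  # no new content loaded: we reached the bottom
    last_height = new_height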

HTML parser pulling from previous webpage

I have a script that loads a page and saves a bunch of data-ids from multiple containers. I then want to open new URLs, appending those data-ids onto the end of the URLs. For each URL I want to locate all the hrefs, compare them to a list of specific links, and if any of them match, save that link and a few other details to a table.
I have managed to get it to open the URL with the appended data-id, but when I try to search for elements on the new page, it either pulls them from the first URL that was parsed (if I try to findAll from soup again), or I constantly get this error when I try to run another html.parser:
ResultSet object has no attribute 'findAll'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?
Is it not possible to run another parser or am I just doing something wrong?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
from selenium.webdriver.common.action_chains import ActionChains
import time

url = "http://csgo.exchange/id/76561197999004010#x"
driver = webdriver.Firefox()
driver.get(url)
time.sleep(15)
html = driver.page_source
soup = soup(html, "html.parser")
containers = soup.findAll("div", {"class": "vItem"})
print(len(containers))

data_ids = []  # Make a list to hold the data-id's
for container in containers:
    test = container.attrs["data-id"]
    data_ids.append(test)  # add data-id's to the list
    print(str(test))

for id in data_ids:
    url2 = "http://csgo.exchange/item/" + id
    driver.get(url2)
    time.sleep(2)
    soup2 = soup(html, "html.parser")
    containers2 = soup2.findAll("div", {"class": "bar"})
    print(str(containers2))

with open('scraped.txt', 'w', encoding="utf-8") as file:
    for id in data_ids:
        file.write(str(id) + '\n')  # write every data-id to a new line
Not sure exactly what you want from each page, but you should add waits. Here I wait for the hrefs in the flow-history section of each page (if present); it should illustrate the idea.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = 'http://csgo.exchange/id/76561197999004010'
driver = webdriver.Chrome()
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
baseURL = 'http://csgo.exchange/item/'
for id in ids:
    url = baseURL + id
    driver.get(url)
    try:
        flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
        results.append([id, flowHistory])
    except:
        print(url)
A second version, which blocks images to load pages faster and keeps only pages whose flow history matches a profile of interest:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = 'http://csgo.exchange/id/76561197999004010'
profile = webdriver.FirefoxProfile()
profile.set_preference("permissions.default.image", 2)  # Block all images to load websites faster.
driver = webdriver.Firefox(firefox_profile=profile)
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
baseURL = 'http://csgo.exchange/item/'
for id in ids:
    url = baseURL + id
    driver.get(url)
    try:
        pros = ['http://csgo.exchange/profiles/76561198149324950']
        flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
        if flowHistory in pros:
            results.append([url, flowHistory])
            print(results)
    except:
        print()
And a third version that handles multiple start URLs and checks whether any of the profile links of interest appear in the flow history:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

urls = ['http://csgo.exchange/id/76561197999004010']
profile = webdriver.FirefoxProfile()
profile.set_preference("permissions.default.image", 2)  # Block all images to load websites faster.
driver = webdriver.Firefox(firefox_profile=profile)

for url in urls:
    driver.get(url)
    ids = [item.get_attribute('data-id') for item in WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
    results = []
    pros = ['http://csgo.exchange/profiles/76561198149324950', 'http://csgo.exchange/profiles/76561198152970370']
    baseURL = 'http://csgo.exchange/item/'
    for id in ids:
        url = baseURL + id
        driver.get(url)
        try:
            flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver, 2).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
            match = []
            for string in pros:
                if string in flowHistory:
                    match = string
                    break
            if match:
                results.append([url, match])
                print(results)
        except:
            print()
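The question also mentions saving the matched link and a few other details to a table. As a hedged sketch (the filename and column names are assumptions), the results list built above maps naturally onto a CSV:
# Sketch: persist the matches collected above. Header names are illustrative;
# adjust them to whatever details you actually collect.
import csv

with open('matches.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['item_url', 'matched_profile'])
    writer.writerows(results)  # each entry is [url, match] from the loop above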

Parsing a site where URL doesn't change with Selenium Python

I'm trying to scrape this site; its URL doesn't change when the next page is clicked. So I used Selenium to click on the next page, but that doesn't help, as my driver keeps getting the old page even after the next page is clicked. Is there any other way to get to the next page and scrape it?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

driver = webdriver.Safari()
store_pages = []

# 10306 is the total number of pages.
for i in range(10306):
    Starting_url = 'site'
    driver.get(Starting_url)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    print(store_pages.append(i))

    timeout = 20
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_lblDisclaimerMsg']")))
    except TimeoutException:
        print("Timed out waiting for page to load")
        driver.quit()

    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    timeout = 20
    wait = WebDriverWait(driver, 10).until(EC.text_to_be_present_in_element_value((By.CSS_SELECTOR, '#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a > div.act_search_results > div.act_search_header'), "206113 Record(s) | Page [2 of 10306]"))

    NGO_element = driver.find_element_by_class_name("faq-sub-content exempted-result")
    NGO_name = NGO_element.find_elements_by_tag_name("h1")
    NGO_name_pancard = driver.find_elements_by_class_name("pan-id")
    NGO_data = NGO_element.find_elements_by_tag_name("ul")
    NGO_sub_data = NGO_element.find_elements_by_tag_name("li")

    for i, p, t in zip(NGO_name, NGO_name_pancard, NGO_data):
        n_name = i.text.replace(p.text, '')
        n_data = t.text
        n_pan = p.text
        print("Name of NGO:", n_name, "Fields of NGO:", n_data, "Pancard number:", n_pan)

    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    #timeout = 2
You need to make sure that when you reach the next page, the content of the earlier page has become stale; otherwise, you will get a stale-element error or keep getting the same content repeatedly. Try the approach below; it should get you there. The rest you can modify yourself.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("http://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")

while True:
    for elem in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^='arrowex']"))):
        print(elem.text)
    try:
        wait.until(EC.presence_of_element_located((By.ID, "ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_imgbtnNext"))).click()
        wait.until(EC.staleness_of(elem))
    except:
        break
driver.quit()
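The key design choice here is EC.staleness_of(elem): after clicking Next, the loop blocks until the last element from the old page is detached from the DOM, which guarantees the next iteration reads the new page. If you want to keep the scraped records rather than just print them, a minimal sketch reusing driver, wait, By, and EC from above (the output filename is an assumption):
# Sketch: collect each page's records into a list, then save once at the end.
records = []
while True:
    for elem in wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "[id^='arrowex']"))):
        records.append(elem.text)
    try:
        wait.until(EC.presence_of_element_located(
            (By.ID, "ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_imgbtnNext"))).click()
        wait.until(EC.staleness_of(elem))  # wait for the old page to go stale
    except:
        break

with open('exempted_institutions.txt', 'w', encoding='utf-8') as f:  # hypothetical filename
    f.write('\n'.join(records))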
