python - web scraping an ajax website using BeautifulSoup - python

I am trying to scrape e-commerce site that uses ajax call to load its next pages.
I am able to scrape data present on page 1 but page 2 loads automatically through ajax call when I scroll page 1 to bottom.
My code :
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq
# Page 1 is served as ordinary HTML, so it is fetched and parsed directly.
my_url = 'http://www.shopclues.com/mobiles-smartphones.html'
page = ureq(my_url).read()
page_soup = soup(page, "html.parser")
containers = page_soup.findAll("div", {"class": "column col3"})
for product in containers:
    name = product.h3.text
    price = product.find("span", {'class': 'p_price'}).text
    print("Name : {}".format(name.replace(",", " ")))
    print("Price : {}".format(price))

# Pages 2..6 are requested from the ajax endpoint; `start` is 36 * (page - 1).
for i in range(2, 7):
    my_url = (
        "http://www.shopclues.com/ajaxCall/moreProducts"
        "?catId=1431&filters=&pageType=c&brandName=&start="
        + str(36 * (i - 1))
        + "&columns=4&fl_cal=1&page="
        + str(i)
    )
    page = ureq(my_url).read()
    # Debug output: shows the raw bytes returned by the ajax endpoint.
    print(page)
    page_soup = soup(page, "html.parser")
    containers = page_soup.findAll("div", {"class": "column col3"})
    for product in containers:
        name = product.h3.text
        price = product.find("span", {'class': 'p_price'}).text
        print("Name : {}".format(name.replace(",", " ")))
        print("Price : {}".format(price))
I printed the ajax page read by ureq to check whether I am able to open it, and the output of print(page) is:
b' '
please provide me a solution to scrape the remaining data.

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as soup
from urllib2 import urlopen as ureq
import random
import time
# Chrome preferences passed to the browser session.
# NOTE(review): value 2 presumably blocks notification prompts - confirm.
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)
# A randomizer for the delay (a value between 5 and 10 seconds)
seconds = 5 + (random.random() * 5)
# create a new Chrome session; `options=` replaces the deprecated
# `chrome_options=` keyword
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.implicitly_wait(30)
    # driver.maximize_window()
    # navigate to the application home page
    driver.get("http://www.shopclues.com/mobiles-smartphones.html")
    time.sleep(seconds)
    time.sleep(seconds)
    # Add more to range for more phones
    for i in range(1):
        # find_element(By.ID, ...) works on Selenium 3 and 4; the
        # find_element_by_id shortcut was removed in Selenium 4.
        element = driver.find_element(By.ID, "moreProduct")
        # Click through JavaScript so the click works even if the button
        # is not currently in view.
        driver.execute_script("arguments[0].click();", element)
        time.sleep(seconds)
    time.sleep(seconds)
    # Snapshot of the DOM after the extra products were loaded
    html = driver.page_source
finally:
    # Always close the browser, even if a lookup or the page load failed.
    driver.quit()

page_soup = soup(html, "html.parser")
containers = page_soup.findAll("div", {"class": "column col3"})
for container in containers:
    # Containers without an <h3> or a price <span> are skipped on purpose.
    try:
        name = container.h3.text
        price = container.find("span", {'class': 'p_price'}).text
    except AttributeError:
        continue
    print("Name : " + name.replace(",", " "))
    print("Price : " + price)
I used selenium to load the website and click the button to load more results. Then take the resulting html and put in your code.

Related

Python web crawling result is less than expected

I tried using Selenium to crawl the web data. In the browser, all 346 products are loaded after clicking the "load more" button a few times; however, my script only shows 96 of the 346 products instead of all 346. Any idea how to fix it? I have already put the crawling code right after the `while True` loop that clicks the "load more" button.
screen capture of the result
from urllib.request import urlopen
import requests
import ast
from selenium import webdriver
driver=webdriver.Chrome('e:/Users/fungc1/Documents/chromedriver.exe')
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
options=Options()
from bs4 import BeautifulSoup
url = "https://www.toysrus.com.sg/lego"
#data = soup.findAll('div',attrs={'class':'card-image-wrapper'})
#toc = soup.find_all('div',attrs={'class':'result-count text-center'})
driver.get(url)
driver.maximize_window()
time.sleep(5)
driver.find_element_by_link_text("STAY ON THE SINGAPORE SITE").click()

# Keep scrolling to the bottom and clicking the "load more" button until the
# button can no longer be clicked (any exception ends the loop).
while True:
    try:
        driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
        load_more = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button.btn[data-url*='www.toysrus.com']"))
        )
        load_more.click()
        time.sleep(5)
    except Exception as e:
        print(e)
        break
time.sleep(5)

# The page is downloaded again here with requests and then with urlopen;
# neither call reads the DOM held by the Selenium `driver`.
response = requests.get(url)
response_text = response.text
soup = BeautifulSoup(response_text, 'lxml')
text = urlopen(url).read()
soup = BeautifulSoup(text)
data = soup.findAll('div', attrs={'class': 'card-image-wrapper'})
toc = soup.find_all('div', attrs={'class': 'result-count text-center'})
emptylist2 = []
# Print each result-count text without its last character.
for counter in toc:
    print(counter.text.strip()[:-1])
# Collect the value parsed from the tail of every product link's href.
for card in data:
    for anchor in card.findAll('a'):
        href_tail = anchor['href'][1:-5][-7:]
        emptylist2.append(ast.literal_eval("" + href_tail))
print(emptylist2)
You are mixing few things.
You opened the browser with Selenium and loaded all the items by clicking the load button. But after that you use the requests library to request the HTML again from the URL, which has nothing to do with Selenium. So you are doing two separate things. In your case, even if you remove the Selenium code you will get the same result, because you are not using what Selenium loaded after clicking through all the products.
Now, what you need to do is to ask the Selenium to return the html code of all 396 products so that you can give it to BeautifulSoup for further parsing.
To do that, you don't need first 4 lines after your while loop ends. Do something like this:
# Take the HTML from the Selenium-driven browser instead of requesting the URL again.
html = driver.page_source # returns the HTML of the page as currently loaded in the browser
# Hand that HTML to BeautifulSoup for parsing.
soup = BeautifulSoup(html, 'lxml')
With this you will get all 396 products.

WebScraping Aliexpress - Lazyloading

I am trying to web scrape Aliexpress using Selenium and Python. I'm doing it by following a youtube tutorial, I have followed every steps but I just can't seem to get it to work.
I tried to use requests, BeautifulSoup as well. But it seems like Aliexpress uses lazy loaders on their product listings. I tried using the window scroll script but that didn't work. It seems like the content would not load until I personally scroll on it.
This is the url for the page I would like to web scrape
https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=dog+supplies&ltype=wholesale&SortType=default&g=n
This is the code I have currently. It doesn't return anything in the output. I think that's because it's trying to go through all the product listings but it couldn't find any because it's not loaded...
Any suggestions/help would be greatly appreciated, sorry for the bad formatting and the bad code in advance.
Thank you!
"""
To do
HOT PRODUCT FINDER Enter: Keyword, to generate a url
Product Name
Product Image
Product Link
Sales Number
Price
Create an excel file that contains these data
Sort the list by top selling orders
Develop an algorithm for the velocity of the product (total sales increased / time?)
Scrape site every day """
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import requests
import lxml
# Start up the Chrome web driver session
driver = webdriver.Chrome()
# Read the search keywords from the user
search_term = input('Keywords: ')
# url generator
def get_url(search_term):
    """Generate the AliExpress search URL for the given search term.

    The term is URL-encoded before being placed in the ``SearchText`` query
    parameter: spaces become ``+`` (as before) and characters that would
    otherwise break the query string, such as ``&``, ``#`` or ``%``, are
    percent-encoded.
    """
    from urllib.parse import quote_plus

    url_template = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText={}&ltype=wholesale&SortType=default&g=n'
    return url_template.format(quote_plus(search_term))
# Build the URL from the keywords the user entered. Passing the variable
# (not the literal string 'search_term') makes the search use that input.
url = get_url(search_term)
driver.get(url)
#scrolling down to the end of the page
time.sleep(2)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
#Extracting the Collection
# NOTE(review): requests.get() downloads `url` again on its own; it does not
# read the page that `driver` has loaded and scrolled.
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
productlist = soup.find_all('div', class_='list product-card')
print(productlist)
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import requests
import lxml
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Command-line switches handed to Chrome, added in this order.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in (
    "start-maximized",
    "disable-infobars",
    "--disable-extensions",
    "--disable-gpu",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    '--disable-blink-features=AutomationControlled',
):
    chrome_options.add_argument(chrome_flag)
# Start Chrome using the chromedriver binary named 'chromedriver.exe'.
driver = webdriver.Chrome(executable_path='chromedriver.exe', options=chrome_options)
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
# grab Keywords
search_term = input('Keywords: ')
# Run the search through the site's own search box instead of building a URL.
driver.get('https://www.aliexpress.com')
driver.implicitly_wait(10)
# find_element(By.X, ...) is used throughout: the find_element_by_* shortcuts
# were removed in Selenium 4, while this form works on Selenium 3 and 4.
p = driver.find_element(By.NAME, 'SearchText')
p.send_keys(search_term)
p.send_keys(Keys.ENTER)
productlist = []
# XPath attribute tests are written with '@' ([@id="root"]); '#' is not
# valid XPath syntax.
list_xpath = '//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]/ul'
product = driver.find_element(By.XPATH, list_xpath)
# Scroll down in 100-pixel steps, pausing after each step, so that the
# lazily loaded product cards get a chance to render.
height = driver.execute_script("return document.body.scrollHeight")
for scrol in range(100, height - 1800, 100):
    driver.execute_script(f"window.scrollTo(0,{scrol})")
    time.sleep(0.5)
# driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
# Stage 1: the first 15 <div> children of the result list (1-based index).
div = [
    product.find_element(By.XPATH, f'{list_xpath}/div[{z}]')
    for z in range(1, 16)
]
# Stage 2: every 'list-item' element inside each of those divs.
list_i = [pr.find_elements(By.CLASS_NAME, 'list-item') for pr in div]
# Stage 3: the title wrapper of each item.
item_title = [
    item.find_element(By.CLASS_NAME, 'item-title-wrap')
    for pc in list_i
    for item in pc
]
# Stage 4: the link inside each title wrapper.
a = [pt.find_element(By.TAG_NAME, 'a') for pt in item_title]
# Stage 5: the visible text of each link is the product title.
productlist.extend(prt.text for prt in a)

Not able to access website url using beautiful soup and python while web scraping

Link that I am scraping : https://www.indusind.com/in/en/personal/cards/credit-card.html
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json, requests, re, sys
from selenium import webdriver
import re
# Fetch the credit-card page with requests and parse the response body.
IndusInd_url = "https://www.indusind.com/in/en/personal/cards/credit-card.html"
html = requests.get(IndusInd_url)
soup = BeautifulSoup(html.content, 'lxml')
# Dump the whole parsed document.
print(soup)
# Print the text of every element matched by the card-title selector.
card_titles = soup.select("#display-product-cards .text-primary")
for title in card_titles:
    print(title.get_text())
Using the above code I am trying to scrape the titles of the cards, but unfortunately I am getting this output:
<html><body><p>This website is secured against online attacks. Your request was blocked due to suspicious behavior<br/>
<br/>
Client IP : 124.123.170.109<br/>
<br/>
Incident Time : 2021-02-24 06:28:10 UTC <br/>
<br/>
Incident ID : YDXx#m6g3nSFLvi5lGg4wgAAAf8<br/>
<br/>
If you feel it was a legitimate request, please contact the website owner for further investigation and remediation with a screenshot of this page.</p></body></html>
Is there any other alternative to follow to scrape the details.
Any help is highly appreciated ! ! !
Please check this.
FYI: Make sure you have the right driver (Firefox, Chrome, or whichever browser you use) in the version that matches your browser.
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
import time
url = 'https://www.indusind.com/in/en/personal/cards/credit-card.html'
# open the chrome driver
driver = webdriver.Chrome(executable_path='webdrivers/chromedriver.exe')
try:
    # load the specified url in the browser
    driver.get(url)
    # sleep time to wait for t seconds to wait for page load
    # replace 3 with any int value (int value in seconds)
    time.sleep(3)
    # gets the page source
    pg = driver.page_source
finally:
    # close the browser even if loading the page failed
    driver.quit()
# parse with BeautifulSoup; the parser is named explicitly so the result does
# not depend on which optional parsers happen to be installed
soup = BeautifulSoup(pg, "html.parser")
# get the titles of the card
for x in soup.select("#display-product-cards .text-primary"):
    print(x.get_text())
Below is output image
Can be achieved without BeautifulSoup.
I define the locator with xpath with the value:
//div[@id='display-product-cards']//a[@class='card-title text-primary' and text()!='']
And utilize method .presence_of_all_elements_located.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome(executable_path='webdrivers/chromedriver.exe')
try:
    driver.get('https://www.indusind.com/in/en/personal/cards/credit-card.html')
    # Wait up to 20 seconds for the non-empty card-title links to be present.
    wait = WebDriverWait(driver, 20)
    # XPath attribute tests use '@' (@id, @class); '#' is not valid XPath.
    elements = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@id='display-product-cards']//a[@class='card-title text-primary' and text()!='']")))
    for element in elements:
        print(element.get_attribute('innerHTML'))
finally:
    # Close the browser even when the wait times out.
    driver.quit()

Web scraping BeautifulSoup find_all doesn't work on APEC

the source code of the page is as in the picture
I want to findall div class container-result but it doesn't work I get an empty list
my code :
`
from bs4 import BeautifulSoup, NavigableString, Tag
import requests
import urllib.request
# Base search URL; the page number is appended for each request.
url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi?page="
for page in range(10):
    r = requests.get(url + str(page))
    soup = BeautifulSoup(r.content, "html.parser")
    # Every <div class="container-result"> found in the downloaded HTML.
    ancher = soup.find_all('div', class_='container-result')
    print(ancher)
`
As the web page is rendered by javascript, requests / BeautifulSoup will not be able to retrieve DOM elements needed as they're added after some time whilst page is being rendered. You could try to use selenium for this purpose, here is an example:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
# delay (seconds) for selenium web driver wait
DELAY = 30
# create selenium driver
chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--headless')
#chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome('<<PATH TO chromedriver>>', options=chrome_options)
try:
    # iterate over pages
    for page in range(0, 10, 1):
        # open web page
        driver.get(f'https://www.apec.fr/candidat/recherche-emploi.html/emploi?page={page}')
        # wait for element with class 'container-result' to be added
        container_result = WebDriverWait(driver, DELAY).until(EC.presence_of_element_located((By.CLASS_NAME, "container-result")))
        # scroll to container-result
        driver.execute_script("arguments[0].scrollIntoView();", container_result)
        # get source HTML of the container-result element
        source = container_result.get_attribute('innerHTML')
        # print source
        print(source)
        # here you can continue work with the source variable either using selenium API or using BeautifulSoup API:
        # soup = BeautifulSoup(source, "html.parser")
finally:
    # quit webdriver, also when a page load or the wait fails
    driver.quit()

Scraping contents of multi web pages of a website using BeautifulSoup and Selenium

The website I want to scrap is :
http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061
I want to get the last page number from the above link so that I can proceed; it was 499 at the time I took the screenshot.
My code :
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# Firefox capabilities: enable marionette and set the 'binary' entry before starting the driver.
firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'
driver = webdriver.Firefox(capabilities=firefox_capabilities)
url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
driver.get(url)
# NOTE(review): `wait` is created but not used anywhere in this snippet.
wait = WebDriverWait(driver, 10)
# Parse the page source as currently held by the browser.
soup=BeautifulSoup(driver.page_source,"lxml")
containers = soup.findAll("ul",{"class":"pages table"})
# Replaces the first match with every <li> found in the whole document.
containers[0] = soup.findAll("li")
li_len = len(containers[0])
# NOTE(review): li_len counts every <li> in the document, so using it as an
# index into item.select("li") looks out of range - likely the error being asked about; confirm.
for item in soup.find("ul",{"class":"pages table"}) :
li_text = item.select("li")[li_len].text
print("li_text : {}\n".format(li_text))
driver.quit()
I need help to figure out the error in my code for getting the last page number. Also, I would be grateful if someone give the alternate solution for the same and suggest ways to achieve my intention.
If you want to get the last page number from the above link (which is 499), you can use either Selenium or BeautifulSoup as follows:
Selenium :
from selenium import webdriver
driver = webdriver.Firefox(executable_path=r'C:\Utility\BrowserDrivers\geckodriver.exe')
url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
try:
    driver.get(url)
    # XPath attribute tests use '@' (@class); '#' is not valid XPath.
    element = driver.find_element_by_xpath("//div[@class='row pagination']//p/span[contains(.,'Reviews on Reliance Jio')]")
    # Scroll the pagination row into view before reading the pager.
    driver.execute_script("return arguments[0].scrollIntoView(true);", element)
    # The link inside the last <li> of the pager holds the last page number.
    print(driver.find_element_by_xpath("//ul[@class='pagination table']/li/ul[@class='pages table']//li[last()]/a").get_attribute("innerHTML"))
finally:
    # Close the browser even if one of the lookups fails.
    driver.quit()
Console Output :
499
Beautifulsoup :
import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
# The context manager closes the HTTP response even if read() raises.
with uReq(url) as uClient:
    page_html = uClient.read()
page_soup = soup(page_html, "html.parser")
# The pager is the <ul class="pages table">; its last <li> is the last page.
container = page_soup.find("ul", {"class": "pages table"})
# find() returns None when the pager is missing; treat that as "no pages".
all_li = container.findAll("li") if container is not None else []
last_div = all_li[-1] if all_li else None
if last_div is not None:
    content = last_div.getText()
    print(content)
Console Output :
499

Categories