How to scrape Ajax webpage using python - python

I am learning Python scraping technique but I am stuck with the problem of scraping an Ajax page like this one.
I want to scrape all the medicines name and details coming in the page. Since I read most of the answer on the stack overflow but I am not getting the right data after scraping. I also tried to scrape using selenium or send a forge post request but it failed.
So please help me on this Ajax scraping topic specially this page because ajax is triggered on selecting an option from dropdown options.
Also please provide me with some resources for ajax page scraping.
//using selenium
from selenium import webdriver
import bs4 as bs
import lxml
import requests
path_to_chrome = '/home/brutal/Desktop/chromedriver'
browser = webdriver.Chrome(executable_path = path_to_chrome)
url = 'https://www.gianteagle.com/Pharmacy/Savings/4-10-Dollar-Drug-Program/Generic-Drug-Program/'
browser.get(url)
browser.find_element_by_xpath('//*[#id="ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList"]/option[contains(text(), "Ohio")]').click()
new_url = browser.current_url
r = requests.get(new_url)
print(r.content)

ChromeDriver you can download here
normalize-space is used in order to remove trash from web text, such as x0
from time import sleep
from selenium import webdriver
from lxml.html import fromstring
data = {}
driver = webdriver.Chrome('PATH TO YOUR DRIVER/chromedriver') # i.e '/home/superman/www/myproject/chromedriver'
driver.get('https://www.gianteagle.com/Pharmacy/Savings/4-10-Dollar-Drug-Program/Generic-Drug-Program/')
# Loop states
for i in range(2, 7):
dropdown_state = driver.find_element(by='id', value='ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList')
# open dropdown
dropdown_state.click()
# click state
driver.find_element_by_xpath('//*[#id="ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList"]/option['+str(i)+']').click()
# let download the page
sleep(3)
# prepare HTML
page_content = driver.page_source
tree = fromstring(page_content)
state = tree.xpath('//*[#id="ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_StateList"]/option['+str(i)+']/text()')[0]
data[state] = []
# Loop products inside the state
for line in tree.xpath('//*[#id="ctl00_RegionPage_RegionPageMainContent_RegionPageContent_userControl_gridSearchResults"]/tbody/tr[#style]'):
med_type = line.xpath('normalize-space(.//td[#class="medication-type"])')
generic_name = line.xpath('normalize-space(.//td[#class="generic-name"])')
brand_name = line.xpath('normalize-space(.//td[#class="brand-name hidden-xs"])')
strength = line.xpath('normalize-space(.//td[#class="strength"])')
form = line.xpath('normalize-space(.//td[#class="form"])')
qty_30_day = line.xpath('normalize-space(.//td[#class="30-qty"])')
price_30_day = line.xpath('normalize-space(.//td[#class="30-price"])')
qty_90_day = line.xpath('normalize-space(.//td[#class="90-qty hidden-xs"])')
price_90_day = line.xpath('normalize-space(.//td[#class="90-price hidden-xs"])')
data[state].append(dict(med_type=med_type,
generic_name=generic_name,
brand_name=brand_name,
strength=strength,
form=form,
qty_30_day=qty_30_day,
price_30_day=price_30_day,
qty_90_day=qty_90_day,
price_90_day=price_90_day))
print('data:', data)
driver.quit()

Related

Request to a multi-page address that changes pages without changing the url

I want to request this url:
https://www.codal.ir/CompanyList.aspx
This url contains tables on 110 pages that when the page is changed, neither the url nor the new request is changed.
this is my code:
import requests as req
req = req.Session()
isics = req.get("https://www.codal.ir/CompanyList.aspx")
print(isics.text)
but I only get the first page information.I intend to extract the required information from the tables by request and regex but if you have another way I will be happy to hear .Thanks for helping me get the whole pages.
I used Selenium to navigate in the table. You can't do that with requests, because we do not have links that redirect us to the new page in the table. You can find the code below.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
def get_company_links(links, driver):
soup = BeautifulSoup(driver.page_source, "html.parser")
rows = soup.select("table.companies-table tr")
for row in rows:
link = row.select_one("a")
if(link):
links.append("https://www.codal.ir/" + link['href'])
options = webdriver.ChromeOptions()
#options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
driver.get("https://www.codal.ir/CompanyList.aspx")
current_page_button = driver.find_element_by_css_selector('input[type="submit"].normal.selected')
page_number = int(current_page_button.get_attribute('value'))
while(True):
get_company_links(links, driver)
next_page_button = driver.find_element_by_css_selector('input#ctl00_ContentPlaceHolder1_ucPager1_btnNext')
next_page_button.click()
time.sleep(2)
previous_page_number = page_number
current_page_button = driver.find_element_by_css_selector('input[type="submit"].normal.selected')
page_number = int(current_page_button.get_attribute('value'))
if(previous_page_number == page_number):
break # no more page left
print(links)
Main working principle is navigating through the table and collecting links of the company websites. We use next button to navigate and stop when last page index is equal to the current index, which indicates that we arrived at the end of the table.

Can't locate data on page

I am attempting to pull the title and link of each so called raffle in the list on this website. However, when i try to scrape this data, it can't seem to be found.
I have tried scraping all links on the page, but I think these "boxes" may be loaded via javascript.
The results I am receiving are a few links unrelated to what I want to get. There should be 40+ links that show up in this list, but the majority are not showing. Any help would be great, been stuck on this for a while
For some reason, this link and many others aren't showing up when I am scraping:
my code:
def raffle_page_collection():
chrome_driver()
page = requests.get('https://www.soleretriever.com/yeezy-boost-350-v2-black/')
soup = BeautifulSoup(page.text,'html.parser')
product_header = soup.find('h1').text
product_colorway = soup.find('h2').text
product_sku_and_release_date_and_price = soup.find('h3').text
container = soup.find(class_='main-container')
raffles = container.find_all('a')
raffle_list = []
for items in raffles:
raffle_list.append(items.get('href'))
print(raffle_list)
You should try automation selenium library. it allows you to scrape dynamic rendering request(js or ajax) page data.
Try this:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
browser = webdriver.Chrome()
browser.get('https://www.soleretriever.com/yeezy-boost-350-v2-black/')
time.sleep(3)
soup = BeautifulSoup(browser.page_source,'html.parser')
product_header = soup.find('h1').text
product_colorway = soup.find('h2').text
product_sku_and_release_date_and_price = soup.find('h3').text
container = soup.find(class_='main-container')
raffles = container.find("div",{"class":"vc_pageable-slide-wrapper vc_clearfix"})
raffle_list = []
for items in raffles.find_all("a",href=True):
raffle_list.append(items.get('href'))
print(product_header)
print(product_colorway)
print(product_sku_and_release_date_and_price)
print(raffle_list)
O/P:
Yeezy Boost 350 v2 Black
Black/ Black/ Black
FU9006 | 07/06/19 | $220
['https://www.43einhalb.com/en/adidas-yeezy-boost-350-v2-black-328672#SR', 'https://www.adidas.co.uk/yeezy#SR', 'https://www.allikestore.com/default/adidas-yeezy-boost-350-v2-static-black-black-fu9006-105308.html#SR', 'https://archive1820.com/en/forms/6/yeezy-raffle#SR', 'https://drops.blackboxstore.com/blackbox_launches_view/catalog/product/view/id/22296/s/yeezy-boost-350-v2#SR', 'https://woobox.com/4szm9v#SR', 'https://launches.endclothing.com/product/yeezy-boost-350-v2-fu9006#SR', 'https://www.instagram.com/p/ByEIHSHDSY6/', 'https://www.instagram.com/p/ByFG1G0lWf7/', 'https://releases.footshop.com/adidas-yeezy-boost-350-v2-agqn6WoBJZ9y4RSnzw9G#SR', 'https://launches.goodhoodstore.com/launches/yeezy-boost-350-v2-black-33#SR', 'https://www.hervia.com/launches/yeezy-350#SR', 'https://www.hibbett.com/adidas-yeezy-350-v2-black-mens-shoe/M0991.html#SR', 'https://reporting.jdsports.co.uk/cgi-bin/msite?yeezy_comp+a+0+0+0+0+0&utm_source=RedEye&utm_medium=Email&utm_campaign=Yeezy%20Boost%20351%20Clay&utm_content=0905%20Yeezy%20Clay#SR', 'https://www.instagram.com/p/ByDnK6uH6kE/', 'https://www.nakedcph.com/yeezy-boost-v2-350-static-black-raffle/s/635#SR', 'https://www.instagram.com/p/ByIXT8zHvYz/', 'https://launches.sevenstore.com/launch/yeezy-boost-350-v2-black-4033024#SR', 'https://shelta.eu/news/adidas-yeezy-boost-350-v2-black-fu9006x#SR', 'https://www.instagram.com/p/ByDI_6JAfty/', 'https://www.sneakersnstuff.com/en/product/38889/adidas-yeezy-350-v2#SR', 'https://www.instagram.com/p/ByHtt3HFkE0/', 'https://www.instagram.com/p/ByCaKR7Cde1/', 'https://tres-bien.com/adidas-yeezy-boost-350-v2-black-fu9006-fw19#SR', 'https://yeezysupply.com/products/yeezy-boost-350-v2-black-june-7-2019#SR']
for chrome browser:
http://chromedriver.chromium.org/downloads
Install web driver for chrome browser:
https://christopher.su/2015/selenium-chromedriver-ubuntu/
selenium tutorial
https://selenium-python.readthedocs.io/

Web scraping when scrolling down is needed

I want to scrape, e.g., the title of the first 200 questions under the web page https://www.quora.com/topic/Stack-Overflow-4/all_questions. And I tried the following code:
import requests
from bs4 import BeautifulSoup
url = "https://www.quora.com/topic/Stack-Overflow-4/all_questions"
print("url")
print(url)
r = requests.get(url) # HTTP request
print("r")
print(r)
html_doc = r.text # Extracts the html
print("html_doc")
print(html_doc)
soup = BeautifulSoup(html_doc, 'lxml') # Create a BeautifulSoup object
print("soup")
print(soup)
It gave me a text https://pastebin.com/9dSPzAyX. If we search href='/, we can see that the html does contain title of some questions. However, the problem is that the number is not enough; actually on the web page, a user needs to manually scroll down to trigger extra load.
Does anyone know how I could mimic "scrolling down" by the program to load more content of the page?
Infinite scrolls on a webpage is based on the Javascript functionality. Therefore, to find out what URL we need to access and what parameters to use, we need to either thoroughly study the JS code working inside the page or, and preferably, examine the requests that the browser does when you scroll down the page. We can study requests using the Developer Tools.
See example for quora
the more you scroll down, the more requests generated. so now your requests will be done to that url instead of normal url but keep in mind to send correct headers and playload.
other easier solution will be by using selenium
Couldn't find a response using request. But you can use Selenium. First printed out the number of questions at first load, then send the End key to mimic scrolling down. You can see number of questions went from 20 to 40 after sending the End key.
I used driver.implicitly wait for 5 seconds before loading the DOM again in case the script load to fast before the DOM was loaded. You can improve by using EC with selenium.
The page loads 20 questions per scroll. So if you are looking to scrape 100 questions, then you need to send the End key 5 times.
To use the code below you need to install chromedriver.
http://chromedriver.chromium.org/downloads
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
CHROMEDRIVER_PATH = ""
CHROME_PATH = ""
WINDOW_SIZE = "1920,1080"
chrome_options = Options()
# chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=%s" % WINDOW_SIZE)
chrome_options.binary_location = CHROME_PATH
prefs = {'profile.managed_default_content_settings.images':2}
chrome_options.add_experimental_option("prefs", prefs)
url = "https://www.quora.com/topic/Stack-Overflow-4/all_questions"
def scrape(url, times):
if not url.startswith('http'):
raise Exception('URLs need to start with "http"')
driver = webdriver.Chrome(
executable_path=CHROMEDRIVER_PATH,
chrome_options=chrome_options
)
driver.get(url)
counter = 1
while counter <= times:
q_list = driver.find_element_by_class_name('TopicAllQuestionsList')
questions = [x for x in q_list.find_elements_by_xpath('//div[#class="pagedlist_item"]')]
q_len = len(questions)
print(q_len)
html = driver.find_element_by_tag_name('html')
html.send_keys(Keys.END)
wait = WebDriverWait(driver, 5)
time.sleep(5)
questions2 = [x for x in q_list.find_elements_by_xpath('//div[#class="pagedlist_item"]')]
print(len(questions2))
counter += 1
driver.close()
if __name__ == '__main__':
scrape(url, 5)
I recommend using selenium rather than bs.
selenium can control browser and parsing. like scroll down, click button, etc…
this example is for scroll down for get all liker user in instagram.
https://stackoverflow.com/a/54882356/5611675
If the content only loads on "scrolling down", this probably means that the page is using Javascript to dynamically load the content.
You can try using a web client such as PhantomJS to load the page and execute the javascript in it, and simulate the scroll by injecting some JS such as document.body.scrollTop = sY; (Simulate scroll event using Javascript).

scrape website traffic from semrush using beautiful soup python

I'm trying to scrape website traffic from semrush.com.
my current code using BeautifulSoup is:
from bs4 import BeautifulSoup, BeautifulStoneSoup
import urllib
import json
req = urllib.request.Request('https://www.semrush.com/info/burton.com', headers={'User-Agent':'Magic Browser'})
response = urllib.request.urlopen(req)
raw_data = response.read()
response.close()
soup = BeautifulSoup(raw_data)
I've been trying data = soup.findAll("a", {"href":"/info/burton.com+(by+organic)"}) or data = soup.findAll("span", {"class":"sem-report-counter"}) without much luck.
I can see the numbers on the webpage that I would like to get. Is there a way to pull this information off? I'm not seeing it in the html I pull.
I went the extra mile and set up a working example of how you can use selenium to scrape that page. Install selenium and try it out!
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
url = 'https://www.semrush.com/info/burton.com' #your url
options = Options() #set up options
options.add_argument('--headless') #add --headless mode to options
driver = webdriver.Chrome(executable_path='/opt/ChromeDriver/chromedriver',
chrome_options=options)
#note: executable_path will depend on where your chromedriver.exe is located
driver.get(url) #get response
driver.implicitly_wait(1) #wait to load content
elements = driver.find_elements_by_xpath(xpath='//a[#href="/info/burton.com+(by+organic)"]') #grab that stuff you wanted?
for e in elements: print(e.get_attribute('text').strip()) #print text fields
driver.quit() #close the driver when you're done
Output that I see in my terminal:
356K
6.5K
59.3K
$usd305K
Organic keywords
Organic
Top Organic Keywords
View full report
Organic Position Distribution

Open a page programmatically in python

Can you extract the VIN number from this webpage?
I tried urllib2.build_opener, requests, and mechanize. I provided user-agent as well, but none of them could see the VIN.
opener = urllib2.build_opener()
opener.addheaders = [('User-agent',('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) ' 'AppleWebKit/535.1 (KHTML, like Gecko) ' 'Chrome/13.0.782.13 Safari/535.1'))]
page = opener.open(link)
soup = BeautifulSoup(page)
table = soup.find('dd', attrs = {'class': 'tip_vehicleStats'})
vin = table.contents[0]
print vin
That page has much of the information loaded and displayed with Javascript (probably through Ajax calls), most likely as a direct protection against scraping. To scrape this you therefore either need to use a browser that runs Javascript, and control it remotely, or write the scraper itself in javascript, or you need to deconstruct the site and figure out exactly what it loads with Javascript and how, and see if you can duplicate these calls.
You can use browser automation tools for the purpose.
For example this simple selenium script can do your work.
from selenium import webdriver
from bs4 import BeautifulSoup
link = "https://www.iaai.com/Vehicles/VehicleDetails.aspx?auctionID=14712591&itemID=15775059&RowNumber=0"
browser = webdriver.Firefox()
browser.get(link)
page = browser.page_source
soup = BeautifulSoup(page)
table = soup.find('dd', attrs = {'class': 'tip_vehicleStats'})
vin = table.contents.span.contents[0]
print vin
BTW, table.contents[0] prints the entire span, including the span tags.
table.contents.span.contents[0] prints only the VIN no.
You could use selenium, which calls a browser. This works for me :
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
import time
# See: http://stackoverflow.com/questions/20242794/open-a-page-programatically-in-python
browser = webdriver.Firefox() # Get local session of firefox
browser.get("https://www.iaai.com/Vehicles/VehicleDetails.aspx?auctionID=14712591&itemID=15775059&RowNumber=0") # Load page
time.sleep(0.5) # Let the page load
# Search for a tag "span" with an attribute "id" which contains "ctl00_ContentPlaceHolder1_VINc_VINLabel"
e=browser.find_element_by_xpath("//span[contains(#id,'ctl00_ContentPlaceHolder1_VINc_VINLabel')]")
e.text
# Works for me : u'4JGBF7BE9BA648275'
browser.close()
You do not have to use Selenium.
Just make an additional get request:
import requests
stock_number = '123456789' # located at VEHICLE INFORMATION
url = 'https://www.clearvin.com/ads/iaai/check?stockNumber={}&vin='.format(stock_number)
vin = requests.get(url).json()['car']['vin']

Categories