I'm using the webbrowser module so I can open an HTML page for a performance test I'm currently doing.
This small piece of code is the beginning of the automation. The goal of the function perf_measure is to return how long it took to load the page at url entirely.
import webbrowser

def perf_measure(url=""):
    try:
        webbrowser.open(url)
    except webbrowser.Error as e:
        print("It couldn't open the url:", url)

url = "www.google.com"
perf_measure(url)
How can I accomplish that? I just need the value, in seconds, like:
www.google.com Total time to load page in (secs): 2.641
Do you need to use the web browser? As in, do you need to view the result?
Otherwise you could do this:
import urllib2
from time import time
stream = urllib2.urlopen('http://www.rarlab.com/rar/winrar-x64-420.exe')
start_time = time()
output = stream.read()
end_time = time()
stream.close()
print(end_time-start_time)
If you want a more human-readable result you can use round.
print(round(end_time-start_time, 3))
Output
0.865000009537 # Without Round
0.865 # With Round
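If you are on Python 3, the same idea works with urllib.request (a sketch; urllib2 was split into urllib.request/urllib.error in Python 3):
import urllib.request
from time import time

stream = urllib.request.urlopen('http://www.rarlab.com/rar/winrar-x64-420.exe')
start_time = time()
output = stream.read()
end_time = time()
stream.close()
print(round(end_time - start_time, 3))  # download time in seconds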
A fancy way using a decorator
import time

def time_it(func):
    def wrapper(*args, **kw):
        t1 = time.time()
        res = func(*args, **kw)
        t2 = time.time()
        return (t2 - t1), res, func.__name__
    return wrapper

@time_it
def perf_measure(url=""):
    # whatever you want
    pass
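Hypothetical usage of the decorator above, timing a simple fetch (the URL and the fetch body are just placeholders for whatever perf_measure should do):
import urllib.request

@time_it
def fetch(url=""):
    # download the page body and return its size
    with urllib.request.urlopen(url) as stream:
        return len(stream.read())

elapsed, size, name = fetch("http://www.google.com")
print(name, "total time to load page in (secs):", round(elapsed, 3))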
If you want to time the page load (including all of the resources it loads, rendering time etc.) in a real browser you can use Selenium Webdriver. This will open your browser of choice, load the URL and then extract timings:
from selenium import webdriver

def time_url(driver, url):
    driver.get(url)
    # Use the browser Navigation Timing API to get some numbers:
    # https://developer.mozilla.org/en-US/docs/Web/API/Navigation_timing_API
    navigation_start = driver.execute_script(
        "return window.performance.timing.navigationStart")
    dom_complete = driver.execute_script(
        "return window.performance.timing.domComplete")
    total_time = dom_complete - navigation_start
    print(f"Time {total_time}ms")

driver = webdriver.Chrome()
try:
    url = "https://httpbin.org/delay/"
    time_url(driver, url + '1')
    time_url(driver, url + '2')
finally:
    driver.close()
There are many other metrics you can query if you want to know the render time separately from the loading time, etc.
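For example, a sketch of pulling the whole timing object in one call and splitting it into rough network and render portions (same driver as above; the field names come from the Navigation Timing API):
def timing_breakdown(driver):
    # the JS timing object is returned to Python as a dict
    t = driver.execute_script("return window.performance.timing")
    return {
        "network_ms": t["responseEnd"] - t["navigationStart"],  # DNS, connect, server, download
        "render_ms": t["domComplete"] - t["responseEnd"],        # parsing and rendering
        "total_ms": t["domComplete"] - t["navigationStart"],
    }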
Related
The Situation:
I recently started web scraping using Selenium and Scrapy, and I am working on a project where I have a CSV file containing 42 thousand zip codes. My job is to take each zip code, enter it on this site, and scrape all the results.
The Problem:
The problem is that to do this I have to keep clicking the 'load more' button until all the results have been displayed, and only once that has finished can I collect the data.
This may not be much of an issue, but it takes 2 minutes per zip code and I have 42,000 to get through.
The Code:
import scrapy
from numpy.lib.npyio import load
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException, ElementNotSelectableException, NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
from items import CareCreditItem
from datetime import datetime
import os
from scrapy.crawler import CrawlerProcess

global pin_code
pin_code = input("enter pin code")

class CareCredit1Spider(scrapy.Spider):
    name = 'care_credit_1'
    start_urls = ['https://www.carecredit.com/doctor-locator/results/Any-Profession/Any-Specialty//?Sort=D&Radius=75&Page=1']

    def start_requests(self):
        directory = os.getcwd()
        options = webdriver.ChromeOptions()
        options.headless = True
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        path = (directory + r"\\Chromedriver.exe")
        driver = webdriver.Chrome(path, options=options)

        # URL of the website
        url = "https://www.carecredit.com/doctor-locator/results/Any-Profession/Any-Specialty/" + pin_code + "/?Sort=D&Radius=75&Page=1"
        driver.maximize_window()

        # opening link in the browser
        driver.get(url)
        driver.implicitly_wait(200)

        try:
            cookies = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
            cookies.click()
        except:
            pass
        i = 0
        loadMoreButtonExists = True
        while loadMoreButtonExists:
            try:
                load_more = driver.find_element_by_xpath('//*[@id="next-page"]')
                load_more.click()
                driver.implicitly_wait(30)
            except ElementNotInteractableException:
                loadMoreButtonExists = False
            except ElementClickInterceptedException:
                pass
            except StaleElementReferenceException:
                pass
            except NoSuchElementException:
                loadMoreButtonExists = False

        try:
            previous_page = driver.find_element_by_xpath('//*[@id="previous-page"]')
            previous_page.click()
        except:
            pass

        name = driver.find_elements_by_class_name('dl-result-item')
        r = 1
        temp_list = []
        j = 0
        for element in name:
            link = element.find_element_by_tag_name('a')
            c = link.get_property('href')
            yield scrapy.Request(c)
    def parse(self, response):
        item = CareCreditItem()
        item['Practise_name'] = response.css('h1 ::text').get()
        item['address'] = response.css('.google-maps-external ::text').get()
        item['phone_no'] = response.css('.dl-detail-phone ::text').get()
        yield item

now = datetime.now()
dt_string = now.strftime("%d/%m/%Y")
dt = now.strftime("%H-%M-%S")
file_name = dt_string + "_" + dt + "zip-code" + pin_code + ".csv"

process = CrawlerProcess(settings={
    'FEED_URI': file_name,
    'FEED_FORMAT': 'csv'
})
process.crawl(CareCredit1Spider)
process.start()
print("CSV File is Ready")
items.py
import scrapy

class CareCreditItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Practise_name = scrapy.Field()
    address = scrapy.Field()
    phone_no = scrapy.Field()
The Question:
Essentially my question is simple: is there a way to optimize this code so it performs faster? Or what other methods could handle scraping this data without it taking forever?
Since the site loads the data dynamically from an API, you can retrieve the data directly from the API. This will speed things up quite a bit, but I'd still implement a wait to avoid hitting a rate limit.
import requests
import time
import pandas as pd

zipcode = '00704'
radius = 75

url = f'https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}&PracticePhone=&Profession=&location={zipcode}&Page=1'
req = requests.get(url)
r = req.json()
data = r['results']

for i in range(2, r['maxPage'] + 1):
    url = f'https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}&PracticePhone=&Profession=&location={zipcode}&Page={i}'
    req = requests.get(url)
    r = req.json()
    data.extend(r['results'])
    time.sleep(1)

df = pd.DataFrame(data)
# use "-" instead of "/" in the timestamp so it is a valid file name
df.to_csv(f'{pd.Timestamp.now().strftime("%d-%m-%Y_%H-%M-%S")}zip-code{zipcode}.csv')
There are multiple ways in which you can do this.
1. Creating a distributed system in which you run the spider across multiple machines so the work runs in parallel.
This, in my opinion, is the better option, as you can also create a scalable, dynamic solution that you will be able to reuse many times over.
There are many ways of doing this. Normally it consists of dividing the seed list (the zip codes) into many separate seed lists so that separate processes each work with their own seed list; the downloads then run in parallel, so on 2 machines it goes 2 times faster, on 10 machines 10 times faster, and so on.
In order to do this I might suggest looking into AWS, namely AWS Lambda, AWS EC2 Instances or even AWS Spot Instances; these are the ones I have worked with previously and they are not terribly hard to work with.
2. Alternatively, if you want to run it on a single machine, you can look into multithreading with Python, which can help you run the process in parallel on that one machine (see the sketch after this list).
3. This is another option, particularly if it is a one-off process. You can try running it simply with requests, which may speed it up, but with a massive number of seeds it is usually faster to develop a process that runs in parallel.
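As a minimal sketch of option 2, assuming you go the direct-API route from the previous answer, the per-zip-code work could be farmed out to a thread pool (fetch_zip is a hypothetical helper wrapping that requests loop; the worker count and sleep are just examples):
from concurrent.futures import ThreadPoolExecutor
import time
import requests

def fetch_zip(zipcode, radius=75):
    # hypothetical helper: same paginated API loop as in the previous answer
    def page_url(page):
        return (f'https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService'
                f'&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}'
                f'&PracticePhone=&Profession=&location={zipcode}&Page={page}')
    r = requests.get(page_url(1)).json()
    data = r['results']
    for page in range(2, r['maxPage'] + 1):
        data.extend(requests.get(page_url(page)).json()['results'])
        time.sleep(1)  # stay polite to the API
    return zipcode, data

zip_codes = ['00704', '00705', '00706']  # in practice, read the 42,000 codes from your CSV
with ThreadPoolExecutor(max_workers=8) as pool:  # 8 workers is an arbitrary example
    for zipcode, rows in pool.map(fetch_zip, zip_codes):
        print(zipcode, len(rows), 'results')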
(Reposted because the original post did not get the desired result.)
I have been looking for a solution to my problem for several days.
I have a Python script which iteratively scrapes a dynamic website with Selenium using geckodriver.
For about 2 hours it manages to collect all of the data I ask it to recover, and at the end of those 2 hours it begins to slow down and eventually crashes.
The crash is caused by Firefox's memory usage: the longer the script scrapes, the more RAM Firefox occupies.
I scoured the net and found various solutions, none of which worked.
If you can help me find a solution that lets me scrape for at least 24 hours, that would be cool of you.
A bit of code
# imports were not shown in the original snippet; these are the ones the code appears to need
import time
import os
import datetime
import joblib
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

binary = FirefoxBinary('/opt/firefox/firefox')
start_time = time.time()
options = Options()
options.add_argument("--headless")
firefox_profile = webdriver.FirefoxProfile()
firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
driver = webdriver.Firefox(executable_path="/usr/bin/geckodriver", options=options, firefox_binary=binary, firefox_profile=firefox_profile)
driver.get("https:********************************")
time.sleep(5)
print("WebSite OPENED ready to connect")

def auth(t_end):
    print("entering in auth function")
    login_button = driver.find_element_by_xpath("******************").click()
    time.sleep(5)
    username = driver.find_element_by_xpath("******************")
    username.clear()
    username.send_keys("*****")
    password = driver.find_element_by_xpath("*****************")
    password.clear()
    password.send_keys("*****")
    driver.find_element_by_xpath("**************").click()
    time.sleep(5)
    print("Connected ready for Scraping")

    def scraping(i, t_end):
        print("entering in scraping function")
        os.system("free -h && sysctl vm.drop_caches=3 && free -h")
        maps = driver.find_elements_by_class_name("************")
        t_end = t_end * 3600
        t_end = time.time() + t_end
        l_a = []
        dit_l_a = {}
        time.sleep(5)
        while time.time() < t_end:
            dict_tempo = {}
            total_a = driver.find_element_by_xpath("***************").text
            if total_a == '0.00':
                time.sleep(10.5)
            final_a = driver.find_element_by_xpath("**********").text
            l_a.append(final_a)
            history_a1 = driver.find_element_by_xpath('******').text
            scrape = driver.find_element_by_xpath('*******').text
            while scrape == history_a1:
                scrape = driver.find_element_by_xpath('****').text
            scrape = scrape.split('\n')
            dict_tempo["Final a"] = final_a
            dict_tempo["List Of All a"] = scrape
            now = datetime.datetime.now()
            now = now.strftime("%d/%m/%Y %H:%M:%S")
            dict_tempo["Date"] = now
            dit_l_a[scrape[0]] = dict_tempo
        return dit_l_a

    for i in range(168):
        print("Iteration : ", i)
        try:
            returned_dict = scraping(i, t_end)
            joblib.dump(returned_dict, './returned_dict_' + str(i))
        except Exception as e:
            print(e)
            pass
    return returned_dict

if __name__ == '__main__':
    returned_dict = auth(2)
Environment: VPS 4GB RAM - CentOS 8 - Python 3.8 - Firefox 84 Beta - GeckoDriver 0.28.0 - Headless scraping
I can see you are using a Firefox profile, so one thing you can do is configure that profile to use less memory by following the recommended settings from Mozilla, and then reuse it:
https://support.mozilla.org/en-US/kb/firefox-uses-too-much-memory-or-cpu-resources
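For example, a few memory-related preferences could be set on the profile you already create (a sketch; these are standard Firefox preference names, but tune the values to your machine):
firefox_profile.set_preference("browser.cache.disk.enable", False)             # skip the disk cache
firefox_profile.set_preference("browser.cache.memory.enable", False)           # skip the in-memory cache
firefox_profile.set_preference("browser.sessionhistory.max_entries", 5)        # keep less history per tab
firefox_profile.set_preference("browser.sessionhistory.max_total_viewers", 0)  # don't cache rendered pages for back/forward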
Additionally
In the script, try adding an implicit wait of 3 or 4 seconds; this makes every command wait up to that long for elements that are not yet found or loaded, slowing the scraping process down.
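For example (using the driver created above; 4 seconds is just the suggested value):
driver.implicitly_wait(4)  # every find_element_* call now polls for up to 4 seconds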
Also add a sleep at the top of the while and for loops:
while time.time() < t_end:
    time.sleep(5)
Also
for i in range(168):
    time.sleep(5)
This encourages Firefox to keep data on disk rather than in RAM. If you don't sleep, the Selenium actions hit the browser faster than Firefox can handle, and Firefox uses more RAM to keep up with the demand, since read/write operations are faster from RAM than from disk.
In addition, add driver.close(). This alone mostly won't remove the browser session; if it doesn't help, try opening the URL in a new tab and closing the previous tab every 100 iterations or so. This will also free up memory.
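A minimal sketch of that tab-recycling idea, assuming the loop counter i and the same driver as above (target_url stands for the site URL redacted in the question):
if i > 0 and i % 100 == 0:
    driver.execute_script("window.open('about:blank', '_blank');")  # open a fresh tab
    new_tab = driver.window_handles[-1]
    driver.close()                       # close the current, memory-heavy tab
    driver.switch_to.window(new_tab)     # carry on in the fresh tab
    driver.get(target_url)               # target_url: the (redacted) site URL from the question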
You can also add a sleep before each action by using listeners:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.events import EventFiringWebDriver, AbstractEventListener
import time

class MyListener(AbstractEventListener):
    def before_navigate_to(self, url, driver):
        time.sleep(5)
        print("Before navigate to %s" % url)

    def before_find(self, by, value, driver):
        # before_find is the hook EventFiringWebDriver actually calls before find_element_* lookups
        for i in range(10):
            time.sleep(5)
            print("helloooo")
        print("Before finding %s" % value)

    def after_navigate_to(self, url, driver):
        for i in range(10):
            time.sleep(5)
            print("hi")
        print("After navigate to %s" % url)

tempdriver = webdriver.Chrome(ChromeDriverManager().install())
driver = EventFiringWebDriver(tempdriver, MyListener())
driver.get("http://www.google.co.in/")
driver.find_element_by_id("hi")
Read : 7.37. Event Firing WebDriver Support
at : https://selenium-python.readthedocs.io/api.html
The above example is only for the driver; add the same kind of hooks for WebElement actions if you need them. But in your case a time.sleep() within the loop would be more than enough.
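For completeness, a sketch of an element-level hook on the same listener (before_click is one of the standard AbstractEventListener methods; the 2-second delay is arbitrary):
class MySlowClickListener(MyListener):
    def before_click(self, element, driver):
        time.sleep(2)  # pause before every click goes through
        print("About to click:", element.tag_name)
To use it, pass MySlowClickListener() to EventFiringWebDriver instead of MyListener().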
How do I use driver.get to open several URLs in Chrome?
My code:
import requests
import json
import pandas as pd
from selenium import webdriver

chromeOptions = webdriver.ChromeOptions()
chromedriver = r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe", chrome_options=chromeOptions)

links = []
request1 = requests.get('https://api.beta.tab.com.au/v1/recommendation-service/featured-events?jurisdiction=NSW')
json1 = request1.json()
for n in json1['nextToGoRaces']:
    if n['meeting']['location'] in ['VIC', 'NSW', 'QLD', 'SA', 'WA', 'TAS', 'IRL']:
        links.append(n['_links']['self'])

driver.get('links')
Based on the comments - you'll want a class to manage your browsers, a class for your tests, then a runner to run in parallel.
Try this:
import unittest
import time
import testtools
from selenium import webdriver

class BrowserManager:
    browsers = []

    def createBrowser(self, url):
        browser = webdriver.Chrome()
        browser.get(url)
        self.browsers.append(browser)

    def getBrowserByPartialURL(self, url):
        for browser in self.browsers:
            if url in browser.current_url:
                return browser

    def CloseItAllDown(self):
        for browser in self.browsers:
            browser.close()

class UnitTest1(unittest.TestCase):
    def test_DoStuffOnGoogle(self):
        browser = b.getBrowserByPartialURL("google")
        # Point of this is to watch the output! You'll see this + the other test intermingled (proves parallel run)
        for i in range(10):
            print(browser.current_url)
            time.sleep(1)

    def test_DoStuffOnYahoo(self):
        browser = b.getBrowserByPartialURL("yahoo")
        # Point of this is to watch the output! You'll see this + the other test intermingled (proves parallel run)
        for i in range(10):
            print(browser.current_url)
            time.sleep(1)

# create a global variable for the browsers
b = BrowserManager()

# To run the tests
if __name__ == "__main__":
    # move to an init to create your browsers
    b.createBrowser("https://www.google.com")
    b.createBrowser("https://www.yahoo.com")
    time.sleep(5)  # This is so you can see both open at the same time
    suite = unittest.TestLoader().loadTestsFromTestCase(UnitTest1)
    concurrent_suite = testtools.ConcurrentStreamTestSuite(lambda: ((case, None) for case in suite))
    concurrent_suite.run(testtools.StreamResult())
This code doesn't do anything exciting - it's an example of how to manage multiple browsers and run tests in parallel. It goes to the specified urls (which you should move to an init/setup), then prints out the URL it's on 10 times.
This is how you add a browser to the manager: b.createBrowser("https://www.google.com")
This is how you retrieve your browser: browser = b.getBrowserByPartialURL("google") - note it's a partial URL so you can use the domain as a keyword.
This is the output (just the first few lines, not all of it). It prints the URL for Google, then Yahoo, then Google, then Yahoo, showing that they're running at the same time:
PS C:\Git\PythonSelenium\BrowserManager> cd 'c:\Git\PythonSelenium'; & 'C:\Python38\python.exe' 'c:\Users\User\.vscode\extensions\ms-python.python-2020.7.96456\pythonFiles\lib\python\debugpy\launcher' '62426' '--' 'c:\Git\PythonSelenium\BrowserManager\BrowserManager.py'
DevTools listening on ws://127.0.0.1:62436/devtools/browser/7260dee3-368c-4f21-bd59-2932f3122b2e
DevTools listening on ws://127.0.0.1:62463/devtools/browser/9a7ce919-23bd-4fee-b302-8d7481c4afcd
https://www.google.com/
https://consent.yahoo.com/collectConsent?sessionId=3_cc-session_d548b656-8315-4eef-bb1d-82fd4c6469f8&lang=en-GB&inline=false
https://www.google.com/
https://consent.yahoo.com/collectConsent?sessionId=3_cc-session_d548b656-8315-4eef-bb1d-82fd4c6469f8&lang=en-GB&inline=false
https://www.google.com/
I'm using Selenium to capture screenshots of a web page. It works great on sites like Stack Overflow, but I'm trying to use it on a page that never stops loading. Is there a way to grab the screenshot after x seconds regardless of whether it's done or not?
Current code:
import os
from selenium import webdriver

def main():
    driver = webdriver.Chrome()
    with open('test.txt', 'r') as f:
        for url in f.readlines():
            driver.get('http://' + url)
            sn_name = os.path.join('Screenshots', url.strip().replace('/', '-') + '.png')
            print('Attempting to save:', sn_name)
            if not driver.save_screenshot(sn_name):
                raise Exception('Could not save screen shot: ' + sn_name)
    driver.quit()

if __name__ == '__main__':
    main()
I don't think it works like that.
WebDriver will implicitly wait for the page to load until it times out, and it should give you a timeout exception.
I think you should use try-except to catch that and then take the screenshot.
Otherwise, you could use multithreading so that another thread takes the screenshot.
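A minimal sketch of that try/except idea, assuming a fixed page-load timeout of 10 seconds (the URL is a placeholder):
from selenium import webdriver
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.set_page_load_timeout(10)       # stop waiting for the page after 10 seconds
try:
    driver.get('http://example.com')   # placeholder for the never-finishing page
except TimeoutException:
    pass                               # load timed out; take the screenshot anyway
driver.save_screenshot('page.png')
driver.quit()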
I want to hit a website at a very high frequency, around 100 hits/sec.
In my code below, I have a sample process that opens google.com, finds an element, records the time, and then does some more random operations.
I observed that even for 5 requests, the time varies by seconds.
Example:
17:38:04
17:38:05
17:38:05
17:38:05
17:38:06
I need 100 requests to show the same time. Please improve my code or suggest something else to achieve this. I am open to using any technology that can help me reach 100 hits per second. My application is browser-based and needs to submit an HTML form and call an API in the background.
import multiprocessing as mp
from selenium import webdriver
import time

def run_test(params):
    driver = webdriver.Firefox()
    driver.get("http://google.com")
    a = driver.find_element_by_name("q")
    print(time.ctime())
    a.send_keys("priyanka chopra")
    a = driver.find_element_by_name("btnG")
    a.click()
    driver.quit()

if __name__ == '__main__':
    count = 5
    parameters = range(count)
    pool = mp.Pool(len(parameters))
    pool.map(run_test, parameters)
    time.sleep(5)