Optimizing Selenium code - Python

So I wrote some code to grab data about classes at a college to build an interactive scheduler. Here is the code I use to get the data:
from selenium import webdriver
import os
import pwd
import shlex
import re
import time

usr = pwd.getpwuid(os.getuid()).pw_name
Path = '/Users/%s/Downloads/chromedriver' % usr  # Have chromedriver downloaded

# Create a new instance of the Chrome driver
options = webdriver.ChromeOptions()
options.binary_location = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
options.add_argument('headless')  # Headless so no window is opened
options.add_argument('window-size=1200x600')
driver = webdriver.Chrome(Path, chrome_options=options)
driver.get('https://web.stevens.edu/scheduler/core/2017F/2017F.xml')  # Go to database

classes = {}

def Database(AllSelectedCourseInfo):
    ClassDict = {}
    for item in AllSelectedCourseInfo:  # Go through list of class info
        try:
            thing = item.split("=")  # Split string by = to get subject name and value
            name = thing[0]
            if any(char.isdigit() for char in thing[1]):  # Get rid of annoying Z at the end of numbers
                thing[1] = re.sub("[Z]", "", thing[1])
            value = thing[1]
            if value:  # If subject has a value, store it
                ClassDict[str(name)] = str(value)  # Store value in a dictionary with the subject as the key
        except IndexError:  # Item had no "=" separated value
            pass
    classes[str(ClassDict["Section"])] = ClassDict  # Add to dictionary

def makeDatabase(section):
    if "Title" in driver.find_element_by_xpath("//*[text()='%s']" % section).find_element_by_xpath("..").text:
        classSection = driver.find_elements_by_xpath("//*[text()='%s']" % section)  # If class name given, find class
        for i in range(0, len(classSection)):
            # Sort into a list, grouping strings in quotes and getting rid of unnecessary symbols
            AllSelectedCourseInfo = shlex.split(classSection[i].find_element_by_xpath(".." + "/.." * 4).text.replace("/>", "").replace(">", ""))
            Database(AllSelectedCourseInfo)
    else:
        classSection = driver.find_element_by_xpath("//*[text()='%s']" % section)  # If class section given, find class
        # Sort into a list, grouping strings in quotes and getting rid of unnecessary symbols
        AllSelectedCourseInfo = shlex.split(classSection.find_element_by_xpath(".." + "/.." * 3).text.replace("/>", "").replace(">", ""))
        Database(AllSelectedCourseInfo)

def printDic():
    for key in classes:
        print "\n-------------%s------------" % key
        for classkey in classes[key]:
            print "%s : %s" % (classkey, classes[key][classkey])

start = time.time()
makeDatabase("Differential Calculus")
makeDatabase("MA 124B")
printDic()
end = time.time()
print end - start
driver.quit()
It takes about 20 seconds to pull data for one class and one class section. If I am to make this practical, it is going to need at least 7 classes, and that would take over a minute just to create the dictionaries. Does anyone know of a way to make this run faster?

I tried to integrate lxml and requests into my code, but they just didn't have what I was looking for. After a few days of trying to use lxml to accomplish this, to no avail, I decided to try beautifulsoup4 with urllib. This worked better than I could have hoped:
from bs4 import BeautifulSoup
from HTMLParser import HTMLParser
import urllib
import shlex
import re
import time

h = HTMLParser()
page = urllib.urlopen('https://web.stevens.edu/scheduler/core/2017F/2017F.xml').read()  # Get the database
soup = BeautifulSoup(page)
RawClassData = soup.contents[10].contents[0].contents[0].contents
classes = {}
backupClasses = {}

def makeDatabase():
    for i in range(0, len(RawClassData)):  # Parse through each class
        try:
            # Sort into a list, grouping strings in quotes and getting rid of unnecessary symbols
            AllSelectedCourseInfo = shlex.split(h.unescape(str(RawClassData[i]).replace(">", " ")))
            ClassDict = {}
            for item in AllSelectedCourseInfo:  # Go through list of class info
                try:
                    thing = item.split("=")  # Split string by = to get subject name and value
                    name = thing[0]
                    if any(char.isdigit() for char in thing[1]):  # Get rid of annoying Z at the end of numbers
                        thing[1] = re.sub("[Z]", "", thing[1])
                    value = thing[1]
                    if value:  # If subject has a value, store it
                        ClassDict[str(name)] = str(value)  # Store value in a dictionary with the subject as the key
                except IndexError:  # Item had no "=" separated value
                    pass
            classes[str(ClassDict["section"])] = ClassDict
        except (KeyError, ValueError):  # Node was not a class entry
            pass

def printDic():
    with open("Classes", "w") as f:
        for key in classes:
            f.write("\n-------------%s------------" % key)
            for classkey in classes[key]:
                f.write("\n%s : %s" % (classkey, classes[key][classkey]))
            f.write("\n")

def printSection(selection):
    print "\n-------------%s------------" % selection
    for classkey in classes[selection]:
        print "%s : %s" % (classkey, classes[selection][classkey])

def printClass(selection):
    # Print every section whose title matches; fall back to treating the
    # selection as a section key if no title matched
    found = False
    for key in classes:
        if classes[key].get("title") == selection:
            found = True
            print "\n-------------%s------------" % key
            for classkey in classes[key]:
                print "%s : %s" % (classkey, classes[key][classkey])
    if not found and selection in classes:
        printSection(selection)

start = time.time()
makeDatabase()
end = time.time()
printClass("Circuits and Systems")
printClass("Differential Equations")
printClass("Writing & Communications Collqm")
printClass("Mechanics of Solids")
printClass("Electricity & Magnetism")
printClass("Engineering Design III")
printClass("Freshman Quiz")
printDic()
print end - start
This new code builds a library of all classes and then prints out the desired classes, all in 2 seconds. The selenium code took 89 seconds just to build the library for the desired classes and print them out; I would say that's a slight improvement... Thanks a ton to perfect5th for the suggestion!
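Since the source is a plain XML file, the standard-library XML parser is arguably an even simpler fit than an HTML parser. Here is a minimal sketch of that approach, assuming each course is an element whose attributes carry the same fields the shlex-based splitting recovers (Section, title, and so on; the attribute spellings are taken from the code above and may need adjusting):

import time
import urllib
import xml.etree.ElementTree as ET

start = time.time()
root = ET.fromstring(urllib.urlopen(
    'https://web.stevens.edu/scheduler/core/2017F/2017F.xml').read())

classes = {}
for course in root.iter():
    # Each attribute of the element becomes one field of the class record
    attrs = dict(course.attrib)
    if "Section" in attrs:
        classes[attrs["Section"]] = attrs

print "%d sections parsed in %.2f seconds" % (len(classes), time.time() - start)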

Related

Print the shop name repeatedly using Selenium Python

It prints all the details, but the problem is that it prints Master Juice repeatedly, as highlighted in the picture. This is the link I scrape from: https://www.foodpanda.pk/restaurants/new?lat=24.9414896&lng=67.1676002&vertical=restaurants
from selenium import webdriver

driver = webdriver.Chrome('F:/chromedriver')
driver.get("https://www.foodpanda.pk/restaurants/new?lat=24.9414896&lng=67.1676002&vertical=restaurants")
# response = scrapy.Selector(text=driver.page_source)
list = driver.find_elements_by_css_selector("ul.vendor-list li")
length = len(driver.find_elements_by_css_selector("ul.vendor-list li"))
for i in range(length):
    try:
        name = driver.find_elements_by_css_selector(".headline .name")[i].text
        time = driver.find_elements_by_css_selector(".badge-info")[i].text.strip()
        rating = driver.find_elements_by_css_selector(".rating")[i].text
        dealtag = driver.find_elements_by_css_selector(".multi-tag")[i].text
    except:
        pass
    print(name, time, rating, dealtag)
That's because it prints on every iteration even if there's an error, re-printing whatever was previously stored in your name, time, etc. variables. Try moving your print statement inside your try: block:
from selenium import webdriver

driver = webdriver.Chrome('F:/chromedriver')
driver.get("https://www.foodpanda.pk/restaurants/new?lat=24.9414896&lng=67.1676002&vertical=restaurants")
# response = scrapy.Selector(text=driver.page_source)
list = driver.find_elements_by_css_selector("ul.vendor-list li")
length = len(driver.find_elements_by_css_selector("ul.vendor-list li"))
for i in range(length):
    try:
        name = driver.find_elements_by_css_selector(".headline .name")[i].text
        time = driver.find_elements_by_css_selector(".badge-info")[i].text.strip()
        rating = driver.find_elements_by_css_selector(".rating")[i].text
        dealtag = driver.find_elements_by_css_selector(".multi-tag")[i].text
        print(name, time, rating, dealtag)
    except:
        pass
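As a further tweak, you could scope each lookup to its own card element instead of re-querying the whole page four times per row; that way a vendor missing one field can never shift the indices of the others. A sketch, assuming the same selectors as above:

for card in driver.find_elements_by_css_selector("ul.vendor-list li"):
    try:
        # Search within the card only, so indices can't drift between lists
        name = card.find_element_by_css_selector(".headline .name").text
        eta = card.find_element_by_css_selector(".badge-info").text.strip()
        rating = card.find_element_by_css_selector(".rating").text
        dealtag = card.find_element_by_css_selector(".multi-tag").text
        print(name, eta, rating, dealtag)
    except Exception:
        pass  # skip cards that are missing one of the fields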

How to scrape an entire integer in Python with Beautiful Soup?

Working on getting some wave heights from websites, and my code fails when the wave heights get into the double-digit range.
Ex: currently the code scrapes a 12 from the site as '1' and '2' separately, not '12'.
#Author: David Owens
#File name: soupScraper.py
#Description: html scraper that takes surf reports from various websites

import csv
import requests
from bs4 import BeautifulSoup

NUM_SITES = 2

reportsFinal = []

###################### SURFLINE URL STRINGS AND TAG ###########################

slRootUrl = 'http://www.surfline.com/surf-report/'
slSunsetCliffs = 'sunset-cliffs-southern-california_4254/'
slScrippsUrl = 'scripps-southern-california_4246/'
slBlacksUrl = 'blacks-southern-california_4245/'
slCardiffUrl = 'cardiff-southern-california_4786/'

slTagText = 'observed-wave-range'
slTag = 'id'

#list of surfline URL endings
slUrls = [slSunsetCliffs, slScrippsUrl, slBlacksUrl]

###############################################################################

#################### MAGICSEAWEED URL STRINGS AND TAG #########################

msRootUrl = 'http://magicseaweed.com/'
msSunsetCliffs = 'Sunset-Cliffs-Surf-Report/4211/'
msScrippsUrl = 'Scripps-Pier-La-Jolla-Surf-Report/296/'
msBlacksUrl = 'Torrey-Pines-Blacks-Beach-Surf-Report/295/'

msTagText = 'rating-text'
msTag = 'li'

#list of magicseaweed URL endings
msUrls = [msSunsetCliffs, msScrippsUrl, msBlacksUrl]

###############################################################################

'''
This class represents a surf break. It contains all wave, wind, & tide data
associated with that break relevant to the website
'''
class surfBreak:
    def __init__(self, name, low, high, wind, tide):
        self.name = name
        self.low = low
        self.high = high
        self.wind = wind
        self.tide = tide

    #toString method
    def __str__(self):
        return '{0}: Wave height: {1}-{2} Wind: {3} Tide: {4}'.format(self.name,
            self.low, self.high, self.wind, self.tide)
#END CLASS

'''
This returns the proper attribute from the surf report sites
'''
def reportTagFilter(tag):
    return (tag.has_attr('class') and 'rating-text' in tag['class']) \
        or (tag.has_attr('id') and tag['id'] == 'observed-wave-range')
#END METHOD

'''
This method checks if the parameter is of type int
'''
def representsInt(s):
    try:
        int(s)
        return True
    except ValueError:
        return False
#END METHOD

'''
This method extracts all ints from a list of reports

reports: The list of surf reports from a single website
returns: reportNums - A list of ints of the wave heights
'''
def extractInts(reports):
    print reports
    reportNums = []
    afterDash = False
    num = 0
    tens = 0
    ones = 0

    #extract all ints from the reports and ditch the rest
    for report in reports:
        for char in report:
            if representsInt(char) == True:
                num = int(char)
                reportNums.append(num)
            else:
                afterDash = True

    return reportNums
#END METHOD

'''
This method iterates through a list of urls and extracts the surf report from
the webpage dependent upon its tag location

rootUrl: The root url of each surf website
urlList: A list of specific urls to be appended to the root url for each break
tag:     the html tag where the actual report lives on the page

returns: a list of strings of each break's surf report
'''
def extractReports(rootUrl, urlList, tag, tagText):
    #empty list to hold reports
    reports = []
    reportNums = []
    index = 0

    #loop thru URLs
    for url in urlList:
        try:
            index += 1
            #request page
            request = requests.get(rootUrl + url)

            #turn into soup
            soup = BeautifulSoup(request.content, 'lxml')

            #get the tag where surfline's report lives
            reportTag = soup.findAll(reportTagFilter)[0]
            reports.append(reportTag.text.strip())
        #notify if fail
        except:
            print 'scrape failure at URL ', index
            pass

    reportNums = extractInts(reports)
    return reportNums
#END METHOD

'''
This method calculates the average of the wave heights
'''
def calcAverages(reportList):
    #empty list to hold averages
    finalAverages = []
    listIndex = 0
    waveIndex = 0

    #loop thru list of reports to calc each break's ave low and high
    for x in range(0, 6):
        #get low ave
        average = (reportList[listIndex][waveIndex]
            + reportList[listIndex+1][waveIndex]) / NUM_SITES
        finalAverages.append(average)
        waveIndex += 1

    return finalAverages
#END METHOD

slReports = extractReports(slRootUrl, slUrls, slTag, slTagText)
msReports = extractReports(msRootUrl, msUrls, msTag, msTagText)

reportsFinal.append(slReports)
reportsFinal.append(msReports)

print 'Surfline: ', slReports
print 'Magicseaweed: ', msReports
You are not actually extracting integers but floats, it seems, since the values in reports are something like ['0.3-0.6 m']. Right now you are going through every single character and either converting it to an int or discarding it, so it's no wonder you only get single-digit numbers.
One (arguably) simple way to extract those numbers from that string is with a regexp:
import re

FLOATEXPR = re.compile(r"(\d+\.\d)-(\d+\.\d) {0,1}m")

def extractFloats(reports):
    reportNums = []
    for report in reports:
        groups = re.match(FLOATEXPR, report).groups()
        for group in groups:
            reportNums.append(float(group))
    return reportNums
This expression would match your floats and return them as a list.
In detail, the expression matches anything that has at least one digit before a '.' and one digit after it, a '-' in between, another such float sequence, and an ending of 'm' or ' m'. It then groups the parts representing the floats into a tuple. For example, '12.0-3.0m' would return [12.0, 3.0]. If you expect more digits after the decimal point, you can add an extra '+' after the second '\d' in each group.
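For instance, a quick check with made-up report strings in the format above:

>>> extractFloats(['0.3-0.6 m', '1.0-1.5m'])
[0.3, 0.6, 1.0, 1.5]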

Pyuno indexing issue that I would like an explanation for

The following Python LibreOffice Uno macro works, but only with the try..except statement.
The macro allows you to select text in a writer document and send it to a search engine in your default browser.
The issue is that if you select a single piece of text, oSelected.getByIndex(0) is populated, but if you select multiple pieces of text, oSelected.getByIndex(0) is not populated. In that case the data starts at oSelected.getByIndex(1) and oSelected.getByIndex(0) is left blank.
I have no idea why this should be and would love to know if anyone can explain this strange behaviour.
#!/usr/bin/python
import os
import webbrowser
from configobj import ConfigObj
from com.sun.star.awt.MessageBoxButtons import BUTTONS_OK, BUTTONS_OK_CANCEL, BUTTONS_YES_NO, BUTTONS_YES_NO_CANCEL, BUTTONS_RETRY_CANCEL, BUTTONS_ABORT_IGNORE_RETRY
from com.sun.star.awt.MessageBoxButtons import DEFAULT_BUTTON_OK, DEFAULT_BUTTON_CANCEL, DEFAULT_BUTTON_RETRY, DEFAULT_BUTTON_YES, DEFAULT_BUTTON_NO, DEFAULT_BUTTON_IGNORE
from com.sun.star.awt.MessageBoxType import MESSAGEBOX, INFOBOX, WARNINGBOX, ERRORBOX, QUERYBOX

def fs3Browser(*args):
    #get the doc from the scripting context which is made available to all scripts
    desktop = XSCRIPTCONTEXT.getDesktop()
    model = desktop.getCurrentComponent()
    doc = XSCRIPTCONTEXT.getDocument()
    parentwindow = doc.CurrentController.Frame.ContainerWindow
    oSelected = model.getCurrentSelection()
    oText = ""
    try:
        for i in range(0, 4, 1):
            print ("Index No ", str(i))
            try:
                oSel = oSelected.getByIndex(i)
                print (str(i), oSel.getString())
                oText += oSel.getString() + " "
            except:
                break
    except AttributeError:
        mess = "Do not select text from more than one table cell"
        heading = "Processing error"
        MessageBox(parentwindow, mess, heading, INFOBOX, BUTTONS_OK)
        return
    lookup = str(oText)
    special_c = str.maketrans("", "", '!|##"$~%&/()=?+*][}{-;:,.<>')
    lookup = lookup.translate(special_c)
    lookup = lookup.strip()
    configuration_dir = os.environ["HOME"] + "/fs3"
    config_filename = configuration_dir + "/fs3.cfg"
    if os.access(config_filename, os.R_OK):
        cfg = ConfigObj(config_filename)
        #define search engine from the configuration file
        try:
            searchengine = cfg["control"]["ENGINE"]
        except:
            searchengine = "https://duckduckgo.com"
    if 'duck' in searchengine:
        webbrowser.open_new('https://www.duckduckgo.com//?q='+lookup+'&kj=%23FFD700 &k7=%23C9C4FF &ia=meanings')
    else:
        webbrowser.open_new('https://www.google.com/search?/&q='+lookup)
    return None

def MessageBox(ParentWindow, MsgText, MsgTitle, MsgType, MsgButtons):
    ctx = XSCRIPTCONTEXT.getComponentContext()
    sm = ctx.ServiceManager
    si = sm.createInstanceWithContext("com.sun.star.awt.Toolkit", ctx)
    mBox = si.createMessageBox(ParentWindow, MsgType, MsgButtons, MsgTitle, MsgText)
    mBox.execute()
Your code is missing something. This works without needing an extra try/except clause:
selected_strings = []
try:
    for i in range(oSelected.getCount()):
        oSel = oSelected.getByIndex(i)
        if oSel.getString():
            selected_strings.append(oSel.getString())
except AttributeError:
    # handle exception...
    return
result = " ".join(selected_strings)
To answer your question about the "strange behaviour," it seems pretty straightforward to me. If the 0th element is empty, then there are multiple selections which may need to be handled differently.
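If you do want to branch on that explicitly, here is a small sketch based on the behaviour described above (index 0 is only empty when there are multiple selection ranges):

count = oSelected.getCount()
if count == 1 or oSelected.getByIndex(0).getString():
    mode = "single selection"
else:
    mode = "multiple selections (index 0 left blank)"
print(mode)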

Reading specific test steps from Quality Center with python

I am working with Quality Center via the OTA COM library. I figured out how to connect to the server, but I am lost in the OTA documentation on how to work with it. What I need is to create a function which takes a test name as input and returns the number of steps in that test from QC.
This is how far I have gotten:
import win32com
from win32com.client import Dispatch
# import codecs #to store info in additional codecs
import re
import json
import getpass #for password

qcServer = "***"
qcUser = "***"
qcPassword = getpass.getpass('Password: ')
qcDomain = "***"
qcProject = "***"

td = win32com.client.Dispatch("TDApiOle80.TDConnection.1")

#Starting to connect
td.InitConnectionEx(qcServer)
td.Login(qcUser, qcPassword)
td.Connect(qcDomain, qcProject)
if td.Connected:
    print "Connected to " + qcProject
else:
    print "Connection failed"

#Path = "Subject\Regression\C.001_Band_tones"
mg = td.TreeManager
npath = "Subject\Regression"
tsFolder = td.TestSetTreeManager.NodeByPath(npath)
print tsFolder

td.Disconnect()
td.Logout()
print "Disconnected from " + qcProject
Any help with decent Python examples or tutorials will be highly appreciated. For now I found this and this, but they don't help.
Using the OTA API to get data from Quality Center normally means getting some element by path, creating a factory, and then using the factory to search for the object. In your case you need the TreeManager to get a folder in the Test Plan, then a TestFactory to get the test, and finally a DesignStepFactory to get the steps. I'm no Python programmer, but I hope you can get something out of this:
mg = td.TreeManager
npath = "Subject\Test"
tsFolder = mg.NodeByPath(npath)
testFactory = tsFolder.TestFactory
testFilter = testFactory.Filter
testFilter["TS_NAME"] = "Some Test"
testList = testFactory.NewList(testFilter.Text)
test = testList.Item(1)  # There should be only 1 item
print test.Name
stepFactory = test.DesignStepFactory
stepList = stepFactory.NewList("")
for step in stepList:
    print step.StepName
It takes some time to get used to the QC OTA API documentation, but I find it very helpful. Nearly all of my knowledge comes from the examples in the API documentation; for your problem there are examples like "Finding a unique test" or "Get a test object with name and path". Both are examples for the Test object. Even if the examples are in VB, it should be no big thing to adapt them to Python.
I figured out the solution; if there is a better way to do this, you are welcome to post it.
import win32com
from win32com.client import Dispatch
import getpass

def number_of_steps(name):
    qcServer = "***"
    qcUser = "***"
    qcPassword = getpass.getpass('Password: ')
    qcDomain = "***"
    qcProject = "***"
    td = win32com.client.Dispatch("TDApiOle80.TDConnection.1")

    #Starting to connect
    td.InitConnectionEx(qcServer)
    td.Login(qcUser, qcPassword)
    td.Connect(qcDomain, qcProject)
    if td.Connected:
        print "Connected to " + qcProject
    else:
        print "Connection failed"

    mg = td.TreeManager  # Tree manager
    folder = mg.NodeByPath("Subject\Regression")
    testList = folder.FindTests(name)  # Make a list of tests matching name (partial match is accepted)

    if testList is not None:
        if len(testList) > 1:
            print "There are multiple tests matching this name, please check input parameter\nTests matching"
            for test in testList:
                print test.Name
            td.Disconnect()
            td.Logout()
            return False
        if len(testList) == 1:
            print "In test %s there are %d steps" % (testList[0].Name, testList[0].DesStepsNum)
    else:
        print "There is no test with this test name in Quality Center"
        td.Disconnect()
        td.Logout()
        return False

    td.Disconnect()
    td.Logout()
    print "Disconnected from " + qcProject
    return testList[0].DesStepsNum  # Return number of steps for the given test
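A quick usage sketch (the test name here is just taken from the commented path in the question; substitute a real test under Subject\Regression):

steps = number_of_steps("C.001_Band_tones")
if steps:
    print "The test has %d design steps" % steps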

How do I join results of looping script into a single variable?

I have a looping script returning different filtered results. I can make this data return as an array for each of the different filter classes; however, I am unsure of the best method to join all of these arrays together.
import mechanize
import urllib
import json
import re
import random
import datetime
from sched import scheduler
from time import time, sleep
from sets import Set

##### Code to loop the script and set up scheduling time
s = scheduler(time, sleep)
random.seed()

##### Code to stop duplicates part 1
userset = set()

def run_periodically(start, end, interval, func):
    event_time = start
    while event_time < end:
        s.enterabs(event_time, 0, func, ())
        event_time += interval + random.randrange(-5, 10)
    s.run()

##### Code to get the data required from the URL desired
def getData():
    post_url = "URL OF INTEREST"
    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.addheaders = [('User-agent', 'Firefox')]

    ##### These are the parameters you've got from checking with the aforementioned tools
    parameters = {'page' : '1',
                  'rp' : '250',
                  'sortname' : 'race_time',
                  'sortorder' : 'asc'
                 }

    ##### Encode the parameters
    data = urllib.urlencode(parameters)
    trans_array = browser.open(post_url, data).read().decode('UTF-8')
    xmlload1 = json.loads(trans_array)
    pattern2 = re.compile('/control/profile/view/(.*)\' title=')
    pattern4 = re.compile('title=\'posted: (.*) strikes:')
    pattern5 = re.compile('strikes: (.*)\'><img src=')

    for row in xmlload1['rows']:
        cell = row["cell"]

        ##### defining the Keys (key is the area from which data is pulled in the XML) for use in the pattern finding/regex
        user_delimiter = cell['username']
        selection_delimiter = cell['race_horse']

        user_numberofselections = float(re.findall(pattern4, user_delimiter)[0])
        user_numberofstrikes = float(re.findall(pattern5, user_delimiter)[0])
        strikeratecalc1 = user_numberofstrikes/user_numberofselections
        strikeratecalc2 = strikeratecalc1*100

        userid_delimiter_results = (re.findall(pattern2, user_delimiter)[0])

        ##### Code to stop duplicates throughout the day part 2 (skips if the id is already in the userset)
        if userid_delimiter_results in userset: continue
        userset.add(userid_delimiter_results)

        arraym = ""
        arrayna = ""

        if strikeratecalc2 > 50 and strikeratecalc2 < 100:
            arraym0 = "System M"
            arraym1 = "user id = ", userid_delimiter_results
            arraym2 = "percentage = ", strikeratecalc2, "%"
            arraym3 = ""
            arraym = [arraym0, arraym1, arraym2, arraym3]

        if strikeratecalc2 > 0 and strikeratecalc2 < 50:
            arrayna0 = "System NA"
            arrayna1 = "user id = ", userid_delimiter_results
            arrayna2 = "percentage = ", strikeratecalc2, "%"
            arrayna3 = ""
            arrayna = [arrayna0, arrayna1, arrayna2, arrayna3]

getData()
run_periodically(time()+5, time()+1000000, 10, getData)
What I want to be able to do is return both 'arraym' and 'arrayna' as one final array. However, due to the looping nature of the script, on each loop the old 'arraym'/'arrayna' are overwritten; my attempts to yield one array containing all of the data have resulted in just the last user id for 'System M' and the last user id for 'System NA'. This is obviously because each run of the loop overwrites the old 'arraym' and 'arrayna', but I do not know of a way to get around this so that all of my data can be accumulated in one array. Please note, I have been coding for cumulatively two weeks now, so there may well be some simple function to overcome this problem.
Kind regards AEA
Without looking at that huge code segment, typically you can do something like:
my_array = []  # Create an empty list
for <some loop>:
    my_array.append(some_value)
# At this point, my_array is a list containing some_value for each loop iteration
print(my_array)
Look into Python's list.append().
So your code might look something like:
#...
arraym = []
arrayna = []
for row in xmlload1['rows']:
    #...
    if strikeratecalc2 > 50 and strikeratecalc2 < 100:
        arraym.append("System M")
        arraym.append("user id = %s" % userid_delimiter_results)
        arraym.append("percentage = %s%%" % strikeratecalc2)
        arraym.append("")
    if strikeratecalc2 > 0 and strikeratecalc2 < 50:
        arrayna.append("System NA")
        arrayna.append("user id = %s" % userid_delimiter_results)
        arrayna.append("percentage = %s%%" % strikeratecalc2)
        arrayna.append("")
#...
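One caveat with the scheduled setup: since run_periodically re-invokes getData(), any list created inside getData() is rebuilt on every call. Keeping one accumulator at module level (as userset already is) avoids that. A sketch, where the record helper and the combined results list are hypothetical names:

results = []  # module-level, so it persists across scheduled runs

def record(user_id, strike_rate):
    # Classify one user into System M or System NA and append to the
    # same module-level list on every scheduled run
    if 50 < strike_rate < 100:
        results.append(("System M", user_id, strike_rate))
    elif 0 < strike_rate < 50:
        results.append(("System NA", user_id, strike_rate))

# Inside getData()'s row loop you would then call, e.g.:
# record(userid_delimiter_results, strikeratecalc2)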
