I am new to python and I m learning by doing.
At this moment, my code is running quite slow and it seems to take longer and longer by each time I run it.
The idea is to download an employee list as CSV, then to check the location of each Employee ID by running it trough a specific page then writing it to an excel file.
We have around 600 associates on site each day and I need to find their location and to keep refreshing it each 2-4 minutes.
EDIT:
For everyone to have a better understanding, I have a CSV file ( TOT.CSV ) that contains Employee ID's, Names and other information of the associates that I have on site.
In order to get their location, I need to run each employee ID from that CSV file trough https://guided-coaching-dub.corp.amazon.com/api/employee-location-svc/GetLastSeenLocationOfEmployee?employeeId= 1 by 1 and at the same time to write it in another CSV file ( Location.csv ). Right now, it does in about 10 minutes and I want to understand if the way I did it is the best possible way, or if there is something else that I could try.
My code looks like this:
# GET EMPLOYEE ID FROM THE CSV
data = read_csv("Z:\\_Tracker\\Dump\\attendance\\TOT.csv")
# converting column data to list
TOT_employeeID = data['Employee ID'].tolist()
# Clean the Location Sheet
with open("Z:\\_Tracker\\Dump\\attendance\\Location.csv", "w") as f:
pass
print("Previous Location data cleared ... ")
# go through EACH employee ID to find out location
for x in TOT_employeeID:
driver.get(
"https://guided-coaching-dub.corp.amazon.com/api/employee-location-svc/GetLastSeenLocationOfEmployee?employeeId=" + x)
print("Getting Location data for EmployeeID: " + x)
locData = driver.find_element(By.TAG_NAME, 'body').text
aaData = str(locData)
realLoc = aaData.split('"')
# write to excel
with open("Z:\\_Tracker\\Dump\\attendance\\Location.csv",
"a") as f:
writer = csv.writer(f)
writer.writerow(realLoc)
time.sleep(5)
print("Employee Location data downloaded...")
Is there a way I can do this faster?
Thank you in advance!
Regards,
Alex
Something like this.
import concurrent.futures
def process_data(data: pd.DataFrame) -> None:
associates = data['Employee ID'].unique()
with concurrent.futures.ProcessPoolExecutor() as executer:
executer.map(get_location, associates)
def get_location(associate: str) -> None:
driver.get(
"https://guided-coaching-dub.corp.amazon.com/api/employee-location-svc/GetLastSeenLocationOfEmployee?"
f"employeeId={associate}")
print(f"Getting Location data for EmployeeID: {associate}")
realLoc = str(driver.find_element(By.TAG_NAME, 'body').text).split('"')
with open("Z:\\_Tracker\\Dump\\attendance\\Location.csv", "a") as f:
writer = csv.writer(f)
writer.writerow(realLoc)
if __name__ == "__main__":
data = read_csv("Z:\\_Tracker\\Dump\\attendance\\TOT.csv")
process_data(data)
You could try separating the step of reading the information and writing the information to your CSV file, like below:
# Get Employee Location Information
# Create list for employee information, to be used below
employee_Locations = []
for x in TOT_employeeID:
driver.get("https://guided-coaching-dub.corp.amazon.com/api/employee-location-svc/GetLastSeenLocationOfEmployee?employeeId=" + x)
print("Getting Location data for EmployeeID: " + x)
locData = driver.find_element(By.TAG_NAME, 'body').text
aaData = str(locData)
realLoc = [aaData.split('"')]
employee_Locations.extend(realLoc)
# Write to excel - Try this as a separate step
with open("Z:\\_Tracker\\Dump\\attendance\\Location.csv","a") as f:
writer = csv.writer(f, delimiter='\n')
writer.writerow(employee_Locations)
print("Employee Location data downloaded...")
You may see some performance gains by collecting all your information first, then writing to your CSV file
So i am trying to write a Stroop experiment from scratch.
Ideally this is how i could like the experiment to be set up:
Enter participant information
Instruction pages (click x to continue)
Instruction page 2 (click x to continue)
Experiment Start
Break between trial
Experiment trial 2
End
(there will be more than 2 trials but for testing just 2 will be used)
I'm having difficulty writing the data to a text file. The second trial records perfectly with the different values per each loop. However the first trial shows up as duplicates and each trial has the same values in the text file.
In addition, i can't figure out how to write the data from the pop-up into my text file. (ie. subject name, age, id)
Also is there a way I can input the file name each time? Without changing code? -perhaps like a popup to choose the path and file name?
Thank you!
from psychopy import visual, core
import random
import time
import datetime
import sys
from psychopy import gui
from psychopy import event
#Write to file, need to figure out how to choose file name in each instance
file = open ("Test Output.txt", 'w')
#Pop up subject information - need to figure out how to output this data
myDlg = gui.Dlg(title="TEST TEXT BOX")
myDlg.addText('Subject info')
myDlg.addField('Name:')
myDlg.addField('Age:', )
myDlg.addText('Experiment Info')
myDlg.addField('Subject ID', "#" )
myDlg.addField('Group:', choices=["Test", "Control"])
ok_data = myDlg.show()
if myDlg.OK:
print(ok_data)
else:
print('user cancelled')
#opens up window w/ text,
win = visual.Window([800,800],monitor="testmonitor", units="deg")
msg = visual.TextStim(win, text="Hello")
msg.draw()
win.flip()
event.waitKeys(maxWait=10, keyList=None, timeStamped=False) #page remains until keyboard input, or max of 10 seconds
#with keyboard input, second screen will come up
msg = visual.TextStim(win, text="Instructions 1")
msg.draw()
win.flip()
event.waitKeys(maxWait=10, keyList=None, timeStamped=False)
#3rd screen will pop up with keyboard input
msg = visual.TextStim(win, text="Trial 1")
msg.draw()
win.flip()
event.waitKeys(maxWait=10, keyList=None, timeStamped=False)
#Trial starts,
for frameN in range(5):
MyColor = random.choice(['red','blue','green','yellow'])
Phrase = random.choice(["Red","Green", "Blue", "Yellow"])
time = str(datetime.datetime.now())
key = str(event.getKeys(keyList=['1','2','3','4','5'], ))
pause = random.randint(1200,2200)/1000.0
length = str(pause)
msg = visual.TextStim(win, text=Phrase,pos=[0,+1],color=MyColor)
msg.draw()
win.flip()
core.wait(pause)
msg = visual.TextStim(win, text="Break between trial")
msg.draw()
win.flip()
event.waitKeys(maxWait=10, keyList=None, timeStamped=False)
#trial 2
for frameN in range(5):
MyColor2 = random.choice(['red','blue','green','yellow'])
Phrase2 = random.choice(["Red","Green", "Blue", "Yellow"])
time2 = str(datetime.datetime.now())
key2 = str(event.getKeys(keyList=['1','2','3','4','5'], ))
pause2 = random.randint(1200,2200)/1000.0
length2 = str(pause2)
msg = visual.TextStim(win, text=Phrase2,pos=[0,+1],color=MyColor2)
msg.draw()
win.flip()
core.wait(pause2)
#specifying which data will be recorded into the file
data = "Stimuli:"+ MyColor + ',' + Phrase + ','+ time + ',' + key + ',' + length + MyColor2 + ',' + Phrase2 + ','+ time2 + ',' + key2 + ',' + length2
file.write(data + '\n')
#Jessica's Code.
You should really consider using the TrialHandler and/or ExperimentHandler classes that are built into PsychoPy: they have solved this (and many more issues) for you already. You don't need to re-invent the wheel.
i.e. define the trial parameters (in your case, colours and phrases) and feed them to the TrialHandler when it is created. It will then automatically cycle through each trial (in sequence or randomly, as required), and handle saving the data for you in structured files automatically. Data gathered from the experiment info dialog is saved with the data, as the dictionary of info gathered from the dialog can be passed as the extraInfo parameter when a TrialHandler or ExperimentHandler is created.
The PsychoPy data API is here: http://www.psychopy.org/api/data.html and there are examples of using the TrialHandler and ExperimentHandler under the Demos → exp control menu. Or examine any simple Builder-generated code for an experiment which contains a loop. For example, the Builder Stroop demo ;-) Builder code is quite verbose, but just look at the part where the Trial/Experiment handlers are created and how the experimental loop is controlled.
Have you considered using command line arguments? This would let you pass in file names at the start of your script. For example:
python myscript.py InputFile1 InputFile2 OutputFile1 OutputFile2
There is a really nice module that does a lot of the heavy lifting for you called argparse:
https://docs.python.org/3/library/argparse.html
Here is a tutorial that the docs provide if you are a little intimidated:
https://docs.python.org/3/howto/argparse.html
If you need Python 2 documentation, you can just change the 3 into a 2 in the URL. Here is a little code sample to show you what you can do with it as well:
import argparse
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--input", required = True, help = "Path to input file")
ap.add_argument("-o", "--output", required = True, help = "Path to output file")
args = vars(ap.parse_args())
print(args["input"])
print(args["output"])
You then can call this from your terminal to pass your file locations (or whatever else you want to pass):
python myscript.py -i File1.txt -o File2.txt
You will then get the following output from the two print statements in the code above:
File1.txt
File2.txt
So you can now use args["input"] and args["output"] to tell your program where it needs to get its input and output from without directly putting it in your code.
I am hoping to extract the change in cost of living from one city against many cities. I plan to list the cities I would like to compare in a CSV file and using this list to create the web link that would take me to the website with the information I am looking for.
Here is the link to an example: http://www.expatistan.com/cost-of-living/comparison/phoenix/new-york-city
Unfortunately I am running into several challenges. Any assistance to the following challenges is greatly appreciated!
The output only shows the percentage, but no indication whether it is more expensive or cheaper. For the example listed above, my output based on the current code shows 48%, 129%, 63%, 43%, 42%, and 42%. I tried to correct for this by adding an 'if-statement' to add '+' sign if it is more expensive, or a '-' sign if it is cheaper. However, this 'if-statement' is not functioning correctly.
When I write the data to a CSV file, each of the percentages is written to a new row. I can't seem to figure out how to write it as a list on one line.
(related to item 2) When I write the data to a CSV file for the example listed above, the data is written in the format listed below. How can I correct the format and have the data written in the preferred format listed below (also without the percentage sign)?
CURRENT CSV FORMAT (Note: 'if-statement' not functioning correctly):
City,Food,Housing,Clothes,Transportation,Personal Care,Entertainment
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,8,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,1,2,9,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,6,3,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,3,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,2,%
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,2,%
PREFERRED CSV FORMAT:
City,Food,Housing,Clothes,Transportation,Personal Care,Entertainment
new-york-city, 48,129,63,43,42,42
Here is my current code:
import requests
import csv
from bs4 import BeautifulSoup
#Read text file
Textfile = open("City.txt")
Textfilelist = Textfile.read()
Textfilelistsplit = Textfilelist.split("\n")
HomeCity = 'Phoenix'
i=0
while i<len(Textfilelistsplit):
url = "http://www.expatistan.com/cost-of-living/comparison/" + HomeCity + "/" + Textfilelistsplit[i]
page = requests.get(url).text
soup_expatistan = BeautifulSoup(page)
#Prepare CSV writer.
WriteResultsFile = csv.writer(open("Expatistan.csv","w"))
WriteResultsFile.writerow(["City","Food","Housing","Clothes","Transportation","Personal Care", "Entertainment"])
expatistan_table = soup_expatistan.find("table",class_="comparison")
expatistan_titles = expatistan_table.find_all("tr",class_="expandable")
for expatistan_title in expatistan_titles:
percent_difference = expatistan_title.find("th",class_="percent")
percent_difference_title = percent_difference.span['class']
if percent_difference_title == "expensiver":
WriteResultsFile.writerow(Textfilelistsplit[i] + '+' + percent_difference.span.string)
else:
WriteResultsFile.writerow(Textfilelistsplit[i] + '-' + percent_difference.span.string)
i+=1
Answers:
Question 1: the class of the span is a list, you need to check if expensiver is inside this list. In other words, replace:
if percent_difference_title == "expensiver"
with:
if "expensiver" in percent_difference.span['class']
Questions 2 and 3: you need to pass a list of column values to writerow(), not string. And, since you want only one record per city, call writerow() outside of the loop (over the trs).
Other issues:
open csv file for writing before the loop
use with context managers while working with files
try to follow PEP8 style guide
Here's the code with modifications:
import requests
import csv
from bs4 import BeautifulSoup
BASE_URL = 'http://www.expatistan.com/cost-of-living/comparison/{home_city}/{city}'
home_city = 'Phoenix'
with open('City.txt') as input_file:
with open("Expatistan.csv", "w") as output_file:
writer = csv.writer(output_file)
writer.writerow(["City", "Food", "Housing", "Clothes", "Transportation", "Personal Care", "Entertainment"])
for line in input_file:
city = line.strip()
url = BASE_URL.format(home_city=home_city, city=city)
soup = BeautifulSoup(requests.get(url).text)
table = soup.find("table", class_="comparison")
differences = []
for title in table.find_all("tr", class_="expandable"):
percent_difference = title.find("th", class_="percent")
if "expensiver" in percent_difference.span['class']:
differences.append('+' + percent_difference.span.string)
else:
differences.append('-' + percent_difference.span.string)
writer.writerow([city] + differences)
For the City.txt containing just one new-york-city line, it produces Expatistan.csv with the following content:
City,Food,Housing,Clothes,Transportation,Personal Care,Entertainment
new-york-city,+48%,+129%,+63%,+43%,+42%,+42%
Make sure you understand what changes have I made. Let me know if you need further help.
csv.writer.writerow() takes a sequence and makes each element a column; normally you'd give it a list with columns, but you are passing in strings instead; that'll add individual characters as columns instead.
Just build a list, then write it to the CSV file.
First, open the CSV file once, not for every separate city; you are clearing out the file every time you open it.
import requests
import csv
from bs4 import BeautifulSoup
HomeCity = 'Phoenix'
with open("City.txt") as cities, open("Expatistan.csv", "wb") as outfile:
writer = csv.writer(outfile)
writer.writerow(["City", "Food", "Housing", "Clothes",
"Transportation", "Personal Care", "Entertainment"])
for line in cities:
city = line.strip()
url = "http://www.expatistan.com/cost-of-living/comparison/{}/{}".format(
HomeCity, city)
resp = requests.get(url)
soup = BeautifulSoup(resp.content, from_encoding=resp.encoding)
titles = soup.select("table.comparison tr.expandable")
row = [city]
for title in titles:
percent_difference = title.find("th", class_="percent")
changeclass = percent_difference.span['class']
change = percent_difference.span.string
if "expensiver" in changeclass:
change = '+' + change
else:
change = '-' + change
row.append(change)
writer.writerow(row)
So, first of all, one passes the writerow method an iterable, and each object in that iterable gets written with commas separating them. So if you give it a string, then each character gets separated:
WriteResultsFile.writerow('hello there')
writes
h,e,l,l,o, ,t,h,e,r,e
But
WriteResultsFile.writerow(['hello', 'there'])
writes
hello,there
That's why you are getting results like
n,e,w,-,y,o,r,k,-,c,i,t,y,-,4,8,%
The rest of your problems are errors in your webscraping. First of all, when I scrape the site, searching for tables with CSS class "comparison" gives me None. So I had to use
expatistan_table = soup_expatistan.find("table","comparison")
Now, the reason your "if statement is broken" is because
percent_difference.span['class']
returns a list. If we modify that to
percent_difference.span['class'][0]
things will work the way you expect.
Now, your real issue is that inside the innermost loop you are finding the % changing in price for the individual items. You want these as items in your row of price differences, not individual rows. So, I declare an empty list items to which I append percent_difference.span.string, and then write the row outside the innermost loop Like so:
items = []
for expatistan_title in expatistan_titles:
percent_difference = expatistan_title.find("th","percent")
percent_difference_title = percent_difference.span["class"][0]
print percent_difference_title
if percent_difference_title == "expensiver":
items.append('+' + percent_difference.span.string)
else:
items.append('-' + percent_difference.span.string)
row = [Textfilelistsplit[i]]
row.extend(items)
WriteResultsFile.writerow(row)
The final error, is the in the while loop you re-open the csv file, and overwrite everything so you only have the final city in the end. Accounting for all theses errors (many of which you should have been able to find without help) leaves us with:
#Prepare CSV writer.
WriteResultsFile = csv.writer(open("Expatistan.csv","w"))
i=0
while i<len(Textfilelistsplit):
url = "http://www.expatistan.com/cost-of-living/comparison/" + HomeCity + "/" + Textfilelistsplit[i]
page = requests.get(url).text
print url
soup_expatistan = BeautifulSoup(page)
WriteResultsFile.writerow(["City","Food","Housing","Clothes","Transportation","Personal Care", "Entertainment"])
expatistan_table = soup_expatistan.find("table","comparison")
expatistan_titles = expatistan_table.find_all("tr","expandable")
items = []
for expatistan_title in expatistan_titles:
percent_difference = expatistan_title.find("th","percent")
percent_difference_title = percent_difference.span["class"][0]
print percent_difference_title
if percent_difference_title == "expensiver":
items.append('+' + percent_difference.span.string)
else:
items.append('-' + percent_difference.span.string)
row = [Textfilelistsplit[i]]
row.extend(items)
WriteResultsFile.writerow(row)
i+=1
YAA - Yet Another Answer.
Unlike the other answers, this treats the data as a series key-value pairs; ie: a list of dictionaries, which are then written to CSV. A list of wanted fields is provided to the csv writer (DictWriter), which discards additional information (beyond the specified fields) and blanks missing information. Also, should the order of the information on the original page change, this solution is unaffected.
I also assume you are going to open the CSV file in something like Excel. Additional parameters need to be given to the csv writer for this to happen nicely (see dialect parameter). Given that we are not sanitising the returned data, we should explicitly delimit it with unconditional quoting (see quoting parameter).
import csv
import requests
from bs4 import BeautifulSoup
#Read text file
with open("City.txt") as cities_h:
cities = cities_h.readlines()
home_city = "Phoenix"
city_data = []
for city in cities:
url = "http://www.expatistan.com/cost-of-living/comparison/%s/%s" % (home_city, city)
resp = requests.get(url)
soup = BeautifulSoup(resp.content, from_encoding = resp.encoding)
titles = soup.select("table.comparison tr.expandable")
if titles:
data = {}
for title in titles:
name = title.find("th", class_ = "clickable")
diff = title.find("th", class_ = "percent")
exp = bool(diff.find("span", class_ = "expensiver"))
data[name.text] = ("+" if exp else "-") + diff.span.text
data["City"] = soup.find("strong", class_ = "city-2").text
city_data.append(data)
with open("Expatistan.csv","w") as csv_h:
fields = \
[
"City",
"Food",
"Housing",
"Clothes",
"Transportation",
"Personal Care",
"Entertainment"
]
#Prepare CSV writer.
writer = csv.DictWriter\
(
csv_h,
fields,
quoting = csv.QUOTE_ALL,
extrasaction = "ignore",
dialect = "excel",
lineterminator = "\n",
)
writer.writeheader()
writer.writerows(city_data)