Python multiprocessing, functions with arguments

I have a program that simulates an entire baseball season, but it does a lot of calculations per game, so each game takes around 30 seconds to run. With 2430 games in a season, the program takes about 20 hours per season. Obviously I'd like to speed this up, and the most immediate solution seems to be multiprocessing. I could manually split the games into groups of ~600 and run four processes, but I'd like to figure out how the multiprocessing module works.
Here's what I've tried so far, but obviously it doesn't work.
def test_func():
    algorithm_selection = 1
    # Create sqlite database connection
    conn = sqlite3.connect('C:/F5 Prediction Engine/sqlite3/Version 2/statcast_db.db')
    c = conn.cursor()
    season = input('Year to simulate: ')
    c.execute('SELECT * FROM gamelogs_' + season)
    season_games = c.fetchall()
    game_num = 0
    for game in season_games:
        game_num = game_num + 1
        #Get away lineup in terms of MLB IDs
        away_lineup = ConvertLineup(game[105], game[108], game[111], game[114], game[117], game[120], game[123], game[126], game[129])
        #Get home lineup in terms of MLB IDs
        home_lineup = ConvertLineup(game[132], game[135], game[138], game[141], game[144], game[147], game[150], game[153], game[156])
        #Get away starting pitcher and hand in terms of MLB ID
        away_pitcher_results = GetPitcherIDandHand(game[101])
        away_pitcher_id = away_pitcher_results[0][0]
        away_pitcher_hand = away_pitcher_results[0][1]
        #Get home starting pitcher and hand in terms of MLB ID
        home_pitcher_results = GetPitcherIDandHand(game[103])
        home_pitcher_id = home_pitcher_results[0][0]
        home_pitcher_hand = home_pitcher_results[0][1]
        #Get the date of the game
        today_date = game[0]
        if algorithm_selection == 1:
            #Check if the current game has already been evaluated and entered into the database
            c.execute('SELECT * FROM pemstein_results_' + season + ' WHERE date = "' + game[0] + '" AND away_team = "' + game[3] + '" AND home_team = "' + game[6] + \
                      '" AND away_team_score = "' + game[9] + '" AND home_team_score = "' + game[10] + '"')
            check_results = c.fetchall()
            if len(check_results) == 0:
                exp_slgs = PemsteinSimulation(home_pitcher_id, away_pitcher_id, season, home_pitcher_hand, away_pitcher_hand, home_lineup, away_lineup, game[0])
                if exp_slgs[2] == 0:  #if both pitchers had at least 300 PAs to use for simulation
                    c.execute([long string to insert results into database])
                    conn.commit()
                    print('Game ' + str(game_num) + ' finished.')
                if exp_slgs[2] == 1:  #if one of the pitchers did not have enough PAs to qualify
                    c.execute([long string to insert results into database])
                    conn.commit()
                    print('Game ' + str(game_num) + ' finished.')
            if len(check_results) > 0:
                print('Game ' + str(game_num) + ' has already been evaluated.')

from multiprocessing import Process
import os

processes = []
for i in range(0, os.cpu_count()):
    print('Registering process %d' % i)
    processes.append(Process(target=test))
for process in processes:
    process.start()
for process in processes:
    process.join()
==================
Edit: new code
#Child Process
def simulate_games(games_list, counter, lock):
    while(1):
        # Create sqlite database connection
        conn = sqlite3.connect('C:/F5 Prediction Engine/sqlite3/Version 2/statcast_db.db')
        c = conn.cursor()
        #acquire the lock which grants access to the shared variable
        with lock:
            #check the termination condition
            if counter >= len(games_list):
                break
            #get the game_num and game to simulate
            game_num = counter.value
            game_to_simulate = game_list[counter.value]
            #update the counter for the next process
            counter.value += 1
        #Do simulation
        game_num = 0
        game_num = game_num + 1
        #Get away lineup in terms of MLB IDs
        away_lineup = ConvertLineup(game_to_simulate[105], game_to_simulate[108], game_to_simulate[111], game_to_simulate[114], game_to_simulate[117], game_to_simulate[120], game_to_simulate[123], game_to_simulate[126], game_to_simulate[129])
        #Get home lineup in terms of MLB IDs
        home_lineup = ConvertLineup(game_to_simulate[132], game_to_simulate[135], game_to_simulate[138], game_to_simulate[141], game_to_simulate[144], game_to_simulate[147], game_to_simulate[150], game_to_simulate[153], game_to_simulate[156])
        #Get away starting pitcher and hand in terms of MLB ID
        away_pitcher_results = GetPitcherIDandHand(game[101])
        away_pitcher_id = away_pitcher_results[0][0]
        away_pitcher_hand = away_pitcher_results[0][1]
        #Get home starting pitcher and hand in terms of MLB ID
        home_pitcher_results = GetPitcherIDandHand(game[103])
        home_pitcher_id = home_pitcher_results[0][0]
        home_pitcher_hand = home_pitcher_results[0][1]
        #Get the date of the game
        today_date = game_to_simulate[0]
        if algorithm_selection == 1:
            #Check if the current game has already been evaluated and entered into the database
            c.execute('SELECT * FROM pemstein_results_' + season + ' WHERE date = "' + game_to_simulate[0] + '" AND away_team = "' + game_to_simulate[3] + '" AND home_team = "' + game_to_simulate[6] + \
                      '" AND away_team_score = "' + game_to_simulate[9] + '" AND home_team_score = "' + game_to_simulate[10] + '"')
            check_results = c.fetchall()
            if len(check_results) == 0:
                exp_slgs = PemsteinSimulation(home_pitcher_id, away_pitcher_id, season, home_pitcher_hand, away_pitcher_hand, home_lineup, away_lineup, game_to_simulate[0])
                if exp_slgs[2] == 0:  #if both pitchers had at least 300 PAs to use for simulation
                    c.execute('long sql')
                    conn.commit()
                    print('Game ' + str(game_num) + ' finished.')
                if exp_slgs[2] == 1:  #if one of the pitchers did not have enough PAs to qualify
                    c.execute('long sql')
                    conn.commit()
                    print('Game ' + str(game_num) + ' finished.')
            if len(check_results) > 0:
                print('Game ' + str(game_num) + ' has already been evaluated.')

if __name__ == "__main__":
    # Create sqlite database connection
    conn = sqlite3.connect('C:/F5 Prediction Engine/sqlite3/Version 2/statcast_db.db')
    c = conn.cursor()
    #Query all games for season to be simulated
    season = int(input('Year to simulate: '))
    c.execute('SELECT * FROM gamelogs_' + str(season))
    season_games = c.fetchall()
    algorithmSelection = 1
    if algorithmSelection == 1:
        PemsteinSQLresults(str(season))
    counter = mp.Value('i', 0)
    lock = mp.Lock()
    children = []
    for i in range(os.cpu_count()):
        children.append(mp.Process(target=simulate_games, args=(season_games, counter, lock)))
    for child in children:
        child.start()
    for child in children:
        child.join()
Error:
Traceback (most recent call last):
  File "C:\F5 Prediction Engine\Version 2\SimulateSeason v2.py", line 126, in <module>
    child.start()
  File "C:\Python\lib\multiprocessing\process.py", line 105, in start
    self._popen = self._Popen(self)
  File "C:\Python\lib\multiprocessing\context.py", line 223, in _Popen
    return _default_context.get_context().Process._Popen(process_obj)
  File "C:\Python\lib\multiprocessing\context.py", line 322, in _Popen
    return Popen(process_obj)
  File "C:\Python\lib\multiprocessing\popen_spawn_win32.py", line 65, in __init__
    reduction.dump(process_obj, to_child)
  File "C:\Python\lib\multiprocessing\reduction.py", line 60, in dump
    ForkingPickler(file, protocol).dump(obj)
BrokenPipeError: [Errno 32] Broken pipe
=============
So I went to this website to review some things, and tried a new script with the following code that I copied from the site:
import mp

def worker(num):
    """thread worker function"""
    print('Worker:' + num)
    return

if __name__ == '__main__':
    jobs = []
    for i in range(5):
        p = mp.Process(target=worker, args=(i,))
        jobs.append(p)
        p.start()
But it likewise doesn't do anything. The site says it should print Worker:0 Worker:1 etc, but I'm getting no prints. Is it possible there's something wrong locally on my machine?

It seems to me that you have simply spawned a new process for each CPU and had each of them run the same function you wrote at first; however, if you want to work with processes you have to adapt that function and handle process synchronization.
As an example, you could have a master process which prompts the user for the season year and fetches all the games for that year, while the child processes read from the resulting list. See the following example:
# Parent Process
import multiprocessing as mp
import os

# establish db connection [ ... ]

season = int(input("Year to simulate: "))
c.execute('SELECT * FROM gamelogs_' + str(season))
season_games = c.fetchall()

counter = mp.Value("i", 0)
lock = mp.Lock()

children = []
for i in range(os.cpu_count()):
    children.append(mp.Process(target=simulate_games, args=(season_games, counter, lock,)))
for child in children:
    child.start()
for child in children:
    child.join()

# Child Process
def simulate_games(games_list, counter, lock):
    while True:
        # acquire the lock which grants access to the shared variable
        with lock:
            # check the termination condition
            if counter.value >= len(games_list):
                break
            # get the game_num and the game to simulate
            game_num = counter.value
            game_to_simulate = games_list[counter.value]
            # update counter for the next process
            counter.value += 1
        # Do simulation here
What we have above is a parent process which basically prepares some data and creates new child processes.
The counter is implemented by means of a special class, i.e. Value, which is used for sharing scalar values among processes; Lock is basically a mutex, which we use to synchronize access to the counter variable and avoid concurrent access. Note that you could have used the Lock that is automatically created inside the counter shared variable, but I thought it would be easier to understand by separating the two.
Each child process first acquires the lock, reads the counter value and increments it, then proceeds with its normal behavior, i.e. simulating the game at that index.
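For reference, here is a minimal, self-contained sketch of the same counter-and-lock pattern. The worker body is a placeholder print rather than the actual simulation, and the process setup sits under the if __name__ == '__main__': guard, which is required on Windows because of the spawn start method:

import multiprocessing as mp
import os

def worker(items, counter, lock):
    while True:
        # only touch the shared counter while holding the lock
        with lock:
            if counter.value >= len(items):
                break
            index = counter.value
            counter.value += 1
        # do the expensive work outside the lock so workers actually run in parallel
        print('Process %d handled %r' % (os.getpid(), items[index]))

if __name__ == '__main__':
    items = ['game_%d' % i for i in range(10)]
    counter = mp.Value('i', 0)
    lock = mp.Lock()
    children = [mp.Process(target=worker, args=(items, counter, lock))
                for _ in range(os.cpu_count())]
    for child in children:
        child.start()
    for child in children:
        child.join()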

Related

QUERY_EXCEEDED_MAX_MATCHES_ALLOWED error on Kaltura API (Python)

I'm unable to generate all entries in Kaltura. An ApiException with the message "Unable to generate list. max matches value was reached" (Error: QUERY_EXCEEDED_MAX_MATCHES_ALLOWED) gets triggered.
I tried to work around this issue by setting my sessionPrivileges to disableentitlement:
class class_chk_integrity():
    client = None
    pagesize = 0

    def __init__(self, worker_num, progress):
        self.pagesize = 30
        self.worker_num = worker_num
        self.progress = progress
        config = KalturaConfiguration(2723521)
        config.serviceUrl = "https://www.kaltura.com/"
        self.client = KalturaClient(config)
        ks = self.client.session.start("KALTURA_ADMIN_SECRET",
                                       "email#email.com",
                                       KalturaPluginsCore.KalturaSessionType.ADMIN,
                                       "KALTURA_PARTNER_ID",
                                       432000,
                                       "disableentitlement")
        self.client.setKs(ks)
I also tried to filter based on the IDs; however, I can't get filter.idNotIn to work properly.
def get_total_reg(self, cont, lastEntryIds, lastEntryCreatedAt):
    filter = KalturaPluginsCore.KalturaBaseEntryFilter()
    if lastEntryIds != "":
        filter.idNotIn = lastEntryIds
    filter.orderBy = KalturaBaseEntryOrderBy.CREATED_AT_DESC
    pager = KalturaPluginsCore.KalturaFilterPager()
    pageIndex = 1
    entriesGot = 0
    pager.pageSize = self.pagesize
    pager.setPageIndex = pageIndex
    result = self.client.baseEntry.list(filter, pager)
    totalCount = result.totalCount
    if totalCount > 10000:
        totalCount = 9970
    if totalCount <= 0:
        cont = False
    while entriesGot < totalCount:
        pager.pageSize = self.pagesize
        pageIndex += 1
        pager.pageIndex = pageIndex
        result = self.client.baseEntry.list(filter, pager)
        entriesGot += len(result.objects)
        for e in result.objects:
            if lastEntryIds == "":
                lastEntryIds.append(e.id)
            else:
                lastEntryIds.append(e.id)
            lastEntryCreatedAt = e.createdAt
    return result.totalCount, self.pagesize, cont, lastEntryIds, lastEntryCreatedAt
This is how I'm calling the functions:
if __name__ == '__main__':
    try:
        log = _ServiceUtils.log()
        log.setup('all', 'integrity')
        cont = True
        lastEntryIds = []
        lastEntryCreatedAt = 0
        while cont is True:
            kmc = class_chk_integrity(0, 0)
            kmc_total_reg, kmc_page_size, cont, lastEntryIds, lastEntryCreatedAt = kmc.get_total_reg(cont, lastEntryIds, lastEntryCreatedAt)
            interval = 10
            max_threads = math.ceil(kmc_total_reg / (interval * kmc_page_size))
            # max_threads = 1
            threads_list = []
            print('TOTAL REG : %s | PAGE_SIZE : %s | INTERVAL : %s | THREADS : %s' % (kmc_total_reg, kmc_page_size, interval, max_threads))
            progress = class_progress_thread(max_threads)
            for index in range(0, max_threads):
                page_ini = index * interval
                page_end = index * interval + interval
                progress.add_worker_progress(index, datetime.now())
                threads_list.append(threading.Thread(target=thread_chk_integrity, args=(index, log, index * interval + 1, index * interval + interval, progress)))
            threads_list.append(threading.Thread(target=thread_output_progress, args=(progress, max_threads)))
            for thread in threads_list:
                thread.start()
            for thread in threads_list:
                thread.join()
            while not progress.stop():
                time.sleep(30)
    except KeyboardInterrupt:
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)
I'd appreciate any help with this.
Thank you for your attention.
if totalCount > 10000:
    totalCount = 9970
I'm curious to know why you are changing the totalCount this way.
Short answer - paging works as long as the result set is up to 10K.
To work around that, sort the result by creation date (as you did), and when you get to 10K, start a new search where the created_at date in the filter is the last value you got from the previous search. Reset your paging, of course.
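A rough sketch of that workaround, reusing the names from the code above (KalturaPluginsCore, KalturaBaseEntryOrderBy, client.baseEntry.list) and assuming the filter exposes the standard createdAtLessThanOrEqual property; fetch_all_entries is a hypothetical helper, not part of the Kaltura client:

def fetch_all_entries(client, page_size=500):
    entries = []
    seen_ids = set()
    last_created_at = None
    while True:
        flt = KalturaPluginsCore.KalturaBaseEntryFilter()
        flt.orderBy = KalturaBaseEntryOrderBy.CREATED_AT_DESC
        if last_created_at is not None:
            # start the next search window where the previous one ended
            flt.createdAtLessThanOrEqual = last_created_at
        pager = KalturaPluginsCore.KalturaFilterPager()
        pager.pageSize = page_size
        pager.pageIndex = 1        # reset paging for every new window
        got_new = False
        while pager.pageIndex * page_size <= 10000:
            result = client.baseEntry.list(flt, pager)
            if not result.objects:
                break
            for e in result.objects:
                if e.id not in seen_ids:    # entries sharing a created_at may repeat
                    seen_ids.add(e.id)
                    entries.append(e)
                    got_new = True
                last_created_at = e.createdAt
            pager.pageIndex += 1
        if not got_new:
            return entries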

How to get big amount of data as fast as possible

I am trying to return an array of constructed objects that are built on top of objects I retrieve from one URL, plus other fields that I get from another URL.
I have an array that consists of two arrays, each of which has about 8000 objects...
I have tried to run each object construction in its own thread, but it still takes a lot of time...
Any solution? Here is my code:
def get_all_players_full_data(ea_players_json):
    all = []
    ea_players_json = list(ea_players_json.values())
    for i in range(len(ea_players_json)):
        for player_obj in ea_players_json[i]:
            all.append(player_obj)
    for player_obj in range(len(all)):
        all_data = []
        with concurrent.futures.ThreadPoolExecutor(len(all)) as executor:
            for player_data in all:
                future = executor.submit(build_full_player_data_obj, player_data)
                print(future.result())
                all_data.append(future.result())

def build_full_player_data_obj(ea_player_data):
    if ea_player_data.get("c") is not None:
        player_full_name = ea_player_data.get("c")
    else:
        player_full_name = ea_player_data.get("f") + " " + ea_player_data.get("l")
    player_id = ea_player_data.get("id")
    # go to futhead to find all cards of that player
    futhead_url_player_data = f'{FUTHEAD_PLAYER}{player_full_name}'
    details_of_specific_player = json.loads(requests.get(futhead_url_player_data).content)
    cards_from_the_same_id = []
    for player_in_json_futhead in details_of_specific_player:
        if player_in_json_futhead["player_id"] == player_id:
            rating = player_in_json_futhead["rating"]
            specific_card_id = player_in_json_futhead["def_id"]
            revision = player_in_json_futhead["revision_type"]
            name = player_in_json_futhead["full_name"]
            nation = player_in_json_futhead["nation_name"]
            position = player_in_json_futhead["position"]
            club = player_in_json_futhead["club_name"]
            cards_from_the_same_id.append(Player(specific_card_id, name, rating, revision, nation,
                                                 position, club))
    return cards_from_the_same_id
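One thing worth noting in the loop above: calling future.result() immediately after executor.submit() blocks until that request finishes, so the downloads effectively run one at a time. A sketch, assuming build_full_player_data_obj and the input structure shown above, that submits everything first and collects results as they complete, with a bounded pool size instead of len(all):

import concurrent.futures

def get_all_players_full_data(ea_players_json):
    # flatten the two inner lists into one list of player dicts
    players = [p for group in ea_players_json.values() for p in group]

    all_data = []
    # a bounded pool; thousands of threads mostly add overhead for HTTP calls
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        futures = [executor.submit(build_full_player_data_obj, p) for p in players]
        for future in concurrent.futures.as_completed(futures):
            all_data.append(future.result())
    return all_data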

Python schedule to run a function at specific day between time periods

I've got a function that accesses an API to check for train data at specific times. This is actually run 3 times for each journey, so I'd need to run each of the 3 at specific times.
I've tried using the schedule module to get this going but I can't seem to get it working. Here's my current code:
schedule.every().day.at("07:30").every(5).minutes.do(darwinChecker(train_station['home_station'], train_station['connect_station'], user_time['morning_time']))
But I get an AttributeError: 'Job' object has no attribute 'every'. The documentation states this happens if your code imports the wrong schedule module, but I've no other files under that name.
How would I go about running my function, say, every Friday from 07:30 till 08:40, every 5 minutes?
Edit: As per request, added my full code for what I'm trying to do:
import requests
import re
import schedule
import time
from darwin_token import DARWIN_KEY

jsonToken = DARWIN_KEY
train_station = {'work_station': 'bat', 'home_station': 'man', 'connect_station': 'wds'}
user_time = {'morning_time': ['0821', '0853'], 'evening_time': ['1733'], 'connect_time': ['0834', '0843']}

def darwinChecker(departure_station, arrival_station, user_time):
    response = requests.get("https://huxley.apphb.com/all/" + str(departure_station) + "/to/" + str(arrival_station) + "/" + str(user_time), params={"accessToken": jsonToken})
    response.raise_for_status()    # this makes an error if something failed
    data1 = response.json()
    train_service = data1["trainServices"]
    print('Departure Station: ' + str(data1.get('crs')))
    print('Arrival Station: ' + str(data1.get('filtercrs')))
    print('-' * 40)
    try:
        found_service = 0    # keeps track of services so note is generated if service not in user_time
        for index, service in enumerate(train_service):
            if service['sta'].replace(':', '') in user_time:    # replaces sta time with values in user_time
                found_service += 1    # increments for each service in user_time
                print('Service RSID: ' + str(train_service[index]['rsid']))
                print('Scheduled arrival time: ' + str(train_service[index]['sta']))
                print('Scheduled departure time: ' + str(train_service[index]['std']))
                print('Status: ' + str(train_service[index]['eta']))
                print('-' * 40)
                if service['eta'] == 'Cancelled':
                    # print('The ' + str(train_service[index]['sta']) + ' service is cancelled.')
                    print('Previous train departure time: ' + str(train_service[index - 1]['sta']))
                    print('Previous train status: ' + str(train_service[index - 1]['eta']))
        if found_service == 0:    # if no service is found
            print('The services currently available are not specified in user_time.')
    except TypeError:
        print('There is no train service data')
    try:
        # print('\nNRCC Messages: ' + str(data1['nrccMessages'][0]['value']))
        NRCCRegex = re.compile('^(.*?)[\.!\?](?:\s|$)')    # regex pulls all characters until hitting a . or ! or ?
        myline = NRCCRegex.search(data1['nrccMessages'][0]['value'])    # regex searches through nrccMessages
        print('\nNRCC Messages: ' + myline.group(1))    # prints parsed NRCC message
    except (TypeError, AttributeError) as error:    # tuple catches multiple errors, AttributeError for None value
        print('There is no NRCC data currently available\n')

print('Morning Journey'.center(50, '='))
darwinChecker(train_station['home_station'], train_station['connect_station'], user_time['morning_time'])
# schedule.every().day.at("21:50").do()
# schedule.every(2).seconds.do(darwinChecker, train_station['home_station'], train_station['connect_station'], user_time['morning_time'])
schedule.every().day.at("07:30").every(5).minutes.do(darwinChecker, train_station['home_station'], train_station['connect_station'], user_time['morning_time'])

while True:
    schedule.run_pending()
    time.sleep(1)

# print('Connection Journey'.center(50, '='))
# darwinChecker(train_station['connect_station'], train_station['work_station'], user_time['connect_time'])
# print('Evening Journey'.center(50, '='))
# darwinChecker(train_station['work_station'], train_station['home_station'], user_time['evening_time'])
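One way to express the "every Friday, 07:30 to 08:40, every 5 minutes" requirement with the schedule module is to schedule the job every 5 minutes and let a small wrapper check the weekday and time window before calling darwinChecker. This is only a sketch, and morning_check is a hypothetical wrapper around the function above:

import datetime
import schedule
import time

def morning_check():
    now = datetime.datetime.now()
    # weekday() == 4 is Friday; only run inside the 07:30-08:40 window
    if now.weekday() == 4 and datetime.time(7, 30) <= now.time() <= datetime.time(8, 40):
        darwinChecker(train_station['home_station'],
                      train_station['connect_station'],
                      user_time['morning_time'])

schedule.every(5).minutes.do(morning_check)

while True:
    schedule.run_pending()
    time.sleep(1)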

Can search for a record in a shapefile but how to get other fields

I can search a shapefile for an attribute and it works fine, but I don't know how to get the other fields in that record once the correct record is found. I don't know whether I should use SearchCursor or SelectLayerByAttribute_management.
townlands = r'F:\MyProject\Assignment\townlands.shp'
outpath = r'F:\MyProject\Assignment'
the_townland=str(text_search_townland.get())
selection = str(""" "NAME_TAG" = '""" + the_townland + "'")
selection2 = ????????????????
print selection, selection2
This code is working in that it finds the townland that the user puts in text_search_townland and it prints it as selection. I'm looking to get another field called OSM_USER from that record into selection2.
I got this working after lots of trial and error. It does need SearchCursor, or at least that is how I got it working.
def new_record():
    # set environment variables.
    arcpy.env.workspace = r'F:\MyProject\Assignment\folklore.gdb'
    myPath = r'F:\MyProject\Assignment\folklore.gdb'
    editRows = arcpy.da.InsertCursor('folklore', '*')
    print editRows.fields
    # get the centroid of the townland from townland_centroid (fc) based on the
    # townland the user enters.
    database = r'F:\MyProject\Assignment\folklore.gdb'
    fc = database + '/' + 'townland_centroid'
    the_townland = str(text_search_townland.get())
    fields = ['NAME_TAG', 'X_coord', 'Y_coord']
    whereClause = '"NAME_TAG"' + " = '" + the_townland + "'"
    with arcpy.da.SearchCursor(fc, fields, whereClause) as cursor:
        for row in cursor:
            print('{0}, {1}, {2}'.format(row[0], row[1], row[2]))
            X_coord = str(row[1])
            Y_coord = str(row[2])
    del cursor
    # Set variables with values that will populate 'folklore' featureclass.
    OID = 1
    ptShape = arcpy.Point(0, 0)
    townland = text_search_townland.get()
    county = var_county2.get()
    category = var_category.get()
    URL = text_search_URL.get()
    spec_location = "text_search_speclocation.get()"
    date_entered = text_search_date_entered.get()
    story_year = int(text_search_story_year.get())
    X_coord_put = X_coord
    Y_coord_put = Y_coord
    newRecord = [OID, ptShape, townland, county, URL, spec_location, date_entered, story_year, category, X_coord, Y_coord]
    editRows.insertRow(newRecord)
    del editRows
Hope this helps someone.
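For the original question as asked (getting OSM_USER from the matched record into selection2), the same arcpy.da.SearchCursor pattern applies directly. A sketch reusing the names from the question, where text_search_townland is the GUI input from the original code:

import arcpy

townlands = r'F:\MyProject\Assignment\townlands.shp'
the_townland = str(text_search_townland.get())
where_clause = """ "NAME_TAG" = '""" + the_townland + "'"

selection2 = None
# read NAME_TAG and OSM_USER from the matching record(s)
with arcpy.da.SearchCursor(townlands, ['NAME_TAG', 'OSM_USER'], where_clause) as cursor:
    for row in cursor:
        selection2 = row[1]    # OSM_USER value for this townland

print(selection2)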

Python script to build least-cost paths between several polygons: How to speed up it?

I created a Python program which uses the ArcGIS "CostPath" function to automatically build least-cost paths (LCPs) between several polygons contained in the shapefile "selected_patches.shp". My program seems to work, but it is much too slow: I must build 275493 LCPs. Unfortunately, I don't know how to speed it up (I am a beginner in Python and ArcGIS). Or is there another way to calculate least-cost paths between several polygons quickly with ArcGIS (I use ArcGIS 10.1)? Here is my code:
# Import system modules
import arcpy
from arcpy import env
from arcpy.sa import *
arcpy.CheckOutExtension("Spatial")

# Overwrite outputs
arcpy.env.overwriteOutput = True

# Set the workspace
arcpy.env.workspace = "C:\Users\LCP"

# Set the extent environment
arcpy.env.extent = "costs.tif"

rowsInPatches_start = arcpy.SearchCursor("selected_patches.shp")
for rowStart in rowsInPatches_start:
    ID_patch_start = rowStart.getValue("GRIDCODE")
    expressionForSelectInPatches_start = "GRIDCODE=%s" % (ID_patch_start)  ## Define SQL expression for the function Select Layer By Attribute

    # Process: Select Layer By Attribute in Patches_start
    arcpy.MakeFeatureLayer_management("selected_patches.shp", "Selected_patch_start", expressionForSelectInPatches_start)

    # Process: Cost Distance
    outCostDist = CostDistance("Selected_patch_start", "costs.tif", "", "outCostLink.tif")

    # Save the output
    outCostDist.save("outCostDist.tif")

    rowsInSelectedPatches_end = arcpy.SearchCursor("selected_patches.shp")
    for rowEnd in rowsInSelectedPatches_end:
        ID_patch_end = rowEnd.getValue("GRIDCODE")
        expressionForSelectInPatches_end = "GRIDCODE=%s" % (ID_patch_end)  ## Define SQL expression for the function Select Layer By Attribute

        # Process: Select Layer By Attribute in Patches_end
        arcpy.MakeFeatureLayer_management("selected_patches.shp", "Selected_patch_end", expressionForSelectInPatches_end)

        # Process: Cost Path
        outCostPath = CostPath("Selected_patch_end", "outCostDist.tif", "outCostLink.tif", "EACH_ZONE", "FID")

        # Save the output
        outCostPath.save('P_' + str(int(ID_patch_start)) + '_' + str(int(ID_patch_end)) + ".tif")

        # Writing in file .txt
        outfile = open('P_' + str(int(ID_patch_start)) + '_' + str(int(ID_patch_end)) + ".txt", "w")
        rowsTxt = arcpy.SearchCursor('P_' + str(int(ID_patch_start)) + '_' + str(int(ID_patch_end)) + ".tif")
        for rowTxt in rowsTxt:
            value = rowTxt.getValue("Value")
            count = rowTxt.getValue("Count")
            pathcost = rowTxt.getValue("PATHCOST")
            startrow = rowTxt.getValue("STARTROW")
            startcol = rowTxt.getValue("STARTCOL")
            print value, count, pathcost, startrow, startcol
            outfile.write(str(value) + " " + str(count) + " " + str(pathcost) + " " + str(startrow) + " " + str(startcol) + "\n")
        outfile.close()
Thanks very much for your help.
The time it takes to write to disk, compared with the time spent calculating your cost, can be a bottleneck; consider adding a thread to handle all of your writes.
This:
for rowTxt in rowsTxt:
    value = rowTxt.getValue("Value")
    count = rowTxt.getValue("Count")
    pathcost = rowTxt.getValue("PATHCOST")
    startrow = rowTxt.getValue("STARTROW")
    startcol = rowTxt.getValue("STARTCOL")
    print value, count, pathcost, startrow, startcol
    outfile.write(str(value) + " " + str(count) + " " + str(pathcost) + " " + str(startrow) + " " + str(startcol) + "\n")
Can be converted into a thread function by making rowsTxt a global variable, and having your thread write to disk from rowsTxt.
After you complete all of your processing, you can set an additional global boolean so that the thread function ends once everything has been written, and then you can close the thread.
Example thread function I currently use:
import threading

class ThreadExample:
    def __init__(self):
        self.receiveThread = None
        self.stopEvent = threading.Event()

    def startRXThread(self):
        self.stopEvent.clear()
        self.receiveThread = threading.Thread(target=self.receive)
        self.receiveThread.start()

    def stopRXThread(self):
        if self.receiveThread is not None:
            self.stopEvent.set()        # ask the thread to finish
            self.receiveThread.join()
            self.receiveThread = None

    def receive(self):
        while not self.stopEvent.is_set():
            # do stuff for the life of the thread
            # in my case, I listen on a socket for data
            # and write it out
            pass
So for your case, you could add a class variable to the thread class
self.rowsTxt
and then update receive() to check self.rowsTxt: if it is not empty, handle it as you do in the code snippet I took from you above. After you handle it, set self.rowsTxt back to None. Your main function can update the thread's self.rowsTxt as it gets each rowsTxt. Consider using a buffer such as a list for self.rowsTxt so you don't miss writing anything.
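A queue-based variant of the same idea, as a sketch: queue.Queue replaces the shared self.rowsTxt attribute, so the writer thread blocks on the queue instead of polling it, and a None sentinel tells it to stop (the combined output file name here is only illustrative):

import threading
try:
    import queue            # Python 3
except ImportError:
    import Queue as queue   # Python 2, which ArcGIS 10.1 uses

write_queue = queue.Queue()

def writer(outfile_path):
    # background thread: drain the queue to disk until the sentinel arrives
    with open(outfile_path, "w") as outfile:
        while True:
            line = write_queue.get()
            if line is None:    # sentinel: the main loop is done producing
                break
            outfile.write(line)

writer_thread = threading.Thread(target=writer, args=("all_paths.txt",))
writer_thread.start()

# in the main processing loop, instead of outfile.write(...):
#     write_queue.put(str(value) + " " + str(count) + " " + str(pathcost) + " "
#                     + str(startrow) + " " + str(startcol) + "\n")

# once all CostPath processing is finished:
write_queue.put(None)
writer_thread.join()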
The most immediate change you can make to significantly improve speed would be to switch to data access cursors (e.g. arcpy.da.SearchCursor()). To illustrate, I ran a benchmark test a while back to see how the data access cursors perform compared to the old cursors.
The attached figure shows the results of a benchmark test on the new da method UpdateCursor versus the old UpdateCursor method. Essentially, the benchmark test performs the following workflow:
1. Create random points (10, 100, 1000, 10000, 100000)
2. Randomly sample from a normal distribution and add the value to a new column in the random points attribute table with a cursor
3. Run 5 iterations of each random point scenario for both the new and old UpdateCursor methods and write the mean values to lists
4. Plot the results
import arcpy, os, numpy, time
arcpy.env.overwriteOutput = True

outws = r'C:\temp'
fc = os.path.join(outws, 'randomPoints.shp')

iterations = [10, 100, 1000, 10000, 100000]

old = []
new = []
meanOld = []
meanNew = []

for x in iterations:
    arcpy.CreateRandomPoints_management(outws, 'randomPoints', '', '', x)
    arcpy.AddField_management(fc, 'randFloat', 'FLOAT')

    for y in range(5):

        # Old method ArcGIS 10.0 and earlier
        start = time.clock()
        rows = arcpy.UpdateCursor(fc)
        for row in rows:
            # generate random float from normal distribution
            s = float(numpy.random.normal(100, 10, 1))
            row.randFloat = s
            rows.updateRow(row)
        del row, rows
        end = time.clock()
        total = end - start
        old.append(total)
        del start, end, total

        # New method 10.1 and later
        start = time.clock()
        with arcpy.da.UpdateCursor(fc, ['randFloat']) as cursor:
            for row in cursor:
                # generate random float from normal distribution
                s = float(numpy.random.normal(100, 10, 1))
                row[0] = s
                cursor.updateRow(row)
        end = time.clock()
        total = end - start
        new.append(total)
        del start, end, total

    meanOld.append(round(numpy.mean(old), 4))
    meanNew.append(round(numpy.mean(new), 4))

#######################
# plot the results
import matplotlib.pyplot as plt
plt.plot(iterations, meanNew, label='New (da)')
plt.plot(iterations, meanOld, label='Old')
plt.title('arcpy.da.UpdateCursor -vs- arcpy.UpdateCursor')
plt.xlabel('Random Points')
plt.ylabel('Time (minutes)')
plt.legend(loc=2)
plt.show()
