SimPy: Call center simulation - inactive call timeout - Python

Currently I am working on a more or less complex call center simulation. I am quite new to SimPy and have a problem with timing out calls when no agent can answer them.
In my simulation I generate calls in 4 different queues. Each of them should have its own call spawn rate.
Additionally, I have 4 active agents. Each of them works on multiple queues, but none of them works on all 4. And each of them has a separate handling time distribution when working in a certain queue.
Furthermore, an available agent should not just take the first available call in a queue; I want to add more detailed logic about which call to pick later. Thus, I add spawning calls to a store (later a filter store) and pull them out in the consume_calls function.
As the call center could operate in an understaffed situation (too few agents), I want to model customer patience as well. So I defined MIN_PATIENCE and MAX_PATIENCE and let customers quit the call if no agent can take it within this time range.
My script looks something like this.
Code updated (2023-02-02): Seems to work now
from typing import Callable, Generator, List, Union
import numpy as np
import pandas as pd
import simpy
RANDOM_SEED = 42
NUM_AGENTS = 4 # Number of agents in the callcenter
NUM_QUEUES = 2
MIN_PATIENCE = 2 * 60
MAX_PATIENCE = 5 * 60
SIM_DURATION = 8 * 60 * 60
RNG = np.random.default_rng(RANDOM_SEED)
i = 0
# Parse config files for agent performance and queue arrival times
agents_config = pd.read_csv("queue_agent_mapping.csv")
agents_config["lambda"] = agents_config["lambda"] * 60
agents_config_idx = agents_config.set_index(["agent_id", "queue_id"])
queue_config = pd.read_csv("call_freq.csv")
queue_config["lambda"] = queue_config["lambda"] * 60
queue_config_idx = queue_config.set_index("queue_id")
callcenter_logging = pd.DataFrame(
    {
        "call_id": [],
        "queue_id": [],
        "received_time": [],
        "agent_id": [],
        "start_time": [],
        "end_time": [],
        "status": [],
    }
)
callcenter_logging = callcenter_logging.set_index("call_id")
open_transactions = pd.DataFrame({"call_id": [], "queue_id": [], "received_time": []})
open_transactions = open_transactions.set_index("call_id")
# Get number of agents in simulation from config file
num_agents = agents_config.drop_duplicates(subset="agent_id")
# Get agent-queue mapping from config file
agents_config_grouped = (
    agents_config.groupby("agent_id").agg({"queue_id": lambda x: list(x)}).reset_index()
)
class Callcenter:
    """Representation of the call center.

    Entry point of the simulation as it starts processes to generate calls and their processing.
    """

    # Variable used to generate unique ids
    call_id = 0

    def __init__(self, env: simpy.Environment):
        self.env = env

    def get_next_call_id(self):
        self.call_id += 1
        return self.call_id

    def run_simulation(self, agents: simpy.FilterStore):
        self.agents = agents
        self.queues = [Queue(env, queue_id=qq) for qq in range(4)]
        self.call_generator = [env.process(queue.generate_calls(self)) for queue in self.queues]
        self.call_accept_consumer = [env.process(queue.consume_calls()) for queue in self.queues]
class Agent:
    """Representation of agents and their global attributes."""

    def __init__(self, agent_id, queue_id):
        self.agent_id = agent_id
        self.allowed_queue_ids = queue_id
class Queue:
    """Representation of call center queues.

    Holds methods to generate and consume calls. Calls are stored in a store.
    """

    def __init__(self, env, queue_id):
        self.env = env
        # Defines the store for calls. Later, a FilterStore should be used. This will enable us to
        # draw calls by special attribute to fine tune the routing.
        self.store = simpy.FilterStore(env)
        self.queue_id = queue_id
        # Get the arrival distribution of calls in the queue from the initial config.
        self.lam = self._get_customer_arrival_distribution()

    def generate_calls(self, callcenter_instance: Callcenter) -> Generator:
        """Generate the calls.

        Calls are then put to the queue's store.
        """
        while True:
            yield self.env.timeout(RNG.poisson(self.lam))
            # Initialize and fill the call object.
            new_call_id = callcenter_instance.get_next_call_id()
            call = Call(queue_id=self.queue_id, call_id=new_call_id, env=env)
            call = call.update_history()
            call.add_open_transaction(status="active")
            # Write call to logging table
            callcenter_logging.loc[call.call_id, ["queue_id", "received_time"]] = [
                call.queue_id,
                self.env.now,
            ]
            # Put call to the queue store.
            self.put_call(call)
    def consume_calls(self) -> Generator:
        """Draw call from queue store and let agents work on them.

        If no agent is found within MIN_PATIENCE and MAX_PATIENCE, drop the call.
        """
        while True:
            # Wait for available agent or drop the call as customer ran out of patience.
            agent = yield agents.get(lambda ag: self.queue_id in ag.allowed_queue_ids)
            # call = yield self.get_call(lambda ca: ca.status == "active")
            call = yield self.get_call(lambda ca: ca.status == "active")
            if call.received_at + call.max_waiting <= call.env.now:
                print(
                    f"customer hung up call {call.call_id} after waiting {call.max_waiting / 60} minutes."
                )
                # The call did not receive an agent in time and ran out of patience.
                call.status = "dropped"
                callcenter_logging.loc[call.call_id, ["end_time", "status"]] = [
                    call.env.now,
                    call.status,
                ]
                print(
                    f"no agent for {call.call_id} after waiting {(call.env.now - call.received_at) / 60} minutes"
                )
            else:
                # Do some logging that an agent took the call.
                print(
                    f"Agent {agent.agent_id} takes {call.call_id} in queue {call.queue_id} at {call.env.now}."
                )
                callcenter_logging.loc[call.call_id, ["queue_id", "agent_id", "start_time"]] = [
                    call.queue_id,
                    agent.agent_id,
                    call.env.now,
                ]
                # Get average handling time from config dataframe. Change this to an Agent class
                # attribute, later.
                ag_lambda = agents_config_idx.loc[agent.agent_id, call.queue_id]["lambda"]
                yield call.env.timeout(RNG.poisson(ag_lambda))  # Let the agent work on the call.
                call.status = "finished"
                # Do some logging
                callcenter_logging.loc[call.call_id, ["end_time", "status"]] = [
                    call.env.now,
                    call.status,
                ]
                print(
                    f"Agent {agent.agent_id} finishes {call.call_id} in queue {call.queue_id} at {call.env.now}."
                )
            # Put the agent back to the agents store. -> Why is this needed? Shouldn't the
            # context manager handle this? But it did not work without this line.
            yield agents.put(agent)
    def _get_customer_arrival_distribution(self) -> float:
        """Returns the mean call arrival time in a given queue_id."""
        return queue_config.loc[self.queue_id, "lambda"]

    def get_call(self, filter_func: Callable):
        """Helper function to get calls by complex filters from the queue store."""
        return self.store.get(filter=filter_func)

    def put_call(self, call):
        """Helper function to put calls to the queue store."""
        self.store.put(call)
        print(f"Call {call.call_id} added to queue {call.queue_id} at {call.env.now}")
class Call(Callcenter):
    """Representation of a call with all its attributes."""

    def __init__(self, env: simpy.Environment, call_id: int, queue_id: int):
        self.env = env
        self.call_id = call_id
        self.queue_id = queue_id
        self.agent_id = None
        self.status = "active"
        self.received_at = env.now
        self.max_waiting = RNG.integers(MIN_PATIENCE, MAX_PATIENCE)
        self.history: List[Union[int, str]] = []

    def update_history(self):
        """Helper to update the call history.

        Needed for tracking calls which are transferred from one queue to another.
        Currently this is not in use.
        """
        self.history.append([self.call_id, self.queue_id, self.agent_id, self.status])
        return self

    def add_open_transaction(self, status="active"):
        """Helper to add a call to the open transactions log.

        Needed for rebalancing logics. Currently this is not in use.
        """
        open_transactions.loc[self.call_id, ["queue_id", "received_time", "status"]] = [
            self.queue_id,
            self.env.now,
            status,
        ]
env = simpy.Environment()
agents = simpy.FilterStore(env, capacity=len(num_agents))
agents.items = [Agent(row.agent_id, row.queue_id) for row in agents_config_grouped.itertuples()]
callcenter = Callcenter(env)
callcenter.run_simulation(agents)
env.run(until=SIM_DURATION)
These are the config files.
# call_freq.csv
queue_name,queue_id,lambda
A,0,2
B,1,4
C,2,3.5
D,3,3
# queue_agent_mapping.csv
queue_id,agent_id,lambda
0,abc11,2
0,abc13,5
0,abc14,2
1,abc11,5
1,abc12,3
1,abc14,12
2,abc12,2
2,abc13,3
3,abc14,3
I set the customer patience to be within 3 to 6 minutes. Nonetheless, if I check the waiting time of calls until they are served, I find a lot of them waiting much longer than 3-6 minutes. To make things even more confusing, I also find some calls being dropped as intended.
callcenter_logging["wait_time"] = callcenter_logging["start_time"] - callcenter_logging["received_time"]
callcenter_logging["serving_wait"] = callcenter_logging["end_time"] - callcenter_logging["start_time"]
callcenter_logging.head(20)
dropped = (
callcenter_logging.loc[callcenter_logging["status2"] == "dropped", ["queue_id", "status2", "end_time"]]
)
dropped = (
dropped.set_index("end_time")
.groupby(["queue_id"])
.expanding()["status2"]
.agg({"cnt_dropped": 'count'})
.reset_index()
)
fig_waiting = px.line(callcenter_logging, "received_time", "wait_time", color="queue_id")
fig_waiting = fig_waiting.update_traces(connectgaps=True)
fig_waiting.show()
fig_dropped = px.line(dropped, x="end_time", y="cnt_dropped", color="queue_id")
fig_dropped.show()
[waiting time plot]
[dropped calls plot]
I guess my calls are not pulled from the queue store at the right times, but I was not able to pin down the exact problem in the code.
Does anybody have an idea?

Your patience timeout does not factor in how long a call has already been in the queue. So if a call has been in the queue for 10 minutes when the loop starts and no agents are available, then the total wait time for the call when the patience timeout happens will be 10 minutes plus env.timeout(RNG.integers(MIN_PATIENCE, MAX_PATIENCE)).
You need the patience timeout to live in the call object, and have the call object remove itself from the queue or set a patience-expired flag when the patience timeout expires.
I would change your loop to first get an agent, and then do an inner loop to get a call, ignoring calls with the expired flag.
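For illustration, here is a minimal self-contained sketch of that idea (not your code: a single queue, two placeholder agents and made-up handling times; the patience timeout lives in the call and only sets a flag, and the consumer discards flagged calls):

import numpy as np
import simpy

RNG = np.random.default_rng(42)
MIN_PATIENCE, MAX_PATIENCE = 2 * 60, 5 * 60


class Call:
    def __init__(self, env, call_id):
        self.env = env
        self.call_id = call_id
        self.status = "active"
        env.process(self.patience())         # the call owns its own patience timeout

    def patience(self):
        yield self.env.timeout(RNG.integers(MIN_PATIENCE, MAX_PATIENCE))
        if self.status == "active":          # nobody picked it up in time
            self.status = "expired"
            print(f"{self.env.now}: call {self.call_id} hung up")


def generate_calls(env, store):
    for call_id in range(50):
        yield env.timeout(RNG.poisson(120))
        store.put(Call(env, call_id))


def consume_calls(env, store, agents):
    while True:
        agent = yield agents.get()           # first get an agent ...
        while True:                          # ... then loop until a live call is found
            call = yield store.get(lambda ca: True)
            if call.status != "expired":
                break
            print(f"{env.now}: dropping expired call {call.call_id}")
        call.status = "in_service"           # keeps the patience flag from firing later
        yield env.timeout(RNG.poisson(600))  # placeholder handling time
        print(f"{env.now}: agent {agent} finished call {call.call_id}")
        yield agents.put(agent)


env = simpy.Environment()
calls = simpy.FilterStore(env)
agents = simpy.Store(env)
agents.items = ["abc11", "abc12"]            # two placeholder agents
env.process(generate_calls(env, calls))
env.process(consume_calls(env, calls, agents))
env.run(until=8 * 60 * 60)

In your code the same pattern would keep the call's received_at/max_waiting bookkeeping, but the flag is set by the call's own process rather than being checked only when an agent finally pulls the call.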


Multithread or multiprocess

So, currently, I am using multiprocessing to run these 3 functions together.
As only the tokens value changes between them, is it recommended to switch to multi-threading? (If yes, will it really help with performance, e.g. a speed-up? I assume memory usage would certainly be lower.)
This is my code:
from database_function import *
from kiteconnect import KiteTicker
import pandas as pd
from datetime import datetime, timedelta
import schedule
import time
from multiprocessing import Process

def tick_A():
    # credentials code here
    tokens = [x[0] for x in db_fetchquery("SELECT zerodha FROM script ORDER BY id ASC LIMIT 50")]  # FETCHING FIRST 50 SCRIPTS TOKEN
    # print(tokens)

    ##### TO MAKE SURE THE TASK STARTS AFTER 8:59 ONLY ###########
    t = datetime.today()
    future = datetime(t.year, t.month, t.day, 8, 59)
    if ((future - t).total_seconds()) < 0:
        future = datetime(t.year, t.month, t.day, t.hour, t.minute, (t.second + 2))
    time.sleep((future - t).total_seconds())
    ##### TO MAKE SURE THE TASK STARTS AFTER 8:59 ONLY ###########

    def on_ticks(ws, ticks):
        global ltp
        ltp = ticks[0]["last_price"]
        for tick in ticks:
            print(f"{tick['instrument_token']}A")
            db_runquery(f'UPDATE SCRIPT SET ltp = {tick["last_price"]} WHERE zerodha = {tick["instrument_token"]}')  # UPDATING LTP IN DATABASE
            # print(f"{tick['last_price']}")

    def on_connect(ws, response):
        # print(f"response from connect :: {response}")
        # Subscribe to a list of instrument_tokens (TOKENS FETCHED ABOVE WILL BE SUBSCRIBED HERE).
        # logging.debug("on connect: {}".format(response))
        ws.subscribe(tokens)
        ws.set_mode(ws.MODE_LTP, tokens)  # SETTING TOKEN TO TICK MODE (LTP / FULL / QUOTE)

    kws.on_ticks = on_ticks
    kws.on_connect = on_connect
    kws.connect(threaded=True)

    ##### TO STOP THE TASK AFTER 15:32 #######
    end_time = datetime(t.year, t.month, t.day, 15, 32)
    while True:
        schedule.run_pending()
        # time.sleep(1)
        if datetime.now() > end_time:
            break
    ##### TO STOP THE TASK AFTER 15:32 #######

def tick_B():
    # everything remains the same, only the tokens value changes
    tokens = [x[0] for x in db_fetchquery("SELECT zerodha FROM script ORDER BY id ASC OFFSET (50) ROWS FETCH NEXT (50) ROWS ONLY")]

def tick_C():
    # everything remains the same, only the tokens value changes
    tokens = [x[0] for x in db_fetchquery("SELECT zerodha FROM script ORDER BY id ASC OFFSET (100) ROWS FETCH NEXT (50) ROWS ONLY")]

if __name__ == '__main__':
    def runInParallel(*fns):
        proc = []
        for fn in fns:
            p = Process(target=fn)
            p.start()
            proc.append(p)
        for p in proc:
            p.join()

    runInParallel(tick_A, tick_B, tick_C)
Most Python implementations do not have true multi-threading, because they use a global lock (the GIL), so only one thread runs Python code at a time.
For I/O-heavy applications it should not make a difference. But if you need CPU-heavy operations done in parallel (and I see that you use pandas, so the answer is most likely yes), you are better off staying with a multi-process app.
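As a rough illustration (a sketch, not your exact setup: it assumes tick_A, tick_B and tick_C are defined as above), concurrent.futures lets you switch between the two models with one argument, so you can simply benchmark both:

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

def run_in_parallel(*fns, use_processes=True):
    # Processes sidestep the GIL (better for CPU-bound work such as pandas);
    # threads share memory and are fine when the work is mostly waiting on I/O.
    executor_cls = ProcessPoolExecutor if use_processes else ThreadPoolExecutor
    with executor_cls(max_workers=len(fns)) as ex:
        futures = [ex.submit(fn) for fn in fns]
        for fut in futures:
            fut.result()  # re-raises any exception from a worker

if __name__ == '__main__':
    run_in_parallel(tick_A, tick_B, tick_C, use_processes=True)

With processes the GIL stays out of the way; with threads you only win if the functions spend most of their time waiting on the websocket or the database.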

How to feed the data obtained from rospy.Subscriber into a variable?

I have written a sample subscriber. I want to feed the data that I have obtained from the rospy.Subscriber into another variable, so that I can use it later in the program for processing. At the moment I can see that the subscriber is functioning, as I can see the subscribed values being printed when I use the rospy.loginfo() function, although I do not know how to store this data into another variable. I have tried assigning it directly to a variable by using the assignment operator '=', but I get an error.
I have tried writing a callback function with rospy.loginfo to print the position data from the subscribed object. I have subscribed to JointState and it contains header, position, velocity and effort arrays. Using rospy.loginfo I can verify that the subscriber is subscribing. But when I try to assign it directly to a variable, I get an error.
I am displaying loginfo from a callback function as follows
def callback(data):
    rospy.loginfo(data.position)

global listen
listen = rospy.Subscriber("joint_states", JointState,
                          callback)
rospy.spin()
and this works fine. But when I slightly modify the code to assign the subscribed values, I get the following error:
listen1 = rospy.Subscriber("joint_states", JointState,
                           callback=None)
listen = listen1.position
# rospy.loginfo(listen)
print(listen)
rospy.spin()
The error is as follows,
listen = listen1.position
AttributeError: 'Subscriber' object has no attribute 'position'
EDIT:
Here is the node I have defined in my program,
def tactile_callback(data):
    # rospy.loginfo(data.data)
    global tactile_states
    tactile_states = data.data

def joint_callback(data):
    # rospy.loginfo(data.position)
    global g_joint_states
    global g_position
    global g_pos1
    g_joint_states = data
    # for i in len(data.position):
    #     g_position[i] = data.position[i]
    g_position = data.position
    if len(data.position) > 0:
        print("jointstate more than 0")
        g_pos1 = data.position[0]
    # print(g_position)

def joint_modifier(*args):
    # choice describes what the node is supposed to do: whether it acts as a publisher,
    # subscribes to joint states, or subscribes to tactile sensors
    rospy.init_node('joint_listener_publisher', anonymous=True)
    pub1 = rospy.Publisher('joint_states', JointState, queue_size=10)
    if len(args) > 1:
        choice = args[0]
        joint_name = args[1]
        position = args[2]
    else:
        choice = args[0]
    if choice == 1:
        rate = rospy.Rate(1)
        robot_configuration = JointState()
        robot_configuration.header = Header()
        robot_configuration.name = [joint_name]
        robot_configuration.position = [position]
        robot_configuration.velocity = [10]
        robot_configuration.effort = [100]
        while not rospy.is_shutdown():
            robot_configuration.header.stamp = rospy.Time.now()
            rospy.loginfo(robot_configuration)
            break
        pub1.publish(robot_configuration)
        rospy.sleep(2)
    if choice == 2:
        # rospy.Timer(rospy.Duration(2), joint_modifier)
        listen = rospy.Subscriber("joint_states", JointState, joint_callback)
        rospy.spin()
    if choice == 3:
        # rospy.Timer(rospy.Duration(2), joint_modifier)
        tactile_sub = rospy.Subscriber("/sr_tactile/touch/ff", Float64, tactile_callback)
        rospy.spin()
This is how I am calling the node inside the main body of the program,
joint_modifier(2)
print("printing g_position")
print(g_position)#to check the format of g_position
print("printed g _position")
leg_1 = Leg_attribute(g_position[0], g_position[1], g_position[2], velocity1 = 10, velocity2 = 10, velocity3 = 10, effort1 = 100, effort2 = 100, effort3 = 100, acceleration=1)
When calling it this way, the program gets stuck at joint_modifier(2), as that function has rospy.spin().
The style you're using is not very standard. I assume you've seen the example on the ROS wiki; I've modified it to demonstrate standard usage below.
Chiefly, addressing the code you posted, you needed to give listen global scope outside of the callback. It is there to store the data you want, not the Subscriber object. rospy.spin() never goes in a callback, only in the main node function/section. The Subscriber object, listen1, which is used infrequently, doesn't return anything and doesn't store the data it acquires. That is, you need Subscriber() to have a non-None callback.
It's more of a bind: the data is handed to the callback instead of being returned from Subscriber. That's why listen1 (a Subscriber) has no attribute position (a JointState field).
import rospy
from sensor_msgs.msg import JointState

# Subscribers
#     joint_sub (sensor_msgs/JointState): "joint_states"

# This is where you store all the data you receive
g_joint_states = None
g_positions = None
g_pos1 = None

def timer_callback(event):  # event is of type rospy.TimerEvent
    print('timer_cb (' + str(event.current_real) + '): g_positions is')
    print(str(None) if g_positions is None else str(g_positions))

def joint_callback(data):  # data of type JointState
    # Each subscriber gets 1 callback, and the callback either
    # stores information and/or computes something and/or publishes
    # It _does not!_ return anything
    global g_joint_states, g_positions, g_pos1
    rospy.loginfo(data.position)
    g_joint_states = data
    g_positions = data.position
    if len(data.position) > 0:
        g_pos1 = data.position[0]
    print(g_positions)

# In your main function, only! here do you subscribe to topics
def joint_logger_node():
    # Init ROS
    rospy.init_node('joint_logger_node', anonymous=True)

    # Subscribers
    # Each subscriber has the topic, topic type, AND the callback!
    rospy.Subscriber('joint_states', JointState, joint_callback)
    # Rarely need to hold onto the object with a variable:
    #     joint_sub = rospy.Subscriber(...)
    rospy.Timer(rospy.Duration(2), timer_callback)

    # spin() simply keeps python from exiting until this node is stopped
    # This is an infinite loop; the only code that gets run is the callbacks
    rospy.spin()
    # NO CODE GOES AFTER THIS, NONE! USE TIMER CALLBACKS!
    # unless you need to clean up resource allocation, close(), etc. when the program dies

if __name__ == '__main__':
    joint_logger_node()
Edit 1:
There seems to be some confusion about what Subscriber(), spin(), and the callback(s) do.
It's a bit obscured in the Python, but there is a master program that manages all nodes and the messages sent between them. In each node, we register with that master program that the node exists, and what publishers and subscribers it has. By register, I mean we tell the master program, "Hey, I want that topic!"; in your case, for your (undeclared) joint_sub Subscriber, "Hey, I want all the JointState msgs from the joint_states topic!" The master program will, every time it gets (from some publisher somewhere) a new joint_states JointState msg, send it to that subscriber.
The subscriber handles, deals with, and processes the msg (data) with a callback: when(!) I receive a message, run the callback.
So the master program receives a new joint_states JointState msg from some publisher. Then it, because we registered a subscriber to it, sends it to this node. rospy.spin() is an infinite loop waiting for that data. This is what it does (kinda-mostly):
def rospy.spin():
    while rospy.ok():
        for new_msg in get_new_messages from master():
            if I have a subscriber to new_msg:
                my_subscriber.callback(new_msg)
rospy.spin() is where your callback, joint_callback (and/or timer_callback, etc) actually get called, and executed. It only runs when there is data for it.
More fundamentally, I think because of this confusion, your program structure is flawed; your functions don't do what you think they do. This is how you should make your node.
Make your math-portion (all the real non-ros code), the one doing the NN, into a separate module, and make a function to run it.
If you only want to run it when you receive data, run it in the callback. If you want to publish the result, publish in the callback.
Don't call the main function! The if __name__ == '__main__': my_main_function() should be the only place it gets called, and this will call your code. I repeat: the main function, declaring subscribers/publishers/init/timers/parameters, is only run in if __name__ ..., and this function runs your code. To have it run your code, place your code in a callback. Timer callbacks are handy for this.
I hope this code sample clarifies:
import rospy
from std_msgs.msg import Header
from sensor_msgs.msg import JointState
import my_nn as nn  # nn.run(data)

# Subscribers
#     joint_sub (sensor_msgs/JointState): "joint_states"
# Publishers
#     joint_pub (sensor_msgs/JointState): "target_joint_states"
joint_pub = None

def joint_callback(data):  # data of type JointState
    pub_msg = JointState()  # Make a new msg to publish results
    pub_msg.header = Header()
    pub_msg.name = data.name
    pub_msg.velocity = [10] * len(data.name)
    pub_msg.effort = [100] * len(data.name)
    # This next line might not be quite right for what you want to do,
    # But basically, run the "real code" on the data, and get the
    # result to publish back out
    pub_msg.position = nn.run(data.position)  # Run NN on data, store results
    joint_pub.publish(pub_msg)  # Send it when ready!

if __name__ == '__main__':
    # Init ROS
    rospy.init_node('joint_logger_node', anonymous=True)
    # Subscribers
    rospy.Subscriber('joint_states', JointState, joint_callback)
    # Publishers
    joint_pub = rospy.Publisher('target_joint_states', JointState, queue_size=10)
    # Spin
    rospy.spin()
    # No more code! This is not a function to call, but its
    # own program! This is an executable! Run your code in
    # a callback!
Notice that a Python module we design to be a ROS node has no functions to be called. It has a defined structure of callbacks and global data shared between them, all initialized and registered in the main function / if __name__ == '__main__'.

Multiprocessing function not writing to file or printing

I'm working on a Raspberry Pi (3 B+) making a data collection device and I'm
trying to spawn a process to record the data coming in and write it to a file. I have a function for the writing that works fine when I call it directly.
When I call it using the multiprocess approach however, nothing seems to happen. I can see in task monitors in Linux that the process does in fact get spawned but no file gets written, and when I try to pass a flag to it to shut down it doesn't work, meaning I end up terminating the process and nothing seems to have happened.
I've been over this every which way and can't see what I'm doing wrong; does anyone else? In case it's relevant, these are functions inside a parent class, and one of the functions is meant to spawn another as a thread.
Code I'm using:
from datetime import datetime, timedelta
import csv
from drivers.IMU_SEN0 import IMU_SEN0
import multiprocessing, os

class IMU_data_logger:
    _output_filename = ''
    _csv_headers = []
    _accelerometer_headers = ['Accelerometer X', 'Accelerometer Y', 'Accelerometer Z']
    _gyroscope_headers = ['Gyroscope X', 'Gyroscope Y', 'Gyroscope Z']
    _magnetometer_headers = ['Bearing']
    _log_accelerometer = False
    _log_gyroscope = False
    _log_magnetometer = False
    IMU = None
    _writer = []
    _run_underway = False
    _process = []
    _stop_value = 0

    def __init__(self, output_filename='/home/pi/blah.csv', log_accelerometer=True, log_gyroscope=True, log_magnetometer=True):
        """data logging device
        NOTE! Multiple instances of this class should not use the same IMU devices simultaneously!"""
        self._output_filename = output_filename
        self._log_accelerometer = log_accelerometer
        self._log_gyroscope = log_gyroscope
        self._log_magnetometer = log_magnetometer

    def __del__(self):
        # TODO Update this
        if self._run_underway:  # If there's still a run underway, end it first
            self.end_recording()

    def _set_up(self):
        self.IMU = IMU_SEN0(self._log_accelerometer, self._log_gyroscope, self._log_magnetometer)
        self._set_up_headers()

    def _set_up_headers(self):
        """Set up the headers of the CSV file based on the header substrings at top and the input flags on what will be measured"""
        self._csv_headers = []
        if self._log_accelerometer is not None:
            self._csv_headers += self._accelerometer_headers
        if self._log_gyroscope is not None:
            self._csv_headers += self._gyroscope_headers
        if self._log_magnetometer is not None:
            self._csv_headers += self._magnetometer_headers

    def _record_data(self, frequency, stop_value):
        """Record data function, which takes a recording frequency, in hertz, as an input"""
        self._set_up()  # Run setup in the spawned process
        previous_read_time = datetime.now() - timedelta(1, 0, 0)
        self._run_underway = True  # Note that a run is now going
        Period = 1 / frequency  # Period, in seconds, of a recording based on the input frequency
        print("Writing output data to", self._output_filename)
        with open(self._output_filename, 'w', newline='') as outcsv:
            self._writer = csv.writer(outcsv)
            self._writer.writerow(self._csv_headers)  # Write headers to file
            while stop_value.value == 0:  # While a run continues
                if datetime.now() - previous_read_time >= timedelta(0, 1, 0):  # Print a heartbeat roughly once a second
                    print("run underway value", self._run_underway)
                if datetime.now() - previous_read_time >= timedelta(0, Period, 0):  # If we've waited a period, collect the data; otherwise keep looping
                    previous_read_time = datetime.now()  # Update previous readtime
                    next_row = []
                    if self._log_accelerometer:
                        # Get values in m/s^2
                        axes = self.IMU.read_accelerometer_values()
                        next_row += [axes['x'], axes['y'], axes['z']]
                    if self._log_gyroscope:
                        # Read gyro values
                        gyro = self.IMU.read_gyroscope_values()
                        next_row += [gyro['x'], gyro['y'], gyro['z']]
                    if self._log_magnetometer:
                        # Read magnetometer value
                        b = self.IMU.read_magnetometer_bearing()
                        next_row += b
                    self._writer.writerow(next_row)
        # Close the csv when done
        outcsv.close()

    def start_recording(self, frequency_in_hz):
        # Create recording process
        self._stop_value = multiprocessing.Value('i', 0)
        self._process = multiprocessing.Process(target=self._record_data, args=(frequency_in_hz, self._stop_value))
        # Start recording process
        self._process.start()
        print(datetime.now().strftime("%H:%M:%S.%f"), "Data logging process spawned")
        print("Logging Accelerometer:", self._log_accelerometer)
        print("Logging Gyroscope:", self._log_gyroscope)
        print("Logging Magnetometer:", self._log_magnetometer)
        print("ID of data logging process: {}".format(self._process.pid))

    def end_recording(self, terminate_wait=2):
        """Function to end the recording process that's been spawned.
        Args: terminate_wait: This is the time, in seconds, to wait after attempting to shut down the process before terminating it."""
        # Get process id
        id = self._process.pid
        # Set stop event for process
        self._stop_value.value = 1
        self._process.join(terminate_wait)  # Wait two seconds for the process to terminate
        if self._process.is_alive():  # If it's still alive after waiting
            self._process.terminate()
            print(datetime.now().strftime("%H:%M:%S.%f"), "Process", id, "needed to be terminated.")
        else:
            print(datetime.now().strftime("%H:%M:%S.%f"), "Process", id, "successfully ended itself.")
====================================================================
ANSWER: For anyone following up here, it turns out the problem was my use of the VS Code debugger, which apparently doesn't work with multiprocessing and was somehow preventing the spawned process from succeeding. Many thanks to Tomasz Swider below for helping me work through the issues and, eventually, find my mistake. The help was very deeply appreciated!!
I can see a few things wrong in your code:
First, stop_value == 0 will not work, as multiprocessing.Value('i', 0) != 0; change that line to
while stop_value.value == 0:
Second, you never update previous_read_time, so it will write the readings as fast as it can and you will run out of disk space quickly.
Third, use time.sleep(); what you are doing is called busy looping, and it is bad because it wastes CPU cycles needlessly.
Fourth, terminating with self._stop_value = 1 probably will not work; there must be another way to set that value, perhaps self._stop_value.value = 1.
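A stripped-down sketch of those last three points together (record is a hypothetical stand-in for your _record_data; the sensor reads and CSV writing are elided):

import multiprocessing
import time

def record(period_s, stop_value):
    while stop_value.value == 0:       # shared flag, read in the child process
        # ... read the sensors and write one CSV row here ...
        time.sleep(period_s)           # sleep for the sampling period instead of busy looping

if __name__ == '__main__':
    stop = multiprocessing.Value('i', 0)
    p = multiprocessing.Process(target=record, args=(1 / 20, stop))
    p.start()
    time.sleep(5)
    stop.value = 1                     # not stop = 1, which only rebinds the local name
    p.join()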
Well, here is a piece of example code based on the code you provided that works just fine:
import csv
import multiprocessing
import time
from datetime import datetime, timedelta
from random import randint

class IMU(object):
    @staticmethod
    def read_accelerometer_values():
        return dict(x=randint(0, 100), y=randint(0, 100), z=randint(0, 10))

class Foo(object):
    def __init__(self, output_filename):
        self._output_filename = output_filename
        self._csv_headers = ['xxxx', 'y', 'z']
        self._log_accelerometer = True
        self.IMU = IMU()

    def _record_data(self, frequency, stop_value):
        # self._set_up()  # Run setup functions for the data collection device and store it in the self.IMU variable
        """Record data function, which takes a recording frequency, in hertz, as an input"""
        previous_read_time = datetime.now() - timedelta(1, 0, 0)
        self._run_underway = True  # Note that a run is now going
        Period = 1 / frequency  # Period, in seconds, of a recording based on the input frequency
        print("Writing output data to", self._output_filename)
        with open(self._output_filename, 'w', newline='') as outcsv:
            self._writer = csv.writer(outcsv)
            self._writer.writerow(self._csv_headers)  # Write headers to file
            while stop_value.value == 0:  # While a run continues
                if datetime.now() - previous_read_time >= timedelta(0, 1, 0):  # Print a heartbeat roughly once a second
                    print("run underway value", self._run_underway)
                if datetime.now() - previous_read_time >= timedelta(0, Period, 0):  # If we've waited a period, collect the data; otherwise keep looping
                    next_row = []
                    if self._log_accelerometer:
                        # Get values in m/s^2
                        axes = self.IMU.read_accelerometer_values()
                        next_row += [axes['x'], axes['y'], axes['z']]
                    previous_read_time = datetime.now()
                    self._writer.writerow(next_row)
        # Close the csv when done
        outcsv.close()

    def start_recording(self, frequency_in_hz):
        # Create recording process
        self._stop_value = multiprocessing.Value('i', 0)
        self._process = multiprocessing.Process(target=self._record_data, args=(frequency_in_hz, self._stop_value))
        # Start recording process
        self._process.start()
        print(datetime.now().strftime("%H:%M:%S.%f"), "Data logging process spawned")
        print("ID of data logging process: {}".format(self._process.pid))

    def end_recording(self, terminate_wait=2):
        """Function to end the recording process that's been spawned.
        Args: terminate_wait: This is the time, in seconds, to wait after attempting to shut down the process before terminating it."""
        # Get process id
        id = self._process.pid
        # Set stop event for process
        self._stop_value.value = 1
        self._process.join(terminate_wait)  # Wait two seconds for the process to terminate
        if self._process.is_alive():  # If it's still alive after waiting
            self._process.terminate()
            print(datetime.now().strftime("%H:%M:%S.%f"), "Process", id, "needed to be terminated.")
        else:
            print(datetime.now().strftime("%H:%M:%S.%f"), "Process", id, "successfully ended itself.")

if __name__ == '__main__':
    foo = Foo('/tmp/foometer.csv')
    foo.start_recording(20)
    time.sleep(5)
    print('Ending recording')
    foo.end_recording()

Tornado Server using most of the cpu while using tornado-sockjs and only two clients.

I am using Tornado 4.4.2 with PyPy 5.9.0 and Python 2.7.13, hosted on Ubuntu 16.04.3 LTS.
When a new client logs in, a new class is created and passed the socket, so a dialog can be maintained. I am using a global clients[] list to contain the classes. The initial dialog looks like:
clients = []

class RegisterWebSocket(SockJSConnection):
    # initialize the class and handle on-open (some things left out)
    def on_open(self, info):
        self.ipaddress = info.headers['X-Real-Ip']

    def on_message(self, data):
        coinlist = []
        msg = json.loads(data)
        if 'coinlist' in msg:
            coinlist = msg['coinlist']
        if 'currency' in msg:
            currency = msg['currency']
        tz = pendulum.timezone('America/New_York')
        started = pendulum.now(tz).to_day_datetime_string()
        ws = WebClientUpdater(self, self.clientid, coinlist, currency,
                              started, self.ipaddress)
        clients.append(ws)
The ws class is shown below, and I use a Tornado PeriodicCallback to update the clients with their specific info every 20 seconds.
class WebClientUpdater(SockJSConnection):
    def __init__(self, ws, id, clist, currency, started, ipaddress):
        super(WebClientUpdater, self).__init__(ws.session)
        self.ws = ws
        self.id = id
        self.coinlist = clist
        self.currency = currency
        self.started = started
        self.ipaddress = ipaddress
        self.location = loc
        self.loop = tornado.ioloop.PeriodicCallback(self.updateCoinList,
                                                    20000, io_loop=tornado.ioloop.IOLoop.instance())
        self.loop.start()
        self.send_msg('welcome ' + id)

    def updateCoinList(self):
        pdata = db.getPricesOfCoinsInCurrency(self.coinlist, self.currency)
        self.send(dict(priceforcoins=pdata))

    def send_msg(self, msg):
        self.send(msg)
I also start a 60-second PeriodicCallback at startup to monitor the clients for closed connections and remove them from the clients[] list, which I put on the startup line to call a def internally, like
if __name__ == "__main__":
app = make_app()
app.listen(options.port)
ScheduleSocketCleaning()
and
def ScheduleSocketCleaning():
def cleanSocketHouse():
print "checking sockets"
for x in clients:
if x.is_closed:
x = None
clients[:] = [y for y in clients if not y.is_closed ]
loop = tornado.ioloop.PeriodicCallback(cleanSocketHouse, 60000,
io_loop=tornado.ioloop.IOLoop.instance())
loop.start()
If I monitor the server using top, I see that it typically uses 4% CPU with immediate bursts to 60+%, but later, say after a few hours, it sits in the 90% range and stays there.
I have used strace and I see an enormous number of stat calls on the same files, with errors shown in the strace -c view, but I cannot find any errors in a text file using -o trace.log. How can I find those errors?
But I also notice that most of the time is consumed in epoll_wait.
% time     seconds  usecs/call     calls    errors syscall
 41.61    0.068097           7      9484           epoll_wait
 26.65    0.043617           0    906154      2410 stat
 15.77    0.025811           0    524072           read
 10.90    0.017840         129       138           brk
  2.41    0.003937           9       417           madvise
  2.04    0.003340           0    524072           lseek
  0.56    0.000923           3       298           sendto
  0.06    0.000098           0     23779           gettimeofday
100.00    0.163663               1989527      2410 total
Notice 2410 errors above.
When I view the strace output stream using an attached pid, I just see endless stat calls on the same files.
Can someone advise me as to how to better debug this situation? With only two clients and 20 seconds between client updates, I would expect the CPU usage (there are no other users of the site during this prototype stage) would be less than 1% or thereabouts.
You need to stop PeriodicCallbacks, otherwise it's a memory leak. You do that by simply calling .stop() on the PeriodicCallback object. One way to deal with that is in your periodic cleaning task:
def cleanSocketHouse():
    global clients
    new_clients = []
    for client in clients:
        if client.is_closed:
            # I don't know why you call it loop,
            # .timer would be more appropriate
            client.loop.stop()
        else:
            new_clients.append(client)
    clients = new_clients
I'm not sure how accurate .is_closed is (some testing is required). The other way is to alter updateCoinList. The .send() method should fail when the client is no longer connected, right? Therefore try: except: should do the trick:
def updateCoinList(self):
    global clients
    pdata = db.getPricesOfCoinsInCurrency(self.coinlist, self.currency)
    try:
        self.send(dict(priceforcoins=pdata))
    except Exception:
        # log exception?
        self.loop.stop()
        clients.remove(self)  # you should probably use set instead of list
If .send() actually doesn't fail (for whatever reason; I'm not that familiar with Tornado), then stick to the first solution.

Watchdog compatibility: A workaround for "CancelIoEx"

Using the Python watchdog file system event watching library, I noticed that when it is used under Windows Server 2003 it enters "polling mode", thus it stops using asynchronous OS notifications and, therefore, heavily reduces system performance under a large number of file changes.
I traced the problem to the watchdog/observers/winapi.py file, where the CancelIoEx system call is used in order to break the ReadDirectoryChangesW call lock when the user wants to stop monitoring the watched directory or file:
(winapi.py)
CancelIoEx = ctypes.windll.kernel32.CancelIoEx
CancelIoEx.restype = ctypes.wintypes.BOOL
CancelIoEx.errcheck = _errcheck_bool
CancelIoEx.argtypes = (
    ctypes.wintypes.HANDLE,  # hObject
    ctypes.POINTER(OVERLAPPED)  # lpOverlapped
)

...

def close_directory_handle(handle):
    try:
        CancelIoEx(handle, None)  # force ReadDirectoryChangesW to return
    except WindowsError:
        return
The problem with the CancelIoEx call is that it is not available until Windows Server 2008:
http://msdn.microsoft.com/en-us/library/windows/desktop/aa363792(v=vs.85).aspx
One possible alternative is to change close_directory_handle in order to make it create a mock file within the monitored directory, thus unlocking the thread waiting for ReadDirectoryChangesW to return.
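A rough sketch of that alternative (hypothetical: handle2path stands in for however winapi.py maps handles back to paths, and CloseHandle for its existing handle cleanup):

import os
import tempfile

def close_directory_handle(handle):
    # ReadDirectoryChangesW returns as soon as anything changes in the watched
    # directory, so creating and deleting a throwaway file unblocks the waiting
    # thread without needing CancelIoEx.
    watched_path = handle2path[handle]          # assumed handle -> path mapping
    fd, tmp_name = tempfile.mkstemp(dir=watched_path, prefix=".unblock-")
    os.close(fd)
    os.remove(tmp_name)
    CloseHandle(handle)                         # assumed winapi helper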
However, I noticed that CancelIo system call is in fact available in Windows Server 2003:
Cancels all pending input and output (I/O) operations that are issued
by the calling thread for the specified file. The function does not
cancel I/O operations that other threads issue for a file handle. To
cancel I/O operations from another thread, use the CancelIoEx
function.
But calling CancelIo won't affect the waiting thread.
Do you have any idea on how to solve this problem?
Maybe threading.enumerate() could be used to issue a signal to be handled by each thread, with CancelIo called from those handlers?
The natural approach is to implement a completion routine and call ReadDirectoryChangesW using its overlapped mode. The following example shows the way to do that:
RDCW_CALLBACK_F = ctypes.WINFUNCTYPE(None, ctypes.wintypes.DWORD, ctypes.wintypes.DWORD, ctypes.POINTER(OVERLAPPED))
First, create a WINFUNCTYPE factory which will be used to generate (callable from the Windows API) C-like functions from Python methods. In this case, there is no return value and 3 parameters corresponding to the
VOID CALLBACK FileIOCompletionRoutine(
    _In_    DWORD        dwErrorCode,
    _In_    DWORD        dwNumberOfBytesTransfered,
    _Inout_ LPOVERLAPPED lpOverlapped
);
FileIOCompletionRoutine header.
The callback reference as well as the overlapped structure need to be added to ReadDirectoryChangesW arguments list:
ReadDirectoryChangesW = ctypes.windll.kernel32.ReadDirectoryChangesW
ReadDirectoryChangesW.restype = ctypes.wintypes.BOOL
ReadDirectoryChangesW.errcheck = _errcheck_bool
ReadDirectoryChangesW.argtypes = (
    ctypes.wintypes.HANDLE,  # hDirectory
    LPVOID,  # lpBuffer
    ctypes.wintypes.DWORD,  # nBufferLength
    ctypes.wintypes.BOOL,  # bWatchSubtree
    ctypes.wintypes.DWORD,  # dwNotifyFilter
    ctypes.POINTER(ctypes.wintypes.DWORD),  # lpBytesReturned
    ctypes.POINTER(OVERLAPPED),  # lpOverlapped
    RDCW_CALLBACK_F  # FileIOCompletionRoutine / lpCompletionRoutine
)
From here, we are ready to perform the overlapped system call.
This is a simple callback, just useful to test that everything works fine:
def dir_change_callback(dwErrorCode, dwNumberOfBytesTransfered, p):
    print("dir_change_callback! PID:" + str(os.getpid()))
    print("CALLBACK THREAD: " + str(threading.currentThread()))
Prepare and perform the call:
event_buffer = ctypes.create_string_buffer(BUFFER_SIZE)
nbytes = ctypes.wintypes.DWORD()
overlapped_read_dir = OVERLAPPED()
call2pass = RDCW_CALLBACK_F(dir_change_callback)
hand = get_directory_handle(os.path.abspath("/test/"))

def docall():
    ReadDirectoryChangesW(hand, ctypes.byref(event_buffer),
                          len(event_buffer), False,
                          WATCHDOG_FILE_NOTIFY_FLAGS,
                          ctypes.byref(nbytes),
                          ctypes.byref(overlapped_read_dir), call2pass)

print("Waiting!")
docall()
If you load and execute all this code in a DreamPie interactive shell, you can check that the system call is made and that the callback executes, printing the thread and pid numbers after the first change made under the c:\test directory. Besides, you will notice those are the same as the main thread and process: although the event is raised by a separate thread, the callback runs in the same process and thread as our main program, which is undesired behaviour:
lck = threading.Lock()

def dir_change_callback(dwErrorCode, dwNumberOfBytesTransfered, p):
    print("dir_change_callback! PID:" + str(os.getpid()))
    print("CALLBACK THREAD: " + str(threading.currentThread()))
    ...

lck.acquire()
print("Waiting!")
docall()
lck.acquire()
This program will lock the main thread and the callback will never execute.
I tried many synchronization tools, even Windows API semaphores, always getting the same behaviour, so finally I decided to implement the asynchronous call using the synchronous configuration for ReadDirectoryChangesW within a separate process, managed and synchronized using the multiprocessing Python library:
Calls to get_directory_handle won't return the handle number given by the Windows API but one managed by the winapi library; for that I implemented a handle generator:
class FakeHandleFactory():
    _hl = threading.Lock()
    _next = 0

    @staticmethod
    def next():
        FakeHandleFactory._hl.acquire()
        ret = FakeHandleFactory._next
        FakeHandleFactory._next += 1
        FakeHandleFactory._hl.release()
        return ret
Each generated handle has to be globally associated with a file system path:
handle2file = {}
Each call to read_directory_changes will now generate a ReadDirectoryRequest (derived from multiprocessing.Process) object:
class ReadDirectoryRequest(multiprocessing.Process):

    def _perform_and_wait4request(self, path, recursive, event_buffer, nbytes):
        hdl = CreateFileW(path, FILE_LIST_DIRECTORY, WATCHDOG_FILE_SHARE_FLAGS,
                          None, OPEN_EXISTING, WATCHDOG_FILE_FLAGS, None)
        # print("path: " + path)
        aux_buffer = ctypes.create_string_buffer(BUFFER_SIZE)
        aux_n = ctypes.wintypes.DWORD()
        # print("_perform_and_wait4request! PID:" + str(os.getpid()))
        # print("CALLBACK THREAD: " + str(threading.currentThread()) + "\n----------")
        try:
            ReadDirectoryChangesW(hdl, ctypes.byref(aux_buffer),
                                  len(event_buffer), recursive,
                                  WATCHDOG_FILE_NOTIFY_FLAGS,
                                  ctypes.byref(aux_n), None, None)
        except WindowsError as e:
            print("!" + str(e))
            if e.winerror == ERROR_OPERATION_ABORTED:
                nbytes = 0
                event_buffer = []
            else:
                nbytes = 0
                event_buffer = []
        # Python 2/3 compat
        nbytes.value = aux_n.value
        for i in xrange(self.int_class(aux_n.value)):
            event_buffer[i] = aux_buffer[i]
        CloseHandle(hdl)
        try:
            self.lck.release()
        except:
            pass

    def __init__(self, handle, recursive):
        buffer = ctypes.create_string_buffer(BUFFER_SIZE)
        self.event_buffer = multiprocessing.Array(ctypes.c_char, buffer)
        self.nbytes = multiprocessing.Value(ctypes.wintypes.DWORD, 0)
        targetPath = handle2file.get(handle, None)
        super(ReadDirectoryRequest, self).__init__(target=self._perform_and_wait4request,
                                                   args=(targetPath, recursive, self.event_buffer, self.nbytes))
        self.daemon = True
        self.lck = multiprocessing.Lock()
        self.result = None
        try:
            self.int_class = long
        except NameError:
            self.int_class = int
        if targetPath is None:
            self.result = ([], -1)

    def CancelIo(self):
        try:
            self.result = ([], 0)
            self.lck.release()
        except:
            pass

    def read_changes(self):
        # print("read_changes! PID:" + str(os.getpid()))
        # print("CALLBACK THREAD: " + str(threading.currentThread()) + "\n----------")
        if self.result is not None:
            raise Exception("ReadDirectoryRequest object can be used only once!")
        self.lck.acquire()
        self.start()
        self.lck.acquire()
        self.result = (self.event_buffer, self.int_class(self.nbytes.value))
        return self.result
This class extends Process, providing a process which performs the system call and waits until either:
A change event has been raised, or
The main thread cancels the request by calling the ReadDirectoryRequest object's CancelIo method.
Note that the roles of:
get_directory_handle
close_directory_handle
read_directory_changes
are now to manage requests. For that, thread locks and auxiliary data structures are needed:
rqIndexLck = threading.Lock() # Protects the access to `rqIndex`
rqIndex = {} # Maps handles to request objects sets.
get_directory_handle
def get_directory_handle(path):
    rqIndexLck.acquire()
    ret = FakeHandleFactory.next()
    handle2file[ret] = path
    rqIndexLck.release()
    return ret
close_directory_handle
def close_directory_handle(handle):
    rqIndexLck.acquire()
    rqset4handle = rqIndex.get(handle, None)
    if rqset4handle is not None:
        for rq in rqset4handle:
            rq.CancelIo()
        del rqIndex[handle]
    if handle in handle2file:
        del handle2file[handle]
    rqIndexLck.release()
And last but not least: read_directory_changes
def read_directory_changes(handle, recursive):
    rqIndexLck.acquire()
    rq = ReadDirectoryRequest(handle, recursive)
    set4handle = None
    if handle in rqIndex:
        set4handle = rqIndex[handle]
    else:
        set4handle = set()
        rqIndex[handle] = set4handle
    set4handle.add(rq)
    rqIndexLck.release()
    ret = rq.read_changes()
    rqIndexLck.acquire()
    if rq in set4handle:
        set4handle.remove(rq)
    rqIndexLck.release()
    return ret
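As a usage sketch (hypothetical paths and timings), the three functions above can then be driven like this: a watcher thread blocks in read_directory_changes until either a change arrives or the main thread cancels it via close_directory_handle:

import os
import threading
import time

handle = get_directory_handle(os.path.abspath("/test/"))

def watch():
    # Blocks (inside the child process doing ReadDirectoryChangesW) until a
    # change arrives or close_directory_handle() cancels the request.
    event_buffer, nbytes = read_directory_changes(handle, True)
    print("got %d bytes of change records" % nbytes)

t = threading.Thread(target=watch)
t.start()
time.sleep(10)                    # ... let it watch for a while ...
close_directory_handle(handle)    # unblocks the watcher thread
t.join()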
