I am adapting the Python script from this project (expanded below) so that it updates the elements of a JSON file instead of the InitialState streamer. However, because the script opens multiple threads, there is no clean way to write each thread's data back to the file: every thread would read, change, and write the file at the same time, and since there is only one file, no version would ever be accurate because the last thread to write would override all the others.
Question: How can I update the states in the JSON file from each thread (running simultaneously) without affecting the other threads' write operations or locking up the file?
The JSON file contains the occupants' statuses that I would like to manipulate with the Python script:
{
"janeHome": "false",
"johnHome": "false",
"jennyHome": "false",
"jamesHome": "false"
}
This is the python script:
import subprocess
import json
from time import sleep
from threading import Thread
# Edit these for how many people/devices you want to track
occupant = ["Jane","John","Jenny","James"]
# MAC addresses for our phones
address = ["11:22:33:44:55:66","77:88:99:00:11:22","33:44:55:66:77:88","99:00:11:22:33:44"]
# Sleep once right when this script is called to give the Pi enough time
# to connect to the network
sleep(60)
# Some arrays to help minimize streaming and account for devices
# disappearing from the network when asleep
firstRun = [1] * len(occupant)
presentSent = [0] * len(occupant)
notPresentSent = [0] * len(occupant)
counter = [0] * len(occupant)
# Function that checks for device presence
def whosHere(i):
# 30 second pause to allow main thread to finish arp-scan and populate output
sleep(30)
# Loop through checking for devices and counting if they're not present
while True:
# Exits thread if Keyboard Interrupt occurs
if stop == True:
print ("Exiting Thread")
exit()
else:
pass
# If a listed device address is present print
if address[i] in output:
print(occupant[i] + "'s device is connected")
if presentSent[i] == 0:
# TODO: UPDATE THIS OCCUPANT'S STATUS TO TRUE
# Reset counters so another stream isn't sent if the device
# is still present
firstRun[i] = 0
presentSent[i] = 1
notPresentSent[i] = 0
counter[i] = 0
sleep(900)
else:
# If a stream's already been sent, just wait for 15 minutes
counter[i] = 0
sleep(900)
# If a listed device address is not present, print and stream
else:
print(occupant[i] + "'s device is not connected")
# Only consider a device offline if its counter has reached 30
# This is the same as 15 minutes passing
if counter[i] == 30 or firstRun[i] == 1:
firstRun[i] = 0
if notPresentSent[i] == 0:
# TODO: UPDATE THIS OCCUPANT'S STATUS TO FALSE
# Reset counters so another stream isn't sent if the device
# is still not present
notPresentSent[i] = 1
presentSent[i] = 0
counter[i] = 0
else:
# If a stream's already been sent, wait 30 seconds
counter[i] = 0
sleep(30)
# Count how many 30 second intervals have happened since the device
# disappeared from the network
else:
counter[i] = counter[i] + 1
print(occupant[i] + "'s counter at " + str(counter[i]))
sleep(30)
# Main thread
try:
# Initialize a variable to trigger threads to exit when True
global stop
stop = False
# Start the thread(s)
# It will start as many threads as there are values in the occupant array
for i in range(len(occupant)):
t = Thread(target=whosHere, args=(i,))
t.start()
while True:
# Make output global so the threads can see it
global output
# Reads existing JSON file into buffer
with open("data.json", "r") as jsonFile:
data = json.load(jsonFile)
jsonFile.close()
# Assign list of devices on the network to "output"
output = subprocess.check_output("arp-scan --interface=en1 --localnet", shell=True).decode()  # decode bytes so the MAC address strings can be matched
temp = data["janeHome"]
data["janeHome"] = # RETURNED STATE
data["johnHome"] = # RETURNED STATE
data["jennyHome"] = # RETURNED STATE
data["jamesHome"] = # RETURNED STATE
with open("data.json", "w") as jsonFile:
json.dump(data, jsonFile)
jsonFile.close()
# Wait 30 seconds between scans
sleep(30)
except KeyboardInterrupt:
# On a keyboard interrupt signal threads to exit
stop = True
exit()
I think we can all agree that the best idea would be to return the data from each thread to the main thread and write it to the file in one place. Here is where it gets confusing: with each thread checking for a different person, how can the state be passed back to main for writing?
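One common pattern, sketched below rather than offered as a drop-in patch, is to have the worker threads hand their results back over a queue.Queue and keep all of the file I/O in a single writer thread. The report_state() helper and the updates queue are hypothetical names; the occupant list and the data.json layout are taken from the question.
import json
import queue
from threading import Thread

occupant = ["Jane", "John", "Jenny", "James"]   # same list as in the script above
updates = queue.Queue()                          # hypothetical channel back to the writer

def report_state(i, is_home):
    # Workers call this where the TODO comments sit instead of touching the file:
    # they only enqueue (occupant index, state) and never open data.json themselves.
    updates.put((i, is_home))

def writer_loop():
    # The single place data.json is read or written, so two threads can never
    # clobber each other's changes and no file lock is needed.
    while True:
        i, is_home = updates.get()               # blocks until a worker reports a change
        with open("data.json", "r") as f:
            data = json.load(f)
        data[occupant[i].lower() + "Home"] = "true" if is_home else "false"
        with open("data.json", "w") as f:
            json.dump(data, f)
        updates.task_done()

Thread(target=writer_loop, daemon=True).start()
With this shape, the main thread no longer needs to re-read and rewrite the file on every scan; each whosHere() thread just calls report_state(i, True) or report_state(i, False).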
Related
I have a script that reads data via the serial port from a development board. I want to have this script upload the data to a MongoDB collection at the end of each loop, but I don't want the loop to block because of the upload. When I try to use the multiprocessing library to do so, the loop only uploads a blank document.
client = MongoClient()
db = client['CompostMonitor-1']
def upload_to_database(data):
# Connect to the collection where the data will be stored
collection = db.RedBoard
# Insert the data into the collection
collection.insert_one(data)
port = '/dev/ttyUSB0'
filename = '~/TestData'
containernumber = 1
baud_rate = 9600
RBSerial = serial.Serial(port, baud_rate, timeout=1)
directoryBase = "{}/{}/Bucket {}/RB".format(filename, time.strftime("%m-%d-%Y"), containernumber)
pathlib.Path(directoryBase).mkdir(parents=True, exist_ok=True)
logFileRB = '{}/RB_Bucket_{}_{}_{}_log.bin'.format(directoryBase, containernumber, time.strftime("%m-%d-%Y"),
time.strftime("%H;%M;%S"))
csvRB = '{}/RB_Bucket_{}_{}_{}.csv'.format(directoryBase, containernumber, time.strftime("%m-%d-%Y"),
time.strftime("%H;%M;%S"))
startup = True
count = 0
bytearray = []
RB_DataList = []
RB_DataDict = {}
header = ['Date/Time',
'SGP TVOC (ppb)',
'BME Humidity (%)',
'BME Pressure (Pa)',
'BME Temp (Deg C)']
startTime = time.time()
p = multiprocessing.Process(target=upload_to_database, args=(RB_DataDict,))
while 1:
RB_DataDict = {'_id': ''}
RB_inbyte = RBSerial.read(size=1)
with open(logFileRB, 'ab') as l:
l.write(RB_inbyte)
bytearray.append(RB_inbyte)
if RB_inbyte == b'\n':
bytearray.pop()
with open(csvRB, 'a', newline = '') as table:
writer = csv.writer(table)
if count == 0:
writer.writerow(header)
RB_DataSplit = ''.join(str(bytearray)).replace(" ", "").replace('b', '').replace("'", '').replace(",", '').\
replace('[', '').replace(']', '').split(';')
RB_DataList.append(time.strftime("%m-%d-%Y %H:%M:%S"))
for i in range(len(RB_DataSplit)):
RB_DataList.append(RB_DataSplit[i])
print(RB_DataList)
writer.writerow(RB_DataList)
RB_DataDict = {'Date_Time': RB_DataList[0], 'TVOC Con': RB_DataList[1], 'BME Humidity': RB_DataList[2],
'BME Pressure': RB_DataList[3], 'BME Temp': RB_DataList[4]}
print(RB_DataDict)
RB_DataList = []
# upload_to_database(RB_DataDict)
if startup:
p.start()
startup = False
bytearray = []
However, if I just call upload_to_database(RB_DataDict) as in the commented line, it works as intended. I thought that starting the process would have it continually upload RB_DataDict to my Mongo database, but it appears that it just runs one time and then stops.
I haven't found any examples of code attempting to use multiprocessing in an infinite loop, so it's hard to compare my code to something that works. How can I change this code so that it uploads RB_DataDict with the multiprocessing object each time the dictionary is populated?
I found a solution to my problem. I don't really understand why this works so well, but it does:
if __name__ == '__main__':
if startup:
p.start()
startup = False
print('Startup == False')
else:
# Close the process instance and start a new one!
p.close()
p = multiprocessing.Process(target= upload_to_database, args = (RB_DataDict,))
p.start()
print('should have uploaded something here')
Just closing the original process on the second loop and starting a new one fixes the issue. I'm not sure if, in my particular case, the if __name__ == '__main__' guard is necessary, as this script isn't intended to be imported by anything else, but I just followed the lead of the multiprocessing documentation.
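For completeness, an alternative sketch (assuming pymongo's MongoClient as in the snippet above) is to keep one long-lived worker process alive and feed it documents through a multiprocessing.Queue, so the loop never has to close and recreate a Process on every iteration:
import multiprocessing

def upload_worker(q):
    # Runs in its own process: create the Mongo connection here (after the fork),
    # then pull documents off the queue and insert them until the None sentinel arrives.
    from pymongo import MongoClient
    collection = MongoClient()['CompostMonitor-1'].RedBoard
    while True:
        doc = q.get()
        if doc is None:            # sentinel tells the worker to shut down
            break
        collection.insert_one(doc)

if __name__ == '__main__':
    upload_queue = multiprocessing.Queue()
    worker = multiprocessing.Process(target=upload_worker, args=(upload_queue,))
    worker.start()

    # Inside the serial-reading loop, instead of p.close()/p.start():
    #     upload_queue.put(RB_DataDict)
    # and once, on shutdown:
    #     upload_queue.put(None)
    #     worker.join()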
I do not have much experience with threads (or networking in general). I am creating a script which receives data (1, 2, or 3) from a client. Each value has a meaning:
1 = NEW Apache Benchmark ITERATION - we must run new top command and append every second
2 = END Apache Benchmark ITERATION - we must end top command
3 = STOP ENTIRE PROGRAM
The top command on Linux just records the CPU and memory usage.
I have created an initial thread which listens for data from the client and targets the get_data() function.
The run() function waits for data to be returned from get_data(), but if no data arrives, both run() and get_data() block.
Is there a way to pause the thread targeting the get_data() function until data is sent from the client, so that the run() function doesn't halt?
My Current Code:
import socket
import sys
import threading
import subprocess
import sched
import time
import os
import signal
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server_socket = ('xxx.xxx.x.xx', 5000)
sock.bind(server_socket)
process = None
def run_command(i):
global process
print("Running top")
process = subprocess.Popen('top -b -n 1 | head -n 5 >> htop-' + str(i) + '.txt', shell=True)
print("Finished running top")
return process
def get_data():
while True:
# data = ''
data, address = sock.recvfrom(1024)
print("waiting?")
data = data.split(",")
iteration = data[1]
data = data[0]
print("Data: " + data + " and iteration: " + iteration)
time.sleep(1.0)
# run(data, iteration)
return data, iteration
'''
1 = NEW AB ITERATION - we must run new top command and append every second
2 = END OF AB - we must end top command
3 = STOP ENTIRE PROGRAM
'''
def run():
while True:
print("New")
data = get_data()
# data = data.split(",")
iteration = data[1]
data = data[0]
print("Data: " + data + " and iteration: " + iteration)
if data == '1' or data == '':
run_command(iteration)
print("We ran the command")
time.sleep(1.0)
print("Terminating")
process.kill()
print("We terminated the process")
if data == '2':
print("We got 2")
process.kill()
if data == '3':
print("Closing program")
exit()
runThread = threading.Thread(target = run)
runThread.start()
run()
print("Starting the listening thread...")
You need to make your socket non-blocking (see setblocking()) and rewrite your run() procedure to handle the case where no data has been received on the socket yet.
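A minimal sketch of that idea, assuming the same UDP socket and port as above; handle_message() is a hypothetical stand-in for the 1/2/3 dispatch in run():
import select
import socket

sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.bind(('0.0.0.0', 5000))
sock.setblocking(False)                      # recvfrom() will no longer block forever

def handle_message(command, iteration):
    # Stand-in for the 1/2/3 dispatch from run() in the question.
    print("Got command {} for iteration {}".format(command, iteration))

def run():
    while True:
        # Wait at most one second for the socket to become readable; if nothing
        # has arrived, the loop gets control back instead of halting in recvfrom().
        readable, _, _ = select.select([sock], [], [], 1.0)
        if not readable:
            continue                         # no data yet: do other work or loop again
        data, address = sock.recvfrom(1024)
        command, iteration = data.decode().split(",")
        handle_message(command, iteration)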
I have a cluster of computers which uses a master node to communicate with the slave nodes in the cluster.
The main problem I'm facing with execnet is being able to kill certain running jobs and then have new jobs requeue on the same core where the old job was just terminated (I want to utilize all cores of the slave nodes at any given time).
As of now there is no way to terminate running jobs using execnet, so I figured that if I could just kill the jobs manually through a bash script, say sudo kill 12345 where 12345 is the PID of the job (obtaining the PID of each job is another thing not supported by execnet, but that's another topic), it would terminate the job and then requeue another one on the core that was just freed. It does kill the job correctly; however, it also closes the connection to that channel (the core; the master node communicates with each core individually) and then never uses that core again until all jobs are done. Is there a way to terminate a running job without killing the connection to the core?
Here is the script to submit jobs
import execnet, os, sys
import re
import socket
import numpy as np
import pickle, cPickle
from copy import deepcopy
import time
import job
def main():
print 'execnet source files are located at:\n {}/\n'.format(
os.path.join(os.path.dirname(execnet.__file__))
)
# Generate a group of gateways.
work_dir = '/home/mpiuser/pn2/'
f = 'cluster_core_info.txt'
n_start, n_end = 250000, 250008
ci = get_cluster_info(f)
group, g_labels = make_gateway_group(ci, work_dir)
mch = group.remote_exec(job)
args = range(n_start, n_end+1) # List of parameters to compute factorial.
queue = mch.make_receive_queue(endmarker='terminate_channel')  # assumed: this line appears to be missing from the excerpt
manage_jobs(group, mch, queue, g_labels, args)
# Close the group of gateways.
group.terminate()
def get_cluster_info(f):
nodes, ncores = [], []
with open(f, 'r') as fid:
while True:
line = fid.readline()
if not line:
fid.close()
break
line = line.strip('\n').split()
nodes.append(line[0])
ncores.append(int(line[1]))
return dict( zip(nodes, ncores) )
def make_gateway_group(cluster_info, work_dir):
''' Generate gateways on all cores in remote nodes. '''
print 'Gateways generated:\n'
group = execnet.Group()
g_labels = []
nodes = list(cluster_info.keys())
for node in nodes:
for i in range(cluster_info[node]):
group.makegateway(
"ssh={0}//id={0}_{1}//chdir={2}".format(
node, i, work_dir
))
sys.stdout.write(' ')
sys.stdout.flush()
print list(group)[-1]
# Generate a string 'node-id_core-id'.
g_labels.append('{}_{}'.format(re.findall(r'\d+',node)[0], i))
print ''
return group, g_labels
def get_mch_id(g_labels, string):
ids = [x for x in re.findall(r'\d+', string)]
ids = '{}_{}'.format(*ids)
return g_labels.index(ids)
def manage_jobs(group, mch, queue, g_labels, args):
args_ref = deepcopy(args)
terminated_channels = 0
active_jobs, active_args = [], []
while True:
channel, item = queue.get()
if item == 'terminate_channel':
terminated_channels += 1
print " Gateway closed: {}".format(channel.gateway.id)
if terminated_channels == len(mch):
print "\nAll jobs done.\n"
break
continue
if item != "ready":
mch_id_completed = get_mch_id(g_labels, channel.gateway.id)
depopulate_list(active_jobs, mch_id_completed, active_args)
print " Gateway {} channel id {} returned:".format(
channel.gateway.id, mch_id_completed)
print " {}".format(item)
if not args:
print "\nNo more jobs to submit, sending termination request...\n"
mch.send_each(None)
args = 'terminate_channel'
if args and \
args != 'terminate_channel':
arg = args.pop(0)
idx = args_ref.index(arg)
channel.send(arg) # arg is copied by value to the remote side of
# channel to be executed. Maybe blocked if the
# sender queue is full.
# Get the id of current channel used to submit a job,
# this id can be used to refer mch[id] to terminate a job later.
mch_id_active = get_mch_id(g_labels, channel.gateway.id)
print "Job {}: {}! submitted to gateway {}, channel id {}".format(
idx, arg, channel.gateway.id, mch_id_active)
populate_list(active_jobs, mch_id_active,
active_args, arg)
def populate_list(jobs, job_active, args, arg_active):
jobs.append(job_active)
args.append(arg_active)
def depopulate_list(jobs, job_completed, args):
i = jobs.index(job_completed)
jobs.pop(i)
args.pop(i)
if __name__ == '__main__':
main()
and here is my job.py script:
#!/usr/bin/env python
import os, sys
import socket
import time
import numpy as np
import pickle, cPickle
import random
import job
def hostname():
return socket.gethostname()
def working_dir():
return os.getcwd()
def listdir(path):
return os.listdir(path)
def fac(arg):
return np.math.factorial(arg)
def dump(arg):
path = working_dir() + '/out'
if not os.path.exists(path):
os.mkdir(path)
f_path = path + '/fac_{}.txt'.format(arg)
t_0 = time.time()
num = fac(arg) # Main operation
t_1 = time.time()
cPickle.dump(num, open(f_path, "w"), protocol=2) # Main operation
t_2 = time.time()
duration_0 = "{:.4f}".format(t_1 - t_0)
duration_1 = "{:.4f}".format(t_2 - t_1)
#num2 = cPickle.load(open(f_path, "rb"))
return '--Calculation: {} s, dumping: {} s'.format(
duration_0, duration_1)
if __name__ == '__channelexec__':
channel.send("ready")
for arg in channel:
if arg is None:
break
elif str(arg).isdigit():
channel.send((
str(arg)+'!',
job.hostname(),
job.dump(arg)
))
else:
print 'Warning! arg sent should be a number or None'
Yes, you are on the right track. Use the psutil library to manage the processes, find their PIDs, etc.
And kill them. There is no need to involve bash anywhere; Python covers it all.
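For instance, a rough sketch with psutil; the marker string is a placeholder for however the remote job can be recognised, e.g. part of its command line:
import psutil

def kill_jobs_matching(marker):
    # Find every process whose command line contains the marker and terminate it,
    # falling back to SIGKILL if it does not exit within five seconds.
    for proc in psutil.process_iter(['pid', 'cmdline']):
        cmdline = ' '.join(proc.info['cmdline'] or [])
        if marker in cmdline:
            proc.terminate()
            try:
                proc.wait(timeout=5)
            except psutil.TimeoutExpired:
                proc.kill()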
Or, even better, program your script to terminate when the master says so.
It is usually done that way.
You can even make it start another script before terminating itself if you want/need.
Or, if the new job is the same kind of work you would otherwise run in another process, just stop the current work and start the new job in the same script, without terminating it at all.
And, if I may make a suggestion: don't read your file line by line; read the whole file and then use .splitlines(). For small files, reading them in chunks just tortures the I/O. You won't need .strip() either, and you should remove the unused imports as well.
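Applied to get_cluster_info() from the submit script above, that suggestion would look roughly like this (same behaviour, just without the readline() loop):
def get_cluster_info(f):
    # Read the whole file at once and split it into lines; the with-block
    # closes the file, and splitlines() already drops the trailing newlines.
    nodes, ncores = [], []
    with open(f) as fid:
        for line in fid.read().splitlines():
            parts = line.split()
            if parts:
                nodes.append(parts[0])
                ncores.append(int(parts[1]))
    return dict(zip(nodes, ncores))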
I am stuck on a difficult problem and not able to figure out which way to go; I have been making attempts the whole day and have posted several times. This is not a duplicate question, since I need clarity on how I can use multiple threads to grab multiple chunks simultaneously from a server, but write them to disk atomically by locking the file write operation to single-thread access while the next thread waits for the lock to be released.
import argparse,logging, Queue, os, requests, signal, sys, time, threading
import utils as _fdUtils
DESKTOP_PATH = os.path.expanduser("~/Desktop")
appName = 'FileDownloader'
logFile = os.path.join(DESKTOP_PATH, '%s.log' % appName)
_log = _fdUtils.fdLogger(appName, logFile, logging.DEBUG, logging.DEBUG, console_level=logging.DEBUG)
queue = Queue.Queue()
STOP_REQUEST = threading.Event()
maxSplits = threading.BoundedSemaphore(3)
threadLimiter = threading.BoundedSemaphore(5)
lock = threading.Lock()
Update 1:
def _grabAndWriteToDisk(url, saveTo, first=None, queue=None, mode='wb', irange=None):
""" Function to download file..
Args:
url(str): url of file to download
saveTo(str): path where to save file
first(int): starting byte of the range
queue(Queue.Queue): queue object to set status for file download
mode(str): mode of file to be downloaded
irange(str): range of byte to download
"""
fileName = url.split('/')[-1]
filePath = os.path.join(saveTo, fileName)
fileSize = int(_fdUtils.getUrlSizeInBytes(url))
downloadedFileSize = 0 if not first else first
block_sz = 8192
irange = irange if irange else '0-%s' % fileSize
# print mode
resp = requests.get(url, headers={'Range': 'bytes=%s' % irange}, stream=True)
fileBuffer = resp.raw.read()
with open(filePath, mode) as fd:
downloadedFileSize += len(fileBuffer)
fd.write(fileBuffer)
status = r"%10d [%3.2f%%]" % (downloadedFileSize, downloadedFileSize * 100. / fileSize)
status = status + chr(8)*(len(status)+1)
sys.stdout.write('%s\r' % status)
time.sleep(.05)
sys.stdout.flush()
if downloadedFileSize == fileSize:
STOP_REQUEST.set()
if queue:
queue.task_done()
_log.info("Download Completed %s%% for file %s, saved to %s",
downloadedFileSize * 100. / fileSize, fileName, saveTo)
class ThreadedFetch(threading.Thread):
""" docstring for ThreadedFetch
"""
def __init__(self, queue):
super(ThreadedFetch, self).__init__()
self.queue = queue
self.lock = threading.Lock()
def run(self):
threadLimiter.acquire()
try:
items = self.queue.get()
url = items[0]
saveTo = DESKTOP_PATH if not items[1] else items[1]
split = items[-1]
# grab split chunks in separate thread.
if split > 1:
maxSplits.acquire()
try:
sizeInBytes = int(_fdUtils.getUrlSizeInBytes(url))
byteRanges = _fdUtils.getRange(sizeInBytes, split)
mode = 'wb'
th = threading.Thread(target=_grabAndWriteToDisk, args=(url, saveTo, first, self.queue, mode, _range))
_log.info("Pulling for range %s using %s" , _range, th.getName())
th.start()
# _grabAndWriteToDisk(url, saveTo, first, self.queue, mode, _range)
mode = 'a'
finally:
maxSplits.release()
else:
while not STOP_REQUEST.isSet():
self.setName("primary_%s" % url.split('/')[-1])
# if downloading the whole file in a single chunk, no need
# to start a new thread, so download directly here.
_grabAndWriteToDisk(url, saveTo, 0, self.queue)
finally:
threadLimiter.release()
def main(appName, flag='with'):
args = _fdUtils.getParser()
urls_saveTo = {}
if flag == 'with':
_fdUtils.Watcher()
elif flag != 'without':
_log.info('unrecognized flag: %s', flag)
sys.exit()
# spawn a pool of threads, and pass them queue instance
# each url will be downloaded concurrently
for i in xrange(len(args.urls)):
t = ThreadedFetch(queue)
t.daemon = True
t.start()
split = 4
try:
for url in args.urls:
# TODO: put split as value of url as tuple with saveTo
urls_saveTo[url] = args.saveTo
# populate queue with data
for url, saveTo in urls_saveTo.iteritems():
queue.put((url, saveTo, split))
# wait on the queue until everything has been processed
queue.join()
_log.info('Finished all downloads.')
except (KeyboardInterrupt, SystemExit):
_log.critical('! Received keyboard interrupt, quitting threads.')
I expect multiple threads to grab the chunks, but in the terminal I see the same thread grabbing every range:
INFO - Pulling for range 0-25583 using Thread-1
INFO - Pulling for range 25584-51166 using Thread-1
INFO - Pulling for range 51167-76748 using Thread-1
INFO - Pulling for range 76749-102331 using Thread-1
INFO - Download Completed 100.0% for file 607800main_kepler1200_1600-1200.jpg
but what I am expecting is
INFO - Pulling for range 0-25583 using Thread-1
INFO - Pulling for range 25584-51166 using Thread-2
INFO - Pulling for range 51167-76748 using Thread-3
INFO - Pulling for range 76749-102331 using Thread-4
Please do not mark this as a duplicate without understanding it...
Please recommend a better approach if there is one for what I am trying to do.
Cheers :/
If you want the downloads to happen simultaneously and only the writes to the file to be atomic, then don't put the lock around downloading and writing to the file; put it around the write part only.
Since every thread uses its own file object to write, I don't see the need to lock that access anyway. What you have to make sure of is that every thread writes to the correct offset, so you need a seek() call on the file before writing the data chunk. Otherwise you'd have to write the chunks in file order, which would make things more complicated.
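A sketch of that approach, assuming the file is created at its final size up front so every worker can open it in 'r+b' mode and seek to its own offset; preallocate() and write_chunk() are hypothetical helpers, not part of the code above:
def preallocate(file_path, total_size):
    # Create the file once, at its final size, before any worker starts,
    # so that every byte offset is valid to seek to.
    with open(file_path, 'wb') as fd:
        fd.truncate(total_size)

def write_chunk(file_path, offset, chunk):
    # Each thread opens its own handle, seeks to its own byte offset and writes
    # only its own bytes; no lock is needed because the ranges never overlap.
    with open(file_path, 'r+b') as fd:
        fd.seek(offset)
        fd.write(chunk)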
I'm trying to speed up some data processing using the multiprocessing module, the idea being that I can send a chunk of data to each process I start up and thereby utilize all the cores on my machine instead of just one at a time.
So I built an iterator for the data using the pandas read_fwf() function, with chunksize=50000 lines at a time. My problem is that the iterator should eventually raise StopIteration; I'm trying to catch this in an except block in the child process and pass it along to the parent using a Queue, to let the parent know it can stop spawning child processes. I have no idea what's wrong, though: what happens is that it gets to the end of the data and then keeps spawning processes which essentially do nothing.
def MyFunction(data_iterator, results_queue, Placeholder, message_queue):
try:
current_data = data_iterator.next()
#does other stuff here
#that isn't important
placeholder_result = "Eggs and Spam"
results_queue.put(placeholder_result)
return None
except StopIteration:
message_queue.put("Out Of Data")
return None
results_queue = Queue() #for passing results from each child process
message_queue = Queue() #for passing the stop iteration message
cpu_count = cpu_count() #num of cores on the machine
Data_Remaining = True #loop control
output_values = [] #list to put results in
print_num_records = 0 #used to print how many lines have been processed
my_data_file = "some_data.dat"
data_iterator = BuildDataIterator(my_data_file)
while Data_Remaining:
processes = []
for process_num in range(cpu_count):
if __name__ == "__main__":
p = Process(target=MyFunction, args=(data_iterator,results_queue,Placeholder, message_queue))
processes.append(p)
p.start()
print "Process " + str(process_num) + " Started" #print some stuff to
print_num_records = print_num_records + 50000 #show how far along
print "Processing records through: ", print_num_records #my data file I am
for i,p in enumerate(processes):
print "Joining Process " + str(i)
output_values.append(results_queue.get())
p.join(None)
if not message_queue.empty():
message = message_queue.get()
else:
message = ""
if message == "Out Of Data":
Data_Remaining = False
print "STOP ITERATION NOW PLEASE"
Update:
I discovered a problem with the data iterator. There are approximately 8 million rows in my data set, and after it processes the 8 million it never actually raises StopIteration; it keeps returning the same 14 rows of data over and over. Here is the code that builds my data iterator:
def BuildDataIterator(my_data_file):
#data_columns is a list of 2-tuples
#headers is a list of strings
#num_lines is 50000
data_reader = read_fwf(my_data_file, colspecs=data_columns, header=None, names=headers, chunksize=num_lines)
data_iterator = data_reader.__iter__()
return data_iterator
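Not part of the original code, but for reference: one way to sidestep both problems is to drive the iterator from the parent process, so StopIteration is caught there, and hand each worker the already-read chunk instead of the shared iterator. A rough sketch under that assumption, reusing BuildDataIterator() from above; process_chunk() stands in for the real per-chunk work:
import multiprocessing

def process_chunk(chunk, results_queue):
    # The worker receives an already-materialised DataFrame chunk, so no
    # iterator state is ever shared between processes.
    placeholder_result = len(chunk)               # stand-in for the real processing
    results_queue.put(placeholder_result)

if __name__ == "__main__":
    results_queue = multiprocessing.Queue()
    data_iterator = BuildDataIterator("some_data.dat")
    output_values = []
    done = False
    while not done:
        processes = []
        for _ in range(multiprocessing.cpu_count()):
            try:
                chunk = next(data_iterator)       # StopIteration surfaces here, in the parent
            except StopIteration:
                done = True
                break
            p = multiprocessing.Process(target=process_chunk, args=(chunk, results_queue))
            p.start()
            processes.append(p)
        for p in processes:
            output_values.append(results_queue.get())
            p.join()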