python, multthreading, safe to use pandas "to_csv" on common file? - python

I've got some code that works pretty nicely. It's a while-loop that goes through a list of dates, finds files on my HDD that corresponds to those dates, does some calculations with those files, and then outputs to a "results.csv" file using the command:
my_df.to_csv("results.csv",mode = 'a')
I'm wondering if it's safe to create a new thread for each date, and call the stuff in the while loop on several dates at a time?
MY CODE:
import datetime, time, os
import sys
import threading
import helperPY #a python file containing the logic I need
class myThread (threading.Thread):
def __init__(self, threadID, name, counter,sn, m_date):
threading.Thread.__init__(self)
self.threadID = threadID
self.name = name
self.counter = counter
self.sn = sn
self.m_date = m_date
def run(self):
print "Starting " + self.name
m_runThis(sn, m_date)
print "Exiting " + self.name
def m_runThis(sn, m_date):
helperPY.helpFn(sn,m_date) #this is where the "my_df.to_csv()" is called
sn = 'XXXXXX'
today=datetime.datetime(2016,9,22) #
yesterday=datetime.datetime(2016,6,13)
threadList = []
i_threadlist=0
while(today>yesterday):
threadList.append(myThread(i_threadlist, str(today), i_threadlist,sn,today))
threadList[i_threadlist].start()
i_threadlist = i_threadlist +1
today = today-datetime.timedelta(1)

Writing the file in multiple threads is not safe. But you can create a lock to protect that one operation while letting the rest run in parallel. Your to_csv isn't shown, but you could create the lock
csv_output_lock = threading.Lock()
and pass it to helperPY.helpFn. When you get to the operation, do
with csv_output_lock:
my_df.to_csv("results.csv",mode = 'a')
You get parallelism for other operations - subject to the GIL of course - but the file access is protected.

Related

python Thread.name is printing last thread created name

I'm a newbie to Python and learning about threads. I have created a sample Producer-Consumer code wherein I add a movie to a list in Producer thread and pop the front element from the same list in Consumer thread. The problem is while printing the items of the movie List along with thread name I'm getting incorrect thread name in Producer thread. This is my code
Producer.py
from threading import Thread
from threading import RLock
import time
class Producer(Thread):
def __init__(self):
Thread.__init__(self)
Thread.name = 'Producer'
self.movieList = list()
self.movieListLock = RLock()
def printMovieList(self):
self.movieListLock.acquire()
if len(self.movieList) > 0:
for movie in self.movieList:
print(Thread.name, movie)
print('\n')
self.movieListLock.release()
def pushMovieToList(self, movie):
self.movieListLock.acquire()
self.movieList.append(movie)
self.printMovieList()
self.movieListLock.release()
def run(self):
for i in range(6):
self.pushMovieToList('Avengers' + str(i + 1))
time.sleep(1)
Consumer.py
from threading import Thread
import time
class Consumer(Thread):
def __init__(self):
Thread.__init__(self)
Thread.name = 'Consumer'
self.objProducer = None
def popMovieFromList(self):
self.objProducer.movieListLock.acquire()
if len(self.objProducer.movieList) > 0:
movie = self.objProducer.movieList.pop(0)
print(Thread.name, ':', movie)
print('\n')
self.objProducer.movieListLock.release()
def run(self):
while True:
time.sleep(1)
self.popMovieFromList()
Main.py
from Producer import *
from Consumer import *
def main():
objProducer = Producer()
objConsumer = Consumer()
objConsumer.objProducer = objProducer
objProducer.start()
objConsumer.start()
objProducer.join()
objConsumer.join()
main()
I am not sure whether you solve this problem.
Hope my answer will be helpful.
You can check the document of threading.
Here it says that Thread.name may set same name for multiple thread.
name
A string used for identification purposes only. It has no semantics. Multiple threads may be given the same name. The initial name is set by the constructor.
I think Thread.name is a static variable so it shares in different thread.
If you want to set name of thread, you can set it in thread object like this:
class Producer(Thread):
def __init__(self):
Thread.__init__(self)
self.name= 'Producer'
and get it by threading.current_thread().name.
if len(self.movieList) > 0:
for movie in self.movieList:
print(threading.current_thread().name, movie)
Hope you enjoy it!

Memory leak while retrieving data from a proxy class

I am multi-processing data from a series of files.
To achieve the purpose, I built a class to distribute the data.
I started 4 processes that will visit the same class and retrieve data.
The problem is, if I use the class method (retrieve()) to retrieve data, the memory will keep going up. If I don't, the memory is stable, even though the data keeps refreshing by getData(). How to keep a stable memory usage while retrieving data? Or any other way to achieve the same goal?
import pandas as pd
from multiprocessing import Process, RLock
from multiprocessing.managers import BaseManager
class myclass():
def __init__(self, path):
self.path = path
self.lock = RLock()
self.getIter()
def getIter(self):
self.iter = pd.read_csv(self.path, chunksize=1000)
def getData(self):
with self.lock:
try:
self.data = next(self.iter)
except:
self.getIter()
self.data = next(self.iter)
def retrieve(self):
return self.data
def worker(c):
while True:
c.getData()
# Uncommenting the following line, memory usage goes up
data = c.retrieve()
#Generate a testing file
with open('tmp.csv', 'w') as f:
for i in range(1000000):
f.write('%f\n'%(i*1.))
BaseManager.register('myclass', myclass)
bm = BaseManager()
bm.start()
c = bm.myclass('tmp.csv')
for i in range(4):
p = Process(target=worker, args=(c,))
p.start()
I wasn't able to find out the cause nor solving it, but after changing the data type for the returning variable from pandas.DataFrame to a str (json string), the problem goes.

Shared state in multiprocessing Processes

Please consider this code:
import time
from multiprocessing import Process
class Host(object):
def __init__(self):
self.id = None
def callback(self):
print "self.id = %s" % self.id
def bind(self, event_source):
event_source.callback = self.callback
class Event(object):
def __init__(self):
self.callback = None
def trigger(self):
self.callback()
h = Host()
h.id = "A"
e = Event()
h.bind(e)
e.trigger()
def delayed_trigger(f, delay):
time.sleep(delay)
f()
p = Process(target = delayed_trigger, args = (e.trigger, 3,))
p.start()
h.id = "B"
e.trigger()
This gives in output
self.id = A
self.id = B
self.id = A
However, I expected it to give
self.id = A
self.id = B
self.id = B
..because the h.id was already changed to "B" by the time the trigger method was called.
It seems that a copy of host instance is created at the moment when the separate Process is started, so the changes in the original host do not influence that copy.
In my project (more elaborate, of course), the host instance fields are altered time to time, and it is important that the events that are triggered by the code running in a separate process, have access to those changes.
multiprocessing runs stuff in separate processes. It is almost inconceivable that things are not copied as they're sent, as sharing stuff between processes requires shared memory or communication.
In fact, if you peruse the module, you can see the amount of effort it takes to actually share anything between the processes after the diverge, either through explicit communication, or through explicitly-shared objects (which are of a very limited subset of the language, and have to be managed by a Manager).

How to properly set up multiprocessing proxy objects for objects that already exist

I'm trying to share an existing object across multiple processing using the proxy methods described here. My multiprocessing idiom is the worker/queue setup, modeled after the 4th example here.
The code needs to do some calculations on data that are stored in rather large files on disk. I have a class that encapsulates all the I/O interactions, and once it has read a file from disk, it saves the data in memory for the next time a task needs to use the same data (which happens often).
I thought I had everything working from reading the examples linked to above. Here is a mock up of the code that just uses numpy random arrays to model the disk I/O:
import numpy
from multiprocessing import Process, Queue, current_process, Lock
from multiprocessing.managers import BaseManager
nfiles = 200
njobs = 1000
class BigFiles:
def __init__(self, nfiles):
# Start out with nothing read in.
self.data = [ None for i in range(nfiles) ]
# Use a lock to make sure only one process is reading from disk at a time.
self.lock = Lock()
def access(self, i):
# Get the data for a particular file
# In my real application, this function reads in files from disk.
# Here I mock it up with random numpy arrays.
if self.data[i] is None:
with self.lock:
self.data[i] = numpy.random.rand(1024,1024)
return self.data[i]
def summary(self):
return 'BigFiles: %d, %d Storing %d of %d files in memory'%(
id(self),id(self.data),
(len(self.data) - self.data.count(None)),
len(self.data) )
# I'm using a worker/queue setup for the multprocessing:
def worker(input, output):
proc = current_process().name
for job in iter(input.get, 'STOP'):
(big_files, i, ifile) = job
data = big_files.access(ifile)
# Do some calculations on the data
answer = numpy.var(data)
msg = '%s, job %d'%(proc, i)
msg += '\n Answer for file %d = %f'%(ifile, answer)
msg += '\n ' + big_files.summary()
output.put(msg)
# A class that returns an existing file when called.
# This is my attempted workaround for the fact that Manager.register needs a callable.
class ObjectGetter:
def __init__(self, obj):
self.obj = obj
def __call__(self):
return self.obj
def main():
# Prior to the place where I want to do the multprocessing,
# I already have a BigFiles object, which might have some data already read in.
# (Here I start it out empty.)
big_files = BigFiles(nfiles)
print 'Initial big_files.summary = ',big_files.summary()
# My attempt at making a proxy class to pass big_files to the workers
class BigFileManager(BaseManager):
pass
getter = ObjectGetter(big_files)
BigFileManager.register('big_files', callable = getter)
manager = BigFileManager()
manager.start()
# Set up the jobs:
task_queue = Queue()
for i in range(njobs):
ifile = numpy.random.randint(0, nfiles)
big_files_proxy = manager.big_files()
task_queue.put( (big_files_proxy, i, ifile) )
# Set up the workers
nproc = 12
done_queue = Queue()
process_list = []
for j in range(nproc):
p = Process(target=worker, args=(task_queue, done_queue))
p.start()
process_list.append(p)
task_queue.put('STOP')
# Log the results
for i in range(njobs):
msg = done_queue.get()
print msg
print 'Finished all jobs'
print 'big_files.summary = ',big_files.summary()
# Shut down the workers
for j in range(nproc):
process_list[j].join()
task_queue.close()
done_queue.close()
main()
This works in the sense that it calculates everything correctly, and it is caching the data that is read along the way. The only problem I'm having is that at the end, the big_files object doesn't have any of the files loaded. The final msg returned is:
Process-2, job 999. Answer for file 198 = 0.083406
BigFiles: 4303246400, 4314056248 Storing 198 of 200 files in memory
But then after it's all done, we have:
Finished all jobs
big_files.summary = BigFiles: 4303246400, 4314056248 Storing 0 of 200 files in memory
So my question is: What happened to all the stored data? It's claiming to be using the same self.data according to the id(self.data). But it's empty now.
I want the end state of big_files to have all the saved data that it accumulated along the way, since I actually have to repeat this entire process many times, so I don't want to have to redo all the (slow) I/O each time.
I'm assuming it must have something to do with my ObjectGetter class. The examples for using BaseManager only show how to make a new object that will be shared, not share an existing one. So am I doing something wrong with way I get the existing big_files object? Can anyone suggest a better way to do this step?
Thanks much!

Creating interruptible process in python

I'm creating a python script of which parses a large (but simple) CSV.
It'll take some time to process. I would like the ability to interrupt the parsing of the CSV so I can continue at a later stage.
Currently I have this - of which lives in a larger class: (unfinished)
Edit:
I have some changed code. But the system will parse over 3 million rows.
def parseData(self)
reader = csv.reader(open(self.file))
for id, title, disc in reader:
print "%-5s %-50s %s" % (id, title, disc)
l = LegacyData()
l.old_id = int(id)
l.name = title
l.disc_number = disc
l.parsed = False
l.save()
This is the old code.
def parseData(self):
#first line start
fields = self.data.next()
for row in self.data:
items = zip(fields, row)
item = {}
for (name, value) in items:
item[name] = value.strip()
self.save(item)
Thanks guys.
If under linux, hit Ctrl-Z and stop the running process. Type "fg" to bring it back and start where you stopped it.
You can use signal to catch the event. This is a mockup of a parser than can catch CTRL-C on windows and stop parsing:
import signal, tme, sys
def onInterupt(signum, frame):
raise Interupted()
try:
#windows
signal.signal(signal.CTRL_C_EVENT, onInterupt)
except:
pass
class Interupted(Exception): pass
class InteruptableParser(object):
def __init__(self, previous_parsed_lines=0):
self.parsed_lines = previous_parsed_lines
def _parse(self, line):
# do stuff
time.sleep(1) #mock up
self.parsed_lines += 1
print 'parsed %d' % self.parsed_lines
def parse(self, filelike):
for line in filelike:
try:
self._parse(line)
except Interupted:
print 'caught interupt'
self.save()
print 'exiting ...'
sys.exit(0)
def save(self):
# do what you need to save state
# like write the parse_lines to a file maybe
pass
parser = InteruptableParser()
parser.parse([1,2,3])
Can't test it though as I'm on linux at the moment.
The way I'd do it:
Puty the actual processing code in a class, and on that class I'd implement the Pickle protocol (http://docs.python.org/library/pickle.html ) (basically, write proper __getstate__ and __setstate__ functions)
This class would accept the filename, keep the open file, and the CSV reader instance as instance members. The __getstate__ method would save the current file position, and setstate would reopen the file, forward it to the proper position, and create a new reader.
I'd perform the actuall work in an __iter__ method, that would yeld to an external function after each line was processed.
This external function would run a "main loop" monitoring input for interrupts (sockets, keyboard, state of an specific file on the filesystem, etc...) - everything being quiet, it would just call for the next iteration of the processor. If an interrupt happens, it would pickle the processor state to an specific file on disk.
When startingm the program just has to check if a there is a saved execution, if so, use pickle to retrieve the executor object, and resume the main loop.
Here goes some (untested) code - the iea is simple enough:
from cPickle import load, dump
import csv
import os, sys
SAVEFILE = "running.pkl"
STOPNOWFILE = "stop.now"
class Processor(object):
def __init__(self, filename):
self.file = open(filename, "rt")
self.reader = csv.reader(self.file)
def __iter__(self):
for line in self.reader():
# do stuff
yield None
def __getstate__(self):
return (self.file.name, self.file.tell())
def __setstate__(self, state):
self.file = open(state[0],"rt")
self.file.seek(state[1])
self.reader = csv.reader(self.File)
def check_for_interrupts():
# Use your imagination here!
# One simple thing would e to check for the existence of an specific file
# on disk.
# But you go all the way up to instantiate a tcp server and listen to
# interruptions on the network
if os.path.exists(STOPNOWFILE):
return True
return False
def main():
if os.path.exists(SAVEFILE):
with open(SAVEFILE) as savefile:
processor = load(savefile)
os.unlink(savefile)
else:
#Assumes the name of the .csv file to be passed on the command line
processor = Processor(sys.argv[1])
for line in processor:
if check_for_interrupts():
with open(SAVEFILE, "wb") as savefile:
dump(processor)
break
if __name__ == "__main__":
main()
My Complete Code
I followed the advice of #jsbueno with a flag - but instead of another file, I kept it within the class as a variable:
I create a class - when I call it asks for ANY input and then begins another process doing my work. As its looped - if I were to press a key, the flag is set and only checked when the loop is called for my next parse. Thus I don't kill the current action.
Adding a process flag in the database for each object from the data I'm calling means I can start this any any time and resume where I left off.
class MultithreadParsing(object):
process = None
process_flag = True
def f(self):
print "\nMultithreadParsing has started\n"
while self.process_flag:
''' get my object from database '''
legacy = LegacyData.objects.filter(parsed=False)[0:1]
if legacy:
print "Processing: %s %s" % (legacy[0].name, legacy[0].disc_number)
for l in legacy:
''' ... Do what I want it to do ...'''
sleep(1)
else:
self.process_flag = False
print "Nothing to parse"
def __init__(self):
self.process = Process(target=self.f)
self.process.start()
print self.process
a = raw_input("Press any key to stop \n")
print "\nKILL FLAG HAS BEEN SENT\n"
if a:
print "\nKILL\n"
self.process_flag = False
Thanks for all you help guys (especially yours #jsbueno) - if it wasn't for you I wouldn't have got this class idea.

Categories