I've read the documentation on how to write strings into a compressed python file:
with bz2.open ( "/tmp/test.bz2", "w" ) as f:
f.write ( b"Hello" )
The problem I have is that I have functions accepting a file parameter, which is passed to the print() function, i.e.:
def produce_out ( out = sys.stdout ):
# many print ( file = out )
# invocations of other functions accepting out
Clearly, the cleanest and most modular way to obtain that my output is printed and compressed at the same time would be chaining the two above, ie:
with bz2.open ( "/tmp/test.bz2", "w" ) as f:
out = compressed_stream_adapter ( f )
produce_out ( out )
where compressed_stream_adapter() yields some object compatible with the file parameter that print() accepts and which automatically forwards the strings it receives to the compressed stream. This is how the compression works in Java, or how you can use the pipe operator in Linux shells to compress any kind of output (which also parallelises its endpoints, but that's not very important here).
My question is: does anything like compressed_stream_adapter() exist in python? Is there a different way to do it that does not require to change existing code?
Note that I already know I could do: out = io.StringIO () and later:
f.write ( out.getvalue ().encode () ). However, that's not good when I have to dynamically dump big amounts of data to files (which indeed, is why I want to compress them).
Answering myself: I guess there isn't any off-the-shelf way to do that.
So, I've followed the Dan Mašek comments and implemented a little wrapper, which relies on the fact that print() expects an object having a write method:
class BinaryWriter:
    """Text-to-binary adapter.

    Wraps a binary stream (e.g. the result of ``bz2.open(path, "wb")``)
    and exposes the ``write(str)`` interface that ``print(file=...)``
    expects: strings are encoded and forwarded to the wrapped stream.
    """

    def __init__(self, bin_out, encoding="utf-8", errors="strict"):
        # bin_out: any object with a write(bytes) method.
        # encoding / errors: forwarded verbatim to str.encode().
        self.bin_out = bin_out
        self.encoding = encoding
        self.errors = errors

    def write(self, s: str) -> int:
        """Encode *s* and forward it to the binary stream.

        Returns the number of characters consumed, matching the
        io.TextIOBase.write() contract (print() ignores the return
        value, but other file-like consumers may rely on it; the
        original returned None).
        """
        self.bin_out.write(s.encode(self.encoding, self.errors))
        return len(s)

    def flush(self):
        """Flush the underlying stream if it supports flushing.

        Needed so that ``print(..., file=out, flush=True)`` does not
        raise AttributeError (it did with the original class).
        """
        if hasattr(self.bin_out, "flush"):
            self.bin_out.flush()

    def close(self):
        """Close the underlying binary stream."""
        self.bin_out.close()
Usage:
with bz2.open ( file_path, "w" ) as bout:
out = BinaryWriter ( bout )
print ( "Hello, world", file = out )
my_output ( out ) # Uses print( ..., file = out )
If compression is optional:
out = open ( file_path, mode = "w" ) if not file_path.endswith ( ".bz2" ) \
else BinaryWriter ( bz2.open ( file_path, "w" ) )
try:
my_output ( out )
finally:
out.close ()
Related
Every reference I find for creating a buffer in ctypes seems to create one of static length...
Where I'm dealing with data read from a file handled by ctypes that defines inline buffers within a struct where the length is initially unknown until read.
import ctypes
class Buffer16(ctypes.Structure):
    """Length-prefixed buffer as laid out in the file: a big-endian
    u16 length followed by (what should be) `length` payload bytes."""
    _fields_ = [
        ('length', ctypes.c_ushort.__ctype_be__),
        ('data', ctypes.c_ubyte * 0),  # placeholder; to be resized via malloc
    ]

    def __new__(cls):  # not executed for some reason
        # wish we could interrupt here, before the 0-length array is read...
        instance = ctypes.Structure.__new__(cls)
        # some unknown magic here to malloc instance.data
        return instance


class Test(ctypes.Structure):
    """Fixed big-endian u32 header followed by two Buffer16 records."""
    _fields_ = [
        ('data', ctypes.c_uint.__ctype_be__),
        ('buf1', Buffer16),
        ('buf2', Buffer16),
    ]
I can easily define the data as a c_ubyte array as read from the file, and initialize the struct with Structure.from_address(ctypes.addressof(bytedata))...
But the problem here is __new__ and __init__ don't get executed, so the buffers aren't sized appropriately.
here's some test data for an example:
>>> bytedata = (ctypes.c_ubyte*19)(*b'\x00\x04\x18\x80\x00\x04test\x00\x07testing')
>>>
>>> testinstance = Test.from_address(ctypes.addressof(bytedata))
>>> testinstance.data # just some dummy data which is correct
268416
>>> testinstance.buf1.length # this is correct
4
>>> testinstance.buf1.data # this should be __len__ == 4
<__main__.c_ubyte_Array_0 object at 0x...>
>>> testinstance.buf2.length # this is wrong (0x7465 from b'te'), it should be 7
29797
Is there a better way that can inline malloc than from_address?
(casting is no different from from_address other than testinstance[0])
You've got variable-sized data in your structure. How would you create this structure in C? Typically only the last element in a structure can be an array and C allows one index beyond the end of the structure, but in this case you have two variables.
Although it can be done in ctypes, I'll first suggest unpacking the data as you go with the struct module. If you are reading the data from a file, all you really care about is obtaining the data and the buffers and it doesn't need to be in ctypes format, nor do you need the lengths beyond their use reading the buffers:
import struct
import io

# Create a file-like byte stream standing in for the real input file.
filedata = io.BytesIO(b'\x00\x04\x18\x80\x00\x04test\x00\x07testing')

# Header: big-endian u32 value, then the first buffer's u16 length.
data, len1 = struct.unpack('>LH', filedata.read(6))
data1 = filedata.read(len1)

# Second buffer: big-endian u16 length, then its payload.
# (The original used f'>H' — an f-string with nothing to interpolate.)
len2, = struct.unpack('>H', filedata.read(2))
data2 = filedata.read(len2)

print(hex(data), data1, data2)
Output:
0x41880 b'test' b'testing'
Here's a way to do it in ctypes by creating a custom class definition for each structure, but is the data really needed in a ctypes format?
import struct
import ctypes
import io
# Read a variable-sized Buffer16 object from the file.
# Once the length is read, declare a custom class with data of that length.
def read_Buffer16(filedata):
    """Read one variable-sized Buffer16 record from the stream.

    Reads the big-endian u16 length prefix, then declares a one-off
    ctypes class whose data field holds exactly that many bytes.
    """
    (size,) = struct.unpack('>H', filedata.read(2))

    class Buffer16(ctypes.BigEndianStructure):
        _fields_ = (('length', ctypes.c_ushort),
                    ('data', ctypes.c_char * size))

        def __repr__(self):
            return f'Buffer16({self.length}, {self.data})'

    return Buffer16(size, filedata.read(size))
# Read a variable-sized Test object from the file.
# Once the buffers are read, declare a custom class of their exact type.
def read_Test(filedata):
    """Read one variable-sized Test record from the stream.

    The two buffers are read first; only then can a ctypes class with
    their exact concrete types be declared.
    """
    (header,) = struct.unpack('>L', filedata.read(4))
    first = read_Buffer16(filedata)
    second = read_Buffer16(filedata)

    class Test(ctypes.BigEndianStructure):
        _fields_ = (('data', ctypes.c_uint),
                    ('buf1', type(first)),
                    ('buf2', type(second)))

        def __repr__(self):
            return f'Test({self.data:#x}, {self.buf1}, {self.buf2})'

    return Test(header, first, second)
# create a file-like byte stream
filedata = io.BytesIO(b'\x00\x04\x18\x80\x00\x04test\x00\x07testing')
t = read_Test(filedata)
print(t)
Output:
Test(0x41880, Buffer16(4, b'test'), Buffer16(7, b'testing'))
Edit per comment
This might be how you'd store this file data in a C-like structure. The variable buffers are read in, stored in an array (similar to C malloc) and its length and address are stored in the structure. The class methods know how to read a particular structure from the file stream and return the appropriate object. Note, however, that just like in C you can read past the end of a pointer and risk exceptions or undefined behavior.
import struct
import ctypes
import io
class Buffer16(ctypes.Structure):
    """C-like view of a length-prefixed buffer: a u16 length plus a
    pointer to the separately allocated payload bytes."""
    _fields_ = (('length', ctypes.c_ushort),
                ('data', ctypes.POINTER(ctypes.c_char)))

    @classmethod  # was '#classmethod' — the '@' was lost when the code was pasted
    def read(cls, file):
        """Read one record from *file*: big-endian u16 length + payload."""
        length, = struct.unpack('>H', file.read(2))
        data = (ctypes.c_char * length)(*file.read(length))
        obj = cls(length, data)
        # Keep a reference to the backing array: the struct stores only a
        # pointer, so without this the payload could be garbage-collected
        # and the pointer left dangling.
        obj._backing = data
        return obj

    def __repr__(self):
        return f'Buffer16({self.data[:self.length]})'
class Test(ctypes.Structure):
    """C-like record: a u32 header followed by two Buffer16 values."""
    _fields_ = (('data', ctypes.c_uint),
                ('buf1', Buffer16),
                ('buf2', Buffer16))

    @classmethod  # was '#classmethod' — the '@' was lost when the code was pasted
    def read(cls, file):
        """Read one Test record (header + two Buffer16s) from *file*."""
        data, = struct.unpack('>L', file.read(4))
        b1 = Buffer16.read(file)
        b2 = Buffer16.read(file)
        obj = cls(data, b1, b2)
        # The Buffer16 structs are copied by value into this struct; keep
        # the originals alive so their payload arrays are not collected.
        obj._buffers = (b1, b2)
        return obj

    def __repr__(self):
        return f'Test({self.data:#x}, {self.buf1}, {self.buf2})'
# create a file-like byte stream
file = io.BytesIO(b'\x00\x04\x18\x80\x00\x04test\x00\x07testing')
t = Test.read(file)
print(t)
print(t.buf1.length)
print(t.buf1.data[:10]) # Just like in C, you can read beyond the end of the pointer
Output:
Test(0x41880, Buffer16(b'test'), Buffer16(b'testing'))
4
b'test\x00\x00\x00\x00\x00\x00'
With credit to and inspiration from Mark Tolonen's answer, I realized his answer was a similar mechanic to the ctypes.Structure.from_address() method.
Here's my answer and tests with my updates to his:
from ctypes import Structure, c_char, c_ushort, c_uint, POINTER, addressof

# Big-endian variants of the integer types (the file data is big-endian;
# on a little-endian host the plain types would misread the bytes).
c_bushort = c_ushort.__ctype_be__
c_buint = c_uint.__ctype_be__


class Buffer16(Structure):
    """Length-prefixed buffer: big-endian u16 length + pointer to payload."""
    _fields_ = (
        ('length', c_bushort),
        ('data', POINTER(c_char)),
    )

    @classmethod  # was '#classmethod' — the '@' was lost when the code was pasted
    def from_address(cls, addr):
        """Build a Buffer16 viewing raw memory at *addr*.

        NOTE(review): 'data' points into the caller's buffer; the caller
        must keep that buffer alive while this object is in use.
        """
        length = c_bushort.from_address(addr).value
        data = (c_char * length).from_address(addr + 2)
        return cls(length, data)


class Test(Structure):
    """u32 header followed by two variable-length Buffer16 records."""
    _fields_ = (
        ('data', c_buint),
        ('buf1', Buffer16),
        ('buf2', Buffer16),
    )

    @classmethod  # was '#classmethod' in the scraped source
    def from_address(cls, addr):
        # .value: store the plain int rather than a live c_buint view.
        data = c_buint.from_address(addr).value
        b1 = Buffer16.from_address(addr + 4)
        # buf2 starts right after buf1's 2-byte length + its payload.
        b2 = Buffer16.from_address(addr + 6 + b1.length)
        return cls(data, b1, b2)


bytedata = (c_char * 19)(*b'\x00\x04\x18\x80\x00\x04test\x00\x07testing')
t = Test.from_address(addressof(bytedata))
print(t.data)
print(t.buf1.data[:t.buf1.length])
print(t.buf2.data[:t.buf2.length])
and the results:
>>>
268416
b'test'
b'testing'
Also a minor note about the enforcement of .__ctype_be__ on ctypes.c_uint and ctypes.c_ushort...
Not all systems use the same default endian when reading data.
My systems in particular read data in little endian, so b'\x00\x04\x18\x80' returns 2149057536 when processed with ctypes.c_uint, rather than the expected 268416.
I know that similar questions might have been asked before. But I couldn't find a solution that fits my case. Apologies for the dumb question in advance.
I am reading two voltage values from a USB-hub (connected to two sensors). The problem is that, the way that my code does it, there will be an approx. 0.8-second delay between them so I can never have both their values at the same time ( if I decrease the any of the two time.sleep(), the value of the second def will not be reported ). I was thinking that if both could run at the same time, maybe I could have values that belong to exact same time point and not shifted through time. If you have any comments that can improve this code, I appreciate it.
I thank you for your comments in advance.
# Reads two bridge (voltage-ratio) sensors on a Phidget USB hub, one after
# the other, timing each read. The sequential open/sleep/close per port is
# what produces the ~0.8 s skew between the two readings described above.
# NOTE(review): indentation was lost when this was pasted; the code also
# mixes Python 2 ('print a') and Python 3 ('print (Elt1)') syntax, and
# 'plt' is used below without importing matplotlib — confirm against the
# original script.
import sys
import time
import datetime
from Phidget22.Devices.VoltageRatioInput import *
from Phidget22.PhidgetException import *
from Phidget22.Phidget import *
from Phidget22.Net import *
# NOTE(review): plt is never imported (needs 'import matplotlib.pyplot as plt').
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
try:
ch = VoltageRatioInput()
except RuntimeError as e:
print("Runtime Exception %s" % e.details)
print("Press Enter to Exit...\n")
readin = sys.stdin.read(1)
exit(1)
# a / b accumulate the samples from sensor 1 and sensor 0 respectively.
a=[]
b=[]
try:
start = time.time()
while True:
# Handler appends every change event for hub port 1 to list 'a'.
# NOTE(review): redefining the handler inside the loop each pass works
# but is wasteful; defining both handlers once before the loop would do.
def VoltageRatioChangeHandler(e, voltageRatio):
n=voltageRatio
a.append(n)
ch.setOnVoltageRatioChangeHandler(VoltageRatioChangeHandler)
ch.setHubPort(1)
ch.setIsHubPortDevice(1)
ch.openWaitForAttachment(5000)
if(ch.getChannelSubclass() == ChannelSubclass.PHIDCHSUBCLASS_VOLTAGERATIOINPUT_BRIDGE):
ch.setBridgeEnabled(1)
# Collect events for 0.3 s, then release the channel for the next port.
time.sleep(0.3)
ch.close()
end1 = time.time()
Elt1 = end1-start
print (Elt1)
print a
###
# Same pattern for hub port 0, appending to 'b' (None mapped to 0).
def VoltageRatioChangeHandler(e, voltageRatio2):
m=voltageRatio2
if m is None:
b.append(0)
else:
b.append(m)
ch.setOnVoltageRatioChangeHandler(VoltageRatioChangeHandler)
ch.setHubPort(0)
ch.setIsHubPortDevice(0)
ch.openWaitForAttachment(5000)
if(ch.getChannelSubclass() == ChannelSubclass.PHIDCHSUBCLASS_VOLTAGERATIOINPUT_BRIDGE):
ch.setBridgeEnabled(1)
time.sleep(0.4)
ch.close()
end = time.time()
Elt = end - start
print (Elt)
print b
except KeyboardInterrupt:
print ("gracefully aborted")
sys.exit()
Going parallel sounds easier than done. (... watch the [us] lost here ) Better not move in this way, but:
Long story short: costs of going "parallel" are devastating for your use-case.
Also reinventing a wheel is quite expensive, so let me offer you a concept, which is almost free of charge and works like charm.
Your measurements are so close to a multi-agent control-system, so let's re-use the framework that was developed exactly for the same reason -- the MVC ( yes, an anxious and great idea, as old as the one originated in the famous nest of smart thinkers from the XEROX Palo Alto Research Centre ).
import Tkinter as tk # YESSSS! re-using a GUI-tool ( i.e. multi-agent by-design )
The key value is in the freedom of design "under" the Controller-part of the concept, using all the exceptionally well polished tools built-in, not to bother with low-level details.
High-level idea: ( a full sensoric-network control-plan may go this way )
Let the sensors get read as often as you need ( be it driven just by a common sense, a smell of reason, or an indeed rigorous Nyquist-boundary of the theory of stability of your experiment's control-loop ).
First, we may need a way to passively read a value-pair ( coherently read at the same time ( well, better within a common window-of-time, right? ) ).
SENSOR_A_Last_Voltage_Value = tk.DoubleVar()
SENSOR_B_Last_Voltage_Value = tk.DoubleVar()
SCHEDULED_EVENT_READ_A = tk.StringVar()
SCHEDULED_EVENT_READ_B = tk.StringVar()
SIGNAL_2_READ_Voltage_Value = tk.IntVar()
These are the MVC-Model-part smart-"registers", if you wish.
def aSensorREAD_A():
#--------------------------------------------------
# handle all the tricks to read a given sensor ONCE
#--------------------------------------------------
...
ch.setHubPort( 0 )
ch.setIsHubPortDevice( 0 )
ch.openWaitForAttachment( 5000 )
...
a_just_read_value = ...
#--------------------------------------------------
# Let the MVC-framework store this value into MODEL
#--------------------------------------------------
SENSOR_A_Last_Voltage_Value.set( a_just_read_value )
#--------------------------------------------------
# schedule a ( self-operated ) read "next" ONCE
#--------------------------------------------------
SCHEDULED_EVENT_READ_A.set( root.after( 100, aSensorREAD_A ) )
# repeat after 100 [ms]
#
# a cool way to command actually your Boss, isn't it?
# + may,
# if at need, root.after_cancel( SCHEDULED_EVENT_READ_A )
So, we may consider a role of a SensorREAD_*() to be one of such independent agents, responsible for doing the low-level job with the actual sensor readings.
For a passive value-consumer, there will be just a pair of "intelligent" variables, that are granted to always carry the updated ( last read )-value.
print( "[A] {0: >16.3f}[mV]".format( SENSOR_A_Last_Voltage_Value.get() ) )
print( "[B] {0: >16.3f}[mV]".format( SENSOR_B_Last_Voltage_Value.get() ) )
For a triggered-expecting value-consumer, there might be an additional tool, that will inform any such trigger-expecting-reader.
idTrA1 = SENSOR_A_Last_Voltage_Value.trace_variable( "w", aTriggerdFUN1ToCallOnA )
idTrA2 = SENSOR_A_Last_Voltage_Value.trace_variable( "w", aTriggerdFUN2ToCallOnA )
idTrA3 = SENSOR_A_Last_Voltage_Value.trace_variable( "w", aTriggerdFUN3ToCallOnA )
idTrB1 = SENSOR_B_Last_Voltage_Value.trace_variable( "w", aTriggerdFUN1ToCallOnB )
...
idTrB7 = SENSOR_B_Last_Voltage_Value.trace_variable( "w", aTriggerdFUN7ToCallOnB )
# as one may wish and need
Last, but not least, there might be another coherent-reading strategy:
SIGNAL_2_READ_Voltage_Value = tk.IntVar() # MVC-Model "register"
idTrSIG2R_A = SIGNAL_2_READ_Voltage_Value.trace_variable( "w", aSensorREAD_A ) # MVC-Controller actor
idTrSIG2R_B = SIGNAL_2_READ_Voltage_Value.trace_variable( "w", aSensorREAD_B )
This makes an external trigger-to-read tool, that will actually help to "fire" both reading "at the same moment", just by touching:
SIGNAL_2_READ_Voltage_Value.set( 1 + SIGNAL_2_READ_Voltage_Value.get() )
Also some final steps for graceful termination are clear and honest:
finally:
#---------------------------------------------------
SIGNAL_2_READ_Voltage_Value.trace_vdelete( "w", idTrSIG2R_A )
SIGNAL_2_READ_Voltage_Value.trace_vdelete( "w", idTrSIG2R_B )
#---------------------------------------------------
SENSOR_A_Last_Voltage_Value.trace_vdelete( "w", idTrA1 )
SENSOR_A_Last_Voltage_Value.trace_vdelete( "w", idTrA2 )
SENSOR_A_Last_Voltage_Value.trace_vdelete( "w", idTrA3 )
#---------------------------------------------------
SENSOR_B_Last_Voltage_Value.trace_vdelete( "w", idTrB1 )
...
SENSOR_B_Last_Voltage_Value.trace_vdelete( "w", idTrB7 )
For more details and mock-up case inspiration may like to read this
I'm looking to do something in this example: Python - How to get the start/base address of a process?. I'm having the same issue as the person in that topic, in that the pointers cheat engine provides is in reference to the base address of the process itself.
I've looked around and it looks like the best solution is to use ctypes and the MODULEENTRY32 to store snapshots of processes and analyze their modBaseAddr.
Here is my current code
import os.path, ctypes, ctypes.wintypes
from ctypes import *
from ctypes.wintypes import *
PROCESS_QUERY_INFORMATION = (0x0400)
PROCESS_VM_OPERATION = (0x0008)
PROCESS_VM_READ = (0x0010)
PROCESS_VM_WRITE = (0x0020)
TH32CS_SNAPMODULE = (0x00000008)
CreateToolhelp32Snapshot= ctypes.windll.kernel32.CreateToolhelp32Snapshot
Process32First = ctypes.windll.kernel32.Process32First
Process32Next = ctypes.windll.kernel32.Process32Next
Module32First = ctypes.windll.kernel32.Module32First
Module32Next = ctypes.windll.kernel32.Module32Next
GetLastError = ctypes.windll.kernel32.GetLastError
OpenProcess = ctypes.windll.kernel32.OpenProcess
GetPriorityClass = ctypes.windll.kernel32.GetPriorityClass
CloseHandle = ctypes.windll.kernel32.CloseHandle
class MODULEENTRY32(Structure):
    """ctypes mirror of the Win32 MODULEENTRY32 structure (tlhelp32.h),
    filled in by Module32First / Module32Next when walking a
    TH32CS_SNAPMODULE snapshot."""
    _fields_ = [
        ('dwSize',        DWORD),          # must be set to sizeof(MODULEENTRY32) before use
        ('th32ModuleID',  DWORD),
        ('th32ProcessID', DWORD),
        ('GlblcntUsage',  DWORD),
        ('ProccntUsage',  DWORD),
        ('modBaseAddr',   POINTER(BYTE)),  # module load address in the target process
        ('modBaseSize',   DWORD),
        ('hModule',       HMODULE),
        ('szModule',      c_char * 256),   # MAX_MODULE_NAME32 + 1
        ('szExePath',     c_char * 260),   # MAX_PATH
    ]
def GetBaseAddr(ProcId, ProcName):
    """Return the load address of module *ProcName* in process *ProcId*.

    Walks a TH32CS_SNAPMODULE snapshot of the process and returns the
    integer value of the matching module's modBaseAddr. Returns the
    string 'Error' if the snapshot could not be taken (preserving the
    original contract) and None if no module matched.
    """
    me32 = MODULEENTRY32()
    me32.dwSize = sizeof(me32)  # required by the Tool Help API
    hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPMODULE, ProcId)
    if GetLastError() != 0:
        CloseHandle(hSnapshot)
        print('Handle Error %s' % WinError())
        return 'Error'
    # szModule is bytes in Python 3; normalize the name we compare against.
    target = ProcName.encode() if isinstance(ProcName, str) else ProcName
    try:
        # Iterate all modules; the original only compared the first entry
        # and then advanced without re-checking consistently.
        ret = Module32First(hSnapshot, byref(me32))
        while ret:
            if me32.szModule == target:
                # BUG FIX: the original returned id(me32.modBaseAddr) — the
                # CPython object id of the pointer *wrapper*, which differs
                # on every run. Return the pointer's actual numeric value.
                return cast(me32.modBaseAddr, c_void_p).value
            ret = Module32Next(hSnapshot, byref(me32))
        print("Couldn't find Process with name %s" % ProcName)
    finally:
        # Always release the snapshot handle (the original leaked it on
        # some paths).
        CloseHandle(hSnapshot)
def GetProcessIdByName(pName):
    """Return the PID of the first process whose image name is *pName*.

    '.exe' is appended when missing. Returns None when no process
    matches or when EnumProcesses fails.
    """
    if not pName.endswith('.exe'):
        pName = pName + '.exe'
    enum_result = EnumProcesses()
    if enum_result is None:
        return None
    ProcessIds, BytesReturned = enum_result
    # '//': the original used '/', which yields a float in Python 3 and
    # makes range() raise TypeError.
    for index in range(BytesReturned // ctypes.sizeof(ctypes.wintypes.DWORD)):
        ProcessId = ProcessIds[index]
        hProcess = ctypes.windll.kernel32.OpenProcess(PROCESS_QUERY_INFORMATION, False, ProcessId)
        if hProcess:
            try:
                ImageFileName = (ctypes.c_char * MAX_PATH)()
                if ctypes.windll.psapi.GetProcessImageFileNameA(hProcess, ImageFileName, MAX_PATH) > 0:
                    filename = os.path.basename(ImageFileName.value)
                    # GetProcessImageFileNameA fills bytes; compare bytes.
                    if filename == pName.encode():
                        return ProcessId
            finally:
                # BUG FIX: the original returned the PID *before* reaching
                # CloseHandle, leaking the process handle on every match.
                CloseHandle(hProcess)
def EnumProcesses():
    """Return (DWORD-array, bytes_returned) listing all PIDs, or None on failure.

    Grows the buffer geometrically until the Win32 EnumProcesses call
    reports fewer bytes than the buffer holds (i.e. the listing fit).
    """
    count = 32
    while True:
        ProcessIds = (ctypes.wintypes.DWORD * count)()
        cb = ctypes.sizeof(ProcessIds)
        BytesReturned = ctypes.wintypes.DWORD()
        if not ctypes.windll.Psapi.EnumProcesses(ctypes.byref(ProcessIds), cb, ctypes.byref(BytesReturned)):
            return None
        if BytesReturned.value < cb:
            return ProcessIds, BytesReturned.value
        # Buffer was filled completely — it may have been too small; the
        # original had an unreachable 'break' here as well.
        count *= 2
# Script entry point: look up RocketLeague.exe and print its base address.
# NOTE(review): Python 2 print statements; indentation lost in the paste.
if __name__ == '__main__':
ProcId = GetProcessIdByName('RocketLeague.exe')
#print ProcId
print hex(GetBaseAddr(ProcId, 'RocketLeague.exe'))
#print hex(GetBaseAddr(8252,'RocketLeague.exe'))
Now my understanding of memory isn't the greatest, but I'd figure that the base address should be static while a program is running. When I do get this code to run, the ModBaseAddr I get back changes every time I run it. Another weird Issue I'm having is that without that print ProcId statement, running the program returns an ERROR_ACCESS_DENIED (error 5) from line 41 (This has something to do with the CreateToolhelp32Snapshot function I assume as I have admin rights on the computer). With the print statement, however, the program runs through giving me a different ModBaseAddr every time. If I feed the GetBaseAddr function the ProcessId manually it also works without the print statement, again however, it's giving me a random address every time.
If anyone could provide me any help or point me in the right direction I'd really appreciate it!
Clarification: MODULEENTRY32 stores information about modules, not processes. when you call CreateToolhelp32Snapshot using TH32CS_SNAPMODULE you are getting modules loaded by the process, not processes themselves.
Instead of getting the MODULEENTRY32 in combination with EnumProcesses you can instead use CreateToolHelp32Snapshot with TH32CS_SNAPPROCESS to get a list of processes in the form of PROCESSENRTY32 structs, which also contains the process identifier.
Despite being a user with administrator privileges, you must also run the process as an administrator.
You should also ensure you're initializing your MODULEENTRY32 to {0} for proper error handling and not running into an issue of the returned value being subject to undefined behavior of uninitialized memory.
I do not know the specific cause of your issue but I have used a source code for this purpose that is very robust that may be a plug and play alternative to what you're currently using, the important snippet will follow, but the full source is available here.
# Dump every module loaded by *ProcessID* using a TH32CS_SNAPMODULE
# snapshot. Returns True on success, False if Module32First failed.
# NOTE(review): Python 2 print statements; indentation lost in the paste.
def ListProcessModules( ProcessID ):
hModuleSnap = c_void_p(0)
me32 = MODULEENTRY32()
# dwSize must be initialized before Module32First, per the Tool Help API.
me32.dwSize = sizeof( MODULEENTRY32 )
hModuleSnap = CreateToolhelp32Snapshot( TH32CS_SNAPMODULE, ProcessID )
ret = Module32First( hModuleSnap, pointer(me32) )
if ret == 0 :
print 'ListProcessModules() Error on Module32First[%d]' % GetLastError()
CloseHandle( hModuleSnap )
return False
# Walk the snapshot until Module32Next reports no more entries.
while ret :
print " MODULE NAME: %s"% me32.szModule
print " executable = %s"% me32.szExePath
print " process ID = 0x%08X"% me32.th32ProcessID
print " ref count (g) = 0x%04X"% me32.GlblcntUsage
print " ref count (p) = 0x%04X"% me32.ProccntUsage
# NOTE(review): if modBaseAddr is declared as POINTER(BYTE) (as above),
# '%08X' needs an integer — presumably the linked source casts it or
# declares the field as an integer type; verify against that source.
print " base address = 0x%08X"% me32.modBaseAddr
print " base size = %d"% me32.modBaseSize
ret = Module32Next( hModuleSnap , pointer(me32) )
CloseHandle( hModuleSnap )
return True
I am working with a language where the modules are defined as
<module_name> <inst_name>(.<port_name> (<net_name>)….);
or
module1 inst1 ( .input a,
.output b;
port b=a;);
I want to find all such modules, while ignoring function calls .
I'm having difficulty with regex. I am looking for this
text1 text2 ( .text3; text4 );
note that all the spaces except the ones between text 1 and text2 are optional and might be new lines instead of spaces.text 3 and text4 can be multi lines but all are in the form of
text3 - >
.blah1 (blah2),
.blah3 (blah4)
text4->
blah1 blah2=xyz;
blah3 blah4=qwe;
I am trying to do
re.split(r"^[a-zA-Z]*\s[a-zA-Z]*\s?\n?\([a-zA-Z]*\s?\n?;[a-zA-Z]*\);", data)
Doesn't work, though. It just grabs everything. How do I fix it? Thanks!
I do need to grab everything individually, eventually (module/instances/port/nets). I think I can split it once regex is working.
I think you need to write a parser that understands enough of the language to at least canonicalize it before you try extracting information. You could write a simple parser by hand, or you could use a parsing framework such as PLY or others of that ilk.
To give you a more concrete idea about what I'm suggesting, consider
the following code, which defines a parse_data function that, given
the contents of a file, will yield a series of tokens recognized in
that file:
import re

# Token name -> pattern. Order matters: patterns are tried in insertion
# order, and 'identifier' (the broadest) is deliberately last.
tokens = {
    'lparen': r'\(',
    'rparen': r'\)',
    'comma': ',',
    'semicolon': ';',
    'whitespace': r'\s+',
    'equals': '=',
    'identifier': r'[.\d\w]+',
}
tokens = dict((k, re.compile(v)) for k, v in tokens.items())


def parse_data(data):
    """Yield (token_name, matched_text) pairs recognized in *data*.

    Raises ValueError on input that matches no token — the original
    silently looped forever in that case, since *data* never shrank.
    """
    while data:
        for tn, tv in tokens.items():
            mo = tv.match(data)
            if mo:
                yield tn, mo.group(0)
                data = data[mo.end():]
                # Restart matching from the first (highest-priority)
                # pattern for the remaining text.
                break
        else:
            raise ValueError('unrecognized input: %r...' % data[:20])
Using this, you could write something that would put your sample input
into canonical form:
# Re-emit the token stream in canonical form: one statement per line,
# single spaces between tokens, a newline after each 'rparen ;' pair.
# NOTE(review): Python 2 print statements (trailing-comma form); also this
# calls parse(), but the tokenizer above is named parse_data() — presumably
# a rename slipped through; verify before running.
with open('inputfile') as fd:
data = fd.read()
last_token = (None, None)
for tn, tv in parse(data):
# Collapse whitespace to a single space, except right after ';'.
if tn == 'whitespace' and last_token[0] != 'semicolon':
print ' ',
elif tn == 'whitespace':
pass
# End the line when a statement terminator follows a close-paren.
elif tn == 'semicolon' and last_token[0] == 'rparen':
print tv
else:
print tv,
last_token = (tn, tv)
Given input like this:
module1 inst1 ( .input a,
.output b;
port b=a;);
module2 inst2 ( .input a, .output b; port b=a;);
module3 inst3 ( .input a, .output b;
port b=a;);
The above code would yield:
module1 inst1 ( .input a , .output b ; port b = a ; ) ;
module2 inst2 ( .input a , .output b ; port b = a ; ) ;
module3 inst3 ( .input a , .output b ; port b = a ; ) ;
Which, because it is in standard form, would be much more amendable to
extracting information via simple pattern matching.
Note that while this code relies on reading the entire source file
into memory first, you could fairly easily write code that you parse a
file in fragments if you were concerned about memory utilization.
I am a Python re-newbie. I would like advice on handling program parameters which are in a file in json format. Currently, I am doing something like what is shown below, however, it seems too wordy, and the idea of typing the same literal string multiple times (sometimes with dashes and sometimes with underscores) seems juvenile - error prone - stinky... :-) (I do have many more parameters!)
#!/usr/bin/env python
# Load run parameters from a JSON control file named on the command line,
# mapping hyphenated JSON keys to underscored Python variables by hand.
# NOTE(review): Python 2 ('print err'); indentation lost in the paste.
import sys
import os
import json ## for control file parsing
# control parameters
mpi_nodes = 1
cluster_size = None
initial_cutoff = None
# ...
#process the arguments
if len(sys.argv) != 2:
raise Exception(
"""Usage:
run_foo <controls.json>
Where:
<control.json> is a dictionary of run parameters
"""
)
# We expect a .json file with our parameters
controlsFileName = sys.argv[1]
err = ""
err += "" #validateFileArgument(controlsFileName, exists=True)
# read in the control parameters from the .json file
# NOTE(review): bare 'except' swallows everything, and if json.load fails
# 'controls' is unbound, so the lookups below raise NameError instead of
# reporting the accumulated err message.
try:
controls = json.load(open(controlsFileName, "r"))
except:
err += "Could not process the file '" + controlsFileName + "'!\n"
# check each control parameter. The first one is optional
if "mpi-nodes" in controls:
mpi_nodes = controls["mpi-nodes"]
else:
mpi_nodes = controls["mpi-nodes"] = 1
if "cluster-size" in controls:
cluster_size = controls["cluster-size"]
else:
err += "Missing control definition for \"cluster-size\".\n"
if "initial-cutoff" in controls:
initial_cutoff = controls["initial-cutoff"]
else:
err += "Missing control definition for \"initial-cutoff\".\n"
# ...
# Quit if any of these things were not true
if len(err) > 0:
print err
exit()
#...
This works, but it seems like there must be a better way. I am stuck with the requirements to use a json file and to use the hyphenated parameter names. Any ideas?
I was looking for something with more static binding. Perhaps this is as good as it gets.
Usually, we do things like this.
def get_parameters(some_file_name):
    """Load run parameters from the JSON file *some_file_name*.

    Returns a dict with underscored keys. Raises OSError (IOError) if
    the file cannot be opened, and KeyError naming the hyphenated key
    if a mandatory parameter is missing — matching the exceptions the
    caller below handles.
    """
    # BUG FIX: the original called json.loads() on the *filename* itself;
    # loads() parses a string, so the file was never opened and the
    # caller's 'except IOError' could never fire.
    with open(some_file_name) as source_file:
        source = json.load(source_file)
    return dict(
        mpi_nodes=source.get('mpi-nodes', 1),  # optional, defaults to 1
        cluster_size=source['cluster-size'],
        initial_cutoff=source['initial-cutoff'],
    )
# Driver: load the control file named on the command line, mapping the two
# failure modes to distinct exit codes.
# NOTE(review): Python 2 syntax throughout — print statements,
# 'except KeyError, e', and 'e.message' (removed in Python 3).
controlsFileName= sys.argv[1]
try:
params = get_parameters( controlsFileName )
except IOError:
print "Could not process the file '{0}'!".format( controlsFileName )
sys.exit( 1 )
except KeyError, e:
print "Missing control definition for '{0}'.".format( e.message )
sys.exit( 2 )
At the end, params['mpi_nodes'] has the value of mpi_nodes.
If you want a simple variable, you do this. mpi_nodes = params['mpi_nodes']
If you want a namedtuple, change get_parameters like this
def get_parameters(some_file_name):
    """Load run parameters from the JSON file *some_file_name* into a
    namedtuple.

    Missing mandatory keys surface as KeyError with the hyphenated name.
    """
    # Local import: the surrounding snippet never imports namedtuple.
    from collections import namedtuple
    # BUG FIX: the original referenced 'source' without ever defining it;
    # read and parse the file here.
    with open(some_file_name) as source_file:
        source = json.loads(source_file.read())
    Parameters = namedtuple('Parameters', 'mpi_nodes, cluster_size, initial_cutoff')
    return Parameters(
        source.get('mpi-nodes', 1),  # optional, defaults to 1
        source['cluster-size'],
        source['initial-cutoff'],
    )
I don't know if you'd find that better or not.
the argparse library is nice, it can handle most of the argument parsing and validation for you as well as printing pretty help screens
[1] http://docs.python.org/dev/library/argparse.html
I will knock up a quick demo showing how you'd want to use it this arvo.
Assuming you have many more parameters to process, something like this could work:
# Convert a hyphenated JSON key to a valid Python identifier.
def underscore(s):
return s.replace('-','_')
# NOTE(review): the loops below reference 'controls' and 'err' from the
# question's surrounding script — this snippet is not standalone. Writing
# into globals() also makes the variables hard to trace; see the
# 'settings' dict variant below for the explicit form.
# parameters with default values
for name, default in (("mpi-nodes", 1),):
globals()[underscore(name)] = controls.get(name, default)
# mandatory parameters
for name in ("cluster-size", "initial-cutoff"):
try:
globals()[underscore(name)] = controls[name]
except KeyError:
err += "Missing control definition for %r" % name
Instead of manipulating globals, you can also make this more explicit:
# Convert a hyphenated JSON key to a valid Python identifier.
def underscore(s):
return s.replace('-','_')
# Explicit variant: collect values into a dict instead of mutating
# globals(), then unpack into named variables at the end.
# NOTE(review): still depends on 'controls' and 'err' from the question's
# surrounding script; not standalone.
settings = {}
# parameters with default values
for name, default in (("mpi-nodes", 1),):
settings[underscore(name)] = controls.get(name, default)
# mandatory parameters
for name in ("cluster-size", "initial-cutoff"):
try:
settings[underscore(name)] = controls[name]
except KeyError:
err += "Missing control definition for %r" % name
# print out err if necessary
mpi_nodes = settings['mpi_nodes']
cluster_size = settings['cluster_size']
initial_cutoff = settings['initial_cutoff']
I learned something from all of these responses - thanks! I would like to get feedback on my approach which incorporates something from each suggestion. In addition to the conditions imposed by the client, I want something:
1) that is fairly obvious to use and to debug
2) that is easy to maintain and modify
I decided to incorporate str.replace, namedtuple, and globals(), creating a ControlParameters namedtuple in the globals() namespace.
#!/usr/bin/env python
import sys
import os
import collections
import json
def get_parameters(parameters_file_name):
    """
    Access all of the control parameters from the json filename given. A
    variable of type namedtuple named "ControlParameters" is injected
    into the global namespace. Parameter validation is not performed. Both
    the names and the defaults, if any, are defined herein. Parameters not
    found in the json file will get values of None (except mpi_nodes,
    which defaults to 1).

    Parameter usage example: ControlParameters.cluster_size
    """
    # Close the file deterministically (the original leaked the handle).
    with open(parameters_file_name, "r") as parameters_file:
        parameterValues = json.load(parameters_file)

    Parameters = collections.namedtuple(
        'Parameters',
        'mpi_nodes cluster_size initial_cutoff truncation_length',
    )
    # The JSON keys are the hyphenated forms of the field names. Looking
    # them up in a loop replaces the original's brittle _fields_[0..3]
    # indexing, so adding a field needs only one edit above.
    defaults = {'mpi_nodes': 1}
    parameters = Parameters(*(
        parameterValues.get(field.replace('_', '-'), defaults.get(field))
        for field in Parameters._fields
    ))
    globals()["ControlParameters"] = parameters
# Driver: validate argv, load the control file, then report any parameters
# that came back as None (i.e. were missing from the JSON file).
# NOTE(review): Python 2 prints ('print err', 'print "Done"'); '== None'
# should idiomatically be 'is None'. Indentation lost in the paste.
#process the program argument(s)
err = ""
if len(sys.argv) != 2:
raise Exception(
"""Usage:
foo <control.json>
Where:
<control.json> is a dictionary of run parameters
"""
)
# We expect a .json file with our parameters
parameters_file_name = sys.argv[1]
err += "" #validateFileArgument(parameters_file_name, exists=True)
if err == "":
get_parameters(parameters_file_name)
# get_parameters() injected ControlParameters into globals() above.
cp_dict = ControlParameters._asdict()
for name in ControlParameters._fields:
if cp_dict[name] == None:
err += "Missing control parameter '%s'\r\n" % name
print err
print "Done"