dpkt source code documentation project - python

Dpkt is a python packet creation and parsing library https://code.google.com/p/dpkt/
The project lacks documentation for beginners. I am trying to document it and make example sample code for all. Based on my knowledge of python, i am having difficulty understanding some of the source code. Here for example is python the RTP (Real Time Transport Protocol) module
https://code.google.com/p/dpkt/source/browse/trunk/dpkt/rtp.py #rtp.py source code
# $Id$
"""Real-Time Transport Protocol"""
from dpkt import Packet
# version 1100 0000 0000 0000 ! 0xC000 14
# p 0010 0000 0000 0000 ! 0x2000 13
# x 0001 0000 0000 0000 ! 0x1000 12
# cc 0000 1111 0000 0000 ! 0x0F00 8
# m 0000 0000 1000 0000 ! 0x0080 7
# pt 0000 0000 0111 1111 ! 0x007F 0
#
_VERSION_MASK= 0xC000
_P_MASK = 0x2000
_X_MASK = 0x1000
_CC_MASK = 0x0F00
_M_MASK = 0x0080
_PT_MASK = 0x007F
_VERSION_SHIFT=14
_P_SHIFT = 13
_X_SHIFT = 12
_CC_SHIFT = 8
_M_SHIFT = 7
_PT_SHIFT = 0
VERSION = 2
class RTP(Packet):
__hdr__ = (
('_type', 'H', 0x8000),
('seq', 'H', 0),
('ts', 'I', 0),
('ssrc', 'I', 0),
)
csrc = ''
def _get_version(self): return (self._type&_VERSION_MASK)>>_VERSION_SHIFT
def _set_version(self, ver):
self._type = (ver << _VERSION_SHIFT) | (self._type & ~_VERSION_MASK)
def _get_p(self): return (self._type & _P_MASK) >> _P_SHIFT
def _set_p(self, p): self._type = (p << _P_SHIFT) | (self._type & ~_P_MASK)
def _get_x(self): return (self._type & _X_MASK) >> _X_SHIFT
def _set_x(self, x): self._type = (x << _X_SHIFT) | (self._type & ~_X_MASK)
def _get_cc(self): return (self._type & _CC_MASK) >> _CC_SHIFT
def _set_cc(self, cc): self._type = (cc<<_CC_SHIFT)|(self._type&~_CC_MASK)
def _get_m(self): return (self._type & _M_MASK) >> _M_SHIFT
def _set_m(self, m): self._type = (m << _M_SHIFT) | (self._type & ~_M_MASK)
def _get_pt(self): return (self._type & _PT_MASK) >> _PT_SHIFT
def _set_pt(self, m): self._type = (m << _PT_SHIFT)|(self._type&~_PT_MASK)
version = property(_get_version, _set_version)
p = property(_get_p, _set_p)
x = property(_get_x, _set_x)
cc = property(_get_cc, _set_cc)
m = property(_get_m, _set_m)
pt = property(_get_pt, _set_pt)
def __len__(self):
return self.__hdr_len__ + len(self.csrc) + len(self.data)
def __str__(self):
return self.pack_hdr() + self.csrc + str(self.data)
def unpack(self, buf):
super(RTP, self).unpack(buf)
self.csrc = buf[self.__hdr_len__:self.__hdr_len__ + self.cc * 4]
self.data = buf[self.__hdr_len__ + self.cc * 4:]
With this code, i was able to do the following in IPython Shell
[37] import dpkt
[38] rtp_pkt=dpkt.rtp.RTP()
[39] rtp_pkt.pack_hdr()
Out[39]: '\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
[47] rtp_pkt.data="HelloWorld"
[48] rtp_pkt.pack()
Out[48]: '\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00HelloWorld'
Based on my understanding of Classes in Python, there should be "init" function in the class that i don't see in rtp.py? I am wondering how the above Ipython Shell commands worked? Why does the hdr variable in rtp.py starts with double underscore "__" and why every class method precedes with single underscore "_". I know this could be to make it private or semi-private but does it have to be that way?
I know that RTP class is derived from Packet whose source code is also pasted here for convenience.
# $Id$
"""Simple packet creation and parsing."""
import copy, itertools, socket, struct
class Error(Exception): pass
class UnpackError(Error): pass
class NeedData(UnpackError): pass
class PackError(Error): pass
class _MetaPacket(type):
def __new__(cls, clsname, clsbases, clsdict):
t = type.__new__(cls, clsname, clsbases, clsdict)
st = getattr(t, '__hdr__', None)
if st is not None:
# XXX - __slots__ only created in __new__()
clsdict['__slots__'] = [ x[0] for x in st ] + [ 'data' ]
t = type.__new__(cls, clsname, clsbases, clsdict)
t.__hdr_fields__ = [ x[0] for x in st ]
t.__hdr_fmt__ = getattr(t, '__byte_order__', '>') + \
''.join([ x[1] for x in st ])
t.__hdr_len__ = struct.calcsize(t.__hdr_fmt__)
t.__hdr_defaults__ = dict(zip(
t.__hdr_fields__, [ x[2] for x in st ]))
return t
class Packet(object):
"""Base packet class, with metaclass magic to generate members from
self.__hdr__.
__hdr__ should be defined as a list of (name, structfmt, default) tuples
__byte_order__ can be set to override the default ('>')
Example::
>>> class Foo(Packet):
... __hdr__ = (('foo', 'I', 1), ('bar', 'H', 2), ('baz', '4s', 'quux'))
...
>>> foo = Foo(bar=3)
>>> foo
Foo(bar=3)
>>> str(foo)
'\x00\x00\x00\x01\x00\x03quux'
>>> foo.bar
3
>>> foo.baz
'quux'
>>> foo.foo = 7
>>> foo.baz = 'whee'
>>> foo
Foo(baz='whee', foo=7, bar=3)
>>> Foo('hello, world!')
Foo(baz=' wor', foo=1751477356L, bar=28460, data='ld!')
"""
__metaclass__ = _MetaPacket
def __init__(self, *args, **kwargs):
"""Packet constructor with ([buf], [field=val,...]) prototype.
Arguments:
buf -- optional packet buffer to unpack
Optional keyword arguments correspond to members to set
(matching fields in self.__hdr__, or 'data').
"""
self.data = ''
if args:
try:
self.unpack(args[0])
except struct.error:
if len(args[0]) < self.__hdr_len__:
raise NeedData
raise UnpackError('invalid %s: %r' %
(self.__class__.__name__, args[0]))
else:
for k in self.__hdr_fields__:
setattr(self, k, copy.copy(self.__hdr_defaults__[k]))
for k, v in kwargs.iteritems():
setattr(self, k, v)
def __len__(self):
return self.__hdr_len__ + len(self.data)
def __getitem__(self, k):
try: return getattr(self, k)
except AttributeError: raise KeyError
def __repr__(self):
l = [ '%s=%r' % (k, getattr(self, k))
for k in self.__hdr_defaults__
if getattr(self, k) != self.__hdr_defaults__[k] ]
if self.data:
l.append('data=%r' % self.data)
return '%s(%s)' % (self.__class__.__name__, ', '.join(l))
def __str__(self):
return self.pack_hdr() + str(self.data)
def pack_hdr(self):
"""Return packed header string."""
try:
return struct.pack(self.__hdr_fmt__,
*[ getattr(self, k) for k in self.__hdr_fields__ ])
except struct.error:
vals = []
for k in self.__hdr_fields__:
v = getattr(self, k)
if isinstance(v, tuple):
vals.extend(v)
else:
vals.append(v)
try:
return struct.pack(self.__hdr_fmt__, *vals)
except struct.error, e:
raise PackError(str(e))
def pack(self):
"""Return packed header + self.data string."""
return str(self)
def unpack(self, buf):
"""Unpack packet header fields from buf, and set self.data."""
for k, v in itertools.izip(self.__hdr_fields__,
struct.unpack(self.__hdr_fmt__, buf[:self.__hdr_len__])):
setattr(self, k, v)
self.data = buf[self.__hdr_len__:]
# XXX - ''.join([(len(`chr(x)`)==3) and chr(x) or '.' for x in range(256)])
__vis_filter = """................................ !"#$%&\'()*+,-./0123456789:;<=>?#ABCDEFGHIJKLMNOPQRSTUVWXYZ[.]^_`abcdefghijklmnopqrstuvwxyz{|}~................................................................................................................................."""
def hexdump(buf, length=16):
"""Return a hexdump output string of the given buffer."""
n = 0
res = []
while buf:
line, buf = buf[:length], buf[length:]
hexa = ' '.join(['%02x' % ord(x) for x in line])
line = line.translate(__vis_filter)
res.append(' %04d: %-*s %s' % (n, length * 3, hexa, line))
n += length
return '\n'.join(res)
try:
import dnet
def in_cksum_add(s, buf):
return dnet.ip_cksum_add(buf, s)
def in_cksum_done(s):
return socket.ntohs(dnet.ip_cksum_carry(s))
except ImportError:
import array
def in_cksum_add(s, buf):
n = len(buf)
cnt = (n / 2) * 2
a = array.array('H', buf[:cnt])
if cnt != n:
a.append(struct.unpack('H', buf[-1] + '\x00')[0])
return s + sum(a)
def in_cksum_done(s):
s = (s >> 16) + (s & 0xffff)
s += (s >> 16)
return socket.ntohs(~s & 0xffff)
def in_cksum(buf):
"""Return computed Internet checksum."""
return in_cksum_done(in_cksum_add(0, buf))
The question is how to really understand the source code so its documentation is done correctly?

Related

Two apparently identical dataclasses are not equal

I've defined the following dataclass:
"""This module declares the SubtitleItem dataclass."""
import re
from dataclasses import dataclass
from time_utils import Timestamp
#dataclass
class SubtitleItem:
"""Class for storing all the information for
a subtitle item."""
index: int
start_time: Timestamp
end_time: Timestamp
text: str
#staticmethod
def load_from_text_item(text_item: str) -> "SubtitleItem":
"""Create new subtitle item from their .srt file text.
Example, if your .srt file contains the following subtitle item:
```
3
00:00:05,847 --> 00:00:06,916
The robot.
```
This function will return:
```
SubtitleItem(
index=3,
start_time=Timestamp(seconds=5, milliseconds=847),
end_time=Timestamp(seconds=6, milliseconds=916),
text='The robot.')
```
Args:
text_item (str): The .srt text for a subtitle item.
Returns:
SubtitleItem: A corresponding SubtitleItem.
"""
# Build regex
index_re = r"\d+"
timestamp = lambda prefix: rf"(?P<{prefix}_hours>\d\d):" + \
rf"(?P<{prefix}_minutes>\d\d):" + \
rf"(?P<{prefix}_seconds>\d\d)," + \
rf"(?P<{prefix}_milliseconds>\d\d\d)"
start_timestamp_re = timestamp("start")
end_timestamp_re = timestamp("end")
text_re = r".+"
complete_re = f"^(?P<index>{index_re})\n"
complete_re += f"{start_timestamp_re} --> {end_timestamp_re}\n"
complete_re += f"(?P<text>{text_re})$"
regex = re.compile(complete_re)
# Match and extract groups
match = regex.match(text_item)
if match is None:
raise ValueError(f"Index item invalid format:\n'{text_item}'")
groups = match.groupdict()
# Extract values
index = int(groups['index'])
group_items = filter(lambda kv: kv[0].startswith("start_"), groups.items())
args = { k[len("start_"):]: int(v) for k, v in group_items }
start = Timestamp(**args)
group_items = filter(lambda kv: kv[0].startswith("end_"), groups.items())
args = { k[len("end_"):]: int(v) for k, v in group_items }
end = Timestamp(**args)
text = groups['text']
if start >= end:
raise ValueError(
f"Start timestamp must be later than end timestamp: start={start}, end={end}")
return SubtitleItem(index, start, end, text)
#staticmethod
def _format_timestamp(t: Timestamp) -> str:
"""Format a timestamp in the .srt format.
Args:
t (Timestamp): The timestamp to convert.
Returns:
str: The textual representation for the .srt format.
"""
return f"{t.get_hours()}:{t.get_minutes()}:{t.get_seconds()},{t.get_milliseconds()}"
def __str__(self):
res = f"{self.index}\n"
res += f"{SubtitleItem._format_timestamp(self.start_time)}"
res += " --> "
res += f"{SubtitleItem._format_timestamp(self.end_time)}\n"
res += self.text
return res
... which I use in the following test:
import unittest
from src.subtitle_item import SubtitleItem
from src.time_utils import Timestamp
class SubtitleItemTest(unittest.TestCase):
def testLoadFromText(self):
text = "21\n01:02:03,004 --> 05:06:07,008\nTest subtitle."
res = SubtitleItem.load_from_text_item(text)
exp = SubtitleItem(
21, Timestamp(hours=1, minutes=2, seconds=3, milliseconds=4),
Timestamp(hours=5, minutes=6, seconds=7, milliseconds=8),
"Test subtitle."
)
self.assertEqual(res, exp)
This test fails, but I don't understand why.
I've checked with the debugger: exp and res have exactly the same fields. The Timestamp class is another separate dataclass. I've checked equality per field manually in the debugger, all fields are identical:
>>> exp == res
False
>>> exp.index == res.index
True
>>> exp.start_time == res.start_time
True
>>> exp.end_time == res.end_time
True
>>> exp.text == res.text
True
Furthermore, asdict() on each object returns identical dictionaries:
>>> dataclasses.asdict(exp) == dataclasses.asdict(res)
True
Is there something I'm misunderstanding regarding the implementation of the equality operator with dataclasses?
Thanks.
EDIT: my time_utils module, sorry for not including that earlier
"""
This module declares the Delta and Timestamp classes.
"""
from dataclasses import dataclass
#dataclass(frozen=True)
class _TimeBase:
hours: int = 0
minutes: int = 0
seconds: int = 0
milliseconds: int = 0
def __post_init__(self):
BOUNDS_H = range(0, 100)
BOUNDS_M = range(0, 60)
BOUNDS_S = range(0, 60)
BOUNDS_MS = range(0, 1000)
if self.hours not in BOUNDS_H:
raise ValueError(
f"{self.hours=} not in [{BOUNDS_H.start, BOUNDS_H.stop})")
if self.minutes not in BOUNDS_M:
raise ValueError(
f"{self.minutes=} not in [{BOUNDS_M.start, BOUNDS_M.stop})")
if self.seconds not in BOUNDS_S:
raise ValueError(
f"{self.seconds=} not in [{BOUNDS_S.start, BOUNDS_S.stop})")
if self.milliseconds not in BOUNDS_MS:
raise ValueError(
f"{self.milliseconds=} not in [{BOUNDS_MS.start, BOUNDS_MS.stop})")
def _to_ms(self):
return self.milliseconds + 1000 * (self.seconds + 60 * (self.minutes + 60 * self.hours))
#dataclass(frozen=True)
class Delta(_TimeBase):
"""A time difference, with milliseconds accuracy.
Must be less than 100h long."""
sign: int = 1
def __post_init__(self):
if self.sign not in (1, -1):
raise ValueError(
f"{self.sign=} should either be 1 or -1")
super().__post_init__()
def __add__(self, other: "Delta") -> "Delta":
self_ms = self.sign * self._to_ms()
other_ms = other.sign * other._to_ms()
ms_sum = self_ms + other_ms
sign = -1 if ms_sum < 0 else 1
ms_sum = abs(ms_sum)
ms_n, s_rem = ms_sum % 1000, ms_sum // 1000
s_n, m_rem = s_rem % 60, s_rem // 60
m_n, h_n = m_rem % 60, m_rem // 60
return Delta(hours=h_n, minutes=m_n, seconds=s_n, milliseconds=ms_n, sign=sign)
#dataclass(frozen=True)
class Timestamp(_TimeBase):
"""A timestamp with milliseconds accuracy. Must be
less than 100h long."""
def __add__(self, other: Delta) -> "Timestamp":
ms_sum = self._to_ms() + other.sign * other._to_ms()
ms_n, s_rem = ms_sum % 1000, ms_sum // 1000
s_n, m_rem = s_rem % 60, s_rem // 60
m_n, h_n = m_rem % 60, m_rem // 60
return Timestamp(hours=h_n, minutes=m_n, seconds=s_n, milliseconds=ms_n)
def __ge__(self, other: "Timestamp") -> bool:
return self._to_ms() >= other._to_ms()
class Timestamp:
def __init__( self, hours=0, minutes=0, seconds=0, milliseconds=0 ):
self.ms = ((hours*60+minutes)*60+seconds)*1000+milliseconds
def get_hours(self):
return self.ms // (60*60*1000)
def get_minutes(self):
return (self.ms // (60*1000)) % 60
def get_seconds(self):
return (self.ms // 1000) % 60
def get_milliseconds(self):
return self.ms % 1000
def __add__(self,other):
return Timestamp(milliseconds=self.ms + self.other)
def __eq__(self,other):
return self.ms == other.ms
def __lt__(self,other):
return self.ms < other.ms
def __le__(self,other):
return self.ms <= other.ms
... your code ...
text = "21\n01:02:03,004 --> 05:06:07,008\nTest subtitle."
res = SubtitleItem.load_from_text_item(text)
exp = SubtitleItem(
21, Timestamp(hours=1, minutes=2, seconds=3, milliseconds=4),
Timestamp(hours=5, minutes=6, seconds=7, milliseconds=8),
"Test subtitle."
)
print(res)
print(exp)
print(res==exp)
Produces:
21
1:2:3,4 --> 5:6:7,8
Test subtitle.
21
1:2:3,4 --> 5:6:7,8
Test subtitle.
True
with no assert exception.
Okay, I think I found what's going wrong here.
First, I made a mistake when I reported the issue before: in the unit test, exp.start_time != res.start_time and exp.end_time != res.end_time. Sorry about that. That narrows down the issue to comparison of timestamps.
My sources are in project/src/, the test that fails is in project/tests/. To make source modules accessible to the test, I had to add the source directory to PYTHONPATH:
$ PYTHONPATH=src/ python -m unittest discover -s tests/ -v
In the unit test, even though res.start_time and end.start_time do have the same fields, they do not have the same type:
>>> print(type(res.start_time), type(exp.start_time))
<class 'time_utils.Timestamp'> <class 'src.time_utils.Timestamp'>
I've added a new post with a minimally reproducible example, and more details about the file structure here: Minimally reproducible example.

How to implement Rope data structure split operation

I was reading about Rope(or cord) data structure https://en.wikipedia.org/wiki/Rope_(data_structure) and trying to implement it, but I am struggling to implement the split operation. I tried to look it up but all related answers I was able to find were incorrect.
Below is the split operation:
We want to find the character and return two nodes before and after the split. For example, if we want to split at index 5 of'MyNameIsSimon' then we should return the root of two ropes 'MyName' and 'IsSimon' respectively. Finding the index is easy as given by the pseudo-code in wiki. But I'm struggling the split part especially how to join and return the 2nd half as a new rope. Anyone can help with pseudo-code or any language is much appreciated.
Wikipedia’s diagram looks muddled to me. Here’s a working implementation in Python (without balancing).
class Leaf:
def __init__(self, s):
self._s = s
def __len__(self):
return len(self._s)
def __str__(self):
return self._s
def inspect(self, indent=0):
print(" " * indent + repr(self._s))
def split(self, i):
return Leaf(self._s[:i]), Leaf(self._s[i:])
class Branch:
def __init__(self, a, b):
self._a = a
self._b = b
self._l = len(a) + len(b)
def __len__(self):
return self._l
def __str__(self):
return str(self._a) + str(self._b)
def inspect(self, indent=0):
self._a.inspect(indent + 2)
print(" " * indent + str(len(self._a)))
self._b.inspect(indent + 2)
def split(self, i):
if i < len(self._a):
a0, a1 = self._a.split(i)
return a0, Branch(a1, self._b)
elif i == len(self._a):
return self._a, self._b
else:
assert i > len(self._a)
b0, b1 = self._b.split(i - len(self._a))
return Branch(self._a, b0), b1
def make_test_rope():
e = Leaf("Hello ")
f = Leaf("my ")
c = Branch(e, f)
j = Leaf("na")
k = Leaf("me i")
g = Branch(j, k)
m = Leaf("s")
n = Leaf(" Simon")
h = Branch(m, n)
d = Branch(g, h)
b = Branch(c, d)
a = Branch(b, Leaf(""))
return a
def test():
a = make_test_rope()
a.inspect()
b, c = a.split(11)
print("--")
b.inspect()
print("--")
c.inspect()
test()
Output:
'Hello '
6
'my '
9
'na'
2
'me i'
6
's'
1
' Simon'
22
''
--
'Hello '
6
'my '
9
'na'
--
'me i'
4
's'
1
' Simon'
11
''

How can I get the length of a python ctypes structure member for formatting of hex output?

Using this code:
from ctypes import *
class Tracerec(BigEndianStructure):
def __repr__(self):
textlist = list()
for name, *dtype in self._fields_:
value = getattr(self, name)
if type(value) == int: textlist.append(f'{name}:0x{value:x}')
else: textlist.append(f'{name}:{value}')
fulltext = ' '.join(textlist)
return f'<{self.__class__.__name__}={fulltext}>'
def __getitem__(self, i):
if type(i)==str: return getattr(self, i)
return getattr(self, self._fields_[i][0])
def __len__(self):
return len(self._fields_)
class Mystruct(Tracerec):
_fields_ = [
('a', c_uint16),
('b', c_uint16),
('c', c_uint32),
]
buffer = b'\x01\x02\x03\x04\x00\x00\x05\x06'
x = Mystruct.from_buffer_copy(buffer)
x
I get this output:
<Mystruct=a:0x102 b:0x304 c:0x506>
But I would like it to format to the ctypes byte length but sizeof(dtype) only returns this type has no size. Example of desired output:
<Mystruct=a:0x0102 b:0x0304 c:0x00000506>
Try sizeof(*dtype) and 0 padding left:
...
for name, *dtype in self._fields_:
value = getattr(self, name)
size = sizeof(*dtype) * 2
if type(value) == int: textlist.append(f'{name}:0x{value:0{size}x}')
...
Here is a my final solution (hopefully), which also supports the use of bitsize and Array types will be exploded too:
class Tracerec(BigEndianStructure):
# do not define _fields_ in this parent class
def _format_value(self, value, dtype):
if isinstance(value, Array):
# Use Array undocumented _type_:
text = ','.join([ self._format_value(x, value._type_) for x in value ])
return f'[{text}]'
elif type(value) == int:
size = sizeof(dtype) * 2 # size mutliply by byte width
return f'0x{value:0{size}x}'
else:
return f'{value}'
def _format_field(self, field):
name, dtype, *bitsize = field
value = getattr(self, name)
return f'{name}:{self._format_value(value, dtype)}'
def __repr__(self):
text = ' '.join( [ self._format_field(x) for x in self._fields_ ] )
return f'<{self.__class__.__name__}={text}>'
def __getitem__(self, i):
if type(i)==str: return getattr(self, i)
return getattr(self, self._fields_[i][0])
def __len__(self):
return len(self._fields_)
class Mystruct(Tracerec):
_fields_ = [
('a', c_uint16),
('b', c_uint16,14),
('c', c_uint32),
('d', c_uint16 * 3)
]
buffer = b'\x01\x02\x03\x04\x00\x00\x05\x06\x07\x07\x08\x08\x09\x09\0x0\0x0'
x = Mystruct.from_buffer_copy(buffer)
x
And the output:
<Mystruct=a:0x0102 b:0x00c1 c:0x00000506 d:[0x0707,0x0808,0x0909]>

make a package support python 3

the original code which only support python 2 is here
link to thinkgear.py
I'm trying to edit it to support python 3. the edited code is here:
import sys
import serial
from io import BytesIO
import struct
from collections import namedtuple
import logging
import logging.handlers
import sys
import time
import datetime
global delta
delta = []
_log = logging.getLogger(__name__)
_bytelog = logging.getLogger(__name__+'.bytes')
_bytelog.propagate = False
fh = logging.FileHandler('spam.log')
fh.setLevel(logging.DEBUG)
_log.addHandler(fh)
class ThinkGearProtocol(object):
def __init__(self, port):
self.serial = serial.Serial(port, 57600)
self.preread = BytesIO()
self.io = self.serial
#staticmethod
def _chksum(packet):
return ~sum(c for c in packet ) & 0xff
def _read(self, n):
buf = self.io.read(n)
if len(buf) < n:
_log.debug('incomplete read, short %s bytes', n - len(buf))
if self.io == self.preread:
_log.debug('end of preread buffer')
# self.preread.reset()
# self.preread.truncate()
# two line comment out
self.io = self.serial
buf += self.io.read(n-len(buf))
if len(buf) < n:
_log.debug('incomplete read, short %s bytes', n - len(buf))
for o in range(0, len(buf), 16):
_bytelog.debug('%04X '+' '.join(('%02X',)*len(buf[o:o+16])), o, *(c for c in buf[o:o+16]))
return buf
def _deread(self, buf):
_log.debug('putting back %s bytes', len(buf))
pos = self.preread.tell()
self.preread.seek(0, 2)
self.preread.write(buf)
self.preread.seek(pos)
self.io = self.preread
def get_packets(self):
last_two = ()
while True:
last_two = last_two[-1:]+(self._read(1),)
# _log.debug('last_two: %r', last_two)
if last_two == (b'\xAA',b'\xAA'):
plen = self._read(1)
if plen >= b'\xAA':
# Bogosity
_log.debug('discarding %r while syncing', last_two[0])
last_two = last_two[-1:]+(plen,)
else:
last_two = ()
packet = self._read(int.from_bytes((plen), byteorder='big'))
# _log.debug(plen)
checksum = self._read(1)
if ord(checksum) == self._chksum(packet):
yield self._decode(packet)
else:
_log.debug('bad checksum')
self._deread(packet+checksum)
elif len(last_two) == 2:
_log.debug('discarding %r while syncing', last_two[0])
def _decode(self, packet):
decoded = []
while packet:
extended_code_level = 0
while len(packet) and packet[0] == '\x55':
extended_code_level += 1
packet = packet[1:]
if len(packet) < 2:
_log.debug('ran out of packet: %r', '\x55'*extended_code_level+packet)
break
code = packet[0]
if code < 0x80:
value = packet[1]
packet = packet[2:]
else:
vlen = packet[1]
if len(packet) < 2+vlen:
_log.debug('ran out of packet: %r', '\x55'*extended_code_level+chr(code)+chr(vlen)+packet)
break
value = packet[2:2+vlen]
packet = packet[2+vlen:]
# _log.debug('extended_code_level is '+str(extended_code_level))
# _log.debug('code is '+str(code))
# _log.debug('data_types is '+str(data_types))
# _log.debug(not extended_code_level and code in data_types)
# _log.debug(not bool(extended_code_level and code in data_types))
# _log.debug((extended_code_level,code) in data_types)
if not bool(extended_code_level and code in data_types):
data = data_types[code](extended_code_level, code, value)
# _log.debug('extended_code_level is '+str(extended_code_level))
# _log.debug('code is '+str(code))
# _log.debug('value is '+str(value))
# _log.debug('data_types is '+str(data_types))
elif (extended_code_level,code) in data_types:
data = data_types[(extended_code_level,code)](extended_code_level, code, value)
else:
data = ThinkGearUnknownData(extended_code_level, code, value)
decoded.append(data)
return decoded
data_types = {}
class ThinkGearMetaClass(type):
def __new__(mcls, name, bases, data):
cls = super(ThinkGearMetaClass, mcls).__new__(mcls, name, bases, data)
code = getattr(cls, 'code', None)
if code:
data_types[code] = cls
extended_code_level = getattr(cls, 'extended_code_level', None)
if extended_code_level:
data_types[(extended_code_level,code)] = cls
return cls
class ThinkGearData(object, metaclass=ThinkGearMetaClass):
def __init__(self, extended_code_level, code, value):
self.extended_code_level = extended_code_level
self.code = code
# self.value = self._decode(value)
self.value = value
# _log.debug('123')
if self._log:
_log.log(self._log, '%s', self)
#staticmethod
def _decode(v):
return v
def __str__(self):
return self._strfmt % vars(self)
# __metaclass__ = ThinkGearMetaClass
_log = logging.DEBUG
class ThinkGearUnknownData(ThinkGearData):
'''???'''
_strfmt = 'Unknown: code=%(code)02X extended_code_level=%(extended_code_level)s %(value)r'
class ThinkGearPoorSignalData(ThinkGearData):
'''POOR_SIGNAL Quality (0-255)'''
code = 0x02
_strfmt = 'POOR SIGNAL: %(value)s'
_decode = staticmethod(ord)
class ThinkGearAttentionData(ThinkGearData):
'''ATTENTION eSense (0 to 100)'''
code = 0x04
_strfmt = 'ATTENTION eSense: %(value)s'
_decode = staticmethod(ord)
class ThinkGearMeditationData(ThinkGearData):
'''MEDITATION eSense (0 to 100)'''
code = 0x05
_strfmt = 'MEDITATION eSense: %(value)s'
_decode = staticmethod(ord)
class ThinkGearRawWaveData(ThinkGearData):
'''RAW Wave Value (-32768 to 32767)'''
code = 0x80
_strfmt = 'Raw Wave: %(value)s'
_decode = staticmethod(lambda v: struct.unpack('>h', v)[0])
# There are lots of these, don't log them by default
_log = False
EEGPowerData = namedtuple('EEGPowerData', 'delta theta lowalpha highalpha lowbeta highbeta lowgamma midgamma')
delta_value = namedtuple('EEGPowerData', 'delta')
class ThinkGearEEGPowerData(ThinkGearData):
'''Eight EEG band power values (0 to 16777215).
delta, theta, low-alpha high-alpha, low-beta, high-beta, low-gamma, and
mid-gamma EEG band power values.
'''
code = 0x83
_strfmt = 'ASIC EEG Power: %(value)r'
_decode = staticmethod(lambda v: EEGPowerData(*struct.unpack('>8L', ''.join( '\x00'+v[o:o+3] for o in range(0, 24, 3)))))
#print(EEGPowerData.delta)
def main():
global packet_log
packet_log = []
logging.basicConfig(level=logging.DEBUG)
for pkt in ThinkGearProtocol('COM3').get_packets():
packet_log.append(pkt)
if __name__ == '__main__':
main()
when running in python2, i get the result like this:
DEBUG:__main__:ASIC EEG Power: EEGPowerData(delta=7784, theta=7734, lowalpha=2035, highalpha=1979, lowbeta=2914, highbeta=3996, lowgamma=1944, midgamma=1847
when running in python3, the result is like this:
DEBUG:__main__:ASIC EEG Power: b'\x00\xa9\xf1\x00t%\x00\rK\x00\x18"\x00\x16%\x00\x1d6\x00OT\x00\x17\x84'
Anyone know how should i edit this line of code in order to make it work in python 3? Thank you
_decode = staticmethod(lambda v: EEGPowerData(*struct.unpack('>8L', ''.join( '\x00'+v[o:o+3] for o in range(0, 24, 3)))))

replace semicolon by newline in python code

I would like to parse Python code that contains semicolons ; for separating commands and produce code that replaces those by newlines \n. E.g., from
def main():
a = "a;b"; return a
I'd like to produce
def main():
a = "a;b"
return a
Any hints?
Use the tokenize library to look for token.OP tokens, where the second element is a ; *. Replace these tokens with a token.NEWLINE token.
You'd need to adjust your token offsets and generate matching indent too however; so after a NEWLINE you'd need to adjust line numbers (increment by an offset you increase for every NEWLINE you insert) and the 'next' line (remainder of the current line) would have to have the indices adjusted to match the current indentation level:
import tokenize
TokenInfo = getattr(tokenize, 'TokenInfo', lambda *a: a) # Python 3 compat
def semicolon_to_newline(tokens):
line_offset = 0
last_indent = None
col_offset = None # None or an integer
for ttype, tstr, (slno, scol), (elno, ecol), line in tokens:
slno, elno = slno + line_offset, elno + line_offset
if ttype in (tokenize.INDENT, tokenize.DEDENT):
last_indent = ecol # block is indented to this column
elif ttype == tokenize.OP and tstr == ';':
# swap out semicolon with a newline
ttype = tokenize.NEWLINE
tstr = '\n'
line_offset += 1
if col_offset is not None:
scol, ecol = scol - col_offset, ecol - col_offset
col_offset = 0 # next tokens should start at the current indent
elif col_offset is not None:
if not col_offset:
# adjust column by starting column of next token
col_offset = scol - last_indent
scol, ecol = scol - col_offset, ecol - col_offset
if ttype == tokenize.NEWLINE:
col_offset = None
yield TokenInfo(
ttype, tstr, (slno, scol), (elno, ecol), line)
with open(sourcefile, 'r') as source, open(destination, 'w') as dest:
generator = tokenize.generate_tokens(source.readline)
dest.write(tokenize.untokenize(semicolon_to_newline(generator)))
Note that I don't bother to correct the line value; it is informative only, the data that was read from the file is not actually used when un-tokenizing.
Demo:
>>> from io import StringIO
>>> source = StringIO('''\
... def main():
... a = "a;b"; return a
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
def main():
a = "a;b"
return a
and slightly more complex:
>>> source = StringIO('''\
... class Foo(object):
... def bar(self):
... a = 10; b = 11; c = 12
... if self.spam:
... x = 12; return x
... x = 15; return y
...
... def baz(self):
... return self.bar;
... # note, nothing after the semicolon
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
class Foo(object):
def bar(self):
a = 10
b = 11
c = 12
if self.spam:
x = 12
return x
x = 15
return y
def baz(self):
return self.bar
# note, nothing after the semicolon
>>> print(result.replace(' ', '.'))
class.Foo(object):
....def.bar(self):
........a.=.10
........b.=.11
........c.=.12
........if.self.spam:
............x.=.12
............return.x
........x.=.15
........return.y
....def.baz(self):
........return.self.bar
........
........#.note,.nothing.after.the.semicolon
* The Python 3 version of tokenize outputs more informative TokenInfo named tuples, which have an extra exact_type attribute that can be used instead of doing a text match: tok.exact_type == tokenize.SEMI. I kept the above compatible with Python 2 and 3 however.
Here's a pyparsing solution - see comments in the code below:
from pyparsing import Literal, restOfLine, quotedString, pythonStyleComment, line
SEMI = Literal(';')
patt = SEMI + restOfLine
patt.ignore(quotedString)
patt.ignore(pythonStyleComment)
def split_at(s, locs):
"""
break up s into pieces, given list of break locations
"""
current = 0
ret = []
for loc in locs:
ret.append(s[current:loc].lstrip())
current = loc+1
ret.append(s[current:].lstrip())
return ret
def split_on_semicolon(s,l,tokens):
"""
parse time callback, when finding first unquoted ';' on a line
"""
current_line = line(l,s)
line_body = current_line.lstrip()
indent = current_line.index(line_body)
indent = current_line[:indent]
# may be more than one ';' on this line, find them all
# (the second token contains everything after the ';')
remainder = tokens[1]
if remainder.strip():
all_semis = [s for _,s,_ in SEMI.scanString(remainder)]
# break line into pieces
pieces = split_at(remainder, all_semis)
# rejoin pieces, with leading indents
return '\n'+'\n'.join(indent+piece for piece in pieces)
else:
return ''
patt.addParseAction(split_on_semicolon)
sample = """
def main():
this_semi_does_nothing();
neither_does_this_but_there_are_spaces_afterward();
a = "a;b"; return a # this is a comment; it has a semicolon!
def b():
if False:
z=1000;b("; in quotes"); c=200;return z
return ';'
class Foo(object):
def bar(self):
'''a docstring; with a semicolon'''
a = 10; b = 11; c = 12
# this comment; has several; semicolons
if self.spam:
x = 12; return x # so; does; this; one
x = 15;;; y += x; return y
def baz(self):
return self.bar
"""
print(patt.transformString(sample))
Gives:
def main():
this_semi_does_nothing()
neither_does_this_but_there_are_spaces_afterward()
a = "a;b"
return a # this is a comment; it has a semicolon!
def b():
if False:
z=1000
b("; in quotes")
c=200
return z
return ';'
class Foo(object):
def bar(self):
'''a docstring; with a semicolon'''
a = 10
b = 11
c = 12
# this comment; has several; semicolons
if self.spam:
x = 12
return x # so; does; this; one
x = 15
y += x
return y
def baz(self):
return self.bar

Categories