Two apparently identical dataclasses are not equal

Two apparently identical dataclasses are not equal - python

I've defined the following dataclass:
"""This module declares the SubtitleItem dataclass."""
import re
from dataclasses import dataclass
from time_utils import Timestamp
@dataclass
class SubtitleItem:
    """Class for storing all the information for
    a subtitle item."""

    index: int
    start_time: Timestamp
    end_time: Timestamp
    text: str

    @staticmethod
    def load_from_text_item(text_item: str) -> "SubtitleItem":
        """Create new subtitle item from their .srt file text.

        Example, if your .srt file contains the following subtitle item:
        ```
        3
        00:00:05,847 --> 00:00:06,916
        The robot.
        ```
        This function will return:
        ```
        SubtitleItem(
            index=3,
            start_time=Timestamp(seconds=5, milliseconds=847),
            end_time=Timestamp(seconds=6, milliseconds=916),
            text='The robot.')
        ```

        Args:
            text_item (str): The .srt text for a subtitle item.

        Returns:
            SubtitleItem: A corresponding SubtitleItem.

        Raises:
            ValueError: If the text does not match the expected layout, or
                if the start timestamp is not strictly before the end one.
        """
        # Build regex
        index_re = r"\d+"

        def timestamp(prefix: str) -> str:
            # One named group per time component, e.g. "start_hours".
            return (
                rf"(?P<{prefix}_hours>\d\d):"
                rf"(?P<{prefix}_minutes>\d\d):"
                rf"(?P<{prefix}_seconds>\d\d),"
                rf"(?P<{prefix}_milliseconds>\d\d\d)"
            )

        start_timestamp_re = timestamp("start")
        end_timestamp_re = timestamp("end")
        # NOTE(review): ".+" does not cross newlines, so a subtitle whose
        # text spans several lines is rejected as invalid.
        text_re = r".+"
        complete_re = f"^(?P<index>{index_re})\n"
        complete_re += f"{start_timestamp_re} --> {end_timestamp_re}\n"
        complete_re += f"(?P<text>{text_re})$"
        regex = re.compile(complete_re)

        # Match and extract groups
        match = regex.match(text_item)
        if match is None:
            raise ValueError(f"Index item invalid format:\n'{text_item}'")
        groups = match.groupdict()

        def build_timestamp(prefix: str) -> Timestamp:
            # Strip the prefix so the group names become Timestamp keywords.
            fields = {
                name[len(prefix):]: int(value)
                for name, value in groups.items()
                if name.startswith(prefix)
            }
            return Timestamp(**fields)

        # Extract values
        index = int(groups['index'])
        start = build_timestamp("start_")
        end = build_timestamp("end_")
        text = groups['text']
        if start >= end:
            raise ValueError(
                f"Start timestamp must be earlier than end timestamp: start={start}, end={end}")
        return SubtitleItem(index, start, end, text)

    @staticmethod
    def _format_timestamp(t: Timestamp) -> str:
        """Format a timestamp in the .srt format.

        Args:
            t (Timestamp): The timestamp to convert.

        Returns:
            str: The textual representation for the .srt format.
        """
        # NOTE(review): the components are not zero-padded, so this yields
        # "1:2:3,4" rather than "01:02:03,004".
        return f"{t.get_hours()}:{t.get_minutes()}:{t.get_seconds()},{t.get_milliseconds()}"

    def __str__(self):
        """Return the item as index line, timestamp line and text."""
        res = f"{self.index}\n"
        res += f"{SubtitleItem._format_timestamp(self.start_time)}"
        res += " --> "
        res += f"{SubtitleItem._format_timestamp(self.end_time)}\n"
        res += self.text
        return res
... which I use in the following test:
import unittest
from src.subtitle_item import SubtitleItem
from src.time_utils import Timestamp
class SubtitleItemTest(unittest.TestCase):
    """Unit tests for SubtitleItem."""

    def testLoadFromText(self):
        """A parsed .srt item equals the equivalent hand-built item."""
        srt_text = "21\n01:02:03,004 --> 05:06:07,008\nTest subtitle."
        parsed = SubtitleItem.load_from_text_item(srt_text)

        expected_start = Timestamp(hours=1, minutes=2, seconds=3, milliseconds=4)
        expected_end = Timestamp(hours=5, minutes=6, seconds=7, milliseconds=8)
        expected = SubtitleItem(21, expected_start, expected_end, "Test subtitle.")

        self.assertEqual(parsed, expected)
This test fails, but I don't understand why.
I've checked with the debugger: exp and res have exactly the same fields. The Timestamp class is another separate dataclass. I've checked equality per field manually in the debugger, all fields are identical:
>>> exp == res
False
>>> exp.index == res.index
True
>>> exp.start_time == res.start_time
True
>>> exp.end_time == res.end_time
True
>>> exp.text == res.text
True
Furthermore, asdict() on each object returns identical dictionaries:
>>> dataclasses.asdict(exp) == dataclasses.asdict(res)
True
Is there something I'm misunderstanding regarding the implementation of the equality operator with dataclasses?
Thanks.
EDIT: my time_utils module, sorry for not including that earlier
"""
This module declares the Delta and Timestamp classes.
"""
from dataclasses import dataclass
#dataclass(frozen=True)
class _TimeBase:
hours: int = 0
minutes: int = 0
seconds: int = 0
milliseconds: int = 0
def __post_init__(self):
BOUNDS_H = range(0, 100)
BOUNDS_M = range(0, 60)
BOUNDS_S = range(0, 60)
BOUNDS_MS = range(0, 1000)
if self.hours not in BOUNDS_H:
raise ValueError(
f"{self.hours=} not in [{BOUNDS_H.start, BOUNDS_H.stop})")
if self.minutes not in BOUNDS_M:
raise ValueError(
f"{self.minutes=} not in [{BOUNDS_M.start, BOUNDS_M.stop})")
if self.seconds not in BOUNDS_S:
raise ValueError(
f"{self.seconds=} not in [{BOUNDS_S.start, BOUNDS_S.stop})")
if self.milliseconds not in BOUNDS_MS:
raise ValueError(
f"{self.milliseconds=} not in [{BOUNDS_MS.start, BOUNDS_MS.stop})")
def _to_ms(self):
return self.milliseconds + 1000 * (self.seconds + 60 * (self.minutes + 60 * self.hours))
@dataclass(frozen=True)
class Delta(_TimeBase):
    """A time difference, with milliseconds accuracy.
    Must be less than 100h long."""

    sign: int = 1

    def __post_init__(self):
        if self.sign not in (1, -1):
            raise ValueError(
                f"{self.sign=} should either be 1 or -1")
        super().__post_init__()

    def __add__(self, other: "Delta") -> "Delta":
        """Return the signed sum of two deltas as a new Delta."""
        ms_sum = self.sign * self._to_ms() + other.sign * other._to_ms()
        sign = -1 if ms_sum < 0 else 1
        # Decompose the magnitude; the sign is carried separately.
        s_rem, ms_n = divmod(abs(ms_sum), 1000)
        m_rem, s_n = divmod(s_rem, 60)
        h_n, m_n = divmod(m_rem, 60)
        return Delta(hours=h_n, minutes=m_n, seconds=s_n, milliseconds=ms_n, sign=sign)
@dataclass(frozen=True)
class Timestamp(_TimeBase):
    """A timestamp with milliseconds accuracy. Must be
    less than 100h long."""

    def __add__(self, other: Delta) -> "Timestamp":
        """Return this timestamp shifted by the signed delta ``other``."""
        ms_sum = self._to_ms() + other.sign * other._to_ms()
        s_rem, ms_n = divmod(ms_sum, 1000)
        m_rem, s_n = divmod(s_rem, 60)
        h_n, m_n = divmod(m_rem, 60)
        return Timestamp(hours=h_n, minutes=m_n, seconds=s_n, milliseconds=ms_n)

    def __ge__(self, other: "Timestamp") -> bool:
        """Compare by total milliseconds."""
        return self._to_ms() >= other._to_ms()

class Timestamp:
    """Minimal stand-in timestamp stored as a single millisecond count."""

    def __init__(self, hours=0, minutes=0, seconds=0, milliseconds=0):
        self.ms = ((hours*60+minutes)*60+seconds)*1000+milliseconds

    def get_hours(self):
        """Return the whole-hours component."""
        return self.ms // (60*60*1000)

    def get_minutes(self):
        """Return the minutes component (0-59)."""
        return (self.ms // (60*1000)) % 60

    def get_seconds(self):
        """Return the seconds component (0-59)."""
        return (self.ms // 1000) % 60

    def get_milliseconds(self):
        """Return the milliseconds component (0-999)."""
        return self.ms % 1000

    def __add__(self, other):
        """Return a new Timestamp shifted by ``other``.

        ``other`` may be an object exposing ``ms`` or a plain number of
        milliseconds.
        """
        # The original read ``self.other``, which does not exist.
        offset = getattr(other, "ms", other)
        return Timestamp(milliseconds=self.ms + offset)

    def __eq__(self, other):
        if not isinstance(other, Timestamp):
            return NotImplemented
        return self.ms == other.ms

    def __lt__(self, other):
        if not isinstance(other, Timestamp):
            return NotImplemented
        return self.ms < other.ms

    def __le__(self, other):
        if not isinstance(other, Timestamp):
            return NotImplemented
        return self.ms <= other.ms
... your code ...
text = "21\n01:02:03,004 --> 05:06:07,008\nTest subtitle."
res = SubtitleItem.load_from_text_item(text)
exp = SubtitleItem(
21, Timestamp(hours=1, minutes=2, seconds=3, milliseconds=4),
Timestamp(hours=5, minutes=6, seconds=7, milliseconds=8),
"Test subtitle."
)
print(res)
print(exp)
print(res==exp)
Produces:
21
1:2:3,4 --> 5:6:7,8
Test subtitle.
21
1:2:3,4 --> 5:6:7,8
Test subtitle.
True
with no assert exception.

Okay, I think I found what's going wrong here.
First, I made a mistake when I reported the issue before: in the unit test, exp.start_time != res.start_time and exp.end_time != res.end_time. Sorry about that. That narrows down the issue to comparison of timestamps.
My sources are in project/src/, the test that fails is in project/tests/. To make source modules accessible to the test, I had to add the source directory to PYTHONPATH:
$ PYTHONPATH=src/ python -m unittest discover -s tests/ -v
In the unit test, even though res.start_time and exp.start_time do have the same fields, they do not have the same type:
>>> print(type(res.start_time), type(exp.start_time))
<class 'time_utils.Timestamp'> <class 'src.time_utils.Timestamp'>
I've added a new post with a minimally reproducible example, and more details about the file structure here: Minimally reproducible example.

Related

How to use proper container in python

In the following code I would like to use a structure so that I do not have to pass two separate values (min and max) for each element. I would also like a container, if one exists, that lets me give only one of the values and leave the other as None. For instance, power could have only a min element. So power would be a container with 2 elements (min and max), and the same for temperature. How is that possible in Python? Please help, thanks!
def result_final(
    power_min=None,
    power_max=None,
    temperature_min=None,
    temperature_max=None
) -> str:
    """Render power and temperature bounds as a ``<result-stats>`` string.

    Each bound is optional: a bound left as ``None`` is omitted from the
    output instead of being rendered as the text "None".

    Returns:
        str: The concatenated markup for both statistics.
    """
    def _result_min(value) -> str:
        if value is None:
            return ""
        return f"<min><value>{value}</value></min>"

    def _result_max(value) -> str:
        if value is None:
            return ""
        # NOTE(review): the space before </max> is kept from the original
        # output; it looks accidental but existing callers may rely on it.
        return f"<max><value>{value}</value> </max>"

    def _measure_result(unit_id, min_value, max_value) -> str:
        return (
            "<measure_result>"
            f"<unit-id>{unit_id}</unit-id>"
            f"{_result_min(min_value)}"
            f"{_result_max(max_value)}"
            "</measure_result>"
        )

    def _stats(name, min_value, max_value) -> str:
        # The same bounds are reported for unit ids 0 and 1.
        return (
            "<stats>"
            f"<object>{name}</object>"
            f"{_measure_result(0, min_value, max_value)}"
            f"{_measure_result(1, min_value, max_value)}"
            "</stats>"
        )

    return (
        "<result-stats>"
        f"{_stats('POWER', power_min, power_max)}"
        f"{_stats('TEMPERATURE', temperature_min, temperature_max)}"
        "</result-stats>"
    )
x = result_final(power_min = 12, power_max = 125, temperature_min = 12, temperature_max = 12)
print(x)

I'd suggest just using tuples for each min/max pair:
from typing import Optional, Tuple
Stats = Tuple[Optional[int], Optional[int]]  # min, max


def result_final(power: Stats, temperature: Stats) -> str:
    """Render power and temperature (min, max) pairs as ``<result-stats>``.

    Either element of a pair may be ``None``, in which case the matching
    ``<min>`` or ``<max>`` element is left out.
    """
    def _result_min(value: Optional[int]) -> str:
        return "" if value is None else f"<min><value>{value}</value></min>"

    def _result_max(value: Optional[int]) -> str:
        return "" if value is None else f"<max><value>{value}</value></max>"

    def _measure_result(unit_id: int, value: Stats) -> str:
        min_value, max_value = value
        return (
            "<measure_result>"
            f"<unit-id>{unit_id}</unit-id>"
            f"{_result_min(min_value)}"
            f"{_result_max(max_value)}"
            "</measure_result>"
        )

    def _stats(obj: str, value: Stats) -> str:
        return (
            "<stats>"
            # Use the parameter; the original formatted the builtin `object`.
            f"<object>{obj}</object>"
            f"{_measure_result(0, value)}"
            f"{_measure_result(1, value)}"
            "</stats>"
        )

    return (
        "<result-stats>"
        f"{_stats('POWER', power)}"
        f"{_stats('TEMPERATURE', temperature)}"
        "</result-stats>"
    )
print(result_final((12, 125), (12, 12)))

I suggest you take a look at Python dataclasses.
from dataclasses import dataclass
@dataclass
class MyContainer:
    """Optional min/max bounds for power and temperature.

    Every attribute defaults to ``None``, meaning "not provided".
    """

    power_min: int = None
    power_max: int = None
    temperature_min: int = None
    temperature_max: int = None
The dataclass wrapper provides a convenient way to define a class that just stores some data. dataclasses is in the standard library (i.e., you do not need to install anything).
The class I defined, by default, uses None for all attributes. Or you can specify values for the attributes you need.
a_container = MyContainer(power_max=5, temperature_min=3)
I also suggest to choose a better name than MyContainer: I used that because I did not know what you were trying to achieve!
You can also decide to define two separate classes for TemperatureExtremes and PowerExtremes, if that makes more sense for you!

Alternative to global variable

I got a recursive function, which reverses an integer. The reversed Integer can not start with zero, e.g.: 12340 becomes 4321.
res = 0
base = 1


def reverse(n):
    """Accumulate the reversed digits of ``n`` into the module globals.

    The accumulators ``res`` and ``base`` are never reset here, so the
    result of a later call builds on the state left by earlier calls.
    """
    global res, base
    if not n > 0:
        return res
    # Handle the higher-order digits first, then place this digit.
    reverse(n // 10)
    res += (n % 10) * base
    base *= 10
    return res
this code works, but only once and hence I want to get rid of the global variables. I thought of a helper function, inside of the reverse(n) function, but I could not get it to work properly. I have tried for almost an hour and would love to see the solution eventually.

It's totally possible with a helper method:
def reverse_recursive(i: int) -> int:
    """Return ``i`` with its decimal digits in reverse order."""
    def _step(remaining: int, acc: int) -> int:
        # Move the lowest digit of `remaining` to the end of `acc`.
        if remaining != 0:
            return _step(remaining // 10, acc * 10 + remaining % 10)
        return acc

    return _step(i, 0)
assert reverse_recursive(123456789) == 987654321
And even without one:
def reverse_recursive_nohelper(i: int, result_up_to_now=0) -> int:
    """Return ``i`` with its digits reversed.

    ``result_up_to_now`` is the accumulator threaded through the recursion.
    """
    if i != 0:
        last_digit = i % 10
        return reverse_recursive_nohelper(i // 10, result_up_to_now * 10 + last_digit)
    return result_up_to_now
assert reverse_recursive_nohelper(123456789) == 987654321
But the latter one could of course be misused/misunderstood by someone.
Just for comparison, the non-recursive version:
def reverse_straight(i: int) -> int:
    """Return ``i`` with its decimal digits reversed, iteratively."""
    reversed_digits = 0
    while i:
        i, digit = divmod(i, 10)
        reversed_digits = reversed_digits * 10 + digit
    return reversed_digits
assert reverse_straight(123456789) == 987654321

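This version uses global only for the agg_res variable, which holds the result.
agg_res is reset to 0 when the recursion reaches its base case (n is no longer > 0), i.e. before any digit of the current number is accumulated, so an earlier call has no influence on a later one.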
agg_res = 0


def reverse(n, res=0, base=1):
    """Store the reversed digits of ``n`` in the global ``agg_res``.

    Returns nothing; read ``agg_res`` after the call.
    """
    global agg_res
    if not n > 0:
        # Base case: clear the accumulator before digits are added.
        agg_res = 0
        return
    digit_count = len(str(n))
    reverse(n // 10, res, base)
    # The last digit of n is weighted by 10**(number of digits of n - 1).
    weight = base * (10 ** (digit_count - 1))
    agg_res += (res + (n % 10)) * weight
# First time
reverse(5786)
print(agg_res)
# 6875
# Second time
reverse(5786)
print(agg_res)
# 6875

Example of using a mutable argument to get rid of globals
def reverse(n, res_base=None):
    """Return ``n`` with its digits reversed.

    ``res_base`` is a two-element list ``[res, base]`` shared by all
    recursion levels; callers normally leave it unset.
    """
    if res_base is None:
        res_base = [0, 1]  # [accumulated result, current place value]

    if n > 0:
        reverse(n // 10, res_base)
        acc, place = res_base
        # Update in place so every recursion level sees the same list.
        res_base[:] = [acc + (n % 10) * place, place * 10]

    return res_base[0]
Check that it works on multiple runs
print(reverse(12345)) # Output: 54321
print(reverse(6789)) # 9876
Alternative Using Helper Function
def reverse(n):
    """Return ``n`` with its digits reversed, using a pure helper."""
    def helper(value, acc, place):
        # Returns the updated (accumulator, place value) pair.
        if not value > 0:
            return acc, place
        acc, place = helper(value // 10, acc, place)
        return acc + (value % 10) * place, place * 10

    return helper(n, 0, 1)[0]
print(reverse(12345)) # Output: 54321
print(reverse(6789)) # 9876
Shows Use of Mutable Argument
Use mutable argument removes need to pass update back to parent
def reverse(n):
    """Return ``n`` with its digits reversed, via a shared mutable list."""
    def helper(value, state):
        if not value > 0:
            return
        helper(value // 10, state)
        # `state` is mutable, so its parent frames see these updates.
        state[0] += (value % 10) * state[1]
        state[1] *= 10

    state = [0, 1]  # [accumulated result, current place value]
    helper(n, state)
    return state[0]
print(reverse(12345)) # Out: 54321

replace semicolon by newline in python code

I would like to parse Python code that contains semicolons ; for separating commands and produce code that replaces those by newlines \n. E.g., from
def main():
a = "a;b"; return a
I'd like to produce
def main():
a = "a;b"
return a
Any hints?

Use the tokenize library to look for token.OP tokens, where the second element is a ; *. Replace these tokens with a token.NEWLINE token.
You'd need to adjust your token offsets and generate matching indent too however; so after a NEWLINE you'd need to adjust line numbers (increment by an offset you increase for every NEWLINE you insert) and the 'next' line (remainder of the current line) would have to have the indices adjusted to match the current indentation level:
import tokenize
TokenInfo = getattr(tokenize, 'TokenInfo', lambda *a: a)  # Python 3 compat


def semicolon_to_newline(tokens):
    """Yield ``tokens`` with every ``;`` operator replaced by a NEWLINE.

    Tokens after a replaced semicolon are moved onto a new line and
    shifted left so they start at the current indentation column.

    Args:
        tokens: Iterable of 5-tuples as produced by
            ``tokenize.generate_tokens``.

    Yields:
        Token tuples with adjusted positions, suitable for
        ``tokenize.untokenize``.
    """
    line_offset = 0
    # Column of the current block indentation. Starts at 0 so that
    # semicolons on unindented top-level lines work too (the original
    # started at None and failed on `scol - last_indent`).
    last_indent = 0
    col_offset = None  # None or an integer
    for ttype, tstr, (slno, scol), (elno, ecol), line in tokens:
        slno, elno = slno + line_offset, elno + line_offset
        if ttype in (tokenize.INDENT, tokenize.DEDENT):
            last_indent = ecol  # block is indented to this column
        elif ttype == tokenize.OP and tstr == ';':
            # swap out semicolon with a newline
            ttype = tokenize.NEWLINE
            tstr = '\n'
            line_offset += 1
            if col_offset is not None:
                scol, ecol = scol - col_offset, ecol - col_offset
            col_offset = 0  # next tokens should start at the current indent
        elif col_offset is not None:
            if not col_offset:
                # adjust column by starting column of next token
                col_offset = scol - last_indent
            scol, ecol = scol - col_offset, ecol - col_offset
            if ttype == tokenize.NEWLINE:
                # end of the original physical line: stop shifting columns
                col_offset = None
        yield TokenInfo(
            ttype, tstr, (slno, scol), (elno, ecol), line)
with open(sourcefile, 'r') as source, open(destination, 'w') as dest:
generator = tokenize.generate_tokens(source.readline)
dest.write(tokenize.untokenize(semicolon_to_newline(generator)))
Note that I don't bother to correct the line value; it is informative only, the data that was read from the file is not actually used when un-tokenizing.
Demo:
>>> from io import StringIO
>>> source = StringIO('''\
... def main():
... a = "a;b"; return a
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
def main():
a = "a;b"
return a
and slightly more complex:
>>> source = StringIO('''\
... class Foo(object):
... def bar(self):
... a = 10; b = 11; c = 12
... if self.spam:
... x = 12; return x
... x = 15; return y
...
... def baz(self):
... return self.bar;
... # note, nothing after the semicolon
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
class Foo(object):
def bar(self):
a = 10
b = 11
c = 12
if self.spam:
x = 12
return x
x = 15
return y
def baz(self):
return self.bar
# note, nothing after the semicolon
>>> print(result.replace(' ', '.'))
class.Foo(object):
....def.bar(self):
........a.=.10
........b.=.11
........c.=.12
........if.self.spam:
............x.=.12
............return.x
........x.=.15
........return.y
....def.baz(self):
........return.self.bar
........
........#.note,.nothing.after.the.semicolon
* The Python 3 version of tokenize outputs more informative TokenInfo named tuples, which have an extra exact_type attribute that can be used instead of doing a text match: tok.exact_type == tokenize.SEMI. I kept the above compatible with Python 2 and 3 however.

Here's a pyparsing solution - see comments in the code below:
from pyparsing import Literal, restOfLine, quotedString, pythonStyleComment, line
SEMI = Literal(';')
patt = SEMI + restOfLine
patt.ignore(quotedString)
patt.ignore(pythonStyleComment)
def split_at(s, locs):
    """
    Cut s at each position in locs (dropping the character at that
    position) and return the pieces with leading whitespace removed.
    """
    locs = list(locs)
    starts = [0] + [loc + 1 for loc in locs]
    ends = locs + [None]  # None means "to the end of s"
    return [s[begin:end].lstrip() for begin, end in zip(starts, ends)]
def split_on_semicolon(s,l,tokens):
    """
    Parse action run when the first unquoted ';' on a line is found.
    Returns replacement text that puts each following statement on its
    own line, prefixed with the indentation of the current line.
    """
    remainder = tokens[1]  # everything after the first ';'
    if not remainder.strip():
        # nothing follows the semicolon: just drop it
        return ''

    current_line = line(l,s)
    line_body = current_line.lstrip()
    indent = current_line[:current_line.index(line_body)]

    # there may be further ';' in the remainder - locate all of them
    semi_starts = [start for _, start, _ in SEMI.scanString(remainder)]
    statements = split_at(remainder, semi_starts)
    return '\n'+'\n'.join(indent+statement for statement in statements)
patt.addParseAction(split_on_semicolon)
sample = """
def main():
this_semi_does_nothing();
neither_does_this_but_there_are_spaces_afterward();
a = "a;b"; return a # this is a comment; it has a semicolon!
def b():
if False:
z=1000;b("; in quotes"); c=200;return z
return ';'
class Foo(object):
def bar(self):
'''a docstring; with a semicolon'''
a = 10; b = 11; c = 12
# this comment; has several; semicolons
if self.spam:
x = 12; return x # so; does; this; one
x = 15;;; y += x; return y
def baz(self):
return self.bar
"""
print(patt.transformString(sample))
Gives:
def main():
this_semi_does_nothing()
neither_does_this_but_there_are_spaces_afterward()
a = "a;b"
return a # this is a comment; it has a semicolon!
def b():
if False:
z=1000
b("; in quotes")
c=200
return z
return ';'
class Foo(object):
def bar(self):
'''a docstring; with a semicolon'''
a = 10
b = 11
c = 12
# this comment; has several; semicolons
if self.spam:
x = 12
return x # so; does; this; one
x = 15
y += x
return y
def baz(self):
return self.bar

Python: Why is my generator based range is X2 slower than xrange?

Just out of curiosity, I've written 3 tests in Python and timed them using timeit:
import timeit
# simple range based on generator
def my_range(start, stop):
    """Yield start, start + 1, ... up to but not including stop."""
    current = start
    while current < stop:
        yield current
        current += 1
# test regular range
def test_range():
    """Benchmark body: add up the values of range(1, 100000)."""
    values = range(1, 100000)
    total = 0
    for value in values:
        total += value
# test xrange
def test_xrange():
    """Benchmark body: add up the values of xrange(1, 100000)."""
    values = xrange(1, 100000)
    total = 0
    for value in values:
        total += value
# test my range
def test_my_range():
    """Benchmark body: add up the values of my_range(1, 100000)."""
    values = my_range(1, 100000)
    total = 0
    for value in values:
        total += value
print timeit.timeit("test_range()", setup = "from __main__ import test_range", number = 100)
print timeit.timeit("test_xrange()", setup = "from __main__ import test_xrange", number = 100)
print timeit.timeit("test_my_range()", setup = "from __main__ import test_my_range", number = 100)
And I've got these benchmarks:
regular range based test - 0.616795163262
xrange based test - 0.537716731096
my_range (generator) based test - **1.27872886337**
My range was X2 slower even than a range that creates a list. Why?
Are xrange() / range() implemented using C directly?
Are they implemented without condition check?
Thanks!

I feel that the simple answer is that xrange() is builtin and written in C.
I added another case to your test (see below): A pure-Python reference implementation of xrange() based on the CPython source.
import timeit
from collections import Sequence, Iterator
from math import ceil
# simple range based on generator
def my_range(start, stop):
    """Yield start, start + 1, ... up to but not including stop."""
    current = start
    while current < stop:
        yield current
        current += 1
# test regular range
def test_range():
    """Benchmark body: add up the values of range(1, 100000)."""
    values = range(1, 100000)
    total = 0
    for value in values:
        total += value
# test xrange
def test_xrange():
    """Benchmark body: add up the values of xrange(1, 100000)."""
    values = xrange(1, 100000)
    total = 0
    for value in values:
        total += value
# test my range
def test_my_range():
    """Benchmark body: add up the values of my_range(1, 100000)."""
    values = my_range(1, 100000)
    total = 0
    for value in values:
        total += value
class pure_python_xrange(Sequence):
    """Pure-Python implementation of an ``xrange`` (aka ``range``
    in Python 3) object. See `the CPython documentation
    <http://docs.python.org/py3k/library/functions.html#range>`_
    for details.
    """

    def __init__(self, *args):
        if len(args) == 1:
            start, stop, step = 0, args[0], 1
        elif len(args) == 2:
            start, stop, step = args[0], args[1], 1
        elif len(args) == 3:
            start, stop, step = args
        else:
            raise TypeError('pure_python_xrange() requires 1-3 int arguments')

        try:
            start, stop, step = int(start), int(stop), int(step)
        except ValueError:
            raise TypeError('an integer is required')

        if step == 0:
            raise ValueError('pure_python_xrange() arg 3 must not be zero')
        elif step < 0:
            # clamp so that an empty range has stop == start
            stop = min(stop, start)
        else:
            stop = max(stop, start)

        self._start = start
        self._stop = stop
        self._step = step
        # ceiling division: a partial final step still counts as one element
        self._len = (stop - start) // step + bool((stop - start) % step)

    def __repr__(self):
        if self._start == 0 and self._step == 1:
            return 'pure_python_xrange(%d)' % self._stop
        elif self._step == 1:
            return 'pure_python_xrange(%d, %d)' % (self._start, self._stop)
        return 'pure_python_xrange(%d, %d, %d)' % (self._start, self._stop, self._step)

    def __eq__(self, other):
        # Compare against this class: the builtin xrange does not expose
        # the private _start/_stop/_step attributes read below.
        return isinstance(other, pure_python_xrange) and \
            self._start == other._start and \
            self._stop == other._stop and \
            self._step == other._step

    def __len__(self):
        return self._len

    def index(self, value):
        """Return the 0-based position of integer `value` in
        the sequence this xrange represents."""
        diff = value - self._start
        quotient, remainder = divmod(diff, self._step)
        if remainder == 0 and 0 <= quotient < self._len:
            return abs(quotient)
        raise ValueError('%r is not in range' % value)

    def count(self, value):
        """Return the number of occurrences of integer `value`
        in the sequence this xrange represents."""
        # a value can occur exactly zero or one times
        return int(value in self)

    def __contains__(self, value):
        """Return ``True`` if the integer `value` occurs in
        the sequence this xrange represents."""
        try:
            self.index(value)
            return True
        except ValueError:
            return False

    def __reversed__(self):
        """Return an xrange which represents a sequence whose
        contents are the same as the sequence this xrange
        represents, but in the opposite order."""
        # Integer sign of the step; plain `/` would give a float on Python 3.
        sign = 1 if self._step > 0 else -1
        last = self._start + ((self._len - 1) * self._step)
        return pure_python_xrange(last, self._start - sign, -1 * self._step)

    def __getitem__(self, index):
        """Return the element at position ``index`` in the sequence
        this xrange represents, or raise :class:`IndexError` if the
        position is out of range."""
        if isinstance(index, slice):
            return self.__getitem_slice(index)
        if index < 0:
            # negative indexes access from the end
            index = self._len + index
        if index < 0 or index >= self._len:
            raise IndexError('xrange object index out of range')
        return self._start + index * self._step

    def __getitem_slice(self, slce):
        """Return an xrange which represents the requested slce
        of the sequence represented by this xrange.
        """
        # NOTE(review): the slice bounds are passed on as range values, not
        # converted from positions - confirm this is the intended behaviour.
        start, stop, step = slce.start, slce.stop, slce.step
        if step == 0:
            raise ValueError('slice step cannot be 0')

        start = start or self._start
        stop = stop or self._stop
        if start < 0:
            start = max(0, start + self._len)
        if stop < 0:
            stop = max(start, stop + self._len)

        if step is None or step > 0:
            return pure_python_xrange(start, stop, step or 1)
        else:
            rv = reversed(self)
            rv._step = step
            return rv

    def __iter__(self):
        """Return an iterator which enumerates the elements of the
        sequence this xrange represents."""
        return xrangeiterator(self)
class xrangeiterator(Iterator):
    """An iterator for an :class:`xrange`.
    """

    def __init__(self, xrangeobj):
        self._xrange = xrangeobj

        # Initialize the "last outputted value" to the value
        # just before the first value; this simplifies next()
        self._last = self._xrange._start - self._xrange._step
        self._count = 0

    def __iter__(self):
        """An iterator is already an iterator, so return ``self``.
        """
        return self

    def next(self):
        """Return the next element in the sequence represented
        by the xrange we are iterating, or raise StopIteration
        if we have passed the end of the sequence."""
        self._last += self._xrange._step
        self._count += 1
        if self._count > self._xrange._len:
            raise StopIteration()
        return self._last

    # Python 3 spells the iterator protocol method __next__.
    __next__ = next
# test xrange
def test_pure_python_xrange():
    """Benchmark body: add up the values of pure_python_xrange(1, 100000)."""
    values = pure_python_xrange(1, 100000)
    total = 0
    for value in values:
        total += value
print timeit.timeit("test_range()", setup = "from __main__ import test_range", number = 100)
print timeit.timeit("test_xrange()", setup = "from __main__ import test_xrange", number = 100)
print timeit.timeit("test_my_range()", setup = "from __main__ import test_my_range", number = 100)
print timeit.timeit("test_pure_python_xrange()", setup = "from __main__ import test_pure_python_xrange", number = 100)
The results?
$ python so.py
0.426695823669
0.371111869812
0.964643001556
6.06390094757
This is simply the difference between interpreted Python code and C. Additionally, as @byels mentioned above, xrange() is limited to short integers, which likely has a positive effect.

This is an interesting test. Looking at the Python 2 docs on xrange, one guess that comes to mind is that xrange is allowed to take advantage of type restrictions (it only uses "short" integers).

dpkt source code documentation project

Dpkt is a python packet creation and parsing library https://code.google.com/p/dpkt/
The project lacks documentation for beginners. I am trying to document it and write sample code for every module. With my current knowledge of Python, I am having difficulty understanding some of the source code. Here, for example, is the Python module for RTP (Real-Time Transport Protocol):
https://code.google.com/p/dpkt/source/browse/trunk/dpkt/rtp.py #rtp.py source code
# $Id$
"""Real-Time Transport Protocol"""
from dpkt import Packet
# version 1100 0000 0000 0000 ! 0xC000 14
# p 0010 0000 0000 0000 ! 0x2000 13
# x 0001 0000 0000 0000 ! 0x1000 12
# cc 0000 1111 0000 0000 ! 0x0F00 8
# m 0000 0000 1000 0000 ! 0x0080 7
# pt 0000 0000 0111 1111 ! 0x007F 0
#
_VERSION_MASK= 0xC000
_P_MASK = 0x2000
_X_MASK = 0x1000
_CC_MASK = 0x0F00
_M_MASK = 0x0080
_PT_MASK = 0x007F
_VERSION_SHIFT=14
_P_SHIFT = 13
_X_SHIFT = 12
_CC_SHIFT = 8
_M_SHIFT = 7
_PT_SHIFT = 0
VERSION = 2
class RTP(Packet):
    """RTP packet whose first 16-bit header word (``_type``) packs the
    version, p, x, cc, m and pt bit fields exposed as properties below."""
    # (name, struct format, default) triples, in the form described by
    # the Packet base class.
    __hdr__ = (
        ('_type', 'H', 0x8000),
        ('seq', 'H', 0),
        ('ts', 'I', 0),
        ('ssrc', 'I', 0),
    )
    # Raw bytes that follow the fixed header; unpack() fills it with
    # cc * 4 bytes.
    csrc = ''

    # Each getter masks its field out of _type and shifts it down; each
    # setter shifts the new value into place and keeps the other bits.
    def _get_version(self): return (self._type&_VERSION_MASK)>>_VERSION_SHIFT
    def _set_version(self, ver):
        self._type = (ver << _VERSION_SHIFT) | (self._type & ~_VERSION_MASK)
    def _get_p(self): return (self._type & _P_MASK) >> _P_SHIFT
    def _set_p(self, p): self._type = (p << _P_SHIFT) | (self._type & ~_P_MASK)
    def _get_x(self): return (self._type & _X_MASK) >> _X_SHIFT
    def _set_x(self, x): self._type = (x << _X_SHIFT) | (self._type & ~_X_MASK)
    def _get_cc(self): return (self._type & _CC_MASK) >> _CC_SHIFT
    def _set_cc(self, cc): self._type = (cc<<_CC_SHIFT)|(self._type&~_CC_MASK)
    def _get_m(self): return (self._type & _M_MASK) >> _M_SHIFT
    def _set_m(self, m): self._type = (m << _M_SHIFT) | (self._type & ~_M_MASK)
    def _get_pt(self): return (self._type & _PT_MASK) >> _PT_SHIFT
    def _set_pt(self, m): self._type = (m << _PT_SHIFT)|(self._type&~_PT_MASK)

    # Public attribute-style access to the bit fields.
    version = property(_get_version, _set_version)
    p = property(_get_p, _set_p)
    x = property(_get_x, _set_x)
    cc = property(_get_cc, _set_cc)
    m = property(_get_m, _set_m)
    pt = property(_get_pt, _set_pt)

    def __len__(self):
        # fixed header + csrc bytes + payload
        return self.__hdr_len__ + len(self.csrc) + len(self.data)

    def __str__(self):
        # packed header, then csrc, then the payload
        return self.pack_hdr() + self.csrc + str(self.data)

    def unpack(self, buf):
        """Unpack the fixed header, then split the rest of ``buf`` into
        ``csrc`` (cc * 4 bytes) and ``data``."""
        super(RTP, self).unpack(buf)
        self.csrc = buf[self.__hdr_len__:self.__hdr_len__ + self.cc * 4]
        self.data = buf[self.__hdr_len__ + self.cc * 4:]
With this code, i was able to do the following in IPython Shell
[37] import dpkt
[38] rtp_pkt=dpkt.rtp.RTP()
[39] rtp_pkt.pack_hdr()
Out[39]: '\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
[47] rtp_pkt.data="HelloWorld"
[48] rtp_pkt.pack()
Out[48]: '\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00HelloWorld'
Based on my understanding of classes in Python, there should be an "__init__" function in the class, but I don't see one in rtp.py. How did the IPython shell commands above work? Why does the hdr variable in rtp.py start with a double underscore "__", and why is every class method preceded by a single underscore "_"? I know this could be to make them private or semi-private, but does it have to be that way?
I know that RTP class is derived from Packet whose source code is also pasted here for convenience.
# $Id$
"""Simple packet creation and parsing."""
import copy, itertools, socket, struct
# Base class for the exceptions defined in this module.
class Error(Exception): pass
# Raised by Packet.__init__ when a buffer cannot be unpacked.
class UnpackError(Error): pass
# Raised by Packet.__init__ when the buffer is shorter than the header.
class NeedData(UnpackError): pass
# Raised by Packet.pack_hdr when the header fields cannot be packed.
class PackError(Error): pass
class _MetaPacket(type):
    """Metaclass that derives the header bookkeeping attributes
    (``__hdr_fields__``, ``__hdr_fmt__``, ``__hdr_len__``,
    ``__hdr_defaults__``) from a class's ``__hdr__`` declaration."""
    def __new__(cls, clsname, clsbases, clsdict):
        # Build the class once so an inherited __hdr__ is visible too.
        t = type.__new__(cls, clsname, clsbases, clsdict)
        st = getattr(t, '__hdr__', None)
        if st is not None:
            # XXX - __slots__ only created in __new__()
            clsdict['__slots__'] = [ x[0] for x in st ] + [ 'data' ]
            # Rebuild the class so the __slots__ entry takes effect.
            t = type.__new__(cls, clsname, clsbases, clsdict)
            # Field names, in declaration order.
            t.__hdr_fields__ = [ x[0] for x in st ]
            # struct format: byte order prefix followed by each field format.
            t.__hdr_fmt__ = getattr(t, '__byte_order__', '>') + \
                            ''.join([ x[1] for x in st ])
            t.__hdr_len__ = struct.calcsize(t.__hdr_fmt__)
            # Map each field name to its declared default value.
            t.__hdr_defaults__ = dict(zip(
                t.__hdr_fields__, [ x[2] for x in st ]))
        return t
class Packet(object):
    """Base packet class, with metaclass magic to generate members from
    self.__hdr__.

    __hdr__ should be defined as a list of (name, structfmt, default) tuples
    __byte_order__ can be set to override the default ('>')

    Example::

    >>> class Foo(Packet):
    ...   __hdr__ = (('foo', 'I', 1), ('bar', 'H', 2), ('baz', '4s', 'quux'))
    ...
    >>> foo = Foo(bar=3)
    >>> foo
    Foo(bar=3)
    >>> str(foo)
    '\x00\x00\x00\x01\x00\x03quux'
    >>> foo.bar
    3
    >>> foo.baz
    'quux'
    >>> foo.foo = 7
    >>> foo.baz = 'whee'
    >>> foo
    Foo(baz='whee', foo=7, bar=3)
    >>> Foo('hello, world!')
    Foo(baz=' wor', foo=1751477356L, bar=28460, data='ld!')
    """
    # Python 2 metaclass hook: _MetaPacket fills in the __hdr_*__ attributes.
    __metaclass__ = _MetaPacket

    def __init__(self, *args, **kwargs):
        """Packet constructor with ([buf], [field=val,...]) prototype.

        Arguments:

        buf -- optional packet buffer to unpack

        Optional keyword arguments correspond to members to set
        (matching fields in self.__hdr__, or 'data').
        """
        self.data = ''
        if args:
            # A buffer was given: parse it.
            try:
                self.unpack(args[0])
            except struct.error:
                # Too short for the header -> NeedData, otherwise UnpackError.
                if len(args[0]) < self.__hdr_len__:
                    raise NeedData
                raise UnpackError('invalid %s: %r' %
                                  (self.__class__.__name__, args[0]))
        else:
            # No buffer: start from copies of the declared defaults, then
            # apply the keyword overrides.
            for k in self.__hdr_fields__:
                setattr(self, k, copy.copy(self.__hdr_defaults__[k]))
            for k, v in kwargs.iteritems():
                setattr(self, k, v)

    def __len__(self):
        # header length plus payload length
        return self.__hdr_len__ + len(self.data)

    def __getitem__(self, k):
        # Dictionary-style access to attributes; a missing name is a KeyError.
        try: return getattr(self, k)
        except AttributeError: raise KeyError

    def __repr__(self):
        # Show only the header fields that differ from their defaults,
        # plus the payload when it is non-empty.
        l = [ '%s=%r' % (k, getattr(self, k))
              for k in self.__hdr_defaults__
              if getattr(self, k) != self.__hdr_defaults__[k] ]
        if self.data:
            l.append('data=%r' % self.data)
        return '%s(%s)' % (self.__class__.__name__, ', '.join(l))

    def __str__(self):
        return self.pack_hdr() + str(self.data)

    def pack_hdr(self):
        """Return packed header string."""
        try:
            return struct.pack(self.__hdr_fmt__,
                               *[ getattr(self, k) for k in self.__hdr_fields__ ])
        except struct.error:
            # Retry with tuple-valued fields flattened into separate values.
            vals = []
            for k in self.__hdr_fields__:
                v = getattr(self, k)
                if isinstance(v, tuple):
                    vals.extend(v)
                else:
                    vals.append(v)
            try:
                return struct.pack(self.__hdr_fmt__, *vals)
            except struct.error, e:
                raise PackError(str(e))

    def pack(self):
        """Return packed header + self.data string."""
        return str(self)

    def unpack(self, buf):
        """Unpack packet header fields from buf, and set self.data."""
        for k, v in itertools.izip(self.__hdr_fields__,
            struct.unpack(self.__hdr_fmt__, buf[:self.__hdr_len__])):
            setattr(self, k, v)
        # Everything after the header is the payload.
        self.data = buf[self.__hdr_len__:]
# 256-entry translation table: printable ASCII (space through '~', except
# the backslash) maps to itself, every other byte value maps to '.'.
# Built programmatically so the table cannot be damaged by copy/paste;
# the pasted literal had '#' in the slot that belongs to '@'.
__vis_filter = ''.join(
    [chr(x) if 32 <= x < 127 and x != 0x5c else '.' for x in range(256)])


def hexdump(buf, length=16):
    """Return a hexdump output string of the given buffer.

    Each output line shows the decimal offset, the hex value of up to
    ``length`` characters, and their printable form.
    """
    n = 0
    res = []
    while buf:
        line, buf = buf[:length], buf[length:]
        hexa = ' '.join(['%02x' % ord(x) for x in line])
        line = line.translate(__vis_filter)
        # pad the hex column to 3 characters per byte so the text aligns
        res.append(' %04d: %-*s %s' % (n, length * 3, hexa, line))
        n += length
    return '\n'.join(res)
try:
    # Prefer the implementation from the optional dnet extension.
    import dnet
    def in_cksum_add(s, buf):
        """Add the 16-bit words of buf to the running checksum s."""
        return dnet.ip_cksum_add(buf, s)
    def in_cksum_done(s):
        """Fold the running sum s into the final 16-bit checksum."""
        return socket.ntohs(dnet.ip_cksum_carry(s))
except ImportError:
    import array
    def in_cksum_add(s, buf):
        """Add the 16-bit words of buf to the running checksum s."""
        n = len(buf)
        # Largest even length; floor division keeps this an int on
        # Python 3 as well.
        cnt = (n // 2) * 2
        a = array.array('H', buf[:cnt])
        if cnt != n:
            # Odd length: pad the last byte with a zero byte. Slicing
            # (rather than indexing) keeps the operand a byte string.
            a.append(struct.unpack('H', buf[-1:] + b'\x00')[0])
        return s + sum(a)
    def in_cksum_done(s):
        """Fold the running sum s into the final 16-bit checksum."""
        # Fold the carries above 16 bits back into the low word.
        s = (s >> 16) + (s & 0xffff)
        s += (s >> 16)
        return socket.ntohs(~s & 0xffff)
def in_cksum(buf):
    """Return computed Internet checksum."""
    running_sum = in_cksum_add(0, buf)
    return in_cksum_done(running_sum)
The question is how to really understand the source code so its documentation is done correctly?

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Two apparently identical dataclasses are not equal - python

Related

How to use proper container in python

Alternative to global variable

replace semicolon by newline in python code

Python: Why is my generator based range is X2 slower than xrange?

dpkt source code documentation project

Categories

Resources