slices to immutable strings by reference and not copy - python

If you use string.split() on a Python string, it returns a list of strings. These substrings that have been split-out are copies of their part of the parent string.
Is it possible to instead get some cheaper slice object that holds only a reference, offset and length to the bits split out?
And is it possible to have some 'string view' to extract and treat these sub-strings as if they are strings yet without making a copy of their bytes?
(I ask as I have very large strings I want to slice and am running out of memory occasionally; removing the copies would be a cheap profile-guided win.)

buffer will give you a read-only view on a string.
>>> s = 'abcdefghijklmnopqrstuvwxyz'
>>> b = buffer(s, 2, 10)
>>> b
<read-only buffer for 0x7f935ee75d70, size 10, offset 2 at 0x7f935ee5a8f0>
>>> b[:]
'cdefghijkl'

String objects always point to a NUL-terminated buffer in Python, so substrings must be copied. As Ignacio pointed out, you can use buffer() to get a read-only view on the string memory. The buffer() built-in function has been superseded by the more versatile memoryview objects, though, which are available in Python 2.7 and 3.x (buffer() is gone in Python 3.x).
s = "abcd" * 50
view = memoryview(s)
subview = view[10:20]
print subview.tobytes()
This code prints
cdabcdabcd
As soon as you call tobytes(), a copy of the string will be created, but the same happens when slicing the old buffer objects as in Ignacio's answer.

Here's the quick string-like buffer wrapper I came up with; I was able to use this in place of classic strings without changing the code that expected to consume strings.
class StringView:
def __init__(self,s,start=0,size=sys.maxint):
self.s, self.start, self.stop = s, start, min(start+size,len(s))
self.size = self.stop - self.start
self._buf = buffer(s,start,self.size)
def find(self,sub,start=0,stop=None):
assert start >= 0, start
assert (stop is None) or (stop <= self.size), stop
ofs = self.s.find(sub,self.start+start,self.stop if (stop is None) else (self.start+stop))
if ofs != -1: ofs -= self.start
return ofs
def split(self,sep=None,maxsplit=sys.maxint):
assert maxsplit > 0, maxsplit
ret = []
if sep is None: #whitespace logic
pos = [self.start,self.start] # start and stop
def eat(whitespace=False):
while (pos[1] < self.stop) and (whitespace == (ord(self.s[pos[1]])<=32)):
pos[1] += 1
def eat_whitespace():
eat(True)
pos[0] = pos[1]
eat_whitespace()
while pos[1] < self.stop:
eat()
ret.append(self.__class__(self.s,pos[0],pos[1]-pos[0]))
eat_whitespace()
if len(ret) == maxsplit:
ret.append(self.__class__(self.s,pos[1]))
break
else:
start = stop = 0
while len(ret) < maxsplit:
stop = self.find(sep,start)
if -1 == stop:
break
ret.append(self.__class__(self.s,self.start+start,stop-start))
start = stop + len(sep)
ret.append(self.__class__(self.s,self.start+start,self.size-start))
return ret
def split_str(self,sep=None,maxsplit=sys.maxint):
"if you really want strings and not views"
return [str(sub) for sub in self.split(sep,maxsplit)]
def __cmp__(self,s):
if isinstance(s,self.__class__):
return cmp(self._buf,s._buf)
assert isinstance(s,str), type(s)
return cmp(self._buf,s)
def __len__(self):
return self.size
def __str__(self):
return str(self._buf)
def __repr__(self):
return "'%s'"%self._buf
if __name__=="__main__":
test_str = " this: is: a: te:st str:ing :"
test = Envelope.StringView(test_str)
print "find('is')"
print "\t",test_str.find("is")
print "\t",test.find("is")
print "find('is',4):"
print "\t",test_str.find("is",4)
print "\t",test.find("is",4)
print "find('is',4,7):"
print "\t",test_str.find("is",4,7)
print "\t",test.find("is",4,7)
print "split():"
print "\t",test_str.split()
print "\t",test.split()
print "split(None,2):"
print "\t",test_str.split(None,2)
print "\t",test.split(None,2)
print "split(':'):"
print "\t",test_str.split(":")
print "\t",test.split(":")
print "split('x'):"
print "\t",test_str.split("x")
print "\t",test.split("x")
print "''.split('x'):"
print "\t","".split("x")
print "\t",Envelope.StringView("").split("x")

Related

How is the global variable being incremented when it's not being directly specified?

I'm working on a small project where I'm implementing a queue as a circular array. As a challenge, I was assigned to not use any list functions like append when implementing this ADT. It's assumed that I only need to resize the queue when it gets full. When I ran my code through a debugger and step through it, I find that the issue revolves around my self.read value (read pointer) which is the global variable in question. This is kind of perplexing to me since neither of my functions that would be affected increments my pointer. Could anyone shed some light on this problem for me?
This class is coded as:
class CircularQueue(object):
def __init__(self, capacity=2):
"""
Initialize the queue to be empty with a fixed capacity
:param capacity: Initial size of the queue
"""
self.capacity = capacity
self.size = 0
self.list = [0] * self.capacity
self.sum = 0
self.read = 0
self.write = 0
def __eq__(self, other):
return self.capacity == other.capacity
and self.size == other.size
and self.read == other.read
and self.write == other.write
def __str__(self):
if self.size == 0:
return "Queue is empty"
content = ""
while ((self.read + 1) % self.size) != self.write:
content = content + str(self.list[self.read]) + " -> "
self.read = (self.read + 1) % self.size
content = content[:-3]
return f"Contents: {content}"
__repr__ = __str__
The portion that I'm interested in looking at is my enqueue and resize functions:
def resize(self):
bigger = [None] * (self.capacity * 2) #create bigger queue
b_ind = 0
read_ptr = self.read
while read_ptr != (self.write + 1): #iterate through old queue to copy into new queue
bigger[b_ind] = self.list[read_ptr]
b_ind += 1
read_ptr += 1
self.capacity *= 2 #setting capacity
self.list = bigger #setting new list as queue
self.read = 0 #normalize queue
self.write = b_ind
def enqueue(self, number):
if self.size == 0: #if queue was originally empty
self.list[self.read] = number
self.write += 1
else:
self.list[self.write] = number #add onto end of queue
if ((self.write + 1) % self.capacity == self.read): #resize if queue loops back and the write pointer is the same as the read pointer
self.resize()
else:
self.write = (self.write + 1) % self.capacity #set write pointer
self.sum += number #add to sum
self.size += 1 # increment size
This was the test case I ran for my code:
queue = CircularQueue()
queue.enqueue(23)
queue.enqueue(42)
queue.enqueue(2)
queue.enqueue(195)
print(queue)
You mutate the state of your queue when printing. print() calls __str__, and that method alters your state:
self.read = (self.read + 1) % self.size
Use a local variable instead of self.read:
def __str__(self):
if self.size == 0:
return "Queue is empty"
content = ""
read = self.read
while (read % self.capacity) != self.write:
if content:
content += ' -> '
content += str(self.list[read])
read = (read + 1) % self.capacity
return f"Contents: {content}"
Note the while loop condition; you want to see if the current read position is not the same as the write position (meaning you can display the current value), not the next position, and you want to wrap round at the capacity.
I adjusted your separator handling a little to only add the arrow in between values if at least one entry has been added to content already, that avoids having to remove a portion again.
Demo using the fixed __str__ method (no other changes made):
>>> queue = CircularQueue()
>>> print(queue)
Queue is empty
>>> queue.enqueue(23)
>>> print(queue)
Contents: 23
>>> queue.enqueue(42)
>>> print(queue)
Contents: 23 -> 42
>>> queue.enqueue(2)
>>> print(queue)
Contents: 23 -> 42 -> 2
>>> queue.enqueue(195)
>>> print(queue)
Contents: 23 -> 42 -> 2 -> 195

printing stack in a new line within a class

I have a linked stack class and I'm having the problem of printing the elements in the stack, with each element in a new line. The str function in the linked stack class is printing every element in a new line which is what i wanted but it even prints an extra new line at the end.
class Node:
def __init__(self,item,the_next = None):
self.item = item
self.next = the_next
def __str__(self):
return str(self.item)
class LinkedStack:
def __init__(self):
self.top = None
self.count = 0
def __len__(self):
return self.count
def is_empty(self):
return self.count == 0
def isFull(self):
return False
def reset(self):
self.top = None
self.count = 0
def __str__(self): #im having the issue here whereby it prints a newline even after printing the last element
current = self.top
ans = ""
while not (current is None):
ans += str(current)
ans += '\n'
current = current.next
return ans
if __name__ == "__main__":
L = LinkedStack()
L.push(1)
L.push(2)
L.push(3)
print(L)
The output i get is:
3
2
1
#an extra newline printed here which is not wanted
I'm looking for a way to improvise the str function in the Linked Stack class in order to get rid of the redundant new line at the end. Any help is much appreciated.
Why not simply trim the return value of __str__ like so return ans[:-1] ? Since you always append a new line after adding an element to the string.
On a side note, it could be nicer to write your function like so:
def __str__(self):
strs = []
cur = self.top
while cur is not None:
strs = [str(cur)] + strs
cur = cur.next
return '\n'.join(strs)
You could move the addition of the newline until after you get the next item, and check it is not None before you then append it.
A further optimisation then is to move the break condition to that point:
while True:
ans += str(current)
current = current.next
if current is None:
break
ans += '\n'

Where does the difference in size come from?

I created a trie of sorts to store all the words (not definitions) in the English dictionary. The point of it was so that I can get all the words that only contain letters within a given range.
The text file containing all the words is about 2.7 mb, but after creating the tree and writing it to a file using pickle, the file is >33 mb.
Where does this difference in size come from? I thought I would be saving space by not needing to store multiple copies of the same letter for different word, e.g for the words app and apple I would only need 5 nodes, for a -> p -> p -> l -> e.
My code is as follows:
import pickle
class WordTrieNode:
def __init__(self, nodeLetter='', parentNode=None, isWordEnding=False):
self.nodeLetter = nodeLetter
self.parentNode = parentNode
self.isWordEnding = isWordEnding
self.children = [None]*26 # One entry for each lowercase letter of the alphabet
def getWord(self):
if(self.parentNode is None):
return ''
return self.parentNode.getWord() + self.nodeLetter
def isEndOfWord(self):
return self.isWordEnding
def markEndOfWord():
self.isWordEnding = True
def insertWord(self, word):
if(len(word) == 0):
return
char = word[0]
idx = ord(char) - ord('a')
if(len(word) == 1):
if(self.children[idx] is None):
node = WordTrieNode(char, self, True)
self.children[idx] = node
else:
self.children[idx].markEndOfWord()
else:
if(self.children[idx] is None):
node = WordTrieNode(char, self, False)
self.children[idx] = node
self.children[idx].insertWord(word[1:])
else:
self.children[idx].insertWord(word[1:])
def getAllWords(self):
for node in self.children:
if node is not None:
if node.isEndOfWord():
print(node.getWord())
node.getAllWords()
def getAllWordsInRange(self, low='a', high='z'):
i = ord(low) - ord('a')
j = ord(high) - ord('a')
for node in self.children[i:j+1]:
if node is not None:
if node.isEndOfWord():
print(node.getWord())
node.getAllWordsInRange(low, high)
def main():
tree = WordTrieNode("", None, False)
with open('en.txt') as file:
for line in file:
tree.insertWord(line.strip('\n'))
with open("treeout", 'wb') as output:
pickle.dump(tree, output, pickle.HIGHEST_PROTOCOL)
#tree.getAllWordsInRange('a', 'l')
#tree.getAllWords()
if __name__ == "__main__":
main()
Nodes of a trie are huge as they store a link for all possible next letters. As you can see in the code, every node holds a list of 26 links (children).
More compact schemes are possible (https://en.wikipedia.org/wiki/Trie#Compressing_tries), at the expense of more complexity and slower speed.

Python: Why is my generator based range is X2 slower than xrange?

Just of curiosity, I've written 3 tests in Python and timed them out using timeit:
import timeit
# simple range based on generator
def my_range(start, stop):
i = start
while (i < stop):
yield i
i += 1
# test regular range
def test_range():
x = range(1, 100000)
sum = 0
for i in x:
sum += i
# test xrange
def test_xrange():
x = xrange(1, 100000)
sum = 0
for i in x:
sum += i
# test my range
def test_my_range():
x = my_range(1, 100000)
sum = 0
for i in x:
sum += i
print timeit.timeit("test_range()", setup = "from __main__ import test_range", number = 100)
print timeit.timeit("test_xrange()", setup = "from __main__ import test_xrange", number = 100)
print timeit.timeit("test_my_range()", setup = "from __main__ import test_my_range", number = 100)
And I've got these benchmarks:
regular range based test - 0.616795163262
xrange based test - 0.537716731096
my_range (generator) based test - **1.27872886337**
My range was X2 slower even than a range that creates a list. Why?
Are xrange() / range() implemented using C directly?
Are they implemented without condition check?
Thanks!
I feel that the simple answer is that xrange() is builtin and written in C.
I added another case to your test (see below): A pure-Python reference implementation of xrange() based on the CPython source.
import timeit
from collections import Sequence, Iterator
from math import ceil
# simple range based on generator
def my_range(start, stop):
i = start
while (i < stop):
yield i
i += 1
# test regular range
def test_range():
x = range(1, 100000)
sum = 0
for i in x:
sum += i
# test xrange
def test_xrange():
x = xrange(1, 100000)
sum = 0
for i in x:
sum += i
# test my range
def test_my_range():
x = my_range(1, 100000)
sum = 0
for i in x:
sum += i
class pure_python_xrange(Sequence):
"""Pure-Python implementation of an ``xrange`` (aka ``range``
in Python 3) object. See `the CPython documentation
<http://docs.python.org/py3k/library/functions.html#range>`_
for details.
"""
def __init__(self, *args):
if len(args) == 1:
start, stop, step = 0, args[0], 1
elif len(args) == 2:
start, stop, step = args[0], args[1], 1
elif len(args) == 3:
start, stop, step = args
else:
raise TypeError('pure_python_xrange() requires 1-3 int arguments')
try:
start, stop, step = int(start), int(stop), int(step)
except ValueError:
raise TypeError('an integer is required')
if step == 0:
raise ValueError('pure_python_xrange() arg 3 must not be zero')
elif step < 0:
stop = min(stop, start)
else:
stop = max(stop, start)
self._start = start
self._stop = stop
self._step = step
self._len = (stop - start) // step + bool((stop - start) % step)
def __repr__(self):
if self._start == 0 and self._step == 1:
return 'pure_python_xrange(%d)' % self._stop
elif self._step == 1:
return 'pure_python_xrange(%d, %d)' % (self._start, self._stop)
return 'pure_python_xrange(%d, %d, %d)' % (self._start, self._stop, self._step)
def __eq__(self, other):
return isinstance(other, xrange) and \
self._start == other._start and \
self._stop == other._stop and \
self._step == other._step
def __len__(self):
return self._len
def index(self, value):
"""Return the 0-based position of integer `value` in
the sequence this xrange represents."""
diff = value - self._start
quotient, remainder = divmod(diff, self._step)
if remainder == 0 and 0 <= quotient < self._len:
return abs(quotient)
raise ValueError('%r is not in range' % value)
def count(self, value):
"""Return the number of ocurrences of integer `value`
in the sequence this xrange represents."""
# a value can occur exactly zero or one times
return int(value in self)
def __contains__(self, value):
"""Return ``True`` if the integer `value` occurs in
the sequence this xrange represents."""
try:
self.index(value)
return True
except ValueError:
return False
def __reversed__(self):
"""Return an xrange which represents a sequence whose
contents are the same as the sequence this xrange
represents, but in the opposite order."""
sign = self._step / abs(self._step)
last = self._start + ((self._len - 1) * self._step)
return pure_python_xrange(last, self._start - sign, -1 * self._step)
def __getitem__(self, index):
"""Return the element at position ``index`` in the sequence
this xrange represents, or raise :class:`IndexError` if the
position is out of range."""
if isinstance(index, slice):
return self.__getitem_slice(index)
if index < 0:
# negative indexes access from the end
index = self._len + index
if index < 0 or index >= self._len:
raise IndexError('xrange object index out of range')
return self._start + index * self._step
def __getitem_slice(self, slce):
"""Return an xrange which represents the requested slce
of the sequence represented by this xrange.
"""
start, stop, step = slce.start, slce.stop, slce.step
if step == 0:
raise ValueError('slice step cannot be 0')
start = start or self._start
stop = stop or self._stop
if start < 0:
start = max(0, start + self._len)
if stop < 0:
stop = max(start, stop + self._len)
if step is None or step > 0:
return pure_python_xrange(start, stop, step or 1)
else:
rv = reversed(self)
rv._step = step
return rv
def __iter__(self):
"""Return an iterator which enumerates the elements of the
sequence this xrange represents."""
return xrangeiterator(self)
class xrangeiterator(Iterator):
"""An iterator for an :class:`xrange`.
"""
def __init__(self, xrangeobj):
self._xrange = xrangeobj
# Intialize the "last outputted value" to the value
# just before the first value; this simplifies next()
self._last = self._xrange._start - self._xrange._step
self._count = 0
def __iter__(self):
"""An iterator is already an iterator, so return ``self``.
"""
return self
def next(self):
"""Return the next element in the sequence represented
by the xrange we are iterating, or raise StopIteration
if we have passed the end of the sequence."""
self._last += self._xrange._step
self._count += 1
if self._count > self._xrange._len:
raise StopIteration()
return self._last
# test xrange
def test_pure_python_xrange():
x = pure_python_xrange(1, 100000)
sum = 0
for i in x:
sum += i
print timeit.timeit("test_range()", setup = "from __main__ import test_range", number = 100)
print timeit.timeit("test_xrange()", setup = "from __main__ import test_xrange", number = 100)
print timeit.timeit("test_my_range()", setup = "from __main__ import test_my_range", number = 100)
print timeit.timeit("test_pure_python_xrange()", setup = "from __main__ import test_pure_python_xrange", number = 100)
The results?
$ python so.py
0.426695823669
0.371111869812
0.964643001556
6.06390094757
This is simply the difference between interpreted Python code and C. Additionally, as #byels mentioned above, xrange() is limited to short integers, which likely has positive effect.
This is an interesting test. Looking at the python 2 docs on xrange, one guess that comes to mind is that xrange is alowed to take advantage of type restrictions (only uses "short" integers)

for loop iterates into a NoneType only during a __str__ statement?

So I'm working in Python trying to create a ShapeSet instance that contains a list of Shape instances and I need it to print out the list of Shape instances.
I can use the for loop in other parts of the code without running into an error. However, when I attempt a print statement it prints out the whole list and at the end results in error:
__str__ returned non-string (type NoneType)
I don't understand why it fails to understand to stop at the end of the list here. (At least that's what I think it's doing).
Any help is greatly appreciated.
class ShapeSet:
def __init__(self):
"""
Initialize any needed variables
"""
self.collect = []
self.place = None
def __iter__(self):
"""
Return an iterator that allows you to iterate over the set of
shapes, one shape at a time
"""
self.place = 0
return self
def next(self):
if self.place >= len(self.collect):
raise StopIteration
self.place = self.place + 1
return self.collect[self.place-1]
def addShape(self, sh):
"""
Add shape sh to the set; no two shapes in the set may be
identical
sh: shape to be added
"""
s_count = 0
c_count = 0
t_count = 0
self.collect.append(sh)
for i in self.collect:
if type(sh) == Square and type(i) == Square:
if sh.side == i.side:
s_count = s_count + 1
if s_count == 2:
self.collect.remove(sh)
print('already there')
if type(sh) == Circle and type(i) == Circle:
if sh.radius == i.radius:
c_count = c_count + 1
if c_count == 2:
self.collect.remove(sh)
print('already there')
if type(sh) == Triangle and type(i) == Triangle:
if sh.base == i.base and sh.height == i.height:
t_count = t_count + 1
if t_count == 2:
self.collect.remove(sh)
print('already there')
def __str__(self):
"""
Return the string representation for a set, which consists of
the string representation of each shape, categorized by type
(circles, then squares, then triangles)
"""
for i in self.collect:
if type(i) == Square:
print ('Square with measurements ' + str(i.side))
if type(i) == Circle:
print ('Circle with measurements ' + str(i.radius))
if type(i) == Triangle:
print ('Triangle with measurements, base/height ' + str(i.base)+ ' ' + str(i.height))
Read the docstring in your __str__ function. You're suppose to "return the string representation" not print it. Since there is no return statement in the __str__ function, it returns None, which print chokes on.
Instead, actually return the desired string, and let the external print call display it:
def __str__(self):
"""
Return the string representation for a set, which consists of
the string representation of each shape, categorized by type
(circles, then squares, then triangles)
"""
strings = []
for i in self.collect:
if type(i) == Square:
strings.append('Square with measurements ' + str(i.side))
if type(i) == Circle:
strings.append('Circle with measurements ' + str(i.radius))
if type(i) == Triangle:
strings.append('Triangle with measurements, base/height ' + str(i.base)+ ' ' + str(i.height))
return '\n'.join(strings)
You wrote
def __str__(self):
"""
**Return** the string representation for a set, which consists of
the string representation of each shape, categorized by type
(circles, then squares, then triangles)
"""
but you don't return anything - you just print stuff.
Put a appropriate __str__ method on all your classes:
class Square:
def __str__(self):
return 'Square with measurements ' + str(i.side)
class Circle:
def __str__(self):
return 'Circle with measurements ' + str(i.radius)
# and so on
and a representation for your ShapeSet:
class ShapeSet:
def __str__(self):
return '\n'.join(str(x) for x in self.collect)
Now you can print(some_shapeset) as well as print(some_circle).
You could also do whatever you like within the str method, itterate, print outputs, more logic, etc, as long as at the end, you return a string i.e. return "" just to satisfy the requirement.
In Your case:
def __str__(self):
"""
Return the string representation for a set, which consists of
the string representation of each shape, categorized by type
(circles, then squares, then triangles)
"""
for i in self.collect:
if type(i) == Square:
print ('Square with measurements ' + str(i.side))
if type(i) == Circle:
print ('Circle with measurements ' + str(i.radius))
if type(i) == Triangle:
print ('Triangle with measurements, base/height ' + str(i.base)+ ' ' + str(i.height))
return ""

Categories