I have a Python script which takes as input a list of integers, which I need to work with four integers at a time. Unfortunately, I don't have control of the input, or I'd have it passed in as a list of four-element tuples. Currently, I'm iterating over it this way:
for i in range(0, len(ints), 4):
    # dummy op for example code
    foo += ints[i] * ints[i + 1] + ints[i + 2] * ints[i + 3]
It looks a lot like "C-think", though, which makes me suspect there's a more pythonic way of dealing with this situation. The list is discarded after iterating, so it needn't be preserved. Perhaps something like this would be better?
while ints:
    foo += ints[0] * ints[1] + ints[2] * ints[3]
    ints[0:4] = []
Still doesn't quite "feel" right, though. :-/
Related question: How do you split a list into evenly sized chunks in Python?
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
Works with any sequence:
text = "I am a very, very helpful text"
for group in chunker(text, 7):
    print(repr(group), end=' ')
# 'I am a ' 'very, v' 'ery hel' 'pful te' 'xt'
print('|'.join(chunker(text, 10)))
# I am a ver|y, very he|lpful text
animals = ['cat', 'dog', 'rabbit', 'duck', 'bird', 'cow', 'gnu', 'fish']
for group in chunker(animals, 3):
    print(group)
# ['cat', 'dog', 'rabbit']
# ['duck', 'bird', 'cow']
# ['gnu', 'fish']
Modified from the Recipes section of Python's itertools docs:
from itertools import zip_longest
def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
Example
grouper('ABCDEFG', 3, 'x') # --> 'ABC' 'DEF' 'Gxx'
Note: on Python 2 use izip_longest instead of zip_longest.
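The recipe works because every one of the n arguments passed to zip_longest is the same iterator object, so each output tuple pulls n consecutive items. A minimal demonstration of that sharing, together with itertools.batched, which Python 3.12 eventually added to the standard library for exactly this job:
it = iter('ABCDEFG')
print(list(zip(it, it, it)))  # three references to one iterator
# [('A', 'B', 'C'), ('D', 'E', 'F')] -- plain zip drops the short tail

from itertools import batched  # Python 3.12+
print(list(batched('ABCDEFG', 3)))
# [('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]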
chunk_size = 4
for i in range(0, len(ints), chunk_size):
    chunk = ints[i:i+chunk_size]
    # process chunk of size <= chunk_size
import itertools
def chunks(iterable, size):
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, size))
# though this will throw ValueError if the length of ints
# isn't a multiple of four:
for x1, x2, x3, x4 in chunks(ints, 4):
    foo += x1 + x2 + x3 + x4

for chunk in chunks(ints, 4):
    foo += sum(chunk)
Another way:
import itertools
def chunks2(iterable, size, filler=None):
    it = itertools.chain(iterable, itertools.repeat(filler, size - 1))
    chunk = tuple(itertools.islice(it, size))
    while len(chunk) == size:
        yield chunk
        chunk = tuple(itertools.islice(it, size))
# x2, x3 and x4 could get the value 0 if the length is not
# a multiple of 4.
for x1, x2, x3, x4 in chunks2(ints, 4, 0):
    foo += x1 + x2 + x3 + x4
If you don't mind using an external package, you could use iteration_utilities.grouper from iteration_utilities¹. It supports all iterables (not just sequences):
from iteration_utilities import grouper
seq = list(range(20))
for group in grouper(seq, 4):
    print(group)
which prints:
(0, 1, 2, 3)
(4, 5, 6, 7)
(8, 9, 10, 11)
(12, 13, 14, 15)
(16, 17, 18, 19)
If the length isn't a multiple of the group size, it also supports filling the incomplete last group or truncating (discarding) it:
from iteration_utilities import grouper
seq = list(range(17))
for group in grouper(seq, 4):
    print(group)
# (0, 1, 2, 3)
# (4, 5, 6, 7)
# (8, 9, 10, 11)
# (12, 13, 14, 15)
# (16,)
for group in grouper(seq, 4, fillvalue=None):
    print(group)
# (0, 1, 2, 3)
# (4, 5, 6, 7)
# (8, 9, 10, 11)
# (12, 13, 14, 15)
# (16, None, None, None)
for group in grouper(seq, 4, truncate=True):
    print(group)
# (0, 1, 2, 3)
# (4, 5, 6, 7)
# (8, 9, 10, 11)
# (12, 13, 14, 15)
Benchmarks
I also decided to compare the run times of a few of the mentioned approaches: a log-log plot of run time over lists of varying size, grouped into chunks of 10 elements. For qualitative results: lower means faster.
At least in this benchmark, iteration_utilities.grouper performs best, followed by Craz's approach.
The benchmark was created with simple_benchmark¹. The code used to run it was:
import iteration_utilities
import itertools
from itertools import zip_longest
def consume_all(it):
    return iteration_utilities.consume(it, None)
import simple_benchmark
b = simple_benchmark.BenchmarkBuilder()
@b.add_function()
def grouper(l, n):
    return consume_all(iteration_utilities.grouper(l, n))

def Craz_inner(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

@b.add_function()
def Craz(iterable, n, fillvalue=None):
    return consume_all(Craz_inner(iterable, n, fillvalue))

def nosklo_inner(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

@b.add_function()
def nosklo(seq, size):
    return consume_all(nosklo_inner(seq, size))

def SLott_inner(ints, chunk_size):
    for i in range(0, len(ints), chunk_size):
        yield ints[i:i+chunk_size]

@b.add_function()
def SLott(ints, chunk_size):
    return consume_all(SLott_inner(ints, chunk_size))

def MarkusJarderot1_inner(iterable, size):
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, size))

@b.add_function()
def MarkusJarderot1(iterable, size):
    return consume_all(MarkusJarderot1_inner(iterable, size))

def MarkusJarderot2_inner(iterable, size, filler=None):
    it = itertools.chain(iterable, itertools.repeat(filler, size - 1))
    chunk = tuple(itertools.islice(it, size))
    while len(chunk) == size:
        yield chunk
        chunk = tuple(itertools.islice(it, size))

@b.add_function()
def MarkusJarderot2(iterable, size):
    return consume_all(MarkusJarderot2_inner(iterable, size))

@b.add_arguments()
def argument_provider():
    for exp in range(2, 20):
        size = 2**exp
        yield size, simple_benchmark.MultiArgument([[0] * size, 10])
r = b.run()
¹ Disclaimer: I'm the author of the libraries iteration_utilities and simple_benchmark.
With Python 3.8 you can use the walrus operator and itertools.islice.
from itertools import islice
list_ = [i for i in range(10, 100)]
def chunker(it, size):
    iterator = iter(it)
    while chunk := list(islice(iterator, size)):
        print(chunk)
In [2]: chunker(list_, 10)
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
[50, 51, 52, 53, 54, 55, 56, 57, 58, 59]
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69]
[70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
[80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
[90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
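Since that chunker prints its chunks rather than returning them, a yielding variant may be more reusable; a minimal sketch of the same walrus-plus-islice pattern as a generator:
from itertools import islice

def chunker(it, size):
    iterator = iter(it)
    # yield each chunk instead of printing it
    while chunk := list(islice(iterator, size)):
        yield chunk

for chunk in chunker(range(10, 100), 30):
    print(chunk)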
I needed a solution that would also work with sets and generators. I couldn't come up with anything very short and pretty, but it's quite readable at least.
def chunker(seq, size):
    res = []
    for el in seq:
        res.append(el)
        if len(res) == size:
            yield res
            res = []
    if res:
        yield res
List:
>>> list(chunker([i for i in range(10)], 3))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
Set:
>>> list(chunker(set([i for i in range(10)]), 3))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
Generator:
>>> list(chunker((i for i in range(10)), 3))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
The more-itertools package has a chunked function which does exactly that:
import more_itertools
for s in more_itertools.chunked(range(9), 4):
    print(s)
Prints
[0, 1, 2, 3]
[4, 5, 6, 7]
[8]
chunked returns the items in a list. If you'd prefer iterables, use ichunked.
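A minimal sketch of the lazy variant: ichunked yields sub-iterables instead of lists, so a chunk is only materialized when you consume it:
import more_itertools

for s in more_itertools.ichunked(range(9), 4):
    print(list(s))  # each s is itself a lazy iterable
# [0, 1, 2, 3]
# [4, 5, 6, 7]
# [8]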
The ideal solution for this problem works with iterators (not just sequences). It should also be fast.
This is the solution provided by the documentation for itertools:
def grouper(n, iterable, fillvalue=None):
    # grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return itertools.izip_longest(fillvalue=fillvalue, *args)
Using IPython's %timeit on my MacBook Air, I get 47.5 µs per loop.
However, this really doesn't work for me, since the results are padded into even-sized groups. A solution without the padding is slightly more complicated. The most naive solution might be:
def grouper(size, iterable):
    i = iter(iterable)
    while True:
        out = []
        try:
            for _ in range(size):
                out.append(next(i))
        except StopIteration:
            yield out
            break
        yield out
Simple, but pretty slow: 693 µs per loop.
The best solution I could come up with uses islice for the inner loop:
def grouper(size, iterable):
    it = iter(iterable)
    while True:
        group = tuple(itertools.islice(it, None, size))
        if not group:
            break
        yield group
With the same dataset, I get 305 µs per loop.
Unable to get a pure solution any faster than that, I provide the following solution with an important caveat: if your input data contains instances of fillvalue, you could get a wrong answer.
def grouper(n, iterable, fillvalue=None):
    # grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    # itertools.zip_longest on Python 3
    for x in itertools.izip_longest(*args, fillvalue=fillvalue):
        if x[-1] is fillvalue:
            yield tuple(v for v in x if v is not fillvalue)
        else:
            yield x
I really don't like this answer, but it is significantly faster: 124 µs per loop.
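To make the caveat concrete, here is the same filtering grouper rendered for Python 3 (zip_longest instead of izip_longest): a genuine fillvalue sitting at the end of a group is silently dropped along with the padding:
import itertools

def grouper(n, iterable, fillvalue=None):
    args = [iter(iterable)] * n
    for x in itertools.zip_longest(*args, fillvalue=fillvalue):
        if x[-1] is fillvalue:
            # strips real None values too, not just padding
            yield tuple(v for v in x if v is not fillvalue)
        else:
            yield x

print(list(grouper(3, [1, 2, None, 3])))
# [(1, 2), (3,)] -- the None that was real data has vanished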
from itertools import izip_longest
def chunker(iterable, chunksize, filler):
    return izip_longest(*[iter(iterable)]*chunksize, fillvalue=filler)
Similar to other proposals, but not exactly identical, I like doing it this way because it's simple and easy to read:
it = iter([1, 2, 3, 4, 5, 6, 7, 8, 9])
for chunk in zip(it, it, it, it):
    print(chunk)

(1, 2, 3, 4)
(5, 6, 7, 8)
This way you won't get the last partial chunk. If you want to get (9, None, None, None) as the last chunk, just use zip_longest (izip_longest on Python 2) from itertools.
Since nobody's mentioned it yet, here's a zip() solution:
>>> def chunker(iterable, chunksize):
... return zip(*[iter(iterable)]*chunksize)
It works only if your sequence's length is always divisible by the chunk size or you don't care about a trailing chunk if it isn't.
Example:
>>> s = '1234567890'
>>> chunker(s, 3)
[('1', '2', '3'), ('4', '5', '6'), ('7', '8', '9')]
>>> chunker(s, 4)
[('1', '2', '3', '4'), ('5', '6', '7', '8')]
>>> chunker(s, 5)
[('1', '2', '3', '4', '5'), ('6', '7', '8', '9', '0')]
Or using itertools.izip to return an iterator instead of a list:
>>> from itertools import izip
>>> def chunker(iterable, chunksize):
... return izip(*[iter(iterable)]*chunksize)
Padding can be fixed using ΤΖΩΤΖΙΟΥ's answer:
>>> from itertools import chain, izip, repeat
>>> def chunker(iterable, chunksize, fillvalue=None):
... it = chain(iterable, repeat(fillvalue, chunksize-1))
... args = [it] * chunksize
... return izip(*args)
Another approach would be to use the two-argument form of iter:
from itertools import islice
def group(it, size):
    it = iter(it)
    return iter(lambda: tuple(islice(it, size)), ())
This can be adapted easily to use padding (this is similar to Markus Jarderot’s answer):
from itertools import islice, chain, repeat
def group_pad(it, size, pad=None):
    it = chain(iter(it), repeat(pad))
    return iter(lambda: tuple(islice(it, size)), (pad,) * size)
These can even be combined for optional padding:
_no_pad = object()
def group(it, size, pad=_no_pad):
    if pad is _no_pad:
        it = iter(it)
        sentinel = ()
    else:
        it = chain(iter(it), repeat(pad))
        sentinel = (pad,) * size
    return iter(lambda: tuple(islice(it, size)), sentinel)
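All three variants lean on the two-argument form iter(callable, sentinel), which calls callable repeatedly and stops as soon as it returns sentinel; a minimal demonstration:
from itertools import islice

it = iter(range(10))
read = lambda: tuple(islice(it, 4))
print(list(iter(read, ())))  # calls read() until it returns the empty tuple
# [(0, 1, 2, 3), (4, 5, 6, 7), (8, 9)]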
If the list is large, the highest-performing way to do this will be to use a generator:
def get_chunk(iterable, chunk_size):
    result = []
    for item in iterable:
        result.append(item)
        if len(result) == chunk_size:
            yield tuple(result)
            result = []
    if len(result) > 0:
        yield tuple(result)
for x in get_chunk([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3):
    print(x)
(1, 2, 3)
(4, 5, 6)
(7, 8, 9)
(10,)
Using little functions and things really doesn't appeal to me; I prefer to just use slices:
data = [...]
chunk_size = 10000 # or whatever
chunks = [data[i:i+chunk_size] for i in xrange(0,len(data),chunk_size)]
for chunk in chunks:
    ...
Using map() instead of zip() fixes the padding issue in J.F. Sebastian's answer (Python 2 only; in Python 3, map() no longer accepts None as the function):
>>> def chunker(iterable, chunksize):
... return map(None,*[iter(iterable)]*chunksize)
Example:
>>> s = '1234567890'
>>> chunker(s, 3)
[('1', '2', '3'), ('4', '5', '6'), ('7', '8', '9'), ('0', None, None)]
>>> chunker(s, 4)
[('1', '2', '3', '4'), ('5', '6', '7', '8'), ('9', '0', None, None)]
>>> chunker(s, 5)
[('1', '2', '3', '4', '5'), ('6', '7', '8', '9', '0')]
A one-liner, ad hoc solution to iterate over a list x in chunks of size 4:
for a, b, c, d in zip(x[0::4], x[1::4], x[2::4], x[3::4]):
    ... do something with a, b, c and d ...
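Note that this builds four sliced copies of the list and, like plain zip(), drops a trailing partial chunk; for example:
x = list(range(10))
for a, b, c, d in zip(x[0::4], x[1::4], x[2::4], x[3::4]):
    print(a, b, c, d)
# 0 1 2 3
# 4 5 6 7
# 8 and 9 are dropped: zip stops at the shortest slice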
To avoid all conversions to a list, import itertools and:
>>> for k, g in itertools.groupby(xrange(35), lambda x: x/10):
...     print k, list(g)
...
Produces:
0 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
1 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
2 [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
3 [30, 31, 32, 33, 34]
I checked groupby, and it doesn't convert to a list or use len(), so I think this will delay resolution of each value until it is actually used. Sadly, none of the available answers (at this time) seemed to offer this variation.
Obviously if you need to handle each item in turn nest a for loop over g:
for k, g in itertools.groupby(xrange(35), lambda x: x/10):
    for i in g:
        # do what you need to do with individual items
        pass
    # now do what you need to do with the whole group
My specific interest in this was the need to consume a generator to submit changes in batches of up to 1000 to the Gmail API:
messages = a_generator_which_would_not_be_smart_as_a_list
for idx, batch in groupby(messages, lambda x: x/1000):
    batch_request = BatchHttpRequest()
    for message in batch:
        batch_request.add(self.service.users().messages().modify(userId='me', id=message['id'], body=msg_labels))
    http = httplib2.Http()
    self.credentials.authorize(http)
    batch_request.execute(http=http)
Unless I missed something, the following simple solution with generator expressions has not been mentioned. It assumes that both the size and the number of chunks are known (which is often the case) and that no padding is required:
def chunks(it, n, m):
    """Make an iterator over the first m chunks of size n.
    """
    it = iter(it)
    # Chunks are presented as tuples.
    return (tuple(next(it) for _ in range(n)) for _ in range(m))
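A quick check under those assumptions (n times m must not exceed what the iterable can supply):
>>> list(chunks(range(10), 3, 2))
[(0, 1, 2), (3, 4, 5)]
>>> list(chunks('ABCDEF', 2, 3))
[('A', 'B'), ('C', 'D'), ('E', 'F')]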
In your second method, I would advance to the next group of 4 by doing this:
ints = ints[4:]
However, I haven't done any performance measurement so I don't know which one might be more efficient.
Having said that, I would usually choose the first method. It's not pretty, but that's often a consequence of interfacing with the outside world.
With NumPy it's simple:
import numpy as np

ints = np.array([1, 2, 3, 4, 5, 6, 7, 8])
for int1, int2 in ints.reshape(-1, 2):
    print(int1, int2)
output:
1 2
3 4
5 6
7 8
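The same idea covers the original groups-of-four case (reusing the np import above), with one caveat: reshape raises a ValueError unless the array's length is an exact multiple of the row width:
ints = np.array([1, 2, 3, 4, 5, 6, 7, 8])
for x1, x2, x3, x4 in ints.reshape(-1, 4):
    print(x1, x2, x3, x4)
# 1 2 3 4
# 5 6 7 8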
I never want my chunks padded, so that requirement is essential. I find that the ability to work on any iterable is also a requirement. Given that, I decided to extend the accepted answer, https://stackoverflow.com/a/434411/1074659.
Performance takes a slight hit in this approach if padding is not wanted, due to the need to compare and filter the padded values. However, for large chunk sizes this utility is very performant.
#!/usr/bin/env python3
from itertools import zip_longest
_UNDEFINED = object()
def chunker(iterable, chunksize, fillvalue=_UNDEFINED):
    """
    Collect data into chunks and optionally pad it.

    Performance worsens as `chunksize` approaches 1.

    Inspired by:
        https://docs.python.org/3/library/itertools.html#itertools-recipes
    """
    args = [iter(iterable)] * chunksize
    chunks = zip_longest(*args, fillvalue=fillvalue)
    yield from (
        filter(lambda val: val is not _UNDEFINED, chunk)
        if chunk[-1] is _UNDEFINED
        else chunk
        for chunk in chunks
    ) if fillvalue is _UNDEFINED else chunks
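One thing to watch: in the unpadded case, a chunk that had padding stripped arrives as a lazy filter object rather than a tuple, so materialize each chunk if you need the values more than once:
>>> [tuple(chunk) for chunk in chunker(range(5), 2)]
[(0, 1), (2, 3), (4,)]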
def chunker(iterable, n):
    """Yield iterable in chunk sizes.

    >>> chunks = chunker('ABCDEF', n=4)
    >>> next(chunks)
    ['A', 'B', 'C', 'D']
    >>> next(chunks)
    ['E', 'F']
    """
    it = iter(iterable)
    while True:
        chunk = []
        for i in range(n):
            try:
                chunk.append(next(it))
            except StopIteration:
                if chunk:
                    yield chunk
                # return ends the generator; raising StopIteration here
                # has been an error since Python 3.7 (PEP 479)
                return
        yield chunk
if __name__ == '__main__':
    import doctest
    doctest.testmod()
Yet another answer, the advantages of which are:
1) Easily understandable
2) Works on any iterable, not just sequences (some of the above answers will choke on filehandles)
3) Does not load the chunk into memory all at once
4) Does not make a chunk-long list of references to the same iterator in memory
5) No padding of fill values at the end of the list
That being said, I haven't timed it so it might be slower than some of the more clever methods, and some of the advantages may be irrelevant given the use case.
def chunkiter(iterable, size):
    def inneriter(first, iterator, size):
        yield first
        for _ in xrange(size - 1):
            yield iterator.next()
    it = iter(iterable)
    while True:
        yield inneriter(it.next(), it, size)
In [2]: i = chunkiter('abcdefgh', 3)
In [3]: for ii in i:
   ...:     for c in ii:
   ...:         print c,
   ...:     print ''
   ...:
a b c
d e f
g h
Update:
A couple of drawbacks due to the fact that the inner and outer loops are pulling values from the same iterator:
1) continue doesn't work as expected in the outer loop - it just continues on to the next item rather than skipping a chunk. However, this doesn't seem like a problem as there's nothing to test in the outer loop.
2) break doesn't work as expected in the inner loop - control will wind up in the inner loop again with the next item in the iterator. To skip whole chunks, either wrap the inner iterator (ii above) in a tuple, e.g. for c in tuple(ii), or set a flag and exhaust the iterator.
def group_by(iterable, size):
    """Group an iterable into lists that don't exceed the size given.

    >>> list(group_by([1,2,3,4,5], 2))
    [[1, 2], [3, 4], [5]]
    """
    sublist = []
    for index, item in enumerate(iterable):
        if index > 0 and index % size == 0:
            yield sublist
            sublist = []
        sublist.append(item)
    if sublist:
        yield sublist
You can use the partition or chunks functions from the funcy library:
from funcy import partition
for a, b, c, d in partition(4, ints):
    foo += a * b * c * d
These functions also have iterator versions ipartition and ichunks, which will be more efficient in this case.
You can also peek at their implementation.
About the solution given by J.F. Sebastian above:
def chunker(iterable, chunksize):
    return zip(*[iter(iterable)]*chunksize)
It's clever, but it has one disadvantage: it always returns tuples. How do you get strings instead?
Of course you can write ''.join(chunker(...)), but the temporary tuples get constructed anyway.
You can get rid of the temporary tuples by writing your own zip, like this:
class IteratorExhausted(Exception):
    pass

def translate_StopIteration(iterable, to=IteratorExhausted):
    for i in iterable:
        yield i
    raise to  # StopIteration would get ignored because this is a generator,
              # but a custom exception can leave the generator.

def custom_zip(*iterables, reductor=tuple):
    iterators = tuple(map(translate_StopIteration, iterables))
    while True:
        try:
            yield reductor(next(i) for i in iterators)
        except IteratorExhausted:  # when any of the iterators gets exhausted
            break
Then
def chunker(data, size, reductor=tuple):
    return custom_zip(*[iter(data)]*size, reductor=reductor)
Example usage:
>>> for i in chunker('12345', 2):
... print(repr(i))
...
('1', '2')
('3', '4')
>>> for i in chunker('12345', 2, ''.join):
... print(repr(i))
...
'12'
'34'
I like this approach. It feels simple and not magical, supports all iterable types, and doesn't require imports.
def chunk_iter(iterable, chunk_size):
    it = iter(iterable)
    while True:
        # a plain loop: next() raising StopIteration inside a generator
        # expression is an error since Python 3.7 (PEP 479)
        chunk = []
        for _ in range(chunk_size):
            try:
                chunk.append(next(it))
            except StopIteration:
                break
        if not chunk:
            return
        yield tuple(chunk)
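For example (note that the trailing partial chunk is kept):
>>> list(chunk_iter('abcdefg', 3))
[('a', 'b', 'c'), ('d', 'e', 'f'), ('g',)]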
Quite Pythonic here (you may also inline the body of the split_groups function):
import itertools
def split_groups(iter_in, group_size):
    return ((x for _, x in item)
            for _, item in itertools.groupby(enumerate(iter_in),
                                             key=lambda x: x[0] // group_size))
for x, y, z, w in split_groups(range(16), 4):
    foo += x * y + z * w
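One caveat inherited from itertools.groupby: each inner group shares the underlying iterator, so consume the groups in order, materializing them if you need them later:
>>> [tuple(group) for group in split_groups('abcdefgh', 3)]
[('a', 'b', 'c'), ('d', 'e', 'f'), ('g', 'h')]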
Here is a chunker without imports that supports generators:
def chunks(seq, size):
    it = iter(seq)
    while True:
        # a plain loop: raising StopIteration inside a generator
        # has been an error since Python 3.7 (PEP 479)
        ret = []
        for _ in range(size):
            try:
                ret.append(next(it))
            except StopIteration:
                return  # drops a trailing partial chunk
        yield tuple(ret)
Example of use:
>>> def foo():
...     i = 0
...     while True:
...         i += 1
...         yield i
...
>>> c = chunks(foo(), 3)
>>> next(c)
(1, 2, 3)
>>> next(c)
(4, 5, 6)
>>> list(chunks('abcdefg', 2))
[('a', 'b'), ('c', 'd'), ('e', 'f')]
I have a Python script which takes as input a list of integers, which I need to work with four integers at a time. Unfortunately, I don't have control of the input, or I'd have it passed in as a list of four-element tuples. Currently, I'm iterating over it this way:
for i in range(0, len(ints), 4):
# dummy op for example code
foo += ints[i] * ints[i + 1] + ints[i + 2] * ints[i + 3]
It looks a lot like "C-think", though, which makes me suspect there's a more pythonic way of dealing with this situation. The list is discarded after iterating, so it needn't be preserved. Perhaps something like this would be better?
while ints:
foo += ints[0] * ints[1] + ints[2] * ints[3]
ints[0:4] = []
Still doesn't quite "feel" right, though. :-/
Related question: How do you split a list into evenly sized chunks in Python?
def chunker(seq, size):
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
Works with any sequence:
text = "I am a very, very helpful text"
for group in chunker(text, 7):
print(repr(group),)
# 'I am a ' 'very, v' 'ery hel' 'pful te' 'xt'
print('|'.join(chunker(text, 10)))
# I am a ver|y, very he|lpful text
animals = ['cat', 'dog', 'rabbit', 'duck', 'bird', 'cow', 'gnu', 'fish']
for group in chunker(animals, 3):
print(group)
# ['cat', 'dog', 'rabbit']
# ['duck', 'bird', 'cow']
# ['gnu', 'fish']
Modified from the Recipes section of Python's itertools docs:
from itertools import zip_longest
def grouper(iterable, n, fillvalue=None):
args = [iter(iterable)] * n
return zip_longest(*args, fillvalue=fillvalue)
Example
grouper('ABCDEFG', 3, 'x') # --> 'ABC' 'DEF' 'Gxx'
Note: on Python 2 use izip_longest instead of zip_longest.
chunk_size = 4
for i in range(0, len(ints), chunk_size):
chunk = ints[i:i+chunk_size]
# process chunk of size <= chunk_size
import itertools
def chunks(iterable,size):
it = iter(iterable)
chunk = tuple(itertools.islice(it,size))
while chunk:
yield chunk
chunk = tuple(itertools.islice(it,size))
# though this will throw ValueError if the length of ints
# isn't a multiple of four:
for x1,x2,x3,x4 in chunks(ints,4):
foo += x1 + x2 + x3 + x4
for chunk in chunks(ints,4):
foo += sum(chunk)
Another way:
import itertools
def chunks2(iterable,size,filler=None):
it = itertools.chain(iterable,itertools.repeat(filler,size-1))
chunk = tuple(itertools.islice(it,size))
while len(chunk) == size:
yield chunk
chunk = tuple(itertools.islice(it,size))
# x2, x3 and x4 could get the value 0 if the length is not
# a multiple of 4.
for x1,x2,x3,x4 in chunks2(ints,4,0):
foo += x1 + x2 + x3 + x4
If you don't mind using an external package you could use iteration_utilities.grouper from iteration_utilties 1. It supports all iterables (not just sequences):
from iteration_utilities import grouper
seq = list(range(20))
for group in grouper(seq, 4):
print(group)
which prints:
(0, 1, 2, 3)
(4, 5, 6, 7)
(8, 9, 10, 11)
(12, 13, 14, 15)
(16, 17, 18, 19)
In case the length isn't a multiple of the groupsize it also supports filling (the incomplete last group) or truncating (discarding the incomplete last group) the last one:
from iteration_utilities import grouper
seq = list(range(17))
for group in grouper(seq, 4):
print(group)
# (0, 1, 2, 3)
# (4, 5, 6, 7)
# (8, 9, 10, 11)
# (12, 13, 14, 15)
# (16,)
for group in grouper(seq, 4, fillvalue=None):
print(group)
# (0, 1, 2, 3)
# (4, 5, 6, 7)
# (8, 9, 10, 11)
# (12, 13, 14, 15)
# (16, None, None, None)
for group in grouper(seq, 4, truncate=True):
print(group)
# (0, 1, 2, 3)
# (4, 5, 6, 7)
# (8, 9, 10, 11)
# (12, 13, 14, 15)
Benchmarks
I also decided to compare the run-time of a few of the mentioned approaches. It's a log-log plot grouping into groups of "10" elements based on a list of varying size. For qualitative results: Lower means faster:
At least in this benchmark the iteration_utilities.grouper performs best. Followed by the approach of Craz.
The benchmark was created with simple_benchmark1. The code used to run this benchmark was:
import iteration_utilities
import itertools
from itertools import zip_longest
def consume_all(it):
return iteration_utilities.consume(it, None)
import simple_benchmark
b = simple_benchmark.BenchmarkBuilder()
#b.add_function()
def grouper(l, n):
return consume_all(iteration_utilities.grouper(l, n))
def Craz_inner(iterable, n, fillvalue=None):
args = [iter(iterable)] * n
return zip_longest(*args, fillvalue=fillvalue)
#b.add_function()
def Craz(iterable, n, fillvalue=None):
return consume_all(Craz_inner(iterable, n, fillvalue))
def nosklo_inner(seq, size):
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
#b.add_function()
def nosklo(seq, size):
return consume_all(nosklo_inner(seq, size))
def SLott_inner(ints, chunk_size):
for i in range(0, len(ints), chunk_size):
yield ints[i:i+chunk_size]
#b.add_function()
def SLott(ints, chunk_size):
return consume_all(SLott_inner(ints, chunk_size))
def MarkusJarderot1_inner(iterable,size):
it = iter(iterable)
chunk = tuple(itertools.islice(it,size))
while chunk:
yield chunk
chunk = tuple(itertools.islice(it,size))
#b.add_function()
def MarkusJarderot1(iterable,size):
return consume_all(MarkusJarderot1_inner(iterable,size))
def MarkusJarderot2_inner(iterable,size,filler=None):
it = itertools.chain(iterable,itertools.repeat(filler,size-1))
chunk = tuple(itertools.islice(it,size))
while len(chunk) == size:
yield chunk
chunk = tuple(itertools.islice(it,size))
#b.add_function()
def MarkusJarderot2(iterable,size):
return consume_all(MarkusJarderot2_inner(iterable,size))
#b.add_arguments()
def argument_provider():
for exp in range(2, 20):
size = 2**exp
yield size, simple_benchmark.MultiArgument([[0] * size, 10])
r = b.run()
1 Disclaimer: I'm the author of the libraries iteration_utilities and simple_benchmark.
With Python 3.8 you can use the walrus operator and itertools.islice.
from itertools import islice
list_ = [i for i in range(10, 100)]
def chunker(it, size):
iterator = iter(it)
while chunk := list(islice(iterator, size)):
print(chunk)
In [2]: chunker(list_, 10)
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
[50, 51, 52, 53, 54, 55, 56, 57, 58, 59]
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69]
[70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
[80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
[90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
I needed a solution that would also work with sets and generators. I couldn't come up with anything very short and pretty, but it's quite readable at least.
def chunker(seq, size):
res = []
for el in seq:
res.append(el)
if len(res) == size:
yield res
res = []
if res:
yield res
List:
>>> list(chunker([i for i in range(10)], 3))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
Set:
>>> list(chunker(set([i for i in range(10)]), 3))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
Generator:
>>> list(chunker((i for i in range(10)), 3))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
The more-itertools package has chunked method which does exactly that:
import more_itertools
for s in more_itertools.chunked(range(9), 4):
print(s)
Prints
[0, 1, 2, 3]
[4, 5, 6, 7]
[8]
chunked returns the items in a list. If you'd prefer iterables, use ichunked.
The ideal solution for this problem works with iterators (not just sequences). It should also be fast.
This is the solution provided by the documentation for itertools:
def grouper(n, iterable, fillvalue=None):
#"grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
args = [iter(iterable)] * n
return itertools.izip_longest(fillvalue=fillvalue, *args)
Using ipython's %timeit on my mac book air, I get 47.5 us per loop.
However, this really doesn't work for me since the results are padded to be even sized groups. A solution without the padding is slightly more complicated. The most naive solution might be:
def grouper(size, iterable):
i = iter(iterable)
while True:
out = []
try:
for _ in range(size):
out.append(i.next())
except StopIteration:
yield out
break
yield out
Simple, but pretty slow: 693 us per loop
The best solution I could come up with uses islice for the inner loop:
def grouper(size, iterable):
it = iter(iterable)
while True:
group = tuple(itertools.islice(it, None, size))
if not group:
break
yield group
With the same dataset, I get 305 us per loop.
Unable to get a pure solution any faster than that, I provide the following solution with an important caveat: If your input data has instances of filldata in it, you could get wrong answer.
def grouper(n, iterable, fillvalue=None):
#"grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
args = [iter(iterable)] * n
# itertools.zip_longest on Python 3
for x in itertools.izip_longest(*args, fillvalue=fillvalue):
if x[-1] is fillvalue:
yield tuple(v for v in x if v is not fillvalue)
else:
yield x
I really don't like this answer, but it is significantly faster. 124 us per loop
from itertools import izip_longest
def chunker(iterable, chunksize, filler):
return izip_longest(*[iter(iterable)]*chunksize, fillvalue=filler)
Similar to other proposals, but not exactly identical, I like doing it this way, because it's simple and easy to read:
it = iter([1, 2, 3, 4, 5, 6, 7, 8, 9])
for chunk in zip(it, it, it, it):
print chunk
>>> (1, 2, 3, 4)
>>> (5, 6, 7, 8)
This way you won't get the last partial chunk. If you want to get (9, None, None, None) as last chunk, just use izip_longest from itertools.
Since nobody's mentioned it yet here's a zip() solution:
>>> def chunker(iterable, chunksize):
... return zip(*[iter(iterable)]*chunksize)
It works only if your sequence's length is always divisible by the chunk size or you don't care about a trailing chunk if it isn't.
Example:
>>> s = '1234567890'
>>> chunker(s, 3)
[('1', '2', '3'), ('4', '5', '6'), ('7', '8', '9')]
>>> chunker(s, 4)
[('1', '2', '3', '4'), ('5', '6', '7', '8')]
>>> chunker(s, 5)
[('1', '2', '3', '4', '5'), ('6', '7', '8', '9', '0')]
Or using itertools.izip to return an iterator instead of a list:
>>> from itertools import izip
>>> def chunker(iterable, chunksize):
... return izip(*[iter(iterable)]*chunksize)
Padding can be fixed using #ΤΖΩΤΖΙΟΥ's answer:
>>> from itertools import chain, izip, repeat
>>> def chunker(iterable, chunksize, fillvalue=None):
... it = chain(iterable, repeat(fillvalue, chunksize-1))
... args = [it] * chunksize
... return izip(*args)
Another approach would be to use the two-argument form of iter:
from itertools import islice
def group(it, size):
it = iter(it)
return iter(lambda: tuple(islice(it, size)), ())
This can be adapted easily to use padding (this is similar to Markus Jarderot’s answer):
from itertools import islice, chain, repeat
def group_pad(it, size, pad=None):
it = chain(iter(it), repeat(pad))
return iter(lambda: tuple(islice(it, size)), (pad,) * size)
These can even be combined for optional padding:
_no_pad = object()
def group(it, size, pad=_no_pad):
if pad == _no_pad:
it = iter(it)
sentinel = ()
else:
it = chain(iter(it), repeat(pad))
sentinel = (pad,) * size
return iter(lambda: tuple(islice(it, size)), sentinel)
If the list is large, the highest-performing way to do this will be to use a generator:
def get_chunk(iterable, chunk_size):
result = []
for item in iterable:
result.append(item)
if len(result) == chunk_size:
yield tuple(result)
result = []
if len(result) > 0:
yield tuple(result)
for x in get_chunk([1,2,3,4,5,6,7,8,9,10], 3):
print x
(1, 2, 3)
(4, 5, 6)
(7, 8, 9)
(10,)
Using little functions and things really doesn't appeal to me; I prefer to just use slices:
data = [...]
chunk_size = 10000 # or whatever
chunks = [data[i:i+chunk_size] for i in xrange(0,len(data),chunk_size)]
for chunk in chunks:
...
Using map() instead of zip() fixes the padding issue in J.F. Sebastian's answer:
>>> def chunker(iterable, chunksize):
... return map(None,*[iter(iterable)]*chunksize)
Example:
>>> s = '1234567890'
>>> chunker(s, 3)
[('1', '2', '3'), ('4', '5', '6'), ('7', '8', '9'), ('0', None, None)]
>>> chunker(s, 4)
[('1', '2', '3', '4'), ('5', '6', '7', '8'), ('9', '0', None, None)]
>>> chunker(s, 5)
[('1', '2', '3', '4', '5'), ('6', '7', '8', '9', '0')]
One-liner, adhoc solution to iterate over a list x in chunks of size 4 -
for a, b, c, d in zip(x[0::4], x[1::4], x[2::4], x[3::4]):
... do something with a, b, c and d ...
To avoid all conversions to a list import itertools and:
>>> for k, g in itertools.groupby(xrange(35), lambda x: x/10):
... list(g)
Produces:
...
0 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
1 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
2 [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
3 [30, 31, 32, 33, 34]
>>>
I checked groupby and it doesn't convert to list or use len so I (think) this will delay resolution of each value until it is actually used. Sadly none of the available answers (at this time) seemed to offer this variation.
Obviously if you need to handle each item in turn nest a for loop over g:
for k,g in itertools.groupby(xrange(35), lambda x: x/10):
for i in g:
# do what you need to do with individual items
# now do what you need to do with the whole group
My specific interest in this was the need to consume a generator to submit changes in batches of up to 1000 to the gmail API:
messages = a_generator_which_would_not_be_smart_as_a_list
for idx, batch in groupby(messages, lambda x: x/1000):
batch_request = BatchHttpRequest()
for message in batch:
batch_request.add(self.service.users().messages().modify(userId='me', id=message['id'], body=msg_labels))
http = httplib2.Http()
self.credentials.authorize(http)
batch_request.execute(http=http)
Unless I misses something, the following simple solution with generator expressions has not been mentioned. It assumes that both the size and the number of chunks are known (which is often the case), and that no padding is required:
def chunks(it, n, m):
"""Make an iterator over m first chunks of size n.
"""
it = iter(it)
# Chunks are presented as tuples.
return (tuple(next(it) for _ in range(n)) for _ in range(m))
In your second method, I would advance to the next group of 4 by doing this:
ints = ints[4:]
However, I haven't done any performance measurement so I don't know which one might be more efficient.
Having said that, I would usually choose the first method. It's not pretty, but that's often a consequence of interfacing with the outside world.
With NumPy it's simple:
ints = array([1, 2, 3, 4, 5, 6, 7, 8])
for int1, int2 in ints.reshape(-1, 2):
print(int1, int2)
output:
1 2
3 4
5 6
7 8
I never want my chunks padded, so that requirement is essential. I find that the ability to work on any iterable is also requirement. Given that, I decided to extend on the accepted answer, https://stackoverflow.com/a/434411/1074659.
Performance takes a slight hit in this approach if padding is not wanted due to the need to compare and filter the padded values. However, for large chunk sizes, this utility is very performant.
#!/usr/bin/env python3
from itertools import zip_longest
_UNDEFINED = object()
def chunker(iterable, chunksize, fillvalue=_UNDEFINED):
"""
Collect data into chunks and optionally pad it.
Performance worsens as `chunksize` approaches 1.
Inspired by:
https://docs.python.org/3/library/itertools.html#itertools-recipes
"""
args = [iter(iterable)] * chunksize
chunks = zip_longest(*args, fillvalue=fillvalue)
yield from (
filter(lambda val: val is not _UNDEFINED, chunk)
if chunk[-1] is _UNDEFINED
else chunk
for chunk in chunks
) if fillvalue is _UNDEFINED else chunks
def chunker(iterable, n):
"""Yield iterable in chunk sizes.
>>> chunks = chunker('ABCDEF', n=4)
>>> chunks.next()
['A', 'B', 'C', 'D']
>>> chunks.next()
['E', 'F']
"""
it = iter(iterable)
while True:
chunk = []
for i in range(n):
try:
chunk.append(next(it))
except StopIteration:
yield chunk
raise StopIteration
yield chunk
if __name__ == '__main__':
import doctest
doctest.testmod()
Yet another answer, the advantages of which are:
1) Easily understandable
2) Works on any iterable, not just sequences (some of the above answers will choke on filehandles)
3) Does not load the chunk into memory all at once
4) Does not make a chunk-long list of references to the same iterator in memory
5) No padding of fill values at the end of the list
That being said, I haven't timed it so it might be slower than some of the more clever methods, and some of the advantages may be irrelevant given the use case.
def chunkiter(iterable, size):
def inneriter(first, iterator, size):
yield first
for _ in xrange(size - 1):
yield iterator.next()
it = iter(iterable)
while True:
yield inneriter(it.next(), it, size)
In [2]: i = chunkiter('abcdefgh', 3)
In [3]: for ii in i:
for c in ii:
print c,
print ''
...:
a b c
d e f
g h
Update:
A couple of drawbacks due to the fact the inner and outer loops are pulling values from the same iterator:
1) continue doesn't work as expected in the outer loop - it just continues on to the next item rather than skipping a chunk. However, this doesn't seem like a problem as there's nothing to test in the outer loop.
2) break doesn't work as expected in the inner loop - control will wind up in the inner loop again with the next item in the iterator. To skip whole chunks, either wrap the inner iterator (ii above) in a tuple, e.g. for c in tuple(ii), or set a flag and exhaust the iterator.
def group_by(iterable, size):
"""Group an iterable into lists that don't exceed the size given.
>>> group_by([1,2,3,4,5], 2)
[[1, 2], [3, 4], [5]]
"""
sublist = []
for index, item in enumerate(iterable):
if index > 0 and index % size == 0:
yield sublist
sublist = []
sublist.append(item)
if sublist:
yield sublist
You can use partition or chunks function from funcy library:
from funcy import partition
for a, b, c, d in partition(4, ints):
foo += a * b * c * d
These functions also has iterator versions ipartition and ichunks, which will be more efficient in this case.
You can also peek at their implementation.
About solution gave by J.F. Sebastian here:
def chunker(iterable, chunksize):
return zip(*[iter(iterable)]*chunksize)
It's clever, but has one disadvantage - always return tuple. How to get string instead?
Of course you can write ''.join(chunker(...)), but the temporary tuple is constructed anyway.
You can get rid of the temporary tuple by writing own zip, like this:
class IteratorExhausted(Exception):
pass
def translate_StopIteration(iterable, to=IteratorExhausted):
for i in iterable:
yield i
raise to # StopIteration would get ignored because this is generator,
# but custom exception can leave the generator.
def custom_zip(*iterables, reductor=tuple):
iterators = tuple(map(translate_StopIteration, iterables))
while True:
try:
yield reductor(next(i) for i in iterators)
except IteratorExhausted: # when any of iterators get exhausted.
break
Then
def chunker(data, size, reductor=tuple):
return custom_zip(*[iter(data)]*size, reductor=reductor)
Example usage:
>>> for i in chunker('12345', 2):
... print(repr(i))
...
('1', '2')
('3', '4')
>>> for i in chunker('12345', 2, ''.join):
... print(repr(i))
...
'12'
'34'
I like this approach. It feels simple and not magical and supports all iterable types and doesn't require imports.
def chunk_iter(iterable, chunk_size):
it = iter(iterable)
while True:
chunk = tuple(next(it) for _ in range(chunk_size))
if not chunk:
break
yield chunk
Quite pythonic here (you may also inline the body of the split_groups function)
import itertools
def split_groups(iter_in, group_size):
return ((x for _, x in item) for _, item in itertools.groupby(enumerate(iter_in), key=lambda x: x[0] // group_size))
for x, y, z, w in split_groups(range(16), 4):
foo += x * y + z * w
Here is a chunker without imports that supports generators:
def chunks(seq, size):
it = iter(seq)
while True:
ret = tuple(next(it) for _ in range(size))
if len(ret) == size:
yield ret
else:
raise StopIteration()
Example of use:
>>> def foo():
... i = 0
... while True:
... i += 1
... yield i
...
>>> c = chunks(foo(), 3)
>>> c.next()
(1, 2, 3)
>>> c.next()
(4, 5, 6)
>>> list(chunks('abcdefg', 2))
[('a', 'b'), ('c', 'd'), ('e', 'f')]
I have a Python script which takes as input a list of integers, which I need to work with four integers at a time. Unfortunately, I don't have control of the input, or I'd have it passed in as a list of four-element tuples. Currently, I'm iterating over it this way:
for i in range(0, len(ints), 4):
# dummy op for example code
foo += ints[i] * ints[i + 1] + ints[i + 2] * ints[i + 3]
It looks a lot like "C-think", though, which makes me suspect there's a more pythonic way of dealing with this situation. The list is discarded after iterating, so it needn't be preserved. Perhaps something like this would be better?
while ints:
foo += ints[0] * ints[1] + ints[2] * ints[3]
ints[0:4] = []
Still doesn't quite "feel" right, though. :-/
Related question: How do you split a list into evenly sized chunks in Python?
def chunker(seq, size):
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
Works with any sequence:
text = "I am a very, very helpful text"
for group in chunker(text, 7):
print(repr(group),)
# 'I am a ' 'very, v' 'ery hel' 'pful te' 'xt'
print('|'.join(chunker(text, 10)))
# I am a ver|y, very he|lpful text
animals = ['cat', 'dog', 'rabbit', 'duck', 'bird', 'cow', 'gnu', 'fish']
for group in chunker(animals, 3):
print(group)
# ['cat', 'dog', 'rabbit']
# ['duck', 'bird', 'cow']
# ['gnu', 'fish']
Modified from the Recipes section of Python's itertools docs:
from itertools import zip_longest
def grouper(iterable, n, fillvalue=None):
args = [iter(iterable)] * n
return zip_longest(*args, fillvalue=fillvalue)
Example
grouper('ABCDEFG', 3, 'x') # --> 'ABC' 'DEF' 'Gxx'
Note: on Python 2 use izip_longest instead of zip_longest.
chunk_size = 4
for i in range(0, len(ints), chunk_size):
chunk = ints[i:i+chunk_size]
# process chunk of size <= chunk_size
import itertools
def chunks(iterable,size):
it = iter(iterable)
chunk = tuple(itertools.islice(it,size))
while chunk:
yield chunk
chunk = tuple(itertools.islice(it,size))
# though this will throw ValueError if the length of ints
# isn't a multiple of four:
for x1,x2,x3,x4 in chunks(ints,4):
foo += x1 + x2 + x3 + x4
for chunk in chunks(ints,4):
foo += sum(chunk)
Another way:
import itertools
def chunks2(iterable,size,filler=None):
it = itertools.chain(iterable,itertools.repeat(filler,size-1))
chunk = tuple(itertools.islice(it,size))
while len(chunk) == size:
yield chunk
chunk = tuple(itertools.islice(it,size))
# x2, x3 and x4 could get the value 0 if the length is not
# a multiple of 4.
for x1,x2,x3,x4 in chunks2(ints,4,0):
foo += x1 + x2 + x3 + x4
If you don't mind using an external package you could use iteration_utilities.grouper from iteration_utilties 1. It supports all iterables (not just sequences):
from iteration_utilities import grouper
seq = list(range(20))
for group in grouper(seq, 4):
print(group)
which prints:
(0, 1, 2, 3)
(4, 5, 6, 7)
(8, 9, 10, 11)
(12, 13, 14, 15)
(16, 17, 18, 19)
In case the length isn't a multiple of the groupsize it also supports filling (the incomplete last group) or truncating (discarding the incomplete last group) the last one:
from iteration_utilities import grouper
seq = list(range(17))
for group in grouper(seq, 4):
print(group)
# (0, 1, 2, 3)
# (4, 5, 6, 7)
# (8, 9, 10, 11)
# (12, 13, 14, 15)
# (16,)
for group in grouper(seq, 4, fillvalue=None):
print(group)
# (0, 1, 2, 3)
# (4, 5, 6, 7)
# (8, 9, 10, 11)
# (12, 13, 14, 15)
# (16, None, None, None)
for group in grouper(seq, 4, truncate=True):
print(group)
# (0, 1, 2, 3)
# (4, 5, 6, 7)
# (8, 9, 10, 11)
# (12, 13, 14, 15)
Benchmarks
I also decided to compare the run-time of a few of the mentioned approaches. It's a log-log plot grouping into groups of "10" elements based on a list of varying size. For qualitative results: Lower means faster:
At least in this benchmark the iteration_utilities.grouper performs best. Followed by the approach of Craz.
The benchmark was created with simple_benchmark1. The code used to run this benchmark was:
import iteration_utilities
import itertools
from itertools import zip_longest
def consume_all(it):
return iteration_utilities.consume(it, None)
import simple_benchmark
b = simple_benchmark.BenchmarkBuilder()
#b.add_function()
def grouper(l, n):
return consume_all(iteration_utilities.grouper(l, n))
def Craz_inner(iterable, n, fillvalue=None):
args = [iter(iterable)] * n
return zip_longest(*args, fillvalue=fillvalue)
#b.add_function()
def Craz(iterable, n, fillvalue=None):
return consume_all(Craz_inner(iterable, n, fillvalue))
def nosklo_inner(seq, size):
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
#b.add_function()
def nosklo(seq, size):
return consume_all(nosklo_inner(seq, size))
def SLott_inner(ints, chunk_size):
for i in range(0, len(ints), chunk_size):
yield ints[i:i+chunk_size]
#b.add_function()
def SLott(ints, chunk_size):
return consume_all(SLott_inner(ints, chunk_size))
def MarkusJarderot1_inner(iterable,size):
it = iter(iterable)
chunk = tuple(itertools.islice(it,size))
while chunk:
yield chunk
chunk = tuple(itertools.islice(it,size))
#b.add_function()
def MarkusJarderot1(iterable,size):
return consume_all(MarkusJarderot1_inner(iterable,size))
def MarkusJarderot2_inner(iterable,size,filler=None):
it = itertools.chain(iterable,itertools.repeat(filler,size-1))
chunk = tuple(itertools.islice(it,size))
while len(chunk) == size:
yield chunk
chunk = tuple(itertools.islice(it,size))
#b.add_function()
def MarkusJarderot2(iterable,size):
return consume_all(MarkusJarderot2_inner(iterable,size))
#b.add_arguments()
def argument_provider():
for exp in range(2, 20):
size = 2**exp
yield size, simple_benchmark.MultiArgument([[0] * size, 10])
r = b.run()
1 Disclaimer: I'm the author of the libraries iteration_utilities and simple_benchmark.
With Python 3.8 you can use the walrus operator and itertools.islice.
from itertools import islice
list_ = [i for i in range(10, 100)]
def chunker(it, size):
iterator = iter(it)
while chunk := list(islice(iterator, size)):
print(chunk)
In [2]: chunker(list_, 10)
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
[50, 51, 52, 53, 54, 55, 56, 57, 58, 59]
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69]
[70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
[80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
[90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
I needed a solution that would also work with sets and generators. I couldn't come up with anything very short and pretty, but it's quite readable at least.
def chunker(seq, size):
res = []
for el in seq:
res.append(el)
if len(res) == size:
yield res
res = []
if res:
yield res
List:
>>> list(chunker([i for i in range(10)], 3))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
Set:
>>> list(chunker(set([i for i in range(10)]), 3))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
Generator:
>>> list(chunker((i for i in range(10)), 3))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
The more-itertools package has chunked method which does exactly that:
import more_itertools
for s in more_itertools.chunked(range(9), 4):
print(s)
Prints
[0, 1, 2, 3]
[4, 5, 6, 7]
[8]
chunked returns the items in a list. If you'd prefer iterables, use ichunked.
The ideal solution for this problem works with iterators (not just sequences). It should also be fast.
This is the solution provided by the documentation for itertools:
def grouper(n, iterable, fillvalue=None):
#"grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
args = [iter(iterable)] * n
return itertools.izip_longest(fillvalue=fillvalue, *args)
Using ipython's %timeit on my mac book air, I get 47.5 us per loop.
However, this really doesn't work for me since the results are padded to be even sized groups. A solution without the padding is slightly more complicated. The most naive solution might be:
def grouper(size, iterable):
i = iter(iterable)
while True:
out = []
try:
for _ in range(size):
out.append(i.next())
except StopIteration:
yield out
break
yield out
Simple, but pretty slow: 693 us per loop
The best solution I could come up with uses islice for the inner loop:
def grouper(size, iterable):
it = iter(iterable)
while True:
group = tuple(itertools.islice(it, None, size))
if not group:
break
yield group
With the same dataset, I get 305 us per loop.
Unable to get a pure solution any faster than that, I provide the following solution with an important caveat: If your input data has instances of filldata in it, you could get wrong answer.
def grouper(n, iterable, fillvalue=None):
#"grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
args = [iter(iterable)] * n
# itertools.zip_longest on Python 3
for x in itertools.izip_longest(*args, fillvalue=fillvalue):
if x[-1] is fillvalue:
yield tuple(v for v in x if v is not fillvalue)
else:
yield x
I really don't like this answer, but it is significantly faster. 124 us per loop
from itertools import izip_longest
def chunker(iterable, chunksize, filler):
return izip_longest(*[iter(iterable)]*chunksize, fillvalue=filler)
Similar to other proposals, but not exactly identical, I like doing it this way, because it's simple and easy to read:
it = iter([1, 2, 3, 4, 5, 6, 7, 8, 9])
for chunk in zip(it, it, it, it):
print chunk
>>> (1, 2, 3, 4)
>>> (5, 6, 7, 8)
This way you won't get the last partial chunk. If you want to get (9, None, None, None) as last chunk, just use izip_longest from itertools.
Since nobody's mentioned it yet here's a zip() solution:
>>> def chunker(iterable, chunksize):
... return zip(*[iter(iterable)]*chunksize)
It works only if your sequence's length is always divisible by the chunk size or you don't care about a trailing chunk if it isn't.
Example:
>>> s = '1234567890'
>>> chunker(s, 3)
[('1', '2', '3'), ('4', '5', '6'), ('7', '8', '9')]
>>> chunker(s, 4)
[('1', '2', '3', '4'), ('5', '6', '7', '8')]
>>> chunker(s, 5)
[('1', '2', '3', '4', '5'), ('6', '7', '8', '9', '0')]
Or using itertools.izip to return an iterator instead of a list:
>>> from itertools import izip
>>> def chunker(iterable, chunksize):
... return izip(*[iter(iterable)]*chunksize)
Padding can be fixed using #ΤΖΩΤΖΙΟΥ's answer:
>>> from itertools import chain, izip, repeat
>>> def chunker(iterable, chunksize, fillvalue=None):
... it = chain(iterable, repeat(fillvalue, chunksize-1))
... args = [it] * chunksize
... return izip(*args)
Another approach would be to use the two-argument form of iter:
from itertools import islice
def group(it, size):
it = iter(it)
return iter(lambda: tuple(islice(it, size)), ())
This can be adapted easily to use padding (this is similar to Markus Jarderot’s answer):
from itertools import islice, chain, repeat
def group_pad(it, size, pad=None):
it = chain(iter(it), repeat(pad))
return iter(lambda: tuple(islice(it, size)), (pad,) * size)
These can even be combined for optional padding:
_no_pad = object()
def group(it, size, pad=_no_pad):
if pad == _no_pad:
it = iter(it)
sentinel = ()
else:
it = chain(iter(it), repeat(pad))
sentinel = (pad,) * size
return iter(lambda: tuple(islice(it, size)), sentinel)
If the list is large, the highest-performing way to do this will be to use a generator:
def get_chunk(iterable, chunk_size):
result = []
for item in iterable:
result.append(item)
if len(result) == chunk_size:
yield tuple(result)
result = []
if len(result) > 0:
yield tuple(result)
for x in get_chunk([1,2,3,4,5,6,7,8,9,10], 3):
print x
(1, 2, 3)
(4, 5, 6)
(7, 8, 9)
(10,)
Using little functions and things really doesn't appeal to me; I prefer to just use slices:
data = [...]
chunk_size = 10000 # or whatever
chunks = [data[i:i+chunk_size] for i in xrange(0,len(data),chunk_size)]
for chunk in chunks:
...
Using map() instead of zip() fixes the padding issue in J.F. Sebastian's answer:
>>> def chunker(iterable, chunksize):
... return map(None,*[iter(iterable)]*chunksize)
Example:
>>> s = '1234567890'
>>> chunker(s, 3)
[('1', '2', '3'), ('4', '5', '6'), ('7', '8', '9'), ('0', None, None)]
>>> chunker(s, 4)
[('1', '2', '3', '4'), ('5', '6', '7', '8'), ('9', '0', None, None)]
>>> chunker(s, 5)
[('1', '2', '3', '4', '5'), ('6', '7', '8', '9', '0')]
One-liner, adhoc solution to iterate over a list x in chunks of size 4 -
for a, b, c, d in zip(x[0::4], x[1::4], x[2::4], x[3::4]):
... do something with a, b, c and d ...
To avoid all conversions to a list import itertools and:
>>> for k, g in itertools.groupby(xrange(35), lambda x: x/10):
... list(g)
Produces:
...
0 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
1 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
2 [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
3 [30, 31, 32, 33, 34]
>>>
I checked groupby and it doesn't convert to list or use len so I (think) this will delay resolution of each value until it is actually used. Sadly none of the available answers (at this time) seemed to offer this variation.
Obviously if you need to handle each item in turn nest a for loop over g:
for k,g in itertools.groupby(xrange(35), lambda x: x/10):
for i in g:
# do what you need to do with individual items
# now do what you need to do with the whole group
My specific interest in this was the need to consume a generator to submit changes in batches of up to 1000 to the gmail API:
messages = a_generator_which_would_not_be_smart_as_a_list
for idx, batch in groupby(messages, lambda x: x/1000):
batch_request = BatchHttpRequest()
for message in batch:
batch_request.add(self.service.users().messages().modify(userId='me', id=message['id'], body=msg_labels))
http = httplib2.Http()
self.credentials.authorize(http)
batch_request.execute(http=http)
Unless I misses something, the following simple solution with generator expressions has not been mentioned. It assumes that both the size and the number of chunks are known (which is often the case), and that no padding is required:
def chunks(it, n, m):
"""Make an iterator over m first chunks of size n.
"""
it = iter(it)
# Chunks are presented as tuples.
return (tuple(next(it) for _ in range(n)) for _ in range(m))
In your second method, I would advance to the next group of 4 by doing this:
ints = ints[4:]
However, I haven't done any performance measurement so I don't know which one might be more efficient.
Having said that, I would usually choose the first method. It's not pretty, but that's often a consequence of interfacing with the outside world.
With NumPy it's simple:
ints = array([1, 2, 3, 4, 5, 6, 7, 8])
for int1, int2 in ints.reshape(-1, 2):
print(int1, int2)
output:
1 2
3 4
5 6
7 8
I never want my chunks padded, so that requirement is essential. I find that the ability to work on any iterable is also requirement. Given that, I decided to extend on the accepted answer, https://stackoverflow.com/a/434411/1074659.
Performance takes a slight hit in this approach if padding is not wanted due to the need to compare and filter the padded values. However, for large chunk sizes, this utility is very performant.
#!/usr/bin/env python3
from itertools import zip_longest
_UNDEFINED = object()
def chunker(iterable, chunksize, fillvalue=_UNDEFINED):
"""
Collect data into chunks and optionally pad it.
Performance worsens as `chunksize` approaches 1.
Inspired by:
https://docs.python.org/3/library/itertools.html#itertools-recipes
"""
args = [iter(iterable)] * chunksize
chunks = zip_longest(*args, fillvalue=fillvalue)
yield from (
filter(lambda val: val is not _UNDEFINED, chunk)
if chunk[-1] is _UNDEFINED
else chunk
for chunk in chunks
) if fillvalue is _UNDEFINED else chunks
def chunker(iterable, n):
"""Yield iterable in chunk sizes.
>>> chunks = chunker('ABCDEF', n=4)
>>> chunks.next()
['A', 'B', 'C', 'D']
>>> chunks.next()
['E', 'F']
"""
it = iter(iterable)
while True:
chunk = []
for i in range(n):
try:
chunk.append(next(it))
except StopIteration:
yield chunk
raise StopIteration
yield chunk
if __name__ == '__main__':
import doctest
doctest.testmod()
Yet another answer, the advantages of which are:
1) Easily understandable
2) Works on any iterable, not just sequences (some of the above answers will choke on filehandles)
3) Does not load the chunk into memory all at once
4) Does not make a chunk-long list of references to the same iterator in memory
5) No padding of fill values at the end of the list
That being said, I haven't timed it so it might be slower than some of the more clever methods, and some of the advantages may be irrelevant given the use case.
def chunkiter(iterable, size):
def inneriter(first, iterator, size):
yield first
for _ in xrange(size - 1):
yield iterator.next()
it = iter(iterable)
while True:
yield inneriter(it.next(), it, size)
In [2]: i = chunkiter('abcdefgh', 3)
In [3]: for ii in i:
for c in ii:
print c,
print ''
...:
a b c
d e f
g h
Update:
A couple of drawbacks due to the fact the inner and outer loops are pulling values from the same iterator:
1) continue doesn't work as expected in the outer loop - it just continues on to the next item rather than skipping a chunk. However, this doesn't seem like a problem as there's nothing to test in the outer loop.
2) break doesn't work as expected in the inner loop - control will wind up in the inner loop again with the next item in the iterator. To skip whole chunks, either wrap the inner iterator (ii above) in a tuple, e.g. for c in tuple(ii), or set a flag and exhaust the iterator.
def group_by(iterable, size):
"""Group an iterable into lists that don't exceed the size given.
>>> list(group_by([1,2,3,4,5], 2))
[[1, 2], [3, 4], [5]]
"""
sublist = []
for index, item in enumerate(iterable):
if index > 0 and index % size == 0:
yield sublist
sublist = []
sublist.append(item)
if sublist:
yield sublist
You can use partition or chunks function from funcy library:
from funcy import partition
for a, b, c, d in partition(4, ints):
foo += a * b * c * d
These functions also have iterator versions, ipartition and ichunks, which will be more efficient in this case.
You can also peek at their implementation.
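For the incomplete tail group the two families differ: partition drops it, while chunks keeps it. A sketch (from my reading of funcy's API; check the behaviour of the version you have installed):
from funcy import chunks

list(chunks(4, list(range(10))))
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]] - the short tail group is kept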
About the solution given by J.F. Sebastian here:
def chunker(iterable, chunksize):
return zip(*[iter(iterable)]*chunksize)
It's clever, but it has one disadvantage - it always returns tuples. How do you get strings instead?
Of course you can write ''.join(chunker(...)), but the temporary tuple is constructed anyway.
You can get rid of the temporary tuple by writing your own zip, like this:
class IteratorExhausted(Exception):
pass
def translate_StopIteration(iterable, to=IteratorExhausted):
for i in iterable:
yield i
    raise to  # a bare StopIteration would just end this generator
              # (and becomes a RuntimeError inside one on Python 3.7+, PEP 479);
              # a custom exception can propagate out to the caller.
def custom_zip(*iterables, reductor=tuple):
iterators = tuple(map(translate_StopIteration, iterables))
while True:
try:
yield reductor(next(i) for i in iterators)
except IteratorExhausted: # when any of iterators get exhausted.
break
Then
def chunker(data, size, reductor=tuple):
return custom_zip(*[iter(data)]*size, reductor=reductor)
Example usage:
>>> for i in chunker('12345', 2):
... print(repr(i))
...
('1', '2')
('3', '4')
>>> for i in chunker('12345', 2, ''.join):
... print(repr(i))
...
'12'
'34'
I like this approach. It feels simple and not magical, supports all iterable types, and doesn't require imports.
def chunk_iter(iterable, chunk_size):
    it = iter(iterable)
    while True:
        chunk = []
        for _ in range(chunk_size):
            try:
                chunk.append(next(it))
            except StopIteration:  # PEP 479: can't let this escape a generator
                break
        if not chunk:
            break
        yield tuple(chunk)
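For example (my call, not from the original answer; note the short final chunk is kept):
>>> list(chunk_iter('abcdefg', 3))
[('a', 'b', 'c'), ('d', 'e', 'f'), ('g',)]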
Quite pythonic (you may also inline the body of the split_groups function):
import itertools
def split_groups(iter_in, group_size):
return ((x for _, x in item) for _, item in itertools.groupby(enumerate(iter_in), key=lambda x: x[0] // group_size))
for x, y, z, w in split_groups(range(16), 4):
foo += x * y + z * w
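Each group is itself a lazy generator and must be consumed in order, so materialize it if you need the values (a usage sketch of mine):
>>> [tuple(group) for group in split_groups('abcdefgh', 3)]
[('a', 'b', 'c'), ('d', 'e', 'f'), ('g', 'h')]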
Here is a chunker without imports that supports generators:
def chunks(seq, size):
    it = iter(seq)
    sentinel = object()  # unique marker for "iterator exhausted"
    while True:
        ret = tuple(v for v in (next(it, sentinel) for _ in range(size)) if v is not sentinel)
        if len(ret) == size:
            yield ret
        else:
            # PEP 479: 'raise StopIteration()' here would be a
            # RuntimeError on Python 3.7+, so return instead
            return
Example of use:
>>> def foo():
... i = 0
... while True:
... i += 1
... yield i
...
>>> c = chunks(foo(), 3)
>>> next(c)
(1, 2, 3)
>>> next(c)
(4, 5, 6)
>>> list(chunks('abcdefg', 2))
[('a', 'b'), ('c', 'd'), ('e', 'f')]
How do I make a for loop or a list comprehension so that every iteration gives me two elements?
l = [1,2,3,4,5,6]
for i,k in ???:
    print(str(i), '+', str(k), '=', str(i+k))
Output:
1+2=3
3+4=7
5+6=11
You need a pairwise() (or grouped()) implementation.
def pairwise(iterable):
"s -> (s0, s1), (s2, s3), (s4, s5), ..."
a = iter(iterable)
return zip(a, a)
for x, y in pairwise(l):
print("%d + %d = %d" % (x, y, x + y))
Or, more generally:
def grouped(iterable, n):
"s -> (s0,s1,s2,...sn-1), (sn,sn+1,sn+2,...s2n-1), (s2n,s2n+1,s2n+2,...s3n-1), ..."
return zip(*[iter(iterable)]*n)
for x, y in grouped(l, 2):
print("%d + %d = %d" % (x, y, x + y))
In Python 2, you should import izip from itertools as a replacement for Python 3's built-in zip() function.
All credit to martineau for his answer to my question; I have found this to be very efficient, as it only iterates once over the list and does not create any unnecessary lists in the process.
N.B.: This should not be confused with the pairwise recipe in Python's own itertools documentation, which yields s -> (s0, s1), (s1, s2), (s2, s3), ..., as pointed out by @lazyr in the comments.
Little addition for those who would like to do type checking with mypy on Python 3:
from typing import Iterable, Tuple, TypeVar
T = TypeVar("T")
def grouped(iterable: Iterable[T], n=2) -> Iterable[Tuple[T, ...]]:
"""s -> (s0,s1,s2,...sn-1), (sn,sn+1,sn+2,...s2n-1), ..."""
return zip(*[iter(iterable)] * n)
Well, you need tuples of 2 elements, so
data = [1, 2, 3, 4, 5, 6]
for i, k in zip(data[0::2], data[1::2]):
    print(str(i), '+', str(k), '=', str(i + k))
Where:
data[0::2] means: create a sub-collection of the elements whose index satisfies index % 2 == 0;
zip(x, y) creates a collection of tuples from the same-index elements of the x and y collections.
>>> l = [1, 2, 3, 4, 5, 6]
>>> list(zip(l, l[1:]))
[(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)]
>>> list(zip(l, l[1:]))[::2]
[(1, 2), (3, 4), (5, 6)]
>>> [a + b for a, b in list(zip(l, l[1:]))[::2]]
[3, 7, 11]
>>> ["%d + %d = %d" % (a, b, a + b) for a, b in list(zip(l, l[1:]))[::2]]
['1 + 2 = 3', '3 + 4 = 7', '5 + 6 = 11']
(On Python 2, zip already returns a list, so the list() calls can be dropped.)
A simple solution.
l = [1, 2, 3, 4, 5, 6]
for i in range(0, len(l), 2):
    print(l[i], '+', l[i + 1], '=', l[i] + l[i + 1])
While all the answers using zip are correct, I find that implementing the functionality yourself leads to more readable code:
def pairwise(it):
it = iter(it)
while True:
try:
yield next(it), next(it)
except StopIteration:
# no more elements in the iterator
return
The it = iter(it) part ensures that it is actually an iterator, not just an iterable. If it already is an iterator, this line is a no-op.
Usage:
for a, b in pairwise([0, 1, 2, 3, 4, 5]):
print(a + b)
I hope this will be an even more elegant way of doing it.
a = [1, 2, 3, 4, 5, 6]
list(zip(a[::2], a[1::2]))
# [(1, 2), (3, 4), (5, 6)]
In case you're interested in performance, I did a small benchmark (using my library simple_benchmark) to compare the solutions, and I included a function from one of my packages: iteration_utilities.grouper
from iteration_utilities import grouper
import matplotlib as mpl
from simple_benchmark import BenchmarkBuilder
bench = BenchmarkBuilder()
@bench.add_function()
def Johnsyweb(l):
def pairwise(iterable):
"s -> (s0, s1), (s2, s3), (s4, s5), ..."
a = iter(iterable)
return zip(a, a)
for x, y in pairwise(l):
pass
@bench.add_function()
def Margus(data):
for i, k in zip(data[0::2], data[1::2]):
pass
@bench.add_function()
def pyanon(l):
list(zip(l,l[1:]))[::2]
@bench.add_function()
def taskinoor(l):
for i in range(0, len(l), 2):
l[i], l[i+1]
@bench.add_function()
def mic_e(it):
def pairwise(it):
it = iter(it)
while True:
try:
yield next(it), next(it)
except StopIteration:
return
for a, b in pairwise(it):
pass
@bench.add_function()
def MSeifert(it):
for item1, item2 in grouper(it, 2):
pass
bench.use_random_lists_as_arguments(sizes=[2**i for i in range(1, 20)])
benchmark_result = bench.run()
mpl.rcParams['figure.figsize'] = (8, 10)
benchmark_result.plot_both(relative_to=MSeifert)
So if you want the fastest solution without external dependencies you probably should just use the approach given by Johnsyweb (at the time of writing it's the most upvoted and accepted answer).
If you don't mind the additional dependency then the grouper from iteration_utilities will probably be a bit faster.
Additional thoughts
Some of the approaches have restrictions that haven't been discussed here.
For example, a few solutions only work for sequences (lists, strings, etc.): the Margus/pyanon/taskinoor solutions use indexing, while others work on any iterable (sequences, generators, and iterators alike), like the Johnsyweb/mic_e/my solutions.
Johnsyweb also provided a solution that works for sizes other than 2, while the other answers don't (okay, iteration_utilities.grouper also allows setting the number of elements to "group").
Then there is also the question of what should happen if there is an odd number of elements in the list. Should the remaining item be dismissed? Should the list be padded to make it even-sized? Should the remaining item be returned as a single? The other answers don't address this point directly; however, if I haven't overlooked anything, they all follow the approach that the remaining item should be dismissed (except for taskinoor's answer, which will actually raise an exception).
With grouper you can decide what you want to do:
>>> from iteration_utilities import grouper
>>> list(grouper([1, 2, 3], 2)) # as single
[(1, 2), (3,)]
>>> list(grouper([1, 2, 3], 2, truncate=True)) # ignored
[(1, 2)]
>>> list(grouper([1, 2, 3], 2, fillvalue=None)) # padded
[(1, 2), (3, None)]
Use the zip and iter built-ins together:
I find this solution using iter to be quite elegant:
it = iter(l)
list(zip(it, it))
# [(1, 2), (3, 4), (5, 6)]
Which I found in the Python 3 zip documentation.
it = iter(l)
print(*(f'{u} + {v} = {u+v}' for u, v in zip(it, it)), sep='\n')
# 1 + 2 = 3
# 3 + 4 = 7
# 5 + 6 = 11
To generalise to N elements at a time:
N = 2
list(zip(*([iter(l)] * N)))
# [(1, 2), (3, 4), (5, 6)]
for (i, k) in zip(l[::2], l[1::2]):
    print(i, "+", k, "=", i + k)
zip(a, b) returns tuples pairing the next element of each of its argument iterables.
l[::2] returns the 1st, the 3rd, the 5th, etc. element of the list: the first colon indicates that the slice starts at the beginning because there's no number behind it, the second colon is only needed if you want a 'step in the slice' (in this case 2).
l[1::2] does the same thing but starts at the second element of the list, so it returns the 2nd, the 4th, the 6th, etc. element of the original list.
With unpacking:
l = [1,2,3,4,5,6]
while l:
i, k, *l = l
print(f'{i}+{k}={i+k}')
Note: this will consume l, leaving it empty afterward.
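The same idea generalizes to any fixed size, e.g. four at a time (my sketch; it assumes len(l) is a multiple of 4, otherwise the final unpacking raises ValueError):
l = list(range(1, 13))
while l:
    a, b, c, d, *l = l
    print(a, b, c, d)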
There are many ways to do that. For example:
lst = [1, 2, 3, 4, 5, 6]
# overlapping pairs:
[(lst[i], lst[i+1]) for i, _ in enumerate(lst[:-1])]
>>> [(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)]
# non-overlapping pairs:
list(zip(*[iter(lst)]*2))
>>> [(1, 2), (3, 4), (5, 6)]
you can use more_itertools package.
import more_itertools
lst = range(1, 7)
for i, j in more_itertools.chunked(lst, 2):
print(f'{i} + {j} = {i+j}')
For anyone it might help, here is a solution to a similar problem but with overlapping pairs (instead of mutually exclusive pairs).
From the Python itertools documentation:
from itertools import tee

def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)  # izip on Python 2
Or, more generally:
from itertools import tee

def groupwise(iterable, n=2):
    "s -> (s0,s1,...,sn-1), (s1,s2,...,sn), (s2,s3,...,sn+1), ..."
    t = tee(iterable, n)
    for i in range(1, n):
        for j in range(0, i):
            next(t[i], None)
    return zip(*t)  # izip on Python 2
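For instance (my call, showing the overlapping windows):
>>> list(groupwise([1, 2, 3, 4, 5], 3))
[(1, 2, 3), (2, 3, 4), (3, 4, 5)]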
The title of this question is misleading: you seem to be looking for consecutive pairs, but if you want to iterate over the set of all possible pairs then this will work:
for i, v in enumerate(items[:-1]):
    for u in items[i + 1:]:
        ...  # each possible pair (v, u) exactly once
A simplistic approach:
[(a[i],a[i+1]) for i in range(0,len(a),2)]
this is useful if your array is a and you want to iterate on it by pairs.
To iterate on triplets or more just change the "range" step command, for example:
[(a[i],a[i+1],a[i+2]) for i in range(0,len(a),3)]
(you have to deal with excess values if your array length and the step do not fit)
The polished Python 3 solution is given in one of the itertools recipes:
import itertools
def grouper(iterable, n, fillvalue=None):
"Collect data into fixed-length chunks or blocks"
# grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
args = [iter(iterable)] * n
return itertools.zip_longest(*args, fillvalue=fillvalue)
Another try at cleaner solution
def grouped(itr, n=2):
itr = iter(itr)
end = object()
while True:
vals = tuple(next(itr, end) for _ in range(n))
if vals[-1] is end:
return
yield vals
For more customization options
from collections.abc import Sized
def grouped(itr, n=2, /, truncate=True, fillvalue=None, strict=False, nofill=False):
if strict:
if isinstance(itr, Sized):
if len(itr) % n != 0:
raise ValueError(f"{len(itr)=} is not divisible by {n=}")
itr = iter(itr)
end = object()
while True:
vals = tuple(next(itr, end) for _ in range(n))
if vals[-1] is end:
if vals[0] is end:
return
if strict:
raise ValueError("found extra stuff in iterable")
if nofill:
yield tuple(v for v in vals if v is not end)
return
if truncate:
return
yield tuple(v if v is not end else fillvalue for v in vals)
return
yield vals
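A few usage sketches, worked out from the code above (the calls are mine, not from the original answer):
>>> list(grouped(range(7), 3))                  # truncate (the default)
[(0, 1, 2), (3, 4, 5)]
>>> list(grouped(range(7), 3, truncate=False))  # pad with fillvalue
[(0, 1, 2), (3, 4, 5), (6, None, None)]
>>> list(grouped(range(7), 3, nofill=True))     # keep the short tail
[(0, 1, 2), (3, 4, 5), (6,)]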
Thought that this is a good place to share my generalization of this for n > 2, which is just a sliding window over an iterable:
import itertools

def sliding_window(iterable, n):
    its = [itertools.islice(it, i, None)
           for i, it in enumerate(itertools.tee(iterable, n))]
    return zip(*its)  # itertools.izip on Python 2
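For example (my call):
>>> list(sliding_window(range(5), 3))
[(0, 1, 2), (1, 2, 3), (2, 3, 4)]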
I needed to divide a list by a number, and fixed it like this:
l = [1,2,3,4,5,6]
def divideByN(data, n):
return [data[i*n : (i+1)*n] for i in range(len(data)//n)]
>>> print(divideByN(l,2))
[[1, 2], [3, 4], [5, 6]]
>>> print(divideByN(l,3))
[[1, 2, 3], [4, 5, 6]]
Using typing, so you can verify data using the mypy static analysis tool (note: this yields overlapping pairs and expects an iterator, so wrap a plain sequence in iter()):
from typing import Iterator, Any, Iterable, TypeVar, Tuple
T_ = TypeVar('T_')
Pairs_Iter = Iterator[Tuple[T_, T_]]
def legs(iterable: Iterator[T_]) -> Pairs_Iter:
begin = next(iterable)
for end in iterable:
yield begin, end
begin = end
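A usage sketch (my call; note the overlapping pairs):
>>> list(legs(iter([1, 2, 3, 4])))
[(1, 2), (2, 3), (3, 4)]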
Here is an alt_elem function that can fit in your for loop.
def alt_elem(seq, index=2):  # arg renamed so it doesn't shadow the list builtin
    for i, elem in enumerate(seq, start=1):
        if not i % index:
            yield tuple(seq[i-index:i])
a = range(10)
for index in [2, 3, 4]:
print("With index: {0}".format(index))
for i in alt_elem(a, index):
print(i)
Output:
With index: 2
(0, 1)
(2, 3)
(4, 5)
(6, 7)
(8, 9)
With index: 3
(0, 1, 2)
(3, 4, 5)
(6, 7, 8)
With index: 4
(0, 1, 2, 3)
(4, 5, 6, 7)
Note: the above solution may not be efficient, considering the slicing operations performed inside the function.
This is a simple solution, which uses the range function to pick alternate elements from a list of elements.
Note: this is only valid for a list with an even number of elements.
a_list = [1, 2, 3, 4, 5, 6]
empty_list = []
for i in range(0, len(a_list), 2):
empty_list.append(a_list[i] + a_list[i + 1])
print(empty_list)
# [3, 7, 11]
Benchmark results: at least in this benchmark, iteration_utilities.grouper performs best, followed by the approach of Craz.
The benchmark was created with simple_benchmark[1]. The code used to run it was:
import iteration_utilities
import itertools
from itertools import zip_longest
def consume_all(it):
return iteration_utilities.consume(it, None)
import simple_benchmark
b = simple_benchmark.BenchmarkBuilder()
@b.add_function()
def grouper(l, n):
return consume_all(iteration_utilities.grouper(l, n))
def Craz_inner(iterable, n, fillvalue=None):
args = [iter(iterable)] * n
return zip_longest(*args, fillvalue=fillvalue)
@b.add_function()
def Craz(iterable, n, fillvalue=None):
return consume_all(Craz_inner(iterable, n, fillvalue))
def nosklo_inner(seq, size):
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
@b.add_function()
def nosklo(seq, size):
return consume_all(nosklo_inner(seq, size))
def SLott_inner(ints, chunk_size):
for i in range(0, len(ints), chunk_size):
yield ints[i:i+chunk_size]
@b.add_function()
def SLott(ints, chunk_size):
return consume_all(SLott_inner(ints, chunk_size))
def MarkusJarderot1_inner(iterable,size):
it = iter(iterable)
chunk = tuple(itertools.islice(it,size))
while chunk:
yield chunk
chunk = tuple(itertools.islice(it,size))
@b.add_function()
def MarkusJarderot1(iterable,size):
return consume_all(MarkusJarderot1_inner(iterable,size))
def MarkusJarderot2_inner(iterable,size,filler=None):
it = itertools.chain(iterable,itertools.repeat(filler,size-1))
chunk = tuple(itertools.islice(it,size))
while len(chunk) == size:
yield chunk
chunk = tuple(itertools.islice(it,size))
@b.add_function()
def MarkusJarderot2(iterable,size):
return consume_all(MarkusJarderot2_inner(iterable,size))
@b.add_arguments()
def argument_provider():
for exp in range(2, 20):
size = 2**exp
yield size, simple_benchmark.MultiArgument([[0] * size, 10])
r = b.run()
[1] Disclaimer: I'm the author of the libraries iteration_utilities and simple_benchmark.
With Python 3.8 you can use the walrus operator and itertools.islice.
from itertools import islice
list_ = list(range(10, 100))
def chunker(it, size):
iterator = iter(it)
while chunk := list(islice(iterator, size)):
print(chunk)
In [2]: chunker(list_, 10)
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
[50, 51, 52, 53, 54, 55, 56, 57, 58, 59]
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69]
[70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
[80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
[90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
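To get a reusable generator instead of printed output, one might yield each chunk (my small variation on the function above):
from itertools import islice

def chunker(it, size):
    iterator = iter(it)
    # the walrus operator keeps looping until islice comes back empty
    while chunk := list(islice(iterator, size)):
        yield chunk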
I needed a solution that would also work with sets and generators. I couldn't come up with anything very short and pretty, but it's quite readable at least.
def chunker(seq, size):
res = []
for el in seq:
res.append(el)
if len(res) == size:
yield res
res = []
if res:
yield res
List:
>>> list(chunker([i for i in range(10)], 3))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
Set:
>>> list(chunker(set([i for i in range(10)]), 3))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
Generator:
>>> list(chunker((i for i in range(10)), 3))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
The more-itertools package has a chunked function which does exactly that:
import more_itertools
for s in more_itertools.chunked(range(9), 4):
print(s)
Prints
[0, 1, 2, 3]
[4, 5, 6, 7]
[8]
chunked returns the items in a list. If you'd prefer iterables, use ichunked.
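For instance (a sketch; each piece yielded by ichunked is a lazy sub-iterable rather than a list):
import more_itertools

for s in more_itertools.ichunked(range(9), 4):
    print(list(s))
# [0, 1, 2, 3]
# [4, 5, 6, 7]
# [8]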
The ideal solution for this problem works with iterators (not just sequences). It should also be fast.
This is the solution provided by the documentation for itertools:
import itertools

def grouper(n, iterable, fillvalue=None):
    # grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return itertools.zip_longest(fillvalue=fillvalue, *args)  # izip_longest on Python 2
Using ipython's %timeit on my MacBook Air, I get 47.5 us per loop.
However, this really doesn't work for me since the results are padded to be even sized groups. A solution without the padding is slightly more complicated. The most naive solution might be:
def grouper(size, iterable):
    i = iter(iterable)
    while True:
        out = []
        try:
            for _ in range(size):
                out.append(next(i))
        except StopIteration:
            if out:  # don't yield an empty trailing group
                yield out
            break
        yield out
Simple, but pretty slow: 693 us per loop
The best solution I could come up with uses islice for the inner loop:
def grouper(size, iterable):
it = iter(iterable)
while True:
group = tuple(itertools.islice(it, None, size))
if not group:
break
yield group
With the same dataset, I get 305 us per loop.
Unable to get a pure solution any faster than that, I provide the following solution with an important caveat: if your input data has instances of fillvalue in it, you could get the wrong answer.
def grouper(n, iterable, fillvalue=None):
    # grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    # itertools.izip_longest on Python 2
    for x in itertools.zip_longest(*args, fillvalue=fillvalue):
        if x[-1] is fillvalue:
            yield tuple(v for v in x if v is not fillvalue)
        else:
            yield x
I really don't like this answer, but it is significantly faster. 124 us per loop
from itertools import zip_longest  # izip_longest on Python 2

def chunker(iterable, chunksize, filler):
    return zip_longest(*[iter(iterable)]*chunksize, fillvalue=filler)
Similar to other proposals, but not exactly identical, I like doing it this way, because it's simple and easy to read:
it = iter([1, 2, 3, 4, 5, 6, 7, 8, 9])
for chunk in zip(it, it, it, it):
    print(chunk)
# (1, 2, 3, 4)
# (5, 6, 7, 8)
This way you won't get the last partial chunk. If you want to get (9, None, None, None) as the last chunk, just use zip_longest from itertools.
Since nobody's mentioned it yet, here's a zip() solution:
>>> def chunker(iterable, chunksize):
... return zip(*[iter(iterable)]*chunksize)
It works only if your sequence's length is always divisible by the chunk size, or if you don't care about a trailing chunk when it isn't.
Example:
>>> s = '1234567890'
>>> list(chunker(s, 3))
[('1', '2', '3'), ('4', '5', '6'), ('7', '8', '9')]
>>> list(chunker(s, 4))
[('1', '2', '3', '4'), ('5', '6', '7', '8')]
>>> list(chunker(s, 5))
[('1', '2', '3', '4', '5'), ('6', '7', '8', '9', '0')]
On Python 3, zip already returns an iterator; on Python 2, use itertools.izip to get an iterator instead of a list:
>>> from itertools import izip
>>> def chunker(iterable, chunksize):
...     return izip(*[iter(iterable)]*chunksize)
Padding can be fixed using @ΤΖΩΤΖΙΟΥ's answer:
>>> from itertools import chain, repeat
>>> def chunker(iterable, chunksize, fillvalue=None):
...     it = chain(iterable, repeat(fillvalue, chunksize-1))
...     args = [it] * chunksize
...     return zip(*args)  # izip on Python 2
Another approach would be to use the two-argument form of iter:
from itertools import islice
def group(it, size):
it = iter(it)
return iter(lambda: tuple(islice(it, size)), ())
This can be adapted easily to use padding (this is similar to Markus Jarderot’s answer):
from itertools import islice, chain, repeat
def group_pad(it, size, pad=None):
it = chain(iter(it), repeat(pad))
return iter(lambda: tuple(islice(it, size)), (pad,) * size)
These can even be combined for optional padding:
_no_pad = object()
def group(it, size, pad=_no_pad):
    if pad is _no_pad:  # identity check against the sentinel
        it = iter(it)
        sentinel = ()
    else:
        it = chain(iter(it), repeat(pad))
        sentinel = (pad,) * size
    return iter(lambda: tuple(islice(it, size)), sentinel)
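A usage sketch (my calls):
>>> list(group('ABCDEFG', 3))
[('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]
>>> list(group('ABCDEFG', 3, pad='x'))
[('A', 'B', 'C'), ('D', 'E', 'F'), ('G', 'x', 'x')]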
If the list is large, a high-performing, memory-friendly way to do this is with a generator:
def get_chunk(iterable, chunk_size):
result = []
for item in iterable:
result.append(item)
if len(result) == chunk_size:
yield tuple(result)
result = []
if len(result) > 0:
yield tuple(result)
for x in get_chunk([1,2,3,4,5,6,7,8,9,10], 3):
    print(x)
(1, 2, 3)
(4, 5, 6)
(7, 8, 9)
(10,)
Using little functions and things really doesn't appeal to me; I prefer to just use slices:
data = [...]
chunk_size = 10000 # or whatever
chunks = [data[i:i+chunk_size] for i in range(0, len(data), chunk_size)]  # xrange on Python 2
for chunk in chunks:
...
Using map() instead of zip() fixes the padding issue in J.F. Sebastian's answer (Python 2 only; on Python 3, map(None, ...) is no longer allowed, so use itertools.zip_longest instead):
>>> def chunker(iterable, chunksize):
...     return map(None, *[iter(iterable)]*chunksize)
Example:
>>> s = '1234567890'
>>> chunker(s, 3)
[('1', '2', '3'), ('4', '5', '6'), ('7', '8', '9'), ('0', None, None)]
>>> chunker(s, 4)
[('1', '2', '3', '4'), ('5', '6', '7', '8'), ('9', '0', None, None)]
>>> chunker(s, 5)
[('1', '2', '3', '4', '5'), ('6', '7', '8', '9', '0')]
A one-liner, ad-hoc solution to iterate over a list x in chunks of size 4:
for a, b, c, d in zip(x[0::4], x[1::4], x[2::4], x[3::4]):
... do something with a, b, c and d ...
To avoid all conversions to a list, import itertools and:
>>> import itertools
>>> for k, g in itertools.groupby(range(35), lambda x: x // 10):
...     print(k, list(g))
Produces:
0 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
1 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
2 [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
3 [30, 31, 32, 33, 34]
I checked groupby: it doesn't convert to a list or use len, so I think this delays resolution of each value until it is actually used. Sadly, none of the available answers (at this time) seemed to offer this variation.
Obviously, if you need to handle each item in turn, nest a for loop over g:
for k, g in itertools.groupby(range(35), lambda x: x // 10):
    for i in g:
        ...  # do what you need to do with individual items
    ...  # now do what you need to do with the whole group
# now do what you need to do with the whole group
My specific interest in this was the need to consume a generator to submit changes in batches of up to 1000 to the gmail API:
from itertools import groupby

messages = a_generator_which_would_not_be_smart_as_a_list
# key on a running index so each batch holds up to 1000 messages
for _, batch in groupby(enumerate(messages), lambda x: x[0] // 1000):
    batch_request = BatchHttpRequest()
    for _, message in batch:
        batch_request.add(self.service.users().messages().modify(userId='me', id=message['id'], body=msg_labels))
    http = httplib2.Http()
    self.credentials.authorize(http)
    batch_request.execute(http=http)