I want to merge two Kyoto Cabinet B-tree databases by key
(using the Kyoto Cabinet Python API).
The resulting list should contain each unique key (and its value) found in either of the two input DBs.
The following code works, but I think it's ugly.
left_generator/right_generator are two cursor objects.
It's especially odd that get() returns None when the generator is exhausted.
def merge_join_kv(left_generator, right_generator):
    # Merge-join two sorted kyotocabinet cursors by key, yielding each
    # unique (key, value) pair once.  NOTE(review): Cursor.get() returns
    # None once a cursor is exhausted -- both the while test and the
    # "or" clauses below rely on that; cmp() is Python 2 only.
    stop = False
    while left_generator.get() or right_generator.get():
        try:
            comparison = cmp(right_generator.get_key(), left_generator.get_key())
            if comparison == 0:
                # Same key on both sides: emit once, advance both cursors.
                yield left_generator.get_key(), left_generator.get_value()
                left_generator.next()
                right_generator.next()
            elif (comparison < 0) or (not left_generator.get() or not right_generator.get()):
                # Right key sorts first, or one cursor is exhausted: emit
                # the right record.  NOTE(review): when it is the *right*
                # cursor that is exhausted this branch still emits from the
                # right side -- looks suspicious; confirm against real data.
                yield right_generator.get_key(), right_generator.get_value()
                right_generator.next()
            else:
                # Left key sorts first: emit the left record.
                yield left_generator.get_key(), left_generator.get_value()
                left_generator.next()
        except StopIteration:
            # Swallow the first StopIteration so the other cursor can
            # drain; a second one ends the generator for real.
            if stop:
                raise
            stop = True
More generally: is there a function/library that merge-joins generators using cmp()?
I think this is what you need; orderedMerge is based on Gnibbler's code but adds a custom key function and a unique argument,
import kyotocabinet
import collections
import heapq
class IterableCursor(kyotocabinet.Cursor, collections.Iterator):
    """Adapt a kyotocabinet Cursor to the iterator protocol.

    Each step returns the current (key, value) pair and advances the
    cursor; StopIteration is raised once the cursor is exhausted
    (Cursor.get() returns None at that point).

    NOTE(review): ``collections.Iterator`` is the Python 2 spelling; on
    Python 3 use ``collections.abc.Iterator`` (removed from the
    ``collections`` namespace in 3.10) -- confirm the target version.
    """

    def __init__(self, *args, **kwargs):
        kyotocabinet.Cursor.__init__(self, *args, **kwargs)
        collections.Iterator.__init__(self)

    def next(self):
        # BUG FIX: the original declared ``def next():`` without ``self``,
        # which raises TypeError the first time the iterator is advanced.
        "Return (key,value) pair"
        res = self.get(True)  # get(True) reads the record and steps the cursor
        if res is None:
            raise StopIteration
        else:
            return res

    # Python 3 spells the iterator-step method __next__; aliasing keeps
    # the class usable from both major versions.
    __next__ = next
def orderedMerge(*iterables, **kwargs):
    """Take a list of ordered iterables; return as a single ordered generator.

    #param key: function, for each item return key value
        (Hint: to sort descending, return negated key value)
    #param unique: boolean, return only first occurrence for each key value?
    """
    key = kwargs.get('key', (lambda x: x))
    unique = kwargs.get('unique', False)
    _heapify = heapq.heapify
    _heapreplace = heapq.heapreplace
    _heappop = heapq.heappop
    _StopIteration = StopIteration
    # Preprocess the iterators into a heap of
    # [keyval, iterator-index, item, iterator] entries; the index keeps
    # ordering stable (and comparable) when two entries share a key.
    h = []
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            # PORTABILITY FIX: the original used ``it.next``, which only
            # exists on Python 2 iterators; the next() builtin (2.6+)
            # works on both major versions.
            data = next(it)
            h.append([key(data), itnum, data, it])
        except _StopIteration:
            pass
    _heapify(h)
    # EDGE-CASE FIX: the original initialised oldkeyval to None, so with
    # unique=True a genuine leading None key was silently dropped; a
    # private sentinel can never equal a real key value.
    oldkeyval = object()
    # Repeatedly take the smallest-key entry, yield it, and refill the
    # heap slot from the same iterator.
    while True:
        try:
            while True:
                keyval, itnum, data, it = s = h[0]  # get smallest-key value
                # raises IndexError when h is empty
                # if unique, skip duplicate keys
                if unique and keyval == oldkeyval:
                    pass
                else:
                    yield data
                    oldkeyval = keyval
                # load replacement value from same iterator
                s[2] = data = next(it)  # raises StopIteration when exhausted
                s[0] = key(data)
                _heapreplace(h, s)  # restore heap condition
        except _StopIteration:
            _heappop(h)  # remove empty iterator
        except IndexError:
            return
then your function can be done as
from operator import itemgetter
def merge_join_kv(leftGen, rightGen):
    """Yield each unique (key, value) pair from two sorted cursors.

    Wraps both cursors in IterableCursor and lets orderedMerge produce
    the ordered, de-duplicated union, keyed on the record key.
    """
    # assuming that kyotocabinet.Cursor has a copy initializer
    iterators = [IterableCursor(cursor) for cursor in (leftGen, rightGen)]
    return orderedMerge(*iterators, key=itemgetter(0), unique=True)
Python 2.6 has a merge in heapq, but it does not support a user defined cmp/key func
def merge(*iterables):
    '''Merge multiple sorted inputs into a single sorted output.

    Similar to sorted(itertools.chain(*iterables)) but returns a generator,
    does not pull the data into memory all at once, and assumes that each of
    the input streams is already sorted (smallest to largest).

    >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
    [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
    '''
    # CONSISTENCY FIX: the original referenced bare heappop/heapify names
    # (assuming ``from heapq import *``); this file imports ``heapq``.
    # Bind hot names to locals once, outside the loop.
    _heappop, _heapreplace, _StopIteration = heapq.heappop, heapq.heapreplace, StopIteration
    h = []
    h_append = h.append
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            # PORTABILITY FIX: ``it.next`` is Python 2 only; the next()
            # builtin (2.6+) works on both major versions.
            h_append([next(it), itnum, it])
        except _StopIteration:
            pass
    heapq.heapify(h)
    while 1:
        try:
            while 1:
                v, itnum, it = s = h[0]  # raises IndexError when h is empty
                yield v
                s[0] = next(it)  # raises StopIteration when exhausted
                _heapreplace(h, s)  # restore heap condition
        except _StopIteration:
            _heappop(h)  # remove empty iterator
        except IndexError:
            return
Related
I have an array which stores objects. I am trying to see if there are duplicate values in this object array, but only on one of the objects' parameters (hexdigest).
How can I check for duplicates and record the entire object of duplicates I find?
# class to store hashes
class customclass:
    # Simple record pairing a value with the hexdigest it came from.
    def __init__(self, value, hexdigest):
        self.value = value
        self.hexdigest = hexdigest

# array to store class data
hash_array = []
hash_array.append(customclass(value=299, hexdigest='927'))
hash_array.append(customclass(value=207, hexdigest='92b'))
hash_array.append(customclass(value=113, hexdigest='951'))
hash_array.append(customclass(value=187, hexdigest='951'))
hash_array.append(customclass(value=205, hexdigest='998'))

# sort array
# NOTE(review): attrgetter must be imported from operator for this to run.
sorted_array = sorted(hash_array, key=attrgetter('hexdigest'))

# check for duplicate hexdigest's
# NOTE(review): newlist starts empty, so the inner for loop never executes
# and neither list is ever populated -- this is the bug discussed below.
newlist = []
duplist = []
for obj in sorted_array:
    for jbo in newlist:
        if obj.hexdigest not in jbo:
            newlist.append(obj)
        else:
            duplist.append(obj)
# Working replacement: remember each hexdigest already seen; any object
# whose digest repeats is recorded as a duplicate.
hex_list = []
duplist = []
for obj in sorted_array:
    if(obj.hexdigest in hex_list):
        duplist.append(obj)
    else:
        hex_list.append(obj.hexdigest)
Use the above block of code, instead of the one below that you implemented, to find the list of duplicate objects.
# The original (broken) version, quoted for comparison: newlist is empty,
# so the inner loop body never runs and nothing is ever appended to
# either list.
newlist = []
duplist = []
for obj in sorted_array:
    for jbo in newlist:
        if obj.hexdigest not in jbo:
            newlist.append(obj)
        else:
            duplist.append(obj)
Well, newlist is empty, so the inner for loop never runs, so nothing gets appended to newlist or duplist.
You may wish to group by the hexdigest attribute using itertools.groupby and a dictionary comprehension.
from operator import attrgetter
from itertools import groupby

class customclass:
    """Record holding a value and the hexdigest it was derived from."""
    def __init__(self, value, hexdigest):
        self.value = value
        self.hexdigest = hexdigest

# Build the sample data in one literal instead of repeated append() calls.
hash_array = [
    customclass(value=299, hexdigest='927'),
    customclass(value=207, hexdigest='92b'),
    customclass(value=113, hexdigest='951'),
    customclass(value=187, hexdigest='951'),
    customclass(value=205, hexdigest='998'),
]

# Order by hexdigest so equal digests sit next to each other, as
# groupby() requires.
sorted_array = sorted(hash_array, key=attrgetter('hexdigest'))
# [<__main__.customclass object at 0x7f488d1a2a20>, ... five objects ...]

# Group consecutive objects sharing a hexdigest; keys mapping to more
# than one object are the duplicates.
groups = groupby(sorted_array, key=attrgetter('hexdigest'))
{k: list(v) for k, v in groups}
# {'927': [one object], '92b': [one object],
#  '951': [two objects -- the duplicates], '998': [one object]}
From there it's relatively easy to retrieve the unique and duplicate values.
It may be easier to visualize what's going on if you provide a more useful definition for __repr__.
class customclass:
    """Value/hexdigest record with a readable repr for debugging."""

    def __init__(self, value, hexdigest):
        # Store both attributes in one unpacking assignment.
        self.value, self.hexdigest = value, hexdigest

    def __repr__(self):
        # Same output format as before; only the surrounding style changed.
        return f"<customclass value: {self.value}, hexdigest: {self.hexdigest}>"
Doing so, hash_array prints in the interactive interpreter as follows, with the exception of the newlines I added for sanity's sake.
[<customclass value: 299, hexdigest: 927>,
<customclass value: 207, hexdigest: 92b>,
<customclass value: 113, hexdigest: 951>,
<customclass value: 187, hexdigest: 951>,
<customclass value: 205, hexdigest: 998>]
I would like to improve the way this code is written. Right now I have six methods that are almost copy-paste, only one line is changing. How can I make a generic method and depending on the property of the data input to change the calculations? I was thinking to use functional programming to achieve that, but I am not sure how to do it properly.
The method is getting a dict object. Then this object is transformed into JSON. The mid variable is storing a JSON with midrate for currency from external API, it must be before the for loop otherwise the API will be called in every iteration and this slows down the process a lot! Then in the for loop, I iterate through the data from the input. The only difference between methods is the calculation before inserting it in the list. .append(mid_current - bankMSell)
def margin_to_exchange_rate_sell(data):
    """Sell rates: midrate minus the bank's sell margin, per target currency.

    Returns a list of floats, one per entry in data's 'toCurrency'; a row
    that fails for any reason contributes 0 instead of aborting the batch.
    """
    j = data.to_JSON()
    list_p = []
    # Fetch the midrate once, before the loop -- calling the external API
    # per iteration slows the process down a lot.
    mid = midrate.get_midrate(j["fromCurrency"][0])
    for idx, val in enumerate(j['toCurrency']):
        try:
            mid_current = 1/get_key(mid, j['toCurrency'][idx])
            bankMSell = float(j['sellMargin'][idx])
            # The only line that differs between the copy-pasted variants.
            list_p.append(mid_current - bankMSell)
        except Exception as e:
            # Best effort: a bad row becomes 0 rather than failing the lot.
            list_p.append(0)
            print(str(e))
    return list_p
Another one of the methods:
def margin_to_exchange_rate_buy(data):
    """Buy rates: midrate plus the bank's sell margin, per target currency.

    Identical to margin_to_exchange_rate_sell except for the '+' below.
    NOTE(review): still reads 'sellMargin' and names it bankMSell even on
    the buy side -- confirm that is intended.
    """
    j = data.to_JSON()
    list_p = []
    # Fetch the midrate once, before the loop, to avoid per-iteration
    # calls to the external API.
    mid = midrate.get_midrate(j["fromCurrency"][0])
    for idx, val in enumerate(j['toCurrency']):
        try:
            mid_current = 1/get_key(mid, j['toCurrency'][idx])
            bankMSell = float(j['sellMargin'][idx])
            # The only line that differs from the sell variant.
            list_p.append(mid_current + bankMSell)
        except Exception as e:
            # Best effort: a bad row becomes 0 rather than failing the lot.
            list_p.append(0)
            print(str(e))
    return list_p
Indeed, there is a way to reduce code here with lambdas:
def margin_to_exchange_rate_sell(data):
    """Sell-side rates: midrate minus the bank's sell margin."""
    return margin_to_exchange_rate(data, lambda m, b: m - b)

def margin_to_exchange_rate_buy(data):
    """Buy-side rates: midrate plus the bank's sell margin."""
    return margin_to_exchange_rate(data, lambda m, b: m + b)

def margin_to_exchange_rate(data, operation):
    """Shared worker: apply ``operation(mid_rate, margin)`` per currency.

    #param data: object exposing to_JSON() with 'fromCurrency',
        'toCurrency' and 'sellMargin' entries
    #param operation: two-argument callable combining the mid rate and the
        margin -- the only part that differed between the copied methods
    Returns a list with one number per target currency; a failed row
    contributes 0 instead of aborting the whole batch.
    """
    j = data.to_JSON()
    list_p = []
    # Fetch the midrate JSON once, outside the loop -- calling the
    # external API per iteration was the original performance problem.
    mid = midrate.get_midrate(j["fromCurrency"][0])
    for idx, currency in enumerate(j['toCurrency']):
        try:
            # IDIOM FIX: use the enumerate value directly instead of
            # re-indexing j['toCurrency'][idx].
            mid_current = 1/get_key(mid, currency)
            bankMSell = float(j['sellMargin'][idx])
            list_p.append(operation(mid_current, bankMSell))
        except Exception as e:
            # Best-effort behaviour kept from the original: a bad row
            # yields 0 rather than failing the whole conversion.
            list_p.append(0)
            print(str(e))
    return list_p
I would like to apply a maximum number of items for a list, making sure that the code does not allow the function that appends to the list to add more than, for example, 3 items.
Function that appends to list:
transactions = []

def append_hash():
    # NOTE(review): no argument is taken and no local ``hash`` exists, so
    # this presumably appends the *builtin* hash function object, not a
    # computed digest -- confirm the intended value is passed in.
    transactions.append(hash)
How do I not allow append_hash to add more than three hashes to the list: transactions without deleting any previous hashes?
A list is, by definition, of arbitrary size. You'll need a new type instead.
class BoundedListFullError(RuntimeError):
    """Raised when a BoundedList would exceed its maximum size."""
    pass

class BoundedList:
    """A list-like container that refuses to grow past max_size items.

    The optional seed iterable is copied in without a size check, matching
    the behaviour of appending via extend at construction time.
    """

    def __init__(self, max_size, x=None):
        # Copy the seed into a fresh list so callers keep their own copy.
        self.values = list(x) if x is not None else []
        self.max_size = max_size

    def append(self, x):
        """Add one item; raise BoundedListFullError when already full."""
        if len(self.values) == self.max_size:
            raise BoundedListFullError(self.max_size)
        self.values.append(x)

    def extend(self, xs):
        """Add several items; raise if they would not all fit."""
        if len(self.values) + len(xs) > self.max_size:
            raise BoundedListFullError(self.max_size)
        self.values.extend(xs)
You could just subclass list and modify the append method:
class MyStack(list):
    """A list whose append() refuses to grow beyond max_size items."""

    def __init__(self, max_size, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Remember the cap; contents passed at construction are not checked.
        self.max_size = max_size

    def append(self, value):
        """Push one value, raising ValueError once max_size is reached."""
        if len(self) < self.max_size:
            # Delegate to the list implementation for the actual append.
            super().append(value)
        else:
            raise ValueError("NO!")
# Demo: the fourth append exceeds max_size and raises.
somestack = MyStack(3)
somestack.append(1)
somestack.append(2)
somestack.append(3)
somestack.append(4)  # Raises ValueError
If you control your code and ensure you only ever use your function:
transactions = []

def append_hash(h):
    # NOTE(review): this binds a *local* name and discards it -- the
    # global transactions list is never modified.  The surrounding text's
    # caveat about silent failure applies to exactly this line.
    transaction = (transactions + [h])[:3]
or
def append_hash(h):
    """Append h to the global transactions list, capped at 3 entries.

    BUG FIX: the original body read ``transaction.append(3)`` -- it
    referenced an undefined name (``transaction`` instead of
    ``transactions``) and appended the constant 3 rather than ``h``.
    """
    if len(transactions) < 3:
        transactions.append(h)
    # else:
    #     raise some error you need to choose/define
Neither of those will enforce it, though — you can still modify the list without your function. You would need a separate class — see chepner's answer.
Adding a fourth hash will silently fail - if you want to raise an exception instead, the second solution can be adapted.
I have a question regarding lists in Python. I use the append method to append values to my list, but it only replaces the list with the new values.
This is my code:
def init(serial):
    """Collect matching rows from allserials.csv and hand them to call_api.

    NOTE(review): result is rebuilt on every call because it is local to
    this function -- that is why earlier values appear to be "replaced"
    on each recursive round trip.
    """
    serial_number = serial
    api_call = "http://wwww.herecomesmyhyperlink/"+serial_number
    result = []
    with open('allserials.csv') as csvfile:
        reader = csv.reader(csvfile, delimiter=';', quotechar='|')
        for row in reader:
            # Keep the second column of every row whose first column
            # matches the constructed URL.
            if row[0].strip() == api_call:
                result.append(row[1].strip())
    call_api(serial_number,result)
    return
def call_api(get_serial,get_result):
    """Call the API once per collected row, then pass on to sort_serials.

    NOTE(review): list_serial is local too, so it starts empty on every
    recursive pass through init -> call_api -> sort_serials; the ``....``
    placeholder below means this snippet does not run as-is.
    """
    list_serial = []
    for i in range(len(get_result)):
        # do an api call
        ....
        # get result of api call
        list_serial.append(api_result)
    sort_serials(list_serial)
    return
# NOTE(review): missing ':' after the def line -- this snippet does not
# parse as written.
def sort_serials(get_list_serial)
    # Sort descending and, while below the cap, recurse via init() --
    # which rebuilds its local result list from scratch each time.
    sorted_list_serial = sorted(get_list_serial, reverse=True)
    print(sorted_list_serial)
    max_results = 10
    length_of_sorted_list_serial = len(get_list_serial)
    if length_of_sorted_list_serial < max_results:
        get_first_list_element = sorted_list_serial[0]
        get_second_element_of_that_list = get_first_list_element[1]
        # Recursion restarts the pipeline with a fresh, empty result list.
        init(get_second_element_of_that_list)
    else:
        print("it is not smaller")
    return

print(init('1320739'))
sorted_list_serial would contain something like: [rankingid,serial,title].
get_second_element_of_that_list: [serial]
The thing is that when I run my code I got the following results:
s: 1320739, max result:10 length of the list:3
s: 1523039, max result:10 length of the list:9
What the code does is that instead of having a list of 12 items, it replace the list with the 3 items with the new list of 9 items.
What I want is to actually have a new list containing 12 items, so that the first 3 items are still within the list and the 9 other elements are added to the original list.
The list is scoped to the function call_api() so pull it out, or pass it to each function, or create a class.
def init(serial):
    """Revised sketch: build and *return* the result instead of calling onward."""
    serial_number = serial
    result = []
    with open('allserials.csv') as csvfile:
        # (abbreviated sketch -- the CSV filtering from the question goes
        # here; append() is shown without an argument and will not run.)
        result.append()
    return result
def call_api(get_serial,get_result):
    """Revised sketch: return list_serial to the caller instead of passing it on.

    NOTE(review): with the local assignment commented out, list_serial must
    come from an enclosing scope; the ``....`` placeholder also means this
    does not run as-is.
    """
    # list_serial = []
    #
    # Move this out
    # Or pass it along to each function
    for i in range(len(get_result)):
        # do an api call
        ....
        # get result of api call
        list_serial.append(api_result)
    return list_serial
# NOTE(review): missing ':' after the def line, and the returned
# 'sorted_serial_list' name is never assigned (the sorted list is bound to
# sorted_list_serial) -- this sketch does not run as written.
def sort_serials(get_list_serial)
    sorted_list_serial = sorted(get_list_serial, reverse=True)
    max_results = 10
    length_of_sorted_list_serial = len(get_list_serial)
    if length_of_sorted_list_serial < max_results:
        get_first_list_element = sorted_list_serial[0]
        get_second_element_of_that_list = get_first_list_element[1]
    else:
        print("it is not smaller")
    return {'get_second_element_of_that_list':get_second_element_of_that_list, 'sorted_serial_list':sorted_serial_list}
So scope it to the same function, and have the other functions return results:
def run():
    """Drive init -> call_api -> sort_serials with the list in one scope."""
    serials = []  # the shared list now lives in this single function
    serial_number = '1320739'
    rows = init(serial_number)
    # here the items get set
    serials = call_api(serial_number, rows)
    # here they get sorted
    outcome = sort_serials(serials)
    # serials is now the sorted list
    serials = outcome['sorted_serial_list']
    chosen = outcome['get_second_element_of_that_list']
    init(chosen)
Or redefine how its passed:
# Alternative wiring: thread list_serial through every call explicitly
# (each function signature would gain a list_serial parameter).
serial_number = '1320739'
init(serial_number, list_serial)
call_api(serial_number,result, list_serial)
sort_serials(list_serial)
init(get_second_element_of_that_list, list_serial)
Or just pull it out:
.
.
.
# Alternative: make list_serial a module-level global above the functions.
list_serial = []
print(init('1320739'))
Or create a class:
class SomeClassNameHere(object):
    """Demo class: listserial lives on the instance, shared by all methods."""

    def __init__(self, serialnumber=None, item2=''):
        self.serialnumber = serialnumber
        self.item3 = item2
        # The shared list every method can read and update.
        self.listserial = []
        self.run(item2)

    def doOtherStuff(self):
        # self.listserial will be updated
        self.listserial = [1,2,3]
        print(self.item3)
        print(self.serialnumber)

    def run(self, passeditem2=''):
        # BUG FIX: the demo call ``here.run()`` passed no argument, which
        # raised TypeError; passeditem2 now defaults to '' so run() works
        # with or without a value (existing callers are unaffected).
        print('Item2 has been passed: {0}'.format(passeditem2))
        print('listserial not updated:',self.listserial)
        self.doOtherStuff()
        print('listserial updated:',self.listserial)
here = SomeClassNameHere(serialnumber='456',item2='somestring')
print(here.serialnumber)
print(here.item3)
# NOTE(review): run() is defined with a required passeditem2 parameter,
# so this call raises TypeError as written.
here.run()
here.doOtherStuff()
I'm trying to port a library over to Python 3. It has a tokenizer for PDF streams. The reader class calls next() on these tokens. This worked in Python 2, but when I run it in Python 3 I get TypeError: 'PdfTokens' object is not an iterator.
Selections from tokens.py concerning iterators:
class PdfTokens(object):
    """Tokenizer over a PDF data string (fdata), ported from Python 2.

    NOTE(review): in Python 2 instances exposed a ``next`` method and were
    therefore iterators themselves; Python 3 looks for ``__next__``, which
    is never defined here -- hence the "object is not an iterator"
    TypeError discussed in the question.
    """

    def __init__(self, fdata, startloc=0, strip_comments=True):
        self.fdata = fdata
        self.iterator = iterator = self._gettoks(startloc)
        # NOTE(review): this *calls* next() and stores the first token,
        # consuming it -- presumably a bad mechanical port of the Python 2
        # line ``self.next = iterator.next``; also, strip_comments is never
        # stored although _gettoks reads self.strip_comments (this excerpt
        # is described as "selections" from tokens.py).
        self.next = next(iterator)

    def __iter__(self):
        # Hand back the underlying generator, which is a true iterator.
        return self.iterator

    def _gettoks(self, startloc, cacheobj=_cacheobj,
                 delimiters=delimiters, findtok=findtok, findparen=findparen,
                 PdfString=PdfString, PdfObject=PdfObject):
        # Generator producing tokens from fdata, starting at startloc.
        # The default arguments bind module globals as locals for speed.
        fdata = self.fdata
        # current[0] holds the (start, end) span of the last token; kept in
        # a one-element list so the broken-string repair below can patch it.
        current = self.current = [(startloc, startloc)]
        namehandler = (cacheobj, self.fixname)
        cache = {}
        while 1:
            for match in findtok(fdata, current[0][1]):
                current[0] = tokspan = match.span()
                token = match.group(1)
                firstch = token[0]
                if firstch not in delimiters:
                    # Plain (non-delimiter) token: cache as a PdfObject.
                    token = cacheobj(cache, token, PdfObject)
                elif firstch in '/<(%':
                    if firstch == '/':
                        # PDF Name
                        token = namehandler['#' in token](cache, token, PdfObject)
                    elif firstch == '<':
                        # << dict delim, or < hex string >
                        if token[1:2] != '<':
                            token = cacheobj(cache, token, PdfString)
                    elif firstch == '(':
                        ends = None  # For broken strings
                        if fdata[match.end(1)-1] != ')':
                            # Unbalanced literal string: scan forward,
                            # tracking paren nesting until it closes.
                            nest = 2
                            m_start, loc = tokspan
                            for match in findparen(fdata, loc):
                                loc = match.end(1)
                                ending = fdata[loc-1] == ')'
                                nest += 1 - ending * 2
                                if not nest:
                                    break
                                if ending and ends is None:
                                    ends = loc, match.end(), nest
                            token = fdata[m_start:loc]
                            current[0] = m_start, match.end()
                            if nest:
                                # Still unbalanced at end of data: repair
                                # using the best earlier close, or report.
                                (self.error, self.exception)[not ends]('Unterminated literal string')
                                loc, ends, nest = ends
                                token = fdata[m_start:loc] + ')' * nest
                                current[0] = m_start, ends
                        token = cacheobj(cache, token, PdfString)
                    elif firstch == '%':
                        # Comment
                        if self.strip_comments:
                            continue
                else:
                    self.exception('Tokenizer logic incorrect -- should never get here')
                yield token
                if current[0] is not tokspan:
                    # The string-repair path moved the cursor; restart the
                    # findtok scan from the patched position.
                    break
            else:
                if self.strip_comments:
                    break
        # NOTE(review): raising StopIteration inside a generator becomes a
        # RuntimeError under PEP 479 (Python 3.7+); a plain return is the
        # portable way to end a generator.
        raise StopIteration
The beginning of the offending method in the pdfreader file that raises the error:
def findxref(fdata):
    ''' Find the cross reference section at the end of a file
    '''
    # 'startxref' marks the offset of the cross-reference table near the
    # end of a PDF, so search from the back.
    startloc = fdata.rfind('startxref')
    if startloc < 0:
        raise PdfParseError('Did not find "startxref" at end of file')
    source = PdfTokens(fdata, startloc, False)
    # NOTE(review): next(source) needs __next__ on the PdfTokens instance
    # itself; the class only defines __iter__, hence the TypeError.
    # (The method is quoted only up to this failing line.)
    tok = next(source)
I was under the impression that all you needed to define a custom iterator object was a .__iter__method, a .next() method and to raise a StopIteration error. This class has all these things and yet it stills raises the TypeError.
Furthermore, this library and its methods worked in Python 2.7 and have ceased to work in a Python 3 environment. What about Python 3 has made this different? What can I do to make the PdfTokens object iterable?
You cannot call next on PdfTokens's instance directly, you need to get its iterator first by calling iter() on it. That's exactly what a for-loop does as well*, it calls iter() on the object first and gets an iterator and then within the loop __next__ is invoked on that iterator until it is not exhausted:
instance = PdfTokens(fdata, startloc, False)
# iter() returns the underlying generator (via __iter__); the next()
# builtin then works on that real iterator.
source = iter(instance)
tok = next(source)
Well not always, if there's no __iter__ defined on the class then the iterator protocol falls back to __getitem__ if defined.