Parallelizing list filtering - python

I have a list of items that I need to filter based on some conditions. I'm wondering whether Dask could do this filtering in parallel, as the list is very long (a few dozen million records).
Basically, what I need to do is this:
# Sample records; each one carries a 'type' and a 'weight'.
items = [
    {'type': 'dog', 'weight': 10},
    {'type': 'dog', 'weight': 20},
    {'type': 'cat', 'weight': 15},
    {'type': 'dog', 'weight': 30},
]
def item_is_valid(item):
    """Return True when ``item`` passes every filter condition.

    An item is rejected when its 'type' is 'cat' or, for any other type,
    when its 'weight' is greater than 20.
    """
    # Guard clauses replace the original flag variable, which was also named
    # ``item_is_valid`` and therefore shadowed the function inside its body.
    if item['type'] == 'cat':
        return False
    if item['weight'] > 20:
        return False
    # ...
    # further conditions go here, one guard clause per condition
    return True
# Sequential baseline: keep only the records accepted by item_is_valid.
items_filtered = [candidate for candidate in items if item_is_valid(candidate)]
With Dask, what I have managed to do is the following:
def item_is_valid_v2(item):
    """Return the whole item if valid, otherwise None.

    An item is invalid when its 'type' is 'cat' or, for any other type,
    when its 'weight' is greater than 20.
    """
    # Guard clauses replace the original flag variable; the None result for
    # rejected items is now explicit instead of an implicit fall-through.
    if item['type'] == 'cat':
        return None
    if item['weight'] > 20:
        return None
    # ...
    # further conditions go here, one guard clause per condition
    return item
# Build one lazy task per item, then evaluate them all in a single compute().
# item_is_valid_v2 is the function that returns the item itself (or None),
# which is what the output quoted below shows; the bool-returning
# item_is_valid would yield True/False instead.
delayed_results = [dask.delayed(item_is_valid_v2)(item) for item in items]
# dask.compute returns a tuple with one entry per delayed task.
results = dask.compute(*delayed_results)
However, the result I get contains a few None values, which then need to be filtered out somehow in a non-parallel way.
({'type': 'dog', 'weight': 10}, {'type': 'dog', 'weight': 20}, None, None)

Perhaps the bag API will work for you; this is rough pseudo-code:
import dask.bag as db
# from_sequence needs the sequence to distribute; here, the in-memory list.
bag = db.from_sequence(items)  # or better yet read it from disk
result = bag.filter(item_is_valid)  # note this uses the first version (bool)
To inspect if this is working, inspect the outcome of result.take(5) and if that is satisfactory:
computed_result = result.compute()

Related

Losing dict content as soon as I am out of the loop in Python

Team: I need some assistance.
Subject: losing dict content as soon as I am out of the loop. The dict is populated from loop variables, which are added to it by subscript assignment (team_oncall_dict[key] = value).
Below, foo() is always executed because team_oncall_dict is empty outside the loop. Any hint on how I can retain its contents as they were inside the loop?
def askduty_oncall(self, *args):
# Looks up the team named by teamteamnm via the '/teams' endpoint, then the
# escalation policy named by team_esp_name for that team, then the on-call
# entries for that policy, and records schedule summary -> user summary in
# team_oncall_dict for entries at escalation level 1.
#
# When args is non-empty, args[0] is used as the paging offset and args[1]
# as the total number of teams; otherwise the first page of `limit` teams
# is requested.
#
# NOTE(review): the indentation of this snippet was lost when it was pasted,
# so the nesting of the statements below (in particular where the final
# print/return sit relative to the loops) cannot be determined from here.
session = APISession(PD_API_KEY, default_from=PD_USER_EMAIL)
total = 1 #true or false
limit = 40
teamteamnm = "Team Test Team"
# NOTE(review): the opening quote on the next line is a typographic (curly)
# quote while the closing one is a straight quote; as written this is a
# syntax error.
team_esp_name = “Team Test Escalation Policy"
teamteamid = ""
teamesplcyid = ""
# Result mapping; created fresh on every call of this function.
team_oncall_dict = {}
if args:
offset = args[0]
total_teams = args[1]
if offset <= total_teams:
print("\nfunc with args with new offset {} called\n".format(offset))
teams = session.get('/teams?limit={0}&total={1}&offset={2}'.format(limit,total,offset))
else:
print("Reached max teams, no more team records to pull")
return
else:
print("\nFunc with no args called, hence pull first set of {} teams as defined by limit var\n".format(limit))
teams = session.get('/teams?limit={0}&total={1}'.format(limit,total))
# A failed '/teams' request ends the call with an implicit None.
if not teams.ok:
return
else:
tj = teams.json()
tjd = tj['teams']
# Loop 1: scan the returned teams for the one named teamteamnm.
for adict in tjd:
if not adict['name'] == teamteamnm:
continue
elif adict['name'] == teamteamnm:
teamteamid = adict['id']
print("\nFound team..\nFetched",adict['name'], "id: {0}".format(teamteamid))
print("Pull escalation policy for team '{}':'{}'".format(teamteamnm,teamteamid))
esclp = session.get('/escalation_policies?total={0}&team_ids%5B%5D={1}'.format(total,teamteamid))
if not esclp.ok:
print("Pulling Escalation polices for team '{}' failed".format(teamteamnm))
return
else:
# NOTE(review): `ep` is assigned here but not read anywhere in the lines shown.
ep = esclp.json()
epj = esclp.json()['escalation_policies']
if not epj:
print("Escalation polices for team '{}' not defined".format(teamteamnm))
return
else:
# Loop 2: scan the team's escalation policies for the one whose summary
# equals team_esp_name.
for adict2 in epj:
if not adict2['summary'] == team_esp_name:
continue
else:
print("***************FOUND FOUND********************")
teamesplcyid = adict2['id']
print("\nFetched {} id: {}\n".format(team_esp_name, teamesplcyid))
oncalls = session.get('/oncalls?total={0}&escalation_policy_ids%5B%5D={1}'.format(total,teamesplcyid))
if not oncalls.ok:
# NOTE(review): the next line uses typographic (curly) quotes and the
# words `with oncalls` fall outside the string; it is not valid Python.
print(“issue “with oncalls)
return
else:
ocj = oncalls.json()['oncalls']
# Loop 3: walk the on-call entries for the policy.
for adict3 in ocj:
print("\n")
print(adict3['escalation_level'])
# NOTE(review): `i` is not defined anywhere in this function; the loop
# variable here is adict3.
if i['escalation_level'] == 1:
# NOTE(review): in the sample data quoted after this function, 'schedule'
# is None; subscripting None with ['summary'] raises TypeError. Confirm
# against real responses.
print(adict3['schedule']['summary'], adict3['user']['summary'])
team_oncall_dict[adict3['schedule']['summary']] = adict3['user']['summary']
print(team_oncall_dict)
# NOTE(review): whether this return is inside one of the loops above is
# unknown; if it is, the function returns on the first iteration that
# reaches it.
return team_oncall_dict
if not team_oncall_dict: #part of func def
# NOTE(review): `do foo()` is not valid Python; it stands in for whatever
# the original code runs when the dict is empty.
do foo()
output
foo stuff
sample data is a list of dicts
[{'escalation_policy': {'id': 'P8RKTEE', 'type': 'escalation_policy_reference', 'summary': 'Team Escalation Policy'}, 'escalation_level': 3, 'schedule': None, 'user': {'id': 'PX8XYFT', 'type': 'user_reference', 'summary': 'M1'}, 'start': None, 'end': None},
{'escalation_policy': {'id': 'P8RKTEE', 'type': 'escalation_policy_reference', 'summary': 'Team Escalation Policy'}, 'escalation_level': 1, 'schedule': None, 'user': {'id': 'PKXXVJI', 'type': 'user_reference', 'summary': 'R1'}, 'start': None, 'end': None},
{'escalation_policy': {'id': 'P8RKTEE', 'type': 'escalation_policy_reference', 'summary': 'Team'}, 'escalation_level': 2, 'schedule': None, 'user': {'id': 'PN8F9PC', 'type': 'user_reference', 'summary': 'T1'}, 'start': None, 'end': None}]
Btw: the above is the 4th, innermost loop.
Diagrammatically, the flow looks like this:
def func1()
team_oncall_dict = {}
loop1
loop2
loop3
loop4
...
team_oncall_dict
if not team_oncall_dict:
print("dict is empty")
output
dict is empty
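It was a local vs. global scoping issue. I fixed it by declaring team_oncall_dict globally, outside the function.
Instead of the first form below (dict created inside the function), I now use the second form (dict created before the def):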
def func1()
team_oncall_dict = {}
team_oncall_dict = {}
def func1()

@pytest.mark.parametrize in similar tests

I'm planning to write tests for a database query, but I wonder what the best way to write them is, and whether I can use @pytest.mark.parametrize for this kind of test, because I need to write 4 tests that look very similar:
def test_get_number(self):
    """Querying by number alone returns all four records with that number."""
    # first create test data in the database
    # then make queries
    test_number = 'test_number'
    # The original passed an undefined name `number`; the value defined
    # above is `test_number`.
    data_base_records_number = self.get_records_query(test_number)
    # data_base_records_number =
    # [{'id': 1,
    #   'number': "test_number",
    #   'company': "test_company1",
    #   'warehouse': "test_warehouse1",
    #   'product': "test_product1"},
    #  {'id': 2,
    #   'number': "test_number",
    #   'company': "test_company2",
    #   'warehouse': "test_warehouse2",
    #   'product': "test_product2"},
    #  {'id': 3,
    #   'number': "test_number",
    #   'company': "test_company3",
    #   'warehouse': "test_warehouse3",
    #   'product': "test_product3"},
    #  {'id': 4,
    #   'number': "test_number",
    #   'company': "test_company4",
    #   'warehouse': "test_warehouse4",
    #   'product': "test_product4"}
    # ]
    assert len(data_base_records_number) == 4
    # The original iterated an undefined name `data_base_records`.
    for record in data_base_records_number:
        assert record['number'] == test_number
def test_get_number_and_company(self):
    """Querying by number and company returns exactly the one matching record."""
    # first create test data in the database
    # then make queries
    test_number = 'test_number'
    company = 'test_company1'
    # The original passed an undefined name `number`; the value defined
    # above is `test_number`.
    data_base_records_number_company = self.get_records_query(test_number, company)
    # Output
    # data_base_records_number_company =
    # [{'id': 1,
    #   'number': "test_number",
    #   'company': "test_company1",
    #   'warehouse': "test_warehouse1",
    #   'product': "test_product1"},
    # ]
    assert len(data_base_records_number_company) == 1
    # The query result is a list (see the expected output above), so the
    # single record has to be taken out before its fields can be checked.
    record = data_base_records_number_company[0]
    assert record['number'] == test_number
    assert record['company'] == company
def test_get_number_and_warehouse(self):
# Placeholder: the body is elided (`...`) in the original post.
...
def test_get_number_and_product(self):
# Placeholder: the body is elided (`...`) in the original post.
...

QTableview Select Item from filtered model

I have a QTableView which is using a QStandardItemModel. How can I select a specific row in the table with/without a search applied.
For example I typed 'y' in the search bar to filter the list to only display rows that contain the letter 'y'. When I click the button 'Select Emily' how can i make it select the correct row in the tableview, considering users can change the sort order?
import sys
from PySide import QtCore, QtGui
class Example(QtGui.QWidget):
    """Demo widget: a search box, a sortable table of people, and a button.

    The table displays a QStandardItemModel through a QSortFilterProxyModel.
    Typing in the search box hides every row that does not contain all of
    the typed words.
    """

    def __init__(self, *args, **kwargs):
        super(Example, self).__init__(*args, **kwargs)
        self.resize(400, 400)

        # data
        people = [
            {'name': 'Kevin', 'age': 5, 'career': 'athlete'},
            {'name': 'Maggie', 'age': 13, 'career': 'banker'},
            {'name': 'Leslie', 'age': 32, 'career': 'banker'},
            {'name': 'Abby', 'age': 32, 'career': 'marketing'},
            {'name': 'Emily', 'age': 45, 'career': 'athlete'},
            {'name': 'David', 'age': 27, 'career': 'banker'},
            {'name': 'Johnny', 'age': 27, 'career': 'soccer'},
            {'name': 'Marie', 'age': 63, 'career': 'secretary'},
        ]

        # Size the model from the data rather than a hard-coded row count.
        model = QtGui.QStandardItemModel(len(people), 3)
        model.setHorizontalHeaderLabels(['NAME', 'AGE', 'CAREER'])
        for row, obj in enumerate(people):
            model.setItem(row, 0, QtGui.QStandardItem(obj['name']))
            model.setItem(row, 1, QtGui.QStandardItem(str(obj['age'])))
            model.setItem(row, 2, QtGui.QStandardItem(obj['career']))

        # The view talks to the proxy, never to the source model directly.
        proxy_model = QtGui.QSortFilterProxyModel()
        proxy_model.setSourceModel(model)

        # controls
        self.ui_table = QtGui.QTableView()
        self.ui_table.setEditTriggers(QtGui.QAbstractItemView.NoEditTriggers)
        self.ui_table.setSelectionBehavior(QtGui.QAbstractItemView.SelectRows)
        self.ui_table.setSelectionMode(QtGui.QAbstractItemView.SingleSelection)
        self.ui_table.setModel(proxy_model)
        self.ui_table.setSortingEnabled(True)
        self.ui_table.sortByColumn(
            0, self.ui_table.horizontalHeader().sortIndicatorOrder())

        self.ui_search = QtGui.QLineEdit()
        self.ui_selected = QtGui.QPushButton('Select Emily')

        # lay main
        lay_main = QtGui.QVBoxLayout()
        lay_main.addWidget(self.ui_search)
        lay_main.addWidget(self.ui_table)
        lay_main.addWidget(self.ui_selected)
        self.setLayout(lay_main)

        # connections
        self.ui_selected.clicked.connect(self.clicked_selected_emily)
        self.ui_search.textChanged.connect(self.filter_items)

    def clicked_selected_emily(self):
        """Select row 2 of the table.

        NOTE: 2 is a row position in the table's (proxy) model, not a
        reference to the 'Emily' record, so which person ends up selected
        depends on the current sort order.
        """
        # print() with a single argument behaves the same on Python 2 and 3;
        # the original print statement is a syntax error on Python 3.
        print('select emily')
        self.ui_table.selectRow(2)

    def filter_items(self, text):
        """Re-evaluate the visibility of every row against ``text``."""
        rows = self.ui_table.model().rowCount()
        for row in range(rows):
            self.filter_row(row, text)

    def filter_row(self, row, pattern):
        """Show ``row`` if it matches ``pattern``, hide it otherwise.

        ``pattern`` is split on spaces; the row matches when every resulting
        word is a case-insensitive substring of at least one cell in the row.
        An empty pattern shows the row.
        """
        if not pattern:
            self.ui_table.setRowHidden(row, False)
            return

        model = self.ui_table.model()
        columns = model.columnCount()

        # collect the lower-cased text of every column for searching
        stringlist = []
        for c in range(columns):
            mdx = model.index(row, c)
            if mdx.isValid():
                val = str(mdx.data(role=QtCore.Qt.DisplayRole)).lower()
                stringlist.append(val)

        # search for string; empty words (from repeated spaces) are dropped
        patterns = [x.lower() for x in pattern.split(' ') if x]
        results = all(any(x in y for y in stringlist) for x in patterns)
        self.ui_table.setRowHidden(row, not results)
if __name__ == '__main__':
    # Create the application, show the demo widget, and hand control to the
    # Qt event loop; its return value becomes the process exit status.
    app = QtGui.QApplication(sys.argv)
    window = Example()
    window.show()
    sys.exit(app.exec_())
You have to use the match() method of the model:
def clicked_selected_emily(self):
    """Select every row whose first-column text is exactly "Emily"."""
    print("select emily")
    table = self.ui_table
    table.clearSelection()
    model = table.model()
    matches = model.match(
        model.index(0, 0),         # start index; its column is the one searched
        QtCore.Qt.DisplayRole,     # role of the search: displayed text uses Qt::DisplayRole
        "Emily",                   # value that is being searched for in the model
        -1,                        # maximum number of hits; -1 returns all matches
        QtCore.Qt.MatchExactly,    # type of search
    )
    for hit in matches:
        table.selectRow(hit.row())
Update:
By default, match() searches only the column of the start index, i.e. the column passed to self.ui_table.model().index(0, col) in the previous example. If you want to search through all the columns, iterate over them. To make the effect visible, multi-selection is enabled:
self.ui_table.setSelectionMode(QtGui.QAbstractItemView.MultiSelection)
...
def clicked_selected_emily(self):
    """Select every row that has a cell, in any column, equal to ``word``.

    match() only searches the column of its start index, so it is called
    once per column.
    """
    word = "banker"
    print("select " + word)
    self.ui_table.clearSelection()
    model = self.ui_table.model()
    for column in range(model.columnCount()):
        indexes = model.match(
            model.index(0, column),   # start index; its column is the one searched
            QtCore.Qt.DisplayRole,    # role of the search: displayed text uses Qt::DisplayRole
            word,                     # value that is being searched for in the model
            -1,                       # maximum number of hits; -1 returns all matches
            QtCore.Qt.MatchExactly,   # type of search
        )
        for ix in indexes:
            self.ui_table.selectRow(ix.row())

Python Cerberus how to check dynamic root keys

I have a dict with IDs as its root keys that I want to validate. In other words, the root keys of the dict I want to validate are dynamic. Is there a way to run keyschema against the root keys?
e.g. https://repl.it/@crunk1/cerberusrootkeys
import cerberus
# Validator created without a schema; the schema is passed per validate() call.
v = cerberus.validator.Validator()
# NOTE(review): presumably Cerberus reads a top-level schema key as a field
# name, so this would describe a field called 'keyschema' rather than a rule
# for the document's own keys. Confirm against the Cerberus documentation.
schema = {'keyschema': {'type': 'string'}}
d = {'foo': 'bar', 'baz': 'gaz'}
print('I want this to be true.')
# Prints False, per the output quoted below.
print(v.validate(d, schema))
### Output:
# I want this to be true.
# False
I know I could do the following:
# Workaround: nest the document under a fixed key so that 'keyschema' can be
# attached to that field's rules instead of the root.
wrapper = {'nested': d}
schema = {'nested': {'keyschema': {'type': 'string'}}}
v.validate(wrapper, schema)
but the current structure of my project doesn't easily allow for that.
Any solutions/tips/suggestions?
I managed to hack something together (https://repl.it/@crunk1/Cerberus-root-types) by subclassing Validator and overriding validate():
class V(cerberus.Validator):
    """Validator that can apply rules to the root document itself.

    When the schema in effect (the ``schema`` argument if given, otherwise
    ``self.schema``) has a '__root__' key, the document is wrapped as
    ``{'__root__': document}`` before validation and unwrapped afterwards.
    """

    def validate(self, document, schema=None, update=False, normalize=True):
        # An explicit schema takes precedence; the instance schema is only
        # consulted when no schema argument was passed.
        effective = schema if schema is not None else self.schema
        root_rules = None
        if effective is not None:
            root_rules = effective.get('__root__', None)
        is_wrapped = root_rules is not None

        payload = None
        if is_wrapped:
            payload = {'__root__': document}
            schema = {'__root__': root_rules}
        payload = payload or document

        outcome = super(V, self).validate(payload, schema, update, normalize)

        if is_wrapped:
            # Unwrap: drop the artificial '__root__' level from the document
            # and from the recorded error paths.
            self.document = self.document['__root__']
            for err in self._errors:
                err.schema_path = tuple(err.schema_path[1:])
                if len(err.document_path) > 1:
                    err.document_path = tuple(err.document_path[1:])
        return outcome
This allows you to treat the root document as a 'type': 'dict' or 'type': 'list'.
v = V()
# Root document is a dict: keys and values are both coerced with int.
d = {'1': '1', '2': '2'}
# '__root__' is the key the V.validate override looks for in the schema.
schema = {'__root__': {
'type': 'dict',
'keyschema': {'coerce': int},
'valueschema': {'coerce': int},
}}
print(v.validate(d, schema), v.document, v.errors)
# Root document is a list: each element is coerced with int.
l = ['1', '2']
schema = {'__root__': {
'type': 'list',
'schema': {'coerce': int},
}}
print(v.validate(l, schema), v.document, v.errors)
# 'b' cannot be coerced to int; the output below shows this call returning False.
l = ['1', 'b']
print(v.validate(l, schema), v.document, v.errors)
Output:
True {1: 1, 2: 2} {}
True [1, 2] {}
False [1, 'b'] {1: ["field '1' cannot be coerced: invalid literal for int() with base 10: 'b'"]}

list comprehension fails but why?

Who can explain to me why this list comprehension fails:
# Seven provider records with ids 1 through 7.
provider1 = {'id': 1, 'name': 'Een'}
provider2 = {'id': 2, 'name': 'Twee'}
provider3 = {'id': 3, 'name': 'Drie'}
provider4 = {'id': 4, 'name': 'Vier'}
provider5 = {'id': 5, 'name': 'Vijf'}
provider6 = {'id': 6, 'name': 'Zes'}
provider7 = {'id': 7, 'name': 'Zeven'}
providers = [provider1, provider2, provider3, provider4, provider5, provider6, provider7]
def testfunc(id):
# Returns the first provider whose 'id' equals id (both compared as int).
# next() is called without a default value, so it raises StopIteration when
# the generator yields nothing, i.e. when no provider matches.
return next(provider for provider in providers if int(provider['id']) == int(id))
# range(0, 8) starts at 0, and none of the providers above has id 0, so the
# first call finds no match. `print testfunc(x)` is a Python 2 print statement.
for x in range(0, 8):
print testfunc(x)
When I run this and 0 is passed to the function, the output is:
Traceback (most recent call last):
File "/Users/me/Documents/scratchpad/main.py", line 17, in <module>
print testfunc(x)
File "/Users/me/Documents/scratchpad/main.py", line 13, in testfunc
return next(provider for provider in providers if int(provider['id']) == int(id))
StopIteration
Process finished with exit code 1
It does work for a non zero integer.
That's because the next() function raises StopIteration when there is no next item. In particular, this occurs when the underlying iterator is empty, which is your case for id == 0.
None of the provider dictionaries has an 'id' equal to 0, so the generator expression yields nothing and next() has no value to return.
Restrict the loop to ids that exist and your code will work:
for x in range(1, 8):
print( testfunc(x))
OR
You could add provider0 = {'id': 0, 'name': 'Onkar'}
and providers = [provider0,provider1, provider2, provider3, provider4, provider5, provider6, provider7] to make
for x in range(0, 8):
print( testfunc(x))
work
Yes, because your generator is empty. None of your data matches
if int(provider['id']) == 0
Calling next on an empty generator throws the StopIteration.

Categories