I'm receiving many CSV-files that contain orders for different products. Those CSV-files need to be "converted" into a specific JSON-structure.
Each row of the CSV-file represents the order of one product. This means that if I ordered two products, the CSV would contain two rows.
A simplified version of the CSV-file may look like this (please note the orderId "111" in the first and third row):
orderId,itemNumber,itemName,name,street
111,123,testitem,john doe,samplestreet 1
222,345,anothertestitem,jane doe,samplestreet 1
111,345,anothertestitem,john doe,samplestreet 1
My current solution works but I think I'm overcomplicating things.
Currently, I'm iterating over each CSV row and building the JSON structure, using a helper function that either creates a new order or appends the item to an existing order's list of ordered items, like so:
def add_orderitem(orderitem, order, all_orders):
    """ Adds an ordered product to the order or "create" a new order if it doesn't exist """
    for row in all_orders:
        # Order already exists
        if any(order["orderNumber"] == value for field, value in row.items()):
            print(f"Order '{order['orderNumber']}' already exists, adding product #{orderitem['sku']}")
            row["orderItems"].append(orderitem)
            return all_orders
    # New order
    print(f"New Order found, creating order '{order['orderNumber']}' and adding product #{orderitem['sku']}")
    all_orders.append(order)
    order["orderItems"].append(orderitem)
    return all_orders
def parse_orders():
    """ Converts CSV-orders into JSON """
    results = []
    orders = read_csv("testorder.csv")  # helper-function returns CSV-dictreader (list of dicts)
    for order in orders:
        # Create basic structure
        orderdata = {
            "orderNumber": order["orderId"],
            "address": {
                "name": order["orderId"],
                "street": order["street"]
            },
            "orderItems": []  # <-- this will be filled later
        }
        # Extract product-information that will be inserted in above 'orderItems' list
        product = {
            "sku": order["itemNumber"],
            "name": order["itemName"]
        }
        # Add order to final list or add item if order already exists
        results = add_orderitem(product, orderdata, results)
    return results
def main():
    from pprint import pprint
    parsed_orders = parse_orders()
    pprint(parsed_orders)

if __name__ == "__main__":
    main()
The script works fine; the output below is what I'm expecting:
New Order found, creating order '111' and adding product #123
New Order found, creating order '222' and adding product #345
Order '111' already exists, adding product #345
[{'address': {'name': '111', 'street': 'samplestreet 1'},
  'orderItems': [{'name': 'testitem', 'sku': '123'},
                 {'name': 'anothertestitem', 'sku': '345'}],
  'orderNumber': '111'},
 {'address': {'name': '222', 'street': 'samplestreet 1'},
  'orderItems': [{'name': 'anothertestitem', 'sku': '345'}],
  'orderNumber': '222'}]
Is there a way to do this "smarter"?
IMO a namedtuple and a groupby would make your code clearer:
from collections import namedtuple
from itertools import groupby
# csv data or file
data = """orderId,itemNumber,itemName,name,street
111,123,testitem,john doe,samplestreet 1
222,345,anothertestitem,jane doe,samplestreet 1
111,345,anothertestitem,john doe,samplestreet 1
"""
# the Order tuple
Order = namedtuple('Order', 'orderId itemNumber itemName name street')
# load the csv into Order tuples
orders = [Order(*values) for line in data.split("\n")[1:] if line for values in [line.split(",")]]

# sort by orderId so groupby sees each order as one contiguous run
orders = sorted(orders, key=lambda order: order.orderId)

# group it by orderId and build the output structure
output = list()
for key, values in groupby(orders, key=lambda order: order.orderId):
    items = list(values)
    dct = {"address": {"name": items[0].name, "street": items[0].street},
           "orderItems": [{"name": item.itemName, "sku": item.itemNumber} for item in items]}
    output.append(dct)
print(output)
This yields
[{'address': {'name': 'john doe', 'street': 'samplestreet 1'}, 'orderItems': [{'name': 'testitem', 'sku': '123'}, {'name': 'anothertestitem', 'sku': '345'}]},
{'address': {'name': 'jane doe', 'street': 'samplestreet 1'}, 'orderItems': [{'name': 'anothertestitem', 'sku': '345'}]}]
You could even cram all of it into a single comprehension, but that would not make it more readable.
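For the curious, such a comprehension might look roughly like this (a sketch only; it builds the same dicts as the loop above):

output = [
    {
        "address": {"name": items[0].name, "street": items[0].street},
        "orderItems": [{"name": item.itemName, "sku": item.itemNumber} for item in items],
    }
    for key, values in groupby(orders, key=lambda order: order.orderId)
    for items in [list(values)]
]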
I want to get all the fieldnames from the union of two lists of dictionaries to later export as CSV, but I'm only getting the fieldnames from one list.
I want to get all fieldnames because when I go to export to csv I get the following error:
ValueError: dict contains fields not in fieldnames: 'amzlink', 'url', 'asin'
amazondata = [{'amzlink': 'https://www.amazon.com/dp/B084ZZ7VY3', 'asin': 'B084ZZ7VY3', 'url': 'https://www.amazon.com/s?k=712145360504&s=review-rank'}]
amazonPage = [{'price': '$14.95', 'image': 'https://m.media-amazon.com/images/I/81D1P4QqLfL._AC_SX425_.jpg', 'rating': '4.7 out of 5'}]
result = []
amazonPage.extend(amazondata)
for myDict in amazonPage:
    if myDict not in result:
        result.append(myDict)

print(result[0])
If you are just looking to get a list of all field names in the dictionaries:
Extract the keys from the dictionaries, convert them to sets, and take the union of the sets.
Borrowed @Baramr's amazondata list to demonstrate this below:
amazondata = [{'amzlink': 'https://www.amazon.com/dp/B084ZZ7VY3', 'asin': 'B084ZZ7VY3', 'url': 'https://www.amazon.com/s?k=712145360504&s=review-rank'}]
amazonPage = [{'price': '$14.95', 'image': 'https://m.media-amazon.com/images/I/81D1P4QqLfL._AC_SX425_.jpg', 'rating': '4.7 out of 5'}]
amazondata_fields = set(amazondata[0].keys())
amazonPage_fields = set(amazonPage[0].keys())
all_fields = amazondata_fields.union(amazonPage_fields)
print(all_fields)
{'price', 'rating', 'asin', 'image', 'amzlink', 'url'}
If you are looking to fuse two dictionaries: Use the update method.
amazondata[0].update(amazonPage[0])
print(amazondata[0])
{'amzlink': 'https://www.amazon.com/dp/B084ZZ7VY3', 'asin': 'B084ZZ7VY3', 'url': 'https://www.amazon.com/s?k=712145360504&s=review-rank', 'price': '$14.95', 'image': 'https://m.media-amazon.com/images/I/81D1P4QqLfL._AC_SX425_.jpg', 'rating': '4.7 out of 5'}
Loop over all the dictionaries, adding the keys to a set.
amazondata = [{'amzlink': 'https://www.amazon.com/dp/B084ZZ7VY3', 'asin': 'B084ZZ7VY3', 'url': 'https://www.amazon.com/s?k=712145360504&s=review-rank'}]
amazonPage = [{'price': '$14.95', 'image': 'https://m.media-amazon.com/images/I/81D1P4QqLfL._AC_SX425_.jpg', 'rating': '4.7 out of 5'}]
result = []
amazonPage.extend(amazondata)
all_fields = set()
for myDict in amazonPage:
    all_fields |= myDict.keys()
print(all_fields)
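Since the original goal was a CSV export, the combined field set can then be handed to csv.DictWriter; restval fills in the columns a given dict does not have. A sketch (the output file name is hypothetical):

import csv

with open("combined.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=sorted(all_fields), restval="")
    writer.writeheader()
    writer.writerows(amazonPage)  # after extend(), this list holds both dicts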
I have a csv file and I'm trying to create a nested dictionary that looks like this:
contacts = {"Tom": {"name": "Tom Techie",
                    "phone": "123 123546",
                    "email": "tom#tom.fi",
                    "skype": "skypenick"},
            "Mike": {"name": "Mike Mechanic",
                     "phone": "000 123546",
                     "email": "mike#mike.fi",
                     "skype": "-Mike-M-"}}
etc
And this is what I have written:
file = open("csv", "r")

d = {}
for i in file:
    f = i.strip()
    x = f.split(";")
    if x[4] != "":
        d.update({x[0]: {"name": x[1],
                         "phone": x[2],
                         "email": x[3],
                         "skype": x[4]}})
    else:
        d.update({x[0]: {"name": x[1],
                         "phone": x[2],
                         "email": x[3]}})
However, it prints the result as a plain dictionary rather than the nested structure shown above.
EDIT:
First lines of the csv:
key;name;phone;email;skype
Tom;Tom Techie;123 123546;tom#tom.fi;skypenick
Mike;Mike Mechanic;000 123456;mike#mike.fi;-Mike-M-
Archie;Archie Architect;050 987654;archie#archie
You can use pd.read_csv() and to_dict():
import pandas as pd
contacts = pd.read_csv('test.csv', sep=';').set_index('key').to_dict(orient='index')
Yields:
{'Tom': {'name': 'Tom Techie', 'phone': '123 123546', 'email': 'tom#tom.fi', 'skype': 'skypenick'}, 'Mike': {'name': 'Mike Mechanic', 'phone': '000 123456', 'email': 'mike#mike.fi', 'skype': '-Mike-M-'}, 'Archie': {'name': 'Archie Architect', 'phone': '050 987654', 'email': 'archie#archie', 'skype': nan}}
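If the NaN for Archie's missing skype value is unwanted, it can be filled before the conversion (a sketch; the empty-string default is an assumption):

import pandas as pd

contacts = (pd.read_csv('test.csv', sep=';')
              .set_index('key')
              .fillna('')  # replace NaN, e.g. Archie's missing skype, with ''
              .to_dict(orient='index'))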
I like the pandas answer, but if you don't want a 3rd party library, use the built-in csv module:
import csv
from pprint import pprint

D = {}
with open('csv', newline='') as f:
    r = csv.DictReader(f, delimiter=';')
    for line in r:
        name = line['key']
        del line['key']
        D[name] = dict(line)

pprint(D)
Output:
{'Archie': {'email': 'archie#archie',
            'name': 'Archie Architect',
            'phone': '050 987654',
            'skype': None},
 'Mike': {'email': 'mike#mike.fi',
          'name': 'Mike Mechanic',
          'phone': '000 123456',
          'skype': '-Mike-M-'},
 'Tom': {'email': 'tom#tom.fi',
         'name': 'Tom Techie',
         'phone': '123 123546',
         'skype': 'skypenick'}}
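As a small aside, if an empty string is preferable to None for Archie's missing skype column, DictReader's restval parameter supplies that default. A sketch of the same idea in condensed form:

import csv
from pprint import pprint

with open('csv', newline='') as f:
    r = csv.DictReader(f, delimiter=';', restval='')  # short rows get '' instead of None
    D = {line.pop('key'): dict(line) for line in r}

pprint(D)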
You can use zip() to achieve your goal:
file = """key;name;phone;email;skype
Tom;Tom Techie;123 123546;tom#tom.fi;skypenick
Mike;Mike Mechanic;000 123456;mike#mike.fi;-Mike-M-
Archie;Archie Architect;050 987654;archie#archie""".splitlines()
d = {}
h = None
for i in file: # works the same for your csv-file
# first row == header, store in h
if h is None:
h = i.strip().split(";")[1:]
continue # done for first row
x = i.strip().split(";")
# zip pairs the read in line with the header line to get tuples
# wich are fed into the dict constructor that creates the inner dict
d[x[0]] = dict(zip(h,x[1:]+[""])) # no default for skype
# use this instead if you want the skype key always present with empty default
# d[x[0]] = dict(zip(h,x[1:]+[""]))
print(d)
zip() discards the extra elements of the longer list, so you won't need any checks for that.
Output:
{'Tom': {'name': 'Tom Techie', 'phone': '123 123546',
         'email': 'tom#tom.fi', 'skype': 'skypenick'},
 'Mike': {'name': 'Mike Mechanic', 'phone': '000 123456',
          'email': 'mike#mike.fi', 'skype': '-Mike-M-'},
 'Archie': {'name': 'Archie Architect', 'phone': '050 987654',
            'email': 'archie#archie'}}
If you use the commented line, the data will get a default value of '' for skype - this works only because skype is the last element of the split line.
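A quick illustration of the zip() behaviour relied on here (hypothetical values, mirroring Archie's row):

h = ["name", "phone", "email", "skype"]
row = ["Archie Architect", "050 987654", "archie#archie"]  # no skype value

print(dict(zip(h, row)))         # skype key is simply missing
print(dict(zip(h, row + [""])))  # skype is present with an empty default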
You can use a dict comprehension! Assuming the data is semicolon-separated as shown in the question:
with open("df.csv", "r") as file:
d = {x.split(";")[0]:{
"name": x.split(";")[2],
"phone": x.split(";")[3],
"email": x.split(";")[1],
"skype": x.split(";")[4][:-1] # Slice off trailing newline
} for x in file}
d.pop("")
We want to open files using with whenever possible to benefit from Python's context management. See https://www.python.org/dev/peps/pep-0343/ for a fundamental understanding of the with statement.
Since the key "key" (produced by the header row) only appears once, we can pop it at the end and avoid performing a comparison at every iteration. A dict comprehension accomplishes the same thing you wanted to achieve with d.update.
More about comprehensions:
https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions
Edit: refactoring to remove the repetitive calls to .split can look something like this:
def line_to_dict(x, d):
    x = x.split(";")
    d[x[0]] = {
        "name": x[1],
        "phone": x[2],
        "email": x[3],
        "skype": x[4][:-1]  # Slice off trailing newline
    }

with open("df.csv", "r") as file:
    d = {}
    for x in file:
        line_to_dict(x, d)

d.pop("key")
Simple Python question, but I'm scratching my head over the answer!
I have an array of strings of arbitrary length called path, like this:
path = ['country', 'city', 'items']
I also have a dictionary, data, and a string, unwanted_property. I know that the dictionary is of arbitrary depth and is dictionaries all the way down, with the exception of the items property, which is always an array.
[CLARIFICATION: The point of this question is that I don't know what the contents of path will be. They could be anything. I also don't know what the dictionary will look like. I need to walk down the dictionary as far as the path indicates, and then delete the unwanted properties from there, without knowing in advance what the path looks like, or how long it will be.]
I want to retrieve the parts of the data object (if any) that matches the path, and then delete the unwanted_property from each.
So in the example above, I would like to retrieve:
data['country']['city']['items']
and then delete unwanted_property from each of the items in the array. I want to amend the original data, not a copy. (CLARIFICATION: By this I mean, I'd like to end up with the original dict, just minus the unwanted properties.)
How can I do this in code?
I've got this far:
path = ['country', 'city', 'items']

data = {
    'country': {
        'city': {
            'items': [
                {
                    'name': '114th Street',
                    'unwanted_property': 'foo',
                },
                {
                    'name': '8th Avenue',
                    'unwanted_property': 'foo',
                },
            ]
        }
    }
}

for p in path:
    if p == 'items':
        data = [i for i in data[p]]
    else:
        data = data[p]

if isinstance(data, list):
    for d in data:
        del d['unwanted_property']
else:
    del data['unwanted_property']
The problem is that this doesn't amend the original data. It also relies on items always being the last string in the path, which may not always be the case.
CLARIFICATION: I mean that I'd like to end up with:
{
    'country': {
        'city': {
            'items': [
                {
                    'name': '114th Street'
                },
                {
                    'name': '8th Avenue'
                },
            ]
        }
    }
}
Whereas what I have available in data is only [{'name': '114th Street'}, {'name': '8th Avenue'}].
I feel like I need something like XPath for the dictionary.
The problem is that you are overwriting the original data reference. Change your processing code to
temp = data
for p in path:
    temp = temp[p]

if isinstance(temp, list):
    for d in temp:
        del d['unwanted_property']
else:
    del temp['unwanted_property']
In this version, you set temp to point to the same object that data was referring to. temp is not a copy, so any changes you make to it will be visible in the original object. Then you step temp along itself, while data remains a reference to the root dictionary. When you find the path you are looking for, any changes made via temp will be visible in data.
I also removed the line data = [i for i in data[p]]. It creates an unnecessary copy of the list that you never need, since you are not modifying the references stored in the list, just the contents of the references.
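A tiny, self-contained illustration of that reference behaviour (hypothetical data, not the question's):

data = {'a': {'b': 1}}
temp = data        # temp and data are two names for the same dict
temp = temp['a']   # walk down; data still refers to the root
del temp['b']      # mutate through temp ...
print(data)        # ... and the change is visible via data: {'a': {}}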
The fact that path is not pre-determined (besides the fact that items is going to be a list) means that you may end up getting a KeyError in the first loop if the path does not exist in your dictionary. You can handle that gracefully by doing something more like:
try:
    temp = data
    for p in path:
        temp = temp[p]
except KeyError:
    print('Path {} not in data'.format(path))
else:
    if isinstance(temp, list):
        for d in temp:
            del d['unwanted_property']
    else:
        del temp['unwanted_property']
The problem you are facing is that you are re-assigning the data variable to an undesired value. In the body of your for loop you are setting data to the next level down the tree; for instance, given your example, data will take the following values (in order) by the time it leaves the for loop:
data == {'country': {'city': {'items': [{'name': '114th Street', 'unwanted_property': 'foo',}, {'name': '8th Avenue', 'unwanted_property': 'foo',},]}}}
data == {'city': {'items': [{'name': '114th Street', 'unwanted_property': 'foo',}, {'name': '8th Avenue', 'unwanted_property': 'foo',},]}}
data == {'items': [{'name': '114th Street', 'unwanted_property': 'foo',}, {'name': '8th Avenue', 'unwanted_property': 'foo',},]}
data == [{'name': '114th Street', 'unwanted_property': 'foo',}, {'name': '8th Avenue', 'unwanted_property': 'foo',},]
Then when you delete the items from your dictionaries at the end you are left with data being a list of those dictionaries as you have lost the higher parts of the structure. Thus if you make a backup reference for your data you can get the correct output, for example:
path = ['country', 'city', 'items']

data = {
    'country': {
        'city': {
            'items': [
                {
                    'name': '114th Street',
                    'unwanted_property': 'foo',
                },
                {
                    'name': '8th Avenue',
                    'unwanted_property': 'foo',
                },
            ]
        }
    }
}

data_ref = data
for p in path:
    if p == 'items':
        data = [i for i in data[p]]
    else:
        data = data[p]

if isinstance(data, list):
    for d in data:
        del d['unwanted_property']
else:
    del data['unwanted_property']

data = data_ref
def delKey(your_dict, path):
    if len(path) == 1:
        # last path element reached: delete that key from every dict in the list
        for item in your_dict:
            del item[path[0]]
        return
    # otherwise walk one level deeper along the path
    delKey(your_dict[path[0]], path[1:])
>>> data
{'country': {'city': {'items': [{'name': '114th Street', 'unwanted_property': 'foo'}, {'name': '8th Avenue', 'unwanted_property': 'foo'}]}}}
>>> path
['country', 'city', 'items', 'unwanted_property']
>>> delKey(data, path)
>>> data
{'country': {'city': {'items': [{'name': '114th Street'}, {'name': '8th Avenue'}]}}}
You need to remove the key unwanted_property.
names_list = []

def remove_key_from_items(data):
    for d in data:
        if d != 'items':
            remove_key_from_items(data[d])
        else:
            for item in data[d]:
                unwanted_prop = item.pop('unwanted_property', None)
                names_list.append(item)
This will remove the key. The second parameter None is returned if the key unwanted_property does not exist.
EDIT:
You can use pop even without the second parameter. It will raise KeyError if the key does not exist.
EDIT 2: Updated to recursively go into the depth of the data dict until it finds the items key, where it pops the unwanted_property as desired and appends each item to the names_list list to get the desired output.
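A possible usage with the sample data from the question (a sketch; the commented results follow from the code above):

remove_key_from_items(data)
print(names_list)  # -> [{'name': '114th Street'}, {'name': '8th Avenue'}]
print(data)        # the original dict, now without 'unwanted_property'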
Using operator.itemgetter you can compose a function to return the final key's value.
import operator, functools

def compose(*functions):
    '''returns a callable composed of the functions

    compose(f, g, h, k) -> f(g(h(k())))
    '''
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)

get_items = compose(*[operator.itemgetter(key) for key in path[::-1]])
Then use it like this:
path = ['country', 'city', 'items']
unwanted_property = 'unwanted_property'

for thing in get_items(data):
    del thing[unwanted_property]
Of course if the path contains non-existent keys it will throw a KeyError - you probably should account for that:
path = ['country', 'foo', 'items']
get_items = compose(*[operator.itemgetter(key) for key in path[::-1]])

try:
    for thing in get_items(data):
        del thing[unwanted_property]
except KeyError as e:
    print('missing key:', e)
You can try this:
path = ['country', 'city', 'items']

previous_data = data[path[0]]
previous_key = path[0]
for i in path:
    previous_data = previous_data[i]
    previous_key = i
    if isinstance(previous_data, list):
        for c, b in enumerate(previous_data):
            if "unwanted_property" in b:
                del previous_data[c]["unwanted_property"]

current_dict = {}
previous_data_dict = {}
for i, a in enumerate(path):
    if i == 0:
        current_dict[a] = data[a]
        previous_data_dict = data[a]
    else:
        if a == previous_key:
            current_dict[a] = previous_data
        else:
            current_dict[a] = previous_data_dict[a]
        previous_data_dict = previous_data_dict[a]

data = current_dict
print(data)
Output:
{'country': {'city': {'items': [{'name': '114th Street'}, {'name': '8th Avenue'}]}}, 'items': [{'name': '114th Street'}, {'name': '8th Avenue'}], 'city': {'items': [{'name': '114th Street'}, {'name': '8th Avenue'}]}}
I have a database schema in Postgres that looks like this (in pseudo code):
users (table):
    pk (field, unique)
    name (field)

permissions (table):
    pk (field, unique)
    permission (field, unique)

addresses (table):
    pk (field, unique)
    address (field, unique)

association1 (table):
    user_pk (field, foreign_key)
    permission_pk (field, foreign_key)

association2 (table):
    user_pk (field, foreign_key)
    address_pk (field, foreign_key)
Hopefully this makes intuitive sense. It's a users table that has a many-to-many relationship with a permissions table as well as a many-to-many relationship with an addresses table.
In Python, when I perform the correct SQLAlchemy query incantations, I get back results that look something like this (after converting them to a list of dictionaries in Python):
results = [
    {'pk': 1, 'name': 'Joe', 'permission': 'user', 'address': 'home'},
    {'pk': 1, 'name': 'Joe', 'permission': 'user', 'address': 'work'},
    {'pk': 1, 'name': 'Joe', 'permission': 'admin', 'address': 'home'},
    {'pk': 1, 'name': 'Joe', 'permission': 'admin', 'address': 'work'},
    {'pk': 2, 'name': 'John', 'permission': 'user', 'address': 'home'},
]
So in this contrived example, Joe is both a user and an admin. John is only a user. Both Joe's home and work addresses exist in the database. Only John's home address exists.
So the question is, does anybody know the best way to go from these SQL query 'results' to the more compact 'desired_results' below?
desired_results = [
    {
        'pk': 1,
        'name': 'Joe',
        'permissions': ['user', 'admin'],
        'addresses': ['home', 'work']
    },
    {
        'pk': 2,
        'name': 'John',
        'permissions': ['user'],
        'addresses': ['home']
    },
]
Additional information required: Small list of dictionaries describing the 'labels' I would like to use in the desired_results for each of the fields that have many-to-many relationships.
relationships = [
    {'label': 'permissions', 'back_populates': 'permission'},
    {'label': 'addresses', 'back_populates': 'address'},
]
Final consideration: I've put together a concrete example for the purposes of this question, but more generally I'm trying to solve the problem of querying SQL databases with an arbitrary number of relationships. SQLAlchemy ORM solves this problem well, but I'm limited to using SQLAlchemy Core, so I am trying to build my own solution.
Update
Here's an answer, but I'm not sure it's the best / most efficient solution. Can anyone come up with something better?
# step 1: generate set of keys that will be replaced by new keys in desired_result
back_populates = set(rel['back_populates'] for rel in relationships)

# step 2: delete from results keys generated in step 1
intermediate_results = [
    {k: v for k, v in res.items() if k not in back_populates}
    for res in results]

# step 3: eliminate duplicates
intermediate_results = [
    dict(t)
    for t in set([tuple(ires.items())
                  for ires in intermediate_results])]

# step 4: add back information from deleted fields but in desired form
for ires in intermediate_results:
    for rel in relationships:
        ires[rel['label']] = set([
            res[rel['back_populates']]
            for res in results
            if res['pk'] == ires['pk']])

# done
desired_results = intermediate_results
Iterating over the groups of partial entries looks like a job for itertools.groupby.
But first, let's put relationships into a format that is easier to use, perhaps a back_populates:label dictionary?
conversions = {d["back_populates"]:d['label'] for d in relationships}
Next because we will be using itertools.groupby it will need a keyfunc to distinguish between the different groups of entries.
So given one entry from the initial results, this function will return a dictionary with only the pairs that will not be condensed/converted
def grouper(entry):
    # each group is identified by all key:values that are not identified in conversions
    return {k: v for k, v in entry.items() if k not in conversions}
Now we will be able to traverse the results in groups something like this:
for base_info, group in itertools.groupby(old_results, grouper):
    # base_info is a dict with info unique to all entries in group
    for partial in group:
        # partial is one entry from results that will contribute to the final result
        # but wait, what do we add it to?
The only issue is that if we build our entry from base_info it will confuse groupby so we need to make an entry to work with:
entry = {new_field:set() for new_field in conversions.values()}
entry.update(base_info)
Note that I am using sets here because they are the natural container when all contents are unique; however, since sets are not JSON-compatible, we will need to change them into lists at the end.
Now that we have an entry to build, we can just iterate through the group and add each original field's value to the corresponding new field:
for partial in group:
    for original, new in conversions.items():
        entry[new].add(partial[original])
Then, once the final entry is constructed, all that is left is to convert the sets back into lists:
for new in conversions.values():
    entry[new] = list(entry[new])
And that entry is done. We could append it to a list called new_results, but since this process essentially generates the results one by one, it makes more sense to write it as a generator,
making the final code look something like this:
import itertools

results = [
    {'pk': 1, 'name': 'Joe', 'permission': 'user', 'address': 'home'},
    {'pk': 1, 'name': 'Joe', 'permission': 'user', 'address': 'work'},
    {'pk': 1, 'name': 'Joe', 'permission': 'admin', 'address': 'home'},
    {'pk': 1, 'name': 'Joe', 'permission': 'admin', 'address': 'work'},
    {'pk': 2, 'name': 'John', 'permission': 'user', 'address': 'home'},
]

relationships = [
    {'label': 'permissions', 'back_populates': 'permission'},
    {'label': 'addresses', 'back_populates': 'address'},
]

# first we put the "relationships" in a format that is much easier to use.
conversions = {d["back_populates"]: d['label'] for d in relationships}

def grouper(entry):
    # each group is identified by all key:values that are not identified in conversions
    return {k: v for k, v in entry.items() if k not in conversions}

def parse_results(old_results, conversions=conversions):
    for base_info, group in itertools.groupby(old_results, grouper):
        entry = {new_field: set() for new_field in conversions.values()}
        entry.update(base_info)
        for partial in group:  # for each entry in the original results set
            for original, new in conversions.items():  # for each field that will be condensed
                entry[new].add(partial[original])
        # convert sets back to lists so it can be put back into json
        for new in conversions.values():
            entry[new] = list(entry[new])
        yield entry
Then the new_results can be gotten like this:
>>> new_results = list(parse_results(results))
>>> from pprint import pprint  # for demo purpose
>>> pprint(new_results, width=50)
[{'addresses': ['home', 'work'],
  'name': 'Joe',
  'permissions': ['admin', 'user'],
  'pk': 1},
 {'addresses': ['home'],
  'name': 'John',
  'permissions': ['user'],
  'pk': 2}]
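One caveat worth noting: itertools.groupby only merges adjacent rows, so if the query results are not already ordered by the grouping fields, sorting first keeps each entity in a single group (a sketch; sorting by 'pk' is an assumption about the data):

rows = sorted(results, key=lambda r: r['pk'])
new_results = list(parse_results(rows))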
How do I merge the JSON data rows as shown below using the merge function below with pyspark?
Note: Assume this is just a minimal example and I have thousands of rows of data to merge. What is the most performant solution? For better or for worse, I must use pyspark.
Input:
data = [
    {'timestamp': '20080411204445', 'address': '100 Sunder Ct', 'name': 'Joe Schmoe'},
    {'timestamp': '20040218165319', 'address': '100 Lee Ave', 'name': 'Joe Schmoe'},
    {'timestamp': '20120309173318', 'address': '1818 Westminster', 'name': 'John Doe'},
    ... More ...
]
Desired Output:
combined_result = [
    {'name': 'Joe Schmoe', 'addresses': [('20080411204445', '100 Sunder Ct'), ('20040218165319', '100 Lee Ave')]},
    {'name': 'John Doe', 'addresses': [('20120309173318', '1818 Westminster')]},
    ... More ...
]
Merge function:
def reduce_on_name(a, b):
    '''Combines two JSON data rows based on name'''
    merged = {}
    if a['name'] == b['name']:
        addresses = (a['timestamp'], a['address']), (b['timestamp'], b['address'])
        merged['name'] = a['name']
        merged['addresses'] = addresses
    return merged
I think it would be something like this:
sc.parallelize(data).groupBy(lambda x: x['name']).map(lambda t: {'name':t[0],'addresses':[(x['timestamp'], x['address']) for x in t[1]]}).collect()
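Split over a few lines for readability, the same pipeline might look like this (a sketch; the behaviour should be unchanged):

rdd = sc.parallelize(data)
grouped = rdd.groupBy(lambda x: x['name'])  # key rows by name
merged = grouped.map(lambda t: {
    'name': t[0],
    'addresses': [(x['timestamp'], x['address']) for x in t[1]],
})
combined_result = merged.collect()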
All right, using maxymoo's example, I put together my own reusable code. It's not exactly what I was looking for, but it gets me closer to how I want to solve this particular problem: without lambdas and with reusable code.
#!/usr/bin/env pyspark
# -*- coding: utf-8 -*-

data = [
    {'timestamp': '20080411204445', 'address': '100 Sunder Ct', 'name': 'Joe Schmoe'},
    {'timestamp': '20040218165319', 'address': '100 Lee Ave', 'name': 'Joe Schmoe'},
    {'timestamp': '20120309173318', 'address': '1818 Westminster', 'name': 'John Doe'},
]

def combine(field):
    '''Returns a function which reduces on a specific field

    Args:
        field(str): data field to use for merging

    Returns:
        func: returns a function which supplies the data for the field
    '''
    def _reduce_this(data):
        '''Returns the field value using data'''
        return data[field]
    return _reduce_this

def aggregate(*fields):
    '''Merges data based on a list of fields

    Args:
        fields(list): a list of fields that should be used as a composite key

    Returns:
        func: a function which does the aggregation
    '''
    def _merge_this(iterable):
        name, iterable = iterable
        new_map = dict(name=name, window=dict(max=None, min=None))
        for data in iterable:
            for field, value in data.iteritems():
                if field in fields:
                    new_map[field] = value
                else:
                    new_map.setdefault(field, set()).add(value)
        return new_map
    return _merge_this

# sc provided by pyspark context
combined = sc.parallelize(data).groupBy(combine('name'))
reduced = combined.map(aggregate('name'))
output = reduced.collect()