Merge lists of complex dicts with arbitrary keys

Merge lists of complex dicts with arbitrary keys - python

I have this code:
dotteds = ["apple.orange.banana", "a.b.c", "a.b.d"]
name = "name"
avtype = "type"
fields = "fields"

# Build one nested Avro-style entry per dotted path. Paths are NOT merged:
# two paths sharing a prefix (e.g. "a.b.c" and "a.b.d") still produce two
# separate top-level entries.
main_dictionary_list = []
for x in dotteds:
    split_name = x.split('.')
    # The final component is always a plain string leaf.
    node = {name: split_name[-1], avtype: 'string'}
    # Wrap the leaf in one "record" per remaining component, innermost first.
    # A path without dots has no remaining components and stays a bare leaf.
    for y in reversed(split_name[:-1]):
        node = {name: y, avtype: {name: y, avtype: "record", fields: [node]}}
    main_dictionary_list.append(node)
print(main_dictionary_list)
Which gives me an output like this:
[{
'name': 'apple',
'type': {
'name': 'apple',
'type': 'record',
'fields': [{
'name': 'orange',
'type': {
'name': 'orange',
'type': 'record',
'fields': [{
'name': 'banana',
'type': 'string'
}
]
}
}
]
}
}, {
'name': 'a',
'type': {
'name': 'a',
'type': 'record',
'fields': [{
'name': 'b',
'type': {
'name': 'b',
'type': 'record',
'fields': [{
'name': 'c',
'type': 'string'
}
]
}
}
]
}
}, {
'name': 'a',
'type': {
'name': 'a',
'type': 'record',
'fields': [{
'name': 'b',
'type': {
'name': 'b',
'type': 'record',
'fields': [{
'name': 'd',
'type': 'string'
}
]
}
}
]
}
}
]
Ideally I need:
[{
'name': 'apple',
'type': {
'name': 'apple',
'type': 'record',
'fields': [{
'name': 'orange',
'type': {
'name': 'orange',
'type': 'record',
'fields': [{
'name': 'banana',
'type': 'string'
}
]
}
}
]
}
}, {
'name': 'a',
'type': {
'name': 'a',
'type': 'record',
'fields': [{
'name': 'b',
'type': {
'name': 'b',
'type': 'record',
'fields': [{
'name': 'c',
'type': 'string'
},
{
'name': 'd',
'type': 'string'
}
]
}
}
]
}
}
]
I need to be able to do this with any number of combinations:
apple.orange.banana, a.b.c, a.b.d, a.b.q.e.a.s.d, etc.
I cannot figure out how to combine the entries that share the same 'name' values. The result is intended to be in Avro schema format.
I have also tried making the dotted values into a dictionary which is a bit of trouble on its own.

You can use recursion with collections.defaultdict:
from collections import defaultdict
def group(vals, last=None):
    """Merge split dotted paths that share leading components.

    Args:
        vals: list of paths, each path being a list of name components.
        last: name of the component the paths in ``vals`` hang under;
            ``None`` for the outermost call.

    Returns:
        A list of dicts. When any path in ``vals`` has exactly one component
        left, the list holds a single entry named ``last`` whose ``'type'`` is
        a record with one field per path. Otherwise paths are bucketed by
        their first component and one entry is produced per bucket.
    """
    if any(len(path) == 1 for path in vals):
        members = []
        for path in vals:
            if len(path) == 1:
                # One component left: a plain string leaf.
                members.append({'name': path[0], 'type': 'string'})
            else:
                # Longer path: recurse on it alone and take its single entry.
                members.append(group([path], path[0])[0])
        record = {'name': last, 'type': 'record', 'fields': members}
        return [{'name': last, 'type': record}]

    # Bucket the remaining tails of the paths by their first component.
    buckets = defaultdict(list)
    for path in vals:
        buckets[path[0]].append(path[1:])

    grouped = []
    for head, tails in buckets.items():
        if last is None:
            # Outermost level: the recursive result (a list) becomes 'type'.
            grouped.append({'name': head, 'type': group(tails, last=head)})
        else:
            grouped.append(
                {'name': last, 'type': 'record', 'fields': group(tails, last=head)}
            )
    return grouped
import json
# Split each dotted name into its components, merge them with group(),
# and pretty-print the result as JSON.
vals = ['apple.orange.banana', 'a.b.c', 'a.b.d']
print(json.dumps(group([i.split('.') for i in vals]), indent=4))
Output:
[
{
"name": "apple",
"type": [
{
"name": "apple",
"type": "record",
"fields": [
{
"name": "orange",
"type": {
"name": "orange",
"type": "record",
"fields": [
{
"name": "banana",
"type": "string"
}
]
}
}
]
}
]
},
{
"name": "a",
"type": [
{
"name": "a",
"type": "record",
"fields": [
{
"name": "b",
"type": {
"name": "b",
"type": "record",
"fields": [
{
"name": "c",
"type": "string"
},
{
"name": "d",
"type": "string"
}
]
}
}
]
}
]
}
]
# Second example: paths of different depths under the same first component.
vals = ['asd.2', 'asd.3', 'asd.5.3.4']
print(json.dumps(group([i.split('.') for i in vals]), indent=4))
Output:
[
{
"name": "asd",
"type": [
{
"name": "asd",
"type": {
"name": "asd",
"type": "record",
"fields": [
{
"name": "2",
"type": "string"
},
{
"name": "3",
"type": "string"
},
{
"name": "5",
"type": "record",
"fields": [
{
"name": "5",
"type": "record",
"fields": [
{
"name": "3",
"type": {
"name": "3",
"type": "record",
"fields": [
{
"name": "4",
"type": "string"
}
]
}
}
]
}
]
}
]
}
}
]
}
]

Related

How to merge two DSL query for aggregation and filter

I need to search for documents whose BusinessArea is Research or Accounting; this is an array of values combined with OR.
I need to search for documents whose Role is Developer or Tester; this is also an array of values combined with OR.
I want to get the count of masterid for each name in BusinessArea, Designation, and Role.
The name filter is "Group1".
Below is the dictionary
# Sample documents. The items of each 'Designation' list must be separated by
# a comma: adjacent string literals ('L1' 'L2') are implicitly concatenated
# into the single string 'L1L2'.
test = [
    {'masterid': '1', 'name': 'Group1',
     'BusinessArea': ['Accounting', 'Research'],
     'Designation': ['L1', 'L2']},
    {'masterid': '2', 'name': 'Group1',
     'BusinessArea': ['Research', 'Accounting'],
     'Role': [{'id': '5032', 'name': 'Tester'},
              {'id': '5033', 'name': 'Developer'}],
     'Designation': ['L1', 'L2']},
    {'masterid': '3', 'name': 'Group1',
     'BusinessArea': ['Engineering'],
     'Role': [{'id': '5032', 'name': 'Developer'},
              {'id': '5033', 'name': 'Developer', 'parentname': ''}],
     'Designation': ['L1', 'L2']},
]
Below is the aggregation function
{
"size": 0,
"aggs": {
"countNames": {
"terms": {
"field": "BusinessArea.keyword"
}
},
"designationNames": {
"terms": {
"field": "Designation.keyword"
}
},
"Role": {
"terms": {
"field": "Role.name.keyword"
}
}
}
}
Below is the filtering query
{
"query": {
"bool": {
"must": [
{
"terms": {
"BusinessArea.keyword": [
"Research",
"Accounting"
]
}
},
{
"terms": {
"Role.name.keyword": [
"Developer",
"Tester"
]
}
}
]
}
}
}
"filter": [
"term": {
"name.keyword": "Group1"}]
I need to merge both queries so that the output contains the results of both.

Nice start !!! Now you can simply combine all those snippets like this:
{
"size": 0,
"query": {
"bool": {
"filter": [
{
"term": {
"name.keyword": "Group1"
}
},
{
"terms": {
"BusinessArea.keyword": [
"Research",
"Accounting"
]
}
},
{
"terms": {
"Role.name.keyword": [
"Developer",
"Tester"
]
}
}
]
}
},
"aggs": {
"countNames": {
"terms": {
"field": "BusinessArea.keyword"
}
},
"designationNames": {
"terms": {
"field": "Designation.keyword"
}
},
"Role": {
"terms": {
"field": "Role.name.keyword"
}
}
}
}

How to get the individual count of field from Elasticsearch

My content inside a dictionary is below
# Sample documents. The opening bracket has to sit on the same line as the
# assignment: a bare `test=` followed by a line break is a syntax error.
test = [
    {'masterid': '1', 'name': 'Group1',
     'BusinessArea': [{'id': '14', 'name': 'Accounting', 'parentname': 'Finance'},
                      {'id': '3', 'name': 'Research', 'parentname': 'R & D'}],
     'Designation': [{'id': '16', 'name': 'L1'},
                     {'id': '20', 'name': 'L2'},
                     {'id': '25', 'name': 'L2'}]},
    {'masterid': '2', 'name': 'Group1',
     'BusinessArea': [{'id': '14', 'name': 'Research', 'parentname': ''},
                      {'id': '3', 'name': 'Accounting', 'parentname': ''}],
     'Role': [{'id': '5032', 'name': 'Tester'},
              {'id': '5033', 'name': 'Developer'}],
     'Designation': [{'id': '16', 'name': 'L1'},
                     {'id': '20', 'name': 'L2'},
                     {'id': '25', 'name': 'L2'}]},
    {'masterid': '3', 'name': 'Group1',
     'BusinessArea': [{'id': '14', 'name': 'Engineering'},
                      {'id': '3', 'name': 'Engineering', 'parentname': ''}],
     'Role': [{'id': '5032', 'name': 'Developer'},
              {'id': '5033', 'name': 'Developer', 'parentname': ''}],
     'Designation': [{'id': '16', 'name': 'L1'},
                     {'id': '20', 'name': 'L2'},
                     {'id': '25', 'name': 'L2'}]},
]
Code is below to put into elastic search index
from elasticsearch import Elasticsearch
# Create a client with default arguments and create the target index.
es = Elasticsearch()
es.indices.create(index='new')
# The sample documents have no top-level 'id' key, so e['id'] raises KeyError;
# their identifier is stored under 'masterid', which is used as the document id.
for e in test:
    es.index(index="new", body=e, id=e['masterid'])
I want to get the count of masterid for each BusinessArea name.
Here the names are Accounting, Research, and Engineering.
[ {
"name": "BusinessArea",
"values": [
{
"name": "Accounting",
"count": "2"
},
{
"name": "Research",
"count": "2"
},
{
"name": "Engineering",
"count": "1"
}]
}]
or can i have answer like below
{
"A": {
"Designation": [{
"key": "L1",
"doc_count": 3
},
{
"key": "L2",
"doc_count": 3
}
]
},
{
"B": {
"BusinessArea": [{
"key": "Accounting",
"doc_count": 2
},
{
"key": "Research",
"doc_count": 2
},
{
"key": "Engineering",
"doc_count": 1
}
]
}
}

If you want to get the individual count of the field you can use the terms aggregation that is a multi-bucket value source-based aggregation where buckets are dynamically built - one per unique value.
Search Query:
{
"size":0,
"aggs": {
"countNames": {
"terms": {
"field": "BusinessArea.name.keyword"
}
}
}
}
Search Result:
"aggregations": {
"countNames": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Accounting",
"doc_count": 2
},
{
"key": "Research",
"doc_count": 2
},
{
"key": "Engineering",
"doc_count": 1
}
]
}
Update 1:
If you want to have an individual count of the field for Designation as well as BusinessArea
Search Query:
{
"size": 0,
"aggs": {
"countNames": {
"terms": {
"field": "BusinessArea.name.keyword"
}
},
"designationNames": {
"terms": {
"field": "Designation.name.keyword"
}
}
}
}
Search Result:
"aggregations": {
"designationNames": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "L1",
"doc_count": 3
},
{
"key": "L2",
"doc_count": 3
}
]
},
"countNames": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Accounting",
"doc_count": 2
},
{
"key": "Research",
"doc_count": 2
},
{
"key": "Engineering",
"doc_count": 1
}
]
}

You can simply use the count API of elasticsearch to get the count of All the documents in the elasticsearch index or based on a condition as shown in the same doc.
For your case, it should be like
GET /<your-index-name>/_count?q=name:BusinessArea
Or, if masterid is the Unique-id in your document, you can simply use
GET /<your-index-name>/_count

merge complex list of nested dicts

I'm trying to merge nested Dicts in a list based on "name" like the following:
[
{
"name": "abc",
"metadata": [
{
"name": "foo",
"data": [
{
"version": "1.0"
}
]
},
{
"name": "foo",
"data": [
{
"version": "2.0"
}
]
},
{
"name": "bar",
"data": [
{
"version": "1.0"
}
]
}
]
},
{
"name": "xyz",
"metadata": [
{
"name": "bob",
"data": [
{
"version": "3.2"
}
]
},
{
"name": "alice",
"data": [
{
"version": "2.2"
}
]
}
]
},
{
"name": "xyz",
"metadata": [
{
"name": "mike",
"data": [
{
"version": "3.2"
}
]
},
{
"name": "alice",
"data": [
{
"version": "2.2"
}
]
}
]
}
]
Considering that the merged items should not have duplicates in the metadata, how can I do that in Python? Metadata entries should be unique, if name+data+version exist in the metadata, then the item should not be merged.
my desired output should look like this
[
{
"name": "abc",
"metadata": [
{
"name": "foo",
"data": [
{
"version": "1.0"
}
]
},
{
"name": "foo",
"data": [
{
"version": "2.0"
}
]
},
{
"name": "bar",
"data": [
{
"version": "1.0"
}
]
}
]
},
{
"name": "xyz",
"metadata": [
{
"name": "bob",
"data": [
{
"version": "3.2"
}
]
},
{
"name": "mike",
"data": [
{
"version": "3.2"
}
]
},
{
"name": "alice",
"data": [
{
"version": "2.2"
}
]
}
]
}
]

You can use itertools.groupby:
import itertools
d = [
    {'name': 'abc', 'metadata': [
        {'name': 'foo', 'data': [{'version': '1.0'}]},
        {'name': 'foo', 'data': [{'version': '2.0'}]},
        {'name': 'bar', 'data': [{'version': '1.0'}]},
    ]},
    {'name': 'xyz', 'metadata': [
        {'name': 'bob', 'data': [{'version': '3.2'}]},
        {'name': 'alice', 'data': [{'version': '2.2'}]},
    ]},
    {'name': 'xyz', 'metadata': [
        {'name': 'mike', 'data': [{'version': '3.2'}]},
        {'name': 'alice', 'data': [{'version': '2.2'}]},
    ]},
]


def _name_of(record):
    """Return the key records are grouped by."""
    return record['name']


def _without_repeats(entries):
    """Return entries minus any that equal an earlier one, order preserved."""
    kept = []
    for entry in entries:
        if entry not in kept:
            kept.append(entry)
    return kept


# groupby only merges adjacent items, hence the sort on the same key first.
_ordered = sorted(d, key=_name_of)
new_d = [[key, list(members)]
         for key, members in itertools.groupby(_ordered, key=_name_of)]

# Concatenate the metadata lists of all records that share a name.
result = [
    {'name': key,
     'metadata': list(itertools.chain.from_iterable(m['metadata'] for m in members))}
    for key, members in new_d
]

# Dicts are unhashable, so duplicates are removed by equality comparison.
final_result = [
    {**merged, 'metadata': _without_repeats(merged['metadata'])}
    for merged in result
]

import json
print(json.dumps(final_result, indent=4))
Output:
[
{
"name": "abc",
"metadata": [
{
"name": "foo",
"data": [
{
"version": "1.0"
}
]
},
{
"name": "foo",
"data": [
{
"version": "2.0"
}
]
},
{
"name": "bar",
"data": [
{
"version": "1.0"
}
]
}
]
},
{
"name": "xyz",
"metadata": [
{
"name": "bob",
"data": [
{
"version": "3.2"
}
]
},
{
"name": "alice",
"data": [
{
"version": "2.2"
}
]
},
{
"name": "mike",
"data": [
{
"version": "3.2"
}
]
}
]
}
]

Adding new pairs to a json file

I have a json file I need to add pairs to, I convert it into a dict, but now I need to put my new values in a specific place.
This is some of the json file I convert:
"rootObject": {
"id": "6ff0010c-00fe-485b-b695-4ddd6aca4dcd",
"type": "IDO_GEAR",
"children": [
{
"id": "1dd94d1a-e52d-40b3-a82b-6db02a8fbbab",
"type": "IDO_SYSTEM_LOADCASE",
"children": [],
"childList": "SYSTEMLOADCASE",
"properties": [
{
"name": "IDCO_IDENTIFICATION",
"value": "1dd94d1a-e52d-40b3-a82b-6db02a8fbbab"
},
{
"name": "IDCO_DESIGNATION",
"value": "Lastfall 1"
},
{
"name": "IDSLC_TIME_PORTION",
"value": 100
},
{
"name": "IDSLC_DISTANCE_PORTION",
"value": 100
},
{
"name": "IDSLC_OPERATING_TIME_IN_HOURS",
"value": 1
},
{
"name": "IDSLC_OPERATING_TIME_IN_SECONDS",
"value": 3600
},
{
"name": "IDSLC_OPERATING_REVOLUTIONS",
"value": 1
},
{
"name": "IDSLC_OPERATING_DISTANCE",
"value": 1
},
{
"name": "IDSLC_ACCELERATION",
"value": 9.81
},
{
"name": "IDSLC_EPSILON_X",
"value": 0
},
{
"name": "IDSLC_EPSILON_Y",
"value": 0
},
{
"name": "IDSLC_EPSILON_Z",
"value": 0
},
{
"name": "IDSLC_CALCULATION_WITH_OWN_WEIGHT",
"value": "CO_CALCULATION_WITHOUT_OWN_WEIGHT"
},
{
"name": "IDSLC_CALCULATION_WITH_TEMPERATURE",
"value": "CO_CALCULATION_WITH_TEMPERATURE"
},
{
"name": "IDSLC_FLAG_FOR_LOADCASE_CALCULATION",
"value": "LB_CALCULATE_LOADCASE"
},
{
"name": "IDSLC_STATUS_OF_LOADCASE_CALCULATION",
"value": false
}
I want to add something like ENTRY_ONE and ENTRY_TWO, like this:
"rootObject": {
"id": "6ff0010c-00fe-485b-b695-4ddd6aca4dcd",
"type": "IDO_GEAR",
"children": [
{
"id": "1dd94d1a-e52d-40b3-a82b-6db02a8fbbab",
"type": "IDO_SYSTEM_LOADCASE",
"children": [],
"childList": "SYSTEMLOADCASE",
"properties": [
{
"name": "IDCO_IDENTIFICATION",
"value": "1dd94d1a-e52d-40b3-a82b-6db02a8fbbab"
},
{
"name": "IDCO_DESIGNATION",
"value": "Lastfall 1"
},
{
"name": "IDSLC_TIME_PORTION",
"value": 100
},
{
"name": "IDSLC_DISTANCE_PORTION",
"value": 100
},
{
"name": "ENTRY_ONE",
"value": 100
},
{
"name": "ENTRY_TWO",
"value": 55
},
{
"name": "IDSLC_OPERATING_TIME_IN_HOURS",
"value": 1
},
{
"name": "IDSLC_OPERATING_TIME_IN_SECONDS",
"value": 3600
},
{
"name": "IDSLC_OPERATING_REVOLUTIONS",
"value": 1
},
{
"name": "IDSLC_OPERATING_DISTANCE",
"value": 1
},
{
"name": "IDSLC_ACCELERATION",
"value": 9.81
},
{
"name": "IDSLC_EPSILON_X",
"value": 0
},
{
"name": "IDSLC_EPSILON_Y",
"value": 0
},
{
"name": "IDSLC_EPSILON_Z",
"value": 0
},
{
"name": "IDSLC_CALCULATION_WITH_OWN_WEIGHT",
"value": "CO_CALCULATION_WITHOUT_OWN_WEIGHT"
},
{
"name": "IDSLC_CALCULATION_WITH_TEMPERATURE",
"value": "CO_CALCULATION_WITH_TEMPERATURE"
},
{
"name": "IDSLC_FLAG_FOR_LOADCASE_CALCULATION",
"value": "LB_CALCULATE_LOADCASE"
},
{
"name": "IDSLC_STATUS_OF_LOADCASE_CALCULATION",
"value": false
}
How can I add the entries so that they are under the IDCO_IDENTIFICATION tag, and not under the rootObject?

The way I see your json file, it WOULD be under rootObject as EVERYTHING is under that key. There's quite a few closing brackets and braces missing.
So I can only assume you are meaning you want it directly under IDCO_IDENTIFICATION (which is nested under rootObject). But that doesn't match what you have as your example output either. You have the new ENTRY_ONE and ENTRY_TWO within the properties, within the children, within the rootObject, not "under" IDCO_IDENTIFICATION. So I'm going to follow what you are asking for from your example output.
import json
# Load the document, add the new name/value pairs to the end of the first
# child's "properties" list, then overwrite the same file with the result.
with open('C:/test.json') as src:
    data = json.load(src)

new_elements = [
    {"name": "ENTRY_ONE", "value": 100},
    {"name": "ENTRY_TWO", "value": 55},
]
data['rootObject']['children'][0]['properties'].extend(new_elements)

with open('C:/test.json', 'w') as dst:
    json.dump(data, dst)
Output:
import pprint
# Pretty-print the in-memory `data` structure.
pprint.pprint(data)
{'rootObject': {'children': [{'childList': 'SYSTEMLOADCASE',
'children': [],
'id': '1dd94d1a-e52d-40b3-a82b-6db02a8fbbab',
'properties': [{'name': 'IDCO_IDENTIFICATION',
'value': '1dd94d1a-e52d-40b3-a82b-6db02a8fbbab'},
{'name': 'IDCO_DESIGNATION',
'value': 'Lastfall 1'},
{'name': 'IDSLC_TIME_PORTION',
'value': 100},
{'name': 'IDSLC_DISTANCE_PORTION',
'value': 100},
{'name': 'IDSLC_OPERATING_TIME_IN_HOURS',
'value': 1},
{'name': 'IDSLC_OPERATING_TIME_IN_SECONDS',
'value': 3600},
{'name': 'IDSLC_OPERATING_REVOLUTIONS',
'value': 1},
{'name': 'IDSLC_OPERATING_DISTANCE',
'value': 1},
{'name': 'IDSLC_ACCELERATION',
'value': 9.81},
{'name': 'IDSLC_EPSILON_X',
'value': 0},
{'name': 'IDSLC_EPSILON_Y',
'value': 0},
{'name': 'IDSLC_EPSILON_Z',
'value': 0},
{'name': 'IDSLC_CALCULATION_WITH_OWN_WEIGHT',
'value': 'CO_CALCULATION_WITHOUT_OWN_WEIGHT'},
{'name': 'IDSLC_CALCULATION_WITH_TEMPERATURE',
'value': 'CO_CALCULATION_WITH_TEMPERATURE'},
{'name': 'IDSLC_FLAG_FOR_LOADCASE_CALCULATION',
'value': 'LB_CALCULATE_LOADCASE'},
{'name': 'IDSLC_STATUS_OF_LOADCASE_CALCULATION',
'value': False},
{'name': 'ENTRY_ONE',
'value': 100},
{'name': 'ENTRY_TWO',
'value': 55}],
'type': 'IDO_SYSTEM_LOADCASE'}],
'id': '6ff0010c-00fe-485b-b695-4ddd6aca4dcd',
'type': 'IDO_GEAR'}}

Python - Adding fields and labels to nested json file

I have a dataframe as follows:
Name_ID | URL | Count | Rating
------------------------------------------------
ABC | www.example.com/ABC | 10 | 5
123 | www.example.com/123 | 9 | 4
XYZ | www.example.com/XYZ | 5 | 2
ABC111 | www.example.com/ABC111 | 5 | 2
ABC121 | www.example.com/ABC121 | 5 | 2
222 | www.example.com/222 | 5 | 3
abc222 | www.example.com/abc222 | 4 | 2
ABCaaa | www.example.com/ABCaaa | 4 | 2
I am trying to create a JSON as follows:
{
"name": "sampledata",
"children": [
{
"name": 9,
"children": [
{
"name": 4,
"children": [
{
"name": "123",
"size": 100
}
]
}
]
},
{
"name": 10,
"children": [
{
"name": 5,
"children": [
{
"name": "ABC",
"size": 100
}
]
}
]
},
{
"name": 4,
"children": [
{
"name": 2,
"children": [
{
"name": "abc222",
"size": 50
},
{
"name": "ABCaaa",
"size": 50
}
]
}
]
},
{
"name": 5,
"children": [
{
"name": 2,
"children": [
{
"name": "ABC",
"size": 16
},
{
"name": "ABC111",
"size": 16
},
{
"name": "ABC121",
"size": 16
}
]
},
{
"name": 3,
"children": [
{
"name": "222",
"size": 50
}
]
}
]
}
]
}
In order to do that:
I am trying to add labels such as "name" and "children" to the json while creating it.
I tried something like
results = [{"name": i, "children": j} for i,j in results.items()]
But it won't label it properly I believe.
Also, add another field with the label "size", which I am planning to calculate based on the formula:
(Rating*Count*10000)/number_of_children_to_the_immediate_parent
Here is my dirty code:
import pandas as pd
from collections import defaultdict
import json
# Rows of (Name, URL, Count, Rating).
data =[('ABC', 'www.example.com/ABC', 10 , 5), ('123', 'www.example.com/123', 9, 4), ('XYZ', 'www.example.com/XYZ', 5, 2), ('ABC111', 'www.example.com/ABC111', 5, 2), ('ABC121', 'www.example.com/ABC121', 5, 2), ('222', 'www.example.com/222', 5, 3), ('abc222', 'www.example.com/abc222', 4, 2), ('ABCaaa', 'www.example.com/ABCaaa', 4, 2)]
df = pd.DataFrame(data, columns=['Name', 'URL', 'Count', 'Rating'])
# Outer grouping: one tree node per distinct Count.
gp = df.groupby(['Count'])
dict_json = {"name": "flare"}
children = []
for name, group in gp:
    # Node for this Count value; its children are filled in per Rating below.
    temp = {}
    temp["name"] = name
    temp["children"] = []
    # Inner grouping: one node per distinct Rating within this Count.
    rgp = group.groupby(['Rating'])
    for n, g in rgp:
        temp2 = {}
        temp2["name"] = n
        # One dict per row of the group, keyed by column name (plus "index"
        # added by reset_index).
        temp2["children"] = g.reset_index().T.to_dict().values()
        for t in temp2["children"]:
            # size = Rating * Count * 10000 divided by the number of siblings.
            t["size"] = (t["Rating"] * t["Count"] * 10000) / len(temp2["children"])
            t["name"] = t["Name"]
            # Drop the raw columns so only "name" and "size" remain.
            del t["Count"]
            del t["Rating"]
            del t["URL"]
            del t["Name"]
            del t["index"]
        temp["children"].append(temp2)
    children.append(temp)
dict_json["children"] = children
# NOTE(review): `print` is used as a statement here, which is Python 2 syntax.
print json.dumps(dict_json, indent=4)
Though the above code does print what I need, I am looking for more efficient and cleaner way to do the same, mainly because the actual dataset might be even more nested and complicated. Any help/suggestion will be much appreciated.

Quite an interesting problem and a great question!
You can improve your approach by reorganizing the code inside the loops and using list comprehensions. No need to delete things and introduce temp variables inside loops:
# Build the tree root -> Count -> Rating -> rows from the grouped frame `gp`.
dict_json = {"name": "flare"}
children = []
for name, group in gp:
    rating_nodes = []
    for n, g in group.groupby(['Rating']):
        # One leaf per row; size is shared out over the rows of this group.
        leaves = [
            {"name": row["Name"],
             "size": row["Rating"] * row["Count"] * 10000 / len(g)}
            for _, row in g.iterrows()
        ]
        rating_nodes.append({"name": n, "children": leaves})
    children.append({"name": name, "children": rating_nodes})
dict_json["children"] = children
Or, a "wrapped" version:
def _leaf_nodes(g):
    """Return one {"name", "size"} dict per row of the rating group g."""
    return [
        {"name": row["Name"],
         "size": row["Rating"] * row["Count"] * 10000 / len(g)}
        for _, row in g.iterrows()
    ]


def _rating_nodes(group):
    """Return one {"name", "children"} dict per distinct Rating in group."""
    return [
        {"name": n, "children": _leaf_nodes(g)}
        for n, g in group.groupby(['Rating'])
    ]


# Same tree as the loop version, assembled in a single expression.
dict_json = {
    "name": "flare",
    "children": [
        {"name": name, "children": _rating_nodes(group)}
        for name, group in gp
    ],
}
I'm getting the following dictionary printed for your sample input dataframe:
{
"name": "flare",
"children": [
{
"name": 4,
"children": [
{
"name": 2,
"children": [
{
"name": "abc222",
"size": 40000
},
{
"name": "ABCaaa",
"size": 40000
}
]
}
]
},
{
"name": 5,
"children": [
{
"name": 2,
"children": [
{
"name": "XYZ",
"size": 33333
},
{
"name": "ABC111",
"size": 33333
},
{
"name": "ABC121",
"size": 33333
}
]
},
{
"name": 3,
"children": [
{
"name": "222",
"size": 150000
}
]
}
]
},
{
"name": 9,
"children": [
{
"name": 4,
"children": [
{
"name": "123",
"size": 360000
}
]
}
]
},
{
"name": 10,
"children": [
{
"name": 5,
"children": [
{
"name": "ABC",
"size": 500000
}
]
}
]
}
]
}

If I understand correctly, what you want to do is put a groupby into a nested JSON. If that is the case, then you could use pandas groupby and cast it into a nested list of lists like so:
# Collect the Name_ID values of every (Count, Rating) group into a list, then
# flatten the grouped result into rows of [Count, Rating, [names...]].
names_by_group = df.groupby(['Count', 'Rating']).apply(lambda grp: list(grp['Name_ID']))
lol = pd.DataFrame(names_by_group).reset_index().values.tolist()
lol should look something like this:
[['10', '5', ['ABC']],
['4', '2', ['abc222', 'ABCaaa']],
['5', '2', ['XYZ ', 'ABC111', 'ABC121']],
['5', '3', ['222 ']],
['9', '4', ['123 ']]]
After that you could loop over lol to put it into a dict, but since you want to set nested items you'll have to use autovivification (check it out):
class autovividict(dict):
    """dict that creates a nested instance of its own type for missing keys."""

    def __missing__(self, key):
        # Called by dict.__getitem__ when key is absent: store and return a
        # fresh, empty mapping of the same class so d[a][b] = x just works.
        child = type(self)()
        self[key] = child
        return child
# Nest each row as first element -> second element -> third element.
d = autovividict()
for row in lol:
    outer_key, inner_key, names = row[0], row[1], row[2]
    d[outer_key][inner_key] = names
now you can use the json pack for printing and exporting:
# Call print as a function: the statement form (`print x`) is a SyntaxError on
# Python 3, while this single-argument call also works on Python 2.
print(json.dumps(d, indent=2))
In case you need more than one groupby, you could concat your groups with pandas, cast to lol, remove any nans, and then loop, let me know if a full example can help.

setup
from io import StringIO
import pandas as pd
# Sample data as inline CSV text, parsed into a DataFrame through an
# in-memory text buffer.
txt = """Name_ID,URL,Count,Rating
ABC,www.example.com/ABC,10,5
123,www.example.com/123,9,4
XYZ,www.example.com/XYZ,5,2
ABC111,www.example.com/ABC111,5,2
ABC121,www.example.com/ABC121,5,2
222,www.example.com/222,5,3
abc222,www.example.com/abc222,4,2
ABCaaa,www.example.com/ABCaaa,4,2"""
df = pd.read_csv(StringIO(txt))
size
pre-calculate it
# Number of non-null Name_ID values in each row's (Count, Rating) group,
# aligned to df's index.
group_sizes = df.groupby(['Count', 'Rating']).Name_ID.transform('count')
# Count * Rating * 10000, shared out over the group and cast to int.
scaled = df.Count.mul(df.Rating).mul(10000)
df['size'] = scaled.div(group_sizes).astype(int)
solution
create recursive function
def h(d):
    """Turn an indexed frame into nested name/children nodes, one level per index level.

    Args:
        d: DataFrame (or a single row as a Series) that has a 'size' column.

    Returns:
        A dict ``{'name': ..., 'size': ...}`` (both as strings) when the index
        has a single level with a single distinct value; otherwise a list of
        ``{'name': ..., 'children': ...}`` dicts, one per value of the first
        index level, with ``children`` built recursively.
    """
    # A single row arrives as a Series; turn it back into a one-row frame.
    if isinstance(d, pd.Series):
        d = d.to_frame().T
    index = d.index
    is_leaf = index.nlevels <= 1 and index.nunique() <= 1
    if is_leaf:
        return {'name': str(index[0]), 'size': str(d['size'].iloc[0])}
    nodes = []
    for n, g in d.groupby(level=0):
        # xs(n) selects this group's rows and drops the level just consumed.
        nodes.append(dict(name=str(n), children=h(g.xs(n))))
    return nodes
demo
import json
# Index the frame by (Count, Rating, Name_ID), pass it to h(), and wrap the
# result under a root node named 'flare'; then serialise it to a JSON string.
my_dict = dict(name='flare', children=h(df.set_index(['Count', 'Rating', 'Name_ID'])))
json.dumps(my_dict)
'{"name": "flare", "children": [{"name": "4", "children": [{"name": "2", "children": [{"name": "ABCaaa", "children": {"name": "ABCaaa", "size": "40000"}}, {"name": "abc222", "children": {"name": "abc222", "size": "40000"}}]}]}, {"name": "5", "children": [{"name": "2", "children": [{"name": "ABC111", "children": {"name": "ABC111", "size": "33333"}}, {"name": "ABC121", "children": {"name": "ABC121", "size": "33333"}}, {"name": "XYZ", "children": {"name": "XYZ", "size": "33333"}}]}, {"name": "3", "children": {"name": "222", "size": "150000"}}]}, {"name": "9", "children": [{"name": "4", "children": {"name": "123", "size": "360000"}}]}, {"name": "10", "children": [{"name": "5", "children": {"name": "ABC", "size": "500000"}}]}]}'
my_dict
{'children': [{'children': [{'children': [{'children': {'name': 'ABCaaa',
'size': '40000'},
'name': 'ABCaaa'},
{'children': {'name': 'abc222', 'size': '40000'}, 'name': 'abc222'}],
'name': '2'}],
'name': '4'},
{'children': [{'children': [{'children': {'name': 'ABC111', 'size': '33333'},
'name': 'ABC111'},
{'children': {'name': 'ABC121', 'size': '33333'}, 'name': 'ABC121'},
{'children': {'name': 'XYZ', 'size': '33333'}, 'name': 'XYZ'}],
'name': '2'},
{'children': {'name': '222', 'size': '150000'}, 'name': '3'}],
'name': '5'},
{'children': [{'children': {'name': '123', 'size': '360000'}, 'name': '4'}],
'name': '9'},
{'children': [{'children': {'name': 'ABC', 'size': '500000'}, 'name': '5'}],
'name': '10'}],
'name': 'flare'}

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Merge lists of complex dicts with arbitrary keys - python

Related

How to merge two DSL query for aggregation and filter

How to get the individual count of field from Elasticsearch

merge complex list of nested dicts

Adding new pairs to a json file

Python - Adding fields and labels to nested json file

Categories

Resources