How to convert the following data structure to a hierarchical data structure - python

There is an array, each element in the array is a node in the following figure, and there is a hierarchical relationship between the nodes, similar to the "tree" data structure (except that the child node can refer back to the parent node).
#The current data structure is in the following format
[
{
'id': 1,
'name': '开始',
'next': '2,3,4'
},
{
'id': 2,
'name': '2号',
'next': ''
},
{
'id': 3,
'name': '3号',
'next': '5,8'
},
{
'id': 4,
'name': '4号',
'next': '6'
},
{
'id': 5,
'name': '5号',
'next': '7'
},
{
'id': 6,
'name': '6号',
'next': ''
},
{
'id': 7,
'name': '7号',
'next': '1,3,5'
},
{
'id': 8,
'name': '8号',
'next': ''
}
]
Assuming the first element of the above array is always the root node, please write code that converts any data in the above format into the following hierarchical format.
#to convert
{
"id":1,
"name":"开始",
"backpoints":[ ],
"childs":[
{
"id":2,
"name":"2号",
"backpoints":[ ],
"childs":[ ]
},
{
"id":3,
"name":"3号",
"backpoints":[ ],
"childs":[
{
"id":5,
"name":"5号",
"backpoints":[ ],
"childs":[
{
"id":7,
"name":"7号",
"backpoints":[
"1",
"3",
"5"
],
"childs":[ ]
}
]
},
{
"id":8,
"name":"8号",
"backpoints":[ ],
"childs":[ ]
}
]
},
{
"id":4,
"name":"4号",
"backpoints":[ ],
"childs":[
{
"id":6,
"name":"6号",
"backpoints":[ ],
"childs":[ ]
}
]
}
]
}

You can iterate over the given list of dicts (named nodes in the following example) and use a dict that maps node IDs to node objects. For each ID listed under a node's next key: if the ID is not yet in the mapping dict, pre-create an empty entry for it and add that entry to the node's childs sub-list; otherwise the ID refers back to a node that has already been seen, so append it to the node's backpoints sub-list:
# Build the hierarchy in a single pass over `nodes` (a list of dicts with the
# keys 'id', 'name' and a comma-separated 'next' string of target IDs).
# `mapping` maps each node ID to its output dict; the root of the tree is
# mapping[nodes[0]['id']].
mapping = {}
for node in nodes:
    # Reuse the placeholder a parent may already have created for this ID so
    # that the parent's 'childs' list and `mapping` share the same object.
    entry = mapping.setdefault(node['id'], {})
    # Copy every field except 'next' instead of popping it: the input list is
    # left untouched, so the snippet can safely be run more than once.
    entry.update({key: value for key, value in node.items() if key != 'next'})
    entry.update(backpoints=[], childs=[])
    nexts = node.get('next')
    if not nexts:
        # A missing or empty 'next' means a leaf node.
        continue
    for n in map(int, nexts.split(',')):
        if n in mapping:
            # Already seen: this edge points back to an existing node.
            entry['backpoints'].append(str(n))
        else:
            # First sighting: create an empty placeholder now; it is filled
            # in when the loop reaches that node.
            entry['childs'].append(mapping.setdefault(n, {}))
so that mapping[nodes[0]['id']] would return:
{
"id": 1,
"name": "开始",
"backpoints": [],
"childs": [
{
"id": 2,
"name": "2号",
"backpoints": [],
"childs": []
},
{
"id": 3,
"name": "3号",
"backpoints": [],
"childs": [
{
"id": 5,
"name": "5号",
"backpoints": [],
"childs": [
{
"id": 7,
"name": "7号",
"backpoints": [
"1",
"3",
"5"
],
"childs": []
}
]
},
{
"id": 8,
"name": "8号",
"backpoints": [],
"childs": []
}
]
},
{
"id": 4,
"name": "4号",
"backpoints": [],
"childs": [
{
"id": 6,
"name": "6号",
"backpoints": [],
"childs": []
}
]
}
]
}
Demo: https://repl.it/repls/StrikingFunctionalCrash

Related

Modify nested dict key name by its parent

I have a following kind of structure to be handled:
payload = {
"name":"Event1",
"events":[
{
"name":"A",
"data":[
{
"name":"subscriptionId",
"data_id":0,
"data":0
},
{
"name":"updateCounter",
"data_id":1,
"data":0
},
{
"name":"noOfMessages",
"data_id":2,
"data":0
},
{
"name":"counter",
"data_id":3,
"data":0
},
{
"name":"resourceElements",
"data_id":4,
"data":0
},
{
"name":"type",
"data_id":5,
"data":0
},
{
"name":"subscription",
"data_id":6,
"data":0
},
{
"name":"element",
"data_id":7,
"data":[
{
"name":"type",
"data_id":0,
"data":0
},
{
"name":"plugLockState",
"data_id":1,
"data":{
"value":""
}
},
{
"name":"lockState",
"data_id":2,
"data":{
"value":""
}
},
{
"name":"flapState",
"data_id":6,
"data":{
"value":""
}
},
{
"name":"plugState",
"data_id":3,
"data":0
},
{
"name":"plugConnectionState",
"data_id":4,
"data":0
},
{
"name":"infrastructureState",
"data_id":5,
"data":0
}
]
}
]
}
]
}
I want to prefix every "name" value within the nested structure with the names of its parents (joined by dots), so the ideal result should look like this:
{
"name":"Event1",
"events":[
{
"name":"Event1.A",
"data":[
{
"name":"Event1.A.subscriptionId",
"data_id":0,
"data":0
},
{
"name":"Event1.A.updateCounter",
"data_id":1,
"data":0
},
{
"name":"Event1.A.noOfMessages",
"data_id":2,
"data":0
},
{
"name":"Event1.A.counter",
"data_id":3,
"data":0
},
{
"name":"Event1.A.resourceElements",
"data_id":4,
"data":0
},
{
"name":"Event1.A.type",
"data_id":5,
"data":0
},
{
"name":"Event1.A.subscription",
"data_id":6,
"data":0
},
{
"name":"Event1.A.element",
"data_id":7,
"data":[
{
"name":"Event1.A.element.type",
"data_id":0,
"data":0
},
{
"name":"Event1.A.element.plugLockState",
"data_id":1,
"data":{
"value":""
}
},
{
"name":"Event1.A.element.lockState",
"data_id":2,
"data":{
"value":""
}
},
{
"name":"Event1.A.element.flapState",
"data_id":6,
"data":{
"value":""
}
},
{
"name":"Event1.A.element.plugState",
"data_id":3,
"data":0
},
{
"name":"Event1.A.element.plugConnectionState",
"data_id":4,
"data":0
},
{
"name":"Event1.A.element.infrastructureState",
"data_id":5,
"data":0
}
]
}
]
}
]
}
so far I have written this recursive method:
# Walk a nested dict/list structure and append the "name" of every dict found
# inside a list value to `names`.
# NOTE(review): neither recursive call forwards `names`, so each nested call
# starts over with a fresh empty list; the collected names are never read and
# no "name" value is ever modified.
def iterate_recursively(dictionary: dict, names=None):
# Create the accumulator per call instead of using a mutable default argument.
if names is None:
names = []
for k, v in dictionary.items():
if isinstance(v, dict):
iterate_recursively(v)
elif isinstance(v, list):
for d in v:
if isinstance(d, dict):
names.append(d["name"])
iterate_recursively(d)
but I simply don't get it. How can the keys, based on my requirement, be changed while iterating recursively?
Here's a variant that returns a new dictionary (and thus leaves the original one unchanged).
code00.py:
#!/usr/bin/env python
import sys
from pprint import pprint as pp
payload = {
"name": "Event1",
"events": [
{
"name": "A",
"data": [
{
"name": "subscriptionId",
"data_id": 0,
"data": 0
},
{
"name": "updateCounter",
"data_id": 1,
"data": 0
},
{
"name": "noOfMessages",
"data_id": 2,
"data": 0
},
{
"name": "counter",
"data_id": 3,
"data": 0
},
{
"name": "resourceElements",
"data_id": 4,
"data": 0
},
{
"name": "type",
"data_id": 5,
"data": 0
},
{
"name": "subscription",
"data_id": 6,
"data": 0
},
{
"name": "element",
"data_id": 7,
"data": [
{
"name": "type",
"data_id": 0,
"data": 0
},
{
"name": "plugLockState",
"data_id": 1,
"data": {
"value": ""
}
},
{
"name": "lockState",
"data_id": 2,
"data": {
"value": ""
}
},
{
"name": "flapState",
"data_id": 6,
"data": {
"value": ""
}
},
{
"name": "plugState",
"data_id": 3,
"data": 0
},
{
"name": "plugConnectionState",
"data_id": 4,
"data": 0
},
{
"name": "infrastructureState",
"data_id": 5,
"data": 0
}
]
}
]
}
]
}
def concat_names(data, names=()):
    """Return a copy of *data* whose "name" values are dotted parent paths.

    Dicts are rebuilt key by key: the "name" entry becomes the names of all
    enclosing dicts plus the dict's own name, joined with ".". Lists and
    tuples are rebuilt as lists, and any other value is returned unchanged.
    The input structure itself is never modified.

    *names* is the tuple of ancestor names accumulated so far.
    """
    if isinstance(data, (list, tuple)):
        return [concat_names(item, names=names) for item in data]
    if not isinstance(data, dict):
        return data

    own_name = data.get("name")
    # A dict without a name simply passes its parent's path down unchanged.
    path = names if own_name is None else names + (own_name,)

    renamed = {}
    for key, value in data.items():
        if key == "name":
            renamed[key] = ".".join(path)
        else:
            renamed[key] = concat_names(value, names=path)
    return renamed
def main(*argv):
    """Pretty-print the renamed copy of the module-level ``payload``.

    The command-line arguments in *argv* are accepted but not used.
    Key order is preserved in the output (``sort_dicts=False``).
    """
    renamed = concat_names(payload)
    pp(renamed, indent=2, sort_dicts=False)
if __name__ == "__main__":
    # Collapse the (possibly multi-line) interpreter version into one line.
    version = " ".join(elem.strip() for elem in sys.version.split("\n"))
    # sys.maxsize exceeds 2**32 only on a 64-bit build.
    bits = 64 if sys.maxsize > 0x100000000 else 32
    banner = "Python {:s} {:03d}bit on {:s}\n".format(version, bits, sys.platform)
    print(banner)
    rc = main(*sys.argv[1:])
    print("\nDone.")
    sys.exit(rc)
Output:
[cfati#CFATI-5510-0:e:\Work\Dev\StackOverflow\q073621243]> "e:\Work\Dev\VEnvs\py_pc064_03.09_test0\Scripts\python.exe" ./code00.py
Python 3.9.9 (tags/v3.9.9:ccb0e6a, Nov 15 2021, 18:08:50) [MSC v.1929 64 bit (AMD64)] 064bit on win32
{ 'name': 'Event1',
'events': [ { 'name': 'Event1.A',
'data': [ { 'name': 'Event1.A.subscriptionId',
'data_id': 0,
'data': 0},
{ 'name': 'Event1.A.updateCounter',
'data_id': 1,
'data': 0},
{ 'name': 'Event1.A.noOfMessages',
'data_id': 2,
'data': 0},
{'name': 'Event1.A.counter', 'data_id': 3, 'data': 0},
{ 'name': 'Event1.A.resourceElements',
'data_id': 4,
'data': 0},
{'name': 'Event1.A.type', 'data_id': 5, 'data': 0},
{ 'name': 'Event1.A.subscription',
'data_id': 6,
'data': 0},
{ 'name': 'Event1.A.element',
'data_id': 7,
'data': [ { 'name': 'Event1.A.element.type',
'data_id': 0,
'data': 0},
{ 'name': 'Event1.A.element.plugLockState',
'data_id': 1,
'data': {'value': ''}},
{ 'name': 'Event1.A.element.lockState',
'data_id': 2,
'data': {'value': ''}},
{ 'name': 'Event1.A.element.flapState',
'data_id': 6,
'data': {'value': ''}},
{ 'name': 'Event1.A.element.plugState',
'data_id': 3,
'data': 0},
{ 'name': 'Event1.A.element.plugConnectionState',
'data_id': 4,
'data': 0},
{ 'name': 'Event1.A.element.infrastructureState',
'data_id': 5,
'data': 0}]}]}]}
Done.
You can do something like this:
def iterate_recursively(dictionary: dict, prefix_name=None):
    """Prefix every nested "name" value with its parents' names, in place.

    If *dictionary* has a "name" key, its value is replaced by
    ``prefix_name + '.' + name`` (or left as is when there is no prefix yet),
    and that dotted path becomes the prefix for everything nested below.
    Dicts without a "name" pass the current prefix through unchanged.

    Args:
        dictionary: The dict to rewrite; it is modified in place.
        prefix_name: Dotted path of the enclosing names, or None at the root.

    Returns:
        None. The result is the mutated *dictionary*.
    """
    if 'name' in dictionary:
        if prefix_name is None:
            prefix_name = dictionary['name']
        else:
            prefix_name += '.' + dictionary['name']
        dictionary['name'] = prefix_name
    for v in dictionary.values():
        if isinstance(v, dict):
            iterate_recursively(v, prefix_name)
        elif isinstance(v, list):
            for d in v:
                # Lists may also hold scalars or strings; only dicts can
                # carry a "name", so everything else is left alone.
                if isinstance(d, dict):
                    iterate_recursively(d, prefix_name)

Do a pymongo Query with elemmatch and filter

I have the following data structure:
[
{
"site_id": ObjectId("5e85b9d20498abd407e9a030"),
"status": "ERROR"
},
{
"site_id": ObjectId("5e85b9d20498abd407e9a120"),
"status": "ERROR"
},
{
"site_id": ObjectId("5e85b9d20498abd407e9a030"),
"status": "OK",
"risk_categories": [
{
"position": 1,
"category_id": 1414,
},
{
"position": 2,
"category_id": 1402,
},
{
"position": 3,
"category_id": 1392,
}
]
}
]
I want to make a query with pymongo like this:
collection.find_one(filter=filter)
where:
filter = {'$and': [{'$and': [{'site_id': ObjectId('5e85b9d20498abd407e9a030')}, {'status': 'OK'}]}, {'risk_categories': {'$elemMatch': {'$or': [{'position': {'$eq': 1}}, {'position': {'$eq': 2}}]}}}]}
However, it returns the entire document, not only the risk_categories values that I want.
What can I do on my filter to modify that?
The aggregation runs from mongo shell:
// Two-stage aggregation: select the matching document, then trim its
// risk_categories array down to the wanted positions.
db.collection.aggregate( [
{
// Stage 1: keep only documents with this site_id and status "OK".
$match: {
site_id: ObjectId('5e85b9d20498abd407e9a030'),
status: "OK"
}
},
{
// Stage 2: overwrite risk_categories with a filtered copy of itself.
$addFields: {
risk_categories: {
$filter: {
input: "$risk_categories",
// "cat" names the current array element; it is referenced as "$$cat" below.
as: "cat",
cond: {
$in: [ "$$cat.position", [ 1, 2 ] ] // this is equivalent to using the "$or"
}
}
}
}
},
] ).pretty()
The output:
{
"_id" : ObjectId("5e85c7b6724e461876467077"),
"site_id" : ObjectId("5e85b9d20498abd407e9a030"),
"status" : "OK",
"risk_categories" : [
{
"position" : 1,
"category_id" : 1414
},
{
"position" : 2,
"category_id" : 1402
}
]
}
Using PyMongo 3.9 and MongoDB 4.2, from the Python shell:
import pymongo
from pymongo import MongoClient
# No arguments are passed, so the driver's default connection settings apply.
client = MongoClient()
# Attribute access selects by name: the "test" database and, inside it, the
# collection that is literally called "collection".
db = client.test
collection = db.collection
import pprint
from bson.objectid import ObjectId
# Stage 1: keep only the documents of this site whose status is OK.
match_stage = {
    '$match': {
        'site_id': ObjectId('5e85b9d20498abd407e9a030'),
        'status': 'OK',
    },
}

# Stage 2: replace risk_categories with only the entries at these positions.
wanted_positions = [1, 2]
filter_stage = {
    '$addFields': {
        'risk_categories': {
            '$filter': {
                'input': '$risk_categories',
                'as': 'cat',
                'cond': {'$in': ['$$cat.position', wanted_positions]},
            },
        },
    },
}

pipeline = [match_stage, filter_stage]
documents = list(collection.aggregate(pipeline))
pprint.pprint(documents)

Elasticsearch return phonetic token with search

I use the phonetic analysis plugin from elastic search to do some string matching thanks to phonetic transformation.
My problem is: how can I get the phonetic transformation computed by Elasticsearch returned in the result of the query?
First, I create an index with a metaphone transformation:
# Index definition: a custom analyzer that ASCII-folds, lowercases and then
# adds Metaphone codes, applied to the `full_name.metaphone_field` sub-field.
request_body = {
    'settings': {
        'index': {
            'analysis': {
                'analyzer': {
                    'metaphone_analyzer': {
                        'tokenizer': 'standard',
                        'filter': [
                            'ascii_folding_filter',
                            'lowercase',
                            'metaphone_filter',
                        ],
                    },
                },
                'filter': {
                    'metaphone_filter': {
                        'type': 'phonetic',
                        'encoder': 'metaphone',
                        # Keep the original token alongside its phonetic code.
                        'replace': False,
                    },
                    'ascii_folding_filter': {
                        'type': 'asciifolding',
                        'preserve_original': True,
                    },
                },
            },
        },
    },
    'mappings': {
        'person_name': {
            'properties': {
                'full_name': {
                    'type': 'text',
                    'fields': {
                        'metaphone_field': {
                            # 'text' replaces the legacy 'string' type, which
                            # Elasticsearch 6 no longer accepts; it also
                            # matches the type of the parent field.
                            'type': 'text',
                            'analyzer': 'metaphone_analyzer',
                        },
                    },
                },
            },
        },
    },
}
res = es.indices.create(index="my_index", body=request_body)
Then, I add some data:
# Index one document per sample name.
names = [
    {"full_name": full_name}
    for full_name in ("John Doe", "Bob Alice", "Foo Bar")
]
for name in names:
    # refresh=True is passed on every call, as in the original request.
    res = es.index(
        index="my_index",
        doc_type='person_name',
        body=name,
        refresh=True,
    )
And finally, I query a name:
# Search every field whose name ends in "_field" for the misspelled name and
# return at most five hits.
search_body = {
    "size": 5,
    "query": {
        "multi_match": {
            "query": "Jon Doe",
            "fields": "*_field",
        },
    },
}
es.search(index="my_index", body=search_body)
Search returns:
{
'took': 1,
'timed_out': False,
'_shards': {
'total': 5,
'successful': 5,
'skipped': 0,
'failed': 0
},
'hits': {
'total':
1,
'max_score':
0.77749264,
'hits': [{
'_index': 'my_index',
'_type': 'person_name',
'_id': 'AWwYjl4Mqo63y_hLp5Yl',
'_score': 0.77749264,
'_source': {
'full_name': 'John Doe'
}
}]
}
}
In the search return I would like to get the phonetic transformation of the names in elastic search (also from the query name but it is less important) when I execute the search.
I know, that I could use explain API but I would like to avoid a 2nd request, and moreover the explain API seems a little "overkill" for what I want to achieve.
Thanks !
It doesn't look like an easy thing to implement in a single Elasticsearch query, but you could try the analyze API and scripted fields with fielddata enabled, and term vectors might come in handy. Here's how.
Retrieve tokens from an arbitrary query
Analyze API is a great tool if you want to understand how exactly does Elasticsearch tokenize your query.
Using your mapping you could do, for example:
GET myindex/_analyze
{
"analyzer": "metaphone_analyzer",
"text": "John Doe"
}
And get something like this as a result:
{
"tokens": [
{
"token": "JN",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "john",
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "T",
"start_offset": 5,
"end_offset": 8,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "doe",
"start_offset": 5,
"end_offset": 8,
"type": "<ALPHANUM>",
"position": 1
}
]
}
This is technically a different query, but still might be useful.
Retrieve tokens from a field of a document
In theory, we could try to retrieve the very same tokens which analyze API returned in the previous section, from the documents matched by our query.
In practice Elasticsearch will not store the tokens of a text field it has just analyzed: fielddata is disabled by default. We need to enable it:
PUT /myindex
{
"mappings": {
"person_name": {
"properties": {
"full_name": {
"fields": {
"metaphone_field": {
"type": "text",
"analyzer": "metaphone_analyzer",
"fielddata": true
}
},
"type": "text"
}
}
}
},
"settings": {
...
}
}
Now, we can use scripted fields to ask Elasticsearch to return those tokens.
The query might look like this:
POST myindex/_search
{
"script_fields": {
"my tokens": {
"script": {
"lang": "painless",
"source": "doc[params.field].values",
"params": {
"field": "full_name.metaphone_field"
}
}
}
}
}
And the response would look like this:
{
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "myindex",
"_type": "person_name",
"_id": "123",
"_score": 1,
"fields": {
"my tokens": [
"JN",
"T",
"doe",
"john"
]
}
}
]
}
}
As you can see, the very same tokens (but in random order).
Can we retrieve also the information about location of these tokens in the document?
Retrieving tokens with their positions
term vectors may help. To be able to use them we actually don't need fielddata enabled. We could lookup term vectors for a document:
GET myindex/person_name/123/_termvectors
{
"fields" : ["full_name.metaphone_field"],
"offsets" : true,
"positions" : true
}
This would return something like this:
{
"_index": "myindex",
"_type": "person_name",
"_id": "123",
"_version": 1,
"found": true,
"took": 1,
"term_vectors": {
"full_name.metaphone_field": {
"field_statistics": {
"sum_doc_freq": 4,
"doc_count": 1,
"sum_ttf": 4
},
"terms": {
"JN": {
"term_freq": 1,
"tokens": [
{
"position": 0,
"start_offset": 0,
"end_offset": 4
}
]
},
"T": {
"term_freq": 1,
"tokens": [
{
"position": 1,
"start_offset": 5,
"end_offset": 8
}
]
},
"doe": {
"term_freq": 1,
"tokens": [
{
"position": 1,
"start_offset": 5,
"end_offset": 8
}
]
},
"john": {
"term_freq": 1,
"tokens": [
{
"position": 0,
"start_offset": 0,
"end_offset": 4
}
]
}
}
}
}
}
This gives a way to get the tokens of a field of a document like the analyzer produced them.
Unfortunately, as far as I know, there is no way to combine these three queries into a single one. Also, fielddata should be used with caution since it uses a lot of memory.
Hope this helps!

Remove duplicate values from list of nested dictionaries

I have a list of dictionaries with a nested structure. I need to remove all duplicate values. I'm a newbie in Python and can't solve this task. Can anyone help me?
My list looks like:
[
{
"task_id":123,
"results":[
{
"url":"site.com",
"date":"04.18.2019"
},
{
"url":"another_site.com",
"date":"04.18.2019"
},
{
"url":"site1.com",
"date":"04.18.2019"
}
]
},
{
"task_id":456,
"results":[
{
"url":"site3.com",
"date":"04.18.2019"
},
{
"url":"site.com",
"date":"04.18.2019"
}
]
},
{
"task_id":789,
"results":[
{
"url":"site7.com",
"date":"04.18.2019"
},
{
"url":"site9.com",
"date":"04.18.2019"
},
{
"url":"site.com",
"date":"04.18.2019"
}
]
}
]
I need to set site.com only once. If any value of url is duplicated - exclude it from dict.
As result:
task 123 with 3 dicts in results
task 456 with 1 dict in results (exclude site.com)
task 789 with 2 dict in results (exclude site.com)
The desired output should look like:
[
{
"task_id":123,
"results":[
{
"url":"site.com",
"date":"04.18.2019"
},
{
"url":"another_site.com",
"date":"04.18.2019"
},
{
"url":"site1.com",
"date":"04.18.2019"
}
]
},
{
"task_id":456,
"results":[
{
"url":"site3.com",
"date":"04.18.2019"
}
]
},
{
"task_id":789,
"results":[
{
"url":"site7.com",
"date":"04.18.2019"
},
{
"url":"site9.com",
"date":"04.18.2019"
}
]
}
]
Let results be your list.
# Drop every result whose URL has already been seen in an earlier result,
# while keeping each task (and its task_id) in the output.
u = set()  # URLs seen so far, across all tasks
final = []
for task in results:
    kept = []
    for res in task["results"]:
        if res["url"] not in u:
            u.add(res["url"])
            kept.append(res)
    # Copy the task with its filtered results; the input list is not modified.
    final.append({**task, "results": kept})
print(final)
You can use a list comprehension:
d = [
    {'task_id': 123, 'results': [{'url': 'site.com', 'date': '04.18.2019'}, {'url': 'another_site.com', 'date': '04.18.2019'}, {'url': 'site1.com', 'date': '04.18.2019'}]},
    {'task_id': 456, 'results': [{'url': 'site3.com', 'date': '04.18.2019'}, {'url': 'site.com', 'date': '04.18.2019'}]},
    {'task_id': 789, 'results': [{'url': 'site7.com', 'date': '04.18.2019'}, {'url': 'site9.com', 'date': '04.18.2019'}, {'url': 'site.com', 'date': '04.18.2019'}]},
]

# For each task keep only the result dicts that do not occur, as whole dicts,
# in the results of any earlier task of the original list.
new_d = []
for i, task in enumerate(d):
    earlier = [res for prev in d[:i] for res in prev['results']]
    fresh = [res for res in task['results'] if res not in earlier]
    new_d.append({**task, 'results': fresh})
Output:
[
{
"task_id": 123,
"results": [
{
"url": "site.com",
"date": "04.18.2019"
},
{
"url": "another_site.com",
"date": "04.18.2019"
},
{
"url": "site1.com",
"date": "04.18.2019"
}
]
},
{
"task_id": 456,
"results": [
{
"url": "site3.com",
"date": "04.18.2019"
}
]
},
{
"task_id": 789,
"results": [
{
"url": "site7.com",
"date": "04.18.2019"
},
{
"url": "site9.com",
"date": "04.18.2019"
}
]
}
]
people = {
    1: {'name': 'John'},
    2: {'name': 'Marie'},
    3: {'name': 'Ann'},
    4: {'name': 'John'},
}
print(people)

# Keep the first key for each distinct value; later duplicates are dropped.
unique = {}
seen = []
for key, value in people.items():
    if value in seen:
        continue
    seen.append(value)
    unique[key] = value
print(unique)
try these

Is there more effective way to get result (O(n+m) rather than O(n*m))?

The original data is shown below: every item has a type field, such as interests, family, behaviors, etc., and I want to group the items by this type field.
return_data = [
{
"id": "112",
"name": "name_112",
"type": "interests",
},
{
"id": "113",
"name": "name_113",
"type": "interests",
},
{
"id": "114",
"name": "name_114",
"type": "interests",
},
{
"id": "115",
"name": "name_115",
"type": "behaviors",
},
{
"id": "116",
"name": "name_116",
"type": "family",
},
{
"id": "117",
"name": "name_117",
"type": "interests",
},
...
]
And the expected output data format is like:
output_data = [
{"interests":[
{
"id": "112",
"name": "name_112"
},
{
"id": "113",
"name": "name_113"
},
...
]
},
{
"behaviors": [
{
"id": "115",
"name": "name_115"
},
...
]
},
{
"family": [
{
"id": "116",
"name": "name_116"
},
...
]
},
...
]
And here is my trial:
# Pass 1: collect the distinct types in first-seen order.
type_list = []
for item in return_data:
if item['type'] not in type_list:
type_list.append(item['type'])
# Pass 2: for every distinct type, rescan all of return_data and keep the
# matching items; this nested scan is the O(n*m) part.
# NOTE: the loop variable `type` shadows the builtin of the same name.
interests_list = []
for type in type_list:
temp_list = []
for item in return_data:
if item['type'] == type:
temp_list.append({"id": item['id'], "name": item['name']})
interests_list.append({type: temp_list})
Obviously my attempt is inefficient, as it is O(n*m), but I cannot find a more efficient way to solve the problem.
Is there a more efficient way to get the result? Any comments are very welcome, thanks.
Use a defaultdict to store a list of items for each type:
from collections import defaultdict
# Single pass: bucket a trimmed copy (id and name only) of each item under
# its type; defaultdict creates the empty list on first use of a type.
temp_dict = defaultdict(list)
for item in return_data:
    trimmed = {"id": item["id"], "name": item["name"]}
    temp_dict[item["type"]].append(trimmed)

# Emit one single-key dict per type, in the order the types were first seen.
output_data = [{kind: members} for kind, members in temp_dict.items()]
Output:
[
{
'behaviors': [
{'name': 'name_115', 'id': '115'}
]
},
{
'family': [
{'name': 'name_116', 'id': '116'}
]
},
{
'interests': [
{'name': 'name_112', 'id': '112'},
{'name': 'name_113', 'id': '113'},
{'name': 'name_114', 'id': '114'},
{'name': 'name_117', 'id': '117'}
]
},
...
]
If you don't want to import defaultdict, you could use a vanilla dictionary with setdefault:
# Plain-dict variant: start from `temp_dict = {}` instead of a defaultdict.
# temp_dict = {}
# This line goes inside the `for item in return_data:` loop; `...` stands
# for the {"id": ..., "name": ...} dict built from the current item.
temp_dict.setdefault(item["type"], []).append(...)
Behaves in exactly the same way, if a little less efficient.
please see Python dictionary for map.
# Concatenate the names of all items of the same type into one string per type.
# NOTE(review): `typeMap` and `delimiter` are not defined in this snippet. The
# right-hand side reads typeMap[item['type']] before it is assigned, so with a
# plain dict the first item of each type raises KeyError. Presumably typeMap is
# meant to supply a default (e.g. collections.defaultdict(str)); confirm.
for item in return_data:
typeMap[item['type']] = typeMap[item['type']] + delimiter + item['name']

Categories