I have a text file which contains dictionaries that are not comma-separated, in the following format:
{} {} {}
Example
{
'header': 'sdf',
'meta': {
'searchId': {
'searchId': 1234
},
'timestamp': 1234,
'attachments': [
'ABC'
],
'xmlData': {
'release': None,
'version': None,
}
}
{
'header': 'sdf',
'timestamp': 14,
'attachments': [
'ABC'
],
'xmlData': {
'release': None,
'version': None,
}
}
These dictionaries may contain nested dictionaries. I want to read this file and turn it into a list of dictionaries i.e. in the format [{},{},{}]
Example
[{
'header': 'sdf',
'meta': {
'searchId': {
'searchId': 1234
},
'timestamp': 1234,
'attachments': [
'ABC'
],
'xmlData': {
'release': None,
'version': None,
}
},
{
'header': 'sdf',
'timestamp': 14,
'attachments': [
'ABC'
],
'xmlData': {
'release': None,
'version': None,
}
}]
Can someone suggest a way to do it.
Thanks
My two other answers assume that the dicts in your data file are on separate lines so that each dict can be parsed as valid Python statements. If that is not the case, however, you can use lib2to3 and modify the Python grammar in Grammar.txt so that a simple statement (denoted by simple_stmt in the grammar file) does not have to end with a newline character:
from lib2to3 import fixer_base, refactor, pygram, pgen2
from io import StringIO
from functools import partialmethod
# Read the stock lib2to3 grammar, dropping the trailing NEWLINE token from the
# ``simple_stmt`` rule so that a statement no longer has to end its line.
# NOTE(review): ``pygram._GRAMMAR_FILE`` is a private attribute, and lib2to3 is
# deprecated (removed in Python 3.13) -- confirm the target interpreter.
with open(pygram._GRAMMAR_FILE) as file:
    grammar = StringIO(''.join(
        line.replace(' NEWLINE', '') if line.startswith('simple_stmt:') else line
        for line in file
    ))
# Pre-bind ``stream`` so the parser generator reads the patched in-memory
# grammar text built above.
pgen2.pgen.ParserGenerator.__init__ = partialmethod(pgen2.pgen.ParserGenerator.__init__, stream=grammar)
# Regenerate the grammar object that lib2to3 parses with.
pygram.python_grammar = pgen2.pgen.generate_grammar()
and look for atom nodes at the top level (whose parent node does not have a parent) instead:
class ScrapeAtoms(fixer_base.BaseFix):
    """Fixer that collects top-level ``atom`` nodes instead of rewriting them."""

    PATTERN = "atom"

    def __init__(self, *args):
        super().__init__(*args)
        # Matching nodes, in the order transform() receives them.
        self.nodes = []

    def transform(self, node, results):
        """Record *node* if it is a top-level atom, then return it unchanged."""
        # Keep only atoms whose parent has no parent of its own; atoms nested
        # inside another literal are skipped.
        if not node.parent.parent:
            self.nodes.append(node)
        return node
class Refactor(refactor.RefactoringTool):
    """RefactoringTool whose only fixer is a ScrapeAtoms collector."""

    def get_fixers(self):
        """Create the scraper and return it as the sole fixer.

        The scraper goes in the first returned list; the second is left empty.
        """
        self.scraper = ScrapeAtoms(None, None)
        return [self.scraper], []

    def get_result(self):
        """Return the collected nodes as the source text of a Python list."""
        # Each node's trailing whitespace is stripped before joining with ",\n".
        return '[%s]\n' % ',\n'.join(str(node).rstrip() for node in self.scraper.nodes)
so that:
s = '''{'a': {1: 2}}{'b': 2}{
'c': 3
}{'d': 4}'''
# Named ``tool`` rather than ``refactor`` so the imported lib2to3 ``refactor``
# module is not shadowed by the instance.
tool = Refactor(None)
tool.refactor_string(s, '')
print(tool.get_result())
outputs:
[{'a': {1: 2}},
{'b': 2},
{
'c': 3
},
{'d': 4}]
Demo: https://repl.it/#blhsing/CompleteStarchyFactorial
As others have stated in the comments, this isn't JSON data. You merely have multiple string representations of dicts pretty-printed to the file in succession, and you're also missing a closing bracket in the first one.
So I suggest looping through the file and build a string for each dict then you can use ast.literal_eval to parse the string into a dict. Something like this:
from ast import literal_eval
current = ''  # text of the dict currently being accumulated
data = []     # parsed dicts, in file order
with open('filename.txt') as f:
    for line in f:
        if line.startswith('{'):
            # An opening brace in column 0 starts a new top-level dict.
            current = line
        elif line.startswith('}'):
            # A closing brace in column 0 ends it: parse the buffered text.
            # This relies on nested closing braces being indented in the file.
            data.append(literal_eval(current + line))
        else:
            current += line
Results in data (using pprint):
[{'header': 'sdf',
'meta': {'attachments': ['ABC'],
'searchId': {'searchId': 1234},
'timestamp': 1234,
'xmlData': {'release': None, 'version': None}}},
{'attachments': ['ABC'],
'header': 'sdf',
'timestamp': 14,
'xmlData': {'release': None, 'version': None}}]
After this you should overwrite the data, and never use this as a serialization format again. This is why there are libraries for this.
Since each dict in the file is a valid Python statement, a more robust solution would be to use the lib2to3 to parse the file as Python code and extract the statement nodes so that you can enclose them in square brackets, separated by commas:
from lib2to3 import fixer_base, refactor
class ScrapeStatements(fixer_base.BaseFix):
    """Fixer that collects every ``simple_stmt`` node instead of rewriting it."""

    PATTERN = "simple_stmt"

    def __init__(self, *args):
        super().__init__(*args)
        # Matching nodes, in the order transform() receives them.
        self.nodes = []

    def transform(self, node, results):
        """Record *node* and return it unchanged."""
        self.nodes.append(node)
        return node
class Refactor(refactor.RefactoringTool):
    """RefactoringTool whose only fixer is a ScrapeStatements collector."""

    def get_fixers(self):
        """Create the scraper and return it as the sole fixer.

        The scraper goes in the first returned list; the second is left empty.
        """
        self.scraper = ScrapeStatements(None, None)
        return [self.scraper], []

    def get_result(self):
        """Return the collected statements as the source text of a Python list."""
        # Each node's trailing whitespace is stripped before joining with ",\n".
        return '[%s]\n' % ',\n'.join(str(node).rstrip() for node in self.scraper.nodes)
so that:
s = '''{
'header': 'sdf',
'meta': {
'searchId': {
'searchId': 1234
},
'timestamp': 1234,
'attachments': [
'ABC'
],
'xmlData': {
'release': None,
'version': None,
}
}
}
{
'header': 'sdf',
'timestamp': 14,
'attachments': [
'ABC'
],
'xmlData': {
'release': None,
'version': None,
}
}
'''
# Named ``tool`` rather than ``refactor`` so the imported lib2to3 ``refactor``
# module is not shadowed by the instance.
tool = Refactor(None)
tool.refactor_string(s, '')
print(tool.get_result())
outputs:
[{
'header': 'sdf',
'meta': {
'searchId': {
'searchId': 1234
},
'timestamp': 1234,
'attachments': [
'ABC'
],
'xmlData': {
'release': None,
'version': None,
}
}
},
{
'header': 'sdf',
'timestamp': 14,
'attachments': [
'ABC'
],
'xmlData': {
'release': None,
'version': None,
}
}]
If all the dicts in the file are on separate lines as they are in your sample input, then each dict by itself is a valid Python statement, so you can use ast.parse to parse the file into an abstract syntax tree, look for the expression nodes (of type Expr), and build a new Expression node with a List node to hold all the aforementioned Expr nodes. The new Expression node can then be compiled and evaluated as an actual Python list of dicts, so that given your sample input data in variable s:
import ast
# Parse the whole text as Python source.
tree = ast.parse(s)
# Take the wrapped expression out of every Expr statement node found in the tree.
exprs = [node.value for node in ast.walk(tree) if isinstance(node, ast.Expr)]
# Put the collected expressions into one list literal inside an eval-mode Expression.
new = ast.Expression(body=ast.List(elts=exprs, ctx=ast.Load()))
# Fill in location info on the newly created nodes before compiling.
ast.fix_missing_locations(new)
# NOTE(review): eval() runs whatever expressions the input contains -- use on trusted data only.
lst = eval(compile(new, '', 'eval'))
lst would become:
[{'header': 'sdf',
'meta': {'searchId': {'searchId': 1234},
'timestamp': 1234,
'attachments': ['ABC'],
'xmlData': {'release': None, 'version': None}}},
{'header': 'sdf',
'timestamp': 14,
'attachments': ['ABC'],
'xmlData': {'release': None, 'version': None}}]
Demo: https://repl.it/#blhsing/FocusedCylindricalTypes
Related
I need to extract 2 values from this list of dictionary and store it as a key-value pair.
Here I attached sample data..Where I need to extract "Name" and "Service" from this input and store it as a dictionary. Where "Name" is Key and corresponding "Service" is its value.
Input:
response = {
'Roles': [
{
'Path': '/',
'Name': 'Heera',
'Age': '25',
'Policy': 'Policy1',
'Start_Month': 'January',
'PolicyDocument':
{
'Date': '2012-10-17',
'Statement': [
{
'id': '',
'RoleStatus': 'New_Joinee',
'RoleType': {
'Service': 'Service1'
},
'Action': ''
}
]
},
'Duration': 3600
},
{
'Path': '/',
'Name': 'Prem',
'Age': '40',
'Policy': 'Policy2',
'Start_Month': 'April',
'PolicyDocument':
{
'Date': '2018-11-27',
'Statement': [
{
'id': '',
'RoleStatus': 'Senior',
'RoleType': {
'Service': ''
},
'Action': ''
}
]
},
'Duration': 2600
},
]
}
From this input, I need output as a dictionary type.
Output Format: { Name : Service }
Output:
{ "Heera":"Service1","Prem" : " "}
My try:
Role_name =[]
response = {#INPUT WHICH I SPECIFIED ABOVE#}
roles = response['Roles']
for role in roles:
Role_name.append(role['Name'])
print(Role_name)
I need to pair the name with its corresponding service. Any help would be really appreciable.
Thanks in advance.
You just have to write a long line which can reach till the key 'Service'.
Also, you have a syntax error on the lines 'Start_Month': 'January') and 'Start_Month': 'April'): each ends with a closing parenthesis that has no matching opening one. You can't have an unmatched bracket.
Fix it and run the following.
This is the code:
# Map each role's name to the service named in its first policy statement.
output_dict = {}
for r in response['Roles']:
    output_dict[r["Name"]] = r['PolicyDocument']['Statement'][0]['RoleType']['Service']
print(output_dict)
Output:
{'Heera': 'Service1', 'Prem': ''}
You just have to do like this:
# Build a list holding one single-entry {name: service} dict per role,
# using the first policy statement of each role.
liste = []
for role in response['Roles']:
    liste.append(
        {
            role['Name']:role['PolicyDocument']['Statement'][0]['RoleType']['Service'],
        }
    )
print(liste)
It seems your input data is structured kind of strange and I am not sure what the ) are doing next to the months since they make things invalid but here is a working script assuming you removed the parenthesis from your input.
response = {
'Roles': [
{
'Path': '/',
'Name': 'Heera',
'Age': '25',
'Policy': 'Policy1',
'Start_Month': 'January',
'PolicyDocument':
{
'Date': '2012-10-17',
'Statement': [
{
'id': '',
'RoleStatus': 'New_Joinee',
'RoleType': {
'Service': 'Service1'
},
'Action': ''
}
]
},
'Duration': 3600
},
{
'Path': '/',
'Name': 'Prem',
'Age': '40',
'Policy': 'Policy2',
'Start_Month': 'April',
'PolicyDocument':
{
'Date': '2018-11-27',
'Statement': [
{
'id': '',
'RoleStatus': 'Senior',
'RoleType': {
'Service': ''
},
'Action': ''
}
]
},
'Duration': 2600
},
]
}
# Map each role's name to the service named in its first policy statement.
output = {}
for i in response['Roles']:
    output[i['Name']] = i['PolicyDocument']['Statement'][0]['RoleType']['Service']
print(output)
This should give you what you want in a variable called role_services:
role_services = {}
for role in response['Roles']:
    # Walk every statement; a later statement overwrites the value stored
    # for the same role name by an earlier one.
    for st in role['PolicyDocument']['Statement']:
        role_services[role['Name']] = st['RoleType']['Service']
It will ensure you'll go through all of the statements within that data structure but be aware you'll overwrite key-value pairs as you traverse the response, if they exist in more than a single entry!
A reference on for loops which might be helpful, illustrates using if statements within them which can help you to extend this to check if items already exist!
Hope that helps
So I have been trying to figure out how I can print out two different formats using one for loop. I would like to provide the code before explaining my issue
fullList = [
{
'url': 'www.randomsite.com/251293',
'numbers': '7.5'
},
{
'url': 'www.randomsite.com/251294',
'numbers': '8'
},
{
'url': 'www.randomsite.com/251295',
'numbers': '8.5'
},
{
'url': 'www.randomsite.com/251296',
'numbers': '9'
},
{
'url': 'www.randomsite.com/251297',
'numbers': '9.5'
}
]
#fullList = [
# {
# 'numbers': '7.5'
# },
# {
# 'numbers': '8'
# },
# {
# 'numbers': '8.5'
# },
# {
# 'numbers': '9'
# },
# {
# 'numbers': '9.5'
# }
#]
try:
    numbersList = []
    for numbers in fullList:
        # Prefix the number with the url when one is present, otherwise with ''.
        numbersList.append('{}{}'.format('{}'.format(numbers.get('url') if numbers.get('url') else ''), numbers.get('numbers')))
    print(numbersList)
except Exception:
    # NOTE(review): any error raised above is silently discarded here.
    pass
and what I am looking for outcome is:
If url is in the list: print('<url|numbers>') meaning the format would be <url|numbers>
If no url is in the list: print(numbers) and the print here should only give the numbers - I sometimes just want the numbers, meaning that in the list I removed all URL's so it will only remain numbers.
My problem is that I don't know how I can combine these two into one format. So far I am able to print out only numbers with the code I have provided.
Use normal if/else. It will be more readable. And you have only one format.
for numbers in fullList:
    if numbers.get('url'):
        # A url is present: emit "url|numbers".
        numbersList.append('{}|{}'.format(numbers.get('url'), numbers.get('numbers')))
    else:
        # No url: emit the bare number.
        numbersList.append(numbers.get('numbers'))
You can solve this problem and it will look more pythonic this way:
fullList = [
{'url': 'www.randomsite.com/251293', 'numbers': '7.5'},
{'url': 'www.randomsite.com/251294', 'numbers': '8'},
{'url': 'www.randomsite.com/251295', 'numbers': '8.5'},
{'url': 'www.randomsite.com/251296', 'numbers': '9'},
{'url': 'www.randomsite.com/251297', 'numbers': '9.5'},
{'numbers': '100'}
]
[(x['url'] + '|' + x['numbers']) if x.get('url') else x['numbers'] for x in fullList ]
You are using list comprehensions, minimizing nesting etc.
One solution is to select all values in each subdict and join them with a custom delimiter. In this way, you don't care if the key/value exist or not.
# Let's consider partial data
fullList = [
    {
        'url': 'www.randomsite.com/251293',
        'numbers': '7.5'
    },
    {
        'url': 'www.randomsite.com/251294',
        'numbers': '8'
    },
    {
        'url': 'www.randomsite.com/251295',
        'numbers': '8.5'
    },
    {
        'url': 'www.randomsite.com/251296',
    },
    {
        'numbers': '9.5'
    }
]

numbersList = []
for element in fullList:
    # Join whatever values the entry has, in insertion order, so a missing
    # key needs no special handling.
    numbersList.append("|".join(element.values()))
print(numbersList)
# ['www.randomsite.com/251293|7.5', 'www.randomsite.com/251294|8',
# 'www.randomsite.com/251295|8.5', 'www.randomsite.com/251296', '9.5']
You can do it in one line with list comprehension:
output = ["|".join([element[v] for v in element.keys()]) for element in fullList]
print(output)
# ['www.randomsite.com/251293|7.5', 'www.randomsite.com/251294|8',
# 'www.randomsite.com/251295|8.5', 'www.randomsite.com/251296', '9.5']
Using list comprehension
Ex.
fullList = [
{'url': 'www.randomsite.com/251293','numbers': '7.5'},
{'url': 'www.randomsite.com/251294','numbers': '8'},
{'url': 'www.randomsite.com/251295','numbers': '8.5'},
{'url': 'www.randomsite.com/251296','numbers': '9'},
{'url': 'www.randomsite.com/251297','numbers': '9.5'}
]
list1 = [ "{0}|{1}".format(x['url'],x['numbers']) for x in fullList ]
print(list1)
O/P:
['www.randomsite.com/251293|7.5', 'www.randomsite.com/251294|8', 'www.randomsite.com/251295|8.5', 'www.randomsite.com/251296|9', 'www.randomsite.com/251297|9.5']
OR
for the updated question, if the dictionary does not contain url
fullList = [
{'url': 'www.randomsite.com/251296','numbers': '9'},
{'numbers': '9.5'}
]
list1 = [ "{0}{1}".format((x.get('url')+'|' if 'url' in x else ''),x.get('numbers','')) for x in fullList ]
print(list1)
O/P:
['www.randomsite.com/251296|9', '9.5']
I have a JSON with following structure:
{
'count': 93,
'apps' : [
{
'last_modified_at': '2016-10-21T12:20:26Z',
'frequency_caps': [],
'ios': {
'enabled': True,
'push_enabled': False,
'app_store_id': 'bbb',
'connection_type': 'certificate',
'sdk_api_secret': '--'
},
'organization_id': '--',
'name': '---',
'app_id': 27,
'control_group_percentage': 0,
'created_by': {
'user_id': 'abc',
'user_name': 'def'
},
'created_at': '2016-09-28T11:41:24Z',
'web': {}
}, {
'last_modified_at': '2016-10-12T08:58:57Z',
'frequency_caps': [],
'ios': {
'enabled': True,
'push_enabled': True,
'app_store_id': '386304604',
'connection_type': 'certificate',
'sdk_api_secret': '---',
'push_expiry': '2018-01-14T08:24:09Z'
},
'organization_id': '---',
'name': '---',
'app_id': 87,
'control_group_percentage': 0,
'created_by': {
'user_id': '----',
'user_name': '---'
},
'created_at': '2016-10-12T08:58:57Z',
'web': {}
}
]
}
It's a JSON with two key-value-pairs. The second pair's value is a List of more JSON's.
For me it is too much information and I want to have a JSON like this:
{
'apps' : [
{
'name': 'Appname',
'app_id' : 1234,
'organization_id' : 'Blablabla'
},
{
'name': 'Appname2',
'app_id' : 5678,
'organization_id' : 'Some other Organization'
}
]
}
I want to have a JSON that only contains one key ("apps") and its value, which would be a List of more JSONs that only have three key-value-pairs..
I am thankful for any advice.
Thank you for your help!
#bishakh-ghosh I don't think you need to use the input json as string. It can be used straight as a dictionary. (thus avoid ast)
One more concise way :
# your original json
input_ = { 'count': 93, ... }
And here are the steps :
Define what keys you want to keep
slice_keys = ['name', 'app_id', 'organization_id']
Define the new dictionary as a slice on the slice_keys
dict(apps=[{key:value for key,value in d.items() if key in slice_keys} for d in input_['apps']])
And that's it.
That should yield the JSON formatted as you want, e.g
{
'apps':
[
{'app_id': 27, 'name': '---', 'organization_id': '--'},
{'app_id': 87, 'name': '---', 'organization_id': '---'}
]
}
This might be what you are looking for:
import ast
import json
json_str = """{
'count': 93,
'apps' : [
{
'last_modified_at': '2016-10-21T12:20:26Z',
'frequency_caps': [],
'ios': {
'enabled': True,
'push_enabled': False,
'app_store_id': 'bbb',
'connection_type': 'certificate',
'sdk_api_secret': '--'
},
'organization_id': '--',
'name': '---',
'app_id': 27,
'control_group_percentage': 0,
'created_by': {
'user_id': 'abc',
'user_name': 'def'
},
'created_at': '2016-09-28T11:41:24Z',
'web': {}
}, {
'last_modified_at': '2016-10-12T08:58:57Z',
'frequency_caps': [],
'ios': {
'enabled': True,
'push_enabled': True,
'app_store_id': '386304604',
'connection_type': 'certificate',
'sdk_api_secret': '---',
'push_expiry': '2018-01-14T08:24:09Z'
},
'organization_id': '---',
'name': '---',
'app_id': 87,
'control_group_percentage': 0,
'created_by': {
'user_id': '----',
'user_name': '---'
},
'created_at': '2016-10-12T08:58:57Z',
'web': {}
}
]
}"""
# The text uses Python literal syntax (single quotes, True/False), so it is
# parsed with ast.literal_eval.
json_dict = ast.literal_eval(json_str)
new_dict = {}
app_list = []
for appdata in json_dict['apps']:
    # Copy over only the three fields of interest.
    appdata_dict = {}
    appdata_dict['name'] = appdata['name']
    appdata_dict['app_id'] = appdata['app_id']
    appdata_dict['organization_id'] = appdata['organization_id']
    app_list.append(appdata_dict)
new_dict['apps'] = app_list
new_json_str = json.dumps(new_dict)
print(new_json_str) # This is your resulting json string
Lets take for example the following collections:
{
'_id': '0',
'docs': [
{'value': 'abcd', 'key': '1234'},
{'value': 'abef', 'key': '5678'}
]
}
{
'_id': '1',
'docs': [
{'value': 'wxyz', 'key': '1234'},
{'value': 'abgh', 'key': '5678'}
]
}
I want to be able to select only the sub-documents under the 'docs' list which 'value' contains the string 'ab'. What I'm expecting to get is the following collections:
{
'_id': '0',
'docs': [
{'value': 'abcd', 'key': '1234'},
{'value': 'abef', 'key': '5678'}
]
}
{
'_id': '1',
'docs': [
{'value': 'abgh', 'key': '5678'}
]
}
Thus, filtering out the unmatched sub-documents.
You need an aggregation pipeline that matches each subdocument separately, then re-joins the matching subdocuments into arrays:
from pprint import pprint
from bson import Regex
# Pattern that a sub-document's 'value' must match.
regex = Regex(r'ab')
pprint(list(col.aggregate([{
# Stage 1: split the 'docs' array so each sub-document is handled separately.
'$unwind': '$docs'
}, {
# Stage 2: keep only the sub-documents whose 'value' matches the regex.
'$match': {'docs.value': regex}
}, {
# Stage 3: re-join the matching sub-documents into a 'docs' array per _id.
'$group': {
'_id': '$_id',
'docs': {'$push': '$docs'}
}
}])))
I assume "col" is a variable pointing to your PyMongo Collection object. This outputs:
[{u'_id': u'1',
u'docs': [{u'key': u'5678', u'value': u'abgh'}]},
{u'_id': u'0',
u'docs': [{u'key': u'1234', u'value': u'abcd'},
{u'key': u'5678', u'value': u'abef'}]}]
The "r" prefix to the string makes it a Python "raw" string to avoid any trouble with regex code. In this case the regex is just "ab" so the "r" prefix isn't necessary, but it's good practice now so you don't make a mistake in the future.
Consider a basic adjacency list; a list of nodes represent by a Node class, with properties id, parent_id, and name. The parent_id of top-level nodes = None.
What would be a Pythonic way of transforming the list into an un-ordered html menu tree, e.g.:
node name
node name
sub-node name
sub-node name
Assuming you've got something like this:
data = [
{ 'id': 1, 'parent_id': 2, 'name': "Node1" },
{ 'id': 2, 'parent_id': 5, 'name': "Node2" },
{ 'id': 3, 'parent_id': 0, 'name': "Node3" },
{ 'id': 4, 'parent_id': 5, 'name': "Node4" },
{ 'id': 5, 'parent_id': 0, 'name': "Node5" },
{ 'id': 6, 'parent_id': 3, 'name': "Node6" },
{ 'id': 7, 'parent_id': 3, 'name': "Node7" },
{ 'id': 8, 'parent_id': 0, 'name': "Node8" },
{ 'id': 9, 'parent_id': 1, 'name': "Node9" }
]
This function iterates through the list and creates the tree, collecting the children of each node in its 'sub' list:
def list_to_tree(data):
    """Build a nested tree from a flat adjacency list.

    Each item of *data* is a dict with at least ``'id'`` and ``'parent_id'``
    keys; a ``parent_id`` of 0 marks a top-level node. The input dicts are
    copied, not mutated.

    Returns the synthetic root node (id 0). Every node in the tree carries a
    ``'sub'`` list holding its children in input order.
    """
    out = {
        0: { 'id': 0, 'parent_id': 0, 'name': "Root node", 'sub': [] }
    }
    for p in data:
        # Create placeholders first so a child may appear before its parent.
        out.setdefault(p['parent_id'], { 'sub': [] })
        out.setdefault(p['id'], { 'sub': [] })
        # Fill in the node's own fields, keeping any children already attached.
        out[p['id']].update(p)
        out[p['parent_id']]['sub'].append(out[p['id']])
    return out[0]
Example:
tree = list_to_tree(data)
import pprint
pprint.pprint(tree)
If parent ids are None's (not 0's), modify the function like this:
def list_to_tree(data):
    """Build a nested tree from a flat adjacency list with ``None`` parents.

    Each item of *data* is a dict with at least ``'id'`` and ``'parent_id'``
    keys; a falsy ``parent_id`` (such as ``None``) marks a top-level node.
    The input dicts are copied, not mutated.

    Returns the synthetic root node. Every node in the tree carries a
    ``'sub'`` list holding its children in input order.
    """
    out = {
        'root': { 'id': 0, 'parent_id': 0, 'name': "Root node", 'sub': [] }
    }
    for p in data:
        # Top-level nodes are filed under the 'root' key.
        pid = p['parent_id'] or 'root'
        # Create placeholders first so a child may appear before its parent.
        out.setdefault(pid, { 'sub': [] })
        out.setdefault(p['id'], { 'sub': [] })
        # Fill in the node's own fields, keeping any children already attached.
        out[p['id']].update(p)
        out[pid]['sub'].append(out[p['id']])
    return out['root']
    # or return out['root']['sub'] to return the list of root nodes
This is how I ended up implementing it- #thg435's way is elegant, but builds a list of dictionaries to print. This one will print an actual HTML UL menu tree:
nodes = [
    { 'id':1, 'parent_id':None, 'name':'a' },
    { 'id':2, 'parent_id':None, 'name':'b' },
    { 'id':3, 'parent_id':2, 'name':'c' },
    { 'id':4, 'parent_id':2, 'name':'d' },
    { 'id':5, 'parent_id':4, 'name':'e' },
    { 'id':6, 'parent_id':None, 'name':'f' }
]

# Accumulates the generated HTML; both functions below append to it.
output = ''


def build_node(node):
    """Append the ``<li>`` markup for *node*, including its children, to ``output``."""
    global output
    output += '<li><a>'+node['name']+'</a>'
    # Recurse: render this node's children (if any) inside the same <li>.
    build_nodes(node['id'])
    output += '</li>'


def build_nodes(node_parent_id):
    """Append a ``<ul>`` of all nodes whose parent is *node_parent_id* to ``output``.

    Nothing is appended when there are no such nodes.
    """
    global output
    subnodes = [node for node in nodes if node['parent_id'] == node_parent_id]
    if len(subnodes) > 0 :
        output += '<ul>'
        for subnode in subnodes:
            build_node(subnode)
        output += '</ul>'


build_nodes(None) # Pass in None as a parent id to start with top level nodes
print(output)
You can see it here: http://ideone.com/34RT4
Mine uses recursion (cool) and a global output string (not cool)
Someone could surely improve on this, but it's working for me right now..