Python - Convert JSON key/values into key/value where value is an array - python

I have a JSON file with numerous entries like this:
{
"area1": "California",
"area2": "Sierra Eastside",
"area3": "Bishop Area",
"area4": "Volcanic Tablelands (Happy/Sad Boulders)",
"area5": "Fish Slough Boulders",
"grade": "V6 ",
"route": "The Orgasm",
"type1": "Boulder",
"type2": "NONE",
"type3": "NONE",
"type4": "NONE",
},
I want to take the area and type entries and turn them into arrays:
{
"area": ["California","Sierra Eastside","Bishop Area","Volcanic Tablelands (Happy/Sad Boulders)","Fish Slough Boulders"]
"grade": "V6 ",
"route": "The Orgasm",
"type": ["Boulder","NONE","NONE","NONE"]
},
I have this code which almost works:
json_data=open('../json/routes_test.json')
datas = json.load(json_data)
datas_arrays = []
area_keys = ['area1','area2','area3','area4','area5']
type_keys = ['type1','type2','type3','type4']
for data in datas:
areaArray = []
typeArray = []
deleteArray = []
for k, v in data.iteritems():
for area_key in area_keys:
if (k == area_key):
areaArray.append(v)
deleteArray.append(k)
for type_key in type_keys:
if (k == type_key):
typeArray.append(v)
deleteArray.append(k)
for k in deleteArray:
del data[k]
data['area'] = areaArray
data['type'] = typeArray
datas_arrays.append(data)
print datas_arrays
print "********"
out = json.dumps(datas_arrays, sort_keys=True,indent=4, separators=(',', ': '))
print out
f_out= open('../json/toues_test_intoarrays.json', 'w')
f_out.write(out)
f_out.close()
The problem is that the area array is all out of order and the type array is backwards, which I can't have. I find it strange that one is unordered and one is ordered but backwards. To me it seems like the iteration should assure they're placed in order.

Python dictionaries have an arbitrary ordering, they are not sorted. You want to use your prebuilt lists of keys instead:
with open('../json/routes_test.json') as json_data:
datas = json.load(json_data)
area_keys = ['area1','area2','area3','area4','area5']
type_keys = ['type1','type2','type3','type4']
for data in datas:
data['area'] = [data[k] for k in area_keys]
data['type'] = [data[k] for k in type_keys]
for k in area_keys + type_keys:
del data[k]
out = json.dumps(datas, sort_keys=True, indent=4, separators=(',', ': '))
print out
with open('../json/toues_test_intoarrays.json', 'w') as f_out:
f_out.write(out)
which changes the dictionaries in-place.
You could even determine the area and type keys from each entry:
for data in datas:
keys = sorted(data.keys())
area_keys = [k for k in keys if k.startswith('area')]
data['area'] = [data[k] for k in area_keys]
type_keys = [k for k in keys if k.startswith('type')]
data['type'] = [data[k] for k in type_keys]
for k in area_keys + type_keys:
del data[k]
and omit the list literals with the 'area1', 'area2' etc. hardcoded lists altogether.

Iterate the keys in order.
for k, v in sorted(data.iteritems()):
This will fail once you get past 9, but it will do for now.

Related

in python, read csv data and. convert unique values into python's dictionary

How do I convert below comma delimited records -
COL1,COL2,COL3,COL4
A101,P501,U901,US_A
A101,P501,U902,US_B
A101,P502,U901,US_A
A102,P501,U901,US_A
A102,P502,U902,US_B
into python dictionary -
result = {
"A101": {
"P501": {"U901": "US_A", "U902": "US_B"},
"P502": {"U901": "US_A"}
},
"A102": {
"P501": {"U901": "US_A"},
"P502": {"U902": "US_B"}
}
}
Thank you for your help!
Approach
We can process the rows of the CSV file as follows:
Convert each row in CSV file from a list to a nested dictionary using Convert a list to nested dictionary i.e. line reduce(lambda x, y: {y: x}, reversed(row))) in code below.
Merge the nested dictionaries using
Merge nested dictionaries in Python using merge_dict function below
Code
import csv
def csv_to_nested_dict(filenm):
' CSV file to nested dictionary '
with open(filenm, 'r') as csvfile:
csv_reader = csv.reader(csvfile, delimiter=',')
next(csv_reader) # skip header row
result = {}
for row in csv_reader:
# Convert row to nested dictionary and
# Merge into result
result = merge_dict(result,
reduce(lambda x, y: {y: x}, reversed(row))) # row to nested dictionary
return result
def merge_dict(dict1, dict2):
' Merges nested dictionaries '
for key, val in dict1.items():
if type(val) == dict:
if key in dict2 and type(dict2[key] == dict):
merge_dict(dict1[key], dict2[key])
else:
if key in dict2:
dict1[key] = dict2[key]
for key, val in dict2.items():
if not key in dict1:
dict1[key] = val
return dict1
Test
Usage:
res = csv_to_nested_dict('test.txt') # result
# Use json to pretty print nested dictionary res
import json
print(json.dumps(res, indent = 4))
Input File test.txt
COL1,COL2,COL3,COL4
A101,P501,U901,US_A
A101,P501,U902,US_B
A101,P502,U901,US_A
A102,P501,U901,US_A
A102,P502,U902,US_B
Output
{
"A101": {
"P501": {
"U901": "US_A",
"U902": "US_B"
},
"P502": {
"U901": "US_A"
}
},
"A102": {
"P501": {
"U901": "US_A"
},
"P502": {
"U902": "US_B"
}
}
}
here is simple version of solution -
def dict_reader(file_name):
with open(file_name, 'r') as csvfile:
reader = csv.DictReader(csvfile)
try:
data = dict()
for row in reader:
col1, col2, col3, col4 = (row["col1"], row["col2"], row["col3"], row["col4"])
if col1 in data:
if col2 in data[col1]:
data[col1][col2].update({col3: col4})
else:
data[col1][col2] = {col3: col4}
else:
data[col1] = {col2: {col3: col4}}
except csv.Error as e:
sys.exit('file {}, line {}: {}'.format(file_name, reader.line_num, e))
finally:
return data
it is not very elegant solution but works.

Removing duplicate key and appending the value of the deleted key in Python

Let's say I've got some results like the below from iterating thru JSON file.
{257585701: [156173119], 667512043: [228087519], 821360748: [5350676] and more }
{136607969: [13510118], 667512043: [13510118], 257585701: [13510118] and more }
{....................more data..........}
{....................more data..........}
like 100s
Now, if I wanna delete the duplicate value and append the value (from deleted duplicate value) to the original key, how can I do that? I'm hoping to get something like this:
{257585701: [156173119,13510118 ], 667512043: [228087519, 13510118], 821360748: [5350676], 136607969: [13510118]}
My codes are:
import json
filepath = '../data/' # I have subdirectories and tons of json file
with open(filepath) as stream:
data = json.load(stream)
results = {}
for item in data['info']['items']:
cid = item['id']
for trainer in item['trainer']:
tid = trainer['id']
if tid not in trainers:
trainers[tid] = []
trainers[tid].append(cid)
print(results)
# this print(results) prints the dictionary I mentioned above and they're like 100s of them.
This iterates through all the keys in dict2 and if it is already present it appends the value, otherwise it adds a new key:
dict1 = {257585701: [156173119], 667512043: [228087519], 821360748: [5350676]}
dict2 = {136607969: [13510118], 667512043: [13510118], 257585701: [13510118]}
dict3 = dict1
for k, v in dict2.items():
if k in dict3.keys():
dict3[k] += v
else:
dict3[k] = v
print(dict3)
Output:
{257585701: [156173119, 13510118], 667512043: [228087519, 13510118], 821360748: [5350676], 136607969: [13510118]}
You can start here
def merge_dicts(*dicts):
d = {}
for dict in dicts:
for key in dict:
try:
d[key].append(dict[key])
except KeyError:
d[key] = [dict[key]]
return d
pass all dicts in merge_dicts(d1,d2,d3..)
Write functions.
I can't test the code fully because I don't have access to your input. I also had to guess the type of trainers. The following code hopefully approaches a solution.
from collections import defaultdict
import json
def read_one_json(filepath: str, trainers: [dict]) -> dict:
with open(filepath) as stream:
data = json.load(stream)
results = {}
for item in data['info']['items']:
cid = item['id']
for trainer in item['trainer']:
tid = trainer['id']
if tid not in trainers:
trainers[tid] = []
trainers[tid].append(cid)
return results
def read_jsons(filepaths: [str], trainers: [dict]) -> list[dict]:
jsons = []
for filepath in filepaths:
jsons.append(read_one_json(filepath, trainers))
return jsons
def combine_dicts(dicts: [dict]) -> dict:
"""
dicts is list of dicts of (int, [int]) pairs.
combine_dicts returns a new dict where the values of duplicate keys are combined
>>> dicts = [{257585701: [156173119], 667512043: [228087519], 821360748: [5350676]}]
>>> dicts += [{136607969: [13510118], 667512043: [13510118], 257585701: [13510118]}]
>>> combine_dicts(dicts)
defaultdict(<class 'list'>, {257585701: [156173119, 13510118], 667512043: [228087519, 13510118], 821360748: [5350676], 136607969: [13510118]})
"""
combined_data = defaultdict(list)
for data in dicts:
for key, value in data.items():
combined_data[key] += value
return combined_data
def main() -> None:
filepaths = [...] # you supply these
trainers = [...] # you supply these
separate_data = read_jsons(filepaths, trainers)
combined_data = combine_dicts(separate_data)
print(combined_data)
if __name__ == '__main__':
main()
You can try to ingest the data string into a list of dictionary and process from there.
I'm using dic.get(key, '') instead of dic['key'] for the same purpose, but without the key error if the key does not exist. When the key does not exist, it outputs the empty string '' specified.
data = """{257585701: [156173119], 667512043: [228087519], 821360748: [5350676]}
{136607969: [13510118], 667512043: [13510118], 257585701: [13510118]}
{136607969: [135101], 667512043: [135101], 257585701: [135101]}"""
#dict_list = [eval(e) for e in data.split('\n')] #NOT safe, do NOT use this!
import ast
dict_list = [ast.literal_eval(e) for e in data.split('\n')] #use this
Output dict_list
[{257585701: [156173119], 667512043: [228087519], 821360748: [5350676]},
{136607969: [13510118], 667512043: [13510118], 257585701: [13510118]},
{136607969: [135101], 667512043: [135101], 257585701: [135101]}]
I'm assuming data is from print results, and they are separated by new line \n, so they can be processed into Python dict above.
keys = []
result = {}
for dic in dict_list:
keys.extend(dic.keys())
keys = set(keys)
for key in keys:
result[key] = []
for dic in dict_list:
result[key] += dic.get(key, '')
print(result)
Output:
{136607969: [13510118, 135101],
667512043: [228087519, 13510118, 135101],
821360748: [5350676],
257585701: [156173119, 13510118, 135101]}

PYTHON - Fastest Way of Flattening/Exploding multiple large JSON files with nested arrays, have more than 100000 json files

I have written an efficient JSON flattening logic that explodes and join nested JSON arrays it works faster for on JSON with more than 100s of nested arrays and nested dict but problem is now I have 100000 JSON files to handle. is there a way to either merge multiple JSONs to one big ad run this code or something else, any help will be great....
I know there are some duplicate question but this is mainly regarding the efficiently handling large number of large JSON files
# let's say I have this json and flattening/exploding code:
from collections import defaultdict, MutableMapping
from copy import deepcopy
import pandas as pd
sample = {
"rss": {
"overview": {
"id": {
"data": [
{
"stuff": [
{
"onetype": [
{"id": '1', "name": "John Doe"},
{"id": '2', "name": "Don Joeh"},
]
},
{"othertype": [{"id": '2', "company": "ACME"}]},
]
},
{"otherstuff": [{"thing": [['1', '42'], ['2', '2']]}]},
]
}
}
}
}
# Flattening with exploding Logic:
def cross_join(left, right):
new_rows = [] if right else left
for left_row in left:
for right_row in right:
temp_row = deepcopy(left_row)
for key, value in right_row.items():
temp_row[key] = value
new_rows.append(deepcopy(temp_row))
return new_rows
def dict_maker(dic_list):
dd = defaultdict(list)
for d in dic_list:
for key, value in d.items():
dd[key].append(value)
return dd
def flatten_list(data):
for elem in data:
if isinstance(elem, list):
yield from flatten_list(elem)
else:
yield elem
def flatten_struct(data, prev_heading=""):
if isinstance(data, dict):
rows = [{}]
for key, value in data.items():
rows = cross_join(rows, flatten_struct(value, prev_heading + "_" + key))
elif isinstance(data, list):
rows = []
for i in range(len(data)):
[
rows.append(elem)
for elem in flatten_list(flatten_struct(data[i], prev_heading))
]
else:
rows = [{prev_heading[1:]: data}]
return rows
def flatten(d, parent_key="", sep="_"):
items = []
if isinstance(d, dict):
for k, v in d.items():
new_key = parent_key + sep + k if parent_key else k
if isinstance(v, MutableMapping):
items.extend(flatten(v, new_key, sep=sep).items())
else:
items.append((new_key, v))
else:
{}
return dict(items)
def get_section_df(section, section_grp, id=None):
df_lst = []
finalMap = {}
for elem in section:
d = flatten(elem)
flat = [
{k + "_" + key: val for key, val in dict_maker(flatten_struct(v)).items()}
if isinstance(v, list)
else {k: v}
for k, v in d.items()
]
for new_d in flat:
finalMap.update(new_d)
# finalMap.update({k:v for k,v in id})
if len(finalMap) > 0:
df = pd.concat(
{
str(section_grp)
+ "_"
+ k.replace("#", "").replace("#", ""): pd.Series(v)
for k, v in finalMap.items()
},
axis=1,
)
df_lst.append(df)
return df_lst
def process(json_sample):
df_list = []
master_d = flatten(json_sample)
master_keys = [k for k in master_d.keys() if type(master_d.get(k)) == list]
grouped_path_dict = {x: x.split("_")[2] for x in master_keys}
master_id = ''
for flatted in master_keys:
lst = master_d.get(flatted)
path_group = grouped_path_dict.get(flatted)
# if isinstance(lst, list):
if len(get_section_df(section=lst, id=master_id, section_grp=path_group)) > 0:
pdf = pd.concat(
get_section_df(section=lst, id=master_id, section_grp=path_group)
)
df_list.append(pdf)
df = pd.concat(df_list)
return df
print(process(json_sample=sample))
id_stuff_onetype_id id_stuff_onetype_name id_stuff_othertype_id id_stuff_othertype_company id_otherstuff_thing
0 1 John Doe 2 ACME NaN
1 2 Don Joeh NaN NaN NaN
0 1 John Doe 2 ACME 1
1 2 Don Joeh NaN NaN 42
2 NaN NaN NaN NaN 2
3 NaN NaN NaN NaN 2

Duplicate values in a dictionary

I am trying to read through a csv file in the following format:
number,alphabet
1,a
2,b
3,c
2,b
1,a
My code to create a dictionary:
alpha = open('alpha.csv','r')
csv_alpha = csv.reader(alpha)
alpha_file = {row[0]:row[1] for row in csv_alpha}
OUTPUT:
alpha_file = { 1:'a', 2:'b', 3:'c' }
By looking at the file, 1 and 2 have duplicate values.
How can i possibly change my output to :
alpha_file = { 1:'a', 1:'a', 2:'b', 2:'b', 3:'c' }
LNG - PYTHON
use a list to hold key's value
alpha = open('alpha.csv','r')
csv_alpha = csv.reader(alpha)
alpha_file = dict()
for row in csv_alpha:
if row[0] in alpha_file:
alpha_file[row[0]].append(row[1])
else:
alpha_file[row[0]] = [row[1]]
the output will be like:
{ 1:['a','a'],2:['b','b'], 3:['c'] }
to output the number of key occurrences, use a for loop
d = { 1:['a','a'],2:['b','b'], 3:['c'] }
amount = []
for key, value in d.iteritems():
amount += [key] * len(value)
print amount
output looks like:
[1, 1, 2, 2, 3]

Split dictionary key and list of values from dict

I want to split keys and values and display the dictionary result below mentioned format. I'm reading a file and splitting the data into list and later moving to dictionary.
Please help me to get the result.
INPUT FILE - commands.txt
login url=http://demo.url.net username=test#url.net password=mytester
create-folder foldername=demo
select-folder foldername=test123
logout
Expected result format
print result_dict
"0": {
"login": [
{
"url": "http://demo.url.net",
"username": "test#url.net",
"password": "mytester"
}
]
},
"1": {
"create-folder": {
"foldername": "demo"
}
},
"2": {
"select-folder": {
"foldername": "test-folder"
}
},
"3": {
"logout": {}
}
CODE
file=os.path.abspath('catalog/commands.txt')
list_output=[f.rstrip().split() for f in open(file).readlines()]
print list_output
counter=0
for data in list_output:
csvdata[counter]=data[0:]
counter=counter+1
print csvdata
for key,val in csvdata.iteritems():
for item in val:
if '=' in item:
key,value=item.split("=")
result[key]=value
print result
As a function:
from collections import defaultdict
from itertools import count
def read_file(file_path):
result = defaultdict(dict)
item = count()
with open(file_path) as f:
for line in f:
if not line:
continue
parts = line.split()
result[next(item)][parts[0]] = dict(p.split('=') for p in parts[1:])
return dict(result)
Better example and explanation:
s = """
login url=http://demo.url.net username=test#url.net password=mytester
create-folder foldername=demo
select-folder foldername=test123
logout
"""
from collections import defaultdict
from itertools import count
result_dict = defaultdict(dict)
item = count()
# pretend you opened the file and are reading it line by line
for line in s.splitlines():
if not line:
continue # skip empty lines
parts = line.split()
result_dict[next(item)][parts[0]] = dict(p.split('=') for p in parts[1:])
With pretty print:
>>> pprint(dict(result_dict))
{0: {'login': {'password': 'mytester',
'url': 'http://demo.url.net',
'username': 'test#url.net'}},
1: {'create-folder': {'foldername': 'demo'}},
2: {'select-folder': {'foldername': 'test123'}},
3: {'logout': {}}}
lines = ["login url=http://demo.url.net username=test#url.net password=mytester",
"create-folder foldername=demo",
"select-folder foldername=test123",
"logout"]
result = {}
for no, line in enumerate(lines):
values = line.split()
pairs = [v.split('=') for v in values[1:]]
result[str(no)] = {values[0]: [dict(pairs)] if len(pairs) > 1 else dict(pairs)}
import pprint
pprint.pprint(result)
Output:
{'0': {'login': [{'password': 'mytester',
'url': 'http://demo.url.net',
'username': 'test#url.net'}]},
'1': {'create-folder': {'foldername': 'demo'}},
'2': {'select-folder': {'foldername': 'test123'}},
'3': {'logout': {}}}
But are you sure you need the extra list inside the login value? If not, just change [dict(pairs)] if len(pairs) > 1 else dict(pairs) to dict(pairs).
r = dict()
f = open('commands.txt')
for i, line in enumerate(f.readlines()):
r[str(i)] = dict()
actions = line.split()
list_actions = {}
for action in actions[1:]:
if "=" in action:
k, v = action.split('=')
list_actions[k] = v
if len(actions[1:]) > 1:
r[str(i)][actions[0]] = [list_actions]
else:
r[str(i)][actions[0]] = list_actions
print r
Should be work

Categories