Extracting key/value pairs from JSON file - python

I am trying to convert Python code to extract key/value pairs from JSON output (originating from Microsoft Form Recognizer) and cannot recreate the loop within VB.NET (UiPath).
So far I created a nested for-each loop in UiPath to loop through each key/value pair within each page.
import json
# Report selected key/value pairs from a Form Recognizer JSON response.
response_file = "response.json"

# Specify the keys to be reported (a set gives constant-time membership tests).
keys = {"Number", "Opened"}

with open(response_file, mode="r", encoding="utf-8") as f:
    data = json.load(f)

# Loop over all pages in the document.
for page in data["pages"]:
    # Loop over all key/value pairs in the page.
    for kvp in page["keyValuePairs"]:
        # A key is a list of text fragments; join them into one label.
        key_txt = " ".join(x["text"] for x in kvp["key"])
        # Report only the pre-specified subset of keys.
        if key_txt in keys:
            print("key: %s" % key_txt)
            # The value is likewise a list of text fragments.
            vals = [x["text"] for x in kvp["value"]]
            print("value: %s" % " ".join(vals))
The JSON example I am using:
{
"status": "success",
"pages": [
{
"number": 1,
"height": 792,
"width": 612,
"clusterId": 0,
"keyValuePairs": [
{
"key": [
{
"text": "Number",
"boundingBox": [
71.6,
704.6,
109.0,
704.6,
109.0,
693.6,
71.6,
693.6
]
}
],
"value": [
{
"text": "RITM0041763",
"boundingBox": [
178.7,
704.6,
241.4,
704.6,
241.4,
693.6,
178.7,
693.6
],
"confidence": 1.0
}
]
},
{
"key": [
{
"text": "Opened",
"boundingBox": [
321.0,
704.6,
357.8,
704.6,
357.8,
693.6,
321.0,
693.6
]
}
],
"value": [
{
"text": "09/21/2018 09:04:01 AM",
"boundingBox": [
428.1,
704.6,
536.5,
704.6,
536.5,
693.6,
428.1,
693.6
],
"confidence": 1.0
}
]
The error in UiPath (running on VB.NET) is: "Bracketed identifier is missing closing ']'".

Related

calculate json values average in python

How can I calculate, in Python, the average of the values in a JSON file such as the following example?
"items": [
{
"start": "0.6",
"end": "0.9",
"alter": [
{
"conf": "0.6",
"content": ""
}
],
"type": "pron"
},
]
import json
# Compute the mean confidence over all items in ./file.json.
# JSON text is UTF-8, so decode explicitly instead of relying on the
# platform's locale default encoding.
with open("./file.json", encoding="utf-8") as f:
    dict_data = json.load(f)  # parse the file object into a dictionary

# Confidence values are stored as strings; convert the first alternative
# of every item to float.
confidences = [float(i['alternatives'][0]['confidence']) for i in dict_data['items']]
confidence_avg = sum(confidences) / len(confidences)
print(confidence_avg)
Output:
0.8534666666666667
For starters, your JSON file is missing the first and last curly brackets, so I've added them manually. Without them, it is not valid JSON.
Use json.loads to parse the JSON string and return a dict.
The confidence values are stored as strings, so they need to be transformed to floats.
Add them up one by one and divide by the number of confidence values. In this case we assume each item has only one alternative.
import json
# Sample document embedded as a raw string so the snippet is self-contained.
json_str = r"""{
"items": [
{
"start_time": "0.0",
"end_time": "0.46",
"alternatives": [
{
"confidence": "0.9534",
"content": "رسالة"
}
],
"type": "pronunciation"
},
{
"start_time": "0.46",
"end_time": "0.69",
"alternatives": [
{
"confidence": "0.6475",
"content": "اللغة"
}
],
"type": "pronunciation"
},
{
"start_time": "0.69",
"end_time": "1.23",
"alternatives": [
{
"confidence": "0.9595",
"content": "العربية"
}
],
"type": "pronunciation"
}
]
}"""

# Parse the JSON text and keep only the list of items.
items = json.loads(json_str)["items"]

# Sum the confidence of the first alternative of every item; the values are
# stored as strings, so each one is converted to float first.
total = 0
for item in items:
    total += float(item["alternatives"][0]["confidence"])

# Mean confidence across all items.
average = total / len(items)
print(average)
Output:
0.8534666666666667

Getting all the Keys from JSON Object?

Goal: to create a script that takes a nested JSON object as input and outputs a CSV file with every key as a row.
Example:
{
"Document": {
"DocumentType": 945,
"Version": "V007",
"ClientCode": "WI",
"Shipment": [
{
"ShipmentHeader": {
"ShipmentID": 123456789,
"OrderChannel": "Shopify",
"CustomerNumber": 234234,
"VendorID": "2343SDF",
"ShipViaCode": "FEDX2D",
"AsnDate": "2018-01-27",
"AsnTime": "09:30:47-08:00",
"ShipmentDate": "2018-01-23",
"ShipmentTime": "09:30:47-08:00",
"MBOL": 12345678901234568,
"BOL": 12345678901234566,
"ShippingNumber": "1ZTESTTEST",
"LoadID": 321456987,
"ShipmentWeight": 10,
"ShipmentCost": 2.3,
"CartonsTotal": 2,
"CartonPackagingCode": "CTN25",
"OrdersTotal": 2
},
"References": [
{
"Reference": {
"ReferenceQualifier": "TST",
"ReferenceText": "Testing text"
}
}
],
"Addresses": {
"Address": [
{
"AddressLocationQualifier": "ST",
"LocationNumber": 23234234,
"Name": "John Smith",
"Address1": "123 Main St",
"Address2": "Suite 12",
"City": "Hometown",
"State": "WA",
"Zip": 92345,
"Country": "USA"
},
{
"AddressLocationQualifier": "BT",
"LocationNumber": 2342342,
"Name": "Jane Smith",
"Address1": "345 Second Ave",
"Address2": "Building 32",
"City": "Sometown",
"State": "CA",
"Zip": "23665-0987",
"Country": "USA"
}
]
},
"Orders": {
"Order": [
{
"OrderHeader": {
"PurchaseOrderNumber": 23456342,
"RetailerPurchaseOrderNumber": 234234234,
"RetailerOrderNumber": 23423423,
"CustomerOrderNumber": 234234234,
"Department": 3333,
"Division": 23423,
"OrderWeight": 10.23,
"CartonsTotal": 2,
"QTYOrdered": 12,
"QTYShipped": 23
},
"Cartons": {
"Carton": [
{
"SSCC18": 12345678901234567000,
"TrackingNumber": "1ZTESTTESTTEST",
"CartonContentsQty": 10,
"CartonWeight": 10.23,
"LineItems": {
"LineItem": [
{
"LineNumber": 1,
"ItemNumber": 1234567890,
"UPC": 9876543212,
"QTYOrdered": 34,
"QTYShipped": 32,
"QTYUOM": "EA",
"Description": "Shoes",
"Style": "Tall",
"Size": 9.5,
"Color": "Bllack",
"RetailerItemNumber": 2342333,
"OuterPack": 10
},
{
"LineNumber": 2,
"ItemNumber": 987654321,
"UPC": 7654324567,
"QTYOrdered": 12,
"QTYShipped": 23,
"QTYUOM": "EA",
"Description": "Sunglasses",
"Style": "Short",
"Size": 10,
"Color": "White",
"RetailerItemNumber": 565465456,
"OuterPack": 12
}
]
}
}
]
}
}
]
}
}
]
}
}
In the above JSON object, I want all the keys (nested ones included) in a list (duplicates can be removed by using a set data structure). If a nested key occurs multiple times in the actual JSON, it may also appear multiple times in the CSV.
I personally feel that recursion is a perfect application for this type of problem if the amount of nests you will encounter is unpredictable. Here I have written an example in Python of how you can utilise recursion to extract all keys. Cheers.
import json
# Accumulates every key found so far, one key per line.
row = ""


def _collect_keys(data, found):
    """Append every dict key in *data* to *found*, depth-first.

    A key is recorded before the keys nested inside its value; list
    elements are visited in order. Values that are neither dict nor list
    contribute nothing.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            found.append(key)
            _collect_keys(value, found)
    elif isinstance(data, list):
        for element in data:
            _collect_keys(element, found)


def extract_keys(data):
    """Append all keys of the nested structure *data* to the global ``row``.

    Args:
        data: A parsed JSON value (dict, list, or scalar).

    Each key is followed by a newline. Duplicate keys are kept, so a key
    that occurs several times in *data* appears several times in ``row``.
    """
    global row
    found = []
    _collect_keys(data, found)
    # Join once instead of growing the string key by key.
    row += "".join(key + "\n" for key in found)
# MAIN
# Load the nested JSON document; JSON text is UTF-8, so decode explicitly.
with open("input.json", "r", encoding="utf-8") as rfile:
    dicts = json.load(rfile)

# Walk the document and accumulate every key into the global ``row``.
extract_keys(dicts)

# Write the accumulated keys, one per line, to the output file.
with open("output.csv", "w", encoding="utf-8") as wfile:
    wfile.write(row)

Extract values from json based on select condition using python

I am trying to Extract values from json based on select condition using python.
My Json file looks like below:
{
"bindings": [
{
"members": [
"user:rohithmn3#gmail.com"
],
"role": "roles/browser"
},
{
"members": [
"serviceAccount:admin-user#linuxacademy-3.iam.gserviceaccount.com",
"user:rohithmn03#gmail.com"
],
"role": "roles/owner"
},
{
"members": [
"user:rohithmn3#gmail.com"
],
"role": "roles/viewer"
}
],
"etag": "BwrRsH-UhJ0=",
"version": 1
}
I am trying to parse this above file in python based on the user. For Example: Get the roles defined for user rohithmn3#gmail.com; as per the json the output should be :
roles/browser
roles/viewer
Regards,
Rohith
Using a list comprehension and dictionary input d:
# User whose roles should be listed.
var = 'rohithmn3#gmail.com'

# Collect the role of every binding whose member list contains the user.
res = []
for subd in d['bindings']:
    if 'user:'+var in subd['members']:
        res.append(subd['role'])
print(res)
['roles/browser', 'roles/viewer']
Setup
# Sample input dictionary mirroring the JSON from the question: a list of
# bindings, each pairing a list of members with a role.
d = {
"bindings": [
{
"members": [
"user:rohithmn3#gmail.com"
],
"role": "roles/browser"
},
{
"members": [
"serviceAccount:admin-user#linuxacademy-3.iam.gserviceaccount.com",
"user:rohithmn03#gmail.com"
],
"role": "roles/owner"
},
{
"members": [
"user:rohithmn3#gmail.com"
],
"role": "roles/viewer"
}
],
"etag": "BwrRsH-UhJ0=",
"version": 1
}

How find data from JSON using python and watson discovery news

{
"matching_results": 1264,
"results": [
{
"main_image_url": "https://s4.reutersmedia.net/resources_v2/images/rcom-default.png",
"enriched_text": {
"entities": [
{
"relevance": 0.33,
"disambiguation": {
"subtype": [
"Country"
]
},
"sentiment": {
"score": 0
},
"type": "Location",
"count": 1,
"text": "China"
},
{
"relevance": 0.33,
"disambiguation": {
"subtype": [
"Country"
]
},
"sentiment": {
"score": 0
},
This file is very large, so I want to find only the "relevance" and "score" values using Python.
How can I fetch this information?
Regardless of how large it is, it is only a simple dictionary.
Iterate the lists. Extract the key-values.
# Walk every result and every entity inside it, printing the entity's
# relevance followed by its sentiment score.
# NOTE(review): assumes ``data`` is the already-parsed JSON response.
for result in data['results']:
    for e in result['enriched_text']['entities']:
        print(e['relevance'])
        print(e['sentiment']['score'])

API Nested JSON Response TO CSV

I am trying to convert a Nested JSON Response to CSV. Following is the JSON Response
{
"rows": [
[
{
"postId": 188365573,
"messageId": 198365562,
"accountId": 214,
"messageType": 2,
"channelType": "TWITTER",
"accountType": "TWITTER",
"taxonomy": {
"campaignId": "2521_4",
"clientCustomProperties": {
"PromotionChannelAbbreviation": [
"3tw"
],
"PromotionChannels": [
"Twitter"
],
"ContentOwner": [
"Audit"
],
"Location": [
"us"
],
"Sub_Category": [
"dbriefs"
],
"ContentOwnerAbbreviation": [
"aud"
],
"PrimaryPurpose_Outcome": [
"Engagement"
],
"PrimaryPurposeOutcomeAbbv": [
"eng"
]
},
"partnerCustomProperties": {},
"tags": [],
"urlShortnerDomain": "2721_spr.ly"
},
"approval": {
"approvalOption": "NONE",
"comment": ""
},
"status": "SENT",
"createdDate": 1433331585000,
"scheduleDate": 1435783440000,
"version": 4,
"deleted": false,
"publishedDate": 1435783441000,
"statusID": "6163465412728176",
"permalink": "https://twitter.com/Acctg/status/916346541272498176",
"additional": {
"links": []
}
},
0
],
[
{
"postId": 999145171,
"messageId": 109145169,
"accountId": 21388,
"messageType": 2,
"channelType": "TWITTER",
"accountType": "TWITTER",
"taxonomy": {
"campaignId": "2521_4",
"clientCustomProperties": {
"PromotionChannelAbbreviation": [
"3tw"
],
"Eminence_Registry_Number": [
"1000159"
],
"PromotionChannels": [
"Twitter"
],
"ContentOwner": [
"Ctr. Health Solutions"
],
"Location": [
"us"
],
"Sub_Category": [
"fraud"
],
"ContentOwnerAbbreviation": [
"chs"
],
"PrimaryPurpose_Outcome": [
"Awareness"
],
"PrimaryPurposeOutcomeAbbv": [
"awa"
]
},
"partnerCustomProperties": {},
"tags": [],
"urlShortnerDomain": "2521_spr.ly"
},
"approval": {
"approvalOption": "NONE",
"comment": ""
},
"status": "SENT",
"createdDate": 1434983660000,
"scheduleDate": 1435753800000,
"version": 4,
"deleted": false,
"publishedDate": 1435753801000,
"statusID": "616222222198407168",
"permalink": "https://twitter.com/Health/status/6162222221984070968",
"additional": {
"links": []
}
},
0
]
]
}
And the Python code I am using to convert this is:
import json
import csv
def flatten_record(record, parent_key="", sep="."):
    """Return a one-level dict built from the nested dict *record*.

    Args:
        record: Dict whose values may themselves be dicts or lists.
        parent_key: Column prefix used while recursing into nested dicts.
        sep: Separator placed between a parent key and a child key.

    Returns:
        A dict mapping column names to cell values. Nested dict keys are
        joined with *sep* (``taxonomy.campaignId``), list values are joined
        with ``;`` into a single cell, and an empty dict becomes an empty
        cell so its column is still present.
    """
    flat = {}
    for key, value in record.items():
        column = parent_key + sep + key if parent_key else key
        if isinstance(value, dict):
            if value:
                flat.update(flatten_record(value, column, sep))
            else:
                flat[column] = ""
        elif isinstance(value, list):
            flat[column] = ";".join(str(item) for item in value)
        else:
            flat[column] = value
    return flat


def main():
    """Convert Post_Insights_test.json into Data_table2.csv."""
    # importing the data
    with open('Post_Insights_test.json') as Test:
        data1 = json.load(Test)

    # Each entry of "rows" is a list whose first element is the record dict.
    # Flatten every row present instead of assuming a fixed count of 70.
    records = [flatten_record(entry[0]) for entry in data1["rows"]]

    # Responses differ in their keys, so the header is the union of all
    # columns, kept in first-seen order.
    header = []
    seen = set()
    for record in records:
        for column in record:
            if column not in seen:
                seen.add(column)
                header.append(column)

    # opening the csv; newline='' stops the csv module's own line endings
    # from being translated again, which caused the blank row after every
    # entry.
    with open('Data_table2.csv', 'w', newline='') as csvdata:
        csvwriter = csv.DictWriter(csvdata, fieldnames=header, restval='', delimiter=',')
        csvwriter.writeheader()
        csvwriter.writerows(records)


if __name__ == "__main__":
    main()
Problems are following:
Unable to get the keys for nested responses like taxonomy
Unable to get the values for nested responses like taxonomy
Many responses have different headers/keys, so ideally all of them should appear as headers in my Excel sheet, but I cannot figure out how to do this in Python
My Excel sheet shows a blank row after every entry, and I don't know why
Please help. All criticism are welcome. Kind Regards

Categories