Extract values from JSON based on select condition using Python

I am trying to extract values from JSON based on a select condition using Python.
My Json file looks like below:
{
"bindings": [
{
"members": [
"user:rohithmn3#gmail.com"
],
"role": "roles/browser"
},
{
"members": [
"serviceAccount:admin-user#linuxacademy-3.iam.gserviceaccount.com",
"user:rohithmn03#gmail.com"
],
"role": "roles/owner"
},
{
"members": [
"user:rohithmn3#gmail.com"
],
"role": "roles/viewer"
}
],
"etag": "BwrRsH-UhJ0=",
"version": 1
}
I am trying to parse the above file in Python based on the user. For example: get the roles defined for user rohithmn3#gmail.com; as per the JSON, the output should be:
roles/browser
roles/viewer
Regards,
Rohith

Using a list comprehension and dictionary input d:
var = 'rohithmn3#gmail.com'
res = [subd['role'] for subd in d['bindings'] if 'user:'+var in subd['members']]
print(res)
['roles/browser', 'roles/viewer']
Setup
d = {
"bindings": [
{
"members": [
"user:rohithmn3#gmail.com"
],
"role": "roles/browser"
},
{
"members": [
"serviceAccount:admin-user#linuxacademy-3.iam.gserviceaccount.com",
"user:rohithmn03#gmail.com"
],
"role": "roles/owner"
},
{
"members": [
"user:rohithmn3#gmail.com"
],
"role": "roles/viewer"
}
],
"etag": "BwrRsH-UhJ0=",
"version": 1
}
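
Alternatively, if the policy lives in a JSON file rather than an in-memory dict, a minimal sketch of the same lookup (policy.json is a hypothetical file name) would be:

import json

# Hypothetical file name; point this at wherever the policy JSON shown in the question is saved
with open('policy.json') as f:
    d = json.load(f)

var = 'rohithmn3#gmail.com'
res = [subd['role'] for subd in d['bindings'] if 'user:' + var in subd['members']]
print(res)  # ['roles/browser', 'roles/viewer']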

Related

Python Cubes OLAP Framework - How to sum a json column?

I started using Python Cubes OLAP recently.
I'm trying to sum/avg a JSON Postgres column; how can I do this?
my db structure:
events
  id
  object_type
  sn_name
spectra
  id
  snx_wavelengths (json column)
  event_id
my json:
{
"dimensions": [
{
"name": "event",
"levels": [
{
"name": "object_type",
"label": "Object Type",
"attributes": [
"object_type"
]
},
{
"name": "sn_name",
"label": "name",
"attributes": [
"sn_name"
]
}
]
},
{
"name": "spectra",
"levels": [
{
"name": "catalog_name",
"label": "Catalog Name",
"attributes": [
"catalog_name"
]
},
{
"name": "capture_date",
"label": "Capture Date",
"attributes": [
"capture_date"
]
}
]
},
{
"name": "date"
}
],
"cubes": [
{
"id": "uid",
"name": "14G31Yx98ZG8aEhFHjOWNNBmFOETg5APjZo5AiHaqog5YxLMK5",
"dimensions": [
"event",
"spectra",
"date"
],
"aggregates": [
{
"name": "event_snx_wavelengths_sum",
"function": "sum",
"measure": "event.snx_wavelengths"
},
{
"name": "record_count",
"function": "count"
}
],
"joins": [
{
"master": "14G31Yx98ZG8aEhFHjOWNNBmFOETg5APjZo5AiHaqog5YxLMK5.id",
"detail": "spectra.event_id"
},
],
"mappings": {
"event.sn_name": "sn_name",
"event.object_type": "object_type",
"spectra.catalog_name": "spectra.catalog_name",
"spectra.capture_date": "spectra.capture_date",
"event.snx_wavelengths": "spectra.snx_wavelengths",
"date": "spectra.capture_date"
},
}
]
}
I'm getting the following error:
Unknown attribute ''event.snx_wavelengths''
Can anyone help?
I already tried using MongoDB to do the sum, but I didn't have any success.

Extract specific column from csv stored in S3

I am querying Athena through Lambda, and the results are stored in CSV format in an S3 bucket.
The CSV files have two columns - eventTime and instanceId.
I am reading the CSV file via one of the functions in my Lambda handler:
def read_instanceids(path):
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('aws-athena-query-results-mybucket-us-east-1')
    obj = bucket.Object(key=path)
    response = obj.get()
    lines = response['Body'].read().decode('utf-8').split()
    return lines
Output:
[
"\"eventTime\",\"instanceId\"",
"\"2021-09-27T19:46:08Z\",\"\"\"i-0aa1f4dd\"\"\"",
"\"2021-09-27T21:04:13Z\",\"\"\"i-0465c287\"\"\"",
"\"2021-09-27T21:10:48Z\",\"\"\"i-08b75f79\"\"\"",
"\"2021-09-27T19:40:43Z\",\"\"\"i-0456700b\"\"\"",
"\"2021-03-29T21:58:40Z\",\"\"\"i-0724f99f\"\"\"",
"\"2021-03-29T23:27:44Z\",\"\"\"i-0fafbe64\"\"\"",
"\"2021-03-29T21:41:12Z\",\"\"\"i-0064a8552\"\"\"",
"\"2021-03-29T23:19:09Z\",\"\"\"i-07f5f08e5\"\"\""
]
I want to store only the instance IDs in one array. How can I achieve that? I can't use Pandas/NumPy.
If I use get_query_results and return the response, it is in the below format:
[
{
"Data": [
{
"VarCharValue": "eventTime"
},
{
"VarCharValue": "instanceId"
}
]
},
{
"Data": [
{
"VarCharValue": "2021-09-23T22:36:15Z"
},
{
"VarCharValue": "\"i-053090803\""
}
]
},
{
"Data": [
{
"VarCharValue": "2021-03-29T21:58:40Z"
},
{
"VarCharValue": "\"i-0724f62a\""
}
]
},
{
"Data": [
{
"VarCharValue": "2021-03-29T21:41:12Z"
},
{
"VarCharValue": "\"i-552\""
}
]
},
{
"Data": [
{
"VarCharValue": "2021-03-29T23:19:09Z"
},
{
"VarCharValue": "\"i-07f4e5\""
}
]
},
{
"Data": [
{
"VarCharValue": "2021-03-29T23:03:09Z"
},
{
"VarCharValue": "\"i-0eb453\""
}
]
},
{
"Data": [
{
"VarCharValue": "2021-03-30T19:18:11Z"
},
{
"VarCharValue": "\"i-062120\""
}
]
},
{
"Data": [
{
"VarCharValue": "2021-03-30T18:15:26Z"
},
{
"VarCharValue": "\"i-0121a04\""
}
]
},
{
"Data": [
{
"VarCharValue": "2021-03-29T23:27:44Z"
},
{
"VarCharValue": "\"i-0f213\""
}
]
},
{
"Data": [
{
"VarCharValue": "2021-03-30T18:07:05Z"
},
{
"VarCharValue": "\"i-0ee19d8\""
}
]
},
{
"Data": [
{
"VarCharValue": "2021-04-28T14:49:22Z"
},
{
"VarCharValue": "\"i-04ad3c29\""
}
]
},
{
"Data": [
{
"VarCharValue": "2021-04-28T14:38:43Z"
},
{
"VarCharValue": "\"i-7c6166\""
}
]
},
{
"Data": [
{
"VarCharValue": "2021-03-30T19:13:42Z"
},
{
"VarCharValue": "\"i-07bc579d\""
}
]
},
{
"Data": [
{
"VarCharValue": "2021-04-29T19:47:34Z"
},
{
"VarCharValue": "\"i-0b8bc7df5\""
}
]
}
]
You can use the result returned from Amazon Athena via get_query_results().
If the data variable contains the JSON shown in your question, you can extract a list of the instances with:
rows = [row['Data'][1]['VarCharValue'].replace('"', '') for row in data]
print(rows)
The output is:
['instanceId', 'i-053090803', 'i-0724f62a', 'i-552', 'i-07f4e5', 'i-0eb453', 'i-062120', 'i-0121a04', 'i-0f213', 'i-0ee19d8', 'i-04ad3c29', 'i-7c6166', 'i-07bc579d', 'i-0b8bc7df5']
You can skip the column header by referencing: rows[1:]
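For completeness, a minimal sketch of pulling that structure directly with boto3 (the QueryExecutionId below is a placeholder, not a value from the question):

import boto3

athena = boto3.client('athena')
# Placeholder execution ID; use the one returned by start_query_execution()
response = athena.get_query_results(QueryExecutionId='00000000-0000-0000-0000-000000000000')
data = response['ResultSet']['Rows']
# Second column of each row is the instance ID; [1:] drops the header row
instance_ids = [row['Data'][1]['VarCharValue'].replace('"', '') for row in data][1:]
print(instance_ids)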
If your list were valid, you could do:
l = [ "eventTime",
"instanceId",
"2021-09-27T19:46:08Z",
"i-0aa1f4dd",
"2021-09-27T21:04:13Z",
"""i-0465c287""",
"2021-09-27T21:10:48Z",
"""i-08b75f79""",
"2021-09-27T19:40:43Z",
"""i-0456700b""",
"2021-03-29T21:58:40Z",
"""i-0724f99f""",
"2021-03-29T23:27:44Z",
"""i-0fafbe64""",
"2021-03-29T21:41:12Z",
"""i-0064a8552""",
"2021-03-29T23:19:09Z",
"""i-07f5f08e5""" ]
print(l[2:][1::2])
['i-0aa1f4dd', 'i-0465c287', 'i-08b75f79', 'i-0456700b', 'i-0724f99f', 'i-0fafbe64', 'i-0064a8552', 'i-07f5f08e5']
Python has a csv module in the standard library: https://docs.python.org/3/library/csv.html
But in this use case, if the instance IDs don't contain commas, you can simply split each line on the comma, take the second field, and strip the double quotes.
def read_instanceids(path):
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('aws-athena-query-results-mybucket-us-east-1')
    obj = bucket.Object(key=path)
    response = obj.get()
    lines = response['Body'].read().decode('utf-8').split()
    return [line.split(',')[1].strip('"') for line in lines[1:]]
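
As a variant on the same idea, here is a hedged sketch that uses csv.reader instead of manual splitting, which also copes with quoted fields (body_text is assumed to be the decoded object body from above):

import csv
import io

def extract_instance_ids(body_text):
    # Parse the Athena CSV output and keep only the instanceId column, skipping the header
    reader = csv.reader(io.StringIO(body_text))
    next(reader, None)
    return [row[1].strip('"') for row in reader]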

Using pandas to convert csv into nested json with dynamic structure

I am new to Python and want to convert a CSV file into a JSON file. The JSON is nested with a dynamic structure, where the structure is defined by the CSV header.
From csv input:
ID, Name, person_id/id_type, person_id/id_value,person_id_expiry_date,additional_info/0/name,additional_info/0/value,additional_info/1/name,additional_info/1/value,salary_info/details/0/grade,salary_info/details/0/payment,salary_info/details/0/amount,salary_info/details/1/next_promotion
1,Peter,PASSPORT,A452817,1-01-2055,Age,19,Gender,M,Manager,Monthly,8956.23,unknown
2,Jane,PASSPORT,B859804,2-01-2035,Age,38,Gender,F,Worker, Monthly,125980.1,unknown
To json output:
[
{
"ID": 1,
"Name": "Peter",
"person_id": {
"id_type": "PASSPORT",
"id_value": "A452817"
},
"person_id_expiry_date": "1-01-2055",
"additional_info": [
{
"name": "Age",
"value": 19
},
{
"name": "Gender",
"value": "M"
}
],
"salary_info": {
"details": [
{
"grade": "Manager",
"payment": "Monthly",
"amount": 8956.23
},
{
"next_promotion": "unknown"
}
]
}
},
{
"ID": 2,
"Name": "Jane",
"person_id": {
"id_type": "PASSPORT",
"id_value": "B859804"
},
"person_id_expiry_date": "2-01-2035",
"additional_info": [
{
"name": "Age",
"value": 38
},
{
"name": "Gender",
"value": "F"
}
],
"salary_info": {
"details": [
{
"grade": "Worker",
"payment": " Monthly",
"amount": 125980.1
},
{
"next_promotion": "unknown"
}
]
}
}
]
Is this something that can be done with the existing pandas API, or do I have to write a lot of complex code to dynamically construct the JSON object? Thanks.
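
As far as I know there is no single pandas call that produces this nesting directly from the slash-delimited headers, so here is a hedged sketch using only the standard library (input.csv is a hypothetical file name, and values are kept as strings, so the numeric conversion seen in the expected output is left out):

import csv
import json

def insert(node, parts, value):
    # Walk header parts like ['salary_info', 'details', '0', 'grade'], creating dicts/lists as needed
    key = int(parts[0]) if parts[0].isdigit() else parts[0]
    if len(parts) == 1:
        node[key] = value
        return
    child_is_list = parts[1].isdigit()
    if isinstance(key, int):
        while len(node) <= key:
            node.append([] if child_is_list else {})
        child = node[key]
    else:
        child = node.setdefault(key, [] if child_is_list else {})
    insert(child, parts[1:], value)

records = []
with open('input.csv', newline='') as f:  # hypothetical file name for the CSV shown above
    for row in csv.DictReader(f, skipinitialspace=True):
        record = {}
        for header, value in row.items():
            insert(record, header.strip().split('/'), value)
        records.append(record)
print(json.dumps(records, indent=2))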

Converting nested JSON structures to Pandas DataFrames

I've been struggling with the nested structure in this JSON and with how to convert it to the correct form:
{
"id": "0c576f35-d704-4fa8-8cbb-311c6be36358",
"employee_id": null,
"creator_id": "16ca2db9-206c-4e18-891d-a00a5252dbd3",
"closed_by_id": null,
"request_number": 23,
"priority": "2",
"form_id": "urlaub-weitere-abwesenheiten",
"status": "opened",
"name": "Urlaub & weitere Abwesenheiten",
"read_by_employee": false,
"custom_status": {
"id": 15793,
"name": "In Bearbeitung HR"
},
"due_date": null,
"created_at": "2021-03-29T15:18:37.572040+02:00",
"updated_at": "2021-03-29T15:22:15.590156+02:00",
"closed_at": null,
"archived_at": null,
"attachment_count": 1,
"category": {
"id": "payroll-time-management",
"name": "Payroll, Time & Attendance"
},
"public_comment_count": 0,
"form_data": [
{
"field_id": "subcategory",
"values": [
"Time & Attendance - Manage monthly/year-end consolidation and report"
]
},
{
"field_id": "separator-2",
"values": [
null
]
},
{
"field_id": "art-der-massnahme",
"values": [
"Fortbildung"
]
},
{
"field_id": "bezeichnung-der-schulung-kurses",
"values": [
"dfgzhujiko"
]
},
{
"field_id": "startdatum",
"values": [
"2021-03-26"
]
},
{
"field_id": "enddatum",
"values": [
"2021-03-27"
]
},
{
"field_id": "freistellung",
"values": [
"nein"
]
},
{
"field_id": "mit-bildungsurlaub",
"values": [
""
]
},
{
"field_id": "kommentarfeld_fortbildung",
"values": [
""
]
},
{
"field_id": "separator",
"values": [
null
]
},
{
"field_id": "instructions",
"values": [
null
]
},
{
"field_id": "entscheidung-hr-bp",
"values": [
"Zustimmen"
]
},
{
"field_id": "kommentarfeld-hr-bp",
"values": [
"wsdfghjkmhnbgvfcdxsybvnm,"
]
},
{
"field_id": "individuelle-abstimmung",
"values": [
""
]
}
],
"form_files": [
{
"id": 30129,
"filename": "empty_background.png",
"field_id": "anhang"
}
],
"visible_by_employee": false,
"organization_ids": [],
"need_edit_by_employee": false,
"attachments": []
}
Using a simple solution with pandas DataFrames:
Request = pd.DataFrame.from_dict(pd.json_normalize(data), orient='columns')
it displays almost in its correct form.
How do I split the dictionaries out of the form_data and form_files columns? I've done a lot of research, but I'm still having a lot of trouble solving this problem: how to expand form_data into columns rather than rows, keyed back to the parent id.
You can do something like this. Pass the DataFrame and the column name to the function as arguments:
import ast
import pandas as pd

def explode_node(child_df, column_value):
    child_df = child_df.dropna(subset=[column_value])
    # Parse stringified lists/dicts, then expand the nested records and join back onto the parent rows
    if isinstance(child_df[column_value].iloc[0], str):
        child_df[column_value] = child_df[column_value].apply(ast.literal_eval)
    expanded_child_df = (pd.concat({i: pd.json_normalize(x) for i, x in child_df.pop(column_value).items()})
                         .reset_index(level=1, drop=True)
                         .join(child_df, how='right', lsuffix='_left', rsuffix='_right')
                         .reset_index(drop=True))
    expanded_child_df.columns = map(str.lower, expanded_child_df.columns)
    return expanded_child_df
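
A hypothetical usage, assuming data holds the parsed JSON from the question:

# Expand each nested column into its own flat frame, joined back to the parent row
Request = pd.json_normalize(data)
form_data_df = explode_node(Request, 'form_data')
form_files_df = explode_node(Request, 'form_files')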

How to search through JSON object for specific list of expected key values?

I have this JSON object, and I am curious how to iterate through serviceCatalog:name and alert on any name that does not equal "service-foo" or "service-bar".
Here is my JSON object:
{
"access": {
"serviceCatalog": [
{
"endpoints": [
{
"internalURL": "https://snet-storage101.example.com//v1.0",
"publicURL": "https://storage101.example.com//v1.0",
"region": "LON",
"tenantId": "1
},
{
"internalURL": "https://snet-storage101.example.com//v1.0",
"publicURL": "https://storage101.example.com//v1.0",
"region": "USA",
"tenantId": "1
}
],
"name": "service-foo",
"type": "object-store"
},
{
"endpoints": [
{
"publicURL": "https://x.example.com:9384/v1.0/x",
"tenantId": "6y5t4re32"
}
],
"name": "service-bar",
"type": "rax:test"
},
{
"endpoints": [
{
"publicURL": "https://y.example.com:9384/v1.0/x",
"tenantId": "765432"
}
],
"name": "service-thesystem",
"type": "rax:test"
}
]
}
If x is the above-mentioned dictionary, you could do:
for item in x["access"]["serviceCatalog"]:
    if item["name"] not in ["service-foo", "service-bar"]:
        print(item["name"])
P.S. You can use json.loads() to decode the JSON data if that is what you are asking about. Also note that you have errors in your JSON (unterminated tenantId strings and a missing closing brace).
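
For instance, a minimal sketch of decoding from a raw JSON string first (using a trimmed, corrected version of the catalog above):

import json

raw = '{"access": {"serviceCatalog": [{"name": "service-foo"}, {"name": "service-thesystem"}]}}'
x = json.loads(raw)
for item in x["access"]["serviceCatalog"]:
    if item["name"] not in ["service-foo", "service-bar"]:
        print(item["name"])  # prints: service-thesystem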
