How to use S3 Select for Nested Parquet Objects

How to use S3 Select for Nested Parquet Objects - python

I have dumped data into a parquet file.
When I use
SELECT * FROM s3object s LIMIT 1
it gives me the following result.
{
"name": "John",
"age": "45",
"country": "USA",
"experience": [{
"company": {
"name": "ABC",
"years": "10",
"position": "Manager"
}
},
{
"company": {
"name": "BBC",
"years": "2",
"position": "Assistant"
}
}
]
}
I want to filter the result down to the entries where company.name = "ABC",
so the output should look like the following.
{
"name": "John",
"age": "45",
"country": "USA",
"experience": [{
"company": {
"name": "ABC",
"years": "10",
"position": "Manager"
}
}
]
}
or this
{
"name": "John",
"age": "45",
"country": "USA",
"experience.company.name": "ABC",
"experience.company.years": "10",
"experience.company.position": "Manager"
}
Any support is highly appreciated.
Thanks.

Related

Update nested JSON with name of json file name

I'm wondering if you could help me with filling jsons with their original filenames.
Here is a sample of json:
jsv is a list of JSON objects (the top-level key of each one is the document number: document_0, document_1, ...)
jsv =
[
{
{
"document_0":{
"id":111,
"laboratory":"xxx",
"document_type":"xxx",
"language":"pl",
"creation_date":"09-12-2022",
"source_filename":"None",
"version":"0.1",
"exams_ocr_avg_confidence":0.0,
"patient_data":{
"first_name":"YYYY",
"surname":"YYYY",
"pesel":"12345678901",
"birth_date":"1111-22-22",
"sex":"F",
"age":"None"
},
"exams":[
{
"name":"xx",
"sampling_date":"2020-11-30",
"comment":"None",
"confidence":97,
"result":"222",
"unit":"ml",
"norm":"None",
"material":"None",
"icd9":"uuuuu"
},
{
"document_1":{
"id":111,
"laboratory":"xxx",
"document_type":"xxx",
"language":"pl",
"creation_date":"09-12-2022",
"source_filename":"None",
"version":"0.1",
"exams_ocr_avg_confidence":0.0,
"patient_data":{
"first_name":"YYYY",
"surname":"YYYY",
"pesel":"12345678901",
"birth_date":"1111-22-22",
"sex":"F",
"age":"None"
},
"exams":[
{
"name":"xx",
"sampling_date":"2020-11-30",
"comment":"None",
"confidence":97,
"result":"222",
"unit":"ml",
"norm":"None",
"material":"None",
"icd9":"uuuuu"
}
}
]
Inside each of these JSON documents there is a key, source_filename, which I want to update with the real name of the JSON file it came from.
my folder with files as an example:
'11111.pdf.json',
'11112.pdf.json',
'11113.pdf.json',
'11114.pdf.json',
'11115.pdf.json'
What I want to achieve:
jsv =
[
{
{
"document_0":{
"id":111,
"laboratory":"xxx",
"document_type":"xxx",
"language":"pl",
"creation_date":"09-12-2022",
"source_filename":"11111.pdf.json",
"version":"0.1",
"exams_ocr_avg_confidence":0.0,
"patient_data":{
"first_name":"YYYY",
"surname":"YYYY",
"pesel":"12345678901",
"birth_date":"1111-22-22",
"sex":"F",
"age":"None"
},
"exams":[
{
"name":"xx",
"sampling_date":"2222-22-22",
"comment":"None",
"confidence":22,
"result":"222",
"unit":"ml",
"norm":"None",
"material":"None",
"icd9":"uuuuu"
},
{
"document_1":{
"id":111,
"laboratory":"xxx",
"document_type":"xxx",
"language":"pl",
"creation_date":"22-22-2222",
"source_filename":"11111.pdf.json",
"version":"0.1",
"exams_ocr_avg_confidence":0.0,
"patient_data":{
"first_name":"YYYY",
"surname":"YYYY",
"pesel":"12345678901",
"birth_date":"1111-22-22",
"sex":"F",
"age":"None"
},
"exams":[
{
"name":"xx",
"sampling_date":"2222-11-22",
"comment":"None",
"confidence":22,
"result":"222",
"unit":"ml",
"norm":"None",
"material":"None",
"icd9":"uuuuu"
}
}
]
document_0 and document_1 share the same filename.
What I've managed to get so far:
from os import listdir
from os.path import isfile, join

# Folder that holds the JSON files.
dir_name = 'path_name'

# Names of the entries directly inside dir_name that are regular files;
# anything else (e.g. sub-directories) is dropped.
onlyfiles = list(filter(lambda name: isfile(join(dir_name, name)), listdir(dir_name)))
onlyfiles is a list of the filenames of my JSON files.
Now I was thinking of somehow updating my jsv with it in a loop.
But I'm also looking for a method that is very efficient, because of the large amount of data I have to process.
EDIT:
I've managed to do it with a for loop, but maybe there is a more efficient way:
# Stamp each parsed document with the name of the file it came from.
# jsv[i] and onlyfiles[i] are paired by position.
for i, entry in enumerate(jsv):
    if isinstance(entry, dict):
        entry["document_0"]["source_filename"] = onlyfiles[i]
    else:
        # Not a dict, so there is nothing to update; print the file name instead.
        print(onlyfiles[i])

If your jsv is:
jsv = [
{
"document_0": {
"id": 111,
"laboratory": "xxx",
"document_type": "xxx",
"language": "pl",
"creation_date": "09-12-2022",
"source_filename": "None",
"version": "0.1",
"exams_ocr_avg_confidence": 0.0,
"patient_data": {
"first_name": "YYYY",
"surname": "YYYY",
"pesel": "12345678901",
"birth_date": "1111-22-22",
"sex": "F",
"age": "None",
},
"exams": [
{
"name": "xx",
"sampling_date": "2020-11-30",
"comment": "None",
"confidence": 97,
"result": "222",
"unit": "ml",
"norm": "None",
"material": "None",
"icd9": "uuuuu",
},
],
}
},
{
"document_1": {
"id": 111,
"laboratory": "xxx",
"document_type": "xxx",
"language": "pl",
"creation_date": "09-12-2022",
"source_filename": "None",
"version": "0.1",
"exams_ocr_avg_confidence": 0.0,
"patient_data": {
"first_name": "YYYY",
"surname": "YYYY",
"pesel": "12345678901",
"birth_date": "1111-22-22",
"sex": "F",
"age": "None",
},
"exams": [
{
"name": "xx",
"sampling_date": "2020-11-30",
"comment": "None",
"confidence": 97,
"result": "222",
"unit": "ml",
"norm": "None",
"material": "None",
"icd9": "uuuuu",
},
],
},
},
]
In Python, you can do something like this:
arq = ['11111.pdf.json', '11112.pdf.json']

# Only pair the lists up when every jsv element has a file name; otherwise
# names would be written to the wrong documents.
if len(arq) == len(jsv):
    for filename, entry in zip(arq, jsv):
        # Every value of an element ("document_0", "document_1", ...) is a
        # document dict that receives the file name.
        for document in entry.values():
            document['source_filename'] = filename
Note that you need to check that the list of file names has the same length as the jsv list!
This results in the following jsv:
[
{
"document_0": {
"id": 111,
"laboratory": "xxx",
"document_type": "xxx",
"language": "pl",
"creation_date": "09-12-2022",
"source_filename": "11111.pdf.json",
"version": "0.1",
"exams_ocr_avg_confidence": 0.0,
"patient_data": {
"first_name": "YYYY",
"surname": "YYYY",
"pesel": "12345678901",
"birth_date": "1111-22-22",
"sex": "F",
"age": "None",
},
"exams": [
{
"name": "xx",
"sampling_date": "2020-11-30",
"comment": "None",
"confidence": 97,
"result": "222",
"unit": "ml",
"norm": "None",
"material": "None",
"icd9": "uuuuu",
}
],
}
},
{
"document_1": {
"id": 111,
"laboratory": "xxx",
"document_type": "xxx",
"language": "pl",
"creation_date": "09-12-2022",
"source_filename": "11112.pdf.json",
"version": "0.1",
"exams_ocr_avg_confidence": 0.0,
"patient_data": {
"first_name": "YYYY",
"surname": "YYYY",
"pesel": "12345678901",
"birth_date": "1111-22-22",
"sex": "F",
"age": "None",
},
"exams": [
{
"name": "xx",
"sampling_date": "2020-11-30",
"comment": "None",
"confidence": 97,
"result": "222",
"unit": "ml",
"norm": "None",
"material": "None",
"icd9": "uuuuu",
}
],
}
},
]

Explode json without pandas

I have a JSON object:
{
"data": {
"geography": [
{
"id": "1",
"state": "USA",
"properties": [
{
"code": "CMD-01",
"value": "34"
},
{
"code": "CMD-02",
"value": "24"
}
]
},
{
"id": "2",
"state": "Canada",
"properties": [
{
"code": "CMD-04",
"value": "50"
},
{
"code": "CMD-05",
"value": "60"
}
]
}
]
}
}
I want to get the result as a new JSON, but without using pandas (and all those explode, flatten and normalize functions...). Is there any option to get this structure without using pandas or having an Out of memory issue?
The output should be:
{ "id": "1",
"state": "USA",
"code": "CMD-01",
"value": "34"
},
{ "id": "1",
"state": "USA",
"code": "CMD-02",
"value": "24",
},
{ "id": "2",
"state": "Canada",
"code": "CMD-04",
"value": "50"
},
{ "id": "2",
"state": "Canada",
"code": "CMD-05",
"value": "60"
},

You can simply loop over the list associated with "geography" and build new dictionaries that you will add to a newly created list:
import json

# Sample input: each geography entry carries a list of "properties".
dict_in = {
    "data": {
        "geography": [
            {
                "id": "1",
                "state": "USA",
                "properties": [
                    {
                        "code": "CMD-01",
                        "value": "34"
                    },
                    {
                        "code": "CMD-02",
                        "value": "24"
                    }
                ]
            },
            {
                "id": "2",
                "state": "Canada",
                "properties": [
                    {
                        "code": "CMD-04",
                        "value": "50"
                    },
                    {
                        "code": "CMD-05",
                        "value": "60"
                    }
                ]
            }
        ]
    }
}

# Emit one flat record per (geography entry, property) pair.
rec_out = []
for obj in dict_in["data"]["geography"]:
    for prop in obj["properties"]:
        # Copy the parent's fields, then overlay the property's own keys.
        dict_out = {"id": obj["id"], "state": obj["state"], **prop}
        rec_out.append(dict_out)

print(json.dumps(rec_out, indent=4))
Output:
[
{
"id": "1",
"state": "USA",
"code": "CMD-01",
"value": "34"
},
{
"id": "1",
"state": "USA",
"code": "CMD-02",
"value": "24"
},
{
"id": "2",
"state": "Canada",
"code": "CMD-04",
"value": "50"
},
{
"id": "2",
"state": "Canada",
"code": "CMD-05",
"value": "60"
}
]

What is the best way for me to iterate over this dataset to return all matching values from another key value pair if I match a separate key?

I want to be able to search through this list (see bottom of post) of dicts (I think that is what this particular arrangement is called) to search for an ['address'] that matches '0xd2'. If that match is found, I want to return/print all the corresponding ['id']s.
So in this case I would like to return:
632, 315, 432
I'm able to extract individual values like this:
# NOTE(review): presumably an alias so that a pasted payload spelling the null
# value as `none` can be evaluated as Python; the JSON shown uses `null` -- confirm.
none = None
# Print the "id" of the third entry (index 2) of the "result" list.
print(my_dict['result'][2]["id"])
432
I'm struggling with how to get a loop to do this properly.
{
"total": 4,
"page": 0,
"page_size": 100,
"result": [
{
"address": "0xd2",
"id": "632",
"amount": "1",
"name": "Avengers",
"group": "Marvel",
"uri": "https://google.com/",
"metadata": null,
"synced_at": "2022-05-26T22:52:34.113Z",
"last_sync": "2022-05-26T22:52:34.113Z"
},
{
"address": "0xd2",
"id": "315",
"amount": "1",
"name": "Avengers",
"group": "Marvel",
"uri": "https://google.com/",
"metadata": null,
"synced_at": "2022-05-26T22:52:34.113Z",
"last_sync": "2022-05-26T22:52:34.113Z"
},
{
"address": "0xd2",
"id": "432",
"amount": "1",
"name": "Avengers",
"group": "Marvel",
"uri": "https://google.com/",
"metadata": null,
"synced_at": "2022-05-26T22:52:34.113Z",
"last_sync": "2022-05-26T22:52:34.113Z"
},
{
"address": "0x44",
"id": "100",
"amount": "1",
"name": "Suicide Squad",
"group": "DC",
"uri": "https://google.com/",
"metadata": null,
"synced_at": "2022-05-26T22:52:34.113Z",
"last_sync": "2022-05-26T22:52:34.113Z"
}
],
"status": "SYNCED"
}

Welcome to StackOverflow.
You can try list comprehension:
# Collect the "id" of every entry in my_dict["result"] whose "address" is "0xd2".
[res["id"] for res in my_dict["result"] if res["address"] == "0xd2"]
If you'd like to use a for loop:
# Keep the "id" of each entry in my_dict["result"] whose address is "0xd2".
matching_ids = []
for res in my_dict["result"]:
    if res["address"] == "0xd2":
        matching_ids.append(res["id"])

You can use a list comprehension.
import json

# Raw payload kept as JSON text so it can be parsed with json.loads.
json_string = """{
"total": 4,
"page": 0,
"page_size": 100,
"result": [
{
"address": "0xd2",
"id": "632",
"amount": "1",
"name": "Avengers",
"group": "Marvel",
"uri": "https://google.com/",
"metadata": null,
"synced_at": "2022-05-26T22:52:34.113Z",
"last_sync": "2022-05-26T22:52:34.113Z"
},
{
"address": "0xd2",
"id": "315",
"amount": "1",
"name": "Avengers",
"group": "Marvel",
"uri": "https://google.com/",
"metadata": null,
"synced_at": "2022-05-26T22:52:34.113Z",
"last_sync": "2022-05-26T22:52:34.113Z"
},
{
"address": "0xd2",
"id": "432",
"amount": "1",
"name": "Avengers",
"group": "Marvel",
"uri": "https://google.com/",
"metadata": null,
"synced_at": "2022-05-26T22:52:34.113Z",
"last_sync": "2022-05-26T22:52:34.113Z"
},
{
"address": "0x44",
"id": "100",
"amount": "1",
"name": "Suicide Squad",
"group": "DC",
"uri": "https://google.com/",
"metadata": null,
"synced_at": "2022-05-26T22:52:34.113Z",
"last_sync": "2022-05-26T22:52:34.113Z"
}
],
"status": "SYNCED"
}"""

json_dict = json.loads(json_string)

# List comprehension: keep the "id" of each entry whose address is '0xd2'.
result = [
    entry['id']
    for entry in json_dict['result']
    if entry['address'] == '0xd2'
]
print(result)
Output:
['632', '315', '432']

This would store the associated ids in the list:
# Gather the "id" of every entry in dataset["result"] whose address is '0xd2'.
ids = []
# `or []` makes a missing or null "result" yield no ids instead of raising
# TypeError when iterating over None.
for r in dataset.get('result') or []:
    if r.get('address') == '0xd2':
        ids.append(r.get('id'))

How to create very large Json object in Python

I have a json file like this:
{
"students": [
{
"name": "Jack",
"age": "12",
"class": "8",
"start_date": "2021-01-01",
"score": {
"Eng": "90",
"Math": "90",
"History": "91",
"Art": "80"
},
"friend": {},
"talents": [
"dance"
]
}
],
"school": [
{
"city": "LA",
"state": "CA",
"country": "US",
}
]
}
Now I want to modify this file and add new students' info into it
like this: (Add more similar json objects into students with minimum change)
{
"students": [
{
"name": "Jack",
"age": "12",
"class": "8",
"start_date": "2021-01-01",
"score": {
"Eng": "90",
"Math": "90",
"History": "91",
"Art": "80"
},
"friend": {},
"talents": [
"dance"
]
},
{
"name": "David",
"age": "12",
"class": "8",
"start_date": "2021-02-01",
"score": {
"Eng": "92",
"Math": "90",
"History": "95",
"Art": "70"
},
"friend": {},
"talents": [
"skate"
]
},
... ...
],
"school": [
{
"city": "LA",
"state": "CA",
"country": "US",
}
]
}
Is there a way to do this at large scale? I need to append more than 1000 new students.
Since this is a large JSON file and each new entry differs only minimally, I don't want to specify every key and value by hand more than 1000 times.

Why am I receiving an error when attempting to parse JSON object within for loop?

Everything with my script runs fine until I try to run it through a for loop; specifically, it fails when I attempt to index a specific array within the object. The script is intended to grab the delivery date for each tracking number in my list.
This is my script:
import requests
import json

TrackList = ['1Z3X756E0310496105','1ZX0373R0303581450','1ZX0373R0103574417']
url = 'https://onlinetools.ups.com/rest/Track'
para1 = '...beginning of JSON request string...'
para2 = '...end of JSON request string...'

# Request the tracking data for each number and print one date per shipment.
for TrackNum in TrackList:
    # The request body is the JSON template with the tracking number spliced in.
    parameters = para1+TrackNum+para2
    resp = requests.post(url = url, data = parameters, verify=False)
    data = json.loads(resp.text)
    # This is the line reported in the traceback (KeyError: 0).
    DelDate = data['TrackResponse']['Shipment']['Package'][0]['Activity'][0]['Date']
    print(DelDate)
JSON API Response (if needed):
{
"TrackResponse": {
"Response": {
"ResponseStatus": {
"Code": "1",
"Description": "Success"
},
"TransactionReference": {
"CustomerContext": "Analytics Inquiry"
}
},
"Shipment": {
"InquiryNumber": {
"Code": "01",
"Description": "ShipmentIdentificationNumber",
"Value": "1ZX0373R0103574417"
},
"Package": {
"Activity": [
{
"ActivityLocation": {
"Address": {
"City": "OKLAHOMA CITY",
"CountryCode": "US",
"PostalCode": "73128",
"StateProvinceCode": "OK"
},
"Code": "M3",
"Description": "Front Desk",
"SignedForByName": "CUMMINGS"
},
"Date": "20190520",
"Status": {
"Code": "9E",
"Description": "Delivered",
"Type": "D"
},
"Time": "091513"
},
{
"ActivityLocation": {
"Address": {
"City": "Oklahoma City",
"CountryCode": "US",
"StateProvinceCode": "OK"
},
"Description": "Front Desk"
},
"Date": "20190520",
"Status": {
"Code": "OT",
"Description": "Out For Delivery Today",
"Type": "I"
},
"Time": "085943"
},
{
"ActivityLocation": {
"Address": {
"City": "Oklahoma City",
"CountryCode": "US",
"StateProvinceCode": "OK"
},
"Description": "Front Desk"
},
"Date": "20190520",
"Status": {
"Code": "DS",
"Description": "Destination Scan",
"Type": "I"
},
"Time": "011819"
},
{
"ActivityLocation": {
"Address": {
"City": "Oklahoma City",
"CountryCode": "US",
"StateProvinceCode": "OK"
},
"Description": "Front Desk"
},
"Date": "20190519",
"Status": {
"Code": "AR",
"Description": "Arrival Scan",
"Type": "I"
},
"Time": "235100"
},
{
"ActivityLocation": {
"Address": {
"City": "DFW Airport",
"CountryCode": "US",
"StateProvinceCode": "TX"
},
"Description": "Front Desk"
},
"Date": "20190519",
"Status": {
"Code": "DP",
"Description": "Departure Scan",
"Type": "I"
},
"Time": "195500"
},
{
"ActivityLocation": {
"Address": {
"City": "DFW Airport",
"CountryCode": "US",
"StateProvinceCode": "TX"
},
"Description": "Front Desk"
},
"Date": "20190517",
"Status": {
"Code": "OR",
"Description": "Origin Scan",
"Type": "I"
},
"Time": "192938"
},
{
"ActivityLocation": {
"Address": {
"CountryCode": "US"
},
"Description": "Front Desk"
},
"Date": "20190517",
"Status": {
"Code": "MP",
"Description": "Order Processed: Ready for UPS",
"Type": "M"
},
"Time": "184621"
}
],
"PackageWeight": {
"UnitOfMeasurement": {
"Code": "LBS"
},
"Weight": "2.00"
},
"ReferenceNumber": [
{
"Code": "01",
"Value": "8472745558"
},
{
"Code": "01",
"Value": "5637807:1007379402:BN81-17077A:1"
},
{
"Code": "01",
"Value": "5637807"
}
],
"TrackingNumber": "1ZX0373R0103574417"
},
"PickupDate": "20190517",
"Service": {
"Code": "001",
"Description": "UPS Next Day Air"
},
"ShipmentAddress": [
{
"Address": {
"AddressLine": "S 600 ROYAL LN",
"City": "COPPELL",
"CountryCode": "US",
"PostalCode": "750193827",
"StateProvinceCode": "TX"
},
"Type": {
"Code": "01",
"Description": "Shipper Address"
}
},
{
"Address": {
"City": "OKLAHOMA CITY",
"CountryCode": "US",
"PostalCode": "73128",
"StateProvinceCode": "OK"
},
"Type": {
"Code": "02",
"Description": "ShipTo Address"
}
}
],
"ShipmentWeight": {
"UnitOfMeasurement": {
"Code": "LBS"
},
"Weight": "2.00"
},
"ShipperNumber": "X0373R"
}
}
}
Below is the error I receive:
Traceback (most recent call last):
File "/Users/***/Library/Preferences/PyCharmCE2019.1/scratches/UPS_API.py", line 15, in <module>
DelDate = data['TrackResponse']['Shipment']['Package'][0]['Activity'][0]['Date']
KeyError: 0

You're trying to index "Package" at index 0, but it's an object not an array. So you should be accessing ['Package']['Activity']

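Just take away the [0] after ['Package']: in this response "Package" is a single object, not a list, so it has no [0], [1] or [2]. Keep the [0] after ['Activity'], which is a list.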

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to use S3 Select for Nested Parquet Objects - python

Related

Update nested JSON with name of json file name

Explode json without pandas

What is the best way for me to iterate over this dataset to return all matching values from another key value pair if I match a separate key?

How to create very large Json object in Python

Why am I receiving an error when attempting to parse JSON object within for loop?

Categories

Resources