JSON to Python dataframe: mapping values from another file - python

I have some JSON files like the ones below. For every student id there is a corresponding JSON file with that student's mark details.
students.json
{
"Students": [
{
"StudentName": "AAA",
"Sid": 1020,
"Saddress": "st.aaa",
"Sdob": "10-11-1999"
},
{
"StudentName": "BBB",
"Sid": 1021,
"Saddress": "st.bbb",
"Sdob": "11-11-1999"
},
{
"StudentName": "CCC",
"Sid": 1022,
"Saddress": "st.fff",
"Sdob": "05-12-1999"
},
{
"StudentName": "DDD",
"Sid": 1023,
"Saddress": "st.ddd",
"Sdob": "15-09-1999"
},
{
"StudentName": "EEE",
"Sid": 1024,
"Saddress": "st.eee",
"Sdob": "10-11-1999"
},
{
"StudentName": "FFF",
"Sid": 1025,
"Saddress": "st.ddd",
"Sdob": "20-11-1999"
},
{
"StudentName": "GGG",
"Sid": 1026,
"Saddress": "st.ggg",
"Sdob": "25-11-1999"
},
{
"StudentName": "JJJ",
"Sid": 1019,
"Saddress": "st.aaa",
"Sdob": "18-11-1999"
}
]
}
1020.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1021.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1022.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1023.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1024.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1025.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1026.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1019.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
I need to get an output like this
My code:
import json
import pandas as pd
# Load the student roster. The context manager closes the file handle as soon
# as the document has been parsed, instead of leaving it open for the rest of
# the script.
with open("students.json") as data_JSON:
    data = json.load(data_JSON)
print("JSON Data")
print(data)
def Normalize(data_JSON, record_path):
    """Flatten the list of records stored under one key into a DataFrame.

    Args:
        data_JSON: Parsed JSON document (a dict), e.g. the result of
            ``json.load``.
        record_path: Key of ``data_JSON`` whose value is the list of records
            to flatten.

    Returns:
        A ``pandas.DataFrame`` with one row per record; nested dicts become
        dotted column names.

    Raises:
        KeyError: If ``record_path`` is not a key of ``data_JSON``.
    """
    # The input is already parsed JSON, so serialising it and parsing it again
    # would only produce a copy; normalise the records directly.
    return pd.json_normalize(data_JSON[record_path])
data2 = Normalize(data, "Students")
print("\nAfter normalizing JSON Data")
print(data2)

# Normalize() already returns a DataFrame, so the columns can be selected
# from it directly.
df1 = data2[['StudentName', 'Sid']]
print("\nTaking specific columns from the df")
print(df1)

# One "<Sid>.json" marks file per distinct student id, in ascending id order.
# Series.iteritems() no longer exists in pandas 2.x, so iterate over the
# unique ids instead.
files = [f"{sid}.json" for sid in sorted(df1['Sid'].dropna().unique())]
print("\nFile Names")
print(files)

df2 = pd.DataFrame(columns=['StudentName', 'Sid'])

# Collect the marks of every file first and concatenate once at the end.
# Concatenating inside the loop and rebinding the result on each pass keeps
# only the marks of the last file.
frames = [df2]
for file_name in files:
    with open(file_name) as marks_file:
        frames.append(Normalize(json.load(marks_file), "marks"))
merged = pd.concat(frames, ignore_index=True, sort=False)

print("\nMarks")
print(merged)

# saving the dataframe
merged.to_csv('StudentsMark.csv', index=False)
Output:
Is it possible to insert the StudentName and Sid along with the marks? I don't want to put the StudentName and Sid into the dataframe df2 by hand; when the program fetches the marks from each file, it should map and add the matching StudentName and Sid for that file.

Using json_normalize:
import json
import pandas as pd
path = "path/to/files"


def get_data(file: str) -> dict:
    """Load ``<path>/<file>.json`` and return the parsed document.

    Args:
        file: File name without the ``.json`` extension, e.g. ``"students"``.
            The value is formatted into the file name, so a numeric student
            id works as well.

    Returns:
        The decoded JSON content of the file.

    Raises:
        FileNotFoundError: If the file does not exist under ``path``.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # The return annotation names the decoded object type; ``json`` itself is
    # a module and is not usable as a type.
    with open(f"{path}/{file}.json", "r") as f:
        return json.load(f)
# Roster: one row per student.
students_df = pd.json_normalize(data=get_data("students"), record_path="Students")
student_ids = students_df["Sid"].tolist()

# Marks: load each student's file and tag its rows with the owning id so they
# can be joined back onto the roster.
per_student_marks = []
for sid in student_ids:
    marks = pd.json_normalize(data=get_data(sid), record_path="marks")
    per_student_marks.append(marks.assign(Id=sid))
grades_df = pd.concat(per_student_marks)

# Join marks to students on the id, drop the helper column, and write the
# combined table next to the input files.
combined = students_df.merge(right=grades_df, left_on="Sid", right_on="Id")
combined = combined.drop(columns="Id")
combined.to_csv(f"{path}/students_marks.csv", index=False)

Related

How to split complex JSON file into multiple files by Python

I am currently splitting Json file.
The structure of JSON file is like this :
{
"id": 2131424,
"file": "video_2131424_1938263.mp4",
"metadata": {
"width": 3840,
"height": 2160,
"duration": 312.83,
"fps": 30,
"frames": 9385,
"created": "Sun Jan 17 17:48:52 2021"
},
"frames": [
{
"number": 207,
"image": "frame_207.jpg",
"annotations": [
{
"label": {
"x": 730,
"y": 130,
"width": 62,
"height": 152
},
"category": {
"code": "child",
"attributes": [
{
"code": "global_id",
"value": "7148"
}
]
}
},
{
"label": {
"x": 815,
"y": 81,
"width": 106,
"height": 197
},
"category": {
"code": "person",
"attributes": []
}
}
]
},
{
"number": 221,
"image": "frame_221.jpg",
"annotations": [
{
"label": {
"x": 730,
"y": 130,
"width": 64,
"height": 160
},
"category": {
"code": "child",
"attributes": [
{
"code": "global_id",
"value": "7148"
}
]
}
},
{
"label": {
"x": 819,
"y": 82,
"width": 106,
"height": 200
},
"category": {
"code": "person",
"attributes": []
}
}
]
},
{
"number": 236,
"image": "frame_236.jpg",
"annotations": [
{
"label": {
"x": 731,
"y": 135,
"width": 74,
"height": 160
},
"category": {
"code": "child",
"attributes": [
{
"code": "global_id",
"value": "7148"
}
]
}
},
{
"label": {
"x": 821,
"y": 83,
"width": 106,
"height": 206
},
"category": {
"code": "person",
"attributes": []
}
}
]
},
I have to extract [x, y, width, height] from each label.
I tried some code like this:
# Parse the annotation file into a dict. The handle returned by open() is
# never closed here.
file = json.load(open('annotation_2131424.json'))
# Reads the "x" of a single label. NOTE(review): i and j are not defined in
# this snippet - presumably a frame index and an annotation index.
file['frames'][i]['annotations'][j]['label']['x']
But I cannot split the JSON this way.
I tried it like this, but the code does not run...
I hope I've understood your question right. To get x, y, width, height from each label (dct is your dictionary from the question):
# Fields to pull out of every annotation's "label", in output order.
label_fields = ("x", "y", "width", "height")

# One list per frame, holding one [x, y, width, height] list per annotation.
out = [
    [[ann["label"][field] for field in label_fields] for ann in frame["annotations"]]
    for frame in dct["frames"]
]
print(out)
Prints:
[
[[730, 130, 62, 152], [815, 81, 106, 197]],
[[730, 130, 64, 160], [819, 82, 106, 200]],
[[731, 135, 74, 160], [821, 83, 106, 206]],
]

merge dicts that have the same value for specific key

I need to combine dictionaries that have the same value for the key "tag".
Like from this:
[
[
{
"tag": "#2C00L02RU",
"stamina": 233
},
{
"tag": "#8YG8RJV90",
"stamina": 20
},
{
"tag": "#LQV2JCPR",
"stamina": 154
},
{
"tag": "#9JQLPGLJJ",
"stamina": 134
}
],
[
{
"tag": "#2C00L02RU",
"health": 200
},
{
"tag": "#8YG8RJV90",
"health": 40
},
{
"tag": "#LQV2JCPR",
"health": 100
},
{
"tag": "#9JQLPGLJJ",
"health": 240
}
],
[
{
"tag": "#LQV2JCPR",
"fame": 1
},
{
"tag": "#8YG8RJV90",
"fame": 2
},
{
"tag": "#9JQLPGLJJ",
"fame": 3
},
{
"tag": "#2C00L02RU",
"fame": 4
}
],
[
{
"tag": "#LQV2JCPR",
"moves": 6
},
{
"tag": "#8YG8RJV90",
"moves": 0
},
{
"tag": "#9JQLPGLJJ",
"moves": 8
},
{
"tag": "#2C00L02RU",
"moves": 4
}
]
]
to this:
[
{
"tag": "#2C00L02RU",
"stamina": 233,
"health": 200,
"fame": 4,
"moves": 4
},
{
"tag": "#8YG8RJV90",
"stamina": 20,
"health": 40,
"fame": 2,
"moves": 0
},
{
"tag": "#LQV2JCPR",
"stamina": 154,
"health": 100,
"fame": 1,
"moves": 6
},
{
"tag": "#9JQLPGLJJ",
"stamina": 134,
"health": 240,
"fame": 3,
"moves": 8
}
]
I've already tried iterating through countless loops, but only got failures.
I won't show any of my attempts here because they didn't even come close to the expected result.
If you need any other information, just let me know.
If lst is list from your question, you can do:
# Merge every dict that shares a "tag" into one record, keyed by that tag.
# setdefault() creates a fresh record the first time a tag is seen, so the
# result keeps first-seen tag order and the input dicts are never modified;
# update() then layers each dict's fields onto that record.
out = {}
for group in lst:
    for entry in group:
        out.setdefault(entry["tag"], {}).update(entry)
print(list(out.values()))
Prints:
[
{"tag": "#2C00L02RU", "stamina": 233, "health": 200, "fame": 4, "moves": 4},
{"tag": "#8YG8RJV90", "stamina": 20, "health": 40, "fame": 2, "moves": 0},
{"tag": "#LQV2JCPR", "stamina": 154, "health": 100, "fame": 1, "moves": 6},
{"tag": "#9JQLPGLJJ", "stamina": 134, "health": 240, "fame": 3, "moves": 8},
]

Can we make a table from the nested dictionary with the use of Pandas?

Question: How to print student name by maths mark descending?
[
{
"name": "student2",
"age": 30,
"is_male": False,
"marks": {"science": 56, "maths": 32, "english": 67},
"total_marks": 155,
},
{
"name": "student4",
"age": 28,
"is male": False,
"marks": {"science": 78, "maths": 55, "english": 98},
"total_marks": 230,
},
{
"name": "student3",
"age": 25,
"is_male": True,
"marks": {"science": 89, "maths": 56, "english": 99},
"total_marks": 244,
},
{
"name": "student1",
"age": 23,
"is_male": True,
"marks": {"science": 95, "maths": 89, "english": 95},
"total_marks": 279,
},
]
Try:
df = pd.DataFrame(
    [
        {
            "name": "student2",
            "age": 30,
            "is_male": False,
            "marks": {"science": 56, "maths": 32, "english": 67},
            "total_marks": 155,
        },
        {
            "name": "student4",
            "age": 28,
            "is male": False,
            "marks": {"science": 78, "maths": 55, "english": 98},
            "total_marks": 230,
        },
        {
            "name": "student3",
            "age": 25,
            "is_male": True,
            "marks": {"science": 89, "maths": 56, "english": 99},
            "total_marks": 244,
        },
        {
            "name": "student1",
            "age": 23,
            "is_male": True,
            "marks": {"science": 95, "maths": 89, "english": 95},
            "total_marks": 279,
        },
    ]
)

# Pull each student's maths mark out of the nested "marks" dict and rank the
# row labels by it, highest first. Sort on the maths mark itself:
# total_marks happens to give the same order for this data, but it is a
# different quantity.
maths_marks = df["marks"].str["maths"]
indx = maths_marks.sort_values(ascending=False).index

# Reorder the whole frame by that ranking and show it.
ranked = df.reindex(indx)
print(ranked)
Output:
name age is_male marks total_marks is male
3 student1 23 True {'science': 95, 'maths': 89, 'english': 95} 279 NaN
2 student3 25 True {'science': 89, 'maths': 56, 'english': 99} 244 NaN
1 student4 28 NaN {'science': 78, 'maths': 55, 'english': 98} 230 False
0 student2 30 False {'science': 56, 'maths': 32, 'english': 67} 155 NaN
I think the easiest way to do this is to use json_normalize(), though there is also nothing wrong with @scott-boston's answer.
import pandas
# Student records. "marks" is a nested dict, which json_normalize() flattens
# into dotted columns such as "marks.maths".
data = [
    {
        "name": "student2",
        "age": 30,
        "is_male": False,
        "marks": {"science": 56, "maths": 32, "english": 67},
        "total_marks": 155,
    },
    {
        "name": "student4",
        "age": 28,
        "is male": False,
        "marks": {"science": 78, "maths": 55, "english": 98},
        "total_marks": 230,
    },
    {
        "name": "student3",
        "age": 25,
        "is_male": True,
        "marks": {"science": 89, "maths": 56, "english": 99},
        "total_marks": 244,
    },
    {
        "name": "student1",
        "age": 23,
        "is_male": True,
        "marks": {"science": 95, "maths": 89, "english": 95},
        "total_marks": 279,
    },
]

# Flatten first, then order the rows by the maths mark, highest first.
flat = pandas.json_normalize(data)
by_maths = flat.sort_values("marks.maths", ascending=False)
print(by_maths)
Breaking out the nested dict so you can sort on it.
This gives you:
name age is_male total_marks marks.science marks.maths marks.english is male
3 student1 23 True 279 95 89 95 NaN
2 student3 25 True 244 89 56 99 NaN
1 student4 28 NaN 230 78 55 98 False
0 student2 30 False 155 56 32 67 NaN

Merge/Join/Combine dictionaries with same key inside a list

I have list of dictionaries
rows = [{'sku':123,'barcode':99123,'day_1_qty':9,'store':118},
{'sku':123,'barcode':99123,'day_1_qty':7,'store':109},
{'sku':124,'barcode':99124,'day_1_qty':9,'store':118},
{'sku':123,'barcode':99123,'day_2_qty':10,'store':118}....]
I want merge them and this is my desired output:
rows = [{'sku':123,'barcode':99123,'day_1_qty':9,'store':118,'day_2_qty':10},
{'sku':123,'barcode':99123,'day_1_qty':7,'store':109},
{'sku':124,'barcode':99124,'day_1_qty':9,'store':118},....]
I tried merging them by sku, but then the rows for the other store do not show up. Please help.
# Collapse all dicts that share the same "SKU" value into a single dict and
# return the collapsed dicts as a list.
# NOTE(review): ChainMap and groupby are not imported in this snippet -
# presumably collections.ChainMap and itertools.groupby; confirm.
# NOTE(review): the lookup key is "SKU", while the sample rows shown with the
# question use the lowercase key 'sku' - verify which one the real data has.
# NOTE(review): "SKU" is the only grouping key, so rows that differ only in
# 'store' fall into the same group and are merged together.
def generate_oos(dict_list):
# Sort by "SKU" so that groupby sees equal keys next to each other, then turn
# each (key, group) pair into one dict built from the group's members.
res = map(lambda dict_tuple: dict(ChainMap(*dict_tuple[1])),
groupby(sorted(dict_list,key=lambda sub_dict: sub_dict["SKU"]),
key=lambda sub_dict: sub_dict["SKU"]))
return list(res)
Try:
rows = [
    {"sku": 123, "barcode": 99123, "day_1_qty": 9, "store": 118},
    {"sku": 123, "barcode": 99123, "day_1_qty": 7, "store": 109},
    {"sku": 124, "barcode": 99124, "day_1_qty": 9, "store": 118},
    {"sku": 123, "barcode": 99123, "day_2_qty": 10, "store": 118},
]

# Merge rows that describe the same (sku, store) pair in a single pass.
# setdefault() creates one fresh dict per pair, so the input rows are never
# mutated, and update() folds each row's fields into it. Dict insertion
# order keeps the pairs in first-seen order.
merged = {}
for row in rows:
    merged.setdefault((row["sku"], row["store"]), {}).update(row)

out = list(merged.values())
print(out)
Prints:
[
{
"sku": 123,
"barcode": 99123,
"day_1_qty": 9,
"store": 118,
"day_2_qty": 10,
},
{"sku": 123, "barcode": 99123, "day_1_qty": 7, "store": 109},
{"sku": 124, "barcode": 99124, "day_1_qty": 9, "store": 118},
]

Check list of dic and add values

i have the following problem:
I have a list of dicts and want to loop through the list (temp_list) and check:
whether the difference between the dic["z"] values of temp[x] and temp[y] is within distance_value.
If not, insert a new dict between temp[x] and temp[y] that contains a z value of (temp[y] - temp[x])/2; let's name it dic_x_y.
Afterwards, fill the remaining values of the newly inserted dict (dic_x_y["t1"], dic_x_y["angle1"] and dic_x_y["material"]) with the values of the dict in temp[x].
Here is the data with the list and the variable:
distance_value = 1000
temp = [
{
"z": 1450,
"t1": 0,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 1950,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 12800,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 13000,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 25900,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 26000,
"t1": 10,
"angle1": 90,
"material": "Balsa 150"
}]
I searched a lot for my problem but could not find an answer.
I hope i could state my problem clearly and someone can help me.
Thanks a lot in advance.
i dont really know how to start but thats kind of my idea that i cannot get to work:
distance_value = 1000
for dic in temp:
if "dic["z"] +1 (second element of the list) - dic["z"] < distance_value:
new_dic = {"z": (dic["z"]+1 - dic["z"]), "t1": dic["t1"] , "angle1":dic["angle1"], "material":dic["material"] }
temp.insert[dic["z"]+1, new_dic]
From my json test file test.json:
[
{
"z": 1450,
"t1": 0,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 1950,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 12800,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 13000,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 25900,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 26000,
"t1": 10,
"angle1": 90,
"material": "Balsa 150"
}]
The python code:
import json
def fill_gaps(temp, distance_value):
    """Return the points sorted by "z" with no gap larger than distance_value.

    While two neighbouring points are further apart than ``distance_value``,
    a new point is placed halfway between them. The new point takes every
    other field ("t1", "angle1", "material", ...) from its left neighbour.

    Args:
        temp: List of dicts that each have a numeric "z" entry. The list
            itself is not modified.
        distance_value: Largest allowed difference between the "z" values of
            two neighbouring points; must be positive.

    Returns:
        A new list, sorted by "z", containing the original dicts plus the
        inserted midpoint dicts.

    Raises:
        ValueError: If ``distance_value`` is not positive (a gap can never be
            bisected down to zero).
    """
    if distance_value <= 0:
        raise ValueError("distance_value must be positive")

    temp = sorted(temp, key=lambda k: k['z'])
    Continue = True
    while Continue:
        # Assume the list is gap-free until this pass finds a gap. Resetting
        # the flag here also ends the loop for lists with fewer than two
        # points, where the checking loop below never runs.
        Continue = False
        for left, right in zip(temp, temp[1:]):
            if right['z'] - left['z'] > distance_value:
                # Midpoint dict: copy the left neighbour's fields (rather than
                # hard-coding any of them) and replace only "z".
                new_dic = {**left, "z": (left['z'] + right['z']) / 2.}
                temp.append(new_dic)
                # Re-sort so the new point sits between its neighbours before
                # the next pass starts checking from the beginning.
                temp.sort(key=lambda k: k['z'])
                Continue = True
                break
    return temp


def main():
    """Read test.json, fill every gap above 1000 and print the result."""
    with open('test.json') as f:
        temp = json.load(f)
    distance_value = 1000
    temp = fill_gaps(temp, distance_value)
    temp_as_string = json.dumps(temp, sort_keys=True, indent=4, separators=(',', ': '))
    print(temp_as_string)


if __name__ == "__main__":
    main()
my output:
[
[
{
"z": 1450,
"t1": 0,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 1950,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 2628.125,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 3306.25,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 3984.375,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 4662.5,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 5340.625,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 6018.75,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 6696.875,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 7375.0,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 8053.125,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 8731.25,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 9409.375,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 10087.5,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 10765.625,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 11443.75,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 12121.875,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 12800,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 13000,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 13806.25,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 14612.5,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 15418.75,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 16225.0,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 17031.25,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 17837.5,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 18643.75,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 19450.0,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 20256.25,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 21062.5,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 21868.75,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 22675.0,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 23481.25,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 24287.5,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 25093.75,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 25900,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 26000,
"t1": 10,
"angle1": 90,
"material": "Balsa 150"
}
]
[Finished in 0.096s]
The logic is as follows:
Run a while loop with condition, should I keep looping over my dictionaries and check? : Continue
Inside the while loop, loop over the current list items, check if at any possible list[i+1]['z'] - list[i]['z'] is greater than set distance (this is the checking loop)
If a gap is found, make a new dict with the midpoint z value, append it, re-sort the list (this is very important), and then break out of the for loop (the checking loop stops at the first occurrence). Because we broke out, the Continue condition of the while loop is still true.
At a stage later on in the while loop when we go through all for loop and check condition not found, then Continue is false and the while loop breaks

Categories