Related
I have some JSON files like the ones below. For every student id there is a corresponding JSON file with that student's mark details.
students.json
{
"Students": [
{
"StudentName": "AAA",
"Sid": 1020,
"Saddress": "st.aaa",
"Sdob": "10-11-1999"
},
{
"StudentName": "BBB",
"Sid": 1021,
"Saddress": "st.bbb",
"Sdob": "11-11-1999"
},
{
"StudentName": "CCC",
"Sid": 1022,
"Saddress": "st.fff",
"Sdob": "05-12-1999"
},
{
"StudentName": "DDD",
"Sid": 1023,
"Saddress": "st.ddd",
"Sdob": "15-09-1999"
},
{
"StudentName": "EEE",
"Sid": 1024,
"Saddress": "st.eee",
"Sdob": "10-11-1999"
},
{
"StudentName": "FFF",
"Sid": 1025,
"Saddress": "st.ddd",
"Sdob": "20-11-1999"
},
{
"StudentName": "GGG",
"Sid": 1026,
"Saddress": "st.ggg",
"Sdob": "25-11-1999"
},
{
"StudentName": "JJJ",
"Sid": 1019,
"Saddress": "st.aaa",
"Sdob": "18-11-1999"
}
]
}
1020.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1021.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1022.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1023.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1024.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1025.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1026.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
1019.json
{
"marks": [
{
"English": 50,
"Math": 75,
"Art": 75,
"Science": 80,
"History": 30,
"Geography": 35,
"Physical Education": 90,
"Chemistry": 80,
"Physics": 85,
"Biology": 75
}
]
}
I need to get an output like this
My code:
import json
import pandas as pd
data_JSON = open("students.json")
json_str = data_JSON.read()
data= json.loads(json_str)
print("JSON Data")
print(data)
def Normalize(data_JSON, record_path):
    """Flatten the records stored under ``record_path`` into a DataFrame.

    Args:
        data_JSON: Already-parsed JSON document (a mapping) that holds a list
            of records under the key ``record_path``.
        record_path: Key of the record list inside ``data_JSON``.

    Returns:
        The DataFrame produced by ``pd.json_normalize`` for that record list.

    Raises:
        KeyError: If ``record_path`` is not a key of ``data_JSON``.
    """
    # The input is already a Python object, so serialising it to a string and
    # parsing it back before normalising added work without changing the data.
    return pd.json_normalize(data_JSON[record_path])
data2 = Normalize(data, "Students")
print("\nAfter normalizing JSON Data")
print(data2)
df0 = pd.DataFrame(data2)
df1 = df0[['StudentName', 'Sid']]
print("\nTaking specific columns from the df")
print(df1)
# Build one marks-file name ("<Sid>.json") per distinct student id.
# The index of the grouped sizes holds every Sid exactly once, so iterating it
# yields the same ids the old loop saw. Series.iteritems() was removed in
# pandas 2.0, and the group size it returned alongside each id was never used.
files = [str(sid) + ".json" for sid in df1.groupby('Sid').size().index]
print("\nFile Names")
print(files)
df2 = pd.DataFrame(columns=['StudentName','Sid'])
merged = pd.DataFrame()
for i in files:
data_JSON = open(i)
json_str = data_JSON.read()
data= json.loads(json_str)
marks = Normalize(data, "marks")
df = pd.DataFrame(marks)
merged = pd.concat([df2, df], ignore_index=True, sort=False)
print("\nMarks")
print(merged)
# saving the dataframe
merged.to_csv('StudentsMark.csv',index=False)
Output:
Is it possible to insert the StudentName and Sid along with the marks? I don't want to give the StudentName and Sid directly to the dataframe df2; when the program fetches the marks from each file, it should be able to map and add the StudentName and Sid to df2 for each file.
Using json_normalize:
import json
import pandas as pd
path = "path/to/files"
def get_data(file: "str | int", base_dir: "str | None" = None) -> dict:
    """Load and parse the JSON file ``<base_dir>/<file>.json``.

    Args:
        file: File name without the ``.json`` extension. It is formatted into
            the path, so the integer student ids can be passed directly.
        base_dir: Directory containing the file. When omitted, the
            module-level ``path`` is used, as before.

    Returns:
        The parsed JSON document.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    directory = path if base_dir is None else base_dir
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(f"{directory}/{file}.json", "r", encoding="utf-8") as f:
        return json.load(f)
# Student master data: one row per entry of the "Students" array.
students_df = pd.json_normalize(data=get_data("students"), record_path="Students")
student_ids = students_df["Sid"].tolist()

# Load every student's marks file and tag its rows with the id it was loaded
# for, so the marks can be joined back to the matching student row.
marks_frames = [
    pd.json_normalize(data=get_data(sid), record_path="marks").assign(Id=sid)
    for sid in student_ids
]
grades_df = pd.concat(marks_frames)

# Join students and marks on the id, drop the helper column, and write the
# combined table next to the input files.
students_with_marks = students_df.merge(grades_df, left_on="Sid", right_on="Id")
students_with_marks = students_with_marks.drop(columns="Id")
students_with_marks.to_csv(f"{path}/students_marks.csv", index=False)
I have a list of dictionaries which I am sorting using multiple keys.
Now I want to push all the elements that have a rank of zero (rank is one of the keys) to the end;
basically, rank 0 must be at the bottom.
from operator import itemgetter

# Sample records; "rank" and "duration" are the fields used for ordering.
mylist = [
    {"score": 5.0, "rank": 2, "duration": 123, "amount": "18.000"},
    {"score": -1.0, "rank": 0, "duration": 23, "amount": "-8.000"},
    {"score": -2.0, "rank": 0, "duration": 63, "amount": "28.000"},
    {"score": 2.0, "rank": 1, "duration": 73, "amount": "18.000"},
]

# Order by rank first and by duration second, both ascending. Sorting a copy
# leaves mylist itself in its original order.
sort_fields = ['rank', 'duration']
sorted_list = list(mylist)
sorted_list.sort(key=itemgetter(*sort_fields))
print(sorted_list)
current output
[{'score': -1.0, 'rank': 0, 'duration': 23, 'amount': '-8.000'}, {'score': -2.0, 'rank': 0, 'duration': 63, 'amount': '28.000'}, {'score': 2.0, 'rank': 1, 'duration': 73, 'amount': '18.000'}, {'score': 5.0, 'rank': 2, 'duration': 123, 'amount': '18.000'}]
expected output
[{'score': 2.0, 'rank': 1, 'duration': 73, 'amount': '18.000'}, {'score': 5.0, 'rank': 2, 'duration': 123, 'amount': '18.000'},{'score': -1.0, 'rank': 0, 'duration': 23, 'amount': '-8.000'}, {'score': -2.0, 'rank': 0, 'duration': 63, 'amount': '28.000'}, ]
# Sample records; entries whose rank is 0 must end up after all ranked ones.
mylist = [
    {"score": 5.0, "rank": 2, "duration": 123, "amount": "18.000"},
    {"score": -1.0, "rank": 0, "duration": 23, "amount": "-8.000"},
    {"score": -2.0, "rank": 0, "duration": 63, "amount": "28.000"},
    {"score": 2.0, "rank": 1, "duration": 73, "amount": "18.000"},
]


def _rank_then_duration(item):
    """Return the sort key: rank (non-positive ranks count as infinity), then duration."""
    rank = item['rank']
    if rank > 0:
        effective_rank = rank
    else:
        # Infinity compares greater than every real rank, which pushes these
        # entries to the end of the ascending sort.
        effective_rank = float('inf')
    return (effective_rank, item['duration'])


sorted_list = sorted(mylist, key=_rank_then_duration)
print(sorted_list)
You should make the key function return a tuple of values based on the precedence of the sorting criteria. Since the first of your sorting criteria is in fact whether the rank is zero, make that test the first item of the tuple. Then you got the rest, namely the rank and the duration, correctly in order:
# Key tuple in order of precedence: "is the rank zero?" (False sorts before
# True, so zero-rank entries go last), then the rank, then the duration.
sorted_list = sorted(mylist, key=lambda d: (d['rank'] == 0, d['rank'], d['duration']))
I'm new to programming and I want to change the following JSON format. I want to remove the "content" keyword, as shown in the example below.
[{
"content": "abc",
'entities': [
[44, 55, "SEN"],
[27, 31, "FIN"]
]
}, {
"content": "xyz",
'entities': [
[8, 17, "FIN"]
]
}, {
"content": "klm",
'entities': [
[18, 26, "FIN"]
]
}]
to
[
('abc', {
'entities': [(44, 55, "SEN"), (27, 31, "FIN")]
}),
('xyz', {
'entities': [(8, 17, "FIN")]
}),
('klm', {
'entities': [(18, 26, "FIN")]
})
]
Please help.
Thanks
>>> data = [{
... "content": "abc",
... 'entities': [
... [44, 55, "SEN"],
... [27, 31, "FIN"]
... ]
... }, {
... "content": "xyz",
... 'entities': [
... [8, 17, "FIN"]
... ]
... }, {
... "content": "klm",
... 'entities': [
... [18, 26, "FIN"]
... ]
... }]
>>> [(dct["content"], {"entities": list(map(tuple, dct["entities"]))}) for dct in data]
[('abc', {'entities': [(44, 55, 'SEN'), (27, 31, 'FIN')]}), ('xyz', {'entities': [(8, 17, 'FIN')]}), ('klm', {'entities': [(18, 26, 'FIN')]})]
>>>
In a more readable format:
[
# 2. build a tuple...
(
# 3. whose first element is `content`
dct["content"],
# 4. and the second - a dictionary with one element
{
# 5. which is a list of entities that are converted to `tuple`
"entities": list(map(tuple, dct["entities"]))
}
)
# 1. For each dictionary...
for dct in data
]
You can use list comprehension as:
# Input records: a text plus its entity spans given as [start, end, label] lists.
lst = [
    {"content": "abc", 'entities': [[44, 55, "SEN"], [27, 31, "FIN"]]},
    {"content": "xyz", 'entities': [[8, 17, "FIN"]]},
    {"content": "klm", 'entities': [[18, 26, "FIN"]]},
]

# Turn each record into a (content, {"entities": [...]}) pair, converting every
# entity list into a tuple along the way.
output = []
for elt in lst:
    entity_tuples = list(map(tuple, elt["entities"]))
    output.append((elt["content"], {"entities": entity_tuples}))
print(output)
I'm making a standard find query to my MongoDB database, it looks like this:
MyData = pd.DataFrame(list(db.MyData.find({'datetimer': {'$gte': StartTime, '$lt': Endtime}})), columns=['price', 'amount', 'datetime'])
Now i'm trying to do another query, but it's more complicated and i don't know how to do it. Here is a sample of my data:
{"datetime": "2020-07-08 15:10", "price": 21, "amount": 90}
{"datetime": "2020-07-08 15:15", "price": 22, "amount": 50}
{"datetime": "2020-07-08 15:19", "price": 21, "amount": 40}
{"datetime": "2020-07-08 15:30", "price": 21, "amount": 90}
{"datetime": "2020-07-08 15:35", "price": 32, "amount": 50}
{"datetime": "2020-07-08 15:39", "price": 41, "amount": 40}
{"datetime": "2020-07-08 15:49", "price": 32, "amount": 40}
I need to group that data in intervals of 30 minutes and have the records distinct by price. So all the records before 15:30 must have 15:30 as datetime, and all the records before 16:00 need to have 16:00. An example of the expected output:
The previous data becomes this:
{"datetime": "2020-07-08 15:30", "price": 21, "amount": 90}
{"datetime": "2020-07-08 15:30", "price": 22, "amount": 50}
{"datetime": "2020-07-08 16:00", "price": 32, "amount": 50}
{"datetime": "2020-07-08 16:00", "price": 41, "amount": 40}
I don't know if this query is doable, so any kind of advice is appreciated. I can also do that from my code, if it's not possible to do it with a query.
I tried the code suggested here, but i got the following result, which is not the expected output:
Query = db.myData.aggregate([
{ "$group": {
"_id": {
"$toDate": {
"$subtract": [
{ "$toLong": "$datetime" },
{ "$mod": [ { "$toLong": "$datetime" }, 1000 * 60 * 15 ] }
]
}
},
"count": { "$sum": 1 }
}}
])
for x in Query:
print(x)
//OUTPUT:
{'_id': datetime.datetime(2020, 7, 7, 9, 15), 'count': 39}
{'_id': datetime.datetime(2020, 7, 6, 18, 30), 'count': 44}
{'_id': datetime.datetime(2020, 7, 7, 16, 30), 'count': 54}
{'_id': datetime.datetime(2020, 7, 7, 11, 45), 'count': 25}
{'_id': datetime.datetime(2020, 7, 6, 22, 15), 'count': 48}
{'_id': datetime.datetime(2020, 7, 7, 15, 0), 'count': 30}
...
What @Gibbs suggested is correct; you just have to modify the data a little bit.
Check if the below aggregate query is what you are looking for
# Aggregation pipeline that reduces the collection to its distinct
# (time bucket, price, amount) combinations.
Query = db.myData.aggregate([
{
# Stage 1: the compound "_id" is the grouping key, so documents that agree on
# all three fields below collapse into a single group.
"$group": {
"_id": {
"datetime":{
# Turn the bucketed numeric value back into a date.
"$toDate": {
# value - (value % bucket size) rounds the value DOWN to a multiple of the
# bucket size.
# NOTE(review): this yields the start of the interval (e.g. 15:10 -> 15:00),
# while the question asks for its end (15:30) - confirm which is wanted.
"$subtract": [
# presumably "$datetime" is stored as a date, so $toLong gives epoch
# milliseconds - verify against the collection.
{ "$toLong": "$datetime" },
# 1000 * 60 * 30 is 30 minutes expressed in milliseconds.
{ "$mod": [ { "$toLong": "$datetime" }, 1000 * 60 * 30 ] }
]
}
},
"price": "$price",
# NOTE(review): amount is part of the key, so the same price with different
# amounts inside one bucket stays as separate results.
"amount": "$amount"
},
}
},
{
# Stage 2: promote the grouping key to the top level, so each result is a flat
# {datetime, price, amount} document.
"$replaceRoot": { "newRoot": "$_id"}
}
])
for x in Query:
print(x)
I have the following problem:
I have a list of dicts (temp) and want to loop through it and check:
whether the difference between the dic["z"] values of two neighbouring entries temp[x] and temp[y] is within distance_value.
If not, insert a new dict between temp[x] and temp[y] which contains a z value of (temp[y] - temp[x]) / 2; let's name it dic_x_y.
Afterwards, fill the remaining values of the newly inserted dict (dic_x_y["t1"], dic_x_y["angle1"] and dic_x_y["material"]) with the values of the dict in temp[x].
Here is the data with the list and the variable:
distance_value = 1000
temp = [
{
"z": 1450,
"t1": 0,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 1950,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 12800,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 13000,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 25900,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 26000,
"t1": 10,
"angle1": 90,
"material": "Balsa 150"
}]
I searched a lot for my problem but could not find an answer.
I hope i could state my problem clearly and someone can help me.
Thanks a lot in advance.
I don't really know how to start, but this is roughly my idea, which I cannot get to work:
distance_value = 1000
for dic in temp:
if "dic["z"] +1 (second element of the list) - dic["z"] < distance_value:
new_dic = {"z": (dic["z"]+1 - dic["z"]), "t1": dic["t1"] , "angle1":dic["angle1"], "material":dic["material"] }
temp.insert[dic["z"]+1, new_dic]
From my json test file test.json:
[
{
"z": 1450,
"t1": 0,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 1950,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 12800,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 13000,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 25900,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 26000,
"t1": 10,
"angle1": 90,
"material": "Balsa 150"
}]
The python code:
import json
def fill_gaps(temp, distance_value):
    """Insert midpoint entries until no two neighbours are too far apart.

    The entries are ordered by their ``"z"`` value. While any two neighbours
    differ in ``"z"`` by more than ``distance_value``, a new entry is added
    halfway between them. The new entry copies every other field (``t1``,
    ``angle1``, ``material``, ...) from the left neighbour.

    Args:
        temp: List of dicts, each with a numeric ``"z"`` key. The list itself
            is not modified.
        distance_value: Largest allowed ``"z"`` difference between neighbours;
            must be positive.

    Returns:
        A new list, sorted by ``"z"``, containing the original dicts plus the
        inserted ones.

    Raises:
        ValueError: If ``distance_value`` is not positive, since halving could
            then never bring a gap within the limit.
    """
    if distance_value <= 0:
        raise ValueError("distance_value must be positive")
    temp = sorted(temp, key=lambda k: k['z'])
    Continue = True
    while Continue:
        # Assume this pass finds nothing to fix; that also ends the loop for
        # lists with fewer than two entries.
        Continue = False
        for i in range(len(temp) - 1):
            if temp[i+1]['z'] - temp[i]['z'] > distance_value:
                Continue = True
                # Copy all fields from the left neighbour rather than
                # hard-coding any of them; only "z" is replaced by the midpoint.
                new_dic = {**temp[i], "z": (temp[i+1]['z'] + temp[i]['z'])/2.}
                temp.append(new_dic)
                # Re-sort so the new entry sits between its neighbours before
                # the next pass checks the list again.
                temp.sort(key=lambda k: k['z'])
                break
    return temp


def main():
    """Read test.json, fill the gaps and print the result as formatted JSON."""
    with open('test.json') as f:
        temp = json.load(f)
    distance_value = 1000
    temp = fill_gaps(temp, distance_value)
    temp_as_string = json.dumps(temp, sort_keys=True, indent=4, separators=(',', ': '))
    print(temp_as_string)


if __name__ == "__main__":
    main()
my output:
[
[
{
"z": 1450,
"t1": 0,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 1950,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 2628.125,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 3306.25,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 3984.375,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 4662.5,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 5340.625,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 6018.75,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 6696.875,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 7375.0,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 8053.125,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 8731.25,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 9409.375,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 10087.5,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 10765.625,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 11443.75,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 12121.875,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 12800,
"t1": 25,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 13000,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 13806.25,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 14612.5,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 15418.75,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 16225.0,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 17031.25,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 17837.5,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 18643.75,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 19450.0,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 20256.25,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 21062.5,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 21868.75,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 22675.0,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 23481.25,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 24287.5,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 25093.75,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 25900,
"t1": 15,
"angle1": 90,
"material": "Balsa 150"
},
{
"z": 26000,
"t1": 10,
"angle1": 90,
"material": "Balsa 150"
}
]
[Finished in 0.096s]
The logic is as follows:
Run a while loop whose condition, Continue, answers the question: should I keep looping over my dictionaries and checking them?
Inside the while loop, loop over the current list items and check whether any list[i+1]['z'] - list[i]['z'] is greater than the set distance (this is the checking loop).
If such a pair is found, make a new dict with the midpoint z value, append it, re-sort the list (this is very important) and then break out of the for loop (i.e. leave the checking loop at the first occurrence); because of the break, the Continue condition for the while loop is still true.
At a later stage, when the for loop goes through the whole list and the condition is never met, Continue is false and the while loop ends.