Store rows of DataFrame with certain value in list - python

I have a DataFrame like:
id
country
city
amount
duplicated
1
France
Paris
200
1
2
France
Paris
200
1
3
France
Lyon
50
2
4
France
Lyon
50
2
5
France
Lyon
50
2
And I would like to store a list per distinct value in duplicated, like:
list 1
[
{
"id": 1,
"country": "France",
"city": "Paris",
"amount": 200,
},
{
"id": 2,
"country": "France",
"city": "Paris",
"amount": 200,
}
]
list 2
[
{
"id": 3,
"country": "France",
"city": "Lyon",
"amount": 50,
},
{
"id": 4,
"country": "France",
"city": "Lyon",
"amount": 50,
},
{
"id": 5,
"country": "France",
"city": "Lyon",
"amount": 50,
}
]
I tried filtering duplicates with
df[df.duplicated(['country','city','amount', 'duplicated'], keep = False)]
but it just returns the same df.

You can use groupby:
lst = (df.groupby(['country', 'city', 'amount']) # or .groupby('duplicated')
.apply(lambda x: x.to_dict('records'))
.tolist())
Output:
>>> lst
[[{'id': 3,
'country': 'France',
'city': 'Lyon',
'amount': 50,
'duplicated': 2},
{'id': 4,
'country': 'France',
'city': 'Lyon',
'amount': 50,
'duplicated': 2},
{'id': 5,
'country': 'France',
'city': 'Lyon',
'amount': 50,
'duplicated': 2}],
[{'id': 1,
'country': 'France',
'city': 'Paris',
'amount': 200,
'duplicated': 1},
{'id': 2,
'country': 'France',
'city': 'Paris',
'amount': 200,
'duplicated': 1}]]
Another solution if you want a dict indexed by duplicated key:
data = {k: v.to_dict('records') for k, v in df.set_index('duplicated').groupby(level=0)}
>>> data[1]
[{'id': 1, 'country': 'France', 'city': 'Paris', 'amount': 200},
{'id': 2, 'country': 'France', 'city': 'Paris', 'amount': 200}]
>>> data[2]
[{'id': 3, 'country': 'France', 'city': 'Lyon', 'amount': 50},
{'id': 4, 'country': 'France', 'city': 'Lyon', 'amount': 50},
{'id': 5, 'country': 'France', 'city': 'Lyon', 'amount': 50}]

If I understand you correctly, you can use DataFrame.to_dict('records') to make your lists:
# One list of row dicts per value of the 'duplicated' column.
list_1 = df[df['duplicated'] == 1].to_dict('records')
list_2 = df[df['duplicated'] == 2].to_dict('records')
Or for an arbitrary number of values in the column, you can make a dict:
# Map each distinct value of 'duplicated' to the list of its rows (as dicts),
# in order of first appearance of the value.
result = {
    value: df[df['duplicated'] == value].to_dict('records')
    for value in df['duplicated'].unique()
}

Related

How to groupby columns by value and make json from them? Python3 Pandas

I have a dataset containing all the professors in Turkey. I need to change the shape of this data structure, but I couldn't find a solution. In this data, there is information about the university, faculty, department and title of approximately 44 thousand academicians.
[ { "name": "XX", "title": "PROFESÖR", "university": "GEBZE TEKNİK ÜNİVERSİTESİ", "faculty": "MÜHENDİSLİK FAKÜLTESİ", "department": "BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ" }, { "name": "YY", "title": "PROFESÖR", "university": "GEBZE TEKNİK ÜNİVERSİTESİ", "faculty": "MÜHENDİSLİK FAKÜLTESİ", "department": "ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ" } ]
I have 44,000 records like the ones above and I want to process them. For example, there are nearly 200 universities, and I want to separate them.
{ "universities": [ { "id": 1, "name": "GEBZE TEKNİK ÜNİVERSİTESİ", "faculties": [ { "id" : 1, "name": "MÜHENDİSLİK FAKÜLTESİ", "departments" : [ { "id" : 1, "name" : "ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ", "academicians" : [ { "id" : 1, "name":"AA", "title" : "PROFESÖR" }, { "id" : 2, "name":"BB", "title" : "PROFESÖR" }, { "id" : 3, "name":"CC", "title" : "PROFESÖR" } ] }, { "id" : 2, "name" : "HARİTA MÜHENDİSLİĞİ BÖLÜMÜ", "academicians" : [ { "id" : 1, "name":"DD", "title" : "PROFESÖR" }, { "id" : 2, "name":"EE", "title" : "PROFESÖR" } ] } ] } ] } ] }
I want it as in the above format but I couldn't get it done. Can anyone help?
1.) get json datas
js_output = """{'universities': [{'id': 1,
'name': 'GEBZE TEKNİK ÜNİVERSİTESİ',
'faculties': [{'id': 1,
'name': 'MÜHENDİSLİK FAKÜLTESİ',
'departments': [{'id': 1,
'name': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ',
'academicians': [{'id': 1, 'name': 'AA', 'title': 'PROFESÖR'},
{'id': 2, 'name': 'BB', 'title': 'PROFESÖR'},
{'id': 3, 'name': 'CC', 'title': 'PROFESÖR'}]},
{'id': 2,
'name': 'HARİTA MÜHENDİSLİĞİ BÖLÜMÜ',
'academicians': [{'id': 1, 'name': 'DD', 'title': 'PROFESÖR'},
{'id': 2, 'name': 'EE', 'title': 'PROFESÖR'}]}]}]}]}"""
js_input = """[{'name': 'XX',
'title': 'PROFESÖR',
'university': 'GEBZE TEKNİK ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK FAKÜLTESİ',
'department': 'BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'YY',
'title': 'PROFESÖR',
'university': 'GEBZE TEKNİK ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK FAKÜLTESİ',
'department': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ'}]"""
2.) set json normalize
# get record_path with json keys and get frame list
df_list = []


def get_frames(l, j):
    """Normalize ``j`` once per nesting level and collect the frames.

    For every prefix of ``l`` (``l[:1]``, ``l[:2]``, ...), the nested document
    ``j`` is flattened with ``pd.json_normalize`` using that prefix as the
    ``record_path``. Each resulting DataFrame is appended to the module-level
    ``df_list``, outermost level first.

    Parameters
    ----------
    l : list of str
        Keys leading from the root of ``j`` down to the innermost record list.
    j : dict or list of dict
        The nested JSON-like document to flatten.
    """
    for depth in range(1, len(l) + 1):
        df_list.append(pd.json_normalize(j, l[:depth]))
records = ["universities", "faculties", "departments", "academicians"]
jdo = json.loads(js_output.replace("'",'"'))
get_frames(records, jdo)
3.) concatenate all frames
# Every frame but the last ends with a column that still holds the nested
# records of the next level; drop it, because the next frame expands it.
frames = [frame.iloc[:, :-1] for frame in df_list[:-1]]
frames.append(df_list[-1])  # innermost level: keep every column
# Join all levels side by side (aligned on the row index) in a single call
# instead of growing the frame with one concat per iteration.
con = pd.concat(frames, axis=1)
4.) Drop rows containing NaN, because the example frame is only an output template
df = con.dropna().copy()
5.) design columns and match input keys for next concatenates
df.columns = [
"uni_id",
"university",
"faculty_id",
"faculty",
"department_id",
"department",
"aca_id",
"name",
"title"
]
6.) Recompute the id columns and join the input frame with the template
def input_join_to_get_desired_template(jdi, template=None):
    """Append input records to the template frame and renumber every id.

    Parameters
    ----------
    jdi : list of dict
        Flat records with the keys ``name``, ``title``, ``university``,
        ``faculty`` and ``department``.
    template : pandas.DataFrame, optional
        Frame the records are appended to. When omitted, the module-level
        ``df`` built in the previous step is used.

    Returns
    -------
    pandas.DataFrame
        The combined rows with ``uni_id``, ``faculty_id`` and
        ``department_id`` numbered from 1 in order of first appearance of
        each name, and ``aca_id`` numbered from 1 within each
        (university, faculty, department) group.
    """
    if template is None:
        template = df
    jdf = pd.DataFrame(jdi)
    con_df = pd.concat([template, jdf], ignore_index=True, sort=False)

    # factorize() labels values 0, 1, 2, ... in order of first appearance,
    # which matches the position in .unique() without a per-row list search.
    id_sources = (
        ("uni_id", "university"),
        ("faculty_id", "faculty"),
        ("department_id", "department"),
    )
    for id_col, name_col in id_sources:
        con_df[id_col] = pd.factorize(con_df[name_col])[0] + 1

    # Running 1-based position of each academician inside its department.
    group_cols = ["uni_id", "faculty_id", "department_id"]
    con_df["aca_id"] = con_df.groupby(group_cols).cumcount() + 1
    return con_df
jd_input = json.loads(js_input.replace("'",'"'))
result_df = input_join_to_get_desired_template(jd_input)
result_df
7.) get other inputs and test
js_input_test = """[{'name': 'hl',
'title': 'doc',
'university': 'GEBZE ÜNİVERSİTESİ',
'faculty': 'FAKÜLTESİ',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'hl',
'title': 'PROFESÖR',
'university': 'GEBZE ÜNİVERSİTESİ',
'faculty': 'FAKÜLTESİ',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'yz',
'title': 'yrddoc',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'zz',
'title': 'doc',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'hl',
'title': 'PROFESÖR',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'abc',
'title': 'prof',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'HARİTA MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'aaa',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'bbb',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'ccc',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'ddd',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK FAKÜLTESİ',
'department': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ'}]"""
8.) and get results
jd_input = json.loads(js_input_test.replace("'",'"'))
result_df = input_join_to_get_desired_template(jd_input)
result_df

Pandas to JSON Within Groups

I have the following pandas dataframe. I want to output a json object but nested within State first and then City. The Code, Name, and Rank variables all become triplets to make a list of dictionaries.
MWE
import pandas as pd
df = pd.DataFrame({
'State': ['PA', 'PA', 'PA', 'PA', 'PA', 'PA', 'PA', 'NY', 'NY', 'NY', 'NY', 'NY', 'NY', 'ME', 'ME', 'ME' ],
'City': ['Philadelphia', 'Philadelphia', 'Philadelphia', 'Philadelphia', 'Scranton', 'Scranton', 'Williamsport', 'Buffalo', 'Buffalo', 'Buffalo', 'Buffalo', 'Albany', 'Albany', 'Portland', 'Portland', 'Ogunquit'],
'Code': [10, 20, 30, 40, 50, 60, 10, 20, 30, 40, 50, 10, 20, 30, 40, 30],
'Name': ['A', 'B', 'C', 'D', 'E', 'F', 'A', 'B', 'C', 'D', 'E', 'A', 'B', 'C', 'D', 'C'],
'Rank': [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 1, 2, 3, 4, 3]
})
df
I got to here but this was not close to where I want it to be.
df.groupby(['State', 'City']).apply(lambda x: x[['Code', 'Name', 'Rank']].to_json(orient='records', indent = 4))
Desired Output
[
{
"State": "PA",
"City": "Philadelphia",
"List": [
{
"Code": 10,
"Name": "A",
"Rank": 1
},
{
"Code": 20,
"Name": "B",
"Rank": 2
},
{
"Code": 30,
"Name": "C",
"Rank": 3
},
{
"Code": 40,
"Name": "D",
"Rank": 4
}
]
},
{
"State": "PA",
"City": "Scranton",
"List": [
{
"Code": 50,
"Name": "E",
"Rank": 5
},
{
"Code": 60,
"Name": "F",
"Rank": 6
}
]
},
{
"State": "PA",
"City": "Williamsport",
"List": [
{
"Code": 10,
"Name": "A",
"Rank": 1
}
]
},
{
"State": "NY",
"City": "Albany",
"List": [
{
"Code": 10,
"Name": "A",
"Rank": 1
},
{
"Code": 20,
"Name": "B",
"Rank": 2
}
]
},
{
"State": "NY",
"City": "Buffalo",
"List": [
{
"Code": 20,
"Name": "B",
"Rank": 2
},
{
"Code": 30,
"Name": "C",
"Rank": 3
},
{
"Code": 40,
"Name": "D",
"Rank": 4
},
{
"Code": 50,
"Name": "E",
"Rank": 5
}
]
},
{
"State": "ME",
"City": "Portland",
"List": [
{
"Code": 30,
"Name": "C",
"Rank": 3
},
{
"Code": 40,
"Name": "D",
"Rank": 4
}
]
},
{
"State": "ME",
"City": "Ogunquit",
"List": [
{
"Code": 30,
"Name": "C",
"Rank": 3
}
]
}
]
IIUC, you can try:
df["List"] = df[["Code", "Name", "Rank"]].to_dict("records")
grouped = df.groupby(["State", "City"])["List"].apply(list).reset_index()
json_obj = grouped.to_json(orient="records")
>>>json_obj
'[{"State":"ME",
"City":"Ogunquit",
"List":[{"Code":30,"Name":"C","Rank":3}]},
{"State":"ME",
"City":"Portland",
"List":[{"Code":30,"Name":"C","Rank":3},
{"Code":40,"Name":"D","Rank":4}]},
{"State":"NY",
"City":"Albany",
"List":[{"Code":10,"Name":"A","Rank":1},
{"Code":20,"Name":"B","Rank":2}]},
{"State":"NY",
"City":"Buffalo",
"List":[{"Code":20,"Name":"B","Rank":2},
{"Code":30,"Name":"C","Rank":3},
{"Code":40,"Name":"D","Rank":4},
{"Code":50,"Name":"E","Rank":5}]},
{"State":"PA",
"City":"Philadelphia",
"List":[{"Code":10,"Name":"A","Rank":1},
{"Code":20,"Name":"B","Rank":2},
{"Code":30,"Name":"C","Rank":3},
{"Code":40,"Name":"D","Rank":4}]},
{"State":"PA",
"City":"Scranton",
"List":[{"Code":50,"Name":"E","Rank":5},
{"Code":60,"Name":"F","Rank":6}]},
{"State":"PA",
"City":"Williamsport",
"List":[{"Code":10,"Name":"A","Rank":1}]}]'
Try:
df.groupby(["State", "City"]).apply(
lambda x: x[["Code", "Name", "Rank"]].to_dict("records")
).reset_index(name="List").to_json(orient="records")
Output:
[{'State': 'ME',
'City': 'Ogunquit',
'List': [{'Code': 30, 'Name': 'C', 'Rank': 3}]},
{'State': 'ME',
'City': 'Portland',
'List': [{'Code': 30, 'Name': 'C', 'Rank': 3},
{'Code': 40, 'Name': 'D', 'Rank': 4}]},
{'State': 'NY',
'City': 'Albany',
'List': [{'Code': 10, 'Name': 'A', 'Rank': 1},
{'Code': 20, 'Name': 'B', 'Rank': 2}]},
{'State': 'NY',
'City': 'Buffalo',
'List': [{'Code': 20, 'Name': 'B', 'Rank': 2},
{'Code': 30, 'Name': 'C', 'Rank': 3},
{'Code': 40, 'Name': 'D', 'Rank': 4},
{'Code': 50, 'Name': 'E', 'Rank': 5}]},
{'State': 'PA',
'City': 'Philadelphia',
'List': [{'Code': 10, 'Name': 'A', 'Rank': 1},
{'Code': 20, 'Name': 'B', 'Rank': 2},
{'Code': 30, 'Name': 'C', 'Rank': 3},
{'Code': 40, 'Name': 'D', 'Rank': 4}]},
{'State': 'PA',
'City': 'Scranton',
'List': [{'Code': 50, 'Name': 'E', 'Rank': 5},
{'Code': 60, 'Name': 'F', 'Rank': 6}]},
{'State': 'PA',
'City': 'Williamsport',
'List': [{'Code': 10, 'Name': 'A', 'Rank': 1}]}]

creating df to generate json in the given format

I am trying to generate a df to produce this below json.
Json data:
{
"name": "flare",
"children": [
{
"name": "K1",
"children": [
{"name": "Exact", "size": 4},
{"name": "synonyms", "size": 14}
]
},
{
"name": "K2",
"children": [
{"name": "Exact", "size": 10},
{"name": "synonyms", "size": 20}
]
},
{
"name": "K3",
"children": [
{"name": "Exact", "size": 0},
{"name": "synonyms", "size": 5}
]
},
{
"name": "K4",
"children": [
{"name": "Exact", "size": 13},
{"name": "synonyms", "size": 15}
]
},
{
"name": "K5",
"children": [
{"name": "Exact", "size": 0},
{"name": "synonyms", "size": 0}
]
}
]
}
input data:
name Exact synonyms
K1 4 14
K2 10 20
K3 0 5
K4 13 15
K5 0 0
I tried creating df with values in the json but I was not able to get the desired json on df.to_json, please help.
You need to reshape the data with set_index + stack, and then use groupby with apply to build the nested list of dicts:
import json
# Reshape the wide table into long form, then gather each name's rows into a
# list of {'name': ..., 'size': ...} dicts stored in a 'children' column.
df = (df.set_index('name')
.stack()  # one value per (name, original column) pair
.reset_index(level=1)  # the original column labels become the 'level_1' column
.rename(columns={'level_1':'name', 0:'size'})
.groupby(level=0).apply(lambda x: x.to_dict(orient='records'))  # group on the index, i.e. the original 'name'
.reset_index(name='children')
)
print (df)
name children
0 K1 [{'name': 'Exact', 'size': 4}, {'name': 'synon...
1 K2 [{'name': 'Exact', 'size': 10}, {'name': 'syno...
2 K3 [{'name': 'Exact', 'size': 0}, {'name': 'synon...
3 K4 [{'name': 'Exact', 'size': 13}, {'name': 'syno...
4 K5 [{'name': 'Exact', 'size': 0}, {'name': 'synon...
#convert output to dict
j = { "name": "flare", "children": df.to_dict(orient='records')}
#for nice output - easier check
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(j)
{ 'children': [ { 'children': [ {'name': 'Exact', 'size': 4},
{'name': 'synonyms', 'size': 14}],
'name': 'K1'},
{ 'children': [ {'name': 'Exact', 'size': 10},
{'name': 'synonyms', 'size': 20}],
'name': 'K2'},
{ 'children': [ {'name': 'Exact', 'size': 0},
{'name': 'synonyms', 'size': 5}],
'name': 'K3'},
{ 'children': [ {'name': 'Exact', 'size': 13},
{'name': 'synonyms', 'size': 15}],
'name': 'K4'},
{ 'children': [ {'name': 'Exact', 'size': 0},
{'name': 'synonyms', 'size': 0}],
'name': 'K5'}],
'name': 'flare'}
#convert data to json and write to file
with open('data.json', 'w') as outfile:
    json.dump(j, outfile)

python map array of dictionaries to dictionary?

I've got an array of dictionaries that looks like this:
[
{ 'country': 'UK', 'city': 'Manchester' },
{ 'country': 'UK', 'city': 'Liverpool' },
{ 'country': 'France', 'city': 'Paris' } ...
]
And I want to end up with a dictionary like this:
{ 'Liverpool': 'UK', 'Manchester': 'UK', ... }
Obviously I can do this:
d = {}
for c in cities:
d[c['city']] = c['country']
But is there any way I could do it with a single-line map?
You can use a dict comprehension :
>>> li = [
... { 'country': 'UK', 'city': 'Manchester' },
... { 'country': 'UK', 'city': 'Liverpool' },
... { 'country': 'France', 'city': 'Paris' }
... ]
>>> {d['city']: d['country'] for d in li}
{'Paris': 'France', 'Liverpool': 'UK', 'Manchester': 'UK'}
Or use operator.itemgetter and the map function:
>>> dict(map(operator.itemgetter('city','country'),li))
{'Paris': 'France', 'Liverpool': 'UK', 'Manchester': 'UK'}

Dictionary of Dictionaries : Sorting by a specific key

I have a dictionary that looks like this
{'Africa': {'Name': 'Africa',
'men': 33333,
'priority': 3,
'women': 30000},
'America': {'Name': 'USA',
'men': 1114444411333L,
'priority': 4,
'women': 44430000},
'Asia': {'Name': 'China',
'men': 444433333,
'priority': 2,
'women': 444430000},
'Europe': {'Name': 'UK',
'men': 11111333,
'priority': 1,
'women': 1111430000}}
I need to sort this dictionary by Key = Priority
I'm using 2.7 and have tried few options (which dont look very elegant). Any suggestions?
>>> d = {"Africa" :
{ "Name" : "Africa", "men": 33333, "women" : 30000, "priority" :3},
"Asia":
{ "Name" : "China", "men": 444433333, "women" : 444430000, "priority" :2},
"Europe":
{ "Name" : "UK", "men": 11111333, "women" : 1111430000, "priority" :1},
"America":
{ "Name" : "USA", "men": 1114444411333, "women" : 44430000, "priority" :4}
}
>>> from collections import OrderedDict
>>> OrderedDict(sorted(d.items(), key=lambda x: x[1]['priority']))
OrderedDict([('Europe', {'priority': 1, 'men': 11111333, 'Name': 'UK', 'women': 1111430000}), ('Asia', {'priority': 2, 'men': 444433333, 'Name': 'China', 'women': 444430000}), ('Africa', {'priority': 3, 'men': 33333, 'Name': 'Africa', 'women': 30000}), ('America', {'priority': 4, 'men': 1114444411333L, 'Name': 'USA', 'women': 44430000})])
It is not possible to sort a dict. You can't get a sorted dictionary, but you can convert it to a sorted list of tuples. Here is another way of sorting it:
from operator import itemgetter

# Region -> statistics record; every record carries a numeric 'priority'.
# The large 'men' value is a plain integer literal (no Python 2 `L` suffix),
# so the snippet runs unchanged on Python 2.7 and Python 3.
data = {
    'Africa': {'Name': 'Africa',
               'men': 33333,
               'priority': 3,
               'women': 30000},
    'America': {'Name': 'USA',
                'men': 1114444411333,
                'priority': 4,
                'women': 44430000},
    'Asia': {'Name': 'China',
             'men': 444433333,
             'priority': 2,
             'women': 444430000},
    'Europe': {'Name': 'UK',
               'men': 11111333,
               'priority': 1,
               'women': 1111430000},
}

# Build a list of (key, record) tuples ordered by each record's 'priority'.
# One sort with the correct key is enough; no second pass is needed.
priority_of = itemgetter('priority')
listOfTuple = sorted(data.items(), key=lambda item: priority_of(item[1]))
print(listOfTuple)
>>>[('Europe', {'priority': 1, 'men': 11111333, 'Name': 'UK', 'women': 1111430000}), ('Asia', {'priority': 2, 'men': 444433333, 'Name': 'China', 'women': 444430000}), ('Africa', {'priority': 3, 'men': 33333, 'Name': 'Africa', 'women': 30000}), ('America', {'priority': 4, 'men': 1114444411333L, 'Name': 'USA', 'women': 44430000})]

Categories