Store rows of DataFrame with certain value in list

Store rows of DataFrame with certain value in list - python

I have a DataFrame like:
id
country
city
amount
duplicated
1
France
Paris
200
1
2
France
Paris
200
1
3
France
Lyon
50
2
4
France
Lyon
50
2
5
France
Lyon
50
2
And I would like to store a list per distinct value in duplicated, like:
list 1
[
{
"id": 1,
"country": "France",
"city": "Paris",
"amount": 200,
},
{
"id": 2,
"country": "France",
"city": "Paris",
"amount": 200,
}
]
list 2
[
{
"id": 3,
"country": "France",
"city": "Lyon",
"amount": 50,
},
{
"id": 4,
"country": "France",
"city": "Lyon",
"amount": 50,
},
{
"id": 5,
"country": "France",
"city": "Lyon",
"amount": 50,
}
]
I tried filtering duplicates with
df[df.duplicated(['country','city','amount', 'duplicated'], keep = False)]
but it just returns the same df.

You can use groupby:
lst = (df.groupby(['country', 'city', 'amount']) # or .groupby('duplicated')
.apply(lambda x: x.to_dict('records'))
.tolist())
Output:
>>> lst
[[{'id': 3,
'country': 'France',
'city': 'Lyon',
'amount': 50,
'duplicated': 2},
{'id': 4,
'country': 'France',
'city': 'Lyon',
'amount': 50,
'duplicated': 2},
{'id': 5,
'country': 'France',
'city': 'Lyon',
'amount': 50,
'duplicated': 2}],
[{'id': 1,
'country': 'France',
'city': 'Paris',
'amount': 200,
'duplicated': 1},
{'id': 2,
'country': 'France',
'city': 'Paris',
'amount': 200,
'duplicated': 1}]]
Another solution if you want a dict indexed by duplicated key:
data = {k: v.to_dict('records') for k, v in df.set_index('duplicated').groupby(level=0)}
>>> data[1]
[{'id': 1, 'country': 'France', 'city': 'Paris', 'amount': 200},
{'id': 2, 'country': 'France', 'city': 'Paris', 'amount': 200}]
>>> data[2]
[{'id': 3, 'country': 'France', 'city': 'Lyon', 'amount': 50},
{'id': 4, 'country': 'France', 'city': 'Lyon', 'amount': 50},
{'id': 5, 'country': 'France', 'city': 'Lyon', 'amount': 50}]

If I understand you correctly, you can use DataFrame.to_dict('records') to make your lists:
# One list of row-dicts per value of the 'duplicated' column.
list_1 = df[df['duplicated'] == 1].to_dict('records')
list_2 = df[df['duplicated'] == 2].to_dict('records')
Or for an arbitrary number of values in the column, you can make a dict:
result = {}
for value in df['duplicated'].unique():
result[value] = df[df['duplicated'] == value].to_dict('records')

Related

How to groupby columns by value and make json from them? Python3 Pandas

I have a dataset containing all the professors in Turkey. I need to change the shape of this data structure, but I couldn't find a solution. In this data, there is information about the university, faculty, department and title of approximately 44 thousand academicians.
[ { "name": "XX", "title": "PROFESÖR", "university": "GEBZE TEKNİK ÜNİVERSİTESİ", "faculty": "MÜHENDİSLİK FAKÜLTESİ", "department": "BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ" }, { "name": "YY", "title": "PROFESÖR", "university": "GEBZE TEKNİK ÜNİVERSİTESİ", "faculty": "MÜHENDİSLİK FAKÜLTESİ", "department": "ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ" } ]
I have 44,000 records like the ones above and I want to process them. For example, there are nearly 200 universities, and I want to group the records by university.
{ "universities": [ { "id": 1, "name": "GEBZE TEKNİK ÜNİVERSİTESİ", "faculties": [ { "id" : 1, "name": "MÜHENDİSLİK FAKÜLTESİ", "departments" : [ { "id" : 1, "name" : "ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ", "academicians" : [ { "id" : 1, "name":"AA", "title" : "PROFESÖR" }, { "id" : 2, "name":"BB", "title" : "PROFESÖR" }, { "id" : 3, "name":"CC", "title" : "PROFESÖR" } ] }, { "id" : 2, "name" : "HARİTA MÜHENDİSLİĞİ BÖLÜMÜ", "academicians" : [ { "id" : 1, "name":"DD", "title" : "PROFESÖR" }, { "id" : 2, "name":"EE", "title" : "PROFESÖR" } ] } ] } ] } ] }
I want it as in the above format but I couldn't get it done. Can anyone help?

1.) get the JSON data
js_output = """{'universities': [{'id': 1,
'name': 'GEBZE TEKNİK ÜNİVERSİTESİ',
'faculties': [{'id': 1,
'name': 'MÜHENDİSLİK FAKÜLTESİ',
'departments': [{'id': 1,
'name': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ',
'academicians': [{'id': 1, 'name': 'AA', 'title': 'PROFESÖR'},
{'id': 2, 'name': 'BB', 'title': 'PROFESÖR'},
{'id': 3, 'name': 'CC', 'title': 'PROFESÖR'}]},
{'id': 2,
'name': 'HARİTA MÜHENDİSLİĞİ BÖLÜMÜ',
'academicians': [{'id': 1, 'name': 'DD', 'title': 'PROFESÖR'},
{'id': 2, 'name': 'EE', 'title': 'PROFESÖR'}]}]}]}]}"""
js_input = """[{'name': 'XX',
'title': 'PROFESÖR',
'university': 'GEBZE TEKNİK ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK FAKÜLTESİ',
'department': 'BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'YY',
'title': 'PROFESÖR',
'university': 'GEBZE TEKNİK ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK FAKÜLTESİ',
'department': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ'}]"""
2.) set json normalize
# get record_path with json keys and get frame list
df_list = []


def get_frames(l, j):
    """Append one normalized frame per nesting depth of ``l`` to ``df_list``.

    For every prefix of the key path ``l`` (first key, first two keys, ...)
    ``pd.json_normalize(j, prefix)`` is called and the resulting frame is
    appended to the module-level ``df_list``, shallowest level first.

    Args:
        l: ordered list of nested record keys, outermost first.
        j: parsed JSON object passed to ``pd.json_normalize``.

    Returns:
        None. The frames are collected in ``df_list``.
    """
    for depth in range(1, len(l) + 1):
        df_list.append(pd.json_normalize(j, l[:depth]))
records = ["universities", "faculties", "departments", "academicians"]
jdo = json.loads(js_output.replace("'",'"'))
get_frames(records, jdo)
3.) concatenate all frames
con = pd.DataFrame()
for df in df_list[:-1]: # because last item is dict and must be opened next step
con = pd.concat([con, df.iloc[:,:-1]], axis=1)
con = pd.concat([con, df_list[-1]], axis=1)
4.) drop the rows containing NaN, because the example frame is only the output template
df = con.dropna().copy()
5.) design columns and match input keys for next concatenates
df.columns = [
"uni_id",
"university",
"faculty_id",
"faculty",
"department_id",
"department",
"aca_id",
"name",
"title"
]
6.) recompute the id columns and join the input frame with the template
def input_join_to_get_desired_template(jdi, template=None):
    """Append input records to the template frame and renumber all id columns.

    Args:
        jdi: list of record dicts; the code reads the ``university``,
            ``faculty`` and ``department`` keys of the combined frame.
        template: frame the records are appended to. When omitted, the
            module-level ``df`` is used, as in the original version.

    Returns:
        A new DataFrame holding the template rows followed by the input rows.
        ``uni_id``, ``faculty_id`` and ``department_id`` number the distinct
        values of their name column from 1, in order of first appearance over
        the whole frame. ``aca_id`` is a 1-based running counter within each
        (uni_id, faculty_id, department_id) group.
    """
    if template is None:
        # Backward-compatible default: the module-level template frame.
        template = df
    con_df = pd.concat(
        [template, pd.DataFrame(jdi)], ignore_index=True, sort=False
    )

    # factorize assigns codes in order of first appearance, which matches
    # unique() + list.index() without the quadratic per-row lookup.
    # NOTE(review): a missing name gets id 0 here (factorize's -1 sentinel
    # plus 1) -- confirm that is acceptable for inputs with missing keys.
    id_to_name = (
        ("uni_id", "university"),
        ("faculty_id", "faculty"),
        ("department_id", "department"),
    )
    for id_col, name_col in id_to_name:
        con_df[id_col] = pd.factorize(con_df[name_col])[0] + 1

    # Number the academicians inside each university/faculty/department.
    group_keys = ["uni_id", "faculty_id", "department_id"]
    con_df["aca_id"] = con_df.groupby(group_keys).cumcount() + 1
    return con_df
jd_input = json.loads(js_input.replace("'",'"'))
result_df = input_join_to_get_desired_template(jd_input)
result_df
7.) get other inputs and test
js_input_test = """[{'name': 'hl',
'title': 'doc',
'university': 'GEBZE ÜNİVERSİTESİ',
'faculty': 'FAKÜLTESİ',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'hl',
'title': 'PROFESÖR',
'university': 'GEBZE ÜNİVERSİTESİ',
'faculty': 'FAKÜLTESİ',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'yz',
'title': 'yrddoc',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'zz',
'title': 'doc',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'hl',
'title': 'PROFESÖR',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'BİLGİSAYAR MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'abc',
'title': 'prof',
'university': 'TEKNİK ÜNİVERSİTESİ',
'faculty': 'other',
'department': 'HARİTA MÜHENDİSLİĞİ BÖLÜMÜ'},
{'name': 'aaa',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'bbb',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'ccc',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK',
'department': 'BİLGİSAYAR BÖLÜMÜ'},
{'name': 'ddd',
'title': 'PROFESÖR',
'university': 'ÜNİVERSİTESİ',
'faculty': 'MÜHENDİSLİK FAKÜLTESİ',
'department': 'ELEKTRONİK MÜHENDİSLİĞİ BÖLÜMÜ'}]"""
8.) and get results
jd_input = json.loads(js_input_test.replace("'",'"'))
result_df = input_join_to_get_desired_template(jd_input)
result_df

Pandas to JSON Within Groups

I have the following pandas dataframe. I want to output a json object but nested within State first and then City. The Code, Name, and Rank variables all become triplets to make a list of dictionaries.
MWE
import pandas as pd
df = pd.DataFrame({
'State': ['PA', 'PA', 'PA', 'PA', 'PA', 'PA', 'PA', 'NY', 'NY', 'NY', 'NY', 'NY', 'NY', 'ME', 'ME', 'ME' ],
'City': ['Philadelphia', 'Philadelphia', 'Philadelphia', 'Philadelphia', 'Scranton', 'Scranton', 'Williamsport', 'Buffalo', 'Buffalo', 'Buffalo', 'Buffalo', 'Albany', 'Albany', 'Portland', 'Portland', 'Ogunquit'],
'Code': [10, 20, 30, 40, 50, 60, 10, 20, 30, 40, 50, 10, 20, 30, 40, 30],
'Name': ['A', 'B', 'C', 'D', 'E', 'F', 'A', 'B', 'C', 'D', 'E', 'A', 'B', 'C', 'D', 'C'],
'Rank': [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 1, 2, 3, 4, 3]
})
df
I got to here but this was not close to where I want it to be.
df.groupby(['State', 'City']).apply(lambda x: x[['Code', 'Name', 'Rank']].to_json(orient='records', indent = 4))
Desired Output
[
{
"State": "PA",
"City": "Philadelphia",
"List": [
{
"Code": 10,
"Name": "A",
"Rank": 1
},
{
"Code": 20,
"Name": "B",
"Rank": 2
},
{
"Code": 30,
"Name": "C",
"Rank": 3
},
{
"Code": 40,
"Name": "D",
"Rank": 4
}
]
},
{
"State": "PA",
"City": "Scranton",
"List": [
{
"Code": 50,
"Name": "E",
"Rank": 5
},
{
"Code": 60,
"Name": "F",
"Rank": 6
}
]
},
{
"State": "PA",
"City": "Williamsport",
"List": [
{
"Code": 10,
"Name": "A",
"Rank": 1
}
]
},
{
"State": "NY",
"City": "Albany",
"List": [
{
"Code": 10,
"Name": "A",
"Rank": 1
},
{
"Code": 20,
"Name": "B",
"Rank": 2
}
]
},
{
"State": "NY",
"City": "Buffalo",
"List": [
{
"Code": 20,
"Name": "B",
"Rank": 2
},
{
"Code": 30,
"Name": "C",
"Rank": 3
},
{
"Code": 40,
"Name": "D",
"Rank": 4
},
{
"Code": 50,
"Name": "E",
"Rank": 5
}
]
},
{
"State": "ME",
"City": "Portland",
"List": [
{
"Code": 30,
"Name": "C",
"Rank": 3
},
{
"Code": 40,
"Name": "D",
"Rank": 4
}
]
},
{
"State": "ME",
"City": "Ogunquit",
"List": [
{
"Code": 30,
"Name": "C",
"Rank": 3
}
]
}
]

IIUC, you can try:
df["List"] = df[["Code", "Name", "Rank"]].to_dict("records")
grouped = df.groupby(["State", "City"])["List"].apply(list).reset_index()
json_obj = grouped.to_json(orient="records")
>>>json_obj
'[{"State":"ME",
"City":"Ogunquit",
"List":[{"Code":30,"Name":"C","Rank":3}]},
{"State":"ME",
"City":"Portland",
"List":[{"Code":30,"Name":"C","Rank":3},
{"Code":40,"Name":"D","Rank":4}]},
{"State":"NY",
"City":"Albany",
"List":[{"Code":10,"Name":"A","Rank":1},
{"Code":20,"Name":"B","Rank":2}]},
{"State":"NY",
"City":"Buffalo",
"List":[{"Code":20,"Name":"B","Rank":2},
{"Code":30,"Name":"C","Rank":3},
{"Code":40,"Name":"D","Rank":4},
{"Code":50,"Name":"E","Rank":5}]},
{"State":"PA",
"City":"Philadelphia",
"List":[{"Code":10,"Name":"A","Rank":1},
{"Code":20,"Name":"B","Rank":2},
{"Code":30,"Name":"C","Rank":3},
{"Code":40,"Name":"D","Rank":4}]},
{"State":"PA",
"City":"Scranton",
"List":[{"Code":50,"Name":"E","Rank":5},
{"Code":60,"Name":"F","Rank":6}]},
{"State":"PA",
"City":"Williamsport",
"List":[{"Code":10,"Name":"A","Rank":1}]}]'

Try:
df.groupby(["State", "City"]).apply(
lambda x: x[["Code", "Name", "Rank"]].to_dict("records")
).reset_index(name="List").to_json(orient="records")
Output:
[{'State': 'ME',
'City': 'Ogunquit',
'List': [{'Code': 30, 'Name': 'C', 'Rank': 3}]},
{'State': 'ME',
'City': 'Portland',
'List': [{'Code': 30, 'Name': 'C', 'Rank': 3},
{'Code': 40, 'Name': 'D', 'Rank': 4}]},
{'State': 'NY',
'City': 'Albany',
'List': [{'Code': 10, 'Name': 'A', 'Rank': 1},
{'Code': 20, 'Name': 'B', 'Rank': 2}]},
{'State': 'NY',
'City': 'Buffalo',
'List': [{'Code': 20, 'Name': 'B', 'Rank': 2},
{'Code': 30, 'Name': 'C', 'Rank': 3},
{'Code': 40, 'Name': 'D', 'Rank': 4},
{'Code': 50, 'Name': 'E', 'Rank': 5}]},
{'State': 'PA',
'City': 'Philadelphia',
'List': [{'Code': 10, 'Name': 'A', 'Rank': 1},
{'Code': 20, 'Name': 'B', 'Rank': 2},
{'Code': 30, 'Name': 'C', 'Rank': 3},
{'Code': 40, 'Name': 'D', 'Rank': 4}]},
{'State': 'PA',
'City': 'Scranton',
'List': [{'Code': 50, 'Name': 'E', 'Rank': 5},
{'Code': 60, 'Name': 'F', 'Rank': 6}]},
{'State': 'PA',
'City': 'Williamsport',
'List': [{'Code': 10, 'Name': 'A', 'Rank': 1}]}]

creating df to generate json in the given format

I am trying to generate a df that produces the JSON below.
Json data:
{
"name": "flare",
"children": [
{
"name": "K1",
"children": [
{"name": "Exact", "size": 4},
{"name": "synonyms", "size": 14}
]
},
{
"name": "K2",
"children": [
{"name": "Exact", "size": 10},
{"name": "synonyms", "size": 20}
]
},
{
"name": "K3",
"children": [
{"name": "Exact", "size": 0},
{"name": "synonyms", "size": 5}
]
},
{
"name": "K4",
"children": [
{"name": "Exact", "size": 13},
{"name": "synonyms", "size": 15}
]
},
{
"name": "K5",
"children": [
{"name": "Exact", "size": 0},
{"name": "synonyms", "size": 0}
]
}
]
}
input data:
name Exact synonyms
K1 4 14
K2 10 20
K3 0 5
K4 13 15
K5 0 0
I tried creating df with values in the json but I was not able to get the desired json on df.to_json, please help.

You need to reshape the data with set_index + stack, and then use groupby with apply to build the nested list of dicts:
import json
df = (df.set_index('name')
.stack()
.reset_index(level=1)
.rename(columns={'level_1':'name', 0:'size'})
.groupby(level=0).apply(lambda x: x.to_dict(orient='records'))
.reset_index(name='children')
)
print (df)
name children
0 K1 [{'name': 'Exact', 'size': 4}, {'name': 'synon...
1 K2 [{'name': 'Exact', 'size': 10}, {'name': 'syno...
2 K3 [{'name': 'Exact', 'size': 0}, {'name': 'synon...
3 K4 [{'name': 'Exact', 'size': 13}, {'name': 'syno...
4 K5 [{'name': 'Exact', 'size': 0}, {'name': 'synon...
#convert output to dict
j = { "name": "flare", "children": df.to_dict(orient='records')}
#for nice output - easier check
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(j)
{ 'children': [ { 'children': [ {'name': 'Exact', 'size': 4},
{'name': 'synonyms', 'size': 14}],
'name': 'K1'},
{ 'children': [ {'name': 'Exact', 'size': 10},
{'name': 'synonyms', 'size': 20}],
'name': 'K2'},
{ 'children': [ {'name': 'Exact', 'size': 0},
{'name': 'synonyms', 'size': 5}],
'name': 'K3'},
{ 'children': [ {'name': 'Exact', 'size': 13},
{'name': 'synonyms', 'size': 15}],
'name': 'K4'},
{ 'children': [ {'name': 'Exact', 'size': 0},
{'name': 'synonyms', 'size': 0}],
'name': 'K5'}],
'name': 'flare'}
#convert data to json and write to file
with open('data.json', 'w') as outfile:
json.dump(j, outfile)

python map array of dictionaries to dictionary?

I've got an array of dictionaries that looks like this:
[
{ 'country': 'UK', 'city': 'Manchester' },
{ 'country': 'UK', 'city': 'Liverpool' },
{ 'country': 'France', 'city': 'Paris' } ...
]
And I want to end up with a dictionary like this:
{ 'Liverpool': 'UK', 'Manchester': 'UK', ... }
Obviously I can do this:
d = {}
for c in cities:
d[c['city']] = c['country']
But is there any way I could do it with a single-line map?

You can use a dict comprehension :
>>> li = [
... { 'country': 'UK', 'city': 'Manchester' },
... { 'country': 'UK', 'city': 'Liverpool' },
... { 'country': 'France', 'city': 'Paris' }
... ]
>>> {d['city']: d['country'] for d in li}
{'Paris': 'France', 'Liverpool': 'UK', 'Manchester': 'UK'}
Or use operator.itemgetter with the map function (this requires import operator):
>>> dict(map(operator.itemgetter('city','country'),li))
{'Paris': 'France', 'Liverpool': 'UK', 'Manchester': 'UK'}

Dictionary of Dictionaries : Sorting by a specific key

I have a dictionary that looks like this
{'Africa': {'Name': 'Africa',
'men': 33333,
'priority': 3,
'women': 30000},
'America': {'Name': 'USA',
'men': 1114444411333L,
'priority': 4,
'women': 44430000},
'Asia': {'Name': 'China',
'men': 444433333,
'priority': 2,
'women': 444430000},
'Europe': {'Name': 'UK',
'men': 11111333,
'priority': 1,
'women': 1111430000}}
I need to sort this dictionary by Key = Priority
I'm using Python 2.7 and have tried a few options (which don't look very elegant). Any suggestions?

>>> d = {"Africa" :
{ "Name" : "Africa", "men": 33333, "women" : 30000, "priority" :3},
"Asia":
{ "Name" : "China", "men": 444433333, "women" : 444430000, "priority" :2},
"Europe":
{ "Name" : "UK", "men": 11111333, "women" : 1111430000, "priority" :1},
"America":
{ "Name" : "USA", "men": 1114444411333, "women" : 44430000, "priority" :4}
}
>>> from collections import OrderedDict
>>> OrderedDict(sorted(d.items(), key=lambda x: x[1]['priority']))
OrderedDict([('Europe', {'priority': 1, 'men': 11111333, 'Name': 'UK', 'women': 1111430000}), ('Asia', {'priority': 2, 'men': 444433333, 'Name': 'China', 'women': 444430000}), ('Africa', {'priority': 3, 'men': 33333, 'Name': 'Africa', 'women': 30000}), ('America', {'priority': 4, 'men': 1114444411333L, 'Name': 'USA', 'women': 44430000})])

A dict itself cannot be sorted. You can't get a sorted dictionary, but you can convert it to a sorted list of tuples. Here is another way of sorting it:
data={'Africa': {'Name': 'Africa',
'men': 33333,
'priority': 3,
'women': 30000},
'America': {'Name': 'USA',
'men': 1114444411333L,
'priority': 4,
'women': 44430000},
'Asia': {'Name': 'China',
'men': 444433333,
'priority': 2,
'women': 444430000},
'Europe': {'Name': 'UK',
'men': 11111333,
'priority': 1,
'women': 1111430000}}
from operator import itemgetter
# Sort the (key, value) pairs once, by each inner dict's 'priority' value.
listOfTuple = sorted(data.items(), key=lambda item: item[1]['priority'])
print listOfTuple
>>>[('Europe', {'priority': 1, 'men': 11111333, 'Name': 'UK', 'women': 1111430000}), ('Asia', {'priority': 2, 'men': 444433333, 'Name': 'China', 'women': 444430000}), ('Africa', {'priority': 3, 'men': 33333, 'Name': 'Africa', 'women': 30000}), ('America', {'priority': 4, 'men': 1114444411333L, 'Name': 'USA', 'women': 44430000})]

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Store rows of DataFrame with certain value in list - python

Related

How to groupby columns by value and make json from them? Python3 Pandas

Pandas to JSON Within Groups

creating df to generate json in the given format

python map array of dictionaries to dictionary?

Dictionary of Dictionaries : Sorting by a specific key

Categories

Resources