Plot groupby of groupby pandas - python

The data is a time series, with many member ids associated with many categories:
data_df = pd.DataFrame({'Date': ['2018-09-14 00:00:22',
'2018-09-14 00:01:46',
'2018-09-14 00:01:56',
'2018-09-14 00:01:57',
'2018-09-14 00:01:58',
'2018-09-14 00:02:05'],
'category': [1, 1, 1, 2, 2, 2],
'member': ['bob', 'joe', 'jim', 'sally', 'jane', 'doe'],
'data': ['23', '20', '20', '11', '16', '62']})
There are about 50 categories with 30 members, each with around 1000 datapoints.
I am trying to make one plot per category.
By subsetting each category then plotting via:
# One figure for the category; every member is drawn on the same axes.
fig, ax = plt.subplots(figsize=(8,6))
# Group by a plain column name (not a one-element list) so that each key `i`
# is the member name itself rather than a 1-tuple such as ('bob',).
for i, g in category.groupby('member'):
    g.plot(y='data', ax=ax, label=str(i))
plt.show()
This works fine for a single category, however, when i try to use a for loop to repeat this for each category, it does not work
# Attempt to repeat the single-category plot for every category.
tests = pd.DataFrame()
for category in categories:
# `tests` holds the rows of `df` whose 'category' equals the current loop value.
tests = df.loc[df['category'] == category]
# NOTE(review): iterating a DataFrame directly yields its column labels,
# not rows or groups -- presumably not what was intended here.
for test in tests:
fig, ax = plt.subplots(figsize=(8,6))
# `category` is the loop value taken from `categories`, not the filtered
# frame `tests`; calling .groupby on it raises the reported AttributeError.
for i, g in category.groupby(['member']):
g.plot(y='data', ax=ax, label=str(i))
plt.show()
yields an "AttributeError: 'str' object has no attribute 'groupby'" error.
What i would like is a loop that spits out one graph per category, with all the members' data plotted on each graph

Creating your dataframe
import pandas as pd
data_df = pd.DataFrame({'Date': ['2018-09-14 00:00:22',
'2018-09-14 00:01:46',
'2018-09-14 00:01:56',
'2018-09-14 00:01:57',
'2018-09-14 00:01:58',
'2018-09-14 00:02:05'],
'category': [1, 1, 1, 2, 2, 2],
'member': ['bob', 'joe', 'jim', 'sally', 'jane', 'doe'],
'data': ['23', '20', '20', '11', '16', '62']})
then [EDIT after comments]
import matplotlib.pyplot as plt
import numpy as np
# Size a near-square grid that has at least one cell per category.
subplots_n = np.unique(data_df['category']).size
root = np.sqrt(subplots_n)
subplots_x = np.round(root).astype(int)
subplots_y = np.ceil(root).astype(int)
# groupby yields (key, sub-frame) pairs; draw each category in its own cell.
for i, (_, category_df) in enumerate(data_df.groupby('category')):
    members = category_df['member'].map(str).tolist()
    values = category_df['data'].map(float).tolist()
    plt.subplot(subplots_x, subplots_y, i + 1)
    plt.plot(members, values)
    plt.title("Category {}".format(category_df['category'].values[0]))
plt.tight_layout()
plt.show()
yields to
Please note that this nicely takes care also of bigger groups like
data_df2 = pd.DataFrame({'category': [1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 5],
'member': ['bob', 'joe', 'jim', 'sally', 'jane', 'doe', 'ric', 'mat', 'pip', 'zoe', 'qui', 'quo', 'qua'],
'data': ['23', '20', '20', '11', '16', '62', '34', '27', '12', '7', '9', '13', '7']})

Far from an expert with pandas, but if you execute the following simple enough snippet
import matplotlib.pyplot as plt
import pandas as pd
df = pd.DataFrame({'Date': ['2018-09-14 00:00:22',
'2018-09-14 00:01:46',
'2018-09-14 00:01:56',
'2018-09-14 00:01:57',
'2018-09-14 00:01:58',
'2018-09-14 00:02:05'],
'category': [1, 1, 1, 2, 2, 2],
'Id': ['bob', 'joe', 'jim', 'sally', 'jane', 'doe'],
'data': ['23', '20', '20', '11', '16', '62']})
# A single shared axes: every category is drawn onto the same plot.
fig, ax = plt.subplots()
# groupby yields (key, sub-frame) pairs, so item[1] is the group's DataFrame.
for item in df.groupby('category'):
# x is the category value itself (not the Id); y is the data parsed to float.
ax.plot([float(x) for x in item[1]['category']],
[float(x) for x in item[1]['data'].values],
linestyle='none', marker='D')
plt.show()
you produce this figure
But there is probably a better way.
EDIT: Based on the changes made to your question, I changed my snippet to
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
df = pd.DataFrame({'Date': ['2018-09-14 00:00:22',
'2018-09-14 00:01:46',
'2018-09-14 00:01:56',
'2018-09-14 00:01:57',
'2018-09-14 00:01:58',
'2018-09-14 00:02:05'],
'category': [1, 1, 1, 2, 2, 2],
'Id': ['bob', 'joe', 'jim', 'sally', 'jane', 'doe'],
'data': ['23', '20', '20', '11', '16', '62']})
n_categories = np.unique(df['category']).size
# squeeze=False always returns a 2-D array of Axes, so indexing also works
# when there is only one category (plt.subplots(nrows=1) would otherwise
# return a single Axes object, which cannot be subscripted).
fig, ax = plt.subplots(nrows=n_categories, squeeze=False)
# groupby yields (key, sub-frame) pairs: one subplot row per category.
for i, (category, group) in enumerate(df.groupby('category')):
    panel = ax[i, 0]
    panel.plot([str(x) for x in group['Id']],
               [float(x) for x in group['data'].values],
               linestyle='none', marker='D')
    panel.set_title('Category {}'.format(category))
fig.tight_layout()
plt.show()
which now displays

Related

When I run the code, graph axes appear under the table [duplicate]

This question already has answers here:
How to hide axes in matplotlib.pyplot
(2 answers)
Closed 2 months ago.
Consider:
import matplotlib as mpl
import matplotlib.patches as patches
from matplotlib import pyplot as plt
import datetime
# First, we'll create a new figure and axis object
fig, ax = plt.subplots(figsize=(8, 6))
# Set the number of rows and cols for our table
rows = 10
cols = 6
# Create a coordinate system based on the number of rows/columns
# Adding a bit of padding on bottom (-1), top (1), right (0.5)
ax.set_ylim(-1, rows + 1)
ax.set_xlim(0, cols + .5)
x = datetime.datetime.now()
e = datetime.datetime.now()
# Sample data
data = [
{'id': 'player10', 'Price %': 125.658, 'Vol %': 255.489, 'goals': 125.859},
{'id': 'player9', 'Price %': 2, 'Vol %': 72, 'goals': 0},
{'id': 'player8', 'Price %': 3, 'Vol %': 47, 'goals': 0},
{'id': 'player7', 'Price %': 4, 'Vol %': 99, 'goals': 0},
{'id': 'player6', 'Price %': 5, 'Vol %': 84, 'goals': 1},
{'id': 'player5', 'Price %': 6, 'Vol %': 56, 'goals': 2},
{'id': 'player4', 'Price %': 7, 'Vol %': 67, 'goals': 0},
{'id': 'player3', 'Price %': 8, 'Vol %': 91, 'goals': 1},
{'id': 'player2', 'Price %': 9, 'Vol %': 75, 'goals': 3},
{'id': 'player1', 'Price %': 10, 'Vol %': 70, 'goals': 4}
]
for row in range(rows):
d = data[row]
ax.text(x=.5, y=row, s=d['id'], va='center', ha='left')
ax.text(x=2.5, y=row, s=d['Price %'], va='center', ha='right')
ax.text(x=3.5, y=row, s=d['Vol %'], va='center', ha='right')
ax.text(x=4.5, y=row, s=d['goals'], va='center', ha='right')
ax.text(.5, 9.75, '', weight='bold', ha='left')
ax.text(2.5, 9.75, 'Price %', weight='bold', ha='right')
ax.text(3.5, 9.75, 'Vol %', weight='bold', ha='right')
ax.text(4.5, 9.75, 'Goals', weight='bold', ha='right')
for row in range(rows):
ax.plot(
[0, cols + 1],
[row -.5, row - .5],
ls=':',
lw='.5',
c='grey'
)
ax.plot([0, cols + 1], [9.5, 9.5], lw='.5', c='black')
ax.set_title(
" test1 %s/%s/%s" % (e.day, e.month, e.year),
loc='left',
fontsize=15,
weight='bold',
color='r'
)
plt.show()
When I run the Python code, the chart axes appear under the table. How can I destroy them?
Waiting for the axes not to come out. How can I fix this?
I don't think there is an error anywhere, but I think there is a place where I wrote missing. I have not encountered such a problem in other graphics, but I do not know what the fix will be in this code.
Try this (matplotlib.pyplot.axis, axis -off parameter):
plt.axis('off')
plt.show()
Gives #

Apply a function on two pandas tables

I have the following two tables:
>>> df1 = pd.DataFrame(data={'1': ['john', '10', 'john'],
... '2': ['mike', '30', 'ana'],
... '3': ['ana', '20', 'mike'],
... '4': ['eve', 'eve', 'eve'],
... '5': ['10', np.NaN, '10'],
... '6': [np.NaN, np.NaN, '20']},
... index=pd.Series(['ind1', 'ind2', 'ind3'], name='index'))
>>> df1
1 2 3 4 5 6
index
ind1 john mike ana eve 10 NaN
ind2 10 30 20 eve NaN NaN
ind3 john ana mike eve 10 20
df2 = pd.DataFrame(data={'first_n': [4, 4, 3]},
index=pd.Series(['ind1', 'ind2', 'ind3'], name='index'))
>>> df2
first_n
index
ind1 4
ind2 4
ind3 3
I also have the following function that reverses a list and gets the first n non-NA elements:
def get_rev_first_n(row, top_n):
    """Return the first ``top_n`` non-NaN items of ``row`` in reverse order.

    ``x == x`` is False only for NaN, so the comparison filters out missing
    values without needing numpy or pandas.
    """
    rev_row = [x for x in row[::-1] if x == x]
    return rev_row[:top_n]
>>> get_rev_first_n(['john', 'mike', 'ana', 'eve', '10', np.NaN], 4)
['10', 'eve', 'ana', 'mike']
How would I apply this function to the two tables so that it takes in both df1 and df2 and outputs either a list or columns?
df=pd.concat([df1,df2],axis=1)
df.apply(get_rev_first_n,args=[4]) #send args as top_in
apply defaults to axis=0, which calls the function once per column (i.e. it runs down the rows), so you don't have to specify it. To process each row instead, pass axis=1.
args=[4] will be passed to second argument of get_rev_first_n
You can try apply with lambda on each row of the data frame, I just concatenate the two df's using concat and applied your method to each row of the resulted dataframe.
Full Code:
import pandas as pd
import numpy as np


def get_rev_first_n(row, top_n):
    """Return the first ``top_n`` non-NaN items of ``row`` in reverse order."""
    # x == x is False only for NaN, which drops the missing values.
    rev_row = [x for x in row[::-1] if x == x]
    return rev_row[:top_n]


# np.nan (lowercase) is used because the np.NaN alias was removed in NumPy 2.0.
df1 = pd.DataFrame(data={'1': ['john', '10', 'john'],
                         '2': ['mike', '30', 'ana'],
                         '3': ['ana', '20', 'mike'],
                         '4': ['eve', 'eve', 'eve'],
                         '5': ['10', np.nan, '10'],
                         '6': [np.nan, np.nan, '20']},
                   index=pd.Series(['ind1', 'ind2', 'ind3'], name='index'))
df2 = pd.DataFrame(data={'first_n': [4, 4, 3]},
                   index=pd.Series(['ind1', 'ind2', 'ind3'], name='index'))
# Align df2 on df1's index so that every row carries its own 'first_n'.
df3 = pd.concat([df1, df2.reindex(df1.index)], axis=1)
# Drop the helper column before reversing, so that it is neither returned
# nor counted against the requested number of elements.
df = df3.apply(
    lambda row: get_rev_first_n(row.drop('first_n'), row['first_n']),
    axis=1,
)
print(df)
# Output:
# index
# ind1 [10, eve, ana, mike]
# ind2 [eve, 20, 30, 10]
# ind3 [20, 10, eve]
# dtype: object

pandas: tricking left join in python

please how should i do to get the result below. if the cod of df_1 exists in df_2 then i should add the row as explained in my code below.
data1 = {'date': ['2021-06', '2021-06', '2021-07', '2021-07', '2021-07', '2021-07'], 'cod': ['12', '12', '14', '15', '15', '18'], 'Zone': ['LA', 'NY', 'LA', 'NY', 'PARIS', 'PARIS'], 'Revenue_Radio': [10, 20, 30, 50, 40, 10]}
df_1 = pd.DataFrame(data1)
data2 = {'date': ['2021-06', '2021-06', '2021-07', '2021-07', '2021-08'], 'cod': ['12', '14', '15', '15', '18'], 'Zone': ['PARIS', 'NY', 'LA', 'NY', 'NY'], 'Revenue_Str': [10, 20, 30, 50, 5]}
df_2 = pd.DataFrame(data2)
the expected output is
data_result = {'date': ['2021-06', '2021-06', '2021-06', '2021-07', '2021-07', '2021-07', '2021-07', '2021-07','2021-07'], 'cod': ['12', '12', '12', '14', '14', '15', '15', '15', '18'], 'Zone': ['LA', 'NY', 'PARIS','LA', 'NY', 'NY', 'PARIS', 'LA', 'PARIS'], 'Revenue_Radio': [10, 20, 0, 30, 0, 50, 40, 0, 10], 'Revenue_Str': [0, 0, 10,0, 20, 50, 0, 30, 0]}
df_result = pd.DataFrame(data_result)
Use an inner join on date and cod first, and then an outer join, replacing the missing values with 0:
df22 = df_2.merge(df_1[['date','cod']].drop_duplicates(), on=['date','cod'])
df = (df_1.merge(df22, on=['date','cod','Zone'], how='outer')
.fillna(0)
.sort_values(['date','cod'], ignore_index=True))
print (df)
date cod Zone Revenue_Radio Revenue_Str
0 2021-06 12 LA 10.0 0.0
1 2021-06 12 NY 20.0 0.0
2 2021-06 12 PARIS 0.0 10.0
3 2021-07 14 LA 30.0 0.0
4 2021-07 15 NY 50.0 50.0
5 2021-07 15 PARIS 40.0 0.0
6 2021-07 15 LA 0.0 30.0
7 2021-07 18 PARIS 10.0 0.0

Mapping items in one list to the items in another list

I have two lists in Python and I'm trying to map the values of one to the other.
List 1 (coordinates):
['7,16', '71,84', '72,48', '36,52', '75,36', '52,28', '76,44', '11,69', '56,35',
'15,21', '32,74', '88,32', '10,74', '61,34', '51,85', '10,75', '55,96',
'94,12', '34,64', '71,59', '76,75', '25,16', '54,100', '62,1', '60,85',
'16,32', '14,77', '40,78', '2,60', '71,4', '78,91', '100,98', '42,32', '37,49',
'49,34', '3,5', '42,77', '39,60', '38,77', '49,40', '40,53', '57,48', '14,99',
'66,67', '10,9', '97,3', '66,76', '86,68', '10,60', '8,87']
List 2 (index):
[3, 2, 3, 3, 3, 3, 3, 1, 3, 3, 2, 3, 1, 3, 2, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
1, 2, 1, 3, 2, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 1, 2, 3, 3, 2, 2, 1, 1]
For the output, I need to have something like:
cluster_1: [x, y], [a,b]...
cluster_2: [c, d], [e, f]...
cluster_3: [g, h], [o, j]...
I tried doing this in a dictionary, but I can only get it to put in the last coordinate in the for loop for each value. It also always outputs keys starting from 0, and I'm looking to label them starting from 1.
for i in range(len(patients)):
# other stuff
k = 3
for b in range(k):
if cluster == (k - b):
dct['cluster_%s' % b] = patients[i]
which outputs:
{'cluster_0': '97,3', 'cluster_1': '86,68', 'cluster_2': '8,87'}
I've tried using dct['cluster_%s' % b].append(patients[i]) but I get a key error on cluster_0. Any help would be much appreciated!
You can zip your indices and coordinates, then loop over them element-wise and populate a dictionary based on the index.
clusters = {}
for idx, coord in zip(index, coords):
if idx in clusters:
clusters[idx].append(coord.split(','))
else:
clusters[idx] = [coord.split(',')]
Result, where clusters[i] refers to the i-th cluster:
>>> clusters
{
3: [['7', '16'], ['72', '48'], ['36', '52'], ['75', '36'], ['52', '28'], ['76', '44'], ['56', '35'], ['15', '21'], ['88', '32'], ['61', '34'], ['94', '12'], ['71', '59'], ['25', '16'], ['62', '1'], ['16', '32'], ['71', '4'], ['42', '32'], ['37', '49'], ['49', '34'], ['3', '5'], ['49', '40'], ['40', '53'], ['57', '48'], ['10', '9'], ['97', '3']],
2: [['71', '84'], ['32', '74'], ['51', '85'], ['55', '96'], ['34', '64'], ['76', '75'], ['54', '100'], ['60', '85'], ['40', '78'], ['78', '91'], ['100', '98'], ['42', '77'], ['39', '60'], ['38', '77'], ['66', '67'], ['66', '76'], ['86', '68']],
1: [['11', '69'], ['10', '74'], ['10', '75'], ['14', '77'], ['2', '60'], ['14', '99'], ['10', '60'], ['8', '87']]
}
You could use defaultdict along with zip:
from collections import defaultdict
clusters = defaultdict(list)
for id, value in zip(cluster_indices, values):
clusters[id].append(value.split(","))
print(dict(clusters)) # {3: [['7', '16'], ['72', '48'], ...
A defaultdict can be converted to a dict with dict(clusters). However, this may not be necessary since defaultdict basically extends dict.
Note: If you need int values, then you may replace value.split(",") with [int(v) for v in value.split(",")] or list(map(int, value.split(","))). Casting them already at this point will save you an iteration later.
from collections import defaultdict
clusters = defaultdict(list)
for id, value in zip(cluster_indices, values):
clusters[id].append([int(v) for v in value.split(",")])
print(dict(clusters)) # {3: [[7, 16], [72, 48], ...
The group-by behaviour extracted to a function groupby (using a lambda function to allow any kind of transformation) so it can be reused:
from collections import defaultdict
def groupby(indices, values, map_fn):
    """Group ``values`` by the matching entry of ``indices``.

    ``map_fn(key, value)`` is applied to every pair and its result is what
    gets stored. Returns a plain dict mapping each key to the list of mapped
    values, in input order. Pairing uses ``zip``, so surplus items in the
    longer input are ignored.
    """
    grouped = defaultdict(list)
    # Named `key` rather than `id`, which would shadow the builtin.
    for key, value in zip(indices, values):
        grouped[key].append(map_fn(key, value))
    return dict(grouped)
clusters = groupby(cluster_indices, values, lambda _, value: value.split(","))
print(clusters) # {3: [['7', '16'], ['72', '48'], ...
Here just another way by using itertools.groupby:
from itertools import groupby
from operator import itemgetter
# itertools.groupby only merges *adjacent* equal keys, so sort by cluster first.
data = sorted(zip(cluster_indices, values), key=itemgetter(0))
grouped = groupby(data, key=itemgetter(0))
# Each group yields (cluster, value) pairs; keep only the split value.
clusters = {
    cluster: [value.split(",") for _, value in pairs]
    for cluster, pairs in grouped
}
# Keys appear in sorted order because the data was sorted above.
print(clusters) # {1: [['11', '69'], ['10', '74'], ...
However, I would use the defaultdict approach above or Cory Kramer's answer as it is more simple and easier to read (and therefore preferable)!

Problem inputting several lists to dictionary values

Stackoverflow, hello
I have a big problem with inputting a several list into multiple values
Initial code is:
animal_book={}
animal_type=['herbivorous', 'carnivorous']
animal_name=[['Elephant', 'Cow', 'Deer'], ['Tiger', 'Lion', 'Puma']]
animal_quantity=[[1, 2, 3], [4, 5, 6]]
animal_age=[['50','10','5'],['6', '7', '8']]
j=0
k=0
for i in animal_type:
animal_book[i]=[{'type':animal_name[j][k], 'name': animal_quantity[j][k], 'age':animal_age[j][k]}]
j += 1
print(animal_book)
The result I have is almost what I need, but with only one first compound in a animal_book value, not the several dictionaries, as I wanted (I've separated it by lines manually for better look):
>>{'herbivorous': [
{'name': 'Elephant', 'quantity': 1, 'age': '50'}
],
'carnivorous': [
{'name': 'Tiger', 'quantity': 4, 'age': '6'}
]
}
However, my aim here is to put these lists into multiple values of the dictionary. The required output is:
{'herbivorous': [
{'name': 'Elephant', 'quantity': 1, 'age': '50'},
{'name': 'Cow', 'quantity': 2, 'age': '10'},
{'name': 'Deer', 'quantity': 3, 'age': '5'},
],
'carnivorous':[
{'name': 'Tiger', 'quantity': 4, 'age': '6'},
{'name': 'Lion', 'quantity': 5, 'age': '7'},
{'name': 'Puma', 'quantity': 6, 'age': '8'},
]
}
Does everybody know how to solve the problem?
I've tried to add the lists to animal_book.values() also after the for i in animal_type:
animal_book[i]=animal_book.values().append([{'name':animal_name[j][k], 'quantity': animal_quantity[j][k], 'age':animal_age[j][k]}])
But in this case I have an error: AttributeError: 'dict_values' object has no attribute 'append'
First mistake: you get only one result per type because you overwrite the dictionary value with a new one-element list each time.
Second mistake: you only increment j while looping over the animal_type list (which has 2 elements) and never advance k, so only the first animal of each type ends up in the dictionary.
from pprint import pprint

animal_book = {}
animal_type = ['herbivorous', 'carnivorous']
animal_name = [['Elephant', 'Cow', 'Deer'], ['Tiger', 'Lion', 'Puma']]
animal_quantity = [[1, 2, 3], [4, 5, 6]]
animal_age = [['50', '10', '5'], ['6', '7', '8']]
for j, kind in enumerate(animal_type):
    # Build the complete list for this type first and store it once,
    # instead of overwriting the entry for every animal.
    value_list = []
    for name, quantity, age in zip(animal_name[j], animal_quantity[j], animal_age[j]):
        # The count goes under 'quantity', matching the required output.
        value_list.append({'name': name, 'quantity': quantity, 'age': age})
    animal_book[kind] = value_list
pprint(animal_book)
# Output:
# {'carnivorous': [{'age': '6', 'name': 'Tiger', 'quantity': 4},
#                  {'age': '7', 'name': 'Lion', 'quantity': 5},
#                  {'age': '8', 'name': 'Puma', 'quantity': 6}],
#  'herbivorous': [{'age': '50', 'name': 'Elephant', 'quantity': 1},
#                  {'age': '10', 'name': 'Cow', 'quantity': 2},
#                  {'age': '5', 'name': 'Deer', 'quantity': 3}]}
You can use zip to arrange your data in a suitable manner and fill your book with that:
from pprint import pprint

animal_book = {}
animal_type = ['herbivorous', 'carnivorous']
animal_name = [['Elephant', 'Cow', 'Deer'], ['Tiger', 'Lion', 'Puma']]
animal_quantity = [[1, 2, 3], [4, 5, 6]]
animal_age = [['50', '10', '5'], ['6', '7', '8']]
# zip your data together, extract the typ again, put the remainder into R
for typ, *R in zip(animal_type, animal_name, animal_quantity, animal_age):
    # add the typ-list
    animal_book.setdefault(typ, [])
    # now handle the inner dicts data that have to be added to your lists
    # first create tuples for each animal as r
    for r in zip(*R):
        # then create tuples of (key,value) and make dicts from it
        animal_book[typ].append(dict(zip(["name", "quantity", "age"], r)))
# pretty print it
pprint(animal_book)
Output:
{'carnivorous': [{'age': '6', 'name': 'Tiger', 'quantity': 4},
{'age': '7', 'name': 'Lion', 'quantity': 5},
{'age': '8', 'name': 'Puma', 'quantity': 6}],
'herbivorous': [{'age': '50', 'name': 'Elephant', 'quantity': 1},
{'age': '10', 'name': 'Cow', 'quantity': 2},
{'age': '5', 'name': 'Deer', 'quantity': 3}]}
R looks like this:
[['Elephant', 'Cow', 'Deer'], [1, 2, 3], ['50', '10', '5']]
[['Tiger', 'Lion', 'Puma'], [4, 5, 6], ['6', '7', '8']]
and r look like this:
('Elephant', 1, '50')
('Cow', 2, '10')
('Deer', 3, '5')
('Tiger', 4, '6')
('Lion', 5, '7')
('Puma', 6, '8')
and zip(["name","quantity","age"],r) looks approximately like so:
[('name', 'Elephant'), ('quantity', 1), ('age', '50')]
[('name', 'Cow'), ('quantity', 2), ('age', '10')]
[('name', 'Deer'), ('quantity', 3), ('age', '5')]
[('name', 'Tiger'), ('quantity', 4), ('age', '6')]
[('name', 'Lion'), ('quantity', 5), ('age', '7')]
[('name', 'Puma'), ('quantity', 6), ('age', '8')]
Here you go, this solves your problem. You have to have one for loop nested inside another one and use enumerate so you can keep track of the index. Also, if you use json.dumps() it will pretty-print a dictionary for you.
import json

animal_type = ['herbivorous', 'carnivorous']
animal_name = [['Elephant', 'Cow', 'Deer'], ['Tiger', 'Lion', 'Puma']]
animal_quantity = [[1, 2, 3], [4, 5, 6]]
animal_age = [['50', '10', '5'], ['6', '7', '8']]
animal_book = {}
# `kind` rather than `type`, which would shadow the builtin for the rest of
# the script.
for index, kind in enumerate(animal_type):
    animal_book[kind] = []
    # `index` selects this type's sub-lists; `jdex` selects the animal in them.
    for jdex, animal in enumerate(animal_name[index]):
        animal_book[kind].append({
            'name': animal,
            'quantity': animal_quantity[index][jdex],
            'age': animal_age[index][jdex],
        })
print(json.dumps(animal_book, indent=2))
This will output:
{
"herbivorous": [
{
"name": "Elephant",
"quantity": 1,
"age": "50"
},
{
"name": "Cow",
"quantity": 2,
"age": "10"
},
{
"name": "Deer",
"quantity": 3,
"age": "5"
}
],
"carnivorous": [
{
"name": "Tiger",
"quantity": 4,
"age": "6"
},
{
"name": "Lion",
"quantity": 5,
"age": "7"
},
{
"name": "Puma",
"quantity": 6,
"age": "8"
}
]
}
You should use the range function to iterate over the items in more systematic way:
animal_book = {}
animal_type = ['herbivorous', 'carnivorous']
animal_name = [['Elephant', 'Cow', 'Deer'], ['Tiger', 'Lion', 'Puma']]
animal_quantity = [[1, 2, 3], [4, 5, 6]]
animal_age = [['50', '10', '5'], ['6', '7', '8']]
for i in range(len(animal_type)):
    # Key the book by the type name (not the integer index) and start with
    # an empty list that the inner loop fills.
    animal_book[animal_type[i]] = []
    for j in range(len(animal_name[i])):
        # Append, so earlier animals are kept instead of being overwritten,
        # and store each value under the key that matches its meaning.
        animal_book[animal_type[i]].append({
            'name': animal_name[i][j],
            'quantity': animal_quantity[i][j],
            'age': animal_age[i][j],
        })
print(animal_book)

Categories