Computing sample statistics using NumPy functions with a Pandas DataFrame

I am trying to create a function that returns either the mean, median, or standard deviation of all columns in a Pandas DataFrame using NumPy functions.
It is for a school assignment, so the only reason I'm using NumPy is that the assignment asks for it. I am struggling to figure out how to use a NumPy function with a Pandas DataFrame for this problem.
Here is the text of the problem:
The code cell below contains a function called comp_sample_stat that accepts two parameters: df, which contains data from the Dow Jones for a particular company, and stat, which will contain one of three strings: "mean", "std", or "median".
For this problem:
if the stat is equal to "mean" return the mean of the dataframe columns using numpy's mean function
if the stat is equal to "median" return the median of the dataframe columns using numpy's median function
if the stat is equal to "std" return the std of the dataframe columns using numpy's std function
Here is the function I have written.
def comp_sample_stat(df, stat='mean'):
    '''
    Computes a sample statistic for any dataframe passed in

    Parameters
    ----------
    df: Pandas dataframe

    Returns
    -------
    a pandas dataframe
    '''
    df_mean = df.apply(np.mean(df))
    df_median = df.apply(np.median(df))
    df_std = df.apply(np.std(df))
    if stat is str('std'):
        return df_std
    elif stat is str('median'):
        return df_median
    else:
        return df_mean
df is a DataFrame that has been defined previously in my assignment as follows:
def read_data(file_path):
    '''
    Reads in a dataset using pandas.

    Parameters
    ----------
    file_path : string containing path to a file

    Returns
    -------
    pandas dataframe with data read in from the file path
    '''
    read_file = pd.read_csv(file_path)
    new_df = pd.DataFrame(read_file)
    return new_df
df = read_data('data/dow_jones_index.data')
The variable df_AA has also been previously defined as follows:
def select_stock(df, symbol):
    '''
    Selects data only containing a particular stock symbol.

    Parameters
    ----------
    df: dataframe containing data from the dow jones index
    symbol: string containing the stock symbol to select

    Returns
    -------
    dataframe containing a particular stock
    '''
    stock = df[df.stock == symbol]
    return stock
df_AA = select_stock(df.copy(), 'AA')
When I call the function within a Jupyter Notebook as follows:
comp_sample_stat(df_AA)
I get the following error:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-17-a2bcbeedcc56> in <module>()
22 return df_mean
23
---> 24 comp_sample_stat(df_AA)
<ipython-input-17-a2bcbeedcc56> in comp_sample_stat(df, stat)
11 a pandas dataframe
12 '''
---> 13 df_mean = df.apply(np.mean(df))
14 df_median = df.apply(np.median(df))
15 df_std = df.apply(np.std(df))
/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py in
apply(self, func, axis, broadcast, raw, reduce, result_type, args,
**kwds)
6012 args=args,
6013 kwds=kwds)
-> 6014 return op.get_result()
6015
6016 def applymap(self, func):
/opt/conda/lib/python3.6/site-packages/pandas/core/apply.py in
get_result(self)
316 *self.args, **self.kwds)
317
--> 318 return super(FrameRowApply, self).get_result()
319
320 def apply_broadcast(self):
/opt/conda/lib/python3.6/site-packages/pandas/core/apply.py in
get_result(self)
140 return self.apply_raw()
141
--> 142 return self.apply_standard()
143
144 def apply_empty_result(self):
/opt/conda/lib/python3.6/site-packages/pandas/core/apply.py in
apply_standard(self)
246
247 # compute the result using the series generator
--> 248 self.apply_series_generator()
249
250 # wrap results
/opt/conda/lib/python3.6/site-packages/pandas/core/apply.py in
apply_series_generator(self)
275 try:
276 for i, v in enumerate(series_gen):
--> 277 results[i] = self.f(v)
278 keys.append(v.name)
279 except Exception as e:
TypeError: ("'Series' object is not callable", 'occurred at index quarter')

DataFrame.apply expects you to pass it a function, not a dataframe. So you should be passing np.mean without arguments.
That is, you should be doing something like this:
df_mean = df.apply(np.mean)
See the DataFrame.apply docs: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
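Applied to the question's function, a minimal corrected sketch might look like this (it keeps the original signature; note it also compares the strings with == rather than is, since is tests object identity and is not a reliable way to compare strings):

import numpy as np

def comp_sample_stat(df, stat='mean'):
    '''
    Computes a sample statistic for the columns of any dataframe passed in.
    '''
    # Pass the function object itself; apply calls it once per column
    # and collects the results into a Series.
    if stat == 'std':
        return df.apply(np.std)
    elif stat == 'median':
        return df.apply(np.median)
    else:
        return df.apply(np.mean)

comp_sample_stat(df_AA) should then return one statistic per column instead of raising the TypeError above.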

Related

Filling in missing values using a function

Hello,
I'm working on a column that has missing values ('year_of_release'). The data type is datetime64.
At first, I created a function that "pulls" the year numbers from a column in which years appear next to the names of some games, and combined these values into a new column, 'years_from_titles':
import re

def get_year(row):
    regex = r"\d{4}"
    match = re.findall(regex, row)
    for i in match:
        if (int(i) > 1970) and (int(i) < 2017):
            return int(i)

gaming['years_from_titles'] = gaming['name'].apply(lambda x: get_year(str(x)))
I tested the function and it works.
Now, I'm trying to create another function, which will fill in the missing years in the original 'year_of_release' column, but only when a year was found on the same row:
def year_row(row):
    if math.isnan(row['year_of_release']):
        return row['years_from_titles']
    else:
        return row['year_of_release']

gaming['year_of_release'] = gaming.apply(year_row, axis=1)
But when I run the code I get a TypeError:
/tmp/ipykernel_31/133192424.py in <module>
7 return row['year_of_release']
8
----> 9 gaming['year_of_release']=gaming.apply(year_row,axis=1)
/opt/conda/lib/python3.9/site-packages/pandas/core/frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7766 kwds=kwds,
7767 )
-> 7768 return op.get_result()
7769
7770 def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:
/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py in get_result(self)
183 return self.apply_raw()
184
--> 185 return self.apply_standard()
186
187 def apply_empty_result(self):
/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py in apply_standard(self)
274
275 def apply_standard(self):
--> 276 results, res_index = self.apply_series_generator()
277
278 # wrap results
/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py in apply_series_generator(self)
288 for i, v in enumerate(series_gen):
289 # ignore SettingWithCopy here in case the user mutates
--> 290 results[i] = self.f(v)
291 if isinstance(results[i], ABCSeries):
292 # If we have a view on v, we need to make a copy because
/tmp/ipykernel_31/133192424.py in year_row(row)
2 # but only if a year is found, on the same row, and in correspond to years_from_titles column.
3 def year_row(row):
----> 4 if math.isnan(row['year_of_release']):
5 return row['years_from_titles']
6 else:
TypeError: must be real number, not Timestamp.
If anyone knows how to overcome this I would greatly appreciate it.
Thanks
You can use the fact that NaN is not equal to itself:
def year_row(row):
    if row['year_of_release'] != row['year_of_release']:
        return row['years_from_titles']
    else:
        return row['year_of_release']

gaming['year_of_release'] = gaming.apply(year_row, axis=1)
Or with Series.mask:
gaming['year_of_release'] = gaming['year_of_release'].mask(gaming['year_of_release'].isna(), gaming['years_from_titles'])
Or with Series.fillna:
gaming['year_of_release'] = gaming['year_of_release'].fillna(gaming['years_from_titles'])
Instead of using the math module to check for missing values, here's a more pandas-specific approach: pd.isna handles Timestamps, NaT, NaN and None alike (a scalar Timestamp has no .isna() method of its own). Change this line:
if math.isnan(row['year_of_release']):
to this:
if pd.isna(row['year_of_release']):
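For illustration, a minimal self-contained sketch of the difference (the column names mirror the question; the data is made up):

import pandas as pd

gaming = pd.DataFrame({
    'year_of_release': [pd.Timestamp('2001-01-01'), pd.NaT],
    'years_from_titles': [2001, 2015],
})

def year_row(row):
    # pd.isna handles Timestamp, NaT, NaN and None; math.isnan raises
    # TypeError as soon as it meets a Timestamp.
    if pd.isna(row['year_of_release']):
        return row['years_from_titles']
    return row['year_of_release']

gaming['year_of_release'] = gaming.apply(year_row, axis=1)
print(gaming)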

DataError: No numeric types to aggregate pandas pivot

I have a pandas dataframe like this:
User-Id Training-Id TrainingTaken
0 4327024 25 10
1 6662572 3 10
2 3757520 26 10
and I need to convert it to a matrix like they do here:
https://github.com/tr1ten/Anime-Recommender-System/blob/main/HybridRecommenderSystem.ipynb
Cell 13.
So I did the following:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_profiling
from scipy.sparse import csr_matrix
from lightfm.evaluation import auc_score
from lightfm.data import Dataset
user_training_interaction = pd.pivot_table(trainingtaken, index='User-Id', columns='Training-Id', values='TrainingTaken')
user_training_interaction.fillna(0, inplace=True)
user_training_csr = csr_matrix(user_training_interaction.values)
But I get this error:
---------------------------------------------------------------------------
DataError Traceback (most recent call last)
<ipython-input-96-5a2c7ba28976> in <module>
10 from lightfm.data import Dataset
11
---> 12 user_training_interaction = pd.pivot_table(trainingtaken, index='User-Id', columns='Training-Id', values='TrainingTaken')
13 user_training_interaction.fillna(0,inplace=True)
14 user_training_csr = csr_matrix(user_training_interaction.values)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/reshape/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed)
110
111 grouped = data.groupby(keys, observed=observed)
--> 112 agged = grouped.agg(aggfunc)
113 if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
114 agged = agged.dropna(how="all")
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
949 func = maybe_mangle_lambdas(func)
950
--> 951 result, how = self._aggregate(func, *args, **kwargs)
952 if how is None:
953 return result
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
305
306 if isinstance(arg, str):
--> 307 return self._try_aggregate_string_function(arg, *args, **kwargs), None
308
309 if isinstance(arg, dict):
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/base.py in _try_aggregate_string_function(self, arg, *args, **kwargs)
261 if f is not None:
262 if callable(f):
--> 263 return f(*args, **kwargs)
264
265 # people may try to aggregate on a non-callable attribute
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in mean(self, numeric_only)
1396 "mean",
1397 alt=lambda x, axis: Series(x).mean(numeric_only=numeric_only),
-> 1398 numeric_only=numeric_only,
1399 )
1400
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/generic.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
1020 ) -> DataFrame:
1021 agg_blocks, agg_items = self._cython_agg_blocks(
-> 1022 how, alt=alt, numeric_only=numeric_only, min_count=min_count
1023 )
1024 return self._wrap_agged_blocks(agg_blocks, items=agg_items)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/pandas/core/groupby/generic.py in _cython_agg_blocks(self, how, alt, numeric_only, min_count)
1128
1129 if not (agg_blocks or split_frames):
-> 1130 raise DataError("No numeric types to aggregate")
1131
1132 if split_items:
DataError: No numeric types to aggregate
What am I missing?
The Pandas Documentation states:
While pivot() provides general purpose pivoting with various data
types (strings, numerics, etc.), pandas also provides pivot_table()
for pivoting with aggregation of numeric data
Make sure the column is numeric. Without seeing how you create trainingtaken I can't provide more specific guidance. However, the following may help:
Make sure you handle "empty" values in that column. The Pandas guide on missing data is a very good place to start. Pandas points out that "a column of integers with even one missing value is cast to floating-point dtype".
If working with a dataframe, the column can be cast to a specific type via your_df.your_col.astype(int), or for your example, trainingtaken['TrainingTaken'].astype(int).
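A minimal sketch of that fix, assuming TrainingTaken arrived as strings (the frame and column names follow the question; pd.to_numeric is one way to do the cast that also copes with unparseable entries):

import pandas as pd
from scipy.sparse import csr_matrix

trainingtaken = pd.DataFrame({
    'User-Id': [4327024, 6662572, 3757520],
    'Training-Id': [25, 3, 26],
    'TrainingTaken': ['10', '10', '10'],  # object dtype triggers the DataError
})

# Cast to a numeric dtype first; errors='coerce' turns bad entries into NaN.
trainingtaken['TrainingTaken'] = pd.to_numeric(
    trainingtaken['TrainingTaken'], errors='coerce')

user_training_interaction = pd.pivot_table(
    trainingtaken, index='User-Id', columns='Training-Id',
    values='TrainingTaken')
user_training_interaction.fillna(0, inplace=True)
user_training_csr = csr_matrix(user_training_interaction.values)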

Splitting a dataframe column into multiple columns using json_normalize does not work

I have a dataframe df created through an import from a MySQL database:
ID CONFIG
0 276 {"pos":[{"type":"geo...
1 349 {"pos":[{"type":"geo...
2 378 {"pos":[{"type":"geo...
3 381 {"pos":[{"type":"geo...
4 385 {"pos":[{"type":"geo...
where the elements in the CONFIG column all have the form:
{"posit":[{"type":"geo_f","priority":1,"validity":0},{"type":"geo_m","priority":2,"validity":0},{"type":"geo_i","priority":3,"validity":0},{"type":"geo_c","priority":4,"validity":0}]}
Now, I was convinced these elements were JSON objects, so I tried the following method to transform them into columns:
df_new = pd.json_normalize(df['CONFIG'])
However, this returns the following error:
AttributeError: 'str' object has no attribute 'values'
What am I missing? Thankful for any help!
EDIT: Full Traceback
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-22-23db4c0afdab> in <module>
----> 1 df_new = pd.json_normalize(df['CONFIG'])
c:\users\s-degossondevarennes\appdata\local\programs\python\python37\lib\site-packages\pandas\io\json\_normalize.py in _json_normalize(data, record_path, meta, meta_prefix, record_prefix, errors, sep, max_level)
268
269 if record_path is None:
--> 270 if any([isinstance(x, dict) for x in y.values()] for y in data):
271 # naive normalization, this is idempotent for flat records
272 # and potentially will inflate the data considerably for
c:\users\s-degossondevarennes\appdata\local\programs\python\python37\lib\site-packages\pandas\io\json\_normalize.py in <genexpr>(.0)
268
269 if record_path is None:
--> 270 if any([isinstance(x, dict) for x in y.values()] for y in data):
271 # naive normalization, this is idempotent for flat records
272 # and potentially will inflate the data considerably for
AttributeError: 'str' object has no attribute 'values'
The first issue is that the values in the CONFIG column are strings in disguise, so a literal_eval can turn them into true dictionaries. They are then all wrapped under the "posit" key, which we'd better get rid of; that leaves us with lists, so explode comes in. Overall:
from ast import literal_eval
pd.json_normalize(df['CONFIG'].apply(lambda x: literal_eval(x)["posit"]).explode())
For a one-row sample of the data, I get:
type priority validity
0 geo_f 1 0
1 geo_m 2 0
2 geo_i 3 0
3 geo_c 4 0
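Since the strings in the sample use double quotes throughout, they are also valid JSON, so json.loads should work as well as literal_eval (a sketch under that assumption):

import json
import pandas as pd

df_new = pd.json_normalize(
    df['CONFIG'].apply(lambda x: json.loads(x)['posit']).explode())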

Dask Dataframe: Resample partitioned data loaded from multiple parquet files

I am loading multiple parquet files containing time series data into a single dask dataframe. But the loaded dask dataframe has unknown divisions, because of which I can't apply various time series operations to it.
df = dd.read_parquet('/path/to/*.parquet', index='Timestamps')
For instance, df_resampled = df.resample('1T').mean().compute() gives the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-8e6f7f4340fd> in <module>
1 df = dd.read_parquet('/path/to/*.parquet', index='Timestamps')
----> 2 df_resampled = df.resample('1T').mean().compute()
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/core.py in resample(self, rule, closed, label)
2627 from .tseries.resample import Resampler
2628
-> 2629 return Resampler(self, rule, closed=closed, label=label)
2630
2631 #derived_from(pd.DataFrame)
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/tseries/resample.py in __init__(self, obj, rule, **kwargs)
118 "for more information."
119 )
--> 120 raise ValueError(msg)
121 self.obj = obj
122 self._rule = pd.tseries.frequencies.to_offset(rule)
ValueError: Can only resample dataframes with known divisions
See https://docs.dask.org/en/latest/dataframe-design.html#partitions
for more information.
I went to the link: https://docs.dask.org/en/latest/dataframe-design.html#partitions and it says,
In these cases (when divisions are unknown), any operation that requires a cleanly partitioned DataFrame with known divisions will have to perform a sort. This can generally be achieved by calling df.set_index(...).
I then tried the following, but with no success.
df = dd.read_parquet('/path/to/*.parquet')
df = df.set_index('Timestamps')
This step throws the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-4-468e9af0c4d6> in <module>
1 df = dd.read_parquet(os.path.join(OUTPUT_DATA_DIR, '20*.gzip'))
----> 2 df.set_index('Timestamps')
3 # df_resampled = df.resample('1T').mean().compute()
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/core.py in set_index(***failed resolving arguments***)
3915 npartitions=npartitions,
3916 divisions=divisions,
-> 3917 **kwargs,
3918 )
3919
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/shuffle.py in set_index(df, index, npartitions, shuffle, compute, drop, upsample, divisions, partition_size, **kwargs)
483 if divisions is None:
484 sizes = df.map_partitions(sizeof) if repartition else []
--> 485 divisions = index2._repartition_quantiles(npartitions, upsample=upsample)
486 mins = index2.map_partitions(M.min)
487 maxes = index2.map_partitions(M.max)
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/core.py in __getattr__(self, key)
3755 return self[key]
3756 else:
-> 3757 raise AttributeError("'DataFrame' object has no attribute %r" % key)
3758
3759 def __dir__(self):
AttributeError: 'DataFrame' object has no attribute '_repartition_quantiles'
Can anybody suggest the right way to load multiple time series files as a dask dataframe so that pandas time series operations can be applied to it?
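One approach suggested by the docs passage quoted above is to build the index explicitly so that dask computes divisions. A sketch, not verified against the asker's files (the path and column name follow the question; reset_index guards against Timestamps already having been set as the index by the parquet metadata):

import dask.dataframe as dd

df = dd.read_parquet('/path/to/*.parquet')

# set_index sorts the data and records the divisions, after which
# time-based operations such as resample become available.
df = df.reset_index().set_index('Timestamps')

df_resampled = df.resample('1T').mean().compute()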

How to resample a python pandas TimeSeries containing dtype Decimal values?

I have a pandas Series filled with decimal numbers of dtype Decimal. I'd like to use the new pandas 0.8 functionality to resample the decimal time series like this:
resampled = ts.resample('D', how='mean')
When trying this I get a "GroupByError: No numeric types to aggregate" error. I assume the problem is that np.mean is used internally to resample the values, and np.mean expects floats instead of Decimals.
Thanks to the help of this forum I managed to solve a similar question using groupby and the apply function, but I would love to also use the cool resample function.
How do I use the mean method on a pandas TimeSeries with Decimal-type values?
Any idea how to solve this?
Here is the complete ipython session creating the error:
In [37]: from decimal import Decimal
In [38]: from pandas import *
In [39]: rng = date_range('1.1.2012',periods=48, freq='H')
In [40]: rnd = np.random.randn(len(rng))
In [41]: rnd_dec = [Decimal(x) for x in rnd]
In [42]: ts = Series(rnd_dec, index=rng)
In [43]: ts[0:3]
Out[43]:
2012-01-01 00:00:00 -0.1020591335576267189022559023214853368699550628
2012-01-01 01:00:00 0.99245713975437366283216533702216111123561859130
2012-01-01 02:00:00 1.80080710727195758558139004890108481049537658691
Freq: H
In [44]: type(ts[0])
Out[44]: decimal.Decimal
In [45]: ts.resample('D', how = 'mean')
---------------------------------------------------------------------------
GroupByError Traceback (most recent call last)
C:\Users\THM\Documents\Python\<ipython-input-45-09c898403ddd> in <module>()
----> 1 ts.resample('D', how = 'mean')
C:\Python27\lib\site-packages\pandas\core\generic.pyc in resample(self, rule, how, axis, fill_method, closed, label, convention, kind, loffset, l
imit, base)
187 fill_method=fill_method, convention=convention,
188 limit=limit, base=base)
--> 189 return sampler.resample(self)
190
191 def first(self, offset):
C:\Python27\lib\site-packages\pandas\tseries\resample.pyc in resample(self, obj)
65
66 if isinstance(axis, DatetimeIndex):
---> 67 rs = self._resample_timestamps(obj)
68 elif isinstance(axis, PeriodIndex):
69 offset = to_offset(self.freq)
C:\Python27\lib\site-packages\pandas\tseries\resample.pyc in _resample_timestamps(self, obj)
184 if len(grouper.binlabels) < len(axlabels) or self.how is not None:
185 grouped = obj.groupby(grouper, axis=self.axis)
--> 186 result = grouped.aggregate(self._agg_method)
187 else:
188 # upsampling shortcut
C:\Python27\lib\site-packages\pandas\core\groupby.pyc in aggregate(self, func_or_funcs, *args, **kwargs)
1215 """
1216 if isinstance(func_or_funcs, basestring):
-> 1217 return getattr(self, func_or_funcs)(*args, **kwargs)
1218
1219 if hasattr(func_or_funcs,'__iter__'):
C:\Python27\lib\site-packages\pandas\core\groupby.pyc in mean(self)
290 """
291 try:
--> 292 return self._cython_agg_general('mean')
293 except GroupByError:
294 raise
C:\Python27\lib\site-packages\pandas\core\groupby.pyc in _cython_agg_general(self, how)
376
377 if len(output) == 0:
--> 378 raise GroupByError('No numeric types to aggregate')
379
380 return self._wrap_aggregated_output(output, names)
GroupByError: No numeric types to aggregate
Any help is appreciated.
Thanks,
Thomas
I found the answer myself. It is possible to provide a function as the 'how' argument of resample:
f = lambda x: Decimal(np.mean(x))
ts.resample('D', how=f)
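For readers on current pandas: the how= argument has since been removed, and the equivalent spelling today is to pass the function to apply on the resampler (a sketch, assuming object-dtype Decimal values as in the question):

resampled = ts.resample('D').apply(lambda x: Decimal(np.mean(x)))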
I got this error for object-type columns in a DataFrame. I got around it by using
df.resample('D', method='ffill', how=lambda c: c[-1])
