Using negative indices with pathlib's parents property - Python

I'm trying to use groupby on a pandas DataFrame containing pathlib information and file sizes from a particular drive. I want to total up the storage used at a particular depth of the directory structure to see which directories are the fullest.
I tried a summation groupby on the pathlib parent value of each file, but that still doesn't tell you the total storage at a particular depth. The pathlib parents sequence looked promising, but it starts with the full path and works backwards, so I tried reverse (negative) indexing, which doesn't seem to work.
From what I read in the documentation, parents is supposed to be a sequence, and sequences are supposed to support negative indexes, but the error message seems to imply this one doesn't.
Here is the code I've been using (with help from http://pbpython.com/pathlib-intro.html):
import pandas as pd
from pathlib import Path
import time

dir_to_scan = "c:/Program Files"
p = Path(dir_to_scan)

all_files = []
for i in p.rglob('*.*'):
    all_files.append((i.name, i.parent, i.stat().st_size))

columns = ["File_Name", "Parent", "Size"]
df = pd.DataFrame.from_records(all_files, columns=columns)
df["path_stem"] = df['Parent'].apply(lambda x: x.parent if len(x.parents) < 3 else x.parents[-2])
The error trace is as follows:
IndexError Traceback (most recent call last)
<ipython-input-3-5748b1f0a9ee> in <module>()
1 #df.groupby('Parent')['Size'].sum()
2
----> 3 df["path_stem"]=df['Parent'].apply(lambda x: x.parent if len(x.parents)<3 else x.parents[-1] )
4
5 #df([apps])=df([Parent]).parents
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
2549 else:
2550 values = self.asobject
-> 2551 mapped = lib.map_infer(values, f, convert=convert_dtype)
2552
2553 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer()
<ipython-input-3-5748b1f0a9ee> in <lambda>(x)
1 #df.groupby('Parent')['Size'].sum()
2
----> 3 df["path_stem"]=df['Parent'].apply(lambda x: x.parent if len(x.parents)<3 else x.parents[-1] )
4
5 #df([apps])=df([Parent]).parents
C:\ProgramData\Anaconda3\lib\pathlib.py in __getitem__(self, idx)
592 def __getitem__(self, idx):
593 if idx < 0 or idx >= len(self):
--> 594 raise IndexError(idx)
595 return self._pathcls._from_parsed_parts(self._drv, self._root,
596 self._parts[:-idx - 1])
IndexError: -1
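For reference, the workaround sketch I'm currently considering (assuming what I really want is each path truncated to a fixed number of leading components, counting the drive as one component) rebuilds the stem from path.parts instead of indexing parents from the end. The stem_at_depth helper below is hypothetical, not part of pathlib:

from pathlib import Path

def stem_at_depth(path, depth):
    # Hypothetical helper: keep only the first `depth` components of the path.
    parts = path.parts
    return Path(*parts[:min(depth, len(parts))])

# Used in place of the negative parents index above:
# df["path_stem"] = df["Parent"].apply(lambda x: stem_at_depth(x, 3))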

Related

Dask Dataframe: Resample partitioned data loaded from multiple parquet files

I am loading multiple parquet files containing time series data together, but the loaded Dask dataframe has unknown divisions, which means I can't apply various time series operations to it.
df = dd.read_parquet('/path/to/*.parquet', index='Timestamps')
For instance, df_resampled = df.resample('1T').mean().compute() gives the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-8e6f7f4340fd> in <module>
1 df = dd.read_parquet('/path/to/*.parquet', index='Timestamps')
----> 2 df_resampled = df.resample('1T').mean().compute()
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/core.py in resample(self, rule, closed, label)
2627 from .tseries.resample import Resampler
2628
-> 2629 return Resampler(self, rule, closed=closed, label=label)
2630
2631 #derived_from(pd.DataFrame)
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/tseries/resample.py in __init__(self, obj, rule, **kwargs)
118 "for more information."
119 )
--> 120 raise ValueError(msg)
121 self.obj = obj
122 self._rule = pd.tseries.frequencies.to_offset(rule)
ValueError: Can only resample dataframes with known divisions
See https://docs.dask.org/en/latest/dataframe-design.html#partitions
for more information.
I went to the link https://docs.dask.org/en/latest/dataframe-design.html#partitions and it says:
In these cases (when divisions are unknown), any operation that requires a cleanly partitioned DataFrame with known divisions will have to perform a sort. This can generally be achieved by calling df.set_index(...).
I then tried the following, but with no success.
df = dd.read_parquet('/path/to/*.parquet')
df = df.set_index('Timestamps')
This step throws the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-4-468e9af0c4d6> in <module>
1 df = dd.read_parquet(os.path.join(OUTPUT_DATA_DIR, '20*.gzip'))
----> 2 df.set_index('Timestamps')
3 # df_resampled = df.resample('1T').mean().compute()
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/core.py in set_index(***failed resolving arguments***)
3915 npartitions=npartitions,
3916 divisions=divisions,
-> 3917 **kwargs,
3918 )
3919
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/shuffle.py in set_index(df, index, npartitions, shuffle, compute, drop, upsample, divisions, partition_size, **kwargs)
483 if divisions is None:
484 sizes = df.map_partitions(sizeof) if repartition else []
--> 485 divisions = index2._repartition_quantiles(npartitions, upsample=upsample)
486 mins = index2.map_partitions(M.min)
487 maxes = index2.map_partitions(M.max)
~/.conda/envs/suf/lib/python3.7/site-packages/dask/dataframe/core.py in __getattr__(self, key)
3755 return self[key]
3756 else:
-> 3757 raise AttributeError("'DataFrame' object has no attribute %r" % key)
3758
3759 def __dir__(self):
AttributeError: 'DataFrame' object has no attribute '_repartition_quantiles'
Can anybody suggest the right way to load multiple time series files as a Dask dataframe so that pandas time series operations can be applied to it?
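For context, the pattern I have seen suggested elsewhere (just a sketch, untested against these files, and assuming 'Timestamps' is stored as an ordinary column that is already sorted across the files) looks like this:

import dask.dataframe as dd

# Load without an index, then set it explicitly so divisions become known.
# sorted=True skips the shuffle, but is only valid if the data really is
# ordered by 'Timestamps' across all files.
df = dd.read_parquet('/path/to/*.parquet')
df = df.set_index('Timestamps', sorted=True)
df_resampled = df.resample('1T').mean().compute()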

How can I iterate through elements of a Koalas groupby?

I would like to iterate through groups in a dataframe. This is possible in pandas, but when I port this to koalas, I get an error.
import databricks.koalas as ks
import pandas as pd

pdf = pd.DataFrame({'x': range(3), 'y': ['a', 'b', 'b'], 'z': ['a', 'b', 'b']})
# Create a Koalas DataFrame from a pandas DataFrame
df = ks.from_pandas(pdf)

for a in df.groupby('x'):
    print(a)
Here is the error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-35-d4164d1f71e0> in <module>
----> 1 for a in df.groupby('x'):
2 print(a)
/opt/conda/lib/python3.7/site-packages/databricks/koalas/groupby.py in __getitem__(self, item)
2630 if self._as_index and is_name_like_value(item):
2631 return SeriesGroupBy(
-> 2632 self._kdf._kser_for(item if is_name_like_tuple(item) else (item,)),
2633 self._groupkeys,
2634 dropna=self._dropna,
/opt/conda/lib/python3.7/site-packages/databricks/koalas/frame.py in _kser_for(self, label)
721 Name: id, dtype: int64
722 """
--> 723 return self._ksers[label]
724
725 def _apply_series_op(self, op, should_resolve: bool = False):
KeyError: (0,)
Is this kind of group iteration possible in Koalas? The Koalas documentation seems to imply that it is: https://koalas.readthedocs.io/en/latest/reference/groupby.html
Groupby iteration is not yet implemented:
https://github.com/databricks/koalas/issues/2014
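Until that issue is resolved, a rough workaround sketch (my own assumption, and only sensible when the number of distinct keys is small enough to collect to the driver) is to pull the keys back as pandas and filter per key; note this runs one distributed selection per key:

import databricks.koalas as ks
import pandas as pd

pdf = pd.DataFrame({'x': range(3), 'y': ['a', 'b', 'b'], 'z': ['a', 'b', 'b']})
df = ks.from_pandas(pdf)

# Collect the distinct group keys, then run one filtered selection per key.
keys = df['x'].unique().to_pandas()
for key in keys:
    group = df[df['x'] == key]
    print(key, group.to_pandas())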

Using regex pattern to read files from directories

I have directories with the following names:
s3://bucket/elig_date=2020-06-01/
s3://bucket/elig_date=2020-06-02/
....
s3://bucket/elig_date=2020-09-30/
s3://bucket/elig_date=2020-10-01/
...
s3://bucket/elig_date=2020-12-31/
When I want to read all files inside all directories from 2020-06-01 to 2020-09-30, I use the following and it works:
import dask.dataframe as dd
all_data = dd.read_parquet("s3://bucket/elig_date=2020-0[6-9]-*/*")
But when I want to extend this up to the directory 2020-12-31, I try the following and it doesn't work:
all_data = dd.read_parquet("s3://bucket/elig_date=2020-0[6-9]|1[0-2]-*/*")
This throws the following error:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-61-60da829cf51e> in <module>
----> 1 all_data = dd.read_parquet("s3://bucket/elig_date=2020-0[6-9]|1[0-2]-*/*")
~/anaconda3/envs/3.8.1/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py in read_parquet(path, columns, filters, categories, index, storage_options, engine, gather_statistics, split_row_groups, read_from_paths, chunksize, **kwargs)
333 index = [index]
334
--> 335 meta, statistics, parts, index = engine.read_metadata(
336 fs,
337 paths,
~/anaconda3/envs/3.8.1/lib/python3.9/site-packages/dask/dataframe/io/parquet/arrow.py in read_metadata(cls, fs, paths, categories, index, gather_statistics, filters, split_row_groups, read_from_paths, engine, **kwargs)
497 split_row_groups,
498 gather_statistics,
--> 499 ) = cls._gather_metadata(
500 paths,
501 fs,
~/anaconda3/envs/3.8.1/lib/python3.9/site-packages/dask/dataframe/io/parquet/arrow.py in _gather_metadata(cls, paths, fs, split_row_groups, gather_statistics, filters, index, read_from_paths, dataset_kwargs)
1647
1648 # Step 1: Create a ParquetDataset object
-> 1649 dataset, base, fns = _get_dataset_object(paths, fs, filters, dataset_kwargs)
1650 if fns == [None]:
1651 # This is a single file. No danger in gathering statistics
~/anaconda3/envs/3.8.1/lib/python3.9/site-packages/dask/dataframe/io/parquet/arrow.py in _get_dataset_object(paths, fs, filters, dataset_kwargs)
1600 if proxy_metadata:
1601 dataset.metadata = proxy_metadata
-> 1602 elif fs.isdir(paths[0]):
1603 # This is a directory. We can let pyarrow do its thing.
1604 # Note: In the future, it may be best to avoid listing the
IndexError: list index out of range
I only tested this on RegExr, because I do not have your files, but this pattern worked there:
s3://bucket/elig_date=2020-(0[6-9]|1[0-2])-*/*
It's the same as what you had, just with brackets grouping the alternation.
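If the filesystem glob turns out not to accept that alternation syntax, another sketch (an assumption on my part, relying only on the daily directory layout shown in the question) is to build the list of paths explicitly and pass it to read_parquet:

import dask.dataframe as dd
import pandas as pd

# Generate one directory path per day in the range and read them all.
dates = pd.date_range('2020-06-01', '2020-12-31', freq='D')
paths = [f"s3://bucket/elig_date={d:%Y-%m-%d}/*" for d in dates]
all_data = dd.read_parquet(paths)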

TypeError in read_parquet Dask

I have a parquet file called data.parquet, and I'm using the Dask library in Python. When I run the line
import dask.dataframe as dd
df = dd.read_parquet('data.parquet',engine='pyarrow')
I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-22-807fa43763c1> in <module>
----> 1 df = dd.read_parquet('data.parquet',engine='pyarrow')
~/anaconda3/lib/python3.7/site-packages/dask/dataframe/io/parquet.py in read_parquet(path, columns, filters, categories, index, storage_options, engine, infer_divisions)
1395 categories=categories,
1396 index=index,
-> 1397 infer_divisions=infer_divisions,
1398 )
1399
~/anaconda3/lib/python3.7/site-packages/dask/dataframe/io/parquet.py in _read_pyarrow(fs, fs_token, paths, columns, filters, categories, index, infer_divisions)
858 _open = lambda fn: pq.ParquetFile(fs.open(fn, mode="rb"))
859 for piece in dataset.pieces:
--> 860 pf = piece.get_metadata(_open)
861 # non_empty_pieces.append(piece)
862 if pf.num_row_groups > 0:
TypeError: get_metadata() takes 1 positional argument but 2 were given
I just don't understand why this happens, since this is how it is implemented here.
Any help will be appreciated!
I faced the same problem. I resolved it by upgrading Dask to version 2.30.0.
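A quick sanity check of the installed version before and after upgrading (just an illustrative snippet, not part of the fix itself):

import dask
print(dask.__version__)  # the answer above reports the problem gone on 2.30.0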

Doing computation on folders

I have a main directory with 9 subfolders inside. I first need to read the paths, process the files in each folder while excluding specific files, and plot the result with the folder name as the legend. The problem is that I can see the files I need for the computation, but after that nothing works. The code that I wrote is below:
from __future__ import division
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
import glob
import seaborn as sns
from scipy import stats
from scipy.stats.kde import gaussian_kde
mean_curv = []
FILES = []
for r, d, f in os.walk(r'C:\Users\Hasan\Desktop\output\new our scenario\beta 15\test'):
    for dirs in d:
        CASES = [f for f in sorted(files) if f.startswith('config')]
        maxnum = np.max([int(os.path.splitext(f)[0].split('_')[1]) for f in CASES])
        CASES = ['configuration_%d.out' % i for i in range(maxnum)]
        FILES.append(CASES)

for i, d in enumerate(FILES):
    a = np.loadtxt(d).T
    num = os.path.splitext(d)[0]
    local_curv = np.abs(a[4])
    mean_curv.append(np.mean(local_curv))

Time = np.arange(0, len(mean_curv))
plt.plot(Time, mean_curv)
The error I get is below:
ValueError Traceback (most recent call last)
<ipython-input-62-4e1e3e29813a> in <module>
1 for i, d in enumerate(RIVERS):
----> 2 a = np.loadtxt(d).T
3 num = os.path.splitext(d)[0]
4 local_curv = np.abs(a[4])
5 mean_curv.append(np.mean(local_curv))
~\Anaconda3\lib\site-packages\numpy\lib\npyio.py in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin, encoding, max_rows)
1157 # converting the data
1158 X = None
-> 1159 for x in read_data(_loadtxt_chunksize):
1160 if X is None:
1161 X = np.array(x, dtype)
~\Anaconda3\lib\site-packages\numpy\lib\npyio.py in read_data(chunk_size)
1085
1086 # Convert each value according to its column and store
-> 1087 items = [conv(val) for (conv, val) in zip(converters, vals)]
1088
1089 # Then pack it according to the dtype's nesting
~\Anaconda3\lib\site-packages\numpy\lib\npyio.py in <listcomp>(.0)
1085
1086 # Convert each value according to its column and store
-> 1087 items = [conv(val) for (conv, val) in zip(converters, vals)]
1088
1089 # Then pack it according to the dtype's nesting
~\Anaconda3\lib\site-packages\numpy\lib\npyio.py in floatconv(x)
792 if '0x' in x:
793 return float.fromhex(x)
--> 794 return float(x)
795
796 typ = dtype.type
ValueError: could not convert string to float: 'configuration_0.out'
You're ignoring the path to the files. Instead of:
a = np.loadtxt(d).T
you should use:
a = np.loadtxt(os.path.join(r, d)).T
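The same idea can also be applied one step earlier, when the file list is built. This is only a sketch of that intent: the root path and the 'config' prefix come from the question's code, while the rest is my assumption about what the loop is meant to collect:

import os

root = r'C:\Users\Hasan\Desktop\output\new our scenario\beta 15\test'
FILES = []
for r, dirs, files in os.walk(root):
    cases = sorted(f for f in files if f.startswith('config'))
    # Store full paths so np.loadtxt can find the files later.
    FILES.extend(os.path.join(r, f) for f in cases)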
