Doing computation on folders - python

I have a main directory with 9 subfolders inside. I need to walk through each subfolder, read the data files (excluding certain files), do a computation on each one, and plot the result with the folder name as the legend. The problem is that I can list the files I need to compute on, but the computation itself fails. The code that I wrote is below:
from __future__ import division
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
import glob
import seaborn as sns
from scipy import stats
from scipy.stats.kde import gaussian_kde

mean_curv = []
FILES = []
for r, d, f in os.walk(r'C:\Users\Hasan\Desktop\output\new our scenario\beta 15\test'):
    for dirs in d:
        CASES = [f for f in sorted(files) if f.startswith('config')]
        maxnum = np.max([int(os.path.splitext(f)[0].split('_')[1]) for f in CASES])
        CASES = ['configuration_%d.out' % i for i in range(maxnum)]
        FILES.append(CASES)

for i, d in enumerate(FILES):
    a = np.loadtxt(d).T
    num = os.path.splitext(d)[0]
    local_curv = np.abs(a[4])
    mean_curv.append(np.mean(local_curv))

Time = np.arange(0, len(mean_curv))
plt.plot(Time, mean_curv)
The error that I have gotten is below:
ValueError Traceback (most recent call last)
<ipython-input-62-4e1e3e29813a> in <module>
1 for i, d in enumerate(RIVERS):
----> 2 a = np.loadtxt(d).T
3 num = os.path.splitext(d)[0]
4 local_curv = np.abs(a[4])
5 mean_curv.append(np.mean(local_curv))
~\Anaconda3\lib\site-packages\numpy\lib\npyio.py in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin, encoding, max_rows)
1157 # converting the data
1158 X = None
-> 1159 for x in read_data(_loadtxt_chunksize):
1160 if X is None:
1161 X = np.array(x, dtype)
~\Anaconda3\lib\site-packages\numpy\lib\npyio.py in read_data(chunk_size)
1085
1086 # Convert each value according to its column and store
-> 1087 items = [conv(val) for (conv, val) in zip(converters, vals)]
1088
1089 # Then pack it according to the dtype's nesting
~\Anaconda3\lib\site-packages\numpy\lib\npyio.py in <listcomp>(.0)
1085
1086 # Convert each value according to its column and store
-> 1087 items = [conv(val) for (conv, val) in zip(converters, vals)]
1088
1089 # Then pack it according to the dtype's nesting
~\Anaconda3\lib\site-packages\numpy\lib\npyio.py in floatconv(x)
792 if '0x' in x:
793 return float.fromhex(x)
--> 794 return float(x)
795
796 typ = dtype.type
ValueError: could not convert string to float: 'configuration_0.out'

You're ignoring the path to the files. Instead of:
a = np.loadtxt(d).T
you should use:
a = np.loadtxt(os.path.join(r, d)).T
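For completeness, a minimal sketch of the whole loop with the paths joined up front, so np.loadtxt always receives a path it can open; the 'configuration' prefix and the column index are taken from the question, and the plain lexicographic sort is only a simplification:
import os
import numpy as np
import matplotlib.pyplot as plt

root = r'C:\Users\Hasan\Desktop\output\new our scenario\beta 15\test'

for dirpath, dirnames, filenames in os.walk(root):
    # Configuration files in this folder, sorted, with their full paths kept.
    cases = sorted(name for name in filenames if name.startswith('configuration'))
    if not cases:
        continue
    mean_curv = []
    for name in cases:
        full_path = os.path.join(dirpath, name)   # keep the directory part
        a = np.loadtxt(full_path).T
        mean_curv.append(np.mean(np.abs(a[4])))   # fifth column, as in the question
    # One line per folder, labelled with the folder name for the legend.
    plt.plot(np.arange(len(mean_curv)), mean_curv, label=os.path.basename(dirpath))

plt.legend()
plt.show()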

How to save a large pandas dataframe with complex arrays and load it up again?

I have a large pandas DataFrame with individual elements that are complex numpy arrays. Please see below a minimal code example to reproduce the scenario:
import numpy as np
import pandas as pd

d = {f'x{i}': [] for i in range(4)}
df = pd.DataFrame(data=d).astype(object)
for K in range(4):
    for i in range(4):
        df.loc[f'{K}', f'x{i}'] = np.random.random(size=(2,2)) + np.random.random(size=(2,2)) * 1j
df
What is the best way to save these and load them up again for use later?
The problem I am having is that when I increase the size of the stored matrices and the number of elements, I get an OverflowError when I try to save it as an .h5 file, as shown below:
import numpy as np
import pandas as pd

size = (300, 300)
xs = 1500
d = {f'x{i}': [] for i in range(xs)}
df = pd.DataFrame(data=d).astype(object)
for K in range(10):
    for i in range(xs):
        df.loc[f'{K}', f'x{i}'] = np.random.random(size=size) + np.random.random(size=size) * 1j

df.to_hdf('test.h5', key="df", mode="w")
load_test = pd.read_hdf("test.h5", "df")
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
<ipython-input-124-8cb8df1a0653> in <module>
12 df.loc[f'{K}', f'x{i}'] = np.random.random(size=size) + np.random.random(size=size) * 1j
13
---> 14 df.to_hdf('test.h5', key="df", mode="w")
15
16
~/PQKs/pqks/lib/python3.6/site-packages/pandas/core/generic.py in to_hdf(self, path_or_buf, key, mode, complevel, complib, append, format, index, min_itemsize, nan_rep, dropna, data_columns, errors, encoding)
2447 data_columns=data_columns,
2448 errors=errors,
-> 2449 encoding=encoding,
2450 )
2451
~/PQKs/pqks/lib/python3.6/site-packages/pandas/io/pytables.py in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, format, index, min_itemsize, nan_rep, dropna, data_columns, errors, encoding)
268 path_or_buf, mode=mode, complevel=complevel, complib=complib
269 ) as store:
--> 270 f(store)
271 else:
272 f(path_or_buf)
~/PQKs/pqks/lib/python3.6/site-packages/pandas/io/pytables.py in <lambda>(store)
260 data_columns=data_columns,
261 errors=errors,
--> 262 encoding=encoding,
263 )
264
~/PQKs/pqks/lib/python3.6/site-packages/pandas/io/pytables.py in put(self, key, value, format, index, append, complib, complevel, min_itemsize, nan_rep, data_columns, encoding, errors, track_times)
1127 encoding=encoding,
1128 errors=errors,
-> 1129 track_times=track_times,
1130 )
1131
~/PQKs/pqks/lib/python3.6/site-packages/pandas/io/pytables.py in _write_to_group(self, key, value, format, axes, index, append, complib, complevel, fletcher32, min_itemsize, chunksize, expectedrows, dropna, nan_rep, data_columns, encoding, errors, track_times)
1799 nan_rep=nan_rep,
1800 data_columns=data_columns,
-> 1801 track_times=track_times,
1802 )
1803
~/PQKs/pqks/lib/python3.6/site-packages/pandas/io/pytables.py in write(self, obj, **kwargs)
3189 # I have no idea why, but writing values before items fixed #2299
3190 blk_items = data.items.take(blk.mgr_locs)
-> 3191 self.write_array(f"block{i}_values", blk.values, items=blk_items)
3192 self.write_index(f"block{i}_items", blk_items)
3193
~/PQKs/pqks/lib/python3.6/site-packages/pandas/io/pytables.py in write_array(self, key, value, items)
3047
3048 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
-> 3049 vlarr.append(value)
3050
3051 elif empty_array:
~/PQKs/pqks/lib/python3.6/site-packages/tables/vlarray.py in append(self, sequence)
526 nparr = None
527
--> 528 self._append(nparr, nobjects)
529 self.nrows += 1
530
~/PQKs/pqks/lib/python3.6/site-packages/tables/hdf5extension.pyx in tables.hdf5extension.VLArray._append()
OverflowError: value too large to convert to int
As noted in the similar issue https://stackoverflow.com/a/57133759/8896855, HDF5/.h5 files have more overhead and are intended for storing many dataframes in a single file (HDF5 behaves like a small file system). Feather and Parquet will likely be a better fit for saving and loading one large dataframe as an in-memory object. As for the specific OverflowError, it is most likely the result of storing large mixed-type columns (numpy arrays) in pandas "object" dtype. One (more complicated) option would be to split the arrays in your dataframe out into separate columns, but that's probably unnecessary.
A general quick fix would be to use df.to_pickle(r'path_to/filename.pkl'), but to_feather or to_parquet likely present more optimized solutions.
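A minimal sketch of the pickle round trip (the file name is just a placeholder; note that to_feather/to_parquet need an extra engine such as pyarrow and may refuse object columns holding numpy arrays unless they are flattened first):
import numpy as np
import pandas as pd

# Small frame of complex 2x2 arrays, as in the question.
df = pd.DataFrame({f'x{i}': [] for i in range(4)}).astype(object)
for K in range(4):
    for i in range(4):
        df.loc[f'{K}', f'x{i}'] = np.random.random((2, 2)) + 1j * np.random.random((2, 2))

# Pickle preserves arbitrary Python objects, including complex ndarrays.
df.to_pickle('test.pkl')
loaded = pd.read_pickle('test.pkl')

# The round trip restores each stored array exactly.
assert np.allclose(loaded.loc['0', 'x0'], df.loc['0', 'x0'])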

Using the max function in Pandas for Python

I am doing an online tutorial in a Jupyter notebook with Python and pandas, and when I run the following code, I run into this error.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# reading the csv file
titanic = pd.read_csv("titanic.csv")
titanic_class = titanic.groupby("Pclass")
titanic_class.get_group(1)
titanic_class.max()
AssertionError Traceback (most recent call last)
<ipython-input-26-4d1be28a55cb> in <module>
1 #max ticket fare paid
----> 2 titanic_class.max()
~\anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in f(self, **kwargs)
1369 # try a cython aggregation if we can
1370 try:
-> 1371 return self._cython_agg_general(alias, alt=npfunc, **kwargs)
1372 except DataError:
1373 pass
~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
992 ) -> DataFrame:
993 agg_blocks, agg_items = self._cython_agg_blocks(
--> 994 how, alt=alt, numeric_only=numeric_only, min_count=min_count
995 )
996 return self._wrap_agged_blocks(agg_blocks, items=agg_items)
~\anaconda3\lib\site-packages\pandas\core\groupby\generic.py in _cython_agg_blocks(self, how, alt, numeric_only, min_count)
1098 # Clean up the mess left over from split blocks.
1099 for locs, result in zip(split_items, split_frames):
-> 1100 assert len(locs) == result.shape[1]
1101 for i, loc in enumerate(locs):
1102 new_items.append(np.array([loc], dtype=locs.dtype))
AssertionError:
Can someone tell me what's wrong? titanic_class.sum() and titanic_class.mean() work without any error.
The last column of the CSV file has letters. Once I removed them, the max function worked.
This happens when a column has empty (NaN) values. Try to remove those columns before using max.
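Both observations point at non-numeric or missing data in some columns. A short sketch of two ways around it, assuming the usual titanic.csv column layout (numeric_only needs a reasonably recent pandas):
import pandas as pd

titanic = pd.read_csv("titanic.csv")
titanic_class = titanic.groupby("Pclass")

# Option 1: aggregate only the numeric columns.
print(titanic_class.max(numeric_only=True))

# Option 2: keep only numeric columns before grouping.
numeric = titanic.select_dtypes(include="number")
print(numeric.groupby(titanic["Pclass"]).max())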

using negative indices with Pathlib Parents method

I'm trying to use groupby on a pandas data structure containing pathlib information and file sizes from a particular drive. I want to total up the storage used at a particular depth of the file directory structure to see which directories are the most full.
I was trying to do a summation groupby on the pathlib parent value for each file, but that still doesn't tell you what your total storage is at a particular depth. Pathlib's "parents" looked promising, but it starts with the full path and works backwards, so I tried reverse (negative) indexing, and it doesn't seem to work.
From what I read in the documentation, pathlib's parents is supposed to be a sequence, and sequences are supposed to support negative indices, but the error message implies they don't accept negatives.
Here is the code I've been using (with help from http://pbpython.com/pathlib-intro.html)
import pandas as pd
from pathlib import Path
import time

dir_to_scan = "c:/Program Files"
p = Path(dir_to_scan)

all_files = []
for i in p.rglob('*.*'):
    all_files.append((i.name, i.parent, i.stat().st_size))

columns = ["File_Name", "Parent", "Size"]
df = pd.DataFrame.from_records(all_files, columns=columns)
df["path_stem"] = df['Parent'].apply(lambda x: x.parent if len(x.parents) < 3 else x.parents[-2])
The error trace is as follows:
IndexError Traceback (most recent call last)
<ipython-input-3-5748b1f0a9ee> in <module>()
1 #df.groupby('Parent')['Size'].sum()
2
----> 3 df["path_stem"]=df['Parent'].apply(lambda x: x.parent if len(x.parents)<3 else x.parents[-1] )
4
5 #df([apps])=df([Parent]).parents
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
2549 else:
2550 values = self.asobject
-> 2551 mapped = lib.map_infer(values, f, convert=convert_dtype)
2552
2553 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer()
<ipython-input-3-5748b1f0a9ee> in <lambda>(x)
1 #df.groupby('Parent')['Size'].sum()
2
----> 3 df["path_stem"]=df['Parent'].apply(lambda x: x.parent if len(x.parents)<3 else x.parents[-1] )
4
5 #df([apps])=df([Parent]).parents
C:\ProgramData\Anaconda3\lib\pathlib.py in __getitem__(self, idx)
592 def __getitem__(self, idx):
593 if idx < 0 or idx >= len(self):
--> 594 raise IndexError(idx)
595 return self._pathcls._from_parsed_parts(self._drv, self._root,
596 self._parts[:-idx - 1])
IndexError: -1
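The IndexError comes from PurePath.parents rejecting negative indices on this Python version (negative indexing and slicing of parents were only added in Python 3.10). A possible workaround, sketched here with a hypothetical path rather than anything from the question, is to count from the end of parents explicitly or to rebuild the path from its leading parts:
from pathlib import PureWindowsPath

# Hypothetical example path mirroring the question's Windows layout.
path = PureWindowsPath("c:/Program Files/Vendor/App/bin/tool.exe")

# parents[-2] raises IndexError before Python 3.10, but the same ancestor
# can be reached by counting from the end explicitly...
second_from_root = path.parents[len(path.parents) - 2]    # c:\Program Files

# ...or by rebuilding the path from its leading parts (anchor + first component).
second_from_root_alt = PureWindowsPath(*path.parts[:2])

print(second_from_root, second_from_root_alt)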

convert txt column data (string) to int using python 2.7 (by np.array.astype function)

import numpy as np
import matplotlib.pyplot as plt

f = open('00001.txt', 'r')
if f == 0:
    print("fail to open the file")
else:
    print("file successfully opened")

data = f.readlines()
a = np.array(data)
yvec1 = a.astype(int)
print(yvec1)

if f.close() == 0:
    print("fail to close file")
else:
    print("file closed")
and this is the error I get:
ValueError: invalid literal for int() with base 10: '\n'
Original text data is:
name
716
722
729
732
730
728
729
733
735
737
737
739
741
744
747
749
747
742
742
742
742
741
739
738
736
734
732
...
You should do it this way since it is more "pythonic":
import numpy as np
import matplotlib.pyplot as plt

# Read your file properly
with open('00001.txt', 'r') as f:
    # Retrieve the data without the '\n' characters (that was your problem)
    data = f.read().splitlines()

# Load it in numpy
a = np.array(data)
# Do what you want with it
yvec1 = a.astype(int)
Try a = np.array(data.split('\n')) - the problem is that when you read the file that way, each line still contains the newline character \n.
Your code is wrong because \n cannot be converted to int, as it is not a numerical character.
Try this:
data = f.readlines()
data_int = []
for item in data:
    data_int.append(int(item))
a = np.array(data_int)
yvec1 = a.astype(int)
print(yvec1)
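Note that the sample data also starts with a non-numeric header line ("name"), which will break the int conversion even after the newlines are stripped. A sketch that handles both, assuming the header is exactly one line, lets numpy parse the file directly:
import numpy as np

# skiprows=1 skips the "name" header; loadtxt strips newlines itself.
yvec1 = np.loadtxt('00001.txt', dtype=int, skiprows=1)
print(yvec1)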

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

Using Python, I am struggling to merge 208 CSV files into one dataframe. (My file names are Customer_1.csv, Customer_2.csv, ..., Customer_208.csv.)
Following is my code:
%matplotlib inline
import pandas as pd
df_merged = pd.concat([pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i), names = ['Time', 'Energy_{0}'.format(i)], parse_dates=['Time'], index_col=['Time'], skiprows=1) for i in range(1, 209)], axis=1)
I got an error saying,
InvalidIndexError Traceback (most recent call last)
<ipython-input-4-a4d19b3c2a3e> in <module>()
----> 1 df_merged = pd.concat([pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i), names = ['Time', 'Energy_{0}'.format(i)], parse_dates=['Time'], index_col=['Time'], skiprows=1) for i in range(1, 209)], axis=1)
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
884 self.copy = copy
885
--> 886 self.new_axes = self._get_new_axes()
887
888 def get_result(self):
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_new_axes(self)
944 if i == self.axis:
945 continue
--> 946 new_axes[i] = self._get_comb_axis(i)
947 else:
948 if len(self.join_axes) != ndim - 1:
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_comb_axis(self, i)
970 raise TypeError("Cannot concatenate list of %s" % types)
971
--> 972 return _get_combined_index(all_indexes, intersect=self.intersect)
973
974 def _get_concat_axis(self):
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in _get_combined_index(indexes, intersect)
5730 index = index.intersection(other)
5731 return index
-> 5732 union = _union_indexes(indexes)
5733 return _ensure_index(union)
5734
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in _union_indexes(indexes)
5759
5760 if hasattr(result, 'union_many'):
-> 5761 return result.union_many(indexes[1:])
5762 else:
5763 for other in indexes[1:]:
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/tseries/index.pyc in union_many(self, others)
847 else:
848 tz = this.tz
--> 849 this = Index.union(this, other)
850 if isinstance(this, DatetimeIndex):
851 this.tz = tz
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in union(self, other)
1400 result.extend([x for x in other.values if x not in value_set])
1401 else:
-> 1402 indexer = self.get_indexer(other)
1403 indexer, = (indexer == -1).nonzero()
1404
/Users/Suzuki/Envs/DataVizProj/lib/python2.7/site-packages/pandas/core/index.pyc in get_indexer(self, target, method, limit)
1685
1686 if not self.is_unique:
-> 1687 raise InvalidIndexError('Reindexing only valid with uniquely'
1688 ' valued Index objects')
1689
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
Do you have any idea how to solve this problem?
Your code works on a small sample of five files that I used for testing (each file containing two columns and three rows). ONLY FOR DEBUGGING, try writing this as a loop: first read all of the files into a list, then concatenate them one at a time inside a try/except block to catch the errors, and finally print the problem files and investigate.
# First, read all the files into a list.
files_in = [pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i),
                        names=['Time', 'Energy_{0}'.format(i)],
                        parse_dates=['Time'],
                        index_col=['Time'],
                        skiprows=1)
            for i in range(1, 209)]

df = pd.DataFrame()
errors = []

# Try to append each file to the dataframe.
for i in range(1, 209):
    try:
        df = pd.concat([df, files_in[i - 1]], axis=1)
    except Exception:
        errors.append(i)

# Print the files containing errors.
for error in errors:
    print(files_in[error - 1])
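Once the problem files are identified, the usual cause of this InvalidIndexError during concat is duplicate timestamps inside a file's Time index. One way to deal with that, sketched here under the assumption that the first reading per timestamp should be kept, is to de-duplicate each index before concatenating:
import pandas as pd

def read_customer(i):
    # Read one file and drop duplicate timestamps so the index is unique.
    df = pd.read_csv('data_TVV1/Customer_{0}.csv'.format(i),
                     names=['Time', 'Energy_{0}'.format(i)],
                     parse_dates=['Time'],
                     index_col=['Time'],
                     skiprows=1)
    return df[~df.index.duplicated(keep='first')]

df_merged = pd.concat([read_customer(i) for i in range(1, 209)], axis=1)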
