Iterating over folder names in Python - python

I have two folders, 1 and 2, and each of them contains a file named Test.xlsx. I want to loop over the folders and read the file in each one. I tried to build file_loc inside a loop using i in range(1,3), but I get an error. The code works if I hard-code 1 or 2 in file_loc.
import pandas as pd
import numpy as np
# NOTE: indentation was lost in this listing; the statements below are
# presumably the body of the outer loop.
# Loop over the folder numbers 1 and 2.
for i in range(1,3):
# NOTE(review): this is an ordinary string literal, so the "i" between the
# backslashes is the letter i, not the loop variable. The traceback below
# shows the path being opened with a literal "\\i\\" folder.
file_loc = "C:\\Users\\USER\\OneDrive - Technion\\Research_Technion\\Python_PNM\\Sept12_2022\\i\\Test.xlsx"
# Read column A and columns C through AA, treating the text 'NA' as missing.
df = pd.read_excel(file_loc, index_col=None, na_values=['NA'], usecols="A,C:AA")
# Take column "N" as a NumPy array.
A=df["N"].to_numpy()
print([A])
# Drop every entry whose string form is 'nan'.
A = [x for x in A if str(x) != 'nan']
print(A)
# NOTE(review): eval() runs each remaining cell as a Python expression;
# presumably the cells hold list-like text — confirm the workbook is trusted.
A = [eval(e) for e in A]
print(A)
A=np.array(A)
print([A])
# Collect the mean of each element of A and print one mean per line.
A_mean=[]
# NOTE(review): this loop reuses the name i from the outer folder loop.
for i in range(0,len(A)):
A_mean.append(np.mean(A[i]))
print(*A_mean, sep='\n')
The error is
Traceback (most recent call last):
File "C:\Users\USER\OneDrive - Technion\Research_Technion\Python_PNM\Sept12_2022\Test.py", line 12, in <module>
df = pd.read_excel(file_loc, index_col=None, na_values=['NA'], usecols="A,C:AA")
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\util\_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 364, in read_excel
io = ExcelFile(io, storage_options=storage_options, engine=engine)
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 1191, in __init__
ext = inspect_excel_format(
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 1070, in inspect_excel_format
with get_handle(
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\io\common.py", line 711, in get_handle
handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\USER\\OneDrive - Technion\\Research_Technion\\Python_PNM\\Sept12_2022\\i\\Test.xlsx'

# Build the workbook path for each numbered folder (1, then 2).
for i in range(1, 3):
    # The f prefix makes {i} expand to the current folder number; without
    # it, an "i" in the string is just the letter i.
    file_loc = f"C:\\Users\\USER\\OneDrive - Technion\\Research_Technion\\Python_PNM\\Sept12_2022\\{i}\\Test.xlsx"
    # The rest of the per-folder processing goes here, indented inside the loop.
    ...
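Use an f-string so that {i} is replaced by the value of the loop variable; inside an ordinary string literal, i is just the letter "i". Also make sure that the path you entered is correct.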

Related

Export/import dataframe as Excel sheet

I have an example from the pandas documentation site and can't get it to run. Exporting to an Excel file works well, but the subsequent import does not:
import numpy as np
import pandas as pd
# Build a 1000 x 4 frame of random normal values with columns A, B, C, D.
df = pd.DataFrame(np.random.randn(1000, 4), columns=list("ABCD"))
# Replace each column with its cumulative sum.
df = df.cumsum()
# Write the frame to sheet "Sheet1" of /tmp/foo.xlsx.
df.to_excel("/tmp/foo.xlsx", sheet_name="Sheet1")
print("Reading data back from an excel file")
# Read "Sheet1" back, treating the text "NA" as missing. This is the call
# that raises in the traceback below.
df2=pd.read_excel("/tmp/foo.xlsx", "Sheet1", index_col=None, na_values=["NA"])
#print(df2)
my error message:
python3 /tmp/downloads/tmp_358/main.py
Reading data back from an excel file
Traceback (most recent call last):
File "/tmp/downloads/tmp_358/main.py", line 10, in <module>
df2=pd.read_excel("/tmp/foo.xlsx", "Sheet1", index_col=None, na_values=["NA"])
File "/usr/local/lib/python3.6/dist-packages/pandas/util/_decorators.py", line 296, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_base.py", line 304, in read_excel
io = ExcelFile(io, engine=engine)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_base.py", line 867, in __init__
self._reader = self._engines[engine](self._io)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_xlrd.py", line 22, in __init__
super().__init__(filepath_or_buffer)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_base.py", line 353, in __init__
self.book = self.load_workbook(filepath_or_buffer)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_xlrd.py", line 37, in load_workbook
return open_workbook(filepath_or_buffer)
File "/usr/local/lib/python3.6/dist-packages/xlrd/__init__.py", line 170, in open_workbook
raise XLRDError(FILE_FORMAT_DESCRIPTIONS[file_format]+'; not supported')
xlrd.biffh.XLRDError: Excel xlsx file; not supported
Python 3.6.9 and pandas==1.1.4 on Ubuntu 18.04
Can you try this:
# Name the openpyxl engine explicitly for both the write and the read, so the
# .xlsx file is not handed to xlrd, which raised
# "Excel xlsx file; not supported" in the traceback above.
df.to_excel("/tmp/foo.xlsx", sheet_name="Sheet1", engine='openpyxl')
df2=pd.read_excel("/tmp/foo.xlsx", "Sheet1", index_col=None, na_values=["NA"], engine='openpyxl')

Pandas: Read excel file for specific columns, remove empty rows, create CSV

I am trying to read a folder of .xlsm files, take columns A:J, remove any empty rows, and then combine each excel file into a single CSV. The code below seems to work when I use just one specific excel file but has an error when I loop. Any help would be appreciated.
import pandas as pd
import os
import glob
# Folder that the workbooks are read from; the CSV is also written here
# because the working directory is changed to it below.
source = r"C:\Users\bwendt\QAR"
# Change into the source folder, then list every path in it ending in .xlsm.
os.chdir(source)
# NOTE(review): the pattern matches every *.xlsm name in the folder. The bytes
# b'\x0eWendt, ' in the traceback look like a user name rather than workbook
# data, so one matched file is presumably not a real workbook (e.g. an Excel
# "~$" lock file) — TODO confirm by printing f before the read.
files = glob.glob(source + "/*.xlsm")
MultiRents = []
# For each file, read columns A:J of sheet "Page2" and collect the frame.
# (Indentation was lost in this listing; the next three lines are presumably
# the loop body.)
for f in files:
data = pd.read_excel(f,"Page2",usecols = "A:J")
# The row-dropping step is commented out, so empty rows are currently kept.
#data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
MultiRents.append(data)
# NOTE(review): MultiRents is a list of DataFrames; pd.concat(MultiRents) is
# presumably what was intended here rather than from_records — confirm.
df = pd.DataFrame.from_records(MultiRents)
# Write the combined frame to Multifamily_Rents.csv in the working directory.
export_csv = df.to_csv("Multifamily_Rents.csv")
Traceback: Traceback (most recent call last):
File "", line 1, in
runfile('C:/Users/bwendt/.spyder-py3/Print_rents.py', wdir='C:/Users/bwendt/.spyder-py3')
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py",
line 827, in runfile
execfile(filename, namespace)
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py",
line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/bwendt/.spyder-py3/Print_rents.py", line 21, in
data = pd.read_excel(f,"Page2",usecols = "A:J")
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\util_decorators.py",
line 188, in wrapper
return func(*args, **kwargs)
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\util_decorators.py",
line 188, in wrapper
return func(*args, **kwargs)
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\excel.py",
line 350, in read_excel
io = ExcelFile(io, engine=engine)
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\excel.py",
line 653, in init
self._reader = self._enginesengine
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\excel.py",
line 424, in init
self.book = xlrd.open_workbook(filepath_or_buffer)
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\xlrd__init__.py",
line 157, in open_workbook
ragged_rows=ragged_rows,
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\xlrd\book.py",
line 92, in open_workbook_xls
biff_version = bk.getbof(XL_WORKBOOK_GLOBALS)
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\xlrd\book.py",
line 1278, in getbof
bof_error('Expected BOF record; found %r' % self.mem[savpos:savpos+8])
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\xlrd\book.py",
line 1272, in bof_error
raise XLRDError('Unsupported format, or corrupt file: ' + msg)
XLRDError: Unsupported format, or corrupt file: Expected BOF record;
found b'\x0eWendt, '

Python - Trying to extract files from one location to another

I am trying to pull a set of files from a server and store them in a folder on my local machine. The code below works well for this task. However, if any of the files is empty, it stops at that point and does not continue.
# Collects one DataFrame per file that parses successfully.
# (Indentation was lost in this listing.)
list_ = []
for file_ in allFiles:
try:
df = pd.read_csv(file_, index_col=None, delim_whitespace=True)
list_.append(df)
# Re-concatenates everything read so far on every iteration.
temp = pd.concat(list_)
except EmptyDataError:
df = pd.DataFrame()
# NOTE(review): presumably inside the except branch. Returning here ends the
# whole loop at the first empty file, so the remaining files are never read.
return df
Could anyone advise how I can bypass these empty files and continue extracting the other files from the server? Thanks.
Update:
Given below is the function I am trying to perform
# (Indentation was lost in this listing.)
list_ = []
for file_ in allFiles:
try:
df = pd.read_csv(file_, index_col=None, header=None, delim_whitespace=True)
list_.append(df)
temp = pd.concat(list_)
except pd.errors.EmptyDataError:
# Skip an empty file and move on to the next one.
continue
# NOTE(review): this comprehension reads every path in allFiles again, outside
# any try/except, so an empty file still raises EmptyDataError here — that is
# the line the traceback below points at.
df_v1 = [pd.read_csv(fp, delim_whitespace=True).assign(FileName=os.path.basename(fp)) for fp in allFiles] <<-- Error thrown on this line as per trackback
df = pd.concat(df_v1, ignore_index=True, sort=False)
Traceback:
Traceback (most recent call last):
File "/Users/PycharmProjects/venv/try.py", line 102, in <module>
s3_func("stores","store_a", "2018-10-03", "2018-10-05")
File "/Users/PycharmProjects/venv/try.py", line 86, in s3_func
df_v1 = [pd.read_csv(fp, delim_whitespace=True).assign(FileName=os.path.basename(fp)) for fp in allFiles]
File "/Users/PycharmProjects/venv/try.py", line 86, in <listcomp>
df_v1 = [pd.read_csv(fp, delim_whitespace=True).assign(FileName=os.path.basename(fp)) for fp in allFiles]
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 678, in parser_f
return _read(filepath_or_buffer, kwds)
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 440, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 787, in __init__
self._make_engine(self.engine)
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 1014, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 1708, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 542, in pandas._libs.parsers.TextReader.__cinit__
pandas.errors.EmptyDataError: No columns to parse from file
Your loop exits as soon as it reaches the return statement.
If you want the iteration to continue when the exception occurs, you can do the following:
# Read every file in allFiles, skipping the ones that are empty.
list_ = []
for file_ in allFiles:
    try:
        # Only the read itself can raise EmptyDataError, so keep the try small.
        df = pd.read_csv(file_, index_col=None, delim_whitespace=True)
    except pd.errors.EmptyDataError:
        # Referenced through pd.errors so no separate import is needed.
        df = pd.DataFrame()
        continue  # Changed return with continue, since return breaks the loop.
    list_.append(df)
    # temp always holds the concatenation of every file read so far.
    temp = pd.concat(list_)
Also, I see that you are creating an empty data frame when the exception occurs. What do you do with that empty data frame? Do you need it later?
If you will need the empty data frames later, consider appending them to the list as well:
except EmptyDataError:
df = pd.DataFrame()
list_.append(df) # Appending empty dataframes to the list
continue

"Already tz-aware" error when reading h5 file using pandas, python 3 (but not 2)

I have an h5 store named weather.h5. My default Python environment is 3.5.2. When I try to read this store I get TypeError: Already tz-aware, use tz_convert to convert.
I've tried both pd.read_hdf('weather.h5','weather_history') and pd.io.pytables.HDFStore('weather.h5')['weather_history'], but I get the error no matter what.
I can open the h5 in a Python 2.7 environment. Is this a bug in Python 3 / pandas?
I have the same issue. I'm using Anaconda Python: 3.4.5 and 2.7.3. Both are using pandas 0.18.1.
Here is a reproducible example:
generate.py (to be executed with Python2):
import pandas as pd
from pandas import HDFStore
# Two timestamps written with an explicit -05:00 offset, with a
# timezone-aware dtype (CST6CDT) requested for the index.
index = pd.DatetimeIndex(['2017-06-20 06:00:06.984630-05:00', '2017-06-20 06:03:01.042616-05:00'], dtype='datetime64[ns, CST6CDT]', freq=None)
p1 = [0, 1]
p2 = [0, 2]
# Saving either of these dataframes causes issues.
# df1 uses the timestamps as its index; df2 keeps them as an ordinary column "i".
df1 = pd.DataFrame({"p1":p1, "p2":p2}, index=index)
df2 = pd.DataFrame({"p1":p1, "p2":p2, "i":index})
store = HDFStore("./test_issue.h5")
# Store df1 under the key 'df'; use the commented-out line to store df2 instead.
store['df'] = df1
#store['df'] = df2
store.close()
read_issue.py:
import pandas as pd
from pandas import HDFStore
# Open the store read-only, load the frame saved under '/df', then close the
# store before printing.
store = HDFStore("./test_issue.h5", mode="r")
df = store['/df']
store.close()
print(df)
Running read_issue.py in Python2 has no issues and produces this output:
p1 p2
2017-06-20 11:00:06.984630-05:00 0 0
2017-06-20 11:03:01.042616-05:00 1 2
But running it in Python3 produces Error with this traceback:
Traceback (most recent call last):
File "read_issue.py", line 5, in
df = store['df']
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 417, in getitem
return self.get(key)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 634, in get
return self._read_group(group)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 1272, in _read_group
return s.read(**kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 2779, in read
ax = self.read_index('axis%d' % i)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 2367, in read_index
_, index = self.read_index_node(getattr(self.group, key))
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 2492, in read_index_node
_unconvert_index(data, kind, encoding=self.encoding), **kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/indexes/base.py", line 153, in new
result = DatetimeIndex(data, copy=copy, name=name, **kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/util/decorators.py", line 91, in wrapper
return func(*args, **kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/tseries/index.py", line 321, in new
raise TypeError("Already tz-aware, use tz_convert "
TypeError: Already tz-aware, use tz_convert to convert.
Closing remaining open files:./test_issue.h5...done
So, there is an issue with indices. However, if you save df2 in generate.py (datetime as a column, not as an index), then Python3 in read_issue.py produces a different error:
Traceback (most recent call last):
File "read_issue.py", line 5, in
df = store['/df']
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 417, in getitem
return self.get(key)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 634, in get
return self._read_group(group)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 1272, in _read_group
return s.read(**kwargs)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/io/pytables.py", line 2788, in read
placement=items.get_indexer(blk_items))
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/core/internals.py", line 2518, in make_block
return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
File "/home/denper/anaconda3/envs/py34/lib/python3.4/site-packages/pandas/core/internals.py", line 90, in init
len(self.mgr_locs)))
ValueError: Wrong number of items passed 2, placement implies 1
Closing remaining open files:./test_issue.h5...done
Also, if you execute generate.py in Python3 (saving either df1 or df2), then there is no problem executing read_issue.py in either Python3 or Python2.

IO Error: csv file does not exist though it exists at given location specified

import pandas as pd
import os
import time
from datetime import datetime
# NOTE: indentation was lost in this listing, and the code is Python 2
# (see the print statement near the end).
# NOTE(review): non-raw string containing backslashes, with a space inside
# "python- 2.7.9"; the paths in the traceback below appear both with and
# without that space — confirm the real folder name.
path = "C:\WinPython-32bit-2.7.9.5\python- 2.7.9\Lib\idlelib\MuditPracticals\intraQuarter\intraQuarter"
# Scan saved HTML pages under <path>/_KeyStats, pull out the statistic named
# by gather together with the stock price and an S&P 500 value, and write the
# collected rows to a CSV file.
def Key_Stats(gather="Total Debt/Equity (mrq)"):
statspath = path+'/_KeyStats'
# x[0] is the directory path of each entry produced by os.walk.
stock_list = [x[0] for x in os.walk(statspath)]
df = pd.DataFrame(columns = ['Date','Unix','Ticker','DE Ratio','Price','SP500'])
# NOTE(review): bare file name with no folder, so it is not looked up under
# path. This is the call that raises IOError in the traceback below.
sp500_df = pd.DataFrame.from_csv("YAHOO-INDEX_GSPC.csv")
# Only entries 1 through 24 of stock_list are processed.
for each_dir in stock_list[1:25]:
each_file = os.listdir(each_dir)
# NOTE(review): takes the 4th backslash-separated component as the ticker.
# With the path above that component appears to be "Lib", not the stock
# folder name — TODO confirm.
ticker = each_dir.split("\\")[3]
if len(each_file) > 0:
for file in each_file:
# File names are parsed as YYYYMMDDHHMMSS.html timestamps.
date_stamp = datetime.strptime(file, '%Y%m%d%H%M%S.html')
unix_time = time.mktime(date_stamp.timetuple())
full_file_path = each_dir+'/'+file
source = open(full_file_path,'r').read()
try:
# Number found between the "<gather>:" table cell and the next '</td>'.
value = float(source.split(gather+':</td><td class="yfnc_tabledata1">')[1].split('</td>')[0])
try:
# Look up the "Adjusted Close" value for the page's date.
sp500_date = datetime.fromtimestamp(unix_time).strftime('%Y-%m-%d')
row = sp500_df[(sp500_df.index == sp500_date)]
sp500_value = float(row["Adjusted Close"])
except:
# On any failure, retry with the date 259200 seconds (3 days) earlier.
sp500_date = datetime.fromtimestamp(unix_time-259200).strftime('%Y-%m-%d')
row = sp500_df[(sp500_df.index == sp500_date)]
sp500_value = float(row["Adjusted Close"])
# Price is the text between '</small><big><b>' and '</b></big>'.
stock_price = float(source.split('</small><big><b>')[1].split('</b></big>')[0])
#print("stock_price:",stock_price,"ticker:", ticker)
df = df.append({'Date':date_stamp,
'Unix':unix_time,
'Ticker':ticker,
'DE Ratio':value,
'Price':stock_price,
'SP500':sp500_value}, ignore_index = True)
except Exception as e:
# NOTE(review): every exception raised in the try block above is reduced to
# printing "hello"; e is never used, so the cause is hidden.
print "hello"
# Output file name: gather with spaces, parentheses and slashes removed,
# followed by '.csv'.
save = gather.replace(' ','').replace(')','').replace('(','').replace('/','')+('.csv')
print(save)
df.to_csv(save)
# Run with the default statistic.
Key_Stats()
Compile Time Error In Spyder
File "<ipython-input-1-dfafbc7450e8>", line 1, in <module>
runfile('C:/WinPython-32bit-2.7.9.5/python- 2.7.9/Lib/idlelib/MuditPracticals/data_organisation1.py', wdir='C:/WinPython-32bit-2.7.9.5/python-2.7.9/Lib/idlelib/MuditPracticals')
File "C:\WinPython-32bit-2.7.9.5\python-2.7.9\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 682, in runfile
execfile(filename, namespace)
File "C:\WinPython-32bit-2.7.9.5\python-2.7.9\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 71, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/WinPython-32bit-2.7.9.5/python-2.7.9/Lib/idlelib/MuditPracticals/data_organisation1.py", line 56, in <module>
Key_Stats()
File "C:/WinPython-32bit-2.7.9.5/python-2.7.9/Lib/idlelib/MuditPracticals/data_organisation1.py", line 13, in Key_Stats
sp500_df = pd.DataFrame.from_csv("YAHOO-INDEX_GSPC.csv")
File "C:\WinPython-32bit-2.7.9.5\python-2.7.9\lib\site-packages\pandas\core\frame.py", line 1036, in from_csv
infer_datetime_format=infer_datetime_format)
File "C:\WinPython-32bit-2.7.9.5\python-2.7.9\lib\site-packages\pandas\io\parsers.py", line 474, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\WinPython-32bit-2.7.9.5\python-2.7.9\lib\site-packages\pandas\io\parsers.py", line 250, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "C:\WinPython-32bit-2.7.9.5\python-2.7.9\lib\site-packages\pandas\io\parsers.py", line 566, in __init__
self._make_engine(self.engine)
File "C:\WinPython-32bit-2.7.9.5\python-2.7.9\lib\site-packages\pandas\io\parsers.py", line 705, in _make_engine
``self._engine = CParserWrapper(self.f, **self.options)
File "C:\WinPython-32bit-2.7.9.5\python-2.7.9\lib\site-packages\pandas\io\parsers.py", line 1072, in __init__
self._reader = _parser.TextReader(src, **kwds)
File "pandas\parser.pyx", line 350, in pandas.parser.TextReader.__cinit__ (pandas\parser.c:3160)
File "pandas\parser.pyx", line 594, in pandas.parser.TextReader._setup_parser_source (pandas\parser.c:5905)
IOError: File YAHOO-INDEX_GSPC.csv does not exist
It gives an IOError even though the file exists at that location.
The IOError occurs as soon as the script is run.
Also, why is the pandas module not found in IDLE, while Spyder raises no pandas error?
The path to your .csv file is relative. If the file is not in your current working directory, Python will not find it.
"Though file exists at that location"... that is exactly the problem with relative paths: which location is "that location"?
here is a previous answer that should resolve the issue:
Python ConfigParser cannot search .ini file correctly (Ubuntu 14, Python 3.4)

Categories