Cannot access Excel file using pandas in Python

Hi, I am trying to run my Python code over several Excel files, get the data from each file, and save it into a data frame. Here is my code:
import os
import glob
import pandas as pd

path = r'C:\Users\user1\Desktop\test'
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']

df = pd.DataFrame()
for f in files_xls:
    filename, ext = os.path.splitext(f)
    data = pd.read_excel(f, filename)
    df = df.append(data)

a = df.describe()
print(a)
and I am getting this error. The first file in the folder I am working in is test.xls:
Traceback (most recent call last):
  File "test.py", line 20, in <module>
    data = pd.read_excel(f, filename)
  File "C:\Users\user1\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pandas\io\excel.py", line 170, in read_excel
    io = ExcelFile(io, engine=engine)
  File "C:\Users\user1\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pandas\io\excel.py", line 227, in __init__
    self.book = xlrd.open_workbook(io)
  File "C:\Users\user1\AppData\Local\Programs\Python\Python35-32\lib\site-packages\xlrd\__init__.py", line 395, in open_workbook
    with open(filename, "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: 'test.xls'

One fix is to change the working directory to the folder that holds the files, so the bare file names returned by os.listdir() resolve correctly:

import os
import pandas as pd

path = r'C:\Users\user1\Desktop\test'
os.chdir(path)
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']

df = pd.DataFrame()
for f in files_xls:
    data = pd.read_excel(f)
    df = df.append(data)

a = df.describe()
print(a)

The file is not found because you are using a relative reference to the Excel file, and the Python script may not reside in the same folder as the file. Hence, use an absolute reference, which does not depend on the location of the calling script. You can do so by concatenating the path to the file name using os.path.join():
import os
import pandas as pd

path = r'C:\Users\user1\Desktop\test'
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']

dfList = []
for f in files_xls:
    data = pd.read_excel(os.path.join(path, f))
    dfList.append(data)

df = pd.concat(dfList)
Alternatively, use glob, which avoids the manual extension check and returns the full path of each file:
import glob
import pandas as pd

path = r'C:\Users\user1\Desktop\test'
files_xls = glob.glob(path + r'\*.xls')

dfList = []
for f in files_xls:
    data = pd.read_excel(f)
    dfList.append(data)

df = pd.concat(dfList)
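A side note on top of these answers: on recent pandas releases DataFrame.append has been removed, so the list-plus-pd.concat pattern shown above is the one that keeps working. A compact variant of the same idea, assuming the same folder as the question:

import glob
import os
import pandas as pd

path = r'C:\Users\user1\Desktop\test'  # folder from the question
# read every .xls file found in the folder and stack the frames
frames = [pd.read_excel(p) for p in glob.glob(os.path.join(path, '*.xls'))]
df = pd.concat(frames, ignore_index=True)
print(df.describe())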

Related

Issues reading many excel files

I'm reading many xls files with this code:
import os
import pandas as pd

# reading the names of the files
path = os.getcwd()
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']

# creating an empty dataframe
dfs = pd.DataFrame()

# reading and appending the xls files
for i in files_xls:
    data = pd.read_excel(i, 'Sheet 1', converters={'CODIGO': str})
    dfs = dfs.append(data)
With this code, I can read all the xls files with no problem.
But when I want to define the path, I get an error.
# reading the names of the files
path = "/path/to/file"
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']

# creating an empty dataframe
dfs = pd.DataFrame()

# reading and appending the xls files
for i in files_xls:
    data = pd.read_excel(i, 'Sheet 1', converters={'CODIGO': str})
    dfs = dfs.append(data)
Error message:
FileNotFoundError: [Errno 2] No such file or directory: 'Acobamba000659a.xls'
How can I solve this?
os.listdir gives you the file name, not the path.
You can use jurez's solution or just use glob:
import glob
import pandas as pd

dfs = pd.DataFrame()
path = "/path/to/file/*.xls"
for i in glob.glob(path):
    data = pd.read_excel(i, 'Sheet 1', converters={'CODIGO': str})
    dfs = dfs.append(data)
You are probably forgetting that os.listdir() returns just the file names, without the path. You might try this:
files_xls = [os.path.join(path, f) for f in files if f[-3:] == 'xls']
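For completeness, a minimal sketch of how that joined list slots into the question's own loop (same path, sheet name, and converters as above); here the frames are collected in a list and concatenated once instead of appending inside the loop:

import os
import pandas as pd

path = "/path/to/file"  # directory from the question
files = os.listdir(path)
# prepend the directory so each entry is a full path, not just a file name
files_xls = [os.path.join(path, f) for f in files if f[-3:] == 'xls']

frames = []
for i in files_xls:
    frames.append(pd.read_excel(i, 'Sheet 1', converters={'CODIGO': str}))
dfs = pd.concat(frames)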

Merging csv files using Python

I am starting to learn Python and I would like to merge CSV files. I have found the following code:
from os import chdir
from glob import glob
import pandas as pdlib

# Produce a single CSV after combining all files
def produceOneCSV(list_of_files, file_out):
    # Consolidate all CSV files into one object
    result_obj = pdlib.concat([pdlib.read_csv(file) for file in list_of_files])
    # Convert the above object into a csv file and export
    result_obj.to_csv(file_out, index=False, encoding="utf-8")

# Move to the path that holds our CSV files
csv_file_path = 'c:/Users/user/Desktop/DUT1'
chdir(csv_file_path)

# List all CSV files in the working dir
file_pattern = ".csv"
list_of_files = [file for file in glob('*.{}'.format(file_pattern))]
print(list_of_files)

file_out = "ConsolidateOutput.csv"
produceOneCSV(list_of_files, file_out)
But I get this error when I try to run it:
Traceback (most recent call last):
  File "C:\Users\user\Desktop\DUT1\test.py", line 26, in <module>
    produceOneCSV(list_of_files, file_out)
  File "C:\Users\user\Desktop\DUT1\test.py", line 12, in produceOneCSV
    result_obj = pdlib.concat([pdlib.read_csv(file) for file in list_of_files])
  File "C:\Python\Python385\lib\site-packages\pandas\core\reshape\concat.py", line 274, in concat
    op = _Concatenator(
  File "C:\Python\Python385\lib\site-packages\pandas\core\reshape\concat.py", line 331, in __init__
    raise ValueError("No objects to concatenate")
ValueError: No objects to concatenate
I don't know why it doesn't work.
Furthermore, I would like to remove the headers from all the files except the first one.
I had a similar use-case for which I developed this code chunk. You can try it like this:
import pandas as pd
from glob import glob
import os

def joinCsvFiles(outFile, dirPath, filePattern="*.csv"):
    dfs = []
    globPattern = os.path.join(dirPath, filePattern)
    fileParts = glob(globPattern)
    for filePart in fileParts:
        df = pd.read_csv(filePart, index_col=False, header=0)
        dfs.append(df)
    print("[!]. Merging {} part files to create a consolidated file\n".format(len(dfs)))
    try:
        finalDf = pd.concat(dfs, sort=False)
        finalDf.to_csv(outFile, index=False)
        print("[>]. Consolidated csv file generated successfully at filepath: '{}'\n".format(outFile))
    except Exception as e:
        raise e

if __name__ == '__main__':
    joinCsvFiles("finalReport.csv", "c:/Users/user/Desktop/DUT1", "*.csv")
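Two follow-up notes, not part of the original answer but grounded in the code above. First, the error in the question most likely comes from the pattern: with file_pattern = ".csv", the expression '*.{}'.format(file_pattern) expands to '*..csv', which matches no files, so pdlib.concat receives an empty list and raises "No objects to concatenate"; calling glob('*.csv') directly avoids that. Second, because each part file is read with header=0 and the combined frame is written once with to_csv(index=False), the output keeps a single header row, which covers the request to drop the headers from every file except the first.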

reading multiple csv files from a different directory in python

import csv
import os
import pandas

df_list = []
path = "C:/Users/bubai/Desktop/try/scrapy/output"

# all csv files
for file in os.listdir(path):
    #print(file)
    df_list.append(file)  # all csv files in this folder
#print(df_list)

for i in df_list:
    df = pandas.read_csv(i)  # open one by one
    print(df)
I get this error: FileNotFoundError: [Errno 2] File b'poem1.csv' does not exist: b'poem1.csv'
The file names are saved like:
poem1.csv
poem10.csv
poem11.csv
poem12.csv
poem13.csv
poem14.csv
poem15.csv
poem16.csv
poem17.csv
poem18.csv
poem19.csv
poem2.csv
poem20.csv
You need to append the filename to the path.
import csv
import pandas
import os

df_list = []
path = "C:/Users/bubai/Desktop/try/scrapy/output"

# all csv files
for file in os.listdir(path):
    df_list.append(os.path.join(path, file))  # full path of each csv file
#print(df_list)

for i in df_list:
    df = pandas.read_csv(i)  # open one by one
    print(df)
You need to concatenate the directory name with the filename in order to refer to the file.

import os
df = pandas.read_csv(os.path.join(path, i))

How to import multiple Excel files into a pandas DataFrame

I cannot load multiple Excel files from a directory into a single DataFrame.
I have tried two different ways and neither works; I get the error shown below.
It does find the files when creating the list, but then cannot open them in the DataFrame.
How can I solve the problem? Any hints?
import pandas as pd
import os
import glob
import xlrd

cwd = os.getcwd()
cwd
path = '/Users/giovanni/Desktop/news media'
files = os.listdir(path)
files
files_xls = [f for f in files if f[-3:] == 'lsx']
files_xls

df = pd.DataFrame()
for f in files_xls:
    data = pd.read_excel(f)
    df = df.append(data)
FileNotFoundError: [Errno 2] No such file or directory: 'NOV.xlsx'
Try this:
import os
import glob
import pandas as pd

path = '/Users/giovanni/Desktop/news media'
df = pd.DataFrame()
for file in glob.glob(os.path.join(path, '*.xlsx')):
    data = pd.read_excel(file)
    print(data)
    df = df.append(data)
Replace your final loop with:
for f in files_xls:
    full_path = os.path.join(path, f)
    data = pd.read_excel(full_path)
    df = df.append(data)

Browse and merge Excel files in a folder and its subfolders in Python

I have a folder which contains multiple subfolders. I want to browse all Excel files ending with xlsx and merge them into one single xlsx file with the following code:
import os
import glob
import pandas as pd

for root, dirs, files in os.walk("D:/Test"):
    for file in files:
        if file.endswith(".xlsx"):
            #print(os.path.join(root, file))
            s = os.path.join(root, file)
            print(s)

all_data = pd.DataFrame()
for f in s:
    df = pd.read_excel(f)
    all_data = all_data.append(df, ignore_index=True)

# now save the data frame
writer = pd.ExcelWriter('result.xlsx')
all_data.to_excel(writer, 'sheet1')
writer.save()
An error happens when I run it:
Traceback (most recent call last):
  File "<ipython-input-169-41c6d76207e7>", line 12, in <module>
    df = pd.read_excel(f)
  File "C:\Users\User\Anaconda3\lib\site-packages\pandas\util\_decorators.py", line 118, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\User\Anaconda3\lib\site-packages\pandas\io\excel.py", line 230, in read_excel
    io = ExcelFile(io, engine=engine)
  File "C:\Users\User\Anaconda3\lib\site-packages\pandas\io\excel.py", line 294, in __init__
    self.book = xlrd.open_workbook(self._io)
  File "C:\Users\User\Anaconda3\lib\site-packages\xlrd\__init__.py", line 116, in open_workbook
    with open(filename, "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: 'D'
Does someone know how to deal with this problem? Thanks.
Your problem is with df = pd.read_excel(f). What are the contents of f? It looks like Python thinks that it's 'D'.
This is because your for f in s: is just iterating over the string that you created with s = os.path.join(root, file). I think you want to be saving these paths in some container, like so:
paths = []
for root, dirs, files in os.walk("D:/Test"):
    for file in files:
        if file.endswith(".xlsx"):
            #print(os.path.join(root, file))
            s = os.path.join(root, file)
            print(s)
            paths.append(s)

all_data = pd.DataFrame()
for f in paths:
    df = pd.read_excel(f)
    all_data = all_data.append(df, ignore_index=True)
You can also reduce that initial for loop to a list comprehension:
paths = [os.path.join(root, file) for root, _, files in os.walk('D:/Test') for file in files if file.endswith('.xlsx')]
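An equivalent shortcut, not from the original answer, is pathlib's recursive glob, which walks the subfolders for you:

from pathlib import Path

# recursively collect every .xlsx file under D:/Test, including subfolders
paths = [str(p) for p in Path('D:/Test').rglob('*.xlsx')]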
import os
import pandas as pd

listof_files = os.listdir()
current_file_name = os.path.basename(__file__)

# flag to make sure append is happening properly
count = 0
mainFrame = 0
for file in listof_files:
    # ignore the python script file itself when calling pd.read_excel
    if (file != current_file_name) and file.endswith(".xlsx"):
        tempdf = pd.read_excel(str(file))
        if count == 0:
            mainFrame = tempdf.copy()
        else:
            mainFrame = pd.concat([mainFrame, tempdf])
        count += 1

mainFrame.to_excel('final.xlsx', index=False)
You can also do it like this: put the script in the folder where you have all the xlsx files and run it from there; it will fetch all the xlsx files, concatenate them with each other, and finally a single Excel file is formed.
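One caveat with that approach: os.listdir() with no argument lists the current working directory, so the script must be run from inside that folder. A small, hypothetical variation that anchors the listing to the script's own location instead:

import os

# list the folder the script itself lives in, regardless of where it is run from
script_dir = os.path.dirname(os.path.abspath(__file__))
listof_files = os.listdir(script_dir)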
