How to import multiple Excel files into a pandas DataFrame - Python

I cannot load multiple Excel files from a directory into a single DataFrame.
I have tried two different ways and neither works; both give me the error below.
How can I solve this? The files are found when the list is created, but they cannot be opened into the DataFrame.
Any hints?
import pandas as pd
import os
import glob
import xlrd

cwd = os.getcwd()
cwd
path = '/Users/giovanni/Desktop/news media'
files = os.listdir(path)
files
files_xls = [f for f in files if f[-3:] == 'lsx']
files_xls
df = pd.DataFrame()
for f in files_xls:
    data = pd.read_excel(f)
    df = df.append(data)
FileNotFoundError: [Errno 2] No such file or directory: 'NOV.xlsx'

Try this:
import os
import glob
import pandas as pd

path = '/Users/giovanni/Desktop/news media'
df = pd.DataFrame()
# glob.glob returns full paths, so read_excel can find each file
for file in glob.glob(os.path.join(path, '*.xlsx')):
    data = pd.read_excel(file)
    print(data)
    df = df.append(data)

Replace your final loop with:
for f in files_xls:
    full_path = os.path.join(path, f)
    data = pd.read_excel(full_path)
    df = df.append(data)
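
For reference, a minimal sketch of the same fix written against current pandas (DataFrame.append was removed in pandas 2.0, so pd.concat is used instead); the directory path is the one from the question:

import glob
import os
import pandas as pd

path = '/Users/giovanni/Desktop/news media'
# glob returns full paths, so read_excel can open each workbook directly
frames = [pd.read_excel(f) for f in glob.glob(os.path.join(path, '*.xlsx'))]
df = pd.concat(frames, ignore_index=True)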

Related

Issues reading many excel files

I'm reading many xls files with this code:
import os
import pandas as pd

# reading the names of the files
path = os.getcwd()
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']

# creating an empty dataframe
dfs = pd.DataFrame()

# reading and appending the xls files
for i in files_xls:
    data = pd.read_excel(i, 'Sheet 1', converters={'CODIGO': str})
    dfs = dfs.append(data)
With this code, I can read all the xls files with no problem.
But when I define the path explicitly, I get an error.
# reading the names of the files
path = "/path/to/file"
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']

# creating an empty dataframe
dfs = pd.DataFrame()

# reading and appending the xls files
for i in files_xls:
    data = pd.read_excel(i, 'Sheet 1', converters={'CODIGO': str})
    dfs = dfs.append(data)
Error message:
FileNotFoundError: [Errno 2] No such file or directory: 'Acobamba000659a.xls'
How can I solve this?
os.listdir gives you the file names, not the paths. You can use jurez's solution (below) or just use glob:
import glob
import pandas as pd

dfs = pd.DataFrame()
path = "/path/to/file/*.xls"
# glob.glob expands the pattern to full paths
for i in glob.glob(path):
    data = pd.read_excel(i, 'Sheet 1', converters={'CODIGO': str})
    dfs = dfs.append(data)
You are probably forgetting that os.listdir() returns just the file names, without the path. You might try this:
files_xls = [os.path.join(path, f) for f in files if f[-3:] == 'xls']
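
Putting the two answers together, a minimal sketch of the corrected loop, keeping the sheet name and converters from the question (pd.concat is used because DataFrame.append is deprecated in recent pandas):

import os
import pandas as pd

path = "/path/to/file"  # directory placeholder from the question
files_xls = [os.path.join(path, f) for f in os.listdir(path) if f.endswith('.xls')]

# read each workbook and combine them into a single DataFrame
dfs = pd.concat(
    (pd.read_excel(f, 'Sheet 1', converters={'CODIGO': str}) for f in files_xls),
    ignore_index=True,
)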

Reading multiple CSV files from a different directory in Python

import csv
import os
import pandas

df_list = []
path = "C:/Users/bubai/Desktop/try/scrapy/output"
# all csv files
for file in os.listdir(path):
    # print(file)
    df_list.append(file)  # all csv files in this directory
# print(df_list)
for i in df_list:
    df = pandas.read_csv(i)  # open one by one
    print(df)
I get this error: FileNotFoundError: [Errno 2] File b'poem1.csv' does not exist: b'poem1.csv'
The file names are saved like:
poem10.csv
poem11.csv
poem12.csv
poem13.csv
poem14.csv
poem15.csv
poem16.csv
poem17.csv
poem18.csv
poem19.csv
poem2.csv
poem20.csv
You need to append the filename to the path.
import csv
import pandas
import os

df_list = []
path = "C:/Users/bubai/Desktop/try/scrapy/output"
# all csv files
for file in os.listdir(path):
    df_list.append(os.path.join(path, file))  # full path of every csv file
# print(df_list)
for i in df_list:
    df = pandas.read_csv(i)  # open one by one
    print(df)
You need to concatenate the directory name with the filename in order to refer to the file.
import os
df = pandas.read_csv(os.path.join(path, i))
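
If the goal is one combined DataFrame rather than printing each file, a minimal sketch using glob and pd.concat (the directory path is the one from the question):

import glob
import os
import pandas as pd

path = "C:/Users/bubai/Desktop/try/scrapy/output"

# glob yields full paths, so read_csv can open each file directly
csv_files = glob.glob(os.path.join(path, "*.csv"))
combined = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
print(combined.shape)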

Selecting files based on creation date

I have a folder named myclientcard with 69 subfolders. Each of those subfolders contains further subfolders, one of which is an error folder, and each error folder holds a number of txt files. I want to take the contents of the text files inside the error folders of all 69 subfolders, but only the files dated between 17/01/2019 and 24/01/2019, and convert them into an Excel file.
import os
import numpy as np
from os import listdir
from os.path import join
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile

mypath = r"D:\myclientcard"
files = [join(mypath, f) for f in listdir(mypath) if '.txt' not in f]
for file in files:
    path = file
    filename = [join(path, f) for f in listdir(path) if 'ERROR' in f]
    # print(filename)
    for text_file_path in filename:
        file_path = text_file_path
        textfiles = [join(file_path, f) for f in listdir(file_path) if '.txt' in f]
        for files in textfiles:
            reading_files = open(files, 'r')
            read = reading_files.read()
            writting_files = open('result.txt', 'a')
            wr = writting_files.write(read)
            read_files = pd.read_csv('result.txt', delim_whitespace='')
            writer = ExcelWriter('output.xlsx')
            read_files.to_excel(writer, 'Sheet1', index=False)
            writer.save()
            reading_files.close()
            writting_files.close()
Using the answers from here and here, and assuming you are on a Windows platform:
import os
import numpy as np
from os import listdir
from os.path import join
# Importing the datetime module
from datetime import datetime as dt
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile

mypath = r"D:\myclientcard"
# Add start date here
start_date = dt.strptime('17/01/2019', '%d/%m/%Y')
# Add end date here
end_date = dt.strptime('24/01/2019', '%d/%m/%Y')

files = [join(mypath, f) for f in listdir(mypath) if '.txt' not in f]
for file in files:
    path = file
    filename = [join(path, f) for f in listdir(path) if 'ERROR' in f]
    # print(filename)
    for text_file_path in filename:
        file_path = text_file_path
        textfiles = [join(file_path, f) for f in listdir(file_path) if '.txt' in f]
        # Filtering on the basis of creation date; os.path.getctime returns a
        # timestamp, so convert it to a datetime before comparing
        textfiles = [f for f in textfiles
                     if start_date <= dt.fromtimestamp(os.path.getctime(f)) <= end_date]
        for files in textfiles:
            reading_files = open(files, 'r')
            read = reading_files.read()
            writting_files = open('result.txt', 'a')
            wr = writting_files.write(read)
            read_files = pd.read_csv('result.txt', delim_whitespace='')
            writer = ExcelWriter('output.xlsx')
            read_files.to_excel(writer, 'Sheet1', index=False)
            writer.save()
            reading_files.close()
            writting_files.close()
On a side note, consider optimizing your code. Also try os.walk (see the sketch below); it can be useful at times!
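
For illustration, a minimal sketch of the same traversal with os.walk, using the folder layout and date range from the question; treat it as an outline rather than a drop-in replacement:

import os
from datetime import datetime as dt

mypath = r"D:\myclientcard"  # root folder from the question
start_date = dt.strptime('17/01/2019', '%d/%m/%Y')
end_date = dt.strptime('24/01/2019', '%d/%m/%Y')

selected = []
for root, dirs, files in os.walk(mypath):
    # only look inside folders whose name contains ERROR
    if 'ERROR' not in os.path.basename(root):
        continue
    for name in files:
        if not name.endswith('.txt'):
            continue
        full = os.path.join(root, name)
        # getctime returns a timestamp; convert it before comparing
        created = dt.fromtimestamp(os.path.getctime(full))
        if start_date <= created <= end_date:
            selected.append(full)

print(len(selected), 'text files in the date range')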

Loop through directory and create data frame

I'm trying to create a data frame and then loop through a directory filled with csv files and add those to the data frame. I'm trying to use the following code:
df = []
for dirName, subdirList, fileList in os.walk(rootDir):
    for fname in fileList:
        df = pd.read_csv(fname)
Unfortunately I'm getting an error stating that "File CIN_2017 does not exist" (it does). Any insight into how to add all these csv files into a dataframe? There is a .DS_Store in there but everything else is just a csv. Thanks.
You can try another solution with glob, which returns the file paths. Loop over them in a list comprehension to create a list of DataFrames, then concat them into one big df:
import glob
files = glob.glob('files/*.csv')
df = pd.concat([pd.read_csv(fp) for fp in files], ignore_index=True)
It is the same as:
import glob
files = glob.glob('files/*.csv')

dfs = []
for fp in files:
    dfs.append(pd.read_csv(fp))
df = pd.concat(dfs, ignore_index=True)
import os
import pandas as pd

un_process_file = []
master_frame = pd.DataFrame(columns=['item_sku', 'external_product_id', 'standard_price', 'quantity'])

for root, dirs, files in os.walk(os.getcwd()):
    for file_path in files:
        if file_path.endswith('.csv'):
            try:
                print(file_path)
                file_name = os.path.join(root, file_path)
                file_frames = pd.read_csv(file_name, skiprows=2,
                                          usecols=['item_sku', 'external_product_id', 'standard_price', 'quantity'])
                master_frame = master_frame.append(file_frames)
            except Exception:
                un_process_file.append(file_path)

master_frame = master_frame.rename(
    columns={'item_sku': 'sku', 'external_product_id': 'asin', 'standard_price': 'price'})
master_frame = master_frame.drop_duplicates(subset='asin')
master_frame.to_csv('masterfile.txt', sep='\t')

if un_process_file:
    print('\nUnable To Process these files\n')
    for files in un_process_file:
        print(files)
I had a similar problem and made this solution. Modify the column names according to your needs.
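
If the CSV files sit in nested subdirectories, as the os.walk in the question suggests, a recursive glob can collect them in one go; a short sketch, assuming the same 'files' directory used in the first answer:

import glob
import pandas as pd

# '**' with recursive=True walks subdirectories; files such as .DS_Store are
# skipped automatically because they do not match the *.csv pattern
paths = glob.glob('files/**/*.csv', recursive=True)
df = pd.concat((pd.read_csv(p) for p in paths), ignore_index=True)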

Cannot access Excel file using pandas in Python

Hi, I am trying to run my Python code over several Excel files, get the data from each file, and save it into a data frame. Here is my code:
import os
import glob
import pandas as pd

path = r'C:\Users\user1\Desktop\test'
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']

df = pd.DataFrame()
for f in files_xls:
    filename, ext = os.path.splitext(f)
    data = pd.read_excel(f, filename)
    df = df.append(data)

a = df.describe()
print(a)
and I am getting this error. The first file in the folder I am working in is test.xls:
Traceback (most recent call last):
  File "test.py", line 20, in <module>
    data = pd.read_excel(f, filename)
  File "C:\Users\user1\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pandas\io\excel.py", line 170, in read_excel
    io = ExcelFile(io, engine=engine)
  File "C:\Users\user1\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pandas\io\excel.py", line 227, in __init__
    self.book = xlrd.open_workbook(io)
  File "C:\Users\user1\AppData\Local\Programs\Python\Python35-32\lib\site-packages\xlrd\__init__.py", line 395, in open_workbook
    with open(filename, "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: 'test.xls'
One option is to change the working directory to the folder first, so the relative file names resolve:
import os
import pandas as pd

path = r'C:\Users\user1\Desktop\test'
os.chdir(path)

files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']

df = pd.DataFrame()
for f in files_xls:
    data = pd.read_excel(f)
    df = df.append(data)

a = df.describe()
print(a)
The file is not found because you are using a relative reference to the Excel file, and the Python script may not reside in the same folder as the file. Hence, use an absolute reference, which is not contingent on the location of the calling script. You can do so by concatenating the path to the file name using os.path.join():
import os
import pandas as pd

path = r'C:\Users\user1\Desktop\test'
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']

dfList = []
for f in files_xls:
    data = pd.read_excel(os.path.join(path, f))
    dfList.append(data)

df = pd.concat(dfList)
Alternatively, use glob, which avoids the check on the extension and retrieves the full paths of the files:
import glob
import pandas as pd

path = r'C:\Users\user1\Desktop\test'
files_xls = glob.glob(path + r'\*.xls')

dfList = []
for f in files_xls:
    data = pd.read_excel(f)
    dfList.append(data)

df = pd.concat(dfList)
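
A pathlib-based variant of the same idea, as a sketch; Path.glob also yields paths that include the directory, and pd.concat combines the frames:

from pathlib import Path
import pandas as pd

path = Path(r'C:\Users\user1\Desktop\test')  # directory from the question
frames = [pd.read_excel(p) for p in path.glob('*.xls')]
df = pd.concat(frames, ignore_index=True)
print(df.describe())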
