Issues reading many excel files

Issues reading many excel files - python

I'm reading many xls files with this code:
import os
import pandas as pd
#reading the name of files
path = os.getcwd()
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']
#creating empty dataframe
dfs = pd.DataFrame()
#reading and append xls files
for i in files_xls:
data = pd.read_excel(i, 'Sheet 1',converters={'CODIGO':str})
dfs = dfs.append(data)
With this code, I can read all the xls files with no problem.
But when I want to define the path, I get an error.
#reading the name of files
path = "/path/to/file"
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']
#creating empty dataframe
dfs = pd.DataFrame()
#reading and append xls files
for i in files_xls:
data = pd.read_excel(i, 'Sheet 1',converters={'CODIGO':str})
dfs = dfs.append(data)
Error message:
FileNotFoundError: [Errno 2] No such file or directory: 'Acobamba000659a.xls'
How can I solve this?

os.listdir gives you file name not path.
you can use jurez solution or just use glob
import glob
dfs = pd.DataFrame()
path = "/path/to/file/*.xls"
for i in glob.glob(path):
data = pd.read_excel(i, 'Sheet 1',converters={'CODIGO':str})
dfs = dfs.append(data)

You are probably forgetting that os.listdir() returns just the file names, without the path. You might try this:
files_xls = [os.path.join(path, f) for f in files if f[-3:] == 'xls']

Related

Python, Reading Zip files of a subdirectory. Windows object is not iterable

I am trying to loop through my subdirectories to read in my zip files. I am getting error TypeError: 'WindowsPath' object is not iterable
What i am trying:
path = Path("O:/Stack/Over/Flow/")
for p in path.rglob("*"):
print(p.name)
zip_files = (str(x) for x in Path(p.name).glob("*.zip"))
df = process_files(p) #function
What does work - when I go to the folder directly with my path:
path = r'O:/Stack/Over/Flow/2022 - 10/'
zip_files = (str(x) for x in Path(path).glob("*.zip"))
df = process_files(zip_files)
any help would be appreciated.
Directory structure is like:
//Stack/Over/Flow/2022 - 10/Original.zip
//Stack/Over/Flow/2022 - 09/Next file.zip
function i call:
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile
import os
import pandas as pd
def process_files(files: list) -> pd.DataFrame:
file_mapping = {}
for file in files:
#data_mapping = pd.read_excel(BytesIO(ZipFile(file).read(Path(file).stem)), sheet_name=None)
archive = ZipFile(file)
# find file names in the archive which end in `.xls`, `.xlsx`, `.xlsb`, ...
files_in_archive = archive.namelist()
excel_files_in_archive = [
f for f in files_in_archive if Path(f).suffix[:4] == ".xls"
]
# ensure we only have one file (otherwise, loop or choose one somehow)
assert len(excel_files_in_archive) == 1
# read in data
data_mapping = pd.read_excel(
BytesIO(archive.read(excel_files_in_archive[0])),
sheet_name=None,
)
row_counts = []
for sheet in list(data_mapping.keys()):
row_counts.append(len(data_mapping.get(sheet)))
file_mapping.update({file: sum(row_counts)})
frame = pd.DataFrame([file_mapping]).transpose().reset_index()
frame.columns = ["file_name", "row_counts"]
return frame
New : what I am trying
for root, dirs, files in os.walk(dir_path):
for file in files:
print(files)
if file.endswith('.zip'):
df = process_files(os.path.join(root, file))
print(df) #function
else:
print("nyeh")
This is returning files like Original - All fields - 11012021 - 11302021.zip but then i get an error OSError: [Errno 22] Invalid argument: '\\'

A possible solution using os.walk():
zip_files = []
for root, dirs, files in os.walk(main_path):
for file in files:
if file.endswith('.zip'):
zip_files.append(os.path.join(root, file))
df = process_files(zip_files) #function

Add folder name to exported file

I'm hoping someone can assist. I want to add the folder name to a file export so the exported filename is "combined_summary_of .xls" but can't seem to be able to add in the right reference name. The list of folders does work but stuck at the folder name.
import os
import glob
import pandas as pd
df_list = list() # list of dataframes
folder = r"D:/summary_tables/"
os.chdir(folder)
for root, dirs, files in os.walk(folder):
for folder in folder:
keyword = folder
os.chdir("D:/summary_tables/")
glob.glob("D:/summary_tables/"+ keyword + "/filtered*.xls")
#initialize a empty dataframe and append individual files
all_data = pd.DataFrame()
for f in glob.glob("D:/summary_tables/" +keyword + "/filtered*.xls"):
df = pd.read_excel(f)
all_data = all_data.append(df,ignore_index=True)
all_data.head()
#group all of the files together and sort
all_data2 = pd.concat([all_data]).groupby(['host_name_queried']).sum().reset_index()
all_data2 = all_data2.sort_values('Total_count', ascending=False)
all_data2.head(n=10)
all_data2['Total_nx_domain'] = all_data2['Total_nx_domain'].astype(float)
#send to xls
import openpyxl
all_data2.to_excel('D:/summary_tables/combined_summary_of_' + '.xls', index=False)
print ("file has been saved")
all_data

How to Import Multiple excel file in PandasDataframe

I cannot load multiple excel files from a directory in only one Dataframe.
I have tried two different ways and both do no work.
Gives me this error.
How can I solve the problem? It does find the files when creates the list, but than cannot open it in the Dataframe.
Any hints ?
import pandas as pd
import os
import glob
import xlrd
cwd = os.getcwd()
cwd
path = '/Users/giovanni/Desktop/news media'
files = os.listdir(path)
files
files_xls = [f for f in files if f[-3:] == 'lsx']
files_xls
df = pd.DataFrame()
for f in files_xls:
data = pd.read_excel(f)
df = df.append(data)
FileNotFoundError: [Errno 2] No such file or directory: 'NOV.xlsx'

Try this:
import os
import glob
path = '/Users/giovanni/Desktop/news media'
df = pd.DataFrame()
for file in glob.glob(os.path.join(path,'*.xlsx')):
data = pd.read_excel(file)
print(data)
df = df.append(data)

Replace your final loop with:
for f in files_xls:
full_path = os.path.join(path, f)
data = pd.read_excel(full_path)
df = df.append(data)

Loop through directory and create data frame

I'm trying to create a data frame and then loop through a directory filled with csv files and add those to the data frame. I'm trying to use the following code:
df = []
for dirName, subdirList, fileList in os.walk(rootDir):
for fname in fileList:
df = pd.read_csv(fname)
Unfortunately I'm getting an error stating that "File CIN_2017 does not exist" (it does). Any insight into how to add all these csv files into a dataframe? There is a .DS_Store in there but everything else is just a csv. Thanks.

You can try another solution with glob for return file names, then loop in list comprehension and create list of DataFrames. last concate them to one big df:
import glob
files = glob.glob('files/*.csv')
df = pd.concat([pd.read_csv(fp) for fp in files], ignore_index=True)
It is same as:
import glob
files = glob.glob('files/*.csv')
dfs = []
for fp in files:
dfs.append(pd.read_csv(fp))
df = pd.concat(dfs, ignore_index=True)

import os
import pandas as pd
un_process_file = []
master_frame = pd.DataFrame(columns=['item_sku', 'external_product_id', 'standard_price', 'quantity'])
for root, dirs, files in os.walk(os.getcwd()):
for file_path in files:
if file_path.endswith('.csv'):
try:
print file_path
file_name = os.path.join(root, file_path)
file_frames = pd.read_csv(file_name, skiprows=2,
usecols=['item_sku', 'external_product_id', 'standard_price', 'quantity'])
master_frame = master_frame.append(file_frames)
except:
un_process_file.append(file_path)
master_frame = master_frame.rename(
columns={'item_sku': 'sku', 'external_product_id': 'asin', 'standard_price': 'price'})
master_frame = master_frame.drop_duplicates(subset='asin')
master_frame.to_csv('masterfile.txt', sep='\t')
if un_process_file:
print '\nUnable To Process these files\n'
for files in un_process_file:
print files
I have a similar problem. I made this solution. Modify columns name according to you need

cannot access excel file using Pandas Python

Hi i am trying to run my python code through several excel files and get the data from each file and save into a data frame. Here is my code..
import os
import glob
import pandas as pd
path =r'C:\Users\user1\Desktop\test'
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']
df = pd.DataFrame()
for f in files_xls:
filename, ext = os.path.splitext(f)
data = pd.read_excel(f, filename)
df = df.append(data)
a = df.describe()
print (a)
and i am getting this error.. the first file in the folder i am working at is test.xls
Traceback (most recent call last):
File "test.py", line 20, in <module>
data = pd.read_excel(f, filename)
File "C:\Users\user1\AppData\Local\Programs\Python\Python35-32\lib\site- packages\pandas\io\excel.py", line 170, in read_excel
io = ExcelFile(io, engine=engine)
File "C:\Users\user1\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pandas\io\excel.py", line 227, in __init__
self.book = xlrd.open_workbook(io)
File "C:\Users\user1\AppData\Local\Programs\Python\Python35-32\lib\site-packages\xlrd\__init__.py", line 395, in open_workbook
with open(filename, "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: 'test.xls'

import os
import pandas as pd
path =r'C:\Users\user1\Desktop\test'
os.chdir(path)
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']
df = pd.DataFrame()
for f in files_xls:
data = pd.read_excel(f)
df = df.append(data)
a = df.describe()
print (a)

File is not found because you are calling a relative reference to Excel file and the Python script may not reside in same folder as the file. Hence, use absolute reference which is not contigent to location of called script. You can do so by concatenating the path to file name using os.path.join():
import os
import pandas as pd
path = r'C:\Users\user1\Desktop\test'
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']
dfList = []
for f in files_xls:
data = pd.read_excel(os.path.join(path, f))
dfList.append(data)
df = pd.concat(dfList)
Alternatively, use glob which avoids check on extension and retrieves full path of files:
import glob
import pandas as pd
path = r'C:\Users\user1\Desktop\test'
files_xls = glob.glob(path+'\*.xls')
dfList = []
for f in files_xls:
data = pd.read_excel(f)
dfList.append(data)
df = pd.concat(dfList)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Issues reading many excel files - python

os.listdir gives you file name not path. you can use jurez solution or just use glob import glob dfs = pd.DataFrame() path = "/path/to/file/*.xls" for i in glob.glob(path): data = pd.read_excel(i, 'Sheet 1',converters={'CODIGO':str}) dfs = dfs.append(data)

You are probably forgetting that os.listdir() returns just the file names, without the path. You might try this: files_xls = [os.path.join(path, f) for f in files if f[-3:] == 'xls']

Related

Python, Reading Zip files of a subdirectory. Windows object is not iterable

Add folder name to exported file

How to Import Multiple excel file in PandasDataframe

Loop through directory and create data frame

cannot access excel file using Pandas Python

Categories

Resources