I am starting to learn Python and I would like to merge CSV files. I have found the following code:
from os import chdir
from glob import glob
import pandas as pdlib
# Produce a single CSV after combining all files
def produceOneCSV(list_of_files, file_out):
    # Consolidate all CSV files into one object
    result_obj = pdlib.concat([pdlib.read_csv(file) for file in list_of_files])
    # Convert the above object into a csv file and export
    result_obj.to_csv(file_out, index=False, encoding="utf-8")
# Move to the path that holds our CSV files
csv_file_path = 'c:/Users/user/Desktop/DUT1'
chdir(csv_file_path)
# List all CSV files in the working dir
file_pattern = ".csv"
list_of_files = [file for file in glob('*.{}'.format(file_pattern))]
print(list_of_files)
file_out = "ConsolidateOutput.csv"
produceOneCSV(list_of_files, file_out)
But I get these errors when I try to run it:
Traceback (most recent call last):
File "C:\Users\user\Desktop\DUT1\test.py", line 26, in <module>
produceOneCSV(list_of_files, file_out)
File "C:\Users\user\Desktop\DUT1\test.py", line 12, in produceOneCSV
result_obj = pdlib.concat([pdlib.read_csv(file) for file in list_of_files])
File "C:\Python\Python385\lib\site-packages\pandas\core\reshape\concat.py", line 274, in concat
op = _Concatenator(
File "C:\Python\Python385\lib\site-packages\pandas\core\reshape\concat.py", line 331, in __init__
raise ValueError("No objects to concatenate")
ValueError: No objects to concatenate
I don't know why it doesn't work.
Furthermore, I would like to remove the headers from all the files except the first one.
Your glob pattern is the problem: with file_pattern = ".csv", the expression '*.{}'.format(file_pattern) expands to '*..csv', which matches no files, so pandas.concat receives an empty list and raises "No objects to concatenate". I had a similar use-case for which I developed this code chunk; because each file is read with header=0 and the combined frame is written once with to_csv(index=False), the output also ends up with a single header row. You can try it like this:
import pandas as pd
from glob import glob
import os
def joinCsvFiles(outFile, dirPath, filePattern="*.csv"):
    dfs = []
    globPattern = os.path.join(dirPath, filePattern)
    fileParts = glob(globPattern)
    for filePart in fileParts:
        df = pd.read_csv(filePart, index_col=False, header=0)
        dfs.append(df)
    print("[!]. Merging {} part files to create a consolidated file\n".format(len(dfs)))
    try:
        finalDf = pd.concat(dfs, sort=False)
        finalDf.to_csv(outFile, index=False)
        print("[>]. Consolidated csv file generated successfully at filepath: '{}'\n".format(outFile))
    except Exception as e:
        raise e

if __name__ == '__main__':
    joinCsvFiles("finalReport.csv", "c:/Users/user/Desktop/DUT1", "*.csv")
I am trying to look through my directory for zip files and perform a function on them. It does seem to loop through some of the files correctly but gets stuck with an OSError on '\\'.
Directory structure is like:
//Stack/Over/Flow/2022 - 10/Original.zip
//Stack/Over/Flow/2022 - 09/Next file.zip
The function I call:
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile
import os
import pandas as pd
def process_files(files: list) -> pd.DataFrame:
    file_mapping = {}
    for file in files:
        #data_mapping = pd.read_excel(BytesIO(ZipFile(file).read(Path(file).stem)), sheet_name=None)
        archive = ZipFile(file)
        # find file names in the archive which end in `.xls`, `.xlsx`, `.xlsb`, ...
        files_in_archive = archive.namelist()
        excel_files_in_archive = [
            f for f in files_in_archive if Path(f).suffix[:4] == ".xls"
        ]
        # ensure we only have one file (otherwise, loop or choose one somehow)
        assert len(excel_files_in_archive) == 1
        # read in data
        data_mapping = pd.read_excel(
            BytesIO(archive.read(excel_files_in_archive[0])),
            sheet_name=None,
        )
        row_counts = []
        for sheet in list(data_mapping.keys()):
            row_counts.append(len(data_mapping.get(sheet)))
        file_mapping.update({file: sum(row_counts)})
    frame = pd.DataFrame([file_mapping]).transpose().reset_index()
    frame.columns = ["file_name", "row_counts"]
    return frame
code:
dir_path = r'\\stack\over\flow'

for root, dirs, files in os.walk(dir_path):
    for file in files:
        print(files)
        if file.endswith('.zip'):
            df = process_files(os.path.join(root, file))
            print(df)  # function
        else:
            print("nyeh")
Error:
runfile('/test.py', wdir='test python location')
['History Detail view (page 5) - Nov 2021.zip', 'Original - All fields - 11012021 - 11302021.zip', 'Other fields - 11012021 - 11302021.zip', 'Qualified - All fields - 11012021 - 11302021.zip', 'WPS Report (page 3) 11012021 - 11302021.zip']
Traceback (most recent call last):
File "test.py", line 75, in <module>
df = process_files(os.path.join(root, file))
File "test.py", line 21, in process_files
archive = ZipFile(file)
File "C:\Users\user\.conda\envs\diamond\lib\zipfile.py", line 1251, in __init__
self.fp = io.open(file, filemode)
OSError: [Errno 22] Invalid argument: '\\'
Why am I getting this error? How can I bypass this?
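The thread stops here without an accepted fix, but the traceback points at the call itself: process_files expects a list, while os.path.join(root, file) is a single string, so the for loop inside the function iterates over its characters and ZipFile('\\') fails. A sketch of one way to call it, assuming the function is meant to receive every zip path at once:

import os

dir_path = r'\\Stack\Over\Flow'

# collect the full path of every .zip under dir_path, then call process_files once
zip_paths = []
for root, dirs, files in os.walk(dir_path):
    for file in files:
        if file.endswith('.zip'):
            zip_paths.append(os.path.join(root, file))

df = process_files(zip_paths)  # pass a list of paths, not one path string
print(df)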
import csv
import os
import pandas

df_list = []
path = "C:/Users/bubai/Desktop/try/scrapy/output"
# all csv files
for file in os.listdir(path):
    #print(file)
    df_list.append(file)  # all csv files in this folder
#print(df_list)
for i in df_list:
    df = pandas.read_csv(i)  # open one by one
    print(df)
I get this error: FileNotFoundError: [Errno 2] File b'poem1.csv' does not exist: b'poem1.csv'
The file names are saved like this:
poem1.csv
poem10.csv
poem11.csv
poem12.csv
poem13.csv
poem14.csv
poem15.csv
poem16.csv
poem17.csv
poem18.csv
poem19.csv
poem2.csv
poem20.csv
You need to append the filename to the path.
import csv
import pandas
import os

df_list = []
path = "C:/Users/bubai/Desktop/try/scrapy/output"
# all csv files
for file in os.listdir(path):
    df_list.append(os.path.join(path, file))  # full path to each csv file
#print(df_list)
for i in df_list:
    df = pandas.read_csv(i)  # open one by one
    print(df)
You need to concatenate the directory name with the filename in order to refer to the file.
import os
df = pandas.read_csv(os.path.join(path, i))
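Alternatively, glob can return the full paths directly, which avoids the manual join; a small sketch assuming the same output directory:

import glob
import pandas

# a full pattern makes glob return complete paths, so read_csv can open them from anywhere
for csv_path in glob.glob("C:/Users/bubai/Desktop/try/scrapy/output/*.csv"):
    df = pandas.read_csv(csv_path)
    print(df)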
I have a folder which contains multiple subfolders. I want to browse all Excel files ending with .xlsx and merge them into one single .xlsx file with the following code:
import os
import glob
import pandas as pd

for root, dirs, files in os.walk("D:/Test"):
    for file in files:
        if file.endswith(".xlsx"):
            #print(os.path.join(root, file))
            s = os.path.join(root, file)
            print(s)

all_data = pd.DataFrame()
for f in s:
    df = pd.read_excel(f)
    all_data = all_data.append(df, ignore_index=True)

# now save the data frame
writer = pd.ExcelWriter('result.xlsx')
all_data.to_excel(writer, 'sheet1')
writer.save()
But an error happens when it runs:
Traceback (most recent call last):
File "<ipython-input-169-41c6d76207e7>", line 12, in <module>
df = pd.read_excel(f)
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\util\_decorators.py", line 118, in wrapper
return func(*args, **kwargs)
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\io\excel.py", line 230, in read_excel
io = ExcelFile(io, engine=engine)
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\io\excel.py", line 294, in __init__
self.book = xlrd.open_workbook(self._io)
File "C:\Users\User\Anaconda3\lib\site-packages\xlrd\__init__.py", line 116, in open_workbook
with open(filename, "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: 'D'
Does someone know how to deal with this problem? Thanks.
Your problem is with df = pd.read_excel(f). What are the contents of f? It looks like Python thinks that it's 'D'.
This is because your for f in s: is just iterating over the string that you created with s = os.path.join(root, file). I think you want to be saving these paths in some container, like so:
paths = []
for root, dirs, files in os.walk("D:/Test"):
    for file in files:
        if file.endswith(".xlsx"):
            #print(os.path.join(root, file))
            s = os.path.join(root, file)
            print(s)
            paths.append(s)

all_data = pd.DataFrame()
for f in paths:
    df = pd.read_excel(f)
    all_data = all_data.append(df, ignore_index=True)
You can also reduce that initial for loop to a list comprehension with:
paths = [os.path.join(root, file) for root, _, files in os.walk('D:/Test') for file in files if file.endswith('.xlsx')]
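As a side note, DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so on recent versions the same consolidation is usually written with pd.concat; a sketch under that assumption:

import os
import pandas as pd

paths = [os.path.join(root, file)
         for root, _, files in os.walk('D:/Test')
         for file in files if file.endswith('.xlsx')]

# read every workbook and concatenate the resulting frames in a single call
all_data = pd.concat((pd.read_excel(p) for p in paths), ignore_index=True)
all_data.to_excel('result.xlsx', sheet_name='sheet1', index=False)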
import os
import pandas as pd

listof_files = os.listdir()
current_file_name = os.path.basename(__file__)
# flag to make sure append is happening properly
count = 0
mainFrame = 0
for file in listof_files:
    # ignore the python script file itself when calling pd.read_excel
    if (file != current_file_name) and file.endswith(".xlsx"):
        tempdf = pd.read_excel(str(file))
        if count == 0:
            mainFrame = tempdf.copy()
        else:
            mainFrame = pd.concat([mainFrame, tempdf])
        count += 1
mainFrame.to_excel('final.xlsx', index=False)
You can also do it like this: put the script in the folder where you have all the xlsx files and run it there. It will fetch all the xlsx files, concat them with each other, and finally a single Excel file is formed.
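If running the script from inside the data folder is inconvenient, the same approach works with an explicit directory; a small sketch, where the folder path is a hypothetical example:

import os
import pandas as pd

folder = r'C:\data\xlsx_parts'  # hypothetical folder holding the workbooks
frames = [pd.read_excel(os.path.join(folder, f))
          for f in os.listdir(folder) if f.endswith('.xlsx')]
pd.concat(frames).to_excel('final.xlsx', index=False)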
I'm trying to find a way to put multiple .xls files into a single .xls with separate sheets (so 1.xls will go under Sheet1, etc.).
Here's my code
mypath = raw_input("Please enter the directory path for the input files: ")

from os import listdir
from os.path import isfile, join
textfiles = [join(mypath, f) for f in listdir(mypath) if isfile(join(mypath, f)) and '.txt' in f]

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

import xlwt
import xlrd

style = xlwt.XFStyle()
style.num_format_str = '#,###0.00'

for textfile in textfiles:
    f = open(textfile, 'r+')
    row_list = []
    for row in f:
        row_list.append(row.split('\t'))
    column_list = zip(*row_list)
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('Sheet1')
    i = 0
    for column in column_list:
        for item in range(len(column)):
            value = column[item].strip()
            if is_number(value):
                worksheet.write(item, i, float(value), style=style)
            else:
                worksheet.write(item, i, value)
        i += 1
    workbook.save(textfile.replace('.txt', '.xls'))
import glob, os
import pandas as pd
writer = pd.ExcelWriter('C:\Users\xxx\Desktop\forpythonscript\minonna.xls')
i=1
for xlsfile in glob.glob(os.path.abspath('C:\Users\xxx\Desktop\forpythonscript\*.xls')):
    df = pd.read_excel(xlsfile)
    df.to_excel(writer, 'sheet%s' % i)
    i += 1
writer.save()
Here's the error when I run it in Anaconda.
Traceback (most recent call last):
File "C:\Users\xxx\Desktop\provaimport.py", line 51, in
writer.save()
File "C:\Users\xxx\Anaconda2\lib\site-packages\pandas\io\excel.py", line 1423, in save
return self.book.save(self.path)
File "C:\Users\xxx\Anaconda2\lib\site-packages\xlwt\Workbook.py", line 710, in save
doc.save(filename_or_stream, self.get_biff_data())
File "C:\Users\xxx\Anaconda2\lib\site-packages\xlwt\Workbook.py", line 680, in get_biff_data
self.__worksheets[self.__active_sheet].selected = True
IndexError: list index out of range
I can't comment, but I think it's caused by two possible issues.
Do the saved workbooks in the folder only have a single sheet? Typically with read_excel you also select a tab name to read from.
Also, have you tried setting the following:
df.to_excel(writer, 'sheet%s' % i, index=False)
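For what it's worth, the IndexError is raised when xlwt tries to save a workbook that contains no sheets, which happens whenever the glob matches no files. One possible culprit (not confirmed in this thread) is the non-raw Windows path: escape sequences such as \f are interpreted before glob ever sees the pattern. A sketch using a raw string and a guard for the empty case:

import glob
import os
import pandas as pd

folder = r'C:\Users\xxx\Desktop\forpythonscript'  # raw string keeps the backslashes literal
xls_files = glob.glob(os.path.join(folder, '*.xls'))

if not xls_files:
    raise SystemExit("No .xls files found - nothing to write, so saving would fail.")

writer = pd.ExcelWriter(os.path.join(folder, 'minonna.xls'))
for i, xlsfile in enumerate(xls_files, start=1):
    df = pd.read_excel(xlsfile)
    df.to_excel(writer, 'sheet%s' % i, index=False)
writer.save()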
Hi, I am trying to run my Python code over several Excel files, get the data from each file, and save it into a data frame. Here is my code:
import os
import glob
import pandas as pd

path = r'C:\Users\user1\Desktop\test'
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']

df = pd.DataFrame()
for f in files_xls:
    filename, ext = os.path.splitext(f)
    data = pd.read_excel(f, filename)
    df = df.append(data)

a = df.describe()
print(a)
and I am getting this error. The first file in the folder I am working in is test.xls:
Traceback (most recent call last):
File "test.py", line 20, in <module>
data = pd.read_excel(f, filename)
File "C:\Users\user1\AppData\Local\Programs\Python\Python35-32\lib\site- packages\pandas\io\excel.py", line 170, in read_excel
io = ExcelFile(io, engine=engine)
File "C:\Users\user1\AppData\Local\Programs\Python\Python35-32\lib\site-packages\pandas\io\excel.py", line 227, in __init__
self.book = xlrd.open_workbook(io)
File "C:\Users\user1\AppData\Local\Programs\Python\Python35-32\lib\site-packages\xlrd\__init__.py", line 395, in open_workbook
with open(filename, "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: 'test.xls'
import os
import pandas as pd

path = r'C:\Users\user1\Desktop\test'
os.chdir(path)

files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']

df = pd.DataFrame()
for f in files_xls:
    data = pd.read_excel(f)
    df = df.append(data)

a = df.describe()
print(a)
The file is not found because you are using a relative reference to the Excel file, and the Python script may not reside in the same folder as the file. Hence, use an absolute reference, which is not contingent on the location of the calling script. You can do so by concatenating the path to the file name using os.path.join():
import os
import pandas as pd

path = r'C:\Users\user1\Desktop\test'
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'xls']

dfList = []
for f in files_xls:
    data = pd.read_excel(os.path.join(path, f))
    dfList.append(data)

df = pd.concat(dfList)
Alternatively, use glob, which avoids the check on the extension and retrieves the full paths of the files:
import glob
import pandas as pd

path = r'C:\Users\user1\Desktop\test'
files_xls = glob.glob(path + '\*.xls')

dfList = []
for f in files_xls:
    data = pd.read_excel(f)
    dfList.append(data)

df = pd.concat(dfList)
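To reproduce the summary from the original question once the frames are combined, a short usage sketch with the same path assumption as above:

import glob
import os
import pandas as pd

path = r'C:\Users\user1\Desktop\test'
# os.path.join avoids hard-coding the backslash separator in the glob pattern
files_xls = glob.glob(os.path.join(path, '*.xls'))

df = pd.concat(pd.read_excel(f) for f in files_xls)
print(df.describe())  # the statistics the question was after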