import csv
import os
import pandas

df_list = []
path = "C:/Users/bubai/Desktop/try/scrapy/output"

# collect all csv file names in the directory
for file in os.listdir(path):
    #print(file)
    df_list.append(file)  # all csv file names end up in this list

#print(df_list)
for i in df_list:
    df = pandas.read_csv(i)  # open one by one
    print(df)
I get this error: FileNotFoundError: [Errno 2] File b'poem1.csv' does not exist: b'poem1.csv'
The file names are saved like this:
poem1.csv
poem10.csv
poem11.csv
poem12.csv
poem13.csv
poem14.csv
poem15.csv
poem16.csv
poem17.csv
poem18.csv
poem19.csv
poem2.csv
poem20.csv
You need to append the filename to the path.
import csv
import os
import pandas

df_list = []
path = "C:/Users/bubai/Desktop/try/scrapy/output"

# collect the full path of every csv file in the directory
for file in os.listdir(path):
    df_list.append(os.path.join(path, file))  # full path, not just the filename

#print(df_list)
for i in df_list:
    df = pandas.read_csv(i)  # open one by one
    print(df)
You need to concatenate the directory name with the filename in order to refer to the file.
import os
df = pandas.read_csv(os.path.join(path, i))
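As a side note, if the output folder may also contain non-CSV files, a glob pattern keeps the loop restricted to .csv files and already yields full paths (a minimal sketch, not part of the original answers):

import glob
import os
import pandas

path = "C:/Users/bubai/Desktop/try/scrapy/output"

# glob returns full paths, so each match can be passed straight to read_csv
for csv_path in glob.glob(os.path.join(path, "*.csv")):
    df = pandas.read_csv(csv_path)
    print(df)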
I have a Python script that reads a CSV file and outputs it to HTML.
I would like to change the order of the columns once they are written to the HTML file, but I don't know whether to do this on read or on write, or which function to use. Here is my code so far.
import pandas as pd
import os
import shutil
import glob

#paths
HTMLPATH="C:/NMS4/QUE/HTML/"
QUEPATH="C:/NMS4/QUE/"

#create the directory for holding the HTML files if it doesn't exist
isExist = os.path.exists(HTMLPATH)
if not isExist:
    os.mkdir(HTMLPATH, 0o777)

#convert each .txt file to an html file in the HTML folder
#can't convert an empty file, so only convert if the file size is not 0
for quefile in glob.iglob('C:/NMS4/QUE/*.txt'):
    if os.path.getsize(quefile) != 0:
        csv = pd.read_csv(quefile, header=None, usecols=[0,3,4,15,34,43,44,129], names=['OrderNo','Req Qty','Planned Start','Resource','Op','Part','Desc','Qty Recd'])
        html_table = csv.to_html()
        f = open(quefile + '.html', 'w')
        f.write(html_table)
        f.close()
        shutil.move(quefile + ".HTML", HTMLPATH)
Any help, greatly appreciated.
Thanks.
I've been looking at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_html.html, but can't quite find out how to re-order the columns.
Edit: the changes needed to get the solution are:
import pandas as pd
import os
import shutil
import glob

#paths
HTMLPATH="C:/NMS4/QUE/HTML/"
QUEPATH="C:/NMS4/QUE/"

#create the directory for holding the HTML files if it doesn't exist
isExist = os.path.exists(HTMLPATH)
if not isExist:
    os.mkdir(HTMLPATH, 0o777)

#convert each .txt file to an html file in the HTML folder
#python can't convert an empty file, so only convert if the file size is not 0
for quefile in glob.iglob('C:/NMS4/QUE/*.txt'):
    if os.path.getsize(quefile) != 0:
        csv = pd.read_csv(quefile, header=None, usecols=[0,3,4,15,34,43,44,129], names=['Order No','Req Qty','Planned Start','Resource','Op','Part','Desc','Qty Recd'])
        #reorder the columns by swapping their positions in the column list
        cols = list(csv.columns)
        a, b, c, d, e, f, g = cols.index('Order No'), cols.index('Req Qty'), cols.index('Planned Start'), cols.index('Resource'), cols.index('Op'), cols.index('Part'), cols.index('Desc')
        cols[a], cols[b], cols[c], cols[d], cols[e], cols[f], cols[g] = cols[a], cols[e], cols[f], cols[g], cols[c], cols[b], cols[d]
        df = csv[cols]
        html_table = df.to_html()
        f = open(quefile + '.html', 'w')
        f.write(html_table)
        f.close()
        shutil.move(quefile + ".HTML", HTMLPATH)
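As an aside, pandas.DataFrame.to_html also accepts a columns argument, so the reordering can be expressed directly as a list of column names instead of swapping index positions. A minimal sketch that could replace the cols handling inside the loop above (the order shown is only an example):

# the names come from the names= list passed to read_csv; list them in whatever order is wanted
desired_order = ['Order No', 'Op', 'Part', 'Desc', 'Planned Start', 'Req Qty', 'Resource', 'Qty Recd']
html_table = csv.to_html(columns=desired_order)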
I am starting to learn Python and I would like to merge CSV files. I have found the following code:
from os import chdir
from glob import glob
import pandas as pdlib

# Produce a single CSV after combining all files
def produceOneCSV(list_of_files, file_out):
    # Consolidate all CSV files into one object
    result_obj = pdlib.concat([pdlib.read_csv(file) for file in list_of_files])
    # Convert the above object into a csv file and export
    result_obj.to_csv(file_out, index=False, encoding="utf-8")

# Move to the path that holds our CSV files
csv_file_path = 'c:/Users/user/Desktop/DUT1'
chdir(csv_file_path)

# List all CSV files in the working dir
file_pattern = ".csv"
list_of_files = [file for file in glob('*.{}'.format(file_pattern))]
print(list_of_files)

file_out = "ConsolidateOutput.csv"
produceOneCSV(list_of_files, file_out)
But I get this error when I try to run it:
Traceback (most recent call last):
  File "C:\Users\user\Desktop\DUT1\test.py", line 26, in <module>
    produceOneCSV(list_of_files, file_out)
  File "C:\Users\user\Desktop\DUT1\test.py", line 12, in produceOneCSV
    result_obj = pdlib.concat([pdlib.read_csv(file) for file in list_of_files])
  File "C:\Python\Python385\lib\site-packages\pandas\core\reshape\concat.py", line 274, in concat
    op = _Concatenator(
  File "C:\Python\Python385\lib\site-packages\pandas\core\reshape\concat.py", line 331, in __init__
    raise ValueError("No objects to concatenate")
ValueError: No objects to concatenate
I don't know why it doesn't work.
Furthermore, I would like to remove the headers from all the files except the first one.
The error happens because file_pattern = ".csv" already contains the dot, so '*.{}'.format(file_pattern) builds the pattern '*..csv', which matches no files; glob then returns an empty list and concat has nothing to concatenate. I had a similar use-case for which I developed this code chunk. Each part file is read with header=0 and the consolidated frame is written out once, so the output contains only a single header row. You can try it like this:
import pandas as pd
from glob import glob
import os

def joinCsvFiles(outFile, dirPath, filePattern="*.csv"):
    dfs = []
    globPattern = os.path.join(dirPath, filePattern)
    fileParts = glob(globPattern)
    for filePart in fileParts:
        df = pd.read_csv(filePart, index_col=False, header=0)
        dfs.append(df)
    print("[!]. Merging {} part files to create a consolidated file\n".format(len(dfs)))
    try:
        finalDf = pd.concat(dfs, sort=False)
        finalDf.to_csv(outFile, index=False)
        print("[>]. Consolidated csv file generated successfully at filepath: '{}'\n".format(outFile))
    except Exception as e:
        raise e

if __name__ == '__main__':
    joinCsvFiles("finalReport.csv", "c:/Users/user/Desktop/DUT1", "*.csv")
I'm hoping someone can assist. I want to add the folder name to a file export so that the exported filename is "combined_summary_of_<folder name>.xls", but I can't seem to add the right reference name. Listing the folders does work, but I am stuck on getting the folder name into the filename.
import os
import glob
import pandas as pd

df_list = list()  # list of dataframes
folder = r"D:/summary_tables/"
os.chdir(folder)

for root, dirs, files in os.walk(folder):
    for folder in folder:
        keyword = folder
        os.chdir("D:/summary_tables/")
        glob.glob("D:/summary_tables/" + keyword + "/filtered*.xls")

        #initialize an empty dataframe and append individual files
        all_data = pd.DataFrame()
        for f in glob.glob("D:/summary_tables/" + keyword + "/filtered*.xls"):
            df = pd.read_excel(f)
            all_data = all_data.append(df, ignore_index=True)
        all_data.head()

        #group all of the files together and sort
        all_data2 = pd.concat([all_data]).groupby(['host_name_queried']).sum().reset_index()
        all_data2 = all_data2.sort_values('Total_count', ascending=False)
        all_data2.head(n=10)
        all_data2['Total_nx_domain'] = all_data2['Total_nx_domain'].astype(float)

        #send to xls
        import openpyxl
        all_data2.to_excel('D:/summary_tables/combined_summary_of_' + '.xls', index=False)
        print("file has been saved")

all_data
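One way to get the folder name into the exported filename is to loop over the immediate subfolders of the base directory rather than over the characters of the path string. A minimal sketch, assuming each set of filtered*.xls files lives in its own subfolder of D:/summary_tables/ and that writing .xlsx output is acceptable (the grouping and sorting steps from the code above are omitted here):

import glob
import os
import pandas as pd

base = "D:/summary_tables/"

for name in os.listdir(base):  # each entry in the base directory
    folder_path = os.path.join(base, name)
    if not os.path.isdir(folder_path):
        continue  # skip plain files, only process subfolders
    # read every filtered*.xls file in this subfolder
    frames = [pd.read_excel(f) for f in glob.glob(os.path.join(folder_path, "filtered*.xls"))]
    if not frames:
        continue
    combined = pd.concat(frames, ignore_index=True)
    # the subfolder name becomes part of the exported filename
    out_path = os.path.join(base, "combined_summary_of_" + name + ".xlsx")
    combined.to_excel(out_path, index=False)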
My Excel document my.xlsx has two sheets named Sheet1 and Sheet2. I want to convert all worksheets to CSV format using xlsx2csv. I used the following commands:
from xlsx2csv import *
xlsx2csv my.xlsx convert.csv
File "<stdin>", line 1
xlsx2csv my.xlsx convert.csv
^
SyntaxError: invalid syntax
x2c -a my.xlsx my1.csv
File "<stdin>", line 1
x2c -a my.xlsx my1.csv
^
SyntaxError: invalid syntax
Any help, please.
I have not used xlsx2csv before, but why not try pandas? Your requirement can be solved like this:
import pandas as pd

for sheet in ['Sheet1', 'Sheet2']:
    df = pd.read_excel('my.xlsx', sheet_name=sheet)
    df.to_csv(sheet + '_output.csv', index=False)
You can do something like the following:
import pandas as pd

xls_file = pd.ExcelFile('<path_to_your_excel_file>')
sheet_names = xls_file.sheet_names
for sheet in sheet_names:
    df = xls_file.parse(sheet)
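To actually write each sheet out as a CSV from that loop, one more line is enough (a small addition, not part of the original answer; the output filename pattern is an assumption):

import pandas as pd

xls_file = pd.ExcelFile('<path_to_your_excel_file>')
for sheet in xls_file.sheet_names:
    df = xls_file.parse(sheet)
    df.to_csv(sheet + '.csv', index=False)  # one CSV per sheet, named after the sheet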
Xlsx2csv Python implementation: I could only execute Xlsx2csv with the sheetid parameter, so get_sheet_details is used to obtain the sheet names and ids. csvfrmxlsx then creates a csv file for each sheet in a csv folder under the parent directory.
import pandas as pd
from pathlib import Path

def get_sheet_details(filename):
    import os
    import shutil
    import zipfile
    import xmltodict

    sheets = []
    # Make a temporary directory with the file name
    directory_to_extract_to = filename.with_suffix('')
    os.mkdir(directory_to_extract_to)

    # Extract the xlsx file as it is just a zip file
    zip_ref = zipfile.ZipFile(filename, 'r')
    zip_ref.extractall(directory_to_extract_to)
    zip_ref.close()

    # Open the workbook.xml which is very light and only has meta data, get sheets from it
    path_to_workbook = directory_to_extract_to / 'xl' / 'workbook.xml'
    with open(path_to_workbook, 'r') as f:
        xml = f.read()
        dictionary = xmltodict.parse(xml)
        for sheet in dictionary['workbook']['sheets']['sheet']:
            sheet_details = {
                'id': sheet['@sheetId'],  # can be 'sheetId' for some versions
                'name': sheet['@name']    # can be 'name'
            }
            sheets.append(sheet_details)

    # Delete the extracted files directory
    shutil.rmtree(directory_to_extract_to)
    return sheets

def csvfrmxlsx(xlsxfl, df):  # create csv files in csv folder under the parent directory
    from xlsx2csv import Xlsx2csv

    for index, row in df.iterrows():
        shnum = row['id']
        shnph = xlsxfl.parent / 'csv' / Path(row['name'] + '.csv')  # path for converted csv file
        Xlsx2csv(str(xlsxfl), outputencoding="utf-8").convert(str(shnph), sheetid=int(shnum))
    return

pthfnc = 'c:/xlsx/'
wrkfl = 'my.xlsx'
xls_file = Path(pthfnc + wrkfl)
sheetsdic = get_sheet_details(xls_file)  # sheet names and ids without opening the xlsx file
df = pd.DataFrame.from_dict(sheetsdic)
csvfrmxlsx(xls_file, df)  # df with sheets to be converted
I cannot load multiple Excel files from a directory into a single DataFrame.
I have tried two different ways and neither works.
It gives me the error below.
How can I solve the problem? It does find the files when it creates the list, but then it cannot open them in the DataFrame.
Any hints?
import pandas as pd
import os
import glob
import xlrd

cwd = os.getcwd()
cwd
path = '/Users/giovanni/Desktop/news media'
files = os.listdir(path)
files
files_xls = [f for f in files if f[-3:] == 'lsx']
files_xls
df = pd.DataFrame()
for f in files_xls:
    data = pd.read_excel(f)
    df = df.append(data)
FileNotFoundError: [Errno 2] No such file or directory: 'NOV.xlsx'
Try this:
import os
import glob
import pandas as pd

path = '/Users/giovanni/Desktop/news media'
df = pd.DataFrame()
for file in glob.glob(os.path.join(path, '*.xlsx')):
    data = pd.read_excel(file)
    print(data)
    df = df.append(data)
Replace your final loop with:
for f in files_xls:
    full_path = os.path.join(path, f)
    data = pd.read_excel(full_path)
    df = df.append(data)
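A side note on the answers above: DataFrame.append was removed in pandas 2.0, so on recent versions the same idea is usually written by collecting the frames in a list and concatenating them once (a sketch reusing the same path):

import glob
import os
import pandas as pd

path = '/Users/giovanni/Desktop/news media'

# one DataFrame per workbook, concatenated in a single call instead of repeated append
frames = [pd.read_excel(f) for f in glob.glob(os.path.join(path, '*.xlsx'))]
df = pd.concat(frames, ignore_index=True)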