Loop through excel files in subfolders - python

I am trying to loop through my files in different folers
the first part of the code is working :
from os import walk
import pandas as pd
path = r'C:\Users\Sarah\Desktop\test2'
my_files = []
for (dirpath, dirnames, filenames) in walk(path):
my_files.extend(filenames)
print(my_files)
the code successfully print all the files with my subfolders
however the problem comes in this part when I try to extract excel columns different files and save them in a directory
all_dicts_list = []
for file_name in my_files:
#Display sheets names using pandas
pd.set_option('display.width',300)
mosul_file = file_name
xl = pd.ExcelFile(mosul_file)
mosul_df = xl.parse(0, header=[1], index_col=[0,1,2])
#Read Excel and Select columns
mosul_file = pd.read_excel(file_name, sheet_name = 0 ,
index_clo=None, na_values= ['NA'], usecols = "C , F ,G")
#Remove NaN values
data_mosul_df = mosul_file.apply (pd.to_numeric, errors='coerce')
data_mosul_df = mosul_file.dropna()
#Save to Dictionary
datamosulx = data_mosul_df.to_dict()
all_dicts_list.append(datamosulx)
all dictionaries will be in all_dicts_list
I get an error FileNotFoundError: [Errno 2] No such file or directory I don't understand the problem or how to fix it.
Thank you

It's hard to tell because you might have lost some of the formatting from copy and pasting but make sure that after the
for file_name in my_files:
anything that you want in the for loop needs to be indented with tabs or spaces to the same level.
print out mosul_file after allocating it to see whether this could be the case and then indent appropriately.

Related

I am trying to iterate a xlsx files in specified path.How to exclude specified file

i have 3 xlsx sheet in particular directory.I am combining into workbook.While combining,i need to exclude specified file.
Path="C:/JackDaniels/100Pipers/"
name="Panic"
writer=ExcelWriter(Path+name+"*.xlsx")#creating a workbook in name "name")
inp=glob.glob(Path+"*.xlsx")
inp=inp.remove(Path+name+"*.xlsx")#to remove ths file to avoid overwrite
# I have a code that will combine sheets
when i tried to run the above code am getting below error
list.remove(x):x not in list
The question is really not clear and you should rephrase it.
If you are trying to combine them in the sense that you want to append all the three sheets to a new empty sheet (for example all of your sheets have same columns) you should make a python file in the same directory as your excel worksheets:
import copy
import os
import pandas as pd
cwd = os.getcwd()
# list to store files
xlsx_files = []
exc_file = 'Exclude.xlsx' # <-- The file name you want to exclude goes here.
out_file = 'Output.xlsx' # <-- The output file name goes here.
# Iterate directory
for file in os.listdir(cwd):
# check only Excel files
if file.endswith('.xlsx'):
xlsx_files.append(file)
print("All xlsx files:", xlsx_files)
df = pd.DataFrame()
aux_files_var = copy.deepcopy(xlsx_files)
for file in aux_files_var:
print(file)
if file == exc_file or file == out_file: continue # <-- here you exclude the file and the output
df = df.append(pd.read_excel(file), ignore_index=True)
xlsx_files.remove(file)
print(f"""As you can see, only exc_file remains in xlsx_files.
Remaining xlsx files:{xlsx_files}""")
print(df)
df.to_excel(out_file)

Reading Specific Files from folder in Python

I have folder with 12000 csv sample files and I need to read only certain files of interest from it. I have a list with filenames that I want to read from that folder. Here's my code so far
Filenames # list that contains name of filenames that I want to read
# Import data
data_path= "/MyDataPath"
data = []
i=0
# Import csv files
#I feel I am doing a mistake here with looping Filenames[i]
for file in glob.glob(f"{data_path}/{Filenames[i]}.csv", recursive=False):
df = pd.read_csv(file,header=None)
# Append dataframe
data.append(df)
i=i+1
This code only reads first file and ignores all other.
The problem is you are not iterating over the Filenames.
Try the following:
# i=0
# Import csv files
#I feel I am doing a mistake here with looping Filenames[i]
for f in Filenames:
file = glob.glob(f"{data_path}/{f}.csv", recursive=False)
df = pd.read_csv(file,header=None)
# Append dataframe
data.append(df)
# i=i+1

How can I read any excel file with pandas which is in folder?

I am having a folder d:/data/input where I have an excel file stored. I want to read the excel file into a dataframe with pandas WITHOUT declaring the excel filename. Is that possible?
Thanks for your help.
If it's the only Excel file in the folder, you could do something like:
from pathlib import Path
fn = list(Path("D:/data/input").glob("*.xlsx"))[0]
df = pd.read_excel(fn)
If there's more than one file there, it'll end up just picking one of them arbitrarily, so probably not ideal.
Whatever you have in this folder path, you can do what you request for only one Excels files or as many Excels files you want by this way bellow, not the most elegant, for sure, but it work and it is flexible:
import pandas as pd
from os import listdir
from os.path import isfile, join
sourcePath = r"d:/data/input"
extensionFile = r".xlsx"
# Get all files in the path
fileNameList = [f for f in listdir(sourcePath) if isfile(join(sourcePath, f))]
# Removing all non excel (.xlsx) files from the list
fileNameList = [x for x in fileNameList if extensionFile in x]
# For accessing only one "first" file
print(fileNameList[0], "is loaded!") # Print the name as reference if you want to
df = pd.read_excel(sourcePath + r"/" +fileNameList[0])
# For loading all files into a list or dict
dataframeList = []
for fn in fileNameList:
dataframeList.append([pd.read_excel(sourcePath + r"/" + fn)])
# To access each data frame
fileNameList[0] # Print the name as reference if you want
dataframeList[0]
# call the seconde
fileNameList[1] # Print the name as reference if you want
dataframeList[1]
# call the third
fileNameList[2] # Print the name as reference if you want
dataframeList[2]
# call the ...
fileNameList[...]
dataframeList[...]
If there are multiple excel files we could get the file with the latest create time :
from pathlib import Path
def get_latest_excel(src):
""" src : source path to target files"""
dfs = {f.stat().st_ctime : f for f in Path(src).glob('*.xlsx')}
return dfs[max(dfs, key=dfs.get)]
file = get_latest_excel(r"d:/data/input")
print(file)
WindowsPath('d:/data/input/new_excel_file.xlsx')
df = pd.read_excel(file)

How to match the .mp4 files present in a folder with the names in .csv file and sort according to some column value, in python?

I have a folder containing about 500 .mp4 files :
abc.mp4
lmn.mp4
ijk.mp4
Also I have a .csv file containing the file names (>500) and some values associated with them:
file name value
abc.mp4 5
xyz.mp4 3
lmn.mp4 5
rgb.mp4 4
I want to match the file names of .csv and folder and then place the mp4 files in separate folders depending on the value.
**folder 5:**
abc.mp4
lmn.mp4
**folder 3:**
xyz.mp4
and so on
I tried link
names=[]
names1=[]
for dirname, dirnames, filenames in os.walk('./videos_test'):
for filename in filenames:
if filename.endswith('.mp4'):
names.append(filename)
file = open('names.csv',encoding='utf-8-sig')
lns = csv.reader(file)
for line in lns:
nam = line [0]
sc=line[1]
names1.append(nam)
if nam in names:
print (nam, line[1])
if line[1]==5
print ('5')
print(nam) %just prints the name of file not save
else if line[1]==3
print ('3')
print(nam)
does not give any result.
I'd recommend you to use pandas if you're going to handle csv files.
Here's a code that will automatically create the folders, and put the files in the right place for you using shutil and pandas. I have assumed that your csv's columns are "filename" and "value". Change them if there's a mismatch.
import pandas as pd
import shutil
import os
path_to_csv_file = "file.csv"
df = pd.read_csv(path_to_csv_file)
mp4_root = "mp4_root"
destination_path = "destination_path"
#In order to remove the folder if previously created. You can delete this if you don't like it.
if os.path.isdir(destination_path):
shutil.rmtree(destination_path)
os.mkdir(destination_path)
unique_values = pd.unique(df['value'])
for u in unique_values:
os.mkdir(os.path.join(destination_path, str(u)))
#Here we iterate over the rows of your csv file, and concatenate the value and the filename to the destination_path with our new folder structure.
for index, row in df.iterrows():
cur_path = os.path.join(destination_path, str(row['value']), str(row['filename']))
source_path = os.path.join(mp4_root, str(row['filename']))
shutil.copyfile(source_path, cur_path)
EDIT: If there's a file that is in the csv but not present in the source folder, you could check it before (more pythonic) or you could handle it via a try/catch exception check.(Not recommended)
Check the code below.
source_files = os.listdir(mp4_root)
for index, row in df.iterrows():
if str(row['filename']) not in source_files:
continue
cur_path = os.path.join(destination_path, str(row['value']), str(row['filename']))
source_path = os.path.join(mp4_root, str(row['filename']))
shutil.copyfile(source_path, cur_path)

How to fix messed up Copying of CSV

I'm trying to copy a bunch of csv files into 1 big csv.
all 3 files have the same column headers, but I changed them to be according to the files name. For example, file arousal_a_103_happy.csv will now be in the new csv in the column header for it's columns.
My issue, is that
1st: it copies the file in a very strange order, it does not flip it, it's just every column is wherever it wants to be.
2nd: it doesnt copy the files next to eachother, but more like a slope. so if the first file finishes at P23, the new file will start at Q24.
This is the code:
def concatenate(path = "C:\\Users\\User\Desktop\\Work\\subject", outfile = "C:\\Users\\User\\Desktop\\Work\\subject\\concatenated.csv"):
os.chdir(path)
fileList=glob.glob("*happy.csv")
dfList=[]
print(fileList)
i=1
string = "subject"
for files in fileList:
df = pd.read_csv("C:\\Users\\Desktop\\Work\\subject\\" + files, encoding ='CP1255') #Get's error because of the link!
sub = files
i+=1
ColNames =[sub + " Level", sub +" Description", sub+" Number",sub+" Onset_Date",sub+" Onset_Time",sub+" Offset_Date",sub+" Offset_Time",sub+" Duration_Date",sub+" Duration_Time",sub+" Arousal",sub+" Gaze",sub+" Movement" , sub+" Vocalization", sub+" eyes covered", sub+" Mother\'s arrousal", sub+" Transcript"]
df.columns=ColNames
dfList.append(df)
concatDf = pd.concat(dfList, axis=0, ignore_index=True, verify_integrity=True)
concatDf.to_csv(outfile, index=None)
# Fetching files
import csv
FileNames = []
path="C:\\Users\\User\Desktop\\Work\\subject\\"
os.chdir(path)
for counter, files in enumerate(glob.glob("*.csv")):
FileNames.append(files)
print (FileNames)
# Merging all .csv from your folder 'subject'
pathout="C:\\Users\\User\Desktop\\Work\\subject\\"
for filenames in SortedFileNames:
df = pd.read_csv(filenames,encoding='utf-8')
saved_column = df.tweet
saved_column.to_csv(pathout+"mixed.csv", mode='a')
print("File Created Sucessfully mixed.csv")

Categories