Loop over files in different folders - python

How can I loop over 2 folders? In Apple and all its subfolders, I want to look for Excel files that contain "green". In Banana, I want to look for files that contain "yellow". I explicitly need to specify the folder paths and can't just loop over the whole C drive.
import os
import pandas as pd
folders = ['C:/Desktop/apple', 'C:/Downloads/banana']
for i in range(len(folders)):
    for root, dirs, files in os.walk(folders[i]):
        for file in files:
            if file.endswith(".xlsx") and "banana" in folders[i] and "yellow" in file:
                df = pd.read_excel(os.path.join(root, file))
                df['date'] = pd.to_datetime(df.date)
                ...
            if file.endswith(".xlsx") and "apple" in folders[i] and "green" in file:
                df = pd.read_excel(os.path.join(root, file))
                df['date'] = pd.to_datetime(df.date)
                ...
Since all the Excel files have the same structure, my code above is cumbersome: I'm duplicating the code that reads and cleans the dataframe.

The easiest way to get all the file paths that match your condition is to use the glob module:
import glob
import pandas as pd
for file in glob.glob('C:/Desktop/apple/*green*.xlsx') + glob.glob('C:/Downloads/banana/*yellow*.xlsx'):
    print(file)
    df = pd.read_excel(file)  # glob already returns the full path
    df['date'] = pd.to_datetime(df.date)
Glob uses shell-style wildcard matching (fnmatch patterns), not regular expressions. If you want to match only files that start with green, remove the first asterisk, like so: green*.
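Note that a single * does not descend into subfolders; since the question wants apple and all of its subfolders searched, you need the recursive form. A minimal sketch with the same path (** requires recursive=True, Python 3.5+):
import glob
for file in glob.glob('C:/Desktop/apple/**/*green*.xlsx', recursive=True):
    print(file)  # matches in apple and in any of its subfolders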
To do this using pathlib:
from pathlib import Path
import pandas as pd
folders = [Path('C:/Desktop/apple'), Path('C:/Downloads/banana')]
for file in list(folders[0].glob('*green*.xlsx')) + list(folders[1].glob('*yellow*.xlsx')):
    df = pd.read_excel(file)
    df['date'] = pd.to_datetime(df.date)

You can create a dictionary where the keys are folders and the values are what to search for. Pseudocode:
import os
import pandas as pd
to_search = {  # <--- the dictionary
    "C:/Desktop/apple": "green",
    "C:/Downloads/banana": "yellow",
}
for folder, item in to_search.items():  # <--- use dict.items()
    for root, dirs, files in os.walk(folder):  # <--- here you use "folder"
        for file in files:
            if file.endswith(".xlsx") and item in file:  # <--- here you use "item"
                df = pd.read_excel(os.path.join(root, file))
                df["date"] = pd.to_datetime(df.date)
                # ...
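If the goal is to stop duplicating the read/clean block, you can also pull it into a helper on top of this dictionary. A minimal sketch; read_and_clean is a hypothetical name and the date column is taken from the question:
import os
import pandas as pd
def read_and_clean(path):
    # Shared read/clean logic, written once (hypothetical helper)
    df = pd.read_excel(path)
    df["date"] = pd.to_datetime(df["date"])
    return df
dfs = [
    read_and_clean(os.path.join(root, file))
    for folder, item in to_search.items()
    for root, dirs, files in os.walk(folder)
    for file in files
    if file.endswith(".xlsx") and item in file
]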

Related

Merge CSV files in different folders using Python

I have about 5600 directories, each containing the same set of CSV files (A.csv, B.csv, and so on).
I need to merge all A files into one file, all B files into another file, and so on.
How can I do this?
IIUC, this should work for your case (I tested with a RootDir containing 2 subdirectories, Dir1 and Dir2, each with 2 files A.csv and B.csv). Change the value of rootdir to match your use case:
import os
import pandas as pd
rootdir = 'RootDir'  # Change when needed to your root directory
files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(rootdir)
         for f in filenames if os.path.splitext(f)[1] == '.csv']
# Key on the filename without its extension ('A', 'B', ...);
# note that rstrip('.csv') would strip characters, not the suffix
names = set(os.path.splitext(os.path.basename(x))[0] for x in files)
df_dict = {key: pd.DataFrame() for key in names}
for file in files:
    key = os.path.splitext(os.path.basename(file))[0]
    df = pd.read_csv(file)
    df_dict[key] = pd.concat([df_dict[key], df])
Output is a dictionary of dataframes df_dict with A and B as keys.
Use df_dict['A'] to access DataFrame A and so on...
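To actually produce the merged files the question asks for, one short follow-up sketch (the merged_*.csv output names are an assumption):
# Write each merged dataframe back out, e.g. key 'A' -> merged_A.csv
for key, merged in df_dict.items():
    merged.to_csv(f"merged_{key}.csv", index=False)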

Read multiple csv files from multiple folders in Python

I have a folder that includes subfolders, and these subfolders contain many CSV files. I want to import and concatenate all of them in Python.
Let's say main folder: /main
subfolders: /main/main_1
csv: /main/main_1/first.csv
path = '/main'
df_list = []
for file in os.listdir(path):
    df = pd.read_csv(file)
    df_list.append(df)
final_df = df.append(df for df in df_list)
What about this:
import pandas as pd
from pathlib import Path
directory = "path/to/root_dir"
# Read each CSV file in dir "path/to/root_dir"
dfs = []
for file in Path(directory).glob("**/*.csv"):
    dfs.append(pd.read_csv(file))
# Put the dataframes into a single dataframe
df = pd.concat(dfs)
Change path/to/root_dir to wherever your CSV files are.
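If you prefer it more compact, the same thing can be written in one step (same assumed root directory; pd.concat accepts any iterable of dataframes):
import pandas as pd
from pathlib import Path
df = pd.concat(pd.read_csv(f) for f in Path("path/to/root_dir").glob("**/*.csv"))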
I found a way to concatenate all of them, but it doesn't satisfy me as it takes too much time:
import glob
import os
import pandas as pd
path = "/main"
folders = []
for root, dirs, files in os.walk(path):
    folders.append(root)
del folders[0]  # drop the root directory itself
final = []
for folder in folders:
    df = pd.concat(map(pd.read_csv, glob.glob(os.path.join(folder, "*.csv"))))
    final.append(df)
Remember to join main back onto the filename when reading:
df = pd.read_csv(path + "/" + file)
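Putting both fixes together, recursing with os.walk and joining the path, a minimal corrected version of the original attempt might look like this:
import os
import pandas as pd
path = '/main'
df_list = []
# os.walk recurses into subfolders such as /main/main_1
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.csv'):
            df_list.append(pd.read_csv(os.path.join(root, file)))
final_df = pd.concat(df_list, ignore_index=True)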

Search (in folders and subfolders ) and read files to a list of dataframes, using Python

I have this code:
df1 = pd.read_excel('DIRECTORY\\file.xlsm', sheet_name='Resume', header=1, usecols='A:I')
# some operations
bf1 = pd.read_excel('DIRECTORY\\file.xlsm', sheet_name='Resume', header=1, usecols='K:P')
# some operations
Final_file = pd.concat([df1, bf1], ignore_index=True)
Note that df1 and bf1 read the same file; the difference is the columns being read.
I have a lot of files.
Is it possible to go through folders and subfolders, search for a filename pattern, and create a list of dataframes to read, instead of writing out each path I have?
You can use a recursive search with pathlib's rglob.
Note that parent_path should be the top-level folder you want to search.
from pathlib import Path
files = list(Path(parent_path).rglob('*filename*.xls*'))  # .xls* also catches .xlsx/.xlsm
This returns a list of files that match your condition. You can then concat with a list comprehension:
dfs = [pd.read_excel(file, sheet_name='Resume', header=1, usecols='A:I') for file in files]
df1 = pd.concat(dfs)
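Since the question reads two column ranges from each file, a sketch that extends the same pattern to both ranges:
# Mirror the question's df1/bf1 split: read both column ranges per file
dfs = []
for file in files:
    dfs.append(pd.read_excel(file, sheet_name='Resume', header=1, usecols='A:I'))
    dfs.append(pd.read_excel(file, sheet_name='Resume', header=1, usecols='K:P'))
Final_file = pd.concat(dfs, ignore_index=True)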
Edit: Latest File by Modified Time.
We can use the following function to take in a path and return the latest modified file for each unique base name. We get the base name by splitting on a delimiter, so sales_v1, sales_v2, and sales_v3 all become sales; we then keep the most recently modified of the three.
import pandas as pd
from pathlib import Path
def get_latest_files(path):
    files = {
        f: pd.Timestamp(f.stat().st_mtime, unit="s") for f in Path(path).rglob("*.csv")
    }
    df = (
        pd.DataFrame.from_dict(files, orient="index")
        .reset_index()
        .rename(columns={"index": "path", 0: "seconds"})
    )
    df["dupe_files"] = df["path"].apply(lambda x: x.stem).str.split("_", expand=True)[0]
    max_files = (
        df.groupby(["dupe_files", "path"])
        .max()
        .groupby(level=0)["seconds"]
        .nlargest(1)
        .to_frame()
        .reset_index(-1)["path"]
        .tolist()
    )
    return max_files
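Usage might look like this (the root path is an assumption):
# Read only the most recently modified version of each file
latest = get_latest_files("path/to/root_dir")
dfs = [pd.read_csv(f) for f in latest]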
Here is a code snippet that might help your cause:
import os
import pandas as pd
source = r'C:\Mypath\SubFolder'
for root, dirs, files in os.walk(source):
    for name in files:
        if name.endswith((".xls", ".xlsx", ".xlsm")):
            filetoprocess = os.path.join(root, name)
            df = pd.read_excel(filetoprocess, sheet_name='Resume', header=1, usecols='A:I')
Hope that helps.
You can use the glob library to do this:
from glob import glob
import pandas as pd
filenames = glob('./Folder/pattern*.xlsx')  # pattern is the common part of the filenames
dataframes = [pd.read_excel(f) for f in filenames]  # read each file into its own dataframe
master_df = pd.concat(dataframes)  # master dataframe after concatenating all the dataframes

Alternate Between Relative and Absolute Path in Same Loop

I am trying to:
1. Loop through a directory of CSV files
2. Append the file name as a new column to each file
3. Concatenate every file into a single master file
But I get stuck at step #3, when converting the absolute path back into a relative path, because my output looks like ../../../../Desktop/2018.12.31.csv when I just want it to be 2018.12.31.
For example, say the directory contains two files: 2018.12.31.csv and 2018.11.30.csv.
2018.12.31.csv
A B
1 2
2018.11.30.csv
A B
3 4
After running my program:
import os
import pandas as pd
folder = '/Users/user/Desktop/copy'
files = os.listdir(folder)
file_list = list()
for file in files:
    file = os.path.join(folder, file)
    if file.endswith('.csv'):
        df = pd.read_csv(file, sep=";")
        df['filename'] = os.path.relpath(file)
        file_list.append(df)
all_days = pd.concat(file_list, axis=0, ignore_index=True, sort=False)
all_days.to_csv("/Users/user/Desktop/copy/all.csv")
I want the output to be:
A B filename
1 2 2018.12.31
3 4 2018.11.30
But instead it's:
A B filename
1 2 ../../../../Desktop/copy/2018.12.31.csv
3 4 ../../../../Desktop/copy/2018.11.30.csv
os.path.relpath returns the file location relative to your current working directory. You can get the original filename using os.path.basename(path), or just keep the filename in a separate variable and set df['filename'] = file_orig.
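For instance, a minimal rework of the loop above that captures the name before joining the path (name[:-4] drops the .csv extension):
for name in files:
    path = os.path.join(folder, name)
    if name.endswith('.csv'):
        df = pd.read_csv(path, sep=";")
        df['filename'] = name[:-4]  # original filename, extension stripped
        file_list.append(df)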
If you already have the full filepath to the .csv files, you can use the os.path module to get just the filename:
df['filename'] = os.path.splitext(os.path.split(file)[1])[0]
os.path.splitext() splits the path string into a tuple with the extension as the second element.
os.path.split() splits the path string into a tuple with the filename (including extension) as the second element.
If you are only ever using .csv files you could simplify to:
df['filename'] = os.path.split(file)[1][:-4]
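If you prefer pathlib, Path.stem gives the same result in one call:
from pathlib import Path
df['filename'] = Path(file).stem  # '2018.12.31.csv' -> '2018.12.31'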

How to merge 2000 CSV files saved in different subfolders within the same main folder

Hey people, I would like to merge 2000 CSV files, one from each of 2000 sub-folders, into a single file. Each sub-folder contains three CSV files with different names, so I need to select only one CSV from each folder.
I know how to merge a bunch of CSV files if they are all in the same folder:
import pandas as pd
import glob
path = r'Total_csvs'
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)
frame = pd.concat(li, axis=0, ignore_index=True)
frame.to_csv('Total.csv', index=False)
But my problem with the 2000 CSV files looks totally different.
The folder structure is: a main folder containing 2000 subfolders; each subfolder holds multiple CSV files, of which I need to select only one, and finally I concatenate all 2000 selected files.
As for naming conventions: all the subfolders have different names, but within each subfolder the CSV I need has the same name as the subfolder itself.
Any suggestions or sample code (how to read 2000 CSVs from sub-folders) would be helpful.
Thanks in advance
You can loop through all the subfolders using os.listdir.
Since the CSV filename is the same as the subfolder name, simply use the subfolder name to construct the full path name.
import os
import pandas as pd
folders = os.listdir("Total_csvs")
li = []
for folder in folders:
    # Since they are the same name
    selected_csv = folder
    # Join the root back on: os.listdir returns bare names
    filename = os.path.join("Total_csvs", folder, selected_csv + ".csv")
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)
frame = pd.concat(li, axis=0, ignore_index=True)
frame.to_csv('Total.csv', index=False)
We can iterate over every subfolder, determine expected_csv_path, and check whether it exists. If it does, we read the CSV and append the dataframe to our list.
Try the following:
import pandas as pd
import os
path = r'Total_csvs'
li = []
for f in os.listdir(path):
    expected_csv_path = os.path.join(path, f, f + '.csv')
    csv_exists = os.path.isfile(expected_csv_path)
    if csv_exists:
        df = pd.read_csv(expected_csv_path, index_col=None, header=0)
        li.append(df)
frame = pd.concat(li, axis=0, ignore_index=True, sort=False)
frame.to_csv('Total.csv', index=False)
If you are using Python 3.5 or newer, you can use glob.glob in a recursive manner as follows:
import glob
path = r'Total_csvs'
all_csv = glob.glob(path + "/**/*.csv", recursive=True)
Now all_csv is a list of relative paths to every *.csv inside Total_csvs, its subdirectories, their subdirectories, and so on.
For example's sake, let's assume that all_csv is now:
all_csv = ['Total_csvs/abc/abc.csv', 'Total_csvs/abc/another.csv']
So we need to keep only the files whose names correspond to the directory they reside in, which can be done the following way:
import os
def check(x):
    # Split on the OS path separator; take the parent dir and the filename
    directory, filename = x.split(os.path.sep)[-2:]
    return directory + '.csv' == filename
all_csv = [i for i in all_csv if check(i)]
print(all_csv)  # prints ['Total_csvs/abc/abc.csv']
Now all_csv is a list of paths to exactly the .csv files you are seeking, and you can use it the same way you used all_files in the "flat" (non-recursive) case.
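For completeness, the final concatenation step then stays the same as in your flat-folder code:
import pandas as pd
frame = pd.concat(map(pd.read_csv, all_csv), axis=0, ignore_index=True)
frame.to_csv('Total.csv', index=False)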
You can do it without joining paths:
import pathlib
import pandas as pd
lastparent = None
for ff in pathlib.Path("Total_csvs").rglob("*.csv"):  # recursive glob
    print(ff)
    if ff.parent != lastparent:  # process only the 1st file in each dir
        lastparent = ff.parent
        df = pd.read_csv(str(ff), ...)  # fill in your read options
        # ...etc.
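If you specifically want the CSV whose name matches its subfolder, rather than whichever file happens to come first, pathlib makes that comparison direct. A sketch under that assumption:
import pathlib
import pandas as pd
# Keep only the CSV named after its parent folder, then concatenate
dfs = [
    pd.read_csv(ff)
    for ff in pathlib.Path("Total_csvs").rglob("*.csv")
    if ff.stem == ff.parent.name
]
frame = pd.concat(dfs, ignore_index=True)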
