Iterating through txt files in directory, saving filenames - python

I'm iterating through files in a directory and would like to save the filename and some stuff I extract from the files in the same pandas DataFrame. How can I save the names of the txt files in a list (which I would then insert into the DataFrame as a separate column) while going through all the files in the directory?
Here's part of my code:
columns_df = ['file', 'stuff']
df_stuff = pd.DataFrame(columns=columns_df)
filenamelist = []
stufflist = []
os.chdir(r'path\to\directory')
for file in glob.glob('*.txt'):
    # Extract some stuff from file and append to stufflist (DONE)
    # Save filename in the filenamelist (THE PROBLEM)
df_stuff['stuff'] = stufflist
df_stuff['file'] = filenamelist

Do you need this functionality?
for file in glob.glob('*.txt'):
    filenamelist.append(file)
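Putting it together, a minimal sketch of the whole loop might look like this; the extraction step is only a placeholder here, since it is not shown in the question:
import glob
import os
import pandas as pd

filenamelist = []
stufflist = []

os.chdir(r'path\to\directory')
for file in glob.glob('*.txt'):
    stufflist.append('extracted value')  # placeholder for the real extraction
    filenamelist.append(file)            # save each filename as you go

# build the DataFrame from both lists at once
df_stuff = pd.DataFrame({'file': filenamelist, 'stuff': stufflist})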


I am trying to iterate over xlsx files in a specified path. How do I exclude a specified file?

I have 3 xlsx sheets in a particular directory that I am combining into a workbook. While combining, I need to exclude a specified file.
Path = "C:/JackDaniels/100Pipers/"
name = "Panic"
writer = ExcelWriter(Path + name + "*.xlsx")  # creating a workbook named "name"
inp = glob.glob(Path + "*.xlsx")
inp = inp.remove(Path + name + "*.xlsx")  # to remove this file to avoid overwriting it
# I have code that will combine the sheets
When I tried to run the above code I got the error below:
list.remove(x): x not in list
The question is really not clear, and you should rephrase it.
If you are trying to combine them in the sense of appending all three sheets into a new, empty sheet (for example, when all of your sheets have the same columns), you should create a Python file in the same directory as your Excel worksheets:
import copy
import os
import pandas as pd

cwd = os.getcwd()

# list to store files
xlsx_files = []
exc_file = 'Exclude.xlsx'  # <-- The file name you want to exclude goes here.
out_file = 'Output.xlsx'   # <-- The output file name goes here.

# Iterate over the directory
for file in os.listdir(cwd):
    # check only Excel files
    if file.endswith('.xlsx'):
        xlsx_files.append(file)

print("All xlsx files:", xlsx_files)

df = pd.DataFrame()
aux_files_var = copy.deepcopy(xlsx_files)
for file in aux_files_var:
    print(file)
    if file == exc_file or file == out_file:
        continue  # <-- here you exclude the file and the output
    df = df.append(pd.read_excel(file), ignore_index=True)
    xlsx_files.remove(file)

print(f"""As you can see, only exc_file remains in xlsx_files.
Remaining xlsx files: {xlsx_files}""")
print(df)
df.to_excel(out_file)
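Note that DataFrame.append was removed in pandas 2.0. On a recent pandas, a variant of the same loop that collects the frames in a list and concatenates them once might look like this:
frames = []
for file in xlsx_files:
    if file in (exc_file, out_file):
        continue  # skip the excluded file and the output file
    frames.append(pd.read_excel(file))

df = pd.concat(frames, ignore_index=True)
df.to_excel(out_file)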

Reading Specific Files from folder in Python

I have a folder with 12000 csv sample files and I need to read only certain files of interest from it. I have a list of the filenames that I want to read from that folder. Here's my code so far:
Filenames  # list that contains the names of the files I want to read

# Import data
data_path = "/MyDataPath"
data = []
i = 0

# Import csv files
# I feel I am making a mistake here with looping Filenames[i]
for file in glob.glob(f"{data_path}/{Filenames[i]}.csv", recursive=False):
    df = pd.read_csv(file, header=None)
    # Append dataframe
    data.append(df)
    i = i + 1
This code only reads the first file and ignores all the others.
The problem is that you are not iterating over Filenames.
Try the following:
# Import csv files
for f in Filenames:
    for file in glob.glob(f"{data_path}/{f}.csv", recursive=False):
        df = pd.read_csv(file, header=None)
        # Append dataframe
        data.append(df)
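Since the exact filenames are already known, glob is not strictly needed either; here is a small sketch that builds each path directly and skips missing files, assuming the same data_path and Filenames variables as above:
import os
import pandas as pd

data = []
for name in Filenames:
    path = os.path.join(data_path, f"{name}.csv")
    if not os.path.isfile(path):
        continue  # skip names that have no matching file on disk
    data.append(pd.read_csv(path, header=None))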

Generate Pandas DataFrames from CSV file list

To frame the question: I am searching a directory for all csv files. I am saving the path of each csv file, along with its delineation, into a DataFrame. I now want to iterate over the DataFrame and read each csv file into a dataframe whose name is generated from the original filename. I cannot figure out how to dynamically generate these dataframes. I started coding a few days ago, so apologies if the syntax is poor.
# Looks in a given directory and all subsequent subdirectories for the extension ".csv"
# Reads the path to all csv files and creates a list
PATH = "Z:\Adam"
EXT = "*.csv"
all_csv_files = [file
                 for path, subdir, files in os.walk(PATH)
                 for file in glob(os.path.join(path, EXT))]

# The list of csv file paths is read into a DataFrame
# The DataFrame is then split into columns based on the \\ found in the path
df_csv_path = pd.DataFrame(all_csv_files, columns=['Path'])
df_split_path = df_csv_path['Path'].str.split('\\', n=-1, expand=True)
df_split_path = df_split_path.rename(columns={0: 'Drive', 1: 'Main', 2: 'Project', 3: 'Imaging Folder',
                                              4: 'Experimental Group', 5: 'Experimental Rep', 6: 'File Name'})
df_csv_info = df_split_path.join(df_csv_path['Path'])

# Generates a DataFrame for each of the csv files found in the directory
# The DataFrame has a name based on the csv filename
for index in df_csv_info.index:
    filepath = ""
    filename = df_csv_info['File Name'].values[index]
    filepath = str(df_csv_info['Path'].values[index])
    filename = pd.read_csv(filepath)
The best way is to create a dictionary whose keys are the filenames and whose values are the corresponding DataFrames. Instead of using os.path and glob, the modern approach is to use pathlib from the standard library.
Assuming that you don't actually need the DataFrame containing the filenames and just want a DataFrame for each csv file, you can simply do:
from pathlib import Path
import pandas as pd

PATH = Path(r"Z:\Adam")
EXT = "*.csv"

# dictionary holding all the files' DataFrames, in the format {"filename": file_DataFrame}
files_dfs = {}

# recursive search for csv files in the PATH folder and its subfolders
for csv_file in PATH.rglob(EXT):
    filename = csv_file.name    # get the filename
    df = pd.read_csv(csv_file)  # read the csv file as a DataFrame
    files_dfs[filename] = df    # add the DataFrame to the dictionary
Then, to access the DataFrame of a specific file you can do
filename_df = files_dfs["<filename>"]
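If you prefer, the same dictionary can also be built in a single dict comprehension:
files_dfs = {csv_file.name: pd.read_csv(csv_file) for csv_file in PATH.rglob(EXT)}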

Finding the number of rows for all files within a folder

Hello, I am trying to find the number of rows for all files within a folder. I am trying to do this for a folder that contains only ".txt" files and for a folder that contains ".csv" files.
I know that the way to get the number of rows for a SINGLE ".txt" file is something like this:
file = open("sample.txt", "r")
Counter = 0
Content = file.read()
CoList = Content.split("\n")
for i in CoList:
    if i:
        Counter += 1
print("This is the number of lines in the file")
print(Counter)
Whereas for a SINGLE ".csv" file is something like this:
file = open("sample.csv")
reader = csv.reader(file)
lines = len(list(reader))
print(lines)
But how can I do this for ALL files within a folder? That is, how can I loop each of these procedures across all files within a folder and, ideally, export the output into an excel sheet with columns akin to these:
Filename    Number of Rows
1.txt       900
2.txt       653
and so on and so on.
Thank you so much for your help.
You can use glob to detect the files and then just iterate over them.
Other methods : How do I list all files of a directory?
import glob

# 1. list all the text files in the directory
rel_filepaths = glob.glob("*.txt")

# 2. (optional) create a function to count the number of rows in a file
def count_rows(filepath):
    res = 0
    f = open(filepath, 'r')
    res = len(f.readlines())
    f.close()
    return res

# 3. iterate over your files and use the count_rows function
counts = [count_rows(filepath) for filepath in rel_filepaths]
print(counts)
Then, if you want to export this result to a .csv or .xlsx file, I recommend using pandas.
import pandas as pd

# 1. create a new table and add your two columns filled with the previous values
df = pd.DataFrame()
df["Filename"] = rel_filepaths
df["Number of rows"] = counts

# 2. export this dataframe to .csv
df.to_csv("results.csv")
You can also use pandas.ExcelWriter() if you want to use the .xlsx format. Link to documentation & examples : Pandas - ExcelWriter doc
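For example, a minimal sketch of that .xlsx export, assuming an engine such as openpyxl is installed:
with pd.ExcelWriter("results.xlsx") as writer:
    df.to_excel(writer, sheet_name="row_counts", index=False)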

How to match the .mp4 files present in a folder with the names in a .csv file and sort them according to some column value, in Python?

I have a folder containing about 500 .mp4 files :
abc.mp4
lmn.mp4
ijk.mp4
Also I have a .csv file containing the file names (>500) and some values associated with them:
file name    value
abc.mp4      5
xyz.mp4      3
lmn.mp4      5
rgb.mp4      4
I want to match the file names of .csv and folder and then place the mp4 files in separate folders depending on the value.
**folder 5:**
abc.mp4
lmn.mp4
**folder 3:**
xyz.mp4
and so on
I tried the following (adapted from a linked example):
names = []
names1 = []
for dirname, dirnames, filenames in os.walk('./videos_test'):
    for filename in filenames:
        if filename.endswith('.mp4'):
            names.append(filename)

file = open('names.csv', encoding='utf-8-sig')
lns = csv.reader(file)
for line in lns:
    nam = line[0]
    sc = line[1]
    names1.append(nam)
    if nam in names:
        print(nam, line[1])
        if line[1] == 5:
            print('5')
            print(nam)  # just prints the name of the file, does not save it
        elif line[1] == 3:
            print('3')
            print(nam)
This does not give any result.
I'd recommend using pandas if you're going to handle csv files.
Here's code that will automatically create the folders and put the files in the right place for you using shutil and pandas. I have assumed that your csv's columns are "filename" and "value"; change them if there's a mismatch.
import pandas as pd
import shutil
import os

path_to_csv_file = "file.csv"
df = pd.read_csv(path_to_csv_file)

mp4_root = "mp4_root"
destination_path = "destination_path"

# In order to remove the folder if previously created. You can delete this if you don't like it.
if os.path.isdir(destination_path):
    shutil.rmtree(destination_path)
os.mkdir(destination_path)

# create one subfolder per distinct value
unique_values = pd.unique(df['value'])
for u in unique_values:
    os.mkdir(os.path.join(destination_path, str(u)))

# Here we iterate over the rows of your csv file, and concatenate the value and the filename
# to the destination_path with our new folder structure.
for index, row in df.iterrows():
    cur_path = os.path.join(destination_path, str(row['value']), str(row['filename']))
    source_path = os.path.join(mp4_root, str(row['filename']))
    shutil.copyfile(source_path, cur_path)
EDIT: If there is a file that appears in the csv but is not present in the source folder, you could check for it beforehand (more Pythonic) or handle it via a try/except check (not recommended).
Check the code below.
source_files = os.listdir(mp4_root)
for index, row in df.iterrows():
    if str(row['filename']) not in source_files:
        continue
    cur_path = os.path.join(destination_path, str(row['value']), str(row['filename']))
    source_path = os.path.join(mp4_root, str(row['filename']))
    shutil.copyfile(source_path, cur_path)
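For completeness, the try/except variant mentioned above would simply catch the missing-file error and move on:
for index, row in df.iterrows():
    cur_path = os.path.join(destination_path, str(row['value']), str(row['filename']))
    source_path = os.path.join(mp4_root, str(row['filename']))
    try:
        shutil.copyfile(source_path, cur_path)
    except FileNotFoundError:
        continue  # the file listed in the csv is not present in the source folder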
