Can anybody help me with some command prompt details/ script detail/python programming on how to get file details?
Scenario:
Folder contains many subfolders -- > how to get to know what file formats are present in the folders and how to get path of all those files.
Like, I need, distinct file names/formats/path of the files present under a folder/subfolders
Is there anyway possible to get that or manual effort will only be required?
To recursively list all files in folders and sub-folders in Python:
Glob [docs]
from glob import glob
glob("**", recursive=True)
OS Walk [docs]
import os
list(os.walk("./"))
import os, csv
import glob
import pandas as pd
import ast
dir_path = r'<path of directory>'
extension_output_path = r"<path of output file. Path where u want to save output in csv format>"
output_filenames_path = r"<path of output file. Path where u want to save output in csv format>"
exts = set(f.split('.')[-1] for dir,dirs,files in os.walk(dir_path) for f in files if '.' in f)
exts = list(set(exts))
subdirs = [x[0] for x in os.walk(dir_path)]
print(exts)
big_list = []
bigg_list = []
def FindMaxLength(lst):
maxLength = max(map(len, lst))
return maxLength
for dirs in subdirs:
split_dirs = dirs.split('\\')
big_list.append(split_dirs)
big_list_count = FindMaxLength(big_list)
for subdis in big_list:
count_val = big_list_count - len(subdis)
bigg_list.append(subdis + ['']* count_val + ['/'.join(subdis)])
output_list = []
path_list = []
for subbs in bigg_list:
big_dict = {}
for ext in exts:
tifCounter = len(glob.glob1(subbs[-1],"*."+ext))
filenames = glob.glob1(subbs[-1],"*."+ext)
if filenames != []:
val = list(map((subbs[-1]+'/').__add__,filenames))
if len(val) >1:
for li in val:
path_list.append([ext, li])
else:
path_list.append([ext]+val)
if tifCounter != 0:
big_dict[ext] = tifCounter
output_list.append(subbs+ [big_dict])
columns_row = ['col']* (big_list_count + 1)+ ['val'] + exts
with open(extension_output_path,'w', newline='') as csv_file:
csv_wr = csv.writer(csv_file)
csv_wr.writerow(columns_row)
csv_wr.writerows(output_list)
cv = pd.read_csv(extension_output_path)
for index, row in cv.iterrows():
for ext in exts:
if row['val'] != '{}' and ext in ast.literal_eval(row['val']):
cv.loc[index,ext] = ast.literal_eval(row['val'])[ext]
del cv['val']
cv.to_csv(extension_output_path, index=False)
with open(output_filenames_path,'w', newline='') as csv_file:
csv_wr = csv.writer(csv_file)
csv_wr.writerow(['extension', 'filename'])
csv_wr.writerows(path_list)
print("completed")
This output file will contain folder/subfolder path with extension's count.
Related
I have a directory of files that follows this file naming pattern:
alice_01.mov
alice_01.mp4
alice_02.mp4
bob_01.avi
My goal is to find all files at a given path and create a "multidimensional" list of them where each sublist is the unique name of the file (without extension) and then a list of extensions, like so:
resulting_list = [
['alice_01', ['mov','mp4']],
['alice_02', ['mp4']],
['bob_01', ['avi']]
]
I have gotten this far:
import os
path = "user_files/"
def user_files(path):
files = []
for file in os.listdir(path):
files.append(file)
return files
file_array = []
for file in user_files(path):
file_name = file.split(".")[0]
file_ext = file.split(".")[1]
if file_name not in (sublist[0] for sublist in file_array):
file_array.append([file_name,[file_ext]])
else:
file_array[file_array.index(file_name)].append([file_name,[file_ext]])
print(file_array)
My problem is in the else condition but I'm struggling to get it right.
Any help is appreciated.
Here's how you can do it using a dict to store the results:
filenames = [
"alice_01.mov",
"alice_01.mp4",
"alice_02.mp4",
"bob_01.avi",
]
file_dict = {}
for file in filenames:
file_name, file_ext = file.split(".")[0:2]
file_dict.setdefault(file_name, []).append(file_ext)
print(file_dict)
Result:
{'alice_01': ['mov', 'mp4'], 'alice_02': ['mp4'], 'bob_01': ['avi']}
UPDATE: The code above doesn't handle special cases, so here's a slightly more robust version.
from pprint import pprint
filenames = [
"alice_01.mov",
"alice_01.mp4",
"alice_02.mp4",
"bob_01.avi",
"john_007.json.xz",
"john_007.json.txt.xz",
"john_007.json.txt.zip",
"tom_and_jerry",
"tom_and_jerry.dat",
]
file_dict = {}
for file in filenames:
parts = file.split(".")
if len(parts) > 1:
file_name = ".".join(parts[0:-1])
file_ext = parts[-1]
else:
file_name = parts[0]
file_ext = ""
file_dict.setdefault(file_name, []).append(file_ext)
pprint(file_dict)
Result:
{'alice_01': ['mov', 'mp4'],
'alice_02': ['mp4'],
'bob_01': ['avi'],
'john_007.json': ['xz'],
'john_007.json.txt': ['xz', 'zip'],
'tom_and_jerry': ['', 'dat']}
I am trying to loop through my subdirectories to read in my zip files. I am getting error TypeError: 'WindowsPath' object is not iterable
What i am trying:
path = Path("O:/Stack/Over/Flow/")
for p in path.rglob("*"):
print(p.name)
zip_files = (str(x) for x in Path(p.name).glob("*.zip"))
df = process_files(p) #function
What does work - when I go to the folder directly with my path:
path = r'O:/Stack/Over/Flow/2022 - 10/'
zip_files = (str(x) for x in Path(path).glob("*.zip"))
df = process_files(zip_files)
any help would be appreciated.
Directory structure is like:
//Stack/Over/Flow/2022 - 10/Original.zip
//Stack/Over/Flow/2022 - 09/Next file.zip
function i call:
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile
import os
import pandas as pd
def process_files(files: list) -> pd.DataFrame:
file_mapping = {}
for file in files:
#data_mapping = pd.read_excel(BytesIO(ZipFile(file).read(Path(file).stem)), sheet_name=None)
archive = ZipFile(file)
# find file names in the archive which end in `.xls`, `.xlsx`, `.xlsb`, ...
files_in_archive = archive.namelist()
excel_files_in_archive = [
f for f in files_in_archive if Path(f).suffix[:4] == ".xls"
]
# ensure we only have one file (otherwise, loop or choose one somehow)
assert len(excel_files_in_archive) == 1
# read in data
data_mapping = pd.read_excel(
BytesIO(archive.read(excel_files_in_archive[0])),
sheet_name=None,
)
row_counts = []
for sheet in list(data_mapping.keys()):
row_counts.append(len(data_mapping.get(sheet)))
file_mapping.update({file: sum(row_counts)})
frame = pd.DataFrame([file_mapping]).transpose().reset_index()
frame.columns = ["file_name", "row_counts"]
return frame
New : what I am trying
for root, dirs, files in os.walk(dir_path):
for file in files:
print(files)
if file.endswith('.zip'):
df = process_files(os.path.join(root, file))
print(df) #function
else:
print("nyeh")
This is returning files like Original - All fields - 11012021 - 11302021.zip but then i get an error OSError: [Errno 22] Invalid argument: '\\'
A possible solution using os.walk():
zip_files = []
for root, dirs, files in os.walk(main_path):
for file in files:
if file.endswith('.zip'):
zip_files.append(os.path.join(root, file))
df = process_files(zip_files) #function
I have managed to remove numbers but am still not getting total count against the file name. Here is the code, I am trying to create a csv file with file name and total word count of text files.
import csv
from collections import Counter
import glob
import os
import re
folderpath = r'C:/Users/haris/Downloads/PDF/txt/'
target_file = r'C:/Users/haris/Downloads/PDF/txt/total.csv'
filepaths = [os.path.join(folderpath, name) for name in os.listdir(folderpath)]
all_files = []
for path in filepaths:
with open(path, 'r') as f:
counter = Counter()
words = re.findall(r'[a-zA-Z]+', f.read().lower())
counter = counter + Counter(words)
all_files.append(words)
total = (len(words))
print(total)
mycsvfile = open(target_file, 'w')
for path, subdirs, files in os.walk('C:/Users/haris/Downloads/PDF/txt'):
for filename in files:
print(files)
import pandas as pd
dict = {'filename': files, 'total': words}
df = pd.DataFrame(dict)
df.to_csv(target_file) '
Your requirement is a little unclear, for example, you match A-Z but read file then convert to all lower case. You seem to only read files in one folder, but you use os.walk to print files recursively.
The following is tested and can write file's word count in one folder.
import os
import re
folderpath = r'C:/Users/haris/Downloads/PDF/txt/'
target_file = r'C:/Users/haris/Downloads/PDF/txt/total.csv'
all_files = [os.path.join(folderpath, name) for name in os.listdir(folderpath)]
words_counts = []
for path in all_files:
with open(path, 'r', encoding='utf8') as f:
words = re.findall(r'[a-zA-Z]+', f.read())
words_counts.append(len(words))
# print(all_files)
# print(words_counts)
import pandas as pd
dict = {'filename': all_files, 'total': words_counts}
df = pd.DataFrame(dict)
df.to_csv(target_file)
I have some csv files that i have filtered with this code and it works:
with open('path' , 'r')as f:
for lines in f:
if '2020-12-31' in lines:
line_data = lines.split(';')
filtered_list.append(line_data)
newfile.write(lines)
Firstly i would like do this but for ALL csv file in my folder.
Secondly i would like to do this in prompt command line if possible( with sys?).
i tried:
import os
from os import walk
from pathlib import Path
dir = r'myPathFolder1'
target = r'myPathFolder2'
filtered_list=[]
for filenames in os.listdir(dir):
for f in filenames:
if f.endswith(".csv"):
newfile = open(dir + f, 'w')
with open(f , 'r') as t:
for lines in t:
if '2020-12-31' in lines:
line_data = lines.split(';')
filtered_list.append(line_data)
newfile.write(lines)
But it doesnt work.
The full code would be, I tried my code, it will copy to another folder.
import os,fnmatch
dir = "C:\\Users\\Frederic\\Desktop\\"
def find(pattern, path):
result = []
for root, dirs, files in os.walk(path):
for name in files:
if fnmatch.fnmatch(name, pattern):
result.append(os.path.join(root, name))
return result
filtered_list = find('*.csv', dir)
print(filtered_list)
for filenames in filtered_list:
print(filenames)
for f in filtered_list:
if f.endswith(".csv"):
print(f.endswith(".csv"))
base_dir_pair = os.path.split(f)
address = "C:\\Users\\Frederic\\Desktop\\aa\\"
address = address + base_dir_pair[1]
print(address)
newfile = open(address, 'w')
with open(f, 'r') as t:
print("in1")
for lines in t:
print("in2")
if '2020-12-31' in lines:
print("in3")
line_data = lines.split(';')
filtered_list.append(line_data)
newfile.write(lines)
So i wrote this to help me sort files in a bunch of different folders, it works by taking the first file from each folder and creating a folder for it then the 2nd file from each folder and does the same and so on, but when ever I run the code nothing happens can someone help.
import os, sys
path = "\Users\mac\Desktop\soliddd sort\dir"
fdir = os.listdir(path)
f = len(fdir[0])
array = [[] for i in xrange(f)]
def setArray(c, i, j):
array[j][i] = c[j]
def chooseFile(j):
for i in fdir:
setArray(fdir[i], i, j)
def makedir(f, fdir):
for i in f:
folder = r"\Users\mac\Desktop\soliddd sort\dir"+str(i)
if not os.path.exists(folder):
os.makedirs(folder)
for j in fdir:
with open(os.path.join(folder, array[i][j], 'wb')) as temp:
temp.write(buff)
folder.close()
def main():
for j in f:
chooseFile(j)
makedir(f, fdir)
A tiny example on how you can go about sorting files into folders, as I couldn't understand why you were randomly selecting files from different locations. This example gets all the file names, and then lets you select a common filename to sort into a new folder
def sort_files(base_dir="C:\sorted_files")
import os, shutil
folder_list = [
'folder_path_a',
'folder_path_b',
]
file_list = []
for folder in folder_list:
file_list.extend([os.path.join(folder, file_name) for file_name in os.listdir(folder)])
key = raw_input("Group files with <text> in the filename: ")
matching_files = [ file_name for file_name in file_list if key in file_name ]
for file_name in matching_files:
shutil.move(file_name, os.path.join(os.path.join(base_dir,key), os.path.basename(file_name)))