I have a file structure that looks like:
|num_1
|----|dir1
|--------|dir2
|------------|dcm
|----------------\file_1
|----------------\file_2
|----------------\file_n
|num_2
|----|dir1
|--------|dcm
|------------\file_1
|------------\file_n
|num_n
I want to use os.walk (or something more appropriate?) to traverse the tree until it finds the directory "dcm". dcm can be at varying levels of the tree.
This is what I have. Thanks!
import dicom
import os

dcm = []
PATH = r"C:\foo"  # raw string so \f isn't treated as an escape character

# find the directory we want to get to, save path
for path, dirs, files in os.walk(PATH):  # os.walk yields (path, dirs, files) triples
    for dirname in dirs:
        fullpath = os.path.join(path, dirname)
        if "dcm" in dirname:
            # copied this first_file line - just want a fast and easy way to grab ONE file
            # in the dcm directory without reading any of the others (for time reasons)
            first_file = next((f for f in os.listdir(fullpath)
                               if os.path.isfile(os.path.join(fullpath, f))), "none")
            dcm.append(os.path.join(fullpath, first_file))
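One thing worth knowing about os.walk: if you remove entries from dirs in place, it won't descend into them, so you can stop walking below a dcm directory once it has been found. A minimal sketch of that idea, reusing the question's r"C:\foo" root:

import os

dcm = []
for path, dirs, files in os.walk(r"C:\foo"):
    for dirname in dirs:
        if "dcm" in dirname:
            dcm.append(os.path.join(path, dirname))
    # prune in place: os.walk will not descend into the dcm dirs we just recorded
    dirs[:] = [d for d in dirs if "dcm" not in d]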
I went ahead with the "lazy" way and used listdir to read out all of the files under the dcm directory - I decided that the resource cost wasn't too high.
That being said, I think that pulling a single file out of a directory without reading all of the others is an interesting question that someone more Python-oriented than I am should answer!
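On that single-file question: os.scandir returns a lazy iterator, so you can take the first regular file without building the whole listing. A sketch, not benchmarked:

import os

def first_file(dirpath):
    # next() pulls entries one at a time; stop at the first regular file
    with os.scandir(dirpath) as it:
        return next((entry.path for entry in it if entry.is_file()), None)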
For reference, my final solution was the following... do excuse the inefficiencies in iterator usage! I am new and needed a quick solution:
for path, dirs, filenames in os.walk(rootDir):  # omit files, loop through later
    for dirname in dirs:
        fullpath = os.path.join(path, dirname)
        if "dcm" in dirname:
            dcm.append(fullpath)

final = []
uni = 0
final.append(dcm[0])
for i in range(len(dcm)):
    if len(os.listdir(dcm[i])) < 10:
        pass
    elif dcm[i][16:19] != final[uni][16:19]:  # compare a fixed slice of the path to spot duplicates
        final.append(dcm[i])
        uni += 1

tags = ((0x8, 0x70), (0x8, 0x1090), (0x18, 0x1020))
values = []
printout = []
for p in range(len(final)):
    file = os.path.join(final[p], os.listdir(final[p])[0])
    ds = dicom.read_file(file)
    printout.append([final[p]])
    for k in range(len(tags)):
        printout.append([ds[tags[k]]])
Related
from os import listdir, path

files = [
    f for f in listdir(input_path)
    if path.isfile(path.join(input_path, f))
]
if files:
    for file in files:
        if file.endswith(".xml"):
            xml_filename = input_path + file
        elif file.endswith(".csv"):
            csv_filename = input_path + file
        elif file.endswith(".osgb"):
            osgb_filename = input_path + file
        elif file.endswith(".xodr"):
            xodr_filename = input_path + file
I'm trying to get 4 files from a directory, each with a specific extension, but my solution looks kinda ugly. You smart guys may have a clever solution ;D
You can reduce the amount of code if you move your results into a collection that can be filled in a loop. With individual variables, you need a line of code per variable for the assignment. Using a dictionary and the standard pathlib module, your code could be:
from pathlib import Path
files = {path.suffix[1:]: path for path in Path(input_path).iterdir()
         if path.suffix in {".csv", ".xml", ".osgb", ".xodr"}}
Now xml_filename is files["xml"].
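If one of the four extensions might be absent from the directory, dict.get sidesteps the KeyError:

xml_filename = files.get("xml")  # None when no .xml file was present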
Use glob.glob():
from glob import glob
import os
xml_filename = glob(os.path.join(input_path, '*.xml'))[0]
csv_filename = glob(os.path.join(input_path, '*.csv'))[0]
osgb_filename = glob(os.path.join(input_path, '*.osgb'))[0]
xodr_filename = glob(os.path.join(input_path, '*.xodr'))[0]
Note that this code assumes that there's at least one of each file type in the directory. You can use try/except to catch the IndexError if glob() doesn't return any matches.
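For instance, the .xml lookup with that try/except, along the lines the note suggests:

from glob import glob
import os

try:
    xml_filename = glob(os.path.join(input_path, '*.xml'))[0]
except IndexError:
    xml_filename = None  # no .xml file in the directory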
I have a folder with subdirectories which contain CSV files. Each subdirectory has CSV files named modified.csv, added_field.csv and retired.csv. How can I loop through each subdirectory, read all files in each subdirectory starting with the names modified, added_field and retired, then recursively append them together?
I have tried os.walk in this case, but I don't have a clue how to use it to read all files in each directory by name, append them, move to the next directory, and repeat the process while appending to the previous table. Here is my silly code
import os
from os import walk
import pandas as pd

f = []
path = "working dir"
for (dirpath, dirnames, filenames) in walk(path):
    file1 = [filename for filename in filenames if
             filename.startswith("modified")]
    file2 = [filename for filename in filenames if
             filename.startswith("Added_field")]
    file3 = [filename for filename in filenames if
             filename.startswith("Retired")]
    df1 = pd.read_csv(os.path.join(dirpath, file1[0]))
    df2 = pd.read_csv(os.path.join(dirpath, file2[0]))
    df3 = pd.read_csv(os.path.join(dirpath, file3[0]))
    Finalcombined_df = df1.append([df2, df3], sort=False)
My intention is to go through each subdirectory, read the files by just selecting their starting names (since there are other files there), then append them together and end up with a final large table that combines all tables from each subdirectory.
from pathlib import Path
import pandas as pd

p = Path.cwd()            # if you're in the current working dir
p = Path('top level dir') # or insert a top-level path of choice

f_names = ['modified', 'Added_field', 'Retired']
f = [y for x in f_names for y in p.glob(f'**/{x}*.csv')]  # ** gets all sub-dirs
df = pd.concat([pd.read_csv(x) for x in f])
You can get all files from all subdirectories using Path.rglob:
from pathlib import Path

path = '.'
prefixes = ['modified', 'Added_field', 'Retired']

found = []
for file in Path(path).rglob('*.csv'):
    for p in prefixes:
        if file.name.startswith(p):
            found.append(file)
            break
print(found)
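As an aside, str.startswith also accepts a tuple of prefixes, so the inner loop can collapse into a comprehension (a sketch, same path and prefixes assumed):

found = [f for f in Path(path).rglob('*.csv')
         if f.name.startswith(tuple(prefixes))]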
I want to read N text files in a folder and store them as N variables. Note, the input will just be the folder path, and the number of text files in it may vary (hence N).
Manually I do it like the code below, which needs to be completely changed:
import os
os.chdir('C:/Users/Documents/0_CDS/fileread') # Work DIrectory
#reading file
File_object1 = open(r"abc","r")
ex1=File_object1.read()
File_object2 = open(r"def.txt","r")
ex2=File_object2.read()
File_object3 = open(r"ghi.txt","r")
ex3=File_object3.read()
File_object4 = open(r"jkl.txt","r")
ex4=File_object4.read()
File_object5 = open(r"mno.txt","r")
ex5=File_object5.read()
You can use Python's built-in dict. Here I key each entry by its filename; you can name them any way you like.
import os

path = 'Your Directory'
result_dict = {}
for root, dirs, files in os.walk(path):
    for f in files:
        with open(os.path.join(root, f), 'r') as myfile:  # join with root so files in subdirs resolve
            result_dict[f] = myfile.read()
If you are not interested in the file names but only the content, and there are only files in the dir:
from os import listdir
l = [open(f).read() for f in listdir('.')]
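One caveat with that one-liner: the file handles are left for the garbage collector to close. A pathlib variant that reads and closes each file, under the same only-files assumption:

from pathlib import Path

l = [p.read_text() for p in Path('.').iterdir() if p.is_file()]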
I am really new to Python and coding in general and am looking for some help with optimizing my code. I am attempting to write a script that will locate all "Temp" folders under a certain path, find the newest file age for each, and proceed with the deletion only if no files found are newer than 1 hour old. Currently the deletion mechanism is not implemented, but that should be easy enough to add once the files have been located and checked.
My current iteration has no problem running on my test folder structure of a few thousand files, but when I try to run it on the real thing (5b+ files) it of course takes forever. Almost all of these files reside outside of the "Temp" folders. Is there a way to isolate the searching of files to only Temp folders?
Below is my current code.
import os
import fnmatch
import time
import calendar

def find_newest_file(path):
    for cur_path, dirnames, filenames in os.walk(path):
        for filename in filenames:
            yield os.path.join(cur_path, filename)

matches = []
latest_file = []
for root, dirnames, filenames in os.walk("--Path To Network Share--"):
    for filename in fnmatch.filter(dirnames, '*Temp'):
        matches.append(os.path.join(root, filename))
        latest_file.append(max(find_newest_file(os.path.join(root, filename)),
                               key=os.path.getmtime))

counter = 0
newestfileage = []
for name in matches:
    newestfileage.append((calendar.timegm(time.gmtime()) - os.stat(latest_file[counter]).st_mtime) / 3600)
    counter += 1

if min(newestfileage) < 1:
    print("\nToo new of an Entry in Temp folders, stopping.\nNewest file is only", min(newestfileage)*60, "Minutes Old")
else:
    print("\nAll Temp files are older than 1 hour old, proceeding.")
If there is an entirely different approach to this I am all ears.
It seems to me, you only need to do the following:
import re
import os

temp_regex = re.compile(r'.*Temp')
matches = []
latest_file = []
for root, dirnames, filenames in os.walk("--Path To Network Share--"):
    if temp_regex.match(os.path.basename(root)):
        matches.append(root)
        fullnames = (os.path.join(root, filename) for filename in filenames)
        latest_file.append(max(fullnames, key=os.path.getmtime))
And dispense with your find_newest_file function, which initiates additional os.walks over matching directories; you end up visiting each directory as many times as it sits below a root that you pass to find_newest_file. That makes your code scale horrifically.
As an aside, don't keep a counter variable if you are incrementing in every iteration, use enumerate instead:
newestfileage = []
for i, name in enumerate(matches):
    newestfileage.append((calendar.timegm(time.gmtime()) - os.stat(latest_file[i]).st_mtime) / 3600)
Or better yet, I think you just need to iterate with for file in latest_file, since you never use name and only use i to index into latest_file. Indeed, the above could probably be accomplished during your os.walk pass:
import os
import re
import time
import calendar

temp_regex = re.compile(r'.*Temp')
matches = []
latest_file = []
newestfileage = []
for root, dirnames, filenames in os.walk("--Path To Network Share--"):
    if temp_regex.match(os.path.basename(root)):
        matches.append(root)
        fullnames = (os.path.join(root, filename) for filename in filenames)
        latest = max(fullnames, key=os.path.getmtime)
        latest_file.append(latest)
        newestfileage.append((calendar.timegm(time.gmtime()) - os.stat(latest).st_mtime) / 3600)
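One caveat: max() raises ValueError on an empty sequence, so a matched Temp folder that contains no files would crash the loop. Adding and filenames to the condition skips those:

    if temp_regex.match(os.path.basename(root)) and filenames:  # skip empty Temp folders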
Is it possible to append to different lists while looping through multiple directories simultaneously? My code:
import os

def trav(dir_1, dir_2):
    data_0 = []
    data_1 = []
    for dir in [dir_1, dir_2]:
        for path, dirs, files in os.walk(dir):
            for file in files:
                with open(os.path.join(path, file)) as fh:  # open the file; file itself is just a name
                    for line in fh:
                        data_0.append(line)
How do I append lines from dir_1 -> data_0 and lines from dir_2 -> data_1 using one loop? I know I can write two separate methods, but would like to know if there is a more efficient, simpler way of doing it. I tried using chain from itertools, but no luck with that. Any suggestions?
If you do not want two loops, that's okay; you can simply perform an if:
def trav(dir_1, dir_2):
    data_0 = []
    data_1 = []
    for dir in [dir_1, dir_2]:
        current_dir = dir
        for path, dirs, files in os.walk(dir):
            for file in files:
                with open(os.path.join(path, file)) as fh:
                    for line in fh:
                        if current_dir == dir_1:
                            data_0.append(line)
                        else:
                            data_1.append(line)
Another way could be:
def trav(dir_1, dir_2):
    data_0 = []
    data_1 = []
    for dir in [dir_1, dir_2]:
        if dir == dir_1:
            data = data_0
        else:
            data = data_1
        for path, dirs, files in os.walk(dir):
            for file in files:
                with open(os.path.join(path, file)) as fh:
                    for line in fh:
                        data.append(line)
The second one will run faster than the first, since fewer comparisons are needed.
Well, you could make data a dict:
def trav(dir_1, dir_2):
    data = {}
    data[dir_1] = []
    data[dir_2] = []
    for dir in [dir_1, dir_2]:
        for path, dirs, files in os.walk(dir):
            for file in files:
                with open(os.path.join(path, file)) as fh:
                    for line in fh:
                        data[dir].append(line)
Or you could make data a collections.defaultdict(list). Then you wouldn't have to initialize the entries to empty lists. Also, I would suggest you not use the name dir because of confusion with the built-in name. There's no harm done here though, because it's a local variable.
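A minimal sketch of that defaultdict variant, with the same open-the-file fix as above:

import os
from collections import defaultdict

def trav(*dirs):
    data = defaultdict(list)  # list entries spring into existence on first access
    for d in dirs:
        for path, dirnames, files in os.walk(d):
            for file in files:
                with open(os.path.join(path, file)) as fh:
                    data[d].extend(fh)  # extend with the file's lines
    return data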