Is it possible to append to different lists while looping through multiple directories simultaneously? My code:
import os

def trav(dir_1, dir_2):
    data_0 = []
    data_1 = []
    for dir in [dir_1, dir_2]:
        for path, dirs, files in os.walk(dir):
            for file in files:
                # files holds names only, so join with path and open before reading lines
                with open(os.path.join(path, file)) as fh:
                    for line in fh:
                        data_0.append(line)
How do I append lines from dir_1 -> data_0 and lines from dir_2 -> data_1 using one loop? I know I can write two separate methods, but I would like to know if there is a more efficient, simpler way of doing it. I tried using chain from itertools, but no luck with that. Any suggestions?
If you do not want two loops, that's okay; you can simply use an if check:
def trav(dir_1, dir_2):
    data_0 = []
    data_1 = []
    for dir in [dir_1, dir_2]:
        for path, dirs, files in os.walk(dir):
            for file in files:
                with open(os.path.join(path, file)) as fh:
                    for line in fh:
                        if dir == dir_1:
                            data_0.append(line)
                        else:
                            data_1.append(line)
Another way could be:
def trav(dir_1, dir_2):
    data_0 = []
    data_1 = []
    for dir in [dir_1, dir_2]:
        # pick the target list once per directory
        if dir == dir_1:
            data = data_0
        else:
            data = data_1
        for path, dirs, files in os.walk(dir):
            for file in files:
                with open(os.path.join(path, file)) as fh:
                    for line in fh:
                        data.append(line)
The second one will run faster than the first, since it needs far fewer comparisons: one per directory instead of one per line.
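A compact variant of the second approach needs no comparison at all: zip pairs each directory with its target list (a minimal sketch, assuming the same os.walk traversal as above):

import os

def trav(dir_1, dir_2):
    data_0 = []
    data_1 = []
    # zip pairs each directory with the list its lines should land in
    for d, data in zip([dir_1, dir_2], [data_0, data_1]):
        for path, dirs, files in os.walk(d):
            for file in files:
                with open(os.path.join(path, file)) as fh:
                    data.extend(fh)
    return data_0, data_1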
Well, you could make data a dict:
def trav(dir_1, dir_2):
    data = {}
    data[dir_1] = []
    data[dir_2] = []
    for dir in [dir_1, dir_2]:
        for path, dirs, files in os.walk(dir):
            for file in files:
                with open(os.path.join(path, file)) as fh:
                    for line in fh:
                        data[dir].append(line)
Or you could make data a collections.defaultdict(list). Then you wouldn't have to initialize the entries to empty lists. Also, I would suggest you not use the name dir because of confusion with the built-in name. There's no harm done here though, because it's a local variable.
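A minimal sketch of that defaultdict version, with the loop variable renamed to d to avoid the built-in dir:

import os
from collections import defaultdict

def trav(dir_1, dir_2):
    data = defaultdict(list)  # missing keys start out as empty lists
    for d in [dir_1, dir_2]:
        for path, dirs, files in os.walk(d):
            for file in files:
                with open(os.path.join(path, file)) as fh:
                    data[d].extend(fh)
    return data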
Related
I have a directory of 50 txt files. I want to combine the contents of each file into a Python list.
Each file looks like:
line1
line2
line3
I am putting the files / file paths into a list with this code. I just need to loop through file_list and append the contents of each txt file to a list.
from pathlib import Path

def searching_all_files(dirpath=Path(r'C:\num')):
    # take the directory as a parameter so the recursive call below works
    assert dirpath.is_dir()
    file_list = []
    for x in dirpath.iterdir():
        if x.is_file():
            file_list.append(x)
        elif x.is_dir():
            file_list.extend(searching_all_files(x))
    return file_list
But I am unsure of the best method. Maybe loop with something close to this?
NOTE: NOT REAL CODE!!!! JUST A THOUGHT PULLED FROM THE AIR. THE QUESTION ISN'T HOW TO FIX THIS. I AM JUST SHOWING THIS AS A THOUGHT. ALL METHODS WELCOME.
file_path = Path(r'.....')
with open(file_path) as f:
    source_path = f.read().splitlines()
    source_nospaces = [x.strip(' ') for x in source_path]
    return source_nospaces
You could make use of Path.rglob in order to search for all files in a directory recursively, and readlines() to append the contents to a list:
from pathlib import Path

files = Path('/tmp/text').rglob('*.txt')
res = []
for file in files:
    # use a context manager so each file handle is closed promptly
    with open(file) as fh:
        res += fh.readlines()
print(res)
Out:
['file_content2\n', 'file_content3\n', 'file_content1\n']
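If you'd rather drop the trailing newlines, a read_text()/splitlines() variant (a sketch, assuming the same /tmp/text layout) matches the splitlines idea from the question:

from pathlib import Path

res = []
for file in Path('/tmp/text').rglob('*.txt'):
    # splitlines() drops the trailing '\n' that readlines() keeps
    res.extend(file.read_text().splitlines())
print(res)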
I need to create a list holding all files from multiple directories.
I have all_dir which contains dir1, dir2, dir3.... Each directory contains multiple files ['text1.txt','text2.txt'...].
While I'm capable of creating a list for a single directory, I can't find a way to automate it.
This is what I have, and it works for a single directory.
path = '../all_dir'
list1 = [f for f in os.listdir(os.path.join(path, 'dir1'))]
list2 = [f for f in os.listdir(os.path.join(path, 'dir2'))]
# etc...
This would be the code I'm thinking of:
all_list = []
for dir1 in os.listdir(path):
    current = os.listdir(os.path.join(path, dir1))
    all_list.append(current)
But this for loop raises NotADirectoryError.
To fix this I've tried
all_list = []
for dir1 in os.listdir(path):
    current = os.walk(os.path.join(path, dir1))
    all_list.append(current)
But this loop doesn't raise; it fills the list with <generator object walk at 0x100ca4e40> objects instead of file names.
Could you help?
os.listdir also gives back files, so in the for loop you should check whether the entry is a directory. You can use os.path.isdir():
import os

all_list = []
for dir1 in os.listdir(path):
    full_path = os.path.join(path, dir1)
    if os.path.isdir(full_path):
        current = os.listdir(full_path)
        all_list += current
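A hypothetical variant with os.scandir does the same job with fewer stat calls, since each DirEntry already knows whether it is a directory (path is the same top-level directory as above):

import os

all_list = []
for entry in os.scandir(path):
    if entry.is_dir():
        all_list += os.listdir(entry.path)  # entry.path is already the joined path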
import os
import glob

# Navigate to the location where all_dir is
os.chdir('../all_dir')

# l is the list of paths of each directory in all_dir
l = []
for folder in os.listdir(os.getcwd()):
    path = os.path.join(os.getcwd(), folder)
    l.append(path)

# all files in all directories will be stored in all_list
all_list = []
for i in l:
    os.chdir(i)
    # '*.*' matches files only (names with an extension); use '*' to match everything in each directory
    all_files = glob.glob('*.*')
    for f in all_files:
        all_list.append(f)

# number of all files
print(len(all_list))
# print all files
print(all_list)
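The chdir calls can be avoided entirely by globbing with full paths; a sketch under the same all_dir layout:

import glob
import os

all_list = []
for folder in os.listdir('../all_dir'):
    # glob with a joined pattern instead of changing the working directory
    all_list.extend(glob.glob(os.path.join('../all_dir', folder, '*.*')))
print(len(all_list))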
So I'm using Python, and I have a parent directory with two child directories, which in turn contain many directories, each with three files. I want to take the third file (which is a .CSV file) of each of these directories and parse them together into a pandas dataframe. This is the code I have thus far:
import os

rootdir = 'C:\\Dir\\Dir\\Dir\\root(parent)dir'
# os.listdir(rootdir)
# os.getcwd()
filelist = os.listdir(rootdir)
# file_count = len(filelist)

def list_files(dir):
    r = []
    for root, dirs, files in os.walk(dir):
        # if files.startswith('C74'):
        for name in files:
            r.append(os.path.join(root, name))
    return r

filelist = list_files(rootdir)
Now with "filelist" I get all file paths contained in all directories as strings. Next I need to:
1. Find the file names that begin with three specific letters (for example funtest, the first letters here being fun).
2. Take every third file and construct a pandas dataframe from it, so that I can proceed to perform data analysis.
IIUC, we can do this much more easily using a recursive glob from pathlib:
import pandas as pd
from pathlib import Path

csv = list(Path(r'parent_dir').rglob('*C74*.csv'))
df = pd.concat([pd.read_csv(f) for f in csv])
If you want to subset your list again, you could do:
subset_list = [x for x in csv if 'abc' in x.stem]
Test:
[x for x in csv if 'abc' in x.stem]
out: ['C74_abc.csv', 'abc_C74.csv']
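The question also asks for every third file. Assuming each directory holds exactly three files and the sorted paths group by directory, plain list slicing picks the third file of each (a sketch building on the csv list above):

# paths sort by directory first, so with three files per directory
# the slice [2::3] picks the third file of each
every_third = sorted(csv)[2::3]
df = pd.concat([pd.read_csv(f) for f in every_third])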
I have a folder with subdirectories which contain CSV files. Each subdirectory has CSV files named modified.csv, added_field.csv, and retired.csv. How can I loop through each subdirectory, read the files whose names start with modified, added_field, and retired, then recursively append them together?
I have tried os.walk in this case, but I don't have a clue how to use it to read the files in each directory by name, append them, move to the next directory, and repeat the process, appending to the previous table. Here is my silly code:
import os
from os import walk
import pandas as pd

f = []
path = "working dir"
for (dirpath, dirnames, filenames) in walk(path):
    file1 = [filenames for filenames in os.listdir(path) if filenames.startswith("modified")]
    file2 = [filenames for filenames in os.listdir(path) if filenames.startswith("Added_field")]
    file3 = [filenames for filenames in os.listdir(path) if filenames.startswith("Retired")]
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)
    df3 = pd.read_csv(file3)
Finalcombined_df = df1.append([df2, df3], sort=False)
My intention is to go through each subdirectory, read files by selecting just their starting names (since there are other files there), then append them together into one final large table that combines the tables from every subdirectory.
import pandas as pd
from pathlib import Path

p = Path.cwd()  # if you're in the current working dir
p = Path('to level dir')  # or insert a top level path of choice

f_names = ['modified', 'Added_field', 'Retired']
f = [y for x in f_names for y in p.glob(f'**/{x}*.csv')]  # ** gets all sub-dirs
df = pd.concat([pd.read_csv(x) for x in f])
You can get all files from all subdirectories using Path.rglob:
from pathlib import Path

path = '.'
prefixes = ['modified', 'Added_field', 'Retired']

found = []
for file in Path(path).rglob('*.csv'):
    for p in prefixes:
        if file.name.startswith(p):
            found.append(file)
            break
print(found)
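str.startswith also accepts a tuple of prefixes, which removes the inner loop, and pd.concat then handles the appending the question asks for; a sketch along the lines of the code above:

import pandas as pd
from pathlib import Path

prefixes = ('modified', 'Added_field', 'Retired')
# startswith accepts a tuple of prefixes, so no inner loop is needed
found = [f for f in Path('.').rglob('*.csv') if f.name.startswith(prefixes)]
combined = pd.concat([pd.read_csv(f) for f in found], sort=False)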
I have a file structure that looks like:
|num_1
|----|dir1
|--------|dir2
|------------|dcm
|----------------\file_1
|----------------\file_2
|----------------\file_n
|num_2
|----|dir1
|--------|dcm
|------------\file_1
|------------\file_n
|num_n
I want to use os.walk (or something more appropriate?) to traverse the tree until it finds the directory "dcm". dcm can be at varying levels of the tree.
This is what I have. Thanks!
import dicom
import re
import os
from os.path import join, isfile

dcm = []
PATH = r"C:\foo"  # raw string, since \f would otherwise be an escape character

# find the directory we want to get to, save path
for path, dirs, files in os.walk(PATH):  # os.walk yields three values per directory
    for dirname in dirs:
        fullpath = os.path.join(path, dirname)
        if "dcm" in dirname:
            # copied this first_file line - just want a fast and easy way to grab ONE file
            # in the dcm directory without reading any of the others (for time reasons)
            first_file = next((join(path, f) for f in os.listdir(path) if isfile(join(path, f))), "none")
            fullpath = os.path.join(fullpath, first_file)
            dcm.append(fullpath)
I went ahead with the "lazy" way and used listdir to read out all of the files under the dcm directory; I decided that the resource cost wasn't too high.
That being said, I think that pulling out a single random file from a directory without reading all of those files is an interesting query that someone more Python oriented than I should answer!
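For what it's worth, here is a hedged sketch of that lazy grab: os.scandir yields entries one at a time, so next() can pull the first regular file without listing the whole directory (first_file_in and dcm_dir are hypothetical names):

import os

def first_file_in(dcm_dir):
    # scandir is lazy: entries are fetched one at a time,
    # so we stop as soon as we hit the first regular file
    with os.scandir(dcm_dir) as it:
        return next((entry.path for entry in it if entry.is_file()), None)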
For reference, my final solution was as follows... do excuse the inefficiencies in iterator usage! I am new and needed a quick solution.
for path, dirs, files in os.walk(rootDir):  # omit files here, loop through them later
    for dirname in dirs:
        fullpath = os.path.join(path, dirname)
        if "dcm" in dirname:
            dcm.append(fullpath)

final = []
uni = 0
final.append(dcm[0])
for i in range(len(dcm)):
    if len(os.listdir(dcm[i])) < 10:
        pass
    elif dcm[i][16:19] != final[uni][16:19]:
        final.append(dcm[i])
        uni += 1

tags = ((0x8, 0x70), (0x8, 0x1090), (0x18, 0x1020))
values = []
printout = []
for p in range(len(final)):
    file = os.path.join(final[p], os.listdir(final[p])[0])
    ds = dicom.read_file(file)
    printout.append([final[p]])
    for k in range(len(tags)):
        printout.append([ds[tags[k]]])
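As an aside, the standalone dicom module used above is the old name of the library; in current pydicom the equivalent read looks like this (a sketch, reusing the file path from the loop above):

import pydicom

# dcmread replaces dicom.read_file in current pydicom
ds = pydicom.dcmread(file)
print(ds[0x8, 0x70])  # (0008,0070) is Manufacturer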