Checking if pairs from zip are correct? - python

I need your advice on this problem.
I have collected what I need in these two lists: simpl2, astik, with this code:
simpl2 = []
astik = []
for path, subdirs, files in os.walk(rootfolder):
    for name in files:
        if 'sim2.shp' == name:
            simpl2.append(os.path.join(path, name))
        elif 'ASTIK.shp' == name:
            astik.append(os.path.join(path, name))
The code above searches in a rootfolder that contains the folders v1, v2, v3, v4.
So using this:
for i, j in zip(simpl2, astik):
    print(i, j)
gives this:
Content
C:\Users\user\Desktop\pl\v1\exported\sim2.shp C:\Users\user\Desktop\pl\v1\ASTIK\ASTIK.shp
C:\Users\user\Desktop\pl\v2\exported\sim2.shp C:\Users\user\Desktop\pl\v4\ASTIK\ASTIK.shp
Question
How can I ensure that the pairs come from the same folder (like the first row, where both come from v1), and, if they don't (like the second row, where one is from v2 and the other from v4), make them have no pair at all?
This matters because the pairs will be used later and they have to be correct; I already have code ready with an exception for those that don't have a pair, so the problem is only how to fix the part described above.
Explanation
The rootfolder is:
C:\Users\user\Desktop\pl
after that pl there is a v1, v2, v3, v4 folder. Each of these folders has some files that are the same across all four folders; the only difference is that some will be empty. I just want to check that correct pairs from the same v folder are created in the lists.

Ok, seeing your update maybe you are interested in something more like this:
import os

simpl2 = []
astik = []
rootfolder = r'C:\Users\user\Desktop\pl'
subfolders = [os.path.join(rootfolder, i) for i in ['v1', 'v2', 'v3', 'v4']]

for folder in subfolders:
    temp = {name: os.path.join(path, name)
            for path, subdirs, files in os.walk(folder)
            for name in files
            if name in ['sim2.shp', 'ASTIK.shp']}
    if len(temp) == 2:
        simpl2.append(temp['sim2.shp'])
        astik.append(temp['ASTIK.shp'])
OLD CODE
But... if this is your end goal, you could also just store the paths. If both files are in a path, then you know that path contains both files, and you can easily build the full file paths with os.path.join() when needed.
paths = []
for path, subdirs, files in os.walk(rootfolder):
    if ('sim2.shp' in files) and ('ASTIK.shp' in files):
        paths.append(path)
Or a more compact format:
lookfor = ['sim2.shp','ASTIK.shp']
paths = [p for p,s,f in os.walk(rootfolder) if all(i in f for i in lookfor)]
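If it helps, here is a minimal sketch of that last step (assuming paths was built as above and the file names are exactly 'sim2.shp' and 'ASTIK.shp'): each stored folder is turned back into a matched pair only when you need it, so mismatched folders simply never produce a pair.
import os

for p in paths:
    # Both files are guaranteed to exist in p, so the pair always comes from the same folder.
    sim2_path = os.path.join(p, 'sim2.shp')
    astik_path = os.path.join(p, 'ASTIK.shp')
    print(sim2_path, astik_path)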

Related

How do I read files from two folders in the same order in python

I have two folders with the same file names, but when I try to read all the text files from the folders in Python, they are read in a different order. I need to read the files from the two folders in the same order because they correspond. I used the following code to read all the text files in a folder:
dir_psnr=current_path+'\\'+dir_psnr+'\\'
os.chdir(dir_psnr) #change directory to downloads folder
files_path =[os.path.abspath(x) for x in os.listdir()]
fnames_psnr_tmp = [x for x in files_path if x.endswith(".txt")]
the address of the folders are as follows:
F:\RD_data_from_twitch_system\RD_data_from_twitch_system\psnr
F:\RD_data_from_twitch_system\RD_data_from_twitch_system\bitrate
the name of text files in both two folders are as follows:
asmr_1.txt
asmr_2.txt
Counter_strike_1.txt
Counter_strike_2.txt
dota2_1.txt
What is the problem, and how can I read the files in the same order?
The full code is:
def reading_file_to_array(dir_psnr, current_path):
    dir_psnr = current_path + '\\' + dir_psnr + '\\'
    os.chdir(dir_psnr)  # change directory to downloads folder
    files_path = [os.path.abspath(x) for x in os.listdir()]
    fnames_psnr_tmp = [x for x in files_path if x.endswith(".txt")]
    .
    .
    .
    return()
current_path = 'F:/RD_data_from_twitch_system/RD_data_from_twitch_system'
current_dir = 'F:/RD_data_from_twitch_system/RD_data_from_twitch_system'
all_sub_dir_paths = glob(str(current_dir) + '/*/')
all_sub_dir_names = [Path(sub_dir).name for sub_dir in all_sub_dir_paths]
for i in range(len(all_sub_dir_names)):
    if all_sub_dir_names[i] == 'bitrate':
        bitrate_1080p, bitrate_720p, bitrate_480p, bitrate_360p, bitrate_160p = reading_file_to_array(all_sub_dir_names[i], current_path)
    else:
        psnr_1080p, psnr_720p, psnr_480p, psnr_360p, psnr_160p = reading_file_to_array(all_sub_dir_names[i], current_path)
Since the file names are the same, you could list the files in one directory and then join each base name onto both directory paths for processing. This can be done in a generator that you use in a loop. For example:
folder1 = r"F:\RD_data_from_twitch_system\RD_data_from_twitch_system\psnr"
folder2 = r"F:\RD_data_from_twitch_system\RD_data_from_twitch_system\bitrate"
def list_directories(primary, secondary):
primary = os.path.abspath(primary)
secondary = os.path.abspath(secondary)
for fn in os.listdir(primary):
if fn.endswith(".txt"):
yield (os.path.join(primary, fn),
os.path.join(secondary, fn))
# print files for test
for f1, f2 in list_directories(folder1, folder2):
print(f1, f2)
It's usually a bad idea to use os.chdir, especially without remembering which directory you came from. As long as your code builds absolute path names, the current working directory doesn't matter.
The easiest way would be to use listdir and to append the path to the front of every element of the list.
import os

# hardcoded folders
def reading_file_to_array(dir_1, dir_2):
    list_1 = [f"{dir_1}/" + f for f in os.listdir(dir_1)]
    list_2 = [f"{dir_2}/" + f for f in os.listdir(dir_2)]
    # Add more lists
    # Do sorting stuff here if needed
    return zip(list_1, list_2)

for f1, f2 in reading_file_to_array("./f_1", "./f_2"):
    print(f1, f2)

# more dynamic approach
def reading_file_to_array_dyn(dirs):
    results = list()
    for directory in dirs:
        results.append([f"{directory}/" + f for f in os.listdir(directory)])
    # Do sorting stuff here if needed
    return zip(*results)

for f1, f2 in reading_file_to_array_dyn(["./f_1", "./f_2"]):
    print(f1, f2)
The result of this test code looks like this for me:
./f_1/a.txt ./f_2/a.txt
./f_1/b.txt ./f_2/b.txt
./f_1/c.txt ./f_2/c.txt
If you want to filter the files in a folder based on type, I recommend the glob module.
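For instance, a minimal sketch (reusing the ./f_1 and ./f_2 test folders from above, which are only placeholders) that keeps just the .txt files and keeps both lists in the same order:
import glob
import os

# Sort both listings so the corresponding files line up by name.
list_1 = sorted(glob.glob(os.path.join("./f_1", "*.txt")))
list_2 = sorted(glob.glob(os.path.join("./f_2", "*.txt")))

for f1, f2 in zip(list_1, list_2):
    print(f1, f2)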

building a dictionary of my directories and file paths to select all files whose name contains a specific string

I have a root directory called 'IC'. 'IC' contains a bunch of subdirectories which contain subsubdirectories which contain subsubsubdirectories and so on. Is there an easy way to move all the sub...directory files into their parent subdirectory and then delete the empty sub...directories?
So far I've made this monstrosity of nested loops to build a dictionary of file paths and subdirectories as dictionaries containing file paths, etc. I was going to then make something to go through the dictionary and pick out all files containing 'IC' and the subdirectory they are in. I need to know which directories contain an 'IC' file or not. I also need to move all the files containing 'IC' to the top-level subdirectories (see the # comment in the code).
import os, shutil

rootdir = 'data/ICs'

def dir_tree(rootdir):
    IC_details = {}
    # This first loop is what I'm calling the top level subdirectories. They are the three
    # subdirectories inside the directory 'data/ICs'
    for i in os.scandir(rootdir):
        if os.path.isdir(i):
            IC_details[i.path] = {}
    for i in IC_details:
        for j in os.scandir(i):
            if os.path.isdir(j.path):
                IC_details[i][j.name] = {}
            elif os.path.isfile(j.path):
                IC_details[i][j.name] = [j.path]
        for j in IC_details[i]:
            if os.path.isdir(os.path.join(i, j)):
                for k in os.scandir(os.path.join(i, j)):
                    if os.path.isdir(k.path):
                        IC_details[i][j][k.name] = {}
                    elif os.path.isfile(k.path):
                        IC_details[i][j][k.name] = [k.path]
                for k in IC_details[i][j]:
                    if os.path.isdir(os.path.join(i, j, k)):
                        for l in os.scandir(os.path.join(i, j, k)):
                            if os.path.isdir(l.path):
                                IC_details[i][j][k][l.name] = {}
                            elif os.path.isfile(l.path):
                                IC_details[i][j][k][l.name] = [l.path]
                        for l in IC_details[i][j][k]:
                            if os.path.isdir(os.path.join(i, j, k, l)):
                                for m in os.scandir(os.path.join(i, j, k, l)):
                                    if os.path.isfile(m.path):
                                        IC_details[i][j][k][l][m.name] = [m.path]
    return IC_details

IC_tree = dir_tree(rootdir)
You should have a look at the 'glob' module:
glob — Unix style pathname pattern expansion
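For instance, a minimal sketch (assuming Python 3.5+ and the rootdir from the question; the '*IC*' pattern is only illustrative) that finds every file whose name contains 'IC' anywhere under the root:
import glob
import os

rootdir = 'data/ICs'

# '**' with recursive=True descends into every subdirectory, however deep.
ic_files = [p for p in glob.glob(os.path.join(rootdir, '**', '*IC*'), recursive=True)
            if os.path.isfile(p)]

for path in ic_files:
    print(path)
From there you could move each matched file with shutil.move and remove the emptied directories with os.rmdir.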

Combining subprocess.os with regex to get a filtered list of directories/files

I'm new to coding and am having difficulty using subprocess.os with regex. I'm trying to get a list of all the files and directories that start with an uppercase C. This is what I've got so far...
home = subprocess.os.path.expanduser("~")
FilesDirsStartingWithC = []
for (dir, subdir, files) in subprocess.os.walk(home):
    match = re.findall(r'^C\w+')
    for i in match:
        FilesDirsStartingWithC.append(i)
print(FilesDirsStartingWithC)
I realise the part between the first for statement and the append statement is wrong but I can't figure out how to fix it. Any help would be much appreciated! Thank you :)
You don't need to use the re module. It's just os.walk() and .startswith():
import os

catalogs = []
for root, dirs, files in os.walk('D:/'):
    for Dir in dirs:
        if Dir.startswith('C'):
            catalogs.append(Dir)

print(catalogs)
Good luck!
Extend your list with the matched entries:
home = subprocess.os.path.expanduser("~")
FilesDirsStartingWithC = []
for (dirs, subdirs, files) in subprocess.os.walk(home):
    FilesDirsStartingWithC.extend(re.findall(r'^C\w+', ''.join(dirs)))
    FilesDirsStartingWithC.extend(re.findall(r'^C\w+', ''.join(subdirs)))
    FilesDirsStartingWithC.extend(re.findall(r'^C\w+', ''.join(files)))
print(FilesDirsStartingWithC)
If you need to walk and capture both directories and files, you can do as suggested earlier by #min-protector but merge the directories, d, and files, f, first:
import os

paths = []
for r, d, f in os.walk('/home/'):
    d_and_f = d + f
    for item in d_and_f:
        if item.startswith('C'):
            paths.append(os.path.join(os.sep, r, item))
Note that I am using os.path.join to add the root, r, to the directories and files paths, to ensure identifying duplications. Without this, the output for \path1\C1 and \path2\C1 will be ['C1', 'C1']
Or you can use a list comprehension:
paths = [os.path.join(os.sep, r, item)
         for r, d, f in os.walk('/home/')
         for item in d + f
         if item.startswith('C')]

Trying to exclude a substring within a list in python

I have a list phplist containing the following strings (example below); there are many more, this is just a snippet of the entire list:
/home/comradec/public_html/moodle/config.php
/home/comradec/public_html/moodle/cache/classes/config.php
/home/comradec/public_html/moodle/theme/sky_high/config.php
/home/comradec/public_html/moodle/theme/brick/config.php
/home/comradec/public_html/moodle/theme/serenity/config.php
/home/comradec/public_html/moodle/theme/binarius/config.php
/home/comradec/public_html/moodle/theme/anomaly/config.php
/home/comradec/public_html/moodle/theme/standard/config.php
What I am trying to do is keep only the top-level config.php file (e.g. moodle/config.php) and exclude all other config.php files (e.g. cache/classes/config.php).
The full code is:
for folder, subs, files in os.walk(path):
    for filename in files:
        if filename.endswith('.php'):
            phplist.append(abspath(join(folder, filename)))

for i in phplist:
    if i.endswith("/config.php"):
        cmsconfig.append(i)
    if i.endswith("/mdeploy.php"):
        cmslist.append(cms1[18])
So the outcome should only add the top-level /config.php path to the list cmsconfig, but what is happening is that I am getting all the config.php files, as in the example at the top.
I have been using code like not i.endswith("/theme/brick/config.php"), but I want a way to exclude the whole theme directory from the list.
The reason I am placing the output into a list is that I use that output in another area of the code.
Change your if-condition to if i.endswith("moodle/config.php").
If you want to change the folder that you want to do this with:
path_ending = '%s/config.php' % folder_name
Now change the if-condition to if i.endswith(path_ending).
This will show the paths that end with config.php within the folder that you passed.
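A minimal sketch of that filter (assuming phplist is already built as in the question and folder_name is the directory you care about, here 'moodle'):
folder_name = 'moodle'
path_ending = '%s/config.php' % folder_name

# Normalise separators so the endswith check also works with Windows-style paths.
cmsconfig = [p for p in phplist
             if p.replace('\\', '/').endswith(path_ending)]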
I think this is what you want; you may want to change the naming of the variables, as it is not PEP 8 style.
First I sort all the entries so that the shortest comes first, then I remember which parts have already been checked.
url1 = '/home/comradec/public_html/moodle/theme/binarius/config.php'
url2 = '/home/comradec/public_html/moodle/config.php'
url3 = '/home/comradec/public_html/othername/theme/binarius/config.php'
url4 = '/home/comradec/public_html/othername/config.php'

urls = []
urls.append(url1)
urls.append(url2)
urls.append(url3)
urls.append(url4)

moodleUrls = []
checkedDirs = []

# sort
for i in sorted(urls):
    if str(i).endswith('config.php'):
        alreadyChecked = False
        for checkedDir in checkedDirs:
            if str(i).startswith(checkedDir):
                alreadyChecked = True
                break
        if not alreadyChecked:
            moodleUrls.append(i)
            checkedDirs.append(str(i).replace('/config.php', ''))

print(checkedDirs)
print(moodleUrls)
Output:
['/home/comradec/public_html/moodle', '/home/comradec/public_html/othername']
['/home/comradec/public_html/moodle/config.php', '/home/comradec/public_html/othername/config.php']
This is the way I resolved my question; it provides the output I am looking for.
path = "/home/comradec"
phplist = []
cmsconfig = []
config = "config.php"
for folder, subs, files in os.walk(path):
for filename in files:
if filename.endswith('.php'):
phplist.append(abspath(join(folder, filename)))
for i in phplist:
if i.endswith("/mdeploy.php"):
newurl = i
newurl = newurl[:-11]
newurl = newurl + config
for i in phplist:
if i.endswith("/config.php"):
confirmurl = i
if confirmurl == newurl:
cmsconfig.append(newurl)
print('\n'.join(cmsconfig))

How to traverse through the files in a directory?

I have a directory logfiles. I want to process each file inside this directory using a Python script.
for file in directory:
    # do something
How do I do this?
With os.listdir() or os.walk(), depending on whether you want to do it recursively.
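For instance, a minimal non-recursive sketch (assuming the logfiles directory sits next to the script):
import os

directory = 'logfiles'
for name in os.listdir(directory):
    path = os.path.join(directory, name)
    if os.path.isfile(path):
        # do something with the file
        print(path)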
In Python 2, you can try something like:
import os.path

def print_it(x, dir_name, files):
    print dir_name
    print files

os.path.walk(your_dir, print_it, 0)
Note: the 3rd argument of os.path.walk is whatever you want. You'll get it as the 1st arg of the callback.
In Python 3 os.path.walk has been removed; use os.walk instead. Instead of taking a callback, you just pass it a directory and it yields (dirpath, dirnames, filenames) triples. So a rough equivalent of the above becomes
import os

for dirpath, dirnames, filenames in os.walk(your_dir):
    print(dirpath)
    print(dirnames)
    print(filenames)
You can list every file from a directory recursively like this.
from os import listdir
from os.path import isfile, join, isdir

def getAllFilesRecursive(root):
    files = [join(root, f) for f in listdir(root) if isfile(join(root, f))]
    dirs = [d for d in listdir(root) if isdir(join(root, d))]
    for d in dirs:
        files_in_d = getAllFilesRecursive(join(root, d))
        if files_in_d:
            for f in files_in_d:
                # f is already a full path rooted at join(root, d), so append it as is
                files.append(f)
    return files
import os

# location of directory you want to scan
loc = '/home/sahil/Documents'

# global dictionary element used to store all results
global k1
k1 = {}

# scan function recursively scans through all the directories in loc and returns a dictionary
def scan(element, loc):
    le = len(element)
    for i in range(le):
        try:
            second_list = os.listdir(loc + '/' + element[i])
            temp = loc + '/' + element[i]
            print "....."
            print "Directory %s " % (temp)
            print " "
            print second_list
            k1[temp] = second_list
            scan(second_list, temp)
        except OSError:
            pass
    return k1  # return the dictionary element

# initial steps
try:
    initial_list = os.listdir(loc)
    print initial_list
except OSError:
    print "error"

k = scan(initial_list, loc)
print " ..................................................................................."
print k
I made this code as a directory scanner to make a playlist feature for my audio player, and it will recursively scan all the subdirectories present in the directory.
You could try glob:
import glob

for file in glob.glob('log-*-*.txt'):
    # Etc.
But glob doesn't work recursively by default (on Python 3.5+ you can pass recursive=True with a ** pattern), so if your logs are in folders inside of that directory, you'd be better off looking at what Ignacio Vazquez-Abrams posted.
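A minimal sketch of the recursive variant (assuming Python 3.5+ and the same log-*-*.txt naming):
import glob

# '**' matches any number of nested folders when recursive=True.
for path in glob.glob('**/log-*-*.txt', recursive=True):
    print(path)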
If you need to check for multiple file types, use
glob.glob("*.jpg") + glob.glob("*.png")
Glob doesn't care about the ordering of the files in the list. If you need files sorted by filename, use
sorted(glob.glob("*.jpg"))
import os

rootDir = '.'
for dirName, subdirList, fileList in os.walk(rootDir):
    print('Found directory: %s' % dirName)
    for fname in fileList:
        print('\t%s' % fname)
    # Remove the first entry in the list of sub-directories
    # if there are any sub-directories present
    if len(subdirList) > 0:
        del subdirList[0]
Here's my version of the recursive file walker based on the answer of Matheus Araujo; it can take optional exclusion list arguments, which happens to be very helpful when dealing with tree copies where some directories / files / file extensions aren't wanted.
import os

def get_files_recursive(root, d_exclude_list=[], f_exclude_list=[], ext_exclude_list=[], primary_root=None):
    """
    Walk a path to recursively find files
    Modified version of https://stackoverflow.com/a/24771959/2635443 that includes exclusion lists
    :param root: path to explore
    :param d_exclude_list: list of root relative directories paths to exclude
    :param f_exclude_list: list of filenames without paths to exclude
    :param ext_exclude_list: list of file extensions to exclude, ex: ['.log', '.bak']
    :param primary_root: Only used for internal recursive exclusion lookup, don't pass an argument here
    :return: list of files found in path
    """
    # Make sure we use a valid os separator for exclusion lists, this is done recursively :(
    d_exclude_list = [os.path.normpath(d) for d in d_exclude_list]

    files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
             and f not in f_exclude_list and os.path.splitext(f)[1] not in ext_exclude_list]
    dirs = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
    for d in dirs:
        p_root = os.path.join(primary_root, d) if primary_root is not None else d
        if p_root not in d_exclude_list:
            files_in_d = get_files_recursive(os.path.join(root, d), d_exclude_list, f_exclude_list, ext_exclude_list, primary_root=p_root)
            if files_in_d:
                for f in files_in_d:
                    # f is already a full path built from join(root, d), so append it as is
                    files.append(f)
    return files
This is an update of my last version that accepts glob style wildcards in exclude lists.
The function basically walks into every subdirectory of the given path and returns the list of all files from those directories, as relative paths.
The function works like Matheus' answer and may use optional exclude lists.
Eg:
files = get_files_recursive('/some/path')
files = get_files_recursive('/some/path', f_exclude_list=['.cache', '*.bak'])
files = get_files_recursive('C:\\Users', d_exclude_list=['AppData', 'Temp'])
files = get_files_recursive('/some/path', ext_exclude_list=['.log', '.db'])
Hope this helps someone like the initial answer of this thread helped me :)
import os
from fnmatch import fnmatch

def glob_path_match(path, pattern_list):
    """
    Checks if path is in a list of glob style wildcard paths
    :param path: path of file / directory
    :param pattern_list: list of wildcard patterns to check for
    :return: Boolean
    """
    return any(fnmatch(path, pattern) for pattern in pattern_list)

def get_files_recursive(root, d_exclude_list=None, f_exclude_list=None, ext_exclude_list=None, primary_root=None):
    """
    Walk a path to recursively find files
    Modified version of https://stackoverflow.com/a/24771959/2635443 that includes exclusion lists
    and accepts glob style wildcards on files and directories
    :param root: path to explore
    :param d_exclude_list: list of root relative directories paths to exclude
    :param f_exclude_list: list of filenames without paths to exclude
    :param ext_exclude_list: list of file extensions to exclude, ex: ['.log', '.bak']
    :param primary_root: Only used for internal recursive exclusion lookup, don't pass an argument here
    :return: list of files found in path
    """
    if d_exclude_list is not None:
        # Make sure we use a valid os separator for exclusion lists, this is done recursively :(
        d_exclude_list = [os.path.normpath(d) for d in d_exclude_list]
    else:
        d_exclude_list = []
    if f_exclude_list is None:
        f_exclude_list = []
    if ext_exclude_list is None:
        ext_exclude_list = []

    files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
             and not glob_path_match(f, f_exclude_list) and os.path.splitext(f)[1] not in ext_exclude_list]
    dirs = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
    for d in dirs:
        p_root = os.path.join(primary_root, d) if primary_root is not None else d
        if not glob_path_match(p_root, d_exclude_list):
            files_in_d = get_files_recursive(os.path.join(root, d), d_exclude_list, f_exclude_list, ext_exclude_list,
                                             primary_root=p_root)
            if files_in_d:
                for f in files_in_d:
                    # f is already a full path built from join(root, d), so append it as is
                    files.append(f)
    return files
