os.listdir analog for a zipped directory

os.listdir analog for a zipped directory - python

My goal is to list all files contained in the certain sub-directory inside a zip-archive.
os.listdir(target_dir) raises a FileNotFoundError, and zfile.namelist() just lists all the files in all directories.
Any ideas?

Try the following:
files = list(filter(lambda f: f.startswith("subdir"), zfile.namelist()))
print(files)
Explanation: filter filters the list supplied by zfile.namelist() on a lambda that is checking whether the filename starts with "subdir".
The filter function does not return a list but rather a filter object (generator) and thus we need to convert it to a list.
You could also use the following line which does the same but uses list comprehension:
files = [f for f in zfile.namelist() if f.startswith("subdir")]
Edit: As pointed out by advance512: "The problem with this solution is that it will also return files in subdirectories inside the subdirectory you're checking.":
files = [f for f in zfile.namelist() if f.startswith("subdir") and f.count("/") == 1]
This will not return any files in sub-sub directories.

You can use the supplied zip_listdir function, which is a bit quick-n-dirty but should always work in Unix clones.
class MockZipFile(object):
fake_file_names = [
"string.pyc", # Top level name
"test/__init__.pyc", # Package directory
"test/test_support.pyc", # Module test.test_support
"test/bogus/__init__.pyc", # Subpackage directory
"test/bogus/myfile.pyc" # Submodule test.bogus.myfile
]
def namelist(self):
return self.fake_file_names
def zip_listdir(zip_file, target_dir):
file_names = zip_file.namelist()
if not target_dir.endswith("/"):
target_dir += "/"
if target_dir == "/":
target_dir = ""
result = [ file_name
for file_name in file_names
if file_name.startswith(target_dir) and
not "/" in file_name[len(target_dir):]
]
return result
mockZipfile = MockZipFile()
print zip_listdir(zip_file=mockZipfile, target_dir="test")
print zip_listdir(zip_file=mockZipfile, target_dir="test/bogus")
print zip_listdir(zip_file=mockZipfile, target_dir="test/")
print zip_listdir(zip_file=mockZipfile, target_dir="/")
print zip_listdir(zip_file=mockZipfile, target_dir="")
print zip_listdir(zip_file=mockZipfile, target_dir="/asd")
Please note I created a MockZipFile class, and am using it as the input for the zip_listdir function, but a proper zipfile object should work exactly the same.

Related

how do I read file from two folder with the same order in python

I have two folders with the same file names, but when I try to read all text files from the folders in python, it reads in a different order. but I need to read files from two folders in the same order because they correspond. I used the following code to read all text files in a folder.
dir_psnr=current_path+'\\'+dir_psnr+'\\'
os.chdir(dir_psnr) #change directory to downloads folder
files_path =[os.path.abspath(x) for x in os.listdir()]
fnames_psnr_tmp = [x for x in files_path if x.endswith(".txt")]
the address of the folders are as follows:
F:\RD_data_from_twitch_system\RD_data_from_twitch_system\psnr
F:\RD_data_from_twitch_system\RD_data_from_twitch_system\bitrate
the name of text files in both two folders are as follows:
asmr_1.txt
asmr_2.txt
Counter_strike_1.txt
Counter_strike_2.txt
dota2_1.txt
what is the problem? and how can I read files in the same order?
the full code is :
def reading_file_to_array(dir_psnr,current_path):
dir_psnr=current_path+'\\'+dir_psnr+'\\'
os.chdir(dir_psnr) #change directory to downloads folder
files_path =[os.path.abspath(x) for x in os.listdir()]
fnames_psnr_tmp = [x for x in files_path if x.endswith(".txt")]
.
.
.
return()
current_path='F:/RD_data_from_twitch_system/RD_data_from_twitch_system'
current_dir ='F:/RD_data_from_twitch_system/RD_data_from_twitch_system'
all_sub_dir_paths = glob(str(current_dir) + '/*/')
all_sub_dir_names = [Path(sub_dir).name for sub_dir in all_sub_dir_paths]
for i in range(len(all_sub_dir_names)):
if all_sub_dir_names[i]=='bitrate':
bitrate_1080p,bitrate_720p,bitrate_480p,bitrate_360p,bitrate_160p=reading_file_to_array(all_sub_dir_names[i], current_path)
else:
psnr_1080p,psnr_720p,psnr_480p,psnr_360p,psnr_160p=reading_file_to_array(all_sub_dir_names[i], current_path)

Since the file names are the same, you could list the files in one directory and then add the bases to both for processing. This could be done in a generator that you can use in a loop. For example
folder1 = r"F:\RD_data_from_twitch_system\RD_data_from_twitch_system\psnr"
folder2 = r"F:\RD_data_from_twitch_system\RD_data_from_twitch_system\bitrate"
def list_directories(primary, secondary):
primary = os.path.abspath(primary)
secondary = os.path.abspath(secondary)
for fn in os.listdir(primary):
if fn.endswith(".txt"):
yield (os.path.join(primary, fn),
os.path.join(secondary, fn))
# print files for test
for f1, f2 in list_directories(folder1, folder2):
print(f1, f2)
Its usually a bad idea to os.chdir- especially without remembering which directory you came from. As long as your code builds absolute path names, the current working directory doesn't matter.

The easiest way would be to use listdir and to append the path to the front of every element of the list.
import os
#hardcoded folders
def reading_file_to_array(dir_1, dir_2):
list_1 = [f"{dir_1}/"+f for f in os.listdir(dir_1)]
list_2 = [f"{dir_2}/"+f for f in os.listdir(dir_2)]
# Add more lists
# Do sorting stuff here if needed
return zip(list_1, list_2)
for f1, f2 in reading_file_to_array("./f_1", "./f_2"):
print(f1, f2)
#more dynamic appraoch
def reading_file_to_array_dyn(dirs):
results = list()
for directory in dirs:
results.append([f"{directory}/"+f for f in os.listdir(directory)])
# Do sorting stuff here if needed
return zip(*results)
for f1, f2 in reading_file_to_array_dyn(["./f_1", "./f_2"]):
print(f1, f2)
The result of this test code looks like this for me:
./f_1/a.txt ./f_2/a.txt
./f_1/b.txt ./f_2/b.txt
./f_1/c.txt ./f_2/c.txt
If you want to filter the files in the folder based on type, I recommend the package glob.

Finding all subfolders that contain two files that end with certain strings

So I have a folder, say D:\Tree, that contains only subfolders (names may contain spaces). These subfolders contain a few files - and they may contain files of the form "D:\Tree\SubfolderName\SubfolderName_One.txt" and "D:\Tree\SubfolderName\SubfolderName_Two.txt" (in other words, the subfolder may contain both of them, one, or neither). I need to find every occurence where a subfolder contains both of these files, and send their absolute paths to a text file (in a format explained in the following example). Consider these three subfolders in D:\Tree:
D:\Tree\Grass contains Grass_One.txt and Grass_Two.txt
D:\Tree\Leaf contains Leaf_One.txt
D:\Tree\Branch contains Branch_One.txt and Branch_Two.txt
Given this structure and the problem mentioned above, I'd to like to be able to write the following lines in myfile.txt:
D:\Tree\Grass\Grass_One.txt D:\Tree\Grass\Grass_Two.txt
D:\Tree\Branch\Branch_One.txt D:\Tree\Branch\Branch_Two.txt
How might this be done? Thanks in advance for any help!
Note: It is very important that "file_One.txt" comes before "file_Two.txt" in myfile.txt

import os
folderPath = r'Your Folder Path'
for (dirPath, allDirNames, allFileNames) in os.walk(folderPath):
for fileName in allFileNames:
if fileName.endswith("One.txt") or fileName.endswith("Two.txt") :
print (os.path.join(dirPath, fileName))
# Or do your task as writing in file as per your need
Hope this helps....

Here is a recursive solution
def findFiles(writable, current_path, ending1, ending2):
'''
:param writable: file to write output to
:param current_path: current path of recursive traversal of sub folders
:param postfix: the postfix which needs to match before
:return: None
'''
# check if current path is a folder or not
try:
flist = os.listdir(current_path)
except NotADirectoryError:
return
# stores files which match given endings
ending1_files = []
ending2_files = []
for dirname in flist:
if dirname.endswith(ending1):
ending1_files.append(dirname)
elif dirname.endswith(ending2):
ending2_files.append(dirname)
findFiles(writable, current_path+ '/' + dirname, ending1, ending2)
# see if exactly 2 files have matching the endings
if len(ending1_files) == 1 and len(ending2_files) == 1:
writable.write(current_path+ '/'+ ending1_files[0] + ' ')
writable.write(current_path + '/'+ ending2_files[0] + '\n')
findFiles(sys.stdout, 'G:/testf', 'one.txt', 'two.txt')

Trying to exclude a substring within a list in python

I have a list phplist containing the following strings (example below), there are many more, this is a snippet of the entire list
/home/comradec/public_html/moodle/config.php
/home/comradec/public_html/moodle/cache/classes/config.php
/home/comradec/public_html/moodle/theme/sky_high/config.php
/home/comradec/public_html/moodle/theme/brick/config.php
/home/comradec/public_html/moodle/theme/serenity/config.php
/home/comradec/public_html/moodle/theme/binarius/config.php
/home/comradec/public_html/moodle/theme/anomaly/config.php
/home/comradec/public_html/moodle/theme/standard/config.php
What I am trying to do is only keep the subdir/config.php file and exclude all other config.php files (eg cache/classes/config.php).
Full code is
for folder, subs, files in os.walk(path):
for filename in files:
if filename.endswith('.php'):
phplist.append(abspath(join(folder, filename)))
for i in phplist:
if i.endswith("/config.php"):
cmsconfig.append(i)
if i.endswith("/mdeploy.php"):
cmslist.append(cms1[18])
So the outcome will only add /config.php file path to the list cmsconfig but what is happening I am getting all the config.php files as in the top example
I have been using the code like is not i.endswith("/theme/brick/config.php") but I want a way to exclude the theme directory from the list.
The reason I am placing the output into a list is I use that output in another area of the code.

Change your if-condition to if i.endswith("moodle/config.php").
If you want to change the folder that you want to this with:
path_ending = '%s/config.php' % folder_name
Now change the if-condition to if i.endswith(path_ending)
This will show paths that end with config.php within the folder tbat you passed.

I think this is what you want. may change the naming of variables it is not pep8 style.
First i sort all entries that the shortest comes first, then i remember which parts are already checked.
url1 = '/home/comradec/public_html/moodle/theme/binarius/config.php'
url2 = '/home/comradec/public_html/moodle/config.php'
url3 = '/home/comradec/public_html/othername/theme/binarius/config.php'
url4 = '/home/comradec/public_html/othername/config.php'
urls = []
urls.append(url1)
urls.append(url2)
urls.append(url3)
urls.append(url4)
moodleUrls = []
checkedDirs = []
#sort
for i in sorted(urls):
if str(i).endswith('config.php'):
alreadyChecked = False
for checkedDir in checkedDirs:
if str(i).startswith(checkedDir):
alreadyChecked = True
break
if not alreadyChecked:
moodleUrls.append(i)
checkedDirs.append(str(i).replace('/config.php',''))
print(checkedDirs)
print(moodleUrls)
Output:
['/home/comradec/public_html/moodle', '/home/comradec/public_html/othername']
['/home/comradec/public_html/moodle/config.php', '/home/comradec/public_html/othername/config.php']

The way I resolved my question. Provides the output I am looking for.
path = "/home/comradec"
phplist = []
cmsconfig = []
config = "config.php"
for folder, subs, files in os.walk(path):
for filename in files:
if filename.endswith('.php'):
phplist.append(abspath(join(folder, filename)))
for i in phplist:
if i.endswith("/mdeploy.php"):
newurl = i
newurl = newurl[:-11]
newurl = newurl + config
for i in phplist:
if i.endswith("/config.php"):
confirmurl = i
if confirmurl == newurl:
cmsconfig.append(newurl)
print('\n'.join(cmsconfig))

Python - Truncate unknown file names

Let's say I have the following files in a directory:
snackbox_1a.dat
zebrabar_3z.dat
cornrows_00.dat
meatpack_z2.dat
I have SEVERAL of these directories, in which all of the files are of the same format, ie:
snackbox_xx.dat
zebrabar_xx.dat
cornrows_xx.dat
meatpack_xx.dat
So what I KNOW about these files is the first bit (snackbox, zebrabar, cornrows, meatpack). What I don't know is the bit for the file extension (the 'xx'). This changes both within the directory across the files, and across the directories (so another directory might have different xx values, like 12, yy, 2m, 0t, whatever).
Is there a way for me to rename all of these files, or truncate them all (since the xx.dat will always be the same length), for ease of use when attempting to call them? For instance, I'd like to rename them so that I can, in another script, use a simple index to step through and find the file I want (instead of having to go into each directory and pull the file out manually).
In other words, I'd like to change the file names to:
snackbox.dat
zebrabar.dat
cornrows.dat
meatpack.dat
Thanks!

You can use shutil.move to move files. To calculate the new filename, you can use Python's string split method:
original_name = "snackbox_12.dat"
truncated_name = original.split("_")[0] + ".dat"

Try re.sub:
import re
filename = 'snackbox_xx.dat'
filename_new = re.sub(r'_[A-Za-z0-9]{2}', '', filename)
You should get 'snackbox.dat' for filename_new
This assumes the two characters after the "_" are either a number or lowercase/uppercase letter, but you could choose to expand the classes included in the regular expression.
EDIT: including moving and recursive search:
import shutil, re, os, fnmatch
directory = 'your_path'
for root, dirnames, filenames in os.walk(directory):
for filename in fnmatch.filter(filenames, '*.dat'):
filename_new = re.sub(r'_[A-Za-z0-9]{2}', '', filename)
shutil.move(os.path.join(root, filename), os.path.join(root, filename_new))

This solution renames all files in the current directory that match the pattern in the function call.
What the function does
snackbox_5R.txt >>> snackbox.txt
snackbox_6y.txt >>> snackbox_0.txt
snackbox_a2.txt >>> snackbox_1.txt
snackbox_Tm.txt >>> snackbox_2.txt
Let's look at the functions inputs and some examples.
list_of_files_names This is a list of string. Where each string is the filename without the _?? part.
Examples:
['snackbox.txt', 'zebrabar.txt', 'cornrows.txt', 'meatpack.txt', 'calc.txt']
['text.dat']
upper_bound=1000 This is an integer. When the ideal filename is already taken, e.g snackbox.dat already exist it will create snackbox_0.dat all the way up to snackbox_9999.dat if need be. You shouldn't have to change the default.
The Code
import re
import os
import os.path
def find_and_rename(dir, list_of_files_names, upper_bound=1000):
"""
:param list_of_files_names: List. A list of string: filname (without the _??) + extension, EX: snackbox.txt
Renames snackbox_R5.dat to snackbox.dat, etc.
"""
# split item in the list_of_file_names into two parts, filename and extension "snackbox.dat" -> "snackbox", "dat"
list_of_files_names = [(prefix.split('.')[0], prefix.split('.')[1]) for prefix in list_of_files_names]
# store the content of the dir in a list
list_of_files_in_dir = os.listdir(dir)
for file_in_dir in list_of_files_in_dir: # list all files and folders in current dir
file_in_dir_full_path = os.path.join(dir, file_in_dir) # we need the full path to rename to use .isfile()
print() # DEBUG
print('Is "{}" a file?: '.format(file_in_dir), end='') # DEBUG
print(os.path.isfile(file_in_dir_full_path)) # DEBUG
if os.path.isfile(file_in_dir_full_path): # filters out the folder, only files are needed
# Filename is a tuple containg the prefix filename and the extenstion
for file_name in list_of_files_names: # check if the file matches on of our renaming prefixes
# match both the file name (e.g "snackbox") and the extension (e.g "dat")
# It find "snackbox_5R.txt" by matching "snackbox" in the front and matching "dat" in the rear
if re.match('{}_\w+\.{}'.format(file_name[0], file_name[1]), file_in_dir):
print('\nOriginal File: ' + file_in_dir) # printing this is not necessary
print('.'.join(file_name))
ideal_new_file_name = '.'.join(file_name) # name might already be taken
# print(ideal_new_file_name)
if os.path.isfile(os.path.join(dir, ideal_new_file_name)): # file already exists
# go up a name, e.g "snackbox.dat" --> "snackbox_1.dat" --> "snackbox_2.dat
for index in range(upper_bound):
# check if this new name already exists as well
next_best_name = file_name[0] + '_' + str(index) + '.' + file_name[1]
# file does not already exist
if os.path.isfile(os.path.join(dir,next_best_name)) == False:
print('Renaming with next best name')
os.rename(file_in_dir_full_path, os.path.join(dir, next_best_name))
break
# this file exist as well, keeping increasing the name
else:
pass
# file with ideal name does not already exist, rename with the ideal name (no _##)
else:
print('Renaming with ideal name')
os.rename(file_in_dir_full_path, os.path.join(dir, ideal_new_file_name))
def find_and_rename_include_sub_dirs(master_dir, list_of_files_names, upper_bound=1000):
for path, subdirs, files in os.walk(master_dir):
print(path) # DEBUG
find_and_rename(path, list_of_files_names, upper_bound)
find_and_rename_include_sub_dirs('C:/Users/Oxen/Documents/test_folder', ['snackbox.txt', 'zebrabar.txt', 'cornrows.txt', 'meatpack.txt', 'calc.txt'])

How to traverse through the files in a directory?

I have a directory logfiles. I want to process each file inside this directory using a Python script.
for file in directory:
# do something
How do I do this?

With os.listdir() or os.walk(), depending on whether you want to do it recursively.

In Python 2, you can try something like:
import os.path
def print_it(x, dir_name, files):
print dir_name
print files
os.path.walk(your_dir, print_it, 0)
Note: the 3rd argument of os.path.walk is whatever you want. You'll get it as the 1st arg of the callback.
In Python 3 os.path.walk has been removed; use os.walk instead. Instead of taking a callback, you just pass it a directory and it yields (dirpath, dirnames, filenames) triples. So a rough equivalent of the above becomes
import os
for dirpath, dirnames, filenames in os.walk(your_dir):
print dirpath
print dirnames
print filenames

You can list every file from a directory recursively like this.
from os import listdir
from os.path import isfile, join, isdir
def getAllFilesRecursive(root):
files = [ join(root,f) for f in listdir(root) if isfile(join(root,f))]
dirs = [ d for d in listdir(root) if isdir(join(root,d))]
for d in dirs:
files_in_d = getAllFilesRecursive(join(root,d))
if files_in_d:
for f in files_in_d:
files.append(join(root,f))
return files

import os
# location of directory you want to scan
loc = '/home/sahil/Documents'
# global dictonary element used to store all results
global k1
k1 = {}
# scan function recursively scans through all the diretories in loc and return a dictonary
def scan(element,loc):
le = len(element)
for i in range(le):
try:
second_list = os.listdir(loc+'/'+element[i])
temp = loc+'/'+element[i]
print "....."
print "Directory %s " %(temp)
print " "
print second_list
k1[temp] = second_list
scan(second_list,temp)
except OSError:
pass
return k1 # return the dictonary element
# initial steps
try:
initial_list = os.listdir(loc)
print initial_list
except OSError:
print "error"
k =scan(initial_list,loc)
print " ..................................................................................."
print k
I made this code as a directory scanner to make a playlist feature for my audio player and it will recursively scan all the sub directories present in directory.

You could try glob:
import glob
for file in glob.glob('log-*-*.txt'):
# Etc.
But glob doesn't work recursively (as far as I know), so if your logs are in folders inside of that directory, you'd be better off looking at what Ignacio Vazquez-Abrams posted.

If you need to check for multiple file types, use
glob.glob("*.jpg") + glob.glob("*.png")
Glob doesn't care about the ordering of the files in the list. If you need files sorted by filename, use
sorted(glob.glob("*.jpg"))

import os
rootDir = '.'
for dirName, subdirList, fileList in os.walk(rootDir):
print('Found directory: %s' % dirName)
for fname in fileList:
print('\t%s' % fname)
# Remove the first entry in the list of sub-directories
# if there are any sub-directories present
if len(subdirList) > 0:
del subdirList[0]

Here's my version of the recursive file walker based on the answer of Matheus Araujo, that can take optional exclusion list arguments, which happens to be very helpful when dealing with tree copies where some directores / files / file extensions aren't wanted.
import os
def get_files_recursive(root, d_exclude_list=[], f_exclude_list=[], ext_exclude_list=[], primary_root=None):
"""
Walk a path to recursively find files
Modified version of https://stackoverflow.com/a/24771959/2635443 that includes exclusion lists
:param root: path to explore
:param d_exclude_list: list of root relative directories paths to exclude
:param f_exclude_list: list of filenames without paths to exclude
:param ext_exclude_list: list of file extensions to exclude, ex: ['.log', '.bak']
:param primary_root: Only used for internal recursive exclusion lookup, don't pass an argument here
:return: list of files found in path
"""
# Make sure we use a valid os separator for exclusion lists, this is done recursively :(
d_exclude_list = [os.path.normpath(d) for d in d_exclude_list]
files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
and f not in f_exclude_list and os.path.splitext(f)[1] not in ext_exclude_list]
dirs = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
for d in dirs:
p_root = os.path.join(primary_root, d) if primary_root is not None else d
if p_root not in d_exclude_list:
files_in_d = get_files_recursive(os.path.join(root, d), d_exclude_list, f_exclude_list, ext_exclude_list, primary_root=p_root)
if files_in_d:
for f in files_in_d:
files.append(os.path.join(root, f))
return files

This is an update of my last version that accepts glob style wildcards in exclude lists.
The function basically walks into every subdirectory of the given path and returns the list of all files from those directories, as relative paths.
Function works like Matheus' answer, and may use optional exclude lists.
Eg:
files = get_files_recursive('/some/path')
files = get_files_recursive('/some/path', f_exclude_list=['.cache', '*.bak'])
files = get_files_recursive('C:\\Users', d_exclude_list=['AppData', 'Temp'])
files = get_files_recursive('/some/path', ext_exclude_list=['.log', '.db'])
Hope this helps someone like the initial answer of this thread helped me :)
import os
from fnmatch import fnmatch
def glob_path_match(path, pattern_list):
"""
Checks if path is in a list of glob style wildcard paths
:param path: path of file / directory
:param pattern_list: list of wildcard patterns to check for
:return: Boolean
"""
return any(fnmatch(path, pattern) for pattern in pattern_list)
def get_files_recursive(root, d_exclude_list=None, f_exclude_list=None, ext_exclude_list=None, primary_root=None):
"""
Walk a path to recursively find files
Modified version of https://stackoverflow.com/a/24771959/2635443 that includes exclusion lists
and accepts glob style wildcards on files and directories
:param root: path to explore
:param d_exclude_list: list of root relative directories paths to exclude
:param f_exclude_list: list of filenames without paths to exclude
:param ext_exclude_list: list of file extensions to exclude, ex: ['.log', '.bak']
:param primary_root: Only used for internal recursive exclusion lookup, don't pass an argument here
:return: list of files found in path
"""
if d_exclude_list is not None:
# Make sure we use a valid os separator for exclusion lists, this is done recursively :(
d_exclude_list = [os.path.normpath(d) for d in d_exclude_list]
else:
d_exclude_list = []
if f_exclude_list is None:
f_exclude_list = []
if ext_exclude_list is None:
ext_exclude_list = []
files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
and not glob_path_match(f, f_exclude_list) and os.path.splitext(f)[1] not in ext_exclude_list]
dirs = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
for d in dirs:
p_root = os.path.join(primary_root, d) if primary_root is not None else d
if not glob_path_match(p_root, d_exclude_list):
files_in_d = get_files_recursive(os.path.join(root, d), d_exclude_list, f_exclude_list, ext_exclude_list,
primary_root=p_root)
if files_in_d:
for f in files_in_d:
files.append(os.path.join(root, f))
return files

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

os.listdir analog for a zipped directory - python

My goal is to list all files contained in the certain sub-directory inside a zip-archive. os.listdir(target_dir) raises a FileNotFoundError, and zfile.namelist() just lists all the files in all directories. Any ideas?

Related

how do I read file from two folder with the same order in python

Finding all subfolders that contain two files that end with certain strings

Trying to exclude a substring within a list in python

Python - Truncate unknown file names

How to traverse through the files in a directory?

Categories

Resources