I've written a function to strip double spaces out of my raw data files:
def fixDat(file):
    '''
    Collapse runs of whitespace in ``<file>.dat`` down to single spaces.

    The cleaned output replaces the original file, and the original is
    kept as ``<file>_original.dat``.

    Parameters
    ----------
    file : str
        Path of the data file, WITHOUT the ``.dat`` extension.
    '''
    import os
    import re
    # Raw string: "\s" in a plain literal is an invalid escape sequence
    # (SyntaxWarning on modern Python). Compile once instead of per line.
    pattern = re.compile(r"\s\s+")
    with open(file + '.dat', 'r') as infile:
        with open(file + '_fixed.dat', 'w') as outfile:
            # Stream line by line instead of readlines(): avoids holding
            # the whole file in memory.
            for line in infile:
                outfile.write(pattern.sub(" ", line))
    # Keep the original under a new name, promote the fixed copy.
    os.rename(file + '.dat', file + '_original.dat')
    os.rename(file + '_fixed.dat', file + '.dat')
I have 19 files in a folder that I need to process with this function, but I'm not sure how to parse the filenames and pass them to the function. Something like
for filename in folder:
fixDat(filename)
but how do I code filename and folder in Python?
If I understand correctly, you are asking about the os module's .walk() functionality, where an example would look like:
import os

# Walk the current working directory tree ("." = the folder the script
# runs from); point this at another path to process files located
# elsewhere. topdown=False visits leaf directories first.
for dirpath, subdirs, filenames in os.walk(".", topdown=False):
    for fname in filenames:
        print(os.path.join(dirpath, fname))
With filename outputs which can be fed to your fixDat() function such as:
./tmp/test.py
./amrood.tar.gz
./httpd.conf
./www.tar.gz
./mysql.tar.gz
./test.py
Note that these are all strings so you could change the script to:
import os

# Recurse through the current directory tree, bottom-up, and hand every
# *.dat file found to fixDat() (defined elsewhere in this file).
# NOTE(review): fixDat() appends '.dat' to its argument itself, so
# passing the full name including the extension may need trimming -- verify.
for dirpath, subdirs, filenames in os.walk(".", topdown=False):
    for fname in filenames:
        if fname.endswith('.dat'):  # or some other extension
            print(os.path.join(dirpath, fname))
            fixDat(os.path.join(dirpath, fname))
Related
I have ~1000 directories, containing various .csv files within them. I am trying to check if a specific type of csv file, containing a filename that begins with PTSD_OCOTBER, exists in each directory.
If this file does not exist in the directory, I want to print out that directory into a .txt file.
Here is what I have so far.
import os,sys,time,shutil
import subprocess
#determine filetype to look for.
file_type = ".csv"
print("Running file counter for" + repr(file_type))
#for each folder in the root directory
# NOTE(review): rootdir is not defined in this snippet -- presumably set
# earlier in the asker's script.
for subdir, dirs, files in os.walk(rootdir):
    # only consider directories whose path mentions "GeneSet"
    if("GeneSet" in subdir):
        folder_name = subdir.rsplit('/', 1)[-1] #get the folder name.
        # NOTE(review): the loop body below is intentionally incomplete --
        # the question is asking how to finish it.
        for f in files:
            #unclear how to write this part.
            #how to tell if no files exist in directory?
This successfully finds the .csv files of interest, but how do I achieve the above?
So files is the list of files in that directory that you are currently walking. You want to know if there are no files that start with PTSD_OCOTBER (PTSD_OCTOBER ?):
# files is the list of filenames in the directory currently being walked.
for subdir, dirs, files in os.walk(rootdir):
    # restrict to the "GeneSet" directories of interest
    if("GeneSet" in subdir):
        folder_name = subdir.rsplit('/', 1)[-1] #get the folder name.
        # True when no file in this directory starts with the prefix
        # (the asker's spelling 'PTSD_OCOTBER' is kept verbatim on purpose)
        dir_of_interest = not any(f.startswith('PTSD_OCOTBER') for f in files)
        if dir_of_interest:
            # do stuff with folder_name
Now you want to save the results into a text file? If you have a Unix-style computer, then you can use output redirection on your terminal, such as
python3 fileanalysis.py > result.txt
after writing print(folder_name) instead of # do stuff with folder_name.
Or you can use Python itself to write the file, such as:
found_dirs = []  # accumulates the names of directories missing the file
for subdir, dirs, files in os.walk(rootdir):
    ...
    if dir_of_interest:
        found_dirs.append(folder_name)
# one directory name per line
with open('result.txt', 'w') as f:
    f.write('\n'.join(found_dirs))
I am trying to find a string that is contained in files under a directory. Then I want to store the matching file names and directories in a new text file or something similar.
I have got up to the point where it goes through a directory, finds the string, and prints a result, but I am not sure of the next step.
Please help, I'm completely new to coding and python.
import glob, os
#Open a source as a file and assign it as source
# NOTE(review): the handle from open() is never closed here.
source = open('target.txt').read()
filedirectories = []
#locating the source file and printing the directories.
os.chdir("/Users/a1003584/desktop")
for root, dirs, files in os.walk(".", topdown=True):
    for name in files:
        print(os.path.join(root, name))
        # NOTE(review): mixes Python 3 print() above with a Python 2
        # print statement below; each file is also re-opened and never closed.
        if source in open(os.path.join(root, name)).read():
            print 'treasure found.'
Don't do a string comparison if you're looking for a dictionary key. Instead, use the json module, like this:
import json
import os

# Accumulates the paths of every JSON file containing the key.
filesFound = []

def searchDir(dirName):
    """Recursively scan *dirName* for JSON files containing "KeySearchedFor".

    Matching file paths are appended to the module-level ``filesFound``.
    """
    for name in os.listdir(dirName):
        # os.path.join handles the separator; the original's dirName+name
        # concatenation silently produced wrong paths without a trailing '/'.
        path = os.path.join(dirName, name)
        # If it is a file. (os.isfile does not exist -- os.path.isfile does.)
        if os.path.isfile(path):
            try:
                # json.load takes a file object, not a path string.
                with open(path) as f:
                    fileCon = json.load(f)
            except (ValueError, OSError):
                # Not JSON, or unreadable: skip it. The original fell
                # through to an unbound fileCon (NameError) here.
                print("None json file.")
                continue
            if "KeySearchedFor" in fileCon:
                filesFound.append(path)
        # If it is a directory.
        else:
            searchDir(path)
# Change this to the directory you're looking in.
# "~" is not expanded by the OS -- expanduser must do it. The trailing
# slash keeps the original searchDir's string concatenation correct.
searchDir(os.path.expanduser("~/Desktop/"))
# write() needs a string, not a list, and the handle should be closed:
# join the collected paths with newlines inside a context manager.
with open(os.path.expanduser("~/Desktop/OutFile.txt"), 'w') as out:
    out.write('\n'.join(filesFound))
This should write the output to a csv file
import csv
import os

# The text we are searching for inside each file.
with open('target.txt') as infile:
    source = infile.read()

# newline='' stops the csv module doubling line endings on Windows.
with open("output.csv", 'w', newline='') as fout:
    outfile = csv.writer(fout)
    outfile.writerow("Directory FileName FilePath".split())
    for root, dirnames, fnames in os.walk("/Users/a1003584/desktop", topdown=True):
        for fname in fnames:
            path = os.path.join(root, fname)
            # errors='replace' keeps non-text files from raising
            # UnicodeDecodeError and aborting the whole scan.
            with open(path, errors='replace') as infile:
                if source not in infile.read():
                    continue
            # writerow takes ONE sequence -- the original passed three
            # positional arguments, which raises TypeError.
            outfile.writerow([root, fname, path])
I have a zip file which contains three zip files in it like this:
zipfile.zip\
dirA.zip\
a
dirB.zip\
b
dirC.zip\
c
I want to extract all the inner zip files that are inside the zip file in directories with these names (dirA, dirB, dirC).
Basically, I want to end up with the following schema:
output\
dirA\
a
dirB\
b
dirC\
c
I have tried the following:
import os, re
from zipfile import ZipFile
# NOTE(review): several names below (directory, data, pattern, filename,
# self.archive_name) are not defined in this snippet -- they come from
# the asker's surrounding code.
os.makedirs(directory) # where directory is "\output"
with ZipFile(self.archive_name, "r") as archive:
    for id, files in data.items():
        if files:
            print("Creating", id)
            dirpath = os.path.join(directory, id)
            os.mkdir(dirpath)
            for file in files:
                match = pattern.match(filename)
                new = match.group(2)
                new_filename = os.path.join(dirpath, new)
                # Reads the inner member's raw bytes and writes them out
                # verbatim -- this copies the .zip file itself rather than
                # extracting its contents (the asker's reported symptom).
                content = archive.open(file).read()
                with open(new_filename, "wb") as outfile:
                    outfile.write(content)
But it only extracts the zip file and I end up with:
output\
dirA\
dirA.zip
dirB\
dirB.zip
dirC\
dirC.zip
Any suggestions including code-segments will be much appreciated cause I have tried so many different things and read the docs without success.
When extracting the zip file, you would want to write the inner zip files to memory instead of them on disk. To do this, I've used BytesIO.
Check out this code:
import os
import io
import zipfile

def extract(filename):
    """Extract every inner zip of *filename* into a directory named after it.

    e.g. ``dirA.zip`` inside the archive is unpacked into ``./dirA/``.
    The inner archives are read into an in-memory buffer (BytesIO), so
    only the final extracted files ever touch the disk.
    """
    with zipfile.ZipFile(filename) as outer:
        for member in outer.namelist():
            # directory name = member name without its extension
            dirname = os.path.splitext(member)[0]
            # exist_ok: os.mkdir raised FileExistsError on a re-run
            os.makedirs(dirname, exist_ok=True)
            # read inner zip file into a bytes buffer
            buffer = io.BytesIO(outer.read(member))
            with zipfile.ZipFile(buffer) as inner:
                for item in inner.namelist():
                    inner.extract(item, dirname)
If you run extract("zipfile.zip") with zipfile.zip as:
zipfile.zip/
dirA.zip/
a
dirB.zip/
b
dirC.zip/
c
Output should be:
dirA/
a
dirB/
b
dirC/
c
For a function that extracts a nested zip file (any level of nesting) and cleans up the original zip files:
import zipfile, re, os

def extract_nested_zip(zippedFile, toFolder):
    """Unpack *zippedFile* into *toFolder*, then do the same for every
    .zip file found in the extracted tree (any nesting depth).

    Each archive is deleted once its contents have been extracted.
    """
    # Unpack this archive, then discard it.
    with zipfile.ZipFile(zippedFile, 'r') as archive:
        archive.extractall(path=toFolder)
    os.remove(zippedFile)
    # Any .zip that just appeared (or was already present) gets the same
    # treatment, extracted next to where it was found.
    for base, _, names in os.walk(toFolder):
        for name in names:
            if re.search(r'\.zip$', name):
                extract_nested_zip(os.path.join(base, name), base)
I tried some of the other solutions but couldn't get them to work "in place". I'll post my solution to handle the "in place" version. Note: it deletes the zip files and 'replaces' them with identically named directories, so back up your zip files if you want to keep them.
Strategy is simple. Unzip all zip files in the directory (and subdirectories) and rinse and repeat until no zip files remain. The rinse and repeat is needed if the zip files contain zip files.
import os
import io
import zipfile
import re
def unzip_directory(directory):
    """Unzip (and then delete) every .zip file under *directory*, recursively.

    Each archive is extracted into a sibling directory named after the
    archive (``foo.zip`` -> ``foo/``).
    """
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if re.search(r'\.zip$', filename):
                # Strip only the trailing ".zip": the original's
                # split('.zip')[0] also truncated at a ".zip" occurring
                # in the middle of the name (e.g. "data.zip.backup.zip").
                to_path = os.path.join(root, filename[:-4])
                zipped_file = os.path.join(root, filename)
                if not os.path.exists(to_path):
                    os.makedirs(to_path)
                with zipfile.ZipFile(zipped_file, 'r') as zfile:
                    zfile.extractall(path=to_path)
                # deletes zip file
                os.remove(zipped_file)
def exists_zip(directory):
    """Return True if any .zip file exists under *directory*, recursively."""
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if re.search(r'\.zip$', filename):
                # Found one -- stop immediately. The original kept
                # walking the entire tree before returning.
                return True
    return False
def unzip_directory_recursively(directory, max_iter=1000):
    """ Calls unzip_directory until all contained zip files (and new ones from previous calls)
    are unzipped
    """
    # The docstring above was originally placed AFTER the print below,
    # which made it a no-op string expression rather than a docstring.
    print("Does the directory path exist? ", os.path.exists(directory))
    iterate = 0
    # Rinse and repeat: each pass may expose new zips nested inside the
    # ones just extracted. max_iter guards against pathological input.
    while exists_zip(directory) and iterate < max_iter:
        unzip_directory(directory)
        iterate += 1
    pre = "Did not " if iterate < max_iter else "Did"
    print(pre, "time out based on max_iter limit of", max_iter, ". Took iterations:", iterate)
Assuming your zip files are backed up, you make this all work by calling unzip_directory_recursively(your_directory).
This works for me. Just place this script in the same directory as the nested zip. It will extract the zip into a directory with the same name as the original zip and clean up the original zip. It will also count the total number of files within the nested zip.
import os
from zipfile import ZipFile

def unzip(path, total_count):
    """Walk *path*, extracting every .zip into a same-named directory,
    deleting the archive, and recursing into the extracted content.

    Returns *total_count* plus the number of non-zip files encountered.
    """
    for folder, _, names in os.walk(path):
        for entry in names:
            full = os.path.join(folder, entry)
            if full.endswith('.zip'):
                target = full[:-4]  # drop the ".zip" suffix
                if not os.path.exists(target):
                    os.makedirs(target)
                with ZipFile(full) as archive:
                    archive.extractall(target)
                os.remove(full)
                # The freshly extracted tree may itself contain zips.
                total_count = unzip(target, total_count)
            else:
                total_count += 1
    return total_count
# Kick off the recursive unzip from the current directory; the returned
# value is the total number of non-zip files encountered.
total_count = unzip ('.', 0)
print(total_count)
I want to read HTML files in python. Normaly I do it like this (and it works):
import codecs
# Open the HTML file for reading.
f = codecs.open("test.html",'r')
# Python 2 print statement -- this snippet predates Python 3.
print f.read()
The problem is that my HTML files are not all in the same folder, since I have a program which generates these HTML files and saves them into folders that are inside the folder where my reading script is located.
Summarizing: my script is in a folder, and inside this folder there are more folders containing the generated HTML files.
Does anybody know how can I proceed?
import os
import codecs

# Walk from the script's directory down through every sub-directory,
# printing the contents of each .html file found.
for root, dirs, files in os.walk("./"):
    for name in files:
        # os.path.join handles separators portably (the original
        # concatenated root + '/' + name by hand).
        abs_path = os.path.normpath(os.path.join(root, name))
        file_name, file_ext = os.path.splitext(abs_path)
        if file_ext == '.html':
            # 'with' closes the handle -- the original leaked one per
            # file -- and print() replaces the Python 2 print statement.
            with codecs.open(abs_path, 'r') as f:
                print(f.read())
This will walk through <script dir>/ (./ will get translated to your script-directory) and loop through all files in each sub-directory.
It will check if the extension is .html and do the work on each .html file.
You would perhaps define more file endings that are "accepted" (for instance .htm).
use os.walk:
import os,codecs

# Recursively find every .html file under /mydir and print its contents.
for root, dirs, files in os.walk("/mydir"):
    for file in files:
        if file.endswith(".html"):
            # Context manager closes the handle (the original never
            # closed it); print() replaces the Python 2 print statement.
            with codecs.open(os.path.join(root, file), 'r') as f:
                print(f.read())
I'd like to get a path to an arbitrary text file (with .txt suffix) which is present somewhere in the directory tree. The file should not be hidden or in hidden directory.
I tried to write the code, but it looks a little cumbersome. How would you improve it to avoid useless steps?
def getSomeTextFile(rootDir):
    """Return the path of an arbitrary, non-hidden .txt file under rootDir.

    Hidden files and files inside hidden directories (any component
    starting with a dot) are skipped. Returns "" when no such file exists.
    """
    for root, dirs, files in os.walk(rootDir):
        # Prune hidden directories in place so os.walk never descends
        # into them -- cheaper and portable, unlike the original's
        # POSIX-only '"/." in path' substring test.
        dirs[:] = [d for d in dirs if not d.startswith(".")]
        for f in files:
            if f.startswith("."):
                continue  # hidden file
            # splitext is more robust than path.split(".")[-1]: a name
            # without any dot would otherwise "match" its whole path.
            if os.path.splitext(f)[1].lower() == ".txt":
                return os.path.join(root, f)
    return ""  # there isn't any text file
Using os.walk (like in your example) is definitely a good start.
You can use fnmatch (link to the docs here) to simplify the rest of the code.
E.g:
...
# fnmatch.fnmatch matches a filename against a shell-style wildcard
# pattern such as '*.txt'.
if fnmatch.fnmatch(file, '*.txt'):
    print file  # Python 2 print statement
...
I'd use fnmatch instead of string manipulations.
import os, os.path, fnmatch

def find_files(root, pattern, exclude_hidden=True):
    """Yield the path of every file under *root* whose name matches
    *pattern* (a shell-style wildcard such as ``*.txt``).

    When exclude_hidden is true (the default), any path containing a
    hidden component ("/.") is skipped.
    """
    for current, _, filenames in os.walk(root):
        for name in fnmatch.filter(filenames, pattern):
            candidate = os.path.join(current, name)
            if not exclude_hidden or '/.' not in candidate:
                yield candidate
I've also rewritten the function to be more generic (and "pythonic"). To get just one pathname, call it like this:
first_txt = next(find_files(some_dir, '*.txt'))