Python: work with recursive folders to read and write - python

I have this code:
# cwd = "C:\Users\johnr\Desktop\myFolder" - current working directory
for filename in os.listdir(os.path.join(cwd, "content")):
header_file = open(header_file_dir, "r")
footer_file = open(footer_file_dir, "r")
if ".md" in filename:
newFilename = filename.replace(".md", ".html")
if ".tile" in filename:
newFilename = filename.replace(".tile", ".html")
elif ".html" in filename:
newFilename = filename
elif ".txt" in filename:
newFilename = filename.replace(".txt", ".html")
else:
print(filename+" is not a valid file type!")
currents_working_file = open(os.path.join(cwd, "build", newFilename), "w")
# Write the header
currents_working_file.write(header_file.read())
# Get the actual stuff we want to put on the page
text_content = open(os.path.join(cwd, "content", filename), "r")
if ".md" in filename:
text_cont1 = "\n"+markdown.markdown(text_content.read())+"\n"
elif ".tile" in filename:
text_cont1 = "\n"+textile.textile(text_content.read())+"\n"
elif ".html" in filename:
text_cont1 = text_content.read()
elif ".txt" in filename:
text_cont1 = text_content.read()
else:
print(filename+" is not a valid file type!")
# Write the text content into the content template and onto the build file
content_templ_dir = os.path.join(cwd, "templates", "content_page.html")
if os.path.exists(content_templ_dir):
content_templ_file = open(content_templ_dir, "r")
content_templ_file1 = content_templ_file.read()
content_templ_file2 = content_templ_file1.replace("{page_content}", text_cont1)
currents_working_file.write(content_templ_file2)
else:
currents_working_file.write(text_cont1)
# Write the footer to the build file
currents_working_file.write("\n"+footer_file.read())
# Close the build file
currents_working_file.close()
which searches for a file in the 'content' directory and then creates a file of the same name in the'build' directory. How can I make this work when there are files in folders in the 'content' directory?

In order to recursively traverse directories, Python provides os.walk:
for root, dirs, files in os.walk(os.path.join(cwd, "content")):
relative_path = os.path.relpath(root, os.path.join(cwd, "content"))
for filename in files:
currents_working_file = open(os.path.join(cwd, "build", relative_path, filename), "w")

Assuming that cwd just holds the path to the current working dir:
from pathlib import Path
from itertools import chain
source_extensions = {'md', 'html', 'txt'}
source_root_dir_path = Path("content")
source_file_paths = chain.from_iterable(
source_root_dir_path.glob("**/*.{}".format(ext)) for ext in source_extensions
)
for p in source_file_paths:
destination_file_path = Path("build", *p.with_suffix(".html").parts[1:])
destination_file_path.parent.mkdir(parents=True, exist_ok=True)
with destination_file_path.open('w') as f:
f.write(header_file.read())
f.write("\n")
f.write(footer_file.read())

Related

How can I convert a PureWindowsPath to a Iterable?

I'm working on an academy project to encrypt some files I have managed to encrypt all files from one folder but when there is a folder into that folder i get errors so i decide to first list all files and sub-directories of the folder:
ROOT = r"C:\Users\Practiques\Desktop\archivos"
for path, subdirs, files in os.walk(ROOT):
for name in files:
pure_path = PurePath(path, name)
print (pure_path)
With this code I get the paths in that form: C:\Users\XXX\Desktop\archivos\external-content.duckduckgo.com.jpg
C:\Users\XXX\Desktop\archivos\hola.txt
and then when i try to pass to the function 'encrypt', i get this error:
TypeError: 'PureWindowsPath' object is not iterable
The format I need to pass to the function is this: ['C:\Users\XXX\Desktop\archivos\external-content.duckduckgo.com.jpg', 'C:\Users\XXX\Desktop\archivos\hola.txt', etc.]
I think one possible solution is to make a list when i obtain all recursive path and their files, but i don't know how to do that.
The function encrypt:
def encrypt(items, key):
f = Fernet(key)
for item in items:
with open(item, 'rb') as file:
file_data = file.read()
encrypted_data = f.encrypt(file_data)
with open(item, 'wb') as file:
file.write(encrypted_data)
How i call it:
for path, subdirs, files in os.walk(ROOT):
for name in files:
pure_path = PurePath(path, name)
print (pure_path)
encrypt(pure_path, key)
You need to use recursion to encrypt the sub-folders' contents:
import os
def recursive_search(path: str) -> "list[str]":
"""get all files from an absolute path
:param path: absolute path of the directory to search
:type path: str
:return: a list of all files
:rtype: list[str]
"""
found_files = []
if not os.path.isdir(path):
raise RuntimeError(f"'{path}' is not a directory")
for item in os.listdir(path):
full_path = os.path.join(path, item)
if os.path.isfile(full_path):
found_files.append(full_path)
elif os.path.isdir(full_path):
found_files.extend(recursive_search(full_path))
return found_files
directory = "YOUR ROOT/TOP-LEVEL DIRECTORY HERE"
print(recursive_search(directory))
Then, you would do:
f = Fernet(key)
for item in recursive_search(directory):
with open(item, 'rb') as file:
file_data = file.read()
encrypted_data = f.encrypt(file_data)
with open(item, 'wb') as file:
file.write(encrypted_data)
Edit 1: in regard to skipping over certain file extensions:
import os
def recursive_search(path: str) -> "list[str]":
"""get all files from an absolute path
:param path: absolute path of the directory to search
:type path: str
:return: a list of all files
:rtype: list[str]
"""
found_files = []
if not os.path.isdir(path):
raise RuntimeError(f"'{path}' is not a directory")
for item in os.listdir(path):
full_path = os.path.join(path, item)
dot_extension = os.path.splitext(full_path)[1] # ex.: '.txt'
if os.path.isfile(full_path):
if dot_extension == ".ini":
continue # this tells python to skip to break the for-loop run on the current item in an iterable and go to the next one
found_files.append(full_path)
elif os.path.isdir(full_path):
found_files.extend(recursive_search(full_path))
return found_files
directory = "/Users/nicholasbarrow/GitHub/com.nicholasrbarrow.cpp"
print(recursive_search(directory))
f = Fernet(key)
for item in recursive_search(directory):
with open(item, 'rb') as file:
file_data = file.read()
encrypted_data = f.encrypt(file_data)
with open(item, 'wb') as file:
file.write(encrypted_data)

How to show filenames in csv format

I am fetching file names from my local file directory and there are 411 items there but after printing it shows me file names and I am not able to count them in python and second problem is by saving in CSV format, it doesn't show me all the file names? Where is my mistake?
import os
FOLDER_PATH = '/home/bilal/Documents/Books/English/'
def listDir(dir):
fileNames = os.listdir(dir)
for fileName in fileNames:
my_file = open('my_file.csv', 'w')
my_file.write(fileName)
# print('File Name: ' + fileName)
if __name__ == '__main__':
listDir(FOLDER_PATH)
import glob
path = r"/home/bilal/Documents/Books/English/*.csv"
my_file = open('my_file.csv', 'w')
fileList = list(map(os.path.basename, glob.glob(path)))
for filename in fileList:
print(filename)
my_file.write(filename)
my_file.close()
or
import glob
path = r"/home/bilal/Documents/Books/English/*.csv"
with open('my_file.csv', 'w') as my_file:
fileList = list(map(os.path.basename, glob.glob(path)))
for filename in fileList:
print(filename)
my_file.write(filename)

Check list if file has downloaded and skip if it has?

I am new to Python and sure the below can be optimised however I have ran in to an issue with my last step in my script.
The aim is not to download a file if it has been previously downloaded. At this time I log the download in a file called download_history.log
I need to therefore implement a check here to kind of do the following check the log - if it exists in log do nothing and move to next file if it does not exists download the file and log it in to the file.
Any help would be appreciated.
#!/usr/bin/env python3
import boto
import sys, os
import zipfile
import shutil
import glob
import re
from boto.s3.key import Key
from boto.exception import S3ResponseError
#Make the download files
DOWNLOAD_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Downloads/"
if not os.path.exists(DOWNLOAD_LOCATION_PATH):
print ("Making download directory")
os.mkdir(DOWNLOAD_LOCATION_PATH)
#Delete Output Folder if it exsists
OUTPUT_FOLDER = os.path.expanduser("~") + "/AWSSplunk/Output/"
shutil.rmtree(OUTPUT_FOLDER)
#Define the AWS Bucket
def backup_s3_folder():
BUCKET_NAME = "my-bucket-name"
AWS_ACCESS_KEY_ID= os.getenv("##################")
AWS_ACCESS_SECRET_KEY = os.getenv("#########################")
conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_ACCESS_SECRET_KEY)
bucket = conn.get_bucket(BUCKET_NAME)
#goto through the list of files
bucket_list = bucket.list()
for l in bucket_list:
key_string = str(l.key)
s3_path = DOWNLOAD_LOCATION_PATH + key_string
try:
# Add files to the log file
print ("Downloading file ", key_string)
file_object = open('download_history.log', 'a')
file_object.write(key_string)
file_object.write("\n")
# Working code
file_object.close()
l.get_contents_to_filename(s3_path)
except (OSError,S3ResponseError) as e:
pass
# check if the file has been downloaded locally
if not os.path.exists(s3_path):
try:
os.makedirs(s3_path)
except OSError as exc:
# let guard againts race conditions
import errno
if exc.errno != errno.EEXIST:
raise
if __name__ == '__main__':
backup_s3_folder()
# Start the unzipping process
print("Unzipping Starting")
dir_path = os.path.expanduser("~") + "/AWSSplunk/Downloads/"
for path, dir_list, file_list in os.walk(dir_path):
for file_name in file_list:
if file_name.endswith(".zip"):
abs_file_path = os.path.join(path, file_name)
parent_path = os.path.split(abs_file_path)[0]
output_folder_name = os.path.splitext(abs_file_path)[0]
output_path = os.path.join(parent_path, output_folder_name)
zip_obj = zipfile.ZipFile(abs_file_path, 'r')
zip_obj.extractall(output_path)
zip_obj.close()
print("Unzipping Completed")
# Start moving files to output
print("Moving Files")
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
if not os.path.exists(FILE_LOCATION_PATH):
print ("Making download directory")
os.mkdir(FILE_LOCATION_PATH)
# .log files move
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith('.log'):
count = 1
destination_file = os.path.join(FILE_LOCATION_PATH, file)
while os.path.exists(destination_file):
destination_file = os.path.join(FILE_LOCATION_PATH, f"{file}_{count}")
count += 1
shutil.move(os.path.join(root, file), destination_file)
# .txt files move
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith('.txt'):
count = 1
destination_file = os.path.join(FILE_LOCATION_PATH, file)
while os.path.exists(destination_file):
destination_file = os.path.join(FILE_LOCATION_PATH, f"{file}_{count}")
count += 1
shutil.move(os.path.join(root, file), destination_file)
# .json files move
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith('.json'):
count = 1
destination_file = os.path.join(FILE_LOCATION_PATH, file)
while os.path.exists(destination_file):
destination_file = os.path.join(FILE_LOCATION_PATH, f"{file}_{count}")
count += 1
shutil.move(os.path.join(root, file), destination_file)
print("Files Move Complete")
# Delete Directory
print("Cleaning up Downloads Directory")
shutil.rmtree(DOWNLOAD_LOCATION_PATH)
# Remove EFR Audit Logs stratinbg with 2020
print("Remove the encrypted Audit Logs")
pattern = "^(2020)"
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
for root, dirs, files in os.walk(FILE_LOCATION_PATH):
for file in filter(lambda x: re.match(pattern, x), files):
os.remove(os.path.join(root, file))
# Remove EFR Audit Logs stratinbg with EFR
pattern = "^(EFR)"
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
for root, dirs, files in os.walk(FILE_LOCATION_PATH):
for file in filter(lambda x: re.match(pattern, x), files):
os.remove(os.path.join(root, file))
# Remove EFR Audit Logs stratinbg with 2019
pattern = "^(2019)"
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
for root, dirs, files in os.walk(FILE_LOCATION_PATH):
for file in filter(lambda x: re.match(pattern, x), files):
os.remove(os.path.join(root, file))
# Script clean up
print("Script Complete")
#with open("download_history.log", "a") as myfile:
# myfile.write('New Line\n')
With os you can check whether a file exist or not:
if not os.isfile(PATH_TO_EXPECTED_DOWNLOADED_FILE):
#do download
For your own security please seperate your steps into functions and build a pipeline of these.

Python Algorithim Skipping the Search of New Files being added a Folder

Good day to everyone, my python code below converts every doc,docx,and rtf file to .txt which is awesome. When i hadd a new file to the directory, Ex.(rootdir) in code. My code finds the file and converts it perfectly. However, if i add the same exact file in a subdirectory of rootdir it does not pick up the new file being added. My question is what can i do different in my code or in general so that files being added in any subdirectory or main directory(rootdir) will be picked up and converted.
#RTF,DOCX,DOC TO TEXT
import win32com.client
import os
import re
import traceback
from os import listdir
from docx import Document
import shutil
import glob
rootdir = r'C:\Users\aac1928\Desktop\Test'
searchdir = rootdir + '\Search'
namedir = 'Search'
searchlist = []
dirlist = []
app = win32com.client.Dispatch('Word.Application')
app.Visible = False
app.DisplayAlerts = False
#Creates The search folder for text search in the directory
if os.path.exists(searchdir) == False:
os.mkdir(searchdir)
print((searchdir + " Has been created"))
#Modifies all word doc file types to .TXT
try:
for root, dirs, files in os.walk(rootdir):
for file in files:
fullpath = os.path.join(*[root, file])
if file.endswith(".docx"):
out_name = file.replace("docx", r"txt")
in_file = os.path.join(*[root, file])
out_file = os.path.abspath(root + "\\" + out_name)
doc = app.Documents.Open(in_file)
content = doc.Content.Text
print((file), out_file)
doc.SaveAs(out_file, FileFormat=7)
doc.Close()
if file.endswith(".doc"):
out_name = file.replace("doc", r"txt")
in_file = os.path.join(*[root, file])
out_file = os.path.abspath(root + "\\" + out_name)
doc = app.Documents.Open(in_file)
content = doc.Content.Text
print((file), out_file)
doc.SaveAs(out_file, FileFormat=7)
doc.Close()
if file.endswith(".rtf"):
out_name = file.replace("rtf", r"txt")
in_file = os.path.join(*[root, file])
out_file = os.path.abspath(root + "\\" + out_name)
doc = app.Documents.Open(in_file)
content = doc.Content.Text
print((file), out_file)
doc.SaveAs(out_file, FileFormat=7)
except:
pass
if os.path.exists(searchdir) == True:
print('Search file is Present')
for root, dirs, files in os.walk(searchdir, onerror=None):
for filename in files:
searchlist.append(os.path.splitext(filename)[0])
try:
for root, dirs, files in os.walk(rootdir):
if namedir in dirs:
dirs.remove(namedir)
for filename in files:
if (os.path.splitext(filename)[0]) not in searchlist:
print(filename)
#for filename in filenames:
fullpath = os.path.join(*[root, filename])
if filename.endswith(".docx"):
out_name = filename.replace("docx", r"txt")
in_filename = os.path.join(*[root, filename])
out_filename = os.path.abspath(root + "\\" + out_name)
doc = app.Documents.Open(in_filename)
content = doc.Content.Text
print((filename), out_filename)
doc.SaveAs(out_filename, FileFormat=7)
doc.Close()
if filename.endswith(".doc"):
out_name = filename.replace("doc", r"txt")
in_filename = os.path.join(*[root, filename])
out_filename = os.path.abspath(root + "\\" + out_name)
doc = app.Documents.Open(in_filename)
content = doc.Content.Text
print((filename), out_filename)
doc.SaveAs(out_filename, FileFormat=7)
doc.Close()
if filename.endswith(".rtf"):
out_name = filename.replace("rtf", r"txt")
in_filename = os.path.join(*[root, filename])
out_filename = os.path.abspath(root + "\\" + out_name)
doc = app.Documents.Open(in_filename)
content = doc.Content.Text
print((filename), out_filename)
doc.SaveAs(out_filename, FileFormat=7)
doc.Close()
except:
pass
else:
print("")
app.Quit()
#Moves the Converted Txt Files to The Search Folder
try:
for root, dirs, files in os.walk(rootdir):
for file in files:
for filename in file:
if namedir in dirs:
dirs.remove(namedir)
if file.endswith('.txt'):
shutil.move(os.path.join(root, file), os.path.join(searchdir, file))
break
except (IOError, OSError): # ignore read and permission errors
pass
here is solution
import os
import shutil
def absoluteFilePaths(directory):
for dirpath,_,filenames in os.walk(directory):
for f in filenames:
yield os.path.abspath(os.path.join(dirpath, f))
rootdir = r'C:\Users\aac1928\Desktop\Test'
file_names = list(absoluteFilePaths(rootdir))
extensions = ['doc', 'docs', 'rtf']
for i in file_names:
file_name, extension = i.split('.')
if extension in extensions ans os.path.exists(i):
new_file_name = file_name+ '.txt'
shutill.move(i, new_file_name)

Going into subfolders (python)

I've written something to remove special characters in Filenames. But it just includes the one folder and not it's subfolders. How can I do this also in subfolders and subsubfolders and so on?
import os
import re
def dir_list2(directory, *args):
fileList = []
content = os.listdir(directory)
for file in content :
dirfile = os.path.join(directory, file)
if os.path.isfile(dirfile):
if len(args) == 0:
fileList.append(dirfile)
else:
if os.path.splitext(dirfile)[1][1:] in args:
fileList.append(dirfile)
print "##################################################"
print "Old filename:", file
filename = file
remove = re.compile("[^.a-zA-z0-9_]")
output = remove.sub('_', filename)
newfile = directory + "/" + output
os.rename(dirfile, newfile)
print "Corrected filename:", output
#Removes Special Characters
return fileList
if __name__ == '__main__':
fileList = dir_list2('/path/')
Try using os.walk instead of os.listdir, it allows you to walk through a folder and its files and subfolders and so on.
Edit your code to be like:
content = os.walk(directory)
for dirpath, dirnames, filenames in content:
for file in filenames:
dirfile = os.path.join(dirpath, file)
# The rest of your code

Categories