python unzip files below the root folder

python unzip files below the root folder - python

i would like to unzip all the folders and files of an archive below the root folder, i have archive named abc.zip which gives me files as abc/xyz/ abc/123.jpg abc/xyz1/ , i just want to extract xyz/ , 123.jpg and xyz1/ in the CWD
i use below code to extract a file, but would need help on how to omit the root folder of the list
def unzip_artifact( local_directory, file_path ):
fileName, ext = os.path.splitext( file_path )
if ext == ".zip":
Downloadfile = basename(fileName) + ext
print 'unzipping file ' + Downloadfile
try:
zipfile.ZipFile(file_path).extractall(local_directory)
except zipfile.error, e:
print "Bad zipfile: %s" % (e)
return

You have to use a more complex (and therefore more customizable) way to unzip. Instead of using the 'extractall' method, you must extract each files separately with the 'extract' method. Then you will be able to change the destination directory, omitting archive's sub-directories.
Here is your code with the modification you needed :
def unzip_artifact( local_directory, file_path ):
fileName, ext = os.path.splitext( file_path )
if ext == ".zip":
Downloadfile = fileName + ext
print 'unzipping file ' + Downloadfile
try:
#zipfile.ZipFile(file_path).extractall(local_directory) # Old way
# Open the zip
with zipfile.ZipFile(file_path) as zf:
# For each members of the archive
for member in zf.infolist():
# If it's a directory, continue
if member.filename[-1] == '/': continue
# Else write its content to the root
with open(local_directory+'/'+os.path.basename(member.filename), "w") as outfile:
outfile.write(zf.read(member))
except zipfile.error, e:
print "Bad zipfile: %s" % (e)
return

Related

How to copy only non-duplicate files whilst maintaining folder structure?

I am trying to find duplicates between two folders and copy only unique image files to the 'dest' folder. I can copy all the non-dupes using the code below, however it doesn't maintain the source directory structure. I think OS.walk returns 3 tuples, but they aren't linked so not sure how to re-construct the sub dir?
Example:
import shutil, os
from difPy import dif
source = input('Input source folder:')
dest = input('Input backup \ destination folder:')
ext = ('.jpg','.jpeg','.gif','.JPG','.JPEG','.GIF')
search = dif(source, dest)
result = search.result
result
dupes = []
srcfiles = []
filecount = []
failed = []
removed = []
for i in result.values():
dupes.append(i['location'])
for dirpath, subdirs, files in os.walk(source):
for x in files:
if x.endswith(ext):
srcfiles.append(os.path.join(dirpath, x))
for f in srcfiles:
if f not in dupes:
shutil.copy(f, dest)
print('File copied successfully - '+f)
filecount.append(f)
else:
print('File not copied successfully !!!! - '+f)
failed.append(f)
I have also tried using the shutil.copytree function with an ignore list, however it requires a new folder and can't get the ignore list function to work
shutil.copytree example:
for i in result.values():
df = []
df.append(i['filename'])
def ignorelist(source, df):
return [f for f in df if os.path.isfile(os.path.join(source, f))]
shutil.copytree(source, destnew, ignore=ignorelist)

This function ignorelist should do the trick:
import shutil, os
from difPy import dif
source = input('Input source folder:')
dest = input('Input backup \ destination folder:')
ext = ('.jpg','.jpeg','.gif')
search = dif(source, dest)
dupes = [value['location'] for value in search.result.values()]
def ignorelist(source, files):
return [file for file in files
if (os.path.isfile(os.path.join(source, file))
and (os.path.join(source, file) in dupes
or not file.lower().endswith(ext)))]
shutil.copytree(source, dest, ignore=ignorelist)
And the other "more manual" way would be
import shutil, os
from difPy import dif
source = input('Input source folder:').rstrip('/\\')
dest = input('Input backup \ destination folder:').rstrip('/\\')
ext = ('.jpg','.jpeg','.gif')
search = dif(source, dest)
dupes = [value['location'] for value in search.result.values()]
srcfiles = []
copied = []
failed = []
skipped = []
for dirpath, subdirs, files in os.walk(source):
for file in files:
if file.lower().endswith(ext):
srcfile = os.path.join(dirpath,file)
srcfiles.append(srcfile)
if srcfile in dupes:
print('File not copied (duplicate) - '+srcfile)
skipped.append(srcfile)
else:
try:
destfile = os.path.join(dest,srcfile[len(source)+1:])
os.makedirs(os.path.dirname(destfile), exist_ok=True)
shutil.copy(srcfile,destfile)
print('File copied successfully - '+srcfile)
copied.append(srcfile)
except Exception as err:
print('File not copied (error %s) - %s' % (str(err),srcfile))
failed.append(f)

I have changed some variable names to make them more descriptive. And what you call failed is really just a list of files that are not copied because they are duplicates rather than files whose copying was attempted but failed.
import shutil, os
from difPy import dif
source = input('Input source folder: ')
dest = input('Input backup \ destination folder: ')
# Remove trailing path separators if they exist:
if source.endswith(('/', '\\')):
source = source[:-1]
if dest.endswith(('/', '\\')):
dest = dest[:-1]
# Use the correct path separator to
# ensure correct matching with dif results:
if os.sep == '/':
source = source.replace('\\', os.sep)
elif os.sep == '\\':
source = source.replace('/', os.sep)
source_directory_length = len(source) + 1
ext = ('.jpg','.jpeg','.gif','.JPG','.JPEG','.GIF')
search = dif(source, dest)
result = search.result
# Set comprehension:
dupes = {duplicate['location'] for duplicate in result.values()}
copied = []
not_copied = []
for dirpath, subdirs, files in os.walk(source):
for file in files:
if file.endswith(ext):
source_path = os.path.join(dirpath, file)
if source_path not in dupes:
# get subdirectory of source directory that this file is in:
file_length = len(file) + 1
# Get subdirectory relative to the source directory:
subdirectory = source_path[source_directory_length:-file_length]
if subdirectory:
dest_directory = os.path.join(dest, subdirectory)
# ensure directory exists:
os.makedirs(dest_directory, exist_ok=True)
else:
dest_directory = dest
dest_path = os.path.join(dest_directory, file)
shutil.copy(source_path, dest_path)
print('File copied successfully -', source_path)
copied.append(source_path)
else:
print('File not copied -', source_path)
not_copied.append(source_path)

Os.rename Files Overwrite

I am making a Python project that renames multiple files. However, sometimes the files overwrite.
suffixes = ['.pdf', '.epub', '.mobi']
file_list = []
def change_fname(dir_name, part=' (z-lib.org)', action='remove'):
fnames = os.listdir(dir_name)
for suffix in suffixes:
fnames_suffix = [f for f in fnames if f.endswith(suffix)]
for fname in fnames_suffix:
print(f'{action} "{part}" into/from "{fname}"')
if action == 'remove' and fname.endswith(part+suffix):
new_name = fname[:-len(suffix) - len(part)] + suffix
print(f'fname is {fname}')
elif action == 'insert':
new_name = fname[:-len(suffix)] + part + suffix
else:
raise Exception(f'Unknown Action: {action}')
print(new_name)
old_file = os.path.join(dir_name, fname)
new_file = os.path.join(dir_name, new_name)
os.rename(old_file, new_file)
file_to_show = '/Users/ChrisHart/Downloads/test i love you daddy/'
subprocess.call(["open", "-R", file_to_show])
if __name__ == '__main__':
dir_name = '/Users/ChrisHart/Downloads/test i love you daddy/'
try:
change_fname(dir_name, part=' (z-lib.org)', action='remove')
except Exception as ex:
print(ex)
This is my program ^
file (part).pdf
file.pdf
The file will delete " (part)", so we get this
file.pdf
file.pdf
And they overwrite.
file.pdf
How can I fix this overwriting?

I also wrote a script that changes multiple files. Maybe my code helps you understand your problem:
import os
print(os.getcwd()) #Gives you your current directory
os.chdir('/PATH/TO/FILES') #Change directory to the files
for i in os.listdir('/PATH/TO/FILES'):
os.rename(i, i.replace('(z-lib.org)', ' ')) #replaces z-lib with one whitespace
print(i)
I know what you're trying to replace :D ... I did the same thing

Check list if file has downloaded and skip if it has?

I am new to Python and sure the below can be optimised however I have ran in to an issue with my last step in my script.
The aim is not to download a file if it has been previously downloaded. At this time I log the download in a file called download_history.log
I need to therefore implement a check here to kind of do the following check the log - if it exists in log do nothing and move to next file if it does not exists download the file and log it in to the file.
Any help would be appreciated.
#!/usr/bin/env python3
import boto
import sys, os
import zipfile
import shutil
import glob
import re
from boto.s3.key import Key
from boto.exception import S3ResponseError
#Make the download files
DOWNLOAD_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Downloads/"
if not os.path.exists(DOWNLOAD_LOCATION_PATH):
print ("Making download directory")
os.mkdir(DOWNLOAD_LOCATION_PATH)
#Delete Output Folder if it exsists
OUTPUT_FOLDER = os.path.expanduser("~") + "/AWSSplunk/Output/"
shutil.rmtree(OUTPUT_FOLDER)
#Define the AWS Bucket
def backup_s3_folder():
BUCKET_NAME = "my-bucket-name"
AWS_ACCESS_KEY_ID= os.getenv("##################")
AWS_ACCESS_SECRET_KEY = os.getenv("#########################")
conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_ACCESS_SECRET_KEY)
bucket = conn.get_bucket(BUCKET_NAME)
#goto through the list of files
bucket_list = bucket.list()
for l in bucket_list:
key_string = str(l.key)
s3_path = DOWNLOAD_LOCATION_PATH + key_string
try:
# Add files to the log file
print ("Downloading file ", key_string)
file_object = open('download_history.log', 'a')
file_object.write(key_string)
file_object.write("\n")
# Working code
file_object.close()
l.get_contents_to_filename(s3_path)
except (OSError,S3ResponseError) as e:
pass
# check if the file has been downloaded locally
if not os.path.exists(s3_path):
try:
os.makedirs(s3_path)
except OSError as exc:
# let guard againts race conditions
import errno
if exc.errno != errno.EEXIST:
raise
if __name__ == '__main__':
backup_s3_folder()
# Start the unzipping process
print("Unzipping Starting")
dir_path = os.path.expanduser("~") + "/AWSSplunk/Downloads/"
for path, dir_list, file_list in os.walk(dir_path):
for file_name in file_list:
if file_name.endswith(".zip"):
abs_file_path = os.path.join(path, file_name)
parent_path = os.path.split(abs_file_path)[0]
output_folder_name = os.path.splitext(abs_file_path)[0]
output_path = os.path.join(parent_path, output_folder_name)
zip_obj = zipfile.ZipFile(abs_file_path, 'r')
zip_obj.extractall(output_path)
zip_obj.close()
print("Unzipping Completed")
# Start moving files to output
print("Moving Files")
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
if not os.path.exists(FILE_LOCATION_PATH):
print ("Making download directory")
os.mkdir(FILE_LOCATION_PATH)
# .log files move
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith('.log'):
count = 1
destination_file = os.path.join(FILE_LOCATION_PATH, file)
while os.path.exists(destination_file):
destination_file = os.path.join(FILE_LOCATION_PATH, f"{file}_{count}")
count += 1
shutil.move(os.path.join(root, file), destination_file)
# .txt files move
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith('.txt'):
count = 1
destination_file = os.path.join(FILE_LOCATION_PATH, file)
while os.path.exists(destination_file):
destination_file = os.path.join(FILE_LOCATION_PATH, f"{file}_{count}")
count += 1
shutil.move(os.path.join(root, file), destination_file)
# .json files move
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith('.json'):
count = 1
destination_file = os.path.join(FILE_LOCATION_PATH, file)
while os.path.exists(destination_file):
destination_file = os.path.join(FILE_LOCATION_PATH, f"{file}_{count}")
count += 1
shutil.move(os.path.join(root, file), destination_file)
print("Files Move Complete")
# Delete Directory
print("Cleaning up Downloads Directory")
shutil.rmtree(DOWNLOAD_LOCATION_PATH)
# Remove EFR Audit Logs stratinbg with 2020
print("Remove the encrypted Audit Logs")
pattern = "^(2020)"
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
for root, dirs, files in os.walk(FILE_LOCATION_PATH):
for file in filter(lambda x: re.match(pattern, x), files):
os.remove(os.path.join(root, file))
# Remove EFR Audit Logs stratinbg with EFR
pattern = "^(EFR)"
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
for root, dirs, files in os.walk(FILE_LOCATION_PATH):
for file in filter(lambda x: re.match(pattern, x), files):
os.remove(os.path.join(root, file))
# Remove EFR Audit Logs stratinbg with 2019
pattern = "^(2019)"
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
for root, dirs, files in os.walk(FILE_LOCATION_PATH):
for file in filter(lambda x: re.match(pattern, x), files):
os.remove(os.path.join(root, file))
# Script clean up
print("Script Complete")
#with open("download_history.log", "a") as myfile:
# myfile.write('New Line\n')

With os you can check whether a file exist or not:
if not os.isfile(PATH_TO_EXPECTED_DOWNLOADED_FILE):
#do download
For your own security please seperate your steps into functions and build a pipeline of these.

Using variables recursively in python

Can i use variables to set my zip patch inset of entering it manualy
Example part of the code that works fine
if __name__ == '__main__':
zip_folder(r'Monday' ,
r'Monday.zip')
But can i use a variable insted of just a entering the day myself, for this second example i get a "invalid syntax" error
today = "Monday"
today_zip = "Monday.zip"
if __name__ == '__main__':
zip_folder(r today,
r today_zip)
import zipfile
import sys
import os
def zip_folder(folder_path, output_path):
"""Zip the contents of an entire folder (with that folder included
in the archive). Empty subfolders will be included in the archive
as well.
"""
parent_folder = os.path.dirname(folder_path)
# Retrieve the paths of the folder contents.
contents = os.walk(folder_path)
try:
zip_file = zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED)
for root, folders, files in contents:
# Include all subfolders, including empty ones.
for folder_name in folders:
absolute_path = os.path.join(root, folder_name)
relative_path = absolute_path.replace(parent_folder + '\\',
'')
print "Adding '%s' to archive." % absolute_path
zip_file.write(absolute_path, relative_path)
for file_name in files:
absolute_path = os.path.join(root, file_name)
relative_path = absolute_path.replace(parent_folder + '\\',
'')
print "Adding '%s' to archive." % absolute_path
zip_file.write(absolute_path, relative_path)
print "'%s' created successfully." % output_path
except IOError, message:
print message
sys.exit(1)
except OSError, message:
print message
sys.exit(1)
except zipfile.BadZipfile, message:
print message
sys.exit(1)
finally:
zip_file.close()
if __name__ == '__main__':
zip_folder(r'Monday',
r'Monday.zip')

You do not need to specify r here:
if __name__ == '__main__':
zip_folder( today, today_zip)
would work fine. r,u etc are qualifiers for strings in python, which is not needed here in your case.

Going into subfolders (python)

I've written something to remove special characters in Filenames. But it just includes the one folder and not it's subfolders. How can I do this also in subfolders and subsubfolders and so on?
import os
import re
def dir_list2(directory, *args):
fileList = []
content = os.listdir(directory)
for file in content :
dirfile = os.path.join(directory, file)
if os.path.isfile(dirfile):
if len(args) == 0:
fileList.append(dirfile)
else:
if os.path.splitext(dirfile)[1][1:] in args:
fileList.append(dirfile)
print "##################################################"
print "Old filename:", file
filename = file
remove = re.compile("[^.a-zA-z0-9_]")
output = remove.sub('_', filename)
newfile = directory + "/" + output
os.rename(dirfile, newfile)
print "Corrected filename:", output
#Removes Special Characters
return fileList
if __name__ == '__main__':
fileList = dir_list2('/path/')

Try using os.walk instead of os.listdir, it allows you to walk through a folder and its files and subfolders and so on.
Edit your code to be like:
content = os.walk(directory)
for dirpath, dirnames, filenames in content:
for file in filenames:
dirfile = os.path.join(dirpath, file)
# The rest of your code

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

python unzip files below the root folder - python

Related

How to copy only non-duplicate files whilst maintaining folder structure?

Os.rename Files Overwrite

Check list if file has downloaded and skip if it has?

Using variables recursively in python

Going into subfolders (python)

Categories

Resources