Get all files from my C drive - Python

Here is what I am trying to do:
I would like to get a list of all files larger than 35 MB on my C drive.
Here is my code:
def getAllFileFromDirectory(directory, temp):
    files = os.listdir(directory)
    for file in files:
        if (os.path.isdir(file)):
            getAllFileFromDirectory(file, temp)
        elif (os.path.isfile(file) and os.path.getsize(file) > 35000000):
            temp.write(os.path.abspath(file))

def getFilesOutOfTheLimit():
    basePath = "C:/"
    tempFile = open('temp.txt', 'w')
    getAllFileFromDirectory(basePath, tempFile)
    tempFile.close()
    print("Get all files ... Done !")
For some reason, the interpreter doesn't enter the if-block inside 'getAllFileFromDirectory'.
Can someone tell me what I'm doing wrong and why (learning is my aim), and how to fix it?
Thanks a lot for your comments.

I fixed your code. Your problem was that os.path.isdir can only tell whether something is a directory if it receives the full path to it. So I changed the code to the following and it works. The same applies to os.path.getsize and os.path.isfile.
import os

def getAllFileFromDirectory(directory, temp):
    files = os.listdir(directory)
    for file in files:
        fullPath = os.path.join(directory, file)
        if os.path.isdir(fullPath):
            if file[0] == '.':
                continue  # I added this because I'm on a UNIX system
            print(fullPath)
            getAllFileFromDirectory(fullPath, temp)
        elif os.path.isfile(fullPath) and os.path.getsize(fullPath) > 35000000:
            temp.write(os.path.abspath(fullPath) + '\n')

def getFilesOutOfTheLimit():
    basePath = "/"
    tempFile = open('temp.txt', 'w')
    getAllFileFromDirectory(basePath, tempFile)
    tempFile.close()
    print("Get all files ... Done !")

getFilesOutOfTheLimit()
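For reference, here is a minimal sketch of the same search using os.walk, which handles the recursion and path joining for you. The 35 MB threshold and the temp.txt output mirror the code above; the function name is made up:

import os

def find_large_files(base_path, limit_bytes=35000000, out_path='temp.txt'):
    # Walk base_path and record every file larger than limit_bytes.
    with open(out_path, 'w') as out:
        for root, dirs, files in os.walk(base_path):
            for name in files:
                full = os.path.join(root, name)
                try:
                    size = os.path.getsize(full)
                except OSError:
                    continue  # skip files we cannot stat (permissions, broken links)
                if size > limit_bytes:
                    out.write(full + '\n')

find_large_files("C:/")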

Related

Trying to copy the content of files from source to destination if the file is .txt and the files have the same names, then ZIP each just-copied file

I'm new to Python and trying to copy the content of files from dir_A to dir_B if a file is a .txt and the files in dir_A and dir_B have the same names, and afterwards zip each of these newly copied files.
import os, shutil, zipfile

src_folder = "C:/Users/pushka/pythonApp1/src_folder"
dst_folder = "C:/Users/pushka/pythonApp1/dst_folder"

# only .txt files will be copied
ext = (".txt")

try:
    for src_f in os.scandir(src_folder):
        for dst_f in os.scandir(dst_folder):
            if src_f.path.endswith(ext) and os.path.basename(src_f) == os.path.basename(dst_f):
                # copy file
                shutil.copyfile(src_f, dst_f)
finally:
    print("The 'try except' is finished")
I have searched and tried several options to ZIP, but none of them work properly, so I need your help, please.
I modified your code a bit, but this should do the trick:
import os, shutil, zipfile

src_folder = "C:/Users/pushka/pythonApp1/src_folder"
dst_folder = "C:/Users/pushka/pythonApp1/dst_folder"

# only .txt files will be copied
ext = ".txt"

copied_files = []
for src_f in os.scandir(src_folder):
    if src_f.name.endswith(ext) and not src_f.is_dir():
        dst_f = os.path.join(dst_folder, src_f.name)
        if not os.path.exists(dst_f):
            shutil.copyfile(src_f, dst_f)
            copied_files.append(dst_f)

print(copied_files)

zipfile_name = os.path.join(dst_folder, "copied_files.zip")
if not os.path.exists(zipfile_name):
    with zipfile.ZipFile(zipfile_name, "w") as zf:
        for txtfile in copied_files:
            print("Writing " + txtfile)
            zf.write(txtfile, os.path.split(txtfile)[-1])
It should be pretty self-explanatory, but I'll walk you through it. In the first for loop, we scan all entries in src_folder. If the name ends in .txt and it is not a directory, we create a path to the destination file. Then, as long as the destination file does not exist, we copy the source to the destination, and add the destination to the copied_files list.
After all the copying is done, we create the zip file's name. If it doesn't exist, we create it using the zipfile.ZipFile context manager and write in each copied file (from the destination, not the source), stripping the full path from it in the archive.
Please note that, by default, the zipfile uses ZIP_STORED as the compression format - i.e., the data is not compressed. See the docs for the other supported compression formats if you need a compressed archive.
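For instance, switching to a compressed archive only requires passing the compression argument to the constructor; a small sketch reusing the zipfile_name and copied_files names from the code above:

import os, zipfile

# Same write loop as above, but with DEFLATE compression instead of the
# default ZIP_STORED (stored means no compression).
with zipfile.ZipFile(zipfile_name, "w", compression=zipfile.ZIP_DEFLATED) as zf:
    for txtfile in copied_files:
        zf.write(txtfile, os.path.split(txtfile)[-1])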
Thanks a lot, but here is the answer to my own question with your help
import os, shutil, zipfile

src_folder = "C:/Users/pushka/pythonApp1/src_folder"
dst_folder = "C:/Users/pushka/pythonApp1/dst_folder"

# only .txt files will be copied
ext = ".txt"

copied_files = []
for src_f in os.scandir(src_folder):
    for dst_f in os.scandir(dst_folder):
        if src_f.name.endswith(ext) and os.path.basename(src_f) == os.path.basename(dst_f):
            # copy file
            shutil.copyfile(src_f, dst_f)
            copied_files.append(dst_f)

print(copied_files)

for txt_file in copied_files:
    file_root = os.path.splitext(txt_file)[0]
    zip_file_name = file_root + '.zip'
    with zipfile.ZipFile(zip_file_name, mode='w') as zf:
        zf.write(txt_file, os.path.basename(txt_file))
Works as expected
A simpler version, using pathlib:
from pathlib import Path
from typing import List
from zipfile import ZipFile

src_folder = Path("C:/Users/pushka/pythonApp1/src_folder")
dst_folder = Path("C:/Users/pushka/pythonApp1/dst_folder")
SUFFIX = ".txt"

def copy_file(from_path: Path, to_path: Path, copied) -> None:
    content = from_path.read_bytes()
    to_path.write_bytes(content)
    copied.append(to_path)
    print(f"Copy file: {from_path} --> {to_path}")

def zip_them(paths: List[Path]) -> str:
    filename = "copied.zip"
    with ZipFile(filename, "w") as z:
        for path in paths:
            z.write(path, path.name)
    return filename

def main():
    assert src_folder.exists(), f"path `{src_folder}` not found!"
    assert dst_folder.exists(), f"path `{dst_folder}` not found!"
    copied = []
    for p in src_folder.glob(f"*{SUFFIX}"):
        dst = dst_folder / p.name
        copy_file(p, dst, copied)
    fn = zip_them(copied)
    print(f"There are {len(copied)} files copied. And zipped to: {fn}")

if __name__ == "__main__":
    main()
My preference, an async version with anyio:
from typing import List
from zipfile import ZipFile

import anyio  # pip install anyio
from anyio import Path

src_folder = Path("C:/Users/pushka/pythonApp1/src_folder")
dst_folder = Path("C:/Users/pushka/pythonApp1/dst_folder")

async def copy_file(from_path: Path, to_path: Path, copied) -> None:
    content = await from_path.read_bytes()
    await to_path.write_bytes(content)
    copied.append(to_path)
    print(f"copy file: {from_path} --> {to_path}")

def zip_them(paths: List[Path]) -> str:
    filename = "copied.zip"
    with ZipFile(filename, "w") as z:
        for path in paths:
            z.write(path, path.name)
    return filename

async def main():
    copied = []
    async with anyio.create_task_group() as tg:
        async for p in src_folder.glob("*.txt"):
            dst = dst_folder / p.name
            tg.start_soon(copy_file, p, dst, copied)
    fn = zip_them(copied)
    print(f"zip file created: {fn}")

if __name__ == "__main__":
    import timeit
    cost = timeit.timeit("anyio.run(main)", number=1, globals=globals())
    print("Cost:", round(cost, 2), "seconds.")

Finding Files in File Tree Based on List of Extensions

I'm working on a small Python 3 utility to build a zip file based on a list of file extensions. I have a text file of extensions, and I'm passing a folder into the script:
working_folder = sys.argv[1]
zip_name = sys.argv[2]

# Open the extension file
extensions = []
with open('CxExt.txt') as fp:
    lines = fp.readlines()
    for line in lines:
        extensions.append(line)

# Now get the files in the directory. If they have the right extension add them to the list.
files = os.listdir(working_folder)
files_to_zip = []
for ext in extensions:
    results = glob.glob(working_folder + '**/' + ext, recursive=True)
    print(str(len(results)) + " results for " + working_folder + '**/*' + ext)
    #search = "*"+ext
    #results = [y for x in os.walk(working_folder) for y in glob(os.path.join(x[0], search))]
    #results = list(Path(".").rglob(search))
    for result in results:
        files_to_zip.append(result)

if len(files_to_zip) == 0:
    print("No Files Found")
    sys.exit()

for f in files:
    print("Checking: " + f)
    filename, file_extension = os.path.splitext(f)
    print(file_extension)
    if file_extension in extensions:
        print(f)
        files_to_zip.append(file)

ZipFile = zipfile.ZipFile(zip_name, "w")
for z in files_to_zip:
    ZipFile.write(os.path.basename(z), compress_type=zipfile.ZIP_DEFLATED)
I've tried using glob, os.walk, and Path.rglob and I still can't get a list of files. There's got to be something just obvious that I'm missing. I built a test directory that has some directories, py files, and a few zip files. It returns 0 for all file types. What am I overlooking?
This is my first answer, so please don't expect it to be perfect.
I notice you're using file.readlines(). According to the Python docs here, file.readlines() returns a list of lines including the newline at the end. If your text file has the extensions separated by newlines, maybe try using file.read().split("\n") instead. Besides that, your code looks okay. Tell me if this fix doesn't work.
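For illustration, a minimal sketch of that fix, reading CxExt.txt and stripping the trailing newlines before matching (the example extensions in the comment are hypothetical):

# Read the extension list without trailing newlines.
extensions = []
with open('CxExt.txt') as fp:
    for line in fp.read().split("\n"):
        line = line.strip()
        if line:  # skip blank lines
            extensions.append(line)

print(extensions)  # e.g. ['.py', '.zip'] instead of ['.py\n', '.zip\n']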

Python OS - check if file exists, if so rename, check again, then save

I have a script that takes a file from a form, renames it, uploads it to a folder, and inserts a record into a database. I would like to add functionality where, before the file is saved, it checks the upload folder to determine whether the filename already exists. If it does exist, it renames the file in a loop and then saves it.
What I have currently:
file = request.files['xx']
extension = os.path.splitext(file.filename)[1]
xx = str(uuid.uuid4()) + extension
## if xx exists .. xx = str(uuid.uuid4()) + extension.. loop endlessly.
file.save(os.path.join(app.config['UPLOAD_FOLDER'], xx))
Haven't tested this yet but you can use os.path.isfile() to check if a file already exists (for directories, use os.path.exists).
import os

def save():
    file = request.files['xx']
    extension = os.path.splitext(file.filename)[1]
    xx = generate_filename(extension)
    file.save(os.path.join(app.config['UPLOAD_FOLDER'], xx))

def generate_filename(extension):
    xx = str(uuid.uuid4()) + extension
    if os.path.isfile(os.path.join(app.config['UPLOAD_FOLDER'], xx)):
        return generate_filename(extension)
    return xx
Quick and dirty, haven't tested this. It uses the check-and-rename function recursively to add "_1", "_2", etc. to the end of the file name until it can be saved.
def check_and_rename(file, add=0):
    original_file = file
    if add != 0:
        split = file.split(".")
        part_1 = split[0] + "_" + str(add)
        file = ".".join([part_1, split[1]])
    if not os.path.isfile(file):
        pass  # save here
    else:
        check_and_rename(original_file, add + 1)
This will check whether a file exists and generate a new name that does not exist by increasing a number:
from os import path

def check_file(filePath):
    if path.exists(filePath):
        numb = 1
        while True:
            newPath = "{0}_{2}{1}".format(*path.splitext(filePath) + (numb,))
            if path.exists(newPath):
                numb += 1
            else:
                return newPath
    return filePath
Improving on N.Walters' answer, so you have a function that just takes the file_path and gives you a valid one back, using the standard pathlib Path class:
import os
from pathlib import Path

def check_and_rename(file_path: Path, add: int = 0) -> Path:
    original_file_path = file_path
    if add != 0:
        file_path = file_path.with_stem(file_path.stem + "_" + str(add))
    if not os.path.isfile(file_path):
        return file_path
    else:
        return check_and_rename(original_file_path, add + 1)
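A hypothetical usage sketch of that helper (the paths are made up for illustration; note that Path.with_stem requires Python 3.9+):

from pathlib import Path

# check_and_rename is the function defined above.
# If uploads/report.txt already exists, the helper tries report_1.txt,
# report_2.txt, ... until it finds a name that is free.
target = check_and_rename(Path("uploads/report.txt"))
print(target)  # e.g. uploads/report_2.txt if report.txt and report_1.txt exist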
Have you tried the glob module? It provides an interface similar to ls; you can use it as follows:
import os
import glob

file_list = glob.glob('my_file')
if len(file_list) > 0:
    os.rename('my_file', 'new_name')
if not os.path.isfile(os.path.join(app.config['UPLOAD_FOLDER'], xx)):
    file.save(os.path.join(app.config['UPLOAD_FOLDER'], xx))
else:
    print("File already exists")

Script that reads PDF metadata and writes to CSV

I wrote a script to read PDF metadata to ease a task at work. The current working version is not very usable in the long run:
from pyPdf import PdfFileReader

BASEDIR = ''
PDFFiles = []

def extractor():
    output = open('windoutput.txt', 'r+')
    for file in PDFFiles:
        try:
            pdf_toread = PdfFileReader(open(BASEDIR + file, 'r'))
            pdf_info = pdf_toread.getDocumentInfo()
            #print str(pdf_info) #print full metadata if you want
            x = file + "~" + pdf_info['/Title'] + " ~ " + pdf_info['/Subject']
            print x
            output.write(x + '\n')
        except:
            x = file + '~' + ' ERROR: Data missing or corrupt'
            print x
            output.write(x + '\n')
            pass
    output.close()

if __name__ == "__main__":
    extractor()
Currently, as you can see, I have to manually input the working directory and manually populate the list of PDF files. It also just prints out the data in the terminal in a format that I can copy/paste/separate into a spreadsheet.
I'd like the script to work automatically in whichever directory I throw it in and populate a CSV file for easier use. So far:
from pyPdf import PdfFileReader
import csv
import os

def extractor():
    basedir = os.getcwd()
    extension = '.pdf'
    pdffiles = [filter(lambda x: x.endswith('.pdf'), os.listdir(basedir))]
    with open('pdfmetadata.csv', 'wb') as csvfile:
        for f in pdffiles:
            try:
                pdf_to_read = PdfFileReader(open(f, 'r'))
                pdf_info = pdf_to_read.getDocumentInfo()
                title = pdf_info['/Title']
                subject = pdf_info['/Subject']
                csvfile.writerow([file, title, subject])
                print 'Metadata for %s written successfully.' % (f)
            except:
                print 'ERROR reading file %s.' % (f)
                #output.writerow(x + '\n')
                pass

if __name__ == "__main__":
    extractor()
In its current state it seems to just print a single error message (as in, the error message from my exception handler, not an error raised by Python) and then stop. I've been staring at it for a while and I'm not really sure where to go from here. Can anyone point me in the right direction?
writerow([file, title, subject]) should be writerow([f, title, subject])
You can use sys.exc_info() to print the details of your error
http://docs.python.org/2/library/sys.html#sys.exc_info
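For example, a minimal self-contained sketch (Python 2, to match the script above) showing what sys.exc_info() exposes; risky() is just a stand-in for the failing PdfFileReader call:

import sys

def risky():
    return 1 / 0  # stand-in for the PdfFileReader call that fails

try:
    risky()
except:
    exc_type, exc_value, exc_traceback = sys.exc_info()
    print 'ERROR: %s: %s' % (exc_type.__name__, exc_value)
    # prints: ERROR: ZeroDivisionError: integer division or modulo by zero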
Did you check the pdffiles variable contains what you think it does? I was getting a list inside a list... so maybe try:
for files in pdffiles:
    for f in files:
        # do stuff with f
I personally like glob. Notice I add * before the .pdf in the extension variable:
import os
import glob
basedir = os.getcwd()
extension = '*.pdf'
pdffiles = glob.glob(os.path.join(basedir, extension))
Figured it out. The script I used to download the files was saving the files with '\r\n' trailing after the file name, which I didn't notice until I actually ls'd the directory to see what was up. Thanks for everyone's help.

Python - Create html page with hyperlinks from os.listdir

I have a script that creates a folder called "videos" on a USB drive and moves 6,500 WMV files over to the "videos" folder. Then it's supposed to create an HTML page with hyperlinks to each file. Here is my current example, which is broken. I'm trying to have it crawl the videos directory and create an HTML page with hyperlinks only to the local files on the USB drive.
#!/usr/bin/python
import os.path
import os
import shutil
import re

# Create the videos directory in the current location
# If the directory exists ignore it
def createDirectory():
    directory = "videos"
    if not os.path.isdir("./" + directory + "/"):
        os.mkdir("./" + directory + "/")
        print "Videos Folder Created."
    else:
        print "Video Folder Exists."
        print "---------------------"

# Move all the files in the root directory with the .wmv extension
# to the videos folder
def moveVideos():
    for file in os.listdir("."):
        if os.path.splitext(file)[1] == ".wmv":
            print "Moving:", file
            shutil.move(file, os.path.join("videos", file))

def createHTML():
    videoDirectory = os.listdir("videos")
    f = open("videos.html", "w")
    f.writelines(videoDirectory)
    r = re.compile(r"(\\[^ ]+)")
    print r.sub(r'\1', videoDirectory)

createDirectory()
moveVideos()
createHTML()
import cgi

def is_video_file(filename):
    return filename.endswith(".wmv")  # customize however you like

def createHTML():
    videoDirectory = os.listdir("videos")
    with open("videos.html", "w") as f:
        f.write("<html><body><ul>\n")
        for filename in videoDirectory:
            if is_video_file(filename):
                f.write('<li><a href="videos/%s">%s</a></li>\n' %
                        (cgi.escape(filename, True), cgi.escape(filename)))
        f.write("</ul></body></html>\n")
Don't do f.writelines(videoDirectory) and then regex. Besides, you're only printing to the console with that regex substitution.
Do
videoDirectory = os.listdir("videos")
f = open("videos.html", "w")
f.write('<html><head></head><body><ul>'
f.writelines(['<li>%s</li>' % (f, f) for f in videoDirectory])
f.write('</ul></body></html>')
def createHTML():
    h = open("videos.html", 'w')
    for vid in os.listdir("./videos"):
        path = "./videos/" + vid
        f = open(path, 'r')
        h.write("<a href='" + f.name + "'>" + f.name[f.name.rfind('/') + 1:] + "</a>")
        f.close()
    h.close()
    print "done writing HTML file"
