How to copy a file in Python with a progress bar? - python

When copying large files using shutil.copy(), you get no indication of how the operation is progressing.
I have put together something that works - it uses a simple ProgressBar class (which simply returns a simple ASCII progress bar, as a string), and a loop of open().read() and .write() to do the actual copying. It displays the progress bar using sys.stdout.write("\r%s\r" % (the_progress_bar)) which is a little hackish, but it works.
You can see the code (in context) on github here
Is there any built-in module that will do this better? Are there any improvements that can be made to this code?

Two things:
I would make the default block size a lot larger than 512. I would start with 16384 and perhaps more.
For modularity, it might be better to have the copy_with_prog function not output the progress bar itself, but call a callback function so the caller can decide how to display the progress.
Perhaps something like this:
def copy_with_prog(src, dest, callback = None):
    # (sketch) copy src -> dest block by block, reporting progress to the
    # caller instead of printing it here
    while True:
        # copy loop stuff
        if callback:
            # pos: bytes copied so far; total: total bytes to copy
            callback(pos, total)

# caller decides how progress is displayed, e.g. with a ProgressBar:
prog = ProgressBar(...)
copy_with_prog(src, dest, lambda pos, total: prog.update(pos, total))

Overkill? Perhaps. But on almost any system — Linux, Mac, and, with a quick wxWidgets install, Windows — you can have the real deal, with pause and cancel buttons in a GUI setup. Macs ship with wxWidgets these days, and it's a common package on Linux.
A single file is very quick (it will immediately finish and look broken) so you might consider creating a fileSet job that ticks along once per file instead of once per block. Enjoy!
-Jim Carroll
"""
Threaded Jobs.
Any class that does a long running process can inherit
from ThreadedJob. This enables running as a background
thread, progress notification, pause and cancel. The
time remaining is also calculated by the ThreadedJob class.
"""
import wx.lib.newevent
import thread
import exceptions
import time
(RunEvent, EVT_RUN) = wx.lib.newevent.NewEvent()
(CancelEvent, EVT_CANCEL) = wx.lib.newevent.NewEvent()
(DoneEvent, EVT_DONE) = wx.lib.newevent.NewEvent()
(ProgressStartEvent, EVT_PROGRESS_START) = wx.lib.newevent.NewEvent()
(ProgressEvent, EVT_PROGRESS) = wx.lib.newevent.NewEvent()
class InterruptedException(exceptions.Exception):
    """Raised inside a job's Run() (via PossibleStoppingPoint) when the
    user cancels the job."""

    def __init__(self, args=None):
        # mirror the (Python 2) Exception convention of exposing .args
        self.args = args
#
#
class ThreadedJob:
    """Base class for long-running, cancellable background jobs.

    A concrete job overrides Run() and, from inside it, calls
    JobBeginning(totalTicks) once, JobProgress(tick) per unit of work,
    PossibleStoppingPoint() as often as possible (it honours pause and
    raises InterruptedException on cancel), and JobFinished() at the end.
    Progress is reported to an optional wx window via PostEvent, and the
    remaining time is estimated with an exponential moving average of
    seconds-per-tick.  (Python 2 / old `thread` module code.)
    """
    def __init__(self):
        # tell them ten seconds at first
        self.secondsRemaining = 10.0
        self.lastTick = 0
        # not running yet
        self.isPaused = False
        self.isRunning = False
        self.keepGoing = True

    def Start(self):
        """Launch Run() on a new background thread."""
        # flag running *before* spawning so IsRunning() is true immediately
        self.keepGoing = self.isRunning = True
        thread.start_new_thread(self.Run, ())
        self.isPaused = False

    def Stop(self):
        # request cancellation; the worker notices at the next
        # PossibleStoppingPoint() and raises InterruptedException there
        self.keepGoing = False

    def WaitUntilStopped(self):
        """Block (politely) until the worker thread clears isRunning."""
        while self.isRunning:
            time.sleep(0.1)
            wx.SafeYield()  # keep the GUI responsive while we wait

    def IsRunning(self):
        return self.isRunning

    def Run(self):
        # this is overridden by the
        # concrete ThreadedJob
        print "Run was not overloaded"
        self.JobFinished()
        pass

    def Pause(self):
        # the worker parks itself inside PossibleStoppingPoint()
        self.isPaused = True
        pass

    def Continue(self):
        self.isPaused = False
        pass

    def PossibleStoppingPoint(self):
        """Called by the worker between work units: honours cancel/pause.

        Raises InterruptedException when Stop() has been requested.
        """
        if not self.keepGoing:
            raise InterruptedException("process interrupted.")
        wx.SafeYield()
        # allow cancel while paused
        while self.isPaused:
            if not self.keepGoing:
                raise InterruptedException("process interrupted.")
            # don't hog the CPU
            time.sleep(0.1)

    def SetProgressMessageWindow(self, win):
        # window that will receive ProgressStart / Progress / Done events
        self.win = win

    def JobBeginning(self, totalTicks):
        """Record the start of the job and announce the total tick count."""
        self.lastIterationTime = time.time()
        self.totalTicks = totalTicks
        if hasattr(self, "win") and self.win:
            wx.PostEvent(self.win, ProgressStartEvent(total=totalTicks))

    def JobProgress(self, currentTick):
        """Update the time-remaining estimate and post a progress event.

        Uses an exponential moving average (alpha=0.92) of the time spent
        per tick.  NOTE(review): if the first call arrives with
        currentTick > 1, self.secondsPerTick is read before it is ever
        assigned — callers are expected to tick from 0/1 upward.
        """
        dt = time.time() - self.lastIterationTime
        self.lastIterationTime = time.time()
        dtick = currentTick - self.lastTick
        self.lastTick = currentTick
        alpha = 0.92
        if currentTick > 1:
            self.secondsPerTick = dt * (1.0 - alpha) + (self.secondsPerTick * alpha)
        else:
            self.secondsPerTick = dt
        # normalise when several ticks elapsed since the last call
        if dtick > 0:
            self.secondsPerTick /= dtick
        self.secondsRemaining = self.secondsPerTick * (self.totalTicks - 1 - currentTick) + 1
        if hasattr(self, "win") and self.win:
            wx.PostEvent(self.win, ProgressEvent(count=currentTick))

    def SecondsRemaining(self):
        return self.secondsRemaining

    def TimeRemaining(self):
        """Return the estimate formatted as M:SS.

        NOTE(review): the 'a few' branch is disabled by the `if 1:` guard.
        """
        if 1: #self.secondsRemaining > 3:
            minutes = self.secondsRemaining // 60
            seconds = int(self.secondsRemaining % 60.0)
            return "%i:%02i" % (minutes, seconds)
        else:
            return "a few"

    def JobFinished(self):
        """Post the Done event and mark the job as no longer running."""
        if hasattr(self, "win") and self.win:
            wx.PostEvent(self.win, DoneEvent())
        # flag we're done before we post the all done message
        self.isRunning = False
class EggTimerJob(ThreadedJob):
    """ A sample Job that demonstrates the mechanisms and features of the Threaded Job"""
    def __init__(self, duration):
        # duration: number of seconds to count; one progress tick per second
        self.duration = duration
        ThreadedJob.__init__(self)

    def Run(self):
        """ This can either be run directly for synchronous use of the job,
        or started as a thread when ThreadedJob.Start() is called.
        It is responsible for calling JobBeginning, JobProgress, and JobFinished.
        And as often as possible, calling PossibleStoppingPoint() which will
        sleep if the user pauses, and raise an exception if the user cancels.
        """
        self.time0 = time.clock()
        self.JobBeginning(self.duration)
        try:
            for count in range(0, self.duration):
                time.sleep(1.0)
                self.JobProgress(count)
                self.PossibleStoppingPoint()
        except InterruptedException:
            # clean up if user stops the Job early
            print "canceled prematurely!"
        # always signal the end of the job
        self.JobFinished()

    def __str__(self):
        """ The job progress dialog expects the job to describe its current state."""
        response = []
        if self.isPaused:
            response.append("Paused Counting")
        elif not self.isRunning:
            response.append("Will Count the seconds")
        else:
            response.append("Counting")
        return " ".join(response)
class FileCopyJob(ThreadedJob):
""" A common file copy Job. """
def __init__(self, orig_filename, copy_filename, block_size=32*1024):
self.src = orig_filename
self.dest = copy_filename
self.block_size = block_size
ThreadedJob.__init__(self)
#
def Run(self):
""" This can either be run directly for synchronous use of the job,
or started as a thread when ThreadedJob.Start() is called.
It is responsible for calling JobBeginning, JobProgress, and JobFinished.
And as often as possible, calling PossibleStoppingPoint() which will
sleep if the user pauses, and raise an exception if the user cancels.
"""
self.time0 = time.clock()
try:
source = open(self.src, 'rb')
# how many blocks?
import os
(st_mode, st_ino, st_dev, st_nlink, st_uid, st_gid, st_size, st_atime, st_mtime, st_ctime) = os.stat(self.src)
num_blocks = st_size / self.block_size
current_block = 0
self.JobBeginning(num_blocks)
dest = open(self.dest, 'wb')
while 1:
copy_buffer = source.read(self.block_size)
if copy_buffer:
dest.write(copy_buffer)
current_block += 1
self.JobProgress(current_block)
self.PossibleStoppingPoint()
else:
break
source.close()
dest.close()
except InterruptedException:
# clean up if user stops the Job early
dest.close()
# unlink / delete the file that is partially copied
os.unlink(self.dest)
print "canceled, dest deleted!"
#
# always signal the end of the job
self.JobFinished()
#
#
def __str__(self):
""" The job progress dialog expects the job to describe its current state."""
response = []
if self.isPaused:
response.append("Paused Copy")
elif not self.isRunning:
response.append("Will Copy a file")
else:
response.append("Copying")
#
return " ".join(response)
#
#
class JobProgress(wx.Dialog):
    """ This dialog shows the progress of any ThreadedJob.
    It can be shown Modally if the main application needs to suspend
    operation, or it can be shown Modelessly for background progress
    reporting.
    app = wx.PySimpleApp()
    job = EggTimerJob(duration = 10)
    dlg = JobProgress(None, job)
    job.SetProgressMessageWindow(dlg)
    job.Start()
    dlg.ShowModal()
    """
    def __init__(self, parent, job):
        # the job being observed; also queried for its status string and ETA
        self.job = job
        wx.Dialog.__init__(self, parent, -1, "Progress", size=(350,200))
        # vertical box sizer
        sizeAll = wx.BoxSizer(wx.VERTICAL)
        # Job status text
        self.JobStatusText = wx.StaticText(self, -1, "Starting...")
        sizeAll.Add(self.JobStatusText, 0, wx.EXPAND|wx.ALL, 8)
        # wxGauge: range is set later by OnProgressStart
        self.ProgressBar = wx.Gauge(self, -1, 10, wx.DefaultPosition, (250, 15))
        sizeAll.Add(self.ProgressBar, 0, wx.EXPAND|wx.ALL, 8)
        # horiz box sizer, and spacer to right-justify
        sizeRemaining = wx.BoxSizer(wx.HORIZONTAL)
        sizeRemaining.Add((2,2), 1, wx.EXPAND)
        # time remaining read-only edit
        # putting wide default text gets a reasonable initial layout.
        self.remainingText = wx.StaticText(self, -1, "???:??")
        sizeRemaining.Add(self.remainingText, 0, wx.LEFT|wx.RIGHT|wx.ALIGN_CENTER_VERTICAL, 8)
        # static text: remaining
        self.remainingLabel = wx.StaticText(self, -1, "remaining")
        sizeRemaining.Add(self.remainingLabel, 0, wx.ALL|wx.ALIGN_CENTER_VERTICAL, 8)
        # add that row to the mix
        sizeAll.Add(sizeRemaining, 1, wx.EXPAND)
        # horiz box sizer & spacer
        sizeButtons = wx.BoxSizer(wx.HORIZONTAL)
        sizeButtons.Add((2,2), 1, wx.EXPAND|wx.ADJUST_MINSIZE)
        # Pause Button (doubles as Resume, see OnPauseButton)
        self.PauseButton = wx.Button(self, -1, "Pause")
        sizeButtons.Add(self.PauseButton, 0, wx.ALL, 4)
        self.Bind(wx.EVT_BUTTON, self.OnPauseButton, self.PauseButton)
        # Cancel button
        self.CancelButton = wx.Button(self, wx.ID_CANCEL, "Cancel")
        sizeButtons.Add(self.CancelButton, 0, wx.ALL, 4)
        self.Bind(wx.EVT_BUTTON, self.OnCancel, self.CancelButton)
        # Add all the buttons on the bottom row to the dialog
        sizeAll.Add(sizeButtons, 0, wx.EXPAND|wx.ALL, 4)
        self.SetSizer(sizeAll)
        #sizeAll.Fit(self)
        sizeAll.SetSizeHints(self)
        # jobs tell us how they are doing via these custom events
        self.Bind(EVT_PROGRESS_START, self.OnProgressStart)
        self.Bind(EVT_PROGRESS, self.OnProgress)
        self.Bind(EVT_DONE, self.OnDone)
        self.Layout()

    def OnPauseButton(self, event):
        """Toggle pause/resume on the job and relabel the button."""
        if self.job.isPaused:
            self.job.Continue()
            self.PauseButton.SetLabel("Pause")
            self.Layout()
        else:
            self.job.Pause()
            self.PauseButton.SetLabel("Resume")
            self.Layout()

    def OnCancel(self, event):
        # ask the job to stop; the job posts EVT_DONE when it actually has
        self.job.Stop()

    def OnProgressStart(self, event):
        """Job announced its total tick count: size the gauge."""
        self.ProgressBar.SetRange(event.total)
        self.statusUpdateTime = time.clock()

    def OnProgress(self, event):
        """Per-tick update: gauge, ETA label, and (throttled) status text."""
        # update the progress bar
        self.ProgressBar.SetValue(event.count)
        self.remainingText.SetLabel(self.job.TimeRemaining())
        # update the text a max of 20 times a second
        if time.clock() - self.statusUpdateTime > 0.05:
            self.JobStatusText.SetLabel(str(self.job))
            self.statusUpdateTime = time.clock()
            self.Layout()

    # when a job is done
    def OnDone(self, event):
        self.ProgressBar.SetValue(0)
        self.JobStatusText.SetLabel("Finished")
        self.Destroy()
if __name__ == "__main__":
    # demo: copy a large file with a 10 MB block size, showing the dialog
    app = wx.PySimpleApp()
    #job = EggTimerJob(duration = 10)
    job = FileCopyJob("VeryBigFile.mp4", "/tmp/test_junk.mp4", 1024*1024*10)
    dlg = JobProgress(None, job)
    # the job posts its progress events to the dialog
    job.SetProgressMessageWindow(dlg)
    job.Start()
    dlg.ShowModal()

I made this shutil.copy() with a progress bar in a simple way, using just built-in modules.
If you are using utf-8 encoding you can get a progress like the second example in the gif image:
Progress bars examples for this:
Read the comments to change style and colors. The first and last examples don't need utf-8.
You can use the command CPprogress(SOURCE, DESTINATION) just where you had shutil.copy(src, dst):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
CPprogress(SOURCE, DESTINATION)
I made this to give shutil.copy() [or shutil.copy2() in this case] a progress bar.
You can use CPprogress(SOURCE, DESTINATION) just like shutil.copy(src, dst). SOURCE must be a file path and DESTINATION a file or folder path.
It will give you a progress bar for each file copied. Just copy this code above the place where you want to use CPprogress(SOURCE, DESTINATION) in your code.
You can easily change the look of the progress bar:
- To keep the style and just change the colors, replace the colors values of progressCOLOR and finalCOLOR (orange code at the end of the lines).
- The use a solid block progress bar, # -*- coding: utf-8 -*- is required. Otherwise, you will get an encoding error. Some basic terminals, like xterm, may not show the progress bar because of the utf-8 characters.
To use this style, remove the comments #STYLE# in lines ###COLORS### - BlueCOLOR and endBLOCK.
In def getPERCECENTprogress() remove the comments #STYLE# AND COMMENT THE PREVIOUS line. Do the same in def CPprogress()
If you don't want the utf-8 encoding, delete the four lines beginning with #STYLE#.
NOTE: If you want to copy lots of small files, the copy process per file is so fast
that all you will see is a lot of lines scrolling in your terminal window - not enough time for a 'progress'.
In that case, I use an overall progress that shows only one progress bar to the complete job. nzX
'''
import os
import shutil
import sys
import threading
import time
######## COLORS ######
progressCOLOR = '\033[38;5;33;48;5;236m' #\033[38;5;33;48;5;236m# copy inside '' for colored progressbar| orange:#\033[38;5;208;48;5;235m
finalCOLOR = '\033[38;5;33;48;5;33m' #\033[38;5;33;48;5;33m# copy inside '' for colored progressbar| orange:#\033[38;5;208;48;5;208m
#STYLE#BlueCOLOR = '\033[38;5;33m'#\033[38;5;33m# copy inside '' for colored progressbar Orange#'\033[38;5;208m'# # BG progress# #STYLE#
#STYLE#endBLOCK = '' # ▌ copy OR '' for none # BG progress# #STYLE# requires utf8 coding header
########
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
CEND = '\033[0m'
def getPERCECENTprogress(source_path, destination_path):
    """Poll destination_path's size and redraw a progress bar until it
    matches source_path's size.

    Runs on a helper thread while shutil.copy2() performs the actual copy
    in the caller.  Returns immediately when the destination was never
    created or when the source is empty (a zero-byte source would
    otherwise divide by zero in the percentage computation).
    """
    # give the copy a moment to create the destination file
    time.sleep(.24)
    if not os.path.exists(destination_path):
        return
    # hoist the (constant) source size out of the polling loop
    source_size = os.path.getsize(source_path)
    if source_size == 0:
        return
    while os.path.getsize(destination_path) != source_size:
        sys.stdout.write('\r')
        percentagem = int((float(os.path.getsize(destination_path))/float(source_size)) * 100)
        steps = int(percentagem/5)
        copiado = int(os.path.getsize(destination_path)/1000000)# Should be 1024000 but this gets equal to Thunar file manager report (Linux - Xfce)
        sizzz = int(source_size/1000000)
        sys.stdout.write((" {:d} / {:d} Mb ".format(copiado, sizzz)) + (BOLD + progressCOLOR + "{:20s}".format('|'*steps) + CEND) + (" {:d}% ".format(percentagem))) # BG progress
        #STYLE#sys.stdout.write((" {:d} / {:d} Mb ".format(copiado, sizzz)) + (BOLD + BlueCOLOR + "▐" + "{:s}".format('█'*steps) + CEND) + ("{:s}".format(' '*(20-steps))+ BOLD + BlueCOLOR + endBLOCK+ CEND) +(" {:d}% ".format(percentagem))) #STYLE# # BG progress# closer to GUI but less compatible (no block bar with xterm) # requires utf8 coding header
        sys.stdout.flush()
        time.sleep(.01)
def CPprogress(SOURCE, DESTINATION):
    """Drop-in replacement for shutil.copy2() with a terminal progress bar.

    SOURCE must be a file path; DESTINATION may be a file or folder path.
    A daemon-less helper thread polls the destination's size while
    shutil.copy2() runs in the foreground.  (Python 2 print syntax.)
    """
    # resolve the final file path when DESTINATION is a folder
    if os.path.isdir(DESTINATION):
        dst_file = os.path.join(DESTINATION, os.path.basename(SOURCE))
    else: dst_file = DESTINATION
    print " "
    print (BOLD + UNDERLINE + "FROM:" + CEND + " "), SOURCE
    print (BOLD + UNDERLINE + "TO:" + CEND + " "), dst_file
    print " "
    # watch the growing destination file from a separate thread
    threading.Thread(name='progresso', target=getPERCECENTprogress, args=(SOURCE, dst_file)).start()
    shutil.copy2(SOURCE, DESTINATION)
    time.sleep(.02)
    sys.stdout.write('\r')
    # draw the final, completely filled 100% bar
    sys.stdout.write((" {:d} / {:d} Mb ".format((int(os.path.getsize(dst_file)/1000000)), (int(os.path.getsize(SOURCE)/1000000)))) + (BOLD + finalCOLOR + "{:20s}".format('|'*20) + CEND) + (" {:d}% ".format(100))) # BG progress 100%
    #STYLE#sys.stdout.write((" {:d} / {:d} Mb ".format((int(os.path.getsize(dst_file)/1000000)), (int(os.path.getsize(SOURCE)/1000000)))) + (BOLD + BlueCOLOR + "▐" + "{:s}{:s}".format(('█'*20), endBLOCK) + CEND) + (" {:d}% ".format(100))) #STYLE# # BG progress 100%# closer to GUI but less compatible (no block bar with xterm) # requires utf8 coding header
    sys.stdout.flush()
    print " "
    print " "
'''
#Ex. Copy all files from root of the source dir to destination dir
folderA = '/path/to/SOURCE' # SOURCE
folderB = '/path/to/DESTINATION' # DESTINATION
for FILE in os.listdir(folderA):
if not os.path.isdir(os.path.join(folderA, FILE)):
if os.path.exists(os.path.join(folderB, FILE)): continue # as we are using shutil.copy2() that overwrites destination, this skips existing files
CPprogress(os.path.join(folderA, FILE), folderB) # use the command as if it was shutil.copy2() but with progress
75 / 150 Mb |||||||||| | 50%
'''

If you want to use the Windows copy dialog with progress you can use these:
https://github.com/tjguk/winshell/
https://github.com/frmdstryr/pywinutils

If you want an overall progress, you can use something like this (made for another script). Note that in this case, the 'threading.Thread' that calls the progress bar was placed outside the 'for' loop. Also, the measures need to be taken in a different way. This is the third example (non-utf-8) from the gif image in the previous answer. It adds a files 'ToGo' count:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Ex.
CopyProgress('/path/to/SOURCE', '/path/to/DESTINATION')
I think this 'copy with overall progress' is very 'plastic' and can be easily adapted.
By default, it will RECURSIVELY copy the CONTENT of 'path/to/SOURCE' to 'path/to/DESTINATION/' keeping the directory tree.
Paying attention to comments, there are 4 main options that can be immediately change:
1 - The LOOK of the progress bar: see COLORS and the PAIR of STYLE lines in 'def getPERCECENTprogress'(inside and after the 'while' loop);
2 - The DESTINATION path: to get 'path/to/DESTINATION/SOURCE_NAME' as target, comment the 2nd 'DST =' definition on the top of the 'def CopyProgress(SOURCE, DESTINATION)' function;
3 - If you don't want to RECURSIVELY copy from sub-directories but just the files in the root source directory to the root of destination, you can use os.listdir() instead of os.walk(). Read the comments inside 'def CopyProgress(SOURCE, DESTINATION)' function to disable RECURSION. Be aware that the RECURSION changes(4x2) must be made in both os.walk() loops;
4 - Handling destination files: if you use this in a situation where the destination filename may already exist, by default, the file is skipped and the loop will jump to the next and so on. On the other way shutil.copy2(), by default, overwrites destination file if exists. Alternatively, you can handle files that exist by overwriting or renaming (according to current date and time). To do that read the comments after 'if os.path.exists(dstFILE): continue' both in the count bytes loop and the main loop. Be aware that the changes must match in both loops (as described in comments) or the progress function will not work properly.
'''
import os
import shutil
import sys
import threading
import time
progressCOLOR = '\033[38;5;33;48;5;236m' #BLUEgreyBG
finalCOLOR = '\033[48;5;33m' #BLUEBG
# check the color codes below and paste above
###### COLORS #######
# WHITEblueBG = '\033[38;5;15;48;5;33m'
# BLUE = '\033[38;5;33m'
# BLUEBG = '\033[48;5;33m'
# ORANGEBG = '\033[48;5;208m'
# BLUEgreyBG = '\033[38;5;33;48;5;236m'
# ORANGEgreyBG = '\033[38;5;208;48;5;236m' # = '\033[38;5;FOREGROUND;48;5;BACKGROUNDm' # ver 'https://i.stack.imgur.com/KTSQa.png' para 256 color codes
# INVERT = '\033[7m'
###### COLORS #######
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
CEND = '\033[0m'
FilesLeft = 0
def FullFolderSize(path):
    """Return the total size, in bytes, of every file under *path* (recursive).

    Returns 0 when *path* does not exist, so the progress thread can poll
    it safely before the destination folder has been created.
    """
    if not os.path.exists(path):
        return 0
    total = 0
    for base, _dirs, names in os.walk(path):
        total += sum(os.path.getsize(os.path.join(base, name)) for name in names)
    return total
def getPERCECENTprogress(source_path, destination_path, bytes_to_copy):
    """Draw ONE overall progress bar while CopyProgress() copies files.

    Polls the destination folder's total size until exactly bytes_to_copy
    new bytes (relative to the size measured at start) have appeared, then
    draws the final 100% bar.  Runs on a helper thread; reads the global
    FilesLeft counter maintained by CopyProgress().  (Python 2 syntax.)
    """
    # the destination may already contain data: record the baseline size
    dstINIsize = FullFolderSize(destination_path)
    time.sleep(.25)
    print " "
    print (BOLD + UNDERLINE + "FROM:" + CEND + " "), source_path
    print (BOLD + UNDERLINE + "TO:" + CEND + " "), destination_path
    print " "
    if os.path.exists(destination_path):
        while bytes_to_copy != (FullFolderSize(destination_path)-dstINIsize):
            sys.stdout.write('\r')
            percentagem = int((float((FullFolderSize(destination_path)-dstINIsize))/float(bytes_to_copy)) * 100)
            steps = int(percentagem/5)
            copiado = '{:,}'.format(int((FullFolderSize(destination_path)-dstINIsize)/1000000))# Should be 1024000 but this gets closer to the file manager report
            sizzz = '{:,}'.format(int(bytes_to_copy/1000000))
            sys.stdout.write((" {:s} / {:s} Mb ".format(copiado, sizzz)) + (BOLD + progressCOLOR + "{:20s}".format('|'*steps) + CEND) + (" {:d}% ".format(percentagem)) + (" {:d} ToGo ".format(FilesLeft))) # STYLE 1 progress default #
            #BOLD# sys.stdout.write(BOLD + (" {:s} / {:s} Mb ".format(copiado, sizzz)) + (progressCOLOR + "{:20s}".format('|'*steps) + CEND) + BOLD + (" {:d}% ".format(percentagem)) + (" {:d} ToGo ".format(FilesLeft))+ CEND) # STYLE 2 progress BOLD #
            #classic B/W# sys.stdout.write(BOLD + (" {:s} / {:s} Mb ".format(copiado, sizzz)) + ("|{:20s}|".format('|'*steps)) + (" {:d}% ".format(percentagem)) + (" {:d} ToGo ".format(FilesLeft))+ CEND) # STYLE 3 progress classic B/W #
            sys.stdout.flush()
            time.sleep(.01)
    # copy finished: draw the final, filled 100% bar
    sys.stdout.write('\r')
    time.sleep(.05)
    sys.stdout.write((" {:s} / {:s} Mb ".format('{:,}'.format(int((FullFolderSize(destination_path)-dstINIsize)/1000000)), '{:,}'.format(int(bytes_to_copy/1000000)))) + (BOLD + finalCOLOR + "{:20s}".format(' '*20) + CEND) + (" {:d}% ".format( 100)) + (" {:s} ".format(' ')) + "\n") # STYLE 1 progress default #
    #BOLD# sys.stdout.write(BOLD + (" {:s} / {:s} Mb ".format('{:,}'.format(int((FullFolderSize(destination_path)-dstINIsize)/1000000)), '{:,}'.format(int(bytes_to_copy/1000000)))) + (finalCOLOR + "{:20s}".format(' '*20) + CEND) + BOLD + (" {:d}% ".format( 100)) + (" {:s} ".format(' ')) + "\n" + CEND ) # STYLE 2 progress BOLD #
    #classic B/W# sys.stdout.write(BOLD + (" {:s} / {:s} Mb ".format('{:,}'.format(int((FullFolderSize(destination_path)-dstINIsize)/1000000)), '{:,}'.format(int(bytes_to_copy/1000000)))) + ("|{:20s}|".format('|'*20)) + (" {:d}% ".format( 100)) + (" {:s} ".format(' ')) + "\n" + CEND ) # STYLE 3 progress classic B/W #
    sys.stdout.flush()
    print " "
    print " "
def CopyProgress(SOURCE, DESTINATION):
    """Recursively copy the CONTENT of SOURCE into DESTINATION, showing one
    overall progress bar (plus a files-to-go counter) for the whole job.

    Existing destination files are skipped by default; see the inline
    comments for the overwrite and rename-with-timestamp alternatives.
    The byte-counting loop and the main copy loop MUST make the same
    skip/overwrite decision, or the progress thread never sees the
    expected byte count and spins forever.  (Python 2 syntax.)
    """
    global FilesLeft
    DST = os.path.join(DESTINATION, os.path.basename(SOURCE))
    # <- the previous will copy the Source folder inside of the Destination folder. Result Target: path/to/Destination/SOURCE_NAME
    # -> UNCOMMENT the next (# DST = DESTINATION) to copy the CONTENT of Source to the Destination. Result Target: path/to/Destination
    DST = DESTINATION # UNCOMMENT this to specify the Destination as the target itself and not the root folder of the target
    # refuse to copy a folder into itself
    if DST.startswith(SOURCE):
        print " "
        print BOLD + UNDERLINE + 'Source folder can\'t be changed.' + CEND
        print 'Please check your target path...'
        print " "
        print BOLD + ' CANCELED' + CEND
        print " "
        exit()
    # count bytes to copy
    Bytes2copy = 0
    for root, dirs, files in os.walk(SOURCE): # USE for filename in os.listdir(SOURCE): # if you don't want RECURSION #
        dstDIR = root.replace(SOURCE, DST, 1) # USE dstDIR = DST # if you don't want RECURSION #
        for filename in files: # USE if not os.path.isdir(os.path.join(SOURCE, filename)): # if you don't want RECURSION #
            dstFILE = os.path.join(dstDIR, filename)
            if os.path.exists(dstFILE): continue # must match the main loop (after "threading.Thread")
            # To overwrite delete dstFILE first here so the progress works properly: ex. change continue to os.unlink(dstFILE)
            # To rename new files adding date and time, instead of deleting and overwriting,
            # comment 'if os.path.exists(dstFILE): continue'
            Bytes2copy += os.path.getsize(os.path.join(root, filename)) # USE os.path.getsize(os.path.join(SOURCE, filename)) # if you don't want RECURSION #
            FilesLeft += 1
    # <- count bytes to copy
    #
    # Threading to call the progress bar
    threading.Thread(name='progresso', target=getPERCECENTprogress, args=(SOURCE, DST, Bytes2copy)).start()
    # main loop
    for root, dirs, files in os.walk(SOURCE): # USE for filename in os.listdir(SOURCE): # if you don't want RECURSION #
        dstDIR = root.replace(SOURCE, DST, 1) # USE dstDIR = DST # if you don't want RECURSION #
        if not os.path.exists(dstDIR):
            os.makedirs(dstDIR)
        for filename in files: # USE if not os.path.isdir(os.path.join(SOURCE, filename)): # if you don't want RECURSION #
            srcFILE = os.path.join(root, filename) # USE os.path.join(SOURCE, filename) # if you don't want RECURSION #
            dstFILE = os.path.join(dstDIR, filename)
            if os.path.exists(dstFILE): continue # MUST MATCH THE PREVIOUS count bytes loop
            # <- <- this jumps to the next file without copying this file, if destination file exists.
            # Comment to copy with rename or overwrite dstFILE
            #
            # RENAME part below: builds a unique name like 'file[2020.1.2]3-45.ext'
            head, tail = os.path.splitext(filename)
            count = -1
            year = int(time.strftime("%Y"))
            month = int(time.strftime("%m"))
            day = int(time.strftime("%d"))
            hour = int(time.strftime("%H"))
            minute = int(time.strftime("%M"))
            while os.path.exists(dstFILE):
                count += 1
                if count == 0:
                    dstFILE = os.path.join(dstDIR, '{:s}[{:d}.{:d}.{:d}]{:d}-{:d}{:s}'.format(head, year, month, day, hour, minute, tail))
                else:
                    dstFILE = os.path.join(dstDIR, '{:s}[{:d}.{:d}.{:d}]{:d}-{:d}[{:d}]{:s}'.format(head, year, month, day, hour, minute, count, tail))
            # END of RENAME part
            shutil.copy2(srcFILE, dstFILE)
            FilesLeft -= 1
'''
Ex.
CopyProgress('/path/to/SOURCE', '/path/to/DESTINATION')
'''

Alternatively, you can use ROBOCOPY with the os module. It won't give you a progress bar, but it'll give you a percentage indicator as well as a robust summary at the end.
import os
def robocopy(source, destination, extension=''):
    """Copy files with Windows ROBOCOPY, showing its percentage indicator.

    source/destination: folder paths; extension: optional filter such as
    '*.mov' (empty string copies everything, matching robocopy's default).
    /xx suppresses the listing of extra files/dirs, /njh the job header.
    An argument list with shell=False handles paths containing spaces
    safely and avoids shell injection (os.system with interpolated,
    unquoted paths broke on both counts).
    """
    import subprocess
    cmd = ['robocopy', source, destination]
    if extension:
        cmd.append(extension)
    cmd += ['/xx', '/njh']
    subprocess.call(cmd)
# Usage example
robocopy(r'C:\Users\Example\Downloads', r'C:\Users\Example\Desktop', '*.mov')
The example above will copy all .mov files to the desktop
Leaving extension blank will copy all files in the source folder to the destination folder.
/xx removes extra files/directories from being listed
/njh removes job header
See documentation for more info:
https://learn.microsoft.com/en-us/windows-server/administration/windows-commands/robocopy

That is simple PySide app can copy any file from source to destination
#!/usr/bin/python3
import os
import sys
from PySide2.QtWidgets import QProgressBar, QApplication, QDialog, QMainWindow, QPushButton
from PySide2.QtCore import QThread, Signal, Slot
class ProgressDialog(QDialog):
    """Small dialog with a percentage QProgressBar for one file copy."""
    def __init__(self, parent, source, destination):
        QDialog.__init__(self, parent)
        self.resize(400, 50)
        self.parent = parent
        self.source = source
        self.destination = destination
        # 0-100 percentage bar, driven by the worker thread's signal
        self.prog = QProgressBar(self)
        self.prog.setMaximum(100)
        self.prog.setMinimum(0)
        self.prog.setFormat("%p%")

    def start(self):
        # show the (modeless) dialog, then kick off the background copy
        self.show()
        self.copy()

    def copy(self):
        # the thread is parented to this dialog, which keeps it referenced
        # for the dialog's lifetime even though copy_thread is a local
        copy_thread = CopyThread(self, self.source, self.destination)
        copy_thread.procPartDone.connect(self.update_progress)
        copy_thread.procDone.connect(self.finished_copy)
        copy_thread.start()

    def update_progress(self, progress):
        # slot for CopyThread.procPartDone (percentage copied)
        self.prog.setValue(progress)

    def finished_copy(self, state):
        # slot for CopyThread.procDone: close once the copy has finished
        self.close()
class CopyThread(QThread):
    """Worker thread that copies one file and reports progress via signals.

    procPartDone emits the integer percentage copied (0-100);
    procDone emits True when the whole copy has finished.
    """
    procDone = Signal(bool)
    procPartDone = Signal(int)

    def __init__(self, parent, source: str, destination: str):
        QThread.__init__(self, parent)
        self.source = source
        self.destination = destination

    def run(self):
        # QThread entry point: do the copy, then announce completion
        self.copy()
        self.procDone.emit(True)

    def copy(self):
        """Block-copy source to destination, emitting percentage updates."""
        source_size = os.stat(self.source).st_size
        copied = 0
        last_percent = -1
        with open(self.source, "rb") as source, open(self.destination, "wb") as target:
            while True:
                # 64 KiB blocks: far fewer syscalls and signal emissions
                # than the original 1 KiB (which flooded the event loop)
                chunk = source.read(64 * 1024)
                if not chunk:
                    break
                target.write(chunk)
                copied += len(chunk)
                # emit an int (the signal is declared Signal(int)); guard
                # zero-byte files against ZeroDivisionError; only emit
                # when the percentage actually changes
                percent = copied * 100 // source_size if source_size else 100
                if percent != last_percent:
                    last_percent = percent
                    self.procPartDone.emit(percent)
        if last_percent < 100:
            # make sure the bar reaches 100% (e.g. for an empty file)
            self.procPartDone.emit(100)
class MainWindow(QMainWindow):
    """Demo window with a single button that starts the copy dialog."""
    def __init__(self, parent: object = None) -> None:
        super().__init__(parent)
        # hard-coded demo paths: edit these before running
        self.src = "/path/to/file.ext"
        self.dest = "/path/to/file.ext"
        self.btn = QPushButton(self)
        self.btn.setText("Start copy")
        self.btn.clicked.connect(self.run)
        self.setCentralWidget(self.btn)

    def run(self):
        # keep the dialog on self so it is not garbage-collected while open
        self.prog = ProgressDialog(self, self.src, self.dest)
        self.prog.start()
def main():
    """Create the Qt application, show the main window, and run the loop."""
    app = QApplication(sys.argv)
    window = MainWindow()
    window.show()
    # exec_() blocks until the last window closes; its code is the exit status
    sys.exit(app.exec_())

if __name__ == "__main__":
    main()
Save this script in the 'main.py' file and execute the command
python3 main.py

The following uses tqdm to generate a progress bar while copying a single file.
from tqdm import tqdm
def copy_with_progress(src, dst):
    """Copy the file *src* to *dst*, showing a tqdm progress bar.

    The bar is sized in bytes (unit_scale gives human-readable KB/MB)
    and advances by the length of each block written.
    """
    # local import: the original snippet used os without importing it
    import os
    size = os.path.getsize(src)
    with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst, \
         tqdm(total=size, unit='B', unit_scale=True,
              desc=f'Copying {src} to {dst}') as pbar:
        while True:
            # 1 MiB blocks instead of 4 KiB: far fewer syscalls and
            # progress-bar refreshes on large files
            chunk = fsrc.read(1024 * 1024)
            if not chunk:
                break
            fdst.write(chunk)
            pbar.update(len(chunk))

Related

Finding text string with pfdminer not consistent [Python]

I've got a question about code that extracts a text string from a PDF file and returns the output in a .csv.
The output is stored in Output.csv. As you can see, it returns the value on p. 27 (where the code works) and on p. 29, but p. 28 is missing. What I want it to return is the text string on p. 28, where the code is not working.
Can somebody tell me what I'm doing wrong? In the 2nd code snippet, pdfminer does read out the proper output that is needed.
import re, csv, os
import sys, time
from tqdm import tqdm
import multiprocessing as mp
from joblib import Parallel, delayed
from pathlib import Path
from io import StringIO
# best-effort bootstrap: import pdfminer, installing it on the fly if missing
try:
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
except ImportError:
    print ("Trying to Install required module: pdfminer\n")
    os.system('python -m pip install pdfminer')
    # -- above lines try to install requests module if not present
    # -- if all went well, import required module again ( for global access)
    # NOTE(review): the re-import after installation is never actually
    # performed here, so the names stay undefined in this run
# method 3: object oriented programming
class Program:
#initialisation (happens when Program() is called for the first time)
def __init__(self):
    """Set up workspace paths and load the Maximo lookup table.

    The lookup table CSV is flattened to a list of its first-column
    values (empty rows dropped) for later comparison.
    """
    # locations
    # this defines the location of the workspace and directory of the data to process
    self.ws_loc = Path("C:/Users/pco/Desktop/workspace")
    self.dat_loc = Path("C:/Users/pco/Desktop/workspace/data/Test")
    # lookuptable
    # this converts the lookuptable from maximo to a list which can be used for comparison
    self.lu_file = self.ws_loc / "lookuptable.csv"
    with open(self.lu_file, newline='') as f:
        reader = csv.reader(f)
        self.lu_list = list(filter(None,list(reader)))
        self.lu_list = [each[0] for each in self.lu_list]
def listener(self,q):
    '''listens for messages on the q (queue), writes (appends) to file (output.csv). '''
    # single writer process: serialises output from all parallel workers
    # open output.csv in location workspace/data/ and use as 'f'
    with open(self.ws_loc / 'output.csv', 'a') as f:
        #start infinite listening loop until 'kill' message is received
        while 1:
            # get the message which is first in q (queue)
            m = q.get()
            # break loop if message is kill and close file 'output.csv'
            if m == 'kill':
                # NOTE(review): redundant close — the 'with' block closes f
                # again on exit (harmless in CPython)
                f.close()
                break
            # if message is not 'kill' then write message to file and flush file
            f.write(m)
            f.flush()
def worker(self, file, q):
    ''' processes a pdf file given by main() and writes output to q (queue)

    Returns the datastack string (one CSV row per tag found, or a single
    row with empty tag columns when nothing matched).
    '''
    # init PDF class (this class is used to get pages from the PDF and process pdftext)
    # NOTE(review): self.PDF is defined elsewhere in this class — not
    # visible in this excerpt
    PDF = self.PDF(self.dat_loc,self.lu_list,0)
    # get all the pages from PDF: contains pages = [page1, ..., pageN]
    # pageN = "bla bla \n bla etc."
    PDFpages = PDF.getPages(file)
    pages = []
    for page in PDFpages:
        pages.append(page)
    # varargs defines extra data for files (this is where metadata is stored)
    # varargs should not be filled here, but it is initialized here.
    varargs = ''
    # check if file is a manual (this can be seen as an example for a varargs entry)
    # it should contain atleast ',' (this creates a new column entry in the csv)
    # PDF.fileCategory() which is a class within the Program class, can be taken as an example
    varargs+= PDF.fileCategory(file,pages) + ',' + PDF.fileSupplier(file, pages) + ',' + PDF.fileRev(file, pages)
    # new vararg can be added like: varargs+= THE_VARARG
    # initialise pageNum (which is a page number identifier inside the for loop)
    pageNum = 1
    # create an empty datastack (which is the message that will be send to q (queue))
    datastack = ''
    # for each page do...
    for page in pages:
        '''!!! for each page look for tags (THIS IS WHERE THE REGEX HAPPENS PDF.find_tag()) !!!'''
        found_strings, found = PDF.find_tag(page)
        # found_stringsrev, foundrev = PDF.find_rev(page)
        # if tags are found, then fix the tags such that they are correct with
        # Program.putStripe() (or self.putStripe()) it changes 12AB1234A to 12-AB-1234-A
        # if foundrev:
        #     string = ''
        #     fixedstring = ''
        #     for stringrev in found_stringsrev:
        #         # fill datastack with found tags
        #         datastack += file + ',' + str(pageNum) + ',' + string + ',' + fixedstring + ',' + stringrev + ',' + varargs + '\n'
        if found:
            for string in found_strings:
                # if correct, do not change
                fixedstring = string
                # check if the tag matches the correct regexpression ('regex' or 're')
                if re.match('^(\d{1,2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$', string)!=None:
                    # else fix the tag
                    fixedstring = self.putStripe(string)
                # fill datastack with found tags
                datastack += file + ',' + str(pageNum) + ',' + string + ',' + fixedstring + varargs + '\n'
        # next page, so pageNum becomes pageNum + 1
        pageNum +=1
    # if the datastack is empty, we are still interested in the varargs:
    # (so empty tag columns are added)
    if datastack=='':
        datastack = file + ',' + ',' + ',' + varargs + '\n'
    # put the datastack message inside of the q (queue)
    q.put(datastack)
    # terminate the PDF class so that the pdf file is closed in a correct way
    PDF.terminate()
    # return (in case the datastack should be printed)
    return datastack
def putStripe(self,input):
'''This function fixes a tag that is not correct'''
# strip the tag from spaces
input = re.sub(' ','',input)
# for each string that matches the expression write to words
words = re.findall('[0-9][A-Za-z]+', input)
words += re.findall('[A-Za-z][0-9]+', input)
# for each match inside the tag add a '-' in the second position
for word in words:
i = input.find(word)+1
input = input[:i] + '-' + input[i:]
# return the fixed tag
return input
def main(self):
try:
# initiate time
t = time.time()
# create pools for paralell pooling (max cpu threads is optained automatically)
pool = mp.Pool(mp.cpu_count() + 2)
# create a manager
manager = mp.Manager()
# from the pool manager create a queue object which can be used to
# exchange data between the worker and listener
q = manager.Queue()
# start up listener first
# ignore warning, it is being used
watcher = pool.apply_async(self.listener, (q,))
# fire off workers (basically assign them jobs)
jobs = []
# NOTE: FOR LOOPS DO NOT CAUSE A LOOP, CODE PROCEEDS WITH PARALLEL THREADING
# AS IF THE RESULT OF EACH LOOP IS INSTANTLY COMPLETED
# each file in the data location is a job
for file in os.listdir(self.dat_loc):
# assign the job to a worker
job = pool.apply_async(self.worker, (file, q))
# append the job to jobs (for data aquisition)
jobs.append(job)
# this is used to get the data back from jobs
for job in tqdm(jobs):
#print('')
#print(job.get()[:-1])
job.get()
# printed elapsed time (good for project management)
print('elapsed time = ' + str(time.time()-t) + ' seconds')
# catch interupt and try to properly terminate workers (might take time)
# best to just do everything in batches and dont interrupt
except KeyboardInterrupt:
print("\nCaught KeyboardInterrupt, terminating workers")
q.put('kill') # <-- makes sure the output.csv is always closed properly
pool.close()
pool.join()
pool.terminate()
SystemExit(1)
# always excecute (kills workers and listener)
finally:
q.put('kill') # <-- makes sure the output.csv is always closed properly
pool.close()
pool.join()
def execute(self):
self.main()
class PDF:
    """Thin wrapper around pdfminer that extracts text from one PDF and
    offers per-page search helpers: category/supplier/revision detection
    and tag-number lookup."""

    def __init__(self, dat_loc, lu_list, maxpages):
        """Set up a reusable pdfminer text-extraction pipeline.

        dat_loc  -- directory (pathlib.Path) containing the PDF files
        lu_list  -- lookup-table rows loaded from the lookup CSV
        maxpages -- page limit handed to PDFPage.get_pages (0 = no limit)
        """
        self.dat_loc = dat_loc
        self.lu_list = lu_list
        # filtered lookup strings; presumably replaced by the caller before
        # find_string_lookup() is used -- iterating the initial 0 would fail
        self.lu_list_f = 0
        self.password = ""
        self.maxpages = maxpages
        self.caching = True
        # one resource manager / converter / interpreter reused for all pages
        self.rsrcmgr = PDFResourceManager()
        self.retstr = StringIO()
        self.laparams = LAParams()
        self.device = TextConverter(self.rsrcmgr, self.retstr, laparams=self.laparams)
        self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
        self.pagenos = set()

    def getPages(self, file):
        """Open `file` inside dat_loc and return the pdfminer page iterator.

        The file handle is kept on self.fp so terminate() can close it later.
        """
        self.fp = open(self.dat_loc / file, 'rb')
        return PDFPage.get_pages(self.fp,
                                 self.pagenos,
                                 maxpages=self.maxpages,
                                 password=self.password,
                                 caching=self.caching,
                                 check_extractable=True)

    def _scan_pages(self, pages, classify, max_pages=4):
        """Return classify(page_text)'s first non-None result over the first
        `max_pages` pages, or None when nothing matches.

        NOTE: `pages` is usually a generator that is shared with later
        processing; this consumes at most max_pages + 1 items from it --
        exactly as many as the original nested-loop implementation did.
        """
        for page_no, page in enumerate(pages, start=1):
            if page_no > max_pages:
                break
            # extract the page text once (the original re-extracted it for
            # every rule, which was wasteful but equivalent)
            result = classify(self.pagestr(page))
            if result is not None:
                return result
        return None

    @staticmethod
    def _first_rule(text, rules):
        """Return the name of the first rule whose keyword occurs anywhere in
        `text`; rule order outranks line order, as in the original loops."""
        lines = text.splitlines()
        for name, keywords in rules:
            if any(any(kw in line for kw in keywords) for line in lines):
                return name
        return None

    def fileCategory(self, file, pages):
        """Return ',<category>' guessed from keywords on the first 4 pages,
        or ',Unreadable' when no rule matches."""
        rules = [
            ('Manual', ['ANLAGE - INSTALLATION', 'User Guide', 'MANUAL', 'Manual', 'manual', 'Handleiding', 'handleiding', 'Instruction', 'instructions', 'Instructie', 'Guide', 'GUIDE']),
            ('Specification', ['SPECIFICATION', 'Specification', 'Specificatie']),
            ('Datasheet', ['DATA BOOK', 'UTILITIES LIST', 'DATA PACKAGE', 'Data Package', 'data-sheet', 'Datasheet', 'DATASHEET', 'datasheet', 'DATA SHEET', 'Data Sheet', 'Data sheet', 'data sheet']),
            ('Spare part list', ['SPARE PARTS LIST']),
            ('Invoice', ['BILL OF MATERIAL', 'invoice', 'Invoice', 'INVOICE', 'Purchase order', 'Purchase Order', 'PURCHASE ORDER']),
            ('Schematic Diagram', ['SCHEMATIC DIAGRAM', 'Schematic Diagram', 'Schematic diagram', 'ISOMETRIC', 'Isometric', 'isometric']),
            ('Checklist', ['Checklist', 'CHECKLIST', 'CHECKSHEET', 'Checksheet']),
            ('Certificates', ['Certificate', 'CERTIFICATE', 'Zertifikat', 'ZERTIFIKAT', 'Certificat', 'CERTIFICAT']),
            ('Required documents list', ['REQUIRED SUBMITTAL DOCUMENTS']),
        ]
        category = self._scan_pages(pages, lambda text: self._first_rule(text, rules))
        return ',' + (category if category is not None else 'Unreadable')

    def fileSupplier(self, file, pages):
        """Return ',<supplier>' guessed from keywords on the first 4 pages,
        or ',Supplier N/A' when no rule matches."""
        rules = [
            ('JE Jacobs', ['JE Jacobs', 'JE JACOBS', 'Jacobs', 'JACOBS']),
            ('Emerson', ['Emerson', 'Emerson Process Management', 'EMERSON']),
            ('Air Liquide', ['Air Liquide', 'AIR LIQUIDE']),
            ('Rosemount', ['ROSEMOUNT', 'Rosemount']),
            ('Deltak', ['Deltak', 'DELTAK']),
            ('AviComp', ['AVICOMP', 'Avicomp', 'avicomp']),
        ]
        supplier = self._scan_pages(pages, lambda text: self._first_rule(text, rules))
        return ',' + (supplier if supplier is not None else 'Supplier N/A')

    def fileRev(self, file, pages):
        """Return ',<first line starting with "Rev">' found on the first 4
        pages, or ',' when there is none."""
        def classify(text):
            for line in text.splitlines():
                if re.match(r'^(Rev.*).*$', line):
                    return line
            return None
        rev = self._scan_pages(pages, classify)
        return ',' + (rev if rev is not None else '')

    def find_string_lookup(self, page, pageNum, file, varargs):
        """Search one page for every normalized lookup tag in lu_list_f.

        Returns (datastack, found): datastack holds unique CSV rows
        'file,tag_name,page<varargs>'; found is True when any tag matched.
        """
        datastack = []
        found = False
        for line in self.pagestr(page).splitlines():
            # compare on alphanumerics only, matching the filtered lookup list
            line = re.sub(r'[^A-Za-z0-9]+', '', line)
            for counter, tag in enumerate(self.lu_list_f):
                if tag in line:
                    found = True
                    row = (file + ',' + str(self.lu_list[counter][0]) + ','
                           + str(pageNum) + varargs + '\n')
                    if row not in datastack:
                        datastack.append(row)
        return datastack, found

    def find_string(self, page, strings, Method=None):
        """Search one page for any of `strings`.

        Method picks the normalization applied to BOTH page lines and the
        search strings before comparison:
          'ALPHABET_NUM_ONLY'     -- strip non-alphanumerics
          'ALPHABETCAPS_NUM_ONLY' -- strip non-alphanumerics and upper-case
          'ALPHABETCAPS'          -- upper-case only
          None                    -- compare verbatim
        Returns (matches, found); matches holds the ORIGINAL strings that
        were found, without duplicates.
        """
        def normalize(s):
            if Method == 'ALPHABET_NUM_ONLY':
                return re.sub(r'[^A-Za-z0-9]+', '', s)
            if Method == 'ALPHABETCAPS_NUM_ONLY':
                return re.sub(r'[^A-Za-z0-9]+', '', s).upper()
            if Method == 'ALPHABETCAPS':
                return s.upper()
            return s

        tags = [normalize(s) for s in strings]
        datastack = []
        found = False
        for line in self.pagestr(page).splitlines():
            line = normalize(line)
            for i, tag in enumerate(tags):
                # an empty tag would match every line; skip it (as before)
                if tag and tag in line:
                    found = True
                    if strings[i] not in datastack:
                        datastack.append(strings[i])
        return datastack, found

    def find_tag(self, page):
        """Collect unique tag numbers (e.g. '12-AB-1234-A') from one page.

        Returns (tags, found).  The pattern tolerates missing or space
        separators, so unstriped tags like '12AB1234A' are also picked up.
        """
        datastack = []
        found = False
        for line in self.pagestr(page).splitlines():
            for tag in re.findall(r'^(\d{2}[ -]{,1}[A-Z]{1,4}[ -]{,1}\d{4}[ -]{,1}[A-Z]*).*$', line):
                if tag not in datastack:
                    datastack.append(tag)
                    found = True
        return datastack, found

    def pagestr(self, page):
        """Run `page` through the shared interpreter and return its text."""
        # reset the shared StringIO so only this page's text is returned
        self.retstr.truncate(0)
        self.retstr.seek(0)
        self.interpreter.process_page(page)
        return self.retstr.getvalue()

    def terminate(self):
        """Close the PDF file handle and release the pdfminer resources."""
        self.fp.close()
        self.device.close()
        self.retstr.close()
# Run the extraction pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    program = Program()
    program.execute()
If I read out the PDF with this code in Python (also with pdfminer):
from pathlib import Path
from io import StringIO
try:
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
except ImportError:
    # pdfminer is missing: attempt a one-shot install, then import for real
    print("Trying to Install required module: pdfminer\n")
    import os  # local import: 'os' is not guaranteed to be in scope here
    os.system('python -m pip install pdfminer')
    # -- re-import after installation (the original never did this, so every
    # -- later pdfminer name would have raised NameError on a fresh machine)
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
class glb():
    """Global configuration / lookup holder (everything below runs once, at
    class-definition time)."""
    # NOTE(review): machine-specific hardcoded paths -- parameterize before reuse
    workspace_folder = Path('C:/Users/pco/Desktop/workspace')
    data_folder = Path('C:/Users/pco/Desktop/workspace/data/Test')
    lookup_file = workspace_folder / "lookuptable.csv"
    # the lookup CSV is read as a side effect of defining the class;
    # requires `csv` and `re` to be imported elsewhere in the file
    with open(lookup_file, newline='') as f:
        reader = csv.reader(f)
        lookup_list = list(reader)
    # keep only the alphanumeric characters of every row (drops empty results)
    lookup_list_filtered = list(filter(None,[re.sub('[^A-Za-z0-9]+', '', str(line)) for line in lookup_list]))
def find_tagnumbers(path):
    """Debug helper: extract and print the text of every page of the PDF at
    `path` using a throwaway pdfminer pipeline; always returns 1."""
    pagelines = []
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()
    page_no = 1
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)
        # grab this page's text, then reset the buffer for the next page
        page_str = retstr.getvalue()
        pagelines.append(page_str.splitlines())
        retstr.truncate(0)
        retstr.seek(0)
        page_no +=1
    # undo the final increment so page_no equals the page count
    # (page_no is otherwise unused -- presumably left over from debugging)
    page_no +=-1
    print(pagelines)
    fp.close()
    device.close()
    retstr.close()
    return 1

# module-level call with a hardcoded test file (runs on import)
find_tagnumbers('C:/Users/pco/Desktop/workspace/data/Test/1845613_1_27_Marked.pdf')
it does return 47-AT-0053. But if I run the code below, it doesn't write that value to the output file. The output shown is what I get when I print pagelines.
P.S. My coding skills are beginner-level (so I write out all the steps).

New class or new .py Python

I'm writing a Python script to check if a file is added to a folder with watchdog, that file is going to be added to a queue.
My idea is to add the filename to a .txt file, then run a new class that watches that file and executes a command line to start, for example, FME.
Is it best to write a new .py for every program I want to open — for example, one for FME and one for Notepad?
I still want the watchdog class to go into the background.
looking_for_files_and_adding_to_queue py
looking_in_queue_for_the_next_in_line_and_direct_to_3_party py
FME py
Notepad py
and so on...
Or on all.py
class looking_for_files_and_adding_to_queue
class looking_in_queue_for_the_next_in_line_and_direct_to_3_party
class FME
class Notepad
Today my script looks like this:
import time
import sys
import os
import datetime
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler
class MyHandler(PatternMatchingEventHandler):
    """Watchdog handler: every *.tif modification event alternates between a
    'file is loading' pass and a 'process with FME, then move' pass,
    tracked by count_move."""
    patterns = ["*.tif"]
    # toggles between 0 (file still being written) and 1 (ready to process)
    count_move = 0
    def process(self, event):
        if self.count_move == 1:
            # the file will be processed there
            folder = "P:\\03_auto\\Indata"
            indata = event.src_path
            #Makes a new folder in Utdata based on filename
            # NOTE(review): relies on fixed path offsets ([:11], [18:29]) --
            # this breaks if the watched root path ever changes
            newfolder = os.path.join(folder[:11], str("Utdata\\orto"), event.src_path[18:29])
            if not os.path.exists(newfolder):
                os.makedirs(newfolder)
            #Log and print start of FME
            print(time.strftime('%a %H:%M:%S') + ": FME " + event.src_path[18:] + " startats i FME.")
            log_file = open("P:\\03_auto\\log.txt", "a")
            log_file.write(time.strftime('%a %H:%M:%S') + ": FME " + event.src_path[18:] + " startats i FME.\n")
            log_file.close()
            #Starting and executing FME (blocking call)
            var_fme = str('fme.exe "P:\\03_auto\\Script\\tiff_to_milti_jpg_tiff\\tif_to_multi-jpg-tiff.fmw" --SourceDataset_TIFF "') + indata + str('" --FEATURE_TYPES "" --DestDataset_JPEG "') + newfolder + str('" --DestDataset_JPEG_5 "') + newfolder + str('" --DestDataset_JPEG_4 "') + newfolder + str('" --DestDataset_GEOTIFF "') + newfolder + str('" --DestDataset_GEOTIFF_3 "') + newfolder + str('"')
            os.system(var_fme)
            #Log and print move file
            print(time.strftime('%a %H:%M:%S') + ": Flytt " + event.src_path[18:] + " har flyttats till" + newfolder + "\nTransformering klar\n")
            log_file = open("P:\\03_auto\\log.txt", "a")
            log_file.write(time.strftime('%a %H:%M:%S') + ": Flytt " + event.src_path[18:] + " har flyttats till" + newfolder + "\nTransformering klar\n\n")
            log_file.close()
            #Move original file to Utdata\orto
            file_move = newfolder + indata[17:]
            os.rename(indata, file_move)
            #Resets the toggle so the next event is treated as 'loading'
            self.count_move = 0
        else:
            #Log and print loading file while transferring
            print(time.strftime('%a %H:%M:%S') + ": Laddar " + event.src_path[18:] + " startar inladdning.")
            log_file = open("P:\\03_auto\\log.txt", "a")
            log_file.write(time.strftime('%a %H:%M:%S') + ": Laddar " + event.src_path[18:] + " startar inladdning.\n")
            log_file.close()
            #Sets counter to 1 which enables the FME part
            self.count_move += 1
    def on_modified(self, event):
        # every modification event flips the loading/processing state
        self.process(event)
if __name__ == '__main__':
    # Watch the input tree recursively and hand matching events to MyHandler.
    watch_root = "P:\\03_auto\\Indata"
    observer = Observer()
    observer.schedule(MyHandler(), watch_root, recursive=True)
    observer.start()
    try:
        # idle loop: the observer does its work on a background thread
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
tl;dr keep everything in one file for now, split subsequently while refactoring when the file becomes huge.
Python does not force you split classes / functions into modules. We as programmers make that call for solely the purpose of readability and maintainability.
While refactoring, I personally look at splitting functions longer than ~40–50 lines and files longer than ~1000 lines, and I try to keep closely related things together.
High cohesion and low coupling
is a characteristic of good software.
Also, since you seem to be starting out with this project I would recommend you to first concentrate on making a version that works, thereafter refactor it to improve code quality.
premature optimization is the root of all evil.
I am assuming that you are looking for suggestions to improve code quality here, so here are a few things you might be also be interested in:
follow pep8 standards: https://pep8.org
make your functions / methods accept parameters instead of hardcoding them eg the path of the folder you are watching.
make your program capable of resuming operations even after erroneous / abrupt termination: eg store state with a file or database
instead of trying to implement a queue yourself use robust systems like rabbitmq or redis.
write functions / methods that perform only one operation and do it well.
This is how far I have gotten. Now I have to get the files from the queue to FME.
import time
import sys
import os
import datetime
import arrow
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler
from shutil import copy
class Queue:
    """Minimal FIFO queue built on a list.

    enqueue() inserts at the front and dequeue() pops from the back, so
    items leave in arrival order.  (collections.deque would be O(1) at both
    ends; the list is kept to preserve the original behavior.)
    """
    def __init__(self):
        self.items = []
    def isEmpty(self):
        """Return True when the queue holds no items."""
        return self.items == []
    def enqueue(self, item):
        """Add `item` to the queue."""
        self.items.insert(0, item)
    def dequeue(self):
        """Remove and return the oldest item.

        BUG FIX: the original popped the item but discarded the return
        value, so callers could never see what was dequeued.
        """
        return self.items.pop()
    def size(self):
        """Return the number of queued items."""
        return len(self.items)
    def printqueue(self):
        """Print the items as '1: <newest> ... n: <oldest>'."""
        for i, item in enumerate(self.items, start=1):
            print(str(i) + ": " + item)
class MyHandler(PatternMatchingEventHandler):
    """Watchdog handler that queues newly created .tif mosaics and project
    PDF reports, copying each into its working/delivery folder."""
    patterns = ["*.tif","*.pdf"]
    # single queue shared by all handler instances (class attribute)
    q = Queue()
    def on_created(self, event):
        file_name = os.path.basename(event.src_path)
        file_type = file_name.split(".")[-1]
        file_path = "path"  # NOTE(review): placeholder literal, never a real path
        file_name_path = event.src_path
        # presumably meant to flag mosaic files; result is unused -- TODO confirm
        endwith = file_name.endswith("_mosaic_group1.tif")
        new_folder = "C:\\FME_workdir\\"
        new_path = new_folder + file_name
        #create new temp folder for FME
        if not os.path.exists(new_folder):
            os.makedirs(new_folder)
        #get tif file from project
        if file_name.endswith("_mosaic_group1.tif") and not os.path.exists(new_path):
            print("Queue:")
            self.q.enqueue("[" + file_name + ", " + file_name_path + ", " + new_path + ", " + file_type + "]")
            self.q.printqueue()
            print("\n")
            #fme = Fme()
            #return fme.runfme(file_name, file_path, file_name_path)
            #copy file to FME folder
            # NOTE(review): this inner check is redundant here -- the branch
            # above already required `not os.path.exists(new_path)`
            if not os.path.exists(new_path):
                copy(file_name_path, new_path)
        #get the PDF report
        elif file_name.endswith("_report.pdf") and "1_initial" in file_name_path:
            pdf_path = os.path.dirname(file_name_path)
            # rebuild the delivery path from fixed path components;
            # assumes a specific folder depth -- TODO confirm against real paths
            pdf_path_new_path = "\\".join(pdf_path.split("\\")[:3])
            pdf_path_new_dir = "\\".join(pdf_path.split("\\")[5:6])
            date_now = str(time.strftime("%y%m%d"))
            pdf_new_path = pdf_path_new_path + "\\03_leverans\\" + pdf_path_new_dir + "_" + date_now
            pdf_new_path_filename = pdf_new_path + "\\" + file_name
            if not os.path.exists(pdf_new_path):
                os.makedirs(pdf_new_path)
            copy(file_name_path, pdf_new_path_filename)
            #put into the queue system
            self.q.enqueue("[" + file_name + ", " + file_name_path + ", " + pdf_new_path + ", " + file_type + "]")
            self.q.printqueue()
class Fme:
    """Placeholder for launching an FME job on a queued file."""
    def runfme(self, file_name, file_path, file_name_path):
        """Log the file that would be handed to FME.

        BUG FIX: the original printed self.file_name, which does not exist
        (file_name is a parameter, not an attribute) and raised
        AttributeError on every call.
        """
        print("FME: " + file_name)
if __name__ == '__main__':
    # Watch the whole drive recursively and hand create events to MyHandler.
    watch_root = "P:\\"
    observer = Observer()
    observer.schedule(MyHandler(), watch_root, recursive=True)
    observer.start()
    try:
        # idle loop; the observer thread does the actual work
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
        print("stop")
    observer.join()
    input('Press ENTER to exit')

PyQt5 GUI crashes when traversing iglob iterator on large directory

from PyQt5.QtCore import *
from PyQt5.QtGui import *
from PyQt5.QtWidgets import *
import os
from os import scandir, walk
from time import sleep
from glob import iglob, glob
class FileRead(QWidget):
    """Small PyQt5 widget: a progress dialog, two labels and a button that
    kicks off a directory scan for .txt files."""
    def __init__(self, parent=None):
        super(FileRead, self).__init__(parent)
        self.resize(600, 400)
        self.layout = QVBoxLayout()
        # progress dialog embedded in the layout rather than shown modally
        self.loader = QProgressDialog("File read progress", "stop showing progress", 0, 100)
        self.layout.addWidget(self.loader)
        self.curfile = QLabel("CURRENT FILE")
        self.layout.addWidget(self.curfile)
        self.dirs = QLabel("DIRECTORY LISTING")
        self.layout.addWidget(self.dirs)
        self.btn = QPushButton("read files")
        # NOTE(review): the slot runs on the GUI thread, so the scan blocks
        # the Qt event loop for its whole duration
        self.btn.clicked.connect(self.wrapper)
        self.layout.addWidget(self.btn)
        self.setLayout(self.layout)
        # accumulates every .txt path found by file_names()
        self.filenames = []
    def wrapper(self):
        # hardcoded start directory for the scan
        self.file_names('.')
This method, process_subdir, is what causes the most problems:
def process_subdir(self, path, toplevel=True):
txts = iglob(path + "/**/*.txt", recursive=True)
for t in txts:
print(t)
    def file_names(self, start_path = '.'):
        """Scan start_path for .txt files and subdirectories, updating the
        progress dialog and labels as it goes.

        NOTE(review): runs entirely on the GUI thread; the sleep(1) calls
        and process_subdir() block the Qt event loop, which is why the
        window freezes / appears to crash on large directories.
        """
        dirs = glob(start_path + '/*/')
        d_len = len(dirs)
        files = glob(start_path + '/*.txt')
        f_len = len(files)
        total_len = d_len + f_len
        # first path component after the separator;
        # assumes Windows-style '\\' separators in glob results -- TODO confirm
        toplevel = [(d.split('\\'))[1] for d in dirs]
        # setting the text to empty for now
        self.dirs.setText("\n".join(toplevel))
        ctr = 0
        for f in files:
            ctr += 1
            if (f.endswith('.txt')):
                self.filenames.append(f)
                self.curfile.setText(f)
                self.loader.setValue(((ctr)/total_len)*100)
                sleep(1)
        for idx, d in enumerate(dirs):
            ctr += 1
            write_str = ""
            # rebuild the whole listing, marking per-directory progress
            for n_idx, n_d in enumerate(dirs):
                n_d = (n_d.split('\\'))[1]
                if (n_idx < idx):
                    write_str += n_d + "(done)\n"
                elif (n_idx == idx):
                    write_str += n_d + "(in progress)\n"
                else:
                    write_str += n_d + "\n"
            self.dirs.setText(write_str)
            self.loader.setValue(((ctr)/total_len)*100)
            self.process_subdir(d)
            sleep(1)
        # final pass: mark everything as done
        write_str = ""
        for d in dirs:
            d = (d.split('\\'))[1]
            write_str += d + "(done)\n"
        self.dirs.setText(write_str)
        print(self.filenames)
if __name__ == '__main__':
    ...  # entry-point body elided in the original post (would create the QApplication and show FileRead)
What I am attempting to do in this program is read through all the files in a certain directory and check their file extensions. For this section of code, it is checking for .txt files. The method I use for this is to allow the user to click a button, and scan a hardcoded (for now) directory for the text files. It works for smaller directories, and doesn't end up crashing, but when I attempt to run the program and parse larger directories, the GUI tends to crash. Something I also noticed is that when I run the program on a smaller directory, I can no longer interact with the GUI. Is there a way to
a) prevent the GUI from crashing when parsing large directories?
b) allow the GUI to still be interacted with even when the file parsing is going on?
The solution I used was to use QThreads instead of running a for loop for every d in dir. This way the GUI event loop isn't blocked and it doesn't look like the GUI crashes. In addition the program runs faster as a whole.

Filename convention for TimedRotatingFileHandler log? (python)

Python 2.7:
Every time a log experiences a rollover event (logs with a RotatingFileHandler) a 'backup' log is generated.
For instance :
logFile = 'general.log'
file_handler = logging.handlers.TimedRotatingFileHandler(logFile,when="midnight")
Results in midnight roll over and on rollover event the following file is created:
general.log.2015-01-21
Does this module offer any flexibility how these filenames are structured?
ie use a different convention ... 20150121_general.log
Short answer is no: according to TimedRotatingFileHandler documentation you have no way to do it.
The suffix change based on when parameter like you can see in the code https://hg.python.org/cpython/file/2.7/Lib/logging/handlers.py#l187
From the same source code you can see that override suffix is simple but you must override extMatch too:
class MyTimedRotatingFileHandler(TimedRotatingFileHandler):
    """TimedRotatingFileHandler whose rotated files get a YYYYMMDD suffix."""
    def __init__(self, *args, **kwargs):
        super(MyTimedRotatingFileHandler, self).__init__(*args, **kwargs)
        # suffix and extMatch must stay in sync: extMatch is what
        # getFilesToDelete() uses to recognise previously rotated files
        self.suffix = "%Y%m%d"
        self.extMatch = re.compile(r"^\d{4}\d{2}\d{2}$")
Unfortunately replace dot separator and swap suffix and basename is not so simple and you must rewrite doRollover() and getFilesToDelete() methods.
A hack could be something like this (untested)... I hope it works, but I cannot give any warranty :)
class MyTimedRotatingFileHandler(TimedRotatingFileHandler):
self.extMatch = r"^\d{4}-\d{2}-\d{2}$"
def getFilesToDelete(self):
""" CUT, PASTE AND .... HACK
"""
dirName, baseName = os.path.split(self.baseFilename)
fileNames = os.listdir(dirName)
result = []
extMatch = re.compile(r"^\d{4}\d{2}\d{2}$")
ends = "_" + baseName + ".log"
elen = len(ends)
for fileName in fileNames:
if fileName[-elen:] == ends:
date = fileName[-elen:]
if self.extMatch.match(date):
result.append(os.path.join(dirName, fileName))
result.sort()
if len(result) < self.backupCount:
result = []
else:
result = result[:len(result) - self.backupCount]
return result
def doRollover(self):
"""
CUT AND PAST FROM TimedRotatingFileHandler
customize file name by prefix instead suffix
"""
if self.stream:
self.stream.close()
self.stream = None
# get the time that this sequence started at and make it a TimeTuple
currentTime = int(time.time())
dstNow = time.localtime(currentTime)[-1]
t = self.rolloverAt - self.interval
if self.utc:
timeTuple = time.gmtime(t)
else:
timeTuple = time.localtime(t)
dstThen = timeTuple[-1]
if dstNow != dstThen:
if dstNow:
addend = 3600
else:
addend = -3600
timeTuple = time.localtime(t + addend)
#################################################
# THE HACK!!!! ##################################
##################################################
dfn = time.strftime("%Y%m%d", timeTuple) + "_" +self.baseFilename + ".log"
if os.path.exists(dfn):
os.remove(dfn)
# Issue 18940: A file may not have been created if delay is True.
if os.path.exists(self.baseFilename):
os.rename(self.baseFilename, dfn)
if self.backupCount > 0:
for s in self.getFilesToDelete():
os.remove(s)
if not self.delay:
self.stream = self._open()
newRolloverAt = self.computeRollover(currentTime)
while newRolloverAt <= currentTime:
newRolloverAt = newRolloverAt + self.interval
#If DST changes and midnight or weekly rollover, adjust for this.
if (self.when == 'MIDNIGHT' or self.when.startswith('W')) and not self.utc:
dstAtRollover = time.localtime(newRolloverAt)[-1]
if dstNow != dstAtRollover:
if not dstNow: # DST kicks in before next rollover, so we need to deduct an hour
addend = -3600
else: # DST bows out before next rollover, so we need to add an hour
addend = 3600
newRolloverAt += addend
self.rolloverAt = newRolloverAt

Getting the memory layout out of an (avr)elf file by useing python + pyElftools

I am creating my own bootloader for an ATXmega128A4U. To use the bootloader I want to transform the ELF file of the firmware into a memory map used in the ATXmega.
For that I use Python and the module "pyelftools". Its documentation is poor, so I ran into a problem: I do not know what information I can use to get the address, offset etc. from the data in the sections.
My goal is to create a bytearray, copy the data/code into it and transfer it to the bootloader. Below is my code:
import sys
# If pyelftools is not installed, the example can also run from the root or
# examples/ dir of the source distribution.
sys.path[0:0] = ['.', '..']
from elftools.common.py3compat import bytes2str
from elftools.elf.elffile import ELFFile
# 128k flash for the ATXmega128a4u
flashsize = 128 * 1024
def process_file(filename):
    """Copy the .data and .text sections of the ELF file into a flash image.

    NOTE(review): this is the question's broken version.  It reads bytes
    from section.stream without first seeking to the section's file offset,
    and it places them at sh_offset (a file offset) rather than a load
    address -- see the corrected version that uses section.data()/sh_addr.
    """
    with open(filename, 'rb') as f:
        # get the data
        elffile = ELFFile(f)
        dataSec = elffile.get_section_by_name(b'.data')
        textSec = elffile.get_section_by_name(b'.text')
        # prepare the memory
        flashMemory = bytearray(flashsize)
        # the data section
        startAddr = dataSec.header.sh_offset
        am = dataSec.header.sh_size
        i = 0
        while i < am:
            # reads from the stream's CURRENT position -- no seek to the
            # section offset was performed, hence the invalid data
            val = dataSec.stream.read(1)
            flashMemory[startAddr] = val[0]
            startAddr += 1
            i += 1
        # the text section
        startAddr = textSec.header.sh_offset
        am = textSec.header.sh_size
        i = 0
        while i < am:
            print(str(startAddr) + ' : ' + str(i))
            val = textSec.stream.read(1)
            flashMemory[startAddr] = val[0]
            startAddr += 1
            i += 1
        print('finished')

if __name__ == '__main__':
    process_file('firmware.elf')
Hope someone can tell me how to solve this problem.
I managed to solve the problem.
Don't read the data manually from the stream via "textSec.stream.read"; use "textSec.data()" instead. Internally (see "sections.py") a seek operation on the file is done before the data is read, so the result is the valid data chunk.
The following code reads the code(text) section of a atxmega firmware and copies it into a bytearray which has the layout of the flash of an atxmega128a4u device.
@vlas_tepesch: the hex conversion is not needed and the 64k pitfall is avoided.
# allow running from the pyelftools source tree as well
sys.path[0:0] = ['.', '..']
from elftools.common.py3compat import bytes2str
from elftools.elf.elffile import ELFFile
# 128k flash for the ATXmega128a4u
flashsize = 128 * 1024
def __printSectionInfo(s):
    """Print one summary line for an ELF section: index, name, type,
    load address, file offset and size."""
    header = s.header
    line = '[{nr}] {name} {type} {addr} {offs} {size}'.format(
        nr=header['sh_name'],
        name=s.name,
        type=header['sh_type'],
        addr=header['sh_addr'],
        offs=header['sh_offset'],
        size=header['sh_size'],
    )
    print(line)
def process_file(filename):
    """Load the .text section of the ELF file into an ATXmega-sized flash image.

    Uses section.data() -- which seeks to the correct file offset
    internally -- and the section's load address sh_addr, instead of raw
    stream reads at sh_offset.
    """
    print('In file: ' + filename)
    with open(filename, 'rb') as f:
        # get the data
        elffile = ELFFile(f)
        print ('sections:')
        for s in elffile.iter_sections():
            __printSectionInfo(s)
        print ('get the code from the .text section')
        textSec = elffile.get_section_by_name(b'.text')
        # prepare the memory
        flashMemory = bytearray(flashsize)
        # the text section: one zero-copy slice assignment, no 1-byte loop
        startAddr = textSec.header['sh_addr']
        val = textSec.data()
        flashMemory[startAddr:startAddr+len(val)] = val
        # print memory
        print('finished')

if __name__ == '__main__':
    process_file('firmware.elf')
Thanks for the comments!

Categories