Keep the latest file and delete all others - Python

In my folder there are many PDF files with a date-timestamp naming format, as shown below.
I would like to keep only the latest file for each day and delete the rest for that day. How can I do this in Python?
2012-07-13-15-13-27_1342167207.pdf
2012-07-13-15-18-22_1342167502.pdf
2012-07-13-15-18-33_1342167513.pdf
2012-07-23-14-45-12_1343029512.pdf
2012-07-23-14-56-48_1343030208.pdf
2012-07-23-16-03-45_1343034225.pdf
2012-07-23-16-04-23_1343034263.pdf
2012-07-26-07-27-19_1343262439.pdf
2012-07-26-07-33-27_1343262807.pdf
2012-07-26-07-51-59_1343263919.pdf
2012-07-26-22-38-30_1343317110.pdf
2012-07-26-22-38-54_1343317134.pdf
2012-07-27-10-43-27_1343360607.pdf
2012-07-27-10-58-40_1343361520.pdf
2012-07-27-11-03-19_1343361799.pdf
2012-07-27-11-04-14_1343361854.pdf
Should I fill a list and sort it, then? The desired output is:
2012-07-13-15-18-33_1342167513.pdf
2012-07-23-16-04-23_1343034263.pdf
2012-07-26-22-38-54_1343317134.pdf
2012-07-27-11-04-14_1343361854.pdf
Thanks

Your desired list can also be achieved using groupby.
from itertools import groupby
import os

filtered_list = list()
names = sorted(os.listdir('.'))  # groupby needs the names sorted
for key, group in groupby(names, lambda x: x[:10]):  # groups on the first 10 characters (the date part)
    filtered_list.append(list(group)[-1])  # picks the last (latest) file from each group
print filtered_list
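To actually delete the files that were not kept, a small untested follow-up to the snippet above could be:
# untested follow-up: remove every PDF that is not the kept (latest) file of its day
for name in names:
    if name.endswith('.pdf') and name not in filtered_list:
        os.unlink(name)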

Sort the list and delete a file if the next file in the list is from the same day:
import glob
import os

files = glob.glob("*.pdf")
files.sort()
for ifl, fl in enumerate(files[:-1]):
    if files[ifl + 1].startswith(fl[:10]):  # check if the next file is from the same day
        os.unlink(fl)                       # it is - delete the current file
Edit:
As the OP's question became clearer, it became evident that not just the last file of the list is required, but the latest file of each day; to achieve this I included the "same day" condition on the unlinking.

You could do it that way. The following code is untested, but may work:
import os

names = os.listdir('.')
names.sort()
for f in names[:-1]:
    os.unlink(f)
Fortunately your file names use the ISO 8601 date format, so a plain textual sort achieves the desired order with no need to parse the dates.
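Note that the snippet above keeps only the single newest file overall. To keep the newest file of each day under the same textual-sort idea, here is a hedged, untested variant:
import os

names = sorted(n for n in os.listdir('.') if n.endswith('.pdf'))
latest_per_day = {}
for name in names:
    latest_per_day[name[:10]] = name  # later names overwrite earlier ones for the same day

for name in names:
    if name != latest_per_day[name[:10]]:
        os.unlink(name)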

The following snippet works with the test case given.
import os

files = os.listdir(".")
days = set(fname[:10] for fname in files)  # the YYYY-MM-DD prefix of each name
for d in days:
    f = [i for i in files if i[:10] == d]
    for x in sorted(f)[:-1]:
        os.remove(x)

Using a dictionary you can keep one value per day. This is a dirty and quick solution, maybe not the best.
#!/usr/bin/env python
import os
import shutil
from shutil import copyfile

lst = []
dc = {}
os.chdir(".")
for files in os.listdir("."):
    if files.endswith(".pdf"):
        lst.append(files)
lst.sort()  # sort so that the latest file of each day is inserted last
for x in lst:
    print x[0:10].replace("-", "")
    dc[int(x[0:10].replace("-", ""))] = x  # later files overwrite earlier ones for the same day
flist = []
for k, v in dc.items():
    flist.append(v)
dir = "tmpdir"
if not os.path.exists(dir):
    os.makedirs(dir)
for x in flist:
    print x
    copyfile(x, dir + "/" + x)  # copy the files to keep into tmpdir
for files in os.listdir("."):
    if files.endswith(".pdf"):
        os.unlink(files)  # delete all PDFs from the original folder
os.chdir("./tmpdir")
for files in os.listdir("."):
    if files.endswith(".pdf"):
        copyfile(files, "../" + files)  # copy the kept files back
os.chdir("../")
shutil.rmtree(os.path.abspath(".") + "/tmpdir")

Related

How to count the number of files of each extension in a directory using Python?

I'm fairly new to Python and I came across this problem:
I want to write a Python script that counts the number of files of each extension in a directory and outputs the following details.
The first column shows the file count.
The second column shows the file name in padded format.
The third column shows the frame-number range.
Example:
files in the directory:-
alpha.txt
file01_0040.rgb
file01_0041.rgb
file01_0042.rgb
file01_0043.rgb
file02_0044.rgb
file02_0045.rgb
file02_0046.rgb
file02_0047.rgb
Output:-
1 alpha.txt
4 file01_%04d.rgb 40-43
4 file02_%04d.rgb 44-47
I'd suggest you have a look at Python's built-in pathlib library (it has its own glob method).
Here's a first idea of how you could do it (surely this can be improved, but it should give you the basics):
from pathlib import Path
from itertools import groupby

basepath = Path(r"path-to-the-folder")
# get all filenames that follow the pattern
files = basepath.glob("file*_[0-9][0-9][0-9][0-9].rgb")
# split each stem so that we get the two numbers separately, e.g. "file01_0040" -> ["01", "0040"]
patterns = sorted(i.stem.lstrip("file").split("_") for i in files)  # groupby needs sorted input
# group by the first number
groups = groupby(patterns, key=lambda x: x[0])
filenames = list()
# loop through the obtained groups and extract the count and the min/max frame ids found
for file_num, group in groups:
    file_ids = [int(i[1]) for i in group]
    filenames.append(f"{len(file_ids)} file{file_num}_%04d.rgb {min(file_ids)}-{max(file_ids)}")
print(*filenames, sep="\n")
You can use the glob library to search through directories like this:
import glob
glob.glob('*.rgb')
This call returns the names of all files ending with .rgb in a list for you to sort and process.
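For the per-extension count in the question's title, here is a minimal untested sketch using collections.Counter (the folder path is a placeholder):
from collections import Counter
from pathlib import Path

# "path-to-the-folder" is a placeholder; count the files per extension
counts = Counter(p.suffix for p in Path("path-to-the-folder").iterdir() if p.is_file())
print(counts)  # e.g. Counter({'.rgb': 8, '.txt': 1})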

How to get the latest folder in a directory using Python

I need to retrieve the path of the most recently created folder. I am using a program that outputs a new run## folder each time it is executed (i.e. run01, run02, run03 and so on). Within any one run## folder resides a data file that I want to analyze (file-i-want.txt).
run_numb = 'run01'
dir = os.path.dirname(__file__)
filepath = os.path.join(dir, r'..\data\directory', run_numb, 'file-i-want.txt')
In short I want to skip having to hardcode in run## and just get the directory of a file within the most recently created run## folder.
You can get the creation date with os.stat (note that st_birthtime is only available on some platforms, such as macOS; elsewhere use st_mtime or st_ctime):
import os

path = '/a/b/c'

# newest
newest = max(os.listdir(path), key=lambda x: os.stat(os.path.join(path, x)).st_birthtime)

# all files sorted, newest first
sorted_files = sorted(os.listdir(path), key=lambda x: os.stat(os.path.join(path, x)).st_birthtime, reverse=True)
pathlib is recommended over os for filesystem-related tasks (see the pathlib reference).
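A pathlib-based sketch of the same idea (untested; it assumes the run## folders live under path and uses the modification time, which is available on all platforms):
from pathlib import Path

path = Path('/a/b/c')
# pick the run## directory with the newest modification time
newest = max((d for d in path.iterdir() if d.is_dir()), key=lambda d: d.stat().st_mtime)
print(newest / 'file-i-want.txt')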
You can try:
from pathlib import Path

filepath = Path(__file__).parent / 'data/directory'
fnames = sorted(Path(filepath).rglob('file-i-want.txt'), key=lambda x: x.stat().st_mtime, reverse=True)
filepath = str(fnames[0])
filepath
glob.glob('run*') returns the list of files/directories that match the pattern; once sorted, they are ordered by name.
So if you want the latest run, your code will be:
import glob

print(sorted(glob.glob('run*'))[-1])  # raises IndexError if there are no runs
IMPORTANT: the names are ordered alphabetically, so, for example, 'run21' will come AFTER 'run100'. You will need to use enough digits (zero padding) to avoid this problem, or just count the number of matched folders and recreate the name of the latest folder from that number.
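Alternatively, a hedged, untested sketch that picks the highest run number regardless of zero padding, by extracting the numeric suffix with re:
import glob
import re

# pick the run folder with the largest numeric suffix, so 'run100' beats 'run21'
# (assumes at least one match and that every matched name contains digits)
runs = glob.glob('run*')
latest = max(runs, key=lambda name: int(re.search(r'\d+', name).group()))
print(latest)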
You can use glob to count the files matching the same name pattern:
import glob

n = len(glob.glob('run*'))  # number of entries whose name starts with 'run'
new_run_name = 'run' + str(n)
Note: with this code the numbering starts from 0; if you want to start from 1, just add 1 to n.
If you always want a two-digit run number (00, 01, 02), use 'str(n).zfill(2)' instead of 'str(n)'.
example:
import glob
n = len(glob.glob('run*'))  # number of entries whose name starts with 'run'
new_run_name = 'run' + str(n + 1).zfill(2)

How to loop reading files group by group?

Generally, I loop through files one by one in Python. Now I want to loop through them group by group. How do I read them efficiently?
Here's an example to explain my question.
Given files like these:
group1: m2000_01, m2000_02,..., m2000_12
group2: m2001_01, m2001_02,...., m2001_12
.....
group17: m2016_01, m2016_02,...., m2016_12
I want to read the files of the same year together for calculation, and loop along the time series for batching. Pseudo-code as follows:
for year in [2000, 2001, ..., 2016]:
    A = open(m2000_01), B = open(m2000_02), C = open(m2000_03) ....  # reading-files section
    mean2000 = (A + B + C ...) / 12
    # calculation body; how do I set variables for each file, such as A=m2000_01, B=m2000_02, ...?
    # use a dict to set these files?
print mean2000, mean2001, ..., mean2016  # the result I want
Maybe I could make a list and then loop over its elements for matching (a sieve) and extracting each group of files. But what if there are many groups of files and the group keywords (such as 2000 in the example above) are irregular? Is there any common method for solving similar problems? I think there is a proven approach, but I don't know how to describe it or search for it. Please forgive me if this problem is simple.
This will do
import os

path = "your\\path"
all_files = [x for x in os.listdir(path) if os.path.isfile(path + "\\" + x)]
for year in range(2000, 2017):
    for file_name in [y for y in all_files if str(year) in y]:
        sub_file_path = path + "\\" + file_name
        # read the file here; insert appropriate code yourself
You can find and group the files for processing using os.listdir(), along with the re regex module, and the itertools.groupby() function to do something along these lines:
from itertools import groupby
import os
import re

folder_path = 'data_folder'
pattern = r'm\d\d\d\d_\d\d'
filenames = [filename for filename in sorted(os.listdir(folder_path))
             if re.match(pattern, filename)]
for k, g in groupby(filenames, lambda filename: filename.split('_')[0]):
    year = int(k[1:])
    year_files = list(g)
    print('{}: {}'.format(year, year_files))
Sample output:
2000: ['m2000_01', 'm2000_02', 'm2000_03', 'm2000_04', 'm2000_05', 'm2000_06', 'm2000_07', 'm2000_08', 'm2000_09', 'm2000_10', 'm2000_11', 'm2000_12']
2001: ['m2001_01', 'm2001_02', 'm2001_03', 'm2001_04', 'm2001_05', 'm2001_06', 'm2001_07', 'm2001_08', 'm2001_09', 'm2001_10', 'm2001_11', 'm2001_12']
2002: ['m2002_01', 'm2002_02', 'm2002_03', 'm2002_04', 'm2002_05', 'm2002_06', 'm2002_07', 'm2002_08', 'm2002_09', 'm2002_10', 'm2002_11', 'm2002_12']
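Building on the grouped names above, here is a hedged, untested sketch of the calculation step; it assumes each monthly file contains one numeric value per line:
# continues the snippet above: folder_path, filenames, os and groupby are already defined
for k, g in groupby(filenames, lambda filename: filename.split('_')[0]):
    values = []
    for name in g:
        with open(os.path.join(folder_path, name)) as fh:
            values.extend(float(line) for line in fh if line.strip())
    print('mean{}: {}'.format(k[1:], sum(values) / len(values)))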

Looping through a directory, importing and processing text files in chronological order using Python

Hoping I could get help with my Python code. Currently I have to change the working directory manually every time I run my code, which loops through all the .txt files in chronological order, since they are numbered 1_Ix_100.txt, 2_Ix_99.txt, etc., up to 201_Ix_-100.txt. All the text files are in the same directory, i.e. C:/test/Ig=*/340_TXT; what changes is the starred folder, which goes from 340 to 1020 in increments of 40, i.e. C:/test/Ig=340/340_TXT, C:/test/Ig=380/340_TXT, and so on until C:/test/Ig=1020/340_TXT.
I'm looking for a way to automate this process so that the code loops through the different /Ig=*/ folders, processes the text files, and saves the outcome as a CSV file in the corresponding /Ig=*/ folder.
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
import re
import os
import glob

D = []
E = []
F = []
os.chdir('C:/test/**Ig=700**/340_TXT')  # need to loop through the different folders in bold; these go from Ig=340 to Ig=1020 in increments of 40
numbers = re.compile(r'(\d+)')
def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts
for infile in sorted(glob.glob('*.txt'), key=numericalSort):
    name = ['1', '2']
    results = pd.read_table(infile, sep='\s+', names=name)
    # process files here with output [D], [E], [F]
ArrayMain = []
ArrayMain = np.column_stack((D, E, F))
np.savetxt("C:/test/**Ig=700**/Grey_Zone.csv", ArrayMain, delimiter=",", fmt='%.9f')  # save the output in this directory, which is one level above the working directory
I really hope the way I have worded it makes sense and I appreciate any help at all, thank you
Using a simple loop and some string manipulation you can create a list of the paths you want and then iterate over them.
import os

Ig_txts = []
i = 340
while i <= 1020:
    Ig_txts.append('Ig=' + str(i))
    i += 40

for Ig_txt in Ig_txts:
    path = 'C:/test/' + Ig_txt + '/340_TXT'
    out_file = 'C:/test/' + Ig_txt + '/Grey_Zone.csv'
    os.chdir(path)
    ...
    ...
EDIT: Gabriel brought up that my range is a little off. Check the second code blurb for the modification.
I would first put your script into a function that takes, as one of its arguments, a path. The details are up to you; this code just shows how to loop through the file names.
import os

for root, _, files in os.walk('C:/test/'):
    for f in files:
        file_path = os.path.join(root, f)
        # file_path is now the full path you need to open, process, etc.
Now, if there are other garbage files in 'C:/test/', then you could use a range-based loop:
import os

min_file_num = 340
max_file_num = 1020
for dir_num in range(min_file_num, max_file_num + 1, 40):
    path = 'C:/test/Ig=' + str(dir_num) + '/'
    for root, _, files in os.walk(path):
        for f in files:
            file_path = os.path.join(root, f)
            # file_path is now the full path you need to open, process, etc.
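For completeness, here is a hedged, untested sketch that visits each Ig=* folder with glob, sorts the text files with the OP's numericalSort helper (assumed to be defined as in the question), and writes the CSV one level above the 340_TXT folder:
import glob
import os

for folder in sorted(glob.glob('C:/test/Ig=*/340_TXT')):
    txt_files = sorted(glob.glob(os.path.join(folder, '*.txt')), key=numericalSort)  # numericalSort as defined in the question
    out_csv = os.path.join(os.path.dirname(folder), 'Grey_Zone.csv')
    # ... process txt_files and write the result to out_csv ...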

How to get sequence number of the file in the folder?

I have a Windows PC. My script should identify the sequence number, within its folder, of the file passed on the command line, i.e.:
myscript.py \\network-drive\files\Long-long.file.name.with.numbers.txt
Folder content is the following:
\\network-drive\files\
    folder1
    folder2
    file1
    file2
    Long.long.file.name.with.numbers.txt
    file3
    file4
My script should identify the sequence number of the file given on the command line, i.e. it should return 5 (folders are also counted; the assumption is that entries are sorted by their names).
Update: this is how far I've got:
import sys
import os.path

if sys.argv[1]:  # regardless of this check, an exception happens if no argument is passed
    head, tail = os.path.split(sys.argv[1])
    print head
    print os.listdir(head)
The list returned by listdir doesn't let me tell which entries are folders and which are files, so I cannot sort them properly.
There are a couple of problems you are trying to solve, and a couple of options for the solutions.
First - are you looking for something that is naturally sorted, i.e.:
/path/to/folder/
    subfolder01/
    test1.png
    test2.png
    test3.png
    test10.png
    test11.png
If so...you'll need to create a natural sort method. If you are happy with alpha-numeric sorting:
/path/to/folder/
    subfolder01/
    test1.png
    test10.png
    test11.png
    test2.png
    test3.png
Then the standard sort will work. Depending on how you sort your files, the index of your result will vary.
To get the directories and files from the system, you can do it one of two ways - I'm not 100% sure which is faster, so test them both out. I'm going to break the answer into chunks so you can piece it together however seems best:
Part 01: Initialization
import os
import sys

try:
    searchpath = sys.argv[1]
except IndexError:
    print 'No searchpath supplied'
    sys.exit(0)

basepath, searchname = os.path.split(searchpath)
Part 02: Collecting folders and files
Option #1: os.listdir + os.path.isfile
files = []
folders = []
for filepath in os.listdir(basepath):
    if os.path.isfile(os.path.join(basepath, filepath)):
        files.append(filepath)
    else:
        folders.append(filepath)
Option #2: os.walk
# we only want the top level list of folders and files,
# so break out of the loop after the first result
for basepath, folders, files in os.walk(basepath):
    break
Part 03: Calculating the Index
Option #1: no sorting - what you get from the system is what you get
# no sorting
try:
    index = len(folders) + files.index(searchname)
except ValueError:
    index = -1
Option #2: alphanumeric sorting
# sort alpha-numerically (only need to sort the files)
try:
    index = len(folders) + sorted(files).index(searchname)
except ValueError:
    index = -1
Option #3: natural sorting
# natural sort using the projex.sorting.natural method
import projex.sorting

sorted_files = sorted(files, projex.sorting.natural)
try:
    index = len(folders) + sorted_files.index(searchname)
except ValueError:
    index = -1
Part 04: Logging the result
# if wanting a 1-based answer
index += 1
print index
I'm not going to go into detail about natural sorting since that wasn't a part of the question - I think there are other forums on here you can find with advice on that. The projex.sorting module is one that I've written and is available here: http://dev.projexsoftware.com/projects/projex if you want to see the exact implementation of it.
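If you'd rather not depend on an external package, a minimal natural-sort key can be sketched with the standard re module (an untested sketch reusing the files list from above, not the projex implementation):
import re

def natural_key(name):
    # split the name into digit and non-digit runs, e.g. 'test10.png' -> ['test', 10, '.png']
    return [int(part) if part.isdigit() else part.lower()
            for part in re.split(r'(\d+)', name)]

sorted_files = sorted(files, key=natural_key)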
Suffice it to say, this would be the difference in results:
>>> import pprint, projex.sorting
>>> files = ['test2.png', 'test1.png', 'test10.png', 'test5.png', 'test11.png']
>>> print files.index('test10.png')
2
>>> print sorted(files).index('test10.png')
1
>>> print sorted(files, projex.sorting.natural).index('test10.png')
3
>>> print files
['test2.png', 'test1.png', 'test10.png', 'test5.png', 'test11.png']
>>> print sorted(files)
['test1.png', 'test10.png', 'test11.png', 'test2.png', 'test5.png']
>>> print sorted(files, projex.sorting.natural)
['test1.png', 'test2.png', 'test5.png', 'test10.png', 'test11.png']
So just keep that in mind when you're working with it.
Cheers!
It looks like something like this should work:
import os
import sys
import os.path as path

try:
    directory, file = path.split(sys.argv[1])

    def sort_func(fname):
        """
        Russian directories, English directories, Russian files, then English files -
        although, honestly, I don't know how Russian files will actually be sorted ...
        """
        fullname = path.join(directory, fname)
        isRussian = any(ord(x) > 127 for x in fullname)
        isDirectory = path.isdir(fullname)
        return (not isDirectory, not isRussian, fullname)

    files = sorted(os.listdir(directory), key=sort_func)
    print (files.index(file) + 1)
except IndexError:
    print "oops, no commandline arguments"
from os import listdir
from sys import argv
from os.path import *
print listdir(dirname(argv[1])).index(basename(argv[1]))
But it really means nothing; I can't even imagine a use case where you would need it. See os.path for details.
