Read files between two dates in a folder Python - python

I am trying to read file names in a folder between startdate and enddate. (Datestamp on file name)
I'm trying something like this.
Is there a better or more efficient way to do this?
I have thousands of files in that folder but based on start/end date values, often I will have a small percentage files between them.
# Date bounds, kept as "MM/DD/YYYY" strings.
startdate = "05/05/2013"
enddate = "06/06/2013"
# Folder to scan.
mypath = "C:\\somepath\\"
# Names of the entries directly inside mypath that are regular files
# (presumably os.listdir / os.path.isfile / os.path.join -- imports not shown).
onlyfiles = [ f for f in listdir(mypath) if isfile(join(mypath,f)) ]
for filetoread in onlyfiles:
# NOTE(review): time.strftime() returns a single string, so this comprehension
# iterates over its individual characters and compares each character with
# the bound strings instead of comparing dates.
# NOTE(review): `somepath` is not defined in this snippet -- presumably
# `mypath` was meant.
filesBetweenDate = [ f for f in time.strftime('%m/%d/%Y', time.gmtime(os.path.getmtime(somepath+filetoread ))) if f > startdate and f < enddate]
Thanks

This avoids the walk through the folder:
from datetime import datetime, timedelta
from os.path import isfile, join

# Instead of listing the whole folder, generate one candidate name per day in
# the range and check whether that file exists.  This only works when a file
# is named exactly like its date stamp.
# NOTE(review): '%m/%d/%Y' contains '/', which join() treats as a path
# separator -- set NAME_FORMAT to the real file-name pattern before use.
NAME_FORMAT = '%m/%d/%Y'

start = datetime.strptime('05/06/2013', '%m/%d/%Y')
end = datetime.strptime('06/05/2013', '%m/%d/%Y')

filesBetweenDate = []
day = start  # walk a cursor so that `start` keeps its original value
while day <= end:
    f = day.strftime(NAME_FORMAT)
    if isfile(join(mypath, f)):
        filesBetweenDate.append(f)
    day += timedelta(days=1)

This should do the trick, with a couple of nice extra features, and only a single pass through the loop.
import calendar
from datetime import datetime
import os
import glob, os
# Folder whose entries are examined.
mypath = "/Users/craigmj/"
# Format of the two bounds below: date plus time of day.
timefmt = "%Y%m%d %H:%M:%S"
# Bounds as epoch seconds.  calendar.timegm() interprets the time tuple as
# UTC, so the naive datetimes parsed here are treated as UTC, not local time.
start = calendar.timegm(datetime.strptime("20130128 00:00:00", timefmt).timetuple())
end = calendar.timegm(datetime.strptime("20130601 00:00:00", timefmt).timetuple())
def test(f):
    """Return True when *f* is a regular file whose ctime lies in [start, end].

    ``start`` and ``end`` are the module-level bounds in epoch seconds (both
    inclusive).  Anything that is not a regular file, or that can no longer
    be stat'ed, yields False, so the result is always a bool.
    """
    if not os.path.isfile(f):
        return False
    try:
        # Only the ctime is needed; read it by name rather than unpacking
        # all ten stat fields.
        ctime = os.stat(f).st_ctime
    except OSError:
        # The file disappeared between the isfile() check and the stat().
        return False
    return start <= ctime <= end
# Keep every entry directly under mypath that passes test(), then list them
# one per line.
candidates = glob.glob(os.path.join(mypath, "*"))
files = list(filter(test, candidates))
for matched in files:
    print(matched)
First off, I use glob.glob so that you can use a wildcard when selecting your files. This might save you time if you can be more specific about the files you want to select (e.g. if your files contain the datestamp in the filename).
Secondly, I use ctime in the test function, but you could as easily use mtime - the last modification time.
Finally, I'm time-specific, not just date-specific.
The only thing I'm not 100% sure about is whether this is all timezone safe. You might want to check that with an example, before digging through the docs to decide.

Related

How to get all files modified in a certain time window?

I want to get all files modified/created in the last 1 hour with Python. I tried this code but it's only getting the last file which was created:
import glob
import os
# Every entry matched by the 'c://*' pattern.
list_of_files = glob.glob('c://*')
# max() returns exactly one element -- the path with the largest getctime()
# value -- which is why only a single file is printed.
latest_file = max(list_of_files, key=os.path.getctime)
print(latest_file)
If I created 10 files it shows only last one. How to get all files created in the last 1 hour?
You can do basically this:
get the list of files
get the time for each of them (also check os.path.getmtime() for updates)
use datetime module to get a value to compare against (that 1h)
compare
For that I've used a dictionary to store each path together with its timestamp in a compact form. You can then sort the dictionary by its values (dict.values(), which are float timestamps), e.g. with the sorted(...) function, to get the files created within the last hour in chronological order:
import os
import glob
from datetime import datetime, timedelta
# Entries of the current directory whose ctime falls within the last hour,
# mapped to that ctime (epoch seconds).
# The cutoff is computed once, so every file is compared against the same
# instant instead of a fresh datetime.now() per file.
cutoff = datetime.now() - timedelta(hours=1)
hour_files = {
    path: ctime
    for path, ctime in ((p, os.path.getctime(p)) for p in glob.glob("./*"))
    if datetime.fromtimestamp(ctime) >= cutoff
}
Alternatively, without the comprehension:
files = glob.glob("./*")
# First pass: ctime (epoch seconds) of every path.
times = {}
for path in files:
    times[path] = os.path.getctime(path)
# Second pass: keep only the entries changed within the last hour.  The
# cutoff is computed once so all files are compared against the same instant.
cutoff = datetime.now() - timedelta(hours=1)
hour_files = {}
for key, val in times.items():
    if datetime.fromtimestamp(val) < cutoff:
        continue
    hour_files[key] = val
Or, perhaps your folder is just a mess and you have too many files. In that case, approach it incrementally:
# Single pass: look at one path at a time and keep it only if its ctime is
# within the last hour.  The cutoff is computed once, outside the loop.
cutoff = datetime.now() - timedelta(hours=1)
hour_files = {}
for file in glob.glob("./*"):
    timestamp = os.path.getctime(file)
    if datetime.fromtimestamp(timestamp) < cutoff:
        continue
    hour_files[file] = timestamp
Here is another solution, shorter, that uses only the os package:
import os

directory = "/path/to/directory"


def _mtime(name):
    """Return the modification time of *name* inside ``directory``.

    lstat() is used so a dangling symlink is still measurable instead of
    raising.
    """
    return os.lstat(os.path.join(directory, name)).st_mtime


# Pure-Python replacement for ``ls -t <directory>`` + "take the first line":
# no shell is spawned, so unusual characters in ``directory`` cannot be
# interpreted as shell syntax, and the snippet also runs where ``ls`` is
# unavailable.
try:
    # ls hides dot-files by default; mirror that.
    candidates = [n for n in os.listdir(directory) if not n.startswith(".")]
except OSError:
    # Missing or unreadable directory: yield "" as the ls-based version did.
    candidates = []

# Name (not full path) of the most recently modified entry, or "" if none.
latest_file = max(candidates, key=_mtime, default="")

read csv files in python

I need to read one column from the multiple csv file present in folder and then extract minimum and maximum dates from the column.
For e.g. if i have folder path "/usr/abc/xyz/" and multiple csv files are present as below
aaa.csv
bbb.csv
ccc.csv
and the files are containing data
aaa.csv is containing the data
name,address,dates
xxx,11111,20190101
yyy,22222,20190201
zzz,33333,20190101
bbb.csv is containing the data
name,address,dates
fff,11111,20190301
ggg,22222,20190501
hhh,33333,20190601
so I need to extract the minimum and maximum dates from the files and in the above case the date range should be 20190101 to 20190601
Can anyone please help how can i extract the minimum and maximum dates from the files in python
I need to avoid pandas or any other package, as I need to read the csv files directly in Python.
import pandas as pd
# Load the whole CSV file into a DataFrame.
dt = pd.read_csv('you_csv.csv')
# Largest and smallest value found in the 'dates' column.
print(max(dt['dates']))
print(min(dt['dates']))
If you need to avoid pandas you can do the following which is not recommended at all:
import csv

dt = []
# Stream the file row by row instead of loading every line at once.
with open('your_csv.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    for row in reader:
        if len(row) > 2:  # ignore blank or short lines instead of crashing
            dt.append(row[2].strip())
# In the sample data the dates are fixed-width YYYYMMDD strings, so plain
# string ordering is also chronological ordering.
print(max(dt))
print(min(dt))
A solution only using the available core libraries. It doesn't read the whole file into memory so should have a very low footprint and will work with larger files.
pathlib is used to get all the csv files
datetime is used to convert to dates
sys is used for user input
$ python3 date_min_max.py /usr/abc/xyz/
min date: 2019-01-01 00:00:00
max date: 2019-06-01 00:00:00
date_min_max.py
"""Print the earliest and latest date found in the CSV files of a folder.

Usage: python3 date_min_max.py [FOLDER]    (FOLDER defaults to ".")
"""
from pathlib import Path
from datetime import datetime
import csv
import sys

DATE_FORMAT = "%Y%m%d"
DATE_COLUMN = 2  # zero-based index of the "dates" column


def find_date_range(folder, date_format=DATE_FORMAT, column=DATE_COLUMN):
    """Return ``(earliest, latest)`` over every ``*.csv`` file directly in *folder*.

    Files are streamed row by row, so memory use stays flat for large files.
    The first row of each file is treated as a header; blank rows and rows
    with too few columns are skipped.

    Returns:
        A pair of ``datetime`` objects, or ``(None, None)`` when no date was
        found at all.

    Raises:
        ValueError: if a value in the date column does not match *date_format*.
    """
    earliest = latest = None
    for file in Path(folder).iterdir():
        if file.suffix != ".csv" or not file.is_file():
            continue
        with file.open("r", newline="") as fh:
            reader = csv.reader(fh)
            next(reader, None)  # header
            for row in reader:
                if len(row) <= column:
                    continue
                # Parse once and reuse the value for both comparisons.
                value = datetime.strptime(row[column].strip(), date_format)
                if earliest is None or value < earliest:
                    earliest = value
                if latest is None or value > latest:
                    latest = value
    return earliest, latest


def main(argv=None):
    """Command-line entry point: the optional first argument is the folder."""
    args = sys.argv[1:] if argv is None else argv
    folder = args[0] if args else "."
    dt_min, dt_max = find_date_range(folder)
    if dt_min is None:
        # Report the empty case instead of printing placeholder dates.
        print("no dates found")
        return
    print("min date: {}\nmax date: {}".format(dt_min, dt_max))


if __name__ == "__main__":
    main()

Extracting Files between two dates which has been saved with dates

I have saved all the daily sales reports in a common folder. each file is named with the corresponding date. eg: 01-01-2019-Sales.csv, 02-01-2019-Sales.csv, etc. all the files are saved in the "C:\Desktop\Sales" folder path. now i want to extract & combine all the files which are between 05-01-2019 to 04-02-2019.
I know I can extract all the files with pandas using the below code
import pandas as pd
import glob
import os
# Folder that holds the daily "<date>-Sales.csv" reports.
file_path = r'C:\Desktop\Sales'
# Every CSV in that folder, regardless of the date in its name.
all_files = glob.glob(os.path.join(file_path,'*.csv'))
# Read each file and stack the resulting frames into a single DataFrame.
df = pd.concat([pd.read_csv(f) for f in all_files], sort=False)
But, my question is how can i extract files between 2 given specific dates using pandas/python. (using the file names which has been saved with the date) eg ; extract only the files between 05-01-2019 to 04-02-2019.
What about this
from datetime import datetime

# The daily reports are named like "01-01-2019-Sales.csv", "02-01-2019-Sales.csv",
# i.e. presumably day first (DD-MM-YYYY) -- adjust DATE_FORMAT if not.
DATE_FORMAT = "%d-%m-%Y"
SUFFIX = "-Sales.csv"

start_date = "05-01-2019"
end_date = "04-02-2019"
# Day-first strings do not sort chronologically ("05-01" > "04-02" as text),
# so the bounds and the file names are compared as parsed dates.
first_day = datetime.strptime(start_date, DATE_FORMAT)
last_day = datetime.strptime(end_date, DATE_FORMAT)


def _report_date(name):
    """Return the date encoded in file name *name*, or None if it is not a report."""
    if not name.endswith(SUFFIX):
        return None
    try:
        return datetime.strptime(name[:-len(SUFFIX)], DATE_FORMAT)
    except ValueError:
        return None


all_csv_files = [x for x in os.listdir(file_path) if x.endswith('.csv')]
correct_date_files = []
for x in all_csv_files:
    report_day = _report_date(x)
    # Both bounds are inclusive.
    if report_day is not None and first_day <= report_day <= last_day:
        correct_date_files.append(x)
# os.listdir() returns bare names, so re-attach the folder before reading.
df = pd.concat([pd.read_csv(os.path.join(file_path, f)) for f in correct_date_files], sort=False)
You basically just list all .csv files in your directory and only take the ones between the chosen dates.
I think that this piece of code will help you
import datetime

# Window bounds: 1 January 2019 and 1 February 2019 (exclusive below).
d1 = datetime.date(year=2019, month=1, day=1)
d2 = datetime.date(year=2019, month=2, day=1)
# Two probes: one inside the window, one after it.
d3 = datetime.date(year=2019, month=1, day=20)
d4 = datetime.date(year=2019, month=2, day=20)

# date objects support chained comparisons; this prints True, then False.
for probe in (d3, d4):
    print(d1 < probe < d2)
The dates could be compared lexically with a change to yyyy-mm-dd.
L = [ '01-01-2019-Sales.csv', '02-01-2019-Sales.csv']
>>> start = '2018-12-01'
>>> end = '2019-02-01'
>>> for file in L:
m, d, yr = file.split('-')[:3]
date = '-'.join([yr, m, d])
if start <= date <= end:
print(file)
01-01-2019-Sales.csv
02-01-2019-Sales.csv
Use the dates as comparison:
import pandas as pd
import glob
import os
from time import strptime

file_path = r'C:\Desktop\Sales'
all_files = glob.glob(os.path.join(file_path, '*.csv'))

# Reports are named like "01-01-2019-Sales.csv" (presumably day first); the
# bounds are parsed with the same format so both sides are comparable.
date_format = '%d-%m-%Y'
start_date = strptime('05-01-2019', date_format)
end_date = strptime('04-02-2019', date_format)


def report_date(path):
    """Return the date parsed from the file name of *path*, or None.

    glob() returns full paths, so only the base name is parsed.  Names that
    do not follow the "<date>-Sales.csv" pattern yield None instead of
    raising.
    """
    try:
        return strptime(os.path.basename(path), date_format + '-Sales.csv')
    except ValueError:
        return None


# Keep the reports whose date lies within [start_date, end_date], inclusive.
selected_files = []
for f in all_files:
    day = report_date(f)
    if day is not None and start_date <= day <= end_date:
        selected_files.append(f)

frames = [pd.read_csv(f) for f in selected_files]
# pd.concat() raises ValueError on an empty list; fall back to an empty frame.
df = pd.concat(frames, sort=False) if frames else pd.DataFrame()

Open files older than 3 days of date stamp in file name - Python 2.7

** Problem **
I'm trying to open (in python) files older than 3 days of the date stamp which is in the current name. Example: 2016_08_18_23_10_00 - JPN - MLB - Mickeymouse v Burgerface.ply. So far I can create a date variable, however I do not know how to search for this variable in a filename. I presume I need to convert it to a string first?
from datetime import datetime, timedelta
import os
import re
path = "C:\Users\michael.lawton\Desktop\Housekeeper"
## create variable d where current date time is subtracted by 3 days ##
days_to_subtract = 3
d = datetime.today() - timedelta(days=days_to_subtract)
print d
## open file in dir where date in filename = d or older ##
for filename in os.listdir(path):
# NOTE(review): d is a datetime object, while re.match() expects a pattern
# (string) as its first argument.
if re.match(d, filename):
with open(os.path.join(path, filename), 'r') as f:
# NOTE(review): `line` is never assigned in this snippet; nothing is read from f.
print line,
Any help will be much appreciated
You can use strptime for this. It will convert your string (assuming it is correctly formatted) into a datetime object which you can use to compare if your file is older than 3 days based on the filename:
from datetime import datetime, timedelta
# ... (`os`, `path` and `days_to_subtract` as defined in the question)
lines = []
# Files stamped before this moment count as "older than N days"; it is
# computed once so every file is compared against the same instant.
cutoff = datetime.now() - timedelta(days=days_to_subtract)
for filename in os.listdir(path):
    try:
        # The timestamp is everything before the first space in the name.
        date_filename = datetime.strptime(filename.split(" ")[0], '%Y_%m_%d_%H_%M_%S')
    except ValueError:
        # The name does not start with a timestamp in the expected format.
        continue
    if date_filename < cutoff:
        with open(os.path.join(path, filename), 'r') as f:
            lines.extend(f.readlines())  # put all lines into array
If the filename is 2016_08_18_23_10_00 - JPN - MLB - Mickeymouse v Burgerface.ply the datetime part will be extracted with filename.split(" ")[0]. Then we can use that to check if it is older than three days using datetime.timedelta
To open all files in the given directory that contain a timestamp in their name older than 3 days:
#!/usr/bin/env python2
import os
import time
DAY = 86400 # POSIX day in seconds
three_days_ago = time.time() - 3 * DAY
for filename in os.listdir(dirpath):
time_string = filename.partition(" ")[0]
try:
timestamp = time.mktime(time.strptime(time_string, '%Y_%m_%d_%H_%M_%S'))
except Exception: # can't get timestamp
continue
if timestamp < three_days_ago: # old enough to open
with open(os.path.join(dirpath, filename)) as file: # assume it is a file
for line in file:
print line,
The code assumes that the timestamps are in the local timezone. It may take DST transitions into account on platforms where C mktime() has access to the tz database (if it doesn't matter whether the file is 72 or 73 hours old in your case then just ignore this paragraph).
Consider using file metadata such as "the last modification time of a file" instead of extracting the timestamp from its name: timestamp = os.path.getmtime(path).

Delete the zip files with date format by comparing to the present date in Python

I have a list of files(Actually these are the files in some directory) as below for example
import os
path = '/home/user/folder'
files = os.listdir(path)
so result is as below
# Example of what os.listdir(path) returns: six dated backups and two text
# files.  Each name is followed by a comma; without one, adjacent string
# literals are silently joined into a single name.
files = ['backup_file_2010-06-30_category.zip','backup_file_2010-06-28_category.zip',
'backup_file_2010-06-26_category.zip','backup_file_2010-06-24_category.zip',
'backup_file_2010-06-23_category.zip','backup_file_2010-06-20_category.zip',
'some_text_files_one.txt','some_text_files_two.txt']
so from this list i need to delete the zip files that contains the date in it on a condition that, the files that are created before five days from today needs to be deleted
I mean if the file created today is backup_file_2013-04-17_category.zip, we need to delete the files that are created before five days from today something like the files named as backup_file_2013-04-11_category.zip
Can anyone please let me know how to do this in python
You could do something like this; afterwards the filtered_files list holds the files that need to be deleted. It works as long as your backup file names start with prefix.
from datetime import datetime
from datetime import timedelta
import os

path = '/home/user/folder'
files = os.listdir(path)
prefix = 'backup_file_'
days = 5

# The date follows the prefix in YYYY-MM-DD form, so backup names sort
# chronologically and can be compared as plain strings against a threshold
# name built from the date `days` days ago.
five_days_ago = datetime.now() - timedelta(days=days)
date_before = '%s%s' % (prefix, five_days_ago.strftime('%Y-%m-%d'))

# Backups whose name sorts before the threshold are the ones to delete.
filtered_files = [f for f in files if f.startswith(prefix) and f < date_before]
print(filtered_files)
import datetime
import os

folder = '/home/user/folder'
prefix = 'backup_file_'
suffix = '_category.zip'

# Backups dated before this day are deleted; today's file and those from the
# previous five days are kept.
cutoff = datetime.date.today() - datetime.timedelta(days=5)

for name in os.listdir(folder):
    if not (name.startswith(prefix) and name.endswith(suffix)):
        continue  # not a backup archive -- never touch it
    stamp = name[len(prefix):-len(suffix)]
    try:
        # Parse the date rather than rebuilding file names by hand: the
        # names use zero-padded months and days, e.g. 2010-06-03.
        file_date = datetime.datetime.strptime(stamp, '%Y-%m-%d').date()
    except ValueError:
        continue  # no valid date in the name -- leave the file alone
    if file_date < cutoff:
        # os.listdir() returns bare names; join with the folder so the file
        # is removed from there and not from the current working directory.
        os.remove(os.path.join(folder, name))
You can extract the date from each filename using a regex, and compare it to see if the backup file is indeed old. If there is no match from the regex then it's not a backup file.
from datetime import date
import re
OLD_FILE_DAYS = 5

# Raw string so the \d escapes reach the regex engine untouched.  The dot is
# escaped and the pattern is anchored at the end, so only names ending in a
# literal "_category.zip" are accepted.
_BACKUP_NAME = re.compile(r'backup_file_(\d{4})-(\d{2})-(\d{2})_category\.zip$')


def is_old_backup_file(filename, today):
    """Return True if *filename* is a backup archive older than OLD_FILE_DAYS.

    Args:
        filename: bare file name, e.g. 'backup_file_2010-06-20_category.zip'.
        today: ``datetime.date`` that the file's date is compared against.

    Returns:
        True when the name matches the backup pattern and its date lies more
        than OLD_FILE_DAYS days before *today*.  False for any other name,
        including one whose digits do not form a real calendar date.
    """
    m = _BACKUP_NAME.match(filename)
    if not m:
        return False
    year, month, day = (int(s) for s in m.groups())
    try:
        d = date(year, month, day)
    except ValueError:
        # e.g. month 13: shaped like a backup name but not a valid date.
        return False
    delta = today - d
    return delta.days > OLD_FILE_DAYS
# Sample directory listing used for the demonstration.
files = [
    'backup_file_2010-06-30_category.zip',
    'backup_file_2010-06-28_category.zip',
    'backup_file_2010-06-26_category.zip',
    'backup_file_2010-06-24_category.zip',
    'backup_file_2010-06-23_category.zip',
    'backup_file_2010-06-20_category.zip',
    'some_text_files_one.txt',
    'some_text_files_two.txt',
]  # os.listdir(path)
today = date(2010, 7, 1)  # date.today()

# Keep every name that is NOT an old backup (recent backups and other files).
filtered_files = []
for name in files:
    if is_old_backup_file(name, today):
        continue
    filtered_files.append(name)
print(filtered_files)
Output:
['backup_file_2010-06-30_category.zip', 'backup_file_2010-06-28_category.zip',
'backup_file_2010-06-26_category.zip', 'some_text_files_one.txt',
'some_text_files_two.txt']

Categories