Load files that has name patterns and clean data using python [duplicate] - python

I am trying to find all file names in a folder which follows this pattern: 'index_YYYYMMDD.csv'. The 'YYYYMMDD' part represents the date of the data file. Some of the files names are listed below:
'index_20091101.csv',
'index_20091102.csv',
'index_20091103.csv',
'index_20091104.csv',
'index_20091105.csv',
'index_20091106.csv',
'index_20091107.csv',
'index_20091108.csv',
Given a startDate and endDate, I would like to find all file names, the date part of which is between the startDate and endDate. For example, for the above file list, if the startDate=20091104 and endDate=20091107, the file names I would like to find should be:
'index_20091104.csv',
'index_20091105.csv',
'index_20091106.csv',
'index_20091107.csv'
I've tried os.listdir function, which gives me all the file names. To filter out the unwanted files, I think I need to use regular expression, but could not work it out.
Anyone could help me with this? Thanks!

import glob
glob.glob('index_[0-9]*.csv')  # '[0-9]' matches a single digit; the '*' then matches ANY characters, not just digits
This will match any filename that starts with 'index_' followed by a digit.
John's solution matches exactly 8 digits.

I would take the following approach. You can define a simple file filter factory.
import time

def make_time_filter(start, end, time_format, file_format='index_{time_format:}.csv'):
    """Build a predicate that accepts file names whose embedded date is in [start, end].

    Args:
        start, end: date strings, parseable with *time_format* (e.g. '20091101').
        time_format: a time.strptime format such as '%Y%m%d'.
        file_format: template for the file name; '{time_format:}' is replaced by
            *time_format*, so the default becomes 'index_%Y%m%d.csv' and
            time.strptime can parse the whole file name at once (literal text
            in a strptime format must match exactly).

    Returns:
        filt(fname) -> bool; names that do not parse at all are rejected.
    """
    t_start = time.strptime(start, time_format)
    t_end = time.strptime(end, time_format)
    ft_fmt = file_format.format(time_format=time_format)

    def filt(fname):
        try:
            # struct_time compares chronologically, so this is a date-range test.
            return t_start <= time.strptime(fname, ft_fmt) <= t_end
        except ValueError:
            # fname does not follow the index_YYYYMMDD.csv pattern.
            return False

    return filt
Now, you can simply make a predicate to filter out the date range you want
time_filt = make_time_filter('20091101', '20091201', '%Y%m%d')
Then pass this to filter
filter(time_filt, os.listdir(your_dir))
Or put it in a comprehension of some sort
(fname for fname in os.listdir(your_dir) if time_filt(fname))
A regex will be more general, but you don't need one in your case since your file names all follow a simple pattern which you know must contain a date. For more on the time module see the docs.

If you want to match exactly 8 digits with glob you need to write them all out like this
import glob
# glob has no repetition operator, so each of the 8 digits needs its own [0-9] class.
glob.glob('index_[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9].csv')
Help on function glob in module glob:
glob(pathname)
Return a list of paths matching a pathname pattern.
The pattern may contain simple shell-style wildcards a la
fnmatch. However, unlike fnmatch, filenames starting with a
dot are special cases that are not matched by '*' and '?'
patterns.
If you want real regex, use os.listdir and filter the result
[x for x in os.listdir('.') if re.match('index_[0-9]*.csv', x)]

This will get you where you want to be and allows you to provide start and end dates:
import os
import re
import datetime

def find_index_files_in_range(start, end, directory='.'):
    """Return names in *directory* of the form index_YYYYMMDD.csv with date in [start, end].

    Args:
        start, end: inclusive bounds as 'YYYYMMDD' strings.
        directory: directory to scan (defaults to the current one).
    """
    start_date = datetime.datetime.strptime(start, '%Y%m%d')
    end_date = datetime.datetime.strptime(end, '%Y%m%d')
    # Raw string, escaped dot, anchored end; compiled once instead of twice per file.
    pattern = re.compile(r'index_(\d+)\.csv$')
    in_range = []
    for name in os.listdir(directory):
        match = pattern.match(name)
        if not match:
            continue
        try:
            date = datetime.datetime.strptime(match.group(1), '%Y%m%d')
        except ValueError:
            continue  # digits present but not a valid calendar date
        if start_date <= date <= end_date:
            in_range.append(name)
    return in_range

files_in_range = find_index_files_in_range('20071102', '20071103')
print(files_in_range)  # Python 3 print() (original used the Python 2 statement)

Related

Cut out a sequence of files using glob in python

I have a directory with files like img-0001.jpg, img-0005.jpg, img-0006.jpg, ... , img-xxxx.jpg.
What I need to do is to get a list with all files starting at 0238, literally img-0238.jpg. The next existing filename is img-0240.jpg
Right now I use glob to get all filenames.
list_images = glob.glob(path_images + "*.jpg")
Thanks in advance
Edit:
-> The last filename is img-0315.jpg
Glob doesn't allow regex filtering, but you can filter the list right after you receive all matching files.
Here is how it would look like using re:
import re
# Group the alternation: in the original, '\.jpg$' bound only to the LAST branch,
# so e.g. '1234' anywhere in the name matched without the extension anchor, and
# the open-ended {3,}/{2,} quantifiers accepted more than 4 digits.
list_images = [f for f in glob.glob(path_images + "*.jpg")
               if re.search(r'([1-9]\d{3}|0[3-9]\d{2}|02[4-9]\d|023[89])\.jpg$', f)]
The regular expression will verify that the file name ends with a 4-digit number greater than or equal to 0238.
You can play around with regular expression using https://regex101.com/
Basically, we check if number is:
starts with 1 followed by any 3 digits
or starts with 0[3-9] followed by any 2 digits
or starts with 02[4-9] followed by any 1 digit
or starts with 023 and followed by either 8 or 9.
But it would probably be easier to do a simple comparison:
# The file number occupies characters [-8:-4]; a chained comparison keeps
# everything strictly between '0237' and '0316' (i.e. 0238..0315).
list_images = [f for f in glob.glob(path_images + "*.jpg")
               if "0237" < f[-8:-4] < "0316"]
You can specify multiple repeated wildcards to match all files whose number is 23[89] or 2[4-9][0-9] or 30[0-9] etc;
# One glob per sub-range; together the four patterns cover exactly 0238..0315.
list_images = []
for number_pattern in ('023[89]', '02[4-9][0-9]', '030[0-9]', '031[0-5]'):
    matched = glob.glob(os.path.join(path_images, '*{0}.jpg'.format(number_pattern)))
    list_images.extend(matched)
or you can just filter out the ones you don't want.
def _number_in_range(name):
    # The 4-digit file number sits just before the '.jpg' suffix.
    return 238 <= int(name[-8:-4]) <= 315

list_images = list(filter(_number_in_range,
                          glob.glob(os.path.join(path_images, "*.jpg"))))
For something like this, you could try the wcmatch library. It's a library that aims to enhance file globbing and wildcard matching.
In this example, we enable brace expansion and demonstrate the pattern by filtering a list of files:
from wcmatch import glob

# Generate the demo list of names img-0000.jpg .. img-0315.jpg.
files = ['path/img-{:04d}.jpg'.format(number) for number in range(316)]
# BRACE enables the numeric {0238..0315} brace-expansion in the pattern.
print(glob.globfilter(files, 'path/img-{0238..0315}.jpg', flags=glob.BRACE))
And we get the following output:
['path/img-0238.jpg', 'path/img-0239.jpg', 'path/img-0240.jpg', 'path/img-0241.jpg', 'path/img-0242.jpg', 'path/img-0243.jpg', 'path/img-0244.jpg', 'path/img-0245.jpg', 'path/img-0246.jpg', 'path/img-0247.jpg', 'path/img-0248.jpg', 'path/img-0249.jpg', 'path/img-0250.jpg', 'path/img-0251.jpg', 'path/img-0252.jpg', 'path/img-0253.jpg', 'path/img-0254.jpg', 'path/img-0255.jpg', 'path/img-0256.jpg', 'path/img-0257.jpg', 'path/img-0258.jpg', 'path/img-0259.jpg', 'path/img-0260.jpg', 'path/img-0261.jpg', 'path/img-0262.jpg', 'path/img-0263.jpg', 'path/img-0264.jpg', 'path/img-0265.jpg', 'path/img-0266.jpg', 'path/img-0267.jpg', 'path/img-0268.jpg', 'path/img-0269.jpg', 'path/img-0270.jpg', 'path/img-0271.jpg', 'path/img-0272.jpg', 'path/img-0273.jpg', 'path/img-0274.jpg', 'path/img-0275.jpg', 'path/img-0276.jpg', 'path/img-0277.jpg', 'path/img-0278.jpg', 'path/img-0279.jpg', 'path/img-0280.jpg', 'path/img-0281.jpg', 'path/img-0282.jpg', 'path/img-0283.jpg', 'path/img-0284.jpg', 'path/img-0285.jpg', 'path/img-0286.jpg', 'path/img-0287.jpg', 'path/img-0288.jpg', 'path/img-0289.jpg', 'path/img-0290.jpg', 'path/img-0291.jpg', 'path/img-0292.jpg', 'path/img-0293.jpg', 'path/img-0294.jpg', 'path/img-0295.jpg', 'path/img-0296.jpg', 'path/img-0297.jpg', 'path/img-0298.jpg', 'path/img-0299.jpg', 'path/img-0300.jpg', 'path/img-0301.jpg', 'path/img-0302.jpg', 'path/img-0303.jpg', 'path/img-0304.jpg', 'path/img-0305.jpg', 'path/img-0306.jpg', 'path/img-0307.jpg', 'path/img-0308.jpg', 'path/img-0309.jpg', 'path/img-0310.jpg', 'path/img-0311.jpg', 'path/img-0312.jpg', 'path/img-0313.jpg', 'path/img-0314.jpg', 'path/img-0315.jpg']
So, we could apply this to a file search:
from wcmatch import glob
# BRACE turns '{0238..0315}' into a numeric range expansion within the pattern.
list_images = glob.glob('path/img-{0238..0315}.jpg', flags=glob.BRACE)
In this example, we've hard coded the path, but in your example, make sure path_images has a trailing / so that the pattern is constructed correctly. Others have suggested this might be an issue. Print out your pattern to confirm the pattern is correct.

Python Parse through String to create variable

I have a variable that reads in a datafile
dfPort = pd.read_csv("E:...\Portfolios\ConsDisc_20160701_Q.csv")
I was hoping to create three variables: portName, inceptionDate, and frequency that would read the string of the "E:..." above and take out the wanted parts of the string using the underscore as a indicator to go to next variable. Example after parsing string:
portName = "ConsDisc"
inceptionDate: "2016-07-01"
frequency: "Q"
Any tips would be appreciated!
You can use os.path.basename, os.path.splitext and str.split:
import os

filename = r'E:...\Portfolios\ConsDisc_20160701_Q.csv'
# Normalise Windows back-slashes to this platform's separator, strip the
# directory and the extension, then split the stem on underscores.
normalized = filename.replace('\\', os.sep)
stem = os.path.splitext(os.path.basename(normalized))[0]
parts = stem.split('_')
print(parts)
outputs ['ConsDisc', '20160701', 'Q']. You can then manipulate this list as you like, for example extract it into variables with port_name, inception_date, frequency = parts, etc.
The .replace('\\', os.sep) there is used to "normalize" Windows-style backslash-separated paths into whatever is the convention of the system the code is being run on (i.e. forward slashes on anything but Windows :) )
import os

def parse_filename(path):
    """Split a data-file path's stem on underscores: [portfolio, date, frequency]."""
    stem, _extension = os.path.splitext(os.path.basename(path))
    return stem.split("_")

path = r"Portfolios\ConsDisc_20160701_Q.csv"
portName, inceptionDate, frequency = parse_filename(path)
How about an alternative solution just in case if you want to store them into a dictionary and use them like so,
import re
str1 = "E:...\Portfolios\ConsDisc_20160701_Q.csv"
# Named groups let groupdict() return the three parts keyed by name in one call.
re.search(r'Portfolios\\(?P<portName>.*)_(?P<inceptionDate>.*)_(?P<frequency>.)', str1).groupdict()
# result
# {'portName': 'ConsDisc', 'inceptionDate': '20160701', 'frequency': 'Q'}

python find all file names in folder that follows a pattern

I am trying to find all file names in a folder which follows this pattern: 'index_YYYYMMDD.csv'. The 'YYYYMMDD' part represents the date of the data file. Some of the files names are listed below:
'index_20091101.csv',
'index_20091102.csv',
'index_20091103.csv',
'index_20091104.csv',
'index_20091105.csv',
'index_20091106.csv',
'index_20091107.csv',
'index_20091108.csv',
Given a startDate and endDate, I would like to find all file names, the date part of which is between the startDate and endDate. For example, for the above file list, if the startDate=20091104 and endDate=20091107, the file names I would like to find should be:
'index_20091104.csv',
'index_20091105.csv',
'index_20091106.csv',
'index_20091107.csv'
I've tried os.listdir function, which gives me all the file names. To filter out the unwanted files, I think I need to use regular expression, but could not work it out.
Anyone could help me with this? Thanks!
import glob
glob.glob('index_[0-9]*.csv')  # '[0-9]' matches a single digit; the '*' then matches ANY characters, not just digits
This will match any filename that starts with 'index_' followed by a digit.
John's solution matches exactly 8 digits.
I would take the following approach. You can define a simple file filter factory.
import time

def make_time_filter(start, end, time_format, file_format='index_{time_format:}.csv'):
    """Build a predicate that accepts file names whose embedded date is in [start, end].

    Args:
        start, end: date strings, parseable with *time_format* (e.g. '20091101').
        time_format: a time.strptime format such as '%Y%m%d'.
        file_format: template for the file name; '{time_format:}' is replaced by
            *time_format*, so the default becomes 'index_%Y%m%d.csv' and
            time.strptime can parse the whole file name at once.

    Returns:
        filt(fname) -> bool; names that do not parse at all are rejected.
    """
    t_start = time.strptime(start, time_format)
    t_end = time.strptime(end, time_format)
    ft_fmt = file_format.format(time_format=time_format)

    def filt(fname):
        try:
            # struct_time compares chronologically, so this is a date-range test.
            return t_start <= time.strptime(fname, ft_fmt) <= t_end
        except ValueError:
            # fname does not follow the index_YYYYMMDD.csv pattern.
            return False

    return filt
Now, you can simply make a predicate to filter out the date range you want
time_filt = make_time_filter('20091101', '20091201', '%Y%m%d')
Then pass this to filter
filter(time_filt, os.listdir(your_dir))
Or put it in a comprehension of some sort
(fname for fname in os.listdir(your_dir) if time_filt(fname))
A regex will be more general, but you don't need one in your case since your file names all follow a simple pattern which you know must contain a date. For more on the time module see the docs.
If you want to match exactly 8 digits with glob you need to write them all out like this
import glob
# glob has no repetition operator, so each of the 8 digits needs its own [0-9] class.
glob.glob('index_[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9].csv')
Help on function glob in module glob:
glob(pathname)
Return a list of paths matching a pathname pattern.
The pattern may contain simple shell-style wildcards a la
fnmatch. However, unlike fnmatch, filenames starting with a
dot are special cases that are not matched by '*' and '?'
patterns.
If you want real regex, use os.listdir and filter the result
[x for x in os.listdir('.') if re.match('index_[0-9]*.csv', x)]
This will get you where you want to be and allows you to provide start and end dates:
import os
import re
import datetime

def find_index_files_in_range(start, end, directory='.'):
    """Return names in *directory* of the form index_YYYYMMDD.csv with date in [start, end].

    Args:
        start, end: inclusive bounds as 'YYYYMMDD' strings.
        directory: directory to scan (defaults to the current one).
    """
    start_date = datetime.datetime.strptime(start, '%Y%m%d')
    end_date = datetime.datetime.strptime(end, '%Y%m%d')
    # Raw string, escaped dot, anchored end; compiled once instead of twice per file.
    pattern = re.compile(r'index_(\d+)\.csv$')
    in_range = []
    for name in os.listdir(directory):
        match = pattern.match(name)
        if not match:
            continue
        try:
            date = datetime.datetime.strptime(match.group(1), '%Y%m%d')
        except ValueError:
            continue  # digits present but not a valid calendar date
        if start_date <= date <= end_date:
            in_range.append(name)
    return in_range

files_in_range = find_index_files_in_range('20071102', '20071103')
print(files_in_range)  # Python 3 print() (original used the Python 2 statement)

python : reading a datetime from a log file using regex

I have a log file which has text that looks like this.
Jul 1 03:27:12 syslog: [m_java][ 1/Jul/2013 03:27:12.818][j:[SessionThread <]^Iat com/avc/abc/magr/service/find.something(abc/1235/locator/abc;Ljava/lang/String;)Labc/abc/abcd/abcd;(bytecode:7)
There are two time formats in the file. I need to sort this log file based on the date time format enclosed in [].
This is the regex I am trying to use. But it does not return anything.
t_pat = re.compile(r".*\[\d+/\D+/.*\]")
I want to go over each line in file, be able to apply this pattern and sort the lines based on the date & time.
Can someone help me on this? Thanks!
You have a space in there that needs to be added to the regular expression
import re  # the original snippet used re without importing it

text = "Jul 1 03:27:12 syslog: [m_java][ 1/Jul/2013 03:27:12.818][j:[SessionThread <]^Iat com/avc/abc/magr/service/find.something(abc/1235/locator/abc;Ljava/lang/String;)Labc/abc/abcd/abcd;(bytecode:7)"
# \s* absorbs the leading space inside '[ 1/Jul/...'; the lazy .*? stops at the first ']'.
matches = re.findall(r"\[\s*(\d+/\D+/.*?)\]", text)
print(matches)  # Python 3 print() (original used the Python 2 statement)
['1/Jul/2013 03:27:12.818']
Next parse the time using the following function
http://docs.python.org/2/library/time.html#time.strptime
Finally use this as a key into a dict, and the line as the value, and sort these entries based on the key.
You are not matching the initial space; you also want to group the date for easy extraction, and limit the \D and .* patterns to non-greedy:
t_pat = re.compile(r".*\[\s?(\d+/\D+?/.*?)\]")
Demo:
>>> re.compile(r".*\[\s?(\d+/\D+?/.*?)\]").search(line).group(1)
'1/Jul/2013 03:27:12.818'
You can narrow down the pattern some more; you only need to match 3 letters for the month for example:
t_pat = re.compile(r".*\[\s?(\d{1,2}/[A-Z][a-z]{2}/\d{4} \d{2}:\d{2}:[\d.]{2,})\]")
Read all the lines of the file and use the sort function and pass in a function that parses out the date and uses that as the key for sorting:
import re
import datetime

# Compiled once at module level instead of on every call.
_TS_PATTERN = re.compile(r".*\[\s?(\d+/\D+?/.*?)\]")

def parse_date_from_log_line(line):
    """Extract the bracketed '[ d/Mon/YYYY HH:MM:SS.fff]' stamp from *line* as a datetime.

    Raises AttributeError if the line carries no such bracketed timestamp.
    """
    date_string = _TS_PATTERN.search(line).group(1)
    return datetime.datetime.strptime(date_string, '%d/%b/%Y %H:%M:%S.%f')

if __name__ == '__main__':
    # Guarded so importing this module does not require mylog.txt to exist.
    log_path = 'mylog.txt'
    with open(log_path) as log_file:
        lines = log_file.readlines()
    lines.sort(key=parse_date_from_log_line)

Keep latest file and delete all other

In my folder there are many pdf files with date-timestamp format such as shown in the last.
I would like to keep the latest file for the day and delete the rest for that day. How can I do in Python ?
2012-07-13-15-13-27_1342167207.pdf
2012-07-13-15-18-22_1342167502.pdf
2012-07-13-15-18-33_1342167513.pdf
2012-07-23-14-45-12_1343029512.pdf
2012-07-23-14-56-48_1343030208.pdf
2012-07-23-16-03-45_1343034225.pdf
2012-07-23-16-04-23_1343034263.pdf
2012-07-26-07-27-19_1343262439.pdf
2012-07-26-07-33-27_1343262807.pdf
2012-07-26-07-51-59_1343263919.pdf
2012-07-26-22-38-30_1343317110.pdf
2012-07-26-22-38-54_1343317134.pdf
2012-07-27-10-43-27_1343360607.pdf
2012-07-27-10-58-40_1343361520.pdf
2012-07-27-11-03-19_1343361799.pdf
2012-07-27-11-04-14_1343361854.pdf
Should I use list to fill and sort out then ? Desired output is:
2012-07-13-15-18-33_1342167513.pdf
2012-07-23-16-04-23_1343034263.pdf
2012-07-26-22-38-54_1343317134.pdf
2012-07-27-11-04-14_1343361854.pdf
Thanks
Your desired list can also be achieved using groupby .
import os  # the original called os.listdir() but only did 'from os import ...'
from itertools import groupby
from os import listdir, unlink

def latest_per_day(names):
    """Return the lexicographically last (i.e. newest) name for each day.

    itertools.groupby only merges *adjacent* items, so the input is sorted
    first; the grouping key is the leading 'YYYY-MM-DD' prefix of each name.
    Sorting also makes 'last of the group' deterministic — os.listdir order
    is arbitrary.
    """
    picked = []
    for _day, group in groupby(sorted(names), lambda name: name[:10]):
        picked.append(list(group)[-1])
    return picked

filtered_list = latest_per_day(os.listdir('.'))
print(filtered_list)  # Python 3 print() (original used the Python 2 statement)
Sort the list and delete files if the next file in the list is on the same day,
import glob
import os

def delete_older_same_day(directory='.'):
    """Delete every *.pdf whose successor (in sorted order) belongs to the same
    day, leaving only the newest file of each day.

    The ISO8601-dated names sort chronologically, so a textual sort suffices.
    """
    files = sorted(glob.glob(os.path.join(directory, '*.pdf')))
    for current, following in zip(files, files[1:]):
        # Same 'YYYY-MM-DD' prefix means a newer same-day file exists.
        if os.path.basename(following)[:10] == os.path.basename(current)[:10]:
            os.unlink(current)

if __name__ == '__main__':
    # Guarded: the original deleted files as a side effect of merely importing it.
    delete_older_same_day()
Edit:
As the OPs question became clearer it became evident that not just the last file of the list is required, but the latest file of each day - to achieve this I included a "same day" conditioned unlinking.
You could do it that way. The following code is untested, but may work:
import os

def delete_all_but_last(directory='.'):
    """Delete every entry in *directory* except the lexicographically last one.

    NOTE(review): this keeps only ONE file overall, not one per day — that is
    what the original snippet did; kept for fidelity to the quoted answer.
    """
    names = sorted(os.listdir(directory))
    for name in names[:-1]:
        os.unlink(os.path.join(directory, name))

if __name__ == '__main__':
    # Guarded: the original deleted files as a side effect of merely importing it.
    delete_all_but_last()
Fortunately your file names use ISO8601 date format so the textual sort achieves the desired result with no need to parse the dates.
The following snippet works with the test case given.
import os

def keep_latest_per_day(directory='.'):
    """Remove all but the lexicographically last file of each calendar day.

    Files are grouped by their full 'YYYY-MM-DD' prefix. The original grouped
    by fname[8:10] — the day-of-month only — which wrongly merged files from
    different months that share a day number.
    """
    files = os.listdir(directory)
    days = set(name[:10] for name in files)
    for day in days:
        same_day = [name for name in files if name[:10] == day]
        for doomed in sorted(same_day)[:-1]:
            os.remove(os.path.join(directory, doomed))

if __name__ == '__main__':
    # Guarded: the original deleted files as a side effect of merely importing it.
    keep_latest_per_day()
Using dictionary You can keep one value. This can be dirty and quickest solution, maybe not the best.
#!/usr/bin/env python
import os
import datetime
import stat
import shutil
from shutil import copyfile

def newest_per_day(names):
    """Map each YYYYMMDD date (as int) to the newest .pdf name for that date.

    Sorting first makes 'last one wins' deterministic: the timestamps embedded
    in the names sort chronologically, so the final entry kept for each date is
    genuinely the newest. (The original relied on arbitrary os.listdir order.)
    """
    newest = {}
    for name in sorted(names):
        if name.endswith(".pdf"):
            newest[int(name[:10].replace("-", ""))] = name
    return newest

def prune_directory(workdir="."):
    """Keep only the newest pdf per day, staging the survivors through ./tmpdir
    exactly as the original script did: copy keepers out, delete all pdfs,
    copy keepers back, remove the staging directory."""
    keep = [name for _, name in sorted(newest_per_day(os.listdir(workdir)).items())]
    tmpdir = os.path.join(workdir, "tmpdir")
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)
    for name in keep:
        copyfile(os.path.join(workdir, name), os.path.join(tmpdir, name))
    for name in os.listdir(workdir):
        if name.endswith(".pdf"):
            os.unlink(os.path.join(workdir, name))
    for name in os.listdir(tmpdir):
        if name.endswith(".pdf"):
            copyfile(os.path.join(tmpdir, name), os.path.join(workdir, name))
    shutil.rmtree(tmpdir)

if __name__ == "__main__":
    # Guarded: the original deleted/moved files as a side effect of importing it.
    prune_directory()

Categories