Match and change multiple date formats into one with Python Regex - python
I need to implement a function in Python which is able to retrieve multiple date formats from input string, change them into one specific format and return the date only:
Format Example Input String
MMDDYYYY foo.bar.02242015.txt
MMDDYY foo.bar.022415.txt
MONCCYY foo.bar.FEB2015.txt
YYYY-MM-DD foo_bar_2015-02-01_2015-02-28.txt
YYYYMMDD foo_bar_20150224.txt
MM_YY foo_bar_02_15.txt
YYYYMMDD foo_bar_20150224.txt
Output: just a fixed 8 digits date format (no foo, bar or txt):
YYYYMMDD (e.g. 20120524)
Example:
Input Output
foo.bar.02242015.txt -> 20150224
Some requirements:
if the day of the month is missing, use the last day of that month:
foo_02_15.txt -> 20150228
if year is 2 digits, change it to 4:
foo_02_24_16.txt -> 20160224
valid year is current or previous year, for now: 2016 or 2015
if month is not number, e.g. FEB, change it to 2 digit number:
foo.FEB2015.txt -> 20150228
Format 'YYYY-MM-DD' always contains two dates, fetch the second one:
foo_2015-02-01_2015-02-28.txt -> 20150228
Anyone know how to do it with Regex in Python? Or what is the best practice to do it?
Try this:
import re
import time
import datetime
import calendar
# Token finder. The lookbehind requires a '.' or '_' immediately before the
# token and the lookahead requires a '.' right after it. The character class
# accepts uppercase letters, digits, '+', '_' and '-'. Group 1 is the whole
# token; because the leading part is lazy, group 2 keeps at most the last 10
# characters of it, which is what isolates the second date in a
# 'YYYY-MM-DD_YYYY-MM-DD' pair.
# NOTE: the ur'' prefix is Python 2-only syntax.
p = re.compile(ur'(?<=\.|_)([A-Z\d+_-]*?([A-Z\d+_-]{0,10}))(?=\.)')
# Sample text: the table from the question, one example file name per format.
test_str = u"Format Example Input String \n\nMMDDYYYY foo.bar.02242015.txt\nMMDDYY foo.bar.022415.txt\nMONCCYY foo.bar.FEB2015.txt\nYYYY-MM-DD foo_bar_2015-02-01_2015-02-28.txt\nYYYYMMDD foo_bar_20150224.txt\nMM_YY foo_bar_02_15.txt\nYYYYMMDD foo_bar_20150224.txt"
def changedate(date):
    """Normalise a date token to 'YYYYMMDD'.

    The token is tried against each supported layout in turn:
    MMDDYYYY, MMDDYY, MONYYYY (e.g. 'FEB2015'), MM_YY and YYYY-MM-DD.
    Layouts without a day (MONYYYY and MM_YY) are completed with the last
    day of that month.

    Args:
        date: the candidate date token, e.g. '02242015' or 'FEB2015'.

    Returns:
        The date formatted as 'YYYYMMDD', or ``date`` itself, unchanged,
        when no layout matches (this includes tokens already in YYYYMMDD
        form and non-string input).
    """
    # (strptime format, True when the layout has no day-of-month field).
    # Order matters: every layout is tried and a later match overrides an
    # earlier one, so ambiguous digit strings resolve the same way they
    # always have.
    layouts = (
        ('%m%d%Y', False),
        ('%m%d%y', False),
        ('%b%Y', True),
        ('%m_%y', True),
        ('%Y-%m-%d', False),
    )

    parsed = None
    for fmt, lacks_day in layouts:
        try:
            parsed = time.strptime(date, fmt)
            if lacks_day:
                # Re-parse with the month's last day appended so the
                # resulting struct_time carries a real day-of-month.
                last_day = calendar.monthrange(parsed.tm_year,
                                               parsed.tm_mon)[1]
                parsed = time.strptime(date + str(last_day), fmt + '%d')
        except (TypeError, ValueError):
            # ValueError: the token does not fit this layout.
            # TypeError: the input is not a string at all.
            continue

    if parsed is None:
        return date
    try:
        return time.strftime("%Y%m%d", parsed)
    except ValueError:
        # Some platforms refuse to format very old years; keep the token.
        return date
# Replace every token matched by p with changedate() applied to group 2 only.
# The whole match (group 0) is replaced, so in a 'YYYY-MM-DD_YYYY-MM-DD' pair
# the first date disappears and only the converted second date remains.
test_str = re.sub(p,lambda m: changedate(m.group(2)), test_str)
print test_str
Regex Demo
Input
Format Example Input String
MMDDYYYY foo.bar.02242015.txt
MMDDYY foo.bar.022415.txt
MONCCYY foo.bar.FEB2015.txt
YYYY-MM-DD foo_bar_2015-02-01_2015-02-28.txt
YYYYMMDD foo_bar_20150224.txt
MM_YY foo_bar_02_15.txt
YYYYMMDD foo_bar_20150224.txt
Output:
Format Example Input String
MMDDYYYY foo.bar.20150224.txt
MMDDYY foo.bar.20150224.txt
MONCCYY foo.bar.20150228.txt
YYYY-MM-DD foo_bar_20150228.txt
YYYYMMDD foo_bar_20150224.txt
MM_YY foo_bar_20150228.txt
YYYYMMDD foo_bar_20150224.txt
Explanation:
E.g.
Input
foo_bar_2015-02-01_2015-02-28.txt
So
(?<=\.|_)([A-Z\d+_-]*?([A-Z\d+_-]{0,10}))(?=\.)
This regex captures the date string into the groups of the match object m:
1. [182-203] `2015-02-01_2015-02-28`
2. [193-203] `2015-02-28`
m.group(0) = 2015-02-01_2015-02-28
m.group(1) = 2015-02-01_2015-02-28
m.group(2) = 2015-02-28
Then
lambda m: changedate(m.group(2)) to re-format datetime
So
2015-02-28 fails to parse in the other blocks, such as
try:
t = time.strptime(date,'%m%d%Y')
except:
pass
But it parses successfully in this block
try:
t = time.strptime(date,'%Y-%m-%d')
except:
pass
Then format it
try:
r = time.strftime("%Y%m%d",t)
return r
except:
pass
UPDATE2 Please try the following approach (python 2.7):
import re
import calendar
# Sample file names, one per date layout from the question
# (the YYYYMMDD sample appears twice).
INPUT = ['foo.bar.02242015.txt',
'foo.bar.022415.txt',
'foo.bar.FEB2015.txt',
'foo_bar_2015-02-01_2015-02-28.txt',
'foo_bar_20150224.txt',
'foo_bar_02_15.txt',
'foo_bar_20150224.txt' ]
# P1: MMDDYY or MMDDYYYY as three groups (month, day, year); the year is
# two digits, optionally preceded by a '19' or '20' century.
P1 = r'(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])((?:19|20)?\d{2})'
# P2: three alternatives, none of which has capture groups:
#   MONYYYY          three capitals followed by a four-digit year
#   YYYY-MM-DD[_]    ISO date, optionally swallowing a trailing '_'
#   MM_YY            month and two-digit year
P2 = r'[A-Z]{3}[12]\d{3}|[12]\d{3}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])_?|(?:0[1-9]|1[0-2])_[12]\d'
MONTHS = ['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']


def _month_end_stamp(year, month):
    """Return year + month + last day of that month, e.g. '20150228'."""
    last_day = calendar.monthrange(int(year), int(month))[1]
    return year + month + str(last_day)


def _rewrite_p2_token(match):
    """Return the replacement text for one P2 match."""
    token = match.group(0)
    if '-' in token:
        # YYYY-MM-DD. A trailing '_' means another token follows (the
        # first date of a 'start_end' pair), so that date is dropped.
        if token.endswith('_'):
            return ''
        return token.replace('-', '')
    if '_' in token:
        # MM_YY: the century is assumed to be 20xx.
        month, short_year = token.split('_')
        return _month_end_stamp('20' + short_year, month)
    # MONYYYY. Three capitals that are not a month abbreviation (e.g. the
    # 'BAR' in 'FOOBAR2015') are not a date: leave the text untouched
    # instead of failing on the MONTHS lookup.
    name, year = token[:3], token[3:]
    if name not in MONTHS:
        return token
    return _month_end_stamp(year, str(MONTHS.index(name) + 1).zfill(2))


def _rewrite_p1_token(match):
    """Reorder one MMDDYY / MMDDYYYY match into YYYYMMDD."""
    month, day, year = match.groups()
    if len(year) == 2:
        # Two-digit years are assumed to be 20xx.
        year = '20' + year
    return year + month + day


def StrFormat(date_string):
    """Rewrite every recognised date inside ``date_string`` as YYYYMMDD.

    Two passes are made over the text. The first handles the P2 layouts
    (MONYYYY, YYYY-MM-DD and MM_YY); layouts without a day get the last
    day of the month. The second reorders MMDDYY / MMDDYYYY tokens (P1).
    Text that matches neither pattern is left as it is.

    Args:
        date_string: text (typically a file name) that may contain dates.

    Returns:
        ``date_string`` with each recognised date replaced in place.
    """
    # Each match is rewritten by its own callback, so the decision and
    # the substitution always refer to the same piece of text.
    date_string = re.sub(P2, _rewrite_p2_token, date_string)
    return re.sub(P1, _rewrite_p1_token, date_string)
# Print each sample name (left-justified to 35 columns) next to its
# converted form (right-justified to 20 columns).
for i in INPUT:
print i.ljust(35), '->', StrFormat(i).rjust(20)
Output:
foo.bar.02242015.txt -> foo.bar.20150224.txt
foo.bar.022415.txt -> foo.bar.20150224.txt
foo.bar.FEB2015.txt -> foo.bar.20150228.txt
foo_bar_2015-02-01_2015-02-28.txt -> foo_bar_20150228.txt
foo_bar_20150224.txt -> foo_bar_20150224.txt
foo_bar_02_15.txt -> foo_bar_20150228.txt
foo_bar_20150224.txt -> foo_bar_20150224.txt
Btw: It is indeed 10% Regex + 90% programming as suggested by noob :-)
Related
comparing a list of strings with a path which contains files
i have a list of paths to files: rasterfiles= glob.glob(r'Y:\5_Wetter\system\ecmwf_data\temperature\*\*avg.tif') rasterfiles = ['Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000291_temperature_avg.tif, 'Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000225_temperature_avg.tif','Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000027_temperature_avg.tif','Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000254_temperature_avg.tif','Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000177_temperature_avg.tif,...'] and the list: def dates_to_dayoftheyears(start, end): duration = end - start period = [] for d in range(duration.days + 1): day = start + timedelta(days=d) period.append(day) year = [] for day in period: year.append(day.strftime('%Y')) dayoftheyear = [] for day in period: dayoftheyear.append(day.strftime('%j')) year_and_days = [a+ b for a,b in zip(year, dayoftheyear)] return year_and_days print (dates_to_dayoftheyears(date(2013,4,1), date(2013,4,30))) Output ['2013091', '2013092', '2013093', '2013094', '2013095', '2013096', '2013097', '2013098', '2013099', '2013100', '2013101', '2013102', '2013103', '2013104', '2013105', '2013106', '2013107', '2013108', '2013109', '2013110', '2013111', '2013112', '2013113', '2013114', '2013115', '2013116', '2013117', '2013118', '2013119', '2013120'] How can I only extract those files within my rasterfiles list which have the same string sequence as my second list?
You can simply iterate through the values import os rasterfiles = ['Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000291_temperature_avg.tif', 'Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000225_temperature_avg.tif','Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000027_temperature_avg.tif','Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000254_temperature_avg.tif','Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000177_temperature_avg.tif'] dates = ['2013091', '2013092', '2013093', '2013094', '2013095', '2013096', '2013097', '2013098', '2013099', '2013100', '2013101', '2013102', '2013103', '2013104', '2013105', '2013106', '2013107', '2013108', '2013109', '2013110', '2013111', '2013112', '2013113', '2013114', '2013115', '2013116', '2013117', '2013118', '2013119', '2013120'] result = [] for f in rasterfiles: for d in dates: if d in os.path.basename(f): result.append(f) print(result) In a more sophisticated version, you can use a regular expression to extract the date part of the filename and use it to compare to the dates list: import os import re rasterfiles = ['Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000291_temperature_avg.tif', 'Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000225_temperature_avg.tif','Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000027_temperature_avg.tif','Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000254_temperature_avg.tif','Y:\\5_Wetter\\system\\ecmwf_data\\temperature\\2000\\ecmwf_2000177_temperature_avg.tif'] dates = ['2013091', '2013092', '2013093', '2013094', '2013095', '2013096', '2013097', '2013098', '2013099', '2013100', '2013101', '2013102', '2013103', '2013104', '2013105', '2013106', '2013107', '2013108', '2013109', '2013110', '2013111', '2013112', '2013113', '2013114', '2013115', '2013116', '2013117', '2013118', '2013119', '2013120'] re_pattern = "[a-zA-Z]+_([0-9]{6,8})_.*" result = [] for f in 
rasterfiles: m = re.search(re_pattern, os.path.basename(f)) if (m != None): if m.group(1) in dates: result.append(f) print(result)
Know which parts a day contains
The dateparser library set missing parts of a date to today's values. Example: >>> import dateparser >>> dateparser.parse("2015") datetime.datetime(2015, 2, 14, 0, 0) How to know which parts a date really contains? (and thus which parts were set to today's values by the library)? This is what I've come up with. Is there a more efficient way? date_str = input('Type a date: ') settings = {"REQUIRE_PARTS": ["year"]} res = dateparser.parse(date_str, settings=settings) if res is None: print("Invalid Date") return settings = {"REQUIRE_PARTS": ["year", "month"]} res = dateparser.parse(date_str, settings=settings) if res is None: print("Date has year only") return settings = {"REQUIRE_PARTS": ["year", "month", "day"]} res = dateparser.parse(date_str, settings=settings) if res is None: print("Date has year and month") return print("Date has year, month and day")
How to replace the day in a date with another date?
I'm trying to replace the day of my date inside an if statement, but the year in my output comes out wrong: I get 05/15/5 instead of 05/15/2020. Code is below: today_date = datetime.datetime.now() date = today_date.date() formatted_date = datetime.date.strftime(date, "%m/%d/%Y") mmonth = date.month myear = date.year mdate = date.day if mdate < 7: m0weekend = formatted_date.replace(str(myear),str(mmonth),1) else: m0weekend = formatted_date.replace(str(myear),str(mmonth),15)
it's easier to replace the day before converting to a string: date = date.replace(day=1) or, in your case: if mdate < 7: m0weekend = date.replace(day=1) else: m0weekend = date.replace(day=15)
formatted_date is actually a string. You are using the str.replace() method not the datetime.date.replace() method. import datetime today_date = datetime.datetime.now() pre_formatted_date = today_date.date() mmonth = pre_formatted_date.month myear = pre_formatted_date.year mdate = pre_formatted_date.day if mdate < 7: pre_formatted_date = pre_formatted_date.replace(day=1) else: pre_formatted_date = pre_formatted_date.replace(day=15) print(pre_formatted_date) formatted_date = pre_formatted_date.strftime("%m/%d/%Y") print(formatted_date) Which has the following output: 2020-05-15 05/15/2020
You might get today datetime.date directly from datetime rather than creating datetime.datetime and converting to date. After you have today you might create needed datetime.date and turn it into str, i.e.: import datetime today = datetime.date.today() date = datetime.date(today.year, today.month, 1 if today.day < 7 else 15) formatted_date = datetime.date.strftime(date, "%m/%d/%Y") print(formatted_date) # 05/15/2020
Datetime format detection Python3
I want to use python3 to do the date format detection for example I have file1 = "test_20180101234523.txt" and the output should be the format type %Y%M%D%H%m%S and expected datetime format 2018-01-01,23:45:23 Here's what I did so far, import re file1 = "test_20180101234523.txt" pattern = r'[0-9]{14}' regex=re.compile(pattern) matches = regex.findall(file1) matchesStr = matches[0] matchesYear = int(matchesStr[0:4]) matchesMonth = int(matchesStr[4:6]) matchesdate = int(matchesStr[6:8]) matchesH = int(matchesStr[8:10]) matchesM = int(matchesStr[10:12]) matchesS = int(matchesStr[12:14]) def checkdate(): if matchesYear > 1900: print("%Y") else: print("Year is not format") if matchesMonth >= 1 and matchesMonth <= 12: print("%M") else: print("Month is not format") if matchesdate >= 1 and matchesdate <= 31: print("%d") else: print("Date is not format") if matchesH >= 1 and matchesH <= 24: print("%H") else: print("Hour is not a format") if matchesM >= 1 and matchesM <= 60: print("%m") else: print("Min is not a format") if matchesS >= 1 and matchesS <= 60: print("%S") else: print("Sec is not a format") I use regex to find out the group of integer and substring those to be each variable that I need. And use if-else condition to check each of those. If you guys have any other idea, could you share, please?
Use datetime.strptime as (Assuming the regex output will be 14 digit everytime and follows same format): import datetime date = datetime.datetime.strptime('20180101234523', '%Y%m%d%H%M%S') date.strftime('%Y-%m-%d,%H:%M:%S') '2018-01-01,23:45:23'
If the run of digits in your input is always 14 digits long, you can use datetime.strptime together with a regex, as in this code, to get your desired output: import re from datetime import datetime def get_integers(file_name, prefix='test_'): """Return matched integers""" regex = re.compile(r'{prefix}(\d+)'.format(prefix=prefix)) matched = re.findall(regex, file_name) return matched[0] if matched else '' def get_datetime_object(date_string): """Return datetime object from date_string if it exists""" try: date_object = datetime.strptime(date_string, '%Y%m%d%H%M%S') return date_object.strftime('%Y-%m-%d,%H:%M:%S') except ValueError: return None file1 = 'test_20180101234523.txt' integers = get_integers(file1) date = get_datetime_object(integers) print(date) Output: 2018-01-01,23:45:23 PS: Note that if the integers in the input aren't 14 digits long, you should adapt the get_integers function to return a string that contains 14 digits.
change (eg) 8 to 08...python
I am reading data from a csv file, and there are date elements in it, but there is an inconsistency in the dates. For example: sometimes the date element is like 1/1/2011 and sometimes it is like 01/01/2011 Since I am plotting this data later.. this causes a great deal of noise in my plots. The following is my date class. Can you help me out in where and how to modify the code in order to get the date in the form 01/01/2011 import re class Date: def __init__(self, input_date): self._input_date = input_date self._date = None self._month = None self._year = None self._hour = None self._min = None def setDate(self): date = self._input_date #date = re.findall('w+',date) date = self.__mySplit() #print"len ",len(date) assert (len(date) >= 3) #has atleast dd/mm/yy #dateLength = len(date[0]) self._month = int(date[0]) self._date = int(date[1]) self._year = int(date[2]) if (len(date) ==5): self._hour = int(date[3]) self._min = int(date[4]) def __mySplit(self): #splitting the date by delimiters.. res = [self._input_date] #print res seps = [' ','/',':'] for sep in seps: s,res = res,[] for seq in s: res += seq.split(sep) #print res return res Thanks
You definitely want to be using datetime. Here's some code that will get a datetime from either string type: from datetime import datetime def strToDatetime(dateStr): return datetime.strptime(dateStr, "%d/%m/%Y") Then, you can print a datetime out in the format you want with: strToDatetime("1/3/2011").strftime("%d/%m/%Y) >'01/03/2011' You should never need to roll your own date/time/datetime structure in python.