IEEE C37.118 python parser of PMU RAW file - python

Here is sample output data from a Phasor Measurement Unit (PMU) device placed in a distribution network. The PMU device follows the IEEE C37.118 standard. Here are two sample records:
+---------+---------------+----------+
15:28:59,420,551 ETHER
|0 |f4|03|43|3e|e0|18|00|45|1d|62|5f|f9|08|00|45|00|00|4e|a7|88|00|00|3c|06|35|d3|c0|a8|84|85|0a|14|52|0d|12|68|ee|2c|08|1d|21|f6|27|9a|9b|fa|50|18|20|00|7e|06|00|00|aa|01|00|26|00|06|60|21|58|bb|ff|06|68|20|08|04|f2|48|d7|b1|00|00|00|00|fe|3c|fb|47|fe|37|fb|46|00|04|ff|ff|f2|db|
+---------+---------------+----------+
15:28:59,440,855 ETHER
|0 |f4|03|43|3e|e0|18|00|45|1d|62|5f|f9|08|00|45|00|00|4e|a7|8c|00|00|3c|06|35|cf|c0|a8|84|85|0a|14|52|0d|12|68|ee|2c|08|1d|22|1c|27|9a|9b|fa|50|18|20|00|a5|d5|00|00|aa|01|00|26|00|06|60|21|58|bb|ff|06|b6|00|08|04|f2|4d|d7|b0|00|00|00|00|fe|3c|fb|47|fe|37|fb|46|00|04|ff|ff|7d|02|
+---------+---------------+----------+
Does anyone know where I can find a Python implementation, a pip package, or anything similar for a parser that could convert this data into a sensible dataframe?
Any info could be useful.

I've created a script that parses these types of files and writes the output to Excel:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import pandas as pd
import numpy as np
import datetime
from datetime import timedelta
import pytz
from datetime import datetime
import pytz
from tzlocal import get_localzone
tz = pytz.timezone('Europe/London')
# Smallest number of "|"-separated fields a dump line must have for every
# field read below (the highest index used is 91).
_MIN_FIELDS = 92


def _hex_to_int(fields, start, stop):
    """Return the big-endian integer spelled by the hex bytes fields[start:stop].

    Whitespace inside the individual fields is ignored.
    """
    return int("".join("".join(fields[start:stop]).split()), 16)


def parse_pmu_frame(fields, zone):
    """Decode one hex-dump line that has already been split on "|".

    Index 0 of ``fields`` is the empty string before the leading "|" and
    index 1 is the dump offset column, so byte N of the captured frame sits at
    index N + 2.  The 0xAA sync byte is expected at index 56.

    :param fields: list of two-character hex strings (plus the two leading
        non-data entries described above)
    :param zone: tzinfo used to render the frame timestamp
    :return: dict with the decoded values, in output-column order
    :raises ValueError: if the line is too short or contains non-hex data
    """
    if len(fields) < _MIN_FIELDS:
        raise ValueError(
            "PMU frame too short: expected at least %d fields, got %d"
            % (_MIN_FIELDS, len(fields))
        )

    record = {
        "SYNC": _hex_to_int(fields, 56, 58),
        "Packet size": _hex_to_int(fields, 58, 60),
        "PMU ID": _hex_to_int(fields, 60, 62),
    }

    epoch_seconds = _hex_to_int(fields, 62, 66)
    # Convert straight into the target zone instead of going through a naive
    # local-time value first.
    record["Timestamp"] = datetime.fromtimestamp(epoch_seconds, zone).strftime('%d.%m.%Y %H:%M:%S')

    # Index 66 is deliberately skipped, as in the original layout
    # (presumably a time-quality byte - confirm against the standard).
    microseconds = _hex_to_int(fields, 67, 70)
    # str(timedelta) looks like "0:00:00.419872"; [6:] keeps "0.419872".
    record["Time diff"] = str(timedelta(microseconds=microseconds))[6:]

    record["Status PMU"] = _hex_to_int(fields, 70, 72)
    record["Fazor 1"] = _hex_to_int(fields, 72, 76)
    record["Fazor 2"] = _hex_to_int(fields, 76, 80)
    record["Fazor 3"] = _hex_to_int(fields, 80, 84)
    record["Fazor 4"] = _hex_to_int(fields, 84, 88)
    record["Frequency"] = _hex_to_int(fields, 88, 90)
    record["Delta freq"] = _hex_to_int(fields, 90, 92)
    return record


def read_pmu_capture(path, zone):
    """Read a text capture and return one dict per hex-dump line.

    A capture consists of repeated groups of three lines: a "+---...---+"
    separator, a "<time> ETHER" header and a "|"-separated hex dump.
    """
    records = []
    record = {}
    with open(path, 'r') as pmuraw:
        for line in pmuraw:
            text = line.strip()
            if not text:
                continue
            if text.startswith('+---------+---------------+----------+'):
                record = {}
            elif "ETHER" in text:
                record["Time"], record["Network"] = text.split(sep=" ")
            elif "|" in text:
                record.update(parse_pmu_frame(text.split(sep="|"), zone))
                records.append(record)
    return records


listofrecords = []
finalDict = {}

if __name__ == "__main__":
    listofrecords = read_pmu_capture("inputData/PMU-Location1.txt", tz)
    # A hyphen is not allowed in an identifier, so the frame is named with
    # underscores.  to_excel() no longer accepts an ``encoding`` argument.
    pmu_location1_df = pd.DataFrame(listofrecords)
    pmu_location1_df.to_excel("outputData/PMU-Location1.xlsx", sheet_name='PMU Žerjavinec')

Related

Python: ipywidgets not showing output

I wrote a python script that should have a data frame as output, but it does not show any output. Below is the python code:
import pandas as pd
import numpy as np
import ipywidgets as widgets
import datetime
from ipywidgets import interactive
from IPython.display import display, Javascript
from datetime import date, timedelta
from random import choices
# Question exhibit: builds 100 random (book, counterparty, date) rows and three
# widgets meant to filter them.  Indentation was lost in this copy.
books = ["Book_1","Book_2","Book_3","Book_4","Book_5"]
counterparties = ["Counterparty_1","Counterparty_2","Counterparty_3","Counterparty_4","Counterparty_5"]
book = choices(books, k = 100)
counterparty = choices(counterparties, k = 100)
# res_dates ends up holding every day from 2018-08-01 to 2023-08-03 inclusive.
date1, date2 = date(2018, 8, 1), date(2023, 8, 3)
res_dates = [date1]
while date1 != date2:
date1 += timedelta(days=1)
res_dates.append(date1)
ldd = choices(res_dates, k=100)
# NOTE(review): this name shadows the builtin dict.
dict = {'book': book, 'counterparty': counterparty, 'last_trading_date': ldd}
df = pd.DataFrame(dict)
# Dropdown options are the distinct values present in each column.
books = pd.Categorical(df['book'])
books = books.categories
books_dropdown = widgets.Dropdown(
options=books,
value=books[0],
description='Book:',
disabled=False,
)
counterparty = pd.Categorical(df['counterparty'])
counterparty = counterparty.categories
counter_dropdown = widgets.Dropdown(
options=counterparty,
value=counterparty[0],
description='Counterparty:',
disabled=False,
)
date_picker = widgets.DatePicker(
description='Pick a Date',
disabled=False,
)
date_picker.add_class("start-date")
# NOTE(review): this Javascript object is created but never passed to display().
script = Javascript("\
const query = '.start-date > input:first-of-type'; \
document.querySelector(query).setAttribute('min', '2020-12-01'); \
document.querySelector(query).setAttribute('max', '2025-01-01'); \
")
box = widgets.VBox([books_dropdown, counter_dropdown, date_picker])
display(box)
# Callback: keep rows matching the selected book and counterparty whose
# last_trading_date is before the picked date, then show them in report_output.
def filter_function(bookcode, cpartycode, datecode):
filtered = df[(df['book'] == bookcode) & (df['counterparty'] == cpartycode)]
# NOTE(review): the answer in this thread passes datecode straight through
# instead of calling datetime.date(datecode.value) - this line is the reported problem.
x = datetime.date(datecode.value)
filtered = filtered[filtered['last_trading_date'] < x]
with report_output:
report_output.clear_output()
display(filtered)
# NOTE(review): the interactive widget returned here is neither stored nor displayed.
interactive(filter_function, bookcode=books_dropdown, cpartycode=counter_dropdown, datecode=date_picker)
# report_output is created only here, after filter_function has been wired up.
report_output = widgets.Output()
display(report_output)
What this does is basically take a data frame, subset the said data frame into a smaller data frame based on categories of two variables, and truncate the resulting data frame based on a date selected by the user.
Did I make a mistake somewhere? If so, can someone point to me where? Thank you in advance.
Edit:
After many attempts I came to the conclusion that the problem is related to the DatePicker widget. So you can focus on that when trying to solve the problem.
Here is the code I used to reproduce the issue if I understand it correctly:
from datetime import date, timedelta
from random import choices
import pandas as pd
import ipywidgets as widgets
import datetime
from ipywidgets import interactive
from IPython.display import display, Javascript
# ---- Sample data: 100 random (book, counterparty, date) rows ----------------
books = ["Book_1","Book_2","Book_3","Book_4","Book_5"]
counterparties = ["Counterparty_1","Counterparty_2","Counterparty_3","Counterparty_4","Counterparty_5"]
book = choices(books, k=100)
counterparty = choices(counterparties, k=100)

# Every calendar day from 2018-08-01 to 2023-08-03 inclusive.
date1, date2 = date(2018, 8, 1), date(2023, 8, 3)
res_dates = [date1]
while date1 != date2:
    date1 += timedelta(days=1)
    res_dates.append(date1)
ldd = choices(res_dates, k=100)

# Named ``data`` rather than ``dict`` so the builtin is not shadowed.
data = {'book': book, 'counterparty': counterparty, 'last_trading_date': ldd}
df = pd.DataFrame(data)
# Keep plain datetime.date objects so they compare directly with the
# DatePicker value.
df['last_trading_date'] = pd.to_datetime(df['last_trading_date'], format='%Y-%m-%d').dt.date

# ---- Widgets ----------------------------------------------------------------
# Dropdown options are the distinct values present in each column.
books = pd.Categorical(df['book']).categories
books_dropdown = widgets.Dropdown(
    options=books,
    value=books[0],
    description='Book:',
    disabled=False,
)
counterparty = pd.Categorical(df['counterparty']).categories
counter_dropdown = widgets.Dropdown(
    options=counterparty,
    value=counterparty[0],
    description='Counterparty:',
    disabled=False,
)
date_picker = widgets.DatePicker(
    description='Pick a Date',
    disabled=False,
)
date_picker.add_class("start-date")
# NOTE(review): this snippet is built but never displayed, so the min/max
# attributes are not applied to the date input.
script = Javascript("\
const query = '.start-date > input:first-of-type'; \
document.querySelector(query).setAttribute('min', '2020-12-01'); \
document.querySelector(query).setAttribute('max', '2025-01-01'); \
")

# Create the output area before wiring the callback: filter_function writes
# into it, so it must exist by the time the callback first runs.
report_output = widgets.Output()


def filter_function(bookcode, cpartycode, datecode):
    """Show the rows for one book and counterparty traded before ``datecode``.

    :param bookcode: selected value of the book dropdown
    :param cpartycode: selected value of the counterparty dropdown
    :param datecode: selected date, or None while no date has been picked;
        in that case no date filter is applied
    """
    filtered = df[(df['book'] == bookcode) & (df['counterparty'] == cpartycode)]
    if datecode is not None:
        filtered = filtered[filtered['last_trading_date'] < datecode]
    with report_output:
        report_output.clear_output()
        display(filtered)


w = interactive(filter_function, bookcode=books_dropdown, cpartycode=counter_dropdown, datecode=date_picker)
display(w)
display(report_output)
Using the widget that's displayed when the code is run in Jupyter Notebook, I get the following output:
The only changes I made to the code you provided are:
Remove the code for VBox.
Store interactive widget as a variable and use display() to display it.
Use the datecode argument of filter_function directly when building filtered, instead of datetime.date(datecode.value).

Get time from city name using Python

As you can see in the title, I want to find the time of given city in Python. How can I achieve this? I've tried geopy and timezonefinder modules but they are giving me different results too. (like 'What time is it in Spotify?', 'It's 12:04')
What I'm trying to achieve is:
What time is it in California?
It's 16:15
THE CODE
import nltk
import datetime
import calendar
import pytz
from geopy.geocoders import Nominatim
from timezonefinder import TimezoneFinder
# Question exhibit: resolve a place name typed by the user to a timezone and
# print the current date there.
# NOTE(review): `self` is not defined anywhere in this excerpt - presumably the
# code was lifted from a method; confirm against the full source.
self.inp = input("City name: ")
# Find city name using NLP
# Get city name
findCityName = str(self.inp.title())
# NLP
word = nltk.word_tokenize(findCityName)
pos_tag = nltk.pos_tag(word)
chunk = nltk.ne_chunk(pos_tag)
# Keep only the chunks that came back as nltk.Tree nodes and join their words.
self.inp = [ " ".join(w for w, t in ele) for ele in chunk if isinstance(ele, nltk.Tree)]
self.inp = ' '.join(self.inp)
# Get lat, long from city name
geolocator = Nominatim(user_agent='xxx')
location = geolocator.geocode(self.inp.capitalize())
# Get timezone from coordinates
tf = TimezoneFinder()
# NOTE(review): `location` is used without checking that the lookup returned a result.
latitude, longitude = location.latitude, location.longitude
# Timezone
datez = tf.timezone_at(lng=longitude, lat=latitude)
datez = str(datez)
globalDate = datetime.datetime.now(pytz.timezone(datez))
# The format string below prints weekday and date only, with no time of day.
print("The date in " + str(self.inp) + " is: " + globalDate.strftime('%A, %m/%d/%y'))

Pandas format datetime with many different date types

I am trying to format the column 'Data' to make a pattern with dates.
The formats I have are:
1/30/20 16:00
1/31/2020 23:59
2020-02-02T23:43:02
Here is the code for the dataframe.
import requests
import pandas as pd
import numpy as np
# Question exhibit: download every daily-report CSV listed on the GitHub
# directory page and stack them into one frame.
url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
# File names are scraped out of the page HTML by splitting on spaces and "=".
csv_only = [i.split("=")[1][1:-1] for i in requests.get(url).text.split(" ") if '.csv' in i and 'title' in i]
# The same URL is rewritten to the raw.githubusercontent.com form for each file.
combo = [pd.read_csv(url.replace("github","raw.githubusercontent").replace("/tree/","/")+"/"+f) for f in csv_only]
one_df = pd.concat(combo,ignore_index=True)
# Two spellings of each column are merged: the first is used, the second fills its gaps.
one_df["País"] = one_df["Country/Region"].fillna(one_df["Country_Region"])
one_df["Data"] = one_df["Last Update"].fillna(one_df["Last_Update"])
I tried adding the code below, but it doesn't produce the result I wanted:
# NOTE(review): the converted result is not assigned back, so one_df['Data'] is unchanged.
pd.to_datetime(one_df['Data'])
# NOTE(review): the Styler returned here is not stored or displayed either.
one_df.style.format({"Data": lambda t: t.strftime("%m/%d/%Y")})
Any help?
UPDATE
This is the complete code, but it doesn't work. Many exceptions printed with different date formats.
import requests
import pandas as pd
import numpy as np
from datetime import datetime
# Question exhibit ("UPDATE"): same download as before, then an attempt to
# normalise the date column with a list of explicit formats.
# Indentation was lost in this copy.
url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
csv_only = [i.split("=")[1][1:-1] for i in requests.get(url).text.split(" ") if '.csv' in i and 'title' in i]
combo = [pd.read_csv(url.replace("github","raw.githubusercontent").replace("/tree/","/")+"/"+f) for f in csv_only]
one_df = pd.concat(combo,ignore_index=True)
df = pd.DataFrame()
# NOTE(review): the last three entries are the same format repeated.
DATE_FORMATS = ["%m/%d/%y %H:%M", "%m/%d/%Y %H:%M", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S"]
# Output columns are renamed; where two source spellings exist the first is
# used and the second fills its gaps.
df["Região"] = one_df["Province/State"].fillna(one_df["Admin2"])
df["País"] = one_df["Country/Region"].fillna(one_df["Country_Region"])
df["Data"] = one_df["Last Update"].fillna(one_df["Last_Update"])
df["Confirmados"] = one_df["Confirmed"]
df["Mortes"] = one_df["Deaths"]
df["Recuperados"] = one_df["Recovered"]
# Try each format in turn and return the date as MM/DD/YYYY for the first match.
def parse(x_):
for fmt in DATE_FORMATS :
try:
tmp = datetime.strptime(x_, fmt).strftime("%m/%d/%Y")
return tmp
except ValueError:
# NOTE(review): per the answer in this thread this print sat inside the loop,
# so a value is printed for every format that fails, even if a later one matches.
print(x_)
# NOTE(review): the result of this call is discarded.
pd.to_datetime(df['Data'])
df['Data'] = df['Data'].apply(lambda x: parse(x))
#df['Data'].strftime('%m/%d/%Y')
#df['Data'] = df['Data'].map(lambda x: x.strftime('%m/%d/%Y') if x else '')
df.to_excel(r'C:\Users\guilh\Downloads\Covid2\Covid-19.xlsx', index=False, encoding="utf8")
print(df)
from datetime import datetime
import pandas as pd
You could save all possible formats in a list as -
DATE_FORMATS = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%m/%d/%y %H:%M", "%m/%d/%Y %H:%M"]
Define a function that loops through the formats and tries to parse it.
(Fixed a bug, where the print statement should have been outside the for loop)
# Values that matched none of the known formats; inspect after parsing.
issues = set()


def parse(x_, formats=None):
    """Return ``x_`` re-formatted as MM/DD/YYYY, or None if it cannot be parsed.

    Each format is tried in order and the first one that matches wins.  A
    value that matches no format (or is not a string at all, e.g. None or
    NaN from a dataframe) is recorded in the module-level ``issues`` set.

    :param x_: date string to parse
    :param formats: strptime formats to try; defaults to ``DATE_FORMATS``
    :return: formatted date string, or None
    """
    candidates = DATE_FORMATS if formats is None else formats
    for fmt in candidates:
        try:
            return datetime.strptime(x_, fmt).strftime("%m/%d/%Y")
        except (TypeError, ValueError):
            # ValueError: wrong format; TypeError: x_ is not a string.
            continue
    issues.add(x_)
    return None
# Demo: run the three formats from the question through parse().
sample = ["1/30/20 16:00", "1/31/2020 23:59", "2020-02-02T23:43:02"]
df = pd.DataFrame({'data': sample})
df['data'] = df['data'].apply(parse)
# The column is called 'data' (lower case); looking up 'Data' raises KeyError.
assert df['data'].isna().sum() == len(issues) == 0, "Issues observed, nulls observed in dataframe"
print("Done")
Output
data
0 01/30/2020
1 01/31/2020
2 02/02/2020
If df.apply() comes across a date format that isn't in the list, parse() records the value in issues and returns nothing, so that cell simply ends up as None.
also here, letting pd.to_datetime infer the format does the trick:
import pandas as pd
raw_dates = ["1/30/20 16:00", "1/31/2020 23:59", "2020-02-02T23:43:02"]
try:
    # pandas >= 2.0 infers one format from the first element and rejects the
    # rest unless told that the formats are mixed.
    s = pd.to_datetime(raw_dates, format="mixed")
except (TypeError, ValueError):
    # Older pandas does not know "mixed" but infers the format per element.
    s = pd.to_datetime(raw_dates)
print(s)
# DatetimeIndex(['2020-01-30 16:00:00', '2020-01-31 23:59:00',
# '2020-02-02 23:43:02'],
# dtype='datetime64[ns]', freq=None)
Note that if your date/time format generally provides the day first (e.g. 30.1.2021 for Jan 30th 2021), set keyword dayfirst=True.

Python: invalid syntax: <string>, line 1, pos 16

I have developed a code in Python in which -in order to run the program- I need to take some arguments from the command line. But I am getting continuously the same error:
Traceback (most recent call last):
File "<string>", line 1, in <fragment>
invalid syntax: <string>, line 1, pos 16
I don't have the faintest idea what is wrong with my code, so I present it below in case someone can help:
import QSTK.qstkutil.qsdateutil as du
import QSTK.qstkutil.tsutil as tsu
import QSTK.qstkutil.DataAccess as da
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import math
import copy
import QSTK.qstkstudy.EventProfiler as ep
import csv
import sys
import argparse
def readData(li_startDate, li_endDate, ls_symbols):
    """Fetch Yahoo price data for ``ls_symbols`` between two dates.

    :param li_startDate: sequence whose first three items are year, month, day
    :param li_endDate: sequence whose first three items are year, month, day
    :param ls_symbols: symbols to load
    :return: list ``[d_data, dt_start, dt_end, dt_timeofday, ldt_timestamps]``
        where ``d_data`` maps each price key to the object returned for it
    """
    # Start and end of the requested period.
    period_start = dt.datetime(li_startDate[0], li_startDate[1], li_startDate[2])
    period_end = dt.datetime(li_endDate[0], li_endDate[1], li_endDate[2])

    # Closing prices are wanted, hence the 16:00 time of day.
    close_time = dt.timedelta(hours=16)
    trading_days = du.getNYSEdays(period_start, period_end, close_time)

    # QSTK data access object backed by the Yahoo source.
    source = da.DataAccess('Yahoo', cachestalltime=0)
    price_keys = ['open', 'high', 'low', 'close', 'volume', 'actual_close']
    frames = source.get_data(trading_days, ls_symbols, price_keys)

    # Pair every key with the item returned at the same position.
    by_key = {key: frame for key, frame in zip(price_keys, frames)}
    return [by_key, period_start, period_end, close_time, trading_days]
def marketsim(cash, orders_file, values_file):
    """Replay the orders in ``orders_file`` and write daily portfolio values.

    :param cash: starting cash balance
    :param orders_file: CSV of orders; columns X.1-X.3 are year, month, day,
        X.4 the symbol, X.5 "Buy" (anything else is treated as a sell) and
        X.6 the share count
    :param values_file: output path; one "YYYY,MM,DD, value" line per
        trading day
    """
    # NOTE(review): index_col='Date' together with header=None and the X.n
    # column names depends on the pandas version in use - left as written.
    orders = pd.read_csv(orders_file, index_col='Date', parse_dates=True, header=None)
    ls_symbols = list(set(orders['X.4'].values))
    df_lastrow = len(orders) - 1
    dt_start = dt.datetime(orders.get_value(0, 'X.1'), orders.get_value(0, 'X.2'), orders.get_value(0, 'X.3'))
    # Add the extra day with a timedelta: "day + 1" breaks on the last day of
    # a month.
    dt_end = dt.datetime(
        orders.get_value(df_lastrow, 'X.1'),
        orders.get_value(df_lastrow, 'X.2'),
        orders.get_value(df_lastrow, 'X.3'),
    ) + dt.timedelta(days=1)

    # Closing prices are used, hence the 16:00 time of day.
    dt_timeofday = dt.timedelta(hours=16)
    ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)
    c_dataobj = da.DataAccess('Yahoo', cachestalltime=0)
    ls_keys = ['open', 'high', 'low', 'close', 'volume', 'actual_close']
    ldf_data = c_dataobj.get_data(ldt_timestamps, ls_symbols, ls_keys)
    d_data = dict(zip(ls_keys, ldf_data))

    # One row per trading day, one column per symbol plus the cash balance.
    ls_symbols.append("_CASH")
    trades = pd.DataFrame(index=list(ldt_timestamps), columns=list(ls_symbols))
    current_cash = cash
    current_stocks = dict()
    for symb in ls_symbols:
        current_stocks[symb] = 0
        trades[symb][ldt_timestamps[0]] = 0
    # Set the opening cash after the zeroing loop, which also visits "_CASH".
    trades["_CASH"][ldt_timestamps[0]] = current_cash

    # Record the running cash balance and position after every order.
    for row in orders.iterrows():
        row_data = row[1]
        current_date = dt.datetime(row_data['X.1'], row_data['X.2'], row_data['X.3'], 16)
        symb = row_data['X.4']
        stock_value = d_data['close'][symb][current_date]
        stock_amount = row_data['X.6']
        if row_data['X.5'] == "Buy":
            current_cash = current_cash - (stock_value * stock_amount)
            current_stocks[symb] = current_stocks[symb] + stock_amount
        else:
            current_cash = current_cash + (stock_value * stock_amount)
            current_stocks[symb] = current_stocks[symb] - stock_amount
        trades["_CASH"][current_date] = current_cash
        trades[symb][current_date] = current_stocks[symb]

    # Balances and positions persist between orders, so carry the last known
    # value forward; fillna returns a new frame and must be assigned.
    trades = trades.fillna(method='ffill').fillna(0)

    # Portfolio value per day = cash + sum(position * closing price).
    with open(values_file, "w") as fileout:
        for day in ldt_timestamps:
            value = 0
            for sym in ls_symbols:
                if sym == "_CASH":
                    value = value + trades[sym][day]
                else:
                    value = value + trades[sym][day] * d_data['close'][sym][day]
            fileout.write(day.strftime('%Y,%m,%d') + ", " + str(np.round(value)) + "\n")
def main(argv):
    """Run the market simulator from command-line arguments.

    :param argv: arguments without the program name, i.e.
        ``[orders_file, values_file]``
    :raises SystemExit: with a usage message (non-zero status) when the
        number of arguments is wrong
    """
    if len(argv) != 2:
        # sys.exit(message) prints to stderr and exits with status 1; it works
        # the same on Python 2 and 3, unlike the print statement.
        sys.exit("Invalid arguments for marketsim.py. It should be of the following syntax: marketsim.py orders_file.csv values_file.csv")
    initial_cash = 1000000
    ordersFile = str(argv[0])
    valuesFile = str(argv[1])
    marketsim(initial_cash, ordersFile, valuesFile)


if __name__ == "__main__":
    main(sys.argv[1:])
The input I gave to the command line was:
python marketsim.py orders.csv values.csv
I guess that the problem lies either in the imports or in the main function (including the if below def main(argv)).
I have to point out that the files orders.csv and values.csv exist and are located into the same folder.
I hope have made everything clear.
So, I am looking forward to reading your answers community-mates! :D
Thank you!

Convert CreationTime of PDF to a readable format in Python

I'm working on PDF with Python and I'm accessing the file's meta data by using PDFMiner. I extract the info using this:
from pdfminer.pdfparser import PDFParser, PDFDocument
# Question exhibit: read the CreationDate entry from a PDF's metadata.
# Uses the print statement, i.e. Python 2 syntax.
# NOTE(review): fp is never closed in this snippet.
fp = open('diveintopython.pdf', 'rb')
parser = PDFParser(fp)
doc = PDFDocument()
# Parser and document are linked in both directions before initialize().
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize()
print doc.info[0]['CreationDate']
# And return this value "D:20130501200439+01'00'"
How can I convert D:20130501200439+01'00' into a readable format in Python?
I found the format documented here. I needed to cope with the timezones too because I have 160k documents from all over to deal with. Here is my full solution:
import datetime
import re
from dateutil.tz import tzutc, tzoffset
# D:YYYYMMDDHHmmSS followed by an optional zone: "Z"/"z", or a sign with
# optional hour and minute digits (the minute may be wrapped in apostrophes).
pdf_date_pattern = re.compile(''.join([
    r"(D:)?",
    r"(?P<year>\d\d\d\d)",
    r"(?P<month>\d\d)",
    r"(?P<day>\d\d)",
    r"(?P<hour>\d\d)",
    r"(?P<minute>\d\d)",
    r"(?P<second>\d\d)",
    # The hyphen is escaped: an unescaped "+-z" is a range that also matches
    # digits and letters.
    r"(?P<tz_offset>[+\-zZ])?",
    r"(?P<tz_hour>\d\d)?",
    r"'?(?P<tz_minute>\d\d)?'?"]))


def transform_date(date_str):
    """
    Convert a pdf date such as "D:20120321183444+07'00'" into a usable datetime
    http://www.verypdf.com/pdfinfoeditor/pdf-date-format.htm
    (D:YYYYMMDDHHmmSSOHH'mm')

    A missing zone, or "Z"/"z", is treated as UTC.  A missing zone hour or
    minute counts as 0.

    :param date_str: pdf date string
    :return: timezone-aware datetime object, or None if the string does not
        match the expected layout
    """
    match = pdf_date_pattern.match(date_str)
    if not match:
        return None

    parts = match.groupdict()
    # Take the zone pieces out first; what remains maps directly onto the
    # datetime constructor's keyword arguments.
    sign = parts.pop('tz_offset')
    tz_hour = int(parts.pop('tz_hour') or 0)
    tz_minute = int(parts.pop('tz_minute') or 0)
    date_info = dict((name, int(text)) for name, text in parts.items())

    if sign is None or sign.lower() == 'z':  # UTC
        date_info['tzinfo'] = tzutc()
    else:
        multiplier = 1 if sign == '+' else -1
        date_info['tzinfo'] = tzoffset(None, multiplier * (3600 * tz_hour + 60 * tz_minute))
    return datetime.datetime(**date_info)
Is "+01'00'" the timezone information? Not taking that into account, you can create a datetime object as follows...
>>>from time import mktime, strptime
>>>from datetime import datetime
...
>>>datestring = doc.info[0]['CreationDate'][2:-7]
>>>ts = strptime(datestring, "%Y%m%d%H%M%S")
>>>dt = datetime.fromtimestamp(mktime(ts))
datetime.datetime(2013, 5, 1, 20, 4, 39)
use Python 3's datetime.strptime; just remove the apostrophes first:
from datetime import datetime
# Raw CreationDate value as found in the PDF metadata.
creation_date = "D:20130501200439+01'00'"

# %z understands "+0100" but not "+01'00'", so the apostrophes are dropped
# before parsing.
_without_quotes = "".join(creation_date.split("'"))
dt = datetime.strptime(_without_quotes, "D:%Y%m%d%H%M%S%z")

print(repr(dt))
# datetime.datetime(2013, 5, 1, 20, 4, 39, tzinfo=datetime.timezone(datetime.timedelta(seconds=3600)))
print(dt.isoformat())
# 2013-05-01T20:04:39+01:00
once you have a datetime object, you can format back to string however you like for a "readable" output, see strptime/strftime directives.
Guess I don't have the rep to comment on Paul Whipp's illustrative answer, but I've amended it to handle a form of the Y2K bug present in some of my old files. The year 2000 was written 19100, so the relevant line of pdf_date_pattern became
r"(?P<year>191\d\d|\d\d\d\d)",
and I added an elif to the transform values loop:
elif k == 'year' and len(v) == 5:
date_info[k] = int('20' + v[3:])

Categories