Convert CreationTime of PDF to a readable format in Python - python

I'm working on PDF with Python and I'm accessing the file's meta data by using PDFMiner. I extract the info using this:
from pdfminer.pdfparser import PDFParser, PDFDocument
fp = open('diveintopython.pdf', 'rb')
parser = PDFParser(fp)
doc = PDFDocument()
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize()
print doc.info[0]['CreationDate']
# And return this value "D:20130501200439+01'00'"
How can I convert D:20130501200439+01'00' into a readable format in Python?

I found the format documented here. I needed to cope with the timezones too because I have 160k documents from all over to deal with. Here is my full solution:
import datetime
import re
from dateutil.tz import tzutc, tzoffset
pdf_date_pattern = re.compile(''.join([
r"(D:)?",
r"(?P<year>\d\d\d\d)",
r"(?P<month>\d\d)",
r"(?P<day>\d\d)",
r"(?P<hour>\d\d)",
r"(?P<minute>\d\d)",
r"(?P<second>\d\d)",
r"(?P<tz_offset>[+-zZ])?",
r"(?P<tz_hour>\d\d)?",
r"'?(?P<tz_minute>\d\d)?'?"]))
def transform_date(date_str):
"""
Convert a pdf date such as "D:20120321183444+07'00'" into a usable datetime
http://www.verypdf.com/pdfinfoeditor/pdf-date-format.htm
(D:YYYYMMDDHHmmSSOHH'mm')
:param date_str: pdf date string
:return: datetime object
"""
global pdf_date_pattern
match = re.match(pdf_date_pattern, date_str)
if match:
date_info = match.groupdict()
for k, v in date_info.iteritems(): # transform values
if v is None:
pass
elif k == 'tz_offset':
date_info[k] = v.lower() # so we can treat Z as z
else:
date_info[k] = int(v)
if date_info['tz_offset'] in ('z', None): # UTC
date_info['tzinfo'] = tzutc()
else:
multiplier = 1 if date_info['tz_offset'] == '+' else -1
date_info['tzinfo'] = tzoffset(None, multiplier*(3600 * date_info['tz_hour'] + 60 * date_info['tz_minute']))
for k in ('tz_offset', 'tz_hour', 'tz_minute'): # no longer needed
del date_info[k]
return datetime.datetime(**date_info)

Is "+01'00'" the timezone information? Not taking that into account, you can create a datetime object as follows...
>>>from time import mktime, strptime
>>>from datetime import datetime
...
>>>datestring = doc.info[0]['CreationDate'][2:-7]
>>>ts = strptime(datestring, "%Y%m%d%H%M%S")
>>>dt = datetime.fromtimestamp(mktime(ts))
datetime(2013, 5, 1, 20, 4, 30)

use Python 3's datetime.strptime; just remove the apostrophes first:
from datetime import datetime
creation_date = "D:20130501200439+01'00'"
dt = datetime.strptime(creation_date.replace("'", ""), "D:%Y%m%d%H%M%S%z")
print(repr(dt))
# datetime.datetime(2013, 5, 1, 20, 4, 39, tzinfo=datetime.timezone(datetime.timedelta(seconds=3600)))
print(dt.isoformat())
# 2013-05-01T20:04:39+01:00
once you have a datetime object, you can format back to string however you like for a "readable" output, see strptime/strftime directives.

Guess I don't have the rep to comment on Paul Whipp's illustrative answer, but I've amended it to handle a form of the Y2K bug present in some of my old files. The year 2000 was written 19100, so the relevant line of pdf_date_pattern became
r"(?P<year>191\d\d|\d\d\d\d)",
and I added an elif to the transform values loop:
elif k == 'year' and len(v) == 5:
date_info[k] = int('20' + v[3:])

Related

How to get all the timezones date in order by python

i create this loop to give all timezones :
import datetime
import pytz
today=datetime.datetime.now(tz=pytz.UTC)
for i in pytz.all_timezones:
print(today.astimezone(pytz.timezone(i)))
but i want to get them in order from -9 to +14
now = datetime.datetime.now()
tzs = sorted(pytz.all_timezones, key=lambda tz: pytz.timezone(tz).utcoffset(now))
This gives you a list like:
['Etc/GMT+12',
'Etc/GMT+11',
'Pacific/Midway',
'Pacific/Niue',
'Pacific/Pago_Pago',
'Pacific/Samoa',
'US/Samoa',
'Etc/GMT+10',
'HST',
'Pacific/Honolulu',
...]
Supposing you are just interested in GMT timezones...
from datetime import datetime
from pytz import UTC, all_timezones, timezone, UnknownTimeZoneError
now = datetime.now(tz=UTC)
print('Timezones should be similar to these...')
timezones_candidates = [tz for tz in all_timezones if 'GMT' in tz]
print(timezones_candidates)
first = -14
last = +9
desired_timezones = ['Etc/GMT{0:+}'.format(shift) for shift in range(first, last+1)]
print('\nNow as timezone:')
for tz in desired_timezones:
try:
print(now.astimezone(timezone(tz)))
except UnknownTimeZoneError:
print('The timezone {0} doesnt exist'.format(tz))
import datetime
import pytz
today=datetime.datetime.now(tz=pytz.UTC)
ordered = {}
for i in pytz.all_timezones:
timezone = today.astimezone(pytz.timezone(i))
gmt = str(timezone)[-6:-3]
gmt = (-1 if gmt[0] == '-' else 1) * int(gmt[1:])
ordered[pytz.timezone(i)] = gmt
for timezone, _ in sorted(ordered.items(), key=lambda item: item[1]):
print(today.astimezone(timezone))

Pandas format datetime with many different date types

I am trying to format the column 'Data' to make a pattern with dates.
The formats I have are:
1/30/20 16:00
1/31/2020 23:59
2020-02-02T23:43:02
Here is the code for the dataframe.
import requests
import pandas as pd
import numpy as np
url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
csv_only = [i.split("=")[1][1:-1] for i in requests.get(url).text.split(" ") if '.csv' in i and 'title' in i]
combo = [pd.read_csv(url.replace("github","raw.githubusercontent").replace("/tree/","/")+"/"+f) for f in csv_only]
one_df = pd.concat(combo,ignore_index=True)
one_df["País"] = one_df["Country/Region"].fillna(one_df["Country_Region"])
one_df["Data"] = one_df["Last Update"].fillna(one_df["Last_Update"])
I tried adding the code bellow but it doesn't bring the result I wanted
pd.to_datetime(one_df['Data'])
one_df.style.format({"Data": lambda t: t.strftime("%m/%d/%Y")})
Any help?
UPDATE
This is the complete code, but it doesn't work. Many exceptions printed with different date formats.
import requests
import pandas as pd
import numpy as np
from datetime import datetime
url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
csv_only = [i.split("=")[1][1:-1] for i in requests.get(url).text.split(" ") if '.csv' in i and 'title' in i]
combo = [pd.read_csv(url.replace("github","raw.githubusercontent").replace("/tree/","/")+"/"+f) for f in csv_only]
one_df = pd.concat(combo,ignore_index=True)
df = pd.DataFrame()
DATE_FORMATS = ["%m/%d/%y %H:%M", "%m/%d/%Y %H:%M", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S"]
df["Região"] = one_df["Province/State"].fillna(one_df["Admin2"])
df["País"] = one_df["Country/Region"].fillna(one_df["Country_Region"])
df["Data"] = one_df["Last Update"].fillna(one_df["Last_Update"])
df["Confirmados"] = one_df["Confirmed"]
df["Mortes"] = one_df["Deaths"]
df["Recuperados"] = one_df["Recovered"]
def parse(x_):
for fmt in DATE_FORMATS :
try:
tmp = datetime.strptime(x_, fmt).strftime("%m/%d/%Y")
return tmp
except ValueError:
print(x_)
pd.to_datetime(df['Data'])
df['Data'] = df['Data'].apply(lambda x: parse(x))
#df['Data'].strftime('%m/%d/%Y')
#df['Data'] = df['Data'].map(lambda x: x.strftime('%m/%d/%Y') if x else '')
df.to_excel(r'C:\Users\guilh\Downloads\Covid2\Covid-19.xlsx', index=False, encoding="utf8")
print(df)
from datetime import datetime
import pandas as pd
You could save all possible formats in a list as -
DATE_FORMATS = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%m/%d/%y %H:%M", "%m/%d/%Y %H:%M"]
Define a function that loops through the formats and tries to parse it.
(Fixed a bug, where the print statement should have been outside the for loop)
issues = set()
def parse(x_):
for fmt in DATE_FORMATS:
try:
return datetime.strptime(x_, fmt).strftime("%m/%d/%Y")
except ValueError:
pass
issues.add(x_)
sample = ["1/30/20 16:00", "1/31/2020 23:59", "2020-02-02T23:43:02"]
df = pd.DataFrame({'data': sample})
df['data'] = df['data'].apply(lambda x: parse(x))
assert df['Data'].isna().sum() == len(issues) == 0, "Issues observed, nulls observed in dataframe"
print("Done")
Output
data
0 01/30/2020
1 01/31/2020
2 02/02/2020
If df.apply() comes across a particular date format that hasn't been defined in the list, it would simply print None since nothing would be returned by the function parse()
also here, letting pd.to_datetime infer the format does the trick:
import pandas as pd
s = pd.to_datetime(["1/30/20 16:00", "1/31/2020 23:59", "2020-02-02T23:43:02"])
print(s)
# DatetimeIndex(['2020-01-30 16:00:00', '2020-01-31 23:59:00',
# '2020-02-02 23:43:02'],
# dtype='datetime64[ns]', freq=None)
Note that if your date/time format generally provides the day first (e.g. 30.1.2021 for Jan 30th 2021), set keyword dayfirst=True.

How to parse Date(928142400000+0200)?

I have JSON response object with string representing date and time:
"event":{
"type":"Type",
"date-time":"\/Date(928142400000+0200)\/",
},
I am not sure:
what format is that
how can I parse it in python app
how can I convert python date into this format
Any suggestions?
928142400000 is the time in milliseconds since the UNIX epoch, +0200 is the timezone.
With the dateutil library or datetime.timezone() objects you can model the timezone offset, the timestamp itself is parsable with datetime.datetime.fromtimestamp(), provided you divide the value by 1000.0:
import datetime
import re
timestamp_parse = re.compile(r'Date\((\d+)([+-]\d{4})\)')
timestamp, offset = timestamp_parse.search(datetime_value).groups()
tzoffset = datetime.timedelta(hours=int(offset[1:3]), minutes=int(offset[3:]))
if offset[0] == '-':
tzoffset *= -1
tzoffset = datetime.timezone(tzoffset)
dt = datetime.datetime.fromtimestamp(int(timestamp) / 1000.0).replace(tzinfo=tzoffset)
The dateutil.tz.tzoffset() object version is similar:
import datetime
import re
import dateutil.tz
timestamp_parse = re.compile(r'Date\((\d+)([+-]\d{4})\)')
timestamp, offset = timestamp_parse.search(datetime_value).groups()
tzoffset = int(offset[1:3]) * 3600 + int(offset[3:]) * 60
if offset[0] == '-':
tzoffset *= -1
tzoffset = dateutil.tz.tzoffset(None, tzoffset)
dt = datetime.datetime.fromtimestamp(int(timestamp) / 1000.0).replace(tzinfo=tzoffset)
Demo:
>>> import datetime
>>> import re
>>> datetime_value = "/Date(928142400000+0200)/"
>>> timestamp_parse = re.compile(r'Date\((\d+)([+-]\d{4})\)')
>>> timestamp, offset = timestamp_parse.search(datetime_value).groups()
>>> tzoffset = datetime.timedelta(hours=int(offset[1:3]), minutes=int(offset[3:]))
>>> if offset[0] == '-':
... tzoffset *= -1
...
>>> tzoffset = datetime.timezone(tzoffset)
>>> datetime.datetime.fromtimestamp(int(timestamp) / 1000.0).replace(tzinfo=tzoffset)
datetime.datetime(1999, 5, 31, 10, 20, tzinfo=datetime.timezone(datetime.timedelta(0, 7200)))

python, how i do xml = '<start>%??</start>' % datetime.datetime

is there any way to pass a datetime directly in this format?
mydate = datetime.datetime.now()
myxmldate = '<start>%??</start>' % mydate
or have i to pass like a string?
I need to pass to an xml a datetime structure.
thanks
Try with datetime.isoformat()
mydate = datetime.datetime.now()
myxmldate = '<start>%s</start>' % mydate.isoformat()
Try to reed for 2.6:
>>> mydate.isoformat()
'2002-03-11'
>>> mydate.strftime("%d/%m/%y")
'11/03/02'
>>> mydate.strftime("%A %d. %B %Y")
'Monday 11. March 2002'
And for version 3 Using type-specific :
>>> import datetime
>>> d = datetime.datetime(2010, 7, 4, 12, 15, 58)
>>> '{:%Y-%m-%d %H:%M:%S}'.format(d)
'2010-07-04 12:15:58'
Since you haveXML, it makes sense to use an XML parser. For example, with lxml:
import lxml.etree as ET
import datetime
mydate = datetime.datetime.now()
doc = ET.fromstring('<start>%??</start>')
for start in doc.xpath('//start'):
start.text = start.text.replace('%??',str(mydate))
print(ET.tostring(doc))
yields
<start>2011-11-07 12:28:58.883274</start>

change (eg) 8 to 08...python

I am reading data from a csv file, and there are date elements in it, but there is an inconsistency in the dates.
For example: sometimes the date element is like 1/1/2011 and sometimes it is like 01/01/2011
Since I am plotting this data later.. this causes a great deal of noise in my plots. The following is my date class. Can you help me out in where and how to modify the code in order to get the date in the form 01/01/2011
import re
class Date:
def __init__(self, input_date):
self._input_date = input_date
self._date = None
self._month = None
self._year = None
self._hour = None
self._min = None
def setDate(self):
date = self._input_date
#date = re.findall('w+',date)
date = self.__mySplit()
#print"len ",len(date)
assert (len(date) >= 3) #has atleast dd/mm/yy
#dateLength = len(date[0])
self._month = int(date[0])
self._date = int(date[1])
self._year = int(date[2])
if (len(date) ==5):
self._hour = int(date[3])
self._min = int(date[4])
def __mySplit(self): #splitting the date by delimiters..
res = [self._input_date]
#print res
seps = [' ','/',':']
for sep in seps:
s,res = res,[]
for seq in s:
res += seq.split(sep)
#print res
return res
Thanks
You definitely want to be using datetime. Here's some code that will get a datetime from either string type:
from datetime import datetime
def strToDatetime(dateStr):
return datetime.strptime(dateStr, "%d/%m/%Y")
Then, you can print a datetime out in the format you want with:
strToDatetime("1/3/2011").strftime("%d/%m/%Y)
>'01/03/2011'
You should never need to roll your own date/time/datetime structure in python.

Categories