PostgreSQL csvlog: Large Message/Detail Truncated Causing Malformed CSV - python

In an attempt to use a foreign table for analyzing PostgreSQL's logs, I set log_destination=csvlog. Our application logs large JSON/XML payloads, which end up in the message and detail fields, and from day one we have been getting errors like the following because those large values get truncated at some system limit, leaving malformed CSV rows:
ERROR: extra data after last expected column
CONTEXT: COPY postgresql_log_1, line 268367: "2017-05-17 09:46:37.419 PDT,"user","dbname",75303,"10.181.55.93:50206",591549a8.12627,11820,"I..."
pgBadger errors out on the same oversized entries, like the following, for which I have submitted an issue at https://github.com/dalibo/pgbadger/issues/342:
FATAL: cannot use CSV on /var/log/postgresql/postgresql-9.5-proddb.csv.1, EIQ - QUO character not allowed at line 766714
DETAIL: 2017-04-19 12:45:05.389 PDT,"user","dbname",56389,"10.181.55.94:50466",58f6f766.dc45,3870,"INSERT",2017-04-18 22:36:38 PDT,71/104232,3111351054,LOG,00000,"duration: 82.541 ms execute <unnamed>: insert into EVENT_NOTIFICATION (ACTOR_ID, CREATED, DATA, TYPE, UPDATED, ID) values ($1, $2, $3, $4, $5, $6)","parameters: $1 = NULL, $2 = '2017-04-19 12:45:05.245-07', $3 = '{""productID"":""093707231228"",""fileData"":{""name"":""EPSON010.JPG"",""mimeType"":""image/jpeg"",""content"":""/9j/4QEGRXhpZgAASUkqAAgAAAAIAA4BAgAQAA...
...AUUAF2017-04-19 12:45:11.174 PDT,"user","dbname",56389,"10.181.55.94:50466",58f6f766.dc45,3871,"SELECT",2017-04-18 22:36:38 PDT,71/104241,0,LOG,00000,"duration: 125.202 ms execute <unnamed>: select max(cast(version as integer)) from category where org = $1","parameters: $1 = 'ST'",,,,,,,,""
reset CSV parser
I ended up with the following Python script as a workaround to repair the bad CSV contents. By dropping the few malformed CSV records from the log, I can now analyze my csvlog through a foreign table with SQL queries.
I figure this must be a common issue for a lot of Postgres folks out there, and I hope there is a more elegant approach. Please share your solution if you have one.
Here is mine for now.
#!/usr/bin/env python
import csv
import os
import sys
import linecache as lc
import pyparsing as pp
from datetime import datetime
csv.field_size_limit(sys.maxsize) # Max out the limit for large CSV contents
filename = sys.argv[1]
headers = [ 'log_time', 'user_name', 'database_name', 'process_id', 'connection_from',
            'session_id', 'session_line_num', 'command_tag', 'session_start_time',
            'virtual_transaction_id', 'transaction_id', 'error_severity', 'sql_state_code',
            'message', 'detail', 'hint', 'internal_query', 'internal_query_pos', 'context',
            'query', 'query_pos', 'location', 'application_name' ]
'''
Identify ranges of lines containing invalid CSV data
'''
bad_ranges = []
l_start = 0
with open(filename) as f:
    reader = csv.DictReader(f, fieldnames=headers)
    for csv_dict in reader:
        # Extraneous columns beyond the predefined headers, keyed as None, indicate a bad csvlog.
        if None in csv_dict:
            bad_ranges += [(csv_dict, l_start, reader.line_num + 1,)]
        else:
            try:  # Validate datetime format on log_time.
                datetime.strptime(csv_dict['log_time'], '%Y-%m-%d %H:%M:%S.%f %Z')
            except ValueError:
                bad_ranges += [(csv_dict, l_start, reader.line_num + 1,)]
        l_start = reader.line_num + 1
    line_count = reader.line_num + 1
yyyy = pp.Word(pp.nums, exact=4).setName("yyyy")
mm = pp.Word(pp.nums, exact=2).setName("mm")
dd = pp.Word(pp.nums, exact=2).setName("dd")
HH24 = pp.Word(pp.nums, exact=2).setName("HH24")
MI = pp.Word(pp.nums, exact=2).setName("MI")
SS = pp.Word(pp.nums, exact=2).setName("SS")
TZ = pp.Word(pp.alphas.upper(), exact=3).setName("TZ")
date = yyyy + "-" + mm + "-" + dd
time = HH24 + ":" + MI + ":" + SS + pp.Optional("." + pp.Word(pp.nums, max=3)) + " " + TZ
timestamptz = pp.Combine(date + " " + time)
mlDblQuoteString = pp.QuotedString('"', escQuote='""', multiline=True)
slDblQuoteString = pp.QuotedString('"', escQuote='""', multiline=False)
comma = pp.Suppress(',')
validCSVLog = timestamptz("log_time") + comma \
    + slDblQuoteString('user_name') + comma \
    + slDblQuoteString('database_name') + comma \
    + pp.Word(pp.nums)('process_id') + comma \
    + slDblQuoteString('connection_from') + comma \
    + pp.Word(pp.hexnums + ".")('session_id') + comma \
    + pp.Word(pp.nums)('session_line_num') + comma \
    + slDblQuoteString('command_tag') + comma \
    + timestamptz('session_start_time') + comma \
    + pp.Combine(pp.Word(pp.nums) + pp.Literal("/")
                 + pp.Word(pp.nums))('virtual_transaction_id') + comma \
    + pp.Word(pp.nums)('transaction_id') + comma \
    + pp.Word(pp.alphas.upper())('error_severity') + comma \
    + pp.Word(pp.alphanums)('sql_state_code') + comma \
    + pp.Optional(mlDblQuoteString)('message') + comma \
    + pp.Optional(mlDblQuoteString)('detail') + comma \
    + pp.Optional(mlDblQuoteString)('hint') + comma \
    + pp.Optional(mlDblQuoteString)('internal_query') + comma \
    + pp.Optional(pp.Word(pp.nums))('internal_query_pos') + comma \
    + pp.Optional(mlDblQuoteString)('context') + comma \
    + pp.Optional(mlDblQuoteString)('query') + comma \
    + pp.Optional(pp.Word(pp.nums))('query_pos') + comma \
    + pp.Optional(slDblQuoteString)('location') + comma \
    + pp.Optional(slDblQuoteString)('application_name') + pp.LineEnd().suppress()
'''
1. Scan for any valid CSV data to salvage from the malformed contents.
2. Make a new copy without the malformed CSV rows.
'''
if bad_ranges:
    l_lower = 0
    with open(filename + '.new', 'w') as t:
        for bad_dict, l_start, l_end in bad_ranges:
            t.writelines([lc.getline(filename, l) for l in range(l_lower, l_start)])
            bad_csv = ''.join([lc.getline(filename, l) for l in range(l_start, l_end)])
            print("{0:>8}: line[{1}:{2}] log_time={log_time} malformed CSV row found".format(
                'NOTICE', l_start, l_end-1, **bad_dict))
            for valid_dict, c_start, c_end in validCSVLog.scanString(bad_csv):
                print("{0:>8}: line[{1}:{2}] log_time={log_time} as valid CSV portion retained".format(
                    'INFO', l_start, l_end-1, **valid_dict))
                good_csv = bad_csv[c_start:c_end]
                t.write(good_csv)
            l_lower = l_end
        t.writelines([lc.getline(filename, l) for l in range(l_lower, line_count)])
    # Back up the original file as .bak and replace it with the repaired copy.
    backup = filename + '.bak'
    os.rename(filename, backup)
    print("{0:>8}: original file renamed to {1}".format('NOTICE', backup))
    os.rename(filename + '.new', filename)
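As a quick sanity check after the repair, something like the following can confirm that every remaining record parses into the expected 23 csvlog columns. This is only a sketch; the file path is just an example taken from the logs above, and 23 matches the headers list in the script.
import csv
import sys

csv.field_size_limit(sys.maxsize)
with open('/var/log/postgresql/postgresql-9.5-proddb.csv.1') as f:   # example path
    for recno, row in enumerate(csv.reader(f), 1):
        if len(row) != 23:   # 23 columns per csvlog record, same as the headers list
            print("record {0} still has {1} columns".format(recno, len(row)))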

Related

PDF template not merging data properly with pdftk

I'm editing a PDF template using pdftk:
command = ("pdftk " + '"' +
template + '"' +
" fill_form " + '"' +
pathUser + user['mail'] + ".xfdf" + '"' +
" output " + '"' +
pathUser + user['mail'] + ".pdf" + '"' +
" need_appearances")
command = command.replace('/', '\\')
os.system(command)
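As a side note, the same pdftk invocation can be sketched with subprocess and an argument list, which sidesteps hand-building the quoted command string; this assumes the same template, pathUser, and user variables as above and that pdftk is on PATH.
import subprocess

# Alternative to os.system: pass arguments as a list so paths with spaces
# need no manual quoting (fill_form/output/need_appearances are standard
# pdftk operations).
subprocess.run(["pdftk", template,
                "fill_form", pathUser + user['mail'] + ".xfdf",
                "output", pathUser + user['mail'] + ".pdf",
                "need_appearances"], check=True)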
First I write my data to an .xfdf file:
for key, value in user.items():
    print(key, value)
    fields.append(u"""<field name="%s"><value>%s</value></field>""" % (key, value))
tpl = u"""<?xml version="1.0" encoding="UTF-8"?>
<xfdf xmlns="http://ns.adobe.com/xfdf/" xml:space="preserve">
<fields>
%s
</fields>
</xfdf>""" % "\n".join(fields)
f = open(pathUser + user['mail'] + '.xfdf', 'wb')
f.write(tpl.encode("utf-8"))
f.close()
I fetch the template and, as shown above, write the data from the .xfdf to the PDF, but for some reason only the ime field gets written.
Templates get fetched using some basic conditional logic as shown below:
for item in user['predavanja']:
    user[acthead + str(actn)] = item
    actn += 1
for item in user['radionice']:
    user[acthead + str(actn)] = item
    actn += 1
for item in user['izlet']:
    user[acthead + str(actn)] = item
    actn += 1
print(actn)
templates = {}
templates['0'] = "Template/2019/certificate_2019.pdf"
templates['5'] = "Template/2019/certificate_2019_5.pdf"
templates['10'] = "Template/2019/certificate_2019_10.pdf"
templates['15'] = "Template/2019/certificate_2019_15.pdf"
templates['20'] = "Template/2019/certificate_2019_20.pdf"
templates['25'] = "Template/2019/certificate_2019_25.pdf"
templates['30'] = "Template/2019/certificate_2019_30.pdf"
templates['35'] = "Template/2019/certificate_2019_35.pdf"
templates['40'] = "Template/2019/certificate_2019_40.pdf"
templates['45'] = "Template/2019/certificate_2019_45.pdf"
templates['50'] = "Template/2019/certificate_2019_50.pdf"
I'm writing this data
user['id'] = data['recommendations'][0]['role_in_team']['user']['id']
user['ime'] = data['recommendations'][0]['role_in_team']['user']['first_name']
user['prezime'] = data['recommendations'][0]['role_in_team']['user']['last_name']
user['tim'] = data['recommendations'][0]['role_in_team']['team']['short_name']
user['mail'] = data['recommendations'][0]['role_in_team']['user']['estudent_email']
user['puno_ime'] = (data['recommendations'][0]['role_in_team']['user']['first_name'] + ' ' +
data['recommendations'][0]['role_in_team']['user']['last_name'])
user['predavanja'] = predavanja
user['radionice'] = radionice
user['izlet'] = izlet
One note: predavanja, radionice, and izlet are lists.
I've tried printing tpl, which shows all the data being added properly to the template.
It turns out the issue was the naming of the variables: they didn't match the field names in the AcroForm PDF. The solution was to rename the variables in the code to match the field names.
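One way to double-check those names is pdftk's dump_data_fields operation; here is a small sketch (it assumes pdftk is on PATH, and the template path is one of those listed above):
import subprocess

def list_field_names(template_path):
    # Dump the AcroForm metadata and keep only the FieldName entries.
    out = subprocess.run(["pdftk", template_path, "dump_data_fields"],
                         capture_output=True, text=True, check=True).stdout
    return [line.split(": ", 1)[1] for line in out.splitlines()
            if line.startswith("FieldName:")]

print(list_field_names("Template/2019/certificate_2019.pdf"))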

Parsing dynamic Arabic character XML in Python

This is the code. However, it can only parse exactly 4 Arabic characters. I want it to parse dynamically, so the number of characters doesn't matter: it should handle 1 character, 2 characters, or more, based on how many exist.
import xml.etree.ElementTree as ET
import os, glob
import csv
from time import time
#read xml path
xml_path = glob.glob(r'D:\1. Thesis FINISH!!!\*.xml')
#create file declaration for saving the result
file = open("parsing.csv","w")
#file = open("./%s" % ('parsing.csv'), 'w')
#create variable of starting time
t0=time()
#create file header
file.write('wordImage_id'+'|'+'paw1'+'|'+'paw2'+'|' + 'paw3' + '|' + 'paw4' + '|'+'font_size'+'|'+'font_style'+
           '|'+'font_name'+'|'+'specs_effect'+'|'+'specs_height'+'|'+'specs_height'
           +'|'+'specs_width'+'|'+'specs_encoding'+'|'+'generation_filtering'+
           '|'+'generation_renderer'+'|'+'generation_type' + '\n')
for doc in xml_path:
    print 'Reading file - ', os.path.basename(doc)
    tree = ET.parse(doc)
    #tree = ET.parse('D:\1. Thesis FINISH!!!\Image_14_AdvertisingBold_13.xml')
    root = tree.getroot()
    #get wordimage id
    image_id = root.attrib['id']
    #get paw 1 and paw 2
    paw1 = root[0][0].text
    paw2 = root[0][1].text
    paw3 = root[0][2].text
    paw4 = root[0][3].text
    #get properties of font
    for font in root.findall('font'):
        size = font.get('size')
        style = font.get('fontStyle')
        name = font.get('name')
    #get properties of specs
    for specs in root.findall('specs'):
        effect = specs.get('effect')
        height = specs.get('height')
        width = specs.get('width')
        encoding = specs.get('encoding')
    #get properties for generation
    for generation in root.findall('generation'):
        filtering = generation.get('filtering')
        renderer = generation.get('renderer')
        types = generation.get('type')
    #save the result in csv
    file.write(image_id + '|' + paw1 + '|' + paw2 + '|' + paw3 + '|' + paw4 + '|' + size + '|' +
               style + '|' + name + '|' + effect + '|' + height + '|'
               + width + '|' + encoding + '|' + filtering + '|' + renderer + '|' + types + '\n')
#close the file
file.close()
#print time execution
print("process done in %0.3fs." % (time() - t0))

How to encrypt a .bin file

I need to encrypt 3 .bin files, each of which contains 2 keys used for Diffie-Hellman. I have no clue how to do that; all I could think of was what I did in the following Python file. I have an example of what the output should look like, but my code doesn't seem to produce the right keys. The output file server.ini is used by a client to connect to a server.
import base64

fileList = [['game_key.bin', 'Game'], ['gate_key.bin', 'Gate'], ['auth_key.bin', 'Auth']]
iniList = []
for i in fileList:
    file = open(i[0], 'rb')
    n = list(file.read(64))
    x = list(file.read(64))
    file.close()
    n.reverse()
    x.reverse()
    iniList.append(['Server.' + i[1] + '.N "' + base64.b64encode("".join(n)) + '"\n',
                    'Server.' + i[1] + '.X "' + base64.b64encode("".join(x)) + '"\n'])
iniList[0].append('\n')
#time for user input
ip = '"' + raw_input('Hostname: ') + '"'
dispName = 'Server.DispName ' + '"' + raw_input('DispName: ') + '"' + '\n'
statusUrl = 'Server.Status ' + '"' + raw_input('Status URL: ') + '"' + '\n'
signupUrl = 'Server.Signup ' + '"' + raw_input('Signup URL: ') + '"' + '\n'
for l in range(1, 3):
    iniList[l].append('Server.' + fileList[l][1] + '.Host ' + ip + '\n\n')
for l in [[dispName], [statusUrl], [signupUrl]]:
    iniList.append(l)
outFile = open('server.ini', 'w')
for l in iniList:
    for i in l:
        outFile.write(i)
outFile.close()
The following was in my example file:
# Keys are Base64-encoded 512 bit RC4 keys, as generated by DirtSand's keygen
# command. Note that they MUST be quoted in the commands below, or the client
# won't parse them correctly!
I also tried it without reversing n and x.
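For comparison, here is a minimal Python 3 sketch of the same read/reverse/base64 step the script performs for one key file; whether the 64-byte values actually need to be reversed depends on the byte order the keygen wrote them in, which is an assumption here.
import base64

def encode_key_file(path, reverse=True):
    # Read the two 64-byte values (N, X) and return their Base64 encodings.
    with open(path, 'rb') as f:
        n = f.read(64)
        x = f.read(64)
    if reverse:                      # assumption: the values need byte-reversal
        n, x = n[::-1], x[::-1]
    return base64.b64encode(n).decode('ascii'), base64.b64encode(x).decode('ascii')

n_b64, x_b64 = encode_key_file('game_key.bin')
print('Server.Game.N "%s"' % n_b64)
print('Server.Game.X "%s"' % x_b64)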

Trouble with apostrophe in arcpy search cursor where clause

I've put together a tkinter form and Python script for downloading files from an FTP site. The filenames are in the attribute table of a shapefile, along with an overall Name that the filenames correspond to. In other words, I look up a Name such as "CABOT" and download the filename 34092_18.tif. However, if a Name has an apostrophe, such as "O'KEAN", it gives me trouble. I try to replace the apostrophe, like I've done in previous scripts, but it doesn't download anything.
whereExp = quadField + " = " + "'" + quadName.replace("'", '"') + "'"
quadFields = ["FILENAME"]
c = arcpy.da.SearchCursor(collarlessQuad, quadFields, whereExp)
for row in c:
    tifFile = row[0]
    tifName = quadName.replace("'", '') + '_' + tifFile
    #fullUrl = ftpUrl + tifFile
    local_filename = os.path.join(downloadDir, tifName)
    lf = open(local_filename, "wb")
    ftp.retrbinary('RETR ' + tifFile, lf.write)
    lf.close()
Here is an example of a portion of a script that works fine when replacing the apostrophe:
where_clause = quadField + " = " + "'" + quad.replace("'", '"') + "'"
#out_quad = quad.replace("'", "") + ".shp"
arcpy.MakeFeatureLayer_management(quadTable, "quadLayer")
select_out_feature_class = arcpy.SelectLayerByAttribute_management("quadLayer", "NEW_SELECTION", where_clause)
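One common approach in ArcGIS where clauses is to keep the apostrophe in the value and escape it for the SQL string literal by doubling it, rather than swapping it for a double quote. A sketch building on the variables from the question:
# quadField, quadName and collarlessQuad come from the question above.
quadValue = quadName.replace("'", "''")            # O'KEAN -> O''KEAN
whereExp = "{0} = '{1}'".format(quadField, quadValue)
c = arcpy.da.SearchCursor(collarlessQuad, ["FILENAME"], whereExp)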

Add a column in the middle of a row from a .csv file with Python

Hello, I have a Python script which changes a timestamp column in a .csv file from dot notation to "datetime T-SQL" notation.
One row looks like this before executing the code:
send,2007.10.04.10.11.11.669,Server,Data,Client,TYPE=STP,Length=329,Cnt=11
after executing the code it looks like this:
send,2007-10-04 10:11:11.669,Server,Data,Client,TYPE=STP,Length=329,Cnt=11
I want to append the same time in the new format after the first time column, so that it looks like this:
send,2007-10-04 10:11:11.669,2007-10-04 10:11:11.669,Server,Data,Client,TYPE=STP,Length=329,Cnt=11
Here is the script:
import csv
cr = csv.reader(open("ActualTrace_01 - short2Times.csv", "rb"))
output = csv.writer(open("GermanygoalInputFormatActualTrace_01 - short.csv", "wb"))
for row in cr:
    dateTimeContentsSend = row[1].split(".")
    finishSend = dateTimeContentsSend[0] + "-" + dateTimeContentsSend[1] + "-" + dateTimeContentsSend[2] + " " + dateTimeContentsSend[3] + ":"
    finishSend += dateTimeContentsSend[4] + ":" + dateTimeContentsSend[5] + "." + dateTimeContentsSend[6]
    row[1] = finishSend
    output.writerow(row)
None of the threads here were helpful. If you just set row[1] = finishSend + "," + finishSend, both values end up inside row[1] and the writer quotes them like this:
send,"2007-10-04 10:11:11.669,2007-10-04 10:11:11.684",Server,Data,Client,TYPE=STP,Length=329,Cnt=11
Are you after this (placed just after row[1] = finishSend)?
row.insert(2, row[1])
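For context, a sketch of the loop from the question with that insert applied (same input/output file names; the converted timestamp is duplicated as its own new column right after the original one, so no quoting issue arises):
import csv

cr = csv.reader(open("ActualTrace_01 - short2Times.csv", "rb"))
output = csv.writer(open("GermanygoalInputFormatActualTrace_01 - short.csv", "wb"))
for row in cr:
    p = row[1].split(".")
    finishSend = "%s-%s-%s %s:%s:%s.%s" % tuple(p[:7])
    row[1] = finishSend
    row.insert(2, finishSend)    # add the duplicate as a separate column
    output.writerow(row)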
