Error while concatenating a git command string in Python - python

Following this: Find out git branch creator
I am making a Python script that gives me a sorted set of emails from the output of
git for-each-ref --format='%(authoremail)%09%(refname)' | sort -k5n -k2M -k3n -k4n | grep remotes | awk -F "\t" '{ printf "%-32s %-27s %s\n", $1, $2, $3 }'
so that I can email the authors to say "these are your branches on the remote, please delete them".
But when I try to put the command together in Python I get an error:
intitial = "git for-each-ref --format='%(authoremail)%09%(refname)' | sort -k5n -k2M -k3n -k4n | grep remotes | awk -F "
addTab = "\t"
printf = '{ printf "%-32s %-27s %s\n", $1, $2, $3 }'
gitCommnad = "%s%s %s " % (intitial, addTab, printf)
def _exec_git_command(command, verbose=False):
""" Function used to get data out of git commads
and errors in case of failure.
Args:
command(string): string of a git command
verbose(bool): whether to display every command
and its resulting data.
Returns:
(tuple): string of Data and error if present
"""
# converts multiple spaces to single space
command = re.sub(' +',' ',command)
pr = subprocess.Popen(command, shell=True,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
msg = pr.stdout.read()
err = pr.stderr.read()
if err:
print err
if 'Could not resolve host' in err:
return
if verbose and msg:
print "Executing '%s' %s" % (command, msg)
return msg, err
print _exec_git_command(gitCommnad)

The issue is that you are not putting \t or { printf "%-32s %-27s %s\n", $1, $2, $3 } inside quotes, hence awk reports a Syntax error. You should use -
# Everything up to (but not including) awk's field-separator argument.
intitial = "git for-each-ref --format='%(authoremail)%09%(refname)' | sort -k5n -k2M -k3n -k4n | grep remotes | awk -F"
# Raw strings keep the backslash escapes intact so that awk, not Python,
# expands them. With a plain "\n" Python would place a real newline inside
# awk's double-quoted string constant, which awk rejects.
addTab = r"\t"
printf = r'{ printf "%-32s %-27s %s\n", $1, $2, $3 }'
# Quote the separator with "..." and the awk program with '...' for the shell.
gitCommnad = "%s \"%s\" '%s' " % (intitial, addTab, printf)

Related

Jenkins build failing without updating Xray with the failed status

Please forgive me if this is not the place to ask this question. I'm running Python scripts in a Jenkins pipeline from a Jenkinsfile. I am also updating Jira Xray tickets within the Jenkinsfile. Behave is being used to validate the test status. If the check fails, the Jenkins build fails without the Xray ticket being updated with the failure. I've attempted to use "try" to capture the failure but have not succeeded in getting the failure to propagate to the Xray ticket.
Would anyone here know where I might find an answer? I would be in your debt.
Jenkinsfile
node() {
def repoURL = '<GitLab URL>/prod-003.git'
def STC_INSTALL = "/opt/STC_CLIENT/Spirent_TestCenter_5.22/Spirent_TestCenter_Application_Linux/"
try {
stage("Prepare Workspace") {
echo "*** Prepare Workspace ***"
cleanWs()
env.WORKSPACE_LOCAL = sh(returnStdout: true, script: 'pwd').trim()
env.BUILD_TIME = "${BUILD_TIMESTAMP}"
echo "Workspace set to:" + env.WORKSPACE_LOCAL
echo "Build time:" + env.BUILD_TIME
sh """
cd ${env.WORKSPACE_LOCAL}
rm -fr *
"""
}
stage('Checkout Code') {
echo "*** Checking Code Out ***"
git branch: 'master', credentialsId: '', url: repoURL
}
stage('Executing Tests') {
if (env.WanModeCheck == "Yes") {
echo "Executing WAN Mode Change Before FW Upgrade"
sh """
/var/lib/jenkins/.pyenv/shims/python WanMode.py -i $modemIP -m $WanMode
"""
echo "Starting Firmware Upgrade"
sh """
cd ${env.WORKSPACE_LOCAL}
./ModemUpgrade.sh -i $modemIP -f $FW -p2
/var/lib/jenkins/.pyenv/shims/behave -f cucumber -o storetarget-bdd/reporting/cucumber.json --junit --format=json -o target/behave.json --junit ./features/PROD-003.feature
"""
} else {
echo "#######################\n# Skipping WAN Mode Change #\n#######################"
}
if (env.WanModeCheck == "No") {
echo "Starting Firmware Upgrade"
sh """
cd ${env.WORKSPACE_LOCAL}
./ModemUpgrade.sh -i $modemIP -f $FW -p2
/var/lib/jenkins/.pyenv/shims/behave -f cucumber -o storetarget-bdd/reporting/cucumber.json --junit --format=json -o target/behave.json --junit ./features/fwupgrade.feature
"""
}
// Setting variables to use for the Xray Test Execution
res = sh(returnStdout: true, script: 'awk "/##/{f=1;next} /#####/{f=0} f" PROD-003-Out.txt | sed -e "s/#//g" -e "s/^ * //g" | tr "\n" "%" | sed -e "s/^%%%%%%//g" -e "s/%%$//g" -e "s/%/\\\\\\\\Z/g" -e "s/Z/n/g"')
env.STResults = res.strip()
model = sh(returnStdout: true, script: 'grep Model: PROD-003-Out.txt')
env.Model = model.strip()
wanmode = sh(returnStdout: true, script: 'grep CPE PROD-003-Out.txt')
env.WanMode = wanmode.strip()
serialnum = sh(returnStdout: true, script: 'grep Number: PROD-003-Out.txt')
env.SerialNum = serialnum.strip()
echo "End of test phase"
}
stage('Expose report') {
echo "*** Expose Reports ***"
echo "*** Archive Artifacts ***"
archiveArtifacts "**/cucumber.json"
echo "*** cucumber cucumber.json ***"
cucumber '**/cucumber.json'
junit skipPublishingChecks: true, allowEmptyResults: true, keepLongStdio: true, testResults: 'reports/*.xml'
cucumber buildStatus: "UNSTABLE",
fileIncludePattern: "**/cucumber.json",
jsonReportDirectory: 'reports'
}
stage('Import results to Xray') {
echo "*** Import Results to XRAY ***"
def description = "Jenkins Project: ${env.JOB_NAME}\\n\\nCucumber Test Report: [${env.JOB_NAME}-Link|${env.BUILD_URL}/cucumber-html-reports/overview-features.html]\\n\\nJenkins Console Output: [${env.JOB_NAME}-Console-Link|${env.BUILD_URL}/console]\\n\\nCPE IP: ${modemIP}\\n\\nCPE FW File Name: ${FW}\\n\\n${env.STResults}"
def labels = '["regression","automated_regression"]'
def environment = "DEV"
def testExecutionFieldId = 10552
def testEnvironmentFieldName = "customfield_10372"
def projectKey = "AARC"
def projectId = 10608
def xrayConnectorId = "e66d84d8-f978-4af6-9757-93d5804fde1d"
// def xrayConnectorId = "${xrayConnectorId}"
def info = '''{
"fields": {
"project": {
"id": "''' + projectId + '''"
},
"labels":''' + labels + ''',
"description":"''' + description + '''",
"summary": "''' + env.JOB_NAME + ' ' + env.Model + ' ' + env.WanMode + ' ' + env.SerialNum + ''' Test Executed ''' + env.BUILD_TIME + ''' " ,
"issuetype": {
"id": "''' + testExecutionFieldId + '''"
}
}
}'''
echo info
step([$class: 'XrayImportBuilder',
endpointName: '/cucumber/multipart',
importFilePath: 'storetarget-bdd/reporting/cucumber.json',
importInfo: info,
inputInfoSwitcher: 'fileContent',
serverInstance: xrayConnectorId])
}
}
catch(e) {
// If there was an exception thrown, the build failed
currentBuild.result = "FAILED"
throw e
} finally {
// Success or failure, always send notifications
echo "Sending final test status to Slack"
// notifyBuild(currentBuild.result)
}
}
/**
 * Send the build status to Slack.
 *
 * Two messages are sent: a short summary through the default slackSend
 * configuration, and a status-specific message to the '#wopr-private'
 * channel.
 *
 * @param buildStatus 'STARTED', 'UNSTABLE', 'SUCCESSFUL', or anything else
 *                    for a failure; null is treated as 'SUCCESSFUL'.
 */
def notifyBuild(String buildStatus = 'STARTED') {
    // build status of null means successful
    buildStatus = buildStatus ?: 'SUCCESSFUL'

    def subject = "${buildStatus}: Job '${env.JOB_NAME} [${env.BUILD_NUMBER}]'"
    def summary = "${subject} (${env.BUILD_URL})"

    // Declared with `def` so they stay local to this call instead of
    // becoming script-wide variables. They start at the failure values,
    // which is also what the final `else` branch used to assign.
    def colorCode = '#FF0000'
    def msg = "Build: ${env.JOB_NAME} had an issue ${env.BUILD_URL}/console"

    // Override default values based on build status
    if (buildStatus == 'STARTED') {
        colorCode = '#0000FF'
        msg = "Build: ${env.JOB_NAME} has started: ${BUILD_TIMESTAMP}"
    } else if (buildStatus == 'UNSTABLE') {
        colorCode = '#FFFF00'
        msg = "Build: ${env.JOB_NAME} was listed as unstable. Look at ${env.BUILD_URL} and Report: ${env.BUILD_URL}/cucumber-html-reports/overview-features.html"
    } else if (buildStatus == 'SUCCESSFUL') {
        colorCode = '#00FF00'
        msg = "Build: ${env.JOB_NAME} Completed Successfully ${env.BUILD_URL} Report: ${env.BUILD_URL}/cucumber-html-reports/overview-features.html"
    }

    // Send notifications
    slackSend(color: colorCode, message: summary)
    slackSend baseUrl: 'https://hooks.slack.com/services/',
        channel: '#wopr-private',
        color: colorCode,
        message: msg,
        teamDomain: '<Slack URL>',
        tokenCredentialId: 'Jenkins-Slack-Token',
        username: 'JenkinsAutomation'
}
feature file
Feature: SNMP Firmware Upgrade Test
#demo #AARC-3428
Scenario: SNMP Firmware Upgrade Executed against the DUT
Given ModemUpgrade.sh Script Exists
When SNMP Firmware Upgrade Executed
Then I expect Result Pass
step file
from behave import *
import pathlib
from pathlib import Path
@given('ModemUpgrade.sh Script Exists')
def step_impl(context):
    """Report whether ModemUpgrade.sh is present in the working directory.

    NOTE: a missing script only goes unreported; it does not fail the step
    (the failing branch was commented out in the original).
    """
    STCFile = pathlib.Path('ModemUpgrade.sh')
    if STCFile.exists():
        print("SNMP Firmware Upgrade file exists")
@when('SNMP Firmware Upgrade Executed')
def step_impl(context):
    """Print whether the upgrade output file PROD-003-Out.txt exists.

    The step only reports; it does not fail when the file is missing.
    """
    path_to_file = 'PROD-003-Out.txt'
    path = Path(path_to_file)
    if path.is_file():
        print(f'Output file {path_to_file} exists')
    else:
        print(f'Output file {path_to_file} does not exists')
@then('I expect Result Pass')
def step_impl(context):
    """Fail unless PROD-003-Out.txt reports 'Upgrade Status: Passed'.

    The match is case-insensitive and every line of the file is checked
    (the earlier loop stopped after the first line whatever it contained).
    """
    with open("PROD-003-Out.txt") as FwUpgradeResults:
        Result = any(
            'Upgrade Status: Passed'.lower() in line.strip().lower()
            for line in FwUpgradeResults
        )
    if not Result:
        print("Error: Upgrade Failed")
    # Assert on the result itself rather than on context.failed, so the
    # failure reason is explicit in the behave report.
    assert Result, "'Upgrade Status: Passed' not found in PROD-003-Out.txt"
The suggestion of using || /usr/bin/true appears to have worked for the above mentioned code. Now I have a second instance where my Python test is throwing an exception when the DUT fails DHCP bind
def wait_for_dhcp_bind():
    """Run the ``Dhcpv4BindWait`` command on ``project`` via ``stc.perform``.

    Raises:
        Exception: with the message "DHCP Bind Failed" when the call raises;
            the underlying error is attached as ``__cause__`` so the
            traceback still shows why the bind failed.
    """
    try:
        stc.perform("Dhcpv4BindWait", objectlist=project)
    except Exception as err:
        raise Exception("DHCP Bind Failed") from err
I attempted to add the same after the Python script but the Jenkins build fails without the Xray test getting updated with a failure.
Here is what this looks like in the Jenkinsfile
echo "Starting Speed Test"
// def ModemMac = sh(returnStdout: true, script: './ModemUpgrade.sh -i ${modemIP} -f mac')
sh """
export STC_PRIVATE_INSTALL_DIR=${STC_INSTALL}
cd ${env.WORKSPACE_LOCAL}
/var/lib/jenkins/.pyenv/shims/python SpeedTest.py -d $dsp -u $usp -i $iterations -x $imix -f $frames -m $ModemMac || /usr/bin/true
/var/lib/jenkins/.pyenv/shims/behave -f cucumber -o storetarget-bdd/reporting/cucumber.json --junit --format=json -o target/behave.json --junit ./features/speedtest.feature || /usr/bin/true
"""
Your case should be easy to fix. The behave utility returns exit code 1 if any test fails.
Just add || /usr/bin/true to the end of your behave command (please check the path of the "true" command on your system).
This makes the command always return success, even when behave reports failures.
So your overall command should be something like:
/var/lib/jenkins/.pyenv/shims/behave -f cucumber -o storetarget-bdd/reporting/cucumber.json --junit --format=json -o target/behave.json --junit ./features/PROD-003.feature || /usr/bin/true

how to use popen with command line arguments contains single quote and double quote?

I want to run following jq command with subprocess.Popen() in python3.
$ jq 'INDEX(.images[]; .id) as $imgs | {
"filename_with_label":[
.annotations[]
| select(.attributes.type=="letter" )
| $imgs[.image_id] + {label:.text}
| {id:.id} + {filename:.file_name} + {label:.label}
]
}' image_data_annotation.json > image_data_annotation_with_label.json
Note that first command line argument contains dot, dollar sign, double quotes within single quote.
FYI, jq is JSON processor utility for processing json files.
I wrote following python3 script for automating JSON file processing with jq utility.
#!python3
# file name: letter_image_tool.py
import os, subprocess
"""
command line example to automate
$ jq 'INDEX(.images[]; .id) as $imgs | {
"filename_with_label":[
.annotations[]
| select(.attributes.type=="letter" )
| $imgs[.image_id] + {label:.text}
| {id:.id} + {filename:.file_name} + {label:.label}
]
}' image_data_annotation.json > image_data_annotation_with_label.json
"""
# define first command line argument
jq_filter='\'INDEX(.images[]; .id) as $imgs | { "filename_with_label" : [ .annotations[] | select(.attributes.type=="letter" ) | $imgs[.image_id] + {label:.text} | {id:.id} + {filename:.file_name} + {label:.label} ] }\''
input_json_files= [ "image_data_annotation.json"]
output_json_files= []
for input_json in input_json_files:
print("Processing %s" %(input_json))
filename, ext = os.path.splitext(input_json)
output_json = filename + "_with_label" + ext
output_json_files.append(output_json)
print("output file is : %s" %(output_json))
#jq_command ='jq' + " " + jq_filter, input_json + ' > ' + output_json
jq_command =['jq', jq_filter, input_json + ' > ' + output_json]
print(jq_command)
subprocess.Popen(jq_command, shell=True)
Running the above Python script in bash results in the following:
$ ./letter_image_tool.py
Processing image_data_annotation.json
output file is : image_data_annotation_with_label.json
['jq', '\'INDEX(.images[]; .id) as $imgs | { "filename_with_label" : [ .annotations[] | select(.attributes.type=="letter" ) | $imgs[.image_id] + {label:.text} | {id:.id} + {filename:.file_name} + {label:.label} ] }\'', 'image_data_annotation.json > image_data_annotation_with_label.json']
jq - commandline JSON processor [version 1.6-124-gccc79e5-dirty]
Usage: jq [options] <jq filter> [file...]
jq [options] --args <jq filter> [strings...]
jq [options] --jsonargs <jq filter> [JSON_TEXTS...]
jq is a tool for processing JSON inputs, applying the given filter to
its JSON text inputs and producing the filter's results as JSON on
standard output.
The simplest filter is ., which copies jq's input to its output
unmodified (except for formatting, but note that IEEE754 is used
for number representation internally, with all that that implies).
For more advanced filters see the jq(1) manpage ("man jq")
and/or https://stedolan.github.io/jq
Example:
$ echo '{"foo": 0}' | jq .
{
"foo": 0
}
For a listing of options, use jq --help.
It does not handle the first argument of jq utility:
'INDEX(.images[]; .id) as $imgs | {
"filename_with_label":[
.annotations[]
| select(.attributes.type=="letter" )
| $imgs[.image_id] + {label:.text}
| {id:.id} + {filename:.file_name} + {label:.label}
]
}'
The first argument should be enclosed in single quotes as in the above snippet, but my script does not handle it.
I think the main problems are related to the dot, dollar sign, single quote and double quote used in the first command line argument (jq_filter in the above python script). But I don't know how to treat this kind of complex meta character related to bash.
What should I do to solve above problems?
Thanks for your kind reading.
Update with my solution
With triple quotes for the jq_filter definition, and a space-separated join, as follows:
#!python3
# file name: letter_image_tool.py
import os, subprocess
"""
command line example to automate
$ jq 'INDEX(.images[]; .id) as $imgs | {
"filename_with_label":[
.annotations[]
| select(.attributes.type=="letter" )
| $imgs[.image_id] + {label:.text}
| {id:.id} + {filename:.file_name} + {label:.label}
]
}' image_data_annotation.json > image_data_annotation_with_label.json
"""
# define first command line argument with triple quotes
jq_filter=""" 'INDEX(.images[]; .id) as $imgs | {
"filename_with_label" : [
.annotations[]
| select(.attributes.type=="letter" )
| $imgs[.image_id] + {label:.text}
| {id:.id} + {filename:.file_name} + {label:.label} ] } ' """
input_json_files= [ "image_data_annotation.json"]
output_json_files= []
# For every input file, derive "<name>_with_label<ext>" and run jq through
# the shell, redirecting its output into that file.
for input_json in input_json_files:
    print("Processing %s" % (input_json))
    filename, ext = os.path.splitext(input_json)
    output_json = filename + "_with_label" + ext
    output_json_files.append(output_json)
    print("output file is : %s" % (output_json))
    # jq command composed with space separated join.
    # str.join is a method and has to be *called* with the list;
    # ' '.join[...] subscripts the method object and raises TypeError.
    jq_command = ' '.join(['jq', jq_filter, input_json, ' > ', output_json])
    print(jq_command)
    # shell keyword argument should be set True
    subprocess.Popen(jq_command, shell=True)
With triple double quotes, jq_filter is more readable as a multi-line definition than as a single-line definition.
The reason you need single quotes is to prevent the shell from doing any expansion of your argument. This is a problem, only when using shell=True. If this is not set, the shell will never touch your arguments and there is no need to "protect" them.
However, the shell is also responsible for the stdout redirect (i.e. [... '>', output_json]). Not using the shell, requires that the redirect is handled in the Python code instead. That, however, is as simple as adding the argument stdout=... to Popen.
All-in-all this means that your code can be rewritten as
import os
import subprocess
# Still define first command line argument with triple quotes for readability
# Note that there are no single quotes though
jq_filter = """INDEX(.images[]; .id) as $imgs | {
"filename_with_label" : [
.annotations[]
| select(.attributes.type=="letter" )
| $imgs[.image_id] + {label:.text}
| {id:.id} + {filename:.file_name} + {label:.label} ] }"""
input_json_files = ["image_data_annotation.json"]
output_json_files = []
# For every input file, derive "<name>_with_label<ext>" and write jq's
# output for that input into it.
for input_json in input_json_files:
    print("Processing %s" % (input_json))
    filename, ext = os.path.splitext(input_json)
    output_json = filename + "_with_label" + ext
    output_json_files.append(output_json)
    print("output file is : %s" % (output_json))
    # Keep command as list, since this is what we need when NOT using shell=True
    # Note also that the redirect and the output file are not parts of the argument list
    jq_command = ['jq', jq_filter, input_json]
    # shell keyword argument should NOT be set True
    # Instead redirect stdout to an out_file
    # (We must open the file for writing before redirecting)
    with open(output_json, "w") as out_file:
        # run() waits for jq to exit, and check=True raises
        # CalledProcessError on a non-zero status. A bare Popen returns
        # immediately and its exit status was never looked at.
        subprocess.run(jq_command, stdout=out_file, check=True)
Generally it is recommended to not use shell=True anyway, since that opens up another vector of attack against the code, since an injection attack can give full access to the shell. Also, another small benefit with not using the shell, is that it will reduce the number of created subprocesses, since no extra shell process is needed.

Python with grep/sed/awk command

I am new to Python and trying to figure out how to get the port number from /etc/services if I give the port name.
/etc/services contains following value
DB2_test 60000/tcp
DB2_test_1 60001/tcp
DB2_test_2 60002/tcp
DB2_test_3 60003/tcp
DB2_test_4 60004/tcp
DB2_test_END 60005/tcp
The command
db2port=os.popen("db2 get dbm cfg | grep -i Service | awk '{{print $6}}'").read()
print(db2port)
returns DB2_test
The below command does not work. I want to just see the value of DB2_test, which is 60000:
getnum = "cat /etc/services | sed -n '/\{db2port}\s/p' | awk '{print $2}' | sed 's/\/tcp$//'"
print(getnum}
No need to invoke awk, sed etc. A pure Python solution would be:
# Print the port number registered for the DB2_test service in /etc/services.
# The with-block closes the file; iterating the handle reads it line by line.
with open("/etc/services") as services_file:
    for line in services_file:
        # Drop any trailing "# comment" before splitting into fields.
        parts = line.split('#', 1)[0].split()
        # Need at least "<name> <port>/<protocol>" to look at parts[1].
        if len(parts) >= 2 and parts[0] == 'DB2_test':
            port, protocol = parts[1].split('/')
            print(port)
Assuming the variable services contains the text from your /etc/services.
# Map each service name to its integer port, e.g. port_map["DB2_test"] == 60000.
# Blank lines, comment-only lines and extra alias fields are tolerated:
# only the first two fields ("<name> <port>/<protocol>") are used.
port_map = {
    fields[0]: int(fields[1].split('/')[0])
    for fields in (
        line.split('#', 1)[0].split() for line in services.splitlines()
    )
    if len(fields) >= 2
}
Now you have a map from the service's name to its port, so that port_map["DB2_test"] == 60000, for example.

Need a help on fetching value based on a key from a config file

I have a file containing similar data
[xxx]
name = xxx
address = bangalore
[yyy]
name = yyy
address = sjc
Please help me write a regex that fetches the address/name value for a given section (the inputs are the section, xxx or yyy, and the key, address or name).
You can do something like this with awk if your file is just like that (i.e., the name is the same as the section and it is before the address):
$ awk -v nm='yyy' -F ' *= *' '$1=="name" && $2==nm{infi=1; next}
$1=="address" && infi {print $2; infi=0}' file
sjc
Or, better still you can get the section and then fetch the key, value as they occur and print them and then exit:
$ awk -v sec='yyy' -v key='address' '
BEGIN{
FS=" *= *"
pat=sprintf("^\\[%s\\]", sec)}
$0 ~ pat {secin=$1; next}
NF==2 && $1==key && secin ~ pat {print $2; exit}' file
sjc
If you want to gather all sections with their key/value pairs, you can do (with gawk):
$ gawk 'BEGIN{FS=" *= *"}
/^\[[^\]]+\]/ && NF==1 {sec=$1; next}
NF==2 {d[sec][$1]=$2}
END{ for (k in d){
printf "%s: ",k
for (v in d[k])
printf "\t%s = %s\n", v, d[k][v]
}
}' file
[xxx]: address = bangalore
name = xxx
[yyy]: address = sjc
name = yyy
Config or .ini files can have quoting like csv, so it is best to use a full config file parser. You can use Perl or Python that have robust libraries for parsing .ini or config type files.
Python example:
#!/usr/bin/python
# The module is named ConfigParser on Python 2 and configparser on Python 3;
# import whichever exists so the example runs on both.
try:
    import configparser
except ImportError:
    import ConfigParser as configparser
config = configparser.ConfigParser()
# read() silently skips files it cannot open.
config.read("/tmp/file")
Then you can grab the sections, the items in each section, or a specific items in a specific section:
>>> config.sections()
['xxx', 'yyy']
>>> config.items("yyy")
[('name', 'yyy'), ('address', 'sjc')]
>>> config.get("xxx", "address")
'bangalore'
Regex to the rescue!
This approach splits the text into one element per section and then parses the key-value pairs of each. In the end, you can simply ask the resulting dictionary for, e.g., values['xxx'].
See a demo on ideone.com.
import re

string = """
[xxx]
name = xxx
address = bangalore
[yyy]
name = yyy
address = sjc
"""

# One match per "[section]" header plus everything up to the next header
# line (or the end of the text).
rx_item = re.compile(r'''
^\[(?P<name>[^][]*)\]
.*?
(?=^\[[^][]*\]$|\Z)
''', re.VERBOSE | re.MULTILINE | re.DOTALL)

# One match per "key = value" line.
rx_value = re.compile(r'^(?P<key>\w+)\s*=\s*(?P<value>.+)$', re.M)


def _collect_sections(text):
    """Return {section name: {key: value}} for every section in *text*."""
    sections = {}
    for section in rx_item.finditer(text):
        pairs = {}
        for pair in rx_value.finditer(section.group(0)):
            pairs[pair.group('key')] = pair.group('value')
        sections[section.group('name')] = pairs
    return sections


values = _collect_sections(string)
print(values)
# {'xxx': {'name': 'xxx', 'address': 'bangalore'}, 'yyy': {'name': 'yyy', 'address': 'sjc'}}
It's not clear if you're trying to search for the value inside the square brackets or the value of the "name" tag but here's a solution to one possible interpretation of your question:
$ cat tst.awk
BEGIN { FS=" *= *" }
!NF { next }
NF<2 { prt(); k=$0 }
{ map[$1] = $2 }
END { prt() }
function prt() { if (k=="["key"]") print map[tag]; delete map }
$ awk -v key='yyy' -v tag='address' -f tst.awk file
sjc
$ awk -v key='xxx' -v tag='address' -f tst.awk file
bangalore
$ awk -v key='xxx' -v tag='name' -f tst.awk file
xxx

Convert Outlook PST to json using libpst

I have an Outlook PST file, and I'd like to get a json of the emails, e.g. something like
{"emails": [
{"from": "alice@example.com",
"to": "bob@example.com",
"bcc": "eve@example.com",
"subject": "mitm",
"content": "be careful!"
}, ...]}
I've thought using readpst to convert to MH format and then scan it in a ruby/python/bash script, is there a better way?
Unfortunately the ruby-msg gem doesn't work on my PST files (and looks like it wasn't updated since 2014).
I found a way to do it in 2 stages, first convert to mbox and then to json:
# requires installing libpst
pst2json my.pst
# or you can specify a custom output dir and an outlook mail folder,
# e.g. Inbox, Sent, etc.
pst2json -o email/ -f Inbox my.pst
Where pst2json is my script and mbox2json is slightly modified from Mining the Social Web.
pst2json:
#!/usr/bin/env bash
# pst2json: unpack a PST file with readpst, then convert one mail folder
# of the result to JSON with mbox2json.

# Print the usage text and exit with status 1.
usage(){
    echo "usage: $(basename "$0") [-o <output-dir>] [-f <folder>] <pst-file>"
    echo "default output-dir: email/mbox-all/<pst-file>"
    echo "default folder: Inbox"
    exit 1
}

# command -v tests for the program without printing its path.
command -v readpst >/dev/null 2>&1 || { echo "Error: libpst not installed"; exit 1; }

folder=Inbox
# Start empty so values inherited from the environment cannot leak in.
out_dir=
pst_file=

while (( $# > 0 )); do
    # The pst file must be the last argument; anything after it is an error.
    [[ -n "$pst_file" ]] && usage
    case "$1" in
        -o)
            if [[ -n "$2" ]]; then
                out_dir="$2"
                shift 2
            else
                usage
            fi
            ;;
        -f)
            if [[ -n "$2" ]]; then
                folder="$2"
                shift 2
            else
                usage
            fi
            ;;
        *)
            pst_file="$1"
            shift
            ;;
    esac
done

# Without a pst file there is nothing to convert.
[[ -n "$pst_file" ]] || usage

# Quote the path so file names containing spaces survive word splitting.
default_out_dir="email/mbox-all/$(basename "$pst_file")"
out_dir=${out_dir:-"$default_out_dir"}
mkdir -p "$out_dir"
readpst -o "$out_dir" "$pst_file"
[[ -f "$out_dir/$folder" ]] || { echo "Error: folder $folder is missing or empty."; exit 1; }
res="$out_dir"/"$folder".json
mbox2json "$out_dir/$folder" "$res" && echo "Success: result saved to $res"
mbox2json (python 2.7):
# -*- coding: utf-8 -*-
import sys
import mailbox
import email
import quopri
import json
from BeautifulSoup import BeautifulSoup
MBOX = sys.argv[1]
OUT_FILE = sys.argv[2]
SKIP_HTML=True
def cleanContent(msg):
# Decode message from "quoted printable" format
msg = quopri.decodestring(msg)
# Strip out HTML tags, if any are present
soup = BeautifulSoup(msg)
return ''.join(soup.findAll(text=True))
def jsonifyMessage(msg):
    """Convert one e-mail message into a JSON-serialisable dict.

    Every header becomes a key holding its decoded value; the 'To', 'Cc'
    and 'Bcc' headers, when present, are split into lists of addresses.
    The non-multipart parts are collected under 'parts' as dicts with
    'contentType' and 'content' (text/html parts are skipped when
    SKIP_HTML is set).

    If walking the parts raises, the error is written to stderr and the
    dict built so far is returned.
    """
    json_msg = {'parts': []}
    for (k, v) in msg.items():
        json_msg[k] = v.decode('utf-8', 'ignore')
    # The To, CC, and Bcc fields, if present, could have multiple items
    # Note that not all of these fields are necessarily defined
    for k in ['To', 'Cc', 'Bcc']:
        if not json_msg.get(k):
            continue
        # The value was already decoded above. Decoding it a second time
        # makes Python 2 first encode it as ASCII, which fails on
        # non-ASCII addresses.
        json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '').replace(
            '\r', '').replace(' ', '').split(',')
    try:
        for part in msg.walk():
            if part.get_content_maintype() == 'multipart':
                continue
            content_type = part.get_content_type()
            if SKIP_HTML and content_type == 'text/html':
                continue
            content = part.get_payload(decode=False).decode('utf-8', 'ignore')
            json_msg['parts'].append({
                'contentType': content_type,
                'content': cleanContent(content),
            })
    except Exception as e:
        sys.stderr.write('Skipping message - error encountered (%s)\n' % (str(e), ))
    # Return after the try block instead of from `finally`: a return inside
    # `finally` also swallows KeyboardInterrupt and SystemExit.
    return json_msg
# There's a lot of data to process, so use a generator to do it. See http://wiki.python.org/moin/Generators
# Using a generator requires a trivial custom encoder be passed to json for serialization of objects
class Encoder(json.JSONEncoder):
def default(self, o):
return {'emails': list(o)}
# The generator itself...
def gen_json_msgs(mb):
    """Yield jsonifyMessage(msg) for each message that mb.next() returns.

    Iteration stops the first time mb.next() returns None.
    """
    current = mb.next()
    while current is not None:
        yield jsonifyMessage(current)
        current = mb.next()
# Stream every message of the mbox into OUT_FILE as {"emails": [...]}.
# The with-block closes both files, so the JSON is flushed to disk even if
# serialisation raises part-way through.
with open(MBOX, 'rb') as mbox_file, open(OUT_FILE, 'wb') as out_file:
    mbox = mailbox.UnixMailbox(mbox_file, email.message_from_file)
    json.dump(gen_json_msgs(mbox), out_file, indent=4, cls=Encoder)
Now, it's possible to process the file easily. E.g. to get just the contents of the emails:
jq '.emails[] | .parts[] | .content' < out/Inbox.json

Categories