Find, decode and replace all base64 values in text file - python

I have a SQL dump file that contains text with html links like:
<a href="http://blahblah.org/kb/getattachment.php?data=NHxUb3Bjb25fZGF0YS1kb3dubG9hZF9ob3d0by5wZGY=">attached file</a>
I'd like to find, decode and replace the base64 part of the text in each of these links.
I've been trying to use Python with regular expressions and base64 to do the job. However, my regex skills are not up to the task.
I need to select any string that starts with
'getattachment.php?data='
and ends with
'"'
I then need to decode the part between 'data=' and '&quot' using base64.b64decode().
The result should look something like:
<a href="http://blahblah.org/kb/4/Topcon_data-download_howto.pdf">attached file</a>
I think the solution will look something like:
import re
import base64

with open('phpkb_articles.sql') as f:
    for line in f:
        re.sub(some_regex_expression_here, some_function_here_to_decode_base64, line)
Any ideas?
EDIT: Answer for anyone who's interested.
import re
import base64
import sys

def decode_base64(match):
    """
    Decode a base64 match into an ASCII file path.
    """
    # fix URL-escaped equals signs in some base64 strings
    base64_string = re.sub('%3D', '=', match.group(1))
    decoded_string = base64.b64decode(base64_string)
    # substitute '/' for '|' in the decoded path
    decoded_string = re.sub(r'\|', '/', decoded_string)
    # escape the spaces in file names
    decoded_string = re.sub(' ', '%20', decoded_string)
    # print 'assets/' + decoded_string + '&quot'  # print for debugging
    return 'assets/' + decoded_string + '&quot'

count = 0
pattern = r'getattachment\.php\?data=([^&]+?)&quot'

# Open the file and read it line by line
with open('phpkb_articles.sql') as f:
    for line in f:
        try:
            # globally substitute in the new file path
            edited_line = re.sub(pattern, decode_base64, line)
            # write the edited line to standard out
            sys.stdout.write(edited_line)
        except TypeError:
            # write the unedited line if decoding fails, to avoid corrupting it
            sys.stdout.write(line)
        count += 1

You already have it; you just need the small pieces:
pattern: r'data=([^&]+?)&quot' will match anything after data= and before &quot
>>> import re
>>> pat = r'data=([^&]+?)&quot'
>>> line = '<a href="http://blahblah.org/kb/getattachment.php?data=NHxUb3Bjb25fZGF0YS1kb3dubG9hZF9ob3d0by5wZGY=">attached file</a>'
>>> decodeString = re.search(pat, line).group(1)  # the b64 string is captured by grouping, so we only want group(1)
>>> decodeString
'NHxUb3Bjb25fZGF0YS1kb3dubG9hZF9ob3d0by5wZGY='
You can then use the str.replace() method as well as the base64.b64decode() method to finish the rest. I don't want to just write your code for you, but this should give you a good idea of where to go.
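To show how those pieces might fit together, here is a minimal sketch (the sample line is hypothetical and assumes the dump stores the closing quote as &quot;, which the asker's final pattern suggests; the decoded value follows from the base64 string in the question):
import re
import base64

pat = r'data=([^&]+?)&quot'
line = 'href="http://blahblah.org/kb/getattachment.php?data=NHxUb3Bjb25fZGF0YS1kb3dubG9hZF9ob3d0by5wZGY=&quot;'

match = re.search(pat, line)
if match:
    decoded = base64.b64decode(match.group(1))            # b'4|Topcon_data-download_howto.pdf'
    decoded = decoded.decode('ascii').replace('|', '/')   # '4/Topcon_data-download_howto.pdf'
    new_line = line.replace('getattachment.php?data=' + match.group(1), decoded)
    print(new_line)  # href="http://blahblah.org/kb/4/Topcon_data-download_howto.pdf&quot;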

Related

String from file to string utf-8 in python

So I am reading and manipulating a file with:
base_file = open(path+'/'+base_name, "r")
lines = base_file.readlines()
After this I search for the line that starts with "raw_data":
if re.match("\s{0,100}raw_data: ", line):
    split_line = line.split("raw_data:")
    print(split_line)
    raw_string = split_line[1]
One example of raw_data is:
raw_data: "&\276!\300\307 =\277\"O\271\277vH9?j?\345?#\243\264=\350\034\345\277\260\345\033\300\023\017(#z|\273\277L\}\277\210\\031\300\213\263z\277\302\241\033\300\000\207\323\277\247Oh>j\354\215#\364\305\201\276\361+\202#t:\304\277\344\231\243#\225k\002\300vw\262\277\362\220j\300\"(\337\276\354b8\300\230\347H\300\201\320\204\300S;N\300Z0G\300>j\210\000#\034\014\220#\231\330J#\223\025\236#\006\332\230\276\227\273\n\277\353#,#\202\205\215\277\340\356\022\300/\223\035\277\331\277\362\276a\350\013#)\353\276\277v6\316\277K\326\207\300`2)\300\004\014Q\300\340\267\271\300MV\305\300\327\010\207\300j\346o\300\377\260\216\300[\332g\300\336\266\003\300\320S\272?6\300Y#\356\250\034\300\367\277&\300\335Uq>o\010&\300r\277\252\300U\314\243\300\253d\377\300"
And raw_string will be
print(raw_data)
"&\276!\300\307
=\277\"O\271\277vH9?j?\345?#\243\264=\350\034\345\277\260\345\033\300\023\017(#z|\273\277L\}\277\210\\031\300\213\263z\277\302\241\033\300\000\207\323\277\247Oh>j\354\215#\364\305\201\276\361+\202#t:\304\277\344\231\243#\225k\002\300vw\262\277\362\220j\300\"(\337\276\354b8\300\230\347H\300\201\320\204\300S;N\300Z0G\300>j\210\000#\034\014\220#\231\330J#\223\025\236#\006\332\230\276\227\273\n\277\353#,#\202\205\215\277\340\356\022\300/\223\035\277\331\277\362\276a\350\013#)\353\276\277v6\316\277K\326\207\300`2)\300\004\014Q\300\340\267\271\300MV\305\300\327\010\207\300j\346o\300\377\260\216\300[\332g\300\336\266\003\300\320S\272?6\300Y#\356\250\034\300\367\277&\300\335Uq>o\010&\300r\277\252\300U\314\243\300\253d\377\300"
If I read this file, I get the characters one by one, even for the escape sequences.
So my question is how to transform this plain text into a UTF-8 string so that reading \300 gives one character and not 4 characters.
I tried to pass encoding='utf-8' to the open() call, but it does not work.
I have made the same example passing raw_data as a variable and it works properly.
RAW_DATA = "&\276!\300\307 =\277\"O\271\277vH9?j?\345?#\243\264=\350\034\345\277\260\345\033\300\023\017(#z|\273\277L\\}\277\210\\\031\300\213\263z\277\302\241\033\300\000\207\323\277\247Oh>j\354\215#\364\305\201\276\361+\202#t:\304\277\344\231\243#\225k\002\300vw\262\277\362\220j\300\"(\337\276\354b8\300\230\347H\300\201\320\204\300S;N\300Z0G\300<I>>j\210\000#\034\014\220#\231\330J#\223\025\236#\006\332\230\276\227\273\n\277\353#,#\202\205\215\277\340\356\022\300/\223\035\277\331\277\362\276a\350\013#)\353\276\277v6\316\277K\326\207\300`2)\300\004\014Q\300\340\267\271\300MV\305\300\327\010\207\300j\346o\300\377\260\216\300[\332g\300\336\266\003\300\320S\272?6\300Y#\356\250\034\300\367\277&\300\335Uq>o\010&\300r\277\252\300U\314\243\300\253d\377\300"
print(f"Qnt -> {len(RAW_DATA)}") # Qnt -> 256
print(type(RAW_DATA))
at = 0
total = 0
while at < len(RAW_DATA):
fin = at+4
substrin = RAW_DATA[at:fin]
resu = FourString_float(substrin)
at = fin
For this example \300 is only one char.
Hope someone can help me.
The problem is that when the file is read from disk, the backslashes come in as literal \ characters, but in the example you've provided they are evaluated as part of the octal escapes that follow them, i.e. \276 is read as a single character.
If you run:
RAW_DATA = r"&\276!\300\307 =\277\"O\271\277vH9?j?\345?#\243\264=\350\034\345\277\260\345\033\300\023\017(#z|\273\277L\\}\277\210\\\031\300\213\263z\277\302\241\033\300\000\207\323\277\247Oh>j\354\215#\364\305\201\276\361+\202#t:\304\277\344\231\243#\225k\002\300vw\262\277\362\220j\300\"(\337\276\354b8\300\230\347H\300\201\320\204\300S;N\300Z0G\300<I>>j\210\000#\034\014\220#\231\330J#\223\025\236#\006\332\230\276\227\273\n\277\353#,#\202\205\215\277\340\356\022\300/\223\035\277\331\277\362\276a\350\013#)\353\276\277v6\316\277K\326\207\300`2)\300\004\014Q\300\340\267\271\300MV\305\300\327\010\207\300j\346o\300\377\260\216\300[\332g\300\336\266\003\300\320S\272?6\300Y#\356\250\034\300\367\277&\300\335Uq>o\010&\300r\277\252\300U\314\243\300\253d\377\300"
print(f"Qnt -> {len(RAW_DATA)}") # Qnt -> 256
print(type(RAW_DATA))
at = 0
total = 0
while at < len(RAW_DATA):
fin = at+4
substrin = RAW_DATA[at:fin]
resu = FourString_float(substrin)
at = fin
You should be getting the same error that you were getting originally. Notice that we are using a raw-string literal instead of a regular string literal. This ensures that the \ are kept as literal characters, just as they are when read from the file.
You need to evaluate RAW_DATA to force the \ escapes to be processed.
You can do something like RAW_DATA = eval(f'"{RAW_DATA}"') or
import ast
RAW_DATA = ast.literal_eval(f'"{RAW_DATA}"')
Note that the second option is a bit more secure than doing a straight eval, as you are limiting the scope of what can be executed.
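Tying this back to the file-reading loop in the question, a minimal sketch might look like the following (path and base_name are reused from the question's code, and it assumes the value in the file carries its surrounding double quotes, as in the example shown):
import ast
import re

with open(path + '/' + base_name, "r") as base_file:
    for line in base_file:
        if re.match(r"\s{0,100}raw_data: ", line):
            raw_string = line.split("raw_data:")[1].strip()   # still quoted, e.g. '"&\276!\300..."'
            decoded = ast.literal_eval(raw_string)            # now \276 etc. evaluate to single characters
            print(len(decoded))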

Searching for a word following a given pattern

I want to get the word 'MASTER_INACTIVE' from the string:
'p_esco_link->state = MASTER_INACTIVE; /*M-t10*/'
by searching for the pattern 'p_esco_link->state =' and taking the word that follows it.
I have to replace direct data accesses with API function calls. I tried some regular expressions in Python 3.6, but they do not work.
pattern = '(?<=\bp_esco_link->state =\W)\w+'

if __name__ == "__main__":
    syslogger.info(sys.argv)
    if version_info.major != 3:
        raise Exception('Only works on Python 3.x')
    with open(cFile, encoding='utf-8') as file_obj:
        lineNum = 0
        for line in file_obj:
            print(len(line))
            re_obj = re.compile(pattern)
            result = re.search(pattern, line)
            lineNum += 1
            # print(result)
            if result:
                print(str(lineNum) + ' ' + str(result.span()) + ' ' + result.group())
I expected the Python re module to find the position of 'MASTER_INACTIVE' and put it into result.group().
Instead, the re module finds nothing.
Your pattern is fine.
Just change the line below in your code:
pattern = r'(?<=\bp_esco_link->state =\W)\w+' # add r prefix
Here is a working sample; I set line to your string.
import re
pattern = r'(?<=\bp_esco_link->state =\W)\w+'
line = 'p_esco_link->state = MASTER_INACTIVE; /*M-t10*/'
re_obj = re.compile(pattern)
result = re.search(pattern, line)
print(result.span()) # (21, 36)
print(result.group()) # 'MASTER_INACTIVE'
Check the questions below to better understand the 'r' prefix:
Python regex - r prefix
What exactly do “u” and “r” string flags do, and what are raw string literals?
What does preceding a string literal with “r” mean? [duplicate]
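The reason the prefix matters here is that in a regular string literal '\b' is the backspace character (\x08), so the lookbehind is asked to match a backspace that never occurs in the line; with the r prefix the regex engine receives the backslash and the b and treats them as a word boundary. A quick check using the line from the question:
import re

line = 'p_esco_link->state = MASTER_INACTIVE; /*M-t10*/'

# Without the r prefix, '\b' is the backspace character, not a word boundary.
print(re.search('\bp_esco_link', line))    # None
print(re.search(r'\bp_esco_link', line))   # matches 'p_esco_link' at span (0, 11)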

Replace decimals within a special character in file

I am currently trying to read in a file and strip the decimal part of every number that sits between thorn (þ) characters, i.e.
þ219.91þ
þ122.1919þ
þ467.426þ
þ104.351þ
þ104.0443þ
will become
þ219þ
þ122þ
þ467þ
þ104þ
þ104þ
The gist: I have a regex replacement that works in Notepad++ (below) and am trying to replicate it in Python (code below, which is not working). Any suggestions?
In Notepad++:
Find: (\xFE\d+)\.\d+(\xFE)
Replace: $1$2
Python:
for line in file:
    line = re.sub("(\xFE\d+)\.\d+(\xFE)", "\xFE\d+\xFE", line)
I don't think it's necessary to use \xFE; this might simply work:
import re

regex = r"(þ\d+)\.\d+(þ)"
test_str = ("þ219.91þ\n"
            "þ122.1919þ\n"
            "þ467.426þ\n"
            "þ104.351þ\n"
            "þ104.0443þ")
subst = "\\1\\2"

# You can manually specify the number of replacements by changing the 4th argument
result = re.sub(regex, subst, test_str, 0, re.MULTILINE)

if result:
    print(result)
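Applied to a file read line by line, as in the question's loop, the same substitution might look like this minimal sketch (the input and output file names are hypothetical, and the file is assumed to be opened with an encoding that contains the þ character, e.g. utf-8):
import re

regex = r"(þ\d+)\.\d+(þ)"

with open("input.txt", encoding="utf-8") as src, open("output.txt", "w", encoding="utf-8") as dst:
    for line in src:
        dst.write(re.sub(regex, r"\1\2", line))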
You're not replacing decimals: you're truncating the values. Will the mathematical treatment do for you? This assumes that all lines are of the format you show.
for line in file:
    _, val, _ = line.split('þ')  # leading empty string, value, trailing remainder
    line = 'þ' + str(int(float(val))) + 'þ'
Note that you could reduce this a little with a single line in the loop:
line = 'þ' + str(int(float(line.split('þ')[1]))) + 'þ'
You could use a one-liner such as:
f = ["þ219.91þ", "þ122.1919þ", "þ467.426þ", "þ104.351þ", "þ104.0443þ"]
print(["þ{}þ".format(int(float(l.strip("þ")))) for l in f])
Result:
['þ219þ', 'þ122þ', 'þ467þ', 'þ104þ', 'þ104þ']

How to find parenthesis bound strings in python

I'm learning Python and wanted to automate one of my assignments in a cybersecurity class.
I'm trying to figure out how I would look for the contents of a file that are bound by a set of parentheses. The contents of the (.txt) file look like:
cow.jpg : jphide[v5](asdfl;kj88876)
fish.jpg : jphide[v5](65498ghjk;0-)
snake.jpg : jphide[v5](poi098*/8!##)
test_practice_0707.jpg : jphide[v5](sJ*=tT#&Ve!2)
test_practice_0101.jpg : jphide[v5](nKFdFX+C!:V9)
test_practice_0808.jpg : jphide[v5](!~rFX3FXszx6)
test_practice_0202.jpg : jphide[v5](X&aC$|mg!wC2)
test_practice_0505.jpg : jphide[v5](pe8f%yC$V6Z3)
dog.jpg : negative`
And here is my code so far:
import sys, os, subprocess, glob, shutil

# Finding the .jpg files that will be copied.
sourcepath = os.getcwd() + '\\imgs\\'
destpath = 'stegdetect'
rawjpg = glob.glob(sourcepath + '*.jpg')

# Copying the said .jpg files into the directory named in destpath.
for filename in rawjpg:
    shutil.copy(filename, destpath)

# Ask the user which password file they want to use.
passwords = raw_input("Enter your password file with the .txt extension:")
shutil.copy(passwords, 'stegdetect')

# Navigating to stegdetect. Feel like this could be abstracted.
os.chdir('stegdetect')

# Preparing the arguments, then using subprocess to run stegbreak.
args = "stegbreak.exe -r rules.ini -f " + passwords + " -t p *.jpg"

# Use open to create the output file, then write the results to it.
with open('cracks.txt', 'w') as f:  # opens cracks.txt and prepares to write
    subprocess.call(args, stdout=f)

# Processing what's in the new file.
f = open('cracks.txt')
If it should just be bound by ( and ), you can use the following regex, which requires a starting ( and a closing ) and allows numbers and characters between them. You can add any other symbols you want to include to the character class.
[\(][a-z A-Z 0-9]*[\)]
[\(] - matches the opening bracket
[a-z A-Z 0-9]* - matches the text inside the brackets
[\)] - matches the closing bracket
So for the input sdfsdfdsf(sdfdsfsdf)sdfsdfsdf, the output will be (sdfdsfsdf).
Test this regex here: https://regex101.com/
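As a quick sketch of that regex in Python, using the sample input from this answer (note that the character class would need extra symbols such as ; * ! # to match the passwords in the question's file):
import re

pattern = r'[\(][a-z A-Z 0-9]*[\)]'
print(re.search(pattern, 'sdfsdfdsf(sdfdsfsdf)sdfsdfsdf').group())  # (sdfdsfsdf)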
I'm learning Python
If you are learning, you should consider alternative implementations, not only regexes.
To iterate line by line over a text file, you just open the file and loop over the file handle:
with open('file.txt') as f:
    for line in f:
        do_something(line)
Each line is a string with the line contents, including the end-of-line char '\n'. To find the start index of a specific substring in a string you can use str.find():
>>> A = "hello (world)"
>>> A.find('(')
6
>>> A.find(')')
12
To get a substring from the string you can use the slice notation in the form:
>>> A[6:12]
'(world'
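Putting find() and slicing together over the question's cracks.txt, a minimal sketch might be:
with open('cracks.txt') as f:
    for line in f:
        start = line.find('(')
        end = line.find(')')
        if start != -1 and end != -1:
            print(line[start + 1:end])  # contents between the parentheses, e.g. asdfl;kj88876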
You should use regular expressions, which are implemented in the Python re module.
A simple regex like \(.*\) could match your "parenthesis string",
but it would be better with a group, \((.*)\), which lets you get only the content inside the parentheses.
import re
test_string = """cow.jpg : jphide[v5](asdfl;kj88876)
fish.jpg : jphide[v5](65498ghjk;0-)
snake.jpg : jphide[v5](poi098*/8!##)
test_practice_0707.jpg : jphide[v5](sJ*=tT#&Ve!2)
test_practice_0101.jpg : jphide[v5](nKFdFX+C!:V9)
test_practice_0808.jpg : jphide[v5](!~rFX3FXszx6)
test_practice_0202.jpg : jphide[v5](X&aC$|mg!wC2)
test_practice_0505.jpg : jphide[v5](pe8f%yC$V6Z3)
dog.jpg : negative`"""
REGEX = re.compile(r'\((.*)\)', re.MULTILINE)
print(REGEX.findall(test_string))
# ['asdfl;kj88876', '65498ghjk;0-', 'poi098*/8!##', 'sJ*=tT#&Ve!2', 'nKFdFX+C!:V9', '!~rFX3FXszx6', 'X&aC$|mg!wC2', 'pe8f%yC$V6Z3']
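If you also want to keep track of which file each password belongs to, a slightly extended sketch (the two-group pattern is an assumption, not part of the answer above) can capture both; it reuses test_string from the block above:
import re

PAIR_REGEX = re.compile(r'^(\S+)\s*:\s*jphide\[v5\]\((.*)\)', re.MULTILINE)
print(PAIR_REGEX.findall(test_string))
# [('cow.jpg', 'asdfl;kj88876'), ('fish.jpg', '65498ghjk;0-'), ...]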

how to manipulate SREC file

I have an S19 file looking something like below:
S0030000FC
S30D0003C0000F0000000000000020
S3FD00000000782EFF1FB58E00003D2B00003D2B00003D2B00003D2B00003D2B00003D
S3ED000000F83D2B00003D2B00003D2B00003D2B00003D2B00003D2B00003D2B00003D
S31500000400FFFFFFFFFFFFFFFFFFFFFFFF7EF9FFFF7D
S3FD0000041010B5DFF828000468012147F22C10C4F20300016047F22010C4F2030000
S70500008EB4B8
I want to separate the first two characters, then the next two characters, and so on... I want it to look like the following (the last two characters of each line are also separated):
S0, 03, 0000, FC
S3, 0D, 0003C000, 0F00000000000000, 20
S3, FD, 00000000, 782EFF1FB58E00003D2B00003D2B00003D2B00003D2B00003D2B0000, 3D
S3, ED, 000000F8, 3D2B00003D2B00003D2B00003D2B00003D2B00003D2B00003D2B0000, 3D
S3, 15, 00000400, FFFFFFFFFFFFFFFFFFFFFFFF7EF9FFFF, 7D
S3, FD, 00000410, 10B5DFF828000468012147F22C10C4F20300016047F22010C4F20300, 00
S7, 05, 00008EB4, B8
How can I do this in Python?
I have something like this:
#!/usr/bin/python
import string,os,sys,re,fileinput
print "hi"
inputfile = "k60.S19"
outputfile = "k60_out.S19"
# open the source file and read it
fh = file(inputfile, 'r')
subject = fh.read()
fh.close()
# create the pattern object. Note the "r". In case you're unfamiliar with Python
# this is to set the string as raw so we don't have to escape our escape characters
pattern2 = re.compile(r'S3')
pattern3 = re.compile(r'S7')
pattern1 = re.compile(r'S0')
# do the replace
result1 = pattern1.sub("S0, ", subject)
result2 = pattern2.sub("S3, ", subject)
result3 = pattern3.sub("S7, ", subject)
# write the file
f_out = file(outputfile, 'w')
f_out.write(result1)
f_out.write(result2)
f_out.write(result3)
f_out.close()
#EoF
but it is not working the way I'd like! Can someone help me come up with the proper regular expression for this?
Try the bincopy package; it may be what you need.
bincopy - Interpret strings as packed binary data.
Mangling of various file formats that convey binary information (Motorola S-Record, Intel HEX and binary files).
import bincopy
f = bincopy.BinFile()
f.add_srec_file("path/to/your/s19/flie.s19")
f.as_binary() # print s19 as binary
or you can easily use open() for a file:
with open("path/to/your/s19/flie.s19") as s19:
for line in s19:
type = line[0:2]
count = line[2:4]
adress = line[4:12]
data = line[12:-2]
crc = line[-2:]
print type + ", "+ count + ", " + adress + ", " + data + ", " + crc + "\n"
hope it helps.
Motorola S-record file format
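Building on that slicing idea, here is a sketch that also accounts for the address width varying by record type; the width table comes from the Motorola S-record format rather than from this answer, and the file names are the ones from the question:
# Address width in hex digits for each record type (per the Motorola S-record format).
ADDR_WIDTH = {'S0': 4, 'S1': 4, 'S5': 4, 'S9': 4, 'S2': 6, 'S8': 6, 'S3': 8, 'S7': 8}

with open('k60.S19') as src, open('k60_out.S19', 'w') as dst:
    for line in src:
        line = line.strip()
        if not line:
            continue
        rtype = line[0:2]
        count = line[2:4]
        width = ADDR_WIDTH.get(rtype, 8)
        address = line[4:4 + width]
        data = line[4 + width:-2]
        crc = line[-2:]
        # skip empty fields (e.g. S0 and S7 records with no data payload)
        dst.write(', '.join(field for field in (rtype, count, address, data, crc) if field) + '\n')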
You can do it using a callback function as replacement with re.sub:
#!/usr/bin/python
import re
data = r'''S0030000FC
S30D0003C0000F0000000000000020
S3FD00000000782EFF1FB58E00003D2B00003D2B00003D2B00003D2B00003D2B00003D
S3ED000000F83D2B00003D2B00003D2B00003D2B00003D2B00003D2B00003D2B00003D
S31500000400FFFFFFFFFFFFFFFFFFFFFFFF7EF9FFFF7D
S3FD0000041010B5DFF828000468012147F22C10C4F20300016047F22010C4F2030000
S70500008EB4B8'''
pattern = re.compile(r'^(..)(..)((?:.{4}){1,2})(.*)(?=..)', re.M)
def repl(m):
    repstr = ''
    for g in m.groups():
        if g:
            repstr += g + ', '
    return repstr
print re.sub(pattern, repl, data)
However, as Mark Setchell notes, there is probably a nicer way to do it with slicing.
I know you are thinking Python and regexes, but this was made for awk and the following will maybe help you work out the way to do it using slicing:
awk '{r=length($0);print substr($0,1,2),substr($0,3,2),substr($0,5,8),substr($0,13,r-14),substr($0,r-1)}' OFS=, k60.s19
That says "get the length of the line in variable r, then print the first two characters, the next two characters, the next 8 characters and so on... and use a comma as the field separator".
EDITED
Here are a few more hints to get you started...
If you want to avoid printing line 1, you can do:
awk 'FNR==1{next} ...rest of awk script above ... '
If you want to only process lines longer than 40 characters, you can do:
awk 'length($0)>40 {print}' yourfile
If you only want to process lines where the second field is "xx", you can do:
awk '$2 ~ "xx" {print}' yourfile
