Create NTFS junction point in Python

Is there a way to create an NTFS junction point in Python? I know I can call the junction utility, but it would be better not to rely on external tools.

Since Python 3.5 there's a CreateJunction function in the _winapi module (a private, undocumented CPython module, so it may change without notice).
import _winapi
_winapi.CreateJunction(source, target)
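A minimal usage sketch (the paths are hypothetical; in CPython's own tests the existing directory is passed first and the junction to create second):
import os
import _winapi

target = r'C:\Temp\real_dir'    # existing directory (hypothetical path)
junction = r'C:\Temp\junction'  # junction to create (hypothetical path)

os.makedirs(target, exist_ok=True)
_winapi.CreateJunction(target, junction)

print(os.path.samefile(target, junction))  # True: both resolve to the same directory
os.rmdir(junction)  # removes the junction itself, not the target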

I answered this in a similar question, so I'll copy my answer to that below. Since writing that answer, I ended up writing a Python-only (if you can call a module that uses ctypes Python-only) module for creating, reading, and checking junctions, which can be found in this folder. Hope that helps.
Also, unlike the answer that uses the CreateSymbolicLinkA API, the linked implementation should work on any Windows version that supports junctions. CreateSymbolicLinkA is only supported on Vista+.
Answer:
python ntfslink extension
Or if you want to use pywin32, you can use the previously stated method, and to read, use:
from win32file import *
from winioctlcon import FSCTL_GET_REPARSE_POINT

__all__ = ['islink', 'readlink']

# Win32file doesn't seem to have this attribute.
FILE_ATTRIBUTE_REPARSE_POINT = 1024
# To make things easier.
REPARSE_FOLDER = (FILE_ATTRIBUTE_DIRECTORY | FILE_ATTRIBUTE_REPARSE_POINT)

# For the parse_reparse_buffer function
SYMBOLIC_LINK = 'symbolic'
MOUNTPOINT = 'mountpoint'
GENERIC = 'generic'


def islink(fpath):
    """ Windows islink implementation. """
    # Both bits must be set; '==' binds tighter than '&' in Python, so the
    # parentheses are required (a bare '&' test would match any directory).
    if (GetFileAttributes(fpath) & REPARSE_FOLDER) == REPARSE_FOLDER:
        return True
    return False


def parse_reparse_buffer(original, reparse_type=SYMBOLIC_LINK):
    """ Implementing the below in Python:

    typedef struct _REPARSE_DATA_BUFFER {
        ULONG  ReparseTag;
        USHORT ReparseDataLength;
        USHORT Reserved;
        union {
            struct {
                USHORT SubstituteNameOffset;
                USHORT SubstituteNameLength;
                USHORT PrintNameOffset;
                USHORT PrintNameLength;
                ULONG  Flags;
                WCHAR  PathBuffer[1];
            } SymbolicLinkReparseBuffer;
            struct {
                USHORT SubstituteNameOffset;
                USHORT SubstituteNameLength;
                USHORT PrintNameOffset;
                USHORT PrintNameLength;
                WCHAR  PathBuffer[1];
            } MountPointReparseBuffer;
            struct {
                UCHAR DataBuffer[1];
            } GenericReparseBuffer;
        } DUMMYUNIONNAME;
    } REPARSE_DATA_BUFFER, *PREPARSE_DATA_BUFFER;
    """
    # Size of our data types
    SZULONG = 4   # sizeof(ULONG)
    SZUSHORT = 2  # sizeof(USHORT)

    # Our structure.
    # Probably a better way to iterate a dictionary in a particular order,
    # but I was in a hurry, unfortunately, so I used pkeys.
    buffer = {
        'tag': SZULONG,
        'data_length': SZUSHORT,
        'reserved': SZUSHORT,
        SYMBOLIC_LINK: {
            'substitute_name_offset': SZUSHORT,
            'substitute_name_length': SZUSHORT,
            'print_name_offset': SZUSHORT,
            'print_name_length': SZUSHORT,
            'flags': SZULONG,
            'buffer': u'',
            'pkeys': [
                'substitute_name_offset',
                'substitute_name_length',
                'print_name_offset',
                'print_name_length',
                'flags',
            ]
        },
        MOUNTPOINT: {
            'substitute_name_offset': SZUSHORT,
            'substitute_name_length': SZUSHORT,
            'print_name_offset': SZUSHORT,
            'print_name_length': SZUSHORT,
            'buffer': u'',
            'pkeys': [
                'substitute_name_offset',
                'substitute_name_length',
                'print_name_offset',
                'print_name_length',
            ]
        },
        GENERIC: {
            'pkeys': [],
            'buffer': ''
        }
    }

    # Header stuff (note the slice bounds: each field starts where the
    # previous one ends).
    buffer['tag'] = original[:SZULONG]
    buffer['data_length'] = original[SZULONG:SZULONG + SZUSHORT]
    buffer['reserved'] = original[SZULONG + SZUSHORT:SZULONG + 2 * SZUSHORT]
    original = original[8:]

    # Parsing
    k = reparse_type
    for c in buffer[k]['pkeys']:
        if type(buffer[k][c]) == int:
            sz = buffer[k][c]
            bytes = original[:sz]
            buffer[k][c] = 0
            for i, b in enumerate(bytes):
                # Assemble the little-endian integer byte by byte.
                buffer[k][c] |= ord(b) << (8 * i)
            original = original[sz:]

    # Using the offsets and lengths grabbed, we'll set the buffer.
    buffer[k]['buffer'] = original
    return buffer


def readlink(fpath):
    """ Windows readlink implementation. """
    # This wouldn't return true if the file didn't exist, as far as I know.
    if not islink(fpath):
        return None

    # Open the file correctly depending on the string type.
    if type(fpath) == unicode:
        handle = CreateFileW(fpath, GENERIC_READ, 0, None, OPEN_EXISTING,
                             FILE_FLAG_OPEN_REPARSE_POINT, 0)
    else:
        handle = CreateFile(fpath, GENERIC_READ, 0, None, OPEN_EXISTING,
                            FILE_FLAG_OPEN_REPARSE_POINT, 0)

    # MAXIMUM_REPARSE_DATA_BUFFER_SIZE = 16384 = (16 * 1024)
    buffer = DeviceIoControl(handle, FSCTL_GET_REPARSE_POINT, None, 16 * 1024)
    # Above will return an ugly string (byte array), so we'll need to parse it.
    # But first, we'll close the handle to our file so we're not locking it anymore.
    CloseHandle(handle)

    # Minimum possible length (assuming that the length of the target is bigger than 0)
    if len(buffer) < 9:
        return None

    # Parse and return our result.
    result = parse_reparse_buffer(buffer)
    offset = result[SYMBOLIC_LINK]['substitute_name_offset']
    ending = offset + result[SYMBOLIC_LINK]['substitute_name_length']
    # The substitute name is UTF-16; stripping the NUL bytes is a crude but
    # sufficient decode for ASCII paths.
    rpath = result[SYMBOLIC_LINK]['buffer'][offset:ending].replace('\x00', '')
    if len(rpath) > 4 and rpath[0:4] == '\\??\\':
        rpath = rpath[4:]
    return rpath


def realpath(fpath):
    from os import path
    while islink(fpath):
        rpath = readlink(fpath)
        if not path.isabs(rpath):
            rpath = path.abspath(path.join(path.dirname(fpath), rpath))
        fpath = rpath
    return fpath


def example():
    from os import system, unlink
    system('cmd.exe /c echo Hello World > test.txt')
    system('mklink test-link.txt test.txt')
    print 'IsLink: %s' % islink('test-link.txt')
    print 'ReadLink: %s' % readlink('test-link.txt')
    print 'RealPath: %s' % realpath('test-link.txt')
    unlink('test-link.txt')
    unlink('test.txt')


if __name__ == '__main__':
    example()
Adjust the attributes in the CreateFile to your needs, but for a normal situation, it should work. Feel free to improve on it.
It should also work for folder junctions if you use MOUNTPOINT instead of SYMBOLIC_LINK.
You may want to check that
sys.getwindowsversion()[0] >= 6
if you put this into something you're releasing, since this form of symbolic link is only supported on Vista+.

You can use the Python win32 API modules, e.g.:
import win32file
win32file.CreateSymbolicLink(srcDir, targetDir, 1)
See http://docs.activestate.com/activepython/2.5/pywin32/win32file__CreateSymbolicLink_meth.html for more details.
If you do not want to rely on that either, you can always use ctypes and call the CreateSymbolicLink Win32 API directly, which is a simple call anyway.
Here is an example call using ctypes:
import ctypes
kdll = ctypes.windll.LoadLibrary("kernel32.dll")
# Raw strings keep '\t' from being read as a tab; the new link name comes
# first, the existing target second.
kdll.CreateSymbolicLinkA(r"d:\testdir_link", r"d:\testdir", 1)
MSDN says the minimum supported client is Windows Vista.
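A slightly more careful ctypes sketch, using the wide-character variant with explicit argument types and error checking (the paths are hypothetical):
import ctypes
from ctypes import wintypes

kernel32 = ctypes.WinDLL('kernel32', use_last_error=True)
kernel32.CreateSymbolicLinkW.argtypes = (wintypes.LPCWSTR, wintypes.LPCWSTR, wintypes.DWORD)
kernel32.CreateSymbolicLinkW.restype = wintypes.BOOLEAN

SYMBOLIC_LINK_FLAG_DIRECTORY = 0x1  # 1 = directory link, 0 = file link

# New link name first, existing target second.
if not kernel32.CreateSymbolicLinkW(r'd:\testdir_link', r'd:\testdir',
                                    SYMBOLIC_LINK_FLAG_DIRECTORY):
    raise ctypes.WinError(ctypes.get_last_error())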

Based on the accepted answer by Charles, here are improved (and cross-platform) versions of the functions (Python 2.7 and 3.5+):
- islink() now also detects file symbolic links under Windows (just like the POSIX equivalent)
- parse_reparse_buffer() and readlink() now actually detect the type of reparse point (NTFS junction, symlink or generic), which is needed to correctly decode the path
- readlink() no longer fails with access denied on NTFS junctions or directory symlinks (unless you really have no permission to read attributes)
import os
import struct
import sys

if sys.platform == "win32":
    from win32file import *
    from winioctlcon import FSCTL_GET_REPARSE_POINT

__all__ = ['islink', 'readlink']

# Win32file doesn't seem to have this attribute.
FILE_ATTRIBUTE_REPARSE_POINT = 1024
# These are defined in win32\lib\winnt.py, but with wrong values
IO_REPARSE_TAG_MOUNT_POINT = 0xA0000003  # Junction
IO_REPARSE_TAG_SYMLINK = 0xA000000C


def islink(path):
    """
    Cross-platform islink implementation.

    Supports Windows NT symbolic links and reparse points.
    """
    if sys.platform != "win32" or sys.getwindowsversion()[0] < 6:
        return os.path.islink(path)
    # Note the parentheses: '==' binds tighter than '&' in Python.
    return bool(os.path.exists(path) and
                (GetFileAttributes(path) & FILE_ATTRIBUTE_REPARSE_POINT) ==
                FILE_ATTRIBUTE_REPARSE_POINT)


def parse_reparse_buffer(buf):
    """ Implementing the below in Python:

    typedef struct _REPARSE_DATA_BUFFER {
        ULONG  ReparseTag;
        USHORT ReparseDataLength;
        USHORT Reserved;
        union {
            struct {
                USHORT SubstituteNameOffset;
                USHORT SubstituteNameLength;
                USHORT PrintNameOffset;
                USHORT PrintNameLength;
                ULONG  Flags;
                WCHAR  PathBuffer[1];
            } SymbolicLinkReparseBuffer;
            struct {
                USHORT SubstituteNameOffset;
                USHORT SubstituteNameLength;
                USHORT PrintNameOffset;
                USHORT PrintNameLength;
                WCHAR  PathBuffer[1];
            } MountPointReparseBuffer;
            struct {
                UCHAR DataBuffer[1];
            } GenericReparseBuffer;
        } DUMMYUNIONNAME;
    } REPARSE_DATA_BUFFER, *PREPARSE_DATA_BUFFER;
    """
    # See https://learn.microsoft.com/en-us/windows-hardware/drivers/ddi/content/ntifs/ns-ntifs-_reparse_data_buffer
    data = {'tag': struct.unpack('<I', buf[:4])[0],
            'data_length': struct.unpack('<H', buf[4:6])[0],
            'reserved': struct.unpack('<H', buf[6:8])[0]}
    buf = buf[8:]

    if data['tag'] in (IO_REPARSE_TAG_MOUNT_POINT, IO_REPARSE_TAG_SYMLINK):
        keys = ['substitute_name_offset',
                'substitute_name_length',
                'print_name_offset',
                'print_name_length']
        if data['tag'] == IO_REPARSE_TAG_SYMLINK:
            keys.append('flags')

        # Parsing
        for k in keys:
            if k == 'flags':
                fmt, sz = '<I', 4
            else:
                fmt, sz = '<H', 2
            data[k] = struct.unpack(fmt, buf[:sz])[0]
            buf = buf[sz:]

    # Using the offset and lengths grabbed, we'll set the buffer.
    data['buffer'] = buf
    return data


def readlink(path):
    """
    Cross-platform implementation of readlink.

    Supports Windows NT symbolic links and reparse points.
    """
    if sys.platform != "win32":
        return os.readlink(path)

    # This wouldn't return true if the file didn't exist
    if not islink(path):
        # Mimic POSIX error
        raise OSError(22, 'Invalid argument', path)

    # Open the file correctly depending on the string type.
    if type(path) is type(u''):
        createfilefn = CreateFileW
    else:
        createfilefn = CreateFile
    # FILE_FLAG_OPEN_REPARSE_POINT alone is not enough if 'path'
    # is a symbolic link to a directory or a NTFS junction.
    # We need to set FILE_FLAG_BACKUP_SEMANTICS as well.
    # See https://learn.microsoft.com/en-us/windows/desktop/api/fileapi/nf-fileapi-createfilea
    handle = createfilefn(path, GENERIC_READ, 0, None, OPEN_EXISTING,
                          FILE_FLAG_BACKUP_SEMANTICS | FILE_FLAG_OPEN_REPARSE_POINT, 0)

    # MAXIMUM_REPARSE_DATA_BUFFER_SIZE = 16384 = (16 * 1024)
    buf = DeviceIoControl(handle, FSCTL_GET_REPARSE_POINT, None, 16 * 1024)
    # Above will return an ugly string (byte array), so we'll need to parse it.
    # But first, we'll close the handle to our file so we're not locking it anymore.
    CloseHandle(handle)

    # Minimum possible length (assuming that the length is bigger than 0)
    if len(buf) < 9:
        return type(path)()

    # Parse and return our result.
    result = parse_reparse_buffer(buf)
    if result['tag'] in (IO_REPARSE_TAG_MOUNT_POINT, IO_REPARSE_TAG_SYMLINK):
        offset = result['substitute_name_offset']
        ending = offset + result['substitute_name_length']
        rpath = result['buffer'][offset:ending].decode('UTF-16-LE')
    else:
        rpath = result['buffer']
    if len(rpath) > 4 and rpath[0:4] == '\\??\\':
        rpath = rpath[4:]
    return rpath
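A quick usage sketch of the functions above (the path is hypothetical; on non-Windows platforms the calls fall through to os.path.islink and os.readlink):
p = r'C:\Temp\my_junction'
if islink(p):
    # For junctions and symlinks this is the substitute name, with any
    # leading '\??\' prefix already stripped.
    print(readlink(p))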

You don't want to rely on external tools but you don't mind relying on the specific environment? I think you could safely assume that, if it's NTFS you're running on, the junction utility will probably be there.
But, if you mean you'd rather not call out to an external program, I've found the ctypes stuff to be invaluable. It allows you to call Windows DLLs directly from Python, and it's been in the standard library since Python 2.5.
You'd just have to figure out how junctions are actually created: there is no public CreateJunction() API; tools do it by opening the directory and issuing a DeviceIoControl call with FSCTL_SET_REPARSE_POINT. Best of luck with that, Microsoft don't seem to support it very well. You could disassemble the SysInternals junction program or linkd or one of the other tools to find out how they do it.
Me, I'm pretty lazy, I'd just call junction as an external process :-)
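If shelling out is acceptable, a sketch along those lines (assuming the Sysinternals junction.exe is on PATH; mklink /J is a cmd.exe builtin on Vista+ and needs no download):
import subprocess

link, target = r'C:\Temp\junction', r'C:\Temp\real_dir'  # hypothetical paths

# Sysinternals junction utility: junction <link> <target>
subprocess.check_call(['junction.exe', link, target])

# Or the cmd.exe builtin (Vista+, no extra download needed):
subprocess.check_call('mklink /J "{}" "{}"'.format(link, target), shell=True)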

Related

Python ctypes writing data to be read by C executable

I'm trying to learn how to use the Python ctypes library to write data to a file that can easily be read by C executables. In the little test case that I've put together, I'm running into some problems with reading/writing character arrays.
At the moment, I have three source files. write_struct.py creates a simple struct with two entries, an integer value called git and a character array called command, then writes the struct to a file using libc's fwrite through ctypes. read_struct.c and read_struct.h compile into an executable that internally defines a struct identical to the one in write_struct.py, then reads in the data written by the Python script and prints it out.
At the moment, the following values are assigned in the python file (not literally in the manner shown below, scroll down to see the actual code):
git = 1
command = 'cp file1 file2'
And when run, the C executable prints the following:
git: 1
command:
I realize that the problem is almost certainly in how the command variable is being assigned in the Python script. I have read that c_char_p() (the function I'm currently using to initialize the data in that variable) does not create a pointer to mutable memory, and that create_string_buffer() should be used instead; however, I'm not sure how this works when adding that data to a struct or writing it to a file. I'm also confused about how writing pointers/their data to a file works in the first place. What is the best way to go about doing this?
Thanks in advance to anyone that is able to help!!
The code of my three files is below for reference:
write_struct.py:
"""
write_struct.py
"""
from ctypes import *
libc = cdll.LoadLibrary("libc.so.6")
class DataStruct(Structure):
_fields_ = [("git", c_int),
("command", c_char_p)
]
def main():
pydata = DataStruct(1, c_char_p("cp file1 file2"))
libc.fopen.argtypes = c_char_p, c_char_p
libc.fopen.restype = c_void_p
libc.fwrite = libc.fwrite
libc.fwrite.argtypes = c_void_p, c_size_t, c_size_t, c_void_p
libc.fwrite.restype = c_size_t
libc.fclose = libc.fclose
libc.fclose.argtypes = c_void_p,
libc.fclose.restype = c_int
f = libc.fopen("stored_data", "wb")
libc.fwrite(byref(pydata), sizeof(pydata), 1, f)
libc.fclose(f)
return 0
main()
read_struct.c:
/*
 * read_struct.c
 */
#include "read_struct.h"

int main()
{
    data_struct cdata = malloc(DATASIZE);
    FILE *fp;
    if ((fp = fopen("stored_data", "r")) != NULL) {
        fread(cdata, DATASIZE, 1, fp);
        printf("git: %i\n", cdata->git);
        printf("command:");
        printf("%s\n", cdata->command);
        fclose(fp);
    } else {
        printf("Could not open file\n");
        exit(1);
    }
    return 0;
}
read_struct.h:
/*
 * read_struct.h
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct _data_struct *data_struct;

struct _data_struct {
    int git;
    char command[40];
};

#define DATASIZE sizeof(struct _data_struct)
You can write binary data directly with Python. ctypes can be used to create the structure, and it supports bit fields and unions; for simple structures the struct module can be used instead.
from ctypes import *

class DataStruct(Structure):
    _fields_ = [("git", c_int),
                ("command", c_char * 40)]  # You want an array here, not a pointer

pydata = DataStruct(1, b'cp file1 file2')  # byte string for initialization
with open('stored_data', 'wb') as f:       # write file in binary mode
    f.write(pydata)                        # ctypes supports conversion to bytes
import struct

# See struct docs for formatting codes:
#   i   = int (native-endian; use <i to force little-endian, >i for big-endian)
#   40s = char[40] (zero-padded if the initializer is shorter)
pydata = struct.pack('i40s', 1, b'cp file1 file2')
with open('stored_data2', 'wb') as f:
    f.write(pydata)
Ref: https://docs.python.org/3/library/struct.html#format-strings

C++ debug/print custom type with GDB : the case of nlohmann json library

I'm working on a project using nlohmann's json C++ implementation.
How can one easily explore nlohmann's JSON keys/vals in GDB ?
I tried to use this STL gdb wrapping since it provides helpers to explore standard C++ library structures that nlohmann's JSON lib is using.
But I don't find it convenient.
Here is a simple use case:
json foo;
foo["flex"] = 0.2;
foo["awesome_str"] = "bleh";
foo["nested"] = {{"bar", "barz"}};
What I would like to have in GDB:
(gdb) p foo
{
"flex" : 0.2,
"awesome_str": "bleh",
"nested": etc.
}
Current behavior
(gdb) p foo
$1 = {
  m_type = nlohmann::detail::value_t::object,
  m_value = {
    object = 0x129ccdd0,
    array = 0x129ccdd0,
    string = 0x129ccdd0,
    boolean = 208,
    number_integer = 312266192,
    number_unsigned = 312266192,
    number_float = 1.5427999782486669e-315
  }
}
(gdb) p foo.at("flex")
Cannot evaluate function -- may be inlined // I suppose it depends on my compilation process. But I guess it does not invalidate the question.
(gdb) p *foo.m_value.object
$2 = {
  _M_t = {
    _M_impl = {
      <std::allocator<std::_Rb_tree_node<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, nlohmann::basic_json<std::map, std::vector, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, bool, long long, unsigned long long, double, std::allocator, nlohmann::adl_serializer> > > >> = {
        <__gnu_cxx::new_allocator<std::_Rb_tree_node<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, nlohmann::basic_json<std::map, std::vector, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, bool, long long, unsigned long long, double, std::allocator, nlohmann::adl_serializer> > > >> = {<No data fields>}, <No data fields>},
      <std::_Rb_tree_key_compare<std::less<void> >> = {
        _M_key_compare = {<No data fields>}
      },
      <std::_Rb_tree_header> = {
        _M_header = {
          _M_color = std::_S_red,
          _M_parent = 0x4d72d0,
          _M_left = 0x4d7210,
          _M_right = 0x4d7270
        },
        _M_node_count = 5
      }, <No data fields>}
  }
}
I found my own answer by reading further into the GDB capabilities and Stack Overflow questions concerning printing std::string.
The short path is the easiest. The other path was hard, but I'm glad I managed to do it. There is lots of room for improvement.
There is an open issue for this particular matter here: https://github.com/nlohmann/json/issues/1952
Short path v3.1.2
I simply defined a gdb command as follows:
# this is a gdb script
# can be loaded from gdb using
# source my_script.txt (or .gdb or whatever you like)
define pjson
  # use nlohmann's builtin dump method, indent 4 and use space separator
  printf "%s\n", $arg0.dump(4, ' ', true).c_str()
end
# configure command helper (text displayed when typing 'help pjson' in gdb)
document pjson
Prints a nlohmann JSON C++ variable as a human-readable JSON string
end
Using it in gdb:
(gdb) source my_custom_script.gdb
(gdb) pjson foo
{
    "flex" : 0.2,
    "awesome_str": "bleh",
    "nested": {
        "bar": "barz"
    }
}
Short path v3.7.0 [EDIT] 2019-nov-06
One may also use the new to_string() method, but I could not get it to work within GDB with a live inferior process. The method below still works.
# this is a gdb script
# can be loaded from gdb using
# source my_script.txt (or .gdb or whatever you like)
define pjson
  # use nlohmann's builtin dump method, indent 4 and use space separator
  printf "%s\n", $arg0.dump(4, ' ', true, json::error_handler_t::strict).c_str()
end
# configure command helper (text displayed when typing 'help pjson' in gdb)
document pjson
Prints a nlohmann JSON C++ variable as a human-readable JSON string
end
April 18th 2020: WORKING FULL PYTHON GDB (with live inferior process and debug symbols)
Edit 2020-april-26: the offsets in the code here came out of the blue and are NOT compatible with all platforms/JSON lib compilations. The GitHub project is much more mature regarding this matter (3 platforms tested so far). The code is left there as is, since I won't maintain 2 codebases.
versions:
https://github.com/nlohmann/json version 3.7.3
GNU gdb (GDB) 8.3 for GNAT Community 2019 [rev=gdb-8.3-ref-194-g3fc1095]
c++ project built with GPRBUILD/ GNAT Community 2019 (20190517) (x86_64-pc-mingw32)
The following python code shall be loaded within gdb. I use a .gdbinit file sourced in gdb.
Github repo: https://github.com/LoneWanderer-GH/nlohmann-json-gdb
GDB script
Feel free to adopt the loading method of your choice (auto, or not, or IDE plugin, whatever)
set print pretty
# source stl_parser.gdb # if you like the good work done with those STL containers GDB parsers
source printer.py # the python file is given below
python gdb.printing.register_pretty_printer(gdb.current_objfile(), build_pretty_printer())
Python script
import gdb
import platform
import sys
import traceback

# adapted from https://github.com/hugsy/gef/blob/dev/gef.py
# their rights are theirs
HORIZONTAL_LINE = "_"  # u"\u2500"
LEFT_ARROW = "<-"      # "\u2190 "
RIGHT_ARROW = "->"     # " \u2192 "
DOWN_ARROW = "|"       # "\u21b3"

nlohmann_json_type_namespace = \
    r"nlohmann::basic_json<std::map, std::vector, std::__cxx11::basic_string<char, std::char_traits<char>, " \
    r"std::allocator<char> >, bool, long long, unsigned long long, double, std::allocator, nlohmann::adl_serializer>"

# STD black magic
MAGIC_STD_VECTOR_OFFSET = 16  # win 10 x64 values, beware on your platform
MAGIC_OFFSET_STD_MAP = 32     # win 10 x64 values, beware on your platform

# GDB black magic
nlohmann_json_type = gdb.lookup_type(nlohmann_json_type_namespace).pointer()
# for in-memory direct jumps. cast to type is still necessary to obtain values,
# but this could be changed by changing the types to simpler ones ?
std_rb_tree_node_type = gdb.lookup_type("std::_Rb_tree_node_base::_Base_ptr").pointer()
std_rb_tree_size_type = gdb.lookup_type("std::size_t").pointer()

# nlohmann_json reminder. any interface change should be reflected here
#
# enum class value_t : std::uint8_t
# {
#     null,             ///< null value
#     object,           ///< object (unordered set of name/value pairs)
#     array,            ///< array (ordered collection of values)
#     string,           ///< string value
#     boolean,          ///< boolean value
#     number_integer,   ///< number value (signed integer)
#     number_unsigned,  ///< number value (unsigned integer)
#     number_float,     ///< number value (floating-point)
#     discarded         ///< discarded by the parser callback function
# };

enum_literals_namespace = ["nlohmann::detail::value_t::null",
                           "nlohmann::detail::value_t::object",
                           "nlohmann::detail::value_t::array",
                           "nlohmann::detail::value_t::string",
                           "nlohmann::detail::value_t::boolean",
                           "nlohmann::detail::value_t::number_integer",
                           "nlohmann::detail::value_t::number_unsigned",
                           "nlohmann::detail::value_t::number_float",
                           "nlohmann::detail::value_t::discarded"]

enum_literal_namespace_to_literal = dict([(e, e.split("::")[-1]) for e in enum_literals_namespace])

INDENT = 4  # beautiful isn't it ?


def std_stl_item_to_int_address(node):
    return int(str(node), 0)


def parse_std_str_from_hexa_address(hexa_str):
    # https://stackoverflow.com/questions/6776961/how-to-inspect-stdstring-in-gdb-with-no-source-code
    return '"{}"'.format(gdb.parse_and_eval("*(char**){}".format(hexa_str)).string())


class LohmannJSONPrinter(object):
    """Print a nlohmann::json in GDB python

    BEWARE :
     - Contains shitty string formatting (defining lists and playing with
       ",".join(...) could be better; indent management is stone-age style)
     - Parsing barely tested, only with a live inferior process.
     - It could possibly work with a core dump + debug symbols. TODO: read that stuff
       https://doc.ecoscentric.com/gnutools/doc/gdb/Core-File-Generation.html
     - No idea what happens with no symbols available; lots of fields are
       retrieved by name and should be changed to offsets if possible
     - NO LIB VERSION MANAGEMENT. TODO: determine if there are serious variants
       in nlohmann data structures that would justify working with structures
     - PLATFORM DEPENDANT. TODO: remove the black magic offsets or handle them
       in a nicer way

    NB: If you are a python-kaizer-style-guru, please consider helping or
    teaching how to improve all that mess
    """

    def __init__(self, val, indent_level=0):
        self.val = val
        self.field_type_full_namespace = None
        self.field_type_short = None
        self.indent_level = indent_level

        self.function_map = {"nlohmann::detail::value_t::null": self.parse_as_leaf,
                             "nlohmann::detail::value_t::object": self.parse_as_object,
                             "nlohmann::detail::value_t::array": self.parse_as_array,
                             "nlohmann::detail::value_t::string": self.parse_as_str,
                             "nlohmann::detail::value_t::boolean": self.parse_as_leaf,
                             "nlohmann::detail::value_t::number_integer": self.parse_as_leaf,
                             "nlohmann::detail::value_t::number_unsigned": self.parse_as_leaf,
                             "nlohmann::detail::value_t::number_float": self.parse_as_leaf,
                             "nlohmann::detail::value_t::discarded": self.parse_as_leaf}

    def parse_as_object(self):
        assert (self.field_type_short == "object")

        o = self.val["m_value"][self.field_type_short]

        # traversing the tree is an adapted copy-pasta from the STL gdb parsers
        # (http://www.yolinux.com/TUTORIALS/src/dbinit_stl_views-1.03.txt and similar links)
        #
        # Simple GDB Macros writen by Dan Marinescu (H-PhD) - License GPL
        # Inspired by intial work of Tom Malnar,
        #   Tony Novac (PhD) / Cornell / Stanford,
        #   Gilad Mishne (PhD) and Many Many Others.
        # Contact: dan_c_marinescu@yahoo.com (Subject: STL)
        #
        # Modified to work with g++ 4.3 by Anders Elton
        # Also added _member functions, that instead of printing the entire
        # class in map, prints a member.
        node = o["_M_t"]["_M_impl"]["_M_header"]["_M_left"]
        # end = o["_M_t"]["_M_impl"]["_M_header"]
        tree_size = o["_M_t"]["_M_impl"]["_M_node_count"]

        # in memory alternatives:
        _M_t = std_stl_item_to_int_address(o.referenced_value().address)
        _M_t_M_impl_M_header_M_left = _M_t + 8 + 16    # adding bits
        _M_t_M_impl_M_node_count = _M_t + 8 + 16 + 16  # adding bits

        node = gdb.Value(long(_M_t_M_impl_M_header_M_left)).cast(std_rb_tree_node_type).referenced_value()
        tree_size = gdb.Value(long(_M_t_M_impl_M_node_count)).cast(std_rb_tree_size_type).referenced_value()

        i = 0
        if tree_size == 0:
            return "{}"
        else:
            s = "{\n"
            self.indent_level += 1
            while i < tree_size:
                # STL GDB scripts write "+1" which in my w10 x64 GDB makes a +32 bits move ...
                # may be platform dependant and should be taken with caution
                key_address = std_stl_item_to_int_address(node) + MAGIC_OFFSET_STD_MAP
                # print(key_object['_M_dataplus']['_M_p'])
                k_str = parse_std_str_from_hexa_address(hex(key_address))

                # offset = MAGIC_OFFSET_STD_MAP
                value_address = key_address + MAGIC_OFFSET_STD_MAP
                value_object = gdb.Value(long(value_address)).cast(nlohmann_json_type)

                v_str = LohmannJSONPrinter(value_object, self.indent_level + 1).to_string()

                k_v_str = "{} : {}".format(k_str, v_str)
                end_of_line = "\n" if tree_size <= 1 or i == tree_size else ",\n"

                s = s + (" " * (self.indent_level * INDENT)) + k_v_str + end_of_line  # ",\n"

                if std_stl_item_to_int_address(node["_M_right"]) != 0:
                    node = node["_M_right"]
                    while std_stl_item_to_int_address(node["_M_left"]) != 0:
                        node = node["_M_left"]
                else:
                    tmp_node = node["_M_parent"]
                    while std_stl_item_to_int_address(node) == std_stl_item_to_int_address(tmp_node["_M_right"]):
                        node = tmp_node
                        tmp_node = tmp_node["_M_parent"]

                    if std_stl_item_to_int_address(node["_M_right"]) != std_stl_item_to_int_address(tmp_node):
                        node = tmp_node
                i += 1
            self.indent_level -= 2
            s = s + (" " * (self.indent_level * INDENT)) + "}"
        return s

    def parse_as_str(self):
        return parse_std_str_from_hexa_address(str(self.val["m_value"][self.field_type_short]))

    def parse_as_leaf(self):
        s = "WTFBBQ !"
        if self.field_type_short == "null" or self.field_type_short == "discarded":
            s = self.field_type_short
        elif self.field_type_short == "string":
            s = self.parse_as_str()
        else:
            s = str(self.val["m_value"][self.field_type_short])
        return s

    def parse_as_array(self):
        assert (self.field_type_short == "array")
        o = self.val["m_value"][self.field_type_short]
        start = o["_M_impl"]["_M_start"]
        size = o["_M_impl"]["_M_finish"] - start
        # capacity = o["_M_impl"]["_M_end_of_storage"] - start
        # size_max = size - 1
        i = 0
        start_address = std_stl_item_to_int_address(start)
        if size == 0:
            s = "[]"
        else:
            self.indent_level += 1
            s = "[\n"
            while i < size:
                # STL GDB scripts write "+1" which in my w10 x64 GDB makes a +16 bits move ...
                offset = i * MAGIC_STD_VECTOR_OFFSET
                i_address = start_address + offset
                value_object = gdb.Value(long(i_address)).cast(nlohmann_json_type)
                v_str = LohmannJSONPrinter(value_object, self.indent_level + 1).to_string()
                end_of_line = "\n" if size <= 1 or i == size else ",\n"
                s = s + (" " * (self.indent_level * INDENT)) + v_str + end_of_line
                i += 1
            self.indent_level -= 2
            s = s + (" " * (self.indent_level * INDENT)) + "]"
        return s

    def is_leaf(self):
        return self.field_type_short != "object" and self.field_type_short != "array"

    def parse_as_aggregate(self):
        if self.field_type_short == "object":
            s = self.parse_as_object()
        elif self.field_type_short == "array":
            s = self.parse_as_array()
        else:
            s = "WTFBBQ !"
        return s

    def parse(self):
        # s = "WTFBBQ !"
        if self.is_leaf():
            s = self.parse_as_leaf()
        else:
            s = self.parse_as_aggregate()
        return s

    def to_string(self):
        try:
            self.field_type_full_namespace = self.val["m_type"]
            str_val = str(self.field_type_full_namespace)
            if not str_val in enum_literal_namespace_to_literal:
                return "TIMMY !"
            self.field_type_short = enum_literal_namespace_to_literal[str_val]
            return self.function_map[str_val]()
            # return self.parse()
        except:
            show_last_exception()
            return "NOT A JSON OBJECT // CORRUPTED ?"

    def display_hint(self):
        return self.val.type


# adapted from https://github.com/hugsy/gef/blob/dev/gef.py
# inspired by https://stackoverflow.com/questions/44733195/gdb-python-api-getting-the-python-api-of-gdb-to-print-the-offending-line-numbe
def show_last_exception():
    """Display the last Python exception."""
    print("")
    exc_type, exc_value, exc_traceback = sys.exc_info()

    print(" Exception raised ".center(80, HORIZONTAL_LINE))
    print("{}: {}".format(exc_type.__name__, exc_value))
    print(" Detailed stacktrace ".center(80, HORIZONTAL_LINE))

    for (filename, lineno, method, code) in traceback.extract_tb(exc_traceback)[::-1]:
        print("""{} File "{}", line {:d}, in {}()""".format(DOWN_ARROW, filename, lineno, method))
        print(" {} {}".format(RIGHT_ARROW, code))

    print(" Last 10 GDB commands ".center(80, HORIZONTAL_LINE))
    gdb.execute("show commands")
    print(" Runtime environment ".center(80, HORIZONTAL_LINE))
    print("* GDB: {}".format(gdb.VERSION))
    print("* Python: {:d}.{:d}.{:d} - {:s}".format(sys.version_info.major, sys.version_info.minor,
                                                   sys.version_info.micro, sys.version_info.releaselevel))
    print("* OS: {:s} - {:s} ({:s}) on {:s}".format(platform.system(), platform.release(),
                                                    platform.architecture()[0],
                                                    " ".join(platform.dist())))
    print(HORIZONTAL_LINE * 80)
    print("")
    exit(-6000)


def build_pretty_printer():
    pp = gdb.printing.RegexpCollectionPrettyPrinter("nlohmann_json")
    pp.add_printer(nlohmann_json_type_namespace, "^{}$".format(nlohmann_json_type_namespace), LohmannJSONPrinter)
    return pp


######
# executed at autoload (or to be executed manually in GDB)
# gdb.printing.register_pretty_printer(gdb.current_objfile(), build_pretty_printer())
Some (light) tests:
gpr file:
project Debug_Printer is

   for Source_Dirs use ("src", "include");
   for Object_Dir use "obj";
   for Main use ("main.cpp");
   for Languages use ("C++");

   package Naming is
      for Spec_Suffix ("c++") use ".hpp";
   end Naming;

   package Compiler is
      for Switches ("c++") use ("-O3", "-Wall", "-Woverloaded-virtual", "-g");
   end Compiler;

   package Linker is
      for Switches ("c++") use ("-g");
   end Linker;

end Debug_Printer;
main.cpp
#include "json.hpp" // I am using the standalone json.hpp from the repo release
#include <iostream>

using json = nlohmann::json;

int main() {
    json fooz;
    fooz = 0.7;

    json arr = {3, "25", 0.5};

    json one;
    one["first"] = "second";

    json foo;
    foo["flex"] = 0.2;
    foo["bool"] = true;
    foo["int"] = 5;
    foo["float"] = 5.22;
    foo["trap "] = "you fell";
    foo["awesome_str"] = "bleh";
    foo["nested"] = {{"bar", "barz"}};
    foo["array"] = {1, 0, 2};

    std::cout << "fooz" << std::endl;
    std::cout << fooz.dump(4) << std::endl << std::endl;

    std::cout << "arr" << std::endl;
    std::cout << arr.dump(4) << std::endl << std::endl;

    std::cout << "one" << std::endl;
    std::cout << one.dump(4) << std::endl << std::endl;

    std::cout << "foo" << std::endl;
    std::cout << foo.dump(4) << std::endl << std::endl;

    json mixed_nested;
    mixed_nested["Jean"] = fooz;
    mixed_nested["Baptiste"] = one;
    mixed_nested["Emmanuel"] = arr;
    mixed_nested["Zorg"] = foo;

    std::cout << "5th element" << std::endl;
    std::cout << mixed_nested.dump(4) << std::endl << std::endl;

    return 0;
}
outputs:
(gdb) source .gdbinit
Breakpoint 1, main () at F:\DEV\Projets\nlohmann.json\src\main.cpp:45
(gdb) p mixed_nested
$1 = {
    "Baptiste" : {
        "first" : "second"
    },
    "Emmanuel" : [
        3,
        "25",
        0.5,
    ],
    "Jean" : 0.69999999999999996,
    "Zorg" : {
        "array" : [
            1,
            0,
            2,
        ],
        "awesome_str" : "bleh",
        "bool" : true,
        "flex" : 0.20000000000000001,
        "float" : 5.2199999999999998,
        "int" : 5,
        "nested" : {
            "bar" : "barz"
        },
        "trap " : "you fell",
    },
}
Edit 2019-march-24: add precision given by Employed Russian.
Edit 2020-april-18: after a long night of struggling with python/gdb/stl I had something working by way of the GDB documentation for Python pretty printers. Please forgive any mistakes or misconceptions, I banged my head a whole night on this and everything is flurry-blurry now.
Edit 2020-april-18 (2): rb tree node and tree_size could be traversed in a more "in-memory" way (see above)
Edit 2020-april-26: add warning concerning the GDB python pretty printer.
My solution was to edit the ~/.gdbinit file.
define jsontostring
  printf "%s\n", $arg0.dump(2, ' ', true, nlohmann::detail::error_handler_t::strict).c_str()
end
This makes the "jsontostring" command available on every gdb session without the need of sourcing any files.
(gdb) jsontostring object

Parsing C in python with libclang but generated the wrong AST

I want to use the libclang Python bindings to generate the AST of a piece of C code. The source code is shown below.
#include <stdlib.h>
#include "adlist.h"
#include "zmalloc.h"

list *listCreate(void)
{
    struct list *list;

    if ((list = zmalloc(sizeof(*list))) == NULL)
        return NULL;
    list->head = list->tail = NULL;
    list->len = 0;
    list->dup = NULL;
    list->free = NULL;
    list->match = NULL;
    return list;
}
And the implementation I wrote:
#!/usr/bin/python
# vim: set fileencoding=utf-8
import clang.cindex
import asciitree
import sys

def node_children(node):
    return (c for c in node.get_children() if c.location.file.name == sys.argv[1])

def print_node(node):
    text = node.spelling or node.displayname
    kind = str(node.kind)[str(node.kind).index('.')+1:]
    return '{} {}'.format(kind, text)

if len(sys.argv) != 2:
    print("Usage: dump_ast.py [header file name]")
    sys.exit()

clang.cindex.Config.set_library_file('/usr/lib/llvm-3.6/lib/libclang-3.6.so')
index = clang.cindex.Index.create()
translation_unit = index.parse(sys.argv[1], ['-x', 'c++', '-std=c++11', '-D__CODE_GENERATOR__'])
print(asciitree.draw_tree(translation_unit.cursor, node_children, print_node))
But the final output of this test looks like this:
TRANSLATION_UNIT adlist.c
+--FUNCTION_DECL listCreate
   +--COMPOUND_STMT
      +--DECL_STMT
         +--STRUCT_DECL list
         +--VAR_DECL list
            +--TYPE_REF struct list
Obviously, the final result is wrong: much of the code was left unparsed. I have tried to traverse the translation unit, but the result is just what the tree shows, many nodes are missing. Why is that? And is there any method to solve the problem? Thank you!
I guess that the reason is that libclang is unable to parse malloc(), because stdlib has not been successfully included in this code and no user-defined definition has been provided for malloc.
The parse did not complete successfully, probably because you're missing some include paths.
You can confirm what the exact problem is by printing the diagnostic messages.
translation_unit = index.parse(sys.argv[1], args)

for diag in translation_unit.diagnostics:
    print(diag)
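If missing headers are the cause, passing include directories in the parse arguments usually fixes it. A sketch, assuming adlist.h and zmalloc.h sit next to the parsed file (and using '-x', 'c', which matches the C source better than the original C++ flags):
import os

src = sys.argv[1]
args = ['-x', 'c', '-I' + os.path.dirname(os.path.abspath(src)), '-D__CODE_GENERATOR__']
translation_unit = index.parse(src, args)
for diag in translation_unit.diagnostics:
    print(diag)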

gdb-python : Parsing structure's each field and print them with proper value, if exists

I am writing a Python script to automate debugging core dumps with GDB. I am trying to print a data structure which includes kernel data structures and lists (e.g. struct list_head). For example, the structure is something like this:
struct my_struct {
    struct my_hardware_context ahw;
    struct net_device *netdev;
    struct pci_dev *pdev;
    struct list_head mac_list;
    ....
    ....
};
I am using the following API to print this structure:
gdb.execute('p (*(struct my_struct *)dev_base->priv)')
So I am able to print the content of struct my_struct and of struct my_hardware_context ahw, but not the content of the pointers and lists (e.g. struct net_device *netdev, struct pci_dev *pdev, struct list_head mac_list); only the address is printed. So how can I print the content of *netdev, *pdev and mac_list from a gdb-python script?
EDITED: to make my question more clear.
I am using the following API to print this structure (it can be assumed that I have the right core dump and have added the proper symbols):
# to_string=True makes gdb.execute return the command output
main_struct = gdb.execute('p (*(struct my_struct *)dev_base->priv)', to_string=True)
print main_struct
Now it will print the values of all members of struct my_struct, but only up to one level: it will print the whole content of struct my_hardware_context ahw because it is an embedded instance, but it will not print the content of struct net_device *netdev, struct pci_dev *pdev, struct list_head mac_list, etc. So for now I need to do it manually, like below:
netdev = gdb.parse_and_eval('*(*(struct my_struct *)dev_base->next->priv).netdev')
print netdev
pdev = gdb.parse_and_eval('*(*(struct my_struct *)dev_base->next->priv).pdev')
print pdev
So I want to automate these steps. Is there any gdb-python API, or some other way, to iterate over struct my_struct and automatically print the values behind pointers, arrays and lists as well?
Thanks.
struct net_device, struct pci_dev from Linux are meant to be used by kernel and not userspace code. They're not even exported in the sanitized kernel headers you get with make headers_install for use with libc.
GDB can't print struct net_device, struct pci_dev because it doesn't have debug info describing the definition of those structures. Your userspace struct my_struct is declared to have opaque pointers to those structures. I don't think you should be doing that in the first place.
Edit After Core Dump Clarification
The trick is loading debug info from both the kernel and your driver module into GDB:
Grab a kernel with debuginfo (CONFIG_DEBUG_INFO). e.g. for Centos, get the matching kernel-debuginfo package from http://debuginfo.centos.org/6/x86_64/.
Get the .text, .data and .bss load addresses of your driver module by inspecting /sys/module/MY-DRIVER/sections/{.text,.data,.bss} from a system running your driver under normal operation.
Assuming the kernel with debug info is located at /usr/lib/debug/lib/modules/3.9.4-200.fc18.x86_64/vmlinux, run:
$ gdb /usr/lib/debug/lib/modules/3.9.4-200.fc18.x86_64/vmlinux vmcore
(gdb) add-symbol-file MY-DRIVER.ko TEXT-ADDR -s .data DATA-ADDR -s .bss BSS-ADDR
while replacing TEXT-ADDR, DATA-ADDR and BSS-ADDR with the address from the files under /sys/module/MY-DRIVER/sections/. (I think just lying and using an address of 0 would probably work in this case)
Verify that ptype struct net_device, ptype struct pci_dev, ptype my_struct work. Then after obtaining the address of a struct *my_struct the way you did before you should be able print its contents.
Traversing a Struct While Following Pointers
print-struct-follow-pointers.py
import gdb

def is_container(v):
    c = v.type.code
    return (c == gdb.TYPE_CODE_STRUCT or c == gdb.TYPE_CODE_UNION)

def is_pointer(v):
    return (v.type.code == gdb.TYPE_CODE_PTR)

def print_struct_follow_pointers(s, level_limit=3, level=0):
    indent = ' ' * level
    if not is_container(s):
        gdb.write('%s\n' % (s,))
        return
    if level >= level_limit:
        gdb.write('%s { ... },\n' % (s.type,))
        return
    gdb.write('%s {\n' % (s.type,))
    for k in s.type.keys():
        v = s[k]
        if is_pointer(v):
            gdb.write('%s %s: %s' % (indent, k, v))
            try:
                v1 = v.dereference()
                v1.fetch_lazy()
            except gdb.error:
                gdb.write(',\n')
                continue
            else:
                gdb.write(' -> ')
                print_struct_follow_pointers(v1, level_limit, level + 1)
        elif is_container(v):
            gdb.write('%s %s: ' % (indent, k))
            print_struct_follow_pointers(v, level_limit, level + 1)
        else:
            gdb.write('%s %s: %s,\n' % (indent, k, v))
    gdb.write('%s},\n' % (indent,))

class PrintStructFollowPointers(gdb.Command):
    '''
    print-struct-follow-pointers [/LEVEL_LIMIT] STRUCT-VALUE
    '''

    def __init__(self):
        super(PrintStructFollowPointers, self).__init__(
            'print-struct-follow-pointers',
            gdb.COMMAND_DATA, gdb.COMPLETE_SYMBOL, False)

    def invoke(self, arg, from_tty):
        s = arg.find('/')
        if s == -1:
            (expr, limit) = (arg, 3)
        else:
            if arg[:s].strip():
                (expr, limit) = (arg, 3)
            else:
                i = s + 1
                for (i, c) in enumerate(arg[s+1:], s + 1):
                    if not c.isdigit():
                        break
                end = i
                digits = arg[s+1:end]
                try:
                    limit = int(digits)
                except ValueError:
                    raise gdb.GdbError(PrintStructFollowPointers.__doc__)
                (expr, limit) = (arg[end:], limit)
        try:
            v = gdb.parse_and_eval(expr)
        except gdb.error, e:
            raise gdb.GdbError(e.message)
        print_struct_follow_pointers(v, limit)

PrintStructFollowPointers()
Sample Session
(gdb) source print-struct-follow-pointers.py
(gdb) print-struct-follow-pointers *p
You can limit the levels of embedded structures printed:
(gdb) print-struct-follow-pointers/4 *p

Windows Common Item Dialog: ctypes + COM access violation

I am trying to use the ctypes module to make calls to Windows' Common Item Dialog API. The code shown below is roughly based on the steps outlined in the MSDN documentation. Its only dependency is the comtypes.GUID module.
import ctypes
from ctypes import byref, POINTER, c_int, c_long
from ctypes.wintypes import HWND, HRESULT
from comtypes import GUID
CLSID_FileOpenDialog = '{DC1C5A9C-E88A-4DDE-A5A1-60F82A20AEF7}'
IID_IFileDialog = '{42F85136-DB7E-439C-85F1-E4075D135FC8}'
#IID_IFileOpenDialog = '{D57C7288-D4AD-4768-BE02-9D969532D960}'
CLSCTX_SERVER = 5
COINIT_APARTMENTTHREADED = 2
FOS_PICKFOLDERS = 32
FOS_FORCEFILESYSTEM = 64
ole32 = ctypes.windll.ole32
CoCreateInstance = ole32.CoCreateInstance
CoInitializeEx = ole32.CoInitializeEx
CoInitializeEx(None, COINIT_APARTMENTTHREADED)
ptr = c_int()
error = CoCreateInstance(
    byref(GUID(CLSID_FileOpenDialog)), None, CLSCTX_SERVER,
    byref(GUID(IID_IFileDialog)), byref(ptr))
assert error == 0
ptr = ptr.value
c_long_p = ctypes.POINTER(ctypes.c_int)
print('Pointer to COM object: %s' % ptr)
vtable = ctypes.cast(ptr, c_long_p).contents.value
print('Pointer to vtable: %s' % vtable)
func_proto = ctypes.WINFUNCTYPE(HRESULT, HWND)
# Calculating function pointer offset: 3rd entry in vtable; 32-bit => 4 bytes
show_p = ctypes.cast(vtable + 3*4, c_long_p).contents.value
print('Pointer to show(): %s' % show_p)
show = func_proto(show_p)
show(0)
Everything works as intended until the first call to show(0):
WindowsError: exception: access violation reading 0xXXXXXXXX
(Output may vary.) For comparison, I have carried out the same steps in AutoHotkey_L, which has direct access to COM.
CLSID := "{DC1C5A9C-E88A-4DDE-A5A1-60F82A20AEF7}"
IID := "{42F85136-DB7E-439C-85F1-E4075D135FC8}"
ptr := ComObjCreate(CLSID, IID)
vtable := NumGet(ptr + 0, 0, "Ptr")
show := NumGet(vtable + 0, 3 * A_PtrSize, "Ptr")
MsgBox ptr: %ptr% vtable: %vtable% show: %A_PtrSize%
DllCall(show, "Ptr", ptr, "Ptr", 44)
The resulting macro pops up an Open File dialog, as expected. The vtable pointer offsets are the same in both cases, but only the Python version throws up an access violation.
Can anyone shed some light on this?
[I apologize for not adding more hyperlinks where appropriate, but as a new user I am limited to two at a time.]
Background:
I am putting together a lightweight module which provides a native save/open file dialog for use in Python scripts. So far I have been unable to find an implementation in pure Python. Those that exist rely on UI toolkits such as Tkinter or wxPython.
Here is the solution:
COM methods take an additional parameter: the 'this' pointer. It is implicit when you call the method from C++; in C (and in ctypes) you must supply it yourself.
Change the line
func_proto = ctypes.WINFUNCTYPE(HRESULT, HWND)
into
func_proto = ctypes.WINFUNCTYPE(HRESULT, c_long, HWND)
and this line
show(0)
into
show(ptr, 0)
and your code will work.
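Putting both changes together, the tail of the script becomes:
# COM methods receive the interface pointer ('this') as their first
# argument, so it appears in both the prototype and the call.
func_proto = ctypes.WINFUNCTYPE(HRESULT, c_long, HWND)
show = func_proto(show_p)
show(ptr, 0)  # first argument: the IFileDialog pointer; second: owner HWND (0 = none)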
