Binary reading with python gives unexpected results - python

I'm trying to read some binary files with python for my analysis generated with Zemax OpticStudio. The structure of the file is supposed to be the following:
2 x 32-bit integer as header
n chunks of data
Each chunk is made by
32-bit integer indicating the number of C struc that come after
m C structures
The structures' definition is the following:
typedef struct
{
unsigned int status;
int level;
int hit_object;
int hit_face;
int unused;
int in_object;
int parent;
int storage;
int xybin, lmbin;
double index, starting_phase;
double x, y, z;
double l, m, n;
double nx, ny, nz;
double path_to, intensity;
double phase_of, phase_at;
double exr, exi, eyr, eyi, ezr, ezi;
}
which has a size of 208 bytes, for your convenience.
Here is the code that I wrote with some research and a couple of brilliant answers from here.
from pathlib import Path
from functools import partial
from io import DEFAULT_BUFFER_SIZE
import struct
def little_endian_int(x):
return int.from_bytes(x,'little')
def file_byte_iterator(path):
"""iterator over lazily loaded file
"""
path = Path(path)
with path.open('rb') as file:
reader = partial(file.read1, DEFAULT_BUFFER_SIZE)
file_iterator = iter(reader, bytes())
for chunk in file_iterator:
yield from chunk
def ray_tell(rays_idcs:list,ray_idx:int,seg_idx:int):
idx = rays_idcs[ray_idx][0]
idx += 4 + 208*seg_idx
return idx
def read_header(bytearr:bytearray):
version = int.from_bytes(bytearr[0:4],'little')
zrd_format = version//10000
version = version%10000
num_seg_max = int.from_bytes(bytearr[4:8],'little')
return zrd_format,version,num_seg_max
def rays_indices(bytearr:bytearray):
index=8
rays=[]
while index <len(bytearr):
num_seg = int.from_bytes(bytearr[index:index+4],'little')
rays.append((index,num_seg))
index = index+4 + 208*num_seg
return rays
def read_ray(bytearr:bytearray,ray):
ray_idx,num_seg = ray
data = []
ray_idx = ray_idx + 4
seg_idx=0
for ray_idx in range(8,8+num_seg*208,208):
offsets = [0,4,8,12,16,20,24,28,32,36,40,48,56,64,72,80,88,96,104,112,120,128,136,144,152,160,168,176,184,192,200]
int_vars = offsets[0:11]
doubl_vars = offsets[11:]
data_integ = [bytearr[ray_idx+offset:ray_idx+offset+4] for offset in int_vars]
data_doubl = [bytearr[ray_idx+offset:ray_idx+offset+8] for offset in doubl_vars]
data.append([seg_idx,data_integ,data_doubl])
seg_idx += 1
return data
file="test_uncompressed.ZRD"
raypath = {}
filebin = bytearray(file_byte_iterator(file))
header = read_header(filebin)
print(header)
rays_idcs = rays_indices(filebin)
rays = []
for ray in rays_idcs:
rays.append(read_ray(filebin,ray))
ray = rays[1] #Random ray
segm = ray[2] #Random segm
ints = segm[1]
doub = segm[2]
print("integer vars:")
for x in ints:
print(x,little_endian_int(x))
print("double vars:")
for x in doub:
print(x,struct.unpack('<d',x))
I have verified that all of the structures have the right size and number of chunks and structures (my reading matches the number of segments and rays that I read with Zemax, ) , and thanks to the header, I verified the endianness of the file (little endian).
My output is the following:
(0, 2002)
bytearray(b'\x1f\xd8\x9c?') 1067243551
bytearray(b'\x06\x80\x00\x00') 32774
bytearray(b'\x02\x00\x00\x00') 2
bytearray(b'\x11\x00\x00\x00') 17
bytearray(b'\x02\x00\x00\x00') 2
bytearray(b'\x00\x00\x00\x00') 0
bytearray(b'\x11\x00\x00\x00') 17
bytearray(b'\x01\x00\x00\x00') 1
bytearray(b'\x00\x00\x00\x00') 0
bytearray(b'\x00\x00\x00\x00') 0
double vars:
bytearray(b'\x00\x00\x00\x00# \xac\xe8') (-1.6425098109028998e+196,)
bytearray(b'\xe8\xe3\xf9?\x00\x00\x00\x00') (5.3030112e-315,)
bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00') (0.0,)
bytearray(b'\x00\x00\x00\x00p_\xb4\xec') (-4.389425605765071e+215,)
bytearray(b'5\xe3\x9d\xbf\xf0\xbd"\xa2') (-3.001836066957746e-144,)
bytearray(b'z"\xc0?\x00\x00\x00\x00') (5.28431047e-315,)
bytearray(b'\x00\x00\x00\x00 \xc9+\xa3') (-2.9165705864036956e-139,)
bytearray(b'g\xd4\xcd?\x9ch{ ') (3.2707669223572687e-152,)
bytearray(b'q\x1e\xef?\x00\x00\x00\x00') (5.299523535e-315,)
bytearray(b'\x00\x00\x00\x00%\x0c\xb4A') (336340224.0,)
bytearray(b'\t\xf2u\xbf\\3L\xe6') (-5.991371249309652e+184,)
bytearray(b'\xe1\xff\xef\xbf1\x8dV\x1e') (1.5664573023148095e-162,)
bytearray(b'\xa1\xe9\xe8?\x9c\x9a6\xfc') (-2.202825582975923e+290,)
bytearray(b'qV\xb9?\x00\x00\x00\x00') (5.28210966e-315,)
bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00') (0.0,)
bytearray(b'\x00\x00\x00\x00\xc6\xfd\x0c\xa1') (-1.7713316840526727e-149,)
bytearray(b'\x96\x94\x8d?\xad\xf9(\xcc') (-7.838624888507203e+58,)
bytearray(b'yN\xb2\xbff.\\\x1a') (1.0611651097687064e-181,)
bytearray(b'\xb9*\xae?\xac\xaf\xe5\xe1') (-3.90257774261585e+163,)
bytearray(b'c\xab\xd2\xbf\xccQ\x8bj') (1.7130904564012918e+205,)
bytearray(b'\xc8\xea\x8c\xbf\xdf\xdc\xe49') (8.22891935818188e-30,)
I'm reading correctly just the int values. I don't understand why I get those binaries for all the other variables
EDIT
I want to highlight that bytearrays contain non-hexadecimal digits, and I'm sure that binary files are not corrupted, since I can read those in zemax

Solved.
It was just an error in my pointer arithmetic in the read_ray function. Thanks to Mad Physicist for his suggestion to unpack the whole structure which put me in the right direction.
def read_ray(bytearr:bytearray,ray):
ray_idx,num_seg = ray
data = []
assert num_seg==little_endian_int(bytearr[ray_idx:ray_idx+4])
ray_idx = ray_idx + 4
for seg_ptr in range(ray_idx,ray_idx + num_seg*208,208):
...
data_integ = [bytearr[seg_ptr+offset:seg_ptr+offset+4] for offset in int_vars]
data_doubl = [bytearr[seg_ptr+offset:seg_ptr+offset+8] for offset in doubl_vars]
...
return data

Related

Convert buffer repesents list of int little indian Python class

I'm trying to get data from buffer represents as string,
Example:
got :
str = "0004000001000000020000000A000000"
class MyData:
length
some_data
array_data
buf_data
data = parse(str)
Except :
length=1024, some_data=1, array_data=[2,10], buf_data="000000020000010"
Explain:
length=1024 since the 8 numbers "00040000" repesnts an hex number in little indian
and the rest the same idea,
"00040000 01000000 0200000 00A000000"
1024, 1, 2, 10
any idea?
I have some solution but it's too messy and isn't easy to support
This is one way to do it:
class MyData:
mmap = [16**1, 16**0, 16**3, 16**2, 16**5, 16**4, 16**7, 16**6]
def __init__(self, buffer):
self.buffer = buffer
self.integers = []
def get_integers(self):
if len(self.integers) == 0:
for i in range(0, len(self.buffer), 8):
a = 0
for x, y in zip(self.buffer[i:i+8], self.mmap):
a += int(x, 16) * y
self.integers.append(a)
return self.integers
mydata = MyData('0004000001000000020000000A000000')
print(mydata.get_integers())
Output:
[1024, 1, 2, 10]
NOTE: This is specifically for 32-bit unsigned values

Loading every nth element with numpy.fromfile [duplicate]

This question already has an answer here:
Read binary flatfile and skip bytes
(1 answer)
Closed 2 years ago.
I want to create a numpy array from a binary file using np.fromfile. The file contains a 3D array, and I'm only concerned with a certain cell in each frame.
x = np.fromfile(file, dtype='int32', count=width*height*frames)
vals = x[5::width*height]
The code above would work in theory, but my file is very large and reading it all into x causes memory errors. Is there a way to use fromfile to only get vals to begin with?
This may be horribly inefficient but it works:
import numpy as np
def read_in_chunks(fn, offset, step, steps_per_chunk, dtype=np.int32):
out = []
fd = open(fn, 'br')
while True:
chunk = (np.fromfile(fd, dtype=dtype, count=steps_per_chunk*step)
[offset::step])
if chunk.size==0:
break
out.append(chunk)
return np.r_[tuple(out)]
x = np.arange(100000)
x.tofile('test.bin')
b = read_in_chunks('test.bin', 2, 100, 6, int)
print(b)
Update:
Here's one that uses seek to skip over the unwanted stuff. It works for me, but is totally undertested.
def skip_load(fn, offset, step, dtype=np.float, n = 10**100):
elsize = np.dtype(dtype).itemsize
step *= elsize
offset *= elsize
fd = open(fn, 'rb') if isinstance(fn, str) else fn
out = []
pos = fd.tell()
target = ((pos - offset - 1) // step + 1) * step + offset
fd.seek(target)
while n > 0:
if (fd.tell() != target):
return np.frombuffer(b"".join(out), dtype=dtype)
out.append(fd.read(elsize))
n -= 1
if len(out[-1]) < elsize:
return np.frombuffer(b"".join(out[:-1]), dtype=dtype)
target += step
fd.seek(target)
return np.frombuffer(b"".join(out), dtype=dtype)

Array of complex numbers in PyopenCL

I've been working on a new problem with PyopenCl in which i have to deal with complex numbers. More accurately, it would be really handy to use a 2D numpy array with complex numbers inside.
Something like:
np_array[np_array[C_number, C_number, ..], np_array[C_number, C_number, ..], ...]
Then for the results i would need a simple 1D numpy array of complex numbers.
I've also noticed that pyopencl sees a numpy complex number as a float2, for which i use a float16 for my data data array since I have around 8 numbers to deal with.
To work out the basic operations I've built a simple program.
I've worked out building the initials arrays and sending them to the kernel but the results and different from what i expected. I guess it has something to do with the thread's ID but i can't seem to figure it out.
The code i'm using is the following.
import pyopencl as cl
import numpy as np
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
MF = cl.mem_flags
M = 3
zero = np.complex64(0.0)
X1_h = np.array([1 + 1j*2, 2 + 1j*3, 3 + 1j*4]).astype(np.complex64)
X2_h = np.array([1 + 1j*2, 2 + 1j*3, 3 + 1j*4]).astype(np.complex64)
X3_h = np.array([1 + 1j*2, 2 + 1j*3, 3 + 1j*4]).astype(np.complex64)
Y1_h = np.array([4 + 1j*5, 5 + 1j*6, 6 + 1j*7]).astype(np.complex64)
Y2_h = np.array([4 + 1j*5, 5 + 1j*6, 6 + 1j*7]).astype(np.complex64)
Y3_h = np.array([4 + 1j*5, 5 + 1j*6, 6 + 1j*7]).astype(np.complex64)
aux_h = np.complex64(1 + 1j*1)
RES_h = np.empty_like(X1_h)
dados_h = []
for i in range(3):
dados_h.append(np.array([X1_h[i], X2_h[i], X3_h[i], Y1_h[i], Y2_h[i], Y3_h[i]]).astype(np.complex64))
dados_h = np.array(dados_h).astype(np.complex64)
print dados_h
aux_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=aux_h)
dados_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=dados_h)
RES_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf = RES_h)
Source = """
__kernel void soma(__global float2 *aux, __global float16 *dados, __global float2 *res){
const int gid_x = get_global_id(0);
const int gid_y = get_global_id(1);
res[gid_x].x = dados[gid_y].s0;
res[gid_x].y = dados[gid_y].s1;
}
"""
prg = cl.Program(ctx, Source).build()
completeEvent = prg.soma(queue, (M,), None, aux_d, dados_d, RES_d)
completeEvent.wait()
cl.enqueue_copy(queue, RES_h, RES_d)
print "GPU"
print RES_h
The output I'm getting is the following:
[[ 1.+2.j 1.+2.j 1.+2.j 4.+5.j 4.+5.j 4.+5.j]
[ 2.+3.j 2.+3.j 2.+3.j 5.+6.j 5.+6.j 5.+6.j]
[ 3.+4.j 3.+4.j 3.+4.j 6.+7.j 6.+7.j 6.+7.j]]
GPU
[ 1.+2.j 1.+2.j 1.+2.j]
My expected output is:
[ 1.+2.j 2.+3.j 3.+4.j]
I can't understand how I'm getting that result. As said, i believe it to be something related to the thread IDs but i can't figure it out. If i use gid_x instead of gid_y on the right part of the red[gid_x] = ... I get the following
[ 1.+2.j 2.+3.j 6.+7.j]
Can anyone give me some insight in what I'm doing wrong, please?
You're launching a 1D kernel, so get_global_id(1) will always return 0. This explains why your kernel simply copies the first element of the dados array into each element of the output.
Using a float16 to represent one 'row' of your input only works if you actually have 8 complex numbers per row. In your example you only have 6, which is why you don't quite get the correct results when copying from dados[gid_x].
To allow your code to deal with an arbitrary row size, simply pass the width in as a parameter, and then compute the linear index manually:
__kernel void soma(__global float2 *aux,
__global float2 *dados,
__global float2 *res,
int rowWidth){
const int gid_x = get_global_id(0);
res[gid_x] = dados[gid_x*rowWidth];
}
and then pass the row-width as an extra argument when you launch the kernel:
# Pass your actual row-width instead of 6!
completeEvent = prg.soma(queue, (M,), None, aux_d, dados_d, RES_d, np.int32(6))

Matlab to Python conversion - Can't assign to function call

I have recently been trying to convert a piece of Matlab code into Python code.
I have made most of the changes that I need to however, the issue I am having is the line where it says:
y(index(m)) = 1-x(index(m));
I get the error:
"Can't assign to function call"
However I am not sure how to restructure it in order to remove this error.
I have had a look around and people mention "get item" and "set item" however I have tried to use them, but I can't get them to work (probably because I can't figure out the structure)
Here is the full code:
import numpy
N = 100;
B = N+1;
M = 5e4;
burnin = M;
Niter = 20;
p = ones(B,Niter+1)/B;
hit = zeros(B,1);
for j in range(1,Niter):
x = double(rand(1,N)>0.5);
bin_x = 1+sum(x);
index = ceil(N*rand(1,M+burnin));
acceptval = rand(1,M+burnin);
for m in range(1,M+burnin):
y = x;
y(index(m)) = 1-x(index(m));
bin_y = 1+sum(y);
alpha = min(1, p(bin_x,j)/p(bin_y,j) );
if acceptval(m)<alpha:
x = y; bin_x = bin_y;
end
if m > burnin: hit(bin_x) = hit(bin_x)+1; end
end
pnew = p[:,j];
for b in range(1,B-1):
if (hit(b+1)*hit(b) == 0):
pnew(b+1) = pnew(b)*(p(b+1,j)/p(b,j));
else:
g(b,j) = hit(b+1)*hit(b) / (hit(b+1)+hit(b));
g_hat(b) = g(b,j)/sum(g(b,arange(1,j)));
pnew(b+1) = pnew(b)*(p(b+1,j)/p(b,j))+((hit(b+1)/hit(b))^g_hat(b));
end
end
p[:,j+1] = pnew/sum(pnew);
hit[:] = 0;
end
Thanks in advance
The round brackets () indicate a function. For indexing you need [] square brackets - but that is only the first of many, many errors... I am currently going through line by line, but it's taking a while.
This code at least runs... you need to figure out whether the indexing is doing what you are expecting since Python arrays are indexed from zero, and Matlab arrays start at 1. I tried to fix that in a couple of places but didn't go through line by line - that's debugging.
Some key learnings:
There is no end statement... just stop indenting
When you import a library, you need to reference it (numpy.zeros, not zeros)
Lists are indexed from zero, not one
Indexing is done with [], not ()
Creating an array of random numbers is done with [random.random() for r in xrange(N)], not random(N).
... and many other things you will find as you look through the code below.
Good luck!
import numpy
import random
N = int(100);
B = N+1;
M = 5e4;
burnin = M;
Niter = 20;
p = numpy.ones([B,Niter+1])/B;
hit = numpy.zeros([B,1]);
g = numpy.zeros([B, Niter]);
b_hat = numpy.zeros(B);
for j in range(1,Niter):
x = [float(random.randint(0,1)>0.5) for r in xrange(N)];
bin_x = 1+sum(x);
index = [random.randint(0,N-1) for r in xrange(int(M+burnin))];
#acceptval = rand(1,M+burnin);
acceptval = [random.random() for r in xrange(int(M+burnin))];
for m in range(1,int(M+burnin)):
y = x;
y[index[m]] = 1-x[index[m]];
bin_y = 1+sum(y);
alpha = min(1, p[bin_x,j]/p[bin_y,j] );
if acceptval[m]<alpha:
x = y; bin_x = bin_y;
if m > burnin:
hit[bin_x] = hit[bin_x]+1;
pnew = p[:,j];
for b in range(1,B-1):
if (hit[b+1]*hit[b] == 0):
pnew[b+1] = pnew[b]*(p[b+1,j]/p[b,j]);
else:
g[b,j] = hit[b+1]*hit[b] / [hit[b+1]+hit[b]];
g_hat[b] = g[b,j]/sum(g[b,numpy.arange(1,j)]);
pnew[b+1] = pnew[b]*(p[b+1,j]/p[b,j])+((hit[b+1]/hit[b])^g_hat[b]);
p[:,j+1] = pnew/sum(pnew);
hit[:] = 0;

How to efficiently parse fixed width files?

I am trying to find an efficient way of parsing files that holds fixed width lines. For example, the first 20 characters represent a column, from 21:30 another one and so on.
Assuming that the line holds 100 characters, what would be an efficient way to parse a line into several components?
I could use string slicing per line, but it's a little bit ugly if the line is big. Are there any other fast methods?
Using the Python standard library's struct module would be fairly easy as well as fairly fast since it's written in C. The code below how it use it. It also allows columns of characters to be skipped by specifying negative values for the number of characters in the field.
import struct
fieldwidths = (2, -10, 24)
fmtstring = ' '.join('{}{}'.format(abs(fw), 'x' if fw < 0 else 's') for fw in fieldwidths)
# Convert Unicode input to bytes and the result back to Unicode string.
unpack = struct.Struct(fmtstring).unpack_from # Alias.
parse = lambda line: tuple(s.decode() for s in unpack(line.encode()))
print('fmtstring: {!r}, record size: {} chars'.format(fmtstring, struct.calcsize(fmtstring)))
line = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\n'
fields = parse(line)
print('fields: {}'.format(fields))
Output:
fmtstring: '2s 10x 24s', recsize: 36 chars
fields: ('AB', 'MNOPQRSTUVWXYZ0123456789')
Here's a way to do it with string slices, as you were considering but were concerned that it might get too ugly. It is kind of complicated and speedwise it's about the same as the version based the struct module — although I have an idea about how it could be sped up (which might make the extra complexity worthwhile). See update below on that topic.
from itertools import zip_longest
from itertools import accumulate
def make_parser(fieldwidths):
cuts = tuple(cut for cut in accumulate(abs(fw) for fw in fieldwidths))
pads = tuple(fw < 0 for fw in fieldwidths) # bool values for padding fields
flds = tuple(zip_longest(pads, (0,)+cuts, cuts))[:-1] # ignore final one
parse = lambda line: tuple(line[i:j] for pad, i, j in flds if not pad)
# Optional informational function attributes.
parse.size = sum(abs(fw) for fw in fieldwidths)
parse.fmtstring = ' '.join('{}{}'.format(abs(fw), 'x' if fw < 0 else 's')
for fw in fieldwidths)
return parse
line = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\n'
fieldwidths = (2, -10, 24) # negative widths represent ignored padding fields
parse = make_parser(fieldwidths)
fields = parse(line)
print('format: {!r}, rec size: {} chars'.format(parse.fmtstring, parse.size))
print('fields: {}'.format(fields))
Output:
format: '2s 10x 24s', rec size: 36 chars
fields: ('AB', 'MNOPQRSTUVWXYZ0123456789')
Update
As I suspected, there is a way of making the string-slicing version of the code faster — which in Python 2.7 make it about the same speed as the version using struct, but in Python 3.x make it 233% faster (as well as the un-optimized version of itself which is about the same speed as the struct version).
What the version presented above does is define a lambda function that's primarily a comprehension that generates the limits of a bunch of slices at runtime.
parse = lambda line: tuple(line[i:j] for pad, i, j in flds if not pad)
Which is equivalent to a statement like the following, depending on the values of i and j in the for loop, to something looking like this:
parse = lambda line: tuple(line[0:2], line[12:36], line[36:51], ...)
However the latter executes more than twice as fast since the slice boundaries are all constants.
Fortunately it relatively easy to convert and "compile" the former into the latter using the built-in eval() function:
def make_parser(fieldwidths):
cuts = tuple(cut for cut in accumulate(abs(fw) for fw in fieldwidths))
pads = tuple(fw < 0 for fw in fieldwidths) # bool flags for padding fields
flds = tuple(zip_longest(pads, (0,)+cuts, cuts))[:-1] # ignore final one
slcs = ', '.join('line[{}:{}]'.format(i, j) for pad, i, j in flds if not pad)
parse = eval('lambda line: ({})\n'.format(slcs)) # Create and compile source code.
# Optional informational function attributes.
parse.size = sum(abs(fw) for fw in fieldwidths)
parse.fmtstring = ' '.join('{}{}'.format(abs(fw), 'x' if fw < 0 else 's')
for fw in fieldwidths)
return parse
I'm not really sure if this is efficient, but it should be readable (as opposed to do the slicing manually). I defined a function slices that gets a string and column lengths, and returns the substrings. I made it a generator, so for really long lines, it doesn't build a temporary list of substrings.
def slices(s, *args):
position = 0
for length in args:
yield s[position:position + length]
position += length
Example
In [32]: list(slices('abcdefghijklmnopqrstuvwxyz0123456789', 2))
Out[32]: ['ab']
In [33]: list(slices('abcdefghijklmnopqrstuvwxyz0123456789', 2, 10, 50))
Out[33]: ['ab', 'cdefghijkl', 'mnopqrstuvwxyz0123456789']
In [51]: d,c,h = slices('dogcathouse', 3, 3, 5)
In [52]: d,c,h
Out[52]: ('dog', 'cat', 'house')
But I think the advantage of a generator is lost if you need all columns at once. Where one could benefit from is when you want to process columns one by one, say in a loop.
Two more options that are easier and prettier than already mentioned solutions:
The first is using pandas:
import pandas as pd
path = 'filename.txt'
#inferred - as suggested in the comments by James Paul Mason
data = pd.read_fwf(path, colspecs='infer')
# Or using Pandas with a column specification
col_specification = [(0, 20), (21, 30), (31, 50), (51, 100)]
data = pd.read_fwf(path, colspecs=col_specification)
And the second option using numpy.loadtxt:
import numpy as np
# Using NumPy and letting it figure it out automagically
data_also = np.loadtxt(path)
It really depends on in what way you want to use your data.
The code below gives a sketch of what you might want to do if you have some serious fixed-column-width file handling to do.
"Serious" = multiple record types in each of multiple file types, records up to 1000 bytes, the layout-definer and "opposing" producer/consumer is a government department with attitude, layout changes result in unused columns, up to a million records in a file, ...
Features: Precompiles the struct formats. Ignores unwanted columns. Converts input strings to required data types (sketch omits error handling). Converts records to object instances (or dicts, or named tuples if you prefer).
Code:
import struct, datetime, io, pprint
# functions for converting input fields to usable data
cnv_text = rstrip
cnv_int = int
cnv_date_dmy = lambda s: datetime.datetime.strptime(s, "%d%m%Y") # ddmmyyyy
# etc
# field specs (field name, start pos (1-relative), len, converter func)
fieldspecs = [
('surname', 11, 20, cnv_text),
('given_names', 31, 20, cnv_text),
('birth_date', 51, 8, cnv_date_dmy),
('start_date', 71, 8, cnv_date_dmy),
]
fieldspecs.sort(key=lambda x: x[1]) # just in case
# build the format for struct.unpack
unpack_len = 0
unpack_fmt = ""
for fieldspec in fieldspecs:
start = fieldspec[1] - 1
end = start + fieldspec[2]
if start > unpack_len:
unpack_fmt += str(start - unpack_len) + "x"
unpack_fmt += str(end - start) + "s"
unpack_len = end
field_indices = range(len(fieldspecs))
print unpack_len, unpack_fmt
unpacker = struct.Struct(unpack_fmt).unpack_from
class Record(object):
pass
# or use named tuples
raw_data = """\
....v....1....v....2....v....3....v....4....v....5....v....6....v....7....v....8
Featherstonehaugh Algernon Marmaduke 31121969 01012005XX
"""
f = cStringIO.StringIO(raw_data)
headings = f.next()
for line in f:
# The guts of this loop would of course be hidden away in a function/method
# and could be made less ugly
raw_fields = unpacker(line)
r = Record()
for x in field_indices:
setattr(r, fieldspecs[x][0], fieldspecs[x][3](raw_fields[x]))
pprint.pprint(r.__dict__)
print "Customer name:", r.given_names, r.surname
Output:
78 10x20s20s8s12x8s
{'birth_date': datetime.datetime(1969, 12, 31, 0, 0),
'given_names': 'Algernon Marmaduke',
'start_date': datetime.datetime(2005, 1, 1, 0, 0),
'surname': 'Featherstonehaugh'}
Customer name: Algernon Marmaduke Featherstonehaugh
> str = '1234567890'
> w = [0,2,5,7,10]
> [ str[ w[i-1] : w[i] ] for i in range(1,len(w)) ]
['12', '345', '67', '890']
This is how I solved with a dictionary that contains where fields start and end. Giving start and end points helped me to manage changes at the length of the column also.
# fixed length
# '---------- ------- ----------- -----------'
line = '20.06.2019 myname active mydevice '
SLICES = {'date_start': 0,
'date_end': 10,
'name_start': 11,
'name_end': 18,
'status_start': 19,
'status_end': 30,
'device_start': 31,
'device_end': 42}
def get_values_as_dict(line, SLICES):
values = {}
key_list = {key.split("_")[0] for key in SLICES.keys()}
for key in key_list:
values[key] = line[SLICES[key+"_start"]:SLICES[key+"_end"]].strip()
return values
>>> print (get_values_as_dict(line,SLICES))
{'status': 'active', 'name': 'myname', 'date': '20.06.2019', 'device': 'mydevice'}
Here's a simple module for Python 3, based on John Machin's answer - adapt as needed :)
"""
fixedwidth
Parse and iterate through a fixedwidth text file, returning record objects.
Adapted from https://stackoverflow.com/a/4916375/243392
USAGE
import fixedwidth, pprint
# define the fixed width fields we want
# fieldspecs is a list of [name, description, start, width, type] arrays.
fieldspecs = [
["FILEID", "File Identification", 1, 6, "A/N"],
["STUSAB", "State/U.S. Abbreviation (USPS)", 7, 2, "A"],
["SUMLEV", "Summary Level", 9, 3, "A/N"],
["LOGRECNO", "Logical Record Number", 19, 7, "N"],
["POP100", "Population Count (100%)", 30, 9, "N"],
]
# define the fieldtype conversion functions
fieldtype_fns = {
'A': str.rstrip,
'A/N': str.rstrip,
'N': int,
}
# iterate over record objects in the file
with open(f, 'rb'):
for record in fixedwidth.reader(f, fieldspecs, fieldtype_fns):
pprint.pprint(record.__dict__)
# output:
{'FILEID': 'SF1ST', 'LOGRECNO': 2, 'POP100': 1, 'STUSAB': 'TX', 'SUMLEV': '040'}
{'FILEID': 'SF1ST', 'LOGRECNO': 3, 'POP100': 2, 'STUSAB': 'TX', 'SUMLEV': '040'}
...
"""
import struct, io
# fieldspec columns
iName, iDescription, iStart, iWidth, iType = range(5)
def get_struct_unpacker(fieldspecs):
"""
Build the format string for struct.unpack to use, based on the fieldspecs.
fieldspecs is a list of [name, description, start, width, type] arrays.
Returns a string like "6s2s3s7x7s4x9s".
"""
unpack_len = 0
unpack_fmt = ""
for fieldspec in fieldspecs:
start = fieldspec[iStart] - 1
end = start + fieldspec[iWidth]
if start > unpack_len:
unpack_fmt += str(start - unpack_len) + "x"
unpack_fmt += str(end - start) + "s"
unpack_len = end
struct_unpacker = struct.Struct(unpack_fmt).unpack_from
return struct_unpacker
class Record(object):
pass
# or use named tuples
def reader(f, fieldspecs, fieldtype_fns):
"""
Wrap a fixedwidth file and return records according to the given fieldspecs.
fieldspecs is a list of [name, description, start, width, type] arrays.
fieldtype_fns is a dictionary of functions used to transform the raw string values,
one for each type.
"""
# make sure fieldspecs are sorted properly
fieldspecs.sort(key=lambda fieldspec: fieldspec[iStart])
struct_unpacker = get_struct_unpacker(fieldspecs)
field_indices = range(len(fieldspecs))
for line in f:
raw_fields = struct_unpacker(line) # split line into field values
record = Record()
for i in field_indices:
fieldspec = fieldspecs[i]
fieldname = fieldspec[iName]
s = raw_fields[i].decode() # convert raw bytes to a string
fn = fieldtype_fns[fieldspec[iType]] # get conversion function
value = fn(s) # convert string to value (eg to an int)
setattr(record, fieldname, value)
yield record
if __name__=='__main__':
# test module
import pprint, io
# define the fields we want
# fieldspecs are [name, description, start, width, type]
fieldspecs = [
["FILEID", "File Identification", 1, 6, "A/N"],
["STUSAB", "State/U.S. Abbreviation (USPS)", 7, 2, "A"],
["SUMLEV", "Summary Level", 9, 3, "A/N"],
["LOGRECNO", "Logical Record Number", 19, 7, "N"],
["POP100", "Population Count (100%)", 30, 9, "N"],
]
# define a conversion function for integers
def to_int(s):
"""
Convert a numeric string to an integer.
Allows a leading ! as an indicator of missing or uncertain data.
Returns None if no data.
"""
try:
return int(s)
except:
try:
return int(s[1:]) # ignore a leading !
except:
return None # assume has a leading ! and no value
# define the conversion fns
fieldtype_fns = {
'A': str.rstrip,
'A/N': str.rstrip,
'N': to_int,
# 'N': int,
# 'D': lambda s: datetime.datetime.strptime(s, "%d%m%Y"), # ddmmyyyy
# etc
}
# define a fixedwidth sample
sample = """\
SF1ST TX04089000 00000023748 1
SF1ST TX04090000 00000033748! 2
SF1ST TX04091000 00000043748!
"""
sample_data = sample.encode() # convert string to bytes
file_like = io.BytesIO(sample_data) # create a file-like wrapper around bytes
# iterate over record objects in the file
for record in reader(file_like, fieldspecs, fieldtype_fns):
# print(record)
pprint.pprint(record.__dict__)
Here is what NumPy uses under the hood (much much simplified, but still - this code is found in the LineSplitter class within the _iotools module):
import numpy as np
DELIMITER = (20, 10, 10, 20, 10, 10, 20)
idx = np.cumsum([0] + list(DELIMITER))
slices = [slice(i, j) for (i, j) in zip(idx[:-1], idx[1:])]
def parse(line):
return [line[s] for s in slices]
It does not handle negative delimiters for ignoring column so it is not as versatile as struct, but it is faster.
Because my old work often handles 1 million lines of fixwidth data, I did research on this issue when I started using Python.
There are 2 types of FixedWidth
ASCII FixedWidth (ascii character length = 1, double-byte encoded character length = 2)
Unicode FixedWidth (ascii character & double-byte encoded character length = 1)
If the resource string is all composed of ascii characters, then ASCII FixedWidth = Unicode FixedWidth
Fortunately, string and byte are different in py3, which reduces a lot of confusion when dealing with double-byte encoded characters (e.g.gbk, big5, euc-jp, shift-jis, etc.).
For the processing of "ASCII FixedWidth", the String is usually converted to Bytes and then split.
Without importing third-party modules
totalLineCount = 1 million, lineLength = 800 byte , FixedWidthArgs=(10,25,4,....), I split the Line in about 5 ways and get the following conclusion:
struct is the fastest (1x)
Loop only, not pre-processing FixedWidthArgs is the slowest (5x+)
slice(bytes) is faster than slice(string)
The source string is the bytes test result: struct(1x) , operator.itemgetter(1.7x) , precompiled sliceObject & list comprehensions(2.8x), re.patten object (2.9x)
When dealing with large files, we often use with open ( file, "rb") as f:.
The method traverses one of the above files, about 2.4 second.
I think the appropriate handler, which processes 1 million rows of data, splits each row into 20 fields and takes less than 2.4 seconds.
I only find that stuct and itemgetter meet the requirements
ps: For normal display, I converted unicode str to bytes.
If you are in a double-byte environment, you don't need to do this.
from itertools import accumulate
from operator import itemgetter
def oprt_parser(sArgs):
sum_arg = tuple(accumulate(abs(i) for i in sArgs))
# Negative parameter field index
cuts = tuple(i for i,num in enumerate(sArgs) if num < 0)
# Get slice args and Ignore fields of negative length
ig_Args = tuple(item for i, item in enumerate(zip((0,)+sum_arg,sum_arg)) if i not in cuts)
# Generate `operator.itemgetter` object
oprtObj =itemgetter(*[slice(s,e) for s,e in ig_Args])
return oprtObj
lineb = b'abcdefghijklmnopqrstuvwxyz\xb0\xa1\xb2\xbb\xb4\xd3\xb5\xc4\xb6\xee\xb7\xa2\xb8\xf6\xba\xcd0123456789'
line = lineb.decode("GBK")
# Unicode Fixed Width
fieldwidthsU = (13, -13, 4, -4, 5,-5) # Negative width fields is ignored
# ASCII Fixed Width
fieldwidths = (13, -13, 8, -8, 5,-5) # Negative width fields is ignored
# Unicode FixedWidth processing
parse = oprt_parser(fieldwidthsU)
fields = parse(line)
print('Unicode FixedWidth','fields: {}'.format(tuple(map(lambda s: s.encode("GBK"), fields))))
# ASCII FixedWidth processing
parse = oprt_parser(fieldwidths)
fields = parse(lineb)
print('ASCII FixedWidth','fields: {}'.format(fields))
line = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\n'
fieldwidths = (2, -10, 24)
parse = oprt_parser(fieldwidths)
fields = parse(line)
print(f"fields: {fields}")
Output:
Unicode FixedWidth fields: (b'abcdefghijklm', b'\xb0\xa1\xb2\xbb\xb4\xd3\xb5\xc4', b'01234')
ASCII FixedWidth fields: (b'abcdefghijklm', b'\xb0\xa1\xb2\xbb\xb4\xd3\xb5\xc4', b'01234')
fields: ('AB', 'MNOPQRSTUVWXYZ0123456789')
oprt_parser is 4x make_parser(list comprehensions + slice)
During the research, it was found that when the cpu speed is faster, it seems that the efficiency of the re method increases faster.
Since I don't have more and better computers to test, provide my test code, if anyone is interested, you can test it with a faster computer.
Run Environment:
os:win10
python: 3.7.2
CPU:amd athlon x3 450
HD:seagate 1T
import timeit
import time
import re
from itertools import accumulate
from operator import itemgetter
def eff2(stmt,onlyNum= False,showResult=False):
'''test function'''
if onlyNum:
rl = timeit.repeat(stmt=stmt,repeat=roundI,number=timesI,globals=globals())
avg = sum(rl) / len(rl)
return f"{avg * (10 ** 6)/timesI:0.4f}"
else:
rl = timeit.repeat(stmt=stmt,repeat=10,number=1000,globals=globals())
avg = sum(rl) / len(rl)
print(f"【{stmt}】")
print(f"\tquick avg = {avg * (10 ** 6)/1000:0.4f} s/million")
if showResult:
print(f"\t Result = {eval(stmt)}\n\t timelist = {rl}\n")
else:
print("")
def upDouble(argList,argRate):
return [c*argRate for c in argList]
tbStr = "000000001111000002222真2233333333000000004444444QAZ55555555000000006666666ABC这些事中文字abcdefghijk"
tbBytes = tbStr.encode("GBK")
a20 = (4,4,2,2,2,3,2,2, 2 ,2,8,8,7,3,8,8,7,3, 12 ,11)
a20U = (4,4,2,2,2,3,2,2, 1 ,2,8,8,7,3,8,8,7,3, 6 ,11)
Slng = 800
rateS = Slng // 100
tStr = "".join(upDouble(tbStr , rateS))
tBytes = tStr.encode("GBK")
spltArgs = upDouble( a20 , rateS)
spltArgsU = upDouble( a20U , rateS)
testList = []
timesI = 100000
roundI = 5
print(f"test round = {roundI} timesI = {timesI} sourceLng = {len(tStr)} argFieldCount = {len(spltArgs)}")
print(f"pure str \n{''.ljust(60,'-')}")
# ==========================================
def str_parser(sArgs):
def prsr(oStr):
r = []
r_ap = r.append
stt=0
for lng in sArgs:
end = stt + lng
r_ap(oStr[stt:end])
stt = end
return tuple(r)
return prsr
Str_P = str_parser(spltArgsU)
# eff2("Str_P(tStr)")
testList.append("Str_P(tStr)")
print(f"pure bytes \n{''.ljust(60,'-')}")
# ==========================================
def byte_parser(sArgs):
def prsr(oBytes):
r, stt = [], 0
r_ap = r.append
for lng in sArgs:
end = stt + lng
r_ap(oBytes[stt:end])
stt = end
return r
return prsr
Byte_P = byte_parser(spltArgs)
# eff2("Byte_P(tBytes)")
testList.append("Byte_P(tBytes)")
# re,bytes
print(f"re compile object \n{''.ljust(60,'-')}")
# ==========================================
def rebc_parser(sArgs,otype="b"):
re_Args = "".join([f"(.{{{n}}})" for n in sArgs])
if otype == "b":
rebc_Args = re.compile(re_Args.encode("GBK"))
else:
rebc_Args = re.compile(re_Args)
def prsr(oBS):
return rebc_Args.match(oBS).groups()
return prsr
Rebc_P = rebc_parser(spltArgs)
# eff2("Rebc_P(tBytes)")
testList.append("Rebc_P(tBytes)")
Rebc_Ps = rebc_parser(spltArgsU,"s")
# eff2("Rebc_Ps(tStr)")
testList.append("Rebc_Ps(tStr)")
print(f"struct \n{''.ljust(60,'-')}")
# ==========================================
import struct
def struct_parser(sArgs):
struct_Args = " ".join(map(lambda x: str(x) + "s", sArgs))
def prsr(oBytes):
return struct.unpack(struct_Args, oBytes)
return prsr
Struct_P = struct_parser(spltArgs)
# eff2("Struct_P(tBytes)")
testList.append("Struct_P(tBytes)")
print(f"List Comprehensions + slice \n{''.ljust(60,'-')}")
# ==========================================
import itertools
def slice_parser(sArgs):
tl = tuple(itertools.accumulate(sArgs))
slice_Args = tuple(zip((0,)+tl,tl))
def prsr(oBytes):
return [oBytes[s:e] for s, e in slice_Args]
return prsr
Slice_P = slice_parser(spltArgs)
# eff2("Slice_P(tBytes)")
testList.append("Slice_P(tBytes)")
def sliceObj_parser(sArgs):
tl = tuple(itertools.accumulate(sArgs))
tl2 = tuple(zip((0,)+tl,tl))
sliceObj_Args = tuple(slice(s,e) for s,e in tl2)
def prsr(oBytes):
return [oBytes[so] for so in sliceObj_Args]
return prsr
SliceObj_P = sliceObj_parser(spltArgs)
# eff2("SliceObj_P(tBytes)")
testList.append("SliceObj_P(tBytes)")
SliceObj_Ps = sliceObj_parser(spltArgsU)
# eff2("SliceObj_Ps(tStr)")
testList.append("SliceObj_Ps(tStr)")
print(f"operator.itemgetter + slice object \n{''.ljust(60,'-')}")
# ==========================================
def oprt_parser(sArgs):
sum_arg = tuple(accumulate(abs(i) for i in sArgs))
cuts = tuple(i for i,num in enumerate(sArgs) if num < 0)
ig_Args = tuple(item for i,item in enumerate(zip((0,)+sum_arg,sum_arg)) if i not in cuts)
oprtObj =itemgetter(*[slice(s,e) for s,e in ig_Args])
return oprtObj
Oprt_P = oprt_parser(spltArgs)
# eff2("Oprt_P(tBytes)")
testList.append("Oprt_P(tBytes)")
Oprt_Ps = oprt_parser(spltArgsU)
# eff2("Oprt_Ps(tStr)")
testList.append("Oprt_Ps(tStr)")
print("|".join([s.split("(")[0].center(11," ") for s in testList]))
print("|".join(["".center(11,"-") for s in testList]))
print("|".join([eff2(s,True).rjust(11," ") for s in testList]))
Output:
Test round = 5 timesI = 100000 sourceLng = 744 argFieldCount = 20
...
...
   Str_P | Byte_P | Rebc_P | Rebc_Ps | Struct_P | Slice_P | SliceObj_P|SliceObj_Ps| Oprt_P | Oprt_Ps
-----------|-----------|-----------|-----------|-- ---------|-----------|-----------|-----------|---- -------|-----------
     9.6315| 7.5952| 4.4187| 5.6867| 1.5123| 5.2915| 4.2673| 5.7121| 2.4713| 3.9051
String slicing doesn't have to be ugly as long as you keep it organized. Consider storing your field widths in a dictionary and then using the associated names to create an object:
from collections import OrderedDict
class Entry:
def __init__(self, line):
name2width = OrderedDict()
name2width['foo'] = 2
name2width['bar'] = 3
name2width['baz'] = 2
pos = 0
for name, width in name2width.items():
val = line[pos : pos + width]
if len(val) != width:
raise ValueError("not enough characters: \'{}\'".format(line))
setattr(self, name, val)
pos += width
file = "ab789yz\ncd987wx\nef555uv"
entry = []
for line in file.split('\n'):
entry.append(Entry(line))
print(entry[1].bar) # output: 987
I like to process text files containing fixed width fields using regular expressions. More specifically, using named capture groups. It's fast, does not require importing large libraries and is quite descriptive and convenient (in my opinion).
I also like the fact that the named capture groups are basically auto-documenting the data format, acting as a sort of data specification, since each capture group can be written to define each fields' name, data type and length.
Here's simple example...
import re
data = [
"1234ABCDEFGHIJ5",
"6789KLMNOPQRST0"
]
record_regex = (
r"^"
r"(?P<firstnumbers>[0-9]{4})"
r"(?P<middletext>[a-zA-Z0-9_\-\s]{10})"
r"(?P<lastnumber>[0-9]{1})"
r"$"
)
records = []
for line in data:
match = re.match(record_regex, line)
if match:
records.append(match.groupdict())
print(records)
...that yields a convenient dictionary of each record:
[
{'firstnumbers': '1234', 'lastnumber': '5', 'middletext': 'ABCDEFGHIJ'},
{'firstnumbers': '6789', 'lastnumber': '0', 'middletext': 'KLMNOPQRST'}
]
Helpful tools, like the online regex tester and debugger, are available if you are not familiar (or comfortable) with Python regular expressions or named capture groups.

Categories