I'm trying to diff three files in Python on Linux.
I've got three files (a, b, c):
/work/a
/work/b
/work/c
The a file consists of absolute paths of some hex files, for example:
user/work/test0/.../bin0.hex
user/work/test0/.../bin1.hex
user/work/test0/.../bin2.hex
user/work/test0/.../bin3.hex
...
The b file likewise consists of absolute paths of hex files, for example:
user/work/test1/.../bin0.hex
user/work/test1/.../bin1.hex
user/work/test1/.../bin2.hex
user/work/test1/.../bin3.hex
...
The c file likewise consists of absolute paths of hex files, for example:
user/work/test2/.../bin0.hex
user/work/test2/.../bin1.hex
user/work/test2/.../bin2.hex
user/work/test2/.../bin3.hex
...
and each hex file contains a list of lines such as
[ 0] A4B232
[ 1] 14B2F2
[ 2] 1472F1
...
I want to diff each triple of hex files listed in the a, b, and c files, so I started coding as below in Python. So far, I've successfully saved the data into the global variables.
arr_s_line1 = []
arr_s_line2 = []
arr_s_line3 = []
#def dfile():
#    with open('a') as f1:
#        f_lines1 = f1.read().splitlines()
#        for f_line1 in f_lines1:
#            with open(f_line1) as f2:
#                s_lines1 = f2.read().splitlines()
#                for s_line1 in s_lines1:
#                    arr_s_line1.append(s_line1)
def prtf():
    with open('a') as fprtfa:
        linesa = fprtfa.read().splitlines()
    with open('b') as fprtfb:
        linesb = fprtfb.read().splitlines()
    with open('c') as fprtfc:
        linesc = fprtfc.read().splitlines()
    for linea in linesa:
        with open(linea) as fa:
            s_linesa = fa.read().splitlines()
        for s_linea in s_linesa:
            arr_s_line1.append(s_linea)
    for lineb in linesb:
        with open(lineb) as fb:
            s_linesb = fb.read().splitlines()
        for s_lineb in s_linesb:
            arr_s_line2.append(s_lineb)
    for linec in linesc:
        with open(linec) as fc:
            s_linesc = fc.read().splitlines()
        for s_linec in s_linesc:
            arr_s_line3.append(s_linec)

if __name__ == "__main__":
    prtf()
Now I want to compare arr_s_line1[i], arr_s_line2[i], and arr_s_line3[i] to know whether there is a mismatch or not, and if there is a mismatch, I want to print the name of the mismatched file and the line where it occurs. How do I diff across multiple files read this way in Python?
In particular, the problem with this approach is that running it over a large number of files requires a lot of memory, which I want to avoid.
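Here is a minimal Python 3 sketch of a streaming alternative, assuming the three list files pair up their hex files line by line and that matching lines (including the bracketed index prefix) should be identical. It walks each triple of hex files in parallel with itertools.zip_longest, so only one line per file is held in memory at a time instead of three global lists:
from itertools import zip_longest

def diff_hex_lists(list_a, list_b, list_c):
    # Read the three list files; each line is the path of one hex file.
    with open(list_a) as fa, open(list_b) as fb, open(list_c) as fc:
        triples = list(zip(fa.read().splitlines(),
                           fb.read().splitlines(),
                           fc.read().splitlines()))
    for path_a, path_b, path_c in triples:
        # Stream the three hex files line by line rather than
        # accumulating every line in global lists first.
        with open(path_a) as ha, open(path_b) as hb, open(path_c) as hc:
            for lineno, (la, lb, lc) in enumerate(
                    zip_longest(ha, hb, hc, fillvalue=''), start=1):
                if not (la == lb == lc):
                    print('mismatch at line %d of %s / %s / %s'
                          % (lineno, path_a, path_b, path_c))

diff_hex_lists('a', 'b', 'c')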
I have python 2 code that works:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
from os import path
filename = "test.bin" # file contents in hex: 57 58 59 5A 12 00 00 00 4E 44
ID = 4
myfile = open(filename, 'rb')
filesize = path.getsize(filename)
data = list(myfile.read(filesize))
myfile.close()
temp_ptr = data[ID:ID+2]
pointer = int(''.join(reversed(temp_ptr)).encode('hex'), 16)
print(pointer)
Prints "18"
However, it does not work in python 3. I get:
Traceback (most recent call last):
File "py2vs3.py", line 13, in <module>
ptr = int(''.join(reversed(temp_ptr)).encode('hex'), 16)
TypeError: sequence item 0: expected str instance, int found
I am simply grabbing one 16-bit field from a file and printing it the way C would see it. How do I make this work in Python 3? All the code examples I find are for Python 2, and the docs make no sense to me.
Python 3 distinguishes between binary and text I/O. Files opened in binary mode (with 'b' in the mode argument) return their contents as bytes objects without any decoding; see https://docs.python.org/3/library/functions.html#open.
I have imitated your example inline below instead of reading from a file.
# Python 2
frame = "\x57\x58\x59\x5A\x12\x00\x00\x00\x4E\x44"
int(''.join(reversed(frame[4:6])).encode('hex'), 16)
# Result is 18
The same thing in Python 3:
# Python 3
# The b'' prefix signifies a bytes literal, the same type
# returned when reading from a file in binary mode
frame = b"\x57\x58\x59\x5A\x12\x00\x00\x00\x4E\x44"
int.from_bytes(frame[4:6], "little")
# The 2nd argument is the byte order: "little" means the least
# significant byte comes first; more details in the link below
# Result is 18
https://docs.python.org/3/library/stdtypes.html#int.from_bytes has more information about the method
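If you prefer the struct module, here is a small sketch that reads the same little-endian unsigned 16-bit field and works in both Python 2 and 3:
import struct

frame = b"\x57\x58\x59\x5A\x12\x00\x00\x00\x4E\x44"
# '<H' is a little-endian unsigned 16-bit integer; 4 is the byte offset
(pointer,) = struct.unpack_from('<H', frame, 4)
print(pointer)  # 18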
As Mad Wombat commented, Python 3 reads a binary file as bytes rather than str, so iterating over the data yields integers instead of one-character strings. The following snippet essentially synthesizes the Python 2 behaviour:
data = [chr(b) for b in myfile.read()] + ['\n']
I have seen this question: How to read first 2 rows of csv from Google Cloud Storage
But in my case, I don't want to load the whole CSV blob into memory, as it could be huge. Is there any way to open it as some iterable (or file-like object) and read only the bytes of the first couple of lines?
I wanted to expand on simzes's answer with an example of how to create an iterable when we do not know the size of the CSV header. It could also be useful for reading a CSV from storage line by line:
import csv
from typing import Generator

from google.cloud import storage

def get_csv_header(blob):
    for line in csv.reader(blob_lines(blob)):
        return line

# How many bytes of the blob to download in one request.
# Selected experimentally. If there is a more optimal value for this - please update.
BLOB_CHUNK_SIZE = 2000

def blob_lines(blob: storage.blob.Blob) -> Generator[str, None, None]:
    position = 0
    buff = []
    while True:
        chunk = blob.download_as_string(start=position, end=position + BLOB_CHUNK_SIZE).decode()
        if '\n' in chunk:
            part1, part2 = chunk.split('\n', 1)
            buff.append(part1)
            yield ''.join(buff)
            parts = part2.split('\n')
            for part in parts[:-1]:
                yield part
            buff = [parts[-1]]
        else:
            buff.append(chunk)
        position += BLOB_CHUNK_SIZE + 1  # blob chunk is downloaded using a closed interval
        if len(chunk) < BLOB_CHUNK_SIZE:
            yield ''.join(buff)
            return
The API for google.cloud.storage.blob.Blob specifies that the download_as_string method has start and end keywords that provide byte ranges:
https://googleapis.github.io/google-cloud-python/latest/storage/blobs.html#google.cloud.storage.blob.Blob
EDIT: download_as_string is deprecated in favor of download_as_bytes.
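For illustration, here is a minimal sketch of using those byte-range keywords directly (the bucket and file names are placeholders):
from google.cloud import storage

def first_line(bucket_name, blob_name, chunk_size=1024):
    # Download successive byte ranges until a newline shows up;
    # the `end` argument is inclusive.
    blob = storage.Client().bucket(bucket_name).blob(blob_name)
    data = b''
    start = 0
    while b'\n' not in data:
        chunk = blob.download_as_bytes(start=start, end=start + chunk_size - 1)
        data += chunk
        if len(chunk) < chunk_size:  # reached the end of the blob
            break
        start += chunk_size
    return data.split(b'\n', 1)[0].decode()

print(first_line('my-bucket', 'my-file.csv'))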
If you have gsutil installed on your machine:
import subprocess
uri = 'gs://my-bucket/my-file.txt'
input_file_columns = subprocess.getoutput(f'gsutil cp {uri} - | head -1')
You can also use a tool called gcsfs (pip install gcsfs)
>>> import gcsfs
>>> fs = gcsfs.GCSFileSystem(project='my-google-project')
>>> fs.ls('my-bucket')
['my-file.txt']
>>> fs.read_block('gs://my-bucket/my-file.txt', offset=1000, length=10, delimiter=b'\n')
b'A whole line of text\n'
GCSFS also has a head method.
https://gcsfs.readthedocs.io/en/latest/
I'm trying to compare the data within two files, and retrieve a list of offsets of where the differences are.
I tried it on some text files and it worked quite well.
However, on non-text files that still contain ASCII text (executables and so on), which I'll call binary data files, it seems to think some bytes are the same even though, looking at them in a hex editor, they are obviously not. I tried printing out the binary data that it thinks is the same, and I get blank lines where it should be printed.
Thus, I think this is the source of the problem.
So what is the best way to compare bytes of data that could be both binary and contain ASCII text? I thought using the struct module might be a starting point...
As you can see below, I compare the bytes with the == operator.
Here's the code:
import os
import math

#file1 = 'file1.txt'
#file2 = 'file2.txt'
file1 = 'file1.exe'
file2 = 'file2.exe'

file1size = os.path.getsize(file1)
file2size = os.path.getsize(file2)
a = file1size - file2size
end = file1size  # if they are both the same size
if a > 0:
    # file 2 is smallest
    end = file2size
    big = file1size
elif a < 0:
    # file 1 is smallest
    end = file1size
    big = file2size

f1 = open(file1, 'rb')
f2 = open(file2, 'rb')
readSize = 500
r = readSize
off = 0
data = []
looking = False
d = open('data.txt', 'w')

while off < end:
    f1.seek(off)
    f2.seek(off)
    b1, b2 = f1.read(r), f2.read(r)
    same = b1 == b2
    print ''
    if same:
        print 'Same at: ' + str(off)
        print 'readSize: ' + str(r)
        print b1
        print b2
        print ''
        # save offsets of the section of "different" bytes
        #data.append([diffOff, diffOff+off-1])  # [begin diff off, end diff off]
        if looking:
            d.write(str(diffOff) + " => " + str(diffOff + off - 2) + "\n")
            looking = False
            r = readSize
            off = off + 1
        else:
            off = off + r
    else:
        if r == 1:
            looking = True
            diffOff = off
            off = off + 1  # continue reading 1 at a time, until you find a same reading
        r = 1  # it will shoot back to the last off, since we didn't increment it here

d.close()
f1.close()
f2.close()

# add the diff ending portion to diff data offs, if 1 file is longer than the other
a = int(math.fabs(a))  # get abs val of diff
if a:
    data.append([big - a, big - 1])
print data
Did you try the difflib and filecmp modules?

This module provides classes and functions for comparing sequences. It can be used, for example, for comparing files, and can produce difference information in various formats, including HTML and context and unified diffs. For comparing directories and files, see also the filecmp module.

The filecmp module defines functions to compare files and directories, with various optional time/correctness trade-offs. For comparing files, see also the difflib module.
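For a plain same-or-different check with no offsets, filecmp alone would do; a minimal sketch:
import filecmp

# shallow=False forces a byte-by-byte comparison instead of
# comparing only the os.stat() signatures
same = filecmp.cmp('file1.exe', 'file2.exe', shallow=False)
print(same)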
You are probably encountering encoding/decoding problems. Someone may suggest a better solution, but you could try reading the file into a bytearray so you're reading raw bytes instead of decoded characters:
Here's a crude example:
$ od -Ax -tx1 /tmp/aa
000000 e0 b2 aa 0a
$ od -Ax -tx1 /tmp/bb
000000 e0 b2 bb 0a
$ cat /tmp/diff.py
a = bytearray(open('/tmp/aa', 'rb').read())
b = bytearray(open('/tmp/bb', 'rb').read())
print "%02x, %02x" % (a[2], a[3])
print "%02x, %02x" % (b[2], b[3])
$ python /tmp/diff.py
aa, 0a
bb, 0a
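Building on this, here is a minimal Python 3 sketch that collects the offsets of differing bytes; since indexing a bytes object in Python 3 yields integers, the comparison never involves decoded text:
def diff_offsets(path1, path2, chunk_size=64 * 1024):
    # Yield the offsets at which the two files differ, including
    # the tail of the longer file.
    offset = 0
    with open(path1, 'rb') as f1, open(path2, 'rb') as f2:
        while True:
            b1 = f1.read(chunk_size)
            b2 = f2.read(chunk_size)
            if not b1 and not b2:
                return
            for i in range(max(len(b1), len(b2))):
                c1 = b1[i] if i < len(b1) else None
                c2 = b2[i] if i < len(b2) else None
                if c1 != c2:
                    yield offset + i
            offset += chunk_size
Usage would be, for example, list(diff_offsets('file1.exe', 'file2.exe')).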
I have to convert a number of large files (up to 2 GB) from EBCDIC 500 encoding to Latin-1. Since I could only find EBCDIC-to-ASCII converters (dd, recode) and the files contain some additional proprietary character codes, I thought I'd write my own converter.
I have the character mapping so I'm interested in the technical aspects.
This is my approach so far:
# char mapping lookup table
EBCDIC_TO_LATIN1 = {
    0xC1: '41',  # A
    0xC2: '42',  # B
    # and so on...
}

BUFFER_SIZE = 1024 * 64

ebd_file = file(sys.argv[1], 'rb')
latin1_file = file(sys.argv[2], 'wb')

buffer = ebd_file.read(BUFFER_SIZE)
while buffer:
    latin1_file.write(ebd2latin1(buffer))
    buffer = ebd_file.read(BUFFER_SIZE)
ebd_file.close()
latin1_file.close()
This is the function that does the converting:
def ebd2latin1(ebcdic):
    result = []
    for ch in ebcdic:
        result.append(EBCDIC_TO_LATIN1[ord(ch)])
    return ''.join(result).decode('hex')
The question is whether or not this is a sensible approach from an engineering standpoint. Does it have some serious design issues? Is the buffer size OK? And so on...
As for the "proprietary characters" that some don't believe in: each file contains a year's worth of patent documents in SGML format. The patent office used EBCDIC until they switched to Unicode in 2005. So there are thousands of documents within each file. They are separated by some hex values that are not part of any IBM specification; they were added by the patent office. Also, at the beginning of each file there are a few digits in ASCII that tell you about the length of the file. I don't really need that information, but if I want to process the file I have to deal with them.
Also:
$ recode IBM500/CR-LF..Latin1 file.ebc
recode: file.ebc failed: Ambiguous output in step `CR-LF..data'
Thanks for the help so far.
EBCDIC 500, aka Code Page 500, is among Python's encodings, although you link to cp1047, which isn't. Which one are you using, really? Anyway, this works for cp500 (or any other encoding that you have).
from __future__ import with_statement
import sys
from contextlib import nested

BUFFER_SIZE = 16384

with nested(open(sys.argv[1], 'rb'), open(sys.argv[2], 'wb')) as (infile, outfile):
    while True:
        buffer = infile.read(BUFFER_SIZE)
        if not buffer:
            break
        outfile.write(buffer.decode('cp500').encode('latin1'))
This way you shouldn't need to keep track of the mappings yourself.
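Under Python 3, where contextlib.nested is gone, the same streaming loop can be written with a multi-manager with statement; a sketch, still assuming cp500 input (chunking is safe here because cp500 is a single-byte encoding):
import sys

BUFFER_SIZE = 16384

with open(sys.argv[1], 'rb') as infile, open(sys.argv[2], 'wb') as outfile:
    while True:
        buffer = infile.read(BUFFER_SIZE)
        if not buffer:
            break
        # decode each chunk from cp500, re-encode as latin1
        outfile.write(buffer.decode('cp500').encode('latin1'))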
If you set up the table correctly, then you just need to do:
translated_chars = ebcdic.translate(EBCDIC_TO_LATIN1)
where ebcdic contains EBCDIC characters and EBCDIC_TO_LATIN1 is a 256-char string which maps each EBCDIC character to its Latin-1 equivalent. The characters in EBCDIC_TO_LATIN1 are the actual binary values rather than their hex representations. For example, if you are using code page 500, the first 16 bytes of EBCDIC_TO_LATIN1 would be
'\x00\x01\x02\x03\x37\x2D\x2E\x2F\x16\x05\x25\x0B\x0C\x0D\x0E\x0F'
using this reference.
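If you would rather not type the table out by hand, it can be generated from a codec; here is a Python 3 sketch using bytes.maketrans, with cp500 assumed and a placeholder input file name:
# Build the 256-byte translation table from the cp500 codec instead of by hand.
ebcdic_alphabet = bytes(range(256))
latin1_alphabet = ebcdic_alphabet.decode('cp500').encode('latin1', 'replace')
EBCDIC_TO_LATIN1 = bytes.maketrans(ebcdic_alphabet, latin1_alphabet)

with open('input.ebc', 'rb') as f:  # 'input.ebc' is a placeholder
    translated = f.read().translate(EBCDIC_TO_LATIN1)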
While this might not help the original poster anymore, some time ago I released a package for Python 2.6+ and 3.2+ that adds most of the western 8-bit mainframe codecs, including CP1047 and CP1141 (German): https://pypi.python.org/pypi/ebcdic. Simply import ebcdic to add the codecs and then use open(..., encoding='cp1047') to read or write files.
Answer 1:
Yet another silly question: what gave you the impression that recode produced only ASCII as output? AFAICT it will transcode ANY of its repertoire of charsets to ANY of its repertoire, AND its repertoire includes IBM cp500 and cp1047, and OF COURSE latin1. Reading the comments, you will note that Lennart and I have discovered that there aren't any "proprietary" codes in those two IBM character sets. So you may well be able to use recode after all, once you are certain what charset you've actually got.
Answer 2:
If you really need/want to transcode IBM cp1047 via Python, you might like to first get the mapping from an authoritative source, processing it via a script with some checks:
URL = "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm"
"""
Sample lines:
<U0000>  \x00 |0
<U0001>  \x01 |0
<U0002>  \x02 |0
<U0003>  \x03 |0
<U0004>  \x37 |0
<U0005>  \x2D |0
"""
import urllib, re

text = urllib.urlopen(URL).read()
regex = r"<U([0-9a-fA-F]{4,4})>\s+\\x([0-9a-fA-F]{2,2})\s"
results = re.findall(regex, text)
wlist = [None] * 256
for result in results:
    unum, inum = [int(x, 16) for x in result]
    assert wlist[inum] is None
    assert 0 <= unum <= 255
    wlist[inum] = chr(unum)
assert not any(x is None for x in wlist)
print repr(''.join(wlist))
Then carefully copy/paste the output into your transcoding script for use with Vinay's buffer.translate(the_mapping) idea, with a buffer size perhaps a bit larger than 16KB and certainly a bit smaller than 2GB :-)
No crystal ball, no info from the OP, so I had a bit of a rummage in the EPO website. I found freely downloadable weekly patent info files, still available in cp500/SGML even though the website says this was to be replaced by utf8/XML in 2006 :-). I got the 2009 week 27 file. It is a zip containing 2 files s350927[ab].bin. "bin" means "not XML". I got the spec! It looks possible that the "proprietary codes" are actually BINARY fields. Each record has a fixed 252-byte header. The first 5 bytes are the record length in EBCDIC, e.g. hex F0F2F2F0F8 -> 2208 bytes. The last 2 bytes of the fixed header are the BINARY length (redundant) of the following variable part. In the middle are several text fields, two 2-byte binary fields, and one 4-byte binary field. The binary fields are serial numbers within groups, but all I saw were 1. The variable part is SGML.
Example (last record from s350927b.bin):
Record number: 7266
pprint of header text and binary slices:
['EPB102055619 TXT00000001',
 1,
 ' 20090701200927 08013627.8 EP20090528NN ',
 1,
 1,
 ' T *lots of spaces snipped*']
Edited version of the rather long SGML:
<PATDOC FILE="08013627.8" CY=EP DNUM=2055619 KIND=B1 DATE=20090701 STATUS=N>
*snip*
<B541>DE<B542>Windschutzeinheit für ein Motorrad
<B541>EN<B542>Windshield unit for saddle-ride type vehicle
<B541>FR<B542>Unité pare-brise pour motocyclette</B540>
*snip*
</PATDOC>
There are no header or trailer records, just this one record format.
So: if the OP's annual files are anything like this, we might be able to help him out.
Update: Above was the "2 a.m. in my timezone" version. Here's a bit more info:
OP said: "at the beginning of each file there are a few digits in ASCII that tell you about the length of the file." ... translate that to "at the beginning of each record there are five digits in EBCDIC that tell you exactly the length of the record" and we have a (very fuzzy) match!
Here is the URL of the documentation page: http://docs.epoline.org/ebd/info.htm
The FIRST file mentioned is the spec.
Here is the URL of the download-weekly-data page: http://ebd2.epoline.org/jsp/ebdst35.jsp
An observation: The data that I looked at is in the ST.35 series. There is also available for download ST.32 which appears to be a parallel version containing only the SGML content (in "reduced cp437/850", one tag per line). This indicates that the fields in the fixed-length header of the ST.35 records may not be very interesting, and can thus be skipped over, which would greatly simplify the transcoding task.
For what it's worth, here is my (investigatory, written after midnight) code:
[Update 2: tidied up the code a little; no functionality changes]
from pprint import pprint as pp
import sys
from struct import unpack

HDRSZ = 252
T = '>s'  # text
H = '>H'  # binary 2 bytes
I = '>I'  # binary 4 bytes
hdr_defn = [
    6, T,
    38, H,
    40, T,
    94, I,
    98, H,
    100, T,
    251, H,  # length of following SGML text
    HDRSZ + 1,
    ]
# above positions as per spec, reduce to allow for counting from 1
for i in xrange(0, len(hdr_defn), 2):
    hdr_defn[i] -= 1

def records(fname, output_encoding='latin1', debug=False):
    xlator = ''.join(chr(i).decode('cp500').encode(output_encoding, 'replace') for i in range(256))
    # print repr(xlator)
    def xlate(ebcdic):
        return ebcdic.translate(xlator)
        # return ebcdic.decode('cp500')  # use this if unicode output desired
    f = open(fname, 'rb')
    recnum = -1
    while True:
        # get header
        buff = f.read(HDRSZ)
        if not buff:
            return  # EOF
        recnum += 1
        if debug: print "\nrecnum", recnum
        assert len(buff) == HDRSZ
        recsz = int(xlate(buff[:5]))
        if debug: print "recsz", recsz
        # split remainder of header into text and binary pieces
        fields = []
        for i in xrange(0, len(hdr_defn) - 2, 2):
            ty = hdr_defn[i + 1]
            piece = buff[hdr_defn[i]:hdr_defn[i + 2]]
            if ty == T:
                fields.append(xlate(piece))
            else:
                fields.append(unpack(ty, piece)[0])
        if debug: pp(fields)
        sgmlsz = fields.pop()
        if debug: print "sgmlsz: %d; expected: %d - %d = %d" % (sgmlsz, recsz, HDRSZ, recsz - HDRSZ)
        assert sgmlsz == recsz - HDRSZ
        # get sgml part
        sgml = f.read(sgmlsz)
        assert len(sgml) == sgmlsz
        sgml = xlate(sgml)
        if debug: print "sgml", sgml
        yield recnum, fields, sgml

if __name__ == "__main__":
    maxrecs = int(sys.argv[1])  # dumping out the last `maxrecs` records in the file
    fname = sys.argv[2]
    keep = [None] * maxrecs
    for recnum, fields, sgml in records(fname):
        # do something useful here
        keep[recnum % maxrecs] = (recnum, fields, sgml)
    keep.sort()
    for k in keep:
        if k:
            recnum, fields, sgml = k
            print
            print recnum
            pp(fields)
            print sgml
Assuming cp500 contains all of your "additional proprietary characters", here is a more concise version based on Lennart's answer, using the codecs module:
import sys, codecs

BUFFER_SIZE = 64 * 1024

ebd_file = codecs.open(sys.argv[1], 'r', 'cp500')
latin1_file = codecs.open(sys.argv[2], 'w', 'latin1')

buffer = ebd_file.read(BUFFER_SIZE)
while buffer:
    latin1_file.write(buffer)
    buffer = ebd_file.read(BUFFER_SIZE)
ebd_file.close()
latin1_file.close()