make file still readable after hashing in python - python

I have this function, which builds a new name for an image from the hash of its content:
def get_file_hash(file):
    """
    Build a content-based filename for an uploaded file.

    The whole upload is read and passed to ``hashlib.md5``; the hex digest
    followed by the extension of the original name becomes the new name.
    The stream is left wherever ``file.read()`` leaves it: nothing in this
    function rewinds it.

    Parameters
    ----------
    file : werkzeug.datastructures.FileStorage
        File sent by user.

    Returns
    -------
    str
        MD5 hex digest of the file content plus the original extension.
    """
    # Only the extension of the client-supplied name is reused.
    extension = os.path.splitext(os.path.basename(file.filename))[1]
    digest = hashlib.md5(file.read()).hexdigest()
    return digest + extension
and I'm not passing the last evaluation of this test (the one which says "Check the file content is still readable!"):
def test_get_file_hash(self):
    """Check the hashed filename and that the upload can still be read afterwards."""
    source_path = "tests/dog.jpeg"
    expected_name = "0a7c757a80f2c5b13fa7a2a47a683593.jpeg"
    with open(source_path, "rb") as handle:
        upload = FileStorage(handle)

        # Check the new filename is correct
        actual_name = utils.get_file_hash(upload)
        self.assertEqual(expected_name, actual_name, actual_name)

        # Check the file content is still readable!
        remaining = upload.read()
        self.assertTrue(remaining != b"")
I don't know how to make that happen after I call the function. For context, I call the hash function and then save the image:
# Hash the upload first to get its new name, then save it under that name.
# get_file_hash() reads the upload before save() is called on the same object.
filename = utils.get_file_hash(img_api)
img_api.save("static/uploads/" + filename)
Any thoughts would be very much appreciated,
best

Thank you very much, Mathias! Just in case it helps someone, this is the version of the function that finally passes the test:
def get_file_hash(file):
    """
    Returns a new filename based on the file content using MD5 hashing.

    The upload is fed to ``hashlib.md5`` in fixed-size chunks, so the whole
    file is never held in memory at once. After hashing, the underlying
    stream is rewound to the start so the caller can still read or save it.

    Parameters
    ----------
    file : werkzeug.datastructures.FileStorage
        File sent by user. Must expose ``filename``, ``read(size)`` and a
        seekable ``stream``.

    Returns
    -------
    str
        New filename based on the MD5 hash of the file content, keeping the
        extension of the original filename.
    """
    chunk_size = 64 * 1024
    hasher = hashlib.md5()

    # Only the extension of the client-supplied name is reused.
    extension = os.path.splitext(os.path.basename(file.filename))[1]

    # iter() with a sentinel stops at the first empty read, i.e. end of stream.
    for chunk in iter(lambda: file.read(chunk_size), b""):
        hasher.update(chunk)

    # Hashing consumed the stream; rewind it so the content is readable again.
    file.stream.seek(0)
    return hasher.hexdigest() + extension

Related

Calling argument to function and getting the file hashes

I'm attempting to get the hashes of the file which is the argument supplied. Here is my current code:
import hashlib
import argparse
md5 = hashlib.md5()
sha1 = hashlib.sha1()
sha256 = hashlib.sha256()
BUF_SIZE = 32768
parse = argparse.ArgumentParser()
parse.add_argument("-test", help = 'testing')
args = parse.parse_args()
# Hash the file at ``hashThis`` with MD5, SHA-1 and SHA-256 and print the hex digests.
# NOTE(review): the default value ``args.test`` is evaluated once, when ``def`` runs.
# NOTE(review): md5 / sha1 / sha256 are module-level objects, so a second call
# would keep accumulating into the same digests.
def hashing(hashThis=args.test):
with open(hashThis, 'rb') as f:
# Read BUF_SIZE bytes at a time until read() returns an empty bytes object.
while True:
data = f.read(BUF_SIZE)
if not data:
break
# NOTE(review): indentation was lost in this listing. The empty-file digests
# reported below mean these update() calls run after the loop has exited,
# when ``data`` is empty; they need to run inside the loop, once per chunk.
md5.update(data)
sha1.update(data)
sha256.update(data)
# print hashes
print('MD5: {0}'.format(md5.hexdigest()))
print('SHA1: {0}'.format(sha1.hexdigest()))
print('SHA256: {0}'.format(sha256.hexdigest()))
hashing(hashThis=args.test)
This gives me the following output:
user@user:~/Testing$ python test.py -test test.txt
MD5: d41d8cd98f00b204e9800998ecf8427e
SHA1: da39a3ee5e6b4b0d3255bfef95601890afd80709
SHA256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
The issue is that the hashes given are for an empty file, by using sha256sum of the same file I get
user@user:~/Testing$ sha256sum test.txt
8f434346648f6b96df89dda901c5176b10a6d83961dd3c1ac88b59b2dc327aa4 test.txt
It's not pulling the data from the file, and it works if I use the same code outside of a function. I feel like I'm missing something obvious, but can't figure it out.
You need to be updating the hash objects within the while loop - right now the while loop only exits once 'data' is empty, so all you hash is that empty byte array

How to get absolute path of the file selected as input file in python?

I want the absolute path of the file selected as input file (from file browser in the form) using the python code below:
# Python 2 code (dict.iteritems() and the print statement).
# Walk every uploaded file in the request and try to hash it by name.
for attr, document in request.files.iteritems():
orig_filename = document.filename
# abspath() only joins the name onto the server's current working directory,
# which is why the printed path is not where the file came from.
print os.path.abspath(orig_filename)
mhash = get_hash_for_doc(orig_filename)
This prints the current working directory (where the Python script is executing) with 'orig_filename' appended to it, which is the wrong path. I am using Python 2.7 and Flask 0.12 on Linux.
The requirement is to find the hash value of the file before uploading it to the server to check deduplication. So I need to use the algorithm by passing the file selected for hashing to another function as:
def get_hash_for_doc(orig_filename):
    """Return the SHA-1 hex digest of the file at ``orig_filename``.

    The file is opened in binary mode and read in chunks of 128 hash blocks,
    so large files are never loaded into memory in one piece.

    Parameters
    ----------
    orig_filename : str
        Path of the file to hash.

    Returns
    -------
    str
        Hexadecimal SHA-1 digest of the file content.

    Raises
    ------
    IOError
        If the file cannot be opened or read.
    """
    digest = sha1()  # md5()
    chunk_size = 128 * digest.block_size
    # Open the path that was passed in; the original opened an undefined name.
    with open(orig_filename, "rb") as f:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
In this function I want to read file from absolute path of the orig_filename before uploading. Avoided all other code checks here.
First, write the uploaded data to a temporary file, then run the hash function on that file:
import os
import tempfile

# Create the temporary file before entering the try block: if mkstemp() itself
# fails there is nothing to clean up, and the finally clause below must never
# run with ``tmp`` unbound.
fd, tmp = tempfile.mkstemp()
try:
    # Binary mode, so the uploaded data is written unchanged
    # (assumes ``file`` is the uploaded file object and read() returns its raw bytes).
    with os.fdopen(fd, 'wb') as out:
        out.write(file.read())
    mhash = get_hash_for_doc(tmp)
finally:
    # Always remove the temporary copy, whether or not hashing succeeded.
    os.unlink(tmp)
If you want to find a folder/file.ext, for an input file, simply use 'os.path.abspath' like:
savefile = os.path.abspath(Myinputfile)
where "Myinputfile" is a variable that contains the relative path and file name, for instance one derived from an argument defined by the user.
But if you prefer to have absolute address of the folder, without file name try this:
saveloc = os.path.dirname(os.path.realpath(Myinputfile))
You can use pathlib to find the absolute path of the selected file.

Determine if variable is an open file pointer, or a string

I'd like to write a function to calculate the md5 hash of a file, where I could supply the function with either a string that indicates the full file path, or an opened file pointer.
Right now, my function only accepts a string:
def getMD5Hash(fname):
    """ Returns an md5 hash of the file at path ``fname``.

    The digest is encoded as URL-safe base64 with the trailing ``=`` padding
    removed. The file is read in chunks of 128 MD5 blocks.

    Returns ``None`` if the file cannot be opened or read (``IOError``).
    """
    md5 = hashlib.md5()
    chunk_sz = md5.block_size * 128
    try:
        with open(fname, 'rb') as fo:
            # iter() with a sentinel stops at the first empty read (end of file).
            for data in iter(lambda: fo.read(chunk_sz), b''):
                md5.update(data)
    except IOError:
        return None
    # The original computed this value but never returned it.
    return base64.urlsafe_b64encode(md5.digest()).decode('UTF-8').rstrip('=\n')
How can I detect if fname is a string or an open file pointer?
Python has several different file-like types (file, StringIO, io.TextIOWrapper, etc.), which makes asking "Is this a file?" difficult. Instead, ask "Is this a string?" and assume that anything that isn't must be a file:
def getMD5Hash(fname):
    """Return the MD5 digest of a file as URL-safe base64 without padding.

    ``fname`` may be either a path string or an already-open file object.
    Anything that is not a ``str`` is treated as a file object; it must yield
    bytes from ``read(size)`` (i.e. be opened in binary mode). A file object
    passed in is read from its current position and is not closed here.

    Returns ``None`` when ``fname`` is a path that cannot be opened or read.
    """
    import base64
    import hashlib

    def digest_stream(fo):
        # Hash the stream in chunks of 128 MD5 blocks until an empty read.
        md5 = hashlib.md5()
        chunk_sz = md5.block_size * 128
        for data in iter(lambda: fo.read(chunk_sz), b''):
            md5.update(data)
        return base64.urlsafe_b64encode(md5.digest()).decode('UTF-8').rstrip('=\n')

    if isinstance(fname, str):
        # It's a string!
        try:
            with open(fname, 'rb') as fo:
                return digest_stream(fo)
        except IOError:
            return None
    else:
        # I guess it's a file, then.
        return digest_stream(fname)

How to read from a text file compressed with 7z?

I would like to read (in Python 2.7), line by line, from a csv (text) file, which is 7z compressed. I don't want to decompress the entire (large) file, but to stream the lines.
I tried pylzma.decompressobj() unsuccessfully. I get a data error. Note that this code doesn't yet read line by line:
# Attempt to decompress the .7z file byte by byte into 'decompressed.raw'.
input_filename = r"testing.csv.7z"
with open(input_filename, 'rb') as infile:
# NOTE(review): this decompressor is replaced two lines below before it is used.
obj = pylzma.decompressobj()
o = open('decompressed.raw', 'wb')
obj = pylzma.decompressobj()
# Feed the input to the decompressor one byte per read() call until EOF.
while True:
tmp = infile.read(1)
if not tmp: break
o.write(obj.decompress(tmp))
o.close()
Output:
o.write(obj.decompress(tmp))
ValueError: data error during decompression
This will allow you to iterate the lines. It's partially derived from some code I found in an answer to another question.
At this point in time (pylzma-0.5.0) the py7zlib module doesn't implement an API that would allow archive members to be read as a stream of bytes or characters — its ArchiveFile class only provides a read() function that decompresses and returns the uncompressed data in a member all at once. Given that, about the best that can be done is return bytes or lines iteratively via a Python generator using that as a buffer.
The following does the latter, but may not help if the problem is the archive member file itself is huge.
The code below should work in Python 3.x as well as 2.7.
import io
import os
import py7zlib
# Error type raised by SevenZFile.readlines() when the requested archive
# member is not found; subclasses py7zlib.ArchiveError.
class SevenZFileError(py7zlib.ArchiveError):
pass
class SevenZFile(object):
    """Thin wrapper around ``py7zlib.Archive7z`` for reading text members.

    Supports ``name in archive`` membership tests and line-by-line iteration
    over a named member via ``readlines()``.
    """

    @classmethod
    def is_7zfile(cls, filepath):
        """ Determine if filepath points to a valid 7z archive.

        Returns True when the file can be opened as an archive and its member
        names listed, False when py7zlib rejects it with an ``ArchiveError``.
        Failure to open the file itself (``IOError``) is not caught.
        """
        is7z = False
        fp = None
        try:
            fp = open(filepath, 'rb')
            archive = py7zlib.Archive7z(fp)
            # Listing the names forces the archive header to be parsed.
            _ = len(archive.getnames())
            is7z = True
        except py7zlib.ArchiveError:
            # Not a readable 7z archive: report False instead of raising.
            is7z = False
        finally:
            if fp:
                fp.close()
        return is7z

    def __init__(self, filepath):
        """Open ``filepath`` and wrap it in a ``py7zlib.Archive7z``."""
        # NOTE(review): the file object is handed to Archive7z and never closed
        # here; presumably it must stay open for later member reads -- confirm.
        fp = open(filepath, 'rb')
        self.filepath = filepath
        self.archive = py7zlib.Archive7z(fp)

    def __contains__(self, name):
        """Return True if ``name`` is one of the archive's member names."""
        return name in self.archive.getnames()

    def readlines(self, name, newline=''):
        r""" Iterator of lines from named archive member.

        `newline` controls how line endings are handled.
        It can be None, '', '\n', '\r', and '\r\n' and works the same way as it does
        in StringIO. Note however that the default value is different and is to enable
        universal newlines mode, but line endings are returned untranslated.

        Raises SevenZFileError if the archive has no member called `name`.
        """
        archivefile = self.archive.getmember(name)
        if not archivefile:
            raise SevenZFileError('archive member %r not found in %r' %
                                  (name, self.filepath))
        # Decompress entire member and return its contents iteratively.
        data = archivefile.read().decode()
        for line in io.StringIO(data, newline=newline):
            yield line
if __name__ == '__main__':
    # Demo: print every row of testing.csv stored inside testing.csv.7z.
    import csv

    if SevenZFile.is_7zfile('testing.csv.7z'):
        sevenZfile = SevenZFile('testing.csv.7z')
        if 'testing.csv' in sevenZfile:
            # csv.reader accepts any iterable of lines, including the generator.
            for row in csv.reader(sevenZfile.readlines('testing.csv')):
                print(', '.join(row))
        else:
            print('testing.csv is not a member of testing.csv.7z')
If you were using Python 3.3+, you might be able to do this using the lzma module which was added to the standard library in that version.
See: lzma Examples
If you can use python 3, there is a useful library, py7zr, which supports partially 7zip decompression as below:
import py7zr
import re

# Extract only the archive members whose names match the pattern.
filter_pattern = re.compile(r'<your/target/file_and_directories/regex/expression>')
# SevenZipFile lives in the py7zr module; only ``py7zr`` itself is imported above.
with py7zr.SevenZipFile('archive.7z', 'r') as archive:
    allfiles = archive.getnames()
    # In a comprehension the filter condition comes after the ``for`` clause.
    selective_files = [f for f in allfiles if filter_pattern.match(f)]
    archive.extract(targets=selective_files)

How do I find the MD5 hash of an ISO file using Python?

I am writing a simple tool that allows me to quickly check MD5 hash values of downloaded ISO files. Here is my algorithm:
import sys
import hashlib
def main():
filename = sys.argv[1] # Path of the ISO file, taken from the first command-line argument
testFile = open(filename, "r") # Opens the ISO file in text mode; nothing is read from it here
# Use hashlib here to find MD5 hash of the ISO 'file'. This is where I'm having problems. NOTE(review): the file object itself, not its bytes, is passed to md5() below
hashedMd5 = hashlib.md5(testFile).hexdigest()
realMd5 = input("Enter the valid MD5 hash: ") # Prompt the user for the valid MD5 hash
if (realMd5 == hashedMd5): # Check if valid
print("GOOD!")
else:
print("BAD!!")
main()
My problem is on the 9th line, where I try to take the MD5 hash of the file. I'm getting "TypeError: object supporting the buffer API required". Could anyone shed some light on how to make this function work?
The object created by hashlib.md5 doesn't take a file object. You need to feed it data a piece at a time, and then request the hash digest.
import hashlib

hasher = hashlib.md5()
# Binary mode so the bytes are hashed unchanged; ``with`` closes the file even
# if reading fails.
with open(filename, "rb") as testFile:
    # Feed the hash one 1024-byte piece at a time; iter() with a sentinel
    # stops at the first empty read, i.e. at end of file.
    for piece in iter(lambda: testFile.read(1024), b""):
        hasher.update(piece)
hex_hash = hasher.hexdigest()
print(hex_hash)  # will produce what you're looking for
You need to read the file:
import sys
import hashlib
def main():
    """Compare the MD5 of the file named on the command line with a typed-in hash.

    Reads the path from ``sys.argv[1]``, hashes the file in 4 MiB chunks,
    prompts for the expected MD5 hex digest and prints "GOOD!" when the two
    strings are equal, "BAD!!" otherwise.
    """
    filename = sys.argv[1]  # Takes the ISO 'file' as an argument in the command line
    m = hashlib.md5()
    # Binary mode so the bytes are hashed unchanged; ``with`` closes the file
    # once hashing is done (the original never closed it).
    with open(filename, "rb") as testFile:
        while True:
            data = testFile.read(4*1024*1024)
            if not data:
                break
            m.update(data)
    hashedMd5 = m.hexdigest()
    realMd5 = input("Enter the valid MD5 hash: ")  # Prompt the user for the valid MD5 hash
    if realMd5 == hashedMd5:  # Check if valid
        print("GOOD!")
    else:
        print("BAD!!")
main()
And you probably need to open the file in binary ("rb") and read the blocks of data in chunks. An ISO file is likely too large to fit in memory.

Categories