Hashing Issue, Non-Text Files - python

My code works fine except for hashing: it hashes text files without a problem, but as soon as it encounters a jpg or another file type, it crashes. I know it's some type of encoding error, but I'm stumped on how to handle non-text files properly.
#import libraries
import os
import time
from datetime import datetime
import logging
import hashlib
from prettytable import PrettyTable
from pathlib import Path
import glob
#user input
path = input("Please enter directory: ")
print("===============================================")
#processing input
if os.path.exists(path):
    print("Processing directory: ", path)
else:
    print("Invalid directory.")
    logging.basicConfig(filename="error.log", level=logging.ERROR)
    logging.error('The directory is not valid, please run the script again with the correct directory.')
print("===============================================")
#process directory
directory = Path(path)
paths = []
filename = []
size = []
hashes = []
modified = []
files = list(directory.glob('**/*.*'))
for file in files:
    paths.append(file.parents[0])
    filename.append(file.parts[-1])
    size.append(file.stat().st_size)
    modified.append(datetime.fromtimestamp(file.stat().st_mtime))
    with open(file) as f:
        hashes.append(hashlib.md5(f.read().encode()).hexdigest())
#output into table
report = PrettyTable()
column_names = ['Path', 'File Name', 'File Size', 'Last Modified Time', 'MD5 Hash']
report.add_column(column_names[0], paths)
report.add_column(column_names[1], filename)
report.add_column(column_names[2], size)
report.add_column(column_names[3], modified)
report.add_column(column_names[4], hashes)
report.sortby = 'File Size'
print(report)

Change the following lines
with open(file) as f:
    hashes.append(hashlib.md5(f.read().encode()).hexdigest())
to
with open(file, "rb") as f:
hashes.append(hashlib.md5(f.read()).hexdigest())
This way you read the contents directly as bytes and calculate the hash from them.
Your version tried to read the file as text and re-encode it to bytes.
Reading a file as text means the code tries to decode it with the system's default encoding. For some byte sequences this fails, because they are not valid code points in that encoding.
So just read everything directly as bytes.
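If some of the files are large, a common refinement is to hash in chunks instead of reading the whole file into memory at once. A minimal sketch (the helper name and chunk size are just illustrative, not from the original code):
import hashlib

def md5_of_file(path, chunk_size=65536):
    # Read in binary chunks so even huge files can be hashed
    # without loading everything into memory at once.
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()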

Related

Avoid date changes in Zipfile.write

Looking at the Zipfile module, I'm trying to figure out why the content of a zipfile changes when I recreate a file with the same content.
Here's a sample code I'm working on:
import os
import hashlib
import zipfile
from io import BytesIO

FILE_PATH = './'
SAMPLE_FILE = "zip_test123.txt"

# create an empty file
new_file = FILE_PATH + "/" + SAMPLE_FILE
try:
    open(new_file, 'x')
except FileExistsError:
    os.remove(new_file)
    open(new_file, 'x')
full_path = os.path.expanduser(FILE_PATH)

# zip it
data = BytesIO()
with zipfile.ZipFile(data, mode='w') as zf:
    zf.write(os.path.join(full_path, SAMPLE_FILE), SAMPLE_FILE)
zip_cntn = data.getvalue()
data.close()

print(zip_cntn)
print(hashlib.md5(zip_cntn).hexdigest())
This first creates an empty file, then zips it and prints the hash of the zipped data.
Running this multiple times results in different contents/hashes, which I think is caused by the modification date (my assumption is based on this, which shows the Modified date as well).
I'm only interested in zipping the actual contents, not anything else (e.g. the hash should stay the same if I recreate the same content for a given file).
Any suggestions on how to achieve this goal / ignore the extra info while archiving a file?
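No accepted answer is shown here, but one plausible approach is to write the bytes yourself through a zipfile.ZipInfo with a pinned timestamp, so the archive depends only on the name and content. A sketch (the helper name is illustrative; 1980-01-01 is the earliest date the ZIP format can store):
import zipfile
from io import BytesIO

def zip_with_fixed_date(arcname, payload):
    # Pin date_time so the archive bytes depend only on name + content.
    data = BytesIO()
    with zipfile.ZipFile(data, mode='w') as zf:
        info = zipfile.ZipInfo(arcname, date_time=(1980, 1, 1, 0, 0, 0))
        zf.writestr(info, payload)
    return data.getvalue()
Hashing zip_with_fixed_date(SAMPLE_FILE, b'') should then be stable across runs.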

How to convert .docx to .txt in Python

I would like to convert a large batch of MS Word files into plain text format. I have no idea how to do it in Python. I found the following code online. My path is local and all file names are like cx-xxx (e.g. c1-000, c1-001, c2-000, c2-001, etc.):
from docx import Document
import io
import shutil
import os

def convertDocxToText(path):
    for d in os.listdir(path):
        fileExtension = d.split(".")[-1]
        if fileExtension == "docx":
            docxFilename = path + d
            print(docxFilename)
            document = Document(docxFilename)
            textFilename = path + d.split(".")[0] + ".txt"
            with io.open(textFilename, "w", encoding="utf-8") as textFile:
                for para in document.paragraphs:
                    textFile.write(para.text)

path = "/home/python/resumes/"
convertDocxToText(path)
Convert docx to txt with pypandoc:
import pypandoc
# Example file:
docxFilename = 'somefile.docx'
output = pypandoc.convert_file(docxFilename, 'plain', outputfile="somefile.txt")
assert output == ""
See the official documentation here:
https://pypi.org/project/pypandoc/
You can also use the library docx2txt in Python. Here's an example:
I use glob to iterate over all DOCX files in the folder.
Note: I use a little slicing on the original name in order to re-use it in the TXT filename.
If there's anything I've forgotten to explain, tag me and I'll edit it in.
import docx2txt
import glob

directory = glob.glob('C:/folder_name/*.docx')
for file_name in directory:
    with open(file_name, 'rb') as infile:
        with open(file_name[:-5] + '.txt', 'w', encoding='utf-8') as outfile:
            doc = docx2txt.process(infile)
            outfile.write(doc)
print("=========")
print("All done!")
GroupDocs.Conversion Cloud SDK for Python supports conversion of 50+ file formats. Its free plan provides 150 free API calls monthly.
# Import module
import groupdocs_conversion_cloud
from shutil import copyfile

# Get your client_id and client_key at https://dashboard.groupdocs.cloud (free registration is required).
client_id = "xxxxx-xxxx-xxxx-xxxx-xxxxxxxx"
client_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx"

# Create instance of the API
convert_api = groupdocs_conversion_cloud.ConvertApi.from_keys(client_id, client_key)

try:
    # Convert DOCX to TXT
    # Prepare request
    request = groupdocs_conversion_cloud.ConvertDocumentDirectRequest("txt", "C:/Temp/sample.docx")
    # Convert
    result = convert_api.convert_document_direct(request)
    copyfile(result, 'C:/Temp/sample.txt')
except groupdocs_conversion_cloud.ApiException as e:
    print("Exception when calling get_supported_conversion_types: {0}".format(e.message))

How to read Arabic text from PDF using Python script

I have code written in Python that reads PDF files and converts them to text files.
The problem occurred when I tried to read Arabic text from PDF files. I know the error is in the encoding/decoding process, but I don't know how to fix it.
The system converts Arabic PDF files, but the text file is empty
and it displays this error:
Traceback (most recent call last):
  File "C:\Users\test\Downloads\pdf-txt\text maker.py", line 68, in <module>
    f.write(content)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xa9' in position 50: ordinal not in range(128)
Code:
import os
from os import chdir, getcwd, listdir, path
import codecs
import pyPdf
from time import strftime

def check_path(prompt):
    ''' (str) -> str
    Verifies if the provided absolute path does exist.
    '''
    abs_path = raw_input(prompt)
    while path.exists(abs_path) != True:
        print "\nThe specified path does not exist.\n"
        abs_path = raw_input(prompt)
    return abs_path

print "\n"
folder = check_path("Provide absolute path for the folder: ")

list = []
directory = folder
for root, dirs, files in os.walk(directory):
    for filename in files:
        if filename.endswith('.pdf'):
            t = os.path.join(directory, filename)
            list.append(t)
m = len(list)
print(m)

i = 0
while i <= m - 1:
    path = list[i]
    print(path)
    head, tail = os.path.split(path)
    var = "\\"
    tail = tail.replace(".pdf", ".txt")
    name = head + var + tail
    content = ""
    # Load PDF into pyPDF
    pdf = pyPdf.PdfFileReader(file(path, "rb"))
    # Iterate pages
    for j in range(0, pdf.getNumPages()):
        # Extract text from page and add to content
        content += pdf.getPage(j).extractText() + "\n"
    print strftime("%H:%M:%S"), " pdf -> txt "
    f = open(name, 'w')
    content.encode('utf-8')
    f.write(content)
    f.close
    i = i + 1
You have a couple of problems:
content.encode('utf-8') doesn't do anything. The return value is the encoded content, but you have to assign it to a variable. Better yet, open the file with an encoding, and write Unicode strings to that file. content appears to be Unicode data.
Example (works for both Python 2 and 3):
import io
f = io.open(name,'w',encoding='utf8')
f.write(content)
If you don't close the file properly, you may see no content because the file is not flushed to disk. You have f.close not f.close(). It's better to use with, which ensures the file is closed when the block exits.
Example:
import io
with io.open(name, 'w', encoding='utf8') as f:
    f.write(content)
In Python 3, you don't need to import and use io.open, but it still works; the built-in open is equivalent. Python 2 needs the io.open form.
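For reference, the Python 3 spelling with the built-in open looks like this:
with open(name, 'w', encoding='utf8') as f:
    f.write(content)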
You can use another library called pdfplumber instead of pypdf or PyPDF2:
import pdfplumber
import arabic_reshaper
from bidi.algorithm import get_display

with pdfplumber.open(r'example.pdf') as pdf:
    my_page = pdf.pages[10]
    thepages = my_page.extract_text()
    reshaped_text = arabic_reshaper.reshape(thepages)
    bidi_text = get_display(reshaped_text)
    print(bidi_text)

Python Converting String to jpg file

I have a file with strings that were pulled out of our HR system; they are images of people who work for our company. I wrote the following code to convert these strings into .jpg files.
d is the name of the new file and x is the image string. I have printed both of these variables and they seem correct. The file saves and is 71KB, but when I open it in Paint it says it "cannot read this file. This is not a valid bitmap file, or its format is not currently supported."
I opened it with Photos and it just said it "can't open this file." Are you able to see any issue with the code?
import csv
import base64
import tkinter as tk
from tkinter import filedialog

root = tk.Tk()
root.withdraw()
file_path = filedialog.askopenfilename()
with open(file_path, 'r') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    next(readCSV)
    for line in readCSV:
        d = line[0]
        x = line[1]
        y = base64.encodebytes(x.encode())
        with open("C:\\%s.jpg" % (d), "wb") as fh:
            fh.write(base64.decodebytes(y))
            fh.close()
        break
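No answer was posted for this one, but note that base64.encodebytes(x.encode()) followed by base64.decodebytes(...) simply round-trips back to the original string, so the .jpg file ends up containing base64 text rather than image bytes. Assuming x is already a base64-encoded image string (which an HR export would suggest), a minimal sketch of the likely fix:
import base64

# Assumption: x already holds a base64-encoded image string.
# Decode it once and write the raw bytes; do not re-encode first.
with open("C:\\%s.jpg" % d, "wb") as fh:
    fh.write(base64.decodebytes(x.encode()))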

DBF - encoding cp1250

I have a dbf database encoded in cp1250 and I am reading it using the following code:
import csv
from dbfpy import dbf
import os
import sys

filename = sys.argv[1]
if filename.endswith('.dbf'):
    print "Converting %s to csv" % filename
    csv_fn = filename[:-4] + ".csv"
    with open(csv_fn, 'wb') as csvfile:
        in_db = dbf.Dbf(filename)
        out_csv = csv.writer(csvfile)
        names = []
        for field in in_db.header.fields:
            names.append(field.name)
        #out_csv.writerow(names)
        for rec in in_db:
            out_csv.writerow(rec.fieldData)
        in_db.close()
    print "Done..."
else:
    print "Filename does not end with .dbf"
The problem is that the final csv file is wrong: its encoding is ANSI and some characters are corrupted. Could you help me figure out how to read the dbf file correctly?
EDIT 1
I tried different code from https://pypi.python.org/pypi/simpledbf/0.2.4, but there is an error.
Source 2:
from simpledbf import Dbf5
import os
import sys

dbf = Dbf5('test.dbf', codec='cp1250')
dbf.to_csv('junk.csv')
Output:
python program2.py
Traceback (most recent call last):
  File "program2.py", line 5, in <module>
    dbf = Dbf5('test.dbf', codec='cp1250')
  File "D:\ProgramFiles\Anaconda\lib\site-packages\simpledbf\simpledbf.py", line 557, in __init__
    assert terminator == b'\r'
AssertionError
I really don't know how to solve this problem.
Try using my dbf library:
import dbf
with dbf.Table('test.dbf') as table:
    dbf.export(table, 'junk.csv')
I wrote simpledbf. The line that is causing you problems was from some testing I was doing when developing the module. First of all, you might want to update your installation, as 0.2.6 is the most recent. Then you can try removing that particular line (#557) from the file "D:\ProgramFiles\Anaconda\lib\site-packages\simpledbf\simpledbf.py". If that doesn't work, you can ping me at the GitHub repo for simpledbf, or you could try Ethan's suggestion for the dbf module.
You can decode and encode as necessary. dbfpy assumes strings are utf-8 encoded, so you can decode them and then re-encode with the right encoding.
import csv
from dbfpy import dbf
import os
import sys

filename = sys.argv[1]
if filename.endswith('.dbf'):
    print "Converting %s to csv" % filename
    csv_fn = filename[:-4] + ".csv"
    with open(csv_fn, 'wb') as csvfile:
        in_db = dbf.Dbf(filename)
        out_csv = csv.writer(csvfile)
        names = []
        for field in in_db.header.fields:
            names.append(field.name)
        #out_csv.writerow(names)
        for rec in in_db:
            # re-encode string fields from utf8 to cp1250 before writing
            row = [i.decode('utf8').encode('cp1250') if isinstance(i, str) else i for i in rec.fieldData]
            out_csv.writerow(row)
        in_db.close()
    print "Done..."
else:
    print "Filename does not end with .dbf"
