I've taken over a project for university and am getting to grips with the existing code base. I am encountering an error, as in the title, where I am getting a UnicodeDecodeError when I attempt to run the code. Essentially, the software parses documents for citations and then exports and analyses them, storing them in a JSON and classifying them as academic or non-academic based on keywords.
I have (somewhat?) worked on the issue, where putting "encoding = ''" in the with open() call will fix it - but only with codecs other than utf-8, such as latin1. Also putting "errors = 'ignore'" in this with open() call fixes it. Based on other lines of code, I am sure the codec used is utf-8. I am stumped as to why I am running into this issue and am not quite sure on where to begin to fix it (firstly, having not written the code myself and secondly, I am new to Python having self-taught it for this project).
The error I was previously getting is:
Exception in Tkinter callback
Traceback (most recent call last):
File "C:\Python\Python38\lib\tkinter\__init__.py", line 1883, in __call__
return self.func(*args)
File "C:/path/GUIFrontend.py", line 50, in filepath_submitted
results = return_dict_of_results(filepath)
File "C:/path/GUIFrontend.py", line 131, in return_dict_of_results
results_array = loader_text.return_citation_array()
File "C:\path\DataLoaders\CitationLoaderTxt.py", line 23, in return_citation_array
for string in self.__nonblank_lines__(file):
File "C:\path\DataLoaders\CitationLoaderTxt.py", line 65, in __nonblank_lines__
for l in file:
File "C:\Python\Python38\lib\codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x9c in position 72: invalid start byte
The issue I then run into, when I get the code working, is undecipherable results, such as the below, when I use either of the above to actually get the code running.
{"C:/Users/myname/Downloads/paper-8641.pdf": [{"__class__": "CitationObj", "__module__": "Citations.CitationObj", "title": "\u2020\u2021L\u017e|}\u00fa\u0002Ii8\u203arJUb\u00a3\u001b7\u00fa\u0006\u00a2\u203a\u2021\u203a\u00e8\u00e6\u00db7\u00ef\u00ee\u00de|\u00f1\u00be\u00cco\u0160\u00b0\u00cc\u00b2\u00e4\u00e6\u00ee\u00fe&\u00ceu\u00a8\u00b3\u00fc&/\u00f30/\u00d2\u203a\u00bb\u00e3\u00cd?\u0192\u00f7\u00b7E\u0014\u201e\u00b7{\u2022F\u00c1\u00f7\u00d5`;\u0006\u00ab\u00ee\u00c8\u00c0\u00d7B\u00fb0\u00ce\u00b4\u00b7\u00ff\u00ba\u00fb\u00e3M\u2018\u2021I\u2019\u00dcDa\u201dH'?\u00ab,!\u00d2>", "author": [], "journal": "", "id": "C:/Users/myname/Downloads/paper-8641.pdf", "classification": "Academic"}
Below are the scripts from where the errors stem:
# Loader dedicated to loading text files and extracting the info into our given citation object
import json
import re
from DataLoaders import CitationLoaderBase
from Citations.CitationObj import CitationObj
class CitationLoaderTxt(CitationLoaderBase.CitationLoaderBase):
# Path: path to the text file you are looking to extract citations from
# AnalyzedFiles: Array containing the info of files that have already been analyzed
def __init__(self, path):
self.regex = "\d*.(.*)\"(.*)\"(.*)|\d*.(.*)"
self.path = path
self.analyzedFiles = []
# Loads file from instance variable and runs through the file returning all
# matches to the regex supplied as a instance variable as a array
def return_citation_array(self):
if self.__has_file_been_read__():
print("Finding matches to " + self.regex + " in file at " + self.path + " to return as array")
list_of_citations = []
with open(self.path, 'r', encoding='utf-8') as file:
for string in self.__nonblank_lines__(file):
match = re.search(self.regex, string)
if match.group(1) is None:
list_of_citations.append(CitationObj(match.group(4), [], "", self.path))
else:
list_of_authors = [match.group(1)]
list_of_citations.append(CitationObj(match.group(2), list_of_authors, match.group(3), self.path))
self.analyzedFiles.append(self.path)
return list_of_citations
else:
print("File already analyzed")
def return_citation_dictionary(self):
if self.__has_file_been_read__():
print("Finding matches to " + self.regex + " in file at " + self.path + " to return as dictionary")
list_of_citations = []
with open(self.path, 'r', encoding='utf-8') as file:
for string in self.__nonblank_lines__(file):
match = re.search(self.regex, string)
if match.group(1) is None:
list_of_citations.append(CitationObj(match.group(4), [], "", self.path))
self.analyzedFiles.append(self.path)
else:
list_of_authors = [match.group(1)]
list_of_citations.append(CitationObj(match.group(2), list_of_authors, match.group(3), self.path))
self.analyzedFiles.append(self.path)
citation_dict = {"Citations" : list_of_citations}
self.analyzedFiles.append(self.path)
return citation_dict
else:
print("File already analyzed")
def change_file(self, new_file_path):
self.path = new_file_path
# Clears the analyzed files from the analyzedFiles list
def clear_analyzed_files(self):
self.analyzedFiles = []
# removes all blank lines from the input file to help preserve ordering
def __nonblank_lines__(self, file):
for l in file:
line = l.rstrip()
if line:
yield line
# Determines if the file has already been analyzed and returns a boolean to that effect
def __has_file_been_read__(self):
if len(self.analyzedFiles) > 0:
for file in self.analyzedFiles:
if self.path == file:
return False
return True
#Loader for PDF files, extracting info from the text of academic papers,
# does however break liskov substitution principle due to addition of a additional parameter for userinput
from Citations.CitationObj import CitationObj
from DataLoaders.CitationLoaderBase import CitationLoaderBase
import re
import textract
import json
class CitationLoaderPDF(CitationLoaderBase):
# filePath: path to the PDF file you are looking to extract citations from
# regexPath: path for the user defined file to define the regexs
# AnalyzedFiles: Array containing the info of files that have already been analyzed
def __init__(self, filepath, regexpath):
self.filePath = filepath
self.regexPath = regexpath
self.analyzedFiles = []
def return_citation_array(self):
if self.__has_file_been_read__():
list_of_citation_objects = []
pdf_text = self.__get_pdf_text__()
citation_regex = self.__get_citation_regex__()
citation_data = re.findall(citation_regex, pdf_text)
for citation in citation_data:
list_of_citation_objects.append(CitationObj(citation[1], citation[0], citation[2], self.filePath))
self.analyzedFiles.append(self.filePath)
return list_of_citation_objects
else:
print("File already analyzed")
def return_citation_dictionary(self):
if self.__has_file_been_read__():
list_of_citation_objects = []
pdf_text = self.__get_pdf_text__()
citation_regex = self.__get_citation_regex__()
citation_data = re.findall(citation_regex, pdf_text)
for citation in citation_data:
list_of_citation_objects.append(CitationObj(citation[1], citation[0], citation[2], self.filePath))
citation_dict = {"Citations": list_of_citation_objects}
self.analyzedFiles.append(self.filePath)
return citation_dict
else:
print("File already analyzed")
# Returns a dictionary of all terms the user has defined in the users config file
def return_complete_dict(self):
list_of_citation_objects = []
pdf_text = self.__get_pdf_text__()
citation_regex = self.__get_citation_regex__()
url_regex = self.__get_url_regex__()
doi_regex = self.__get_doi_regex__()
citation_data = re.findall(citation_regex, pdf_text)
list_of_urls = re.findall(url_regex, pdf_text)
list_of_dois = re.findall(doi_regex, pdf_text)
for citation in citation_data:
list_of_citation_objects.append(CitationObj(citation[1], citation[0], citation[2]))
citation_dict = {"Citations": list_of_citation_objects,
"URLS": list_of_urls,
"DOIS": list_of_dois}
return citation_dict
# Returns a array of all matches to the url regex if present
def return_url_array(self):
pdf_text = self.__get_pdf_text__()
url_regex = self.__get_url_regex__()
return re.findall(url_regex, pdf_text)
# Returns a array of all matches to the doi regex if present
def return_doi_array(self):
pdf_text = self.__get_pdf_text__()
doi_regex = self.__get_doi_regex__()
return re.findall(doi_regex, pdf_text)
# Returns a array of all matches to the url regex if present
def is_pdf(self):
try:
textract.process(self.filePath).decode("utf-8")
return True
except:
return False
# Clears the analyzed files from the analyzedFiles list
def clear_analyzed_files(self):
self.analyzedFiles = []
# Changes the file the loader is extracting info from
def change_file(self, new_file_path):
self.filePath = new_file_path
# Extracts all the text from the pdf while removing superfluous/unmatched space characters
def __get_pdf_text__(self):
text = textract.process(self.filePath).decode("utf-8")
text = text.replace('\n', '').replace('\r', ' ')
return text
# Gets the regex from the user defined file that extracts citations
def __get_citation_regex__(self):
with open(self.regexPath, 'r', encoding='utf-8') as f:
regex_dict = json.load(f)
return re.compile(regex_dict.get("Citation"), re.MULTILINE)
# Gets the regex from the user defined file that extracts urls
def __get_url_regex__(self):
with open(self.regexPath, 'r', encoding='utf-8') as f:
regex_dict = json.load(f)
return re.compile(regex_dict.get("URL"))
# Gets the regex from the user defined file that extracts DOI's
def __get_doi_regex__(self):
with open(self.regexPath, 'r', encoding='utf-8') as f:
regex_dict = json.load(f)
return re.compile(regex_dict.get("DOI"))
# Determines if the file has already been analyzed and returns a boolean to that effect
def __has_file_been_read__(self):
if len(self.analyzedFiles) > 0:
for file in self.analyzedFiles:
if self.filePath == file:
return False
return True
Any and all help is appreciated, as I'm somewhat bashing my head against a wall trying to get things to work.
Related
As the title says, I made a file editing program with python.
Here is the code that I'm have a problem with:
#fileEditing.py
def fileError(file):
raise OSError("file {} does not exist".format(file))
class AccessFile():
def fileExists(self, file):
import os
return bool(os.path.exists(file))
def filecreate(self, file):
if not self.fileExists(file):
with open(file, "w") as f:
f.close()
else: raise OSError("file {} already exists".format(file))
def filedelete(self, file):
import os
if self.fileExists(file):
os.remove(file)
else: fileError(file)
def fileread(self, file):
#check if file exists
if self.fileExists(file):
#detect length of file
with open(file, "r") as f:
line = " "
x = 0
while line != "":
line = f.readline()
x += 1
#piece lines together in a list
filelines = []
with open(file, "r") as f:
for i in range(x - 1):
filelines.append(str(f.readline()))
#return a tuple
return tuple(filelines)
else: fileError(file)
def filewrite(self, file, line, text):
''' BUG: apparently this either overwrites the line its writing or appends
to the line its writing... make up your mind!'''
if self.fileExists(file):
#get file contents
filelines = list(self.fileread(file))
#see if line parameter is out of range or not
try:
filelines[line] = text
except IndexError:
for i in range(line - len(filelines)):
filelines.append("")
filelines.append(str(text) + "\n")
#apply changes
with open(file, "w") as f:
f.write("") #delete contents
with open(file, "w") as f:
for l in filelines:
f.write(l)
else: fileError(file)
def fileoverwrite(self, file, data):
#if there is no file to delete, it will make a new one
try:
self.filedelete(file)
except:
pass
self.filecreate(file)
x = 0
for line in data:
print(line)
self.filewrite(file, x, line)
x += 1
accessfile = AccessFile()
The bug is in the filewrite(self, file, line, text) function. When called, it either writes a new line (which is what I want it to do), appends to the line its supposed to replace, or just doesn't write any lines at all.
Say I want to write a python file with this program:
#pytesting.py
from fileEditing import *
file = "/Users/ashton/Desktop/Atom/Python/FileEditing/FileManager.py"
data = [
"from fileEditing import *",
"",
"class FileEditing():",
" def __init__(options, immutable_files):",
" self.options, self.immutable_files = options, immutable_files",
" ",
" def prompt():",
" ",
"",
"while True:",
" pass"
]
accessfile.fileoverwrite(file, data)
When I run it, it makes a file with accessfile.fileoverwrite(file, data), like its supposed to.
But thats where things get whacky.
(FileManager.py below)
from fileEditing import *
class FileEditing():
def __init__(options, immutable_files): self.options, self.immutable_files = options, immutable_files
def prompt():
while True:
If you know how to fix the filewrite(self, file, line, text), please let me know.
(I use python 2.7 but python 3 is fine)
So this is definitely a Python 3.x solution but you said that it is fine, don't know if it will work in Python 2.x but it is so simple it should:
def file_overwrite(self, file, data):
with open(file, 'w') as file:
file.write('\n'.join(data))
And you seemingly also need to fix that data list because it is missing a few commas. Also the fact that this is all in a class is a bit weird, you do nothing with the instance, they all might as well be separate functions or #classmethods or #staticmethods. Also several things could be improved with your other functions. For example you shouldn't open the file twice and count its lines to read it. Just do file.readlines() at it will return a list of all lines:
def fileread(self, file):
if self.fileExists(file):
with open(file) as file:
return file.readlines()
else:
fileError(file)
Then also import os once at the start of the file, you don't need to import it in every function where you use os, also:
with open(file, "w") as f:
f.close()
f.close() is completely pointless because the context manger closes the file anyways and also there is mode "x" which is specifically made for file creation and will raise an error if the file already exists: https://www.w3schools.com/python/python_file_handling.asp
I have text that is key-value pairs separated by '='. I would like to replace the line if the key matches. if not, i would like to append it at the bottom. I've tried several ways, including:
def split_command_key_and_value(command):
if '=' in command:
command2 = command.split('=')
return command2
def test(command, path):
command2 = split_command_key_and_value(command)
pattern = command2[0]
myfile = open(path,'r') # open file handle for read
# use r'', you don't need to replace '\' with '/'
result = open(path, 'w') # open file handle for write
for line in myfile:
line = line.strip() # it's always a good behave to strip what you read from files
if pattern in line:
line = command # if match, replace line
result.write(line) # write every line
myfile.close() # don't forget to close file handle
result.close()
I know the above is just to replace text, but it deletes the text in the file, and I can't see why. Could someone point me in the right direction?
Thanks
Update:
I'm almost there, but some of my lines have similar keys, so mutiple lines are matching when only 1 should. I've tried to incorporate a regex boundary in my loop with no luck. My code is below. Does anyone have a suggestion?
There is some text in the file that isn't key-value, so I would like to skip that.
def modify(self, name, value):
comb = name + ' ' + '=' + ' ' + value + '\n'
with open('/file/', 'w') as tmpstream:
with open('/file/', 'r') as stream:
for line in stream:
if setting_name in line:
tmpstream.write(comb)
else:
tmpstream.write(line)
I think I got it. See code below.
def modify(self, name, value):
comb = name + ' ' + '=' + ' ' + value + '\n'
mylist = []
with open('/file/', 'w') as tmpstream:
with open('/file/', 'r') as stream:
for line in stream:
a = line.split()
b = re.compile('\\b'+name+'\\b')
if len(a) > 0:
if b.search(a[0]):
tmpstream.write(comb)
else:
tmpstream.write(line)
I spoke too soon. It stops at the key-value I provide. So, it only writes one line, and doesn't write the lines that don't match.
def modify(name, value):
comb = name + ' ' + '=' + ' ' + value + '\n'
mylist = []
with open('/file1', 'w') as tmpstream:
with open('/file2', 'r') as stream:
for line in stream:
a = line.split()
b = re.compile('\\b'+name+'\\b')
if len(a) > 0:
if b.search(a[0]):
tmpstream.write(comb)
else:
tmpstream.write(line)
Can anyone see the issue?
Because when you open file for writing
result = open(path, 'w') # open file handle for write
you just erase it content. Try to write in different file and after all work done replace old file with new one. Or read all data into memory and then process it and write to file.
with open(path) as f:
data = f.read()
with open(path, 'w') as f:
for l in data:
# make job here
first of all you are reading an writing the same file ...
you could first read it all and the write line by line
with open(path,'r') as f:
myfile = f.read() # read everything in the variable "myfile"
result = open(path, 'w') # open file handle for write
for line in myfile.splitlines(): # process the original file content 1 line at a time
# as before
I strongly recommend reading python's documentation on how to read and write files.
If you open an existing file in write-mode open(path, 'w'), its content will be erased:
mode can be (...) 'w' for only writing (an existing file with the same name will be erased)
To replace a line in python you can have a look at this: Search and replace a line in a file in Python
Here is one the solutions provided there adapted to your context (tested for python3):
from tempfile import mkstemp
from shutil import move
from os import close
def test(filepath, command):
# Split command into key/value
key, _ = command.split('=')
matched_key = False
# Create a temporary file
fh, tmp_absolute_path = mkstemp()
with open(tmp_absolute_path, 'w') as tmp_stream:
with open(filepath, 'r') as stream:
for line in stream:
if key in line:
matched_key = True
tmp_stream.write(command + '\n')
else:
tmp_stream.write(line)
if not matched_key:
tmp_stream.write(command + '\n')
close(fh)
move(tmp_absolute_path, filepath)
Note that with the code above every line that matches key (key=blob or blob=key) will be replaced.
I am getting this error.
I am not sure whether it is my mistake or something else.
I am on python 3.X version right now.
Traceback (most recent call last):
File "/Users/Administrator/Desktop/A2_b/author_program.py", line 104, in <module>
signature = read_signature(dir_name + "/" + this_file)
File "/Users/Administrator/Desktop/A2_b/author_program.py", line 48, in read_signature
result = [sig_file.readline().strip()]
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/encodings/ascii.py", line 26, in decode
return codecs.ascii_decode(input, self.errors)[0]
UnicodeDecodeError: 'ascii' codec can't decode byte 0x80 in position 3131: ordinal not in range(128)
Here is the code that gives me this error. I only had to complete in this file first and second function.
import author_functions, os.path
def get_valid_filename(msg):
""" (str) -> str
Prompt the user, using msg, to type the name of a file. This file should
exist in the same directory as the starter code. If the file does not
exist, keep re-prompting until they give a valid filename.
Return the name of that file.
"""
filename = input(msg)
while not os.path.exists(filename):
print("That file does not exist.")
filename = input(msg)
return filename
def get_valid_directory_name(msg):
""" (str) -> str
Prompt the user, using msg, to type the name of a directory. If
the directory does not exist, keep re-prompting until they give a valid
directory.
Return the name of that directory.
"""
dirname = input(msg)
while not os.path.isdir(dirname):
print("That directory does not exist.")
dirname = input(msg)
return dirname
### Provided helper function ###
def read_signature(filename):
""" (str) -> list
Read a linguistic signature from filename and return it as
a list of features.
"""
sig_file = open(filename, 'r')
# Read the first feature.
result = [sig_file.readline().strip()]
# Read each remaining feature and convert each one to float.
for line in sig_file:
result.append(float(line.strip()))
sig_file.close()
return result
# #############################
# The main program begins here
# #############################
if __name__ == '__main__':
prompt = 'Enter the name of the file with unknown author: '
mystery_filename = get_valid_filename(prompt)
prompt = 'Enter the name of the directory of signature files: '
dir_name = get_valid_directory_name(prompt)
# Every file in the dir_name directory must be a linguistic signature.
# We assume there is a minimum of one file.
files = os.listdir(dir_name)
# ####################################################################
# The following code parses the mystery file and calculates its
# linguistic signature.
# ####################################################################
mystery_file = open(mystery_filename, 'r')
# readlines() gives us a list of strings, one for each line of the file
text = mystery_file.readlines()
mystery_file.close()
# Calculate the signature for the mystery file
mystery_signature = [mystery_filename]
mystery_signature.append(author_functions.avg_word_length(text))
mystery_signature.append(author_functions.type_token_ratio(text))
mystery_signature.append(author_functions.hapax_legomena_ratio(text))
mystery_signature.append(author_functions.avg_sentence_length(text))
mystery_signature.append(author_functions.avg_sentence_complexity(text))
# ####################################################
# The following code reads the linguistic signatures,
# compares them with the mystery_signature,
# and reports the author that was the best match.
# ####################################################
# Weights of linguistic features.
weights = [0, 11, 33, 50, 0.4, 4]
# We assume there is at least one signature in the dir_name directory
this_file = files[0]
signature = read_signature(dir_name + "/" + this_file)
best_score = author_functions.compare_signatures(mystery_signature,
signature, weights)
best_author = signature[0]
for this_file in files[1:]:
signature = read_signature(dir_name + "/" + this_file)
score = author_functions.compare_signatures(mystery_signature,
signature, weights)
if score < best_score:
best_score = score
best_author = signature[0]
if type(best_score) != float:
print("Error! No score could be computed")
else:
print("Best author match:", best_author, "with score", best_score)
try sig_file = open(filename, 'rb')
the b means there is binary data in the file (not just ascii)
that will probably resolve your issue
I created a notepad text document called "connections.txt". I need to have some initial information inside it, several lines of just URLs. Each URL has it's own line. I put that in manually. Then in my program I have a function that checks if a URL is in the file:
def checkfile(string):
datafile = file(f)
for line in datafile:
if string in line:
return True
return False
where f is declared at the beginning of the program:
f = "D:\connections.txt"
Then I tried to write to the document like this:
file = open(f, "w")
if checkfile(user) == False:
usernames.append(user)
file.write("\n")
file.write(user)
file.close()
but it hasn't really been working correctly..I'm not sure what's wrong..am I doing it wrong?
I want the information in the notepad document to stay there ACROSS runs of the program. I want it to build up.
Thanks.
EDIT: I found something wrong... It needs to be file = f, not datafile = file(f)
But the problem is... It clears the text document every time I rerun the program.
f = "D:\connections.txt"
usernames = []
def checkfile(string):
file = f
for line in file:
if string in line:
return True
print "True"
return False
print "False"
file = open(f, "w")
user = "aasdf"
if checkfile(user) == False:
usernames.append(user)
file.write("\n")
file.write(user)
file.close()
I was working with the file command incorrectly...here is the code that works.
f = "D:\connections.txt"
usernames = []
def checkfile(string):
datafile = file(f)
for line in datafile:
if string in line:
print "True"
return True
print "False"
return False
user = "asdf"
if checkfile(user) == False:
usernames.append(user)
with open(f, "a") as myfile:
myfile.write("\n")
myfile.write(user)
The code that checks for a specific URL is ok!
If the problem is not erasing everything:
To write to the document without erasing everything you have to use the .seek() method:
file = open("D:\connections.txt", "w")
# The .seek() method sets the cursor to the wanted position
# seek(offset, [whence]) where:
# offset = 2 is relative to the end of file
# read more here: http://docs.python.org/2/library/stdtypes.html?highlight=seek#file.seek
file.seek(2)
file.write("*The URL you want to write*")
Implemented on your code will be something like:
def checkfile(URL):
# your own function as it is...
if checkfile(URL) == False:
file = open("D:\connections.txt", "w")
file.seek(2)
file.write(URL)
file.close()
def fun(EACH) :
list1 = []
EACH = str(EACH)
for all in a: # contains names of different checks
for files in b :
for line in open(str(files)) :
if EACH in line :
print files
break
Here a is a directory which contains many files. I'm storing those files in list b.
I want to extract files with names in EACH
E.g. I want to open file which as a string "apple" in its content. this apple word is passed via arguments.
The problem is that I don't understand how to read that value and use the same string in the if condition.
Do you really have to parse each file line by line ? Couldn't you just do something like :
for b in a:
for fname in b:
with open(fname, "r") as f:
content = f.read()
if EACH in content:
print "There's a {0} in {1}".format(EACH, fname)
(the with open(...) as ... is to make sure your file is closed once you've read it...)
You can use the following code:
def grepFileForLines( self, fileName = "", keepLinesWith = "" ):
file = open( fileName, 'r' )
matches = []
for line in file:
if line.find( keepLinesWith ) > -1:
matches.append( line )
return matches