UnicodeDecodeError: Python

I am getting this error and I am not sure whether it is my mistake or something else.
I am on Python 3.x right now.
Traceback (most recent call last):
File "/Users/Administrator/Desktop/A2_b/author_program.py", line 104, in <module>
signature = read_signature(dir_name + "/" + this_file)
File "/Users/Administrator/Desktop/A2_b/author_program.py", line 48, in read_signature
result = [sig_file.readline().strip()]
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/encodings/ascii.py", line 26, in decode
return codecs.ascii_decode(input, self.errors)[0]
UnicodeDecodeError: 'ascii' codec can't decode byte 0x80 in position 3131: ordinal not in range(128)
Here is the code that gives me this error. I only had to complete the first and second functions in this file.
import author_functions, os.path


def get_valid_filename(msg):
    """ (str) -> str
    Prompt the user, using msg, to type the name of a file. This file should
    exist in the same directory as the starter code. If the file does not
    exist, keep re-prompting until they give a valid filename.
    Return the name of that file.
    """
    filename = input(msg)
    while not os.path.exists(filename):
        print("That file does not exist.")
        filename = input(msg)
    return filename


def get_valid_directory_name(msg):
    """ (str) -> str
    Prompt the user, using msg, to type the name of a directory. If
    the directory does not exist, keep re-prompting until they give a valid
    directory.
    Return the name of that directory.
    """
    dirname = input(msg)
    while not os.path.isdir(dirname):
        print("That directory does not exist.")
        dirname = input(msg)
    return dirname


### Provided helper function ###

def read_signature(filename):
    """ (str) -> list
    Read a linguistic signature from filename and return it as
    a list of features.
    """
    sig_file = open(filename, 'r')
    # Read the first feature.
    result = [sig_file.readline().strip()]
    # Read each remaining feature and convert each one to float.
    for line in sig_file:
        result.append(float(line.strip()))
    sig_file.close()
    return result


# #############################
# The main program begins here
# #############################
if __name__ == '__main__':

    prompt = 'Enter the name of the file with unknown author: '
    mystery_filename = get_valid_filename(prompt)

    prompt = 'Enter the name of the directory of signature files: '
    dir_name = get_valid_directory_name(prompt)

    # Every file in the dir_name directory must be a linguistic signature.
    # We assume there is a minimum of one file.
    files = os.listdir(dir_name)

    # ####################################################################
    # The following code parses the mystery file and calculates its
    # linguistic signature.
    # ####################################################################
    mystery_file = open(mystery_filename, 'r')
    # readlines() gives us a list of strings, one for each line of the file
    text = mystery_file.readlines()
    mystery_file.close()

    # Calculate the signature for the mystery file
    mystery_signature = [mystery_filename]
    mystery_signature.append(author_functions.avg_word_length(text))
    mystery_signature.append(author_functions.type_token_ratio(text))
    mystery_signature.append(author_functions.hapax_legomena_ratio(text))
    mystery_signature.append(author_functions.avg_sentence_length(text))
    mystery_signature.append(author_functions.avg_sentence_complexity(text))

    # ####################################################
    # The following code reads the linguistic signatures,
    # compares them with the mystery_signature,
    # and reports the author that was the best match.
    # ####################################################

    # Weights of linguistic features.
    weights = [0, 11, 33, 50, 0.4, 4]

    # We assume there is at least one signature in the dir_name directory
    this_file = files[0]
    signature = read_signature(dir_name + "/" + this_file)
    best_score = author_functions.compare_signatures(mystery_signature,
                                                     signature, weights)
    best_author = signature[0]
    for this_file in files[1:]:
        signature = read_signature(dir_name + "/" + this_file)
        score = author_functions.compare_signatures(mystery_signature,
                                                    signature, weights)
        if score < best_score:
            best_score = score
            best_author = signature[0]

    if type(best_score) != float:
        print("Error! No score could be computed")
    else:
        print("Best author match:", best_author, "with score", best_score)

Try sig_file = open(filename, 'rb')
The 'b' opens the file in binary mode, i.e. it tells Python the file contains binary data (not just ASCII), so no decoding is attempted.
That will probably resolve your issue.
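If you would rather keep reading the signature files as text, an alternative sketch (an assumption on my part, not the assignment's official fix) is to pass an explicit encoding to open() instead of falling back to the platform default, which is ASCII here judging by the traceback:

# Sketch: latin-1 maps every possible byte value, so it never raises
# UnicodeDecodeError; use 'utf-8' if that is how the signature files were saved.
def read_signature(filename, encoding='latin-1'):
    """Read a linguistic signature from filename and return it as a list."""
    with open(filename, 'r', encoding=encoding) as sig_file:
        result = [sig_file.readline().strip()]
        for line in sig_file:
            result.append(float(line.strip()))
    return result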

Related

UnicodeDecodeError or undecipherable results

I've taken over a project for university and am getting to grips with the existing code base. I am encountering an error, as in the title, where I get a UnicodeDecodeError when I attempt to run the code. Essentially, the software parses documents for citations, then exports and analyses them, storing them in a JSON file and classifying them as academic or non-academic based on keywords.
I have (somewhat?) worked around the issue: passing encoding='...' to the with open() call fixes it, but only with codecs other than utf-8, such as latin1. Passing errors='ignore' to the same open() call also fixes it. Based on other lines of code, I am sure the codec used is utf-8. I am stumped as to why I am running into this issue and am not quite sure where to begin fixing it (firstly, I did not write the code myself, and secondly, I am new to Python, having taught myself for this project).
The error I was previously getting is:
Exception in Tkinter callback
Traceback (most recent call last):
File "C:\Python\Python38\lib\tkinter\__init__.py", line 1883, in __call__
return self.func(*args)
File "C:/path/GUIFrontend.py", line 50, in filepath_submitted
results = return_dict_of_results(filepath)
File "C:/path/GUIFrontend.py", line 131, in return_dict_of_results
results_array = loader_text.return_citation_array()
File "C:\path\DataLoaders\CitationLoaderTxt.py", line 23, in return_citation_array
for string in self.__nonblank_lines__(file):
File "C:\path\DataLoaders\CitationLoaderTxt.py", line 65, in __nonblank_lines__
for l in file:
File "C:\Python\Python38\lib\codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x9c in position 72: invalid start byte
The issue I then run into, once I use either of the above workarounds to get the code running, is undecipherable results such as the following.
{"C:/Users/myname/Downloads/paper-8641.pdf": [{"__class__": "CitationObj", "__module__": "Citations.CitationObj", "title": "\u2020\u2021L\u017e|}\u00fa\u0002Ii8\u203arJUb\u00a3\u001b7\u00fa\u0006\u00a2\u203a\u2021\u203a\u00e8\u00e6\u00db7\u00ef\u00ee\u00de|\u00f1\u00be\u00cco\u0160\u00b0\u00cc\u00b2\u00e4\u00e6\u00ee\u00fe&\u00ceu\u00a8\u00b3\u00fc&/\u00f30/\u00d2\u203a\u00bb\u00e3\u00cd?\u0192\u00f7\u00b7E\u0014\u201e\u00b7{\u2022F\u00c1\u00f7\u00d5`;\u0006\u00ab\u00ee\u00c8\u00c0\u00d7B\u00fb0\u00ce\u00b4\u00b7\u00ff\u00ba\u00fb\u00e3M\u2018\u2021I\u2019\u00dcDa\u201dH'?\u00ab,!\u00d2>", "author": [], "journal": "", "id": "C:/Users/myname/Downloads/paper-8641.pdf", "classification": "Academic"}
Below are the scripts from where the errors stem:
# Loader dedicated to loading text files and extracting the info into our given citation object
import json
import re

from DataLoaders import CitationLoaderBase
from Citations.CitationObj import CitationObj


class CitationLoaderTxt(CitationLoaderBase.CitationLoaderBase):

    # Path: path to the text file you are looking to extract citations from
    # AnalyzedFiles: Array containing the info of files that have already been analyzed
    def __init__(self, path):
        self.regex = "\d*.(.*)\"(.*)\"(.*)|\d*.(.*)"
        self.path = path
        self.analyzedFiles = []

    # Loads file from instance variable and runs through the file returning all
    # matches to the regex supplied as a instance variable as a array
    def return_citation_array(self):
        if self.__has_file_been_read__():
            print("Finding matches to " + self.regex + " in file at " + self.path + " to return as array")
            list_of_citations = []
            with open(self.path, 'r', encoding='utf-8') as file:
                for string in self.__nonblank_lines__(file):
                    match = re.search(self.regex, string)
                    if match.group(1) is None:
                        list_of_citations.append(CitationObj(match.group(4), [], "", self.path))
                    else:
                        list_of_authors = [match.group(1)]
                        list_of_citations.append(CitationObj(match.group(2), list_of_authors, match.group(3), self.path))
            self.analyzedFiles.append(self.path)
            return list_of_citations
        else:
            print("File already analyzed")

    def return_citation_dictionary(self):
        if self.__has_file_been_read__():
            print("Finding matches to " + self.regex + " in file at " + self.path + " to return as dictionary")
            list_of_citations = []
            with open(self.path, 'r', encoding='utf-8') as file:
                for string in self.__nonblank_lines__(file):
                    match = re.search(self.regex, string)
                    if match.group(1) is None:
                        list_of_citations.append(CitationObj(match.group(4), [], "", self.path))
                        self.analyzedFiles.append(self.path)
                    else:
                        list_of_authors = [match.group(1)]
                        list_of_citations.append(CitationObj(match.group(2), list_of_authors, match.group(3), self.path))
                        self.analyzedFiles.append(self.path)
            citation_dict = {"Citations": list_of_citations}
            self.analyzedFiles.append(self.path)
            return citation_dict
        else:
            print("File already analyzed")

    def change_file(self, new_file_path):
        self.path = new_file_path

    # Clears the analyzed files from the analyzedFiles list
    def clear_analyzed_files(self):
        self.analyzedFiles = []

    # removes all blank lines from the input file to help preserve ordering
    def __nonblank_lines__(self, file):
        for l in file:
            line = l.rstrip()
            if line:
                yield line

    # Determines if the file has already been analyzed and returns a boolean to that effect
    def __has_file_been_read__(self):
        if len(self.analyzedFiles) > 0:
            for file in self.analyzedFiles:
                if self.path == file:
                    return False
        return True
# Loader for PDF files, extracting info from the text of academic papers,
# does however break liskov substitution principle due to addition of a additional parameter for userinput
from Citations.CitationObj import CitationObj
from DataLoaders.CitationLoaderBase import CitationLoaderBase
import re
import textract
import json


class CitationLoaderPDF(CitationLoaderBase):

    # filePath: path to the PDF file you are looking to extract citations from
    # regexPath: path for the user defined file to define the regexs
    # AnalyzedFiles: Array containing the info of files that have already been analyzed
    def __init__(self, filepath, regexpath):
        self.filePath = filepath
        self.regexPath = regexpath
        self.analyzedFiles = []

    def return_citation_array(self):
        if self.__has_file_been_read__():
            list_of_citation_objects = []
            pdf_text = self.__get_pdf_text__()
            citation_regex = self.__get_citation_regex__()
            citation_data = re.findall(citation_regex, pdf_text)
            for citation in citation_data:
                list_of_citation_objects.append(CitationObj(citation[1], citation[0], citation[2], self.filePath))
            self.analyzedFiles.append(self.filePath)
            return list_of_citation_objects
        else:
            print("File already analyzed")

    def return_citation_dictionary(self):
        if self.__has_file_been_read__():
            list_of_citation_objects = []
            pdf_text = self.__get_pdf_text__()
            citation_regex = self.__get_citation_regex__()
            citation_data = re.findall(citation_regex, pdf_text)
            for citation in citation_data:
                list_of_citation_objects.append(CitationObj(citation[1], citation[0], citation[2], self.filePath))
            citation_dict = {"Citations": list_of_citation_objects}
            self.analyzedFiles.append(self.filePath)
            return citation_dict
        else:
            print("File already analyzed")

    # Returns a dictionary of all terms the user has defined in the users config file
    def return_complete_dict(self):
        list_of_citation_objects = []
        pdf_text = self.__get_pdf_text__()
        citation_regex = self.__get_citation_regex__()
        url_regex = self.__get_url_regex__()
        doi_regex = self.__get_doi_regex__()
        citation_data = re.findall(citation_regex, pdf_text)
        list_of_urls = re.findall(url_regex, pdf_text)
        list_of_dois = re.findall(doi_regex, pdf_text)
        for citation in citation_data:
            list_of_citation_objects.append(CitationObj(citation[1], citation[0], citation[2]))
        citation_dict = {"Citations": list_of_citation_objects,
                         "URLS": list_of_urls,
                         "DOIS": list_of_dois}
        return citation_dict

    # Returns a array of all matches to the url regex if present
    def return_url_array(self):
        pdf_text = self.__get_pdf_text__()
        url_regex = self.__get_url_regex__()
        return re.findall(url_regex, pdf_text)

    # Returns a array of all matches to the doi regex if present
    def return_doi_array(self):
        pdf_text = self.__get_pdf_text__()
        doi_regex = self.__get_doi_regex__()
        return re.findall(doi_regex, pdf_text)

    # Returns a array of all matches to the url regex if present
    def is_pdf(self):
        try:
            textract.process(self.filePath).decode("utf-8")
            return True
        except:
            return False

    # Clears the analyzed files from the analyzedFiles list
    def clear_analyzed_files(self):
        self.analyzedFiles = []

    # Changes the file the loader is extracting info from
    def change_file(self, new_file_path):
        self.filePath = new_file_path

    # Extracts all the text from the pdf while removing superfluous/unmatched space characters
    def __get_pdf_text__(self):
        text = textract.process(self.filePath).decode("utf-8")
        text = text.replace('\n', '').replace('\r', ' ')
        return text

    # Gets the regex from the user defined file that extracts citations
    def __get_citation_regex__(self):
        with open(self.regexPath, 'r', encoding='utf-8') as f:
            regex_dict = json.load(f)
            return re.compile(regex_dict.get("Citation"), re.MULTILINE)

    # Gets the regex from the user defined file that extracts urls
    def __get_url_regex__(self):
        with open(self.regexPath, 'r', encoding='utf-8') as f:
            regex_dict = json.load(f)
            return re.compile(regex_dict.get("URL"))

    # Gets the regex from the user defined file that extracts DOI's
    def __get_doi_regex__(self):
        with open(self.regexPath, 'r', encoding='utf-8') as f:
            regex_dict = json.load(f)
            return re.compile(regex_dict.get("DOI"))

    # Determines if the file has already been analyzed and returns a boolean to that effect
    def __has_file_been_read__(self):
        if len(self.analyzedFiles) > 0:
            for file in self.analyzedFiles:
                if self.filePath == file:
                    return False
        return True
Any and all help is appreciated, as I'm somewhat bashing my head against a wall trying to get things to work.
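One way to narrow this down (a sketch; it assumes the third-party chardet package is installed and uses a hypothetical file name standing in for one of the failing inputs) is to read the raw bytes and ask chardet to guess the encoding. Byte 0x9c is never a valid UTF-8 start byte, but it is a printable character (œ) in windows-1252, which would also explain why latin1 appears to work:

import chardet

# Hypothetical path standing in for one of the files that triggers the error.
path = "citations.txt"

with open(path, "rb") as f:   # read raw bytes, no decoding yet
    raw = f.read()

# Returns something like {'encoding': 'Windows-1252', 'confidence': 0.73, ...}
print(chardet.detect(raw))

print(raw[70:75])             # peek at the bytes around position 72

If the guess comes back as a Windows code page, opening with encoding='cp1252' is usually closer to the truth than errors='ignore', which silently drops the offending bytes.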

Reads And Updates XML in pycharm but not command line

I am very new to Python and SO. The script opens XML files inside a folder. Using os.walk, I iterate over the collection, open each file, and then call a function that iterates over the XML document, updates it, and rewrites the updated file over the original using .writexml. It works in PyCharm, but when I run this program from the command line it says there is an error:
Traceback (most recent call last):
File "./XMLParser.py", line 67, in <module>
xmldoc = minidom.parse(xml)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/dom/minidom.py", line 1918, in parse
return expatbuilder.parse(file)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/dom/expatbuilder.py", line 928, in parse
result = builder.parseFile(file)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/dom/expatbuilder.py", line 207, in parseFile
parser.Parse(buffer, 0)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 5614: ordinal not in range(128)
CODE:
from xml.dom import minidom
import os
import codecs

'''
Function to iterate over the directory that contains the work items
params:
    CoreID of new author,
    x is the path to the workItem.xml file,
    p is the path to the workItem.xml that will be overwritten with new data
'''


def changeauthor(coreid, x, p):
    # Gets the content of the xml based within the work item tag.
    testcase = x.getElementsByTagName("work-item")[0]
    # All fields are stored as a <field> tag with the id attribute being the
    # differentiators between them. Fields is a list of all the field tags in the
    # document.
    fields = testcase.getElementsByTagName("field")
    # Loop iterates over the field tags and looks for the one tag where the id
    # attribute has a value of author. when this tag is found the tags value is
    # updated to the core id passed to the function.
    for field in fields:
        attribute = field.attributes['id'].value
        if attribute == "author":
            # print the current author.
            print("Previous Author: ", field.firstChild.data)
            # Set the author to the core id entered into the script
            field.firstChild.data = coreid
            # Print the updated author field
            print("New Author: ", field.firstChild.data)
            # Create a temp file with the same path as the source
            tmp_config = p
            # Open the new temp file with the write mode set.
            with codecs.open(tmp_config, 'w', "utf-8") as f:
                # f = open(tmp_config, 'w')
                # Write the xml into the file at the same location as the orginal
                x.writexml(f)
                # Close the file
                # f.close()
    return


while True:
    core = str(input("Enter Core ID of the new author: "))
    core = core.upper()
    spath = str(input("Please enter the full path to the directory of test cases: "))
    count = 0
    confirm = str(input("Confirm path and core id (Y/N or Exit to leave script): "))
    confirm = confirm.upper()
    if confirm == "Y":
        '''Hard code path here and comment out line above asking for input either will work.'''
        # spath = "/Users/Evan/Desktop/workitems-r583233"
        # Loop iterates over the directory. Whenever a workitem.xml file is found the path is stored and the file is
        # parsed. the core ID entered and the path as well as the parsed xml doc are passed to the change author
        # function.
        for roots, dirs, files in os.walk(spath):
            for file in files:
                title = file.title()
                if title == "Workitem.Xml":
                    path = os.path.join(roots, file)
                    with codecs.open(path, 'r+', "utf-8") as xml:
                        xmldoc = minidom.parse(xml)
                        lst = path.split('/')
                        wi = lst[5]
                        print("Updating: ", wi)
                        changeauthor(core, xmldoc, path)
                        count += 1
                        print(wi, "updated succesfully.")
                        print("-------------------------------")
        if count > 0:
            # Print how many test cases were updated.
            print("All Done", count, "workItems updated!")
        else:
            print("Please double check path and try again no workItems found to update!")
    elif confirm == "N":
        continue
    elif confirm == "EXIT":
        break
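Not a confirmed fix, just a sketch of the parsing step: xml.dom.minidom.parse() accepts a filename (or a binary file object) and lets expat read the encoding from the XML declaration itself, so the file does not have to be pre-decoded with codecs.open. The traceback suggests the parser is being handed already-decoded unicode text and appears to re-encode it with the default ascii codec, which is where u'\xa0' fails; this sketch avoids that round trip.

from xml.dom import minidom

# Hypothetical path; in the script above this would be the joined os.walk path.
path = "workitem.xml"

xmldoc = minidom.parse(path)   # expat handles the decoding itself

# When writing back, giving toxml() an explicit encoding returns bytes,
# which keeps non-ASCII characters such as u'\xa0' intact.
with open(path, "wb") as out:
    out.write(xmldoc.toxml(encoding="utf-8"))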

Python detecting file type before operation

I'm working on this piece of code and a weird bug showed up in the try block near the end of the code. The whole script is aimed at .flac files, but sometimes it would read .jpg files in the folders and blow up. Simply enough, I went ahead and added if (".flac" or ".FLAC" in Song): before the try, so that it would only process the correct file type. However, this made absolutely no difference and I kept getting the following error:
Traceback (most recent call last):
File ".\musync.py", line 190, in <module>
match_metadata(CurrentAlbum + Song, CoAlbum + Song)
File ".\musync.py", line 152, in match_metadata
TagSource = FLAC(SrcFile)
File "C:\Python34\lib\site-packages\mutagen\_file.py", line 41, in __init__
self.load(filename, *args, **kwargs)
File "C:\Python34\lib\site-packages\mutagen\flac.py", line 721, in load
self.__check_header(fileobj)
File "C:\Python34\lib\site-packages\mutagen\flac.py", line 844, in __check_header
"%r is not a valid FLAC file" % fileobj.name)
mutagen.flac.FLACNoHeaderError: 'C:/Users/berna/Desktop/Lib/Andrew Bird/Armchair Apocrypha/cover.jpg' is not a valid FLAC file
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File ".\musync.py", line 194, in <module>
check_song(CurrentAlbum + Song, CoAlbum)
File ".\musync.py", line 83, in check_song
TagSource = FLAC(SrcFile)
File "C:\Python34\lib\site-packages\mutagen\_file.py", line 41, in __init__
self.load(filename, *args, **kwargs)
File "C:\Python34\lib\site-packages\mutagen\flac.py", line 721, in load
self.__check_header(fileobj)
File "C:\Python34\lib\site-packages\mutagen\flac.py", line 844, in __check_header
"%r is not a valid FLAC file" % fileobj.name)
mutagen.flac.FLACNoHeaderError: 'C:/Users/berna/Desktop/Lib/Andrew Bird/Armchair Apocrypha/cover.jpg' is not a valid FLAC file
Why is the if condition not doing its job, and how can I fix this? The code is currently as follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import shutil
import os
from mutagen.flac import FLAC  # Used for metadata handling.
from os import listdir  # Used for general operations.
from fuzzywuzzy import fuzz  # Last resource name association.

# Insert here the root directory of your library and device respectively.
lib = 'C:/Users/berna/Desktop/Lib/'
dev = 'C:/Users/berna/Desktop/Dev/'


# Faster file copying function, arguments go as follows: Source file location,
# target directory, whether to keep the filename intact and whether to create
# the target directory in case it doesn't exist.
def copy_file(SrcFile, TgtDir, KeepName=True, MakeDir=True):
    SourceFile = None
    TargetFile = None
    KeepGoing = False
    # Checks is TgtDir is valid and creates if needed.
    if MakeDir and not os.path.isdir(TgtDir):
        os.makedirs(TgtDir)
    # Processes TgtDir depending on filename choice.
    if KeepName is True:
        TgtDir += os.path.basename(SrcFile)
    print(TgtDir)
    try:
        SourceFile = open(SrcFile, 'rb')
        TargetFile = open(TgtDir, 'wb')
        KeepGoing = True
        Count = 0
        while KeepGoing:
            # Read blocks of size 2**20 = 1048576
            Buffer = SourceFile.read(2 ** 20)
            if not Buffer:
                break
            TargetFile.write(Buffer)
            Count += len(Buffer)
    finally:
        if TargetFile:
            TargetFile.close()
        if SourceFile:
            SourceFile.close()
    return KeepGoing


# XXX TODO
# Copies a directory (SrcDir) to TgtDir, if Replace is True will delete same
# name directory and replace with new one.
def copy_tree(SrcDir, TgtDir, Replace=True):
    if not os.path.isdir(TgtDir):
        os.makedirs(TgtDir)
    Target = format_dir(TgtDir, os.path.basename(SrcDir))
    if os.path.isdir(Target) and Replace:
        shutil.rmtree(Target)
    if not os.path.isdir(Target):
        os.makedirs(Target)
    for File in listdir(SrcDir):
        FileDir = format_dir(SrcDir, File)
        # copy_file(FileDir, Tgt)
    return()


# Checks for new and deleted folders and returns their name.
def check_folder(SrcDir, TgtDir):
    # Lists Source and Target folder.
    Source = listdir(SrcDir)
    Target = listdir(TgtDir)
    # Then creates a list of deprecated and new directories.
    Deleted = [FileName for FileName in Target if FileName not in Source]
    Added = [FileName for FileName in Source if FileName not in Target]
    # Returns both lists.
    return (Added, Deleted)


# Checks for song in case there's a name mismatch or missing file.
def check_song(SrcFile, TgtDir):
    Matches = []
    # Invariably the new name will be that of the source file, the issue here
    # is finding which song is the correct one.
    NewName = TgtDir + '/' + os.path.basename(SrcFile)
    TagSource = FLAC(SrcFile)
    # Grabs the number of samples in the original file.
    SourceSamples = TagSource.info.total_samples
    # Checks if any song has a matching sample number and if true appends the
    # song's filename to Matches[]
    for Song in listdir(TgtDir):
        SongInfo = FLAC(TgtDir + '/' + Song)
        if (SongInfo.info.total_samples == SourceSamples):
            Matches.append(Song)
    # If two songs have the same sample rate (44100Hz for CDs) and the same
    # length it matches them to the source by filename similarity.
    if (Matches.count > 1):
        Diffs = []
        for Song in Matches:
            Diffs.append(fuzz.ratio(Song, os.path.basename(SrcFile)))
        if (max(Diffs) > 0.8):
            BestMatch = TgtDir + '/' + Matches[Diffs.index(max(Diffs))]
            os.rename(BestMatch, NewName)
        else:
            shutil.copy(SrcFile, TgtDir)
    # If there's no match at all simply copy over the missing file.
    elif (Matches.count == 0):
        shutil.copy(SrcFile, TgtDir)
    # If a single match is found the filename will be the first item on the
    # Matches[] list.
    else:
        os.rename(TgtDir + '/' + Matches[0], NewName)


# Syncs folders in a directory and return the change count.
def sync(SrcDir, TgtDir):
    AddCount = 0
    DeleteCount = 0
    # Grabs the folders to be added and deleted.
    NewDir, OldDir = check_folder(SrcDir, TgtDir)
    # Checks if any and then does add/rm.
    if OldDir:
        for Folder in OldDir:
            shutil.rmtree(TgtDir + Folder)
            DeleteCount += 1
    if NewDir:
        for Folder in NewDir:
            shutil.copytree(SrcDir + Folder, TgtDir + Folder)
            AddCount += 1
    return(AddCount, DeleteCount)


# Fixes missing metadata fields.
def fix_metadata(SrcFile, TgtFile):
    TagSource = FLAC(TgtFile)
    TagTarget = FLAC(SrcFile)
    # Checks for deleted tags on source file and deletes them from target.
    if (set(TagTarget) - set(TagSource)):
        OldTags = list(set(TagTarget) - set(TagSource))
        for Tag in OldTags:
            # TODO Right now I haven't quite figured out how to delete
            # specific tags, so workaround is to delete them all.
            TagTarget.delete()
    # Checks for new tags on source file and transfers them to target.
    if (set(TagSource) != set(TagTarget)):
        NewTags = list(set(TagSource) - set(TagTarget))
        for Tag in NewTags:
            TagTarget["%s" % Tag] = TagSource[Tag]
    TagTarget.save(TgtFile)


# Does metadata transfer between two files.
def match_metadata(SrcFile, TgtFile):
    Altered = 0
    TagSource = FLAC(SrcFile)
    TagTarget = FLAC(TgtFile)
    # For every different Tag in source song copy it to target and save.
    for Tag in TagSource:
        if TagSource[Tag] != TagTarget[Tag]:
            Altered += 1
            TagTarget[Tag] = TagSource[Tag]
            TagTarget.save(TgtFile)
    return(Altered)


# Simply does directory formatting to make things easier.
def format_dir(Main, Second, Third=""):
    # Replaces \ with /
    Main = Main.replace('\\', '/')
    # Adds a / to the end of Main and concatenates Main and Second.
    if(Main[len(Main) - 1] != '/'):
        Main += '/'
    Main += Second + '/'
    # Concatenates Main and Third if necessary.
    if (Third):
        Main += Third + '/'
    return (Main)


# Sync main folders in lib with dev.
sync(lib, dev)

# For every Artist in lib sync it's Albums
for Artist in listdir(lib):
    sync(format_dir(lib, Artist), format_dir(dev, Artist))
    # For every Album in Artist match songs
    for Album in listdir(format_dir(lib, Artist)):
        # Declares lib Album and dev Album to make function calls shorter.
        CurrentAlbum = format_dir(lib, Artist, Album)
        CoAlbum = format_dir(dev, Artist, Album)
        for Song in listdir(CurrentAlbum):
            if (".flac" or ".FLAC" in Song):
                try:
                    # Tries to match lib and dev song's metadata.
                    match_metadata(CurrentAlbum + Song, CoAlbum + Song)
                except:
                    # If that fails will try to fix both Filename and Tag
                    # fields.
                    check_song(CurrentAlbum + Song, CoAlbum)
                    fix_metadata(CurrentAlbum + Song, CoAlbum + Song)
                    try:
                        # Try again after fix.
                        match_metadata(CurrentAlbum + Song, CoAlbum + Song)
                    except Exception as e:
                        # If it still doesn't work there's black magic in place
                        # go sleep, drink a beer and try again later.
                        print("""Ehm, something happened and your sync failed.\n
                              Error:{}""".format(e))
                        raise SystemExit(0)
Try this:
Songs = ["a.flac", "a.mp3", "b.FLAC"]
flac_files = [s for s in Songs if s.lower().endswith('.flac')]
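The underlying problem with the original check, shown as a tiny standalone example (illustrative names only): Python parses ".flac" or ".FLAC" in Song as ".flac" or (".FLAC" in Song), and the non-empty string ".flac" is always truthy, so the test passes for every file, including the cover art.

Song = "cover.jpg"

print(".flac" or ".FLAC" in Song)          # prints '.flac' -> truthy, so the jpg slips through
print(".flac" in Song or ".FLAC" in Song)  # prints False   -> filtered out correctly
print(Song.lower().endswith(".flac"))      # prints False   -> the endswith form above, also correct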
As pointed out by #EliKorvigo, the error was caused by a simple mistake in the if condition; the fix looks as follows:
for Song in listdir(CurrentAlbum):
    if (".flac" in Song or ".FLAC" in Song):
        try:
            # Tries to match lib and dev song's metadata.
            match_metadata(CurrentAlbum + Song, CoAlbum + Song)
        except:
            # If that fails will try to fix both Filename and Tag
            # fields.
            check_song(CurrentAlbum + Song, CoAlbum)
            fix_metadata(CurrentAlbum + Song, CoAlbum + Song)
            try:
                # Try again after fix.
                match_metadata(CurrentAlbum + Song, CoAlbum + Song)
            except Exception as e:
                # If it still doesn't work there's black magic in place
                # go sleep, drink a beer and try again later.
                print("""Ehm, something happened and your sync failed.\n
                      Error:{}""".format(e))
                raise SystemExit(0)

Python Codecs module register error infinite loop

I download a file that may contain Chinese characters and I want to convert it to UTF-8.
In some cases there are characters that cannot be converted, and I replace each such character with a replacement question mark '�'.
I created a program that receives two parameters: one is the file path, the other is the source file's encoding.
In some cases this code ends in an infinite loop, and I have no idea why.
What is causing this infinite loop?
Source Code:
import codecs
import io
import sys


class ErrorHandler:
    def __init__(self, file_path):
        self.file_path = file_path
        self.previous_end_position = -65535
        self.error_threshold = 0

    def error_handler(self, exception):
        if exception.start == self.previous_end_position + 1:
            self.error_threshold += 1
            if self.error_threshold >= 64:
                raise exception
        else:
            print("Start:" + str(exception.start))
            print("End:" + str(exception.end))
        self.previous_end_position = exception.end
        return ("�", -1,)


src_path = sys.argv[1]
try:
    src_ext = src_path[src_path.rindex("."):]
    dest_path = src_path[:src_path.rindex(".")] + "_utf8" + src_ext
except:
    dest_path = src_path + "_utf8"

src_encoding = sys.argv[2]
codecs.register_error("myreplace", ErrorHandler(src_path).error_handler)

with io.TextIOWrapper(open(src_path, "rb"), encoding=src_encoding, errors="myreplace") as src, open(dest_path, "w") as dest:
    for line in src:
        dest.write(line)
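For comparison, here is a minimal error handler that follows the contract codecs.register_error expects (a generic sketch, not the poster's code): the handler must return the replacement text together with the absolute position at which decoding should resume, normally exception.end; a negative return position is treated as counting from the end of the input, which is easy to trip over.

import codecs

def replace_with_marker(exception):
    # Substitute the undecodable byte(s) and resume right after them.
    return ("�", exception.end)

codecs.register_error("marker", replace_with_marker)

print(b"abc\x9cdef".decode("utf-8", errors="marker"))  # -> abc�def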

Reading from multiple files and storing data in a list

I am trying to search for all files in a directory, read each one, and store its contents in a list to be used later.
My problem is that when I use print to debug whether the file exists, it prints out the current (first) file in the list. However, it complains that the file is not found when I try to read from it.
import re
import os

# Program to extract emails from text files


def path_file():
    #path = raw_input("Please enter path to file:\n> ")
    path = '/home/holy/thinker/leads/'
    return os.listdir('/home/holy/thinker/leads')  # returns a list like ["file1.txt", 'image.gif']  # need to remove trailing slashes


# read a file as 1 big string
def in_file():
    print path_file()
    content = []
    for a_file in path_file():  # ['add.txt', 'email.txt']
        print a_file
        fin = open(a_file, 'r')
        content.append(fin.read())  # store content of each file
        print content
        fin.close()
    return content

print in_file()
# this is the error i get
""" ['add.txt', 'email.txt']
add.txt
Traceback (most recent call last):
File "Extractor.py", line 24, in <module>
print in_file()
File "Extractor.py", line 17, in in_file
fin = open(a_file, 'r')
IOError: [Errno 2] No such file or directory: 'add.txt'
"""
The error I get is above.
os.listdir returns only the file names. You have to put the directory name in front of each file name.
It is trying to open add.txt in the same directory where you ran your program. Please add the directory name before the file name.
def path_file():
    #path = raw_input("Please enter path to file:\n> ")
    path = '/home/holy/thinker/leads/'
    return [os.path.join(path, x) for x in os.listdir(path)]
You should use the full path of the file you want to read,
so please do fin = open(os.path.join(r'/home/holy/thinker/leads/', a_file), 'r')
Here's a rewrite using glob to limit which files are considered:

import glob
import os
import re
import sys

if sys.hexversion < 0x3000000:
    # Python 2.x
    inp = raw_input
else:
    # Python 3.x
    inp = input


def get_dir(prompt):
    while True:
        dir_name = inp(prompt)
        dir_name = os.path.join(os.getcwd(), dir_name)
        if os.path.isdir(dir_name):
            return dir_name
        else:
            print("{} does not exist or is not a directory".format(dir_name))


def files_in_dir(dir_name, file_spec="*.txt"):
    return glob.glob(os.path.join(dir_name, file_spec))


def file_iter(files):
    for fname in files:
        with open(fname) as inf:
            yield fname, inf.read()


def main():
    email_dir = get_dir("Please enter email directory: ")
    email_files = files_in_dir(email_dir, "*.eml")
    print(email_files)
    content = [txt for fname, txt in file_iter(email_files)]
    print(content)

if __name__ == "__main__":
    main()
and a trial run looks like
Please enter email directory: c:\temp
['c:\\temp\\file1.eml', 'c:\\temp\\file2.eml']
['file1 line one\nfile1 line two\nfile1 line three',
'file2 line one\nfile2 line two']
