I am very new to python and SO. The script opens xml files inside of a folder. Using os.walk I iterate over the collection and open the file and then calls the function to iterate over the xml file and update the xml file rewriting the updated file over the original using .writexml. the problem is when i run this program from the command line the it says there is an error
Traceback (most recent call last):
File "./XMLParser.py", line 67, in <module>
xmldoc = minidom.parse(xml)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/dom/minidom.py", line 1918, in parse
return expatbuilder.parse(file)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/dom/expatbuilder.py", line 928, in parse
result = builder.parseFile(file)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/dom/expatbuilder.py", line 207, in parseFile
parser.Parse(buffer, 0)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 5614: ordinal not in range(128)
CODE:
from xml.dom import minidom
import os
import codecs
'''
Function to iterate over the directory that contains the work items
params:
CoreID of new author,
x is the path to the workItem.xml file,
p is the path to the workItem.xml that will be overwritten with new data
'''
def changeauthor(coreid, x, p):
# Gets the content of the xml based within the work item tag.
testcase = x.getElementsByTagName("work-item")[0]
# All fields are stored as a <field> tag with the id attribute being the
# differentiators between them. Fields is a list of all the field tags in the
# document.
fields = testcase.getElementsByTagName("field")
# Loop iterates over the field tags and looks for the one tag where the id
# attribute has a value of author. when this tag is found the tags value is
# updated to the core id passed to the function.
for field in fields:
attribute = field.attributes['id'].value
if attribute == "author":
# print the current author.
print("Previous Author: ", field.firstChild.data)
# Set the author to the core id entered into the script
field.firstChild.data = coreid
# Print the updated author field
print("New Author: ", field.firstChild.data)
# Create a temp file with the same path as the source
tmp_config = p
# Open the new temp file with the write mode set.
with codecs.open(tmp_config, 'w', "utf-8") as f:
# f = open(tmp_config, 'w')
# Write the xml into the file at the same location as the orginal
x.writexml(f)
# Close the file
# f.close()
return
while True:
core = str(input("Enter Core ID of the new author: "))
core = core.upper()
spath = str(input("Please enter the full path to the directory of test cases: "))
count = 0
confirm = str(input("Confirm path and core id (Y/N or Exit to leave script): "))
confirm = confirm.upper()
if confirm == "Y":
'''Hard code path here and comment out line above asking for input either will work.'''
# spath = "/Users/Evan/Desktop/workitems-r583233"
# Loop iterates over the directory. Whenever a workitem.xml file is found the path is stored and the file is
# parsed. the core ID entered and the path as well as the parsed xml doc are passed to the change author
# function.
for roots, dirs, files in os.walk(spath):
for file in files:
title = file.title()
if title == "Workitem.Xml":
path = os.path.join(roots, file)
with codecs.open(path, 'r+', "utf-8") as xml:
xmldoc = minidom.parse(xml)
lst = path.split('/')
wi = lst[5]
print("Updating: ", wi)
changeauthor(core, xmldoc, path)
count += 1
print(wi, "updated succesfully.")
print("-------------------------------")
if count > 0:
# Print how many test cases were updated.
print("All Done", count, "workItems updated!")
else:
print("Please double check path and try again no workItems found to update!")
elif confirm == "N":
continue
elif confirm == "EXIT":
break
Related
I'm very green when it comes to Python, so please forgive my disgusting formatting or poor optimization.
I'm trying to write a script to sort files into new folders based on their name.
In order to match their name to the correct new location, I have a csv file with two columns; the first is part of the name of the file, and the second is the correct folder it belongs in.
So far I have everything written to extract the parts of the file names I need, but now I'm stuck as to how I can match the strings I have to a value in the csv, and then extract the adjacent column.
This is what I have so far:
import os
import csv
def openCSV(csvFile):
file = open(csvFile)
reader = csv.DictReader(file)
data = list(reader)
return data
def findDemoName(fileName):
demoName = fileName[16:]
demoName = demoName[:-11]
return demoName
def moveFiles(sortingFile, sourceDirectory, destinationDirectory):
sortingCSV = openCSV(sortingFile)
srcDir = sourceDirectory
destDir = destinationDirectory
for filename in os.listdir(srcDir):
name = findDemoName(filename)
print(name)
# begin program
if __name__ == "__main__":
# set the CSV used to sort the files
fileToSortFrom = '<csv used for sorting>'
inputDirectory = '<where the files are located>'
outputDirectory = '<where I want to move the files>'
moveFiles(fileToSortFrom, inputDirectory, outputDirectory)
Right now it just prints the extracted portion of the file name and prints it so I could make sure it was doing what I wanted.
So my next steps are
1. Match the extracted portion of the file name to a matching value in the first column of the csv
2. Take the value adjacent to the match and use it to complete the destination path for the file to be moved to
I found this thread match names in csv file to filename in folder, but I don't understand where in the answer the csv is being matched to.
If I need to clear up some points let me know and I will.
Thank you in advance for reading :)
EDIT:
I've tried to stumble my way through this, and here's what I have so far:
import os, shutil
import csv
def openCSV(csvFile):
file = open(csvFile)
reader = csv.DictReader(file)
data = list(reader)
return data
"""def createReader(csvFile):
file = open(csvFile)
reader = csv.DictReader(file)
return reader"""
def extractDemoName(fileName):
originalName = fileName
demoName = fileName[16:]
demoName = demoName[:-11]
return demoName
def moveFiles(sortingFile, sourceDirectory, destinationDirectory, prefix, suffix):
reader = openCSV(sortingFile)
#reader = createReader(sortingFile)
srcDir = sourceDirectory
destDir = destinationDirectory
column1 = 'DemographicName'
column2 = 'DemographicTypeName'
folder = ''
for filename in os.listdir(srcDir):
name = extractDemoName(filename)
for row in reader:
if row(column1) == name:
folder = row(column2)
destination = destDir + folder
file = prefix + name + suffix
shutil.copy(file, destination)
print('Moved ' + file + ' to ' + destination)
#else reader.next()
print(name)
# begin program
if __name__ == "__main__":
# set the CSV used to sort the files
fileToSortFrom = '<csv file>'
inputDirectory = '<source path>'
outputDirectory = '<destination path>'
filePrefix = '<beginning text of files>'
fileSuffix = '<ending text of files>'
moveFiles(fileToSortFrom, inputDirectory, outputDirectory, filePrefix, fileSuffix)
But now I'm receiving the following error instead:
Traceback (most recent call last):
File "script.py", line 63, in <module>
moveFiles(fileToSortFrom, inputDirectory, outputDirectory, filePrefix, fileSuffix)
File "script.py", line 38, in moveFiles
if row(column1) == name:
TypeError: 'collections.OrderedDict' object is not callable
There is the problem (line 38)
if row(column1) == name:
it should be
if row[column1] == name:
I haven't checked any other logic in the script :)
This script reads the files from the directory you pass in method move_files's from_dir.
It checks if the file in the from_dir exists in the csv_file and if it does, it gets the location and moves it to that directory.
import os
import csv
import shutil
def get_file_sorter_dict(csv_file):
return dict(list(csv.reader(open(csv_file))))
def move_files(csv_file, from_dir, to_dir):
file_sorter_dict = get_file_sorter_dict(csv_file)
for filename in os.listdir(from_dir):
if file_sorter_dict.get(filename):
# you can use the location to move the file from csv_file
# move_to = file_sorter_dict.get(filename)
# shutil.move(filename, move_to)
# or you can use to_dir to move the file.
shutil.move(filename, to_dir)
if __name__ == "__main__":
move_files('files_sorter.csv', '.', '../')
The csv I am using looks like:
name, location
"foo.txt","../"
"baz.txt","../"
I have a script that modifies data in a django app.I have data in an excel file that i process then update my models with it, some of the data is in Arabic and when i execute the script i get the following error:
Traceback (most recent call last):
File "script.py", line 77, in <module>
update_locations(path)
File "script.py", line 36, in update_locations
household.location = new_location
File "/data/envs/ve.maidea/lib/python2.7/site-packages/django/db/models/fields/related_descriptors.py", line 207, in __set__
self.field.remote_field.model._meta.object_name,
ValueError: Cannot assign "'\xd8\xa7\xd9\x84\xd8\xa8\xd8\xad\xd9\x8a\xd8\xb1\xd9\x87'": "Household.location" must be a "Location" instance.
I think the error is been raised by these Arabic characters.
here is my script:
import django
django.setup()
import sys
reload(sys) # to re-enable sys.setdefaultencoding()
sys.setdefaultencoding('utf-8')
import xlrd
from django.db import transaction
from foodnet.apps.registration.models import Household
from geo.models import Location
log_file = "/opt/cv_instances/cv1/autodeploy/branches/nboreports/maidea/egypt/data_import_files/egypt_beheira_locations.txt"
logfile_to_write = open(log_file, "w")
def process_file(path):
book = xlrd.open_workbook(path)
print("Got {0} number of sheets.".format(book.nsheets))
hh_counter = 0
for sheet_num in range(book.nsheets-1, -1, -1):
sheet = book.sheet_by_index(sheet_num)
print("Processing sheet number {0} ({1})".format(sheet_num, sheet.name))
for row_idx in range(1, sheet.nrows):
with transaction.atomic():
try:
household_name = str(sheet.row_values(row_idx)[0]).strip().replace(".0","")
# old_location = str(sheet.row_values(row_idx)[1]).strip().replace(".0","")
new_location = str(sheet.row_values(row_idx)[2]).strip().replace(".0","")
if household_name:
household = Household.objects.get(office__slug='eg-co',name=household_name)
# print(household.name, household.location)
#update new locations
household.location = new_location
household.save()
hh_counter += 1
logfile_to_write.write("Household {0} updated to location {1}".format(household, household.location))
except Household.DoesNotExist:
continue
print("Done looping and updating locations")
print("================================================================================================================================")
def delete_old_locations(path):
"""
Delete old locations no longer needed by the country office
"""
book = xlrd.open_workbook(path)
print("Got {0} number of sheets.".format(book.nsheets))
location_counter = 0
for sheet_num in range(book.nsheets-1, -1, -1):
sheet = book.sheet_by_index(sheet_num)
print("Processing sheet number {0} ({1})".format(sheet_num, sheet.name))
for row_idx in range(1, sheet.nrows):
with transaction.atomic():
try:
old_location = str(sheet.row_values(row_idx)[1]).strip().replace(".0","")
if old_location:
location = Location.objects.get(country__name="Egypt", name=old_location)
# print(location.name, location.country)
location.delete()
location_counter += 1
logfile_to_write.write("Location {0} deleted ".format(location))
except Location.DoesNotExist:
continue
print("Done looping and deleting locations")
print("================================================================================================================================")
#call the our process file method
if __name__=="__main__":
path = "/opt/cv_instances/cv1/autodeploy/branches/nboreports/maidea/egypt/data_import_files/egypt-sf-beheira-enrolments.xlsx"
process_file(path)
delete_old_locations(path)
print("Done processing file")
I kindly need advice on the best way of printing this arabic characters.Thanks in advance.
This has nothing to do with Arabic characters. As the error says, you need to assign an instance of Location there, not a string.
All, I am just getting started with python and I thought this may be a good time to see if it can help me automate a lot of repeative tasks I have to complete.
I am using a script I found on Gethub that will search and replace and then write a new file with the name output.txt. It works fine, but Since I have lots of these files I need to be able to name them different names based on the Text in the final modified document.
To make this a little more difficult the name of the file is based on the text I will be modifing the document with.
So, basically after I run this script, I have a file that sits at C:\Program Files (x86)\Python35-32\Scripts\Text_Find_and_Replace\Result with the name of output.txt in this Modified new file I would like to name it based on what text is in a particular line of the file. So in the modified file of output.txt I would like to have it rename the file to the plain text in line 35.
I have figured out how to read the line within the file using
import linecache
line = linecache.getline("readme.txt", 1)
line
>>> line
'This is Python version 3.5.1\n'
I just need to figure out how to rename the file based on the variable "line"
Any Ideas?
#!/usr/bin/python
import os
import sys
import string
import re
## information/replacingvalues.txt this is the text of the values you want in your final document
information = open("C:\Program Files (x86)\Python35- 32\Scripts\Text_Find_and_Replace\information/replacingvalues.txt", 'r')
#Text_Find_and_Replace\Result\output.txt This is the dir and the sum or final document
output = open("C:\Program Files (x86)\Python35-32\Scripts\Text_Find_and_Replace\Result\output.txt", 'w')
#field = open("C:\Program Files (x86)\Python35- 32\Scripts\Text_Find_and_Replace\Field/values.txt"
# Field is the file or words you will be replacing
field = open("C:\Program Files (x86)\Python35- 32\Scripts\Text_Find_and_Replace\Field/values.txt", 'r')
##
##
# modified code for autohot key
# Text_Find_and_Replace\Test/remedy line 1.ahk is the original doc you want modified
with open("C:\Program Files (x86)\Python35- 32\Scripts\Text_Find_and_Replace\Test/remedy line 1.ahk", 'r') as myfile:
inline = myfile.read()
#orig code
##with open("C:\Program Files (x86)\Python35- 32\Scripts\Text_Find_and_Replace\Test/input.txt", 'r') as myfile:
## inline = myfile.read()
informations = []
fields = []
dictionary = {}
i = 0
for line in information:
informations.append(line.splitlines())
for lines in field:
fields.append(lines.split())
i = i+1;
if (len(fields) != len(informations) ):
print ("replacing values and values have different numbers")
exit();
else:
for i in range(0, i):
rightvalue = str(informations[i])
rightvalue = rightvalue.strip('[]')
rightvalue = rightvalue[1:-1]
leftvalue = str(fields[i])
leftvalue = leftvalue.strip('[]')
leftvalue = leftvalue.strip("'")
dictionary[leftvalue] = rightvalue
robj = re.compile('|'.join(dictionary.keys()))
result = robj.sub(lambda m: dictionary[m.group(0)], inline)
output.write(result)
information.close;
output.close;
field.close;
I figured out how...
import os
import linecache
linecache.clearcache()
newfilename= linecache.getline("C:\python 3.5/remedy line 1.txt",37)
filename = ("C:\python 3.5/output.ahk")
os.rename(filename, newfilename.strip())
linecache.clearcache()
I'm working on this piece of code and this weird bug showed up on the Try command near the end of the code. The whole script is aimed towards .flac files, and sometimes it'd read .jpg files in the folders and blow up. Simply enough I went ahead and added if (".flac" or ".FLAC" in Song): before the Try, this way easily enough it would only process the correct filetype. However this made absolutely no difference and I kept on getting the following error
Traceback (most recent call last):
File ".\musync.py", line 190, in <module>
match_metadata(CurrentAlbum + Song, CoAlbum + Song)
File ".\musync.py", line 152, in match_metadata
TagSource = FLAC(SrcFile)
File "C:\Python34\lib\site-packages\mutagen\_file.py", line 41, in __init__
self.load(filename, *args, **kwargs)
File "C:\Python34\lib\site-packages\mutagen\flac.py", line 721, in load
self.__check_header(fileobj)
File "C:\Python34\lib\site-packages\mutagen\flac.py", line 844, in __check_header
"%r is not a valid FLAC file" % fileobj.name)
mutagen.flac.FLACNoHeaderError: 'C:/Users/berna/Desktop/Lib/Andrew Bird/Armchair Apocrypha/cover.jpg' is not a valid FLAC file
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File ".\musync.py", line 194, in <module>
check_song(CurrentAlbum + Song, CoAlbum)
File ".\musync.py", line 83, in check_song
TagSource = FLAC(SrcFile)
File "C:\Python34\lib\site-packages\mutagen\_file.py", line 41, in __init__
self.load(filename, *args, **kwargs)
File "C:\Python34\lib\site-packages\mutagen\flac.py", line 721, in load
self.__check_header(fileobj)
File "C:\Python34\lib\site-packages\mutagen\flac.py", line 844, in __check_header
"%r is not a valid FLAC file" % fileobj.name)
mutagen.flac.FLACNoHeaderError: 'C:/Users/berna/Desktop/Lib/Andrew Bird/Armchair Apocrypha/cover.jpg' is not a valid FLAC file
Why is the if condition not doing it's job and how can I fix this? Code Is currently as follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import shutil
import os
from mutagen.flac import FLAC # Used for metadata handling.
from os import listdir # Used for general operations.
from fuzzywuzzy import fuzz # Last resource name association.
# Insert here the root directory of your library and device respectively.
lib = 'C:/Users/berna/Desktop/Lib/'
dev = 'C:/Users/berna/Desktop/Dev/'
# Faster file copying function, arguments go as follows: Source file location,
# target directory, whether to keep the filename intact and whether to create
# the target directory in case it doesn't exist.
def copy_file(SrcFile, TgtDir, KeepName=True, MakeDir=True):
SourceFile = None
TargetFile = None
KeepGoing = False
# Checks is TgtDir is valid and creates if needed.
if MakeDir and not os.path.isdir(TgtDir):
os.makedirs(TgtDir)
# Processes TgtDir depending on filename choice.
if KeepName is True:
TgtDir += os.path.basename(SrcFile)
print(TgtDir)
try:
SourceFile = open(SrcFile, 'rb')
TargetFile = open(TgtDir, 'wb')
KeepGoing = True
Count = 0
while KeepGoing:
# Read blocks of size 2**20 = 1048576
Buffer = SourceFile.read(2 ** 20)
if not Buffer:
break
TargetFile.write(Buffer)
Count += len(Buffer)
finally:
if TargetFile:
TargetFile.close()
if SourceFile:
SourceFile.close()
return KeepGoing
# XXX TODO
# Copies a directory (SrcDir) to TgtDir, if Replace is True will delete same
# name directory and replace with new one.
def copy_tree(SrcDir, TgtDir, Replace=True):
if not os.path.isdir(TgtDir):
os.makedirs(TgtDir)
Target = format_dir(TgtDir, os.path.basename(SrcDir))
if os.path.isdir(Target) and Replace:
shutil.rmtree(Target)
if not os.path.isdir(Target):
os.makedirs(Target)
for File in listdir(SrcDir):
FileDir = format_dir(SrcDir, File)
# copy_file(FileDir, Tgt)
return()
# Checks for new and deleted folders and returns their name.
def check_folder(SrcDir, TgtDir):
# Lists Source and Target folder.
Source = listdir(SrcDir)
Target = listdir(TgtDir)
# Then creates a list of deprecated and new directories.
Deleted = [FileName for FileName in Target if FileName not in Source]
Added = [FileName for FileName in Source if FileName not in Target]
# Returns both lists.
return (Added, Deleted)
# Checks for song in case there's a name mismatch or missing file.
def check_song(SrcFile, TgtDir):
Matches = []
# Invariably the new name will be that of the source file, the issue here
# is finding which song is the correct one.
NewName = TgtDir + '/' + os.path.basename(SrcFile)
TagSource = FLAC(SrcFile)
# Grabs the number of samples in the original file.
SourceSamples = TagSource.info.total_samples
# Checks if any song has a matching sample number and if true appends the
# song's filename to Matches[]
for Song in listdir(TgtDir):
SongInfo = FLAC(TgtDir + '/' + Song)
if (SongInfo.info.total_samples == SourceSamples):
Matches.append(Song)
# If two songs have the same sample rate (44100Hz for CDs) and the same
# length it matches them to the source by filename similarity.
if (Matches.count > 1):
Diffs = []
for Song in Matches:
Diffs.append(fuzz.ratio(Song, os.path.basename(SrcFile)))
if (max(Diffs) > 0.8):
BestMatch = TgtDir + '/' + Matches[Diffs.index(max(Diffs))]
os.rename(BestMatch, NewName)
else:
shutil.copy(SrcFile, TgtDir)
# If there's no match at all simply copy over the missing file.
elif (Matches.count == 0):
shutil.copy(SrcFile, TgtDir)
# If a single match is found the filename will be the first item on the
# Matches[] list.
else:
os.rename(TgtDir + '/' + Matches[0], NewName)
# Syncs folders in a directory and return the change count.
def sync(SrcDir, TgtDir):
AddCount = 0
DeleteCount = 0
# Grabs the folders to be added and deleted.
NewDir, OldDir = check_folder(SrcDir, TgtDir)
# Checks if any and then does add/rm.
if OldDir:
for Folder in OldDir:
shutil.rmtree(TgtDir + Folder)
DeleteCount += 1
if NewDir:
for Folder in NewDir:
shutil.copytree(SrcDir + Folder, TgtDir + Folder)
AddCount += 1
return(AddCount, DeleteCount)
# Fixes missing metadata fields.
def fix_metadata(SrcFile, TgtFile):
TagSource = FLAC(TgtFile)
TagTarget = FLAC(SrcFile)
# Checks for deleted tags on source file and deletes them from target.
if (set(TagTarget) - set(TagSource)):
OldTags = list(set(TagTarget) - set(TagSource))
for Tag in OldTags:
# TODO Right now I haven't quite figured out how to delete
# specific tags, so workaround is to delete them all.
TagTarget.delete()
# Checks for new tags on source file and transfers them to target.
if (set(TagSource) != set(TagTarget)):
NewTags = list(set(TagSource) - set(TagTarget))
for Tag in NewTags:
TagTarget["%s" % Tag] = TagSource[Tag]
TagTarget.save(TgtFile)
# Does metadata transfer between two files.
def match_metadata(SrcFile, TgtFile):
Altered = 0
TagSource = FLAC(SrcFile)
TagTarget = FLAC(TgtFile)
# For every different Tag in source song copy it to target and save.
for Tag in TagSource:
if TagSource[Tag] != TagTarget[Tag]:
Altered += 1
TagTarget[Tag] = TagSource[Tag]
TagTarget.save(TgtFile)
return(Altered)
# Simply does directory formatting to make things easier.
def format_dir(Main, Second, Third=""):
# Replaces \ with /
Main = Main.replace('\\', '/')
# Adds a / to the end of Main and concatenates Main and Second.
if(Main[len(Main) - 1] != '/'):
Main += '/'
Main += Second + '/'
# Concatenates Main and Third if necessary.
if (Third):
Main += Third + '/'
return (Main)
# Sync main folders in lib with dev.
sync(lib, dev)
# For every Artist in lib sync it's Albums
for Artist in listdir(lib):
sync(format_dir(lib, Artist), format_dir(dev, Artist))
# For every Album in Artist match songs
for Album in listdir(format_dir(lib, Artist)):
# Declares lib Album and dev Album to make function calls shorter.
CurrentAlbum = format_dir(lib, Artist, Album)
CoAlbum = format_dir(dev, Artist, Album)
for Song in listdir(CurrentAlbum):
if (".flac" or ".FLAC" in Song):
try:
# Tries to match lib and dev song's metadata.
match_metadata(CurrentAlbum + Song, CoAlbum + Song)
except:
# If that fails will try to fix both Filename and Tag
# fields.
check_song(CurrentAlbum + Song, CoAlbum)
fix_metadata(CurrentAlbum + Song, CoAlbum + Song)
try:
# Try again after fix.
match_metadata(CurrentAlbum + Song, CoAlbum + Song)
except Exception as e:
# If it still doesn't work there's black magic in place
# go sleep, drink a beer and try again later.
print("""Ehm, something happened and your sync failed.\n
Error:{}""".format(e))
raise SystemExit(0)
Try it:
Songs = ["a.flac", "a.mp3", "b.FLAC"]
flac_files = [s for s in Songs if s.lower().endswith('.flac')]
As pointed by #EliKorvigo the error was caused by a simple miswriting in the if condition, fix looks as follows:
for Song in listdir(CurrentAlbum):
if (".flac" in Song or ".FLAC" in Song):
try:
# Tries to match lib and dev song's metadata.
match_metadata(CurrentAlbum + Song, CoAlbum + Song)
except:
# If that fails will try to fix both Filename and Tag
# fields.
check_song(CurrentAlbum + Song, CoAlbum)
fix_metadata(CurrentAlbum + Song, CoAlbum + Song)
try:
# Try again after fix.
match_metadata(CurrentAlbum + Song, CoAlbum + Song)
except Exception as e:
# If it still doesn't work there's black magic in place
# go sleep, drink a beer and try again later.
print("""Ehm, something happened and your sync failed.\n
Error:{}""".format(e))
raise SystemExit(0)
I am getting this error.
I am not sure whether it is my mistake or something else.
I am on python 3.X version right now.
Traceback (most recent call last):
File "/Users/Administrator/Desktop/A2_b/author_program.py", line 104, in <module>
signature = read_signature(dir_name + "/" + this_file)
File "/Users/Administrator/Desktop/A2_b/author_program.py", line 48, in read_signature
result = [sig_file.readline().strip()]
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/encodings/ascii.py", line 26, in decode
return codecs.ascii_decode(input, self.errors)[0]
UnicodeDecodeError: 'ascii' codec can't decode byte 0x80 in position 3131: ordinal not in range(128)
Here is the code that gives me this error. I only had to complete in this file first and second function.
import author_functions, os.path
def get_valid_filename(msg):
""" (str) -> str
Prompt the user, using msg, to type the name of a file. This file should
exist in the same directory as the starter code. If the file does not
exist, keep re-prompting until they give a valid filename.
Return the name of that file.
"""
filename = input(msg)
while not os.path.exists(filename):
print("That file does not exist.")
filename = input(msg)
return filename
def get_valid_directory_name(msg):
""" (str) -> str
Prompt the user, using msg, to type the name of a directory. If
the directory does not exist, keep re-prompting until they give a valid
directory.
Return the name of that directory.
"""
dirname = input(msg)
while not os.path.isdir(dirname):
print("That directory does not exist.")
dirname = input(msg)
return dirname
### Provided helper function ###
def read_signature(filename):
""" (str) -> list
Read a linguistic signature from filename and return it as
a list of features.
"""
sig_file = open(filename, 'r')
# Read the first feature.
result = [sig_file.readline().strip()]
# Read each remaining feature and convert each one to float.
for line in sig_file:
result.append(float(line.strip()))
sig_file.close()
return result
# #############################
# The main program begins here
# #############################
if __name__ == '__main__':
prompt = 'Enter the name of the file with unknown author: '
mystery_filename = get_valid_filename(prompt)
prompt = 'Enter the name of the directory of signature files: '
dir_name = get_valid_directory_name(prompt)
# Every file in the dir_name directory must be a linguistic signature.
# We assume there is a minimum of one file.
files = os.listdir(dir_name)
# ####################################################################
# The following code parses the mystery file and calculates its
# linguistic signature.
# ####################################################################
mystery_file = open(mystery_filename, 'r')
# readlines() gives us a list of strings, one for each line of the file
text = mystery_file.readlines()
mystery_file.close()
# Calculate the signature for the mystery file
mystery_signature = [mystery_filename]
mystery_signature.append(author_functions.avg_word_length(text))
mystery_signature.append(author_functions.type_token_ratio(text))
mystery_signature.append(author_functions.hapax_legomena_ratio(text))
mystery_signature.append(author_functions.avg_sentence_length(text))
mystery_signature.append(author_functions.avg_sentence_complexity(text))
# ####################################################
# The following code reads the linguistic signatures,
# compares them with the mystery_signature,
# and reports the author that was the best match.
# ####################################################
# Weights of linguistic features.
weights = [0, 11, 33, 50, 0.4, 4]
# We assume there is at least one signature in the dir_name directory
this_file = files[0]
signature = read_signature(dir_name + "/" + this_file)
best_score = author_functions.compare_signatures(mystery_signature,
signature, weights)
best_author = signature[0]
for this_file in files[1:]:
signature = read_signature(dir_name + "/" + this_file)
score = author_functions.compare_signatures(mystery_signature,
signature, weights)
if score < best_score:
best_score = score
best_author = signature[0]
if type(best_score) != float:
print("Error! No score could be computed")
else:
print("Best author match:", best_author, "with score", best_score)
try sig_file = open(filename, 'rb')
the b means there is binary data in the file (not just ascii)
that will probably resolve your issue