How to convert a file to utf-8 in Python? - python

I need to convert a bunch of files to utf-8 in Python, and I have trouble with the "converting the file" part.
I'd like to do the equivalent of:
iconv -t utf-8 $file > converted/$file # this is shell code
Thanks!

You can use the codecs module, like this:
import codecs

BLOCKSIZE = 1048576  # or some other, desired size in bytes

# Decode the source with its own encoding and re-encode it as UTF-8,
# copying one chunk at a time instead of loading the whole file.
with codecs.open(sourceFileName, "r", "your-source-encoding") as sourceFile, \
        codecs.open(targetFileName, "w", "utf-8") as targetFile:
    contents = sourceFile.read(BLOCKSIZE)
    while contents:  # an empty result means end of file
        targetFile.write(contents)
        contents = sourceFile.read(BLOCKSIZE)
EDIT: added BLOCKSIZE parameter to control file chunk size.

This worked for me in a small test:
sourceEncoding = "iso-8859-1"
targetEncoding = "utf-8"

# Work on raw bytes ("rb"/"wb") so the only transformation applied is the
# explicit decode/encode below, and let `with` close both files.
# bytes.decode() works on Python 2 and 3, unlike the Python 2-only unicode().
with open("source", "rb") as source, open("target", "wb") as target:
    target.write(source.read().decode(sourceEncoding).encode(targetEncoding))

Thanks for the replies, it works!
And since the source files are in mixed formats, I added a list of source formats to be tried in sequence (sourceFormats), and on UnicodeDecodeError I try the next format:
from __future__ import with_statement
import os
import sys
import codecs
from chardet.universaldetector import UniversalDetector
targetFormat = 'utf-8'
outputDir = 'converted'
detector = UniversalDetector()
def get_encoding_type(current_file):
    """Return the encoding name the shared chardet detector reports for a file.

    current_file: path of the file to inspect.

    The file is fed to the module-level ``detector`` line by line and
    reading stops as soon as the detector reports it is done.
    NOTE(review): chardet can report ``None`` when it cannot decide;
    callers should be prepared for that.
    """
    detector.reset()
    # Read raw bytes: the detector analyses bytes, not decoded text. `with`
    # also closes the handle, which the Python 2-only file() call never did.
    with open(current_file, 'rb') as source:
        for line in source:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
def convertFileBestGuess(filename):
    """Convert *filename* by trying a fixed list of source encodings in order.

    Each candidate encoding is used to decode the file; the first one that
    decodes without a UnicodeDecodeError wins and the function returns.
    If every candidate fails, an error message is printed.
    """
    sourceFormats = ['ascii', 'iso-8859-1']
    for sourceFormat in sourceFormats:
        try:
            # Use the `filename` parameter (the body used to read an
            # undefined `fileName`). Plain 'r' is enough: codecs.open reads
            # bytes and decodes them itself, and the 'U' flag is rejected by
            # Python 3.11+.
            with codecs.open(filename, 'r', sourceFormat) as sourceFile:
                writeConversion(sourceFile)
                print('Done.')
                return
        except UnicodeDecodeError:
            # Wrong guess: fall through to the next candidate encoding.
            pass
    print("Error: failed to convert '" + filename + "'.")
def convertFileWithDetection(fileName):
    """Convert *fileName* using the encoding reported by get_encoding_type().

    Prints a progress line, then either 'Done.' on success or an error
    message when the encoding could not be detected or used.
    """
    print("Converting '" + fileName + "'...")
    detectedFormat = get_encoding_type(fileName)
    # With encoding=None codecs.open would hand back an undecoded file, so
    # treat "nothing detected" as a failure instead of converting garbage.
    if detectedFormat is not None:
        try:
            # Plain 'r': the 'U' flag is rejected by Python 3.11+.
            with codecs.open(fileName, 'r', detectedFormat) as sourceFile:
                writeConversion(sourceFile)
                print('Done.')
                return
        except (UnicodeDecodeError, LookupError):
            # LookupError: the detected name is not a codec Python knows.
            pass
    print("Error: failed to convert '" + fileName + "'.")
def writeConversion(file, fileName=None):
    """Copy the lines of *file* into ``outputDir``, encoded as ``targetFormat``.

    file: an open, decoding file object (as returned by codecs.open).
    fileName: name of the output file inside ``outputDir``. When omitted it
        is taken from ``file.name`` (codecs stream wrappers forward that
        attribute to the underlying file), so the function no longer depends
        on a global variable called ``fileName`` existing.
    """
    if fileName is None:
        fileName = file.name
    with codecs.open(outputDir + '/' + fileName, 'w', targetFormat) as targetFile:
        for line in file:
            targetFile.write(line)
# Off topic: get the file list and call convertFile on each file
# ...
(EDIT by Rudro Badhon: this incorporates the original try multiple formats until you don't get an exception as well as an alternate approach that uses chardet.universaldetector)

Answer for unknown source encoding type
based on @Sébastien RoccaSerra's answer
python3.6
import os
from chardet import detect
# get file encoding type
def get_encoding_type(file):
    """Return the encoding chardet detects for the file at path *file*.

    The whole file is read as bytes and handed to ``detect`` in one call.
    """
    with open(file, 'rb') as f:
        return detect(f.read())['encoding']
from_codec = get_encoding_type(srcfile)

# add try: except block for reliability
try:
    # newline='' on both sides keeps the original line endings untouched, so
    # only the encoding changes.
    with open(srcfile, 'r', encoding=from_codec, newline='') as f, \
            open(trgfile, 'w', encoding='utf-8', newline='') as e:
        text = f.read()  # for small files, for big use chunks
        e.write(text)
except UnicodeDecodeError:
    print('Decode Error')
    os.remove(trgfile)  # do not leave a partial target file behind
except UnicodeEncodeError:
    print('Encode Error')
    os.remove(trgfile)  # do not leave a partial target file behind
else:
    # Both files are closed here. os.replace swaps the converted file in
    # with a single rename instead of remove-then-rename.
    os.replace(trgfile, srcfile)

You can use this one liner (assuming you want to convert from utf16 to utf8)
python -c "from pathlib import Path; path = Path('yourfile.txt') ; path.write_text(path.read_text(encoding='utf16'), encoding='utf8')"
Where yourfile.txt is a path to your $file.
For this to work you need python 3.4 or newer (probably nowadays you do).
Below is a more readable version of the code above:
from pathlib import Path

path = Path("yourfile.txt")
# Read the text decoded as UTF-16, then overwrite the same file as UTF-8.
text = path.read_text(encoding="utf16")
path.write_text(text, encoding="utf8")

This is a Python 3 function for converting any text file to UTF-8 encoding (without using any third-party packages).
def correctSubtitleEncoding(filename, newFilename, encoding_from, encoding_to='UTF-8'):
    """Re-encode a text file and normalise its line endings to CRLF.

    filename: path of the file to read.
    newFilename: path of the file to write.
    encoding_from: encoding used to decode *filename*.
    encoding_to: encoding used for *newFilename* (default 'UTF-8').

    Every line of the input is written followed by a CRLF terminator.
    """
    with open(filename, 'r', encoding=encoding_from) as fr:
        # newline='' writes our CRLF verbatim; with the default the platform
        # would translate the LF again and produce CR CR LF on Windows.
        with open(newFilename, 'w', encoding=encoding_to, newline='') as fw:
            for line in fr:
                # Strip only the terminator. Slicing with [:-1] also cut the
                # last character of a final line that has no trailing newline.
                fw.write(line.rstrip('\n') + '\r\n')
You can use it easily in a loop to convert a list of files.

To guess the source encoding, you can use the *nix `file` command.
Example:
$ file --mime jumper.xml
jumper.xml: application/xml; charset=utf-8

This is my brute force method. It also takes care of mingled \n and \r\n in the input.
# open the CSV file
inputfile = open(filelocation, 'rb')
outputfile = open(outputfilelocation, 'w', encoding='utf-8')
for line in inputfile:
if line[-2:] == b'\r\n' or line[-2:] == b'\n\r':
output = line[:-2].decode('utf-8', 'replace') + '\n'
elif line[-1:] == b'\r' or line[-1:] == b'\n':
output = line[:-1].decode('utf-8', 'replace') + '\n'
else:
output = line.decode('utf-8', 'replace') + '\n'
outputfile.write(output)
outputfile.close()
except BaseException as error:
cfg.log(self.outf, "Error(18): opening CSV-file " + filelocation + " failed: " + str(error))
self.loadedwitherrors = 1
return ([])
try:
# open the CSV-file of this source table
csvreader = csv.reader(open(outputfilelocation, "rU"), delimiter=delimitervalue, quoting=quotevalue, dialect=csv.excel_tab)
except BaseException as error:
cfg.log(self.outf, "Error(19): reading CSV-file " + filelocation + " failed: " + str(error))

Convert all files in a directory to UTF-8. The search is recursive and can filter files by suffix. Thanks to @Sole Sensei.
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple chardet
import os
import re
from chardet import detect
def get_file_list(d):
    """Collect the paths of all matching files below directory *d*.

    The tree is walked recursively; directories named 'venv' or
    'cmake-build-debug' are not entered. A file matches when its name ends
    in .c, .cpp, .h or .txt.
    """
    skipped_dirs = ['venv', 'cmake-build-debug']
    matches = []
    for root, dirs, files in os.walk(d):
        # Prune in place so os.walk does not descend into the skipped dirs.
        dirs[:] = [name for name in dirs if name not in skipped_dirs]
        # your filter
        matches.extend(
            os.path.join(root, filename)
            for filename in files
            if re.search(r'(\.c|\.cpp|\.h|\.txt)$', filename)
        )
    return matches
# get file encoding type
def get_encoding_type(file):
    """Return the encoding chardet reports for the file at path *file*."""
    with open(file, 'rb') as handle:
        detection = detect(handle.read())
    return detection['encoding']
if __name__ == "__main__":
    # Convert every matching file below the current directory to UTF-8,
    # writing to a '.swp' sibling first and swapping it in on success.
    file_list = get_file_list('.')
    for src_file in file_list:
        print(src_file)
        trg_file = src_file + '.swp'
        from_codec = get_encoding_type(src_file)
        try:
            # newline='' keeps the original line endings untouched.
            with open(src_file, 'r', encoding=from_codec, newline='') as f, \
                    open(trg_file, 'w', encoding='utf-8', newline='') as e:
                e.write(f.read())
        except UnicodeDecodeError:
            print('Decode Error')
            os.remove(trg_file)  # do not leave a stray .swp file behind
        except UnicodeEncodeError:
            print('Encode Error')
            os.remove(trg_file)  # do not leave a stray .swp file behind
        else:
            # Both files are closed here; replace the source in one rename.
            os.replace(trg_file, src_file)

Related

How to create a loop with FOR in a temporary file?

I am working with an encrypted file, but I can't manage to write a for loop that reads it before it gets closed and removed.
My intention is to read the decrypted data and loop over it, assigning each line to a variable.
Whenever I execute my code, Python goes straight to the end without processing the decrypted data; I believe this is because the with statement closes the file before the loop starts.
This is what I want, not working, no errors either:
# Decrypt input_file, store the result in a temporary file and process it
# line by line: each line is a page name under the tutorial URL.
with open(input_file, 'rb') as fp:
    data = fp.read()
fernet = Fernet(key)
# Despite its name, `encrypted` holds the output of fernet.decrypt().
encrypted = fernet.decrypt(data)
with tempfile.TemporaryFile() as fp:
    fp.write(encrypted)
    # NOTE(review): nothing repositions the file between write() and the
    # loop below, so iteration starts at the end of the data just written.
    for url in fp: #Python ignores the tempfile. I belive it is closed in the previous line.
        segment = url.strip()
        url = 'https://docs.python.org/3.3/tutorial/' + segment
        filename = segment + '.html'
        filePath = pjoin('Data/' + filename)
        response = urlopen(url)
        webContent = response.read()
        # The same URL is fetched a second time for the pattern search.
        html_content = urlopen(url).read()
        matches = re.findall(b'string', html_content);
        if len(matches) == 0:
            print(segment + ' unchanged.')
        else:
            # Save the page only when the pattern was found.
            with open(filePath, 'wb') as w:
                w.write(webContent)
This is the working code (Sorry, tried to make it shorter but couldn't):
# Decrypt input_file into output_file, then reopen output_file for reading
# and process it line by line.
with open(input_file, 'rb') as fp:
    data = fp.read()
fernet = Fernet(key)
# Despite its name, `encrypted` holds the output of fernet.decrypt().
encrypted = fernet.decrypt(data)
with open(output_file, 'wb') as fp:
    fp.write(encrypted)
# Reopening the file starts reading from its beginning.
with open(output_file) as fp:
    for url in fp:
        segment = url.strip()
        url = 'https://docs.python.org/3.3/tutorial/' + segment
        filename = segment + '.html'
        filePath = pjoin('Data/' + filename)
        response = urlopen(url)
        webContent = response.read()
        # The same URL is fetched a second time for the pattern search.
        html_content = urlopen(url).read()
        matches = re.findall(b'string', html_content);
        if len(matches) == 0:
            print(segment + ' unchanged.')
        else:
            # Save the page only when the pattern was found.
            with open(filePath, 'wb') as w:
                w.write(webContent)
Header for both examples (kept separate to make them shorter):
#python 3.6.6
from urllib.request import urlopen
import urllib.request
from os.path import join as pjoin
import re, os, sys, tempfile, six, ctypes, time, fileinput
from cryptography.fernet import Fernet
print("[*] Checking list.dat for consistency . . .")
key = b'wTmVBRLytAmlfkctCuEf59K0LDCXa3sGas3kPg3r4fs=' #Decrypt list.dat
input_file = 'List.dat'
output_file = 'List.txt'
List.txt content:
errors
classes
stdlib
Any hints?
The problem is that once you have written to the file, the "file pointer" is at the end of the file. There's nothing to read.
You can use the seek method to reposition the file pointer at the beginning. Alternatively, closing and re-opening the file (as in your working code) will position the pointer at the beginning of the file.
@LarryLustig pretty much answered why your code wasn't working, but IMO if you eliminate the temp file altogether (which shouldn't be necessary) you don't even need to worry about the cursor. See the commented changes to your desired code below.
# We'll use os.linesep to get the line terminator string for your os.
import os
...
with open(input_file, 'rb') as fp:
    data = fp.read()
fernet = Fernet(key)
# decode your decrypted bytes into strings. Change 'utf-8' into whichever file encoding you're using if necessary.
decrypted = fernet.decrypt(data).decode('utf-8')
# Don't write to a temp file
# Iterate directly on each line of the extracted data
# NOTE: decrypted.splitlines() would accept any line terminator, not only
# the one of the current OS.
for line in decrypted.split(os.linesep):
    segment = line.strip()
    if not segment:
        # Skip empty pieces (e.g. the one after a trailing line terminator)
        # instead of requesting the bare tutorial URL and writing '.html'.
        continue
    url = 'https://docs.python.org/3.3/tutorial/' + segment
    filePath = pjoin('Data/' + segment + '.html')
    # Download the page once and close the connection; the same bytes serve
    # for both the pattern search and the file written below.
    with urlopen(url) as response:
        webContent = response.read()
    if not re.findall(b'string', webContent):
        print(segment + ' unchanged.')
    else:
        with open(filePath, 'wb') as w:
            w.write(webContent)
Alternatively, if you know for sure what is the line terminator used in the file (e.g. \r\n, or \n) then you can eliminate using os.linesep altogether.

Removing all BOMs from file with multiple BOMs

I have a text file containing multiple lines beginning with a byte order mark. Passing encoding='utf-8-sig' to open removes the BOM at the start of the file but all subsequent BOMs remain. Is there a more correct way to remove these than this:
import codecs
filepath = 'foo.txt'
bom_len = len(codecs.BOM_UTF8)
def remove_bom(s):
    """Return the string *s* with every byte order mark (U+FEFF) removed.

    The earlier version tested whether a BOM occurred anywhere in the line
    but then cut bytes off the front, so a BOM that was not the very first
    character removed the wrong text, and only one BOM was ever handled.
    """
    # codecs.BOM_UTF8 decoded as UTF-8 is the single character U+FEFF.
    return s.replace(codecs.BOM_UTF8.decode('utf-8'), '')
# Print the first character of every non-empty line of the file, wrapped in
# a list (presumably so that invisible characters show up in the repr).
try:
    # 'utf-8-sig' drops a BOM at the very start of the file only; BOMs on
    # later lines are left to remove_bom().
    with open(filepath, encoding='utf-8-sig') as file_object:
        for line in file_object:
            line = line.rstrip()
            line = remove_bom(line)
            if line != '':
                print([line[0]])
except FileNotFoundError:
    print('No file found at ' + filepath)
I'm having similar problems.
This kinda helped me:
import codecs

# Read the raw bytes and drop every UTF-8 byte order mark in one pass.
with open(path, "rb") as infile:
    bytecontent = infile.read().replace(codecs.BOM_UTF8, b"")

PyPDF2: TypeError: coercing to Unicode: need string or buffer, PdfFileWriter found

Reworking code to include Context Managers via with statements. However I am receiving a Traceback: using Python 2.7 on Windows
Traceback (most recent call last):
File "CommissionSecurity.py", line 52, in <module>
with open(output, 'w') as output_stream :
TypeError: coercing to Unicode: need string or buffer, PdfFileWriter found
I'm not sure how to fix this, I am trying to use PyPDF2 to encrypt pdf files, and I'm having trouble figuring out exactly what I've done wrong here. Any guidance is appreciated.
import os
from PyPDF2 import PdfWriter, PdfReader
# For every ID present in both file_dict and email_dict, try to encrypt the
# PDF named by file_dict[ID].
for ID in file_dict:
    # print REP
    # print ID # debug: REP always coming over 764
    if ID in email_dict:
        # print file_dict[ID]
        path = "C:\\Apps\\CorVu\\DATA\\Reports\\AlliD\\Monthly Commission Reports\\Output\\pdcom1\\"
        file_path = os.path.join(path + file_dict[ID])
        writer = PdfWriter()
        reader = PdfReader(file_path)
        # This is a plain (path, mode) tuple, not an open file.
        output_stream = (file_path, "wb")
        # `reader` is rebound here from the PdfReader to a raw file object.
        with open(file_path, "rb") as reader:
            # NOTE(review): `writer` is the PdfWriter, but open() expects a
            # path; this looks like the source of the TypeError reported.
            with open(writer, "w") as output_stream:
                # presumably email_dict[ID][1] is the password - confirm
                writer.encrypt(email_dict[ID][1])
                writer.write(output_stream)
    else:
        continue
The code has several issues:
You don't use reader
You overwrite several variables
I guess what you want to do is this:
import os
from PyPDF2 import PdfWriter, PdfReader
# Encrypt the PDF of every ID that appears in both file_dict and email_dict.
for ID in file_dict:
    # print REP
    # print ID # debug: REP always coming over 764
    if ID not in email_dict:
        continue
    # print file_dict[ID]
    path = "C:\\Apps\\CorVu\\DATA\\Reports\\AlliD\\Monthly Commission Reports\\Output\\pdcom1\\"
    file_path = os.path.join(path, file_dict[ID])
    reader = PdfReader(file_path)
    writer = PdfWriter()
    # A new writer is empty: copy the pages over first, otherwise the
    # encrypted output would contain no content.
    for page in reader.pages:
        writer.add_page(page)
    # presumably email_dict[ID][1] is the password - confirm
    writer.encrypt(email_dict[ID][1])
    # PDF is a binary format, so the output file must be opened with "wb".
    # NOTE: "target.pdf" is a placeholder and is overwritten on every
    # iteration; use a per-ID output path in real code.
    with open("target.pdf", "wb") as fp:
        writer.write(fp)
There are for sure other things necessary

Encode and decode a mp3 file

I want to save an MP3 file as an encoded string in a text file, but it doesn't work with my code:
import sys, base64
# Step 1: read the input file (sys.argv[1]) and base64-encode its content.
# NOTE(review): the file is opened in text mode ('r') although mp3 data is
# binary.
f = open(sys.argv[1], 'r')
b = base64.b64encode(f.read())
# sys.getsizeof reports the size of the object, not the string length.
print sys.getsizeof(b)
f.close()
# Step 2: store the encoded string in <sys.argv[2]>.txt.
try:
    file = open(sys.argv[2] + '.txt', 'w')
    file.write(b)
    file.close()
except:
    print('Something went wrong!')
    sys.exit(0)
# Step 3: read the text file back and decode it.
# `f` is the string returned by read() here, not a file object, so the
# f.close() call below has no file to close.
f = open(sys.argv[2] + '.txt', 'r').read()
b = base64.b64decode(f)
f.close()
# Step 4: write the decoded data to <sys.argv[2]>2.mp3.
# NOTE(review): text mode ('w') is used for binary data here as well.
try:
    file = open(sys.argv[2] + '2.mp3', 'w')
    file.write(b)
    file.close()
except:
    print('Something went wrong!')
    sys.exit(0)
The encoded string is too short for being the full string, so there isn't a good result. So why "doesn't" it work?
Okay, I've reached my personal goal.
As pentadecagon has mentioned:
You need to call open using 'rb', because it's binary. Use len instead of sys.getsizeof.
f = open(sys.argv[2] + '.txt', 'r').read()
b = base64.b64decode(f)
f.close()
I changed this to
f = open(sys.argv[2] + '.txt', 'r')
b = base64.b64decode(f.read())
f.close()
So I've changed it and when I finally create the mp3 file again, you need to write binary 'wb'
and it works.

Read many csv file and write it to encoding to utf8 using python

I'm using Python to read many CSV files and re-encode them as UTF-8. The problem is that I can read all the lines of a file, but when I write it back only one line is written. Please help me check my code below:
def convert_files(files, ascii, to="utf-8"):
    """Rewrite each file in *files* as UTF-8, skipping its first line.

    files: iterable of file paths.
    ascii: source encoding; only referenced in commented-out code below.
    to: target encoding; not used by the body, which hard-codes 'utf-8'.
    """
    for name in files:
        #print ("Convert {0} from {1} to {2}").format(name, ascii, to)
        with open(name) as f:
            print(name)
            count = 0
            lineno = 0
            #this point I want to write the below text into my each new file at the first line
            #file_source.write('id;nom;prenom;nom_pere;nom_mere;prenom_pere;prenom_mere;civilite (1=homme 2=f);date_naissance;arrondissement;adresse;ville;code_postal;pays;telephone;email;civilite_demandeur (1=homme 2=f);nom_demandeur;prenom_demandeur;qualite_demandeur;type_acte;nombre_actes\n')
            for line in f.readlines():
                lineno +=1
                # Skip the first line of the file.
                if lineno == 1 :
                    continue
                # NOTE(review): the file is reopened in 'w' mode on every
                # iteration, which truncates it each time; the handle is
                # never closed.
                file_source = open(name, mode='w', encoding='utf-8', errors='ignore')
                #pass
                #print (line)
                # start write data to to new file with encode
                file_source.write(line)
                #file_source.close
                #print unicode(line, "cp866").encode("utf-8")
csv_files = find_csv_filenames('./csv', ".csv")
convert_files(csv_files, "cp866")
You're reopening the file during every iteration.
for line in f.readlines():
lineno +=1
if lineno == 1 :
continue
#move the following line outside of the for block
file_source = open(name, mode='w', encoding='utf-8', errors='ignore')
If all you need is to change the character encoding of the files then it doesn't matter that they are csv files unless the conversion may change what characters are interpreted as delimiter, quotechar, etc:
def convert(filename, from_encoding, to_encoding):
    """Re-encode *filename* in place from *from_encoding* to *to_encoding*.

    The whole file is read into memory. newline='' disables newline
    translation, so line endings are written back exactly as they were read.
    """
    with open(filename, encoding=from_encoding, newline='') as source:
        text = source.read()
    # Encode before reopening for writing: if encoding fails, the file has
    # not been touched yet.
    encoded = text.encode(to_encoding)
    with open(filename, 'wb') as target:
        target.write(encoded)
for path in csv_files:
convert(path, "cp866", "utf-8")
Add errors parameter to change how encoding/decoding errors are handled.
If files may be large then you could convert data incrementally:
import os
from shutil import copyfileobj
from tempfile import NamedTemporaryFile
def convert(filename, from_encoding, to_encoding):
    """Re-encode *filename* in place, streaming through a temporary file.

    filename: path of the file to convert.
    from_encoding: encoding used to decode the existing content.
    to_encoding: encoding used for the rewritten file.

    The converted text is written to a temporary file in the same directory
    and then renamed over *filename*, so the original is only replaced once
    the conversion has finished. Line endings are preserved (newline='').
    """
    with open(filename, newline='', encoding=from_encoding) as file:
        # Ask for delete=False up front: assigning `tmpfile.delete = False`
        # afterwards does not stop current Python versions from unlinking
        # the file on close, which made the rename below fail.
        tmpfile = NamedTemporaryFile('w', encoding=to_encoding, newline='',
                                     dir=os.path.dirname(filename) or '.',
                                     delete=False)
        try:
            with tmpfile:
                copyfileobj(file, tmpfile)
        except BaseException:
            # Conversion failed: remove the partial temp file, keep original.
            os.unlink(tmpfile.name)
            raise
    # Both files are closed at this point (required for renaming on Windows).
    os.replace(tmpfile.name, filename)  # rename tmpfile -> filename
for path in csv_files:
convert(path, "cp866", "utf-8")
You can do this
def convert_files(files, ascii, to="utf-8"):
    """Re-encode every file in *files* in place.

    files: iterable of file paths.
    ascii: name of the encoding the files currently use.
    to: name of the target encoding (default "utf-8").
    """
    for name in files:
        # Binary read/write mode: the bytes are decoded and encoded
        # explicitly, which works the same on Python 2 and 3.
        with open(name, 'rb+') as f:
            # Keep the converted bytes; the result of decode/encode used to
            # be thrown away, so the original data was written back.
            data = f.read().decode(ascii).encode(to)
            f.seek(0)
            f.write(data)
            # Drop leftover bytes if the new content is shorter than the old.
            f.truncate()

Categories