I need to convert a bunch of files to utf-8 in Python, and I have trouble with the “converting the file” part.
I’d like to do the equivalent of:
iconv -t utf-8 $file > converted/$file # this is shell code
Thanks!
Answers:
Thank you for visiting the Q&A section on Magenaut. Please note that all the answers may not help you solve the issue immediately. So please treat them as advisements. If you found the post helpful (or not), leave a comment & I’ll get back to you as soon as possible.
Method 1
You can use the codecs module, like this:
import codecs
BLOCKSIZE = 1048576 # or some other, desired size in bytes
with codecs.open(sourceFileName, "r", "your-source-encoding") as sourceFile:
with codecs.open(targetFileName, "w", "utf-8") as targetFile:
while True:
contents = sourceFile.read(BLOCKSIZE)
if not contents:
break
targetFile.write(contents)
EDIT: added BLOCKSIZE parameter to control file chunk size.
Method 2
This worked for me in a small test:
sourceEncoding = "iso-8859-1"
targetEncoding = "utf-8"
source = open("source")
target = open("target", "w")
target.write(unicode(source.read(), sourceEncoding).encode(targetEncoding))
Method 3
Thanks for the replies, it works!
And since the source files are in mixed formats, I added a list of source formats to be tried in sequence (sourceFormats), and on UnicodeDecodeError I try the next format:
from __future__ import with_statement
import os
import sys
import codecs
from chardet.universaldetector import UniversalDetector
targetFormat = 'utf-8'
outputDir = 'converted'
detector = UniversalDetector()
def get_encoding_type(current_file):
detector.reset()
for line in file(current_file):
detector.feed(line)
if detector.done: break
detector.close()
return detector.result['encoding']
def convertFileBestGuess(filename):
sourceFormats = ['ascii', 'iso-8859-1']
for format in sourceFormats:
try:
with codecs.open(fileName, 'rU', format) as sourceFile:
writeConversion(sourceFile)
print('Done.')
return
except UnicodeDecodeError:
pass
def convertFileWithDetection(fileName):
print("Converting '" + fileName + "'...")
format=get_encoding_type(fileName)
try:
with codecs.open(fileName, 'rU', format) as sourceFile:
writeConversion(sourceFile)
print('Done.')
return
except UnicodeDecodeError:
pass
print("Error: failed to convert '" + fileName + "'.")
def writeConversion(file):
with codecs.open(outputDir + '/' + fileName, 'w', targetFormat) as targetFile:
for line in file:
targetFile.write(line)
# Off topic: get the file list and call convertFile on each file
# ...
(EDIT by Rudro Badhon: this incorporates the original try multiple formats until you don’t get an exception as well as an alternate approach that uses chardet.universaldetector)
Method 4
Answer for unknown source encoding type
based on @Sébastien RoccaSerra
python3.6
import os
from chardet import detect
# get file encoding type
def get_encoding_type(file):
with open(file, 'rb') as f:
rawdata = f.read()
return detect(rawdata)['encoding']
from_codec = get_encoding_type(srcfile)
# add try: except block for reliability
try:
with open(srcfile, 'r', encoding=from_codec) as f, open(trgfile, 'w', encoding='utf-8') as e:
text = f.read() # for small files, for big use chunks
e.write(text)
os.remove(srcfile) # remove old encoding file
os.rename(trgfile, srcfile) # rename new encoding
except UnicodeDecodeError:
print('Decode Error')
except UnicodeEncodeError:
print('Encode Error')
Method 5
This is a Python3 function for converting any text file into the one with UTF-8 encoding. (without using unnecessary packages)
def correctSubtitleEncoding(filename, newFilename, encoding_from, encoding_to='UTF-8'):
with open(filename, 'r', encoding=encoding_from) as fr:
with open(newFilename, 'w', encoding=encoding_to) as fw:
for line in fr:
fw.write(line[:-1]+'rn')
You can use it easily in a loop to convert a list of files.
Method 6
You can use this one liner (assuming you want to convert from utf16 to utf8)
python -c "from pathlib import Path; path = Path('yourfile.txt') ; path.write_text(path.read_text(encoding='utf16'), encoding='utf8')"
Where yourfile.txt is a path to your $file.
For this to work you need python 3.4 or newer (probably nowadays you do).
Below a more readable version of the code above
from pathlib import Path
path = Path("yourfile.txt")
path.write_text(path.read_text(encoding="utf16"), encoding="utf8")
Method 7
To guess what’s the source encoding you can use the file *nix command.
Example:
$ file --mime jumper.xml jumper.xml: application/xml; charset=utf-8
Method 8
This is my brute force method. It also takes care of mingled n and rn in the input.
# open the CSV file
inputfile = open(filelocation, 'rb')
outputfile = open(outputfilelocation, 'w', encoding='utf-8')
for line in inputfile:
if line[-2:] == b'rn' or line[-2:] == b'nr':
output = line[:-2].decode('utf-8', 'replace') + 'n'
elif line[-1:] == b'r' or line[-1:] == b'n':
output = line[:-1].decode('utf-8', 'replace') + 'n'
else:
output = line.decode('utf-8', 'replace') + 'n'
outputfile.write(output)
outputfile.close()
except BaseException as error:
cfg.log(self.outf, "Error(18): opening CSV-file " + filelocation + " failed: " + str(error))
self.loadedwitherrors = 1
return ([])
try:
# open the CSV-file of this source table
csvreader = csv.reader(open(outputfilelocation, "rU"), delimiter=delimitervalue, quoting=quotevalue, dialect=csv.excel_tab)
except BaseException as error:
cfg.log(self.outf, "Error(19): reading CSV-file " + filelocation + " failed: " + str(error))
Method 9
convert all file in a dir to utf-8 encode. it is recursive and can filter file by suffix. thanks @Sole Sensei
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple chardet
import os
import re
from chardet import detect
def get_file_list(d):
result = []
for root, dirs, files in os.walk(d):
dirs[:] = [d for d in dirs if d not in ['venv', 'cmake-build-debug']]
for filename in files:
# your filter
if re.search(r'(.c|.cpp|.h|.txt)$', filename):
result.append(os.path.join(root, filename))
return result
# get file encoding type
def get_encoding_type(file):
with open(file, 'rb') as f:
raw_data = f.read()
return detect(raw_data)['encoding']
if __name__ == "__main__":
file_list = get_file_list('.')
for src_file in file_list:
print(src_file)
trg_file = src_file + '.swp'
from_codec = get_encoding_type(src_file)
try:
with open(src_file, 'r', encoding=from_codec) as f, open(trg_file, 'w', encoding='utf-8') as e:
text = f.read()
e.write(text)
os.remove(src_file)
os.rename(trg_file, src_file)
except UnicodeDecodeError:
print('Decode Error')
except UnicodeEncodeError:
print('Encode Error')
All methods was sourced from stackoverflow.com or stackexchange.com, is licensed under cc by-sa 2.5, cc by-sa 3.0 and cc by-sa 4.0