Would it be possible in the future to add to the system module a procedure for opening/reading a file in the proper encoding?
For now, I use this variant:
import strutils, encodings
proc readFile*(filePath: string,
               sourceEncoding: string,
               destEncoding: string = "UTF8"): string =
  ## Reads the whole file at `filePath` and returns its content converted
  ## from `sourceEncoding` to `destEncoding`.
  ##
  ## The two encoding names are compared case-insensitively; when they are
  ## equal the content is returned unchanged and no conversion is attempted.
  ## Raises `IOError` when the file cannot be opened.
  ##
  ## The return type is plain `string`: `TaintedString` was only an alias
  ## for it and has been deprecated.
  var file: File
  if not open(file, filePath):
    raise newException(IOError, "Could not open file:" & filePath)
  defer: file.close()
  let text = file.readAll()
  if cmpIgnoreCase(destEncoding, sourceEncoding) != 0:
    # `convert` comes from the `encodings` module.
    result = text.convert(destEncoding, sourceEncoding)
  else:
    result = text
And extend the encodings module with functions such as these:
proc isUTF8*(bom: openArray[uint8]): bool =
  ## Returns true when `bom` begins with the UTF-8 byte order mark
  ## (EF BB BF).
  ##
  ## Only the first three bytes are examined. Input shorter than three
  ## bytes yields false instead of raising an index error.
  bom.len >= 3 and
    bom[0] == 0xEF'u8 and bom[1] == 0xBB'u8 and bom[2] == 0xBF'u8
proc isUTF16BE*(bom: openArray[uint8]): bool =
  ## Returns true when `bom` begins with the big-endian UTF-16 byte order
  ## mark (FE FF).
  ##
  ## Only the first two bytes are examined. Input shorter than two bytes
  ## yields false instead of raising an index error.
  bom.len >= 2 and bom[0] == 0xFE'u8 and bom[1] == 0xFF'u8
proc isUTF16LE*(bom: openArray[uint8]): bool =
  ## Returns true when `bom` begins with the little-endian UTF-16 byte
  ## order mark (FF FE).
  ##
  ## Only the first two bytes are examined, so data starting with
  ## FF FE 00 00 also matches. Input shorter than two bytes yields false
  ## instead of raising an index error.
  bom.len >= 2 and bom[0] == 0xFF'u8 and bom[1] == 0xFE'u8
proc isUTF32BE*(bom: openArray[uint8]): bool =
  ## Returns true when `bom` begins with the big-endian UTF-32 byte order
  ## mark (00 00 FE FF).
  ##
  ## Only the first four bytes are examined. Input shorter than four bytes
  ## yields false instead of raising an index error.
  bom.len >= 4 and
    bom[0] == 0x00'u8 and bom[1] == 0x00'u8 and
    bom[2] == 0xFE'u8 and bom[3] == 0xFF'u8
proc isUTF32LE*(bom: openArray[uint8]): bool =
  ## Returns true when `bom` begins with the little-endian UTF-32 byte
  ## order mark (FF FE 00 00).
  ##
  ## Only the first four bytes are examined. Input shorter than four bytes
  ## yields false instead of raising an index error.
  bom.len >= 4 and
    bom[0] == 0xFF'u8 and bom[1] == 0xFE'u8 and
    bom[2] == 0x00'u8 and bom[3] == 0x00'u8
proc isUTFEBCDIC*(bom: openArray[uint8]): bool =
  ## Returns true when `bom` begins with the UTF-EBCDIC byte order mark
  ## (DD 73 66 73).
  ##
  ## Only the first four bytes are examined. Input shorter than four bytes
  ## yields false instead of raising an index error.
  bom.len >= 4 and
    bom[0] == 0xDD'u8 and bom[1] == 0x73'u8 and
    bom[2] == 0x66'u8 and bom[3] == 0x73'u8
proc isGB18030*(bom: openArray[uint8]): bool =
  ## Returns true when `bom` begins with the GB 18030 byte order mark
  ## (84 31 95 33).
  ##
  ## Only the first four bytes are examined. Input shorter than four bytes
  ## yields false instead of raising an index error.
  bom.len >= 4 and
    bom[0] == 0x84'u8 and bom[1] == 0x31'u8 and
    bom[2] == 0x95'u8 and bom[3] == 0x33'u8
proc isSCSU*(bom: openArray[uint8]): bool =
  ## Returns true when `bom` begins with the SCSU signature (0E FE FF).
  ##
  ## Only the first three bytes are examined. Input shorter than three
  ## bytes yields false instead of raising an index error.
  bom.len >= 3 and
    bom[0] == 0x0E'u8 and bom[1] == 0xFE'u8 and bom[2] == 0xFF'u8
proc isUTF1*(bom: openArray[uint8]): bool =
  ## Returns true when `bom` begins with the UTF-1 byte order mark
  ## (F7 64 4C).
  ##
  ## Only the first three bytes are examined. Input shorter than three
  ## bytes yields false instead of raising an index error.
  bom.len >= 3 and
    bom[0] == 0xF7'u8 and bom[1] == 0x64'u8 and bom[2] == 0x4C'u8
proc isBOCU1*(bom: openArray[uint8]): bool =
  ## Returns true when `bom` begins with the BOCU-1 signature (FB EE 28).
  ##
  ## Only the first three bytes are examined. Input shorter than three
  ## bytes yields false instead of raising an index error.
  bom.len >= 3 and
    bom[0] == 0xFB'u8 and bom[1] == 0xEE'u8 and bom[2] == 0x28'u8
proc isUTF7*(bom: openArray[uint8]): bool =
  ## Returns true when `bom` begins with a UTF-7 byte order mark.
  ##
  ## The UTF-7 mark is the three bytes 2B 2F 76 ("+/v") followed by one of
  ## 38, 39, 2B or 2F ("8", "9", "+", "/"). These are four alternative
  ## four-byte signatures, not one sixteen-byte sequence, so each variant
  ## is accepted on its own. Input shorter than four bytes yields false
  ## instead of raising an index error.
  bom.len >= 4 and
    bom[0] == 0x2B'u8 and bom[1] == 0x2F'u8 and bom[2] == 0x76'u8 and
    bom[3] in {0x38'u8, 0x39'u8, 0x2B'u8, 0x2F'u8}
proc isUTF16*(bom: openArray[uint8]): bool =
  ## True when `bom` is recognised by either `isUTF16BE` or `isUTF16LE`.
  isUTF16BE(bom) or isUTF16LE(bom)
proc isUTF32*(bom: openArray[uint8]): bool =
  ## True when `bom` is recognised by either `isUTF32BE` or `isUTF32LE`.
  isUTF32BE(bom) or isUTF32LE(bom)
import strutils, encodings
proc getFileEncoding*(filePath: string): string =
  ## Guesses the encoding of the file at `filePath` from its leading byte
  ## order mark.
  ##
  ## Reads up to the first 16 bytes and returns the name associated with
  ## the first matching signature, or "" when none matches. The UTF-32
  ## marks are tested before the UTF-16 ones because the UTF-32LE mark
  ## (FF FE 00 00) starts with the UTF-16LE mark (FF FE).
  ##
  ## Raises `IOError` when the file cannot be opened.
  var
    box: array[16, uint8]
    file: File
  if not open(file, filePath):
    raise newException(IOError, "Could not open file:" & filePath)
  defer: file.close()
  # Keep the number of bytes actually read. `box` is zero-initialised, so
  # without this a two-byte file holding only FF FE would look like
  # FF FE 00 00 and be misreported as UTF-32LE.
  let got = file.readBytes(box, 0, box.len)
  result = ""
  # Each test is guarded by the length of its signature so that padding
  # bytes past the end of a short file never take part in a match.
  if got >= 4 and isUTF32BE(box): return "UTF-32BE"
  if got >= 4 and isUTF32LE(box): return "UTF-32LE"
  if got >= 2 and isUTF16BE(box): return "UTF-16BE"
  if got >= 2 and isUTF16LE(box): return "UTF-16LE"
  if got >= 3 and isUTF8(box): return "UTF-8"
  if got >= 4 and isUTFEBCDIC(box): return "UTF-EBCDIC"
  if got >= 4 and isGB18030(box): return "GB-18030"
  if got >= 3 and isSCSU(box): return "SCSU"
  if got >= 3 and isUTF1(box): return "UTF-1"
  if got >= 3 and isBOCU1(box): return "BOCU-1"
  if got >= 4 and isUTF7(box): return "UTF-7"
PS: However, a module like chardet (Python) would in fact be even more powerful. :-) Writing transparently re-encoding open/read functions for general usage may be possible, but it is hardly "system level" functionality. I guess that would need to go into an extra (external) module.
Adding BOM detection to the encodings module may be interesting. But technically I think it belongs in the unicode module, as it is a Unicode feature. The encodings module is a bit broader and would indeed need something like chardet.