nimforum mirror - Read the file in the proper encoding

Garry_Galler (original) [2016-07-19T16:08:27+02:00] view original

Will it be possible in the future to add a procedure to the standard library for opening/reading a file in the proper encoding?

For now I use this variant:

import strutils, encodings
proc readFile*(filePath: string,
               sourceEncoding: string,
               destEncoding: string = "UTF8"): string =
  ## Reads the whole file at `filePath` and returns its contents converted
  ## from `sourceEncoding` to `destEncoding`.
  ##
  ## When the two encoding names are equal ignoring case, the bytes are
  ## returned untouched and no conversion is attempted. The names are
  ## compared as plain text, so aliases such as "UTF8" and "UTF-8" count as
  ## different and still go through `convert`.
  ##
  ## Raises `IOError` if the file cannot be opened.
  ##
  ## The return type is `string`: `TaintedString` is a deprecated alias of
  ## `string`, and the body already hands the text to `convert` as a plain
  ## string, so callers see the same type as before.
  var file: File
  if not open(file, filePath):
    raise newException(IOError, "Could not open file:" & filePath)
  # Close the handle on every exit path, including a failing conversion.
  defer: file.close()
  let text = file.readAll()
  if cmpIgnoreCase(destEncoding, sourceEncoding) == 0:
    result = text
  else:
    result = convert(text, destEncoding, sourceEncoding)

And the encodings module could be extended with functions such as these:

proc isUTF8*(bom: openArray[uint8]): bool =
  ## Reports whether `bom` begins with the UTF-8 byte order mark
  ## (`EF BB BF`).
  ##
  ## Only the first three bytes are inspected; anything after them is
  ## ignored. Input shorter than three bytes yields `false` instead of
  ## raising an index error.
  bom.len >= 3 and
    bom[0] == 0xEF'u8 and bom[1] == 0xBB'u8 and bom[2] == 0xBF'u8

proc isUTF16BE*(bom: openArray[uint8]): bool =
  ## Reports whether `bom` begins with the big-endian UTF-16 byte order
  ## mark (`FE FF`).
  ##
  ## Only the first two bytes are inspected. Input shorter than two bytes
  ## yields `false` instead of raising an index error.
  bom.len >= 2 and bom[0] == 0xFE'u8 and bom[1] == 0xFF'u8

proc isUTF16LE*(bom: openArray[uint8]): bool =
  ## Reports whether `bom` begins with the little-endian UTF-16 byte order
  ## mark (`FF FE`).
  ##
  ## Only the first two bytes are inspected, so data that starts with
  ## `FF FE 00 00` also passes this check. Input shorter than two bytes
  ## yields `false` instead of raising an index error.
  bom.len >= 2 and bom[0] == 0xFF'u8 and bom[1] == 0xFE'u8

proc isUTF32BE*(bom: openArray[uint8]): bool =
  ## Reports whether `bom` begins with the big-endian UTF-32 byte order
  ## mark (`00 00 FE FF`).
  ##
  ## Only the first four bytes are inspected. Input shorter than four
  ## bytes yields `false` instead of raising an index error.
  bom.len >= 4 and
    bom[0] == 0x00'u8 and bom[1] == 0x00'u8 and
    bom[2] == 0xFE'u8 and bom[3] == 0xFF'u8

proc isUTF32LE*(bom: openArray[uint8]): bool =
  ## Reports whether `bom` begins with the little-endian UTF-32 byte order
  ## mark (`FF FE 00 00`).
  ##
  ## Only the first four bytes are inspected. Input shorter than four
  ## bytes yields `false` instead of raising an index error.
  bom.len >= 4 and
    bom[0] == 0xFF'u8 and bom[1] == 0xFE'u8 and
    bom[2] == 0x00'u8 and bom[3] == 0x00'u8

proc isUTFEBCDIC*(bom: openArray[uint8]): bool =
  ## Reports whether `bom` begins with the UTF-EBCDIC byte order mark
  ## (`DD 73 66 73`).
  ##
  ## Only the first four bytes are inspected. Input shorter than four
  ## bytes yields `false` instead of raising an index error.
  bom.len >= 4 and
    bom[0] == 0xDD'u8 and bom[1] == 0x73'u8 and
    bom[2] == 0x66'u8 and bom[3] == 0x73'u8

proc isGB18030*(bom: openArray[uint8]): bool =
  ## Reports whether `bom` begins with the GB 18030 byte order mark
  ## (`84 31 95 33`).
  ##
  ## Only the first four bytes are inspected. Input shorter than four
  ## bytes yields `false` instead of raising an index error.
  bom.len >= 4 and
    bom[0] == 0x84'u8 and bom[1] == 0x31'u8 and
    bom[2] == 0x95'u8 and bom[3] == 0x33'u8

proc isSCSU*(bom: openArray[uint8]): bool =
  ## Reports whether `bom` begins with the SCSU signature (`0E FE FF`).
  ##
  ## Only the first three bytes are inspected. Input shorter than three
  ## bytes yields `false` instead of raising an index error.
  bom.len >= 3 and
    bom[0] == 0x0E'u8 and bom[1] == 0xFE'u8 and bom[2] == 0xFF'u8

proc isUTF1*(bom: openArray[uint8]): bool =
  ## Reports whether `bom` begins with the UTF-1 byte order mark
  ## (`F7 64 4C`).
  ##
  ## Only the first three bytes are inspected. Input shorter than three
  ## bytes yields `false` instead of raising an index error.
  bom.len >= 3 and
    bom[0] == 0xF7'u8 and bom[1] == 0x64'u8 and bom[2] == 0x4C'u8

proc isBOCU1*(bom: openArray[uint8]): bool =
  ## Reports whether `bom` begins with the BOCU-1 signature (`FB EE 28`).
  ##
  ## Only the first three bytes are inspected. Input shorter than three
  ## bytes yields `false` instead of raising an index error.
  bom.len >= 3 and
    bom[0] == 0xFB'u8 and bom[1] == 0xEE'u8 and bom[2] == 0x28'u8

proc isUTF7*(bom: openArray[uint8]): bool =
  ## Reports whether `bom` begins with a UTF-7 byte order mark.
  ##
  ## The mark is the three bytes `2B 2F 76` ("+/v") followed by one of
  ## `38`, `39`, `2B` or `2F` ("8", "9", "+", "/"); which one appears
  ## depends on the character that follows the mark in the text. The four
  ## variants are alternatives, not a single 16-byte sequence, so only the
  ## first four bytes are inspected.
  ##
  ## Input shorter than four bytes yields `false` instead of raising an
  ## index error.
  bom.len >= 4 and
    bom[0] == 0x2B'u8 and bom[1] == 0x2F'u8 and bom[2] == 0x76'u8 and
    bom[3] in {0x38'u8, 0x39'u8, 0x2B'u8, 0x2F'u8}

proc isUTF16*(bom:openarray[uint8]):bool =
  ## True when `bom` passes either the `isUTF16BE` or the `isUTF16LE`
  ## check; the big-endian check is tried first.
  if bom.isUTF16BE: true
  else: bom.isUTF16LE

proc isUTF32*(bom:openarray[uint8]):bool =
  ## True when `bom` passes either the `isUTF32BE` or the `isUTF32LE`
  ## check; the big-endian check is tried first.
  if bom.isUTF32BE: true
  else: bom.isUTF32LE


import strutils, encodings

proc getFileEncoding*(filePath: string): string =
  ## Guesses the encoding of the file at `filePath` from its leading byte
  ## order mark and returns the encoding name, or `""` when no known mark
  ## is found.
  ##
  ## Up to 16 bytes are read from the start of the file. The signatures are
  ## tested in a fixed order, with the four-byte UTF-32 marks ahead of the
  ## two-byte UTF-16 marks, because the UTF-32LE mark starts with the
  ## UTF-16LE one.
  ##
  ## A signature is only tested when the file actually supplied enough
  ## bytes for it. Without that guard the zero padding of the buffer could
  ## complete a longer mark: a file holding just `FF FE` would be reported
  ## as UTF-32LE instead of UTF-16LE.
  ##
  ## Raises `IOError` if the file cannot be opened.
  var
    box: array[16, uint8]
    file: File
  if not open(file, filePath):
    raise newException(IOError, "Could not open file:" & filePath)
  defer: file.close()
  # Number of bytes really present; the rest of `box` stays zero.
  let got = file.readBytes(box, 0, box.len)
  result = ""
  if got >= 4 and isUTF32BE(box):   return "UTF-32BE"
  if got >= 4 and isUTF32LE(box):   return "UTF-32LE"
  if got >= 2 and isUTF16BE(box):   return "UTF-16BE"
  if got >= 2 and isUTF16LE(box):   return "UTF-16LE"
  if got >= 3 and isUTF8(box):      return "UTF-8"
  if got >= 4 and isUTFEBCDIC(box): return "UTF-EBCDIC"
  if got >= 4 and isGB18030(box):   return "GB-18030"
  if got >= 3 and isSCSU(box):      return "SCSU"
  if got >= 3 and isUTF1(box):      return "UTF-1"
  if got >= 3 and isBOCU1(box):     return "BOCU-1"
  if got >= 4 and isUTF7(box):      return "UTF-7"

PS: In fact, a module like Python's chardet would be even more powerful. :-)

Mirror of forum.nim-lang.org

2392 :: Read the file in the proper encoding