Determining a text file's character set.

Jean-Yves_Pochez · January 11, 2019, 6:32pm

[code]Attributes ( extendsString ) Public Function GuessEncoding(extends s As String) as TextEncoding
  // Convenience overload: guess which text encoding the bytes of s are in.
  //
  // This simply forwards to the two-argument GuessEncoding and throws away
  // its byte-order flag, so this version cannot tell you whether UTF-32 or
  // UTF-16 data is in the wrong byte order. If that is a possibility,
  // call the overload that takes outWrongOrder instead.

  Dim ignoredWrongOrder As Boolean // receives the flag; deliberately unused
  Dim guessed As TextEncoding = s.GuessEncoding( ignoredWrongOrder )
  return guessed

End Function
[/code]

and

[code]Attributes ( extendsString ) Public Function GuessEncoding(extends s As String, ByRef outWrongOrder As Boolean) as TextEncoding
  // Guess what text encoding the bytes of the given string are in.
  // The encoding currently set on the string is ignored; only the raw
  // bytes are examined. The checks run in this order and the first one
  // that matches decides the result:
  //
  //   1. Byte-order mark      -> Encodings.UTF32BE / UTF32LE / UTF16 / UTF8
  //   2. Only bytes <= 127    -> Encodings.ASCII (also for an empty string)
  //   3. Nulls in every other
  //      byte                 -> Encodings.UTF16
  //   4. Well-formed UTF-8    -> Encodings.UTF8
  //   5. Marker bytes         -> Encodings.MacRoman, then
  //                              Encodings.WindowsANSI, then
  //                              Encodings.DOSLatinUS
  //   6. Anything else        -> Encodings.SystemDefault
  //
  // outWrongOrder is set to true when the result is UTF-32 or UTF-16 whose
  // byte order differs from this platform's, and to false in every other
  // case (it never keeps a stale value passed in by the caller).
  // Note that UTF-16 is always reported as Encodings.UTF16; the flag is
  // the only indication of its byte order.

  // Determine the platform byte order once and cache it across calls.
  static isBigEndian, endianChecked As Boolean
  if not endianChecked then
    Dim temp As String = Encodings.UTF16.Chr( &hFEFF )
    isBigEndian = (AscB( MidB( temp, 1, 1 ) ) = &hFE)
    endianChecked = true
  end if

  // Default for every path that does not explicitly report a byte order.
  outWrongOrder = false

  // --- 1. check for a BOM ---
  Dim b0 As Integer = AscB( s.MidB( 1, 1 ) )
  Dim b1 As Integer = AscB( s.MidB( 2, 1 ) )
  Dim b2 As Integer = AscB( s.MidB( 3, 1 ) )
  Dim b3 As Integer = AscB( s.MidB( 4, 1 ) )
  if b0=0 and b1=0 and b2=&hFE and b3=&hFF then
    // UTF-32, big-endian
    outWrongOrder = not isBigEndian
    return Encodings.UTF32BE
  elseif b0=&hFF and b1=&hFE and b2=0 and b3=0 and s.LenB >= 4 then
    // UTF-32, little-endian. The length test keeps a bare two-byte
    // UTF-16 LE BOM (whose missing bytes read as 0) out of this branch.
    outWrongOrder = isBigEndian
    return Encodings.UTF32LE
  elseif b0=&hFE and b1=&hFF then
    // UTF-16, big-endian
    outWrongOrder = not isBigEndian
    return Encodings.UTF16
  elseif b0=&hFF and b1=&hFE then
    // UTF-16, little-endian
    outWrongOrder = isBigEndian
    return Encodings.UTF16
  elseif b0=&hEF and b1=&hBB and b2=&hBF then
    // UTF-8 (endianness does not matter here)
    return Encodings.UTF8
  end if

  // --- 2. no BOM; see if it is entirely ASCII ---
  Dim m As MemoryBlock = s
  Dim maxi As Integer = s.LenB - 1
  Dim i As Integer
  for i = 0 to maxi
    if m.Byte(i) > 127 then exit for
  next
  // The loop ran to completion only if no byte had its high bit set.
  if i > maxi then return Encodings.ASCII

  // --- 3. not ASCII; look for nulls in every other byte ---
  // That pattern suggests UTF-16 (at least for Roman text).
  Dim nulls(1) As Integer // null count at even (0) and odd (1) offsets
  for i = 0 to maxi
    if m.Byte(i) = 0 then
      nulls(i mod 2) = nulls(i mod 2) + 1
    end if
  next
  // A count above maxi\2 means the nulls at that parity make up at least
  // half of all bytes in the string.
  if nulls(0) > nulls(1)*2 and nulls(0) > maxi\2 then
    // high bytes come first: UTF-16, big-endian
    outWrongOrder = not isBigEndian
    return Encodings.UTF16
  elseif nulls(1) > nulls(0)*2 and nulls(1) > maxi\2 then
    // high bytes come second: UTF-16, little-endian
    outWrongOrder = isBigEndian
    return Encodings.UTF16
  end if

  // --- 4. check for ill-formed UTF-8 ---
  // For each lead byte, work out how many trail bytes must follow and
  // which range the FIRST trail byte is restricted to; all later trail
  // bytes must be in &h80..&hBF. The special first-trail ranges reject
  // overlong forms (E0, F0), encoded surrogates (ED) and values above
  // U+10FFFF (F4), as in the Unicode "Well-Formed UTF-8 Byte Sequences"
  // table.
  Dim extra As Integer   // number of trail bytes required
  Dim lo, hi As Integer  // allowed range for the next trail byte
  Dim k, b As Integer
  Dim wellFormed As Boolean
  for i = 0 to maxi
    lo = &h80
    hi = &hBF
    select case m.Byte(i)
    case &h00 to &h7F
      extra = 0
    case &hC2 to &hDF
      extra = 1
    case &hE0
      extra = 2
      lo = &hA0
    case &hED
      // must be tested before the general E1..EF range below
      extra = 2
      hi = &h9F
    case &hE1 to &hEF
      extra = 2
    case &hF0
      extra = 3
      lo = &h90
    case &hF1 to &hF3
      extra = 3
    case &hF4
      extra = 3
      hi = &h8F
    else
      // &h80..&hC1 and &hF5..&hFF can never start a sequence
      exit for
    end select

    // sequence would run past the end of the data
    if i + extra > maxi then exit for

    wellFormed = true
    for k = 1 to extra
      b = m.Byte(i + k)
      if b < lo or b > hi then wellFormed = false
      // only the first trail byte has a restricted range
      lo = &h80
      hi = &hBF
    next
    if not wellFormed then exit for

    // skip the trail bytes just validated
    i = i + extra
  next i
  // no ill-formed sequences, so UTF-8 is probably what it is
  if i > maxi then return Encodings.UTF8

  // --- 5a. look for MacRoman ---
  // Heuristic: any one of these byte values is taken as a sign of MacRoman.
  // http://stackoverflow.com/questions/4198804/how-to-reliably-guess-the-encoding-between-macroman-cp1252-latin1-utf-8-and
  Dim found As Boolean = false
  for i = 0 to maxi
    select case m.Byte(i)
    case &h8E, &h8F, &h9A, &hA1, &hA5, &hA8, &hD0, &hD1, &hD5, &hE1
      found = true
      exit for
    end select
  next
  if found then return Encodings.MacRoman

  // --- 5b. look for Windows-1252 ---
  // Only reached when none of the MacRoman marker bytes occurred.
  // http://stackoverflow.com/questions/4198804/how-to-reliably-guess-the-encoding-between-macroman-cp1252-latin1-utf-8-and
  found = false
  for i = 0 to maxi
    select case m.Byte(i)
    case &h92, &h95, &h96, &h97, &hAE, &hB0, &hB7, &hE8, &hE9, &hF6
      found = true
      exit for
    end select
  next
  if found then return Encodings.WindowsANSI

  // --- 5c. look for the original DOS code page (DOSLatinUS / IBM437) ---
  // http://stackoverflow.com/questions/2700794/what-character-encoding-is-this
  found = false
  for i = 0 to maxi
    select case m.Byte(i)
    case &h82, &h85, &h87, &h88, &h8A
      found = true
      exit for
    end select
  next
  if found then return Encodings.DOSLatinUS

  // --- 6. nothing matched; just guess the system default ---
  return Encodings.SystemDefault

End Function
[/code]