Determining a text file's character set.

Brian_O_Brien · January 11, 2019, 6:29pm

I have recently run into some text files that are not pure ascii…
After years of handling CSV files, I came across one that had a BOM in it… and it raised some flags in my mind…

So maybe I just found a new use for a histogram personally.

What would be a good way of determining whether a text file originated on a Mac or on Windows?
Well, MS-DOS/Windows wants CR/LF, and Mac/Linux want LF.

So using imageJ I opened up a .txt file and imported it as raw.
There were 3767 bytes in the file so I opened the raw as 1 row of 3767 columns of 8 bit data.
I computed the image's histogram and looked at what it shows.

There were 64 occurrences of 10 and 0 occurrences of 13
There were also no values greater than 127!

Right now I’m trying to determine if my app should be UTF8 or ISO Latin 1 or UTF8 big4… and you know what? I just don’t know.
But can the histogram of data tell me more than just line feed endings?
I mean there is no data > 127 here so is it strictly ASCII?

Just thinking out loud… Thoughts?

Jean-Yves_Pochez · January 11, 2019, 6:32pm

[code]Attributes ( extendsString ) Public Function GuessEncoding(extends s As String) as TextEncoding
  // Convenience overload: guess the text encoding of the bytes held in s.
  //
  // The work is delegated to the two-argument GuessEncoding overload; the
  // byte-order flag that overload produces is thrown away here. This
  // version therefore cannot tell the caller whether UTF-16 or UTF-32 data
  // is in the wrong byte order. If that matters, call the two-argument
  // overload directly.
  //
  // Returns: the TextEncoding guessed by the two-argument overload.

  Dim ignoredWrongOrder As Boolean // receives the ByRef flag; never read
  Dim guess As TextEncoding = s.GuessEncoding( ignoredWrongOrder )
  return guess

End Function
[/code]

and

[code]Attributes ( extendsString ) Public Function GuessEncoding(extends s As String, ByRef outWrongOrder As Boolean) as TextEncoding
  // Guess what text encoding the bytes in the given string are in.
  // This ignores the encoding set on the string, and guesses one of:
  //
  //   * UTF-32 (Encodings.UTF32BE / Encodings.UTF32LE, from a BOM)
  //   * UTF-16 (from a BOM, or from the pattern of null bytes)
  //   * UTF-8  (from a BOM, or because every byte sequence is legal UTF-8)
  //   * ASCII  (no byte above 127)
  //   * MacRoman, WindowsANSI or DOSLatinUS (marker-byte heuristics)
  //   * Encodings.SystemDefault (nothing else matched)
  //
  // Parameters:
  //   s             - the string whose raw bytes are examined.
  //   outWrongOrder - set to true when the data is UTF-32 or UTF-16 in the
  //                   opposite byte order to this platform; set to false
  //                   in every other case.
  //
  // Returns: the guessed TextEncoding.

  // Start from a known state so a stale value in the caller's variable
  // cannot leak through the code paths that never assign the flag.
  outWrongOrder = false

  // Work out the platform byte order once: encode U+FEFF as UTF-16 and
  // look at which byte comes first.
  static isBigEndian, endianChecked As Boolean
  if not endianChecked then
    Dim temp As String = Encodings.UTF16.Chr( &hFEFF )
    isBigEndian = (AscB( MidB( temp, 1, 1 ) ) = &hFE)
    endianChecked = true
  end if

  // check for a BOM
  Dim b0 As Integer = AscB( s.MidB( 1, 1 ) )
  Dim b1 As Integer = AscB( s.MidB( 2, 1 ) )
  Dim b2 As Integer = AscB( s.MidB( 3, 1 ) )
  Dim b3 As Integer = AscB( s.MidB( 4, 1 ) )
  if b0=0 and b1=0 and b2=&hFE and b3=&hFF then
    // UTF-32, big-endian
    outWrongOrder = not isBigEndian
    return Encodings.UTF32BE
  elseif b0=&hFF and b1=&hFE and b2=0 and b3=0 and s.LenB >= 4 then
    // UTF-32, little-endian. The length test is needed because the zero
    // values of b2/b3 are otherwise indistinguishable from missing bytes.
    // This test must precede the UTF-16 LE test, which shares FF FE.
    outWrongOrder = isBigEndian
    return Encodings.UTF32LE
  elseif b0=&hFE and b1=&hFF then
    // UTF-16, big-endian
    outWrongOrder = not isBigEndian
    return Encodings.UTF16
  elseif b0=&hFF and b1=&hFE then
    // UTF-16, little-endian
    outWrongOrder = isBigEndian
    return Encodings.UTF16
  elseif b0=&hEF and b1=&hBB and b2=&hBF then
    // UTF-8 BOM is EF BB BF; the third byte is b2.
    // (Endianness does not matter for UTF-8.)
    return Encodings.UTF8
  end if

  Dim m As MemoryBlock = s
  Dim i, maxi As Integer = s.LenB - 1

  // No BOM. Look for a high incidence of nulls every other byte, which
  // suggests UTF-16 (at least in Roman text). This has to run BEFORE the
  // ASCII test: BOM-less UTF-16 Roman text consists only of bytes <= 127
  // and would otherwise be reported as ASCII.
  Dim nulls(1) As Integer // null count in even (0) and odd (1) bytes
  for i = 0 to maxi
    if m.Byte(i) = 0 then
      nulls(i mod 2) = nulls(i mod 2) + 1
    end if
  next
  if maxi >= 1 then // a UTF-16 code unit needs at least two bytes
    if nulls(0) > nulls(1)*2 and nulls(0) > maxi\2 then
      // UTF-16, big-endian (high byte, usually null, comes first)
      outWrongOrder = not isBigEndian
      return Encodings.UTF16
    elseif nulls(1) > nulls(0)*2 and nulls(1) > maxi\2 then
      // UTF-16, little-endian
      outWrongOrder = isBigEndian
      return Encodings.UTF16
    end if
  end if

  // See if it's entirely ASCII. The loop variable ends up past maxi only
  // when no byte above 127 was found.
  for i = 0 to maxi
    if m.Byte(i) > 127 then exit
  next
  if i > maxi then return Encodings.ASCII

  // It's not ASCII; check for illegal UTF-8 characters.
  // See Table 3.1B, "Legal UTF-8 Byte Sequences",
  // at http://unicode.org/versions/corrigendum1.html
  // Each case verifies the trail bytes and then advances i past them; any
  // violation (or a sequence truncated by the end of data) leaves the loop
  // early with i <= maxi.
  // NOTE(review): later Unicode versions also exclude ED A0..BF (encoded
  // surrogates); the E1..EF case below follows the table cited above and
  // accepts them. Confirm whether that should be tightened.
  Dim b As Byte
  for i = 0 to maxi
    select case m.Byte(i)
    case &h00 to &h7F
      // single-byte character; just continue
    case &hC2 to &hDF
      // one additional byte
      if i+1 > maxi then exit for
      b = m.Byte(i+1)
      if b < &h80 or b > &hBF then exit for
      i = i+1
    case &hE0
      // two additional bytes; second byte restricted to A0..BF
      if i+2 > maxi then exit for
      b = m.Byte(i+1)
      if b < &hA0 or b > &hBF then exit for
      b = m.Byte(i+2)
      if b < &h80 or b > &hBF then exit for
      i = i+2
    case &hE1 to &hEF
      // two additional bytes
      if i+2 > maxi then exit for
      b = m.Byte(i+1)
      if b < &h80 or b > &hBF then exit for
      b = m.Byte(i+2)
      if b < &h80 or b > &hBF then exit for
      i = i+2
    case &hF0
      // three additional bytes; second byte restricted to 90..BF
      if i+3 > maxi then exit for
      b = m.Byte(i+1)
      if b < &h90 or b > &hBF then exit for
      b = m.Byte(i+2)
      if b < &h80 or b > &hBF then exit for
      b = m.Byte(i+3)
      if b < &h80 or b > &hBF then exit for
      i = i+3
    case &hF1 to &hF3
      // three additional bytes
      if i+3 > maxi then exit for
      b = m.Byte(i+1)
      if b < &h80 or b > &hBF then exit for
      b = m.Byte(i+2)
      if b < &h80 or b > &hBF then exit for
      b = m.Byte(i+3)
      if b < &h80 or b > &hBF then exit for
      i = i+3
    case &hF4
      // three additional bytes; second byte restricted to 80..8F
      if i+3 > maxi then exit for
      b = m.Byte(i+1)
      if b < &h80 or b > &h8F then exit for
      b = m.Byte(i+2)
      if b < &h80 or b > &hBF then exit for
      b = m.Byte(i+3)
      if b < &h80 or b > &hBF then exit for
      i = i+3
    else
      // C0, C1, F5..FF and stray trail bytes are never legal lead bytes
      exit for
    end select
  next i
  if i > maxi then return Encodings.UTF8 // no illegal UTF-8 sequences, so that's probably what it is

  // Look for MacRoman marker bytes.
  // http://stackoverflow.com/questions/4198804/how-to-reliably-guess-the-encoding-between-macroman-cp1252-latin1-utf-8-and
  // The first marker found decides, so there is no need to scan further.
  for i = 0 to maxi
    select case m.Byte(i)
    case &h8E, &h8F, &h9A, &hA1, &hA5, &hA8, &hD0, &hD1, &hD5, &hE1
      return Encodings.MacRoman
    end select
  next

  // Look for Windows-1252 marker bytes.
  // http://stackoverflow.com/questions/4198804/how-to-reliably-guess-the-encoding-between-macroman-cp1252-latin1-utf-8-and
  for i = 0 to maxi
    select case m.Byte(i)
    case &h92, &h95, &h96, &h97, &hAE, &hB0, &hB7, &hE8, &hE9, &hF6
      return Encodings.WindowsANSI
    end select
  next

  // Look for the original DOS code page (DOSLatinUS, a.k.a. IBM437).
  // http://stackoverflow.com/questions/2700794/what-character-encoding-is-this
  for i = 0 to maxi
    select case m.Byte(i)
    case &h82, &h85, &h87, &h88, &h8A
      return Encodings.DOSLatinUS
    end select
  next

  // Nothing matched, so just guess the system default.
  return Encodings.SystemDefault

End Function
[/code]

TimStreater · January 11, 2019, 6:34pm

Sounds like a reasonable guess is that the file is a mac/linux ASCII file.

But there’s no guarantee of that. Only the creator of the file knows what the content is “supposed” to be. And as I’ve discovered processing email files, the creator may be lying anyway.

Brian_O_Brien · January 11, 2019, 7:13pm

Merci Jean-Yves!
Thanks all.

Christian_Schmitz · January 11, 2019, 7:30pm

There is an Encodings.UTF8.IsValidData function which may be quicker than your Xojo code.