Determining a text file's character set

  1. 6 months ago

    Brian O

    11 Jan 2019 Pre-Release Testers, Xojo Pro Calgary, AB

    I have recently run into some text files that are not pure ASCII.
    After years of handling CSV files, I came across one that had a BOM in it, and it raised some flags in my mind.

    So maybe I just found a new use for a histogram personally.

    What would be a good way of determining whether a text file originated on Mac or Windows?
    Well, MS-DOS/Windows wants CR/LF, and Mac/Linux want LF.

    So using imageJ I opened up a .txt file and imported it as raw.
    There were 3767 bytes in the file so I opened the raw as 1 row of 3767 columns of 8 bit data.
    I computed the image's histogram and looked at what it shows.

    There were 64 occurrences of 10 and 0 occurrences of 13
    There were also no values greater than 127!

    Right now I'm trying to determine if my app should be UTF8 or ISO Latin 1 or UTF8 big4... and you know what? I just don't know.
    But can the histogram of data tell me more than just line feed endings?
    I mean there is no data > 127 here so is it strictly ASCII?

    Just thinking out loud.. Thoughts?

  2. Jean-Yves P

    11 Jan 2019 Pre-Release Testers, Xojo Pro Europe (France, Besançon)
    Attributes ( extendsString ) Public Function GuessEncoding(extends s As String) as TextEncoding
      // Convenience overload: guess the text encoding of the bytes in s.
      //
      // This version discards the byte-order result, so it cannot tell you
      // whether detected UTF-32 or UTF-16 data is in the wrong byte order.
      // When that matters, call the two-argument GuessEncoding instead.
      
      Dim ignoredWrongOrder As Boolean
      Dim guessed As TextEncoding = s.GuessEncoding( ignoredWrongOrder )
      Return guessed
      
    End Function

    and

    Attributes ( extendsString ) Public Function GuessEncoding(extends s As String, ByRef outWrongOrder As Boolean) as TextEncoding
      // Guess what text encoding the bytes of the given string are in.
      // The encoding already set on the string is ignored.
      //
      // Detection order (first match wins):
      //   1. Byte-order mark: UTF-32 (BE/LE), UTF-16, UTF-8
      //   2. No byte above 127: ASCII
      //   3. Nulls dominating every other byte: UTF-16
      //   4. Well-formed UTF-8 throughout: UTF-8
      //   5. Marker bytes typical of MacRoman, then Windows-1252,
      //      then the original DOS code page (DOSLatinUS / IBM437)
      //   6. Otherwise: Encodings.SystemDefault
      //
      // Parameters:
      //   s             - the string whose raw bytes are examined
      //   outWrongOrder - set to True when UTF-32 or UTF-16 is detected in
      //                   the byte order opposite to this platform's;
      //                   set to False in every other case.
      //
      // Returns the guessed TextEncoding.
      
      // Always give the ByRef result a defined value, so a caller that
      // reuses its variable never sees a stale True from an earlier call.
      outWrongOrder = False
      
      // Determine the platform byte order once and cache it.
      static isBigEndian, endianChecked As Boolean
      if not endianChecked then
        Dim temp As String = Encodings.UTF16.Chr( &hFEFF )
        isBigEndian = (AscB( MidB( temp, 1, 1 ) ) = &hFE)
        endianChecked = true
      end if
      
      // Check for a BOM. The UTF-32 LE test must precede the UTF-16 LE
      // test because both start with FF FE.
      Dim b0 As Integer = AscB( s.MidB( 1, 1 ) )
      Dim b1 As Integer = AscB( s.MidB( 2, 1 ) )
      Dim b2 As Integer = AscB( s.MidB( 3, 1 ) )
      Dim b3 As Integer = AscB( s.MidB( 4, 1 ) )
      if b0=0 and b1=0 and b2=&hFE and b3=&hFF then
        // UTF-32, big-endian
        outWrongOrder = not isBigEndian
        return Encodings.UTF32BE
      elseif b0=&hFF and b1=&hFE and b2=0 and b3=0 and s.LenB >= 4 then
        // UTF-32, little-endian
        outWrongOrder = isBigEndian
        return Encodings.UTF32LE
      elseif b0=&hFE and b1=&hFF then
        // UTF-16, big-endian
        outWrongOrder = not isBigEndian
        return Encodings.UTF16
      elseif b0=&hFF and b1=&hFE then
        // UTF-16, little-endian
        outWrongOrder = isBigEndian
        return Encodings.UTF16
      elseif b0=&hEF and b1=&hBB and b2=&hBF then
        // UTF-8 BOM is EF BB BF; the third byte is b2.
        return Encodings.UTF8
      end if
      
      // No BOM; see if it's entirely ASCII.
      // NOTE(review): BOM-less UTF-16 text whose characters are all below
      // U+0080 contains no byte above 127, so it is reported as ASCII here
      // and never reaches the null-pattern check below - confirm whether
      // that is acceptable for your data.
      Dim m As MemoryBlock = s
      Dim maxi As Integer = s.LenB - 1
      Dim i As Integer
      for i = 0 to maxi
        if m.Byte(i) > 127 then exit for
      next
      if i > maxi then return Encodings.ASCII
      
      // Not ASCII; check for a high incidence of nulls every other byte,
      // which suggests UTF-16 (at least in Roman text).
      Dim nulls(1) As Integer  // null count in even (0) and odd (1) bytes
      for i = 0 to maxi
        if m.Byte(i) = 0 then
          nulls(i mod 2) = nulls(i mod 2) + 1
        end if
      next
      if nulls(0) > nulls(1)*2 and nulls(0) > maxi\2 then
        // UTF-16, big-endian
        outWrongOrder = not isBigEndian
        return Encodings.UTF16
      elseif nulls(1) > nulls(0)*2 and nulls(1) > maxi\2 then
        // UTF-16, little-endian
        outWrongOrder = isBigEndian
        return Encodings.UTF16
      end if
      
      // Check for ill-formed UTF-8. Each lead byte fixes the number of
      // continuation bytes (extra) and the allowed range (lo..hi) of the
      // FIRST continuation byte; later ones must be in 80..BF.
      // Lead byte ED is limited to 80..9F so that encoded surrogates
      // (U+D800..U+DFFF) are rejected.
      Dim lead, extra, lo, hi, k As Integer
      Dim b As Integer
      Dim valid As Boolean = True
      i = 0
      while i <= maxi
        lead = m.Byte(i)
        extra = 0
        lo = &h80
        hi = &hBF
        select case lead
        case &h00 to &h7F
          // single-byte character
        case &hC2 to &hDF
          extra = 1
        case &hE0
          extra = 2
          lo = &hA0
        case &hED
          // must be tested before the general E1..EF case below
          extra = 2
          hi = &h9F
        case &hE1 to &hEF
          extra = 2
        case &hF0
          extra = 3
          lo = &h90
        case &hF1 to &hF3
          extra = 3
        case &hF4
          extra = 3
          hi = &h8F
        else
          // 80..C1 and F5..FF can never start a sequence
          valid = False
        end select
        
        // a sequence truncated by the end of the data is ill-formed
        if valid and i + extra > maxi then valid = False
        
        if valid then
          for k = 1 to extra
            b = m.Byte(i + k)
            if k = 1 then
              if b < lo or b > hi then valid = False
            elseif b < &h80 or b > &hBF then
              valid = False
            end if
          next
        end if
        
        if not valid then exit while
        i = i + extra + 1
      wend
      if valid then return Encodings.UTF8  // no ill-formed sequences, so that's probably what it is
      
      // Look for bytes characteristic of MacRoman.
      // http://stackoverflow.com/questions/4198804/how-to-reliably-guess-the-encoding-between-macroman-cp1252-latin1-utf-8-and
      for i = 0 to maxi
        select case m.Byte(i)
        case &h8E, &h8F, &h9A, &hA1, &hA5, &hA8, &hD0, &hD1, &hD5, &hE1
          return Encodings.MacRoman
        end select
      next
      
      // Look for bytes characteristic of Windows-1252.
      // http://stackoverflow.com/questions/4198804/how-to-reliably-guess-the-encoding-between-macroman-cp1252-latin1-utf-8-and
      for i = 0 to maxi
        select case m.Byte(i)
        case &h92, &h95, &h96, &h97, &hAE, &hB0, &hB7, &hE8, &hE9, &hF6
          return Encodings.WindowsANSI
        end select
      next
      
      // Look for bytes characteristic of the original DOS code page
      // (DOSLatinUS, i.e. IBM437).
      // http://stackoverflow.com/questions/2700794/what-character-encoding-is-this
      for i = 0 to maxi
        select case m.Byte(i)
        case &h82, &h8A, &h88, &h85, &h87
          return Encodings.DOSLatinUS
        end select
      next
      
      // Nothing matched; just guess the system default.
      return Encodings.SystemDefault
      
    End Function
  3. Tim S

    11 Jan 2019 Canterbury, UK

    Sounds like a reasonable guess is that the file is a mac/linux ASCII file.

    But there's no guarantee of that. Only the creator of the file knows what the content is "supposed" to be. And as I've discovered processing email files, the creator may be lying anyway.

  4. Brian O

    11 Jan 2019 Pre-Release Testers, Xojo Pro Calgary, AB

    Merci Jean-Yves!
    Thanks all.

  5. Christian S

    11 Jan 2019 Pre-Release Testers, Xojo Pro, XDC Speakers Germany

    There is an Encodings.UTF8.IsValidData function, which may be quicker than your Xojo code.

or Sign Up to reply!