OK, I’ve added this method to my M_String module, but have not posted the update to my web site yet. It ensures a string is made into valid UTF-8, discarding whatever doesn’t make sense. Comments welcome:
Protected Function MakeValidUTF8(src As String) As String
  // Returns src as a string that is well-formed UTF-8.
  //
  // src: any string. With no encoding it is tagged as UTF-8; with a
  //      different encoding it is converted to UTF-8.
  // Returns: src itself when it is empty or already passes
  //      Encodings.UTF8.IsValidData; otherwise a copy with every byte that
  //      cannot be part of a valid sequence dropped. The result may be an
  //      empty string if nothing survives.
  
  // Step 1: make sure the string is labelled (or converted to) UTF-8.
  If src.Encoding = Nil Then
    src = src.DefineEncoding( Encodings.UTF8 )
  ElseIf src.Encoding <> Encodings.UTF8 Then
    src = src.ConvertEncoding( Encodings.UTF8 )
  End If
  
  // Step 2: fast exits - nothing to repair.
  If src = "" Then Return src
  If Encodings.UTF8.IsValidData( src ) Then Return src
  
  // Step 3: the string is non-empty, tagged UTF-8, and malformed.
  // Compact the good bytes toward the front of a scratch buffer, in place.
  // The write position never passes the read position, so no unread input
  // is overwritten.
  Dim buffer As MemoryBlock = src
  Dim bytes As Ptr = buffer
  Dim finalPos As Integer = buffer.Size - 1
  Dim inPos As Integer
  Dim outPos As Integer
  
  While inPos <= finalPos
    Dim lead As Integer = bytes.Byte( inPos )
    
    // Classify the byte by its high bits:
    //   1    = plain 7-bit character
    //   2..6 = lead byte announcing a sequence of that many bytes
    //   0    = cannot start a character (&hFE, &hFF, or a stray
    //          continuation byte &h80..&hBF)
    Dim seqLen As Integer
    Select Case lead
    Case Is <= &h7F
      seqLen = 1
    Case Is >= &hFE
      seqLen = 0
    Case Is >= &hFC
      seqLen = 6
    Case Is >= &hF8
      seqLen = 5
    Case Is >= &hF0
      seqLen = 4
    Case Is >= &hE0
      seqLen = 3
    Case Is >= &hC0
      seqLen = 2
    Else
      seqLen = 0
    End Select
    
    If seqLen = 1 Then
      // Single-byte character: keep it as is.
      bytes.Byte( outPos ) = lead
      inPos = inPos + 1
      outPos = outPos + 1
      
    ElseIf seqLen >= 2 And ( inPos + seqLen - 1 ) <= finalPos Then
      // Enough bytes remain for the announced sequence; let the encoding
      // decide whether the whole group is legitimate.
      Dim candidate As String = buffer.StringValue( inPos, seqLen )
      If Encodings.UTF8.IsValidData( candidate ) Then
        buffer.StringValue( outPos, seqLen ) = candidate
        inPos = inPos + seqLen
        outPos = outPos + seqLen
      Else
        // Not a real lead byte after all: drop just this byte and let the
        // following bytes be examined on their own.
        inPos = inPos + 1
      End If
      
    Else
      // Either an illegal starter or a sequence cut off by the end of the
      // data: drop this one byte.
      inPos = inPos + 1
    End If
  Wend
  
  // Step 4: whatever was kept occupies the first outPos bytes of the buffer.
  // If nothing was kept, the default (empty) string is returned.
  Dim cleaned As String
  If outPos <> 0 Then
    cleaned = buffer.StringValue( 0, outPos ).DefineEncoding( Encodings.UTF8 )
  End If
  Return cleaned
End Function