String concatenation is slow, so currently I split the string, iterate over the resulting array into a new array and insert a space every third iteration, and join the new array to get the final string.
Well, I was curious and made a quick test.
Using split and join is much faster than any manual string handling. And even a loop which creates a 3 char array (quick and dirty, so no check if it works properly at string end), like
[code]Public Function build3CharArray(s As String) as String()
  // Splits s into consecutive chunks of three characters and returns them
  // as an array, ready to be passed to Join.
  //
  // s: the text to split.
  // Returns: one element per group of three characters; the final element
  // holds the remaining one or two characters when the length of s is not
  // a multiple of three. An empty s yields an empty array.
  #Pragma BackgroundTasks False

  Dim b() As String
  If s = "" Then
    Return b
  End If

  Dim a() As String = Split(s,"")
  Dim last As Integer = a.Ubound

  // Walk to the very last index so that neither the final complete triple
  // nor a short trailing group is dropped.
  For i As Integer = 0 To last Step 3
    Dim chunk As String = a(i)
    If i + 1 <= last Then
      chunk = chunk + a(i + 1)
    End If
    If i + 2 <= last Then
      chunk = chunk + a(i + 2)
    End If
    b.Append chunk
  Next

  Return b
End Function[/code]
is fast for larger strings in my test. Perhaps it is worth a try to build a 3 char string array which can be joined.
Doing the matches iteratively, we get all of them:
// Collect every successive three-character match of the pattern "..."
// from the sample text and show the matches joined by single spaces.
Dim rx As New RegEx
rx.SearchPattern = "..."
Dim text As String = "aaaabbbbccccddddeeeeffffgggghhhhiiiijjjjkkkkllllmmmmnnnnoooo"
Dim ar3() As String

// The first Search call takes the text; calling Search without arguments
// afterwards continues from the end of the previous match.
Dim match As RegExMatch = rx.Search(text)
While match <> Nil
  ar3.Append(match.SubExpressionString(0))
  match = rx.Search
Wend

MsgBox Join(ar3," ")
[code]Dim s As String = "aaabbbcccdddeeefffggghhhiiijjjkkklllmmmnnnooop"
If s <> "" Then
  'number of 3-byte groups, rounded up. Everything below works on bytes,
  'so the byte length is used; this is only correct for text in which
  'every character occupies exactly one byte.
  Dim l As Integer = Ceil(LenB(s) / 3)
  Dim mb As MemoryBlock = s
  'bump the size of the mb to a 3 boundary so we dont have to code around overrunning the buffer
  mb.Size = l * 3
  'create a new mb the size of the original with room for the spaces
  Dim newmb As New MemoryBlock(l * 4)
  For n As Integer = 0 To l - 1
    newmb.StringValue(n * 4, 4) = mb.StringValue(n * 3, 3) + Chr(32)
  Next
  'remove the last space as we only wanted spaces between groups of three
  newmb.StringValue(newmb.Size - 1, 1) = Chr(0)
  'assign the string back into s, as we will always have a 0 at the end we can use CString to trim the string down
  s = newmb.CString(0)
End If[/code]
Julian, I like your approach but this will fail, or outright crash, if the string contains characters outside the ASCII range (code point > 127) or the encoding is UTF-16 or UTF-32. Those characters will spread over two or more bytes so you will end up splitting those with a space.
You can get around that by converting the original string to UTF-8, then scanning each byte to get the segment size. If the byte < 128, it’s solo. Otherwise, you can check the value to determine if it’s 2, 3, or 4 bytes. This should still be pretty quick.
Also, instead of appending Chr(32), it would probably be faster to set the next byte in newmb to 32 manually.
Finally, the encoding of the result should be defined back to UTF-8, then converted to whatever the original encoding was.
Using Julian’s idea as a starting point, I came up with this generalized code. To process a string of 700,000 characters takes about 75 ms here.
Public Function InsertEvery(s As String, every As Integer, charToInsert As String = " ") as String
  //
  // Returns s with charToInsert inserted between groups of `every`
  // characters. No separator is added after the final group.
  //
  // s:            the text to process. If it has an encoding it is
  //               processed as UTF-8 and converted back afterwards; if its
  //               encoding is nil the bytes are scanned as they are.
  // every:        number of characters per group. Values below 1 make no
  //               sense and return s unchanged.
  // charToInsert: the separator; may be more than one character.
  //
  if s = "" or charToInsert = "" or every < 1 then
    return s
  end if

  dim origEncoding as TextEncoding = s.Encoding
  if origEncoding isa TextEncoding then
    s = s.ConvertEncoding( Encodings.UTF8 )
    // The separator is spliced into UTF-8 bytes, so it must be UTF-8 too
    if charToInsert.Encoding isa TextEncoding then
      charToInsert = charToInsert.ConvertEncoding( Encodings.UTF8 )
    end if
  end if

  dim r as string

  dim charToInsertSize as integer = charToInsert.LenB
  dim mb as MemoryBlock = s
  dim srcSize as integer = mb.Size

  // Worst case is one separator per `every` bytes plus one for a final
  // partial group (the trailing separator is written, then cut off)
  dim newSize as integer = srcSize + ( ( srcSize \ every ) + 1 ) * charToInsertSize
  dim rmb as new MemoryBlock( newSize )

  dim p as ptr = mb
  dim lastByteIndex as integer = srcSize - 1
  dim byteIndex as integer = 0
  dim insertIndex as integer = 0

  while byteIndex <= lastByteIndex
    dim startByteIndex as integer = byteIndex
    dim charCount as integer = 0

    // Advance over up to `every` characters, using the lead byte of each
    // UTF-8 sequence to determine how many bytes it occupies
    while charCount < every and byteIndex <= lastByteIndex
      dim thisByte as integer = p.Byte( byteIndex )
      select case thisByte
      case 0 to 127
        byteIndex = byteIndex + 1
      case &b11000000 to &b11011111
        byteIndex = byteIndex + 2
      case &b11100000 to &b11101111
        byteIndex = byteIndex + 3
      case else
        byteIndex = byteIndex + 4
      end select
      charCount = charCount + 1
    wend

    // A truncated or malformed sequence at the end must not push the
    // segment beyond the source data
    if byteIndex > srcSize then
      byteIndex = srcSize
    end if
    dim segmentSize as integer = byteIndex - startByteIndex

    rmb.StringValue( insertIndex, segmentSize ) = _
    mb.StringValue( startByteIndex, segmentSize )
    insertIndex = insertIndex + segmentSize

    rmb.StringValue( insertIndex, charToInsertSize ) = charToInsert
    insertIndex = insertIndex + charToInsertSize
  wend

  // Cut off the whole trailing separator, not just its last byte
  r = rmb.StringValue( 0, insertIndex - charToInsertSize )
  r = r.DefineEncoding( Encodings.UTF8 )

  if origEncoding is nil then
    r = r.DefineEncoding( nil )
  else
    r = r.ConvertEncoding( origEncoding )
    #if DebugBuild
      // Stop in the debugger if the round trip produced invalid data
      if origEncoding.IsValidData( r ) = false then
        break
      end if
    #endif
  end if

  return r
End Function
And the new framework is actually slower, although I don’t yet understand why. It takes about 110 ms vs 75 ms for the classic MemoryBlock. I’d expect it to be dramatically faster since it can copy bytes directly and doesn’t have to go through a string.
Here it is with the new framework. By adding the code to disable error checking to both, I was able to process a 700k string in about 50 ms with the classic MemoryBlock, and 25 ms with the new.
Public Function InsertEveryNewMB(s As String, every As Integer, charToInsert As String = " ") as String
  //
  // Returns s with charToInsert inserted between groups of `every`
  // characters, using Xojo.Core memory blocks. No separator is added
  // after the final group.
  //
  // s:            the text to process. If it has an encoding it is
  //               processed as UTF-8 and converted back afterwards; if its
  //               encoding is nil the bytes are scanned as they are.
  // every:        number of characters per group. Values below 1 make no
  //               sense and return s unchanged.
  // charToInsert: the separator; may be more than one character.
  //
  #if not DebugBuild
    #pragma BackgroundTasks False
    #pragma NilObjectChecking False
    #pragma StackOverflowChecking False
    #pragma BoundsChecking False
  #endif

  if s = "" or charToInsert = "" or every < 1 then
    return s
  end if

  dim origEncoding as TextEncoding = s.Encoding
  if origEncoding isa TextEncoding then
    s = s.ConvertEncoding( Encodings.UTF8 )
    // The separator is spliced into UTF-8 bytes, so it must be UTF-8 too
    if charToInsert.Encoding isa TextEncoding then
      charToInsert = charToInsert.ConvertEncoding( Encodings.UTF8 )
    end if
  end if

  dim r as string

  dim mb as Xojo.Core.MemoryBlock = StringToNewMB( s )
  dim mbCharsToInsert as Xojo.Core.MemoryBlock = StringToNewMB( charToInsert )
  dim charToInsertSize as integer = mbCharsToInsert.Size
  dim srcSize as integer = mb.Size

  // Worst case is one separator per `every` bytes plus one for a final
  // partial group (the trailing separator is written, then cut off)
  dim newSize as integer = srcSize + ( ( srcSize \ every ) + 1 ) * charToInsertSize
  dim rmb as new Xojo.Core.MutableMemoryBlock( newSize )

  dim p as ptr = mb.Data
  dim rp as ptr = rmb.Data

  dim lastByteIndex as integer = srcSize - 1
  dim byteIndex as integer = 0
  dim insertIndex as integer = 0

  while byteIndex <= lastByteIndex
    dim startByteIndex as integer = byteIndex
    dim charCount as integer = 0

    // Advance over up to `every` characters, using the lead byte of each
    // UTF-8 sequence to determine how many bytes it occupies
    while charCount < every and byteIndex <= lastByteIndex
      dim thisByte as integer = p.Byte( byteIndex )
      select case thisByte
      case 0 to 127
        byteIndex = byteIndex + 1
      case &b11000000 to &b11011111
        byteIndex = byteIndex + 2
      case &b11100000 to &b11101111
        byteIndex = byteIndex + 3
      case else
        byteIndex = byteIndex + 4
      end select
      charCount = charCount + 1
    wend

    // A truncated or malformed sequence at the end must not push the
    // segment beyond the source data
    if byteIndex > srcSize then
      byteIndex = srcSize
    end if

    //
    // Copy the segment bytes manually
    // (measured faster than using Mid for this part)
    //
    for i as integer = startByteIndex to byteIndex - 1
      rp.Byte( insertIndex ) = p.Byte( i )
      insertIndex = insertIndex + 1
    next

    //
    // For the separator, Mid was measured to be faster than a byte loop
    //
    rmb.Mid( insertIndex, charToInsertSize ) = mbCharsToInsert
    insertIndex = insertIndex + charToInsertSize
  wend

  // Cut off the whole trailing separator, not just its last byte
  r = NewMBToString( rmb.Left( insertIndex - charToInsertSize ) )
  r = r.DefineEncoding( Encodings.UTF8 )

  if origEncoding is nil then
    r = r.DefineEncoding( nil )
  else
    r = r.ConvertEncoding( origEncoding )
    #if DebugBuild
      // Stop in the debugger if the round trip produced invalid data
      if origEncoding.IsValidData( r ) = false then
        break
      end if
    #endif
  end if

  return r
End Function
Private Function StringToNewMB(s As String) as Xojo.Core.MemoryBlock
  //
  // Converts a classic String into a Xojo.Core memory block holding the
  // same bytes.
  //
  // The string is first assigned to a classic MemoryBlock, which is then
  // handed (with its size) to a Xojo.Core.MemoryBlock. The bytes are
  // finally appended to a fresh, initially empty MutableMemoryBlock, and
  // that block is returned.
  // NOTE(review): presumably the intermediate block only refers to the
  // classic block's memory, which is why the data is appended into a new
  // block before returning - confirm against the framework docs.
  //
  dim classicMB as MemoryBlock = s
  dim wrapper as new Xojo.Core.MemoryBlock( classicMB, classicMB.Size )

  dim result as new Xojo.Core.MutableMemoryBlock( 0 )
  result.Append wrapper
  return result
End Function
Private Function NewMBToString(newMB As Xojo.Core.MemoryBlock) as String
  //
  // Converts a Xojo.Core memory block into a classic String.
  //
  // The block's Data pointer is assigned to a classic MemoryBlock, and
  // newMB.Size bytes starting at offset 0 are read from it as a string.
  // No encoding is defined here; callers set it on the result.
  //
  dim byteCount as integer = newMB.Size
  dim classicMB as MemoryBlock = newMB.Data
  return classicMB.StringValue( 0, byteCount )
End Function