Well, this is embarrassing. It turns out that @Jeremy C was right: the compiler optimized away my loop when I commented out the part where the values are written back to the MemoryBlock, so the writes are not particularly slow after all!

Here is the function, in case anyone wants to make suggestions. Note that the helper calls (the commented-out Stream2Word and Encipher lines) have been inlined intentionally to avoid the overhead of function calls.

Private Sub Expand0State(repetitions As Integer, ParamArray keys() As Xojo.Core.MutableMemoryBlock)
//
// Mixes each supplied key into the P and S tables reached through PPtr and SPtr,
// repeating the whole pass over all keys `repetitions` times.
//
// repetitions: how many times the outer mixing loop runs over every key.
// keys:        one or more key blocks. Each element of the array is overwritten
//              with a derived "stream key" (length a multiple of 4, byte-swapped
//              on little-endian hosts) before mixing starts.
//
// Side effects visible in this method: sets WasKeySet to true, writes through
// PPtr and SPtr, and calls RaiseErrorIf with kErrorKeyCannotBeEmpty when a key
// has size 0 (presumably that raises an exception - confirm in RaiseErrorIf).
//
// NOTE(review): the structure matches the Blowfish key schedule used by
// bcrypt's Expand0State - confirm against the reference implementation.
//
// Runtime safety checks are switched off in non-debug builds for speed.
//
#if not DebugBuild
#pragma BackgroundTasks False
#pragma BoundsChecking False
#pragma NilObjectChecking False
#pragma StackOverflowChecking False
#endif
// Index of the last 32-bit entry of the P table that the loops below touch.
const kLastIndex as integer = BLF_N + 1 // = 17
// Multipliers/divisors that stand in for shifts by 24, 16 and 8 bits.
const kShift3 as UInt32 = 256 ^ 3
const kShift2 as UInt32 = 256 ^ 2
const kShift1 as UInt32 = 256 ^ 1
// Byte masks, numbered from the most significant byte (0) to the least (3).
// kMask0 is not referenced anywhere in this method.
const kMask0 as UInt32 = &hFF000000
const kMask1 as UInt32 = &h00FF0000
const kMask2 as UInt32 = &h0000FF00
const kMask3 as UInt32 = &h000000FF
// Local copies of the table pointers used by every loop below.
dim myPPtr as ptr = PPtr
dim mySPtr as ptr = SPtr
//
// Create the stream keys
//
for keyIndex as integer = 0 to keys.Ubound
dim key as Xojo.Core.MutableMemoryBlock = keys( keyIndex )
dim keySize as integer = key.Size
if keySize = 0 then
RaiseErrorIf( true, kErrorKeyCannotBeEmpty )
end if
WasKeySet = true
dim streamKey as Xojo.Core.MutableMemoryBlock
dim streamKeySize as integer = keySize
if ( keySize mod 4 ) = 0 then
// Already a whole number of 32-bit words: take a plain copy.
streamKey = new Xojo.Core.MutableMemoryBlock( keySize )
streamKey.Left( keySize ) = key
else
// Not word-aligned: lay four copies of the key back to back so the
// total length (4 * keySize) is a multiple of 4 and the words read
// from it wrap around the key bytes.
streamKeySize = streamKeySize * 4
streamKey = new Xojo.Core.MutableMemoryBlock( streamKeySize )
streamKey.Left( keySize ) = key
streamKey.Mid( keySize, keySize ) = key
streamKey.Mid( keySize + keySize, keySize ) = key
streamKey.Right( keySize ) = key
end if
if IsLittleEndian then
//
// Swap the bytes
//
// Reverses the byte order of every 32-bit word in place so the later
// UInt32 reads see the key bytes most-significant-first.
dim streamKeyPtr as ptr = streamKey.Data
dim swapIndex as integer
while swapIndex < streamKeySize
dim temp as UInt32 = streamKeyPtr.UInt32( swapIndex )
// byte0 -> byte3, byte1 -> byte2, byte2 -> byte1, byte3 -> byte0.
// NOTE(review): the last term relies on the product being truncated
// to 32 bits to drop the upper bytes - confirm.
streamKeyPtr.UInt32( swapIndex ) = _
( temp \ kShift3 ) or _
( ( temp and kMask1 ) \ kShift1 ) or _
( ( temp and kMask2 ) * kShift1 ) or _
( temp * kShift3 )
swapIndex = swapIndex + 4
wend
end if
// Replace the caller-supplied key with its prepared stream key.
keys( keyIndex ) = streamKey
next
// Main mixing pass: every repetition processes every prepared key in order.
for rep as integer = 1 to repetitions
for keyIndex as integer = 0 to keys.Ubound
dim key as Xojo.Core.MutableMemoryBlock = keys( keyIndex )
dim keyPtr as ptr = key.Data
dim keySize as integer = key.Size
// NOTE(review): j, d0 and d1 are read below before any explicit assignment,
// so this code depends on them starting at 0 for every key - confirm that
// Dim inside the loop re-initialises them on each pass.
dim j as integer
dim i, k as integer, arrIndex as integer
dim temp as UInt32
dim d0, d1 as UInt32
// Step 1: XOR successive 32-bit key words into P entries 0..kLastIndex,
// wrapping back to the start of the key when its end is reached.
for i = 0 to kLastIndex
'temp = Stream2Word( key, j, streamBuffer, streamBufferPtr )
if j = keySize then
j = 0
end if
temp = keyPtr.UInt32( j )
j = j + 4
arrIndex = i * 4
myPPtr.UInt32( arrIndex ) = myPPtr.UInt32( arrIndex ) xor temp
next i
dim a, b, c, d as integer // Used as indexes
dim inner as integer
dim xl as UInt32
dim xr as UInt32
dim j1 as UInt32
j = 0
arrIndex = 0
// Step 2: repeatedly encipher the running pair (d0, d1) and store each
// result into the next two P entries. P(0) is re-read on every pass
// because this loop overwrites the P table as it goes.
for i = 0 to kLastIndex step 2
'self.Encipher( d0, d1 )
xl = d0
xr = d1
xl = xl xor myPPtr.UInt32( 0 )
// Each iteration performs two rounds (P entries inner and inner + 1),
// so the loop covers P entries 1..16.
for inner = 1 to 16 step 2
j1 = xl
// Split the word into its four bytes, most significant first. `a` needs
// no mask: a UInt32 divided by 256 ^ 3 is already at most 255.
a = ( j1 \ kShift3 )
b = ( j1 \ kShift2 ) and kMask3
c = ( j1 \ kShift1 ) and kMask3
d = j1 and kMask3
// Look each byte up in its own 256-entry section of the S table
// (entry offsets 0, 256, 512, 768) and combine: ( ( s0 + s1 ) xor s2 ) + s3.
j1 = ( ( mySPtr.UInt32( a * 4 ) + mySPtr.UInt32( ( 256 + b ) * 4 ) ) _
xor mySPtr.UInt32( ( 512 + c ) * 4 ) ) _
+ mySPtr.UInt32( ( 768 + d ) * 4 )
xr = xr xor ( j1 xor myPPtr.UInt32( inner * 4 ) )
// Second round of the pair: same lookup driven by xr, result folded into xl.
j1 = xr
a = ( j1 \ kShift3 )
b = ( j1 \ kShift2 ) and kMask3
c = ( j1 \ kShift1 ) and kMask3
d = j1 and kMask3
j1 = ( ( mySPtr.UInt32( a * 4 ) + mySPtr.UInt32( ( 256 + b ) * 4 ) ) _
xor mySPtr.UInt32( ( 512 + c ) * 4 ) ) _
+ mySPtr.UInt32( ( 768 + d ) * 4 )
xl = xl xor ( j1 xor myPPtr.UInt32( ( inner + 1 ) * 4 ) )
next inner
// Final XOR with P entry 17, then hand the halves back swapped.
xr = xr xor myPPtr.UInt32( 17 * 4 )
d0 = xr
d1 = xl
myPPtr.UInt32( arrIndex ) = d0
arrIndex = arrIndex + 4
myPPtr.UInt32( arrIndex ) = d1
arrIndex = arrIndex + 4
next i
// Step 3: the same enciphering, but the outputs now overwrite the S table.
// This loop writes only through mySPtr, so P(0) is read once up front.
dim firstPPtr as UInt32 = myPPtr.UInt32( 0 )
arrIndex = 0
// 4 * 128 passes, two words written per pass: arrIndex walks 1024
// consecutive 32-bit S entries without being reset between sections.
for i = 0 to 3
for k = 0 to 255 step 2
'self.Encipher( d0, d1 )
xl = d0
xr = d1
xl = xl xor firstPPtr
// Same two-rounds-per-iteration body as in step 2.
for inner = 1 to 16 step 2
j1 = xl
a = ( j1 \ kShift3 )
b = ( j1 \ kShift2 ) and kMask3
c = ( j1 \ kShift1 ) and kMask3
d = j1 and kMask3
j1 = ( ( mySPtr.UInt32( a * 4 ) + mySPtr.UInt32( ( 256 + b ) * 4 ) ) _
xor mySPtr.UInt32( ( 512 + c ) * 4 ) ) _
+ mySPtr.UInt32( ( 768 + d ) * 4 )
xr = xr xor ( j1 xor myPPtr.UInt32( inner * 4 ) )
j1 = xr
a = ( j1 \ kShift3 )
b = ( j1 \ kShift2 ) and kMask3
c = ( j1 \ kShift1 ) and kMask3
d = j1 and kMask3
j1 = ( ( mySPtr.UInt32( a * 4 ) + mySPtr.UInt32( ( 256 + b ) * 4 ) ) _
xor mySPtr.UInt32( ( 512 + c ) * 4 ) ) _
+ mySPtr.UInt32( ( 768 + d ) * 4 )
xl = xl xor ( j1 xor myPPtr.UInt32( ( inner + 1 ) * 4 ) )
next inner
xr = xr xor myPPtr.UInt32( 17 * 4 )
d0 = xr
d1 = xl
// The S lookups above read entries this loop has already replaced, so
// the statement order here must not be changed.
mySPtr.UInt32( arrIndex ) = d0
arrIndex = arrIndex + 4
mySPtr.UInt32( arrIndex ) = d1
arrIndex = arrIndex + 4
next k
next i
next keyIndex
next rep
End Sub