/*
* Copyright (c) 2009 Apple Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this
* file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
/*****************************************************************************
* Cortex-A8 implementation *
*****************************************************************************/
// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
//
// Our tests have shown that NEON is always a performance win for memcpy( ).
// However, for the specific case of copies from a warm source to a cold
// destination when the buffer size is between 1k and 32k, it is not enough
// of a performance win to offset the increased power footprint, resulting
// in an energy usage regression. Thus, we detect that particular case, and
// pass those copies through the ARM core registers. All other copies larger
// than 8 bytes are handled on NEON.
//
// Stephen Canon, August 2009
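//
// For orientation only -- a rough C sketch of the dispatch strategy below,
// not part of the build. The alignment preambles and cleanup loops are
// elided, and the 1k-32k test is really applied to the length remaining
// once the destination has been brought to cacheline alignment:
//
//     void *copy_sketch(void *dst, const void *src, size_t n) {
//         ptrdiff_t offset = (char *)dst - (char *)src;
//         if (offset == 0) return dst;              // nothing to do
//         // (size_t)offset < n means the buffers overlap, forcing the
//         // back-to-front copy; otherwise we copy front-to-back.
//         if (n < 8)                    { /* byte-at-a-time loop      */ }
//         else if (n - 0x400 < 0x7c00)  { /* ldm/stm via the GPRs     */ }
//         else                          { /* NEON, 64 bytes/iteration */ }
//         return dst;
//     }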
.text
.code 16
.syntax unified
// void bcopy(const void * source,
// void * destination,
// size_t length);
//
// void *memmove(void * destination,
// const void * source,
// size_t n);
//
// void *memcpy(void * restrict destination,
// const void * restrict source,
// size_t n);
//
// all copy n successive bytes from source to destination. memmove and memcpy
// return destination, whereas bcopy has no return value. copying takes place
// as if it were through a temporary buffer -- after return, destination contains
// exactly the bytes from source, even if the buffers overlap.
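//
// For example, with char buf[] = "abcdef", memmove(buf+2, buf, 4) must leave
// "ababcd": the copy behaves as if "abcd" were first staged in a temporary
// buffer, which is what forces the reverse-order copy below whenever
// 0 < (destination - source) < length.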
.thumb_func _bcopy$VARIANT$CortexA8
.thumb_func _memmove$VARIANT$CortexA8
.thumb_func _memcpy$VARIANT$CortexA8
.globl _bcopy$VARIANT$CortexA8
.globl _memmove$VARIANT$CortexA8
.globl _memcpy$VARIANT$CortexA8
#define SAVE_REGISTERS {r4,r5,r6,r8,r10,r11}
#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r11}
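// Note that r3 and r9 appear in COPY_REGISTERS but not in SAVE_REGISTERS;
// under the ABI targeted here both are scratch registers (r9 is volatile on
// this platform), so only r4-r6, r8, r10, and r11 need to be preserved.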
/*****************************************************************************
* entry points *
*****************************************************************************/
.align 2
_bcopy$VARIANT$CortexA8:
// bcopy takes its first and second arguments in the opposite order from the C
// library functions memmove and memcpy. If bcopy is called, we swap these
// two arguments and then fall into memmove.
mov r3, r0
mov r0, r1
mov r1, r3
.align 2
_memmove$VARIANT$CortexA8:
_memcpy$VARIANT$CortexA8:
// At entry to memmove/memcpy, registers contain the following values:
//
// r0 pointer to the first byte of the destination buffer
// r1 pointer to the first byte of the source buffer
// r2 number of bytes to copy
//
// Our preference is to use a (faster and easier to understand) front-to-back
// copy of the buffer. However, memmove requires that copies take place as
// though through a temporary buffer. This means that if the buffers overlap,
// it may be necessary to copy the buffer in reverse order.
//
// To properly detect such overlap, we begin by computing the offset between
// the source and destination pointers. If the offset happens to be zero,
// then there is no work to be done, so we can early out.
subs r3, r0, r1
it eq
bxeq lr
// r3 now contains the offset between the buffers, (destination - source). If
// 0 < offset < length, then the high-addressed bytes of the source alias the
// low-addressed bytes of the destination. Thus, if we were to perform the
// copy in ascending address order, we would overwrite the high-addressed
// source bytes before we had a chance to copy them, and the data would be lost.
//
// Thus, we can use the front-to-back copy only if offset is negative or
// greater than the length. This is the case precisely if offset compares
// unsigned higher than length.
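//
// Worked example: with source = 0x1000, destination = 0x1004, and length = 16,
// offset = 4 and 4 < 16 (unsigned), so destination bytes 0-11 alias source
// bytes 4-15 and the copy must run in reverse. With destination = 0x0ffc
// instead, offset = -4 wraps to 0xfffffffc, which compares unsigned higher
// than 16, so the front-to-back copy is safe.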
cmp r3, r2
bhs L_copyFrontToBack
/*****************************************************************************
* back to front copy *
*****************************************************************************/
// Here we have fallen through into the back-to-front copy. We preserve the
// original destination pointer in r0 because it is the return value for the
// routine, and update the other registers as follows:
//
// r1 one byte beyond the end of the source buffer
// r2 number of bytes to copy
// ip one byte beyond the end of the destination buffer
mov ip, r0
add r1, r2
add ip, r2
// Subtract 8 from the buffer length; if this is negative, then we will use
// only single-byte copies, and we jump directly to a scalar copy loop.
subs r2, $8
blt L_scalarReverseCopy
// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
// to move the data.
tst ip, $7
beq L_vectorReverseCopy
// Otherwise, we copy a single byte at a time, in order of descending memory
// address, until the destination is 8 byte aligned. Within this loop,
// registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to be copied) - 8
// r3 temporary to hold the byte that is being copied
// ip pointer one byte past the destination of the next byte to be copied
//
//                           byte that will be copied in this iteration
//                           |   byte that was copied in the previous iteration
// Source buffer:            v   v
// ------------------------+---+---+-------------------------
//  bytes still to copy ...|   |   | ... bytes already copied
// ------------------------+---+---+-------------------------
//                               ^
//                               r1 holds the address of this byte
0: ldrb r3, [r1, $-1]!
sub r2, $1
strb r3, [ip, $-1]!
tst ip, $7
bne 0b
// At this point, the destination pointer is 8 byte aligned. Check again that
// there are at least 8 bytes remaining to copy by comparing the remaining
// length minus 8 to zero. If fewer than 8 bytes remain, jump to the cleanup
// path.
cmp r2, $0
blt L_scalarReverseCopy
/*****************************************************************************
* destination is 8 byte aligned *
*****************************************************************************/
L_vectorReverseCopy:
// At this point, registers contain the following values:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to copy) - 8
// ip pointer one byte past the destination of the next byte to be copied
//
// Furthermore, it is known that ip is 8 byte aligned, and that r2 is non-negative.
// NEON has really excellent alignment handling in hardware, so we would like
// to use that to handle cases where the source is not similarly aligned to the
// destination (it supports even single-byte misalignment at speed). However,
// on some SoC designs, not all of the DMA buses support such access. Thus,
// we must unfortunately use a software workaround in those cases.
//
// Fortunately, 4-byte aligned loads are supported even on the DMA buses, so
// we only need to handle the different possible source alignments modulo 4.
// Here we have a dispatch table to jump to the correct copy implementation
// for the given source alignment.
//
// The tbh instruction loads the address offset of the correct implementation
// from the data table that immediately follows it and adds it to the pc to
// jump to the correct branch.
ands r3, r1, $3
tbh [pc, r3, lsl $1]
0:
.short (L_reverseAligned0-0b)/2
.short (L_reverseAligned1-0b)/2
.short (L_reverseAligned2-0b)/2
.short (L_reverseAligned3-0b)/2
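// For example, if r3 == 2 (source is 2 bytes past a word boundary), the pc
// read by tbh is the address of the tbh instruction plus 4, which is exactly
// 0b; tbh fetches the halfword at 0b + (2 << 1), namely
// (L_reverseAligned2-0b)/2, doubles it, and adds it to pc, landing on
// L_reverseAligned2.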
/*****************************************************************************
* source is also at least word aligned *
*****************************************************************************/
L_reverseAligned0:
// Subtract 56 from r2, so that it contains the number of bytes remaining to
// copy minus 64. If this result is negative, then we jump into a loop that
// copies 8 bytes at a time.
subs r2, $0x38
blt L_reverseVectorCleanup
// Check if the destination pointer is 64-byte aligned. If so, jump to a loop
// that copies whole cachelines.
tst ip, $0x38
beq L_reverseCachelineAligned
// Otherwise, we copy 8 bytes at a time, in order of descending memory
// address, until the destination is 64 byte aligned. Within this loop,
// registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to be copied) - 64
// ip pointer one byte past the destination of the next byte to be copied
// d0 temporary storage for copy
//
//         bytes that will be copied after this iteration
//         |        8 byte block that will be copied in this iteration
//         v        v
// --------------+-------------------------------+---------------------
//               | 0   1   2   3   4   5   6   7 | bytes already copied
// --------------+-------------------------------+---------------------
//                                               ^
//                                               r1 points here
0: sub r1, $8
vld1.32 {d0}, [r1]
sub ip, $8
sub r2, $8
tst ip, $0x38
vst1.64 {d0}, [ip,:64]
bne 0b
// At this point, the destination pointer is 64 byte aligned. Check again that
// there are at least 64 bytes remaining to copy by comparing the remaining
// length minus 64 to zero. If fewer than 64 bytes remain, skip over the main
// copy loop.
cmp r2, $0
blt L_reverseVectorCleanup
/*****************************************************************************
* destination is cacheline aligned *
*****************************************************************************/
L_reverseCachelineAligned:
// In the special case that we are copying a buffer of between 1k and 32k bytes
// we do not use a NEON copy for the main loop. This is because if we happen
// to be doing a copy from a source in cache to a destination that is not in
// cache, this will result in an increase in energy usage. In all other cases,
// NEON gives superior energy conservation.
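// (In the code below, r2 holds the remaining length minus 64, so r3 becomes
// the remaining length minus 0x400; the unsigned compare r3 < 0x7c00 then
// holds exactly when 0x400 <= remaining < 0x8000, i.e. for 1k-32k copies.)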
sub r3, r2, $0x3c0
cmp r3, $0x7c00
blo L_useSTMDB
// Pre-decrement the source (r1) and destination (ip) pointers so that they
// point to the first byte of the trailing 32-byte window of each buffer.
// Additionally, load the address increment of -32 into r3.
sub r1, $32
sub ip, $32
mov r3, $-32
// The destination pointer is known to be 64-byte aligned, so we can use the
// maximal alignment hint (:256) for our vector stores. Detect if the source
// is also at least 32-byte aligned and jump to a loop that uses maximal
// alignment hints for the loads as well if possible.
tst r1, $0x1f
beq L_reverseSourceAligned
// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
// 64-byte aligned destination, in order of descending memory address. Within
// this loop, registers are used as follows:
//
// r0 original destination pointer (unmodified)
// r1 pointer to the next 32-byte block to load
// r2 (number of bytes remaining to copy) - 64
// r3 address increment of -32.
// ip pointer to which the next 32-byte block is to be stored
// q0-q3 temporary registers used for copies
//
// Note that the loop is arranged in such a way that a single cleanup store is
// necessary after the final loop iteration. This occurs at label (1), and is
// shared between the unaligned and aligned loops.
vld1.32 {q2,q3}, [r1], r3
vld1.32 {q0,q1}, [r1], r3
subs r2, $64
vst1.64 {q2,q3}, [ip,:256], r3
blt 1f
.align 3
0: vld1.32 {q2,q3}, [r1], r3
vst1.64 {q0,q1}, [ip,:256], r3
vld1.32 {q0,q1}, [r1], r3
subs r2, $64
vst1.64 {q2,q3}, [ip,:256], r3
bge 0b
b 1f
L_reverseSourceAligned:
// This loop is identical to the immediately preceding loop, except that it
// uses the additional alignment hint that the source pointer (r1) is 32-byte
// aligned. The two loops share cleanup code for the final iteration.
vld1.64 {q2,q3}, [r1,:256], r3
vld1.64 {q0,q1}, [r1,:256], r3
subs r2, $64
vst1.64 {q2,q3}, [ip,:256], r3
blt 1f
.align 3
0: vld1.64 {q2,q3}, [r1,:256], r3
vst1.64 {q0,q1}, [ip,:256], r3
vld1.64 {q0,q1}, [r1,:256], r3
subs r2, $64
vst1.64 {q2,q3}, [ip,:256], r3
bge 0b
// Final vector store for both of the above loops.
1: vst1.64 {q0,q1}, [ip,:256], r3
// Adjust the source and destination pointers so that they once again point to
// the last byte that we used (which is one byte higher than the address that
// we will use next for any required cleanup).
add r1, $32
add ip, $32
L_reverseVectorCleanup:
// Add 56 to r2, so that it contains the number of bytes remaining to copy minus
// 8. A comparison of this value with zero tells us if any more whole 8-byte
// blocks need to be copied.
adds r2, r2, $0x38
blt L_scalarReverseCopy
// This loop copies 8 bytes at a time in order of descending memory address,
// until fewer than 8 bytes remain to be copied. Within this loop, registers
// are used as follows:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to be copied) - 8
// ip pointer one byte past the destination of the next byte to be copied
// d0 temporary storage for copy
0: sub r1, $8
vld1.32 {d0}, [r1]
sub ip, $8
subs r2, $8
vst1.64 {d0}, [ip,:64]
bge 0b
/*****************************************************************************
* sub-doubleword cleanup copies *
*****************************************************************************/
L_scalarReverseCopy:
// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
// return to the calling routine if zero bytes remain.
adds r2, $8
it eq
bxeq lr
// Copy one byte at a time in descending address order until we reach the front
// of the buffer. Within this loop, registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 bytes remaining to be copied
// r3 temporary to hold the byte that is being copied
// ip pointer one byte past the destination of the next byte to be copied
0: ldrb r3, [r1, $-1]!
subs r2, $1
strb r3, [ip, $-1]!
bne 0b
bx lr
/*****************************************************************************
* STMDB loop for 1k-32k buffers *
*****************************************************************************/
// This loop copies 64 bytes each iteration in order of descending memory
// address, using the GPRs instead of NEON.
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to be copied) - 64
// r3-6,r8-11 (COPY_REGISTERS) temporary registers used for moving data
// ip pointer to one byte past the next location to store to
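//
// Each loop iteration issues two ldm/stm pairs of eight registers each
// (2 x 32 bytes), and the pld hints the next (lower-addressed) source line
// into cache ahead of its use.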
L_useSTMDB:
push SAVE_REGISTERS
.align 3
0: ldmdb r1!, COPY_REGISTERS
subs r2, r2, $64
stmdb ip!, COPY_REGISTERS
ldmdb r1!, COPY_REGISTERS
pld [r1, $-64]
stmdb ip!, COPY_REGISTERS
bge 0b
pop SAVE_REGISTERS
b L_reverseVectorCleanup
/*****************************************************************************
* Misaligned reverse vld1 loop *
*****************************************************************************/
// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.
//
// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
// which we combine with the 8 bytes loaded in the previous iteration to get a
// 16 byte field; the next 8 bytes to be stored to the destination buffer are
// somewhere in that field, and we get them using the VEXT instruction:
//
// |  8 bytes from this iteration  |  8 bytes from last iteration  |
// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
// | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//     ^8 bytes to store this iteration^                           |
//                                          could be a page boundary
//
// We need to be a little bit careful, however. Because the loads only have 4
// byte alignment, the very first load could slop over into a page that is not
// mapped readable. In order to prevent this scenario, we copy eight bytes one
// at a time before beginning the main loop.
//
// At the beginning of each iteration through this loop, registers are used
// as follows:
//
// r0 original destination pointer
// r1 pointer to the next block of 8 bytes to load
// r2 (bytes remaining to copy) - 8
// ip pointer to the next block of 8 bytes to store
// d0 next 8 bytes to store
// d2 8 bytes loaded in the previous iteration
// d3 8 bytes loaded two iterations ago
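//
// Illustrative C model of one vector iteration -- not part of the build;
// the names field, d2_bytes, d3_bytes, and dst8 are hypothetical. The two
// most recent aligned loads form a 16-byte field, and
// vext.8 d0, d2, d3, $(offset) selects the 8-byte window at 'offset':
//
//     uint8_t field[16];
//     memcpy(field,     d2_bytes, 8);   // newer, lower-addressed load (d2)
//     memcpy(field + 8, d3_bytes, 8);   // older, higher-addressed load (d3)
//     memcpy(dst8, field + offset, 8);  // the 8 bytes actually stored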
#define RCOPY_UNALIGNED(offset) \
0: ldrb r3, [r1,$-1]! ;\
strb r3, [ip,$-1]! ;\
subs r2, $1 ;\
blt L_scalarReverseCopy ;\
tst ip, $7 ;\
bne 0b ;\
bic r1, $3 ;\
sub r1, $8 ;\
sub ip, $8 ;\
mov r3, $-8 ;\
vld1.32 {d2,d3}, [r1], r3 ;\
subs r2, $8 ;\
blt 1f ;\
0: vext.8 d0, d2, d3, $(offset);\
vmov d3, d2 ;\
vld1.32 {d2}, [r1], r3 ;\
subs r2, $8 ;\
vst1.64 {d0}, [ip, :64], r3 ;\
bge 0b ;\
1: vext.8 d0, d2, d3, $(offset);\
add r1, $8 ;\
vst1.64 {d0}, [ip, :64] ;\
2: add r1, $(offset);\
b L_scalarReverseCopy
L_reverseAligned1:
RCOPY_UNALIGNED(1)
L_reverseAligned2:
RCOPY_UNALIGNED(2)
L_reverseAligned3:
RCOPY_UNALIGNED(3)
/*****************************************************************************
* front to back copy *
*****************************************************************************/
L_copyFrontToBack:
// Here the pointers are laid out such that we can use our preferred
// front-to-back copy. We preserve original destination pointer in r0 because
// it is the return value for the routine, and copy it to ip to use in this
// routine.
mov ip, r0
// Subtract 8 from the buffer length; if this is negative, then we will use
// only single-byte copies, and we jump directly to a scalar copy loop.
subs r2, $8
blt L_scalarCopy
// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
// to move the data.
tst ip, $7
beq L_vectorCopy
// Otherwise, we copy a single byte at a time, in order of ascending memory
// address, until the destination is 8 byte aligned. Within this loop,
// registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to the next byte to copy
// r2 (bytes remaining to be copied) - 8
// r3 temporary to hold the byte that is being copied
// ip pointer to the next byte to store to
0: ldrb r3, [r1], $1
sub r2, $1
strb r3, [ip], $1
tst ip, $7
bne 0b
// At this point, the destination pointer is 8 byte aligned. Check again that
// there are at least 8 bytes remaining to copy by comparing the remaining
// length minus 8 to zero. If fewer than 8 bytes remain, jump to the cleanup
// path.
cmp r2, $0
blt L_scalarCopy
/*****************************************************************************
* destination is doubleword aligned *
*****************************************************************************/
L_vectorCopy:
// At this point, registers contain the following values:
//
// r0 original destination pointer
// r1 pointer to the next element to be copied
// r2 (bytes remaining to copy) - 8
// ip pointer to the destination of the next byte to be copied
//
// Furthermore, it is known that ip is 8 byte aligned, and that r2 is non-negative.
// NEON has really excellent alignment handling in hardware, so we would like
// to use that to handle cases where the source is not similarly aligned to the
// destination (it supports even single-byte misalignment at speed). However,
// on some SoC designs, not all of the DMA buses support such access. Thus,
// we must unfortunately use a software workaround in those cases.
//
// Fortunately, 4-byte aligned loads are supported even on the DMA buses, so
// we only need to handle the different possible source alignments modulo 4.
// Here we have a dispatch table to jump to the correct copy implementation
// for the given source alignment.
//
// The tbh instruction loads the address offset of the correct implementation
// from the data table that immediately follows it and adds it to the pc to
// jump to the correct branch.
ands r3, r1, $3
bic r1, $3
tbh [pc, r3, lsl $1]
0:
.short (L_sourceAligned0-0b)/2
.short (L_sourceAligned1-0b)/2
.short (L_sourceAligned2-0b)/2
.short (L_sourceAligned3-0b)/2
/*****************************************************************************
* source is also at least word aligned *
*****************************************************************************/
L_sourceAligned0:
// Subtract 56 from r2, so that it contains the number of bytes remaining to
// copy minus 64. If this result is negative, then we jump into a loop that
// copies 8 bytes at a time.
subs r2, $0x38
blt L_vectorCleanup
// Check if the destination pointer is 64-byte aligned. If so, jump to a loop
// that copies whole cachelines.
tst ip, $0x38
beq L_cachelineAligned
// Otherwise, we copy 8 bytes at a time, in order of ascending memory
// address, until the destination is 64 byte aligned. Within this loop,
// registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to the next element to be copied
// r2 (bytes remaining to be copied) - 64
// ip pointer to the destination of the next byte to be copied
// d0 temporary storage for copy
0: vld1.32 {d0}, [r1]!
sub r2, $8
vst1.64 {d0}, [ip,:64]!
tst ip, $0x38
bne 0b
// At this point, the destination pointer is 64 byte aligned. Check again that
// there are at least 64 bytes remaining to copy by comparing the remaining
// length minus 64 to zero. If fewer than 64 bytes remain, skip over the main
// copy loop.
cmp r2, $0
blt L_vectorCleanup
/*****************************************************************************
* destination is cacheline aligned *
*****************************************************************************/
// In the special case that we are copying a buffer of between 1k and 32k bytes
// we do not use a NEON copy for the main loop. This is because if we happen
// to be doing a copy from a source in cache to a destination that is not in
// cache, this will result in an increase in energy usage. In all other cases,
// NEON gives superior energy conservation.
L_cachelineAligned:
sub r3, r2, $0x3c0
cmp r3, $0x7c00
blo L_useSTMIA
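// (This is the same 1k-32k window test as in the reverse copy: r2 holds the
// remaining length minus 64, so the branch above is taken exactly when the
// remaining length is between 0x400 and 0x8000 bytes.)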
// The destination pointer is known to be 64-byte aligned, so we can use the
// maximal alignment hint (:256) for our vector stores. Detect if the source
// is also at least 32-byte aligned and jump to a loop that uses maximal
// alignment hints for the loads as well if possible.
tst r1, $0x1f
beq L_sourceAligned32
// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
// 64-byte aligned destination, in order of ascending memory address. Within
// this loop, registers are used as follows:
//
// r0 original destination pointer (unmodified)
// r1 pointer to the next 32-byte block to load
// r2 (number of bytes remaining to copy) - 64
// ip pointer to which the next 32-byte block is to be stored
// q0-q3 temporary registers used for copies
//
// Note that the loop is arranged in such a way that a single cleanup store is
// necessary after the final loop iteration. This occurs at label (1), and is
// shared between the unaligned and aligned loops.
vld1.32 {q2,q3}, [r1]!
vld1.32 {q0,q1}, [r1]!
subs r2, $64
vst1.64 {q2,q3}, [ip,:256]!
blt 1f
.align 3
0: vld1.32 {q2,q3}, [r1]!
vst1.64 {q0,q1}, [ip,:256]!
vld1.32 {q0,q1}, [r1]!
subs r2, $64
vst1.64 {q2,q3}, [ip,:256]!
bge 0b
b 1f
L_sourceAligned32:
// This loop is identical to the immediately preceding loop, except that it
// uses the additional alignment hint that the source pointer (r1) is 32-byte
// aligned. The two loops share cleanup code for the final iteration.
vld1.64 {q2,q3}, [r1,:256]!
vld1.64 {q0,q1}, [r1,:256]!
subs r2, $64
vst1.64 {q2,q3}, [ip,:256]!
blt 1f
.align 3
0: vld1.64 {q2,q3}, [r1,:256]!
vst1.64 {q0,q1}, [ip,:256]!
vld1.64 {q0,q1}, [r1,:256]!
subs r2, $64
vst1.64 {q2,q3}, [ip,:256]!
bge 0b
// Final vector store for both of the above loops.
1: vst1.64 {q0,q1}, [ip,:256]!
L_vectorCleanup:
// Add 56 to r2, so that it contains the number of bytes remaining to copy minus
// 8. A comparison of this value with zero tells us if any more whole 8-byte
// blocks need to be copied.
adds r2, $0x38
blt L_scalarCopy
// This loop copies 8 bytes at a time in order of ascending memory address,
// until fewer than 8 bytes remain to be copied. Within this loop, registers
// are used as follows:
//
// r0 original destination pointer
// r1 pointer to the next element to be copied
// r2 (bytes remaining to be copied) - 8
// ip pointer to the destination of the next byte to be copied
// d0 temporary storage for copy
0: vld1.32 {d0}, [r1]!
subs r2, $8
vst1.64 {d0}, [ip,:64]!
bge 0b
/*****************************************************************************
* sub-doubleword cleanup copies *
*****************************************************************************/
L_scalarCopy:
// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
// return to the calling routine if zero bytes remain.
adds r2, $8
it eq
bxeq lr
// Copy one byte at a time in ascending address order until we reach the end
// of the buffer. Within this loop, registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to the next byte to be copied
// r2 bytes remaining to be copied
// r3 temporary to hold the byte that is being copied
// ip pointer to the next byte to store to
0: ldrb r3, [r1], $1
strb r3, [ip], $1
subs r2, $1
bne 0b
bx lr
/*****************************************************************************
* STMIA loop for 1k-32k buffers *
*****************************************************************************/
// This loop copies 64 bytes each iteration in order of ascending memory
// address, using the GPRs instead of NEON.
//
// r0 original destination pointer
// r1 pointer to the next element to be copied
// r2 (bytes remaining to be copied) - 64
// r3-6,r8-11 (COPY_REGISTERS) temporary registers used for moving data
// ip pointer to the next location to store to
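//
// (Same structure as the STMDB loop above, but ascending: two 8-register
// ldm/stm pairs move 64 bytes per iteration, and pld prefetches the next
// higher-addressed source line.)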
L_useSTMIA:
push SAVE_REGISTERS
.align 3
0: ldmia r1!, COPY_REGISTERS
subs r2, r2, $64
stmia ip!, COPY_REGISTERS
ldmia r1!, COPY_REGISTERS
pld [r1, $64]
stmia ip!, COPY_REGISTERS
bge 0b
pop SAVE_REGISTERS
b L_vectorCleanup
/*****************************************************************************
* Misaligned forward vld1 loop *
*****************************************************************************/
// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.
//
// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
// which we combine with the 8 bytes loaded in the previous iteration to get a
// 16 byte field; the next 8 bytes to be stored to the destination buffer are
// somewhere in that field, and we get them using the VEXT instruction:
//
// |  8 bytes from last iteration  |  8 bytes from this iteration  |
// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
// | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//     ^8 bytes to store this iteration^                           |
//                                          could be a page boundary
//
// We need to be a little bit careful, however. Because the loads only have 4
// byte alignment, if we used this approach all the way to the end of the
// buffer, the very last 8 byte load might slop over onto a new page by 4
// bytes, and that new page might not be mapped into our process. Thus, we
// terminate this copy loop when fewer than 12 bytes remain to be copied,
// instead of the more natural-seeming termination condition of "8 bytes
// remaining" (the illustration above shows the worst case and demonstrates
// why 12 is a sufficiently safe condition).
//
// At the beginning of each iteration through this loop, registers are used
// as follows:
//
// r0 original destination pointer
// r1 pointer to the next block of 8 bytes to load
// r2 (bytes remaining to copy) - 12
// ip pointer to the next block of 8 bytes to store
// d0 next 8 bytes to store
// d2 8 bytes loaded two iterations ago
// d3 8 bytes loaded in the previous iteration
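//
// Illustrative C model of one vector iteration -- not part of the build;
// the names field, d2_bytes, d3_bytes, and dst8 are hypothetical:
//
//     uint8_t field[16];
//     memcpy(field,     d2_bytes, 8);   // older, lower-addressed load (d2)
//     memcpy(field + 8, d3_bytes, 8);   // newer, higher-addressed load (d3)
//     memcpy(dst8, field + offset, 8);  // the 8 bytes actually stored
//
// Note that the initial 'subs r2, $4' below turns r2 into (bytes remaining
// to copy) - 12, so the vector loop runs only while at least 12 bytes
// remain and the final bytes are always left to the scalar cleanup loop,
// which keeps every 4-byte aligned load inside mapped pages.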
#define COPY_UNALIGNED(offset) \
subs r2, $4 ;\
blt 2f ;\
vld1.32 {d2,d3}, [r1]! ;\
subs r2, $8 ;\
blt 1f ;\
0: vext.8 d0, d2, d3, $(offset);\
vmov d2, d3 ;\
vld1.32 {d3}, [r1]! ;\
subs r2, $8 ;\
vst1.64 {d0}, [ip, :64]! ;\
bge 0b ;\
1: vext.8 d0, d2, d3, $(offset);\
sub r1, $8 ;\
vst1.64 {d0}, [ip, :64]! ;\
2: add r1, $(offset);\
add r2, $4 ;\
b L_scalarCopy
L_sourceAligned1:
COPY_UNALIGNED(1)
L_sourceAligned2:
COPY_UNALIGNED(2)
L_sourceAligned3:
COPY_UNALIGNED(3)
#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD