//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//
#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define A_hw w7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l srcend
#define F_h dst
#define tmp1 x9
#define tmp2 x3
#define L(l) .L ## l
// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes, which are fully unrolled, and large
// copies of more than 96 bytes, which align the destination and use an
// unrolled loop processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tail-calls memcpy for these cases as
// well as for non-overlapping copies.
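//
// Illustration only, not part of the code: a hedged C sketch of the size
// dispatch described above. The helper name memcpy_path is hypothetical and
// simply returns which of the three paths a copy of 'count' bytes would take.
//
//   static int memcpy_path (unsigned long count)
//   {
//     if (count <= 16)
//       return 0;       // small:  L(copy16), 0..16 bytes
//     if (count <= 96)
//       return 1;       // medium: 17..96 bytes, fully unrolled
//     return 2;         // large:  L(copy_long), 64 bytes per iteration
//   }
//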
__memcpy:
prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
cmp count, 16
b.ls L(copy16)
cmp count, 96
b.hi L(copy_long)
// Medium copies: 17..96 bytes.
sub tmp1, count, 1
ldp A_l, A_h, [src]
tbnz tmp1, 6, L(copy96)
ldp D_l, D_h, [srcend, -16]
tbz tmp1, 5, 1f
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstend, -32]
1:
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
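//
// Illustration only: a hedged C sketch of the 17..64-byte path above,
// assuming <string.h> and <stdint.h>. The pair type and the load16/store16
// helpers are hypothetical stand-ins for the unaligned ldp/stp pairs (a
// fixed-size memcpy compiles down to plain unaligned loads and stores).
// Every chunk is read before anything is written, so any overlap between
// src and dst is harmless, which is why memmove can tail-call this code.
//
//   typedef struct { uint64_t lo, hi; } pair;
//   static pair load16 (const unsigned char *p) { pair v; memcpy (&v, p, 16); return v; }
//   static void store16 (unsigned char *p, pair v) { memcpy (p, &v, 16); }
//
//   static void copy_17_to_64 (unsigned char *dst, const unsigned char *src,
//                              size_t count)        // 17 <= count <= 64
//   {
//     pair a = load16 (src);                        // A: first 16 bytes
//     pair d = load16 (src + count - 16);           // D: last 16 bytes
//     if (count > 32) {
//       pair b = load16 (src + 16);                 // B
//       pair c = load16 (src + count - 32);         // C
//       store16 (dst + 16, b);
//       store16 (dst + count - 32, c);
//     }
//     store16 (dst, a);
//     store16 (dst + count - 16, d);
//   }
//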
.p2align 4
// Small copies: 0..16 bytes.
L(copy16):
cmp count, 8
b.lo 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
.p2align 4
1:
tbz count, 2, 1f
ldr A_lw, [src]
ldr A_hw, [srcend, -4]
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
// Copy 0..3 bytes. Use a branchless sequence that copies the same
// byte 3 times if count==1, or the 2nd byte twice if count==2.
1:
cbz count, 2f
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb A_hw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb A_hw, [dstend, -1]
2: ret
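//
// Illustration only: a hedged C sketch of L(copy16), including the
// branchless 0..3-byte tail above. Two possibly overlapping word copies
// handle 4..16 bytes; for 1..3 bytes the three byte stores never branch
// and never write outside [dst, dst + count).
//
//   static void copy_0_to_16 (unsigned char *dst, const unsigned char *src,
//                             size_t count)         // 0 <= count <= 16
//   {
//     if (count >= 8) {                             // two overlapping 8-byte words
//       uint64_t a, b;
//       memcpy (&a, src, 8);
//       memcpy (&b, src + count - 8, 8);
//       memcpy (dst, &a, 8);
//       memcpy (dst + count - 8, &b, 8);
//     } else if (count >= 4) {                      // two overlapping 4-byte words
//       uint32_t a, b;
//       memcpy (&a, src, 4);
//       memcpy (&b, src + count - 4, 4);
//       memcpy (dst, &a, 4);
//       memcpy (dst + count - 4, &b, 4);
//     } else if (count != 0) {                      // 1..3 bytes
//       size_t mid = count >> 1;                    // count==1 writes byte 0 three times
//       unsigned char first = src[0], last = src[count - 1], middle = src[mid];
//       dst[0] = first;
//       dst[mid] = middle;
//       dst[count - 1] = last;
//     }
//   }
//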
.p2align 4
// Copy 65..96 bytes. Copy 64 bytes from the start and
// 32 bytes from the end.
L(copy96):
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [src, 32]
ldp D_l, D_h, [src, 48]
ldp E_l, E_h, [srcend, -32]
ldp F_l, F_h, [srcend, -16]
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin, 32]
stp D_l, D_h, [dstin, 48]
stp E_l, E_h, [dstend, -32]
stp F_l, F_h, [dstend, -16]
ret
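//
// Illustration only: a hedged C sketch of the 65..96-byte path above,
// reusing the hypothetical pair/load16/store16 helpers from the sketch
// after the medium-copy code. Four 16-byte chunks come from the start and
// two from the end; the end chunks may overlap the start chunks, so all
// loads are done before the overlapping stores.
//
//   static void copy_65_to_96 (unsigned char *dst, const unsigned char *src,
//                              size_t count)        // 65 <= count <= 96
//   {
//     pair a = load16 (src);
//     pair b = load16 (src + 16);
//     pair c = load16 (src + 32);
//     pair d = load16 (src + 48);
//     pair e = load16 (src + count - 32);
//     pair f = load16 (src + count - 16);
//     store16 (dst, a);
//     store16 (dst + 16, b);
//     store16 (dst + 32, c);
//     store16 (dst + 48, d);
//     store16 (dst + count - 32, e);
//     store16 (dst + count - 16, f);
//   }
//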
// Align DST to 16 byte alignment so that we don't cross cache line
// boundaries on both loads and stores. There are at least 96 bytes
// to copy, so copy 16 bytes unaligned and then align. The loop
// copies 64 bytes per iteration, with the loads running one iteration
// ahead of the stores.
.p2align 4
L(copy_long):
and tmp1, dstin, 15
bic dst, dstin, 15
ldp D_l, D_h, [src]
sub src, src, tmp1
add count, count, tmp1 // Count is now 16 too large.
ldp A_l, A_h, [src, 16]
stp D_l, D_h, [dstin]
ldp B_l, B_h, [src, 32]
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 // Test and readjust count.
b.ls 2f
1:
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [src, 32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [src, 48]
stp D_l, D_h, [dst, 64]!
ldp D_l, D_h, [src, 64]!
subs count, count, 64
b.hi 1b
// Write the last full set of 64 bytes. The remainder is at most 64
// bytes, so it is safe to always copy 64 bytes from the end even if
// there is just 1 byte left.
2:
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend, -48]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [srcend, -16]
stp D_l, D_h, [dst, 64]
stp E_l, E_h, [dstend, -64]
stp A_l, A_h, [dstend, -48]
stp B_l, B_h, [dstend, -32]
stp C_l, C_h, [dstend, -16]
ret
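//
// Illustration only: a hedged C sketch of L(copy_long), assuming the source
// and destination do not overlap and reusing the hypothetical
// pair/load16/store16 helpers from the earlier sketch. The sketch drops the
// software pipelining; in the assembly the loads run one iteration ahead of
// the stores, which is also what lets memmove hand this code overlapping
// copies whose destination lies below the source.
//
//   static void copy_long_sketch (unsigned char *dstin, const unsigned char *src,
//                                 size_t count)      // count > 96
//   {
//     unsigned char *dstend = dstin + count;
//     const unsigned char *srcend = src + count;
//
//     store16 (dstin, load16 (src));                 // first 16 bytes, unaligned store
//
//     size_t skew = (uintptr_t) dstin & 15;
//     unsigned char *dst = dstin - skew;             // dst + 16 is 16-byte aligned
//     src -= skew;                                   // keep src in step with dst
//
//     // 64 bytes per iteration to an aligned destination; stop once everything
//     // beyond dst + 16 fits in the 64-byte tail copied from the end.
//     while ((size_t) (dstend - dst) > 80) {
//       store16 (dst + 16, load16 (src + 16));
//       store16 (dst + 32, load16 (src + 32));
//       store16 (dst + 48, load16 (src + 48));
//       store16 (dst + 64, load16 (src + 64));
//       dst += 64;
//       src += 64;
//     }
//
//     // Always copy the last 64 bytes from the end; they may overlap the
//     // loop's final stores, which is harmless for non-overlapping buffers.
//     store16 (dstend - 64, load16 (srcend - 64));
//     store16 (dstend - 48, load16 (srcend - 48));
//     store16 (dstend - 32, load16 (srcend - 32));
//     store16 (dstend - 16, load16 (srcend - 16));
//   }
//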
//
// All memmoves of up to 96 bytes are done by memcpy as it supports overlaps.
// Larger copies where the destination is below the source, or does not
// overlap it at all, are also handled by memcpy. The only remaining case
// is a large copy whose destination overlaps the source from above
// (src < dst < src + count); it is copied backwards: the end of the
// destination is aligned and an unrolled loop processes 64 bytes per
// iteration.
//
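//
// Illustration only: a hedged C sketch of the dispatch below. The unsigned
// difference dst - src mirrors the sub/ccmp/b.hs sequence: it is >= count
// exactly when a forward copy is safe (dst below src, or no overlap at all).
// The helper name copymem_path is hypothetical.
//
//   static int copymem_path (const unsigned char *dst, const unsigned char *src,
//                            size_t count)
//   {
//     if (count <= 96 || ((uintptr_t) dst - (uintptr_t) src) >= count)
//       return 0;                     // tail-call __memcpy
//     if (dst == src)
//       return 2;                     // nothing to do
//     return 1;                       // overlapping with dst above src: copy backwards
//   }
//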
ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
AARCH64_BTI(c)
sub tmp2, dstin, src
cmp count, 96
ccmp tmp2, count, 2, hi
b.hs __memcpy
cbz tmp2, 3f
add dstend, dstin, count
add srcend, src, count
// Align dstend to 16 byte alignment so that we don't cross cache line
// boundaries on both loads and stores. There are at least 96 bytes
// to copy, so copy 16 bytes unaligned and then align. The loop
// copies 64 bytes per iteration, with the loads running one iteration
// ahead of the stores.
and tmp2, dstend, 15
ldp D_l, D_h, [srcend, -16]
sub srcend, srcend, tmp2
sub count, count, tmp2
ldp A_l, A_h, [srcend, -16]
stp D_l, D_h, [dstend, -16]
ldp B_l, B_h, [srcend, -32]
ldp C_l, C_h, [srcend, -48]
ldp D_l, D_h, [srcend, -64]!
sub dstend, dstend, tmp2
subs count, count, 128
b.ls 2f
nop
1:
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [srcend, -16]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [srcend, -48]
stp D_l, D_h, [dstend, -64]!
ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
b.hi 1b
// Write the last full set of 64 bytes. The remainder is at most 64
// bytes, so it is safe to always copy 64 bytes from the start even if
// there is just 1 byte left.
2:
ldp E_l, E_h, [src, 48]
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [src, 32]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [src, 16]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [src]
stp D_l, D_h, [dstend, -64]
stp E_l, E_h, [dstin, 48]
stp A_l, A_h, [dstin, 32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
3: ret
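//
// Illustration only: a hedged C sketch of the backward path above, reusing
// the hypothetical pair/load16/store16 helpers from the earlier sketches.
// To stay simple it buffers everything a later store could clobber (the
// unaligned 16-byte tail, the topmost aligned 64-byte block and the 64-byte
// head) before writing anything; the assembly gets the same effect more
// cheaply by scheduling its loads one iteration ahead of its stores.
//
//   static void copy_backward_sketch (unsigned char *dstin,
//                                     const unsigned char *src, size_t count)
//   {                                  // count > 96, src < dstin < src + count
//     unsigned char *dstend = dstin + count;
//     const unsigned char *srcend = src + count;
//     size_t skew = (uintptr_t) dstend & 15;
//
//     pair t  = load16 (srcend - 16);                // unaligned tail
//     pair a3 = load16 (srcend - skew - 16);         // topmost aligned block
//     pair a2 = load16 (srcend - skew - 32);
//     pair a1 = load16 (srcend - skew - 48);
//     pair a0 = load16 (srcend - skew - 64);
//     pair h0 = load16 (src);                        // 64-byte head, written last
//     pair h1 = load16 (src + 16);
//     pair h2 = load16 (src + 32);
//     pair h3 = load16 (src + 48);
//
//     store16 (dstend - 16, t);
//     dstend -= skew;                                // dstend is now 16-byte aligned
//     srcend -= skew;
//     store16 (dstend - 16, a3);
//     store16 (dstend - 32, a2);
//     store16 (dstend - 48, a1);
//     store16 (dstend - 64, a0);
//     dstend -= 64;
//     srcend -= 64;
//
//     // Walk backwards, 64 aligned bytes per iteration, until what is left
//     // is covered by the buffered 64-byte head.
//     while ((size_t) (dstend - dstin) > 64) {
//       store16 (dstend - 16, load16 (srcend - 16));
//       store16 (dstend - 32, load16 (srcend - 32));
//       store16 (dstend - 48, load16 (srcend - 48));
//       store16 (dstend - 64, load16 (srcend - 64));
//       dstend -= 64;
//       srcend -= 64;
//     }
//
//     store16 (dstin, h0);
//     store16 (dstin + 16, h1);
//     store16 (dstin + 32, h2);
//     store16 (dstin + 48, h3);
//   }
//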