//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//
#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     srcend
#define F_h     dst
#define tmp1    x9
#define tmp2    x3

#define L(l) .L ## l
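
// Note: F_l/F_h reuse srcend/dst, and tmp1/tmp2 alias B_h/dst, so these
// aliases are only used where the registers they shadow are no longer live.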

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes, which are fully unrolled, and large copies
// of more than 96 bytes, which align the destination and use an unrolled
// loop processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tail-calls memcpy for these cases as
// well as for non-overlapping copies.
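//
// In outline (an informal, C-like sketch of the dispatch below, for
// reference only):
//   if (count <= 16)       copy 0..16 bytes with overlapping accesses;
//   else if (count <= 96)  copy fully unrolled, reading all data first;
//   else                   align dst, then loop copying 64 bytes/iteration;
//
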
__memcpy:
    prfm    PLDL1KEEP, [src]
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes.
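    // Bit 6 of count-1 is set only when count >= 65, which is handled by
    // copy96; for the remaining 17..64 byte copies, bit 5 of count-1 is set
    // only when count >= 33, i.e. when a middle 32 bytes must also be copied.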
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
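    // Copy 8..16 bytes: the two 8-byte loads (and stores) overlap in the
    // middle whenever count < 16, which is harmless because all data is
    // read before any of it is written.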
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret

    .p2align 4
1:
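    // Count is 0..7 here; bit 2 of count is set only for 4..7 bytes, which
    // are copied with two possibly overlapping 4-byte accesses.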
    tbz     count, 2, 1f
    ldr     A_lw, [src]
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes. Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
1:
    cbz     count, 2f
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 65..96 bytes. Copy 64 bytes from the start and
    // 32 bytes from the end.
L(copy96):
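    // A_l/A_h already hold the first 16 bytes, loaded on the medium-copy
    // path before branching here.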
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret
    // Align dst to 16-byte alignment so that the 16-byte stores never cross
    // a cache-line boundary (the loads keep the original src alignment).
    // There are at least 96 bytes to copy, so copy 16 bytes unaligned and
    // then align. The loop copies 64 bytes per iteration, with the loads
    // running one iteration ahead of the stores.
    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1
    add     count, count, tmp1      // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
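    // Subtract 128 + 16: 16 because count is 16 too large (see above), 64
    // for the bytes already loaded into A..D, and 64 for the final block
    // that is always copied from srcend after the loop.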
    subs    count, count, 128 + 16  // Test and readjust count.
    b.ls    2f
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret

//
// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
// Larger backwards copies are also handled by memcpy. The only remaining
// case is forward large copies. The destination is aligned, and an
// unrolled loop processes 64 bytes per iteration.
//
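// In outline (an informal, C-like sketch of the dispatch below, for
// reference only):
//   if (count <= 96 || (UINTN)(dstin - src) >= count)  tail-call __memcpy;
//   else if (dstin == src)                             return;
//   else                                               copy backwards, 64 bytes/iteration;
//
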
ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    AARCH64_BTI(c)
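    // tmp2 = dstin - src. Tail-call __memcpy when count <= 96, or when
    // (dstin - src) >= count as an unsigned compare, i.e. dst lies below src
    // or the buffers do not overlap. Only an overlapping copy with dst above
    // src falls through to the backward copy; dst == src returns immediately.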
    sub     tmp2, dstin, src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi
    b.hs    __memcpy
    cbz     tmp2, 3f
    add     dstend, dstin, count
    add     srcend, src, count
    // Align dstend to 16-byte alignment so that the 16-byte stores never
    // cross a cache-line boundary (the loads keep the original srcend
    // alignment). There are at least 96 bytes to copy, so copy 16 bytes
    // unaligned and then align. The loop copies 64 bytes per iteration,
    // with the loads running one iteration ahead of the stores.
    and     tmp2, dstend, 15
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
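    // Subtract 128: 64 for the bytes already loaded into A..D and 64 for
    // the final block that is always copied from the start after the loop.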
    subs    count, count, 128
    b.ls    2f
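    // The nop below appears to be padding to keep the loop entry at 1:
    // aligned; it has no functional effect.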
    nop
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret