//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//
#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     srcend
#define F_h     dst
#define tmp1    x9
#define tmp2    x3

#define L(l) .L ## l
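
// Note: F_l/F_h reuse srcend/dst, and tmp1/tmp2 alias B_h/dst, so these
// aliases are only used where the registers they shadow are no longer live.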

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes, which are fully unrolled, and large copies
// of more than 96 bytes, which align the destination and use an unrolled
// loop processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tail-calls memcpy for these cases as
// well as for non-overlapping copies.
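//
// In outline (an informal, C-like sketch of the dispatch below, for
// reference only):
//   if (count <= 16)       copy 0..16 bytes with overlapping accesses;
//   else if (count <= 96)  copy fully unrolled, reading all data first;
//   else                   align dst, then loop copying 64 bytes/iteration;
//
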
__memcpy:
    prfm    PLDL1KEEP, [src]
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes.
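    // Bit 6 of count-1 is set only when count >= 65, which is handled by
    // copy96; for the remaining 17..64 byte copies, bit 5 of count-1 is set
    // only when count >= 33, i.e. when a middle 32 bytes must also be copied.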
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
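    // Copy 8..16 bytes: the two 8-byte loads (and stores) overlap in the
    // middle whenever count < 16, which is harmless because all data is
    // read before any of it is written.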
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret

    .p2align 4
1:
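    // Count is 0..7 here; bit 2 of count is set only for 4..7 bytes, which
    // are copied with two possibly overlapping 4-byte accesses.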
    tbz     count, 2, 1f
    ldr     A_lw, [src]
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes. Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
1:
    cbz     count, 2f
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 65..96 bytes. Copy 64 bytes from the start and
    // 32 bytes from the end.
L(copy96):
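    // A_l/A_h already hold the first 16 bytes, loaded on the medium-copy
    // path before branching here.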
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret
    // Align dst to 16-byte alignment so that the 16-byte stores never cross
    // a cache-line boundary (the loads keep the original src alignment).
    // There are at least 96 bytes to copy, so copy 16 bytes unaligned and
    // then align. The loop copies 64 bytes per iteration, with the loads
    // running one iteration ahead of the stores.
    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1
    add     count, count, tmp1      // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
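    // Subtract 128 + 16: 16 because count is 16 too large (see above), 64
    // for the bytes already loaded into A..D, and 64 for the final block
    // that is always copied from srcend after the loop.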
    subs    count, count, 128 + 16  // Test and readjust count.
    b.ls    2f
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret

//
// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
// Larger backwards copies are also handled by memcpy. The only remaining
// case is forward large copies. The destination is aligned, and an
// unrolled loop processes 64 bytes per iteration.
//
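// In outline (an informal, C-like sketch of the dispatch below, for
// reference only):
//   if (count <= 96 || (UINTN)(dstin - src) >= count)  tail-call __memcpy;
//   else if (dstin == src)                             return;
//   else                                               copy backwards, 64 bytes/iteration;
//
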
ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    AARCH64_BTI(c)
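    // tmp2 = dstin - src. Tail-call __memcpy when count <= 96, or when
    // (dstin - src) >= count as an unsigned compare, i.e. dst lies below src
    // or the buffers do not overlap. Only an overlapping copy with dst above
    // src falls through to the backward copy; dst == src returns immediately.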
    sub     tmp2, dstin, src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi
    b.hs    __memcpy
    cbz     tmp2, 3f
    add     dstend, dstin, count
    add     srcend, src, count
    // Align dstend to 16-byte alignment so that the 16-byte stores never
    // cross a cache-line boundary (the loads keep the original srcend
    // alignment). There are at least 96 bytes to copy, so copy 16 bytes
    // unaligned and then align. The loop copies 64 bytes per iteration,
    // with the loads running one iteration ahead of the stores.
    and     tmp2, dstend, 15
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
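    // Subtract 128: 64 for the bytes already loaded into A..D and 64 for
    // the final block that is always copied from the start after the loop.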
    subs    count, count, 128
    b.ls    2f
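    // The nop below appears to be padding to keep the loop entry at 1:
    // aligned; it has no functional effect.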
    nop
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret