//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//
#define dstin     x0
#define src       x1
#define count     x2
#define dst       x3
#define srcend    x4
#define dstend    x5
#define A_l       x6
#define A_lw      w6
#define A_h       x7
#define A_hw      w7
#define B_l       x8
#define B_lw      w8
#define B_h       x9
#define C_l       x10
#define C_h       x11
#define D_l       x12
#define D_h       x13
#define E_l       x14
#define E_h       x15
#define F_l       srcend
#define F_h       dst
#define tmp1      x9
#define tmp2      x3

#define L(l) .L ## l
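
// Note: tmp1 aliases B_h (x9), tmp2 aliases dst (x3), and F_l/F_h alias
// srcend/dst; each alias is only live while the other name is not in use.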

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes, which are fully unrolled, and large
// copies of more than 96 bytes, which align the destination and use an
// unrolled loop processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, so the memmove-style entry point (InternalMemCopyMem)
// tailcalls __memcpy for these cases as well as for non-overlapping copies.
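//
// A rough C-level sketch of the dispatch below (illustrative only):
//   if (count <= 16)      { /* copy16: small copies, 0..16 bytes */ }
//   else if (count <= 96) { /* medium copies, fully unrolled     */ }
//   else                  { /* copy_long: 64 bytes per iteration */ }
//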
__memcpy:
    prfm    PLDL1KEEP, [src]
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes.
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)
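    // Bit 6 of count-1 is set for count >= 65, so copies of 65..96 bytes are
    // handled by copy96.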
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f
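    // Bit 5 of count-1 is set for count >= 33: only then are the middle two
    // 16-byte blocks (B and C) needed in addition to A and D.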
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
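    // 8..16 bytes: two 8-byte accesses, one from the start and one from the
    // end, which may overlap in the middle.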
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret

    .p2align 4
1:
    tbz     count, 2, 1f
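    // 4..7 bytes: two possibly overlapping 4-byte accesses, one from the
    // start and one from the end.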
    ldr     A_lw, [src]
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes.  Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
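    // Roughly, in C (illustrative only):
    //   dst[0]         = src[0];
    //   dst[count / 2] = src[count / 2];
    //   dst[count - 1] = src[count - 1];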
1:
    cbz     count, 2f
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 65..96 bytes (bit 6 of count-1 was set).  Copy 64 bytes from the
    // start and 32 bytes from the end.
L(copy96):
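    // A_l/A_h already hold bytes 0..15, loaded before the branch here.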
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores.  There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align.  The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.
    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
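    // tmp1 = dstin % 16; dst is dstin rounded down to 16 bytes, so dst + 16
    // is the first 16-byte-aligned store address.  The first 16 bytes are
    // copied unaligned via D, and src/count are biased by tmp1 to match.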
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1
    add     count, count, tmp1      // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16  // Test and readjust count.
    b.ls    2f
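    // Main loop: store the 64 bytes already held in A..D, then load the next
    // 64 bytes, advancing dst and src by 64 each iteration.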
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes.  The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret

//
// All moves of up to 96 bytes, and all larger moves where the destination
// lies below the source (or the buffers do not overlap at all), are done by
// __memcpy, whose forward copy tolerates those cases.  The only remaining
// case is a large move with the destination above an overlapping source,
// which must be copied from the end downwards: the destination end is
// aligned, and an unrolled loop processes 64 bytes per iteration.
//
ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    sub     tmp2, dstin, src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi
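    // ccmp: if count > 96 ('hi'), compare dstin-src against count; otherwise
    // force NZCV to #2 (carry set) so that b.hs is taken for small counts.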
    b.hs    __memcpy

    cbz     tmp2, 3f
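    // dstin == src: nothing to copy.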
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores.  There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align.  The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.
    and     tmp2, dstend, 15
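    // tmp2 = dstend % 16; the last 16 bytes are copied unaligned via D, then
    // srcend/dstend/count are reduced by tmp2 so that the descending stores
    // in the loop are 16-byte aligned.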
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
    subs    count, count, 128
    b.ls    2f
    nop
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes.  The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret