//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the company nor the names of its contributors
//       may be used to endorse or promote products derived from this
//       software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Assumptions:
//
// ARMv8-a, AArch64
// Neon available.
//
// Arguments and results.
#define srcin     x0
#define cntin     x1
#define chrin     w2
#define result    x0
#define src       x3
#define tmp       x4
#define wtmp2     w5
#define synd      x6
#define soff      x9
#define cntrem    x10
#define vrepchr   v0
#define vdata1    v1
#define vdata2    v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask  v5
#define vend      v6
//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. For each tuple, bit 0 is set if the relevant byte matched the
// requested character and bit 1 is not used (faster than using a 32-bit
// syndrome). Since the bits in the syndrome reflect exactly the order in which
// things occur in the original string, counting trailing zeros allows us to
// identify exactly which byte has matched.
//
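// Illustrative scalar sketch of the syndrome idea (comment only, not part of
// this file's build; the names used here are hypothetical):
//
//   uint64_t syndrome = 0;
//   for (int i = 0; i < 32; i++)            // one aligned 32-byte chunk
//     if (chunk[i] == target)
//       syndrome |= 1ULL << (2 * i);        // bit 0 of that byte's 2-bit tuple
//   if (syndrome != 0)
//     match = chunk + (__builtin_ctzll (syndrome) >> 1);
//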
ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    // Do not dereference srcin if no bytes to compare.
    cbz     cntin, .Lzero_length
    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte.
    //
    mov     wtmp2, #0x0401
    movk    wtmp2, #0x4010, lsl #16
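    // In each 32-bit element of vrepmask the bytes are, in memory order,
    // 0x01, 0x04, 0x10 and 0x40: every byte of a 4-byte group keeps its own
    // bit after the AND with the comparison result, so the pairwise adds
    // below can merge the groups without losing track of which byte matched.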
    dup     vrepchr.16b, chrin
    // Work with aligned 32-byte chunks
    bic     src, srcin, #31
    dup     vrepmask.4s, wtmp2
    ands    soff, srcin, #31
    and     cntrem, cntin, #31
    b.eq    .Lloop
    //
    // Input string is not 32-byte aligned. We calculate the syndrome
    // value for the aligned 32-byte block containing the first bytes
    // and mask the irrelevant part.
    //
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    sub     tmp, soff, #32
    adds    cntin, cntin, tmp
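    // cntin now counts the bytes that remain beyond this first aligned block;
    // the flags set by ADDS make the B.LS below take the .Lmasklast path when
    // the first block is also the last one.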
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
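    // The two pairwise adds fold the per-byte 0x01/0x04/0x10/0x40 match bits
    // of adjacent bytes together, leaving a 64-bit syndrome with two bits per
    // input byte, in string order, in the low doubleword of vend.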
    mov     synd, vend.d[0]
    // Clear the soff*2 lower bits
    lsl     tmp, soff, #1
    lsr     synd, synd, tmp
    lsl     synd, synd, tmp
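    // The bytes before srcin live in the same aligned block but must not
    // report a match, so their 2-bit tuples are shifted out and back in as
    // zeros.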
    // The first block can also be the last
    b.ls    .Lmasklast
    // Have we found something already?
    cbnz    synd, .Ltail
.Lloop:
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    subs    cntin, cntin, #32
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we're out of data we finish regardless of the result
    b.ls    .Lend
    // Use a fast check for the termination condition
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp    vend.2d, vend.2d, vend.2d
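    // The OR and the 64-bit pairwise add collapse all 32 comparison results
    // into a single doubleword, so one scalar test is enough here; the exact
    // per-byte syndrome is only built once the loop exits to .Lend.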
    mov     synd, vend.d[0]
    // We're not out of data, loop if we haven't found the character
    cbz     synd, .Lloop
.Lend:
    // Termination condition found, let's calculate the syndrome value
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Only do the clear for the last possible block
    b.hi    .Ltail
.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
    add     tmp, cntrem, soff
    and     tmp, tmp, #31
    sub     tmp, tmp, #32
    neg     tmp, tmp, lsl #1
    lsl     synd, synd, tmp
    lsr     synd, synd, tmp
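    // Bytes past the end of the buffer fall in the same aligned block, so
    // their tuples are shifted out through the top and back in as zeros.
    // When (cntrem + soff) is a multiple of 32 the computed shift is 64;
    // variable shifts use the amount modulo 64, so the syndrome is left
    // untouched, which is what is wanted in that case.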
.Ltail:
    // Count the trailing zeros using bit reversing
    rbit    synd, synd
    // Compensate the last post-increment
    sub     src, src, #32
    // Check that we have found a character
    cmp     synd, #0
    // And count the leading zeros
    clz     synd, synd
    // Compute the potential result
    add     result, src, synd, lsr #1
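    // CLZ of the bit-reversed syndrome is its trailing-zero count; shifting
    // it right by one turns the 2-bit tuple index into a byte offset from the
    // start of the block.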
    // Select result or NULL
    csel    result, xzr, result, eq
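    // EQ is still set from the CMP above when the syndrome was zero, i.e. no
    // match was found, so NULL is returned in that case.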
    ret
.Lzero_length:
    mov     result, #0
    ret