blob: c26fe30d502347e95ce8ea8536819f7b08ce67c9 [file] [log] [blame]
#ifndef X86_BITS_STRING_H
#define X86_BITS_STRING_H
/*
 * Copyright (C) 2007 Michael Brown <>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */
/** @file
 *
 * Optimised string operations
 *
 */
/* Out-of-line copy routines (declared here, defined elsewhere).
 *
 * __memcpy() is the fallback used by the inline memcpy() helpers in
 * this header when the length is not a compile-time constant, or is
 * a constant of 26 bytes or more.
 *
 * __memcpy_reverse() is used by memmove() when the destination is
 * known at compile time to lie above the source (presumably it
 * copies starting from the end of the buffers; confirm against its
 * definition).
 */
extern void * __memcpy ( void *dest, const void *src, size_t len );
extern void * __memcpy_reverse ( void *dest, const void *src, size_t len );
/**
 * Copy memory area (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 *
 * Lengths of 1-6 and 8 bytes are copied with plain loads and stores.
 * Other lengths below 26 bytes use an unrolled sequence of "movs"
 * instructions, and anything from 26 bytes upwards is handed to the
 * out-of-line __memcpy().  Every test on @c len is expected to be
 * resolved at compile time, since this function is always inlined
 * and is only called by memcpy() when @c len is a constant.
 *
 * The byte counts in the per-case comments are the original author's
 * annotations (presumably generated code size estimates).
 */
static inline __attribute__ (( always_inline )) void *
__constant_memcpy ( void *dest, const void *src, size_t len ) {
	/* Aliasing-safe views of the first (up to) 8 bytes of each buffer */
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	const union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *src_u = src;
	const void *esi;
	void *edi;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;
	/*
	 * Single-register moves; these are always better than a
	 * string operation.  We can clobber an arbitrary two
	 * registers (data, source, dest can re-use source register)
	 * instead of being restricted to esi and edi.  There's also a
	 * much greater potential for optimising with nearby code.
	 *
	 */
	case 1 : /* 4 bytes */
		dest_u->u8[0]  = src_u->u8[0];
		return dest;
	case 2 : /* 6 bytes */
		dest_u->u16[0] = src_u->u16[0];
		return dest;
	case 4 : /* 4 bytes */
		dest_u->u32[0] = src_u->u32[0];
		return dest;
	/*
	 * Double-register moves; these are probably still a win.
	 *
	 */
	case 3 : /* 12 bytes */
		dest_u->u16[0] = src_u->u16[0];
		dest_u->u8[2]  = src_u->u8[2];
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u8[4]  = src_u->u8[4];
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u16[2] = src_u->u16[2];
		return dest;
	case 8 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u32[1] = src_u->u32[1];
		return dest;
	default :
		/* All other lengths use the string operations below */
		break;
	}

	/* Even if we have to load up esi and edi ready for a string
	 * operation, we can sometimes save space by using multiple
	 * single-byte "movs" operations instead of loading up ecx and
	 * using "rep movsb".
	 *
	 * "load ecx, rep movsb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "movsl" and "movsb" are 1 byte each, "movsw" is two bytes.
	 * (In 16-bit mode, "movsl" is 2 bytes and "movsw" is 1 byte,
	 * but "movsl" moves twice as much data, so it balances out).
	 *
	 * The cutoff point therefore occurs around 26 bytes; the byte
	 * requirements for each method are:
	 *
	 * len              16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
	 * #bytes (ecx)      8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
	 * #bytes (no ecx)   4  5  6  7  5  6  7  8  6  7  8  9  7  8  9 10
	 */

	esi = src;
	edi = dest;

	if ( len >= 26 )
		return __memcpy ( dest, src, len );

	/* Up to six dword moves (24 bytes).  Each "movsl" advances
	 * both esi and edi, which is why they are tied as in/out
	 * operands.
	 */
	if ( len >= 6*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );

	/* Remaining 0-3 bytes: one optional word, one optional byte */
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "movsw" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "movsb" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );

	return dest;
}
/**
 * Copy memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 *
 * Dispatches to the inline __constant_memcpy() when the compiler can
 * prove that @c len is a constant, and to the out-of-line __memcpy()
 * otherwise.
 */
static inline __attribute__ (( always_inline )) void *
memcpy ( void *dest, const void *src, size_t len ) {
	if ( __builtin_constant_p ( len ) ) {
		return __constant_memcpy ( dest, src, len );
	} else {
		return __memcpy ( dest, src, len );
	}
}
/* Out-of-line move routine (declared here, defined elsewhere); used
 * by memmove() when the relative position of the two buffers is not
 * a compile-time constant.
 */
extern void * __memmove ( void *dest, const void *src, size_t len );
/**
 * Copy (possibly overlapping) memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 *
 * If the distance between the buffers is a compile-time constant,
 * the copy direction is chosen at compile time: memcpy() when the
 * destination is at or below the source, __memcpy_reverse() when it
 * is above.  Otherwise the decision is left to the out-of-line
 * __memmove().
 */
static inline __attribute__ (( always_inline )) void *
memmove ( void *dest, const void *src, size_t len ) {
	/* Arithmetic on void pointers is a GNU C extension */
	ssize_t offset = ( dest - src );

	if ( __builtin_constant_p ( offset ) ) {
		if ( offset <= 0 ) {
			return memcpy ( dest, src, len );
		} else {
			return __memcpy_reverse ( dest, src, len );
		}
	} else {
		return __memmove ( dest, src, len );
	}
}
/**
 * Fill memory region
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length
 * @ret dest		Destination address
 *
 * Implemented as a single "rep stosb".  Only the low byte of @c fill
 * is stored, since "stosb" writes the AL register.
 */
static inline __attribute__ (( always_inline )) void *
__memset ( void *dest, int fill, size_t len ) {
	/* "rep stosb" modifies both the destination and count
	 * registers, so they are declared as outputs whose values
	 * are discarded.
	 */
	void *discard_D;
	size_t discard_c;

	__asm__ __volatile__ ( "rep stosb"
			       : "=&D" ( discard_D ), "=&c" ( discard_c )
			       : "0" ( dest ), "1" ( len ), "a" ( fill )
			       : "memory" );
	return dest;
}
/**
 * Fill memory region with zero (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v len		Length
 * @ret dest		Destination address
 *
 * Lengths of 1-6 and 8 bytes are cleared with plain stores.  Other
 * lengths below 26 bytes use an unrolled sequence of "stos"
 * instructions, and anything from 26 bytes upwards is handed to
 * __memset().
 *
 * The byte counts in the per-case comments are the original author's
 * annotations (presumably generated code size estimates).
 */
static inline __attribute__ (( always_inline )) void *
__constant_memset_zero ( void *dest, size_t len ) {
	/* Aliasing-safe view of the first (up to) 8 bytes of the buffer */
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	void *edi;
	uint32_t eax;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;

	/* Single-register moves.  Almost certainly better than a
	 * string operation.  We can avoid clobbering any registers,
	 * we can reuse a zero that happens to already be in a
	 * register, and we can optimise away the code entirely if the
	 * memset() is used to clear a region which then gets
	 * immediately overwritten.
	 */
	case 1 : /* 3 bytes */
		dest_u->u8[0] = 0;
		return dest;
	case 2: /* 5 bytes */
		dest_u->u16[0] = 0;
		return dest;
	case 4: /* 6 bytes */
		dest_u->u32[0] = 0;
		return dest;

	/* Double-register moves.  Very probably better than a string
	 * operation.
	 */
	case 3 : /* 9 bytes */
		dest_u->u16[0] = 0;
		dest_u->u8[2]  = 0;
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = 0;
		dest_u->u8[4]  = 0;
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = 0;
		dest_u->u16[2] = 0;
		return dest;
	case 8 : /* 13 bytes */
		dest_u->u32[0] = 0;
		dest_u->u32[1] = 0;
		return dest;
	default :
		/* All other lengths use the string operations below */
		break;
	}

	/* As with memcpy(), we can potentially save space by using
	 * multiple single-byte "stos" instructions instead of loading
	 * up ecx and using "rep stosb".
	 *
	 * "load ecx, rep stosb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "stosl" and "stosb" are 1 byte each, "stosw" is two bytes.
	 *
	 * The calculations are therefore the same as for memcpy(),
	 * giving a cutoff point of around 26 bytes.
	 */

	edi = dest;
	eax = 0;

	if ( len >= 26 )
		return __memset ( dest, 0, len );

	/* Up to six dword stores (24 bytes).  Each "stosl" advances
	 * edi, which is why it is tied as an in/out operand.
	 */
	if ( len >= 6*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );

	/* Remaining 0-3 bytes: one optional word, one optional byte */
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "stosw" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "stosb" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );

	return dest;
}
/**
 * Fill memory region
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length
 * @ret dest		Destination address
 *
 * Uses the inline __constant_memset_zero() when the compiler can
 * prove both that @c fill is zero and that @c len is a constant;
 * every other combination goes through __memset().
 */
static inline __attribute__ (( always_inline )) void *
memset ( void *dest, int fill, size_t len ) {
	if ( __builtin_constant_p ( fill ) && ( fill == 0 ) &&
	     __builtin_constant_p ( len ) ) {
		return __constant_memset_zero ( dest, len );
	} else {
		return __memset ( dest, fill, len );
	}
}
#endif /* X86_BITS_STRING_H */