/*
 * XOR-based zero run length encoding
 *
 * Copyright 2013 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *  Orit Wasserman  <owasserm@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/host-utils.h"
#include "xbzrle.h"

#if defined(CONFIG_AVX512BW_OPT)
#include <immintrin.h>
#include "host/cpuinfo.h"

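/*
 * AVX-512BW accelerated encoder: old_buf and new_buf are compared 64
 * bytes at a time using masked loads; each bit of "comp" is set where
 * the corresponding bytes are equal.  Returns the number of bytes
 * written to dst, or -1 if the encoded data would not fit in dlen.
 */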
static int __attribute__((target("avx512bw")))
xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
                            uint8_t *dst, int dlen)
{
    uint32_t zrun_len = 0, nzrun_len = 0;
    int d = 0, i = 0, num = 0;
    uint8_t *nzrun_start = NULL;
    /* add 1 to include the residual part in the main loop */
    uint32_t count512s = (slen >> 6) + 1;
    /* count_residual is the tail of the data, i.e. count_residual = slen % 64 */
    uint32_t count_residual = slen & 0b111111;
    bool never_same = true;
    uint64_t mask_residual = 1;
    mask_residual <<= count_residual;
    mask_residual -= 1;
    __m512i r = _mm512_set1_epi32(0);

    while (count512s) {
        int bytes_to_check = 64;
        uint64_t mask = 0xffffffffffffffff;
        if (count512s == 1) {
            bytes_to_check = count_residual;
            mask = mask_residual;
        }
        __m512i old_data = _mm512_mask_loadu_epi8(r,
                                                  mask, old_buf + i);
        __m512i new_data = _mm512_mask_loadu_epi8(r,
                                                  mask, new_buf + i);
        uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
        count512s--;

        bool is_same = (comp & 0x1);
        while (bytes_to_check) {
            if (d + 2 > dlen) {
                return -1;
            }
            if (is_same) {
                if (nzrun_len) {
                    d += uleb128_encode_small(dst + d, nzrun_len);
                    if (d + nzrun_len > dlen) {
                        return -1;
                    }
                    nzrun_start = new_buf + i - nzrun_len;
                    memcpy(dst + d, nzrun_start, nzrun_len);
                    d += nzrun_len;
                    nzrun_len = 0;
                }
                /* all 64 bytes are equal: take them in one step for speed */
                if (count512s && (comp == 0xffffffffffffffff)) {
                    i += 64;
                    zrun_len += 64;
                    break;
                }
                never_same = false;
                num = ctz64(~comp);
                num = (num < bytes_to_check) ? num : bytes_to_check;
                zrun_len += num;
                bytes_to_check -= num;
                comp >>= num;
                i += num;
                if (bytes_to_check) {
                    /* differing data still follows the run of equal bytes */
                    d += uleb128_encode_small(dst + d, zrun_len);
                    zrun_len = 0;
                } else {
                    break;
                }
            }
            if (never_same || zrun_len) {
                /*
                 * never_same only matters when the data begins with
                 * differing bytes: the format still requires a
                 * (possibly zero-length) zrun before the first nzrun.
                 */
                d += uleb128_encode_small(dst + d, zrun_len);
                zrun_len = 0;
                never_same = false;
            }
            /* all 64 bytes differ: take them in one step for speed */
            if ((bytes_to_check == 64) && (comp == 0x0)) {
                i += 64;
                nzrun_len += 64;
                break;
            }
            num = ctz64(comp);
            num = (num < bytes_to_check) ? num : bytes_to_check;
            nzrun_len += num;
            bytes_to_check -= num;
            comp >>= num;
            i += num;
            if (bytes_to_check) {
                /* equal bytes follow the differing ones: flush this nzrun */
                d += uleb128_encode_small(dst + d, nzrun_len);
                /* overflow */
                if (d + nzrun_len > dlen) {
                    return -1;
                }
                nzrun_start = new_buf + i - nzrun_len;
                memcpy(dst + d, nzrun_start, nzrun_len);
                d += nzrun_len;
                nzrun_len = 0;
                is_same = true;
            }
        }
    }

    if (nzrun_len != 0) {
        d += uleb128_encode_small(dst + d, nzrun_len);
        /* overflow */
        if (d + nzrun_len > dlen) {
            return -1;
        }
        nzrun_start = new_buf + i - nzrun_len;
        memcpy(dst + d, nzrun_start, nzrun_len);
        d += nzrun_len;
    }
    return d;
}

static int xbzrle_encode_buffer_int(uint8_t *old_buf, uint8_t *new_buf,
                                    int slen, uint8_t *dst, int dlen);

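/*
 * Selected once at load time by the constructor below: the AVX-512BW
 * encoder when the host CPU supports it, the portable C encoder
 * otherwise.
 */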
static int (*accel_func)(uint8_t *, uint8_t *, int, uint8_t *, int);

static void __attribute__((constructor)) init_accel(void)
{
    unsigned info = cpuinfo_init();
    if (info & CPUINFO_AVX512BW) {
        accel_func = xbzrle_encode_buffer_avx512;
    } else {
        accel_func = xbzrle_encode_buffer_int;
    }
}

int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                         uint8_t *dst, int dlen)
{
    return accel_func(old_buf, new_buf, slen, dst, dlen);
}

#define xbzrle_encode_buffer xbzrle_encode_buffer_int
#endif

/*
  page = zrun nzrun
         | zrun nzrun page

  zrun = length

  nzrun = length byte...

  length = uleb128 encoded integer
*/
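/*
 * For example, with a 16-byte page in which only bytes 4..6 changed,
 * the encoder emits
 *
 *     0x04                uleb128(4): zrun, 4 unchanged bytes
 *     0x03                uleb128(3): nzrun, 3 bytes follow
 *     new_buf[4..6]       the 3 new byte values
 *
 * i.e. 5 bytes in total; the trailing run of unchanged bytes is not
 * encoded, and a page that did not change at all encodes to 0 bytes.
 */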
int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                         uint8_t *dst, int dlen)
{
    uint32_t zrun_len = 0, nzrun_len = 0;
    int d = 0, i = 0;
    long res;
    uint8_t *nzrun_start = NULL;

    g_assert(!(((uintptr_t)old_buf | (uintptr_t)new_buf | slen) %
               sizeof(long)));

    while (i < slen) {
        /* overflow */
        if (d + 2 > dlen) {
            return -1;
        }

        /* not aligned to sizeof(long) */
        res = (slen - i) % sizeof(long);
        while (res && old_buf[i] == new_buf[i]) {
            zrun_len++;
            i++;
            res--;
        }

        /* word at a time for speed */
        if (!res) {
            while (i < slen &&
                   (*(long *)(old_buf + i)) == (*(long *)(new_buf + i))) {
                i += sizeof(long);
                zrun_len += sizeof(long);
            }

            /* go over the rest */
            while (i < slen && old_buf[i] == new_buf[i]) {
                zrun_len++;
                i++;
            }
        }

        /* buffer unchanged */
        if (zrun_len == slen) {
            return 0;
        }

        /* skip last zero run */
        if (i == slen) {
            return d;
        }

        d += uleb128_encode_small(dst + d, zrun_len);

        zrun_len = 0;
        nzrun_start = new_buf + i;

        /* overflow */
        if (d + 2 > dlen) {
            return -1;
        }
        /* not aligned to sizeof(long) */
        res = (slen - i) % sizeof(long);
        while (res && old_buf[i] != new_buf[i]) {
            i++;
            nzrun_len++;
            res--;
        }

        /* word at a time for speed, use of 32-bit long okay */
        if (!res) {
            /* truncation to 32-bit long okay */
            unsigned long mask = (unsigned long)0x0101010101010101ULL;
            while (i < slen) {
                unsigned long xor;
                xor = *(unsigned long *)(old_buf + i)
                    ^ *(unsigned long *)(new_buf + i);
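                /*
                 * Classic "word contains a zero byte" test: the
                 * expression below is non-zero iff some byte of xor is
                 * zero, i.e. old_buf and new_buf agree on at least one
                 * byte within this long, so the nzrun ends inside it.
                 */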
                if ((xor - mask) & ~xor & (mask << 7)) {
                    /* found the end of an nzrun within the current long */
                    while (old_buf[i] != new_buf[i]) {
                        nzrun_len++;
                        i++;
                    }
                    break;
                } else {
                    i += sizeof(long);
                    nzrun_len += sizeof(long);
                }
            }
        }

        d += uleb128_encode_small(dst + d, nzrun_len);
        /* overflow */
        if (d + nzrun_len > dlen) {
            return -1;
        }
        memcpy(dst + d, nzrun_start, nzrun_len);
        d += nzrun_len;
        nzrun_len = 0;
    }

    return d;
}

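/*
 * Decode a buffer produced by xbzrle_encode_buffer().  zruns only
 * advance the output position, so dst must already hold the previous
 * ("old") contents of the page; only nzrun bytes are written.  Returns
 * the offset in dst just past the last decoded nzrun, or -1 on
 * malformed or oversized input.
 */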
int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
{
    int i = 0, d = 0;
    int ret;
    uint32_t count = 0;

    while (i < slen) {

        /* zrun */
        if ((slen - i) < 2) {
            return -1;
        }

        ret = uleb128_decode_small(src + i, &count);
        if (ret < 0 || (i && !count)) {
            return -1;
        }
        i += ret;
        d += count;

        /* overflow */
        if (d > dlen) {
            return -1;
        }

        /* nzrun */
        if ((slen - i) < 2) {
            return -1;
        }

        ret = uleb128_decode_small(src + i, &count);
        if (ret < 0 || !count) {
            return -1;
        }
        i += ret;

        /* overflow */
        if (d + count > dlen || i + count > slen) {
            return -1;
        }

        memcpy(dst + d, src + i, count);
        d += count;
        i += count;
    }

    return d;
}