[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH v5 1/2] Update AVX512 support for xbzrle_encode_buffer
From: |
Juan Quintela |
Subject: |
Re: [PATCH v5 1/2] Update AVX512 support for xbzrle_encode_buffer |
Date: |
Wed, 24 Aug 2022 10:42:05 +0200 |
User-agent: |
Gnus/5.13 (Gnus v5.13) Emacs/28.1 (gnu/linux) |
ling xu <ling1.xu@intel.com> wrote:
> This commit updates code of avx512 support for xbzrle_encode_buffer function
> to
> accelerate xbzrle encoding speed. We add runtime check of avx512 and add
> benchmark for this feature. Compared with C version of
> xbzrle_encode_buffer function, avx512 version can achieve 50%-70%
> performance improvement on benchmarking. In addition, if dirty data is
> randomly located in 4K page, the avx512 version can achieve almost 140%
> performance gain.
>
> Signed-off-by: ling xu <ling1.xu@intel.com>
> Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
> Co-authored-by: Jun Jin <jun.i.jin@intel.com>
> ---
> meson.build | 16 ++++++
> meson_options.txt | 2 +
> migration/ram.c | 35 ++++++++++--
> migration/xbzrle.c | 130 +++++++++++++++++++++++++++++++++++++++++++++
> migration/xbzrle.h | 4 ++
> 5 files changed, 184 insertions(+), 3 deletions(-)
>
> diff --git a/meson.build b/meson.build
> index 30a380752c..c9d90a5bff 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2264,6 +2264,22 @@ config_host_data.set('CONFIG_AVX512F_OPT',
> get_option('avx512f') \
> int main(int argc, char *argv[]) { return bar(argv[0]); }
> '''), error_message: 'AVX512F not available').allowed())
>
> +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
> + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot
> enable AVX512BW') \
> + .require(cc.links('''
> + #pragma GCC push_options
> + #pragma GCC target("avx512bw")
> + #include <cpuid.h>
> + #include <immintrin.h>
> + static int bar(void *a) {
> + __m512i x = *(__m512i *)a;
> + __m512i res= _mm512_abs_epi8(x);
Cast is as ugly as hell, what about:
__m512i *x = a;
__m512i res = _mm512_abs_epi8(*x);
??
> +static void __attribute__((constructor)) init_cpu_flag(void)
> +{
> + unsigned max = __get_cpuid_max(0, NULL);
> + int a, b, c, d;
> + if (max >= 1) {
> + __cpuid(1, a, b, c, d);
> + /* We must check that AVX is not just available, but usable. */
> + if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
> + int bv;
> + __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> + __cpuid_count(7, 0, a, b, c, d);
> + /* 0xe6:
> + * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
> + * and ZMM16-ZMM31 state are enabled by OS)
> + * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
> + */
> + if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
> + xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
> + }
> + }
> + }
> + return ;
This return line is not needed.
> +}
> +#endif
> +
> XBZRLECacheStats xbzrle_counters;
>
> /* struct contains XBZRLE cache and a static page
> @@ -802,9 +831,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t
> **current_data,
> memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
>
> /* XBZRLE encoding (if there is no overflow) */
> - encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
> - TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> - TARGET_PAGE_SIZE);
> + encoded_len = xbzrle_encode_buffer_func(prev_cached_page,
> XBZRLE.current_buf,
> + TARGET_PAGE_SIZE,
> XBZRLE.encoded_buf,
> + TARGET_PAGE_SIZE);
>
> /*
> * Update the cache contents, so that it corresponds to the data
> diff --git a/migration/xbzrle.c b/migration/xbzrle.c
> index 1ba482ded9..6da7f79625 100644
> --- a/migration/xbzrle.c
> +++ b/migration/xbzrle.c
> @@ -174,3 +174,133 @@ int xbzrle_decode_buffer(uint8_t *src, int slen,
> uint8_t *dst, int dlen)
>
> return d;
> }
> +
> +#if defined(CONFIG_AVX512BW_OPT)
> +#pragma GCC push_options
> +#pragma GCC target("avx512bw")
> +#include <immintrin.h>
> +int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> + uint8_t *dst, int dlen)
> +{
> + uint32_t zrun_len = 0, nzrun_len = 0;
> + int d = 0, i = 0, num = 0;
> + uint8_t *nzrun_start = NULL;
> + /* add 1 to include residual part in main loop */
> + uint32_t count512s = (slen >> 6) + 1;
> + /* countResidual is tail of data, i.e., countResidual = slen % 64 */
> + uint32_t countResidual = slen & 0b111111;
> + bool never_same = true;
> + uint64_t maskResidual = 1;
> + maskResidual <<= countResidual;
> + maskResidual -=1;
> + uint64_t comp = 0;
> + int bytesToCheck = 0;
> +
> + while (count512s) {
> + if (d + 2 > dlen) {
> + return -1;
> + }
> +
> + if(count512s != 1){
> + __m512i old_data = _mm512_mask_loadu_epi8(old_data,
> + 0xffffffffffffffff,
> old_buf + i);
> + __m512i new_data = _mm512_mask_loadu_epi8(new_data,
> + 0xffffffffffffffff,
> new_buf + i);
> + comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> + bytesToCheck = 64;
> + count512s--;
> + } else {
> + __m512i old_data = _mm512_mask_loadu_epi8(old_data,
> + maskResidual, old_buf
> + i);
> + __m512i new_data = _mm512_mask_loadu_epi8(new_data,
> + maskResidual, new_buf
> + i);
> + comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> + bytesToCheck = countResidual;
> + count512s--;
> + }
It is basically the same in both branches of the if, what about:
int bytesToCheck = 64;
uint64_t mask = 0xffffffffffffffff;
/* I am assuming this is the opposite of the if condition */
if(count512s == 1){
mask = maskResidual;
bytesToCheck = countResidual;
}
__m512i old_data = _mm512_mask_loadu_epi8(old_data, mask, old_buf + i);
__m512i new_data = _mm512_mask_loadu_epi8(new_data, mask, new_buf + i);
uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
count512s--;
BTW, while we are here, why not be consistent:
bool is_same;
uint64_t maskResidual;
just always use camel case or underscores, not half and half ....
Later, Juan.