[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PULL 08/30] Update AVX512 support for xbzrle_encode_buffer
From: |
Juan Quintela |
Subject: |
[PULL 08/30] Update AVX512 support for xbzrle_encode_buffer |
Date: |
Tue, 15 Nov 2022 16:34:52 +0100 |
From: ling xu <ling1.xu@intel.com>
This commit updates code of avx512 support for xbzrle_encode_buffer
function to accelerate xbzrle encoding speed. Runtime check of avx512
support and benchmark for this feature are added. Compared with C
version of xbzrle_encode_buffer function, avx512 version can achieve
50%-70% performance improvement on benchmarking. In addition, if dirty
data is randomly located in 4K page, the avx512 version can achieve
almost 140% performance gain.
Signed-off-by: ling xu <ling1.xu@intel.com>
Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
Co-authored-by: Jun Jin <jun.i.jin@intel.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
---
meson.build | 16 +++++
migration/xbzrle.h | 4 ++
migration/ram.c | 34 +++++++++-
migration/xbzrle.c | 124 ++++++++++++++++++++++++++++++++++
meson_options.txt | 2 +
scripts/meson-buildoptions.sh | 14 ++--
6 files changed, 186 insertions(+), 8 deletions(-)
diff --git a/meson.build b/meson.build
index cf3e517e56..d0d28f5c9e 100644
--- a/meson.build
+++ b/meson.build
@@ -2344,6 +2344,22 @@ config_host_data.set('CONFIG_AVX512F_OPT',
get_option('avx512f') \
int main(int argc, char *argv[]) { return bar(argv[argc - 1]); }
'''), error_message: 'AVX512F not available').allowed())
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+ .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable
AVX512BW') \
+ .require(cc.links('''
+ #pragma GCC push_options
+ #pragma GCC target("avx512bw")
+ #include <cpuid.h>
+ #include <immintrin.h>
+ static int bar(void *a) {
+
+ __m512i *x = a;
+ __m512i res= _mm512_abs_epi8(*x);
+ return res[1];
+ }
+ int main(int argc, char *argv[]) { return bar(argv[0]); }
+ '''), error_message: 'AVX512BW not available').allowed())
+
have_pvrdma = get_option('pvrdma') \
.require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics
libraries') \
.require(cc.compiles(gnu_source_prefix + '''
diff --git a/migration/xbzrle.h b/migration/xbzrle.h
index a0db507b9c..6feb49160a 100644
--- a/migration/xbzrle.h
+++ b/migration/xbzrle.h
@@ -18,4 +18,8 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf,
int slen,
uint8_t *dst, int dlen);
int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
+#if defined(CONFIG_AVX512BW_OPT)
+int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+ uint8_t *dst, int dlen);
+#endif
#endif
diff --git a/migration/ram.c b/migration/ram.c
index 67e41dd2c0..bb4f08bfed 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -83,6 +83,34 @@
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
+int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
+ uint8_t *, int) = xbzrle_encode_buffer;
+#if defined(CONFIG_AVX512BW_OPT)
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+ unsigned max = __get_cpuid_max(0, NULL);
+ int a, b, c, d;
+ if (max >= 1) {
+ __cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable. */
+ if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+ int bv;
+ __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+ __cpuid_count(7, 0, a, b, c, d);
+ /* 0xe6:
+ * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+ * and ZMM16-ZMM31 state are enabled by OS)
+ * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+ */
+ if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+ xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
+ }
+ }
+ }
+}
+#endif
+
XBZRLECacheStats xbzrle_counters;
/* struct contains XBZRLE cache and a static page
@@ -802,9 +830,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t
**current_data,
memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
/* XBZRLE encoding (if there is no overflow) */
- encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
- TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
- TARGET_PAGE_SIZE);
+ encoded_len = xbzrle_encode_buffer_func(prev_cached_page,
XBZRLE.current_buf,
+ TARGET_PAGE_SIZE,
XBZRLE.encoded_buf,
+ TARGET_PAGE_SIZE);
/*
* Update the cache contents, so that it corresponds to the data
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
index 1ba482ded9..05366e86c0 100644
--- a/migration/xbzrle.c
+++ b/migration/xbzrle.c
@@ -174,3 +174,127 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t
*dst, int dlen)
return d;
}
+
+#if defined(CONFIG_AVX512BW_OPT)
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include <immintrin.h>
+int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+ uint8_t *dst, int dlen)
+{
+ uint32_t zrun_len = 0, nzrun_len = 0;
+ int d = 0, i = 0, num = 0;
+ uint8_t *nzrun_start = NULL;
+ /* add 1 to include residual part in main loop */
+ uint32_t count512s = (slen >> 6) + 1;
+ /* countResidual is tail of data, i.e., countResidual = slen % 64 */
+ uint32_t count_residual = slen & 0b111111;
+ bool never_same = true;
+ uint64_t mask_residual = 1;
+ mask_residual <<= count_residual;
+ mask_residual -= 1;
+ __m512i r = _mm512_set1_epi32(0);
+
+ while (count512s) {
+ if (d + 2 > dlen) {
+ return -1;
+ }
+
+ int bytes_to_check = 64;
+ uint64_t mask = 0xffffffffffffffff;
+ if (count512s == 1) {
+ bytes_to_check = count_residual;
+ mask = mask_residual;
+ }
+ __m512i old_data = _mm512_mask_loadu_epi8(r,
+ mask, old_buf + i);
+ __m512i new_data = _mm512_mask_loadu_epi8(r,
+ mask, new_buf + i);
+ uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
+ count512s--;
+
+ bool is_same = (comp & 0x1);
+ while (bytes_to_check) {
+ if (is_same) {
+ if (nzrun_len) {
+ d += uleb128_encode_small(dst + d, nzrun_len);
+ if (d + nzrun_len > dlen) {
+ return -1;
+ }
+ nzrun_start = new_buf + i - nzrun_len;
+ memcpy(dst + d, nzrun_start, nzrun_len);
+ d += nzrun_len;
+ nzrun_len = 0;
+ }
+ /* 64 data at a time for speed */
+ if (count512s && (comp == 0xffffffffffffffff)) {
+ i += 64;
+ zrun_len += 64;
+ break;
+ }
+ never_same = false;
+ num = __builtin_ctzll(~comp);
+ num = (num < bytes_to_check) ? num : bytes_to_check;
+ zrun_len += num;
+ bytes_to_check -= num;
+ comp >>= num;
+ i += num;
+ if (bytes_to_check) {
+ /* still has different data after same data */
+ d += uleb128_encode_small(dst + d, zrun_len);
+ zrun_len = 0;
+ } else {
+ break;
+ }
+ }
+ if (never_same || zrun_len) {
+ /*
+ * never_same only acts if
+ * data begins with diff in first count512s
+ */
+ d += uleb128_encode_small(dst + d, zrun_len);
+ zrun_len = 0;
+ never_same = false;
+ }
+ /* has diff, 64 data at a time for speed */
+ if ((bytes_to_check == 64) && (comp == 0x0)) {
+ i += 64;
+ nzrun_len += 64;
+ break;
+ }
+ num = __builtin_ctzll(comp);
+ num = (num < bytes_to_check) ? num : bytes_to_check;
+ nzrun_len += num;
+ bytes_to_check -= num;
+ comp >>= num;
+ i += num;
+ if (bytes_to_check) {
+ /* mask like 111000 */
+ d += uleb128_encode_small(dst + d, nzrun_len);
+ /* overflow */
+ if (d + nzrun_len > dlen) {
+ return -1;
+ }
+ nzrun_start = new_buf + i - nzrun_len;
+ memcpy(dst + d, nzrun_start, nzrun_len);
+ d += nzrun_len;
+ nzrun_len = 0;
+ is_same = true;
+ }
+ }
+ }
+
+ if (nzrun_len != 0) {
+ d += uleb128_encode_small(dst + d, nzrun_len);
+ /* overflow */
+ if (d + nzrun_len > dlen) {
+ return -1;
+ }
+ nzrun_start = new_buf + i - nzrun_len;
+ memcpy(dst + d, nzrun_start, nzrun_len);
+ d += nzrun_len;
+ }
+ return d;
+}
+#pragma GCC pop_options
+#endif
diff --git a/meson_options.txt b/meson_options.txt
index 66128178bf..96814dd211 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
description: 'AVX2 optimizations')
option('avx512f', type: 'feature', value: 'disabled',
description: 'AVX512F optimizations')
+option('avx512bw', type: 'feature', value: 'auto',
+ description: 'AVX512BW optimizations')
option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 2cb0de5601..bcb5d854a5 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -40,7 +40,8 @@ meson_options_help() {
printf "%s\n" ' --enable-trace-backends=CHOICES'
printf "%s\n" ' Set available tracing backends
[log] (choices:'
printf "%s\n" '
dtrace/ftrace/log/nop/simple/syslog/ust)'
- printf "%s\n" ' --firmwarepath=VALUES search PATH for firmware files
[share/qemu-firmware]'
+ printf "%s\n" ' --firmwarepath=VALUES search PATH for firmware files
[share/qemu-'
+ printf "%s\n" ' firmware]'
printf "%s\n" ' --iasl=VALUE Path to ACPI disassembler'
printf "%s\n" ' --includedir=VALUE Header file directory [include]'
printf "%s\n" ' --interp-prefix=VALUE where to find shared libraries
etc., use %M for'
@@ -66,6 +67,7 @@ meson_options_help() {
printf "%s\n" ' attr attr/xattr support'
printf "%s\n" ' auth-pam PAM access control'
printf "%s\n" ' avx2 AVX2 optimizations'
+ printf "%s\n" ' avx512bw AVX512BW optimizations'
printf "%s\n" ' avx512f AVX512F optimizations'
printf "%s\n" ' blkio libblkio block device driver'
printf "%s\n" ' bochs bochs image format support'
@@ -155,6 +157,8 @@ meson_options_help() {
printf "%s\n" ' usb-redir libusbredir support'
printf "%s\n" ' vde vde network backend support'
printf "%s\n" ' vdi vdi image format support'
+ printf "%s\n" ' vduse-blk-export'
+ printf "%s\n" ' VDUSE block export support'
printf "%s\n" ' vfio-user-server'
printf "%s\n" ' vfio-user server support'
printf "%s\n" ' vhost-crypto vhost-user crypto backend support'
@@ -163,8 +167,6 @@ meson_options_help() {
printf "%s\n" ' vhost-user vhost-user backend support'
printf "%s\n" ' vhost-user-blk-server'
printf "%s\n" ' build vhost-user-blk server'
- printf "%s\n" ' vduse-blk-export'
- printf "%s\n" ' VDUSE block export support'
printf "%s\n" ' vhost-vdpa vhost-vdpa kernel backend support'
printf "%s\n" ' virglrenderer virgl rendering support'
printf "%s\n" ' virtfs virtio-9p support'
@@ -193,6 +195,8 @@ _meson_option_parse() {
--disable-auth-pam) printf "%s" -Dauth_pam=disabled ;;
--enable-avx2) printf "%s" -Davx2=enabled ;;
--disable-avx2) printf "%s" -Davx2=disabled ;;
+ --enable-avx512bw) printf "%s" -Davx512bw=enabled ;;
+ --disable-avx512bw) printf "%s" -Davx512bw=disabled ;;
--enable-avx512f) printf "%s" -Davx512f=enabled ;;
--disable-avx512f) printf "%s" -Davx512f=disabled ;;
--enable-gcov) printf "%s" -Db_coverage=true ;;
@@ -426,6 +430,8 @@ _meson_option_parse() {
--disable-vde) printf "%s" -Dvde=disabled ;;
--enable-vdi) printf "%s" -Dvdi=enabled ;;
--disable-vdi) printf "%s" -Dvdi=disabled ;;
+ --enable-vduse-blk-export) printf "%s" -Dvduse_blk_export=enabled ;;
+ --disable-vduse-blk-export) printf "%s" -Dvduse_blk_export=disabled ;;
--enable-vfio-user-server) printf "%s" -Dvfio_user_server=enabled ;;
--disable-vfio-user-server) printf "%s" -Dvfio_user_server=disabled ;;
--enable-vhost-crypto) printf "%s" -Dvhost_crypto=enabled ;;
@@ -438,8 +444,6 @@ _meson_option_parse() {
--disable-vhost-user) printf "%s" -Dvhost_user=disabled ;;
--enable-vhost-user-blk-server) printf "%s"
-Dvhost_user_blk_server=enabled ;;
--disable-vhost-user-blk-server) printf "%s"
-Dvhost_user_blk_server=disabled ;;
- --enable-vduse-blk-export) printf "%s" -Dvduse_blk_export=enabled ;;
- --disable-vduse-blk-export) printf "%s" -Dvduse_blk_export=disabled ;;
--enable-vhost-vdpa) printf "%s" -Dvhost_vdpa=enabled ;;
--disable-vhost-vdpa) printf "%s" -Dvhost_vdpa=disabled ;;
--enable-virglrenderer) printf "%s" -Dvirglrenderer=enabled ;;
--
2.38.1
- [PULL 00/30] Next patches, Juan Quintela, 2022/11/15
- [PULL 05/30] multifd: Create page_count fields into both MultiFD{Recv, Send}Params, Juan Quintela, 2022/11/15
- [PULL 01/30] migration/channel-block: fix return value for qio_channel_block_{readv, writev}, Juan Quintela, 2022/11/15
- [PULL 03/30] migration: check magic value for deciding the mapping of channels, Juan Quintela, 2022/11/15
- [PULL 04/30] multifd: Create page_size fields into both MultiFD{Recv, Send}Params, Juan Quintela, 2022/11/15
- [PULL 02/30] migration/multifd/zero-copy: Create helper function for flushing, Juan Quintela, 2022/11/15
- [PULL 10/30] migration: Fix possible infinite loop of ram save process, Juan Quintela, 2022/11/15
- [PULL 07/30] migration: Export ram_release_page(), Juan Quintela, 2022/11/15
- [PULL 08/30] Update AVX512 support for xbzrle_encode_buffer,
Juan Quintela <=
- [PULL 09/30] Unit test code and benchmark code, Juan Quintela, 2022/11/15
- [PULL 06/30] migration: Export ram_transferred_ram(), Juan Quintela, 2022/11/15
- [PULL 12/30] migration: Disallow postcopy preempt to be used with compress, Juan Quintela, 2022/11/15
- [PULL 13/30] migration: Use non-atomic ops for clear log bitmap, Juan Quintela, 2022/11/15
- [PULL 15/30] migration: Take bitmap mutex when completing ram migration, Juan Quintela, 2022/11/15
- [PULL 16/30] migration: Add postcopy_preempt_active(), Juan Quintela, 2022/11/15
- [PULL 11/30] migration: Fix race on qemu_file_shutdown(), Juan Quintela, 2022/11/15
- [PULL 14/30] migration: Disable multifd explicitly with compression, Juan Quintela, 2022/11/15
- [PULL 17/30] migration: Cleanup xbzrle zero page cache update logic, Juan Quintela, 2022/11/15
- [PULL 21/30] migration: Use atomic ops properly for page accountings, Juan Quintela, 2022/11/15