qemu-block
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[RFC 2/5] block: add transfer of protection information


From: Dmitry Tihov
Subject: [RFC 2/5] block: add transfer of protection information
Date: Thu, 24 Nov 2022 18:58:18 +0300

On Linux hosts, T10 protection information can be passed directly
from userspace to integrity-capable block devices using the io_uring API.
Discover integrity-capable block devices and support submitting I/O with
an integrity payload to such devices when one is present in the request.

Signed-off-by: Dmitry Tihov <d.tihov@yadro.com>
---
 block/file-posix.c           | 130 +++++++++++++++++++++++++++++++++--
 block/io_uring.c             | 109 +++++++++++++++++++++++++++--
 include/block/block-common.h |   2 +
 include/block/raw-aio.h      |   3 +-
 include/qemu/iov.h           |   6 ++
 util/iov.c                   |  24 +++++++
 6 files changed, 262 insertions(+), 12 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index b9647c5ffc..1eec7dd3cb 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -152,6 +152,10 @@ typedef struct BDRVRawState {
     int perm_change_flags;
     BDRVReopenState *reopen_state;
 
+    /* DIF T10 Protection Information */
+    uint8_t t10_type;
+    uint64_t protection_interval_bytes;
+
     bool has_discard:1;
     bool has_write_zeroes:1;
     bool use_linux_aio:1;
@@ -2094,8 +2098,9 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, 
uint64_t offset,
 #ifdef CONFIG_LINUX_IO_URING
     } else if (s->use_linux_io_uring) {
         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
+        bool is_pi = (s->t10_type && qiov->dif.iov_len);
         assert(qiov->size == bytes);
-        return luring_co_submit(bs, aio, s->fd, offset, qiov, type);
+        return luring_co_submit(bs, aio, s->fd, offset, qiov, type, is_pi);
 #endif
 #ifdef CONFIG_LINUX_AIO
     } else if (s->use_linux_aio) {
@@ -2190,7 +2195,7 @@ static int coroutine_fn 
raw_co_flush_to_disk(BlockDriverState *bs)
 #ifdef CONFIG_LINUX_IO_URING
     if (s->use_linux_io_uring) {
         LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
-        return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH);
+        return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH, 
false);
     }
 #endif
     return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
@@ -3516,6 +3521,110 @@ static bool hdev_is_sg(BlockDriverState *bs)
     return false;
 }
 
+#if defined(CONFIG_LINUX_IO_URING)
+
+/*
+ * Read one sysfs attribute of block device @dev_name into @buf.
+ *
+ * At most @size - 1 bytes are read from @path so that the result is always
+ * NUL-terminated; a single trailing newline is stripped.  @what names the
+ * attribute in error messages.
+ *
+ * Returns 0 on success or a negative errno value with @errp set.
+ */
+static int read_pi_sysfs_attr(const char *path, const char *dev_name,
+                              const char *what, char *buf, size_t size,
+                              Error **errp)
+{
+    ssize_t len;
+    int ret;
+    int fd = open(path, O_RDONLY);
+
+    if (fd == -1) {
+        /* Capture errno first so later calls cannot change the result */
+        ret = -errno;
+        error_setg_errno(errp, -ret, "Can not open %s %s sysfs entry",
+                         dev_name, what);
+        return ret;
+    }
+
+    /* read() does not terminate the buffer, so keep one byte in reserve */
+    len = read(fd, buf, size - 1);
+    if (len < 0) {
+        ret = -errno;
+        error_setg_errno(errp, -ret, "Can not read %s %s sysfs entry",
+                         dev_name, what);
+        close(fd);
+        return ret;
+    }
+    close(fd);
+
+    buf[len] = '\0';
+    if (len > 0 && buf[len - 1] == '\n') {
+        buf[len - 1] = '\0';
+    }
+
+    return 0;
+}
+
+/*
+ * Probe sysfs for the T10 protection information settings of the host
+ * block device behind @bs and record them in its BDRVRawState.
+ *
+ * Nothing is probed, and 0 is returned, when @bs was not opened with
+ * BDRV_O_NOCACHE or when the device reports that it is not integrity
+ * capable.  An integrity format other than T10-DIF-TYPE1-CRC or
+ * T10-DIF-TYPE3-CRC is recorded as protection type 0.
+ *
+ * Returns 0 on success or a negative errno value with @errp set.  The
+ * state is only updated once every attribute has been read successfully.
+ */
+static int fill_pi_info(BlockDriverState *bs, Error **errp)
+{
+    BDRVRawState *s = bs->opaque;
+    g_autofree char *dev_name = NULL;
+    g_autofree char *sysfs_int_cap = NULL;
+    g_autofree char *sysfs_fmt = NULL;
+    g_autofree char *sysfs_bytes = NULL;
+    char buf[24];
+    uint8_t t10_type = 0;
+    int ret;
+
+    if (!(bs->open_flags & BDRV_O_NOCACHE)) {
+        return 0;
+    }
+
+    dev_name = g_path_get_basename(bs->filename);
+    sysfs_int_cap = g_strdup_printf("/sys/class/block/%s/integrity/"
+                                    "device_is_integrity_capable", dev_name);
+    sysfs_fmt = g_strdup_printf("/sys/class/block/%s/integrity/format",
+                                dev_name);
+    sysfs_bytes = g_strdup_printf("/sys/class/block/%s/integrity/"
+                                  "protection_interval_bytes", dev_name);
+
+    ret = read_pi_sysfs_attr(sysfs_int_cap, dev_name, "integrity capability",
+                             buf, sizeof(buf), errp);
+    if (ret < 0) {
+        return ret;
+    }
+    if (!g_ascii_strtoull(buf, NULL, 10)) {
+        /* Device is not integrity capable: leave the PI state untouched */
+        return 0;
+    }
+
+    ret = read_pi_sysfs_attr(sysfs_fmt, dev_name, "integrity format",
+                             buf, sizeof(buf), errp);
+    if (ret < 0) {
+        return ret;
+    }
+    if (strcmp(buf, "T10-DIF-TYPE1-CRC") == 0) {
+        t10_type = 1;
+    } else if (strcmp(buf, "T10-DIF-TYPE3-CRC") == 0) {
+        t10_type = 3;
+    }
+
+    ret = read_pi_sysfs_attr(sysfs_bytes, dev_name,
+                             "protection interval bytes",
+                             buf, sizeof(buf), errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* Commit both values together so a failure never leaves half of them */
+    s->t10_type = t10_type;
+    s->protection_interval_bytes = g_ascii_strtoull(buf, NULL, 10);
+
+    return 0;
+}
+
+#endif
+
 static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
 {
@@ -3601,6 +3710,11 @@ hdev_open_Mac_error:
     /* Since this does ioctl the device must be already opened */
     bs->sg = hdev_is_sg(bs);
 
+#if defined(CONFIG_LINUX_IO_URING)
+    if (s->use_linux_io_uring) {
+        ret = fill_pi_info(bs, errp);
+    }
+#endif
     return ret;
 }
 
@@ -3668,6 +3782,14 @@ static coroutine_fn int 
hdev_co_pwrite_zeroes(BlockDriverState *bs,
     return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
 }
 
+/*
+ * Report the T10 protection type and protection interval recorded in the
+ * raw state of @bs through @bdi.  Always returns 0.
+ */
+static int hdev_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    BDRVRawState *raw = bs->opaque;
+
+    bdi->protection_type = raw->t10_type;
+    bdi->protection_interval = raw->protection_interval_bytes;
+
+    return 0;
+}
+
 static BlockDriver bdrv_host_device = {
     .format_name        = "host_device",
     .protocol_name        = "host_device",
@@ -3698,8 +3820,8 @@ static BlockDriver bdrv_host_device = {
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
     .bdrv_co_truncate       = raw_co_truncate,
-    .bdrv_getlength    = raw_getlength,
-    .bdrv_get_info = raw_get_info,
+    .bdrv_getlength         = raw_getlength,
+    .bdrv_get_info          = hdev_get_info,
     .bdrv_get_allocated_file_size
                         = raw_get_allocated_file_size,
     .bdrv_get_specific_stats = hdev_get_specific_stats,
diff --git a/block/io_uring.c b/block/io_uring.c
index 973e15d876..ba9fec1145 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -21,6 +21,84 @@
 /* io_uring ring size */
 #define MAX_ENTRIES 128
 
+/*
+ * io_uring opcodes used for vectored reads/writes that carry protection
+ * information.
+ * NOTE(review): these values are hard-coded here rather than taken from a
+ * kernel or liburing header; they presumably match an out-of-tree kernel
+ * patch and may collide with opcodes assigned in mainline
+ * <linux/io_uring.h> -- confirm before merging.
+ */
+#define IORING_OP_READV_PI  (48)
+#define IORING_OP_WRITEV_PI (49)
+
+#pragma pack(push, 1)
+
+/*
+ * Local view of a submission queue entry that names the pi_addr and pi_len
+ * fields.  The __io_uring_prep_*_pi() helpers in this file cast a
+ * struct io_uring_sqe pointer to this type in order to fill those fields.
+ * NOTE(review): the layout presumably has to match the kernel's sqe
+ * definition field for field, including the total size -- verify against
+ * the kernel headers this is meant to run on.
+ */
+struct __io_uring_sqe {
+    __u8    opcode;     /* type of operation for this sqe */
+    __u8    flags;      /* IOSQE_ flags */
+    __u16   ioprio;     /* ioprio for the request */
+    __s32   fd;     /* file descriptor to do IO on */
+    union {
+        __u64   off;    /* offset into file */
+        __u64   addr2;
+    };
+    union {
+        __u64   addr;   /* pointer to buffer or iovecs */
+        __u64   splice_off_in;
+    };
+    __u32   len;        /* buffer size or number of iovecs */
+    union {
+        __kernel_rwf_t  rw_flags;
+        __u32       fsync_flags;
+        __u16       poll_events;    /* compatibility */
+        __u32       poll32_events;  /* word-reversed for BE */
+        __u32       sync_range_flags;
+        __u32       msg_flags;
+        __u32       timeout_flags;
+        __u32       accept_flags;
+        __u32       cancel_flags;
+        __u32       open_flags;
+        __u32       statx_flags;
+        __u32       fadvise_advice;
+        __u32       splice_flags;
+        __u32       rename_flags;
+        __u32       unlink_flags;
+        __u32       hardlink_flags;
+    };
+    __u64   user_data;  /* data to be passed back at completion time */
+    /* pack this to avoid bogus arm OABI complaints */
+    union {
+        /* index into fixed buffers, if used */
+        __u16   buf_index;
+        /* for grouped buffer selection */
+        __u16   buf_group;
+    } __attribute__((packed));
+    /* personality to use, if used */
+    __u16   personality;
+    union {
+        __s32   splice_fd_in;
+        __u32   file_index;
+    };
+    /* address of the protection information iovec array */
+    __u64   pi_addr;
+    /* number of protection information iovecs at pi_addr */
+    __u32 pi_len;
+    __u32   __pad2[1];
+};
+
+#pragma pack(pop)
+
+/*
+ * Prepare @sqe for the vectored request @op with protection information
+ * attached (luring_do_submit() passes IORING_OP_WRITEV_PI).
+ *
+ * @iovecs/@nr_vecs describe the data, @pi_iovec/@nr_pi_vecs describe the
+ * protection information and @offset is the file offset.  The PI fields
+ * are stored through the struct __io_uring_sqe view of the entry.
+ */
+static inline void __io_uring_prep_writev_pi(uint8_t op,
+    struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs,
+    unsigned nr_vecs, const struct iovec *pi_iovec, unsigned nr_pi_vecs,
+    off_t offset)
+{
+    struct __io_uring_sqe *pi_sqe = (struct __io_uring_sqe *)sqe;
+
+    io_uring_prep_rw(op, sqe, fd, iovecs, nr_vecs, offset);
+    /* Convert via uintptr_t so the cast is also valid on 32-bit hosts */
+    pi_sqe->pi_addr = (__u64)(uintptr_t)pi_iovec;
+    pi_sqe->pi_len = nr_pi_vecs;
+}
+
+/*
+ * Prepare @sqe for the vectored request @op with protection information
+ * attached (luring_do_submit() passes IORING_OP_READV_PI).
+ *
+ * @iovecs/@nr_vecs describe the data, @pi_iovec/@nr_pi_vecs describe the
+ * protection information and @offset is the file offset.  The PI fields
+ * are stored through the struct __io_uring_sqe view of the entry.
+ */
+static inline void __io_uring_prep_readv_pi(uint8_t op,
+    struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs,
+    unsigned nr_vecs, const struct iovec *pi_iovec, unsigned nr_pi_vecs,
+    off_t offset)
+{
+    struct __io_uring_sqe *pi_sqe = (struct __io_uring_sqe *)sqe;
+
+    io_uring_prep_rw(op, sqe, fd, iovecs, nr_vecs, offset);
+    /* Convert via uintptr_t so the cast is also valid on 32-bit hosts */
+    pi_sqe->pi_addr = (__u64)(uintptr_t)pi_iovec;
+    pi_sqe->pi_len = nr_pi_vecs;
+}
+
 typedef struct LuringAIOCB {
     Coroutine *co;
     struct io_uring_sqe sqeq;
@@ -330,24 +408,39 @@ void luring_io_unplug(BlockDriverState *bs, LuringState 
*s)
  * @s: AIO state
  * @offset: offset for request
  * @type: type of request
+ * @is_pi: is protection information attached
  *
  * Fetches sqes from ring, adds to pending queue and preps them
  *
  */
 static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
-                            uint64_t offset, int type)
+                            uint64_t offset, int type, bool is_pi)
 {
     int ret;
     struct io_uring_sqe *sqes = &luringcb->sqeq;
 
     switch (type) {
     case QEMU_AIO_WRITE:
-        io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
-                             luringcb->qiov->niov, offset);
+        if (is_pi) {
+            __io_uring_prep_writev_pi(IORING_OP_WRITEV_PI, sqes, fd,
+                                      luringcb->qiov->iov,
+                                      luringcb->qiov->niov,
+                                      &luringcb->qiov->dif, 1, offset);
+        } else {
+            io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
+                                 luringcb->qiov->niov, offset);
+        }
         break;
     case QEMU_AIO_READ:
-        io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
-                            luringcb->qiov->niov, offset);
+        if (is_pi) {
+            __io_uring_prep_readv_pi(IORING_OP_READV_PI, sqes, fd,
+                                     luringcb->qiov->iov,
+                                     luringcb->qiov->niov,
+                                     &luringcb->qiov->dif, 1, offset);
+        } else {
+            io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
+                                luringcb->qiov->niov, offset);
+        }
         break;
     case QEMU_AIO_FLUSH:
         io_uring_prep_fsync(sqes, fd, IORING_FSYNC_DATASYNC);
@@ -374,7 +467,8 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, 
LuringState *s,
 }
 
 int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd,
-                                  uint64_t offset, QEMUIOVector *qiov, int 
type)
+                                  uint64_t offset, QEMUIOVector *qiov, int 
type,
+                                  bool is_pi)
 {
     int ret;
     LuringAIOCB luringcb = {
@@ -383,9 +477,10 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, 
LuringState *s, int fd,
         .qiov       = qiov,
         .is_read    = (type == QEMU_AIO_READ),
     };
+
     trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0,
                            type);
-    ret = luring_do_submit(fd, &luringcb, s, offset, type);
+    ret = luring_do_submit(fd, &luringcb, s, offset, type, is_pi);
 
     if (ret < 0) {
         return ret;
diff --git a/include/block/block-common.h b/include/block/block-common.h
index 297704c1e9..1f283dbef8 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -59,6 +59,8 @@ typedef struct BlockDriverInfo {
      * True if this block driver only supports compressed writes
      */
     bool needs_compressed_writes;
+    uint8_t protection_type;
+    uint32_t protection_interval;
 } BlockDriverInfo;
 
 typedef struct BlockFragInfo {
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index 21fc10c4c9..3f715b4bcc 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -65,7 +65,8 @@ typedef struct LuringState LuringState;
 LuringState *luring_init(Error **errp);
 void luring_cleanup(LuringState *s);
 int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd,
-                                uint64_t offset, QEMUIOVector *qiov, int type);
+                                uint64_t offset, QEMUIOVector *qiov, int type,
+                                bool is_pi);
 void luring_detach_aio_context(LuringState *s, AioContext *old_context);
 void luring_attach_aio_context(LuringState *s, AioContext *new_context);
 void luring_io_plug(BlockDriverState *bs, LuringState *s);
diff --git a/include/qemu/iov.h b/include/qemu/iov.h
index 9330746680..58ae2d1f51 100644
--- a/include/qemu/iov.h
+++ b/include/qemu/iov.h
@@ -181,6 +181,9 @@ typedef struct QEMUIOVector {
             size_t size;
         };
     };
+
+    /* T10 data integrity field */
+    struct iovec dif;
 } QEMUIOVector;
 
 QEMU_BUILD_BUG_ON(offsetof(QEMUIOVector, size) !=
@@ -229,6 +232,9 @@ int qemu_iovec_init_extended(
         void *tail_buf, size_t tail_len);
 void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
                            size_t offset, size_t len);
+void qemu_iovec_init_pi(QEMUIOVector *qiov, int alloc_hint,
+                        unsigned int lba_cnt);
+void qemu_iovec_destroy_pi(QEMUIOVector *qiov);
 int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len);
 void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len);
 void qemu_iovec_concat(QEMUIOVector *dst,
diff --git a/util/iov.c b/util/iov.c
index b4be580022..f0e51d5e66 100644
--- a/util/iov.c
+++ b/util/iov.c
@@ -20,6 +20,7 @@
 #include "qemu/iov.h"
 #include "qemu/sockets.h"
 #include "qemu/cutils.h"
+#include "qemu/memalign.h"
 
 size_t iov_from_buf_full(const struct iovec *iov, unsigned int iov_cnt,
                          size_t offset, const void *buf, size_t bytes)
@@ -278,6 +279,8 @@ void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint)
     qiov->niov = 0;
     qiov->nalloc = alloc_hint;
     qiov->size = 0;
+    qiov->dif.iov_base = NULL;
+    qiov->dif.iov_len = 0;
 }
 
 void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov)
@@ -292,6 +295,19 @@ void qemu_iovec_init_external(QEMUIOVector *qiov, struct 
iovec *iov, int niov)
         qiov->size += iov[i].iov_len;
 }
 
+/*
+ * Initialize @qiov like qemu_iovec_init() and attach a zero-filled
+ * protection information buffer holding 8 bytes for each of the @lba_cnt
+ * blocks.  The buffer is released by qemu_iovec_destroy_pi().
+ */
+void qemu_iovec_init_pi(QEMUIOVector *qiov, int alloc_hint,
+                        unsigned int lba_cnt)
+{
+    void *aligned_mem;
+
+    qemu_iovec_init(qiov, alloc_hint);
+
+    /*
+     * dif size is always 8 bytes.  Widen to size_t before multiplying so
+     * the length is not computed (and wrapped) in unsigned int arithmetic.
+     */
+    qiov->dif.iov_len = (size_t)lba_cnt * 8;
+
+    aligned_mem = qemu_memalign(qemu_real_host_page_size(),
+                                qiov->dif.iov_len);
+    qiov->dif.iov_base = memset(aligned_mem, 0, qiov->dif.iov_len);
+}
+
 void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len)
 {
     assert(qiov->nalloc != -1);
@@ -530,12 +546,20 @@ void qemu_iovec_destroy(QEMUIOVector *qiov)
     memset(qiov, 0, sizeof(*qiov));
 }
 
+/*
+ * Free the protection information buffer of @qiov and then destroy the
+ * vector itself with qemu_iovec_destroy().
+ */
+void qemu_iovec_destroy_pi(QEMUIOVector *qiov)
+{
+    /*
+     * qemu_iovec_init_pi() allocates the buffer with qemu_memalign(), so
+     * it must be released with qemu_vfree() rather than g_free().
+     */
+    qemu_vfree(qiov->dif.iov_base);
+
+    qemu_iovec_destroy(qiov);
+}
+
 void qemu_iovec_reset(QEMUIOVector *qiov)
 {
     assert(qiov->nalloc != -1);
 
     qiov->niov = 0;
     qiov->size = 0;
+    /* Only the PI length is cleared; dif.iov_base is left as it is */
     qiov->dif.iov_len = 0;
 }
 
 size_t qemu_iovec_to_buf(QEMUIOVector *qiov, size_t offset,
-- 
2.38.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]