[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[RFC 2/5] block: add transfer of protection information
From: |
Dmitry Tihov |
Subject: |
[RFC 2/5] block: add transfer of protection information |
Date: |
Thu, 24 Nov 2022 18:58:18 +0300 |
On Linux hosts, T10 protection information can be passed directly
from userspace to integrity-capable block devices using the io_uring API.
Discover integrity-capable block devices and support submitting I/O with
an integrity payload to such devices if one is present in the request.
Signed-off-by: Dmitry Tihov <d.tihov@yadro.com>
---
block/file-posix.c | 130 +++++++++++++++++++++++++++++++++--
block/io_uring.c | 109 +++++++++++++++++++++++++++--
include/block/block-common.h | 2 +
include/block/raw-aio.h | 3 +-
include/qemu/iov.h | 6 ++
util/iov.c | 24 +++++++
6 files changed, 262 insertions(+), 12 deletions(-)
diff --git a/block/file-posix.c b/block/file-posix.c
index b9647c5ffc..1eec7dd3cb 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -152,6 +152,10 @@ typedef struct BDRVRawState {
int perm_change_flags;
BDRVReopenState *reopen_state;
+ /* DIF T10 Protection Information */
+ uint8_t t10_type;
+ uint64_t protection_interval_bytes;
+
bool has_discard:1;
bool has_write_zeroes:1;
bool use_linux_aio:1;
@@ -2094,8 +2098,9 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs,
uint64_t offset,
#ifdef CONFIG_LINUX_IO_URING
} else if (s->use_linux_io_uring) {
LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
+ bool is_pi = (s->t10_type && qiov->dif.iov_len);
assert(qiov->size == bytes);
- return luring_co_submit(bs, aio, s->fd, offset, qiov, type);
+ return luring_co_submit(bs, aio, s->fd, offset, qiov, type, is_pi);
#endif
#ifdef CONFIG_LINUX_AIO
} else if (s->use_linux_aio) {
@@ -2190,7 +2195,7 @@ static int coroutine_fn
raw_co_flush_to_disk(BlockDriverState *bs)
#ifdef CONFIG_LINUX_IO_URING
if (s->use_linux_io_uring) {
LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
- return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH);
+ return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH,
false);
}
#endif
return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
@@ -3516,6 +3521,110 @@ static bool hdev_is_sg(BlockDriverState *bs)
return false;
}
+#if defined(CONFIG_LINUX_IO_URING)
+
+/*
+ * Read the sysfs attribute at @path into @buf as a NUL-terminated string,
+ * dropping one trailing newline if present.  At most @buf_size - 1 bytes are
+ * read so that there is always room for the terminator.
+ *
+ * @dev_name and @what are only used to build the error message.
+ *
+ * Returns 0 on success; on failure sets @errp and returns -errno.
+ */
+static int pi_read_sysfs_attr(const char *path, const char *dev_name,
+                              const char *what, char *buf, size_t buf_size,
+                              Error **errp)
+{
+    ssize_t len;
+    int err;
+    int fd = open(path, O_RDONLY);
+
+    if (fd == -1) {
+        err = errno;
+        error_setg_errno(errp, err, "Can not open %s %s sysfs entry",
+                         dev_name, what);
+        return -err;
+    }
+
+    len = read(fd, buf, buf_size - 1);
+    err = errno;    /* close() below may overwrite errno */
+    close(fd);
+    if (len < 0) {
+        error_setg_errno(errp, err, "Can not read %s %s sysfs entry",
+                         dev_name, what);
+        return -err;
+    }
+
+    buf[len] = '\0';
+    if (len > 0 && buf[len - 1] == '\n') {
+        buf[len - 1] = '\0';
+    }
+    return 0;
+}
+
+/*
+ * Probe the sysfs "integrity" attributes of the block device behind @bs and
+ * record its T10 protection type and protection interval in BDRVRawState.
+ *
+ * Nothing is probed unless the device was opened with BDRV_O_NOCACHE, and
+ * nothing is recorded when device_is_integrity_capable reads as 0.  The two
+ * BDRVRawState fields are only written once every attribute has been read
+ * successfully, so a failure never leaves them half-updated.
+ *
+ * Returns 0 on success (including the "nothing to do" cases); on failure
+ * sets @errp and returns -errno.
+ */
+static int fill_pi_info(BlockDriverState *bs, Error **errp)
+{
+    BDRVRawState *s = bs->opaque;
+    g_autofree char *dev_name = NULL;
+    g_autofree char *sysfs_int_cap = NULL;
+    g_autofree char *sysfs_fmt = NULL;
+    g_autofree char *sysfs_bytes = NULL;
+    char buf[24];
+    uint8_t t10_type = 0;
+    int ret;
+
+    if (!(bs->open_flags & BDRV_O_NOCACHE)) {
+        return 0;
+    }
+
+    dev_name = g_path_get_basename(bs->filename);
+    sysfs_int_cap = g_strdup_printf(
+        "/sys/class/block/%s/integrity/device_is_integrity_capable",
+        dev_name);
+    sysfs_fmt = g_strdup_printf("/sys/class/block/%s/integrity/format",
+                                dev_name);
+    sysfs_bytes = g_strdup_printf(
+        "/sys/class/block/%s/integrity/protection_interval_bytes",
+        dev_name);
+
+    ret = pi_read_sysfs_attr(sysfs_int_cap, dev_name, "integrity capability",
+                             buf, sizeof(buf), errp);
+    if (ret < 0) {
+        return ret;
+    }
+    if (!g_ascii_strtoull(buf, NULL, 10)) {
+        /* Device is not integrity capable: leave the PI fields untouched. */
+        return 0;
+    }
+
+    ret = pi_read_sysfs_attr(sysfs_fmt, dev_name, "integrity format",
+                             buf, sizeof(buf), errp);
+    if (ret < 0) {
+        return ret;
+    }
+    if (strcmp(buf, "T10-DIF-TYPE1-CRC") == 0) {
+        t10_type = 1;
+    } else if (strcmp(buf, "T10-DIF-TYPE3-CRC") == 0) {
+        t10_type = 3;
+    }
+
+    ret = pi_read_sysfs_attr(sysfs_bytes, dev_name,
+                             "protection interval bytes",
+                             buf, sizeof(buf), errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    s->t10_type = t10_type;
+    s->protection_interval_bytes = g_ascii_strtoull(buf, NULL, 10);
+
+    return 0;
+}
+
+#endif
+
static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
Error **errp)
{
@@ -3601,6 +3710,11 @@ hdev_open_Mac_error:
/* Since this does ioctl the device must be already opened */
bs->sg = hdev_is_sg(bs);
+#if defined(CONFIG_LINUX_IO_URING)
+ if (s->use_linux_io_uring) {
+ ret = fill_pi_info(bs, errp);
+ }
+#endif
return ret;
}
@@ -3668,6 +3782,14 @@ static coroutine_fn int
hdev_co_pwrite_zeroes(BlockDriverState *bs,
return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
}
+/*
+ * .bdrv_get_info callback for host_device: copy the T10 protection type and
+ * the protection interval stored in BDRVRawState into @bdi.
+ *
+ * Always returns 0.
+ */
+static int hdev_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    BDRVRawState *state = bs->opaque;
+
+    bdi->protection_type = state->t10_type;
+    bdi->protection_interval = state->protection_interval_bytes;
+
+    return 0;
+}
+}
+
static BlockDriver bdrv_host_device = {
.format_name = "host_device",
.protocol_name = "host_device",
@@ -3698,8 +3820,8 @@ static BlockDriver bdrv_host_device = {
.bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
- .bdrv_getlength = raw_getlength,
- .bdrv_get_info = raw_get_info,
+ .bdrv_getlength = raw_getlength,
+ .bdrv_get_info = hdev_get_info,
.bdrv_get_allocated_file_size
= raw_get_allocated_file_size,
.bdrv_get_specific_stats = hdev_get_specific_stats,
diff --git a/block/io_uring.c b/block/io_uring.c
index 973e15d876..ba9fec1145 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -21,6 +21,84 @@
/* io_uring ring size */
#define MAX_ENTRIES 128
+#define IORING_OP_READV_PI (48)
+#define IORING_OP_WRITEV_PI (49)
+
+#pragma pack(push, 1)
+
+struct __io_uring_sqe { /* SQE layout with pi_addr/pi_len; the _pi prep helpers cast struct io_uring_sqe to it */
+ __u8 opcode; /* type of operation for this sqe */
+ __u8 flags; /* IOSQE_ flags */
+ __u16 ioprio; /* ioprio for the request */
+ __s32 fd; /* file descriptor to do IO on */
+ union {
+ __u64 off; /* offset into file */
+ __u64 addr2;
+ };
+ union {
+ __u64 addr; /* pointer to buffer or iovecs */
+ __u64 splice_off_in;
+ };
+ __u32 len; /* buffer size or number of iovecs */
+ union {
+ __kernel_rwf_t rw_flags;
+ __u32 fsync_flags;
+ __u16 poll_events; /* compatibility */
+ __u32 poll32_events; /* word-reversed for BE */
+ __u32 sync_range_flags;
+ __u32 msg_flags;
+ __u32 timeout_flags;
+ __u32 accept_flags;
+ __u32 cancel_flags;
+ __u32 open_flags;
+ __u32 statx_flags;
+ __u32 fadvise_advice;
+ __u32 splice_flags;
+ __u32 rename_flags;
+ __u32 unlink_flags;
+ __u32 hardlink_flags;
+ };
+ __u64 user_data; /* data to be passed back at completion time */
+ /* pack this to avoid bogus arm OABI complaints */
+ union {
+ /* index into fixed buffers, if used */
+ __u16 buf_index;
+ /* for grouped buffer selection */
+ __u16 buf_group;
+ } __attribute__((packed));
+ /* personality to use, if used */
+ __u16 personality;
+ union {
+ __s32 splice_fd_in;
+ __u32 file_index;
+ };
+ __u64 pi_addr; /* address of the protection information iovec array, set by the _pi prep helpers */
+ __u32 pi_len; /* number of protection information iovecs */
+ __u32 __pad2[1]; /* NOTE(review): presumably pads the struct to the 64-byte SQE size - confirm against the target kernel */
+};
+
+#pragma pack(pop)
+
+/*
+ * Prepare @sqe for a vectored write that carries protection information.
+ *
+ * @op:         opcode stored in the SQE (the caller passes
+ *              IORING_OP_WRITEV_PI)
+ * @sqe:        submission queue entry to fill in
+ * @fd:         file descriptor to do IO on
+ * @iovecs:     data iovec array
+ * @nr_vecs:    number of entries in @iovecs
+ * @pi_iovec:   protection information iovec array
+ * @nr_pi_vecs: number of entries in @pi_iovec
+ * @offset:     offset into the file
+ */
+static inline void __io_uring_prep_writev_pi(uint8_t op,
+    struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs,
+    unsigned nr_vecs, const struct iovec *pi_iovec, unsigned nr_pi_vecs,
+    off_t offset)
+{
+    struct __io_uring_sqe *pi_sqe = (struct __io_uring_sqe *)sqe;
+
+    /* Fill in the common read/write fields first, as before. */
+    io_uring_prep_rw(op, sqe, fd, iovecs, nr_vecs, offset);
+
+    /*
+     * Convert via uintptr_t: casting a pointer straight to a 64-bit integer
+     * is a size-mismatched cast on 32-bit hosts.
+     */
+    pi_sqe->pi_addr = (__u64)(uintptr_t)pi_iovec;
+    pi_sqe->pi_len = nr_pi_vecs;
+}
+
+/*
+ * Prepare @sqe for a vectored read that carries protection information.
+ *
+ * @op:         opcode stored in the SQE (the caller passes
+ *              IORING_OP_READV_PI)
+ * @sqe:        submission queue entry to fill in
+ * @fd:         file descriptor to do IO on
+ * @iovecs:     data iovec array
+ * @nr_vecs:    number of entries in @iovecs
+ * @pi_iovec:   protection information iovec array
+ * @nr_pi_vecs: number of entries in @pi_iovec
+ * @offset:     offset into the file
+ */
+static inline void __io_uring_prep_readv_pi(uint8_t op,
+    struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs,
+    unsigned nr_vecs, const struct iovec *pi_iovec, unsigned nr_pi_vecs,
+    off_t offset)
+{
+    struct __io_uring_sqe *pi_sqe = (struct __io_uring_sqe *)sqe;
+
+    /* Fill in the common read/write fields first, as before. */
+    io_uring_prep_rw(op, sqe, fd, iovecs, nr_vecs, offset);
+
+    /*
+     * Convert via uintptr_t: casting a pointer straight to a 64-bit integer
+     * is a size-mismatched cast on 32-bit hosts.
+     */
+    pi_sqe->pi_addr = (__u64)(uintptr_t)pi_iovec;
+    pi_sqe->pi_len = nr_pi_vecs;
+}
+
typedef struct LuringAIOCB {
Coroutine *co;
struct io_uring_sqe sqeq;
@@ -330,24 +408,39 @@ void luring_io_unplug(BlockDriverState *bs, LuringState
*s)
* @s: AIO state
* @offset: offset for request
* @type: type of request
+ * @is_pi: is protection information attached
*
* Fetches sqes from ring, adds to pending queue and preps them
*
*/
static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
- uint64_t offset, int type)
+ uint64_t offset, int type, bool is_pi)
{
int ret;
struct io_uring_sqe *sqes = &luringcb->sqeq;
switch (type) {
case QEMU_AIO_WRITE:
- io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
- luringcb->qiov->niov, offset);
+ if (is_pi) {
+ __io_uring_prep_writev_pi(IORING_OP_WRITEV_PI, sqes, fd,
+ luringcb->qiov->iov,
+ luringcb->qiov->niov,
+ &luringcb->qiov->dif, 1, offset);
+ } else {
+ io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
+ luringcb->qiov->niov, offset);
+ }
break;
case QEMU_AIO_READ:
- io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
- luringcb->qiov->niov, offset);
+ if (is_pi) {
+ __io_uring_prep_readv_pi(IORING_OP_READV_PI, sqes, fd,
+ luringcb->qiov->iov,
+ luringcb->qiov->niov,
+ &luringcb->qiov->dif, 1, offset);
+ } else {
+ io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
+ luringcb->qiov->niov, offset);
+ }
break;
case QEMU_AIO_FLUSH:
io_uring_prep_fsync(sqes, fd, IORING_FSYNC_DATASYNC);
@@ -374,7 +467,8 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb,
LuringState *s,
}
int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd,
- uint64_t offset, QEMUIOVector *qiov, int
type)
+ uint64_t offset, QEMUIOVector *qiov, int
type,
+ bool is_pi)
{
int ret;
LuringAIOCB luringcb = {
@@ -383,9 +477,10 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs,
LuringState *s, int fd,
.qiov = qiov,
.is_read = (type == QEMU_AIO_READ),
};
+
trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0,
type);
- ret = luring_do_submit(fd, &luringcb, s, offset, type);
+ ret = luring_do_submit(fd, &luringcb, s, offset, type, is_pi);
if (ret < 0) {
return ret;
diff --git a/include/block/block-common.h b/include/block/block-common.h
index 297704c1e9..1f283dbef8 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -59,6 +59,8 @@ typedef struct BlockDriverInfo {
* True if this block driver only supports compressed writes
*/
bool needs_compressed_writes;
+ uint8_t protection_type;
+ uint32_t protection_interval;
} BlockDriverInfo;
typedef struct BlockFragInfo {
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index 21fc10c4c9..3f715b4bcc 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -65,7 +65,8 @@ typedef struct LuringState LuringState;
LuringState *luring_init(Error **errp);
void luring_cleanup(LuringState *s);
int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd,
- uint64_t offset, QEMUIOVector *qiov, int type);
+ uint64_t offset, QEMUIOVector *qiov, int type,
+ bool is_pi);
void luring_detach_aio_context(LuringState *s, AioContext *old_context);
void luring_attach_aio_context(LuringState *s, AioContext *new_context);
void luring_io_plug(BlockDriverState *bs, LuringState *s);
diff --git a/include/qemu/iov.h b/include/qemu/iov.h
index 9330746680..58ae2d1f51 100644
--- a/include/qemu/iov.h
+++ b/include/qemu/iov.h
@@ -181,6 +181,9 @@ typedef struct QEMUIOVector {
size_t size;
};
};
+
+ /* T10 data integrity field */
+ struct iovec dif;
} QEMUIOVector;
QEMU_BUILD_BUG_ON(offsetof(QEMUIOVector, size) !=
@@ -229,6 +232,9 @@ int qemu_iovec_init_extended(
void *tail_buf, size_t tail_len);
void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
size_t offset, size_t len);
+void qemu_iovec_init_pi(QEMUIOVector *qiov, int alloc_hint,
+ unsigned int lba_cnt);
+void qemu_iovec_destroy_pi(QEMUIOVector *qiov);
int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len);
void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len);
void qemu_iovec_concat(QEMUIOVector *dst,
diff --git a/util/iov.c b/util/iov.c
index b4be580022..f0e51d5e66 100644
--- a/util/iov.c
+++ b/util/iov.c
@@ -20,6 +20,7 @@
#include "qemu/iov.h"
#include "qemu/sockets.h"
#include "qemu/cutils.h"
+#include "qemu/memalign.h"
size_t iov_from_buf_full(const struct iovec *iov, unsigned int iov_cnt,
size_t offset, const void *buf, size_t bytes)
@@ -278,6 +279,8 @@ void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint)
qiov->niov = 0;
qiov->nalloc = alloc_hint;
qiov->size = 0;
+ qiov->dif.iov_base = NULL;
+ qiov->dif.iov_len = 0;
}
void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov)
@@ -292,6 +295,19 @@ void qemu_iovec_init_external(QEMUIOVector *qiov, struct
iovec *iov, int niov)
qiov->size += iov[i].iov_len;
}
+/*
+ * Initialise @qiov like qemu_iovec_init() and attach a zero-filled buffer,
+ * aligned to the host page size, for protection information.
+ *
+ * @alloc_hint: forwarded to qemu_iovec_init()
+ * @lba_cnt:    number of 8-byte DIF entries the buffer must hold
+ */
+void qemu_iovec_init_pi(QEMUIOVector *qiov, int alloc_hint,
+                        unsigned int lba_cnt)
+{
+    void *pi_buf;
+
+    qemu_iovec_init(qiov, alloc_hint);
+
+    /*
+     * dif size is always 8 bytes.  Widen to size_t before shifting so that
+     * a large lba_cnt cannot wrap in unsigned int arithmetic.
+     */
+    qiov->dif.iov_len = (size_t)lba_cnt << 3;
+
+    pi_buf = qemu_memalign(qemu_real_host_page_size(), qiov->dif.iov_len);
+    qiov->dif.iov_base = memset(pi_buf, 0, qiov->dif.iov_len);
+}
+}
+
void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len)
{
assert(qiov->nalloc != -1);
@@ -530,12 +546,20 @@ void qemu_iovec_destroy(QEMUIOVector *qiov)
memset(qiov, 0, sizeof(*qiov));
}
+/*
+ * Release the protection information buffer attached to @qiov, then destroy
+ * the vector itself with qemu_iovec_destroy().
+ */
+void qemu_iovec_destroy_pi(QEMUIOVector *qiov)
+{
+    /*
+     * The buffer is allocated with qemu_memalign() in qemu_iovec_init_pi(),
+     * so it has to be released with qemu_vfree() rather than g_free().
+     */
+    qemu_vfree(qiov->dif.iov_base);
+
+    qemu_iovec_destroy(qiov);
+}
+
void qemu_iovec_reset(QEMUIOVector *qiov)
{
assert(qiov->nalloc != -1);
qiov->niov = 0;
qiov->size = 0;
+ qiov->dif.iov_len = 0;
}
size_t qemu_iovec_to_buf(QEMUIOVector *qiov, size_t offset,
--
2.38.1
- [RFC 0/5] Protection information pass-through for block devices, Dmitry Tihov, 2022/11/24
- [RFC 1/5] docs/nvme: add new feature summary, Dmitry Tihov, 2022/11/24
- [RFC 2/5] block: add transfer of protection information,
Dmitry Tihov <=
- [RFC 4/5] hw/nvme: implement pi pass read/write/wrz commands, Dmitry Tihov, 2022/11/24
- [RFC 3/5] hw/nvme: add protection information pass parameter, Dmitry Tihov, 2022/11/24
- [RFC 5/5] hw/nvme: extend pi pass capable commands, Dmitry Tihov, 2022/11/24
- Re: [RFC 0/5] Protection information pass-through for block devices, Klaus Jensen, 2022/11/25