qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [RFC v5 08/11] virtio-blk: add zoned storage APIs for zoned devices


From: Stefan Hajnoczi
Subject: Re: [RFC v5 08/11] virtio-blk: add zoned storage APIs for zoned devices
Date: Mon, 1 Aug 2022 11:46:03 -0400

On Sun, 31 Jul 2022 at 21:43, Sam Li <faithilikerun@gmail.com> wrote:
>
> This patch extends virtio-blk emulation to handle zoned device commands
> by calling the new block layer APIs to perform zoned device I/O on
> behalf of the guest. It supports Report Zone, and four zone oparations (open,
> close, finish, reset). The virtio-blk zoned device command specifications
> is currently in the reviewing process.
>
> VIRTIO_BLK_F_ZONED will only be set if the host does support zoned block
> devices. The regular block device will not be set. The guest os having
> zoned device support can use blkzone(8) to test those commands.
>
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/block-backend.c             |  92 ++++++++++++++++
>  hw/block/virtio-blk.c             | 172 +++++++++++++++++++++++++++++-
>  include/sysemu/block-backend-io.h |   6 ++
>  3 files changed, 268 insertions(+), 2 deletions(-)
>
> diff --git a/block/block-backend.c b/block/block-backend.c
> index ef6a1f33d5..8f2cfcbd9d 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo {
>      void *iobuf;
>      int ret;
>      BdrvRequestFlags flags;
> +    union {
> +        struct {
> +            unsigned int *nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } zone_report;
> +        struct {
> +            BlockZoneOp op;
> +        } zone_mgmt;
> +    };
>  } BlkRwCo;
>
>  int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
> @@ -1775,6 +1784,89 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
>      return ret;
>  }
>
> +static void blk_aio_zone_report_entry(void *opaque) {
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
> +                                   rwco->zone_report.nr_zones,
> +                                   rwco->zone_report.zones);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
> +                                unsigned int *nr_zones,
> +                                BlockZoneDescriptor  *zones,
> +                                BlockCompletionFunc *cb, void *opaque)
> +{
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +            .blk    = blk,
> +            .offset = offset,
> +            .ret    = NOT_DONE,
> +            .zone_report = {
> +                    .zones = zones,
> +                    .nr_zones = nr_zones,
> +            },
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> +                                         blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
> +static void blk_aio_zone_mgmt_entry(void *opaque) {
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op,
> +                                 rwco->offset, acb->bytes);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +                              int64_t offset, int64_t len,
> +                              BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +            .blk    = blk,
> +            .offset = offset,
> +            .ret    = NOT_DONE,
> +            .zone_mgmt = {
> +                    .op = op,
> +            },
> +    };
> +    acb->bytes = len;
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
> +    bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> +                                         blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index e9ba752f6b..9722f447a2 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -37,6 +37,7 @@
>  /* Config size before the discard support (hide associated config fields) */
>  #define VIRTIO_BLK_CFG_SIZE offsetof(struct virtio_blk_config, \
>                                       max_discard_sectors)
> +
>  /*
>   * Starting from the discard feature, we can use this array to properly
>   * set the config size depending on the features enabled.
> @@ -46,6 +47,8 @@ static const VirtIOFeature feature_sizes[] = {
>       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
>      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
>       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> +     .end = endof(struct virtio_blk_config, zoned)},
>      {}
>  };
>
> @@ -514,6 +517,131 @@ static void virtio_blk_handle_flush(VirtIOBlockReq 
> *req, MultiReqBuffer *mrb)
>      blk_aio_flush(s->blk, virtio_blk_flush_complete, req);
>  }
>
> +typedef struct ZoneReportData {
> +    VirtIOBlockReq *req;
> +    unsigned int nr_zones;
> +    BlockZoneDescriptor zones[];
> +} ZoneReportData;
> +
> +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> +{
> +    ZoneReportData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> +    struct iovec *in_iov = req->elem.in_sg;
> +    unsigned in_num = req->elem.in_num;
> +    int64_t zrp_size, nz, offset, n, j; /* offset is of in_iov */
> +
> +    nz = data->nr_zones;
> +    j = 0;
> +
> +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> +            .nr_zones = cpu_to_le64(nz),
> +    };
> +
> +    zrp_size = sizeof(struct virtio_blk_zone_report)
> +              + sizeof(struct virtio_blk_zone_descriptor) * nz;
> +    offset = sizeof(zrp_hdr);
> +
> +    memset(zrp_hdr.reserved, 0, 56);

Can the 56 magic number be replaced by sizeof(zrp_hdr.reserved)?

> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));

aio_context_acquire/release() is only necessary around
virtio_blk_req_complete(). Please move the
aio_context_acquire/release() calls so that they surround just the
virtio_blk_req_complete() call. That also solves the problem of the
missing aio_context_release() calls on the early-return error paths.

> +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, offset);

sizeof(zrp_hdr) would be clearer than offset. After that change the
offset local variable is no longer needed.

> +    if (n != sizeof(zrp_hdr)) {
> +        virtio_error(vdev, "Driver provided intput buffer that is too 
> small!");
> +        return;
> +    }
> +
> +    for (size_t i = offset; i < zrp_size; i += sizeof(struct 
> virtio_blk_zone_descriptor), ++j) {
> +        struct virtio_blk_zone_descriptor desc =
> +                (struct virtio_blk_zone_descriptor) {
> +                        .z_start = cpu_to_le64(data->zones[j].start),
> +                        .z_cap = cpu_to_le64(data->zones[j].cap),
> +                        .z_wp = cpu_to_le64(data->zones[j].wp),
> +                        .z_type = data->zones[j].type,
> +                        .z_state = data->zones[j].cond,
> +                };
> +        memset(desc.reserved, 0, 38);

The memset is unnecessary because fields that are not explicitly
initialized are automatically set to 0 in a struct literal.

> +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
> +        if (n != sizeof(desc)) {
> +            virtio_error(vdev, "Driver provided input buffer "
> +                               "for descriptors that is too small!");
> +            return;
> +        }
> +    }
> +    g_free(data);

data is leaked in the error cases. It may be easiest to add a goto out label.

> +
> +    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +}
> +
> +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req) {
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    unsigned int nr_zones;
> +    size_t data_size;
> +    ZoneReportData *data;
> +    int64_t offset;
> +
> +    if (req->in_len <= sizeof(struct virtio_blk_inhdr)) {
> +        virtio_error(vdev, "in buffer too small for zone report");
> +        return -1;
> +    }

This check should be req->in_len < sizeof(struct virtio_blk_inhdr) +
sizeof(struct virtio_blk_zone_report) to prevent integer underflow in
the nr_zones calculation below.

> +
> +    /* start offset of the zone report */
> +    offset = virtio_ldq_p(vdev, &req->out.sector);
> +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> +                sizeof(struct virtio_blk_zone_report)) /
> +                sizeof(struct virtio_blk_zone_descriptor);
> +    data_size = sizeof(ZoneReportData) +
> +                sizeof(BlockZoneDescriptor) * nr_zones;
> +    data = g_malloc(data_size);
> +    memset(data, 0, data_size);

You can use g_malloc0() instead of calling memset(3) explicitly.

> +    data->nr_zones = nr_zones;
> +    data->req = req;
> +
> +    blk_aio_zone_report(s->blk, offset, &data->nr_zones, data->zones,
> +                        virtio_blk_zone_report_complete, data);
> +    return 0;
> +}
> +
> +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret) {
> +    VirtIOBlockReq *req = opaque;
> +
> +    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);

What if the zone management command failed? I think the spec has some
error constants to report specific errors.

I think aio_context_acquire/release() is missing around the
virtio_blk_req_complete() call.

> +    virtio_blk_free_request(req);
> +}
> +
> +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) {
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector);
> +    struct iovec *out_iov =req->elem.out_sg;
> +    unsigned out_num = req->elem.out_num;
> +    BlockDriverState *state = blk_bs(s->blk);
> +    struct virtio_blk_zone_mgmt_outhdr zm_hdr;
> +    uint64_t len;
> +
> +    if (unlikely(iov_to_buf(out_iov, out_num, 0, &zm_hdr,
> +                            sizeof(zm_hdr)) != sizeof(zm_hdr))){
> +        virtio_error(vdev, "virtio-blk request zone_mgmt_outhdr too short");
> +        return -1;
> +    }
> +
> +    if (zm_hdr.flags & VIRTIO_BLK_ZONED_FLAG_ALL) {

There was discussion about dropping this flag and instead introducing
a dedicated request type (for RESET_ALL? I can't remember). This might
need to change depending on how the VIRTIO spec develops.

> +        /* Entire drive capacity */
> +        offset = 0;
> +        blk_get_geometry(s->blk, &len);
> +    } else {
> +        len = state->bl.zone_sectors;
> +    }
> +
> +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> +                      virtio_blk_zone_mgmt_complete, req);
> +    return 0;
> +}
> +
>  static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
>                                       uint64_t sector, size_t size)
>  {
> @@ -700,6 +828,21 @@ static int virtio_blk_handle_request(VirtIOBlockReq 
> *req, MultiReqBuffer *mrb)
>      case VIRTIO_BLK_T_FLUSH:
>          virtio_blk_handle_flush(req, mrb);
>          break;
> +    case VIRTIO_BLK_T_ZONE_REPORT:
> +        virtio_blk_handle_zone_report(req);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_OPEN:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_CLOSE:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_RESET:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_FINISH:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> +        break;
>      case VIRTIO_BLK_T_SCSI_CMD:
>          virtio_blk_handle_scsi(req);
>          break;
> @@ -913,10 +1056,10 @@ static void virtio_blk_reset(VirtIODevice *vdev)
>
>  /* coalesce internal state, copy to pci i/o region 0
>   */
> -static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> -{
> +static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config) {
>      VirtIOBlock *s = VIRTIO_BLK(vdev);
>      BlockConf *conf = &s->conf.conf;
> +    BlockDriverState *state = blk_bs(s->blk);
>      struct virtio_blk_config blkcfg;
>      uint64_t capacity;
>      int64_t length;
> @@ -976,6 +1119,25 @@ static void virtio_blk_update_config(VirtIODevice 
> *vdev, uint8_t *config)
>          blkcfg.write_zeroes_may_unmap = 1;
>          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
>      }
> +#ifdef CONFIG_BLKZONED
> +    blkcfg.zoned.model = state->bl.zoned;

The QEMU zone model constant must be explicitly converted to the
virtio-blk constant in case their values are different. Or you can use
QEMU_BUILD_BUG_ON() to check that the constant values are identical.

> +    if (state->bl.zoned != BLK_Z_NONE) {
> +        switch (state->bl.zoned) {
> +        case BLK_Z_HM:
> +        case BLK_Z_HA:
> +            virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors, 
> state->bl.zone_sectors);
> +            virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones, 
> state->bl.max_active_zones);
> +            virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones, 
> state->bl.max_open_zones);
> +            virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors, 
> state->bl.zone_append_max_bytes);
> +            virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> +            break;
> +        default:
> +            virtio_error(vdev, "Invalid zoned model %x \n", 
> (int)state->bl.zoned);
> +            blkcfg.zoned.model = BLK_Z_NONE;
> +            break;
> +        }
> +    }
> +#endif
>      memcpy(config, &blkcfg, s->config_size);
>  }
>
> @@ -995,6 +1157,7 @@ static uint64_t virtio_blk_get_features(VirtIODevice 
> *vdev, uint64_t features,
>                                          Error **errp)
>  {
>      VirtIOBlock *s = VIRTIO_BLK(vdev);
> +    BlockDriverState *state = blk_bs(s->blk);
>
>      /* Firstly sync all virtio-blk possible supported features */
>      features |= s->host_features;
> @@ -1003,6 +1166,8 @@ static uint64_t virtio_blk_get_features(VirtIODevice 
> *vdev, uint64_t features,
>      virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
>      virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
>      virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
> +    if (state->bl.zoned != BLK_Z_NONE)
> +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);

QEMU coding style uses {} around if statement bodies.

>      if (virtio_has_feature(features, VIRTIO_F_VERSION_1)) {
>          if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_SCSI)) {
>              error_setg(errp, "Please set scsi=off for virtio-blk devices in 
> order to use virtio 1.0");
> @@ -1286,6 +1451,9 @@ static Property virtio_blk_properties[] = {
>  #ifdef __linux__
>      DEFINE_PROP_BIT64("scsi", VirtIOBlock, host_features,
>                        VIRTIO_BLK_F_SCSI, false),
> +#endif
> +#ifdef CONFIG_BLKZONED
> +    DEFINE_PROP_BIT64("zoned", VirtIOBlock, host_features, 
> VIRTIO_BLK_F_ZONED, true),
>  #endif
>      DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
>                      true),
> diff --git a/include/sysemu/block-backend-io.h 
> b/include/sysemu/block-backend-io.h
> index 50f5aa2e07..6fe98bd1ed 100644
> --- a/include/sysemu/block-backend-io.h
> +++ b/include/sysemu/block-backend-io.h
> @@ -45,6 +45,12 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t 
> offset,
>                              BlockCompletionFunc *cb, void *opaque);
>  BlockAIOCB *blk_aio_flush(BlockBackend *blk,
>                            BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
> +                                unsigned int *nr_zones, BlockZoneDescriptor 
> *zones,
> +                                BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +                              int64_t offset, int64_t len,
> +                              BlockCompletionFunc *cb, void *opaque);
>  BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t 
> bytes,
>                               BlockCompletionFunc *cb, void *opaque);
>  void blk_aio_cancel_async(BlockAIOCB *acb);
> --
> 2.37.1
>
>



reply via email to

[Prev in Thread] Current Thread [Next in Thread]