qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v7 4/8] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls


From: Damien Le Moal
Subject: Re: [PATCH v7 4/8] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
Date: Wed, 24 Aug 2022 16:46:52 -0700
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:91.0) Gecko/20100101 Thunderbird/91.12.0

On 2022/08/22 21:12, Sam Li wrote:
> Stefan Hajnoczi <stefanha@redhat.com> 于2022年8月23日周二 08:49写道:
>>
>> On Tue, Aug 16, 2022 at 02:25:18PM +0800, Sam Li wrote:
>>> By adding zone management operations in BlockDriver, storage controller
>>> emulation can use the new block layer APIs including Report Zone and
>>> four zone management operations (open, close, finish, reset).
>>>
>>> Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
>>> zone_close(zc), zone_reset(zrs), zone_finish(zf).
>>>
>>> For example, to test zone_report, use following command:
>>> $ ./build/qemu-io --image-opts driver=zoned_host_device, 
>>> filename=/dev/nullb0
>>> -c "zrp offset nr_zones"
>>>
>>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>>> Reviewed-by: Hannes Reinecke <hare@suse.de>
>>> ---
>>>  block/block-backend.c             |  50 +++++
>>>  block/file-posix.c                | 341 +++++++++++++++++++++++++++++-
>>>  block/io.c                        |  41 ++++
>>>  include/block/block-common.h      |   1 -
>>>  include/block/block-io.h          |  13 ++
>>>  include/block/block_int-common.h  |  22 +-
>>>  include/block/raw-aio.h           |   6 +-
>>>  include/sysemu/block-backend-io.h |   6 +
>>>  meson.build                       |   1 +
>>>  qapi/block-core.json              |   8 +-
>>>  qemu-io-cmds.c                    | 143 +++++++++++++
>>>  11 files changed, 625 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>> index d4a5df2ac2..fc639b0cd7 100644
>>> --- a/block/block-backend.c
>>> +++ b/block/block-backend.c
>>> @@ -1775,6 +1775,56 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
>>>      return ret;
>>>  }
>>>
>>> +/*
>>> + * Send a zone_report command.
>>> + * offset is a byte offset from the start of the device. No alignment
>>> + * required for offset.
>>> + * nr_zones represents IN maximum and OUT actual.
>>> + */
>>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>>> +                                    unsigned int *nr_zones,
>>> +                                    BlockZoneDescriptor *zones)
>>> +{
>>> +    int ret;
>>> +    IO_CODE();
>>> +
>>> +    blk_inc_in_flight(blk); /* increase before waiting */
>>> +    blk_wait_while_drained(blk);
>>> +    if (!blk_is_available(blk)) {
>>> +        blk_dec_in_flight(blk);
>>> +        return -ENOMEDIUM;
>>> +    }
>>> +    ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
>>> +    blk_dec_in_flight(blk);
>>> +    return ret;
>>> +}
>>> +
>>> +/*
>>> + * Send a zone_management command.
>>> + * offset is the starting zone specified as a sector offset.
>>> + * len is the maximum number of sectors the command should operate on.
>>> + */
>>> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>> +        int64_t offset, int64_t len)
>>> +{
>>> +    int ret;
>>> +    IO_CODE();
>>> +
>>> +    ret = blk_check_byte_request(blk, offset, len);
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>
>> blk_check_byte_request() calls blk_is_available() and returns -ENOMEDIUM
>> when it fails. You can therefore move this down and replace "if
>> (!blk_is_available(blk)) {".
>>
>>> +    blk_inc_in_flight(blk);
>>> +    blk_wait_while_drained(blk);
>>> +    if (!blk_is_available(blk)) {
>>> +        blk_dec_in_flight(blk);
>>> +        return -ENOMEDIUM;
>>> +    }
>>> +    ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
>>> +    blk_dec_in_flight(blk);
>>> +    return ret;
>>> +}
>>> +
>>>  void blk_drain(BlockBackend *blk)
>>>  {
>>>      BlockDriverState *bs = blk_bs(blk);
>>> diff --git a/block/file-posix.c b/block/file-posix.c
>>> index 727389488c..29f67082d9 100644
>>> --- a/block/file-posix.c
>>> +++ b/block/file-posix.c
>>> @@ -67,6 +67,9 @@
>>>  #include <sys/param.h>
>>>  #include <sys/syscall.h>
>>>  #include <sys/vfs.h>
>>> +#if defined(CONFIG_BLKZONED)
>>> +#include <linux/blkzoned.h>
>>> +#endif
>>>  #include <linux/cdrom.h>
>>>  #include <linux/fd.h>
>>>  #include <linux/fs.h>
>>> @@ -216,6 +219,13 @@ typedef struct RawPosixAIOData {
>>>              PreallocMode prealloc;
>>>              Error **errp;
>>>          } truncate;
>>> +        struct {
>>> +            unsigned int *nr_zones;
>>> +            BlockZoneDescriptor *zones;
>>> +        } zone_report;
>>> +        struct {
>>> +            unsigned long ioctl_op;
>>> +        } zone_mgmt;
>>>      };
>>>  } RawPosixAIOData;
>>>
>>> @@ -1328,7 +1338,7 @@ static void raw_refresh_limits(BlockDriverState *bs, 
>>> Error **errp)
>>>  #endif
>>>
>>>      if (bs->sg || S_ISBLK(st.st_mode)) {
>>> -        int ret = hdev_get_max_hw_transfer(s->fd, &st);
>>> +        ret = hdev_get_max_hw_transfer(s->fd, &st);
>>>
>>>          if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
>>>              bs->bl.max_hw_transfer = ret;
>>> @@ -1340,11 +1350,32 @@ static void raw_refresh_limits(BlockDriverState 
>>> *bs, Error **errp)
>>>          }
>>>      }
>>>
>>> -    ret = get_sysfs_zoned_model(s->fd, &st, &zoned);
>>> +    ret = get_sysfs_zoned_model(&st, &zoned);
>>>      if (ret < 0) {
>>>          zoned = BLK_Z_NONE;
>>>      }
>>>      bs->bl.zoned = zoned;
>>> +    if (zoned != BLK_Z_NONE) {
>>> +        ret = get_sysfs_long_val(&st, "chunk_sectors");
>>> +        if (ret > 0) {
>>> +            bs->bl.zone_sectors = ret;
>>> +        }
>>> +
>>> +        ret = get_sysfs_long_val(&st, "zone_append_max_bytes");
>>> +        if (ret > 0) {
>>> +            bs->bl.zone_append_max_bytes = ret;
>>> +        }
>>> +
>>> +        ret = get_sysfs_long_val(&st, "max_open_zones");
>>> +        if (ret > 0) {
>>> +            bs->bl.max_open_zones = ret;
>>> +        }
>>> +
>>> +        ret = get_sysfs_long_val(&st, "max_active_zones");
>>> +        if (ret > 0) {
>>> +            bs->bl.max_active_zones = ret;
>>> +        }
>>> +    }
>>>  }
>>>
>>>  static int check_for_dasd(int fd)
>>> @@ -1839,6 +1870,134 @@ static off_t copy_file_range(int in_fd, off_t 
>>> *in_off, int out_fd,
>>>  }
>>>  #endif
>>>
>>> +/*
>>> + * parse_zone - Fill a zone descriptor
>>> + */
>>> +#if defined(CONFIG_BLKZONED)
>>> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
>>> +                              struct blk_zone *blkz) {
>>
>> Declaring the second argument "const struct blk_zone *blkz" would make
>> it clear that this function converts from blk_zone to
>> BlockZoneDescriptor.
>>
>>> +    zone->start = blkz->start;
>>> +    zone->length = blkz->len;
>>> +    zone->cap = blkz->capacity;
>>> +    zone->wp = blkz->wp;
>>> +
>>> +    switch (blkz->type) {
>>> +    case BLK_ZONE_TYPE_SEQWRITE_REQ:
>>> +        zone->type = BLK_ZT_SWR;
>>> +        break;
>>> +    case BLK_ZONE_TYPE_SEQWRITE_PREF:
>>> +        zone->type = BLK_ZT_SWP;
>>> +        break;
>>> +    case BLK_ZONE_TYPE_CONVENTIONAL:
>>> +        zone->type = BLK_ZT_CONV;
>>> +        break;
>>> +    default:
>>> +        error_report("Invalid zone type: 0x%x", blkz->type);
>>
>> Or g_assert_not_reached() to indicate that this should never happen. If
>> it does happen the process will call abort(3) and it will terminate with
>> a coredump file for debugging.
>>
>>> +    }
>>> +
>>> +    switch (blkz->cond) {
>>> +    case BLK_ZONE_COND_NOT_WP:
>>> +        zone->cond = BLK_ZS_NOT_WP;
>>> +        break;
>>> +    case BLK_ZONE_COND_EMPTY:
>>> +        zone->cond = BLK_ZS_EMPTY;
>>> +        break;
>>> +    case BLK_ZONE_COND_IMP_OPEN:
>>> +        zone->cond =BLK_ZS_IOPEN;
>>> +        break;
>>> +    case BLK_ZONE_COND_EXP_OPEN:
>>> +        zone->cond = BLK_ZS_EOPEN;
>>> +        break;
>>> +    case BLK_ZONE_COND_CLOSED:
>>> +        zone->cond = BLK_ZS_CLOSED;
>>> +        break;
>>> +    case BLK_ZONE_COND_READONLY:
>>> +        zone->cond = BLK_ZS_RDONLY;
>>> +        break;
>>> +    case BLK_ZONE_COND_FULL:
>>> +        zone->cond = BLK_ZS_FULL;
>>> +        break;
>>> +    case BLK_ZONE_COND_OFFLINE:
>>> +        zone->cond = BLK_ZS_OFFLINE;
>>> +        break;
>>> +    default:
>>> +        error_report("Invalid zone condition 0x%x", blkz->cond);
>>
>> Same here.
>>
>>> +    }
>>> +}
>>> +#endif
>>> +
>>> +static int handle_aiocb_zone_report(void *opaque) {
>>> +#if defined(CONFIG_BLKZONED)
>>> +    RawPosixAIOData *aiocb = opaque;
>>> +    int fd = aiocb->aio_fildes;
>>> +    unsigned int *nr_zones = aiocb->zone_report.nr_zones;
>>> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
>>> +    int64_t sector = aiocb->aio_offset;
>>> +
>>> +    struct blk_zone *blkz;
>>> +    int64_t rep_size;
>>> +    unsigned int nrz;
>>> +    int ret, n = 0, i = 0;
>>> +
>>> +    nrz = *nr_zones;
>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct 
>>> blk_zone);
>>> +    g_autofree struct blk_zone_report *rep = NULL;
>>> +    rep = g_malloc(rep_size);
>>> +
>>> +    blkz = (struct blk_zone *)(rep + 1);
>>> +    while (n < nrz) {
>>> +        memset(rep, 0, rep_size);
>>> +        rep->sector = sector;
>>> +        rep->nr_zones = nrz - n;
>>> +
>>> +        ret = ioctl(fd, BLKREPORTZONE, rep);
>>
>> Does this ioctl() need "do { ... } while (ret == -1 && errno == EINTR)"?
> 
> No? We discussed this before. I guess even EINTR should be propagated
> back to the guest. Maybe Damien can talk more about why.

In the kernel, completion of zone management I/O requests is waited for using
wait_for_completion_io(), which uses TASK_UNINTERRUPTIBLE. A signal will therefore
not abort anything, so I do not think that the do { } while() loop is necessary.

> 
>>
>>> +        if (ret != 0) {
>>> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed 
>>> %d",
>>> +                         fd, sector, errno);
>>> +            return -errno;
>>> +        }
>>> +
>>> +        if (!rep->nr_zones) {
>>> +            break;
>>> +        }
>>> +
>>> +        for (i = 0; i < rep->nr_zones; i++, n++) {
>>> +            parse_zone(&zones[n], &blkz[i]);
>>> +            /* The next report should start after the last zone reported */
>>> +            sector = blkz[i].start + blkz[i].len;
>>> +        }
>>> +    }
>>> +
>>> +    *nr_zones = n;
>>> +    return 0;
>>> +#else
>>> +    return -ENOTSUP;
>>> +#endif
>>> +}
>>> +
>>> +static int handle_aiocb_zone_mgmt(void *opaque) {
>>> +#if defined(CONFIG_BLKZONED)
>>> +    RawPosixAIOData *aiocb = opaque;
>>> +    int fd = aiocb->aio_fildes;
>>> +    int64_t sector = aiocb->aio_offset;
>>> +    int64_t nr_sectors = aiocb->aio_nbytes;
>>> +    unsigned long ioctl_op = aiocb->zone_mgmt.ioctl_op;
>>> +    struct blk_zone_range range;
>>> +    int ret;
>>> +
>>> +    /* Execute the operation */
>>> +    range.sector = sector;
>>> +    range.nr_sectors = nr_sectors;
>>> +    do {
>>> +        ret = ioctl(fd, ioctl_op, &range);
>>> +    } while (ret != 0 && errno == EINTR);
>>> +
>>> +    return ret;
>>
>>   if (ret < 0) {
>>       return -errno;
>>   }
>>   return 0;
>>
>>> +#else
>>> +    return -ENOTSUP;
>>> +#endif
>>> +}
>>> +
>>>  static int handle_aiocb_copy_range(void *opaque)
>>>  {
>>>      RawPosixAIOData *aiocb = opaque;
>>> @@ -3011,6 +3170,124 @@ static void raw_account_discard(BDRVRawState *s, 
>>> uint64_t nbytes, int ret)
>>>      }
>>>  }
>>>
>>> +/*
>>> + * zone report - Get a zone block device's information in the form
>>> + * of an array of zone descriptors.
>>> + *
>>> + * @param bs: passing zone block device file descriptor
>>> + * @param zones: an array of zone descriptors to hold zone
>>> + * information on reply
>>> + * @param offset: offset can be any byte within the zone size.
>>> + * @param len: (not sure yet.
>>> + * @return 0 on success, -1 on failure
>>> + */
>>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t 
>>> offset,
>>> +                                           unsigned int *nr_zones,
>>> +                                           BlockZoneDescriptor *zones) {
>>> +#if defined(CONFIG_BLKZONED)
>>> +    BDRVRawState *s = bs->opaque;
>>> +    RawPosixAIOData acb;
>>> +
>>> +    acb = (RawPosixAIOData) {
>>> +        .bs         = bs,
>>> +        .aio_fildes = s->fd,
>>> +        .aio_type   = QEMU_AIO_ZONE_REPORT,
>>> +        /* zoned block devices use 512-byte sectors */
>>> +        .aio_offset = offset / 512,
>>> +        .zone_report    = {
>>> +                .nr_zones       = nr_zones,
>>> +                .zones          = zones,
>>> +        },
>>> +    };
>>> +
>>> +    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
>>> +#else
>>> +    return -ENOTSUP;
>>> +#endif
>>> +}
>>> +
>>> +/*
>>> + * zone management operations - Execute an operation on a zone
>>> + */
>>> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp 
>>> op,
>>> +        int64_t offset, int64_t len) {
>>> +#if defined(CONFIG_BLKZONED)
>>> +    BDRVRawState *s = bs->opaque;
>>> +    RawPosixAIOData acb;
>>> +    int64_t zone_sector, zone_sector_mask;
>>> +    const char *ioctl_name;
>>> +    unsigned long ioctl_op;
>>> +    int ret;
>>> +
>>> +    struct stat st;
>>> +    if (fstat(s->fd, &st) < 0) {
>>> +        ret = -errno;
>>> +        return ret;
>>> +    }
>>> +    zone_sector = get_sysfs_long_val(&st, "chunk_sectors");
>>> +    if (zone_sector < 0) {
>>> +        error_report("invalid zone sector size %" PRId64 "", zone_sector);
>>> +        return -EINVAL;
>>> +    }
>>> +
>>> +    zone_sector_mask = zone_sector - 1;
>>> +    if (offset & zone_sector_mask) {
>>> +        error_report("sector offset %" PRId64 " is not aligned to zone 
>>> size "
>>> +                     "%" PRId64 "", offset, zone_sector);
>>> +        return -EINVAL;
>>> +    }
>>> +
>>> +    if (len & zone_sector_mask) {
>>> +        error_report("number of sectors %" PRId64 " is not aligned to zone 
>>> size"
>>> +                      " %" PRId64 "", len, zone_sector);
>>> +        return -EINVAL;
>>> +    }
>>> +
>>> +    switch (op) {
>>> +    case BLK_ZO_OPEN:
>>> +        ioctl_name = "BLKOPENZONE";
>>> +        ioctl_op = BLKOPENZONE;
>>> +        break;
>>> +    case BLK_ZO_CLOSE:
>>> +        ioctl_name = "BLKCLOSEZONE";
>>> +        ioctl_op = BLKCLOSEZONE;
>>> +        break;
>>> +    case BLK_ZO_FINISH:
>>> +        ioctl_name = "BLKFINISHZONE";
>>> +        ioctl_op = BLKFINISHZONE;
>>> +        break;
>>> +    case BLK_ZO_RESET:
>>> +        ioctl_name = "BLKRESETZONE";
>>> +        ioctl_op = BLKRESETZONE;
>>> +        break;
>>> +    default:
>>> +        error_report("Invalid zone operation 0x%x", op);
>>> +        return -EINVAL;
>>> +    }
>>> +
>>> +    acb = (RawPosixAIOData) {
>>> +        .bs             = bs,
>>> +        .aio_fildes     = s->fd,
>>> +        .aio_type       = QEMU_AIO_ZONE_MGMT,
>>> +        .aio_offset     = offset,
>>> +        .aio_nbytes     = len,
>>> +        .zone_mgmt  = {
>>> +                .ioctl_op = ioctl_op,
>>> +        },
>>> +    };
>>> +
>>> +    ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
>>> +    if (ret != 0) {
>>> +        error_report("ioctl %s failed %d", ioctl_name, errno);
>>> +        return -errno;
>>> +    }
>>> +
>>> +    return ret;
>>> +#else
>>> +    return -ENOTSUP;
>>> +#endif
>>> +}
>>> +
>>>  static coroutine_fn int
>>>  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>>>                  bool blkdev)
>>> @@ -3511,6 +3788,14 @@ static void hdev_parse_filename(const char 
>>> *filename, QDict *options,
>>>      bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
>>>  }
>>>
>>> +#if defined(CONFIG_BLKZONED)
>>> +static void zoned_host_device_parse_filename(const char *filename, QDict 
>>> *options,
>>> +                                Error **errp)
>>> +{
>>> +    bdrv_parse_filename_strip_prefix(filename, "zoned_host_device:", 
>>> options);
>>> +}
>>> +#endif
>>
>> Sorry, I asked you to add this function but I've changed my mind and I
>> think it should not be present. .bdrv_parse_filename() helps legacy
>> drivers convert arguments into QDict *options. But this is a new driver
>> that no one expects to work with string filenames. Therefore
>> .bdrv_parse_filename can be dropped.
>>
>>> +
>>>  static bool hdev_is_sg(BlockDriverState *bs)
>>>  {
>>>
>>> @@ -3741,6 +4026,55 @@ static BlockDriver bdrv_host_device = {
>>>  #endif
>>>  };
>>>
>>> +#if defined(CONFIG_BLKZONED)
>>> +static BlockDriver bdrv_zoned_host_device = {
>>> +        .format_name = "zoned_host_device",
>>> +        .protocol_name = "zoned_host_device",
>>> +        .instance_size = sizeof(BDRVRawState),
>>> +        .bdrv_needs_filename = true,
>>> +        .bdrv_probe_device  = hdev_probe_device,
>>> +        .bdrv_parse_filename = zoned_host_device_parse_filename,
>>> +        .bdrv_file_open     = hdev_open,
>>> +        .bdrv_close         = raw_close,
>>> +        .bdrv_reopen_prepare = raw_reopen_prepare,
>>> +        .bdrv_reopen_commit  = raw_reopen_commit,
>>> +        .bdrv_reopen_abort   = raw_reopen_abort,
>>> +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
>>> +        .create_opts         = &bdrv_create_opts_simple,
>>> +        .mutable_opts        = mutable_opts,
>>> +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
>>> +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
>>> +
>>> +        .bdrv_co_preadv         = raw_co_preadv,
>>> +        .bdrv_co_pwritev        = raw_co_pwritev,
>>> +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
>>> +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
>>> +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
>>> +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
>>> +        .bdrv_refresh_limits = raw_refresh_limits,
>>> +        .bdrv_io_plug = raw_aio_plug,
>>> +        .bdrv_io_unplug = raw_aio_unplug,
>>> +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>>> +
>>> +        .bdrv_co_truncate       = raw_co_truncate,
>>> +        .bdrv_getlength = raw_getlength,
>>> +        .bdrv_get_info = raw_get_info,
>>> +        .bdrv_get_allocated_file_size
>>> +                            = raw_get_allocated_file_size,
>>> +        .bdrv_get_specific_stats = hdev_get_specific_stats,
>>> +        .bdrv_check_perm = raw_check_perm,
>>> +        .bdrv_set_perm   = raw_set_perm,
>>> +        .bdrv_abort_perm_update = raw_abort_perm_update,
>>> +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
>>> +        .bdrv_probe_geometry = hdev_probe_geometry,
>>> +        .bdrv_co_ioctl = hdev_co_ioctl,
>>> +
>>> +        /* zone management operations */
>>> +        .bdrv_co_zone_report = raw_co_zone_report,
>>> +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>>> +};
>>> +#endif
>>> +
>>>  #if defined(__linux__) || defined(__FreeBSD__) || 
>>> defined(__FreeBSD_kernel__)
>>>  static void cdrom_parse_filename(const char *filename, QDict *options,
>>>                                   Error **errp)
>>> @@ -4001,6 +4335,9 @@ static void bdrv_file_init(void)
>>>      bdrv_register(&bdrv_file);
>>>  #if defined(HAVE_HOST_BLOCK_DEVICE)
>>>      bdrv_register(&bdrv_host_device);
>>> +#if defined(CONFIG_BLKZONED)
>>> +    bdrv_register(&bdrv_zoned_host_device);
>>> +#endif
>>>  #ifdef __linux__
>>>      bdrv_register(&bdrv_host_cdrom);
>>>  #endif
>>> diff --git a/block/io.c b/block/io.c
>>> index 0a8cbefe86..de9ec1d740 100644
>>> --- a/block/io.c
>>> +++ b/block/io.c
>>> @@ -3198,6 +3198,47 @@ out:
>>>      return co.ret;
>>>  }
>>>
>>> +int bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>>> +                        unsigned int *nr_zones,
>>> +                        BlockZoneDescriptor *zones)
>>> +{
>>> +    BlockDriver *drv = bs->drv;
>>> +    CoroutineIOCompletion co = {
>>> +            .coroutine = qemu_coroutine_self(),
>>> +    };
>>> +    IO_CODE();
>>> +
>>> +    bdrv_inc_in_flight(bs);
>>> +    if (!drv || !drv->bdrv_co_zone_report) {
>>> +        co.ret = -ENOTSUP;
>>> +        goto out;
>>> +    }
>>> +    co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
>>> +out:
>>> +    bdrv_dec_in_flight(bs);
>>> +    return co.ret;
>>> +}
>>> +
>>> +int bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>> +        int64_t offset, int64_t len)
>>> +{
>>> +    BlockDriver *drv = bs->drv;
>>> +    CoroutineIOCompletion co = {
>>> +            .coroutine = qemu_coroutine_self(),
>>> +    };
>>> +    IO_CODE();
>>> +
>>> +    bdrv_inc_in_flight(bs);
>>> +    if (!drv || !drv->bdrv_co_zone_mgmt) {
>>> +        co.ret = -ENOTSUP;
>>> +        goto out;
>>> +    }
>>> +    co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
>>> +out:
>>> +    bdrv_dec_in_flight(bs);
>>> +    return co.ret;
>>> +}
>>> +
>>>  void *qemu_blockalign(BlockDriverState *bs, size_t size)
>>>  {
>>>      IO_CODE();
>>> diff --git a/include/block/block-common.h b/include/block/block-common.h
>>> index 36bd0e480e..5102fa6858 100644
>>> --- a/include/block/block-common.h
>>> +++ b/include/block/block-common.h
>>> @@ -23,7 +23,6 @@
>>>   */
>>>  #ifndef BLOCK_COMMON_H
>>>  #define BLOCK_COMMON_H
>>> -
>>>  #include "block/aio.h"
>>>  #include "block/aio-wait.h"
>>>  #include "qemu/iov.h"
>>
>> Unrelated whitespace change. Please drop this.
>>
>>> diff --git a/include/block/block-io.h b/include/block/block-io.h
>>> index fd25ffa9be..55ad261e16 100644
>>> --- a/include/block/block-io.h
>>> +++ b/include/block/block-io.h
>>> @@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void 
>>> *buf);
>>>  /* Ensure contents are flushed to disk.  */
>>>  int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
>>>
>>> +/* Report zone information of zone block device. */
>>> +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>>> +                                     unsigned int *nr_zones,
>>> +                                     BlockZoneDescriptor *zones);
>>> +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>> +                                   int64_t offset, int64_t len);
>>> +
>>>  int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
>>>  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
>>>  int bdrv_block_status(BlockDriverState *bs, int64_t offset,
>>> @@ -297,6 +304,12 @@ bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector 
>>> *qiov, int64_t pos);
>>>  int generated_co_wrapper
>>>  bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
>>>
>>> +int generated_co_wrapper
>>> +blk_zone_report(BlockBackend *blk, int64_t offset, unsigned int *nr_zones,
>>> +                BlockZoneDescriptor *zones);
>>> +int generated_co_wrapper
>>> +blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, int64_t offset, int64_t 
>>> len);
>>> +
>>>  /**
>>>   * bdrv_parent_drained_begin_single:
>>>   *
>>> diff --git a/include/block/block_int-common.h 
>>> b/include/block/block_int-common.h
>>> index 7f7863cc9e..de44c7b6f4 100644
>>> --- a/include/block/block_int-common.h
>>> +++ b/include/block/block_int-common.h
>>> @@ -94,7 +94,6 @@ typedef struct BdrvTrackedRequest {
>>>      struct BdrvTrackedRequest *waiting_for;
>>>  } BdrvTrackedRequest;
>>>
>>> -
>>>  struct BlockDriver {
>>>      /*
>>>       * These fields are initialized when this object is created,
>>
>> Unrelated whitespace change. Please drop this.
>>
>>> @@ -691,6 +690,12 @@ struct BlockDriver {
>>>                                            QEMUIOVector *qiov,
>>>                                            int64_t pos);
>>>
>>> +    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
>>> +            int64_t offset, unsigned int *nr_zones,
>>> +            BlockZoneDescriptor *zones);
>>> +    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, 
>>> BlockZoneOp op,
>>> +            int64_t offset, int64_t len);
>>> +
>>>      /* removable device specific */
>>>      bool (*bdrv_is_inserted)(BlockDriverState *bs);
>>>      void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
>>> @@ -828,6 +833,21 @@ typedef struct BlockLimits {
>>>
>>>      /* device zone model */
>>>      BlockZoneModel zoned;
>>> +
>>> +    /* zone size expressed in 512-byte sectors */
>>> +    uint32_t zone_sectors;
>>> +
>>> +    /* total number of zones */
>>> +    unsigned int nr_zones;
>>> +
>>> +    /* maximum size in bytes of a zone append write operation */
>>> +    int64_t zone_append_max_bytes;
>>> +
>>> +    /* maximum number of open zones */
>>> +    int64_t max_open_zones;
>>> +
>>> +    /* maximum number of active zones */
>>> +    int64_t max_active_zones;
>>>  } BlockLimits;
>>>
>>>  typedef struct BdrvOpBlocker BdrvOpBlocker;
>>> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
>>> index 21fc10c4c9..3d26929cdd 100644
>>> --- a/include/block/raw-aio.h
>>> +++ b/include/block/raw-aio.h
>>> @@ -29,6 +29,8 @@
>>>  #define QEMU_AIO_WRITE_ZEROES 0x0020
>>>  #define QEMU_AIO_COPY_RANGE   0x0040
>>>  #define QEMU_AIO_TRUNCATE     0x0080
>>> +#define QEMU_AIO_ZONE_REPORT  0x0100
>>> +#define QEMU_AIO_ZONE_MGMT    0x0200
>>>  #define QEMU_AIO_TYPE_MASK \
>>>          (QEMU_AIO_READ | \
>>>           QEMU_AIO_WRITE | \
>>> @@ -37,7 +39,9 @@
>>>           QEMU_AIO_DISCARD | \
>>>           QEMU_AIO_WRITE_ZEROES | \
>>>           QEMU_AIO_COPY_RANGE | \
>>> -         QEMU_AIO_TRUNCATE)
>>> +         QEMU_AIO_TRUNCATE  | \
>>> +         QEMU_AIO_ZONE_REPORT | \
>>> +         QEMU_AIO_ZONE_MGMT)
>>>
>>>  /* AIO flags */
>>>  #define QEMU_AIO_MISALIGNED   0x1000
>>> diff --git a/include/sysemu/block-backend-io.h 
>>> b/include/sysemu/block-backend-io.h
>>> index 50f5aa2e07..6e7df1d93b 100644
>>> --- a/include/sysemu/block-backend-io.h
>>> +++ b/include/sysemu/block-backend-io.h
>>> @@ -156,6 +156,12 @@ int generated_co_wrapper 
>>> blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
>>>  int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
>>>                                        int64_t bytes, BdrvRequestFlags 
>>> flags);
>>>
>>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>>> +                                    unsigned int *nr_zones,
>>> +                                    BlockZoneDescriptor *zones);
>>> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>>> +                                  int64_t offset, int64_t len);
>>> +
>>>  int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
>>>                                        int64_t bytes);
>>>  int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
>>> diff --git a/meson.build b/meson.build
>>> index 294e9a8f32..c3219b0e87 100644
>>> --- a/meson.build
>>> +++ b/meson.build
>>> @@ -1883,6 +1883,7 @@ config_host_data.set('CONFIG_REPLICATION', 
>>> get_option('live_block_migration').al
>>>  # has_header
>>>  config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h'))
>>>  config_host_data.set('CONFIG_LINUX_MAGIC_H', 
>>> cc.has_header('linux/magic.h'))
>>> +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h'))
>>>  config_host_data.set('CONFIG_VALGRIND_H', 
>>> cc.has_header('valgrind/valgrind.h'))
>>>  config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h'))
>>>  config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h'))
>>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>>> index 2173e7734a..c6bbb7a037 100644
>>> --- a/qapi/block-core.json
>>> +++ b/qapi/block-core.json
>>> @@ -2942,6 +2942,7 @@
>>>  # @compress: Since 5.0
>>>  # @copy-before-write: Since 6.2
>>>  # @snapshot-access: Since 7.0
>>> +# @zoned_host_device: Since 7.2
>>>  #
>>>  # Since: 2.9
>>>  ##
>>> @@ -2955,7 +2956,8 @@
>>>              'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 
>>> 'parallels',
>>>              'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
>>>              { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
>>> -            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
>>> +            'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat',
>>> +            { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] }
>>>
>>>  ##
>>>  # @BlockdevOptionsFile:
>>> @@ -4329,7 +4331,9 @@
>>>        'vhdx':       'BlockdevOptionsGenericFormat',
>>>        'vmdk':       'BlockdevOptionsGenericCOWFormat',
>>>        'vpc':        'BlockdevOptionsGenericFormat',
>>> -      'vvfat':      'BlockdevOptionsVVFAT'
>>> +      'vvfat':      'BlockdevOptionsVVFAT',
>>> +      'zoned_host_device': { 'type': 'BlockdevOptionsFile',
>>> +                             'if': 'CONFIG_BLKZONED' }
>>>    } }
>>>
>>>  ##
>>> diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
>>> index 952dc940f1..687c3a624c 100644
>>> --- a/qemu-io-cmds.c
>>> +++ b/qemu-io-cmds.c
>>> @@ -1712,6 +1712,144 @@ static const cmdinfo_t flush_cmd = {
>>>      .oneline    = "flush all in-core file state to disk",
>>>  };
>>>
>>> +static int zone_report_f(BlockBackend *blk, int argc, char **argv)
>>> +{
>>> +    int ret;
>>> +    int64_t offset;
>>> +    unsigned int nr_zones;
>>> +
>>> +    ++optind;
>>> +    offset = cvtnum(argv[optind]);
>>> +    ++optind;
>>> +    nr_zones = cvtnum(argv[optind]);
>>> +
>>> +    g_autofree BlockZoneDescriptor *zones = NULL;
>>> +    zones = g_new(BlockZoneDescriptor, nr_zones);
>>> +    ret = blk_zone_report(blk, offset, &nr_zones, zones);
>>> +    if (ret < 0) {
>>> +        printf("zone report failed: %s\n", strerror(-ret));
>>> +    } else {
>>> +        for (int i = 0; i < nr_zones; ++i) {
>>> +            printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", "
>>> +                   "cap"" 0x%" PRIx64 ",wptr 0x%" PRIx64 ", "
>>> +                   "zcond:%u, [type: %u]\n",
>>> +                   zones[i].start, zones[i].length, zones[i].cap, 
>>> zones[i].wp,
>>> +                   zones[i].cond, zones[i].type);
>>> +        }
>>> +    }
>>> +    return ret;
>>> +}
>>> +
>>> +static const cmdinfo_t zone_report_cmd = {
>>> +        .name = "zone_report",
>>> +        .altname = "zrp",
>>> +        .cfunc = zone_report_f,
>>> +        .argmin = 2,
>>> +        .argmax = 2,
>>> +        .args = "offset number",
>>> +        .oneline = "report zone information",
>>> +};
>>> +
>>> +static int zone_open_f(BlockBackend *blk, int argc, char **argv)
>>> +{
>>> +    int ret;
>>> +    int64_t offset, len;
>>> +    ++optind;
>>> +    offset = cvtnum(argv[optind]);
>>> +    ++optind;
>>> +    len = cvtnum(argv[optind]);
>>> +    ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len);
>>> +    if (ret < 0) {
>>> +        printf("zone open failed: %s\n", strerror(-ret));
>>> +    }
>>> +    return ret;
>>> +}
>>> +
>>> +static const cmdinfo_t zone_open_cmd = {
>>> +        .name = "zone_open",
>>> +        .altname = "zo",
>>> +        .cfunc = zone_open_f,
>>> +        .argmin = 2,
>>> +        .argmax = 2,
>>> +        .args = "offset len",
>>> +        .oneline = "explicit open a range of zones in zone block device",
>>> +};
>>> +
>>> +static int zone_close_f(BlockBackend *blk, int argc, char **argv)
>>> +{
>>> +    int ret;
>>> +    int64_t offset, len;
>>> +    ++optind;
>>> +    offset = cvtnum(argv[optind]);
>>> +    ++optind;
>>> +    len = cvtnum(argv[optind]);
>>> +    ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len);
>>> +    if (ret < 0) {
>>> +        printf("zone close failed: %s\n", strerror(-ret));
>>> +    }
>>> +    return ret;
>>> +}
>>> +
>>> +static const cmdinfo_t zone_close_cmd = {
>>> +        .name = "zone_close",
>>> +        .altname = "zc",
>>> +        .cfunc = zone_close_f,
>>> +        .argmin = 2,
>>> +        .argmax = 2,
>>> +        .args = "offset len",
>>> +        .oneline = "close a range of zones in zone block device",
>>> +};
>>> +
>>> +static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
>>> +{
>>> +    int ret;
>>> +    int64_t offset, len;
>>> +    ++optind;
>>> +    offset = cvtnum(argv[optind]);
>>> +    ++optind;
>>> +    len = cvtnum(argv[optind]);
>>> +    ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len);
>>> +    if (ret < 0) {
>>> +        printf("zone finish failed: %s\n", strerror(-ret));
>>> +    }
>>> +    return ret;
>>> +}
>>> +
>>> +static const cmdinfo_t zone_finish_cmd = {
>>> +        .name = "zone_finish",
>>> +        .altname = "zf",
>>> +        .cfunc = zone_finish_f,
>>> +        .argmin = 2,
>>> +        .argmax = 2,
>>> +        .args = "offset len",
>>> +        .oneline = "finish a range of zones in zone block device",
>>> +};
>>> +
>>> +static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
>>> +{
>>> +    int ret;
>>> +    int64_t offset, len;
>>> +    ++optind;
>>> +    offset = cvtnum(argv[optind]);
>>> +    ++optind;
>>> +    len = cvtnum(argv[optind]);
>>> +    ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len);
>>> +    if (ret < 0) {
>>> +        printf("zone reset failed: %s\n", strerror(-ret));
>>> +    }
>>> +    return ret;
>>> +}
>>> +
>>> +static const cmdinfo_t zone_reset_cmd = {
>>> +        .name = "zone_reset",
>>> +        .altname = "zrs",
>>> +        .cfunc = zone_reset_f,
>>> +        .argmin = 2,
>>> +        .argmax = 2,
>>> +        .args = "offset len",
>>> +        .oneline = "reset a zone write pointer in zone block device",
>>> +};
>>> +
>>>  static int truncate_f(BlockBackend *blk, int argc, char **argv);
>>>  static const cmdinfo_t truncate_cmd = {
>>>      .name       = "truncate",
>>> @@ -2504,6 +2642,11 @@ static void __attribute((constructor)) 
>>> init_qemuio_commands(void)
>>>      qemuio_add_command(&aio_write_cmd);
>>>      qemuio_add_command(&aio_flush_cmd);
>>>      qemuio_add_command(&flush_cmd);
>>> +    qemuio_add_command(&zone_report_cmd);
>>> +    qemuio_add_command(&zone_open_cmd);
>>> +    qemuio_add_command(&zone_close_cmd);
>>> +    qemuio_add_command(&zone_finish_cmd);
>>> +    qemuio_add_command(&zone_reset_cmd);
>>>      qemuio_add_command(&truncate_cmd);
>>>      qemuio_add_command(&length_cmd);
>>>      qemuio_add_command(&info_cmd);
>>> --
>>> 2.37.1
>>>


-- 
Damien Le Moal
Western Digital Research



reply via email to

[Prev in Thread] Current Thread [Next in Thread]