From 7a6691f1f89784f775fa0c54be57533445726068 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 18 May 2022 16:37:59 +0300 Subject: [PATCH 01/29] vdpa: Fix error logic in vdpa_nl_cmd_dev_get_doit In vdpa_nl_cmd_dev_get_doit(), if the call to genlmsg_reply() fails we must not call nlmsg_free() since this is done inside genlmsg_reply(). Fix it. Fixes: bc0d90ee021f ("vdpa: Enable user to query vdpa device info") Reviewed-by: Si-Wei Liu Acked-by: Jason Wang Signed-off-by: Eli Cohen Message-Id: <20220518133804.1075129-2-elic@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 2b75c00b1005..fac89a0d8178 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -756,14 +756,19 @@ static int vdpa_nl_cmd_dev_get_doit(struct sk_buff *skb, struct genl_info *info) goto mdev_err; } err = vdpa_dev_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack); - if (!err) - err = genlmsg_reply(msg, info); + if (err) + goto mdev_err; + + err = genlmsg_reply(msg, info); + put_device(dev); + mutex_unlock(&vdpa_dev_mutex); + return err; + mdev_err: put_device(dev); err: mutex_unlock(&vdpa_dev_mutex); - if (err) - nlmsg_free(msg); + nlmsg_free(msg); return err; } From 13b00b135665c92065a27c0c39dd97e0f380bd4f Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 18 May 2022 16:38:00 +0300 Subject: [PATCH 02/29] vdpa: Add support for querying vendor statistics Allows to read vendor statistics of a vdpa device. The specific statistics data are received from the upstream driver in the form of an (attribute name, attribute value) pairs. An example of statistics for mlx5_vdpa device are: received_desc - number of descriptors received by the virtqueue completed_desc - number of descriptors completed by the virtqueue A descriptor using indirect buffers is still counted as 1. In addition, N chained descriptors are counted correctly N times as one would expect. A new callback was added to vdpa_config_ops which provides the means for the vdpa driver to return statistics results. The interface allows for reading all the supported virtqueues, including the control virtqueue if it exists. Below are some examples taken from mlx5_vdpa which are introduced in the following patch: 1. Read statistics for the virtqueue at index 1 $ vdpa dev vstats show vdpa-a qidx 1 vdpa-a: queue_type tx queue_index 1 received_desc 3844836 completed_desc 3844836 2. Read statistics for the virtqueue at index 32 $ vdpa dev vstats show vdpa-a qidx 32 vdpa-a: queue_type control_vq queue_index 32 received_desc 62 completed_desc 62 3. Read statisitics for the virtqueue at index 0 with json output $ vdpa -j dev vstats show vdpa-a qidx 0 {"vstats":{"vdpa-a":{ "queue_type":"rx","queue_index":0,"name":"received_desc","value":417776,\ "name":"completed_desc","value":417548}}} 4. Read statistics for the virtqueue at index 0 with preety json output $ vdpa -jp dev vstats show vdpa-a qidx 0 { "vstats": { "vdpa-a": { "queue_type": "rx", "queue_index": 0, "name": "received_desc", "value": 417776, "name": "completed_desc", "value": 417548 } } } Signed-off-by: Eli Cohen Message-Id: <20220518133804.1075129-3-elic@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa.c | 162 ++++++++++++++++++++++++++++++++++++++ include/linux/vdpa.h | 3 + include/uapi/linux/vdpa.h | 6 ++ 3 files changed, 171 insertions(+) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index fac89a0d8178..31b5eb2c0778 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -914,6 +914,108 @@ out: return err; } +static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg, + struct genl_info *info, u32 index) +{ + struct virtio_net_config config = {}; + u64 features; + u16 max_vqp; + u8 status; + int err; + + status = vdev->config->get_status(vdev); + if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { + NL_SET_ERR_MSG_MOD(info->extack, "feature negotiation not complete"); + return -EAGAIN; + } + vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config)); + + max_vqp = le16_to_cpu(config.max_virtqueue_pairs); + if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, max_vqp)) + return -EMSGSIZE; + + features = vdev->config->get_driver_features(vdev); + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, + features, VDPA_ATTR_PAD)) + return -EMSGSIZE; + + if (nla_put_u32(msg, VDPA_ATTR_DEV_QUEUE_INDEX, index)) + return -EMSGSIZE; + + err = vdev->config->get_vendor_vq_stats(vdev, index, msg, info->extack); + if (err) + return err; + + return 0; +} + +static int vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, + struct genl_info *info, u32 index) +{ + int err; + + mutex_lock(&vdev->cf_mutex); + if (!vdev->config->get_vendor_vq_stats) { + err = -EOPNOTSUPP; + goto out; + } + + err = vdpa_fill_stats_rec(vdev, msg, info, index); +out: + mutex_unlock(&vdev->cf_mutex); + return err; +} + +static int vdpa_dev_vendor_stats_fill(struct vdpa_device *vdev, + struct sk_buff *msg, + struct genl_info *info, u32 index) +{ + u32 device_id; + void *hdr; + int err; + u32 portid = info->snd_portid; + u32 seq = info->snd_seq; + u32 flags = 0; + + hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, + VDPA_CMD_DEV_VSTATS_GET); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { + err = -EMSGSIZE; + goto undo_msg; + } + + device_id = vdev->config->get_device_id(vdev); + if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { + err = -EMSGSIZE; + goto undo_msg; + } + + switch (device_id) { + case VIRTIO_ID_NET: + if (index > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) { + NL_SET_ERR_MSG_MOD(info->extack, "queue index excceeds max value"); + err = -ERANGE; + break; + } + + err = vendor_stats_fill(vdev, msg, info, index); + break; + default: + err = -EOPNOTSUPP; + break; + } + genlmsg_end(msg, hdr); + + return err; + +undo_msg: + genlmsg_cancel(msg, hdr); + return err; +} + static int vdpa_nl_cmd_dev_config_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; @@ -995,6 +1097,60 @@ vdpa_nl_cmd_dev_config_get_dumpit(struct sk_buff *msg, struct netlink_callback * return msg->len; } +static int vdpa_nl_cmd_dev_stats_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct vdpa_device *vdev; + struct sk_buff *msg; + const char *devname; + struct device *dev; + u32 index; + int err; + + if (!info->attrs[VDPA_ATTR_DEV_NAME]) + return -EINVAL; + + if (!info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]) + return -EINVAL; + + devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + index = nla_get_u32(info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]); + mutex_lock(&vdpa_dev_mutex); + dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); + if (!dev) { + NL_SET_ERR_MSG_MOD(info->extack, "device not found"); + err = -ENODEV; + goto dev_err; + } + vdev = container_of(dev, struct vdpa_device, dev); + if (!vdev->mdev) { + NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); + err = -EINVAL; + goto mdev_err; + } + err = vdpa_dev_vendor_stats_fill(vdev, msg, info, index); + if (err) + goto mdev_err; + + err = genlmsg_reply(msg, info); + + put_device(dev); + mutex_unlock(&vdpa_dev_mutex); + + return err; + +mdev_err: + put_device(dev); +dev_err: + nlmsg_free(msg); + mutex_unlock(&vdpa_dev_mutex); + return err; +} + static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = { [VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING }, [VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING }, @@ -1035,6 +1191,12 @@ static const struct genl_ops vdpa_nl_ops[] = { .doit = vdpa_nl_cmd_dev_config_get_doit, .dumpit = vdpa_nl_cmd_dev_config_get_dumpit, }, + { + .cmd = VDPA_CMD_DEV_VSTATS_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = vdpa_nl_cmd_dev_stats_get_doit, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family vdpa_nl_family __ro_after_init = { diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 8943a209202e..2ae8443331e1 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -276,6 +276,9 @@ struct vdpa_config_ops { const struct vdpa_vq_state *state); int (*get_vq_state)(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state); + int (*get_vendor_vq_stats)(struct vdpa_device *vdev, u16 idx, + struct sk_buff *msg, + struct netlink_ext_ack *extack); struct vdpa_notification_area (*get_vq_notification)(struct vdpa_device *vdev, u16 idx); /* vq irq is not expected to be changed once DRIVER_OK is set */ diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h index 1061d8d2d09d..25c55cab3d7c 100644 --- a/include/uapi/linux/vdpa.h +++ b/include/uapi/linux/vdpa.h @@ -18,6 +18,7 @@ enum vdpa_command { VDPA_CMD_DEV_DEL, VDPA_CMD_DEV_GET, /* can dump */ VDPA_CMD_DEV_CONFIG_GET, /* can dump */ + VDPA_CMD_DEV_VSTATS_GET, }; enum vdpa_attr { @@ -46,6 +47,11 @@ enum vdpa_attr { VDPA_ATTR_DEV_NEGOTIATED_FEATURES, /* u64 */ VDPA_ATTR_DEV_MGMTDEV_MAX_VQS, /* u32 */ VDPA_ATTR_DEV_SUPPORTED_FEATURES, /* u64 */ + + VDPA_ATTR_DEV_QUEUE_INDEX, /* u32 */ + VDPA_ATTR_DEV_VENDOR_ATTR_NAME, /* string */ + VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, /* u64 */ + /* new attributes must be added above here */ VDPA_ATTR_MAX, }; From 0078ad905dc8eada34461312a0060b0904c57e2d Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 18 May 2022 16:38:01 +0300 Subject: [PATCH 03/29] net/vdpa: Use readers/writers semaphore instead of vdpa_dev_mutex Use rw_semaphore instead of mutex to control access to vdpa devices. This can be especially beneficial in case processes poll on statistics information. Suggested-by: Si-Wei Liu Reviewed-by: Si-Wei Liu Acked-by: Jason Wang Signed-off-by: Eli Cohen Message-Id: <20220518133804.1075129-4-elic@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa.c | 64 ++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 31b5eb2c0778..2ff7de5e6b2f 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -18,7 +18,7 @@ static LIST_HEAD(mdev_head); /* A global mutex that protects vdpa management device and device level operations. */ -static DEFINE_MUTEX(vdpa_dev_mutex); +static DECLARE_RWSEM(vdpa_dev_lock); static DEFINE_IDA(vdpa_index_ida); void vdpa_set_status(struct vdpa_device *vdev, u8 status) @@ -238,7 +238,7 @@ static int __vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) vdev->nvqs = nvqs; - lockdep_assert_held(&vdpa_dev_mutex); + lockdep_assert_held(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match); if (dev) { put_device(dev); @@ -278,9 +278,9 @@ int vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { int err; - mutex_lock(&vdpa_dev_mutex); + down_write(&vdpa_dev_lock); err = __vdpa_register_device(vdev, nvqs); - mutex_unlock(&vdpa_dev_mutex); + up_write(&vdpa_dev_lock); return err; } EXPORT_SYMBOL_GPL(vdpa_register_device); @@ -293,7 +293,7 @@ EXPORT_SYMBOL_GPL(vdpa_register_device); */ void _vdpa_unregister_device(struct vdpa_device *vdev) { - lockdep_assert_held(&vdpa_dev_mutex); + lockdep_assert_held(&vdpa_dev_lock); WARN_ON(!vdev->mdev); device_unregister(&vdev->dev); } @@ -305,9 +305,9 @@ EXPORT_SYMBOL_GPL(_vdpa_unregister_device); */ void vdpa_unregister_device(struct vdpa_device *vdev) { - mutex_lock(&vdpa_dev_mutex); + down_write(&vdpa_dev_lock); device_unregister(&vdev->dev); - mutex_unlock(&vdpa_dev_mutex); + up_write(&vdpa_dev_lock); } EXPORT_SYMBOL_GPL(vdpa_unregister_device); @@ -352,9 +352,9 @@ int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev) return -EINVAL; INIT_LIST_HEAD(&mdev->list); - mutex_lock(&vdpa_dev_mutex); + down_write(&vdpa_dev_lock); list_add_tail(&mdev->list, &mdev_head); - mutex_unlock(&vdpa_dev_mutex); + up_write(&vdpa_dev_lock); return 0; } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register); @@ -371,14 +371,14 @@ static int vdpa_match_remove(struct device *dev, void *data) void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev) { - mutex_lock(&vdpa_dev_mutex); + down_write(&vdpa_dev_lock); list_del(&mdev->list); /* Filter out all the entries belong to this management device and delete it. */ bus_for_each_dev(&vdpa_bus, NULL, mdev, vdpa_match_remove); - mutex_unlock(&vdpa_dev_mutex); + up_write(&vdpa_dev_lock); } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister); @@ -532,17 +532,17 @@ static int vdpa_nl_cmd_mgmtdev_get_doit(struct sk_buff *skb, struct genl_info *i if (!msg) return -ENOMEM; - mutex_lock(&vdpa_dev_mutex); + down_read(&vdpa_dev_lock); mdev = vdpa_mgmtdev_get_from_attr(info->attrs); if (IS_ERR(mdev)) { - mutex_unlock(&vdpa_dev_mutex); + up_read(&vdpa_dev_lock); NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified mgmt device"); err = PTR_ERR(mdev); goto out; } err = vdpa_mgmtdev_fill(mdev, msg, info->snd_portid, info->snd_seq, 0); - mutex_unlock(&vdpa_dev_mutex); + up_read(&vdpa_dev_lock); if (err) goto out; err = genlmsg_reply(msg, info); @@ -561,7 +561,7 @@ vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) int idx = 0; int err; - mutex_lock(&vdpa_dev_mutex); + down_read(&vdpa_dev_lock); list_for_each_entry(mdev, &mdev_head, list) { if (idx < start) { idx++; @@ -574,7 +574,7 @@ vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) idx++; } out: - mutex_unlock(&vdpa_dev_mutex); + up_read(&vdpa_dev_lock); cb->args[0] = idx; return msg->len; } @@ -627,7 +627,7 @@ static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *i !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - mutex_lock(&vdpa_dev_mutex); + down_write(&vdpa_dev_lock); mdev = vdpa_mgmtdev_get_from_attr(info->attrs); if (IS_ERR(mdev)) { NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified management device"); @@ -643,7 +643,7 @@ static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *i err = mdev->ops->dev_add(mdev, name, &config); err: - mutex_unlock(&vdpa_dev_mutex); + up_write(&vdpa_dev_lock); return err; } @@ -659,7 +659,7 @@ static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *i return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); - mutex_lock(&vdpa_dev_mutex); + down_write(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); @@ -677,7 +677,7 @@ static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *i mdev_err: put_device(dev); dev_err: - mutex_unlock(&vdpa_dev_mutex); + up_write(&vdpa_dev_lock); return err; } @@ -743,7 +743,7 @@ static int vdpa_nl_cmd_dev_get_doit(struct sk_buff *skb, struct genl_info *info) if (!msg) return -ENOMEM; - mutex_lock(&vdpa_dev_mutex); + down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); @@ -761,13 +761,13 @@ static int vdpa_nl_cmd_dev_get_doit(struct sk_buff *skb, struct genl_info *info) err = genlmsg_reply(msg, info); put_device(dev); - mutex_unlock(&vdpa_dev_mutex); + up_read(&vdpa_dev_lock); return err; mdev_err: put_device(dev); err: - mutex_unlock(&vdpa_dev_mutex); + up_read(&vdpa_dev_lock); nlmsg_free(msg); return err; } @@ -809,9 +809,9 @@ static int vdpa_nl_cmd_dev_get_dumpit(struct sk_buff *msg, struct netlink_callba info.start_idx = cb->args[0]; info.idx = 0; - mutex_lock(&vdpa_dev_mutex); + down_read(&vdpa_dev_lock); bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_dump); - mutex_unlock(&vdpa_dev_mutex); + up_read(&vdpa_dev_lock); cb->args[0] = info.idx; return msg->len; } @@ -1031,7 +1031,7 @@ static int vdpa_nl_cmd_dev_config_get_doit(struct sk_buff *skb, struct genl_info if (!msg) return -ENOMEM; - mutex_lock(&vdpa_dev_mutex); + down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); @@ -1052,7 +1052,7 @@ static int vdpa_nl_cmd_dev_config_get_doit(struct sk_buff *skb, struct genl_info mdev_err: put_device(dev); dev_err: - mutex_unlock(&vdpa_dev_mutex); + up_read(&vdpa_dev_lock); if (err) nlmsg_free(msg); return err; @@ -1090,9 +1090,9 @@ vdpa_nl_cmd_dev_config_get_dumpit(struct sk_buff *msg, struct netlink_callback * info.start_idx = cb->args[0]; info.idx = 0; - mutex_lock(&vdpa_dev_mutex); + down_read(&vdpa_dev_lock); bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_config_dump); - mutex_unlock(&vdpa_dev_mutex); + up_read(&vdpa_dev_lock); cb->args[0] = info.idx; return msg->len; } @@ -1119,7 +1119,7 @@ static int vdpa_nl_cmd_dev_stats_get_doit(struct sk_buff *skb, return -ENOMEM; index = nla_get_u32(info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]); - mutex_lock(&vdpa_dev_mutex); + down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); @@ -1139,7 +1139,7 @@ static int vdpa_nl_cmd_dev_stats_get_doit(struct sk_buff *skb, err = genlmsg_reply(msg, info); put_device(dev); - mutex_unlock(&vdpa_dev_mutex); + up_read(&vdpa_dev_lock); return err; @@ -1147,7 +1147,7 @@ mdev_err: put_device(dev); dev_err: nlmsg_free(msg); - mutex_unlock(&vdpa_dev_mutex); + up_read(&vdpa_dev_lock); return err; } From a6a51adc6e8aafebfe0c4beb80e99694ea562b40 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 18 May 2022 16:38:02 +0300 Subject: [PATCH 04/29] net/vdpa: Use readers/writers semaphore instead of cf_mutex Replace cf_mutex with rw_semaphore to reflect the fact that some calls could be called concurrently but can suffice with read lock. Suggested-by: Si-Wei Liu Signed-off-by: Eli Cohen Message-Id: <20220518133804.1075129-5-elic@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa.c | 25 ++++++++++++------------- include/linux/vdpa.h | 12 ++++++------ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 2ff7de5e6b2f..9d3534a0bc5f 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -23,9 +23,9 @@ static DEFINE_IDA(vdpa_index_ida); void vdpa_set_status(struct vdpa_device *vdev, u8 status) { - mutex_lock(&vdev->cf_mutex); + down_write(&vdev->cf_lock); vdev->config->set_status(vdev, status); - mutex_unlock(&vdev->cf_mutex); + up_write(&vdev->cf_lock); } EXPORT_SYMBOL(vdpa_set_status); @@ -148,7 +148,6 @@ static void vdpa_release_dev(struct device *d) ops->free(vdev); ida_simple_remove(&vdpa_index_ida, vdev->index); - mutex_destroy(&vdev->cf_mutex); kfree(vdev->driver_override); kfree(vdev); } @@ -211,7 +210,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, if (err) goto err_name; - mutex_init(&vdev->cf_mutex); + init_rwsem(&vdev->cf_lock); device_initialize(&vdev->dev); return vdev; @@ -407,9 +406,9 @@ static void vdpa_get_config_unlocked(struct vdpa_device *vdev, void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len) { - mutex_lock(&vdev->cf_mutex); + down_read(&vdev->cf_lock); vdpa_get_config_unlocked(vdev, offset, buf, len); - mutex_unlock(&vdev->cf_mutex); + up_read(&vdev->cf_lock); } EXPORT_SYMBOL_GPL(vdpa_get_config); @@ -423,9 +422,9 @@ EXPORT_SYMBOL_GPL(vdpa_get_config); void vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf, unsigned int length) { - mutex_lock(&vdev->cf_mutex); + down_write(&vdev->cf_lock); vdev->config->set_config(vdev, offset, buf, length); - mutex_unlock(&vdev->cf_mutex); + up_write(&vdev->cf_lock); } EXPORT_SYMBOL_GPL(vdpa_set_config); @@ -866,7 +865,7 @@ vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u8 status; int err; - mutex_lock(&vdev->cf_mutex); + down_read(&vdev->cf_lock); status = vdev->config->get_status(vdev); if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { NL_SET_ERR_MSG_MOD(extack, "Features negotiation not completed"); @@ -903,14 +902,14 @@ vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, if (err) goto msg_err; - mutex_unlock(&vdev->cf_mutex); + up_read(&vdev->cf_lock); genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); out: - mutex_unlock(&vdev->cf_mutex); + up_read(&vdev->cf_lock); return err; } @@ -954,7 +953,7 @@ static int vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, { int err; - mutex_lock(&vdev->cf_mutex); + down_read(&vdev->cf_lock); if (!vdev->config->get_vendor_vq_stats) { err = -EOPNOTSUPP; goto out; @@ -962,7 +961,7 @@ static int vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, err = vdpa_fill_stats_rec(vdev, msg, info, index); out: - mutex_unlock(&vdev->cf_mutex); + up_read(&vdev->cf_lock); return err; } diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 2ae8443331e1..2cb14847831e 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -66,7 +66,7 @@ struct vdpa_mgmt_dev; * @dma_dev: the actual device that is performing DMA * @driver_override: driver name to force a match * @config: the configuration ops for this device. - * @cf_mutex: Protects get and set access to configuration layout. + * @cf_lock: Protects get and set access to configuration layout. * @index: device index * @features_valid: were features initialized? for legacy guests * @use_va: indicate whether virtual address must be used by this device @@ -79,7 +79,7 @@ struct vdpa_device { struct device *dma_dev; const char *driver_override; const struct vdpa_config_ops *config; - struct mutex cf_mutex; /* Protects get/set config */ + struct rw_semaphore cf_lock; /* Protects get/set config */ unsigned int index; bool features_valid; bool use_va; @@ -398,10 +398,10 @@ static inline int vdpa_reset(struct vdpa_device *vdev) const struct vdpa_config_ops *ops = vdev->config; int ret; - mutex_lock(&vdev->cf_mutex); + down_write(&vdev->cf_lock); vdev->features_valid = false; ret = ops->reset(vdev); - mutex_unlock(&vdev->cf_mutex); + up_write(&vdev->cf_lock); return ret; } @@ -420,9 +420,9 @@ static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) { int ret; - mutex_lock(&vdev->cf_mutex); + down_write(&vdev->cf_lock); ret = vdpa_set_features_unlocked(vdev, features); - mutex_unlock(&vdev->cf_mutex); + up_write(&vdev->cf_lock); return ret; } From 1892a3d425bf525ac98d6d3534035e6ed2bfab50 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 18 May 2022 16:38:03 +0300 Subject: [PATCH 05/29] vdpa/mlx5: Add support for reading descriptor statistics Implement the get_vq_stats calback of vdpa_config_ops to return the statistics for a virtqueue. The statistics are provided as vendor specific statistics where the driver provides a pair of attribute name and attribute value. Currently supported are received descriptors and completed descriptors. Signed-off-by: Eli Cohen Message-Id: <20220518133804.1075129-6-elic@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 2 + drivers/vdpa/mlx5/net/mlx5_vnet.c | 149 +++++++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 1 + include/linux/mlx5/mlx5_ifc_vdpa.h | 39 ++++++++ 4 files changed, 191 insertions(+) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index daaf7b503677..44104093163b 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -61,6 +61,8 @@ struct mlx5_control_vq { struct vringh_kiov riov; struct vringh_kiov wiov; unsigned short head; + unsigned int received_desc; + unsigned int completed_desc; }; struct mlx5_vdpa_wq_ent { diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index e0de44000d92..2b815ef850c8 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -119,6 +119,7 @@ struct mlx5_vdpa_virtqueue { struct mlx5_vdpa_umem umem2; struct mlx5_vdpa_umem umem3; + u32 counter_set_id; bool initialized; int index; u32 virtq_id; @@ -818,6 +819,12 @@ static u16 get_features_12_3(u64 features) (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_CSUM)) << 6); } +static bool counters_supported(const struct mlx5_vdpa_dev *mvdev) +{ + return MLX5_CAP_GEN_64(mvdev->mdev, general_obj_types) & + BIT_ULL(MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS); +} + static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) { int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in); @@ -872,6 +879,8 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id); MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size); MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn); + if (counters_supported(&ndev->mvdev)) + MLX5_SET(virtio_q, vq_ctx, counter_set_id, mvq->counter_set_id); err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); if (err) @@ -1135,6 +1144,47 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque return err; } +static int counter_set_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + u32 in[MLX5_ST_SZ_DW(create_virtio_q_counters_in)] = {}; + u32 out[MLX5_ST_SZ_DW(create_virtio_q_counters_out)] = {}; + void *cmd_hdr; + int err; + + if (!counters_supported(&ndev->mvdev)) + return 0; + + cmd_hdr = MLX5_ADDR_OF(create_virtio_q_counters_in, in, hdr); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); + + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + mvq->counter_set_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + + return 0; +} + +static void counter_set_dealloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + u32 in[MLX5_ST_SZ_DW(destroy_virtio_q_counters_in)] = {}; + u32 out[MLX5_ST_SZ_DW(destroy_virtio_q_counters_out)] = {}; + + if (!counters_supported(&ndev->mvdev)) + return; + + MLX5_SET(destroy_virtio_q_counters_in, in, hdr.opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_id, mvq->counter_set_id); + MLX5_SET(destroy_virtio_q_counters_in, in, hdr.uid, ndev->mvdev.res.uid); + MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS); + if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) + mlx5_vdpa_warn(&ndev->mvdev, "dealloc counter set 0x%x\n", mvq->counter_set_id); +} + static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) { u16 idx = mvq->index; @@ -1162,6 +1212,10 @@ static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) if (err) goto err_connect; + err = counter_set_alloc(ndev, mvq); + if (err) + goto err_counter; + err = create_virtqueue(ndev, mvq); if (err) goto err_connect; @@ -1179,6 +1233,8 @@ static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) return 0; err_connect: + counter_set_dealloc(ndev, mvq); +err_counter: qp_destroy(ndev, &mvq->vqqp); err_vqqp: qp_destroy(ndev, &mvq->fwqp); @@ -1223,6 +1279,7 @@ static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue * suspend_vq(ndev, mvq); destroy_virtqueue(ndev, mvq); + counter_set_dealloc(ndev, mvq); qp_destroy(ndev, &mvq->vqqp); qp_destroy(ndev, &mvq->fwqp); cq_destroy(ndev, mvq->index); @@ -1659,6 +1716,7 @@ static void mlx5_cvq_kick_handler(struct work_struct *work) if (read != sizeof(ctrl)) break; + cvq->received_desc++; switch (ctrl.class) { case VIRTIO_NET_CTRL_MAC: status = handle_ctrl_mac(mvdev, ctrl.cmd); @@ -1682,6 +1740,7 @@ static void mlx5_cvq_kick_handler(struct work_struct *work) if (vringh_need_notify_iotlb(&cvq->vring)) vringh_notify(&cvq->vring); + cvq->completed_desc++; queue_work(mvdev->wq, &wqent->work); break; } @@ -2303,6 +2362,8 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev) mlx5_vdpa_destroy_mr(&ndev->mvdev); ndev->mvdev.status = 0; ndev->cur_num_vqs = 0; + ndev->mvdev.cvq.received_desc = 0; + ndev->mvdev.cvq.completed_desc = 0; memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1)); ndev->mvdev.actual_features = 0; ++mvdev->generation; @@ -2422,6 +2483,93 @@ static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev) return mvdev->actual_features; } +static int counter_set_query(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, + u64 *received_desc, u64 *completed_desc) +{ + u32 in[MLX5_ST_SZ_DW(query_virtio_q_counters_in)] = {}; + u32 out[MLX5_ST_SZ_DW(query_virtio_q_counters_out)] = {}; + void *cmd_hdr; + void *ctx; + int err; + + if (!counters_supported(&ndev->mvdev)) + return -EOPNOTSUPP; + + if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) + return -EAGAIN; + + cmd_hdr = MLX5_ADDR_OF(query_virtio_q_counters_in, in, hdr); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->counter_set_id); + + err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + ctx = MLX5_ADDR_OF(query_virtio_q_counters_out, out, counters); + *received_desc = MLX5_GET64(virtio_q_counters, ctx, received_desc); + *completed_desc = MLX5_GET64(virtio_q_counters, ctx, completed_desc); + return 0; +} + +static int mlx5_vdpa_get_vendor_vq_stats(struct vdpa_device *vdev, u16 idx, + struct sk_buff *msg, + struct netlink_ext_ack *extack) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_vdpa_virtqueue *mvq; + struct mlx5_control_vq *cvq; + u64 received_desc; + u64 completed_desc; + int err = 0; + + mutex_lock(&ndev->reslock); + if (!is_index_valid(mvdev, idx)) { + NL_SET_ERR_MSG_MOD(extack, "virtqueue index is not valid"); + err = -EINVAL; + goto out_err; + } + + if (idx == ctrl_vq_idx(mvdev)) { + cvq = &mvdev->cvq; + received_desc = cvq->received_desc; + completed_desc = cvq->completed_desc; + goto out; + } + + mvq = &ndev->vqs[idx]; + err = counter_set_query(ndev, mvq, &received_desc, &completed_desc); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "failed to query hardware"); + goto out_err; + } + +out: + err = -EMSGSIZE; + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "received_desc")) + goto out_err; + + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, received_desc, + VDPA_ATTR_PAD)) + goto out_err; + + if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "completed_desc")) + goto out_err; + + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, completed_desc, + VDPA_ATTR_PAD)) + goto out_err; + + err = 0; +out_err: + mutex_unlock(&ndev->reslock); + return err; +} + static const struct vdpa_config_ops mlx5_vdpa_ops = { .set_vq_address = mlx5_vdpa_set_vq_address, .set_vq_num = mlx5_vdpa_set_vq_num, @@ -2431,6 +2579,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .get_vq_ready = mlx5_vdpa_get_vq_ready, .set_vq_state = mlx5_vdpa_set_vq_state, .get_vq_state = mlx5_vdpa_get_vq_state, + .get_vendor_vq_stats = mlx5_vdpa_get_vendor_vq_stats, .get_vq_notification = mlx5_get_vq_notification, .get_vq_irq = mlx5_get_vq_irq, .get_vq_align = mlx5_vdpa_get_vq_align, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 78b3d3465dd7..2a8334bb5f82 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -87,6 +87,7 @@ enum { enum { MLX5_OBJ_TYPE_GENEVE_TLV_OPT = 0x000b, MLX5_OBJ_TYPE_VIRTIO_NET_Q = 0x000d, + MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS = 0x001c, MLX5_OBJ_TYPE_MATCH_DEFINER = 0x0018, MLX5_OBJ_TYPE_MKEY = 0xff01, MLX5_OBJ_TYPE_QP = 0xff02, diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h index 1a9c9d94cb59..4414ed5b6ed2 100644 --- a/include/linux/mlx5/mlx5_ifc_vdpa.h +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h @@ -165,4 +165,43 @@ struct mlx5_ifc_modify_virtio_net_q_out_bits { struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; }; +struct mlx5_ifc_virtio_q_counters_bits { + u8 modify_field_select[0x40]; + u8 reserved_at_40[0x40]; + u8 received_desc[0x40]; + u8 completed_desc[0x40]; + u8 error_cqes[0x20]; + u8 bad_desc_errors[0x20]; + u8 exceed_max_chain[0x20]; + u8 invalid_buffer[0x20]; + u8 reserved_at_180[0x280]; +}; + +struct mlx5_ifc_create_virtio_q_counters_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_virtio_q_counters_bits virtio_q_counters; +}; + +struct mlx5_ifc_create_virtio_q_counters_out_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_virtio_q_counters_bits virtio_q_counters; +}; + +struct mlx5_ifc_destroy_virtio_q_counters_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; +}; + +struct mlx5_ifc_destroy_virtio_q_counters_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr; +}; + +struct mlx5_ifc_query_virtio_q_counters_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; +}; + +struct mlx5_ifc_query_virtio_q_counters_out_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_virtio_q_counters_bits counters; +}; + #endif /* __MLX5_IFC_VDPA_H_ */ From 759ae7f9bf1e6b7f5c9c197d7207e2be1dfd74b1 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 18 May 2022 16:38:04 +0300 Subject: [PATCH 06/29] vdpa/mlx5: Use readers/writers semaphore instead of mutex Reading statistics could be done intensively and by several processes concurrently. Reader's lock is sufficient in this case. Change reslock from mutex to a rwsem. Suggested-by: Si-Wei Liu Signed-off-by: Eli Cohen Message-Id: <20220518133804.1075129-7-elic@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 41 ++++++++++++++----------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 2b815ef850c8..57cfc64248b7 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -155,7 +155,7 @@ struct mlx5_vdpa_net { * since memory map might change and we need to destroy and create * resources while driver in operational. */ - struct mutex reslock; + struct rw_semaphore reslock; struct mlx5_flow_table *rxft; struct mlx5_fc *rx_counter; struct mlx5_flow_handle *rx_rule_ucast; @@ -1695,7 +1695,7 @@ static void mlx5_cvq_kick_handler(struct work_struct *work) ndev = to_mlx5_vdpa_ndev(mvdev); cvq = &mvdev->cvq; - mutex_lock(&ndev->reslock); + down_write(&ndev->reslock); if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) goto out; @@ -1746,7 +1746,7 @@ static void mlx5_cvq_kick_handler(struct work_struct *work) } out: - mutex_unlock(&ndev->reslock); + up_write(&ndev->reslock); } static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx) @@ -2244,7 +2244,7 @@ static int setup_driver(struct mlx5_vdpa_dev *mvdev) struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); int err; - WARN_ON(!mutex_is_locked(&ndev->reslock)); + WARN_ON(!rwsem_is_locked(&ndev->reslock)); if (ndev->setup) { mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n"); @@ -2292,7 +2292,7 @@ out: static void teardown_driver(struct mlx5_vdpa_net *ndev) { - WARN_ON(!mutex_is_locked(&ndev->reslock)); + WARN_ON(!rwsem_is_locked(&ndev->reslock)); if (!ndev->setup) return; @@ -2322,7 +2322,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) print_status(mvdev, status, true); - mutex_lock(&ndev->reslock); + down_write(&ndev->reslock); if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) { if (status & VIRTIO_CONFIG_S_DRIVER_OK) { @@ -2338,14 +2338,14 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) } ndev->mvdev.status = status; - mutex_unlock(&ndev->reslock); + up_write(&ndev->reslock); return; err_setup: mlx5_vdpa_destroy_mr(&ndev->mvdev); ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED; err_clear: - mutex_unlock(&ndev->reslock); + up_write(&ndev->reslock); } static int mlx5_vdpa_reset(struct vdpa_device *vdev) @@ -2356,7 +2356,7 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev) print_status(mvdev, 0, true); mlx5_vdpa_info(mvdev, "performing device reset\n"); - mutex_lock(&ndev->reslock); + down_write(&ndev->reslock); teardown_driver(ndev); clear_vqs_ready(ndev); mlx5_vdpa_destroy_mr(&ndev->mvdev); @@ -2371,7 +2371,7 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev) if (mlx5_vdpa_create_mr(mvdev, NULL)) mlx5_vdpa_warn(mvdev, "create MR failed\n"); } - mutex_unlock(&ndev->reslock); + up_write(&ndev->reslock); return 0; } @@ -2411,7 +2411,7 @@ static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb bool change_map; int err; - mutex_lock(&ndev->reslock); + down_write(&ndev->reslock); err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map); if (err) { @@ -2423,7 +2423,7 @@ static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb err = mlx5_vdpa_change_map(mvdev, iotlb); err: - mutex_unlock(&ndev->reslock); + up_write(&ndev->reslock); return err; } @@ -2442,7 +2442,6 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev) mlx5_mpfs_del_mac(pfmdev, ndev->config.mac); } mlx5_vdpa_free_resources(&ndev->mvdev); - mutex_destroy(&ndev->reslock); kfree(ndev->event_cbs); kfree(ndev->vqs); } @@ -2527,7 +2526,7 @@ static int mlx5_vdpa_get_vendor_vq_stats(struct vdpa_device *vdev, u16 idx, u64 completed_desc; int err = 0; - mutex_lock(&ndev->reslock); + down_read(&ndev->reslock); if (!is_index_valid(mvdev, idx)) { NL_SET_ERR_MSG_MOD(extack, "virtqueue index is not valid"); err = -EINVAL; @@ -2566,7 +2565,7 @@ out: err = 0; out_err: - mutex_unlock(&ndev->reslock); + up_read(&ndev->reslock); return err; } @@ -2835,18 +2834,18 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, } init_mvqs(ndev); - mutex_init(&ndev->reslock); + init_rwsem(&ndev->reslock); config = &ndev->config; if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) { err = config_func_mtu(mdev, add_config->net.mtu); if (err) - goto err_mtu; + goto err_alloc; } err = query_mtu(mdev, &mtu); if (err) - goto err_mtu; + goto err_alloc; ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu); @@ -2860,14 +2859,14 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, } else { err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac); if (err) - goto err_mtu; + goto err_alloc; } if (!is_zero_ether_addr(config->mac)) { pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev)); err = mlx5_mpfs_add_mac(pfmdev, config->mac); if (err) - goto err_mtu; + goto err_alloc; ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC); } @@ -2917,8 +2916,6 @@ err_res: err_mpfs: if (!is_zero_ether_addr(config->mac)) mlx5_mpfs_del_mac(pfmdev, config->mac); -err_mtu: - mutex_destroy(&ndev->reslock); err_alloc: put_device(&mvdev->vdev.dev); return err; From 4e0400525691d0e676dbe002641f9a61261f1e1b Mon Sep 17 00:00:00 2001 From: Suwan Kim Date: Thu, 7 Apr 2022 00:32:06 +0900 Subject: [PATCH 07/29] virtio-blk: support polling I/O This patch supports polling I/O via virtio-blk driver. Polling feature is enabled by module parameter "poll_queues" and it sets dedicated polling queues for virtio-blk. This patch improves the polling I/O throughput and latency. The virtio-blk driver doesn't not have a poll function and a poll queue and it has been operating in interrupt driven method even if the polling function is called in the upper layer. virtio-blk polling is implemented upon 'batched completion' of block layer. virtblk_poll() queues completed request to io_comp_batch->req_list and later, virtblk_complete_batch() calls unmap function and ends the requests in batch. virtio-blk reads the number of poll queues from module parameter "poll_queues". If VM sets queue parameter as below, ("num-queues=N" [QEMU property], "poll_queues=M" [module parameter]) It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)] as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default queues, the poll queues have no callback function. Regarding HW-SW queue mapping, the default queue mapping uses the existing method that condsiders MSI irq vector. But the poll queue doesn't have an irq, so it uses the regular blk-mq cpu mapping. For verifying the improvement, I did Fio polling I/O performance test with io_uring engine with the options below. (io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N) I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll queues for VM. As a result, IOPS and average latency improved about 10%. Test result: - Fio io_uring poll without virtio-blk poll support -- numjobs=1 : IOPS = 339K, avg latency = 188.33us -- numjobs=2 : IOPS = 367K, avg latency = 347.33us -- numjobs=4 : IOPS = 383K, avg latency = 682.06us - Fio io_uring poll with virtio-blk poll support -- numjobs=1 : IOPS = 385K, avg latency = 165.94us -- numjobs=2 : IOPS = 408K, avg latency = 313.28us -- numjobs=4 : IOPS = 424K, avg latency = 613.05us Reviewed-by: Stefan Hajnoczi Reviewed-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Suwan Kim Message-Id: <20220406153207.163134-2-suwan.kim027@gmail.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Chaitanya Kulkarni --- drivers/block/virtio_blk.c | 106 +++++++++++++++++++++++++++++++++++-- 1 file changed, 103 insertions(+), 3 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index d624cc8eddc3..ad5f9ce8f3f9 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -37,6 +37,10 @@ MODULE_PARM_DESC(num_request_queues, "0 for no limit. " "Values > nr_cpu_ids truncated to nr_cpu_ids."); +static unsigned int poll_queues; +module_param(poll_queues, uint, 0644); +MODULE_PARM_DESC(poll_queues, "The number of dedicated virtqueues for polling I/O"); + static int major; static DEFINE_IDA(vd_index_ida); @@ -74,6 +78,7 @@ struct virtio_blk { /* num of vqs */ int num_vqs; + int io_queues[HCTX_MAX_TYPES]; struct virtio_blk_vq *vqs; }; @@ -512,6 +517,7 @@ static int init_vq(struct virtio_blk *vblk) const char **names; struct virtqueue **vqs; unsigned short num_vqs; + unsigned int num_poll_vqs; struct virtio_device *vdev = vblk->vdev; struct irq_affinity desc = { 0, }; @@ -520,6 +526,7 @@ static int init_vq(struct virtio_blk *vblk) &num_vqs); if (err) num_vqs = 1; + if (!err && !num_vqs) { dev_err(&vdev->dev, "MQ advertised but zero queues reported\n"); return -EINVAL; @@ -529,6 +536,17 @@ static int init_vq(struct virtio_blk *vblk) min_not_zero(num_request_queues, nr_cpu_ids), num_vqs); + num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1); + + vblk->io_queues[HCTX_TYPE_DEFAULT] = num_vqs - num_poll_vqs; + vblk->io_queues[HCTX_TYPE_READ] = 0; + vblk->io_queues[HCTX_TYPE_POLL] = num_poll_vqs; + + dev_info(&vdev->dev, "%d/%d/%d default/read/poll queues\n", + vblk->io_queues[HCTX_TYPE_DEFAULT], + vblk->io_queues[HCTX_TYPE_READ], + vblk->io_queues[HCTX_TYPE_POLL]); + vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL); if (!vblk->vqs) return -ENOMEM; @@ -541,12 +559,18 @@ static int init_vq(struct virtio_blk *vblk) goto out; } - for (i = 0; i < num_vqs; i++) { + for (i = 0; i < num_vqs - num_poll_vqs; i++) { callbacks[i] = virtblk_done; snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i); names[i] = vblk->vqs[i].name; } + for (; i < num_vqs; i++) { + callbacks[i] = NULL; + snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i); + names[i] = vblk->vqs[i].name; + } + /* Discover virtqueues and write information to configuration. */ err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc); if (err) @@ -692,16 +716,89 @@ static const struct attribute_group *virtblk_attr_groups[] = { static int virtblk_map_queues(struct blk_mq_tag_set *set) { struct virtio_blk *vblk = set->driver_data; + int i, qoff; - return blk_mq_virtio_map_queues(&set->map[HCTX_TYPE_DEFAULT], - vblk->vdev, 0); + for (i = 0, qoff = 0; i < set->nr_maps; i++) { + struct blk_mq_queue_map *map = &set->map[i]; + + map->nr_queues = vblk->io_queues[i]; + map->queue_offset = qoff; + qoff += map->nr_queues; + + if (map->nr_queues == 0) + continue; + + /* + * Regular queues have interrupts and hence CPU affinity is + * defined by the core virtio code, but polling queues have + * no interrupts so we let the block layer assign CPU affinity. + */ + if (i == HCTX_TYPE_POLL) + blk_mq_map_queues(&set->map[i]); + else + blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0); + } + + return 0; +} + +static void virtblk_complete_batch(struct io_comp_batch *iob) +{ + struct request *req; + + rq_list_for_each(&iob->req_list, req) { + virtblk_unmap_data(req, blk_mq_rq_to_pdu(req)); + virtblk_cleanup_cmd(req); + } + blk_mq_end_request_batch(iob); +} + +static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) +{ + struct virtio_blk *vblk = hctx->queue->queuedata; + struct virtio_blk_vq *vq = hctx->driver_data; + struct virtblk_req *vbr; + unsigned long flags; + unsigned int len; + int found = 0; + + spin_lock_irqsave(&vq->lock, flags); + + while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { + struct request *req = blk_mq_rq_from_pdu(vbr); + + found++; + if (!blk_mq_add_to_batch(req, iob, vbr->status, + virtblk_complete_batch)) + blk_mq_complete_request(req); + } + + if (found) + blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + + spin_unlock_irqrestore(&vq->lock, flags); + + return found; +} + +static int virtblk_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct virtio_blk *vblk = data; + struct virtio_blk_vq *vq = &vblk->vqs[hctx_idx]; + + WARN_ON(vblk->tag_set.tags[hctx_idx] != hctx->tags); + hctx->driver_data = vq; + return 0; } static const struct blk_mq_ops virtio_mq_ops = { .queue_rq = virtio_queue_rq, .commit_rqs = virtio_commit_rqs, + .init_hctx = virtblk_init_hctx, .complete = virtblk_request_done, .map_queues = virtblk_map_queues, + .poll = virtblk_poll, }; static unsigned int virtblk_queue_depth; @@ -778,6 +875,9 @@ static int virtblk_probe(struct virtio_device *vdev) sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; vblk->tag_set.driver_data = vblk; vblk->tag_set.nr_hw_queues = vblk->num_vqs; + vblk->tag_set.nr_maps = 1; + if (vblk->io_queues[HCTX_TYPE_POLL]) + vblk->tag_set.nr_maps = 3; err = blk_mq_alloc_tag_set(&vblk->tag_set); if (err) From 0e9911fa768f32f30e5678512ea405d99a7a9fef Mon Sep 17 00:00:00 2001 From: Suwan Kim Date: Thu, 7 Apr 2022 00:32:07 +0900 Subject: [PATCH 08/29] virtio-blk: support mq_ops->queue_rqs() This patch supports mq_ops->queue_rqs() hook. It has an advantage of batch submission to virtio-blk driver. It also helps polling I/O because polling uses batched completion of block layer. Batch submission in queue_rqs() can boost polling performance. In queue_rqs(), it iterates plug->mq_list, collects requests that belong to same HW queue until it encounters a request from other HW queue or sees the end of the list. Then, virtio-blk adds requests into virtqueue and kicks virtqueue to submit requests. If there is an error, it inserts error request to requeue_list and passes it to ordinary block layer path. For verification, I did fio test. (io_uring, randread, direct=1, bs=4K, iodepth=64 numjobs=N) I set 4 vcpu and 2 virtio-blk queues for VM and run fio test 5 times. It shows about 2% improvement. | numjobs=2 | numjobs=4 ----------------------------------------------------------- fio without queue_rqs() | 291K IOPS | 238K IOPS ----------------------------------------------------------- fio with queue_rqs() | 295K IOPS | 243K IOPS For polling I/O performance, I also did fio test as below. (io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=4) I set 4 vcpu and 2 poll queues for VM. It shows about 2% improvement in polling I/O. | IOPS | avg latency ----------------------------------------------------------- fio poll without queue_rqs() | 424K | 613.05 usec ----------------------------------------------------------- fio poll with queue_rqs() | 435K | 601.01 usec Reviewed-by: Stefan Hajnoczi Reviewed-by: Christoph Hellwig Signed-off-by: Suwan Kim Message-Id: <20220406153207.163134-3-suwan.kim027@gmail.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Chaitanya Kulkarni --- drivers/block/virtio_blk.c | 118 +++++++++++++++++++++++++++++++------ 1 file changed, 100 insertions(+), 18 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index ad5f9ce8f3f9..6fc7850c2b0a 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -101,8 +101,7 @@ static inline blk_status_t virtblk_result(struct virtblk_req *vbr) } } -static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr, - struct scatterlist *data_sg, bool have_data) +static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr) { struct scatterlist hdr, status, *sgs[3]; unsigned int num_out = 0, num_in = 0; @@ -110,11 +109,11 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr, sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); sgs[num_out++] = &hdr; - if (have_data) { + if (vbr->sg_table.nents) { if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT)) - sgs[num_out++] = data_sg; + sgs[num_out++] = vbr->sg_table.sgl; else - sgs[num_out + num_in++] = data_sg; + sgs[num_out + num_in++] = vbr->sg_table.sgl; } sg_init_one(&status, &vbr->status, sizeof(vbr->status)); @@ -304,18 +303,12 @@ static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx) virtqueue_notify(vq->vq); } -static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) +static blk_status_t virtblk_prep_rq(struct blk_mq_hw_ctx *hctx, + struct virtio_blk *vblk, + struct request *req, + struct virtblk_req *vbr) { - struct virtio_blk *vblk = hctx->queue->queuedata; - struct request *req = bd->rq; - struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); - unsigned long flags; - int num; - int qid = hctx->queue_num; - bool notify = false; blk_status_t status; - int err; status = virtblk_setup_cmd(vblk->vdev, req, vbr); if (unlikely(status)) @@ -323,14 +316,33 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(req); - num = virtblk_map_data(hctx, req, vbr); - if (unlikely(num < 0)) { + vbr->sg_table.nents = virtblk_map_data(hctx, req, vbr); + if (unlikely(vbr->sg_table.nents < 0)) { virtblk_cleanup_cmd(req); return BLK_STS_RESOURCE; } + return BLK_STS_OK; +} + +static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct virtio_blk *vblk = hctx->queue->queuedata; + struct request *req = bd->rq; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + unsigned long flags; + int qid = hctx->queue_num; + bool notify = false; + blk_status_t status; + int err; + + status = virtblk_prep_rq(hctx, vblk, req, vbr); + if (unlikely(status)) + return status; + spin_lock_irqsave(&vblk->vqs[qid].lock, flags); - err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg_table.sgl, num); + err = virtblk_add_req(vblk->vqs[qid].vq, vbr); if (err) { virtqueue_kick(vblk->vqs[qid].vq); /* Don't stop the queue if -ENOMEM: we may have failed to @@ -360,6 +372,75 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } +static bool virtblk_prep_rq_batch(struct request *req) +{ + struct virtio_blk *vblk = req->mq_hctx->queue->queuedata; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + + req->mq_hctx->tags->rqs[req->tag] = req; + + return virtblk_prep_rq(req->mq_hctx, vblk, req, vbr) == BLK_STS_OK; +} + +static bool virtblk_add_req_batch(struct virtio_blk_vq *vq, + struct request **rqlist, + struct request **requeue_list) +{ + unsigned long flags; + int err; + bool kick; + + spin_lock_irqsave(&vq->lock, flags); + + while (!rq_list_empty(*rqlist)) { + struct request *req = rq_list_pop(rqlist); + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + + err = virtblk_add_req(vq->vq, vbr); + if (err) { + virtblk_unmap_data(req, vbr); + virtblk_cleanup_cmd(req); + rq_list_add(requeue_list, req); + } + } + + kick = virtqueue_kick_prepare(vq->vq); + spin_unlock_irqrestore(&vq->lock, flags); + + return kick; +} + +static void virtio_queue_rqs(struct request **rqlist) +{ + struct request *req, *next, *prev = NULL; + struct request *requeue_list = NULL; + + rq_list_for_each_safe(rqlist, req, next) { + struct virtio_blk_vq *vq = req->mq_hctx->driver_data; + bool kick; + + if (!virtblk_prep_rq_batch(req)) { + rq_list_move(rqlist, &requeue_list, req, prev); + req = prev; + if (!req) + continue; + } + + if (!next || req->mq_hctx != next->mq_hctx) { + req->rq_next = NULL; + kick = virtblk_add_req_batch(vq, rqlist, &requeue_list); + if (kick) + virtqueue_notify(vq->vq); + + *rqlist = next; + prev = NULL; + } else + prev = req; + } + + *rqlist = requeue_list; +} + /* return id (s/n) string for *disk to *id_str */ static int virtblk_get_id(struct gendisk *disk, char *id_str) @@ -794,6 +875,7 @@ static int virtblk_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, static const struct blk_mq_ops virtio_mq_ops = { .queue_rq = virtio_queue_rq, + .queue_rqs = virtio_queue_rqs, .commit_rqs = virtio_commit_rqs, .init_hctx = virtblk_init_hctx, .complete = virtblk_request_done, From 35c51e093d956f6d058e193711c8d424817a44a9 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Mon, 28 Mar 2022 18:58:16 +0800 Subject: [PATCH 09/29] virtio_ring: remove unnecessary to_vvq call in vring hot path It passes '_vq' to virtqueue_use_indirect(), which still calls to_vvq to get 'vq', let's directly pass 'vq'. It can avoid unnecessary call of to_vvq in hot path. Signed-off-by: Xianting Tian Message-Id: <20220328105817.1028065-1-xianting.tian@linux.alibaba.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Reviewed-by: Stefano Garzarella --- drivers/virtio/virtio_ring.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index cfb028ca238e..f72d5ae2cd8f 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -205,11 +205,9 @@ struct vring_virtqueue { #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) -static inline bool virtqueue_use_indirect(struct virtqueue *_vq, +static inline bool virtqueue_use_indirect(struct vring_virtqueue *vq, unsigned int total_sg) { - struct vring_virtqueue *vq = to_vvq(_vq); - /* * If the host supports indirect descriptor tables, and we have multiple * buffers, then go indirect. FIXME: tune this threshold @@ -499,7 +497,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, head = vq->free_head; - if (virtqueue_use_indirect(_vq, total_sg)) + if (virtqueue_use_indirect(vq, total_sg)) desc = alloc_indirect_split(_vq, total_sg, gfp); else { desc = NULL; @@ -1178,7 +1176,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, BUG_ON(total_sg == 0); - if (virtqueue_use_indirect(_vq, total_sg)) { + if (virtqueue_use_indirect(vq, total_sg)) { err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs, in_sgs, data, gfp); if (err != -ENOMEM) { From b4b4ff73ef047556e81694174bb8561457f25eeb Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Mon, 28 Mar 2022 18:58:17 +0800 Subject: [PATCH 10/29] virtio_ring: add unlikely annotation for free descs check The 'if (vq->vq.num_free < descs_used)' check will almost always be false. Signed-off-by: Xianting Tian Message-Id: <20220328105817.1028065-2-xianting.tian@linux.alibaba.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Reviewed-by: Stefano Garzarella --- drivers/virtio/virtio_ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index f72d5ae2cd8f..0dc930de72f5 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -517,7 +517,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, descs_used = total_sg; } - if (vq->vq.num_free < descs_used) { + if (unlikely(vq->vq.num_free < descs_used)) { pr_debug("Can't add buf len %i - avail = %i\n", descs_used, vq->vq.num_free); /* FIXME: for historical reasons, we force a notify here if From 175d493c3c3e09a3abaa843068fae0f0ad42c47e Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:41 +0530 Subject: [PATCH 11/29] vhost: move the backend feature bits to vhost_types.h We should store feature bits in vhost_types.h as what has been done for e.g VHOST_F_LOG_ALL. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-2-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/vhost.h | 5 ----- include/uapi/linux/vhost_types.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 5d99e7c242a2..8f7b4a95d6f9 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -89,11 +89,6 @@ /* Set or get vhost backend capability */ -/* Use message type V2 */ -#define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1 -/* IOTLB can accept batching hints */ -#define VHOST_BACKEND_F_IOTLB_BATCH 0x2 - #define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) #define VHOST_GET_BACKEND_FEATURES _IOR(VHOST_VIRTIO, 0x26, __u64) diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index f7f6a3a28977..76ee7016c501 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -153,4 +153,9 @@ struct vhost_vdpa_iova_range { /* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */ #define VHOST_NET_F_VIRTIO_NET_HDR 27 +/* Use message type V2 */ +#define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1 +/* IOTLB can accept batching hints */ +#define VHOST_BACKEND_F_IOTLB_BATCH 0x2 + #endif From ea239a67461a34839e2172ead3043295a9173389 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:42 +0530 Subject: [PATCH 12/29] virtio-vdpa: don't set callback if virtio doesn't need it There's no need for setting callbacks for the driver that doesn't care about that. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-3-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_vdpa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index 76504559bc25..46c71653f508 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -184,7 +184,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, } /* Setup virtqueue callback */ - cb.callback = virtio_vdpa_virtqueue_cb; + cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL; cb.private = info; ops->set_vq_cb(vdpa, index, &cb); ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq)); From ae967246d0997a684093ffc06a14999292ad9276 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:43 +0530 Subject: [PATCH 13/29] vhost-vdpa: passing iotlb to IOMMU mapping helpers To prepare for the ASID support for vhost-vdpa, try to pass IOTLB object to dma helpers. No functional changes, it's just a preparation for support multiple IOTLBs. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-4-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 67 ++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 4c2f0bd06285..6d670e32e67b 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -537,10 +537,11 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, return r; } -static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, u64 start, u64 last) +static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, + struct vhost_iotlb *iotlb, + u64 start, u64 last) { struct vhost_dev *dev = &v->vdev; - struct vhost_iotlb *iotlb = dev->iotlb; struct vhost_iotlb_map *map; struct page *page; unsigned long pfn, pinned; @@ -559,10 +560,10 @@ static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, u64 start, u64 last) } } -static void vhost_vdpa_va_unmap(struct vhost_vdpa *v, u64 start, u64 last) +static void vhost_vdpa_va_unmap(struct vhost_vdpa *v, + struct vhost_iotlb *iotlb, + u64 start, u64 last) { - struct vhost_dev *dev = &v->vdev; - struct vhost_iotlb *iotlb = dev->iotlb; struct vhost_iotlb_map *map; struct vdpa_map_file *map_file; @@ -574,21 +575,24 @@ static void vhost_vdpa_va_unmap(struct vhost_vdpa *v, u64 start, u64 last) } } -static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last) +static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, + struct vhost_iotlb *iotlb, + u64 start, u64 last) { struct vdpa_device *vdpa = v->vdpa; if (vdpa->use_va) - return vhost_vdpa_va_unmap(v, start, last); + return vhost_vdpa_va_unmap(v, iotlb, start, last); - return vhost_vdpa_pa_unmap(v, start, last); + return vhost_vdpa_pa_unmap(v, iotlb, start, last); } static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v) { struct vhost_dev *dev = &v->vdev; + struct vhost_iotlb *iotlb = dev->iotlb; - vhost_vdpa_iotlb_unmap(v, 0ULL, 0ULL - 1); + vhost_vdpa_iotlb_unmap(v, iotlb, 0ULL, 0ULL - 1); kfree(dev->iotlb); dev->iotlb = NULL; } @@ -615,15 +619,15 @@ static int perm_to_iommu_flags(u32 perm) return flags | IOMMU_CACHE; } -static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova, - u64 size, u64 pa, u32 perm, void *opaque) +static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb, + u64 iova, u64 size, u64 pa, u32 perm, void *opaque) { struct vhost_dev *dev = &v->vdev; struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; int r = 0; - r = vhost_iotlb_add_range_ctx(dev->iotlb, iova, iova + size - 1, + r = vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1, pa, perm, opaque); if (r) return r; @@ -632,13 +636,13 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova, r = ops->dma_map(vdpa, iova, size, pa, perm, opaque); } else if (ops->set_map) { if (!v->in_batch) - r = ops->set_map(vdpa, dev->iotlb); + r = ops->set_map(vdpa, iotlb); } else { r = iommu_map(v->domain, iova, pa, size, perm_to_iommu_flags(perm)); } if (r) { - vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1); + vhost_iotlb_del_range(iotlb, iova, iova + size - 1); return r; } @@ -648,25 +652,27 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova, return 0; } -static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size) +static void vhost_vdpa_unmap(struct vhost_vdpa *v, + struct vhost_iotlb *iotlb, + u64 iova, u64 size) { - struct vhost_dev *dev = &v->vdev; struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; - vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1); + vhost_vdpa_iotlb_unmap(v, iotlb, iova, iova + size - 1); if (ops->dma_map) { ops->dma_unmap(vdpa, iova, size); } else if (ops->set_map) { if (!v->in_batch) - ops->set_map(vdpa, dev->iotlb); + ops->set_map(vdpa, iotlb); } else { iommu_unmap(v->domain, iova, size); } } static int vhost_vdpa_va_map(struct vhost_vdpa *v, + struct vhost_iotlb *iotlb, u64 iova, u64 size, u64 uaddr, u32 perm) { struct vhost_dev *dev = &v->vdev; @@ -696,7 +702,7 @@ static int vhost_vdpa_va_map(struct vhost_vdpa *v, offset = (vma->vm_pgoff << PAGE_SHIFT) + uaddr - vma->vm_start; map_file->offset = offset; map_file->file = get_file(vma->vm_file); - ret = vhost_vdpa_map(v, map_iova, map_size, uaddr, + ret = vhost_vdpa_map(v, iotlb, map_iova, map_size, uaddr, perm, map_file); if (ret) { fput(map_file->file); @@ -709,7 +715,7 @@ next: map_iova += map_size; } if (ret) - vhost_vdpa_unmap(v, iova, map_iova - iova); + vhost_vdpa_unmap(v, iotlb, iova, map_iova - iova); mmap_read_unlock(dev->mm); @@ -717,6 +723,7 @@ next: } static int vhost_vdpa_pa_map(struct vhost_vdpa *v, + struct vhost_iotlb *iotlb, u64 iova, u64 size, u64 uaddr, u32 perm) { struct vhost_dev *dev = &v->vdev; @@ -780,7 +787,7 @@ static int vhost_vdpa_pa_map(struct vhost_vdpa *v, if (last_pfn && (this_pfn != last_pfn + 1)) { /* Pin a contiguous chunk of memory */ csize = PFN_PHYS(last_pfn - map_pfn + 1); - ret = vhost_vdpa_map(v, iova, csize, + ret = vhost_vdpa_map(v, iotlb, iova, csize, PFN_PHYS(map_pfn), perm, NULL); if (ret) { @@ -810,7 +817,7 @@ static int vhost_vdpa_pa_map(struct vhost_vdpa *v, } /* Pin the rest chunk */ - ret = vhost_vdpa_map(v, iova, PFN_PHYS(last_pfn - map_pfn + 1), + ret = vhost_vdpa_map(v, iotlb, iova, PFN_PHYS(last_pfn - map_pfn + 1), PFN_PHYS(map_pfn), perm, NULL); out: if (ret) { @@ -830,7 +837,7 @@ out: for (pfn = map_pfn; pfn <= last_pfn; pfn++) unpin_user_page(pfn_to_page(pfn)); } - vhost_vdpa_unmap(v, start, size); + vhost_vdpa_unmap(v, iotlb, start, size); } unlock: mmap_read_unlock(dev->mm); @@ -841,11 +848,10 @@ free: } static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, + struct vhost_iotlb *iotlb, struct vhost_iotlb_msg *msg) { - struct vhost_dev *dev = &v->vdev; struct vdpa_device *vdpa = v->vdpa; - struct vhost_iotlb *iotlb = dev->iotlb; if (msg->iova < v->range.first || !msg->size || msg->iova > U64_MAX - msg->size + 1 || @@ -857,10 +863,10 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, return -EEXIST; if (vdpa->use_va) - return vhost_vdpa_va_map(v, msg->iova, msg->size, + return vhost_vdpa_va_map(v, iotlb, msg->iova, msg->size, msg->uaddr, msg->perm); - return vhost_vdpa_pa_map(v, msg->iova, msg->size, msg->uaddr, + return vhost_vdpa_pa_map(v, iotlb, msg->iova, msg->size, msg->uaddr, msg->perm); } @@ -870,6 +876,7 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev); struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; + struct vhost_iotlb *iotlb = dev->iotlb; int r = 0; mutex_lock(&dev->mutex); @@ -880,17 +887,17 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, switch (msg->type) { case VHOST_IOTLB_UPDATE: - r = vhost_vdpa_process_iotlb_update(v, msg); + r = vhost_vdpa_process_iotlb_update(v, iotlb, msg); break; case VHOST_IOTLB_INVALIDATE: - vhost_vdpa_unmap(v, msg->iova, msg->size); + vhost_vdpa_unmap(v, iotlb, msg->iova, msg->size); break; case VHOST_IOTLB_BATCH_BEGIN: v->in_batch = true; break; case VHOST_IOTLB_BATCH_END: if (v->in_batch && ops->set_map) - ops->set_map(vdpa, dev->iotlb); + ops->set_map(vdpa, iotlb); v->in_batch = false; break; default: From 0b7ee47c5f36634926def0142a515eafedc8a779 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:44 +0530 Subject: [PATCH 14/29] vhost-vdpa: switch to use vhost-vdpa specific IOTLB To ease the implementation of per group ASID support for vDPA device. This patch switches to use a vhost-vdpa specific IOTLB to avoid the unnecessary refactoring of the vhost core. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-5-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 6d670e32e67b..632c43eb5ecf 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -39,6 +39,7 @@ struct vhost_vdpa { struct vhost_virtqueue *vqs; struct completion completion; struct vdpa_device *vdpa; + struct vhost_iotlb *iotlb; struct device dev; struct cdev cdev; atomic_t opened; @@ -589,12 +590,11 @@ static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v) { - struct vhost_dev *dev = &v->vdev; - struct vhost_iotlb *iotlb = dev->iotlb; + struct vhost_iotlb *iotlb = v->iotlb; vhost_vdpa_iotlb_unmap(v, iotlb, 0ULL, 0ULL - 1); - kfree(dev->iotlb); - dev->iotlb = NULL; + kfree(v->iotlb); + v->iotlb = NULL; } static int perm_to_iommu_flags(u32 perm) @@ -876,7 +876,7 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev); struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; - struct vhost_iotlb *iotlb = dev->iotlb; + struct vhost_iotlb *iotlb = v->iotlb; int r = 0; mutex_lock(&dev->mutex); @@ -1017,15 +1017,15 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false, vhost_vdpa_process_iotlb_msg); - dev->iotlb = vhost_iotlb_alloc(0, 0); - if (!dev->iotlb) { + v->iotlb = vhost_iotlb_alloc(0, 0); + if (!v->iotlb) { r = -ENOMEM; goto err_init_iotlb; } r = vhost_vdpa_alloc_domain(v); if (r) - goto err_init_iotlb; + goto err_alloc_domain; vhost_vdpa_set_iova_range(v); @@ -1033,6 +1033,8 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) return 0; +err_alloc_domain: + vhost_vdpa_iotlb_free(v); err_init_iotlb: vhost_dev_cleanup(&v->vdev); kfree(vqs); From d4821902e43453b85b31329441a9f6ac071228a8 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:45 +0530 Subject: [PATCH 15/29] vdpa: introduce virtqueue groups This patch introduces virtqueue groups to vDPA device. The virtqueue group is the minimal set of virtqueues that must share an address space. And the address space identifier could only be attached to a specific virtqueue group. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-6-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/alibaba/eni_vdpa.c | 2 +- drivers/vdpa/ifcvf/ifcvf_main.c | 8 +++++++- drivers/vdpa/mlx5/net/mlx5_vnet.c | 8 +++++++- drivers/vdpa/vdpa.c | 3 +++ drivers/vdpa/vdpa_sim/vdpa_sim.c | 9 ++++++++- drivers/vdpa/vdpa_sim/vdpa_sim.h | 1 + drivers/vdpa/vdpa_user/vduse_dev.c | 2 +- drivers/vdpa/virtio_pci/vp_vdpa.c | 2 +- include/linux/vdpa.h | 16 ++++++++++++---- 9 files changed, 41 insertions(+), 10 deletions(-) diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c index f480d54f308c..3e93c5eb0cf9 100644 --- a/drivers/vdpa/alibaba/eni_vdpa.c +++ b/drivers/vdpa/alibaba/eni_vdpa.c @@ -470,7 +470,7 @@ static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; eni_vdpa = vdpa_alloc_device(struct eni_vdpa, vdpa, - dev, &eni_vdpa_ops, NULL, false); + dev, &eni_vdpa_ops, 1, NULL, false); if (IS_ERR(eni_vdpa)) { ENI_ERR(pdev, "failed to allocate vDPA structure\n"); return PTR_ERR(eni_vdpa); diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 4366320fb68d..fde33e143246 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -626,6 +626,11 @@ static size_t ifcvf_vdpa_get_config_size(struct vdpa_device *vdpa_dev) return vf->config_size; } +static u32 ifcvf_vdpa_get_vq_group(struct vdpa_device *vdpa, u16 idx) +{ + return 0; +} + static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned int offset, void *buf, unsigned int len) @@ -704,6 +709,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = { .get_device_id = ifcvf_vdpa_get_device_id, .get_vendor_id = ifcvf_vdpa_get_vendor_id, .get_vq_align = ifcvf_vdpa_get_vq_align, + .get_vq_group = ifcvf_vdpa_get_vq_group, .get_config_size = ifcvf_vdpa_get_config_size, .get_config = ifcvf_vdpa_get_config, .set_config = ifcvf_vdpa_set_config, @@ -758,7 +764,7 @@ static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, pdev = ifcvf_mgmt_dev->pdev; dev = &pdev->dev; adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, - dev, &ifc_vdpa_ops, name, false); + dev, &ifc_vdpa_ops, 1, name, false); if (IS_ERR(adapter)) { IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); return PTR_ERR(adapter); diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 57cfc64248b7..5647e12056d8 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1947,6 +1947,11 @@ static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev) return PAGE_SIZE; } +static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdpa, u16 idx) +{ + return 0; +} + enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9, MLX5_VIRTIO_NET_F_CSUM = 1 << 10, MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11, @@ -2582,6 +2587,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .get_vq_notification = mlx5_get_vq_notification, .get_vq_irq = mlx5_get_vq_irq, .get_vq_align = mlx5_vdpa_get_vq_align, + .get_vq_group = mlx5_vdpa_get_vq_group, .get_device_features = mlx5_vdpa_get_device_features, .set_driver_features = mlx5_vdpa_set_driver_features, .get_driver_features = mlx5_vdpa_get_driver_features, @@ -2817,7 +2823,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, } ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops, - name, false); + 1, name, false); if (IS_ERR(ndev)) return PTR_ERR(ndev); diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 9d3534a0bc5f..4ca54779be2c 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -158,6 +158,7 @@ static void vdpa_release_dev(struct device *d) * initialized but before registered. * @parent: the parent device * @config: the bus operations that is supported by this device + * @ngroups: number of groups supported by this device * @size: size of the parent structure that contains private data * @name: name of the vdpa device; optional. * @use_va: indicate whether virtual address must be used by this device @@ -170,6 +171,7 @@ static void vdpa_release_dev(struct device *d) */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, + unsigned int ngroups, size_t size, const char *name, bool use_va) { @@ -202,6 +204,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, vdev->config = config; vdev->features_valid = false; vdev->use_va = use_va; + vdev->ngroups = ngroups; if (name) err = dev_set_name(&vdev->dev, "%s", name); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index ddbe142af09a..c98cb1f869fa 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -250,7 +250,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) else ops = &vdpasim_config_ops; - vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, + vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, 1, dev_attr->name, false); if (IS_ERR(vdpasim)) { ret = PTR_ERR(vdpasim); @@ -399,6 +399,11 @@ static u32 vdpasim_get_vq_align(struct vdpa_device *vdpa) return VDPASIM_QUEUE_ALIGN; } +static u32 vdpasim_get_vq_group(struct vdpa_device *vdpa, u16 idx) +{ + return 0; +} + static u64 vdpasim_get_device_features(struct vdpa_device *vdpa) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); @@ -620,6 +625,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = { .set_vq_state = vdpasim_set_vq_state, .get_vq_state = vdpasim_get_vq_state, .get_vq_align = vdpasim_get_vq_align, + .get_vq_group = vdpasim_get_vq_group, .get_device_features = vdpasim_get_device_features, .set_driver_features = vdpasim_set_driver_features, .get_driver_features = vdpasim_get_driver_features, @@ -650,6 +656,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .set_vq_state = vdpasim_set_vq_state, .get_vq_state = vdpasim_get_vq_state, .get_vq_align = vdpasim_get_vq_align, + .get_vq_group = vdpasim_get_vq_group, .get_device_features = vdpasim_get_device_features, .set_driver_features = vdpasim_set_driver_features, .get_driver_features = vdpasim_get_driver_features, diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index cd58e888bcf3..0be7c1e7ef80 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -63,6 +63,7 @@ struct vdpasim { u32 status; u32 generation; u64 features; + u32 groups; /* spinlock to synchronize iommu table */ spinlock_t iommu_lock; }; diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index f85d1a08ed87..4ee6850b9a68 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1495,7 +1495,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) return -EEXIST; vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, - &vduse_vdpa_config_ops, name, true); + &vduse_vdpa_config_ops, 1, name, true); if (IS_ERR(vdev)) return PTR_ERR(vdev); diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index cce101e6a940..e18dfe993901 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -466,7 +466,7 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, - dev, &vp_vdpa_ops, NULL, false); + dev, &vp_vdpa_ops, 1, NULL, false); if (IS_ERR(vp_vdpa)) { dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n"); return PTR_ERR(vp_vdpa); diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 2cb14847831e..e4e53574183e 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -85,6 +85,7 @@ struct vdpa_device { bool use_va; u32 nvqs; struct vdpa_mgmt_dev *mdev; + unsigned int ngroups; }; /** @@ -172,6 +173,10 @@ struct vdpa_map_file { * for the device * @vdev: vdpa device * Returns virtqueue algin requirement + * @get_vq_group: Get the group id for a specific virtqueue + * @vdev: vdpa device + * @idx: virtqueue index + * Returns u32: group id for this virtqueue * @get_device_features: Get virtio features supported by the device * @vdev: vdpa device * Returns the virtio features support by the @@ -286,6 +291,7 @@ struct vdpa_config_ops { /* Device ops */ u32 (*get_vq_align)(struct vdpa_device *vdev); + u32 (*get_vq_group)(struct vdpa_device *vdev, u16 idx); u64 (*get_device_features)(struct vdpa_device *vdev); int (*set_driver_features)(struct vdpa_device *vdev, u64 features); u64 (*get_driver_features)(struct vdpa_device *vdev); @@ -318,6 +324,7 @@ struct vdpa_config_ops { struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, + unsigned int ngroups, size_t size, const char *name, bool use_va); @@ -328,17 +335,18 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, * @member: the name of struct vdpa_device within the @dev_struct * @parent: the parent device * @config: the bus operations that is supported by this device + * @ngroups: the number of virtqueue groups supported by this device * @name: name of the vdpa device * @use_va: indicate whether virtual address must be used by this device * * Return allocated data structure or ERR_PTR upon error */ -#define vdpa_alloc_device(dev_struct, member, parent, config, name, use_va) \ - container_of(__vdpa_alloc_device( \ - parent, config, \ +#define vdpa_alloc_device(dev_struct, member, parent, config, ngroups, name, use_va) \ + container_of((__vdpa_alloc_device( \ + parent, config, ngroups, \ sizeof(dev_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ - dev_struct, member)), name, use_va), \ + dev_struct, member)), name, use_va)), \ dev_struct, member) int vdpa_register_device(struct vdpa_device *vdev, u32 nvqs); From db9adcbf4286ad1c1fca091a870db6e49bb0df07 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:46 +0530 Subject: [PATCH 16/29] vdpa: multiple address spaces support This patches introduces the multiple address spaces support for vDPA device. This idea is to identify a specific address space via an dedicated identifier - ASID. During vDPA device allocation, vDPA device driver needs to report the number of address spaces supported by the device then the DMA mapping ops of the vDPA device needs to be extended to support ASID. This helps to isolate the environments for the virtqueue that will not be assigned directly. E.g in the case of virtio-net, the control virtqueue will not be assigned directly to guest. As a start, simply claim 1 virtqueue groups and 1 address spaces for all vDPA devices. And vhost-vDPA will simply reject the device with more than 1 virtqueue groups or address spaces. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-7-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/alibaba/eni_vdpa.c | 2 +- drivers/vdpa/ifcvf/ifcvf_main.c | 2 +- drivers/vdpa/mlx5/net/mlx5_vnet.c | 5 +++-- drivers/vdpa/vdpa.c | 4 +++- drivers/vdpa/vdpa_sim/vdpa_sim.c | 10 ++++++---- drivers/vdpa/vdpa_user/vduse_dev.c | 3 ++- drivers/vdpa/virtio_pci/vp_vdpa.c | 2 +- drivers/vhost/vdpa.c | 14 +++++++++----- include/linux/vdpa.h | 28 +++++++++++++++++++--------- 9 files changed, 45 insertions(+), 25 deletions(-) diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c index 3e93c5eb0cf9..5a09a09cca70 100644 --- a/drivers/vdpa/alibaba/eni_vdpa.c +++ b/drivers/vdpa/alibaba/eni_vdpa.c @@ -470,7 +470,7 @@ static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; eni_vdpa = vdpa_alloc_device(struct eni_vdpa, vdpa, - dev, &eni_vdpa_ops, 1, NULL, false); + dev, &eni_vdpa_ops, 1, 1, NULL, false); if (IS_ERR(eni_vdpa)) { ENI_ERR(pdev, "failed to allocate vDPA structure\n"); return PTR_ERR(eni_vdpa); diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index fde33e143246..c1767a0ce630 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -764,7 +764,7 @@ static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, pdev = ifcvf_mgmt_dev->pdev; dev = &pdev->dev; adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, - dev, &ifc_vdpa_ops, 1, name, false); + dev, &ifc_vdpa_ops, 1, 1, name, false); if (IS_ERR(adapter)) { IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); return PTR_ERR(adapter); diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 5647e12056d8..dcca782c698e 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2409,7 +2409,8 @@ static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev) return mvdev->generation; } -static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb) +static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid, + struct vhost_iotlb *iotlb) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); @@ -2823,7 +2824,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, } ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops, - 1, name, false); + 1, 1, name, false); if (IS_ERR(ndev)) return PTR_ERR(ndev); diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 4ca54779be2c..f15fb11010a8 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -159,6 +159,7 @@ static void vdpa_release_dev(struct device *d) * @parent: the parent device * @config: the bus operations that is supported by this device * @ngroups: number of groups supported by this device + * @nas: number of address spaces supported by this device * @size: size of the parent structure that contains private data * @name: name of the vdpa device; optional. * @use_va: indicate whether virtual address must be used by this device @@ -171,7 +172,7 @@ static void vdpa_release_dev(struct device *d) */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, - unsigned int ngroups, + unsigned int ngroups, unsigned int nas, size_t size, const char *name, bool use_va) { @@ -205,6 +206,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, vdev->features_valid = false; vdev->use_va = use_va; vdev->ngroups = ngroups; + vdev->nas = nas; if (name) err = dev_set_name(&vdev->dev, "%s", name); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index c98cb1f869fa..659e2e2e4b0c 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -251,7 +251,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) ops = &vdpasim_config_ops; vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, 1, - dev_attr->name, false); + 1, dev_attr->name, false); if (IS_ERR(vdpasim)) { ret = PTR_ERR(vdpasim); goto err_alloc; @@ -539,7 +539,7 @@ static struct vdpa_iova_range vdpasim_get_iova_range(struct vdpa_device *vdpa) return range; } -static int vdpasim_set_map(struct vdpa_device *vdpa, +static int vdpasim_set_map(struct vdpa_device *vdpa, unsigned int asid, struct vhost_iotlb *iotlb) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); @@ -566,7 +566,8 @@ err: return ret; } -static int vdpasim_dma_map(struct vdpa_device *vdpa, u64 iova, u64 size, +static int vdpasim_dma_map(struct vdpa_device *vdpa, unsigned int asid, + u64 iova, u64 size, u64 pa, u32 perm, void *opaque) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); @@ -580,7 +581,8 @@ static int vdpasim_dma_map(struct vdpa_device *vdpa, u64 iova, u64 size, return ret; } -static int vdpasim_dma_unmap(struct vdpa_device *vdpa, u64 iova, u64 size) +static int vdpasim_dma_unmap(struct vdpa_device *vdpa, unsigned int asid, + u64 iova, u64 size) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 4ee6850b9a68..d503848b3b6e 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -693,6 +693,7 @@ static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa) } static int vduse_vdpa_set_map(struct vdpa_device *vdpa, + unsigned int asid, struct vhost_iotlb *iotlb) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); @@ -1495,7 +1496,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) return -EEXIST; vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, - &vduse_vdpa_config_ops, 1, name, true); + &vduse_vdpa_config_ops, 1, 1, name, true); if (IS_ERR(vdev)) return PTR_ERR(vdev); diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index e18dfe993901..35acba0e8d6d 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -466,7 +466,7 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, - dev, &vp_vdpa_ops, 1, NULL, false); + dev, &vp_vdpa_ops, 1, 1, NULL, false); if (IS_ERR(vp_vdpa)) { dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n"); return PTR_ERR(vp_vdpa); diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 632c43eb5ecf..9202ff97ddb5 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -633,10 +633,10 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb, return r; if (ops->dma_map) { - r = ops->dma_map(vdpa, iova, size, pa, perm, opaque); + r = ops->dma_map(vdpa, 0, iova, size, pa, perm, opaque); } else if (ops->set_map) { if (!v->in_batch) - r = ops->set_map(vdpa, iotlb); + r = ops->set_map(vdpa, 0, iotlb); } else { r = iommu_map(v->domain, iova, pa, size, perm_to_iommu_flags(perm)); @@ -662,10 +662,10 @@ static void vhost_vdpa_unmap(struct vhost_vdpa *v, vhost_vdpa_iotlb_unmap(v, iotlb, iova, iova + size - 1); if (ops->dma_map) { - ops->dma_unmap(vdpa, iova, size); + ops->dma_unmap(vdpa, 0, iova, size); } else if (ops->set_map) { if (!v->in_batch) - ops->set_map(vdpa, iotlb); + ops->set_map(vdpa, 0, iotlb); } else { iommu_unmap(v->domain, iova, size); } @@ -897,7 +897,7 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, break; case VHOST_IOTLB_BATCH_END: if (v->in_batch && ops->set_map) - ops->set_map(vdpa, iotlb); + ops->set_map(vdpa, 0, iotlb); v->in_batch = false; break; default: @@ -1163,6 +1163,10 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa) int minor; int r; + /* Only support 1 address space and 1 groups */ + if (vdpa->ngroups != 1 || vdpa->nas != 1) + return -EOPNOTSUPP; + v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!v) return -ENOMEM; diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index e4e53574183e..1515748e84ff 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -69,6 +69,8 @@ struct vdpa_mgmt_dev; * @cf_lock: Protects get and set access to configuration layout. * @index: device index * @features_valid: were features initialized? for legacy guests + * @ngroups: the number of virtqueue groups + * @nas: the number of address spaces * @use_va: indicate whether virtual address must be used by this device * @nvqs: maximum number of supported virtqueues * @mdev: management device pointer; caller must setup when registering device as part @@ -86,6 +88,7 @@ struct vdpa_device { u32 nvqs; struct vdpa_mgmt_dev *mdev; unsigned int ngroups; + unsigned int nas; }; /** @@ -241,6 +244,7 @@ struct vdpa_map_file { * Needed for device that using device * specific DMA translation (on-chip IOMMU) * @vdev: vdpa device + * @asid: address space identifier * @iotlb: vhost memory mapping to be * used by the vDPA * Returns integer: success (0) or error (< 0) @@ -249,6 +253,7 @@ struct vdpa_map_file { * specific DMA translation (on-chip IOMMU) * and preferring incremental map. * @vdev: vdpa device + * @asid: address space identifier * @iova: iova to be mapped * @size: size of the area * @pa: physical address for the map @@ -260,6 +265,7 @@ struct vdpa_map_file { * specific DMA translation (on-chip IOMMU) * and preferring incremental unmap. * @vdev: vdpa device + * @asid: address space identifier * @iova: iova to be unmapped * @size: size of the area * Returns integer: success (0) or error (< 0) @@ -313,10 +319,12 @@ struct vdpa_config_ops { struct vdpa_iova_range (*get_iova_range)(struct vdpa_device *vdev); /* DMA ops */ - int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb); - int (*dma_map)(struct vdpa_device *vdev, u64 iova, u64 size, - u64 pa, u32 perm, void *opaque); - int (*dma_unmap)(struct vdpa_device *vdev, u64 iova, u64 size); + int (*set_map)(struct vdpa_device *vdev, unsigned int asid, + struct vhost_iotlb *iotlb); + int (*dma_map)(struct vdpa_device *vdev, unsigned int asid, + u64 iova, u64 size, u64 pa, u32 perm, void *opaque); + int (*dma_unmap)(struct vdpa_device *vdev, unsigned int asid, + u64 iova, u64 size); /* Free device resources */ void (*free)(struct vdpa_device *vdev); @@ -324,7 +332,7 @@ struct vdpa_config_ops { struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, - unsigned int ngroups, + unsigned int ngroups, unsigned int nas, size_t size, const char *name, bool use_va); @@ -336,17 +344,19 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, * @parent: the parent device * @config: the bus operations that is supported by this device * @ngroups: the number of virtqueue groups supported by this device + * @nas: the number of address spaces * @name: name of the vdpa device * @use_va: indicate whether virtual address must be used by this device * * Return allocated data structure or ERR_PTR upon error */ -#define vdpa_alloc_device(dev_struct, member, parent, config, ngroups, name, use_va) \ +#define vdpa_alloc_device(dev_struct, member, parent, config, ngroups, nas, \ + name, use_va) \ container_of((__vdpa_alloc_device( \ - parent, config, ngroups, \ - sizeof(dev_struct) + \ + parent, config, ngroups, nas, \ + (sizeof(dev_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ - dev_struct, member)), name, use_va)), \ + dev_struct, member))), name, use_va)), \ dev_struct, member) int vdpa_register_device(struct vdpa_device *vdev, u32 nvqs); From 46d554b1bcd19133401d2d5c0728b85e7bfd1358 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:47 +0530 Subject: [PATCH 17/29] vdpa: introduce config operations for associating ASID to a virtqueue group This patch introduces a new bus operation to allow the vDPA bus driver to associate an ASID to a virtqueue group. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-8-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 1515748e84ff..f336d253db3d 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -240,6 +240,12 @@ struct vdpa_map_file { * @vdev: vdpa device * Returns the iova range supported by * the device. + * @set_group_asid: Set address space identifier for a + * virtqueue group + * @vdev: vdpa device + * @group: virtqueue group + * @asid: address space id for this group + * Returns integer: success (0) or error (< 0) * @set_map: Set device memory mapping (optional) * Needed for device that using device * specific DMA translation (on-chip IOMMU) @@ -325,6 +331,8 @@ struct vdpa_config_ops { u64 iova, u64 size, u64 pa, u32 perm, void *opaque); int (*dma_unmap)(struct vdpa_device *vdev, unsigned int asid, u64 iova, u64 size); + int (*set_group_asid)(struct vdpa_device *vdev, unsigned int group, + unsigned int asid); /* Free device resources */ void (*free)(struct vdpa_device *vdev); From 1cb108994c6830cc6a6e066ad7d9a22ef59fa167 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:48 +0530 Subject: [PATCH 18/29] vhost_iotlb: split out IOTLB initialization This patch splits out IOTLB initialization to make sure it could be reused by external modules. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-9-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/iotlb.c | 23 ++++++++++++++++++----- include/linux/vhost_iotlb.h | 2 ++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/vhost/iotlb.c b/drivers/vhost/iotlb.c index 5829cf2d0552..ea61330a3431 100644 --- a/drivers/vhost/iotlb.c +++ b/drivers/vhost/iotlb.c @@ -125,6 +125,23 @@ void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last) } EXPORT_SYMBOL_GPL(vhost_iotlb_del_range); +/** + * vhost_iotlb_init - initialize a vhost IOTLB + * @iotlb: the IOTLB that needs to be initialized + * @limit: maximum number of IOTLB entries + * @flags: VHOST_IOTLB_FLAG_XXX + */ +void vhost_iotlb_init(struct vhost_iotlb *iotlb, unsigned int limit, + unsigned int flags) +{ + iotlb->root = RB_ROOT_CACHED; + iotlb->limit = limit; + iotlb->nmaps = 0; + iotlb->flags = flags; + INIT_LIST_HEAD(&iotlb->list); +} +EXPORT_SYMBOL_GPL(vhost_iotlb_init); + /** * vhost_iotlb_alloc - add a new vhost IOTLB * @limit: maximum number of IOTLB entries @@ -139,11 +156,7 @@ struct vhost_iotlb *vhost_iotlb_alloc(unsigned int limit, unsigned int flags) if (!iotlb) return NULL; - iotlb->root = RB_ROOT_CACHED; - iotlb->limit = limit; - iotlb->nmaps = 0; - iotlb->flags = flags; - INIT_LIST_HEAD(&iotlb->list); + vhost_iotlb_init(iotlb, limit, flags); return iotlb; } diff --git a/include/linux/vhost_iotlb.h b/include/linux/vhost_iotlb.h index 2d0e2f52f938..e79a40838998 100644 --- a/include/linux/vhost_iotlb.h +++ b/include/linux/vhost_iotlb.h @@ -36,6 +36,8 @@ int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, u64 start, u64 last, u64 addr, unsigned int perm); void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last); +void vhost_iotlb_init(struct vhost_iotlb *iotlb, unsigned int limit, + unsigned int flags); struct vhost_iotlb *vhost_iotlb_alloc(unsigned int limit, unsigned int flags); void vhost_iotlb_free(struct vhost_iotlb *iotlb); void vhost_iotlb_reset(struct vhost_iotlb *iotlb); From 91233ad711866f4e375742d84ef3ed6aab9daa96 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:49 +0530 Subject: [PATCH 19/29] vhost: support ASID in IOTLB API This patches allows userspace to send ASID based IOTLB message to vhost. This idea is to use the reserved u32 field in the existing V2 IOTLB message. Vhost device should advertise this capability via VHOST_BACKEND_F_IOTLB_ASID backend feature. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-10-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 5 ++++- drivers/vhost/vhost.c | 23 ++++++++++++++++++----- drivers/vhost/vhost.h | 4 ++-- include/uapi/linux/vhost_types.h | 6 +++++- 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 9202ff97ddb5..174c9e81df4e 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -870,7 +870,7 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, msg->perm); } -static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, +static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg) { struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev); @@ -879,6 +879,9 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, struct vhost_iotlb *iotlb = v->iotlb; int r = 0; + if (asid != 0) + return -EINVAL; + mutex_lock(&dev->mutex); r = vhost_dev_check_owner(dev); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index d02173fb290c..d1e58f976f6e 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -468,7 +468,7 @@ void vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue **vqs, int nvqs, int iov_limit, int weight, int byte_weight, bool use_worker, - int (*msg_handler)(struct vhost_dev *dev, + int (*msg_handler)(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg)) { struct vhost_virtqueue *vq; @@ -1090,11 +1090,14 @@ static bool umem_access_ok(u64 uaddr, u64 size, int access) return true; } -static int vhost_process_iotlb_msg(struct vhost_dev *dev, +static int vhost_process_iotlb_msg(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg) { int ret = 0; + if (asid != 0) + return -EINVAL; + mutex_lock(&dev->mutex); vhost_dev_lock_vqs(dev); switch (msg->type) { @@ -1141,6 +1144,7 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, struct vhost_iotlb_msg msg; size_t offset; int type, ret; + u32 asid = 0; ret = copy_from_iter(&type, sizeof(type), from); if (ret != sizeof(type)) { @@ -1156,7 +1160,16 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, offset = offsetof(struct vhost_msg, iotlb) - sizeof(int); break; case VHOST_IOTLB_MSG_V2: - offset = sizeof(__u32); + if (vhost_backend_has_feature(dev->vqs[0], + VHOST_BACKEND_F_IOTLB_ASID)) { + ret = copy_from_iter(&asid, sizeof(asid), from); + if (ret != sizeof(asid)) { + ret = -EINVAL; + goto done; + } + offset = sizeof(__u16); + } else + offset = sizeof(__u32); break; default: ret = -EINVAL; @@ -1178,9 +1191,9 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, } if (dev->msg_handler) - ret = dev->msg_handler(dev, &msg); + ret = dev->msg_handler(dev, asid, &msg); else - ret = vhost_process_iotlb_msg(dev, &msg); + ret = vhost_process_iotlb_msg(dev, asid, &msg); if (ret) { ret = -EFAULT; goto done; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 638bb640d6b4..9f238d6c7b58 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -161,7 +161,7 @@ struct vhost_dev { int byte_weight; u64 kcov_handle; bool use_worker; - int (*msg_handler)(struct vhost_dev *dev, + int (*msg_handler)(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg); }; @@ -169,7 +169,7 @@ bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len); void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs, int iov_limit, int weight, int byte_weight, bool use_worker, - int (*msg_handler)(struct vhost_dev *dev, + int (*msg_handler)(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg)); long vhost_dev_set_owner(struct vhost_dev *dev); bool vhost_dev_has_owner(struct vhost_dev *dev); diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index 76ee7016c501..634cee485abb 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -87,7 +87,7 @@ struct vhost_msg { struct vhost_msg_v2 { __u32 type; - __u32 reserved; + __u32 asid; union { struct vhost_iotlb_msg iotlb; __u8 padding[64]; @@ -157,5 +157,9 @@ struct vhost_vdpa_iova_range { #define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1 /* IOTLB can accept batching hints */ #define VHOST_BACKEND_F_IOTLB_BATCH 0x2 +/* IOTLB can accept address space identifier through V2 type of IOTLB + * message + */ +#define VHOST_BACKEND_F_IOTLB_ASID 0x3 #endif From 3d5698793897a2b9c0060d899881d1a0591630d5 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:50 +0530 Subject: [PATCH 20/29] vhost-vdpa: introduce asid based IOTLB This patch converts the vhost-vDPA device to support multiple IOTLBs tagged via ASID via hlist. This will be used for supporting multiple address spaces in the following patches. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-11-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 97 ++++++++++++++++++++++++++++++++------------ 1 file changed, 72 insertions(+), 25 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 174c9e81df4e..cd1bee536c46 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -33,13 +33,21 @@ enum { #define VHOST_VDPA_DEV_MAX (1U << MINORBITS) +#define VHOST_VDPA_IOTLB_BUCKETS 16 + +struct vhost_vdpa_as { + struct hlist_node hash_link; + struct vhost_iotlb iotlb; + u32 id; +}; + struct vhost_vdpa { struct vhost_dev vdev; struct iommu_domain *domain; struct vhost_virtqueue *vqs; struct completion completion; struct vdpa_device *vdpa; - struct vhost_iotlb *iotlb; + struct hlist_head as[VHOST_VDPA_IOTLB_BUCKETS]; struct device dev; struct cdev cdev; atomic_t opened; @@ -55,6 +63,51 @@ static DEFINE_IDA(vhost_vdpa_ida); static dev_t vhost_vdpa_major; +static struct vhost_vdpa_as *asid_to_as(struct vhost_vdpa *v, u32 asid) +{ + struct hlist_head *head = &v->as[asid % VHOST_VDPA_IOTLB_BUCKETS]; + struct vhost_vdpa_as *as; + + hlist_for_each_entry(as, head, hash_link) + if (as->id == asid) + return as; + + return NULL; +} + +static struct vhost_vdpa_as *vhost_vdpa_alloc_as(struct vhost_vdpa *v, u32 asid) +{ + struct hlist_head *head = &v->as[asid % VHOST_VDPA_IOTLB_BUCKETS]; + struct vhost_vdpa_as *as; + + if (asid_to_as(v, asid)) + return NULL; + + as = kmalloc(sizeof(*as), GFP_KERNEL); + if (!as) + return NULL; + + vhost_iotlb_init(&as->iotlb, 0, 0); + as->id = asid; + hlist_add_head(&as->hash_link, head); + + return as; +} + +static int vhost_vdpa_remove_as(struct vhost_vdpa *v, u32 asid) +{ + struct vhost_vdpa_as *as = asid_to_as(v, asid); + + if (!as) + return -EINVAL; + + hlist_del(&as->hash_link); + vhost_iotlb_reset(&as->iotlb); + kfree(as); + + return 0; +} + static void handle_vq_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, @@ -588,15 +641,6 @@ static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, return vhost_vdpa_pa_unmap(v, iotlb, start, last); } -static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v) -{ - struct vhost_iotlb *iotlb = v->iotlb; - - vhost_vdpa_iotlb_unmap(v, iotlb, 0ULL, 0ULL - 1); - kfree(v->iotlb); - v->iotlb = NULL; -} - static int perm_to_iommu_flags(u32 perm) { int flags = 0; @@ -876,7 +920,8 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, u32 asid, struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev); struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; - struct vhost_iotlb *iotlb = v->iotlb; + struct vhost_vdpa_as *as = asid_to_as(v, 0); + struct vhost_iotlb *iotlb = &as->iotlb; int r = 0; if (asid != 0) @@ -987,6 +1032,13 @@ static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v) } } +static void vhost_vdpa_cleanup(struct vhost_vdpa *v) +{ + vhost_dev_cleanup(&v->vdev); + kfree(v->vdev.vqs); + vhost_vdpa_remove_as(v, 0); +} + static int vhost_vdpa_open(struct inode *inode, struct file *filep) { struct vhost_vdpa *v; @@ -1020,15 +1072,12 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false, vhost_vdpa_process_iotlb_msg); - v->iotlb = vhost_iotlb_alloc(0, 0); - if (!v->iotlb) { - r = -ENOMEM; - goto err_init_iotlb; - } + if (!vhost_vdpa_alloc_as(v, 0)) + goto err_alloc_as; r = vhost_vdpa_alloc_domain(v); if (r) - goto err_alloc_domain; + goto err_alloc_as; vhost_vdpa_set_iova_range(v); @@ -1036,11 +1085,8 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) return 0; -err_alloc_domain: - vhost_vdpa_iotlb_free(v); -err_init_iotlb: - vhost_dev_cleanup(&v->vdev); - kfree(vqs); +err_alloc_as: + vhost_vdpa_cleanup(v); err: atomic_dec(&v->opened); return r; @@ -1064,11 +1110,9 @@ static int vhost_vdpa_release(struct inode *inode, struct file *filep) vhost_vdpa_clean_irq(v); vhost_vdpa_reset(v); vhost_dev_stop(&v->vdev); - vhost_vdpa_iotlb_free(v); vhost_vdpa_free_domain(v); vhost_vdpa_config_put(v); vhost_dev_cleanup(&v->vdev); - kfree(v->vdev.vqs); mutex_unlock(&d->mutex); atomic_dec(&v->opened); @@ -1164,7 +1208,7 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa) const struct vdpa_config_ops *ops = vdpa->config; struct vhost_vdpa *v; int minor; - int r; + int i, r; /* Only support 1 address space and 1 groups */ if (vdpa->ngroups != 1 || vdpa->nas != 1) @@ -1212,6 +1256,9 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa) init_completion(&v->completion); vdpa_set_drvdata(vdpa, v); + for (i = 0; i < VHOST_VDPA_IOTLB_BUCKETS; i++) + INIT_HLIST_HEAD(&v->as[i]); + return 0; err: From 3ace88bd37436abc84906312146fe5158a469142 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:51 +0530 Subject: [PATCH 21/29] vhost-vdpa: introduce uAPI to get the number of virtqueue groups Follows the vDPA support for multiple address spaces, this patch introduce uAPI for the userspace to know the number of virtqueue groups supported by the vDPA device. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-12-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 4 ++++ include/uapi/linux/vhost.h | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index cd1bee536c46..92f78df0f685 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -559,6 +559,10 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, case VHOST_VDPA_GET_VRING_NUM: r = vhost_vdpa_get_vring_num(v, argp); break; + case VHOST_VDPA_GET_GROUP_NUM: + r = copy_to_user(argp, &v->vdpa->ngroups, + sizeof(v->vdpa->ngroups)); + break; case VHOST_SET_LOG_BASE: case VHOST_SET_LOG_FD: r = -ENOIOCTLCMD; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 8f7b4a95d6f9..61317c61d768 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -145,11 +145,13 @@ /* Get the valid iova range */ #define VHOST_VDPA_GET_IOVA_RANGE _IOR(VHOST_VIRTIO, 0x78, \ struct vhost_vdpa_iova_range) - /* Get the config size */ #define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) /* Get the count of all virtqueues */ #define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) +/* Get the number of virtqueue groups. */ +#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) + #endif From a0c95f201170bd559737d3cdc8a950aea62f29c6 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:52 +0530 Subject: [PATCH 22/29] vhost-vdpa: introduce uAPI to get the number of address spaces This patch introduces the uAPI for getting the number of address spaces supported by this vDPA device. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-13-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 3 +++ include/uapi/linux/vhost.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 92f78df0f685..a017011ad1f5 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -563,6 +563,9 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, r = copy_to_user(argp, &v->vdpa->ngroups, sizeof(v->vdpa->ngroups)); break; + case VHOST_VDPA_GET_AS_NUM: + r = copy_to_user(argp, &v->vdpa->nas, sizeof(v->vdpa->nas)); + break; case VHOST_SET_LOG_BASE: case VHOST_SET_LOG_FD: r = -ENOIOCTLCMD; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 61317c61d768..51322008901a 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -154,4 +154,6 @@ /* Get the number of virtqueue groups. */ #define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) +/* Get the number of address spaces. */ +#define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) #endif From 2d1fcb7758e49fd9caf150f3c70804b95b2ce80c Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:53 +0530 Subject: [PATCH 23/29] vhost-vdpa: uAPI to get virtqueue group id Follows the support for virtqueue group in vDPA. This patches introduces uAPI to get the virtqueue group ID for a specific virtqueue in vhost-vdpa. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-14-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 8 ++++++++ include/uapi/linux/vhost.h | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index a017011ad1f5..aa5cacdc5263 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -465,6 +465,14 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, return -EFAULT; ops->set_vq_ready(vdpa, idx, s.num); return 0; + case VHOST_VDPA_GET_VRING_GROUP: + s.index = idx; + s.num = ops->get_vq_group(vdpa, idx); + if (s.num >= vdpa->ngroups) + return -EIO; + else if (copy_to_user(argp, &s, sizeof(s))) + return -EFAULT; + return 0; case VHOST_GET_VRING_BASE: r = ops->get_vq_state(v->vdpa, idx, &vq_state); if (r) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 51322008901a..668914c87f74 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -156,4 +156,12 @@ /* Get the number of address spaces. */ #define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) + +/* Get the group for a virtqueue: read index, write group in num, + * The virtqueue index is stored in the index field of + * vhost_vring_state. The group for this specific virtqueue is + * returned via num field of vhost_vring_state. + */ +#define VHOST_VDPA_GET_VRING_GROUP _IOWR(VHOST_VIRTIO, 0x7B, \ + struct vhost_vring_state) #endif From 84d7c8fd3aade2fe79313003ed06ede431ec2a6d Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:54 +0530 Subject: [PATCH 24/29] vhost-vdpa: introduce uAPI to set group ASID Follows the vDPA support for associating ASID to a specific virtqueue group. This patch adds a uAPI to support setting them from userspace. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-15-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 8 ++++++++ include/uapi/linux/vhost.h | 7 +++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index aa5cacdc5263..6c7ee0f18892 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -473,6 +473,14 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, else if (copy_to_user(argp, &s, sizeof(s))) return -EFAULT; return 0; + case VHOST_VDPA_SET_GROUP_ASID: + if (copy_from_user(&s, argp, sizeof(s))) + return -EFAULT; + if (s.num >= vdpa->nas) + return -EINVAL; + if (!ops->set_group_asid) + return -EOPNOTSUPP; + return ops->set_group_asid(vdpa, idx, s.num); case VHOST_GET_VRING_BASE: r = ops->get_vq_state(v->vdpa, idx, &vq_state); if (r) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 668914c87f74..cab645d4a645 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -164,4 +164,11 @@ */ #define VHOST_VDPA_GET_VRING_GROUP _IOWR(VHOST_VIRTIO, 0x7B, \ struct vhost_vring_state) +/* Set the ASID for a virtqueue group. The group index is stored in + * the index field of vhost_vring_state, the ASID associated with this + * group is stored at num field of vhost_vring_state. + */ +#define VHOST_VDPA_SET_GROUP_ASID _IOW(VHOST_VIRTIO, 0x7C, \ + struct vhost_vring_state) + #endif From aaca8373c4b1e010b8d748fc99d929de1bf860b8 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:55 +0530 Subject: [PATCH 25/29] vhost-vdpa: support ASID based IOTLB API This patch extends the vhost-vdpa to support ASID based IOTLB API. The vhost-vdpa device will allocated multiple IOTLBs for vDPA device that supports multiple address spaces. The IOTLBs and vDPA device memory mappings is determined and maintained through ASID. Note that we still don't support vDPA device with more than one address spaces that depends on platform IOMMU. This work will be done by moving the IOMMU logic from vhost-vDPA to vDPA device driver. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-16-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin Includes fixup: vhost-vdpa: Fix some error handling path in vhost_vdpa_process_iotlb_msg() In the error paths introduced by the original patch, a mutex may be left locked. Add the correct goto instead of a direct return. Signed-off-by: Christophe JAILLET Message-Id: <89ef0ae4c26ac3cfa440c71e97e392dcb328ac1b.1653227924.git.christophe.jaillet@wanadoo.fr> Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 111 ++++++++++++++++++++++++++++++++++-------- drivers/vhost/vhost.c | 2 +- 2 files changed, 93 insertions(+), 20 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 6c7ee0f18892..3e86080041fc 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -28,7 +28,8 @@ enum { VHOST_VDPA_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) | - (1ULL << VHOST_BACKEND_F_IOTLB_BATCH), + (1ULL << VHOST_BACKEND_F_IOTLB_BATCH) | + (1ULL << VHOST_BACKEND_F_IOTLB_ASID), }; #define VHOST_VDPA_DEV_MAX (1U << MINORBITS) @@ -57,12 +58,20 @@ struct vhost_vdpa { struct eventfd_ctx *config_ctx; int in_batch; struct vdpa_iova_range range; + u32 batch_asid; }; static DEFINE_IDA(vhost_vdpa_ida); static dev_t vhost_vdpa_major; +static inline u32 iotlb_to_asid(struct vhost_iotlb *iotlb) +{ + struct vhost_vdpa_as *as = container_of(iotlb, struct + vhost_vdpa_as, iotlb); + return as->id; +} + static struct vhost_vdpa_as *asid_to_as(struct vhost_vdpa *v, u32 asid) { struct hlist_head *head = &v->as[asid % VHOST_VDPA_IOTLB_BUCKETS]; @@ -75,6 +84,16 @@ static struct vhost_vdpa_as *asid_to_as(struct vhost_vdpa *v, u32 asid) return NULL; } +static struct vhost_iotlb *asid_to_iotlb(struct vhost_vdpa *v, u32 asid) +{ + struct vhost_vdpa_as *as = asid_to_as(v, asid); + + if (!as) + return NULL; + + return &as->iotlb; +} + static struct vhost_vdpa_as *vhost_vdpa_alloc_as(struct vhost_vdpa *v, u32 asid) { struct hlist_head *head = &v->as[asid % VHOST_VDPA_IOTLB_BUCKETS]; @@ -83,6 +102,9 @@ static struct vhost_vdpa_as *vhost_vdpa_alloc_as(struct vhost_vdpa *v, u32 asid) if (asid_to_as(v, asid)) return NULL; + if (asid >= v->vdpa->nas) + return NULL; + as = kmalloc(sizeof(*as), GFP_KERNEL); if (!as) return NULL; @@ -94,6 +116,17 @@ static struct vhost_vdpa_as *vhost_vdpa_alloc_as(struct vhost_vdpa *v, u32 asid) return as; } +static struct vhost_vdpa_as *vhost_vdpa_find_alloc_as(struct vhost_vdpa *v, + u32 asid) +{ + struct vhost_vdpa_as *as = asid_to_as(v, asid); + + if (as) + return as; + + return vhost_vdpa_alloc_as(v, asid); +} + static int vhost_vdpa_remove_as(struct vhost_vdpa *v, u32 asid) { struct vhost_vdpa_as *as = asid_to_as(v, asid); @@ -692,6 +725,7 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb, struct vhost_dev *dev = &v->vdev; struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; + u32 asid = iotlb_to_asid(iotlb); int r = 0; r = vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1, @@ -700,10 +734,10 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb, return r; if (ops->dma_map) { - r = ops->dma_map(vdpa, 0, iova, size, pa, perm, opaque); + r = ops->dma_map(vdpa, asid, iova, size, pa, perm, opaque); } else if (ops->set_map) { if (!v->in_batch) - r = ops->set_map(vdpa, 0, iotlb); + r = ops->set_map(vdpa, asid, iotlb); } else { r = iommu_map(v->domain, iova, pa, size, perm_to_iommu_flags(perm)); @@ -725,17 +759,24 @@ static void vhost_vdpa_unmap(struct vhost_vdpa *v, { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; + u32 asid = iotlb_to_asid(iotlb); vhost_vdpa_iotlb_unmap(v, iotlb, iova, iova + size - 1); if (ops->dma_map) { - ops->dma_unmap(vdpa, 0, iova, size); + ops->dma_unmap(vdpa, asid, iova, size); } else if (ops->set_map) { if (!v->in_batch) - ops->set_map(vdpa, 0, iotlb); + ops->set_map(vdpa, asid, iotlb); } else { iommu_unmap(v->domain, iova, size); } + + /* If we are in the middle of batch processing, delay the free + * of AS until BATCH_END. + */ + if (!v->in_batch && !iotlb->nmaps) + vhost_vdpa_remove_as(v, asid); } static int vhost_vdpa_va_map(struct vhost_vdpa *v, @@ -943,19 +984,40 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, u32 asid, struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev); struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; - struct vhost_vdpa_as *as = asid_to_as(v, 0); - struct vhost_iotlb *iotlb = &as->iotlb; + struct vhost_iotlb *iotlb = NULL; + struct vhost_vdpa_as *as = NULL; int r = 0; - if (asid != 0) - return -EINVAL; - mutex_lock(&dev->mutex); r = vhost_dev_check_owner(dev); if (r) goto unlock; + if (msg->type == VHOST_IOTLB_UPDATE || + msg->type == VHOST_IOTLB_BATCH_BEGIN) { + as = vhost_vdpa_find_alloc_as(v, asid); + if (!as) { + dev_err(&v->dev, "can't find and alloc asid %d\n", + asid); + r = -EINVAL; + goto unlock; + } + iotlb = &as->iotlb; + } else + iotlb = asid_to_iotlb(v, asid); + + if ((v->in_batch && v->batch_asid != asid) || !iotlb) { + if (v->in_batch && v->batch_asid != asid) { + dev_info(&v->dev, "batch id %d asid %d\n", + v->batch_asid, asid); + } + if (!iotlb) + dev_err(&v->dev, "no iotlb for asid %d\n", asid); + r = -EINVAL; + goto unlock; + } + switch (msg->type) { case VHOST_IOTLB_UPDATE: r = vhost_vdpa_process_iotlb_update(v, iotlb, msg); @@ -964,12 +1026,15 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, u32 asid, vhost_vdpa_unmap(v, iotlb, msg->iova, msg->size); break; case VHOST_IOTLB_BATCH_BEGIN: + v->batch_asid = asid; v->in_batch = true; break; case VHOST_IOTLB_BATCH_END: if (v->in_batch && ops->set_map) - ops->set_map(vdpa, 0, iotlb); + ops->set_map(vdpa, asid, iotlb); v->in_batch = false; + if (!iotlb->nmaps) + vhost_vdpa_remove_as(v, asid); break; default: r = -EINVAL; @@ -1057,9 +1122,17 @@ static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v) static void vhost_vdpa_cleanup(struct vhost_vdpa *v) { + struct vhost_vdpa_as *as; + u32 asid; + vhost_dev_cleanup(&v->vdev); kfree(v->vdev.vqs); - vhost_vdpa_remove_as(v, 0); + + for (asid = 0; asid < v->vdpa->nas; asid++) { + as = asid_to_as(v, asid); + if (as) + vhost_vdpa_remove_as(v, asid); + } } static int vhost_vdpa_open(struct inode *inode, struct file *filep) @@ -1095,12 +1168,9 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false, vhost_vdpa_process_iotlb_msg); - if (!vhost_vdpa_alloc_as(v, 0)) - goto err_alloc_as; - r = vhost_vdpa_alloc_domain(v); if (r) - goto err_alloc_as; + goto err_alloc_domain; vhost_vdpa_set_iova_range(v); @@ -1108,7 +1178,7 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) return 0; -err_alloc_as: +err_alloc_domain: vhost_vdpa_cleanup(v); err: atomic_dec(&v->opened); @@ -1233,8 +1303,11 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa) int minor; int i, r; - /* Only support 1 address space and 1 groups */ - if (vdpa->ngroups != 1 || vdpa->nas != 1) + /* We can't support platform IOMMU device with more than 1 + * group or as + */ + if (!ops->set_map && !ops->dma_map && + (vdpa->ngroups > 1 || vdpa->nas > 1)) return -EOPNOTSUPP; v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index d1e58f976f6e..5022c648d9c0 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1167,7 +1167,7 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, ret = -EINVAL; goto done; } - offset = sizeof(__u16); + offset = 0; } else offset = sizeof(__u32); break; From 05b6976212d4cffde60ff53716c40809ebdf4aee Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:56 +0530 Subject: [PATCH 26/29] vdpa_sim: advertise VIRTIO_NET_F_MTU We've already reported maximum mtu via config space, so let's advertise the feature. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-17-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index d5324f6fd8c7..2d1d8c59d0ea 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -26,7 +26,8 @@ #define DRV_LICENSE "GPL v2" #define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \ - (1ULL << VIRTIO_NET_F_MAC)) + (1ULL << VIRTIO_NET_F_MAC) | \ + (1ULL << VIRTIO_NET_F_MTU)) #define VDPASIM_NET_VQ_NUM 2 From ec103d983bb56037a147dcdd9798cfa4fcdab126 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:57 +0530 Subject: [PATCH 27/29] vdpa_sim: factor out buffer completion logic Wrap up common buffer completion logic in to vdpasim_net_complete Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-18-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 33 +++++++++++++++------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index 2d1d8c59d0ea..f4607172b0b8 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -31,6 +31,22 @@ #define VDPASIM_NET_VQ_NUM 2 +static void vdpasim_net_complete(struct vdpasim_virtqueue *vq, size_t len) +{ + /* Make sure data is wrote before advancing index */ + smp_wmb(); + + vringh_complete_iotlb(&vq->vring, vq->head, len); + + /* Make sure used is visible before rasing the interrupt. */ + smp_wmb(); + + local_bh_disable(); + if (vringh_need_notify_iotlb(&vq->vring) > 0) + vringh_notify(&vq->vring); + local_bh_enable(); +} + static void vdpasim_net_work(struct work_struct *work) { struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); @@ -78,21 +94,8 @@ static void vdpasim_net_work(struct work_struct *work) total_write += write; } - /* Make sure data is wrote before advancing index */ - smp_wmb(); - - vringh_complete_iotlb(&txq->vring, txq->head, 0); - vringh_complete_iotlb(&rxq->vring, rxq->head, total_write); - - /* Make sure used is visible before rasing the interrupt. */ - smp_wmb(); - - local_bh_disable(); - if (vringh_need_notify_iotlb(&txq->vring) > 0) - vringh_notify(&txq->vring); - if (vringh_need_notify_iotlb(&rxq->vring) > 0) - vringh_notify(&rxq->vring); - local_bh_enable(); + vdpasim_net_complete(txq, 0); + vdpasim_net_complete(rxq, total_write); if (++pkts > 4) { schedule_work(&vdpasim->work); From cfe226892913a448e83e7a19db93862baa3cb99c Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:58 +0530 Subject: [PATCH 28/29] vdpa_sim: filter destination mac address This patch implements a simple unicast filter for vDPA simulator. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-19-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 49 ++++++++++++++++++---------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index f4607172b0b8..5fa59d4fddc8 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -47,13 +47,28 @@ static void vdpasim_net_complete(struct vdpasim_virtqueue *vq, size_t len) local_bh_enable(); } +static bool receive_filter(struct vdpasim *vdpasim, size_t len) +{ + bool modern = vdpasim->features & (1ULL << VIRTIO_F_VERSION_1); + size_t hdr_len = modern ? sizeof(struct virtio_net_hdr_v1) : + sizeof(struct virtio_net_hdr); + struct virtio_net_config *vio_config = vdpasim->config; + + if (len < ETH_ALEN + hdr_len) + return false; + + if (!strncmp(vdpasim->buffer + hdr_len, vio_config->mac, ETH_ALEN)) + return true; + + return false; +} + static void vdpasim_net_work(struct work_struct *work) { struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); struct vdpasim_virtqueue *txq = &vdpasim->vqs[1]; struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0]; ssize_t read, write; - size_t total_write; int pkts = 0; int err; @@ -66,36 +81,34 @@ static void vdpasim_net_work(struct work_struct *work) goto out; while (true) { - total_write = 0; err = vringh_getdesc_iotlb(&txq->vring, &txq->out_iov, NULL, &txq->head, GFP_ATOMIC); if (err <= 0) break; + read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov, + vdpasim->buffer, + PAGE_SIZE); + + if (!receive_filter(vdpasim, read)) { + vdpasim_net_complete(txq, 0); + continue; + } + err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->in_iov, &rxq->head, GFP_ATOMIC); if (err <= 0) { - vringh_complete_iotlb(&txq->vring, txq->head, 0); + vdpasim_net_complete(txq, 0); break; } - while (true) { - read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov, - vdpasim->buffer, - PAGE_SIZE); - if (read <= 0) - break; - - write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov, - vdpasim->buffer, read); - if (write <= 0) - break; - - total_write += write; - } + write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov, + vdpasim->buffer, read); + if (write <= 0) + break; vdpasim_net_complete(txq, 0); - vdpasim_net_complete(rxq, total_write); + vdpasim_net_complete(rxq, write); if (++pkts > 4) { schedule_work(&vdpasim->work); From bda324fd037a6b0d44da5699574ce741ca161bc4 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:59 +0530 Subject: [PATCH 29/29] vdpasim: control virtqueue support This patch introduces the control virtqueue support for vDPA simulator. This is a requirement for supporting advanced features like multiqueue. A requirement for control virtqueue is to isolate its memory access from the rx/tx virtqueues. This is because when using vDPA device for VM, the control virqueue is not directly assigned to VM. Userspace (Qemu) will present a shadow control virtqueue to control for recording the device states. The isolation is done via the virtqueue groups and ASID support in vDPA through vhost-vdpa. The simulator is extended to have: 1) three virtqueues: RXVQ, TXVQ and CVQ (control virtqueue) 2) two virtqueue groups: group 0 contains RXVQ and TXVQ; group 1 contains CVQ 3) two address spaces and the simulator simply implements the address spaces by mapping it 1:1 to IOTLB. For the VM use cases, userspace(Qemu) may set AS 0 to group 0 and AS 1 to group 1. So we have: 1) The IOTLB for virtqueue group 0 contains the mappings of guest, so RX and TX can be assigned to guest directly. 2) The IOTLB for virtqueue group 1 contains the mappings of CVQ which is the buffers that allocated and managed by VMM only. So CVQ of vhost-vdpa is visible to VMM only. And Guest can not access the CVQ of vhost-vdpa. For the other use cases, since AS 0 is associated to all virtqueue groups by default. All virtqueues share the same mapping by default. To demonstrate the function, VIRITO_NET_F_CTRL_MACADDR is implemented in the simulator for the driver to set mac address. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-20-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 93 ++++++++++++++++++++++------ drivers/vdpa/vdpa_sim/vdpa_sim.h | 2 + drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 88 +++++++++++++++++++++++++- 3 files changed, 162 insertions(+), 21 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 659e2e2e4b0c..51bd0bafce06 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -96,11 +96,17 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim) { int i; - for (i = 0; i < vdpasim->dev_attr.nvqs; i++) - vdpasim_vq_reset(vdpasim, &vdpasim->vqs[i]); - spin_lock(&vdpasim->iommu_lock); - vhost_iotlb_reset(vdpasim->iommu); + + for (i = 0; i < vdpasim->dev_attr.nvqs; i++) { + vdpasim_vq_reset(vdpasim, &vdpasim->vqs[i]); + vringh_set_iotlb(&vdpasim->vqs[i].vring, &vdpasim->iommu[0], + &vdpasim->iommu_lock); + } + + for (i = 0; i < vdpasim->dev_attr.nas; i++) + vhost_iotlb_reset(&vdpasim->iommu[i]); + spin_unlock(&vdpasim->iommu_lock); vdpasim->features = 0; @@ -145,7 +151,7 @@ static dma_addr_t vdpasim_map_range(struct vdpasim *vdpasim, phys_addr_t paddr, dma_addr = iova_dma_addr(&vdpasim->iova, iova); spin_lock(&vdpasim->iommu_lock); - ret = vhost_iotlb_add_range(vdpasim->iommu, (u64)dma_addr, + ret = vhost_iotlb_add_range(&vdpasim->iommu[0], (u64)dma_addr, (u64)dma_addr + size - 1, (u64)paddr, perm); spin_unlock(&vdpasim->iommu_lock); @@ -161,7 +167,7 @@ static void vdpasim_unmap_range(struct vdpasim *vdpasim, dma_addr_t dma_addr, size_t size) { spin_lock(&vdpasim->iommu_lock); - vhost_iotlb_del_range(vdpasim->iommu, (u64)dma_addr, + vhost_iotlb_del_range(&vdpasim->iommu[0], (u64)dma_addr, (u64)dma_addr + size - 1); spin_unlock(&vdpasim->iommu_lock); @@ -250,8 +256,9 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) else ops = &vdpasim_config_ops; - vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, 1, - 1, dev_attr->name, false); + vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, + dev_attr->ngroups, dev_attr->nas, + dev_attr->name, false); if (IS_ERR(vdpasim)) { ret = PTR_ERR(vdpasim); goto err_alloc; @@ -278,16 +285,20 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) if (!vdpasim->vqs) goto err_iommu; - vdpasim->iommu = vhost_iotlb_alloc(max_iotlb_entries, 0); + vdpasim->iommu = kmalloc_array(vdpasim->dev_attr.nas, + sizeof(*vdpasim->iommu), GFP_KERNEL); if (!vdpasim->iommu) goto err_iommu; + for (i = 0; i < vdpasim->dev_attr.nas; i++) + vhost_iotlb_init(&vdpasim->iommu[i], 0, 0); + vdpasim->buffer = kvmalloc(dev_attr->buffer_size, GFP_KERNEL); if (!vdpasim->buffer) goto err_iommu; for (i = 0; i < dev_attr->nvqs; i++) - vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu, + vringh_set_iotlb(&vdpasim->vqs[i].vring, &vdpasim->iommu[0], &vdpasim->iommu_lock); ret = iova_cache_get(); @@ -401,7 +412,11 @@ static u32 vdpasim_get_vq_align(struct vdpa_device *vdpa) static u32 vdpasim_get_vq_group(struct vdpa_device *vdpa, u16 idx) { - return 0; + /* RX and TX belongs to group 0, CVQ belongs to group 1 */ + if (idx == 2) + return 1; + else + return 0; } static u64 vdpasim_get_device_features(struct vdpa_device *vdpa) @@ -539,20 +554,53 @@ static struct vdpa_iova_range vdpasim_get_iova_range(struct vdpa_device *vdpa) return range; } +static int vdpasim_set_group_asid(struct vdpa_device *vdpa, unsigned int group, + unsigned int asid) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vhost_iotlb *iommu; + int i; + + if (group > vdpasim->dev_attr.ngroups) + return -EINVAL; + + if (asid > vdpasim->dev_attr.nas) + return -EINVAL; + + iommu = &vdpasim->iommu[asid]; + + spin_lock(&vdpasim->lock); + + for (i = 0; i < vdpasim->dev_attr.nvqs; i++) + if (vdpasim_get_vq_group(vdpa, i) == group) + vringh_set_iotlb(&vdpasim->vqs[i].vring, iommu, + &vdpasim->iommu_lock); + + spin_unlock(&vdpasim->lock); + + return 0; +} + static int vdpasim_set_map(struct vdpa_device *vdpa, unsigned int asid, struct vhost_iotlb *iotlb) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); struct vhost_iotlb_map *map; + struct vhost_iotlb *iommu; u64 start = 0ULL, last = 0ULL - 1; int ret; + if (asid >= vdpasim->dev_attr.nas) + return -EINVAL; + spin_lock(&vdpasim->iommu_lock); - vhost_iotlb_reset(vdpasim->iommu); + + iommu = &vdpasim->iommu[asid]; + vhost_iotlb_reset(iommu); for (map = vhost_iotlb_itree_first(iotlb, start, last); map; map = vhost_iotlb_itree_next(map, start, last)) { - ret = vhost_iotlb_add_range(vdpasim->iommu, map->start, + ret = vhost_iotlb_add_range(iommu, map->start, map->last, map->addr, map->perm); if (ret) goto err; @@ -561,7 +609,7 @@ static int vdpasim_set_map(struct vdpa_device *vdpa, unsigned int asid, return 0; err: - vhost_iotlb_reset(vdpasim->iommu); + vhost_iotlb_reset(iommu); spin_unlock(&vdpasim->iommu_lock); return ret; } @@ -573,9 +621,12 @@ static int vdpasim_dma_map(struct vdpa_device *vdpa, unsigned int asid, struct vdpasim *vdpasim = vdpa_to_sim(vdpa); int ret; + if (asid >= vdpasim->dev_attr.nas) + return -EINVAL; + spin_lock(&vdpasim->iommu_lock); - ret = vhost_iotlb_add_range_ctx(vdpasim->iommu, iova, iova + size - 1, - pa, perm, opaque); + ret = vhost_iotlb_add_range_ctx(&vdpasim->iommu[asid], iova, + iova + size - 1, pa, perm, opaque); spin_unlock(&vdpasim->iommu_lock); return ret; @@ -586,8 +637,11 @@ static int vdpasim_dma_unmap(struct vdpa_device *vdpa, unsigned int asid, { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + if (asid >= vdpasim->dev_attr.nas) + return -EINVAL; + spin_lock(&vdpasim->iommu_lock); - vhost_iotlb_del_range(vdpasim->iommu, iova, iova + size - 1); + vhost_iotlb_del_range(&vdpasim->iommu[asid], iova, iova + size - 1); spin_unlock(&vdpasim->iommu_lock); return 0; @@ -611,8 +665,7 @@ static void vdpasim_free(struct vdpa_device *vdpa) } kvfree(vdpasim->buffer); - if (vdpasim->iommu) - vhost_iotlb_free(vdpasim->iommu); + vhost_iotlb_free(vdpasim->iommu); kfree(vdpasim->vqs); kfree(vdpasim->config); } @@ -643,6 +696,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = { .set_config = vdpasim_set_config, .get_generation = vdpasim_get_generation, .get_iova_range = vdpasim_get_iova_range, + .set_group_asid = vdpasim_set_group_asid, .dma_map = vdpasim_dma_map, .dma_unmap = vdpasim_dma_unmap, .free = vdpasim_free, @@ -674,6 +728,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .set_config = vdpasim_set_config, .get_generation = vdpasim_get_generation, .get_iova_range = vdpasim_get_iova_range, + .set_group_asid = vdpasim_set_group_asid, .set_map = vdpasim_set_map, .free = vdpasim_free, }; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index 0be7c1e7ef80..622782e92239 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -41,6 +41,8 @@ struct vdpasim_dev_attr { size_t buffer_size; int nvqs; u32 id; + u32 ngroups; + u32 nas; work_func_t work_fn; void (*get_config)(struct vdpasim *vdpasim, void *config); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index 5fa59d4fddc8..5125976a4df8 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -27,9 +27,14 @@ #define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \ (1ULL << VIRTIO_NET_F_MAC) | \ - (1ULL << VIRTIO_NET_F_MTU)) + (1ULL << VIRTIO_NET_F_MTU) | \ + (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ + (1ULL << VIRTIO_NET_F_CTRL_MAC_ADDR)) -#define VDPASIM_NET_VQ_NUM 2 +/* 3 virtqueues, 2 address spaces, 2 virtqueue groups */ +#define VDPASIM_NET_VQ_NUM 3 +#define VDPASIM_NET_AS_NUM 2 +#define VDPASIM_NET_GROUP_NUM 2 static void vdpasim_net_complete(struct vdpasim_virtqueue *vq, size_t len) { @@ -63,6 +68,81 @@ static bool receive_filter(struct vdpasim *vdpasim, size_t len) return false; } +static virtio_net_ctrl_ack vdpasim_handle_ctrl_mac(struct vdpasim *vdpasim, + u8 cmd) +{ + struct virtio_net_config *vio_config = vdpasim->config; + struct vdpasim_virtqueue *cvq = &vdpasim->vqs[2]; + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; + size_t read; + + switch (cmd) { + case VIRTIO_NET_CTRL_MAC_ADDR_SET: + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->in_iov, + vio_config->mac, ETH_ALEN); + if (read == ETH_ALEN) + status = VIRTIO_NET_OK; + break; + default: + break; + } + + return status; +} + +static void vdpasim_handle_cvq(struct vdpasim *vdpasim) +{ + struct vdpasim_virtqueue *cvq = &vdpasim->vqs[2]; + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; + struct virtio_net_ctrl_hdr ctrl; + size_t read, write; + int err; + + if (!(vdpasim->features & (1ULL << VIRTIO_NET_F_CTRL_VQ))) + return; + + if (!cvq->ready) + return; + + while (true) { + err = vringh_getdesc_iotlb(&cvq->vring, &cvq->in_iov, + &cvq->out_iov, + &cvq->head, GFP_ATOMIC); + if (err <= 0) + break; + + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->in_iov, &ctrl, + sizeof(ctrl)); + if (read != sizeof(ctrl)) + break; + + switch (ctrl.class) { + case VIRTIO_NET_CTRL_MAC: + status = vdpasim_handle_ctrl_mac(vdpasim, ctrl.cmd); + break; + default: + break; + } + + /* Make sure data is wrote before advancing index */ + smp_wmb(); + + write = vringh_iov_push_iotlb(&cvq->vring, &cvq->out_iov, + &status, sizeof(status)); + vringh_complete_iotlb(&cvq->vring, cvq->head, write); + vringh_kiov_cleanup(&cvq->in_iov); + vringh_kiov_cleanup(&cvq->out_iov); + + /* Make sure used is visible before rasing the interrupt. */ + smp_wmb(); + + local_bh_disable(); + if (cvq->cb) + cvq->cb(cvq->private); + local_bh_enable(); + } +} + static void vdpasim_net_work(struct work_struct *work) { struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); @@ -77,6 +157,8 @@ static void vdpasim_net_work(struct work_struct *work) if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) goto out; + vdpasim_handle_cvq(vdpasim); + if (!txq->ready || !rxq->ready) goto out; @@ -162,6 +244,8 @@ static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, dev_attr.id = VIRTIO_ID_NET; dev_attr.supported_features = VDPASIM_NET_FEATURES; dev_attr.nvqs = VDPASIM_NET_VQ_NUM; + dev_attr.ngroups = VDPASIM_NET_GROUP_NUM; + dev_attr.nas = VDPASIM_NET_AS_NUM; dev_attr.config_size = sizeof(struct virtio_net_config); dev_attr.get_config = vdpasim_net_get_config; dev_attr.work_fn = vdpasim_net_work;