ACK: [SRU][Lunar][PATCH 1/1] UBUNTU: SAUCE: Add mdev_set_iommu_device() kABI.

Andrei Gherzan andrei.gherzan at canonical.com
Tue Apr 25 21:06:19 UTC 2023


On 23/04/26 01:15AM, Tarun Gupta wrote:
> With below commit present from 5.16+ upstream kernel onwards, support
> mdev_set_iommu_device() kABI has been removed from kernel due to lack of
> in-tree vendor drivers using the kABI.
> 
> fda49d97f2c4 ("vfio: remove the unused mdev iommu hook")
> 
> This patch partially reverts the above commit so that
> mdev_set_iommu_device() kABI is still supported with HWE kernels for
> Ubuntu 22.04. In this partial revert, have not added back the code for
> "aux" variants (IOMMU_DEV_FEAT_AUX) present in
> vfio_mdev_[attach|detach]_domain as this support was never added by any
> in-tree driver or known out-of-tree driver. Nvidia vGPU doesn't make use
> of IOMMU_DEV_FEAT_AUX feature.
> 
> Also, it adds back the vfio_bus_is_mdev() function which was reverted in
> below patch as there were no users of it. This patch adds it back to
> detect if this is an mdev device.
> 
> c3c0fa9d94f7 ("vfio: clean up the check for mediated device in
> vfio_iommu_type1")
> 
> Also, in v6.2 kernel, "mdev_bus_type" struct has been unexported as
> part of below commit because it was used only in mdev.ko. But, for
> vGPU, as mentioned above, since we use vfio_bus_is_mdev() fn
> in vfio_iommu_type1.ko, we need to again export "mdev_bus_type" struct.
> 
> 2815fe149ffa ("vfio/mdev: unexport mdev_bus_type")
> 
> It is not a clean revert in vfio_iommu_type1_attach_group() fn as it is
> changed in v6.2 upstream kernel compared to when
> mdev_set_iommu_device() kABI was removed in 5.16 kernel.
> In 5.19 kernel, VFIO_EMULATED_IOMMU handling is introduced in
> vfio_iommu_type1_attach_group() fn which was not present when the patch
> was reverted in 5.16 kernel.
> 
> But, the logic remains the same. The logic here is if this is an
> vfio-mdev device, then check by calling vfio_mdev_iommu_device() if this
> mdev device already has an backed IOMMU device (which will be provided
> by mdev_set_iommu_device kABI from vendor driver). If the mdev device
> has backed iommu device, then use that device's IOMMU domain.
> 
> This kABI is used by SRIOV based Nvidia vGPU to pin all guest sysmem on
> VF during vGPU VM boot. With this patch, SRIOV based Nvidia vGPU will
> continue to work with upstream kernels. Nvidia vGPU driver calls
> mdev_set_iommu_device() for mdev device with VF as the backing IOMMU.
> 
> BugLink : https://bugs.launchpad.net/bugs/1988806
> 
> Signed-off-by: Tarun Gupta <targupta at nvidia.com>

Acked-by: Andrei Gherzan <andrei.gherzan at canonical.com>

> ---
>  drivers/vfio/mdev/mdev_driver.c  |   1 +
>  drivers/vfio/mdev/mdev_private.h |   1 -
>  drivers/vfio/vfio_iommu_type1.c  | 126 ++++++++++++++++++++++++++++---
>  include/linux/mdev.h             |  22 ++++++
>  4 files changed, 140 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
> index 7825d83a55f8..a4799e7d79fc 100644
> --- a/drivers/vfio/mdev/mdev_driver.c
> +++ b/drivers/vfio/mdev/mdev_driver.c
> @@ -46,6 +46,7 @@ struct bus_type mdev_bus_type = {
>  	.remove		= mdev_remove,
>  	.match		= mdev_match,
>  };
> +EXPORT_SYMBOL_GPL(mdev_bus_type);
>  
>  /**
>   * mdev_register_driver - register a new MDEV driver
> diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
> index af457b27f607..ba1b2dbddc0b 100644
> --- a/drivers/vfio/mdev/mdev_private.h
> +++ b/drivers/vfio/mdev/mdev_private.h
> @@ -13,7 +13,6 @@
>  int  mdev_bus_register(void);
>  void mdev_bus_unregister(void);
>  
> -extern struct bus_type mdev_bus_type;
>  extern const struct attribute_group *mdev_device_groups[];
>  
>  #define to_mdev_type_attr(_attr)	\
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 7fa68dc4e938..fef221a87aa7 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -36,6 +36,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/vfio.h>
>  #include <linux/workqueue.h>
> +#include <linux/mdev.h>
>  #include <linux/notifier.h>
>  #include <linux/irqdomain.h>
>  #include "vfio.h"
> @@ -115,6 +116,7 @@ struct vfio_batch {
>  struct vfio_iommu_group {
>  	struct iommu_group	*iommu_group;
>  	struct list_head	next;
> +	bool			mdev_group;
>  	bool			pinned_page_dirty_scope;
>  };
>  
> @@ -1744,6 +1746,18 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	return ret;
>  }
>  
> +static int vfio_bus_type(struct device *dev, void *data)
> +{
> +	struct bus_type **bus = data;
> +
> +	if (*bus && *bus != dev->bus)
> +		return -EINVAL;
> +
> +	*bus = dev->bus;
> +
> +	return 0;
> +}
> +
>  static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  			     struct vfio_domain *domain)
>  {
> @@ -1992,6 +2006,81 @@ static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
>  	return ret;
>  }
>  
> +static int vfio_mdev_attach_domain(struct device *dev, void *data)
> +{
> +	struct mdev_device *mdev = to_mdev_device(dev);
> +	struct iommu_domain *domain = data;
> +	struct device *iommu_device;
> +
> +	iommu_device = mdev_get_iommu_device(mdev);
> +	if (iommu_device)
> +		return iommu_attach_device(domain, iommu_device);
> +
> +	return -EINVAL;
> +}
> +
> +static int vfio_mdev_detach_domain(struct device *dev, void *data)
> +{
> +	struct mdev_device *mdev = to_mdev_device(dev);
> +	struct iommu_domain *domain = data;
> +	struct device *iommu_device;
> +
> +	iommu_device = mdev_get_iommu_device(mdev);
> +	if (iommu_device)
> +		iommu_detach_device(domain, iommu_device);
> +
> +	return 0;
> +}
> +
> +static int vfio_iommu_attach_group(struct vfio_domain *domain,
> +				   struct vfio_iommu_group *group)
> +{
> +	if (group->mdev_group)
> +		return iommu_group_for_each_dev(group->iommu_group,
> +						domain->domain,
> +						vfio_mdev_attach_domain);
> +	else
> +		return iommu_attach_group(domain->domain, group->iommu_group);
> +}
> +
> +static void vfio_iommu_detach_group(struct vfio_domain *domain,
> +				    struct vfio_iommu_group *group)
> +{
> +	if (group->mdev_group)
> +		iommu_group_for_each_dev(group->iommu_group, domain->domain,
> +					 vfio_mdev_detach_domain);
> +	else
> +		iommu_detach_group(domain->domain, group->iommu_group);
> +}
> +
> +static bool vfio_bus_is_mdev(struct bus_type *bus)
> +{
> +	struct bus_type *mdev_bus;
> +	bool ret = false;
> +
> +	mdev_bus = symbol_get(mdev_bus_type);
> +	if (mdev_bus) {
> +		ret = (bus == mdev_bus);
> +		symbol_put(mdev_bus_type);
> +	}
> +
> +	return ret;
> +}
> +
> +static int vfio_mdev_iommu_device(struct device *dev, void *data)
> +{
> +	struct mdev_device *mdev = to_mdev_device(dev);
> +	struct device **old = data, *new;
> +
> +	new = mdev_get_iommu_device(mdev);
> +	if (!new || (*old && *old != new))
> +		return -EINVAL;
> +
> +	*old = new;
> +
> +	return 0;
> +}
> +
>  /*
>   * This is a helper function to insert an address range to iova list.
>   * The list is initially created with a single entry corresponding to
> @@ -2260,6 +2349,25 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	group->iommu_group = iommu_group;
>  
>  	if (type == VFIO_EMULATED_IOMMU) {
> +		struct bus_type *bus = NULL;
> +
> +		ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
> +
> +		if (!ret && vfio_bus_is_mdev(bus)) {
> +			struct device *iommu_device = NULL;
> +
> +			group->mdev_group = true;
> +
> +			/* Determine the isolation type */
> +			ret = iommu_group_for_each_dev(iommu_group,
> +						       &iommu_device,
> +						       vfio_mdev_iommu_device);
> +			if (!ret && iommu_device) {
> +				iommu_group = iommu_device->iommu_group;
> +				goto mdev_iommu_device;
> +			}
> +		}
> +
>  		list_add(&group->next, &iommu->emulated_iommu_groups);
>  		/*
>  		 * An emulated IOMMU group cannot dirty memory directly, it can
> @@ -2272,6 +2380,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  		goto out_unlock;
>  	}
>  
> +mdev_iommu_device:
> +
>  	ret = -ENOMEM;
>  	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
>  	if (!domain)
> @@ -2294,7 +2404,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  			goto out_domain;
>  	}
>  
> -	ret = iommu_attach_group(domain->domain, group->iommu_group);
> +	ret = vfio_iommu_attach_group(domain, group);
>  	if (ret)
>  		goto out_domain;
>  
> @@ -2370,17 +2480,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  		if (d->domain->ops == domain->domain->ops &&
>  		    d->enforce_cache_coherency ==
>  			    domain->enforce_cache_coherency) {
> -			iommu_detach_group(domain->domain, group->iommu_group);
> -			if (!iommu_attach_group(d->domain,
> -						group->iommu_group)) {
> +			vfio_iommu_detach_group(domain, group);
> +			if (!vfio_iommu_attach_group(d, group)) {
>  				list_add(&group->next, &d->group_list);
>  				iommu_domain_free(domain->domain);
>  				kfree(domain);
>  				goto done;
>  			}
>  
> -			ret = iommu_attach_group(domain->domain,
> -						 group->iommu_group);
> +			ret = vfio_iommu_attach_group(domain, group);
>  			if (ret)
>  				goto out_domain;
>  		}
> @@ -2417,7 +2525,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	return 0;
>  
>  out_detach:
> -	iommu_detach_group(domain->domain, group->iommu_group);
> +	vfio_iommu_detach_group(domain, group);
>  out_domain:
>  	iommu_domain_free(domain->domain);
>  	vfio_iommu_iova_free(&iova_copy);
> @@ -2578,7 +2686,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  		if (!group)
>  			continue;
>  
> -		iommu_detach_group(domain->domain, group->iommu_group);
> +		vfio_iommu_detach_group(domain, group);
>  		update_dirty_scope = !group->pinned_page_dirty_scope;
>  		list_del(&group->next);
>  		kfree(group);
> @@ -2669,7 +2777,7 @@ static void vfio_release_domain(struct vfio_domain *domain)
>  
>  	list_for_each_entry_safe(group, group_tmp,
>  				 &domain->group_list, next) {
> -		iommu_detach_group(domain->domain, group->iommu_group);
> +		vfio_iommu_detach_group(domain, group);
>  		list_del(&group->next);
>  		kfree(group);
>  	}
> diff --git a/include/linux/mdev.h b/include/linux/mdev.h
> index 139d05b26f82..b08163d67e63 100644
> --- a/include/linux/mdev.h
> +++ b/include/linux/mdev.h
> @@ -20,6 +20,7 @@ struct mdev_device {
>  	guid_t uuid;
>  	struct list_head next;
>  	struct mdev_type *type;
> +	struct device *iommu_device;
>  	bool active;
>  };
>  
> @@ -53,6 +54,25 @@ static inline struct mdev_device *to_mdev_device(struct device *dev)
>  	return container_of(dev, struct mdev_device, dev);
>  }
>  
> +/*
> + * Called by the parent device driver to set the device which represents
> + * this mdev in iommu protection scope. By default, the iommu device is
> + * NULL, that indicates using vendor defined isolation.
> + *
> + * @dev: the mediated device that iommu will isolate.
> + * @iommu_device: a pci device which represents the iommu for @dev.
> + */
> +static inline void mdev_set_iommu_device(struct mdev_device *mdev,
> +					 struct device *iommu_device)
> +{
> +	mdev->iommu_device = iommu_device;
> +}
> +
> +static inline struct device *mdev_get_iommu_device(struct mdev_device *mdev)
> +{
> +	return mdev->iommu_device;
> +}
> +
>  /**
>   * struct mdev_driver - Mediated device driver
>   * @device_api: string to return for the device_api sysfs
> @@ -73,6 +93,8 @@ struct mdev_driver {
>  	struct device_driver driver;
>  };
>  
> +extern struct bus_type mdev_bus_type;
> +
>  int mdev_register_parent(struct mdev_parent *parent, struct device *dev,
>  		struct mdev_driver *mdev_driver, struct mdev_type **types,
>  		unsigned int nr_types);
> -- 
> 2.31.1

-- 
Andrei Gherzan
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 833 bytes
Desc: not available
URL: <https://lists.ubuntu.com/archives/kernel-team/attachments/20230425/ac9127af/attachment.sig>


More information about the kernel-team mailing list