Centos-kernel-stream-9/drivers/infiniband/hw/mlx5/main.c

4965 lines
128 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
* Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
* Copyright (c) 2020, Intel Corporation. All rights reserved.
*/
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/delay.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include <linux/mlx5/port.h>
#include <linux/mlx5/vport.h>
#include <linux/mlx5/fs.h>
#include <linux/mlx5/eswitch.h>
#include <linux/mlx5/driver.h>
#include <linux/list.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_umem_odp.h>
#include <rdma/lag.h>
#include <linux/in.h>
#include <linux/etherdevice.h>
#include "mlx5_ib.h"
#include "ib_rep.h"
#include "cmd.h"
#include "devx.h"
#include "dm.h"
#include "fs.h"
#include "srq.h"
#include "qp.h"
#include "wr.h"
#include "restrack.h"
#include "counters.h"
#include "umr.h"
#include <rdma/uverbs_std_types.h>
#include <rdma/uverbs_ioctl.h>
#include <rdma/mlx5_user_ioctl_verbs.h>
#include <rdma/mlx5_user_ioctl_cmds.h>
#include "macsec.h"
RDMA/mlx5: Introduce the 'data direct' driver JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 6910e3660d86c1a5654f742a40181d2c9154f26f Author: Yishai Hadas <yishaih@nvidia.com> Date: Thu Aug 1 15:05:11 2024 +0300 RDMA/mlx5: Introduce the 'data direct' driver Introduce the 'data direct' driver for a ConnectX-8 Data Direct device. The 'data direct' driver functions as the affiliated DMA device for one or more capable mlx5_ib devices. This DMA device, as the name suggests, is used exclusively for DMA operations. It can be considered a DMA engine managed by a PF/VF, lacking network capabilities and having minimal overall capabilities. Consequently, the DMA NIC PF will not be exposed to or directly used by software applications. The driver will not have any direct interface or interaction with the firmware (no command interface, no capabilities, etc.). It will operate solely over PCI to enable its DMA functionality. Registration and un-registration of the driver are handled as part of the mlx5_ib initialization and exit processes, as the mlx5_ib devices will effectively be its clients. The driver will serve as the DMA device for accessing another PCI device to achieve optimal performance (both on the same NUMA node, P2P access, etc.). Upon probing, it will read its VUID over PCI to handle mlx5_ib device registrations with the same VUID. Upon removal, it will notify its clients to allow them to clean up the resources that were mmaped with its DMA device. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://patch.msgid.link/b77edecfd476c3f445da96ab6aef499ae47b2829.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:08 +00:00
#include "data_direct.h"
#define UVERBS_MODULE_NAME mlx5_ib
#include <rdma/uverbs_named_ioctl.h>
MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) IB driver");
MODULE_LICENSE("Dual BSD/GPL");
struct mlx5_ib_event_work {
struct work_struct work;
union {
struct mlx5_ib_dev *dev;
struct mlx5_ib_multiport_info *mpi;
};
bool is_slave;
unsigned int event;
void *param;
};
enum {
MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
};
static struct workqueue_struct *mlx5_ib_event_wq;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
static LIST_HEAD(mlx5_ib_unaffiliated_port_list);
static LIST_HEAD(mlx5_ib_dev_list);
/*
* This mutex should be held when accessing either of the above lists
*/
static DEFINE_MUTEX(mlx5_ib_multiport_mutex);
struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi)
{
struct mlx5_ib_dev *dev;
mutex_lock(&mlx5_ib_multiport_mutex);
dev = mpi->ibdev;
mutex_unlock(&mlx5_ib_multiport_mutex);
return dev;
}
static enum rdma_link_layer
mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
{
switch (port_type_cap) {
case MLX5_CAP_PORT_TYPE_IB:
return IB_LINK_LAYER_INFINIBAND;
case MLX5_CAP_PORT_TYPE_ETH:
return IB_LINK_LAYER_ETHERNET;
default:
return IB_LINK_LAYER_UNSPECIFIED;
}
}
static enum rdma_link_layer
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
mlx5_ib_port_link_layer(struct ib_device *device, u32 port_num)
{
struct mlx5_ib_dev *dev = to_mdev(device);
int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
}
static int get_port_state(struct ib_device *ibdev,
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 port_num,
enum ib_port_state *state)
{
struct ib_port_attr attr;
int ret;
memset(&attr, 0, sizeof(attr));
ret = ibdev->ops.query_port(ibdev, port_num, &attr);
if (!ret)
*state = attr.state;
return ret;
}
static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
struct net_device *ndev,
struct net_device *upper,
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 *port_num)
{
struct net_device *rep_ndev;
struct mlx5_ib_port *port;
int i;
for (i = 0; i < dev->num_ports; i++) {
port = &dev->port[i];
if (!port->rep)
continue;
if (upper == ndev && port->rep->vport == MLX5_VPORT_UPLINK) {
*port_num = i + 1;
return &port->roce;
}
if (upper && port->rep->vport == MLX5_VPORT_UPLINK)
continue;
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
rep_ndev = ib_device_get_netdev(&dev->ib_dev, i + 1);
if (rep_ndev && rep_ndev == ndev) {
dev_put(rep_ndev);
*port_num = i + 1;
return &port->roce;
}
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
dev_put(rep_ndev);
}
return NULL;
}
static bool mlx5_netdev_send_event(struct mlx5_ib_dev *dev,
struct net_device *ndev,
struct net_device *upper,
struct net_device *ib_ndev)
{
if (!dev->ib_active)
return false;
/* Event is about our upper device */
if (upper == ndev)
return true;
/* RDMA device is not in lag and not in switchdev */
if (!dev->is_rep && !upper && ndev == ib_ndev)
return true;
/* RDMA devie is in switchdev */
if (dev->is_rep && ndev == ib_ndev)
return true;
return false;
}
static struct net_device *mlx5_ib_get_rep_uplink_netdev(struct mlx5_ib_dev *ibdev)
{
struct mlx5_ib_port *port;
int i;
for (i = 0; i < ibdev->num_ports; i++) {
port = &ibdev->port[i];
if (port->rep && port->rep->vport == MLX5_VPORT_UPLINK) {
return ib_device_get_netdev(&ibdev->ib_dev, i + 1);
}
}
return NULL;
}
static int mlx5_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb);
struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 port_num = roce->native_port_num;
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
struct net_device *ib_ndev = NULL;
struct mlx5_core_dev *mdev;
struct mlx5_ib_dev *ibdev;
ibdev = roce->dev;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
if (!mdev)
return NOTIFY_DONE;
switch (event) {
case NETDEV_REGISTER:
/* Should already be registered during the load */
if (ibdev->is_rep)
break;
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
/* Exit if already registered */
if (ib_ndev)
goto put_ndev;
5.2 Merge Window pull request This has been a smaller cycle than normal. One new driver was accepted, which is unusual, and at least one more driver remains in review on the list. - Driver fixes for hns, hfi1, nes, rxe, i40iw, mlx5, cxgb4, vmw_pvrdma - Many patches from MatthewW converting radix tree and IDR users to use xarray - Introduction of tracepoints to the MAD layer - Build large SGLs at the start for DMA mapping and get the driver to split them - Generally clean SGL handling code throughout the subsystem - Support for restricting RDMA devices to net namespaces for containers - Progress to remove object allocation boilerplate code from drivers - Change in how the mlx5 driver shows representor ports linked to VFs - mlx5 uapi feature to access the on chip SW ICM memory - Add a new driver for 'EFA'. This is HW that supports user space packet processing through QPs in Amazon's cloud -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEfB7FMLh+8QxL+6i3OG33FX4gmxoFAlzTIU0ACgkQOG33FX4g mxrGKQ/8CqpyvuCyZDW5ovO4DI4YlzYSPXehWlwxA4CWhU1AYTujutnNOdZdngnz atTthOlJpZWJV26orvvzwIOi4qX/5UjLXEY3HYdn07JP1Z4iT7E3P4W2sdU3vdl3 j8bU7xM7ZWmnGxrBZ6yQlVRadEhB8+HJIZWMw+wx66cIPnvU+g9NgwouH67HEEQ3 PU8OCtGBwNNR508WPiZhjqMDfi/3BED4BfCihFhMbZEgFgObjRgtCV0M33SSXKcR IO2FGNVuDAUBlND3vU9guW1+M77xE6p1GvzkIgdCp6qTc724NuO5F2ngrpHKRyZT CxvBhAJI6tAZmjBVnmgVJex7rA8p+y/8M/2WD6GE3XSO89XVOkzNBiO2iTMeoxXr +CX6VvP2BWwCArxsfKMgW3j0h/WVE9w8Ciej1628m1NvvKEV4AGIJC1g93lIJkRN i3RkJ5PkIrdBrTEdKwDu1FdXQHaO7kGgKvwzJ7wBFhso8BRMrMfdULiMbaXs2Bw1 WdL5zoSe/bLUpPZxcT9IjXRxY5qR0FpIOoo6925OmvyYe/oZo1zbitS5GGbvV90g tkq6Jb+aq8ZKtozwCo+oMcg9QPLYNibQsnkL3QirtURXWCG467xdgkaJLdF6s5Oh cp+YBqbR/8HNMG/KQlCfnNQKp1ci8mG3EdthQPhvdcZ4jtbqnSI= =TS64 -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma Pull rdma updates from Jason Gunthorpe: "This has been a smaller cycle than normal. One new driver was accepted, which is unusual, and at least one more driver remains in review on the list. Summary: - Driver fixes for hns, hfi1, nes, rxe, i40iw, mlx5, cxgb4, vmw_pvrdma - Many patches from MatthewW converting radix tree and IDR users to use xarray - Introduction of tracepoints to the MAD layer - Build large SGLs at the start for DMA mapping and get the driver to split them - Generally clean SGL handling code throughout the subsystem - Support for restricting RDMA devices to net namespaces for containers - Progress to remove object allocation boilerplate code from drivers - Change in how the mlx5 driver shows representor ports linked to VFs - mlx5 uapi feature to access the on chip SW ICM memory - Add a new driver for 'EFA'. This is HW that supports user space packet processing through QPs in Amazon's cloud" * tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (186 commits) RDMA/ipoib: Allow user space differentiate between valid dev_port IB/core, ipoib: Do not overreact to SM LID change event RDMA/device: Don't fire uevent before device is fully initialized lib/scatterlist: Remove leftover from sg_page_iter comment RDMA/efa: Add driver to Kconfig/Makefile RDMA/efa: Add the efa module RDMA/efa: Add EFA verbs implementation RDMA/efa: Add common command handlers RDMA/efa: Implement functions that submit and complete admin commands RDMA/efa: Add the ABI definitions RDMA/efa: Add the com service API definitions RDMA/efa: Add the efa_com.h file RDMA/efa: Add the efa.h header file RDMA/efa: Add EFA device definitions RDMA: Add EFA related definitions RDMA/umem: Remove hugetlb flag RDMA/bnxt_re: Use core helpers to get aligned DMA address RDMA/i40iw: Use core helpers to get aligned DMA address within a supported page size RDMA/verbs: Add a DMA iterator to return aligned contiguous memory blocks RDMA/umem: Add API to find best driver supported page size in an MR ...
2019-05-09 16:02:46 +00:00
if (ndev->dev.parent == mdev->device)
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
ib_device_set_netdev(&ibdev->ib_dev, ndev, port_num);
break;
case NETDEV_UNREGISTER:
/* In case of reps, ib device goes away before the netdevs */
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
if (ibdev->is_rep)
break;
ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
if (ib_ndev == ndev)
ib_device_set_netdev(&ibdev->ib_dev, NULL, port_num);
goto put_ndev;
case NETDEV_CHANGE:
case NETDEV_UP:
case NETDEV_DOWN: {
struct net_device *upper = NULL;
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) {
struct net_device *lag_ndev;
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
if(mlx5_lag_is_roce(mdev))
lag_ndev = ib_device_get_netdev(&ibdev->ib_dev, 1);
else /* sriov lag */
lag_ndev = mlx5_ib_get_rep_uplink_netdev(ibdev);
if (lag_ndev) {
upper = netdev_master_upper_dev_get(lag_ndev);
dev_put(lag_ndev);
} else {
goto done;
}
}
if (ibdev->is_rep)
roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num);
if (!roce)
return NOTIFY_DONE;
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
if (mlx5_netdev_send_event(ibdev, ndev, upper, ib_ndev)) {
struct ib_event ibev = { };
enum ib_port_state port_state;
if (get_port_state(&ibdev->ib_dev, port_num,
&port_state))
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
goto put_ndev;
if (roce->last_port_state == port_state)
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
goto put_ndev;
roce->last_port_state = port_state;
ibev.device = &ibdev->ib_dev;
if (port_state == IB_PORT_DOWN)
ibev.event = IB_EVENT_PORT_ERR;
else if (port_state == IB_PORT_ACTIVE)
ibev.event = IB_EVENT_PORT_ACTIVE;
else
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
goto put_ndev;
ibev.element.port_num = port_num;
ib_dispatch_event(&ibev);
}
break;
}
default:
break;
}
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
put_ndev:
dev_put(ib_ndev);
done:
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
mlx5_ib_put_native_port_mdev(ibdev, port_num);
return NOTIFY_DONE;
}
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 ib_port_num,
u32 *native_port_num)
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
{
enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
ib_port_num);
struct mlx5_core_dev *mdev = NULL;
struct mlx5_ib_multiport_info *mpi;
struct mlx5_ib_port *port;
if (ibdev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) {
if (native_port_num)
*native_port_num = smi_to_native_portnum(ibdev,
ib_port_num);
return ibdev->mdev;
}
if (!mlx5_core_mp_enabled(ibdev->mdev) ||
ll != IB_LINK_LAYER_ETHERNET) {
if (native_port_num)
*native_port_num = ib_port_num;
return ibdev->mdev;
}
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
if (native_port_num)
*native_port_num = 1;
port = &ibdev->port[ib_port_num - 1];
spin_lock(&port->mp.mpi_lock);
mpi = ibdev->port[ib_port_num - 1].mp.mpi;
if (mpi && !mpi->unaffiliate) {
mdev = mpi->mdev;
/* If it's the master no need to refcount, it'll exist
* as long as the ib_dev exists.
*/
if (!mpi->is_master)
mpi->mdev_refcnt++;
}
spin_unlock(&port->mp.mpi_lock);
return mdev;
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u32 port_num)
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
{
enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
port_num);
struct mlx5_ib_multiport_info *mpi;
struct mlx5_ib_port *port;
if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
return;
port = &ibdev->port[port_num - 1];
spin_lock(&port->mp.mpi_lock);
mpi = ibdev->port[port_num - 1].mp.mpi;
if (mpi->is_master)
goto out;
mpi->mdev_refcnt--;
if (mpi->unaffiliate)
complete(&mpi->unref_comp);
out:
spin_unlock(&port->mp.mpi_lock);
}
static int translate_eth_legacy_proto_oper(u32 eth_proto_oper,
u16 *active_speed, u8 *active_width)
{
switch (eth_proto_oper) {
case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII):
case MLX5E_PROT_MASK(MLX5E_1000BASE_KX):
case MLX5E_PROT_MASK(MLX5E_100BASE_TX):
case MLX5E_PROT_MASK(MLX5E_1000BASE_T):
*active_width = IB_WIDTH_1X;
*active_speed = IB_SPEED_SDR;
break;
case MLX5E_PROT_MASK(MLX5E_10GBASE_T):
case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4):
case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4):
case MLX5E_PROT_MASK(MLX5E_10GBASE_KR):
case MLX5E_PROT_MASK(MLX5E_10GBASE_CR):
case MLX5E_PROT_MASK(MLX5E_10GBASE_SR):
case MLX5E_PROT_MASK(MLX5E_10GBASE_ER):
*active_width = IB_WIDTH_1X;
*active_speed = IB_SPEED_QDR;
break;
case MLX5E_PROT_MASK(MLX5E_25GBASE_CR):
case MLX5E_PROT_MASK(MLX5E_25GBASE_KR):
case MLX5E_PROT_MASK(MLX5E_25GBASE_SR):
*active_width = IB_WIDTH_1X;
*active_speed = IB_SPEED_EDR;
break;
case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4):
case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4):
case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4):
case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4):
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_QDR;
break;
case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2):
case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2):
case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2):
*active_width = IB_WIDTH_1X;
*active_speed = IB_SPEED_HDR;
break;
case MLX5E_PROT_MASK(MLX5E_56GBASE_R4):
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_FDR;
break;
case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4):
case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4):
case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4):
case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4):
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_EDR;
break;
default:
return -EINVAL;
}
return 0;
}
static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
u8 *active_width)
{
switch (eth_proto_oper) {
case MLX5E_PROT_MASK(MLX5E_SGMII_100M):
case MLX5E_PROT_MASK(MLX5E_1000BASE_X_SGMII):
*active_width = IB_WIDTH_1X;
*active_speed = IB_SPEED_SDR;
break;
case MLX5E_PROT_MASK(MLX5E_5GBASE_R):
*active_width = IB_WIDTH_1X;
*active_speed = IB_SPEED_DDR;
break;
case MLX5E_PROT_MASK(MLX5E_10GBASE_XFI_XAUI_1):
*active_width = IB_WIDTH_1X;
*active_speed = IB_SPEED_QDR;
break;
case MLX5E_PROT_MASK(MLX5E_40GBASE_XLAUI_4_XLPPI_4):
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_QDR;
break;
case MLX5E_PROT_MASK(MLX5E_25GAUI_1_25GBASE_CR_KR):
*active_width = IB_WIDTH_1X;
*active_speed = IB_SPEED_EDR;
break;
case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2):
*active_width = IB_WIDTH_2X;
*active_speed = IB_SPEED_EDR;
break;
case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR):
*active_width = IB_WIDTH_1X;
*active_speed = IB_SPEED_HDR;
break;
case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4):
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_EDR;
break;
case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2):
*active_width = IB_WIDTH_2X;
*active_speed = IB_SPEED_HDR;
break;
case MLX5E_PROT_MASK(MLX5E_100GAUI_1_100GBASE_CR_KR):
*active_width = IB_WIDTH_1X;
*active_speed = IB_SPEED_NDR;
break;
case MLX5E_PROT_MASK(MLX5E_200GAUI_4_200GBASE_CR4_KR4):
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_HDR;
break;
case MLX5E_PROT_MASK(MLX5E_200GAUI_2_200GBASE_CR2_KR2):
*active_width = IB_WIDTH_2X;
*active_speed = IB_SPEED_NDR;
break;
case MLX5E_PROT_MASK(MLX5E_400GAUI_8_400GBASE_CR8):
*active_width = IB_WIDTH_8X;
*active_speed = IB_SPEED_HDR;
break;
case MLX5E_PROT_MASK(MLX5E_400GAUI_4_400GBASE_CR4_KR4):
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_NDR;
break;
case MLX5E_PROT_MASK(MLX5E_800GAUI_8_800GBASE_CR8_KR8):
*active_width = IB_WIDTH_8X;
*active_speed = IB_SPEED_NDR;
break;
default:
return -EINVAL;
}
return 0;
}
static int translate_eth_proto_oper(u32 eth_proto_oper, u16 *active_speed,
u8 *active_width, bool ext)
{
return ext ?
translate_eth_ext_proto_oper(eth_proto_oper, active_speed,
active_width) :
translate_eth_legacy_proto_oper(eth_proto_oper, active_speed,
active_width);
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
static int mlx5_query_port_roce(struct ib_device *device, u32 port_num,
struct ib_port_attr *props)
{
struct mlx5_ib_dev *dev = to_mdev(device);
u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0};
struct mlx5_core_dev *mdev;
struct net_device *ndev, *upper;
enum ib_mtu ndev_ib_mtu;
bool put_mdev = true;
u32 eth_prot_oper;
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 mdev_port_num;
bool ext;
int err;
mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
if (!mdev) {
/* This means the port isn't affiliated yet. Get the
* info for the master port instead.
*/
put_mdev = false;
mdev = dev->mdev;
mdev_port_num = 1;
port_num = 1;
}
/* Possible bad flows are checked before filling out props so in case
* of an error it will still be zeroed out.
* Use native port in case of reps
*/
if (dev->is_rep)
err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
1, 0);
else
err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
mdev_port_num, 0);
if (err)
goto out;
ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability);
eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
props->active_width = IB_WIDTH_4X;
props->active_speed = IB_SPEED_QDR;
translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
&props->active_width, ext);
if (!dev->is_rep && dev->mdev->roce.roce_en) {
u16 qkey_viol_cntr;
props->port_cap_flags |= IB_PORT_CM_SUP;
props->ip_gids = true;
props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev,
roce_address_table_size);
mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr);
props->qkey_viol_cntr = qkey_viol_cntr;
}
props->max_mtu = IB_MTU_4096;
props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
props->pkey_tbl_len = 1;
props->state = IB_PORT_DOWN;
props->phys_state = IB_PORT_PHYS_STATE_DISABLED;
/* If this is a stub query for an unaffiliated port stop here */
if (!put_mdev)
goto out;
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
ndev = ib_device_get_netdev(device, port_num);
if (!ndev)
goto out;
if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) {
rcu_read_lock();
upper = netdev_master_upper_dev_get_rcu(ndev);
if (upper) {
dev_put(ndev);
ndev = upper;
dev_hold(ndev);
}
rcu_read_unlock();
}
if (netif_running(ndev) && netif_carrier_ok(ndev)) {
props->state = IB_PORT_ACTIVE;
props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
}
ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
dev_put(ndev);
props->active_mtu = min(props->max_mtu, ndev_ib_mtu);
out:
if (put_mdev)
mlx5_ib_put_native_port_mdev(dev, port_num);
return err;
}
int set_roce_addr(struct mlx5_ib_dev *dev, u32 port_num,
unsigned int index, const union ib_gid *gid,
const struct ib_gid_attr *attr)
{
enum ib_gid_type gid_type;
u16 vlan_id = 0xffff;
u8 roce_version = 0;
u8 roce_l3_type = 0;
u8 mac[ETH_ALEN];
int ret;
gid_type = attr->gid_type;
if (gid) {
ret = rdma_read_gid_l2_fields(attr, &vlan_id, &mac[0]);
if (ret)
return ret;
}
switch (gid_type) {
case IB_GID_TYPE_ROCE:
roce_version = MLX5_ROCE_VERSION_1;
break;
case IB_GID_TYPE_ROCE_UDP_ENCAP:
roce_version = MLX5_ROCE_VERSION_2;
if (gid && ipv6_addr_v4mapped((void *)gid))
roce_l3_type = MLX5_ROCE_L3_TYPE_IPV4;
else
roce_l3_type = MLX5_ROCE_L3_TYPE_IPV6;
break;
default:
mlx5_ib_warn(dev, "Unexpected GID type %u\n", gid_type);
}
return mlx5_core_roce_gid_set(dev->mdev, index, roce_version,
roce_l3_type, gid->raw, mac,
vlan_id < VLAN_CFI_MASK, vlan_id,
port_num);
}
static int mlx5_ib_add_gid(const struct ib_gid_attr *attr,
__always_unused void **context)
{
int ret;
ret = mlx5r_add_gid_macsec_operations(attr);
if (ret)
return ret;
return set_roce_addr(to_mdev(attr->device), attr->port_num,
attr->index, &attr->gid, attr);
}
static int mlx5_ib_del_gid(const struct ib_gid_attr *attr,
__always_unused void **context)
{
int ret;
ret = set_roce_addr(to_mdev(attr->device), attr->port_num,
attr->index, NULL, attr);
if (ret)
return ret;
mlx5r_del_gid_macsec_operations(attr);
return 0;
}
__be16 mlx5_get_roce_udp_sport_min(const struct mlx5_ib_dev *dev,
const struct ib_gid_attr *attr)
{
if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
return 0;
return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
}
static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
{
if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
return !MLX5_CAP_GEN(dev->mdev, ib_virt);
return 0;
}
enum {
MLX5_VPORT_ACCESS_METHOD_MAD,
MLX5_VPORT_ACCESS_METHOD_HCA,
MLX5_VPORT_ACCESS_METHOD_NIC,
};
static int mlx5_get_vport_access_method(struct ib_device *ibdev)
{
if (mlx5_use_mad_ifc(to_mdev(ibdev)))
return MLX5_VPORT_ACCESS_METHOD_MAD;
if (mlx5_ib_port_link_layer(ibdev, 1) ==
IB_LINK_LAYER_ETHERNET)
return MLX5_VPORT_ACCESS_METHOD_NIC;
return MLX5_VPORT_ACCESS_METHOD_HCA;
}
static void get_atomic_caps(struct mlx5_ib_dev *dev,
u8 atomic_size_qp,
struct ib_device_attr *props)
{
u8 tmp;
u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
u8 atomic_req_8B_endianness_mode =
MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianness_mode);
/* Check if HW supports 8 bytes standard atomic operations and capable
* of host endianness respond
*/
tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
if (((atomic_operations & tmp) == tmp) &&
(atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
(atomic_req_8B_endianness_mode)) {
props->atomic_cap = IB_ATOMIC_HCA;
} else {
props->atomic_cap = IB_ATOMIC_NONE;
}
}
static void get_atomic_caps_qp(struct mlx5_ib_dev *dev,
struct ib_device_attr *props)
{
u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
get_atomic_caps(dev, atomic_size_qp, props);
}
static int mlx5_query_system_image_guid(struct ib_device *ibdev,
__be64 *sys_image_guid)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_core_dev *mdev = dev->mdev;
u64 tmp;
int err;
switch (mlx5_get_vport_access_method(ibdev)) {
case MLX5_VPORT_ACCESS_METHOD_MAD:
return mlx5_query_mad_ifc_system_image_guid(ibdev,
sys_image_guid);
case MLX5_VPORT_ACCESS_METHOD_HCA:
err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
break;
case MLX5_VPORT_ACCESS_METHOD_NIC:
err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
break;
default:
return -EINVAL;
}
if (!err)
*sys_image_guid = cpu_to_be64(tmp);
return err;
}
static int mlx5_query_max_pkeys(struct ib_device *ibdev,
u16 *max_pkeys)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_core_dev *mdev = dev->mdev;
switch (mlx5_get_vport_access_method(ibdev)) {
case MLX5_VPORT_ACCESS_METHOD_MAD:
return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);
case MLX5_VPORT_ACCESS_METHOD_HCA:
case MLX5_VPORT_ACCESS_METHOD_NIC:
*max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
pkey_table_size));
return 0;
default:
return -EINVAL;
}
}
static int mlx5_query_vendor_id(struct ib_device *ibdev,
u32 *vendor_id)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
switch (mlx5_get_vport_access_method(ibdev)) {
case MLX5_VPORT_ACCESS_METHOD_MAD:
return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);
case MLX5_VPORT_ACCESS_METHOD_HCA:
case MLX5_VPORT_ACCESS_METHOD_NIC:
return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
default:
return -EINVAL;
}
}
static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
__be64 *node_guid)
{
u64 tmp;
int err;
switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
case MLX5_VPORT_ACCESS_METHOD_MAD:
return mlx5_query_mad_ifc_node_guid(dev, node_guid);
case MLX5_VPORT_ACCESS_METHOD_HCA:
err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
break;
case MLX5_VPORT_ACCESS_METHOD_NIC:
err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
break;
default:
return -EINVAL;
}
if (!err)
*node_guid = cpu_to_be64(tmp);
return err;
}
struct mlx5_reg_node_desc {
u8 desc[IB_DEVICE_NODE_DESC_MAX];
};
static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
{
struct mlx5_reg_node_desc in;
if (mlx5_use_mad_ifc(dev))
return mlx5_query_mad_ifc_node_desc(dev, node_desc);
memset(&in, 0, sizeof(in));
return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
sizeof(struct mlx5_reg_node_desc),
MLX5_REG_NODE_DESC, 0, 0);
}
static void fill_esw_mgr_reg_c0(struct mlx5_core_dev *mdev,
struct mlx5_ib_query_device_resp *resp)
{
struct mlx5_eswitch *esw = mdev->priv.eswitch;
u16 vport = mlx5_eswitch_manager_vport(mdev);
resp->reg_c0.value = mlx5_eswitch_get_vport_metadata_for_match(esw,
vport);
resp->reg_c0.mask = mlx5_eswitch_get_vport_metadata_mask();
}
static int mlx5_ib_query_device(struct ib_device *ibdev,
struct ib_device_attr *props,
struct ib_udata *uhw)
{
size_t uhw_outlen = (uhw) ? uhw->outlen : 0;
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_core_dev *mdev = dev->mdev;
int err = -ENOMEM;
int max_sq_desc;
int max_rq_sg;
int max_sq_sg;
u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
bool raw_support = !mlx5_core_mp_enabled(mdev);
struct mlx5_ib_query_device_resp resp = {};
size_t resp_len;
u64 max_tso;
resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
if (uhw_outlen && uhw_outlen < resp_len)
return -EINVAL;
resp.response_length = resp_len;
if (uhw && uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
return -EINVAL;
memset(props, 0, sizeof(*props));
err = mlx5_query_system_image_guid(ibdev,
&props->sys_image_guid);
if (err)
return err;
props->max_pkeys = dev->pkey_table_len;
err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
if (err)
return err;
props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
(fw_rev_min(dev->mdev) << 16) |
fw_rev_sub(dev->mdev);
props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
IB_DEVICE_PORT_ACTIVE_EVENT |
IB_DEVICE_SYS_IMAGE_GUID |
IB_DEVICE_RC_RNR_NAK_GEN;
if (MLX5_CAP_GEN(mdev, pkv))
props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
if (MLX5_CAP_GEN(mdev, qkv))
props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
if (MLX5_CAP_GEN(mdev, apm))
props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
if (MLX5_CAP_GEN(mdev, xrc))
props->device_cap_flags |= IB_DEVICE_XRC;
if (MLX5_CAP_GEN(mdev, imaicl)) {
props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
IB_DEVICE_MEM_WINDOW_TYPE_2B;
props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
/* We support 'Gappy' memory registration too */
props->kernel_cap_flags |= IBK_SG_GAPS_REG;
}
/* IB_WR_REG_MR always requires changing the entity size with UMR */
if (!MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
if (MLX5_CAP_GEN(mdev, sho)) {
props->kernel_cap_flags |= IBK_INTEGRITY_HANDOVER;
/* At this stage no support for signature handover */
props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
IB_PROT_T10DIF_TYPE_2 |
IB_PROT_T10DIF_TYPE_3;
props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
IB_GUARD_T10DIF_CSUM;
}
if (MLX5_CAP_GEN(mdev, block_lb_mc))
props->kernel_cap_flags |= IBK_BLOCK_MULTICAST_LOOPBACK;
if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) {
if (MLX5_CAP_ETH(mdev, csum_cap)) {
/* Legacy bit to support old userspace libraries */
props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM;
}
if (MLX5_CAP_ETH(dev->mdev, vlan_cap))
props->raw_packet_caps |=
IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen) {
max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
if (max_tso) {
resp.tso_caps.max_tso = 1 << max_tso;
resp.tso_caps.supported_qpts |=
1 << IB_QPT_RAW_PACKET;
resp.response_length += sizeof(resp.tso_caps);
}
}
if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen) {
resp.rss_caps.rx_hash_function =
MLX5_RX_HASH_FUNC_TOEPLITZ;
resp.rss_caps.rx_hash_fields_mask =
MLX5_RX_HASH_SRC_IPV4 |
MLX5_RX_HASH_DST_IPV4 |
MLX5_RX_HASH_SRC_IPV6 |
MLX5_RX_HASH_DST_IPV6 |
MLX5_RX_HASH_SRC_PORT_TCP |
MLX5_RX_HASH_DST_PORT_TCP |
MLX5_RX_HASH_SRC_PORT_UDP |
MLX5_RX_HASH_DST_PORT_UDP |
MLX5_RX_HASH_INNER;
resp.response_length += sizeof(resp.rss_caps);
}
} else {
if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen)
resp.response_length += sizeof(resp.tso_caps);
if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen)
resp.response_length += sizeof(resp.rss_caps);
}
if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
props->kernel_cap_flags |= IBK_UD_TSO;
}
if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) &&
MLX5_CAP_GEN(dev->mdev, general_notification_event) &&
raw_support)
props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP;
if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
MLX5_CAP_IPOIB_ENHANCED(mdev, csum_cap))
props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
MLX5_CAP_ETH(dev->mdev, scatter_fcs) &&
raw_support) {
/* Legacy bit to support old userspace libraries */
props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS;
}
if (MLX5_CAP_DEV_MEM(mdev, memic)) {
props->max_dm_size =
MLX5_CAP_DEV_MEM(mdev, max_memic_size);
}
if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
if (MLX5_CAP_GEN(mdev, end_pad))
props->device_cap_flags |= IB_DEVICE_PCI_WRITE_END_PADDING;
props->vendor_part_id = mdev->pdev->device;
props->hw_ver = mdev->pdev->revision;
props->max_mr_size = ~0ull;
props->page_size_cap = ~(min_page_size - 1);
props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
sizeof(struct mlx5_wqe_data_seg);
max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) -
sizeof(struct mlx5_wqe_raddr_seg)) /
sizeof(struct mlx5_wqe_data_seg);
props->max_send_sge = max_sq_sg;
props->max_recv_sge = max_rq_sg;
props->max_sge_rd = MLX5_MAX_SGE_RD;
props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
props->max_pd = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
props->max_srq = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp;
props->max_srq_sge = max_rq_sg - 1;
props->max_fast_reg_page_list_len =
1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
props->max_pi_fast_reg_page_list_len =
props->max_fast_reg_page_list_len / 2;
props->max_sgl_rd =
MLX5_CAP_GEN(mdev, max_sgl_for_optimized_performance);
get_atomic_caps_qp(dev, props);
props->masked_atomic_cap = IB_ATOMIC_NONE;
props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
props->max_mcast_grp;
props->max_ah = INT_MAX;
props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
props->kernel_cap_flags |= IBK_ON_DEMAND_PAGING;
props->odp_caps = dev->odp_caps;
if (!uhw) {
/* ODP for kernel QPs is not implemented for receive
* WQEs and SRQ WQEs
*/
props->odp_caps.per_transport_caps.rc_odp_caps &=
~(IB_ODP_SUPPORT_READ |
IB_ODP_SUPPORT_SRQ_RECV);
props->odp_caps.per_transport_caps.uc_odp_caps &=
~(IB_ODP_SUPPORT_READ |
IB_ODP_SUPPORT_SRQ_RECV);
props->odp_caps.per_transport_caps.ud_odp_caps &=
~(IB_ODP_SUPPORT_READ |
IB_ODP_SUPPORT_SRQ_RECV);
props->odp_caps.per_transport_caps.xrc_odp_caps &=
~(IB_ODP_SUPPORT_READ |
IB_ODP_SUPPORT_SRQ_RECV);
}
}
if (mlx5_core_is_vf(mdev))
props->kernel_cap_flags |= IBK_VIRTUAL_FUNCTION;
if (mlx5_ib_port_link_layer(ibdev, 1) ==
IB_LINK_LAYER_ETHERNET && raw_support) {
props->rss_caps.max_rwq_indirection_tables =
1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
props->rss_caps.max_rwq_indirection_table_size =
1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size);
props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
props->max_wq_type_rq =
1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
}
if (MLX5_CAP_GEN(mdev, tag_matching)) {
props->tm_caps.max_num_tags =
(1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1;
props->tm_caps.max_ops =
1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
props->tm_caps.max_sge = MLX5_TM_MAX_SGE;
}
if (MLX5_CAP_GEN(mdev, tag_matching) &&
MLX5_CAP_GEN(mdev, rndv_offload_rc)) {
props->tm_caps.flags = IB_TM_CAP_RNDV_RC;
props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
}
if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) {
props->cq_caps.max_cq_moderation_count =
MLX5_MAX_CQ_COUNT;
props->cq_caps.max_cq_moderation_period =
MLX5_MAX_CQ_PERIOD;
}
if (offsetofend(typeof(resp), cqe_comp_caps) <= uhw_outlen) {
resp.response_length += sizeof(resp.cqe_comp_caps);
if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) {
resp.cqe_comp_caps.max_num =
MLX5_CAP_GEN(dev->mdev,
cqe_compression_max_num);
resp.cqe_comp_caps.supported_format =
MLX5_IB_CQE_RES_FORMAT_HASH |
MLX5_IB_CQE_RES_FORMAT_CSUM;
if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index))
resp.cqe_comp_caps.supported_format |=
MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX;
}
}
if (offsetofend(typeof(resp), packet_pacing_caps) <= uhw_outlen &&
raw_support) {
if (MLX5_CAP_QOS(mdev, packet_pacing) &&
MLX5_CAP_GEN(mdev, qos)) {
resp.packet_pacing_caps.qp_rate_limit_max =
MLX5_CAP_QOS(mdev, packet_pacing_max_rate);
resp.packet_pacing_caps.qp_rate_limit_min =
MLX5_CAP_QOS(mdev, packet_pacing_min_rate);
resp.packet_pacing_caps.supported_qpts |=
1 << IB_QPT_RAW_PACKET;
if (MLX5_CAP_QOS(mdev, packet_pacing_burst_bound) &&
MLX5_CAP_QOS(mdev, packet_pacing_typical_size))
resp.packet_pacing_caps.cap_flags |=
MLX5_IB_PP_SUPPORT_BURST;
}
resp.response_length += sizeof(resp.packet_pacing_caps);
}
if (offsetofend(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes) <=
uhw_outlen) {
if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe))
resp.mlx5_ib_support_multi_pkt_send_wqes =
MLX5_IB_ALLOW_MPW;
if (MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe))
resp.mlx5_ib_support_multi_pkt_send_wqes |=
MLX5_IB_SUPPORT_EMPW;
resp.response_length +=
sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
}
if (offsetofend(typeof(resp), flags) <= uhw_outlen) {
resp.response_length += sizeof(resp.flags);
if (MLX5_CAP_GEN(mdev, cqe_compression_128))
resp.flags |=
MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP;
if (MLX5_CAP_GEN(mdev, cqe_128_always))
resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD;
if (MLX5_CAP_GEN(mdev, qp_packet_based))
resp.flags |=
MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE;
resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
}
if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) {
resp.response_length += sizeof(resp.sw_parsing_caps);
if (MLX5_CAP_ETH(mdev, swp)) {
resp.sw_parsing_caps.sw_parsing_offloads |=
MLX5_IB_SW_PARSING;
if (MLX5_CAP_ETH(mdev, swp_csum))
resp.sw_parsing_caps.sw_parsing_offloads |=
MLX5_IB_SW_PARSING_CSUM;
if (MLX5_CAP_ETH(mdev, swp_lso))
resp.sw_parsing_caps.sw_parsing_offloads |=
MLX5_IB_SW_PARSING_LSO;
if (resp.sw_parsing_caps.sw_parsing_offloads)
resp.sw_parsing_caps.supported_qpts =
BIT(IB_QPT_RAW_PACKET);
}
}
if (offsetofend(typeof(resp), striding_rq_caps) <= uhw_outlen &&
raw_support) {
resp.response_length += sizeof(resp.striding_rq_caps);
if (MLX5_CAP_GEN(mdev, striding_rq)) {
resp.striding_rq_caps.min_single_stride_log_num_of_bytes =
MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES;
resp.striding_rq_caps.max_single_stride_log_num_of_bytes =
MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES;
if (MLX5_CAP_GEN(dev->mdev, ext_stride_num_range))
resp.striding_rq_caps
.min_single_wqe_log_num_of_strides =
MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
else
resp.striding_rq_caps
.min_single_wqe_log_num_of_strides =
MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
resp.striding_rq_caps.max_single_wqe_log_num_of_strides =
MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES;
resp.striding_rq_caps.supported_qpts =
BIT(IB_QPT_RAW_PACKET);
}
}
if (offsetofend(typeof(resp), tunnel_offloads_caps) <= uhw_outlen) {
resp.response_length += sizeof(resp.tunnel_offloads_caps);
if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan))
resp.tunnel_offloads_caps |=
MLX5_IB_TUNNELED_OFFLOADS_VXLAN;
if (MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx))
resp.tunnel_offloads_caps |=
MLX5_IB_TUNNELED_OFFLOADS_GENEVE;
if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre))
resp.tunnel_offloads_caps |=
MLX5_IB_TUNNELED_OFFLOADS_GRE;
if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_gre))
resp.tunnel_offloads_caps |=
MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE;
if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_udp))
resp.tunnel_offloads_caps |=
MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
}
if (offsetofend(typeof(resp), dci_streams_caps) <= uhw_outlen) {
resp.response_length += sizeof(resp.dci_streams_caps);
resp.dci_streams_caps.max_log_num_concurent =
MLX5_CAP_GEN(mdev, log_max_dci_stream_channels);
resp.dci_streams_caps.max_log_num_errored =
MLX5_CAP_GEN(mdev, log_max_dci_errored_streams);
}
if (offsetofend(typeof(resp), reserved) <= uhw_outlen)
resp.response_length += sizeof(resp.reserved);
if (offsetofend(typeof(resp), reg_c0) <= uhw_outlen) {
struct mlx5_eswitch *esw = mdev->priv.eswitch;
resp.response_length += sizeof(resp.reg_c0);
if (mlx5_eswitch_mode(mdev) == MLX5_ESWITCH_OFFLOADS &&
mlx5_eswitch_vport_match_metadata_enabled(esw))
fill_esw_mgr_reg_c0(mdev, &resp);
}
if (uhw_outlen) {
err = ib_copy_to_udata(uhw, &resp, resp.response_length);
if (err)
return err;
}
return 0;
}
static void translate_active_width(struct ib_device *ibdev, u16 active_width,
u8 *ib_width)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
if (active_width & MLX5_PTYS_WIDTH_1X)
*ib_width = IB_WIDTH_1X;
else if (active_width & MLX5_PTYS_WIDTH_2X)
*ib_width = IB_WIDTH_2X;
else if (active_width & MLX5_PTYS_WIDTH_4X)
*ib_width = IB_WIDTH_4X;
else if (active_width & MLX5_PTYS_WIDTH_8X)
*ib_width = IB_WIDTH_8X;
else if (active_width & MLX5_PTYS_WIDTH_12X)
*ib_width = IB_WIDTH_12X;
else {
mlx5_ib_dbg(dev, "Invalid active_width %d, setting width to default value: 4x\n",
active_width);
*ib_width = IB_WIDTH_4X;
}
return;
}
static int mlx5_mtu_to_ib_mtu(int mtu)
{
switch (mtu) {
case 256: return 1;
case 512: return 2;
case 1024: return 3;
case 2048: return 4;
case 4096: return 5;
default:
pr_warn("invalid mtu\n");
return -1;
}
}
enum ib_max_vl_num {
__IB_MAX_VL_0 = 1,
__IB_MAX_VL_0_1 = 2,
__IB_MAX_VL_0_3 = 3,
__IB_MAX_VL_0_7 = 4,
__IB_MAX_VL_0_14 = 5,
};
enum mlx5_vl_hw_cap {
MLX5_VL_HW_0 = 1,
MLX5_VL_HW_0_1 = 2,
MLX5_VL_HW_0_2 = 3,
MLX5_VL_HW_0_3 = 4,
MLX5_VL_HW_0_4 = 5,
MLX5_VL_HW_0_5 = 6,
MLX5_VL_HW_0_6 = 7,
MLX5_VL_HW_0_7 = 8,
MLX5_VL_HW_0_14 = 15
};
static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
u8 *max_vl_num)
{
switch (vl_hw_cap) {
case MLX5_VL_HW_0:
*max_vl_num = __IB_MAX_VL_0;
break;
case MLX5_VL_HW_0_1:
*max_vl_num = __IB_MAX_VL_0_1;
break;
case MLX5_VL_HW_0_3:
*max_vl_num = __IB_MAX_VL_0_3;
break;
case MLX5_VL_HW_0_7:
*max_vl_num = __IB_MAX_VL_0_7;
break;
case MLX5_VL_HW_0_14:
*max_vl_num = __IB_MAX_VL_0_14;
break;
default:
return -EINVAL;
}
return 0;
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port,
struct ib_port_attr *props)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_core_dev *mdev = dev->mdev;
struct mlx5_hca_vport_context *rep;
u8 vl_hw_cap, plane_index = 0;
u16 max_mtu;
u16 oper_mtu;
int err;
u16 ib_link_width_oper;
rep = kzalloc(sizeof(*rep), GFP_KERNEL);
if (!rep) {
err = -ENOMEM;
goto out;
}
/* props being zeroed by the caller, avoid zeroing it here */
if (ibdev->type == RDMA_DEVICE_TYPE_SMI) {
plane_index = port;
port = smi_to_native_portnum(dev, port);
}
err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
if (err)
goto out;
props->lid = rep->lid;
props->lmc = rep->lmc;
props->sm_lid = rep->sm_lid;
props->sm_sl = rep->sm_sl;
props->state = rep->vport_state;
props->phys_state = rep->port_physical_state;
props->port_cap_flags = rep->cap_mask1;
if (dev->num_plane) {
props->port_cap_flags |= IB_PORT_SM_DISABLED;
props->port_cap_flags &= ~IB_PORT_SM;
} else if (ibdev->type == RDMA_DEVICE_TYPE_SMI)
props->port_cap_flags &= ~IB_PORT_CM_SUP;
props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg);
props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
props->bad_pkey_cntr = rep->pkey_violation_counter;
props->qkey_viol_cntr = rep->qkey_violation_counter;
props->subnet_timeout = rep->subnet_timeout;
props->init_type_reply = rep->init_type_reply;
if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP)
props->port_cap_flags2 = rep->cap_mask2;
err = mlx5_query_ib_port_oper(mdev, &ib_link_width_oper,
&props->active_speed, port, plane_index);
if (err)
goto out;
translate_active_width(ibdev, ib_link_width_oper, &props->active_width);
mlx5_query_port_max_mtu(mdev, &max_mtu, port);
props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);
mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);
props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);
err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port);
if (err)
goto out;
err = translate_max_vl_num(ibdev, vl_hw_cap,
&props->max_vl_num);
out:
kfree(rep);
return err;
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
int mlx5_ib_query_port(struct ib_device *ibdev, u32 port,
struct ib_port_attr *props)
{
unsigned int count;
int ret;
switch (mlx5_get_vport_access_method(ibdev)) {
case MLX5_VPORT_ACCESS_METHOD_MAD:
ret = mlx5_query_mad_ifc_port(ibdev, port, props);
break;
case MLX5_VPORT_ACCESS_METHOD_HCA:
ret = mlx5_query_hca_port(ibdev, port, props);
break;
case MLX5_VPORT_ACCESS_METHOD_NIC:
ret = mlx5_query_port_roce(ibdev, port, props);
break;
default:
ret = -EINVAL;
}
if (!ret && props) {
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_core_dev *mdev;
bool put_mdev = true;
mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL);
if (!mdev) {
/* If the port isn't affiliated yet query the master.
* The master and slave will have the same values.
*/
mdev = dev->mdev;
port = 1;
put_mdev = false;
}
count = mlx5_core_reserved_gids_count(mdev);
if (put_mdev)
mlx5_ib_put_native_port_mdev(dev, port);
props->gid_tbl_len -= count;
}
return ret;
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u32 port,
struct ib_port_attr *props)
{
return mlx5_query_port_roce(ibdev, port, props);
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
static int mlx5_ib_rep_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
u16 *pkey)
{
/* Default special Pkey for representor device port as per the
* IB specification 1.3 section 10.9.1.2.
*/
*pkey = 0xffff;
return 0;
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
static int mlx5_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
union ib_gid *gid)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_core_dev *mdev = dev->mdev;
switch (mlx5_get_vport_access_method(ibdev)) {
case MLX5_VPORT_ACCESS_METHOD_MAD:
return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);
case MLX5_VPORT_ACCESS_METHOD_HCA:
return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid);
default:
return -EINVAL;
}
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u32 port,
u16 index, u16 *pkey)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_core_dev *mdev;
bool put_mdev = true;
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 mdev_port_num;
int err;
mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num);
if (!mdev) {
/* The port isn't affiliated yet, get the PKey from the master
* port. For RoCE the PKey tables will be the same.
*/
put_mdev = false;
mdev = dev->mdev;
mdev_port_num = 1;
}
err = mlx5_query_hca_vport_pkey(mdev, 0, mdev_port_num, 0,
index, pkey);
if (put_mdev)
mlx5_ib_put_native_port_mdev(dev, port);
return err;
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
static int mlx5_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
u16 *pkey)
{
switch (mlx5_get_vport_access_method(ibdev)) {
case MLX5_VPORT_ACCESS_METHOD_MAD:
return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
case MLX5_VPORT_ACCESS_METHOD_HCA:
case MLX5_VPORT_ACCESS_METHOD_NIC:
return mlx5_query_hca_nic_pkey(ibdev, port, index, pkey);
default:
return -EINVAL;
}
}
static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
struct ib_device_modify *props)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_reg_node_desc in;
struct mlx5_reg_node_desc out;
int err;
if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
return -EOPNOTSUPP;
if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
return 0;
/*
* If possible, pass node desc to FW, so it can generate
* a 144 trap. If cmd fails, just ignore.
*/
memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
if (err)
return err;
memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
return err;
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u32 port_num, u32 mask,
u32 value)
{
struct mlx5_hca_vport_context ctx = {};
struct mlx5_core_dev *mdev;
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 mdev_port_num;
int err;
mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
if (!mdev)
return -ENODEV;
err = mlx5_query_hca_vport_context(mdev, 0, mdev_port_num, 0, &ctx);
if (err)
goto out;
if (~ctx.cap_mask1_perm & mask) {
mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n",
mask, ctx.cap_mask1_perm);
err = -EINVAL;
goto out;
}
ctx.cap_mask1 = value;
ctx.cap_mask1_perm = mask;
err = mlx5_core_modify_hca_vport_context(mdev, 0, mdev_port_num,
0, &ctx);
out:
mlx5_ib_put_native_port_mdev(dev, port_num);
return err;
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
static int mlx5_ib_modify_port(struct ib_device *ibdev, u32 port, int mask,
struct ib_port_modify *props)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct ib_port_attr attr;
u32 tmp;
int err;
u32 change_mask;
u32 value;
bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) ==
IB_LINK_LAYER_INFINIBAND);
/* CM layer calls ib_modify_port() regardless of the link layer. For
* Ethernet ports, qkey violation and Port capabilities are meaningless.
*/
if (!is_ib)
return 0;
if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) {
change_mask = props->clr_port_cap_mask | props->set_port_cap_mask;
value = ~props->clr_port_cap_mask | props->set_port_cap_mask;
return set_port_caps_atomic(dev, port, change_mask, value);
}
mutex_lock(&dev->cap_mask_mutex);
err = ib_query_port(ibdev, port, &attr);
if (err)
goto out;
tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
~props->clr_port_cap_mask;
err = mlx5_set_port_caps(dev->mdev, port, tmp);
out:
mutex_unlock(&dev->cap_mask_mutex);
return err;
}
static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps)
{
mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n",
caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n");
}
static u16 calc_dynamic_bfregs(int uars_per_sys_page)
{
/* Large page with non 4k uar support might limit the dynamic size */
if (uars_per_sys_page == 1 && PAGE_SIZE > 4096)
return MLX5_MIN_DYN_BFREGS;
return MLX5_MAX_DYN_BFREGS;
}
static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
struct mlx5_ib_alloc_ucontext_req_v2 *req,
struct mlx5_bfreg_info *bfregi)
{
int uars_per_sys_page;
int bfregs_per_sys_page;
int ref_bfregs = req->total_num_bfregs;
if (req->total_num_bfregs == 0)
return -EINVAL;
BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE);
BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE);
if (req->total_num_bfregs > MLX5_MAX_BFREGS)
return -ENOMEM;
uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k);
bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR;
/* This holds the required static allocation asked by the user */
req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page);
if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
return -EINVAL;
bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page);
bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs;
bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page;
mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n",
MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
lib_uar_4k ? "yes" : "no", ref_bfregs,
req->total_num_bfregs, bfregi->total_num_bfregs,
bfregi->num_sys_pages);
return 0;
}
static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
{
struct mlx5_bfreg_info *bfregi;
int err;
int i;
bfregi = &context->bfregi;
for (i = 0; i < bfregi->num_static_sys_pages; i++) {
err = mlx5_cmd_uar_alloc(dev->mdev, &bfregi->sys_pages[i],
context->devx_uid);
if (err)
goto error;
mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]);
}
for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++)
bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX;
return 0;
error:
for (--i; i >= 0; i--)
if (mlx5_cmd_uar_dealloc(dev->mdev, bfregi->sys_pages[i],
context->devx_uid))
mlx5_ib_warn(dev, "failed to free uar %d\n", i);
return err;
}
static void deallocate_uars(struct mlx5_ib_dev *dev,
struct mlx5_ib_ucontext *context)
{
struct mlx5_bfreg_info *bfregi;
int i;
bfregi = &context->bfregi;
for (i = 0; i < bfregi->num_sys_pages; i++)
if (i < bfregi->num_static_sys_pages ||
bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX)
mlx5_cmd_uar_dealloc(dev->mdev, bfregi->sys_pages[i],
context->devx_uid);
}
int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
{
int err = 0;
mutex_lock(&dev->lb.mutex);
if (td)
dev->lb.user_td++;
if (qp)
dev->lb.qps++;
if (dev->lb.user_td == 2 ||
dev->lb.qps == 1) {
if (!dev->lb.enabled) {
err = mlx5_nic_vport_update_local_lb(dev->mdev, true);
dev->lb.enabled = true;
}
}
mutex_unlock(&dev->lb.mutex);
return err;
}
void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
{
mutex_lock(&dev->lb.mutex);
if (td)
dev->lb.user_td--;
if (qp)
dev->lb.qps--;
if (dev->lb.user_td == 1 &&
dev->lb.qps == 0) {
if (dev->lb.enabled) {
mlx5_nic_vport_update_local_lb(dev->mdev, false);
dev->lb.enabled = false;
}
}
mutex_unlock(&dev->lb.mutex);
}
static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn,
u16 uid)
{
int err;
if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
return 0;
err = mlx5_cmd_alloc_transport_domain(dev->mdev, tdn, uid);
if (err)
return err;
if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
(!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
!MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
return err;
return mlx5_ib_enable_lb(dev, true, false);
}
static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn,
u16 uid)
{
if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
return;
mlx5_cmd_dealloc_transport_domain(dev->mdev, tdn, uid);
if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
(!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
!MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
return;
mlx5_ib_disable_lb(dev, true, false);
}
static int set_ucontext_resp(struct ib_ucontext *uctx,
struct mlx5_ib_alloc_ucontext_resp *resp)
{
struct ib_device *ibdev = uctx->device;
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_ib_ucontext *context = to_mucontext(uctx);
struct mlx5_bfreg_info *bfregi = &context->bfregi;
if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
resp->dump_fill_mkey = dev->mkeys.dump_fill_mkey;
resp->comp_mask |=
MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY;
}
resp->qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
net/mlx5: Reimplement write combining test JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.11-rc1 commit d98995b4bf981519dde4af0a081c393d62474039 Author: Jianbo Liu <jianbol@nvidia.com> Date: Mon Jun 3 13:26:37 2024 +0300 net/mlx5: Reimplement write combining test The test of write combining was added before in mlx5_ib driver. It opens UD QP and posts NOP WQEs, and uses BlueFlame doorbell. When BlueFlame is used, WQEs get written directly to a PCI BAR of the device (in addition to memory) so that the device handles them without having to access memory. In this test, the WQEs written in memory are different from the ones written to the BlueFlame which request CQE update. By checking the completion reports posted on CQ, we can know if BlueFlame succeeds or not. The write combining must be supported if BlueFlame succeeds as its register is written using write combining. This patch reimplements the test in the same way, but using a pair of SQ and CQ only. It is moved to mlx5_core as a general feature used by both mlx5_core and mlx5_ib. Besides, save write combine test result of the PCI function, so that its thousands of child functions such as SF can query without paying the time and resource penalty by itself. The test function is called only after failing to get the cached result. With this enhancement, all thousands of SFs of the PF attached to same driver no longer need to perform WC check explicitly, which is already done in the system. This saves several commands per SF, thereby speeds up SF creation and also saves completion EQ creation. Signed-off-by: Jianbo Liu <jianbol@nvidia.com> Reviewed-by: Tariq Toukan <tariqt@nvidia.com> Link: https://lore.kernel.org/r/4ff5a8cc4c5b5b0d98397baa45a5019bcdbf096e.1717409369.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:03 +00:00
if (mlx5_wc_support_get(dev->mdev))
resp->bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev,
log_bf_reg_size);
resp->cache_line_size = cache_line_size();
resp->max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
resp->max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
resp->max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
resp->max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
resp->max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
resp->cqe_version = context->cqe_version;
resp->log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
resp->num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
MLX5_CAP_GEN(dev->mdev,
num_of_uars_per_page) : 1;
resp->tot_bfregs = bfregi->lib_uar_dyn ? 0 :
bfregi->total_num_bfregs - bfregi->num_dyn_bfregs;
resp->num_ports = dev->num_ports;
resp->cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
mlx5_query_min_inline(dev->mdev, &resp->eth_min_inline);
resp->eth_min_inline++;
}
if (dev->mdev->clock_info)
resp->clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
/*
* We don't want to expose information from the PCI bar that is located
* after 4096 bytes, so if the arch only supports larger pages, let's
* pretend we don't support reading the HCA's core clock. This is also
* forced by mmap function.
*/
if (PAGE_SIZE <= 4096) {
resp->comp_mask |=
MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
resp->hca_core_clock_offset =
offsetof(struct mlx5_init_seg,
internal_timer_h) % PAGE_SIZE;
}
if (MLX5_CAP_GEN(dev->mdev, ece_support))
resp->comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE;
if (rt_supported(MLX5_CAP_GEN(dev->mdev, sq_ts_format)) &&
rt_supported(MLX5_CAP_GEN(dev->mdev, rq_ts_format)) &&
rt_supported(MLX5_CAP_ROCE(dev->mdev, qp_ts_format)))
resp->comp_mask |=
MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_REAL_TIME_TS;
resp->num_dyn_bfregs = bfregi->num_dyn_bfregs;
if (MLX5_CAP_GEN(dev->mdev, drain_sigerr))
resp->comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_SQD2RTS;
resp->comp_mask |=
MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_MKEY_UPDATE_TAG;
return 0;
}
static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
struct ib_udata *udata)
{
struct ib_device *ibdev = uctx->device;
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_ib_alloc_ucontext_req_v2 req = {};
struct mlx5_ib_alloc_ucontext_resp resp = {};
struct mlx5_ib_ucontext *context = to_mucontext(uctx);
struct mlx5_bfreg_info *bfregi;
int ver;
int err;
size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
max_cqe_version);
bool lib_uar_4k;
bool lib_uar_dyn;
if (!dev->ib_active)
return -EAGAIN;
if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
ver = 0;
else if (udata->inlen >= min_req_v2)
ver = 2;
else
return -EINVAL;
err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
if (err)
return err;
if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
return -EOPNOTSUPP;
if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
return -EOPNOTSUPP;
req.total_num_bfregs = ALIGN(req.total_num_bfregs,
MLX5_NON_FP_BFREGS_PER_UAR);
if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
return -EINVAL;
if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
err = mlx5_ib_devx_create(dev, true);
if (err < 0)
goto out_ctx;
context->devx_uid = err;
}
lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR;
bfregi = &context->bfregi;
if (lib_uar_dyn) {
bfregi->lib_uar_dyn = lib_uar_dyn;
goto uar_done;
}
/* updates req->total_num_bfregs */
err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi);
if (err)
goto out_devx;
mutex_init(&bfregi->lock);
bfregi->lib_uar_4k = lib_uar_4k;
bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count),
GFP_KERNEL);
if (!bfregi->count) {
err = -ENOMEM;
goto out_devx;
}
bfregi->sys_pages = kcalloc(bfregi->num_sys_pages,
sizeof(*bfregi->sys_pages),
GFP_KERNEL);
if (!bfregi->sys_pages) {
err = -ENOMEM;
goto out_count;
}
err = allocate_uars(dev, context);
if (err)
goto out_sys_pages;
uar_done:
err = mlx5_ib_alloc_transport_domain(dev, &context->tdn,
context->devx_uid);
if (err)
goto out_uars;
INIT_LIST_HEAD(&context->db_page_list);
mutex_init(&context->db_page_mutex);
context->cqe_version = min_t(__u8,
(__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
req.max_cqe_version);
err = set_ucontext_resp(uctx, &resp);
if (err)
goto out_mdev;
resp.response_length = min(udata->outlen, sizeof(resp));
err = ib_copy_to_udata(udata, &resp, resp.response_length);
if (err)
goto out_mdev;
bfregi->ver = ver;
bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
context->lib_caps = req.lib_caps;
print_lib_caps(dev, context->lib_caps);
if (mlx5_ib_lag_should_assign_affinity(dev)) {
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 port = mlx5_core_native_port_num(dev->mdev) - 1;
atomic_set(&context->tx_port_affinity,
atomic_add_return(
1, &dev->port[port].roce.tx_port_affinity));
}
return 0;
out_mdev:
mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
out_uars:
deallocate_uars(dev, context);
out_sys_pages:
kfree(bfregi->sys_pages);
out_count:
kfree(bfregi->count);
out_devx:
if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX)
mlx5_ib_devx_destroy(dev, context->devx_uid);
out_ctx:
return err;
}
static int mlx5_ib_query_ucontext(struct ib_ucontext *ibcontext,
struct uverbs_attr_bundle *attrs)
{
struct mlx5_ib_alloc_ucontext_resp uctx_resp = {};
int ret;
ret = set_ucontext_resp(ibcontext, &uctx_resp);
if (ret)
return ret;
uctx_resp.response_length =
min_t(size_t,
uverbs_attr_get_len(attrs,
MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX),
sizeof(uctx_resp));
ret = uverbs_copy_to_struct_or_zero(attrs,
MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX,
&uctx_resp,
sizeof(uctx_resp));
return ret;
}
static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
{
struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
struct mlx5_bfreg_info *bfregi;
bfregi = &context->bfregi;
mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
deallocate_uars(dev, context);
kfree(bfregi->sys_pages);
kfree(bfregi->count);
if (context->devx_uid)
mlx5_ib_devx_destroy(dev, context->devx_uid);
}
static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
int uar_idx)
{
int fw_uars_per_page;
fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
}
static u64 uar_index2paddress(struct mlx5_ib_dev *dev,
int uar_idx)
{
unsigned int fw_uars_per_page;
fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
MLX5_UARS_IN_PAGE : 1;
return (dev->mdev->bar_addr + (uar_idx / fw_uars_per_page) * PAGE_SIZE);
}
static int get_command(unsigned long offset)
{
return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
}
static int get_arg(unsigned long offset)
{
return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
}
static int get_index(unsigned long offset)
{
return get_arg(offset);
}
/* Index resides in an extra byte to enable larger values than 255 */
static int get_extended_index(unsigned long offset)
{
return get_arg(offset) | ((offset >> 16) & 0xff) << 8;
}
static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
{
}
static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
{
switch (cmd) {
case MLX5_IB_MMAP_WC_PAGE:
return "WC";
case MLX5_IB_MMAP_REGULAR_PAGE:
return "best effort WC";
case MLX5_IB_MMAP_NC_PAGE:
return "NC";
case MLX5_IB_MMAP_DEVICE_MEM:
return "Device Memory";
default:
return "Unknown";
}
}
static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
struct vm_area_struct *vma,
struct mlx5_ib_ucontext *context)
{
if ((vma->vm_end - vma->vm_start != PAGE_SIZE) ||
!(vma->vm_flags & VM_SHARED))
return -EINVAL;
if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1)
return -EOPNOTSUPP;
if (vma->vm_flags & (VM_WRITE | VM_EXEC))
return -EPERM;
mm: replace vma->vm_flags direct modifications with modifier calls JIRA: https://issues.redhat.com/browse/RHEL-27740 Tested: by me Conflicts: dropped stuff we don't support when not applying cleanly, left the rest for sake of saving work commit 1c71222e5f2393b5ea1a41795c67589eea7e3490 Author: Suren Baghdasaryan <surenb@google.com> Date: Thu Jan 26 11:37:49 2023 -0800 mm: replace vma->vm_flags direct modifications with modifier calls Replace direct modifications to vma->vm_flags with calls to modifier functions to be able to track flag changes and to keep vma locking correctness. [akpm@linux-foundation.org: fix drivers/misc/open-dice.c, per Hyeonggon Yoo] Link: https://lkml.kernel.org/r/20230126193752.297968-5-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Mel Gorman <mgorman@techsingularity.net> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Acked-by: Sebastian Reichel <sebastian.reichel@collabora.com> Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arjun Roy <arjunroy@google.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: David Howells <dhowells@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: David Rientjes <rientjes@google.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Greg Thelen <gthelen@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jann Horn <jannh@google.com> Cc: Joel Fernandes <joelaf@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Kent Overstreet <kent.overstreet@linux.dev> Cc: Laurent Dufour <ldufour@linux.ibm.com> Cc: Lorenzo Stoakes <lstoakes@gmail.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Minchan Kim <minchan@google.com> Cc: Paul E. McKenney <paulmck@kernel.org> Cc: Peter Oskolkov <posk@google.com> Cc: Peter Xu <peterx@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Punit Agrawal <punit.agrawal@bytedance.com> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Cc: Shakeel Butt <shakeelb@google.com> Cc: Soheil Hassas Yeganeh <soheil@google.com> Cc: Song Liu <songliubraving@fb.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
2024-04-29 18:32:57 +00:00
vm_flags_clear(vma, VM_MAYWRITE);
if (!dev->mdev->clock_info)
return -EOPNOTSUPP;
return vm_insert_page(vma, vma->vm_start,
virt_to_page(dev->mdev->clock_info));
}
static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry)
{
struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device);
struct mlx5_var_table *var_table = &dev->var_table;
struct mlx5_ib_ucontext *context = to_mucontext(entry->ucontext);
switch (mentry->mmap_flag) {
case MLX5_IB_MMAP_TYPE_MEMIC:
case MLX5_IB_MMAP_TYPE_MEMIC_OP:
mlx5_ib_dm_mmap_free(dev, mentry);
break;
case MLX5_IB_MMAP_TYPE_VAR:
mutex_lock(&var_table->bitmap_lock);
clear_bit(mentry->page_idx, var_table->bitmap);
mutex_unlock(&var_table->bitmap_lock);
kfree(mentry);
break;
case MLX5_IB_MMAP_TYPE_UAR_WC:
case MLX5_IB_MMAP_TYPE_UAR_NC:
mlx5_cmd_uar_dealloc(dev->mdev, mentry->page_idx,
context->devx_uid);
kfree(mentry);
break;
default:
WARN_ON(true);
}
}
static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
struct vm_area_struct *vma,
struct mlx5_ib_ucontext *context)
{
struct mlx5_bfreg_info *bfregi = &context->bfregi;
int err;
unsigned long idx;
phys_addr_t pfn;
pgprot_t prot;
u32 bfreg_dyn_idx = 0;
u32 uar_index;
int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC);
int max_valid_idx = dyn_uar ? bfregi->num_sys_pages :
bfregi->num_static_sys_pages;
if (bfregi->lib_uar_dyn)
return -EINVAL;
if (vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EINVAL;
if (dyn_uar)
idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages;
else
idx = get_index(vma->vm_pgoff);
if (idx >= max_valid_idx) {
mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n",
idx, max_valid_idx);
return -EINVAL;
}
switch (cmd) {
case MLX5_IB_MMAP_WC_PAGE:
case MLX5_IB_MMAP_ALLOC_WC:
case MLX5_IB_MMAP_REGULAR_PAGE:
/* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */
prot = pgprot_writecombine(vma->vm_page_prot);
break;
case MLX5_IB_MMAP_NC_PAGE:
prot = pgprot_noncached(vma->vm_page_prot);
break;
default:
return -EINVAL;
}
if (dyn_uar) {
int uars_per_page;
uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR);
if (bfreg_dyn_idx >= bfregi->total_num_bfregs) {
mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n",
bfreg_dyn_idx, bfregi->total_num_bfregs);
return -EINVAL;
}
mutex_lock(&bfregi->lock);
/* Fail if uar already allocated, first bfreg index of each
* page holds its count.
*/
if (bfregi->count[bfreg_dyn_idx]) {
mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx);
mutex_unlock(&bfregi->lock);
return -EINVAL;
}
bfregi->count[bfreg_dyn_idx]++;
mutex_unlock(&bfregi->lock);
err = mlx5_cmd_uar_alloc(dev->mdev, &uar_index,
context->devx_uid);
if (err) {
mlx5_ib_warn(dev, "UAR alloc failed\n");
goto free_bfreg;
}
} else {
uar_index = bfregi->sys_pages[idx];
}
pfn = uar_index2pfn(dev, uar_index);
mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE,
prot, NULL);
if (err) {
mlx5_ib_err(dev,
"rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n",
err, mmap_cmd2str(cmd));
goto err;
}
if (dyn_uar)
bfregi->sys_pages[idx] = uar_index;
return 0;
err:
if (!dyn_uar)
return err;
mlx5_cmd_uar_dealloc(dev->mdev, idx, context->devx_uid);
free_bfreg:
mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx);
return err;
}
static unsigned long mlx5_vma_to_pgoff(struct vm_area_struct *vma)
{
unsigned long idx;
u8 command;
command = get_command(vma->vm_pgoff);
idx = get_extended_index(vma->vm_pgoff);
return (command << 16 | idx);
}
static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev,
struct vm_area_struct *vma,
struct ib_ucontext *ucontext)
{
struct mlx5_user_mmap_entry *mentry;
struct rdma_user_mmap_entry *entry;
unsigned long pgoff;
pgprot_t prot;
phys_addr_t pfn;
int ret;
pgoff = mlx5_vma_to_pgoff(vma);
entry = rdma_user_mmap_entry_get_pgoff(ucontext, pgoff);
if (!entry)
return -EINVAL;
mentry = to_mmmap(entry);
pfn = (mentry->address >> PAGE_SHIFT);
if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR ||
mentry->mmap_flag == MLX5_IB_MMAP_TYPE_UAR_NC)
prot = pgprot_noncached(vma->vm_page_prot);
else
prot = pgprot_writecombine(vma->vm_page_prot);
ret = rdma_user_mmap_io(ucontext, vma, pfn,
entry->npages * PAGE_SIZE,
prot,
entry);
rdma_user_mmap_entry_put(&mentry->rdma_entry);
return ret;
}
static u64 mlx5_entry_to_mmap_offset(struct mlx5_user_mmap_entry *entry)
{
u64 cmd = (entry->rdma_entry.start_pgoff >> 16) & 0xFFFF;
u64 index = entry->rdma_entry.start_pgoff & 0xFFFF;
return (((index >> 8) << 16) | (cmd << MLX5_IB_MMAP_CMD_SHIFT) |
(index & 0xFF)) << PAGE_SHIFT;
}
static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
{
struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
unsigned long command;
phys_addr_t pfn;
command = get_command(vma->vm_pgoff);
switch (command) {
case MLX5_IB_MMAP_WC_PAGE:
case MLX5_IB_MMAP_ALLOC_WC:
net/mlx5: Reimplement write combining test JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.11-rc1 commit d98995b4bf981519dde4af0a081c393d62474039 Author: Jianbo Liu <jianbol@nvidia.com> Date: Mon Jun 3 13:26:37 2024 +0300 net/mlx5: Reimplement write combining test The test of write combining was added before in mlx5_ib driver. It opens UD QP and posts NOP WQEs, and uses BlueFlame doorbell. When BlueFlame is used, WQEs get written directly to a PCI BAR of the device (in addition to memory) so that the device handles them without having to access memory. In this test, the WQEs written in memory are different from the ones written to the BlueFlame which request CQE update. By checking the completion reports posted on CQ, we can know if BlueFlame succeeds or not. The write combining must be supported if BlueFlame succeeds as its register is written using write combining. This patch reimplements the test in the same way, but using a pair of SQ and CQ only. It is moved to mlx5_core as a general feature used by both mlx5_core and mlx5_ib. Besides, save write combine test result of the PCI function, so that its thousands of child functions such as SF can query without paying the time and resource penalty by itself. The test function is called only after failing to get the cached result. With this enhancement, all thousands of SFs of the PF attached to same driver no longer need to perform WC check explicitly, which is already done in the system. This saves several commands per SF, thereby speeds up SF creation and also saves completion EQ creation. Signed-off-by: Jianbo Liu <jianbol@nvidia.com> Reviewed-by: Tariq Toukan <tariqt@nvidia.com> Link: https://lore.kernel.org/r/4ff5a8cc4c5b5b0d98397baa45a5019bcdbf096e.1717409369.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:03 +00:00
if (!mlx5_wc_support_get(dev->mdev))
return -EPERM;
fallthrough;
case MLX5_IB_MMAP_NC_PAGE:
case MLX5_IB_MMAP_REGULAR_PAGE:
return uar_mmap(dev, command, vma, context);
case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
return -ENOSYS;
case MLX5_IB_MMAP_CORE_CLOCK:
if (vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EINVAL;
if (vma->vm_flags & VM_WRITE)
return -EPERM;
mm: replace vma->vm_flags direct modifications with modifier calls JIRA: https://issues.redhat.com/browse/RHEL-27740 Tested: by me Conflicts: dropped stuff we don't support when not applying cleanly, left the rest for sake of saving work commit 1c71222e5f2393b5ea1a41795c67589eea7e3490 Author: Suren Baghdasaryan <surenb@google.com> Date: Thu Jan 26 11:37:49 2023 -0800 mm: replace vma->vm_flags direct modifications with modifier calls Replace direct modifications to vma->vm_flags with calls to modifier functions to be able to track flag changes and to keep vma locking correctness. [akpm@linux-foundation.org: fix drivers/misc/open-dice.c, per Hyeonggon Yoo] Link: https://lkml.kernel.org/r/20230126193752.297968-5-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Mel Gorman <mgorman@techsingularity.net> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Acked-by: Sebastian Reichel <sebastian.reichel@collabora.com> Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arjun Roy <arjunroy@google.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: David Howells <dhowells@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: David Rientjes <rientjes@google.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Greg Thelen <gthelen@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jann Horn <jannh@google.com> Cc: Joel Fernandes <joelaf@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Kent Overstreet <kent.overstreet@linux.dev> Cc: Laurent Dufour <ldufour@linux.ibm.com> Cc: Lorenzo Stoakes <lstoakes@gmail.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Minchan Kim <minchan@google.com> Cc: Paul E. McKenney <paulmck@kernel.org> Cc: Peter Oskolkov <posk@google.com> Cc: Peter Xu <peterx@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Punit Agrawal <punit.agrawal@bytedance.com> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Cc: Shakeel Butt <shakeelb@google.com> Cc: Soheil Hassas Yeganeh <soheil@google.com> Cc: Song Liu <songliubraving@fb.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
2024-04-29 18:32:57 +00:00
vm_flags_clear(vma, VM_MAYWRITE);
/* Don't expose to user-space information it shouldn't have */
if (PAGE_SIZE > 4096)
return -EOPNOTSUPP;
pfn = (dev->mdev->iseg_base +
offsetof(struct mlx5_init_seg, internal_timer_h)) >>
PAGE_SHIFT;
return rdma_user_mmap_io(&context->ibucontext, vma, pfn,
PAGE_SIZE,
pgprot_noncached(vma->vm_page_prot),
NULL);
case MLX5_IB_MMAP_CLOCK_INFO:
return mlx5_ib_mmap_clock_info_page(dev, vma, context);
default:
return mlx5_ib_mmap_offset(dev, vma, ibcontext);
}
return 0;
}
static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
{
struct mlx5_ib_pd *pd = to_mpd(ibpd);
struct ib_device *ibdev = ibpd->device;
struct mlx5_ib_alloc_pd_resp resp;
int err;
u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {};
u16 uid = 0;
struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
udata, struct mlx5_ib_ucontext, ibucontext);
uid = context ? context->devx_uid : 0;
MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
MLX5_SET(alloc_pd_in, in, uid, uid);
err = mlx5_cmd_exec_inout(to_mdev(ibdev)->mdev, alloc_pd, in, out);
if (err)
return err;
pd->pdn = MLX5_GET(alloc_pd_out, out, pd);
pd->uid = uid;
if (udata) {
resp.pdn = pd->pdn;
if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
return -EFAULT;
}
}
return 0;
}
static int mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
{
struct mlx5_ib_dev *mdev = to_mdev(pd->device);
struct mlx5_ib_pd *mpd = to_mpd(pd);
return mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
}
static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
{
struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
struct mlx5_ib_qp *mqp = to_mqp(ibqp);
int err;
u16 uid;
uid = ibqp->pd ?
to_mpd(ibqp->pd)->uid : 0;
if (mqp->flags & IB_QP_CREATE_SOURCE_QPN) {
mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n");
return -EOPNOTSUPP;
}
err = mlx5_cmd_attach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
if (err)
mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
ibqp->qp_num, gid->raw);
return err;
}
static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
{
struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
int err;
u16 uid;
uid = ibqp->pd ?
to_mpd(ibqp->pd)->uid : 0;
err = mlx5_cmd_detach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
if (err)
mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
ibqp->qp_num, gid->raw);
return err;
}
static int init_node_data(struct mlx5_ib_dev *dev)
{
int err;
err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
if (err)
return err;
dev->mdev->rev_id = dev->mdev->pdev->revision;
return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
}
static ssize_t fw_pages_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mlx5_ib_dev *dev =
rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
RDMA: Convert sysfs device * show functions to use sysfs_emit() Done with cocci script: @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - sprintf(buf, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - snprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - scnprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; expression chr; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - strcpy(buf, chr); + sysfs_emit(buf, chr); ...> } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - sprintf(buf, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - snprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - scnprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... - len += scnprintf(buf + len, PAGE_SIZE - len, + len += sysfs_emit_at(buf, len, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; expression chr; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { ... - strcpy(buf, chr); - return strlen(buf); + return sysfs_emit(buf, chr); } Link: https://lore.kernel.org/r/7f406fa8e3aa2552c022bec680f621e38d1fe414.1602122879.git.joe@perches.com Signed-off-by: Joe Perches <joe@perches.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2020-10-08 02:36:24 +00:00
return sysfs_emit(buf, "%d\n", dev->mdev->priv.fw_pages);
}
static DEVICE_ATTR_RO(fw_pages);
static ssize_t reg_pages_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mlx5_ib_dev *dev =
rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
RDMA: Convert sysfs device * show functions to use sysfs_emit() Done with cocci script: @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - sprintf(buf, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - snprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - scnprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; expression chr; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - strcpy(buf, chr); + sysfs_emit(buf, chr); ...> } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - sprintf(buf, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - snprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - scnprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... - len += scnprintf(buf + len, PAGE_SIZE - len, + len += sysfs_emit_at(buf, len, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; expression chr; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { ... - strcpy(buf, chr); - return strlen(buf); + return sysfs_emit(buf, chr); } Link: https://lore.kernel.org/r/7f406fa8e3aa2552c022bec680f621e38d1fe414.1602122879.git.joe@perches.com Signed-off-by: Joe Perches <joe@perches.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2020-10-08 02:36:24 +00:00
return sysfs_emit(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
}
static DEVICE_ATTR_RO(reg_pages);
static ssize_t hca_type_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mlx5_ib_dev *dev =
rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
RDMA: Convert sysfs device * show functions to use sysfs_emit() Done with cocci script: @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - sprintf(buf, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - snprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - scnprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; expression chr; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - strcpy(buf, chr); + sysfs_emit(buf, chr); ...> } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - sprintf(buf, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - snprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - scnprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... - len += scnprintf(buf + len, PAGE_SIZE - len, + len += sysfs_emit_at(buf, len, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; expression chr; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { ... - strcpy(buf, chr); - return strlen(buf); + return sysfs_emit(buf, chr); } Link: https://lore.kernel.org/r/7f406fa8e3aa2552c022bec680f621e38d1fe414.1602122879.git.joe@perches.com Signed-off-by: Joe Perches <joe@perches.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2020-10-08 02:36:24 +00:00
return sysfs_emit(buf, "MT%d\n", dev->mdev->pdev->device);
}
static DEVICE_ATTR_RO(hca_type);
static ssize_t hw_rev_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mlx5_ib_dev *dev =
rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
RDMA: Convert sysfs device * show functions to use sysfs_emit() Done with cocci script: @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - sprintf(buf, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - snprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - scnprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; expression chr; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - strcpy(buf, chr); + sysfs_emit(buf, chr); ...> } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - sprintf(buf, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - snprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - scnprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... - len += scnprintf(buf + len, PAGE_SIZE - len, + len += sysfs_emit_at(buf, len, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; expression chr; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { ... - strcpy(buf, chr); - return strlen(buf); + return sysfs_emit(buf, chr); } Link: https://lore.kernel.org/r/7f406fa8e3aa2552c022bec680f621e38d1fe414.1602122879.git.joe@perches.com Signed-off-by: Joe Perches <joe@perches.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2020-10-08 02:36:24 +00:00
return sysfs_emit(buf, "%x\n", dev->mdev->rev_id);
}
static DEVICE_ATTR_RO(hw_rev);
static ssize_t board_id_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct mlx5_ib_dev *dev =
rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
RDMA: Convert sysfs device * show functions to use sysfs_emit() Done with cocci script: @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - sprintf(buf, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - snprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - scnprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> } @@ identifier d_show; identifier dev, attr, buf; expression chr; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... return - strcpy(buf, chr); + sysfs_emit(buf, chr); ...> } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - sprintf(buf, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - snprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... len = - scnprintf(buf, PAGE_SIZE, + sysfs_emit(buf, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; identifier len; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { <... - len += scnprintf(buf + len, PAGE_SIZE - len, + len += sysfs_emit_at(buf, len, ...); ...> return len; } @@ identifier d_show; identifier dev, attr, buf; expression chr; @@ ssize_t d_show(struct device *dev, struct device_attribute *attr, char *buf) { ... - strcpy(buf, chr); - return strlen(buf); + return sysfs_emit(buf, chr); } Link: https://lore.kernel.org/r/7f406fa8e3aa2552c022bec680f621e38d1fe414.1602122879.git.joe@perches.com Signed-off-by: Joe Perches <joe@perches.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2020-10-08 02:36:24 +00:00
return sysfs_emit(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
dev->mdev->board_id);
}
static DEVICE_ATTR_RO(board_id);
static struct attribute *mlx5_class_attributes[] = {
&dev_attr_hw_rev.attr,
&dev_attr_hca_type.attr,
&dev_attr_board_id.attr,
&dev_attr_fw_pages.attr,
&dev_attr_reg_pages.attr,
NULL,
};
static const struct attribute_group mlx5_attr_group = {
.attrs = mlx5_class_attributes,
};
static void pkey_change_handler(struct work_struct *work)
{
struct mlx5_ib_port_resources *ports =
container_of(work, struct mlx5_ib_port_resources,
pkey_change_work);
if (!ports->gsi)
/*
* We got this event before device was fully configured
* and MAD registration code wasn't called/finished yet.
*/
return;
mlx5_ib_gsi_pkey_change(ports->gsi);
}
static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
{
struct mlx5_ib_qp *mqp;
struct mlx5_ib_cq *send_mcq, *recv_mcq;
struct mlx5_core_cq *mcq;
struct list_head cq_armed_list;
unsigned long flags_qp;
unsigned long flags_cq;
unsigned long flags;
INIT_LIST_HEAD(&cq_armed_list);
/* Go over qp list reside on that ibdev, sync with create/destroy qp.*/
spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
spin_lock_irqsave(&mqp->sq.lock, flags_qp);
if (mqp->sq.tail != mqp->sq.head) {
send_mcq = to_mcq(mqp->ibqp.send_cq);
spin_lock_irqsave(&send_mcq->lock, flags_cq);
if (send_mcq->mcq.comp &&
mqp->ibqp.send_cq->comp_handler) {
if (!send_mcq->mcq.reset_notify_added) {
send_mcq->mcq.reset_notify_added = 1;
list_add_tail(&send_mcq->mcq.reset_notify,
&cq_armed_list);
}
}
spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
}
spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
spin_lock_irqsave(&mqp->rq.lock, flags_qp);
/* no handling is needed for SRQ */
if (!mqp->ibqp.srq) {
if (mqp->rq.tail != mqp->rq.head) {
recv_mcq = to_mcq(mqp->ibqp.recv_cq);
spin_lock_irqsave(&recv_mcq->lock, flags_cq);
if (recv_mcq->mcq.comp &&
mqp->ibqp.recv_cq->comp_handler) {
if (!recv_mcq->mcq.reset_notify_added) {
recv_mcq->mcq.reset_notify_added = 1;
list_add_tail(&recv_mcq->mcq.reset_notify,
&cq_armed_list);
}
}
spin_unlock_irqrestore(&recv_mcq->lock,
flags_cq);
}
}
spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
}
/*At that point all inflight post send were put to be executed as of we
* lock/unlock above locks Now need to arm all involved CQs.
*/
list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
mcq->comp(mcq, NULL);
}
spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
}
static void delay_drop_handler(struct work_struct *work)
{
int err;
struct mlx5_ib_delay_drop *delay_drop =
container_of(work, struct mlx5_ib_delay_drop,
delay_drop_work);
atomic_inc(&delay_drop->events_cnt);
IB/mlx5: Use IP version matching to classify IP traffic This change adds the ability for flow steering to classify IPv4/6 packets with MPLS tag (Ethertype 0x8847 and 0x8848) as standard IP packets and hit IPv4/6 classifed steering rules. When user added a flow rule with IP classification, driver was implicitly adding ethertype matching to the created rule in order to distinguish between IPv4 and IPv6 protocols. Since IP packets with MPLS tag header have MPLS ethertype, they missed the rule and ended up hitting the default filters. Such behavior prevented from MPLS packets to undergo inbound traffic load balancing flows (if such were defined by configuring RSS) to achieve higher throughput - the way that non-MPLS IP packets performed. Since our device is able to look past the MPLS tag and identify the next protocol we introduce this solution which replaces Ethertype matching by the device's capability to perform IP version parsing and matching in order to distinguish between IPv4 and IPv6. Therefore, whenever a flow with IP spec is added and device support IP version matching, driver will implicitly add IP version matching to the rule (Based on the IP spec type) without Ethertype matching which will cause relevant MPLS tagged packets to hit this rule as well. Otherwise (device doesn't support IP version matching), we fall back to setting Ethertype matching. If the user's filters specify an L2 ethertype and an IP spec the rule will then match both the ethertype and the IP version. The device's support for IP version matching is reported by the device via dedicated capability bit in query_device_cap and named outer/inner_ip_version. Signed-off-by: Ariel Levkovich <lariel@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-04-03 10:11:03 +00:00
mutex_lock(&delay_drop->lock);
err = mlx5_core_set_delay_drop(delay_drop->dev, delay_drop->timeout);
if (err) {
mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n",
delay_drop->timeout);
delay_drop->activate = false;
}
mutex_unlock(&delay_drop->lock);
}
static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
struct ib_event *ibev)
{
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 port = (eqe->data.port.port >> 4) & 0xf;
switch (eqe->sub_type) {
case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT:
if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
IB_LINK_LAYER_ETHERNET)
schedule_work(&ibdev->delay_drop.delay_drop_work);
break;
default: /* do nothing */
return;
}
}
static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
struct ib_event *ibev)
{
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 port = (eqe->data.port.port >> 4) & 0xf;
ibev->element.port_num = port;
switch (eqe->sub_type) {
case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
/* In RoCE, port up/down events are handled in
* mlx5_netdev_event().
*/
if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
IB_LINK_LAYER_ETHERNET)
return -EINVAL;
ibev->event = (eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) ?
IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
break;
case MLX5_PORT_CHANGE_SUBTYPE_LID:
ibev->event = IB_EVENT_LID_CHANGE;
break;
case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
ibev->event = IB_EVENT_PKEY_CHANGE;
schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
break;
case MLX5_PORT_CHANGE_SUBTYPE_GUID:
ibev->event = IB_EVENT_GID_CHANGE;
break;
case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
ibev->event = IB_EVENT_CLIENT_REREGISTER;
break;
default:
return -EINVAL;
}
return 0;
}
static void mlx5_ib_handle_event(struct work_struct *_work)
{
struct mlx5_ib_event_work *work =
container_of(_work, struct mlx5_ib_event_work, work);
struct mlx5_ib_dev *ibdev;
struct ib_event ibev;
bool fatal = false;
if (work->is_slave) {
ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi);
if (!ibdev)
goto out;
} else {
ibdev = work->dev;
}
switch (work->event) {
case MLX5_DEV_EVENT_SYS_ERROR:
ibev.event = IB_EVENT_DEVICE_FATAL;
mlx5_ib_handle_internal_error(ibdev);
ibev.element.port_num = (u8)(unsigned long)work->param;
fatal = true;
break;
case MLX5_EVENT_TYPE_PORT_CHANGE:
if (handle_port_change(ibdev, work->param, &ibev))
goto out;
break;
case MLX5_EVENT_TYPE_GENERAL_EVENT:
handle_general_event(ibdev, work->param, &ibev);
fallthrough;
default:
goto out;
}
ibev.device = &ibdev->ib_dev;
if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) {
mlx5_ib_warn(ibdev, "warning: event on port %d\n", ibev.element.port_num);
goto out;
}
if (ibdev->ib_active)
ib_dispatch_event(&ibev);
if (fatal)
ibdev->ib_active = false;
out:
kfree(work);
}
static int mlx5_ib_event(struct notifier_block *nb,
unsigned long event, void *param)
{
struct mlx5_ib_event_work *work;
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (!work)
return NOTIFY_DONE;
INIT_WORK(&work->work, mlx5_ib_handle_event);
work->dev = container_of(nb, struct mlx5_ib_dev, mdev_events);
work->is_slave = false;
work->param = param;
work->event = event;
queue_work(mlx5_ib_event_wq, &work->work);
return NOTIFY_OK;
}
static int mlx5_ib_event_slave_port(struct notifier_block *nb,
unsigned long event, void *param)
{
struct mlx5_ib_event_work *work;
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (!work)
return NOTIFY_DONE;
INIT_WORK(&work->work, mlx5_ib_handle_event);
work->mpi = container_of(nb, struct mlx5_ib_multiport_info, mdev_events);
work->is_slave = true;
work->param = param;
work->event = event;
queue_work(mlx5_ib_event_wq, &work->work);
return NOTIFY_OK;
}
static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane)
{
struct mlx5_hca_vport_context vport_ctx;
int err;
*num_plane = 0;
if (!MLX5_CAP_GEN(mdev, ib_virt))
return 0;
err = mlx5_query_hca_vport_context(mdev, 0, 1, 0, &vport_ctx);
if (err)
return err;
*num_plane = vport_ctx.num_plane;
return 0;
}
static int set_has_smi_cap(struct mlx5_ib_dev *dev)
{
struct mlx5_hca_vport_context vport_ctx;
int err;
int port;
if (MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_IB)
return 0;
for (port = 1; port <= dev->num_ports; port++) {
if (dev->num_plane) {
dev->port_caps[port - 1].has_smi = false;
continue;
} else if (!MLX5_CAP_GEN(dev->mdev, ib_virt) ||
dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) {
dev->port_caps[port - 1].has_smi = true;
continue;
}
err = mlx5_query_hca_vport_context(dev->mdev, 0, port, 0,
&vport_ctx);
if (err) {
mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n",
port, err);
return err;
}
dev->port_caps[port - 1].has_smi = vport_ctx.has_smi;
}
return 0;
}
static void get_ext_port_caps(struct mlx5_ib_dev *dev)
{
unsigned int port;
rdma_for_each_port (&dev->ib_dev, port)
mlx5_query_ext_port_caps(dev, port);
}
static u8 mlx5_get_umr_fence(u8 umr_fence_cap)
{
switch (umr_fence_cap) {
case MLX5_CAP_UMR_FENCE_NONE:
return MLX5_FENCE_MODE_NONE;
case MLX5_CAP_UMR_FENCE_SMALL:
return MLX5_FENCE_MODE_INITIATOR_SMALL;
default:
return MLX5_FENCE_MODE_STRONG_ORDERING;
}
}
int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev)
{
struct mlx5_ib_resources *devr = &dev->devr;
struct ib_cq_init_attr cq_attr = {.cqe = 1};
struct ib_device *ibdev;
struct ib_pd *pd;
struct ib_cq *cq;
int ret = 0;
/*
* devr->c0 is set once, never changed until device unload.
* Avoid taking the mutex if initialization is already done.
*/
if (devr->c0)
return 0;
mutex_lock(&devr->cq_lock);
if (devr->c0)
goto unlock;
IB/mlx5: Expose extended error counters This patch adds below requester and responder side error counters, which will be exposed by hardware counters interface and are supported as part of query Q counters command extension. +---------------------------+-------------------------------------+ | Name | Description | |---------------------------+-------------------------------------| |resp_local_length_error | Number of times responder detected | | | local length errors | |---------------------------+-------------------------------------| |resp_cqe_error | Number of CQEs completed with error | | | at responder | |---------------------------+-------------------------------------| |req_cqe_error | Number of CQEs completed with error | | | at requester | |---------------------------+-------------------------------------| |req_remote_invalid_request | Number of times requester detected | | | remote invalid request error | |---------------------------+-------------------------------------| |req_remote_access_error | Number of times requester detected | | | remote access error | |---------------------------+-------------------------------------| |resp_remote_access_error | Number of times responder detected | | | remote access error | |---------------------------+-------------------------------------| |resp_cqe_flush_error | Number of CQEs completed with | | | flushed with error at responder | |---------------------------+-------------------------------------| |req_cqe_flush_error | Number of CQEs completed with | | | flushed with error at requester | +---------------------------+-------------------------------------+ Signed-off-by: Parav Pandit <parav@mellanox.com> Reviewed-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Eli Cohen <eli@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-19 04:19:37 +00:00
ibdev = &dev->ib_dev;
pd = ib_alloc_pd(ibdev, 0);
if (IS_ERR(pd)) {
ret = PTR_ERR(pd);
mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%d\n", ret);
goto unlock;
}
cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%d\n", ret);
ib_dealloc_pd(pd);
goto unlock;
}
devr->p0 = pd;
devr->c0 = cq;
unlock:
mutex_unlock(&devr->cq_lock);
return ret;
}
int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev)
{
struct mlx5_ib_resources *devr = &dev->devr;
struct ib_srq_init_attr attr;
struct ib_srq *s0, *s1;
int ret = 0;
/*
* devr->s1 is set once, never changed until device unload.
* Avoid taking the mutex if initialization is already done.
*/
if (devr->s1)
return 0;
mutex_lock(&devr->srq_lock);
if (devr->s1)
goto unlock;
ret = mlx5_ib_dev_res_cq_init(dev);
if (ret)
goto unlock;
memset(&attr, 0, sizeof(attr));
attr.attr.max_sge = 1;
attr.attr.max_wr = 1;
attr.srq_type = IB_SRQT_XRC;
attr.ext.cq = devr->c0;
s0 = ib_create_srq(devr->p0, &attr);
if (IS_ERR(s0)) {
ret = PTR_ERR(s0);
mlx5_ib_err(dev, "Couldn't create SRQ 0 for res init, err=%d\n", ret);
goto unlock;
}
memset(&attr, 0, sizeof(attr));
attr.attr.max_sge = 1;
attr.attr.max_wr = 1;
attr.srq_type = IB_SRQT_BASIC;
IB/mlx5: Expose extended error counters This patch adds below requester and responder side error counters, which will be exposed by hardware counters interface and are supported as part of query Q counters command extension. +---------------------------+-------------------------------------+ | Name | Description | |---------------------------+-------------------------------------| |resp_local_length_error | Number of times responder detected | | | local length errors | |---------------------------+-------------------------------------| |resp_cqe_error | Number of CQEs completed with error | | | at responder | |---------------------------+-------------------------------------| |req_cqe_error | Number of CQEs completed with error | | | at requester | |---------------------------+-------------------------------------| |req_remote_invalid_request | Number of times requester detected | | | remote invalid request error | |---------------------------+-------------------------------------| |req_remote_access_error | Number of times requester detected | | | remote access error | |---------------------------+-------------------------------------| |resp_remote_access_error | Number of times responder detected | | | remote access error | |---------------------------+-------------------------------------| |resp_cqe_flush_error | Number of CQEs completed with | | | flushed with error at responder | |---------------------------+-------------------------------------| |req_cqe_flush_error | Number of CQEs completed with | | | flushed with error at requester | +---------------------------+-------------------------------------+ Signed-off-by: Parav Pandit <parav@mellanox.com> Reviewed-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Eli Cohen <eli@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-06-19 04:19:37 +00:00
s1 = ib_create_srq(devr->p0, &attr);
if (IS_ERR(s1)) {
ret = PTR_ERR(s1);
mlx5_ib_err(dev, "Couldn't create SRQ 1 for res init, err=%d\n", ret);
ib_destroy_srq(s0);
}
devr->s0 = s0;
devr->s1 = s1;
unlock:
mutex_unlock(&devr->srq_lock);
return ret;
}
static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
{
struct mlx5_ib_resources *devr = &dev->devr;
int ret;
if (!MLX5_CAP_GEN(dev->mdev, xrc))
return -EOPNOTSUPP;
ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0);
if (ret)
return ret;
ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0);
if (ret) {
mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0);
return ret;
}
mutex_init(&devr->cq_lock);
mutex_init(&devr->srq_lock);
return 0;
}
static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev)
{
struct mlx5_ib_resources *devr = &dev->devr;
/* After s0/s1 init, they are not unset during the device lifetime. */
if (devr->s1) {
ib_destroy_srq(devr->s1);
ib_destroy_srq(devr->s0);
}
mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0);
mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0);
/* After p0/c0 init, they are not unset during the device lifetime. */
if (devr->c0) {
ib_destroy_cq(devr->c0);
ib_dealloc_pd(devr->p0);
}
mutex_destroy(&devr->cq_lock);
mutex_destroy(&devr->srq_lock);
}
static int
mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev)
{
int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
struct mlx5_core_dev *mdev = dev->mdev;
void *mkc;
u32 mkey;
u32 pdn;
u32 *in;
int err;
err = mlx5_core_alloc_pd(mdev, &pdn);
if (err)
return err;
in = kvzalloc(inlen, GFP_KERNEL);
if (!in) {
err = -ENOMEM;
goto err;
}
MLX5_SET(create_mkey_in, in, data_direct, 1);
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
MLX5_SET(mkc, mkc, lw, 1);
MLX5_SET(mkc, mkc, lr, 1);
MLX5_SET(mkc, mkc, rw, 1);
MLX5_SET(mkc, mkc, rr, 1);
MLX5_SET(mkc, mkc, a, 1);
MLX5_SET(mkc, mkc, pd, pdn);
MLX5_SET(mkc, mkc, length64, 1);
MLX5_SET(mkc, mkc, qpn, 0xffffff);
err = mlx5_core_create_mkey(mdev, &mkey, in, inlen);
kvfree(in);
if (err)
goto err;
dev->ddr.mkey = mkey;
dev->ddr.pdn = pdn;
return 0;
err:
mlx5_core_dealloc_pd(mdev, pdn);
return err;
}
static void
mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev)
{
mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey);
mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn);
}
static u32 get_core_cap_flags(struct ib_device *ibdev,
struct mlx5_hca_vport_context *rep)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
bool raw_support = !mlx5_core_mp_enabled(dev->mdev);
u32 ret = 0;
if (rep->grh_required)
ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED;
if (dev->num_plane)
return ret | RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_IB_MAD |
RDMA_CORE_CAP_IB_CM | RDMA_CORE_CAP_IB_SA |
RDMA_CORE_CAP_AF_IB;
else if (ibdev->type == RDMA_DEVICE_TYPE_SMI)
return ret | RDMA_CORE_CAP_IB_MAD | RDMA_CORE_CAP_IB_SMI;
if (ll == IB_LINK_LAYER_INFINIBAND)
return ret | RDMA_CORE_PORT_IBA_IB;
if (raw_support)
ret |= RDMA_CORE_PORT_RAW_PACKET;
if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
return ret;
if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
return ret;
if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
ret |= RDMA_CORE_PORT_IBA_ROCE;
if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
return ret;
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
static int mlx5_port_immutable(struct ib_device *ibdev, u32 port_num,
struct ib_port_immutable *immutable)
IB/mlx5: Add port protocol stats Expose new counters using the get protocol stats callback. We expose the following counters: |------------------------------------------------------------------------| | Name | IB | EN | Description | |------------------------------------------------------------------------| |rx_write_requests | + | - | Number of received WRITE requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_read_requests | + | - | Number of received READ requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_atomic_requests | + | - | Number of received ATOMIC requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |out_of_buffer | + | + | Number of drops occurred due to lack | | | | | of WQE for the associated QPs/RQs. | |------------------------------------------------------------------------| |out_of_sequence | + | - | Number of errors in the packet | | | | | transport sequence number | |------------------------------------------------------------------------| |duplicate_request | + | + | Number of received duplicated packets. | | | | | A request that previously executed is | | | | | named duplicated. | |------------------------------------------------------------------------| |rnr_nak_retry_err | + | + | Number of received RNR NAC packets. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |packet_seq_err | + | + | Number of received NAK - sequence error| | | | | packets. The QP retry limit did not | | | | | exceed. | |------------------------------------------------------------------------| |implied_nak_err | + | + | Number of times the requester detected | | | | | an ACK with a PSN larger than expected | | | | | PSN for RDMA READ or ATOMIC response | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |local_ack_timeout_err| + | - | Number of NO ACK responses from | | | | | responder within timer interval. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| Counters are available if all of them are supported. Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-06-17 12:10:56 +00:00
{
struct ib_port_attr attr;
struct mlx5_ib_dev *dev = to_mdev(ibdev);
enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
struct mlx5_hca_vport_context rep = {0};
int err;
IB/mlx5: Add port protocol stats Expose new counters using the get protocol stats callback. We expose the following counters: |------------------------------------------------------------------------| | Name | IB | EN | Description | |------------------------------------------------------------------------| |rx_write_requests | + | - | Number of received WRITE requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_read_requests | + | - | Number of received READ requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_atomic_requests | + | - | Number of received ATOMIC requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |out_of_buffer | + | + | Number of drops occurred due to lack | | | | | of WQE for the associated QPs/RQs. | |------------------------------------------------------------------------| |out_of_sequence | + | - | Number of errors in the packet | | | | | transport sequence number | |------------------------------------------------------------------------| |duplicate_request | + | + | Number of received duplicated packets. | | | | | A request that previously executed is | | | | | named duplicated. | |------------------------------------------------------------------------| |rnr_nak_retry_err | + | + | Number of received RNR NAC packets. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |packet_seq_err | + | + | Number of received NAK - sequence error| | | | | packets. The QP retry limit did not | | | | | exceed. | |------------------------------------------------------------------------| |implied_nak_err | + | + | Number of times the requester detected | | | | | an ACK with a PSN larger than expected | | | | | PSN for RDMA READ or ATOMIC response | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |local_ack_timeout_err| + | - | Number of NO ACK responses from | | | | | responder within timer interval. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| Counters are available if all of them are supported. Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-06-17 12:10:56 +00:00
err = ib_query_port(ibdev, port_num, &attr);
if (err)
return err;
if (ll == IB_LINK_LAYER_INFINIBAND) {
if (ibdev->type == RDMA_DEVICE_TYPE_SMI)
port_num = smi_to_native_portnum(dev, port_num);
err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0,
&rep);
if (err)
return err;
}
IB/mlx5: Add port protocol stats Expose new counters using the get protocol stats callback. We expose the following counters: |------------------------------------------------------------------------| | Name | IB | EN | Description | |------------------------------------------------------------------------| |rx_write_requests | + | - | Number of received WRITE requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_read_requests | + | - | Number of received READ requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_atomic_requests | + | - | Number of received ATOMIC requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |out_of_buffer | + | + | Number of drops occurred due to lack | | | | | of WQE for the associated QPs/RQs. | |------------------------------------------------------------------------| |out_of_sequence | + | - | Number of errors in the packet | | | | | transport sequence number | |------------------------------------------------------------------------| |duplicate_request | + | + | Number of received duplicated packets. | | | | | A request that previously executed is | | | | | named duplicated. | |------------------------------------------------------------------------| |rnr_nak_retry_err | + | + | Number of received RNR NAC packets. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |packet_seq_err | + | + | Number of received NAK - sequence error| | | | | packets. The QP retry limit did not | | | | | exceed. | |------------------------------------------------------------------------| |implied_nak_err | + | + | Number of times the requester detected | | | | | an ACK with a PSN larger than expected | | | | | PSN for RDMA READ or ATOMIC response | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |local_ack_timeout_err| + | - | Number of NO ACK responses from | | | | | responder within timer interval. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| Counters are available if all of them are supported. Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-06-17 12:10:56 +00:00
immutable->pkey_tbl_len = attr.pkey_tbl_len;
immutable->gid_tbl_len = attr.gid_tbl_len;
immutable->core_cap_flags = get_core_cap_flags(ibdev, &rep);
immutable->max_mad_size = IB_MGMT_MAD_SIZE;
return 0;
IB/mlx5: Add port protocol stats Expose new counters using the get protocol stats callback. We expose the following counters: |------------------------------------------------------------------------| | Name | IB | EN | Description | |------------------------------------------------------------------------| |rx_write_requests | + | - | Number of received WRITE requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_read_requests | + | - | Number of received READ requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_atomic_requests | + | - | Number of received ATOMIC requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |out_of_buffer | + | + | Number of drops occurred due to lack | | | | | of WQE for the associated QPs/RQs. | |------------------------------------------------------------------------| |out_of_sequence | + | - | Number of errors in the packet | | | | | transport sequence number | |------------------------------------------------------------------------| |duplicate_request | + | + | Number of received duplicated packets. | | | | | A request that previously executed is | | | | | named duplicated. | |------------------------------------------------------------------------| |rnr_nak_retry_err | + | + | Number of received RNR NAC packets. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |packet_seq_err | + | + | Number of received NAK - sequence error| | | | | packets. The QP retry limit did not | | | | | exceed. | |------------------------------------------------------------------------| |implied_nak_err | + | + | Number of times the requester detected | | | | | an ACK with a PSN larger than expected | | | | | PSN for RDMA READ or ATOMIC response | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |local_ack_timeout_err| + | - | Number of NO ACK responses from | | | | | responder within timer interval. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| Counters are available if all of them are supported. Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-06-17 12:10:56 +00:00
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
static int mlx5_port_rep_immutable(struct ib_device *ibdev, u32 port_num,
struct ib_port_immutable *immutable)
IB/mlx5: Add port protocol stats Expose new counters using the get protocol stats callback. We expose the following counters: |------------------------------------------------------------------------| | Name | IB | EN | Description | |------------------------------------------------------------------------| |rx_write_requests | + | - | Number of received WRITE requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_read_requests | + | - | Number of received READ requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_atomic_requests | + | - | Number of received ATOMIC requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |out_of_buffer | + | + | Number of drops occurred due to lack | | | | | of WQE for the associated QPs/RQs. | |------------------------------------------------------------------------| |out_of_sequence | + | - | Number of errors in the packet | | | | | transport sequence number | |------------------------------------------------------------------------| |duplicate_request | + | + | Number of received duplicated packets. | | | | | A request that previously executed is | | | | | named duplicated. | |------------------------------------------------------------------------| |rnr_nak_retry_err | + | + | Number of received RNR NAC packets. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |packet_seq_err | + | + | Number of received NAK - sequence error| | | | | packets. The QP retry limit did not | | | | | exceed. | |------------------------------------------------------------------------| |implied_nak_err | + | + | Number of times the requester detected | | | | | an ACK with a PSN larger than expected | | | | | PSN for RDMA READ or ATOMIC response | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |local_ack_timeout_err| + | - | Number of NO ACK responses from | | | | | responder within timer interval. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| Counters are available if all of them are supported. Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-06-17 12:10:56 +00:00
{
struct ib_port_attr attr;
int err;
IB/mlx5: Add port protocol stats Expose new counters using the get protocol stats callback. We expose the following counters: |------------------------------------------------------------------------| | Name | IB | EN | Description | |------------------------------------------------------------------------| |rx_write_requests | + | - | Number of received WRITE requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_read_requests | + | - | Number of received READ requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_atomic_requests | + | - | Number of received ATOMIC requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |out_of_buffer | + | + | Number of drops occurred due to lack | | | | | of WQE for the associated QPs/RQs. | |------------------------------------------------------------------------| |out_of_sequence | + | - | Number of errors in the packet | | | | | transport sequence number | |------------------------------------------------------------------------| |duplicate_request | + | + | Number of received duplicated packets. | | | | | A request that previously executed is | | | | | named duplicated. | |------------------------------------------------------------------------| |rnr_nak_retry_err | + | + | Number of received RNR NAC packets. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |packet_seq_err | + | + | Number of received NAK - sequence error| | | | | packets. The QP retry limit did not | | | | | exceed. | |------------------------------------------------------------------------| |implied_nak_err | + | + | Number of times the requester detected | | | | | an ACK with a PSN larger than expected | | | | | PSN for RDMA READ or ATOMIC response | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |local_ack_timeout_err| + | - | Number of NO ACK responses from | | | | | responder within timer interval. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| Counters are available if all of them are supported. Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-06-17 12:10:56 +00:00
immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
IB/mlx5: Add port protocol stats Expose new counters using the get protocol stats callback. We expose the following counters: |------------------------------------------------------------------------| | Name | IB | EN | Description | |------------------------------------------------------------------------| |rx_write_requests | + | - | Number of received WRITE requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_read_requests | + | - | Number of received READ requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_atomic_requests | + | - | Number of received ATOMIC requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |out_of_buffer | + | + | Number of drops occurred due to lack | | | | | of WQE for the associated QPs/RQs. | |------------------------------------------------------------------------| |out_of_sequence | + | - | Number of errors in the packet | | | | | transport sequence number | |------------------------------------------------------------------------| |duplicate_request | + | + | Number of received duplicated packets. | | | | | A request that previously executed is | | | | | named duplicated. | |------------------------------------------------------------------------| |rnr_nak_retry_err | + | + | Number of received RNR NAC packets. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |packet_seq_err | + | + | Number of received NAK - sequence error| | | | | packets. The QP retry limit did not | | | | | exceed. | |------------------------------------------------------------------------| |implied_nak_err | + | + | Number of times the requester detected | | | | | an ACK with a PSN larger than expected | | | | | PSN for RDMA READ or ATOMIC response | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |local_ack_timeout_err| + | - | Number of NO ACK responses from | | | | | responder within timer interval. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| Counters are available if all of them are supported. Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-06-17 12:10:56 +00:00
err = ib_query_port(ibdev, port_num, &attr);
if (err)
return err;
immutable->pkey_tbl_len = attr.pkey_tbl_len;
immutable->gid_tbl_len = attr.gid_tbl_len;
immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
return 0;
}
static void get_dev_fw_str(struct ib_device *ibdev, char *str)
{
struct mlx5_ib_dev *dev =
container_of(ibdev, struct mlx5_ib_dev, ib_dev);
snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d",
fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev),
fw_rev_sub(dev->mdev));
}
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
static int lag_event(struct notifier_block *nb, unsigned long event, void *data)
{
struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev,
lag_events);
struct mlx5_core_dev *mdev = dev->mdev;
RDMA/mlx5: Ensure active slave attachment to the bond IB device JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master commit 0bd2c61df95321e1ec123017cd8657360d15a24e Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Thu Oct 31 15:36:52 2024 +0200 RDMA/mlx5: Ensure active slave attachment to the bond IB device Fix a race condition when creating a lag bond in active backup mode where after the bond creation the backup slave was attached to the IB device, instead of the active slave. This caused stale entries in the GID table, as the gid updating mechanism relies on ib_device_get_netdev(), which would return the backup slave. Send an MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE event when activating the lag, additionally to when modifying the lag. This ensures that eventually the active netdevice is stored in the bond IB device. When handling this event remove the GIDs of the previously attached netdevice in this port and rescan the GIDs of the newly attached netdevice. This ensures that eventually the active slave netdevice is correctly stored in the IB device port. While there might be a brief moment where the backup slave GIDs appear in the GID table, it will eventually stabilize with the correct GIDs (of the bond and the active slave). Fixes: 8d159eb2117b ("RDMA/mlx5: Use IB set_netdev and get_netdev functions") Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Link: https://patch.msgid.link/91fc2cb24f63add266a528c1c702668a80416d9f.1730381292.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:11 +00:00
struct ib_device *ibdev = &dev->ib_dev;
struct net_device *old_ndev = NULL;
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
struct mlx5_ib_port *port;
struct net_device *ndev;
RDMA/mlx5: Ensure active slave attachment to the bond IB device JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master commit 0bd2c61df95321e1ec123017cd8657360d15a24e Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Thu Oct 31 15:36:52 2024 +0200 RDMA/mlx5: Ensure active slave attachment to the bond IB device Fix a race condition when creating a lag bond in active backup mode where after the bond creation the backup slave was attached to the IB device, instead of the active slave. This caused stale entries in the GID table, as the gid updating mechanism relies on ib_device_get_netdev(), which would return the backup slave. Send an MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE event when activating the lag, additionally to when modifying the lag. This ensures that eventually the active netdevice is stored in the bond IB device. When handling this event remove the GIDs of the previously attached netdevice in this port and rescan the GIDs of the newly attached netdevice. This ensures that eventually the active slave netdevice is correctly stored in the IB device port. While there might be a brief moment where the backup slave GIDs appear in the GID table, it will eventually stabilize with the correct GIDs (of the bond and the active slave). Fixes: 8d159eb2117b ("RDMA/mlx5: Use IB set_netdev and get_netdev functions") Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Link: https://patch.msgid.link/91fc2cb24f63add266a528c1c702668a80416d9f.1730381292.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:11 +00:00
u32 portnum = 0;
int ret = 0;
int i;
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
switch (event) {
case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE:
ndev = data;
if (ndev) {
if (!mlx5_lag_is_roce(mdev)) {
// sriov lag
for (i = 0; i < dev->num_ports; i++) {
port = &dev->port[i];
if (port->rep && port->rep->vport ==
MLX5_VPORT_UPLINK) {
portnum = i;
break;
}
}
}
RDMA/mlx5: Ensure active slave attachment to the bond IB device JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master commit 0bd2c61df95321e1ec123017cd8657360d15a24e Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Thu Oct 31 15:36:52 2024 +0200 RDMA/mlx5: Ensure active slave attachment to the bond IB device Fix a race condition when creating a lag bond in active backup mode where after the bond creation the backup slave was attached to the IB device, instead of the active slave. This caused stale entries in the GID table, as the gid updating mechanism relies on ib_device_get_netdev(), which would return the backup slave. Send an MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE event when activating the lag, additionally to when modifying the lag. This ensures that eventually the active netdevice is stored in the bond IB device. When handling this event remove the GIDs of the previously attached netdevice in this port and rescan the GIDs of the newly attached netdevice. This ensures that eventually the active slave netdevice is correctly stored in the IB device port. While there might be a brief moment where the backup slave GIDs appear in the GID table, it will eventually stabilize with the correct GIDs (of the bond and the active slave). Fixes: 8d159eb2117b ("RDMA/mlx5: Use IB set_netdev and get_netdev functions") Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Link: https://patch.msgid.link/91fc2cb24f63add266a528c1c702668a80416d9f.1730381292.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:11 +00:00
old_ndev = ib_device_get_netdev(ibdev, portnum + 1);
ret = ib_device_set_netdev(ibdev, ndev, portnum + 1);
if (ret)
goto out;
if (old_ndev)
roce_del_all_netdev_gids(ibdev, portnum + 1,
old_ndev);
rdma_roce_rescan_port(ibdev, portnum + 1);
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
}
break;
default:
return NOTIFY_DONE;
}
RDMA/mlx5: Ensure active slave attachment to the bond IB device JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master commit 0bd2c61df95321e1ec123017cd8657360d15a24e Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Thu Oct 31 15:36:52 2024 +0200 RDMA/mlx5: Ensure active slave attachment to the bond IB device Fix a race condition when creating a lag bond in active backup mode where after the bond creation the backup slave was attached to the IB device, instead of the active slave. This caused stale entries in the GID table, as the gid updating mechanism relies on ib_device_get_netdev(), which would return the backup slave. Send an MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE event when activating the lag, additionally to when modifying the lag. This ensures that eventually the active netdevice is stored in the bond IB device. When handling this event remove the GIDs of the previously attached netdevice in this port and rescan the GIDs of the newly attached netdevice. This ensures that eventually the active slave netdevice is correctly stored in the IB device port. While there might be a brief moment where the backup slave GIDs appear in the GID table, it will eventually stabilize with the correct GIDs (of the bond and the active slave). Fixes: 8d159eb2117b ("RDMA/mlx5: Use IB set_netdev and get_netdev functions") Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Link: https://patch.msgid.link/91fc2cb24f63add266a528c1c702668a80416d9f.1730381292.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:11 +00:00
out:
dev_put(old_ndev);
return notifier_from_errno(ret);
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
}
static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev)
{
dev->lag_events.notifier_call = lag_event;
blocking_notifier_chain_register(&dev->mdev->priv.lag_nh,
&dev->lag_events);
}
static void mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev)
{
blocking_notifier_chain_unregister(&dev->mdev->priv.lag_nh,
&dev->lag_events);
}
static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev,
MLX5_FLOW_NAMESPACE_LAG);
struct mlx5_flow_table *ft;
int err;
if (!ns || !mlx5_lag_is_active(mdev))
return 0;
err = mlx5_cmd_create_vport_lag(mdev);
if (err)
return err;
ft = mlx5_create_lag_demux_flow_table(ns, 0, 0);
if (IS_ERR(ft)) {
err = PTR_ERR(ft);
goto err_destroy_vport_lag;
}
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
mlx5e_lag_event_register(dev);
dev->flow_db->lag_demux_ft = ft;
dev->lag_ports = mlx5_lag_get_num_ports(mdev);
dev->lag_active = true;
return 0;
err_destroy_vport_lag:
mlx5_cmd_destroy_vport_lag(mdev);
return err;
IB/mlx5: Add port protocol stats Expose new counters using the get protocol stats callback. We expose the following counters: |------------------------------------------------------------------------| | Name | IB | EN | Description | |------------------------------------------------------------------------| |rx_write_requests | + | - | Number of received WRITE requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_read_requests | + | - | Number of received READ requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |rx_atomic_requests | + | - | Number of received ATOMIC requests for | | | | | the associated QP. | |------------------------------------------------------------------------| |out_of_buffer | + | + | Number of drops occurred due to lack | | | | | of WQE for the associated QPs/RQs. | |------------------------------------------------------------------------| |out_of_sequence | + | - | Number of errors in the packet | | | | | transport sequence number | |------------------------------------------------------------------------| |duplicate_request | + | + | Number of received duplicated packets. | | | | | A request that previously executed is | | | | | named duplicated. | |------------------------------------------------------------------------| |rnr_nak_retry_err | + | + | Number of received RNR NAC packets. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |packet_seq_err | + | + | Number of received NAK - sequence error| | | | | packets. The QP retry limit did not | | | | | exceed. | |------------------------------------------------------------------------| |implied_nak_err | + | + | Number of times the requester detected | | | | | an ACK with a PSN larger than expected | | | | | PSN for RDMA READ or ATOMIC response | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| |local_ack_timeout_err| + | - | Number of NO ACK responses from | | | | | responder within timer interval. | | | | | The QP retry limit did not exceed. | |------------------------------------------------------------------------| Counters are available if all of them are supported. Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-06-17 12:10:56 +00:00
}
static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
if (dev->lag_active) {
dev->lag_active = false;
RDMA/mlx5: Use IB set_netdev and get_netdev functions JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 8d159eb2117b2e3697a31785662b653938f007cb Author: Chiara Meiohas <cmeiohas@nvidia.com> Date: Mon Sep 9 20:30:23 2024 +0300 RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com> Signed-off-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:09 +00:00
mlx5e_lag_event_unregister(dev);
mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft);
dev->flow_db->lag_demux_ft = NULL;
mlx5_cmd_destroy_vport_lag(mdev);
}
}
static void mlx5_netdev_notifier_register(struct mlx5_roce *roce,
struct net_device *netdev)
{
int err;
if (roce->tracking_netdev)
return;
roce->tracking_netdev = netdev;
roce->nb.notifier_call = mlx5_netdev_event;
err = register_netdevice_notifier_dev_net(netdev, &roce->nb, &roce->nn);
WARN_ON(err);
}
static void mlx5_netdev_notifier_unregister(struct mlx5_roce *roce)
{
if (!roce->tracking_netdev)
return;
unregister_netdevice_notifier_dev_net(roce->tracking_netdev, &roce->nb,
&roce->nn);
roce->tracking_netdev = NULL;
}
static int mlx5e_mdev_notifier_event(struct notifier_block *nb,
unsigned long event, void *data)
{
struct mlx5_roce *roce = container_of(nb, struct mlx5_roce, mdev_nb);
struct net_device *netdev = data;
switch (event) {
case MLX5_DRIVER_EVENT_UPLINK_NETDEV:
if (netdev)
mlx5_netdev_notifier_register(roce, netdev);
else
mlx5_netdev_notifier_unregister(roce);
break;
default:
return NOTIFY_DONE;
}
return NOTIFY_OK;
}
static void mlx5_mdev_netdev_track(struct mlx5_ib_dev *dev, u32 port_num)
{
struct mlx5_roce *roce = &dev->port[port_num].roce;
roce->mdev_nb.notifier_call = mlx5e_mdev_notifier_event;
mlx5_blocking_notifier_register(dev->mdev, &roce->mdev_nb);
mlx5_core_uplink_netdev_event_replay(dev->mdev);
}
static void mlx5_mdev_netdev_untrack(struct mlx5_ib_dev *dev, u32 port_num)
{
struct mlx5_roce *roce = &dev->port[port_num].roce;
mlx5_blocking_notifier_unregister(dev->mdev, &roce->mdev_nb);
mlx5_netdev_notifier_unregister(roce);
}
static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
{
int err;
if (!dev->is_rep && dev->profile != &raw_eth_profile) {
err = mlx5_nic_vport_enable_roce(dev->mdev);
if (err)
return err;
}
err = mlx5_eth_lag_init(dev);
if (err)
goto err_disable_roce;
return 0;
err_disable_roce:
if (!dev->is_rep && dev->profile != &raw_eth_profile)
mlx5_nic_vport_disable_roce(dev->mdev);
return err;
}
static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
{
mlx5_eth_lag_cleanup(dev);
if (!dev->is_rep && dev->profile != &raw_eth_profile)
mlx5_nic_vport_disable_roce(dev->mdev);
}
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
static int mlx5_ib_rn_get_params(struct ib_device *device, u32 port_num,
enum rdma_netdev_t type,
struct rdma_netdev_alloc_params *params)
{
if (type != RDMA_NETDEV_IPOIB)
return -EOPNOTSUPP;
return mlx5_rdma_rn_get_params(to_mdev(device)->mdev, device, params);
}
static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf,
size_t count, loff_t *pos)
{
struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
char lbuf[20];
int len;
len = snprintf(lbuf, sizeof(lbuf), "%u\n", delay_drop->timeout);
return simple_read_from_buffer(buf, count, pos, lbuf, len);
}
static ssize_t delay_drop_timeout_write(struct file *filp, const char __user *buf,
size_t count, loff_t *pos)
{
struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
u32 timeout;
u32 var;
if (kstrtouint_from_user(buf, count, 0, &var))
return -EFAULT;
timeout = min_t(u32, roundup(var, 100), MLX5_MAX_DELAY_DROP_TIMEOUT_MS *
1000);
if (timeout != var)
mlx5_ib_dbg(delay_drop->dev, "Round delay drop timeout to %u usec\n",
timeout);
delay_drop->timeout = timeout;
return count;
}
static const struct file_operations fops_delay_drop_timeout = {
.owner = THIS_MODULE,
.open = simple_open,
.write = delay_drop_timeout_write,
.read = delay_drop_timeout_read,
};
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
struct mlx5_ib_multiport_info *mpi)
{
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
struct mlx5_ib_port *port = &ibdev->port[port_num];
int comps;
int err;
int i;
lockdep_assert_held(&mlx5_ib_multiport_mutex);
mlx5_core_mp_event_replay(ibdev->mdev,
MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
NULL);
mlx5_core_mp_event_replay(mpi->mdev,
MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
NULL);
mlx5_ib_cleanup_cong_debugfs(ibdev, port_num);
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
spin_lock(&port->mp.mpi_lock);
if (!mpi->ibdev) {
spin_unlock(&port->mp.mpi_lock);
return;
}
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
mpi->ibdev = NULL;
spin_unlock(&port->mp.mpi_lock);
RDMA/mlx5: Release locks during notifier unregister The below kernel panic was observed when created bond mode LACP with GRE tunnel on top. The reason to it was not released spinlock during mlx5 notify unregsiter sequence. [ 234.562007] BUG: scheduling while atomic: sh/10900/0x00000002 [ 234.563005] Preemption disabled at: [ 234.566864] ------------[ cut here ]------------ [ 234.567120] DEBUG_LOCKS_WARN_ON(val > preempt_count()) [ 234.567139] WARNING: CPU: 16 PID: 10900 at kernel/sched/core.c:3203 preempt_count_sub+0xca/0x170 [ 234.569550] CPU: 16 PID: 10900 Comm: sh Tainted: G W 5.2.0-rc1-for-linust-dbg-2019-05-25_04-57-33-60 #1 [ 234.569886] Hardware name: Dell Inc. PowerEdge R720/0X3D66, BIOS 2.6.1 02/12/2018 [ 234.570183] RIP: 0010:preempt_count_sub+0xca/0x170 [ 234.570404] Code: 03 38 d0 7c 08 84 d2 0f 85 b0 00 00 00 8b 15 dd 02 03 04 85 d2 75 ba 48 c7 c6 00 e1 88 83 48 c7 c7 40 e1 88 83 e8 76 11 f7 ff <0f> 0b 5b c3 65 8b 05 d3 1f d8 7e 84 c0 75 82 e8 62 c3 c3 00 85 c0 [ 234.570911] RSP: 0018:ffff888b94477b08 EFLAGS: 00010286 [ 234.571133] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000 [ 234.571391] RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000246 [ 234.571648] RBP: ffff888ba5560000 R08: fffffbfff08962d5 R09: fffffbfff08962d5 [ 234.571902] R10: 0000000000000001 R11: fffffbfff08962d4 R12: ffff888bac6e9548 [ 234.572157] R13: ffff888babfaf728 R14: ffff888bac6e9568 R15: ffff888babfaf750 [ 234.572412] FS: 00007fcafa59b740(0000) GS:ffff888bed200000(0000) knlGS:0000000000000000 [ 234.572686] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 234.572914] CR2: 00007f984f16b140 CR3: 0000000b2bf0a001 CR4: 00000000001606e0 [ 234.573172] Call Trace: [ 234.573336] _raw_spin_unlock+0x2e/0x50 [ 234.573542] mlx5_ib_unbind_slave_port+0x1bc/0x690 [mlx5_ib] [ 234.573793] mlx5_ib_cleanup_multiport_master+0x1d3/0x660 [mlx5_ib] [ 234.574039] mlx5_ib_stage_init_cleanup+0x4c/0x360 [mlx5_ib] [ 234.574271] ? kfree+0xf5/0x2f0 [ 234.574465] __mlx5_ib_remove+0x61/0xd0 [mlx5_ib] [ 234.574688] ? __mlx5_ib_remove+0xd0/0xd0 [mlx5_ib] [ 234.574951] mlx5_remove_device+0x234/0x300 [mlx5_core] [ 234.575224] mlx5_unregister_device+0x4d/0x1e0 [mlx5_core] [ 234.575493] remove_one+0x4f/0x160 [mlx5_core] [ 234.575704] pci_device_remove+0xef/0x2a0 [ 234.581407] ? pcibios_free_irq+0x10/0x10 [ 234.587143] ? up_read+0xc1/0x260 [ 234.592785] device_release_driver_internal+0x1ab/0x430 [ 234.598442] unbind_store+0x152/0x200 [ 234.604064] ? sysfs_kf_write+0x3b/0x180 [ 234.609441] ? sysfs_file_ops+0x160/0x160 [ 234.615021] kernfs_fop_write+0x277/0x440 [ 234.620288] ? __sb_start_write+0x1ef/0x2c0 [ 234.625512] vfs_write+0x15e/0x460 [ 234.630786] ksys_write+0x156/0x1e0 [ 234.635988] ? __ia32_sys_read+0xb0/0xb0 [ 234.641120] ? trace_hardirqs_off_thunk+0x1a/0x1c [ 234.646163] do_syscall_64+0x95/0x470 [ 234.651106] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 234.656004] RIP: 0033:0x7fcaf9c9cfd0 [ 234.660686] Code: 73 01 c3 48 8b 0d c0 6e 2d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d cd cf 2d 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ee cb 01 00 48 89 04 24 [ 234.670128] RSP: 002b:00007ffd3b01ddd8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 234.674811] RAX: ffffffffffffffda RBX: 000000000000000d RCX: 00007fcaf9c9cfd0 [ 234.679387] RDX: 000000000000000d RSI: 00007fcafa5c1000 RDI: 0000000000000001 [ 234.683848] RBP: 00007fcafa5c1000 R08: 000000000000000a R09: 00007fcafa59b740 [ 234.688167] R10: 00007ffd3b01d8e0 R11: 0000000000000246 R12: 00007fcaf9f75400 [ 234.692386] R13: 000000000000000d R14: 0000000000000001 R15: 0000000000000000 [ 234.696495] irq event stamp: 153067 [ 234.700525] hardirqs last enabled at (153067): [<ffffffff83258c39>] _raw_spin_unlock_irqrestore+0x59/0x70 [ 234.704665] hardirqs last disabled at (153066): [<ffffffff83259382>] _raw_spin_lock_irqsave+0x22/0x90 [ 234.708722] softirqs last enabled at (153058): [<ffffffff836006c5>] __do_softirq+0x6c5/0xb4e [ 234.712673] softirqs last disabled at (153051): [<ffffffff81227c1d>] irq_exit+0x17d/0x1d0 [ 234.716601] ---[ end trace 5dbf096843ee9ce6 ]--- Fixes: df097a278c75 ("IB/mlx5: Use the new mlx5 core notifier API") Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Link: https://lore.kernel.org/r/20190731083852.584-1-leon@kernel.org Signed-off-by: Doug Ledford <dledford@redhat.com>
2019-07-31 08:38:52 +00:00
if (mpi->mdev_events.notifier_call)
mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events);
mpi->mdev_events.notifier_call = NULL;
mlx5_mdev_netdev_untrack(ibdev, port_num);
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
spin_lock(&port->mp.mpi_lock);
comps = mpi->mdev_refcnt;
if (comps) {
mpi->unaffiliate = true;
init_completion(&mpi->unref_comp);
spin_unlock(&port->mp.mpi_lock);
for (i = 0; i < comps; i++)
wait_for_completion(&mpi->unref_comp);
spin_lock(&port->mp.mpi_lock);
mpi->unaffiliate = false;
}
port->mp.mpi = NULL;
spin_unlock(&port->mp.mpi_lock);
err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev);
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
mlx5_ib_dbg(ibdev, "unaffiliated port %u\n", port_num + 1);
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
/* Log an error, still needed to cleanup the pointers and add
* it back to the list.
*/
if (err)
mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n",
port_num + 1);
ibdev->port[port_num].roce.last_port_state = IB_PORT_DOWN;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
}
static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
struct mlx5_ib_multiport_info *mpi)
{
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
u64 key;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
int err;
lockdep_assert_held(&mlx5_ib_multiport_mutex);
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
spin_lock(&ibdev->port[port_num].mp.mpi_lock);
if (ibdev->port[port_num].mp.mpi) {
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
mlx5_ib_dbg(ibdev, "port %u already affiliated.\n",
port_num + 1);
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
return false;
}
ibdev->port[port_num].mp.mpi = mpi;
mpi->ibdev = ibdev;
mpi->mdev_events.notifier_call = NULL;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev);
if (err)
goto unbind;
mlx5_mdev_netdev_track(ibdev, port_num);
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port;
mlx5_notifier_register(mpi->mdev, &mpi->mdev_events);
mlx5_ib_init_cong_debugfs(ibdev, port_num);
key = mpi->mdev->priv.adev_idx;
mlx5_core_mp_event_replay(mpi->mdev,
MLX5_DRIVER_EVENT_AFFILIATION_DONE,
&key);
mlx5_core_mp_event_replay(ibdev->mdev,
MLX5_DRIVER_EVENT_AFFILIATION_DONE,
&key);
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
return true;
unbind:
mlx5_ib_unbind_slave_port(ibdev, mpi);
return false;
}
static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev)
{
char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1] = {};
int ret;
if (!MLX5_CAP_GEN(dev->mdev, data_direct) ||
!MLX5_CAP_GEN_2(dev->mdev, query_vuid))
return 0;
ret = mlx5_cmd_query_vuid(dev->mdev, true, vuid);
if (ret)
return ret;
ret = mlx5_ib_create_data_direct_resources(dev);
if (ret)
return ret;
RDMA/mlx5: Add support for DMABUF MR registrations with Data-direct JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit de8f847a5114ff7cfcdfc114af8485c431dec703 Author: Yishai Hadas <yishaih@nvidia.com> Date: Thu Aug 1 15:05:16 2024 +0300 RDMA/mlx5: Add support for DMABUF MR registrations with Data-direct Add support for DMABUF MR registrations with Data-direct device. Upon userspace calling to register a DMABUF MR with the data direct bit set, the below algorithm will be followed. 1) Obtain a pinned DMABUF umem from the IB core using the user input parameters (FD, offset, length) and the DMA PF device. The DMA PF device is needed to allow the IOMMU to enable the DMA PF to access the user buffer over PCI. 2) Create a KSM MKEY by setting its entries according to the user buffer VA to IOVA mapping, with the MKEY being the data direct device-crossed MKEY. This KSM MKEY is umrable and will be used as part of the MR cache. The PD for creating it is the internal device 'data direct' kernel one. 3) Create a crossing MKEY that points to the KSM MKEY using the crossing access mode. 4) Manage the KSM MKEY by adding it to a list of 'data direct' MKEYs managed on the mlx5_ib device. 5) Return the crossing MKEY to the user, created with its supplied PD. Upon DMA PF unbind flow, the driver will revoke the KSM entries. The final deregistration will occur under the hood once the application deregisters its MKEY. Notes: - This version supports only the PINNED UMEM mode, so there is no dependency on ODP. - The IOVA supplied by the application must be system page aligned due to HW translations of KSM. - The crossing MKEY will not be umrable or part of the MR cache, as we cannot change its crossed (i.e. KSM) MKEY over UMR. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://patch.msgid.link/1f99d8020ed540d9702b9e2252a145a439609ba6.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:08 +00:00
INIT_LIST_HEAD(&dev->data_direct_mr_list);
ret = mlx5_data_direct_ib_reg(dev, vuid);
if (ret)
mlx5_ib_free_data_direct_resources(dev);
return ret;
}
static void mlx5_ib_data_direct_cleanup(struct mlx5_ib_dev *dev)
{
if (!MLX5_CAP_GEN(dev->mdev, data_direct) ||
!MLX5_CAP_GEN_2(dev->mdev, query_vuid))
return;
mlx5_data_direct_ib_unreg(dev);
mlx5_ib_free_data_direct_resources(dev);
}
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
{
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
port_num + 1);
struct mlx5_ib_multiport_info *mpi;
int err;
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 i;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
return 0;
err = mlx5_query_nic_vport_system_image_guid(dev->mdev,
&dev->sys_image_guid);
if (err)
return err;
err = mlx5_nic_vport_enable_roce(dev->mdev);
if (err)
return err;
mutex_lock(&mlx5_ib_multiport_mutex);
for (i = 0; i < dev->num_ports; i++) {
bool bound = false;
/* build a stub multiport info struct for the native port. */
if (i == port_num) {
mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
if (!mpi) {
mutex_unlock(&mlx5_ib_multiport_mutex);
mlx5_nic_vport_disable_roce(dev->mdev);
return -ENOMEM;
}
mpi->is_master = true;
mpi->mdev = dev->mdev;
mpi->sys_image_guid = dev->sys_image_guid;
dev->port[i].mp.mpi = mpi;
mpi->ibdev = dev;
mpi = NULL;
continue;
}
list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list,
list) {
if (dev->sys_image_guid == mpi->sys_image_guid &&
(mlx5_core_native_port_num(mpi->mdev) - 1) == i) {
bound = mlx5_ib_bind_slave_port(dev, mpi);
}
if (bound) {
dev_dbg(mpi->mdev->device,
"removing port from unaffiliated list.\n");
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
mlx5_ib_dbg(dev, "port %d bound\n", i + 1);
list_del(&mpi->list);
break;
}
}
if (!bound)
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
mlx5_ib_dbg(dev, "no free port found for port %d\n",
i + 1);
}
list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list);
mutex_unlock(&mlx5_ib_multiport_mutex);
return err;
}
static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev)
{
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
port_num + 1);
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 i;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
return;
mutex_lock(&mlx5_ib_multiport_mutex);
for (i = 0; i < dev->num_ports; i++) {
if (dev->port[i].mp.mpi) {
/* Destroy the native port stub */
if (i == port_num) {
kfree(dev->port[i].mp.mpi);
dev->port[i].mp.mpi = NULL;
} else {
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
mlx5_ib_dbg(dev, "unbinding port_num: %u\n",
i + 1);
list_add_tail(&dev->port[i].mp.mpi->list,
&mlx5_ib_unaffiliated_port_list);
RDMA/mlx5: Don't access NULL-cleared mpi pointer The "dev->port[i].mp.mpi" is set to NULL during mlx5_ib_unbind_slave_port() execution, however that field is needed to add device to unaffiliated list. Such flow causes to the following kernel panic while unloading mlx5_ib module in multi-port mode, hence the device should be added to the list prior to unbind call. RPC: Unregistered rdma transport module. RPC: Unregistered rdma backchannel transport module. BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] SMP NOPTI CPU: 4 PID: 1904 Comm: modprobe Not tainted 5.13.0-rc7_for_upstream_min_debug_2021_06_24_12_08 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:mlx5_ib_cleanup_multiport_master+0x18b/0x2d0 [mlx5_ib] Code: 00 04 0f 85 c4 00 00 00 48 89 df e8 ef fa ff ff 48 8b 83 40 0d 00 00 48 8b 15 b9 e8 05 00 4a 8b 44 28 20 48 89 05 ad e8 05 00 <48> c7 00 d0 57 c5 a0 48 89 50 08 48 89 02 39 ab 88 0a 00 00 0f 86 RSP: 0018:ffff888116ee3df8 EFLAGS: 00010296 RAX: 0000000000000000 RBX: ffff8881154f6000 RCX: 0000000000000080 RDX: ffffffffa0c557d0 RSI: ffff88810b69d200 RDI: 000000000002d8a0 RBP: 0000000000000002 R08: ffff888110780408 R09: 0000000000000000 R10: ffff88812452e1c0 R11: fffffffffff7e028 R12: 0000000000000000 R13: 0000000000000080 R14: ffff888102c58000 R15: 0000000000000000 FS: 00007f884393a740(0000) GS:ffff8882f5a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000001249f6004 CR4: 0000000000370ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mlx5_ib_stage_init_cleanup+0x16/0xd0 [mlx5_ib] __mlx5_ib_remove+0x33/0x90 [mlx5_ib] mlx5r_remove+0x22/0x30 [mlx5_ib] auxiliary_bus_remove+0x18/0x30 __device_release_driver+0x177/0x220 driver_detach+0xc4/0x100 bus_remove_driver+0x58/0xd0 auxiliary_driver_unregister+0x12/0x20 mlx5_ib_cleanup+0x13/0x897 [mlx5_ib] __x64_sys_delete_module+0x154/0x230 ? exit_to_user_mode_prepare+0x104/0x140 do_syscall_64+0x3f/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f8842e095c7 Code: 73 01 c3 48 8b 0d d9 48 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 b8 b0 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d a9 48 2c 00 f7 d8 64 89 01 48 RSP: 002b:00007ffc68f6e758 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 00005638207929c0 RCX: 00007f8842e095c7 RDX: 0000000000000000 RSI: 0000000000000800 RDI: 0000563820792a28 RBP: 00005638207929c0 R08: 00007ffc68f6d701 R09: 0000000000000000 R10: 00007f8842e82880 R11: 0000000000000206 R12: 0000563820792a28 R13: 0000000000000001 R14: 0000563820792a28 R15: 00007ffc68f6fb40 Modules linked in: xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 br_netfilter overlay rdma_ucm ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_ipoib ib_cm ib_umad mlx5_ib(-) mlx4_ib ib_uverbs ib_core mlx4_en mlx4_core mlx5_core ptp pps_core [last unloaded: rpcrdma] CR2: 0000000000000000 ---[ end trace a0bb7e20804e9e9b ]--- Fixes: 7ce6095e3bff ("RDMA/mlx5: Don't add slave port to unaffiliated list") Link: https://lore.kernel.org/r/899ac1b33a995be5ec0e16a4765c4e43c2b1ba5b.1624956444.git.leonro@nvidia.com Reviewed-by: Itay Aveksis <itayav@nvidia.com> Reviewed-by: Maor Gottlieb <maorg@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-06-29 08:51:38 +00:00
mlx5_ib_unbind_slave_port(dev,
dev->port[i].mp.mpi);
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
}
}
}
mlx5_ib_dbg(dev, "removing from devlist\n");
list_del(&dev->ib_dev_list);
mutex_unlock(&mlx5_ib_multiport_mutex);
mlx5_nic_vport_disable_roce(dev->mdev);
}
static int mmap_obj_cleanup(struct ib_uobject *uobject,
enum rdma_remove_reason why,
struct uverbs_attr_bundle *attrs)
{
struct mlx5_user_mmap_entry *obj = uobject->object;
rdma_user_mmap_entry_remove(&obj->rdma_entry);
return 0;
}
static int mlx5_rdma_user_mmap_entry_insert(struct mlx5_ib_ucontext *c,
struct mlx5_user_mmap_entry *entry,
size_t length)
{
return rdma_user_mmap_entry_insert_range(
&c->ibucontext, &entry->rdma_entry, length,
(MLX5_IB_MMAP_OFFSET_START << 16),
((MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1));
}
static struct mlx5_user_mmap_entry *
alloc_var_entry(struct mlx5_ib_ucontext *c)
{
struct mlx5_user_mmap_entry *entry;
struct mlx5_var_table *var_table;
u32 page_idx;
int err;
var_table = &to_mdev(c->ibucontext.device)->var_table;
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return ERR_PTR(-ENOMEM);
mutex_lock(&var_table->bitmap_lock);
page_idx = find_first_zero_bit(var_table->bitmap,
var_table->num_var_hw_entries);
if (page_idx >= var_table->num_var_hw_entries) {
err = -ENOSPC;
mutex_unlock(&var_table->bitmap_lock);
goto end;
}
set_bit(page_idx, var_table->bitmap);
mutex_unlock(&var_table->bitmap_lock);
entry->address = var_table->hw_start_addr +
(page_idx * var_table->stride_size);
entry->page_idx = page_idx;
entry->mmap_flag = MLX5_IB_MMAP_TYPE_VAR;
err = mlx5_rdma_user_mmap_entry_insert(c, entry,
var_table->stride_size);
if (err)
goto err_insert;
return entry;
err_insert:
mutex_lock(&var_table->bitmap_lock);
clear_bit(page_idx, var_table->bitmap);
mutex_unlock(&var_table->bitmap_lock);
end:
kfree(entry);
return ERR_PTR(err);
}
static int UVERBS_HANDLER(MLX5_IB_METHOD_VAR_OBJ_ALLOC)(
struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj = uverbs_attr_get_uobject(
attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE);
struct mlx5_ib_ucontext *c;
struct mlx5_user_mmap_entry *entry;
u64 mmap_offset;
u32 length;
int err;
c = to_mucontext(ib_uverbs_get_ucontext(attrs));
if (IS_ERR(c))
return PTR_ERR(c);
entry = alloc_var_entry(c);
if (IS_ERR(entry))
return PTR_ERR(entry);
mmap_offset = mlx5_entry_to_mmap_offset(entry);
length = entry->rdma_entry.npages * PAGE_SIZE;
uobj->object = entry;
uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE);
err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
&mmap_offset, sizeof(mmap_offset));
if (err)
return err;
err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
&entry->page_idx, sizeof(entry->page_idx));
if (err)
return err;
err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
&length, sizeof(length));
return err;
}
DECLARE_UVERBS_NAMED_METHOD(
MLX5_IB_METHOD_VAR_OBJ_ALLOC,
UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE,
MLX5_IB_OBJECT_VAR,
UVERBS_ACCESS_NEW,
UA_MANDATORY),
UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
UVERBS_ATTR_TYPE(u32),
UA_MANDATORY),
UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
UVERBS_ATTR_TYPE(u32),
UA_MANDATORY),
UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
UVERBS_ATTR_TYPE(u64),
UA_MANDATORY));
DECLARE_UVERBS_NAMED_METHOD_DESTROY(
MLX5_IB_METHOD_VAR_OBJ_DESTROY,
UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_DESTROY_HANDLE,
MLX5_IB_OBJECT_VAR,
UVERBS_ACCESS_DESTROY,
UA_MANDATORY));
DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_VAR,
UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
&UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_ALLOC),
&UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_DESTROY));
static bool var_is_supported(struct ib_device *device)
{
struct mlx5_ib_dev *dev = to_mdev(device);
return (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q);
}
static struct mlx5_user_mmap_entry *
alloc_uar_entry(struct mlx5_ib_ucontext *c,
enum mlx5_ib_uapi_uar_alloc_type alloc_type)
{
struct mlx5_user_mmap_entry *entry;
struct mlx5_ib_dev *dev;
u32 uar_index;
int err;
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return ERR_PTR(-ENOMEM);
dev = to_mdev(c->ibucontext.device);
err = mlx5_cmd_uar_alloc(dev->mdev, &uar_index, c->devx_uid);
if (err)
goto end;
entry->page_idx = uar_index;
entry->address = uar_index2paddress(dev, uar_index);
if (alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_WC;
else
entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_NC;
err = mlx5_rdma_user_mmap_entry_insert(c, entry, PAGE_SIZE);
if (err)
goto err_insert;
return entry;
err_insert:
mlx5_cmd_uar_dealloc(dev->mdev, uar_index, c->devx_uid);
end:
kfree(entry);
return ERR_PTR(err);
}
static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)(
struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj = uverbs_attr_get_uobject(
attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE);
enum mlx5_ib_uapi_uar_alloc_type alloc_type;
struct mlx5_ib_ucontext *c;
struct mlx5_user_mmap_entry *entry;
u64 mmap_offset;
u32 length;
int err;
c = to_mucontext(ib_uverbs_get_ucontext(attrs));
if (IS_ERR(c))
return PTR_ERR(c);
err = uverbs_get_const(&alloc_type, attrs,
MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE);
if (err)
return err;
if (alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF &&
alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC)
return -EOPNOTSUPP;
net/mlx5: Reimplement write combining test JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.11-rc1 commit d98995b4bf981519dde4af0a081c393d62474039 Author: Jianbo Liu <jianbol@nvidia.com> Date: Mon Jun 3 13:26:37 2024 +0300 net/mlx5: Reimplement write combining test The test of write combining was added before in mlx5_ib driver. It opens UD QP and posts NOP WQEs, and uses BlueFlame doorbell. When BlueFlame is used, WQEs get written directly to a PCI BAR of the device (in addition to memory) so that the device handles them without having to access memory. In this test, the WQEs written in memory are different from the ones written to the BlueFlame which request CQE update. By checking the completion reports posted on CQ, we can know if BlueFlame succeeds or not. The write combining must be supported if BlueFlame succeeds as its register is written using write combining. This patch reimplements the test in the same way, but using a pair of SQ and CQ only. It is moved to mlx5_core as a general feature used by both mlx5_core and mlx5_ib. Besides, save write combine test result of the PCI function, so that its thousands of child functions such as SF can query without paying the time and resource penalty by itself. The test function is called only after failing to get the cached result. With this enhancement, all thousands of SFs of the PF attached to same driver no longer need to perform WC check explicitly, which is already done in the system. This saves several commands per SF, thereby speeds up SF creation and also saves completion EQ creation. Signed-off-by: Jianbo Liu <jianbol@nvidia.com> Reviewed-by: Tariq Toukan <tariqt@nvidia.com> Link: https://lore.kernel.org/r/4ff5a8cc4c5b5b0d98397baa45a5019bcdbf096e.1717409369.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:03 +00:00
if (!mlx5_wc_support_get(to_mdev(c->ibucontext.device)->mdev) &&
alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
return -EOPNOTSUPP;
entry = alloc_uar_entry(c, alloc_type);
if (IS_ERR(entry))
return PTR_ERR(entry);
mmap_offset = mlx5_entry_to_mmap_offset(entry);
length = entry->rdma_entry.npages * PAGE_SIZE;
uobj->object = entry;
uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE);
err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
&mmap_offset, sizeof(mmap_offset));
if (err)
return err;
err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
&entry->page_idx, sizeof(entry->page_idx));
if (err)
return err;
err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
&length, sizeof(length));
return err;
}
DECLARE_UVERBS_NAMED_METHOD(
MLX5_IB_METHOD_UAR_OBJ_ALLOC,
UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE,
MLX5_IB_OBJECT_UAR,
UVERBS_ACCESS_NEW,
UA_MANDATORY),
UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE,
enum mlx5_ib_uapi_uar_alloc_type,
UA_MANDATORY),
UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
UVERBS_ATTR_TYPE(u32),
UA_MANDATORY),
UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
UVERBS_ATTR_TYPE(u32),
UA_MANDATORY),
UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
UVERBS_ATTR_TYPE(u64),
UA_MANDATORY));
DECLARE_UVERBS_NAMED_METHOD_DESTROY(
MLX5_IB_METHOD_UAR_OBJ_DESTROY,
UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_DESTROY_HANDLE,
MLX5_IB_OBJECT_UAR,
UVERBS_ACCESS_DESTROY,
UA_MANDATORY));
DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_UAR,
UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
&UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_ALLOC),
&UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_DESTROY));
ADD_UVERBS_ATTRIBUTES_SIMPLE(
mlx5_ib_query_context,
UVERBS_OBJECT_DEVICE,
UVERBS_METHOD_QUERY_CONTEXT,
UVERBS_ATTR_PTR_OUT(
MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX,
UVERBS_ATTR_STRUCT(struct mlx5_ib_alloc_ucontext_resp,
dump_fill_mkey),
UA_MANDATORY));
RDMA/mlx5: Add support for DMABUF MR registrations with Data-direct JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit de8f847a5114ff7cfcdfc114af8485c431dec703 Author: Yishai Hadas <yishaih@nvidia.com> Date: Thu Aug 1 15:05:16 2024 +0300 RDMA/mlx5: Add support for DMABUF MR registrations with Data-direct Add support for DMABUF MR registrations with Data-direct device. Upon userspace calling to register a DMABUF MR with the data direct bit set, the below algorithm will be followed. 1) Obtain a pinned DMABUF umem from the IB core using the user input parameters (FD, offset, length) and the DMA PF device. The DMA PF device is needed to allow the IOMMU to enable the DMA PF to access the user buffer over PCI. 2) Create a KSM MKEY by setting its entries according to the user buffer VA to IOVA mapping, with the MKEY being the data direct device-crossed MKEY. This KSM MKEY is umrable and will be used as part of the MR cache. The PD for creating it is the internal device 'data direct' kernel one. 3) Create a crossing MKEY that points to the KSM MKEY using the crossing access mode. 4) Manage the KSM MKEY by adding it to a list of 'data direct' MKEYs managed on the mlx5_ib device. 5) Return the crossing MKEY to the user, created with its supplied PD. Upon DMA PF unbind flow, the driver will revoke the KSM entries. The final deregistration will occur under the hood once the application deregisters its MKEY. Notes: - This version supports only the PINNED UMEM mode, so there is no dependency on ODP. - The IOVA supplied by the application must be system page aligned due to HW translations of KSM. - The crossing MKEY will not be umrable or part of the MR cache, as we cannot change its crossed (i.e. KSM) MKEY over UMR. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://patch.msgid.link/1f99d8020ed540d9702b9e2252a145a439609ba6.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:08 +00:00
ADD_UVERBS_ATTRIBUTES_SIMPLE(
mlx5_ib_reg_dmabuf_mr,
UVERBS_OBJECT_MR,
UVERBS_METHOD_REG_DMABUF_MR,
UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
enum mlx5_ib_uapi_reg_dmabuf_flags,
UA_OPTIONAL));
static const struct uapi_definition mlx5_ib_defs[] = {
UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
UAPI_DEF_CHAIN(mlx5_ib_qos_defs),
UAPI_DEF_CHAIN(mlx5_ib_std_types_defs),
UAPI_DEF_CHAIN(mlx5_ib_dm_defs),
UAPI_DEF_CHAIN(mlx5_ib_create_cq_defs),
UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context),
RDMA/mlx5: Add support for DMABUF MR registrations with Data-direct JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit de8f847a5114ff7cfcdfc114af8485c431dec703 Author: Yishai Hadas <yishaih@nvidia.com> Date: Thu Aug 1 15:05:16 2024 +0300 RDMA/mlx5: Add support for DMABUF MR registrations with Data-direct Add support for DMABUF MR registrations with Data-direct device. Upon userspace calling to register a DMABUF MR with the data direct bit set, the below algorithm will be followed. 1) Obtain a pinned DMABUF umem from the IB core using the user input parameters (FD, offset, length) and the DMA PF device. The DMA PF device is needed to allow the IOMMU to enable the DMA PF to access the user buffer over PCI. 2) Create a KSM MKEY by setting its entries according to the user buffer VA to IOVA mapping, with the MKEY being the data direct device-crossed MKEY. This KSM MKEY is umrable and will be used as part of the MR cache. The PD for creating it is the internal device 'data direct' kernel one. 3) Create a crossing MKEY that points to the KSM MKEY using the crossing access mode. 4) Manage the KSM MKEY by adding it to a list of 'data direct' MKEYs managed on the mlx5_ib device. 5) Return the crossing MKEY to the user, created with its supplied PD. Upon DMA PF unbind flow, the driver will revoke the KSM entries. The final deregistration will occur under the hood once the application deregisters its MKEY. Notes: - This version supports only the PINNED UMEM mode, so there is no dependency on ODP. - The IOVA supplied by the application must be system page aligned due to HW translations of KSM. - The crossing MKEY will not be umrable or part of the MR cache, as we cannot change its crossed (i.e. KSM) MKEY over UMR. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://patch.msgid.link/1f99d8020ed540d9702b9e2252a145a439609ba6.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:08 +00:00
UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, &mlx5_ib_reg_dmabuf_mr),
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR),
{}
};
static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
{
mlx5_ib_data_direct_cleanup(dev);
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
mlx5_ib_cleanup_multiport_master(dev);
WARN_ON(!xa_empty(&dev->odp_mkeys));
mutex_destroy(&dev->cap_mask_mutex);
WARN_ON(!xa_empty(&dev->sig_mrs));
WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
mlx5r_macsec_dealloc_gids(dev);
}
static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
int err, i;
dev->ib_dev.node_type = RDMA_NODE_IB_CA;
dev->ib_dev.local_dma_lkey = 0 /* not supported for now */;
dev->ib_dev.dev.parent = mdev->device;
dev->ib_dev.lag_flags = RDMA_LAG_FLAGS_HASH_ALL_SLAVES;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
for (i = 0; i < dev->num_ports; i++) {
spin_lock_init(&dev->port[i].mp.mpi_lock);
dev->port[i].roce.dev = dev;
dev->port[i].roce.native_port_num = i + 1;
dev->port[i].roce.last_port_state = IB_PORT_DOWN;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
}
err = mlx5r_cmd_query_special_mkeys(dev);
if (err)
return err;
err = mlx5r_macsec_init_gids_and_devlist(dev);
if (err)
return err;
err = mlx5_ib_init_multiport_master(dev);
if (err)
goto err;
err = set_has_smi_cap(dev);
if (err)
goto err_mp;
err = mlx5_query_max_pkeys(&dev->ib_dev, &dev->pkey_table_len);
if (err)
goto err_mp;
if (mlx5_use_mad_ifc(dev))
get_ext_port_caps(dev);
dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_max(mdev);
mutex_init(&dev->cap_mask_mutex);
RDMA/mlx5: Introduce the 'data direct' driver JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 6910e3660d86c1a5654f742a40181d2c9154f26f Author: Yishai Hadas <yishaih@nvidia.com> Date: Thu Aug 1 15:05:11 2024 +0300 RDMA/mlx5: Introduce the 'data direct' driver Introduce the 'data direct' driver for a ConnectX-8 Data Direct device. The 'data direct' driver functions as the affiliated DMA device for one or more capable mlx5_ib devices. This DMA device, as the name suggests, is used exclusively for DMA operations. It can be considered a DMA engine managed by a PF/VF, lacking network capabilities and having minimal overall capabilities. Consequently, the DMA NIC PF will not be exposed to or directly used by software applications. The driver will not have any direct interface or interaction with the firmware (no command interface, no capabilities, etc.). It will operate solely over PCI to enable its DMA functionality. Registration and un-registration of the driver are handled as part of the mlx5_ib initialization and exit processes, as the mlx5_ib devices will effectively be its clients. The driver will serve as the DMA device for accessing another PCI device to achieve optimal performance (both on the same NUMA node, P2P access, etc.). Upon probing, it will read its VUID over PCI to handle mlx5_ib device registrations with the same VUID. Upon removal, it will notify its clients to allow them to clean up the resources that were mmaped with its DMA device. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://patch.msgid.link/b77edecfd476c3f445da96ab6aef499ae47b2829.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:08 +00:00
mutex_init(&dev->data_direct_lock);
INIT_LIST_HEAD(&dev->qp_list);
spin_lock_init(&dev->reset_flow_resource_lock);
xa_init(&dev->odp_mkeys);
xa_init(&dev->sig_mrs);
atomic_set(&dev->mkey_var, 0);
spin_lock_init(&dev->dm.lock);
dev->dm.dev = mdev;
err = mlx5_ib_data_direct_init(dev);
if (err)
goto err_mp;
return 0;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
err_mp:
mlx5_ib_cleanup_multiport_master(dev);
err:
mlx5r_macsec_dealloc_gids(dev);
return err;
}
static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent,
enum rdma_nl_dev_type type,
const char *name);
static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev);
static const struct ib_device_ops mlx5_ib_dev_ops = {
.owner = THIS_MODULE,
.driver_id = RDMA_DRIVER_MLX5,
.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION,
.add_gid = mlx5_ib_add_gid,
.add_sub_dev = mlx5_ib_add_sub_dev,
.alloc_mr = mlx5_ib_alloc_mr,
.alloc_mr_integrity = mlx5_ib_alloc_mr_integrity,
.alloc_pd = mlx5_ib_alloc_pd,
.alloc_ucontext = mlx5_ib_alloc_ucontext,
.attach_mcast = mlx5_ib_mcg_attach,
.check_mr_status = mlx5_ib_check_mr_status,
.create_ah = mlx5_ib_create_ah,
.create_cq = mlx5_ib_create_cq,
.create_qp = mlx5_ib_create_qp,
.create_srq = mlx5_ib_create_srq,
.create_user_ah = mlx5_ib_create_ah,
.dealloc_pd = mlx5_ib_dealloc_pd,
.dealloc_ucontext = mlx5_ib_dealloc_ucontext,
.del_gid = mlx5_ib_del_gid,
.del_sub_dev = mlx5_ib_del_sub_dev,
.dereg_mr = mlx5_ib_dereg_mr,
.destroy_ah = mlx5_ib_destroy_ah,
.destroy_cq = mlx5_ib_destroy_cq,
.destroy_qp = mlx5_ib_destroy_qp,
.destroy_srq = mlx5_ib_destroy_srq,
.detach_mcast = mlx5_ib_mcg_detach,
.disassociate_ucontext = mlx5_ib_disassociate_ucontext,
.drain_rq = mlx5_ib_drain_rq,
.drain_sq = mlx5_ib_drain_sq,
.device_group = &mlx5_attr_group,
.get_dev_fw_str = get_dev_fw_str,
.get_dma_mr = mlx5_ib_get_dma_mr,
.get_link_layer = mlx5_ib_port_link_layer,
.map_mr_sg = mlx5_ib_map_mr_sg,
.map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
.mmap = mlx5_ib_mmap,
.mmap_free = mlx5_ib_mmap_free,
.modify_cq = mlx5_ib_modify_cq,
.modify_device = mlx5_ib_modify_device,
.modify_port = mlx5_ib_modify_port,
.modify_qp = mlx5_ib_modify_qp,
.modify_srq = mlx5_ib_modify_srq,
.poll_cq = mlx5_ib_poll_cq,
.post_recv = mlx5_ib_post_recv_nodrain,
.post_send = mlx5_ib_post_send_nodrain,
.post_srq_recv = mlx5_ib_post_srq_recv,
.process_mad = mlx5_ib_process_mad,
.query_ah = mlx5_ib_query_ah,
.query_device = mlx5_ib_query_device,
.query_gid = mlx5_ib_query_gid,
.query_pkey = mlx5_ib_query_pkey,
.query_qp = mlx5_ib_query_qp,
.query_srq = mlx5_ib_query_srq,
.query_ucontext = mlx5_ib_query_ucontext,
.reg_user_mr = mlx5_ib_reg_user_mr,
.reg_user_mr_dmabuf = mlx5_ib_reg_user_mr_dmabuf,
.req_notify_cq = mlx5_ib_arm_cq,
.rereg_user_mr = mlx5_ib_rereg_user_mr,
.resize_cq = mlx5_ib_resize_cq,
INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs),
INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq),
INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
INIT_RDMA_OBJ_SIZE(ib_qp, mlx5_ib_qp, ibqp),
INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq),
INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
};
static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = {
.rdma_netdev_get_params = mlx5_ib_rn_get_params,
};
static const struct ib_device_ops mlx5_ib_dev_sriov_ops = {
.get_vf_config = mlx5_ib_get_vf_config,
.get_vf_guid = mlx5_ib_get_vf_guid,
.get_vf_stats = mlx5_ib_get_vf_stats,
.set_vf_guid = mlx5_ib_set_vf_guid,
.set_vf_link_state = mlx5_ib_set_vf_link_state,
};
static const struct ib_device_ops mlx5_ib_dev_mw_ops = {
.alloc_mw = mlx5_ib_alloc_mw,
.dealloc_mw = mlx5_ib_dealloc_mw,
INIT_RDMA_OBJ_SIZE(ib_mw, mlx5_ib_mw, ibmw),
};
static const struct ib_device_ops mlx5_ib_dev_xrc_ops = {
.alloc_xrcd = mlx5_ib_alloc_xrcd,
.dealloc_xrcd = mlx5_ib_dealloc_xrcd,
INIT_RDMA_OBJ_SIZE(ib_xrcd, mlx5_ib_xrcd, ibxrcd),
};
static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
struct mlx5_var_table *var_table = &dev->var_table;
u8 log_doorbell_bar_size;
u8 log_doorbell_stride;
u64 bar_size;
log_doorbell_bar_size = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
log_doorbell_bar_size);
log_doorbell_stride = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
log_doorbell_stride);
var_table->hw_start_addr = dev->mdev->bar_addr +
MLX5_CAP64_DEV_VDPA_EMULATION(mdev,
doorbell_bar_offset);
bar_size = (1ULL << log_doorbell_bar_size) * 4096;
var_table->stride_size = 1ULL << log_doorbell_stride;
var_table->num_var_hw_entries = div_u64(bar_size,
var_table->stride_size);
mutex_init(&var_table->bitmap_lock);
var_table->bitmap = bitmap_zalloc(var_table->num_var_hw_entries,
GFP_KERNEL);
return (var_table->bitmap) ? 0 : -ENOMEM;
}
static void mlx5_ib_stage_caps_cleanup(struct mlx5_ib_dev *dev)
{
bitmap_free(dev->var_table.bitmap);
}
static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
int err;
if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
IS_ENABLED(CONFIG_MLX5_CORE_IPOIB))
ib_set_device_ops(&dev->ib_dev,
&mlx5_ib_dev_ipoib_enhanced_ops);
if (mlx5_core_is_pf(mdev))
ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_sriov_ops);
dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence));
if (MLX5_CAP_GEN(mdev, imaicl))
ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_mw_ops);
if (MLX5_CAP_GEN(mdev, xrc))
ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_xrc_ops);
if (MLX5_CAP_DEV_MEM(mdev, memic) ||
MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM)
ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops);
ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
dev->ib_dev.driver_def = mlx5_ib_defs;
err = init_node_data(dev);
if (err)
return err;
if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
(MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) ||
MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
mutex_init(&dev->lb.mutex);
if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) {
err = mlx5_ib_init_var_table(dev);
if (err)
return err;
}
dev->ib_dev.use_cq_dim = true;
return 0;
}
static const struct ib_device_ops mlx5_ib_dev_port_ops = {
.get_port_immutable = mlx5_port_immutable,
.query_port = mlx5_ib_query_port,
};
static int mlx5_ib_stage_non_default_cb(struct mlx5_ib_dev *dev)
{
ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_ops);
return 0;
}
static const struct ib_device_ops mlx5_ib_dev_port_rep_ops = {
.get_port_immutable = mlx5_port_rep_immutable,
.query_port = mlx5_ib_rep_query_port,
.query_pkey = mlx5_ib_rep_query_pkey,
};
static int mlx5_ib_stage_raw_eth_non_default_cb(struct mlx5_ib_dev *dev)
{
ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops);
return 0;
}
static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table,
.create_wq = mlx5_ib_create_wq,
.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table,
.destroy_wq = mlx5_ib_destroy_wq,
.modify_wq = mlx5_ib_modify_wq,
INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table,
ib_rwq_ind_tbl),
};
static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
enum rdma_link_layer ll;
int port_type_cap;
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 port_num = 0;
int err;
port_type_cap = MLX5_CAP_GEN(mdev, port_type);
ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
if (ll == IB_LINK_LAYER_ETHERNET) {
ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops);
port_num = mlx5_core_native_port_num(dev->mdev) - 1;
/* Register only for native ports */
mlx5_mdev_netdev_track(dev, port_num);
err = mlx5_enable_eth(dev);
if (err)
goto cleanup;
}
return 0;
cleanup:
mlx5_mdev_netdev_untrack(dev, port_num);
return err;
}
static void mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
enum rdma_link_layer ll;
int port_type_cap;
RDMA: Support more than 255 rdma ports Current code uses many different types when dealing with a port of a RDMA device: u8, unsigned int and u32. Switch to u32 to clean up the logic. This allows us to make (at least) the core view consistent and use the same type. Unfortunately not all places can be converted. Many uverbs functions expect port to be u8 so keep those places in order not to break UAPIs. HW/Spec defined values must also not be changed. With the switch to u32 we now can support devices with more than 255 ports. U32_MAX is reserved to make control logic a bit easier to deal with. As a device with U32_MAX ports probably isn't going to happen any time soon this seems like a non issue. When a device with more than 255 ports is created uverbs will report the RDMA device as having 255 ports as this is the max currently supported. The verbs interface is not changed yet because the IBTA spec limits the port size in too many places to be u8 and all applications that relies in verbs won't be able to cope with this change. At this stage, we are extending the interfaces that are using vendor channel solely Once the limitation is lifted mlx5 in switchdev mode will be able to have thousands of SFs created by the device. As the only instance of an RDMA device that reports more than 255 ports will be a representor device and it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other ULPs aren't effected by this change and their sysfs/interfaces that are exposes to userspace can remain unchanged. While here cleanup some alignment issues and remove unneeded sanity checks (mainly in rdmavt), Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-01 07:04:20 +00:00
u32 port_num;
port_type_cap = MLX5_CAP_GEN(mdev, port_type);
ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
if (ll == IB_LINK_LAYER_ETHERNET) {
mlx5_disable_eth(dev);
port_num = mlx5_core_native_port_num(dev->mdev) - 1;
mlx5_mdev_netdev_untrack(dev, port_num);
}
}
static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
{
mlx5_ib_init_cong_debugfs(dev,
mlx5_core_native_port_num(dev->mdev) - 1);
return 0;
}
static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
mlx5_ib_cleanup_cong_debugfs(dev,
mlx5_core_native_port_num(dev->mdev) - 1);
}
static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev)
{
dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
return PTR_ERR_OR_ZERO(dev->mdev->priv.uar);
}
static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev)
{
mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
}
static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
{
int err;
err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
if (err)
return err;
err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true);
if (err)
mlx5_free_bfreg(dev->mdev, &dev->bfreg);
return err;
}
static void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev)
{
mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
mlx5_free_bfreg(dev->mdev, &dev->bfreg);
}
static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
{
const char *name;
if (dev->sub_dev_name)
name = dev->sub_dev_name;
else if (!mlx5_lag_is_active(dev->mdev))
name = "mlx5_%d";
else
name = "mlx5_bond_%d";
return ib_register_device(&dev->ib_dev, name, &dev->mdev->pdev->dev);
}
static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
{
mlx5_mkey_cache_cleanup(dev);
mlx5r_umr_resource_cleanup(dev);
mlx5r_umr_cleanup(dev);
}
static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
{
IB/mlx5: Fix cleanup order on unload On load we create private CQ/QP/PD in order to be used by UMR, we create those resources after we register ourself as an IB device, and we destroy them after we unregister as an IB device. This was changed by commit 16c1975f1032 ("IB/mlx5: Create profile infrastructure to add and remove stages") which moved the destruction before we unregistration. This allowed to trigger an invalid memory access when unloading mlx5_ib while there are open resources: BUG: unable to handle kernel paging request at 00000001002c012c ... Call Trace: mlx5_ib_post_send_wait+0x75/0x110 [mlx5_ib] __slab_free+0x9a/0x2d0 delay_time_func+0x10/0x10 [mlx5_ib] unreg_umr.isra.15+0x4b/0x50 [mlx5_ib] mlx5_mr_cache_free+0x46/0x150 [mlx5_ib] clean_mr+0xc9/0x190 [mlx5_ib] dereg_mr+0xba/0xf0 [mlx5_ib] ib_dereg_mr+0x13/0x20 [ib_core] remove_commit_idr_uobject+0x16/0x70 [ib_uverbs] uverbs_cleanup_ucontext+0xe8/0x1a0 [ib_uverbs] ib_uverbs_cleanup_ucontext.isra.9+0x19/0x40 [ib_uverbs] ib_uverbs_remove_one+0x162/0x2e0 [ib_uverbs] ib_unregister_device+0xd4/0x190 [ib_core] __mlx5_ib_remove+0x2e/0x40 [mlx5_ib] mlx5_remove_device+0xf5/0x120 [mlx5_core] mlx5_unregister_interface+0x37/0x90 [mlx5_core] mlx5_ib_cleanup+0xc/0x225 [mlx5_ib] SyS_delete_module+0x153/0x230 do_syscall_64+0x62/0x110 entry_SYSCALL_64_after_hwframe+0x21/0x86 ... We restore the original behavior by breaking the UMR stage into two parts, pre and post IB registration stages, this way we can restore the original functionality and maintain clean separation of logic between stages. Fixes: 16c1975f1032 ("IB/mlx5: Create profile infrastructure to add and remove stages") Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-03-14 07:14:15 +00:00
ib_unregister_device(&dev->ib_dev);
}
static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
{
int ret;
ret = mlx5r_umr_init(dev);
if (ret)
return ret;
ret = mlx5_mkey_cache_init(dev);
IB/mlx5: Fix init stage error handling to avoid double free of same QP and UAF JIRA: https://issues.redhat.com/browse/RHEL-882 Upstream-status: v6.7-rc1 commit 2ef422f063b74adcc4a4a9004b0a87bb55e0a836 Author: George Kennedy <george.kennedy@oracle.com> Date: Tue Oct 24 13:01:58 2023 -0500 IB/mlx5: Fix init stage error handling to avoid double free of same QP and UAF In the unlikely event that workqueue allocation fails and returns NULL in mlx5_mkey_cache_init(), delete the call to mlx5r_umr_resource_cleanup() (which frees the QP) in mlx5_ib_stage_post_ib_reg_umr_init(). This will avoid attempted double free of the same QP when __mlx5_ib_add() does its cleanup. Resolves a splat: Syzkaller reported a UAF in ib_destroy_qp_user workqueue: Failed to create a rescuer kthread for wq "mkey_cache": -EINTR infiniband mlx5_0: mlx5_mkey_cache_init:981:(pid 1642): failed to create work queue infiniband mlx5_0: mlx5_ib_stage_post_ib_reg_umr_init:4075:(pid 1642): mr cache init failed -12 ================================================================== BUG: KASAN: slab-use-after-free in ib_destroy_qp_user (drivers/infiniband/core/verbs.c:2073) Read of size 8 at addr ffff88810da310a8 by task repro_upstream/1642 Call Trace: <TASK> kasan_report (mm/kasan/report.c:590) ib_destroy_qp_user (drivers/infiniband/core/verbs.c:2073) mlx5r_umr_resource_cleanup (drivers/infiniband/hw/mlx5/umr.c:198) __mlx5_ib_add (drivers/infiniband/hw/mlx5/main.c:4178) mlx5r_probe (drivers/infiniband/hw/mlx5/main.c:4402) ... </TASK> Allocated by task 1642: __kmalloc (./include/linux/kasan.h:198 mm/slab_common.c:1026 mm/slab_common.c:1039) create_qp (./include/linux/slab.h:603 ./include/linux/slab.h:720 ./include/rdma/ib_verbs.h:2795 drivers/infiniband/core/verbs.c:1209) ib_create_qp_kernel (drivers/infiniband/core/verbs.c:1347) mlx5r_umr_resource_init (drivers/infiniband/hw/mlx5/umr.c:164) mlx5_ib_stage_post_ib_reg_umr_init (drivers/infiniband/hw/mlx5/main.c:4070) __mlx5_ib_add (drivers/infiniband/hw/mlx5/main.c:4168) mlx5r_probe (drivers/infiniband/hw/mlx5/main.c:4402) ... Freed by task 1642: __kmem_cache_free (mm/slub.c:1826 mm/slub.c:3809 mm/slub.c:3822) ib_destroy_qp_user (drivers/infiniband/core/verbs.c:2112) mlx5r_umr_resource_cleanup (drivers/infiniband/hw/mlx5/umr.c:198) mlx5_ib_stage_post_ib_reg_umr_init (drivers/infiniband/hw/mlx5/main.c:4076 drivers/infiniband/hw/mlx5/main.c:4065) __mlx5_ib_add (drivers/infiniband/hw/mlx5/main.c:4168) mlx5r_probe (drivers/infiniband/hw/mlx5/main.c:4402) ... Fixes: 04876c12c19e ("RDMA/mlx5: Move init and cleanup of UMR to umr.c") Link: https://lore.kernel.org/r/1698170518-4006-1-git-send-email-george.kennedy@oracle.com Suggested-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: George Kennedy <george.kennedy@oracle.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Mohammad Kabat <mkabat@redhat.com>
2023-12-13 09:12:15 +00:00
if (ret)
mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
return ret;
}
static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev)
{
struct dentry *root;
if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
return 0;
mutex_init(&dev->delay_drop.lock);
dev->delay_drop.dev = dev;
dev->delay_drop.activate = false;
dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000;
INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler);
atomic_set(&dev->delay_drop.rqs_cnt, 0);
atomic_set(&dev->delay_drop.events_cnt, 0);
if (!mlx5_debugfs_root)
return 0;
root = debugfs_create_dir("delay_drop", mlx5_debugfs_get_dev_root(dev->mdev));
dev->delay_drop.dir_debugfs = root;
debugfs_create_atomic_t("num_timeout_events", 0400, root,
&dev->delay_drop.events_cnt);
debugfs_create_atomic_t("num_rqs", 0400, root,
&dev->delay_drop.rqs_cnt);
debugfs_create_file("timeout", 0600, root, &dev->delay_drop,
&fops_delay_drop_timeout);
return 0;
}
static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
{
if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
return;
cancel_work_sync(&dev->delay_drop.delay_drop_work);
if (!dev->delay_drop.dir_debugfs)
return;
debugfs_remove_recursive(dev->delay_drop.dir_debugfs);
dev->delay_drop.dir_debugfs = NULL;
}
static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
{
RDMA/mlx5: Move events notifier registration to be after device registration JIRA: https://issues.redhat.com/browse/RHEL-72349 CVE: CVE-2024-53224 commit ede132a5cf559f3ab35a4c28bac4f4a6c20334d8 Author: Patrisious Haddad <phaddad@nvidia.com> Date: Wed Nov 13 13:23:19 2024 +0200 RDMA/mlx5: Move events notifier registration to be after device registration Move pkey change work initialization and cleanup from device resources stage to notifier stage, since this is the stage which handles this work events. Fix a race between the device deregistration and pkey change work by moving MLX5_IB_STAGE_DEVICE_NOTIFIER to be after MLX5_IB_STAGE_IB_REG in order to ensure that the notifier is deregistered before the device during cleanup. Which ensures there are no works that are being executed after the device has already unregistered which can cause the panic below. BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 630071 Comm: kworker/1:2 Kdump: loaded Tainted: G W OE --------- --- 5.14.0-162.6.1.el9_1.x86_64 #1 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090008 02/27/2023 Workqueue: events pkey_change_handler [mlx5_ib] RIP: 0010:setup_qp+0x38/0x1f0 [mlx5_ib] Code: ee 41 54 45 31 e4 55 89 f5 53 48 89 fb 48 83 ec 20 8b 77 08 65 48 8b 04 25 28 00 00 00 48 89 44 24 18 48 8b 07 48 8d 4c 24 16 <4c> 8b 38 49 8b 87 80 0b 00 00 4c 89 ff 48 8b 80 08 05 00 00 8b 40 RSP: 0018:ffffbcc54068be20 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff954054494128 RCX: ffffbcc54068be36 RDX: ffff954004934000 RSI: 0000000000000001 RDI: ffff954054494128 RBP: 0000000000000023 R08: ffff954001be2c20 R09: 0000000000000001 R10: ffff954001be2c20 R11: ffff9540260133c0 R12: 0000000000000000 R13: 0000000000000023 R14: 0000000000000000 R15: ffff9540ffcb0905 FS: 0000000000000000(0000) GS:ffff9540ffc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000010625c001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mlx5_ib_gsi_pkey_change+0x20/0x40 [mlx5_ib] process_one_work+0x1e8/0x3c0 worker_thread+0x50/0x3b0 ? rescuer_thread+0x380/0x380 kthread+0x149/0x170 ? set_kthread_struct+0x50/0x50 ret_from_fork+0x22/0x30 Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) mlx5_fwctl(OE) fwctl(OE) ib_uverbs(OE) mlx5_core(OE) mlxdevm(OE) ib_core(OE) mlx_compat(OE) psample mlxfw(OE) tls knem(OE) netconsole nfsv3 nfs_acl nfs lockd grace fscache netfs qrtr rfkill sunrpc intel_rapl_msr intel_rapl_common rapl hv_balloon hv_utils i2c_piix4 pcspkr joydev fuse ext4 mbcache jbd2 sr_mod sd_mod cdrom t10_pi sg ata_generic pci_hyperv pci_hyperv_intf hyperv_drm drm_shmem_helper drm_kms_helper hv_storvsc syscopyarea hv_netvsc sysfillrect sysimgblt hid_hyperv fb_sys_fops scsi_transport_fc hyperv_keyboard drm ata_piix crct10dif_pclmul crc32_pclmul crc32c_intel libata ghash_clmulni_intel hv_vmbus serio_raw [last unloaded: ib_core] CR2: 0000000000000000 ---[ end trace f6f8be4eae12f7bc ]--- Fixes: 7722f47e71e5 ("IB/mlx5: Create GSI transmission QPs when P_Key table is changed") Signed-off-by: Patrisious Haddad <phaddad@nvidia.com> Reviewed-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/d271ceeff0c08431b3cbbbb3e2d416f09b6d1621.1731496944.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: CKI Backport Bot <cki-ci-bot+cki-gitlab-backport-bot@redhat.com>
2025-01-13 23:46:40 +00:00
struct mlx5_ib_resources *devr = &dev->devr;
int port;
for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
INIT_WORK(&devr->ports[port].pkey_change_work,
pkey_change_handler);
dev->mdev_events.notifier_call = mlx5_ib_event;
mlx5_notifier_register(dev->mdev, &dev->mdev_events);
mlx5r_macsec_event_register(dev);
return 0;
}
static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev)
{
RDMA/mlx5: Move events notifier registration to be after device registration JIRA: https://issues.redhat.com/browse/RHEL-72349 CVE: CVE-2024-53224 commit ede132a5cf559f3ab35a4c28bac4f4a6c20334d8 Author: Patrisious Haddad <phaddad@nvidia.com> Date: Wed Nov 13 13:23:19 2024 +0200 RDMA/mlx5: Move events notifier registration to be after device registration Move pkey change work initialization and cleanup from device resources stage to notifier stage, since this is the stage which handles this work events. Fix a race between the device deregistration and pkey change work by moving MLX5_IB_STAGE_DEVICE_NOTIFIER to be after MLX5_IB_STAGE_IB_REG in order to ensure that the notifier is deregistered before the device during cleanup. Which ensures there are no works that are being executed after the device has already unregistered which can cause the panic below. BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 630071 Comm: kworker/1:2 Kdump: loaded Tainted: G W OE --------- --- 5.14.0-162.6.1.el9_1.x86_64 #1 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090008 02/27/2023 Workqueue: events pkey_change_handler [mlx5_ib] RIP: 0010:setup_qp+0x38/0x1f0 [mlx5_ib] Code: ee 41 54 45 31 e4 55 89 f5 53 48 89 fb 48 83 ec 20 8b 77 08 65 48 8b 04 25 28 00 00 00 48 89 44 24 18 48 8b 07 48 8d 4c 24 16 <4c> 8b 38 49 8b 87 80 0b 00 00 4c 89 ff 48 8b 80 08 05 00 00 8b 40 RSP: 0018:ffffbcc54068be20 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff954054494128 RCX: ffffbcc54068be36 RDX: ffff954004934000 RSI: 0000000000000001 RDI: ffff954054494128 RBP: 0000000000000023 R08: ffff954001be2c20 R09: 0000000000000001 R10: ffff954001be2c20 R11: ffff9540260133c0 R12: 0000000000000000 R13: 0000000000000023 R14: 0000000000000000 R15: ffff9540ffcb0905 FS: 0000000000000000(0000) GS:ffff9540ffc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000010625c001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mlx5_ib_gsi_pkey_change+0x20/0x40 [mlx5_ib] process_one_work+0x1e8/0x3c0 worker_thread+0x50/0x3b0 ? rescuer_thread+0x380/0x380 kthread+0x149/0x170 ? set_kthread_struct+0x50/0x50 ret_from_fork+0x22/0x30 Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) mlx5_fwctl(OE) fwctl(OE) ib_uverbs(OE) mlx5_core(OE) mlxdevm(OE) ib_core(OE) mlx_compat(OE) psample mlxfw(OE) tls knem(OE) netconsole nfsv3 nfs_acl nfs lockd grace fscache netfs qrtr rfkill sunrpc intel_rapl_msr intel_rapl_common rapl hv_balloon hv_utils i2c_piix4 pcspkr joydev fuse ext4 mbcache jbd2 sr_mod sd_mod cdrom t10_pi sg ata_generic pci_hyperv pci_hyperv_intf hyperv_drm drm_shmem_helper drm_kms_helper hv_storvsc syscopyarea hv_netvsc sysfillrect sysimgblt hid_hyperv fb_sys_fops scsi_transport_fc hyperv_keyboard drm ata_piix crct10dif_pclmul crc32_pclmul crc32c_intel libata ghash_clmulni_intel hv_vmbus serio_raw [last unloaded: ib_core] CR2: 0000000000000000 ---[ end trace f6f8be4eae12f7bc ]--- Fixes: 7722f47e71e5 ("IB/mlx5: Create GSI transmission QPs when P_Key table is changed") Signed-off-by: Patrisious Haddad <phaddad@nvidia.com> Reviewed-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/d271ceeff0c08431b3cbbbb3e2d416f09b6d1621.1731496944.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: CKI Backport Bot <cki-ci-bot+cki-gitlab-backport-bot@redhat.com>
2025-01-13 23:46:40 +00:00
struct mlx5_ib_resources *devr = &dev->devr;
int port;
mlx5r_macsec_event_unregister(dev);
mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
RDMA/mlx5: Move events notifier registration to be after device registration JIRA: https://issues.redhat.com/browse/RHEL-72349 CVE: CVE-2024-53224 commit ede132a5cf559f3ab35a4c28bac4f4a6c20334d8 Author: Patrisious Haddad <phaddad@nvidia.com> Date: Wed Nov 13 13:23:19 2024 +0200 RDMA/mlx5: Move events notifier registration to be after device registration Move pkey change work initialization and cleanup from device resources stage to notifier stage, since this is the stage which handles this work events. Fix a race between the device deregistration and pkey change work by moving MLX5_IB_STAGE_DEVICE_NOTIFIER to be after MLX5_IB_STAGE_IB_REG in order to ensure that the notifier is deregistered before the device during cleanup. Which ensures there are no works that are being executed after the device has already unregistered which can cause the panic below. BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 630071 Comm: kworker/1:2 Kdump: loaded Tainted: G W OE --------- --- 5.14.0-162.6.1.el9_1.x86_64 #1 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090008 02/27/2023 Workqueue: events pkey_change_handler [mlx5_ib] RIP: 0010:setup_qp+0x38/0x1f0 [mlx5_ib] Code: ee 41 54 45 31 e4 55 89 f5 53 48 89 fb 48 83 ec 20 8b 77 08 65 48 8b 04 25 28 00 00 00 48 89 44 24 18 48 8b 07 48 8d 4c 24 16 <4c> 8b 38 49 8b 87 80 0b 00 00 4c 89 ff 48 8b 80 08 05 00 00 8b 40 RSP: 0018:ffffbcc54068be20 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff954054494128 RCX: ffffbcc54068be36 RDX: ffff954004934000 RSI: 0000000000000001 RDI: ffff954054494128 RBP: 0000000000000023 R08: ffff954001be2c20 R09: 0000000000000001 R10: ffff954001be2c20 R11: ffff9540260133c0 R12: 0000000000000000 R13: 0000000000000023 R14: 0000000000000000 R15: ffff9540ffcb0905 FS: 0000000000000000(0000) GS:ffff9540ffc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000010625c001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mlx5_ib_gsi_pkey_change+0x20/0x40 [mlx5_ib] process_one_work+0x1e8/0x3c0 worker_thread+0x50/0x3b0 ? rescuer_thread+0x380/0x380 kthread+0x149/0x170 ? set_kthread_struct+0x50/0x50 ret_from_fork+0x22/0x30 Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) mlx5_fwctl(OE) fwctl(OE) ib_uverbs(OE) mlx5_core(OE) mlxdevm(OE) ib_core(OE) mlx_compat(OE) psample mlxfw(OE) tls knem(OE) netconsole nfsv3 nfs_acl nfs lockd grace fscache netfs qrtr rfkill sunrpc intel_rapl_msr intel_rapl_common rapl hv_balloon hv_utils i2c_piix4 pcspkr joydev fuse ext4 mbcache jbd2 sr_mod sd_mod cdrom t10_pi sg ata_generic pci_hyperv pci_hyperv_intf hyperv_drm drm_shmem_helper drm_kms_helper hv_storvsc syscopyarea hv_netvsc sysfillrect sysimgblt hid_hyperv fb_sys_fops scsi_transport_fc hyperv_keyboard drm ata_piix crct10dif_pclmul crc32_pclmul crc32c_intel libata ghash_clmulni_intel hv_vmbus serio_raw [last unloaded: ib_core] CR2: 0000000000000000 ---[ end trace f6f8be4eae12f7bc ]--- Fixes: 7722f47e71e5 ("IB/mlx5: Create GSI transmission QPs when P_Key table is changed") Signed-off-by: Patrisious Haddad <phaddad@nvidia.com> Reviewed-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/d271ceeff0c08431b3cbbbb3e2d416f09b6d1621.1731496944.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: CKI Backport Bot <cki-ci-bot+cki-gitlab-backport-bot@redhat.com>
2025-01-13 23:46:40 +00:00
for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
cancel_work_sync(&devr->ports[port].pkey_change_work);
}
RDMA/mlx5: Introduce the 'data direct' driver JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 6910e3660d86c1a5654f742a40181d2c9154f26f Author: Yishai Hadas <yishaih@nvidia.com> Date: Thu Aug 1 15:05:11 2024 +0300 RDMA/mlx5: Introduce the 'data direct' driver Introduce the 'data direct' driver for a ConnectX-8 Data Direct device. The 'data direct' driver functions as the affiliated DMA device for one or more capable mlx5_ib devices. This DMA device, as the name suggests, is used exclusively for DMA operations. It can be considered a DMA engine managed by a PF/VF, lacking network capabilities and having minimal overall capabilities. Consequently, the DMA NIC PF will not be exposed to or directly used by software applications. The driver will not have any direct interface or interaction with the firmware (no command interface, no capabilities, etc.). It will operate solely over PCI to enable its DMA functionality. Registration and un-registration of the driver are handled as part of the mlx5_ib initialization and exit processes, as the mlx5_ib devices will effectively be its clients. The driver will serve as the DMA device for accessing another PCI device to achieve optimal performance (both on the same NUMA node, P2P access, etc.). Upon probing, it will read its VUID over PCI to handle mlx5_ib device registrations with the same VUID. Upon removal, it will notify its clients to allow them to clean up the resources that were mmaped with its DMA device. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://patch.msgid.link/b77edecfd476c3f445da96ab6aef499ae47b2829.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:08 +00:00
void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev,
struct mlx5_data_direct_dev *dev)
{
mutex_lock(&ibdev->data_direct_lock);
ibdev->data_direct_dev = dev;
mutex_unlock(&ibdev->data_direct_lock);
}
void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev)
{
mutex_lock(&ibdev->data_direct_lock);
RDMA/mlx5: Add support for DMABUF MR registrations with Data-direct JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit de8f847a5114ff7cfcdfc114af8485c431dec703 Author: Yishai Hadas <yishaih@nvidia.com> Date: Thu Aug 1 15:05:16 2024 +0300 RDMA/mlx5: Add support for DMABUF MR registrations with Data-direct Add support for DMABUF MR registrations with Data-direct device. Upon userspace calling to register a DMABUF MR with the data direct bit set, the below algorithm will be followed. 1) Obtain a pinned DMABUF umem from the IB core using the user input parameters (FD, offset, length) and the DMA PF device. The DMA PF device is needed to allow the IOMMU to enable the DMA PF to access the user buffer over PCI. 2) Create a KSM MKEY by setting its entries according to the user buffer VA to IOVA mapping, with the MKEY being the data direct device-crossed MKEY. This KSM MKEY is umrable and will be used as part of the MR cache. The PD for creating it is the internal device 'data direct' kernel one. 3) Create a crossing MKEY that points to the KSM MKEY using the crossing access mode. 4) Manage the KSM MKEY by adding it to a list of 'data direct' MKEYs managed on the mlx5_ib device. 5) Return the crossing MKEY to the user, created with its supplied PD. Upon DMA PF unbind flow, the driver will revoke the KSM entries. The final deregistration will occur under the hood once the application deregisters its MKEY. Notes: - This version supports only the PINNED UMEM mode, so there is no dependency on ODP. - The IOVA supplied by the application must be system page aligned due to HW translations of KSM. - The crossing MKEY will not be umrable or part of the MR cache, as we cannot change its crossed (i.e. KSM) MKEY over UMR. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://patch.msgid.link/1f99d8020ed540d9702b9e2252a145a439609ba6.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:08 +00:00
mlx5_ib_revoke_data_direct_mrs(ibdev);
RDMA/mlx5: Introduce the 'data direct' driver JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 6910e3660d86c1a5654f742a40181d2c9154f26f Author: Yishai Hadas <yishaih@nvidia.com> Date: Thu Aug 1 15:05:11 2024 +0300 RDMA/mlx5: Introduce the 'data direct' driver Introduce the 'data direct' driver for a ConnectX-8 Data Direct device. The 'data direct' driver functions as the affiliated DMA device for one or more capable mlx5_ib devices. This DMA device, as the name suggests, is used exclusively for DMA operations. It can be considered a DMA engine managed by a PF/VF, lacking network capabilities and having minimal overall capabilities. Consequently, the DMA NIC PF will not be exposed to or directly used by software applications. The driver will not have any direct interface or interaction with the firmware (no command interface, no capabilities, etc.). It will operate solely over PCI to enable its DMA functionality. Registration and un-registration of the driver are handled as part of the mlx5_ib initialization and exit processes, as the mlx5_ib devices will effectively be its clients. The driver will serve as the DMA device for accessing another PCI device to achieve optimal performance (both on the same NUMA node, P2P access, etc.). Upon probing, it will read its VUID over PCI to handle mlx5_ib device registrations with the same VUID. Upon removal, it will notify its clients to allow them to clean up the resources that were mmaped with its DMA device. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://patch.msgid.link/b77edecfd476c3f445da96ab6aef499ae47b2829.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:08 +00:00
ibdev->data_direct_dev = NULL;
mutex_unlock(&ibdev->data_direct_lock);
}
void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
const struct mlx5_ib_profile *profile,
int stage)
{
dev->ib_active = false;
/* Number of stages to cleanup */
while (stage) {
stage--;
if (profile->stage[stage].cleanup)
profile->stage[stage].cleanup(dev);
}
kfree(dev->port);
ib_dealloc_device(&dev->ib_dev);
}
int __mlx5_ib_add(struct mlx5_ib_dev *dev,
const struct mlx5_ib_profile *profile)
{
int err;
int i;
dev->profile = profile;
for (i = 0; i < MLX5_IB_STAGE_MAX; i++) {
if (profile->stage[i].init) {
err = profile->stage[i].init(dev);
if (err)
goto err_out;
}
}
dev->ib_active = true;
return 0;
err_out:
/* Clean up stages which were initialized */
while (i) {
i--;
if (profile->stage[i].cleanup)
profile->stage[i].cleanup(dev);
}
return -ENOMEM;
}
static const struct mlx5_ib_profile pf_profile = {
STAGE_CREATE(MLX5_IB_STAGE_INIT,
mlx5_ib_stage_init_init,
mlx5_ib_stage_init_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_FS,
mlx5_ib_fs_init,
mlx5_ib_fs_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_CAPS,
mlx5_ib_stage_caps_init,
mlx5_ib_stage_caps_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
mlx5_ib_stage_non_default_cb,
NULL),
STAGE_CREATE(MLX5_IB_STAGE_ROCE,
mlx5_ib_roce_init,
mlx5_ib_roce_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_QP,
mlx5_init_qp_table,
mlx5_cleanup_qp_table),
STAGE_CREATE(MLX5_IB_STAGE_SRQ,
mlx5_init_srq_table,
mlx5_cleanup_srq_table),
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
mlx5_ib_dev_res_init,
mlx5_ib_dev_res_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_ODP,
mlx5_ib_odp_init_one,
mlx5_ib_odp_cleanup_one),
STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
mlx5_ib_counters_init,
mlx5_ib_counters_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
mlx5_ib_stage_cong_debugfs_init,
mlx5_ib_stage_cong_debugfs_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_UAR,
mlx5_ib_stage_uar_init,
mlx5_ib_stage_uar_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_BFREG,
mlx5_ib_stage_bfrag_init,
mlx5_ib_stage_bfrag_cleanup),
IB/mlx5: Fix cleanup order on unload On load we create private CQ/QP/PD in order to be used by UMR, we create those resources after we register ourself as an IB device, and we destroy them after we unregister as an IB device. This was changed by commit 16c1975f1032 ("IB/mlx5: Create profile infrastructure to add and remove stages") which moved the destruction before we unregistration. This allowed to trigger an invalid memory access when unloading mlx5_ib while there are open resources: BUG: unable to handle kernel paging request at 00000001002c012c ... Call Trace: mlx5_ib_post_send_wait+0x75/0x110 [mlx5_ib] __slab_free+0x9a/0x2d0 delay_time_func+0x10/0x10 [mlx5_ib] unreg_umr.isra.15+0x4b/0x50 [mlx5_ib] mlx5_mr_cache_free+0x46/0x150 [mlx5_ib] clean_mr+0xc9/0x190 [mlx5_ib] dereg_mr+0xba/0xf0 [mlx5_ib] ib_dereg_mr+0x13/0x20 [ib_core] remove_commit_idr_uobject+0x16/0x70 [ib_uverbs] uverbs_cleanup_ucontext+0xe8/0x1a0 [ib_uverbs] ib_uverbs_cleanup_ucontext.isra.9+0x19/0x40 [ib_uverbs] ib_uverbs_remove_one+0x162/0x2e0 [ib_uverbs] ib_unregister_device+0xd4/0x190 [ib_core] __mlx5_ib_remove+0x2e/0x40 [mlx5_ib] mlx5_remove_device+0xf5/0x120 [mlx5_core] mlx5_unregister_interface+0x37/0x90 [mlx5_core] mlx5_ib_cleanup+0xc/0x225 [mlx5_ib] SyS_delete_module+0x153/0x230 do_syscall_64+0x62/0x110 entry_SYSCALL_64_after_hwframe+0x21/0x86 ... We restore the original behavior by breaking the UMR stage into two parts, pre and post IB registration stages, this way we can restore the original functionality and maintain clean separation of logic between stages. Fixes: 16c1975f1032 ("IB/mlx5: Create profile infrastructure to add and remove stages") Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-03-14 07:14:15 +00:00
STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
NULL,
mlx5_ib_stage_pre_ib_reg_umr_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
mlx5_ib_devx_init,
mlx5_ib_devx_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
mlx5_ib_stage_ib_reg_init,
mlx5_ib_stage_ib_reg_cleanup),
RDMA/mlx5: Move events notifier registration to be after device registration JIRA: https://issues.redhat.com/browse/RHEL-72349 CVE: CVE-2024-53224 commit ede132a5cf559f3ab35a4c28bac4f4a6c20334d8 Author: Patrisious Haddad <phaddad@nvidia.com> Date: Wed Nov 13 13:23:19 2024 +0200 RDMA/mlx5: Move events notifier registration to be after device registration Move pkey change work initialization and cleanup from device resources stage to notifier stage, since this is the stage which handles this work events. Fix a race between the device deregistration and pkey change work by moving MLX5_IB_STAGE_DEVICE_NOTIFIER to be after MLX5_IB_STAGE_IB_REG in order to ensure that the notifier is deregistered before the device during cleanup. Which ensures there are no works that are being executed after the device has already unregistered which can cause the panic below. BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 630071 Comm: kworker/1:2 Kdump: loaded Tainted: G W OE --------- --- 5.14.0-162.6.1.el9_1.x86_64 #1 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090008 02/27/2023 Workqueue: events pkey_change_handler [mlx5_ib] RIP: 0010:setup_qp+0x38/0x1f0 [mlx5_ib] Code: ee 41 54 45 31 e4 55 89 f5 53 48 89 fb 48 83 ec 20 8b 77 08 65 48 8b 04 25 28 00 00 00 48 89 44 24 18 48 8b 07 48 8d 4c 24 16 <4c> 8b 38 49 8b 87 80 0b 00 00 4c 89 ff 48 8b 80 08 05 00 00 8b 40 RSP: 0018:ffffbcc54068be20 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff954054494128 RCX: ffffbcc54068be36 RDX: ffff954004934000 RSI: 0000000000000001 RDI: ffff954054494128 RBP: 0000000000000023 R08: ffff954001be2c20 R09: 0000000000000001 R10: ffff954001be2c20 R11: ffff9540260133c0 R12: 0000000000000000 R13: 0000000000000023 R14: 0000000000000000 R15: ffff9540ffcb0905 FS: 0000000000000000(0000) GS:ffff9540ffc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000010625c001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mlx5_ib_gsi_pkey_change+0x20/0x40 [mlx5_ib] process_one_work+0x1e8/0x3c0 worker_thread+0x50/0x3b0 ? rescuer_thread+0x380/0x380 kthread+0x149/0x170 ? set_kthread_struct+0x50/0x50 ret_from_fork+0x22/0x30 Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) mlx5_fwctl(OE) fwctl(OE) ib_uverbs(OE) mlx5_core(OE) mlxdevm(OE) ib_core(OE) mlx_compat(OE) psample mlxfw(OE) tls knem(OE) netconsole nfsv3 nfs_acl nfs lockd grace fscache netfs qrtr rfkill sunrpc intel_rapl_msr intel_rapl_common rapl hv_balloon hv_utils i2c_piix4 pcspkr joydev fuse ext4 mbcache jbd2 sr_mod sd_mod cdrom t10_pi sg ata_generic pci_hyperv pci_hyperv_intf hyperv_drm drm_shmem_helper drm_kms_helper hv_storvsc syscopyarea hv_netvsc sysfillrect sysimgblt hid_hyperv fb_sys_fops scsi_transport_fc hyperv_keyboard drm ata_piix crct10dif_pclmul crc32_pclmul crc32c_intel libata ghash_clmulni_intel hv_vmbus serio_raw [last unloaded: ib_core] CR2: 0000000000000000 ---[ end trace f6f8be4eae12f7bc ]--- Fixes: 7722f47e71e5 ("IB/mlx5: Create GSI transmission QPs when P_Key table is changed") Signed-off-by: Patrisious Haddad <phaddad@nvidia.com> Reviewed-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/d271ceeff0c08431b3cbbbb3e2d416f09b6d1621.1731496944.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: CKI Backport Bot <cki-ci-bot+cki-gitlab-backport-bot@redhat.com>
2025-01-13 23:46:40 +00:00
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
mlx5_ib_stage_dev_notifier_init,
mlx5_ib_stage_dev_notifier_cleanup),
IB/mlx5: Fix cleanup order on unload On load we create private CQ/QP/PD in order to be used by UMR, we create those resources after we register ourself as an IB device, and we destroy them after we unregister as an IB device. This was changed by commit 16c1975f1032 ("IB/mlx5: Create profile infrastructure to add and remove stages") which moved the destruction before we unregistration. This allowed to trigger an invalid memory access when unloading mlx5_ib while there are open resources: BUG: unable to handle kernel paging request at 00000001002c012c ... Call Trace: mlx5_ib_post_send_wait+0x75/0x110 [mlx5_ib] __slab_free+0x9a/0x2d0 delay_time_func+0x10/0x10 [mlx5_ib] unreg_umr.isra.15+0x4b/0x50 [mlx5_ib] mlx5_mr_cache_free+0x46/0x150 [mlx5_ib] clean_mr+0xc9/0x190 [mlx5_ib] dereg_mr+0xba/0xf0 [mlx5_ib] ib_dereg_mr+0x13/0x20 [ib_core] remove_commit_idr_uobject+0x16/0x70 [ib_uverbs] uverbs_cleanup_ucontext+0xe8/0x1a0 [ib_uverbs] ib_uverbs_cleanup_ucontext.isra.9+0x19/0x40 [ib_uverbs] ib_uverbs_remove_one+0x162/0x2e0 [ib_uverbs] ib_unregister_device+0xd4/0x190 [ib_core] __mlx5_ib_remove+0x2e/0x40 [mlx5_ib] mlx5_remove_device+0xf5/0x120 [mlx5_core] mlx5_unregister_interface+0x37/0x90 [mlx5_core] mlx5_ib_cleanup+0xc/0x225 [mlx5_ib] SyS_delete_module+0x153/0x230 do_syscall_64+0x62/0x110 entry_SYSCALL_64_after_hwframe+0x21/0x86 ... We restore the original behavior by breaking the UMR stage into two parts, pre and post IB registration stages, this way we can restore the original functionality and maintain clean separation of logic between stages. Fixes: 16c1975f1032 ("IB/mlx5: Create profile infrastructure to add and remove stages") Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-03-14 07:14:15 +00:00
STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
mlx5_ib_stage_post_ib_reg_umr_init,
NULL),
STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP,
mlx5_ib_stage_delay_drop_init,
mlx5_ib_stage_delay_drop_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_RESTRACK,
mlx5_ib_restrack_init,
NULL),
};
const struct mlx5_ib_profile raw_eth_profile = {
STAGE_CREATE(MLX5_IB_STAGE_INIT,
mlx5_ib_stage_init_init,
mlx5_ib_stage_init_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_FS,
mlx5_ib_fs_init,
mlx5_ib_fs_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_CAPS,
mlx5_ib_stage_caps_init,
mlx5_ib_stage_caps_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
mlx5_ib_stage_raw_eth_non_default_cb,
NULL),
STAGE_CREATE(MLX5_IB_STAGE_ROCE,
mlx5_ib_roce_init,
mlx5_ib_roce_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_QP,
mlx5_init_qp_table,
mlx5_cleanup_qp_table),
STAGE_CREATE(MLX5_IB_STAGE_SRQ,
mlx5_init_srq_table,
mlx5_cleanup_srq_table),
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
mlx5_ib_dev_res_init,
mlx5_ib_dev_res_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
mlx5_ib_counters_init,
mlx5_ib_counters_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
mlx5_ib_stage_cong_debugfs_init,
mlx5_ib_stage_cong_debugfs_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_UAR,
mlx5_ib_stage_uar_init,
mlx5_ib_stage_uar_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_BFREG,
mlx5_ib_stage_bfrag_init,
mlx5_ib_stage_bfrag_cleanup),
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net Fun set of conflict resolutions here... For the mac80211 stuff, these were fortunately just parallel adds. Trivially resolved. In drivers/net/phy/phy.c we had a bug fix in 'net' that moved the function phy_disable_interrupts() earlier in the file, whilst in 'net-next' the phy_error() call from this function was removed. In net/ipv4/xfrm4_policy.c, David Ahern's changes to remove the 'rt_table_id' member of rtable collided with a bug fix in 'net' that added a new struct member "rt_mtu_locked" which needs to be copied over here. The mlxsw driver conflict consisted of net-next separating the span code and definitions into separate files, whilst a 'net' bug fix made some changes to that moved code. The mlx5 infiniband conflict resolution was quite non-trivial, the RDMA tree's merge commit was used as a guide here, and here are their notes: ==================== Due to bug fixes found by the syzkaller bot and taken into the for-rc branch after development for the 4.17 merge window had already started being taken into the for-next branch, there were fairly non-trivial merge issues that would need to be resolved between the for-rc branch and the for-next branch. This merge resolves those conflicts and provides a unified base upon which ongoing development for 4.17 can be based. Conflicts: drivers/infiniband/hw/mlx5/main.c - Commit 42cea83f9524 (IB/mlx5: Fix cleanup order on unload) added to for-rc and commit b5ca15ad7e61 (IB/mlx5: Add proper representors support) add as part of the devel cycle both needed to modify the init/de-init functions used by mlx5. To support the new representors, the new functions added by the cleanup patch needed to be made non-static, and the init/de-init list added by the representors patch needed to be modified to match the init/de-init list changes made by the cleanup patch. Updates: drivers/infiniband/hw/mlx5/mlx5_ib.h - Update function prototypes added by representors patch to reflect new function names as changed by cleanup patch drivers/infiniband/hw/mlx5/ib_rep.c - Update init/de-init stage list to match new order from cleanup patch ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-23 15:24:57 +00:00
STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
NULL,
mlx5_ib_stage_pre_ib_reg_umr_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
mlx5_ib_devx_init,
mlx5_ib_devx_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
mlx5_ib_stage_ib_reg_init,
mlx5_ib_stage_ib_reg_cleanup),
RDMA/mlx5: Move events notifier registration to be after device registration JIRA: https://issues.redhat.com/browse/RHEL-72349 CVE: CVE-2024-53224 commit ede132a5cf559f3ab35a4c28bac4f4a6c20334d8 Author: Patrisious Haddad <phaddad@nvidia.com> Date: Wed Nov 13 13:23:19 2024 +0200 RDMA/mlx5: Move events notifier registration to be after device registration Move pkey change work initialization and cleanup from device resources stage to notifier stage, since this is the stage which handles this work events. Fix a race between the device deregistration and pkey change work by moving MLX5_IB_STAGE_DEVICE_NOTIFIER to be after MLX5_IB_STAGE_IB_REG in order to ensure that the notifier is deregistered before the device during cleanup. Which ensures there are no works that are being executed after the device has already unregistered which can cause the panic below. BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 630071 Comm: kworker/1:2 Kdump: loaded Tainted: G W OE --------- --- 5.14.0-162.6.1.el9_1.x86_64 #1 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090008 02/27/2023 Workqueue: events pkey_change_handler [mlx5_ib] RIP: 0010:setup_qp+0x38/0x1f0 [mlx5_ib] Code: ee 41 54 45 31 e4 55 89 f5 53 48 89 fb 48 83 ec 20 8b 77 08 65 48 8b 04 25 28 00 00 00 48 89 44 24 18 48 8b 07 48 8d 4c 24 16 <4c> 8b 38 49 8b 87 80 0b 00 00 4c 89 ff 48 8b 80 08 05 00 00 8b 40 RSP: 0018:ffffbcc54068be20 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff954054494128 RCX: ffffbcc54068be36 RDX: ffff954004934000 RSI: 0000000000000001 RDI: ffff954054494128 RBP: 0000000000000023 R08: ffff954001be2c20 R09: 0000000000000001 R10: ffff954001be2c20 R11: ffff9540260133c0 R12: 0000000000000000 R13: 0000000000000023 R14: 0000000000000000 R15: ffff9540ffcb0905 FS: 0000000000000000(0000) GS:ffff9540ffc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000010625c001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mlx5_ib_gsi_pkey_change+0x20/0x40 [mlx5_ib] process_one_work+0x1e8/0x3c0 worker_thread+0x50/0x3b0 ? rescuer_thread+0x380/0x380 kthread+0x149/0x170 ? set_kthread_struct+0x50/0x50 ret_from_fork+0x22/0x30 Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) mlx5_fwctl(OE) fwctl(OE) ib_uverbs(OE) mlx5_core(OE) mlxdevm(OE) ib_core(OE) mlx_compat(OE) psample mlxfw(OE) tls knem(OE) netconsole nfsv3 nfs_acl nfs lockd grace fscache netfs qrtr rfkill sunrpc intel_rapl_msr intel_rapl_common rapl hv_balloon hv_utils i2c_piix4 pcspkr joydev fuse ext4 mbcache jbd2 sr_mod sd_mod cdrom t10_pi sg ata_generic pci_hyperv pci_hyperv_intf hyperv_drm drm_shmem_helper drm_kms_helper hv_storvsc syscopyarea hv_netvsc sysfillrect sysimgblt hid_hyperv fb_sys_fops scsi_transport_fc hyperv_keyboard drm ata_piix crct10dif_pclmul crc32_pclmul crc32c_intel libata ghash_clmulni_intel hv_vmbus serio_raw [last unloaded: ib_core] CR2: 0000000000000000 ---[ end trace f6f8be4eae12f7bc ]--- Fixes: 7722f47e71e5 ("IB/mlx5: Create GSI transmission QPs when P_Key table is changed") Signed-off-by: Patrisious Haddad <phaddad@nvidia.com> Reviewed-by: Michael Guralnik <michaelgur@nvidia.com> Link: https://patch.msgid.link/d271ceeff0c08431b3cbbbb3e2d416f09b6d1621.1731496944.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: CKI Backport Bot <cki-ci-bot+cki-gitlab-backport-bot@redhat.com>
2025-01-13 23:46:40 +00:00
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
mlx5_ib_stage_dev_notifier_init,
mlx5_ib_stage_dev_notifier_cleanup),
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net Fun set of conflict resolutions here... For the mac80211 stuff, these were fortunately just parallel adds. Trivially resolved. In drivers/net/phy/phy.c we had a bug fix in 'net' that moved the function phy_disable_interrupts() earlier in the file, whilst in 'net-next' the phy_error() call from this function was removed. In net/ipv4/xfrm4_policy.c, David Ahern's changes to remove the 'rt_table_id' member of rtable collided with a bug fix in 'net' that added a new struct member "rt_mtu_locked" which needs to be copied over here. The mlxsw driver conflict consisted of net-next separating the span code and definitions into separate files, whilst a 'net' bug fix made some changes to that moved code. The mlx5 infiniband conflict resolution was quite non-trivial, the RDMA tree's merge commit was used as a guide here, and here are their notes: ==================== Due to bug fixes found by the syzkaller bot and taken into the for-rc branch after development for the 4.17 merge window had already started being taken into the for-next branch, there were fairly non-trivial merge issues that would need to be resolved between the for-rc branch and the for-next branch. This merge resolves those conflicts and provides a unified base upon which ongoing development for 4.17 can be based. Conflicts: drivers/infiniband/hw/mlx5/main.c - Commit 42cea83f9524 (IB/mlx5: Fix cleanup order on unload) added to for-rc and commit b5ca15ad7e61 (IB/mlx5: Add proper representors support) add as part of the devel cycle both needed to modify the init/de-init functions used by mlx5. To support the new representors, the new functions added by the cleanup patch needed to be made non-static, and the init/de-init list added by the representors patch needed to be modified to match the init/de-init list changes made by the cleanup patch. Updates: drivers/infiniband/hw/mlx5/mlx5_ib.h - Update function prototypes added by representors patch to reflect new function names as changed by cleanup patch drivers/infiniband/hw/mlx5/ib_rep.c - Update init/de-init stage list to match new order from cleanup patch ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-23 15:24:57 +00:00
STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
mlx5_ib_stage_post_ib_reg_umr_init,
NULL),
STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP,
mlx5_ib_stage_delay_drop_init,
mlx5_ib_stage_delay_drop_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_RESTRACK,
mlx5_ib_restrack_init,
NULL),
};
static const struct mlx5_ib_profile plane_profile = {
STAGE_CREATE(MLX5_IB_STAGE_INIT,
mlx5_ib_stage_init_init,
mlx5_ib_stage_init_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_CAPS,
mlx5_ib_stage_caps_init,
mlx5_ib_stage_caps_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
mlx5_ib_stage_non_default_cb,
NULL),
STAGE_CREATE(MLX5_IB_STAGE_QP,
mlx5_init_qp_table,
mlx5_cleanup_qp_table),
STAGE_CREATE(MLX5_IB_STAGE_SRQ,
mlx5_init_srq_table,
mlx5_cleanup_srq_table),
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
mlx5_ib_dev_res_init,
mlx5_ib_dev_res_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_BFREG,
mlx5_ib_stage_bfrag_init,
mlx5_ib_stage_bfrag_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
mlx5_ib_stage_ib_reg_init,
mlx5_ib_stage_ib_reg_cleanup),
};
static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent,
enum rdma_nl_dev_type type,
const char *name)
{
struct mlx5_ib_dev *mparent = to_mdev(parent), *mplane;
enum rdma_link_layer ll;
int ret;
if (mparent->smi_dev)
return ERR_PTR(-EEXIST);
ll = mlx5_port_type_cap_to_rdma_ll(MLX5_CAP_GEN(mparent->mdev,
port_type));
if (type != RDMA_DEVICE_TYPE_SMI || !mparent->num_plane ||
ll != IB_LINK_LAYER_INFINIBAND ||
!MLX5_CAP_GEN_2(mparent->mdev, multiplane_qp_ud))
return ERR_PTR(-EOPNOTSUPP);
mplane = ib_alloc_device(mlx5_ib_dev, ib_dev);
if (!mplane)
return ERR_PTR(-ENOMEM);
mplane->port = kcalloc(mparent->num_plane * mparent->num_ports,
sizeof(*mplane->port), GFP_KERNEL);
if (!mplane->port) {
ret = -ENOMEM;
goto fail_kcalloc;
}
mplane->ib_dev.type = type;
mplane->mdev = mparent->mdev;
mplane->num_ports = mparent->num_plane;
mplane->sub_dev_name = name;
mplane->ib_dev.phys_port_cnt = mplane->num_ports;
ret = __mlx5_ib_add(mplane, &plane_profile);
if (ret)
goto fail_ib_add;
mparent->smi_dev = mplane;
return &mplane->ib_dev;
fail_ib_add:
kfree(mplane->port);
fail_kcalloc:
ib_dealloc_device(&mplane->ib_dev);
return ERR_PTR(ret);
}
static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev)
{
struct mlx5_ib_dev *mdev = to_mdev(sub_dev);
to_mdev(sub_dev->parent)->smi_dev = NULL;
__mlx5_ib_remove(mdev, mdev->profile, MLX5_IB_STAGE_MAX);
}
static int mlx5r_mp_probe(struct auxiliary_device *adev,
const struct auxiliary_device_id *id)
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
{
struct mlx5_adev *idev = container_of(adev, struct mlx5_adev, adev);
struct mlx5_core_dev *mdev = idev->mdev;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
struct mlx5_ib_multiport_info *mpi;
struct mlx5_ib_dev *dev;
bool bound = false;
int err;
mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
if (!mpi)
return -ENOMEM;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
mpi->mdev = mdev;
err = mlx5_query_nic_vport_system_image_guid(mdev,
&mpi->sys_image_guid);
if (err) {
kfree(mpi);
return err;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
}
mutex_lock(&mlx5_ib_multiport_mutex);
list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) {
if (dev->sys_image_guid == mpi->sys_image_guid)
bound = mlx5_ib_bind_slave_port(dev, mpi);
if (bound) {
rdma_roce_rescan_device(&dev->ib_dev);
mpi->ibdev->ib_active = true;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
break;
}
}
if (!bound) {
list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
dev_dbg(mdev->device,
"no suitable IB device found to bind to, added to unaffiliated list.\n");
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
}
mutex_unlock(&mlx5_ib_multiport_mutex);
auxiliary_set_drvdata(adev, mpi);
return 0;
}
static void mlx5r_mp_remove(struct auxiliary_device *adev)
{
struct mlx5_ib_multiport_info *mpi;
mpi = auxiliary_get_drvdata(adev);
mutex_lock(&mlx5_ib_multiport_mutex);
if (mpi->ibdev)
mlx5_ib_unbind_slave_port(mpi->ibdev, mpi);
else
list_del(&mpi->list);
mutex_unlock(&mlx5_ib_multiport_mutex);
kfree(mpi);
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
}
static int mlx5r_probe(struct auxiliary_device *adev,
const struct auxiliary_device_id *id)
{
struct mlx5_adev *idev = container_of(adev, struct mlx5_adev, adev);
struct mlx5_core_dev *mdev = idev->mdev;
const struct mlx5_ib_profile *profile;
int port_type_cap, num_ports, ret;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
enum rdma_link_layer ll;
struct mlx5_ib_dev *dev;
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
port_type_cap = MLX5_CAP_GEN(mdev, port_type);
ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
MLX5_CAP_GEN(mdev, num_vhca_ports));
dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
if (!dev)
return -ENOMEM;
if (ll == IB_LINK_LAYER_INFINIBAND) {
ret = mlx5_ib_get_plane_num(mdev, &dev->num_plane);
if (ret)
goto fail;
}
dev->port = kcalloc(num_ports, sizeof(*dev->port),
GFP_KERNEL);
if (!dev->port) {
ret = -ENOMEM;
goto fail;
}
dev->mdev = mdev;
dev->num_ports = num_ports;
dev->ib_dev.phys_port_cnt = num_ports;
RDMA/mlx5: Rely on RoCE fw cap instead of devlink when setting profile Bugzilla: https://bugzilla.redhat.com/2112947 Upstream-status: v6.0-rc5 commit 9ca05b0f27de928be121cccf07735819dc9e1ed3 Author: Maher Sanalla <msanalla@nvidia.com> Date: Mon Aug 29 12:02:27 2022 +0300 RDMA/mlx5: Rely on RoCE fw cap instead of devlink when setting profile When the RDMA auxiliary driver probes, it sets its profile based on devlink driverinit value. The latter might not be in sync with FW yet (In case devlink reload is not performed), thus causing a mismatch between RDMA driver and FW. This results in the following FW syndrome when the RDMA driver tries to adjust RoCE state, which fails the probe: "0xC1F678 | modify_nic_vport_context: roce_en set on a vport that doesn't support roce" To prevent this, select the PF profile based on FW RoCE capability instead of relying on devlink driverinit value. To provide backward compatibility of the RoCE disable feature, on older FW's where roce_rw is not set (FW RoCE capability is read-only), keep the current behavior e.g., rely on devlink driverinit value. Fixes: fbfa97b4d79f ("net/mlx5: Disable roce at HCA level") Reviewed-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Michael Guralnik <michaelgur@nvidia.com> Reviewed-by: Saeed Mahameed <saeedm@nvidia.com> Signed-off-by: Maher Sanalla <msanalla@nvidia.com> Link: https://lore.kernel.org/r/cb34ce9a1df4a24c135cb804db87f7d2418bd6cc.1661763459.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Mohammad Kabat <mkabat@redhat.com>
2022-12-11 13:58:37 +00:00
if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev))
profile = &raw_eth_profile;
else
profile = &pf_profile;
ret = __mlx5_ib_add(dev, profile);
if (ret)
goto fail_ib_add;
auxiliary_set_drvdata(adev, dev);
return 0;
fail_ib_add:
kfree(dev->port);
fail:
ib_dealloc_device(&dev->ib_dev);
return ret;
}
static void mlx5r_remove(struct auxiliary_device *adev)
{
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
struct mlx5_ib_dev *dev;
dev = auxiliary_get_drvdata(adev);
__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
}
static const struct auxiliary_device_id mlx5r_mp_id_table[] = {
{ .name = MLX5_ADEV_NAME ".multiport", },
{},
};
static const struct auxiliary_device_id mlx5r_id_table[] = {
{ .name = MLX5_ADEV_NAME ".rdma", },
{},
};
MODULE_DEVICE_TABLE(auxiliary, mlx5r_mp_id_table);
MODULE_DEVICE_TABLE(auxiliary, mlx5r_id_table);
static struct auxiliary_driver mlx5r_mp_driver = {
.name = "multiport",
.probe = mlx5r_mp_probe,
.remove = mlx5r_mp_remove,
.id_table = mlx5r_mp_id_table,
};
static struct auxiliary_driver mlx5r_driver = {
.name = "rdma",
.probe = mlx5r_probe,
.remove = mlx5r_remove,
.id_table = mlx5r_id_table,
};
static int __init mlx5_ib_init(void)
{
int ret;
xlt_emergency_page = (void *)__get_free_page(GFP_KERNEL);
if (!xlt_emergency_page)
return -ENOMEM;
mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0);
if (!mlx5_ib_event_wq) {
free_page((unsigned long)xlt_emergency_page);
return -ENOMEM;
}
ret = mlx5_ib_qp_event_init();
if (ret)
goto qp_event_err;
mlx5_ib_odp_init();
ret = mlx5r_rep_init();
if (ret)
goto rep_err;
RDMA/mlx5: Introduce the 'data direct' driver JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 6910e3660d86c1a5654f742a40181d2c9154f26f Author: Yishai Hadas <yishaih@nvidia.com> Date: Thu Aug 1 15:05:11 2024 +0300 RDMA/mlx5: Introduce the 'data direct' driver Introduce the 'data direct' driver for a ConnectX-8 Data Direct device. The 'data direct' driver functions as the affiliated DMA device for one or more capable mlx5_ib devices. This DMA device, as the name suggests, is used exclusively for DMA operations. It can be considered a DMA engine managed by a PF/VF, lacking network capabilities and having minimal overall capabilities. Consequently, the DMA NIC PF will not be exposed to or directly used by software applications. The driver will not have any direct interface or interaction with the firmware (no command interface, no capabilities, etc.). It will operate solely over PCI to enable its DMA functionality. Registration and un-registration of the driver are handled as part of the mlx5_ib initialization and exit processes, as the mlx5_ib devices will effectively be its clients. The driver will serve as the DMA device for accessing another PCI device to achieve optimal performance (both on the same NUMA node, P2P access, etc.). Upon probing, it will read its VUID over PCI to handle mlx5_ib device registrations with the same VUID. Upon removal, it will notify its clients to allow them to clean up the resources that were mmaped with its DMA device. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://patch.msgid.link/b77edecfd476c3f445da96ab6aef499ae47b2829.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:08 +00:00
ret = mlx5_data_direct_driver_register();
if (ret)
goto dd_err;
ret = auxiliary_driver_register(&mlx5r_mp_driver);
if (ret)
goto mp_err;
ret = auxiliary_driver_register(&mlx5r_driver);
if (ret)
goto drv_err;
RDMA/mlx5: Introduce the 'data direct' driver JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 6910e3660d86c1a5654f742a40181d2c9154f26f Author: Yishai Hadas <yishaih@nvidia.com> Date: Thu Aug 1 15:05:11 2024 +0300 RDMA/mlx5: Introduce the 'data direct' driver Introduce the 'data direct' driver for a ConnectX-8 Data Direct device. The 'data direct' driver functions as the affiliated DMA device for one or more capable mlx5_ib devices. This DMA device, as the name suggests, is used exclusively for DMA operations. It can be considered a DMA engine managed by a PF/VF, lacking network capabilities and having minimal overall capabilities. Consequently, the DMA NIC PF will not be exposed to or directly used by software applications. The driver will not have any direct interface or interaction with the firmware (no command interface, no capabilities, etc.). It will operate solely over PCI to enable its DMA functionality. Registration and un-registration of the driver are handled as part of the mlx5_ib initialization and exit processes, as the mlx5_ib devices will effectively be its clients. The driver will serve as the DMA device for accessing another PCI device to achieve optimal performance (both on the same NUMA node, P2P access, etc.). Upon probing, it will read its VUID over PCI to handle mlx5_ib device registrations with the same VUID. Upon removal, it will notify its clients to allow them to clean up the resources that were mmaped with its DMA device. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://patch.msgid.link/b77edecfd476c3f445da96ab6aef499ae47b2829.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:08 +00:00
return 0;
drv_err:
auxiliary_driver_unregister(&mlx5r_mp_driver);
mp_err:
RDMA/mlx5: Introduce the 'data direct' driver JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 6910e3660d86c1a5654f742a40181d2c9154f26f Author: Yishai Hadas <yishaih@nvidia.com> Date: Thu Aug 1 15:05:11 2024 +0300 RDMA/mlx5: Introduce the 'data direct' driver Introduce the 'data direct' driver for a ConnectX-8 Data Direct device. The 'data direct' driver functions as the affiliated DMA device for one or more capable mlx5_ib devices. This DMA device, as the name suggests, is used exclusively for DMA operations. It can be considered a DMA engine managed by a PF/VF, lacking network capabilities and having minimal overall capabilities. Consequently, the DMA NIC PF will not be exposed to or directly used by software applications. The driver will not have any direct interface or interaction with the firmware (no command interface, no capabilities, etc.). It will operate solely over PCI to enable its DMA functionality. Registration and un-registration of the driver are handled as part of the mlx5_ib initialization and exit processes, as the mlx5_ib devices will effectively be its clients. The driver will serve as the DMA device for accessing another PCI device to achieve optimal performance (both on the same NUMA node, P2P access, etc.). Upon probing, it will read its VUID over PCI to handle mlx5_ib device registrations with the same VUID. Upon removal, it will notify its clients to allow them to clean up the resources that were mmaped with its DMA device. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://patch.msgid.link/b77edecfd476c3f445da96ab6aef499ae47b2829.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:08 +00:00
mlx5_data_direct_driver_unregister();
dd_err:
mlx5r_rep_cleanup();
rep_err:
mlx5_ib_qp_event_cleanup();
qp_event_err:
destroy_workqueue(mlx5_ib_event_wq);
free_page((unsigned long)xlt_emergency_page);
return ret;
}
static void __exit mlx5_ib_cleanup(void)
{
RDMA/mlx5: Introduce the 'data direct' driver JIRA: https://issues.redhat.com/browse/RHEL-52869 Upstream-status: v6.12-rc1 commit 6910e3660d86c1a5654f742a40181d2c9154f26f Author: Yishai Hadas <yishaih@nvidia.com> Date: Thu Aug 1 15:05:11 2024 +0300 RDMA/mlx5: Introduce the 'data direct' driver Introduce the 'data direct' driver for a ConnectX-8 Data Direct device. The 'data direct' driver functions as the affiliated DMA device for one or more capable mlx5_ib devices. This DMA device, as the name suggests, is used exclusively for DMA operations. It can be considered a DMA engine managed by a PF/VF, lacking network capabilities and having minimal overall capabilities. Consequently, the DMA NIC PF will not be exposed to or directly used by software applications. The driver will not have any direct interface or interaction with the firmware (no command interface, no capabilities, etc.). It will operate solely over PCI to enable its DMA functionality. Registration and un-registration of the driver are handled as part of the mlx5_ib initialization and exit processes, as the mlx5_ib devices will effectively be its clients. The driver will serve as the DMA device for accessing another PCI device to achieve optimal performance (both on the same NUMA node, P2P access, etc.). Upon probing, it will read its VUID over PCI to handle mlx5_ib device registrations with the same VUID. Upon removal, it will notify its clients to allow them to clean up the resources that were mmaped with its DMA device. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://patch.msgid.link/b77edecfd476c3f445da96ab6aef499ae47b2829.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Benjamin Poirier <bpoirier@redhat.com>
2024-12-05 15:32:08 +00:00
mlx5_data_direct_driver_unregister();
auxiliary_driver_unregister(&mlx5r_driver);
auxiliary_driver_unregister(&mlx5r_mp_driver);
mlx5r_rep_cleanup();
mlx5_ib_qp_event_cleanup();
destroy_workqueue(mlx5_ib_event_wq);
free_page((unsigned long)xlt_emergency_page);
}
module_init(mlx5_ib_init);
module_exit(mlx5_ib_cleanup);