2017-04-13 12:36:55 +09:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2017, Mellanox Technologies. All rights reserved.
|
|
|
|
*
|
|
|
|
* This software is available to you under a choice of one of two
|
|
|
|
* licenses. You may choose to be licensed under the terms of the GNU
|
|
|
|
* General Public License (GPL) Version 2, available from the file
|
|
|
|
* COPYING in the main directory of this source tree, or the
|
|
|
|
* OpenIB.org BSD license below:
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or
|
|
|
|
* without modification, are permitted provided that the following
|
|
|
|
* conditions are met:
|
|
|
|
*
|
|
|
|
* - Redistributions of source code must retain the above
|
|
|
|
* copyright notice, this list of conditions and the following
|
|
|
|
* disclaimer.
|
|
|
|
*
|
|
|
|
* - Redistributions in binary form must reproduce the above
|
|
|
|
* copyright notice, this list of conditions and the following
|
|
|
|
* disclaimer in the documentation and/or other materials
|
|
|
|
* provided with the distribution.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
|
|
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
|
|
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
|
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
|
|
* SOFTWARE.
|
|
|
|
*/
|
|
|
|
|
2017-04-27 23:01:34 +09:00
|
|
|
#include <rdma/ib_verbs.h>
|
2017-04-13 12:36:55 +09:00
|
|
|
#include <linux/mlx5/fs.h>
|
|
|
|
#include "en.h"
|
|
|
|
#include "ipoib.h"
|
|
|
|
|
2017-04-13 12:37:00 +09:00
|
|
|
#define IB_DEFAULT_Q_KEY 0xb1b
|
2017-05-18 20:44:15 +09:00
|
|
|
#define MLX5I_PARAMS_DEFAULT_LOG_RQ_SIZE 9
|
2017-04-13 12:37:00 +09:00
|
|
|
|
2017-04-13 12:36:59 +09:00
|
|
|
static int mlx5i_open(struct net_device *netdev);
|
|
|
|
static int mlx5i_close(struct net_device *netdev);
|
2017-05-21 14:56:20 +09:00
|
|
|
static int mlx5i_change_mtu(struct net_device *netdev, int new_mtu);
|
2017-04-13 12:36:59 +09:00
|
|
|
|
|
|
|
/* netdev callbacks for the IPoIB mlx5 netdevice.
 * Stats, init/uninit, MTU and ioctl handling are shared with the
 * mlx5e core via the mlx5i_* wrappers declared in ipoib.h.
 */
static const struct net_device_ops mlx5i_netdev_ops = {
	.ndo_open                = mlx5i_open,
	.ndo_stop                = mlx5i_close,
	.ndo_get_stats64         = mlx5i_get_stats,
	.ndo_init                = mlx5i_dev_init,
	.ndo_uninit              = mlx5i_dev_cleanup,
	.ndo_change_mtu          = mlx5i_change_mtu,
	.ndo_do_ioctl            = mlx5i_ioctl,
};
|
|
|
|
|
2017-04-13 12:36:55 +09:00
|
|
|
/* IPoIB mlx5 netdev profile */
|
2017-05-18 20:44:15 +09:00
|
|
|
/* Apply IPoIB-specific overrides on top of the generic NIC params.
 *
 * Must run after mlx5e_build_nic_params(): it forces the legacy
 * (linked-list) RQ, fixes the default RQ size, disables LRO and
 * tunneled offloads, and accounts for the IPoIB hard header.
 */
static void mlx5i_build_nic_params(struct mlx5_core_dev *mdev,
				   struct mlx5e_params *params)
{
	/* Override RQ params as IPoIB supports only LINKED LIST RQ for now */
	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ, false);
	/* Re-derive the RQ type after clearing the striding-RQ pflag */
	mlx5e_set_rq_type(mdev, params);
	mlx5e_init_rq_type_params(mdev, params);

	/* RQ size in ipoib by default is 512 */
	params->log_rq_mtu_frames = is_kdump_kernel() ?
		MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE :
		MLX5I_PARAMS_DEFAULT_LOG_RQ_SIZE;

	params->lro_en = false;
	/* hard_mtu covers the 40-byte GRH plus the IPoIB hard header */
	params->hard_mtu = MLX5_IB_GRH_BYTES + MLX5_IPOIB_HARD_LEN;
	params->tunneled_offload_en = false;
}
|
2017-04-13 12:36:55 +09:00
|
|
|
|
|
|
|
/* Called directly after IPoIB netdevice was created to initialize SW structs */
|
2018-10-02 15:54:59 +09:00
|
|
|
/* Called directly after IPoIB netdevice was created to initialize SW structs.
 *
 * Initializes the mlx5e private context embedded in @netdev, builds the
 * channel parameters (generic NIC params first, IPoIB overrides second),
 * and wires up the netdev/ethtool ops and offload feature flags.
 * Returns 0 on success or a negative errno from mlx5e_netdev_init().
 */
int mlx5i_init(struct mlx5_core_dev *mdev,
	       struct net_device *netdev,
	       const struct mlx5e_profile *profile,
	       void *ppriv)
{
	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
	int err;

	err = mlx5e_netdev_init(netdev, priv, mdev, profile, ppriv);
	if (err)
		return err;

	mlx5e_set_netdev_mtu_boundaries(priv);
	/* Start at the largest MTU the device allows */
	netdev->mtu = netdev->max_mtu;

	mlx5e_build_nic_params(mdev, NULL, &priv->rss_params, &priv->channels.params,
			       priv->max_nch, netdev->mtu);
	/* IPoIB-specific overrides must come after the generic params */
	mlx5i_build_nic_params(mdev, &priv->channels.params);

	mlx5e_timestamp_init(priv);

	/* netdev init */
	netdev->hw_features |= NETIF_F_SG;
	netdev->hw_features |= NETIF_F_IP_CSUM;
	netdev->hw_features |= NETIF_F_IPV6_CSUM;
	netdev->hw_features |= NETIF_F_GRO;
	netdev->hw_features |= NETIF_F_TSO;
	netdev->hw_features |= NETIF_F_TSO6;
	netdev->hw_features |= NETIF_F_RXCSUM;
	netdev->hw_features |= NETIF_F_RXHASH;

	netdev->netdev_ops = &mlx5i_netdev_ops;
	netdev->ethtool_ops = &mlx5i_ethtool_ops;

	return 0;
}
|
|
|
|
|
|
|
|
/* Called directly before IPoIB netdevice is destroyed to cleanup SW structs */
|
2018-10-02 15:54:59 +09:00
|
|
|
/* Called directly before IPoIB netdevice is destroyed to cleanup SW structs */
void mlx5i_cleanup(struct mlx5e_priv *priv)
{
	/* Undoes mlx5e_netdev_init() done in mlx5i_init() */
	mlx5e_netdev_cleanup(priv->netdev, priv);
}
|
|
|
|
|
2018-09-05 20:16:02 +09:00
|
|
|
static void mlx5i_grp_sw_update_stats(struct mlx5e_priv *priv)
|
2018-09-03 04:12:08 +09:00
|
|
|
{
|
|
|
|
struct mlx5e_sw_stats s = { 0 };
|
|
|
|
int i, j;
|
|
|
|
|
2019-07-14 17:43:43 +09:00
|
|
|
for (i = 0; i < priv->max_nch; i++) {
|
2018-09-03 04:12:08 +09:00
|
|
|
struct mlx5e_channel_stats *channel_stats;
|
|
|
|
struct mlx5e_rq_stats *rq_stats;
|
|
|
|
|
|
|
|
channel_stats = &priv->channel_stats[i];
|
|
|
|
rq_stats = &channel_stats->rq;
|
|
|
|
|
|
|
|
s.rx_packets += rq_stats->packets;
|
|
|
|
s.rx_bytes += rq_stats->bytes;
|
|
|
|
|
|
|
|
for (j = 0; j < priv->max_opened_tc; j++) {
|
|
|
|
struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
|
|
|
|
|
|
|
|
s.tx_packets += sq_stats->packets;
|
|
|
|
s.tx_bytes += sq_stats->bytes;
|
|
|
|
s.tx_queue_dropped += sq_stats->dropped;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
memcpy(&priv->stats.sw, &s, sizeof(s));
|
|
|
|
}
|
|
|
|
|
|
|
|
void mlx5i_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
|
|
|
|
{
|
|
|
|
struct mlx5e_priv *priv = mlx5i_epriv(dev);
|
|
|
|
struct mlx5e_sw_stats *sstats = &priv->stats.sw;
|
|
|
|
|
|
|
|
mlx5i_grp_sw_update_stats(priv);
|
|
|
|
|
|
|
|
stats->rx_packets = sstats->rx_packets;
|
|
|
|
stats->rx_bytes = sstats->rx_bytes;
|
|
|
|
stats->tx_packets = sstats->tx_packets;
|
|
|
|
stats->tx_bytes = sstats->tx_bytes;
|
|
|
|
stats->tx_dropped = sstats->tx_queue_dropped;
|
|
|
|
}
|
|
|
|
|
2017-09-14 22:33:35 +09:00
|
|
|
int mlx5i_init_underlay_qp(struct mlx5e_priv *priv)
|
2017-09-12 20:11:29 +09:00
|
|
|
{
|
|
|
|
struct mlx5_core_dev *mdev = priv->mdev;
|
|
|
|
struct mlx5i_priv *ipriv = priv->ppriv;
|
|
|
|
struct mlx5_core_qp *qp = &ipriv->qp;
|
|
|
|
struct mlx5_qp_context *context;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* QP states */
|
|
|
|
context = kzalloc(sizeof(*context), GFP_KERNEL);
|
|
|
|
if (!context)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
context->flags = cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
|
|
|
|
context->pri_path.port = 1;
|
2017-09-13 18:17:50 +09:00
|
|
|
context->pri_path.pkey_index = cpu_to_be16(ipriv->pkey_index);
|
2017-09-12 20:11:29 +09:00
|
|
|
context->qkey = cpu_to_be32(IB_DEFAULT_Q_KEY);
|
|
|
|
|
|
|
|
ret = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RST2INIT_QP, 0, context, qp);
|
|
|
|
if (ret) {
|
|
|
|
mlx5_core_err(mdev, "Failed to modify qp RST2INIT, err: %d\n", ret);
|
|
|
|
goto err_qp_modify_to_err;
|
|
|
|
}
|
|
|
|
memset(context, 0, sizeof(*context));
|
|
|
|
|
|
|
|
ret = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_INIT2RTR_QP, 0, context, qp);
|
|
|
|
if (ret) {
|
|
|
|
mlx5_core_err(mdev, "Failed to modify qp INIT2RTR, err: %d\n", ret);
|
|
|
|
goto err_qp_modify_to_err;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RTR2RTS_QP, 0, context, qp);
|
|
|
|
if (ret) {
|
|
|
|
mlx5_core_err(mdev, "Failed to modify qp RTR2RTS, err: %d\n", ret);
|
|
|
|
goto err_qp_modify_to_err;
|
|
|
|
}
|
|
|
|
|
|
|
|
kfree(context);
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
err_qp_modify_to_err:
|
|
|
|
mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2ERR_QP, 0, &context, qp);
|
|
|
|
kfree(context);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-09-14 22:33:35 +09:00
|
|
|
/* Move the underlay QP back to the RESET state.
 * Runs on teardown, so a modify failure is only logged.
 */
void mlx5i_uninit_underlay_qp(struct mlx5e_priv *priv)
{
	struct mlx5i_priv *ipriv = priv->ppriv;
	struct mlx5_core_dev *mdev = priv->mdev;
	struct mlx5_qp_context context;
	int err;

	/* NOTE(review): context is passed uninitialized — presumably its
	 * contents are ignored for the 2RST transition; confirm against
	 * mlx5_core_qp_modify().
	 */
	err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2RST_QP, 0, &context,
				  &ipriv->qp);
	if (err)
		mlx5_core_err(mdev, "Failed to modify qp 2RST, err: %d\n", err);
}
|
|
|
|
|
2017-04-13 12:37:00 +09:00
|
|
|
#define MLX5_QP_ENHANCED_ULP_STATELESS_MODE 2
|
|
|
|
|
2017-09-14 20:08:39 +09:00
|
|
|
int mlx5i_create_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp)
|
2017-04-13 12:37:00 +09:00
|
|
|
{
|
|
|
|
u32 *in = NULL;
|
|
|
|
void *addr_path;
|
|
|
|
int ret = 0;
|
|
|
|
int inlen;
|
|
|
|
void *qpc;
|
|
|
|
|
|
|
|
inlen = MLX5_ST_SZ_BYTES(create_qp_in);
|
2017-05-11 03:32:18 +09:00
|
|
|
in = kvzalloc(inlen, GFP_KERNEL);
|
2017-04-13 12:37:00 +09:00
|
|
|
if (!in)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
|
|
|
|
MLX5_SET(qpc, qpc, st, MLX5_QP_ST_UD);
|
|
|
|
MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
|
|
|
|
MLX5_SET(qpc, qpc, ulp_stateless_offload_mode,
|
|
|
|
MLX5_QP_ENHANCED_ULP_STATELESS_MODE);
|
|
|
|
|
|
|
|
addr_path = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
|
{net, IB}/mlx5: Manage port association for multiport RoCE
When mlx5_ib_add is called determine if the mlx5 core device being
added is capable of dual port RoCE operation. If it is, determine
whether it is a master device or a slave device using the
num_vhca_ports and affiliate_nic_vport_criteria capabilities.
If the device is a slave, attempt to find a master device to affiliate it
with. Devices that can be affiliated will share a system image guid. If
none are found place it on a list of unaffiliated ports. If a master is
found bind the port to it by configuring the port affiliation in the NIC
vport context.
Similarly when mlx5_ib_remove is called determine the port type. If it's
a slave port, unaffiliate it from the master device, otherwise just
remove it from the unaffiliated port list.
The IB device is registered as a multiport device, even if a 2nd port is
not available for affiliation. When the 2nd port is affiliated later the
GID cache must be refreshed in order to get the default GIDs for the 2nd
port in the cache. Export roce_rescan_device to provide a mechanism to
refresh the cache after a new port is bound.
In a multiport configuration all IB object (QP, MR, PD, etc) related
commands should flow through the master mlx5_core_dev, other commands
must be sent to the slave port mlx5_core_mdev, an interface is provide
to get the correct mdev for non IB object commands.
Signed-off-by: Daniel Jurgens <danielj@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-05 00:25:36 +09:00
|
|
|
MLX5_SET(ads, addr_path, vhca_port_num, 1);
|
2017-04-13 12:37:00 +09:00
|
|
|
MLX5_SET(ads, addr_path, grh, 1);
|
|
|
|
|
|
|
|
ret = mlx5_core_create_qp(mdev, qp, in, inlen);
|
|
|
|
if (ret) {
|
|
|
|
mlx5_core_err(mdev, "Failed creating IPoIB QP err : %d\n", ret);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
kvfree(in);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-09-14 20:08:39 +09:00
|
|
|
/* Destroy the underlay QP created by mlx5i_create_underlay_qp() */
void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp)
{
	mlx5_core_destroy_qp(mdev, qp);
}
|
|
|
|
|
2019-07-06 00:30:20 +09:00
|
|
|
/* Create a TIS bound to the given underlay QP number.
 * On success the new TIS number is returned through @tisn.
 */
int mlx5i_create_tis(struct mlx5_core_dev *mdev, u32 underlay_qpn, u32 *tisn)
{
	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
	void *tis_ctx = MLX5_ADDR_OF(create_tis_in, in, ctx);

	MLX5_SET(tisc, tis_ctx, underlay_qpn, underlay_qpn);

	return mlx5e_create_tis(mdev, in, tisn);
}
|
|
|
|
|
2017-04-13 12:36:55 +09:00
|
|
|
/* TX-side init: create the underlay QP, then the TIS that rides on it.
 * On TIS failure the QP is destroyed again (reverse-order unwind).
 */
static int mlx5i_init_tx(struct mlx5e_priv *priv)
{
	struct mlx5i_priv *ipriv = priv->ppriv;
	int err;

	err = mlx5i_create_underlay_qp(priv->mdev, &ipriv->qp);
	if (err) {
		mlx5_core_warn(priv->mdev, "create underlay QP failed, %d\n", err);
		return err;
	}

	/* IPoIB uses a single TIS (tc 0) on the underlay QP */
	err = mlx5i_create_tis(priv->mdev, ipriv->qp.qpn, &priv->tisn[0][0]);
	if (err) {
		mlx5_core_warn(priv->mdev, "create tis failed, %d\n", err);
		goto err_destroy_underlay_qp;
	}

	return 0;

err_destroy_underlay_qp:
	mlx5i_destroy_underlay_qp(priv->mdev, &ipriv->qp);
	return err;
}
|
|
|
|
|
2017-04-22 03:15:56 +09:00
|
|
|
/* TX-side teardown, in reverse creation order: TIS first, then the QP */
static void mlx5i_cleanup_tx(struct mlx5e_priv *priv)
{
	struct mlx5i_priv *ipriv = priv->ppriv;

	mlx5e_destroy_tis(priv->mdev, priv->tisn[0][0]);
	mlx5i_destroy_underlay_qp(priv->mdev, &ipriv->qp);
}
|
|
|
|
|
2017-04-13 12:36:57 +09:00
|
|
|
/* Build the RX flow-steering pipeline for IPoIB: aRFS tables (best
 * effort) and the TTC classification table (mandatory).
 *
 * aRFS creation failure is deliberately non-fatal — the NTUPLE feature
 * bit is cleared and setup continues.  TTC failure is fatal and unwinds
 * the aRFS tables.  Returns 0 on success or a negative errno.
 */
static int mlx5i_create_flow_steering(struct mlx5e_priv *priv)
{
	struct ttc_params ttc_params = {};
	int tt, err;

	priv->fs.ns = mlx5_get_flow_namespace(priv->mdev,
					       MLX5_FLOW_NAMESPACE_KERNEL);

	if (!priv->fs.ns)
		return -EINVAL;

	err = mlx5e_arfs_create_tables(priv);
	if (err) {
		netdev_err(priv->netdev, "Failed to create arfs tables, err=%d\n",
			   err);
		/* Best effort: drop NTUPLE support instead of failing */
		priv->netdev->hw_features &= ~NETIF_F_NTUPLE;
	}

	mlx5e_set_ttc_basic_params(priv, &ttc_params);
	mlx5e_set_ttc_ft_params(&ttc_params);
	/* Point each traffic type at its indirect TIR */
	for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++)
		ttc_params.indir_tirn[tt] = priv->indir_tir[tt].tirn;

	err = mlx5e_create_ttc_table(priv, &ttc_params, &priv->fs.ttc);
	if (err) {
		netdev_err(priv->netdev, "Failed to create ttc table, err=%d\n",
			   err);
		goto err_destroy_arfs_tables;
	}

	return 0;

err_destroy_arfs_tables:
	mlx5e_arfs_destroy_tables(priv);

	return err;
}
|
|
|
|
|
|
|
|
/* Tear down the RX steering tables in reverse creation order */
static void mlx5i_destroy_flow_steering(struct mlx5e_priv *priv)
{
	mlx5e_destroy_ttc_table(priv, &priv->fs.ttc);
	mlx5e_arfs_destroy_tables(priv);
}
|
|
|
|
|
2017-04-13 12:36:55 +09:00
|
|
|
static int mlx5i_init_rx(struct mlx5e_priv *priv)
|
|
|
|
{
|
2018-08-05 12:58:05 +09:00
|
|
|
struct mlx5_core_dev *mdev = priv->mdev;
|
2017-04-13 12:36:56 +09:00
|
|
|
int err;
|
|
|
|
|
2018-08-05 12:58:05 +09:00
|
|
|
mlx5e_create_q_counters(priv);
|
|
|
|
|
|
|
|
err = mlx5e_open_drop_rq(priv, &priv->drop_rq);
|
|
|
|
if (err) {
|
|
|
|
mlx5_core_err(mdev, "open drop rq failed, %d\n", err);
|
|
|
|
goto err_destroy_q_counters;
|
|
|
|
}
|
|
|
|
|
2017-04-13 12:36:56 +09:00
|
|
|
err = mlx5e_create_indirect_rqt(priv);
|
|
|
|
if (err)
|
2018-08-05 12:58:05 +09:00
|
|
|
goto err_close_drop_rq;
|
2017-04-13 12:36:56 +09:00
|
|
|
|
net/mlx5e: Add XSK zero-copy support
This commit adds support for AF_XDP zero-copy RX and TX.
We create a dedicated XSK RQ inside the channel, it means that two
RQs are running simultaneously: one for non-XSK traffic and the other
for XSK traffic. The regular and XSK RQs use a single ID namespace split
into two halves: the lower half is regular RQs, and the upper half is
XSK RQs. When any zero-copy AF_XDP socket is active, changing the number
of channels is not allowed, because it would break to mapping between
XSK RQ IDs and channels.
XSK requires different page allocation and release routines. Such
functions as mlx5e_{alloc,free}_rx_mpwqe and mlx5e_{get,put}_rx_frag are
generic enough to be used for both regular and XSK RQs, and they use the
mlx5e_page_{alloc,release} wrappers around the real allocation
functions. Function pointers are not used to avoid losing the
performance with retpolines. Wherever it's certain that the regular
(non-XSK) page release function should be used, it's called directly.
Only the stats that could be meaningful for XSK are exposed to the
userspace. Those that don't take part in the XSK flow are not
considered.
Note that we don't wait for WQEs on the XSK RQ (unlike the regular RQ),
because the newer xdpsock sample doesn't provide any Fill Ring entries
at the setup stage.
We create a dedicated XSK SQ in the channel. This separation has its
advantages:
1. When the UMEM is closed, the XSK SQ can also be closed and stop
receiving completions. If an existing SQ was used for XSK, it would
continue receiving completions for the packets of the closed socket. If
a new UMEM was opened at that point, it would start getting completions
that don't belong to it.
2. Calculating statistics separately.
When the userspace kicks the TX, the driver triggers a hardware
interrupt by posting a NOP to a dedicated XSK ICO (internal control
operations) SQ, in order to trigger NAPI on the right CPU core. This XSK
ICO SQ is protected by a spinlock, as the userspace application may kick
the TX from any core.
Store the pointers to the UMEMs in the net device private context,
independently from the kernel. This way the driver can distinguish
between the zero-copy and non-zero-copy UMEMs. The kernel function
xdp_get_umem_from_qid does not care about this difference, but the
driver is only interested in zero-copy UMEMs, particularly, on the
cleanup it determines whether to close the XSK RQ and SQ or not by
looking at the presence of the UMEM. Use state_lock to protect the
access to this area of UMEM pointers.
LRO isn't compatible with XDP, but there may be active UMEMs while
XDP is off. If this is the case, don't allow LRO to ensure XDP can
be reenabled at any time.
The validation of XSK parameters typically happens when XSK queues
open. However, when the interface is down or the XDP program isn't
set, it's still possible to have active AF_XDP sockets and even to
open new, but the XSK queues will be closed. To cover these cases,
perform the validation also in these flows:
1. A new UMEM is registered, but the XSK queues aren't going to be
created due to missing XDP program or interface being down.
2. MTU changes while there are UMEMs registered.
Having this early check prevents mlx5e_open_channels from failing
at a later stage, where recovery is impossible and the application
has no chance to handle the error, because it got the successful
return value for an MTU change or XSK open operation.
The performance testing was performed on a machine with the following
configuration:
- 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz
- Mellanox ConnectX-5 Ex with 100 Gbit/s link
The results with retpoline disabled, single stream:
txonly: 33.3 Mpps (21.5 Mpps with queue and app pinned to the same CPU)
rxdrop: 12.2 Mpps
l2fwd: 9.4 Mpps
The results with retpoline enabled, single stream:
txonly: 21.3 Mpps (14.1 Mpps with queue and app pinned to the same CPU)
rxdrop: 9.9 Mpps
l2fwd: 6.8 Mpps
Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-06-26 23:35:38 +09:00
|
|
|
err = mlx5e_create_direct_rqts(priv, priv->direct_tir);
|
2017-04-13 12:36:56 +09:00
|
|
|
if (err)
|
|
|
|
goto err_destroy_indirect_rqts;
|
|
|
|
|
2021-04-08 23:20:04 +09:00
|
|
|
err = mlx5e_create_indirect_tirs(priv, false);
|
2017-04-13 12:36:56 +09:00
|
|
|
if (err)
|
|
|
|
goto err_destroy_direct_rqts;
|
|
|
|
|
net/mlx5e: Add XSK zero-copy support
This commit adds support for AF_XDP zero-copy RX and TX.
We create a dedicated XSK RQ inside the channel, it means that two
RQs are running simultaneously: one for non-XSK traffic and the other
for XSK traffic. The regular and XSK RQs use a single ID namespace split
into two halves: the lower half is regular RQs, and the upper half is
XSK RQs. When any zero-copy AF_XDP socket is active, changing the number
of channels is not allowed, because it would break to mapping between
XSK RQ IDs and channels.
XSK requires different page allocation and release routines. Such
functions as mlx5e_{alloc,free}_rx_mpwqe and mlx5e_{get,put}_rx_frag are
generic enough to be used for both regular and XSK RQs, and they use the
mlx5e_page_{alloc,release} wrappers around the real allocation
functions. Function pointers are not used to avoid losing the
performance with retpolines. Wherever it's certain that the regular
(non-XSK) page release function should be used, it's called directly.
Only the stats that could be meaningful for XSK are exposed to the
userspace. Those that don't take part in the XSK flow are not
considered.
Note that we don't wait for WQEs on the XSK RQ (unlike the regular RQ),
because the newer xdpsock sample doesn't provide any Fill Ring entries
at the setup stage.
We create a dedicated XSK SQ in the channel. This separation has its
advantages:
1. When the UMEM is closed, the XSK SQ can also be closed and stop
receiving completions. If an existing SQ was used for XSK, it would
continue receiving completions for the packets of the closed socket. If
a new UMEM was opened at that point, it would start getting completions
that don't belong to it.
2. Calculating statistics separately.
When the userspace kicks the TX, the driver triggers a hardware
interrupt by posting a NOP to a dedicated XSK ICO (internal control
operations) SQ, in order to trigger NAPI on the right CPU core. This XSK
ICO SQ is protected by a spinlock, as the userspace application may kick
the TX from any core.
Store the pointers to the UMEMs in the net device private context,
independently from the kernel. This way the driver can distinguish
between the zero-copy and non-zero-copy UMEMs. The kernel function
xdp_get_umem_from_qid does not care about this difference, but the
driver is only interested in zero-copy UMEMs, particularly, on the
cleanup it determines whether to close the XSK RQ and SQ or not by
looking at the presence of the UMEM. Use state_lock to protect the
access to this area of UMEM pointers.
LRO isn't compatible with XDP, but there may be active UMEMs while
XDP is off. If this is the case, don't allow LRO to ensure XDP can
be reenabled at any time.
The validation of XSK parameters typically happens when XSK queues
open. However, when the interface is down or the XDP program isn't
set, it's still possible to have active AF_XDP sockets and even to
open new, but the XSK queues will be closed. To cover these cases,
perform the validation also in these flows:
1. A new UMEM is registered, but the XSK queues aren't going to be
created due to missing XDP program or interface being down.
2. MTU changes while there are UMEMs registered.
Having this early check prevents mlx5e_open_channels from failing
at a later stage, where recovery is impossible and the application
has no chance to handle the error, because it got the successful
return value for an MTU change or XSK open operation.
The performance testing was performed on a machine with the following
configuration:
- 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz
- Mellanox ConnectX-5 Ex with 100 Gbit/s link
The results with retpoline disabled, single stream:
txonly: 33.3 Mpps (21.5 Mpps with queue and app pinned to the same CPU)
rxdrop: 12.2 Mpps
l2fwd: 9.4 Mpps
The results with retpoline enabled, single stream:
txonly: 21.3 Mpps (14.1 Mpps with queue and app pinned to the same CPU)
rxdrop: 9.9 Mpps
l2fwd: 6.8 Mpps
Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-06-26 23:35:38 +09:00
|
|
|
err = mlx5e_create_direct_tirs(priv, priv->direct_tir);
|
2017-04-13 12:36:56 +09:00
|
|
|
if (err)
|
|
|
|
goto err_destroy_indirect_tirs;
|
|
|
|
|
2017-07-06 21:40:32 +09:00
|
|
|
err = mlx5i_create_flow_steering(priv);
|
|
|
|
if (err)
|
2017-09-13 17:37:02 +09:00
|
|
|
goto err_destroy_direct_tirs;
|
2017-07-06 21:40:32 +09:00
|
|
|
|
2017-04-13 12:36:55 +09:00
|
|
|
return 0;
|
2017-04-13 12:36:56 +09:00
|
|
|
|
2017-04-13 12:36:57 +09:00
|
|
|
err_destroy_direct_tirs:
|
net/mlx5e: Add XSK zero-copy support
This commit adds support for AF_XDP zero-copy RX and TX.
We create a dedicated XSK RQ inside the channel, it means that two
RQs are running simultaneously: one for non-XSK traffic and the other
for XSK traffic. The regular and XSK RQs use a single ID namespace split
into two halves: the lower half is regular RQs, and the upper half is
XSK RQs. When any zero-copy AF_XDP socket is active, changing the number
of channels is not allowed, because it would break to mapping between
XSK RQ IDs and channels.
XSK requires different page allocation and release routines. Such
functions as mlx5e_{alloc,free}_rx_mpwqe and mlx5e_{get,put}_rx_frag are
generic enough to be used for both regular and XSK RQs, and they use the
mlx5e_page_{alloc,release} wrappers around the real allocation
functions. Function pointers are not used to avoid losing the
performance with retpolines. Wherever it's certain that the regular
(non-XSK) page release function should be used, it's called directly.
Only the stats that could be meaningful for XSK are exposed to the
userspace. Those that don't take part in the XSK flow are not
considered.
Note that we don't wait for WQEs on the XSK RQ (unlike the regular RQ),
because the newer xdpsock sample doesn't provide any Fill Ring entries
at the setup stage.
We create a dedicated XSK SQ in the channel. This separation has its
advantages:
1. When the UMEM is closed, the XSK SQ can also be closed and stop
receiving completions. If an existing SQ was used for XSK, it would
continue receiving completions for the packets of the closed socket. If
a new UMEM was opened at that point, it would start getting completions
that don't belong to it.
2. Calculating statistics separately.
When the userspace kicks the TX, the driver triggers a hardware
interrupt by posting a NOP to a dedicated XSK ICO (internal control
operations) SQ, in order to trigger NAPI on the right CPU core. This XSK
ICO SQ is protected by a spinlock, as the userspace application may kick
the TX from any core.
Store the pointers to the UMEMs in the net device private context,
independently from the kernel. This way the driver can distinguish
between the zero-copy and non-zero-copy UMEMs. The kernel function
xdp_get_umem_from_qid does not care about this difference, but the
driver is only interested in zero-copy UMEMs, particularly, on the
cleanup it determines whether to close the XSK RQ and SQ or not by
looking at the presence of the UMEM. Use state_lock to protect the
access to this area of UMEM pointers.
LRO isn't compatible with XDP, but there may be active UMEMs while
XDP is off. If this is the case, don't allow LRO to ensure XDP can
be reenabled at any time.
The validation of XSK parameters typically happens when XSK queues
open. However, when the interface is down or the XDP program isn't
set, it's still possible to have active AF_XDP sockets and even to
open new, but the XSK queues will be closed. To cover these cases,
perform the validation also in these flows:
1. A new UMEM is registered, but the XSK queues aren't going to be
created due to missing XDP program or interface being down.
2. MTU changes while there are UMEMs registered.
Having this early check prevents mlx5e_open_channels from failing
at a later stage, where recovery is impossible and the application
has no chance to handle the error, because it got the successful
return value for an MTU change or XSK open operation.
The performance testing was performed on a machine with the following
configuration:
- 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz
- Mellanox ConnectX-5 Ex with 100 Gbit/s link
The results with retpoline disabled, single stream:
txonly: 33.3 Mpps (21.5 Mpps with queue and app pinned to the same CPU)
rxdrop: 12.2 Mpps
l2fwd: 9.4 Mpps
The results with retpoline enabled, single stream:
txonly: 21.3 Mpps (14.1 Mpps with queue and app pinned to the same CPU)
rxdrop: 9.9 Mpps
l2fwd: 6.8 Mpps
Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-06-26 23:35:38 +09:00
|
|
|
mlx5e_destroy_direct_tirs(priv, priv->direct_tir);
|
2017-04-13 12:36:56 +09:00
|
|
|
err_destroy_indirect_tirs:
|
2020-04-30 15:16:01 +09:00
|
|
|
mlx5e_destroy_indirect_tirs(priv);
|
2017-04-13 12:36:56 +09:00
|
|
|
err_destroy_direct_rqts:
|
net/mlx5e: Add XSK zero-copy support
This commit adds support for AF_XDP zero-copy RX and TX.
We create a dedicated XSK RQ inside the channel, it means that two
RQs are running simultaneously: one for non-XSK traffic and the other
for XSK traffic. The regular and XSK RQs use a single ID namespace split
into two halves: the lower half is regular RQs, and the upper half is
XSK RQs. When any zero-copy AF_XDP socket is active, changing the number
of channels is not allowed, because it would break the mapping between
XSK RQ IDs and channels.
XSK requires different page allocation and release routines. Such
functions as mlx5e_{alloc,free}_rx_mpwqe and mlx5e_{get,put}_rx_frag are
generic enough to be used for both regular and XSK RQs, and they use the
mlx5e_page_{alloc,release} wrappers around the real allocation
functions. Function pointers are not used to avoid losing the
performance with retpolines. Wherever it's certain that the regular
(non-XSK) page release function should be used, it's called directly.
Only the stats that could be meaningful for XSK are exposed to the
userspace. Those that don't take part in the XSK flow are not
considered.
Note that we don't wait for WQEs on the XSK RQ (unlike the regular RQ),
because the newer xdpsock sample doesn't provide any Fill Ring entries
at the setup stage.
We create a dedicated XSK SQ in the channel. This separation has its
advantages:
1. When the UMEM is closed, the XSK SQ can also be closed and stop
receiving completions. If an existing SQ was used for XSK, it would
continue receiving completions for the packets of the closed socket. If
a new UMEM was opened at that point, it would start getting completions
that don't belong to it.
2. Calculating statistics separately.
When the userspace kicks the TX, the driver triggers a hardware
interrupt by posting a NOP to a dedicated XSK ICO (internal control
operations) SQ, in order to trigger NAPI on the right CPU core. This XSK
ICO SQ is protected by a spinlock, as the userspace application may kick
the TX from any core.
Store the pointers to the UMEMs in the net device private context,
independently from the kernel. This way the driver can distinguish
between the zero-copy and non-zero-copy UMEMs. The kernel function
xdp_get_umem_from_qid does not care about this difference, but the
driver is only interested in zero-copy UMEMs, particularly, on the
cleanup it determines whether to close the XSK RQ and SQ or not by
looking at the presence of the UMEM. Use state_lock to protect the
access to this area of UMEM pointers.
LRO isn't compatible with XDP, but there may be active UMEMs while
XDP is off. If this is the case, don't allow LRO to ensure XDP can
be reenabled at any time.
The validation of XSK parameters typically happens when XSK queues
open. However, when the interface is down or the XDP program isn't
set, it's still possible to have active AF_XDP sockets and even to
open new, but the XSK queues will be closed. To cover these cases,
perform the validation also in these flows:
1. A new UMEM is registered, but the XSK queues aren't going to be
created due to missing XDP program or interface being down.
2. MTU changes while there are UMEMs registered.
Having this early check prevents mlx5e_open_channels from failing
at a later stage, where recovery is impossible and the application
has no chance to handle the error, because it got the successful
return value for an MTU change or XSK open operation.
The performance testing was performed on a machine with the following
configuration:
- 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz
- Mellanox ConnectX-5 Ex with 100 Gbit/s link
The results with retpoline disabled, single stream:
txonly: 33.3 Mpps (21.5 Mpps with queue and app pinned to the same CPU)
rxdrop: 12.2 Mpps
l2fwd: 9.4 Mpps
The results with retpoline enabled, single stream:
txonly: 21.3 Mpps (14.1 Mpps with queue and app pinned to the same CPU)
rxdrop: 9.9 Mpps
l2fwd: 6.8 Mpps
Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-06-26 23:35:38 +09:00
|
|
|
mlx5e_destroy_direct_rqts(priv, priv->direct_tir);
|
2017-04-13 12:36:56 +09:00
|
|
|
err_destroy_indirect_rqts:
|
|
|
|
mlx5e_destroy_rqt(priv, &priv->indir_rqt);
|
2018-08-05 12:58:05 +09:00
|
|
|
err_close_drop_rq:
|
|
|
|
mlx5e_close_drop_rq(&priv->drop_rq);
|
|
|
|
err_destroy_q_counters:
|
|
|
|
mlx5e_destroy_q_counters(priv);
|
2017-04-13 12:36:56 +09:00
|
|
|
return err;
|
2017-04-13 12:36:55 +09:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Tear down the RX side built by mlx5i_init_rx, in strict reverse order of
 * creation: flow steering first (it references the TIRs), then TIRs, then the
 * RQTs they point at, and finally the drop RQ and queue counters.
 */
static void mlx5i_cleanup_rx(struct mlx5e_priv *priv)
{
	mlx5i_destroy_flow_steering(priv);
	mlx5e_destroy_direct_tirs(priv, priv->direct_tir);
	mlx5e_destroy_indirect_tirs(priv);
	mlx5e_destroy_direct_rqts(priv, priv->direct_tir);
	mlx5e_destroy_rqt(priv, &priv->indir_rqt);
	mlx5e_close_drop_rq(&priv->drop_rq);
	mlx5e_destroy_q_counters(priv);
}
|
|
|
|
|
|
|
|
/* mlx5e profile for the IPoIB parent (non-pkey) interface. Callbacks left
 * NULL are features the Ethernet core supports but IPoIB does not use; the
 * trailing comments name the would-be handler or the reason for NULL.
 */
static const struct mlx5e_profile mlx5i_nic_profile = {
	.init		   = mlx5i_init,
	.cleanup	   = mlx5i_cleanup,
	.init_tx	   = mlx5i_init_tx,
	.cleanup_tx	   = mlx5i_cleanup_tx,
	.init_rx	   = mlx5i_init_rx,
	.cleanup_rx	   = mlx5i_cleanup_rx,
	.enable		   = NULL, /* mlx5i_enable */
	.disable	   = NULL, /* mlx5i_disable */
	.update_rx	   = mlx5e_update_nic_rx,
	.update_stats	   = NULL, /* mlx5i_update_stats */
	.update_carrier	   = NULL, /* no HW update in IB link */
	.rx_handlers.handle_rx_cqe       = mlx5i_handle_rx_cqe,
	.rx_handlers.handle_rx_cqe_mpwqe = NULL, /* Not supported */
	.max_tc		   = MLX5I_MAX_NUM_TC,
	.rq_groups	   = MLX5E_NUM_RQ_GROUPS(REGULAR),
};
|
|
|
|
|
2017-04-13 12:36:59 +09:00
|
|
|
/* mlx5i netdev NDOs (net_device_ops) */
|
|
|
|
|
2017-05-21 14:56:20 +09:00
|
|
|
/* ndo_change_mtu for the IPoIB netdev.
 *
 * If the device is not OPENED, only the software parameters and netdev->mtu
 * are updated — the new MTU takes effect when the channels are next opened.
 * If it is OPENED, the channels are atomically switched to a copy of the
 * current params carrying the new MTU; netdev->mtu is updated only when the
 * switch succeeds, so a failure leaves the old configuration fully intact.
 */
static int mlx5i_change_mtu(struct net_device *netdev, int new_mtu)
{
	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
	struct mlx5e_channels new_channels = {};
	struct mlx5e_params *params;
	int err = 0;

	/* state_lock serializes against open/close and other reconfiguration */
	mutex_lock(&priv->state_lock);

	params = &priv->channels.params;

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
		/* Closed: record the MTU for the next open, nothing to switch */
		params->sw_mtu = new_mtu;
		netdev->mtu = params->sw_mtu;
		goto out;
	}

	new_channels.params = *params;
	new_channels.params.sw_mtu = new_mtu;

	err = mlx5e_safe_switch_channels(priv, &new_channels, NULL);
	if (err)
		goto out;

	netdev->mtu = new_channels.params.sw_mtu;

out:
	mutex_unlock(&priv->state_lock);
	return err;
}
|
|
|
|
|
2017-09-14 22:33:35 +09:00
|
|
|
int mlx5i_dev_init(struct net_device *dev)
|
2017-04-13 12:36:59 +09:00
|
|
|
{
|
|
|
|
struct mlx5e_priv *priv = mlx5i_epriv(dev);
|
|
|
|
struct mlx5i_priv *ipriv = priv->ppriv;
|
|
|
|
|
|
|
|
/* Set dev address using underlay QP */
|
|
|
|
dev->dev_addr[1] = (ipriv->qp.qpn >> 16) & 0xff;
|
|
|
|
dev->dev_addr[2] = (ipriv->qp.qpn >> 8) & 0xff;
|
|
|
|
dev->dev_addr[3] = (ipriv->qp.qpn) & 0xff;
|
|
|
|
|
2017-09-14 16:27:25 +09:00
|
|
|
/* Add QPN to net-device mapping to HT */
|
|
|
|
mlx5i_pkey_add_qpn(dev ,ipriv->qp.qpn);
|
|
|
|
|
2017-04-13 12:36:59 +09:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-10-31 21:24:19 +09:00
|
|
|
int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
|
2017-06-01 20:56:17 +09:00
|
|
|
{
|
|
|
|
struct mlx5e_priv *priv = mlx5i_epriv(dev);
|
|
|
|
|
|
|
|
switch (cmd) {
|
|
|
|
case SIOCSHWTSTAMP:
|
|
|
|
return mlx5e_hwstamp_set(priv, ifr);
|
|
|
|
case SIOCGHWTSTAMP:
|
|
|
|
return mlx5e_hwstamp_get(priv, ifr);
|
|
|
|
default:
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-09-14 22:33:35 +09:00
|
|
|
/* ndo_uninit for the IPoIB netdev: counterpart of mlx5i_dev_init.
 * Resets the underlay QP and removes this device's QPN -> netdev entry
 * from the pkey hash table.
 */
void mlx5i_dev_cleanup(struct net_device *dev)
{
	struct mlx5e_priv    *priv   = mlx5i_epriv(dev);
	struct mlx5i_priv    *ipriv  = priv->ppriv;

	mlx5i_uninit_underlay_qp(priv);

	/* Delete QPN to net-device mapping from HT */
	mlx5i_pkey_del_qpn(dev, ipriv->qp.qpn);
}
|
|
|
|
|
|
|
|
/* ndo_open for the IPoIB netdev.
 *
 * Under state_lock: mark OPENED, bring the underlay QP to a usable state,
 * attach its QPN to the RX flow-steering tables, open and activate the
 * channels, then refresh RX via the profile's update_rx callback.
 *
 * Error unwinding is the exact reverse of the setup steps; each label
 * undoes everything completed before the failing step, ending with the
 * OPENED bit being cleared.
 */
static int mlx5i_open(struct net_device *netdev)
{
	struct mlx5e_priv *epriv = mlx5i_epriv(netdev);
	struct mlx5i_priv *ipriv = epriv->ppriv;
	struct mlx5_core_dev *mdev = epriv->mdev;
	int err;

	mutex_lock(&epriv->state_lock);

	/* Set before the HW setup so concurrent readers see the transition */
	set_bit(MLX5E_STATE_OPENED, &epriv->state);

	err = mlx5i_init_underlay_qp(epriv);
	if (err) {
		mlx5_core_warn(mdev, "prepare underlay qp state failed, %d\n", err);
		goto err_clear_state_opened_flag;
	}

	err = mlx5_fs_add_rx_underlay_qpn(mdev, ipriv->qp.qpn);
	if (err) {
		mlx5_core_warn(mdev, "attach underlay qp to ft failed, %d\n", err);
		goto err_reset_qp;
	}

	err = mlx5e_open_channels(epriv, &epriv->channels);
	if (err)
		goto err_remove_fs_underlay_qp;

	epriv->profile->update_rx(epriv);
	mlx5e_activate_priv_channels(epriv);

	mutex_unlock(&epriv->state_lock);
	return 0;

err_remove_fs_underlay_qp:
	mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn);
err_reset_qp:
	mlx5i_uninit_underlay_qp(epriv);
err_clear_state_opened_flag:
	clear_bit(MLX5E_STATE_OPENED, &epriv->state);
	mutex_unlock(&epriv->state_lock);
	return err;
}
|
|
|
|
|
|
|
|
/* ndo_stop for the IPoIB netdev: reverse of mlx5i_open.
 *
 * Tolerates being called while already CLOSED (see comment below), and
 * tears down in the order: carrier off, detach QPN from flow steering,
 * deactivate and close channels, reset the underlay QP. Always returns 0.
 */
static int mlx5i_close(struct net_device *netdev)
{
	struct mlx5e_priv *epriv = mlx5i_epriv(netdev);
	struct mlx5i_priv *ipriv = epriv->ppriv;
	struct mlx5_core_dev *mdev = epriv->mdev;

	/* May already be CLOSED in case a previous configuration operation
	 * (e.g. RX/TX queue size change) that involves close&open failed.
	 */
	mutex_lock(&epriv->state_lock);

	if (!test_bit(MLX5E_STATE_OPENED, &epriv->state))
		goto unlock;

	clear_bit(MLX5E_STATE_OPENED, &epriv->state);

	netif_carrier_off(epriv->netdev);
	/* Detach the QPN before the channels stop so no RX lands mid-teardown */
	mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn);
	mlx5e_deactivate_priv_channels(epriv);
	mlx5e_close_channels(&epriv->channels);
	mlx5i_uninit_underlay_qp(epriv);
unlock:
	mutex_unlock(&epriv->state_lock);
	return 0;
}
|
|
|
|
|
2017-04-13 12:36:55 +09:00
|
|
|
/* IPoIB RDMA netdev callbacks */
|
2017-04-22 03:15:56 +09:00
|
|
|
/* rdma_netdev attach_mcast callback: attach the underlay QP to the
 * multicast group identified by @gid. If @set_qkey is set, also record
 * @qkey in the private context for use on subsequent multicast sends.
 *
 * Note: the qkey is stored even when mlx5_core_attach_mcg() failed, and
 * the attach error is still returned — matching the original flow.
 * @lid and @hca are unused here; the GID alone identifies the group.
 */
static int mlx5i_attach_mcast(struct net_device *netdev, struct ib_device *hca,
			      union ib_gid *gid, u16 lid, int set_qkey,
			      u32 qkey)
{
	struct mlx5e_priv    *epriv = mlx5i_epriv(netdev);
	struct mlx5_core_dev *mdev  = epriv->mdev;
	struct mlx5i_priv    *ipriv = epriv->ppriv;
	int err;

	mlx5_core_dbg(mdev, "attaching QPN 0x%x, MGID %pI6\n", ipriv->qp.qpn, gid->raw);
	err = mlx5_core_attach_mcg(mdev, gid, ipriv->qp.qpn);
	if (err)
		mlx5_core_warn(mdev, "failed attaching QPN 0x%x, MGID %pI6\n",
			       ipriv->qp.qpn, gid->raw);

	if (set_qkey) {
		mlx5_core_dbg(mdev, "%s setting qkey 0x%x\n",
			      netdev->name, qkey);
		ipriv->qkey = qkey;
	}

	return err;
}
|
|
|
|
|
2017-04-22 03:15:56 +09:00
|
|
|
/* rdma_netdev detach_mcast callback: detach the underlay QP from the
 * multicast group identified by @gid. @lid and @hca are unused.
 *
 * Failure is logged at dbg level only (unlike attach's warn) — per the
 * 2018-03-06 change this demotion is deliberate, since detach failures
 * are expected in some teardown flows.
 */
static int mlx5i_detach_mcast(struct net_device *netdev, struct ib_device *hca,
			      union ib_gid *gid, u16 lid)
{
	struct mlx5e_priv    *epriv = mlx5i_epriv(netdev);
	struct mlx5_core_dev *mdev  = epriv->mdev;
	struct mlx5i_priv    *ipriv = epriv->ppriv;
	int err;

	mlx5_core_dbg(mdev, "detaching QPN 0x%x, MGID %pI6\n", ipriv->qp.qpn, gid->raw);

	err = mlx5_core_detach_mcg(mdev, gid, ipriv->qp.qpn);
	if (err)
		mlx5_core_dbg(mdev, "failed detaching QPN 0x%x, MGID %pI6\n",
			      ipriv->qp.qpn, gid->raw);

	return err;
}
|
2017-04-13 12:36:55 +09:00
|
|
|
|
2017-04-22 03:15:56 +09:00
|
|
|
/* rdma_netdev send callback: transmit @skb to destination QPN @dqpn using
 * the address-handle's AV. The SQ is selected from the skb's queue mapping,
 * and the qkey recorded by mlx5i_attach_mcast() is used for the send.
 */
static int mlx5i_xmit(struct net_device *dev, struct sk_buff *skb,
		      struct ib_ah *address, u32 dqpn)
{
	struct mlx5e_priv *epriv = mlx5i_epriv(dev);
	struct mlx5e_txqsq *sq   = epriv->txq2sq[skb_get_queue_mapping(skb)];
	struct mlx5_ib_ah *mah   = to_mah(address);
	struct mlx5i_priv *ipriv = epriv->ppriv;

	return mlx5i_sq_xmit(sq, skb, &mah->av, dqpn, ipriv->qkey, netdev_xmit_more());
}
|
|
|
|
|
2017-09-13 18:17:50 +09:00
|
|
|
static void mlx5i_set_pkey_index(struct net_device *netdev, int id)
|
|
|
|
{
|
|
|
|
struct mlx5i_priv *ipriv = netdev_priv(netdev);
|
|
|
|
|
|
|
|
ipriv->pkey_index = (u16)id;
|
|
|
|
}
|
|
|
|
|
2017-04-13 12:36:55 +09:00
|
|
|
static int mlx5i_check_required_hca_cap(struct mlx5_core_dev *mdev)
|
|
|
|
{
|
|
|
|
if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_IB)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
if (!MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads)) {
|
|
|
|
mlx5_core_warn(mdev, "IPoIB enhanced offloads are not supported\n");
|
2017-04-27 23:01:34 +09:00
|
|
|
return -EOPNOTSUPP;
|
2017-04-13 12:36:55 +09:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-07-29 17:34:56 +09:00
|
|
|
/* priv_destructor for the rdma netdev: detach from the mlx5e core and run
 * the profile cleanup. Parent (non-sub) interfaces additionally own the
 * QPN hash table and the per-mdev resources, so only they free those.
 */
static void mlx5_rdma_netdev_free(struct net_device *netdev)
{
	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
	struct mlx5i_priv *ipriv = priv->ppriv;
	const struct mlx5e_profile *profile = priv->profile;

	mlx5e_detach_netdev(priv);
	profile->cleanup(priv);

	if (!ipriv->sub_interface) {
		mlx5i_pkey_qpn_ht_cleanup(netdev);
		mlx5e_destroy_mdev_resources(priv->mdev);
	}
}
|
|
|
|
|
2018-08-14 20:08:51 +09:00
|
|
|
static bool mlx5_is_sub_interface(struct mlx5_core_dev *mdev)
|
2017-04-13 12:36:55 +09:00
|
|
|
{
|
2018-08-14 20:08:51 +09:00
|
|
|
return mdev->mlx5e_res.pdn != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct mlx5e_profile *mlx5_get_profile(struct mlx5_core_dev *mdev)
|
|
|
|
{
|
|
|
|
if (mlx5_is_sub_interface(mdev))
|
|
|
|
return mlx5i_pkey_get_profile();
|
|
|
|
return &mlx5i_nic_profile;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* initialize_rdma_netdev callback handed to the RDMA core via
 * mlx5_rdma_rn_get_params().
 *
 * Parent interfaces (first netdev on the mdev) additionally create the
 * QPN hash table and the shared per-mdev resources; sub-interfaces skip
 * both. After the profile init and mlx5e attach, the rdma_netdev function
 * pointers are wired up and the destructor is installed.
 *
 * The error path is asymmetric on purpose: a sub-interface only needs the
 * profile cleanup, while a parent must also destroy the mdev resources
 * and the hash table it created above.
 */
static int mlx5_rdma_setup_rn(struct ib_device *ibdev, u8 port_num,
			      struct net_device *netdev, void *param)
{
	struct mlx5_core_dev *mdev = (struct mlx5_core_dev *)param;
	const struct mlx5e_profile *prof = mlx5_get_profile(mdev);
	struct mlx5i_priv *ipriv;
	struct mlx5e_priv *epriv;
	struct rdma_netdev *rn;
	int err;

	ipriv = netdev_priv(netdev);
	epriv = mlx5i_epriv(netdev);

	ipriv->sub_interface = mlx5_is_sub_interface(mdev);
	if (!ipriv->sub_interface) {
		err = mlx5i_pkey_qpn_ht_init(netdev);
		if (err) {
			mlx5_core_warn(mdev, "allocate qpn_to_netdev ht failed\n");
			return err;
		}

		/* This should only be called once per mdev */
		err = mlx5e_create_mdev_resources(mdev);
		if (err)
			goto destroy_ht;
	}

	prof->init(mdev, netdev, prof, ipriv);

	err = mlx5e_attach_netdev(epriv);
	if (err)
		goto detach;
	netif_carrier_off(netdev);

	/* set rdma_netdev func pointers */
	rn = &ipriv->rn;
	rn->hca = ibdev;
	rn->send = mlx5i_xmit;
	rn->attach_mcast = mlx5i_attach_mcast;
	rn->detach_mcast = mlx5i_detach_mcast;
	rn->set_id = mlx5i_set_pkey_index;

	netdev->priv_destructor = mlx5_rdma_netdev_free;
	netdev->needs_free_netdev = 1;

	return 0;

detach:
	prof->cleanup(epriv);
	if (ipriv->sub_interface)
		return err;
	mlx5e_destroy_mdev_resources(mdev);
destroy_ht:
	mlx5i_pkey_qpn_ht_cleanup(netdev);
	return err;
}
|
|
|
|
|
|
|
|
/* Entry point used by the mlx5_ib RDMA driver to obtain the allocation
 * parameters for an IPoIB rdma_netdev on @mdev.
 *
 * Validates the required HCA capabilities first, then fills @params with
 * the private-data size (mlx5i + mlx5e priv back to back), the TX/RX queue
 * counts derived from the maximum channel count, and the setup callback
 * (mlx5_rdma_setup_rn) that the RDMA core will invoke on the new netdev.
 * Returns 0 on success or the capability-check error.
 */
int mlx5_rdma_rn_get_params(struct mlx5_core_dev *mdev,
			    struct ib_device *device,
			    struct rdma_netdev_alloc_params *params)
{
	int nch;
	int rc;

	rc = mlx5i_check_required_hca_cap(mdev);
	if (rc)
		return rc;

	nch = mlx5e_get_max_num_channels(mdev);

	*params = (struct rdma_netdev_alloc_params){
		.sizeof_priv = sizeof(struct mlx5i_priv) +
			       sizeof(struct mlx5e_priv),
		.txqs = nch * MLX5E_MAX_NUM_TC,
		.rxqs = nch,
		.param = mdev,
		.initialize_rdma_netdev = mlx5_rdma_setup_rn,
	};

	return 0;
}
EXPORT_SYMBOL(mlx5_rdma_rn_get_params);