Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says:

====================
The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Add the ability to use unaligned chunks in the AF_XDP umem. By
   relaxing where the chunks can be placed, it allows using an
   arbitrary buffer size and placing a chunk wherever there is a
   free address in the umem. This helps to integrate the DPDK
   AF_XDP driver more seamlessly. Support for i40e, ixgbe and
   mlx5e, from Kevin and Maxim.

2) Addition of a wakeup flag for AF_XDP tx and fill rings so the
   application can wake up the kernel for rx/tx processing, which
   avoids busy-spinning of the latter and is useful when app and
   driver are located on the same core. Support for i40e, ixgbe
   and mlx5e, from Magnus and Maxim.

3) bpftool fixes for printf()-like functions so the compiler can actually
   enforce checks, bpftool build system improvements for custom output
   directories, and addition of the 'bpftool map freeze' command, from Quentin.

4) Support attaching/detaching XDP programs via the 'bpftool net' command,
   from Daniel.

5) Automatic xskmap cleanup when AF_XDP socket is released, and several
   barrier/{read,write}_once fixes in AF_XDP code, from Björn.

6) Relicense of bpf_helpers.h/bpf_endian.h for future libbpf
   inclusion as well as libbpf versioning improvements, from Andrii.

7) Several new BPF kselftests for verifier precision tracking, from Alexei.

8) Several BPF kselftest fixes wrt endianness to run on s390x, from Ilya.

9) And more BPF kselftest improvements all over the place, from Stanislav.

10) Add simple BPF map op cache for nfp driver to batch dumps, from Jakub.

11) AF_XDP socket umem mapping improvements for 32-bit archs, from Ivan.

12) Add BPF-to-BPF call and BTF line info support for s390x JIT, from Yauheni.

13) Small optimization in arm64 JIT to spare one instruction for BPF_MOD, from Jerin.

14) Fix an error check in bpf_tcp_gen_syncookie() helper, from Petar.

15) Various minor fixes and cleanups, from Nathan, Masahiro, Masanari,
    Peter, Wei, Yue.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Committed by David S. Miller on 2019-09-06 16:49:17 +02:00
commit 1e46c09ec1
124 changed files with 3492 additions and 701 deletions

View File

@ -153,10 +153,12 @@ an example, if the UMEM is 64k and each chunk is 4k, then the UMEM has
Frames passed to the kernel are used for the ingress path (RX rings).
The user application produces UMEM addrs to this ring. Note that the
kernel will mask the incoming addr. E.g. for a chunk size of 2k, the
log2(2048) LSB of the addr will be masked off, meaning that 2048, 2050
and 3000 refers to the same chunk.
The user application produces UMEM addrs to this ring. Note that, if
running the application with aligned chunk mode, the kernel will mask
the incoming addr. E.g. for a chunk size of 2k, the log2(2048) LSB of
the addr will be masked off, meaning that 2048, 2050 and 3000 refers
to the same chunk. If the user application is run in the unaligned
chunks mode, then the incoming addr will be left untouched.
UMEM Completion Ring
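
As a minimal, standalone illustration of the aligned-chunk masking described
above (plain user-space C; the 2k chunk size is taken from the example in the
text, the unaligned mode would simply leave the address untouched):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Aligned chunk mode: the kernel masks off log2(chunk_size) LSBs,
	 * so 2048, 2050 and 3000 all resolve to the same 2k chunk.
	 */
	uint64_t chunk_size = 2048;
	uint64_t addrs[] = { 2048, 2050, 3000 };
	int i;

	for (i = 0; i < 3; i++)
		printf("addr %llu -> chunk %llu\n",
		       (unsigned long long)addrs[i],
		       (unsigned long long)(addrs[i] & ~(chunk_size - 1)));
	return 0;
}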

View File

@ -171,6 +171,9 @@
/* Rd = Ra + Rn * Rm */
#define A64_MADD(sf, Rd, Ra, Rn, Rm) aarch64_insn_gen_data3(Rd, Ra, Rn, Rm, \
A64_VARIANT(sf), AARCH64_INSN_DATA3_MADD)
/* Rd = Ra - Rn * Rm */
#define A64_MSUB(sf, Rd, Ra, Rn, Rm) aarch64_insn_gen_data3(Rd, Ra, Rn, Rm, \
A64_VARIANT(sf), AARCH64_INSN_DATA3_MSUB)
/* Rd = Rn * Rm */
#define A64_MUL(sf, Rd, Rn, Rm) A64_MADD(sf, Rd, A64_ZR, Rn, Rm)

View File

@ -409,8 +409,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
break;
case BPF_MOD:
emit(A64_UDIV(is64, tmp, dst, src), ctx);
emit(A64_MUL(is64, tmp, tmp, src), ctx);
emit(A64_SUB(is64, dst, dst, tmp), ctx);
emit(A64_MSUB(is64, dst, dst, tmp, src), ctx);
break;
}
break;
@ -516,8 +515,7 @@ emit_bswap_uxt:
case BPF_ALU64 | BPF_MOD | BPF_K:
emit_a64_mov_i(is64, tmp2, imm, ctx);
emit(A64_UDIV(is64, tmp, dst, tmp2), ctx);
emit(A64_MUL(is64, tmp, tmp, tmp2), ctx);
emit(A64_SUB(is64, dst, dst, tmp), ctx);
emit(A64_MSUB(is64, dst, dst, tmp, tmp2), ctx);
break;
case BPF_ALU | BPF_LSH | BPF_K:
case BPF_ALU64 | BPF_LSH | BPF_K:
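
The BPF_MOD lowering above relies on the identity dst % src == dst - (dst / src) * src,
and A64_MSUB (Rd = Ra - Rn * Rm, as defined in the header hunk above) folds the
multiply and subtract into a single instruction. A minimal C sketch of the
computation the JIT now emits:

#include <assert.h>
#include <stdint.h>

static uint64_t bpf_mod_lowering(uint64_t dst, uint64_t src)
{
	uint64_t tmp = dst / src;	/* A64_UDIV(is64, tmp, dst, src)      */
	return dst - tmp * src;		/* A64_MSUB(is64, dst, dst, tmp, src) */
}

int main(void)
{
	assert(bpf_mod_lowering(17, 5) == 17 % 5);
	assert(bpf_mod_lowering(2048, 2048) == 0);
	return 0;
}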

View File

@ -502,7 +502,8 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
* NOTE: Use noinline because for gcov (-fprofile-arcs) gcc allocates a lot of
* stack space for the large switch statement.
*/
static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i)
static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
int i, bool extra_pass)
{
struct bpf_insn *insn = &fp->insnsi[i];
int jmp_off, last, insn_count = 1;
@ -1011,10 +1012,14 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
*/
case BPF_JMP | BPF_CALL:
{
/*
* b0 = (__bpf_call_base + imm)(b1, b2, b3, b4, b5)
*/
const u64 func = (u64)__bpf_call_base + imm;
u64 func;
bool func_addr_fixed;
int ret;
ret = bpf_jit_get_func_addr(fp, insn, extra_pass,
&func, &func_addr_fixed);
if (ret < 0)
return -1;
REG_SET_SEEN(BPF_REG_5);
jit->seen |= SEEN_FUNC;
@ -1283,7 +1288,8 @@ branch_oc:
/*
* Compile eBPF program into s390x code
*/
static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp)
static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
bool extra_pass)
{
int i, insn_count;
@ -1292,7 +1298,7 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp)
bpf_jit_prologue(jit, fp->aux->stack_depth);
for (i = 0; i < fp->len; i += insn_count) {
insn_count = bpf_jit_insn(jit, fp, i);
insn_count = bpf_jit_insn(jit, fp, i, extra_pass);
if (insn_count < 0)
return -1;
/* Next instruction address */
@ -1311,6 +1317,12 @@ bool bpf_jit_needs_zext(void)
return true;
}
struct s390_jit_data {
struct bpf_binary_header *header;
struct bpf_jit ctx;
int pass;
};
/*
* Compile eBPF program "fp"
*/
@ -1318,7 +1330,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
{
struct bpf_prog *tmp, *orig_fp = fp;
struct bpf_binary_header *header;
struct s390_jit_data *jit_data;
bool tmp_blinded = false;
bool extra_pass = false;
struct bpf_jit jit;
int pass;
@ -1337,6 +1351,23 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
fp = tmp;
}
jit_data = fp->aux->jit_data;
if (!jit_data) {
jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
if (!jit_data) {
fp = orig_fp;
goto out;
}
fp->aux->jit_data = jit_data;
}
if (jit_data->ctx.addrs) {
jit = jit_data->ctx;
header = jit_data->header;
extra_pass = true;
pass = jit_data->pass + 1;
goto skip_init_ctx;
}
memset(&jit, 0, sizeof(jit));
jit.addrs = kcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL);
if (jit.addrs == NULL) {
@ -1349,7 +1380,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
* - 3: Calculate program size and addrs arrray
*/
for (pass = 1; pass <= 3; pass++) {
if (bpf_jit_prog(&jit, fp)) {
if (bpf_jit_prog(&jit, fp, extra_pass)) {
fp = orig_fp;
goto free_addrs;
}
@ -1361,12 +1392,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
fp = orig_fp;
goto free_addrs;
}
header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 2, jit_fill_hole);
if (!header) {
fp = orig_fp;
goto free_addrs;
}
if (bpf_jit_prog(&jit, fp)) {
skip_init_ctx:
if (bpf_jit_prog(&jit, fp, extra_pass)) {
bpf_jit_binary_free(header);
fp = orig_fp;
goto free_addrs;
@ -1375,12 +1408,24 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
bpf_jit_dump(fp->len, jit.size, pass, jit.prg_buf);
print_fn_code(jit.prg_buf, jit.size_prg);
}
bpf_jit_binary_lock_ro(header);
if (!fp->is_func || extra_pass) {
bpf_jit_binary_lock_ro(header);
} else {
jit_data->header = header;
jit_data->ctx = jit;
jit_data->pass = pass;
}
fp->bpf_func = (void *) jit.prg_buf;
fp->jited = 1;
fp->jited_len = jit.size;
if (!fp->is_func || extra_pass) {
bpf_prog_fill_jited_linfo(fp, jit.addrs + 1);
free_addrs:
kfree(jit.addrs);
kfree(jit.addrs);
kfree(jit_data);
fp->aux->jit_data = NULL;
}
out:
if (tmp_blinded)
bpf_jit_prog_release_other(fp, fp == orig_fp ?

View File

@ -12530,7 +12530,8 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi,
if (need_reset && prog)
for (i = 0; i < vsi->num_queue_pairs; i++)
if (vsi->xdp_rings[i]->xsk_umem)
(void)i40e_xsk_async_xmit(vsi->netdev, i);
(void)i40e_xsk_wakeup(vsi->netdev, i,
XDP_WAKEUP_RX);
return 0;
}
@ -12852,7 +12853,7 @@ static const struct net_device_ops i40e_netdev_ops = {
.ndo_bridge_setlink = i40e_ndo_bridge_setlink,
.ndo_bpf = i40e_xdp,
.ndo_xdp_xmit = i40e_xdp_xmit,
.ndo_xsk_async_xmit = i40e_xsk_async_xmit,
.ndo_xsk_wakeup = i40e_xsk_wakeup,
.ndo_dfwd_add_station = i40e_fwd_add,
.ndo_dfwd_del_station = i40e_fwd_del,
};

View File

@ -116,7 +116,7 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
return err;
/* Kick start the NAPI context so that receiving will start */
err = i40e_xsk_async_xmit(vsi->netdev, qid);
err = i40e_xsk_wakeup(vsi->netdev, qid, XDP_WAKEUP_RX);
if (err)
return err;
}
@ -190,7 +190,9 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
**/
static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
{
struct xdp_umem *umem = rx_ring->xsk_umem;
int err, result = I40E_XDP_PASS;
u64 offset = umem->headroom;
struct i40e_ring *xdp_ring;
struct bpf_prog *xdp_prog;
u32 act;
@ -201,7 +203,10 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
*/
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
act = bpf_prog_run_xdp(xdp_prog, xdp);
xdp->handle += xdp->data - xdp->data_hard_start;
offset += xdp->data - xdp->data_hard_start;
xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
switch (act) {
case XDP_PASS:
break;
@ -262,7 +267,7 @@ static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring,
bi->addr = xdp_umem_get_data(umem, handle);
bi->addr += hr;
bi->handle = handle + umem->headroom;
bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
xsk_umem_discard_addr(umem);
return true;
@ -299,7 +304,7 @@ static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring,
bi->addr = xdp_umem_get_data(umem, handle);
bi->addr += hr;
bi->handle = handle + umem->headroom;
bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
xsk_umem_discard_addr_rq(umem);
return true;
@ -420,8 +425,6 @@ static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
struct i40e_rx_buffer *old_bi)
{
struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc];
unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask;
u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
u16 nta = rx_ring->next_to_alloc;
/* update, and store next to alloc */
@ -429,14 +432,9 @@ static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
/* transfer page from old buffer to new buffer */
new_bi->dma = old_bi->dma & mask;
new_bi->dma += hr;
new_bi->addr = (void *)((unsigned long)old_bi->addr & mask);
new_bi->addr += hr;
new_bi->handle = old_bi->handle & mask;
new_bi->handle += rx_ring->xsk_umem->headroom;
new_bi->dma = old_bi->dma;
new_bi->addr = old_bi->addr;
new_bi->handle = old_bi->handle;
old_bi->addr = NULL;
}
@ -471,7 +469,8 @@ void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
bi->addr += hr;
bi->handle = (u64)handle + rx_ring->xsk_umem->headroom;
bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
rx_ring->xsk_umem->headroom);
}
/**
@ -626,6 +625,15 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
i40e_finalize_xdp_rx(rx_ring, xdp_xmit);
i40e_update_rx_stats(rx_ring, total_rx_bytes, total_rx_packets);
if (xsk_umem_uses_need_wakeup(rx_ring->xsk_umem)) {
if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
xsk_set_rx_need_wakeup(rx_ring->xsk_umem);
else
xsk_clear_rx_need_wakeup(rx_ring->xsk_umem);
return (int)total_rx_packets;
}
return failure ? budget : (int)total_rx_packets;
}
@ -681,6 +689,8 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
i40e_xdp_ring_update_tail(xdp_ring);
xsk_umem_consume_tx_done(xdp_ring->xsk_umem);
if (xsk_umem_uses_need_wakeup(xdp_ring->xsk_umem))
xsk_clear_tx_need_wakeup(xdp_ring->xsk_umem);
}
return !!budget && work_done;
@ -759,19 +769,27 @@ bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
i40e_update_tx_stats(tx_ring, completed_frames, total_bytes);
out_xmit:
if (xsk_umem_uses_need_wakeup(tx_ring->xsk_umem)) {
if (tx_ring->next_to_clean == tx_ring->next_to_use)
xsk_set_tx_need_wakeup(tx_ring->xsk_umem);
else
xsk_clear_tx_need_wakeup(tx_ring->xsk_umem);
}
xmit_done = i40e_xmit_zc(tx_ring, budget);
return work_done && xmit_done;
}
/**
* i40e_xsk_async_xmit - Implements the ndo_xsk_async_xmit
* i40e_xsk_wakeup - Implements the ndo_xsk_wakeup
* @dev: the netdevice
* @queue_id: queue id to wake up
* @flags: ignored in our case since we have Rx and Tx in the same NAPI.
*
* Returns <0 for errors, 0 otherwise.
**/
int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id)
int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
{
struct i40e_netdev_priv *np = netdev_priv(dev);
struct i40e_vsi *vsi = np->vsi;

View File

@ -18,6 +18,6 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget);
bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
struct i40e_ring *tx_ring, int napi_budget);
int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id);
int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags);
#endif /* _I40E_XSK_H_ */

View File

@ -10260,7 +10260,8 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
if (need_reset && prog)
for (i = 0; i < adapter->num_rx_queues; i++)
if (adapter->xdp_ring[i]->xsk_umem)
(void)ixgbe_xsk_async_xmit(adapter->netdev, i);
(void)ixgbe_xsk_wakeup(adapter->netdev, i,
XDP_WAKEUP_RX);
return 0;
}
@ -10379,7 +10380,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
.ndo_features_check = ixgbe_features_check,
.ndo_bpf = ixgbe_xdp,
.ndo_xdp_xmit = ixgbe_xdp_xmit,
.ndo_xsk_async_xmit = ixgbe_xsk_async_xmit,
.ndo_xsk_wakeup = ixgbe_xsk_wakeup,
};
static void ixgbe_disable_txr_hw(struct ixgbe_adapter *adapter,

View File

@ -42,7 +42,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
void ixgbe_xsk_clean_rx_ring(struct ixgbe_ring *rx_ring);
bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector,
struct ixgbe_ring *tx_ring, int napi_budget);
int ixgbe_xsk_async_xmit(struct net_device *dev, u32 queue_id);
int ixgbe_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags);
void ixgbe_xsk_clean_tx_ring(struct ixgbe_ring *tx_ring);
#endif /* #define _IXGBE_TXRX_COMMON_H_ */

View File

@ -100,7 +100,7 @@ static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter,
ixgbe_txrx_ring_enable(adapter, qid);
/* Kick start the NAPI context so that receiving will start */
err = ixgbe_xsk_async_xmit(adapter->netdev, qid);
err = ixgbe_xsk_wakeup(adapter->netdev, qid, XDP_WAKEUP_RX);
if (err)
return err;
}
@ -143,7 +143,9 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
struct ixgbe_ring *rx_ring,
struct xdp_buff *xdp)
{
struct xdp_umem *umem = rx_ring->xsk_umem;
int err, result = IXGBE_XDP_PASS;
u64 offset = umem->headroom;
struct bpf_prog *xdp_prog;
struct xdp_frame *xdpf;
u32 act;
@ -151,7 +153,10 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
rcu_read_lock();
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
act = bpf_prog_run_xdp(xdp_prog, xdp);
xdp->handle += xdp->data - xdp->data_hard_start;
offset += xdp->data - xdp->data_hard_start;
xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
switch (act) {
case XDP_PASS:
break;
@ -201,8 +206,6 @@ ixgbe_rx_buffer *ixgbe_get_rx_buffer_zc(struct ixgbe_ring *rx_ring,
static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring,
struct ixgbe_rx_buffer *obi)
{
unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask;
u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
u16 nta = rx_ring->next_to_alloc;
struct ixgbe_rx_buffer *nbi;
@ -212,14 +215,9 @@ static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring,
rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
/* transfer page from old buffer to new buffer */
nbi->dma = obi->dma & mask;
nbi->dma += hr;
nbi->addr = (void *)((unsigned long)obi->addr & mask);
nbi->addr += hr;
nbi->handle = obi->handle & mask;
nbi->handle += rx_ring->xsk_umem->headroom;
nbi->dma = obi->dma;
nbi->addr = obi->addr;
nbi->handle = obi->handle;
obi->addr = NULL;
obi->skb = NULL;
@ -250,7 +248,8 @@ void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
bi->addr += hr;
bi->handle = (u64)handle + rx_ring->xsk_umem->headroom;
bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
rx_ring->xsk_umem->headroom);
}
static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring,
@ -276,7 +275,7 @@ static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring,
bi->addr = xdp_umem_get_data(umem, handle);
bi->addr += hr;
bi->handle = handle + umem->headroom;
bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
xsk_umem_discard_addr(umem);
return true;
@ -303,7 +302,7 @@ static bool ixgbe_alloc_buffer_slow_zc(struct ixgbe_ring *rx_ring,
bi->addr = xdp_umem_get_data(umem, handle);
bi->addr += hr;
bi->handle = handle + umem->headroom;
bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
xsk_umem_discard_addr_rq(umem);
return true;
@ -547,6 +546,14 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
q_vector->rx.total_packets += total_rx_packets;
q_vector->rx.total_bytes += total_rx_bytes;
if (xsk_umem_uses_need_wakeup(rx_ring->xsk_umem)) {
if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
xsk_set_rx_need_wakeup(rx_ring->xsk_umem);
else
xsk_clear_rx_need_wakeup(rx_ring->xsk_umem);
return (int)total_rx_packets;
}
return failure ? budget : (int)total_rx_packets;
}
@ -615,6 +622,8 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
if (tx_desc) {
ixgbe_xdp_ring_update_tail(xdp_ring);
xsk_umem_consume_tx_done(xdp_ring->xsk_umem);
if (xsk_umem_uses_need_wakeup(xdp_ring->xsk_umem))
xsk_clear_tx_need_wakeup(xdp_ring->xsk_umem);
}
return !!budget && work_done;
@ -688,11 +697,19 @@ bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector,
if (xsk_frames)
xsk_umem_complete_tx(umem, xsk_frames);
if (xsk_umem_uses_need_wakeup(tx_ring->xsk_umem)) {
if (tx_ring->next_to_clean == tx_ring->next_to_use)
xsk_set_tx_need_wakeup(tx_ring->xsk_umem);
else
xsk_clear_tx_need_wakeup(tx_ring->xsk_umem);
}
xmit_done = ixgbe_xmit_zc(tx_ring, q_vector->tx.work_limit);
return budget > 0 && xmit_done;
}
int ixgbe_xsk_async_xmit(struct net_device *dev, u32 qid)
int ixgbe_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
{
struct ixgbe_adapter *adapter = netdev_priv(dev);
struct ixgbe_ring *ring;

View File

@ -25,18 +25,33 @@ u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
return headroom;
}
u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
u32 mlx5e_rx_get_min_frag_sz(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
u32 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
u16 linear_rq_headroom = mlx5e_get_linear_rq_headroom(params, xsk);
u32 frag_sz = linear_rq_headroom + hw_mtu;
return linear_rq_headroom + hw_mtu;
}
u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
u32 frag_sz = mlx5e_rx_get_min_frag_sz(params, xsk);
/* AF_XDP doesn't build SKBs in place. */
if (!xsk)
frag_sz = MLX5_SKB_FRAG_SZ(frag_sz);
/* XDP in mlx5e doesn't support multiple packets per page. */
/* XDP in mlx5e doesn't support multiple packets per page. AF_XDP is a
* special case. It can run with frames smaller than a page, as it
* doesn't allocate pages dynamically. However, here we pretend that
* fragments are page-sized: it allows to treat XSK frames like pages
* by redirecting alloc and free operations to XSK rings and by using
* the fact there are no multiple packets per "page" (which is a frame).
* The latter is important, because frames may come in a random order,
* and we will have trouble assemblying a real page of multiple frames.
*/
if (mlx5e_rx_is_xdp(params, xsk))
frag_sz = max_t(u32, frag_sz, PAGE_SIZE);

View File

@ -76,6 +76,8 @@ static inline bool mlx5e_qid_validate(const struct mlx5e_profile *profile,
u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
u32 mlx5e_rx_get_min_frag_sz(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params,

View File

@ -122,6 +122,7 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
void *va, u16 *rx_headroom, u32 *len, bool xsk)
{
struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
struct xdp_umem *umem = rq->umem;
struct xdp_buff xdp;
u32 act;
int err;
@ -138,8 +139,11 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
xdp.rxq = &rq->xdp_rxq;
act = bpf_prog_run_xdp(prog, &xdp);
if (xsk)
xdp.handle += xdp.data - xdp.data_hard_start;
if (xsk) {
u64 off = xdp.data - xdp.data_hard_start;
xdp.handle = xsk_umem_adjust_offset(umem, xdp.handle, off);
}
switch (act) {
case XDP_PASS:
*rx_headroom = xdp.data - xdp.data_hard_start;

View File

@ -24,7 +24,8 @@ int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
if (!xsk_umem_peek_addr_rq(umem, &handle))
return -ENOMEM;
dma_info->xsk.handle = handle + rq->buff.umem_headroom;
dma_info->xsk.handle = xsk_umem_adjust_offset(umem, handle,
rq->buff.umem_headroom);
dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle);
/* No need to add headroom to the DMA address. In striding RQ case, we
@ -104,7 +105,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
/* head_offset is not used in this function, because di->xsk.data and
* di->addr point directly to the necessary place. Furthermore, in the
* current implementation, one page = one packet = one frame, so
* current implementation, UMR pages are mapped to XSK frames, so
* head_offset should always be 0.
*/
WARN_ON_ONCE(head_offset);

View File

@ -5,6 +5,7 @@
#define __MLX5_EN_XSK_RX_H__
#include "en.h"
#include <net/xdp_sock.h>
/* RX data path */
@ -24,4 +25,17 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *wi,
u32 cqe_bcnt);
static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err)
{
if (!xsk_umem_uses_need_wakeup(rq->umem))
return alloc_err;
if (unlikely(alloc_err))
xsk_set_rx_need_wakeup(rq->umem);
else
xsk_clear_rx_need_wakeup(rq->umem);
return false;
}
#endif /* __MLX5_EN_XSK_RX_H__ */

View File

@ -4,18 +4,23 @@
#include "setup.h"
#include "en/params.h"
/* It matches XDP_UMEM_MIN_CHUNK_SIZE, but as this constant is private and may
* change unexpectedly, and mlx5e has a minimum valid stride size for striding
* RQ, keep this check in the driver.
*/
#define MLX5E_MIN_XSK_CHUNK_SIZE 2048
bool mlx5e_validate_xsk_param(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk,
struct mlx5_core_dev *mdev)
{
/* AF_XDP doesn't support frames larger than PAGE_SIZE, and the current
* mlx5e XDP implementation doesn't support multiple packets per page.
*/
if (xsk->chunk_size != PAGE_SIZE)
/* AF_XDP doesn't support frames larger than PAGE_SIZE. */
if (xsk->chunk_size > PAGE_SIZE ||
xsk->chunk_size < MLX5E_MIN_XSK_CHUNK_SIZE)
return false;
/* Current MTU and XSK headroom don't allow packets to fit the frames. */
if (mlx5e_rx_get_linear_frag_sz(params, xsk) > xsk->chunk_size)
if (mlx5e_rx_get_min_frag_sz(params, xsk) > xsk->chunk_size)
return false;
/* frag_sz is different for regular and XSK RQs, so ensure that linear

View File

@ -7,7 +7,7 @@
#include "en/params.h"
#include <net/xdp_sock.h>
int mlx5e_xsk_async_xmit(struct net_device *dev, u32 qid)
int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
{
struct mlx5e_priv *priv = netdev_priv(dev);
struct mlx5e_params *params = &priv->channels.params;

View File

@ -5,11 +5,23 @@
#define __MLX5_EN_XSK_TX_H__
#include "en.h"
#include <net/xdp_sock.h>
/* TX data path */
int mlx5e_xsk_async_xmit(struct net_device *dev, u32 qid);
int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget);
static inline void mlx5e_xsk_update_tx_wakeup(struct mlx5e_xdpsq *sq)
{
if (!xsk_umem_uses_need_wakeup(sq->umem))
return;
if (sq->pc != sq->cc)
xsk_clear_tx_need_wakeup(sq->umem);
else
xsk_set_tx_need_wakeup(sq->umem);
}
#endif /* __MLX5_EN_XSK_TX_H__ */

View File

@ -4580,7 +4580,7 @@ const struct net_device_ops mlx5e_netdev_ops = {
.ndo_tx_timeout = mlx5e_tx_timeout,
.ndo_bpf = mlx5e_xdp,
.ndo_xdp_xmit = mlx5e_xdp_xmit,
.ndo_xsk_async_xmit = mlx5e_xsk_async_xmit,
.ndo_xsk_wakeup = mlx5e_xsk_wakeup,
#ifdef CONFIG_MLX5_EN_ARFS
.ndo_rx_flow_steer = mlx5e_rx_flow_steer,
#endif

View File

@ -695,8 +695,11 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
rq->mpwqe.umr_in_progress += rq->mpwqe.umr_last_bulk;
rq->mpwqe.actual_wq_head = head;
/* If XSK Fill Ring doesn't have enough frames, busy poll by
* rescheduling the NAPI poll.
/* If XSK Fill Ring doesn't have enough frames, report the error, so
* that one of the actions can be performed:
* 1. If need_wakeup is used, signal that the application has to kick
* the driver when it refills the Fill Ring.
* 2. Otherwise, busy poll by rescheduling the NAPI poll.
*/
if (unlikely(alloc_err == -ENOMEM && rq->umem))
return true;

View File

@ -33,6 +33,7 @@
#include <linux/irq.h>
#include "en.h"
#include "en/xdp.h"
#include "en/xsk/rx.h"
#include "en/xsk/tx.h"
static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
@ -81,6 +82,29 @@ void mlx5e_trigger_irq(struct mlx5e_icosq *sq)
mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &nopwqe->ctrl);
}
static bool mlx5e_napi_xsk_post(struct mlx5e_xdpsq *xsksq, struct mlx5e_rq *xskrq)
{
bool busy_xsk = false, xsk_rx_alloc_err;
/* Handle the race between the application querying need_wakeup and the
* driver setting it:
* 1. Update need_wakeup both before and after the TX. If it goes to
* "yes", it can only happen with the first update.
* 2. If the application queried need_wakeup before we set it, the
* packets will be transmitted anyway, even w/o a wakeup.
* 3. Give a chance to clear need_wakeup after new packets were queued
* for TX.
*/
mlx5e_xsk_update_tx_wakeup(xsksq);
busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
mlx5e_xsk_update_tx_wakeup(xsksq);
xsk_rx_alloc_err = xskrq->post_wqes(xskrq);
busy_xsk |= mlx5e_xsk_update_rx_wakeup(xskrq, xsk_rx_alloc_err);
return busy_xsk;
}
int mlx5e_napi_poll(struct napi_struct *napi, int budget)
{
struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel,
@ -122,8 +146,7 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
if (xsk_open) {
mlx5e_poll_ico_cq(&c->xskicosq.cq);
busy |= mlx5e_poll_xdpsq_cq(&xsksq->cq);
busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
busy_xsk |= xskrq->post_wqes(xskrq);
busy_xsk |= mlx5e_napi_xsk_post(xsksq, xskrq);
}
busy |= busy_xsk;

View File

@ -6,6 +6,7 @@
#include <linux/bug.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/timekeeping.h>
#include "../ccm.h"
#include "../nfp_app.h"
@ -175,29 +176,151 @@ nfp_bpf_ctrl_reply_val(struct nfp_app_bpf *bpf, struct cmsg_reply_map_op *reply,
return &reply->data[bpf->cmsg_key_sz * (n + 1) + bpf->cmsg_val_sz * n];
}
static bool nfp_bpf_ctrl_op_cache_invalidate(enum nfp_ccm_type op)
{
return op == NFP_CCM_TYPE_BPF_MAP_UPDATE ||
op == NFP_CCM_TYPE_BPF_MAP_DELETE;
}
static bool nfp_bpf_ctrl_op_cache_capable(enum nfp_ccm_type op)
{
return op == NFP_CCM_TYPE_BPF_MAP_LOOKUP ||
op == NFP_CCM_TYPE_BPF_MAP_GETNEXT;
}
static bool nfp_bpf_ctrl_op_cache_fill(enum nfp_ccm_type op)
{
return op == NFP_CCM_TYPE_BPF_MAP_GETFIRST ||
op == NFP_CCM_TYPE_BPF_MAP_GETNEXT;
}
static unsigned int
nfp_bpf_ctrl_op_cache_get(struct nfp_bpf_map *nfp_map, enum nfp_ccm_type op,
const u8 *key, u8 *out_key, u8 *out_value,
u32 *cache_gen)
{
struct bpf_map *map = &nfp_map->offmap->map;
struct nfp_app_bpf *bpf = nfp_map->bpf;
unsigned int i, count, n_entries;
struct cmsg_reply_map_op *reply;
n_entries = nfp_bpf_ctrl_op_cache_fill(op) ? bpf->cmsg_cache_cnt : 1;
spin_lock(&nfp_map->cache_lock);
*cache_gen = nfp_map->cache_gen;
if (nfp_map->cache_blockers)
n_entries = 1;
if (nfp_bpf_ctrl_op_cache_invalidate(op))
goto exit_block;
if (!nfp_bpf_ctrl_op_cache_capable(op))
goto exit_unlock;
if (!nfp_map->cache)
goto exit_unlock;
if (nfp_map->cache_to < ktime_get_ns())
goto exit_invalidate;
reply = (void *)nfp_map->cache->data;
count = be32_to_cpu(reply->count);
for (i = 0; i < count; i++) {
void *cached_key;
cached_key = nfp_bpf_ctrl_reply_key(bpf, reply, i);
if (memcmp(cached_key, key, map->key_size))
continue;
if (op == NFP_CCM_TYPE_BPF_MAP_LOOKUP)
memcpy(out_value, nfp_bpf_ctrl_reply_val(bpf, reply, i),
map->value_size);
if (op == NFP_CCM_TYPE_BPF_MAP_GETNEXT) {
if (i + 1 == count)
break;
memcpy(out_key,
nfp_bpf_ctrl_reply_key(bpf, reply, i + 1),
map->key_size);
}
n_entries = 0;
goto exit_unlock;
}
goto exit_unlock;
exit_block:
nfp_map->cache_blockers++;
exit_invalidate:
dev_consume_skb_any(nfp_map->cache);
nfp_map->cache = NULL;
exit_unlock:
spin_unlock(&nfp_map->cache_lock);
return n_entries;
}
static void
nfp_bpf_ctrl_op_cache_put(struct nfp_bpf_map *nfp_map, enum nfp_ccm_type op,
struct sk_buff *skb, u32 cache_gen)
{
bool blocker, filler;
blocker = nfp_bpf_ctrl_op_cache_invalidate(op);
filler = nfp_bpf_ctrl_op_cache_fill(op);
if (blocker || filler) {
u64 to = 0;
if (filler)
to = ktime_get_ns() + NFP_BPF_MAP_CACHE_TIME_NS;
spin_lock(&nfp_map->cache_lock);
if (blocker) {
nfp_map->cache_blockers--;
nfp_map->cache_gen++;
}
if (filler && !nfp_map->cache_blockers &&
nfp_map->cache_gen == cache_gen) {
nfp_map->cache_to = to;
swap(nfp_map->cache, skb);
}
spin_unlock(&nfp_map->cache_lock);
}
dev_consume_skb_any(skb);
}
static int
nfp_bpf_ctrl_entry_op(struct bpf_offloaded_map *offmap, enum nfp_ccm_type op,
u8 *key, u8 *value, u64 flags, u8 *out_key, u8 *out_value)
{
struct nfp_bpf_map *nfp_map = offmap->dev_priv;
unsigned int n_entries, reply_entries, count;
struct nfp_app_bpf *bpf = nfp_map->bpf;
struct bpf_map *map = &offmap->map;
struct cmsg_reply_map_op *reply;
struct cmsg_req_map_op *req;
struct sk_buff *skb;
u32 cache_gen;
int err;
/* FW messages have no space for more than 32 bits of flags */
if (flags >> 32)
return -EOPNOTSUPP;
/* Handle op cache */
n_entries = nfp_bpf_ctrl_op_cache_get(nfp_map, op, key, out_key,
out_value, &cache_gen);
if (!n_entries)
return 0;
skb = nfp_bpf_cmsg_map_req_alloc(bpf, 1);
if (!skb)
return -ENOMEM;
if (!skb) {
err = -ENOMEM;
goto err_cache_put;
}
req = (void *)skb->data;
req->tid = cpu_to_be32(nfp_map->tid);
req->count = cpu_to_be32(1);
req->count = cpu_to_be32(n_entries);
req->flags = cpu_to_be32(flags);
/* Copy inputs */
@ -207,16 +330,38 @@ nfp_bpf_ctrl_entry_op(struct bpf_offloaded_map *offmap, enum nfp_ccm_type op,
memcpy(nfp_bpf_ctrl_req_val(bpf, req, 0), value,
map->value_size);
skb = nfp_ccm_communicate(&bpf->ccm, skb, op,
nfp_bpf_cmsg_map_reply_size(bpf, 1));
if (IS_ERR(skb))
return PTR_ERR(skb);
skb = nfp_ccm_communicate(&bpf->ccm, skb, op, 0);
if (IS_ERR(skb)) {
err = PTR_ERR(skb);
goto err_cache_put;
}
if (skb->len < sizeof(*reply)) {
cmsg_warn(bpf, "cmsg drop - type 0x%02x too short %d!\n",
op, skb->len);
err = -EIO;
goto err_free;
}
reply = (void *)skb->data;
count = be32_to_cpu(reply->count);
err = nfp_bpf_ctrl_rc_to_errno(bpf, &reply->reply_hdr);
/* FW responds with message sized to hold the good entries,
* plus one extra entry if there was an error.
*/
reply_entries = count + !!err;
if (n_entries > 1 && count)
err = 0;
if (err)
goto err_free;
if (skb->len != nfp_bpf_cmsg_map_reply_size(bpf, reply_entries)) {
cmsg_warn(bpf, "cmsg drop - type 0x%02x too short %d for %d entries!\n",
op, skb->len, reply_entries);
err = -EIO;
goto err_free;
}
/* Copy outputs */
if (out_key)
memcpy(out_key, nfp_bpf_ctrl_reply_key(bpf, reply, 0),
@ -225,11 +370,13 @@ nfp_bpf_ctrl_entry_op(struct bpf_offloaded_map *offmap, enum nfp_ccm_type op,
memcpy(out_value, nfp_bpf_ctrl_reply_val(bpf, reply, 0),
map->value_size);
dev_consume_skb_any(skb);
nfp_bpf_ctrl_op_cache_put(nfp_map, op, skb, cache_gen);
return 0;
err_free:
dev_kfree_skb_any(skb);
err_cache_put:
nfp_bpf_ctrl_op_cache_put(nfp_map, op, NULL, cache_gen);
return err;
}
@ -267,11 +414,29 @@ int nfp_bpf_ctrl_getnext_entry(struct bpf_offloaded_map *offmap,
key, NULL, 0, next_key, NULL);
}
unsigned int nfp_bpf_ctrl_cmsg_min_mtu(struct nfp_app_bpf *bpf)
{
return max(nfp_bpf_cmsg_map_req_size(bpf, 1),
nfp_bpf_cmsg_map_reply_size(bpf, 1));
}
unsigned int nfp_bpf_ctrl_cmsg_mtu(struct nfp_app_bpf *bpf)
{
return max3((unsigned int)NFP_NET_DEFAULT_MTU,
nfp_bpf_cmsg_map_req_size(bpf, 1),
nfp_bpf_cmsg_map_reply_size(bpf, 1));
return max3(NFP_NET_DEFAULT_MTU,
nfp_bpf_cmsg_map_req_size(bpf, NFP_BPF_MAP_CACHE_CNT),
nfp_bpf_cmsg_map_reply_size(bpf, NFP_BPF_MAP_CACHE_CNT));
}
unsigned int nfp_bpf_ctrl_cmsg_cache_cnt(struct nfp_app_bpf *bpf)
{
unsigned int mtu, req_max, reply_max, entry_sz;
mtu = bpf->app->ctrl->dp.mtu;
entry_sz = bpf->cmsg_key_sz + bpf->cmsg_val_sz;
req_max = (mtu - sizeof(struct cmsg_req_map_op)) / entry_sz;
reply_max = (mtu - sizeof(struct cmsg_reply_map_op)) / entry_sz;
return min3(req_max, reply_max, NFP_BPF_MAP_CACHE_CNT);
}
void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb)
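
To make the sizing in nfp_bpf_ctrl_cmsg_cache_cnt() concrete, here is a small
standalone sketch of the same arithmetic; all of the sizes below (MTU, header
and key/value sizes) are assumptions for illustration only, not values read
from any firmware:

#include <stdio.h>

#define NFP_BPF_MAP_CACHE_CNT 4U

static unsigned int min3u(unsigned int a, unsigned int b, unsigned int c)
{
	unsigned int m = a < b ? a : b;

	return m < c ? m : c;
}

int main(void)
{
	unsigned int mtu = 4096;	/* assumed ctrl channel MTU          */
	unsigned int req_hdr = 16;	/* assumed sizeof(cmsg_req_map_op)   */
	unsigned int reply_hdr = 12;	/* assumed sizeof(cmsg_reply_map_op) */
	unsigned int key_sz = 64, val_sz = 64;
	unsigned int entry_sz = key_sz + val_sz;
	unsigned int req_max = (mtu - req_hdr) / entry_sz;
	unsigned int reply_max = (mtu - reply_hdr) / entry_sz;

	/* Same min3() shape as the driver: the cache never asks for more
	 * entries than fit in one request, one reply, or the fixed
	 * NFP_BPF_MAP_CACHE_CNT cap (which usually wins).
	 */
	printf("cache cnt: %u\n",
	       min3u(req_max, reply_max, NFP_BPF_MAP_CACHE_CNT));
	return 0;
}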

View File

@ -24,6 +24,7 @@ enum bpf_cap_tlv_type {
NFP_BPF_CAP_TYPE_QUEUE_SELECT = 5,
NFP_BPF_CAP_TYPE_ADJUST_TAIL = 6,
NFP_BPF_CAP_TYPE_ABI_VERSION = 7,
NFP_BPF_CAP_TYPE_CMSG_MULTI_ENT = 8,
};
struct nfp_bpf_cap_tlv_func {

View File

@ -299,6 +299,14 @@ nfp_bpf_parse_cap_adjust_tail(struct nfp_app_bpf *bpf, void __iomem *value,
return 0;
}
static int
nfp_bpf_parse_cap_cmsg_multi_ent(struct nfp_app_bpf *bpf, void __iomem *value,
u32 length)
{
bpf->cmsg_multi_ent = true;
return 0;
}
static int
nfp_bpf_parse_cap_abi_version(struct nfp_app_bpf *bpf, void __iomem *value,
u32 length)
@ -375,6 +383,11 @@ static int nfp_bpf_parse_capabilities(struct nfp_app *app)
length))
goto err_release_free;
break;
case NFP_BPF_CAP_TYPE_CMSG_MULTI_ENT:
if (nfp_bpf_parse_cap_cmsg_multi_ent(app->priv, value,
length))
goto err_release_free;
break;
default:
nfp_dbg(cpp, "unknown BPF capability: %d\n", type);
break;
@ -415,6 +428,25 @@ static void nfp_bpf_ndo_uninit(struct nfp_app *app, struct net_device *netdev)
bpf_offload_dev_netdev_unregister(bpf->bpf_dev, netdev);
}
static int nfp_bpf_start(struct nfp_app *app)
{
struct nfp_app_bpf *bpf = app->priv;
if (app->ctrl->dp.mtu < nfp_bpf_ctrl_cmsg_min_mtu(bpf)) {
nfp_err(bpf->app->cpp,
"ctrl channel MTU below min required %u < %u\n",
app->ctrl->dp.mtu, nfp_bpf_ctrl_cmsg_min_mtu(bpf));
return -EINVAL;
}
if (bpf->cmsg_multi_ent)
bpf->cmsg_cache_cnt = nfp_bpf_ctrl_cmsg_cache_cnt(bpf);
else
bpf->cmsg_cache_cnt = 1;
return 0;
}
static int nfp_bpf_init(struct nfp_app *app)
{
struct nfp_app_bpf *bpf;
@ -488,6 +520,7 @@ const struct nfp_app_type app_bpf = {
.init = nfp_bpf_init,
.clean = nfp_bpf_clean,
.start = nfp_bpf_start,
.check_mtu = nfp_bpf_check_mtu,

View File

@ -99,6 +99,7 @@ enum pkt_vec {
* @maps_neutral: hash table of offload-neutral maps (on pointer)
*
* @abi_version: global BPF ABI version
* @cmsg_cache_cnt: number of entries to read for caching
*
* @adjust_head: adjust head capability
* @adjust_head.flags: extra flags for adjust head
@ -124,6 +125,7 @@ enum pkt_vec {
* @pseudo_random: FW initialized the pseudo-random machinery (CSRs)
* @queue_select: BPF can set the RX queue ID in packet vector
* @adjust_tail: BPF can simply trunc packet size for adjust tail
* @cmsg_multi_ent: FW can pack multiple map entries in a single cmsg
*/
struct nfp_app_bpf {
struct nfp_app *app;
@ -134,6 +136,8 @@ struct nfp_app_bpf {
unsigned int cmsg_key_sz;
unsigned int cmsg_val_sz;
unsigned int cmsg_cache_cnt;
struct list_head map_list;
unsigned int maps_in_use;
unsigned int map_elems_in_use;
@ -169,6 +173,7 @@ struct nfp_app_bpf {
bool pseudo_random;
bool queue_select;
bool adjust_tail;
bool cmsg_multi_ent;
};
enum nfp_bpf_map_use {
@ -183,11 +188,21 @@ struct nfp_bpf_map_word {
unsigned char non_zero_update :1;
};
#define NFP_BPF_MAP_CACHE_CNT 4U
#define NFP_BPF_MAP_CACHE_TIME_NS (250 * 1000)
/**
* struct nfp_bpf_map - private per-map data attached to BPF maps for offload
* @offmap: pointer to the offloaded BPF map
* @bpf: back pointer to bpf app private structure
* @tid: table id identifying map on datapath
*
* @cache_lock: protects @cache_blockers, @cache_to, @cache
* @cache_blockers: number of ops in flight which block caching
* @cache_gen: counter incremented by every blocker on exit
* @cache_to: time when cache will no longer be valid (ns)
* @cache: skb with cached response
*
* @l: link on the nfp_app_bpf->map_list list
* @use_map: map of how the value is used (in 4B chunks)
*/
@ -195,6 +210,13 @@ struct nfp_bpf_map {
struct bpf_offloaded_map *offmap;
struct nfp_app_bpf *bpf;
u32 tid;
spinlock_t cache_lock;
u32 cache_blockers;
u32 cache_gen;
u64 cache_to;
struct sk_buff *cache;
struct list_head l;
struct nfp_bpf_map_word use_map[];
};
@ -564,7 +586,9 @@ nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv);
unsigned int nfp_bpf_ctrl_cmsg_min_mtu(struct nfp_app_bpf *bpf);
unsigned int nfp_bpf_ctrl_cmsg_mtu(struct nfp_app_bpf *bpf);
unsigned int nfp_bpf_ctrl_cmsg_cache_cnt(struct nfp_app_bpf *bpf);
long long int
nfp_bpf_ctrl_alloc_map(struct nfp_app_bpf *bpf, struct bpf_map *map);
void

View File

@ -385,6 +385,7 @@ nfp_bpf_map_alloc(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap)
offmap->dev_priv = nfp_map;
nfp_map->offmap = offmap;
nfp_map->bpf = bpf;
spin_lock_init(&nfp_map->cache_lock);
res = nfp_bpf_ctrl_alloc_map(bpf, &offmap->map);
if (res < 0) {
@ -407,6 +408,8 @@ nfp_bpf_map_free(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap)
struct nfp_bpf_map *nfp_map = offmap->dev_priv;
nfp_bpf_ctrl_free_map(bpf, nfp_map);
dev_consume_skb_any(nfp_map->cache);
WARN_ON_ONCE(nfp_map->cache_blockers);
list_del_init(&nfp_map->l);
bpf->map_elems_in_use -= offmap->map.max_entries;
bpf->maps_in_use--;

View File

@ -66,7 +66,7 @@
#define NFP_NET_MAX_DMA_BITS 40
/* Default size for MTU and freelist buffer sizes */
#define NFP_NET_DEFAULT_MTU 1500
#define NFP_NET_DEFAULT_MTU 1500U
/* Maximum number of bytes prepended to a packet */
#define NFP_NET_MAX_PREPEND 64

View File

@ -4116,14 +4116,7 @@ int nfp_net_init(struct nfp_net *nn)
/* Set default MTU and Freelist buffer size */
if (!nfp_net_is_data_vnic(nn) && nn->app->ctrl_mtu) {
if (nn->app->ctrl_mtu <= nn->max_mtu) {
nn->dp.mtu = nn->app->ctrl_mtu;
} else {
if (nn->app->ctrl_mtu != NFP_APP_CTRL_MTU_MAX)
nn_warn(nn, "app requested MTU above max supported %u > %u\n",
nn->app->ctrl_mtu, nn->max_mtu);
nn->dp.mtu = nn->max_mtu;
}
nn->dp.mtu = min(nn->app->ctrl_mtu, nn->max_mtu);
} else if (nn->max_mtu < NFP_NET_DEFAULT_MTU) {
nn->dp.mtu = nn->max_mtu;
} else {

View File

@ -24,6 +24,9 @@ struct seq_file;
struct btf;
struct btf_type;
extern struct idr btf_idr;
extern spinlock_t btf_idr_lock;
/* map is generic key/value storage optionally accesible by eBPF programs */
struct bpf_map_ops {
/* funcs callable from userspace (via syscall) */
@ -647,6 +650,8 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);
struct bpf_map *bpf_map_get_with_uref(u32 ufd);
struct bpf_map *__bpf_map_get(struct fd f);
struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map,
bool uref);
void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map);
int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);

View File

@ -355,6 +355,7 @@ struct bpf_verifier_env {
struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
int stack_size; /* number of states to be processed */
bool strict_alignment; /* perform strict pointer alignment checks */
bool test_state_freq; /* test verifier with different pruning frequency */
struct bpf_verifier_state *cur_state; /* current verifier state */
struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
struct bpf_verifier_state_list *free_list;

View File

@ -901,6 +901,10 @@ struct netdev_bpf {
};
};
/* Flags for ndo_xsk_wakeup. */
#define XDP_WAKEUP_RX (1 << 0)
#define XDP_WAKEUP_TX (1 << 1)
#ifdef CONFIG_XFRM_OFFLOAD
struct xfrmdev_ops {
int (*xdo_dev_state_add) (struct xfrm_state *x);
@ -1227,6 +1231,12 @@ struct tlsdev_ops;
* that got dropped are freed/returned via xdp_return_frame().
* Returns negative number, means general error invoking ndo, meaning
* no frames were xmit'ed and core-caller will free all frames.
* int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags);
* This function is used to wake up the softirq, ksoftirqd or kthread
* responsible for sending and/or receiving packets on a specific
* queue id bound to an AF_XDP socket. The flags field specifies if
* only RX, only Tx, or both should be woken up using the flags
* XDP_WAKEUP_RX and XDP_WAKEUP_TX.
* struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev);
* Get devlink port instance associated with a given netdev.
* Called with a reference on the netdevice and devlink locks only,
@ -1426,8 +1436,8 @@ struct net_device_ops {
int (*ndo_xdp_xmit)(struct net_device *dev, int n,
struct xdp_frame **xdp,
u32 flags);
int (*ndo_xsk_async_xmit)(struct net_device *dev,
u32 queue_id);
int (*ndo_xsk_wakeup)(struct net_device *dev,
u32 queue_id, u32 flags);
struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev);
};

View File

@ -5,6 +5,10 @@
* propagate the unknown bits such that the tnum result represents all the
* possible results for possible values of the operands.
*/
#ifndef _LINUX_TNUM_H
#define _LINUX_TNUM_H
#include <linux/types.h>
struct tnum {
@ -81,3 +85,5 @@ bool tnum_in(struct tnum a, struct tnum b);
int tnum_strn(char *str, size_t size, struct tnum a);
/* Format a tnum as tristate binary expansion */
int tnum_sbin(char *str, size_t size, struct tnum a);
#endif /* _LINUX_TNUM_H */

View File

@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
extern const struct bpf_func_proto bpf_sk_storage_get_proto;
extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
#ifdef CONFIG_BPF_SYSCALL
int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
#else
static inline int bpf_sk_storage_clone(const struct sock *sk,
struct sock *newsk)
{
return 0;
}
#endif
#endif /* _BPF_SK_STORAGE_H */

View File

@ -16,6 +16,13 @@
struct net_device;
struct xsk_queue;
/* Masks for xdp_umem_page flags.
* The low 12-bits of the addr will be 0 since this is the page address, so we
* can use them for flags.
*/
#define XSK_NEXT_PG_CONTIG_SHIFT 0
#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT)
struct xdp_umem_page {
void *addr;
dma_addr_t dma;
@ -27,6 +34,13 @@ struct xdp_umem_fq_reuse {
u64 handles[];
};
/* Flags for the umem flags field.
*
* The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
* flags. See inlude/uapi/include/linux/if_xdp.h.
*/
#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1)
struct xdp_umem {
struct xsk_queue *fq;
struct xsk_queue *cq;
@ -41,15 +55,27 @@ struct xdp_umem {
struct work_struct work;
struct page **pgs;
u32 npgs;
u16 queue_id;
u8 need_wakeup;
u8 flags;
int id;
struct net_device *dev;
struct xdp_umem_fq_reuse *fq_reuse;
u16 queue_id;
bool zc;
spinlock_t xsk_list_lock;
struct list_head xsk_list;
};
/* Nodes are linked in the struct xdp_sock map_list field, and used to
* track which maps a certain socket reside in.
*/
struct xsk_map;
struct xsk_map_node {
struct list_head node;
struct xsk_map *map;
struct xdp_sock **map_entry;
};
struct xdp_sock {
/* struct sock must be the first member of struct xdp_sock */
struct sock sk;
@ -75,6 +101,9 @@ struct xdp_sock {
/* Protects generic receive. */
spinlock_t rx_lock;
u64 rx_dropped;
struct list_head map_list;
/* Protects map_list */
spinlock_t map_list_lock;
};
struct xdp_buff;
@ -95,15 +124,47 @@ struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
struct xdp_umem_fq_reuse *newq);
void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
void xsk_set_rx_need_wakeup(struct xdp_umem *umem);
void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
struct xdp_sock **map_entry);
int xsk_map_inc(struct xsk_map *map);
void xsk_map_put(struct xsk_map *map);
static inline u64 xsk_umem_extract_addr(u64 addr)
{
return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
}
static inline u64 xsk_umem_extract_offset(u64 addr)
{
return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
}
static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
{
return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr);
}
static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
{
return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
unsigned long page_addr;
addr = xsk_umem_add_offset_to_addr(addr);
page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr;
return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK);
}
static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
{
return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
addr = xsk_umem_add_offset_to_addr(addr);
return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK);
}
/* Reuse-queue aware version of FILL queue helpers */
@ -144,6 +205,19 @@ static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
rq->handles[rq->length++] = addr;
}
/* Handle the offset appropriately depending on aligned or unaligned mode.
* For unaligned mode, we store the offset in the upper 16-bits of the address.
* For aligned mode, we simply add the offset to the address.
*/
static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address,
u64 offset)
{
if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG)
return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
else
return address + offset;
}
#else
static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
@ -213,6 +287,21 @@ static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
return NULL;
}
static inline u64 xsk_umem_extract_addr(u64 addr)
{
return 0;
}
static inline u64 xsk_umem_extract_offset(u64 addr)
{
return 0;
}
static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
{
return 0;
}
static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
{
return NULL;
@ -241,6 +330,33 @@ static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
{
}
static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
}
static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
{
}
static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
{
}
static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
{
}
static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
{
return false;
}
static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle,
u64 offset)
{
return 0;
}
#endif /* CONFIG_XDP_SOCKETS */
#endif /* _LINUX_XDP_SOCK_H */
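
A short, standalone worked example of the address encoding these helpers
implement (the base address and offset below are chosen for illustration
only): in unaligned-chunks mode xsk_umem_adjust_offset() stores the offset in
the upper 16 bits while the base stays in the low 48 bits, which is exactly
what xsk_umem_extract_addr()/xsk_umem_extract_offset() undo; in aligned mode
it degenerates to a plain addition.

#include <assert.h>
#include <stdint.h>

#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48
#define XSK_UNALIGNED_BUF_ADDR_MASK \
	((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)

int main(void)
{
	uint64_t base = 0x3000;	/* illustrative frame address in the umem */
	uint64_t offset = 256;	/* e.g. umem headroom + XDP data offset   */

	/* xsk_umem_adjust_offset(), unaligned-chunks case */
	uint64_t handle = base + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);

	/* xsk_umem_extract_addr() / _extract_offset() / _add_offset_to_addr() */
	assert((handle & XSK_UNALIGNED_BUF_ADDR_MASK) == base);
	assert((handle >> XSK_UNALIGNED_BUF_OFFSET_SHIFT) == offset);
	assert((handle & XSK_UNALIGNED_BUF_ADDR_MASK) +
	       (handle >> XSK_UNALIGNED_BUF_OFFSET_SHIFT) == base + offset);
	return 0;
}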

View File

@ -106,6 +106,7 @@ enum bpf_cmd {
BPF_TASK_FD_QUERY,
BPF_MAP_LOOKUP_AND_DELETE_ELEM,
BPF_MAP_FREEZE,
BPF_BTF_GET_NEXT_ID,
};
enum bpf_map_type {
@ -284,6 +285,9 @@ enum bpf_attach_type {
*/
#define BPF_F_TEST_RND_HI32 (1U << 2)
/* The verifier internal test flag. Behavior is undefined */
#define BPF_F_TEST_STATE_FREQ (1U << 3)
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions:
*
@ -337,6 +341,9 @@ enum bpf_attach_type {
#define BPF_F_RDONLY_PROG (1U << 7)
#define BPF_F_WRONLY_PROG (1U << 8)
/* Clone map from listener for newly accepted socket */
#define BPF_F_CLONE (1U << 9)
/* flags for BPF_PROG_QUERY */
#define BPF_F_QUERY_EFFECTIVE (1U << 0)
@ -576,6 +583,8 @@ union bpf_attr {
* limited to five).
*
* Each time the helper is called, it appends a line to the trace.
* Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
* open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
* The format of the trace is customizable, and the exact output
* one will get depends on the options set in
* *\/sys/kernel/debug/tracing/trace_options* (see also the
@ -1014,7 +1023,7 @@ union bpf_attr {
* The realm of the route for the packet associated to *skb*, or 0
* if none was found.
*
* int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
* int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
* Description
* Write raw *data* blob into a special BPF perf event held by
* *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
@ -1076,7 +1085,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
* int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
* Description
* Walk a user or a kernel stack and return its id. To achieve
* this, the helper needs *ctx*, which is a pointer to the context
@ -1725,7 +1734,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_override_return(struct pt_reg *regs, u64 rc)
* int bpf_override_return(struct pt_regs *regs, u64 rc)
* Description
* Used for error injection, this helper uses kprobes to override
* the return value of the probed function, and to set it to *rc*.
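
The new BPF_BTF_GET_NEXT_ID command follows the same start_id/next_id
convention as the existing BPF_PROG_GET_NEXT_ID/BPF_MAP_GET_NEXT_ID commands.
A minimal user-space sketch of walking all loaded BTF object IDs with the raw
syscall (no libbpf wrapper assumed; needs the usual privileges):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.start_id = start_id;

	if (syscall(__NR_bpf, BPF_BTF_GET_NEXT_ID, &attr, sizeof(attr)))
		return -errno;

	*next_id = attr.next_id;
	return 0;
}

int main(void)
{
	__u32 id = 0;

	/* Iterate until the kernel reports ENOENT (no more BTF objects). */
	while (!bpf_btf_get_next_id(id, &id))
		printf("btf id: %u\n", id);

	return 0;
}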

View File

@ -16,6 +16,18 @@
#define XDP_SHARED_UMEM (1 << 0)
#define XDP_COPY (1 << 1) /* Force copy-mode */
#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */
/* If this option is set, the driver might go sleep and in that case
* the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be
* set. If it is set, the application need to explicitly wake up the
* driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are
* running the driver and the application on the same core, you should
* use this option so that the kernel will yield to the user space
* application.
*/
#define XDP_USE_NEED_WAKEUP (1 << 3)
/* Flags for xsk_umem_config flags */
#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
struct sockaddr_xdp {
__u16 sxdp_family;
@ -25,10 +37,14 @@ struct sockaddr_xdp {
__u32 sxdp_shared_umem_fd;
};
/* XDP_RING flags */
#define XDP_RING_NEED_WAKEUP (1 << 0)
struct xdp_ring_offset {
__u64 producer;
__u64 consumer;
__u64 desc;
__u64 flags;
};
struct xdp_mmap_offsets {
@ -53,6 +69,7 @@ struct xdp_umem_reg {
__u64 len; /* Length of packet data area */
__u32 chunk_size;
__u32 headroom;
__u32 flags;
};
struct xdp_statistics {
@ -74,6 +91,11 @@ struct xdp_options {
#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL
#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL
/* Masks for unaligned chunks mode */
#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48
#define XSK_UNALIGNED_BUF_ADDR_MASK \
((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
/* Rx/Tx descriptor */
struct xdp_desc {
__u64 addr;
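
A minimal user-space sketch of the need_wakeup flow described above, assuming
the socket was created with XDP_USE_NEED_WAKEUP and the rings' new flags
words have already been mmapped via the offsets from struct xdp_mmap_offsets
(the pointer parameters below stand in for those mappings; a real application
would also use a proper atomic load):

#include <poll.h>
#include <stdint.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

/* Kick TX only when the kernel asked for it via XDP_RING_NEED_WAKEUP. */
static void kick_tx(int xsk_fd, const volatile uint32_t *tx_ring_flags)
{
	if (*tx_ring_flags & XDP_RING_NEED_WAKEUP)
		sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
}

/* For RX, a poll() wakes the driver when the fill ring needs attention. */
static void kick_rx(int xsk_fd, const volatile uint32_t *fill_ring_flags)
{
	struct pollfd pfd = { .fd = xsk_fd, .events = POLLIN };

	if (*fill_ring_flags & XDP_RING_NEED_WAKEUP)
		poll(&pfd, 1, 0);
}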

View File

@ -195,8 +195,8 @@
i < btf_type_vlen(struct_type); \
i++, member++)
static DEFINE_IDR(btf_idr);
static DEFINE_SPINLOCK(btf_idr_lock);
DEFINE_IDR(btf_idr);
DEFINE_SPINLOCK(btf_idr_lock);
struct btf {
void *data;
@ -3376,6 +3376,15 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
}
#ifdef CONFIG_PROC_FS
static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct btf *btf = filp->private_data;
seq_printf(m, "btf_id:\t%u\n", btf->id);
}
#endif
static int btf_release(struct inode *inode, struct file *filp)
{
btf_put(filp->private_data);
@ -3383,6 +3392,9 @@ static int btf_release(struct inode *inode, struct file *filp)
}
const struct file_operations btf_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_btf_show_fdinfo,
#endif
.release = btf_release,
};

View File

@ -683,8 +683,8 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
}
/* map_idr_lock should have been held */
static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
bool uref)
static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map,
bool uref)
{
int refold;
@ -704,6 +704,16 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
return map;
}
struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
spin_lock_bh(&map_idr_lock);
map = __bpf_map_inc_not_zero(map, uref);
spin_unlock_bh(&map_idr_lock);
return map;
}
EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
return -ENOTSUPP;
@ -1619,6 +1629,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ |
BPF_F_TEST_RND_HI32))
return -EINVAL;
@ -2183,7 +2194,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
spin_lock_bh(&map_idr_lock);
map = idr_find(&map_idr, id);
if (map)
map = bpf_map_inc_not_zero(map, true);
map = __bpf_map_inc_not_zero(map, true);
else
map = ERR_PTR(-ENOENT);
spin_unlock_bh(&map_idr_lock);
@ -2880,6 +2891,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_obj_get_next_id(&attr, uattr,
&map_idr, &map_idr_lock);
break;
case BPF_BTF_GET_NEXT_ID:
err = bpf_obj_get_next_id(&attr, uattr,
&btf_idr, &btf_idr_lock);
break;
case BPF_PROG_GET_FD_BY_ID:
err = bpf_prog_get_fd_by_id(&attr);
break;

View File

@ -30,17 +30,12 @@ static struct kobject *btf_kobj;
static int __init btf_vmlinux_init(void)
{
int err;
if (!_binary__btf_vmlinux_bin_start)
return 0;
btf_kobj = kobject_create_and_add("btf", kernel_kobj);
if (IS_ERR(btf_kobj)) {
err = PTR_ERR(btf_kobj);
btf_kobj = NULL;
return err;
}
if (!btf_kobj)
return -ENOMEM;
bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end -
_binary__btf_vmlinux_bin_start;

View File

@ -7223,7 +7223,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new;
int i, j, err, states_cnt = 0;
bool add_new_state = false;
bool add_new_state = env->test_state_freq ? true : false;
cur->last_insn_idx = env->prev_insn_idx;
if (!env->insn_aux_data[insn_idx].prune_point)
@ -9263,6 +9263,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->allow_ptr_leaks = is_priv;
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;

View File

@ -13,8 +13,71 @@ struct xsk_map {
struct bpf_map map;
struct xdp_sock **xsk_map;
struct list_head __percpu *flush_list;
spinlock_t lock; /* Synchronize map updates */
};
int xsk_map_inc(struct xsk_map *map)
{
struct bpf_map *m = &map->map;
m = bpf_map_inc(m, false);
return PTR_ERR_OR_ZERO(m);
}
void xsk_map_put(struct xsk_map *map)
{
bpf_map_put(&map->map);
}
static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
struct xdp_sock **map_entry)
{
struct xsk_map_node *node;
int err;
node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
if (!node)
return NULL;
err = xsk_map_inc(map);
if (err) {
kfree(node);
return ERR_PTR(err);
}
node->map = map;
node->map_entry = map_entry;
return node;
}
static void xsk_map_node_free(struct xsk_map_node *node)
{
xsk_map_put(node->map);
kfree(node);
}
static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
{
spin_lock_bh(&xs->map_list_lock);
list_add_tail(&node->node, &xs->map_list);
spin_unlock_bh(&xs->map_list_lock);
}
static void xsk_map_sock_delete(struct xdp_sock *xs,
struct xdp_sock **map_entry)
{
struct xsk_map_node *n, *tmp;
spin_lock_bh(&xs->map_list_lock);
list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
if (map_entry == n->map_entry) {
list_del(&n->node);
xsk_map_node_free(n);
}
}
spin_unlock_bh(&xs->map_list_lock);
}
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
struct xsk_map *m;
@ -34,6 +97,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&m->map, attr);
spin_lock_init(&m->lock);
cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
cost += sizeof(struct list_head) * num_possible_cpus();
@ -71,21 +135,9 @@ free_m:
static void xsk_map_free(struct bpf_map *map)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
int i;
bpf_clear_redirect_map(map);
synchronize_net();
for (i = 0; i < map->max_entries; i++) {
struct xdp_sock *xs;
xs = m->xsk_map[i];
if (!xs)
continue;
sock_put((struct sock *)xs);
}
free_percpu(m->flush_list);
bpf_map_area_free(m->xsk_map);
kfree(m);
@ -164,8 +216,9 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
struct xdp_sock *xs, *old_xs, **map_entry;
u32 i = *(u32 *)key, fd = *(u32 *)value;
struct xdp_sock *xs, *old_xs;
struct xsk_map_node *node;
struct socket *sock;
int err;
@ -173,8 +226,6 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EINVAL;
if (unlikely(i >= m->map.max_entries))
return -E2BIG;
if (unlikely(map_flags == BPF_NOEXIST))
return -EEXIST;
sock = sockfd_lookup(fd, &err);
if (!sock)
@ -192,32 +243,70 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EOPNOTSUPP;
}
sock_hold(sock->sk);
map_entry = &m->xsk_map[i];
node = xsk_map_node_alloc(m, map_entry);
if (IS_ERR(node)) {
sockfd_put(sock);
return PTR_ERR(node);
}
old_xs = xchg(&m->xsk_map[i], xs);
spin_lock_bh(&m->lock);
old_xs = READ_ONCE(*map_entry);
if (old_xs == xs) {
err = 0;
goto out;
} else if (old_xs && map_flags == BPF_NOEXIST) {
err = -EEXIST;
goto out;
} else if (!old_xs && map_flags == BPF_EXIST) {
err = -ENOENT;
goto out;
}
xsk_map_sock_add(xs, node);
WRITE_ONCE(*map_entry, xs);
if (old_xs)
sock_put((struct sock *)old_xs);
xsk_map_sock_delete(old_xs, map_entry);
spin_unlock_bh(&m->lock);
sockfd_put(sock);
return 0;
out:
spin_unlock_bh(&m->lock);
sockfd_put(sock);
xsk_map_node_free(node);
return err;
}
static int xsk_map_delete_elem(struct bpf_map *map, void *key)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
struct xdp_sock *old_xs;
struct xdp_sock *old_xs, **map_entry;
int k = *(u32 *)key;
if (k >= map->max_entries)
return -EINVAL;
old_xs = xchg(&m->xsk_map[k], NULL);
spin_lock_bh(&m->lock);
map_entry = &m->xsk_map[k];
old_xs = xchg(map_entry, NULL);
if (old_xs)
sock_put((struct sock *)old_xs);
xsk_map_sock_delete(old_xs, map_entry);
spin_unlock_bh(&m->lock);
return 0;
}
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
struct xdp_sock **map_entry)
{
spin_lock_bh(&map->lock);
if (READ_ONCE(*map_entry) == xs) {
WRITE_ONCE(*map_entry, NULL);
xsk_map_sock_delete(xs, map_entry);
}
spin_unlock_bh(&map->lock);
}
const struct bpf_map_ops xsk_map_ops = {
.map_alloc = xsk_map_alloc,
.map_free = xsk_map_free,

View File

@ -520,7 +520,8 @@ config BPF_EVENTS
bool
default y
help
This allows the user to attach BPF programs to kprobe events.
This allows the user to attach BPF programs to kprobe, uprobe, and
tracepoint events.
config DYNAMIC_EVENTS
def_bool n

View File

@ -867,7 +867,7 @@ static struct bpf_test tests[] = {
},
CLASSIC,
{ },
{ { 4, 10 ^ 300 }, { 20, 10 ^ 300 } },
{ { 4, 0xA ^ 300 }, { 20, 0xA ^ 300 } },
},
{
"SPILL_FILL",

View File

@ -12,6 +12,9 @@
static atomic_t cache_idx;
#define SK_STORAGE_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_CLONE)
struct bucket {
struct hlist_head list;
raw_spinlock_t lock;
@ -209,7 +212,6 @@ static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
kfree_rcu(sk_storage, rcu);
}
/* sk_storage->lock must be held and sk_storage->list cannot be empty */
static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
struct bpf_sk_storage_elem *selem)
{
@ -509,7 +511,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
return 0;
}
/* Called by __sk_destruct() */
/* Called by __sk_destruct() & bpf_sk_storage_clone() */
void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_sk_storage_elem *selem;
@ -557,6 +559,11 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
smap = (struct bpf_sk_storage_map *)map;
/* Note that this map might be concurrently cloned from
* bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
* RCU read section to finish before proceeding. New RCU
* read sections should be prevented via bpf_map_inc_not_zero.
*/
synchronize_rcu();
/* bpf prog and the userspace can no longer access this map
@ -601,7 +608,9 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
{
if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries ||
if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
!(attr->map_flags & BPF_F_NO_PREALLOC) ||
attr->max_entries ||
attr->key_size != sizeof(int) || !attr->value_size ||
/* Enforce BTF for userspace sk dumping */
!attr->btf_key_type_id || !attr->btf_value_type_id)
@ -739,6 +748,95 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
return err;
}
static struct bpf_sk_storage_elem *
bpf_sk_storage_clone_elem(struct sock *newsk,
struct bpf_sk_storage_map *smap,
struct bpf_sk_storage_elem *selem)
{
struct bpf_sk_storage_elem *copy_selem;
copy_selem = selem_alloc(smap, newsk, NULL, true);
if (!copy_selem)
return NULL;
if (map_value_has_spin_lock(&smap->map))
copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
SDATA(selem)->data, true);
else
copy_map_value(&smap->map, SDATA(copy_selem)->data,
SDATA(selem)->data);
return copy_selem;
}
int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
{
struct bpf_sk_storage *new_sk_storage = NULL;
struct bpf_sk_storage *sk_storage;
struct bpf_sk_storage_elem *selem;
int ret = 0;
RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage || hlist_empty(&sk_storage->list))
goto out;
hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
struct bpf_sk_storage_elem *copy_selem;
struct bpf_sk_storage_map *smap;
struct bpf_map *map;
smap = rcu_dereference(SDATA(selem)->smap);
if (!(smap->map.map_flags & BPF_F_CLONE))
continue;
/* Note that for lockless listeners adding new element
* here can race with cleanup in bpf_sk_storage_map_free.
* Try to grab map refcnt to make sure that it's still
* alive and prevent concurrent removal.
*/
map = bpf_map_inc_not_zero(&smap->map, false);
if (IS_ERR(map))
continue;
copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
if (!copy_selem) {
ret = -ENOMEM;
bpf_map_put(map);
goto out;
}
if (new_sk_storage) {
selem_link_map(smap, copy_selem);
__selem_link_sk(new_sk_storage, copy_selem);
} else {
ret = sk_storage_alloc(newsk, smap, copy_selem);
if (ret) {
kfree(copy_selem);
atomic_sub(smap->elem_size,
&newsk->sk_omem_alloc);
bpf_map_put(map);
goto out;
}
new_sk_storage = rcu_dereference(copy_selem->sk_storage);
}
bpf_map_put(map);
}
out:
rcu_read_unlock();
/* In case of an error, don't free anything explicitly here; the
* caller is responsible for calling bpf_sk_storage_free.
*/
return ret;
}
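For illustration only (not from this patch), a BPF program that wants its socket storage copied to child sockets would create the map with the new BPF_F_CLONE flag; this sketch assumes the selftests' __uint/__type helper macros and a made-up map name:

/* Hypothetical BTF-defined sk_storage map whose entries are cloned
 * from the listener to the child socket on accept().
 */
struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
	__type(key, int);
	__type(value, __u64);
} sk_stg_map SEC(".maps");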
BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
void *, value, u64, flags)
{

View File

@ -8126,12 +8126,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
bpf_chk = generic_xdp_install;
if (fd >= 0) {
u32 prog_id;
if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
return -EEXIST;
}
if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
__dev_xdp_query(dev, bpf_op, query)) {
prog_id = __dev_xdp_query(dev, bpf_op, query);
if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
NL_SET_ERR_MSG(extack, "XDP program already attached");
return -EBUSY;
}
@ -8146,6 +8149,14 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
bpf_prog_put(prog);
return -EINVAL;
}
if (prog->aux->id == prog_id) {
bpf_prog_put(prog);
return 0;
}
} else {
if (!__dev_xdp_query(dev, bpf_op, query))
return 0;
}
err = dev_xdp_install(dev, bpf_op, extack, flags, prog);

View File

@ -5903,7 +5903,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
default:
return -EPROTONOSUPPORT;
}
if (mss <= 0)
if (mss == 0)
return -ENOENT;
return cookie | ((u64)mss << 32);

View File

@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
goto out;
}
RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
#ifdef CONFIG_BPF_SYSCALL
RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
#endif
if (bpf_sk_storage_clone(sk, newsk)) {
sk_free_unlock_clone(newsk);
newsk = NULL;
goto out;
}
newsk->sk_err = 0;
newsk->sk_err_soft = 0;

View File

@ -14,7 +14,7 @@
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/idr.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include "xdp_umem.h"
#include "xsk_queue.h"
@ -106,14 +106,22 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
umem->dev = dev;
umem->queue_id = queue_id;
if (flags & XDP_USE_NEED_WAKEUP) {
umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
/* Tx needs to be explicitly woken up the first time. This also
* covers drivers that do not implement the feature: they will
* always have to call sendto().
*/
xsk_set_tx_need_wakeup(umem);
}
dev_hold(dev);
if (force_copy)
/* For copy-mode, we are done. */
return 0;
if (!dev->netdev_ops->ndo_bpf ||
!dev->netdev_ops->ndo_xsk_async_xmit) {
if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) {
err = -EOPNOTSUPP;
goto err_unreg_umem;
}
@ -170,7 +178,30 @@ static void xdp_umem_unmap_pages(struct xdp_umem *umem)
unsigned int i;
for (i = 0; i < umem->npgs; i++)
kunmap(umem->pgs[i]);
if (PageHighMem(umem->pgs[i]))
vunmap(umem->pages[i].addr);
}
static int xdp_umem_map_pages(struct xdp_umem *umem)
{
unsigned int i;
void *addr;
for (i = 0; i < umem->npgs; i++) {
if (PageHighMem(umem->pgs[i]))
addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);
else
addr = page_address(umem->pgs[i]);
if (!addr) {
xdp_umem_unmap_pages(umem);
return -ENOMEM;
}
umem->pages[i].addr = addr;
}
return 0;
}
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
@ -309,10 +340,11 @@ static int xdp_umem_account_pages(struct xdp_umem *umem)
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
unsigned int chunks, chunks_per_page;
u64 addr = mr->addr, size = mr->len;
int size_chk, err, i;
int size_chk, err;
if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
/* Strictly speaking we could support this, if:
@ -324,7 +356,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
return -EINVAL;
}
if (!is_power_of_2(chunk_size))
if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG |
XDP_UMEM_USES_NEED_WAKEUP))
return -EINVAL;
if (!unaligned_chunks && !is_power_of_2(chunk_size))
return -EINVAL;
if (!PAGE_ALIGNED(addr)) {
@ -341,9 +377,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (chunks == 0)
return -EINVAL;
chunks_per_page = PAGE_SIZE / chunk_size;
if (chunks < chunks_per_page || chunks % chunks_per_page)
return -EINVAL;
if (!unaligned_chunks) {
chunks_per_page = PAGE_SIZE / chunk_size;
if (chunks < chunks_per_page || chunks % chunks_per_page)
return -EINVAL;
}
headroom = ALIGN(headroom, 64);
@ -352,13 +390,15 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
return -EINVAL;
umem->address = (unsigned long)addr;
umem->chunk_mask = ~((u64)chunk_size - 1);
umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
: ~((u64)chunk_size - 1);
umem->size = size;
umem->headroom = headroom;
umem->chunk_size_nohr = chunk_size - headroom;
umem->npgs = size / PAGE_SIZE;
umem->pgs = NULL;
umem->user = NULL;
umem->flags = mr->flags;
INIT_LIST_HEAD(&umem->xsk_list);
spin_lock_init(&umem->xsk_list_lock);
@ -378,10 +418,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
goto out_pin;
}
for (i = 0; i < umem->npgs; i++)
umem->pages[i].addr = kmap(umem->pgs[i]);
err = xdp_umem_map_pages(umem);
if (!err)
return 0;
return 0;
kfree(umem->pages);
out_pin:
xdp_umem_unpin_pages(umem);

View File

@ -45,7 +45,7 @@ EXPORT_SYMBOL(xsk_umem_has_addrs);
u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
return xskq_peek_addr(umem->fq, addr);
return xskq_peek_addr(umem->fq, addr, umem);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);
@ -55,21 +55,103 @@ void xsk_umem_discard_addr(struct xdp_umem *umem)
}
EXPORT_SYMBOL(xsk_umem_discard_addr);
void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
if (umem->need_wakeup & XDP_WAKEUP_RX)
return;
umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
umem->need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
{
struct xdp_sock *xs;
if (umem->need_wakeup & XDP_WAKEUP_TX)
return;
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
}
rcu_read_unlock();
umem->need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
{
if (!(umem->need_wakeup & XDP_WAKEUP_RX))
return;
umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
umem->need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
{
struct xdp_sock *xs;
if (!(umem->need_wakeup & XDP_WAKEUP_TX))
return;
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
}
rcu_read_unlock();
umem->need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
{
return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
}
EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
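On the application side, a sketch (not part of this diff, mirroring the xdpsock sample's structs) of how the per-ring flags word is meant to be used: only issue a syscall when the driver has set XDP_RING_NEED_WAKEUP, otherwise keep busy-polling the rings:

/* Sketch: kick the kernel only when it asked for it. */
static void kick_if_needed(struct xsk_socket_info *xsk, struct pollfd *fds, int nfds)
{
	/* Fill ring: waking the kernel drives RX processing. */
	if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
		poll(fds, nfds, 1000);

	/* Tx ring: an empty sendto() resumes TX processing. */
	if (xsk_ring_prod__needs_wakeup(&xsk->tx))
		sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
}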
/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for
* each page. This is only required in copy mode.
*/
static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
u32 len, u32 metalen)
{
void *to_buf = xdp_umem_get_data(umem, addr);
addr = xsk_umem_add_offset_to_addr(addr);
if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
u64 page_start = addr & ~(PAGE_SIZE - 1);
u64 first_len = PAGE_SIZE - (addr - page_start);
memcpy(to_buf, from_buf, first_len + metalen);
memcpy(next_pg_addr, from_buf + first_len, len - first_len);
return;
}
memcpy(to_buf, from_buf, len + metalen);
}
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
void *to_buf, *from_buf;
u64 offset = xs->umem->headroom;
u64 addr, memcpy_addr;
void *from_buf;
u32 metalen;
u64 addr;
int err;
if (!xskq_peek_addr(xs->umem->fq, &addr) ||
if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
xs->rx_dropped++;
return -ENOSPC;
}
addr += xs->umem->headroom;
if (unlikely(xdp_data_meta_unsupported(xdp))) {
from_buf = xdp->data;
metalen = 0;
@ -78,9 +160,11 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
metalen = xdp->data - xdp->data_meta;
}
to_buf = xdp_umem_get_data(xs->umem, addr);
memcpy(to_buf, from_buf, len + metalen);
addr += metalen;
memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
__xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
offset += metalen;
addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
err = xskq_produce_batch_desc(xs->rx, addr, len);
if (!err) {
xskq_discard_addr(xs->umem->fq);
@ -102,10 +186,23 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
return err;
}
static bool xsk_is_bound(struct xdp_sock *xs)
{
if (READ_ONCE(xs->state) == XSK_BOUND) {
/* Matches smp_wmb() in bind(). */
smp_rmb();
return true;
}
return false;
}
int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
u32 len;
if (!xsk_is_bound(xs))
return -EINVAL;
if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
return -EINVAL;
@ -125,6 +222,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
u32 metalen = xdp->data - xdp->data_meta;
u32 len = xdp->data_end - xdp->data;
u64 offset = xs->umem->headroom;
void *buffer;
u64 addr;
int err;
@ -136,17 +234,17 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
goto out_unlock;
}
if (!xskq_peek_addr(xs->umem->fq, &addr) ||
if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
err = -ENOSPC;
goto out_drop;
}
addr += xs->umem->headroom;
addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
buffer = xdp_umem_get_data(xs->umem, addr);
memcpy(buffer, xdp->data_meta, len + metalen);
addr += metalen;
addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
err = xskq_produce_batch_desc(xs->rx, addr, len);
if (err)
goto out_drop;
@ -190,7 +288,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
if (!xskq_peek_desc(xs->tx, desc))
if (!xskq_peek_desc(xs->tx, desc, umem))
continue;
if (xskq_produce_addr_lazy(umem->cq, desc->addr))
@ -212,7 +310,8 @@ static int xsk_zc_xmit(struct sock *sk)
struct xdp_sock *xs = xdp_sk(sk);
struct net_device *dev = xs->dev;
return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
XDP_WAKEUP_TX);
}
static void xsk_destruct_skb(struct sk_buff *skb)
@ -243,7 +342,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
if (xs->queue_id >= xs->dev->real_num_tx_queues)
goto out;
while (xskq_peek_desc(xs->tx, &desc)) {
while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
char *buffer;
u64 addr;
u32 len;
@ -272,7 +371,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
skb->dev = xs->dev;
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
skb->destructor = xsk_destruct_skb;
err = dev_direct_xmit(skb, xs->queue_id);
@ -301,7 +400,7 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
if (unlikely(!xs->dev))
if (unlikely(!xsk_is_bound(xs)))
return -ENXIO;
if (unlikely(!(xs->dev->flags & IFF_UP)))
return -ENETDOWN;
@ -317,8 +416,19 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait)
{
unsigned int mask = datagram_poll(file, sock, wait);
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
struct xdp_sock *xs = xdp_sk(sock->sk);
struct net_device *dev;
struct xdp_umem *umem;
if (unlikely(!xsk_is_bound(xs)))
return mask;
dev = xs->dev;
umem = xs->umem;
if (umem->need_wakeup)
dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
umem->need_wakeup);
if (xs->rx && !xskq_empty_desc(xs->rx))
mask |= POLLIN | POLLRDNORM;
@ -342,7 +452,7 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
/* Make sure queue is ready before it can be seen by others */
smp_wmb();
*queue = q;
WRITE_ONCE(*queue, q);
return 0;
}
@ -350,10 +460,9 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
{
struct net_device *dev = xs->dev;
if (!dev || xs->state != XSK_BOUND)
if (xs->state != XSK_BOUND)
return;
xs->state = XSK_UNBOUND;
WRITE_ONCE(xs->state, XSK_UNBOUND);
/* Wait for driver to stop using the xdp socket. */
xdp_del_sk_umem(xs->umem, xs);
@ -362,6 +471,52 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
dev_put(dev);
}
static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
struct xdp_sock ***map_entry)
{
struct xsk_map *map = NULL;
struct xsk_map_node *node;
*map_entry = NULL;
spin_lock_bh(&xs->map_list_lock);
node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
node);
if (node) {
WARN_ON(xsk_map_inc(node->map));
map = node->map;
*map_entry = node->map_entry;
}
spin_unlock_bh(&xs->map_list_lock);
return map;
}
static void xsk_delete_from_maps(struct xdp_sock *xs)
{
/* This function removes the current XDP socket from all the
* maps it resides in. We need to take extra care here, due to
* the two locks involved. Each map has a lock synchronizing
* updates to the entries, and each socket has a lock that
* synchronizes access to the list of maps (map_list). For
* deadlock avoidance the locks need to be taken in the order
* "map lock"->"socket map list lock". We start off by
* accessing the socket map list, and take a reference to the
* map to guarantee existence between the
* xsk_get_map_list_entry() and xsk_map_try_sock_delete()
* calls. Then we ask the map to remove the socket, which
* tries to remove the socket from the map. Note that there
* might be updates to the map between
* xsk_get_map_list_entry() and xsk_map_try_sock_delete().
*/
struct xdp_sock **map_entry = NULL;
struct xsk_map *map;
while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
xsk_map_try_sock_delete(map, xs, map_entry);
xsk_map_put(map);
}
}
static int xsk_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@ -381,7 +536,10 @@ static int xsk_release(struct socket *sock)
sock_prot_inuse_add(net, sk->sk_prot, -1);
local_bh_enable();
xsk_delete_from_maps(xs);
mutex_lock(&xs->mutex);
xsk_unbind_dev(xs);
mutex_unlock(&xs->mutex);
xskq_destroy(xs->rx);
xskq_destroy(xs->tx);
@ -412,6 +570,24 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd)
return sock;
}
/* Check if umem pages are contiguous.
* In zero-copy mode, use the DMA address to do the page contiguity check.
* For all other modes we use addr (the kernel virtual address).
* Store the result in the low bits of addr.
*/
static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
{
struct xdp_umem_page *pgs = umem->pages;
int i, is_contig;
for (i = 0; i < umem->npgs - 1; i++) {
is_contig = (flags & XDP_ZEROCOPY) ?
(pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
(pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
}
}
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
@ -427,7 +603,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
return -EINVAL;
flags = sxdp->sxdp_flags;
if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY))
if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
XDP_USE_NEED_WAKEUP))
return -EINVAL;
rtnl_lock();
@ -454,7 +631,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
struct xdp_sock *umem_xs;
struct socket *sock;
if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
(flags & XDP_USE_NEED_WAKEUP)) {
/* Cannot specify flags for shared sockets. */
err = -EINVAL;
goto out_unlock;
@ -473,19 +651,19 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
}
umem_xs = xdp_sk(sock->sk);
if (!umem_xs->umem) {
/* No umem to inherit. */
if (!xsk_is_bound(umem_xs)) {
err = -EBADF;
sockfd_put(sock);
goto out_unlock;
} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
}
if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
err = -EINVAL;
sockfd_put(sock);
goto out_unlock;
}
xdp_get_umem(umem_xs->umem);
xs->umem = umem_xs->umem;
WRITE_ONCE(xs->umem, umem_xs->umem);
sockfd_put(sock);
} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
err = -EINVAL;
@ -500,6 +678,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
if (err)
goto out_unlock;
xsk_check_page_contiguity(xs->umem, flags);
}
xs->dev = dev;
@ -510,16 +690,28 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
xdp_add_sk_umem(xs->umem, xs);
out_unlock:
if (err)
if (err) {
dev_put(dev);
else
xs->state = XSK_BOUND;
} else {
/* Matches smp_rmb() in bind() for shared umem
* sockets, and xsk_is_bound().
*/
smp_wmb();
WRITE_ONCE(xs->state, XSK_BOUND);
}
out_release:
mutex_unlock(&xs->mutex);
rtnl_unlock();
return err;
}
struct xdp_umem_reg_v1 {
__u64 addr; /* Start of packet data area */
__u64 len; /* Length of packet data area */
__u32 chunk_size;
__u32 headroom;
};
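Old binaries keep passing the shorter v1 layout above; applications built against the new UAPI pass the full struct and may set the flags field. A user-space sketch (buf, size and fd are assumptions, not from this diff) of registering a umem in unaligned chunk mode:

/* Sketch: chunk_size no longer has to be a power of two here. */
struct xdp_umem_reg mr = {
	.addr       = (__u64)(uintptr_t)buf,   /* page-aligned, e.g. from mmap() */
	.len        = size,
	.chunk_size = 3072,
	.headroom   = 0,
	.flags      = XDP_UMEM_UNALIGNED_CHUNK_FLAG,
};
int err = setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));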
static int xsk_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
@ -549,15 +741,24 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
}
q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
err = xsk_init_queue(entries, q, false);
if (!err && optname == XDP_TX_RING)
/* Tx needs to be explicitly woken up the first time */
xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
mutex_unlock(&xs->mutex);
return err;
}
case XDP_UMEM_REG:
{
struct xdp_umem_reg mr;
size_t mr_size = sizeof(struct xdp_umem_reg);
struct xdp_umem_reg mr = {};
struct xdp_umem *umem;
if (copy_from_user(&mr, optval, sizeof(mr)))
if (optlen < sizeof(struct xdp_umem_reg_v1))
return -EINVAL;
else if (optlen < sizeof(mr))
mr_size = sizeof(struct xdp_umem_reg_v1);
if (copy_from_user(&mr, optval, mr_size))
return -EFAULT;
mutex_lock(&xs->mutex);
@ -574,7 +775,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
/* Make sure umem is ready before it can be seen by others */
smp_wmb();
xs->umem = umem;
WRITE_ONCE(xs->umem, umem);
mutex_unlock(&xs->mutex);
return 0;
}
@ -610,6 +811,20 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
return -ENOPROTOOPT;
}
static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}
static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
ring->desc = offsetof(struct xdp_umem_ring, desc);
}
static int xsk_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
@ -649,26 +864,49 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
case XDP_MMAP_OFFSETS:
{
struct xdp_mmap_offsets off;
struct xdp_mmap_offsets_v1 off_v1;
bool flags_supported = true;
void *to_copy;
if (len < sizeof(off))
if (len < sizeof(off_v1))
return -EINVAL;
else if (len < sizeof(off))
flags_supported = false;
off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
off.rx.desc = offsetof(struct xdp_rxtx_ring, desc);
off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
off.tx.desc = offsetof(struct xdp_rxtx_ring, desc);
if (flags_supported) {
/* xdp_ring_offset is identical to xdp_ring_offset_v1
* except for the flags field added to the end.
*/
xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
&off.rx);
xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
&off.tx);
xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
&off.fr);
xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
&off.cr);
off.rx.flags = offsetof(struct xdp_rxtx_ring,
ptrs.flags);
off.tx.flags = offsetof(struct xdp_rxtx_ring,
ptrs.flags);
off.fr.flags = offsetof(struct xdp_umem_ring,
ptrs.flags);
off.cr.flags = offsetof(struct xdp_umem_ring,
ptrs.flags);
off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
off.fr.desc = offsetof(struct xdp_umem_ring, desc);
off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
off.cr.desc = offsetof(struct xdp_umem_ring, desc);
len = sizeof(off);
to_copy = &off;
} else {
xsk_enter_rxtx_offsets(&off_v1.rx);
xsk_enter_rxtx_offsets(&off_v1.tx);
xsk_enter_umem_offsets(&off_v1.fr);
xsk_enter_umem_offsets(&off_v1.cr);
len = sizeof(off);
if (copy_to_user(optval, &off, len))
len = sizeof(off_v1);
to_copy = &off_v1;
}
if (copy_to_user(optval, to_copy, len))
return -EFAULT;
if (put_user(len, optlen))
return -EFAULT;
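From user space (sketch only, not part of this diff), a flags-aware application simply passes the larger structure and checks the returned length:

/* Sketch: query ring offsets, including the new flags members. */
struct xdp_mmap_offsets off;
socklen_t optlen = sizeof(off);

if (!getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen) &&
    optlen == sizeof(off)) {
	/* off.rx.flags, off.tx.flags, off.fr.flags and off.cr.flags are the
	 * offsets of the per-ring flags word carrying XDP_RING_NEED_WAKEUP.
	 */
}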
@ -713,7 +951,7 @@ static int xsk_mmap(struct file *file, struct socket *sock,
unsigned long pfn;
struct page *qpg;
if (xs->state != XSK_READY)
if (READ_ONCE(xs->state) != XSK_READY)
return -EBUSY;
if (offset == XDP_PGOFF_RX_RING) {
@ -855,6 +1093,9 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
spin_lock_init(&xs->rx_lock);
spin_lock_init(&xs->tx_completion_lock);
INIT_LIST_HEAD(&xs->map_list);
spin_lock_init(&xs->map_list_lock);
mutex_lock(&net->xdp.lock);
sk_add_node_rcu(sk, &net->xdp.list);
mutex_unlock(&net->xdp.lock);

View File

@ -4,6 +4,19 @@
#ifndef XSK_H_
#define XSK_H_
struct xdp_ring_offset_v1 {
__u64 producer;
__u64 consumer;
__u64 desc;
};
struct xdp_mmap_offsets_v1 {
struct xdp_ring_offset_v1 rx;
struct xdp_ring_offset_v1 tx;
struct xdp_ring_offset_v1 fr;
struct xdp_ring_offset_v1 cr;
};
static inline struct xdp_sock *xdp_sk(struct sock *sk)
{
return (struct xdp_sock *)sk;

View File

@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
du.id = umem->id;
du.size = umem->size;
du.num_pages = umem->npgs;
du.chunk_size = (__u32)(~umem->chunk_mask + 1);
du.chunk_size = umem->chunk_size_nohr + umem->headroom;
du.headroom = umem->headroom;
du.ifindex = umem->dev ? umem->dev->ifindex : 0;
du.queue_id = umem->queue_id;
@ -97,6 +97,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
msg->xdiag_ino = sk_ino;
sock_diag_save_cookie(sk, msg->xdiag_cookie);
mutex_lock(&xs->mutex);
if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb))
goto out_nlmsg_trim;
@ -117,10 +118,12 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO))
goto out_nlmsg_trim;
mutex_unlock(&xs->mutex);
nlmsg_end(nlskb, nlh);
return 0;
out_nlmsg_trim:
mutex_unlock(&xs->mutex);
nlmsg_cancel(nlskb, nlh);
return -EMSGSIZE;
}

View File

@ -16,6 +16,7 @@
struct xdp_ring {
u32 producer ____cacheline_aligned_in_smp;
u32 consumer ____cacheline_aligned_in_smp;
u32 flags;
};
/* Used for the RX and TX queues for packets */
@ -133,6 +134,17 @@ static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt)
/* UMEM queue */
static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr,
u64 length)
{
bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE;
bool next_pg_contig =
(unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr &
XSK_NEXT_PG_CONTIG_MASK;
return cross_pg && !next_pg_contig;
}
static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
{
if (addr >= q->size) {
@ -143,23 +155,51 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
return true;
}
static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr,
u64 length,
struct xdp_umem *umem)
{
u64 base_addr = xsk_umem_extract_addr(addr);
addr = xsk_umem_add_offset_to_addr(addr);
if (base_addr >= q->size || addr >= q->size ||
xskq_crosses_non_contig_pg(umem, addr, length)) {
q->invalid_descs++;
return false;
}
return true;
}
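In unaligned chunk mode a ring address is therefore a compound value: the low 48 bits carry the chunk base address and the upper 16 bits an offset into the chunk. A user-space sketch (umem_area and idx are assumptions) using the libbpf helpers the xdpsock sample below relies on:

/* Sketch: split an unaligned-mode RX descriptor address. */
__u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx)->addr;
__u64 base = xsk_umem__extract_addr(addr);  /* what goes back on the fill ring */
char *pkt  = xsk_umem__get_data(umem_area, xsk_umem__add_offset_to_addr(addr));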
static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr,
struct xdp_umem *umem)
{
while (q->cons_tail != q->cons_head) {
struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
unsigned int idx = q->cons_tail & q->ring_mask;
*addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask;
if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
if (xskq_is_valid_addr_unaligned(q, *addr,
umem->chunk_size_nohr,
umem))
return addr;
goto out;
}
if (xskq_is_valid_addr(q, *addr))
return addr;
out:
q->cons_tail++;
}
return NULL;
}
static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr,
struct xdp_umem *umem)
{
if (q->cons_tail == q->cons_head) {
smp_mb(); /* D, matches A */
@ -170,7 +210,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
smp_rmb();
}
return xskq_validate_addr(q, addr);
return xskq_validate_addr(q, addr, umem);
}
static inline void xskq_discard_addr(struct xsk_queue *q)
@ -229,8 +269,21 @@ static inline int xskq_reserve_addr(struct xsk_queue *q)
/* Rx/Tx queue */
static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d,
struct xdp_umem *umem)
{
if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
if (!xskq_is_valid_addr_unaligned(q, d->addr, d->len, umem))
return false;
if (d->len > umem->chunk_size_nohr || d->options) {
q->invalid_descs++;
return false;
}
return true;
}
if (!xskq_is_valid_addr(q, d->addr))
return false;
@ -244,14 +297,15 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
}
static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
struct xdp_desc *desc)
struct xdp_desc *desc,
struct xdp_umem *umem)
{
while (q->cons_tail != q->cons_head) {
struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
unsigned int idx = q->cons_tail & q->ring_mask;
*desc = READ_ONCE(ring->desc[idx]);
if (xskq_is_valid_desc(q, desc))
if (xskq_is_valid_desc(q, desc, umem))
return desc;
q->cons_tail++;
@ -261,7 +315,8 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
}
static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
struct xdp_desc *desc)
struct xdp_desc *desc,
struct xdp_umem *umem)
{
if (q->cons_tail == q->cons_head) {
smp_mb(); /* D, matches A */
@ -272,7 +327,7 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
smp_rmb(); /* C, matches B */
}
return xskq_validate_desc(q, desc);
return xskq_validate_desc(q, desc, umem);
}
static inline void xskq_discard_desc(struct xsk_queue *q)

View File

@ -9,5 +9,11 @@ void syscall_defines(void)
COMMENT("Linux system call numbers.");
SYSNR(__NR_write);
SYSNR(__NR_read);
#ifdef __NR_mmap2
SYSNR(__NR_mmap2);
#endif
#ifdef __NR_mmap
SYSNR(__NR_mmap);
#endif
}

View File

@ -68,12 +68,25 @@ PROG(SYS__NR_read)(struct pt_regs *ctx)
return 0;
}
PROG(SYS__NR_mmap)(struct pt_regs *ctx)
#ifdef __NR_mmap2
PROG(SYS__NR_mmap2)(struct pt_regs *ctx)
{
char fmt[] = "mmap\n";
char fmt[] = "mmap2\n";
bpf_trace_printk(fmt, sizeof(fmt));
return 0;
}
#endif
#ifdef __NR_mmap
PROG(SYS__NR_mmap)(struct pt_regs *ctx)
{
char fmt[] = "mmap\n";
bpf_trace_printk(fmt, sizeof(fmt));
return 0;
}
#endif
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;

View File

@ -67,8 +67,14 @@ static int opt_ifindex;
static int opt_queue;
static int opt_poll;
static int opt_interval = 1;
static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
static u32 opt_umem_flags;
static int opt_unaligned_chunks;
static int opt_mmap_flags;
static u32 opt_xdp_bind_flags;
static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
static int opt_timeout = 1000;
static bool opt_need_wakeup = true;
static __u32 prog_id;
struct xsk_umem_info {
@ -282,7 +288,9 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
.frame_size = opt_xsk_frame_size,
.frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
.flags = opt_umem_flags
};
int ret;
umem = calloc(1, sizeof(*umem));
@ -291,6 +299,7 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
&cfg);
if (ret)
exit_with_error(-ret);
@ -352,6 +361,8 @@ static struct option long_options[] = {
{"zero-copy", no_argument, 0, 'z'},
{"copy", no_argument, 0, 'c'},
{"frame-size", required_argument, 0, 'f'},
{"no-need-wakeup", no_argument, 0, 'm'},
{"unaligned", no_argument, 0, 'u'},
{0, 0, 0, 0}
};
@ -372,6 +383,9 @@ static void usage(const char *prog)
" -z, --zero-copy Force zero-copy mode.\n"
" -c, --copy Force copy mode.\n"
" -f, --frame-size=n Set the frame size (must be a power of two, default is %d).\n"
" -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n"
" -f, --frame-size=n Set the frame size (must be a power of two in aligned mode, default is %d).\n"
" -u, --unaligned Enable unaligned chunk placement\n"
"\n";
fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE);
exit(EXIT_FAILURE);
@ -384,8 +398,8 @@ static void parse_command_line(int argc, char **argv)
opterr = 0;
for (;;) {
c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:", long_options,
&option_index);
c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:mu",
long_options, &option_index);
if (c == -1)
break;
@ -424,12 +438,21 @@ static void parse_command_line(int argc, char **argv)
case 'c':
opt_xdp_bind_flags |= XDP_COPY;
break;
case 'u':
opt_umem_flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG;
opt_unaligned_chunks = 1;
opt_mmap_flags = MAP_HUGETLB;
break;
case 'F':
opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
break;
case 'f':
opt_xsk_frame_size = atoi(optarg);
break;
case 'm':
opt_need_wakeup = false;
opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP;
break;
default:
usage(basename(argv[0]));
}
@ -442,7 +465,8 @@ static void parse_command_line(int argc, char **argv)
usage(basename(argv[0]));
}
if (opt_xsk_frame_size & (opt_xsk_frame_size - 1)) {
if ((opt_xsk_frame_size & (opt_xsk_frame_size - 1)) &&
!opt_unaligned_chunks) {
fprintf(stderr, "--frame-size=%d is not a power of two\n",
opt_xsk_frame_size);
usage(basename(argv[0]));
@ -459,8 +483,10 @@ static void kick_tx(struct xsk_socket_info *xsk)
exit_with_error(errno);
}
static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
struct pollfd *fds)
{
struct xsk_umem_info *umem = xsk->umem;
u32 idx_cq = 0, idx_fq = 0;
unsigned int rcvd;
size_t ndescs;
@ -468,27 +494,30 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
if (!xsk->outstanding_tx)
return;
kick_tx(xsk);
if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
kick_tx(xsk);
ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE :
xsk->outstanding_tx;
/* re-add completed Tx buffers */
rcvd = xsk_ring_cons__peek(&xsk->umem->cq, ndescs, &idx_cq);
rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq);
if (rcvd > 0) {
unsigned int i;
int ret;
ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd,
&idx_fq);
if (xsk_ring_prod__needs_wakeup(&umem->fq))
ret = poll(fds, num_socks, opt_timeout);
ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
}
for (i = 0; i < rcvd; i++)
*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) =
*xsk_ring_cons__comp_addr(&xsk->umem->cq,
idx_cq++);
*xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) =
*xsk_ring_cons__comp_addr(&umem->cq, idx_cq++);
xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
xsk_ring_cons__release(&xsk->umem->cq, rcvd);
@ -505,7 +534,8 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk)
if (!xsk->outstanding_tx)
return;
kick_tx(xsk);
if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
kick_tx(xsk);
rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx);
if (rcvd > 0) {
@ -515,30 +545,38 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk)
}
}
static void rx_drop(struct xsk_socket_info *xsk)
static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
{
unsigned int rcvd, i;
u32 idx_rx = 0, idx_fq = 0;
int ret;
rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
if (!rcvd)
if (!rcvd) {
if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
ret = poll(fds, num_socks, opt_timeout);
return;
}
ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
ret = poll(fds, num_socks, opt_timeout);
ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
}
for (i = 0; i < rcvd; i++) {
u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
u64 orig = xsk_umem__extract_addr(addr);
addr = xsk_umem__add_offset_to_addr(addr);
char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
hex_dump(pkt, len, addr);
*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = addr;
*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig;
}
xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
@ -549,42 +587,65 @@ static void rx_drop(struct xsk_socket_info *xsk)
static void rx_drop_all(void)
{
struct pollfd fds[MAX_SOCKS + 1];
int i, ret, timeout, nfds = 1;
int i, ret;
memset(fds, 0, sizeof(fds));
for (i = 0; i < num_socks; i++) {
fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
fds[i].events = POLLIN;
timeout = 1000; /* 1sn */
}
for (;;) {
if (opt_poll) {
ret = poll(fds, nfds, timeout);
ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;
}
for (i = 0; i < num_socks; i++)
rx_drop(xsks[i]);
rx_drop(xsks[i], fds);
}
}
static void tx_only(struct xsk_socket_info *xsk)
static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb)
{
int timeout, ret, nfds = 1;
struct pollfd fds[nfds + 1];
u32 idx, frame_nb = 0;
u32 idx;
if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == BATCH_SIZE) {
unsigned int i;
for (i = 0; i < BATCH_SIZE; i++) {
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr =
(frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
sizeof(pkt_data) - 1;
}
xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
xsk->outstanding_tx += BATCH_SIZE;
frame_nb += BATCH_SIZE;
frame_nb %= NUM_FRAMES;
}
complete_tx_only(xsk);
}
static void tx_only_all(void)
{
struct pollfd fds[MAX_SOCKS];
u32 frame_nb[MAX_SOCKS] = {};
int i, ret;
memset(fds, 0, sizeof(fds));
fds[0].fd = xsk_socket__fd(xsk->xsk);
fds[0].events = POLLOUT;
timeout = 1000; /* 1sn */
for (i = 0; i < num_socks; i++) {
fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
fds[i].events = POLLOUT;
}
for (;;) {
if (opt_poll) {
ret = poll(fds, nfds, timeout);
ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;
@ -592,69 +653,78 @@ static void tx_only(struct xsk_socket_info *xsk)
continue;
}
if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) ==
BATCH_SIZE) {
unsigned int i;
for (i = 0; i < BATCH_SIZE; i++) {
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr
= (frame_nb + i) * opt_xsk_frame_size;
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
sizeof(pkt_data) - 1;
}
xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
xsk->outstanding_tx += BATCH_SIZE;
frame_nb += BATCH_SIZE;
frame_nb %= NUM_FRAMES;
}
complete_tx_only(xsk);
for (i = 0; i < num_socks; i++)
tx_only(xsks[i], frame_nb[i]);
}
}
static void l2fwd(struct xsk_socket_info *xsk)
static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
{
for (;;) {
unsigned int rcvd, i;
u32 idx_rx = 0, idx_tx = 0;
int ret;
unsigned int rcvd, i;
u32 idx_rx = 0, idx_tx = 0;
int ret;
for (;;) {
complete_tx_l2fwd(xsk);
complete_tx_l2fwd(xsk, fds);
rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE,
&idx_rx);
if (rcvd > 0)
break;
}
rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
if (!rcvd) {
if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
ret = poll(fds, num_socks, opt_timeout);
return;
}
ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
if (xsk_ring_prod__needs_wakeup(&xsk->tx))
kick_tx(xsk);
ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
}
for (i = 0; i < rcvd; i++) {
u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
u64 orig = xsk_umem__extract_addr(addr);
addr = xsk_umem__add_offset_to_addr(addr);
char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
swap_mac_addresses(pkt);
hex_dump(pkt, len, addr);
xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = orig;
xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
}
xsk_ring_prod__submit(&xsk->tx, rcvd);
xsk_ring_cons__release(&xsk->rx, rcvd);
xsk->rx_npkts += rcvd;
xsk->outstanding_tx += rcvd;
}
static void l2fwd_all(void)
{
struct pollfd fds[MAX_SOCKS];
int i, ret;
memset(fds, 0, sizeof(fds));
for (i = 0; i < num_socks; i++) {
fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
fds[i].events = POLLOUT | POLLIN;
}
for (;;) {
if (opt_poll) {
ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;
}
for (i = 0; i < rcvd; i++) {
u64 addr = xsk_ring_cons__rx_desc(&xsk->rx,
idx_rx)->addr;
u32 len = xsk_ring_cons__rx_desc(&xsk->rx,
idx_rx++)->len;
char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
swap_mac_addresses(pkt);
hex_dump(pkt, len, addr);
xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr;
xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
}
xsk_ring_prod__submit(&xsk->tx, rcvd);
xsk_ring_cons__release(&xsk->rx, rcvd);
xsk->rx_npkts += rcvd;
xsk->outstanding_tx += rcvd;
for (i = 0; i < num_socks; i++)
l2fwd(xsks[i], fds);
}
}
@ -674,11 +744,14 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}
ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
NUM_FRAMES * opt_xsk_frame_size);
if (ret)
exit_with_error(ret);
/* Reserve memory for the umem. Use hugepages if unaligned chunk mode */
bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | opt_mmap_flags, -1, 0);
if (bufs == MAP_FAILED) {
printf("ERROR: mmap failed\n");
exit(EXIT_FAILURE);
}
/* Create sockets... */
umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size);
xsks[num_socks++] = xsk_configure_socket(umem);
@ -705,9 +778,9 @@ int main(int argc, char **argv)
if (opt_bench == BENCH_RXDROP)
rx_drop_all();
else if (opt_bench == BENCH_TXONLY)
tx_only(xsks[0]);
tx_only_all();
else
l2fwd(xsks[0]);
l2fwd_all();
return 0;
}

View File

@ -115,10 +115,12 @@ gen_btf()
LLVM_OBJCOPY=${OBJCOPY} ${PAHOLE} -J ${1}
# dump .BTF section into raw binary file to link with final vmlinux
bin_arch=$(${OBJDUMP} -f ${1} | grep architecture | \
bin_arch=$(LANG=C ${OBJDUMP} -f ${1} | grep architecture | \
cut -d, -f1 | cut -d' ' -f2)
bin_format=$(LANG=C ${OBJDUMP} -f ${1} | grep 'file format' | \
awk '{print $4}')
${OBJCOPY} --dump-section .BTF=.btf.vmlinux.bin ${1} 2>/dev/null
${OBJCOPY} -I binary -O ${CONFIG_OUTPUT_FORMAT} -B ${bin_arch} \
${OBJCOPY} -I binary -O ${bin_format} -B ${bin_arch} \
--rename-section .data=.BTF .btf.vmlinux.bin ${2}
}

View File

@ -1,4 +1,5 @@
FEATURE-DUMP.bpf
feature
bpf_asm
bpf_dbg
bpf_exp.yacc.*

View File

@ -81,10 +81,11 @@ $(OUTPUT)bpf_exp.lex.o: $(OUTPUT)bpf_exp.lex.c
clean: bpftool_clean
$(call QUIET_CLEAN, bpf-progs)
$(Q)rm -rf $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \
$(Q)$(RM) -r -- $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \
$(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.*
$(call QUIET_CLEAN, core-gen)
$(Q)rm -f $(OUTPUT)FEATURE-DUMP.bpf
$(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpf
$(Q)$(RM) -r -- $(OUTPUT)feature
install: $(PROGS) bpftool_install
$(call QUIET_INSTALL, bpf_jit_disasm)

View File

@ -3,3 +3,5 @@
bpftool*.8
bpf-helpers.*
FEATURE-DUMP.bpftool
feature
libbpf

View File

@ -19,6 +19,7 @@ SYNOPSIS
BTF COMMANDS
=============
| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*]
| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*]
| **bpftool** **btf help**
|
@ -29,6 +30,12 @@ BTF COMMANDS
DESCRIPTION
===========
**bpftool btf { show | list }** [**id** *BTF_ID*]
Show information about loaded BTF objects. If a BTF ID is
specified, show information only about the given BTF object,
otherwise list all BTF objects currently loaded on the
system.
**bpftool btf dump** *BTF_SRC*
Dump BTF entries from a given *BTF_SRC*.

View File

@ -36,6 +36,7 @@ MAP COMMANDS
| **bpftool** **map pop** *MAP*
| **bpftool** **map enqueue** *MAP* **value** *VALUE*
| **bpftool** **map dequeue** *MAP*
| **bpftool** **map freeze** *MAP*
| **bpftool** **map help**
|
| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
@ -127,6 +128,14 @@ DESCRIPTION
**bpftool map dequeue** *MAP*
Dequeue and print **value** from the queue.
**bpftool map freeze** *MAP*
Freeze the map as read-only from user space. Entries from a
frozen map can no longer be updated or deleted with the
**bpf\ ()** system call. This operation is not reversible,
and the map remains immutable from user space until its
destruction. However, read and write permissions for BPF
programs to the map remain unchanged.
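A typical invocation (the map id is purely illustrative) is **bpftool map freeze id 42**; afterwards, user-space updates such as **bpftool map update** on that map are rejected by the kernel, while BPF programs keep their configured access.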
**bpftool map help**
Print short help message.

View File

@ -15,17 +15,22 @@ SYNOPSIS
*OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
*COMMANDS* :=
{ **show** | **list** } [ **dev** name ] | **help**
{ **show** | **list** | **attach** | **detach** | **help** }
NET COMMANDS
============
| **bpftool** **net { show | list } [ dev name ]**
| **bpftool** **net { show | list }** [ **dev** *NAME* ]
| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
| **bpftool** **net help**
|
| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** }
DESCRIPTION
===========
**bpftool net { show | list } [ dev name ]**
**bpftool net { show | list }** [ **dev** *NAME* ]
List bpf program attachments in the kernel networking subsystem.
Currently, only device driver xdp attachments and tc filter
@ -47,6 +52,24 @@ DESCRIPTION
all bpf programs attached to non clsact qdiscs, and finally all
bpf programs attached to root and clsact qdisc.
**bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
Attach bpf program *PROG* to network interface *NAME* with
type specified by *ATTACH_TYPE*. A previously attached bpf
program can be replaced by using the **overwrite** option.
Currently, only XDP-related modes are supported for *ATTACH_TYPE*.
*ATTACH_TYPE* can be one of:
**xdp** - try native XDP and fall back to generic XDP if the NIC driver does not support it;
**xdpgeneric** - generic XDP, runs at the generic XDP hook after the packet has already entered the receive path as an skb;
**xdpdrv** - native XDP, runs at the earliest point in the driver's receive path;
**xdpoffload** - offloaded XDP, runs directly on the NIC for each received packet;
**bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
Detach the bpf program attached to network interface *NAME* with
type specified by *ATTACH_TYPE*. To detach the bpf program, the
same *ATTACH_TYPE* that was used to attach it must be specified.
Currently, only XDP-related modes are supported for *ATTACH_TYPE*.
**bpftool net help**
Print short help message.
@ -137,6 +160,34 @@ EXAMPLES
}
]
|
| **# bpftool net attach xdpdrv id 16 dev enp6s0np0**
| **# bpftool net**
::
xdp:
enp6s0np0(4) driver id 16
|
| **# bpftool net attach xdpdrv id 16 dev enp6s0np0**
| **# bpftool net attach xdpdrv id 20 dev enp6s0np0 overwrite**
| **# bpftool net**
::
xdp:
enp6s0np0(4) driver id 20
|
| **# bpftool net attach xdpdrv id 16 dev enp6s0np0**
| **# bpftool net detach xdpdrv dev enp6s0np0**
| **# bpftool net**
::
xdp:
SEE ALSO
========

View File

@ -17,27 +17,30 @@ endif
BPF_DIR = $(srctree)/tools/lib/bpf/
ifneq ($(OUTPUT),)
BPF_PATH = $(OUTPUT)
LIBBPF_OUTPUT = $(OUTPUT)/libbpf/
LIBBPF_PATH = $(LIBBPF_OUTPUT)
else
BPF_PATH = $(BPF_DIR)
LIBBPF_PATH = $(BPF_DIR)
endif
LIBBPF = $(BPF_PATH)libbpf.a
LIBBPF = $(LIBBPF_PATH)libbpf.a
BPFTOOL_VERSION := $(shell make --no-print-directory -sC ../../.. kernelversion)
BPFTOOL_VERSION := $(shell make -rR --no-print-directory -sC ../../.. kernelversion)
$(LIBBPF): FORCE
$(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(OUTPUT) $(OUTPUT)libbpf.a
$(if $(LIBBPF_OUTPUT),@mkdir -p $(LIBBPF_OUTPUT))
$(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) $(LIBBPF_OUTPUT)libbpf.a
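(Illustration, not part of the patch: with these rules, an out-of-tree build such as make OUTPUT=/tmp/bpftool/ -C tools/bpf/bpftool now places the generated libbpf.a under /tmp/bpftool/libbpf/ instead of dropping build artifacts into the libbpf source tree.)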
$(LIBBPF)-clean:
$(call QUIET_CLEAN, libbpf)
$(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(OUTPUT) clean >/dev/null
$(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) clean >/dev/null
prefix ?= /usr/local
bash_compdir ?= /usr/share/bash-completion/completions
CFLAGS += -O2
CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-initializers
CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers
CFLAGS += $(filter-out -Wswitch-enum,$(EXTRA_WARNINGS))
CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \
-I$(srctree)/kernel/bpf/ \
-I$(srctree)/tools/include \
@ -52,7 +55,7 @@ ifneq ($(EXTRA_LDFLAGS),)
LDFLAGS += $(EXTRA_LDFLAGS)
endif
LIBS = -lelf -lz $(LIBBPF)
LIBS = $(LIBBPF) -lelf -lz
INSTALL ?= install
RM ?= rm -f
@ -114,16 +117,18 @@ $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
$(OUTPUT)feature.o: | zdep
$(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS)
$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJS) $(LIBS)
$(OUTPUT)%.o: %.c
$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
clean: $(LIBBPF)-clean
$(call QUIET_CLEAN, bpftool)
$(Q)$(RM) $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
$(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
$(Q)$(RM) -r -- $(OUTPUT)libbpf/
$(call QUIET_CLEAN, core-gen)
$(Q)$(RM) $(OUTPUT)FEATURE-DUMP.bpftool
$(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpftool
$(Q)$(RM) -r -- $(OUTPUT)feature/
install: $(OUTPUT)bpftool
$(call QUIET_INSTALL, bpftool)
@ -134,8 +139,8 @@ install: $(OUTPUT)bpftool
uninstall:
$(call QUIET_UNINST, bpftool)
$(Q)$(RM) $(DESTDIR)$(prefix)/sbin/bpftool
$(Q)$(RM) $(DESTDIR)$(bash_compdir)/bpftool
$(Q)$(RM) -- $(DESTDIR)$(prefix)/sbin/bpftool
$(Q)$(RM) -- $(DESTDIR)$(bash_compdir)/bpftool
doc:
$(call descend,Documentation)

View File

@ -73,8 +73,8 @@ _bpftool_get_prog_tags()
_bpftool_get_btf_ids()
{
COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \
command sed -n 's/.*"btf_id": \(.*\),\?$/\1/p' )" -- "$cur" ) )
COMPREPLY+=( $( compgen -W "$( bpftool -jp btf 2>&1 | \
command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
}
_bpftool_get_obj_map_names()
@ -201,6 +201,10 @@ _bpftool()
_bpftool_get_prog_tags
return 0
;;
dev)
_sysfs_get_netdevs
return 0
;;
file|pinned)
_filedir
return 0
@ -399,10 +403,6 @@ _bpftool()
_filedir
return 0
;;
dev)
_sysfs_get_netdevs
return 0
;;
*)
COMPREPLY=( $( compgen -W "map" -- "$cur" ) )
_bpftool_once_attr 'type'
@ -449,7 +449,7 @@ _bpftool()
map)
local MAP_TYPE='id pinned'
case $command in
show|list|dump|peek|pop|dequeue)
show|list|dump|peek|pop|dequeue|freeze)
case $prev in
$command)
COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) )
@ -498,10 +498,6 @@ _bpftool()
key|value|flags|name|entries)
return 0
;;
dev)
_sysfs_get_netdevs
return 0
;;
*)
_bpftool_once_attr 'type'
_bpftool_once_attr 'key'
@ -642,7 +638,7 @@ _bpftool()
[[ $prev == $object ]] && \
COMPREPLY=( $( compgen -W 'delete dump getnext help \
lookup pin event_pipe show list update create \
peek push enqueue pop dequeue' -- \
peek push enqueue pop dequeue freeze' -- \
"$cur" ) )
;;
esac
@ -674,7 +670,7 @@ _bpftool()
map)
_bpftool_get_map_ids
;;
dump)
$command)
_bpftool_get_btf_ids
;;
esac
@ -702,9 +698,21 @@ _bpftool()
;;
esac
;;
show|list)
case $prev in
$command)
COMPREPLY+=( $( compgen -W "id" -- "$cur" ) )
;;
id)
_bpftool_get_btf_ids
;;
esac
return 0
;;
*)
[[ $prev == $object ]] && \
COMPREPLY=( $( compgen -W 'dump help' -- "$cur" ) )
COMPREPLY=( $( compgen -W 'dump help show list' \
-- "$cur" ) )
;;
esac
;;
@ -778,18 +786,67 @@ _bpftool()
esac
;;
net)
local PROG_TYPE='id pinned tag'
local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload'
case $command in
show|list)
[[ $prev != "$command" ]] && return 0
COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) )
return 0
;;
attach)
case $cword in
3)
COMPREPLY=( $( compgen -W "$ATTACH_TYPES" -- "$cur" ) )
return 0
;;
4)
COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) )
return 0
;;
5)
case $prev in
id)
_bpftool_get_prog_ids
;;
pinned)
_filedir
;;
esac
return 0
;;
6)
COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) )
return 0
;;
8)
_bpftool_once_attr 'overwrite'
return 0
;;
esac
;;
detach)
case $cword in
3)
COMPREPLY=( $( compgen -W "$ATTACH_TYPES" -- "$cur" ) )
return 0
;;
4)
COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) )
return 0
;;
esac
;;
*)
[[ $prev == $object ]] && \
COMPREPLY=( $( compgen -W 'help \
show list' -- "$cur" ) )
show list attach detach' -- "$cur" ) )
;;
esac
;;
feature)
case $command in
probe)
[[ $prev == "dev" ]] && _sysfs_get_netdevs && return 0
[[ $prev == "prefix" ]] && return 0
if _bpftool_search_list 'macros'; then
COMPREPLY+=( $( compgen -W 'prefix' -- "$cur" ) )

View File

@ -11,6 +11,7 @@
#include <bpf.h>
#include <libbpf.h>
#include <linux/btf.h>
#include <linux/hashtable.h>
#include "btf.h"
#include "json_writer.h"
@ -35,6 +36,16 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_DATASEC] = "DATASEC",
};
struct btf_attach_table {
DECLARE_HASHTABLE(table, 16);
};
struct btf_attach_point {
__u32 obj_id;
__u32 btf_id;
struct hlist_node hash;
};
static const char *btf_int_enc_str(__u8 encoding)
{
switch (encoding) {
@ -449,7 +460,7 @@ static int do_dump(int argc, char **argv)
btf_id = strtoul(*argv, &endptr, 0);
if (*endptr) {
p_err("can't parse %s as ID", **argv);
p_err("can't parse %s as ID", *argv);
return -1;
}
NEXT_ARG();
@ -522,6 +533,330 @@ done:
return err;
}
static int btf_parse_fd(int *argc, char ***argv)
{
unsigned int id;
char *endptr;
int fd;
if (!is_prefix(*argv[0], "id")) {
p_err("expected 'id', got: '%s'?", **argv);
return -1;
}
NEXT_ARGP();
id = strtoul(**argv, &endptr, 0);
if (*endptr) {
p_err("can't parse %s as ID", **argv);
return -1;
}
NEXT_ARGP();
fd = bpf_btf_get_fd_by_id(id);
if (fd < 0)
p_err("can't get BTF object by id (%u): %s",
id, strerror(errno));
return fd;
}
static void delete_btf_table(struct btf_attach_table *tab)
{
struct btf_attach_point *obj;
struct hlist_node *tmp;
unsigned int bkt;
hash_for_each_safe(tab->table, bkt, tmp, obj, hash) {
hash_del(&obj->hash);
free(obj);
}
}
static int
build_btf_type_table(struct btf_attach_table *tab, enum bpf_obj_type type,
void *info, __u32 *len)
{
static const char * const names[] = {
[BPF_OBJ_UNKNOWN] = "unknown",
[BPF_OBJ_PROG] = "prog",
[BPF_OBJ_MAP] = "map",
};
struct btf_attach_point *obj_node;
__u32 btf_id, id = 0;
int err;
int fd;
while (true) {
switch (type) {
case BPF_OBJ_PROG:
err = bpf_prog_get_next_id(id, &id);
break;
case BPF_OBJ_MAP:
err = bpf_map_get_next_id(id, &id);
break;
default:
err = -1;
p_err("unexpected object type: %d", type);
goto err_free;
}
if (err) {
if (errno == ENOENT) {
err = 0;
break;
}
p_err("can't get next %s: %s%s", names[type],
strerror(errno),
errno == EINVAL ? " -- kernel too old?" : "");
goto err_free;
}
switch (type) {
case BPF_OBJ_PROG:
fd = bpf_prog_get_fd_by_id(id);
break;
case BPF_OBJ_MAP:
fd = bpf_map_get_fd_by_id(id);
break;
default:
err = -1;
p_err("unexpected object type: %d", type);
goto err_free;
}
if (fd < 0) {
if (errno == ENOENT)
continue;
p_err("can't get %s by id (%u): %s", names[type], id,
strerror(errno));
err = -1;
goto err_free;
}
memset(info, 0, *len);
err = bpf_obj_get_info_by_fd(fd, info, len);
close(fd);
if (err) {
p_err("can't get %s info: %s", names[type],
strerror(errno));
goto err_free;
}
switch (type) {
case BPF_OBJ_PROG:
btf_id = ((struct bpf_prog_info *)info)->btf_id;
break;
case BPF_OBJ_MAP:
btf_id = ((struct bpf_map_info *)info)->btf_id;
break;
default:
err = -1;
p_err("unexpected object type: %d", type);
goto err_free;
}
if (!btf_id)
continue;
obj_node = calloc(1, sizeof(*obj_node));
if (!obj_node) {
p_err("failed to allocate memory: %s", strerror(errno));
goto err_free;
}
obj_node->obj_id = id;
obj_node->btf_id = btf_id;
hash_add(tab->table, &obj_node->hash, obj_node->btf_id);
}
return 0;
err_free:
delete_btf_table(tab);
return err;
}
static int
build_btf_tables(struct btf_attach_table *btf_prog_table,
struct btf_attach_table *btf_map_table)
{
struct bpf_prog_info prog_info;
__u32 prog_len = sizeof(prog_info);
struct bpf_map_info map_info;
__u32 map_len = sizeof(map_info);
int err = 0;
err = build_btf_type_table(btf_prog_table, BPF_OBJ_PROG, &prog_info,
&prog_len);
if (err)
return err;
err = build_btf_type_table(btf_map_table, BPF_OBJ_MAP, &map_info,
&map_len);
if (err) {
delete_btf_table(btf_prog_table);
return err;
}
return 0;
}
static void
show_btf_plain(struct bpf_btf_info *info, int fd,
struct btf_attach_table *btf_prog_table,
struct btf_attach_table *btf_map_table)
{
struct btf_attach_point *obj;
int n;
printf("%u: ", info->id);
printf("size %uB", info->btf_size);
n = 0;
hash_for_each_possible(btf_prog_table->table, obj, hash, info->id) {
if (obj->btf_id == info->id)
printf("%s%u", n++ == 0 ? " prog_ids " : ",",
obj->obj_id);
}
n = 0;
hash_for_each_possible(btf_map_table->table, obj, hash, info->id) {
if (obj->btf_id == info->id)
printf("%s%u", n++ == 0 ? " map_ids " : ",",
obj->obj_id);
}
printf("\n");
}
static void
show_btf_json(struct bpf_btf_info *info, int fd,
struct btf_attach_table *btf_prog_table,
struct btf_attach_table *btf_map_table)
{
struct btf_attach_point *obj;
jsonw_start_object(json_wtr); /* btf object */
jsonw_uint_field(json_wtr, "id", info->id);
jsonw_uint_field(json_wtr, "size", info->btf_size);
jsonw_name(json_wtr, "prog_ids");
jsonw_start_array(json_wtr); /* prog_ids */
hash_for_each_possible(btf_prog_table->table, obj, hash,
info->id) {
if (obj->btf_id == info->id)
jsonw_uint(json_wtr, obj->obj_id);
}
jsonw_end_array(json_wtr); /* prog_ids */
jsonw_name(json_wtr, "map_ids");
jsonw_start_array(json_wtr); /* map_ids */
hash_for_each_possible(btf_map_table->table, obj, hash,
info->id) {
if (obj->btf_id == info->id)
jsonw_uint(json_wtr, obj->obj_id);
}
jsonw_end_array(json_wtr); /* map_ids */
jsonw_end_object(json_wtr); /* btf object */
}
static int
show_btf(int fd, struct btf_attach_table *btf_prog_table,
struct btf_attach_table *btf_map_table)
{
struct bpf_btf_info info = {};
__u32 len = sizeof(info);
int err;
err = bpf_obj_get_info_by_fd(fd, &info, &len);
if (err) {
p_err("can't get BTF object info: %s", strerror(errno));
return -1;
}
if (json_output)
show_btf_json(&info, fd, btf_prog_table, btf_map_table);
else
show_btf_plain(&info, fd, btf_prog_table, btf_map_table);
return 0;
}
static int do_show(int argc, char **argv)
{
struct btf_attach_table btf_prog_table;
struct btf_attach_table btf_map_table;
int err, fd = -1;
__u32 id = 0;
if (argc == 2) {
fd = btf_parse_fd(&argc, &argv);
if (fd < 0)
return -1;
}
if (argc) {
if (fd >= 0)
close(fd);
return BAD_ARG();
}
hash_init(btf_prog_table.table);
hash_init(btf_map_table.table);
err = build_btf_tables(&btf_prog_table, &btf_map_table);
if (err) {
if (fd >= 0)
close(fd);
return err;
}
if (fd >= 0) {
err = show_btf(fd, &btf_prog_table, &btf_map_table);
close(fd);
goto exit_free;
}
if (json_output)
jsonw_start_array(json_wtr); /* root array */
while (true) {
err = bpf_btf_get_next_id(id, &id);
if (err) {
if (errno == ENOENT) {
err = 0;
break;
}
p_err("can't get next BTF object: %s%s",
strerror(errno),
errno == EINVAL ? " -- kernel too old?" : "");
err = -1;
break;
}
fd = bpf_btf_get_fd_by_id(id);
if (fd < 0) {
if (errno == ENOENT)
continue;
p_err("can't get BTF object by id (%u): %s",
id, strerror(errno));
err = -1;
break;
}
err = show_btf(fd, &btf_prog_table, &btf_map_table);
close(fd);
if (err)
break;
}
if (json_output)
jsonw_end_array(json_wtr); /* root array */
exit_free:
delete_btf_table(&btf_prog_table);
delete_btf_table(&btf_map_table);
return err;
}
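The show/list path above relies on BPF_BTF_GET_NEXT_ID, which this series also exposes through libbpf as bpf_btf_get_next_id(). A minimal stand-alone sketch of the same iteration, without the prog/map attach tables (function name and output format are illustrative, error handling abbreviated):

/* Sketch: walk all loaded BTF objects with the new bpf_btf_get_next_id().
 * Prints only id and size; no prog/map attach lookup is done here.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

static void list_btf_objects(void)
{
        struct bpf_btf_info info;
        __u32 id = 0, len;
        int fd;

        while (!bpf_btf_get_next_id(id, &id)) {
                fd = bpf_btf_get_fd_by_id(id);
                if (fd < 0)
                        continue; /* object may have gone away meanwhile */
                memset(&info, 0, sizeof(info));
                len = sizeof(info);
                if (!bpf_obj_get_info_by_fd(fd, &info, &len))
                        printf("%u: size %uB\n", info.id, info.btf_size);
                close(fd);
        }
        if (errno != ENOENT)
                fprintf(stderr, "iteration stopped: %s\n", strerror(errno));
}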
static int do_help(int argc, char **argv)
{
if (json_output) {
@ -530,7 +865,8 @@ static int do_help(int argc, char **argv)
}
fprintf(stderr,
"Usage: %s btf dump BTF_SRC [format FORMAT]\n"
"Usage: %s btf { show | list } [id BTF_ID]\n"
" %s btf dump BTF_SRC [format FORMAT]\n"
" %s btf help\n"
"\n"
" BTF_SRC := { id BTF_ID | prog PROG | map MAP [{key | value | kv | all}] | file FILE }\n"
@ -539,12 +875,14 @@ static int do_help(int argc, char **argv)
" " HELP_SPEC_PROGRAM "\n"
" " HELP_SPEC_OPTIONS "\n"
"",
bin_name, bin_name);
bin_name, bin_name, bin_name);
return 0;
}
static const struct cmd cmds[] = {
{ "show", do_show },
{ "list", do_show },
{ "help", do_help },
{ "dump", do_dump },
{ 0 }


@ -26,9 +26,9 @@ static void btf_dumper_ptr(const void *data, json_writer_t *jw,
bool is_plain_text)
{
if (is_plain_text)
jsonw_printf(jw, "%p", *(unsigned long *)data);
jsonw_printf(jw, "%p", data);
else
jsonw_printf(jw, "%u", *(unsigned long *)data);
jsonw_printf(jw, "%lu", *(unsigned long *)data);
}
static int btf_dumper_modifier(const struct btf_dumper *d, __u32 type_id,
@ -216,7 +216,7 @@ static int btf_dumper_int(const struct btf_type *t, __u8 bit_offset,
switch (BTF_INT_ENCODING(*int_type)) {
case 0:
if (BTF_INT_BITS(*int_type) == 64)
jsonw_printf(jw, "%lu", *(__u64 *)data);
jsonw_printf(jw, "%llu", *(__u64 *)data);
else if (BTF_INT_BITS(*int_type) == 32)
jsonw_printf(jw, "%u", *(__u32 *)data);
else if (BTF_INT_BITS(*int_type) == 16)
@ -229,7 +229,7 @@ static int btf_dumper_int(const struct btf_type *t, __u8 bit_offset,
break;
case BTF_INT_SIGNED:
if (BTF_INT_BITS(*int_type) == 64)
jsonw_printf(jw, "%ld", *(long long *)data);
jsonw_printf(jw, "%lld", *(long long *)data);
else if (BTF_INT_BITS(*int_type) == 32)
jsonw_printf(jw, "%d", *(int *)data);
else if (BTF_INT_BITS(*int_type) == 16)


@ -120,8 +120,8 @@ static int count_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type)
static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type,
int level)
{
const char *attach_flags_str;
__u32 prog_ids[1024] = {0};
char *attach_flags_str;
__u32 prog_cnt, iter;
__u32 attach_flags;
char buf[32];


@ -29,7 +29,7 @@
#define BPF_FS_MAGIC 0xcafe4a11
#endif
void __printf(1, 2) p_err(const char *fmt, ...)
void p_err(const char *fmt, ...)
{
va_list ap;
@ -47,7 +47,7 @@ void __printf(1, 2) p_err(const char *fmt, ...)
va_end(ap);
}
void __printf(1, 2) p_info(const char *fmt, ...)
void p_info(const char *fmt, ...)
{
va_list ap;


@ -15,7 +15,6 @@
#include <malloc.h>
#include <inttypes.h>
#include <stdint.h>
#include <linux/compiler.h>
#include "json_writer.h"
@ -153,8 +152,7 @@ void jsonw_name(json_writer_t *self, const char *name)
putc(' ', self->out);
}
void __printf(2, 0)
jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap)
void jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap)
{
jsonw_eor(self);
putc('"', self->out);
@ -162,7 +160,7 @@ jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap)
putc('"', self->out);
}
void __printf(2, 3) jsonw_printf(json_writer_t *self, const char *fmt, ...)
void jsonw_printf(json_writer_t *self, const char *fmt, ...)
{
va_list ap;


@ -14,6 +14,7 @@
#include <stdbool.h>
#include <stdint.h>
#include <stdarg.h>
#include <linux/compiler.h>
/* Opaque class structure */
typedef struct json_writer json_writer_t;
@ -30,8 +31,9 @@ void jsonw_pretty(json_writer_t *self, bool on);
void jsonw_name(json_writer_t *self, const char *name);
/* Add value */
void jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap);
void jsonw_printf(json_writer_t *self, const char *fmt, ...);
void __printf(2, 0) jsonw_vprintf_enquote(json_writer_t *self, const char *fmt,
va_list ap);
void __printf(2, 3) jsonw_printf(json_writer_t *self, const char *fmt, ...);
void jsonw_string(json_writer_t *self, const char *value);
void jsonw_bool(json_writer_t *self, bool value);
void jsonw_float(json_writer_t *self, double number);


@ -139,7 +139,7 @@ int detect_common_prefix(const char *arg, ...)
strncat(msg, "'", sizeof(msg) - strlen(msg) - 1);
if (count >= 2) {
p_err(msg);
p_err("%s", msg);
return -1;
}


@ -98,8 +98,8 @@ extern int bpf_flags;
extern struct pinned_obj_table prog_table;
extern struct pinned_obj_table map_table;
void p_err(const char *fmt, ...);
void p_info(const char *fmt, ...);
void __printf(1, 2) p_err(const char *fmt, ...);
void __printf(1, 2) p_info(const char *fmt, ...);
bool is_prefix(const char *pfx, const char *str);
int detect_common_prefix(const char *arg, ...);
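The point of moving __printf() from the definitions in common.c and json_writer.c to the prototypes above is that GCC's format checking only applies where the attribute is visible, i.e. at every call site that includes the header. A small self-contained illustration of the mechanism (log_msg is a made-up name, not bpftool code):

#include <stdarg.h>
#include <stdio.h>

#define __printf(a, b) __attribute__((format(printf, a, b)))

/* Attribute on the declaration: every caller that sees this prototype
 * gets -Wformat checking of its arguments.
 */
void __printf(1, 2) log_msg(const char *fmt, ...);

void log_msg(const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
}

/* log_msg("%d", "oops");  -- now warns at compile time */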


@ -481,9 +481,11 @@ static int parse_elem(char **argv, struct bpf_map_info *info,
static int show_map_close_json(int fd, struct bpf_map_info *info)
{
char *memlock;
char *memlock, *frozen_str;
int frozen = 0;
memlock = get_fdinfo(fd, "memlock");
frozen_str = get_fdinfo(fd, "frozen");
jsonw_start_object(json_wtr);
@ -533,6 +535,12 @@ static int show_map_close_json(int fd, struct bpf_map_info *info)
}
close(fd);
if (frozen_str) {
frozen = atoi(frozen_str);
free(frozen_str);
}
jsonw_int_field(json_wtr, "frozen", frozen);
if (info->btf_id)
jsonw_int_field(json_wtr, "btf_id", info->btf_id);
@ -555,9 +563,11 @@ static int show_map_close_json(int fd, struct bpf_map_info *info)
static int show_map_close_plain(int fd, struct bpf_map_info *info)
{
char *memlock;
char *memlock, *frozen_str;
int frozen = 0;
memlock = get_fdinfo(fd, "memlock");
frozen_str = get_fdinfo(fd, "frozen");
printf("%u: ", info->id);
if (info->type < ARRAY_SIZE(map_type_name))
@ -610,9 +620,23 @@ static int show_map_close_plain(int fd, struct bpf_map_info *info)
printf("\n\tpinned %s", obj->path);
}
}
printf("\n");
if (frozen_str) {
frozen = atoi(frozen_str);
free(frozen_str);
}
if (!info->btf_id && !frozen)
return 0;
printf("\t");
if (info->btf_id)
printf("\n\tbtf_id %d", info->btf_id);
printf("btf_id %d", info->btf_id);
if (frozen)
printf("%sfrozen", info->btf_id ? " " : "");
printf("\n");
return 0;
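The frozen state is not part of struct bpf_map_info; bpftool reads it back through get_fdinfo(), i.e. from the "frozen" line the kernel adds to the map's fdinfo file once BPF_MAP_FREEZE has been applied. A rough sketch of that lookup outside bpftool (the field name comes from the diff above, the parsing helper and exact line format are assumptions):

/* Sketch: read the "frozen" attribute for a map fd from procfs.
 * Returns 1 if frozen, 0 if not, -1 on error. Parsing is simplified.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int map_fd_is_frozen(int fd)
{
        char path[64], line[128];
        int frozen = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
        f = fopen(path, "r");
        if (!f)
                return -1;
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, "frozen:", 7)) {
                        frozen = atoi(line + 7);
                        break;
                }
        }
        fclose(f);
        return frozen;
}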
@ -1238,6 +1262,35 @@ exit_free:
return err;
}
static int do_freeze(int argc, char **argv)
{
int err, fd;
if (!REQ_ARGS(2))
return -1;
fd = map_parse_fd(&argc, &argv);
if (fd < 0)
return -1;
if (argc) {
close(fd);
return BAD_ARG();
}
err = bpf_map_freeze(fd);
close(fd);
if (err) {
p_err("failed to freeze map: %s", strerror(errno));
return err;
}
if (json_output)
jsonw_null(json_wtr);
return 0;
}
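Under the hood this is just the BPF_MAP_FREEZE command, which libbpf wraps as bpf_map_freeze(). A minimal sketch of the same call from a C program (helper name is illustrative, error handling abbreviated):

/* Sketch: freeze a map by id so it can no longer be modified from
 * user space; the BPF program side is unaffected.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <bpf/bpf.h>

static int freeze_map_by_id(unsigned int id)
{
        int fd = bpf_map_get_fd_by_id(id);

        if (fd < 0)
                return -errno;
        if (bpf_map_freeze(fd)) {
                fprintf(stderr, "freeze failed: %s\n", strerror(errno));
                close(fd);
                return -1;
        }
        /* bpf_map_update_elem()/bpf_map_delete_elem() from user space
         * are now rejected by the kernel for this map.
         */
        close(fd);
        return 0;
}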
static int do_help(int argc, char **argv)
{
if (json_output) {
@ -1262,6 +1315,7 @@ static int do_help(int argc, char **argv)
" %s %s pop MAP\n"
" %s %s enqueue MAP value VALUE\n"
" %s %s dequeue MAP\n"
" %s %s freeze MAP\n"
" %s %s help\n"
"\n"
" " HELP_SPEC_MAP "\n"
@ -1280,7 +1334,8 @@ static int do_help(int argc, char **argv)
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
bin_name, argv[-2]);
return 0;
}
@ -1302,6 +1357,7 @@ static const struct cmd cmds[] = {
{ "enqueue", do_update },
{ "pop", do_pop_dequeue },
{ "dequeue", do_pop_dequeue },
{ "freeze", do_freeze },
{ 0 }
};


@ -157,7 +157,7 @@ int do_event_pipe(int argc, char **argv)
NEXT_ARG();
ctx.cpu = strtoul(*argv, &endptr, 0);
if (*endptr) {
p_err("can't parse %s as CPU ID", **argv);
p_err("can't parse %s as CPU ID", *argv);
goto err_close_map;
}
@ -168,7 +168,7 @@ int do_event_pipe(int argc, char **argv)
NEXT_ARG();
ctx.idx = strtoul(*argv, &endptr, 0);
if (*endptr) {
p_err("can't parse %s as index", **argv);
p_err("can't parse %s as index", *argv);
goto err_close_map;
}


@ -55,6 +55,35 @@ struct bpf_attach_info {
__u32 flow_dissector_id;
};
enum net_attach_type {
NET_ATTACH_TYPE_XDP,
NET_ATTACH_TYPE_XDP_GENERIC,
NET_ATTACH_TYPE_XDP_DRIVER,
NET_ATTACH_TYPE_XDP_OFFLOAD,
};
static const char * const attach_type_strings[] = {
[NET_ATTACH_TYPE_XDP] = "xdp",
[NET_ATTACH_TYPE_XDP_GENERIC] = "xdpgeneric",
[NET_ATTACH_TYPE_XDP_DRIVER] = "xdpdrv",
[NET_ATTACH_TYPE_XDP_OFFLOAD] = "xdpoffload",
};
const size_t net_attach_type_size = ARRAY_SIZE(attach_type_strings);
static enum net_attach_type parse_attach_type(const char *str)
{
enum net_attach_type type;
for (type = 0; type < net_attach_type_size; type++) {
if (attach_type_strings[type] &&
is_prefix(str, attach_type_strings[type]))
return type;
}
return net_attach_type_size;
}
static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb)
{
struct bpf_netdev_t *netinfo = cookie;
@ -197,7 +226,7 @@ static int query_flow_dissector(struct bpf_attach_info *attach_info)
fd = open("/proc/self/ns/net", O_RDONLY);
if (fd < 0) {
p_err("can't open /proc/self/ns/net: %d",
p_err("can't open /proc/self/ns/net: %s",
strerror(errno));
return -1;
}
@ -223,6 +252,134 @@ static int query_flow_dissector(struct bpf_attach_info *attach_info)
return 0;
}
static int net_parse_dev(int *argc, char ***argv)
{
int ifindex;
if (is_prefix(**argv, "dev")) {
NEXT_ARGP();
ifindex = if_nametoindex(**argv);
if (!ifindex)
p_err("invalid devname %s", **argv);
NEXT_ARGP();
} else {
p_err("expected 'dev', got: '%s'?", **argv);
return -1;
}
return ifindex;
}
static int do_attach_detach_xdp(int progfd, enum net_attach_type attach_type,
int ifindex, bool overwrite)
{
__u32 flags = 0;
if (!overwrite)
flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
if (attach_type == NET_ATTACH_TYPE_XDP_GENERIC)
flags |= XDP_FLAGS_SKB_MODE;
if (attach_type == NET_ATTACH_TYPE_XDP_DRIVER)
flags |= XDP_FLAGS_DRV_MODE;
if (attach_type == NET_ATTACH_TYPE_XDP_OFFLOAD)
flags |= XDP_FLAGS_HW_MODE;
return bpf_set_link_xdp_fd(ifindex, progfd, flags);
}
static int do_attach(int argc, char **argv)
{
enum net_attach_type attach_type;
int progfd, ifindex, err = 0;
bool overwrite = false;
/* parse attach args */
if (!REQ_ARGS(5))
return -EINVAL;
attach_type = parse_attach_type(*argv);
if (attach_type == net_attach_type_size) {
p_err("invalid net attach/detach type: %s", *argv);
return -EINVAL;
}
NEXT_ARG();
progfd = prog_parse_fd(&argc, &argv);
if (progfd < 0)
return -EINVAL;
ifindex = net_parse_dev(&argc, &argv);
if (ifindex < 1) {
close(progfd);
return -EINVAL;
}
if (argc) {
if (is_prefix(*argv, "overwrite")) {
overwrite = true;
} else {
p_err("expected 'overwrite', got: '%s'?", *argv);
close(progfd);
return -EINVAL;
}
}
/* attach xdp prog */
if (is_prefix("xdp", attach_type_strings[attach_type]))
err = do_attach_detach_xdp(progfd, attach_type, ifindex,
overwrite);
if (err < 0) {
p_err("interface %s attach failed: %s",
attach_type_strings[attach_type], strerror(-err));
return err;
}
if (json_output)
jsonw_null(json_wtr);
return 0;
}
static int do_detach(int argc, char **argv)
{
enum net_attach_type attach_type;
int progfd, ifindex, err = 0;
/* parse detach args */
if (!REQ_ARGS(3))
return -EINVAL;
attach_type = parse_attach_type(*argv);
if (attach_type == net_attach_type_size) {
p_err("invalid net attach/detach type: %s", *argv);
return -EINVAL;
}
NEXT_ARG();
ifindex = net_parse_dev(&argc, &argv);
if (ifindex < 1)
return -EINVAL;
/* detach xdp prog */
progfd = -1;
if (is_prefix("xdp", attach_type_strings[attach_type]))
err = do_attach_detach_xdp(progfd, attach_type, ifindex, NULL);
if (err < 0) {
p_err("interface %s detach failed: %s",
attach_type_strings[attach_type], strerror(-err));
return err;
}
if (json_output)
jsonw_null(json_wtr);
return 0;
}
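Both commands funnel into bpf_set_link_xdp_fd() from libbpf: attach passes the program fd plus the mode flags, detach passes -1. A hedged stand-alone sketch of the same flow (the interface name, driver mode and helper name are illustrative choices):

/* Sketch: attach an XDP program in native/driver mode, then detach it,
 * mirroring what do_attach()/do_detach() above do internally.
 */
#include <net/if.h>
#include <linux/if_link.h>      /* XDP_FLAGS_* */
#include <bpf/libbpf.h>

static int xdp_attach_then_detach(const char *ifname, int prog_fd)
{
        __u32 flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_DRV_MODE;
        int ifindex = if_nametoindex(ifname);
        int err;

        if (!ifindex)
                return -1;

        err = bpf_set_link_xdp_fd(ifindex, prog_fd, flags);
        if (err < 0)
                return err;

        /* Passing -1 as the program fd removes the attached program. */
        return bpf_set_link_xdp_fd(ifindex, -1, flags);
}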
static int do_show(int argc, char **argv)
{
struct bpf_attach_info attach_info = {};
@ -232,13 +389,9 @@ static int do_show(int argc, char **argv)
char err_buf[256];
if (argc == 2) {
if (strcmp(argv[0], "dev") != 0)
usage();
filter_idx = if_nametoindex(argv[1]);
if (filter_idx == 0) {
fprintf(stderr, "invalid dev name %s\n", argv[1]);
filter_idx = net_parse_dev(&argc, &argv);
if (filter_idx < 1)
return -1;
}
} else if (argc != 0) {
usage();
}
@ -305,13 +458,20 @@ static int do_help(int argc, char **argv)
fprintf(stderr,
"Usage: %s %s { show | list } [dev <devname>]\n"
" %s %s attach ATTACH_TYPE PROG dev <devname> [ overwrite ]\n"
" %s %s detach ATTACH_TYPE dev <devname>\n"
" %s %s help\n"
"\n"
" " HELP_SPEC_PROGRAM "\n"
" ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload }\n"
"\n"
"Note: Only xdp and tc attachments are supported now.\n"
" For progs attached to cgroups, use \"bpftool cgroup\"\n"
" to dump program attachments. For program types\n"
" sk_{filter,skb,msg,reuseport} and lwt/seg6, please\n"
" consult iproute2.\n",
bin_name, argv[-2], bin_name, argv[-2]);
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
bin_name, argv[-2]);
return 0;
}
@ -319,6 +479,8 @@ static int do_help(int argc, char **argv)
static const struct cmd cmds[] = {
{ "show", do_show },
{ "list", do_show },
{ "attach", do_attach },
{ "detach", do_detach },
{ "help", do_help },
{ 0 }
};


@ -104,6 +104,8 @@ static void print_perf_json(int pid, int fd, __u32 prog_id, __u32 fd_type,
jsonw_string_field(json_wtr, "filename", buf);
jsonw_lluint_field(json_wtr, "offset", probe_offset);
break;
default:
break;
}
jsonw_end_object(json_wtr);
}
@ -140,6 +142,8 @@ static void print_perf_plain(int pid, int fd, __u32 prog_id, __u32 fd_type,
printf("uretprobe filename %s offset %llu\n", buf,
probe_offset);
break;
default:
break;
}
}


@ -6,9 +6,11 @@
/*
* Common definitions for all gcc versions go here.
*/
#ifndef GCC_VERSION
#define GCC_VERSION (__GNUC__ * 10000 \
+ __GNUC_MINOR__ * 100 \
+ __GNUC_PATCHLEVEL__)
#endif
#if GCC_VERSION >= 70000 && !defined(__CHECKER__)
# define __fallthrough __attribute__ ((fallthrough))


@ -106,6 +106,7 @@ enum bpf_cmd {
BPF_TASK_FD_QUERY,
BPF_MAP_LOOKUP_AND_DELETE_ELEM,
BPF_MAP_FREEZE,
BPF_BTF_GET_NEXT_ID,
};
enum bpf_map_type {
@ -284,6 +285,9 @@ enum bpf_attach_type {
*/
#define BPF_F_TEST_RND_HI32 (1U << 2)
/* The verifier internal test flag. Behavior is undefined */
#define BPF_F_TEST_STATE_FREQ (1U << 3)
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions:
*
@ -337,6 +341,9 @@ enum bpf_attach_type {
#define BPF_F_RDONLY_PROG (1U << 7)
#define BPF_F_WRONLY_PROG (1U << 8)
/* Clone map from listener for newly accepted socket */
#define BPF_F_CLONE (1U << 9)
/* flags for BPF_PROG_QUERY */
#define BPF_F_QUERY_EFFECTIVE (1U << 0)
@ -576,6 +583,8 @@ union bpf_attr {
* limited to five).
*
* Each time the helper is called, it appends a line to the trace.
* Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
* open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
* The format of the trace is customizable, and the exact output
* one will get depends on the options set in
* *\/sys/kernel/debug/tracing/trace_options* (see also the
@ -1014,7 +1023,7 @@ union bpf_attr {
* The realm of the route for the packet associated to *skb*, or 0
* if none was found.
*
* int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
* int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
* Description
* Write raw *data* blob into a special BPF perf event held by
* *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
@ -1076,7 +1085,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
* int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
* Description
* Walk a user or a kernel stack and return its id. To achieve
* this, the helper needs *ctx*, which is a pointer to the context
@ -1725,7 +1734,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
* int bpf_override_return(struct pt_reg *regs, u64 rc)
* int bpf_override_return(struct pt_regs *regs, u64 rc)
* Description
* Used for error injection, this helper uses kprobes to override
* the return value of the probed function, and to set it to *rc*.


@ -16,6 +16,18 @@
#define XDP_SHARED_UMEM (1 << 0)
#define XDP_COPY (1 << 1) /* Force copy-mode */
#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */
/* If this option is set, the driver might go to sleep and in that case
* the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be
* set. If it is set, the application needs to explicitly wake up the
* driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are
* running the driver and the application on the same core, you should
* use this option so that the kernel will yield to the user space
* application.
*/
#define XDP_USE_NEED_WAKEUP (1 << 3)
/* Flags for xsk_umem_config flags */
#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
struct sockaddr_xdp {
__u16 sxdp_family;
@ -25,10 +37,14 @@ struct sockaddr_xdp {
__u32 sxdp_shared_umem_fd;
};
/* XDP_RING flags */
#define XDP_RING_NEED_WAKEUP (1 << 0)
struct xdp_ring_offset {
__u64 producer;
__u64 consumer;
__u64 desc;
__u64 flags;
};
struct xdp_mmap_offsets {
@ -53,6 +69,7 @@ struct xdp_umem_reg {
__u64 len; /* Length of packet data area */
__u32 chunk_size;
__u32 headroom;
__u32 flags;
};
struct xdp_statistics {
@ -74,6 +91,11 @@ struct xdp_options {
#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL
#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL
/* Masks for unaligned chunks mode */
#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48
#define XSK_UNALIGNED_BUF_ADDR_MASK \
((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
/* Rx/Tx descriptor */
struct xdp_desc {
__u64 addr;


@ -1,9 +1,10 @@
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
# Most of this file is copied from tools/lib/traceevent/Makefile
BPF_VERSION = 0
BPF_PATCHLEVEL = 0
BPF_EXTRAVERSION = 4
LIBBPF_VERSION := $(shell \
grep -oE '^LIBBPF_([0-9.]+)' libbpf.map | \
sort -rV | head -n1 | cut -d'_' -f2)
LIBBPF_MAJOR_VERSION := $(firstword $(subst ., ,$(LIBBPF_VERSION)))
MAKEFLAGS += --no-print-directory
@ -79,15 +80,9 @@ export prefix libdir src obj
libdir_SQ = $(subst ','\'',$(libdir))
libdir_relative_SQ = $(subst ','\'',$(libdir_relative))
VERSION = $(BPF_VERSION)
PATCHLEVEL = $(BPF_PATCHLEVEL)
EXTRAVERSION = $(BPF_EXTRAVERSION)
OBJ = $@
N =
LIBBPF_VERSION = $(BPF_VERSION).$(BPF_PATCHLEVEL).$(BPF_EXTRAVERSION)
LIB_TARGET = libbpf.a libbpf.so.$(LIBBPF_VERSION)
LIB_FILE = libbpf.a libbpf.so*
PC_FILE = libbpf.pc
@ -113,6 +108,7 @@ override CFLAGS += -Werror -Wall
override CFLAGS += -fPIC
override CFLAGS += $(INCLUDES)
override CFLAGS += -fvisibility=hidden
override CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
ifeq ($(VERBOSE),1)
Q =
@ -138,7 +134,9 @@ LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE))
PC_FILE := $(addprefix $(OUTPUT),$(PC_FILE))
GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN) | \
awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {s++} END{print s}')
cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \
awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$8}' | \
sort -u | wc -l)
VERSIONED_SYM_COUNT = $(shell readelf -s --wide $(OUTPUT)libbpf.so | \
grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l)
@ -178,10 +176,10 @@ $(BPF_IN): force elfdep bpfdep
$(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION)
$(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN)
$(QUIET_LINK)$(CC) --shared -Wl,-soname,libbpf.so.$(VERSION) \
$(QUIET_LINK)$(CC) --shared -Wl,-soname,libbpf.so.$(LIBBPF_MAJOR_VERSION) \
-Wl,--version-script=$(VERSION_SCRIPT) $^ -lelf -o $@
@ln -sf $(@F) $(OUTPUT)libbpf.so
@ln -sf $(@F) $(OUTPUT)libbpf.so.$(VERSION)
@ln -sf $(@F) $(OUTPUT)libbpf.so.$(LIBBPF_MAJOR_VERSION)
$(OUTPUT)libbpf.a: $(BPF_IN)
$(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
@ -205,6 +203,7 @@ check_abi: $(OUTPUT)libbpf.so
"Please make sure all LIBBPF_API symbols are" \
"versioned in $(VERSION_SCRIPT)." >&2; \
readelf -s --wide $(OUTPUT)libbpf-in.o | \
cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \
awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$8}'| \
sort -u > $(OUTPUT)libbpf_global_syms.tmp; \
readelf -s --wide $(OUTPUT)libbpf.so | \
@ -257,7 +256,8 @@ config-clean:
clean:
$(call QUIET_CLEAN, libbpf) $(RM) $(TARGETS) $(CXX_TEST_TARGET) \
*.o *~ *.a *.so *.so.$(VERSION) .*.d .*.cmd *.pc LIBBPF-CFLAGS
*.o *~ *.a *.so *.so.$(LIBBPF_MAJOR_VERSION) .*.d .*.cmd \
*.pc LIBBPF-CFLAGS
$(call QUIET_CLEAN, core-gen) $(RM) $(OUTPUT)FEATURE-DUMP.libbpf


@ -568,7 +568,7 @@ int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr)
return ret;
}
int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id)
static int bpf_obj_get_next_id(__u32 start_id, __u32 *next_id, int cmd)
{
union bpf_attr attr;
int err;
@ -576,26 +576,26 @@ int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id)
memset(&attr, 0, sizeof(attr));
attr.start_id = start_id;
err = sys_bpf(BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr));
err = sys_bpf(cmd, &attr, sizeof(attr));
if (!err)
*next_id = attr.next_id;
return err;
}
int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id)
{
return bpf_obj_get_next_id(start_id, next_id, BPF_PROG_GET_NEXT_ID);
}
int bpf_map_get_next_id(__u32 start_id, __u32 *next_id)
{
union bpf_attr attr;
int err;
return bpf_obj_get_next_id(start_id, next_id, BPF_MAP_GET_NEXT_ID);
}
memset(&attr, 0, sizeof(attr));
attr.start_id = start_id;
err = sys_bpf(BPF_MAP_GET_NEXT_ID, &attr, sizeof(attr));
if (!err)
*next_id = attr.next_id;
return err;
int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id)
{
return bpf_obj_get_next_id(start_id, next_id, BPF_BTF_GET_NEXT_ID);
}
int bpf_prog_get_fd_by_id(__u32 id)


@ -156,6 +156,7 @@ LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data,
__u32 *retval, __u32 *duration);
LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id);
LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id);
LIBBPF_API int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id);
LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id);
LIBBPF_API int bpf_map_get_fd_by_id(__u32 id);
LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id);


@ -183,4 +183,10 @@ LIBBPF_0.0.4 {
perf_buffer__new;
perf_buffer__new_raw;
perf_buffer__poll;
xsk_umem__create;
} LIBBPF_0.0.3;
LIBBPF_0.0.5 {
global:
bpf_btf_get_next_id;
} LIBBPF_0.0.4;


@ -74,23 +74,6 @@ struct xsk_nl_info {
int fd;
};
/* For 32-bit systems, we need to use mmap2 as the offsets are 64-bit.
* Unfortunately, it is not part of glibc.
*/
static inline void *xsk_mmap(void *addr, size_t length, int prot, int flags,
int fd, __u64 offset)
{
#ifdef __NR_mmap2
unsigned int page_shift = __builtin_ffs(getpagesize()) - 1;
long ret = syscall(__NR_mmap2, addr, length, prot, flags, fd,
(off_t)(offset >> page_shift));
return (void *)ret;
#else
return mmap(addr, length, prot, flags, fd, offset);
#endif
}
int xsk_umem__fd(const struct xsk_umem *umem)
{
return umem ? umem->fd : -EINVAL;
@ -116,6 +99,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg,
cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
return;
}
@ -123,6 +107,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg,
cfg->comp_size = usr_cfg->comp_size;
cfg->frame_size = usr_cfg->frame_size;
cfg->frame_headroom = usr_cfg->frame_headroom;
cfg->flags = usr_cfg->flags;
}
static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
@ -149,9 +134,10 @@ static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
return 0;
}
int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
const struct xsk_umem_config *usr_config)
int xsk_umem__create_v0_0_4(struct xsk_umem **umem_ptr, void *umem_area,
__u64 size, struct xsk_ring_prod *fill,
struct xsk_ring_cons *comp,
const struct xsk_umem_config *usr_config)
{
struct xdp_mmap_offsets off;
struct xdp_umem_reg mr;
@ -182,6 +168,7 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
mr.len = size;
mr.chunk_size = umem->config.frame_size;
mr.headroom = umem->config.frame_headroom;
mr.flags = umem->config.flags;
err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
if (err) {
@ -210,10 +197,9 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
goto out_socket;
}
map = xsk_mmap(NULL, off.fr.desc +
umem->config.fill_size * sizeof(__u64),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
umem->fd, XDP_UMEM_PGOFF_FILL_RING);
map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, umem->fd,
XDP_UMEM_PGOFF_FILL_RING);
if (map == MAP_FAILED) {
err = -errno;
goto out_socket;
@ -224,13 +210,13 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
fill->size = umem->config.fill_size;
fill->producer = map + off.fr.producer;
fill->consumer = map + off.fr.consumer;
fill->flags = map + off.fr.flags;
fill->ring = map + off.fr.desc;
fill->cached_cons = umem->config.fill_size;
map = xsk_mmap(NULL,
off.cr.desc + umem->config.comp_size * sizeof(__u64),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, umem->fd,
XDP_UMEM_PGOFF_COMPLETION_RING);
if (map == MAP_FAILED) {
err = -errno;
goto out_mmap;
@ -241,6 +227,7 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
comp->size = umem->config.comp_size;
comp->producer = map + off.cr.producer;
comp->consumer = map + off.cr.consumer;
comp->flags = map + off.cr.flags;
comp->ring = map + off.cr.desc;
*umem_ptr = umem;
@ -255,6 +242,29 @@ out_umem_alloc:
return err;
}
struct xsk_umem_config_v1 {
__u32 fill_size;
__u32 comp_size;
__u32 frame_size;
__u32 frame_headroom;
};
int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area,
__u64 size, struct xsk_ring_prod *fill,
struct xsk_ring_cons *comp,
const struct xsk_umem_config *usr_config)
{
struct xsk_umem_config config;
memcpy(&config, usr_config, sizeof(struct xsk_umem_config_v1));
config.flags = 0;
return xsk_umem__create_v0_0_4(umem_ptr, umem_area, size, fill, comp,
&config);
}
asm(".symver xsk_umem__create_v0_0_2, xsk_umem__create@LIBBPF_0.0.2");
asm(".symver xsk_umem__create_v0_0_4, xsk_umem__create@@LIBBPF_0.0.4");
static int xsk_load_xdp_prog(struct xsk_socket *xsk)
{
static const int log_buf_size = 16 * 1024;
@ -550,11 +560,10 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
}
if (rx) {
rx_map = xsk_mmap(NULL, off.rx.desc +
xsk->config.rx_size * sizeof(struct xdp_desc),
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE,
xsk->fd, XDP_PGOFF_RX_RING);
rx_map = mmap(NULL, off.rx.desc +
xsk->config.rx_size * sizeof(struct xdp_desc),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
xsk->fd, XDP_PGOFF_RX_RING);
if (rx_map == MAP_FAILED) {
err = -errno;
goto out_socket;
@ -564,16 +573,16 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
rx->size = xsk->config.rx_size;
rx->producer = rx_map + off.rx.producer;
rx->consumer = rx_map + off.rx.consumer;
rx->flags = rx_map + off.rx.flags;
rx->ring = rx_map + off.rx.desc;
}
xsk->rx = rx;
if (tx) {
tx_map = xsk_mmap(NULL, off.tx.desc +
xsk->config.tx_size * sizeof(struct xdp_desc),
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE,
xsk->fd, XDP_PGOFF_TX_RING);
tx_map = mmap(NULL, off.tx.desc +
xsk->config.tx_size * sizeof(struct xdp_desc),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
xsk->fd, XDP_PGOFF_TX_RING);
if (tx_map == MAP_FAILED) {
err = -errno;
goto out_mmap_rx;
@ -583,6 +592,7 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
tx->size = xsk->config.tx_size;
tx->producer = tx_map + off.tx.producer;
tx->consumer = tx_map + off.tx.consumer;
tx->flags = tx_map + off.tx.flags;
tx->ring = tx_map + off.tx.desc;
tx->cached_cons = xsk->config.tx_size;
}


@ -32,6 +32,7 @@ struct name { \
__u32 *producer; \
__u32 *consumer; \
void *ring; \
__u32 *flags; \
}
DEFINE_XSK_RING(xsk_ring_prod);
@ -76,6 +77,11 @@ xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx)
return &descs[idx & rx->mask];
}
static inline int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r)
{
return *r->flags & XDP_RING_NEED_WAKEUP;
}
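On the application side, the intended pattern is to check this flag after producing descriptors and only pay for a syscall when the kernel actually asked to be woken up. A short sketch for the Tx case (function name is illustrative, error handling omitted):

/* Sketch: kick the kernel for Tx only when it requested a wakeup. */
#include <sys/socket.h>
#include <bpf/xsk.h>

static void kick_tx_if_needed(struct xsk_socket *xsk, struct xsk_ring_prod *tx)
{
        if (xsk_ring_prod__needs_wakeup(tx))
                sendto(xsk_socket__fd(xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
}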
static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
{
__u32 free_entries = r->cached_cons - r->cached_prod;
@ -162,6 +168,21 @@ static inline void *xsk_umem__get_data(void *umem_area, __u64 addr)
return &((char *)umem_area)[addr];
}
static inline __u64 xsk_umem__extract_addr(__u64 addr)
{
return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
}
static inline __u64 xsk_umem__extract_offset(__u64 addr)
{
return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
}
static inline __u64 xsk_umem__add_offset_to_addr(__u64 addr)
{
return xsk_umem__extract_addr(addr) + xsk_umem__extract_offset(addr);
}
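In unaligned chunks mode a descriptor address carries a 48-bit base and a 16-bit offset; the helpers above take that encoding apart and recombine it. A short sketch of turning an Rx descriptor address back into a packet pointer (assumes <bpf/xsk.h> is included; the function name is illustrative):

/* Sketch: resolve the packet data pointer for an Rx descriptor address
 * produced in unaligned chunks mode. umem_area is the mmap'ed UMEM base.
 */
static inline void *rx_addr_to_pkt(void *umem_area, __u64 addr)
{
        return xsk_umem__get_data(umem_area,
                                  xsk_umem__add_offset_to_addr(addr));
}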
LIBBPF_API int xsk_umem__fd(const struct xsk_umem *umem);
LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk);
@ -170,12 +191,14 @@ LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk);
#define XSK_UMEM__DEFAULT_FRAME_SHIFT 12 /* 4096 bytes */
#define XSK_UMEM__DEFAULT_FRAME_SIZE (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT)
#define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0
#define XSK_UMEM__DEFAULT_FLAGS 0
struct xsk_umem_config {
__u32 fill_size;
__u32 comp_size;
__u32 frame_size;
__u32 frame_headroom;
__u32 flags;
};
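The new flags member is where XDP_UMEM_UNALIGNED_CHUNK_FLAG goes when an application wants arbitrarily sized and placed chunks. A hedged configuration sketch (the 3072-byte frame size is just an example of a non-power-of-two value this mode permits):

/* Sketch: request unaligned chunks when creating the umem. */
#include <linux/if_xdp.h>
#include <bpf/xsk.h>

static const struct xsk_umem_config unaligned_cfg = {
        .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
        .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
        .frame_size = 3072, /* arbitrary size, not a power of two */
        .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
        .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG,
};
/* Passed as the last argument of xsk_umem__create(). */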
/* Flags for the libbpf_flags field. */
@ -195,6 +218,16 @@ LIBBPF_API int xsk_umem__create(struct xsk_umem **umem,
struct xsk_ring_prod *fill,
struct xsk_ring_cons *comp,
const struct xsk_umem_config *config);
LIBBPF_API int xsk_umem__create_v0_0_2(struct xsk_umem **umem,
void *umem_area, __u64 size,
struct xsk_ring_prod *fill,
struct xsk_ring_cons *comp,
const struct xsk_umem_config *config);
LIBBPF_API int xsk_umem__create_v0_0_4(struct xsk_umem **umem,
void *umem_area, __u64 size,
struct xsk_ring_prod *fill,
struct xsk_ring_cons *comp,
const struct xsk_umem_config *config);
LIBBPF_API int xsk_socket__create(struct xsk_socket **xsk,
const char *ifname, __u32 queue_id,
struct xsk_umem *umem,


@ -42,4 +42,5 @@ xdping
test_sockopt
test_sockopt_sk
test_sockopt_multi
test_sockopt_inherit
test_tcp_rtt


@ -29,7 +29,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
test_cgroup_storage test_select_reuseport test_section_names \
test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
test_btf_dump test_cgroup_attach xdping test_sockopt test_sockopt_sk \
test_sockopt_multi test_tcp_rtt
test_sockopt_multi test_sockopt_inherit test_tcp_rtt
BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
TEST_GEN_FILES = $(BPF_OBJ_FILES)
@ -66,7 +66,8 @@ TEST_PROGS := test_kmod.sh \
test_tcp_check_syncookie.sh \
test_tc_tunnel.sh \
test_tc_edt.sh \
test_xdping.sh
test_xdping.sh \
test_bpftool_build.sh
TEST_PROGS_EXTENDED := with_addr.sh \
with_tunnels.sh \
@ -115,6 +116,7 @@ $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c
$(OUTPUT)/test_sockopt: cgroup_helpers.c
$(OUTPUT)/test_sockopt_sk: cgroup_helpers.c
$(OUTPUT)/test_sockopt_multi: cgroup_helpers.c
$(OUTPUT)/test_sockopt_inherit: cgroup_helpers.c
$(OUTPUT)/test_tcp_rtt: cgroup_helpers.c
.PHONY: force


@ -1,4 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
#ifndef __BPF_ENDIAN__
#define __BPF_ENDIAN__
@ -29,6 +29,10 @@
# define __bpf_htonl(x) __builtin_bswap32(x)
# define __bpf_constant_ntohl(x) ___constant_swab32(x)
# define __bpf_constant_htonl(x) ___constant_swab32(x)
# define __bpf_be64_to_cpu(x) __builtin_bswap64(x)
# define __bpf_cpu_to_be64(x) __builtin_bswap64(x)
# define __bpf_constant_be64_to_cpu(x) ___constant_swab64(x)
# define __bpf_constant_cpu_to_be64(x) ___constant_swab64(x)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
# define __bpf_ntohs(x) (x)
# define __bpf_htons(x) (x)
@ -38,6 +42,10 @@
# define __bpf_htonl(x) (x)
# define __bpf_constant_ntohl(x) (x)
# define __bpf_constant_htonl(x) (x)
# define __bpf_be64_to_cpu(x) (x)
# define __bpf_cpu_to_be64(x) (x)
# define __bpf_constant_be64_to_cpu(x) (x)
# define __bpf_constant_cpu_to_be64(x) (x)
#else
# error "Fix your compiler's __BYTE_ORDER__?!"
#endif
@ -54,5 +62,11 @@
#define bpf_ntohl(x) \
(__builtin_constant_p(x) ? \
__bpf_constant_ntohl(x) : __bpf_ntohl(x))
#define bpf_cpu_to_be64(x) \
(__builtin_constant_p(x) ? \
__bpf_constant_cpu_to_be64(x) : __bpf_cpu_to_be64(x))
#define bpf_be64_to_cpu(x) \
(__builtin_constant_p(x) ? \
__bpf_constant_be64_to_cpu(x) : __bpf_be64_to_cpu(x))
#endif /* __BPF_ENDIAN__ */
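The new 64-bit conversions follow the same constant/non-constant split as the existing 16- and 32-bit ones. A tiny usage sketch for a BPF program (the value name is illustrative; the include path follows the selftests layout):

/* Sketch: round-trip a big-endian 64-bit wire value with the new macros. */
#include <linux/types.h>
#include "bpf_endian.h"

static inline __u64 be64_roundtrip(__u64 wire_be64)
{
        __u64 host = bpf_be64_to_cpu(wire_be64);

        return bpf_cpu_to_be64(host); /* equals wire_be64 again */
}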


@ -1,4 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
#ifndef __BPF_HELPERS_H
#define __BPF_HELPERS_H


@ -48,16 +48,17 @@ void test_bpf_obj_id(void)
/* test_obj_id.o is a dumb prog. It should never fail
* to load.
*/
if (err)
error_cnt++;
assert(!err);
if (CHECK_FAIL(err))
continue;
/* Insert a magic value to the map */
map_fds[i] = bpf_find_map(__func__, objs[i], "test_map_id");
assert(map_fds[i] >= 0);
if (CHECK_FAIL(map_fds[i] < 0))
goto done;
err = bpf_map_update_elem(map_fds[i], &array_key,
&array_magic_value, 0);
assert(!err);
if (CHECK_FAIL(err))
goto done;
/* Check getting map info */
info_len = sizeof(struct bpf_map_info) * 2;
@ -96,9 +97,11 @@ void test_bpf_obj_id(void)
prog_infos[i].map_ids = ptr_to_u64(map_ids + i);
prog_infos[i].nr_map_ids = 2;
err = clock_gettime(CLOCK_REALTIME, &real_time_ts);
assert(!err);
if (CHECK_FAIL(err))
goto done;
err = clock_gettime(CLOCK_BOOTTIME, &boot_time_ts);
assert(!err);
if (CHECK_FAIL(err))
goto done;
err = bpf_obj_get_info_by_fd(prog_fds[i], &prog_infos[i],
&info_len);
load_time = (real_time_ts.tv_sec - boot_time_ts.tv_sec)
@ -224,7 +227,8 @@ void test_bpf_obj_id(void)
nr_id_found++;
err = bpf_map_lookup_elem(map_fd, &array_key, &array_value);
assert(!err);
if (CHECK_FAIL(err))
goto done;
err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);
CHECK(err || info_len != sizeof(struct bpf_map_info) ||


@ -28,8 +28,6 @@ static int check_load(const char *file, enum bpf_prog_type type)
attr.prog_flags = BPF_F_TEST_RND_HI32;
err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
bpf_object__close(obj);
if (err)
error_cnt++;
return err;
}
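These test conversions replace bare assert() calls and manual error_cnt++ bookkeeping with the CHECK_FAIL() macro from test_progs.h, so a failing condition is counted and reported while the test can still jump to its cleanup label. Roughly, such a macro has to provide something like the following; this is an approximation, not the exact test_progs.h definition:

/* Approximate shape of CHECK_FAIL(): count and report the failure, then
 * hand the truth value back so the caller can branch to cleanup.
 */
#define CHECK_FAIL(condition) ({                                \
        int __ret = !!(condition);                              \
        if (__ret) {                                            \
                error_cnt++;                                    \
                printf("%s:FAIL:%d\n", __func__, __LINE__);     \
        }                                                       \
        __ret;                                                  \
})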
@ -105,12 +103,7 @@ void test_bpf_verif_scale(void)
continue;
err = check_load(test->file, test->attach_type);
if (test->fails) { /* expected to fail */
if (err)
error_cnt--;
else
error_cnt++;
}
CHECK_FAIL(err && !test->fails);
}
if (env.verifier_stats)


@ -344,7 +344,6 @@ struct test tests[] = {
.tcp.dest = 8080,
},
.keys = {
.nhoff = 0,
.nhoff = ETH_HLEN,
.thoff = ETH_HLEN + sizeof(struct iphdr) +
sizeof(struct iphdr),
@ -452,10 +451,8 @@ void test_flow_dissector(void)
err = bpf_flow_load(&obj, "./bpf_flow.o", "flow_dissector",
"jmp_table", "last_dissection", &prog_fd, &keys_fd);
if (err) {
error_cnt++;
if (CHECK_FAIL(err))
return;
}
for (i = 0; i < ARRAY_SIZE(tests); i++) {
struct bpf_flow_keys flow_keys;


@ -135,10 +135,7 @@ void test_get_stack_raw_tp(void)
exp_cnt -= err;
}
goto close_prog_noerr;
close_prog:
error_cnt++;
close_prog_noerr:
if (!IS_ERR_OR_NULL(link))
bpf_link__destroy(link);
if (!IS_ERR_OR_NULL(pb))


@ -7,10 +7,8 @@ static void test_global_data_number(struct bpf_object *obj, __u32 duration)
uint64_t num;
map_fd = bpf_find_map(__func__, obj, "result_number");
if (map_fd < 0) {
error_cnt++;
if (CHECK_FAIL(map_fd < 0))
return;
}
struct {
char *name;
@ -44,10 +42,8 @@ static void test_global_data_string(struct bpf_object *obj, __u32 duration)
char str[32];
map_fd = bpf_find_map(__func__, obj, "result_string");
if (map_fd < 0) {
error_cnt++;
if (CHECK_FAIL(map_fd < 0))
return;
}
struct {
char *name;
@ -81,10 +77,8 @@ static void test_global_data_struct(struct bpf_object *obj, __u32 duration)
struct foo val;
map_fd = bpf_find_map(__func__, obj, "result_struct");
if (map_fd < 0) {
error_cnt++;
if (CHECK_FAIL(map_fd < 0))
return;
}
struct {
char *name;
@ -112,16 +106,12 @@ static void test_global_data_rdonly(struct bpf_object *obj, __u32 duration)
__u8 *buff;
map = bpf_object__find_map_by_name(obj, "test_glo.rodata");
if (!map || !bpf_map__is_internal(map)) {
error_cnt++;
if (CHECK_FAIL(!map || !bpf_map__is_internal(map)))
return;
}
map_fd = bpf_map__fd(map);
if (map_fd < 0) {
error_cnt++;
if (CHECK_FAIL(map_fd < 0))
return;
}
buff = malloc(bpf_map__def(map)->value_size);
if (buff)


@ -30,10 +30,8 @@ static void test_l4lb(const char *file)
u32 *magic = (u32 *)buf;
err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
if (err) {
error_cnt++;
if (CHECK_FAIL(err))
return;
}
map_fd = bpf_find_map(__func__, obj, "vip_map");
if (map_fd < 0)
@ -72,10 +70,9 @@ static void test_l4lb(const char *file)
bytes += stats[i].bytes;
pkts += stats[i].pkts;
}
if (bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2) {
error_cnt++;
if (CHECK_FAIL(bytes != MAGIC_BYTES * NUM_ITER * 2 ||
pkts != NUM_ITER * 2))
printf("test_l4lb:FAIL:stats %lld %lld\n", bytes, pkts);
}
out:
bpf_object__close(obj);
}


@ -8,14 +8,12 @@ static void *parallel_map_access(void *arg)
for (i = 0; i < 10000; i++) {
err = bpf_map_lookup_elem_flags(map_fd, &key, vars, BPF_F_LOCK);
if (err) {
if (CHECK_FAIL(err)) {
printf("lookup failed\n");
error_cnt++;
goto out;
}
if (vars[0] != 0) {
if (CHECK_FAIL(vars[0] != 0)) {
printf("lookup #%d var[0]=%d\n", i, vars[0]);
error_cnt++;
goto out;
}
rnd = vars[1];
@ -24,7 +22,7 @@ static void *parallel_map_access(void *arg)
continue;
printf("lookup #%d var[1]=%d var[%d]=%d\n",
i, rnd, j, vars[j]);
error_cnt++;
CHECK_FAIL(vars[j] != rnd);
goto out;
}
}
@ -42,34 +40,36 @@ void test_map_lock(void)
void *ret;
err = bpf_prog_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
if (err) {
if (CHECK_FAIL(err)) {
printf("test_map_lock:bpf_prog_load errno %d\n", errno);
goto close_prog;
}
map_fd[0] = bpf_find_map(__func__, obj, "hash_map");
if (map_fd[0] < 0)
if (CHECK_FAIL(map_fd[0] < 0))
goto close_prog;
map_fd[1] = bpf_find_map(__func__, obj, "array_map");
if (map_fd[1] < 0)
if (CHECK_FAIL(map_fd[1] < 0))
goto close_prog;
bpf_map_update_elem(map_fd[0], &key, vars, BPF_F_LOCK);
for (i = 0; i < 4; i++)
assert(pthread_create(&thread_id[i], NULL,
&spin_lock_thread, &prog_fd) == 0);
if (CHECK_FAIL(pthread_create(&thread_id[i], NULL,
&spin_lock_thread, &prog_fd)))
goto close_prog;
for (i = 4; i < 6; i++)
assert(pthread_create(&thread_id[i], NULL,
&parallel_map_access, &map_fd[i - 4]) == 0);
if (CHECK_FAIL(pthread_create(&thread_id[i], NULL,
&parallel_map_access,
&map_fd[i - 4])))
goto close_prog;
for (i = 0; i < 4; i++)
assert(pthread_join(thread_id[i], &ret) == 0 &&
ret == (void *)&prog_fd);
if (CHECK_FAIL(pthread_join(thread_id[i], &ret) ||
ret != (void *)&prog_fd))
goto close_prog;
for (i = 4; i < 6; i++)
assert(pthread_join(thread_id[i], &ret) == 0 &&
ret == (void *)&map_fd[i - 4]);
goto close_prog_noerr;
if (CHECK_FAIL(pthread_join(thread_id[i], &ret) ||
ret != (void *)&map_fd[i - 4]))
goto close_prog;
close_prog:
error_cnt++;
close_prog_noerr:
bpf_object__close(obj);
}


@ -9,10 +9,8 @@ void test_pkt_access(void)
int err, prog_fd;
err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
if (err) {
error_cnt++;
if (CHECK_FAIL(err))
return;
}
err = bpf_prog_test_run(prog_fd, 100000, &pkt_v4, sizeof(pkt_v4),
NULL, NULL, &retval, &duration);


@ -9,10 +9,8 @@ void test_pkt_md_access(void)
int err, prog_fd;
err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
if (err) {
error_cnt++;
if (CHECK_FAIL(err))
return;
}
err = bpf_prog_test_run(prog_fd, 10, &pkt_v4, sizeof(pkt_v4),
NULL, NULL, &retval, &duration);


@ -27,10 +27,8 @@ static void test_queue_stack_map_by_type(int type)
return;
err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
if (err) {
error_cnt++;
if (CHECK_FAIL(err))
return;
}
map_in_fd = bpf_find_map(__func__, obj, "map_in");
if (map_in_fd < 0)
@ -43,10 +41,8 @@ static void test_queue_stack_map_by_type(int type)
/* Push 32 elements to the input map */
for (i = 0; i < MAP_SIZE; i++) {
err = bpf_map_update_elem(map_in_fd, NULL, &vals[i], 0);
if (err) {
error_cnt++;
if (CHECK_FAIL(err))
goto out;
}
}
/* The eBPF program pushes iph.saddr in the output map,


@ -10,10 +10,8 @@ void test_reference_tracking(void)
int err = 0;
obj = bpf_object__open(file);
if (IS_ERR(obj)) {
error_cnt++;
if (CHECK_FAIL(IS_ERR(obj)))
return;
}
bpf_object__for_each_program(prog, obj) {
const char *title;

Some files were not shown because too many files have changed in this diff.