linux-brain/net/netfilter/core.c

/* netfilter.c: look after the filters for various protocols.
* Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
*
* Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
* way.
*
* This code is GPL.
*/
#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <net/protocol.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/wait.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/if.h>
#include <linux/netdevice.h>
#include <linux/netfilter_ipv6.h>
#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
#include <linux/mutex.h>
#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_queue.h>
#include <net/sock.h>
#include "nf_internals.h"
const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_ipv6_ops);
DEFINE_PER_CPU(bool, nf_skb_duplicated);
EXPORT_SYMBOL_GPL(nf_skb_duplicated);
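/* One static key per (family, hook) pair lets the NF_HOOK() fast path
 * compile down to a patched-out branch while no hook is registered
 * there, keeping netfilter entirely off the packet path for non-users.
 */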
#ifdef CONFIG_JUMP_LABEL
struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
EXPORT_SYMBOL(nf_hooks_needed);
#endif
static DEFINE_MUTEX(nf_hook_mutex);
/* max hooks per family/hooknum */
#define MAX_HOOK_COUNT 1024
#define nf_entry_dereference(e) \
rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex))
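/* A hook "blob" is a single allocation laid out as:
 *
 *	struct nf_hook_entries            (header: num_hook_entries)
 *	num x struct nf_hook_entry        (hook fn + priv; the hot path)
 *	num x struct nf_hook_ops *        (registration info; control path)
 *	one struct nf_hook_entries_rcu_head  (for deferred freeing)
 *
 * Readers only walk the leading entry array; the trailing parts exist
 * so registration and RCU teardown need no separate allocations.
 */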
static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
{
struct nf_hook_entries *e;
size_t alloc = sizeof(*e) +
sizeof(struct nf_hook_entry) * num +
sizeof(struct nf_hook_ops *) * num +
sizeof(struct nf_hook_entries_rcu_head);
if (num == 0)
return NULL;
e = kvzalloc(alloc, GFP_KERNEL);
if (e)
e->num_hook_entries = num;
return e;
}
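/* The rcu_head used for deferred freeing lives in the tail of the blob
 * itself (just past the ops pointer array), so releasing an entry set
 * never has to allocate memory.
 */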
static void __nf_hook_entries_free(struct rcu_head *h)
{
struct nf_hook_entries_rcu_head *head;
head = container_of(h, struct nf_hook_entries_rcu_head, head);
kvfree(head->allocation);
}
static void nf_hook_entries_free(struct nf_hook_entries *e)
{
struct nf_hook_entries_rcu_head *head;
struct nf_hook_ops **ops;
unsigned int num;
if (!e)
return;
num = e->num_hook_entries;
ops = nf_hook_entries_get_hook_ops(e);
head = (void *)&ops[num];
head->allocation = e;
call_rcu(&head->head, __nf_hook_entries_free);
}
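/* Removal must not fail and must not disturb concurrent RCU readers, so
 * a deleted hook is overwritten in place with this always-accept dummy;
 * the slot is reclaimed later when the blob is shrunk or regrown.
 */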
static unsigned int accept_all(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
return NF_ACCEPT; /* ACCEPT makes nf_hook_slow call next hook */
}
static const struct nf_hook_ops dummy_ops = {
.hook = accept_all,
.priority = INT_MIN,
};
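/* Build a new blob containing all live hooks from @old plus @reg,
 * kept in ascending priority order; dummy slots are dropped on the way.
 * @old itself is never modified, so RCU readers can keep using it until
 * the new blob is published.
 */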
static struct nf_hook_entries *
nf_hook_entries_grow(const struct nf_hook_entries *old,
const struct nf_hook_ops *reg)
{
unsigned int i, alloc_entries, nhooks, old_entries;
struct nf_hook_ops **orig_ops = NULL;
struct nf_hook_ops **new_ops;
struct nf_hook_entries *new;
bool inserted = false;
alloc_entries = 1;
old_entries = old ? old->num_hook_entries : 0;
if (old) {
orig_ops = nf_hook_entries_get_hook_ops(old);
for (i = 0; i < old_entries; i++) {
if (orig_ops[i] != &dummy_ops)
alloc_entries++;
}
}
if (alloc_entries > MAX_HOOK_COUNT)
return ERR_PTR(-E2BIG);
new = allocate_hook_entries_size(alloc_entries);
if (!new)
return ERR_PTR(-ENOMEM);
new_ops = nf_hook_entries_get_hook_ops(new);
i = 0;
nhooks = 0;
while (i < old_entries) {
if (orig_ops[i] == &dummy_ops) {
++i;
continue;
}
if (inserted || reg->priority > orig_ops[i]->priority) {
new_ops[nhooks] = (void *)orig_ops[i];
new->hooks[nhooks] = old->hooks[i];
i++;
} else {
new_ops[nhooks] = (void *)reg;
new->hooks[nhooks].hook = reg->hook;
new->hooks[nhooks].priv = reg->priv;
inserted = true;
}
nhooks++;
}
if (!inserted) {
new_ops[nhooks] = (void *)reg;
new->hooks[nhooks].hook = reg->hook;
new->hooks[nhooks].priv = reg->priv;
}
return new;
}
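/* Debug-only sanity check: live entries must remain sorted by ascending
 * priority, otherwise hook ordering guarantees are broken.
 */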
static void hooks_validate(const struct nf_hook_entries *hooks)
{
#ifdef CONFIG_DEBUG_MISC
struct nf_hook_ops **orig_ops;
int prio = INT_MIN;
size_t i = 0;
orig_ops = nf_hook_entries_get_hook_ops(hooks);
for (i = 0; i < hooks->num_hook_entries; i++) {
if (orig_ops[i] == &dummy_ops)
continue;
WARN_ON(orig_ops[i]->priority < prio);
if (orig_ops[i]->priority > prio)
prio = orig_ops[i]->priority;
}
#endif
}
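/* Insert @reg into the blob at @pp. The caller must provide its own
 * serialization of writers, hence the raw dereference and the absence
 * of nf_hook_mutex here.
 */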
int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp,
const struct nf_hook_ops *reg)
{
struct nf_hook_entries *new_hooks;
struct nf_hook_entries *p;
p = rcu_dereference_raw(*pp);
new_hooks = nf_hook_entries_grow(p, reg);
if (IS_ERR(new_hooks))
return PTR_ERR(new_hooks);
hooks_validate(new_hooks);
rcu_assign_pointer(*pp, new_hooks);
BUG_ON(p == new_hooks);
nf_hook_entries_free(p);
return 0;
}
EXPORT_SYMBOL_GPL(nf_hook_entries_insert_raw);
/*
 * __nf_hook_entries_try_shrink - try to shrink hook array
 *
 * @old -- current hook blob at @pp
 * @pp -- location of hook blob
 *
 * Hook unregistration must always succeed, so to-be-removed hooks
 * are replaced by a dummy one that simply moves on to the next hook.
 *
 * This counts the current dummy hooks, attempts to allocate a new blob,
 * copies the live hooks, then publishes the new blob and discards the
 * old one.
 *
 * Returns the address of the old blob for the caller to free, or NULL
 * if no replacement took place.
 */
static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old,
struct nf_hook_entries __rcu **pp)
{
unsigned int i, j, skip = 0, hook_entries;
struct nf_hook_entries *new = NULL;
struct nf_hook_ops **orig_ops;
struct nf_hook_ops **new_ops;
if (WARN_ON_ONCE(!old))
return NULL;
orig_ops = nf_hook_entries_get_hook_ops(old);
for (i = 0; i < old->num_hook_entries; i++) {
if (orig_ops[i] == &dummy_ops)
skip++;
}
/* if skip == hook_entries all hooks have been removed */
hook_entries = old->num_hook_entries;
if (skip == hook_entries)
goto out_assign;
if (skip == 0)
return NULL;
hook_entries -= skip;
new = allocate_hook_entries_size(hook_entries);
if (!new)
return NULL;
new_ops = nf_hook_entries_get_hook_ops(new);
for (i = 0, j = 0; i < old->num_hook_entries; i++) {
if (orig_ops[i] == &dummy_ops)
continue;
new->hooks[j] = old->hooks[i];
new_ops[j] = (void *)orig_ops[i];
j++;
}
hooks_validate(new);
out_assign:
rcu_assign_pointer(*pp, new);
return old;
}
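/* Map (pf, hooknum, dev) to the location of the corresponding hook blob
 * pointer: per-netns arrays for the regular families, the net_device
 * itself for NFPROTO_NETDEV ingress hooks.
 */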
static struct nf_hook_entries __rcu **
nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
struct net_device *dev)
{
switch (pf) {
case NFPROTO_NETDEV:
break;
#ifdef CONFIG_NETFILTER_FAMILY_ARP
case NFPROTO_ARP:
if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_arp) <= hooknum))
return NULL;
return net->nf.hooks_arp + hooknum;
#endif
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
case NFPROTO_BRIDGE:
if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= hooknum))
return NULL;
return net->nf.hooks_bridge + hooknum;
#endif
case NFPROTO_IPV4:
if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum))
return NULL;
return net->nf.hooks_ipv4 + hooknum;
case NFPROTO_IPV6:
if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum))
return NULL;
return net->nf.hooks_ipv6 + hooknum;
#if IS_ENABLED(CONFIG_DECNET)
case NFPROTO_DECNET:
if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_decnet) <= hooknum))
return NULL;
return net->nf.hooks_decnet + hooknum;
#endif
default:
WARN_ON_ONCE(1);
return NULL;
}
#ifdef CONFIG_NETFILTER_INGRESS
if (hooknum == NF_NETDEV_INGRESS) {
if (dev && dev_net(dev) == net)
return &dev->nf_hooks_ingress;
}
#endif
WARN_ON_ONCE(1);
return NULL;
}
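/* Register one hook: locate the blob head, grow a new blob under
 * nf_hook_mutex, publish it with rcu_assign_pointer(), account for the
 * ingress queue / static key, then free the old blob after a grace
 * period.
 */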
static int __nf_register_net_hook(struct net *net, int pf,
const struct nf_hook_ops *reg)
{
struct nf_hook_entries *p, *new_hooks;
struct nf_hook_entries __rcu **pp;
if (pf == NFPROTO_NETDEV) {
#ifndef CONFIG_NETFILTER_INGRESS
if (reg->hooknum == NF_NETDEV_INGRESS)
return -EOPNOTSUPP;
#endif
if (reg->hooknum != NF_NETDEV_INGRESS ||
!reg->dev || dev_net(reg->dev) != net)
return -EINVAL;
}
pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);
if (!pp)
return -EINVAL;
mutex_lock(&nf_hook_mutex);
p = nf_entry_dereference(*pp);
new_hooks = nf_hook_entries_grow(p, reg);
if (!IS_ERR(new_hooks))
rcu_assign_pointer(*pp, new_hooks);
mutex_unlock(&nf_hook_mutex);
if (IS_ERR(new_hooks))
return PTR_ERR(new_hooks);
hooks_validate(new_hooks);
#ifdef CONFIG_NETFILTER_INGRESS
if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
net_inc_ingress_queue();
#endif
#ifdef CONFIG_JUMP_LABEL
static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]);
#endif
BUG_ON(p == new_hooks);
nf_hook_entries_free(p);
return 0;
}
/*
 * nf_remove_net_hook - remove a hook from blob
 *
 * @old: current hook blob
 * @unreg: hook to unregister
 *
 * This cannot fail, hook unregistration must always succeed.
 * Therefore replace the to-be-removed hook with a dummy hook.
 */
static bool nf_remove_net_hook(struct nf_hook_entries *old,
const struct nf_hook_ops *unreg)
{
struct nf_hook_ops **orig_ops;
unsigned int i;
orig_ops = nf_hook_entries_get_hook_ops(old);
for (i = 0; i < old->num_hook_entries; i++) {
if (orig_ops[i] != unreg)
continue;
WRITE_ONCE(old->hooks[i].hook, accept_all);
WRITE_ONCE(orig_ops[i], &dummy_ops);
return true;
}
return false;
}
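/* Unregister one hook: overwrite it with the dummy (cannot fail), drop
 * the accounting, then opportunistically shrink the blob. If an old
 * blob is to be discarded, packets queued to userspace are flushed
 * before it is released.
 */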
static void __nf_unregister_net_hook(struct net *net, int pf,
const struct nf_hook_ops *reg)
{
struct nf_hook_entries __rcu **pp;
struct nf_hook_entries *p;
pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);
if (!pp)
return;
mutex_lock(&nf_hook_mutex);
p = nf_entry_dereference(*pp);
if (WARN_ON_ONCE(!p)) {
mutex_unlock(&nf_hook_mutex);
return;
}
if (nf_remove_net_hook(p, reg)) {
#ifdef CONFIG_NETFILTER_INGRESS
if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
net_dec_ingress_queue();
#endif
#ifdef CONFIG_JUMP_LABEL
static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]);
#endif
} else {
WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum);
}
p = __nf_hook_entries_try_shrink(p, pp);
mutex_unlock(&nf_hook_mutex);
if (!p)
return;
nf_queue_nf_hook_drop(net);
nf_hook_entries_free(p);
}
void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
if (reg->pf == NFPROTO_INET) {
__nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
__nf_unregister_net_hook(net, NFPROTO_IPV6, reg);
} else {
__nf_unregister_net_hook(net, reg->pf, reg);
}
}
EXPORT_SYMBOL(nf_unregister_net_hook);
void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp,
const struct nf_hook_ops *reg)
{
struct nf_hook_entries *p;
p = rcu_dereference_raw(*pp);
if (nf_remove_net_hook(p, reg)) {
p = __nf_hook_entries_try_shrink(p, pp);
nf_hook_entries_free(p);
}
}
EXPORT_SYMBOL_GPL(nf_hook_entries_delete_raw);
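/* Sketch of typical module usage (my_hookfn/my_ops are illustrative
 * names, not part of this file):
 *
 *	static unsigned int my_hookfn(void *priv, struct sk_buff *skb,
 *				      const struct nf_hook_state *state)
 *	{
 *		return NF_ACCEPT;
 *	}
 *
 *	static const struct nf_hook_ops my_ops = {
 *		.hook		= my_hookfn,
 *		.pf		= NFPROTO_IPV4,
 *		.hooknum	= NF_INET_LOCAL_IN,
 *		.priority	= NF_IP_PRI_FILTER,
 *	};
 *
 *	err = nf_register_net_hook(net, &my_ops);
 *
 * NFPROTO_INET is expanded below into separate IPv4 and IPv6
 * registrations, with rollback if the second one fails.
 */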
int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
int err;
if (reg->pf == NFPROTO_INET) {
err = __nf_register_net_hook(net, NFPROTO_IPV4, reg);
if (err < 0)
return err;
err = __nf_register_net_hook(net, NFPROTO_IPV6, reg);
if (err < 0) {
__nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
return err;
}
} else {
err = __nf_register_net_hook(net, reg->pf, reg);
if (err < 0)
return err;
}
return 0;
}
EXPORT_SYMBOL(nf_register_net_hook);
int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
unsigned int n)
{
unsigned int i;
int err = 0;
for (i = 0; i < n; i++) {
err = nf_register_net_hook(net, &reg[i]);
if (err)
goto err;
}
return err;
err:
if (i > 0)
nf_unregister_net_hooks(net, reg, i);
return err;
}
EXPORT_SYMBOL(nf_register_net_hooks);
void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
unsigned int hookcount)
{
unsigned int i;
for (i = 0; i < hookcount; i++)
nf_unregister_net_hook(net, &reg[i]);
}
EXPORT_SYMBOL(nf_unregister_net_hooks);
/* Returns 1 if okfn() needs to be executed by the caller,
* -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */
int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
const struct nf_hook_entries *e, unsigned int s)
{
unsigned int verdict;
int ret;
for (; s < e->num_hook_entries; s++) {
verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
switch (verdict & NF_VERDICT_MASK) {
case NF_ACCEPT:
break;
case NF_DROP:
kfree_skb(skb);
ret = NF_DROP_GETERR(verdict);
if (ret == 0)
ret = -EPERM;
return ret;
case NF_QUEUE:
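/* nf_queue() returns 1 when the packet could not be
 * queued but the verdict carried
 * NF_VERDICT_FLAG_QUEUE_BYPASS: continue with the next
 * hook instead of dropping the packet.
 */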
ret = nf_queue(skb, state, s, verdict);
if (ret == 1)
continue;
return ret;
default:
/* Implicit handling for NF_STOLEN, as well as any other
* non-conventional verdicts.
*/
return 0;
}
}
return 1;
}
EXPORT_SYMBOL(nf_hook_slow);
/* This needs to be compiled in any case to avoid dependencies between the
* nfnetlink_queue code and nf_conntrack.
*/
struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nfnl_ct_hook);
struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_hook);
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* This does not belong here, but locally generated errors need it if
   connection tracking is in use: without this, the connection may not be
   in the hash table, and hence manufactured ICMP or RST packets will not
   be associated with it. */
void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
__rcu __read_mostly;
EXPORT_SYMBOL(ip_ct_attach);
struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_nat_hook);
void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
{
void (*attach)(struct sk_buff *, const struct sk_buff *);
if (skb->_nfct) {
rcu_read_lock();
attach = rcu_dereference(ip_ct_attach);
if (attach)
attach(new, skb);
rcu_read_unlock();
}
}
EXPORT_SYMBOL(nf_ct_attach);
void nf_conntrack_destroy(struct nf_conntrack *nfct)
{
struct nf_ct_hook *ct_hook;
rcu_read_lock();
ct_hook = rcu_dereference(nf_ct_hook);
BUG_ON(ct_hook == NULL);
ct_hook->destroy(nfct);
rcu_read_unlock();
}
EXPORT_SYMBOL(nf_conntrack_destroy);
bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
const struct sk_buff *skb)
{
struct nf_ct_hook *ct_hook;
bool ret = false;
rcu_read_lock();
ct_hook = rcu_dereference(nf_ct_hook);
if (ct_hook)
ret = ct_hook->get_tuple_skb(dst_tuple, skb);
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(nf_ct_get_tuple_skb);
/* Built-in default zone used e.g. by modules. */
const struct nf_conntrack_zone nf_ct_zone_dflt = {
.id = NF_CT_DEFAULT_ZONE_ID,
.dir = NF_CT_DEFAULT_ZONE_DIR,
};
EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
#endif /* CONFIG_NF_CONNTRACK */
static void __net_init
__netfilter_net_init(struct nf_hook_entries __rcu **e, int max)
{
int h;
for (h = 0; h < max; h++)
RCU_INIT_POINTER(e[h], NULL);
}
static int __net_init netfilter_net_init(struct net *net)
{
__netfilter_net_init(net->nf.hooks_ipv4, ARRAY_SIZE(net->nf.hooks_ipv4));
__netfilter_net_init(net->nf.hooks_ipv6, ARRAY_SIZE(net->nf.hooks_ipv6));
#ifdef CONFIG_NETFILTER_FAMILY_ARP
__netfilter_net_init(net->nf.hooks_arp, ARRAY_SIZE(net->nf.hooks_arp));
#endif
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
__netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge));
#endif
#if IS_ENABLED(CONFIG_DECNET)
__netfilter_net_init(net->nf.hooks_decnet, ARRAY_SIZE(net->nf.hooks_decnet));
#endif
#ifdef CONFIG_PROC_FS
net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
net->proc_net);
if (!net->nf.proc_netfilter) {
if (!net_eq(net, &init_net))
pr_err("cannot create netfilter proc entry\n");
return -ENOMEM;
}
#endif
return 0;
}
static void __net_exit netfilter_net_exit(struct net *net)
{
remove_proc_entry("netfilter", net->proc_net);
}
static struct pernet_operations netfilter_net_ops = {
.init = netfilter_net_init,
.exit = netfilter_net_exit,
};
int __init netfilter_init(void)
{
int ret;
ret = register_pernet_subsys(&netfilter_net_ops);
if (ret < 0)
goto err;
ret = netfilter_log_init();
if (ret < 0)
goto err_pernet;
return 0;
err_pernet:
unregister_pernet_subsys(&netfilter_net_ops);
err:
return ret;
}