netfilter: add generic flow table infrastructure

This patch defines the API to interact with flow tables, this allows to
add, delete and lookup for entries in the flow table. This also adds the
generic garbage code that removes entries that have expired, ie. no
traffic has been seen for a while.

Users of the flow table infrastructure can delete entries via
flow_offload_dead(), which sets the dying bit, this signals the garbage
collector to release an entry from user context.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
This commit is contained in:
Pablo Neira Ayuso 2018-01-07 01:04:11 +01:00
parent 3b49e2e94e
commit ac2a66665e
4 changed files with 533 additions and 0 deletions

View File

@ -1,7 +1,12 @@
#ifndef _NF_FLOW_TABLE_H
#define _NF_FLOW_TABLE_H
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/rhashtable.h>
#include <linux/rcupdate.h>
#include <net/dst.h>
struct nf_flowtable;
@ -20,4 +25,93 @@ struct nf_flowtable {
struct delayed_work gc_work;
};
enum flow_offload_tuple_dir {
FLOW_OFFLOAD_DIR_ORIGINAL,
FLOW_OFFLOAD_DIR_REPLY,
__FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY,
};
#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1)
struct flow_offload_tuple {
union {
struct in_addr src_v4;
struct in6_addr src_v6;
};
union {
struct in_addr dst_v4;
struct in6_addr dst_v6;
};
struct {
__be16 src_port;
__be16 dst_port;
};
int iifidx;
u8 l3proto;
u8 l4proto;
u8 dir;
int oifidx;
struct dst_entry *dst_cache;
};
struct flow_offload_tuple_rhash {
struct rhash_head node;
struct flow_offload_tuple tuple;
};
#define FLOW_OFFLOAD_SNAT 0x1
#define FLOW_OFFLOAD_DNAT 0x2
#define FLOW_OFFLOAD_DYING 0x4
struct flow_offload {
struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX];
u32 flags;
union {
/* Your private driver data here. */
u32 timeout;
};
};
#define NF_FLOW_TIMEOUT (30 * HZ)
struct nf_flow_route {
struct {
struct dst_entry *dst;
int ifindex;
} tuple[FLOW_OFFLOAD_DIR_MAX];
};
struct flow_offload *flow_offload_alloc(struct nf_conn *ct,
struct nf_flow_route *route);
void flow_offload_free(struct flow_offload *flow);
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow);
void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow);
struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table,
struct flow_offload_tuple *tuple);
int nf_flow_table_iterate(struct nf_flowtable *flow_table,
void (*iter)(struct flow_offload *flow, void *data),
void *data);
void nf_flow_offload_work_gc(struct work_struct *work);
extern const struct rhashtable_params nf_flow_offload_rhash_params;
void flow_offload_dead(struct flow_offload *flow);
int nf_flow_snat_port(const struct flow_offload *flow,
struct sk_buff *skb, unsigned int thoff,
u8 protocol, enum flow_offload_tuple_dir dir);
int nf_flow_dnat_port(const struct flow_offload *flow,
struct sk_buff *skb, unsigned int thoff,
u8 protocol, enum flow_offload_tuple_dir dir);
struct flow_ports {
__be16 source, dest;
};
#define MODULE_ALIAS_NF_FLOWTABLE(family) \
MODULE_ALIAS("nf-flowtable-" __stringify(family))
#endif /* _FLOW_OFFLOAD_H */

View File

@ -657,6 +657,13 @@ endif # NF_TABLES_NETDEV
endif # NF_TABLES
config NF_FLOW_TABLE
tristate "Netfilter flow table module"
help
This option adds the flow table core infrastructure.
To compile it as a module, choose M here.
config NETFILTER_XTABLES
tristate "Netfilter Xtables support (required for ip_tables)"
default m if NETFILTER_ADVANCED=n

View File

@ -109,6 +109,9 @@ obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o
obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o
# flow table infrastructure
obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
# generic X tables
obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o

View File

@ -0,0 +1,429 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_tuple.h>
struct flow_offload_entry {
struct flow_offload flow;
struct nf_conn *ct;
struct rcu_head rcu_head;
};
struct flow_offload *
flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
{
struct flow_offload_entry *entry;
struct flow_offload *flow;
if (unlikely(nf_ct_is_dying(ct) ||
!atomic_inc_not_zero(&ct->ct_general.use)))
return NULL;
entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
if (!entry)
goto err_ct_refcnt;
flow = &entry->flow;
if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst))
goto err_dst_cache_original;
if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst))
goto err_dst_cache_reply;
entry->ct = ct;
switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) {
case NFPROTO_IPV4:
flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 =
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in;
flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 =
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in;
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 =
ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in;
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 =
ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in;
break;
case NFPROTO_IPV6:
flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 =
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 =
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6;
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 =
ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6;
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 =
ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6;
break;
}
flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto =
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto =
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto =
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto =
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache =
route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst;
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache =
route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst;
flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port =
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port;
flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port =
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port =
ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port;
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port =
ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir =
FLOW_OFFLOAD_DIR_ORIGINAL;
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir =
FLOW_OFFLOAD_DIR_REPLY;
flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx =
route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx =
route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx =
route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx =
route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
if (ct->status & IPS_SRC_NAT)
flow->flags |= FLOW_OFFLOAD_SNAT;
else if (ct->status & IPS_DST_NAT)
flow->flags |= FLOW_OFFLOAD_DNAT;
return flow;
err_dst_cache_reply:
dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
err_dst_cache_original:
kfree(entry);
err_ct_refcnt:
nf_ct_put(ct);
return NULL;
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);
void flow_offload_free(struct flow_offload *flow)
{
struct flow_offload_entry *e;
dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
e = container_of(flow, struct flow_offload_entry, flow);
kfree(e);
}
EXPORT_SYMBOL_GPL(flow_offload_free);
void flow_offload_dead(struct flow_offload *flow)
{
flow->flags |= FLOW_OFFLOAD_DYING;
}
EXPORT_SYMBOL_GPL(flow_offload_dead);
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
flow->timeout = (u32)jiffies;
rhashtable_insert_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
*flow_table->type->params);
rhashtable_insert_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
*flow_table->type->params);
return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);
void flow_offload_del(struct nf_flowtable *flow_table,
struct flow_offload *flow)
{
struct flow_offload_entry *e;
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
*flow_table->type->params);
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
*flow_table->type->params);
e = container_of(flow, struct flow_offload_entry, flow);
kfree_rcu(e, rcu_head);
}
EXPORT_SYMBOL_GPL(flow_offload_del);
struct flow_offload_tuple_rhash *
flow_offload_lookup(struct nf_flowtable *flow_table,
struct flow_offload_tuple *tuple)
{
return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
*flow_table->type->params);
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);
static void nf_flow_release_ct(const struct flow_offload *flow)
{
struct flow_offload_entry *e;
e = container_of(flow, struct flow_offload_entry, flow);
nf_ct_delete(e->ct, 0, 0);
nf_ct_put(e->ct);
}
int nf_flow_table_iterate(struct nf_flowtable *flow_table,
void (*iter)(struct flow_offload *flow, void *data),
void *data)
{
struct flow_offload_tuple_rhash *tuplehash;
struct rhashtable_iter hti;
struct flow_offload *flow;
int err;
err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
if (err)
return err;
rhashtable_walk_start(&hti);
while ((tuplehash = rhashtable_walk_next(&hti))) {
if (IS_ERR(tuplehash)) {
err = PTR_ERR(tuplehash);
if (err != -EAGAIN)
goto out;
continue;
}
if (tuplehash->tuple.dir)
continue;
flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
iter(flow, data);
}
out:
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
return err;
}
EXPORT_SYMBOL_GPL(nf_flow_table_iterate);
static inline bool nf_flow_has_expired(const struct flow_offload *flow)
{
return (__s32)(flow->timeout - (u32)jiffies) <= 0;
}
static inline bool nf_flow_is_dying(const struct flow_offload *flow)
{
return flow->flags & FLOW_OFFLOAD_DYING;
}
void nf_flow_offload_work_gc(struct work_struct *work)
{
struct flow_offload_tuple_rhash *tuplehash;
struct nf_flowtable *flow_table;
struct rhashtable_iter hti;
struct flow_offload *flow;
int err;
flow_table = container_of(work, struct nf_flowtable, gc_work.work);
err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
if (err)
goto schedule;
rhashtable_walk_start(&hti);
while ((tuplehash = rhashtable_walk_next(&hti))) {
if (IS_ERR(tuplehash)) {
err = PTR_ERR(tuplehash);
if (err != -EAGAIN)
goto out;
continue;
}
if (tuplehash->tuple.dir)
continue;
flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
if (nf_flow_has_expired(flow) ||
nf_flow_is_dying(flow)) {
flow_offload_del(flow_table, flow);
nf_flow_release_ct(flow);
}
}
out:
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
schedule:
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
{
const struct flow_offload_tuple *tuple = data;
return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
}
static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
const struct flow_offload_tuple_rhash *tuplehash = data;
return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
}
static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
const void *ptr)
{
const struct flow_offload_tuple *tuple = arg->key;
const struct flow_offload_tuple_rhash *x = ptr;
if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
return 1;
return 0;
}
const struct rhashtable_params nf_flow_offload_rhash_params = {
.head_offset = offsetof(struct flow_offload_tuple_rhash, node),
.hashfn = flow_offload_hash,
.obj_hashfn = flow_offload_hash_obj,
.obj_cmpfn = flow_offload_hash_cmp,
.automatic_shrinking = true,
};
EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
__be16 port, __be16 new_port)
{
struct tcphdr *tcph;
if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
skb_try_make_writable(skb, thoff + sizeof(*tcph)))
return -1;
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);
return 0;
}
static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
__be16 port, __be16 new_port)
{
struct udphdr *udph;
if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
skb_try_make_writable(skb, thoff + sizeof(*udph)))
return -1;
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace2(&udph->check, skb, port,
new_port, true);
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
return 0;
}
static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
u8 protocol, __be16 port, __be16 new_port)
{
switch (protocol) {
case IPPROTO_TCP:
if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
return NF_DROP;
break;
case IPPROTO_UDP:
if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
return NF_DROP;
break;
}
return 0;
}
int nf_flow_snat_port(const struct flow_offload *flow,
struct sk_buff *skb, unsigned int thoff,
u8 protocol, enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
skb_try_make_writable(skb, thoff + sizeof(*hdr)))
return -1;
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
case FLOW_OFFLOAD_DIR_ORIGINAL:
port = hdr->source;
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
hdr->source = new_port;
break;
case FLOW_OFFLOAD_DIR_REPLY:
port = hdr->dest;
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
hdr->dest = new_port;
break;
default:
return -1;
}
return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);
int nf_flow_dnat_port(const struct flow_offload *flow,
struct sk_buff *skb, unsigned int thoff,
u8 protocol, enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
skb_try_make_writable(skb, thoff + sizeof(*hdr)))
return -1;
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
case FLOW_OFFLOAD_DIR_ORIGINAL:
port = hdr->dest;
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
hdr->dest = new_port;
break;
case FLOW_OFFLOAD_DIR_REPLY:
port = hdr->source;
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
hdr->source = new_port;
break;
default:
return -1;
}
return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");