linux-brain/net/netfilter/nf_synproxy_core.c


// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
*/
#include <linux/module.h>
#include <linux/skbuff.h>
#include <asm/unaligned.h>
#include <net/tcp.h>
#include <net/netns/generic.h>
#include <linux/proc_fs.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter/nf_synproxy.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_synproxy.h>
unsigned int synproxy_net_id;
EXPORT_SYMBOL_GPL(synproxy_net_id);
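/* Parse the TCP options at @doff + sizeof(*th) into @opts. Only the
 * options synproxy cares about (MSS, window scale, timestamps, SACK
 * permitted) are extracted. Returns false if the option block cannot
 * be read from the skb; a malformed option merely stops parsing.
 */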
bool
synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
const struct tcphdr *th, struct synproxy_options *opts)
{
int length = (th->doff * 4) - sizeof(*th);
u8 buf[40], *ptr;
if (unlikely(length < 0))
return false;
ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf);
if (ptr == NULL)
return false;
opts->options = 0;
while (length > 0) {
int opcode = *ptr++;
int opsize;
switch (opcode) {
case TCPOPT_EOL:
return true;
case TCPOPT_NOP:
length--;
continue;
default:
if (length < 2)
return true;
opsize = *ptr++;
if (opsize < 2)
return true;
if (opsize > length)
return true;
switch (opcode) {
case TCPOPT_MSS:
if (opsize == TCPOLEN_MSS) {
opts->mss_option = get_unaligned_be16(ptr);
opts->options |= NF_SYNPROXY_OPT_MSS;
}
break;
case TCPOPT_WINDOW:
if (opsize == TCPOLEN_WINDOW) {
opts->wscale = *ptr;
if (opts->wscale > TCP_MAX_WSCALE)
opts->wscale = TCP_MAX_WSCALE;
opts->options |= NF_SYNPROXY_OPT_WSCALE;
}
break;
case TCPOPT_TIMESTAMP:
if (opsize == TCPOLEN_TIMESTAMP) {
opts->tsval = get_unaligned_be32(ptr);
opts->tsecr = get_unaligned_be32(ptr + 4);
opts->options |= NF_SYNPROXY_OPT_TIMESTAMP;
}
break;
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM)
opts->options |= NF_SYNPROXY_OPT_SACK_PERM;
break;
}
ptr += opsize - 2;
length -= opsize;
}
}
return true;
}
EXPORT_SYMBOL_GPL(synproxy_parse_options);
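/* Size in bytes of the option block synproxy_build_options() will emit
 * for @opts, including NOP padding. SACK permitted shares the leading
 * word of the timestamp option, so it adds no space when timestamps
 * are also present.
 */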
static unsigned int
synproxy_options_size(const struct synproxy_options *opts)
{
unsigned int size = 0;
if (opts->options & NF_SYNPROXY_OPT_MSS)
size += TCPOLEN_MSS_ALIGNED;
if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
size += TCPOLEN_TSTAMP_ALIGNED;
else if (opts->options & NF_SYNPROXY_OPT_SACK_PERM)
size += TCPOLEN_SACKPERM_ALIGNED;
if (opts->options & NF_SYNPROXY_OPT_WSCALE)
size += TCPOLEN_WSCALE_ALIGNED;
return size;
}
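/* Write the options described by @opts directly behind the TCP header,
 * NOP-padded to 32-bit boundaries. The caller must have reserved
 * synproxy_options_size(opts) bytes and set th->doff accordingly.
 */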
static void
synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
{
__be32 *ptr = (__be32 *)(th + 1);
u8 options = opts->options;
if (options & NF_SYNPROXY_OPT_MSS)
*ptr++ = htonl((TCPOPT_MSS << 24) |
(TCPOLEN_MSS << 16) |
opts->mss_option);
if (options & NF_SYNPROXY_OPT_TIMESTAMP) {
if (options & NF_SYNPROXY_OPT_SACK_PERM)
*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
(TCPOLEN_SACK_PERM << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
else
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
*ptr++ = htonl(opts->tsval);
*ptr++ = htonl(opts->tsecr);
} else if (options & NF_SYNPROXY_OPT_SACK_PERM)
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
if (options & NF_SYNPROXY_OPT_WSCALE)
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
opts->wscale);
}
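/* Encode connection state in the low six bits of the timestamp cookie:
 *
 *	bits 0-3:	peer's window scale (0xf if not supported)
 *	bit 4:		SACK permitted
 *	bit 5:		ECN
 *
 * The peer's TSval is echoed in tsecr; our TSval is the current
 * timestamp with its low six bits replaced by the encoding above, so
 * the state comes back to us in the client's TSecr. opts->wscale is
 * then replaced by the scale we advertise ourselves (info->wscale).
 */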
void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info,
struct synproxy_options *opts)
{
opts->tsecr = opts->tsval;
opts->tsval = tcp_time_stamp_raw() & ~0x3f;
if (opts->options & NF_SYNPROXY_OPT_WSCALE) {
opts->tsval |= opts->wscale;
opts->wscale = info->wscale;
} else
opts->tsval |= 0xf;
if (opts->options & NF_SYNPROXY_OPT_SACK_PERM)
opts->tsval |= 1 << 4;
if (opts->options & NF_SYNPROXY_OPT_ECN)
opts->tsval |= 1 << 5;
}
EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie);
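/* Recover the state encoded by synproxy_init_timestamp_cookie() from
 * the TSecr echoed in the client's ACK.
 */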
static void
synproxy_check_timestamp_cookie(struct synproxy_options *opts)
{
opts->wscale = opts->tsecr & 0xf;
if (opts->wscale != 0xf)
opts->options |= NF_SYNPROXY_OPT_WSCALE;
opts->options |= opts->tsecr & (1 << 4) ? NF_SYNPROXY_OPT_SACK_PERM : 0;
opts->options |= opts->tsecr & (1 << 5) ? NF_SYNPROXY_OPT_ECN : 0;
}
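/* Rewrite the TCP timestamp option by the offset between the server's
 * timestamp clock and the cookie clock: TSval is shifted down in the
 * reply direction, TSecr shifted up in the original direction, with an
 * incremental checksum fixup. Returns 0 on failure (unwritable skb or
 * malformed options), 1 otherwise.
 */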
static unsigned int
synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff,
struct tcphdr *th, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
const struct nf_conn_synproxy *synproxy)
{
unsigned int optoff, optend;
__be32 *ptr, old;
if (synproxy->tsoff == 0)
return 1;
optoff = protoff + sizeof(struct tcphdr);
optend = protoff + th->doff * 4;
if (skb_ensure_writable(skb, optend))
return 0;
while (optoff < optend) {
unsigned char *op = skb->data + optoff;
switch (op[0]) {
case TCPOPT_EOL:
return 1;
case TCPOPT_NOP:
optoff++;
continue;
default:
if (optoff + 1 == optend ||
optoff + op[1] > optend ||
op[1] < 2)
return 0;
if (op[0] == TCPOPT_TIMESTAMP &&
op[1] == TCPOLEN_TIMESTAMP) {
if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
ptr = (__be32 *)&op[2];
old = *ptr;
*ptr = htonl(ntohl(*ptr) -
synproxy->tsoff);
} else {
ptr = (__be32 *)&op[6];
old = *ptr;
*ptr = htonl(ntohl(*ptr) +
synproxy->tsoff);
}
inet_proto_csum_replace4(&th->check, skb,
old, *ptr, false);
return 1;
}
optoff += op[1];
}
}
return 1;
}
static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
.len = sizeof(struct nf_conn_synproxy),
.align = __alignof__(struct nf_conn_synproxy),
.id = NF_CT_EXT_SYNPROXY,
};
#ifdef CONFIG_PROC_FS
static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
int cpu;
if (*pos == 0)
return SEQ_START_TOKEN;
for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) {
if (!cpu_possible(cpu))
continue;
*pos = cpu + 1;
return per_cpu_ptr(snet->stats, cpu);
}
return NULL;
}
static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
int cpu;
for (cpu = *pos; cpu < nr_cpu_ids; cpu++) {
if (!cpu_possible(cpu))
continue;
*pos = cpu + 1;
return per_cpu_ptr(snet->stats, cpu);
}
(*pos)++;
return NULL;
}
static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v)
{
return;
}
static int synproxy_cpu_seq_show(struct seq_file *seq, void *v)
{
struct synproxy_stats *stats = v;
if (v == SEQ_START_TOKEN) {
seq_puts(seq, "entries\t\tsyn_received\t"
"cookie_invalid\tcookie_valid\t"
"cookie_retrans\tconn_reopened\n");
return 0;
}
seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0,
stats->syn_received,
stats->cookie_invalid,
stats->cookie_valid,
stats->cookie_retrans,
stats->conn_reopened);
return 0;
}
static const struct seq_operations synproxy_cpu_seq_ops = {
.start = synproxy_cpu_seq_start,
.next = synproxy_cpu_seq_next,
.stop = synproxy_cpu_seq_stop,
.show = synproxy_cpu_seq_show,
};
static int __net_init synproxy_proc_init(struct net *net)
{
if (!proc_create_net("synproxy", 0444, net->proc_net_stat,
&synproxy_cpu_seq_ops, sizeof(struct seq_net_private)))
return -ENOMEM;
return 0;
}
static void __net_exit synproxy_proc_exit(struct net *net)
{
remove_proc_entry("synproxy", net->proc_net_stat);
}
#else
static int __net_init synproxy_proc_init(struct net *net)
{
return 0;
}
static void __net_exit synproxy_proc_exit(struct net *net)
{
return;
}
#endif /* CONFIG_PROC_FS */
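/* Per-netns setup: a conntrack template carrying the seqadj and
 * synproxy extensions for proxied connections, the per-cpu statistics
 * and /proc/net/stat/synproxy. The template comes from
 * nf_ct_tmpl_alloc() rather than the conntrack slab so that netns
 * teardown does not depend on conntrack exit ordering.
 */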
static int __net_init synproxy_net_init(struct net *net)
{
struct synproxy_net *snet = synproxy_pernet(net);
struct nf_conn *ct;
int err = -ENOMEM;
ct = nf_ct_tmpl_alloc(net, &nf_ct_zone_dflt, GFP_KERNEL);
if (!ct)
goto err1;
if (!nfct_seqadj_ext_add(ct))
goto err2;
if (!nfct_synproxy_ext_add(ct))
goto err2;
__set_bit(IPS_CONFIRMED_BIT, &ct->status);
nf_conntrack_get(&ct->ct_general);
snet->tmpl = ct;
snet->stats = alloc_percpu(struct synproxy_stats);
if (snet->stats == NULL)
goto err2;
err = synproxy_proc_init(net);
if (err < 0)
goto err3;
return 0;
err3:
free_percpu(snet->stats);
err2:
nf_ct_tmpl_free(ct);
err1:
return err;
}
static void __net_exit synproxy_net_exit(struct net *net)
{
struct synproxy_net *snet = synproxy_pernet(net);
nf_ct_put(snet->tmpl);
synproxy_proc_exit(net);
free_percpu(snet->stats);
}
static struct pernet_operations synproxy_net_ops = {
.init = synproxy_net_init,
.exit = synproxy_net_exit,
.id = &synproxy_net_id,
.size = sizeof(struct synproxy_net),
};
static int __init synproxy_core_init(void)
{
int err;
err = nf_ct_extend_register(&nf_ct_synproxy_extend);
if (err < 0)
goto err1;
err = register_pernet_subsys(&synproxy_net_ops);
if (err < 0)
goto err2;
return 0;
err2:
nf_ct_extend_unregister(&nf_ct_synproxy_extend);
err1:
return err;
}
static void __exit synproxy_core_exit(void)
{
unregister_pernet_subsys(&synproxy_net_ops);
nf_ct_extend_unregister(&nf_ct_synproxy_extend);
}
module_init(synproxy_core_init);
module_exit(synproxy_core_exit);
static struct iphdr *
synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
__be32 daddr)
{
struct iphdr *iph;
skb_reset_network_header(skb);
iph = skb_put(skb, sizeof(*iph));
iph->version = 4;
iph->ihl = sizeof(*iph) / 4;
iph->tos = 0;
iph->id = 0;
iph->frag_off = htons(IP_DF);
iph->ttl = net->ipv4.sysctl_ip_default_ttl;
iph->protocol = IPPROTO_TCP;
iph->check = 0;
iph->saddr = saddr;
iph->daddr = daddr;
return iph;
}
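/* Fill in the TCP checksum, borrow the original packet's route and
 * transmit @nskb locally, attaching @nfct (if given) so replies are
 * matched to the right flow. The skb is freed on routing failure.
 */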
static void
synproxy_send_tcp(struct net *net,
const struct sk_buff *skb, struct sk_buff *nskb,
struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
struct iphdr *niph, struct tcphdr *nth,
unsigned int tcp_hdr_size)
{
nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
nskb->ip_summed = CHECKSUM_PARTIAL;
nskb->csum_start = (unsigned char *)nth - nskb->head;
nskb->csum_offset = offsetof(struct tcphdr, check);
skb_dst_set_noref(nskb, skb_dst(skb));
nskb->protocol = htons(ETH_P_IP);
if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC))
goto free_nskb;
if (nfct) {
nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
nf_conntrack_get(nfct);
}
ip_local_out(net, nskb->sk, nskb);
return;
free_nskb:
kfree_skb(nskb);
}
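/* Answer the client's SYN with a syncookie SYN/ACK; no state is kept.
 * The advertised window is zero so the client holds back data until
 * the handshake with the server has completed.
 */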
void
synproxy_send_client_synack(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
struct sk_buff *nskb;
struct iphdr *iph, *niph;
struct tcphdr *nth;
unsigned int tcp_hdr_size;
u16 mss = opts->mss_encode;
iph = ip_hdr(skb);
tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
GFP_ATOMIC);
if (!nskb)
return;
skb_reserve(nskb, MAX_TCP_HEADER);
niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
skb_reset_transport_header(nskb);
nth = skb_put(nskb, tcp_hdr_size);
nth->source = th->dest;
nth->dest = th->source;
nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
nth->ack_seq = htonl(ntohl(th->seq) + 1);
tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
if (opts->options & NF_SYNPROXY_OPT_ECN)
tcp_flag_word(nth) |= TCP_FLAG_ECE;
nth->doff = tcp_hdr_size / 4;
nth->window = 0;
nth->check = 0;
nth->urg_ptr = 0;
synproxy_build_options(nth, opts);
synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
}
EXPORT_SYMBOL_GPL(synproxy_send_client_synack);
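/* Cookie validated: replay the client's SYN towards the server using
 * the client's original sequence numbering. Our ISN travels in ack_seq
 * (see the comment below) and the template conntrack is attached so
 * the new flow gets the synproxy extension.
 */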
static void
synproxy_send_server_syn(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts, u32 recv_seq)
{
struct synproxy_net *snet = synproxy_pernet(net);
struct sk_buff *nskb;
struct iphdr *iph, *niph;
struct tcphdr *nth;
unsigned int tcp_hdr_size;
iph = ip_hdr(skb);
tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
GFP_ATOMIC);
if (!nskb)
return;
skb_reserve(nskb, MAX_TCP_HEADER);
niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
skb_reset_transport_header(nskb);
nth = skb_put(nskb, tcp_hdr_size);
nth->source = th->source;
nth->dest = th->dest;
nth->seq = htonl(recv_seq - 1);
/* ack_seq is used to relay our ISN to the synproxy hook to initialize
* sequence number translation once a connection tracking entry exists.
*/
nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
tcp_flag_word(nth) = TCP_FLAG_SYN;
if (opts->options & NF_SYNPROXY_OPT_ECN)
tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
nth->doff = tcp_hdr_size / 4;
nth->window = th->window;
nth->check = 0;
nth->urg_ptr = 0;
synproxy_build_options(nth, opts);
synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
niph, nth, tcp_hdr_size);
}
static void
synproxy_send_server_ack(struct net *net,
const struct ip_ct_tcp *state,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
struct sk_buff *nskb;
struct iphdr *iph, *niph;
struct tcphdr *nth;
unsigned int tcp_hdr_size;
iph = ip_hdr(skb);
tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
GFP_ATOMIC);
if (!nskb)
return;
skb_reserve(nskb, MAX_TCP_HEADER);
niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
skb_reset_transport_header(nskb);
nth = skb_put(nskb, tcp_hdr_size);
nth->source = th->dest;
nth->dest = th->source;
nth->seq = htonl(ntohl(th->ack_seq));
nth->ack_seq = htonl(ntohl(th->seq) + 1);
tcp_flag_word(nth) = TCP_FLAG_ACK;
nth->doff = tcp_hdr_size / 4;
nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
nth->check = 0;
nth->urg_ptr = 0;
synproxy_build_options(nth, opts);
synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
}
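/* ACK towards the client that reopens the window our zero-window
 * SYN/ACK closed; sequence numbers continue the client's handshake and
 * the window is taken from the server's SYN/ACK, shifted by the window
 * scale in @opts.
 */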
static void
synproxy_send_client_ack(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
struct sk_buff *nskb;
struct iphdr *iph, *niph;
struct tcphdr *nth;
unsigned int tcp_hdr_size;
iph = ip_hdr(skb);
tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
GFP_ATOMIC);
if (!nskb)
return;
skb_reserve(nskb, MAX_TCP_HEADER);
niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
skb_reset_transport_header(nskb);
nth = skb_put(nskb, tcp_hdr_size);
nth->source = th->source;
nth->dest = th->dest;
nth->seq = htonl(ntohl(th->seq) + 1);
nth->ack_seq = th->ack_seq;
tcp_flag_word(nth) = TCP_FLAG_ACK;
nth->doff = tcp_hdr_size / 4;
nth->window = htons(ntohs(th->window) >> opts->wscale);
nth->check = 0;
nth->urg_ptr = 0;
synproxy_build_options(nth, opts);
synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
}
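/* Validate the syncookie echoed in the client's ACK. On success the
 * decoded MSS (and, with timestamps, the other encoded options) are
 * placed in @opts and the handshake towards the server is started; on
 * failure the caller is expected to drop the packet.
 */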
bool
synproxy_recv_client_ack(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
struct synproxy_options *opts, u32 recv_seq)
{
struct synproxy_net *snet = synproxy_pernet(net);
int mss;
mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
if (mss == 0) {
this_cpu_inc(snet->stats->cookie_invalid);
return false;
}
this_cpu_inc(snet->stats->cookie_valid);
opts->mss_option = mss;
opts->options |= NF_SYNPROXY_OPT_MSS;
if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
synproxy_check_timestamp_cookie(opts);
synproxy_send_server_syn(net, skb, th, opts, recv_seq);
return true;
}
EXPORT_SYMBOL_GPL(synproxy_recv_client_ack);
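/* Runs at LOCAL_IN and POSTROUTING, just before the conntrack confirm
 * hook. Keyed on the conntrack TCP state:
 *
 *   SYN_SENT:	a retransmitted handshake ACK from the client triggers a
 *		fresh server SYN; our own SYN towards the server records
 *		the ISN and timestamp cookie as it passes.
 *   SYN_RECV:	the server's SYN/ACK completes the proxied handshake:
 *		ACK the server, send the window update to the client and
 *		initialize sequence number translation.
 *   CLOSE:	handle RSTs and reopened connections.
 *
 * Everything that is accepted also passes through
 * synproxy_tstamp_adjust().
 */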
unsigned int
ipv4_synproxy_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *nhs)
{
struct net *net = nhs->net;
struct synproxy_net *snet = synproxy_pernet(net);
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
struct nf_conn_synproxy *synproxy;
struct synproxy_options opts = {};
const struct ip_ct_tcp *state;
struct tcphdr *th, _th;
unsigned int thoff;
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
return NF_ACCEPT;
synproxy = nfct_synproxy(ct);
if (!synproxy)
return NF_ACCEPT;
if (nf_is_loopback_packet(skb) ||
ip_hdr(skb)->protocol != IPPROTO_TCP)
return NF_ACCEPT;
thoff = ip_hdrlen(skb);
th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
if (!th)
return NF_DROP;
state = &ct->proto.tcp;
switch (state->state) {
case TCP_CONNTRACK_CLOSE:
if (th->rst && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
ntohl(th->seq) + 1);
break;
}
if (!th->syn || th->ack ||
CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
break;
/* Reopened connection - reset the sequence number and timestamp
* adjustments, they will get initialized once the connection is
* reestablished.
*/
nf_ct_seqadj_init(ct, ctinfo, 0);
synproxy->tsoff = 0;
this_cpu_inc(snet->stats->conn_reopened);
/* fall through */
case TCP_CONNTRACK_SYN_SENT:
if (!synproxy_parse_options(skb, thoff, th, &opts))
return NF_DROP;
if (!th->syn && th->ack &&
CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
/* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
* therefore we need to add 1 to make the SYN sequence
* number match the one of first SYN.
*/
if (synproxy_recv_client_ack(net, skb, th, &opts,
ntohl(th->seq) + 1)) {
this_cpu_inc(snet->stats->cookie_retrans);
consume_skb(skb);
return NF_STOLEN;
} else {
return NF_DROP;
}
}
synproxy->isn = ntohl(th->ack_seq);
if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP)
synproxy->its = opts.tsecr;
nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
break;
case TCP_CONNTRACK_SYN_RECV:
if (!th->syn || !th->ack)
break;
if (!synproxy_parse_options(skb, thoff, th, &opts))
return NF_DROP;
if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) {
synproxy->tsoff = opts.tsval - synproxy->its;
nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
}
opts.options &= ~(NF_SYNPROXY_OPT_MSS |
NF_SYNPROXY_OPT_WSCALE |
NF_SYNPROXY_OPT_SACK_PERM);
swap(opts.tsval, opts.tsecr);
synproxy_send_server_ack(net, state, skb, th, &opts);
nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
nf_conntrack_event_cache(IPCT_SEQADJ, ct);
swap(opts.tsval, opts.tsecr);
synproxy_send_client_ack(net, skb, th, &opts);
consume_skb(skb);
return NF_STOLEN;
default:
break;
}
synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
return NF_ACCEPT;
}
EXPORT_SYMBOL_GPL(ipv4_synproxy_hook);
static const struct nf_hook_ops ipv4_synproxy_ops[] = {
{
.hook = ipv4_synproxy_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
},
{
.hook = ipv4_synproxy_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
},
};
int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net)
{
int err;
if (snet->hook_ref4 == 0) {
err = nf_register_net_hooks(net, ipv4_synproxy_ops,
ARRAY_SIZE(ipv4_synproxy_ops));
if (err)
return err;
}
snet->hook_ref4++;
return 0;
}
EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init);
void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net)
{
snet->hook_ref4--;
if (snet->hook_ref4 == 0)
nf_unregister_net_hooks(net, ipv4_synproxy_ops,
ARRAY_SIZE(ipv4_synproxy_ops));
}
EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_fini);
#if IS_ENABLED(CONFIG_IPV6)
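/* The IPv6 path below mirrors the IPv4 helpers above; only header
 * construction, routing (nf_ip6_route() plus an xfrm lookup) and the
 * syncookie primitives differ.
 */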
static struct ipv6hdr *
synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb,
const struct in6_addr *saddr,
const struct in6_addr *daddr)
{
struct ipv6hdr *iph;
skb_reset_network_header(skb);
iph = skb_put(skb, sizeof(*iph));
ip6_flow_hdr(iph, 0, 0);
iph->hop_limit = net->ipv6.devconf_all->hop_limit;
iph->nexthdr = IPPROTO_TCP;
iph->saddr = *saddr;
iph->daddr = *daddr;
return iph;
}
static void
synproxy_send_tcp_ipv6(struct net *net,
const struct sk_buff *skb, struct sk_buff *nskb,
struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
struct ipv6hdr *niph, struct tcphdr *nth,
unsigned int tcp_hdr_size)
{
struct dst_entry *dst;
struct flowi6 fl6;
int err;
nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0);
nskb->ip_summed = CHECKSUM_PARTIAL;
nskb->csum_start = (unsigned char *)nth - nskb->head;
nskb->csum_offset = offsetof(struct tcphdr, check);
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_TCP;
fl6.saddr = niph->saddr;
fl6.daddr = niph->daddr;
fl6.fl6_sport = nth->source;
fl6.fl6_dport = nth->dest;
security_skb_classify_flow((struct sk_buff *)skb,
flowi6_to_flowi(&fl6));
err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
if (err) {
goto free_nskb;
}
dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
if (IS_ERR(dst))
goto free_nskb;
skb_dst_set(nskb, dst);
if (nfct) {
nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
nf_conntrack_get(nfct);
}
ip6_local_out(net, nskb->sk, nskb);
return;
free_nskb:
kfree_skb(nskb);
}
void
synproxy_send_client_synack_ipv6(struct net *net,
const struct sk_buff *skb,
const struct tcphdr *th,
const struct synproxy_options *opts)
{
struct sk_buff *nskb;
struct ipv6hdr *iph, *niph;
struct tcphdr *nth;
unsigned int tcp_hdr_size;
u16 mss = opts->mss_encode;
iph = ipv6_hdr(skb);
tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
GFP_ATOMIC);
if (!nskb)
return;
skb_reserve(nskb, MAX_TCP_HEADER);
niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr);
skb_reset_transport_header(nskb);
nth = skb_put(nskb, tcp_hdr_size);
nth->source = th->dest;
nth->dest = th->source;
nth->seq = htonl(nf_ipv6_cookie_init_sequence(iph, th, &mss));
nth->ack_seq = htonl(ntohl(th->seq) + 1);
tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
if (opts->options & NF_SYNPROXY_OPT_ECN)
tcp_flag_word(nth) |= TCP_FLAG_ECE;
nth->doff = tcp_hdr_size / 4;
nth->window = 0;
nth->check = 0;
nth->urg_ptr = 0;
synproxy_build_options(nth, opts);
synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb),
IP_CT_ESTABLISHED_REPLY, niph, nth,
tcp_hdr_size);
}
EXPORT_SYMBOL_GPL(synproxy_send_client_synack_ipv6);
static void
synproxy_send_server_syn_ipv6(struct net *net, const struct sk_buff *skb,
const struct tcphdr *th,
const struct synproxy_options *opts, u32 recv_seq)
{
struct synproxy_net *snet = synproxy_pernet(net);
struct sk_buff *nskb;
struct ipv6hdr *iph, *niph;
struct tcphdr *nth;
unsigned int tcp_hdr_size;
iph = ipv6_hdr(skb);
tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
GFP_ATOMIC);
if (!nskb)
return;
skb_reserve(nskb, MAX_TCP_HEADER);
niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr);
skb_reset_transport_header(nskb);
nth = skb_put(nskb, tcp_hdr_size);
nth->source = th->source;
nth->dest = th->dest;
nth->seq = htonl(recv_seq - 1);
/* ack_seq is used to relay our ISN to the synproxy hook to initialize
* sequence number translation once a connection tracking entry exists.
*/
nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
tcp_flag_word(nth) = TCP_FLAG_SYN;
if (opts->options & NF_SYNPROXY_OPT_ECN)
tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
nth->doff = tcp_hdr_size / 4;
nth->window = th->window;
nth->check = 0;
nth->urg_ptr = 0;
synproxy_build_options(nth, opts);
synproxy_send_tcp_ipv6(net, skb, nskb, &snet->tmpl->ct_general,
IP_CT_NEW, niph, nth, tcp_hdr_size);
}
static void
synproxy_send_server_ack_ipv6(struct net *net, const struct ip_ct_tcp *state,
const struct sk_buff *skb,
const struct tcphdr *th,
const struct synproxy_options *opts)
{
struct sk_buff *nskb;
struct ipv6hdr *iph, *niph;
struct tcphdr *nth;
unsigned int tcp_hdr_size;
iph = ipv6_hdr(skb);
tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
GFP_ATOMIC);
if (!nskb)
return;
skb_reserve(nskb, MAX_TCP_HEADER);
niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr);
skb_reset_transport_header(nskb);
nth = skb_put(nskb, tcp_hdr_size);
nth->source = th->dest;
nth->dest = th->source;
nth->seq = htonl(ntohl(th->ack_seq));
nth->ack_seq = htonl(ntohl(th->seq) + 1);
tcp_flag_word(nth) = TCP_FLAG_ACK;
nth->doff = tcp_hdr_size / 4;
nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
nth->check = 0;
nth->urg_ptr = 0;
synproxy_build_options(nth, opts);
synproxy_send_tcp_ipv6(net, skb, nskb, NULL, 0, niph, nth,
tcp_hdr_size);
}
static void
synproxy_send_client_ack_ipv6(struct net *net, const struct sk_buff *skb,
const struct tcphdr *th,
const struct synproxy_options *opts)
{
struct sk_buff *nskb;
struct ipv6hdr *iph, *niph;
struct tcphdr *nth;
unsigned int tcp_hdr_size;
iph = ipv6_hdr(skb);
tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
GFP_ATOMIC);
if (!nskb)
return;
skb_reserve(nskb, MAX_TCP_HEADER);
niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr);
skb_reset_transport_header(nskb);
nth = skb_put(nskb, tcp_hdr_size);
nth->source = th->source;
nth->dest = th->dest;
nth->seq = htonl(ntohl(th->seq) + 1);
nth->ack_seq = th->ack_seq;
tcp_flag_word(nth) = TCP_FLAG_ACK;
nth->doff = tcp_hdr_size / 4;
nth->window = htons(ntohs(th->window) >> opts->wscale);
nth->check = 0;
nth->urg_ptr = 0;
synproxy_build_options(nth, opts);
synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb),
IP_CT_ESTABLISHED_REPLY, niph, nth,
tcp_hdr_size);
}
bool
synproxy_recv_client_ack_ipv6(struct net *net,
const struct sk_buff *skb,
const struct tcphdr *th,
struct synproxy_options *opts, u32 recv_seq)
{
struct synproxy_net *snet = synproxy_pernet(net);
int mss;
mss = nf_cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
if (mss == 0) {
this_cpu_inc(snet->stats->cookie_invalid);
return false;
}
this_cpu_inc(snet->stats->cookie_valid);
opts->mss_option = mss;
opts->options |= NF_SYNPROXY_OPT_MSS;
if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
synproxy_check_timestamp_cookie(opts);
synproxy_send_server_syn_ipv6(net, skb, th, opts, recv_seq);
return true;
}
EXPORT_SYMBOL_GPL(synproxy_recv_client_ack_ipv6);
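/* IPv6 counterpart of ipv4_synproxy_hook(): same state machine, with
 * the TCP header located via ipv6_skip_exthdr().
 */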
unsigned int
ipv6_synproxy_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *nhs)
{
struct net *net = nhs->net;
struct synproxy_net *snet = synproxy_pernet(net);
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
struct nf_conn_synproxy *synproxy;
struct synproxy_options opts = {};
const struct ip_ct_tcp *state;
struct tcphdr *th, _th;
__be16 frag_off;
u8 nexthdr;
int thoff;
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
return NF_ACCEPT;
synproxy = nfct_synproxy(ct);
if (!synproxy)
return NF_ACCEPT;
if (nf_is_loopback_packet(skb))
return NF_ACCEPT;
nexthdr = ipv6_hdr(skb)->nexthdr;
thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
&frag_off);
if (thoff < 0 || nexthdr != IPPROTO_TCP)
return NF_ACCEPT;
th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
if (!th)
return NF_DROP;
state = &ct->proto.tcp;
switch (state->state) {
case TCP_CONNTRACK_CLOSE:
if (th->rst && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
ntohl(th->seq) + 1);
break;
}
if (!th->syn || th->ack ||
CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
break;
/* Reopened connection - reset the sequence number and timestamp
* adjustments, they will get initialized once the connection is
* reestablished.
*/
nf_ct_seqadj_init(ct, ctinfo, 0);
synproxy->tsoff = 0;
this_cpu_inc(snet->stats->conn_reopened);
/* fall through */
case TCP_CONNTRACK_SYN_SENT:
if (!synproxy_parse_options(skb, thoff, th, &opts))
return NF_DROP;
if (!th->syn && th->ack &&
CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
/* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
* therefore we need to add 1 to make the SYN sequence
* number match the one of first SYN.
*/
if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts,
ntohl(th->seq) + 1)) {
this_cpu_inc(snet->stats->cookie_retrans);
consume_skb(skb);
return NF_STOLEN;
} else {
return NF_DROP;
}
}
synproxy->isn = ntohl(th->ack_seq);
if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP)
synproxy->its = opts.tsecr;
nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
break;
case TCP_CONNTRACK_SYN_RECV:
if (!th->syn || !th->ack)
break;
if (!synproxy_parse_options(skb, thoff, th, &opts))
return NF_DROP;
if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) {
synproxy->tsoff = opts.tsval - synproxy->its;
nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
}
opts.options &= ~(NF_SYNPROXY_OPT_MSS |
NF_SYNPROXY_OPT_WSCALE |
NF_SYNPROXY_OPT_SACK_PERM);
swap(opts.tsval, opts.tsecr);
synproxy_send_server_ack_ipv6(net, state, skb, th, &opts);
nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
nf_conntrack_event_cache(IPCT_SEQADJ, ct);
swap(opts.tsval, opts.tsecr);
synproxy_send_client_ack_ipv6(net, skb, th, &opts);
consume_skb(skb);
return NF_STOLEN;
default:
break;
}
synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
return NF_ACCEPT;
}
EXPORT_SYMBOL_GPL(ipv6_synproxy_hook);
static const struct nf_hook_ops ipv6_synproxy_ops[] = {
{
.hook = ipv6_synproxy_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
},
{
.hook = ipv6_synproxy_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
},
};
int
nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net)
{
int err;
if (snet->hook_ref6 == 0) {
err = nf_register_net_hooks(net, ipv6_synproxy_ops,
ARRAY_SIZE(ipv6_synproxy_ops));
if (err)
return err;
}
snet->hook_ref6++;
return 0;
}
EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init);
void
nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net)
{
snet->hook_ref6--;
if (snet->hook_ref6 == 0)
nf_unregister_net_hooks(net, ipv6_synproxy_ops,
ARRAY_SIZE(ipv6_synproxy_ops));
}
EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini);
#endif /* CONFIG_IPV6 */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");