linux-brain/net/ipv4/ip_tunnel.c

1273 lines
30 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2013 Nicira, Inc.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
return hash_32((__force u32)key ^ (__force u32)remote,
IP_TNL_HASH_BITS);
}
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
__be16 flags, __be32 key)
{
if (p->i_flags & TUNNEL_KEY) {
if (flags & TUNNEL_KEY)
return key == p->i_key;
else
/* key expected, none present */
return false;
} else
return !(flags & TUNNEL_KEY);
}
/* Fallback tunnel: no source, no destination, no key, no options
Tunnel hash table:
We require exact key match i.e. if a key is present in packet
it will match only tunnel with the same key; if it is not present,
it will match only keyless tunnel.
All keysless packets, if not matched configured keyless tunnels
will match fallback tunnel.
Given src, dst and key, find appropriate for input tunnel.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
int link, __be16 flags,
__be32 remote, __be32 local,
__be32 key)
{
struct ip_tunnel *t, *cand = NULL;
struct hlist_head *head;
ip_tunnel: fix use-after-free in ip_tunnel_lookup() [ Upstream commit ba61539c6ae57f4146284a5cb4f7b7ed8d42bf45 ] In the datapath, the ip_tunnel_lookup() is used and it internally uses fallback tunnel device pointer, which is fb_tunnel_dev. This pointer variable should be set to NULL when a fb interface is deleted. But there is no routine to set fb_tunnel_dev pointer to NULL. So, this pointer will be still used after interface is deleted and it eventually results in the use-after-free problem. Test commands: ip netns add A ip netns add B ip link add eth0 type veth peer name eth1 ip link set eth0 netns A ip link set eth1 netns B ip netns exec A ip link set lo up ip netns exec A ip link set eth0 up ip netns exec A ip link add gre1 type gre local 10.0.0.1 \ remote 10.0.0.2 ip netns exec A ip link set gre1 up ip netns exec A ip a a 10.0.100.1/24 dev gre1 ip netns exec A ip a a 10.0.0.1/24 dev eth0 ip netns exec B ip link set lo up ip netns exec B ip link set eth1 up ip netns exec B ip link add gre1 type gre local 10.0.0.2 \ remote 10.0.0.1 ip netns exec B ip link set gre1 up ip netns exec B ip a a 10.0.100.2/24 dev gre1 ip netns exec B ip a a 10.0.0.2/24 dev eth1 ip netns exec A hping3 10.0.100.2 -2 --flood -d 60000 & ip netns del B Splat looks like: [ 77.793450][ C3] ================================================================== [ 77.794702][ C3] BUG: KASAN: use-after-free in ip_tunnel_lookup+0xcc4/0xf30 [ 77.795573][ C3] Read of size 4 at addr ffff888060bd9c84 by task hping3/2905 [ 77.796398][ C3] [ 77.796664][ C3] CPU: 3 PID: 2905 Comm: hping3 Not tainted 5.8.0-rc1+ #616 [ 77.797474][ C3] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 77.798453][ C3] Call Trace: [ 77.798815][ C3] <IRQ> [ 77.799142][ C3] dump_stack+0x9d/0xdb [ 77.799605][ C3] print_address_description.constprop.7+0x2cc/0x450 [ 77.800365][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.800908][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.801517][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.802145][ C3] kasan_report+0x154/0x190 [ 77.802821][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.803503][ C3] ip_tunnel_lookup+0xcc4/0xf30 [ 77.804165][ C3] __ipgre_rcv+0x1ab/0xaa0 [ip_gre] [ 77.804862][ C3] ? rcu_read_lock_sched_held+0xc0/0xc0 [ 77.805621][ C3] gre_rcv+0x304/0x1910 [ip_gre] [ 77.806293][ C3] ? lock_acquire+0x1a9/0x870 [ 77.806925][ C3] ? gre_rcv+0xfe/0x354 [gre] [ 77.807559][ C3] ? erspan_xmit+0x2e60/0x2e60 [ip_gre] [ 77.808305][ C3] ? rcu_read_lock_sched_held+0xc0/0xc0 [ 77.809032][ C3] ? rcu_read_lock_held+0x90/0xa0 [ 77.809713][ C3] gre_rcv+0x1b8/0x354 [gre] [ ... ] Suggested-by: Eric Dumazet <eric.dumazet@gmail.com> Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-06-17 01:51:51 +09:00
struct net_device *ndev;
unsigned int hash;
hash = ip_tunnel_hash(key, remote);
head = &itn->tunnels[hash];
hlist_for_each_entry_rcu(t, head, hash_node) {
if (local != t->parms.iph.saddr ||
remote != t->parms.iph.daddr ||
!(t->dev->flags & IFF_UP))
continue;
if (!ip_tunnel_key_match(&t->parms, flags, key))
continue;
if (t->parms.link == link)
return t;
else
cand = t;
}
hlist_for_each_entry_rcu(t, head, hash_node) {
if (remote != t->parms.iph.daddr ||
ip_tunnel: fix ip_tunnel_lookup This patch fixes 3 similar bugs where incoming packets might be routed into wrong non-wildcard tunnels: 1) Consider the following setup: ip address add 1.1.1.1/24 dev eth0 ip address add 1.1.1.2/24 dev eth0 ip tunnel add ipip1 remote 2.2.2.2 local 1.1.1.1 mode ipip dev eth0 ip link set ipip1 up Incoming ipip packets from 2.2.2.2 were routed into ipip1 even if it has dst = 1.1.1.2. Moreover even if there was wildcard tunnel like ip tunnel add ipip0 remote 2.2.2.2 local any mode ipip dev eth0 but it was created before explicit one (with local 1.1.1.1), incoming ipip packets with src = 2.2.2.2 and dst = 1.1.1.2 were still routed into ipip1. Same issue existed with all tunnels that use ip_tunnel_lookup (gre, vti) 2) ip address add 1.1.1.1/24 dev eth0 ip tunnel add ipip1 remote 2.2.146.85 local 1.1.1.1 mode ipip dev eth0 ip link set ipip1 up Incoming ipip packets with dst = 1.1.1.1 were routed into ipip1, no matter what src address is. Any remote ip address which has ip_tunnel_hash = 0 raised this issue, 2.2.146.85 is just an example, there are more than 4 million of them. And again, wildcard tunnel like ip tunnel add ipip0 remote any local 1.1.1.1 mode ipip dev eth0 wouldn't be ever matched if it was created before explicit tunnel like above. Gre & vti tunnels had the same issue. 3) ip address add 1.1.1.1/24 dev eth0 ip tunnel add gre1 remote 2.2.146.84 local 1.1.1.1 key 1 mode gre dev eth0 ip link set gre1 up Any incoming gre packet with key = 1 were routed into gre1, no matter what src/dst addresses are. Any remote ip address which has ip_tunnel_hash = 0 raised the issue, 2.2.146.84 is just an example, there are more than 4 million of them. Wildcard tunnel like ip tunnel add gre2 remote any local any key 1 mode gre dev eth0 wouldn't be ever matched if it was created before explicit tunnel like above. All this stuff happened because while looking for a wildcard tunnel we didn't check that matched tunnel is a wildcard one. Fixed. Signed-off-by: Dmitry Popov <ixaphire@qrator.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-05 07:26:37 +09:00
t->parms.iph.saddr != 0 ||
!(t->dev->flags & IFF_UP))
continue;
if (!ip_tunnel_key_match(&t->parms, flags, key))
continue;
if (t->parms.link == link)
return t;
else if (!cand)
cand = t;
}
hash = ip_tunnel_hash(key, 0);
head = &itn->tunnels[hash];
hlist_for_each_entry_rcu(t, head, hash_node) {
ip_tunnel: fix ip_tunnel_lookup This patch fixes 3 similar bugs where incoming packets might be routed into wrong non-wildcard tunnels: 1) Consider the following setup: ip address add 1.1.1.1/24 dev eth0 ip address add 1.1.1.2/24 dev eth0 ip tunnel add ipip1 remote 2.2.2.2 local 1.1.1.1 mode ipip dev eth0 ip link set ipip1 up Incoming ipip packets from 2.2.2.2 were routed into ipip1 even if it has dst = 1.1.1.2. Moreover even if there was wildcard tunnel like ip tunnel add ipip0 remote 2.2.2.2 local any mode ipip dev eth0 but it was created before explicit one (with local 1.1.1.1), incoming ipip packets with src = 2.2.2.2 and dst = 1.1.1.2 were still routed into ipip1. Same issue existed with all tunnels that use ip_tunnel_lookup (gre, vti) 2) ip address add 1.1.1.1/24 dev eth0 ip tunnel add ipip1 remote 2.2.146.85 local 1.1.1.1 mode ipip dev eth0 ip link set ipip1 up Incoming ipip packets with dst = 1.1.1.1 were routed into ipip1, no matter what src address is. Any remote ip address which has ip_tunnel_hash = 0 raised this issue, 2.2.146.85 is just an example, there are more than 4 million of them. And again, wildcard tunnel like ip tunnel add ipip0 remote any local 1.1.1.1 mode ipip dev eth0 wouldn't be ever matched if it was created before explicit tunnel like above. Gre & vti tunnels had the same issue. 3) ip address add 1.1.1.1/24 dev eth0 ip tunnel add gre1 remote 2.2.146.84 local 1.1.1.1 key 1 mode gre dev eth0 ip link set gre1 up Any incoming gre packet with key = 1 were routed into gre1, no matter what src/dst addresses are. Any remote ip address which has ip_tunnel_hash = 0 raised the issue, 2.2.146.84 is just an example, there are more than 4 million of them. Wildcard tunnel like ip tunnel add gre2 remote any local any key 1 mode gre dev eth0 wouldn't be ever matched if it was created before explicit tunnel like above. All this stuff happened because while looking for a wildcard tunnel we didn't check that matched tunnel is a wildcard one. Fixed. Signed-off-by: Dmitry Popov <ixaphire@qrator.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-05 07:26:37 +09:00
if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
(local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
continue;
if (!(t->dev->flags & IFF_UP))
continue;
if (!ip_tunnel_key_match(&t->parms, flags, key))
continue;
if (t->parms.link == link)
return t;
else if (!cand)
cand = t;
}
hlist_for_each_entry_rcu(t, head, hash_node) {
if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
ip_tunnel: fix ip_tunnel_lookup This patch fixes 3 similar bugs where incoming packets might be routed into wrong non-wildcard tunnels: 1) Consider the following setup: ip address add 1.1.1.1/24 dev eth0 ip address add 1.1.1.2/24 dev eth0 ip tunnel add ipip1 remote 2.2.2.2 local 1.1.1.1 mode ipip dev eth0 ip link set ipip1 up Incoming ipip packets from 2.2.2.2 were routed into ipip1 even if it has dst = 1.1.1.2. Moreover even if there was wildcard tunnel like ip tunnel add ipip0 remote 2.2.2.2 local any mode ipip dev eth0 but it was created before explicit one (with local 1.1.1.1), incoming ipip packets with src = 2.2.2.2 and dst = 1.1.1.2 were still routed into ipip1. Same issue existed with all tunnels that use ip_tunnel_lookup (gre, vti) 2) ip address add 1.1.1.1/24 dev eth0 ip tunnel add ipip1 remote 2.2.146.85 local 1.1.1.1 mode ipip dev eth0 ip link set ipip1 up Incoming ipip packets with dst = 1.1.1.1 were routed into ipip1, no matter what src address is. Any remote ip address which has ip_tunnel_hash = 0 raised this issue, 2.2.146.85 is just an example, there are more than 4 million of them. And again, wildcard tunnel like ip tunnel add ipip0 remote any local 1.1.1.1 mode ipip dev eth0 wouldn't be ever matched if it was created before explicit tunnel like above. Gre & vti tunnels had the same issue. 3) ip address add 1.1.1.1/24 dev eth0 ip tunnel add gre1 remote 2.2.146.84 local 1.1.1.1 key 1 mode gre dev eth0 ip link set gre1 up Any incoming gre packet with key = 1 were routed into gre1, no matter what src/dst addresses are. Any remote ip address which has ip_tunnel_hash = 0 raised the issue, 2.2.146.84 is just an example, there are more than 4 million of them. Wildcard tunnel like ip tunnel add gre2 remote any local any key 1 mode gre dev eth0 wouldn't be ever matched if it was created before explicit tunnel like above. All this stuff happened because while looking for a wildcard tunnel we didn't check that matched tunnel is a wildcard one. Fixed. Signed-off-by: Dmitry Popov <ixaphire@qrator.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-05 07:26:37 +09:00
t->parms.iph.saddr != 0 ||
t->parms.iph.daddr != 0 ||
!(t->dev->flags & IFF_UP))
continue;
if (t->parms.link == link)
return t;
else if (!cand)
cand = t;
}
if (cand)
return cand;
t = rcu_dereference(itn->collect_md_tun);
if (t && t->dev->flags & IFF_UP)
return t;
ip_tunnel: fix use-after-free in ip_tunnel_lookup() [ Upstream commit ba61539c6ae57f4146284a5cb4f7b7ed8d42bf45 ] In the datapath, the ip_tunnel_lookup() is used and it internally uses fallback tunnel device pointer, which is fb_tunnel_dev. This pointer variable should be set to NULL when a fb interface is deleted. But there is no routine to set fb_tunnel_dev pointer to NULL. So, this pointer will be still used after interface is deleted and it eventually results in the use-after-free problem. Test commands: ip netns add A ip netns add B ip link add eth0 type veth peer name eth1 ip link set eth0 netns A ip link set eth1 netns B ip netns exec A ip link set lo up ip netns exec A ip link set eth0 up ip netns exec A ip link add gre1 type gre local 10.0.0.1 \ remote 10.0.0.2 ip netns exec A ip link set gre1 up ip netns exec A ip a a 10.0.100.1/24 dev gre1 ip netns exec A ip a a 10.0.0.1/24 dev eth0 ip netns exec B ip link set lo up ip netns exec B ip link set eth1 up ip netns exec B ip link add gre1 type gre local 10.0.0.2 \ remote 10.0.0.1 ip netns exec B ip link set gre1 up ip netns exec B ip a a 10.0.100.2/24 dev gre1 ip netns exec B ip a a 10.0.0.2/24 dev eth1 ip netns exec A hping3 10.0.100.2 -2 --flood -d 60000 & ip netns del B Splat looks like: [ 77.793450][ C3] ================================================================== [ 77.794702][ C3] BUG: KASAN: use-after-free in ip_tunnel_lookup+0xcc4/0xf30 [ 77.795573][ C3] Read of size 4 at addr ffff888060bd9c84 by task hping3/2905 [ 77.796398][ C3] [ 77.796664][ C3] CPU: 3 PID: 2905 Comm: hping3 Not tainted 5.8.0-rc1+ #616 [ 77.797474][ C3] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 77.798453][ C3] Call Trace: [ 77.798815][ C3] <IRQ> [ 77.799142][ C3] dump_stack+0x9d/0xdb [ 77.799605][ C3] print_address_description.constprop.7+0x2cc/0x450 [ 77.800365][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.800908][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.801517][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.802145][ C3] kasan_report+0x154/0x190 [ 77.802821][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.803503][ C3] ip_tunnel_lookup+0xcc4/0xf30 [ 77.804165][ C3] __ipgre_rcv+0x1ab/0xaa0 [ip_gre] [ 77.804862][ C3] ? rcu_read_lock_sched_held+0xc0/0xc0 [ 77.805621][ C3] gre_rcv+0x304/0x1910 [ip_gre] [ 77.806293][ C3] ? lock_acquire+0x1a9/0x870 [ 77.806925][ C3] ? gre_rcv+0xfe/0x354 [gre] [ 77.807559][ C3] ? erspan_xmit+0x2e60/0x2e60 [ip_gre] [ 77.808305][ C3] ? rcu_read_lock_sched_held+0xc0/0xc0 [ 77.809032][ C3] ? rcu_read_lock_held+0x90/0xa0 [ 77.809713][ C3] gre_rcv+0x1b8/0x354 [gre] [ ... ] Suggested-by: Eric Dumazet <eric.dumazet@gmail.com> Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-06-17 01:51:51 +09:00
ndev = READ_ONCE(itn->fb_tunnel_dev);
if (ndev && ndev->flags & IFF_UP)
return netdev_priv(ndev);
return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
struct ip_tunnel_parm *parms)
{
unsigned int h;
__be32 remote;
__be32 i_key = parms->i_key;
if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
remote = parms->iph.daddr;
else
remote = 0;
if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
i_key = 0;
h = ip_tunnel_hash(i_key, remote);
return &itn->tunnels[h];
}
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
struct hlist_head *head = ip_bucket(itn, &t->parms);
if (t->collect_md)
rcu_assign_pointer(itn->collect_md_tun, t);
hlist_add_head_rcu(&t->hash_node, head);
}
static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
if (t->collect_md)
rcu_assign_pointer(itn->collect_md_tun, NULL);
hlist_del_init_rcu(&t->hash_node);
}
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
struct ip_tunnel_parm *parms,
int type)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
__be32 key = parms->i_key;
__be16 flags = parms->i_flags;
int link = parms->link;
struct ip_tunnel *t = NULL;
struct hlist_head *head = ip_bucket(itn, parms);
hlist_for_each_entry_rcu(t, head, hash_node) {
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr &&
link == t->parms.link &&
type == t->dev->type &&
ip_tunnel_key_match(&t->parms, flags, key))
break;
}
return t;
}
static struct net_device *__ip_tunnel_create(struct net *net,
const struct rtnl_link_ops *ops,
struct ip_tunnel_parm *parms)
{
int err;
struct ip_tunnel *tunnel;
struct net_device *dev;
char name[IFNAMSIZ];
ip_tunnel: better validate user provided tunnel names Use dev_valid_name() to make sure user does not provide illegal device name. syzbot caught the following bug : BUG: KASAN: stack-out-of-bounds in strlcpy include/linux/string.h:300 [inline] BUG: KASAN: stack-out-of-bounds in __ip_tunnel_create+0xca/0x6b0 net/ipv4/ip_tunnel.c:257 Write of size 20 at addr ffff8801ac79f810 by task syzkaller268107/4482 CPU: 0 PID: 4482 Comm: syzkaller268107 Not tainted 4.16.0+ #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x1b9/0x29f lib/dump_stack.c:53 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0xac/0x2f5 mm/kasan/report.c:412 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 memcpy+0x37/0x50 mm/kasan/kasan.c:303 strlcpy include/linux/string.h:300 [inline] __ip_tunnel_create+0xca/0x6b0 net/ipv4/ip_tunnel.c:257 ip_tunnel_create net/ipv4/ip_tunnel.c:352 [inline] ip_tunnel_ioctl+0x818/0xd40 net/ipv4/ip_tunnel.c:861 ipip_tunnel_ioctl+0x1c5/0x420 net/ipv4/ipip.c:350 dev_ifsioc+0x43e/0xb90 net/core/dev_ioctl.c:334 dev_ioctl+0x69a/0xcc0 net/core/dev_ioctl.c:525 sock_ioctl+0x47e/0x680 net/socket.c:1015 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:500 [inline] do_vfs_ioctl+0x1cf/0x1650 fs/ioctl.c:684 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701 SYSC_ioctl fs/ioctl.c:708 [inline] SyS_ioctl+0x24/0x30 fs/ioctl.c:706 do_syscall_64+0x29e/0x9d0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-05 22:39:27 +09:00
err = -E2BIG;
if (parms->name[0]) {
if (!dev_valid_name(parms->name))
goto failed;
strlcpy(name, parms->name, IFNAMSIZ);
ip_tunnel: better validate user provided tunnel names Use dev_valid_name() to make sure user does not provide illegal device name. syzbot caught the following bug : BUG: KASAN: stack-out-of-bounds in strlcpy include/linux/string.h:300 [inline] BUG: KASAN: stack-out-of-bounds in __ip_tunnel_create+0xca/0x6b0 net/ipv4/ip_tunnel.c:257 Write of size 20 at addr ffff8801ac79f810 by task syzkaller268107/4482 CPU: 0 PID: 4482 Comm: syzkaller268107 Not tainted 4.16.0+ #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x1b9/0x29f lib/dump_stack.c:53 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0xac/0x2f5 mm/kasan/report.c:412 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 memcpy+0x37/0x50 mm/kasan/kasan.c:303 strlcpy include/linux/string.h:300 [inline] __ip_tunnel_create+0xca/0x6b0 net/ipv4/ip_tunnel.c:257 ip_tunnel_create net/ipv4/ip_tunnel.c:352 [inline] ip_tunnel_ioctl+0x818/0xd40 net/ipv4/ip_tunnel.c:861 ipip_tunnel_ioctl+0x1c5/0x420 net/ipv4/ipip.c:350 dev_ifsioc+0x43e/0xb90 net/core/dev_ioctl.c:334 dev_ioctl+0x69a/0xcc0 net/core/dev_ioctl.c:525 sock_ioctl+0x47e/0x680 net/socket.c:1015 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:500 [inline] do_vfs_ioctl+0x1cf/0x1650 fs/ioctl.c:684 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701 SYSC_ioctl fs/ioctl.c:708 [inline] SyS_ioctl+0x24/0x30 fs/ioctl.c:706 do_syscall_64+0x29e/0x9d0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-05 22:39:27 +09:00
} else {
if (strlen(ops->kind) > (IFNAMSIZ - 3))
goto failed;
strcpy(name, ops->kind);
strcat(name, "%d");
}
ASSERT_RTNL();
dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
if (!dev) {
err = -ENOMEM;
goto failed;
}
dev_net_set(dev, net);
dev->rtnl_link_ops = ops;
tunnel = netdev_priv(dev);
tunnel->parms = *parms;
tunnel->net = net;
err = register_netdevice(dev);
if (err)
goto failed_free;
return dev;
failed_free:
free_netdev(dev);
failed:
return ERR_PTR(err);
}
static int ip_tunnel_bind_dev(struct net_device *dev)
{
struct net_device *tdev = NULL;
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *iph;
int hlen = LL_MAX_HEADER;
int mtu = ETH_DATA_LEN;
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
iph = &tunnel->parms.iph;
/* Guess output device to choose reasonable mtu and needed_headroom */
if (iph->daddr) {
struct flowi4 fl4;
struct rtable *rt;
ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
iph->saddr, tunnel->parms.o_key,
RT_TOS(iph->tos), tunnel->parms.link,
tunnel->fwmark, 0);
rt = ip_route_output_key(tunnel->net, &fl4);
if (!IS_ERR(rt)) {
tdev = rt->dst.dev;
ip_rt_put(rt);
}
if (dev->type != ARPHRD_ETHER)
dev->flags |= IFF_POINTOPOINT;
dst_cache_reset(&tunnel->dst_cache);
}
if (!tdev && tunnel->parms.link)
tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
if (tdev) {
hlen = tdev->hard_header_len + tdev->needed_headroom;
mtu = min(tdev->mtu, IP_MAX_MTU);
}
dev->needed_headroom = t_hlen + hlen;
mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
if (mtu < IPV4_MIN_MTU)
mtu = IPV4_MIN_MTU;
return mtu;
}
static struct ip_tunnel *ip_tunnel_create(struct net *net,
struct ip_tunnel_net *itn,
struct ip_tunnel_parm *parms)
{
struct ip_tunnel *nt;
struct net_device *dev;
int t_hlen;
int mtu;
int err;
dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
if (IS_ERR(dev))
return ERR_CAST(dev);
mtu = ip_tunnel_bind_dev(dev);
err = dev_set_mtu(dev, mtu);
if (err)
goto err_dev_set_mtu;
nt = netdev_priv(dev);
t_hlen = nt->hlen + sizeof(struct iphdr);
dev->min_mtu = ETH_MIN_MTU;
dev->max_mtu = IP_MAX_MTU - t_hlen;
if (dev->type == ARPHRD_ETHER)
dev->max_mtu -= dev->hard_header_len;
ip_tunnel_add(itn, nt);
return nt;
err_dev_set_mtu:
unregister_netdevice(dev);
return ERR_PTR(err);
}
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
bool log_ecn_error)
{
struct pcpu_sw_netstats *tstats;
const struct iphdr *iph = ip_hdr(skb);
int err;
#ifdef CONFIG_NET_IPGRE_BROADCAST
if (ipv4_is_multicast(iph->daddr)) {
tunnel->dev->stats.multicast++;
skb->pkt_type = PACKET_BROADCAST;
}
#endif
if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
tunnel->dev->stats.rx_crc_errors++;
tunnel->dev->stats.rx_errors++;
goto drop;
}
if (tunnel->parms.i_flags&TUNNEL_SEQ) {
if (!(tpi->flags&TUNNEL_SEQ) ||
(tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
tunnel->dev->stats.rx_fifo_errors++;
tunnel->dev->stats.rx_errors++;
goto drop;
}
tunnel->i_seqno = ntohl(tpi->seq) + 1;
}
skb_reset_network_header(skb);
err = IP_ECN_decapsulate(iph, skb);
if (unlikely(err)) {
if (log_ecn_error)
net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
&iph->saddr, iph->tos);
if (err > 1) {
++tunnel->dev->stats.rx_frame_errors;
++tunnel->dev->stats.rx_errors;
goto drop;
}
}
tstats = this_cpu_ptr(tunnel->dev->tstats);
u64_stats_update_begin(&tstats->syncp);
tstats->rx_packets++;
tstats->rx_bytes += skb->len;
u64_stats_update_end(&tstats->syncp);
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
if (tunnel->dev->type == ARPHRD_ETHER) {
skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
} else {
skb->dev = tunnel->dev;
}
if (tun_dst)
skb_dst_set(skb, (struct dst_entry *)tun_dst);
gro_cells_receive(&tunnel->gro_cells, skb);
return 0;
drop:
if (tun_dst)
dst_release((struct dst_entry *)tun_dst);
kfree_skb(skb);
return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
unsigned int num)
{
if (num >= MAX_IPTUN_ENCAP_OPS)
return -ERANGE;
return !cmpxchg((const struct ip_tunnel_encap_ops **)
&iptun_encaps[num],
NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
unsigned int num)
{
int ret;
if (num >= MAX_IPTUN_ENCAP_OPS)
return -ERANGE;
ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
&iptun_encaps[num],
ops, NULL) == ops) ? 0 : -1;
synchronize_net();
return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
int ip_tunnel_encap_setup(struct ip_tunnel *t,
struct ip_tunnel_encap *ipencap)
{
int hlen;
memset(&t->encap, 0, sizeof(t->encap));
hlen = ip_encap_hlen(ipencap);
if (hlen < 0)
return hlen;
t->encap.type = ipencap->type;
t->encap.sport = ipencap->sport;
t->encap.dport = ipencap->dport;
t->encap.flags = ipencap->flags;
t->encap_hlen = hlen;
t->hlen = t->encap_hlen + t->tun_hlen;
return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
struct rtable *rt, __be16 df,
const struct iphdr *inner_iph,
int tunnel_hlen, __be32 dst, bool md)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
int pkt_size;
int mtu;
tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
pkt_size = skb->len - tunnel_hlen;
pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
if (df) {
mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
} else {
iptunnel: NULL pointer deref for ip_md_tunnel_xmit Naresh Kamboju noted the following oops during execution of selftest tools/testing/selftests/bpf/test_tunnel.sh on x86_64: [ 274.120445] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [ 274.128285] #PF error: [INSTR] [ 274.131351] PGD 8000000414a0e067 P4D 8000000414a0e067 PUD 3b6334067 PMD 0 [ 274.138241] Oops: 0010 [#1] SMP PTI [ 274.141734] CPU: 1 PID: 11464 Comm: ping Not tainted 5.0.0-rc4-next-20190129 #1 [ 274.149046] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 07/27/2017 [ 274.156526] RIP: 0010: (null) [ 274.160280] Code: Bad RIP value. [ 274.163509] RSP: 0018:ffffbc9681f83540 EFLAGS: 00010286 [ 274.168726] RAX: 0000000000000000 RBX: ffffdc967fa80a18 RCX: 0000000000000000 [ 274.175851] RDX: ffff9db2ee08b540 RSI: 000000000000000e RDI: ffffdc967fa809a0 [ 274.182974] RBP: ffffbc9681f83580 R08: ffff9db2c4d62690 R09: 000000000000000c [ 274.190098] R10: 0000000000000000 R11: ffff9db2ee08b540 R12: ffff9db31ce7c000 [ 274.197222] R13: 0000000000000001 R14: 000000000000000c R15: ffff9db3179cf400 [ 274.204346] FS: 00007ff4ae7c5740(0000) GS:ffff9db31fa80000(0000) knlGS:0000000000000000 [ 274.212424] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 274.218162] CR2: ffffffffffffffd6 CR3: 00000004574da004 CR4: 00000000003606e0 [ 274.225292] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 274.232416] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 274.239541] Call Trace: [ 274.241988] ? tnl_update_pmtu+0x296/0x3b0 [ 274.246085] ip_md_tunnel_xmit+0x1bc/0x520 [ 274.250176] gre_fb_xmit+0x330/0x390 [ 274.253754] gre_tap_xmit+0x128/0x180 [ 274.257414] dev_hard_start_xmit+0xb7/0x300 [ 274.261598] sch_direct_xmit+0xf6/0x290 [ 274.265430] __qdisc_run+0x15d/0x5e0 [ 274.269007] __dev_queue_xmit+0x2c5/0xc00 [ 274.273011] ? dev_queue_xmit+0x10/0x20 [ 274.276842] ? eth_header+0x2b/0xc0 [ 274.280326] dev_queue_xmit+0x10/0x20 [ 274.283984] ? dev_queue_xmit+0x10/0x20 [ 274.287813] arp_xmit+0x1a/0xf0 [ 274.290952] arp_send_dst.part.19+0x46/0x60 [ 274.295138] arp_solicit+0x177/0x6b0 [ 274.298708] ? mod_timer+0x18e/0x440 [ 274.302281] neigh_probe+0x57/0x70 [ 274.305684] __neigh_event_send+0x197/0x2d0 [ 274.309862] neigh_resolve_output+0x18c/0x210 [ 274.314212] ip_finish_output2+0x257/0x690 [ 274.318304] ip_finish_output+0x219/0x340 [ 274.322314] ? ip_finish_output+0x219/0x340 [ 274.326493] ip_output+0x76/0x240 [ 274.329805] ? ip_fragment.constprop.53+0x80/0x80 [ 274.334510] ip_local_out+0x3f/0x70 [ 274.337992] ip_send_skb+0x19/0x40 [ 274.341391] ip_push_pending_frames+0x33/0x40 [ 274.345740] raw_sendmsg+0xc15/0x11d0 [ 274.349403] ? __might_fault+0x85/0x90 [ 274.353151] ? _copy_from_user+0x6b/0xa0 [ 274.357070] ? rw_copy_check_uvector+0x54/0x130 [ 274.361604] inet_sendmsg+0x42/0x1c0 [ 274.365179] ? inet_sendmsg+0x42/0x1c0 [ 274.368937] sock_sendmsg+0x3e/0x50 [ 274.372460] ___sys_sendmsg+0x26f/0x2d0 [ 274.376293] ? lock_acquire+0x95/0x190 [ 274.380043] ? __handle_mm_fault+0x7ce/0xb70 [ 274.384307] ? lock_acquire+0x95/0x190 [ 274.388053] ? __audit_syscall_entry+0xdd/0x130 [ 274.392586] ? ktime_get_coarse_real_ts64+0x64/0xc0 [ 274.397461] ? __audit_syscall_entry+0xdd/0x130 [ 274.401989] ? trace_hardirqs_on+0x4c/0x100 [ 274.406173] __sys_sendmsg+0x63/0xa0 [ 274.409744] ? __sys_sendmsg+0x63/0xa0 [ 274.413488] __x64_sys_sendmsg+0x1f/0x30 [ 274.417405] do_syscall_64+0x55/0x190 [ 274.421064] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 274.426113] RIP: 0033:0x7ff4ae0e6e87 [ 274.429686] Code: 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 80 00 00 00 00 8b 05 ca d9 2b 00 48 63 d2 48 63 ff 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 53 48 89 f3 48 83 ec 10 48 89 7c 24 08 [ 274.448422] RSP: 002b:00007ffcd9b76db8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 274.455978] RAX: ffffffffffffffda RBX: 0000000000000040 RCX: 00007ff4ae0e6e87 [ 274.463104] RDX: 0000000000000000 RSI: 00000000006092e0 RDI: 0000000000000003 [ 274.470228] RBP: 0000000000000000 R08: 00007ffcd9bc40a0 R09: 00007ffcd9bc4080 [ 274.477349] R10: 000000000000060a R11: 0000000000000246 R12: 0000000000000003 [ 274.484475] R13: 0000000000000016 R14: 00007ffcd9b77fa0 R15: 00007ffcd9b78da4 [ 274.491602] Modules linked in: cls_bpf sch_ingress iptable_filter ip_tables algif_hash af_alg x86_pkg_temp_thermal fuse [last unloaded: test_bpf] [ 274.504634] CR2: 0000000000000000 [ 274.507976] ---[ end trace 196d18386545eae1 ]--- [ 274.512588] RIP: 0010: (null) [ 274.516334] Code: Bad RIP value. [ 274.519557] RSP: 0018:ffffbc9681f83540 EFLAGS: 00010286 [ 274.524775] RAX: 0000000000000000 RBX: ffffdc967fa80a18 RCX: 0000000000000000 [ 274.531921] RDX: ffff9db2ee08b540 RSI: 000000000000000e RDI: ffffdc967fa809a0 [ 274.539082] RBP: ffffbc9681f83580 R08: ffff9db2c4d62690 R09: 000000000000000c [ 274.546205] R10: 0000000000000000 R11: ffff9db2ee08b540 R12: ffff9db31ce7c000 [ 274.553329] R13: 0000000000000001 R14: 000000000000000c R15: ffff9db3179cf400 [ 274.560456] FS: 00007ff4ae7c5740(0000) GS:ffff9db31fa80000(0000) knlGS:0000000000000000 [ 274.568541] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 274.574277] CR2: ffffffffffffffd6 CR3: 00000004574da004 CR4: 00000000003606e0 [ 274.581403] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 274.588535] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 274.595658] Kernel panic - not syncing: Fatal exception in interrupt [ 274.602046] Kernel Offset: 0x14400000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 274.612827] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- [ 274.620387] ------------[ cut here ]------------ I'm also seeing the same failure on x86_64, and it reproduces consistently. >From poking around it looks like the skb's dst entry is being used to calculate the mtu in: mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; ...but because that dst_entry has an "ops" value set to md_dst_ops, the various ops (including mtu) are not set: crash> struct sk_buff._skb_refdst ffff928f87447700 -x _skb_refdst = 0xffffcd6fbf5ea590 crash> struct dst_entry.ops 0xffffcd6fbf5ea590 ops = 0xffffffffa0193800 crash> struct dst_ops.mtu 0xffffffffa0193800 mtu = 0x0 crash> I confirmed that the dst entry also has dst->input set to dst_md_discard, so it looks like it's an entry that's been initialized via __metadata_dst_init alright. I think the fix here is to use skb_valid_dst(skb) - it checks for DST_METADATA also, and with that fix in place, the problem - which was previously 100% reproducible - disappears. The below patch resolves the panic and all bpf tunnel tests pass without incident. Fixes: c8b34e680a09 ("ip_tunnel: Add tnl_update_pmtu in ip_md_tunnel_xmit") Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org> Signed-off-by: Alan Maguire <alan.maguire@oracle.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Tested-by: Anders Roxell <anders.roxell@linaro.org> Reported-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Tested-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-03-06 19:25:42 +09:00
mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
}
iptunnel: NULL pointer deref for ip_md_tunnel_xmit Naresh Kamboju noted the following oops during execution of selftest tools/testing/selftests/bpf/test_tunnel.sh on x86_64: [ 274.120445] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [ 274.128285] #PF error: [INSTR] [ 274.131351] PGD 8000000414a0e067 P4D 8000000414a0e067 PUD 3b6334067 PMD 0 [ 274.138241] Oops: 0010 [#1] SMP PTI [ 274.141734] CPU: 1 PID: 11464 Comm: ping Not tainted 5.0.0-rc4-next-20190129 #1 [ 274.149046] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 07/27/2017 [ 274.156526] RIP: 0010: (null) [ 274.160280] Code: Bad RIP value. [ 274.163509] RSP: 0018:ffffbc9681f83540 EFLAGS: 00010286 [ 274.168726] RAX: 0000000000000000 RBX: ffffdc967fa80a18 RCX: 0000000000000000 [ 274.175851] RDX: ffff9db2ee08b540 RSI: 000000000000000e RDI: ffffdc967fa809a0 [ 274.182974] RBP: ffffbc9681f83580 R08: ffff9db2c4d62690 R09: 000000000000000c [ 274.190098] R10: 0000000000000000 R11: ffff9db2ee08b540 R12: ffff9db31ce7c000 [ 274.197222] R13: 0000000000000001 R14: 000000000000000c R15: ffff9db3179cf400 [ 274.204346] FS: 00007ff4ae7c5740(0000) GS:ffff9db31fa80000(0000) knlGS:0000000000000000 [ 274.212424] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 274.218162] CR2: ffffffffffffffd6 CR3: 00000004574da004 CR4: 00000000003606e0 [ 274.225292] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 274.232416] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 274.239541] Call Trace: [ 274.241988] ? tnl_update_pmtu+0x296/0x3b0 [ 274.246085] ip_md_tunnel_xmit+0x1bc/0x520 [ 274.250176] gre_fb_xmit+0x330/0x390 [ 274.253754] gre_tap_xmit+0x128/0x180 [ 274.257414] dev_hard_start_xmit+0xb7/0x300 [ 274.261598] sch_direct_xmit+0xf6/0x290 [ 274.265430] __qdisc_run+0x15d/0x5e0 [ 274.269007] __dev_queue_xmit+0x2c5/0xc00 [ 274.273011] ? dev_queue_xmit+0x10/0x20 [ 274.276842] ? eth_header+0x2b/0xc0 [ 274.280326] dev_queue_xmit+0x10/0x20 [ 274.283984] ? dev_queue_xmit+0x10/0x20 [ 274.287813] arp_xmit+0x1a/0xf0 [ 274.290952] arp_send_dst.part.19+0x46/0x60 [ 274.295138] arp_solicit+0x177/0x6b0 [ 274.298708] ? mod_timer+0x18e/0x440 [ 274.302281] neigh_probe+0x57/0x70 [ 274.305684] __neigh_event_send+0x197/0x2d0 [ 274.309862] neigh_resolve_output+0x18c/0x210 [ 274.314212] ip_finish_output2+0x257/0x690 [ 274.318304] ip_finish_output+0x219/0x340 [ 274.322314] ? ip_finish_output+0x219/0x340 [ 274.326493] ip_output+0x76/0x240 [ 274.329805] ? ip_fragment.constprop.53+0x80/0x80 [ 274.334510] ip_local_out+0x3f/0x70 [ 274.337992] ip_send_skb+0x19/0x40 [ 274.341391] ip_push_pending_frames+0x33/0x40 [ 274.345740] raw_sendmsg+0xc15/0x11d0 [ 274.349403] ? __might_fault+0x85/0x90 [ 274.353151] ? _copy_from_user+0x6b/0xa0 [ 274.357070] ? rw_copy_check_uvector+0x54/0x130 [ 274.361604] inet_sendmsg+0x42/0x1c0 [ 274.365179] ? inet_sendmsg+0x42/0x1c0 [ 274.368937] sock_sendmsg+0x3e/0x50 [ 274.372460] ___sys_sendmsg+0x26f/0x2d0 [ 274.376293] ? lock_acquire+0x95/0x190 [ 274.380043] ? __handle_mm_fault+0x7ce/0xb70 [ 274.384307] ? lock_acquire+0x95/0x190 [ 274.388053] ? __audit_syscall_entry+0xdd/0x130 [ 274.392586] ? ktime_get_coarse_real_ts64+0x64/0xc0 [ 274.397461] ? __audit_syscall_entry+0xdd/0x130 [ 274.401989] ? trace_hardirqs_on+0x4c/0x100 [ 274.406173] __sys_sendmsg+0x63/0xa0 [ 274.409744] ? __sys_sendmsg+0x63/0xa0 [ 274.413488] __x64_sys_sendmsg+0x1f/0x30 [ 274.417405] do_syscall_64+0x55/0x190 [ 274.421064] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 274.426113] RIP: 0033:0x7ff4ae0e6e87 [ 274.429686] Code: 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 80 00 00 00 00 8b 05 ca d9 2b 00 48 63 d2 48 63 ff 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 53 48 89 f3 48 83 ec 10 48 89 7c 24 08 [ 274.448422] RSP: 002b:00007ffcd9b76db8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 274.455978] RAX: ffffffffffffffda RBX: 0000000000000040 RCX: 00007ff4ae0e6e87 [ 274.463104] RDX: 0000000000000000 RSI: 00000000006092e0 RDI: 0000000000000003 [ 274.470228] RBP: 0000000000000000 R08: 00007ffcd9bc40a0 R09: 00007ffcd9bc4080 [ 274.477349] R10: 000000000000060a R11: 0000000000000246 R12: 0000000000000003 [ 274.484475] R13: 0000000000000016 R14: 00007ffcd9b77fa0 R15: 00007ffcd9b78da4 [ 274.491602] Modules linked in: cls_bpf sch_ingress iptable_filter ip_tables algif_hash af_alg x86_pkg_temp_thermal fuse [last unloaded: test_bpf] [ 274.504634] CR2: 0000000000000000 [ 274.507976] ---[ end trace 196d18386545eae1 ]--- [ 274.512588] RIP: 0010: (null) [ 274.516334] Code: Bad RIP value. [ 274.519557] RSP: 0018:ffffbc9681f83540 EFLAGS: 00010286 [ 274.524775] RAX: 0000000000000000 RBX: ffffdc967fa80a18 RCX: 0000000000000000 [ 274.531921] RDX: ffff9db2ee08b540 RSI: 000000000000000e RDI: ffffdc967fa809a0 [ 274.539082] RBP: ffffbc9681f83580 R08: ffff9db2c4d62690 R09: 000000000000000c [ 274.546205] R10: 0000000000000000 R11: ffff9db2ee08b540 R12: ffff9db31ce7c000 [ 274.553329] R13: 0000000000000001 R14: 000000000000000c R15: ffff9db3179cf400 [ 274.560456] FS: 00007ff4ae7c5740(0000) GS:ffff9db31fa80000(0000) knlGS:0000000000000000 [ 274.568541] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 274.574277] CR2: ffffffffffffffd6 CR3: 00000004574da004 CR4: 00000000003606e0 [ 274.581403] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 274.588535] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 274.595658] Kernel panic - not syncing: Fatal exception in interrupt [ 274.602046] Kernel Offset: 0x14400000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 274.612827] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- [ 274.620387] ------------[ cut here ]------------ I'm also seeing the same failure on x86_64, and it reproduces consistently. >From poking around it looks like the skb's dst entry is being used to calculate the mtu in: mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; ...but because that dst_entry has an "ops" value set to md_dst_ops, the various ops (including mtu) are not set: crash> struct sk_buff._skb_refdst ffff928f87447700 -x _skb_refdst = 0xffffcd6fbf5ea590 crash> struct dst_entry.ops 0xffffcd6fbf5ea590 ops = 0xffffffffa0193800 crash> struct dst_ops.mtu 0xffffffffa0193800 mtu = 0x0 crash> I confirmed that the dst entry also has dst->input set to dst_md_discard, so it looks like it's an entry that's been initialized via __metadata_dst_init alright. I think the fix here is to use skb_valid_dst(skb) - it checks for DST_METADATA also, and with that fix in place, the problem - which was previously 100% reproducible - disappears. The below patch resolves the panic and all bpf tunnel tests pass without incident. Fixes: c8b34e680a09 ("ip_tunnel: Add tnl_update_pmtu in ip_md_tunnel_xmit") Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org> Signed-off-by: Alan Maguire <alan.maguire@oracle.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Tested-by: Anders Roxell <anders.roxell@linaro.org> Reported-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Tested-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-03-06 19:25:42 +09:00
if (skb_valid_dst(skb))
skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->protocol == htons(ETH_P_IP)) {
if (!skb_is_gso(skb) &&
(inner_iph->frag_off & htons(IP_DF)) &&
mtu < pkt_size) {
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
return -E2BIG;
}
}
#if IS_ENABLED(CONFIG_IPV6)
else if (skb->protocol == htons(ETH_P_IPV6)) {
iptunnel: NULL pointer deref for ip_md_tunnel_xmit Naresh Kamboju noted the following oops during execution of selftest tools/testing/selftests/bpf/test_tunnel.sh on x86_64: [ 274.120445] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [ 274.128285] #PF error: [INSTR] [ 274.131351] PGD 8000000414a0e067 P4D 8000000414a0e067 PUD 3b6334067 PMD 0 [ 274.138241] Oops: 0010 [#1] SMP PTI [ 274.141734] CPU: 1 PID: 11464 Comm: ping Not tainted 5.0.0-rc4-next-20190129 #1 [ 274.149046] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 07/27/2017 [ 274.156526] RIP: 0010: (null) [ 274.160280] Code: Bad RIP value. [ 274.163509] RSP: 0018:ffffbc9681f83540 EFLAGS: 00010286 [ 274.168726] RAX: 0000000000000000 RBX: ffffdc967fa80a18 RCX: 0000000000000000 [ 274.175851] RDX: ffff9db2ee08b540 RSI: 000000000000000e RDI: ffffdc967fa809a0 [ 274.182974] RBP: ffffbc9681f83580 R08: ffff9db2c4d62690 R09: 000000000000000c [ 274.190098] R10: 0000000000000000 R11: ffff9db2ee08b540 R12: ffff9db31ce7c000 [ 274.197222] R13: 0000000000000001 R14: 000000000000000c R15: ffff9db3179cf400 [ 274.204346] FS: 00007ff4ae7c5740(0000) GS:ffff9db31fa80000(0000) knlGS:0000000000000000 [ 274.212424] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 274.218162] CR2: ffffffffffffffd6 CR3: 00000004574da004 CR4: 00000000003606e0 [ 274.225292] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 274.232416] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 274.239541] Call Trace: [ 274.241988] ? tnl_update_pmtu+0x296/0x3b0 [ 274.246085] ip_md_tunnel_xmit+0x1bc/0x520 [ 274.250176] gre_fb_xmit+0x330/0x390 [ 274.253754] gre_tap_xmit+0x128/0x180 [ 274.257414] dev_hard_start_xmit+0xb7/0x300 [ 274.261598] sch_direct_xmit+0xf6/0x290 [ 274.265430] __qdisc_run+0x15d/0x5e0 [ 274.269007] __dev_queue_xmit+0x2c5/0xc00 [ 274.273011] ? dev_queue_xmit+0x10/0x20 [ 274.276842] ? eth_header+0x2b/0xc0 [ 274.280326] dev_queue_xmit+0x10/0x20 [ 274.283984] ? dev_queue_xmit+0x10/0x20 [ 274.287813] arp_xmit+0x1a/0xf0 [ 274.290952] arp_send_dst.part.19+0x46/0x60 [ 274.295138] arp_solicit+0x177/0x6b0 [ 274.298708] ? mod_timer+0x18e/0x440 [ 274.302281] neigh_probe+0x57/0x70 [ 274.305684] __neigh_event_send+0x197/0x2d0 [ 274.309862] neigh_resolve_output+0x18c/0x210 [ 274.314212] ip_finish_output2+0x257/0x690 [ 274.318304] ip_finish_output+0x219/0x340 [ 274.322314] ? ip_finish_output+0x219/0x340 [ 274.326493] ip_output+0x76/0x240 [ 274.329805] ? ip_fragment.constprop.53+0x80/0x80 [ 274.334510] ip_local_out+0x3f/0x70 [ 274.337992] ip_send_skb+0x19/0x40 [ 274.341391] ip_push_pending_frames+0x33/0x40 [ 274.345740] raw_sendmsg+0xc15/0x11d0 [ 274.349403] ? __might_fault+0x85/0x90 [ 274.353151] ? _copy_from_user+0x6b/0xa0 [ 274.357070] ? rw_copy_check_uvector+0x54/0x130 [ 274.361604] inet_sendmsg+0x42/0x1c0 [ 274.365179] ? inet_sendmsg+0x42/0x1c0 [ 274.368937] sock_sendmsg+0x3e/0x50 [ 274.372460] ___sys_sendmsg+0x26f/0x2d0 [ 274.376293] ? lock_acquire+0x95/0x190 [ 274.380043] ? __handle_mm_fault+0x7ce/0xb70 [ 274.384307] ? lock_acquire+0x95/0x190 [ 274.388053] ? __audit_syscall_entry+0xdd/0x130 [ 274.392586] ? ktime_get_coarse_real_ts64+0x64/0xc0 [ 274.397461] ? __audit_syscall_entry+0xdd/0x130 [ 274.401989] ? trace_hardirqs_on+0x4c/0x100 [ 274.406173] __sys_sendmsg+0x63/0xa0 [ 274.409744] ? __sys_sendmsg+0x63/0xa0 [ 274.413488] __x64_sys_sendmsg+0x1f/0x30 [ 274.417405] do_syscall_64+0x55/0x190 [ 274.421064] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 274.426113] RIP: 0033:0x7ff4ae0e6e87 [ 274.429686] Code: 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 80 00 00 00 00 8b 05 ca d9 2b 00 48 63 d2 48 63 ff 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 53 48 89 f3 48 83 ec 10 48 89 7c 24 08 [ 274.448422] RSP: 002b:00007ffcd9b76db8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 274.455978] RAX: ffffffffffffffda RBX: 0000000000000040 RCX: 00007ff4ae0e6e87 [ 274.463104] RDX: 0000000000000000 RSI: 00000000006092e0 RDI: 0000000000000003 [ 274.470228] RBP: 0000000000000000 R08: 00007ffcd9bc40a0 R09: 00007ffcd9bc4080 [ 274.477349] R10: 000000000000060a R11: 0000000000000246 R12: 0000000000000003 [ 274.484475] R13: 0000000000000016 R14: 00007ffcd9b77fa0 R15: 00007ffcd9b78da4 [ 274.491602] Modules linked in: cls_bpf sch_ingress iptable_filter ip_tables algif_hash af_alg x86_pkg_temp_thermal fuse [last unloaded: test_bpf] [ 274.504634] CR2: 0000000000000000 [ 274.507976] ---[ end trace 196d18386545eae1 ]--- [ 274.512588] RIP: 0010: (null) [ 274.516334] Code: Bad RIP value. [ 274.519557] RSP: 0018:ffffbc9681f83540 EFLAGS: 00010286 [ 274.524775] RAX: 0000000000000000 RBX: ffffdc967fa80a18 RCX: 0000000000000000 [ 274.531921] RDX: ffff9db2ee08b540 RSI: 000000000000000e RDI: ffffdc967fa809a0 [ 274.539082] RBP: ffffbc9681f83580 R08: ffff9db2c4d62690 R09: 000000000000000c [ 274.546205] R10: 0000000000000000 R11: ffff9db2ee08b540 R12: ffff9db31ce7c000 [ 274.553329] R13: 0000000000000001 R14: 000000000000000c R15: ffff9db3179cf400 [ 274.560456] FS: 00007ff4ae7c5740(0000) GS:ffff9db31fa80000(0000) knlGS:0000000000000000 [ 274.568541] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 274.574277] CR2: ffffffffffffffd6 CR3: 00000004574da004 CR4: 00000000003606e0 [ 274.581403] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 274.588535] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 274.595658] Kernel panic - not syncing: Fatal exception in interrupt [ 274.602046] Kernel Offset: 0x14400000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 274.612827] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- [ 274.620387] ------------[ cut here ]------------ I'm also seeing the same failure on x86_64, and it reproduces consistently. >From poking around it looks like the skb's dst entry is being used to calculate the mtu in: mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; ...but because that dst_entry has an "ops" value set to md_dst_ops, the various ops (including mtu) are not set: crash> struct sk_buff._skb_refdst ffff928f87447700 -x _skb_refdst = 0xffffcd6fbf5ea590 crash> struct dst_entry.ops 0xffffcd6fbf5ea590 ops = 0xffffffffa0193800 crash> struct dst_ops.mtu 0xffffffffa0193800 mtu = 0x0 crash> I confirmed that the dst entry also has dst->input set to dst_md_discard, so it looks like it's an entry that's been initialized via __metadata_dst_init alright. I think the fix here is to use skb_valid_dst(skb) - it checks for DST_METADATA also, and with that fix in place, the problem - which was previously 100% reproducible - disappears. The below patch resolves the panic and all bpf tunnel tests pass without incident. Fixes: c8b34e680a09 ("ip_tunnel: Add tnl_update_pmtu in ip_md_tunnel_xmit") Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org> Signed-off-by: Alan Maguire <alan.maguire@oracle.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Tested-by: Anders Roxell <anders.roxell@linaro.org> Reported-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Tested-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-03-06 19:25:42 +09:00
struct rt6_info *rt6;
__be32 daddr;
iptunnel: NULL pointer deref for ip_md_tunnel_xmit Naresh Kamboju noted the following oops during execution of selftest tools/testing/selftests/bpf/test_tunnel.sh on x86_64: [ 274.120445] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [ 274.128285] #PF error: [INSTR] [ 274.131351] PGD 8000000414a0e067 P4D 8000000414a0e067 PUD 3b6334067 PMD 0 [ 274.138241] Oops: 0010 [#1] SMP PTI [ 274.141734] CPU: 1 PID: 11464 Comm: ping Not tainted 5.0.0-rc4-next-20190129 #1 [ 274.149046] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 07/27/2017 [ 274.156526] RIP: 0010: (null) [ 274.160280] Code: Bad RIP value. [ 274.163509] RSP: 0018:ffffbc9681f83540 EFLAGS: 00010286 [ 274.168726] RAX: 0000000000000000 RBX: ffffdc967fa80a18 RCX: 0000000000000000 [ 274.175851] RDX: ffff9db2ee08b540 RSI: 000000000000000e RDI: ffffdc967fa809a0 [ 274.182974] RBP: ffffbc9681f83580 R08: ffff9db2c4d62690 R09: 000000000000000c [ 274.190098] R10: 0000000000000000 R11: ffff9db2ee08b540 R12: ffff9db31ce7c000 [ 274.197222] R13: 0000000000000001 R14: 000000000000000c R15: ffff9db3179cf400 [ 274.204346] FS: 00007ff4ae7c5740(0000) GS:ffff9db31fa80000(0000) knlGS:0000000000000000 [ 274.212424] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 274.218162] CR2: ffffffffffffffd6 CR3: 00000004574da004 CR4: 00000000003606e0 [ 274.225292] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 274.232416] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 274.239541] Call Trace: [ 274.241988] ? tnl_update_pmtu+0x296/0x3b0 [ 274.246085] ip_md_tunnel_xmit+0x1bc/0x520 [ 274.250176] gre_fb_xmit+0x330/0x390 [ 274.253754] gre_tap_xmit+0x128/0x180 [ 274.257414] dev_hard_start_xmit+0xb7/0x300 [ 274.261598] sch_direct_xmit+0xf6/0x290 [ 274.265430] __qdisc_run+0x15d/0x5e0 [ 274.269007] __dev_queue_xmit+0x2c5/0xc00 [ 274.273011] ? dev_queue_xmit+0x10/0x20 [ 274.276842] ? eth_header+0x2b/0xc0 [ 274.280326] dev_queue_xmit+0x10/0x20 [ 274.283984] ? dev_queue_xmit+0x10/0x20 [ 274.287813] arp_xmit+0x1a/0xf0 [ 274.290952] arp_send_dst.part.19+0x46/0x60 [ 274.295138] arp_solicit+0x177/0x6b0 [ 274.298708] ? mod_timer+0x18e/0x440 [ 274.302281] neigh_probe+0x57/0x70 [ 274.305684] __neigh_event_send+0x197/0x2d0 [ 274.309862] neigh_resolve_output+0x18c/0x210 [ 274.314212] ip_finish_output2+0x257/0x690 [ 274.318304] ip_finish_output+0x219/0x340 [ 274.322314] ? ip_finish_output+0x219/0x340 [ 274.326493] ip_output+0x76/0x240 [ 274.329805] ? ip_fragment.constprop.53+0x80/0x80 [ 274.334510] ip_local_out+0x3f/0x70 [ 274.337992] ip_send_skb+0x19/0x40 [ 274.341391] ip_push_pending_frames+0x33/0x40 [ 274.345740] raw_sendmsg+0xc15/0x11d0 [ 274.349403] ? __might_fault+0x85/0x90 [ 274.353151] ? _copy_from_user+0x6b/0xa0 [ 274.357070] ? rw_copy_check_uvector+0x54/0x130 [ 274.361604] inet_sendmsg+0x42/0x1c0 [ 274.365179] ? inet_sendmsg+0x42/0x1c0 [ 274.368937] sock_sendmsg+0x3e/0x50 [ 274.372460] ___sys_sendmsg+0x26f/0x2d0 [ 274.376293] ? lock_acquire+0x95/0x190 [ 274.380043] ? __handle_mm_fault+0x7ce/0xb70 [ 274.384307] ? lock_acquire+0x95/0x190 [ 274.388053] ? __audit_syscall_entry+0xdd/0x130 [ 274.392586] ? ktime_get_coarse_real_ts64+0x64/0xc0 [ 274.397461] ? __audit_syscall_entry+0xdd/0x130 [ 274.401989] ? trace_hardirqs_on+0x4c/0x100 [ 274.406173] __sys_sendmsg+0x63/0xa0 [ 274.409744] ? __sys_sendmsg+0x63/0xa0 [ 274.413488] __x64_sys_sendmsg+0x1f/0x30 [ 274.417405] do_syscall_64+0x55/0x190 [ 274.421064] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 274.426113] RIP: 0033:0x7ff4ae0e6e87 [ 274.429686] Code: 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 80 00 00 00 00 8b 05 ca d9 2b 00 48 63 d2 48 63 ff 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 53 48 89 f3 48 83 ec 10 48 89 7c 24 08 [ 274.448422] RSP: 002b:00007ffcd9b76db8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 274.455978] RAX: ffffffffffffffda RBX: 0000000000000040 RCX: 00007ff4ae0e6e87 [ 274.463104] RDX: 0000000000000000 RSI: 00000000006092e0 RDI: 0000000000000003 [ 274.470228] RBP: 0000000000000000 R08: 00007ffcd9bc40a0 R09: 00007ffcd9bc4080 [ 274.477349] R10: 000000000000060a R11: 0000000000000246 R12: 0000000000000003 [ 274.484475] R13: 0000000000000016 R14: 00007ffcd9b77fa0 R15: 00007ffcd9b78da4 [ 274.491602] Modules linked in: cls_bpf sch_ingress iptable_filter ip_tables algif_hash af_alg x86_pkg_temp_thermal fuse [last unloaded: test_bpf] [ 274.504634] CR2: 0000000000000000 [ 274.507976] ---[ end trace 196d18386545eae1 ]--- [ 274.512588] RIP: 0010: (null) [ 274.516334] Code: Bad RIP value. [ 274.519557] RSP: 0018:ffffbc9681f83540 EFLAGS: 00010286 [ 274.524775] RAX: 0000000000000000 RBX: ffffdc967fa80a18 RCX: 0000000000000000 [ 274.531921] RDX: ffff9db2ee08b540 RSI: 000000000000000e RDI: ffffdc967fa809a0 [ 274.539082] RBP: ffffbc9681f83580 R08: ffff9db2c4d62690 R09: 000000000000000c [ 274.546205] R10: 0000000000000000 R11: ffff9db2ee08b540 R12: ffff9db31ce7c000 [ 274.553329] R13: 0000000000000001 R14: 000000000000000c R15: ffff9db3179cf400 [ 274.560456] FS: 00007ff4ae7c5740(0000) GS:ffff9db31fa80000(0000) knlGS:0000000000000000 [ 274.568541] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 274.574277] CR2: ffffffffffffffd6 CR3: 00000004574da004 CR4: 00000000003606e0 [ 274.581403] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 274.588535] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 274.595658] Kernel panic - not syncing: Fatal exception in interrupt [ 274.602046] Kernel Offset: 0x14400000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 274.612827] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- [ 274.620387] ------------[ cut here ]------------ I'm also seeing the same failure on x86_64, and it reproduces consistently. >From poking around it looks like the skb's dst entry is being used to calculate the mtu in: mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; ...but because that dst_entry has an "ops" value set to md_dst_ops, the various ops (including mtu) are not set: crash> struct sk_buff._skb_refdst ffff928f87447700 -x _skb_refdst = 0xffffcd6fbf5ea590 crash> struct dst_entry.ops 0xffffcd6fbf5ea590 ops = 0xffffffffa0193800 crash> struct dst_ops.mtu 0xffffffffa0193800 mtu = 0x0 crash> I confirmed that the dst entry also has dst->input set to dst_md_discard, so it looks like it's an entry that's been initialized via __metadata_dst_init alright. I think the fix here is to use skb_valid_dst(skb) - it checks for DST_METADATA also, and with that fix in place, the problem - which was previously 100% reproducible - disappears. The below patch resolves the panic and all bpf tunnel tests pass without incident. Fixes: c8b34e680a09 ("ip_tunnel: Add tnl_update_pmtu in ip_md_tunnel_xmit") Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org> Signed-off-by: Alan Maguire <alan.maguire@oracle.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Tested-by: Anders Roxell <anders.roxell@linaro.org> Reported-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Tested-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-03-06 19:25:42 +09:00
rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
NULL;
daddr = md ? dst : tunnel->parms.iph.daddr;
if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
mtu >= IPV6_MIN_MTU) {
if ((daddr && !ipv4_is_multicast(daddr)) ||
rt6->rt6i_dst.plen == 128) {
rt6->rt6i_flags |= RTF_MODIFIED;
dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
}
}
if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
mtu < pkt_size) {
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
return -E2BIG;
}
}
#endif
return 0;
}
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
u8 proto, int tunnel_hlen)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
u32 headroom = sizeof(struct iphdr);
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
const struct iphdr *inner_iph;
struct rtable *rt = NULL;
struct flowi4 fl4;
__be16 df = 0;
u8 tos, ttl;
bool use_cache;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
ip_tunnel_info_af(tun_info) != AF_INET))
goto tx_error;
key = &tun_info->key;
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
tos = key->tos;
if (tos == 1) {
if (skb->protocol == htons(ETH_P_IP))
tos = inner_iph->tos;
else if (skb->protocol == htons(ETH_P_IPV6))
tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
}
ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
0, skb->mark, skb_get_hash(skb));
if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
goto tx_error;
use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
if (use_cache)
rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
if (!rt) {
rt = ip_route_output_key(tunnel->net, &fl4);
if (IS_ERR(rt)) {
dev->stats.tx_carrier_errors++;
goto tx_error;
}
if (use_cache)
dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
fl4.saddr);
}
if (rt->dst.dev == dev) {
ip_rt_put(rt);
dev->stats.collisions++;
goto tx_error;
}
if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
df = htons(IP_DF);
if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
key->u.ipv4.dst, true)) {
ip_rt_put(rt);
goto tx_error;
}
tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
ttl = key->ttl;
if (ttl == 0) {
if (skb->protocol == htons(ETH_P_IP))
ttl = inner_iph->ttl;
else if (skb->protocol == htons(ETH_P_IPV6))
ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
else
ttl = ip4_dst_hoplimit(&rt->dst);
}
headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
if (headroom > dev->needed_headroom)
dev->needed_headroom = headroom;
if (skb_cow_head(skb, dev->needed_headroom)) {
ip_rt_put(rt);
goto tx_dropped;
}
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
df, !net_eq(tunnel->net, dev_net(dev)));
return;
tx_error:
dev->stats.tx_errors++;
goto kfree;
tx_dropped:
dev->stats.tx_dropped++;
kfree:
kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params, u8 protocol)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct ip_tunnel_info *tun_info = NULL;
const struct iphdr *inner_iph;
unsigned int max_headroom; /* The extra header space needed */
struct rtable *rt = NULL; /* Route to the other host */
bool use_cache = false;
struct flowi4 fl4;
bool md = false;
bool connected;
u8 tos, ttl;
__be32 dst;
__be16 df;
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
connected = (tunnel->parms.iph.daddr != 0);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
dst = tnl_params->daddr;
if (dst == 0) {
/* NBMA tunnel */
if (!skb_dst(skb)) {
dev->stats.tx_fifo_errors++;
goto tx_error;
}
tun_info = skb_tunnel_info(skb);
if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
ip_tunnel_info_af(tun_info) == AF_INET &&
tun_info->key.u.ipv4.dst) {
dst = tun_info->key.u.ipv4.dst;
md = true;
connected = true;
}
else if (skb->protocol == htons(ETH_P_IP)) {
rt = skb_rtable(skb);
dst = rt_nexthop(rt, inner_iph->daddr);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (skb->protocol == htons(ETH_P_IPV6)) {
const struct in6_addr *addr6;
struct neighbour *neigh;
bool do_tx_error_icmp;
int addr_type;
neigh = dst_neigh_lookup(skb_dst(skb),
&ipv6_hdr(skb)->daddr);
if (!neigh)
goto tx_error;
addr6 = (const struct in6_addr *)&neigh->primary_key;
addr_type = ipv6_addr_type(addr6);
if (addr_type == IPV6_ADDR_ANY) {
addr6 = &ipv6_hdr(skb)->daddr;
addr_type = ipv6_addr_type(addr6);
}
if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
do_tx_error_icmp = true;
else {
do_tx_error_icmp = false;
dst = addr6->s6_addr32[3];
}
neigh_release(neigh);
if (do_tx_error_icmp)
goto tx_error_icmp;
}
#endif
else
goto tx_error;
if (!md)
connected = false;
}
tos = tnl_params->tos;
if (tos & 0x1) {
tos &= ~0x1;
if (skb->protocol == htons(ETH_P_IP)) {
tos = inner_iph->tos;
connected = false;
} else if (skb->protocol == htons(ETH_P_IPV6)) {
tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
connected = false;
}
}
ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
tunnel->fwmark, skb_get_hash(skb));
if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
goto tx_error;
if (connected && md) {
use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
if (use_cache)
rt = dst_cache_get_ip4(&tun_info->dst_cache,
&fl4.saddr);
} else {
rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
&fl4.saddr) : NULL;
}
if (!rt) {
rt = ip_route_output_key(tunnel->net, &fl4);
if (IS_ERR(rt)) {
dev->stats.tx_carrier_errors++;
goto tx_error;
}
if (use_cache)
dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
fl4.saddr);
else if (!md && connected)
dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
fl4.saddr);
}
if (rt->dst.dev == dev) {
ip_rt_put(rt);
dev->stats.collisions++;
goto tx_error;
}
df = tnl_params->frag_off;
if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
df |= (inner_iph->frag_off & htons(IP_DF));
if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
ip_rt_put(rt);
goto tx_error;
}
if (tunnel->err_count > 0) {
if (time_before(jiffies,
tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
dst_link_failure(skb);
} else
tunnel->err_count = 0;
}
tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
ttl = tnl_params->ttl;
if (ttl == 0) {
if (skb->protocol == htons(ETH_P_IP))
ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
else if (skb->protocol == htons(ETH_P_IPV6))
ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
else
ttl = ip4_dst_hoplimit(&rt->dst);
}
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
if (max_headroom > dev->needed_headroom)
dev->needed_headroom = max_headroom;
if (skb_cow_head(skb, dev->needed_headroom)) {
ip_rt_put(rt);
dev->stats.tx_dropped++;
kfree_skb(skb);
return;
}
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
df, !net_eq(tunnel->net, dev_net(dev)));
return;
#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
dst_link_failure(skb);
#endif
tx_error:
dev->stats.tx_errors++;
kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
static void ip_tunnel_update(struct ip_tunnel_net *itn,
struct ip_tunnel *t,
struct net_device *dev,
struct ip_tunnel_parm *p,
bool set_mtu,
__u32 fwmark)
{
ip_tunnel_del(itn, t);
t->parms.iph.saddr = p->iph.saddr;
t->parms.iph.daddr = p->iph.daddr;
t->parms.i_key = p->i_key;
t->parms.o_key = p->o_key;
if (dev->type != ARPHRD_ETHER) {
memcpy(dev->dev_addr, &p->iph.saddr, 4);
memcpy(dev->broadcast, &p->iph.daddr, 4);
}
ip_tunnel_add(itn, t);
t->parms.iph.ttl = p->iph.ttl;
t->parms.iph.tos = p->iph.tos;
t->parms.iph.frag_off = p->iph.frag_off;
if (t->parms.link != p->link || t->fwmark != fwmark) {
int mtu;
t->parms.link = p->link;
t->fwmark = fwmark;
mtu = ip_tunnel_bind_dev(dev);
if (set_mtu)
dev->mtu = mtu;
}
dst_cache_reset(&t->dst_cache);
netdev_state_change(dev);
}
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
int err = 0;
struct ip_tunnel *t = netdev_priv(dev);
struct net *net = t->net;
struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
switch (cmd) {
case SIOCGETTUNNEL:
if (dev == itn->fb_tunnel_dev) {
t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
if (!t)
t = netdev_priv(dev);
}
memcpy(p, &t->parms, sizeof(*p));
break;
case SIOCADDTUNNEL:
case SIOCCHGTUNNEL:
err = -EPERM;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto done;
if (p->iph.ttl)
p->iph.frag_off |= htons(IP_DF);
if (!(p->i_flags & VTI_ISVTI)) {
if (!(p->i_flags & TUNNEL_KEY))
p->i_key = 0;
if (!(p->o_flags & TUNNEL_KEY))
p->o_key = 0;
}
t = ip_tunnel_find(itn, p, itn->type);
if (cmd == SIOCADDTUNNEL) {
if (!t) {
t = ip_tunnel_create(net, itn, p);
err = PTR_ERR_OR_ZERO(t);
break;
}
err = -EEXIST;
break;
}
if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
if (t) {
if (t->dev != dev) {
err = -EEXIST;
break;
}
} else {
unsigned int nflags = 0;
if (ipv4_is_multicast(p->iph.daddr))
nflags = IFF_BROADCAST;
else if (p->iph.daddr)
nflags = IFF_POINTOPOINT;
if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
err = -EINVAL;
break;
}
t = netdev_priv(dev);
}
}
if (t) {
err = 0;
ip_tunnel_update(itn, t, dev, p, true, 0);
} else {
err = -ENOENT;
}
break;
case SIOCDELTUNNEL:
err = -EPERM;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto done;
if (dev == itn->fb_tunnel_dev) {
err = -ENOENT;
t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
if (!t)
goto done;
err = -EPERM;
if (t == netdev_priv(itn->fb_tunnel_dev))
goto done;
dev = t->dev;
}
unregister_netdevice(dev);
err = 0;
break;
default:
err = -EINVAL;
}
done:
return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
int max_mtu = IP_MAX_MTU - t_hlen;
if (dev->type == ARPHRD_ETHER)
max_mtu -= dev->hard_header_len;
if (new_mtu < ETH_MIN_MTU)
return -EINVAL;
if (new_mtu > max_mtu) {
if (strict)
return -EINVAL;
new_mtu = max_mtu;
}
dev->mtu = new_mtu;
return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
static void ip_tunnel_dev_free(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
gro_cells_destroy(&tunnel->gro_cells);
dst_cache_destroy(&tunnel->dst_cache);
free_percpu(dev->tstats);
}
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct ip_tunnel_net *itn;
itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
if (itn->fb_tunnel_dev != dev) {
ip_tunnel_del(itn, netdev_priv(dev));
unregister_netdevice_queue(dev, head);
}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);
int ip_tunnel_get_iflink(const struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);
netns: make struct pernet_operations::id unsigned int Make struct pernet_operations::id unsigned. There are 2 reasons to do so: 1) This field is really an index into an zero based array and thus is unsigned entity. Using negative value is out-of-bound access by definition. 2) On x86_64 unsigned 32-bit data which are mixed with pointers via array indexing or offsets added or subtracted to pointers are preffered to signed 32-bit data. "int" being used as an array index needs to be sign-extended to 64-bit before being used. void f(long *p, int i) { g(p[i]); } roughly translates to movsx rsi, esi mov rdi, [rsi+...] call g MOVSX is 3 byte instruction which isn't necessary if the variable is unsigned because x86_64 is zero extending by default. Now, there is net_generic() function which, you guessed it right, uses "int" as an array index: static inline void *net_generic(const struct net *net, int id) { ... ptr = ng->ptr[id - 1]; ... } And this function is used a lot, so those sign extensions add up. Patch snipes ~1730 bytes on allyesconfig kernel (without all junk messing with code generation): add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730) Unfortunately some functions actually grow bigger. This is a semmingly random artefact of code generation with register allocator being used differently. gcc decides that some variable needs to live in new r8+ registers and every access now requires REX prefix. Or it is shifted into r12, so [r12+0] addressing mode has to be used which is longer than [r8] However, overall balance is in negative direction: add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730) function old new delta nfsd4_lock 3886 3959 +73 tipc_link_build_proto_msg 1096 1140 +44 mac80211_hwsim_new_radio 2776 2808 +32 tipc_mon_rcv 1032 1058 +26 svcauth_gss_legacy_init 1413 1429 +16 tipc_bcbase_select_primary 379 392 +13 nfsd4_exchange_id 1247 1260 +13 nfsd4_setclientid_confirm 782 793 +11 ... put_client_renew_locked 494 480 -14 ip_set_sockfn_get 730 716 -14 geneve_sock_add 829 813 -16 nfsd4_sequence_done 721 703 -18 nlmclnt_lookup_host 708 686 -22 nfsd4_lockt 1085 1063 -22 nfs_get_client 1077 1050 -27 tcf_bpf_init 1106 1076 -30 nfsd4_encode_fattr 5997 5930 -67 Total: Before=154856051, After=154854321, chg -0.00% Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-17 10:58:21 +09:00
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
struct rtnl_link_ops *ops, char *devname)
{
struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
struct ip_tunnel_parm parms;
unsigned int i;
itn->rtnl_link_ops = ops;
for (i = 0; i < IP_TNL_HASH_SIZE; i++)
INIT_HLIST_HEAD(&itn->tunnels[i]);
if (!ops || !net_has_fallback_tunnels(net)) {
struct ip_tunnel_net *it_init_net;
it_init_net = net_generic(&init_net, ip_tnl_net_id);
itn->type = it_init_net->type;
itn->fb_tunnel_dev = NULL;
return 0;
}
memset(&parms, 0, sizeof(parms));
if (devname)
strlcpy(parms.name, devname, IFNAMSIZ);
rtnl_lock();
itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
/* FB netdevice is special: we have one, and only one per netns.
* Allowing to move it to another netns is clearly unsafe.
*/
if (!IS_ERR(itn->fb_tunnel_dev)) {
itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
itn->type = itn->fb_tunnel_dev->type;
}
rtnl_unlock();
return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
struct list_head *head,
struct rtnl_link_ops *ops)
{
struct net_device *dev, *aux;
int h;
for_each_netdev_safe(net, dev, aux)
if (dev->rtnl_link_ops == ops)
unregister_netdevice_queue(dev, head);
for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
struct ip_tunnel *t;
struct hlist_node *n;
struct hlist_head *thead = &itn->tunnels[h];
hlist_for_each_entry_safe(t, n, thead, hash_node)
/* If dev is in the same netns, it has already
* been added to the list by the previous loop.
*/
if (!net_eq(dev_net(t->dev), net))
unregister_netdevice_queue(t->dev, head);
}
}
void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
struct rtnl_link_ops *ops)
{
struct ip_tunnel_net *itn;
struct net *net;
LIST_HEAD(list);
rtnl_lock();
list_for_each_entry(net, net_list, exit_list) {
itn = net_generic(net, id);
ip_tunnel_destroy(net, itn, &list, ops);
}
unregister_netdevice_many(&list);
rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
struct ip_tunnel_parm *p, __u32 fwmark)
{
struct ip_tunnel *nt;
struct net *net = dev_net(dev);
struct ip_tunnel_net *itn;
int mtu;
int err;
nt = netdev_priv(dev);
itn = net_generic(net, nt->ip_tnl_net_id);
if (nt->collect_md) {
if (rtnl_dereference(itn->collect_md_tun))
return -EEXIST;
} else {
if (ip_tunnel_find(itn, p, dev->type))
return -EEXIST;
}
nt->net = net;
nt->parms = *p;
nt->fwmark = fwmark;
err = register_netdevice(dev);
if (err)
goto err_register_netdevice;
if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
eth_hw_addr_random(dev);
mtu = ip_tunnel_bind_dev(dev);
if (tb[IFLA_MTU]) {
unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
if (dev->type == ARPHRD_ETHER)
max -= dev->hard_header_len;
mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
}
err = dev_set_mtu(dev, mtu);
if (err)
goto err_dev_set_mtu;
ip_tunnel_add(itn, nt);
return 0;
err_dev_set_mtu:
unregister_netdevice(dev);
err_register_netdevice:
return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
struct ip_tunnel_parm *p, __u32 fwmark)
{
struct ip_tunnel *t;
struct ip_tunnel *tunnel = netdev_priv(dev);
struct net *net = tunnel->net;
struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
if (dev == itn->fb_tunnel_dev)
return -EINVAL;
t = ip_tunnel_find(itn, p, dev->type);
if (t) {
if (t->dev != dev)
return -EEXIST;
} else {
t = tunnel;
if (dev->type != ARPHRD_ETHER) {
unsigned int nflags = 0;
if (ipv4_is_multicast(p->iph.daddr))
nflags = IFF_BROADCAST;
else if (p->iph.daddr)
nflags = IFF_POINTOPOINT;
if ((dev->flags ^ nflags) &
(IFF_POINTOPOINT | IFF_BROADCAST))
return -EINVAL;
}
}
ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
int ip_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct iphdr *iph = &tunnel->parms.iph;
int err;
net: Fix inconsistent teardown and release of private netdev state. Network devices can allocate reasources and private memory using netdev_ops->ndo_init(). However, the release of these resources can occur in one of two different places. Either netdev_ops->ndo_uninit() or netdev->destructor(). The decision of which operation frees the resources depends upon whether it is necessary for all netdev refs to be released before it is safe to perform the freeing. netdev_ops->ndo_uninit() presumably can occur right after the NETDEV_UNREGISTER notifier completes and the unicast and multicast address lists are flushed. netdev->destructor(), on the other hand, does not run until the netdev references all go away. Further complicating the situation is that netdev->destructor() almost universally does also a free_netdev(). This creates a problem for the logic in register_netdevice(). Because all callers of register_netdevice() manage the freeing of the netdev, and invoke free_netdev(dev) if register_netdevice() fails. If netdev_ops->ndo_init() succeeds, but something else fails inside of register_netdevice(), it does call ndo_ops->ndo_uninit(). But it is not able to invoke netdev->destructor(). This is because netdev->destructor() will do a free_netdev() and then the caller of register_netdevice() will do the same. However, this means that the resources that would normally be released by netdev->destructor() will not be. Over the years drivers have added local hacks to deal with this, by invoking their destructor parts by hand when register_netdevice() fails. Many drivers do not try to deal with this, and instead we have leaks. Let's close this hole by formalizing the distinction between what private things need to be freed up by netdev->destructor() and whether the driver needs unregister_netdevice() to perform the free_netdev(). netdev->priv_destructor() performs all actions to free up the private resources that used to be freed by netdev->destructor(), except for free_netdev(). netdev->needs_free_netdev is a boolean that indicates whether free_netdev() should be done at the end of unregister_netdevice(). Now, register_netdevice() can sanely release all resources after ndo_ops->ndo_init() succeeds, by invoking both ndo_ops->ndo_uninit() and netdev->priv_destructor(). And at the end of unregister_netdevice(), we invoke netdev->priv_destructor() and optionally call free_netdev(). Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-09 01:52:56 +09:00
dev->needs_free_netdev = true;
dev->priv_destructor = ip_tunnel_dev_free;
dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!dev->tstats)
return -ENOMEM;
err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
if (err) {
free_percpu(dev->tstats);
return err;
}
err = gro_cells_init(&tunnel->gro_cells, dev);
if (err) {
dst_cache_destroy(&tunnel->dst_cache);
free_percpu(dev->tstats);
return err;
}
tunnel->dev = dev;
tunnel->net = dev_net(dev);
strcpy(tunnel->parms.name, dev->name);
iph->version = 4;
iph->ihl = 5;
if (tunnel->collect_md)
netif_keep_dst(dev);
return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
void ip_tunnel_uninit(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct net *net = tunnel->net;
struct ip_tunnel_net *itn;
itn = net_generic(net, tunnel->ip_tnl_net_id);
ip_tunnel: fix use-after-free in ip_tunnel_lookup() [ Upstream commit ba61539c6ae57f4146284a5cb4f7b7ed8d42bf45 ] In the datapath, the ip_tunnel_lookup() is used and it internally uses fallback tunnel device pointer, which is fb_tunnel_dev. This pointer variable should be set to NULL when a fb interface is deleted. But there is no routine to set fb_tunnel_dev pointer to NULL. So, this pointer will be still used after interface is deleted and it eventually results in the use-after-free problem. Test commands: ip netns add A ip netns add B ip link add eth0 type veth peer name eth1 ip link set eth0 netns A ip link set eth1 netns B ip netns exec A ip link set lo up ip netns exec A ip link set eth0 up ip netns exec A ip link add gre1 type gre local 10.0.0.1 \ remote 10.0.0.2 ip netns exec A ip link set gre1 up ip netns exec A ip a a 10.0.100.1/24 dev gre1 ip netns exec A ip a a 10.0.0.1/24 dev eth0 ip netns exec B ip link set lo up ip netns exec B ip link set eth1 up ip netns exec B ip link add gre1 type gre local 10.0.0.2 \ remote 10.0.0.1 ip netns exec B ip link set gre1 up ip netns exec B ip a a 10.0.100.2/24 dev gre1 ip netns exec B ip a a 10.0.0.2/24 dev eth1 ip netns exec A hping3 10.0.100.2 -2 --flood -d 60000 & ip netns del B Splat looks like: [ 77.793450][ C3] ================================================================== [ 77.794702][ C3] BUG: KASAN: use-after-free in ip_tunnel_lookup+0xcc4/0xf30 [ 77.795573][ C3] Read of size 4 at addr ffff888060bd9c84 by task hping3/2905 [ 77.796398][ C3] [ 77.796664][ C3] CPU: 3 PID: 2905 Comm: hping3 Not tainted 5.8.0-rc1+ #616 [ 77.797474][ C3] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 77.798453][ C3] Call Trace: [ 77.798815][ C3] <IRQ> [ 77.799142][ C3] dump_stack+0x9d/0xdb [ 77.799605][ C3] print_address_description.constprop.7+0x2cc/0x450 [ 77.800365][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.800908][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.801517][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.802145][ C3] kasan_report+0x154/0x190 [ 77.802821][ C3] ? ip_tunnel_lookup+0xcc4/0xf30 [ 77.803503][ C3] ip_tunnel_lookup+0xcc4/0xf30 [ 77.804165][ C3] __ipgre_rcv+0x1ab/0xaa0 [ip_gre] [ 77.804862][ C3] ? rcu_read_lock_sched_held+0xc0/0xc0 [ 77.805621][ C3] gre_rcv+0x304/0x1910 [ip_gre] [ 77.806293][ C3] ? lock_acquire+0x1a9/0x870 [ 77.806925][ C3] ? gre_rcv+0xfe/0x354 [gre] [ 77.807559][ C3] ? erspan_xmit+0x2e60/0x2e60 [ip_gre] [ 77.808305][ C3] ? rcu_read_lock_sched_held+0xc0/0xc0 [ 77.809032][ C3] ? rcu_read_lock_held+0x90/0xa0 [ 77.809713][ C3] gre_rcv+0x1b8/0x354 [gre] [ ... ] Suggested-by: Eric Dumazet <eric.dumazet@gmail.com> Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-06-17 01:51:51 +09:00
ip_tunnel_del(itn, netdev_priv(dev));
if (itn->fb_tunnel_dev == dev)
WRITE_ONCE(itn->fb_tunnel_dev, NULL);
dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
/* Do least required initialization, rest of init is done in tunnel_init call */
netns: make struct pernet_operations::id unsigned int Make struct pernet_operations::id unsigned. There are 2 reasons to do so: 1) This field is really an index into an zero based array and thus is unsigned entity. Using negative value is out-of-bound access by definition. 2) On x86_64 unsigned 32-bit data which are mixed with pointers via array indexing or offsets added or subtracted to pointers are preffered to signed 32-bit data. "int" being used as an array index needs to be sign-extended to 64-bit before being used. void f(long *p, int i) { g(p[i]); } roughly translates to movsx rsi, esi mov rdi, [rsi+...] call g MOVSX is 3 byte instruction which isn't necessary if the variable is unsigned because x86_64 is zero extending by default. Now, there is net_generic() function which, you guessed it right, uses "int" as an array index: static inline void *net_generic(const struct net *net, int id) { ... ptr = ng->ptr[id - 1]; ... } And this function is used a lot, so those sign extensions add up. Patch snipes ~1730 bytes on allyesconfig kernel (without all junk messing with code generation): add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730) Unfortunately some functions actually grow bigger. This is a semmingly random artefact of code generation with register allocator being used differently. gcc decides that some variable needs to live in new r8+ registers and every access now requires REX prefix. Or it is shifted into r12, so [r12+0] addressing mode has to be used which is longer than [r8] However, overall balance is in negative direction: add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730) function old new delta nfsd4_lock 3886 3959 +73 tipc_link_build_proto_msg 1096 1140 +44 mac80211_hwsim_new_radio 2776 2808 +32 tipc_mon_rcv 1032 1058 +26 svcauth_gss_legacy_init 1413 1429 +16 tipc_bcbase_select_primary 379 392 +13 nfsd4_exchange_id 1247 1260 +13 nfsd4_setclientid_confirm 782 793 +11 ... put_client_renew_locked 494 480 -14 ip_set_sockfn_get 730 716 -14 geneve_sock_add 829 813 -16 nfsd4_sequence_done 721 703 -18 nlmclnt_lookup_host 708 686 -22 nfsd4_lockt 1085 1063 -22 nfs_get_client 1077 1050 -27 tcf_bpf_init 1106 1076 -30 nfsd4_encode_fattr 5997 5930 -67 Total: Before=154856051, After=154854321, chg -0.00% Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-17 10:58:21 +09:00
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
MODULE_LICENSE("GPL");