route.c 76.7 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

27 28
#define pr_fmt(fmt) "IPv6: " fmt

29
#include <linux/capability.h>
Linus Torvalds's avatar
Linus Torvalds committed
30
#include <linux/errno.h>
31
#include <linux/export.h>
Linus Torvalds's avatar
Linus Torvalds committed
32 33 34 35 36 37 38 39
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
40
#include <linux/mroute6.h>
Linus Torvalds's avatar
Linus Torvalds committed
41 42 43 44
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
45
#include <linux/nsproxy.h>
46
#include <linux/slab.h>
47
#include <net/net_namespace.h>
Linus Torvalds's avatar
Linus Torvalds committed
48 49 50 51 52 53 54 55 56 57
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
58
#include <net/netevent.h>
59
#include <net/netlink.h>
60
#include <net/nexthop.h>
Linus Torvalds's avatar
Linus Torvalds committed
61 62 63 64 65 66 67

#include <asm/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

68
/*
 * Result of checking the neighbour state of a route's gateway.
 * All failure codes are negative; rt6_score_route() hands a negative
 * result straight back to its caller instead of a score.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* find_match() skips the route */
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,	/* find_match() scores it 0 and asks for round-robin */
	RT6_NUD_SUCCEED = 1
};

75
static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
Eric Dumazet's avatar
Eric Dumazet committed
76
				    const struct in6_addr *dest);
Linus Torvalds's avatar
Linus Torvalds committed
77
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
78
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
79
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
Linus Torvalds's avatar
Linus Torvalds committed
80 81 82 83
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
84
static int		 ip6_dst_gc(struct dst_ops *ops);
Linus Torvalds's avatar
Linus Torvalds committed
85 86

static int		ip6_pkt_discard(struct sk_buff *skb);
87
static int		ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
88
static int		ip6_pkt_prohibit(struct sk_buff *skb);
89
static int		ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
Linus Torvalds's avatar
Linus Torvalds committed
90
static void		ip6_link_failure(struct sk_buff *skb);
91 92 93 94
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
95
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
Linus Torvalds's avatar
Linus Torvalds committed
96

97
#ifdef CONFIG_IPV6_ROUTE_INFO
98
static struct rt6_info *rt6_add_route_info(struct net *net,
99 100
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex,
101
					   unsigned int pref);
102
static struct rt6_info *rt6_get_route_info(struct net *net,
103 104
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex);
105 106
#endif

stephen hemminger's avatar
stephen hemminger committed
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
/*
 * Look up (and, when @create is non-zero, possibly create) the inet_peer
 * for the route's destination address and attach it to @rt.
 * Does nothing when the route carries no peer base.
 */
static void rt6_bind_peer(struct rt6_info *rt, int create)
{
	struct inet_peer_base *peer_base = inetpeer_base_ptr(rt->_rt6i_peer);
	struct inet_peer *found;

	if (!peer_base)
		return;

	found = inet_getpeer_v6(peer_base, &rt->rt6i_dst.addr, create);
	if (!found)
		return;

	/* rt6_set_peer() declined the peer: give it back */
	if (!rt6_set_peer(rt, found))
		inet_putpeer(found);
}

/*
 * Return the inet_peer attached to @rt, trying to bind one first if the
 * route has none yet.  Returns NULL when no peer is attached afterwards.
 */
static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create)
{
	if (!rt6_has_peer(rt)) {
		rt6_bind_peer(rt, create);
		if (!rt6_has_peer(rt))
			return NULL;
	}

	return rt6_peer_ptr(rt);
}

/* Convenience wrapper: __rt6_get_peer() with the create flag set. */
static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt)
{
	const int create = 1;

	return __rt6_get_peer(rt, create);
}

137 138 139 140 141 142
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

143 144 145
	if (!(rt->dst.flags & DST_HOST))
		return NULL;

146
	peer = rt6_get_peer_create(rt);
147 148 149 150 151
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
152 153
		if (inet_metrics_new(peer) ||
		    (old & DST_METRICS_FORCE_OVERWRITE))
154 155 156 157 158 159 160 161 162 163 164 165 166 167
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		}
	}
	return p;
}

168 169 170
static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
171 172 173
{
	struct in6_addr *p = &rt->rt6i_gateway;

David S. Miller's avatar
David S. Miller committed
174
	if (!ipv6_addr_any(p))
175
		return (const void *) p;
176 177
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
178 179 180
	return daddr;
}

181 182 183
static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
184
{
185 186 187
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

188
	daddr = choose_neigh_daddr(rt, skb, daddr);
189
	n = __ipv6_neigh_lookup(dst->dev, daddr);
190 191 192 193 194
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

195
static struct dst_ops ip6_dst_ops_template = {
Linus Torvalds's avatar
Linus Torvalds committed
196
	.family			=	AF_INET6,
197
	.protocol		=	cpu_to_be16(ETH_P_IPV6),
Linus Torvalds's avatar
Linus Torvalds committed
198 199 200
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
201
	.default_advmss		=	ip6_default_advmss,
202
	.mtu			=	ip6_mtu,
203
	.cow_metrics		=	ipv6_cow_metrics,
Linus Torvalds's avatar
Linus Torvalds committed
204 205 206 207 208
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
209
	.redirect		=	rt6_do_redirect,
210
	.local_out		=	__ip6_local_out,
211
	.neigh_lookup		=	ip6_neigh_lookup,
Linus Torvalds's avatar
Linus Torvalds committed
212 213
};

214
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
215
{
216 217 218
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
219 220
}

221 222
/* dst_ops->update_pmtu for blackhole dsts: intentionally does nothing. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

226 227
/* dst_ops->redirect for blackhole dsts: intentionally does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

231 232 233 234 235 236
/* dst_ops->cow_metrics for blackhole dsts: always returns NULL. */
static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
					 unsigned long old)
{
	return NULL;
}

237 238
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
239
	.protocol		=	cpu_to_be16(ETH_P_IPV6),
240 241
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
242
	.mtu			=	ip6_blackhole_mtu,
243
	.default_advmss		=	ip6_default_advmss,
244
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
245
	.redirect		=	ip6_rt_blackhole_redirect,
246
	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
247
	.neigh_lookup		=	ip6_neigh_lookup,
248 249
};

250
/* Read-only metrics array of RTAX_MAX entries; the hop limit slot is 0. */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

254
static const struct rt6_info ip6_null_entry_template = {
255 256 257
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
258
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
259 260 261
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
Linus Torvalds's avatar
Linus Torvalds committed
262 263
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
264
	.rt6i_protocol  = RTPROT_KERNEL,
Linus Torvalds's avatar
Linus Torvalds committed
265 266 267 268
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

Thomas Graf's avatar
Thomas Graf committed
269 270
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

271
static const struct rt6_info ip6_prohibit_entry_template = {
272 273 274
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
275
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
276 277 278
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
Thomas Graf's avatar
Thomas Graf committed
279 280
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
281
	.rt6i_protocol  = RTPROT_KERNEL,
Thomas Graf's avatar
Thomas Graf committed
282 283 284 285
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

286
static const struct rt6_info ip6_blk_hole_entry_template = {
287 288 289
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
290
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
291 292
		.error		= -EINVAL,
		.input		= dst_discard,
293
		.output		= dst_discard_sk,
Thomas Graf's avatar
Thomas Graf committed
294 295
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
296
	.rt6i_protocol  = RTPROT_KERNEL,
Thomas Graf's avatar
Thomas Graf committed
297 298 299 300 301 302
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

Linus Torvalds's avatar
Linus Torvalds committed
303
/* allocate dst with ip6_dst_ops */
304
static inline struct rt6_info *ip6_dst_alloc(struct net *net,
305
					     struct net_device *dev,
306 307
					     int flags,
					     struct fib6_table *table)
Linus Torvalds's avatar
Linus Torvalds committed
308
{
309
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
310
					0, DST_OBSOLETE_FORCE_CHK, flags);
311

312
	if (rt) {
313 314 315
		struct dst_entry *dst = &rt->dst;

		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
316
		rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
317
		rt->rt6i_genid = rt_genid_ipv6(net);
318
		INIT_LIST_HEAD(&rt->rt6i_siblings);
319
	}
320
	return rt;
Linus Torvalds's avatar
Linus Torvalds committed
321 322 323 324 325 326
}

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
327
	struct dst_entry *from = dst->from;
Linus Torvalds's avatar
Linus Torvalds committed
328

329 330 331
	if (!(rt->dst.flags & DST_HOST))
		dst_destroy_metrics_generic(dst);

332
	if (idev) {
Linus Torvalds's avatar
Linus Torvalds committed
333 334
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
335
	}
336

337 338
	dst->from = NULL;
	dst_release(from);
339

340 341
	if (rt6_has_peer(rt)) {
		struct inet_peer *peer = rt6_peer_ptr(rt);
342 343 344 345
		inet_putpeer(peer);
	}
}

Linus Torvalds's avatar
Linus Torvalds committed
346 347 348 349 350
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
351
	struct net_device *loopback_dev =
352
		dev_net(dev)->loopback_dev;
Linus Torvalds's avatar
Linus Torvalds committed
353

354 355 356 357 358 359 360 361 362
	if (dev != loopback_dev) {
		if (idev && idev->dev == dev) {
			struct inet6_dev *loopback_idev =
				in6_dev_get(loopback_dev);
			if (loopback_idev) {
				rt->rt6i_idev = loopback_idev;
				in6_dev_put(idev);
			}
		}
Linus Torvalds's avatar
Linus Torvalds committed
363 364 365
	}
}

366
static bool rt6_check_expired(const struct rt6_info *rt)
Linus Torvalds's avatar
Linus Torvalds committed
367
{
368 369
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
370
			return true;
371
	} else if (rt->dst.from) {
372
		return rt6_check_expired((struct rt6_info *) rt->dst.from);
373
	}
374
	return false;
Linus Torvalds's avatar
Linus Torvalds committed
375 376
}

377 378 379 380 381 382 383 384 385
/* Multipath route selection:
 *   Hash based function using packet header and flowlabel.
 * Adapted from fib_info_hashfn()
 */
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
			       const struct flowi6 *fl6)
{
	unsigned int val = fl6->flowi6_proto;

386 387
	val ^= ipv6_addr_hash(&fl6->daddr);
	val ^= ipv6_addr_hash(&fl6->saddr);
388 389 390 391 392 393

	/* Work only if this not encapsulated */
	switch (fl6->flowi6_proto) {
	case IPPROTO_UDP:
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
394 395
		val ^= (__force u16)fl6->fl6_sport;
		val ^= (__force u16)fl6->fl6_dport;
396 397 398
		break;

	case IPPROTO_ICMPV6:
399 400
		val ^= (__force u16)fl6->fl6_icmp_type;
		val ^= (__force u16)fl6->fl6_icmp_code;
401 402 403
		break;
	}
	/* RFC6438 recommands to use flowlabel */
404
	val ^= (__force u32)fl6->flowlabel;
405 406 407 408 409 410 411

	/* Perhaps, we need to tune, this function? */
	val = val ^ (val >> 7) ^ (val >> 12);
	return val % candidate_count;
}

static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
412 413
					     struct flowi6 *fl6, int oif,
					     int strict)
414 415 416 417 418 419 420 421 422 423 424 425 426
{
	struct rt6_info *sibling, *next_sibling;
	int route_choosen;

	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
	/* Don't change the route, if route_choosen == 0
	 * (siblings does not include ourself)
	 */
	if (route_choosen)
		list_for_each_entry_safe(sibling, next_sibling,
				&match->rt6i_siblings, rt6i_siblings) {
			route_choosen--;
			if (route_choosen == 0) {
427 428
				if (rt6_score_route(sibling, oif, strict) < 0)
					break;
429 430 431 432 433 434 435
				match = sibling;
				break;
			}
		}
	return match;
}

Linus Torvalds's avatar
Linus Torvalds committed
436
/*
437
 *	Route lookup. Any table->tb6_lock is implied.
Linus Torvalds's avatar
Linus Torvalds committed
438 439
 */

440 441
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
442
						    const struct in6_addr *saddr,
Linus Torvalds's avatar
Linus Torvalds committed
443
						    int oif,
444
						    int flags)
Linus Torvalds's avatar
Linus Torvalds committed
445 446 447 448
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

449 450 451
	if (!oif && ipv6_addr_any(saddr))
		goto out;

452
	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
453
		struct net_device *dev = sprt->dst.dev;
454 455

		if (oif) {
Linus Torvalds's avatar
Linus Torvalds committed
456 457 458
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
459
				if (!sprt->rt6i_idev ||
Linus Torvalds's avatar
Linus Torvalds committed
460
				    sprt->rt6i_idev->dev->ifindex != oif) {
461
					if (flags & RT6_LOOKUP_F_IFACE && oif)
Linus Torvalds's avatar
Linus Torvalds committed
462
						continue;
463
					if (local && (!oif ||
Linus Torvalds's avatar
Linus Torvalds committed
464 465 466 467 468
						      local->rt6i_idev->dev->ifindex == oif))
						continue;
				}
				local = sprt;
			}
469 470 471 472
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
Linus Torvalds's avatar
Linus Torvalds committed
473
		}
474
	}
Linus Torvalds's avatar
Linus Torvalds committed
475

476
	if (oif) {
Linus Torvalds's avatar
Linus Torvalds committed
477 478 479
		if (local)
			return local;

480
		if (flags & RT6_LOOKUP_F_IFACE)
481
			return net->ipv6.ip6_null_entry;
Linus Torvalds's avatar
Linus Torvalds committed
482
	}
483
out:
Linus Torvalds's avatar
Linus Torvalds committed
484 485 486
	return rt;
}

487
#ifdef CONFIG_IPV6_ROUTER_PREF
488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505
/*
 * Deferred router-probe request queued by rt6_probe() and consumed by
 * rt6_probe_deferred(), which recovers this struct from the embedded
 * work item via container_of().
 */
struct __rt6_probe_work {
	struct work_struct work;	/* work item handed to schedule_work() */
	struct in6_addr target;		/* gateway address to solicit */
	struct net_device *dev;		/* held with dev_hold() until the work runs */
};

/*
 * Work handler for a queued router probe: send a neighbour solicitation
 * for work->target to its solicited-node multicast address on work->dev,
 * then drop the device reference and free the request.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
	dev_put(work->dev);
	/* Free the allocation itself rather than the embedded member, so
	 * this stays correct even if 'work' is not the first field.
	 */
	kfree(work);
}

506 507
static void rt6_probe(struct rt6_info *rt)
{
508
	struct neighbour *neigh;
509 510 511 512 513 514 515 516
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
517
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
518
		return;
519 520 521 522 523 524
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		write_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			goto out;
525
	}
526 527

	if (!neigh ||
528
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
529
		struct __rt6_probe_work *work;
530

531 532 533
		work = kmalloc(sizeof(*work), GFP_ATOMIC);

		if (neigh && work)
534
			__neigh_set_probe_once(neigh);
535 536

		if (neigh)
537 538
			write_unlock(&neigh->lock);

539 540 541 542 543 544 545
		if (work) {
			INIT_WORK(&work->work, rt6_probe_deferred);
			work->target = rt->rt6i_gateway;
			dev_hold(rt->dst.dev);
			work->dev = rt->dst.dev;
			schedule_work(&work->work);
		}
546
	} else {
547 548
out:
		write_unlock(&neigh->lock);
549
	}
550
	rcu_read_unlock_bh();
551 552 553 554 555 556 557
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF, router probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

Linus Torvalds's avatar
Linus Torvalds committed
558
/*
559
 * Default Router Selection (RFC 2461 6.3.6)
Linus Torvalds's avatar
Linus Torvalds committed
560
 */
561
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
562
{
563
	struct net_device *dev = rt->dst.dev;
564
	if (!oif || dev->ifindex == oif)
565
		return 2;
566 567 568 569
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
570
}
Linus Torvalds's avatar
Linus Torvalds committed
571

572
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
Linus Torvalds's avatar
Linus Torvalds committed
573
{
574
	struct neighbour *neigh;
575
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
576

577 578
	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
579
		return RT6_NUD_SUCCEED;
580 581 582 583 584

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
585
		if (neigh->nud_state & NUD_VALID)
586
			ret = RT6_NUD_SUCCEED;
587
#ifdef CONFIG_IPV6_ROUTER_PREF
588
		else if (!(neigh->nud_state & NUD_FAILED))
589
			ret = RT6_NUD_SUCCEED;
590 591
		else
			ret = RT6_NUD_FAIL_PROBE;
592
#endif
593
		read_unlock(&neigh->lock);
594 595
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
596
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
597
	}
598 599
	rcu_read_unlock_bh();

600
	return ret;
Linus Torvalds's avatar
Linus Torvalds committed
601 602
}

603 604
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
Linus Torvalds's avatar
Linus Torvalds committed
605
{
606
	int m;
607

608
	m = rt6_check_dev(rt, oif);
609
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
610
		return RT6_NUD_FAIL_HARD;
611 612 613
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
614 615 616 617 618
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
619 620 621
	return m;
}

622
/*
 * Compare @rt against the best candidate so far.
 *
 * Expired routes and routes scoring RT6_NUD_FAIL_HARD are skipped.  A
 * score of RT6_NUD_FAIL_DO_RR is treated as 0 and marks the candidate as
 * requesting round-robin.  If the score beats *@mpri, @rt becomes the new
 * match and *@mpri / *@do_rr are updated.
 *
 * Returns the (possibly unchanged) best match.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
655 656
				     u32 metric, int oif, int strict,
				     bool *do_rr)
657 658
{
	struct rt6_info *rt, *match;
659
	int mpri = -1;
Linus Torvalds's avatar
Linus Torvalds committed
660

661 662
	match = NULL;
	for (rt = rr_head; rt && rt->rt6i_metric == metric;
663
	     rt = rt->dst.rt6_next)
664
		match = find_match(rt, oif, strict, &mpri, match, do_rr);
665
	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
666
	     rt = rt->dst.rt6_next)
667
		match = find_match(rt, oif, strict, &mpri, match, do_rr);
Linus Torvalds's avatar
Linus Torvalds committed
668

669 670
	return match;
}
Linus Torvalds's avatar
Linus Torvalds committed
671

672 673 674
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
675
	struct net *net;
676
	bool do_rr = false;
Linus Torvalds's avatar
Linus Torvalds committed
677

678 679 680
	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;
Linus Torvalds's avatar
Linus Torvalds committed
681

682 683
	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);
Linus Torvalds's avatar
Linus Torvalds committed
684

685
	if (do_rr) {
686
		struct rt6_info *next = rt0->dst.rt6_next;
687

688
		/* no entries matched; do round-robin */
689 690 691 692 693
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
Linus Torvalds's avatar
Linus Torvalds committed
694 695
	}

696
	net = dev_net(rt0->dst.dev);
Eric Dumazet's avatar
Eric Dumazet committed
697
	return match ? match : net->ipv6.ip6_null_entry;
Linus Torvalds's avatar
Linus Torvalds committed
698 699
}

700 701
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received route information option.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * A zero lifetime deletes an existing matching route; a non-zero
 * lifetime adds the route if missing, or refreshes the preference and
 * expiry of an existing one.  A zero-length prefix refers to the default
 * router entry for @gwaddr on @dev.
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	/* len is signed: reject negatives before the unsigned comparison */
	if (len < 0 || len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

776
/*
 * Backtrack up the FIB tree when the lookup produced ip6_null_entry.
 *
 * Relies on the caller's scope: reads the local 'rt', updates the local
 * 'fn', and jumps to the caller's 'out' label once the top-level root is
 * reached or to 'restart' once a node carrying route info is found.
 * When the parent has a source subtree other than the current node, the
 * walk continues from a fib6_lookup() of @saddr in that subtree.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) {	\
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
793

794 795
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
796
					     struct flowi6 *fl6, int flags)
Linus Torvalds's avatar
Linus Torvalds committed
797 798 799 800
{
	struct fib6_node *fn;
	struct rt6_info *rt;

801
	read_lock_bh(&table->tb6_lock);
802
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
803 804
restart:
	rt = fn->leaf;
805
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
806
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
807
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
808
	BACKTRACK(net, &fl6->saddr);
809
out:
810
	dst_use(&rt->dst, jiffies);
811 812 813 814 815
	read_unlock_bh(&table->tb6_lock);
	return rt;

}

816 817 818 819 820 821 822
/*
 * Exported lookup entry point: run the fib6 rule lookup for @fl6 using
 * ip6_pol_route_lookup() as the per-table lookup function.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	struct dst_entry *result;

	result = fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
	return result;
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

823 824
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
825
{
826 827 828
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
829 830
	};
	struct dst_entry *dst;
831
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
832

833
	if (saddr) {
834
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
835 836 837
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

838
	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
839 840 841 842 843
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

Linus Torvalds's avatar
Linus Torvalds committed
844 845 846
	return NULL;
}

847 848
EXPORT_SYMBOL(rt6_lookup);

849
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason, the
   route is freed. In any case, if the caller does not hold a reference
   to it, it may be destroyed.
 */

855 856
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct nlattr *mx, int mx_len)
Linus Torvalds's avatar
Linus Torvalds committed
857 858
{
	int err;
859
	struct fib6_table *table;
Linus Torvalds's avatar
Linus Torvalds committed
860

861 862
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
863
	err = fib6_add(&table->tb6_root, rt, info, mx, mx_len);
864
	write_unlock_bh(&table->tb6_lock);
Linus Torvalds's avatar
Linus Torvalds committed
865 866 867 868

	return err;
}

869 870
int ip6_ins_rt(struct rt6_info *rt)
{
871
	struct nl_info info = {
872
		.nl_net = dev_net(rt->dst.dev),
873
	};
874
	return __ip6_ins_rt(rt, &info, NULL, 0);
875 876
}

877
static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
Eric Dumazet's avatar
Eric Dumazet committed
878
				      const struct in6_addr *daddr,
879
				      const struct in6_addr *saddr)
Linus Torvalds's avatar
Linus Torvalds committed
880 881 882 883 884 885 886
{
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

Eric Dumazet's avatar
Eric Dumazet committed
887
	rt = ip6_rt_copy(ort, daddr);
Linus Torvalds's avatar
Linus Torvalds committed
888 889

	if (rt) {
890 891 892
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
Linus Torvalds's avatar
Linus Torvalds committed
893 894 895 896 897

		rt->rt6i_flags |= RTF_CACHE;

#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
898
			rt->rt6i_src.addr = *saddr;
Linus Torvalds's avatar
Linus Torvalds committed
899 900 901
			rt->rt6i_src.plen = 128;
		}
#endif
902
	}
Linus Torvalds's avatar
Linus Torvalds committed
903

904 905
	return rt;
}
Linus Torvalds's avatar
Linus Torvalds committed
906

Eric Dumazet's avatar
Eric Dumazet committed
907 908
static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
					const struct in6_addr *daddr)
909
{
Eric Dumazet's avatar
Eric Dumazet committed
910 911
	struct rt6_info *rt = ip6_rt_copy(ort, daddr);

912
	if (rt)
913 914 915 916
		rt->rt6i_flags |= RTF_CACHE;
	return rt;
}

917
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
918
				      struct flowi6 *fl6, int flags)
Linus Torvalds's avatar
Linus Torvalds committed
919 920
{
	struct fib6_node *fn;
921
	struct rt6_info *rt, *nrt;
922
	int strict = 0;
Linus Torvalds's avatar
Linus Torvalds committed
923
	int attempts = 3;
924
	int err;
925
	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
Linus Torvalds's avatar
Linus Torvalds committed
926

927
	strict |= flags & RT6_LOOKUP_F_IFACE;
Linus Torvalds's avatar
Linus Torvalds committed
928 929

relookup:
930
	read_lock_bh(&table->tb6_lock);
Linus Torvalds's avatar
Linus Torvalds committed
931

932
restart_2:
933
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
Linus Torvalds's avatar
Linus Torvalds committed
934 935

restart:
936
	rt = rt6_select(fn, oif, strict | reachable);
937 938
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict | reachable);
939
	BACKTRACK(net, &fl6->saddr);
940
	if (rt == net->ipv6.ip6_null_entry ||
941
	    rt->rt6i_flags & RTF_CACHE)
942
		goto out;
Linus Torvalds's avatar
Linus Torvalds committed
943

944
	dst_hold(&rt->dst);
945
	read_unlock_bh(&table->tb6_lock);
946

947
	if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
948
		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
949
	else if (!(rt->dst.flags & DST_HOST))
950
		nrt = rt6_alloc_clone(rt, &fl6->daddr);
951 952
	else
		goto out2;
953

Amerigo Wang's avatar
Amerigo Wang committed
954
	ip6_rt_put(rt);
955
	rt = nrt ? : net->ipv6.ip6_null_entry;
Linus Torvalds's avatar
Linus Torvalds committed
956

957
	dst_hold(&rt->dst);
958
	if (nrt) {
959
		err = ip6_ins_rt(nrt);
960
		if (!err)
Linus Torvalds's avatar
Linus Torvalds committed
961 962 963
			goto out2;
	}

964 965 966 967
	if (--attempts <= 0)
		goto out2;

	/*
968
	 * Race condition! In the gap, when table->tb6_lock was
969 970
	 * released someone could insert this route.  Relookup.
	 */
Amerigo Wang's avatar
Amerigo Wang committed
971
	ip6_rt_put(rt);
972 973 974
	goto relookup;

out:
975 976 977 978
	if (reachable) {
		reachable = 0;
		goto restart_2;
	}
979
	dst_hold(&rt->dst);
980
	read_unlock_bh(&table->tb6_lock);
Linus Torvalds's avatar
Linus Torvalds committed
981
out2:
982 983
	rt->dst.lastuse = jiffies;
	rt->dst.__use++;
984 985

	return rt;
Linus Torvalds's avatar
Linus Torvalds committed
986 987
}

988
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
989
					    struct flowi6 *fl6, int flags)
990
{
991
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
992 993
}

994 995 996 997 998 999 1000 1001 1002 1003
static struct dst_entry *ip6_route_input_lookup(struct net *net,
						struct net_device *dev,
						struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}

1004 1005
void ip6_route_input(struct sk_buff *skb)
{
1006
	const struct ipv6hdr *iph = ipv6_hdr(skb);
1007
	struct net *net = dev_net(skb->dev);
1008
	int flags = RT6_LOOKUP_F_HAS_SADDR;
1009 1010 1011 1012
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
1013
		.flowlabel = ip6_flowinfo(iph),
1014 1015
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
1016
	};
1017

1018
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1019 1020
}

1021
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1022
					     struct flowi6 *fl6, int flags)
Linus Torvalds's avatar
Linus Torvalds committed
1023
{
1024
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);