route.c 76.8 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
6
 *	Pedro Roque		<roque@di.fc.ul.pt>
Linus Torvalds's avatar
Linus Torvalds committed
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
23 24
 *	Ville Nuorvala
 *		Fixed routing subtrees.
Linus Torvalds's avatar
Linus Torvalds committed
25 26
 */

27 28
#define pr_fmt(fmt) "IPv6: " fmt

29
#include <linux/capability.h>
Linus Torvalds's avatar
Linus Torvalds committed
30
#include <linux/errno.h>
31
#include <linux/export.h>
Linus Torvalds's avatar
Linus Torvalds committed
32 33 34 35 36 37 38 39
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
40
#include <linux/mroute6.h>
Linus Torvalds's avatar
Linus Torvalds committed
41 42 43 44
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
45
#include <linux/nsproxy.h>
46
#include <linux/slab.h>
47
#include <net/net_namespace.h>
Linus Torvalds's avatar
Linus Torvalds committed
48 49 50 51 52 53 54 55 56 57
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
58
#include <net/netevent.h>
59
#include <net/netlink.h>
60
#include <net/nexthop.h>
Linus Torvalds's avatar
Linus Torvalds committed
61 62 63 64 65 66 67

#include <asm/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

68
static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
Eric Dumazet's avatar
Eric Dumazet committed
69
				    const struct in6_addr *dest);
Linus Torvalds's avatar
Linus Torvalds committed
70
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
71
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
72
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
Linus Torvalds's avatar
Linus Torvalds committed
73 74 75 76
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
77
static int		 ip6_dst_gc(struct dst_ops *ops);
Linus Torvalds's avatar
Linus Torvalds committed
78 79 80 81

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
82 83 84 85
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
Linus Torvalds's avatar
Linus Torvalds committed
86

87
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Helpers used by rt6_route_rcv() to look up and to create routes learnt
 * from Route Information options; defined later in the file.
 */
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   int ifindex,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   int ifindex);
#endif

97 98 99 100 101 102
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

103 104 105
	if (!(rt->dst.flags & DST_HOST))
		return NULL;

106
	peer = rt6_get_peer_create(rt);
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		}
	}
	return p;
}

127 128 129
static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
130 131 132
{
	struct in6_addr *p = &rt->rt6i_gateway;

David S. Miller's avatar
David S. Miller committed
133
	if (!ipv6_addr_any(p))
134
		return (const void *) p;
135 136
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
137 138 139
	return daddr;
}

140 141 142
static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
143
{
144 145 146
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

147
	daddr = choose_neigh_daddr(rt, skb, daddr);
148
	n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
149 150 151 152 153
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

154
static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
155
{
156 157 158 159 160 161
	struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
	if (!n) {
		n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
		if (IS_ERR(n))
			return PTR_ERR(n);
	}
162
	rt->n = n;
163 164

	return 0;
165 166
}

167
static struct dst_ops ip6_dst_ops_template = {
Linus Torvalds's avatar
Linus Torvalds committed
168
	.family			=	AF_INET6,
169
	.protocol		=	cpu_to_be16(ETH_P_IPV6),
Linus Torvalds's avatar
Linus Torvalds committed
170 171 172
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
173
	.default_advmss		=	ip6_default_advmss,
174
	.mtu			=	ip6_mtu,
175
	.cow_metrics		=	ipv6_cow_metrics,
Linus Torvalds's avatar
Linus Torvalds committed
176 177 178 179 180
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
181
	.redirect		=	rt6_do_redirect,
182
	.local_out		=	__ip6_local_out,
183
	.neigh_lookup		=	ip6_neigh_lookup,
Linus Torvalds's avatar
Linus Torvalds committed
184 185
};

186
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
187
{
188 189 190
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
191 192
}

193 194
/* .update_pmtu hook of ip6_dst_blackhole_ops: deliberately does nothing. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
	/* PMTU updates are ignored for blackhole entries. */
}

198 199
/* .redirect hook of ip6_dst_blackhole_ops: deliberately does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
	/* Redirects are ignored for blackhole entries. */
}

203 204 205 206 207 208
/*
 * .cow_metrics hook of ip6_dst_blackhole_ops: never hands out a writable
 * metrics array, so it always returns NULL.
 */
static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
					 unsigned long old)
{
	return NULL;
}

209 210
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
211
	.protocol		=	cpu_to_be16(ETH_P_IPV6),
212 213
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
214
	.mtu			=	ip6_blackhole_mtu,
215
	.default_advmss		=	ip6_default_advmss,
216
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
217
	.redirect		=	ip6_rt_blackhole_redirect,
218
	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
219
	.neigh_lookup		=	ip6_neigh_lookup,
220 221
};

222
/*
 * Read-only metrics array with every slot zero; the hop limit slot is
 * spelled out explicitly.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

226
static const struct rt6_info ip6_null_entry_template = {
227 228 229
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
230
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
231 232 233
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
Linus Torvalds's avatar
Linus Torvalds committed
234 235
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
236
	.rt6i_protocol  = RTPROT_KERNEL,
Linus Torvalds's avatar
Linus Torvalds committed
237 238 239 240
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

Thomas Graf's avatar
Thomas Graf committed
241 242
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Handlers used by the prohibit template; defined later in the file. */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route: a reject route whose dst carries
 * -EACCES and is handled by the ip6_pkt_prohibit*() functions.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
};

/*
 * Template for the "blackhole" route: a reject route whose dst carries
 * -EINVAL and hands packets in both directions to dst_discard().
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
	},
};

#endif

Linus Torvalds's avatar
Linus Torvalds committed
278
/* allocate dst with ip6_dst_ops */
279
static inline struct rt6_info *ip6_dst_alloc(struct net *net,
280
					     struct net_device *dev,
281 282
					     int flags,
					     struct fib6_table *table)
Linus Torvalds's avatar
Linus Torvalds committed
283
{
284
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
285
					0, DST_OBSOLETE_FORCE_CHK, flags);
286

287
	if (rt) {
288 289 290
		struct dst_entry *dst = &rt->dst;

		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
291
		rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
292
		rt->rt6i_genid = rt_genid(net);
293 294
		INIT_LIST_HEAD(&rt->rt6i_siblings);
		rt->rt6i_nsiblings = 0;
295
	}
296
	return rt;
Linus Torvalds's avatar
Linus Torvalds committed
297 298 299 300 301 302 303
}

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;

304 305 306
	if (rt->n)
		neigh_release(rt->n);

307 308 309
	if (!(rt->dst.flags & DST_HOST))
		dst_destroy_metrics_generic(dst);

310
	if (idev) {
Linus Torvalds's avatar
Linus Torvalds committed
311 312
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
313
	}
314 315 316 317

	if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
		dst_release(dst->from);

318 319
	if (rt6_has_peer(rt)) {
		struct inet_peer *peer = rt6_peer_ptr(rt);
320 321 322 323 324 325
		inet_putpeer(peer);
	}
}

/*
 * Look up (and, if @create is set, possibly create) the inet_peer for the
 * route's destination address and attach it to @rt.
 *
 * Nothing happens when the route has no peer base.  If rt6_set_peer()
 * reports that the peer was not installed, the reference just obtained is
 * dropped again.
 */
void rt6_bind_peer(struct rt6_info *rt, int create)
{
	struct inet_peer_base *base = inetpeer_base_ptr(rt->_rt6i_peer);
	struct inet_peer *peer;

	if (!base)
		return;

	peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
	if (!peer)
		return;

	if (!rt6_set_peer(rt, peer))
		inet_putpeer(peer);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
345
	struct net_device *loopback_dev =
346
		dev_net(dev)->loopback_dev;
Linus Torvalds's avatar
Linus Torvalds committed
347

348 349 350 351 352 353 354 355 356 357 358 359 360
	if (dev != loopback_dev) {
		if (idev && idev->dev == dev) {
			struct inet6_dev *loopback_idev =
				in6_dev_get(loopback_dev);
			if (loopback_idev) {
				rt->rt6i_idev = loopback_idev;
				in6_dev_put(idev);
			}
		}
		if (rt->n && rt->n->dev == dev) {
			rt->n->dev = loopback_dev;
			dev_hold(loopback_dev);
			dev_put(dev);
Linus Torvalds's avatar
Linus Torvalds committed
361 362 363 364
		}
	}
}

365
static bool rt6_check_expired(const struct rt6_info *rt)
Linus Torvalds's avatar
Linus Torvalds committed
366
{
367 368
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
369
			return true;
370
	} else if (rt->dst.from) {
371
		return rt6_check_expired((struct rt6_info *) rt->dst.from);
372
	}
373
	return false;
Linus Torvalds's avatar
Linus Torvalds committed
374 375
}

376
static bool rt6_need_strict(const struct in6_addr *daddr)
377
{
Eric Dumazet's avatar
Eric Dumazet committed
378 379
	return ipv6_addr_type(daddr) &
		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
380 381
}

382 383 384 385 386 387 388 389 390
/* Multipath route selection:
 *   Hash based function using packet header and flowlabel.
 * Adapted from fib_info_hashfn()
 */
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
			       const struct flowi6 *fl6)
{
	unsigned int val = fl6->flowi6_proto;

391 392 393 394
	val ^= (__force u32)fl6->daddr.s6_addr32[0];
	val ^= (__force u32)fl6->daddr.s6_addr32[1];
	val ^= (__force u32)fl6->daddr.s6_addr32[2];
	val ^= (__force u32)fl6->daddr.s6_addr32[3];
395

396 397 398 399
	val ^= (__force u32)fl6->saddr.s6_addr32[0];
	val ^= (__force u32)fl6->saddr.s6_addr32[1];
	val ^= (__force u32)fl6->saddr.s6_addr32[2];
	val ^= (__force u32)fl6->saddr.s6_addr32[3];
400 401 402 403 404 405

	/* Work only if this not encapsulated */
	switch (fl6->flowi6_proto) {
	case IPPROTO_UDP:
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
406 407
		val ^= (__force u16)fl6->fl6_sport;
		val ^= (__force u16)fl6->fl6_dport;
408 409 410
		break;

	case IPPROTO_ICMPV6:
411 412
		val ^= (__force u16)fl6->fl6_icmp_type;
		val ^= (__force u16)fl6->fl6_icmp_code;
413 414 415
		break;
	}
	/* RFC6438 recommands to use flowlabel */
416
	val ^= (__force u32)fl6->flowlabel;
417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444

	/* Perhaps, we need to tune, this function? */
	val = val ^ (val >> 7) ^ (val >> 12);
	return val % candidate_count;
}

/*
 * Choose one route out of @match and its siblings for the flow @fl6.
 *
 * The flow hash yields an index in [0, nsiblings].  Index 0 stands for
 * @match itself (the sibling list does not contain it); index k > 0 picks
 * the k-th entry of the sibling list.  @match is also returned should the
 * list turn out to be shorter than the index.
 */
static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6)
{
	struct rt6_info *sibling;
	int remaining;

	remaining = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
	if (remaining == 0)
		return match;

	list_for_each_entry(sibling, &match->rt6i_siblings, rt6i_siblings) {
		if (--remaining == 0)
			return sibling;
	}

	return match;
}

Linus Torvalds's avatar
Linus Torvalds committed
445
/*
446
 *	Route lookup. Any table->tb6_lock is implied.
Linus Torvalds's avatar
Linus Torvalds committed
447 448
 */

449 450
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
451
						    const struct in6_addr *saddr,
Linus Torvalds's avatar
Linus Torvalds committed
452
						    int oif,
453
						    int flags)
Linus Torvalds's avatar
Linus Torvalds committed
454 455 456 457
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

458 459 460
	if (!oif && ipv6_addr_any(saddr))
		goto out;

461
	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
462
		struct net_device *dev = sprt->dst.dev;
463 464

		if (oif) {
Linus Torvalds's avatar
Linus Torvalds committed
465 466 467
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
468
				if (!sprt->rt6i_idev ||
Linus Torvalds's avatar
Linus Torvalds committed
469
				    sprt->rt6i_idev->dev->ifindex != oif) {
470
					if (flags & RT6_LOOKUP_F_IFACE && oif)
Linus Torvalds's avatar
Linus Torvalds committed
471
						continue;
472
					if (local && (!oif ||
Linus Torvalds's avatar
Linus Torvalds committed
473 474 475 476 477
						      local->rt6i_idev->dev->ifindex == oif))
						continue;
				}
				local = sprt;
			}
478 479 480 481
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
Linus Torvalds's avatar
Linus Torvalds committed
482
		}
483
	}
Linus Torvalds's avatar
Linus Torvalds committed
484

485
	if (oif) {
Linus Torvalds's avatar
Linus Torvalds committed
486 487 488
		if (local)
			return local;

489
		if (flags & RT6_LOOKUP_F_IFACE)
490
			return net->ipv6.ip6_null_entry;
Linus Torvalds's avatar
Linus Torvalds committed
491
	}
492
out:
Linus Torvalds's avatar
Linus Torvalds committed
493 494 495
	return rt;
}

496 497 498
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * Okay, this does not seem to be appropriate for now, however, we need to
 * check if it is really so.  A Router Reachability Probe MUST be
 * rate-limited; here at most one probe is sent per
 * rt->rt6i_idev->cnf.rtr_probe_interval for a given neighbour.
 *
 * @rt: route whose bound neighbour should be probed; may be NULL.
 *
 * Nothing is sent when the route has no neighbour or the neighbour is in a
 * NUD_VALID state.  Otherwise, once the interval since neigh->updated has
 * elapsed, a Neighbour Solicitation for the neighbour's address is sent to
 * its solicited-node multicast address.
 *
 * neigh->updated is modified here, so the neighbour lock is taken for
 * writing: with only the read side held, concurrent callers could all pass
 * the interval test and each emit a probe, defeating the rate limit.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	neigh = rt ? rt->n : NULL;
	/* Cheap unlocked test first; it is repeated under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	write_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		write_unlock_bh(&neigh->lock);
		return;
	}

	/* Claim this probe slot before dropping the lock. */
	neigh->updated = jiffies;
	write_unlock_bh(&neigh->lock);

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

Linus Torvalds's avatar
Linus Torvalds committed
533
/*
534
 * Default Router Selection (RFC 2461 6.3.6)
Linus Torvalds's avatar
Linus Torvalds committed
535
 */
536
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
537
{
538
	struct net_device *dev = rt->dst.dev;
539
	if (!oif || dev->ifindex == oif)
540
		return 2;
541 542 543 544
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
545
}
Linus Torvalds's avatar
Linus Torvalds committed
546

547
static inline bool rt6_check_neigh(struct rt6_info *rt)
Linus Torvalds's avatar
Linus Torvalds committed
548
{
549
	struct neighbour *neigh;
550
	bool ret = false;
551

552
	neigh = rt->n;
553 554
	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
555
		ret = true;
556
	else if (neigh) {
557 558
		read_lock_bh(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
559
			ret = true;
560
#ifdef CONFIG_IPV6_ROUTER_PREF
561 562
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = true;
563
#endif
564
		read_unlock_bh(&neigh->lock);
565 566
	}
	return ret;
Linus Torvalds's avatar
Linus Torvalds committed
567 568
}

569 570
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
Linus Torvalds's avatar
Linus Torvalds committed
571
{
572
	int m;
573

574
	m = rt6_check_dev(rt, oif);
575
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
576
		return -1;
577 578 579
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
580
	if (!rt6_check_neigh(rt) && (strict & RT6_LOOKUP_F_REACHABLE))
581 582 583 584
		return -1;
	return m;
}

585 586
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match)
587
{
588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614
	int m;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m < 0)
		goto out;

	if (m > *mpri) {
		if (strict & RT6_LOOKUP_F_REACHABLE)
			rt6_probe(match);
		*mpri = m;
		match = rt;
	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
		rt6_probe(rt);
	}

out:
	return match;
}

/*
 * Find the best route among the entries of @fn that carry @metric.
 *
 * The scan starts at the round-robin head @rr_head and runs to the end of
 * the equal-metric run, then wraps around to fn->leaf and continues up to
 * (but not including) @rr_head.  Returns the best match or NULL.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict)
{
	struct rt6_info *best = NULL;
	struct rt6_info *cur;
	int best_score = -1;

	/* First leg: from the round-robin head onwards. */
	cur = rr_head;
	while (cur && cur->rt6i_metric == metric) {
		best = find_match(cur, oif, strict, &best_score, best);
		cur = cur->dst.rt6_next;
	}

	/* Second leg: from the start of the list up to the head. */
	cur = fn->leaf;
	while (cur && cur != rr_head && cur->rt6i_metric == metric) {
		best = find_match(cur, oif, strict, &best_score, best);
		cur = cur->dst.rt6_next;
	}

	return best;
}
Linus Torvalds's avatar
Linus Torvalds committed
627

628 629 630
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
631
	struct net *net;
Linus Torvalds's avatar
Linus Torvalds committed
632

633 634 635
	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;
Linus Torvalds's avatar
Linus Torvalds committed
636

637
	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
Linus Torvalds's avatar
Linus Torvalds committed
638

639
	if (!match &&
640
	    (strict & RT6_LOOKUP_F_REACHABLE)) {
641
		struct rt6_info *next = rt0->dst.rt6_next;
642

643
		/* no entries matched; do round-robin */
644 645 646 647 648
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
Linus Torvalds's avatar
Linus Torvalds committed
649 650
	}

651
	net = dev_net(rt0->dst.dev);
Eric Dumazet's avatar
Eric Dumazet committed
652
	return match ? match : net->ipv6.ip6_null_entry;
Linus Torvalds's avatar
Linus Torvalds committed
653 654
}

655 656
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - handle a received Route Information option
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the router that sent the option
 *
 * Adds, refreshes or (for a zero lifetime) deletes the route described by
 * the option.  Returns 0 on success and -EINVAL for a malformed option.
 *
 * The Length field counts units of 8 octets, the first of which is the
 * fixed header (assuming the usual struct route_info layout with prefix[]
 * following an 8-byte header).  Per RFC 4191 section 2.3 a Prefix Length
 * above 64 therefore needs Length 3 and any non-zero Prefix Length needs
 * Length 2 or 3.  The checks below enforce exactly that; accepting shorter
 * options would make the prefix copy read beyond the end of the option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3)
		return -EINVAL;
	if (rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 3)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 2)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/*
		 * Only prefix_len bits are copied, which the checks above
		 * guarantee to be present in the option.
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

728
/*
 * BACKTRACK(__net, saddr) - retry a failed lookup further up the FIB tree.
 *
 * Does nothing unless the enclosing function's `rt` is the null entry of
 * @__net.  Otherwise it climbs from `fn` towards the root; when the parent
 * has a source subtree other than `fn`, that subtree is searched for
 * @saddr.  The first node flagged RTN_RTINFO ends the climb with
 * `goto restart`; reaching a node flagged RTN_TL_ROOT ends it with
 * `goto out`.
 *
 * The expanding function must therefore provide:
 *   - struct rt6_info *rt   (result of the lookup attempt)
 *   - struct fib6_node *fn  (node that was searched; updated here)
 *   - labels `restart` and `out`
 *
 * Both arguments are parenthesized so that arbitrary expressions can be
 * passed safely.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) {	\
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
745

746 747
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
748
					     struct flowi6 *fl6, int flags)
Linus Torvalds's avatar
Linus Torvalds committed
749 750 751 752
{
	struct fib6_node *fn;
	struct rt6_info *rt;

753
	read_lock_bh(&table->tb6_lock);
754
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
755 756
restart:
	rt = fn->leaf;
757
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
758 759
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6);
760
	BACKTRACK(net, &fl6->saddr);
761
out:
762
	dst_use(&rt->dst, jiffies);
763 764 765 766 767
	read_unlock_bh(&table->tb6_lock);
	return rt;

}

768 769 770 771 772 773 774
/*
 * Exported entry point: run the policy-rule lookup for @fl6 with
 * ip6_pol_route_lookup() as the per-table lookup function.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	struct dst_entry *dst;

	dst = fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

775 776
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
777
{
778 779 780
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
781 782
	};
	struct dst_entry *dst;
783
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
784

785
	if (saddr) {
786
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
787 788 789
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

790
	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
791 792 793 794 795
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

Linus Torvalds's avatar
Linus Torvalds committed
796 797 798
	return NULL;
}

799 800
EXPORT_SYMBOL(rt6_lookup);

801
/* ip6_ins_rt is called with FREE table->tb6_lock.
Linus Torvalds's avatar
Linus Torvalds committed
802 803 804 805 806
   It takes new route entry, the addition fails by any reason the
   route is freed. In any case, if caller does not hold it, it may
   be destroyed.
 */

807
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
Linus Torvalds's avatar
Linus Torvalds committed
808 809
{
	int err;
810
	struct fib6_table *table;
Linus Torvalds's avatar
Linus Torvalds committed
811

812 813
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
814
	err = fib6_add(&table->tb6_root, rt, info);
815
	write_unlock_bh(&table->tb6_lock);
Linus Torvalds's avatar
Linus Torvalds committed
816 817 818 819

	return err;
}

820 821
int ip6_ins_rt(struct rt6_info *rt)
{
822
	struct nl_info info = {
823
		.nl_net = dev_net(rt->dst.dev),
824
	};
825
	return __ip6_ins_rt(rt, &info);
826 827
}

828
static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
Eric Dumazet's avatar
Eric Dumazet committed
829
				      const struct in6_addr *daddr,
830
				      const struct in6_addr *saddr)
Linus Torvalds's avatar
Linus Torvalds committed
831 832 833 834 835 836 837
{
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

Eric Dumazet's avatar
Eric Dumazet committed
838
	rt = ip6_rt_copy(ort, daddr);
Linus Torvalds's avatar
Linus Torvalds committed
839 840

	if (rt) {
841 842
		int attempts = !in_softirq();

843
		if (!(rt->rt6i_flags & RTF_GATEWAY)) {
844
			if (ort->rt6i_dst.plen != 128 &&
Eric Dumazet's avatar
Eric Dumazet committed
845
			    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
846
				rt->rt6i_flags |= RTF_ANYCAST;
847
			rt->rt6i_gateway = *daddr;
848
		}
Linus Torvalds's avatar
Linus Torvalds committed
849 850 851 852 853

		rt->rt6i_flags |= RTF_CACHE;

#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
854
			rt->rt6i_src.addr = *saddr;
Linus Torvalds's avatar
Linus Torvalds committed
855 856 857 858
			rt->rt6i_src.plen = 128;
		}
#endif

859
	retry:
860
		if (rt6_bind_neighbour(rt, rt->dst.dev)) {
861
			struct net *net = dev_net(rt->dst.dev);
862 863 864 865 866 867 868 869 870
			int saved_rt_min_interval =
				net->ipv6.sysctl.ip6_rt_gc_min_interval;
			int saved_rt_elasticity =
				net->ipv6.sysctl.ip6_rt_gc_elasticity;

			if (attempts-- > 0) {
				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;

871
				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
872 873 874 875 876 877 878 879

				net->ipv6.sysctl.ip6_rt_gc_elasticity =
					saved_rt_elasticity;
				net->ipv6.sysctl.ip6_rt_gc_min_interval =
					saved_rt_min_interval;
				goto retry;
			}

880
			net_warn_ratelimited("Neighbour table overflow\n");
881
			dst_free(&rt->dst);
882 883
			return NULL;
		}
884
	}
Linus Torvalds's avatar
Linus Torvalds committed
885

886 887
	return rt;
}
Linus Torvalds's avatar
Linus Torvalds committed
888

Eric Dumazet's avatar
Eric Dumazet committed
889 890
static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
					const struct in6_addr *daddr)
891
{
Eric Dumazet's avatar
Eric Dumazet committed
892 893
	struct rt6_info *rt = ip6_rt_copy(ort, daddr);

894 895
	if (rt) {
		rt->rt6i_flags |= RTF_CACHE;
896
		rt->n = neigh_clone(ort->n);
897 898 899 900
	}
	return rt;
}

901
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
902
				      struct flowi6 *fl6, int flags)
Linus Torvalds's avatar
Linus Torvalds committed
903 904
{
	struct fib6_node *fn;
905
	struct rt6_info *rt, *nrt;
906
	int strict = 0;
Linus Torvalds's avatar
Linus Torvalds committed
907
	int attempts = 3;
908
	int err;
909
	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
Linus Torvalds's avatar
Linus Torvalds committed
910

911
	strict |= flags & RT6_LOOKUP_F_IFACE;
Linus Torvalds's avatar
Linus Torvalds committed
912 913

relookup:
914
	read_lock_bh(&table->tb6_lock);
Linus Torvalds's avatar
Linus Torvalds committed
915

916
restart_2:
917
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
Linus Torvalds's avatar
Linus Torvalds committed
918 919

restart:
920
	rt = rt6_select(fn, oif, strict | reachable);
921 922
	if (rt->rt6i_nsiblings && oif == 0)
		rt = rt6_multipath_select(rt, fl6);
923
	BACKTRACK(net, &fl6->saddr);
924
	if (rt == net->ipv6.ip6_null_entry ||
925
	    rt->rt6i_flags & RTF_CACHE)
926
		goto out;
Linus Torvalds's avatar
Linus Torvalds committed
927

928
	dst_hold(&rt->dst);
929
	read_unlock_bh(&table->tb6_lock);
930

931
	if (!rt->n && !(rt->rt6i_flags & RTF_NONEXTHOP))
932
		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
933
	else if (!(rt->dst.flags & DST_HOST))
934
		nrt = rt6_alloc_clone(rt, &fl6->daddr);
935 936
	else
		goto out2;
937

Amerigo Wang's avatar
Amerigo Wang committed
938
	ip6_rt_put(rt);
939
	rt = nrt ? : net->ipv6.ip6_null_entry;
Linus Torvalds's avatar
Linus Torvalds committed
940

941
	dst_hold(&rt->dst);
942
	if (nrt) {
943
		err = ip6_ins_rt(nrt);
944
		if (!err)
Linus Torvalds's avatar
Linus Torvalds committed
945 946 947
			goto out2;
	}

948 949 950 951
	if (--attempts <= 0)
		goto out2;

	/*
952
	 * Race condition! In the gap, when table->tb6_lock was
953 954
	 * released someone could insert this route.  Relookup.
	 */
Amerigo Wang's avatar
Amerigo Wang committed
955
	ip6_rt_put(rt);
956 957 958
	goto relookup;

out:
959 960 961 962
	if (reachable) {
		reachable = 0;
		goto restart_2;
	}
963
	dst_hold(&rt->dst);
964
	read_unlock_bh(&table->tb6_lock);
Linus Torvalds's avatar
Linus Torvalds committed
965
out2:
966 967
	rt->dst.lastuse = jiffies;
	rt->dst.__use++;
968 969

	return rt;
Linus Torvalds's avatar
Linus Torvalds committed
970 971
}

972
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
973
					    struct flowi6 *fl6, int flags)
974
{
975
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
976 977
}

978 979 980 981 982 983 984 985 986 987
static struct dst_entry *ip6_route_input_lookup(struct net *net,
						struct net_device *dev,
						struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}

988 989
void ip6_route_input(struct sk_buff *skb)
{
990
	const struct ipv6hdr *iph = ipv6_hdr(skb);
991
	struct net *net = dev_net(skb->dev);
992
	int flags = RT6_LOOKUP_F_HAS_SADDR;
993 994 995 996
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
997
		.flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
998 999
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
1000
	};
1001

1002
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1003 1004
}

1005
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1006
					     struct flowi6 *fl6, int flags)
Linus Torvalds's avatar
Linus Torvalds committed
1007
{
1008
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1009 1010
}

1011
struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
1012
				    struct flowi6 *fl6)
1013 1014 1015
{
	int flags = 0;

1016
	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1017

1018
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1019
		flags |= RT6_LOOKUP_F_IFACE;
1020

1021
	if (!ipv6_addr_any(&fl6->saddr))
1022
		flags |= RT6_LOOKUP_F_HAS_SADDR;
1023 1024
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1025

1026
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
Linus Torvalds's avatar
Linus Torvalds committed
1027 1028
}