/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	: 	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD,
 *					though our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 * 		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
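
/* RT_FL_TOS() keeps only the TOS bits that matter for routing: the
 * IPTOS_RT_MASK bits plus RTO_ONLINK, an internal flag carried in the
 * TOS field that requests a link-scope lookup.
 */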

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;

#define RTprint(a...)	printk(KERN_DEBUG a)

static void rt_worker_func(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
static struct timer_list rt_secret_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		ip_local_out,
	.entry_size =		sizeof(struct rtable),
	.entries =		ATOMIC_INIT(0),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
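
/* The table is indexed by the TOS bits shifted right by one (see
 * rt_tos2priority() in <net/route.h>), mapping each of the 16 TOS
 * values to a TC_PRIO_* scheduling band.
 */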


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
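
/* A typical lockless read side therefore looks roughly like this,
 * where match() stands in for the key comparison:
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next))
 *		if (match(rth)) {
 *			dst_hold(&rth->u.dst);
 *			break;
 *		}
 *	rcu_read_unlock_bh();
 *
 * while writers take rt_hash_lock_addr(hash) before unlinking entries.
 */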

struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
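
/* Example: with RT_HASH_LOCK_SZ == 256, buckets 5, 261, 517, ... all map
 * to rt_hash_locks[5]; unrelated chains may share a lock, but the lock
 * table stays small and cache-friendly.
 */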

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table;
static unsigned			rt_hash_mask;
static unsigned int		rt_hash_log;
static atomic_t			rt_genid;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
	return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
		& rt_hash_mask;
}

#define rt_hash(daddr, saddr, idx) \
	rt_hash_code((__force u32)(__be32)(daddr),\
		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))

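/* Folding the interface index into the saddr word keeps otherwise
 * identical flows that differ only by device in separate buckets, while
 * the rt_genid seed in rt_hash_code() moves every flow to a new bucket
 * whenever the cache is invalidated.
 */
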
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
{
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rcu_dereference(rt_hash_table[st->bucket].chain);
		while (r) {
			if (r->rt_genid == st->genid)
				return r;
			r = rcu_dereference(r->u.dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st, struct rtable *r)
{
	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(st);

	if (r)
		while (pos && (r = rt_cache_get_next(st, r))) {
			if (r->rt_genid != st->genid)
				continue;
			--pos;
		}
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;

	if (*pos)
		return rt_cache_get_idx(st, *pos - 1);
	st->genid = atomic_read(&rt_genid);
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;
	struct rt_cache_iter_state *st = seq->private;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(st);
	else
		r = rt_cache_get_next(st, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_NET_CLS_ROUTE
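/* /proc/net/rt_acct read handler: sums the per-CPU ip_rt_acct counters
 * (256 struct ip_rt_acct slots, one per tclassid) into the caller's
 * buffer.  Offset and length must be 4-byte aligned, since whole u32
 * words are copied.
 */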
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *dst = (u32 *) buffer;

		*start = buffer;
		memset(dst, 0, length);

		for_each_possible_cpu(i) {
			unsigned int j;
			u32 *src;

			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif

static __init int ip_rt_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
			ip_rt_acct_read, NULL);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
#else
static inline int ip_rt_proc_init(struct net *net)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

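/* Decide whether an unreferenced cache entry may be evicted.  Entries past
 * their hard expiry always may; otherwise ordinary entries get a grace
 * period of tmo1, "valuable" ones (see rt_valuable()) get tmo2, and
 * fast-clean broadcast/multicast entries get none unless also valuable.
 */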
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

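/* Two flow keys match only when every routing-relevant field agrees:
 * addresses, mark, TOS together with the adjacent scope byte (compared
 * as a single u16), and both interface indices.  The XOR-and-OR form
 * keeps the comparison branch-free.
 */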
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
}

/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		if (*rthp == NULL)
			continue;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->rt_genid != atomic_read(&rt_genid)) {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.dst.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without yielding a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(void)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &rt_genid);
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(int delay)
{
	rt_cache_invalidate();
	if (delay >= 0)
		rt_do_flush(!in_softirq());
}

/*
 * We change rt_genid and let gc do the cleanup
 */
static void rt_secret_rebuild(unsigned long dummy)
{
	rt_cache_invalidate();
	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking
   is idle, expire is large enough to keep enough warm entries,
   and when load increases, it is reduced to limit the cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (rth->rt_genid == atomic_read(&rt_genid) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire was reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

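/* Insert rt into its hash chain, unless an equivalent entry already
 * exists; in that case the existing entry is promoted to the head of the
 * chain and reused.  While walking the chain we also remember the
 * lowest-scored unreferenced entry so an over-long chain can be pruned
 * on the spot.
 */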
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (rth->rt_genid != atomic_read(&rt_genid)) {
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp, *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = *rthp) != NULL) {
		if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
			*rthp = aux->u.dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->u.dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
	    || ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rth->rt_genid != atomic_read(&rt_genid)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;
				rt->rt_genid		= atomic_read(&rt_genid);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
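
/* With the default sysctls this works out to: at most 9 redirects are
 * sent, each no sooner than (HZ/50) << rate_tokens jiffies after the
 * previous one, and a quiet spell of (HZ/50) << 10 (roughly 20 seconds
 * at HZ=1000) resets the backoff.
 */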

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}

/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
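
/* Example: an ICMP FRAG_NEEDED that carries no next-hop MTU for a
 * 1500-byte datagram yields guess_mtu(1500) == 1492, the next lower
 * plateau, in the spirit of RFC 1191's plateau search.
 */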

unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32  daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash(daddr, skeys[i], 0);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.dst.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst  == daddr &&
			    rth->rt_src  == iph->saddr &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
			    rth->u.dst.dev->nd_net == net &&
			    rth->rt_genid == atomic_read(&rt_genid)) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev =
			in_dev_get(dev->nd_net->loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}

static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;