Commit 2e002662 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'fs-file-descriptor-optimization'

Merge file descriptor allocation speedup.

Eric Dumazet has a test-case for a fairly common network deamon load
pattern: openign and closing a lot of sockets that each have very little
work done on them.  It turns out that in that case, the cost of just
finding the correct file descriptor number can be a dominating factor.

We've long had a trivial optimization for allocating file descriptors
sequentially, but that optimization ends up being not very effective
when other file descriptors are being closed concurrently, and the fd
patterns are not some simple FIFO pattern.  In such cases we ended up
spending a lot of time just scanning the bitmap of open file descriptors
in order to find the next file descriptor number to open.

This trivial patch-series mitigates that by simply introducing a
second-level bitmap of which words in the first bitmap are already fully
allocated.  That cuts down the cost of scanning by an order of magnitude
in some pathological (but realistic) cases.

The second patch is an even more trivial patch to avoid unnecessarily
dirtying the cacheline for the close-on-exec bit array that normally
ends up being all empty.

* fs-file-descriptor-optimization:
  vfs: conditionally clear close-on-exec flag
  vfs: Fix pathological performance case for __alloc_fd()
parents 6a13feb9 fc90888d
......@@ -56,6 +56,9 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
__free_fdtable(container_of(rcu, struct fdtable, rcu));
#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long))
* Expand the fdset in the files_struct. Called with the files spinlock
* held for write.
......@@ -77,6 +80,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
memset((char *)(nfdt->open_fds) + cpy, 0, set);
memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
cpy = BITBIT_SIZE(ofdt->max_fds);
set = BITBIT_SIZE(nfdt->max_fds) - cpy;
memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
memset(cpy+(char *)nfdt->full_fds_bits, 0, set);
static struct fdtable * alloc_fdtable(unsigned int nr)
......@@ -115,12 +123,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
fdt->fd = data;
data = alloc_fdmem(max_t(size_t,
if (!data)
goto out_arr;
fdt->open_fds = data;
data += nr / BITS_PER_BYTE;
fdt->close_on_exec = data;
data += nr / BITS_PER_BYTE;
fdt->full_fds_bits = data;
return fdt;
......@@ -226,17 +236,22 @@ static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
__clear_bit(fd, fdt->close_on_exec);
if (test_bit(fd, fdt->close_on_exec))
__clear_bit(fd, fdt->close_on_exec);
static inline void __set_open_fd(int fd, struct fdtable *fdt)
static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
__set_bit(fd, fdt->open_fds);
if (!~fdt->open_fds[fd])
__set_bit(fd, fdt->full_fds_bits);
static inline void __clear_open_fd(int fd, struct fdtable *fdt)
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
__clear_bit(fd, fdt->open_fds);
__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
static int count_open_files(struct fdtable *fdt)
......@@ -280,6 +295,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
new_fdt->max_fds = NR_OPEN_DEFAULT;
new_fdt->close_on_exec = newf->close_on_exec_init;
new_fdt->open_fds = newf->open_fds_init;
new_fdt->full_fds_bits = newf->full_fds_bits_init;
new_fdt->fd = &newf->fd_array[0];
......@@ -323,6 +339,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);
memcpy(new_fdt->full_fds_bits, old_fdt->full_fds_bits, BITBIT_SIZE(open_files));
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
......@@ -454,10 +471,25 @@ struct files_struct init_files = {
.fd = &init_files.fd_array[0],
.close_on_exec = init_files.close_on_exec_init,
.open_fds = init_files.open_fds_init,
.full_fds_bits = init_files.full_fds_bits_init,
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start)
unsigned long maxfd = fdt->max_fds;
unsigned long maxbit = maxfd / BITS_PER_LONG;
unsigned long bitbit = start / BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
if (bitbit > maxfd)
return maxfd;
if (bitbit > start)
start = bitbit;
return find_next_zero_bit(fdt->open_fds, maxfd, start);
* allocate a file descriptor, mark it busy.
......@@ -476,7 +508,7 @@ repeat:
fd = files->next_fd;
if (fd < fdt->max_fds)
fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);
fd = find_next_fd(fdt, fd);
* N.B. For clone tasks sharing a files structure, this test
......@@ -26,6 +26,7 @@ struct fdtable {
struct file __rcu **fd; /* current fd array */
unsigned long *close_on_exec;
unsigned long *open_fds;
unsigned long *full_fds_bits;
struct rcu_head rcu;
......@@ -59,6 +60,7 @@ struct files_struct {
int next_fd;
unsigned long close_on_exec_init[1];
unsigned long open_fds_init[1];
unsigned long full_fds_bits_init[1];
struct file __rcu * fd_array[NR_OPEN_DEFAULT];
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment