This patch provides an improved fdtable allocation scheme, useful for expanding fdtable file descriptor entries. The main focus is on the fdarray, as its memory usage grows 128 times faster than that of an fdset.

The allocation algorithm sizes the fdarray in such a way that its memory usage increases in easy page-sized chunks. Additionally, it tries to account for the optimal usage of the allocators involved: kmalloc() for sizes less than a page, and vmalloc() with page granularity for sizes greater than a page. Namely, the following sizes for the fdarray are considered, and the smallest that accommodates the requested fd count is chosen:

	pagesize / 4
	pagesize / 2
	pagesize       <- memory allocator switch point
	pagesize * 2
	pagesize * 3
	pagesize * 4
	...etc...

Unlike the current implementation, this allocation scheme does not require a loop to compute the optimal fdarray size, and can be done in straightline code.

Furthermore, since the fdarray overflows the pagesize boundary long before any of the fdsets do, it makes sense to optimize run-time by allocating both fdsets in a single swoop. Even together, they will still be, by far, smaller than the fdarray.

As long as we're replacing the guts of fs/file.c, it makes sense to tidy up the code. This work includes: simplification via refactoring, elimination of unnecessary code, and extensive commenting throughout the entire file. This is the last patch in the series. All the code should now be sparkly clean.

/*- * We use this list to defer free fdtables that have vmalloced- * sets/arrays. By keeping a per-cpu list, we avoid having to embed- * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in- * this per-task structure.+ * We use this list to defer free fdtables that have vmalloced sets/arrays. By+ * keeping a per-cpu list, we avoid having to embed the work_struct in the+ * fdtable itself. */ static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);

+ fddef = (struct fdtable_defer *)data; spin_lock(&fddef->lock); /*- * If someone already emptied the queue return.+ * If there are any fdtables scheduled for deletion, then try to+ * schedule this work. If we could not schedule, then run this function+ * again in a little while. */- if (!fddef->next)- goto out;- if (!schedule_work(&fddef->wq))- mod_timer(&fddef->timer, 5);-out:+ if (fddef->next)+ if (!schedule_work(&fddef->wq))+ mod_timer(&fddef->timer, 5); spin_unlock(&fddef->lock); }

+/**+ * free_fdtable_rcu - Free an fdtable or its wrapper files_struct.+ * @rcu: The RCU head structure embedded within the to-be-freed fdtable.+ *+ * In order to correctly free an fdtable that was in use by the system, this+ * function should be invoked as an RCU callback on the target fdtable. It must+ * be used on non-embedded fdtables or embedded fdtables once the wrapper+ * files_struct is to be discarded; it must not be used on embedded fdtables+ * where the wrapper files_struct must persist.+ */ void free_fdtable_rcu(struct rcu_head *rcu) {- struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);- int fdset_size, fdarray_size;- struct fdtable_defer *fddef;-- BUG_ON(!fdt);- fdset_size = fdt->max_fds / 8;- fdarray_size = fdt->max_fds * sizeof(struct file *);+ struct fdtable *fdt;

+ fdt = container_of(rcu, struct fdtable, rcu); if (fdt->max_fds <= NR_OPEN_DEFAULT) { /*- * This fdtable is embedded in the files structure and that- * structure itself is getting destroyed.+ * This fdtable is embedded within a wrapper files_struct, and+ * both are now expired. Free the container. */ kmem_cache_free(files_cachep, container_of(fdt, struct files_struct, fdtab)); return; }- if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) {- kfree(fdt->open_fds);- kfree(fdt->close_on_exec);+ if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) {+ /*+ * The fdarray was obtained with kmalloc, and since the fdset+ * will always be smaller we know it was also obtained with+ * kmalloc. Thus, we can dispose of the fdtable right now.+ */ kfree(fdt->fd);+ kfree(fdt->open_fds); kfree(fdt); } else {+ struct fdtable_defer *fddef;++ /*+ * The fdset has at least one component obtained with vmalloc.+ * Hence, we will handle deallocation from the workqueue+ * context. If we are unable to schedule the work, then we set+ * a timer to fire and reattempt to schedule later.+ */ fddef = &get_cpu_var(fdtable_defer_list); spin_lock(&fddef->lock); fdt->next = fddef->next; fddef->next = fdt;- /*- * vmallocs are handled from the workqueue context.- * If the per-cpu workqueue is running, then we- * defer work scheduling through a timer.- */ if (!schedule_work(&fddef->wq)) mod_timer(&fddef->timer, 5); spin_unlock(&fddef->lock);@@ -147,197 +177,179 @@ void free_fdtable_rcu(struct rcu_head *r } }

-/*- * Expand the file descriptor table.- * This function will allocate a new fdtable and both fd array and fdset, of- * the given size.- * Return <0 error code on error; 1 on successful completion.- * The files->file_lock should be held on entry, and will be held on exit.+/**+ * expand_files - Accommodate an fd index inside a files structure.+ * @files: The files structure that must be sized.+ * @nr: Requested fd index to be supported.+ *+ * Make sure that the given files structure can accommodate the provided fd+ * index within its associated fdtable. If the requested index exceeds the+ * current capacity and there is room for expansion, a larger fdtable will be+ * created and installed. The files->file_lock should be held on entry, and+ * will be held on exit.+ *+ * If the current fdtable is sufficient, 0 is returned. If the fdtable was+ * expanded and execution may have blocked, 1 is returned. On an error+ * condition, a negative error code is returned. */-static int expand_fdtable(struct files_struct *files, int nr)+int expand_files(struct files_struct *files, int nr) __releases(files->file_lock) __acquires(files->file_lock) {- struct fdtable *new_fdt, *cur_fdt;+ struct fdtable *cur_fdt, *new_fdt;++ cur_fdt = files_fdtable(files);+ /* Do we need to expand? */+ if (nr < cur_fdt->max_fds)+ return 0;+ /* Are we allowed to expand? */+ if (nr >= NR_OPEN)+ return -EMFILE;