[linux-2.6/linux-trees-mm.git] fs/revoke.c
/*
 * fs/revoke.c - Invalidate all current open file descriptors of an inode.
 *
 * Copyright (C) 2006-2007 Pekka Enberg
 *
 * This file is released under the GPLv2.
 */

#include <linux/file.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/magic.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/revoked_fs_i.h>
#include <linux/syscalls.h>

/**
 * fileset - an array of file pointers.
 * @files: the array of file pointers
 * @nr: number of elements in the array
 * @end: index to next unused file pointer
 */
struct fileset {
        struct file **files;
        unsigned long nr;
        unsigned long end;
};

/**
 * revoke_details - details of the revoke operation
 * @inode: invalidate open file descriptors of this inode
 * @fset: set of files that point to a revoked inode
 * @restore_start: index to the first file pointer that is currently in
 *                 use by a file descriptor but the real file has not
 *                 been revoked
 */
struct revoke_details {
        struct fileset *fset;
        unsigned long restore_start;
};

static struct kmem_cache *revokefs_inode_cache;
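
/*
 * A fileset is a preallocated pool of "revoked file" placeholders.
 * fset_get_filp() hands out the next unused placeholder and
 * fset_is_full() reports when the pool has been exhausted.
 */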
static inline bool fset_is_full(struct fileset *set)
{
        return set->nr == set->end;
}

static inline struct file *fset_get_filp(struct fileset *set)
{
        return set->files[set->end++];
}

static struct fileset *alloc_fset(unsigned long size)
{
        struct fileset *fset;

        fset = kzalloc(sizeof *fset, GFP_KERNEL);
        if (!fset)
                return NULL;

        fset->files = kcalloc(size, sizeof(struct file *), GFP_KERNEL);
        if (!fset->files) {
                kfree(fset);
                return NULL;
        }

        fset->nr = size;
        return fset;
}

static void free_fset(struct fileset *fset)
{
        int i;

        /*
         * Slots that were never filled (see the error path in
         * __alloc_revoke_fset()) are still NULL, so skip them.
         */
        for (i = fset->end; i < fset->nr; i++) {
                if (fset->files[i])
                        fput(fset->files[i]);
        }

        kfree(fset->files);
        kfree(fset);
}

/*
 * Revoked file descriptors point to inodes in the revokefs filesystem.
 */
static struct vfsmount *revokefs_mnt;
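
/*
 * Build a placeholder struct file: an empty file bound to a fresh inode on
 * the internal revokefs mount, instantiated under a dummy dentry named
 * "revoked_file".  Such a placeholder later replaces a revoked descriptor
 * in the owner's file table.
 */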
static struct file *get_revoked_file(void)
{
        struct dentry *dentry;
        struct inode *inode;
        struct file *filp;
        struct qstr name;

        filp = get_empty_filp();
        if (!filp)
                goto err;

        inode = new_inode(revokefs_mnt->mnt_sb);
        if (!inode)
                goto err_inode;

        name.name = "revoked_file";
        name.len = strlen(name.name);
        dentry = d_alloc(revokefs_mnt->mnt_sb->s_root, &name);
        if (!dentry)
                goto err_dentry;

        d_instantiate(dentry, inode);

        filp->f_mapping = inode->i_mapping;
        filp->f_dentry = dget(dentry);
        filp->f_vfsmnt = mntget(revokefs_mnt);
        filp->f_op = fops_get(inode->i_fop);
        filp->f_pos = 0;

        return filp;

err_dentry:
        iput(inode);
err_inode:
        fput(filp);
err:
        return NULL;
}
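
/*
 * A descriptor qualifies for revocation if it refers to the target inode
 * and is not the file that initiated the revoke (to_exclude, used by
 * sys_frevoke to spare the caller's own descriptor).
 */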
static inline bool can_revoke_file(struct file *file, struct inode *inode,
                                   struct file *to_exclude)
{
        if (!file || file == to_exclude)
                return false;

        return file->f_dentry->d_inode == inode;
}
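
/*
 * First pass of a revoke: swap every descriptor of the target inode in one
 * task's file table with a placeholder from the preallocated fileset,
 * recording the original file, its fd number and its owner so the
 * descriptor can later be closed for good or restored on failure.
 */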
/*
 * LOCKING: task_lock(owner)
 */
static int revoke_fds(struct task_struct *owner,
                      struct inode *inode,
                      struct file *to_exclude, struct fileset *fset)
{
        struct files_struct *files;
        struct fdtable *fdt;
        unsigned int fd;
        int err = 0;

        files = get_files_struct(owner);
        if (!files)
                goto out;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);

        for (fd = 0; fd < fdt->max_fds; fd++) {
                struct revokefs_inode_info *info;
                struct file *filp, *new_filp;
                struct inode *new_inode;

                filp = fcheck_files(files, fd);
                if (!can_revoke_file(filp, inode, to_exclude))
                        continue;

                if (!filp->f_op->revoke) {
                        err = -EOPNOTSUPP;
                        goto failed;
                }

                if (fset_is_full(fset)) {
                        err = -ENOMEM;
                        goto failed;
                }

                new_filp = fset_get_filp(fset);

                /*
                 * Replace original struct file pointer with a pointer to
                 * a 'revoked file.'  After this point, we don't need to
                 * worry about racing with sys_close or sys_dup.
                 */
                rcu_assign_pointer(fdt->fd[fd], new_filp);

                /*
                 * Hold on to task until we can take down the file and its
                 * mmap.
                 */
                get_task_struct(owner);

                new_inode = new_filp->f_dentry->d_inode;
                make_revoked_inode(new_inode, inode->i_mode & S_IFMT);

                info = revokefs_i(new_inode);
                info->fd = fd;
                info->file = filp;
                info->owner = owner;
        }
failed:
        spin_unlock(&files->file_lock);
        put_files_struct(files);
out:
        return err;
}

static inline bool can_revoke_vma(struct vm_area_struct *vma,
                                  struct inode *inode, struct file *to_exclude)
{
        struct file *file = vma->vm_file;

        if (vma->vm_flags & VM_REVOKED)
                return false;

        if (!file || file == to_exclude)
                return false;

        return file->f_path.dentry->d_inode == inode;
}
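
/*
 * Break copy-on-write for one task's private mappings of the inode:
 * get_user_pages() with write=1 and force=1 faults in and copies every
 * page, after which the vma is detached from the file.
 */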
static int __revoke_break_cow(struct task_struct *tsk, struct inode *inode,
                              struct file *to_exclude)
{
        struct mm_struct *mm = tsk->mm;
        struct vm_area_struct *vma;
        int err = 0;

        down_read(&mm->mmap_sem);
        for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
                int ret;

                if (vma->vm_flags & VM_SHARED)
                        continue;

                if (!can_revoke_vma(vma, inode, to_exclude))
                        continue;

                ret = get_user_pages(tsk, tsk->mm, vma->vm_start,
                                     vma_pages(vma), 1, 1, NULL, NULL);
                if (ret < 0) {
                        err = ret;
                        break;
                }

                unlink_file_vma(vma);
                fput(vma->vm_file);
                vma->vm_file = NULL;
        }
        up_read(&mm->mmap_sem);
        return err;
}

static int revoke_break_cow(struct fileset *fset, struct inode *inode,
                            struct file *to_exclude)
{
        unsigned long i;
        int err = 0;

        for (i = 0; i < fset->end; i++) {
                struct revokefs_inode_info *info;
                struct file *this;

                this = fset->files[i];
                info = revokefs_i(this->f_dentry->d_inode);

                err = __revoke_break_cow(info->owner, inode, to_exclude);
                if (err)
                        break;
        }
        return err;
}
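
/*
 * Unmap a single shared vma by zapping its page range.  If the zap is cut
 * short because rescheduling or lock contention is needed, i_mmap_lock is
 * dropped briefly and -EINTR tells the caller to restart its scan;
 * otherwise the vma is marked VM_REVOKED so it is not visited again.
 */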
/*
 * LOCKING: down_write(&mm->mmap_sem)
 *            -> spin_lock(&mapping->i_mmap_lock)
 */
static int revoke_vma(struct vm_area_struct *vma, struct zap_details *details)
{
        unsigned long restart_addr, start_addr, end_addr;
        int need_break;

        start_addr = vma->vm_start;
        end_addr = vma->vm_end;

again:
        restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr,
                                      details);

        need_break = need_resched() || need_lockbreak(details->i_mmap_lock);
        if (need_break)
                goto out_need_break;

        if (restart_addr < end_addr) {
                start_addr = restart_addr;
                goto again;
        }
        vma->vm_flags |= VM_REVOKED;
        return 0;

out_need_break:
        spin_unlock(details->i_mmap_lock);
        cond_resched();
        spin_lock(details->i_mmap_lock);
        return -EINTR;
}

/*
 * LOCKING: spin_lock(&mapping->i_mmap_lock)
 */
static int revoke_mm(struct mm_struct *mm, struct address_space *mapping,
                     struct file *to_exclude)
{
        struct vm_area_struct *vma;
        struct zap_details details;
        int err = 0;

        details.i_mmap_lock = &mapping->i_mmap_lock;

        /*
         * If ->mmap_sem is under contention, we continue scanning other
         * mms and try again later.
         */
        if (!down_write_trylock(&mm->mmap_sem)) {
                err = -EAGAIN;
                goto out;
        }
        for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
                if (!(vma->vm_flags & VM_SHARED))
                        continue;

                if (!can_revoke_vma(vma, mapping->host, to_exclude))
                        continue;

                err = revoke_vma(vma, &details);
                if (err)
                        break;

                __unlink_file_vma(vma);
                fput(vma->vm_file);
                vma->vm_file = NULL;
        }
        up_write(&mm->mmap_sem);
out:
        return err;
}

/*
 * LOCKING: spin_lock(&mapping->i_mmap_lock)
 */
static void revoke_mapping_tree(struct address_space *mapping,
                                struct file *to_exclude)
{
        struct vm_area_struct *vma;
        struct prio_tree_iter iter;
        int try_again = 0;

restart:
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) {
                int err;

                if (!(vma->vm_flags & VM_SHARED))
                        continue;

                if (likely(!can_revoke_vma(vma, mapping->host, to_exclude)))
                        continue;

                err = revoke_mm(vma->vm_mm, mapping, to_exclude);
                if (err == -EAGAIN)
                        try_again = 1;

                goto restart;
        }

        if (try_again) {
                cond_resched();
                goto restart;
        }
}

/*
 * LOCKING: spin_lock(&mapping->i_mmap_lock)
 */
static void revoke_mapping_list(struct address_space *mapping,
                                struct file *to_exclude)
{
        struct vm_area_struct *vma;
        int try_again = 0;

restart:
        list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) {
                int err;

                if (likely(!can_revoke_vma(vma, mapping->host, to_exclude)))
                        continue;

                err = revoke_mm(vma->vm_mm, mapping, to_exclude);
                if (err == -EAGAIN) {
                        try_again = 1;
                        continue;
                }
                if (err == -EINTR)
                        goto restart;
        }

        if (try_again) {
                cond_resched();
                goto restart;
        }
}
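
/*
 * Take down all shared memory mappings of the inode: with i_mmap_lock held,
 * walk both the i_mmap prio tree and the nonlinear list and revoke every
 * qualifying vma.
 */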
static void revoke_mapping(struct address_space *mapping, struct file *to_exclude)
{
        spin_lock(&mapping->i_mmap_lock);
        if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
                revoke_mapping_tree(mapping, to_exclude);
        if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
                revoke_mapping_list(mapping, to_exclude);
        spin_unlock(&mapping->i_mmap_lock);
}
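
/*
 * Error path: put the original struct file back into its owner's file table
 * in place of the placeholder, mark it close-on-exec, and drop the task
 * reference taken in revoke_fds().
 */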
static void restore_file(struct revokefs_inode_info *info)
{
        struct files_struct *files;

        files = get_files_struct(info->owner);
        if (files) {
                struct fdtable *fdt;
                struct file *filp;

                spin_lock(&files->file_lock);
                fdt = files_fdtable(files);

                filp = fdt->fd[info->fd];
                if (filp)
                        fput(filp);

                rcu_assign_pointer(fdt->fd[info->fd], info->file);
                FD_SET(info->fd, fdt->close_on_exec);
                spin_unlock(&files->file_lock);
                put_files_struct(files);
        }
        put_task_struct(info->owner);
        info->owner = NULL;     /* To avoid double-restore. */
}

static void restore_files(struct revoke_details *details)
{
        unsigned long i;

        for (i = details->restore_start; i < details->fset->end; i++) {
                struct revokefs_inode_info *info;
                struct file *filp;

                filp = details->fset->files[i];
                info = revokefs_i(filp->f_dentry->d_inode);

                restore_file(info);
        }
}

static int revoke_files(struct revoke_details *details)
{
        unsigned long i;
        int err = 0;

        for (i = 0; i < details->fset->end; i++) {
                struct revokefs_inode_info *info;
                struct file *this, *filp;
                struct inode *inode;

                this = details->fset->files[i];
                inode = this->f_dentry->d_inode;
                info = revokefs_i(inode);

                /*
                 * Increase count before attempting to close file as a
                 * partially closed file can no longer be restored.
                 */
                details->restore_start++;
                filp = info->file;
                err = filp->f_op->revoke(filp, inode->i_mapping);
                put_task_struct(info->owner);
                info->owner = NULL;     /* To avoid restoring closed file. */
                if (err)
                        goto out;
        }
out:
        return err;
}
/*
 * Returns the maximum number of file descriptors pointing to an inode.
 *
 * LOCKING: read_lock(&tasklist_lock)
 */
static unsigned long inode_fds(struct inode *inode, struct file *to_exclude)
{
        struct task_struct *g, *p;
        unsigned long nr_fds = 0;

        do_each_thread(g, p) {
                struct files_struct *files;
                struct fdtable *fdt;
                unsigned int fd;

                files = get_files_struct(p);
                if (!files)
                        continue;

                spin_lock(&files->file_lock);
                fdt = files_fdtable(files);
                for (fd = 0; fd < fdt->max_fds; fd++) {
                        struct file *file;

                        file = fcheck_files(files, fd);
                        if (can_revoke_file(file, inode, to_exclude)) {
                                nr_fds += fdt->max_fds;
                                break;
                        }
                }
                spin_unlock(&files->file_lock);
                put_files_struct(files);
        } while_each_thread(g, p);
        return nr_fds;
}
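
/*
 * Allocate a fileset and fill every slot with a revoked-file placeholder up
 * front, so that no allocations are needed while tasklist_lock and
 * file_lock are held during the descriptor pass.
 */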
static struct fileset *__alloc_revoke_fset(unsigned long size)
{
        struct fileset *fset;
        int i;

        fset = alloc_fset(size);
        if (!fset)
                return NULL;

        for (i = 0; i < fset->nr; i++) {
                struct file *filp;

                filp = get_revoked_file();
                if (!filp)
                        goto err;

                fset->files[i] = filp;
        }
        return fset;
err:
        free_fset(fset);
        return NULL;
}
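
/*
 * Core of sys_revokeat() and sys_frevoke().  The sequence is: check
 * permissions, count descriptors under tasklist_lock, preallocate enough
 * placeholders (retrying if new descriptors appeared meanwhile), swap the
 * descriptors, tear down shared mappings, break COW for private mappings,
 * and finally call ->revoke() on each original file.  On failure,
 * descriptors that have not yet been revoked are restored.
 */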
static int do_revoke(struct inode *inode, struct file *to_exclude)
{
        struct revoke_details details;
        struct fileset *fset = NULL;
        struct task_struct *g, *p;
        unsigned long nr_fds;
        int err = 0;

        if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) {
                err = -EPERM;
                goto out;
        }

retry:
        if (signal_pending(current)) {
                err = -ERESTARTSYS;
                goto out;
        }

        read_lock(&tasklist_lock);
        nr_fds = inode_fds(inode, to_exclude);
        read_unlock(&tasklist_lock);

        if (!nr_fds)
                goto out;

        /*
         * Pre-allocate memory because the first pass is done under
         * tasklist_lock.
         */
        fset = __alloc_revoke_fset(nr_fds);
        if (!fset) {
                err = -ENOMEM;
                goto out;
        }

        read_lock(&tasklist_lock);

        /*
         * If someone forked while we were allocating memory, try again.
         */
        if (inode_fds(inode, to_exclude) > fset->nr) {
                read_unlock(&tasklist_lock);
                free_fset(fset);
                goto retry;
        }

        details.fset = fset;
        details.restore_start = 0;

        /*
         * First revoke the descriptors.  After we are done, no one can
         * start new operations on them.
         */
        do_each_thread(g, p) {
                err = revoke_fds(p, inode, to_exclude, fset);
                if (err)
                        goto exit_loop;
        } while_each_thread(g, p);
exit_loop:
        read_unlock(&tasklist_lock);

        if (err)
                goto out_restore;

        /*
         * Take down shared memory mappings.
         */
        revoke_mapping(inode->i_mapping, to_exclude);

        /*
         * Break COW for private mappings.
         */
        err = revoke_break_cow(fset, inode, to_exclude);
        if (err)
                goto out_restore;

        /*
         * Now, revoke the files for good.
         */
        err = revoke_files(&details);
        if (err)
                goto out_restore;

out_free_table:
        free_fset(fset);
out:
        return err;

out_restore:
        restore_files(&details);
        goto out_free_table;
}
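
/*
 * System call entry points.  sys_revokeat() resolves a path relative to dfd
 * and revokes every open descriptor of that inode; sys_frevoke() revokes
 * all descriptors of the inode referenced by fd except fd itself.
 */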
asmlinkage long sys_revokeat(int dfd, const char __user *filename)
{
        struct nameidata nd;
        int err;

        err = __user_walk_fd(dfd, filename, 0, &nd);
        if (!err) {
                err = do_revoke(nd.dentry->d_inode, NULL);
                path_release(&nd);
        }

        return err;
}

asmlinkage long sys_frevoke(unsigned int fd)
{
        struct file *file = fget(fd);
        int err = -EBADF;

        if (file) {
                err = do_revoke(file->f_dentry->d_inode, file);
                fput(file);
        }

        return err;
}
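
/*
 * Generic ->revoke() implementation for regular files: flush dirty pages
 * with do_fsync(), switch file->f_mapping to the new (revoked) mapping
 * passed by the caller, and invalidate the old mapping's pages so reads
 * already in flight fail.
 */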
int generic_file_revoke(struct file *file, struct address_space *new_mapping)
{
        struct address_space *mapping = file->f_mapping;
        int err;

        /*
         * Flush pending writes.
         */
        err = do_fsync(file, 1);
        if (err)
                goto out;

        file->f_mapping = new_mapping;

        /*
         * Make pending reads fail.
         */
        err = invalidate_inode_pages2(mapping);

out:
        return err;
}
EXPORT_SYMBOL(generic_file_revoke);
/*
 * Filesystem for revoked files.
 */

static struct inode *revokefs_alloc_inode(struct super_block *sb)
{
        struct revokefs_inode_info *info;

        info = kmem_cache_alloc(revokefs_inode_cache, GFP_KERNEL);
        if (!info)
                return NULL;

        return &info->vfs_inode;
}

static void revokefs_destroy_inode(struct inode *inode)
{
        kmem_cache_free(revokefs_inode_cache, revokefs_i(inode));
}

static struct super_operations revokefs_super_ops = {
        .alloc_inode    = revokefs_alloc_inode,
        .destroy_inode  = revokefs_destroy_inode,
        .drop_inode     = generic_delete_inode,
};

static int revokefs_get_sb(struct file_system_type *fs_type,
                           int flags, const char *dev_name, void *data,
                           struct vfsmount *mnt)
{
        return get_sb_pseudo(fs_type, "revoke:", &revokefs_super_ops,
                             REVOKEFS_MAGIC, mnt);
}

static struct file_system_type revokefs_fs_type = {
        .name           = "revokefs",
        .get_sb         = revokefs_get_sb,
        .kill_sb        = kill_anon_super
};
static void revokefs_init_inode(struct kmem_cache *cache, void *obj)
{
        struct revokefs_inode_info *info = obj;

        info->owner = NULL;
        inode_init_once(&info->vfs_inode);
}

static int __init revokefs_init(void)
{
        int err = -ENOMEM;

        revokefs_inode_cache =
            kmem_cache_create("revokefs_inode_cache",
                              sizeof(struct revokefs_inode_info),
                              0,
                              (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
                               SLAB_MEM_SPREAD), revokefs_init_inode);
        if (!revokefs_inode_cache)
                goto out;

        err = register_filesystem(&revokefs_fs_type);
        if (err)
                goto err_register;

        revokefs_mnt = kern_mount(&revokefs_fs_type);
        if (IS_ERR(revokefs_mnt)) {
                err = PTR_ERR(revokefs_mnt);
                goto err_mnt;
        }
out:
        return err;

err_mnt:
        unregister_filesystem(&revokefs_fs_type);
err_register:
        kmem_cache_destroy(revokefs_inode_cache);
        return err;
}

late_initcall(revokefs_init);