4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * Swap reorganised 29.12.95, Stephen Tweedie
8 #include <linux/malloc.h>
9 #include <linux/smp_lock.h>
10 #include <linux/kernel_stat.h>
11 #include <linux/swap.h>
12 #include <linux/swapctl.h>
13 #include <linux/blkdev.h> /* for blk_size */
14 #include <linux/vmalloc.h>
15 #include <linux/pagemap.h>
16 #include <linux/shm.h>
18 #include <asm/pgtable.h>
20 spinlock_t swaplock
= SPIN_LOCK_UNLOCKED
;
21 unsigned int nr_swapfiles
;
23 struct swap_list_t swap_list
= {-1, -1};
25 struct swap_info_struct swap_info
[MAX_SWAPFILES
];
27 #define SWAPFILE_CLUSTER 256
29 static inline int scan_swap_map(struct swap_info_struct
*si
, unsigned short count
)
33 * We try to cluster swap pages by allocating them
34 * sequentially in swap. Once we've allocated
35 * SWAPFILE_CLUSTER pages this way, however, we resort to
36 * first-free allocation, starting a new cluster. This
37 * prevents us from scattering swap pages all over the entire
38 * swap partition, so that we reduce overall disk seek times
39 * between swap pages. -- sct */
41 while (si
->cluster_next
<= si
->highest_bit
) {
42 offset
= si
->cluster_next
++;
43 if (si
->swap_map
[offset
])
49 si
->cluster_nr
= SWAPFILE_CLUSTER
;
51 /* try to find an empty (even not aligned) cluster. */
52 offset
= si
->lowest_bit
;
54 if (offset
+SWAPFILE_CLUSTER
-1 <= si
->highest_bit
)
57 for (nr
= offset
; nr
< offset
+SWAPFILE_CLUSTER
; nr
++)
61 goto check_next_cluster
;
63 /* We found a completely empty cluster, so start
68 /* No luck, so now go fine-grained as usual. -Andrea */
69 for (offset
= si
->lowest_bit
; offset
<= si
->highest_bit
; offset
++) {
70 if (si
->swap_map
[offset
])
73 if (offset
== si
->lowest_bit
)
75 if (offset
== si
->highest_bit
)
77 si
->swap_map
[offset
] = count
;
79 si
->cluster_next
= offset
+1;
85 swp_entry_t
__get_swap_page(unsigned short count
)
87 struct swap_info_struct
* p
;
90 int type
, wrapped
= 0;
92 entry
.val
= 0; /* Out of memory */
93 if (count
>= SWAP_MAP_MAX
)
96 type
= swap_list
.next
;
99 if (nr_swap_pages
== 0)
103 p
= &swap_info
[type
];
104 if ((p
->flags
& SWP_WRITEOK
) == SWP_WRITEOK
) {
106 offset
= scan_swap_map(p
, count
);
107 swap_device_unlock(p
);
109 entry
= SWP_ENTRY(type
,offset
);
110 type
= swap_info
[type
].next
;
112 p
->prio
!= swap_info
[type
].prio
) {
113 swap_list
.next
= swap_list
.head
;
115 swap_list
.next
= type
;
122 if (type
< 0 || p
->prio
!= swap_info
[type
].prio
) {
123 type
= swap_list
.head
;
128 goto out
; /* out of swap space */
135 printk(KERN_ERR
"get_swap_page: bad count %hd from %p\n",
136 count
, __builtin_return_address(0));
142 * Caller has made sure that the swapdevice corresponding to entry
143 * is still around or has not been recycled.
145 void __swap_free(swp_entry_t entry
, unsigned short count
)
147 struct swap_info_struct
* p
;
148 unsigned long offset
, type
;
153 type
= SWP_TYPE(entry
);
154 if (type
>= nr_swapfiles
)
156 p
= & swap_info
[type
];
157 if (!(p
->flags
& SWP_USED
))
159 offset
= SWP_OFFSET(entry
);
160 if (offset
>= p
->max
)
162 if (!p
->swap_map
[offset
])
165 if (p
->prio
> swap_info
[swap_list
.next
].prio
)
166 swap_list
.next
= type
;
168 if (p
->swap_map
[offset
] < SWAP_MAP_MAX
) {
169 if (p
->swap_map
[offset
] < count
)
171 if (!(p
->swap_map
[offset
] -= count
)) {
172 if (offset
< p
->lowest_bit
)
173 p
->lowest_bit
= offset
;
174 if (offset
> p
->highest_bit
)
175 p
->highest_bit
= offset
;
179 swap_device_unlock(p
);
185 printk("swap_free: Trying to free nonexistent swap-page\n");
188 printk("swap_free: Trying to free swap from unused swap-device\n");
191 printk("swap_free: offset exceeds max\n");
194 printk("VM: Bad swap entry %08lx\n", entry
.val
);
197 swap_device_unlock(p
);
199 printk(KERN_ERR
"VM: Bad count %hd current count %hd\n", count
, p
->swap_map
[offset
]);
204 * The swap entry has been read in advance, and we return 1 to indicate
205 * that the page has been used or is no longer needed.
207 * Always set the resulting pte to be nowrite (the same as COW pages
208 * after one process has exited). We don't know just how many PTEs will
209 * share this swap entry, so be cautious and let do_wp_page work out
210 * what to do if a write is requested later.
212 static inline void unuse_pte(struct vm_area_struct
* vma
, unsigned long address
,
213 pte_t
*dir
, swp_entry_t entry
, struct page
* page
)
219 if (pte_present(pte
)) {
220 /* If this entry is swap-cached, then page must already
221 hold the right address for any copies in physical
223 if (pte_page(pte
) != page
)
225 /* We will be removing the swap cache in a moment, so... */
229 if (pte_to_swp_entry(pte
).val
!= entry
.val
)
231 set_pte(dir
, pte_mkdirty(mk_pte(page
, vma
->vm_page_prot
)));
237 static inline void unuse_pmd(struct vm_area_struct
* vma
, pmd_t
*dir
,
238 unsigned long address
, unsigned long size
, unsigned long offset
,
239 swp_entry_t entry
, struct page
* page
)
251 pte
= pte_offset(dir
, address
);
252 offset
+= address
& PMD_MASK
;
253 address
&= ~PMD_MASK
;
254 end
= address
+ size
;
258 unuse_pte(vma
, offset
+address
-vma
->vm_start
, pte
, entry
, page
);
259 address
+= PAGE_SIZE
;
261 } while (address
&& (address
< end
));
264 static inline void unuse_pgd(struct vm_area_struct
* vma
, pgd_t
*dir
,
265 unsigned long address
, unsigned long size
,
266 swp_entry_t entry
, struct page
* page
)
269 unsigned long offset
, end
;
278 pmd
= pmd_offset(dir
, address
);
279 offset
= address
& PGDIR_MASK
;
280 address
&= ~PGDIR_MASK
;
281 end
= address
+ size
;
282 if (end
> PGDIR_SIZE
)
287 unuse_pmd(vma
, pmd
, address
, end
- address
, offset
, entry
,
289 address
= (address
+ PMD_SIZE
) & PMD_MASK
;
291 } while (address
&& (address
< end
));
294 static void unuse_vma(struct vm_area_struct
* vma
, pgd_t
*pgdir
,
295 swp_entry_t entry
, struct page
* page
)
297 unsigned long start
= vma
->vm_start
, end
= vma
->vm_end
;
302 unuse_pgd(vma
, pgdir
, start
, end
- start
, entry
, page
);
303 start
= (start
+ PGDIR_SIZE
) & PGDIR_MASK
;
305 } while (start
&& (start
< end
));
308 static void unuse_process(struct mm_struct
* mm
,
309 swp_entry_t entry
, struct page
* page
)
311 struct vm_area_struct
* vma
;
314 * Go through process' page directory.
318 spin_lock(&mm
->page_table_lock
);
319 for (vma
= mm
->mmap
; vma
; vma
= vma
->vm_next
) {
320 pgd_t
* pgd
= pgd_offset(mm
, vma
->vm_start
);
321 unuse_vma(vma
, pgd
, entry
, page
);
323 spin_unlock(&mm
->page_table_lock
);
328 * We completely avoid races by reading each swap page in advance,
329 * and then search for the process using it. All the necessary
330 * page table adjustments can then be made atomically.
332 static int try_to_unuse(unsigned int type
)
334 struct swap_info_struct
* si
= &swap_info
[type
];
335 struct task_struct
*p
;
342 * Find a swap page in use and read it in.
344 swap_device_lock(si
);
345 for (i
= 1; i
< si
->max
; i
++) {
346 if (si
->swap_map
[i
] > 0 && si
->swap_map
[i
] != SWAP_MAP_BAD
) {
348 * Prevent swaphandle from being completely
349 * unused by swap_free while we are trying
350 * to read in the page - this prevents warning
351 * messages from rw_swap_page_base.
353 if (si
->swap_map
[i
] != SWAP_MAP_MAX
)
355 swap_device_unlock(si
);
359 swap_device_unlock(si
);
363 entry
= SWP_ENTRY(type
, i
);
365 /* Get a page for the entry, using the existing swap
366 cache page if there is one. Otherwise, get a clean
367 page and read the swap into it. */
368 page
= read_swap_cache(entry
);
373 if (PageSwapCache(page
))
374 delete_from_swap_cache(page
);
375 read_lock(&tasklist_lock
);
377 unuse_process(p
->mm
, entry
, page
);
378 read_unlock(&tasklist_lock
);
379 shmem_unuse(entry
, page
);
380 /* Now get rid of the extra reference to the temporary
381 page we've been using. */
382 page_cache_release(page
);
384 * Check for and clear any overflowed swap map counts.
388 swap_device_lock(si
);
389 if (si
->swap_map
[i
] > 0) {
390 if (si
->swap_map
[i
] != SWAP_MAP_MAX
)
391 printk("VM: Undead swap entry %08lx\n",
396 swap_device_unlock(si
);
402 asmlinkage
long sys_swapoff(const char * specialfile
)
404 struct swap_info_struct
* p
= NULL
;
409 if (!capable(CAP_SYS_ADMIN
))
412 err
= user_path_walk(specialfile
, &nd
);
419 for (type
= swap_list
.head
; type
>= 0; type
= swap_info
[type
].next
) {
420 p
= swap_info
+ type
;
421 if ((p
->flags
& SWP_WRITEOK
) == SWP_WRITEOK
) {
423 if (p
->swap_file
== nd
.dentry
)
426 if (S_ISBLK(nd
.dentry
->d_inode
->i_mode
)
427 && (p
->swap_device
== nd
.dentry
->d_inode
->i_rdev
))
440 swap_list
.head
= p
->next
;
442 swap_info
[prev
].next
= p
->next
;
444 if (type
== swap_list
.next
) {
445 /* just pick something that's safe... */
446 swap_list
.next
= swap_list
.head
;
448 nr_swap_pages
-= p
->pages
;
451 err
= try_to_unuse(type
);
453 /* re-insert swap space back into swap_list */
455 for (prev
= -1, i
= swap_list
.head
; i
>= 0; prev
= i
, i
= swap_info
[i
].next
)
456 if (p
->prio
>= swap_info
[i
].prio
)
460 swap_list
.head
= swap_list
.next
= p
- swap_info
;
462 swap_info
[prev
].next
= p
- swap_info
;
463 nr_swap_pages
+= p
->pages
;
465 p
->flags
= SWP_WRITEOK
;
469 blkdev_put(nd
.dentry
->d_inode
->i_bdev
, BDEV_SWAP
);
472 nd
.dentry
= p
->swap_file
;
474 nd
.mnt
= p
->swap_vfsmnt
;
475 p
->swap_vfsmnt
= NULL
;
489 int get_swaparea_info(char *buf
)
491 char * page
= (char *) __get_free_page(GFP_KERNEL
);
492 struct swap_info_struct
*ptr
= swap_info
;
493 int i
, j
, len
= 0, usedswap
;
498 len
+= sprintf(buf
, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
499 for (i
= 0 ; i
< nr_swapfiles
; i
++, ptr
++) {
500 if (ptr
->flags
& SWP_USED
) {
501 char * path
= d_path(ptr
->swap_file
, ptr
->swap_vfsmnt
,
504 len
+= sprintf(buf
+ len
, "%-31s ", path
);
506 if (!ptr
->swap_device
)
507 len
+= sprintf(buf
+ len
, "file\t\t");
509 len
+= sprintf(buf
+ len
, "partition\t");
512 for (j
= 0; j
< ptr
->max
; ++j
)
513 switch (ptr
->swap_map
[j
]) {
520 len
+= sprintf(buf
+ len
, "%d\t%d\t%d\n", ptr
->pages
<< (PAGE_SHIFT
- 10),
521 usedswap
<< (PAGE_SHIFT
- 10), ptr
->prio
);
524 free_page((unsigned long) page
);
528 int is_swap_partition(kdev_t dev
) {
529 struct swap_info_struct
*ptr
= swap_info
;
532 for (i
= 0 ; i
< nr_swapfiles
; i
++, ptr
++) {
533 if (ptr
->flags
& SWP_USED
)
534 if (ptr
->swap_device
== dev
)
541 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
543 * The swapon system call
545 asmlinkage
long sys_swapon(const char * specialfile
, int swap_flags
)
547 struct swap_info_struct
* p
;
549 struct inode
* swap_inode
;
553 static int least_priority
= 0;
554 union swap_header
*swap_header
= 0;
555 int swap_header_version
;
556 int nr_good_pages
= 0;
557 unsigned long maxpages
;
559 struct block_device
*bdev
= NULL
;
561 if (!capable(CAP_SYS_ADMIN
))
565 for (type
= 0 ; type
< nr_swapfiles
; type
++,p
++)
566 if (!(p
->flags
& SWP_USED
))
569 if (type
>= MAX_SWAPFILES
)
571 if (type
>= nr_swapfiles
)
572 nr_swapfiles
= type
+1;
575 p
->swap_vfsmnt
= NULL
;
581 p
->sdev_lock
= SPIN_LOCK_UNLOCKED
;
584 if (swap_flags
& SWAP_FLAG_PREFER
) {
586 (swap_flags
& SWAP_FLAG_PRIO_MASK
)>>SWAP_FLAG_PRIO_SHIFT
;
588 p
->prio
= --least_priority
;
590 error
= user_path_walk(specialfile
, &nd
);
594 p
->swap_file
= nd
.dentry
;
595 p
->swap_vfsmnt
= nd
.mnt
;
596 swap_inode
= nd
.dentry
->d_inode
;
599 if (S_ISBLK(swap_inode
->i_mode
)) {
600 kdev_t dev
= swap_inode
->i_rdev
;
601 struct block_device_operations
*bdops
;
603 p
->swap_device
= dev
;
604 set_blocksize(dev
, PAGE_SIZE
);
606 bdev
= swap_inode
->i_bdev
;
607 bdops
= devfs_get_ops(devfs_get_handle_from_inode(swap_inode
));
608 if (bdops
) bdev
->bd_op
= bdops
;
610 error
= blkdev_get(bdev
, FMODE_READ
|FMODE_WRITE
, 0, BDEV_SWAP
);
613 set_blocksize(dev
, PAGE_SIZE
);
615 if (!dev
|| (blk_size
[MAJOR(dev
)] &&
616 !blk_size
[MAJOR(dev
)][MINOR(dev
)]))
619 for (i
= 0 ; i
< nr_swapfiles
; i
++) {
622 if (dev
== swap_info
[i
].swap_device
)
626 if (blk_size
[MAJOR(dev
)])
627 swapfilesize
= blk_size
[MAJOR(dev
)][MINOR(dev
)]
628 >> (PAGE_SHIFT
- 10);
629 } else if (S_ISREG(swap_inode
->i_mode
)) {
631 for (i
= 0 ; i
< nr_swapfiles
; i
++) {
632 if (i
== type
|| !swap_info
[i
].swap_file
)
634 if (swap_inode
== swap_info
[i
].swap_file
->d_inode
)
637 swapfilesize
= swap_inode
->i_size
>> PAGE_SHIFT
;
641 swap_header
= (void *) __get_free_page(GFP_USER
);
643 printk("Unable to start swapping: out of memory :-)\n");
648 lock_page(virt_to_page(swap_header
));
649 rw_swap_page_nolock(READ
, SWP_ENTRY(type
,0), (char *) swap_header
, 1);
651 if (!memcmp("SWAP-SPACE",swap_header
->magic
.magic
,10))
652 swap_header_version
= 1;
653 else if (!memcmp("SWAPSPACE2",swap_header
->magic
.magic
,10))
654 swap_header_version
= 2;
656 printk("Unable to find swap-space signature\n");
661 switch (swap_header_version
) {
663 memset(((char *) swap_header
)+PAGE_SIZE
-10,0,10);
667 for (i
= 1 ; i
< 8*PAGE_SIZE
; i
++) {
668 if (test_bit(i
,(char *) swap_header
)) {
677 p
->swap_map
= vmalloc(p
->max
* sizeof(short));
682 for (i
= 1 ; i
< p
->max
; i
++) {
683 if (test_bit(i
,(char *) swap_header
))
686 p
->swap_map
[i
] = SWAP_MAP_BAD
;
691 /* Check the swap header's sub-version and the size of
692 the swap file and bad block lists */
693 if (swap_header
->info
.version
!= 1) {
695 "Unable to handle swap header version %d\n",
696 swap_header
->info
.version
);
702 p
->highest_bit
= swap_header
->info
.last_page
- 1;
703 p
->max
= swap_header
->info
.last_page
;
705 maxpages
= SWP_OFFSET(SWP_ENTRY(0,~0UL));
706 if (p
->max
>= maxpages
)
710 if (swap_header
->info
.nr_badpages
> MAX_SWAP_BADPAGES
)
713 /* OK, set up the swap map and apply the bad block list */
714 if (!(p
->swap_map
= vmalloc (p
->max
* sizeof(short)))) {
720 memset(p
->swap_map
, 0, p
->max
* sizeof(short));
721 for (i
=0; i
<swap_header
->info
.nr_badpages
; i
++) {
722 int page
= swap_header
->info
.badpages
[i
];
723 if (page
<= 0 || page
>= swap_header
->info
.last_page
)
726 p
->swap_map
[page
] = SWAP_MAP_BAD
;
728 nr_good_pages
= swap_header
->info
.last_page
-
729 swap_header
->info
.nr_badpages
-
735 if (swapfilesize
&& p
->max
> swapfilesize
) {
737 "Swap area shorter than signature indicates\n");
741 if (!nr_good_pages
) {
742 printk(KERN_WARNING
"Empty swap-file\n");
746 p
->swap_map
[0] = SWAP_MAP_BAD
;
747 p
->flags
= SWP_WRITEOK
;
748 p
->pages
= nr_good_pages
;
750 nr_swap_pages
+= nr_good_pages
;
751 printk(KERN_INFO
"Adding Swap: %dk swap-space (priority %d)\n",
752 nr_good_pages
<<(PAGE_SHIFT
-10), p
->prio
);
754 /* insert swap space into swap_list: */
756 for (i
= swap_list
.head
; i
>= 0; i
= swap_info
[i
].next
) {
757 if (p
->prio
>= swap_info
[i
].prio
) {
764 swap_list
.head
= swap_list
.next
= p
- swap_info
;
766 swap_info
[prev
].next
= p
- swap_info
;
773 blkdev_put(bdev
, BDEV_SWAP
);
777 nd
.mnt
= p
->swap_vfsmnt
;
778 nd
.dentry
= p
->swap_file
;
781 p
->swap_vfsmnt
= NULL
;
784 if (!(swap_flags
& SWAP_FLAG_PREFER
))
789 free_page((long) swap_header
);
794 void si_swapinfo(struct sysinfo
*val
)
797 unsigned long freeswap
= 0;
798 unsigned long totalswap
= 0;
800 for (i
= 0; i
< nr_swapfiles
; i
++) {
802 if ((swap_info
[i
].flags
& SWP_WRITEOK
) != SWP_WRITEOK
)
804 for (j
= 0; j
< swap_info
[i
].max
; ++j
) {
805 switch (swap_info
[i
].swap_map
[j
]) {
815 val
->freeswap
= freeswap
;
816 val
->totalswap
= totalswap
;
821 * Verify that a swap entry is valid and increment its swap map count.
822 * Kernel_lock is held, which guarantees existence of swap device.
824 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
825 * "permanent", but will be reclaimed by the next swapoff.
827 int swap_duplicate(swp_entry_t entry
)
829 struct swap_info_struct
* p
;
830 unsigned long offset
, type
;
833 /* Swap entry 0 is illegal */
836 type
= SWP_TYPE(entry
);
837 if (type
>= nr_swapfiles
)
839 p
= type
+ swap_info
;
840 offset
= SWP_OFFSET(entry
);
841 if (offset
>= p
->max
)
843 if (!p
->swap_map
[offset
])
846 * Entry is valid, so increment the map count.
849 if (p
->swap_map
[offset
] < SWAP_MAP_MAX
)
850 p
->swap_map
[offset
]++;
852 static int overflow
= 0;
854 printk("VM: swap entry overflow\n");
855 p
->swap_map
[offset
] = SWAP_MAP_MAX
;
857 swap_device_unlock(p
);
863 printk("Bad swap file entry %08lx\n", entry
.val
);
866 printk("Bad swap offset entry %08lx\n", entry
.val
);
869 printk("Unused swap offset entry in swap_dup %08lx\n", entry
.val
);
874 * Page lock needs to be held in all cases to prevent races with
875 * swap file deletion.
877 int swap_count(struct page
*page
)
879 struct swap_info_struct
* p
;
880 unsigned long offset
, type
;
884 entry
.val
= page
->index
;
887 type
= SWP_TYPE(entry
);
888 if (type
>= nr_swapfiles
)
890 p
= type
+ swap_info
;
891 offset
= SWP_OFFSET(entry
);
892 if (offset
>= p
->max
)
894 if (!p
->swap_map
[offset
])
896 retval
= p
->swap_map
[offset
];
901 printk(KERN_ERR
"swap_count: null entry!\n");
904 printk("Bad swap file entry %08lx\n", entry
.val
);
907 printk("Bad swap offset entry %08lx\n", entry
.val
);
910 printk("Unused swap offset entry in swap_count %08lx\n", entry
.val
);
915 * Kernel_lock protects against swap device deletion.
917 void get_swaphandle_info(swp_entry_t entry
, unsigned long *offset
,
918 kdev_t
*dev
, struct inode
**swapf
)
921 struct swap_info_struct
*p
;
923 type
= SWP_TYPE(entry
);
924 if (type
>= nr_swapfiles
) {
925 printk("Internal error: bad swap-device\n");
929 p
= &swap_info
[type
];
930 *offset
= SWP_OFFSET(entry
);
931 if (*offset
>= p
->max
) {
932 printk("rw_swap_page: weirdness\n");
935 if (p
->swap_map
&& !p
->swap_map
[*offset
]) {
936 printk("VM: Bad swap entry %08lx\n", entry
.val
);
939 if (!(p
->flags
& SWP_USED
)) {
940 printk(KERN_ERR
"rw_swap_page: "
941 "Trying to swap to unused swap-device\n");
945 if (p
->swap_device
) {
946 *dev
= p
->swap_device
;
947 } else if (p
->swap_file
) {
948 *swapf
= p
->swap_file
->d_inode
;
950 printk(KERN_ERR
"rw_swap_page: no swap file or device\n");
956 * Kernel_lock protects against swap device deletion. Grab an extra
957 * reference on the swaphandle so that it does not become unused.
959 int valid_swaphandles(swp_entry_t entry
, unsigned long *offset
)
961 int ret
= 0, i
= 1 << page_cluster
;
963 struct swap_info_struct
*swapdev
= SWP_TYPE(entry
) + swap_info
;
965 *offset
= SWP_OFFSET(entry
);
966 toff
= *offset
= (*offset
>> page_cluster
) << page_cluster
;
968 swap_device_lock(swapdev
);
970 /* Don't read-ahead past the end of the swap area */
971 if (toff
>= swapdev
->max
)
973 /* Don't read in bad or busy pages */
974 if (!swapdev
->swap_map
[toff
])
976 if (swapdev
->swap_map
[toff
] == SWAP_MAP_BAD
)
978 swapdev
->swap_map
[toff
]++;
982 swap_device_unlock(swapdev
);