1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3 #include "../../inode.h"
4 #include "../../super.h"
5 #include "../../page_cache.h"
6 #include "../../carry.h"
7 #include "../../safe_link.h"
8 #include "../../vfs_ops.h"
10 #include <linux/writeback.h>
12 /* this file contains:
13 tail2extent and extent2tail */
15 /* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
/*
 * get_exclusive_access - take the per-file latch for writing.
 * @uf_info: unix-file part of the inode whose latch is taken
 *
 * Visible sequence: the asserts check reiser4_schedulable() and
 * LOCK_CNT_NIL() for both inode_sem_w and inode_sem_r; the current
 * transaction is restarted before blocking; the inode_sem_w lock counter is
 * incremented; down_write() is taken on uf_info->latch and
 * uf_info->exclusive_use is set to 1. Once the latch is held the asserts
 * expect no recorded owner (ea_owner) and a zero nr_neas counter, and
 * ON_DEBUG() records current as ea_owner.
 *
 * Undone by drop_exclusive_access().
 *
 * NOTE(review): this listing carries stray leading numbers and seems to have
 * lost brace and comment-delimiter lines; compare with the pristine file
 * before relying on it.
 */
16 void get_exclusive_access(struct unix_file_info
* uf_info
)
18 assert("nikita-3028", reiser4_schedulable());
19 assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w
));
20 assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r
));
22 * "deadlock avoidance": sometimes we commit a transaction under
23 * rw-semaphore on a file. Such commit can deadlock with another
24 * thread that captured some block (hence preventing atom from being
25 * committed) and waits on rw-semaphore.
27 reiser4_txn_restart_current();
28 LOCK_CNT_INC(inode_sem_w
);
29 down_write(&uf_info
->latch
);
30 uf_info
->exclusive_use
= 1;
/* NOTE(review): the next two asserts share the label vs-1713, so a failure report cannot tell them apart. */
31 assert("vs-1713", uf_info
->ea_owner
== NULL
);
32 assert("vs-1713", atomic_read(&uf_info
->nr_neas
) == 0);
33 ON_DEBUG(uf_info
->ea_owner
= current
);
/*
 * drop_exclusive_access - release the latch taken by get_exclusive_access().
 * @uf_info: unix-file part of the inode whose latch is released
 *
 * Visible sequence: the asserts expect current to be the recorded ea_owner
 * and nr_neas to be zero; ON_DEBUG() clears ea_owner; exclusive_use is reset
 * to 0 before up_write() on uf_info->latch; the inode_sem_w lock counter is
 * decremented; finally the current transaction is restarted.
 *
 * NOTE(review): both lock-counter asserts below share the label nikita-3049.
 */
36 void drop_exclusive_access(struct unix_file_info
* uf_info
)
38 assert("vs-1714", uf_info
->ea_owner
== current
);
39 assert("vs-1715", atomic_read(&uf_info
->nr_neas
) == 0);
40 ON_DEBUG(uf_info
->ea_owner
= NULL
);
41 uf_info
->exclusive_use
= 0;
42 up_write(&uf_info
->latch
);
43 assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r
));
44 assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w
));
45 LOCK_CNT_DEC(inode_sem_w
);
46 reiser4_txn_restart_current();
50 * nea_grabbed - do something when file semaphore is down_read-ed
53 * This is called when nonexclusive access is obtained on a file. All it does
54 * is for debugging purposes.
/*
 * Bookkeeping for a freshly obtained non-exclusive access on @uf_info:
 * increments the inode_sem_r lock counter, asserts that no exclusive owner
 * is recorded, increments nr_neas and stores current in last_reader.
 *
 * NOTE(review): drop_nonexclusive_access() decrements nr_neas only inside
 * ON_DEBUG(), so these statements are presumably debug-only as well; any
 * guard lines are not visible in this listing - confirm.
 */
56 static void nea_grabbed(struct unix_file_info
*uf_info
)
59 LOCK_CNT_INC(inode_sem_r
);
60 assert("vs-1716", uf_info
->ea_owner
== NULL
);
61 atomic_inc(&uf_info
->nr_neas
);
62 uf_info
->last_reader
= current
;
67 * get_nonexclusive_access - get nonexclusive access to a file
68 * @uf_info: unix file specific part of inode to obtain access to
70 * Nonexclusive access is obtained on a file before read, write, readpage.
/*
 * Blocking acquisition of the per-file latch for reading.
 *
 * The asserts require reiser4_schedulable() and that the transaction handle
 * of the current context has no atom; then down_read() is taken on
 * uf_info->latch.
 *
 * NOTE(review): no call to nea_grabbed() is visible in this listing;
 * presumably it follows down_read() - confirm against the full source.
 */
72 void get_nonexclusive_access(struct unix_file_info
*uf_info
)
74 assert("nikita-3029", reiser4_schedulable());
75 assert("nikita-3361", get_current_context()->trans
->atom
== NULL
);
77 down_read(&uf_info
->latch
);
82 * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
83 * @uf_info: unix file specific part of inode to obtain access to
85 * Non-blocking version of nonexclusive access obtaining.
/*
 * Non-blocking variant: stores the outcome of down_read_trylock() on
 * uf_info->latch in result.
 *
 * NOTE(review): the declaration of result, any bookkeeping on success and
 * the return statement are not visible in this listing; presumably result
 * is returned to the caller - confirm.
 */
87 int try_to_get_nonexclusive_access(struct unix_file_info
*uf_info
)
91 result
= down_read_trylock(&uf_info
->latch
);
/*
 * drop_nonexclusive_access - release a read hold on the per-file latch.
 * @uf_info: unix-file part of the inode whose latch is released
 *
 * Visible sequence: the asserts expect no exclusive owner and a positive
 * nr_neas; ON_DEBUG() decrements nr_neas; up_read() releases
 * uf_info->latch; the inode_sem_r lock counter is decremented; finally the
 * current transaction is restarted.
 */
97 void drop_nonexclusive_access(struct unix_file_info
* uf_info
)
99 assert("vs-1718", uf_info
->ea_owner
== NULL
);
100 assert("vs-1719", atomic_read(&uf_info
->nr_neas
) > 0);
101 ON_DEBUG(atomic_dec(&uf_info
->nr_neas
));
103 up_read(&uf_info
->latch
);
105 LOCK_CNT_DEC(inode_sem_r
);
106 reiser4_txn_restart_current();
109 /* part of tail2extent. Cut all items covering @count bytes starting from
111 /* Audited by: green(2002.06.15) */
/*
 * Helper of tail2extent: removes from the tree the items covering @count
 * bytes of @inode starting at byte @offset.
 *
 * Key from is built for @offset through the file plugin key_by_inode()
 * method; the offset of key to is set to the last byte of the range
 * (offset + count - 1); the value of reiser4_cut_tree() is returned.
 *
 * NOTE(review): only the offset of key to is assigned in the visible lines;
 * the statement that initialises the rest of to, and the trailing arguments
 * of reiser4_cut_tree(), are not visible in this listing - confirm.
 */
112 static int cut_formatting_items(struct inode
*inode
, loff_t offset
, int count
)
114 reiser4_key from
, to
;
116 /* AUDIT: How about putting an assertion here, what would check
117 all provided range is covered by tail items only? */
118 /* key of first byte in the range to be cut */
119 inode_file_plugin(inode
)->key_by_inode(inode
, offset
, &from
);
121 /* key of last byte in that range */
123 set_key_offset(&to
, (__u64
) (offset
+ count
- 1));
125 /* cut everything between those keys */
126 return reiser4_cut_tree(reiser4_tree_by_inode(inode
), &from
, &to
,
/*
 * Drops one reference (page_cache_release()) on entries of @pages, walking
 * indices 0 .. @nr_pages - 1.
 *
 * When a NULL entry is met, the inner loop asserts that every later entry
 * is NULL too, i.e. the array is expected to be filled from the front
 * without holes.
 *
 * NOTE(review): how the NULL case leaves the outer loop is not visible in
 * this listing; presumably it breaks out before page_cache_release() -
 * confirm.
 */
130 static void release_all_pages(struct page
**pages
, unsigned nr_pages
)
134 for (i
= 0; i
< nr_pages
; i
++) {
135 if (pages
[i
] == NULL
) {
138 for (j
= i
+ 1; j
< nr_pages
; j
++)
139 assert("vs-1620", pages
[j
] == NULL
);
143 page_cache_release(pages
[i
]);
/*
 * replace - swap already-copied tail items for extent items.
 * @inode: file being converted
 * @pages: pages holding the copied tail data; pages[0] must be set
 * @nr_pages: number of entries of @pages to process
 * @count: number of bytes of tail items to cut
 *
 * Visible sequence: cut_formatting_items() removes @count bytes starting at
 * page_offset(pages[0]); then each page is added to inode->i_mapping with
 * add_to_page_cache_lru(), unlocked, passed to find_or_create_extent() and
 * marked up to date.
 *
 * NOTE(review): the checks on result between these steps are not visible in
 * this listing. The original comment below has lost its closing line, so it
 * lexically swallows the signature; restore it from the pristine file.
 */
148 /* part of tail2extent. replace tail items with extent one. Content of tail
149 items (@count bytes) being cut are copied already into
150 pages. extent_writepage method is called to create extents corresponding to
152 static int replace(struct inode
*inode
, struct page
**pages
, unsigned nr_pages
, int count
)
161 assert("vs-596", pages
[0]);
163 /* cut copied items */
164 result
= cut_formatting_items(inode
, page_offset(pages
[0]), count
);
170 /* put into tree replacement for just removed items: extent item, namely */
171 for (i
= 0; i
< nr_pages
; i
++) {
172 result
= add_to_page_cache_lru(pages
[i
], inode
->i_mapping
,
174 mapping_gfp_mask(inode
->
178 unlock_page(pages
[i
]);
179 result
= find_or_create_extent(pages
[i
]);
182 SetPageUptodate(pages
[i
]);
/*
 * TAIL2EXTENT_PAGE_NUM (3) sizes the page array used by tail2extent() and
 * is the number of unformatted nodes accounted for below.
 *
 * reserve_tail2extent_iteration() grabs disk space for one conversion
 * iteration via reiser4_grab_space() with BA_CAN_COMMIT. The visible terms
 * of the sum are: TAIL2EXTENT_PAGE_NUM, TAIL2EXTENT_PAGE_NUM times
 * estimate_one_insert_into_item(), 1 plus estimate_one_insert_item(), and
 * the file plugin estimate.update() value for the stat-data update.
 *
 * NOTE(review): the residue text below says extent->tail although this
 * reservation serves tail2extent. unformatted_nodes is assigned but not
 * referenced by any visible term. The first line of the sum and the end of
 * the macro comment are not visible in this listing - confirm.
 */
187 #define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
190 static int reserve_tail2extent_iteration(struct inode
*inode
)
192 reiser4_block_nr unformatted_nodes
;
195 tree
= reiser4_tree_by_inode(inode
);
197 /* number of unformatted nodes which will be created */
198 unformatted_nodes
= TAIL2EXTENT_PAGE_NUM
;
201 * space required for one iteration of extent->tail conversion:
203 * 1. kill N tail items
205 * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
207 * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
208 * extents) extent units.
210 * 4. drilling to the leaf level by coord_by_key()
212 * 5. possible update of stat-data
216 return reiser4_grab_space
218 TAIL2EXTENT_PAGE_NUM
+
219 TAIL2EXTENT_PAGE_NUM
* estimate_one_insert_into_item(tree
) +
220 1 + estimate_one_insert_item(tree
) +
221 inode_file_plugin(inode
)->estimate
.update(inode
), BA_CAN_COMMIT
);
224 /* clear the stat-data flag (REISER4_PART_MIXED) that marks an unfinished conversion */
/*
 * Visible sequence: space for a stat-data update is grabbed with
 * reiser4_grab_space(), REISER4_PART_MIXED is cleared in the in-memory
 * inode, and reiser4_update_sd() writes the stat-data; a warning carrying
 * the object id and the error code is logged for a failed update.
 *
 * NOTE(review): the value of reiser4_grab_space() is not stored in the
 * visible lines, and the return statement is not visible; presumably
 * result is returned - confirm. Visible callers ignore the return value.
 */
225 static int complete_conversion(struct inode
*inode
)
231 reiser4_grab_space(inode_file_plugin(inode
)->estimate
.update(inode
),
234 reiser4_inode_clr_flag(inode
, REISER4_PART_MIXED
);
235 result
= reiser4_update_sd(inode
);
238 warning("vs-1696", "Failed to clear converting bit of %llu: %i",
239 (unsigned long long)get_inode_oid(inode
), result
);
249 * this is used by tail2extent and extent2tail to detect where previous
250 * uncompleted conversion stopped
/*
 * find_start - locate where an interrupted conversion stopped.
 * @inode: file being examined
 * @id: item plugin id to look for (callers pass FORMATTING_ID or
 *      EXTENT_POINTER_ID)
 * @offset: in: offset to start the search from; out: offset of the key at
 *          which the search ended
 *
 * Visible logic: a key is built for *offset; inside a do/while loop that
 * runs while result is 0 and nothing was found, the item at the key is
 * looked up with find_file_item_nohint() under ZNODE_READ_LOCK. For a hit
 * positioned AT_UNIT the node is loaded and the item id is compared with
 * @id; the item append_key() method is used to move the key. A -ENOENT
 * result is produced on one of the branches; callers treat it as no item
 * of kind @id is left.
 *
 * NOTE(review): branch bodies, unlocking and the return statement are not
 * visible in this listing, and ufo is assigned but not visibly used -
 * confirm against the full source.
 */
252 static int find_start(struct inode
*inode
, reiser4_plugin_id id
, __u64
*offset
)
257 struct unix_file_info
*ufo
;
261 ufo
= unix_file_inode_data(inode
);
265 inode_file_plugin(inode
)->key_by_inode(inode
, *offset
, &key
);
268 result
= find_file_item_nohint(&coord
, &lh
, &key
,
269 ZNODE_READ_LOCK
, inode
);
271 if (result
== CBK_COORD_FOUND
) {
272 if (coord
.between
== AT_UNIT
) {
273 /*coord_clear_iplug(&coord); */
274 result
= zload(coord
.node
);
276 if (item_id_by_coord(&coord
) == id
)
279 item_plugin_by_coord(&coord
)->s
.
280 file
.append_key(&coord
,
285 result
= RETERR(-ENOENT
);
288 } while (result
== 0 && !found
);
289 *offset
= get_key_offset(&key
);
/*
 * tail2extent - convert a file stored in tail (formatting) items to extents.
 * @uf_info: unix-file part of the inode; the caller must hold exclusive
 *           access (asserted through ea_obtained())
 *
 * Preconditions asserted: the inode is not read-only, the container is not
 * already UF_CONTAINER_EXTENTS and REISER4_PART_IN_CONV is clear.
 *
 * Visible flow:
 *  - When REISER4_PART_MIXED is set (an earlier conversion did not finish),
 *    find_start() looks for the first remaining FORMATTING_ID item; -ENOENT
 *    means everything is converted already, so the container is switched to
 *    extents and complete_conversion() is called.
 *  - REISER4_PART_IN_CONV is set for the duration of the conversion.
 *  - Each iteration reserves space with reserve_tail2extent_iteration(),
 *    marks the inode REISER4_PART_MIXED on the first iteration, then fills
 *    up to TAIL2EXTENT_PAGE_NUM freshly allocated pages: item bytes are
 *    copied under kmap_atomic(), and the rest of the page is zeroed when
 *    the lookup lands AFTER_UNIT (end of file).
 *  - replace() swaps the copied tail items for extents and the page
 *    references are dropped with release_all_pages().
 *  - Exclusive access is dropped around reiser4_throttle_write() and taken
 *    again; REISER4_PART_MIXED is asserted to still be set afterwards.
 *  - On completion REISER4_PART_IN_CONV is cleared, the container becomes
 *    UF_CONTAINER_EXTENTS and complete_conversion() is called. On the
 *    failure path the pages are released, REISER4_PART_IN_CONV is cleared
 *    and a warning with the object id and error is logged.
 *  - The last assert checks that REISER4_PART_IN_CONV is clear on exit.
 *
 * NOTE(review): the return values of complete_conversion() are discarded at
 * both visible call sites. Many lines (braces, declarations, gotos, the
 * return) are not visible in this listing - confirm details against the
 * full source.
 */
299 int tail2extent(struct unix_file_info
*uf_info
)
302 reiser4_key key
; /* key of next byte to be moved to page */
303 char *p_data
; /* data of page */
304 unsigned page_off
= 0, /* offset within the page where to copy data */
305 count
; /* number of bytes of item which can be
307 struct page
*pages
[TAIL2EXTENT_PAGE_NUM
];
309 int done
; /* set to 1 when all file is read */
317 assert("nikita-3362", ea_obtained(uf_info
));
318 inode
= unix_file_info_to_inode(uf_info
);
319 assert("nikita-3412", !IS_RDONLY(inode
));
320 assert("vs-1649", uf_info
->container
!= UF_CONTAINER_EXTENTS
);
321 assert("", !reiser4_inode_get_flag(inode
, REISER4_PART_IN_CONV
));
326 if (reiser4_inode_get_flag(inode
, REISER4_PART_MIXED
)) {
328 * file is marked on disk as there was a conversion which did
329 * not complete due to either crash or some error. Find which
330 * offset tail conversion stopped at
332 result
= find_start(inode
, FORMATTING_ID
, &offset
);
333 if (result
== -ENOENT
) {
334 /* no tail items found, everything is converted */
335 uf_info
->container
= UF_CONTAINER_EXTENTS
;
336 complete_conversion(inode
);
338 } else if (result
!= 0)
339 /* some other error */
344 reiser4_inode_set_flag(inode
, REISER4_PART_IN_CONV
);
346 /* get key of first byte of a file */
347 inode_file_plugin(inode
)->key_by_inode(inode
, offset
, &key
);
351 memset(pages
, 0, sizeof(pages
));
352 result
= reserve_tail2extent_iteration(inode
);
354 reiser4_inode_clr_flag(inode
, REISER4_PART_IN_CONV
);
357 if (first_iteration
) {
358 reiser4_inode_set_flag(inode
, REISER4_PART_MIXED
);
/* NOTE(review): the result of reiser4_update_sd() is not stored here. */
359 reiser4_update_sd(inode
);
363 for (i
= 0; i
< sizeof_array(pages
) && done
== 0; i
++) {
365 (get_key_offset(&key
) & ~PAGE_CACHE_MASK
) == 0);
366 page
= alloc_page(reiser4_ctx_gfp_mask_get());
368 result
= RETERR(-ENOMEM
);
373 (unsigned long)(get_key_offset(&key
) >>
376 * usually when one is going to longterm lock znode (as
377 * find_file_item does, for instance) he must not hold
378 * locked pages. However, there is an exception for
379 * case tail2extent. Pages appearing here are not
380 * reachable to everyone else, they are clean, they do
381 * not have jnodes attached so keeping them locked do
382 * not risk deadlock appearance
384 assert("vs-983", !PagePrivate(page
));
385 reiser4_invalidate_pages(inode
->i_mapping
, page
->index
,
388 for (page_off
= 0; page_off
< PAGE_CACHE_SIZE
;) {
393 /* FIXME: we might want to readahead here */
396 find_file_item_nohint(&coord
, &lh
, &key
,
399 if (result
!= CBK_COORD_FOUND
) {
401 * error happened of not items of file
405 page_cache_release(page
);
409 if (coord
.between
== AFTER_UNIT
) {
411 * end of file is reached. Padd page
416 p_data
= kmap_atomic(page
, KM_USER0
);
417 memset(p_data
+ page_off
, 0,
418 PAGE_CACHE_SIZE
- page_off
);
419 kunmap_atomic(p_data
, KM_USER0
);
423 result
= zload(coord
.node
);
425 page_cache_release(page
);
429 assert("vs-856", coord
.between
== AT_UNIT
);
430 item
= ((char *)item_body_by_coord(&coord
)) +
433 /* how many bytes to copy */
435 item_length_by_coord(&coord
) -
437 /* limit length of copy to end of page */
438 if (count
> PAGE_CACHE_SIZE
- page_off
)
439 count
= PAGE_CACHE_SIZE
- page_off
;
442 * copy item (as much as will fit starting from
443 * the beginning of the item) into the page
445 p_data
= kmap_atomic(page
, KM_USER0
);
446 memcpy(p_data
+ page_off
, item
, count
);
447 kunmap_atomic(p_data
, KM_USER0
);
452 get_key_offset(&key
) + count
);
456 } /* end of loop which fills one page by content of
457 * formatting items */
460 /* something was copied into page */
463 page_cache_release(page
);
464 assert("vs-1648", done
== 1);
467 } /* end of loop through pages of one conversion iteration */
470 result
= replace(inode
, pages
, i
, bytes
);
471 release_all_pages(pages
, sizeof_array(pages
));
475 * We have to drop exclusive access to avoid deadlock
476 * which may happen because called by reiser4_writepages
477 * capture_unix_file requires to get non-exclusive
478 * access to a file. It is safe to drop EA in the middle
479 * of tail2extent conversion because write_unix_file,
480 * setattr_unix_file(truncate), mmap_unix_file,
481 * release_unix_file(extent2tail) checks if conversion
482 * is not in progress (see comments before
483 * get_exclusive_access_careful().
484 * Other processes that acquire non-exclusive access
485 * (read_unix_file, reiser4_writepages, etc) should work
486 * on partially converted files.
488 drop_exclusive_access(uf_info
);
489 /* throttle the conversion
490 FIXME-EDWARD: Pass the precise number of pages
492 reiser4_throttle_write(inode
, 1);
493 get_exclusive_access(uf_info
);
496 * nobody is allowed to complete conversion but a
497 * process which started it
499 assert("", reiser4_inode_get_flag(inode
,
500 REISER4_PART_MIXED
));
504 /* file is converted to extent items */
505 reiser4_inode_clr_flag(inode
, REISER4_PART_IN_CONV
);
506 assert("vs-1697", reiser4_inode_get_flag(inode
,
507 REISER4_PART_MIXED
));
509 uf_info
->container
= UF_CONTAINER_EXTENTS
;
510 complete_conversion(inode
);
513 * conversion is not complete. Inode was already marked as
514 * REISER4_PART_MIXED and stat-data were updated at the first
515 * iteration of the loop above.
518 release_all_pages(pages
, sizeof_array(pages
));
519 reiser4_inode_clr_flag(inode
, REISER4_PART_IN_CONV
);
520 warning("edward-1548", "Partial conversion of %llu: %i",
521 (unsigned long long)get_inode_oid(inode
), result
);
525 /* this flag should be cleared, otherwise get_exclusive_access_careful()
526 will fall into infinite loop */
527 assert("edward-1549", !reiser4_inode_get_flag(inode
,
528 REISER4_PART_IN_CONV
));
/*
 * Grabs disk space for one page worth of extent->tail conversion via
 * reiser4_grab_space() with BA_CAN_COMMIT.
 *
 * The sum is: estimate_one_item_removal() for removing the extent,
 * estimate_insert_flow() at the current tree height for inserting the tail,
 * 1 plus estimate_one_insert_item() for the tree descent, and the file
 * plugin estimate.update() value for a possible stat-data update.
 * Returns whatever reiser4_grab_space() returns.
 */
532 static int reserve_extent2tail_iteration(struct inode
*inode
)
536 tree
= reiser4_tree_by_inode(inode
);
538 * reserve blocks for (in this order):
540 * 1. removal of extent item
542 * 2. insertion of tail by insert_flow()
544 * 3. drilling to the leaf level by coord_by_key()
546 * 4. possible update of stat-data
549 return reiser4_grab_space
550 (estimate_one_item_removal(tree
) +
551 estimate_insert_flow(tree
->height
) +
552 1 + estimate_one_insert_item(tree
) +
553 inode_file_plugin(inode
)->estimate
.update(inode
), BA_CAN_COMMIT
);
556 /* for every page of file: read page, cut part of extent pointing to this page,
557 put data of the page back into the tree as a tail item */
/*
 * extent2tail - convert a file stored in extents to tail (formatting) items.
 * @file: open file used for the tail write; asserted to be non-NULL and to
 *        refer to the same inode
 * @uf_info: unix-file part of the inode; the caller must hold exclusive
 *           access (asserted through ea_obtained())
 *
 * Preconditions asserted: the inode is not read-only, the container is not
 * already UF_CONTAINER_TAILS and REISER4_PART_IN_CONV is clear.
 *
 * Visible flow:
 *  - When REISER4_PART_MIXED is set, find_start() looks for the first
 *    remaining EXTENT_POINTER_ID item; -ENOENT means everything is
 *    converted already, so the container is switched to tails and
 *    complete_conversion() is called.
 *  - REISER4_PART_IN_CONV is set; num_pages and start_page are derived
 *    from i_size and the starting offset.
 *  - Per page: reserve space, mark the inode REISER4_PART_MIXED on the very
 *    first page of a conversion that starts at offset 0, read the page with
 *    read_mapping_page(), fail with -EIO when it is not up to date, cut the
 *    page-sized key range with reiser4_cut_tree(), write the data back with
 *    reiser4_write_tail() (the last page may be shorter than a full page),
 *    drop the page, then drop exclusive access around
 *    reiser4_throttle_write() and take it again.
 *  - Afterwards REISER4_PART_IN_CONV is cleared. When all pages were
 *    processed the container becomes UF_CONTAINER_TAILS and
 *    complete_conversion() is called; otherwise a warning reports the
 *    partial conversion.
 *
 * NOTE(review): many lines (braces, declarations, error checks, the return)
 * are not visible in this listing - confirm details against the full
 * source. The return values of complete_conversion() and of
 * reiser4_update_sd() are discarded at the visible call sites.
 */
558 int extent2tail(struct file
* file
, struct unix_file_info
*uf_info
)
563 unsigned long num_pages
, i
;
564 unsigned long start_page
;
570 assert("nikita-3362", ea_obtained(uf_info
));
571 inode
= unix_file_info_to_inode(uf_info
);
572 assert("nikita-3412", !IS_RDONLY(inode
));
573 assert("vs-1649", uf_info
->container
!= UF_CONTAINER_TAILS
);
574 assert("", !reiser4_inode_get_flag(inode
, REISER4_PART_IN_CONV
));
577 if (reiser4_inode_get_flag(inode
, REISER4_PART_MIXED
)) {
579 * file is marked on disk as there was a conversion which did
580 * not complete due to either crash or some error. Find which
581 * offset tail conversion stopped at
583 result
= find_start(inode
, EXTENT_POINTER_ID
, &offset
);
584 if (result
== -ENOENT
) {
585 /* no extent found, everything is converted */
586 uf_info
->container
= UF_CONTAINER_TAILS
;
587 complete_conversion(inode
);
589 } else if (result
!= 0)
590 /* some other error */
594 reiser4_inode_set_flag(inode
, REISER4_PART_IN_CONV
);
596 /* number of pages in the file */
/* NOTE(review): the expression below contains + - offset, i.e. a unary minus after the plus; it evaluates as a subtraction. */
598 (inode
->i_size
+ - offset
+ PAGE_CACHE_SIZE
- 1) >> PAGE_CACHE_SHIFT
;
599 start_page
= offset
>> PAGE_CACHE_SHIFT
;
601 inode_file_plugin(inode
)->key_by_inode(inode
, offset
, &from
);
605 for (i
= 0; i
< num_pages
; i
++) {
608 result
= reserve_extent2tail_iteration(inode
);
611 if (i
== 0 && offset
== 0) {
612 reiser4_inode_set_flag(inode
, REISER4_PART_MIXED
);
613 reiser4_update_sd(inode
);
/* NOTE(review): the (unsigned) cast below narrows an unsigned long page index; harmless only while the index fits in unsigned - confirm. */
616 page
= read_mapping_page(inode
->i_mapping
,
617 (unsigned)(i
+ start_page
), NULL
);
619 result
= PTR_ERR(page
);
623 wait_on_page_locked(page
);
625 if (!PageUptodate(page
)) {
626 page_cache_release(page
);
627 result
= RETERR(-EIO
);
631 /* cut part of file we have read */
/* NOTE(review): the shift below is done in unsigned long and only then cast to __u64; where unsigned long is 32 bits wide it can wrap for large indices - confirm. */
632 start_byte
= (__u64
) ((i
+ start_page
) << PAGE_CACHE_SHIFT
);
633 set_key_offset(&from
, start_byte
);
634 set_key_offset(&to
, start_byte
+ PAGE_CACHE_SIZE
- 1);
636 * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
637 * commits during over-long truncates. But
638 * extent->tail conversion should be performed in one
641 result
= reiser4_cut_tree(reiser4_tree_by_inode(inode
), &from
,
645 page_cache_release(page
);
649 /* put page data into tree via tail_write */
650 count
= PAGE_CACHE_SIZE
;
651 if ((i
== (num_pages
- 1)) &&
652 (inode
->i_size
& ~PAGE_CACHE_MASK
))
653 /* last page can be incompleted */
654 count
= (inode
->i_size
& ~PAGE_CACHE_MASK
);
656 loff_t pos
= start_byte
;
658 assert("edward-1537",
659 file
!= NULL
&& file
->f_dentry
!= NULL
);
660 assert("edward-1538",
661 file
->f_dentry
->d_inode
== inode
);
663 result
= reiser4_write_tail(file
, inode
,
664 (char __user
*)kmap(page
),
666 reiser4_free_file_fsdata(file
);
668 warning("", "reiser4_write_tail failed");
669 page_cache_release(page
);
670 reiser4_inode_clr_flag(inode
, REISER4_PART_IN_CONV
);
678 /* page is already detached from jnode and mapping. */
679 assert("vs-1086", page
->mapping
== NULL
);
680 assert("nikita-2690",
681 (!PagePrivate(page
) && jprivate(page
) == 0));
682 /* waiting for writeback completion with page lock held is
683 * perfectly valid. */
684 wait_on_page_writeback(page
);
685 reiser4_drop_page(page
);
686 /* release reference taken by read_cache_page() above */
687 page_cache_release(page
);
689 drop_exclusive_access(uf_info
);
691 * throttle the conversion.
692 * FIXME-EDWARD: Calculate and pass the precise number
693 * of pages that was dirtied
695 reiser4_throttle_write(inode
, 1);
696 get_exclusive_access(uf_info
);
698 * nobody is allowed to complete conversion but a process which
701 assert("", reiser4_inode_get_flag(inode
, REISER4_PART_MIXED
));
704 reiser4_inode_clr_flag(inode
, REISER4_PART_IN_CONV
);
706 if (i
== num_pages
) {
707 /* file is converted to formatted items */
708 assert("vs-1698", reiser4_inode_get_flag(inode
,
709 REISER4_PART_MIXED
));
711 inode_has_no_jnodes(reiser4_inode_data(inode
)));
713 uf_info
->container
= UF_CONTAINER_TAILS
;
714 complete_conversion(inode
);
718 * conversion is not complete. Inode was already marked as
719 * REISER4_PART_MIXED and stat-data were updated at the first
720 * iteration of the loop above.
722 warning("nikita-2282",
723 "Partial conversion of %llu: %lu of %lu: %i",
724 (unsigned long long)get_inode_oid(inode
), i
,
727 /* this flag should be cleared, otherwise get_exclusive_access_careful()
728 will fall into infinite loop */
729 assert("edward-1550", !reiser4_inode_get_flag(inode
,
730 REISER4_PART_IN_CONV
));
736 * c-indentation-style: "K&R"