1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3 #include "../../inode.h"
4 #include "../../super.h"
5 #include "../../page_cache.h"
6 #include "../../carry.h"
7 #include "../../safe_link.h"
8 #include "../../vfs_ops.h"
10 #include <linux/writeback.h>
12 /* this file contains:
13 tail2extent and extent2tail */
15 /* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
16 void get_exclusive_access(struct unix_file_info
* uf_info
)
18 assert("nikita-3028", reiser4_schedulable());
19 assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w
));
20 assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r
));
22 * "deadlock avoidance": sometimes we commit a transaction under
23 * rw-semaphore on a file. Such commit can deadlock with another
24 * thread that captured some block (hence preventing atom from being
25 * committed) and waits on rw-semaphore.
27 reiser4_txn_restart_current();
28 LOCK_CNT_INC(inode_sem_w
);
29 down_write(&uf_info
->latch
);
30 uf_info
->exclusive_use
= 1;
31 assert("vs-1713", uf_info
->ea_owner
== NULL
);
32 assert("vs-1713", atomic_read(&uf_info
->nr_neas
) == 0);
33 ON_DEBUG(uf_info
->ea_owner
= current
);
36 void drop_exclusive_access(struct unix_file_info
* uf_info
)
38 assert("vs-1714", uf_info
->ea_owner
== current
);
39 assert("vs-1715", atomic_read(&uf_info
->nr_neas
) == 0);
40 ON_DEBUG(uf_info
->ea_owner
= NULL
);
41 uf_info
->exclusive_use
= 0;
42 up_write(&uf_info
->latch
);
43 assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r
));
44 assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w
));
45 LOCK_CNT_DEC(inode_sem_w
);
46 reiser4_txn_restart_current();
50 * nea_grabbed - do something when file semaphore is down_read-ed
53 * This is called when nonexclusive access is obtained on a file. Everything it
54 * does is for debugging purposes only.
/*
 * Bookkeeping performed once the file latch has been taken for reading:
 * raises the inode_sem_r lock counter, asserts that nobody is recorded as the
 * exclusive owner, increments uf_info->nr_neas and records the current task
 * in uf_info->last_reader.
 *
 * NOTE(review): lines between the signature and the first statement (and
 * after the last one) are not visible in this view; whether the body is
 * compiled conditionally cannot be confirmed from here.
 */
56 static void nea_grabbed(struct unix_file_info
*uf_info
)
59 LOCK_CNT_INC(inode_sem_r
);
60 assert("vs-1716", uf_info
->ea_owner
== NULL
);
61 atomic_inc(&uf_info
->nr_neas
);
62 uf_info
->last_reader
= current
;
67 * get_nonexclusive_access - get nonexclusive access to a file
68 * @uf_info: unix file specific part of inode to obtain access to
70 * Nonexclusive access is obtained on a file before read, write, readpage.
/*
 * Blocking acquisition of shared access: asserts reiser4_schedulable() and
 * that the current context's transaction handle has no atom attached, then
 * takes uf_info->latch with down_read().
 *
 * NOTE(review): the line following down_read() is not visible in this view;
 * presumably the reader bookkeeping (nea_grabbed()) happens there - confirm.
 */
72 void get_nonexclusive_access(struct unix_file_info
*uf_info
)
74 assert("nikita-3029", reiser4_schedulable());
75 assert("nikita-3361", get_current_context()->trans
->atom
== NULL
);
77 down_read(&uf_info
->latch
);
82 * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
83 * @uf_info: unix file specific part of inode to obtain access to
85 * Non-blocking version of nonexclusive access obtaining.
/*
 * Non-blocking acquisition of shared access: calls down_read_trylock() on
 * uf_info->latch and stores its outcome in result.
 *
 * NOTE(review): the declaration of result, whatever is done on success, and
 * the return statement are not visible in this view.
 */
87 int try_to_get_nonexclusive_access(struct unix_file_info
*uf_info
)
91 result
= down_read_trylock(&uf_info
->latch
);
97 void drop_nonexclusive_access(struct unix_file_info
* uf_info
)
99 assert("vs-1718", uf_info
->ea_owner
== NULL
);
100 assert("vs-1719", atomic_read(&uf_info
->nr_neas
) > 0);
101 ON_DEBUG(atomic_dec(&uf_info
->nr_neas
));
103 up_read(&uf_info
->latch
);
105 LOCK_CNT_DEC(inode_sem_r
);
106 reiser4_txn_restart_current();
109 /* part of tail2extent. Cut all items covering @count bytes starting from @offset */
111 /* Audited by: green(2002.06.15) */
/*
 * Removes the items of @inode that cover the byte range
 * [@offset, @offset + @count - 1]: the key of the first byte is built with
 * the file plugin's key_by_inode(), the key of the last byte gets its offset
 * set to @offset + @count - 1, and both are handed to reiser4_cut_tree(),
 * whose result is returned.
 *
 * NOTE(review): the initialisation of `to` before set_key_offset() and the
 * trailing arguments of reiser4_cut_tree() are not visible in this view.
 */
112 static int cut_formatting_items(struct inode
*inode
, loff_t offset
, int count
)
114 reiser4_key from
, to
;
116 /* AUDIT: How about putting an assertion here, what would check
117 all provided range is covered by tail items only? */
118 /* key of first byte in the range to be cut */
119 inode_file_plugin(inode
)->key_by_inode(inode
, offset
, &from
);
121 /* key of last byte in that range */
123 set_key_offset(&to
, (__u64
) (offset
+ count
- 1));
125 /* cut everything between those keys */
126 return reiser4_cut_tree(reiser4_tree_by_inode(inode
), &from
, &to
,
/*
 * Walks pages[0 .. nr_pages) and drops a reference on entries with
 * page_cache_release(). When a NULL entry is met, every later entry is
 * asserted to be NULL as well, i.e. the array is expected to be filled from
 * the front without holes.
 *
 * NOTE(review): the statements between the assertion loop and
 * page_cache_release() are not visible in this view; presumably the walk
 * stops at the first NULL entry - confirm.
 */
130 static void release_all_pages(struct page
**pages
, unsigned nr_pages
)
134 for (i
= 0; i
< nr_pages
; i
++) {
135 if (pages
[i
] == NULL
) {
137 for (j
= i
+ 1; j
< nr_pages
; j
++)
138 assert("vs-1620", pages
[j
] == NULL
);
141 page_cache_release(pages
[i
]);
146 /* part of tail2extent. Replace tail items with an extent item. The content of
147 the tail items being cut (@count bytes) has already been copied into
148 pages. extent_writepage method is called to create extents corresponding to
those pages (NOTE: the code below calls find_or_create_extent() on each page) */
/*
 * Swaps already-copied tail items for extents.
 *
 * @inode:    file being converted
 * @pages:    pages holding the copied data; pages[0] must be non-NULL
 * @nr_pages: number of entries of @pages to process
 * @count:    number of bytes of formatting items to cut
 *
 * First cuts @count bytes of formatting items starting at
 * page_offset(pages[0]). Then, for each page, in this order: inserts it into
 * inode->i_mapping with add_to_page_cache_lru(), unlocks it, calls
 * find_or_create_extent() on it and marks it up to date.
 *
 * NOTE(review): the checks of `result` between these steps and the return
 * statement are not visible in this view.
 */
150 static int replace(struct inode
*inode
, struct page
**pages
, unsigned nr_pages
, int count
)
159 assert("vs-596", pages
[0]);
161 /* cut copied items */
162 result
= cut_formatting_items(inode
, page_offset(pages
[0]), count
);
168 /* put into tree replacement for just removed items: extent item, namely */
169 for (i
= 0; i
< nr_pages
; i
++) {
170 result
= add_to_page_cache_lru(pages
[i
], inode
->i_mapping
,
172 mapping_gfp_mask(inode
->
176 unlock_page(pages
[i
]);
177 result
= find_or_create_extent(pages
[i
]);
180 SetPageUptodate(pages
[i
]);
/* Sizes the pages[] array in tail2extent() and therefore bounds how many
   pages are gathered per conversion iteration; it also appears in the space
   reservation computed by reserve_tail2extent_iteration(). */
185 #define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail items */
/*
 * Reserves disk space for one iteration of tail2extent() by calling
 * reiser4_grab_space() with BA_CAN_COMMIT and returning its result. The
 * visible terms of the estimate are: TAIL2EXTENT_PAGE_NUM, plus
 * TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree), plus
 * 1 + estimate_one_insert_item(tree), plus the file plugin's estimate.update().
 *
 * NOTE(review): the list below is titled "extent->tail" although this helper
 * serves the tail->extent direction. The first line of the grab expression is
 * not visible in this view, so the estimate may contain further terms;
 * unformatted_nodes is assigned but its use is not visible either.
 */
188 static int reserve_tail2extent_iteration(struct inode
*inode
)
190 reiser4_block_nr unformatted_nodes
;
193 tree
= reiser4_tree_by_inode(inode
);
195 /* number of unformatted nodes which will be created */
196 unformatted_nodes
= TAIL2EXTENT_PAGE_NUM
;
199 * space required for one iteration of extent->tail conversion:
201 * 1. kill N tail items
203 * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
205 * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
206 * extents) extent units.
208 * 4. drilling to the leaf level by coord_by_key()
210 * 5. possible update of stat-data
214 return reiser4_grab_space
216 TAIL2EXTENT_PAGE_NUM
+
217 TAIL2EXTENT_PAGE_NUM
* estimate_one_insert_into_item(tree
) +
218 1 + estimate_one_insert_item(tree
) +
219 inode_file_plugin(inode
)->estimate
.update(inode
), BA_CAN_COMMIT
);
222 /* clear the stat-data flag (REISER4_PART_MIXED) indicating that a conversion is in progress */
/*
 * Finishes a conversion: grabs space sized by the file plugin's
 * estimate.update(), clears REISER4_PART_MIXED on the inode and writes the
 * stat-data with reiser4_update_sd(). Warning "vs-1696" reports the inode's
 * oid together with `result`.
 *
 * All callers in this file ignore the return value.
 *
 * NOTE(review): the conditions guarding these steps and the value returned
 * are not visible in this view.
 */
223 static int complete_conversion(struct inode
*inode
)
229 reiser4_grab_space(inode_file_plugin(inode
)->estimate
.update(inode
),
232 reiser4_inode_clr_flag(inode
, REISER4_PART_MIXED
);
233 result
= reiser4_update_sd(inode
);
236 warning("vs-1696", "Failed to clear converting bit of %llu: %i",
237 (unsigned long long)get_inode_oid(inode
), result
);
247 * this is used by tail2extent and extent2tail to detect where previous
248 * uncompleted conversion stopped
/*
 * Searches @inode for an item whose plugin id equals @id, starting at
 * *@offset.
 *
 * @inode:  file to scan
 * @id:     item plugin id looked for (callers pass FORMATTING_ID or
 *          EXTENT_POINTER_ID)
 * @offset: in: file offset the first key is built from; out: offset of the
 *          key at which the scan ended
 *
 * Each round looks the current key up with find_file_item_nohint() under
 * ZNODE_READ_LOCK. If the lookup lands AT_UNIT the node is zload()ed and the
 * item id compared with @id; for other items the item plugin's
 * s.file.append_key() is invoked (presumably to advance the key past that
 * item - confirm). -ENOENT is produced on one of the branches. The loop
 * repeats while result == 0 and nothing was found.
 *
 * NOTE(review): the loop head, the local declarations, the branch bodies and
 * the return statement are not visible in this view; `ufo` is assigned but
 * its use is not visible.
 */
250 static int find_start(struct inode
*inode
, reiser4_plugin_id id
, __u64
*offset
)
255 struct unix_file_info
*ufo
;
259 ufo
= unix_file_inode_data(inode
);
263 inode_file_plugin(inode
)->key_by_inode(inode
, *offset
, &key
);
266 result
= find_file_item_nohint(&coord
, &lh
, &key
,
267 ZNODE_READ_LOCK
, inode
);
269 if (result
== CBK_COORD_FOUND
) {
270 if (coord
.between
== AT_UNIT
) {
271 /*coord_clear_iplug(&coord); */
272 result
= zload(coord
.node
);
274 if (item_id_by_coord(&coord
) == id
)
277 item_plugin_by_coord(&coord
)->s
.
278 file
.append_key(&coord
,
283 result
= RETERR(-ENOENT
);
286 } while (result
== 0 && !found
);
287 *offset
= get_key_offset(&key
);
/*
 * tail2extent - convert a file stored in tail (formatting) items to extents
 * @uf_info: unix file specific part of the inode; exclusive access must
 *           already be held (asserted through ea_obtained())
 *
 * Asserted preconditions: the inode is not read-only, the container is not
 * already UF_CONTAINER_EXTENTS and REISER4_PART_IN_CONV is not set.
 *
 * Resuming: when REISER4_PART_MIXED is set an earlier conversion did not
 * finish, and find_start(inode, FORMATTING_ID, &offset) is asked where the
 * remaining formatting items begin. -ENOENT means none are left, so the
 * container is switched to extents and complete_conversion() is called.
 *
 * Conversion: REISER4_PART_IN_CONV is set while the conversion runs and
 * cleared afterwards. Every iteration
 *   - reserves space with reserve_tail2extent_iteration();
 *   - on the first iteration sets REISER4_PART_MIXED and updates stat-data;
 *   - fills up to TAIL2EXTENT_PAGE_NUM freshly allocated pages with bytes
 *     copied out of the items found by find_file_item_nohint(), zero-filling
 *     the remainder of a page once the end of the file is reached;
 *   - calls replace() to swap the copied items for extents and releases the
 *     pages;
 *   - drops exclusive access, throttles with reiser4_throttle_write() and
 *     takes exclusive access again (see the deadlock note in the body).
 *
 * Outcome: on completion the container becomes UF_CONTAINER_EXTENTS and
 * complete_conversion() is called; otherwise the pages are released and a
 * "Partial conversion" warning is emitted.
 *
 * NOTE(review): several declarations, error paths and the return statements
 * are not visible in this view, so return values are not documented here.
 * The comment near the end of the body names the flag REISER4_PART_CONV,
 * while the code uses REISER4_PART_MIXED.
 */
297 int tail2extent(struct unix_file_info
*uf_info
)
300 reiser4_key key
; /* key of next byte to be moved to page */
301 char *p_data
; /* data of page */
302 unsigned page_off
= 0, /* offset within the page where to copy data */
303 count
; /* number of bytes of item which can be
305 struct page
*pages
[TAIL2EXTENT_PAGE_NUM
];
307 int done
; /* set to 1 when all file is read */
315 assert("nikita-3362", ea_obtained(uf_info
));
316 inode
= unix_file_info_to_inode(uf_info
);
317 assert("nikita-3412", !IS_RDONLY(inode
));
318 assert("vs-1649", uf_info
->container
!= UF_CONTAINER_EXTENTS
);
319 assert("", !reiser4_inode_get_flag(inode
, REISER4_PART_IN_CONV
));
324 if (reiser4_inode_get_flag(inode
, REISER4_PART_MIXED
)) {
326 * file is marked on disk as there was a conversion which did
327 * not complete due to either crash or some error. Find which
328 * offset tail conversion stopped at
330 result
= find_start(inode
, FORMATTING_ID
, &offset
);
331 if (result
== -ENOENT
) {
332 /* no tail items found, everything is converted */
333 uf_info
->container
= UF_CONTAINER_EXTENTS
;
334 complete_conversion(inode
);
336 } else if (result
!= 0)
337 /* some other error */
342 reiser4_inode_set_flag(inode
, REISER4_PART_IN_CONV
);
344 /* get key of first byte of a file */
345 inode_file_plugin(inode
)->key_by_inode(inode
, offset
, &key
);
349 memset(pages
, 0, sizeof(pages
));
350 result
= reserve_tail2extent_iteration(inode
);
353 if (first_iteration
) {
354 reiser4_inode_set_flag(inode
, REISER4_PART_MIXED
);
355 reiser4_update_sd(inode
);
359 for (i
= 0; i
< sizeof_array(pages
) && done
== 0; i
++) {
361 (get_key_offset(&key
) & ~PAGE_CACHE_MASK
) == 0);
362 page
= alloc_page(reiser4_ctx_gfp_mask_get());
364 result
= RETERR(-ENOMEM
);
369 (unsigned long)(get_key_offset(&key
) >>
372 * usually when one is going to longterm lock znode (as
373 * find_file_item does, for instance) he must not hold
374 * locked pages. However, there is an exception for
375 * case tail2extent. Pages appearing here are not
376 * reachable to everyone else, they are clean, they do
377 * not have jnodes attached so keeping them locked do
378 * not risk deadlock appearance
380 assert("vs-983", !PagePrivate(page
));
381 reiser4_invalidate_pages(inode
->i_mapping
, page
->index
,
384 for (page_off
= 0; page_off
< PAGE_CACHE_SIZE
;) {
389 /* FIXME: we might want to readahead here */
392 find_file_item_nohint(&coord
, &lh
, &key
,
395 if (result
!= CBK_COORD_FOUND
) {
397 * error happened of not items of file
401 page_cache_release(page
);
405 if (coord
.between
== AFTER_UNIT
) {
407 * end of file is reached. Padd page
412 p_data
= kmap_atomic(page
, KM_USER0
);
413 memset(p_data
+ page_off
, 0,
414 PAGE_CACHE_SIZE
- page_off
);
415 kunmap_atomic(p_data
, KM_USER0
);
419 result
= zload(coord
.node
);
421 page_cache_release(page
);
425 assert("vs-856", coord
.between
== AT_UNIT
);
426 item
= ((char *)item_body_by_coord(&coord
)) +
429 /* how many bytes to copy */
431 item_length_by_coord(&coord
) -
433 /* limit length of copy to end of page */
434 if (count
> PAGE_CACHE_SIZE
- page_off
)
435 count
= PAGE_CACHE_SIZE
- page_off
;
438 * copy item (as much as will fit starting from
439 * the beginning of the item) into the page
441 p_data
= kmap_atomic(page
, KM_USER0
);
442 memcpy(p_data
+ page_off
, item
, count
);
443 kunmap_atomic(p_data
, KM_USER0
);
448 get_key_offset(&key
) + count
);
452 } /* end of loop which fills one page by content of
453 * formatting items */
456 /* something was copied into page */
459 page_cache_release(page
);
460 assert("vs-1648", done
== 1);
463 } /* end of loop through pages of one conversion iteration */
466 result
= replace(inode
, pages
, i
, bytes
);
467 release_all_pages(pages
, sizeof_array(pages
));
471 * We have to drop exclusive access to avoid deadlock
472 * which may happen because called by reiser4_writepages
473 * capture_unix_file requires to get non-exclusive
474 * access to a file. It is safe to drop EA in the middle
475 * of tail2extent conversion because write_unix_file,
476 * setattr_unix_file(truncate), mmap_unix_file,
477 * release_unix_file(extent2tail) checks if conversion
478 * is not in progress (see comments before
479 * get_exclusive_access_careful().
480 * Other processes that acquire non-exclusive access
481 * (read_unix_file, reiser4_writepages, etc) should work
482 * on partially converted files.
484 drop_exclusive_access(uf_info
);
485 /* throttle the conversion */
486 reiser4_throttle_write(inode
);
487 get_exclusive_access(uf_info
);
490 * nobody is allowed to complete conversion but a
491 * process which started it
493 assert("", reiser4_inode_get_flag(inode
,
494 REISER4_PART_MIXED
));
498 reiser4_inode_clr_flag(inode
, REISER4_PART_IN_CONV
);
501 /* file is converted to extent items */
502 assert("vs-1697", reiser4_inode_get_flag(inode
,
503 REISER4_PART_MIXED
));
505 uf_info
->container
= UF_CONTAINER_EXTENTS
;
506 complete_conversion(inode
);
509 * conversion is not complete. Inode was already marked as
510 * REISER4_PART_CONV and stat-data were updated at the first
511 * iteration of the loop above.
514 release_all_pages(pages
, sizeof_array(pages
));
515 warning("nikita-2282", "Partial conversion of %llu: %i",
516 (unsigned long long)get_inode_oid(inode
), result
);
/*
 * Reserves disk space for one iteration of extent2tail() and returns the
 * result of reiser4_grab_space(..., BA_CAN_COMMIT). The estimate is the sum
 * of estimate_one_item_removal(tree), estimate_insert_flow(tree->height),
 * 1 + estimate_one_insert_item(tree), and the file plugin's
 * estimate.update(inode); the list in the body explains what each term pays
 * for.
 */
523 static int reserve_extent2tail_iteration(struct inode
*inode
)
527 tree
= reiser4_tree_by_inode(inode
);
529 * reserve blocks for (in this order):
531 * 1. removal of extent item
533 * 2. insertion of tail by insert_flow()
535 * 3. drilling to the leaf level by coord_by_key()
537 * 4. possible update of stat-data
540 return reiser4_grab_space
541 (estimate_one_item_removal(tree
) +
542 estimate_insert_flow(tree
->height
) +
543 1 + estimate_one_insert_item(tree
) +
544 inode_file_plugin(inode
)->estimate
.update(inode
), BA_CAN_COMMIT
);
547 /* for every page of file: read page, cut part of extent pointing to this page,
548 put data of page into tree as a tail item */
/*
 * extent2tail - convert a file stored in extents to tail (formatting) items
 * @file:    open file handed to reiser4_write_tail(); asserted to be non-NULL
 *           with a dentry whose d_inode is the inode being converted
 * @uf_info: unix file specific part of the inode; exclusive access must
 *           already be held (asserted through ea_obtained())
 *
 * Asserted preconditions: the inode is not read-only, the container is not
 * already UF_CONTAINER_TAILS and REISER4_PART_IN_CONV is not set.
 *
 * Resuming: when REISER4_PART_MIXED is set, find_start(inode,
 * EXTENT_POINTER_ID, &offset) locates the remaining extents. -ENOENT means
 * none are left, so the container is switched to tails and
 * complete_conversion() is called.
 *
 * Conversion: REISER4_PART_IN_CONV is set, then for each of num_pages pages
 * starting at start_page = offset >> PAGE_CACHE_SHIFT:
 *   - space is reserved with reserve_extent2tail_iteration();
 *   - when i == 0 and offset == 0, REISER4_PART_MIXED is set and stat-data
 *     updated;
 *   - the page is read with read_mapping_page() and waited for; a page that
 *     is not up to date yields -EIO;
 *   - the page-sized key range is removed with reiser4_cut_tree();
 *   - the page contents are written back as tail data with
 *     reiser4_write_tail(); the last page contributes only
 *     i_size & ~PAGE_CACHE_MASK bytes when that is non-zero; on failure a
 *     warning is emitted, the page released and REISER4_PART_IN_CONV cleared;
 *   - the page, by then detached from its mapping and jnode, is dropped with
 *     reiser4_drop_page() and its reference released;
 *   - exclusive access is dropped, reiser4_throttle_write() is called, and
 *     exclusive access is taken again.
 *
 * Afterwards REISER4_PART_IN_CONV is cleared. If every page was processed
 * (i == num_pages) the container becomes UF_CONTAINER_TAILS and
 * complete_conversion() is called; otherwise a "Partial conversion" warning
 * is emitted.
 *
 * NOTE(review): the page index passed to read_mapping_page() is cast to
 * (unsigned), and start_byte is shifted in unsigned long arithmetic before
 * being widened to __u64. Confirm both are safe for large files, in
 * particular where unsigned long is 32 bits wide.
 * NOTE(review): several declarations, error paths and the return statements
 * are not visible in this view, so return values are not documented here.
 */
549 int extent2tail(struct file
* file
, struct unix_file_info
*uf_info
)
554 unsigned long num_pages
, i
;
555 unsigned long start_page
;
561 assert("nikita-3362", ea_obtained(uf_info
));
562 inode
= unix_file_info_to_inode(uf_info
);
563 assert("nikita-3412", !IS_RDONLY(inode
));
564 assert("vs-1649", uf_info
->container
!= UF_CONTAINER_TAILS
);
565 assert("", !reiser4_inode_get_flag(inode
, REISER4_PART_IN_CONV
));
568 if (reiser4_inode_get_flag(inode
, REISER4_PART_MIXED
)) {
570 * file is marked on disk as there was a conversion which did
571 * not complete due to either crash or some error. Find which
572 * offset tail conversion stopped at
574 result
= find_start(inode
, EXTENT_POINTER_ID
, &offset
);
575 if (result
== -ENOENT
) {
576 /* no extent found, everything is converted */
577 uf_info
->container
= UF_CONTAINER_TAILS
;
578 complete_conversion(inode
);
580 } else if (result
!= 0)
581 /* some other error */
585 reiser4_inode_set_flag(inode
, REISER4_PART_IN_CONV
);
587 /* number of pages in the file */
589 (inode
->i_size
+ - offset
+ PAGE_CACHE_SIZE
- 1) >> PAGE_CACHE_SHIFT
;
590 start_page
= offset
>> PAGE_CACHE_SHIFT
;
592 inode_file_plugin(inode
)->key_by_inode(inode
, offset
, &from
);
596 for (i
= 0; i
< num_pages
; i
++) {
599 result
= reserve_extent2tail_iteration(inode
);
602 if (i
== 0 && offset
== 0) {
603 reiser4_inode_set_flag(inode
, REISER4_PART_MIXED
);
604 reiser4_update_sd(inode
);
607 page
= read_mapping_page(inode
->i_mapping
,
608 (unsigned)(i
+ start_page
), NULL
);
610 result
= PTR_ERR(page
);
614 wait_on_page_locked(page
);
616 if (!PageUptodate(page
)) {
617 page_cache_release(page
);
618 result
= RETERR(-EIO
);
622 /* cut part of file we have read */
623 start_byte
= (__u64
) ((i
+ start_page
) << PAGE_CACHE_SHIFT
);
624 set_key_offset(&from
, start_byte
);
625 set_key_offset(&to
, start_byte
+ PAGE_CACHE_SIZE
- 1);
627 * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
628 * commits during over-long truncates. But
629 * extent->tail conversion should be performed in one
632 result
= reiser4_cut_tree(reiser4_tree_by_inode(inode
), &from
,
636 page_cache_release(page
);
640 /* put page data into tree via tail_write */
641 count
= PAGE_CACHE_SIZE
;
642 if ((i
== (num_pages
- 1)) &&
643 (inode
->i_size
& ~PAGE_CACHE_MASK
))
644 /* last page can be incompleted */
645 count
= (inode
->i_size
& ~PAGE_CACHE_MASK
);
647 loff_t pos
= start_byte
;
649 assert("edward-1533",
650 file
!= NULL
&& file
->f_dentry
!= NULL
);
651 assert("edward-1534",
652 file
->f_dentry
->d_inode
== inode
);
654 result
= reiser4_write_tail(file
,
655 (char __user
*)kmap(page
),
657 reiser4_free_file_fsdata(file
);
659 warning("", "reiser4_write_tail failed");
660 page_cache_release(page
);
661 reiser4_inode_clr_flag(inode
, REISER4_PART_IN_CONV
);
669 /* page is already detached from jnode and mapping. */
670 assert("vs-1086", page
->mapping
== NULL
);
671 assert("nikita-2690",
672 (!PagePrivate(page
) && jprivate(page
) == 0));
673 /* waiting for writeback completion with page lock held is
674 * perfectly valid. */
675 wait_on_page_writeback(page
);
676 reiser4_drop_page(page
);
677 /* release reference taken by read_cache_page() above */
678 page_cache_release(page
);
680 drop_exclusive_access(uf_info
);
681 /* throttle the conversion */
682 reiser4_throttle_write(inode
);
683 get_exclusive_access(uf_info
);
685 * nobody is allowed to complete conversion but a process which
688 assert("", reiser4_inode_get_flag(inode
, REISER4_PART_MIXED
));
691 reiser4_inode_clr_flag(inode
, REISER4_PART_IN_CONV
);
693 if (i
== num_pages
) {
694 /* file is converted to formatted items */
695 assert("vs-1698", reiser4_inode_get_flag(inode
,
696 REISER4_PART_MIXED
));
698 inode_has_no_jnodes(reiser4_inode_data(inode
)));
700 uf_info
->container
= UF_CONTAINER_TAILS
;
701 complete_conversion(inode
);
705 * conversion is not complete. Inode was already marked as
706 * REISER4_PART_MIXED and stat-data were updated at the first *
707 * iteration of the loop above.
709 warning("nikita-2282",
710 "Partial conversion of %llu: %lu of %lu: %i",
711 (unsigned long long)get_inode_oid(inode
), i
,
719 * c-indentation-style: "K&R"