revert-mm-fix-blkdev-size-calculation-in-generic_write_checks
[linux-2.6/linux-trees-mm.git] / fs / reiser4 / plugin / file / tail_conversion.c
blobb955b0c8d88485041832dddda2a964718dc49c03
1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3 #include "../../inode.h"
4 #include "../../super.h"
5 #include "../../page_cache.h"
6 #include "../../carry.h"
7 #include "../../safe_link.h"
8 #include "../../vfs_ops.h"
10 #include <linux/writeback.h>
12 /* this file contains:
13 tail2extent and extent2tail */
15 /* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
16 void get_exclusive_access(struct unix_file_info * uf_info)
18 assert("nikita-3028", reiser4_schedulable());
19 assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
20 assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
22 * "deadlock avoidance": sometimes we commit a transaction under
23 * rw-semaphore on a file. Such commit can deadlock with another
24 * thread that captured some block (hence preventing atom from being
25 * committed) and waits on rw-semaphore.
27 reiser4_txn_restart_current();
28 LOCK_CNT_INC(inode_sem_w);
29 down_write(&uf_info->latch);
30 uf_info->exclusive_use = 1;
31 assert("vs-1713", uf_info->ea_owner == NULL);
32 assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
33 ON_DEBUG(uf_info->ea_owner = current);
36 void drop_exclusive_access(struct unix_file_info * uf_info)
38 assert("vs-1714", uf_info->ea_owner == current);
39 assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
40 ON_DEBUG(uf_info->ea_owner = NULL);
41 uf_info->exclusive_use = 0;
42 up_write(&uf_info->latch);
43 assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
44 assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
45 LOCK_CNT_DEC(inode_sem_w);
46 reiser4_txn_restart_current();
/**
 * nea_grabbed - bookkeeping done once the file latch is down_read-ed
 * @uf_info: unix file specific part of inode that was just read-locked
 *
 * Called when nonexclusive access has been obtained on a file. Everything it
 * does is for debugging purposes only; without REISER4_DEBUG the body is
 * empty.
 */
static void nea_grabbed(struct unix_file_info *uf_info)
{
#if REISER4_DEBUG
	LOCK_CNT_INC(inode_sem_r);
	/* a reader must never coexist with an exclusive owner */
	assert("vs-1716", uf_info->ea_owner == NULL);
	atomic_inc(&uf_info->nr_neas);
	uf_info->last_reader = current;
#endif
}
66 /**
67 * get_nonexclusive_access - get nonexclusive access to a file
68 * @uf_info: unix file specific part of inode to obtain access to
70 * Nonexclusive access is obtained on a file before read, write, readpage.
72 void get_nonexclusive_access(struct unix_file_info *uf_info)
74 assert("nikita-3029", reiser4_schedulable());
75 assert("nikita-3361", get_current_context()->trans->atom == NULL);
77 down_read(&uf_info->latch);
78 nea_grabbed(uf_info);
81 /**
82 * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
83 * @uf_info: unix file specific part of inode to obtain access to
85 * Non-blocking version of nonexclusive access obtaining.
87 int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
89 int result;
91 result = down_read_trylock(&uf_info->latch);
92 if (result)
93 nea_grabbed(uf_info);
94 return result;
97 void drop_nonexclusive_access(struct unix_file_info * uf_info)
99 assert("vs-1718", uf_info->ea_owner == NULL);
100 assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
101 ON_DEBUG(atomic_dec(&uf_info->nr_neas));
103 up_read(&uf_info->latch);
105 LOCK_CNT_DEC(inode_sem_r);
106 reiser4_txn_restart_current();
109 /* part of tail2extent. Cut all items covering @count bytes starting from
110 @offset */
111 /* Audited by: green(2002.06.15) */
112 static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
114 reiser4_key from, to;
116 /* AUDIT: How about putting an assertion here, what would check
117 all provided range is covered by tail items only? */
118 /* key of first byte in the range to be cut */
119 inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
121 /* key of last byte in that range */
122 to = from;
123 set_key_offset(&to, (__u64) (offset + count - 1));
125 /* cut everything between those keys */
126 return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
127 inode, 0);
130 static void release_all_pages(struct page **pages, unsigned nr_pages)
132 unsigned i;
134 for (i = 0; i < nr_pages; i++) {
135 if (pages[i] == NULL) {
136 unsigned j;
137 for (j = i + 1; j < nr_pages; j++)
138 assert("vs-1620", pages[j] == NULL);
139 break;
141 page_cache_release(pages[i]);
142 pages[i] = NULL;
146 /* part of tail2extent. replace tail items with extent one. Content of tail
147 items (@count bytes) being cut are copied already into
148 pages. extent_writepage method is called to create extents corresponding to
149 those pages */
150 static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
152 int result;
153 unsigned i;
154 STORE_COUNTERS;
156 if (nr_pages == 0)
157 return 0;
159 assert("vs-596", pages[0]);
161 /* cut copied items */
162 result = cut_formatting_items(inode, page_offset(pages[0]), count);
163 if (result)
164 return result;
166 CHECK_COUNTERS;
168 /* put into tree replacement for just removed items: extent item, namely */
169 for (i = 0; i < nr_pages; i++) {
170 result = add_to_page_cache_lru(pages[i], inode->i_mapping,
171 pages[i]->index,
172 mapping_gfp_mask(inode->
173 i_mapping));
174 if (result)
175 break;
176 unlock_page(pages[i]);
177 result = find_or_create_extent(pages[i]);
178 if (result)
179 break;
180 SetPageUptodate(pages[i]);
182 return result;
/* number of pages to fill before cutting tail items */
#define TAIL2EXTENT_PAGE_NUM 3
188 static int reserve_tail2extent_iteration(struct inode *inode)
190 reiser4_block_nr unformatted_nodes;
191 reiser4_tree *tree;
193 tree = reiser4_tree_by_inode(inode);
195 /* number of unformatted nodes which will be created */
196 unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
199 * space required for one iteration of extent->tail conversion:
201 * 1. kill N tail items
203 * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
205 * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
206 * extents) extent units.
208 * 4. drilling to the leaf level by coord_by_key()
210 * 5. possible update of stat-data
213 grab_space_enable();
214 return reiser4_grab_space
215 (2 * tree->height +
216 TAIL2EXTENT_PAGE_NUM +
217 TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
218 1 + estimate_one_insert_item(tree) +
219 inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
222 /* clear stat data's flag indicating that conversion is being converted */
223 static int complete_conversion(struct inode *inode)
225 int result;
227 grab_space_enable();
228 result =
229 reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
230 BA_CAN_COMMIT);
231 if (result == 0) {
232 reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
233 result = reiser4_update_sd(inode);
235 if (result)
236 warning("vs-1696", "Failed to clear converting bit of %llu: %i",
237 (unsigned long long)get_inode_oid(inode), result);
238 return 0;
242 * find_start
243 * @inode:
244 * @id:
245 * @offset:
247 * this is used by tail2extent and extent2tail to detect where previous
248 * uncompleted conversion stopped
250 static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
252 int result;
253 lock_handle lh;
254 coord_t coord;
255 struct unix_file_info *ufo;
256 int found;
257 reiser4_key key;
259 ufo = unix_file_inode_data(inode);
260 init_lh(&lh);
261 result = 0;
262 found = 0;
263 inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
264 do {
265 init_lh(&lh);
266 result = find_file_item_nohint(&coord, &lh, &key,
267 ZNODE_READ_LOCK, inode);
269 if (result == CBK_COORD_FOUND) {
270 if (coord.between == AT_UNIT) {
271 /*coord_clear_iplug(&coord); */
272 result = zload(coord.node);
273 if (result == 0) {
274 if (item_id_by_coord(&coord) == id)
275 found = 1;
276 else
277 item_plugin_by_coord(&coord)->s.
278 file.append_key(&coord,
279 &key);
280 zrelse(coord.node);
282 } else
283 result = RETERR(-ENOENT);
285 done_lh(&lh);
286 } while (result == 0 && !found);
287 *offset = get_key_offset(&key);
288 return result;
292 * tail2extent
293 * @uf_info:
297 int tail2extent(struct unix_file_info *uf_info)
299 int result;
300 reiser4_key key; /* key of next byte to be moved to page */
301 char *p_data; /* data of page */
302 unsigned page_off = 0, /* offset within the page where to copy data */
303 count; /* number of bytes of item which can be
304 * copied to page */
305 struct page *pages[TAIL2EXTENT_PAGE_NUM];
306 struct page *page;
307 int done; /* set to 1 when all file is read */
308 char *item;
309 int i;
310 struct inode *inode;
311 int first_iteration;
312 int bytes;
313 __u64 offset;
315 assert("nikita-3362", ea_obtained(uf_info));
316 inode = unix_file_info_to_inode(uf_info);
317 assert("nikita-3412", !IS_RDONLY(inode));
318 assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
319 assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
321 offset = 0;
322 first_iteration = 1;
323 result = 0;
324 if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
326 * file is marked on disk as there was a conversion which did
327 * not complete due to either crash or some error. Find which
328 * offset tail conversion stopped at
330 result = find_start(inode, FORMATTING_ID, &offset);
331 if (result == -ENOENT) {
332 /* no tail items found, everything is converted */
333 uf_info->container = UF_CONTAINER_EXTENTS;
334 complete_conversion(inode);
335 return 0;
336 } else if (result != 0)
337 /* some other error */
338 return result;
339 first_iteration = 0;
342 reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
344 /* get key of first byte of a file */
345 inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
347 done = 0;
348 while (done == 0) {
349 memset(pages, 0, sizeof(pages));
350 result = reserve_tail2extent_iteration(inode);
351 if (result != 0)
352 goto out;
353 if (first_iteration) {
354 reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
355 reiser4_update_sd(inode);
356 first_iteration = 0;
358 bytes = 0;
359 for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
360 assert("vs-598",
361 (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
362 page = alloc_page(reiser4_ctx_gfp_mask_get());
363 if (!page) {
364 result = RETERR(-ENOMEM);
365 goto error;
368 page->index =
369 (unsigned long)(get_key_offset(&key) >>
370 PAGE_CACHE_SHIFT);
372 * usually when one is going to longterm lock znode (as
373 * find_file_item does, for instance) he must not hold
374 * locked pages. However, there is an exception for
375 * case tail2extent. Pages appearing here are not
376 * reachable to everyone else, they are clean, they do
377 * not have jnodes attached so keeping them locked do
378 * not risk deadlock appearance
380 assert("vs-983", !PagePrivate(page));
381 reiser4_invalidate_pages(inode->i_mapping, page->index,
382 1, 0);
384 for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
385 coord_t coord;
386 lock_handle lh;
388 /* get next item */
389 /* FIXME: we might want to readahead here */
390 init_lh(&lh);
391 result =
392 find_file_item_nohint(&coord, &lh, &key,
393 ZNODE_READ_LOCK,
394 inode);
395 if (result != CBK_COORD_FOUND) {
397 * error happened of not items of file
398 * were found
400 done_lh(&lh);
401 page_cache_release(page);
402 goto error;
405 if (coord.between == AFTER_UNIT) {
407 * end of file is reached. Padd page
408 * with zeros
410 done_lh(&lh);
411 done = 1;
412 p_data = kmap_atomic(page, KM_USER0);
413 memset(p_data + page_off, 0,
414 PAGE_CACHE_SIZE - page_off);
415 kunmap_atomic(p_data, KM_USER0);
416 break;
419 result = zload(coord.node);
420 if (result) {
421 page_cache_release(page);
422 done_lh(&lh);
423 goto error;
425 assert("vs-856", coord.between == AT_UNIT);
426 item = ((char *)item_body_by_coord(&coord)) +
427 coord.unit_pos;
429 /* how many bytes to copy */
430 count =
431 item_length_by_coord(&coord) -
432 coord.unit_pos;
433 /* limit length of copy to end of page */
434 if (count > PAGE_CACHE_SIZE - page_off)
435 count = PAGE_CACHE_SIZE - page_off;
438 * copy item (as much as will fit starting from
439 * the beginning of the item) into the page
441 p_data = kmap_atomic(page, KM_USER0);
442 memcpy(p_data + page_off, item, count);
443 kunmap_atomic(p_data, KM_USER0);
445 page_off += count;
446 bytes += count;
447 set_key_offset(&key,
448 get_key_offset(&key) + count);
450 zrelse(coord.node);
451 done_lh(&lh);
452 } /* end of loop which fills one page by content of
453 * formatting items */
455 if (page_off) {
456 /* something was copied into page */
457 pages[i] = page;
458 } else {
459 page_cache_release(page);
460 assert("vs-1648", done == 1);
461 break;
463 } /* end of loop through pages of one conversion iteration */
465 if (i > 0) {
466 result = replace(inode, pages, i, bytes);
467 release_all_pages(pages, sizeof_array(pages));
468 if (result)
469 goto error;
471 * We have to drop exclusive access to avoid deadlock
472 * which may happen because called by reiser4_writepages
473 * capture_unix_file requires to get non-exclusive
474 * access to a file. It is safe to drop EA in the middle
475 * of tail2extent conversion because write_unix_file,
476 * setattr_unix_file(truncate), mmap_unix_file,
477 * release_unix_file(extent2tail) checks if conversion
478 * is not in progress (see comments before
479 * get_exclusive_access_careful().
480 * Other processes that acquire non-exclusive access
481 * (read_unix_file, reiser4_writepages, etc) should work
482 * on partially converted files.
484 drop_exclusive_access(uf_info);
485 /* throttle the conversion */
486 reiser4_throttle_write(inode);
487 get_exclusive_access(uf_info);
490 * nobody is allowed to complete conversion but a
491 * process which started it
493 assert("", reiser4_inode_get_flag(inode,
494 REISER4_PART_MIXED));
498 reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
500 if (result == 0) {
501 /* file is converted to extent items */
502 assert("vs-1697", reiser4_inode_get_flag(inode,
503 REISER4_PART_MIXED));
505 uf_info->container = UF_CONTAINER_EXTENTS;
506 complete_conversion(inode);
507 } else {
509 * conversion is not complete. Inode was already marked as
510 * REISER4_PART_CONV and stat-data were updated at the first
511 * iteration of the loop above.
513 error:
514 release_all_pages(pages, sizeof_array(pages));
515 warning("nikita-2282", "Partial conversion of %llu: %i",
516 (unsigned long long)get_inode_oid(inode), result);
519 out:
520 return result;
523 static int reserve_extent2tail_iteration(struct inode *inode)
525 reiser4_tree *tree;
527 tree = reiser4_tree_by_inode(inode);
529 * reserve blocks for (in this order):
531 * 1. removal of extent item
533 * 2. insertion of tail by insert_flow()
535 * 3. drilling to the leaf level by coord_by_key()
537 * 4. possible update of stat-data
539 grab_space_enable();
540 return reiser4_grab_space
541 (estimate_one_item_removal(tree) +
542 estimate_insert_flow(tree->height) +
543 1 + estimate_one_insert_item(tree) +
544 inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
547 /* for every page of file: read page, cut part of extent pointing to this page,
548 put data of page tree by tail item */
549 int extent2tail(struct file * file, struct unix_file_info *uf_info)
551 int result;
552 struct inode *inode;
553 struct page *page;
554 unsigned long num_pages, i;
555 unsigned long start_page;
556 reiser4_key from;
557 reiser4_key to;
558 unsigned count;
559 __u64 offset;
561 assert("nikita-3362", ea_obtained(uf_info));
562 inode = unix_file_info_to_inode(uf_info);
563 assert("nikita-3412", !IS_RDONLY(inode));
564 assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
565 assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
567 offset = 0;
568 if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
570 * file is marked on disk as there was a conversion which did
571 * not complete due to either crash or some error. Find which
572 * offset tail conversion stopped at
574 result = find_start(inode, EXTENT_POINTER_ID, &offset);
575 if (result == -ENOENT) {
576 /* no extent found, everything is converted */
577 uf_info->container = UF_CONTAINER_TAILS;
578 complete_conversion(inode);
579 return 0;
580 } else if (result != 0)
581 /* some other error */
582 return result;
585 reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
587 /* number of pages in the file */
588 num_pages =
589 (inode->i_size + - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
590 start_page = offset >> PAGE_CACHE_SHIFT;
592 inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
593 to = from;
595 result = 0;
596 for (i = 0; i < num_pages; i++) {
597 __u64 start_byte;
599 result = reserve_extent2tail_iteration(inode);
600 if (result != 0)
601 break;
602 if (i == 0 && offset == 0) {
603 reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
604 reiser4_update_sd(inode);
607 page = read_mapping_page(inode->i_mapping,
608 (unsigned)(i + start_page), NULL);
609 if (IS_ERR(page)) {
610 result = PTR_ERR(page);
611 break;
614 wait_on_page_locked(page);
616 if (!PageUptodate(page)) {
617 page_cache_release(page);
618 result = RETERR(-EIO);
619 break;
622 /* cut part of file we have read */
623 start_byte = (__u64) ((i + start_page) << PAGE_CACHE_SHIFT);
624 set_key_offset(&from, start_byte);
625 set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
627 * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
628 * commits during over-long truncates. But
629 * extent->tail conversion should be performed in one
630 * transaction.
632 result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
633 &to, inode, 0);
635 if (result) {
636 page_cache_release(page);
637 break;
640 /* put page data into tree via tail_write */
641 count = PAGE_CACHE_SIZE;
642 if ((i == (num_pages - 1)) &&
643 (inode->i_size & ~PAGE_CACHE_MASK))
644 /* last page can be incompleted */
645 count = (inode->i_size & ~PAGE_CACHE_MASK);
646 while (count) {
647 loff_t pos = start_byte;
649 assert("edward-1533",
650 file != NULL && file->f_dentry != NULL);
651 assert("edward-1534",
652 file->f_dentry->d_inode == inode);
654 result = reiser4_write_tail(file,
655 (char __user *)kmap(page),
656 count, &pos);
657 reiser4_free_file_fsdata(file);
658 if (result <= 0) {
659 warning("", "reiser4_write_tail failed");
660 page_cache_release(page);
661 reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
662 return result;
664 count -= result;
667 /* release page */
668 lock_page(page);
669 /* page is already detached from jnode and mapping. */
670 assert("vs-1086", page->mapping == NULL);
671 assert("nikita-2690",
672 (!PagePrivate(page) && jprivate(page) == 0));
673 /* waiting for writeback completion with page lock held is
674 * perfectly valid. */
675 wait_on_page_writeback(page);
676 reiser4_drop_page(page);
677 /* release reference taken by read_cache_page() above */
678 page_cache_release(page);
680 drop_exclusive_access(uf_info);
681 /* throttle the conversion */
682 reiser4_throttle_write(inode);
683 get_exclusive_access(uf_info);
685 * nobody is allowed to complete conversion but a process which
686 * started it
688 assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
691 reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
693 if (i == num_pages) {
694 /* file is converted to formatted items */
695 assert("vs-1698", reiser4_inode_get_flag(inode,
696 REISER4_PART_MIXED));
697 assert("vs-1260",
698 inode_has_no_jnodes(reiser4_inode_data(inode)));
700 uf_info->container = UF_CONTAINER_TAILS;
701 complete_conversion(inode);
702 return 0;
705 * conversion is not complete. Inode was already marked as
706 * REISER4_PART_MIXED and stat-data were updated at the first *
707 * iteration of the loop above.
709 warning("nikita-2282",
710 "Partial conversion of %llu: %lu of %lu: %i",
711 (unsigned long long)get_inode_oid(inode), i,
712 num_pages, result);
714 return result;
718 * Local variables:
719 * c-indentation-style: "K&R"
720 * mode-name: "LC"
721 * c-basic-offset: 8
722 * tab-width: 8
723 * fill-column: 79
724 * scroll-step: 1
725 * End: