On Tue, Nov 06, 2007 at 02:33:53AM -0800, akpm@linux-foundation.org wrote:
[mmotm.git] / fs / reiser4 / plugin / file / tail_conversion.c
blobb529fa955d001caea7f3c1f6a2c010ca6b0d1789
1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3 #include "../../inode.h"
4 #include "../../super.h"
5 #include "../../page_cache.h"
6 #include "../../carry.h"
7 #include "../../safe_link.h"
8 #include "../../vfs_ops.h"
10 #include <linux/writeback.h>
/* this file contains:
   tail2extent and extent2tail */
15 /* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
16 void get_exclusive_access(struct unix_file_info * uf_info)
18 assert("nikita-3028", reiser4_schedulable());
19 assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
20 assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
22 * "deadlock avoidance": sometimes we commit a transaction under
23 * rw-semaphore on a file. Such commit can deadlock with another
24 * thread that captured some block (hence preventing atom from being
25 * committed) and waits on rw-semaphore.
27 reiser4_txn_restart_current();
28 LOCK_CNT_INC(inode_sem_w);
29 down_write(&uf_info->latch);
30 uf_info->exclusive_use = 1;
31 assert("vs-1713", uf_info->ea_owner == NULL);
32 assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
33 ON_DEBUG(uf_info->ea_owner = current);
36 void drop_exclusive_access(struct unix_file_info * uf_info)
38 assert("vs-1714", uf_info->ea_owner == current);
39 assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
40 ON_DEBUG(uf_info->ea_owner = NULL);
41 uf_info->exclusive_use = 0;
42 up_write(&uf_info->latch);
43 assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
44 assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
45 LOCK_CNT_DEC(inode_sem_w);
46 reiser4_txn_restart_current();
/**
 * nea_grabbed - bookkeeping done once the file semaphore is down_read-ed
 * @uf_info: unix file specific part of inode whose latch was just read-locked
 *
 * This is called when nonexclusive access is obtained on a file. Everything
 * it does is for debugging purposes; the body is compiled only when
 * REISER4_DEBUG is set.
 */
static void nea_grabbed(struct unix_file_info *uf_info)
{
#if REISER4_DEBUG
	LOCK_CNT_INC(inode_sem_r);
	/* a reader must never coexist with an exclusive owner */
	assert("vs-1716", uf_info->ea_owner == NULL);
	atomic_inc(&uf_info->nr_neas);
	uf_info->last_reader = current;
#endif
}
66 /**
67 * get_nonexclusive_access - get nonexclusive access to a file
68 * @uf_info: unix file specific part of inode to obtain access to
70 * Nonexclusive access is obtained on a file before read, write, readpage.
72 void get_nonexclusive_access(struct unix_file_info *uf_info)
74 assert("nikita-3029", reiser4_schedulable());
75 assert("nikita-3361", get_current_context()->trans->atom == NULL);
77 down_read(&uf_info->latch);
78 nea_grabbed(uf_info);
81 /**
82 * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
83 * @uf_info: unix file specific part of inode to obtain access to
85 * Non-blocking version of nonexclusive access obtaining.
87 int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
89 int result;
91 result = down_read_trylock(&uf_info->latch);
92 if (result)
93 nea_grabbed(uf_info);
94 return result;
97 void drop_nonexclusive_access(struct unix_file_info * uf_info)
99 assert("vs-1718", uf_info->ea_owner == NULL);
100 assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
101 ON_DEBUG(atomic_dec(&uf_info->nr_neas));
103 up_read(&uf_info->latch);
105 LOCK_CNT_DEC(inode_sem_r);
106 reiser4_txn_restart_current();
109 /* part of tail2extent. Cut all items covering @count bytes starting from
110 @offset */
111 /* Audited by: green(2002.06.15) */
112 static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
114 reiser4_key from, to;
116 /* AUDIT: How about putting an assertion here, what would check
117 all provided range is covered by tail items only? */
118 /* key of first byte in the range to be cut */
119 inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
121 /* key of last byte in that range */
122 to = from;
123 set_key_offset(&to, (__u64) (offset + count - 1));
125 /* cut everything between those keys */
126 return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
127 inode, 0);
130 static void release_all_pages(struct page **pages, unsigned nr_pages)
132 unsigned i;
134 for (i = 0; i < nr_pages; i++) {
135 if (pages[i] == NULL) {
136 #if REISER4_DEBUG
137 unsigned j;
138 for (j = i + 1; j < nr_pages; j++)
139 assert("vs-1620", pages[j] == NULL);
140 #endif
141 break;
143 page_cache_release(pages[i]);
144 pages[i] = NULL;
148 /* part of tail2extent. replace tail items with extent one. Content of tail
149 items (@count bytes) being cut are copied already into
150 pages. extent_writepage method is called to create extents corresponding to
151 those pages */
152 static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
154 int result;
155 unsigned i;
156 STORE_COUNTERS;
158 if (nr_pages == 0)
159 return 0;
161 assert("vs-596", pages[0]);
163 /* cut copied items */
164 result = cut_formatting_items(inode, page_offset(pages[0]), count);
165 if (result)
166 return result;
168 CHECK_COUNTERS;
170 /* put into tree replacement for just removed items: extent item, namely */
171 for (i = 0; i < nr_pages; i++) {
172 result = add_to_page_cache_lru(pages[i], inode->i_mapping,
173 pages[i]->index,
174 mapping_gfp_mask(inode->
175 i_mapping));
176 if (result)
177 break;
178 unlock_page(pages[i]);
179 result = find_or_create_extent(pages[i]);
180 if (result)
181 break;
182 SetPageUptodate(pages[i]);
184 return result;
187 #define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
188 * items */
190 static int reserve_tail2extent_iteration(struct inode *inode)
192 reiser4_block_nr unformatted_nodes;
193 reiser4_tree *tree;
195 tree = reiser4_tree_by_inode(inode);
197 /* number of unformatted nodes which will be created */
198 unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
201 * space required for one iteration of extent->tail conversion:
203 * 1. kill N tail items
205 * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
207 * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
208 * extents) extent units.
210 * 4. drilling to the leaf level by coord_by_key()
212 * 5. possible update of stat-data
215 grab_space_enable();
216 return reiser4_grab_space
217 (2 * tree->height +
218 TAIL2EXTENT_PAGE_NUM +
219 TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
220 1 + estimate_one_insert_item(tree) +
221 inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
224 /* clear stat data's flag indicating that conversion is being converted */
225 static int complete_conversion(struct inode *inode)
227 int result;
229 grab_space_enable();
230 result =
231 reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
232 BA_CAN_COMMIT);
233 if (result == 0) {
234 reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
235 result = reiser4_update_sd(inode);
237 if (result)
238 warning("vs-1696", "Failed to clear converting bit of %llu: %i",
239 (unsigned long long)get_inode_oid(inode), result);
240 return 0;
244 * find_start
245 * @inode:
246 * @id:
247 * @offset:
249 * this is used by tail2extent and extent2tail to detect where previous
250 * uncompleted conversion stopped
252 static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
254 int result;
255 lock_handle lh;
256 coord_t coord;
257 struct unix_file_info *ufo;
258 int found;
259 reiser4_key key;
261 ufo = unix_file_inode_data(inode);
262 init_lh(&lh);
263 result = 0;
264 found = 0;
265 inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
266 do {
267 init_lh(&lh);
268 result = find_file_item_nohint(&coord, &lh, &key,
269 ZNODE_READ_LOCK, inode);
271 if (result == CBK_COORD_FOUND) {
272 if (coord.between == AT_UNIT) {
273 /*coord_clear_iplug(&coord); */
274 result = zload(coord.node);
275 if (result == 0) {
276 if (item_id_by_coord(&coord) == id)
277 found = 1;
278 else
279 item_plugin_by_coord(&coord)->s.
280 file.append_key(&coord,
281 &key);
282 zrelse(coord.node);
284 } else
285 result = RETERR(-ENOENT);
287 done_lh(&lh);
288 } while (result == 0 && !found);
289 *offset = get_key_offset(&key);
290 return result;
294 * tail2extent
295 * @uf_info:
299 int tail2extent(struct unix_file_info *uf_info)
301 int result;
302 reiser4_key key; /* key of next byte to be moved to page */
303 char *p_data; /* data of page */
304 unsigned page_off = 0, /* offset within the page where to copy data */
305 count; /* number of bytes of item which can be
306 * copied to page */
307 struct page *pages[TAIL2EXTENT_PAGE_NUM];
308 struct page *page;
309 int done; /* set to 1 when all file is read */
310 char *item;
311 int i;
312 struct inode *inode;
313 int first_iteration;
314 int bytes;
315 __u64 offset;
317 assert("nikita-3362", ea_obtained(uf_info));
318 inode = unix_file_info_to_inode(uf_info);
319 assert("nikita-3412", !IS_RDONLY(inode));
320 assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
321 assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
323 offset = 0;
324 first_iteration = 1;
325 result = 0;
326 if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
328 * file is marked on disk as there was a conversion which did
329 * not complete due to either crash or some error. Find which
330 * offset tail conversion stopped at
332 result = find_start(inode, FORMATTING_ID, &offset);
333 if (result == -ENOENT) {
334 /* no tail items found, everything is converted */
335 uf_info->container = UF_CONTAINER_EXTENTS;
336 complete_conversion(inode);
337 return 0;
338 } else if (result != 0)
339 /* some other error */
340 return result;
341 first_iteration = 0;
344 reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
346 /* get key of first byte of a file */
347 inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
349 done = 0;
350 while (done == 0) {
351 memset(pages, 0, sizeof(pages));
352 result = reserve_tail2extent_iteration(inode);
353 if (result != 0) {
354 reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
355 goto out;
357 if (first_iteration) {
358 reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
359 reiser4_update_sd(inode);
360 first_iteration = 0;
362 bytes = 0;
363 for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
364 assert("vs-598",
365 (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
366 page = alloc_page(reiser4_ctx_gfp_mask_get());
367 if (!page) {
368 result = RETERR(-ENOMEM);
369 goto error;
372 page->index =
373 (unsigned long)(get_key_offset(&key) >>
374 PAGE_CACHE_SHIFT);
376 * usually when one is going to longterm lock znode (as
377 * find_file_item does, for instance) he must not hold
378 * locked pages. However, there is an exception for
379 * case tail2extent. Pages appearing here are not
380 * reachable to everyone else, they are clean, they do
381 * not have jnodes attached so keeping them locked do
382 * not risk deadlock appearance
384 assert("vs-983", !PagePrivate(page));
385 reiser4_invalidate_pages(inode->i_mapping, page->index,
386 1, 0);
388 for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
389 coord_t coord;
390 lock_handle lh;
392 /* get next item */
393 /* FIXME: we might want to readahead here */
394 init_lh(&lh);
395 result =
396 find_file_item_nohint(&coord, &lh, &key,
397 ZNODE_READ_LOCK,
398 inode);
399 if (result != CBK_COORD_FOUND) {
401 * error happened of not items of file
402 * were found
404 done_lh(&lh);
405 page_cache_release(page);
406 goto error;
409 if (coord.between == AFTER_UNIT) {
411 * end of file is reached. Padd page
412 * with zeros
414 done_lh(&lh);
415 done = 1;
416 p_data = kmap_atomic(page, KM_USER0);
417 memset(p_data + page_off, 0,
418 PAGE_CACHE_SIZE - page_off);
419 kunmap_atomic(p_data, KM_USER0);
420 break;
423 result = zload(coord.node);
424 if (result) {
425 page_cache_release(page);
426 done_lh(&lh);
427 goto error;
429 assert("vs-856", coord.between == AT_UNIT);
430 item = ((char *)item_body_by_coord(&coord)) +
431 coord.unit_pos;
433 /* how many bytes to copy */
434 count =
435 item_length_by_coord(&coord) -
436 coord.unit_pos;
437 /* limit length of copy to end of page */
438 if (count > PAGE_CACHE_SIZE - page_off)
439 count = PAGE_CACHE_SIZE - page_off;
442 * copy item (as much as will fit starting from
443 * the beginning of the item) into the page
445 p_data = kmap_atomic(page, KM_USER0);
446 memcpy(p_data + page_off, item, count);
447 kunmap_atomic(p_data, KM_USER0);
449 page_off += count;
450 bytes += count;
451 set_key_offset(&key,
452 get_key_offset(&key) + count);
454 zrelse(coord.node);
455 done_lh(&lh);
456 } /* end of loop which fills one page by content of
457 * formatting items */
459 if (page_off) {
460 /* something was copied into page */
461 pages[i] = page;
462 } else {
463 page_cache_release(page);
464 assert("vs-1648", done == 1);
465 break;
467 } /* end of loop through pages of one conversion iteration */
469 if (i > 0) {
470 result = replace(inode, pages, i, bytes);
471 release_all_pages(pages, sizeof_array(pages));
472 if (result)
473 goto error;
475 * We have to drop exclusive access to avoid deadlock
476 * which may happen because called by reiser4_writepages
477 * capture_unix_file requires to get non-exclusive
478 * access to a file. It is safe to drop EA in the middle
479 * of tail2extent conversion because write_unix_file,
480 * setattr_unix_file(truncate), mmap_unix_file,
481 * release_unix_file(extent2tail) checks if conversion
482 * is not in progress (see comments before
483 * get_exclusive_access_careful().
484 * Other processes that acquire non-exclusive access
485 * (read_unix_file, reiser4_writepages, etc) should work
486 * on partially converted files.
488 drop_exclusive_access(uf_info);
489 /* throttle the conversion
490 FIXME-EDWARD: Pass the precise number of pages
491 that was dirtied */
492 reiser4_throttle_write(inode, 1);
493 get_exclusive_access(uf_info);
496 * nobody is allowed to complete conversion but a
497 * process which started it
499 assert("", reiser4_inode_get_flag(inode,
500 REISER4_PART_MIXED));
503 if (result == 0) {
504 /* file is converted to extent items */
505 reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
506 assert("vs-1697", reiser4_inode_get_flag(inode,
507 REISER4_PART_MIXED));
509 uf_info->container = UF_CONTAINER_EXTENTS;
510 complete_conversion(inode);
511 } else {
513 * conversion is not complete. Inode was already marked as
514 * REISER4_PART_MIXED and stat-data were updated at the first
515 * iteration of the loop above.
517 error:
518 release_all_pages(pages, sizeof_array(pages));
519 reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
520 warning("edward-1548", "Partial conversion of %llu: %i",
521 (unsigned long long)get_inode_oid(inode), result);
524 out:
525 /* this flag should be cleared, otherwise get_exclusive_access_careful()
526 will fall into infinite loop */
527 assert("edward-1549", !reiser4_inode_get_flag(inode,
528 REISER4_PART_IN_CONV));
529 return result;
532 static int reserve_extent2tail_iteration(struct inode *inode)
534 reiser4_tree *tree;
536 tree = reiser4_tree_by_inode(inode);
538 * reserve blocks for (in this order):
540 * 1. removal of extent item
542 * 2. insertion of tail by insert_flow()
544 * 3. drilling to the leaf level by coord_by_key()
546 * 4. possible update of stat-data
548 grab_space_enable();
549 return reiser4_grab_space
550 (estimate_one_item_removal(tree) +
551 estimate_insert_flow(tree->height) +
552 1 + estimate_one_insert_item(tree) +
553 inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
556 /* for every page of file: read page, cut part of extent pointing to this page,
557 put data of page tree by tail item */
558 int extent2tail(struct file * file, struct unix_file_info *uf_info)
560 int result;
561 struct inode *inode;
562 struct page *page;
563 unsigned long num_pages, i;
564 unsigned long start_page;
565 reiser4_key from;
566 reiser4_key to;
567 unsigned count;
568 __u64 offset;
570 assert("nikita-3362", ea_obtained(uf_info));
571 inode = unix_file_info_to_inode(uf_info);
572 assert("nikita-3412", !IS_RDONLY(inode));
573 assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
574 assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
576 offset = 0;
577 if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
579 * file is marked on disk as there was a conversion which did
580 * not complete due to either crash or some error. Find which
581 * offset tail conversion stopped at
583 result = find_start(inode, EXTENT_POINTER_ID, &offset);
584 if (result == -ENOENT) {
585 /* no extent found, everything is converted */
586 uf_info->container = UF_CONTAINER_TAILS;
587 complete_conversion(inode);
588 return 0;
589 } else if (result != 0)
590 /* some other error */
591 return result;
594 reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
596 /* number of pages in the file */
597 num_pages =
598 (inode->i_size + - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
599 start_page = offset >> PAGE_CACHE_SHIFT;
601 inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
602 to = from;
604 result = 0;
605 for (i = 0; i < num_pages; i++) {
606 __u64 start_byte;
608 result = reserve_extent2tail_iteration(inode);
609 if (result != 0)
610 break;
611 if (i == 0 && offset == 0) {
612 reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
613 reiser4_update_sd(inode);
616 page = read_mapping_page(inode->i_mapping,
617 (unsigned)(i + start_page), NULL);
618 if (IS_ERR(page)) {
619 result = PTR_ERR(page);
620 break;
623 wait_on_page_locked(page);
625 if (!PageUptodate(page)) {
626 page_cache_release(page);
627 result = RETERR(-EIO);
628 break;
631 /* cut part of file we have read */
632 start_byte = (__u64) ((i + start_page) << PAGE_CACHE_SHIFT);
633 set_key_offset(&from, start_byte);
634 set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
636 * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
637 * commits during over-long truncates. But
638 * extent->tail conversion should be performed in one
639 * transaction.
641 result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
642 &to, inode, 0);
644 if (result) {
645 page_cache_release(page);
646 break;
649 /* put page data into tree via tail_write */
650 count = PAGE_CACHE_SIZE;
651 if ((i == (num_pages - 1)) &&
652 (inode->i_size & ~PAGE_CACHE_MASK))
653 /* last page can be incompleted */
654 count = (inode->i_size & ~PAGE_CACHE_MASK);
655 while (count) {
656 loff_t pos = start_byte;
658 assert("edward-1537",
659 file != NULL && file->f_dentry != NULL);
660 assert("edward-1538",
661 file->f_dentry->d_inode == inode);
663 result = reiser4_write_tail(file, inode,
664 (char __user *)kmap(page),
665 count, &pos);
666 reiser4_free_file_fsdata(file);
667 if (result <= 0) {
668 warning("", "reiser4_write_tail failed");
669 page_cache_release(page);
670 reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
671 return result;
673 count -= result;
676 /* release page */
677 lock_page(page);
678 /* page is already detached from jnode and mapping. */
679 assert("vs-1086", page->mapping == NULL);
680 assert("nikita-2690",
681 (!PagePrivate(page) && jprivate(page) == 0));
682 /* waiting for writeback completion with page lock held is
683 * perfectly valid. */
684 wait_on_page_writeback(page);
685 reiser4_drop_page(page);
686 /* release reference taken by read_cache_page() above */
687 page_cache_release(page);
689 drop_exclusive_access(uf_info);
691 * throttle the conversion.
692 * FIXME-EDWARD: Calculate and pass the precise number
693 * of pages that was dirtied
695 reiser4_throttle_write(inode, 1);
696 get_exclusive_access(uf_info);
698 * nobody is allowed to complete conversion but a process which
699 * started it
701 assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
704 reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
706 if (i == num_pages) {
707 /* file is converted to formatted items */
708 assert("vs-1698", reiser4_inode_get_flag(inode,
709 REISER4_PART_MIXED));
710 assert("vs-1260",
711 inode_has_no_jnodes(reiser4_inode_data(inode)));
713 uf_info->container = UF_CONTAINER_TAILS;
714 complete_conversion(inode);
715 return 0;
718 * conversion is not complete. Inode was already marked as
719 * REISER4_PART_MIXED and stat-data were updated at the first
720 * iteration of the loop above.
722 warning("nikita-2282",
723 "Partial conversion of %llu: %lu of %lu: %i",
724 (unsigned long long)get_inode_oid(inode), i,
725 num_pages, result);
727 /* this flag should be cleared, otherwise get_exclusive_access_careful()
728 will fall into infinite loop */
729 assert("edward-1550", !reiser4_inode_get_flag(inode,
730 REISER4_PART_IN_CONV));
731 return result;
/*
 * Local variables:
 * c-indentation-style: "K&R"
 * mode-name: "LC"
 * c-basic-offset: 8
 * tab-width: 8
 * fill-column: 79
 * scroll-step: 1
 * End:
 */