/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
   reiser4/README */

/* data-types and function declarations for transaction manager. See txnmgr.c
   for details. */

#ifndef __REISER4_TXNMGR_H__
#define __REISER4_TXNMGR_H__

#include <linux/types.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>
#include <linux/wait.h>
/* TYPE DECLARATIONS */

/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
   A capture request dynamically assigns a block to the calling thread's transaction
   handle. */
typedef enum {
	/* A READ_ATOMIC request indicates that a block will be read and that the caller's
	   atom should fuse in order to ensure that the block commits atomically with the
	   caller. */
	TXN_CAPTURE_READ_ATOMIC = (1 << 0),

	/* A READ_NONCOM request indicates that a block will be read and that the caller is
	   willing to read a non-committed block without causing atoms to fuse. */
	TXN_CAPTURE_READ_NONCOM = (1 << 1),

	/* A READ_MODIFY request indicates that a block will be read but that the caller
	   wishes for the block to be captured as it will be written. This capture request
	   mode is not currently used, but eventually it will be useful for preventing
	   deadlock in read-modify-write cycles. */
	TXN_CAPTURE_READ_MODIFY = (1 << 2),

	/* A WRITE capture request indicates that a block will be modified and that atoms
	   should fuse to make the commit atomic. */
	TXN_CAPTURE_WRITE = (1 << 3),

	/* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
	   exclusive type designation from extra bits that may be supplied -- see below. */
	TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
			     TXN_CAPTURE_READ_NONCOM |
			     TXN_CAPTURE_READ_MODIFY |
			     TXN_CAPTURE_WRITE),

	/* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
	   indicate modification will occur. */
	TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY |
			      TXN_CAPTURE_WRITE),

	/* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
	   prefer not to sleep waiting for an aging atom to commit. */
	TXN_CAPTURE_NONBLOCKING = (1 << 4),

	/* An option to reiser4_try_capture to prevent atom fusion; only simple
	   capturing is allowed. */
	TXN_CAPTURE_DONT_FUSE = (1 << 5)
} txn_capture;

/* This macro selects only the exclusive capture request types, stripping out any
   options that were supplied (i.e., NONBLOCKING). */
#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
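
/* Illustrative sketch (not part of the original interface): a capture request
   passed to reiser4_try_capture() combines exactly one exclusive type from
   TXN_CAPTURE_TYPES with optional flag bits; CAPTURE_TYPE() recovers the
   exclusive part. The request value below is hypothetical.

	txn_capture req = TXN_CAPTURE_WRITE | TXN_CAPTURE_NONBLOCKING;

	assert("", CAPTURE_TYPE(req) == TXN_CAPTURE_WRITE);
	assert("", (req & TXN_CAPTURE_NONBLOCKING));
*/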
/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
   difference is in the handling of read requests. A WRITE_FUSING transaction handle
   defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
   transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
typedef enum {
	TXN_WRITE_FUSING = (1 << 0),
	TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING,	/* READ implies WRITE */
} txn_mode;
/* Every atom has a stage, which is one of these exclusive values: */
typedef enum {
	/* Initially an atom is free. */
	ASTAGE_FREE = 0,

	/* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
	   blocks and fuse with other atoms. */
	ASTAGE_CAPTURE_FUSE = 1,

	/* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for
	   every X nodes it flushes to disk, where X > 1. */

	/* When an atom reaches a certain age it must do all it can to commit. An atom in
	   the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
	   atoms in the CAPTURE_FUSE stage. */
	ASTAGE_CAPTURE_WAIT = 2,

	/* Waiting for I/O before commit. Copy-on-capture (see
	   http://namesys.com/v4/v4.html). */
	ASTAGE_PRE_COMMIT = 3,

	/* Post-commit overwrite I/O. Steal-on-capture. */
	ASTAGE_POST_COMMIT = 4,

	/* An atom in this stage waits for the removal of the last reference to it
	   before it is deleted from memory. */
	ASTAGE_DONE = 5
} txn_stage;
/* Certain flags may be set in the txn_atom->flags field. */
typedef enum {
	/* Indicates that the atom should commit as soon as possible. */
	ATOM_FORCE_COMMIT = (1 << 0),
	/* To avoid an endless loop, mark an atom (which was considered too
	 * small) after a failed attempt to fuse it. */
	ATOM_CANCEL_FUSION = (1 << 1)
} txn_flags;
/* Flags for controlling commit_txnh */
typedef enum {
	/* Wait for the atom's commit to complete in commit_txnh */
	TXNH_WAIT_COMMIT = 0x2,
	/* Don't commit the atom when this handle is closed */
	TXNH_DONT_COMMIT = 0x4
} txn_handle_flags_t;
/* TYPE DEFINITIONS */

/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
   fields, so typically an operation on the atom through either of these objects must (1)
   lock the object, (2) read the atom pointer, (3) lock the atom.

   During atom fusion, the process holds locks on both atoms at once. Then, it iterates
   through the list of handles and pages held by the smaller of the two atoms. For each
   handle and page referencing the smaller atom, the fusing process must: (1) lock the
   object, and (2) update the atom pointer.

   You can see that there is a conflict of lock ordering here, so the more-complex
   procedure should have priority, i.e., the fusing process has priority so that it is
   guaranteed to make progress and to avoid restarts.

   This decision, however, means additional complexity for acquiring the atom lock in the
   first place.

   The general original procedure followed in the code was:

	TXN_OBJECT *obj = ...;

	spin_lock (& obj->_lock);

	atom = obj->_atom;

	if (! spin_trylock_atom (atom))
	  {
	    spin_unlock (& obj->_lock);
	    RESTART OPERATION, THERE WAS A RACE;
	  }

	ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED

   It has, however, been found that this wastes CPU a lot in a manner that is
   hard to profile. So, proper refcounting was added to atoms, and the new
   standard locking sequence is as follows:

	TXN_OBJECT *obj = ...;

	spin_lock (& obj->_lock);

	atom = obj->_atom;

	if (! spin_trylock_atom (atom))
	  {
	    atomic_inc (& atom->refcount);
	    spin_unlock (& obj->_lock);
	    spin_lock (&atom->_lock);
	    atomic_dec (& atom->refcount);
	    // HERE atom is locked
	    spin_unlock (&atom->_lock);
	    RESTART OPERATION, THERE WAS A RACE;
	  }

	ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED

   (The core of this is implemented in the trylock_throttle() function.)

   See the jnode_get_atom() function for a common case.

   As an additional (and important) optimization allowing restarts to be avoided,
   it is possible to re-check the required pre-conditions at the HERE point in
   the code above and proceed without restarting if they are still satisfied.
*/
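
/* Illustrative sketch (not part of the original header): the refcounted locking
   sequence above written out for a hypothetical captured object. The object
   type and its field names (_lock, _atom) are placeholders; the atom primitives
   (spin_trylock_atom, spin_lock_atom, spin_unlock_atom, atomic_inc/atomic_dec
   on ->refcount) are the ones declared in this file.

	txn_atom *atom;

	spin_lock(&obj->_lock);
	atom = obj->_atom;
	if (atom != NULL && !spin_trylock_atom(atom)) {
		// Pin the atom so it cannot disappear, drop the object lock,
		// take the atom lock in the safe order, then restart.
		atomic_inc(&atom->refcount);
		spin_unlock(&obj->_lock);
		spin_lock_atom(atom);
		atomic_dec(&atom->refcount);
		// HERE: pre-conditions may be re-checked; if they still hold,
		// proceed without restarting.
		spin_unlock_atom(atom);
		goto restart;
	}
	// Both obj and atom are locked here (or the object has no atom).
*/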
/* An atomic transaction: this is the underlying system representation
   of a transaction, not the one seen by clients.

   Invariants involving this data-type: */
struct txn_atom {
	/* The spinlock protecting the atom, held during fusion and various other state
	   changes. */
	spinlock_t alock;

	/* The atom's reference counter. Increasing it (in case of a duplication
	   of an existing reference or when we are sure that some other
	   reference exists) may be done without taking the spinlock; decrementing
	   the ref. counter requires the spinlock to be held.

	   Each transaction handle counts in ->refcount. All jnodes count as
	   one reference acquired in atom_begin_andlock(), released in
	   commit_current_atom(). */
	atomic_t refcount;

	/* The atom_id identifies the atom in persistent records such as the log. */
	__u32 atom_id;

	/* Flags holding any of the txn_flags enumerated values (e.g.,
	   ATOM_FORCE_COMMIT). */
	__u32 flags;

	/* Number of open handles. */
	__u32 txnh_count;

	/* The number of znodes captured by this atom. Equal to the sum of lengths of the
	   dirty_nodes[level] and clean_nodes lists. */
	__u32 capture_count;

	/* Current transaction stage. */
	txn_stage stage;

	/* Start time. */
	unsigned long start_time;

	/* The atom's delete set. It collects block numbers of the nodes
	   which were deleted during the transaction. */
	struct list_head delete_set;

	/* The atom's wandered_block mapping. */
	struct list_head wandered_map;

	/* The transaction's list of dirty captured nodes--per level. Indexed
	   by (level). dirty_nodes[0] is for the znode-above-root. */
	struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];

	/* The transaction's list of clean captured nodes. */
	struct list_head clean_nodes;

	/* The atom's overwrite set. */
	struct list_head ovrwr_nodes;

	/* Nodes which are being written to disk. */
	struct list_head writeback_nodes;

	/* List of inodes captured by this atom. */
	struct list_head inodes;

	/* List of handles associated with this atom. */
	struct list_head txnh_list;

	/* Transaction list link: list of atoms in the transaction manager. */
	struct list_head atom_link;

	/* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
	struct list_head fwaitfor_list;

	/* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
	struct list_head fwaiting_list;

	/* Numbers of objects which were deleted/created in this transaction,
	   and thereby the numbers of object IDs which were released/deallocated. */
	int nr_objects_deleted;
	int nr_objects_created;

	/* Number of blocks allocated during the transaction. */
	__u64 nr_blocks_allocated;

	/* All of the atom's flush queue objects are on this list. */
	struct list_head flush_queues;

	/* Number of flush queues for this atom. */
	int nr_flush_queues;

	/* Number of jnodes which were removed from the atom's lists and put
	   on a flush queue. */
	int num_queued;

	/* Number of threads that wait for this atom to complete commit. */
	int nr_waiters;

	/* Number of threads which do jnode_flush() over this atom. */
	int nr_flushers;

	/* Number of flush queues which are IN_USE and whose jnodes from fq->prepped
	   are submitted to disk by the reiser4_write_fq() routine. */
	int nr_running_queues;

	/* A counter of grabbed unformatted nodes; see a description of the
	   reiser4 space reservation scheme at block_alloc.c */
	reiser4_block_nr flush_reserved;

	struct super_block *super;
};
#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
#define ATOM_FQ_LIST(fq) (&(fq)->prepped)

#define NODE_LIST(node) (node)->list
#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
ON_DEBUG(void
	 count_jnode(txn_atom *, jnode *, atom_list old_list,
		     atom_list new_list, int check_lists));
/* A transaction handle: the client obtains and commits this handle, which is assigned by
   the system to a txn_atom. */
struct txn_handle {
	/* Spinlock protecting the ->atom pointer */
	spinlock_t hlock;

	/* Flags for controlling commit_txnh() behavior, from txn_handle_flags_t */
	txn_handle_flags_t flags;

	/* Whether it is READ_FUSING or WRITE_FUSING. */
	txn_mode mode;

	/* If assigned, the atom it is part of. */
	txn_atom *atom;

	/* Transaction list link. Head is in txn_atom. */
	struct list_head txnh_link;
};
/* The transaction manager: one is contained in the reiser4_super_info_data */
struct txn_mgr {
	/* A spinlock protecting the atom list, id_count, flush_control */
	spinlock_t tmgr_lock;

	/* List of atoms. */
	struct list_head atoms_list;

	/* Number of atoms. */
	int atom_count;

	/* A counter used to assign atom->atom_id values. */
	__u32 id_count;

	/* A mutex object for commit serialization. */
	struct mutex commit_mutex;

	/* A list of all txnmgrs served by a particular daemon. */
	struct list_head linkage;

	/* Description of the daemon for this txnmgr. */
	ktxnmgrd_context *daemon;

	/* Parameters. Adjustable through mount options. */
	unsigned int atom_max_size;
	unsigned int atom_max_age;
	unsigned int atom_min_size;
	/* Max number of concurrent flushers for one atom, 0 - unlimited. */
	unsigned int atom_max_flushers;
	struct dentry *debugfs_atom_count;
	struct dentry *debugfs_id_count;
};
/* FUNCTION DECLARATIONS */

/* These are the externally (within Reiser4) visible transaction functions, therefore they
   are prefixed with "txn_". For comments, see txnmgr.c. */

extern int init_txnmgr_static(void);
extern void done_txnmgr_static(void);

extern void reiser4_init_txnmgr(txn_mgr *);
extern void reiser4_done_txnmgr(txn_mgr *);

extern int reiser4_txn_reserve(int reserved);

extern void reiser4_txn_begin(reiser4_context * context);
extern int reiser4_txn_end(reiser4_context * context);

extern void reiser4_txn_restart(reiser4_context * context);
extern void reiser4_txn_restart_current(void);
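
/* Illustrative sketch (an assumption about typical usage, not taken from this
   header): a reiser4 entry point opens a transaction handle through its context
   before touching the tree and closes it when done; the context variable and
   the surrounding setup are hypothetical.

	reiser4_context *ctx = ...;	// obtained for the current operation
	int ret;

	reiser4_txn_begin(ctx);
	// ... capture and modify blocks via reiser4_try_capture() ...
	ret = reiser4_txn_end(ctx);	// may commit the atom, returns status
*/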
extern int txnmgr_force_commit_all(struct super_block *, int);
extern int current_atom_should_commit(void);

extern jnode *find_first_dirty_jnode(txn_atom *, int);

extern int commit_some_atoms(txn_mgr *);
extern int force_commit_atom(txn_handle *);
extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);

extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);

extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
			   int alloc_value);
extern void atom_dec_and_unlock(txn_atom * atom);

extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
extern int try_capture_page_to_invalidate(struct page *pg);

extern void reiser4_uncapture_page(struct page *pg);
extern void reiser4_uncapture_block(jnode *);
extern void reiser4_uncapture_jnode(jnode *);

extern int reiser4_capture_inode(struct inode *);
extern int reiser4_uncapture_inode(struct inode *);

extern txn_atom *get_current_atom_locked_nocheck(void);
/**
 * atom_is_protected - make sure that nobody but us can do anything with atom
 * @atom: atom to be checked
 *
 * This is used to assert that the atom has either entered the commit stages or
 * is spin-locked.
 */
static inline int atom_is_protected(txn_atom *atom)
{
	if (atom->stage >= ASTAGE_PRE_COMMIT)
		return 1;
	assert_spin_locked(&(atom->alock));
	return 1;
}
/* Get the current atom and spinlock it if the current atom is present. May not
   return NULL. */
static inline txn_atom *get_current_atom_locked(void)
{
	txn_atom *atom;

	atom = get_current_atom_locked_nocheck();
	assert("zam-761", atom != NULL);
	return atom;
}
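
/* Illustrative sketch (not from the original header): the returned atom is
   spin-locked, so the caller is responsible for dropping the lock when done.

	txn_atom *atom;

	atom = get_current_atom_locked();
	// ... inspect or update atom state under the lock ...
	spin_unlock_atom(atom);
*/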
extern txn_atom *jnode_get_atom(jnode *);

extern void reiser4_atom_wait_event(txn_atom *);
extern void reiser4_atom_send_event(txn_atom *);

extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
extern int reiser4_capture_super_block(struct super_block *s);
int capture_bulk(jnode **, int count);
/* See the comment on the function blocknrset.c:blocknr_set_add for the
   calling convention of these three routines. */
extern void blocknr_set_init(struct list_head * bset);
extern void blocknr_set_destroy(struct list_head * bset);
extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
extern int blocknr_set_add_extent(txn_atom * atom,
				  struct list_head * bset,
				  blocknr_set_entry ** new_bsep,
				  const reiser4_block_nr * start,
				  const reiser4_block_nr * len);
extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
				blocknr_set_entry ** new_bsep,
				const reiser4_block_nr * a,
				const reiser4_block_nr * b);
typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
				    const reiser4_block_nr *, void *);
extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
				blocknr_set_actor_f actor, void *data,
				int delete);
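
/* Illustrative sketch (not part of the original header): an actor matching
   blocknr_set_actor_f that counts the entries it is called back with. The
   function name, the final iterator argument passed as 0, and the "return 0 to
   keep iterating" convention are assumptions.

	static int count_extents_actor(txn_atom *atom, const reiser4_block_nr *start,
				       const reiser4_block_nr *len, void *data)
	{
		(*(int *)data)++;
		return 0;
	}

	int nr = 0;

	blocknr_set_iterator(atom, &atom->delete_set, count_extents_actor, &nr, 0);
*/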
/* The flush code takes care of how to fuse flush queues. */
extern void flush_init_atom(txn_atom * atom);
extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
static inline void spin_lock_atom(txn_atom *atom)
{
	/* check that spinlocks of lower priorities are not held */
	assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
		    LOCK_CNT_NIL(spin_locked_atom) &&
		    LOCK_CNT_NIL(spin_locked_jnode) &&
		    LOCK_CNT_NIL(spin_locked_zlock) &&
		    LOCK_CNT_NIL(rw_locked_dk) &&
		    LOCK_CNT_NIL(rw_locked_tree)));

	spin_lock(&(atom->alock));

	LOCK_CNT_INC(spin_locked_atom);
	LOCK_CNT_INC(spin_locked);
}
static inline void spin_lock_atom_nested(txn_atom *atom)
{
	assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
		    LOCK_CNT_NIL(spin_locked_jnode) &&
		    LOCK_CNT_NIL(spin_locked_zlock) &&
		    LOCK_CNT_NIL(rw_locked_dk) &&
		    LOCK_CNT_NIL(rw_locked_tree)));

	spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);

	LOCK_CNT_INC(spin_locked_atom);
	LOCK_CNT_INC(spin_locked);
}
static inline int spin_trylock_atom(txn_atom *atom)
{
	if (spin_trylock(&(atom->alock))) {
		LOCK_CNT_INC(spin_locked_atom);
		LOCK_CNT_INC(spin_locked);
		return 1;
	}
	return 0;
}
static inline void spin_unlock_atom(txn_atom *atom)
{
	assert_spin_locked(&(atom->alock));
	assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
	assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));

	LOCK_CNT_DEC(spin_locked_atom);
	LOCK_CNT_DEC(spin_locked);

	spin_unlock(&(atom->alock));
}
static inline void spin_lock_txnh(txn_handle *txnh)
{
	/* check that spinlocks of lower priorities are not held */
	assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
		    LOCK_CNT_NIL(spin_locked_zlock) &&
		    LOCK_CNT_NIL(rw_locked_tree)));

	spin_lock(&(txnh->hlock));

	LOCK_CNT_INC(spin_locked_txnh);
	LOCK_CNT_INC(spin_locked);
}
static inline int spin_trylock_txnh(txn_handle *txnh)
{
	if (spin_trylock(&(txnh->hlock))) {
		LOCK_CNT_INC(spin_locked_txnh);
		LOCK_CNT_INC(spin_locked);
		return 1;
	}
	return 0;
}
static inline void spin_unlock_txnh(txn_handle *txnh)
{
	assert_spin_locked(&(txnh->hlock));
	assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
	assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));

	LOCK_CNT_DEC(spin_locked_txnh);
	LOCK_CNT_DEC(spin_locked);

	spin_unlock(&(txnh->hlock));
}
#define spin_ordering_pred_txnmgr(tmgr)		\
	( LOCK_CNT_NIL(spin_locked_atom) &&	\
	  LOCK_CNT_NIL(spin_locked_txnh) &&	\
	  LOCK_CNT_NIL(spin_locked_jnode) &&	\
	  LOCK_CNT_NIL(rw_locked_zlock) &&	\
	  LOCK_CNT_NIL(rw_locked_dk) &&		\
	  LOCK_CNT_NIL(rw_locked_tree) )
static inline void spin_lock_txnmgr(txn_mgr *mgr)
{
	/* check that spinlocks of lower priorities are not held */
	assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
		    LOCK_CNT_NIL(spin_locked_txnh) &&
		    LOCK_CNT_NIL(spin_locked_jnode) &&
		    LOCK_CNT_NIL(spin_locked_zlock) &&
		    LOCK_CNT_NIL(rw_locked_dk) &&
		    LOCK_CNT_NIL(rw_locked_tree)));

	spin_lock(&(mgr->tmgr_lock));

	LOCK_CNT_INC(spin_locked_txnmgr);
	LOCK_CNT_INC(spin_locked);
}
static inline int spin_trylock_txnmgr(txn_mgr *mgr)
{
	if (spin_trylock(&(mgr->tmgr_lock))) {
		LOCK_CNT_INC(spin_locked_txnmgr);
		LOCK_CNT_INC(spin_locked);
		return 1;
	}
	return 0;
}
static inline void spin_unlock_txnmgr(txn_mgr *mgr)
{
	assert_spin_locked(&(mgr->tmgr_lock));
	assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
	assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));

	LOCK_CNT_DEC(spin_locked_txnmgr);
	LOCK_CNT_DEC(spin_locked);

	spin_unlock(&(mgr->tmgr_lock));
}
typedef enum {
	FQ_IN_USE = 0x1
} flush_queue_state_t;

typedef struct flush_queue flush_queue_t;
/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
   is filled by the jnode_flush() routine, and written to disk under memory
   pressure or at atom commit time. */
/* LOCKING: fq state and fq->atom are protected by the guard spinlock; the
   fq->nr_queued field and the fq->prepped list can be modified if the atom is
   spin-locked and the fq object is in the "in_use" state. For read-only
   traversal of the fq->prepped list and reading of the fq->nr_queued field it
   is enough to keep the fq "in_use" or only have the atom spin-locked. */
struct flush_queue {
	/* The linkage element is the first in this structure to make debugging
	   easier. See the field in the atom struct for a description of this list. */
	struct list_head alink;

	/* A spinlock to protect changes of fq state and the fq->atom pointer */
	spinlock_t guard;

	/* flush_queue state: [in_use | ready] */
	flush_queue_state_t state;

	/* A list which contains queued nodes; queued nodes are removed from any
	   atom's list and put on this ->prepped one. */
	struct list_head prepped;

	/* number of submitted i/o requests */
	atomic_t nr_submitted;

	/* number of i/o errors */
	atomic_t nr_errors;

	/* An atom this flush queue is attached to */
	txn_atom *atom;

	/* A wait queue head to wait on i/o completion */
	wait_queue_head_t wait;

	/* A thread which took this fq into exclusive use, NULL if the fq is free;
	   used for debugging. */
	struct task_struct *owner;
};
extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
extern void reiser4_fq_put_nolock(flush_queue_t *);
extern void reiser4_fq_put(flush_queue_t *);
extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
extern void queue_jnode(flush_queue_t *, jnode *);

extern int reiser4_write_fq(flush_queue_t *, long *, int);
extern int current_atom_finish_all_fq(void);
extern void init_atom_fq_parts(txn_atom *);
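
/* Illustrative sketch (an assumption about typical usage, not taken from this
   header): flush code obtains a flush queue for the current atom, moves prepped
   jnodes onto it, submits them, and releases the queue. The nr_submitted/flags
   arguments and the error handling are placeholders.

	flush_queue_t *fq;
	long nr_submitted = 0;
	int ret;

	ret = reiser4_fq_by_atom(atom, &fq);
	if (ret == 0) {
		queue_jnode(fq, node);		// move a prepped jnode onto fq->prepped
		ret = reiser4_write_fq(fq, &nr_submitted, 0);	// submit queued nodes for i/o
		reiser4_fq_put(fq);		// drop exclusive use of the queue
	}
*/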
extern reiser4_block_nr txnmgr_count_deleted_blocks(void);

extern void znode_make_dirty(znode * node);
extern void jnode_make_dirty_locked(jnode * node);

extern int reiser4_sync_atom(txn_atom * atom);

extern int atom_fq_parts_are_clean(txn_atom *);

extern void add_fq_to_bio(flush_queue_t *, struct bio *);
extern flush_queue_t *get_fq_for_current_atom(void);

void reiser4_invalidate_list(struct list_head * head);

#endif /* __REISER4_TXNMGR_H__ */