1 /* Copyright 2005 by Hans Reiser, licensing governed by
6 /* return true, iff @coord points to the valid directory item that is part of
8 static int is_valid_dir_coord(struct inode
*inode
, coord_t
* coord
)
10 return plugin_of_group(item_plugin_by_coord(coord
),
11 DIR_ENTRY_ITEM_TYPE
) &&
12 inode_file_plugin(inode
)->owns_item(inode
, coord
);
15 /* compare two logical positions within the same directory */
16 static cmp_t
dir_pos_cmp(const struct dir_pos
* p1
, const struct dir_pos
* p2
)
20 assert("nikita-2534", p1
!= NULL
);
21 assert("nikita-2535", p2
!= NULL
);
23 result
= de_id_cmp(&p1
->dir_entry_key
, &p2
->dir_entry_key
);
24 if (result
== EQUAL_TO
) {
27 diff
= p1
->pos
- p2
->pos
;
29 (diff
< 0) ? LESS_THAN
: (diff
? GREATER_THAN
: EQUAL_TO
);
34 /* see comment before reiser4_readdir_common() for overview of why "adjustment" is
37 adjust_dir_pos(struct file
*dir
, struct readdir_pos
* readdir_spot
,
38 const struct dir_pos
* mod_point
, int adj
)
43 * new directory entry was added (adj == +1) or removed (adj == -1) at
44 * the @mod_point. Directory file descriptor @dir is doing readdir and
45 * is currently positioned at @readdir_spot. Latter has to be updated
46 * to maintain stable readdir.
48 /* directory is positioned to the beginning. */
49 if (readdir_spot
->entry_no
== 0)
52 pos
= &readdir_spot
->position
;
53 switch (dir_pos_cmp(mod_point
, pos
)) {
55 /* @mod_pos is _before_ @readdir_spot, that is, entry was
56 * added/removed on the left (in key order) of current
58 /* logical number of directory entry readdir is "looking" at
60 readdir_spot
->entry_no
+= adj
;
62 ergo(dir
!= NULL
, reiser4_get_dir_fpos(dir
) + adj
>= 0));
63 if (de_id_cmp(&pos
->dir_entry_key
,
64 &mod_point
->dir_entry_key
) == EQUAL_TO
) {
65 assert("nikita-2575", mod_point
->pos
< pos
->pos
);
67 * if entry added/removed has the same key as current
68 * for readdir, update counter of duplicate keys in
75 /* directory is modified after @pos: nothing to do. */
78 /* cannot insert an entry readdir is looking at, because it
80 assert("nikita-2576", adj
< 0);
81 /* directory entry to which @pos points to is being
84 NOTE-NIKITA: Right thing to do is to update @pos to point
85 to the next entry. This is complex (we are under spin-lock
86 for one thing). Just rewind it to the beginning. Next
87 readdir will have to scan the beginning of
88 directory. Proper solution is to use semaphore in
89 spin lock's stead and use rewind_right() here.
91 NOTE-NIKITA: now, semaphore is used, so...
93 memset(readdir_spot
, 0, sizeof *readdir_spot
);
97 /* scan all file-descriptors for this directory and adjust their
98 positions respectively. Should be used by implementations of
99 add_entry and rem_entry of dir plugin */
100 void reiser4_adjust_dir_file(struct inode
*dir
, const struct dentry
*de
,
103 reiser4_file_fsdata
*scan
;
104 struct dir_pos mod_point
;
106 assert("nikita-2536", dir
!= NULL
);
107 assert("nikita-2538", de
!= NULL
);
108 assert("nikita-2539", adj
!= 0);
110 build_de_id(dir
, &de
->d_name
, &mod_point
.dir_entry_key
);
111 mod_point
.pos
= offset
;
113 spin_lock_inode(dir
);
116 * new entry was added/removed in directory @dir. Scan all file
117 * descriptors for @dir that are currently involved into @readdir and
121 list_for_each_entry(scan
, get_readdir_list(dir
), dir
.linkage
)
122 adjust_dir_pos(scan
->back
, &scan
->dir
.readdir
, &mod_point
, adj
);
124 spin_unlock_inode(dir
);
128 * traverse tree to start/continue readdir from the readdir position @pos.
130 static int dir_go_to(struct file
*dir
, struct readdir_pos
* pos
, tap_t
* tap
)
136 assert("nikita-2554", pos
!= NULL
);
138 inode
= dir
->f_dentry
->d_inode
;
139 result
= inode_dir_plugin(inode
)->build_readdir_key(dir
, &key
);
142 result
= reiser4_object_lookup(inode
,
148 LEAF_LEVEL
, LEAF_LEVEL
,
150 if (result
== CBK_COORD_FOUND
)
151 result
= rewind_right(tap
, (int)pos
->position
.pos
);
153 tap
->coord
->node
= NULL
;
155 result
= RETERR(-EIO
);
161 * handling of non-unique keys: calculate at what ordinal position within
162 * sequence of directory items with identical keys @pos is.
164 static int set_pos(struct inode
*inode
, struct readdir_pos
* pos
, tap_t
* tap
)
173 coord_init_zero(&coord
);
175 reiser4_tap_init(&scan
, &coord
, &lh
, ZNODE_READ_LOCK
);
176 reiser4_tap_copy(&scan
, tap
);
177 reiser4_tap_load(&scan
);
178 pos
->position
.pos
= 0;
180 did
= &pos
->position
.dir_entry_key
;
182 if (is_valid_dir_coord(inode
, scan
.coord
)) {
184 build_de_id_by_key(unit_key_by_coord(scan
.coord
, &de_key
), did
);
188 result
= go_prev_unit(&scan
);
192 if (!is_valid_dir_coord(inode
, scan
.coord
)) {
197 /* get key of directory entry */
198 unit_key_by_coord(scan
.coord
, &de_key
);
199 if (de_id_key_cmp(did
, &de_key
) != EQUAL_TO
) {
200 /* duplicate-sequence is over */
206 result
= RETERR(-ENOENT
);
207 reiser4_tap_relse(&scan
);
208 reiser4_tap_done(&scan
);
213 * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
215 static int dir_rewind(struct file
*dir
, struct readdir_pos
* pos
, tap_t
* tap
)
223 assert("nikita-2553", dir
!= NULL
);
224 assert("nikita-2548", pos
!= NULL
);
225 assert("nikita-2551", tap
->coord
!= NULL
);
226 assert("nikita-2552", tap
->lh
!= NULL
);
228 dirpos
= reiser4_get_dir_fpos(dir
);
229 shift
= dirpos
- pos
->fpos
;
230 /* this is logical directory entry within @dir which we are rewinding
232 destination
= pos
->entry_no
+ shift
;
234 inode
= dir
->f_dentry
->d_inode
;
236 return RETERR(-EINVAL
);
237 else if (destination
== 0ll || dirpos
== 0) {
238 /* rewind to the beginning of directory */
239 memset(pos
, 0, sizeof *pos
);
240 return dir_go_to(dir
, pos
, tap
);
241 } else if (destination
>= inode
->i_size
)
242 return RETERR(-ENOENT
);
245 /* I am afraid of negative numbers */
247 /* rewinding to the left */
248 if (shift
<= (int)pos
->position
.pos
) {
249 /* destination is within sequence of entries with
251 result
= dir_go_to(dir
, pos
, tap
);
253 shift
-= pos
->position
.pos
;
255 /* repetitions: deadlock is possible when
256 going to the left. */
257 result
= dir_go_to(dir
, pos
, tap
);
259 result
= rewind_left(tap
, shift
);
260 if (result
== -E_DEADLOCK
) {
261 reiser4_tap_done(tap
);
269 /* rewinding to the right */
270 result
= dir_go_to(dir
, pos
, tap
);
272 result
= rewind_right(tap
, shift
);
275 result
= set_pos(inode
, pos
, tap
);
277 /* update pos->position.pos */
278 pos
->entry_no
= destination
;
286 * Function that is called by common_readdir() on each directory entry while
287 * doing readdir. ->filldir callback may block, so we had to release long term
288 * lock while calling it. To avoid repeating tree traversal, seal is used. If
289 * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
291 * Whether node is unlocked in case of any other error is undefined. It is
292 * guaranteed to be still locked if success (0) is returned.
294 * When ->filldir() wants no more, feed_entry() returns 1, and node is
298 feed_entry(struct file
*f
, struct readdir_pos
* pos
, tap_t
* tap
,
299 filldir_t filldir
, void *dirent
)
305 char buf
[DE_NAME_BUF_LEN
];
311 reiser4_key entry_key
;
314 iplug
= item_plugin_by_coord(coord
);
316 /* pointer to name within the node */
317 name
= iplug
->s
.dir
.extract_name(coord
, buf
);
318 assert("nikita-1371", name
!= NULL
);
320 /* key of object the entry points to */
321 if (iplug
->s
.dir
.extract_key(coord
, &sd_key
) != 0)
324 /* we must release longterm znode lock before calling filldir to avoid
325 deadlock which may happen if filldir causes page fault. So, copy
326 name to intermediate buffer */
327 if (strlen(name
) + 1 > sizeof(name_buf
)) {
328 local_name
= kmalloc(strlen(name
) + 1,
329 reiser4_ctx_gfp_mask_get());
330 if (local_name
== NULL
)
331 return RETERR(-ENOMEM
);
333 local_name
= name_buf
;
335 strcpy(local_name
, name
);
336 file_type
= iplug
->s
.dir
.extract_file_type(coord
);
338 unit_key_by_coord(coord
, &entry_key
);
339 reiser4_seal_init(&seal
, coord
, &entry_key
);
341 longterm_unlock_znode(tap
->lh
);
344 * send information about directory entry to the ->filldir() filler
345 * supplied to us by caller (VFS).
347 * ->filldir is entitled to do weird things. For example, ->filldir
348 * supplied by knfsd re-enters file system. Make sure no locks are
351 assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
353 reiser4_txn_restart_current();
354 result
= filldir(dirent
, name
, (int)strlen(name
),
355 /* offset of this entry */
357 /* inode number of object bounden by this entry */
358 oid_to_uino(get_key_objectid(&sd_key
)), file_type
);
359 if (local_name
!= name_buf
)
362 /* ->filldir() is satisfied. (no space in buffer, IOW) */
365 result
= reiser4_seal_validate(&seal
, coord
, &entry_key
,
371 static void move_entry(struct readdir_pos
* pos
, coord_t
* coord
)
378 did
= &pos
->position
.dir_entry_key
;
380 /* get key of directory entry */
381 unit_key_by_coord(coord
, &de_key
);
383 if (de_id_key_cmp(did
, &de_key
) == EQUAL_TO
)
384 /* we are within sequence of directory entries
385 with duplicate keys. */
388 pos
->position
.pos
= 0;
389 build_de_id_by_key(&de_key
, did
);
397 * readdir support in reiser4 relies on ability to update readdir_pos embedded
398 * into reiser4_file_fsdata on each directory modification (name insertion and
399 * removal), see reiser4_readdir_common() function below. This obviously doesn't
400 * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
401 * across client READDIR requests for the same directory.
403 * To address this we maintain a "pool" of detached reiser4_file_fsdata
404 * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
405 * find detached reiser4_file_fsdata corresponding to previous readdir
406 * request. In other words, additional state is maintained on the
407 * server. (This is somewhat contrary to the design goals of NFS protocol.)
409 * To efficiently detect when our ->readdir() method is called by NFS server,
410 * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
411 * file_is_stateless() function).
413 * To find out d_cursor in the pool, we encode client id (cid) in the highest
414 * bits of NFS readdir cookie: when first readdir request comes to the given
415 * directory from the given client, cookie is set to 0. This situation is
416 * detected, global cid_counter is incremented, and stored in highest bits of
417 * all direntry offsets returned to the client, including last one. As the
418 * only valid readdir cookie is one obtained as direntry->offset, we are
419 * guaranteed that next readdir request (continuing current one) will have
420 * current cid in the highest bits of starting readdir cookie. All d_cursors
421 * are hashed into per-super-block hash table by (oid, cid) key.
423 * In addition d_cursors are placed into per-super-block radix tree where they
424 * are keyed by oid alone. This is necessary to efficiently remove them during
427 * At last, currently unused d_cursors are linked into special list. This list
428 * is used d_cursor_shrink to reclaim d_cursors on memory pressure.
433 * prepare for readdir.
435 static int dir_readdir_init(struct file
*f
, tap_t
* tap
,
436 struct readdir_pos
** pos
)
439 reiser4_file_fsdata
*fsdata
;
442 assert("nikita-1359", f
!= NULL
);
443 inode
= f
->f_dentry
->d_inode
;
444 assert("nikita-1360", inode
!= NULL
);
446 if (!S_ISDIR(inode
->i_mode
))
447 return RETERR(-ENOTDIR
);
449 /* try to find detached readdir state */
450 result
= reiser4_attach_fsdata(f
, inode
);
454 fsdata
= reiser4_get_file_fsdata(f
);
455 assert("nikita-2571", fsdata
!= NULL
);
457 return PTR_ERR(fsdata
);
459 /* add file descriptor to the readdir list hanging of directory
460 * inode. This list is used to scan "readdirs-in-progress" while
461 * inserting or removing names in the directory. */
462 spin_lock_inode(inode
);
463 if (list_empty_careful(&fsdata
->dir
.linkage
))
464 list_add(&fsdata
->dir
.linkage
, get_readdir_list(inode
));
465 *pos
= &fsdata
->dir
.readdir
;
466 spin_unlock_inode(inode
);
468 /* move @tap to the current position */
469 return dir_rewind(f
, *pos
, tap
);
472 /* this is implementation of vfs's llseek method of struct file_operations for
474 See comment before reiser4_readdir_common() for explanation.
476 loff_t
reiser4_llseek_dir_common(struct file
* file
, loff_t off
, int origin
)
478 reiser4_context
*ctx
;
482 inode
= file
->f_dentry
->d_inode
;
484 ctx
= reiser4_init_context(inode
->i_sb
);
488 mutex_lock(&inode
->i_mutex
);
491 result
= default_llseek(file
, off
, origin
);
497 struct readdir_pos
*pos
;
499 coord_init_zero(&coord
);
501 reiser4_tap_init(&tap
, &coord
, &lh
, ZNODE_READ_LOCK
);
503 ff
= dir_readdir_init(file
, &tap
, &pos
);
504 reiser4_detach_fsdata(file
);
506 result
= (loff_t
) ff
;
507 reiser4_tap_done(&tap
);
509 reiser4_detach_fsdata(file
);
510 mutex_unlock(&inode
->i_mutex
);
512 reiser4_exit_context(ctx
);
516 /* this is common implementation of vfs's readdir method of struct
521 readdir(2)/getdents(2) interface is based on implicit assumption that
522 readdir can be restarted from any particular point by supplying file system
523 with off_t-full of data. That is, file system fills ->d_off field in struct
524 dirent and later user passes ->d_off to the seekdir(3), which is, actually,
525 implemented by glibc as lseek(2) on directory.
527 Reiser4 cannot restart readdir from 64 bits of data, because two last
528 components of the key of directory entry are unknown, which given 128 bits:
529 locality and type fields in the key of directory entry are always known, to
530 start readdir() from given point objectid and offset fields have to be
533 Traditional UNIX API for scanning through directory
534 (readdir/seekdir/telldir/opendir/closedir/rewindir/getdents) is based on the
535 assumption that directory is structured very much like regular file, in
536 particular, it is implied that each name within given directory (directory
537 entry) can be uniquely identified by scalar offset and that such offset is
538 stable across the life-time of the name is identifies.
540 This is manifestly not so for reiser4. In reiser4 the only stable unique
541 identifies for the directory entry is its key that doesn't fit into
546 Within each file descriptor participating in readdir-ing of directory
547 plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
548 the "current" directory entry that file descriptor looks at. It contains a
549 key of directory entry (plus some additional info to deal with non-unique
550 keys that we wouldn't dwell onto here) and a logical position of this
551 directory entry starting from the beginning of the directory, that is
552 ordinal number of this entry in the readdir order.
554 Obviously this logical position is not stable in the face of directory
555 modifications. To work around this, on each addition or removal of directory
556 entry all file descriptors for directory inode are scanned and their
557 readdir_pos are updated accordingly (adjust_dir_pos()).
559 int reiser4_readdir_common(struct file
*f
/* directory file being read */,
560 void *dirent
/* opaque data passed to us by VFS */,
561 filldir_t filld
/* filler function passed to us
564 reiser4_context
*ctx
;
570 struct readdir_pos
*pos
;
572 assert("nikita-1359", f
!= NULL
);
573 inode
= f
->f_dentry
->d_inode
;
574 assert("nikita-1360", inode
!= NULL
);
576 if (!S_ISDIR(inode
->i_mode
))
577 return RETERR(-ENOTDIR
);
579 ctx
= reiser4_init_context(inode
->i_sb
);
583 coord_init_zero(&coord
);
585 reiser4_tap_init(&tap
, &coord
, &lh
, ZNODE_READ_LOCK
);
587 reiser4_readdir_readahead_init(inode
, &tap
);
590 result
= dir_readdir_init(f
, &tap
, &pos
);
592 result
= reiser4_tap_load(&tap
);
593 /* scan entries one by one feeding them to @filld */
594 while (result
== 0) {
598 assert("nikita-2572", coord_is_existing_unit(coord
));
599 assert("nikita-3227", is_valid_dir_coord(inode
, coord
));
601 result
= feed_entry(f
, pos
, &tap
, filld
, dirent
);
604 } else if (result
== 0) {
606 result
= go_next_unit(&tap
);
607 if (result
== -E_NO_NEIGHBOR
||
611 } else if (result
== 0) {
612 if (is_valid_dir_coord(inode
, coord
))
613 move_entry(pos
, coord
);
617 } else if (result
== -E_REPEAT
) {
618 /* feed_entry() had to restart. */
620 reiser4_tap_relse(&tap
);
624 "reiser4_readdir_common: unexpected error %d",
627 reiser4_tap_relse(&tap
);
630 f
->f_version
= inode
->i_version
;
631 } else if (result
== -E_NO_NEIGHBOR
|| result
== -ENOENT
)
633 reiser4_tap_done(&tap
);
634 reiser4_detach_fsdata(f
);
636 /* try to update directory's atime */
637 if (reiser4_grab_space_force(inode_file_plugin(inode
)->estimate
.update(inode
),
639 warning("", "failed to update atime on readdir: %llu",
640 get_inode_oid(inode
));
644 context_set_commit_async(ctx
);
645 reiser4_exit_context(ctx
);
647 return (result
<= 0) ? result
: 0;
652 * c-indentation-style: "K&R"