On Tue, Nov 06, 2007 at 02:33:53AM -0800, akpm@linux-foundation.org wrote:
[mmotm.git] / fs / reiser4 / plugin / file_ops_readdir.c
blob1c93c198cbd00f130a2607ca42c4e9cc58311483
1 /* Copyright 2005 by Hans Reiser, licensing governed by
2 * reiser4/README */
4 #include "../inode.h"
6 /* return true, iff @coord points to the valid directory item that is part of
7 * @inode directory. */
8 static int is_valid_dir_coord(struct inode *inode, coord_t *coord)
10 return plugin_of_group(item_plugin_by_coord(coord),
11 DIR_ENTRY_ITEM_TYPE) &&
12 inode_file_plugin(inode)->owns_item(inode, coord);
15 /* compare two logical positions within the same directory */
16 static cmp_t dir_pos_cmp(const struct dir_pos *p1, const struct dir_pos *p2)
18 cmp_t result;
20 assert("nikita-2534", p1 != NULL);
21 assert("nikita-2535", p2 != NULL);
23 result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
24 if (result == EQUAL_TO) {
25 int diff;
27 diff = p1->pos - p2->pos;
28 result =
29 (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
31 return result;
34 /* see comment before reiser4_readdir_common() for overview of why "adjustment"
35 * is necessary. */
36 static void
37 adjust_dir_pos(struct file *dir, struct readdir_pos *readdir_spot,
38 const struct dir_pos *mod_point, int adj)
40 struct dir_pos *pos;
43 * new directory entry was added (adj == +1) or removed (adj == -1) at
44 * the @mod_point. Directory file descriptor @dir is doing readdir and
45 * is currently positioned at @readdir_spot. Latter has to be updated
46 * to maintain stable readdir.
48 /* directory is positioned to the beginning. */
49 if (readdir_spot->entry_no == 0)
50 return;
52 pos = &readdir_spot->position;
53 switch (dir_pos_cmp(mod_point, pos)) {
54 case LESS_THAN:
55 /* @mod_pos is _before_ @readdir_spot, that is, entry was
56 * added/removed on the left (in key order) of current
57 * position. */
58 /* logical number of directory entry readdir is "looking" at
59 * changes */
60 readdir_spot->entry_no += adj;
61 assert("nikita-2577",
62 ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
63 if (de_id_cmp(&pos->dir_entry_key,
64 &mod_point->dir_entry_key) == EQUAL_TO) {
65 assert("nikita-2575", mod_point->pos < pos->pos);
67 * if entry added/removed has the same key as current
68 * for readdir, update counter of duplicate keys in
69 * @readdir_spot.
71 pos->pos += adj;
73 break;
74 case GREATER_THAN:
75 /* directory is modified after @pos: nothing to do. */
76 break;
77 case EQUAL_TO:
78 /* cannot insert an entry readdir is looking at, because it
79 already exists. */
80 assert("nikita-2576", adj < 0);
81 /* directory entry to which @pos points to is being
82 removed.
84 NOTE-NIKITA: Right thing to do is to update @pos to point
85 to the next entry. This is complex (we are under spin-lock
86 for one thing). Just rewind it to the beginning. Next
87 readdir will have to scan the beginning of
88 directory. Proper solution is to use semaphore in
89 spin lock's stead and use rewind_right() here.
91 NOTE-NIKITA: now, semaphore is used, so...
93 memset(readdir_spot, 0, sizeof *readdir_spot);
97 /* scan all file-descriptors for this directory and adjust their
98 positions respectively. Should be used by implementations of
99 add_entry and rem_entry of dir plugin */
100 void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
101 int offset, int adj)
103 reiser4_file_fsdata *scan;
104 struct dir_pos mod_point;
106 assert("nikita-2536", dir != NULL);
107 assert("nikita-2538", de != NULL);
108 assert("nikita-2539", adj != 0);
110 build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
111 mod_point.pos = offset;
113 spin_lock_inode(dir);
116 * new entry was added/removed in directory @dir. Scan all file
117 * descriptors for @dir that are currently involved into @readdir and
118 * update them.
121 list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
122 adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
124 spin_unlock_inode(dir);
128 * traverse tree to start/continue readdir from the readdir position @pos.
130 static int dir_go_to(struct file *dir, struct readdir_pos *pos, tap_t *tap)
132 reiser4_key key;
133 int result;
134 struct inode *inode;
136 assert("nikita-2554", pos != NULL);
138 inode = dir->f_dentry->d_inode;
139 result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
140 if (result != 0)
141 return result;
142 result = reiser4_object_lookup(inode,
143 &key,
144 tap->coord,
145 tap->lh,
146 tap->mode,
147 FIND_EXACT,
148 LEAF_LEVEL, LEAF_LEVEL,
149 0, &tap->ra_info);
150 if (result == CBK_COORD_FOUND)
151 result = rewind_right(tap, (int)pos->position.pos);
152 else {
153 tap->coord->node = NULL;
154 done_lh(tap->lh);
155 result = RETERR(-EIO);
157 return result;
161 * handling of non-unique keys: calculate at what ordinal position within
162 * sequence of directory items with identical keys @pos is.
164 static int set_pos(struct inode *inode, struct readdir_pos *pos, tap_t *tap)
166 int result;
167 coord_t coord;
168 lock_handle lh;
169 tap_t scan;
170 de_id *did;
171 reiser4_key de_key;
173 coord_init_zero(&coord);
174 init_lh(&lh);
175 reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
176 reiser4_tap_copy(&scan, tap);
177 reiser4_tap_load(&scan);
178 pos->position.pos = 0;
180 did = &pos->position.dir_entry_key;
182 if (is_valid_dir_coord(inode, scan.coord)) {
184 build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
186 while (1) {
188 result = go_prev_unit(&scan);
189 if (result != 0)
190 break;
192 if (!is_valid_dir_coord(inode, scan.coord)) {
193 result = -EINVAL;
194 break;
197 /* get key of directory entry */
198 unit_key_by_coord(scan.coord, &de_key);
199 if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
200 /* duplicate-sequence is over */
201 break;
203 pos->position.pos++;
205 } else
206 result = RETERR(-ENOENT);
207 reiser4_tap_relse(&scan);
208 reiser4_tap_done(&scan);
209 return result;
213 * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
215 static int dir_rewind(struct file *dir, struct readdir_pos *pos, tap_t *tap)
217 __u64 destination;
218 __s64 shift;
219 int result;
220 struct inode *inode;
221 loff_t dirpos;
223 assert("nikita-2553", dir != NULL);
224 assert("nikita-2548", pos != NULL);
225 assert("nikita-2551", tap->coord != NULL);
226 assert("nikita-2552", tap->lh != NULL);
228 dirpos = reiser4_get_dir_fpos(dir);
229 shift = dirpos - pos->fpos;
230 /* this is logical directory entry within @dir which we are rewinding
231 * to */
232 destination = pos->entry_no + shift;
234 inode = dir->f_dentry->d_inode;
235 if (dirpos < 0)
236 return RETERR(-EINVAL);
237 else if (destination == 0ll || dirpos == 0) {
238 /* rewind to the beginning of directory */
239 memset(pos, 0, sizeof *pos);
240 return dir_go_to(dir, pos, tap);
241 } else if (destination >= inode->i_size)
242 return RETERR(-ENOENT);
244 if (shift < 0) {
245 /* I am afraid of negative numbers */
246 shift = -shift;
247 /* rewinding to the left */
248 if (shift <= (int)pos->position.pos) {
249 /* destination is within sequence of entries with
250 duplicate keys. */
251 result = dir_go_to(dir, pos, tap);
252 } else {
253 shift -= pos->position.pos;
254 while (1) {
255 /* repetitions: deadlock is possible when
256 going to the left. */
257 result = dir_go_to(dir, pos, tap);
258 if (result == 0) {
259 result = rewind_left(tap, shift);
260 if (result == -E_DEADLOCK) {
261 reiser4_tap_done(tap);
262 continue;
265 break;
268 } else {
269 /* rewinding to the right */
270 result = dir_go_to(dir, pos, tap);
271 if (result == 0)
272 result = rewind_right(tap, shift);
274 if (result == 0) {
275 result = set_pos(inode, pos, tap);
276 if (result == 0) {
277 /* update pos->position.pos */
278 pos->entry_no = destination;
279 pos->fpos = dirpos;
282 return result;
286 * Function that is called by common_readdir() on each directory entry while
287 * doing readdir. ->filldir callback may block, so we had to release long term
288 * lock while calling it. To avoid repeating tree traversal, seal is used. If
289 * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
291 * Whether node is unlocked in case of any other error is undefined. It is
292 * guaranteed to be still locked if success (0) is returned.
294 * When ->filldir() wants no more, feed_entry() returns 1, and node is
295 * unlocked.
297 static int
298 feed_entry(struct file *f, struct readdir_pos *pos, tap_t *tap,
299 filldir_t filldir, void *dirent)
301 item_plugin *iplug;
302 char *name;
303 reiser4_key sd_key;
304 int result;
305 char buf[DE_NAME_BUF_LEN];
306 char name_buf[32];
307 char *local_name;
308 unsigned file_type;
309 seal_t seal;
310 coord_t *coord;
311 reiser4_key entry_key;
313 coord = tap->coord;
314 iplug = item_plugin_by_coord(coord);
316 /* pointer to name within the node */
317 name = iplug->s.dir.extract_name(coord, buf);
318 assert("nikita-1371", name != NULL);
320 /* key of object the entry points to */
321 if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
322 return RETERR(-EIO);
324 /* we must release longterm znode lock before calling filldir to avoid
325 deadlock which may happen if filldir causes page fault. So, copy
326 name to intermediate buffer */
327 if (strlen(name) + 1 > sizeof(name_buf)) {
328 local_name = kmalloc(strlen(name) + 1,
329 reiser4_ctx_gfp_mask_get());
330 if (local_name == NULL)
331 return RETERR(-ENOMEM);
332 } else
333 local_name = name_buf;
335 strcpy(local_name, name);
336 file_type = iplug->s.dir.extract_file_type(coord);
338 unit_key_by_coord(coord, &entry_key);
339 reiser4_seal_init(&seal, coord, &entry_key);
341 longterm_unlock_znode(tap->lh);
344 * send information about directory entry to the ->filldir() filler
345 * supplied to us by caller (VFS).
347 * ->filldir is entitled to do weird things. For example, ->filldir
348 * supplied by knfsd re-enters file system. Make sure no locks are
349 * held.
351 assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
353 reiser4_txn_restart_current();
354 result = filldir(dirent, name, (int)strlen(name),
355 /* offset of this entry */
356 f->f_pos,
357 /* inode number of object bounden by this entry */
358 oid_to_uino(get_key_objectid(&sd_key)), file_type);
359 if (local_name != name_buf)
360 kfree(local_name);
361 if (result < 0)
362 /* ->filldir() is satisfied. (no space in buffer, IOW) */
363 result = 1;
364 else
365 result = reiser4_seal_validate(&seal, coord, &entry_key,
366 tap->lh, tap->mode,
367 ZNODE_LOCK_HIPRI);
368 return result;
371 static void move_entry(struct readdir_pos *pos, coord_t *coord)
373 reiser4_key de_key;
374 de_id *did;
376 /* update @pos */
377 ++pos->entry_no;
378 did = &pos->position.dir_entry_key;
380 /* get key of directory entry */
381 unit_key_by_coord(coord, &de_key);
383 if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
384 /* we are within sequence of directory entries
385 with duplicate keys. */
386 ++pos->position.pos;
387 else {
388 pos->position.pos = 0;
389 build_de_id_by_key(&de_key, did);
391 ++pos->fpos;
395 * STATELESS READDIR
397 * readdir support in reiser4 relies on ability to update readdir_pos embedded
398 * into reiser4_file_fsdata on each directory modification (name insertion and
399 * removal), see reiser4_readdir_common() function below. This obviously doesn't
400 * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
401 * across client READDIR requests for the same directory.
403 * To address this we maintain a "pool" of detached reiser4_file_fsdata
404 * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
405 * find detached reiser4_file_fsdata corresponding to previous readdir
406 * request. In other words, additional state is maintained on the
407 * server. (This is somewhat contrary to the design goals of NFS protocol.)
409 * To efficiently detect when our ->readdir() method is called by NFS server,
410 * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
411 * file_is_stateless() function).
413 * To find out d_cursor in the pool, we encode client id (cid) in the highest
414 * bits of NFS readdir cookie: when first readdir request comes to the given
415 * directory from the given client, cookie is set to 0. This situation is
416 * detected, global cid_counter is incremented, and stored in highest bits of
417 * all direntry offsets returned to the client, including last one. As the
418 * only valid readdir cookie is one obtained as direntry->offset, we are
419 * guaranteed that next readdir request (continuing current one) will have
420 * current cid in the highest bits of starting readdir cookie. All d_cursors
421 * are hashed into per-super-block hash table by (oid, cid) key.
423 * In addition d_cursors are placed into per-super-block radix tree where they
424 * are keyed by oid alone. This is necessary to efficiently remove them during
425 * rmdir.
427 * At last, currently unused d_cursors are linked into special list. This list
428 * is used d_cursor_shrink to reclaim d_cursors on memory pressure.
433 * prepare for readdir.
435 static int dir_readdir_init(struct file *f, tap_t *tap,
436 struct readdir_pos **pos)
438 struct inode *inode;
439 reiser4_file_fsdata *fsdata;
440 int result;
442 assert("nikita-1359", f != NULL);
443 inode = f->f_dentry->d_inode;
444 assert("nikita-1360", inode != NULL);
446 if (!S_ISDIR(inode->i_mode))
447 return RETERR(-ENOTDIR);
449 /* try to find detached readdir state */
450 result = reiser4_attach_fsdata(f, inode);
451 if (result != 0)
452 return result;
454 fsdata = reiser4_get_file_fsdata(f);
455 assert("nikita-2571", fsdata != NULL);
456 if (IS_ERR(fsdata))
457 return PTR_ERR(fsdata);
459 /* add file descriptor to the readdir list hanging of directory
460 * inode. This list is used to scan "readdirs-in-progress" while
461 * inserting or removing names in the directory. */
462 spin_lock_inode(inode);
463 if (list_empty_careful(&fsdata->dir.linkage))
464 list_add(&fsdata->dir.linkage, get_readdir_list(inode));
465 *pos = &fsdata->dir.readdir;
466 spin_unlock_inode(inode);
468 /* move @tap to the current position */
469 return dir_rewind(f, *pos, tap);
472 /* this is implementation of vfs's llseek method of struct file_operations for
473 typical directory
474 See comment before reiser4_readdir_common() for explanation.
476 loff_t reiser4_llseek_dir_common(struct file *file, loff_t off, int origin)
478 reiser4_context *ctx;
479 loff_t result;
480 struct inode *inode;
482 inode = file->f_dentry->d_inode;
484 ctx = reiser4_init_context(inode->i_sb);
485 if (IS_ERR(ctx))
486 return PTR_ERR(ctx);
488 mutex_lock(&inode->i_mutex);
490 /* update ->f_pos */
491 result = default_llseek(file, off, origin);
492 if (result >= 0) {
493 int ff;
494 coord_t coord;
495 lock_handle lh;
496 tap_t tap;
497 struct readdir_pos *pos;
499 coord_init_zero(&coord);
500 init_lh(&lh);
501 reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
503 ff = dir_readdir_init(file, &tap, &pos);
504 reiser4_detach_fsdata(file);
505 if (ff != 0)
506 result = (loff_t) ff;
507 reiser4_tap_done(&tap);
509 reiser4_detach_fsdata(file);
510 mutex_unlock(&inode->i_mutex);
512 reiser4_exit_context(ctx);
513 return result;
516 /* this is common implementation of vfs's readdir method of struct
517 file_operations
519 readdir problems:
521 readdir(2)/getdents(2) interface is based on implicit assumption that
522 readdir can be restarted from any particular point by supplying file system
523 with off_t-full of data. That is, file system fills ->d_off field in struct
524 dirent and later user passes ->d_off to the seekdir(3), which is, actually,
525 implemented by glibc as lseek(2) on directory.
527 Reiser4 cannot restart readdir from 64 bits of data, because two last
528 components of the key of directory entry are unknown, which given 128 bits:
529 locality and type fields in the key of directory entry are always known, to
530 start readdir() from given point objectid and offset fields have to be
531 filled.
533 Traditional UNIX API for scanning through directory
534 (readdir/seekdir/telldir/opendir/closedir/rewindir/getdents) is based on the
535 assumption that directory is structured very much like regular file, in
536 particular, it is implied that each name within given directory (directory
537 entry) can be uniquely identified by scalar offset and that such offset is
538 stable across the life-time of the name is identifies.
540 This is manifestly not so for reiser4. In reiser4 the only stable unique
541 identifies for the directory entry is its key that doesn't fit into
542 seekdir/telldir API.
544 solution:
546 Within each file descriptor participating in readdir-ing of directory
547 plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
548 the "current" directory entry that file descriptor looks at. It contains a
549 key of directory entry (plus some additional info to deal with non-unique
550 keys that we wouldn't dwell onto here) and a logical position of this
551 directory entry starting from the beginning of the directory, that is
552 ordinal number of this entry in the readdir order.
554 Obviously this logical position is not stable in the face of directory
555 modifications. To work around this, on each addition or removal of directory
556 entry all file descriptors for directory inode are scanned and their
557 readdir_pos are updated accordingly (adjust_dir_pos()).
559 int reiser4_readdir_common(struct file *f /* directory file being read */,
560 void *dirent /* opaque data passed to us by VFS */,
561 filldir_t filld /* filler function passed to us
562 * by VFS */)
564 reiser4_context *ctx;
565 int result;
566 struct inode *inode;
567 coord_t coord;
568 lock_handle lh;
569 tap_t tap;
570 struct readdir_pos *pos;
572 assert("nikita-1359", f != NULL);
573 inode = f->f_dentry->d_inode;
574 assert("nikita-1360", inode != NULL);
576 if (!S_ISDIR(inode->i_mode))
577 return RETERR(-ENOTDIR);
579 ctx = reiser4_init_context(inode->i_sb);
580 if (IS_ERR(ctx))
581 return PTR_ERR(ctx);
583 coord_init_zero(&coord);
584 init_lh(&lh);
585 reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
587 reiser4_readdir_readahead_init(inode, &tap);
589 repeat:
590 result = dir_readdir_init(f, &tap, &pos);
591 if (result == 0) {
592 result = reiser4_tap_load(&tap);
593 /* scan entries one by one feeding them to @filld */
594 while (result == 0) {
595 coord_t *coord;
597 coord = tap.coord;
598 assert("nikita-2572", coord_is_existing_unit(coord));
599 assert("nikita-3227", is_valid_dir_coord(inode, coord));
601 result = feed_entry(f, pos, &tap, filld, dirent);
602 if (result > 0) {
603 break;
604 } else if (result == 0) {
605 ++f->f_pos;
606 result = go_next_unit(&tap);
607 if (result == -E_NO_NEIGHBOR ||
608 result == -ENOENT) {
609 result = 0;
610 break;
611 } else if (result == 0) {
612 if (is_valid_dir_coord(inode, coord))
613 move_entry(pos, coord);
614 else
615 break;
617 } else if (result == -E_REPEAT) {
618 /* feed_entry() had to restart. */
619 ++f->f_pos;
620 reiser4_tap_relse(&tap);
621 goto repeat;
622 } else
623 warning("vs-1617",
624 "reiser4_readdir_common: unexpected error %d",
625 result);
627 reiser4_tap_relse(&tap);
629 if (result >= 0)
630 f->f_version = inode->i_version;
631 } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
632 result = 0;
633 reiser4_tap_done(&tap);
634 reiser4_detach_fsdata(f);
636 /* try to update directory's atime */
637 if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
638 BA_CAN_COMMIT) != 0)
639 warning("", "failed to update atime on readdir: %llu",
640 get_inode_oid(inode));
641 else
642 file_accessed(f);
644 context_set_commit_async(ctx);
645 reiser4_exit_context(ctx);
647 return (result <= 0) ? result : 0;
651 * Local variables:
652 * c-indentation-style: "K&R"
653 * mode-name: "LC"
654 * c-basic-offset: 8
655 * tab-width: 8
656 * fill-column: 79
657 * End: