* improve similar pkg suggestions and share code in core-functions for all scripts...
[t2sde.git] / package / kernel / linux / restore-reiserfs.patch
blobe23228de16a71f7c39a754e8b219a4362e8b3768
1 # --- T2-COPYRIGHT-BEGIN ---
2 # t2/package/*/linux/restore-reiserfs.patch
3 # Copyright (C) 2025 The T2 SDE Project
4 # SPDX-License-Identifier: GPL-2.0 or patched project license
5 # --- T2-COPYRIGHT-END ---
7 diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
8 index 9ab2a3d6f2b4..92bffcc6747a 100644
9 --- a/Documentation/filesystems/porting.rst
10 +++ b/Documentation/filesystems/porting.rst
11 @@ -177,7 +177,7 @@ settles down a bit.
12 **mandatory**
14 s_export_op is now required for exporting a filesystem.
15 -isofs, ext2, ext3, fat
16 +isofs, ext2, ext3, reiserfs, fat
17 can be used as examples of very different filesystems.
19 ---
20 diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
21 index 243f1f1b554a..e4be1378ba26 100644
22 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
23 +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
24 @@ -375,7 +375,7 @@ Code Seq# Include File Comments
25 0xCB 00-1F CBM serial IEC bus in development:
26 <mailto:michael.klein@puffin.lb.shuttle.de>
27 0xCC 00-0F drivers/misc/ibmvmc.h pseries VMC driver
28 -0xCD 01 linux/reiserfs_fs.h Dead since 6.13
29 +0xCD 01 linux/reiserfs_fs.h
30 0xCE 01-02 uapi/linux/cxl_mem.h Compute Express Link Memory Devices
31 0xCF 02 fs/smb/client/cifs_ioctl.h
32 0xDB 00-0F drivers/char/mwave/mwavepub.h
33 diff --git a/MAINTAINERS b/MAINTAINERS
34 index f90a5a415218..2b3797676c9e 100644
35 --- a/MAINTAINERS
36 +++ b/MAINTAINERS
37 @@ -19605,6 +19605,11 @@ F: Documentation/devicetree/bindings/regmap/
38 F: drivers/base/regmap/
39 F: include/linux/regmap.h
41 +REISERFS FILE SYSTEM
42 +L: reiserfs-devel@vger.kernel.org
43 +S: Obsolete
44 +F: fs/reiserfs/
46 REMOTE PROCESSOR (REMOTEPROC) SUBSYSTEM
47 M: Bjorn Andersson <andersson@kernel.org>
48 M: Mathieu Poirier <mathieu.poirier@linaro.org>
49 diff --git a/arch/alpha/configs/defconfig b/arch/alpha/configs/defconfig
50 index 3280bd9e6578..1816c1dc22b1 100644
51 --- a/arch/alpha/configs/defconfig
52 +++ b/arch/alpha/configs/defconfig
53 @@ -51,6 +51,7 @@ CONFIG_SERIAL_8250_CONSOLE=y
54 CONFIG_RTC_CLASS=y
55 CONFIG_RTC_DRV_CMOS=y
56 CONFIG_EXT2_FS=y
57 +CONFIG_REISERFS_FS=m
58 CONFIG_ISO9660_FS=y
59 CONFIG_MSDOS_FS=y
60 CONFIG_VFAT_FS=y
61 diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
62 index 38916ac4bce4..e1cb170c2bf0 100644
63 --- a/arch/arm/configs/pxa_defconfig
64 +++ b/arch/arm/configs/pxa_defconfig
65 @@ -583,6 +583,10 @@ CONFIG_EXT2_FS_SECURITY=y
66 CONFIG_EXT3_FS=y
67 CONFIG_EXT3_FS_POSIX_ACL=y
68 CONFIG_EXT3_FS_SECURITY=y
69 +CONFIG_REISERFS_FS=m
70 +CONFIG_REISERFS_FS_XATTR=y
71 +CONFIG_REISERFS_FS_POSIX_ACL=y
72 +CONFIG_REISERFS_FS_SECURITY=y
73 CONFIG_XFS_FS=m
74 CONFIG_AUTOFS_FS=m
75 CONFIG_FUSE_FS=m
76 diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
77 index c705247e7b5b..a70aec9a05c4 100644
78 --- a/arch/m68k/configs/amiga_defconfig
79 +++ b/arch/m68k/configs/amiga_defconfig
80 @@ -449,6 +449,7 @@ CONFIG_RTC_DRV_RP5C01=m
81 # CONFIG_IOMMU_SUPPORT is not set
82 CONFIG_DAX=m
83 CONFIG_EXT4_FS=y
84 +CONFIG_REISERFS_FS=m
85 CONFIG_JFS_FS=m
86 CONFIG_OCFS2_FS=m
87 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
88 diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
89 index 6d62b9187a58..312853f3d26a 100644
90 --- a/arch/m68k/configs/apollo_defconfig
91 +++ b/arch/m68k/configs/apollo_defconfig
92 @@ -406,6 +406,7 @@ CONFIG_RTC_DRV_GENERIC=m
93 # CONFIG_IOMMU_SUPPORT is not set
94 CONFIG_DAX=m
95 CONFIG_EXT4_FS=y
96 +CONFIG_REISERFS_FS=m
97 CONFIG_JFS_FS=m
98 CONFIG_OCFS2_FS=m
99 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
100 diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
101 index c3c644df852d..0853e4358de9 100644
102 --- a/arch/m68k/configs/atari_defconfig
103 +++ b/arch/m68k/configs/atari_defconfig
104 @@ -426,6 +426,7 @@ CONFIG_RTC_DRV_GENERIC=m
105 # CONFIG_IOMMU_SUPPORT is not set
106 CONFIG_DAX=m
107 CONFIG_EXT4_FS=y
108 +CONFIG_REISERFS_FS=m
109 CONFIG_JFS_FS=m
110 CONFIG_OCFS2_FS=m
111 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
112 diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
113 index 20261f819691..f738202d1f36 100644
114 --- a/arch/m68k/configs/bvme6000_defconfig
115 +++ b/arch/m68k/configs/bvme6000_defconfig
116 @@ -398,6 +398,7 @@ CONFIG_RTC_DRV_GENERIC=m
117 # CONFIG_IOMMU_SUPPORT is not set
118 CONFIG_DAX=m
119 CONFIG_EXT4_FS=y
120 +CONFIG_REISERFS_FS=m
121 CONFIG_JFS_FS=m
122 CONFIG_OCFS2_FS=m
123 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
124 diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
125 index ce4fe93a0f70..74f74e03ccc9 100644
126 --- a/arch/m68k/configs/hp300_defconfig
127 +++ b/arch/m68k/configs/hp300_defconfig
128 @@ -408,6 +408,7 @@ CONFIG_RTC_DRV_GENERIC=m
129 # CONFIG_IOMMU_SUPPORT is not set
130 CONFIG_DAX=m
131 CONFIG_EXT4_FS=y
132 +CONFIG_REISERFS_FS=m
133 CONFIG_JFS_FS=m
134 CONFIG_OCFS2_FS=m
135 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
136 diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
137 index 040ae75f47c3..14c8f1b374aa 100644
138 --- a/arch/m68k/configs/mac_defconfig
139 +++ b/arch/m68k/configs/mac_defconfig
140 @@ -425,6 +425,7 @@ CONFIG_RTC_DRV_GENERIC=m
141 # CONFIG_IOMMU_SUPPORT is not set
142 CONFIG_DAX=m
143 CONFIG_EXT4_FS=y
144 +CONFIG_REISERFS_FS=m
145 CONFIG_JFS_FS=m
146 CONFIG_OCFS2_FS=m
147 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
148 diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
149 index f8edc9082724..41c8112c6d0d 100644
150 --- a/arch/m68k/configs/multi_defconfig
151 +++ b/arch/m68k/configs/multi_defconfig
152 @@ -511,6 +511,7 @@ CONFIG_RTC_DRV_GENERIC=m
153 # CONFIG_IOMMU_SUPPORT is not set
154 CONFIG_DAX=m
155 CONFIG_EXT4_FS=y
156 +CONFIG_REISERFS_FS=m
157 CONFIG_JFS_FS=m
158 CONFIG_OCFS2_FS=m
159 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
160 diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
161 index 71fc71bb660e..e72d37ee90a7 100644
162 --- a/arch/m68k/configs/mvme147_defconfig
163 +++ b/arch/m68k/configs/mvme147_defconfig
164 @@ -397,6 +397,7 @@ CONFIG_RTC_DRV_GENERIC=m
165 # CONFIG_IOMMU_SUPPORT is not set
166 CONFIG_DAX=m
167 CONFIG_EXT4_FS=y
168 +CONFIG_REISERFS_FS=m
169 CONFIG_JFS_FS=m
170 CONFIG_OCFS2_FS=m
171 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
172 diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
173 index 41072e68028e..733f1fc9a50a 100644
174 --- a/arch/m68k/configs/mvme16x_defconfig
175 +++ b/arch/m68k/configs/mvme16x_defconfig
176 @@ -398,6 +398,7 @@ CONFIG_RTC_DRV_GENERIC=m
177 # CONFIG_IOMMU_SUPPORT is not set
178 CONFIG_DAX=m
179 CONFIG_EXT4_FS=y
180 +CONFIG_REISERFS_FS=m
181 CONFIG_JFS_FS=m
182 CONFIG_OCFS2_FS=m
183 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
184 diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
185 index e4c30e2b9bbb..3efe25435561 100644
186 --- a/arch/m68k/configs/q40_defconfig
187 +++ b/arch/m68k/configs/q40_defconfig
188 @@ -415,6 +415,7 @@ CONFIG_RTC_DRV_GENERIC=m
189 # CONFIG_IOMMU_SUPPORT is not set
190 CONFIG_DAX=m
191 CONFIG_EXT4_FS=y
192 +CONFIG_REISERFS_FS=m
193 CONFIG_JFS_FS=m
194 CONFIG_OCFS2_FS=m
195 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
196 diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
197 index 980843a9ea1e..1b8ea0e7acb4 100644
198 --- a/arch/m68k/configs/sun3_defconfig
199 +++ b/arch/m68k/configs/sun3_defconfig
200 @@ -396,6 +396,7 @@ CONFIG_RTC_DRV_GENERIC=m
201 # CONFIG_IOMMU_SUPPORT is not set
202 CONFIG_DAX=m
203 CONFIG_EXT4_FS=y
204 +CONFIG_REISERFS_FS=m
205 CONFIG_JFS_FS=m
206 CONFIG_OCFS2_FS=m
207 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
208 diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
209 index 38681cc6b598..5bda93f6a200 100644
210 --- a/arch/m68k/configs/sun3x_defconfig
211 +++ b/arch/m68k/configs/sun3x_defconfig
212 @@ -396,6 +396,7 @@ CONFIG_RTC_DRV_GENERIC=m
213 # CONFIG_IOMMU_SUPPORT is not set
214 CONFIG_DAX=m
215 CONFIG_EXT4_FS=y
216 +CONFIG_REISERFS_FS=m
217 CONFIG_JFS_FS=m
218 CONFIG_OCFS2_FS=m
219 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
220 diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig
221 index d871623955c5..0311380160f4 100644
222 --- a/arch/sh/configs/landisk_defconfig
223 +++ b/arch/sh/configs/landisk_defconfig
224 @@ -95,6 +95,7 @@ CONFIG_USB_SISUSBVGA=m
225 CONFIG_EXT2_FS=y
226 CONFIG_EXT3_FS=y
227 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
228 +CONFIG_REISERFS_FS=y
229 CONFIG_ISO9660_FS=m
230 CONFIG_MSDOS_FS=y
231 CONFIG_VFAT_FS=y
232 diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig
233 index 99bc0e889287..c1032559ecd4 100644
234 --- a/arch/sh/configs/titan_defconfig
235 +++ b/arch/sh/configs/titan_defconfig
236 @@ -220,6 +220,7 @@ CONFIG_EXT2_FS=y
237 CONFIG_EXT3_FS=y
238 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
239 # CONFIG_EXT3_FS_XATTR is not set
240 +CONFIG_REISERFS_FS=m
241 CONFIG_XFS_FS=m
242 CONFIG_FUSE_FS=m
243 CONFIG_ISO9660_FS=m
244 diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig
245 index 9c9c77f1255a..e543cbac8792 100644
246 --- a/arch/um/configs/i386_defconfig
247 +++ b/arch/um/configs/i386_defconfig
248 @@ -61,6 +61,7 @@ CONFIG_UML_NET_DAEMON=y
249 CONFIG_UML_NET_MCAST=y
250 CONFIG_UML_NET_SLIRP=y
251 CONFIG_EXT4_FS=y
252 +CONFIG_REISERFS_FS=y
253 CONFIG_QUOTA=y
254 CONFIG_AUTOFS_FS=m
255 CONFIG_ISO9660_FS=m
256 diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig
257 index 03b10d3f6816..939cb12318ca 100644
258 --- a/arch/um/configs/x86_64_defconfig
259 +++ b/arch/um/configs/x86_64_defconfig
260 @@ -59,6 +59,7 @@ CONFIG_UML_NET_DAEMON=y
261 CONFIG_UML_NET_MCAST=y
262 CONFIG_UML_NET_SLIRP=y
263 CONFIG_EXT4_FS=y
264 +CONFIG_REISERFS_FS=y
265 CONFIG_QUOTA=y
266 CONFIG_AUTOFS_FS=m
267 CONFIG_ISO9660_FS=m
268 diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
269 index a97f2c40c640..ed209f4f2798 100644
270 --- a/drivers/block/Kconfig
271 +++ b/drivers/block/Kconfig
272 @@ -130,7 +130,7 @@ config BLK_DEV_UBD_SYNC
273 kernel command line option. Alternatively, you can say Y here to
274 turn on synchronous operation by default for all block devices.
276 - If you're running a journalling file system (like xfs, for
277 + If you're running a journalling file system (like reiserfs, for
278 example) in your virtual machine, you will want to say Y here. If
279 you care for the safety of the data in your virtual machine, Y is a
280 wise choice too. In all other cases (for example, if you're just
281 diff --git a/fs/Kconfig b/fs/Kconfig
282 index 64d420e3c475..aae170fc2795 100644
283 --- a/fs/Kconfig
284 +++ b/fs/Kconfig
285 @@ -43,6 +43,7 @@ config FS_MBCACHE
286 default y if EXT4_FS=y
287 default m if EXT2_FS_XATTR || EXT4_FS
289 +source "fs/reiserfs/Kconfig"
290 source "fs/jfs/Kconfig"
292 source "fs/xfs/Kconfig"
293 diff --git a/fs/Makefile b/fs/Makefile
294 index 15df0a923d3a..61679fd587b7 100644
295 --- a/fs/Makefile
296 +++ b/fs/Makefile
297 @@ -61,6 +61,7 @@ obj-$(CONFIG_DLM) += dlm/
299 # Do not add any filesystems before this line
300 obj-$(CONFIG_NETFS_SUPPORT) += netfs/
301 +obj-$(CONFIG_REISERFS_FS) += reiserfs/
302 obj-$(CONFIG_EXT4_FS) += ext4/
303 # We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
304 # ext2 driver, which doesn't know about journalling! Explicitly request ext2
305 diff --git a/fs/buffer.c b/fs/buffer.c
306 index b158cb7a5038..bb4a31b9559d 100644
307 --- a/fs/buffer.c
308 +++ b/fs/buffer.c
309 @@ -855,7 +855,8 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
310 * done a sync(). Just drop the buffers from the inode list.
312 * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which
313 - * assumes that all the buffers are against the blockdev.
314 + * assumes that all the buffers are against the blockdev. Not true
315 + * for reiserfs.
317 void invalidate_inode_buffers(struct inode *inode)
319 diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
320 index 818083a36bef..4c925e55dbcd 100644
321 --- a/fs/quota/Kconfig
322 +++ b/fs/quota/Kconfig
323 @@ -9,13 +9,14 @@ config QUOTA
324 help
325 If you say Y here, you will be able to set per user limits for disk
326 usage (also called disk quotas). Currently, it works for the
327 - ext2, ext3, ext4, f2fs, jfs and ocfs2 file systems. Note that gfs2
328 - and xfs use their own quota system. Ext3 and ext4 also support
329 - journaled quotas for which you don't need to run quotacheck(8) after
330 - an unclean shutdown. For further details, read the Quota mini-HOWTO,
331 - available from <https://www.tldp.org/docs.html#howto>, or the
332 - documentation provided with the quota tools. Probably the quota
333 - support is only useful for multi user systems. If unsure, say N.
334 + ext2, ext3, ext4, f2fs, jfs, ocfs2 and reiserfs file systems.
335 + Note that gfs2 and xfs use their own quota system.
336 + Ext3, ext4 and reiserfs also support journaled quotas for which
337 + you don't need to run quotacheck(8) after an unclean shutdown.
338 + For further details, read the Quota mini-HOWTO, available from
339 + <https://www.tldp.org/docs.html#howto>, or the documentation provided
340 + with the quota tools. Probably the quota support is only useful for
341 + multi user systems. If unsure, say N.
343 config QUOTA_NETLINK_INTERFACE
344 bool "Report quota messages through netlink interface"
345 diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
346 new file mode 100644
347 index 000000000000..0e6fe26458fe
348 --- /dev/null
349 +++ b/fs/reiserfs/Kconfig
350 @@ -0,0 +1,91 @@
351 +# SPDX-License-Identifier: GPL-2.0-only
352 +config REISERFS_FS
353 + tristate "Reiserfs support (deprecated)"
354 + select BUFFER_HEAD
355 + select CRC32
356 + select LEGACY_DIRECT_IO
357 + help
358 + Reiserfs is deprecated and scheduled to be removed from the kernel
359 + in 2025. If you are still using it, please migrate to another
360 + filesystem or tell us your usecase for reiserfs.
362 + Reiserfs stores not just filenames but the files themselves in a
363 + balanced tree. Uses journalling.
365 + Balanced trees are more efficient than traditional file system
366 + architectural foundations.
368 + In general, ReiserFS is as fast as ext2, but is very efficient with
369 + large directories and small files. Additional patches are needed
370 + for NFS and quotas, please see
371 + <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
373 + It is more easily extended to have features currently found in
374 + database and keyword search systems than block allocation based file
375 + systems are. The next version will be so extended, and will support
376 + plugins consistent with our motto ``It takes more than a license to
377 + make source code open.''
379 + Read <https://reiser4.wiki.kernel.org/index.php/Main_Page>
380 + to learn more about reiserfs.
382 + Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
384 + If you like it, you can pay us to add new features to it that you
385 + need, buy a support contract, or pay us to port it to another OS.
387 +config REISERFS_CHECK
388 + bool "Enable reiserfs debug mode"
389 + depends on REISERFS_FS
390 + help
391 + If you set this to Y, then ReiserFS will perform every check it can
392 + possibly imagine of its internal consistency throughout its
393 + operation. It will also go substantially slower. More than once we
394 + have forgotten that this was on, and then gone despondent over the
395 + latest benchmarks.:-) Use of this option allows our team to go all
396 + out in checking for consistency when debugging without fear of its
397 + effect on end users. If you are on the verge of sending in a bug
398 + report, say Y and you might get a useful error message. Almost
399 + everyone should say N.
401 +config REISERFS_PROC_INFO
402 + bool "Stats in /proc/fs/reiserfs"
403 + depends on REISERFS_FS && PROC_FS
404 + help
405 + Create under /proc/fs/reiserfs a hierarchy of files, displaying
406 + various ReiserFS statistics and internal data at the expense of
407 + making your kernel or module slightly larger (+8 KB). This also
408 + increases the amount of kernel memory required for each mount.
409 + Almost everyone but ReiserFS developers and people fine-tuning
410 + reiserfs or tracing problems should say N.
412 +config REISERFS_FS_XATTR
413 + bool "ReiserFS extended attributes"
414 + depends on REISERFS_FS
415 + help
416 + Extended attributes are name:value pairs associated with inodes by
417 + the kernel or by users (see the attr(5) manual page for details).
419 + If unsure, say N.
421 +config REISERFS_FS_POSIX_ACL
422 + bool "ReiserFS POSIX Access Control Lists"
423 + depends on REISERFS_FS_XATTR
424 + select FS_POSIX_ACL
425 + help
426 + Posix Access Control Lists (ACLs) support permissions for users and
427 + groups beyond the owner/group/world scheme.
429 + If you don't know what Access Control Lists are, say N
431 +config REISERFS_FS_SECURITY
432 + bool "ReiserFS Security Labels"
433 + depends on REISERFS_FS_XATTR
434 + help
435 + Security labels support alternative access control models
436 + implemented by security modules like SELinux. This option
437 + enables an extended attribute handler for file security
438 + labels in the ReiserFS filesystem.
440 + If you are not using a security module that requires using
441 + extended attributes for file security labels, say N.
442 diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
443 new file mode 100644
444 index 000000000000..bd29c58ccbd8
445 --- /dev/null
446 +++ b/fs/reiserfs/Makefile
447 @@ -0,0 +1,30 @@
448 +# SPDX-License-Identifier: GPL-2.0
450 +# Makefile for the linux reiser-filesystem routines.
453 +obj-$(CONFIG_REISERFS_FS) += reiserfs.o
455 +reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
456 + super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
457 + hashes.o tail_conversion.o journal.o resize.o \
458 + item_ops.o ioctl.o xattr.o lock.o
460 +ifeq ($(CONFIG_REISERFS_PROC_INFO),y)
461 +reiserfs-objs += procfs.o
462 +endif
464 +ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
465 +reiserfs-objs += xattr_user.o xattr_trusted.o
466 +endif
468 +ifeq ($(CONFIG_REISERFS_FS_SECURITY),y)
469 +reiserfs-objs += xattr_security.o
470 +endif
472 +ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y)
473 +reiserfs-objs += xattr_acl.o
474 +endif
476 +TAGS:
477 + etags *.c
478 diff --git a/fs/reiserfs/README b/fs/reiserfs/README
479 new file mode 100644
480 index 000000000000..11e9ecf24b63
481 --- /dev/null
482 +++ b/fs/reiserfs/README
483 @@ -0,0 +1,151 @@
484 +[LICENSING]
486 +ReiserFS is hereby licensed under the GNU General
487 +Public License version 2.
489 +Source code files that contain the phrase "licensing governed by
490 +reiserfs/README" are "governed files" throughout this file. Governed
491 +files are licensed under the GPL. The portions of them owned by Hans
492 +Reiser, or authorized to be licensed by him, have been in the past,
493 +and likely will be in the future, licensed to other parties under
494 +other licenses. If you add your code to governed files, and don't
495 +want it to be owned by Hans Reiser, put your copyright label on that
496 +code so the poor blight and his customers can keep things straight.
497 +All portions of governed files not labeled otherwise are owned by Hans
498 +Reiser, and by adding your code to it, widely distributing it to
499 +others or sending us a patch, and leaving the sentence in stating that
500 +licensing is governed by the statement in this file, you accept this.
501 +It will be a kindness if you identify whether Hans Reiser is allowed
502 +to license code labeled as owned by you on your behalf other than
503 +under the GPL, because he wants to know if it is okay to do so and put
504 +a check in the mail to you (for non-trivial improvements) when he
505 +makes his next sale. He makes no guarantees as to the amount if any,
506 +though he feels motivated to motivate contributors, and you can surely
507 +discuss this with him before or after contributing. You have the
508 +right to decline to allow him to license your code contribution other
509 +than under the GPL.
511 +Further licensing options are available for commercial and/or other
512 +interests directly from Hans Reiser: hans@reiser.to. If you interpret
513 +the GPL as not allowing those additional licensing options, you read
514 +it wrongly, and Richard Stallman agrees with me, when carefully read
515 +you can see that those restrictions on additional terms do not apply
516 +to the owner of the copyright, and my interpretation of this shall
517 +govern for this license.
519 +Finally, nothing in this license shall be interpreted to allow you to
520 +fail to fairly credit me, or to remove my credits, without my
521 +permission, unless you are an end user not redistributing to others.
522 +If you have doubts about how to properly do that, or about what is
523 +fair, ask. (Last I spoke with him Richard was contemplating how best
524 +to address the fair crediting issue in the next GPL version.)
526 +[END LICENSING]
528 +Reiserfs is a file system based on balanced tree algorithms, which is
529 +described at https://reiser4.wiki.kernel.org/index.php/Main_Page
531 +Stop reading here. Go there, then return.
533 +Send bug reports to yura@namesys.botik.ru.
535 +mkreiserfs and other utilities are in reiserfs/utils, or wherever your
536 +Linux provider put them. There is some disagreement about how useful
537 +it is for users to get their fsck and mkreiserfs out of sync with the
538 +version of reiserfs that is in their kernel, with many important
539 +distributors wanting them out of sync.:-) Please try to remember to
540 +recompile and reinstall fsck and mkreiserfs with every update of
541 +reiserfs, this is a common source of confusion. Note that some of the
542 +utilities cannot be compiled without accessing the balancing code
543 +which is in the kernel code, and relocating the utilities may require
544 +you to specify where that code can be found.
546 +Yes, if you update your reiserfs kernel module you do have to
547 +recompile your kernel, most of the time. The errors you get will be
548 +quite cryptic if your forget to do so.
550 +Real users, as opposed to folks who want to hack and then understand
551 +what went wrong, will want REISERFS_CHECK off.
553 +Hideous Commercial Pitch: Spread your development costs across other OS
554 +vendors. Select from the best in the world, not the best in your
555 +building, by buying from third party OS component suppliers. Leverage
556 +the software component development power of the internet. Be the most
557 +aggressive in taking advantage of the commercial possibilities of
558 +decentralized internet development, and add value through your branded
559 +integration that you sell as an operating system. Let your competitors
560 +be the ones to compete against the entire internet by themselves. Be
561 +hip, get with the new economic trend, before your competitors do. Send
562 +email to hans@reiser.to.
564 +To understand the code, after reading the website, start reading the
565 +code by reading reiserfs_fs.h first.
567 +Hans Reiser was the project initiator, primary architect, source of all
568 +funding for the first 5.5 years, and one of the programmers. He owns
569 +the copyright.
571 +Vladimir Saveljev was one of the programmers, and he worked long hours
572 +writing the cleanest code. He always made the effort to be the best he
573 +could be, and to make his code the best that it could be. What resulted
574 +was quite remarkable. I don't think that money can ever motivate someone
575 +to work the way he did, he is one of the most selfless men I know.
577 +Yura helps with benchmarking, coding hashes, and block pre-allocation
578 +code.
580 +Anatoly Pinchuk is a former member of our team who worked closely with
581 +Vladimir throughout the project's development. He wrote a quite
582 +substantial portion of the total code. He realized that there was a
583 +space problem with packing tails of files for files larger than a node
584 +that start on a node aligned boundary (there are reasons to want to node
585 +align files), and he invented and implemented indirect items and
586 +unformatted nodes as the solution.
588 +Konstantin Shvachko was taking part in the early days.
590 +Mikhail Gilula was a brilliant innovator that has shown much generosity.
592 +Grigory Zaigralin was an extremely effective system administrator for
593 +our group.
595 +Igor Krasheninnikov was wonderful at hardware procurement, repair, and
596 +network installation.
598 +Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a
599 +textbook he got the algorithm from in the code. Note that his analysis
600 +of how we could use the hashing code in making 32 bit NFS cookies work
601 +was probably more important than the actual algorithm. Colin Plumb also
602 +contributed to it.
604 +Chris Mason dived right into our code, and in just a few months produced
605 +the journaling code that dramatically increased the value of ReiserFS.
606 +He is just an amazing programmer.
608 +Igor Zagorovsky is writing much of the new item handler and extent code
609 +for our next major release.
611 +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
612 +resizer, and is hard at work on implementing allocate on flush. SGI
613 +implemented allocate on flush before us for XFS, and generously took
614 +the time to convince me we should do it also. They are great people,
615 +and a great company.
617 +Yuri Shevchuk and Nikita Danilov are doing squid cache optimization.
619 +Vitaly Fertman is doing fsck.
621 +Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably
622 +the endian safe patches which allow ReiserFS to run on any platform
623 +supported by the Linux kernel.
625 +SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the
626 +Alpha PC Company made it possible for me to not have a day job
627 +anymore, and to dramatically increase our staffing. Ecila funded
628 +hypertext feature development, MP3.com funded journaling, SuSE funded
629 +core development, IntegratedLinux.com funded squid web cache
630 +appliances, bigstorage.com funded HSM, and the alpha PC company funded
631 +the alpha port. Many of these tasks were helped by sponsors other
632 +than the ones just named. SuSE has helped in much more than just
633 +funding....
635 diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
636 new file mode 100644
637 index 000000000000..2571b1a8be84
638 --- /dev/null
639 +++ b/fs/reiserfs/acl.h
640 @@ -0,0 +1,78 @@
641 +/* SPDX-License-Identifier: GPL-2.0 */
642 +#include <linux/init.h>
643 +#include <linux/posix_acl.h>
645 +#define REISERFS_ACL_VERSION 0x0001
647 +typedef struct {
648 + __le16 e_tag;
649 + __le16 e_perm;
650 + __le32 e_id;
651 +} reiserfs_acl_entry;
653 +typedef struct {
654 + __le16 e_tag;
655 + __le16 e_perm;
656 +} reiserfs_acl_entry_short;
658 +typedef struct {
659 + __le32 a_version;
660 +} reiserfs_acl_header;
662 +static inline size_t reiserfs_acl_size(int count)
664 + if (count <= 4) {
665 + return sizeof(reiserfs_acl_header) +
666 + count * sizeof(reiserfs_acl_entry_short);
667 + } else {
668 + return sizeof(reiserfs_acl_header) +
669 + 4 * sizeof(reiserfs_acl_entry_short) +
670 + (count - 4) * sizeof(reiserfs_acl_entry);
674 +static inline int reiserfs_acl_count(size_t size)
676 + ssize_t s;
677 + size -= sizeof(reiserfs_acl_header);
678 + s = size - 4 * sizeof(reiserfs_acl_entry_short);
679 + if (s < 0) {
680 + if (size % sizeof(reiserfs_acl_entry_short))
681 + return -1;
682 + return size / sizeof(reiserfs_acl_entry_short);
683 + } else {
684 + if (s % sizeof(reiserfs_acl_entry))
685 + return -1;
686 + return s / sizeof(reiserfs_acl_entry) + 4;
690 +#ifdef CONFIG_REISERFS_FS_POSIX_ACL
691 +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu);
692 +int reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
693 + struct posix_acl *acl, int type);
694 +int reiserfs_acl_chmod(struct dentry *dentry);
695 +int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
696 + struct inode *dir, struct dentry *dentry,
697 + struct inode *inode);
698 +int reiserfs_cache_default_acl(struct inode *dir);
700 +#else
702 +#define reiserfs_cache_default_acl(inode) 0
703 +#define reiserfs_get_acl NULL
704 +#define reiserfs_set_acl NULL
706 +static inline int reiserfs_acl_chmod(struct dentry *dentry)
708 + return 0;
711 +static inline int
712 +reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
713 + const struct inode *dir, struct dentry *dentry,
714 + struct inode *inode)
716 + return 0;
718 +#endif
719 diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
720 new file mode 100644
721 index 000000000000..bf708ac287b4
722 --- /dev/null
723 +++ b/fs/reiserfs/bitmap.c
724 @@ -0,0 +1,1476 @@
726 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
727 + */
728 +/* Reiserfs block (de)allocator, bitmap-based. */
730 +#include <linux/time.h>
731 +#include "reiserfs.h"
732 +#include <linux/errno.h>
733 +#include <linux/buffer_head.h>
734 +#include <linux/kernel.h>
735 +#include <linux/pagemap.h>
736 +#include <linux/vmalloc.h>
737 +#include <linux/quotaops.h>
738 +#include <linux/seq_file.h>
740 +#define PREALLOCATION_SIZE 9
742 +/* different reiserfs block allocator options */
744 +#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits)
746 +#define _ALLOC_concentrating_formatted_nodes 0
747 +#define _ALLOC_displacing_large_files 1
748 +#define _ALLOC_displacing_new_packing_localities 2
749 +#define _ALLOC_old_hashed_relocation 3
750 +#define _ALLOC_new_hashed_relocation 4
751 +#define _ALLOC_skip_busy 5
752 +#define _ALLOC_displace_based_on_dirid 6
753 +#define _ALLOC_hashed_formatted_nodes 7
754 +#define _ALLOC_old_way 8
755 +#define _ALLOC_hundredth_slices 9
756 +#define _ALLOC_dirid_groups 10
757 +#define _ALLOC_oid_groups 11
758 +#define _ALLOC_packing_groups 12
760 +#define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
761 +#define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
762 +#define displacing_new_packing_localities(s) test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s))
764 +#define SET_OPTION(optname) \
765 + do { \
766 + reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \
767 + set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \
768 + } while(0)
769 +#define TEST_OPTION(optname, s) \
770 + test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s))
772 +static inline void get_bit_address(struct super_block *s,
773 + b_blocknr_t block,
774 + unsigned int *bmap_nr,
775 + unsigned int *offset)
777 + /*
778 + * It is in the bitmap block number equal to the block
779 + * number divided by the number of bits in a block.
780 + */
781 + *bmap_nr = block >> (s->s_blocksize_bits + 3);
782 + /* Within that bitmap block it is located at bit offset *offset. */
783 + *offset = block & ((s->s_blocksize << 3) - 1);
786 +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
788 + unsigned int bmap, offset;
789 + unsigned int bmap_count = reiserfs_bmap_count(s);
791 + if (block == 0 || block >= SB_BLOCK_COUNT(s)) {
792 + reiserfs_error(s, "vs-4010",
793 + "block number is out of range %lu (%u)",
794 + block, SB_BLOCK_COUNT(s));
795 + return 0;
798 + get_bit_address(s, block, &bmap, &offset);
800 + /*
801 + * Old format filesystem? Unlikely, but the bitmaps are all
802 + * up front so we need to account for it.
803 + */
804 + if (unlikely(test_bit(REISERFS_OLD_FORMAT,
805 + &REISERFS_SB(s)->s_properties))) {
806 + b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1;
807 + if (block >= bmap1 &&
808 + block <= bmap1 + bmap_count) {
809 + reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) "
810 + "can't be freed or reused",
811 + block, bmap_count);
812 + return 0;
814 + } else {
815 + if (offset == 0) {
816 + reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) "
817 + "can't be freed or reused",
818 + block, bmap_count);
819 + return 0;
823 + if (bmap >= bmap_count) {
824 + reiserfs_error(s, "vs-4030", "bitmap for requested block "
825 + "is out of range: block=%lu, bitmap_nr=%u",
826 + block, bmap);
827 + return 0;
830 + if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) {
831 + reiserfs_error(s, "vs-4050", "this is root block (%u), "
832 + "it must be busy", SB_ROOT_BLOCK(s));
833 + return 0;
836 + return 1;
840 + * Searches in journal structures for a given block number (bmap, off).
841 + * If block is found in reiserfs journal it suggests next free block
842 + * candidate to test.
843 + */
844 +static inline int is_block_in_journal(struct super_block *s, unsigned int bmap,
845 + int off, int *next)
847 + b_blocknr_t tmp;
849 + if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) {
850 + if (tmp) { /* hint supplied */
851 + *next = tmp;
852 + PROC_INFO_INC(s, scan_bitmap.in_journal_hint);
853 + } else {
854 + (*next) = off + 1; /* inc offset to avoid looping. */
855 + PROC_INFO_INC(s, scan_bitmap.in_journal_nohint);
857 + PROC_INFO_INC(s, scan_bitmap.retry);
858 + return 1;
860 + return 0;
864 + * Searches for a window of zero bits with given minimum and maximum
865 + * lengths in one bitmap block
866 + */
867 +static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
868 + unsigned int bmap_n, int *beg, int boundary,
869 + int min, int max, int unfm)
871 + struct super_block *s = th->t_super;
872 + struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n];
873 + struct buffer_head *bh;
874 + int end, next;
875 + int org = *beg;
877 + BUG_ON(!th->t_trans_id);
878 + RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of "
879 + "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1);
880 + PROC_INFO_INC(s, scan_bitmap.bmap);
882 + if (!bi) {
883 + reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer "
884 + "for bitmap %d", bmap_n);
885 + return 0;
888 + bh = reiserfs_read_bitmap_block(s, bmap_n);
889 + if (bh == NULL)
890 + return 0;
892 + while (1) {
893 +cont:
894 + if (bi->free_count < min) {
895 + brelse(bh);
896 + return 0; /* No free blocks in this bitmap */
899 + /* search for a first zero bit -- beginning of a window */
900 + *beg = reiserfs_find_next_zero_le_bit
901 + ((unsigned long *)(bh->b_data), boundary, *beg);
903 + /*
904 + * search for a zero bit fails or the rest of bitmap block
905 + * cannot contain a zero window of minimum size
906 + */
907 + if (*beg + min > boundary) {
908 + brelse(bh);
909 + return 0;
912 + if (unfm && is_block_in_journal(s, bmap_n, *beg, beg))
913 + continue;
914 + /* first zero bit found; we check next bits */
915 + for (end = *beg + 1;; end++) {
916 + if (end >= *beg + max || end >= boundary
917 + || reiserfs_test_le_bit(end, bh->b_data)) {
918 + next = end;
919 + break;
922 + /*
923 + * finding the other end of zero bit window requires
924 + * looking into journal structures (in case of
925 + * searching for free blocks for unformatted nodes)
926 + */
927 + if (unfm && is_block_in_journal(s, bmap_n, end, &next))
928 + break;
931 + /*
932 + * now (*beg) points to beginning of zero bits window,
933 + * (end) points to one bit after the window end
934 + */
936 + /* found window of proper size */
937 + if (end - *beg >= min) {
938 + int i;
939 + reiserfs_prepare_for_journal(s, bh, 1);
940 + /*
941 + * try to set all blocks used checking are
942 + * they still free
943 + */
944 + for (i = *beg; i < end; i++) {
945 + /* Don't check in journal again. */
946 + if (reiserfs_test_and_set_le_bit
947 + (i, bh->b_data)) {
948 + /*
949 + * bit was set by another process while
950 + * we slept in prepare_for_journal()
951 + */
952 + PROC_INFO_INC(s, scan_bitmap.stolen);
954 + /*
955 + * we can continue with smaller set
956 + * of allocated blocks, if length of
957 + * this set is more or equal to `min'
958 + */
959 + if (i >= *beg + min) {
960 + end = i;
961 + break;
964 + /*
965 + * otherwise we clear all bit
966 + * were set ...
967 + */
968 + while (--i >= *beg)
969 + reiserfs_clear_le_bit
970 + (i, bh->b_data);
971 + reiserfs_restore_prepared_buffer(s, bh);
972 + *beg = org;
974 + /*
975 + * Search again in current block
976 + * from beginning
977 + */
978 + goto cont;
981 + bi->free_count -= (end - *beg);
982 + journal_mark_dirty(th, bh);
983 + brelse(bh);
985 + /* free block count calculation */
986 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
987 + 1);
988 + PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg));
989 + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
991 + return end - (*beg);
992 + } else {
993 + *beg = next;
998 +static int bmap_hash_id(struct super_block *s, u32 id)
1000 + char *hash_in = NULL;
1001 + unsigned long hash;
1002 + unsigned bm;
1004 + if (id <= 2) {
1005 + bm = 1;
1006 + } else {
1007 + hash_in = (char *)(&id);
1008 + hash = keyed_hash(hash_in, 4);
1009 + bm = hash % reiserfs_bmap_count(s);
1010 + if (!bm)
1011 + bm = 1;
1013 + /* this can only be true when SB_BMAP_NR = 1 */
1014 + if (bm >= reiserfs_bmap_count(s))
1015 + bm = 0;
1016 + return bm;
1020 + * hashes the id and then returns > 0 if the block group for the
1021 + * corresponding hash is full
1022 + */
1023 +static inline int block_group_used(struct super_block *s, u32 id)
1025 + int bm = bmap_hash_id(s, id);
1026 + struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm];
1028 + /*
1029 + * If we don't have cached information on this bitmap block, we're
1030 + * going to have to load it later anyway. Loading it here allows us
1031 + * to make a better decision. This favors long-term performance gain
1032 + * with a better on-disk layout vs. a short term gain of skipping the
1033 + * read and potentially having a bad placement.
1034 + */
1035 + if (info->free_count == UINT_MAX) {
1036 + struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm);
1037 + brelse(bh);
1040 + if (info->free_count > ((s->s_blocksize << 3) * 60 / 100)) {
1041 + return 0;
1043 + return 1;
1047 + * the packing is returned in disk byte order
1048 + */
1049 +__le32 reiserfs_choose_packing(struct inode * dir)
1051 + __le32 packing;
1052 + if (TEST_OPTION(packing_groups, dir->i_sb)) {
1053 + u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id);
1054 + /*
1055 + * some versions of reiserfsck expect packing locality 1 to be
1056 + * special
1057 + */
1058 + if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir))
1059 + packing = INODE_PKEY(dir)->k_objectid;
1060 + else
1061 + packing = INODE_PKEY(dir)->k_dir_id;
1062 + } else
1063 + packing = INODE_PKEY(dir)->k_objectid;
1064 + return packing;
1068 + * Tries to find contiguous zero bit window (given size) in given region of
1069 + * bitmap and place new blocks there. Returns number of allocated blocks.
1070 + */
1071 +static int scan_bitmap(struct reiserfs_transaction_handle *th,
1072 + b_blocknr_t * start, b_blocknr_t finish,
1073 + int min, int max, int unfm, sector_t file_block)
1075 + int nr_allocated = 0;
1076 + struct super_block *s = th->t_super;
1077 + unsigned int bm, off;
1078 + unsigned int end_bm, end_off;
1079 + unsigned int off_max = s->s_blocksize << 3;
1081 + BUG_ON(!th->t_trans_id);
1082 + PROC_INFO_INC(s, scan_bitmap.call);
1084 + /* No point in looking for more free blocks */
1085 + if (SB_FREE_BLOCKS(s) <= 0)
1086 + return 0;
1088 + get_bit_address(s, *start, &bm, &off);
1089 + get_bit_address(s, finish, &end_bm, &end_off);
1090 + if (bm > reiserfs_bmap_count(s))
1091 + return 0;
1092 + if (end_bm > reiserfs_bmap_count(s))
1093 + end_bm = reiserfs_bmap_count(s);
1095 + /*
1096 + * When the bitmap is more than 10% free, anyone can allocate.
1097 + * When it's less than 10% free, only files that already use the
1098 + * bitmap are allowed. Once we pass 80% full, this restriction
1099 + * is lifted.
1101 + * We do this so that files that grow later still have space close to
1102 + * their original allocation. This improves locality, and presumably
1103 + * performance as a result.
1105 + * This is only an allocation policy and does not make up for getting a
1106 + * bad hint. Decent hinting must be implemented for this to work well.
1107 + */
1108 + if (TEST_OPTION(skip_busy, s)
1109 + && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) {
1110 + for (; bm < end_bm; bm++, off = 0) {
1111 + if ((off && (!unfm || (file_block != 0)))
1112 + || SB_AP_BITMAP(s)[bm].free_count >
1113 + (s->s_blocksize << 3) / 10)
1114 + nr_allocated =
1115 + scan_bitmap_block(th, bm, &off, off_max,
1116 + min, max, unfm);
1117 + if (nr_allocated)
1118 + goto ret;
1120 + /* we know from above that start is a reasonable number */
1121 + get_bit_address(s, *start, &bm, &off);
1124 + for (; bm < end_bm; bm++, off = 0) {
1125 + nr_allocated =
1126 + scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
1127 + if (nr_allocated)
1128 + goto ret;
1131 + nr_allocated =
1132 + scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm);
1134 +ret:
1135 + *start = bm * off_max + off;
1136 + return nr_allocated;
1140 +static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
1141 + struct inode *inode, b_blocknr_t block,
1142 + int for_unformatted)
1144 + struct super_block *s = th->t_super;
1145 + struct reiserfs_super_block *rs;
1146 + struct buffer_head *sbh, *bmbh;
1147 + struct reiserfs_bitmap_info *apbi;
1148 + unsigned int nr, offset;
1150 + BUG_ON(!th->t_trans_id);
1151 + PROC_INFO_INC(s, free_block);
1152 + rs = SB_DISK_SUPER_BLOCK(s);
1153 + sbh = SB_BUFFER_WITH_SB(s);
1154 + apbi = SB_AP_BITMAP(s);
1156 + get_bit_address(s, block, &nr, &offset);
1158 + if (nr >= reiserfs_bmap_count(s)) {
1159 + reiserfs_error(s, "vs-4075", "block %lu is out of range",
1160 + block);
1161 + return;
1164 + bmbh = reiserfs_read_bitmap_block(s, nr);
1165 + if (!bmbh)
1166 + return;
1168 + reiserfs_prepare_for_journal(s, bmbh, 1);
1170 + /* clear bit for the given block in bit map */
1171 + if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) {
1172 + reiserfs_error(s, "vs-4080",
1173 + "block %lu: bit already cleared", block);
1175 + apbi[nr].free_count++;
1176 + journal_mark_dirty(th, bmbh);
1177 + brelse(bmbh);
1179 + reiserfs_prepare_for_journal(s, sbh, 1);
1180 + /* update super block */
1181 + set_sb_free_blocks(rs, sb_free_blocks(rs) + 1);
1183 + journal_mark_dirty(th, sbh);
1184 + if (for_unformatted) {
1185 + int depth = reiserfs_write_unlock_nested(s);
1186 + dquot_free_block_nodirty(inode, 1);
1187 + reiserfs_write_lock_nested(s, depth);
1191 +void reiserfs_free_block(struct reiserfs_transaction_handle *th,
1192 + struct inode *inode, b_blocknr_t block,
1193 + int for_unformatted)
1195 + struct super_block *s = th->t_super;
1197 + BUG_ON(!th->t_trans_id);
1198 + RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
1199 + if (!is_reusable(s, block, 1))
1200 + return;
1202 + if (block > sb_block_count(REISERFS_SB(s)->s_rs)) {
1203 + reiserfs_error(th->t_super, "bitmap-4072",
1204 + "Trying to free block outside file system "
1205 + "boundaries (%lu > %lu)",
1206 + block, sb_block_count(REISERFS_SB(s)->s_rs));
1207 + return;
1209 + /* mark it before we clear it, just in case */
1210 + journal_mark_freed(th, s, block);
1211 + _reiserfs_free_block(th, inode, block, for_unformatted);
1214 +/* preallocated blocks don't need to be run through journal_mark_freed */
1215 +static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th,
1216 + struct inode *inode, b_blocknr_t block)
1218 + BUG_ON(!th->t_trans_id);
1219 + RFALSE(!th->t_super,
1220 + "vs-4060: trying to free block on nonexistent device");
1221 + if (!is_reusable(th->t_super, block, 1))
1222 + return;
1223 + _reiserfs_free_block(th, inode, block, 1);
1226 +static void __discard_prealloc(struct reiserfs_transaction_handle *th,
1227 + struct reiserfs_inode_info *ei)
1229 + unsigned long save = ei->i_prealloc_block;
1230 + int dirty = 0;
1231 + struct inode *inode = &ei->vfs_inode;
1233 + BUG_ON(!th->t_trans_id);
1234 +#ifdef CONFIG_REISERFS_CHECK
1235 + if (ei->i_prealloc_count < 0)
1236 + reiserfs_error(th->t_super, "zam-4001",
1237 + "inode has negative prealloc blocks count.");
1238 +#endif
1239 + while (ei->i_prealloc_count > 0) {
1240 + b_blocknr_t block_to_free;
1242 + /*
1243 + * reiserfs_free_prealloc_block can drop the write lock,
1244 + * which could allow another caller to free the same block.
1245 + * We can protect against it by modifying the prealloc
1246 + * state before calling it.
1247 + */
1248 + block_to_free = ei->i_prealloc_block++;
1249 + ei->i_prealloc_count--;
1250 + reiserfs_free_prealloc_block(th, inode, block_to_free);
1251 + dirty = 1;
1253 + if (dirty)
1254 + reiserfs_update_sd(th, inode);
1255 + ei->i_prealloc_block = save;
1256 + list_del_init(&ei->i_prealloc_list);
1259 +/* FIXME: It should be inline function */
1260 +void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
1261 + struct inode *inode)
1263 + struct reiserfs_inode_info *ei = REISERFS_I(inode);
1265 + BUG_ON(!th->t_trans_id);
1266 + if (ei->i_prealloc_count)
1267 + __discard_prealloc(th, ei);
1270 +void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th)
1272 + struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list;
1274 + BUG_ON(!th->t_trans_id);
1275 + while (!list_empty(plist)) {
1276 + struct reiserfs_inode_info *ei;
1277 + ei = list_entry(plist->next, struct reiserfs_inode_info,
1278 + i_prealloc_list);
1279 +#ifdef CONFIG_REISERFS_CHECK
1280 + if (!ei->i_prealloc_count) {
1281 + reiserfs_error(th->t_super, "zam-4001",
1282 + "inode is in prealloc list but has "
1283 + "no preallocated blocks.");
1285 +#endif
1286 + __discard_prealloc(th, ei);
1290 +void reiserfs_init_alloc_options(struct super_block *s)
1292 + set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
1293 + set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
1294 + set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s));
1297 +/* block allocator related options are parsed here */
1298 +int reiserfs_parse_alloc_options(struct super_block *s, char *options)
1300 + char *this_char, *value;
1302 + /* clear default settings */
1303 + REISERFS_SB(s)->s_alloc_options.bits = 0;
1305 + while ((this_char = strsep(&options, ":")) != NULL) {
1306 + if ((value = strchr(this_char, '=')) != NULL)
1307 + *value++ = 0;
1309 + if (!strcmp(this_char, "concentrating_formatted_nodes")) {
1310 + int temp;
1311 + SET_OPTION(concentrating_formatted_nodes);
1312 + temp = (value
1313 + && *value) ? simple_strtoul(value, &value,
1314 + 0) : 10;
1315 + if (temp <= 0 || temp > 100) {
1316 + REISERFS_SB(s)->s_alloc_options.border = 10;
1317 + } else {
1318 + REISERFS_SB(s)->s_alloc_options.border =
1319 + 100 / temp;
1321 + continue;
1323 + if (!strcmp(this_char, "displacing_large_files")) {
1324 + SET_OPTION(displacing_large_files);
1325 + REISERFS_SB(s)->s_alloc_options.large_file_size =
1326 + (value
1327 + && *value) ? simple_strtoul(value, &value, 0) : 16;
1328 + continue;
1330 + if (!strcmp(this_char, "displacing_new_packing_localities")) {
1331 + SET_OPTION(displacing_new_packing_localities);
1332 + continue;
1335 + if (!strcmp(this_char, "old_hashed_relocation")) {
1336 + SET_OPTION(old_hashed_relocation);
1337 + continue;
1340 + if (!strcmp(this_char, "new_hashed_relocation")) {
1341 + SET_OPTION(new_hashed_relocation);
1342 + continue;
1345 + if (!strcmp(this_char, "dirid_groups")) {
1346 + SET_OPTION(dirid_groups);
1347 + continue;
1349 + if (!strcmp(this_char, "oid_groups")) {
1350 + SET_OPTION(oid_groups);
1351 + continue;
1353 + if (!strcmp(this_char, "packing_groups")) {
1354 + SET_OPTION(packing_groups);
1355 + continue;
1357 + if (!strcmp(this_char, "hashed_formatted_nodes")) {
1358 + SET_OPTION(hashed_formatted_nodes);
1359 + continue;
1362 + if (!strcmp(this_char, "skip_busy")) {
1363 + SET_OPTION(skip_busy);
1364 + continue;
1367 + if (!strcmp(this_char, "hundredth_slices")) {
1368 + SET_OPTION(hundredth_slices);
1369 + continue;
1372 + if (!strcmp(this_char, "old_way")) {
1373 + SET_OPTION(old_way);
1374 + continue;
1377 + if (!strcmp(this_char, "displace_based_on_dirid")) {
1378 + SET_OPTION(displace_based_on_dirid);
1379 + continue;
1382 + if (!strcmp(this_char, "preallocmin")) {
1383 + REISERFS_SB(s)->s_alloc_options.preallocmin =
1384 + (value
1385 + && *value) ? simple_strtoul(value, &value, 0) : 4;
1386 + continue;
1389 + if (!strcmp(this_char, "preallocsize")) {
1390 + REISERFS_SB(s)->s_alloc_options.preallocsize =
1391 + (value
1392 + && *value) ? simple_strtoul(value, &value,
1393 + 0) :
1394 + PREALLOCATION_SIZE;
1395 + continue;
1398 + reiserfs_warning(s, "zam-4001", "unknown option - %s",
1399 + this_char);
1400 + return 1;
1403 + reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
1404 + return 0;
1407 +static void print_sep(struct seq_file *seq, int *first)
1409 + if (!*first)
1410 + seq_puts(seq, ":");
1411 + else
1412 + *first = 0;
1415 +void show_alloc_options(struct seq_file *seq, struct super_block *s)
1417 + int first = 1;
1419 + if (SB_ALLOC_OPTS(s) == ((1 << _ALLOC_skip_busy) |
1420 + (1 << _ALLOC_dirid_groups) | (1 << _ALLOC_packing_groups)))
1421 + return;
1423 + seq_puts(seq, ",alloc=");
1425 + if (TEST_OPTION(concentrating_formatted_nodes, s)) {
1426 + print_sep(seq, &first);
1427 + if (REISERFS_SB(s)->s_alloc_options.border != 10) {
1428 + seq_printf(seq, "concentrating_formatted_nodes=%d",
1429 + 100 / REISERFS_SB(s)->s_alloc_options.border);
1430 + } else
1431 + seq_puts(seq, "concentrating_formatted_nodes");
1433 + if (TEST_OPTION(displacing_large_files, s)) {
1434 + print_sep(seq, &first);
1435 + if (REISERFS_SB(s)->s_alloc_options.large_file_size != 16) {
1436 + seq_printf(seq, "displacing_large_files=%lu",
1437 + REISERFS_SB(s)->s_alloc_options.large_file_size);
1438 + } else
1439 + seq_puts(seq, "displacing_large_files");
1441 + if (TEST_OPTION(displacing_new_packing_localities, s)) {
1442 + print_sep(seq, &first);
1443 + seq_puts(seq, "displacing_new_packing_localities");
1445 + if (TEST_OPTION(old_hashed_relocation, s)) {
1446 + print_sep(seq, &first);
1447 + seq_puts(seq, "old_hashed_relocation");
1449 + if (TEST_OPTION(new_hashed_relocation, s)) {
1450 + print_sep(seq, &first);
1451 + seq_puts(seq, "new_hashed_relocation");
1453 + if (TEST_OPTION(dirid_groups, s)) {
1454 + print_sep(seq, &first);
1455 + seq_puts(seq, "dirid_groups");
1457 + if (TEST_OPTION(oid_groups, s)) {
1458 + print_sep(seq, &first);
1459 + seq_puts(seq, "oid_groups");
1461 + if (TEST_OPTION(packing_groups, s)) {
1462 + print_sep(seq, &first);
1463 + seq_puts(seq, "packing_groups");
1465 + if (TEST_OPTION(hashed_formatted_nodes, s)) {
1466 + print_sep(seq, &first);
1467 + seq_puts(seq, "hashed_formatted_nodes");
1469 + if (TEST_OPTION(skip_busy, s)) {
1470 + print_sep(seq, &first);
1471 + seq_puts(seq, "skip_busy");
1473 + if (TEST_OPTION(hundredth_slices, s)) {
1474 + print_sep(seq, &first);
1475 + seq_puts(seq, "hundredth_slices");
1477 + if (TEST_OPTION(old_way, s)) {
1478 + print_sep(seq, &first);
1479 + seq_puts(seq, "old_way");
1481 + if (TEST_OPTION(displace_based_on_dirid, s)) {
1482 + print_sep(seq, &first);
1483 + seq_puts(seq, "displace_based_on_dirid");
1485 + if (REISERFS_SB(s)->s_alloc_options.preallocmin != 0) {
1486 + print_sep(seq, &first);
1487 + seq_printf(seq, "preallocmin=%d",
1488 + REISERFS_SB(s)->s_alloc_options.preallocmin);
1490 + if (REISERFS_SB(s)->s_alloc_options.preallocsize != 17) {
1491 + print_sep(seq, &first);
1492 + seq_printf(seq, "preallocsize=%d",
1493 + REISERFS_SB(s)->s_alloc_options.preallocsize);
1497 +static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint)
1499 + char *hash_in;
1501 + if (hint->formatted_node) {
1502 + hash_in = (char *)&hint->key.k_dir_id;
1503 + } else {
1504 + if (!hint->inode) {
1505 + /*hint->search_start = hint->beg;*/
1506 + hash_in = (char *)&hint->key.k_dir_id;
1507 + } else
1508 + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
1509 + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
1510 + else
1511 + hash_in =
1512 + (char *)(&INODE_PKEY(hint->inode)->k_objectid);
1515 + hint->search_start =
1516 + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
1520 + * Relocation based on dirid, hashing them into a given bitmap block
1521 + * files. Formatted nodes are unaffected, a separate policy covers them
1522 + */
1523 +static void dirid_groups(reiserfs_blocknr_hint_t * hint)
1525 + unsigned long hash;
1526 + __u32 dirid = 0;
1527 + int bm = 0;
1528 + struct super_block *sb = hint->th->t_super;
1530 + if (hint->inode)
1531 + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
1532 + else if (hint->formatted_node)
1533 + dirid = hint->key.k_dir_id;
1535 + if (dirid) {
1536 + bm = bmap_hash_id(sb, dirid);
1537 + hash = bm * (sb->s_blocksize << 3);
1538 + /* give a portion of the block group to metadata */
1539 + if (hint->inode)
1540 + hash += sb->s_blocksize / 2;
1541 + hint->search_start = hash;
1546 + * Relocation based on oid, hashing them into a given bitmap block
1547 + * files. Formatted nodes are unaffected, a separate policy covers them
1548 + */
1549 +static void oid_groups(reiserfs_blocknr_hint_t * hint)
1551 + if (hint->inode) {
1552 + unsigned long hash;
1553 + __u32 oid;
1554 + __u32 dirid;
1555 + int bm;
1557 + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
1559 + /*
1560 + * keep the root dir and it's first set of subdirs close to
1561 + * the start of the disk
1562 + */
1563 + if (dirid <= 2)
1564 + hash = (hint->inode->i_sb->s_blocksize << 3);
1565 + else {
1566 + oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
1567 + bm = bmap_hash_id(hint->inode->i_sb, oid);
1568 + hash = bm * (hint->inode->i_sb->s_blocksize << 3);
1570 + hint->search_start = hash;
1575 + * returns 1 if it finds an indirect item and gets valid hint info
1576 + * from it, otherwise 0
1577 + */
1578 +static int get_left_neighbor(reiserfs_blocknr_hint_t * hint)
1580 + struct treepath *path;
1581 + struct buffer_head *bh;
1582 + struct item_head *ih;
1583 + int pos_in_item;
1584 + __le32 *item;
1585 + int ret = 0;
1587 + /*
1588 + * reiserfs code can call this function w/o pointer to path
1589 + * structure supplied; then we rely on supplied search_start
1590 + */
1591 + if (!hint->path)
1592 + return 0;
1594 + path = hint->path;
1595 + bh = get_last_bh(path);
1596 + RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor");
1597 + ih = tp_item_head(path);
1598 + pos_in_item = path->pos_in_item;
1599 + item = tp_item_body(path);
1601 + hint->search_start = bh->b_blocknr;
1603 + /*
1604 + * for indirect item: go to left and look for the first non-hole entry
1605 + * in the indirect item
1606 + */
1607 + if (!hint->formatted_node && is_indirect_le_ih(ih)) {
1608 + if (pos_in_item == I_UNFM_NUM(ih))
1609 + pos_in_item--;
1610 + while (pos_in_item >= 0) {
1611 + int t = get_block_num(item, pos_in_item);
1612 + if (t) {
1613 + hint->search_start = t;
1614 + ret = 1;
1615 + break;
1617 + pos_in_item--;
1621 + /* does result value fit into specified region? */
1622 + return ret;
1626 + * should be, if formatted node, then try to put on first part of the device
1627 + * specified as number of percent with mount option device, else try to put
1628 + * on last of device. This is not to say it is good code to do so,
1629 + * but the effect should be measured.
1630 + */
1631 +static inline void set_border_in_hint(struct super_block *s,
1632 + reiserfs_blocknr_hint_t * hint)
1634 + b_blocknr_t border =
1635 + SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border;
1637 + if (hint->formatted_node)
1638 + hint->end = border - 1;
1639 + else
1640 + hint->beg = border;
1643 +static inline void displace_large_file(reiserfs_blocknr_hint_t * hint)
1645 + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
1646 + hint->search_start =
1647 + hint->beg +
1648 + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id),
1649 + 4) % (hint->end - hint->beg);
1650 + else
1651 + hint->search_start =
1652 + hint->beg +
1653 + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid),
1654 + 4) % (hint->end - hint->beg);
1657 +static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint)
1659 + char *hash_in;
1661 + if (!hint->inode)
1662 + hash_in = (char *)&hint->key.k_dir_id;
1663 + else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
1664 + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
1665 + else
1666 + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
1668 + hint->search_start =
1669 + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
1672 +static inline int
1673 +this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *
1674 + hint)
1676 + return hint->block ==
1677 + REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size;
1680 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1681 +static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint)
1683 + struct in_core_key *key = &hint->key;
1685 + hint->th->displace_new_blocks = 0;
1686 + hint->search_start =
1687 + hint->beg + keyed_hash((char *)(&key->k_objectid),
1688 + 4) % (hint->end - hint->beg);
1690 +#endif
1692 +static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint)
1694 + b_blocknr_t border;
1695 + u32 hash_in;
1697 + if (hint->formatted_node || hint->inode == NULL) {
1698 + return 0;
1701 + hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
1702 + border =
1703 + hint->beg + (u32) keyed_hash(((char *)(&hash_in)),
1704 + 4) % (hint->end - hint->beg - 1);
1705 + if (border > hint->search_start)
1706 + hint->search_start = border;
1708 + return 1;
1711 +static inline int old_way(reiserfs_blocknr_hint_t * hint)
1713 + b_blocknr_t border;
1715 + if (hint->formatted_node || hint->inode == NULL) {
1716 + return 0;
1719 + border =
1720 + hint->beg +
1721 + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end -
1722 + hint->beg);
1723 + if (border > hint->search_start)
1724 + hint->search_start = border;
1726 + return 1;
1729 +static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint)
1731 + struct in_core_key *key = &hint->key;
1732 + b_blocknr_t slice_start;
1734 + slice_start =
1735 + (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100);
1736 + if (slice_start > hint->search_start
1737 + || slice_start + (hint->end / 100) <= hint->search_start) {
1738 + hint->search_start = slice_start;
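+/*
+ * In other words: the device is treated as 100 equal slices, the
+ * directory id is hashed to pick one of them, and search_start is
+ * pulled into that slice only when it currently lies outside of it.
+ */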
1742 +static void determine_search_start(reiserfs_blocknr_hint_t * hint,
1743 + int amount_needed)
1745 + struct super_block *s = hint->th->t_super;
1746 + int unfm_hint;
1748 + hint->beg = 0;
1749 + hint->end = SB_BLOCK_COUNT(s) - 1;
1751 + /* This is the former border algorithm, now with a tunable border offset */
1752 + if (concentrating_formatted_nodes(s))
1753 + set_border_in_hint(s, hint);
1755 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1756 + /*
1757 + * whenever we create a new directory, we displace it. At first
1758 + * we will hash for location, later we might look for a moderately
1759 + * empty place for it
1760 + */
1761 + if (displacing_new_packing_localities(s)
1762 + && hint->th->displace_new_blocks) {
1763 + displace_new_packing_locality(hint);
1765 + /*
1766 + * we do not continue determine_search_start,
1767 + * if new packing locality is being displaced
1768 + */
1769 + return;
1771 +#endif
1773 + /*
1774 + * all persons should feel encouraged to add more special cases
1775 + * here and test them
1776 + */
1778 + if (displacing_large_files(s) && !hint->formatted_node
1779 + && this_blocknr_allocation_would_make_it_a_large_file(hint)) {
1780 + displace_large_file(hint);
1781 + return;
1784 + /*
1785 + * if none of our special cases is relevant, use the left
1786 + * neighbor in the tree order of the new node we are allocating for
1787 + */
1788 + if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) {
1789 + hash_formatted_node(hint);
1790 + return;
1793 + unfm_hint = get_left_neighbor(hint);
1795 + /*
1796 + * Mimic the old block allocator's behaviour: if the VFS allowed
1797 + * preallocation, new blocks are displaced based on the directory ID.
1798 + * Also, if the suggested search_start is less than the last
1799 + * preallocated block, we start searching from that block, assuming
1800 + * that HDD dataflow is faster in the forward direction
1801 + */
1802 + if (TEST_OPTION(old_way, s)) {
1803 + if (!hint->formatted_node) {
1804 + if (!reiserfs_hashed_relocation(s))
1805 + old_way(hint);
1806 + else if (!reiserfs_no_unhashed_relocation(s))
1807 + old_hashed_relocation(hint);
1809 + if (hint->inode
1810 + && hint->search_start <
1811 + REISERFS_I(hint->inode)->i_prealloc_block)
1812 + hint->search_start =
1813 + REISERFS_I(hint->inode)->i_prealloc_block;
1815 + return;
1818 + /* This is an approach proposed by Hans */
1819 + if (TEST_OPTION(hundredth_slices, s)
1820 + && !(displacing_large_files(s) && !hint->formatted_node)) {
1821 + hundredth_slices(hint);
1822 + return;
1825 + /* old_hashed_relocation only works on unformatted */
1826 + if (!unfm_hint && !hint->formatted_node &&
1827 + TEST_OPTION(old_hashed_relocation, s)) {
1828 + old_hashed_relocation(hint);
1831 + /* new_hashed_relocation works with both formatted/unformatted nodes */
1832 + if ((!unfm_hint || hint->formatted_node) &&
1833 + TEST_OPTION(new_hashed_relocation, s)) {
1834 + new_hashed_relocation(hint);
1837 + /* dirid grouping works only on unformatted nodes */
1838 + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
1839 + dirid_groups(hint);
1841 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1842 + if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
1843 + dirid_groups(hint);
1845 +#endif
1847 + /* oid grouping works only on unformatted nodes */
1848 + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) {
1849 + oid_groups(hint);
1851 + return;
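+/*
+ * In short, determine_search_start() is a priority cascade: displaced
+ * packing localities, large-file displacement and hashed formatted
+ * nodes each return early; otherwise the left-neighbor hint from
+ * get_left_neighbor() is refined by whichever relocation options
+ * (old_way, hundredth_slices, the hashed relocations, dirid/oid
+ * groups) are enabled on this mount.
+ */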
1854 +static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint)
1856 + /* make minimum size a mount option and benchmark both ways */
1857 + /* we preallocate blocks only for regular files, specific size */
1858 + /* benchmark preallocating always and see what happens */
1860 + hint->prealloc_size = 0;
1862 + if (!hint->formatted_node && hint->preallocate) {
1863 + if (S_ISREG(hint->inode->i_mode) && !IS_PRIVATE(hint->inode)
1864 + && hint->inode->i_size >=
1865 + REISERFS_SB(hint->th->t_super)->s_alloc_options.
1866 + preallocmin * hint->inode->i_sb->s_blocksize)
1867 + hint->prealloc_size =
1868 + REISERFS_SB(hint->th->t_super)->s_alloc_options.
1869 + preallocsize - 1;
1871 + return CARRY_ON;
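+/*
+ * Example (hypothetical option values): with preallocmin of 4 and a
+ * 4 KiB block size, only regular, non-private files of at least 16 KiB
+ * get preallocation, and each such allocation then asks scan_bitmap()
+ * for up to preallocsize - 1 extra blocks.
+ */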
1874 +static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint,
1875 + b_blocknr_t * new_blocknrs,
1876 + b_blocknr_t start,
1877 + b_blocknr_t finish, int min,
1878 + int amount_needed,
1879 + int prealloc_size)
1881 + int rest = amount_needed;
1882 + int nr_allocated;
1884 + while (rest > 0 && start <= finish) {
1885 + nr_allocated = scan_bitmap(hint->th, &start, finish, min,
1886 + rest + prealloc_size,
1887 + !hint->formatted_node, hint->block);
1889 + if (nr_allocated == 0) /* no new blocks allocated, return */
1890 + break;
1892 + /* fill free_blocknrs array first */
1893 + while (rest > 0 && nr_allocated > 0) {
1894 + *new_blocknrs++ = start++;
1895 + rest--;
1896 + nr_allocated--;
1899 + /* do we also have something to fill the prealloc. array with? */
1900 + if (nr_allocated > 0) {
1901 + /*
1902 + * it means prealloc_size was greater than 0 and
1903 + * we do preallocation
1904 + */
1905 + list_add(&REISERFS_I(hint->inode)->i_prealloc_list,
1906 + &SB_JOURNAL(hint->th->t_super)->
1907 + j_prealloc_list);
1908 + REISERFS_I(hint->inode)->i_prealloc_block = start;
1909 + REISERFS_I(hint->inode)->i_prealloc_count =
1910 + nr_allocated;
1911 + break;
1915 + return (amount_needed - rest);
1918 +static inline int blocknrs_and_prealloc_arrays_from_search_start
1919 + (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs,
1920 + int amount_needed) {
1921 + struct super_block *s = hint->th->t_super;
1922 + b_blocknr_t start = hint->search_start;
1923 + b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
1924 + int passno = 0;
1925 + int nr_allocated = 0;
1926 + int depth;
1928 + determine_prealloc_size(hint);
1929 + if (!hint->formatted_node) {
1930 + int quota_ret;
1931 +#ifdef REISERQUOTA_DEBUG
1932 + reiserfs_debug(s, REISERFS_DEBUG_CODE,
1933 + "reiserquota: allocating %d blocks id=%u",
1934 + amount_needed, hint->inode->i_uid);
1935 +#endif
1936 + depth = reiserfs_write_unlock_nested(s);
1937 + quota_ret =
1938 + dquot_alloc_block_nodirty(hint->inode, amount_needed);
1939 + if (quota_ret) { /* Quota exceeded? */
1940 + reiserfs_write_lock_nested(s, depth);
1941 + return QUOTA_EXCEEDED;
1943 + if (hint->preallocate && hint->prealloc_size) {
1944 +#ifdef REISERQUOTA_DEBUG
1945 + reiserfs_debug(s, REISERFS_DEBUG_CODE,
1946 + "reiserquota: allocating (prealloc) %d blocks id=%u",
1947 + hint->prealloc_size, hint->inode->i_uid);
1948 +#endif
1949 + quota_ret = dquot_prealloc_block_nodirty(hint->inode,
1950 + hint->prealloc_size);
1951 + if (quota_ret)
1952 + hint->preallocate = hint->prealloc_size = 0;
1954 + /* for unformatted nodes, force large allocations */
1955 + reiserfs_write_lock_nested(s, depth);
1958 + do {
1959 + switch (passno++) {
1960 + case 0: /* Search from hint->search_start to end of disk */
1961 + start = hint->search_start;
1962 + finish = SB_BLOCK_COUNT(s) - 1;
1963 + break;
1964 + case 1: /* Search from hint->beg to hint->search_start */
1965 + start = hint->beg;
1966 + finish = hint->search_start;
1967 + break;
1968 + case 2: /* Last chance: Search from 0 to hint->beg */
1969 + start = 0;
1970 + finish = hint->beg;
1971 + break;
1972 + default:
1973 + /* We've tried searching everywhere, not enough space */
1974 + /* Free the blocks */
1975 + if (!hint->formatted_node) {
1976 +#ifdef REISERQUOTA_DEBUG
1977 + reiserfs_debug(s, REISERFS_DEBUG_CODE,
1978 + "reiserquota: freeing (nospace) %d blocks id=%u",
1979 + amount_needed +
1980 + hint->prealloc_size -
1981 + nr_allocated,
1982 + hint->inode->i_uid);
1983 +#endif
1984 + /* Free not allocated blocks */
1985 + depth = reiserfs_write_unlock_nested(s);
1986 + dquot_free_block_nodirty(hint->inode,
1987 + amount_needed + hint->prealloc_size -
1988 + nr_allocated);
1989 + reiserfs_write_lock_nested(s, depth);
1991 + while (nr_allocated--)
1992 + reiserfs_free_block(hint->th, hint->inode,
1993 + new_blocknrs[nr_allocated],
1994 + !hint->formatted_node);
1996 + return NO_DISK_SPACE;
1998 + } while ((nr_allocated += allocate_without_wrapping_disk(hint,
1999 + new_blocknrs +
2000 + nr_allocated,
2001 + start, finish,
2002 + 1,
2003 + amount_needed -
2004 + nr_allocated,
2005 + hint->
2006 + prealloc_size))
2007 + < amount_needed);
2008 + if (!hint->formatted_node &&
2009 + amount_needed + hint->prealloc_size >
2010 + nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) {
2011 + /* Some of the preallocation blocks were not allocated */
2012 +#ifdef REISERQUOTA_DEBUG
2013 + reiserfs_debug(s, REISERFS_DEBUG_CODE,
2014 + "reiserquota: freeing (failed prealloc) %d blocks id=%u",
2015 + amount_needed + hint->prealloc_size -
2016 + nr_allocated -
2017 + REISERFS_I(hint->inode)->i_prealloc_count,
2018 + hint->inode->i_uid);
2019 +#endif
2021 + depth = reiserfs_write_unlock_nested(s);
2022 + dquot_free_block_nodirty(hint->inode, amount_needed +
2023 + hint->prealloc_size - nr_allocated -
2024 + REISERFS_I(hint->inode)->
2025 + i_prealloc_count);
2026 + reiserfs_write_lock_nested(s, depth);
2029 + return CARRY_ON;
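+/*
+ * Illustration of the pass order above (hypothetical numbers): with a
+ * 1,000,000-block device, hint->beg at 100,000 and a search start of
+ * 600,000, pass 0 scans 600,000..999,999, pass 1 scans
+ * 100,000..600,000, and the last-chance pass 2 scans 0..99,999 before
+ * the function gives up with NO_DISK_SPACE.
+ */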
2032 +/* grab new blocknrs from preallocated list */
2033 +/* return amount still needed after using them */
2034 +static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint,
2035 + b_blocknr_t * new_blocknrs,
2036 + int amount_needed)
2038 + struct inode *inode = hint->inode;
2040 + if (REISERFS_I(inode)->i_prealloc_count > 0) {
2041 + while (amount_needed) {
2043 + *new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++;
2044 + REISERFS_I(inode)->i_prealloc_count--;
2046 + amount_needed--;
2048 + if (REISERFS_I(inode)->i_prealloc_count <= 0) {
2049 + list_del(&REISERFS_I(inode)->i_prealloc_list);
2050 + break;
2054 + /* return amount still needed after using preallocated blocks */
2055 + return amount_needed;
2058 +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint,
2059 + b_blocknr_t *new_blocknrs,
2060 + int amount_needed,
2061 + /* Amount of blocks we have already reserved */
2062 + int reserved_by_us)
2064 + int initial_amount_needed = amount_needed;
2065 + int ret;
2066 + struct super_block *s = hint->th->t_super;
2068 + /* Check if there is enough space, taking into account reserved space */
2069 + if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks <
2070 + amount_needed - reserved_by_us)
2071 + return NO_DISK_SPACE;
2072 + /* should this be if !hint->inode && hint->preallocate? */
2073 + /* do you mean hint->formatted_node can be removed ? - Zam */
2074 + /*
2075 + * hint->formatted_node cannot be removed because we try to access
2076 + * inode information here, and there is often no inode associated with
2077 + * metadata allocations - green
2078 + */
2080 + if (!hint->formatted_node && hint->preallocate) {
2081 + amount_needed = use_preallocated_list_if_available
2082 + (hint, new_blocknrs, amount_needed);
2084 + /*
2085 + * We have all the block numbers we need from the
2086 + * prealloc list
2087 + */
2088 + if (amount_needed == 0)
2089 + return CARRY_ON;
2090 + new_blocknrs += (initial_amount_needed - amount_needed);
2093 + /* find search start and save it in hint structure */
2094 + determine_search_start(hint, amount_needed);
2095 + if (hint->search_start >= SB_BLOCK_COUNT(s))
2096 + hint->search_start = SB_BLOCK_COUNT(s) - 1;
2098 + /* allocation itself; fill new_blocknrs and preallocation arrays */
2099 + ret = blocknrs_and_prealloc_arrays_from_search_start
2100 + (hint, new_blocknrs, amount_needed);
2102 + /*
2103 + * We used prealloc. list to fill (partially) new_blocknrs array.
2104 + * If the final allocation fails we need to return blocks to the
2105 + * prealloc. list or just free them. -- Zam (I chose the second
2106 + * variant)
2107 + */
2108 + if (ret != CARRY_ON) {
2109 + while (amount_needed++ < initial_amount_needed) {
2110 + reiserfs_free_block(hint->th, hint->inode,
2111 + *(--new_blocknrs), 1);
2114 + return ret;
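+/*
+ * A minimal caller sketch (hypothetical variables, kept inside a
+ * comment so it is clearly illustrative and not part of the patch):
+ *
+ *	reiserfs_blocknr_hint_t hint = {0};
+ *	b_blocknr_t blocknr;
+ *
+ *	hint.th = th;			// running transaction handle
+ *	hint.inode = inode;		// NULL for metadata allocations
+ *	hint.block = file_block;	// logical block being mapped
+ *	hint.formatted_node = 0;	// allocating a data block
+ *	hint.preallocate = 1;		// allow preallocation
+ *	if (reiserfs_allocate_blocknrs(&hint, &blocknr, 1, 0) == CARRY_ON)
+ *		... blocknr holds the newly allocated block ...
+ */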
2117 +void reiserfs_cache_bitmap_metadata(struct super_block *sb,
2118 + struct buffer_head *bh,
2119 + struct reiserfs_bitmap_info *info)
2121 + unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size);
2123 + /* The first bit must ALWAYS be 1 */
2124 + if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data))
2125 + reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is "
2126 + "corrupted: first bit must be 1", bh->b_blocknr);
2128 + info->free_count = 0;
2130 + while (--cur >= (unsigned long *)bh->b_data) {
2131 + /* 0 and ~0 are special, we can optimize for them */
2132 + if (*cur == 0)
2133 + info->free_count += BITS_PER_LONG;
2134 + else if (*cur != ~0L) /* A mix, investigate */
2135 + info->free_count += BITS_PER_LONG - hweight_long(*cur);
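+/*
+ * For illustration: on a 64-bit build a bitmap word of 0 contributes
+ * 64 free blocks, a word of ~0 contributes none, and a mixed word such
+ * as 0x00000000ffffffffUL contributes 64 - hweight_long() = 32.
+ */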
2139 +struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
2140 + unsigned int bitmap)
2142 + b_blocknr_t block = (sb->s_blocksize << 3) * bitmap;
2143 + struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap;
2144 + struct buffer_head *bh;
2146 + /*
2147 + * Way old format filesystems had the bitmaps packed up front.
2148 + * I doubt there are any of these left, but just in case...
2149 + */
2150 + if (unlikely(test_bit(REISERFS_OLD_FORMAT,
2151 + &REISERFS_SB(sb)->s_properties)))
2152 + block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap;
2153 + else if (bitmap == 0)
2154 + block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
2156 + bh = sb_bread(sb, block);
2157 + if (bh == NULL)
2158 + reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
2159 + "reading failed", __func__, block);
2160 + else {
2161 + if (buffer_locked(bh)) {
2162 + int depth;
2163 + PROC_INFO_INC(sb, scan_bitmap.wait);
2164 + depth = reiserfs_write_unlock_nested(sb);
2165 + __wait_on_buffer(bh);
2166 + reiserfs_write_lock_nested(sb, depth);
2168 + BUG_ON(!buffer_uptodate(bh));
2169 + BUG_ON(atomic_read(&bh->b_count) == 0);
2171 + if (info->free_count == UINT_MAX)
2172 + reiserfs_cache_bitmap_metadata(sb, bh, info);
2175 + return bh;
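+/*
+ * Layout note (assuming a 4 KiB block size): on the current format,
+ * bitmap n normally lives at block n * 32768; the exception handled
+ * above is bitmap 0, which sits just past the 64 KiB disk offset, i.e.
+ * at block 17.
+ */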
2178 +int reiserfs_init_bitmap_cache(struct super_block *sb)
2180 + struct reiserfs_bitmap_info *bitmap;
2181 + unsigned int bmap_nr = reiserfs_bmap_count(sb);
2183 + bitmap = vmalloc(array_size(bmap_nr, sizeof(*bitmap)));
2184 + if (bitmap == NULL)
2185 + return -ENOMEM;
2187 + memset(bitmap, 0xff, sizeof(*bitmap) * bmap_nr);
2189 + SB_AP_BITMAP(sb) = bitmap;
2191 + return 0;
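+/*
+ * Note that the 0xff fill above deliberately sets every
+ * info->free_count to UINT_MAX, which reiserfs_read_bitmap_block()
+ * treats as "metadata not cached yet" and repopulates on first read.
+ */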
2194 +void reiserfs_free_bitmap_cache(struct super_block *sb)
2196 + if (SB_AP_BITMAP(sb)) {
2197 + vfree(SB_AP_BITMAP(sb));
2198 + SB_AP_BITMAP(sb) = NULL;
2201 diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
2202 new file mode 100644
2203 index 000000000000..79ee2b436685
2204 --- /dev/null
2205 +++ b/fs/reiserfs/dir.c
2206 @@ -0,0 +1,346 @@
2208 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
2209 + */
2211 +#include <linux/string.h>
2212 +#include <linux/errno.h>
2213 +#include <linux/fs.h>
2214 +#include "reiserfs.h"
2215 +#include <linux/stat.h>
2216 +#include <linux/buffer_head.h>
2217 +#include <linux/slab.h>
2218 +#include <linux/uaccess.h>
2220 +extern const struct reiserfs_key MIN_KEY;
2222 +static int reiserfs_readdir(struct file *, struct dir_context *);
2223 +static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
2224 + int datasync);
2226 +const struct file_operations reiserfs_dir_operations = {
2227 + .llseek = generic_file_llseek,
2228 + .read = generic_read_dir,
2229 + .iterate_shared = reiserfs_readdir,
2230 + .fsync = reiserfs_dir_fsync,
2231 + .unlocked_ioctl = reiserfs_ioctl,
2232 +#ifdef CONFIG_COMPAT
2233 + .compat_ioctl = reiserfs_compat_ioctl,
2234 +#endif
2237 +static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
2238 + int datasync)
2240 + struct inode *inode = filp->f_mapping->host;
2241 + int err;
2243 + err = file_write_and_wait_range(filp, start, end);
2244 + if (err)
2245 + return err;
2247 + inode_lock(inode);
2248 + reiserfs_write_lock(inode->i_sb);
2249 + err = reiserfs_commit_for_inode(inode);
2250 + reiserfs_write_unlock(inode->i_sb);
2251 + inode_unlock(inode);
2252 + if (err < 0)
2253 + return err;
2254 + return 0;
2257 +#define store_ih(where,what) copy_item_head (where, what)
2259 +static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
2261 + struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
2262 + return (d_really_is_positive(privroot) &&
2263 + deh->deh_objectid == INODE_PKEY(d_inode(privroot))->k_objectid);
2266 +int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
2269 + /* key of current position in the directory (key of directory entry) */
2270 + struct cpu_key pos_key;
2272 + INITIALIZE_PATH(path_to_entry);
2273 + struct buffer_head *bh;
2274 + int item_num, entry_num;
2275 + const struct reiserfs_key *rkey;
2276 + struct item_head *ih, tmp_ih;
2277 + int search_res;
2278 + char *local_buf;
2279 + loff_t next_pos;
2280 + char small_buf[32]; /* avoid kmalloc if we can */
2281 + struct reiserfs_dir_entry de;
2282 + int ret = 0;
2283 + int depth;
2285 + reiserfs_write_lock(inode->i_sb);
2287 + reiserfs_check_lock_depth(inode->i_sb, "readdir");
2289 + /*
2290 + * form the key to search for the next directory entry, using
2291 + * the f_pos field of the file structure
2292 + */
2293 + make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
2294 + next_pos = cpu_key_k_offset(&pos_key);
2296 + path_to_entry.reada = PATH_READA;
2297 + while (1) {
2298 +research:
2299 + /*
2300 + * search for the directory item containing the entry with
2301 + * the specified key
2302 + */
2303 + search_res =
2304 + search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry,
2305 + &de);
2306 + if (search_res == IO_ERROR) {
2307 + /*
2308 + * FIXME: we could just skip part of directory
2309 + * which could not be read
2310 + */
2311 + ret = -EIO;
2312 + goto out;
2314 + entry_num = de.de_entry_num;
2315 + bh = de.de_bh;
2316 + item_num = de.de_item_num;
2317 + ih = de.de_ih;
2318 + store_ih(&tmp_ih, ih);
2320 + /* we must have found an item, that is, an item of this directory */
2321 + RFALSE(COMP_SHORT_KEYS(&ih->ih_key, &pos_key),
2322 + "vs-9000: found item %h does not match to dir we readdir %K",
2323 + ih, &pos_key);
2324 + RFALSE(item_num > B_NR_ITEMS(bh) - 1,
2325 + "vs-9005 item_num == %d, item amount == %d",
2326 + item_num, B_NR_ITEMS(bh));
2328 + /*
2329 + * and the entry number must not exceed the number of
2330 + * entries in the item
2331 + */
2332 + RFALSE(ih_entry_count(ih) < entry_num,
2333 + "vs-9010: entry number is too big %d (%d)",
2334 + entry_num, ih_entry_count(ih));
2336 + /*
2337 + * go through all entries in the directory item beginning
2338 + * from the entry that has been found
2339 + */
2340 + if (search_res == POSITION_FOUND
2341 + || entry_num < ih_entry_count(ih)) {
2342 + struct reiserfs_de_head *deh =
2343 + B_I_DEH(bh, ih) + entry_num;
2345 + for (; entry_num < ih_entry_count(ih);
2346 + entry_num++, deh++) {
2347 + int d_reclen;
2348 + char *d_name;
2349 + ino_t d_ino;
2350 + loff_t cur_pos = deh_offset(deh);
2352 + /* it is a hidden entry */
2353 + if (!de_visible(deh))
2354 + continue;
2355 + d_reclen = entry_length(bh, ih, entry_num);
2356 + d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh);
2358 + if (d_reclen <= 0 ||
2359 + d_name + d_reclen > bh->b_data + bh->b_size) {
2360 + /*
2361 + * There is corrupted data in the entry;
2362 + * we'd better stop here
2363 + */
2364 + pathrelse(&path_to_entry);
2365 + ret = -EIO;
2366 + goto out;
2369 + if (!d_name[d_reclen - 1])
2370 + d_reclen = strlen(d_name);
2372 + /* too big to send back to VFS */
2373 + if (d_reclen >
2374 + REISERFS_MAX_NAME(inode->i_sb->
2375 + s_blocksize)) {
2376 + continue;
2379 + /* Ignore the .reiserfs_priv entry */
2380 + if (is_privroot_deh(inode, deh))
2381 + continue;
2383 + ctx->pos = deh_offset(deh);
2384 + d_ino = deh_objectid(deh);
2385 + if (d_reclen <= 32) {
2386 + local_buf = small_buf;
2387 + } else {
2388 + local_buf = kmalloc(d_reclen,
2389 + GFP_NOFS);
2390 + if (!local_buf) {
2391 + pathrelse(&path_to_entry);
2392 + ret = -ENOMEM;
2393 + goto out;
2395 + if (item_moved(&tmp_ih, &path_to_entry)) {
2396 + kfree(local_buf);
2397 + goto research;
2401 + /*
2402 + * Note that we copy the name to user space via a
2403 + * temporary buffer (local_buf) because filldir
2404 + * can block if the user-space buffer is swapped
2405 + * out; while it sleeps the entry can move
2406 + * somewhere else
2407 + */
2408 + memcpy(local_buf, d_name, d_reclen);
2410 + /*
2411 + * Since filldir might sleep, we can release
2412 + * the write lock here for other waiters
2413 + */
2414 + depth = reiserfs_write_unlock_nested(inode->i_sb);
2415 + if (!dir_emit
2416 + (ctx, local_buf, d_reclen, d_ino,
2417 + DT_UNKNOWN)) {
2418 + reiserfs_write_lock_nested(inode->i_sb, depth);
2419 + if (local_buf != small_buf) {
2420 + kfree(local_buf);
2422 + goto end;
2424 + reiserfs_write_lock_nested(inode->i_sb, depth);
2425 + if (local_buf != small_buf) {
2426 + kfree(local_buf);
2429 + /* deh_offset(deh) may be invalid now. */
2430 + next_pos = cur_pos + 1;
2432 + if (item_moved(&tmp_ih, &path_to_entry)) {
2433 + set_cpu_key_k_offset(&pos_key,
2434 + next_pos);
2435 + goto research;
2437 + } /* for */
2440 + /* end of directory has been reached */
2441 + if (item_num != B_NR_ITEMS(bh) - 1)
2442 + goto end;
2444 + /*
2445 + * the item we went through is the last item of the node. Using the
2446 + * right delimiting key, check whether this is the directory's end
2447 + */
2448 + rkey = get_rkey(&path_to_entry, inode->i_sb);
2449 + if (!comp_le_keys(rkey, &MIN_KEY)) {
2450 + /*
2451 + * set pos_key to the smallest key greater than the
2452 + * key of the last entry in the item
2453 + */
2454 + set_cpu_key_k_offset(&pos_key, next_pos);
2455 + continue;
2458 + /* end of directory has been reached */
2459 + if (COMP_SHORT_KEYS(rkey, &pos_key)) {
2460 + goto end;
2463 + /* directory continues in the right neighboring block */
2464 + set_cpu_key_k_offset(&pos_key,
2465 + le_key_k_offset(KEY_FORMAT_3_5, rkey));
2467 + } /* while */
2469 +end:
2470 + ctx->pos = next_pos;
2471 + pathrelse(&path_to_entry);
2472 + reiserfs_check_path(&path_to_entry);
2473 +out:
2474 + reiserfs_write_unlock(inode->i_sb);
2475 + return ret;
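+/*
+ * Note that f_pos for a reiserfs directory is not a linear index but
+ * the deh_offset() of the next entry (a hash-derived key offset),
+ * which is why the loop above re-searches the tree by key on every
+ * iteration and after every item_moved() race.
+ */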
2478 +static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
2480 + return reiserfs_readdir_inode(file_inode(file), ctx);
2484 + * compose directory item containing "." and ".." entries (entries are
2485 + * not aligned to a 4-byte boundary)
2486 + */
2487 +void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
2488 + __le32 par_dirid, __le32 par_objid)
2490 + struct reiserfs_de_head *dot, *dotdot;
2492 + memset(body, 0, EMPTY_DIR_SIZE_V1);
2493 + dot = (struct reiserfs_de_head *)body;
2494 + dotdot = dot + 1;
2496 + /* direntry header of "." */
2497 + put_deh_offset(dot, DOT_OFFSET);
2498 + /* these two are from make_le_item_head, and are LE */
2499 + dot->deh_dir_id = dirid;
2500 + dot->deh_objectid = objid;
2501 + dot->deh_state = 0; /* Endian safe if 0 */
2502 + put_deh_location(dot, EMPTY_DIR_SIZE_V1 - strlen("."));
2503 + mark_de_visible(dot);
2505 + /* direntry header of ".." */
2506 + put_deh_offset(dotdot, DOT_DOT_OFFSET);
2507 + /* key of ".." for the root directory */
2508 + /* these two are from the inode, and are LE */
2509 + dotdot->deh_dir_id = par_dirid;
2510 + dotdot->deh_objectid = par_objid;
2511 + dotdot->deh_state = 0; /* Endian safe if 0 */
2512 + put_deh_location(dotdot, deh_location(dot) - strlen(".."));
2513 + mark_de_visible(dotdot);
2515 + /* copy ".." and "." */
2516 + memcpy(body + deh_location(dot), ".", 1);
2517 + memcpy(body + deh_location(dotdot), "..", 2);
2520 +/* compose directory item containing "." and ".." entries */
2521 +void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
2522 + __le32 par_dirid, __le32 par_objid)
2524 + struct reiserfs_de_head *dot, *dotdot;
2526 + memset(body, 0, EMPTY_DIR_SIZE);
2527 + dot = (struct reiserfs_de_head *)body;
2528 + dotdot = dot + 1;
2530 + /* direntry header of "." */
2531 + put_deh_offset(dot, DOT_OFFSET);
2532 + /* these two are from make_le_item_head, and are LE */
2533 + dot->deh_dir_id = dirid;
2534 + dot->deh_objectid = objid;
2535 + dot->deh_state = 0; /* Endian safe if 0 */
2536 + put_deh_location(dot, EMPTY_DIR_SIZE - ROUND_UP(strlen(".")));
2537 + mark_de_visible(dot);
2539 + /* direntry header of ".." */
2540 + put_deh_offset(dotdot, DOT_DOT_OFFSET);
2541 + /* key of ".." for the root directory */
2542 + /* these two are from the inode, and are LE */
2543 + dotdot->deh_dir_id = par_dirid;
2544 + dotdot->deh_objectid = par_objid;
2545 + dotdot->deh_state = 0; /* Endian safe if 0 */
2546 + put_deh_location(dotdot, deh_location(dot) - ROUND_UP(strlen("..")));
2547 + mark_de_visible(dotdot);
2549 + /* copy ".." and "." */
2550 + memcpy(body + deh_location(dot), ".", 1);
2551 + memcpy(body + deh_location(dotdot), "..", 2);
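+/*
+ * Resulting layout sketch: two struct reiserfs_de_head records sit at
+ * the front of the item; the name "." is stored at the very tail of
+ * the EMPTY_DIR_SIZE body and ".." just below it, each padded to a
+ * 4-byte boundary by ROUND_UP() in this v2 variant.
+ */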
2553 diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
2554 new file mode 100644
2555 index 000000000000..5129efc6f2e6
2556 --- /dev/null
2557 +++ b/fs/reiserfs/do_balan.c
2558 @@ -0,0 +1,1900 @@
2560 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
2561 + */
2564 + * Now we have all the buffers that must be used in balancing the tree.
2565 + * Further calculations cannot cause schedule(), and thus the buffer
2566 + * tree will be stable until the balancing is finished. Balance the
2567 + * tree according to the analysis made before, using the buffers
2568 + * obtained in all of the above.
2569 + */
2571 +#include <linux/uaccess.h>
2572 +#include <linux/time.h>
2573 +#include "reiserfs.h"
2574 +#include <linux/buffer_head.h>
2575 +#include <linux/kernel.h>
2577 +static inline void buffer_info_init_left(struct tree_balance *tb,
2578 + struct buffer_info *bi)
2580 + bi->tb = tb;
2581 + bi->bi_bh = tb->L[0];
2582 + bi->bi_parent = tb->FL[0];
2583 + bi->bi_position = get_left_neighbor_position(tb, 0);
2586 +static inline void buffer_info_init_right(struct tree_balance *tb,
2587 + struct buffer_info *bi)
2589 + bi->tb = tb;
2590 + bi->bi_bh = tb->R[0];
2591 + bi->bi_parent = tb->FR[0];
2592 + bi->bi_position = get_right_neighbor_position(tb, 0);
2595 +static inline void buffer_info_init_tbS0(struct tree_balance *tb,
2596 + struct buffer_info *bi)
2598 + bi->tb = tb;
2599 + bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
2600 + bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
2601 + bi->bi_position = PATH_H_POSITION(tb->tb_path, 1);
2604 +static inline void buffer_info_init_bh(struct tree_balance *tb,
2605 + struct buffer_info *bi,
2606 + struct buffer_head *bh)
2608 + bi->tb = tb;
2609 + bi->bi_bh = bh;
2610 + bi->bi_parent = NULL;
2611 + bi->bi_position = 0;
2614 +inline void do_balance_mark_leaf_dirty(struct tree_balance *tb,
2615 + struct buffer_head *bh, int flag)
2617 + journal_mark_dirty(tb->transaction_handle, bh);
2620 +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
2621 +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
2624 + * summary:
2625 + * if deleting something ( tb->insert_size[0] < 0 )
2626 + * return(balance_leaf_when_delete()); (flag d handled here)
2627 + * else
2628 + * if lnum is larger than 0 we put items into the left node
2629 + * if rnum is larger than 0 we put items into the right node
2630 + * if snum1 is larger than 0 we put items into the new node s1
2631 + * if snum2 is larger than 0 we put items into the new node s2
2632 + * Note that all *num* count new items being created.
2633 + */
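+/*
+ * Illustration (hypothetical values): with lnum[0] == 2, rnum[0] == 0
+ * and both snum values 0, two items (or one whole item plus lbytes of
+ * a split one) move from S[0] into L[0], and the new item is then
+ * inserted or pasted wherever item_pos points after the shift; the
+ * negative insert_size case is handled by balance_leaf_when_delete().
+ */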
2635 +static void balance_leaf_when_delete_del(struct tree_balance *tb)
2637 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2638 + int item_pos = PATH_LAST_POSITION(tb->tb_path);
2639 + struct buffer_info bi;
2640 +#ifdef CONFIG_REISERFS_CHECK
2641 + struct item_head *ih = item_head(tbS0, item_pos);
2642 +#endif
2644 + RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
2645 + "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
2646 + -tb->insert_size[0], ih);
2648 + buffer_info_init_tbS0(tb, &bi);
2649 + leaf_delete_items(&bi, 0, item_pos, 1, -1);
2651 + if (!item_pos && tb->CFL[0]) {
2652 + if (B_NR_ITEMS(tbS0)) {
2653 + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
2654 + } else {
2655 + if (!PATH_H_POSITION(tb->tb_path, 1))
2656 + replace_key(tb, tb->CFL[0], tb->lkey[0],
2657 + PATH_H_PPARENT(tb->tb_path, 0), 0);
2661 + RFALSE(!item_pos && !tb->CFL[0],
2662 + "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0],
2663 + tb->L[0]);
2666 +/* cut item in S[0] */
2667 +static void balance_leaf_when_delete_cut(struct tree_balance *tb)
2669 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2670 + int item_pos = PATH_LAST_POSITION(tb->tb_path);
2671 + struct item_head *ih = item_head(tbS0, item_pos);
2672 + int pos_in_item = tb->tb_path->pos_in_item;
2673 + struct buffer_info bi;
2674 + buffer_info_init_tbS0(tb, &bi);
2676 + if (is_direntry_le_ih(ih)) {
2677 + /*
2678 + * UFS unlink semantics are such that you can only
2679 + * delete one directory entry at a time.
2681 + * when we cut a directory tb->insert_size[0] means
2682 + * number of entries to be cut (always 1)
2683 + */
2684 + tb->insert_size[0] = -1;
2685 + leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
2686 + -tb->insert_size[0]);
2688 + RFALSE(!item_pos && !pos_in_item && !tb->CFL[0],
2689 + "PAP-12030: can not change delimiting key. CFL[0]=%p",
2690 + tb->CFL[0]);
2692 + if (!item_pos && !pos_in_item && tb->CFL[0])
2693 + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
2694 + } else {
2695 + leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
2696 + -tb->insert_size[0]);
2698 + RFALSE(!ih_item_len(ih),
2699 + "PAP-12035: cut must leave non-zero dynamic "
2700 + "length of item");
2704 +static int balance_leaf_when_delete_left(struct tree_balance *tb)
2706 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2707 + int n = B_NR_ITEMS(tbS0);
2709 + /* L[0] must be joined with S[0] */
2710 + if (tb->lnum[0] == -1) {
2711 + /* R[0] must be also joined with S[0] */
2712 + if (tb->rnum[0] == -1) {
2713 + if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) {
2714 + /*
2715 + * all contents of all the
2716 + * 3 buffers will be in L[0]
2717 + */
2718 + if (PATH_H_POSITION(tb->tb_path, 1) == 0 &&
2719 + 1 < B_NR_ITEMS(tb->FR[0]))
2720 + replace_key(tb, tb->CFL[0],
2721 + tb->lkey[0], tb->FR[0], 1);
2723 + leaf_move_items(LEAF_FROM_S_TO_L, tb, n, -1,
2724 + NULL);
2725 + leaf_move_items(LEAF_FROM_R_TO_L, tb,
2726 + B_NR_ITEMS(tb->R[0]), -1,
2727 + NULL);
2729 + reiserfs_invalidate_buffer(tb, tbS0);
2730 + reiserfs_invalidate_buffer(tb, tb->R[0]);
2732 + return 0;
2735 + /* all contents of all the 3 buffers will be in R[0] */
2736 + leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, NULL);
2737 + leaf_move_items(LEAF_FROM_L_TO_R, tb,
2738 + B_NR_ITEMS(tb->L[0]), -1, NULL);
2740 + /* right_delimiting_key is correct in R[0] */
2741 + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
2743 + reiserfs_invalidate_buffer(tb, tbS0);
2744 + reiserfs_invalidate_buffer(tb, tb->L[0]);
2746 + return -1;
2749 + RFALSE(tb->rnum[0] != 0,
2750 + "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
2751 + /* all contents of L[0] and S[0] will be in L[0] */
2752 + leaf_shift_left(tb, n, -1);
2754 + reiserfs_invalidate_buffer(tb, tbS0);
2756 + return 0;
2759 + /*
2760 + * a part of the contents of S[0] will be in L[0] and
2761 + * the rest of S[0] will be in R[0]
2762 + */
2764 + RFALSE((tb->lnum[0] + tb->rnum[0] < n) ||
2765 + (tb->lnum[0] + tb->rnum[0] > n + 1),
2766 + "PAP-12050: rnum(%d) and lnum(%d) and item "
2767 + "number(%d) in S[0] are not consistent",
2768 + tb->rnum[0], tb->lnum[0], n);
2769 + RFALSE((tb->lnum[0] + tb->rnum[0] == n) &&
2770 + (tb->lbytes != -1 || tb->rbytes != -1),
2771 + "PAP-12055: bad rbytes (%d)/lbytes (%d) "
2772 + "parameters when items are not split",
2773 + tb->rbytes, tb->lbytes);
2774 + RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) &&
2775 + (tb->lbytes < 1 || tb->rbytes != -1),
2776 + "PAP-12060: bad rbytes (%d)/lbytes (%d) "
2777 + "parameters when items are split",
2778 + tb->rbytes, tb->lbytes);
2780 + leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
2781 + leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
2783 + reiserfs_invalidate_buffer(tb, tbS0);
2785 + return 0;
2789 + * Balance leaf node in case of delete or cut: insert_size[0] < 0
2791 + * lnum, rnum can have values >= -1
2792 + * -1 means that the neighbor must be joined with S
2793 + * 0 means that nothing should be done with the neighbor
2794 + * >0 means to shift entirely or partly the specified number of items
2795 + * to the neighbor
2796 + */
2797 +static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
2799 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2800 + struct buffer_info bi;
2801 + int n;
2803 + RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
2804 + "vs- 12000: level: wrong FR %z", tb->FR[0]);
2805 + RFALSE(tb->blknum[0] > 1,
2806 + "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]);
2807 + RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0),
2808 + "PAP-12010: tree can not be empty");
2810 + buffer_info_init_tbS0(tb, &bi);
2812 + /* Delete or truncate the item */
2814 + BUG_ON(flag != M_DELETE && flag != M_CUT);
2815 + if (flag == M_DELETE)
2816 + balance_leaf_when_delete_del(tb);
2817 + else /* M_CUT */
2818 + balance_leaf_when_delete_cut(tb);
2821 + /*
2822 + * the rule is that no shifting occurs unless by shifting
2823 + * a node can be freed
2824 + */
2825 + n = B_NR_ITEMS(tbS0);
2828 + /* L[0] takes part in balancing */
2829 + if (tb->lnum[0])
2830 + return balance_leaf_when_delete_left(tb);
2832 + if (tb->rnum[0] == -1) {
2833 + /* all contents of R[0] and S[0] will be in R[0] */
2834 + leaf_shift_right(tb, n, -1);
2835 + reiserfs_invalidate_buffer(tb, tbS0);
2836 + return 0;
2839 + RFALSE(tb->rnum[0],
2840 + "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
2841 + return 0;
2844 +static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
2845 + struct item_head *const ih,
2846 + const char * const body)
2848 + int ret;
2849 + struct buffer_info bi;
2850 + int n = B_NR_ITEMS(tb->L[0]);
2851 + unsigned body_shift_bytes = 0;
2853 + if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
2854 + /* part of new item falls into L[0] */
2855 + int new_item_len, shift;
2857 + ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
2859 + /* Calculate item length to insert to S[0] */
2860 + new_item_len = ih_item_len(ih) - tb->lbytes;
2862 + /* Calculate and check item length to insert to L[0] */
2863 + put_ih_item_len(ih, ih_item_len(ih) - new_item_len);
2865 + RFALSE(ih_item_len(ih) <= 0,
2866 + "PAP-12080: there is nothing to insert into L[0]: "
2867 + "ih_item_len=%d", ih_item_len(ih));
2869 + /* Insert new item into L[0] */
2870 + buffer_info_init_left(tb, &bi);
2871 + leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
2872 + min_t(int, tb->zeroes_num, ih_item_len(ih)));
2874 + /*
2875 + * Calculate key component, item length and body to
2876 + * insert into S[0]
2877 + */
2878 + shift = 0;
2879 + if (is_indirect_le_ih(ih))
2880 + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
2882 + add_le_ih_k_offset(ih, tb->lbytes << shift);
2884 + put_ih_item_len(ih, new_item_len);
2885 + if (tb->lbytes > tb->zeroes_num) {
2886 + body_shift_bytes = tb->lbytes - tb->zeroes_num;
2887 + tb->zeroes_num = 0;
2888 + } else
2889 + tb->zeroes_num -= tb->lbytes;
2891 + RFALSE(ih_item_len(ih) <= 0,
2892 + "PAP-12085: there is nothing to insert into S[0]: "
2893 + "ih_item_len=%d", ih_item_len(ih));
2894 + } else {
2895 + /* the new item falls into L[0] in its entirety */
2896 + /* Shift lnum[0]-1 items to L[0] */
2897 + ret = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes);
2899 + /* Insert new item into L[0] */
2900 + buffer_info_init_left(tb, &bi);
2901 + leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
2902 + tb->zeroes_num);
2903 + tb->insert_size[0] = 0;
2904 + tb->zeroes_num = 0;
2906 + return body_shift_bytes;
2909 +static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
2910 + struct item_head * const ih,
2911 + const char * const body)
2913 + int n = B_NR_ITEMS(tb->L[0]);
2914 + struct buffer_info bi;
2916 + RFALSE(tb->zeroes_num,
2917 + "PAP-12090: invalid parameter in case of a directory");
2919 + /* directory item */
2920 + if (tb->lbytes > tb->pos_in_item) {
2921 + /* new directory entry falls into L[0] */
2922 + struct item_head *pasted;
2923 + int ret, l_pos_in_item = tb->pos_in_item;
2925 + /*
2926 + * Shift lnum[0] - 1 items in whole.
2927 + * Shift lbytes - 1 entries from given directory item
2928 + */
2929 + ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1);
2930 + if (ret && !tb->item_pos) {
2931 + pasted = item_head(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1);
2932 + l_pos_in_item += ih_entry_count(pasted) -
2933 + (tb->lbytes - 1);
2936 + /* Append given directory entry to directory item */
2937 + buffer_info_init_left(tb, &bi);
2938 + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
2939 + l_pos_in_item, tb->insert_size[0],
2940 + body, tb->zeroes_num);
2942 + /*
2943 + * the previous call prepared space for pasting the new entry;
2944 + * the call below pastes that entry
2945 + */
2947 + /*
2948 + * when we have merged the directory item, pos_in_item
2949 + * has been changed too
2950 + */
2952 + /* paste new directory entry. 1 is entry number */
2953 + leaf_paste_entries(&bi, n + tb->item_pos - ret,
2954 + l_pos_in_item, 1,
2955 + (struct reiserfs_de_head *) body,
2956 + body + DEH_SIZE, tb->insert_size[0]);
2957 + tb->insert_size[0] = 0;
2958 + } else {
2959 + /* new directory entry doesn't fall into L[0] */
2960 + /*
2961 + * Shift lnum[0]-1 items in whole. Shift lbytes
2962 + * directory entries from directory item number lnum[0]
2963 + */
2964 + leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
2967 + /* Calculate new position to append in item body */
2968 + tb->pos_in_item -= tb->lbytes;
2971 +static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb,
2972 + struct item_head * const ih,
2973 + const char * const body)
2975 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2976 + int n = B_NR_ITEMS(tb->L[0]);
2977 + struct buffer_info bi;
2978 + int body_shift_bytes = 0;
2980 + if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
2981 + balance_leaf_paste_left_shift_dirent(tb, ih, body);
2982 + return 0;
2985 + RFALSE(tb->lbytes <= 0,
2986 + "PAP-12095: there is nothing to shift to L[0]. "
2987 + "lbytes=%d", tb->lbytes);
2988 + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
2989 + "PAP-12100: incorrect position to paste: "
2990 + "item_len=%d, pos_in_item=%d",
2991 + ih_item_len(item_head(tbS0, tb->item_pos)), tb->pos_in_item);
2993 + /* the appended item will be entirely in L[0] */
2994 + if (tb->lbytes >= tb->pos_in_item) {
2995 + struct item_head *tbS0_pos_ih, *tbL0_ih;
2996 + struct item_head *tbS0_0_ih;
2997 + struct reiserfs_key *left_delim_key;
2998 + int ret, l_n, version, temp_l;
3000 + tbS0_pos_ih = item_head(tbS0, tb->item_pos);
3001 + tbS0_0_ih = item_head(tbS0, 0);
3003 + /*
3004 + * this number of bytes must be appended
3005 + * to the last item of L[h]
3006 + */
3007 + l_n = tb->lbytes - tb->pos_in_item;
3009 + /* Calculate new insert_size[0] */
3010 + tb->insert_size[0] -= l_n;
3012 + RFALSE(tb->insert_size[0] <= 0,
3013 + "PAP-12105: there is nothing to paste into "
3014 + "L[0]. insert_size=%d", tb->insert_size[0]);
3016 + ret = leaf_shift_left(tb, tb->lnum[0],
3017 + ih_item_len(tbS0_pos_ih));
3019 + tbL0_ih = item_head(tb->L[0], n + tb->item_pos - ret);
3021 + /* Append to body of item in L[0] */
3022 + buffer_info_init_left(tb, &bi);
3023 + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
3024 + ih_item_len(tbL0_ih), l_n, body,
3025 + min_t(int, l_n, tb->zeroes_num));
3027 + /*
3028 + * 0-th item in S0 can be only of DIRECT type
3029 + * when l_n != 0
3030 + */
3031 + temp_l = l_n;
3033 + RFALSE(ih_item_len(tbS0_0_ih),
3034 + "PAP-12106: item length must be 0");
3035 + RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
3036 + leaf_key(tb->L[0], n + tb->item_pos - ret)),
3037 + "PAP-12107: items must be of the same file");
3039 + if (is_indirect_le_ih(tbL0_ih)) {
3040 + int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
3041 + temp_l = l_n << shift;
3043 + /* update key of first item in S0 */
3044 + version = ih_version(tbS0_0_ih);
3045 + add_le_key_k_offset(version, &tbS0_0_ih->ih_key, temp_l);
3047 + /* update left delimiting key */
3048 + left_delim_key = internal_key(tb->CFL[0], tb->lkey[0]);
3049 + add_le_key_k_offset(version, left_delim_key, temp_l);
3051 + /*
3052 + * Calculate new body, position in item and
3053 + * insert_size[0]
3054 + */
3055 + if (l_n > tb->zeroes_num) {
3056 + body_shift_bytes = l_n - tb->zeroes_num;
3057 + tb->zeroes_num = 0;
3058 + } else
3059 + tb->zeroes_num -= l_n;
3060 + tb->pos_in_item = 0;
3062 + RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
3063 + leaf_key(tb->L[0],
3064 + B_NR_ITEMS(tb->L[0]) - 1)) ||
3065 + !op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size) ||
3066 + !op_is_left_mergeable(left_delim_key, tbS0->b_size),
3067 + "PAP-12120: item must be merge-able with left "
3068 + "neighboring item");
3069 + } else {
3070 + /* only part of the appended item will be in L[0] */
3072 + /* Calculate position in item for append in S[0] */
3073 + tb->pos_in_item -= tb->lbytes;
3075 + RFALSE(tb->pos_in_item <= 0,
3076 + "PAP-12125: no place for paste. pos_in_item=%d",
3077 + tb->pos_in_item);
3079 + /*
3080 + * Shift lnum[0] - 1 items in whole.
3081 + * Shift lbytes - 1 byte from item number lnum[0]
3082 + */
3083 + leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
3085 + return body_shift_bytes;
3089 +/* the appended item will fall into L[0] in its entirety */
3090 +static void balance_leaf_paste_left_whole(struct tree_balance *tb,
3091 + struct item_head * const ih,
3092 + const char * const body)
3094 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3095 + int n = B_NR_ITEMS(tb->L[0]);
3096 + struct buffer_info bi;
3097 + struct item_head *pasted;
3098 + int ret;
3100 + /* if we paste into the first item of S[0] and it is left mergeable */
3101 + if (!tb->item_pos &&
3102 + op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size)) {
3103 + /*
3104 + * then increment pos_in_item by the size of the
3105 + * last item in L[0]
3106 + */
3107 + pasted = item_head(tb->L[0], n - 1);
3108 + if (is_direntry_le_ih(pasted))
3109 + tb->pos_in_item += ih_entry_count(pasted);
3110 + else
3111 + tb->pos_in_item += ih_item_len(pasted);
3114 + /*
3115 + * Shift lnum[0] - 1 items in whole.
3116 + * Shift lbytes - 1 byte from item number lnum[0]
3117 + */
3118 + ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
3120 + /* Append to body of item in L[0] */
3121 + buffer_info_init_left(tb, &bi);
3122 + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, tb->pos_in_item,
3123 + tb->insert_size[0], body, tb->zeroes_num);
3125 + /* if appended item is directory, paste entry */
3126 + pasted = item_head(tb->L[0], n + tb->item_pos - ret);
3127 + if (is_direntry_le_ih(pasted))
3128 + leaf_paste_entries(&bi, n + tb->item_pos - ret,
3129 + tb->pos_in_item, 1,
3130 + (struct reiserfs_de_head *)body,
3131 + body + DEH_SIZE, tb->insert_size[0]);
3133 + /*
3134 + * if appended item is indirect item, put unformatted node
3135 + * into un list
3136 + */
3137 + if (is_indirect_le_ih(pasted))
3138 + set_ih_free_space(pasted, 0);
3140 + tb->insert_size[0] = 0;
3141 + tb->zeroes_num = 0;
3144 +static unsigned int balance_leaf_paste_left(struct tree_balance *tb,
3145 + struct item_head * const ih,
3146 + const char * const body)
3148 + /* we must shift the part of the appended item */
3149 + if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1)
3150 + return balance_leaf_paste_left_shift(tb, ih, body);
3151 + else
3152 + balance_leaf_paste_left_whole(tb, ih, body);
3153 + return 0;
3156 +/* Shift lnum[0] items from S[0] to the left neighbor L[0] */
3157 +static unsigned int balance_leaf_left(struct tree_balance *tb,
3158 + struct item_head * const ih,
3159 + const char * const body, int flag)
3161 + if (tb->lnum[0] <= 0)
3162 + return 0;
3164 + /* new item, or part of it, falls into L[0]; shift it too */
3165 + if (tb->item_pos < tb->lnum[0]) {
3166 + BUG_ON(flag != M_INSERT && flag != M_PASTE);
3168 + if (flag == M_INSERT)
3169 + return balance_leaf_insert_left(tb, ih, body);
3170 + else /* M_PASTE */
3171 + return balance_leaf_paste_left(tb, ih, body);
3172 + } else
3173 + /* new item doesn't fall into L[0] */
3174 + leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
3175 + return 0;
3179 +static void balance_leaf_insert_right(struct tree_balance *tb,
3180 + struct item_head * const ih,
3181 + const char * const body)
3184 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3185 + int n = B_NR_ITEMS(tbS0);
3186 + struct buffer_info bi;
3188 + /* new item or part of it doesn't fall into R[0] */
3189 + if (n - tb->rnum[0] >= tb->item_pos) {
3190 + leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
3191 + return;
3194 + /* new item or its part falls to R[0] */
3196 + /* part of new item falls into R[0] */
3197 + if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {
3198 + loff_t old_key_comp, old_len, r_zeroes_number;
3199 + const char *r_body;
3200 + int shift;
3201 + loff_t offset;
3203 + leaf_shift_right(tb, tb->rnum[0] - 1, -1);
3205 + /* Remember key component and item length */
3206 + old_key_comp = le_ih_k_offset(ih);
3207 + old_len = ih_item_len(ih);
3209 + /*
3210 + * Calculate key component and item length to insert
3211 + * into R[0]
3212 + */
3213 + shift = 0;
3214 + if (is_indirect_le_ih(ih))
3215 + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
3216 + offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << shift);
3217 + set_le_ih_k_offset(ih, offset);
3218 + put_ih_item_len(ih, tb->rbytes);
3220 + /* Insert part of the item into R[0] */
3221 + buffer_info_init_right(tb, &bi);
3222 + if ((old_len - tb->rbytes) > tb->zeroes_num) {
3223 + r_zeroes_number = 0;
3224 + r_body = body + (old_len - tb->rbytes) - tb->zeroes_num;
3225 + } else {
3226 + r_body = body;
3227 + r_zeroes_number = tb->zeroes_num -
3228 + (old_len - tb->rbytes);
3229 + tb->zeroes_num -= r_zeroes_number;
3232 + leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
3234 + /* Replace right delimiting key by first key in R[0] */
3235 + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
3237 + /*
3238 + * Calculate key component and item length to
3239 + * insert into S[0]
3240 + */
3241 + set_le_ih_k_offset(ih, old_key_comp);
3242 + put_ih_item_len(ih, old_len - tb->rbytes);
3244 + tb->insert_size[0] -= tb->rbytes;
3246 + } else {
3247 + /* whole new item falls into R[0] */
3249 + /* Shift rnum[0]-1 items to R[0] */
3250 + leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
3252 + /* Insert new item into R[0] */
3253 + buffer_info_init_right(tb, &bi);
3254 + leaf_insert_into_buf(&bi, tb->item_pos - n + tb->rnum[0] - 1,
3255 + ih, body, tb->zeroes_num);
3257 + if (tb->item_pos - n + tb->rnum[0] - 1 == 0)
3258 + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
3260 + tb->zeroes_num = tb->insert_size[0] = 0;
3265 +static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
3266 + struct item_head * const ih,
3267 + const char * const body)
3269 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3270 + struct buffer_info bi;
3271 + int entry_count;
3273 + RFALSE(tb->zeroes_num,
3274 + "PAP-12145: invalid parameter in case of a directory");
3275 + entry_count = ih_entry_count(item_head(tbS0, tb->item_pos));
3277 + /* new directory entry falls into R[0] */
3278 + if (entry_count - tb->rbytes < tb->pos_in_item) {
3279 + int paste_entry_position;
3281 + RFALSE(tb->rbytes - 1 >= entry_count || !tb->insert_size[0],
3282 + "PAP-12150: not enough entries to shift to R[0]: "
3283 + "rbytes=%d, entry_count=%d", tb->rbytes, entry_count);
3285 + /*
3286 + * Shift rnum[0]-1 items in whole.
3287 + * Shift rbytes-1 directory entries from directory
3288 + * item number rnum[0]
3289 + */
3290 + leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1);
3292 + /* Paste given directory entry to directory item */
3293 + paste_entry_position = tb->pos_in_item - entry_count +
3294 + tb->rbytes - 1;
3295 + buffer_info_init_right(tb, &bi);
3296 + leaf_paste_in_buffer(&bi, 0, paste_entry_position,
3297 + tb->insert_size[0], body, tb->zeroes_num);
3299 + /* paste entry */
3300 + leaf_paste_entries(&bi, 0, paste_entry_position, 1,
3301 + (struct reiserfs_de_head *) body,
3302 + body + DEH_SIZE, tb->insert_size[0]);
3304 + /* change delimiting keys */
3305 + if (paste_entry_position == 0)
3306 + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
3308 + tb->insert_size[0] = 0;
3309 + tb->pos_in_item++;
3310 + } else {
3311 + /* new directory entry doesn't fall into R[0] */
3312 + leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
3316 +static void balance_leaf_paste_right_shift(struct tree_balance *tb,
3317 + struct item_head * const ih,
3318 + const char * const body)
3320 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3321 + int n_shift, n_rem, r_zeroes_number, version;
3322 + unsigned long temp_rem;
3323 + const char *r_body;
3324 + struct buffer_info bi;
3326 + /* we append to directory item */
3327 + if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
3328 + balance_leaf_paste_right_shift_dirent(tb, ih, body);
3329 + return;
3332 + /* regular object */
3334 + /*
3335 + * Calculate number of bytes which must be shifted
3336 + * from appended item
3337 + */
3338 + n_shift = tb->rbytes - tb->insert_size[0];
3339 + if (n_shift < 0)
3340 + n_shift = 0;
3342 + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
3343 + "PAP-12155: invalid position to paste. ih_item_len=%d, "
3344 + "pos_in_item=%d", tb->pos_in_item,
3345 + ih_item_len(item_head(tbS0, tb->item_pos)));
3347 + leaf_shift_right(tb, tb->rnum[0], n_shift);
3349 + /*
3350 + * Calculate number of bytes which must remain in body
3351 + * after appending to R[0]
3352 + */
3353 + n_rem = tb->insert_size[0] - tb->rbytes;
3354 + if (n_rem < 0)
3355 + n_rem = 0;
3357 + temp_rem = n_rem;
3359 + version = ih_version(item_head(tb->R[0], 0));
3361 + if (is_indirect_le_key(version, leaf_key(tb->R[0], 0))) {
3362 + int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
3363 + temp_rem = n_rem << shift;
3366 + add_le_key_k_offset(version, leaf_key(tb->R[0], 0), temp_rem);
3367 + add_le_key_k_offset(version, internal_key(tb->CFR[0], tb->rkey[0]),
3368 + temp_rem);
3370 + do_balance_mark_internal_dirty(tb, tb->CFR[0], 0);
3372 + /* Append part of body into R[0] */
3373 + buffer_info_init_right(tb, &bi);
3374 + if (n_rem > tb->zeroes_num) {
3375 + r_zeroes_number = 0;
3376 + r_body = body + n_rem - tb->zeroes_num;
3377 + } else {
3378 + r_body = body;
3379 + r_zeroes_number = tb->zeroes_num - n_rem;
3380 + tb->zeroes_num -= r_zeroes_number;
3383 + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
3384 + r_body, r_zeroes_number);
3386 + if (is_indirect_le_ih(item_head(tb->R[0], 0)))
3387 + set_ih_free_space(item_head(tb->R[0], 0), 0);
3389 + tb->insert_size[0] = n_rem;
3390 + if (!n_rem)
3391 + tb->pos_in_item++;
3394 +static void balance_leaf_paste_right_whole(struct tree_balance *tb,
3395 + struct item_head * const ih,
3396 + const char * const body)
3398 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3399 + int n = B_NR_ITEMS(tbS0);
3400 + struct item_head *pasted;
3401 + struct buffer_info bi;
3403 + buffer_info_init_right(tb, &bi);
3404 + leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
3406 + /* append item in R[0] */
3407 + if (tb->pos_in_item >= 0) {
3408 + buffer_info_init_right(tb, &bi);
3409 + leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->rnum[0],
3410 + tb->pos_in_item, tb->insert_size[0], body,
3411 + tb->zeroes_num);
3414 + /* paste new entry, if item is directory item */
3415 + pasted = item_head(tb->R[0], tb->item_pos - n + tb->rnum[0]);
3416 + if (is_direntry_le_ih(pasted) && tb->pos_in_item >= 0) {
3417 + leaf_paste_entries(&bi, tb->item_pos - n + tb->rnum[0],
3418 + tb->pos_in_item, 1,
3419 + (struct reiserfs_de_head *)body,
3420 + body + DEH_SIZE, tb->insert_size[0]);
3422 + if (!tb->pos_in_item) {
3424 + RFALSE(tb->item_pos - n + tb->rnum[0],
3425 + "PAP-12165: directory item must be first "
3426 + "item of node when pasting is in 0th position");
3428 + /* update delimiting keys */
3429 + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
3433 + if (is_indirect_le_ih(pasted))
3434 + set_ih_free_space(pasted, 0);
3435 + tb->zeroes_num = tb->insert_size[0] = 0;
3438 +static void balance_leaf_paste_right(struct tree_balance *tb,
3439 + struct item_head * const ih,
3440 + const char * const body)
3442 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3443 + int n = B_NR_ITEMS(tbS0);
3445 + /* new item doesn't fall into R[0] */
3446 + if (n - tb->rnum[0] > tb->item_pos) {
3447 + leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
3448 + return;
3451 + /* pasted item or part of it falls to R[0] */
3453 + if (tb->item_pos == n - tb->rnum[0] && tb->rbytes != -1)
3454 + /* we must shift the part of the appended item */
3455 + balance_leaf_paste_right_shift(tb, ih, body);
3456 + else
3457 + /* pasted item in whole falls into R[0] */
3458 + balance_leaf_paste_right_whole(tb, ih, body);
3461 +/* shift rnum[0] items from S[0] to the right neighbor R[0] */
3462 +static void balance_leaf_right(struct tree_balance *tb,
3463 + struct item_head * const ih,
3464 + const char * const body, int flag)
3466 + if (tb->rnum[0] <= 0)
3467 + return;
3469 + BUG_ON(flag != M_INSERT && flag != M_PASTE);
3471 + if (flag == M_INSERT)
3472 + balance_leaf_insert_right(tb, ih, body);
3473 + else /* M_PASTE */
3474 + balance_leaf_paste_right(tb, ih, body);
3477 +static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
3478 + struct item_head * const ih,
3479 + const char * const body,
3480 + struct item_head *insert_key,
3481 + struct buffer_head **insert_ptr,
3482 + int i)
3484 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3485 + int n = B_NR_ITEMS(tbS0);
3486 + struct buffer_info bi;
3487 + int shift;
3489 + /* new item, or part of it, doesn't fall into S_new[i] */
3490 + if (n - tb->snum[i] >= tb->item_pos) {
3491 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
3492 + tb->snum[i], tb->sbytes[i], tb->S_new[i]);
3493 + return;
3496 + /* new item or its part falls into the first new node S_new[i] */
3498 + /* part of new item falls into S_new[i] */
3499 + if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) {
3500 + int old_key_comp, old_len, r_zeroes_number;
3501 + const char *r_body;
3503 + /* Move snum[i]-1 items from S[0] to S_new[i] */
3504 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1,
3505 + tb->S_new[i]);
3507 + /* Remember key component and item length */
3508 + old_key_comp = le_ih_k_offset(ih);
3509 + old_len = ih_item_len(ih);
3511 + /*
3512 + * Calculate key component and item length to insert
3513 + * into S_new[i]
3514 + */
3515 + shift = 0;
3516 + if (is_indirect_le_ih(ih))
3517 + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
3518 + set_le_ih_k_offset(ih,
3519 + le_ih_k_offset(ih) +
3520 + ((old_len - tb->sbytes[i]) << shift));
3522 + put_ih_item_len(ih, tb->sbytes[i]);
3524 + /* Insert part of the item into S_new[i] before 0-th item */
3525 + buffer_info_init_bh(tb, &bi, tb->S_new[i]);
3527 + if ((old_len - tb->sbytes[i]) > tb->zeroes_num) {
3528 + r_zeroes_number = 0;
3529 + r_body = body + (old_len - tb->sbytes[i]) -
3530 + tb->zeroes_num;
3531 + } else {
3532 + r_body = body;
3533 + r_zeroes_number = tb->zeroes_num - (old_len -
3534 + tb->sbytes[i]);
3535 + tb->zeroes_num -= r_zeroes_number;
3538 + leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
3540 + /*
3541 + * Calculate key component and item length to
3542 + * insert into S[i]
3543 + */
3544 + set_le_ih_k_offset(ih, old_key_comp);
3545 + put_ih_item_len(ih, old_len - tb->sbytes[i]);
3546 + tb->insert_size[0] -= tb->sbytes[i];
3547 + } else {
3548 + /* whole new item falls into S_new[i] */
3550 + /*
3551 + * Shift snum[i] - 1 items to S_new[i]
3552 + * (sbytes[i] of split item)
3553 + */
3554 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
3555 + tb->snum[i] - 1, tb->sbytes[i], tb->S_new[i]);
3557 + /* Insert new item into S_new[i] */
3558 + buffer_info_init_bh(tb, &bi, tb->S_new[i]);
3559 + leaf_insert_into_buf(&bi, tb->item_pos - n + tb->snum[i] - 1,
3560 + ih, body, tb->zeroes_num);
3562 + tb->zeroes_num = tb->insert_size[0] = 0;
3566 +/* we append to directory item */
3567 +static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
3568 + struct item_head * const ih,
3569 + const char * const body,
3570 + struct item_head *insert_key,
3571 + struct buffer_head **insert_ptr,
3572 + int i)
3574 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3575 + struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
3576 + int entry_count = ih_entry_count(aux_ih);
3577 + struct buffer_info bi;
3579 + if (entry_count - tb->sbytes[i] < tb->pos_in_item &&
3580 + tb->pos_in_item <= entry_count) {
3581 + /* new directory entry falls into S_new[i] */
3583 + RFALSE(!tb->insert_size[0],
3584 + "PAP-12215: insert_size is already 0");
3585 + RFALSE(tb->sbytes[i] - 1 >= entry_count,
3586 + "PAP-12220: there are no so much entries (%d), only %d",
3587 + tb->sbytes[i] - 1, entry_count);
3589 + /*
3590 + * Shift snum[i]-1 items in whole.
3591 + * Shift sbytes[i] directory entries
3592 + * from directory item number snum[i]
3593 + */
3594 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
3595 + tb->sbytes[i] - 1, tb->S_new[i]);
3597 + /*
3598 + * Paste given directory entry to
3599 + * directory item
3600 + */
3601 + buffer_info_init_bh(tb, &bi, tb->S_new[i]);
3602 + leaf_paste_in_buffer(&bi, 0, tb->pos_in_item - entry_count +
3603 + tb->sbytes[i] - 1, tb->insert_size[0],
3604 + body, tb->zeroes_num);
3606 + /* paste new directory entry */
3607 + leaf_paste_entries(&bi, 0, tb->pos_in_item - entry_count +
3608 + tb->sbytes[i] - 1, 1,
3609 + (struct reiserfs_de_head *) body,
3610 + body + DEH_SIZE, tb->insert_size[0]);
3612 + tb->insert_size[0] = 0;
3613 + tb->pos_in_item++;
3614 + } else {
3615 + /* new directory entry doesn't fall into S_new[i] */
3616 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
3617 + tb->sbytes[i], tb->S_new[i]);
3622 +static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
3623 + struct item_head * const ih,
3624 + const char * const body,
3625 + struct item_head *insert_key,
3626 + struct buffer_head **insert_ptr,
3627 + int i)
3629 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3630 + struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
3631 + int n_shift, n_rem, r_zeroes_number, shift;
3632 + const char *r_body;
3633 + struct item_head *tmp;
3634 + struct buffer_info bi;
3636 + RFALSE(ih, "PAP-12210: ih must be 0");
3638 + if (is_direntry_le_ih(aux_ih)) {
3639 + balance_leaf_new_nodes_paste_dirent(tb, ih, body, insert_key,
3640 + insert_ptr, i);
3641 + return;
3644 + /* regular object */
3647 + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)) ||
3648 + tb->insert_size[0] <= 0,
3649 + "PAP-12225: item too short or insert_size <= 0");
3651 + /*
3652 + * Calculate number of bytes which must be shifted from appended item
3653 + */
3654 + n_shift = tb->sbytes[i] - tb->insert_size[0];
3655 + if (n_shift < 0)
3656 + n_shift = 0;
3657 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], n_shift,
3658 + tb->S_new[i]);
3660 + /*
3661 + * Calculate number of bytes which must remain in body after
3662 + * append to S_new[i]
3663 + */
3664 + n_rem = tb->insert_size[0] - tb->sbytes[i];
3665 + if (n_rem < 0)
3666 + n_rem = 0;
3668 + /* Append part of body into S_new[i] */
3669 + buffer_info_init_bh(tb, &bi, tb->S_new[i]);
3670 + if (n_rem > tb->zeroes_num) {
3671 + r_zeroes_number = 0;
3672 + r_body = body + n_rem - tb->zeroes_num;
3673 + } else {
3674 + r_body = body;
3675 + r_zeroes_number = tb->zeroes_num - n_rem;
3676 + tb->zeroes_num -= r_zeroes_number;
3679 + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
3680 + r_body, r_zeroes_number);
3682 + tmp = item_head(tb->S_new[i], 0);
3683 + shift = 0;
3684 + if (is_indirect_le_ih(tmp)) {
3685 + set_ih_free_space(tmp, 0);
3686 + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
3688 + add_le_ih_k_offset(tmp, n_rem << shift);
3690 + tb->insert_size[0] = n_rem;
3691 + if (!n_rem)
3692 + tb->pos_in_item++;
3695 +static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
3696 + struct item_head * const ih,
3697 + const char * const body,
3698 + struct item_head *insert_key,
3699 + struct buffer_head **insert_ptr,
3700 + int i)
3703 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3704 + int n = B_NR_ITEMS(tbS0);
3705 + int leaf_mi;
3706 + struct item_head *pasted;
3707 + struct buffer_info bi;
3709 +#ifdef CONFIG_REISERFS_CHECK
3710 + struct item_head *ih_check = item_head(tbS0, tb->item_pos);
3712 + if (!is_direntry_le_ih(ih_check) &&
3713 + (tb->pos_in_item != ih_item_len(ih_check) ||
3714 + tb->insert_size[0] <= 0))
3715 + reiserfs_panic(tb->tb_sb,
3716 + "PAP-12235",
3717 + "pos_in_item must be equal to ih_item_len");
3718 +#endif
3720 + leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
3721 + tb->sbytes[i], tb->S_new[i]);
3723 + RFALSE(leaf_mi,
3724 + "PAP-12240: unexpected value returned by leaf_move_items (%d)",
3725 + leaf_mi);
3727 + /* paste into item */
3728 + buffer_info_init_bh(tb, &bi, tb->S_new[i]);
3729 + leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->snum[i],
3730 + tb->pos_in_item, tb->insert_size[0],
3731 + body, tb->zeroes_num);
3733 + pasted = item_head(tb->S_new[i], tb->item_pos - n +
3734 + tb->snum[i]);
3735 + if (is_direntry_le_ih(pasted))
3736 + leaf_paste_entries(&bi, tb->item_pos - n + tb->snum[i],
3737 + tb->pos_in_item, 1,
3738 + (struct reiserfs_de_head *)body,
3739 + body + DEH_SIZE, tb->insert_size[0]);
3741 + /* if we paste to indirect item update ih_free_space */
3742 + if (is_indirect_le_ih(pasted))
3743 + set_ih_free_space(pasted, 0);
3745 + tb->zeroes_num = tb->insert_size[0] = 0;
3748 +static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
3749 + struct item_head * const ih,
3750 + const char * const body,
3751 + struct item_head *insert_key,
3752 + struct buffer_head **insert_ptr,
3753 + int i)
3755 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3756 + int n = B_NR_ITEMS(tbS0);
3758 + /* pasted item doesn't fall into S_new[i] */
3759 + if (n - tb->snum[i] > tb->item_pos) {
3760 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
3761 + tb->snum[i], tb->sbytes[i], tb->S_new[i]);
3762 + return;
3765 + /* pasted item or part of it falls into S_new[i] */
3767 + if (tb->item_pos == n - tb->snum[i] && tb->sbytes[i] != -1)
3768 + /* we must shift part of the appended item */
3769 + balance_leaf_new_nodes_paste_shift(tb, ih, body, insert_key,
3770 + insert_ptr, i);
3771 + else
3772 + /* item falls wholly into S_new[i] */
3773 + balance_leaf_new_nodes_paste_whole(tb, ih, body, insert_key,
3774 + insert_ptr, i);
3777 +/* Fill new nodes that appear in place of S[0] */
3778 +static void balance_leaf_new_nodes(struct tree_balance *tb,
3779 + struct item_head * const ih,
3780 + const char * const body,
3781 + struct item_head *insert_key,
3782 + struct buffer_head **insert_ptr,
3783 + int flag)
3785 + int i;
3786 + for (i = tb->blknum[0] - 2; i >= 0; i--) {
3787 + BUG_ON(flag != M_INSERT && flag != M_PASTE);
3789 + RFALSE(!tb->snum[i],
3790 + "PAP-12200: snum[%d] == %d. Must be > 0", i,
3791 + tb->snum[i]);
3793 + /* here we shift from S to S_new nodes */
3795 + tb->S_new[i] = get_FEB(tb);
3797 + /* initialize block type and tree level */
3798 + set_blkh_level(B_BLK_HEAD(tb->S_new[i]), DISK_LEAF_NODE_LEVEL);
3800 + if (flag == M_INSERT)
3801 + balance_leaf_new_nodes_insert(tb, ih, body, insert_key,
3802 + insert_ptr, i);
3803 + else /* M_PASTE */
3804 + balance_leaf_new_nodes_paste(tb, ih, body, insert_key,
3805 + insert_ptr, i);
3807 + memcpy(insert_key + i, leaf_key(tb->S_new[i], 0), KEY_SIZE);
3808 + insert_ptr[i] = tb->S_new[i];
3810 + RFALSE(!buffer_journaled(tb->S_new[i])
3811 + || buffer_journal_dirty(tb->S_new[i])
3812 + || buffer_dirty(tb->S_new[i]),
3813 + "PAP-12247: S_new[%d] : (%b)",
3814 + i, tb->S_new[i]);
3818 +static void balance_leaf_finish_node_insert(struct tree_balance *tb,
3819 + struct item_head * const ih,
3820 + const char * const body)
3822 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3823 + struct buffer_info bi;
3824 + buffer_info_init_tbS0(tb, &bi);
3825 + leaf_insert_into_buf(&bi, tb->item_pos, ih, body, tb->zeroes_num);
3827 + /* If we insert the first key change the delimiting key */
3828 + if (tb->item_pos == 0) {
3829 + if (tb->CFL[0]) /* can be 0 in reiserfsck */
3830 + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
3835 +static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
3836 + struct item_head * const ih,
3837 + const char * const body)
3839 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3840 + struct item_head *pasted = item_head(tbS0, tb->item_pos);
3841 + struct buffer_info bi;
3843 + if (tb->pos_in_item >= 0 && tb->pos_in_item <= ih_entry_count(pasted)) {
3844 + RFALSE(!tb->insert_size[0],
3845 + "PAP-12260: insert_size is 0 already");
3847 + /* prepare space */
3848 + buffer_info_init_tbS0(tb, &bi);
3849 + leaf_paste_in_buffer(&bi, tb->item_pos, tb->pos_in_item,
3850 + tb->insert_size[0], body, tb->zeroes_num);
3852 + /* paste entry */
3853 + leaf_paste_entries(&bi, tb->item_pos, tb->pos_in_item, 1,
3854 + (struct reiserfs_de_head *)body,
3855 + body + DEH_SIZE, tb->insert_size[0]);
3857 + if (!tb->item_pos && !tb->pos_in_item) {
3858 + RFALSE(!tb->CFL[0] || !tb->L[0],
3859 + "PAP-12270: CFL[0]/L[0] must be specified");
3860 + if (tb->CFL[0])
3861 + replace_key(tb, tb->CFL[0], tb->lkey[0],
3862 + tbS0, 0);
3865 + tb->insert_size[0] = 0;
3869 +static void balance_leaf_finish_node_paste(struct tree_balance *tb,
3870 + struct item_head * const ih,
3871 + const char * const body)
3873 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3874 + struct buffer_info bi;
3875 + struct item_head *pasted = item_head(tbS0, tb->item_pos);
3877 + /* for a directory item, the new entry may already be pasted */
3878 + if (is_direntry_le_ih(pasted)) {
3879 + balance_leaf_finish_node_paste_dirent(tb, ih, body);
3880 + return;
3883 + /* regular object */
3885 + if (tb->pos_in_item == ih_item_len(pasted)) {
3886 + RFALSE(tb->insert_size[0] <= 0,
3887 + "PAP-12275: insert size must not be %d",
3888 + tb->insert_size[0]);
3889 + buffer_info_init_tbS0(tb, &bi);
3890 + leaf_paste_in_buffer(&bi, tb->item_pos,
3891 + tb->pos_in_item, tb->insert_size[0], body,
3892 + tb->zeroes_num);
3894 + if (is_indirect_le_ih(pasted))
3895 + set_ih_free_space(pasted, 0);
3897 + tb->insert_size[0] = 0;
3899 +#ifdef CONFIG_REISERFS_CHECK
3900 + else if (tb->insert_size[0]) {
3901 + print_cur_tb("12285");
3902 + reiserfs_panic(tb->tb_sb, "PAP-12285",
3903 + "insert_size must be 0 (%d)", tb->insert_size[0]);
3905 +#endif
3909 + * if the affected item was not wholly shifted then we
3910 + * perform all necessary operations on that part or whole
3911 + * of the affected item which remains in S
3912 + */
3913 +static void balance_leaf_finish_node(struct tree_balance *tb,
3914 + struct item_head * const ih,
3915 + const char * const body, int flag)
3917 + /* if we must insert or append into buffer S[0] */
3918 + if (0 <= tb->item_pos && tb->item_pos < tb->s0num) {
3919 + if (flag == M_INSERT)
3920 + balance_leaf_finish_node_insert(tb, ih, body);
3921 + else /* M_PASTE */
3922 + balance_leaf_finish_node_paste(tb, ih, body);
3926 +/**
3927 + * balance_leaf - reiserfs tree balancing algorithm
3928 + * @tb: tree balance state
3929 + * @ih: item header of inserted item (little endian)
3930 + * @body: body of inserted item or bytes to paste
3931 + * @flag: i - insert, d - delete, c - cut, p - paste (see do_balance)
3932 + * passed back:
3933 + * @insert_key: key to insert new nodes
3934 + * @insert_ptr: array of nodes to insert at the next level
3936 + * In our processing of one level we sometimes determine what must be
3937 + * inserted into the next higher level. This insertion consists of a
3938 + * key or two keys and their corresponding pointers.
3939 + */
3940 +static int balance_leaf(struct tree_balance *tb, struct item_head *ih,
3941 + const char *body, int flag,
3942 + struct item_head *insert_key,
3943 + struct buffer_head **insert_ptr)
3945 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3947 + PROC_INFO_INC(tb->tb_sb, balance_at[0]);
3949 + /* Handle balancing when insert_size[0] < 0 (delete or cut) */
3950 + if (tb->insert_size[0] < 0)
3951 + return balance_leaf_when_delete(tb, flag);
3953 + tb->item_pos = PATH_LAST_POSITION(tb->tb_path),
3954 + tb->pos_in_item = tb->tb_path->pos_in_item,
3955 + tb->zeroes_num = 0;
3956 + if (flag == M_INSERT && !body)
3957 + tb->zeroes_num = ih_item_len(ih);
3959 + /*
3960 + * for indirect item pos_in_item is measured in unformatted node
3961 + * pointers. Recalculate to bytes
3962 + */
3963 + if (flag != M_INSERT
3964 + && is_indirect_le_ih(item_head(tbS0, tb->item_pos)))
3965 + tb->pos_in_item *= UNFM_P_SIZE;
3967 + body += balance_leaf_left(tb, ih, body, flag);
3969 + /* tb->lnum[0] > 0 */
3970 + /* Calculate new item position */
3971 + tb->item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0));
3973 + balance_leaf_right(tb, ih, body, flag);
3975 + /* tb->rnum[0] > 0 */
3976 + RFALSE(tb->blknum[0] > 3,
3977 + "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]);
3978 + RFALSE(tb->blknum[0] < 0,
3979 + "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]);
3981 + /*
3982 + * while adding to a node we may discover that it is possible to
3983 + * split it in two, merging the left part into the left neighbor and
3984 + * the right part into the right neighbor, eliminating the node
3985 + */
3986 + if (tb->blknum[0] == 0) { /* node S[0] is empty now */
3988 + RFALSE(!tb->lnum[0] || !tb->rnum[0],
3989 + "PAP-12190: lnum and rnum must not be zero");
3990 + /*
3991 + * if insertion was done before the 0-th position in R[0], the
3992 + * right delimiting key of tb->L[0] and the left delimiting key
3993 + * are not set correctly
3994 + */
3995 + if (tb->CFL[0]) {
3996 + if (!tb->CFR[0])
3997 + reiserfs_panic(tb->tb_sb, "vs-12195",
3998 + "CFR not initialized");
3999 + copy_key(internal_key(tb->CFL[0], tb->lkey[0]),
4000 + internal_key(tb->CFR[0], tb->rkey[0]));
4001 + do_balance_mark_internal_dirty(tb, tb->CFL[0], 0);
4004 + reiserfs_invalidate_buffer(tb, tbS0);
4005 + return 0;
4008 + balance_leaf_new_nodes(tb, ih, body, insert_key, insert_ptr, flag);
4010 + balance_leaf_finish_node(tb, ih, body, flag);
4012 +#ifdef CONFIG_REISERFS_CHECK
4013 + if (flag == M_PASTE && tb->insert_size[0]) {
4014 + print_cur_tb("12290");
4015 + reiserfs_panic(tb->tb_sb,
4016 + "PAP-12290", "insert_size is still not 0 (%d)",
4017 + tb->insert_size[0]);
4019 +#endif
4021 + /* Leaf level of the tree is balanced (end of balance_leaf) */
4022 + return 0;
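+/*
+ * Illustrative sketch (assumed example values, not from the original
+ * source): how balance_leaf() recomputes item_pos after the left
+ * shift.  Suppose S[0] holds 10 items, tb->item_pos == 5,
+ * tb->lnum[0] == 3 and tb->lbytes == 2 (the boundary item is split).
+ * Only lnum[0] - 1 items leave S[0] entirely, so:
+ *
+ *   item_pos -= lnum[0] - (lbytes != -1 ? 1 : 0);  /* 5 - (3 - 1) == 3 */
+ *
+ * the insertion point moves from position 5 to position 3 among the
+ * items remaining in S[0].
+ */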
4025 +/* Make empty node */
4026 +void make_empty_node(struct buffer_info *bi)
4028 + struct block_head *blkh;
4030 + RFALSE(bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL");
4032 + blkh = B_BLK_HEAD(bi->bi_bh);
4033 + set_blkh_nr_item(blkh, 0);
4034 + set_blkh_free_space(blkh, MAX_CHILD_SIZE(bi->bi_bh));
4036 + if (bi->bi_parent)
4037 + B_N_CHILD(bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */
4040 +/* Get first empty buffer */
4041 +struct buffer_head *get_FEB(struct tree_balance *tb)
4043 + int i;
4044 + struct buffer_info bi;
4046 + for (i = 0; i < MAX_FEB_SIZE; i++)
4047 + if (tb->FEB[i] != NULL)
4048 + break;
4050 + if (i == MAX_FEB_SIZE)
4051 + reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty");
4053 + buffer_info_init_bh(tb, &bi, tb->FEB[i]);
4054 + make_empty_node(&bi);
4055 + set_buffer_uptodate(tb->FEB[i]);
4056 + tb->used[i] = tb->FEB[i];
4057 + tb->FEB[i] = NULL;
4059 + return tb->used[i];
4062 +/* This is now used because reiserfs_free_block has to be able to schedule. */
4063 +static void store_thrown(struct tree_balance *tb, struct buffer_head *bh)
4065 + int i;
4067 + if (buffer_dirty(bh))
4068 + reiserfs_warning(tb->tb_sb, "reiserfs-12320",
4069 + "called with dirty buffer");
4070 + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++)
4071 + if (!tb->thrown[i]) {
4072 + tb->thrown[i] = bh;
4073 + get_bh(bh); /* free_thrown puts this */
4074 + return;
4076 + reiserfs_warning(tb->tb_sb, "reiserfs-12321",
4077 + "too many thrown buffers");
4080 +static void free_thrown(struct tree_balance *tb)
4082 + int i;
4083 + b_blocknr_t blocknr;
4084 + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) {
4085 + if (tb->thrown[i]) {
4086 + blocknr = tb->thrown[i]->b_blocknr;
4087 + if (buffer_dirty(tb->thrown[i]))
4088 + reiserfs_warning(tb->tb_sb, "reiserfs-12322",
4089 + "called with dirty buffer %d",
4090 + blocknr);
4091 + brelse(tb->thrown[i]); /* incremented in store_thrown */
4092 + reiserfs_free_block(tb->transaction_handle, NULL,
4093 + blocknr, 0);
4098 +void reiserfs_invalidate_buffer(struct tree_balance *tb, struct buffer_head *bh)
4100 + struct block_head *blkh;
4101 + blkh = B_BLK_HEAD(bh);
4102 + set_blkh_level(blkh, FREE_LEVEL);
4103 + set_blkh_nr_item(blkh, 0);
4105 + clear_buffer_dirty(bh);
4106 + store_thrown(tb, bh);
4109 +/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src. */
4110 +void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest,
4111 + struct buffer_head *src, int n_src)
4114 + RFALSE(dest == NULL || src == NULL,
4115 + "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)",
4116 + src, dest);
4117 + RFALSE(!B_IS_KEYS_LEVEL(dest),
4118 + "vs-12310: invalid level (%z) for destination buffer. dest must be leaf",
4119 + dest);
4120 + RFALSE(n_dest < 0 || n_src < 0,
4121 + "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest);
4122 + RFALSE(n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src),
4123 + "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big",
4124 + n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest));
4126 + if (B_IS_ITEMS_LEVEL(src))
4127 + /* source buffer contains leaf node */
4128 + memcpy(internal_key(dest, n_dest), item_head(src, n_src),
4129 + KEY_SIZE);
4130 + else
4131 + memcpy(internal_key(dest, n_dest), internal_key(src, n_src),
4132 + KEY_SIZE);
4134 + do_balance_mark_internal_dirty(tb, dest, 0);
4137 +int get_left_neighbor_position(struct tree_balance *tb, int h)
4139 + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
4141 + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FL[h] == NULL,
4142 + "vs-12325: FL[%d](%p) or F[%d](%p) does not exist",
4143 + h, tb->FL[h], h, PATH_H_PPARENT(tb->tb_path, h));
4145 + if (Sh_position == 0)
4146 + return B_NR_ITEMS(tb->FL[h]);
4147 + else
4148 + return Sh_position - 1;
4151 +int get_right_neighbor_position(struct tree_balance *tb, int h)
4153 + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
4155 + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FR[h] == NULL,
4156 + "vs-12330: F[%d](%p) or FR[%d](%p) does not exist",
4157 + h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]);
4159 + if (Sh_position == B_NR_ITEMS(PATH_H_PPARENT(tb->tb_path, h)))
4160 + return 0;
4161 + else
4162 + return Sh_position + 1;
4165 +#ifdef CONFIG_REISERFS_CHECK
4167 +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
4168 +static void check_internal_node(struct super_block *s, struct buffer_head *bh,
4169 + char *mes)
4171 + struct disk_child *dc;
4172 + int i;
4174 + RFALSE(!bh, "PAP-12336: bh == 0");
4176 + if (!bh || !B_IS_IN_TREE(bh))
4177 + return;
4179 + RFALSE(!buffer_dirty(bh) &&
4180 + !(buffer_journaled(bh) || buffer_journal_dirty(bh)),
4181 + "PAP-12337: buffer (%b) must be dirty", bh);
4182 + dc = B_N_CHILD(bh, 0);
4184 + for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) {
4185 + if (!is_reusable(s, dc_block_number(dc), 1)) {
4186 + print_cur_tb(mes);
4187 + reiserfs_panic(s, "PAP-12338",
4188 + "invalid child pointer %y in %b",
4189 + dc, bh);
4194 +static int locked_or_not_in_tree(struct tree_balance *tb,
4195 + struct buffer_head *bh, char *which)
4197 + if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) ||
4198 + !B_IS_IN_TREE(bh)) {
4199 + reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh);
4200 + return 1;
4202 + return 0;
4205 +static int check_before_balancing(struct tree_balance *tb)
4207 + int retval = 0;
4209 + if (REISERFS_SB(tb->tb_sb)->cur_tb) {
4210 + reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
4211 + "occurred based on cur_tb not being null at "
4212 + "this point in code. do_balance cannot properly "
4213 + "handle concurrent tree accesses on a same "
4214 + "mount point.");
4217 + /*
4218 + * double check that buffers that we will modify are unlocked.
4219 + * (fix_nodes should already have prepped all of these for us).
4220 + */
4221 + if (tb->lnum[0]) {
4222 + retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]");
4223 + retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]");
4224 + retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]");
4225 + check_leaf(tb->L[0]);
4227 + if (tb->rnum[0]) {
4228 + retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]");
4229 + retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]");
4230 + retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]");
4231 + check_leaf(tb->R[0]);
4233 + retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path),
4234 + "S[0]");
4235 + check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
4237 + return retval;
4240 +static void check_after_balance_leaf(struct tree_balance *tb)
4242 + if (tb->lnum[0]) {
4243 + if (B_FREE_SPACE(tb->L[0]) !=
4244 + MAX_CHILD_SIZE(tb->L[0]) -
4245 + dc_size(B_N_CHILD
4246 + (tb->FL[0], get_left_neighbor_position(tb, 0)))) {
4247 + print_cur_tb("12221");
4248 + reiserfs_panic(tb->tb_sb, "PAP-12355",
4249 + "shift to left was incorrect");
4252 + if (tb->rnum[0]) {
4253 + if (B_FREE_SPACE(tb->R[0]) !=
4254 + MAX_CHILD_SIZE(tb->R[0]) -
4255 + dc_size(B_N_CHILD
4256 + (tb->FR[0], get_right_neighbor_position(tb, 0)))) {
4257 + print_cur_tb("12222");
4258 + reiserfs_panic(tb->tb_sb, "PAP-12360",
4259 + "shift to right was incorrect");
4262 + if (PATH_H_PBUFFER(tb->tb_path, 1) &&
4263 + (B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)) !=
4264 + (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
4265 + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
4266 + PATH_H_POSITION(tb->tb_path, 1)))))) {
4267 + int left = B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0));
4268 + int right = (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
4269 + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
4270 + PATH_H_POSITION(tb->tb_path,
4271 + 1))));
4272 + print_cur_tb("12223");
4273 + reiserfs_warning(tb->tb_sb, "reiserfs-12363",
4274 + "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; "
4275 + "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d",
4276 + left,
4277 + MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)),
4278 + PATH_H_PBUFFER(tb->tb_path, 1),
4279 + PATH_H_POSITION(tb->tb_path, 1),
4280 + dc_size(B_N_CHILD
4281 + (PATH_H_PBUFFER(tb->tb_path, 1),
4282 + PATH_H_POSITION(tb->tb_path, 1))),
4283 + right);
4284 + reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect");
4288 +static void check_leaf_level(struct tree_balance *tb)
4290 + check_leaf(tb->L[0]);
4291 + check_leaf(tb->R[0]);
4292 + check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
4295 +static void check_internal_levels(struct tree_balance *tb)
4297 + int h;
4299 + /* check all internal nodes */
4300 + for (h = 1; tb->insert_size[h]; h++) {
4301 + check_internal_node(tb->tb_sb, PATH_H_PBUFFER(tb->tb_path, h),
4302 + "BAD BUFFER ON PATH");
4303 + if (tb->lnum[h])
4304 + check_internal_node(tb->tb_sb, tb->L[h], "BAD L");
4305 + if (tb->rnum[h])
4306 + check_internal_node(tb->tb_sb, tb->R[h], "BAD R");
4311 +#endif
4314 + * Now we have all of the buffers that must be used in balancing of
4315 + * the tree. We rely on the assumption that schedule() will not occur
4316 + * while do_balance works. (Only interrupt handlers are acceptable.)
4317 + * We balance the tree according to the analysis made before this,
4318 + * using buffers already obtained. For SMP support it will someday be
4319 + * necessary to add ordered locking of tb.
4320 + */
4323 + * Some interesting rules of balancing:
4324 + * we delete a maximum of two nodes per level per balancing: we never
4325 + * delete R, when we delete two of three nodes L, S, R then we move
4326 + * them into R.
4328 + * we only delete L if we are deleting two nodes, if we delete only
4329 + * one node we delete S
4331 + * if we shift leaves then we shift as much as we can: this is a
4332 + * deliberate policy of extremism in node packing which results in
4333 + * higher average utilization after repeated random balance operations
4334 + * at the cost of more memory copies and more balancing as a result of
4335 + * small insertions to full nodes.
4337 + * if we shift internal nodes we try to evenly balance the node
4338 + * utilization, with consequent less balancing at the cost of lower
4339 + * utilization.
4341 + * one could argue that the policy for directories in leaves should be
4342 + * that of internal nodes, but we will wait until another day to
4343 + * evaluate this.... It would be nice to someday measure and prove
4344 + * these assumptions as to what is optimal....
4345 + */
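+/*
+ * A worked illustration of the deletion policy above (assumed example,
+ * not from the original source): if the contents of L, S and R all fit
+ * into one node, L and S are emptied into R and both are freed -- two
+ * nodes deleted, R kept.  If only S becomes removable, its items are
+ * divided between L and R and S alone is freed.  R itself is never the
+ * node that is deleted.
+ */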
4347 +static inline void do_balance_starts(struct tree_balance *tb)
4349 + /* use print_cur_tb() to see initial state of struct tree_balance */
4351 + /* store_print_tb (tb); */
4353 + /* do not delete, just comment it out */
4354 + /*
4355 + print_tb(flag, PATH_LAST_POSITION(tb->tb_path),
4356 + tb->tb_path->pos_in_item, tb, "check");
4357 + */
4358 + RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
4359 +#ifdef CONFIG_REISERFS_CHECK
4360 + REISERFS_SB(tb->tb_sb)->cur_tb = tb;
4361 +#endif
4364 +static inline void do_balance_completed(struct tree_balance *tb)
4367 +#ifdef CONFIG_REISERFS_CHECK
4368 + check_leaf_level(tb);
4369 + check_internal_levels(tb);
4370 + REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
4371 +#endif
4373 + /*
4374 + * reiserfs_free_block is no longer schedule safe. So, we need to
4375 + * put the buffers we want freed on the thrown list during do_balance,
4376 + * and then free them now
4377 + */
4379 + REISERFS_SB(tb->tb_sb)->s_do_balance++;
4381 + /* release all nodes held to perform the balancing */
4382 + unfix_nodes(tb);
4384 + free_thrown(tb);
4388 + * do_balance - balance the tree
4390 + * @tb: tree_balance structure
4391 + * @ih: item header of inserted item
4392 + * @body: body of inserted item or bytes to paste
4393 + * @flag: 'i' - insert, 'd' - delete, 'c' - cut, 'p' paste
4395 + * Cut means delete part of an item (includes removing an entry from a
4396 + * directory).
4398 + * Delete means delete whole item.
4400 + * Insert means add a new item into the tree.
4402 + * Paste means to append to the end of an existing file or to
4403 + * insert a directory entry.
4404 + */
4405 +void do_balance(struct tree_balance *tb, struct item_head *ih,
4406 + const char *body, int flag)
4408 + int child_pos; /* position of a child node in its parent */
4409 + int h; /* level of the tree being processed */
4411 + /*
4412 + * in our processing of one level we sometimes determine what
4413 + * must be inserted into the next higher level. This insertion
4414 + * consists of a key or two keys and their corresponding
4415 + * pointers
4416 + */
4417 + struct item_head insert_key[2];
4419 + /* inserted node-ptrs for the next level */
4420 + struct buffer_head *insert_ptr[2];
4422 + tb->tb_mode = flag;
4423 + tb->need_balance_dirty = 0;
4425 + if (FILESYSTEM_CHANGED_TB(tb)) {
4426 + reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has "
4427 + "changed");
4429 + /* if we have no real work to do */
4430 + if (!tb->insert_size[0]) {
4431 + reiserfs_warning(tb->tb_sb, "PAP-12350",
4432 + "insert_size == 0, mode == %c", flag);
4433 + unfix_nodes(tb);
4434 + return;
4437 + atomic_inc(&fs_generation(tb->tb_sb));
4438 + do_balance_starts(tb);
4440 + /*
4441 + * balance_leaf returns 0 except if combining L R and S into
4442 + * one node. see balance_internal() for explanation of this
4443 + * line of code.
4444 + */
4445 + child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) +
4446 + balance_leaf(tb, ih, body, flag, insert_key, insert_ptr);
4448 +#ifdef CONFIG_REISERFS_CHECK
4449 + check_after_balance_leaf(tb);
4450 +#endif
4452 + /* Balance internal level of the tree. */
4453 + for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++)
4454 + child_pos = balance_internal(tb, h, child_pos, insert_key,
4455 + insert_ptr);
4457 + do_balance_completed(tb);
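+/*
+ * Illustrative call sketch (assumed caller context, not from the
+ * original source): 'tb' must already have been prepared by
+ * fix_nodes() with the matching mode.
+ *
+ *   do_balance(&tb, &ih, body, M_INSERT);   /* insert the item ih/body */
+ *   do_balance(&tb, NULL, NULL, M_DELETE);  /* delete item at the path */
+ *
+ * M_INSERT and M_PASTE supply an item head and body; M_DELETE and
+ * M_CUT enter through the insert_size[0] < 0 path handled by
+ * balance_leaf_when_delete().
+ */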
4459 diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
4460 new file mode 100644
4461 index 000000000000..8eb3ad3e8ae9
4462 --- /dev/null
4463 +++ b/fs/reiserfs/file.c
4464 @@ -0,0 +1,270 @@
4466 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
4467 + */
4469 +#include <linux/time.h>
4470 +#include "reiserfs.h"
4471 +#include "acl.h"
4472 +#include "xattr.h"
4473 +#include <linux/uaccess.h>
4474 +#include <linux/pagemap.h>
4475 +#include <linux/swap.h>
4476 +#include <linux/writeback.h>
4477 +#include <linux/blkdev.h>
4478 +#include <linux/buffer_head.h>
4479 +#include <linux/quotaops.h>
4482 + * We pack the tails of files on file close, not at the time they are written.
4483 + * This implies an unnecessary copy of the tail and an unnecessary indirect item
4484 + * insertion/balancing, for files that are written in one write.
4485 + * It avoids unnecessary tail packings (balances) for files that are written in
4486 + * multiple writes and are small enough to have tails.
4488 + * file_release is called by the VFS layer when the file is closed. If
4489 + * this is the last open file descriptor, and the file
4490 + * small enough to have a tail, and the tail is currently in an
4491 + * unformatted node, the tail is converted back into a direct item.
4493 + * We use reiserfs_truncate_file to pack the tail, since it already has
4494 + * all the conditions coded.
4495 + */
4496 +static int reiserfs_file_release(struct inode *inode, struct file *filp)
4499 + struct reiserfs_transaction_handle th;
4500 + int err;
4501 + int jbegin_failure = 0;
4503 + BUG_ON(!S_ISREG(inode->i_mode));
4505 + if (!atomic_dec_and_mutex_lock(&REISERFS_I(inode)->openers,
4506 + &REISERFS_I(inode)->tailpack))
4507 + return 0;
4509 + /* fast out for when nothing needs to be done */
4510 + if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
4511 + !tail_has_to_be_packed(inode)) &&
4512 + REISERFS_I(inode)->i_prealloc_count <= 0) {
4513 + mutex_unlock(&REISERFS_I(inode)->tailpack);
4514 + return 0;
4517 + reiserfs_write_lock(inode->i_sb);
4518 + /*
4519 + * freeing preallocation only involves relogging blocks that
4520 + * are already in the current transaction. preallocation gets
4521 + * freed at the end of each transaction, so it is impossible for
4522 + * us to log any additional blocks (including quota blocks)
4523 + */
4524 + err = journal_begin(&th, inode->i_sb, 1);
4525 + if (err) {
4526 + /*
4527 + * uh oh, we can't allow the inode to go away while there
4528 + * are still preallocated blocks pending. Try to join the
4529 + * aborted transaction
4530 + */
4531 + jbegin_failure = err;
4532 + err = journal_join_abort(&th, inode->i_sb);
4534 + if (err) {
4535 + /*
4536 + * hmpf, our choices here aren't good. We can pin
4537 + * the inode which will disallow unmount from ever
4538 + * happening, we can do nothing, which will corrupt
4539 + * random memory on unmount, or we can forcibly
4540 + * remove the file from the preallocation list, which
4541 + * will leak blocks on disk. Let's pin the inode
4542 + * and let the admin know what is going on.
4543 + */
4544 + igrab(inode);
4545 + reiserfs_warning(inode->i_sb, "clm-9001",
4546 + "pinning inode %lu because the "
4547 + "preallocation can't be freed",
4548 + inode->i_ino);
4549 + goto out;
4552 + reiserfs_update_inode_transaction(inode);
4554 +#ifdef REISERFS_PREALLOCATE
4555 + reiserfs_discard_prealloc(&th, inode);
4556 +#endif
4557 + err = journal_end(&th);
4559 + /* copy back the error code from journal_begin */
4560 + if (!err)
4561 + err = jbegin_failure;
4563 + if (!err &&
4564 + (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
4565 + tail_has_to_be_packed(inode)) {
4567 + /*
4568 + * if regular file is released by last holder and it has been
4569 + * appended (we append by unformatted node only) or its direct
4570 + * item(s) had to be converted, then it may have to be
4571 + * indirect2direct converted
4572 + */
4573 + err = reiserfs_truncate_file(inode, 0);
4575 +out:
4576 + reiserfs_write_unlock(inode->i_sb);
4577 + mutex_unlock(&REISERFS_I(inode)->tailpack);
4578 + return err;
4581 +static int reiserfs_file_open(struct inode *inode, struct file *file)
4583 + int err = dquot_file_open(inode, file);
4585 + /* somebody might be tailpacking on final close; wait for it */
4586 + if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
4587 + mutex_lock(&REISERFS_I(inode)->tailpack);
4588 + atomic_inc(&REISERFS_I(inode)->openers);
4589 + mutex_unlock(&REISERFS_I(inode)->tailpack);
4591 + return err;
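+/*
+ * Note on the open/release pairing above (descriptive sketch, based on
+ * the code in this file only): 'openers' counts open file descriptors
+ * and 'tailpack' serializes against tail packing.
+ * reiserfs_file_release() uses atomic_dec_and_mutex_lock(), so the
+ * final closer reaches zero and takes the mutex atomically; open may
+ * therefore not simply bump 0 -> 1, but must take the same mutex
+ * first, waiting out any tail pack still running on final close.
+ */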
4594 +void reiserfs_vfs_truncate_file(struct inode *inode)
4596 + mutex_lock(&REISERFS_I(inode)->tailpack);
4597 + reiserfs_truncate_file(inode, 1);
4598 + mutex_unlock(&REISERFS_I(inode)->tailpack);
4601 +/* Sync a reiserfs file. */
4604 + * FIXME: sync_mapping_buffers() never has anything to sync. Can
4605 + * be removed...
4606 + */
4608 +static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
4609 + int datasync)
4611 + struct inode *inode = filp->f_mapping->host;
4612 + int err;
4613 + int barrier_done;
4615 + err = file_write_and_wait_range(filp, start, end);
4616 + if (err)
4617 + return err;
4619 + inode_lock(inode);
4620 + BUG_ON(!S_ISREG(inode->i_mode));
4621 + err = sync_mapping_buffers(inode->i_mapping);
4622 + reiserfs_write_lock(inode->i_sb);
4623 + barrier_done = reiserfs_commit_for_inode(inode);
4624 + reiserfs_write_unlock(inode->i_sb);
4625 + if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
4626 + blkdev_issue_flush(inode->i_sb->s_bdev);
4627 + inode_unlock(inode);
4628 + if (barrier_done < 0)
4629 + return barrier_done;
4630 + return (err < 0) ? -EIO : 0;
4633 +/* taken from fs/buffer.c:__block_commit_write */
4634 +int reiserfs_commit_page(struct inode *inode, struct page *page,
4635 + unsigned from, unsigned to)
4637 + unsigned block_start, block_end;
4638 + int partial = 0;
4639 + unsigned blocksize;
4640 + struct buffer_head *bh, *head;
4641 + unsigned long i_size_index = inode->i_size >> PAGE_SHIFT;
4642 + int new;
4643 + int logit = reiserfs_file_data_log(inode);
4644 + struct super_block *s = inode->i_sb;
4645 + int bh_per_page = PAGE_SIZE / s->s_blocksize;
4646 + struct reiserfs_transaction_handle th;
4647 + int ret = 0;
4649 + th.t_trans_id = 0;
4650 + blocksize = i_blocksize(inode);
4652 + if (logit) {
4653 + reiserfs_write_lock(s);
4654 + ret = journal_begin(&th, s, bh_per_page + 1);
4655 + if (ret)
4656 + goto drop_write_lock;
4657 + reiserfs_update_inode_transaction(inode);
4659 + for (bh = head = page_buffers(page), block_start = 0;
4660 + bh != head || !block_start;
4661 + block_start = block_end, bh = bh->b_this_page) {
4663 + new = buffer_new(bh);
4664 + clear_buffer_new(bh);
4665 + block_end = block_start + blocksize;
4666 + if (block_end <= from || block_start >= to) {
4667 + if (!buffer_uptodate(bh))
4668 + partial = 1;
4669 + } else {
4670 + set_buffer_uptodate(bh);
4671 + if (logit) {
4672 + reiserfs_prepare_for_journal(s, bh, 1);
4673 + journal_mark_dirty(&th, bh);
4674 + } else if (!buffer_dirty(bh)) {
4675 + mark_buffer_dirty(bh);
4676 + /*
4677 + * do data=ordered on any page past the end
4678 + * of file and any buffer marked BH_New.
4679 + */
4680 + if (reiserfs_data_ordered(inode->i_sb) &&
4681 + (new || page->index >= i_size_index)) {
4682 + reiserfs_add_ordered_list(inode, bh);
4687 + if (logit) {
4688 + ret = journal_end(&th);
4689 +drop_write_lock:
4690 + reiserfs_write_unlock(s);
4692 + /*
4693 + * If this is a partial write which happened to make all buffers
4694 + * uptodate then we can optimize away a bogus read_folio() for
4695 + * the next read(). Here we 'discover' whether the page went
4696 + * uptodate as a result of this (potentially partial) write.
4697 + */
4698 + if (!partial)
4699 + SetPageUptodate(page);
4700 + return ret;
4703 +const struct file_operations reiserfs_file_operations = {
4704 + .unlocked_ioctl = reiserfs_ioctl,
4705 +#ifdef CONFIG_COMPAT
4706 + .compat_ioctl = reiserfs_compat_ioctl,
4707 +#endif
4708 + .mmap = generic_file_mmap,
4709 + .open = reiserfs_file_open,
4710 + .release = reiserfs_file_release,
4711 + .fsync = reiserfs_sync_file,
4712 + .read_iter = generic_file_read_iter,
4713 + .write_iter = generic_file_write_iter,
4714 + .splice_read = filemap_splice_read,
4715 + .splice_write = iter_file_splice_write,
4716 + .llseek = generic_file_llseek,
4719 +const struct inode_operations reiserfs_file_inode_operations = {
4720 + .setattr = reiserfs_setattr,
4721 + .listxattr = reiserfs_listxattr,
4722 + .permission = reiserfs_permission,
4723 + .get_inode_acl = reiserfs_get_acl,
4724 + .set_acl = reiserfs_set_acl,
4725 + .fileattr_get = reiserfs_fileattr_get,
4726 + .fileattr_set = reiserfs_fileattr_set,
4729 +const struct inode_operations reiserfs_priv_file_inode_operations = {
4730 + .setattr = reiserfs_setattr,
4731 + .permission = reiserfs_permission,
4732 + .fileattr_get = reiserfs_fileattr_get,
4733 + .fileattr_set = reiserfs_fileattr_set,
4735 diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
4736 new file mode 100644
4737 index 000000000000..6c13a8d9a73c
4738 --- /dev/null
4739 +++ b/fs/reiserfs/fix_node.c
4740 @@ -0,0 +1,2822 @@
4742 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
4743 + */
4745 +#include <linux/time.h>
4746 +#include <linux/slab.h>
4747 +#include <linux/string.h>
4748 +#include "reiserfs.h"
4749 +#include <linux/buffer_head.h>
4752 + * To make any change in the tree we find the node that contains the
4753 + * item to be changed/deleted, or the position in a node where a new
4754 + * item is to be inserted. We call this node S. To do balancing we
4755 + * need to decide what we will shift to the left/right neighbor or to
4756 + * a new node, where the new item will go, and so on. To make this
4757 + * analysis simpler we build a virtual node: an array of items that
4758 + * will replace the items of node S. (For instance, if we are going
4759 + * to delete an item, the virtual node does not contain it.) The
4760 + * virtual node keeps information about item sizes and types,
4761 + * mergeability of the first and last items, and the sizes of all
4762 + * entries in a directory item. We use this array when calculating
4763 + * what we can shift to the neighbors and how many nodes we need if
4764 + * we do no shifting, shift to the left/right neighbor, or to both.
4765 + */
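+/*
+ * Illustration of the virtual node layout (assumed example values, not
+ * from the original source): for M_INSERT into a leaf holding items
+ * I0 I1 I2 with vn_affected_item_num == 1, the virtual node describes
+ *
+ *   vn_vi[0] = I0, vn_vi[1] = <new item>, vn_vi[2] = I1, vn_vi[3] = I2
+ *
+ * so vn_nr_item == B_NR_ITEMS(S) + 1; for M_DELETE the affected item
+ * is omitted instead and vn_nr_item == B_NR_ITEMS(S) - 1.
+ */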
4768 + * Takes item number in virtual node, returns number of item
4769 + * that it has in source buffer
4770 + */
4771 +static inline int old_item_num(int new_num, int affected_item_num, int mode)
4773 + if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
4774 + return new_num;
4776 + if (mode == M_INSERT) {
4778 + RFALSE(new_num == 0,
4779 + "vs-8005: for INSERT mode and item number of inserted item");
4781 + return new_num - 1;
4784 + RFALSE(mode != M_DELETE,
4785 + "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
4786 + mode);
4787 + /* delete mode */
4788 + return new_num + 1;
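+/*
+ * Worked example of the mapping above (assumed values, not from the
+ * original source): with affected_item_num == 2,
+ *
+ *   mode      new_num:  0  1  2  3  4
+ *   M_INSERT           0  1  -  2  3   (new_num 2 is the inserted item)
+ *   M_DELETE           0  1  3  4  5   (source item 2 was deleted)
+ *   M_PASTE            0  1  2  3  4   (item count unchanged)
+ */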
4791 +static void create_virtual_node(struct tree_balance *tb, int h)
4793 + struct item_head *ih;
4794 + struct virtual_node *vn = tb->tb_vn;
4795 + int new_num;
4796 + struct buffer_head *Sh; /* this comes from tb->S[h] */
4798 + Sh = PATH_H_PBUFFER(tb->tb_path, h);
4800 + /* size of changed node */
4801 + vn->vn_size =
4802 + MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
4804 + /* for internal nodes the array of virtual items is not created */
4805 + if (h) {
4806 + vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
4807 + return;
4810 + /* number of items in virtual node */
4811 + vn->vn_nr_item =
4812 + B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
4813 + ((vn->vn_mode == M_DELETE) ? 1 : 0);
4815 + /* first virtual item */
4816 + vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
4817 + memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
4818 + vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
4820 + /* first item in the node */
4821 + ih = item_head(Sh, 0);
4823 + /* define the mergeability for 0-th item (if it is not being deleted) */
4824 + if (op_is_left_mergeable(&ih->ih_key, Sh->b_size)
4825 + && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
4826 + vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
4828 + /*
4829 + * go through all items that remain in the virtual
4830 + * node (except for the new (inserted) one)
4831 + */
4832 + for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
4833 + int j;
4834 + struct virtual_item *vi = vn->vn_vi + new_num;
4835 + int is_affected =
4836 + ((new_num != vn->vn_affected_item_num) ? 0 : 1);
4838 + if (is_affected && vn->vn_mode == M_INSERT)
4839 + continue;
4841 + /* get item number in source node */
4842 + j = old_item_num(new_num, vn->vn_affected_item_num,
4843 + vn->vn_mode);
4845 + vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
4846 + vi->vi_ih = ih + j;
4847 + vi->vi_item = ih_item_body(Sh, ih + j);
4848 + vi->vi_uarea = vn->vn_free_ptr;
4850 + /*
4851 + * FIXME: there is no check that item operation did not
4852 + * consume too much memory
4853 + */
4854 + vn->vn_free_ptr +=
4855 + op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
4856 + if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
4857 + reiserfs_panic(tb->tb_sb, "vs-8030",
4858 + "virtual node space consumed");
4860 + if (!is_affected)
4861 + /* this is not being changed */
4862 + continue;
4864 + if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
4865 + vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
4866 + /* pointer to data which is going to be pasted */
4867 + vi->vi_new_data = vn->vn_data;
4871 + /* virtual inserted item is not defined yet */
4872 + if (vn->vn_mode == M_INSERT) {
4873 + struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
4875 + RFALSE(vn->vn_ins_ih == NULL,
4876 + "vs-8040: item header of inserted item is not specified");
4877 + vi->vi_item_len = tb->insert_size[0];
4878 + vi->vi_ih = vn->vn_ins_ih;
4879 + vi->vi_item = vn->vn_data;
4880 + vi->vi_uarea = vn->vn_free_ptr;
4882 + op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
4883 + tb->insert_size[0]);
4886 + /*
4887 + * to set the right merge flag we take the right delimiting key
4888 + * and check whether it is a mergeable item
4889 + */
4890 + if (tb->CFR[0]) {
4891 + struct reiserfs_key *key;
4893 + key = internal_key(tb->CFR[0], tb->rkey[0]);
4894 + if (op_is_left_mergeable(key, Sh->b_size)
4895 + && (vn->vn_mode != M_DELETE
4896 + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
4897 + vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
4898 + VI_TYPE_RIGHT_MERGEABLE;
4900 +#ifdef CONFIG_REISERFS_CHECK
4901 + if (op_is_left_mergeable(key, Sh->b_size) &&
4902 + !(vn->vn_mode != M_DELETE
4903 + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
4904 + /*
4905 + * we delete last item and it could be merged
4906 + * with right neighbor's first item
4907 + */
4908 + if (!
4909 + (B_NR_ITEMS(Sh) == 1
4910 + && is_direntry_le_ih(item_head(Sh, 0))
4911 + && ih_entry_count(item_head(Sh, 0)) == 1)) {
4912 + /*
4913 + * node contains more than 1 item, or item
4914 + * is not directory item, or this item
4915 + * contains more than 1 entry
4916 + */
4917 + print_block(Sh, 0, -1, -1);
4918 + reiserfs_panic(tb->tb_sb, "vs-8045",
4919 + "rdkey %k, affected item==%d "
4920 + "(mode==%c) Must be %c",
4921 + key, vn->vn_affected_item_num,
4922 + vn->vn_mode, M_DELETE);
4925 +#endif
4931 + * Using virtual node check, how many items can be
4932 + * shifted to left neighbor
4933 + */
4934 +static void check_left(struct tree_balance *tb, int h, int cur_free)
4936 + int i;
4937 + struct virtual_node *vn = tb->tb_vn;
4938 + struct virtual_item *vi;
4939 + int d_size, ih_size;
4941 + RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
4943 + /* internal level */
4944 + if (h > 0) {
4945 + tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
4946 + return;
4949 + /* leaf level */
4951 + if (!cur_free || !vn->vn_nr_item) {
4952 + /* no free space or nothing to move */
4953 + tb->lnum[h] = 0;
4954 + tb->lbytes = -1;
4955 + return;
4958 + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
4959 + "vs-8055: parent does not exist or invalid");
4961 + vi = vn->vn_vi;
4962 + if ((unsigned int)cur_free >=
4963 + (vn->vn_size -
4964 + ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
4965 + /* all contents of S[0] fits into L[0] */
4967 + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
4968 + "vs-8055: invalid mode or balance condition failed");
4970 + tb->lnum[0] = vn->vn_nr_item;
4971 + tb->lbytes = -1;
4972 + return;
4975 + d_size = 0, ih_size = IH_SIZE;
4977 + /* first item may be merged with the last item in the left neighbor */
4978 + if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
4979 + d_size = -((int)IH_SIZE), ih_size = 0;
4981 + tb->lnum[0] = 0;
4982 + for (i = 0; i < vn->vn_nr_item;
4983 + i++, ih_size = IH_SIZE, d_size = 0, vi++) {
4984 + d_size += vi->vi_item_len;
4985 + if (cur_free >= d_size) {
4986 + /* the item can be shifted entirely */
4987 + cur_free -= d_size;
4988 + tb->lnum[0]++;
4989 + continue;
4992 + /* the item cannot be shifted entirely, try to split it */
4993 + /*
4994 + * check whether L[0] can hold ih and at least one byte
4995 + * of the item body
4996 + */
4998 + /* cannot shift even a part of the current item */
4999 + if (cur_free <= ih_size) {
5000 + tb->lbytes = -1;
5001 + return;
5003 + cur_free -= ih_size;
5005 + tb->lbytes = op_check_left(vi, cur_free, 0, 0);
5006 + if (tb->lbytes != -1)
5007 + /* count partially shifted item */
5008 + tb->lnum[0]++;
5010 + break;
5013 + return;
5017 + * Using virtual node check, how many items can be
5018 + * shifted to right neighbor
5019 + */
5020 +static void check_right(struct tree_balance *tb, int h, int cur_free)
5022 + int i;
5023 + struct virtual_node *vn = tb->tb_vn;
5024 + struct virtual_item *vi;
5025 + int d_size, ih_size;
5027 + RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
5029 + /* internal level */
5030 + if (h > 0) {
5031 + tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
5032 + return;
5035 + /* leaf level */
5037 + if (!cur_free || !vn->vn_nr_item) {
5038 + /* no free space or nothing to move */
5039 + tb->rnum[h] = 0;
5040 + tb->rbytes = -1;
5041 + return;
5044 + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
5045 + "vs-8075: parent does not exist or invalid");
5047 + vi = vn->vn_vi + vn->vn_nr_item - 1;
5048 + if ((unsigned int)cur_free >=
5049 + (vn->vn_size -
5050 + ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
5051 + /* all contents of S[0] fits into R[0] */
5053 + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
5054 + "vs-8080: invalid mode or balance condition failed");
5056 + tb->rnum[h] = vn->vn_nr_item;
5057 + tb->rbytes = -1;
5058 + return;
5061 + d_size = 0, ih_size = IH_SIZE;
5063 + /* last item may be merged with the first item in the right neighbor */
5064 + if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
5065 + d_size = -(int)IH_SIZE, ih_size = 0;
5067 + tb->rnum[0] = 0;
5068 + for (i = vn->vn_nr_item - 1; i >= 0;
5069 + i--, d_size = 0, ih_size = IH_SIZE, vi--) {
5070 + d_size += vi->vi_item_len;
5071 + if (cur_free >= d_size) {
5072 + /* the item can be shifted entirely */
5073 + cur_free -= d_size;
5074 + tb->rnum[0]++;
5075 + continue;
5078 + /*
5079 + * check whether R[0] can hold ih and at least one
5080 + * byte of the item body
5081 + */
5083 + /* cannot shift even a part of the current item */
5084 + if (cur_free <= ih_size) {
5085 + tb->rbytes = -1;
5086 + return;
5089 + /*
5090 + * R[0] can hold the header of the item and at least
5091 + * one byte of its body
5092 + */
5093 + cur_free -= ih_size; /* cur_free is still > 0 */
5095 + tb->rbytes = op_check_right(vi, cur_free);
5096 + if (tb->rbytes != -1)
5097 + /* count partially shifted item */
5098 + tb->rnum[0]++;
5100 + break;
5103 + return;
5107 + * from - number of items which are shifted to the left neighbor entirely
5108 + * to - number of items which are shifted to the right neighbor entirely
5109 + * from_bytes - number of bytes of boundary item (or directory entries)
5110 + * which are shifted to left neighbor
5111 + * to_bytes - number of bytes of boundary item (or directory entries)
5112 + * which are shifted to right neighbor
5113 + */
5114 +static int get_num_ver(int mode, struct tree_balance *tb, int h,
5115 + int from, int from_bytes,
5116 + int to, int to_bytes, short *snum012, int flow)
5118 + int i;
5119 + int units;
5120 + struct virtual_node *vn = tb->tb_vn;
5121 + int total_node_size, max_node_size, current_item_size;
5122 + int needed_nodes;
5124 + /* position of item we start filling node from */
5125 + int start_item;
5127 + /* position of item we finish filling node by */
5128 + int end_item;
5130 + /*
5131 + * number of first bytes (entries for directory) of start_item-th item
5132 + * we do not include into node that is being filled
5133 + */
5134 + int start_bytes;
5136 + /*
5137 + * number of last bytes (entries for directory) of end_item-th item
5138 + * we do node include into node that is being filled
5139 + */
5140 + int end_bytes;
5142 + /*
5143 + * these are positions in virtual item of items, that are split
5144 + * between S[0] and S1new and S1new and S2new
5145 + */
5146 + int split_item_positions[2];
5148 + split_item_positions[0] = -1;
5149 + split_item_positions[1] = -1;
5151 + /*
5152 + * We only create additional nodes if we are in insert or paste mode
5153 + * or we are in replace mode at the internal level. If h is 0 and
5154 + * the mode is M_REPLACE then in fix_nodes we change the mode to
5155 + * paste or insert before we get here in the code.
5156 + */
5157 + RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
5158 + "vs-8100: insert_size < 0 in overflow");
5160 + max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
5162 + /*
5163 + * snum012 [0-2] - number of items, that lay
5164 + * to S[0], first new node and second new node
5165 + */
5166 + snum012[3] = -1; /* s1bytes */
5167 + snum012[4] = -1; /* s2bytes */
5169 + /* internal level */
5170 + if (h > 0) {
5171 + i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
5172 + if (i == max_node_size)
5173 + return 1;
5174 + return (i / max_node_size + 1);
5177 + /* leaf level */
5178 + needed_nodes = 1;
5179 + total_node_size = 0;
5181 + /* start from 'from'-th item */
5182 + start_item = from;
5183 + /* skip its first 'start_bytes' units */
5184 + start_bytes = ((from_bytes != -1) ? from_bytes : 0);
5186 + /* last included item is the 'end_item'-th one */
5187 + end_item = vn->vn_nr_item - to - 1;
5188 + /* do not count last 'end_bytes' units of 'end_item'-th item */
5189 + end_bytes = (to_bytes != -1) ? to_bytes : 0;
5191 + /*
5192 + * go through all items beginning with the start_item-th item
5193 + * and ending by the end_item-th item. Do not count first
5194 + * 'start_bytes' units of 'start_item'-th item and last
5195 + * 'end_bytes' of 'end_item'-th item
5196 + */
5197 + for (i = start_item; i <= end_item; i++) {
5198 + struct virtual_item *vi = vn->vn_vi + i;
5199 + int skip_from_end = ((i == end_item) ? end_bytes : 0);
5201 + RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
5203 + /* get size of current item */
5204 + current_item_size = vi->vi_item_len;
5206 + /*
5207 + * do not take in calculation head part (from_bytes)
5208 + * of from-th item
5209 + */
5210 + current_item_size -=
5211 + op_part_size(vi, 0 /*from start */ , start_bytes);
5213 + /* do not take in calculation tail part of last item */
5214 + current_item_size -=
5215 + op_part_size(vi, 1 /*from end */ , skip_from_end);
5217 + /* if the item fits into the current node entirely */
5218 + if (total_node_size + current_item_size <= max_node_size) {
5219 + snum012[needed_nodes - 1]++;
5220 + total_node_size += current_item_size;
5221 + start_bytes = 0;
5222 + continue;
5225 + /*
5226 + * virtual item length is longer than the max item size in a
5227 + * node. This is impossible for a direct item
5228 + */
5229 + if (current_item_size > max_node_size) {
5230 + RFALSE(is_direct_le_ih(vi->vi_ih),
5231 + "vs-8110: "
5232 + "direct item length is %d. It can not be longer than %d",
5233 + current_item_size, max_node_size);
5234 + /* we will try to split it */
5235 + flow = 1;
5238 + /* as we do not split items, take new node and continue */
5239 + if (!flow) {
5240 + needed_nodes++;
5241 + i--;
5242 + total_node_size = 0;
5243 + continue;
5246 + /*
5247 + * calculate number of item units which fit into node being
5248 + * filled
5249 + */
5251 + int free_space;
5253 + free_space = max_node_size - total_node_size - IH_SIZE;
5254 + units =
5255 + op_check_left(vi, free_space, start_bytes,
5256 + skip_from_end);
5257 + /*
5258 + * nothing fits into current node, take new
5259 + * node and continue
5260 + */
5261 + if (units == -1) {
5262 + needed_nodes++, i--, total_node_size = 0;
5263 + continue;
5267 + /* something fits into the current node */
5268 + start_bytes += units;
5269 + snum012[needed_nodes - 1 + 3] = units;
5271 + if (needed_nodes > 2)
5272 + reiserfs_warning(tb->tb_sb, "vs-8111",
5273 + "split_item_position is out of range");
5274 + snum012[needed_nodes - 1]++;
5275 + split_item_positions[needed_nodes - 1] = i;
5276 + needed_nodes++;
5277 + /* continue from the same item with start_bytes != -1 */
5278 + start_item = i;
5279 + i--;
5280 + total_node_size = 0;
5283 + /*
5284 + * snum012[4] (if it is not -1) contains the number of units that
5285 + * are to be in S1new, snum012[3] those to be in S0. They are
5286 + * supposed to be s1bytes and s2bytes correspondingly, so recalculate
5287 + */
5288 + if (snum012[4] > 0) {
5289 + int split_item_num;
5290 + int bytes_to_r, bytes_to_l;
5291 + int bytes_to_S1new;
5293 + split_item_num = split_item_positions[1];
5294 + bytes_to_l =
5295 + ((from == split_item_num
5296 + && from_bytes != -1) ? from_bytes : 0);
5297 + bytes_to_r =
5298 + ((end_item == split_item_num
5299 + && end_bytes != -1) ? end_bytes : 0);
5300 + bytes_to_S1new =
5301 + ((split_item_positions[0] ==
5302 + split_item_positions[1]) ? snum012[3] : 0);
5304 + /* s2bytes */
5305 + snum012[4] =
5306 + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
5307 + bytes_to_r - bytes_to_l - bytes_to_S1new;
5309 + if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
5310 + vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
5311 + reiserfs_warning(tb->tb_sb, "vs-8115",
5312 + "not directory or indirect item");
5315 + /* now we know S2bytes, calculate S1bytes */
5316 + if (snum012[3] > 0) {
5317 + int split_item_num;
5318 + int bytes_to_r, bytes_to_l;
5319 + int bytes_to_S2new;
5321 + split_item_num = split_item_positions[0];
5322 + bytes_to_l =
5323 + ((from == split_item_num
5324 + && from_bytes != -1) ? from_bytes : 0);
5325 + bytes_to_r =
5326 + ((end_item == split_item_num
5327 + && end_bytes != -1) ? end_bytes : 0);
5328 + bytes_to_S2new =
5329 + ((split_item_positions[0] == split_item_positions[1]
5330 + && snum012[4] != -1) ? snum012[4] : 0);
5332 + /* s1bytes */
5333 + snum012[3] =
5334 + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
5335 + bytes_to_r - bytes_to_l - bytes_to_S2new;
5338 + return needed_nodes;
5343 + * Set parameters for balancing.
5344 + * Writes the results of the balancing analysis into structure tb,
5345 + * where they will later be used by the functions that actually do the balancing.
5346 + * Parameters:
5347 + * tb tree_balance structure;
5348 + * h current level of the node;
5349 + * lnum number of items from S[h] that must be shifted to L[h];
5350 + * rnum number of items from S[h] that must be shifted to R[h];
5351 + *	blk_num	number of blocks that S[h] will be split into;
5352 + *	s012	number of items that fall into the split nodes.
5353 + * lbytes number of bytes which flow to the left neighbor from the
5354 + * item that is not shifted entirely
5355 + * rbytes number of bytes which flow to the right neighbor from the
5356 + * item that is not shifted entirely
5357 + * s1bytes number of bytes which flow to the first new node when
5358 + * S[0] splits (this number is contained in s012 array)
5359 + */
5361 +static void set_parameters(struct tree_balance *tb, int h, int lnum,
5362 + int rnum, int blk_num, short *s012, int lb, int rb)
5365 + tb->lnum[h] = lnum;
5366 + tb->rnum[h] = rnum;
5367 + tb->blknum[h] = blk_num;
5369 + /* only for leaf level */
5370 + if (h == 0) {
5371 + if (s012 != NULL) {
5372 + tb->s0num = *s012++;
5373 + tb->snum[0] = *s012++;
5374 + tb->snum[1] = *s012++;
5375 + tb->sbytes[0] = *s012++;
5376 + tb->sbytes[1] = *s012;
5378 + tb->lbytes = lb;
5379 + tb->rbytes = rb;
5381 + PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
5382 + PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
5384 + PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
5385 + PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
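+/*
+ * Illustration of how the s012 array is consumed above (leaf level
+ * only): the five shorts are read in order as s0num, snum[0],
+ * snum[1], sbytes[0] and sbytes[1]. For example, a hypothetical call
+ * set_parameters(tb, 0, 3, 0, 1, s012, 100, -1) would mean: shift two
+ * whole items plus 100 bytes of a third to the left neighbor (lnum
+ * counts the partially shifted item), shift nothing right, and keep
+ * S[0] in one block.
+ */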
5389 + * check if node disappears if we shift tb->lnum[0] items to left
5390 + * neighbor and tb->rnum[0] to the right one.
5391 + */
5392 +static int is_leaf_removable(struct tree_balance *tb)
5394 + struct virtual_node *vn = tb->tb_vn;
5395 + int to_left, to_right;
5396 + int size;
5397 + int remain_items;
5399 + /*
5400 + * number of items that will be shifted to left (right) neighbor
5401 + * entirely
5402 + */
5403 + to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
5404 + to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
5405 + remain_items = vn->vn_nr_item;
5407 + /* how many items remain in S[0] after shiftings to neighbors */
5408 + remain_items -= (to_left + to_right);
5410 + /* all content of node can be shifted to neighbors */
5411 + if (remain_items < 1) {
5412 + set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
5413 + NULL, -1, -1);
5414 + return 1;
5417 + /* S[0] is not removable */
5418 + if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
5419 + return 0;
5421 + /* check whether we can divide 1 remaining item between neighbors */
5423 + /* get size of remaining item (in item units) */
5424 + size = op_unit_num(&vn->vn_vi[to_left]);
5426 + if (tb->lbytes + tb->rbytes >= size) {
5427 + set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
5428 + tb->lbytes, -1);
5429 + return 1;
5432 + return 0;
5435 +/* check whether L, S, R can be joined in one node */
5436 +static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
5438 + struct virtual_node *vn = tb->tb_vn;
5439 + int ih_size;
5440 + struct buffer_head *S0;
5442 + S0 = PATH_H_PBUFFER(tb->tb_path, 0);
5444 + ih_size = 0;
5445 + if (vn->vn_nr_item) {
5446 + if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
5447 + ih_size += IH_SIZE;
5449 + if (vn->vn_vi[vn->vn_nr_item - 1].
5450 + vi_type & VI_TYPE_RIGHT_MERGEABLE)
5451 + ih_size += IH_SIZE;
5452 + } else {
5453 + /* there was only one item and it will be deleted */
5454 + struct item_head *ih;
5456 + RFALSE(B_NR_ITEMS(S0) != 1,
5457 + "vs-8125: item number must be 1: it is %d",
5458 + B_NR_ITEMS(S0));
5460 + ih = item_head(S0, 0);
5461 + if (tb->CFR[0]
5462 + && !comp_short_le_keys(&ih->ih_key,
5463 + internal_key(tb->CFR[0],
5464 + tb->rkey[0])))
5465 +			/*
5466 +			 * The directory must be in a correct state here: that
5467 +			 * is, somewhere to the left the first directory item
5468 +			 * must exist. The item being deleted cannot be that
5469 +			 * first one, because its right neighbor is an item of
5470 +			 * the same directory (and the first item always gets
5471 +			 * deleted in the last turn). So the neighbors of the
5472 +			 * deleted item can be merged, and we can save
5473 +			 * ih_size
5474 +			 */
5475 + if (is_direntry_le_ih(ih)) {
5476 + ih_size = IH_SIZE;
5478 + /*
5479 + * we might check that left neighbor exists
5480 + * and is of the same directory
5481 + */
5482 + RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
5483 + "vs-8130: first directory item can not be removed until directory is not empty");
5488 + if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
5489 + set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
5490 + PROC_INFO_INC(tb->tb_sb, leaves_removable);
5491 + return 1;
5493 + return 0;
5497 +/* when we do not split an item, lnum and rnum are numbers of entire items */
5498 +#define SET_PAR_SHIFT_LEFT \
5499 +if (h)\
5501 + int to_l;\
5503 + to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
5504 + (MAX_NR_KEY(Sh) + 1 - lpar);\
5506 + set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
5508 +else \
5510 + if (lset==LEFT_SHIFT_FLOW)\
5511 + set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
5512 + tb->lbytes, -1);\
5513 + else\
5514 + set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
5515 + -1, -1);\
5518 +#define SET_PAR_SHIFT_RIGHT \
5519 +if (h)\
5521 + int to_r;\
5523 + to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
5525 + set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
5527 +else \
5529 + if (rset==RIGHT_SHIFT_FLOW)\
5530 + set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
5531 + -1, tb->rbytes);\
5532 + else\
5533 + set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
5534 + -1, -1);\
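+/*
+ * Note: both SET_PAR_SHIFT_* macros expand in place inside the
+ * balance-checking functions below and rely on their locals (h, tb,
+ * vn, Sh, lpar/rpar, lnver/rnver, lset/rset, snum012) being in
+ * scope, which is why they are macros rather than functions.
+ */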
5537 +static void free_buffers_in_tb(struct tree_balance *tb)
5539 + int i;
5541 + pathrelse(tb->tb_path);
5543 + for (i = 0; i < MAX_HEIGHT; i++) {
5544 + brelse(tb->L[i]);
5545 + brelse(tb->R[i]);
5546 + brelse(tb->FL[i]);
5547 + brelse(tb->FR[i]);
5548 + brelse(tb->CFL[i]);
5549 + brelse(tb->CFR[i]);
5551 + tb->L[i] = NULL;
5552 + tb->R[i] = NULL;
5553 + tb->FL[i] = NULL;
5554 + tb->FR[i] = NULL;
5555 + tb->CFL[i] = NULL;
5556 + tb->CFR[i] = NULL;
5561 + * Get new buffers for storing new nodes that are created while balancing.
5562 + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
5563 + * CARRY_ON - schedule didn't occur while the function worked;
5564 + * NO_DISK_SPACE - no disk space.
5565 + */
5566 +/* The function is NOT SCHEDULE-SAFE! */
5567 +static int get_empty_nodes(struct tree_balance *tb, int h)
5569 + struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h);
5570 + b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
5571 + int counter, number_of_freeblk;
5572 + int amount_needed; /* number of needed empty blocks */
5573 + int retval = CARRY_ON;
5574 + struct super_block *sb = tb->tb_sb;
5576 + /*
5577 +	 * number_of_freeblk is the number of empty blocks which have been
5578 +	 * acquired for use by the balancing algorithm minus the number of
5579 +	 * empty blocks used in the previous levels of the analysis.
5580 +	 * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule
5581 +	 * occurs after empty blocks are acquired, and the balancing analysis
5582 +	 * is then restarted. amount_needed is the number needed by this
5583 +	 * level (h) of the balancing analysis.
5585 +	 * Note that for systems with many processes writing, it would give
5586 +	 * a better layout to calculate the total number needed by all
5587 +	 * levels and then to run reiserfs_new_blocks to get all of them at
5588 +	 * once.
5589 + */
5591 + /*
5592 +	 * Initialize number_of_freeblk to the amount acquired prior to the
5593 + * restart of the analysis or 0 if not restarted, then subtract the
5594 + * amount needed by all of the levels of the tree below h.
5595 + */
5596 + /* blknum includes S[h], so we subtract 1 in this calculation */
5597 + for (counter = 0, number_of_freeblk = tb->cur_blknum;
5598 + counter < h; counter++)
5599 + number_of_freeblk -=
5600 + (tb->blknum[counter]) ? (tb->blknum[counter] -
5601 + 1) : 0;
5603 + /* Allocate missing empty blocks. */
5604 + /* if Sh == 0 then we are getting a new root */
5605 + amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
5606 + /*
5607 + * Amount_needed = the amount that we need more than the
5608 + * amount that we have.
5609 + */
5610 + if (amount_needed > number_of_freeblk)
5611 + amount_needed -= number_of_freeblk;
5612 + else /* If we have enough already then there is nothing to do. */
5613 + return CARRY_ON;
5615 +	/*
5616 +	 * No need to check quota - quota is not allocated for blocks
5617 +	 * used for formatted nodes
5618 +	 */
5619 + if (reiserfs_new_form_blocknrs(tb, blocknrs,
5620 + amount_needed) == NO_DISK_SPACE)
5621 + return NO_DISK_SPACE;
5623 + /* for each blocknumber we just got, get a buffer and stick it on FEB */
5624 + for (blocknr = blocknrs, counter = 0;
5625 + counter < amount_needed; blocknr++, counter++) {
5627 + RFALSE(!*blocknr,
5628 + "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
5630 + new_bh = sb_getblk(sb, *blocknr);
5631 + RFALSE(buffer_dirty(new_bh) ||
5632 + buffer_journaled(new_bh) ||
5633 + buffer_journal_dirty(new_bh),
5634 + "PAP-8140: journaled or dirty buffer %b for the new block",
5635 + new_bh);
5637 + /* Put empty buffers into the array. */
5638 + RFALSE(tb->FEB[tb->cur_blknum],
5639 + "PAP-8141: busy slot for new buffer");
5641 + set_buffer_journal_new(new_bh);
5642 + tb->FEB[tb->cur_blknum++] = new_bh;
5645 + if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
5646 + retval = REPEAT_SEARCH;
5648 + return retval;
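+/*
+ * Sketch of how the blocks acquired above are consumed: each new
+ * block is wrapped in a buffer head and parked in the tb->FEB[]
+ * array of empty buffers, and the balancing code later takes its
+ * new nodes from that array instead of allocating while the tree
+ * is being modified.
+ */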
5652 + * Get free space of the left neighbor, which is stored in the parent
5653 + * node of the left neighbor.
5654 + */
5655 +static int get_lfree(struct tree_balance *tb, int h)
5657 + struct buffer_head *l, *f;
5658 + int order;
5660 + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
5661 + (l = tb->FL[h]) == NULL)
5662 + return 0;
5664 + if (f == l)
5665 + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
5666 + else {
5667 + order = B_NR_ITEMS(l);
5668 + f = l;
5671 + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
5675 + * Get free space of the right neighbor,
5676 + * which is stored in the parent node of the right neighbor.
5677 + */
5678 +static int get_rfree(struct tree_balance *tb, int h)
5680 + struct buffer_head *r, *f;
5681 + int order;
5683 + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
5684 + (r = tb->FR[h]) == NULL)
5685 + return 0;
5687 + if (f == r)
5688 + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
5689 + else {
5690 + order = 0;
5691 + f = r;
5694 + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
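+/*
+ * In both helpers above the free space is derived from the parent's
+ * disk_child entry (dc_size) rather than from the neighbor's own
+ * block header, so the neighbor block itself does not have to be
+ * read from disk to answer the question.
+ */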
5698 +/* Check whether left neighbor is in memory. */
5699 +static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
5701 + struct buffer_head *father, *left;
5702 + struct super_block *sb = tb->tb_sb;
5703 + b_blocknr_t left_neighbor_blocknr;
5704 + int left_neighbor_position;
5706 + /* Father of the left neighbor does not exist. */
5707 + if (!tb->FL[h])
5708 + return 0;
5710 + /* Calculate father of the node to be balanced. */
5711 + father = PATH_H_PBUFFER(tb->tb_path, h + 1);
5713 + RFALSE(!father ||
5714 + !B_IS_IN_TREE(father) ||
5715 + !B_IS_IN_TREE(tb->FL[h]) ||
5716 + !buffer_uptodate(father) ||
5717 + !buffer_uptodate(tb->FL[h]),
5718 + "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
5719 + father, tb->FL[h]);
5721 + /*
5722 + * Get position of the pointer to the left neighbor
5723 + * into the left father.
5724 + */
5725 + left_neighbor_position = (father == tb->FL[h]) ?
5726 + tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
5727 + /* Get left neighbor block number. */
5728 + left_neighbor_blocknr =
5729 + B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
5730 + /* Look for the left neighbor in the cache. */
5731 + if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {
5733 + RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
5734 + "vs-8170: left neighbor (%b %z) is not in the tree",
5735 + left, left);
5736 + put_bh(left);
5737 + return 1;
5740 + return 0;
5743 +#define LEFT_PARENTS 'l'
5744 +#define RIGHT_PARENTS 'r'
5746 +static void decrement_key(struct cpu_key *key)
5748 + /* call item specific function for this key */
5749 + item_ops[cpu_key_k_type(key)]->decrement_key(key);
5753 + * Calculate far left/right parent of the left/right neighbor of the
5754 + * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor
5755 + * of the parent F[h].
5756 + * Calculate left/right common parent of the current node and L[h]/R[h].
5757 + * Calculate left/right delimiting key position.
5758 + * Returns: PATH_INCORRECT - path in the tree is not correct
5759 + * SCHEDULE_OCCURRED - schedule occurred while the function worked
5760 + * CARRY_ON - schedule didn't occur while the function
5761 + * worked
5762 + */
5763 +static int get_far_parent(struct tree_balance *tb,
5764 + int h,
5765 + struct buffer_head **pfather,
5766 + struct buffer_head **pcom_father, char c_lr_par)
5768 + struct buffer_head *parent;
5769 + INITIALIZE_PATH(s_path_to_neighbor_father);
5770 + struct treepath *path = tb->tb_path;
5771 + struct cpu_key s_lr_father_key;
5772 + int counter,
5773 + position = INT_MAX,
5774 + first_last_position = 0,
5775 + path_offset = PATH_H_PATH_OFFSET(path, h);
5777 + /*
5778 + * Starting from F[h] go upwards in the tree, and look for the common
5779 +	 * ancestor of F[h] and its left/right neighbor, which should be obtained.
5780 + */
5782 + counter = path_offset;
5784 + RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
5785 + "PAP-8180: invalid path length");
5787 + for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
5788 + /*
5789 + * Check whether parent of the current buffer in the path
5790 + * is really parent in the tree.
5791 + */
5792 + if (!B_IS_IN_TREE
5793 + (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
5794 + return REPEAT_SEARCH;
5796 + /* Check whether position in the parent is correct. */
5797 + if ((position =
5798 + PATH_OFFSET_POSITION(path,
5799 + counter - 1)) >
5800 + B_NR_ITEMS(parent))
5801 + return REPEAT_SEARCH;
5803 + /*
5804 + * Check whether parent at the path really points
5805 + * to the child.
5806 + */
5807 + if (B_N_CHILD_NUM(parent, position) !=
5808 + PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
5809 + return REPEAT_SEARCH;
5811 + /*
5812 + * Return delimiting key if position in the parent is not
5813 + * equal to first/last one.
5814 + */
5815 + if (c_lr_par == RIGHT_PARENTS)
5816 + first_last_position = B_NR_ITEMS(parent);
5817 + if (position != first_last_position) {
5818 + *pcom_father = parent;
5819 + get_bh(*pcom_father);
5820 + /*(*pcom_father = parent)->b_count++; */
5821 + break;
5825 + /* if we are in the root of the tree, then there is no common father */
5826 + if (counter == FIRST_PATH_ELEMENT_OFFSET) {
5827 + /*
5828 + * Check whether first buffer in the path is the
5829 + * root of the tree.
5830 + */
5831 + if (PATH_OFFSET_PBUFFER
5832 + (tb->tb_path,
5833 + FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
5834 + SB_ROOT_BLOCK(tb->tb_sb)) {
5835 + *pfather = *pcom_father = NULL;
5836 + return CARRY_ON;
5838 + return REPEAT_SEARCH;
5841 + RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
5842 + "PAP-8185: (%b %z) level too small",
5843 + *pcom_father, *pcom_father);
5845 + /* Check whether the common parent is locked. */
5847 + if (buffer_locked(*pcom_father)) {
5849 + /* Release the write lock while the buffer is busy */
5850 + int depth = reiserfs_write_unlock_nested(tb->tb_sb);
5851 + __wait_on_buffer(*pcom_father);
5852 + reiserfs_write_lock_nested(tb->tb_sb, depth);
5853 + if (FILESYSTEM_CHANGED_TB(tb)) {
5854 + brelse(*pcom_father);
5855 + return REPEAT_SEARCH;
5859 + /*
5860 + * So, we got common parent of the current node and its
5861 + * left/right neighbor. Now we are getting the parent of the
5862 + * left/right neighbor.
5863 + */
5865 + /* Form key to get parent of the left/right neighbor. */
5866 + le_key2cpu_key(&s_lr_father_key,
5867 + internal_key(*pcom_father,
5868 + (c_lr_par ==
5869 + LEFT_PARENTS) ? (tb->lkey[h - 1] =
5870 + position -
5871 + 1) : (tb->rkey[h -
5872 + 1] =
5873 + position)));
5875 + if (c_lr_par == LEFT_PARENTS)
5876 + decrement_key(&s_lr_father_key);
5878 + if (search_by_key
5879 + (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
5880 + h + 1) == IO_ERROR)
5881 + /* path is released */
5882 + return IO_ERROR;
5884 + if (FILESYSTEM_CHANGED_TB(tb)) {
5885 + pathrelse(&s_path_to_neighbor_father);
5886 + brelse(*pcom_father);
5887 + return REPEAT_SEARCH;
5890 + *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
5892 + RFALSE(B_LEVEL(*pfather) != h + 1,
5893 + "PAP-8190: (%b %z) level too small", *pfather, *pfather);
5894 + RFALSE(s_path_to_neighbor_father.path_length <
5895 + FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
5897 + s_path_to_neighbor_father.path_length--;
5898 + pathrelse(&s_path_to_neighbor_father);
5899 + return CARRY_ON;
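+/*
+ * Note the path_length-- just before pathrelse() above: it excludes
+ * the last buffer from the release, so the reference returned via
+ * *pfather stays held while the rest of the temporary path is freed.
+ */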
5903 + * Get parents of neighbors of node in the path(S[path_offset]) and
5904 + * common parents of S[path_offset] and L[path_offset]/R[path_offset]:
5905 + * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset],
5906 + * CFR[path_offset].
5907 + * Calculate numbers of left and right delimiting keys position:
5908 + * lkey[path_offset], rkey[path_offset].
5909 + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked
5910 + * CARRY_ON - schedule didn't occur while the function worked
5911 + */
5912 +static int get_parents(struct tree_balance *tb, int h)
5914 + struct treepath *path = tb->tb_path;
5915 + int position,
5916 + ret,
5917 + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
5918 + struct buffer_head *curf, *curcf;
5920 + /* Current node is the root of the tree or will be root of the tree */
5921 + if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
5922 + /*
5923 + * The root can not have parents.
5924 + * Release nodes which previously were obtained as
5925 + * parents of the current node neighbors.
5926 + */
5927 + brelse(tb->FL[h]);
5928 + brelse(tb->CFL[h]);
5929 + brelse(tb->FR[h]);
5930 + brelse(tb->CFR[h]);
5931 + tb->FL[h] = NULL;
5932 + tb->CFL[h] = NULL;
5933 + tb->FR[h] = NULL;
5934 + tb->CFR[h] = NULL;
5935 + return CARRY_ON;
5938 + /* Get parent FL[path_offset] of L[path_offset]. */
5939 + position = PATH_OFFSET_POSITION(path, path_offset - 1);
5940 + if (position) {
5941 + /* Current node is not the first child of its parent. */
5942 + curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
5943 + curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
5944 + get_bh(curf);
5945 + get_bh(curf);
5946 + tb->lkey[h] = position - 1;
5947 + } else {
5948 + /*
5949 + * Calculate current parent of L[path_offset], which is the
5950 + * left neighbor of the current node. Calculate current
5951 + * common parent of L[path_offset] and the current node.
5952 + * Note that CFL[path_offset] not equal FL[path_offset] and
5953 + * CFL[path_offset] not equal F[path_offset].
5954 + * Calculate lkey[path_offset].
5955 + */
5956 + if ((ret = get_far_parent(tb, h + 1, &curf,
5957 + &curcf,
5958 + LEFT_PARENTS)) != CARRY_ON)
5959 + return ret;
5962 + brelse(tb->FL[h]);
5963 + tb->FL[h] = curf; /* New initialization of FL[h]. */
5964 + brelse(tb->CFL[h]);
5965 + tb->CFL[h] = curcf; /* New initialization of CFL[h]. */
5967 + RFALSE((curf && !B_IS_IN_TREE(curf)) ||
5968 + (curcf && !B_IS_IN_TREE(curcf)),
5969 + "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
5971 + /* Get parent FR[h] of R[h]. */
5973 + /* Current node is the last child of F[h]. FR[h] != F[h]. */
5974 + if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
5975 + /*
5976 + * Calculate current parent of R[h], which is the right
5977 + * neighbor of F[h]. Calculate current common parent of
5978 + * R[h] and current node. Note that CFR[h] not equal
5979 + * FR[path_offset] and CFR[h] not equal F[h].
5980 + */
5981 + if ((ret =
5982 + get_far_parent(tb, h + 1, &curf, &curcf,
5983 + RIGHT_PARENTS)) != CARRY_ON)
5984 + return ret;
5985 + } else {
5986 + /* Current node is not the last child of its parent F[h]. */
5987 + curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
5988 + curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
5989 + get_bh(curf);
5990 + get_bh(curf);
5991 + tb->rkey[h] = position;
5994 + brelse(tb->FR[h]);
5995 + /* New initialization of FR[path_offset]. */
5996 + tb->FR[h] = curf;
5998 + brelse(tb->CFR[h]);
5999 + /* New initialization of CFR[path_offset]. */
6000 + tb->CFR[h] = curcf;
6002 + RFALSE((curf && !B_IS_IN_TREE(curf)) ||
6003 + (curcf && !B_IS_IN_TREE(curcf)),
6004 + "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);
6006 + return CARRY_ON;
6010 + * it is possible to remove a node as a result of shifting to
6011 + * neighbors even when we insert or paste an item.
6012 + */
6013 +static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
6014 + struct tree_balance *tb, int h)
6016 + struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
6017 + int levbytes = tb->insert_size[h];
6018 + struct item_head *ih;
6019 + struct reiserfs_key *r_key = NULL;
6021 + ih = item_head(Sh, 0);
6022 + if (tb->CFR[h])
6023 + r_key = internal_key(tb->CFR[h], tb->rkey[h]);
6025 + if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
6026 + /* shifting may merge items which might save space */
6028 + ((!h
6029 + && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0)
6031 + ((!h && r_key
6032 + && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
6033 + + ((h) ? KEY_SIZE : 0)) {
6034 + /* node can not be removed */
6035 + if (sfree >= levbytes) {
6036 + /* new item fits into node S[h] without any shifting */
6037 + if (!h)
6038 + tb->s0num =
6039 + B_NR_ITEMS(Sh) +
6040 + ((mode == M_INSERT) ? 1 : 0);
6041 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6042 + return NO_BALANCING_NEEDED;
6045 + PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
6046 + return !NO_BALANCING_NEEDED;
6050 + * Check whether current node S[h] is balanced when increasing its size by
6051 + * Inserting or Pasting.
6052 + * Calculate parameters for balancing for current level h.
6053 + * Parameters:
6054 + * tb tree_balance structure;
6055 + * h current level of the node;
6056 + * inum item number in S[h];
6057 + * mode i - insert, p - paste;
6058 + * Returns: 1 - schedule occurred;
6059 + * 0 - balancing for higher levels needed;
6060 + * -1 - no balancing for higher levels needed;
6061 + * -2 - no disk space.
6062 + */
6063 +/* ip means Inserting or Pasting */
6064 +static int ip_check_balance(struct tree_balance *tb, int h)
6066 + struct virtual_node *vn = tb->tb_vn;
6067 + /*
6068 +	 * Number of bytes that must be inserted into the buffer which
6069 +	 * contains the node being balanced (value is negative if bytes are deleted).
6070 + * The mnemonic is that the attempted change in node space used
6071 + * level is levbytes bytes.
6072 + */
6073 + int levbytes;
6074 + int ret;
6076 + int lfree, sfree, rfree /* free space in L, S and R */ ;
6078 + /*
6079 +	 * nver is short for number of vertices, and lnver is the number if
6080 +	 * we shift to the left, rnver is the number if we shift to the
6081 +	 * right, and lrnver is the number if we shift in both directions.
6082 +	 * The goal is to minimize first the number of vertices, and second,
6083 +	 * the number of vertices whose contents are changed by shifting,
6084 +	 * and third the number of uncached vertices whose contents are
6085 +	 * changed by shifting and must be read from disk.
6086 + */
6087 + int nver, lnver, rnver, lrnver;
6089 + /*
6090 + * used at leaf level only, S0 = S[0] is the node being balanced,
6091 + * sInum [ I = 0,1,2 ] is the number of items that will
6092 + * remain in node SI after balancing. S1 and S2 are new
6093 + * nodes that might be created.
6094 + */
6096 + /*
6097 +	 * we perform 8 calls to get_num_ver(). For each call we
6098 +	 * calculate five parameters, where the 4th parameter is s1bytes
6099 +	 * and the 5th is s2bytes
6101 +	 * s0num, s1num, s2num for 8 cases
6102 +	 * 0,1 - do not shift, and do not shift but bottle
6103 +	 * 2   - shift only whole items to the left
6104 +	 * 3   - shift to the left and bottle as much as possible
6105 +	 * 4,5 - shift to the right (whole items, and as much as possible)
6106 +	 * 6,7 - shift in both directions (whole items, and as much as possible)
6107 + */
6108 + short snum012[40] = { 0, };
6110 + /* Sh is the node whose balance is currently being checked */
6111 + struct buffer_head *Sh;
6113 + Sh = PATH_H_PBUFFER(tb->tb_path, h);
6114 + levbytes = tb->insert_size[h];
6116 + /* Calculate balance parameters for creating new root. */
6117 + if (!Sh) {
6118 + if (!h)
6119 + reiserfs_panic(tb->tb_sb, "vs-8210",
6120 + "S[0] can not be 0");
6121 + switch (ret = get_empty_nodes(tb, h)) {
6122 + /* no balancing for higher levels needed */
6123 + case CARRY_ON:
6124 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6125 + return NO_BALANCING_NEEDED;
6127 + case NO_DISK_SPACE:
6128 + case REPEAT_SEARCH:
6129 + return ret;
6130 + default:
6131 + reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
6132 + "return value of get_empty_nodes");
6136 + /* get parents of S[h] neighbors. */
6137 + ret = get_parents(tb, h);
6138 + if (ret != CARRY_ON)
6139 + return ret;
6141 + sfree = B_FREE_SPACE(Sh);
6143 + /* get free space of neighbors */
6144 + rfree = get_rfree(tb, h);
6145 + lfree = get_lfree(tb, h);
6147 + /* and new item fits into node S[h] without any shifting */
6148 + if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
6149 + NO_BALANCING_NEEDED)
6150 + return NO_BALANCING_NEEDED;
6152 + create_virtual_node(tb, h);
6154 + /*
6155 + * determine maximal number of items we can shift to the left
6156 + * neighbor (in tb structure) and the maximal number of bytes
6157 + * that can flow to the left neighbor from the left most liquid
6158 + * item that cannot be shifted from S[0] entirely (returned value)
6159 + */
6160 + check_left(tb, h, lfree);
6162 + /*
6163 + * determine maximal number of items we can shift to the right
6164 + * neighbor (in tb structure) and the maximal number of bytes
6165 + * that can flow to the right neighbor from the right most liquid
6166 + * item that cannot be shifted from S[0] entirely (returned value)
6167 + */
6168 + check_right(tb, h, rfree);
6170 + /*
6171 + * all contents of internal node S[h] can be moved into its
6172 + * neighbors, S[h] will be removed after balancing
6173 + */
6174 + if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
6175 + int to_r;
6177 + /*
6178 + * Since we are working on internal nodes, and our internal
6179 + * nodes have fixed size entries, then we can balance by the
6180 + * number of items rather than the space they consume. In this
6181 + * routine we set the left node equal to the right node,
6182 + * allowing a difference of less than or equal to 1 child
6183 + * pointer.
6184 + */
6185 + to_r =
6186 + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
6187 + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
6188 + tb->rnum[h]);
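+		/*
+		 * The arithmetic above can be read as follows: with
+		 * A = MAX_NR_KEY(Sh) + 1 - tb->lnum[h] child pointers already
+		 * in L[h], B = MAX_NR_KEY(Sh) + 1 - tb->rnum[h] already in
+		 * R[h], and N = vn->vn_nr_item + 1 pointers in S[h],
+		 * to_r = (A + B + N) / 2 - B, so R[h] ends up with
+		 * (A + B + N) / 2 pointers and L[h] with the rest - the
+		 * difference of at most one child pointer noted above.
+		 */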
6189 + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
6190 + -1, -1);
6191 + return CARRY_ON;
6194 + /*
6195 + * this checks balance condition, that any two neighboring nodes
6196 + * can not fit in one node
6197 + */
6198 + RFALSE(h &&
6199 + (tb->lnum[h] >= vn->vn_nr_item + 1 ||
6200 + tb->rnum[h] >= vn->vn_nr_item + 1),
6201 + "vs-8220: tree is not balanced on internal level");
6202 + RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
6203 + (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
6204 + "vs-8225: tree is not balanced on leaf level");
6206 + /*
6207 + * all contents of S[0] can be moved into its neighbors
6208 + * S[0] will be removed after balancing.
6209 + */
6210 + if (!h && is_leaf_removable(tb))
6211 + return CARRY_ON;
6213 + /*
6214 + * why do we perform this check here rather than earlier??
6215 + * Answer: we can win 1 node in some cases above. Moreover we
6216 +	 * checked it above, when we checked that S[0] is not removable
6217 + * in principle
6218 + */
6220 + /* new item fits into node S[h] without any shifting */
6221 + if (sfree >= levbytes) {
6222 + if (!h)
6223 + tb->s0num = vn->vn_nr_item;
6224 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6225 + return NO_BALANCING_NEEDED;
6229 + int lpar, rpar, nset, lset, rset, lrset;
6230 + /* regular overflowing of the node */
6232 + /*
6233 + * get_num_ver works in 2 modes (FLOW & NO_FLOW)
6234 + * lpar, rpar - number of items we can shift to left/right
6235 + * neighbor (including splitting item)
6236 + * nset, lset, rset, lrset - shows, whether flowing items
6237 + * give better packing
6238 + */
6239 +#define FLOW 1
6240 +#define NO_FLOW	0	/* do not do any splitting */
6242 + /* we choose one of the following */
6243 +#define NOTHING_SHIFT_NO_FLOW 0
6244 +#define NOTHING_SHIFT_FLOW 5
6245 +#define LEFT_SHIFT_NO_FLOW 10
6246 +#define LEFT_SHIFT_FLOW 15
6247 +#define RIGHT_SHIFT_NO_FLOW 20
6248 +#define RIGHT_SHIFT_FLOW 25
6249 +#define LR_SHIFT_NO_FLOW 30
6250 +#define LR_SHIFT_FLOW 35
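+		/*
+		 * So snum012[40] holds 8 cases x 5 shorts: each base offset
+		 * above points at a {s0num, s1num, s2num, s1bytes, s2bytes}
+		 * group that get_num_ver() fills and set_parameters() later
+		 * consumes via snum012 + nset/lset/rset/lrset.
+		 */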
6252 + lpar = tb->lnum[h];
6253 + rpar = tb->rnum[h];
6255 + /*
6256 + * calculate number of blocks S[h] must be split into when
6257 + * nothing is shifted to the neighbors, as well as number of
6258 + * items in each part of the split node (s012 numbers),
6259 + * and number of bytes (s1bytes) of the shared drop which
6260 +		 * flows to S1 if any
6261 + */
6262 + nset = NOTHING_SHIFT_NO_FLOW;
6263 + nver = get_num_ver(vn->vn_mode, tb, h,
6264 + 0, -1, h ? vn->vn_nr_item : 0, -1,
6265 + snum012, NO_FLOW);
6267 + if (!h) {
6268 + int nver1;
6270 + /*
6271 + * note, that in this case we try to bottle
6272 + * between S[0] and S1 (S1 - the first new node)
6273 + */
6274 + nver1 = get_num_ver(vn->vn_mode, tb, h,
6275 + 0, -1, 0, -1,
6276 + snum012 + NOTHING_SHIFT_FLOW, FLOW);
6277 + if (nver > nver1)
6278 + nset = NOTHING_SHIFT_FLOW, nver = nver1;
6281 + /*
6282 + * calculate number of blocks S[h] must be split into when
6283 + * l_shift_num first items and l_shift_bytes of the right
6284 + * most liquid item to be shifted are shifted to the left
6285 + * neighbor, as well as number of items in each part of the
6286 +		 * split node (s012 numbers), and number of bytes
6287 +		 * (s1bytes) of the shared drop which flows to S1 if any
6288 + */
6289 + lset = LEFT_SHIFT_NO_FLOW;
6290 + lnver = get_num_ver(vn->vn_mode, tb, h,
6291 + lpar - ((h || tb->lbytes == -1) ? 0 : 1),
6292 + -1, h ? vn->vn_nr_item : 0, -1,
6293 + snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
6294 + if (!h) {
6295 + int lnver1;
6297 + lnver1 = get_num_ver(vn->vn_mode, tb, h,
6298 + lpar -
6299 + ((tb->lbytes != -1) ? 1 : 0),
6300 + tb->lbytes, 0, -1,
6301 + snum012 + LEFT_SHIFT_FLOW, FLOW);
6302 + if (lnver > lnver1)
6303 + lset = LEFT_SHIFT_FLOW, lnver = lnver1;
6306 + /*
6307 + * calculate number of blocks S[h] must be split into when
6308 + * r_shift_num first items and r_shift_bytes of the left most
6309 + * liquid item to be shifted are shifted to the right neighbor,
6310 +		 * as well as number of items in each part of the split
6311 +		 * node (s012 numbers), and number of bytes (s1bytes) of the
6312 +		 * shared drop which flows to S1 if any
6313 + */
6314 + rset = RIGHT_SHIFT_NO_FLOW;
6315 + rnver = get_num_ver(vn->vn_mode, tb, h,
6316 + 0, -1,
6317 + h ? (vn->vn_nr_item - rpar) : (rpar -
6318 + ((tb->
6319 + rbytes !=
6320 + -1) ? 1 :
6321 + 0)), -1,
6322 + snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
6323 + if (!h) {
6324 + int rnver1;
6326 + rnver1 = get_num_ver(vn->vn_mode, tb, h,
6327 + 0, -1,
6328 + (rpar -
6329 + ((tb->rbytes != -1) ? 1 : 0)),
6330 + tb->rbytes,
6331 + snum012 + RIGHT_SHIFT_FLOW, FLOW);
6333 + if (rnver > rnver1)
6334 + rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
6337 + /*
6338 + * calculate number of blocks S[h] must be split into when
6339 + * items are shifted in both directions, as well as number
6340 +		 * of items in each part of the split node (s012 numbers),
6341 +		 * and number of bytes (s1bytes) of the shared drop which
6342 +		 * flows to S1 if any
6343 + */
6344 + lrset = LR_SHIFT_NO_FLOW;
6345 + lrnver = get_num_ver(vn->vn_mode, tb, h,
6346 + lpar - ((h || tb->lbytes == -1) ? 0 : 1),
6347 + -1,
6348 + h ? (vn->vn_nr_item - rpar) : (rpar -
6349 + ((tb->
6350 + rbytes !=
6351 + -1) ? 1 :
6352 + 0)), -1,
6353 + snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
6354 + if (!h) {
6355 + int lrnver1;
6357 + lrnver1 = get_num_ver(vn->vn_mode, tb, h,
6358 + lpar -
6359 + ((tb->lbytes != -1) ? 1 : 0),
6360 + tb->lbytes,
6361 + (rpar -
6362 + ((tb->rbytes != -1) ? 1 : 0)),
6363 + tb->rbytes,
6364 + snum012 + LR_SHIFT_FLOW, FLOW);
6365 + if (lrnver > lrnver1)
6366 + lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
6369 + /*
6370 + * Our general shifting strategy is:
6371 +		 * 1) to minimize the number of new nodes;
6372 +		 * 2) to minimize the number of neighbors involved in shifting;
6373 +		 * 3) to minimize the number of disk reads;
6374 + */
6376 + /* we can win TWO or ONE nodes by shifting in both directions */
6377 + if (lrnver < lnver && lrnver < rnver) {
6378 + RFALSE(h &&
6379 + (tb->lnum[h] != 1 ||
6380 + tb->rnum[h] != 1 ||
6381 + lrnver != 1 || rnver != 2 || lnver != 2
6382 + || h != 1), "vs-8230: bad h");
6383 + if (lrset == LR_SHIFT_FLOW)
6384 + set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
6385 + lrnver, snum012 + lrset,
6386 + tb->lbytes, tb->rbytes);
6387 + else
6388 + set_parameters(tb, h,
6389 + tb->lnum[h] -
6390 + ((tb->lbytes == -1) ? 0 : 1),
6391 + tb->rnum[h] -
6392 + ((tb->rbytes == -1) ? 0 : 1),
6393 + lrnver, snum012 + lrset, -1, -1);
6395 + return CARRY_ON;
6398 + /*
6399 + * if shifting doesn't lead to better packing
6400 + * then don't shift
6401 + */
6402 + if (nver == lrnver) {
6403 + set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
6404 + -1);
6405 + return CARRY_ON;
6408 + /*
6409 + * now we know that for better packing shifting in only one
6410 + * direction either to the left or to the right is required
6411 + */
6413 + /*
6414 + * if shifting to the left is better than
6415 + * shifting to the right
6416 + */
6417 + if (lnver < rnver) {
6418 + SET_PAR_SHIFT_LEFT;
6419 + return CARRY_ON;
6422 + /*
6423 + * if shifting to the right is better than
6424 + * shifting to the left
6425 + */
6426 + if (lnver > rnver) {
6427 + SET_PAR_SHIFT_RIGHT;
6428 + return CARRY_ON;
6431 + /*
6432 + * now shifting in either direction gives the same number
6433 + * of nodes and we can make use of the cached neighbors
6434 + */
6435 + if (is_left_neighbor_in_cache(tb, h)) {
6436 + SET_PAR_SHIFT_LEFT;
6437 + return CARRY_ON;
6440 + /*
6441 +		 * shift to the right regardless of whether the
6442 +		 * right neighbor is in cache or not
6443 + */
6444 + SET_PAR_SHIFT_RIGHT;
6445 + return CARRY_ON;
6450 + * Check whether current node S[h] is balanced when Decreasing its size by
6451 + * Deleting or Cutting for INTERNAL node of S+tree.
6452 + * Calculate parameters for balancing for current level h.
6453 + * Parameters:
6454 + * tb tree_balance structure;
6455 + * h current level of the node;
6456 + * inum item number in S[h];
6457 + *	mode	d - delete, c - cut;
6458 + * Returns: 1 - schedule occurred;
6459 + * 0 - balancing for higher levels needed;
6460 + * -1 - no balancing for higher levels needed;
6461 + * -2 - no disk space.
6463 + * Note: Items of internal nodes have fixed size, so the balance condition for
6464 + * the internal part of the S+tree is the same as for B-trees.
6465 + */
6466 +static int dc_check_balance_internal(struct tree_balance *tb, int h)
6468 + struct virtual_node *vn = tb->tb_vn;
6470 + /*
6471 + * Sh is the node whose balance is currently being checked,
6472 + * and Fh is its father.
6473 + */
6474 + struct buffer_head *Sh, *Fh;
6475 + int ret;
6476 + int lfree, rfree /* free space in L and R */ ;
6478 + Sh = PATH_H_PBUFFER(tb->tb_path, h);
6479 + Fh = PATH_H_PPARENT(tb->tb_path, h);
6481 + /*
6482 + * using tb->insert_size[h], which is negative in this case,
6483 + * create_virtual_node calculates:
6484 + * new_nr_item = number of items node would have if operation is
6485 + * performed without balancing (new_nr_item);
6486 + */
6487 + create_virtual_node(tb, h);
6489 + if (!Fh) { /* S[h] is the root. */
6490 + /* no balancing for higher levels needed */
6491 + if (vn->vn_nr_item > 0) {
6492 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6493 + return NO_BALANCING_NEEDED;
6495 + /*
6496 + * new_nr_item == 0.
6497 + * Current root will be deleted resulting in
6498 + * decrementing the tree height.
6499 + */
6500 + set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
6501 + return CARRY_ON;
6504 + if ((ret = get_parents(tb, h)) != CARRY_ON)
6505 + return ret;
6507 + /* get free space of neighbors */
6508 + rfree = get_rfree(tb, h);
6509 + lfree = get_lfree(tb, h);
6511 + /* determine maximal number of items we can fit into neighbors */
6512 + check_left(tb, h, lfree);
6513 + check_right(tb, h, rfree);
6515 + /*
6516 + * Balance condition for the internal node is valid.
6517 + * In this case we balance only if it leads to better packing.
6518 + */
6519 + if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {
6520 + /*
6521 + * Here we join S[h] with one of its neighbors,
6522 + * which is impossible with greater values of new_nr_item.
6523 + */
6524 + if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {
6525 + /* All contents of S[h] can be moved to L[h]. */
6526 + if (tb->lnum[h] >= vn->vn_nr_item + 1) {
6527 + int n;
6528 + int order_L;
6530 + order_L =
6531 + ((n =
6532 + PATH_H_B_ITEM_ORDER(tb->tb_path,
6533 + h)) ==
6534 + 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
6535 + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
6536 + (DC_SIZE + KEY_SIZE);
6537 + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
6538 + -1);
6539 + return CARRY_ON;
6542 + /* All contents of S[h] can be moved to R[h]. */
6543 + if (tb->rnum[h] >= vn->vn_nr_item + 1) {
6544 + int n;
6545 + int order_R;
6547 + order_R =
6548 + ((n =
6549 + PATH_H_B_ITEM_ORDER(tb->tb_path,
6550 + h)) ==
6551 + B_NR_ITEMS(Fh)) ? 0 : n + 1;
6552 + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
6553 + (DC_SIZE + KEY_SIZE);
6554 + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
6555 + -1);
6556 + return CARRY_ON;
6560 + /*
6561 + * All contents of S[h] can be moved to the neighbors
6562 + * (L[h] & R[h]).
6563 + */
6564 + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
6565 + int to_r;
6567 + to_r =
6568 + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
6569 + tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
6570 + (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
6571 + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
6572 + 0, NULL, -1, -1);
6573 + return CARRY_ON;
6576 + /* Balancing does not lead to better packing. */
6577 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6578 + return NO_BALANCING_NEEDED;
6581 + /*
6582 +	 * Current node contains an insufficient number of items.
6583 + * Balancing is required.
6584 + */
6585 + /* Check whether we can merge S[h] with left neighbor. */
6586 + if (tb->lnum[h] >= vn->vn_nr_item + 1)
6587 + if (is_left_neighbor_in_cache(tb, h)
6588 + || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
6589 + int n;
6590 + int order_L;
6592 + order_L =
6593 + ((n =
6594 + PATH_H_B_ITEM_ORDER(tb->tb_path,
6595 + h)) ==
6596 + 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
6597 + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
6598 + KEY_SIZE);
6599 + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
6600 + return CARRY_ON;
6603 + /* Check whether we can merge S[h] with right neighbor. */
6604 + if (tb->rnum[h] >= vn->vn_nr_item + 1) {
6605 + int n;
6606 + int order_R;
6608 + order_R =
6609 + ((n =
6610 + PATH_H_B_ITEM_ORDER(tb->tb_path,
6611 + h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
6612 + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
6613 + KEY_SIZE);
6614 + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
6615 + return CARRY_ON;
6618 + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
6619 + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
6620 + int to_r;
6622 + to_r =
6623 + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
6624 + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
6625 + tb->rnum[h]);
6626 + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
6627 + -1, -1);
6628 + return CARRY_ON;
6631 +	/* For internal nodes try to borrow an item from a neighbor */
6632 + RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
6634 +	/* Borrow one or two items from the cached neighbor */
6635 + if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
6636 + int from_l;
6638 + from_l =
6639 + (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
6640 + 1) / 2 - (vn->vn_nr_item + 1);
6641 + set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
6642 + return CARRY_ON;
6645 + set_parameters(tb, h, 0,
6646 + -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
6647 + 1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
6648 + return CARRY_ON;
6652 + * Check whether current node S[h] is balanced when Decreasing its size by
6653 + * Deleting or Truncating for LEAF node of S+tree.
6654 + * Calculate parameters for balancing for current level h.
6655 + * Parameters:
6656 + * tb tree_balance structure;
6657 + * h current level of the node;
6658 + * inum item number in S[h];
6659 + *	mode	d - delete, c - cut;
6660 + * Returns: 1 - schedule occurred;
6661 + * 0 - balancing for higher levels needed;
6662 + * -1 - no balancing for higher levels needed;
6663 + * -2 - no disk space.
6664 + */
6665 +static int dc_check_balance_leaf(struct tree_balance *tb, int h)
6667 + struct virtual_node *vn = tb->tb_vn;
6669 + /*
6670 +	 * Number of bytes that must be deleted from the buffer which
6671 +	 * contains the node being balanced (value is negative since
6672 +	 * bytes are deleted). The mnemonic is that the
6673 + * attempted change in node space used level is levbytes bytes.
6674 + */
6675 + int levbytes;
6677 + /* the maximal item size */
6678 + int maxsize, ret;
6680 + /*
6681 + * S0 is the node whose balance is currently being checked,
6682 + * and F0 is its father.
6683 + */
6684 + struct buffer_head *S0, *F0;
6685 + int lfree, rfree /* free space in L and R */ ;
6687 + S0 = PATH_H_PBUFFER(tb->tb_path, 0);
6688 + F0 = PATH_H_PPARENT(tb->tb_path, 0);
6690 + levbytes = tb->insert_size[h];
6692 + maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */
6694 + if (!F0) { /* S[0] is the root now. */
6696 + RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
6697 + "vs-8240: attempt to create empty buffer tree");
6699 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6700 + return NO_BALANCING_NEEDED;
6703 + if ((ret = get_parents(tb, h)) != CARRY_ON)
6704 + return ret;
6706 + /* get free space of neighbors */
6707 + rfree = get_rfree(tb, h);
6708 + lfree = get_lfree(tb, h);
6710 + create_virtual_node(tb, h);
6712 +	/* if 3 leaves can be merged into one, set parameters and return */
6713 + if (are_leaves_removable(tb, lfree, rfree))
6714 + return CARRY_ON;
6716 + /*
6717 + * determine maximal number of items we can shift to the left/right
6718 + * neighbor and the maximal number of bytes that can flow to the
6719 + * left/right neighbor from the left/right most liquid item that
6720 + * cannot be shifted from S[0] entirely
6721 + */
6722 + check_left(tb, h, lfree);
6723 + check_right(tb, h, rfree);
6725 + /* check whether we can merge S with left neighbor. */
6726 + if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
6727 + if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */
6728 + !tb->FR[h]) {
6730 + RFALSE(!tb->FL[h],
6731 + "vs-8245: dc_check_balance_leaf: FL[h] must exist");
6733 + /* set parameter to merge S[0] with its left neighbor */
6734 + set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
6735 + return CARRY_ON;
6738 + /* check whether we can merge S[0] with right neighbor. */
6739 + if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
6740 + set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
6741 + return CARRY_ON;
6744 + /*
6745 + * All contents of S[0] can be moved to the neighbors (L[0] & R[0]).
6746 + * Set parameters and return
6747 + */
6748 + if (is_leaf_removable(tb))
6749 + return CARRY_ON;
6751 + /* Balancing is not required. */
6752 + tb->s0num = vn->vn_nr_item;
6753 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6754 + return NO_BALANCING_NEEDED;
6758 + * Check whether current node S[h] is balanced when Decreasing its size by
6759 + * Deleting or Cutting.
6760 + * Calculate parameters for balancing for current level h.
6761 + * Parameters:
6762 + * tb tree_balance structure;
6763 + * h current level of the node;
6764 + * inum item number in S[h];
6765 + * mode d - delete, c - cut.
6766 + * Returns: 1 - schedule occurred;
6767 + * 0 - balancing for higher levels needed;
6768 + * -1 - no balancing for higher levels needed;
6769 + * -2 - no disk space.
6770 + */
6771 +static int dc_check_balance(struct tree_balance *tb, int h)
6773 + RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
6774 + "vs-8250: S is not initialized");
6776 + if (h)
6777 + return dc_check_balance_internal(tb, h);
6778 + else
6779 + return dc_check_balance_leaf(tb, h);
6783 + * Check whether current node S[h] is balanced.
6784 + * Calculate parameters for balancing for current level h.
6785 + * Parameters:
6787 + * tb tree_balance structure:
6789 + * tb is a large structure that must be read about in the header
6790 + * file at the same time as this procedure if the reader is
6791 + * to successfully understand this procedure
6793 + * h current level of the node;
6794 + * inum item number in S[h];
6795 + * mode i - insert, p - paste, d - delete, c - cut.
6796 + * Returns: 1 - schedule occurred;
6797 + * 0 - balancing for higher levels needed;
6798 + * -1 - no balancing for higher levels needed;
6799 + * -2 - no disk space.
6800 + */
6801 +static int check_balance(int mode,
6802 + struct tree_balance *tb,
6803 + int h,
6804 + int inum,
6805 + int pos_in_item,
6806 + struct item_head *ins_ih, const void *data)
6808 + struct virtual_node *vn;
6810 + vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
6811 + vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
6812 + vn->vn_mode = mode;
6813 + vn->vn_affected_item_num = inum;
6814 + vn->vn_pos_in_item = pos_in_item;
6815 + vn->vn_ins_ih = ins_ih;
6816 + vn->vn_data = data;
6818 + RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
6819 + "vs-8255: ins_ih can not be 0 in insert mode");
6821 + /* Calculate balance parameters when size of node is increasing. */
6822 + if (tb->insert_size[h] > 0)
6823 + return ip_check_balance(tb, h);
6825 + /* Calculate balance parameters when size of node is decreasing. */
6826 + return dc_check_balance(tb, h);
6829 +/* Check whether the parent at the path is really the parent of the current node. */
6830 +static int get_direct_parent(struct tree_balance *tb, int h)
6832 + struct buffer_head *bh;
6833 + struct treepath *path = tb->tb_path;
6834 + int position,
6835 + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
6837 + /* We are in the root or in the new root. */
6838 + if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
6840 + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
6841 + "PAP-8260: invalid offset in the path");
6843 + if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
6844 + b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
6845 + /* Root is not changed. */
6846 + PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
6847 + PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
6848 + return CARRY_ON;
6850 + /* Root is changed and we must recalculate the path. */
6851 + return REPEAT_SEARCH;
6854 + /* Parent in the path is not in the tree. */
6855 + if (!B_IS_IN_TREE
6856 + (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
6857 + return REPEAT_SEARCH;
6859 + if ((position =
6860 + PATH_OFFSET_POSITION(path,
6861 + path_offset - 1)) > B_NR_ITEMS(bh))
6862 + return REPEAT_SEARCH;
6864 + /* Parent in the path is not parent of the current node in the tree. */
6865 + if (B_N_CHILD_NUM(bh, position) !=
6866 + PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
6867 + return REPEAT_SEARCH;
6869 + if (buffer_locked(bh)) {
6870 + int depth = reiserfs_write_unlock_nested(tb->tb_sb);
6871 + __wait_on_buffer(bh);
6872 + reiserfs_write_lock_nested(tb->tb_sb, depth);
6873 + if (FILESYSTEM_CHANGED_TB(tb))
6874 + return REPEAT_SEARCH;
6877 + /*
6878 + * Parent in the path is unlocked and really parent
6879 + * of the current node.
6880 + */
6881 + return CARRY_ON;
6885 + * Using lnum[h] and rnum[h] we should determine what neighbors
6886 + * of S[h] we need in order to balance S[h], and get them
6887 + * if necessary.
6888 + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
6889 + * CARRY_ON - schedule didn't occur while the function worked;
6890 + */
6891 +static int get_neighbors(struct tree_balance *tb, int h)
6893 + int child_position,
6894 + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
6895 + unsigned long son_number;
6896 + struct super_block *sb = tb->tb_sb;
6897 + struct buffer_head *bh;
6898 + int depth;
6900 + PROC_INFO_INC(sb, get_neighbors[h]);
6902 + if (tb->lnum[h]) {
6903 + /* We need left neighbor to balance S[h]. */
6904 + PROC_INFO_INC(sb, need_l_neighbor[h]);
6905 + bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
6907 + RFALSE(bh == tb->FL[h] &&
6908 + !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
6909 + "PAP-8270: invalid position in the parent");
6911 + child_position =
6912 + (bh ==
6913 + tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
6914 + FL[h]);
6915 + son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
6916 + depth = reiserfs_write_unlock_nested(tb->tb_sb);
6917 + bh = sb_bread(sb, son_number);
6918 + reiserfs_write_lock_nested(tb->tb_sb, depth);
6919 + if (!bh)
6920 + return IO_ERROR;
6921 + if (FILESYSTEM_CHANGED_TB(tb)) {
6922 + brelse(bh);
6923 + PROC_INFO_INC(sb, get_neighbors_restart[h]);
6924 + return REPEAT_SEARCH;
6927 + RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
6928 + child_position > B_NR_ITEMS(tb->FL[h]) ||
6929 + B_N_CHILD_NUM(tb->FL[h], child_position) !=
6930 + bh->b_blocknr, "PAP-8275: invalid parent");
6931 + RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
6932 + RFALSE(!h &&
6933 + B_FREE_SPACE(bh) !=
6934 + MAX_CHILD_SIZE(bh) -
6935 + dc_size(B_N_CHILD(tb->FL[0], child_position)),
6936 + "PAP-8290: invalid child size of left neighbor");
6938 + brelse(tb->L[h]);
6939 + tb->L[h] = bh;
6942 + /* We need right neighbor to balance S[path_offset]. */
6943 + if (tb->rnum[h]) {
6944 + PROC_INFO_INC(sb, need_r_neighbor[h]);
6945 + bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
6947 + RFALSE(bh == tb->FR[h] &&
6948 + PATH_OFFSET_POSITION(tb->tb_path,
6949 + path_offset) >=
6950 + B_NR_ITEMS(bh),
6951 + "PAP-8295: invalid position in the parent");
6953 + child_position =
6954 + (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
6955 + son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
6956 + depth = reiserfs_write_unlock_nested(tb->tb_sb);
6957 + bh = sb_bread(sb, son_number);
6958 + reiserfs_write_lock_nested(tb->tb_sb, depth);
6959 + if (!bh)
6960 + return IO_ERROR;
6961 + if (FILESYSTEM_CHANGED_TB(tb)) {
6962 + brelse(bh);
6963 + PROC_INFO_INC(sb, get_neighbors_restart[h]);
6964 + return REPEAT_SEARCH;
6966 + brelse(tb->R[h]);
6967 + tb->R[h] = bh;
6969 + RFALSE(!h
6970 + && B_FREE_SPACE(bh) !=
6971 + MAX_CHILD_SIZE(bh) -
6972 + dc_size(B_N_CHILD(tb->FR[0], child_position)),
6973 + "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
6974 + B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
6975 + dc_size(B_N_CHILD(tb->FR[0], child_position)));
6978 + return CARRY_ON;
6981 +static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
6983 + int max_num_of_items;
6984 + int max_num_of_entries;
6985 + unsigned long blocksize = sb->s_blocksize;
6987 +#define MIN_NAME_LEN 1
6989 + max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
6990 + max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
6991 + (DEH_SIZE + MIN_NAME_LEN);
6993 + return sizeof(struct virtual_node) +
6994 + max(max_num_of_items * sizeof(struct virtual_item),
6995 + sizeof(struct virtual_item) +
6996 + struct_size_t(struct direntry_uarea, entry_sizes,
6997 + max_num_of_entries));
7001 + * maybe we should fail the balancing we are going to perform when
7002 + * kmalloc fails several times. But for now it will loop until
7003 + * kmalloc gets the required memory
7004 + */
7005 +static int get_mem_for_virtual_node(struct tree_balance *tb)
7007 + int check_fs = 0;
7008 + int size;
7009 + char *buf;
7011 + size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
7013 + /* we have to allocate more memory for virtual node */
7014 + if (size > tb->vn_buf_size) {
7015 + if (tb->vn_buf) {
7016 + /* free memory allocated before */
7017 + kfree(tb->vn_buf);
7018 + /* this is not needed if kfree is atomic */
7019 + check_fs = 1;
7022 +		/* the virtual node now requires more memory */
7023 + tb->vn_buf_size = size;
7025 + /* get memory for virtual item */
7026 + buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
7027 + if (!buf) {
7028 + /*
7029 + * getting memory with GFP_KERNEL priority may involve
7030 + * balancing now (due to indirect_to_direct conversion
7031 + * on dcache shrinking). So, release path and collected
7032 + * resources here
7033 + */
7034 + free_buffers_in_tb(tb);
7035 + buf = kmalloc(size, GFP_NOFS);
7036 + if (!buf) {
7037 + tb->vn_buf_size = 0;
7039 + tb->vn_buf = buf;
7040 + schedule();
7041 + return REPEAT_SEARCH;
7044 + tb->vn_buf = buf;
7047 + if (check_fs && FILESYSTEM_CHANGED_TB(tb))
7048 + return REPEAT_SEARCH;
7050 + return CARRY_ON;
7053 +#ifdef CONFIG_REISERFS_CHECK
7054 +static void tb_buffer_sanity_check(struct super_block *sb,
7055 + struct buffer_head *bh,
7056 + const char *descr, int level)
7058 + if (bh) {
7059 + if (atomic_read(&(bh->b_count)) <= 0)
7061 + reiserfs_panic(sb, "jmacd-1", "negative or zero "
7062 + "reference counter for buffer %s[%d] "
7063 + "(%b)", descr, level, bh);
7065 + if (!buffer_uptodate(bh))
7066 + reiserfs_panic(sb, "jmacd-2", "buffer is not up "
7067 + "to date %s[%d] (%b)",
7068 + descr, level, bh);
7070 + if (!B_IS_IN_TREE(bh))
7071 + reiserfs_panic(sb, "jmacd-3", "buffer is not "
7072 + "in tree %s[%d] (%b)",
7073 + descr, level, bh);
7075 + if (bh->b_bdev != sb->s_bdev)
7076 + reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
7077 + "device %s[%d] (%b)",
7078 + descr, level, bh);
7080 + if (bh->b_size != sb->s_blocksize)
7081 + reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
7082 + "blocksize %s[%d] (%b)",
7083 + descr, level, bh);
7085 + if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
7086 + reiserfs_panic(sb, "jmacd-6", "buffer block "
7087 + "number too high %s[%d] (%b)",
7088 + descr, level, bh);
7091 +#else
7092 +static void tb_buffer_sanity_check(struct super_block *sb,
7093 + struct buffer_head *bh,
7094 + const char *descr, int level)
7097 +#endif
7099 +static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
7101 + return reiserfs_prepare_for_journal(s, bh, 0);
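+/*
+ * Note on the helper above: reiserfs_prepare_for_journal() is called
+ * with wait == 0, so it returns non-zero only when the buffer could
+ * be prepared without blocking; a zero result is what makes the
+ * caller below treat the buffer as locked and retry.
+ */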
7104 +static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
7106 + struct buffer_head *locked;
7107 +#ifdef CONFIG_REISERFS_CHECK
7108 + int repeat_counter = 0;
7109 +#endif
7110 + int i;
7112 + do {
7114 + locked = NULL;
7116 + for (i = tb->tb_path->path_length;
7117 + !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
7118 + if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
7119 + /*
7120 + * if I understand correctly, we can only
7121 + * be sure the last buffer in the path is
7122 + * in the tree --clm
7123 + */
7124 +#ifdef CONFIG_REISERFS_CHECK
7125 + if (PATH_PLAST_BUFFER(tb->tb_path) ==
7126 + PATH_OFFSET_PBUFFER(tb->tb_path, i))
7127 + tb_buffer_sanity_check(tb->tb_sb,
7128 + PATH_OFFSET_PBUFFER
7129 + (tb->tb_path,
7130 + i), "S",
7131 + tb->tb_path->
7132 + path_length - i);
7133 +#endif
7134 + if (!clear_all_dirty_bits(tb->tb_sb,
7135 + PATH_OFFSET_PBUFFER
7136 + (tb->tb_path,
7137 + i))) {
7138 + locked =
7139 + PATH_OFFSET_PBUFFER(tb->tb_path,
7140 + i);
7145 + for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
7146 + i++) {
7148 + if (tb->lnum[i]) {
7150 + if (tb->L[i]) {
7151 + tb_buffer_sanity_check(tb->tb_sb,
7152 + tb->L[i],
7153 + "L", i);
7154 + if (!clear_all_dirty_bits
7155 + (tb->tb_sb, tb->L[i]))
7156 + locked = tb->L[i];
7159 + if (!locked && tb->FL[i]) {
7160 + tb_buffer_sanity_check(tb->tb_sb,
7161 + tb->FL[i],
7162 + "FL", i);
7163 + if (!clear_all_dirty_bits
7164 + (tb->tb_sb, tb->FL[i]))
7165 + locked = tb->FL[i];
7168 + if (!locked && tb->CFL[i]) {
7169 + tb_buffer_sanity_check(tb->tb_sb,
7170 + tb->CFL[i],
7171 + "CFL", i);
7172 + if (!clear_all_dirty_bits
7173 + (tb->tb_sb, tb->CFL[i]))
7174 + locked = tb->CFL[i];
7179 + if (!locked && (tb->rnum[i])) {
7181 + if (tb->R[i]) {
7182 + tb_buffer_sanity_check(tb->tb_sb,
7183 + tb->R[i],
7184 + "R", i);
7185 + if (!clear_all_dirty_bits
7186 + (tb->tb_sb, tb->R[i]))
7187 + locked = tb->R[i];
7190 + if (!locked && tb->FR[i]) {
7191 + tb_buffer_sanity_check(tb->tb_sb,
7192 + tb->FR[i],
7193 + "FR", i);
7194 + if (!clear_all_dirty_bits
7195 + (tb->tb_sb, tb->FR[i]))
7196 + locked = tb->FR[i];
7199 + if (!locked && tb->CFR[i]) {
7200 + tb_buffer_sanity_check(tb->tb_sb,
7201 + tb->CFR[i],
7202 + "CFR", i);
7203 + if (!clear_all_dirty_bits
7204 + (tb->tb_sb, tb->CFR[i]))
7205 + locked = tb->CFR[i];
7210 + /*
7211 + * as far as I can tell, this is not required. The FEB list
7212 + * seems to be full of newly allocated nodes, which will
7213 + * never be locked, dirty, or anything else.
7214 +	 * To be safe, I'm putting the checks and waits in.
7215 + * For the moment, they are needed to keep the code in
7216 + * journal.c from complaining about the buffer.
7217 + * That code is inside CONFIG_REISERFS_CHECK as well. --clm
7218 + */
7219 + for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
7220 + if (tb->FEB[i]) {
7221 + if (!clear_all_dirty_bits
7222 + (tb->tb_sb, tb->FEB[i]))
7223 + locked = tb->FEB[i];
7227 + if (locked) {
7228 + int depth;
7229 +#ifdef CONFIG_REISERFS_CHECK
7230 + repeat_counter++;
7231 + if ((repeat_counter % 10000) == 0) {
7232 + reiserfs_warning(tb->tb_sb, "reiserfs-8200",
7233 + "too many iterations waiting "
7234 + "for buffer to unlock "
7235 + "(%b)", locked);
7237 + /* Don't loop forever. Try to recover from possible error. */
7239 + return (FILESYSTEM_CHANGED_TB(tb)) ?
7240 + REPEAT_SEARCH : CARRY_ON;
7242 +#endif
7243 + depth = reiserfs_write_unlock_nested(tb->tb_sb);
7244 + __wait_on_buffer(locked);
7245 + reiserfs_write_lock_nested(tb->tb_sb, depth);
7246 + if (FILESYSTEM_CHANGED_TB(tb))
7247 + return REPEAT_SEARCH;
7250 + } while (locked);
7252 + return CARRY_ON;
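The loop above repeats a discipline that recurs throughout this patch: never
sleep on a buffer while holding the write lock; drop the lock, wait, re-take
it, and then use the filesystem generation counter to learn whether the tree
moved underneath. A compact user-space sketch of that discipline, assuming
pthread primitives as the lock stand-in (nothing here is reiserfs code):

	#include <pthread.h>

	static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
	static unsigned long fs_generation;	/* bumped on every tree change */

	/* stand-in for __wait_on_buffer(): may sleep for a long time */
	static void wait_for_buffer(void *locked)
	{
		(void)locked;
	}

	/* returns 1 when the caller must repeat its search (the analogue of
	 * REPEAT_SEARCH), 0 when it may carry on with its cached state */
	static int wait_and_revalidate(void *locked, unsigned long my_generation)
	{
		pthread_mutex_unlock(&tree_lock); /* cf. reiserfs_write_unlock_nested */
		wait_for_buffer(locked);
		pthread_mutex_lock(&tree_lock);   /* cf. reiserfs_write_lock_nested */

		return fs_generation != my_generation; /* cf. FILESYSTEM_CHANGED_TB */
	}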
7256 + * Prepare for balancing, that is
7257 + * get all necessary parents, and neighbors;
7258 + * analyze what and where should be moved;
7259 + * get sufficient number of new nodes;
7260 + * Balancing will start only after all the needed resources have been collected.
7262 + * When ported to SMP kernels, only at the last moment after all needed nodes
7263 + * are collected in cache, will the resources be locked using the usual
7264 + * textbook ordered lock acquisition algorithms. Note that ensuring that
7265 + * this code neither write locks what it does not need to write lock nor locks
7266 + * out of order will be a pain in the butt that could have been avoided.
7267 + * Grumble grumble. -Hans
7269 + * fix is meant in the sense of render unchanging
7271 + * Latency might be improved by first gathering a list of what buffers
7272 + * are needed and then getting as many of them in parallel as possible? -Hans
7274 + * Parameters:
7275 + * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
7276 + * tb tree_balance structure;
7277 + * inum item number in S[h];
7278 + * pos_in_item - comment this if you can
7279 + * ins_ih item head of item being inserted
7280 + * data inserted item or data to be pasted
7281 + * Returns: 1 - schedule occurred while the function worked;
7282 + * 0 - schedule didn't occur while the function worked;
7283 + * -1 - if no_disk_space
7284 + */
7286 +int fix_nodes(int op_mode, struct tree_balance *tb,
7287 + struct item_head *ins_ih, const void *data)
7289 + int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
7290 + int pos_in_item;
7292 + /*
7293 + * we set wait_tb_buffers_run when we have to restore any dirty
7294 +	 * bits cleared during wait_tb_buffers_until_unlocked
7295 + */
7296 + int wait_tb_buffers_run = 0;
7297 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
7299 + ++REISERFS_SB(tb->tb_sb)->s_fix_nodes;
7301 + pos_in_item = tb->tb_path->pos_in_item;
7303 + tb->fs_gen = get_generation(tb->tb_sb);
7305 + /*
7306 + * we prepare and log the super here so it will already be in the
7307 + * transaction when do_balance needs to change it.
7308 + * This way do_balance won't have to schedule when trying to prepare
7309 + * the super for logging
7310 + */
7311 + reiserfs_prepare_for_journal(tb->tb_sb,
7312 + SB_BUFFER_WITH_SB(tb->tb_sb), 1);
7313 + journal_mark_dirty(tb->transaction_handle,
7314 + SB_BUFFER_WITH_SB(tb->tb_sb));
7315 + if (FILESYSTEM_CHANGED_TB(tb))
7316 + return REPEAT_SEARCH;
7318 +	/* this is possible, e.g. in indirect_to_direct conversion */
7319 + if (buffer_locked(tbS0)) {
7320 + int depth = reiserfs_write_unlock_nested(tb->tb_sb);
7321 + __wait_on_buffer(tbS0);
7322 + reiserfs_write_lock_nested(tb->tb_sb, depth);
7323 + if (FILESYSTEM_CHANGED_TB(tb))
7324 + return REPEAT_SEARCH;
7326 +#ifdef CONFIG_REISERFS_CHECK
7327 + if (REISERFS_SB(tb->tb_sb)->cur_tb) {
7328 + print_cur_tb("fix_nodes");
7329 + reiserfs_panic(tb->tb_sb, "PAP-8305",
7330 + "there is pending do_balance");
7333 + if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
7334 + reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
7335 + "not uptodate at the beginning of fix_nodes "
7336 + "or not in tree (mode %c)",
7337 + tbS0, tbS0, op_mode);
7339 + /* Check parameters. */
7340 + switch (op_mode) {
7341 + case M_INSERT:
7342 + if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
7343 + reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
7344 + "item number %d (in S0 - %d) in case "
7345 + "of insert", item_num,
7346 + B_NR_ITEMS(tbS0));
7347 + break;
7348 + case M_PASTE:
7349 + case M_DELETE:
7350 + case M_CUT:
7351 + if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
7352 + print_block(tbS0, 0, -1, -1);
7353 + reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
7354 + "item number(%d); mode = %c "
7355 + "insert_size = %d",
7356 + item_num, op_mode,
7357 + tb->insert_size[0]);
7359 + break;
7360 + default:
7361 + reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
7362 + "of operation");
7364 +#endif
7366 + if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
7367 + /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */
7368 + return REPEAT_SEARCH;
7370 + /* Starting from the leaf level; for all levels h of the tree. */
7371 + for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
7372 + ret = get_direct_parent(tb, h);
7373 + if (ret != CARRY_ON)
7374 + goto repeat;
7376 + ret = check_balance(op_mode, tb, h, item_num,
7377 + pos_in_item, ins_ih, data);
7378 + if (ret != CARRY_ON) {
7379 + if (ret == NO_BALANCING_NEEDED) {
7380 + /* No balancing for higher levels needed. */
7381 + ret = get_neighbors(tb, h);
7382 + if (ret != CARRY_ON)
7383 + goto repeat;
7384 + if (h != MAX_HEIGHT - 1)
7385 + tb->insert_size[h + 1] = 0;
7386 + /*
7387 + * ok, analysis and resource gathering
7388 + * are complete
7389 + */
7390 + break;
7392 + goto repeat;
7395 + ret = get_neighbors(tb, h);
7396 + if (ret != CARRY_ON)
7397 + goto repeat;
7399 + /*
7400 + * No disk space, or schedule occurred and analysis may be
7401 + * invalid and needs to be redone.
7402 + */
7403 + ret = get_empty_nodes(tb, h);
7404 + if (ret != CARRY_ON)
7405 + goto repeat;
7407 + /*
7408 + * We have a positive insert size but no nodes exist on this
7409 + * level, this means that we are creating a new root.
7410 + */
7411 + if (!PATH_H_PBUFFER(tb->tb_path, h)) {
7413 + RFALSE(tb->blknum[h] != 1,
7414 + "PAP-8350: creating new empty root");
7416 + if (h < MAX_HEIGHT - 1)
7417 + tb->insert_size[h + 1] = 0;
7418 + } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
7419 + /*
7420 + * The tree needs to be grown, so this node S[h]
7421 + * which is the root node is split into two nodes,
7422 + * and a new node (S[h+1]) will be created to
7423 + * become the root node.
7424 + */
7425 + if (tb->blknum[h] > 1) {
7427 + RFALSE(h == MAX_HEIGHT - 1,
7428 + "PAP-8355: attempt to create too high of a tree");
7430 + tb->insert_size[h + 1] =
7431 + (DC_SIZE +
7432 + KEY_SIZE) * (tb->blknum[h] - 1) +
7433 + DC_SIZE;
7434 + } else if (h < MAX_HEIGHT - 1)
7435 + tb->insert_size[h + 1] = 0;
7436 + } else
7437 + tb->insert_size[h + 1] =
7438 + (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
7441 + ret = wait_tb_buffers_until_unlocked(tb);
7442 + if (ret == CARRY_ON) {
7443 + if (FILESYSTEM_CHANGED_TB(tb)) {
7444 + wait_tb_buffers_run = 1;
7445 + ret = REPEAT_SEARCH;
7446 + goto repeat;
7447 + } else {
7448 + return CARRY_ON;
7450 + } else {
7451 + wait_tb_buffers_run = 1;
7452 + goto repeat;
7455 +repeat:
7456 + /*
7457 +	 * fix_nodes was unable to finish its work because the filesystem
7458 +	 * changed under us, we ran out of free disk space, or an i/o
7459 +	 * failure occurred. In the first case the search will be
7460 +	 * repeated. For now, free all resources acquired so far except
7461 +	 * for the newly allocated nodes
7462 + */
7464 + int i;
7466 + /* Release path buffers. */
7467 + if (wait_tb_buffers_run) {
7468 + pathrelse_and_restore(tb->tb_sb, tb->tb_path);
7469 + } else {
7470 + pathrelse(tb->tb_path);
7472 + /* brelse all resources collected for balancing */
7473 + for (i = 0; i < MAX_HEIGHT; i++) {
7474 + if (wait_tb_buffers_run) {
7475 + reiserfs_restore_prepared_buffer(tb->tb_sb,
7476 + tb->L[i]);
7477 + reiserfs_restore_prepared_buffer(tb->tb_sb,
7478 + tb->R[i]);
7479 + reiserfs_restore_prepared_buffer(tb->tb_sb,
7480 + tb->FL[i]);
7481 + reiserfs_restore_prepared_buffer(tb->tb_sb,
7482 + tb->FR[i]);
7483 + reiserfs_restore_prepared_buffer(tb->tb_sb,
7484 + tb->
7485 + CFL[i]);
7486 + reiserfs_restore_prepared_buffer(tb->tb_sb,
7487 + tb->
7488 + CFR[i]);
7491 + brelse(tb->L[i]);
7492 + brelse(tb->R[i]);
7493 + brelse(tb->FL[i]);
7494 + brelse(tb->FR[i]);
7495 + brelse(tb->CFL[i]);
7496 + brelse(tb->CFR[i]);
7498 + tb->L[i] = NULL;
7499 + tb->R[i] = NULL;
7500 + tb->FL[i] = NULL;
7501 + tb->FR[i] = NULL;
7502 + tb->CFL[i] = NULL;
7503 + tb->CFR[i] = NULL;
7506 + if (wait_tb_buffers_run) {
7507 + for (i = 0; i < MAX_FEB_SIZE; i++) {
7508 + if (tb->FEB[i])
7509 + reiserfs_restore_prepared_buffer
7510 + (tb->tb_sb, tb->FEB[i]);
7513 + return ret;
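The return codes above imply a calling convention: retry the whole search
while fix_nodes reports REPEAT_SEARCH, balance on CARRY_ON, and otherwise
release everything. The skeleton below is a distilled sketch of that state
machine with invented stub names, not a verbatim reiserfs call site:

	#include <stdio.h>

	enum { CARRY_ON_, REPEAT_SEARCH_, NO_DISK_SPACE_ };

	static int rebuild_path(void)	{ return 0; }	/* re-run the tree search */
	static int fix_nodes_sketch(void)
	{
		static int calls;
		return calls++ ? CARRY_ON_ : REPEAT_SEARCH_; /* fail once, then ok */
	}
	static void do_balance_sketch(void)	{ puts("balance"); }
	static void unfix_nodes_sketch(void)	{ puts("release resources"); }

	int main(void)
	{
		int ret;

		do {
			rebuild_path();		/* path may be stale after a repeat */
			ret = fix_nodes_sketch(); /* gather parents, neighbors, nodes */
		} while (ret == REPEAT_SEARCH_);

		if (ret == CARRY_ON_)
			do_balance_sketch();	/* consumes the gathered resources */
		else
			unfix_nodes_sketch();	/* e.g. NO_DISK_SPACE_: clean up */
		return 0;
	}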
7518 +void unfix_nodes(struct tree_balance *tb)
7520 + int i;
7522 + /* Release path buffers. */
7523 + pathrelse_and_restore(tb->tb_sb, tb->tb_path);
7525 + /* brelse all resources collected for balancing */
7526 + for (i = 0; i < MAX_HEIGHT; i++) {
7527 + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
7528 + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
7529 + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
7530 + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
7531 + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
7532 + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
7534 + brelse(tb->L[i]);
7535 + brelse(tb->R[i]);
7536 + brelse(tb->FL[i]);
7537 + brelse(tb->FR[i]);
7538 + brelse(tb->CFL[i]);
7539 + brelse(tb->CFR[i]);
7542 + /* deal with list of allocated (used and unused) nodes */
7543 + for (i = 0; i < MAX_FEB_SIZE; i++) {
7544 + if (tb->FEB[i]) {
7545 + b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
7546 + /*
7547 +			 * de-allocate the block, which was not used by
7548 +			 * balancing, and bforget the buffer for it
7549 + */
7550 + brelse(tb->FEB[i]);
7551 + reiserfs_free_block(tb->transaction_handle, NULL,
7552 + blocknr, 0);
7554 + if (tb->used[i]) {
7555 +			/* release buffers used as new nodes, including a new root */
7556 + brelse(tb->used[i]);
7560 + kfree(tb->vn_buf);
7563 diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
7564 new file mode 100644
7565 index 000000000000..7a26c4fe6c46
7566 --- /dev/null
7567 +++ b/fs/reiserfs/hashes.c
7568 @@ -0,0 +1,177 @@
7571 + * Keyed 32-bit hash function using TEA in a Davies-Meyer construction
7572 + * H0 = Key
7573 + * Hi = E Mi(Hi-1) + Hi-1
7575 + * (see Applied Cryptography, 2nd edition, p448).
7577 + * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
7579 + * Jeremy has agreed to the contents of reiserfs/README. -Hans
7580 + * Yura's function is added (04/07/2000)
7581 + */
7583 +#include <linux/kernel.h>
7584 +#include "reiserfs.h"
7585 +#include <asm/types.h>
7587 +#define DELTA 0x9E3779B9
7588 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
7589 +#define PARTROUNDS 6 /* 6 gets complete mixing */
7591 +/* a, b, c, d - data; h0, h1 - accumulated hash */
7592 +#define TEACORE(rounds) \
7593 + do { \
7594 + u32 sum = 0; \
7595 + int n = rounds; \
7596 + u32 b0, b1; \
7598 + b0 = h0; \
7599 + b1 = h1; \
7601 + do \
7602 + { \
7603 + sum += DELTA; \
7604 + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
7605 + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
7606 + } while(--n); \
7608 + h0 += b0; \
7609 + h1 += b1; \
7610 + } while(0)
7612 +u32 keyed_hash(const signed char *msg, int len)
7614 + u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 };
7616 + u32 h0 = k[0], h1 = k[1];
7617 + u32 a, b, c, d;
7618 + u32 pad;
7619 + int i;
7621 + /* assert(len >= 0 && len < 256); */
7623 + pad = (u32) len | ((u32) len << 8);
7624 + pad |= pad << 16;
7626 + while (len >= 16) {
7627 + a = (u32) msg[0] |
7628 + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
7629 + b = (u32) msg[4] |
7630 + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
7631 + c = (u32) msg[8] |
7632 + (u32) msg[9] << 8 |
7633 + (u32) msg[10] << 16 | (u32) msg[11] << 24;
7634 + d = (u32) msg[12] |
7635 + (u32) msg[13] << 8 |
7636 + (u32) msg[14] << 16 | (u32) msg[15] << 24;
7638 + TEACORE(PARTROUNDS);
7640 + len -= 16;
7641 + msg += 16;
7644 + if (len >= 12) {
7645 + a = (u32) msg[0] |
7646 + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
7647 + b = (u32) msg[4] |
7648 + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
7649 + c = (u32) msg[8] |
7650 + (u32) msg[9] << 8 |
7651 + (u32) msg[10] << 16 | (u32) msg[11] << 24;
7653 + d = pad;
7654 + for (i = 12; i < len; i++) {
7655 + d <<= 8;
7656 + d |= msg[i];
7658 + } else if (len >= 8) {
7659 + a = (u32) msg[0] |
7660 + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
7661 + b = (u32) msg[4] |
7662 + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
7664 + c = d = pad;
7665 + for (i = 8; i < len; i++) {
7666 + c <<= 8;
7667 + c |= msg[i];
7669 + } else if (len >= 4) {
7670 + a = (u32) msg[0] |
7671 + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
7673 + b = c = d = pad;
7674 + for (i = 4; i < len; i++) {
7675 + b <<= 8;
7676 + b |= msg[i];
7678 + } else {
7679 + a = b = c = d = pad;
7680 + for (i = 0; i < len; i++) {
7681 + a <<= 8;
7682 + a |= msg[i];
7686 + TEACORE(FULLROUNDS);
7688 +/* return 0;*/
7689 + return h0 ^ h1;
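To make the byte-packing above concrete: each 32-bit word is assembled
little-endian from the message, and a short tail is padded with a word derived
from the length. A small user-space fragment that reproduces the first word
and the pad word for a 5-byte name (nothing here is reiserfs code):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	static uint32_t le32_from(const signed char *p)
	{
		/* same little-endian packing as keyed_hash(), including the
		 * sign-extension quirk of 'signed char' for bytes >= 0x80 */
		return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
		       (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
	}

	int main(void)
	{
		const signed char name[] = "linux";
		uint32_t len = (uint32_t)strlen((const char *)name);
		uint32_t pad = len | (len << 8);

		pad |= pad << 16;	/* len = 5 gives pad = 0x05050505 */
		/* "linu" becomes word a; the 5th byte is shifted into b,
		 * which starts out as the pad word */
		printf("a = %08x, pad = %08x\n", le32_from(name), pad);
		return 0;
	}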
7693 + * What follows in this file is copyright 2000 by Hans Reiser, and the
7694 + * licensing of what follows is governed by reiserfs/README
7695 + */
7696 +u32 yura_hash(const signed char *msg, int len)
7698 + int j, pow;
7699 + u32 a, c;
7700 + int i;
7702 + for (pow = 1, i = 1; i < len; i++)
7703 + pow = pow * 10;
7705 + if (len == 1)
7706 + a = msg[0] - 48;
7707 + else
7708 + a = (msg[0] - 48) * pow;
7710 + for (i = 1; i < len; i++) {
7711 + c = msg[i] - 48;
7712 + for (pow = 1, j = i; j < len - 1; j++)
7713 + pow = pow * 10;
7714 + a = a + c * pow;
7717 + for (; i < 40; i++) {
7718 + c = '0' - 48;
7719 + for (pow = 1, j = i; j < len - 1; j++)
7720 + pow = pow * 10;
7721 + a = a + c * pow;
7724 + for (; i < 256; i++) {
7725 + c = i;
7726 + for (pow = 1, j = i; j < len - 1; j++)
7727 + pow = pow * 10;
7728 + a = a + c * pow;
7731 + a = a << 7;
7732 + return a;
7735 +u32 r5_hash(const signed char *msg, int len)
7737 + u32 a = 0;
7738 + while (*msg) {
7739 + a += *msg << 4;
7740 + a += *msg >> 4;
7741 + a *= 11;
7742 + msg++;
7744 + return a;
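One detail worth noting: r5_hash() receives len but never uses it; it walks
the name until the first NUL byte. For illustration only, a length-bounded
variant is sketched below. It is not the on-disk r5 hash and must not replace
it, since existing directories depend on the exact function above:

	#include <stdint.h>

	static uint32_t r5_hash_bounded(const signed char *msg, int len)
	{
		uint32_t a = 0;
		int i;

		for (i = 0; i < len; i++) {
			a += msg[i] << 4;	/* same mixing as r5_hash() */
			a += msg[i] >> 4;
			a *= 11;
		}
		return a;
	}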
7746 diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
7747 new file mode 100644
7748 index 000000000000..5db6f45b3fed
7749 --- /dev/null
7750 +++ b/fs/reiserfs/ibalance.c
7751 @@ -0,0 +1,1161 @@
7753 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
7754 + */
7756 +#include <linux/uaccess.h>
7757 +#include <linux/string.h>
7758 +#include <linux/time.h>
7759 +#include "reiserfs.h"
7760 +#include <linux/buffer_head.h>
7762 +/* this is the one and only function that is used outside (in do_balance.c) */
7763 +int balance_internal(struct tree_balance *,
7764 + int, int, struct item_head *, struct buffer_head **);
7767 + * modes of internal_shift_left, internal_shift_right and
7768 + * internal_insert_childs
7769 + */
7770 +#define INTERNAL_SHIFT_FROM_S_TO_L 0
7771 +#define INTERNAL_SHIFT_FROM_R_TO_S 1
7772 +#define INTERNAL_SHIFT_FROM_L_TO_S 2
7773 +#define INTERNAL_SHIFT_FROM_S_TO_R 3
7774 +#define INTERNAL_INSERT_TO_S 4
7775 +#define INTERNAL_INSERT_TO_L 5
7776 +#define INTERNAL_INSERT_TO_R 6
7778 +static void internal_define_dest_src_infos(int shift_mode,
7779 + struct tree_balance *tb,
7780 + int h,
7781 + struct buffer_info *dest_bi,
7782 + struct buffer_info *src_bi,
7783 + int *d_key, struct buffer_head **cf)
7785 + memset(dest_bi, 0, sizeof(struct buffer_info));
7786 + memset(src_bi, 0, sizeof(struct buffer_info));
7787 + /* define dest, src, dest parent, dest position */
7788 + switch (shift_mode) {
7790 + /* used in internal_shift_left */
7791 + case INTERNAL_SHIFT_FROM_S_TO_L:
7792 + src_bi->tb = tb;
7793 + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
7794 + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
7795 + src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
7796 + dest_bi->tb = tb;
7797 + dest_bi->bi_bh = tb->L[h];
7798 + dest_bi->bi_parent = tb->FL[h];
7799 + dest_bi->bi_position = get_left_neighbor_position(tb, h);
7800 + *d_key = tb->lkey[h];
7801 + *cf = tb->CFL[h];
7802 + break;
7803 + case INTERNAL_SHIFT_FROM_L_TO_S:
7804 + src_bi->tb = tb;
7805 + src_bi->bi_bh = tb->L[h];
7806 + src_bi->bi_parent = tb->FL[h];
7807 + src_bi->bi_position = get_left_neighbor_position(tb, h);
7808 + dest_bi->tb = tb;
7809 + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
7810 + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
7811 +		/* dest position is the analog of dest->b_item_order */
7812 + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
7813 + *d_key = tb->lkey[h];
7814 + *cf = tb->CFL[h];
7815 + break;
7817 + /* used in internal_shift_left */
7818 + case INTERNAL_SHIFT_FROM_R_TO_S:
7819 + src_bi->tb = tb;
7820 + src_bi->bi_bh = tb->R[h];
7821 + src_bi->bi_parent = tb->FR[h];
7822 + src_bi->bi_position = get_right_neighbor_position(tb, h);
7823 + dest_bi->tb = tb;
7824 + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
7825 + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
7826 + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
7827 + *d_key = tb->rkey[h];
7828 + *cf = tb->CFR[h];
7829 + break;
7831 + case INTERNAL_SHIFT_FROM_S_TO_R:
7832 + src_bi->tb = tb;
7833 + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
7834 + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
7835 + src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
7836 + dest_bi->tb = tb;
7837 + dest_bi->bi_bh = tb->R[h];
7838 + dest_bi->bi_parent = tb->FR[h];
7839 + dest_bi->bi_position = get_right_neighbor_position(tb, h);
7840 + *d_key = tb->rkey[h];
7841 + *cf = tb->CFR[h];
7842 + break;
7844 + case INTERNAL_INSERT_TO_L:
7845 + dest_bi->tb = tb;
7846 + dest_bi->bi_bh = tb->L[h];
7847 + dest_bi->bi_parent = tb->FL[h];
7848 + dest_bi->bi_position = get_left_neighbor_position(tb, h);
7849 + break;
7851 + case INTERNAL_INSERT_TO_S:
7852 + dest_bi->tb = tb;
7853 + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
7854 + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
7855 + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
7856 + break;
7858 + case INTERNAL_INSERT_TO_R:
7859 + dest_bi->tb = tb;
7860 + dest_bi->bi_bh = tb->R[h];
7861 + dest_bi->bi_parent = tb->FR[h];
7862 + dest_bi->bi_position = get_right_neighbor_position(tb, h);
7863 + break;
7865 + default:
7866 + reiserfs_panic(tb->tb_sb, "ibalance-1",
7867 + "shift type is unknown (%d)",
7868 + shift_mode);
7873 + * Insert count node pointers into buffer cur before position to + 1.
7874 + * Insert count items into buffer cur before position to.
7875 + * Items and node pointers are specified by inserted and bh respectively.
7876 + */
7877 +static void internal_insert_childs(struct buffer_info *cur_bi,
7878 + int to, int count,
7879 + struct item_head *inserted,
7880 + struct buffer_head **bh)
7882 + struct buffer_head *cur = cur_bi->bi_bh;
7883 + struct block_head *blkh;
7884 + int nr;
7885 + struct reiserfs_key *ih;
7886 + struct disk_child new_dc[2];
7887 + struct disk_child *dc;
7888 + int i;
7890 + if (count <= 0)
7891 + return;
7893 + blkh = B_BLK_HEAD(cur);
7894 + nr = blkh_nr_item(blkh);
7896 + RFALSE(count > 2, "too many children (%d) are to be inserted", count);
7897 + RFALSE(B_FREE_SPACE(cur) < count * (KEY_SIZE + DC_SIZE),
7898 + "no enough free space (%d), needed %d bytes",
7899 + B_FREE_SPACE(cur), count * (KEY_SIZE + DC_SIZE));
7901 + /* prepare space for count disk_child */
7902 + dc = B_N_CHILD(cur, to + 1);
7904 + memmove(dc + count, dc, (nr + 1 - (to + 1)) * DC_SIZE);
7906 +	/* copy the to-be-inserted disk children */
7907 + for (i = 0; i < count; i++) {
7908 + put_dc_size(&new_dc[i],
7909 + MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i]));
7910 + put_dc_block_number(&new_dc[i], bh[i]->b_blocknr);
7912 + memcpy(dc, new_dc, DC_SIZE * count);
7914 + /* prepare space for count items */
7915 + ih = internal_key(cur, ((to == -1) ? 0 : to));
7917 + memmove(ih + count, ih,
7918 + (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE);
7920 + /* copy item headers (keys) */
7921 + memcpy(ih, inserted, KEY_SIZE);
7922 + if (count > 1)
7923 + memcpy(ih + 1, inserted + 1, KEY_SIZE);
7925 + /* sizes, item number */
7926 + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + count);
7927 + set_blkh_free_space(blkh,
7928 + blkh_free_space(blkh) - count * (DC_SIZE +
7929 + KEY_SIZE));
7931 + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
7933 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
7934 + check_internal(cur);
7935 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
7937 + if (cur_bi->bi_parent) {
7938 + struct disk_child *t_dc =
7939 + B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
7940 + put_dc_size(t_dc,
7941 + dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE)));
7942 + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
7943 + 0);
7945 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
7946 + check_internal(cur_bi->bi_parent);
7947 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
7953 + * Delete del_num items and node pointers from buffer cur starting from
7954 + * the first_i'th item and first_p'th pointers respectively.
7955 + */
7956 +static void internal_delete_pointers_items(struct buffer_info *cur_bi,
7957 + int first_p,
7958 + int first_i, int del_num)
7960 + struct buffer_head *cur = cur_bi->bi_bh;
7961 + int nr;
7962 + struct block_head *blkh;
7963 + struct reiserfs_key *key;
7964 + struct disk_child *dc;
7966 + RFALSE(cur == NULL, "buffer is 0");
7967 + RFALSE(del_num < 0,
7968 + "negative number of items (%d) can not be deleted", del_num);
7969 + RFALSE(first_p < 0 || first_p + del_num > B_NR_ITEMS(cur) + 1
7970 + || first_i < 0,
7971 + "first pointer order (%d) < 0 or "
7972 + "no so many pointers (%d), only (%d) or "
7973 + "first key order %d < 0", first_p, first_p + del_num,
7974 + B_NR_ITEMS(cur) + 1, first_i);
7975 + if (del_num == 0)
7976 + return;
7978 + blkh = B_BLK_HEAD(cur);
7979 + nr = blkh_nr_item(blkh);
7981 + if (first_p == 0 && del_num == nr + 1) {
7982 + RFALSE(first_i != 0,
7983 + "1st deleted key must have order 0, not %d", first_i);
7984 + make_empty_node(cur_bi);
7985 + return;
7988 + RFALSE(first_i + del_num > B_NR_ITEMS(cur),
7989 + "first_i = %d del_num = %d "
7990 + "no so many keys (%d) in the node (%b)(%z)",
7991 + first_i, del_num, first_i + del_num, cur, cur);
7993 + /* deleting */
7994 + dc = B_N_CHILD(cur, first_p);
7996 + memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE);
7997 + key = internal_key(cur, first_i);
7998 + memmove(key, key + del_num,
7999 + (nr - first_i - del_num) * KEY_SIZE + (nr + 1 -
8000 + del_num) * DC_SIZE);
8002 + /* sizes, item number */
8003 + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
8004 + set_blkh_free_space(blkh,
8005 + blkh_free_space(blkh) +
8006 + (del_num * (KEY_SIZE + DC_SIZE)));
8008 + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
8009 + /*&&&&&&&&&&&&&&&&&&&&&&& */
8010 + check_internal(cur);
8011 + /*&&&&&&&&&&&&&&&&&&&&&&& */
8013 + if (cur_bi->bi_parent) {
8014 + struct disk_child *t_dc;
8015 + t_dc = B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
8016 + put_dc_size(t_dc,
8017 + dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE)));
8019 + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
8020 + 0);
8021 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8022 + check_internal(cur_bi->bi_parent);
8023 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8027 +/* delete n node pointers and items starting from given position */
8028 +static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n)
8030 + int i_from;
8032 + i_from = (from == 0) ? from : from - 1;
8034 + /*
8035 + * delete n pointers starting from `from' position in CUR;
8036 + * delete n keys starting from 'i_from' position in CUR;
8037 + */
8038 + internal_delete_pointers_items(cur_bi, from, i_from, n);
8042 + * copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer
8043 + * dest
8044 + * last_first == FIRST_TO_LAST means that we copy first items
8045 + * from src to tail of dest
8046 + * last_first == LAST_TO_FIRST means that we copy last items
8047 + * from src to head of dest
8048 + */
8049 +static void internal_copy_pointers_items(struct buffer_info *dest_bi,
8050 + struct buffer_head *src,
8051 + int last_first, int cpy_num)
8053 + /*
8054 + * ATTENTION! Number of node pointers in DEST is equal to number
8055 +	 * of items in DEST, as the delimiting key has already been
8056 +	 * inserted into buffer dest.
8057 + */
8058 + struct buffer_head *dest = dest_bi->bi_bh;
8059 + int nr_dest, nr_src;
8060 + int dest_order, src_order;
8061 + struct block_head *blkh;
8062 + struct reiserfs_key *key;
8063 + struct disk_child *dc;
8065 + nr_src = B_NR_ITEMS(src);
8067 + RFALSE(dest == NULL || src == NULL,
8068 + "src (%p) or dest (%p) buffer is 0", src, dest);
8069 + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
8070 + "invalid last_first parameter (%d)", last_first);
8071 + RFALSE(nr_src < cpy_num - 1,
8072 + "no so many items (%d) in src (%d)", cpy_num, nr_src);
8073 + RFALSE(cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num);
8074 + RFALSE(cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest),
8075 + "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)",
8076 + cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest));
8078 + if (cpy_num == 0)
8079 + return;
8081 +	/* copying */
8082 + blkh = B_BLK_HEAD(dest);
8083 + nr_dest = blkh_nr_item(blkh);
8085 +	/* pick copy direction: head of src to tail of dest, or vice versa */
8086 +	if (last_first == LAST_TO_FIRST) {
8087 +		dest_order = 0;
8088 +		src_order = nr_src - cpy_num + 1;
8089 +	} else {
8090 +		dest_order = nr_dest;
8091 +		src_order = 0;
8092 +	}
8093 + /* prepare space for cpy_num pointers */
8094 + dc = B_N_CHILD(dest, dest_order);
8096 + memmove(dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE);
8098 + /* insert pointers */
8099 + memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num);
8101 + /* prepare space for cpy_num - 1 item headers */
8102 + key = internal_key(dest, dest_order);
8103 + memmove(key + cpy_num - 1, key,
8104 + KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest +
8105 + cpy_num));
8107 + /* insert headers */
8108 + memcpy(key, internal_key(src, src_order), KEY_SIZE * (cpy_num - 1));
8110 + /* sizes, item number */
8111 + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1));
8112 + set_blkh_free_space(blkh,
8113 + blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) +
8114 + DC_SIZE * cpy_num));
8116 + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
8118 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8119 + check_internal(dest);
8120 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8122 + if (dest_bi->bi_parent) {
8123 + struct disk_child *t_dc;
8124 + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
8125 + put_dc_size(t_dc,
8126 + dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) +
8127 + DC_SIZE * cpy_num));
8129 + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
8130 + 0);
8131 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8132 + check_internal(dest_bi->bi_parent);
8133 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8139 + * Copy cpy_num node pointers and cpy_num - 1 items from buffer src to
8140 + * buffer dest.
8141 + * Delete cpy_num - del_par items and node pointers from buffer src.
8142 + * last_first == FIRST_TO_LAST means, that we copy/delete first items from src.
8143 + * last_first == LAST_TO_FIRST means, that we copy/delete last items from src.
8144 + */
8145 +static void internal_move_pointers_items(struct buffer_info *dest_bi,
8146 + struct buffer_info *src_bi,
8147 + int last_first, int cpy_num,
8148 + int del_par)
8150 + int first_pointer;
8151 + int first_item;
8153 + internal_copy_pointers_items(dest_bi, src_bi->bi_bh, last_first,
8154 + cpy_num);
8156 + if (last_first == FIRST_TO_LAST) { /* shift_left occurs */
8157 + first_pointer = 0;
8158 + first_item = 0;
8159 + /*
8160 + * delete cpy_num - del_par pointers and keys starting for
8161 + * pointers with first_pointer, for key - with first_item
8162 + */
8163 + internal_delete_pointers_items(src_bi, first_pointer,
8164 + first_item, cpy_num - del_par);
8165 + } else { /* shift_right occurs */
8166 + int i, j;
8168 +		j = B_NR_ITEMS(src_bi->bi_bh);
8169 +		if (cpy_num - del_par == j + 1)
8170 +			i = 0;
8171 +		else
8172 +			i = j - cpy_num + del_par;
8173 + internal_delete_pointers_items(src_bi,
8174 + j + 1 - cpy_num + del_par, i,
8175 + cpy_num - del_par);
8179 +/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */
8180 +static void internal_insert_key(struct buffer_info *dest_bi,
8181 + /* insert key before key with n_dest number */
8182 + int dest_position_before,
8183 + struct buffer_head *src, int src_position)
8185 + struct buffer_head *dest = dest_bi->bi_bh;
8186 + int nr;
8187 + struct block_head *blkh;
8188 + struct reiserfs_key *key;
8190 + RFALSE(dest == NULL || src == NULL,
8191 + "source(%p) or dest(%p) buffer is 0", src, dest);
8192 + RFALSE(dest_position_before < 0 || src_position < 0,
8193 + "source(%d) or dest(%d) key number less than 0",
8194 + src_position, dest_position_before);
8195 + RFALSE(dest_position_before > B_NR_ITEMS(dest) ||
8196 + src_position >= B_NR_ITEMS(src),
8197 + "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))",
8198 + dest_position_before, B_NR_ITEMS(dest),
8199 + src_position, B_NR_ITEMS(src));
8200 + RFALSE(B_FREE_SPACE(dest) < KEY_SIZE,
8201 + "no enough free space (%d) in dest buffer", B_FREE_SPACE(dest));
8203 + blkh = B_BLK_HEAD(dest);
8204 + nr = blkh_nr_item(blkh);
8206 + /* prepare space for inserting key */
8207 + key = internal_key(dest, dest_position_before);
8208 + memmove(key + 1, key,
8209 + (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE);
8211 + /* insert key */
8212 + memcpy(key, internal_key(src, src_position), KEY_SIZE);
8214 + /* Change dirt, free space, item number fields. */
8216 + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
8217 + set_blkh_free_space(blkh, blkh_free_space(blkh) - KEY_SIZE);
8219 + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
8221 + if (dest_bi->bi_parent) {
8222 + struct disk_child *t_dc;
8223 + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
8224 + put_dc_size(t_dc, dc_size(t_dc) + KEY_SIZE);
8226 + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
8227 + 0);
8232 + * Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
8233 + * Copy pointer_amount node pointers and pointer_amount - 1 items from
8234 + * buffer src to buffer dest.
8235 + * Replace d_key'th key in buffer cfl.
8236 + * Delete pointer_amount items and node pointers from buffer src.
8237 + */
8238 +/* this can be invoked both to shift from S to L and from R to S */
8239 +static void internal_shift_left(
8240 + /*
8241 +	 * INTERNAL_SHIFT_FROM_S_TO_L | INTERNAL_SHIFT_FROM_R_TO_S
8242 + */
8243 + int mode,
8244 + struct tree_balance *tb,
8245 + int h, int pointer_amount)
8247 + struct buffer_info dest_bi, src_bi;
8248 + struct buffer_head *cf;
8249 + int d_key_position;
8251 + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
8252 + &d_key_position, &cf);
8254 + /*printk("pointer_amount = %d\n",pointer_amount); */
8256 + if (pointer_amount) {
8257 + /*
8258 + * insert delimiting key from common father of dest and
8259 + * src to node dest into position B_NR_ITEM(dest)
8260 + */
8261 + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
8262 + d_key_position);
8264 + if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) {
8265 + if (src_bi.bi_position /*src->b_item_order */ == 0)
8266 + replace_key(tb, cf, d_key_position,
8267 +				    src_bi.bi_parent /*src->b_parent */, 0);
8269 + } else
8270 + replace_key(tb, cf, d_key_position, src_bi.bi_bh,
8271 + pointer_amount - 1);
8273 + /* last parameter is del_parameter */
8274 + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
8275 + pointer_amount, 0);
8280 + * Insert delimiting key to L[h].
8281 + * Copy n node pointers and n - 1 items from buffer S[h] to L[h].
8282 + * Delete n - 1 items and node pointers from buffer S[h].
8283 + */
8284 +/* it always shifts from S[h] to L[h] */
8285 +static void internal_shift1_left(struct tree_balance *tb,
8286 + int h, int pointer_amount)
8288 + struct buffer_info dest_bi, src_bi;
8289 + struct buffer_head *cf;
8290 + int d_key_position;
8292 + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
8293 + &dest_bi, &src_bi, &d_key_position, &cf);
8295 + /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */
8296 + if (pointer_amount > 0)
8297 + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
8298 + d_key_position);
8300 + /* last parameter is del_parameter */
8301 + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
8302 + pointer_amount, 1);
8306 + * Insert d_key'th (delimiting) key from buffer cfr to head of dest.
8307 + * Copy n node pointers and n - 1 items from buffer src to buffer dest.
8308 + * Replace d_key'th key in buffer cfr.
8309 + * Delete n items and node pointers from buffer src.
8310 + */
8311 +static void internal_shift_right(
8312 + /*
8313 +	 * INTERNAL_SHIFT_FROM_S_TO_R | INTERNAL_SHIFT_FROM_L_TO_S
8314 + */
8315 + int mode,
8316 + struct tree_balance *tb,
8317 + int h, int pointer_amount)
8319 + struct buffer_info dest_bi, src_bi;
8320 + struct buffer_head *cf;
8321 + int d_key_position;
8322 + int nr;
8324 + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
8325 + &d_key_position, &cf);
8327 + nr = B_NR_ITEMS(src_bi.bi_bh);
8329 + if (pointer_amount > 0) {
8330 + /*
8331 + * insert delimiting key from common father of dest
8332 + * and src to dest node into position 0
8333 + */
8334 + internal_insert_key(&dest_bi, 0, cf, d_key_position);
8335 + if (nr == pointer_amount - 1) {
8336 + RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ ||
8337 + dest_bi.bi_bh != tb->R[h],
8338 + "src (%p) must be == tb->S[h](%p) when it disappears",
8339 + src_bi.bi_bh, PATH_H_PBUFFER(tb->tb_path, h));
8340 +			/* when S[h] disappears replace the left delimiting key as well */
8341 + if (tb->CFL[h])
8342 + replace_key(tb, cf, d_key_position, tb->CFL[h],
8343 + tb->lkey[h]);
8344 + } else
8345 + replace_key(tb, cf, d_key_position, src_bi.bi_bh,
8346 + nr - pointer_amount);
8349 + /* last parameter is del_parameter */
8350 + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
8351 + pointer_amount, 0);
8355 + * Insert delimiting key to R[h].
8356 + * Copy n node pointers and n - 1 items from buffer S[h] to R[h].
8357 + * Delete n - 1 items and node pointers from buffer S[h].
8358 + */
8359 +/* it always shifts from S[h] to R[h] */
8360 +static void internal_shift1_right(struct tree_balance *tb,
8361 + int h, int pointer_amount)
8363 + struct buffer_info dest_bi, src_bi;
8364 + struct buffer_head *cf;
8365 + int d_key_position;
8367 + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
8368 + &dest_bi, &src_bi, &d_key_position, &cf);
8370 + /* insert rkey from CFR[h] to right neighbor R[h] */
8371 + if (pointer_amount > 0)
8372 + internal_insert_key(&dest_bi, 0, cf, d_key_position);
8374 + /* last parameter is del_parameter */
8375 + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
8376 + pointer_amount, 1);
8380 + * Delete insert_num node pointers together with their left items
8381 + * and balance current node.
8382 + */
8383 +static void balance_internal_when_delete(struct tree_balance *tb,
8384 + int h, int child_pos)
8386 + int insert_num;
8387 + int n;
8388 + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
8389 + struct buffer_info bi;
8391 + insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE));
8393 + /* delete child-node-pointer(s) together with their left item(s) */
8394 + bi.tb = tb;
8395 + bi.bi_bh = tbSh;
8396 + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
8397 + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
8399 + internal_delete_childs(&bi, child_pos, -insert_num);
8401 + RFALSE(tb->blknum[h] > 1,
8402 + "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]);
8404 + n = B_NR_ITEMS(tbSh);
8406 + if (tb->lnum[h] == 0 && tb->rnum[h] == 0) {
8407 + if (tb->blknum[h] == 0) {
8408 + /* node S[h] (root of the tree) is empty now */
8409 + struct buffer_head *new_root;
8411 + RFALSE(n
8412 + || B_FREE_SPACE(tbSh) !=
8413 + MAX_CHILD_SIZE(tbSh) - DC_SIZE,
8414 + "buffer must have only 0 keys (%d)", n);
8415 + RFALSE(bi.bi_parent, "root has parent (%p)",
8416 + bi.bi_parent);
8418 + /* choose a new root */
8419 + if (!tb->L[h - 1] || !B_NR_ITEMS(tb->L[h - 1]))
8420 + new_root = tb->R[h - 1];
8421 + else
8422 + new_root = tb->L[h - 1];
8423 + /*
8424 + * switch super block's tree root block
8425 + * number to the new value */
8426 + PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr);
8427 + /*REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; */
8428 + PUT_SB_TREE_HEIGHT(tb->tb_sb,
8429 + SB_TREE_HEIGHT(tb->tb_sb) - 1);
8431 + do_balance_mark_sb_dirty(tb,
8432 + REISERFS_SB(tb->tb_sb)->s_sbh,
8433 + 1);
8434 + /*&&&&&&&&&&&&&&&&&&&&&& */
8435 + /* use check_internal if new root is an internal node */
8436 + if (h > 1)
8437 + check_internal(new_root);
8438 + /*&&&&&&&&&&&&&&&&&&&&&& */
8440 + /* do what is needed for buffer thrown from tree */
8441 + reiserfs_invalidate_buffer(tb, tbSh);
8442 + return;
8444 + return;
8447 + /* join S[h] with L[h] */
8448 + if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) {
8450 + RFALSE(tb->rnum[h] != 0,
8451 + "invalid tb->rnum[%d]==%d when joining S[h] with L[h]",
8452 + h, tb->rnum[h]);
8454 + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1);
8455 + reiserfs_invalidate_buffer(tb, tbSh);
8457 + return;
8460 + /* join S[h] with R[h] */
8461 + if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) {
8462 + RFALSE(tb->lnum[h] != 0,
8463 + "invalid tb->lnum[%d]==%d when joining S[h] with R[h]",
8464 + h, tb->lnum[h]);
8466 + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1);
8468 + reiserfs_invalidate_buffer(tb, tbSh);
8469 + return;
8472 + /* borrow from left neighbor L[h] */
8473 + if (tb->lnum[h] < 0) {
8474 + RFALSE(tb->rnum[h] != 0,
8475 + "wrong tb->rnum[%d]==%d when borrow from L[h]", h,
8476 + tb->rnum[h]);
8477 + internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h,
8478 + -tb->lnum[h]);
8479 + return;
8482 + /* borrow from right neighbor R[h] */
8483 + if (tb->rnum[h] < 0) {
8484 + RFALSE(tb->lnum[h] != 0,
8485 + "invalid tb->lnum[%d]==%d when borrow from R[h]",
8486 + h, tb->lnum[h]);
8487 + internal_shift_left(INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]); /*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]); */
8488 + return;
8491 + /* split S[h] into two parts and put them into neighbors */
8492 + if (tb->lnum[h] > 0) {
8493 + RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1,
8494 + "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them",
8495 + h, tb->lnum[h], h, tb->rnum[h], n);
8497 + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); /*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]); */
8498 + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
8499 + tb->rnum[h]);
8501 + reiserfs_invalidate_buffer(tb, tbSh);
8503 + return;
8505 + reiserfs_panic(tb->tb_sb, "ibalance-2",
8506 + "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
8507 + h, tb->lnum[h], h, tb->rnum[h]);
8510 +/* Replace delimiting key of buffers L[h] and S[h] by the given key.*/
8511 +static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key)
8513 + RFALSE(tb->L[h] == NULL || tb->CFL[h] == NULL,
8514 + "L[h](%p) and CFL[h](%p) must exist in replace_lkey",
8515 + tb->L[h], tb->CFL[h]);
8517 + if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0)
8518 + return;
8520 + memcpy(internal_key(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE);
8522 + do_balance_mark_internal_dirty(tb, tb->CFL[h], 0);
8525 +/* Replace delimiting key of buffers S[h] and R[h] by the given key.*/
8526 +static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key)
8528 + RFALSE(tb->R[h] == NULL || tb->CFR[h] == NULL,
8529 + "R[h](%p) and CFR[h](%p) must exist in replace_rkey",
8530 + tb->R[h], tb->CFR[h]);
8531 + RFALSE(B_NR_ITEMS(tb->R[h]) == 0,
8532 + "R[h] can not be empty if it exists (item number=%d)",
8533 + B_NR_ITEMS(tb->R[h]));
8535 + memcpy(internal_key(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE);
8537 + do_balance_mark_internal_dirty(tb, tb->CFR[h], 0);
8542 + * if inserting/pasting {
8543 + * child_pos is the position of the node-pointer in S[h] that
8544 + * pointed to S[h-1] before balancing of the h-1 level;
8545 + * this means that new pointers and items must be inserted AFTER
8546 + * child_pos
8547 + * } else {
8548 + * it is the position of the leftmost pointer that must be deleted
8549 + * (together with its corresponding key to the left of the pointer)
8550 + * as a result of the previous level's balancing.
8551 + * }
8552 + */
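A quick sanity check on the units before the code (the KEY_SIZE = 16 and
DC_SIZE = 8 values are the usual on-disk ones and are assumed here): when
balancing of level h-1 split a node in two, fix_nodes set

	insert_size[h] = (DC_SIZE + KEY_SIZE) * (blknum[h-1] - 1) = 24

so the division below yields insert_num = 24 / 24 = 1, i.e. exactly one new
(key, child pointer) pair to insert after child_pos. A single-pointer deletion
would arrive as insert_size[h] = -24, giving insert_num = -1 and sending us
into balance_internal_when_delete().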
8554 +int balance_internal(struct tree_balance *tb,
8555 + int h, /* level of the tree */
8556 + int child_pos,
8557 + /* key for insertion on higher level */
8558 + struct item_head *insert_key,
8559 + /* node for insertion on higher level */
8560 + struct buffer_head **insert_ptr)
8562 + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
8563 + struct buffer_info bi;
8565 + /*
8566 + * we return this: it is 0 if there is no S[h],
8567 + * else it is tb->S[h]->b_item_order
8568 + */
8569 + int order;
8570 + int insert_num, n, k;
8571 + struct buffer_head *S_new;
8572 + struct item_head new_insert_key;
8573 + struct buffer_head *new_insert_ptr = NULL;
8574 + struct item_head *new_insert_key_addr = insert_key;
8576 + RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h);
8578 + PROC_INFO_INC(tb->tb_sb, balance_at[h]);
8580 +	/* order == tb->S[h]->b_item_order when S[h] exists */
8581 +	order = tbSh ? PATH_H_POSITION(tb->tb_path, h + 1) : 0;
8584 + /*
8585 + * Using insert_size[h] calculate the number insert_num of items
8586 + * that must be inserted to or deleted from S[h].
8587 + */
8588 + insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE));
8590 +	/* Check whether insert_num is proper */
8591 + RFALSE(insert_num < -2 || insert_num > 2,
8592 + "incorrect number of items inserted to the internal node (%d)",
8593 + insert_num);
8594 + RFALSE(h > 1 && (insert_num > 1 || insert_num < -1),
8595 + "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level",
8596 + insert_num, h);
8598 + /* Make balance in case insert_num < 0 */
8599 + if (insert_num < 0) {
8600 + balance_internal_when_delete(tb, h, child_pos);
8601 + return order;
8604 + k = 0;
8605 + if (tb->lnum[h] > 0) {
8606 + /*
8607 + * shift lnum[h] items from S[h] to the left neighbor L[h].
8608 + * check how many of new items fall into L[h] or CFL[h] after
8609 + * shifting
8610 + */
8611 + n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */
8612 + if (tb->lnum[h] <= child_pos) {
8613 + /* new items don't fall into L[h] or CFL[h] */
8614 + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
8615 + tb->lnum[h]);
8616 + child_pos -= tb->lnum[h];
8617 + } else if (tb->lnum[h] > child_pos + insert_num) {
8618 + /* all new items fall into L[h] */
8619 + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
8620 + tb->lnum[h] - insert_num);
8621 + /* insert insert_num keys and node-pointers into L[h] */
8622 + bi.tb = tb;
8623 + bi.bi_bh = tb->L[h];
8624 + bi.bi_parent = tb->FL[h];
8625 + bi.bi_position = get_left_neighbor_position(tb, h);
8626 + internal_insert_childs(&bi,
8627 + /*tb->L[h], tb->S[h-1]->b_next */
8628 + n + child_pos + 1,
8629 + insert_num, insert_key,
8630 + insert_ptr);
8632 + insert_num = 0;
8633 + } else {
8634 + struct disk_child *dc;
8636 + /*
8637 + * some items fall into L[h] or CFL[h],
8638 + * but some don't fall
8639 + */
8640 + internal_shift1_left(tb, h, child_pos + 1);
8641 + /* calculate number of new items that fall into L[h] */
8642 + k = tb->lnum[h] - child_pos - 1;
8643 + bi.tb = tb;
8644 + bi.bi_bh = tb->L[h];
8645 + bi.bi_parent = tb->FL[h];
8646 + bi.bi_position = get_left_neighbor_position(tb, h);
8647 + internal_insert_childs(&bi,
8648 + /*tb->L[h], tb->S[h-1]->b_next, */
8649 + n + child_pos + 1, k,
8650 + insert_key, insert_ptr);
8652 + replace_lkey(tb, h, insert_key + k);
8654 + /*
8655 + * replace the first node-ptr in S[h] by
8656 + * node-ptr to insert_ptr[k]
8657 + */
8658 + dc = B_N_CHILD(tbSh, 0);
8659 + put_dc_size(dc,
8660 + MAX_CHILD_SIZE(insert_ptr[k]) -
8661 + B_FREE_SPACE(insert_ptr[k]));
8662 + put_dc_block_number(dc, insert_ptr[k]->b_blocknr);
8664 + do_balance_mark_internal_dirty(tb, tbSh, 0);
8666 + k++;
8667 + insert_key += k;
8668 + insert_ptr += k;
8669 + insert_num -= k;
8670 + child_pos = 0;
8673 + /* tb->lnum[h] > 0 */
8674 + if (tb->rnum[h] > 0) {
8675 + /*shift rnum[h] items from S[h] to the right neighbor R[h] */
8676 + /*
8677 + * check how many of new items fall into R or CFR
8678 + * after shifting
8679 + */
8680 + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */
8681 + if (n - tb->rnum[h] >= child_pos)
8682 + /* new items fall into S[h] */
8683 + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
8684 + tb->rnum[h]);
8685 + else if (n + insert_num - tb->rnum[h] < child_pos) {
8686 + /* all new items fall into R[h] */
8687 + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
8688 + tb->rnum[h] - insert_num);
8690 + /* insert insert_num keys and node-pointers into R[h] */
8691 + bi.tb = tb;
8692 + bi.bi_bh = tb->R[h];
8693 + bi.bi_parent = tb->FR[h];
8694 + bi.bi_position = get_right_neighbor_position(tb, h);
8695 + internal_insert_childs(&bi,
8696 + /*tb->R[h],tb->S[h-1]->b_next */
8697 + child_pos - n - insert_num +
8698 + tb->rnum[h] - 1,
8699 + insert_num, insert_key,
8700 + insert_ptr);
8701 + insert_num = 0;
8702 + } else {
8703 + struct disk_child *dc;
8705 + /* one of the items falls into CFR[h] */
8706 + internal_shift1_right(tb, h, n - child_pos + 1);
8707 + /* calculate number of new items that fall into R[h] */
8708 + k = tb->rnum[h] - n + child_pos - 1;
8709 + bi.tb = tb;
8710 + bi.bi_bh = tb->R[h];
8711 + bi.bi_parent = tb->FR[h];
8712 + bi.bi_position = get_right_neighbor_position(tb, h);
8713 + internal_insert_childs(&bi,
8714 + /*tb->R[h], tb->R[h]->b_child, */
8715 + 0, k, insert_key + 1,
8716 + insert_ptr + 1);
8718 + replace_rkey(tb, h, insert_key + insert_num - k - 1);
8720 + /*
8721 + * replace the first node-ptr in R[h] by
8722 + * node-ptr insert_ptr[insert_num-k-1]
8723 + */
8724 + dc = B_N_CHILD(tb->R[h], 0);
8725 + put_dc_size(dc,
8726 + MAX_CHILD_SIZE(insert_ptr
8727 + [insert_num - k - 1]) -
8728 + B_FREE_SPACE(insert_ptr
8729 + [insert_num - k - 1]));
8730 + put_dc_block_number(dc,
8731 + insert_ptr[insert_num - k -
8732 + 1]->b_blocknr);
8734 + do_balance_mark_internal_dirty(tb, tb->R[h], 0);
8736 + insert_num -= (k + 1);
8740 + /** Fill new node that appears instead of S[h] **/
8741 + RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level");
8742 + RFALSE(tb->blknum[h] < 0, "blknum can not be < 0");
8744 + if (!tb->blknum[h]) { /* node S[h] is empty now */
8745 + RFALSE(!tbSh, "S[h] is equal NULL");
8747 + /* do what is needed for buffer thrown from tree */
8748 + reiserfs_invalidate_buffer(tb, tbSh);
8749 + return order;
8752 + if (!tbSh) {
8753 + /* create new root */
8754 + struct disk_child *dc;
8755 + struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1);
8756 + struct block_head *blkh;
8758 + if (tb->blknum[h] != 1)
8759 + reiserfs_panic(NULL, "ibalance-3", "One new node "
8760 + "required for creating the new root");
8761 + /* S[h] = empty buffer from the list FEB. */
8762 + tbSh = get_FEB(tb);
8763 + blkh = B_BLK_HEAD(tbSh);
8764 + set_blkh_level(blkh, h + 1);
8766 + /* Put the unique node-pointer to S[h] that points to S[h-1]. */
8768 + dc = B_N_CHILD(tbSh, 0);
8769 + put_dc_block_number(dc, tbSh_1->b_blocknr);
8770 + put_dc_size(dc,
8771 + (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1)));
8773 + tb->insert_size[h] -= DC_SIZE;
8774 + set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE);
8776 + do_balance_mark_internal_dirty(tb, tbSh, 0);
8778 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8779 + check_internal(tbSh);
8780 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8782 + /* put new root into path structure */
8783 + PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) =
8784 + tbSh;
8786 + /* Change root in structure super block. */
8787 + PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr);
8788 + PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1);
8789 + do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
8792 + if (tb->blknum[h] == 2) {
8793 + int snum;
8794 + struct buffer_info dest_bi, src_bi;
8796 + /* S_new = free buffer from list FEB */
8797 + S_new = get_FEB(tb);
8799 + set_blkh_level(B_BLK_HEAD(S_new), h + 1);
8801 + dest_bi.tb = tb;
8802 + dest_bi.bi_bh = S_new;
8803 + dest_bi.bi_parent = NULL;
8804 + dest_bi.bi_position = 0;
8805 + src_bi.tb = tb;
8806 + src_bi.bi_bh = tbSh;
8807 + src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
8808 + src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
8810 + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */
8811 + snum = (insert_num + n + 1) / 2;
8812 + if (n - snum >= child_pos) {
8813 + /* new items don't fall into S_new */
8814 + /* store the delimiting key for the next level */
8815 + /* new_insert_key = (n - snum)'th key in S[h] */
8816 + memcpy(&new_insert_key, internal_key(tbSh, n - snum),
8817 + KEY_SIZE);
8818 + /* last parameter is del_par */
8819 + internal_move_pointers_items(&dest_bi, &src_bi,
8820 + LAST_TO_FIRST, snum, 0);
8821 + } else if (n + insert_num - snum < child_pos) {
8822 + /* all new items fall into S_new */
8823 + /* store the delimiting key for the next level */
8824 + /*
8825 +			 * new_insert_key = (n + insert_num - snum)'th
8826 + * key in S[h]
8827 + */
8828 + memcpy(&new_insert_key,
8829 + internal_key(tbSh, n + insert_num - snum),
8830 + KEY_SIZE);
8831 + /* last parameter is del_par */
8832 + internal_move_pointers_items(&dest_bi, &src_bi,
8833 + LAST_TO_FIRST,
8834 + snum - insert_num, 0);
8836 + /*
8837 + * insert insert_num keys and node-pointers
8838 + * into S_new
8839 + */
8840 + internal_insert_childs(&dest_bi,
8841 + /*S_new,tb->S[h-1]->b_next, */
8842 + child_pos - n - insert_num +
8843 + snum - 1,
8844 + insert_num, insert_key,
8845 + insert_ptr);
8847 + insert_num = 0;
8848 + } else {
8849 + struct disk_child *dc;
8851 + /* some items fall into S_new, but some don't fall */
8852 + /* last parameter is del_par */
8853 + internal_move_pointers_items(&dest_bi, &src_bi,
8854 + LAST_TO_FIRST,
8855 + n - child_pos + 1, 1);
8856 + /* calculate number of new items that fall into S_new */
8857 + k = snum - n + child_pos - 1;
8859 + internal_insert_childs(&dest_bi, /*S_new, */ 0, k,
8860 + insert_key + 1, insert_ptr + 1);
8862 + /* new_insert_key = insert_key[insert_num - k - 1] */
8863 + memcpy(&new_insert_key, insert_key + insert_num - k - 1,
8864 + KEY_SIZE);
8865 + /*
8866 + * replace first node-ptr in S_new by node-ptr
8867 + * to insert_ptr[insert_num-k-1]
8868 + */
8870 + dc = B_N_CHILD(S_new, 0);
8871 + put_dc_size(dc,
8872 + (MAX_CHILD_SIZE
8873 + (insert_ptr[insert_num - k - 1]) -
8874 + B_FREE_SPACE(insert_ptr
8875 + [insert_num - k - 1])));
8876 + put_dc_block_number(dc,
8877 + insert_ptr[insert_num - k -
8878 + 1]->b_blocknr);
8880 + do_balance_mark_internal_dirty(tb, S_new, 0);
8882 + insert_num -= (k + 1);
8884 + /* new_insert_ptr = node_pointer to S_new */
8885 + new_insert_ptr = S_new;
8887 + RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new)
8888 + || buffer_dirty(S_new), "cm-00001: bad S_new (%b)",
8889 + S_new);
8891 + /* S_new is released in unfix_nodes */
8894 + n = B_NR_ITEMS(tbSh); /*number of items in S[h] */
8896 + if (0 <= child_pos && child_pos <= n && insert_num > 0) {
8897 + bi.tb = tb;
8898 + bi.bi_bh = tbSh;
8899 + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
8900 + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
8901 + internal_insert_childs(&bi, /*tbSh, */
8902 + /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next : tb->S[h]->b_child->b_next, */
8903 + child_pos, insert_num, insert_key,
8904 + insert_ptr);
8907 + insert_ptr[0] = new_insert_ptr;
8908 + if (new_insert_ptr)
8909 + memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE);
8911 + return order;
8913 diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
8914 new file mode 100644
8915 index 000000000000..d39ee5f6c075
8916 --- /dev/null
8917 +++ b/fs/reiserfs/inode.c
8918 @@ -0,0 +1,3416 @@
8920 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
8921 + */
8923 +#include <linux/time.h>
8924 +#include <linux/fs.h>
8925 +#include "reiserfs.h"
8926 +#include "acl.h"
8927 +#include "xattr.h"
8928 +#include <linux/exportfs.h>
8929 +#include <linux/pagemap.h>
8930 +#include <linux/highmem.h>
8931 +#include <linux/slab.h>
8932 +#include <linux/uaccess.h>
8933 +#include <linux/unaligned.h>
8934 +#include <linux/buffer_head.h>
8935 +#include <linux/mpage.h>
8936 +#include <linux/writeback.h>
8937 +#include <linux/quotaops.h>
8938 +#include <linux/swap.h>
8939 +#include <linux/uio.h>
8940 +#include <linux/bio.h>
8942 +int reiserfs_commit_write(struct file *f, struct page *page,
8943 + unsigned from, unsigned to);
8945 +void reiserfs_evict_inode(struct inode *inode)
8947 + /*
8948 + * We need blocks for transaction + (user+group) quota
8949 + * update (possibly delete)
8950 + */
8951 + int jbegin_count =
8952 + JOURNAL_PER_BALANCE_CNT * 2 +
8953 + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
8954 + struct reiserfs_transaction_handle th;
8955 + int err;
8957 + if (!inode->i_nlink && !is_bad_inode(inode))
8958 + dquot_initialize(inode);
8960 + truncate_inode_pages_final(&inode->i_data);
8961 + if (inode->i_nlink)
8962 + goto no_delete;
8964 + /*
8965 +	 * The objectid == 0 case happens when we abort creating a new
8966 +	 * inode for some reason, e.g. lack of space;
8967 +	 * this also handles the bad_inode case
8968 + */
8969 + if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {
8971 + reiserfs_delete_xattrs(inode);
8973 + reiserfs_write_lock(inode->i_sb);
8975 + if (journal_begin(&th, inode->i_sb, jbegin_count))
8976 + goto out;
8977 + reiserfs_update_inode_transaction(inode);
8979 + reiserfs_discard_prealloc(&th, inode);
8981 + err = reiserfs_delete_object(&th, inode);
8983 + /*
8984 + * Do quota update inside a transaction for journaled quotas.
8985 + * We must do that after delete_object so that quota updates
8986 + * go into the same transaction as stat data deletion
8987 + */
8988 + if (!err) {
8989 + int depth = reiserfs_write_unlock_nested(inode->i_sb);
8990 + dquot_free_inode(inode);
8991 + reiserfs_write_lock_nested(inode->i_sb, depth);
8994 + if (journal_end(&th))
8995 + goto out;
8997 + /*
8998 + * check return value from reiserfs_delete_object after
8999 + * ending the transaction
9000 + */
9001 + if (err)
9002 + goto out;
9004 + /*
9005 + * all items of file are deleted, so we can remove
9006 + * "save" link
9007 + * we can't do anything about an error here
9008 + */
9009 + remove_save_link(inode, 0 /* not truncate */);
9010 +out:
9011 + reiserfs_write_unlock(inode->i_sb);
9012 + } else {
9013 + /* no object items are in the tree */
9017 + /* note this must go after the journal_end to prevent deadlock */
9018 + clear_inode(inode);
9020 + dquot_drop(inode);
9021 + inode->i_blocks = 0;
9022 + return;
9024 +no_delete:
9025 + clear_inode(inode);
9026 + dquot_drop(inode);
9029 +static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
9030 + __u32 objectid, loff_t offset, int type, int length)
9032 + key->version = version;
9034 + key->on_disk_key.k_dir_id = dirid;
9035 + key->on_disk_key.k_objectid = objectid;
9036 + set_cpu_key_k_offset(key, offset);
9037 + set_cpu_key_k_type(key, type);
9038 + key->key_length = length;
9042 + * take base of inode_key (it comes from inode always) (dirid, objectid)
9043 + * and version from an inode, set offset and type of key
9044 + */
9045 +void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
9046 + int type, int length)
9048 + _make_cpu_key(key, get_inode_item_key_version(inode),
9049 + le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
9050 + le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
9051 + length);
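For orientation: reiserfs addresses a file body by 1-based byte offset, so the key for the 'block'-th logical block is built from block * blocksize + 1, which is exactly what the make_cpu_key() call in _get_block_create_0() below computes. A minimal standalone sketch of that arithmetic in plain userspace C; toy_key is a stand-in for struct cpu_key:

/* Standalone illustration of the key-offset arithmetic; toy_key is a
 * stand-in for struct cpu_key, not kernel code. */
#include <stdint.h>
#include <stdio.h>

struct toy_key {
        uint32_t dirid;
        uint32_t objectid;
        uint64_t offset;        /* 1-based byte offset into the object */
};

int main(void)
{
        uint64_t block = 10;            /* the 'block'-th logical block */
        uint64_t blocksize = 4096;      /* sb->s_blocksize */
        struct toy_key key = {
                .dirid = 42, .objectid = 1000,
                /* first byte of that block, counted from 1 */
                .offset = block * blocksize + 1,
        };

        printf("block %llu starts at key offset %llu\n",
               (unsigned long long)block, (unsigned long long)key.offset);
        return 0;
}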
9054 +/* when key is 0, do not set version and short key */
9055 +inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
9056 + int version,
9057 + loff_t offset, int type, int length,
9058 + int entry_count /*or ih_free_space */ )
9060 + if (key) {
9061 + ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
9062 + ih->ih_key.k_objectid =
9063 + cpu_to_le32(key->on_disk_key.k_objectid);
9065 + put_ih_version(ih, version);
9066 + set_le_ih_k_offset(ih, offset);
9067 + set_le_ih_k_type(ih, type);
9068 + put_ih_item_len(ih, length);
9069 + /* set_ih_free_space (ih, 0); */
9070 + /*
9071 + * for directory items it is entry count, for directs and stat
9072 + * datas - 0xffff, for indirects - 0
9073 + */
9074 + put_ih_entry_count(ih, entry_count);
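As the comment above notes, the entry_count/ih_free_space field is overloaded per item type. A hedged userspace sketch of just that convention; the TOY_* names are local stand-ins, not the kernel's item type constants:

/* Illustration of the overloaded entry_count/ih_free_space field;
 * the TOY_* constants are local stand-ins. */
#include <stdio.h>

enum toy_type { TOY_STAT_DATA, TOY_DIRECT, TOY_INDIRECT, TOY_DIRENTRY };

static unsigned entry_count_for(enum toy_type t, unsigned dir_entries)
{
        switch (t) {
        case TOY_DIRENTRY:
                return dir_entries;     /* real entry count */
        case TOY_INDIRECT:
                return 0;
        default:                        /* direct items and stat data */
                return 0xffff;
        }
}

int main(void)
{
        printf("dir item with 2 entries -> %u\n",
               entry_count_for(TOY_DIRENTRY, 2));
        printf("indirect item           -> %u\n",
               entry_count_for(TOY_INDIRECT, 0));
        printf("stat data / direct      -> 0x%x\n",
               entry_count_for(TOY_STAT_DATA, 0));
        return 0;
}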
9078 + * FIXME: we might cache recently accessed indirect item
9079 + * Ugh. Not too eager for that....
9080 + * I cut the code until such time as I see a convincing argument (benchmark).
9081 + * I don't want a bloated inode struct..., and I don't like code complexity....
9082 + */
9085 + * cutting the code is fine, since it really isn't in use yet and is easy
9086 + * to add back in. But, Vladimir has a really good idea here. Think
9087 + * about what happens for reading a file. For each page,
9088 + * The VFS layer calls reiserfs_read_folio, who searches the tree to find
9089 + * an indirect item. This indirect item has X number of pointers, where
9090 + * X is a big number if we've done the block allocation right. But,
9091 + * we only use one or two of these pointers during each call to read_folio,
9092 + * needlessly re-searching the tree later on.
9094 + * The size of the cache could be dynamic based on the size of the file.
9096 + * I'd also like to see us cache the location of the stat data item, since
9097 + * we are needlessly re-searching for that frequently.
9099 + * --chris
9100 + */
9103 + * If this page has a file tail in it, and
9104 + * it was read in by get_block_create_0, the page data is valid,
9105 + * but tail is still sitting in a direct item, and we can't write to
9106 + * it. So, look through this page, and check all the mapped buffers
9107 + * to make sure they have valid block numbers. Any that don't have
9108 + * one need to be unmapped, so that __block_write_begin will correctly call
9109 + * reiserfs_get_block to convert the tail into an unformatted node
9110 + */
9111 +static inline void fix_tail_page_for_writing(struct page *page)
9113 + struct buffer_head *head, *next, *bh;
9115 + if (page && page_has_buffers(page)) {
9116 + head = page_buffers(page);
9117 + bh = head;
9118 + do {
9119 + next = bh->b_this_page;
9120 + if (buffer_mapped(bh) && bh->b_blocknr == 0) {
9121 + reiserfs_unmap_buffer(bh);
9123 + bh = next;
9124 + } while (bh != head);
9129 + * reiserfs_get_block does not need to allocate a block only if it has been
9130 + * done already or non-hole position has been found in the indirect item
9131 + */
9132 +static inline int allocation_needed(int retval, b_blocknr_t allocated,
9133 + struct item_head *ih,
9134 + __le32 * item, int pos_in_item)
9136 + if (allocated)
9137 + return 0;
9138 + if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
9139 + get_block_num(item, pos_in_item))
9140 + return 0;
9141 + return 1;
9144 +static inline int indirect_item_found(int retval, struct item_head *ih)
9146 + return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
9149 +static inline void set_block_dev_mapped(struct buffer_head *bh,
9150 + b_blocknr_t block, struct inode *inode)
9152 + map_bh(bh, inode->i_sb, block);
9156 + * files which were created in the earlier version cannot be
9157 + * larger than 2 GB
9158 + */
9159 +static int file_capable(struct inode *inode, sector_t block)
9161 + /* it is new file. */
9162 + if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||
9163 + /* old file, but 'block' is inside of 2gb */
9164 + block < (1 << (31 - inode->i_sb->s_blocksize_bits)))
9165 + return 1;
9167 + return 0;
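Concretely, an old-format key stores the byte offset in 32 bits, so the highest addressable block is 1 << (31 - blocksize_bits), a 2 GB cap for every block size. A standalone illustration of the limit checked by file_capable():

/* Standalone illustration of the old-format 2 GB size limit. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        for (unsigned bits = 10; bits <= 12; bits++) {  /* 1K..4K blocks */
                uint64_t max_blocks = 1ULL << (31 - bits);
                printf("blocksize %u: old-format limit = %llu blocks (%llu bytes)\n",
                       1u << bits, (unsigned long long)max_blocks,
                       (unsigned long long)(max_blocks << bits));
        }
        return 0;
}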
9170 +static int restart_transaction(struct reiserfs_transaction_handle *th,
9171 + struct inode *inode, struct treepath *path)
9173 + struct super_block *s = th->t_super;
9174 + int err;
9176 + BUG_ON(!th->t_trans_id);
9177 + BUG_ON(!th->t_refcount);
9179 + pathrelse(path);
9181 + /* we cannot restart while nested */
9182 + if (th->t_refcount > 1) {
9183 + return 0;
9185 + reiserfs_update_sd(th, inode);
9186 + err = journal_end(th);
9187 + if (!err) {
9188 + err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
9189 + if (!err)
9190 + reiserfs_update_inode_transaction(inode);
9192 + return err;
9196 + * Called by get_block when create == 0. Returns the block number
9197 + * for the 'block'-th logical block of the file. When it hits a direct
9198 + * item it either returns 0 (when called from bmap) or reads the
9199 + * direct item into a piece of the page (bh_result).
9202 + */
9203 +static int _get_block_create_0(struct inode *inode, sector_t block,
9204 + struct buffer_head *bh_result, int args)
9206 + INITIALIZE_PATH(path);
9207 + struct cpu_key key;
9208 + struct buffer_head *bh;
9209 + struct item_head *ih, tmp_ih;
9210 + b_blocknr_t blocknr;
9211 + char *p;
9212 + int chars;
9213 + int ret;
9214 + int result;
9215 + int done = 0;
9216 + unsigned long offset;
9218 + /* prepare the key to look for the 'block'-th block of file */
9219 + make_cpu_key(&key, inode,
9220 + (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
9221 + 3);
9223 + result = search_for_position_by_key(inode->i_sb, &key, &path);
9224 + if (result != POSITION_FOUND) {
9225 + pathrelse(&path);
9226 + if (result == IO_ERROR)
9227 + return -EIO;
9228 + /*
9229 + * We do not return -ENOENT if there is a hole but page is
9230 + * uptodate, because it means that there is some mmapped data
9231 + * associated with it that is yet to be written to disk.
9232 + */
9233 + if ((args & GET_BLOCK_NO_HOLE)
9234 + && !PageUptodate(bh_result->b_page)) {
9235 + return -ENOENT;
9237 + return 0;
9240 + bh = get_last_bh(&path);
9241 + ih = tp_item_head(&path);
9242 + if (is_indirect_le_ih(ih)) {
9243 + __le32 *ind_item = (__le32 *) ih_item_body(bh, ih);
9245 + /*
9246 + * FIXME: here we could cache indirect item or part of it in
9247 + * the inode to avoid search_by_key in case of subsequent
9248 + * access to file
9249 + */
9250 + blocknr = get_block_num(ind_item, path.pos_in_item);
9251 + ret = 0;
9252 + if (blocknr) {
9253 + map_bh(bh_result, inode->i_sb, blocknr);
9254 + if (path.pos_in_item ==
9255 + ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
9256 + set_buffer_boundary(bh_result);
9258 + } else
9259 + /*
9260 + * We do not return -ENOENT if there is a hole but
9261 + * page is uptodate, because it means that there is
9262 + * some mmapped data associated with it that is
9263 + * yet to be written to disk.
9264 + */
9265 + if ((args & GET_BLOCK_NO_HOLE)
9266 + && !PageUptodate(bh_result->b_page)) {
9267 + ret = -ENOENT;
9270 + pathrelse(&path);
9271 + return ret;
9273 + /* requested data are in direct item(s) */
9274 + if (!(args & GET_BLOCK_READ_DIRECT)) {
9275 + /*
9276 + * we are called by bmap. FIXME: we can not map block of file
9277 + * when it is stored in direct item(s)
9278 + */
9279 + pathrelse(&path);
9280 + return -ENOENT;
9283 + /*
9284 + * if we've got a direct item, and the buffer or page was uptodate,
9285 + * we don't want to pull data off disk again. skip to the
9286 + * end, where we map the buffer and return
9287 + */
9288 + if (buffer_uptodate(bh_result)) {
9289 + goto finished;
9290 + } else
9291 + /*
9292 + * grab_tail_page can trigger calls to reiserfs_get_block on
9293 + * up to date pages without any buffers. If the page is up
9294 + * to date, we don't want to read old data off disk. Set the up
9295 + * to date bit on the buffer instead and jump to the end
9296 + */
9297 + if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
9298 + set_buffer_uptodate(bh_result);
9299 + goto finished;
9301 + /* read file tail into part of page */
9302 + offset = (cpu_key_k_offset(&key) - 1) & (PAGE_SIZE - 1);
9303 + copy_item_head(&tmp_ih, ih);
9305 + /*
9306 + * we only want to kmap if we are reading the tail into the page.
9307 + * this is not the common case, so we don't kmap until we are
9308 + * sure we need to. But, this means the item might move if
9309 + * kmap schedules
9310 + */
9311 + p = (char *)kmap(bh_result->b_page);
9312 + p += offset;
9313 + memset(p, 0, inode->i_sb->s_blocksize);
9314 + do {
9315 + if (!is_direct_le_ih(ih)) {
9316 + BUG();
9318 + /*
9319 + * make sure we don't read more bytes than actually exist in
9320 + * the file. This can happen in odd cases where i_size isn't
9321 + * correct, and when direct item padding results in a few
9322 + * extra bytes at the end of the direct item
9323 + */
9324 + if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
9325 + break;
9326 + if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
9327 + chars =
9328 + inode->i_size - (le_ih_k_offset(ih) - 1) -
9329 + path.pos_in_item;
9330 + done = 1;
9331 + } else {
9332 + chars = ih_item_len(ih) - path.pos_in_item;
9334 + memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars);
9336 + if (done)
9337 + break;
9339 + p += chars;
9341 + /*
9342 + * we are done if the direct item we read is not the last item of
9343 + * the node. FIXME: we could try to check the right delimiting key
9344 + * to see whether direct item continues in the right
9345 + * neighbor or rely on i_size
9346 + */
9347 + if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
9348 + break;
9350 + /* update key to look for the next piece */
9351 + set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
9352 + result = search_for_position_by_key(inode->i_sb, &key, &path);
9353 + if (result != POSITION_FOUND)
9354 + /* i/o error most likely */
9355 + break;
9356 + bh = get_last_bh(&path);
9357 + ih = tp_item_head(&path);
9358 + } while (1);
9360 + flush_dcache_page(bh_result->b_page);
9361 + kunmap(bh_result->b_page);
9363 +finished:
9364 + pathrelse(&path);
9366 + if (result == IO_ERROR)
9367 + return -EIO;
9369 + /*
9370 + * this buffer has valid data, but isn't valid for io. mapping it to
9371 + * block #0 tells the rest of reiserfs it just has a tail in it
9372 + */
9373 + map_bh(bh_result, inode->i_sb, 0);
9374 + set_buffer_uptodate(bh_result);
9375 + return 0;
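The block-#0 mapping set up here is the same convention fix_tail_page_for_writing() tests for earlier in this file: valid data, but nothing to do i/o against. A standalone sketch of the predicate; toy_bh is a stand-in for struct buffer_head:

/* Stand-alone sketch of the "mapped but block 0 == tail data"
 * convention; toy_bh is a stand-in for struct buffer_head. */
#include <stdbool.h>
#include <stdio.h>

struct toy_bh {
        bool mapped;
        unsigned long blocknr;
};

/* true when the buffer holds tail bytes copied from a direct item,
 * i.e. it has valid data but no disk block to do i/o against */
static bool holds_unconverted_tail(const struct toy_bh *bh)
{
        return bh->mapped && bh->blocknr == 0;
}

int main(void)
{
        struct toy_bh tail = { .mapped = true, .blocknr = 0 };
        struct toy_bh normal = { .mapped = true, .blocknr = 12345 };

        printf("tail: %d, normal: %d\n",
               holds_unconverted_tail(&tail),
               holds_unconverted_tail(&normal));
        return 0;
}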
9379 + * this is called to create file map. So, _get_block_create_0 will not
9380 + * read direct item
9381 + */
9382 +static int reiserfs_bmap(struct inode *inode, sector_t block,
9383 + struct buffer_head *bh_result, int create)
9385 + if (!file_capable(inode, block))
9386 + return -EFBIG;
9388 + reiserfs_write_lock(inode->i_sb);
9389 + /* do not read the direct item */
9390 + _get_block_create_0(inode, block, bh_result, 0);
9391 + reiserfs_write_unlock(inode->i_sb);
9392 + return 0;
9396 + * special version of get_block that is only used by grab_tail_page right
9397 + * now. It is sent to __block_write_begin, and when you try to get a
9398 + * block past the end of the file (or a block from a hole) it returns
9399 + * -ENOENT instead of a valid buffer. __block_write_begin expects to
9400 + * be able to do i/o on the buffers returned, unless an error value
9401 + * is also returned.
9403 + * So, this allows __block_write_begin to be used for reading a single block
9404 + * in a page. Where it does not produce a valid page for holes, or past the
9405 + * end of the file. This turns out to be exactly what we need for reading
9406 + * tails for conversion.
9408 + * The point of the wrapper is forcing a certain value for create, even
9409 + * though the VFS layer is calling this function with create==1. If you
9410 + * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
9411 + * don't use this function.
9413 +static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
9414 + struct buffer_head *bh_result,
9415 + int create)
9417 + return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
9421 + * This is special helper for reiserfs_get_block in case we are executing
9422 + * direct_IO request.
9423 + */
9424 +static int reiserfs_get_blocks_direct_io(struct inode *inode,
9425 + sector_t iblock,
9426 + struct buffer_head *bh_result,
9427 + int create)
9429 + int ret;
9431 + bh_result->b_page = NULL;
9433 + /*
9434 + * We set the b_size before reiserfs_get_block call since it is
9435 + * referenced in convert_tail_for_hole() that may be called from
9436 + * reiserfs_get_block()
9437 + */
9438 + bh_result->b_size = i_blocksize(inode);
9440 + ret = reiserfs_get_block(inode, iblock, bh_result,
9441 + create | GET_BLOCK_NO_DANGLE);
9442 + if (ret)
9443 + goto out;
9445 + /* don't allow direct io onto tail pages */
9446 + if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
9447 + /*
9448 + * make sure future calls to the direct io funcs for this
9449 + * offset in the file fail by unmapping the buffer
9450 + */
9451 + clear_buffer_mapped(bh_result);
9452 + ret = -EINVAL;
9455 + /*
9456 + * Possible unpacked tail. Flush the data before pages have
9457 + * disappeared
9458 + */
9459 + if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
9460 + int err;
9462 + reiserfs_write_lock(inode->i_sb);
9464 + err = reiserfs_commit_for_inode(inode);
9465 + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
9467 + reiserfs_write_unlock(inode->i_sb);
9469 + if (err < 0)
9470 + ret = err;
9472 +out:
9473 + return ret;
9477 + * helper function for when reiserfs_get_block is called for a hole
9478 + * but the file tail is still in a direct item
9479 + * bh_result is the buffer head for the hole
9480 + * tail_offset is the offset of the start of the tail in the file
9482 + * This calls prepare_write, which will start a new transaction
9483 + * you should not be in a transaction, or have any paths held when you
9484 + * call this.
9485 + */
9486 +static int convert_tail_for_hole(struct inode *inode,
9487 + struct buffer_head *bh_result,
9488 + loff_t tail_offset)
9490 + unsigned long index;
9491 + unsigned long tail_end;
9492 + unsigned long tail_start;
9493 + struct page *tail_page;
9494 + struct page *hole_page = bh_result->b_page;
9495 + int retval = 0;
9497 + if ((tail_offset & (bh_result->b_size - 1)) != 1)
9498 + return -EIO;
9500 + /* always try to read until the end of the block */
9501 + tail_start = tail_offset & (PAGE_SIZE - 1);
9502 + tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
9504 + index = tail_offset >> PAGE_SHIFT;
9505 + /*
9506 + * hole_page can be zero in case of direct_io, we are sure
9507 + * that we cannot get here if we write with O_DIRECT into tail page
9508 + */
9509 + if (!hole_page || index != hole_page->index) {
9510 + tail_page = grab_cache_page(inode->i_mapping, index);
9511 + retval = -ENOMEM;
9512 + if (!tail_page) {
9513 + goto out;
9515 + } else {
9516 + tail_page = hole_page;
9519 + /*
9520 + * we don't have to make sure the conversion did not happen while
9521 + * we were locking the page because anyone that could convert
9522 + * must first take i_mutex.
9524 + * We must fix the tail page for writing because it might have buffers
9525 + * that are mapped, but have a block number of 0. This indicates tail
9526 + * data that has been read directly into the page, and
9527 + * __block_write_begin won't trigger a get_block in this case.
9528 + */
9529 + fix_tail_page_for_writing(tail_page);
9530 + retval = __reiserfs_write_begin(tail_page, tail_start,
9531 + tail_end - tail_start);
9532 + if (retval)
9533 + goto unlock;
9535 + /* tail conversion might change the data in the page */
9536 + flush_dcache_page(tail_page);
9538 + retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
9540 +unlock:
9541 + if (tail_page != hole_page) {
9542 + unlock_page(tail_page);
9543 + put_page(tail_page);
9545 +out:
9546 + return retval;
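The tail window arithmetic above can be checked by hand. A standalone rerun of the same expressions, assuming 1K blocks and 4K pages; tail_offset is the 1-based file offset of the first tail byte:

/* Standalone check of the tail window arithmetic in
 * convert_tail_for_hole(); constants are illustrative. */
#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL
#define TOY_PAGE_SHIFT 12

int main(void)
{
        uint64_t tail_offset = 5 * 1024 + 1;    /* tail starts at byte 5121 */
        unsigned long b_size = 1024;            /* 1K blocks, 4K pages */

        unsigned long tail_start = tail_offset & (TOY_PAGE_SIZE - 1);
        unsigned long tail_end = (tail_start | (b_size - 1)) + 1;
        unsigned long index = tail_offset >> TOY_PAGE_SHIFT;

        /* (tail_offset & (b_size - 1)) must be 1: in 1-based offsets,
         * a tail begins on a block boundary */
        printf("in-page start %lu, end %lu, page index %lu, aligned %d\n",
               tail_start, tail_end, index,
               (tail_offset & (b_size - 1)) == 1);
        return 0;
}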
9549 +static inline int _allocate_block(struct reiserfs_transaction_handle *th,
9550 + sector_t block,
9551 + struct inode *inode,
9552 + b_blocknr_t * allocated_block_nr,
9553 + struct treepath *path, int flags)
9555 + BUG_ON(!th->t_trans_id);
9557 +#ifdef REISERFS_PREALLOCATE
9558 + if (!(flags & GET_BLOCK_NO_IMUX)) {
9559 + return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
9560 + path, block);
9562 +#endif
9563 + return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
9564 + block);
9567 +int reiserfs_get_block(struct inode *inode, sector_t block,
9568 + struct buffer_head *bh_result, int create)
9570 + int repeat, retval = 0;
9571 + /* b_blocknr_t is an (unsigned) 32 bit int */
9572 + b_blocknr_t allocated_block_nr = 0;
9573 + INITIALIZE_PATH(path);
9574 + int pos_in_item;
9575 + struct cpu_key key;
9576 + struct buffer_head *bh, *unbh = NULL;
9577 + struct item_head *ih, tmp_ih;
9578 + __le32 *item;
9579 + int done;
9580 + int fs_gen;
9581 + struct reiserfs_transaction_handle *th = NULL;
9582 + /*
9583 + * space reserved in transaction batch:
9584 + * . 3 balancings in direct->indirect conversion
9585 + * . 1 block involved into reiserfs_update_sd()
9586 + * XXX in practically impossible worst case direct2indirect()
9587 + * can incur (much) more than 3 balancings.
9588 + * quota update for user, group
9589 + */
9590 + int jbegin_count =
9591 + JOURNAL_PER_BALANCE_CNT * 3 + 1 +
9592 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
9593 + int version;
9594 + int dangle = 1;
9595 + loff_t new_offset =
9596 + (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
9598 + reiserfs_write_lock(inode->i_sb);
9599 + version = get_inode_item_key_version(inode);
9601 + if (!file_capable(inode, block)) {
9602 + reiserfs_write_unlock(inode->i_sb);
9603 + return -EFBIG;
9606 + /*
9607 + * if !create, we aren't changing the FS, so we don't need to
9608 + * log anything, so we don't need to start a transaction
9609 + */
9610 + if (!(create & GET_BLOCK_CREATE)) {
9611 + int ret;
9612 + /* find number of block-th logical block of the file */
9613 + ret = _get_block_create_0(inode, block, bh_result,
9614 + create | GET_BLOCK_READ_DIRECT);
9615 + reiserfs_write_unlock(inode->i_sb);
9616 + return ret;
9619 + /*
9620 + * if we're already in a transaction, make sure to close
9621 + * any new transactions we start in this func
9622 + */
9623 + if ((create & GET_BLOCK_NO_DANGLE) ||
9624 + reiserfs_transaction_running(inode->i_sb))
9625 + dangle = 0;
9627 + /*
9628 + * If file is of such a size, that it might have a tail and
9629 + * tails are enabled we should mark it as possibly needing
9630 + * tail packing on close
9631 + */
9632 + if ((have_large_tails(inode->i_sb)
9633 + && inode->i_size < i_block_size(inode) * 4)
9634 + || (have_small_tails(inode->i_sb)
9635 + && inode->i_size < i_block_size(inode)))
9636 + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
9638 + /* set the key of the first byte in the 'block'-th block of file */
9639 + make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
9640 + if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
9641 +start_trans:
9642 + th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
9643 + if (!th) {
9644 + retval = -ENOMEM;
9645 + goto failure;
9647 + reiserfs_update_inode_transaction(inode);
9649 +research:
9651 + retval = search_for_position_by_key(inode->i_sb, &key, &path);
9652 + if (retval == IO_ERROR) {
9653 + retval = -EIO;
9654 + goto failure;
9657 + bh = get_last_bh(&path);
9658 + ih = tp_item_head(&path);
9659 + item = tp_item_body(&path);
9660 + pos_in_item = path.pos_in_item;
9662 + fs_gen = get_generation(inode->i_sb);
9663 + copy_item_head(&tmp_ih, ih);
9665 + if (allocation_needed
9666 + (retval, allocated_block_nr, ih, item, pos_in_item)) {
9667 + /* we have to allocate block for the unformatted node */
9668 + if (!th) {
9669 + pathrelse(&path);
9670 + goto start_trans;
9673 + repeat =
9674 + _allocate_block(th, block, inode, &allocated_block_nr,
9675 + &path, create);
9677 + /*
9678 + * restart the transaction to give the journal a chance to free
9679 + * some blocks. releases the path, so we have to go back to
9680 + * research if we succeed on the second try
9681 + */
9682 + if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
9683 + SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
9684 + retval = restart_transaction(th, inode, &path);
9685 + if (retval)
9686 + goto failure;
9687 + repeat =
9688 + _allocate_block(th, block, inode,
9689 + &allocated_block_nr, NULL, create);
9691 + if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
9692 + goto research;
9694 + if (repeat == QUOTA_EXCEEDED)
9695 + retval = -EDQUOT;
9696 + else
9697 + retval = -ENOSPC;
9698 + goto failure;
9701 + if (fs_changed(fs_gen, inode->i_sb)
9702 + && item_moved(&tmp_ih, &path)) {
9703 + goto research;
9707 + if (indirect_item_found(retval, ih)) {
9708 + b_blocknr_t unfm_ptr;
9709 + /*
9710 + * 'block'-th block is in the file already (there is
9711 + * corresponding cell in some indirect item). But it may be
9712 + * a zero unformatted node pointer (a hole)
9713 + */
9714 + unfm_ptr = get_block_num(item, pos_in_item);
9715 + if (unfm_ptr == 0) {
9716 + /* use allocated block to plug the hole */
9717 + reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
9718 + if (fs_changed(fs_gen, inode->i_sb)
9719 + && item_moved(&tmp_ih, &path)) {
9720 + reiserfs_restore_prepared_buffer(inode->i_sb,
9721 + bh);
9722 + goto research;
9724 + set_buffer_new(bh_result);
9725 + if (buffer_dirty(bh_result)
9726 + && reiserfs_data_ordered(inode->i_sb))
9727 + reiserfs_add_ordered_list(inode, bh_result);
9728 + put_block_num(item, pos_in_item, allocated_block_nr);
9729 + unfm_ptr = allocated_block_nr;
9730 + journal_mark_dirty(th, bh);
9731 + reiserfs_update_sd(th, inode);
9733 + set_block_dev_mapped(bh_result, unfm_ptr, inode);
9734 + pathrelse(&path);
9735 + retval = 0;
9736 + if (!dangle && th)
9737 + retval = reiserfs_end_persistent_transaction(th);
9739 + reiserfs_write_unlock(inode->i_sb);
9741 + /*
9742 + * the item was found, so new blocks were not added to the file;
9743 + * there is no need to make sure the inode is updated with this
9744 + * transaction
9745 + */
9746 + return retval;
9749 + if (!th) {
9750 + pathrelse(&path);
9751 + goto start_trans;
9754 + /*
9755 + * desired position is not found or is in the direct item. We have
9756 + * to append the file with holes up to the 'block'-th block, converting
9757 + * direct items to an indirect one if necessary
9758 + */
9759 + done = 0;
9760 + do {
9761 + if (is_statdata_le_ih(ih)) {
9762 + __le32 unp = 0;
9763 + struct cpu_key tmp_key;
9765 + /* indirect item has to be inserted */
9766 + make_le_item_head(&tmp_ih, &key, version, 1,
9767 + TYPE_INDIRECT, UNFM_P_SIZE,
9768 + 0 /* free_space */ );
9770 + /*
9771 + * we are going to add 'block'-th block to the file.
9772 + * Use allocated block for that
9773 + */
9774 + if (cpu_key_k_offset(&key) == 1) {
9775 + unp = cpu_to_le32(allocated_block_nr);
9776 + set_block_dev_mapped(bh_result,
9777 + allocated_block_nr, inode);
9778 + set_buffer_new(bh_result);
9779 + done = 1;
9781 + tmp_key = key; /* ;) */
9782 + set_cpu_key_k_offset(&tmp_key, 1);
9783 + PATH_LAST_POSITION(&path)++;
9785 + retval =
9786 + reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
9787 + inode, (char *)&unp);
9788 + if (retval) {
9789 + reiserfs_free_block(th, inode,
9790 + allocated_block_nr, 1);
9791 + /*
9792 + * retval == -ENOSPC, -EDQUOT or -EIO
9793 + * or -EEXIST
9794 + */
9795 + goto failure;
9797 + } else if (is_direct_le_ih(ih)) {
9798 + /* direct item has to be converted */
9799 + loff_t tail_offset;
9801 + tail_offset =
9802 + ((le_ih_k_offset(ih) -
9803 + 1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
9805 + /*
9806 + * direct item we just found fits into block we have
9807 + * to map. Convert it into unformatted node: use
9808 + * bh_result for the conversion
9809 + */
9810 + if (tail_offset == cpu_key_k_offset(&key)) {
9811 + set_block_dev_mapped(bh_result,
9812 + allocated_block_nr, inode);
9813 + unbh = bh_result;
9814 + done = 1;
9815 + } else {
9816 + /*
9817 + * we have to pad file tail stored in direct
9818 + * item(s) up to block size and convert it
9819 + * to unformatted node. FIXME: this should
9820 + * also get into page cache
9821 + */
9823 + pathrelse(&path);
9824 + /*
9825 + * ugly, but we can only end the transaction if
9826 + * we aren't nested
9827 + */
9828 + BUG_ON(!th->t_refcount);
9829 + if (th->t_refcount == 1) {
9830 + retval =
9831 + reiserfs_end_persistent_transaction
9832 + (th);
9833 + th = NULL;
9834 + if (retval)
9835 + goto failure;
9838 + retval =
9839 + convert_tail_for_hole(inode, bh_result,
9840 + tail_offset);
9841 + if (retval) {
9842 + if (retval != -ENOSPC)
9843 + reiserfs_error(inode->i_sb,
9844 + "clm-6004",
9845 + "convert tail failed "
9846 + "inode %lu, error %d",
9847 + inode->i_ino,
9848 + retval);
9849 + if (allocated_block_nr) {
9850 + /*
9851 + * the bitmap, the super,
9852 + * and the stat data == 3
9853 + */
9854 + if (!th)
9855 + th = reiserfs_persistent_transaction(inode->i_sb, 3);
9856 + if (th)
9857 + reiserfs_free_block(th,
9858 + inode,
9859 + allocated_block_nr,
9860 + 1);
9862 + goto failure;
9864 + goto research;
9866 + retval =
9867 + direct2indirect(th, inode, &path, unbh,
9868 + tail_offset);
9869 + if (retval) {
9870 + reiserfs_unmap_buffer(unbh);
9871 + reiserfs_free_block(th, inode,
9872 + allocated_block_nr, 1);
9873 + goto failure;
9875 + /*
9876 + * it is important the set_buffer_uptodate is done
9877 + * after the direct2indirect. The buffer might
9878 + * contain valid data newer than the data on disk
9879 + * (read by read_folio, changed, and then sent here by
9880 + * writepage). direct2indirect needs to know if unbh
9881 + * was already up to date, so it can decide if the
9882 + * data in unbh needs to be replaced with data from
9883 + * the disk
9884 + */
9885 + set_buffer_uptodate(unbh);
9887 + /*
9888 + * unbh->b_page == NULL in case of DIRECT_IO request,
9889 + * this means buffer will disappear shortly, so it
9890 + * should not be added to the tail list
9891 + */
9892 + if (unbh->b_page) {
9893 + /*
9894 + * we've converted the tail, so we must
9895 + * flush unbh before the transaction commits
9896 + */
9897 + reiserfs_add_tail_list(inode, unbh);
9899 + /*
9900 + * mark it dirty now to prevent commit_write
9901 + * from adding this buffer to the inode's
9902 + * dirty buffer list
9903 + */
9904 + /*
9905 + * AKPM: changed __mark_buffer_dirty to
9906 + * mark_buffer_dirty(). It's still atomic,
9907 + * but it sets the page dirty too, which makes
9908 + * it eligible for writeback at any time by the
9909 + * VM (which was also the case with
9910 + * __mark_buffer_dirty())
9911 + */
9912 + mark_buffer_dirty(unbh);
9914 + } else {
9915 + /*
9916 + * append indirect item with holes if needed, when
9917 + * appending pointer to 'block'-th block use block,
9918 + * which is already allocated
9919 + */
9920 + struct cpu_key tmp_key;
9921 + /*
9922 + * We use this in case we need to allocate
9923 + * only one block which is a fastpath
9924 + */
9925 + unp_t unf_single = 0;
9926 + unp_t *un;
9927 + __u64 max_to_insert =
9928 + MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
9929 + UNFM_P_SIZE;
9930 + __u64 blocks_needed;
9932 + RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
9933 + "vs-804: invalid position for append");
9934 + /*
9935 + * indirect item has to be appended,
9936 + * set up key of that position
9937 + * (key type is unimportant)
9938 + */
9939 + make_cpu_key(&tmp_key, inode,
9940 + le_key_k_offset(version,
9941 + &ih->ih_key) +
9942 + op_bytes_number(ih,
9943 + inode->i_sb->s_blocksize),
9944 + TYPE_INDIRECT, 3);
9946 + RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
9947 + "green-805: invalid offset");
9948 + blocks_needed =
9949 + 1 +
9950 + ((cpu_key_k_offset(&key) -
9951 + cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
9952 + s_blocksize_bits);
9954 + if (blocks_needed == 1) {
9955 + un = &unf_single;
9956 + } else {
9957 + un = kcalloc(min(blocks_needed, max_to_insert),
9958 + UNFM_P_SIZE, GFP_NOFS);
9959 + if (!un) {
9960 + un = &unf_single;
9961 + blocks_needed = 1;
9962 + max_to_insert = 0;
9965 + if (blocks_needed <= max_to_insert) {
9966 + /*
9967 + * we are going to add target block to
9968 + * the file. Use allocated block for that
9969 + */
9970 + un[blocks_needed - 1] =
9971 + cpu_to_le32(allocated_block_nr);
9972 + set_block_dev_mapped(bh_result,
9973 + allocated_block_nr, inode);
9974 + set_buffer_new(bh_result);
9975 + done = 1;
9976 + } else {
9977 + /* paste hole to the indirect item */
9978 + /*
9979 + * If kcalloc failed, max_to_insert becomes
9980 + * zero and it means we only have space for
9981 + * one block
9982 + */
9983 + blocks_needed =
9984 + max_to_insert ? max_to_insert : 1;
9986 + retval =
9987 + reiserfs_paste_into_item(th, &path, &tmp_key, inode,
9988 + (char *)un,
9989 + UNFM_P_SIZE *
9990 + blocks_needed);
9992 + if (blocks_needed != 1)
9993 + kfree(un);
9995 + if (retval) {
9996 + reiserfs_free_block(th, inode,
9997 + allocated_block_nr, 1);
9998 + goto failure;
10000 + if (!done) {
10001 + /*
10002 + * We need to mark new file size in case
10003 + * this function will be interrupted/aborted
10004 + * later on. And we may do this only for
10005 + * holes.
10006 + */
10007 + inode->i_size +=
10008 + inode->i_sb->s_blocksize * blocks_needed;
10012 + if (done == 1)
10013 + break;
10015 + /*
10016 + * this loop could log more blocks than we had originally
10017 + * asked for. So, we have to allow the transaction to end
10018 + * if it is too big or too full. Update the inode so things
10019 + * are consistent if we crash before the function returns
10020 + * release the path so that anybody waiting on the path before
10021 + * ending their transaction will be able to continue.
10022 + */
10023 + if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
10024 + retval = restart_transaction(th, inode, &path);
10025 + if (retval)
10026 + goto failure;
10028 + /*
10029 + * inserting indirect pointers for a hole can take a
10030 + * long time. reschedule if needed and also release the write
10031 + * lock for others.
10032 + */
10033 + reiserfs_cond_resched(inode->i_sb);
10035 + retval = search_for_position_by_key(inode->i_sb, &key, &path);
10036 + if (retval == IO_ERROR) {
10037 + retval = -EIO;
10038 + goto failure;
10040 + if (retval == POSITION_FOUND) {
10041 + reiserfs_warning(inode->i_sb, "vs-825",
10042 + "%K should not be found", &key);
10043 + retval = -EEXIST;
10044 + if (allocated_block_nr)
10045 + reiserfs_free_block(th, inode,
10046 + allocated_block_nr, 1);
10047 + pathrelse(&path);
10048 + goto failure;
10050 + bh = get_last_bh(&path);
10051 + ih = tp_item_head(&path);
10052 + item = tp_item_body(&path);
10053 + pos_in_item = path.pos_in_item;
10054 + } while (1);
10056 + retval = 0;
10058 +failure:
10059 + if (th && (!dangle || (retval && !th->t_trans_id))) {
10060 + int err;
10061 + if (th->t_trans_id)
10062 + reiserfs_update_sd(th, inode);
10063 + err = reiserfs_end_persistent_transaction(th);
10064 + if (err)
10065 + retval = err;
10068 + reiserfs_write_unlock(inode->i_sb);
10069 + reiserfs_check_path(&path);
10070 + return retval;
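The hole-filling loop above pastes at most MAX_ITEM_LEN(blocksize) / UNFM_P_SIZE pointers per iteration and derives blocks_needed from the key distance. A standalone illustration of that math; the 24-byte block-head and item-head overheads are assumptions for the example, not the kernel's exact constants:

/* Standalone illustration of the hole-filling math in
 * reiserfs_get_block(); header sizes are assumed for the example. */
#include <stdint.h>
#include <stdio.h>

#define UNFM_P_SIZE 4   /* one unformatted-node pointer */

int main(void)
{
        unsigned long blocksize = 4096, blocksize_bits = 12;
        unsigned long max_item_len = blocksize - 24 - 24;       /* assumed */
        uint64_t max_to_insert = max_item_len / UNFM_P_SIZE;

        uint64_t key_off = 100 * blocksize + 1;         /* block being mapped */
        uint64_t item_end_off = 4 * blocksize + 1;      /* end of indirect item */
        uint64_t blocks_needed =
            1 + ((key_off - item_end_off) >> blocksize_bits);

        printf("can paste up to %llu pointers per item, need %llu\n",
               (unsigned long long)max_to_insert,
               (unsigned long long)blocks_needed);
        return 0;
}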
10073 +static void reiserfs_readahead(struct readahead_control *rac)
10075 + mpage_readahead(rac, reiserfs_get_block);
10079 + * Compute real number of used bytes by file
10080 + * Following three functions can go away when we'll have enough space in
10081 + * stat item
10082 + */
10083 +static int real_space_diff(struct inode *inode, int sd_size)
10085 + int bytes;
10086 + loff_t blocksize = inode->i_sb->s_blocksize;
10088 + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
10089 + return sd_size;
10091 + /*
10092 + * End of file is also in full block with indirect reference, so round
10093 + * up to the next block.
10095 + * there is just no way to know if the tail is actually packed
10096 + * in the file, so we have to assume it isn't. When we pack the
10097 + * tail, we add 4 bytes to pretend there really is an unformatted
10098 + * node pointer
10099 + */
10100 + bytes =
10101 + ((inode->i_size +
10102 + (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
10103 + sd_size;
10104 + return bytes;
10107 +static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
10108 + int sd_size)
10110 + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
10111 + return inode->i_size +
10112 + (loff_t) (real_space_diff(inode, sd_size));
10114 + return ((loff_t) real_space_diff(inode, sd_size)) +
10115 + (((loff_t) blocks) << 9);
10118 +/* Compute number of blocks used by file in ReiserFS counting */
10119 +static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
10121 + loff_t bytes = inode_get_bytes(inode);
10122 + loff_t real_space = real_space_diff(inode, sd_size);
10124 + /* keeps fsck and non-quota versions of reiserfs happy */
10125 + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
10126 + bytes += (loff_t) 511;
10129 + /*
10130 + * files from before the quota patch might have i_blocks such that
10131 + * bytes < real_space. Deal with that here to prevent it from
10132 + * going negative.
10133 + */
10134 + if (bytes < real_space)
10135 + return 0;
10136 + return (bytes - real_space) >> 9;
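Taken together, the accounting helpers charge one 4-byte unformatted pointer per block plus the stat data, then convert i_bytes back into 512-byte sectors for quota. A standalone rerun of the formulas above; the 44-byte v2 stat data size is an assumption for illustration:

/* Standalone version of the byte accounting in real_space_diff() and
 * to_fake_used_blocks(); the stat data size is assumed. */
#include <stdint.h>
#include <stdio.h>

#define UNFM_P_SIZE 4
#define TOY_SD_SIZE 44  /* assumed size of the v2 stat data */

int main(void)
{
        uint64_t i_size = 10000, blocksize = 4096;

        uint64_t real_space =
            ((i_size + blocksize - 1) / blocksize) * UNFM_P_SIZE + TOY_SD_SIZE;
        /* pretend the file occupies 3 full blocks on disk */
        uint64_t i_bytes = real_space + 3 * blocksize;
        uint64_t fake_blocks = (i_bytes - real_space) >> 9;

        printf("overhead %llu bytes, fake block count %llu sectors\n",
               (unsigned long long)real_space,
               (unsigned long long)fake_blocks);
        return 0;
}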
10140 + * BAD: new directories have stat data of new type and all other items
10141 + * of old type. The version stored in the inode describes the body
10142 + * items, so in update_stat_data we cannot rely on the inode and
10143 + * have to check the item version directly
10144 + */
10146 +/* called by read_locked_inode */
10147 +static void init_inode(struct inode *inode, struct treepath *path)
10149 + struct buffer_head *bh;
10150 + struct item_head *ih;
10151 + __u32 rdev;
10153 + bh = PATH_PLAST_BUFFER(path);
10154 + ih = tp_item_head(path);
10156 + copy_key(INODE_PKEY(inode), &ih->ih_key);
10158 + INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
10159 + REISERFS_I(inode)->i_flags = 0;
10160 + REISERFS_I(inode)->i_prealloc_block = 0;
10161 + REISERFS_I(inode)->i_prealloc_count = 0;
10162 + REISERFS_I(inode)->i_trans_id = 0;
10163 + REISERFS_I(inode)->i_jl = NULL;
10164 + reiserfs_init_xattr_rwsem(inode);
10166 + if (stat_data_v1(ih)) {
10167 + struct stat_data_v1 *sd =
10168 + (struct stat_data_v1 *)ih_item_body(bh, ih);
10169 + unsigned long blocks;
10171 + set_inode_item_key_version(inode, KEY_FORMAT_3_5);
10172 + set_inode_sd_version(inode, STAT_DATA_V1);
10173 + inode->i_mode = sd_v1_mode(sd);
10174 + set_nlink(inode, sd_v1_nlink(sd));
10175 + i_uid_write(inode, sd_v1_uid(sd));
10176 + i_gid_write(inode, sd_v1_gid(sd));
10177 + inode->i_size = sd_v1_size(sd);
10178 + inode_set_atime(inode, sd_v1_atime(sd), 0);
10179 + inode_set_mtime(inode, sd_v1_mtime(sd), 0);
10180 + inode_set_ctime(inode, sd_v1_ctime(sd), 0);
10182 + inode->i_blocks = sd_v1_blocks(sd);
10183 + inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
10184 + blocks = (inode->i_size + 511) >> 9;
10185 + blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
10187 + /*
10188 + * there was a bug in <=3.5.23 when i_blocks could take
10189 + * negative values. Starting from 3.5.17 this value could
10190 + * even be stored in stat data. For such files we set
10191 + * i_blocks based on file size. Just 2 notes: this can be
10192 + * wrong for sparse files. The on-disk value will only be
10193 + * updated if the file's inode ever changes
10194 + */
10195 + if (inode->i_blocks > blocks) {
10196 + inode->i_blocks = blocks;
10199 + rdev = sd_v1_rdev(sd);
10200 + REISERFS_I(inode)->i_first_direct_byte =
10201 + sd_v1_first_direct_byte(sd);
10203 + /*
10204 + * an early bug in the quota code can give us an odd
10205 + * number for the block count. This is incorrect, fix it here.
10206 + */
10207 + if (inode->i_blocks & 1) {
10208 + inode->i_blocks++;
10210 + inode_set_bytes(inode,
10211 + to_real_used_space(inode, inode->i_blocks,
10212 + SD_V1_SIZE));
10213 + /*
10214 + * nopack is initially zero for v1 objects. For v2 objects,
10215 + * nopack is initialised from sd_attrs
10216 + */
10217 + REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
10218 + } else {
10219 + /*
10220 + * new stat data found, but object may have old items
10221 + * (directories and symlinks)
10222 + */
10223 + struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih);
10225 + inode->i_mode = sd_v2_mode(sd);
10226 + set_nlink(inode, sd_v2_nlink(sd));
10227 + i_uid_write(inode, sd_v2_uid(sd));
10228 + inode->i_size = sd_v2_size(sd);
10229 + i_gid_write(inode, sd_v2_gid(sd));
10230 + inode_set_mtime(inode, sd_v2_mtime(sd), 0);
10231 + inode_set_atime(inode, sd_v2_atime(sd), 0);
10232 + inode_set_ctime(inode, sd_v2_ctime(sd), 0);
10233 + inode->i_blocks = sd_v2_blocks(sd);
10234 + rdev = sd_v2_rdev(sd);
10235 + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
10236 + inode->i_generation =
10237 + le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
10238 + else
10239 + inode->i_generation = sd_v2_generation(sd);
10241 + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
10242 + set_inode_item_key_version(inode, KEY_FORMAT_3_5);
10243 + else
10244 + set_inode_item_key_version(inode, KEY_FORMAT_3_6);
10245 + REISERFS_I(inode)->i_first_direct_byte = 0;
10246 + set_inode_sd_version(inode, STAT_DATA_V2);
10247 + inode_set_bytes(inode,
10248 + to_real_used_space(inode, inode->i_blocks,
10249 + SD_V2_SIZE));
10250 + /*
10251 + * read persistent inode attributes from sd and initialise
10252 + * generic inode flags from them
10253 + */
10254 + REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
10255 + sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
10258 + pathrelse(path);
10259 + if (S_ISREG(inode->i_mode)) {
10260 + inode->i_op = &reiserfs_file_inode_operations;
10261 + inode->i_fop = &reiserfs_file_operations;
10262 + inode->i_mapping->a_ops = &reiserfs_address_space_operations;
10263 + } else if (S_ISDIR(inode->i_mode)) {
10264 + inode->i_op = &reiserfs_dir_inode_operations;
10265 + inode->i_fop = &reiserfs_dir_operations;
10266 + } else if (S_ISLNK(inode->i_mode)) {
10267 + inode->i_op = &reiserfs_symlink_inode_operations;
10268 + inode_nohighmem(inode);
10269 + inode->i_mapping->a_ops = &reiserfs_address_space_operations;
10270 + } else {
10271 + inode->i_blocks = 0;
10272 + inode->i_op = &reiserfs_special_inode_operations;
10273 + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
10277 +/* update new stat data with inode fields */
10278 +static void inode2sd(void *sd, struct inode *inode, loff_t size)
10280 + struct stat_data *sd_v2 = (struct stat_data *)sd;
10282 + set_sd_v2_mode(sd_v2, inode->i_mode);
10283 + set_sd_v2_nlink(sd_v2, inode->i_nlink);
10284 + set_sd_v2_uid(sd_v2, i_uid_read(inode));
10285 + set_sd_v2_size(sd_v2, size);
10286 + set_sd_v2_gid(sd_v2, i_gid_read(inode));
10287 + set_sd_v2_mtime(sd_v2, inode_get_mtime_sec(inode));
10288 + set_sd_v2_atime(sd_v2, inode_get_atime_sec(inode));
10289 + set_sd_v2_ctime(sd_v2, inode_get_ctime_sec(inode));
10290 + set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
10291 + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
10292 + set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
10293 + else
10294 + set_sd_v2_generation(sd_v2, inode->i_generation);
10295 + set_sd_v2_attrs(sd_v2, REISERFS_I(inode)->i_attrs);
10298 +/* used to copy inode's fields to old stat data */
10299 +static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
10301 + struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
10303 + set_sd_v1_mode(sd_v1, inode->i_mode);
10304 + set_sd_v1_uid(sd_v1, i_uid_read(inode));
10305 + set_sd_v1_gid(sd_v1, i_gid_read(inode));
10306 + set_sd_v1_nlink(sd_v1, inode->i_nlink);
10307 + set_sd_v1_size(sd_v1, size);
10308 + set_sd_v1_atime(sd_v1, inode_get_atime_sec(inode));
10309 + set_sd_v1_ctime(sd_v1, inode_get_ctime_sec(inode));
10310 + set_sd_v1_mtime(sd_v1, inode_get_mtime_sec(inode));
10312 + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
10313 + set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
10314 + else
10315 + set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
10317 + /* Sigh. i_first_direct_byte is back */
10318 + set_sd_v1_first_direct_byte(sd_v1,
10319 + REISERFS_I(inode)->i_first_direct_byte);
10323 + * NOTE, you must prepare the buffer head before sending it here,
10324 + * and then log it after the call
10325 + */
10326 +static void update_stat_data(struct treepath *path, struct inode *inode,
10327 + loff_t size)
10329 + struct buffer_head *bh;
10330 + struct item_head *ih;
10332 + bh = PATH_PLAST_BUFFER(path);
10333 + ih = tp_item_head(path);
10335 + if (!is_statdata_le_ih(ih))
10336 + reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
10337 + INODE_PKEY(inode), ih);
10339 + /* path points to old stat data */
10340 + if (stat_data_v1(ih)) {
10341 + inode2sd_v1(ih_item_body(bh, ih), inode, size);
10342 + } else {
10343 + inode2sd(ih_item_body(bh, ih), inode, size);
10346 + return;
10349 +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
10350 + struct inode *inode, loff_t size)
10352 + struct cpu_key key;
10353 + INITIALIZE_PATH(path);
10354 + struct buffer_head *bh;
10355 + int fs_gen;
10356 + struct item_head *ih, tmp_ih;
10357 + int retval;
10359 + BUG_ON(!th->t_trans_id);
10361 + /* key type is unimportant */
10362 + make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);
10364 + for (;;) {
10365 + int pos;
10366 + /* look for the object's stat data */
10367 + retval = search_item(inode->i_sb, &key, &path);
10368 + if (retval == IO_ERROR) {
10369 + reiserfs_error(inode->i_sb, "vs-13050",
10370 + "i/o failure occurred trying to "
10371 + "update %K stat data", &key);
10372 + return;
10374 + if (retval == ITEM_NOT_FOUND) {
10375 + pos = PATH_LAST_POSITION(&path);
10376 + pathrelse(&path);
10377 + if (inode->i_nlink == 0) {
10378 + /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
10379 + return;
10381 + reiserfs_warning(inode->i_sb, "vs-13060",
10382 + "stat data of object %k (nlink == %d) "
10383 + "not found (pos %d)",
10384 + INODE_PKEY(inode), inode->i_nlink,
10385 + pos);
10386 + reiserfs_check_path(&path);
10387 + return;
10390 + /*
10391 + * sigh, prepare_for_journal might schedule. When it
10392 + * schedules the FS might change. We have to detect that,
10393 + * and loop back to the search if the stat data item has moved
10394 + */
10395 + bh = get_last_bh(&path);
10396 + ih = tp_item_head(&path);
10397 + copy_item_head(&tmp_ih, ih);
10398 + fs_gen = get_generation(inode->i_sb);
10399 + reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
10401 + /* Stat_data item has been moved after scheduling. */
10402 + if (fs_changed(fs_gen, inode->i_sb)
10403 + && item_moved(&tmp_ih, &path)) {
10404 + reiserfs_restore_prepared_buffer(inode->i_sb, bh);
10405 + continue;
10407 + break;
10409 + update_stat_data(&path, inode, size);
10410 + journal_mark_dirty(th, bh);
10411 + pathrelse(&path);
10412 + return;
10416 + * reiserfs_read_locked_inode is called to read the inode off disk, and it
10417 + * does a make_bad_inode when things go wrong. But, we need to make sure
10418 + * and clear the key in the private portion of the inode, otherwise a
10419 + * corresponding iput might try to delete whatever object the inode last
10420 + * represented.
10421 + */
10422 +static void reiserfs_make_bad_inode(struct inode *inode)
10424 + memset(INODE_PKEY(inode), 0, KEY_SIZE);
10425 + make_bad_inode(inode);
10429 + * initially this function was derived from minix or ext2's analog and
10430 + * evolved as the prototype did
10431 + */
10432 +int reiserfs_init_locked_inode(struct inode *inode, void *p)
10434 + struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
10435 + inode->i_ino = args->objectid;
10436 + INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
10437 + return 0;
10441 + * looks for stat data in the tree, and fills in the stat data
10442 + * fields of the in-core inode
10443 + */
10444 +void reiserfs_read_locked_inode(struct inode *inode,
10445 + struct reiserfs_iget_args *args)
10447 + INITIALIZE_PATH(path_to_sd);
10448 + struct cpu_key key;
10449 + unsigned long dirino;
10450 + int retval;
10452 + dirino = args->dirid;
10454 + /*
10455 + * set version 1, version 2 could be used too, because stat data
10456 + * key is the same in both versions
10457 + */
10458 + _make_cpu_key(&key, KEY_FORMAT_3_5, dirino, inode->i_ino, 0, 0, 3);
10460 + /* look for the object's stat data */
10461 + retval = search_item(inode->i_sb, &key, &path_to_sd);
10462 + if (retval == IO_ERROR) {
10463 + reiserfs_error(inode->i_sb, "vs-13070",
10464 + "i/o failure occurred trying to find "
10465 + "stat data of %K", &key);
10466 + reiserfs_make_bad_inode(inode);
10467 + return;
10470 + /* a stale NFS handle can trigger this without it being an error */
10471 + if (retval != ITEM_FOUND) {
10472 + pathrelse(&path_to_sd);
10473 + reiserfs_make_bad_inode(inode);
10474 + clear_nlink(inode);
10475 + return;
10478 + init_inode(inode, &path_to_sd);
10480 + /*
10481 + * It is possible that knfsd is trying to access inode of a file
10482 + * that is being removed from the disk by some other thread. As we
10483 + * update sd on unlink all that is required is to check for nlink
10484 + * here. This bug was first found by Sizif when debugging
10485 + * SquidNG/Butterfly, forgotten, and found again after Philippe
10486 + * Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
10488 + * A more logical fix would require changes in fs/inode.c:iput() to
10489 + * remove inode from hash-table _after_ fs cleaned disk stuff up and
10490 + * in iget() to return NULL if I_FREEING inode is found in
10491 + * hash-table.
10492 + */
10494 + /*
10495 + * Currently there is one place where it's ok to meet inode with
10496 + * nlink==0: processing of open-unlinked and half-truncated files
10497 + * during mount (fs/reiserfs/super.c:finish_unfinished()).
10498 + */
10499 + if ((inode->i_nlink == 0) &&
10500 + !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
10501 + reiserfs_warning(inode->i_sb, "vs-13075",
10502 + "dead inode read from disk %K. "
10503 + "This is likely to be race with knfsd. Ignore",
10504 + &key);
10505 + reiserfs_make_bad_inode(inode);
10508 + /* init_inode should be releasing the path */
10509 + reiserfs_check_path(&path_to_sd);
10511 + /*
10512 + * Stat data v1 doesn't support ACLs.
10513 + */
10514 + if (get_inode_sd_version(inode) == STAT_DATA_V1)
10515 + cache_no_acl(inode);
10519 + * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
10521 + * @inode: inode from hash table to check
10522 + * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
10524 + * This function is called by iget5_locked() to distinguish reiserfs inodes
10525 + * having the same inode numbers. Such inodes can only exist due to some
10526 + * error condition. One of them should be bad. Inodes with identical
10527 + * inode numbers (objectids) are distinguished by parent directory ids.
10529 + */
10530 +int reiserfs_find_actor(struct inode *inode, void *opaque)
10532 + struct reiserfs_iget_args *args;
10534 + args = opaque;
10535 + /* args is already in CPU order */
10536 + return (inode->i_ino == args->objectid) &&
10537 + (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
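Since the inode hash is keyed by objectid alone, the extra dirid comparison is what tells duplicate objectids apart. A standalone sketch of that rule, with toy stand-ins for the inode and the iget5_locked() cookie:

/* Stand-alone sketch of the disambiguation rule in
 * reiserfs_find_actor(); toy structs stand in for the kernel's. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_inode { uint32_t objectid, dirid; };
struct toy_args  { uint32_t objectid, dirid; };

static bool toy_find_actor(const struct toy_inode *inode,
                           const struct toy_args *args)
{
        return inode->objectid == args->objectid &&
               inode->dirid == args->dirid;
}

int main(void)
{
        struct toy_inode a = { .objectid = 7, .dirid = 2 };
        struct toy_inode b = { .objectid = 7, .dirid = 9 };     /* stale twin */
        struct toy_args want = { .objectid = 7, .dirid = 2 };

        printf("a matches: %d, b matches: %d\n",
               toy_find_actor(&a, &want), toy_find_actor(&b, &want));
        return 0;
}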
10540 +struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
10542 + struct inode *inode;
10543 + struct reiserfs_iget_args args;
10544 + int depth;
10546 + args.objectid = key->on_disk_key.k_objectid;
10547 + args.dirid = key->on_disk_key.k_dir_id;
10548 + depth = reiserfs_write_unlock_nested(s);
10549 + inode = iget5_locked(s, key->on_disk_key.k_objectid,
10550 + reiserfs_find_actor, reiserfs_init_locked_inode,
10551 + (void *)(&args));
10552 + reiserfs_write_lock_nested(s, depth);
10553 + if (!inode)
10554 + return ERR_PTR(-ENOMEM);
10556 + if (inode->i_state & I_NEW) {
10557 + reiserfs_read_locked_inode(inode, &args);
10558 + unlock_new_inode(inode);
10561 + if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
10562 + /* either due to i/o error or a stale NFS handle */
10563 + iput(inode);
10564 + inode = NULL;
10566 + return inode;
10569 +static struct dentry *reiserfs_get_dentry(struct super_block *sb,
10570 + u32 objectid, u32 dir_id, u32 generation)
10573 + struct cpu_key key;
10574 + struct inode *inode;
10576 + key.on_disk_key.k_objectid = objectid;
10577 + key.on_disk_key.k_dir_id = dir_id;
10578 + reiserfs_write_lock(sb);
10579 + inode = reiserfs_iget(sb, &key);
10580 + if (inode && !IS_ERR(inode) && generation != 0 &&
10581 + generation != inode->i_generation) {
10582 + iput(inode);
10583 + inode = NULL;
10585 + reiserfs_write_unlock(sb);
10587 + return d_obtain_alias(inode);
10590 +struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
10591 + int fh_len, int fh_type)
10593 + /*
10594 + * fhtype happens to reflect the number of u32s encoded.
10595 + * due to a bug in earlier code, fhtype might indicate there
10596 + * are more u32s than actually fit.
10597 + * so if fhtype seems to be more than len, reduce fhtype.
10598 + * Valid types are:
10599 + * 2 - objectid + dir_id - legacy support
10600 + * 3 - objectid + dir_id + generation
10601 + * 4 - objectid + dir_id + objectid and dirid of parent - legacy
10602 + * 5 - objectid + dir_id + generation + objectid and dirid of parent
10603 + * 6 - as above plus generation of directory
10604 + * 6 does not fit in NFSv2 handles
10605 + */
10606 + if (fh_type > fh_len) {
10607 + if (fh_type != 6 || fh_len != 5)
10608 + reiserfs_warning(sb, "reiserfs-13077",
10609 + "nfsd/reiserfs, fhtype=%d, len=%d - odd",
10610 + fh_type, fh_len);
10611 + fh_type = fh_len;
10613 + if (fh_len < 2)
10614 + return NULL;
10616 + return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
10617 + (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
10620 +struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
10621 + int fh_len, int fh_type)
10623 + if (fh_type > fh_len)
10624 + fh_type = fh_len;
10625 + if (fh_type < 4)
10626 + return NULL;
10628 + return reiserfs_get_dentry(sb,
10629 + (fh_type >= 5) ? fid->raw[3] : fid->raw[2],
10630 + (fh_type >= 5) ? fid->raw[4] : fid->raw[3],
10631 + (fh_type == 6) ? fid->raw[5] : 0);
10634 +int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
10635 + struct inode *parent)
10637 + int maxlen = *lenp;
10639 + if (parent && (maxlen < 5)) {
10640 + *lenp = 5;
10641 + return FILEID_INVALID;
10642 + } else if (maxlen < 3) {
10643 + *lenp = 3;
10644 + return FILEID_INVALID;
10647 + data[0] = inode->i_ino;
10648 + data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
10649 + data[2] = inode->i_generation;
10650 + *lenp = 3;
10651 + if (parent) {
10652 + data[3] = parent->i_ino;
10653 + data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
10654 + *lenp = 5;
10655 + if (maxlen >= 6) {
10656 + data[5] = parent->i_generation;
10657 + *lenp = 6;
10660 + return *lenp;
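Laid out flat, a type-6 handle is the inode triple followed by the parent triple, matching the type table in reiserfs_fh_to_dentry() above. A standalone sketch of that layout with made-up values:

/* Stand-alone sketch of the NFS file handle layout produced by
 * reiserfs_encode_fh(); all values are made up. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t fh[6];
        int len;

        fh[0] = 1234;   /* inode->i_ino */
        fh[1] = 56;     /* k_dir_id */
        fh[2] = 2;      /* i_generation */
        fh[3] = 99;     /* parent->i_ino */
        fh[4] = 7;      /* parent k_dir_id */
        fh[5] = 1;      /* parent generation */
        len = 6;        /* fh_type == 6, does not fit in NFSv2 handles */

        for (int i = 0; i < len; i++)
                printf("raw[%d] = %u\n", i, fh[i]);
        return 0;
}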
10664 + * looks for stat data, then copies fields to it, marks the buffer
10665 + * containing stat data as dirty
10666 + */
10668 + * reiserfs inodes are never really dirty, since the dirty inode call
10669 + * always logs them. This call allows the VFS inode marking routines
10670 + * to properly mark inodes for datasync and such, but only actually
10671 + * does something when called for a synchronous update.
10672 + */
10673 +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
10675 + struct reiserfs_transaction_handle th;
10676 + int jbegin_count = 1;
10678 + if (sb_rdonly(inode->i_sb))
10679 + return -EROFS;
10680 + /*
10681 + * memory pressure can sometimes initiate write_inode calls with
10682 + * sync == 1,
10683 + * these cases are just when the system needs ram, not when the
10684 + * inode needs to reach disk for safety, and they can safely be
10685 + * ignored because the altered inode has already been logged.
10686 + */
10687 + if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
10688 + reiserfs_write_lock(inode->i_sb);
10689 + if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
10690 + reiserfs_update_sd(&th, inode);
10691 + journal_end_sync(&th);
10693 + reiserfs_write_unlock(inode->i_sb);
10695 + return 0;
10699 + * stat data of new object is inserted already, this inserts the item
10700 + * containing "." and ".." entries
10701 + */
10702 +static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
10703 + struct inode *inode,
10704 + struct item_head *ih, struct treepath *path,
10705 + struct inode *dir)
10707 + struct super_block *sb = th->t_super;
10708 + char empty_dir[EMPTY_DIR_SIZE];
10709 + char *body = empty_dir;
10710 + struct cpu_key key;
10711 + int retval;
10713 + BUG_ON(!th->t_trans_id);
10715 + _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
10716 + le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
10717 + TYPE_DIRENTRY, 3 /*key length */ );
10719 + /*
10720 + * compose item head for new item. Directories consist of items of
10721 + * old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
10722 + * is done by reiserfs_new_inode
10723 + */
10724 + if (old_format_only(sb)) {
10725 + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
10726 + TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
10728 + make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
10729 + ih->ih_key.k_objectid,
10730 + INODE_PKEY(dir)->k_dir_id,
10731 + INODE_PKEY(dir)->k_objectid);
10732 + } else {
10733 + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
10734 + TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
10736 + make_empty_dir_item(body, ih->ih_key.k_dir_id,
10737 + ih->ih_key.k_objectid,
10738 + INODE_PKEY(dir)->k_dir_id,
10739 + INODE_PKEY(dir)->k_objectid);
10742 + /* look for place in the tree for new item */
10743 + retval = search_item(sb, &key, path);
10744 + if (retval == IO_ERROR) {
10745 + reiserfs_error(sb, "vs-13080",
10746 + "i/o failure occurred creating new directory");
10747 + return -EIO;
10749 + if (retval == ITEM_FOUND) {
10750 + pathrelse(path);
10751 + reiserfs_warning(sb, "vs-13070",
10752 + "object with this key exists (%k)",
10753 + &(ih->ih_key));
10754 + return -EEXIST;
10757 + /* insert item, that is empty directory item */
10758 + return reiserfs_insert_item(th, path, &key, ih, inode, body);
10762 + * stat data of object has been inserted, this inserts the item
10763 + * containing the body of symlink
10764 + */
10765 +static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th,
10766 + struct inode *inode,
10767 + struct item_head *ih,
10768 + struct treepath *path, const char *symname,
10769 + int item_len)
10771 + struct super_block *sb = th->t_super;
10772 + struct cpu_key key;
10773 + int retval;
10775 + BUG_ON(!th->t_trans_id);
10777 + _make_cpu_key(&key, KEY_FORMAT_3_5,
10778 + le32_to_cpu(ih->ih_key.k_dir_id),
10779 + le32_to_cpu(ih->ih_key.k_objectid),
10780 + 1, TYPE_DIRECT, 3 /*key length */ );
10782 + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
10783 + 0 /*free_space */ );
10785 + /* look for place in the tree for new item */
10786 + retval = search_item(sb, &key, path);
10787 + if (retval == IO_ERROR) {
10788 + reiserfs_error(sb, "vs-13080",
10789 + "i/o failure occurred creating new symlink");
10790 + return -EIO;
10792 + if (retval == ITEM_FOUND) {
10793 + pathrelse(path);
10794 + reiserfs_warning(sb, "vs-13080",
10795 + "object with this key exists (%k)",
10796 + &(ih->ih_key));
10797 + return -EEXIST;
10800 + /* insert item, that is body of symlink */
10801 + return reiserfs_insert_item(th, path, &key, ih, inode, symname);
10805 + * inserts the stat data into the tree, and then calls
10806 + * reiserfs_new_directory (to insert ".", ".." item if new object is
10807 + * directory) or reiserfs_new_symlink (to insert symlink body if new
10808 + * object is symlink) or nothing (if new object is regular file)
10810 + * NOTE! uid and gid must already be set in the inode. If we return
10811 + * non-zero due to an error, we have to drop the quota previously allocated
10812 + * for the fresh inode. This can only be done outside a transaction, so
10813 + * if we return non-zero, we also end the transaction.
10815 + * @th: active transaction handle
10816 + * @dir: parent directory for new inode
10817 + * @mode: mode of new inode
10818 + * @symname: symlink contents if inode is symlink
10819 + * @i_size: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname)
10820 + * for symlinks
10821 + * @inode: inode to be filled
10822 + * @security: optional security context to associate with this inode
10823 + */
10824 +int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
10825 + struct inode *dir, umode_t mode, const char *symname,
10826 + /* 0 for regular, EMPTY_DIR_SIZE for dirs,
10827 + strlen(symname) for symlinks */
10828 + loff_t i_size, struct dentry *dentry,
10829 + struct inode *inode,
10830 + struct reiserfs_security_handle *security)
10832 + struct super_block *sb = dir->i_sb;
10833 + struct reiserfs_iget_args args;
10834 + INITIALIZE_PATH(path_to_key);
10835 + struct cpu_key key;
10836 + struct item_head ih;
10837 + struct stat_data sd;
10838 + int retval;
10839 + int err;
10840 + int depth;
10842 + BUG_ON(!th->t_trans_id);
10844 + depth = reiserfs_write_unlock_nested(sb);
10845 + err = dquot_alloc_inode(inode);
10846 + reiserfs_write_lock_nested(sb, depth);
10847 + if (err)
10848 + goto out_end_trans;
10849 + if (!dir->i_nlink) {
10850 + err = -EPERM;
10851 + goto out_bad_inode;
10854 + /* item head of new item */
10855 + ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
10856 + ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
10857 + if (!ih.ih_key.k_objectid) {
10858 + err = -ENOMEM;
10859 + goto out_bad_inode;
10861 + args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
10862 + if (old_format_only(sb))
10863 + make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
10864 + TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
10865 + else
10866 + make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
10867 + TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
10868 + memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE);
10869 + args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
10871 + depth = reiserfs_write_unlock_nested(inode->i_sb);
10872 + err = insert_inode_locked4(inode, args.objectid,
10873 + reiserfs_find_actor, &args);
10874 + reiserfs_write_lock_nested(inode->i_sb, depth);
10875 + if (err) {
10876 + err = -EINVAL;
10877 + goto out_bad_inode;
10880 + if (old_format_only(sb))
10881 + /*
10882 + * not a perfect generation count, as object ids can be reused,
10883 + * but this is as good as reiserfs can do right now.
10884 + * note that the private part of inode isn't filled in yet,
10885 + * we have to use the directory.
10886 + */
10887 + inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
10888 + else
10889 +#if defined( USE_INODE_GENERATION_COUNTER )
10890 + inode->i_generation =
10891 + le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
10892 +#else
10893 + inode->i_generation = ++event;
10894 +#endif
10896 + /* fill stat data */
10897 + set_nlink(inode, (S_ISDIR(mode) ? 2 : 1));
10899 + /* uid and gid must already be set by the caller for quota init */
10901 + simple_inode_init_ts(inode);
10902 + inode->i_size = i_size;
10903 + inode->i_blocks = 0;
10904 + inode->i_bytes = 0;
10905 + REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
10906 + U32_MAX /* NO_BYTES_IN_DIRECT_ITEM */;
10908 + INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
10909 + REISERFS_I(inode)->i_flags = 0;
10910 + REISERFS_I(inode)->i_prealloc_block = 0;
10911 + REISERFS_I(inode)->i_prealloc_count = 0;
10912 + REISERFS_I(inode)->i_trans_id = 0;
10913 + REISERFS_I(inode)->i_jl = NULL;
10914 + REISERFS_I(inode)->i_attrs =
10915 + REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
10916 + sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
10917 + reiserfs_init_xattr_rwsem(inode);
10919 + /* key to search for correct place for new stat data */
10920 + _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
10921 + le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
10922 + TYPE_STAT_DATA, 3 /*key length */ );
10924 + /* find proper place for inserting of stat data */
10925 + retval = search_item(sb, &key, &path_to_key);
10926 + if (retval == IO_ERROR) {
10927 + err = -EIO;
10928 + goto out_bad_inode;
10930 + if (retval == ITEM_FOUND) {
10931 + pathrelse(&path_to_key);
10932 + err = -EEXIST;
10933 + goto out_bad_inode;
10935 + if (old_format_only(sb)) {
10936 + /* i_uid or i_gid is too big to be stored in stat data v3.5 */
10937 + if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) {
10938 + pathrelse(&path_to_key);
10939 + err = -EINVAL;
10940 + goto out_bad_inode;
10942 + inode2sd_v1(&sd, inode, inode->i_size);
10943 + } else {
10944 + inode2sd(&sd, inode, inode->i_size);
10946 + /*
10947 + * store in the in-core inode the stat data key and the version
10948 + * all object items will have (directory items will have the old
10949 + * offset format, other new objects will consist of new items)
10950 + */
10951 + if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
10952 + set_inode_item_key_version(inode, KEY_FORMAT_3_5);
10953 + else
10954 + set_inode_item_key_version(inode, KEY_FORMAT_3_6);
10955 + if (old_format_only(sb))
10956 + set_inode_sd_version(inode, STAT_DATA_V1);
10957 + else
10958 + set_inode_sd_version(inode, STAT_DATA_V2);
10960 + /* insert the stat data into the tree */
10961 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
10962 + if (REISERFS_I(dir)->new_packing_locality)
10963 + th->displace_new_blocks = 1;
10964 +#endif
10965 + retval =
10966 + reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
10967 + (char *)(&sd));
10968 + if (retval) {
10969 + err = retval;
10970 + reiserfs_check_path(&path_to_key);
10971 + goto out_bad_inode;
10973 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
10974 + if (!th->displace_new_blocks)
10975 + REISERFS_I(dir)->new_packing_locality = 0;
10976 +#endif
10977 + if (S_ISDIR(mode)) {
10978 + /* insert item with "." and ".." */
10979 + retval =
10980 + reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
10983 + if (S_ISLNK(mode)) {
10984 + /* insert body of symlink */
10985 + if (!old_format_only(sb))
10986 + i_size = ROUND_UP(i_size);
10987 + retval =
10988 + reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
10989 + i_size);
10991 + if (retval) {
10992 + err = retval;
10993 + reiserfs_check_path(&path_to_key);
10994 + journal_end(th);
10995 + goto out_inserted_sd;
10998 + /*
10999 + * Mark it private if we're creating the privroot
11000 + * or something under it.
11001 + */
11002 + if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root)
11003 + reiserfs_init_priv_inode(inode);
11005 + if (reiserfs_posixacl(inode->i_sb)) {
11006 + reiserfs_write_unlock(inode->i_sb);
11007 + retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
11008 + reiserfs_write_lock(inode->i_sb);
11009 + if (retval) {
11010 + err = retval;
11011 + reiserfs_check_path(&path_to_key);
11012 + journal_end(th);
11013 + goto out_inserted_sd;
11015 + } else if (inode->i_sb->s_flags & SB_POSIXACL) {
11016 + reiserfs_warning(inode->i_sb, "jdm-13090",
11017 + "ACLs aren't enabled in the fs, "
11018 + "but vfs thinks they are!");
11021 + if (security->name) {
11022 + reiserfs_write_unlock(inode->i_sb);
11023 + retval = reiserfs_security_write(th, inode, security);
11024 + reiserfs_write_lock(inode->i_sb);
11025 + if (retval) {
11026 + err = retval;
11027 + reiserfs_check_path(&path_to_key);
11028 + retval = journal_end(th);
11029 + if (retval)
11030 + err = retval;
11031 + goto out_inserted_sd;
11035 + reiserfs_update_sd(th, inode);
11036 + reiserfs_check_path(&path_to_key);
11038 + return 0;
11040 +out_bad_inode:
11041 + /* Invalidate the object, nothing was inserted yet */
11042 + INODE_PKEY(inode)->k_objectid = 0;
11044 + /* Quota change must be inside a transaction for journaling */
11045 + depth = reiserfs_write_unlock_nested(inode->i_sb);
11046 + dquot_free_inode(inode);
11047 + reiserfs_write_lock_nested(inode->i_sb, depth);
11049 +out_end_trans:
11050 + journal_end(th);
11051 + /*
11052 + * dquot_drop() needs more credits and can run outside a
11053 + * transaction, so it is better to call it outside
11054 + */
11055 + depth = reiserfs_write_unlock_nested(inode->i_sb);
11056 + dquot_drop(inode);
11057 + reiserfs_write_lock_nested(inode->i_sb, depth);
11058 + inode->i_flags |= S_NOQUOTA;
11059 + make_bad_inode(inode);
11061 +out_inserted_sd:
11062 + clear_nlink(inode);
11063 + th->t_trans_id = 0; /* so the caller can't use this handle later */
11064 + if (inode->i_state & I_NEW)
11065 + unlock_new_inode(inode);
11066 + iput(inode);
11067 + return err;
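+/*
+ * Rough sketch (identifiers illustrative, error handling elided) of how
+ * a caller drives reiserfs_new_inode(); this mirrors the general shape
+ * of reiserfs_create() in namei.c:
+ *
+ *	struct reiserfs_transaction_handle th;
+ *	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+ *	retval = reiserfs_new_inode(&th, dir, mode, NULL, 0,
+ *				    dentry, inode, &security);
+ *	retval = reiserfs_add_entry(&th, dir, dentry->d_name.name,
+ *				    dentry->d_name.len, inode, 1);
+ *	d_instantiate_new(dentry, inode);
+ *	retval = journal_end(&th);
+ *
+ * Note that on a reiserfs_new_inode() failure the transaction has
+ * already been ended, per the NOTE above, so the caller must not end
+ * it again.
+ */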
11071 + * finds the tail page in the page cache,
11072 + * reads the last block in.
11074 + * On success, page_result is set to a locked, pinned page, and bh_result
11075 + * is set to an up-to-date buffer for the last block in the file. Returns 0.
11077 + * Tail conversion is not done, so bh_result might not be valid for writing;
11078 + * check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
11079 + * trying to write the block.
11081 + * On failure, nonzero is returned, and page_result and bh_result are untouched.
11082 + */
11083 +static int grab_tail_page(struct inode *inode,
11084 + struct page **page_result,
11085 + struct buffer_head **bh_result)
11088 + /*
11089 + * we want the page with the last byte in the file,
11090 + * not the page that will hold the next byte for appending
11091 + */
11092 + unsigned long index = (inode->i_size - 1) >> PAGE_SHIFT;
11093 + unsigned long pos = 0;
11094 + unsigned long start = 0;
11095 + unsigned long blocksize = inode->i_sb->s_blocksize;
11096 + unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1);
11097 + struct buffer_head *bh;
11098 + struct buffer_head *head;
11099 + struct folio *folio;
11100 + int error;
11102 + /*
11103 + * we know that we are only called with inode->i_size > 0.
11104 + * we also know that a file tail can never be as big as a block.
11105 + * If i_size % blocksize == 0, our file is currently block aligned
11106 + * and it won't need converting or zeroing after a truncate.
11107 + */
11108 + if ((offset & (blocksize - 1)) == 0) {
11109 + return -ENOENT;
11111 + folio = __filemap_get_folio(inode->i_mapping, index,
11112 + FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
11113 + mapping_gfp_mask(inode->i_mapping));
11114 + if (IS_ERR(folio))
11115 + return PTR_ERR(folio);
11116 + /* start within the page of the last block in the file */
11117 + start = (offset / blocksize) * blocksize;
11119 + error = __block_write_begin(folio, start, offset - start,
11120 + reiserfs_get_block_create_0);
11121 + if (error)
11122 + goto unlock;
11124 + head = folio_buffers(folio);
11125 + bh = head;
11126 + do {
11127 + if (pos >= start) {
11128 + break;
11130 + bh = bh->b_this_page;
11131 + pos += blocksize;
11132 + } while (bh != head);
11134 + if (!buffer_uptodate(bh)) {
11135 + /*
11136 + * note, this should never happen, prepare_write should be
11137 + * taking care of this for us. If the buffer isn't up to
11138 + * date, I've screwed up the code to find the buffer, or the
11139 + * code to call prepare_write
11140 + */
11141 + reiserfs_error(inode->i_sb, "clm-6000",
11142 + "error reading block %lu", bh->b_blocknr);
11143 + error = -EIO;
11144 + goto unlock;
11146 + *bh_result = bh;
11147 + *page_result = &folio->page;
11149 + return error;
11151 +unlock:
11152 + folio_unlock(folio);
11153 + folio_put(folio);
11154 + return error;
11158 + * vfs version of truncate file. Must NOT be called with
11159 + * a transaction already started.
11161 + * some code taken from block_truncate_page
11162 + */
11163 +int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
11165 + struct reiserfs_transaction_handle th;
11166 + /* we want the offset for the first byte after the end of the file */
11167 + unsigned long offset = inode->i_size & (PAGE_SIZE - 1);
11168 + unsigned blocksize = inode->i_sb->s_blocksize;
11169 + unsigned length;
11170 + struct page *page = NULL;
11171 + int error;
11172 + struct buffer_head *bh = NULL;
11173 + int err2;
11175 + reiserfs_write_lock(inode->i_sb);
11177 + if (inode->i_size > 0) {
11178 + error = grab_tail_page(inode, &page, &bh);
11179 + if (error) {
11180 + /*
11181 + * -ENOENT means we truncated past the end of the
11182 + * file, and get_block_create_0 could not find a
11183 + * block to read in, which is ok.
11184 + */
11185 + if (error != -ENOENT)
11186 + reiserfs_error(inode->i_sb, "clm-6001",
11187 + "grab_tail_page failed %d",
11188 + error);
11189 + page = NULL;
11190 + bh = NULL;
11194 + /*
11195 + * so, if page != NULL, we have a buffer head for the offset at
11196 + * the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
11197 + * then we have an unformatted node. Otherwise, we have a direct item,
11198 + * and no zeroing is required on disk. We zero after the truncate,
11199 + * because the truncate might pack the item anyway
11200 + * (it will unmap bh if it packs).
11202 + * it is enough to reserve space in the transaction for 2 balancings:
11203 + * one for "save" link adding and another for the first
11204 + * cut_from_item. 1 is for update_sd
11205 + */
11206 + error = journal_begin(&th, inode->i_sb,
11207 + JOURNAL_PER_BALANCE_CNT * 2 + 1);
11208 + if (error)
11209 + goto out;
11210 + reiserfs_update_inode_transaction(inode);
11211 + if (update_timestamps)
11212 + /*
11213 + * we are doing real truncate: if the system crashes
11214 + * before the last transaction of truncating gets committed
11215 + * - on reboot the file either appears truncated properly
11216 + * or not truncated at all
11217 + */
11218 + add_save_link(&th, inode, 1);
11219 + err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
11220 + error = journal_end(&th);
11221 + if (error)
11222 + goto out;
11224 + /* check reiserfs_do_truncate after ending the transaction */
11225 + if (err2) {
11226 + error = err2;
11227 + goto out;
11230 + if (update_timestamps) {
11231 + error = remove_save_link(inode, 1 /* truncate */);
11232 + if (error)
11233 + goto out;
11236 + if (page) {
11237 + length = offset & (blocksize - 1);
11238 + /* if we are not on a block boundary */
11239 + if (length) {
11240 + length = blocksize - length;
11241 + zero_user(page, offset, length);
11242 + if (buffer_mapped(bh) && bh->b_blocknr != 0) {
11243 + mark_buffer_dirty(bh);
11246 + unlock_page(page);
11247 + put_page(page);
11250 + reiserfs_write_unlock(inode->i_sb);
11252 + return 0;
11253 +out:
11254 + if (page) {
11255 + unlock_page(page);
11256 + put_page(page);
11259 + reiserfs_write_unlock(inode->i_sb);
11261 + return error;
11264 +static int map_block_for_writepage(struct inode *inode,
11265 + struct buffer_head *bh_result,
11266 + unsigned long block)
11268 + struct reiserfs_transaction_handle th;
11269 + int fs_gen;
11270 + struct item_head tmp_ih;
11271 + struct item_head *ih;
11272 + struct buffer_head *bh;
11273 + __le32 *item;
11274 + struct cpu_key key;
11275 + INITIALIZE_PATH(path);
11276 + int pos_in_item;
11277 + int jbegin_count = JOURNAL_PER_BALANCE_CNT;
11278 + loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1;
11279 + int retval;
11280 + int use_get_block = 0;
11281 + int bytes_copied = 0;
11282 + int copy_size;
11283 + int trans_running = 0;
11285 + /*
11286 + * catch places below that try to log something without
11287 + * starting a trans
11288 + */
11289 + th.t_trans_id = 0;
11291 + if (!buffer_uptodate(bh_result)) {
11292 + return -EIO;
11295 + kmap(bh_result->b_page);
11296 +start_over:
11297 + reiserfs_write_lock(inode->i_sb);
11298 + make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
11300 +research:
11301 + retval = search_for_position_by_key(inode->i_sb, &key, &path);
11302 + if (retval != POSITION_FOUND) {
11303 + use_get_block = 1;
11304 + goto out;
11307 + bh = get_last_bh(&path);
11308 + ih = tp_item_head(&path);
11309 + item = tp_item_body(&path);
11310 + pos_in_item = path.pos_in_item;
11312 + /* we've found an unformatted node */
11313 + if (indirect_item_found(retval, ih)) {
11314 + if (bytes_copied > 0) {
11315 + reiserfs_warning(inode->i_sb, "clm-6002",
11316 + "bytes_copied %d", bytes_copied);
11318 + if (!get_block_num(item, pos_in_item)) {
11319 + /* crap, we are writing to a hole */
11320 + use_get_block = 1;
11321 + goto out;
11323 + set_block_dev_mapped(bh_result,
11324 + get_block_num(item, pos_in_item), inode);
11325 + } else if (is_direct_le_ih(ih)) {
11326 + char *p;
11327 + p = page_address(bh_result->b_page);
11328 + p += (byte_offset - 1) & (PAGE_SIZE - 1);
11329 + copy_size = ih_item_len(ih) - pos_in_item;
11331 + fs_gen = get_generation(inode->i_sb);
11332 + copy_item_head(&tmp_ih, ih);
11334 + if (!trans_running) {
11335 + /* vs-3050 is gone, no need to drop the path */
11336 + retval = journal_begin(&th, inode->i_sb, jbegin_count);
11337 + if (retval)
11338 + goto out;
11339 + reiserfs_update_inode_transaction(inode);
11340 + trans_running = 1;
11341 + if (fs_changed(fs_gen, inode->i_sb)
11342 + && item_moved(&tmp_ih, &path)) {
11343 + reiserfs_restore_prepared_buffer(inode->i_sb,
11344 + bh);
11345 + goto research;
11349 + reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
11351 + if (fs_changed(fs_gen, inode->i_sb)
11352 + && item_moved(&tmp_ih, &path)) {
11353 + reiserfs_restore_prepared_buffer(inode->i_sb, bh);
11354 + goto research;
11357 + memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied,
11358 + copy_size);
11360 + journal_mark_dirty(&th, bh);
11361 + bytes_copied += copy_size;
11362 + set_block_dev_mapped(bh_result, 0, inode);
11364 + /* are there still bytes left? */
11365 + if (bytes_copied < bh_result->b_size &&
11366 + (byte_offset + bytes_copied) < inode->i_size) {
11367 + set_cpu_key_k_offset(&key,
11368 + cpu_key_k_offset(&key) +
11369 + copy_size);
11370 + goto research;
11372 + } else {
11373 + reiserfs_warning(inode->i_sb, "clm-6003",
11374 + "bad item inode %lu", inode->i_ino);
11375 + retval = -EIO;
11376 + goto out;
11378 + retval = 0;
11380 +out:
11381 + pathrelse(&path);
11382 + if (trans_running) {
11383 + int err = journal_end(&th);
11384 + if (err)
11385 + retval = err;
11386 + trans_running = 0;
11388 + reiserfs_write_unlock(inode->i_sb);
11390 + /* this is where we fill in holes in the file. */
11391 + if (use_get_block) {
11392 + retval = reiserfs_get_block(inode, block, bh_result,
11393 + GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
11394 + | GET_BLOCK_NO_DANGLE);
11395 + if (!retval) {
11396 + if (!buffer_mapped(bh_result)
11397 + || bh_result->b_blocknr == 0) {
11398 + /* get_block failed to find a mapped unformatted node. */
11399 + use_get_block = 0;
11400 + goto start_over;
11404 + kunmap(bh_result->b_page);
11406 + if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
11407 + /*
11408 + * we've copied data from the page into the direct item, so the
11409 + * buffer in the page is now clean, mark it to reflect that.
11410 + */
11411 + lock_buffer(bh_result);
11412 + clear_buffer_dirty(bh_result);
11413 + unlock_buffer(bh_result);
11415 + return retval;
11419 + * mason@suse.com: updated in 2.5.54 to follow the same general io
11420 + * start/recovery path as __block_write_full_folio, along with special
11421 + * code to handle reiserfs tails.
11422 + */
11423 +static int reiserfs_write_folio(struct folio *folio,
11424 + struct writeback_control *wbc, void *data)
11426 + struct inode *inode = folio->mapping->host;
11427 + unsigned long end_index = inode->i_size >> PAGE_SHIFT;
11428 + int error = 0;
11429 + unsigned long block;
11430 + sector_t last_block;
11431 + struct buffer_head *head, *bh;
11432 + int partial = 0;
11433 + int nr = 0;
11434 + int checked = folio_test_checked(folio);
11435 + struct reiserfs_transaction_handle th;
11436 + struct super_block *s = inode->i_sb;
11437 + int bh_per_page = PAGE_SIZE / s->s_blocksize;
11438 + th.t_trans_id = 0;
11440 + /* no logging allowed when nonblocking or from PF_MEMALLOC */
11441 + if (checked && (current->flags & PF_MEMALLOC)) {
11442 + folio_redirty_for_writepage(wbc, folio);
11443 + folio_unlock(folio);
11444 + return 0;
11447 + /*
11448 + * The folio dirty bit is cleared before writepage is called, which
11449 + * means we have to tell create_empty_buffers to make dirty buffers.
11450 + * The folio really should be up to date at this point, so tossing
11451 + * in the BH_Uptodate is just a sanity check.
11452 + */
11453 + head = folio_buffers(folio);
11454 + if (!head)
11455 + head = create_empty_buffers(folio, s->s_blocksize,
11456 + (1 << BH_Dirty) | (1 << BH_Uptodate));
11458 + /*
11459 + * last folio in the file, zero out any contents past the
11460 + * last byte in the file
11461 + */
11462 + if (folio->index >= end_index) {
11463 + unsigned last_offset;
11465 + last_offset = inode->i_size & (PAGE_SIZE - 1);
11466 + /* no file contents in this folio */
11467 + if (folio->index >= end_index + 1 || !last_offset) {
11468 + folio_unlock(folio);
11469 + return 0;
11471 + folio_zero_segment(folio, last_offset, folio_size(folio));
11473 + bh = head;
11474 + block = folio->index << (PAGE_SHIFT - s->s_blocksize_bits);
11475 + last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
11476 + /* first map all the buffers, logging any direct items we find */
11477 + do {
11478 + if (block > last_block) {
11479 + /*
11480 + * This can happen when the block size is less than
11481 + * the folio size. The corresponding bytes in the folio
11482 + * were zero filled above
11483 + */
11484 + clear_buffer_dirty(bh);
11485 + set_buffer_uptodate(bh);
11486 + } else if ((checked || buffer_dirty(bh)) &&
11487 + (!buffer_mapped(bh) || bh->b_blocknr == 0)) {
11488 + /*
11489 + * not mapped yet, or it points to a direct item, search
11490 + * the btree for the mapping info, and log any direct
11491 + * items found
11492 + */
11493 + if ((error = map_block_for_writepage(inode, bh, block))) {
11494 + goto fail;
11497 + bh = bh->b_this_page;
11498 + block++;
11499 + } while (bh != head);
11501 + /*
11502 + * we start the transaction after map_block_for_writepage,
11503 + * because it can create holes in the file (an unbounded operation).
11504 + * starting it here, we can make a reliable estimate for how many
11505 + * blocks we're going to log
11506 + */
11507 + if (checked) {
11508 + folio_clear_checked(folio);
11509 + reiserfs_write_lock(s);
11510 + error = journal_begin(&th, s, bh_per_page + 1);
11511 + if (error) {
11512 + reiserfs_write_unlock(s);
11513 + goto fail;
11515 + reiserfs_update_inode_transaction(inode);
11517 + /* now go through and lock any dirty buffers on the folio */
11518 + do {
11519 + get_bh(bh);
11520 + if (!buffer_mapped(bh))
11521 + continue;
11522 + if (buffer_mapped(bh) && bh->b_blocknr == 0)
11523 + continue;
11525 + if (checked) {
11526 + reiserfs_prepare_for_journal(s, bh, 1);
11527 + journal_mark_dirty(&th, bh);
11528 + continue;
11530 + /*
11531 + * from this point on, we know the buffer is mapped to a
11532 + * real block and not a direct item
11533 + */
11534 + if (wbc->sync_mode != WB_SYNC_NONE) {
11535 + lock_buffer(bh);
11536 + } else {
11537 + if (!trylock_buffer(bh)) {
11538 + folio_redirty_for_writepage(wbc, folio);
11539 + continue;
11542 + if (test_clear_buffer_dirty(bh)) {
11543 + mark_buffer_async_write(bh);
11544 + } else {
11545 + unlock_buffer(bh);
11547 + } while ((bh = bh->b_this_page) != head);
11549 + if (checked) {
11550 + error = journal_end(&th);
11551 + reiserfs_write_unlock(s);
11552 + if (error)
11553 + goto fail;
11555 + BUG_ON(folio_test_writeback(folio));
11556 + folio_start_writeback(folio);
11557 + folio_unlock(folio);
11559 + /*
11560 + * since any buffer might be the only dirty buffer on the folio,
11561 + * the first submit_bh can bring the folio out of writeback.
11562 + * be careful with the buffers.
11563 + */
11564 + do {
11565 + struct buffer_head *next = bh->b_this_page;
11566 + if (buffer_async_write(bh)) {
11567 + submit_bh(REQ_OP_WRITE, bh);
11568 + nr++;
11570 + put_bh(bh);
11571 + bh = next;
11572 + } while (bh != head);
11574 + error = 0;
11575 +done:
11576 + if (nr == 0) {
11577 + /*
11578 + * if this folio only had a direct item, it is very possible for
11579 + * no io to be required without there being an error. Or,
11580 + * someone else could have locked them and sent them down the
11581 + * pipe without locking the folio
11582 + */
11583 + bh = head;
11584 + do {
11585 + if (!buffer_uptodate(bh)) {
11586 + partial = 1;
11587 + break;
11589 + bh = bh->b_this_page;
11590 + } while (bh != head);
11591 + if (!partial)
11592 + folio_mark_uptodate(folio);
11593 + folio_end_writeback(folio);
11595 + return error;
11597 +fail:
11598 + /*
11599 + * catches various errors, we need to make sure any valid dirty blocks
11600 + * get to the media. The folio is currently locked and not marked for
11601 + * writeback
11602 + */
11603 + folio_clear_uptodate(folio);
11604 + bh = head;
11605 + do {
11606 + get_bh(bh);
11607 + if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
11608 + lock_buffer(bh);
11609 + mark_buffer_async_write(bh);
11610 + } else {
11611 + /*
11612 + * clear any dirty bits that might have come from
11613 + * getting attached to a dirty folio
11614 + */
11615 + clear_buffer_dirty(bh);
11617 + bh = bh->b_this_page;
11618 + } while (bh != head);
11619 + BUG_ON(folio_test_writeback(folio));
11620 + folio_start_writeback(folio);
11621 + folio_unlock(folio);
11622 + do {
11623 + struct buffer_head *next = bh->b_this_page;
11624 + if (buffer_async_write(bh)) {
11625 + clear_buffer_dirty(bh);
11626 + submit_bh(REQ_OP_WRITE, bh);
11627 + nr++;
11629 + put_bh(bh);
11630 + bh = next;
11631 + } while (bh != head);
11632 + goto done;
11635 +static int reiserfs_read_folio(struct file *f, struct folio *folio)
11637 + return block_read_full_folio(folio, reiserfs_get_block);
11640 +static int reiserfs_writepages(struct address_space *mapping,
11641 + struct writeback_control *wbc)
11643 + reiserfs_wait_on_write_block(mapping->host->i_sb);
11644 + return write_cache_pages(mapping, wbc, reiserfs_write_folio, NULL);
11647 +static void reiserfs_truncate_failed_write(struct inode *inode)
11649 + truncate_inode_pages(inode->i_mapping, inode->i_size);
11650 + reiserfs_truncate_file(inode, 0);
11653 +static int reiserfs_write_begin(struct file *file,
11654 + struct address_space *mapping,
11655 + loff_t pos, unsigned len,
11656 + struct folio **foliop, void **fsdata)
11658 + struct inode *inode;
11659 + struct folio *folio;
11660 + pgoff_t index;
11661 + int ret;
11662 + int old_ref = 0;
11664 + inode = mapping->host;
11665 + index = pos >> PAGE_SHIFT;
11666 + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
11667 + mapping_gfp_mask(mapping));
11668 + if (IS_ERR(folio))
11669 + return PTR_ERR(folio);
11670 + *foliop = folio;
11672 + reiserfs_wait_on_write_block(inode->i_sb);
11673 + fix_tail_page_for_writing(&folio->page);
11674 + if (reiserfs_transaction_running(inode->i_sb)) {
11675 + struct reiserfs_transaction_handle *th;
11676 + th = (struct reiserfs_transaction_handle *)current->
11677 + journal_info;
11678 + BUG_ON(!th->t_refcount);
11679 + BUG_ON(!th->t_trans_id);
11680 + old_ref = th->t_refcount;
11681 + th->t_refcount++;
11683 + ret = __block_write_begin(folio, pos, len, reiserfs_get_block);
11684 + if (ret && reiserfs_transaction_running(inode->i_sb)) {
11685 + struct reiserfs_transaction_handle *th = current->journal_info;
11686 + /*
11687 + * this gets a little ugly. If reiserfs_get_block returned an
11688 + * error and left a transaction running, we've got to close
11689 + * it, and we've got to free the handle if it was a persistent
11690 + * transaction.
11692 + * But, if we had nested into an existing transaction, we need
11693 + * to just drop the ref count on the handle.
11695 + * If old_ref == 0, the transaction is from reiserfs_get_block,
11696 + * and it was a persistent trans. Otherwise, it was nested
11697 + * above.
11698 + */
11699 + if (th->t_refcount > old_ref) {
11700 + if (old_ref)
11701 + th->t_refcount--;
11702 + else {
11703 + int err;
11704 + reiserfs_write_lock(inode->i_sb);
11705 + err = reiserfs_end_persistent_transaction(th);
11706 + reiserfs_write_unlock(inode->i_sb);
11707 + if (err)
11708 + ret = err;
11712 + if (ret) {
11713 + folio_unlock(folio);
11714 + folio_put(folio);
11715 + /* Truncate allocated blocks */
11716 + reiserfs_truncate_failed_write(inode);
11718 + return ret;
11721 +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
11723 + struct inode *inode = page->mapping->host;
11724 + int ret;
11725 + int old_ref = 0;
11726 + int depth;
11728 + depth = reiserfs_write_unlock_nested(inode->i_sb);
11729 + reiserfs_wait_on_write_block(inode->i_sb);
11730 + reiserfs_write_lock_nested(inode->i_sb, depth);
11732 + fix_tail_page_for_writing(page);
11733 + if (reiserfs_transaction_running(inode->i_sb)) {
11734 + struct reiserfs_transaction_handle *th;
11735 + th = (struct reiserfs_transaction_handle *)current->
11736 + journal_info;
11737 + BUG_ON(!th->t_refcount);
11738 + BUG_ON(!th->t_trans_id);
11739 + old_ref = th->t_refcount;
11740 + th->t_refcount++;
11743 + ret = __block_write_begin(page_folio(page), from, len, reiserfs_get_block);
11744 + if (ret && reiserfs_transaction_running(inode->i_sb)) {
11745 + struct reiserfs_transaction_handle *th = current->journal_info;
11746 + /*
11747 + * this gets a little ugly. If reiserfs_get_block returned an
11748 + * error and left a transaction running, we've got to close
11749 + * it, and we've got to free the handle if it was a persistent
11750 + * transaction.
11752 + * But, if we had nested into an existing transaction, we need
11753 + * to just drop the ref count on the handle.
11755 + * If old_ref == 0, the transaction is from reiserfs_get_block,
11756 + * and it was a persistent trans. Otherwise, it was nested
11757 + * above.
11758 + */
11759 + if (th->t_refcount > old_ref) {
11760 + if (old_ref)
11761 + th->t_refcount--;
11762 + else {
11763 + int err;
11764 + reiserfs_write_lock(inode->i_sb);
11765 + err = reiserfs_end_persistent_transaction(th);
11766 + reiserfs_write_unlock(inode->i_sb);
11767 + if (err)
11768 + ret = err;
11772 + return ret;
11776 +static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
11778 + return generic_block_bmap(as, block, reiserfs_bmap);
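+/*
+ * Illustrative only: ->bmap is the hook behind the legacy FIBMAP ioctl.
+ * A privileged userspace query for the file's first logical block might
+ * look roughly like this (fd is hypothetical):
+ *
+ *	int blk = 0;
+ *	if (ioctl(fd, FIBMAP, &blk) == 0)
+ *		printf("physical block %d\n", blk);
+ *
+ * A result of 0 means no mapped block - a hole, or a tail packed into a
+ * direct item, which reiserfs_bmap deliberately does not map.
+ */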
11781 +static int reiserfs_write_end(struct file *file, struct address_space *mapping,
11782 + loff_t pos, unsigned len, unsigned copied,
11783 + struct folio *folio, void *fsdata)
11785 + struct inode *inode = folio->mapping->host;
11786 + int ret = 0;
11787 + int update_sd = 0;
11788 + struct reiserfs_transaction_handle *th;
11789 + unsigned start;
11790 + bool locked = false;
11792 + reiserfs_wait_on_write_block(inode->i_sb);
11793 + if (reiserfs_transaction_running(inode->i_sb))
11794 + th = current->journal_info;
11795 + else
11796 + th = NULL;
11798 + start = pos & (PAGE_SIZE - 1);
11799 + if (unlikely(copied < len)) {
11800 + if (!folio_test_uptodate(folio))
11801 + copied = 0;
11803 + folio_zero_new_buffers(folio, start + copied, start + len);
11805 + flush_dcache_folio(folio);
11807 + reiserfs_commit_page(inode, &folio->page, start, start + copied);
11809 + /*
11810 + * generic_commit_write does this for us, but does not update the
11811 + * transaction tracking stuff when the size changes. So, we have
11812 + * to do the i_size updates here.
11813 + */
11814 + if (pos + copied > inode->i_size) {
11815 + struct reiserfs_transaction_handle myth;
11816 + reiserfs_write_lock(inode->i_sb);
11817 + locked = true;
11818 + /*
11819 + * If the file has grown beyond the boundary where it
11820 + * can have a tail, unmark it as needing tail
11821 + * packing
11822 + */
11823 + if ((have_large_tails(inode->i_sb)
11824 + && inode->i_size > i_block_size(inode) * 4)
11825 + || (have_small_tails(inode->i_sb)
11826 + && inode->i_size > i_block_size(inode)))
11827 + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
11829 + ret = journal_begin(&myth, inode->i_sb, 1);
11830 + if (ret)
11831 + goto journal_error;
11833 + reiserfs_update_inode_transaction(inode);
11834 + inode->i_size = pos + copied;
11835 + /*
11836 + * this will just nest into our transaction. It's important
11837 + * to use mark_inode_dirty so the inode gets pushed around on
11838 + * the dirty lists, and so that O_SYNC works as expected
11839 + */
11840 + mark_inode_dirty(inode);
11841 + reiserfs_update_sd(&myth, inode);
11842 + update_sd = 1;
11843 + ret = journal_end(&myth);
11844 + if (ret)
11845 + goto journal_error;
11847 + if (th) {
11848 + if (!locked) {
11849 + reiserfs_write_lock(inode->i_sb);
11850 + locked = true;
11852 + if (!update_sd)
11853 + mark_inode_dirty(inode);
11854 + ret = reiserfs_end_persistent_transaction(th);
11855 + if (ret)
11856 + goto out;
11859 +out:
11860 + if (locked)
11861 + reiserfs_write_unlock(inode->i_sb);
11862 + folio_unlock(folio);
11863 + folio_put(folio);
11865 + if (pos + len > inode->i_size)
11866 + reiserfs_truncate_failed_write(inode);
11868 + return ret == 0 ? copied : ret;
11870 +journal_error:
11871 + reiserfs_write_unlock(inode->i_sb);
11872 + locked = false;
11873 + if (th) {
11874 + if (!update_sd)
11875 + reiserfs_update_sd(th, inode);
11876 + ret = reiserfs_end_persistent_transaction(th);
11878 + goto out;
11881 +int reiserfs_commit_write(struct file *f, struct page *page,
11882 + unsigned from, unsigned to)
11884 + struct inode *inode = page->mapping->host;
11885 + loff_t pos = ((loff_t) page->index << PAGE_SHIFT) + to;
11886 + int ret = 0;
11887 + int update_sd = 0;
11888 + struct reiserfs_transaction_handle *th = NULL;
11889 + int depth;
11891 + depth = reiserfs_write_unlock_nested(inode->i_sb);
11892 + reiserfs_wait_on_write_block(inode->i_sb);
11893 + reiserfs_write_lock_nested(inode->i_sb, depth);
11895 + if (reiserfs_transaction_running(inode->i_sb)) {
11896 + th = current->journal_info;
11898 + reiserfs_commit_page(inode, page, from, to);
11900 + /*
11901 + * generic_commit_write does this for us, but does not update the
11902 + * transaction tracking stuff when the size changes. So, we have
11903 + * to do the i_size updates here.
11904 + */
11905 + if (pos > inode->i_size) {
11906 + struct reiserfs_transaction_handle myth;
11907 + /*
11908 + * If the file has grown beyond the boundary where it
11909 + * can have a tail, unmark it as needing tail
11910 + * packing
11911 + */
11912 + if ((have_large_tails(inode->i_sb)
11913 + && inode->i_size > i_block_size(inode) * 4)
11914 + || (have_small_tails(inode->i_sb)
11915 + && inode->i_size > i_block_size(inode)))
11916 + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
11918 + ret = journal_begin(&myth, inode->i_sb, 1);
11919 + if (ret)
11920 + goto journal_error;
11922 + reiserfs_update_inode_transaction(inode);
11923 + inode->i_size = pos;
11924 + /*
11925 + * this will just nest into our transaction. It's important
11926 + * to use mark_inode_dirty so the inode gets pushed around
11927 + * on the dirty lists, and so that O_SYNC works as expected
11928 + */
11929 + mark_inode_dirty(inode);
11930 + reiserfs_update_sd(&myth, inode);
11931 + update_sd = 1;
11932 + ret = journal_end(&myth);
11933 + if (ret)
11934 + goto journal_error;
11936 + if (th) {
11937 + if (!update_sd)
11938 + mark_inode_dirty(inode);
11939 + ret = reiserfs_end_persistent_transaction(th);
11940 + if (ret)
11941 + goto out;
11944 +out:
11945 + return ret;
11947 +journal_error:
11948 + if (th) {
11949 + if (!update_sd)
11950 + reiserfs_update_sd(th, inode);
11951 + ret = reiserfs_end_persistent_transaction(th);
11954 + return ret;
11957 +void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
11959 + if (reiserfs_attrs(inode->i_sb)) {
11960 + if (sd_attrs & REISERFS_SYNC_FL)
11961 + inode->i_flags |= S_SYNC;
11962 + else
11963 + inode->i_flags &= ~S_SYNC;
11964 + if (sd_attrs & REISERFS_IMMUTABLE_FL)
11965 + inode->i_flags |= S_IMMUTABLE;
11966 + else
11967 + inode->i_flags &= ~S_IMMUTABLE;
11968 + if (sd_attrs & REISERFS_APPEND_FL)
11969 + inode->i_flags |= S_APPEND;
11970 + else
11971 + inode->i_flags &= ~S_APPEND;
11972 + if (sd_attrs & REISERFS_NOATIME_FL)
11973 + inode->i_flags |= S_NOATIME;
11974 + else
11975 + inode->i_flags &= ~S_NOATIME;
11976 + if (sd_attrs & REISERFS_NOTAIL_FL)
11977 + REISERFS_I(inode)->i_flags |= i_nopack_mask;
11978 + else
11979 + REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
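+/*
+ * For context (sketch, not from the original patch): these REISERFS_*_FL
+ * bits share values with the common FS_*_FL flags, so they arrive here
+ * from the usual chattr(1)/FS_IOC_SETFLAGS path via
+ * reiserfs_fileattr_set(), e.g.
+ *
+ *	chattr +i file	- REISERFS_IMMUTABLE_FL, mapped to S_IMMUTABLE
+ *	chattr +t file	- REISERFS_NOTAIL_FL, mapped to i_nopack_mask
+ */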
11984 + * decide if this buffer needs to stay around for data logging or ordered
11985 + * write purposes
11986 + */
11987 +static int invalidate_folio_can_drop(struct inode *inode, struct buffer_head *bh)
11989 + int ret = 1;
11990 + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
11992 + lock_buffer(bh);
11993 + spin_lock(&j->j_dirty_buffers_lock);
11994 + if (!buffer_mapped(bh)) {
11995 + goto free_jh;
11997 + /*
11998 + * the page is locked, and the only places that log a data buffer
11999 + * also lock the page.
12000 + */
12001 + if (reiserfs_file_data_log(inode)) {
12002 + /*
12003 + * very conservative, leave the buffer pinned if
12004 + * anyone might need it.
12005 + */
12006 + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
12007 + ret = 0;
12009 + } else if (buffer_dirty(bh)) {
12010 + struct reiserfs_journal_list *jl;
12011 + struct reiserfs_jh *jh = bh->b_private;
12013 + /*
12014 + * why is this safe?
12015 + * reiserfs_setattr updates i_size in the on-disk
12016 + * stat data before allowing vmtruncate to be called.
12018 + * If the buffer was put onto the ordered list for this
12019 + * transaction, we know for sure either this transaction
12020 + * or an older one already has updated i_size on disk,
12021 + * and this ordered data won't be referenced in the file
12022 + * if we crash.
12024 + * if the buffer was put onto the ordered list for an older
12025 + * transaction, we need to leave it around
12026 + */
12027 + if (jh && (jl = jh->jl)
12028 + && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
12029 + ret = 0;
12031 +free_jh:
12032 + if (ret && bh->b_private) {
12033 + reiserfs_free_jh(bh);
12035 + spin_unlock(&j->j_dirty_buffers_lock);
12036 + unlock_buffer(bh);
12037 + return ret;
12040 +/* clm -- taken from fs/buffer.c:block_invalidate_folio */
12041 +static void reiserfs_invalidate_folio(struct folio *folio, size_t offset,
12042 + size_t length)
12044 + struct buffer_head *head, *bh, *next;
12045 + struct inode *inode = folio->mapping->host;
12046 + unsigned int curr_off = 0;
12047 + unsigned int stop = offset + length;
12048 + int partial_page = (offset || length < folio_size(folio));
12049 + int ret = 1;
12051 + BUG_ON(!folio_test_locked(folio));
12053 + if (!partial_page)
12054 + folio_clear_checked(folio);
12056 + head = folio_buffers(folio);
12057 + if (!head)
12058 + goto out;
12060 + bh = head;
12061 + do {
12062 + unsigned int next_off = curr_off + bh->b_size;
12063 + next = bh->b_this_page;
12065 + if (next_off > stop)
12066 + goto out;
12068 + /*
12069 + * is this block fully invalidated?
12070 + */
12071 + if (offset <= curr_off) {
12072 + if (invalidate_folio_can_drop(inode, bh))
12073 + reiserfs_unmap_buffer(bh);
12074 + else
12075 + ret = 0;
12077 + curr_off = next_off;
12078 + bh = next;
12079 + } while (bh != head);
12081 + /*
12082 + * We release buffers only if the entire page is being invalidated.
12083 + * The get_block cached value has been unconditionally invalidated,
12084 + * so real IO is not possible anymore.
12085 + */
12086 + if (!partial_page && ret) {
12087 + ret = filemap_release_folio(folio, 0);
12088 + /* maybe should BUG_ON(!ret); - neilb */
12090 +out:
12091 + return;
12094 +static bool reiserfs_dirty_folio(struct address_space *mapping,
12095 + struct folio *folio)
12097 + if (reiserfs_file_data_log(mapping->host)) {
12098 + folio_set_checked(folio);
12099 + return filemap_dirty_folio(mapping, folio);
12101 + return block_dirty_folio(mapping, folio);
12105 + * Returns true if the folio's buffers were dropped. The folio is locked.
12107 + * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
12108 + * in the buffers at folio_buffers(folio).
12110 + * even in -o notail mode, we can't be sure an old mount without -o notail
12111 + * didn't create files with tails.
12112 + */
12113 +static bool reiserfs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
12115 + struct inode *inode = folio->mapping->host;
12116 + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
12117 + struct buffer_head *head;
12118 + struct buffer_head *bh;
12119 + bool ret = true;
12121 + WARN_ON(folio_test_checked(folio));
12122 + spin_lock(&j->j_dirty_buffers_lock);
12123 + head = folio_buffers(folio);
12124 + bh = head;
12125 + do {
12126 + if (bh->b_private) {
12127 + if (!buffer_dirty(bh) && !buffer_locked(bh)) {
12128 + reiserfs_free_jh(bh);
12129 + } else {
12130 + ret = false;
12131 + break;
12134 + bh = bh->b_this_page;
12135 + } while (bh != head);
12136 + if (ret)
12137 + ret = try_to_free_buffers(folio);
12138 + spin_unlock(&j->j_dirty_buffers_lock);
12139 + return ret;
12143 + * We thank Mingming Cao for helping us understand in great detail what
12144 + * to do in this section of the code.
12145 + */
12146 +static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
12148 + struct file *file = iocb->ki_filp;
12149 + struct inode *inode = file->f_mapping->host;
12150 + size_t count = iov_iter_count(iter);
12151 + ssize_t ret;
12153 + ret = blockdev_direct_IO(iocb, inode, iter,
12154 + reiserfs_get_blocks_direct_io);
12156 + /*
12157 + * In case of error, an extending write may have instantiated a few
12158 + * blocks outside i_size. Trim these off again.
12159 + */
12160 + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
12161 + loff_t isize = i_size_read(inode);
12162 + loff_t end = iocb->ki_pos + count;
12164 + if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
12165 + truncate_setsize(inode, isize);
12166 + reiserfs_vfs_truncate_file(inode);
12170 + return ret;
12173 +int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
12174 + struct iattr *attr)
12176 + struct inode *inode = d_inode(dentry);
12177 + unsigned int ia_valid;
12178 + int error;
12180 + error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
12181 + if (error)
12182 + return error;
12184 + /* must be turned off for recursive notify_change calls */
12185 + ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
12187 + if (is_quota_modification(&nop_mnt_idmap, inode, attr)) {
12188 + error = dquot_initialize(inode);
12189 + if (error)
12190 + return error;
12192 + reiserfs_write_lock(inode->i_sb);
12193 + if (attr->ia_valid & ATTR_SIZE) {
12194 + /*
12195 + * version 2 items will be caught by the s_maxbytes check
12196 + * done for us in vmtruncate
12197 + */
12198 + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
12199 + attr->ia_size > MAX_NON_LFS) {
12200 + reiserfs_write_unlock(inode->i_sb);
12201 + error = -EFBIG;
12202 + goto out;
12205 + inode_dio_wait(inode);
12207 + /* fill in hole pointers in the expanding truncate case. */
12208 + if (attr->ia_size > inode->i_size) {
12209 + loff_t pos = attr->ia_size;
12211 + if ((pos & (inode->i_sb->s_blocksize - 1)) == 0)
12212 + pos++;
12213 + error = generic_cont_expand_simple(inode, pos);
12214 + if (REISERFS_I(inode)->i_prealloc_count > 0) {
12215 + int err;
12216 + struct reiserfs_transaction_handle th;
12217 + /* we're changing at most 2 bitmaps, inode + super */
12218 + err = journal_begin(&th, inode->i_sb, 4);
12219 + if (!err) {
12220 + reiserfs_discard_prealloc(&th, inode);
12221 + err = journal_end(&th);
12223 + if (err)
12224 + error = err;
12226 + if (error) {
12227 + reiserfs_write_unlock(inode->i_sb);
12228 + goto out;
12230 + /*
12231 + * file size is changed, ctime and mtime are
12232 + * to be updated
12233 + */
12234 + attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
12237 + reiserfs_write_unlock(inode->i_sb);
12239 + if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) ||
12240 + ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) &&
12241 + (get_inode_sd_version(inode) == STAT_DATA_V1)) {
12242 + /* stat data of format v3.5 has 16 bit uid and gid */
12243 + error = -EINVAL;
12244 + goto out;
12247 + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
12248 + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
12249 + struct reiserfs_transaction_handle th;
12250 + int jbegin_count =
12251 + 2 *
12252 + (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
12253 + REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
12254 + 2;
12256 + error = reiserfs_chown_xattrs(inode, attr);
12258 + if (error)
12259 + return error;
12261 + /*
12262 + * (user+group)*(old+new) structure - we count quota
12263 + * info and inode write (sb, inode)
12264 + */
12265 + reiserfs_write_lock(inode->i_sb);
12266 + error = journal_begin(&th, inode->i_sb, jbegin_count);
12267 + reiserfs_write_unlock(inode->i_sb);
12268 + if (error)
12269 + goto out;
12270 + error = dquot_transfer(&nop_mnt_idmap, inode, attr);
12271 + reiserfs_write_lock(inode->i_sb);
12272 + if (error) {
12273 + journal_end(&th);
12274 + reiserfs_write_unlock(inode->i_sb);
12275 + goto out;
12278 + /*
12279 + * Update corresponding info in inode so that everything
12280 + * is in one transaction
12281 + */
12282 + if (attr->ia_valid & ATTR_UID)
12283 + inode->i_uid = attr->ia_uid;
12284 + if (attr->ia_valid & ATTR_GID)
12285 + inode->i_gid = attr->ia_gid;
12286 + mark_inode_dirty(inode);
12287 + error = journal_end(&th);
12288 + reiserfs_write_unlock(inode->i_sb);
12289 + if (error)
12290 + goto out;
12293 + if ((attr->ia_valid & ATTR_SIZE) &&
12294 + attr->ia_size != i_size_read(inode)) {
12295 + error = inode_newsize_ok(inode, attr->ia_size);
12296 + if (!error) {
12297 + /*
12298 + * Could race against reiserfs_file_release
12299 + * if called from NFS, so take tailpack mutex.
12300 + */
12301 + mutex_lock(&REISERFS_I(inode)->tailpack);
12302 + truncate_setsize(inode, attr->ia_size);
12303 + reiserfs_truncate_file(inode, 1);
12304 + mutex_unlock(&REISERFS_I(inode)->tailpack);
12308 + if (!error) {
12309 + setattr_copy(&nop_mnt_idmap, inode, attr);
12310 + mark_inode_dirty(inode);
12313 + if (!error && reiserfs_posixacl(inode->i_sb)) {
12314 + if (attr->ia_valid & ATTR_MODE)
12315 + error = reiserfs_acl_chmod(dentry);
12318 +out:
12319 + return error;
12322 +const struct address_space_operations reiserfs_address_space_operations = {
12323 + .writepages = reiserfs_writepages,
12324 + .read_folio = reiserfs_read_folio,
12325 + .readahead = reiserfs_readahead,
12326 + .release_folio = reiserfs_release_folio,
12327 + .invalidate_folio = reiserfs_invalidate_folio,
12328 + .write_begin = reiserfs_write_begin,
12329 + .write_end = reiserfs_write_end,
12330 + .bmap = reiserfs_aop_bmap,
12331 + .direct_IO = reiserfs_direct_IO,
12332 + .dirty_folio = reiserfs_dirty_folio,
12333 + .migrate_folio = buffer_migrate_folio,
12335 diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
12336 new file mode 100644
12337 index 000000000000..dd33f8cc6eda
12338 --- /dev/null
12339 +++ b/fs/reiserfs/ioctl.c
12340 @@ -0,0 +1,221 @@
12342 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
12343 + */
12345 +#include <linux/capability.h>
12346 +#include <linux/fs.h>
12347 +#include <linux/mount.h>
12348 +#include "reiserfs.h"
12349 +#include <linux/time.h>
12350 +#include <linux/uaccess.h>
12351 +#include <linux/pagemap.h>
12352 +#include <linux/compat.h>
12353 +#include <linux/fileattr.h>
12355 +int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
12357 + struct inode *inode = d_inode(dentry);
12359 + if (!reiserfs_attrs(inode->i_sb))
12360 + return -ENOTTY;
12362 + fileattr_fill_flags(fa, REISERFS_I(inode)->i_attrs);
12364 + return 0;
12367 +int reiserfs_fileattr_set(struct mnt_idmap *idmap,
12368 + struct dentry *dentry, struct fileattr *fa)
12370 + struct inode *inode = d_inode(dentry);
12371 + unsigned int flags = fa->flags;
12372 + int err;
12374 + reiserfs_write_lock(inode->i_sb);
12376 + err = -ENOTTY;
12377 + if (!reiserfs_attrs(inode->i_sb))
12378 + goto unlock;
12380 + err = -EOPNOTSUPP;
12381 + if (fileattr_has_fsx(fa))
12382 + goto unlock;
12384 + /*
12385 + * Is it a quota file? Do not allow the user to mess with it
12386 + */
12387 + err = -EPERM;
12388 + if (IS_NOQUOTA(inode))
12389 + goto unlock;
12391 + if ((flags & REISERFS_NOTAIL_FL) && S_ISREG(inode->i_mode)) {
12392 + err = reiserfs_unpack(inode);
12393 + if (err)
12394 + goto unlock;
12396 + sd_attrs_to_i_attrs(flags, inode);
12397 + REISERFS_I(inode)->i_attrs = flags;
12398 + inode_set_ctime_current(inode);
12399 + mark_inode_dirty(inode);
12400 + err = 0;
12401 +unlock:
12402 + reiserfs_write_unlock(inode->i_sb);
12404 + return err;
12408 + * reiserfs_ioctl - handler for ioctl for inode
12409 + * supported commands:
12410 + * 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
12411 + * and prevent packing the file (argument arg has to
12412 + * be non-zero)
12413 + * 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
12414 + * 3) That's all for a while ...
12415 + */
12416 +long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
12418 + struct inode *inode = file_inode(filp);
12419 + int err = 0;
12421 + reiserfs_write_lock(inode->i_sb);
12423 + switch (cmd) {
12424 + case REISERFS_IOC_UNPACK:
12425 + if (S_ISREG(inode->i_mode)) {
12426 + if (arg)
12427 + err = reiserfs_unpack(inode);
12428 + } else
12429 + err = -ENOTTY;
12430 + break;
12431 + /*
12432 + * following two cases are taken from fs/ext2/ioctl.c by Remy
12433 + * Card (card@masi.ibp.fr)
12434 + */
12435 + case REISERFS_IOC_GETVERSION:
12436 + err = put_user(inode->i_generation, (int __user *)arg);
12437 + break;
12438 + case REISERFS_IOC_SETVERSION:
12439 + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) {
12440 + err = -EPERM;
12441 + break;
12443 + err = mnt_want_write_file(filp);
12444 + if (err)
12445 + break;
12446 + if (get_user(inode->i_generation, (int __user *)arg)) {
12447 + err = -EFAULT;
12448 + goto setversion_out;
12450 + inode_set_ctime_current(inode);
12451 + mark_inode_dirty(inode);
12452 +setversion_out:
12453 + mnt_drop_write_file(filp);
12454 + break;
12455 + default:
12456 + err = -ENOTTY;
12459 + reiserfs_write_unlock(inode->i_sb);
12461 + return err;
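+/*
+ * Illustrative userspace usage (hypothetical fd, error handling elided),
+ * assuming the REISERFS_IOC_* definitions from the uapi header:
+ *
+ *	int gen;
+ *	ioctl(fd, REISERFS_IOC_GETVERSION, &gen);	- read i_generation
+ *	gen++;
+ *	ioctl(fd, REISERFS_IOC_SETVERSION, &gen);	- owner or
+ *							  CAP_FOWNER only
+ */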
12464 +#ifdef CONFIG_COMPAT
12465 +long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
12466 + unsigned long arg)
12468 + /*
12469 + * These are just misnamed, they actually
12470 + * get/put an int from/to user space
12471 + */
12472 + switch (cmd) {
12473 + case REISERFS_IOC32_UNPACK:
12474 + cmd = REISERFS_IOC_UNPACK;
12475 + break;
12476 + case REISERFS_IOC32_GETVERSION:
12477 + cmd = REISERFS_IOC_GETVERSION;
12478 + break;
12479 + case REISERFS_IOC32_SETVERSION:
12480 + cmd = REISERFS_IOC_SETVERSION;
12481 + break;
12482 + default:
12483 + return -ENOIOCTLCMD;
12486 + return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
12488 +#endif
12490 +int reiserfs_commit_write(struct file *f, struct page *page,
12491 + unsigned from, unsigned to);
12493 + * reiserfs_unpack
12494 + * Tries to convert the file tail from a direct item into an indirect one.
12495 + * It sets the nopack attribute in REISERFS_I(inode)->i_flags
12496 + */
12497 +int reiserfs_unpack(struct inode *inode)
12499 + int retval = 0;
12500 + int index;
12501 + struct page *page;
12502 + struct address_space *mapping;
12503 + unsigned long write_from;
12504 + unsigned long blocksize = inode->i_sb->s_blocksize;
12506 + if (inode->i_size == 0) {
12507 + REISERFS_I(inode)->i_flags |= i_nopack_mask;
12508 + return 0;
12510 + /* ioctl already done */
12511 + if (REISERFS_I(inode)->i_flags & i_nopack_mask) {
12512 + return 0;
12515 + /* we need to make sure nobody is changing the file size beneath us */
12517 + int depth = reiserfs_write_unlock_nested(inode->i_sb);
12519 + inode_lock(inode);
12520 + reiserfs_write_lock_nested(inode->i_sb, depth);
12523 + reiserfs_write_lock(inode->i_sb);
12525 + write_from = inode->i_size & (blocksize - 1);
12526 + /* if we are on a block boundary, we are already unpacked. */
12527 + if (write_from == 0) {
12528 + REISERFS_I(inode)->i_flags |= i_nopack_mask;
12529 + goto out;
12532 + /*
12533 + * we unpack by finding the page with the tail, and calling
12534 + * __reiserfs_write_begin on that page. This will force a
12535 + * reiserfs_get_block to unpack the tail for us.
12536 + */
12537 + index = inode->i_size >> PAGE_SHIFT;
12538 + mapping = inode->i_mapping;
12539 + page = grab_cache_page(mapping, index);
12540 + retval = -ENOMEM;
12541 + if (!page) {
12542 + goto out;
12544 + retval = __reiserfs_write_begin(page, write_from, 0);
12545 + if (retval)
12546 + goto out_unlock;
12548 + /* conversion can change page contents, must flush */
12549 + flush_dcache_page(page);
12550 + retval = reiserfs_commit_write(NULL, page, write_from, write_from);
12551 + REISERFS_I(inode)->i_flags |= i_nopack_mask;
12553 +out_unlock:
12554 + unlock_page(page);
12555 + put_page(page);
12557 +out:
12558 + inode_unlock(inode);
12559 + reiserfs_write_unlock(inode->i_sb);
12560 + return retval;
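+/*
+ * Sketch of the corresponding userspace call (path hypothetical, error
+ * handling elided); a non-zero argument both unpacks the tail and sets
+ * the nopack flag, as described above:
+ *
+ *	int fd = open("/mnt/data/file", O_RDONLY);
+ *	ioctl(fd, REISERFS_IOC_UNPACK, 1);
+ */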
12562 diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
12563 new file mode 100644
12564 index 000000000000..5011c10287c6
12565 --- /dev/null
12566 +++ b/fs/reiserfs/item_ops.c
12567 @@ -0,0 +1,737 @@
12569 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
12570 + */
12572 +#include <linux/time.h>
12573 +#include "reiserfs.h"
12576 + * this contains item handlers for old item types: sd, direct,
12577 + * indirect, directory
12578 + */
12581 + * and where are the comments? how about saying where we can find an
12582 + * explanation of each item handler method? -Hans
12583 + */
12585 +/* stat data functions */
12586 +static int sd_bytes_number(struct item_head *ih, int block_size)
12588 + return 0;
12591 +static void sd_decrement_key(struct cpu_key *key)
12593 + key->on_disk_key.k_objectid--;
12594 + set_cpu_key_k_type(key, TYPE_ANY);
12595 + set_cpu_key_k_offset(key, (loff_t)(~0ULL >> 1));
12598 +static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize)
12600 + return 0;
12603 +static void sd_print_item(struct item_head *ih, char *item)
12605 + printk("\tmode | size | nlinks | first direct | mtime\n");
12606 + if (stat_data_v1(ih)) {
12607 + struct stat_data_v1 *sd = (struct stat_data_v1 *)item;
12609 + printk("\t0%-6o | %6u | %2u | %d | %u\n", sd_v1_mode(sd),
12610 + sd_v1_size(sd), sd_v1_nlink(sd),
12611 + sd_v1_first_direct_byte(sd),
12612 + sd_v1_mtime(sd));
12613 + } else {
12614 + struct stat_data *sd = (struct stat_data *)item;
12616 + printk("\t0%-6o | %6llu | %2u | %d | %u\n", sd_v2_mode(sd),
12617 + (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
12618 + sd_v2_rdev(sd), sd_v2_mtime(sd));
12622 +static void sd_check_item(struct item_head *ih, char *item)
12624 + /* unused */
12627 +static int sd_create_vi(struct virtual_node *vn,
12628 + struct virtual_item *vi,
12629 + int is_affected, int insert_size)
12631 + vi->vi_index = TYPE_STAT_DATA;
12632 + return 0;
12635 +static int sd_check_left(struct virtual_item *vi, int free,
12636 + int start_skip, int end_skip)
12638 + BUG_ON(start_skip || end_skip);
12639 + return -1;
12642 +static int sd_check_right(struct virtual_item *vi, int free)
12644 + return -1;
12647 +static int sd_part_size(struct virtual_item *vi, int first, int count)
12649 + BUG_ON(count);
12650 + return 0;
12653 +static int sd_unit_num(struct virtual_item *vi)
12655 + return vi->vi_item_len - IH_SIZE;
12658 +static void sd_print_vi(struct virtual_item *vi)
12660 + reiserfs_warning(NULL, "reiserfs-16100",
12661 + "STATDATA, index %d, type 0x%x, %h",
12662 + vi->vi_index, vi->vi_type, vi->vi_ih);
12665 +static struct item_operations stat_data_ops = {
12666 + .bytes_number = sd_bytes_number,
12667 + .decrement_key = sd_decrement_key,
12668 + .is_left_mergeable = sd_is_left_mergeable,
12669 + .print_item = sd_print_item,
12670 + .check_item = sd_check_item,
12672 + .create_vi = sd_create_vi,
12673 + .check_left = sd_check_left,
12674 + .check_right = sd_check_right,
12675 + .part_size = sd_part_size,
12676 + .unit_num = sd_unit_num,
12677 + .print_vi = sd_print_vi
12680 +/* direct item functions */
12681 +static int direct_bytes_number(struct item_head *ih, int block_size)
12683 + return ih_item_len(ih);
12686 +/* FIXME: this should probably switch to indirect as well */
12687 +static void direct_decrement_key(struct cpu_key *key)
12689 + cpu_key_k_offset_dec(key);
12690 + if (cpu_key_k_offset(key) == 0)
12691 + set_cpu_key_k_type(key, TYPE_STAT_DATA);
12694 +static int direct_is_left_mergeable(struct reiserfs_key *key,
12695 + unsigned long bsize)
12697 + int version = le_key_version(key);
12698 + return ((le_key_k_offset(version, key) & (bsize - 1)) != 1);
12701 +static void direct_print_item(struct item_head *ih, char *item)
12703 + int j = 0;
12705 +/* return; */
12706 + printk("\"");
12707 + while (j < ih_item_len(ih))
12708 + printk("%c", item[j++]);
12709 + printk("\"\n");
12712 +static void direct_check_item(struct item_head *ih, char *item)
12714 + /* unused */
12717 +static int direct_create_vi(struct virtual_node *vn,
12718 + struct virtual_item *vi,
12719 + int is_affected, int insert_size)
12721 + vi->vi_index = TYPE_DIRECT;
12722 + return 0;
12725 +static int direct_check_left(struct virtual_item *vi, int free,
12726 + int start_skip, int end_skip)
12728 + int bytes;
12730 + bytes = free - free % 8;
12731 + return bytes ? : -1;
12734 +static int direct_check_right(struct virtual_item *vi, int free)
12736 + return direct_check_left(vi, free, 0, 0);
12739 +static int direct_part_size(struct virtual_item *vi, int first, int count)
12741 + return count;
12744 +static int direct_unit_num(struct virtual_item *vi)
12746 + return vi->vi_item_len - IH_SIZE;
12749 +static void direct_print_vi(struct virtual_item *vi)
12751 + reiserfs_warning(NULL, "reiserfs-16101",
12752 + "DIRECT, index %d, type 0x%x, %h",
12753 + vi->vi_index, vi->vi_type, vi->vi_ih);
12756 +static struct item_operations direct_ops = {
12757 + .bytes_number = direct_bytes_number,
12758 + .decrement_key = direct_decrement_key,
12759 + .is_left_mergeable = direct_is_left_mergeable,
12760 + .print_item = direct_print_item,
12761 + .check_item = direct_check_item,
12763 + .create_vi = direct_create_vi,
12764 + .check_left = direct_check_left,
12765 + .check_right = direct_check_right,
12766 + .part_size = direct_part_size,
12767 + .unit_num = direct_unit_num,
12768 + .print_vi = direct_print_vi
12771 +/* indirect item functions */
12772 +static int indirect_bytes_number(struct item_head *ih, int block_size)
12774 + return ih_item_len(ih) / UNFM_P_SIZE * block_size;
12777 +/* decrease offset, if it becomes 0, change type to stat data */
12778 +static void indirect_decrement_key(struct cpu_key *key)
12780 + cpu_key_k_offset_dec(key);
12781 + if (cpu_key_k_offset(key) == 0)
12782 + set_cpu_key_k_type(key, TYPE_STAT_DATA);
12785 +/* if it is not first item of the body, then it is mergeable */
12786 +static int indirect_is_left_mergeable(struct reiserfs_key *key,
12787 + unsigned long bsize)
12789 + int version = le_key_version(key);
12790 + return (le_key_k_offset(version, key) != 1);
12793 +/* printing of indirect item */
12794 +static void start_new_sequence(__u32 * start, int *len, __u32 new)
12796 + *start = new;
12797 + *len = 1;
12800 +static int sequence_finished(__u32 start, int *len, __u32 new)
12802 + if (start == INT_MAX)
12803 + return 1;
12805 + if (start == 0 && new == 0) {
12806 + (*len)++;
12807 + return 0;
12809 + if (start != 0 && (start + *len) == new) {
12810 + (*len)++;
12811 + return 0;
12813 + return 1;
12816 +static void print_sequence(__u32 start, int len)
12818 + if (start == INT_MAX)
12819 + return;
12821 + if (len == 1)
12822 + printk(" %d", start);
12823 + else
12824 + printk(" %d(%d)", start, len);
12827 +static void indirect_print_item(struct item_head *ih, char *item)
12829 + int j;
12830 + __le32 *unp;
12831 + __u32 prev = INT_MAX;
12832 + int num = 0;
12834 + unp = (__le32 *) item;
12836 + if (ih_item_len(ih) % UNFM_P_SIZE)
12837 + reiserfs_warning(NULL, "reiserfs-16102", "invalid item len");
12839 + printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih));
12840 + for (j = 0; j < I_UNFM_NUM(ih); j++) {
12841 + if (sequence_finished(prev, &num, get_block_num(unp, j))) {
12842 + print_sequence(prev, num);
12843 + start_new_sequence(&prev, &num, get_block_num(unp, j));
12846 + print_sequence(prev, num);
12847 + printk("]\n");
12850 +static void indirect_check_item(struct item_head *ih, char *item)
12852 + /* unused */
12855 +static int indirect_create_vi(struct virtual_node *vn,
12856 + struct virtual_item *vi,
12857 + int is_affected, int insert_size)
12859 + vi->vi_index = TYPE_INDIRECT;
12860 + return 0;
12863 +static int indirect_check_left(struct virtual_item *vi, int free,
12864 + int start_skip, int end_skip)
12866 + int bytes;
12868 + bytes = free - free % UNFM_P_SIZE;
12869 + return bytes ? : -1;
12872 +static int indirect_check_right(struct virtual_item *vi, int free)
12874 + return indirect_check_left(vi, free, 0, 0);
12878 + * return size in bytes of 'units' units. If first == 0 - calculate
12879 + * from the head (left), otherwise - from tail (right)
12880 + */
12881 +static int indirect_part_size(struct virtual_item *vi, int first, int units)
12883 + /* unit of indirect item is byte (yet) */
12884 + return units;
12887 +static int indirect_unit_num(struct virtual_item *vi)
12889 + /* unit of indirect item is byte (yet) */
12890 + return vi->vi_item_len - IH_SIZE;
12893 +static void indirect_print_vi(struct virtual_item *vi)
12895 + reiserfs_warning(NULL, "reiserfs-16103",
12896 + "INDIRECT, index %d, type 0x%x, %h",
12897 + vi->vi_index, vi->vi_type, vi->vi_ih);
12900 +static struct item_operations indirect_ops = {
12901 + .bytes_number = indirect_bytes_number,
12902 + .decrement_key = indirect_decrement_key,
12903 + .is_left_mergeable = indirect_is_left_mergeable,
12904 + .print_item = indirect_print_item,
12905 + .check_item = indirect_check_item,
12907 + .create_vi = indirect_create_vi,
12908 + .check_left = indirect_check_left,
12909 + .check_right = indirect_check_right,
12910 + .part_size = indirect_part_size,
12911 + .unit_num = indirect_unit_num,
12912 + .print_vi = indirect_print_vi
12915 +/* direntry functions */
12916 +static int direntry_bytes_number(struct item_head *ih, int block_size)
12918 + reiserfs_warning(NULL, "vs-16090",
12919 + "bytes number is asked for direntry");
12920 + return 0;
12923 +static void direntry_decrement_key(struct cpu_key *key)
12925 + cpu_key_k_offset_dec(key);
12926 + if (cpu_key_k_offset(key) == 0)
12927 + set_cpu_key_k_type(key, TYPE_STAT_DATA);
12930 +static int direntry_is_left_mergeable(struct reiserfs_key *key,
12931 + unsigned long bsize)
12933 + if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET)
12934 + return 0;
12935 + return 1;
12939 +static void direntry_print_item(struct item_head *ih, char *item)
12941 + int i;
12942 + int namelen;
12943 + struct reiserfs_de_head *deh;
12944 + char *name;
12945 + static char namebuf[80];
12947 + printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name",
12948 + "Key of pointed object", "Hash", "Gen number", "Status");
12950 + deh = (struct reiserfs_de_head *)item;
12952 + for (i = 0; i < ih_entry_count(ih); i++, deh++) {
12953 + namelen =
12954 + (i ? (deh_location(deh - 1)) : ih_item_len(ih)) -
12955 + deh_location(deh);
12956 + name = item + deh_location(deh);
12957 + if (name[namelen - 1] == 0)
12958 + namelen = strlen(name);
12960 + scnprintf(namebuf, sizeof(namebuf), "\"%.*s\"",
12961 + (int)sizeof(namebuf)-3, name);
12963 + printk("%d: %-15s%-15d%-15d%-15lld%-15lld(%s)\n",
12964 + i, namebuf,
12965 + deh_dir_id(deh), deh_objectid(deh),
12966 + GET_HASH_VALUE(deh_offset(deh)),
12967 + GET_GENERATION_NUMBER((deh_offset(deh))),
12968 + (de_hidden(deh)) ? "HIDDEN" : "VISIBLE");
12972 +static void direntry_check_item(struct item_head *ih, char *item)
12974 + int i;
12975 + struct reiserfs_de_head *deh;
12977 + /* unused */
12978 + deh = (struct reiserfs_de_head *)item;
12979 + for (i = 0; i < ih_entry_count(ih); i++, deh++) {
12984 +#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1
12987 + * function returns old entry number in directory item in real node
12988 + * using new entry number in virtual item in virtual node
12989 + */
12990 +static inline int old_entry_num(int is_affected, int virtual_entry_num,
12991 + int pos_in_item, int mode)
12993 + if (mode == M_INSERT || mode == M_DELETE)
12994 + return virtual_entry_num;
12996 + if (!is_affected)
12997 + /* cut or paste is applied to another item */
12998 + return virtual_entry_num;
13000 + if (virtual_entry_num < pos_in_item)
13001 + return virtual_entry_num;
13003 + if (mode == M_CUT)
13004 + return virtual_entry_num + 1;
13006 + RFALSE(mode != M_PASTE || virtual_entry_num == 0,
13007 + "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'",
13008 + mode);
13010 + return virtual_entry_num - 1;
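+/*
+ * Example: with M_CUT at pos_in_item 2 in the affected item, virtual
+ * entries 0 and 1 map to real entries 0 and 1, while virtual entry 2
+ * maps to real entry 3 (real entry 2 is the one being cut). With
+ * M_PASTE the shift goes the other way: virtual entries past the paste
+ * point map one lower, since the pasted entry has no real counterpart
+ * yet.
+ */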
13014 + * Create an array of sizes of directory entries for virtual
13015 + * item. Return space used by an item. FIXME: no control over
13016 + * consuming of space used by this item handler
13017 + */
13018 +static int direntry_create_vi(struct virtual_node *vn,
13019 + struct virtual_item *vi,
13020 + int is_affected, int insert_size)
13022 + struct direntry_uarea *dir_u = vi->vi_uarea;
13023 + int i, j;
13024 + int size = sizeof(struct direntry_uarea);
13025 + struct reiserfs_de_head *deh;
13027 + vi->vi_index = TYPE_DIRENTRY;
13029 + BUG_ON(!(vi->vi_ih) || !vi->vi_item);
13031 + dir_u->flags = 0;
13032 + if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET)
13033 + dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM;
13035 + deh = (struct reiserfs_de_head *)(vi->vi_item);
13037 + /* the virtual directory item will have this many entries after the operation */
13038 + dir_u->entry_count = ih_entry_count(vi->vi_ih) +
13039 + ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 :
13040 + (vn->vn_mode == M_PASTE ? 1 : 0)) : 0);
13042 + for (i = 0; i < dir_u->entry_count; i++) {
13043 + j = old_entry_num(is_affected, i, vn->vn_pos_in_item,
13044 + vn->vn_mode);
13045 + dir_u->entry_sizes[i] =
13046 + (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) -
13047 + deh_location(&deh[j]) + DEH_SIZE;
13050 + size += (dir_u->entry_count * sizeof(short));
13052 + /* set size of pasted entry */
13053 + if (is_affected && vn->vn_mode == M_PASTE)
13054 + dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size;
13056 +#ifdef CONFIG_REISERFS_CHECK
13057 + /* compare total size of entries with item length */
13059 + int k, l;
13061 + l = 0;
13062 + for (k = 0; k < dir_u->entry_count; k++)
13063 + l += dir_u->entry_sizes[k];
13065 + if (l + IH_SIZE != vi->vi_item_len +
13066 + ((is_affected
13067 + && (vn->vn_mode == M_PASTE
13068 + || vn->vn_mode == M_CUT)) ? insert_size : 0)) {
13069 + reiserfs_panic(NULL, "vs-8025", "(mode==%c, "
13070 + "insert_size==%d), invalid length of "
13071 + "directory item",
13072 + vn->vn_mode, insert_size);
13075 +#endif
13077 + return size;
13082 + * return number of entries which may fit into specified amount of
13083 + * free space, or -1 if free space is not enough even for 1 entry
13084 + */
13085 +static int direntry_check_left(struct virtual_item *vi, int free,
13086 + int start_skip, int end_skip)
13088 + int i;
13089 + int entries = 0;
13090 + struct direntry_uarea *dir_u = vi->vi_uarea;
13092 + for (i = start_skip; i < dir_u->entry_count - end_skip; i++) {
13093 + /* i-th entry doesn't fit into the remaining free space */
13094 + if (dir_u->entry_sizes[i] > free)
13095 + break;
13097 + free -= dir_u->entry_sizes[i];
13098 + entries++;
13101 + if (entries == dir_u->entry_count) {
13102 + reiserfs_panic(NULL, "item_ops-1",
13103 + "free space %d, entry_count %d", free,
13104 + dir_u->entry_count);
13107 + /* "." and ".." can not be separated from each other */
13108 + if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
13109 + && entries < 2)
13110 + entries = 0;
13112 + return entries ? : -1;
13115 +static int direntry_check_right(struct virtual_item *vi, int free)
13117 + int i;
13118 + int entries = 0;
13119 + struct direntry_uarea *dir_u = vi->vi_uarea;
13121 + for (i = dir_u->entry_count - 1; i >= 0; i--) {
13122 + /* i-th entry doesn't fit into the remaining free space */
13123 + if (dir_u->entry_sizes[i] > free)
13124 + break;
13126 + free -= dir_u->entry_sizes[i];
13127 + entries++;
13129 + BUG_ON(entries == dir_u->entry_count);
13131 + /* "." and ".." can not be separated from each other */
13132 + if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
13133 + && entries > dir_u->entry_count - 2)
13134 + entries = dir_u->entry_count - 2;
13136 + return entries ? : -1;
13139 +/* sum of entry sizes between from-th and to-th entries including both edges */
13140 +static int direntry_part_size(struct virtual_item *vi, int first, int count)
13142 + int i, retval;
13143 + int from, to;
13144 + struct direntry_uarea *dir_u = vi->vi_uarea;
13146 + retval = 0;
13147 + if (first == 0)
13148 + from = 0;
13149 + else
13150 + from = dir_u->entry_count - count;
13151 + to = from + count - 1;
13153 + for (i = from; i <= to; i++)
13154 + retval += dir_u->entry_sizes[i];
13156 + return retval;
13159 +static int direntry_unit_num(struct virtual_item *vi)
13161 + struct direntry_uarea *dir_u = vi->vi_uarea;
13163 + return dir_u->entry_count;
13166 +static void direntry_print_vi(struct virtual_item *vi)
13168 + int i;
13169 + struct direntry_uarea *dir_u = vi->vi_uarea;
13171 + reiserfs_warning(NULL, "reiserfs-16104",
13172 + "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x",
13173 + vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags);
13174 + printk("%d entries: ", dir_u->entry_count);
13175 + for (i = 0; i < dir_u->entry_count; i++)
13176 + printk("%d ", dir_u->entry_sizes[i]);
13177 + printk("\n");
13180 +static struct item_operations direntry_ops = {
13181 + .bytes_number = direntry_bytes_number,
13182 + .decrement_key = direntry_decrement_key,
13183 + .is_left_mergeable = direntry_is_left_mergeable,
13184 + .print_item = direntry_print_item,
13185 + .check_item = direntry_check_item,
13187 + .create_vi = direntry_create_vi,
13188 + .check_left = direntry_check_left,
13189 + .check_right = direntry_check_right,
13190 + .part_size = direntry_part_size,
13191 + .unit_num = direntry_unit_num,
13192 + .print_vi = direntry_print_vi
13195 +/* Error catching functions to catch errors caused by incorrect item types. */
13196 +static int errcatch_bytes_number(struct item_head *ih, int block_size)
13198 + reiserfs_warning(NULL, "green-16001",
13199 + "Invalid item type observed, run fsck ASAP");
13200 + return 0;
13203 +static void errcatch_decrement_key(struct cpu_key *key)
13205 + reiserfs_warning(NULL, "green-16002",
13206 + "Invalid item type observed, run fsck ASAP");
13209 +static int errcatch_is_left_mergeable(struct reiserfs_key *key,
13210 + unsigned long bsize)
13212 + reiserfs_warning(NULL, "green-16003",
13213 + "Invalid item type observed, run fsck ASAP");
13214 + return 0;
13217 +static void errcatch_print_item(struct item_head *ih, char *item)
13219 + reiserfs_warning(NULL, "green-16004",
13220 + "Invalid item type observed, run fsck ASAP");
13223 +static void errcatch_check_item(struct item_head *ih, char *item)
13225 + reiserfs_warning(NULL, "green-16005",
13226 + "Invalid item type observed, run fsck ASAP");
13229 +static int errcatch_create_vi(struct virtual_node *vn,
13230 + struct virtual_item *vi,
13231 + int is_affected, int insert_size)
13233 + reiserfs_warning(NULL, "green-16006",
13234 + "Invalid item type observed, run fsck ASAP");
13235 + /*
13236 + * We might return -1 here as well, but it wouldn't help:
13237 + * create_virtual_node(), from which this operation is called,
13238 + * returns void.
13239 + */
13240 + return 0;
13243 +static int errcatch_check_left(struct virtual_item *vi, int free,
13244 + int start_skip, int end_skip)
13246 + reiserfs_warning(NULL, "green-16007",
13247 + "Invalid item type observed, run fsck ASAP");
13248 + return -1;
13251 +static int errcatch_check_right(struct virtual_item *vi, int free)
13253 + reiserfs_warning(NULL, "green-16008",
13254 + "Invalid item type observed, run fsck ASAP");
13255 + return -1;
13258 +static int errcatch_part_size(struct virtual_item *vi, int first, int count)
13260 + reiserfs_warning(NULL, "green-16009",
13261 + "Invalid item type observed, run fsck ASAP");
13262 + return 0;
13265 +static int errcatch_unit_num(struct virtual_item *vi)
13267 + reiserfs_warning(NULL, "green-16010",
13268 + "Invalid item type observed, run fsck ASAP");
13269 + return 0;
13272 +static void errcatch_print_vi(struct virtual_item *vi)
13274 + reiserfs_warning(NULL, "green-16011",
13275 + "Invalid item type observed, run fsck ASAP");
13278 +static struct item_operations errcatch_ops = {
13279 + .bytes_number = errcatch_bytes_number,
13280 + .decrement_key = errcatch_decrement_key,
13281 + .is_left_mergeable = errcatch_is_left_mergeable,
13282 + .print_item = errcatch_print_item,
13283 + .check_item = errcatch_check_item,
13285 + .create_vi = errcatch_create_vi,
13286 + .check_left = errcatch_check_left,
13287 + .check_right = errcatch_check_right,
13288 + .part_size = errcatch_part_size,
13289 + .unit_num = errcatch_unit_num,
13290 + .print_vi = errcatch_print_vi
13293 +#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
13294 +#error Item types must use disk-format assigned values.
13295 +#endif
13297 +struct item_operations *item_ops[TYPE_ANY + 1] = {
13298 + &stat_data_ops,
13299 + &indirect_ops,
13300 + &direct_ops,
13301 + &direntry_ops,
13302 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
13303 + &errcatch_ops /* This is to catch errors with invalid type (15th entry for TYPE_ANY) */
13305 diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
13306 new file mode 100644
13307 index 000000000000..e477ee0ff35d
13308 --- /dev/null
13309 +++ b/fs/reiserfs/journal.c
13310 @@ -0,0 +1,4404 @@
13311 +// SPDX-License-Identifier: GPL-2.0
13313 + * Write ahead logging implementation copyright Chris Mason 2000
13315 + * The background commits make this code very interrelated, and
13316 + * overly complex. I need to rethink things a bit....The major players:
13318 + * journal_begin -- call with the number of blocks you expect to log.
13319 + * If the current transaction is too
13320 + * old, it will block until the current transaction is
13321 + * finished, and then start a new one.
13322 + * Usually, your transaction will get joined in with
13323 + * previous ones for speed.
13325 + * journal_join -- same as journal_begin, but won't block on the current
13326 + * transaction regardless of age. Don't ever call
13327 + * this. Ever. There are only two places it should be
13328 + * called from, and they are both inside this file.
13330 + * journal_mark_dirty -- adds blocks into this transaction. clears any flags
13331 + * that might make them get sent to disk
13332 + * and then marks them BH_JDirty. Puts the buffer head
13333 + * into the current transaction hash.
13335 + * journal_end -- if the current transaction is batchable, it does nothing
13336 + * otherwise, it could do an async/synchronous commit, or
13337 + * a full flush of all log and real blocks in the
13338 + * transaction.
13340 + * flush_old_commits -- if the current transaction is too old, it is ended and
13341 + * commit blocks are sent to disk. Forces commit blocks
13342 + * to disk for all backgrounded commits that have been
13343 + * around too long.
13344 + * -- Note, if you call this as an immediate flush from
13345 + * within kupdate, it will ignore the immediate flag
13346 + */
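+/*
+ * A minimal sketch of the usual calling sequence for a metadata update,
+ * assuming the in-tree signatures of these helpers:
+ *
+ *	struct reiserfs_transaction_handle th;
+ *	journal_begin(&th, sb, JOURNAL_PER_BALANCE_CNT);
+ *	...modify the buffer bh...
+ *	journal_mark_dirty(&th, bh);
+ *	journal_end(&th);
+ */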
13348 +#include <linux/time.h>
13349 +#include <linux/semaphore.h>
13350 +#include <linux/vmalloc.h>
13351 +#include "reiserfs.h"
13352 +#include <linux/kernel.h>
13353 +#include <linux/errno.h>
13354 +#include <linux/fcntl.h>
13355 +#include <linux/stat.h>
13356 +#include <linux/string.h>
13357 +#include <linux/buffer_head.h>
13358 +#include <linux/workqueue.h>
13359 +#include <linux/writeback.h>
13360 +#include <linux/blkdev.h>
13361 +#include <linux/backing-dev.h>
13362 +#include <linux/uaccess.h>
13363 +#include <linux/slab.h>
13366 +/* gets a struct reiserfs_journal_list * from a list head */
13367 +#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
13368 + j_list))
13370 +/* must be correct to keep the desc and commit structs at 4k */
13371 +#define JOURNAL_TRANS_HALF 1018
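+/*
+ * rough math, assuming 4k blocks: three 4-byte header fields plus a
+ * 12-byte magic leave 4096 - 24 = 4072 bytes, i.e. 1018 four-byte
+ * block-number slots per desc/commit block.
+ */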
12372 +#define BUFNR 64 /* read ahead */
13374 +/* cnode stat bits. Move these into reiserfs_fs.h */
13376 +/* this block was freed, and can't be written. */
13377 +#define BLOCK_FREED 2
13378 +/* this block was freed during this transaction, and can't be written */
13379 +#define BLOCK_FREED_HOLDER 3
13381 +/* used in flush_journal_list */
13382 +#define BLOCK_NEEDS_FLUSH 4
13383 +#define BLOCK_DIRTIED 5
13385 +/* journal list state bits */
13386 +#define LIST_TOUCHED 1
13387 +#define LIST_DIRTY 2
13388 +#define LIST_COMMIT_PENDING 4 /* someone will commit this list */
13390 +/* flags for do_journal_end */
13391 +#define FLUSH_ALL 1 /* flush commit and real blocks */
13392 +#define COMMIT_NOW 2 /* end and commit this transaction */
13393 +#define WAIT 4 /* wait for the log blocks to hit the disk */
13395 +static int do_journal_end(struct reiserfs_transaction_handle *, int flags);
13396 +static int flush_journal_list(struct super_block *s,
13397 + struct reiserfs_journal_list *jl, int flushall);
13398 +static int flush_commit_list(struct super_block *s,
13399 + struct reiserfs_journal_list *jl, int flushall);
13400 +static int can_dirty(struct reiserfs_journal_cnode *cn);
13401 +static int journal_join(struct reiserfs_transaction_handle *th,
13402 + struct super_block *sb);
13403 +static void release_journal_dev(struct reiserfs_journal *journal);
13404 +static void dirty_one_transaction(struct super_block *s,
13405 + struct reiserfs_journal_list *jl);
13406 +static void flush_async_commits(struct work_struct *work);
13407 +static void queue_log_writer(struct super_block *s);
13409 +/* values for join in do_journal_begin_r */
13410 +enum {
13411 + JBEGIN_REG = 0, /* regular journal begin */
13412 + /* join the running transaction if at all possible */
13413 + JBEGIN_JOIN = 1,
13414 + /* called from cleanup code, ignores aborted flag */
13415 + JBEGIN_ABORT = 2,
13418 +static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
13419 + struct super_block *sb,
13420 + unsigned long nblocks, int join);
13422 +static void init_journal_hash(struct super_block *sb)
13424 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13425 + memset(journal->j_hash_table, 0,
13426 + JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
13430 + * clears BH_Dirty and sticks the buffer on the clean list. Called because
13431 + * I can't allow refile_buffer to make schedule happen after I've freed a
13432 + * block. Look at remove_from_transaction and journal_mark_freed for
13433 + * more details.
13434 + */
13435 +static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
13437 + if (bh) {
13438 + clear_buffer_dirty(bh);
13439 + clear_buffer_journal_test(bh);
13441 + return 0;
13444 +static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
13445 + *sb)
13447 + struct reiserfs_bitmap_node *bn;
13448 + static int id;
13450 + bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS);
13451 + if (!bn) {
13452 + return NULL;
13454 + bn->data = kzalloc(sb->s_blocksize, GFP_NOFS);
13455 + if (!bn->data) {
13456 + kfree(bn);
13457 + return NULL;
13459 + bn->id = id++;
13460 + INIT_LIST_HEAD(&bn->list);
13461 + return bn;
13464 +static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
13466 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13467 + struct reiserfs_bitmap_node *bn = NULL;
13468 + struct list_head *entry = journal->j_bitmap_nodes.next;
13470 + journal->j_used_bitmap_nodes++;
13471 +repeat:
13473 + if (entry != &journal->j_bitmap_nodes) {
13474 + bn = list_entry(entry, struct reiserfs_bitmap_node, list);
13475 + list_del(entry);
13476 + memset(bn->data, 0, sb->s_blocksize);
13477 + journal->j_free_bitmap_nodes--;
13478 + return bn;
13480 + bn = allocate_bitmap_node(sb);
13481 + if (!bn) {
13482 + yield();
13483 + goto repeat;
13485 + return bn;
13487 +static inline void free_bitmap_node(struct super_block *sb,
13488 + struct reiserfs_bitmap_node *bn)
13490 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13491 + journal->j_used_bitmap_nodes--;
13492 + if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
13493 + kfree(bn->data);
13494 + kfree(bn);
13495 + } else {
13496 + list_add(&bn->list, &journal->j_bitmap_nodes);
13497 + journal->j_free_bitmap_nodes++;
13501 +static void allocate_bitmap_nodes(struct super_block *sb)
13503 + int i;
13504 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13505 + struct reiserfs_bitmap_node *bn = NULL;
13506 + for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) {
13507 + bn = allocate_bitmap_node(sb);
13508 + if (bn) {
13509 + list_add(&bn->list, &journal->j_bitmap_nodes);
13510 + journal->j_free_bitmap_nodes++;
13511 + } else {
13512 + /* this is ok, we'll try again when more are needed */
13513 + break;
13518 +static int set_bit_in_list_bitmap(struct super_block *sb,
13519 + b_blocknr_t block,
13520 + struct reiserfs_list_bitmap *jb)
13522 + unsigned int bmap_nr = block / (sb->s_blocksize << 3);
13523 + unsigned int bit_nr = block % (sb->s_blocksize << 3);
13525 + if (!jb->bitmaps[bmap_nr]) {
13526 + jb->bitmaps[bmap_nr] = get_bitmap_node(sb);
13528 + set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data);
13529 + return 0;
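+/*
+ * e.g. with a 4k block size each bitmap node covers 4096 * 8 = 32768
+ * blocks, so block 100000 lands in bitmap node 3 at bit 1696.
+ */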
13532 +static void cleanup_bitmap_list(struct super_block *sb,
13533 + struct reiserfs_list_bitmap *jb)
13535 + int i;
13536 + if (jb->bitmaps == NULL)
13537 + return;
13539 + for (i = 0; i < reiserfs_bmap_count(sb); i++) {
13540 + if (jb->bitmaps[i]) {
13541 + free_bitmap_node(sb, jb->bitmaps[i]);
13542 + jb->bitmaps[i] = NULL;
13548 + * only call this on FS unmount.
13549 + */
13550 +static int free_list_bitmaps(struct super_block *sb,
13551 + struct reiserfs_list_bitmap *jb_array)
13553 + int i;
13554 + struct reiserfs_list_bitmap *jb;
13555 + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
13556 + jb = jb_array + i;
13557 + jb->journal_list = NULL;
13558 + cleanup_bitmap_list(sb, jb);
13559 + vfree(jb->bitmaps);
13560 + jb->bitmaps = NULL;
13562 + return 0;
13565 +static int free_bitmap_nodes(struct super_block *sb)
13567 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13568 + struct list_head *next = journal->j_bitmap_nodes.next;
13569 + struct reiserfs_bitmap_node *bn;
13571 + while (next != &journal->j_bitmap_nodes) {
13572 + bn = list_entry(next, struct reiserfs_bitmap_node, list);
13573 + list_del(next);
13574 + kfree(bn->data);
13575 + kfree(bn);
13576 + next = journal->j_bitmap_nodes.next;
13577 + journal->j_free_bitmap_nodes--;
13580 + return 0;
13584 + * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
13585 + * jb_array is the array to be filled in.
13586 + */
13587 +int reiserfs_allocate_list_bitmaps(struct super_block *sb,
13588 + struct reiserfs_list_bitmap *jb_array,
13589 + unsigned int bmap_nr)
13591 + int i;
13592 + int failed = 0;
13593 + struct reiserfs_list_bitmap *jb;
13594 + int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *);
13596 + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
13597 + jb = jb_array + i;
13598 + jb->journal_list = NULL;
13599 + jb->bitmaps = vzalloc(mem);
13600 + if (!jb->bitmaps) {
13601 + reiserfs_warning(sb, "clm-2000", "unable to "
13602 + "allocate bitmaps for journal lists");
13603 + failed = 1;
13604 + break;
13607 + if (failed) {
13608 + free_list_bitmaps(sb, jb_array);
13609 + return -1;
13611 + return 0;
13615 + * find an available list bitmap. If you can't find one, flush a commit list
13616 + * and try again
13617 + */
13618 +static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
13619 + struct reiserfs_journal_list
13620 + *jl)
13622 + int i, j;
13623 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13624 + struct reiserfs_list_bitmap *jb = NULL;
13626 + for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
13627 + i = journal->j_list_bitmap_index;
13628 + journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
13629 + jb = journal->j_list_bitmap + i;
13630 + if (journal->j_list_bitmap[i].journal_list) {
13631 + flush_commit_list(sb,
13632 + journal->j_list_bitmap[i].
13633 + journal_list, 1);
13634 + if (!journal->j_list_bitmap[i].journal_list) {
13635 + break;
13637 + } else {
13638 + break;
13641 + /* double check to make sure it was flushed correctly */
13642 + if (jb->journal_list)
13643 + return NULL;
13644 + jb->journal_list = jl;
13645 + return jb;
13649 + * allocates a new chunk of X nodes, and links them all together as a list.
13650 + * Uses the cnode->next and cnode->prev pointers
13651 + * returns NULL on failure
13652 + */
13653 +static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
13655 + struct reiserfs_journal_cnode *head;
13656 + int i;
13657 + if (num_cnodes <= 0) {
13658 + return NULL;
13660 + head = vzalloc(array_size(num_cnodes,
13661 + sizeof(struct reiserfs_journal_cnode)));
13662 + if (!head) {
13663 + return NULL;
13665 + head[0].prev = NULL;
13666 + head[0].next = head + 1;
13667 + for (i = 1; i < num_cnodes; i++) {
13668 + head[i].prev = head + (i - 1);
13669 + head[i].next = head + (i + 1); /* the last entry is fixed up after the loop */
13671 + head[num_cnodes - 1].next = NULL;
13672 + return head;
13675 +/* pulls a cnode off the free list, or returns NULL on failure */
13676 +static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
13678 + struct reiserfs_journal_cnode *cn;
13679 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13681 + reiserfs_check_lock_depth(sb, "get_cnode");
13683 + if (journal->j_cnode_free <= 0) {
13684 + return NULL;
13686 + journal->j_cnode_used++;
13687 + journal->j_cnode_free--;
13688 + cn = journal->j_cnode_free_list;
13689 + if (!cn) {
13690 + return cn;
13692 + if (cn->next) {
13693 + cn->next->prev = NULL;
13695 + journal->j_cnode_free_list = cn->next;
13696 + memset(cn, 0, sizeof(struct reiserfs_journal_cnode));
13697 + return cn;
13701 + * returns a cnode to the free list
13702 + */
13703 +static void free_cnode(struct super_block *sb,
13704 + struct reiserfs_journal_cnode *cn)
13706 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13708 + reiserfs_check_lock_depth(sb, "free_cnode");
13710 + journal->j_cnode_used--;
13711 + journal->j_cnode_free++;
13712 + /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */
13713 + cn->next = journal->j_cnode_free_list;
13714 + if (journal->j_cnode_free_list) {
13715 + journal->j_cnode_free_list->prev = cn;
13717 + cn->prev = NULL; /* not needed with the memset, but I might kill the memset, and forget to do this */
13718 + journal->j_cnode_free_list = cn;
13721 +static void clear_prepared_bits(struct buffer_head *bh)
13723 + clear_buffer_journal_prepared(bh);
13724 + clear_buffer_journal_restore_dirty(bh);
13728 + * return a cnode with same dev, block number and size in table,
13729 + * or null if not found
13730 + */
13731 +static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
13732 + super_block
13733 + *sb,
13734 + struct
13735 + reiserfs_journal_cnode
13736 + **table,
13737 + long bl)
13739 + struct reiserfs_journal_cnode *cn;
13740 + cn = journal_hash(table, sb, bl);
13741 + while (cn) {
13742 + if (cn->blocknr == bl && cn->sb == sb)
13743 + return cn;
13744 + cn = cn->hnext;
13746 + return (struct reiserfs_journal_cnode *)0;
13750 + * this actually means 'can this block be reallocated yet?'. If you set
13751 + * search_all, a block can only be allocated if it is not in the current
13752 + * transaction, was not freed by the current transaction, and has no chance
13753 + * of ever being overwritten by a replay after crashing.
13755 + * If you don't set search_all, a block can only be allocated if it is not
13756 + * in the current transaction. Since deleting a block removes it from the
13757 + * current transaction, this case should never happen. If you don't set
13758 + * search_all, make sure you never write the block without logging it.
13760 + * next_zero_bit is a suggestion about the next block to try for find_forward.
13761 + * when bl is rejected because it is set in a journal list bitmap, we search
13762 + * for the next zero bit in the bitmap that rejected bl. Then, we return
13763 + * that through next_zero_bit for find_forward to try.
13765 + * Just because we return something in next_zero_bit does not mean we won't
13766 + * reject it on the next call to reiserfs_in_journal
13767 + */
13768 +int reiserfs_in_journal(struct super_block *sb,
13769 + unsigned int bmap_nr, int bit_nr, int search_all,
13770 + b_blocknr_t * next_zero_bit)
13772 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13773 + struct reiserfs_list_bitmap *jb;
13774 + int i;
13775 + unsigned long bl;
13777 + *next_zero_bit = 0; /* always start this at zero. */
13779 + PROC_INFO_INC(sb, journal.in_journal);
13780 + /*
13781 + * If we aren't doing a search_all, this is a metablock, and it
13782 + * will be logged before use. if we crash before the transaction
13783 + * that freed it commits, this transaction won't have committed
13784 + * either, and the block will never be written
13785 + */
13786 + if (search_all) {
13787 + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
13788 + PROC_INFO_INC(sb, journal.in_journal_bitmap);
13789 + jb = journal->j_list_bitmap + i;
13790 + if (jb->journal_list && jb->bitmaps[bmap_nr] &&
13791 + test_bit(bit_nr,
13792 + (unsigned long *)jb->bitmaps[bmap_nr]->
13793 + data)) {
13794 + *next_zero_bit =
13795 + find_next_zero_bit((unsigned long *)
13796 + (jb->bitmaps[bmap_nr]->
13797 + data),
13798 + sb->s_blocksize << 3,
13799 + bit_nr + 1);
13800 + return 1;
13805 + bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr;
13806 + /* is it in any old transactions? */
13807 + if (search_all
13808 + && (get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) {
13809 + return 1;
13812 + /* is it in the current transaction. This should never happen */
13813 + if ((get_journal_hash_dev(sb, journal->j_hash_table, bl))) {
13814 + BUG();
13815 + return 1;
13818 + PROC_INFO_INC(sb, journal.in_journal_reusable);
13819 + /* safe for reuse */
13820 + return 0;
13823 +/* insert cn into table */
13824 +static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
13825 + struct reiserfs_journal_cnode *cn)
13827 + struct reiserfs_journal_cnode *cn_orig;
13829 + cn_orig = journal_hash(table, cn->sb, cn->blocknr);
13830 + cn->hnext = cn_orig;
13831 + cn->hprev = NULL;
13832 + if (cn_orig) {
13833 + cn_orig->hprev = cn;
13835 + journal_hash(table, cn->sb, cn->blocknr) = cn;
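+/*
+ * journal_hash() expands to the hash bucket itself (an lvalue), which is
+ * why it can appear on the left-hand side of the assignment above.
+ */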
13838 +/* lock the current transaction */
13839 +static inline void lock_journal(struct super_block *sb)
13841 + PROC_INFO_INC(sb, journal.lock_journal);
13843 + reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
13846 +/* unlock the current transaction */
13847 +static inline void unlock_journal(struct super_block *sb)
13849 + mutex_unlock(&SB_JOURNAL(sb)->j_mutex);
13852 +static inline void get_journal_list(struct reiserfs_journal_list *jl)
13854 + jl->j_refcount++;
13857 +static inline void put_journal_list(struct super_block *s,
13858 + struct reiserfs_journal_list *jl)
13860 + if (jl->j_refcount < 1) {
13861 + reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d",
13862 + jl->j_trans_id, jl->j_refcount);
13864 + if (--jl->j_refcount == 0)
13865 + kfree(jl);
13869 + * this used to be much more involved, and I'm keeping it just in case
13870 + * things get ugly again. it gets called by flush_commit_list, and
13871 + * cleans up any data stored about blocks freed during a transaction.
13872 + */
13873 +static void cleanup_freed_for_journal_list(struct super_block *sb,
13874 + struct reiserfs_journal_list *jl)
13877 + struct reiserfs_list_bitmap *jb = jl->j_list_bitmap;
13878 + if (jb) {
13879 + cleanup_bitmap_list(sb, jb);
13881 + jl->j_list_bitmap->journal_list = NULL;
13882 + jl->j_list_bitmap = NULL;
13885 +static int journal_list_still_alive(struct super_block *s,
13886 + unsigned int trans_id)
13888 + struct reiserfs_journal *journal = SB_JOURNAL(s);
13889 + struct list_head *entry = &journal->j_journal_list;
13890 + struct reiserfs_journal_list *jl;
13892 + if (!list_empty(entry)) {
13893 + jl = JOURNAL_LIST_ENTRY(entry->next);
13894 + if (jl->j_trans_id <= trans_id) {
13895 + return 1;
13898 + return 0;
13902 + * If page->mapping was null, we failed to truncate this page for
13903 + * some reason. Most likely because it was truncated after being
13904 + * logged via data=journal.
13906 + * This does a check to see if the buffer belongs to one of these
13907 + * lost pages before doing the final put_bh. If page->mapping was
13908 + * null, it tries to free buffers on the page, which should make the
13909 + * final put_page drop the page from the lru.
13910 + */
13911 +static void release_buffer_page(struct buffer_head *bh)
13913 + struct folio *folio = bh->b_folio;
13914 + if (!folio->mapping && folio_trylock(folio)) {
13915 + folio_get(folio);
13916 + put_bh(bh);
13917 + if (!folio->mapping)
13918 + try_to_free_buffers(folio);
13919 + folio_unlock(folio);
13920 + folio_put(folio);
13921 + } else {
13922 + put_bh(bh);
13926 +static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
13928 + if (buffer_journaled(bh)) {
13929 + reiserfs_warning(NULL, "clm-2084",
13930 + "pinned buffer %lu:%pg sent to disk",
13931 + bh->b_blocknr, bh->b_bdev);
13933 + if (uptodate)
13934 + set_buffer_uptodate(bh);
13935 + else
13936 + clear_buffer_uptodate(bh);
13938 + unlock_buffer(bh);
13939 + release_buffer_page(bh);
13942 +static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate)
13944 + if (uptodate)
13945 + set_buffer_uptodate(bh);
13946 + else
13947 + clear_buffer_uptodate(bh);
13948 + unlock_buffer(bh);
13949 + put_bh(bh);
13952 +static void submit_logged_buffer(struct buffer_head *bh)
13954 + get_bh(bh);
13955 + bh->b_end_io = reiserfs_end_buffer_io_sync;
13956 + clear_buffer_journal_new(bh);
13957 + clear_buffer_dirty(bh);
13958 + if (!test_clear_buffer_journal_test(bh))
13959 + BUG();
13960 + if (!buffer_uptodate(bh))
13961 + BUG();
13962 + submit_bh(REQ_OP_WRITE, bh);
13965 +static void submit_ordered_buffer(struct buffer_head *bh)
13967 + get_bh(bh);
13968 + bh->b_end_io = reiserfs_end_ordered_io;
13969 + clear_buffer_dirty(bh);
13970 + if (!buffer_uptodate(bh))
13971 + BUG();
13972 + submit_bh(REQ_OP_WRITE, bh);
13975 +#define CHUNK_SIZE 32
13976 +struct buffer_chunk {
13977 + struct buffer_head *bh[CHUNK_SIZE];
13978 + int nr;
13981 +static void write_chunk(struct buffer_chunk *chunk)
13983 + int i;
13984 + for (i = 0; i < chunk->nr; i++) {
13985 + submit_logged_buffer(chunk->bh[i]);
13987 + chunk->nr = 0;
13990 +static void write_ordered_chunk(struct buffer_chunk *chunk)
13992 + int i;
13993 + for (i = 0; i < chunk->nr; i++) {
13994 + submit_ordered_buffer(chunk->bh[i]);
13996 + chunk->nr = 0;
13999 +static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
14000 + spinlock_t * lock, void (fn) (struct buffer_chunk *))
14002 + int ret = 0;
14003 + BUG_ON(chunk->nr >= CHUNK_SIZE);
14004 + chunk->bh[chunk->nr++] = bh;
14005 + if (chunk->nr >= CHUNK_SIZE) {
14006 + ret = 1;
14007 + if (lock) {
14008 + spin_unlock(lock);
14009 + fn(chunk);
14010 + spin_lock(lock);
14011 + } else {
14012 + fn(chunk);
14015 + return ret;
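+/*
+ * Buffers are batched CHUNK_SIZE at a time so submission runs with the
+ * spinlock dropped; the return value of 1 tells the caller that the
+ * chunk was flushed and the lock was released and retaken.
+ */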
14018 +static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
14019 +static struct reiserfs_jh *alloc_jh(void)
14021 + struct reiserfs_jh *jh;
14022 + while (1) {
14023 + jh = kmalloc(sizeof(*jh), GFP_NOFS);
14024 + if (jh) {
14025 + atomic_inc(&nr_reiserfs_jh);
14026 + return jh;
14028 + yield();
14033 + * we want to free the jh when the buffer has been written
14034 + * and waited on
14035 + */
14036 +void reiserfs_free_jh(struct buffer_head *bh)
14038 + struct reiserfs_jh *jh;
14040 + jh = bh->b_private;
14041 + if (jh) {
14042 + bh->b_private = NULL;
14043 + jh->bh = NULL;
14044 + list_del_init(&jh->list);
14045 + kfree(jh);
14046 + if (atomic_read(&nr_reiserfs_jh) <= 0)
14047 + BUG();
14048 + atomic_dec(&nr_reiserfs_jh);
14049 + put_bh(bh);
14053 +static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
14054 + int tail)
14056 + struct reiserfs_jh *jh;
14058 + if (bh->b_private) {
14059 + spin_lock(&j->j_dirty_buffers_lock);
14060 + if (!bh->b_private) {
14061 + spin_unlock(&j->j_dirty_buffers_lock);
14062 + goto no_jh;
14064 + jh = bh->b_private;
14065 + list_del_init(&jh->list);
14066 + } else {
14067 +no_jh:
14068 + get_bh(bh);
14069 + jh = alloc_jh();
14070 + spin_lock(&j->j_dirty_buffers_lock);
14071 + /*
14072 + * buffer must be locked for __add_jh, should be able to have
14073 + * two adds at the same time
14074 + */
14075 + BUG_ON(bh->b_private);
14076 + jh->bh = bh;
14077 + bh->b_private = jh;
14079 + jh->jl = j->j_current_jl;
14080 + if (tail)
14081 + list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
14082 + else {
14083 + list_add_tail(&jh->list, &jh->jl->j_bh_list);
14085 + spin_unlock(&j->j_dirty_buffers_lock);
14086 + return 0;
14089 +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh)
14091 + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
14093 +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh)
14095 + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
14098 +#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
14099 +static int write_ordered_buffers(spinlock_t * lock,
14100 + struct reiserfs_journal *j,
14101 + struct reiserfs_journal_list *jl,
14102 + struct list_head *list)
14104 + struct buffer_head *bh;
14105 + struct reiserfs_jh *jh;
14106 + int ret = j->j_errno;
14107 + struct buffer_chunk chunk;
14108 + struct list_head tmp;
14109 + INIT_LIST_HEAD(&tmp);
14111 + chunk.nr = 0;
14112 + spin_lock(lock);
14113 + while (!list_empty(list)) {
14114 + jh = JH_ENTRY(list->next);
14115 + bh = jh->bh;
14116 + get_bh(bh);
14117 + if (!trylock_buffer(bh)) {
14118 + if (!buffer_dirty(bh)) {
14119 + list_move(&jh->list, &tmp);
14120 + goto loop_next;
14122 + spin_unlock(lock);
14123 + if (chunk.nr)
14124 + write_ordered_chunk(&chunk);
14125 + wait_on_buffer(bh);
14126 + cond_resched();
14127 + spin_lock(lock);
14128 + goto loop_next;
14130 + /*
14131 + * in theory, dirty non-uptodate buffers should never get here,
14132 + * but the upper layer io error paths still have a few quirks.
14133 + * Handle them here as gracefully as we can
14134 + */
14135 + if (!buffer_uptodate(bh) && buffer_dirty(bh)) {
14136 + clear_buffer_dirty(bh);
14137 + ret = -EIO;
14139 + if (buffer_dirty(bh)) {
14140 + list_move(&jh->list, &tmp);
14141 + add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
14142 + } else {
14143 + reiserfs_free_jh(bh);
14144 + unlock_buffer(bh);
14146 +loop_next:
14147 + put_bh(bh);
14148 + cond_resched_lock(lock);
14150 + if (chunk.nr) {
14151 + spin_unlock(lock);
14152 + write_ordered_chunk(&chunk);
14153 + spin_lock(lock);
14155 + while (!list_empty(&tmp)) {
14156 + jh = JH_ENTRY(tmp.prev);
14157 + bh = jh->bh;
14158 + get_bh(bh);
14159 + reiserfs_free_jh(bh);
14161 + if (buffer_locked(bh)) {
14162 + spin_unlock(lock);
14163 + wait_on_buffer(bh);
14164 + spin_lock(lock);
14166 + if (!buffer_uptodate(bh)) {
14167 + ret = -EIO;
14169 + /*
14170 + * ugly interaction with invalidate_folio here.
14171 + * reiserfs_invalidate_folio will pin any buffer that has a
14172 + * valid journal head from an older transaction. If someone
14173 + * else sets our buffer dirty after we write it in the first
14174 + * loop, and then someone truncates the page away, nobody
14175 + * will ever write the buffer. We're safe if we write the
14176 + * page one last time after freeing the journal header.
14177 + */
14178 + if (buffer_dirty(bh) && unlikely(bh->b_folio->mapping == NULL)) {
14179 + spin_unlock(lock);
14180 + write_dirty_buffer(bh, 0);
14181 + spin_lock(lock);
14183 + put_bh(bh);
14184 + cond_resched_lock(lock);
14186 + spin_unlock(lock);
14187 + return ret;
14190 +static int flush_older_commits(struct super_block *s,
14191 + struct reiserfs_journal_list *jl)
14193 + struct reiserfs_journal *journal = SB_JOURNAL(s);
14194 + struct reiserfs_journal_list *other_jl;
14195 + struct reiserfs_journal_list *first_jl;
14196 + struct list_head *entry;
14197 + unsigned int trans_id = jl->j_trans_id;
14198 + unsigned int other_trans_id;
14200 +find_first:
14201 + /*
14202 + * first we walk backwards to find the oldest uncommitted transaction
14203 + */
14204 + first_jl = jl;
14205 + entry = jl->j_list.prev;
14206 + while (1) {
14207 + other_jl = JOURNAL_LIST_ENTRY(entry);
14208 + if (entry == &journal->j_journal_list ||
14209 + atomic_read(&other_jl->j_older_commits_done))
14210 + break;
14212 + first_jl = other_jl;
14213 + entry = other_jl->j_list.prev;
14216 + /* if we didn't find any older uncommitted transactions, return now */
14217 + if (first_jl == jl) {
14218 + return 0;
14221 + entry = &first_jl->j_list;
14222 + while (1) {
14223 + other_jl = JOURNAL_LIST_ENTRY(entry);
14224 + other_trans_id = other_jl->j_trans_id;
14226 + if (other_trans_id < trans_id) {
14227 + if (atomic_read(&other_jl->j_commit_left) != 0) {
14228 + flush_commit_list(s, other_jl, 0);
14230 + /* list we were called with is gone, return */
14231 + if (!journal_list_still_alive(s, trans_id))
14232 + return 1;
14234 + /*
14235 + * the one we just flushed is gone, this means
14236 + * all older lists are also gone, so first_jl
14237 + * is no longer valid either. Go back to the
14238 + * beginning.
14239 + */
14240 + if (!journal_list_still_alive
14241 + (s, other_trans_id)) {
14242 + goto find_first;
14245 + entry = entry->next;
14246 + if (entry == &journal->j_journal_list)
14247 + return 0;
14248 + } else {
14249 + return 0;
14252 + return 0;
14255 +static int reiserfs_async_progress_wait(struct super_block *s)
14257 + struct reiserfs_journal *j = SB_JOURNAL(s);
14259 + if (atomic_read(&j->j_async_throttle)) {
14260 + int depth;
14262 + depth = reiserfs_write_unlock_nested(s);
14263 + wait_var_event_timeout(&j->j_async_throttle,
14264 + atomic_read(&j->j_async_throttle) == 0,
14265 + HZ / 10);
14266 + reiserfs_write_lock_nested(s, depth);
14269 + return 0;
14273 + * if this journal list still has commit blocks unflushed, send them to disk.
14275 + * log areas must be flushed in order (transaction 2 can't commit before
14276 + * transaction 1) Before the commit block can be written, every other log
14277 + * block must be safely on disk
14278 + */
14279 +static int flush_commit_list(struct super_block *s,
14280 + struct reiserfs_journal_list *jl, int flushall)
14282 + int i;
14283 + b_blocknr_t bn;
14284 + struct buffer_head *tbh = NULL;
14285 + unsigned int trans_id = jl->j_trans_id;
14286 + struct reiserfs_journal *journal = SB_JOURNAL(s);
14287 + int retval = 0;
14288 + int write_len;
14289 + int depth;
14291 + reiserfs_check_lock_depth(s, "flush_commit_list");
14293 + if (atomic_read(&jl->j_older_commits_done)) {
14294 + return 0;
14297 + /*
14298 + * before we can put our commit blocks on disk, we have to make
14299 + * sure everyone older than us is on disk too
14300 + */
14301 + BUG_ON(jl->j_len <= 0);
14302 + BUG_ON(trans_id == journal->j_trans_id);
14304 + get_journal_list(jl);
14305 + if (flushall) {
14306 + if (flush_older_commits(s, jl) == 1) {
14307 + /*
14308 + * list disappeared during flush_older_commits.
14309 + * return
14310 + */
14311 + goto put_jl;
14315 + /* make sure nobody is trying to flush this one at the same time */
14316 + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
14318 + if (!journal_list_still_alive(s, trans_id)) {
14319 + mutex_unlock(&jl->j_commit_mutex);
14320 + goto put_jl;
14322 + BUG_ON(jl->j_trans_id == 0);
14324 + /* this commit is done, exit */
14325 + if (atomic_read(&jl->j_commit_left) <= 0) {
14326 + if (flushall) {
14327 + atomic_set(&jl->j_older_commits_done, 1);
14329 + mutex_unlock(&jl->j_commit_mutex);
14330 + goto put_jl;
14333 + if (!list_empty(&jl->j_bh_list)) {
14334 + int ret;
14336 + /*
14337 + * We might sleep in numerous places inside
14338 + * write_ordered_buffers. Relax the write lock.
14339 + */
14340 + depth = reiserfs_write_unlock_nested(s);
14341 + ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
14342 + journal, jl, &jl->j_bh_list);
14343 + if (ret < 0 && retval == 0)
14344 + retval = ret;
14345 + reiserfs_write_lock_nested(s, depth);
14347 + BUG_ON(!list_empty(&jl->j_bh_list));
14348 + /*
14349 + * for the description block and all the log blocks, submit any buffers
14350 + * that haven't already reached the disk. Try to write at least 256
14351 + * log blocks. later on, we will only wait on blocks that correspond
14352 + * to this transaction, but while we're unplugging we might as well
14353 + * get a chunk of data on there.
14354 + */
14355 + atomic_inc(&journal->j_async_throttle);
14356 + write_len = jl->j_len + 1;
14357 + if (write_len < 256)
14358 + write_len = 256;
14359 + for (i = 0 ; i < write_len ; i++) {
14360 + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
14361 + SB_ONDISK_JOURNAL_SIZE(s);
14362 + tbh = journal_find_get_block(s, bn);
14363 + if (tbh) {
14364 + if (buffer_dirty(tbh)) {
14365 + depth = reiserfs_write_unlock_nested(s);
14366 + write_dirty_buffer(tbh, 0);
14367 + reiserfs_write_lock_nested(s, depth);
14369 + put_bh(tbh) ;
14372 + if (atomic_dec_and_test(&journal->j_async_throttle))
14373 + wake_up_var(&journal->j_async_throttle);
14375 + for (i = 0; i < (jl->j_len + 1); i++) {
14376 + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
14377 + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
14378 + tbh = journal_find_get_block(s, bn);
14380 + depth = reiserfs_write_unlock_nested(s);
14381 + __wait_on_buffer(tbh);
14382 + reiserfs_write_lock_nested(s, depth);
14383 + /*
14384 + * since write_dirty_buffer() above skips locked buffers, one
14385 + * may still be in flight. Double check here
14386 + */
14387 + /* redundant, sync_dirty_buffer() checks */
14388 + if (buffer_dirty(tbh)) {
14389 + depth = reiserfs_write_unlock_nested(s);
14390 + sync_dirty_buffer(tbh);
14391 + reiserfs_write_lock_nested(s, depth);
14393 + if (unlikely(!buffer_uptodate(tbh))) {
14394 +#ifdef CONFIG_REISERFS_CHECK
14395 + reiserfs_warning(s, "journal-601",
14396 + "buffer write failed");
14397 +#endif
14398 + retval = -EIO;
14400 + /* once for journal_find_get_block */
14401 + put_bh(tbh);
14402 + /* once due to original getblk in do_journal_end */
14403 + put_bh(tbh);
14404 + atomic_dec(&jl->j_commit_left);
14407 + BUG_ON(atomic_read(&jl->j_commit_left) != 1);
14409 + /*
14410 + * If there was a write error in the journal - we can't commit
14411 + * this transaction - it will be invalid and, if successful,
14412 + * will just end up propagating the write error out to
14413 + * the file system.
14414 + */
14415 + if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
14416 + if (buffer_dirty(jl->j_commit_bh))
14417 + BUG();
14418 + mark_buffer_dirty(jl->j_commit_bh) ;
14419 + depth = reiserfs_write_unlock_nested(s);
14420 + if (reiserfs_barrier_flush(s))
14421 + __sync_dirty_buffer(jl->j_commit_bh,
14422 + REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
14423 + else
14424 + sync_dirty_buffer(jl->j_commit_bh);
14425 + reiserfs_write_lock_nested(s, depth);
14428 + /*
14429 + * If there was a write error in the journal - we can't commit this
14430 + * transaction - it will be invalid and, if successful, will just end
14431 + * up propagating the write error out to the filesystem.
14432 + */
14433 + if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
14434 +#ifdef CONFIG_REISERFS_CHECK
14435 + reiserfs_warning(s, "journal-615", "buffer write failed");
14436 +#endif
14437 + retval = -EIO;
14439 + bforget(jl->j_commit_bh);
14440 + if (journal->j_last_commit_id != 0 &&
14441 + (jl->j_trans_id - journal->j_last_commit_id) != 1) {
14442 + reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu",
14443 + journal->j_last_commit_id, jl->j_trans_id);
14445 + journal->j_last_commit_id = jl->j_trans_id;
14447 + /*
14448 + * now, every commit block is on the disk. It is safe to allow
14449 + * blocks freed during this transaction to be reallocated
14450 + */
14451 + cleanup_freed_for_journal_list(s, jl);
14453 + retval = retval ? retval : journal->j_errno;
14455 + /* mark the metadata dirty */
14456 + if (!retval)
14457 + dirty_one_transaction(s, jl);
14458 + atomic_dec(&jl->j_commit_left);
14460 + if (flushall) {
14461 + atomic_set(&jl->j_older_commits_done, 1);
14463 + mutex_unlock(&jl->j_commit_mutex);
14464 +put_jl:
14465 + put_journal_list(s, jl);
14467 + if (retval)
14468 + reiserfs_abort(s, retval, "Journal write error in %s",
14469 + __func__);
14470 + return retval;
14474 + * flush_journal_list frequently needs to find a newer transaction for a
14475 + * given block. This does that, or returns NULL if it can't find anything
14476 + */
14477 +static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
14478 + reiserfs_journal_cnode
14479 + *cn)
14481 + struct super_block *sb = cn->sb;
14482 + b_blocknr_t blocknr = cn->blocknr;
14484 + cn = cn->hprev;
14485 + while (cn) {
14486 + if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) {
14487 + return cn->jlist;
14489 + cn = cn->hprev;
14491 + return NULL;
14494 +static void remove_journal_hash(struct super_block *,
14495 + struct reiserfs_journal_cnode **,
14496 + struct reiserfs_journal_list *, unsigned long,
14497 + int);
14500 + * once all the real blocks have been flushed, it is safe to remove them
14501 + * from the journal list for this transaction. Aside from freeing the
14502 + * cnode, this also allows the block to be reallocated for data blocks
14503 + * if it had been deleted.
14504 + */
14505 +static void remove_all_from_journal_list(struct super_block *sb,
14506 + struct reiserfs_journal_list *jl,
14507 + int debug)
14509 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
14510 + struct reiserfs_journal_cnode *cn, *last;
14511 + cn = jl->j_realblock;
14513 + /*
14514 + * which is better, to lock once around the whole loop, or
14515 + * to lock for each call to remove_journal_hash?
14516 + */
14517 + while (cn) {
14518 + if (cn->blocknr != 0) {
14519 + if (debug) {
14520 + reiserfs_warning(sb, "reiserfs-2201",
14521 + "block %u, bh is %d, state %ld",
14522 + cn->blocknr, cn->bh ? 1 : 0,
14523 + cn->state);
14525 + cn->state = 0;
14526 + remove_journal_hash(sb, journal->j_list_hash_table,
14527 + jl, cn->blocknr, 1);
14529 + last = cn;
14530 + cn = cn->next;
14531 + free_cnode(sb, last);
14533 + jl->j_realblock = NULL;
14537 + * if this timestamp is greater than the timestamp we wrote last to the
14538 + * header block, write it to the header block. once this is done, I can
14539 + * safely say the log area for this transaction won't ever be replayed,
14540 + * and I can start releasing blocks in this transaction for reuse as data
14541 + * blocks. called by flush_journal_list, before it calls
14542 + * remove_all_from_journal_list
14543 + */
14544 +static int _update_journal_header_block(struct super_block *sb,
14545 + unsigned long offset,
14546 + unsigned int trans_id)
14548 + struct reiserfs_journal_header *jh;
14549 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
14550 + int depth;
14552 + if (reiserfs_is_journal_aborted(journal))
14553 + return -EIO;
14555 + if (trans_id >= journal->j_last_flush_trans_id) {
14556 + if (buffer_locked((journal->j_header_bh))) {
14557 + depth = reiserfs_write_unlock_nested(sb);
14558 + __wait_on_buffer(journal->j_header_bh);
14559 + reiserfs_write_lock_nested(sb, depth);
14560 + if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
14561 +#ifdef CONFIG_REISERFS_CHECK
14562 + reiserfs_warning(sb, "journal-699",
14563 + "buffer write failed");
14564 +#endif
14565 + return -EIO;
14568 + journal->j_last_flush_trans_id = trans_id;
14569 + journal->j_first_unflushed_offset = offset;
14570 + jh = (struct reiserfs_journal_header *)(journal->j_header_bh->
14571 + b_data);
14572 + jh->j_last_flush_trans_id = cpu_to_le32(trans_id);
14573 + jh->j_first_unflushed_offset = cpu_to_le32(offset);
14574 + jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
14576 + set_buffer_dirty(journal->j_header_bh);
14577 + depth = reiserfs_write_unlock_nested(sb);
14579 + if (reiserfs_barrier_flush(sb))
14580 + __sync_dirty_buffer(journal->j_header_bh,
14581 + REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
14582 + else
14583 + sync_dirty_buffer(journal->j_header_bh);
14585 + reiserfs_write_lock_nested(sb, depth);
14586 + if (!buffer_uptodate(journal->j_header_bh)) {
14587 + reiserfs_warning(sb, "journal-837",
14588 + "IO error during journal replay");
14589 + return -EIO;
14592 + return 0;
14595 +static int update_journal_header_block(struct super_block *sb,
14596 + unsigned long offset,
14597 + unsigned int trans_id)
14599 + return _update_journal_header_block(sb, offset, trans_id);
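+/*
+ * Worked example (illustrative, with assumed numbers): for an on-disk
+ * journal of SB_ONDISK_JOURNAL_SIZE(sb) = 8192 blocks, flushing a
+ * transaction whose description block sits at offset 100 with j_len = 10
+ * moves the header's j_first_unflushed_offset to (100 + 10 + 2) % 8192 =
+ * 112, one block past the commit block and thus the next transaction's
+ * description block. Replay after a crash resumes from that offset.
+ */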
14603 +** flush any and all journal lists older than you are
14604 +** can only be called from flush_journal_list
14606 +static int flush_older_journal_lists(struct super_block *sb,
14607 + struct reiserfs_journal_list *jl)
14609 + struct list_head *entry;
14610 + struct reiserfs_journal_list *other_jl;
14611 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
14612 + unsigned int trans_id = jl->j_trans_id;
14614 + /*
14615 + * we know we are the only ones flushing things, no extra race
14616 + * protection is required.
14617 + */
14618 +restart:
14619 + entry = journal->j_journal_list.next;
14620 + /* Did we wrap? */
14621 + if (entry == &journal->j_journal_list)
14622 + return 0;
14623 + other_jl = JOURNAL_LIST_ENTRY(entry);
14624 + if (other_jl->j_trans_id < trans_id) {
14625 + BUG_ON(other_jl->j_refcount <= 0);
14626 + /* do not flush all */
14627 + flush_journal_list(sb, other_jl, 0);
14629 + /* other_jl is now deleted from the list */
14630 + goto restart;
14632 + return 0;
14635 +static void del_from_work_list(struct super_block *s,
14636 + struct reiserfs_journal_list *jl)
14638 + struct reiserfs_journal *journal = SB_JOURNAL(s);
14639 + if (!list_empty(&jl->j_working_list)) {
14640 + list_del_init(&jl->j_working_list);
14641 + journal->j_num_work_lists--;
14646 + * flush a journal list, both commit and real blocks
14648 + * always set flushall to 1, unless you are calling from inside
14649 + * flush_journal_list
14651 + * IMPORTANT. This can only be called while there are no journal writers,
14652 + * and the journal is locked. That means it can only be called from
14653 + * do_journal_end, or by journal_release
14654 + */
14655 +static int flush_journal_list(struct super_block *s,
14656 + struct reiserfs_journal_list *jl, int flushall)
14658 + struct reiserfs_journal_list *pjl;
14659 + struct reiserfs_journal_cnode *cn;
14660 + int count;
14661 + int was_jwait = 0;
14662 + int was_dirty = 0;
14663 + struct buffer_head *saved_bh;
14664 + unsigned long j_len_saved = jl->j_len;
14665 + struct reiserfs_journal *journal = SB_JOURNAL(s);
14666 + int err = 0;
14667 + int depth;
14669 + BUG_ON(j_len_saved <= 0);
14671 + if (atomic_read(&journal->j_wcount) != 0) {
14672 + reiserfs_warning(s, "clm-2048", "called with wcount %d",
14673 + atomic_read(&journal->j_wcount));
14676 + /* if flushall == 0, the lock is already held */
14677 + if (flushall) {
14678 + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
14679 + } else if (mutex_trylock(&journal->j_flush_mutex)) {
14680 + BUG();
14683 + count = 0;
14684 + if (j_len_saved > journal->j_trans_max) {
14685 + reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu",
14686 + j_len_saved, jl->j_trans_id);
14687 + return 0;
14690 + /* if all the work is already done, get out of here */
14691 + if (atomic_read(&jl->j_nonzerolen) <= 0 &&
14692 + atomic_read(&jl->j_commit_left) <= 0) {
14693 + goto flush_older_and_return;
14696 + /*
14697 + * start by putting the commit list on disk. This will also flush
14698 + * the commit lists of any older transactions
14699 + */
14700 + flush_commit_list(s, jl, 1);
14702 + if (!(jl->j_state & LIST_DIRTY)
14703 + && !reiserfs_is_journal_aborted(journal))
14704 + BUG();
14706 + /* are we done now? */
14707 + if (atomic_read(&jl->j_nonzerolen) <= 0 &&
14708 + atomic_read(&jl->j_commit_left) <= 0) {
14709 + goto flush_older_and_return;
14712 + /*
14713 + * loop through each cnode, see if we need to write it,
14714 + * or wait on a more recent transaction, or just ignore it
14715 + */
14716 + if (atomic_read(&journal->j_wcount) != 0) {
14717 + reiserfs_panic(s, "journal-844", "journal list is flushing, "
14718 + "wcount is not 0");
14720 + cn = jl->j_realblock;
14721 + while (cn) {
14722 + was_jwait = 0;
14723 + was_dirty = 0;
14724 + saved_bh = NULL;
14725 + /* blocknr of 0 is no longer in the hash, ignore it */
14726 + if (cn->blocknr == 0) {
14727 + goto free_cnode;
14730 + /*
14731 + * This transaction failed commit.
14732 + * Don't write out to the disk
14733 + */
14734 + if (!(jl->j_state & LIST_DIRTY))
14735 + goto free_cnode;
14737 + pjl = find_newer_jl_for_cn(cn);
14738 + /*
14739 + * the order is important here. We check pjl to make sure we
14740 + * don't clear BH_JDirty_wait if we aren't the one writing this
14741 + * block to disk
14742 + */
14743 + if (!pjl && cn->bh) {
14744 + saved_bh = cn->bh;
14746 + /*
14747 + * we do this to make sure nobody releases the
14748 + * buffer while we are working with it
14749 + */
14750 + get_bh(saved_bh);
14752 + if (buffer_journal_dirty(saved_bh)) {
14753 + BUG_ON(!can_dirty(cn));
14754 + was_jwait = 1;
14755 + was_dirty = 1;
14756 + } else if (can_dirty(cn)) {
14757 + /*
14758 + * everything with !pjl && jwait
14759 + * should be writable
14760 + */
14761 + BUG();
14765 + /*
14766 + * if someone has this block in a newer transaction, just make
14767 + * sure they are committed, and don't try writing it to disk
14768 + */
14769 + if (pjl) {
14770 + if (atomic_read(&pjl->j_commit_left))
14771 + flush_commit_list(s, pjl, 1);
14772 + goto free_cnode;
14775 + /*
14776 + * bh == NULL when the block got to disk on its own, OR,
14777 + * the block got freed in a future transaction
14778 + */
14779 + if (saved_bh == NULL) {
14780 + goto free_cnode;
14783 + /*
14784 + * this should never happen. kupdate_one_transaction has
14785 + * this list locked while it works, so we should never see a
14786 + * buffer here that is not marked JDirty_wait
14787 + */
14788 + if ((!was_jwait) && !buffer_locked(saved_bh)) {
14789 + reiserfs_warning(s, "journal-813",
14790 + "BAD! buffer %llu %cdirty %cjwait, "
14791 + "not in a newer transaction",
14792 + (unsigned long long)saved_bh->
14793 + b_blocknr, was_dirty ? ' ' : '!',
14794 + was_jwait ? ' ' : '!');
14796 + if (was_dirty) {
14797 + /*
14798 + * we inc again because saved_bh gets decremented
14799 + * at free_cnode
14800 + */
14801 + get_bh(saved_bh);
14802 + set_bit(BLOCK_NEEDS_FLUSH, &cn->state);
14803 + lock_buffer(saved_bh);
14804 + BUG_ON(cn->blocknr != saved_bh->b_blocknr);
14805 + if (buffer_dirty(saved_bh))
14806 + submit_logged_buffer(saved_bh);
14807 + else
14808 + unlock_buffer(saved_bh);
14809 + count++;
14810 + } else {
14811 + reiserfs_warning(s, "clm-2082",
14812 + "Unable to flush buffer %llu in %s",
14813 + (unsigned long long)saved_bh->
14814 + b_blocknr, __func__);
14816 +free_cnode:
14817 + cn = cn->next;
14818 + if (saved_bh) {
14819 + /*
14820 + * we incremented this to keep others from
14821 + * taking the buffer head away
14822 + */
14823 + put_bh(saved_bh);
14824 + if (atomic_read(&saved_bh->b_count) < 0) {
14825 + reiserfs_warning(s, "journal-945",
14826 + "saved_bh->b_count < 0");
14830 + if (count > 0) {
14831 + cn = jl->j_realblock;
14832 + while (cn) {
14833 + if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
14834 + if (!cn->bh) {
14835 + reiserfs_panic(s, "journal-1011",
14836 + "cn->bh is NULL");
14839 + depth = reiserfs_write_unlock_nested(s);
14840 + __wait_on_buffer(cn->bh);
14841 + reiserfs_write_lock_nested(s, depth);
14843 + if (!cn->bh) {
14844 + reiserfs_panic(s, "journal-1012",
14845 + "cn->bh is NULL");
14847 + if (unlikely(!buffer_uptodate(cn->bh))) {
14848 +#ifdef CONFIG_REISERFS_CHECK
14849 + reiserfs_warning(s, "journal-949",
14850 + "buffer write failed");
14851 +#endif
14852 + err = -EIO;
14854 + /*
14855 + * note, we must clear the JDirty_wait bit
14856 + * after the up to date check, otherwise we
14857 + * race against our flushpage routine
14858 + */
14859 + BUG_ON(!test_clear_buffer_journal_dirty
14860 + (cn->bh));
14862 + /* drop one ref for us */
14863 + put_bh(cn->bh);
14864 + /* drop one ref for journal_mark_dirty */
14865 + release_buffer_page(cn->bh);
14867 + cn = cn->next;
14871 + if (err)
14872 + reiserfs_abort(s, -EIO,
14873 + "Write error while pushing transaction to disk in %s",
14874 + __func__);
14875 +flush_older_and_return:
14877 + /*
14878 + * before we can update the journal header block, we _must_ flush all
14879 + * real blocks from all older transactions to disk. This is because
14880 + * once the header block is updated, this transaction will not be
14881 + * replayed after a crash
14882 + */
14883 + if (flushall) {
14884 + flush_older_journal_lists(s, jl);
14887 + err = journal->j_errno;
14888 + /*
14889 + * before we can remove everything from the hash tables for this
14890 + * transaction, we must make sure it can never be replayed
14892 + * since we are only called from do_journal_end, we know for sure there
14893 + * are no allocations going on while we are flushing journal lists. So,
14894 + * we only need to update the journal header block for the last list
14895 + * being flushed
14896 + */
14897 + if (!err && flushall) {
14898 + err =
14899 + update_journal_header_block(s,
14900 + (jl->j_start + jl->j_len +
14901 + 2) % SB_ONDISK_JOURNAL_SIZE(s),
14902 + jl->j_trans_id);
14903 + if (err)
14904 + reiserfs_abort(s, -EIO,
14905 + "Write error while updating journal header in %s",
14906 + __func__);
14908 + remove_all_from_journal_list(s, jl, 0);
14909 + list_del_init(&jl->j_list);
14910 + journal->j_num_lists--;
14911 + del_from_work_list(s, jl);
14913 + if (journal->j_last_flush_id != 0 &&
14914 + (jl->j_trans_id - journal->j_last_flush_id) != 1) {
14915 + reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu",
14916 + journal->j_last_flush_id, jl->j_trans_id);
14918 + journal->j_last_flush_id = jl->j_trans_id;
14920 + /*
14921 + * not strictly required since we are freeing the list, but it should
14922 + * help find code using dead lists later on
14923 + */
14924 + jl->j_len = 0;
14925 + atomic_set(&jl->j_nonzerolen, 0);
14926 + jl->j_start = 0;
14927 + jl->j_realblock = NULL;
14928 + jl->j_commit_bh = NULL;
14929 + jl->j_trans_id = 0;
14930 + jl->j_state = 0;
14931 + put_journal_list(s, jl);
14932 + if (flushall)
14933 + mutex_unlock(&journal->j_flush_mutex);
14934 + return err;
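+/*
+ * Locking sketch for the flushall contract above (a hypothetical call
+ * sequence): the outermost caller flushes with the mutex taken for it,
+ * while recursive flushes reuse the already-held mutex:
+ *
+ *	flush_journal_list(s, jl, 1);        // takes j_flush_mutex
+ *	  -> flush_older_journal_lists(s, jl)
+ *	       -> flush_journal_list(s, other_jl, 0);  // mutex already held
+ */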
14937 +static int write_one_transaction(struct super_block *s,
14938 + struct reiserfs_journal_list *jl,
14939 + struct buffer_chunk *chunk)
14941 + struct reiserfs_journal_cnode *cn;
14942 + int ret = 0;
14944 + jl->j_state |= LIST_TOUCHED;
14945 + del_from_work_list(s, jl);
14946 + if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
14947 + return 0;
14950 + cn = jl->j_realblock;
14951 + while (cn) {
14952 + /*
14953 + * if the blocknr == 0, this has been cleared from the hash,
14954 + * skip it
14955 + */
14956 + if (cn->blocknr == 0) {
14957 + goto next;
14959 + if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
14960 + struct buffer_head *tmp_bh;
14961 + /*
14962 + * we can race against journal_mark_freed when we try
14963 + * to lock_buffer(cn->bh), so we have to inc the buffer
14964 + * count, and recheck things after locking
14965 + */
14966 + tmp_bh = cn->bh;
14967 + get_bh(tmp_bh);
14968 + lock_buffer(tmp_bh);
14969 + if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
14970 + if (!buffer_journal_dirty(tmp_bh) ||
14971 + buffer_journal_prepared(tmp_bh))
14972 + BUG();
14973 + add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
14974 + ret++;
14975 + } else {
14976 + /* note, cn->bh might be null now */
14977 + unlock_buffer(tmp_bh);
14979 + put_bh(tmp_bh);
14981 +next:
14982 + cn = cn->next;
14983 + cond_resched();
14985 + return ret;
14988 +/* used by flush_commit_list */
14989 +static void dirty_one_transaction(struct super_block *s,
14990 + struct reiserfs_journal_list *jl)
14992 + struct reiserfs_journal_cnode *cn;
14993 + struct reiserfs_journal_list *pjl;
14995 + jl->j_state |= LIST_DIRTY;
14996 + cn = jl->j_realblock;
14997 + while (cn) {
14998 + /*
14999 + * look for a more recent transaction that logged this
15000 + * buffer. Only the most recent transaction with a buffer in
15001 + * it is allowed to send that buffer to disk
15002 + */
15003 + pjl = find_newer_jl_for_cn(cn);
15004 + if (!pjl && cn->blocknr && cn->bh
15005 + && buffer_journal_dirty(cn->bh)) {
15006 + BUG_ON(!can_dirty(cn));
15007 + /*
15008 + * if the buffer is prepared, it will either be logged
15009 + * or restored. If restored, we need to make sure
15010 + * it actually gets marked dirty
15011 + */
15012 + clear_buffer_journal_new(cn->bh);
15013 + if (buffer_journal_prepared(cn->bh)) {
15014 + set_buffer_journal_restore_dirty(cn->bh);
15015 + } else {
15016 + set_buffer_journal_test(cn->bh);
15017 + mark_buffer_dirty(cn->bh);
15020 + cn = cn->next;
15024 +static int kupdate_transactions(struct super_block *s,
15025 + struct reiserfs_journal_list *jl,
15026 + struct reiserfs_journal_list **next_jl,
15027 + unsigned int *next_trans_id,
15028 + int num_blocks, int num_trans)
15030 + int ret = 0;
15031 + int written = 0;
15032 + int transactions_flushed = 0;
15033 + unsigned int orig_trans_id = jl->j_trans_id;
15034 + struct buffer_chunk chunk;
15035 + struct list_head *entry;
15036 + struct reiserfs_journal *journal = SB_JOURNAL(s);
15037 + chunk.nr = 0;
15039 + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
15040 + if (!journal_list_still_alive(s, orig_trans_id)) {
15041 + goto done;
15044 + /*
15045 + * we've got j_flush_mutex held, nobody is going to delete any
15046 + * of these lists out from underneath us
15047 + */
15048 + while ((num_trans && transactions_flushed < num_trans) ||
15049 + (!num_trans && written < num_blocks)) {
15051 + if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
15052 + atomic_read(&jl->j_commit_left)
15053 + || !(jl->j_state & LIST_DIRTY)) {
15054 + del_from_work_list(s, jl);
15055 + break;
15057 + ret = write_one_transaction(s, jl, &chunk);
15059 + if (ret < 0)
15060 + goto done;
15061 + transactions_flushed++;
15062 + written += ret;
15063 + entry = jl->j_list.next;
15065 + /* did we wrap? */
15066 + if (entry == &journal->j_journal_list) {
15067 + break;
15069 + jl = JOURNAL_LIST_ENTRY(entry);
15071 + /* don't bother with older transactions */
15072 + if (jl->j_trans_id <= orig_trans_id)
15073 + break;
15075 + if (chunk.nr) {
15076 + write_chunk(&chunk);
15079 +done:
15080 + mutex_unlock(&journal->j_flush_mutex);
15081 + return ret;
15085 + * o_sync and fsync heavy applications tend to use
15086 + * all the journal list slots with tiny transactions. These
15087 + * trigger lots and lots of calls to update the header block, which
15088 + * adds seeks and slows things down.
15090 + * This function tries to clear out a large chunk of the journal lists
15091 + * at once, which makes everything faster since only the newest journal
15092 + * list updates the header block
15093 + */
15094 +static int flush_used_journal_lists(struct super_block *s,
15095 + struct reiserfs_journal_list *jl)
15097 + unsigned long len = 0;
15098 + unsigned long cur_len;
15099 + int i;
15100 + int limit = 256;
15101 + struct reiserfs_journal_list *tjl;
15102 + struct reiserfs_journal_list *flush_jl;
15103 + unsigned int trans_id;
15104 + struct reiserfs_journal *journal = SB_JOURNAL(s);
15106 + flush_jl = tjl = jl;
15108 + /* in data logging mode, try harder to flush a lot of blocks */
15109 + if (reiserfs_data_log(s))
15110 + limit = 1024;
15111 + /* flush for 256 transactions or limit blocks, whichever comes first */
15112 + for (i = 0; i < 256 && len < limit; i++) {
15113 + if (atomic_read(&tjl->j_commit_left) ||
15114 + tjl->j_trans_id < jl->j_trans_id) {
15115 + break;
15117 + cur_len = atomic_read(&tjl->j_nonzerolen);
15118 + if (cur_len > 0) {
15119 + tjl->j_state &= ~LIST_TOUCHED;
15121 + len += cur_len;
15122 + flush_jl = tjl;
15123 + if (tjl->j_list.next == &journal->j_journal_list)
15124 + break;
15125 + tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
15127 + get_journal_list(jl);
15128 + get_journal_list(flush_jl);
15130 + /*
15131 + * try to find a group of blocks we can flush across all the
15132 + * transactions, but only bother if we've actually spanned
15133 + * across multiple lists
15134 + */
15135 + if (flush_jl != jl)
15136 + kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
15138 + flush_journal_list(s, flush_jl, 1);
15139 + put_journal_list(s, flush_jl);
15140 + put_journal_list(s, jl);
15141 + return 0;
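+/*
+ * Worked example for the batching above (assumed numbers): with data
+ * logging enabled the block limit is 1024, so the scan walks forward
+ * from jl through at most 256 newer lists, summing their j_nonzerolen,
+ * and stops once 1024 blocks are gathered; everything up to flush_jl is
+ * then flushed through a single flush_journal_list() call, updating the
+ * header block once instead of once per transaction.
+ */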
15145 + * removes any nodes in table with a matching block and superblock.
15146 + * only touches the hnext and hprev pointers.
15147 + */
15148 +static void remove_journal_hash(struct super_block *sb,
15149 + struct reiserfs_journal_cnode **table,
15150 + struct reiserfs_journal_list *jl,
15151 + unsigned long block, int remove_freed)
15153 + struct reiserfs_journal_cnode *cur;
15154 + struct reiserfs_journal_cnode **head;
15156 + head = &(journal_hash(table, sb, block));
15157 + if (!head) {
15158 + return;
15160 + cur = *head;
15161 + while (cur) {
15162 + if (cur->blocknr == block && cur->sb == sb
15163 + && (jl == NULL || jl == cur->jlist)
15164 + && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) {
15165 + if (cur->hnext) {
15166 + cur->hnext->hprev = cur->hprev;
15168 + if (cur->hprev) {
15169 + cur->hprev->hnext = cur->hnext;
15170 + } else {
15171 + *head = cur->hnext;
15173 + cur->blocknr = 0;
15174 + cur->sb = NULL;
15175 + cur->state = 0;
15176 + /*
15177 + * anybody who clears the cur->bh will also
15178 + * dec the nonzerolen
15179 + */
15180 + if (cur->bh && cur->jlist)
15181 + atomic_dec(&cur->jlist->j_nonzerolen);
15182 + cur->bh = NULL;
15183 + cur->jlist = NULL;
15185 + cur = cur->hnext;
15189 +static void free_journal_ram(struct super_block *sb)
15191 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
15192 + kfree(journal->j_current_jl);
15193 + journal->j_num_lists--;
15195 + vfree(journal->j_cnode_free_orig);
15196 + free_list_bitmaps(sb, journal->j_list_bitmap);
15197 + free_bitmap_nodes(sb); /* must be after free_list_bitmaps */
15198 + if (journal->j_header_bh) {
15199 + brelse(journal->j_header_bh);
15201 + /*
15202 + * j_header_bh is on the journal dev, make sure
15203 + * not to release the journal dev until we brelse j_header_bh
15204 + */
15205 + release_journal_dev(journal);
15206 + vfree(journal);
15210 + * call on unmount. Only set error to 1 if you haven't made your way out
15211 + * of read_super() yet. Any other caller must keep error at 0.
15212 + */
15213 +static int do_journal_release(struct reiserfs_transaction_handle *th,
15214 + struct super_block *sb, int error)
15216 + struct reiserfs_transaction_handle myth;
15217 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
15219 + /*
15220 + * we only want to flush out transactions if we were
15221 + * called with error == 0
15222 + */
15223 + if (!error && !sb_rdonly(sb)) {
15224 + /* end the current trans */
15225 + BUG_ON(!th->t_trans_id);
15226 + do_journal_end(th, FLUSH_ALL);
15228 + /*
15229 + * make sure something gets logged to force
15230 + * our way into the flush code
15231 + */
15232 + if (!journal_join(&myth, sb)) {
15233 + reiserfs_prepare_for_journal(sb,
15234 + SB_BUFFER_WITH_SB(sb),
15235 + 1);
15236 + journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
15237 + do_journal_end(&myth, FLUSH_ALL);
15241 + /* this also catches errors during the do_journal_end above */
15242 + if (!error && reiserfs_is_journal_aborted(journal)) {
15243 + memset(&myth, 0, sizeof(myth));
15244 + if (!journal_join_abort(&myth, sb)) {
15245 + reiserfs_prepare_for_journal(sb,
15246 + SB_BUFFER_WITH_SB(sb),
15247 + 1);
15248 + journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
15249 + do_journal_end(&myth, FLUSH_ALL);
15254 + /*
15255 + * We must release the write lock here because
15256 + * the workqueue job (flush_async_commit) needs this lock
15257 + */
15258 + reiserfs_write_unlock(sb);
15260 + /*
15261 + * Cancel flushing of old commits. Note that neither of these works
15262 + * will be requeued because superblock is being shutdown and doesn't
15263 + * have SB_ACTIVE set.
15264 + */
15265 + reiserfs_cancel_old_flush(sb);
15266 + /* wait for all commits to finish */
15267 + cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
15269 + free_journal_ram(sb);
15271 + reiserfs_write_lock(sb);
15273 + return 0;
15276 +/* call on unmount. flush all journal trans, release all alloc'd ram */
15277 +int journal_release(struct reiserfs_transaction_handle *th,
15278 + struct super_block *sb)
15280 + return do_journal_release(th, sb, 0);
15283 +/* only call from an error condition inside reiserfs_read_super! */
15284 +int journal_release_error(struct reiserfs_transaction_handle *th,
15285 + struct super_block *sb)
15287 + return do_journal_release(th, sb, 1);
15291 + * compares description block with commit block.
15292 + * returns 1 if they differ, 0 if they are the same
15293 + */
15294 +static int journal_compare_desc_commit(struct super_block *sb,
15295 + struct reiserfs_journal_desc *desc,
15296 + struct reiserfs_journal_commit *commit)
15298 + if (get_commit_trans_id(commit) != get_desc_trans_id(desc) ||
15299 + get_commit_trans_len(commit) != get_desc_trans_len(desc) ||
15300 + get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max ||
15301 + get_commit_trans_len(commit) <= 0) {
15302 + return 1;
15304 + return 0;
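+/*
+ * On-disk layout assumed by the check above (sketch): a transaction of
+ * length n starting at journal offset o occupies, modulo the journal
+ * size,
+ *
+ *	o           description block (trans_id, len, mount_id)
+ *	o+1 .. o+n  the n logged blocks
+ *	o+n+1       commit block (must echo the desc trans_id and len)
+ *
+ * so replay can stop at the first desc/commit mismatch it finds.
+ */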
15308 + * returns 0 if it did not find a description block
15309 + * returns -1 if it found a corrupt commit block
15310 + * returns 1 if both desc and commit were valid
15311 + * NOTE: only called during fs mount
15312 + */
15313 +static int journal_transaction_is_valid(struct super_block *sb,
15314 + struct buffer_head *d_bh,
15315 + unsigned int *oldest_invalid_trans_id,
15316 + unsigned long *newest_mount_id)
15318 + struct reiserfs_journal_desc *desc;
15319 + struct reiserfs_journal_commit *commit;
15320 + struct buffer_head *c_bh;
15321 + unsigned long offset;
15323 + if (!d_bh)
15324 + return 0;
15326 + desc = (struct reiserfs_journal_desc *)d_bh->b_data;
15327 + if (get_desc_trans_len(desc) > 0
15328 + && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) {
15329 + if (oldest_invalid_trans_id && *oldest_invalid_trans_id
15330 + && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
15331 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15332 + "journal-986: transaction "
15333 + "is valid returning because trans_id %d is greater than "
15334 + "oldest_invalid %lu",
15335 + get_desc_trans_id(desc),
15336 + *oldest_invalid_trans_id);
15337 + return 0;
15339 + if (newest_mount_id
15340 + && *newest_mount_id > get_desc_mount_id(desc)) {
15341 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15342 + "journal-1087: transaction "
15343 + "is valid returning because mount_id %d is less than "
15344 + "newest_mount_id %lu",
15345 + get_desc_mount_id(desc),
15346 + *newest_mount_id);
15347 + return -1;
15349 + if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) {
15350 + reiserfs_warning(sb, "journal-2018",
15351 + "Bad transaction length %d "
15352 + "encountered, ignoring transaction",
15353 + get_desc_trans_len(desc));
15354 + return -1;
15356 + offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
15358 + /*
15359 + * ok, we have a journal description block,
15360 + * let's see if the transaction was valid
15361 + */
15362 + c_bh =
15363 + journal_bread(sb,
15364 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15365 + ((offset + get_desc_trans_len(desc) +
15366 + 1) % SB_ONDISK_JOURNAL_SIZE(sb)));
15367 + if (!c_bh)
15368 + return 0;
15369 + commit = (struct reiserfs_journal_commit *)c_bh->b_data;
15370 + if (journal_compare_desc_commit(sb, desc, commit)) {
15371 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15372 + "journal_transaction_is_valid, commit offset %ld had bad "
15373 + "time %d or length %d",
15374 + c_bh->b_blocknr -
15375 + SB_ONDISK_JOURNAL_1st_BLOCK(sb),
15376 + get_commit_trans_id(commit),
15377 + get_commit_trans_len(commit));
15378 + brelse(c_bh);
15379 + if (oldest_invalid_trans_id) {
15380 + *oldest_invalid_trans_id =
15381 + get_desc_trans_id(desc);
15382 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15383 + "journal-1004: "
15384 + "transaction_is_valid setting oldest invalid trans_id "
15385 + "to %d",
15386 + get_desc_trans_id(desc));
15388 + return -1;
15390 + brelse(c_bh);
15391 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15392 + "journal-1006: found valid "
15393 + "transaction start offset %llu, len %d id %d",
15394 + d_bh->b_blocknr -
15395 + SB_ONDISK_JOURNAL_1st_BLOCK(sb),
15396 + get_desc_trans_len(desc),
15397 + get_desc_trans_id(desc));
15398 + return 1;
15399 + } else {
15400 + return 0;
15404 +static void brelse_array(struct buffer_head **heads, int num)
15406 + int i;
15407 + for (i = 0; i < num; i++) {
15408 + brelse(heads[i]);
15413 + * given the start block and values for the oldest acceptable transactions,
15414 + * this either reads in and replays a transaction, or returns because the
15415 + * transaction is invalid, or too old.
15416 + * NOTE: only called during fs mount
15417 + */
15418 +static int journal_read_transaction(struct super_block *sb,
15419 + unsigned long cur_dblock,
15420 + unsigned long oldest_start,
15421 + unsigned int oldest_trans_id,
15422 + unsigned long newest_mount_id)
15424 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
15425 + struct reiserfs_journal_desc *desc;
15426 + struct reiserfs_journal_commit *commit;
15427 + unsigned int trans_id = 0;
15428 + struct buffer_head *c_bh;
15429 + struct buffer_head *d_bh;
15430 + struct buffer_head **log_blocks = NULL;
15431 + struct buffer_head **real_blocks = NULL;
15432 + unsigned int trans_offset;
15433 + int i;
15434 + int trans_half;
15436 + d_bh = journal_bread(sb, cur_dblock);
15437 + if (!d_bh)
15438 + return 1;
15439 + desc = (struct reiserfs_journal_desc *)d_bh->b_data;
15440 + trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
15441 + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
15442 + "journal_read_transaction, offset %llu, len %d mount_id %d",
15443 + d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
15444 + get_desc_trans_len(desc), get_desc_mount_id(desc));
15445 + if (get_desc_trans_id(desc) < oldest_trans_id) {
15446 + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
15447 + "journal_read_trans skipping because %lu is too old",
15448 + cur_dblock -
15449 + SB_ONDISK_JOURNAL_1st_BLOCK(sb));
15450 + brelse(d_bh);
15451 + return 1;
15453 + if (get_desc_mount_id(desc) != newest_mount_id) {
15454 + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
15455 + "journal_read_trans skipping because %d is != "
15456 + "newest_mount_id %lu", get_desc_mount_id(desc),
15457 + newest_mount_id);
15458 + brelse(d_bh);
15459 + return 1;
15461 + c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15462 + ((trans_offset + get_desc_trans_len(desc) + 1) %
15463 + SB_ONDISK_JOURNAL_SIZE(sb)));
15464 + if (!c_bh) {
15465 + brelse(d_bh);
15466 + return 1;
15468 + commit = (struct reiserfs_journal_commit *)c_bh->b_data;
15469 + if (journal_compare_desc_commit(sb, desc, commit)) {
15470 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15471 + "journal_read_transaction, "
15472 + "commit offset %llu had bad time %d or length %d",
15473 + c_bh->b_blocknr -
15474 + SB_ONDISK_JOURNAL_1st_BLOCK(sb),
15475 + get_commit_trans_id(commit),
15476 + get_commit_trans_len(commit));
15477 + brelse(c_bh);
15478 + brelse(d_bh);
15479 + return 1;
15482 + if (bdev_read_only(sb->s_bdev)) {
15483 + reiserfs_warning(sb, "clm-2076",
15484 + "device is readonly, unable to replay log");
15485 + brelse(c_bh);
15486 + brelse(d_bh);
15487 + return -EROFS;
15490 + trans_id = get_desc_trans_id(desc);
15491 + /*
15492 + * now we know we've got a good transaction, and it was
15493 + * inside the valid time ranges
15494 + */
15495 + log_blocks = kmalloc_array(get_desc_trans_len(desc),
15496 + sizeof(struct buffer_head *),
15497 + GFP_NOFS);
15498 + real_blocks = kmalloc_array(get_desc_trans_len(desc),
15499 + sizeof(struct buffer_head *),
15500 + GFP_NOFS);
15501 + if (!log_blocks || !real_blocks) {
15502 + brelse(c_bh);
15503 + brelse(d_bh);
15504 + kfree(log_blocks);
15505 + kfree(real_blocks);
15506 + reiserfs_warning(sb, "journal-1169",
15507 + "kmalloc failed, unable to mount FS");
15508 + return -1;
15510 + /* get all the buffer heads */
15511 + trans_half = journal_trans_half(sb->s_blocksize);
15512 + for (i = 0; i < get_desc_trans_len(desc); i++) {
15513 + log_blocks[i] =
15514 + journal_getblk(sb,
15515 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15516 + (trans_offset + 1 +
15517 + i) % SB_ONDISK_JOURNAL_SIZE(sb));
15518 + if (i < trans_half) {
15519 + real_blocks[i] =
15520 + sb_getblk(sb,
15521 + le32_to_cpu(desc->j_realblock[i]));
15522 + } else {
15523 + real_blocks[i] =
15524 + sb_getblk(sb,
15525 + le32_to_cpu(commit->
15526 + j_realblock[i - trans_half]));
15528 + if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
15529 + reiserfs_warning(sb, "journal-1207",
15530 + "REPLAY FAILURE fsck required! "
15531 + "Block to replay is outside of "
15532 + "filesystem");
15533 + goto abort_replay;
15535 + /* make sure we don't try to replay onto log or reserved area */
15536 + if (is_block_in_log_or_reserved_area
15537 + (sb, real_blocks[i]->b_blocknr)) {
15538 + reiserfs_warning(sb, "journal-1204",
15539 + "REPLAY FAILURE fsck required! "
15540 + "Trying to replay onto a log block");
15541 +abort_replay:
15542 + brelse_array(log_blocks, i);
15543 + brelse_array(real_blocks, i);
15544 + brelse(c_bh);
15545 + brelse(d_bh);
15546 + kfree(log_blocks);
15547 + kfree(real_blocks);
15548 + return -1;
15551 + /* read in the log blocks, memcpy to the corresponding real block */
15552 + bh_read_batch(get_desc_trans_len(desc), log_blocks);
15553 + for (i = 0; i < get_desc_trans_len(desc); i++) {
15555 + wait_on_buffer(log_blocks[i]);
15556 + if (!buffer_uptodate(log_blocks[i])) {
15557 + reiserfs_warning(sb, "journal-1212",
15558 + "REPLAY FAILURE fsck required! "
15559 + "buffer write failed");
15560 + brelse_array(log_blocks + i,
15561 + get_desc_trans_len(desc) - i);
15562 + brelse_array(real_blocks, get_desc_trans_len(desc));
15563 + brelse(c_bh);
15564 + brelse(d_bh);
15565 + kfree(log_blocks);
15566 + kfree(real_blocks);
15567 + return -1;
15569 + memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
15570 + real_blocks[i]->b_size);
15571 + set_buffer_uptodate(real_blocks[i]);
15572 + brelse(log_blocks[i]);
15574 + /* flush out the real blocks */
15575 + for (i = 0; i < get_desc_trans_len(desc); i++) {
15576 + set_buffer_dirty(real_blocks[i]);
15577 + write_dirty_buffer(real_blocks[i], 0);
15579 + for (i = 0; i < get_desc_trans_len(desc); i++) {
15580 + wait_on_buffer(real_blocks[i]);
15581 + if (!buffer_uptodate(real_blocks[i])) {
15582 + reiserfs_warning(sb, "journal-1226",
15583 + "REPLAY FAILURE, fsck required! "
15584 + "buffer write failed");
15585 + brelse_array(real_blocks + i,
15586 + get_desc_trans_len(desc) - i);
15587 + brelse(c_bh);
15588 + brelse(d_bh);
15589 + kfree(log_blocks);
15590 + kfree(real_blocks);
15591 + return -1;
15593 + brelse(real_blocks[i]);
15595 + cur_dblock =
15596 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15597 + ((trans_offset + get_desc_trans_len(desc) +
15598 + 2) % SB_ONDISK_JOURNAL_SIZE(sb));
15599 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15600 + "journal-1095: setting journal " "start to offset %ld",
15601 + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
15603 + /*
15604 + * init starting values for the first transaction, in case
15605 + * this is the last transaction to be replayed.
15606 + */
15607 + journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
15608 + journal->j_last_flush_trans_id = trans_id;
15609 + journal->j_trans_id = trans_id + 1;
15610 + /* check for trans_id overflow */
15611 + if (journal->j_trans_id == 0)
15612 + journal->j_trans_id = 10;
15613 + brelse(c_bh);
15614 + brelse(d_bh);
15615 + kfree(log_blocks);
15616 + kfree(real_blocks);
15617 + return 0;
15621 + * This function reads blocks of bufsize size starting from block, up to
15622 + * max_block (but no more than BUFNR blocks at a time). This proved to improve
15623 + * mounting speed on self-rebuilding raid5 arrays at least.
15624 + * Right now it is only used from journal code. But later we might use it
15625 + * from other places.
15626 + * Note: Do not use journal_getblk/sb_getblk functions here!
15627 + */
15628 +static struct buffer_head *reiserfs_breada(struct block_device *dev,
15629 + b_blocknr_t block, int bufsize,
15630 + b_blocknr_t max_block)
15632 + struct buffer_head *bhlist[BUFNR];
15633 + unsigned int blocks = BUFNR;
15634 + struct buffer_head *bh;
15635 + int i, j;
15637 + bh = __getblk(dev, block, bufsize);
15638 + if (!bh || buffer_uptodate(bh))
15639 + return (bh);
15641 + if (block + BUFNR > max_block) {
15642 + blocks = max_block - block;
15644 + bhlist[0] = bh;
15645 + j = 1;
15646 + for (i = 1; i < blocks; i++) {
15647 + bh = __getblk(dev, block + i, bufsize);
15648 + if (!bh)
15649 + break;
15650 + if (buffer_uptodate(bh)) {
15651 + brelse(bh);
15652 + break;
15653 + } else
15654 + bhlist[j++] = bh;
15656 + bh = bhlist[0];
15657 + bh_read_nowait(bh, 0);
15658 + bh_readahead_batch(j - 1, &bhlist[1], 0);
15659 + for (i = 1; i < j; i++)
15660 + brelse(bhlist[i]);
15661 + wait_on_buffer(bh);
15662 + if (buffer_uptodate(bh))
15663 + return bh;
15664 + brelse(bh);
15665 + return NULL;
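+/*
+ * Usage sketch: the log scan in journal_read() below calls this helper
+ * for each candidate description block, so on a cold cache a single
+ * call batches up to BUFNR contiguous log reads instead of issuing one
+ * synchronous read per block.
+ */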
15669 + * read and replay the log
15670 + * on a clean unmount, the journal header's next unflushed pointer will
15671 + * point to an invalid transaction. This tests that before finding all the
15672 + * transactions in the log, which makes normal mount times fast.
15674 + * After a crash, this starts with the next unflushed transaction, and
15675 + * replays until it finds one too old, or invalid.
15677 + * On exit, it sets things up so the first transaction will work correctly.
15678 + * NOTE: only called during fs mount
15679 + */
15680 +static int journal_read(struct super_block *sb)
15682 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
15683 + struct reiserfs_journal_desc *desc;
15684 + unsigned int oldest_trans_id = 0;
15685 + unsigned int oldest_invalid_trans_id = 0;
15686 + time64_t start;
15687 + unsigned long oldest_start = 0;
15688 + unsigned long cur_dblock = 0;
15689 + unsigned long newest_mount_id = 9;
15690 + struct buffer_head *d_bh;
15691 + struct reiserfs_journal_header *jh;
15692 + int valid_journal_header = 0;
15693 + int replay_count = 0;
15694 + int continue_replay = 1;
15695 + int ret;
15697 + cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
15698 + reiserfs_info(sb, "checking transaction log (%pg)\n",
15699 + file_bdev(journal->j_bdev_file));
15700 + start = ktime_get_seconds();
15702 + /*
15703 + * step 1, read in the journal header block. Check the transaction
15704 + * it says is the first unflushed, and if that transaction is not
15705 + * valid, replay is done
15706 + */
15707 + journal->j_header_bh = journal_bread(sb,
15708 + SB_ONDISK_JOURNAL_1st_BLOCK(sb)
15709 + + SB_ONDISK_JOURNAL_SIZE(sb));
15710 + if (!journal->j_header_bh) {
15711 + return 1;
15713 + jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
15714 + if (le32_to_cpu(jh->j_first_unflushed_offset) <
15715 + SB_ONDISK_JOURNAL_SIZE(sb)
15716 + && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
15717 + oldest_start =
15718 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15719 + le32_to_cpu(jh->j_first_unflushed_offset);
15720 + oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
15721 + newest_mount_id = le32_to_cpu(jh->j_mount_id);
15722 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15723 + "journal-1153: found in "
15724 + "header: first_unflushed_offset %d, last_flushed_trans_id "
15725 + "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
15726 + le32_to_cpu(jh->j_last_flush_trans_id));
15727 + valid_journal_header = 1;
15729 + /*
15730 + * now, we try to read the first unflushed offset. If it
15731 + * is not valid, there is nothing more we can do, and it
15732 + * makes no sense to read through the whole log.
15733 + */
15734 + d_bh =
15735 + journal_bread(sb,
15736 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15737 + le32_to_cpu(jh->j_first_unflushed_offset));
15738 + ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL);
15739 + if (!ret) {
15740 + continue_replay = 0;
15742 + brelse(d_bh);
15743 + goto start_log_replay;
15746 + /*
15747 + * ok, there are transactions that need to be replayed. start
15748 + * with the first log block, find all the valid transactions, and
15749 + * pick out the oldest.
15750 + */
15751 + while (continue_replay
15752 + && cur_dblock <
15753 + (SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15754 + SB_ONDISK_JOURNAL_SIZE(sb))) {
15755 + /*
15756 + * Note that the block size of the primary fs device and
15757 + * the journal device must be the same
15758 + */
15759 + d_bh =
15760 + reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock,
15761 + sb->s_blocksize,
15762 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15763 + SB_ONDISK_JOURNAL_SIZE(sb));
15764 + ret =
15765 + journal_transaction_is_valid(sb, d_bh,
15766 + &oldest_invalid_trans_id,
15767 + &newest_mount_id);
15768 + if (ret == 1) {
15769 + desc = (struct reiserfs_journal_desc *)d_bh->b_data;
15770 + if (oldest_start == 0) { /* init all oldest_ values */
15771 + oldest_trans_id = get_desc_trans_id(desc);
15772 + oldest_start = d_bh->b_blocknr;
15773 + newest_mount_id = get_desc_mount_id(desc);
15774 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15775 + "journal-1179: Setting "
15776 + "oldest_start to offset %llu, trans_id %lu",
15777 + oldest_start -
15778 + SB_ONDISK_JOURNAL_1st_BLOCK
15779 + (sb), oldest_trans_id);
15780 + } else if (oldest_trans_id > get_desc_trans_id(desc)) {
15781 + /* one we just read was older */
15782 + oldest_trans_id = get_desc_trans_id(desc);
15783 + oldest_start = d_bh->b_blocknr;
15784 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15785 + "journal-1180: Resetting "
15786 + "oldest_start to offset %lu, trans_id %lu",
15787 + oldest_start -
15788 + SB_ONDISK_JOURNAL_1st_BLOCK
15789 + (sb), oldest_trans_id);
15791 + if (newest_mount_id < get_desc_mount_id(desc)) {
15792 + newest_mount_id = get_desc_mount_id(desc);
15793 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15794 + "journal-1299: Setting "
15795 + "newest_mount_id to %d",
15796 + get_desc_mount_id(desc));
15798 + cur_dblock += get_desc_trans_len(desc) + 2;
15799 + } else {
15800 + cur_dblock++;
15802 + brelse(d_bh);
15805 +start_log_replay:
15806 + cur_dblock = oldest_start;
15807 + if (oldest_trans_id) {
15808 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15809 + "journal-1206: Starting replay "
15810 + "from offset %llu, trans_id %lu",
15811 + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
15812 + oldest_trans_id);
15815 + replay_count = 0;
15816 + while (continue_replay && oldest_trans_id > 0) {
15817 + ret =
15818 + journal_read_transaction(sb, cur_dblock, oldest_start,
15819 + oldest_trans_id, newest_mount_id);
15820 + if (ret < 0) {
15821 + return ret;
15822 + } else if (ret != 0) {
15823 + break;
15825 + cur_dblock =
15826 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start;
15827 + replay_count++;
15828 + if (cur_dblock == oldest_start)
15829 + break;
15832 + if (oldest_trans_id == 0) {
15833 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15834 + "journal-1225: No valid " "transactions found");
15836 + /*
15837 + * j_start does not get set correctly if we don't replay any
15838 + * transactions. if we had a valid journal_header, set j_start
15839 + * to the first unflushed transaction value, copy the trans_id
15840 + * from the header
15841 + */
15842 + if (valid_journal_header && replay_count == 0) {
15843 + journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
15844 + journal->j_trans_id =
15845 + le32_to_cpu(jh->j_last_flush_trans_id) + 1;
15846 + /* check for trans_id overflow */
15847 + if (journal->j_trans_id == 0)
15848 + journal->j_trans_id = 10;
15849 + journal->j_last_flush_trans_id =
15850 + le32_to_cpu(jh->j_last_flush_trans_id);
15851 + journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
15852 + } else {
15853 + journal->j_mount_id = newest_mount_id + 1;
15855 + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
15856 + "newest_mount_id to %lu", journal->j_mount_id);
15857 + journal->j_first_unflushed_offset = journal->j_start;
15858 + if (replay_count > 0) {
15859 + reiserfs_info(sb,
15860 + "replayed %d transactions in %lu seconds\n",
15861 + replay_count, ktime_get_seconds() - start);
15863 + /* needed to satisfy the locking in _update_journal_header_block */
15864 + reiserfs_write_lock(sb);
15865 + if (!bdev_read_only(sb->s_bdev) &&
15866 + _update_journal_header_block(sb, journal->j_start,
15867 + journal->j_last_flush_trans_id)) {
15868 + reiserfs_write_unlock(sb);
15869 + /*
15870 + * replay failed, caller must call free_journal_ram and abort
15871 + * the mount
15872 + */
15873 + return -1;
15875 + reiserfs_write_unlock(sb);
15876 + return 0;
15879 +static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
15881 + struct reiserfs_journal_list *jl;
15882 + jl = kzalloc(sizeof(struct reiserfs_journal_list),
15883 + GFP_NOFS | __GFP_NOFAIL);
15884 + INIT_LIST_HEAD(&jl->j_list);
15885 + INIT_LIST_HEAD(&jl->j_working_list);
15886 + INIT_LIST_HEAD(&jl->j_tail_bh_list);
15887 + INIT_LIST_HEAD(&jl->j_bh_list);
15888 + mutex_init(&jl->j_commit_mutex);
15889 + SB_JOURNAL(s)->j_num_lists++;
15890 + get_journal_list(jl);
15891 + return jl;
15894 +static void journal_list_init(struct super_block *sb)
15896 + SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
15899 +static void release_journal_dev(struct reiserfs_journal *journal)
15901 + if (journal->j_bdev_file) {
15902 + bdev_fput(journal->j_bdev_file);
15903 + journal->j_bdev_file = NULL;
15907 +static int journal_init_dev(struct super_block *super,
15908 + struct reiserfs_journal *journal,
15909 + const char *jdev_name)
15911 + blk_mode_t blkdev_mode = BLK_OPEN_READ;
15912 + void *holder = journal;
15913 + int result;
15914 + dev_t jdev;
15916 + result = 0;
15918 + journal->j_bdev_file = NULL;
15919 + jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
15920 + new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
15922 + if (!bdev_read_only(super->s_bdev))
15923 + blkdev_mode |= BLK_OPEN_WRITE;
15925 + /* there is no "jdev" option and journal is on separate device */
15926 + if ((!jdev_name || !jdev_name[0])) {
15927 + if (jdev == super->s_dev)
15928 + holder = NULL;
15929 + journal->j_bdev_file = bdev_file_open_by_dev(jdev, blkdev_mode,
15930 + holder, NULL);
15931 + if (IS_ERR(journal->j_bdev_file)) {
15932 + result = PTR_ERR(journal->j_bdev_file);
15933 + journal->j_bdev_file = NULL;
15934 + reiserfs_warning(super, "sh-458",
15935 + "cannot init journal device unknown-block(%u,%u): %i",
15936 + MAJOR(jdev), MINOR(jdev), result);
15937 + return result;
15938 + } else if (jdev != super->s_dev)
15939 + set_blocksize(journal->j_bdev_file, super->s_blocksize);
15941 + return 0;
15944 + journal->j_bdev_file = bdev_file_open_by_path(jdev_name, blkdev_mode,
15945 + holder, NULL);
15946 + if (IS_ERR(journal->j_bdev_file)) {
15947 + result = PTR_ERR(journal->j_bdev_file);
15948 + journal->j_bdev_file = NULL;
15949 + reiserfs_warning(super, "sh-457",
15950 + "journal_init_dev: Cannot open '%s': %i",
15951 + jdev_name, result);
15952 + return result;
15955 + set_blocksize(journal->j_bdev_file, super->s_blocksize);
15956 + reiserfs_info(super,
15957 + "journal_init_dev: journal device: %pg\n",
15958 + file_bdev(journal->j_bdev_file));
15959 + return 0;
15963 + * When creating/tuning a file system, the user can assign some
15964 + * journal params within boundaries which depend on the ratio
15965 + * blocksize/standard_blocksize.
15967 + * For blocks >= standard_blocksize transaction size should
15968 + * be not less than JOURNAL_TRANS_MIN_DEFAULT, and not more
15969 + * than JOURNAL_TRANS_MAX_DEFAULT.
15971 + * For blocks < standard_blocksize these boundaries should be
15972 + * decreased proportionally.
15973 + */
15974 +#define REISERFS_STANDARD_BLKSIZE (4096)
15976 +static int check_advise_trans_params(struct super_block *sb,
15977 + struct reiserfs_journal *journal)
15979 + if (journal->j_trans_max) {
15980 + /* Non-default journal params. Do sanity check for them. */
15981 + int ratio = 1;
15982 + if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
15983 + ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
15985 + if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
15986 + journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio ||
15987 + SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max <
15988 + JOURNAL_MIN_RATIO) {
15989 + reiserfs_warning(sb, "sh-462",
15990 + "bad transaction max size (%u). "
15991 + "FSCK?", journal->j_trans_max);
15992 + return 1;
15994 + if (journal->j_max_batch != (journal->j_trans_max) *
15995 + JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) {
15996 + reiserfs_warning(sb, "sh-463",
15997 + "bad transaction max batch (%u). "
15998 + "FSCK?", journal->j_max_batch);
15999 + return 1;
16001 + } else {
16002 + /*
16003 + * Default journal params.
16004 + * The file system was created by an old version
16005 + * of mkreiserfs, so some fields contain zeros,
16006 + * and we need to advise proper values for them
16007 + */
16008 + if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
16009 + reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
16010 + sb->s_blocksize);
16011 + return 1;
16013 + journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
16014 + journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
16015 + journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
16017 + return 0;
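+/*
+ * Worked example (assuming the default constants, e.g. a trans_max
+ * default of 1024 and a trans_min default of 256): on a filesystem with
+ * a 1024-byte blocksize, ratio = 4096 / 1024 = 4, so a non-default
+ * j_trans_max must fall within [256 / 4, 1024 / 4] = [64, 256] and the
+ * journal must still hold at least JOURNAL_MIN_RATIO transactions of
+ * that size.
+ */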
16020 +/* must be called once on fs mount. calls journal_read for you */
16021 +int journal_init(struct super_block *sb, const char *j_dev_name,
16022 + int old_format, unsigned int commit_max_age)
16024 + int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2;
16025 + struct buffer_head *bhjh;
16026 + struct reiserfs_super_block *rs;
16027 + struct reiserfs_journal_header *jh;
16028 + struct reiserfs_journal *journal;
16029 + struct reiserfs_journal_list *jl;
16030 + int ret;
16032 + journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
16033 + if (!journal) {
16034 + reiserfs_warning(sb, "journal-1256",
16035 + "unable to get memory for journal structure");
16036 + return 1;
16038 + INIT_LIST_HEAD(&journal->j_bitmap_nodes);
16039 + INIT_LIST_HEAD(&journal->j_prealloc_list);
16040 + INIT_LIST_HEAD(&journal->j_working_list);
16041 + INIT_LIST_HEAD(&journal->j_journal_list);
16042 + journal->j_persistent_trans = 0;
16043 + if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
16044 + reiserfs_bmap_count(sb)))
16045 + goto free_and_return;
16047 + allocate_bitmap_nodes(sb);
16049 + /* reserved for journal area support */
16050 + SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ?
16051 + REISERFS_OLD_DISK_OFFSET_IN_BYTES
16052 + / sb->s_blocksize +
16053 + reiserfs_bmap_count(sb) +
16054 + 1 :
16055 + REISERFS_DISK_OFFSET_IN_BYTES /
16056 + sb->s_blocksize + 2);
16058 + /*
16059 + * Sanity check to see if the standard journal fits
16060 + * within the first bitmap (relevant for small blocksizes)
16061 + */
16062 + if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
16063 + (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
16064 + SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
16065 + reiserfs_warning(sb, "journal-1393",
16066 + "journal does not fit for area addressed "
16067 + "by first of bitmap blocks. It starts at "
16068 + "%u and its size is %u. Block size %ld",
16069 + SB_JOURNAL_1st_RESERVED_BLOCK(sb),
16070 + SB_ONDISK_JOURNAL_SIZE(sb),
16071 + sb->s_blocksize);
16072 + goto free_and_return;
16075 + /*
16076 + * Sanity check to see if journal first block is correct.
16077 + * If journal first block is invalid it can cause
16078 + * zeroing important superblock members.
16079 + */
16080 + if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
16081 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) < SB_JOURNAL_1st_RESERVED_BLOCK(sb)) {
16082 + reiserfs_warning(sb, "journal-1393",
16083 + "journal 1st super block is invalid: 1st reserved block %d, but actual 1st block is %d",
16084 + SB_JOURNAL_1st_RESERVED_BLOCK(sb),
16085 + SB_ONDISK_JOURNAL_1st_BLOCK(sb));
16086 + goto free_and_return;
16089 + if (journal_init_dev(sb, journal, j_dev_name) != 0) {
16090 + reiserfs_warning(sb, "sh-462",
16091 + "unable to initialize journal device");
16092 + goto free_and_return;
16095 + rs = SB_DISK_SUPER_BLOCK(sb);
16097 + /* read journal header */
16098 + bhjh = journal_bread(sb,
16099 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
16100 + SB_ONDISK_JOURNAL_SIZE(sb));
16101 + if (!bhjh) {
16102 + reiserfs_warning(sb, "sh-459",
16103 + "unable to read journal header");
16104 + goto free_and_return;
16106 + jh = (struct reiserfs_journal_header *)(bhjh->b_data);
16108 + /* make sure that the journal matches the super block */
16109 + if (is_reiserfs_jr(rs)
16110 + && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
16111 + sb_jp_journal_magic(rs))) {
16112 + reiserfs_warning(sb, "sh-460",
16113 + "journal header magic %x (device %pg) does "
16114 + "not match to magic found in super block %x",
16115 + jh->jh_journal.jp_journal_magic,
16116 + file_bdev(journal->j_bdev_file),
16117 + sb_jp_journal_magic(rs));
16118 + brelse(bhjh);
16119 + goto free_and_return;
16122 + journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max);
16123 + journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch);
16124 + journal->j_max_commit_age =
16125 + le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
16126 + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
16128 + if (check_advise_trans_params(sb, journal) != 0)
16129 + goto free_and_return;
16130 + journal->j_default_max_commit_age = journal->j_max_commit_age;
16132 + if (commit_max_age != 0) {
16133 + journal->j_max_commit_age = commit_max_age;
16134 + journal->j_max_trans_age = commit_max_age;
16137 + reiserfs_info(sb, "journal params: device %pg, size %u, "
16138 + "journal first block %u, max trans len %u, max batch %u, "
16139 + "max commit age %u, max trans age %u\n",
16140 + file_bdev(journal->j_bdev_file),
16141 + SB_ONDISK_JOURNAL_SIZE(sb),
16142 + SB_ONDISK_JOURNAL_1st_BLOCK(sb),
16143 + journal->j_trans_max,
16144 + journal->j_max_batch,
16145 + journal->j_max_commit_age, journal->j_max_trans_age);
16147 + brelse(bhjh);
16149 + journal->j_list_bitmap_index = 0;
16150 + journal_list_init(sb);
16152 + memset(journal->j_list_hash_table, 0,
16153 + JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
16155 + INIT_LIST_HEAD(&journal->j_dirty_buffers);
16156 + spin_lock_init(&journal->j_dirty_buffers_lock);
16158 + journal->j_start = 0;
16159 + journal->j_len = 0;
16160 + journal->j_len_alloc = 0;
16161 + atomic_set(&journal->j_wcount, 0);
16162 + atomic_set(&journal->j_async_throttle, 0);
16163 + journal->j_bcount = 0;
16164 + journal->j_trans_start_time = 0;
16165 + journal->j_last = NULL;
16166 + journal->j_first = NULL;
16167 + init_waitqueue_head(&journal->j_join_wait);
16168 + mutex_init(&journal->j_mutex);
16169 + mutex_init(&journal->j_flush_mutex);
16171 + journal->j_trans_id = 10;
16172 + journal->j_mount_id = 10;
16173 + journal->j_state = 0;
16174 + atomic_set(&journal->j_jlock, 0);
16175 + journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
16176 + journal->j_cnode_free_orig = journal->j_cnode_free_list;
16177 + journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
16178 + journal->j_cnode_used = 0;
16179 + journal->j_must_wait = 0;
16181 + if (journal->j_cnode_free == 0) {
16182 + reiserfs_warning(sb, "journal-2004", "Journal cnode memory "
16183 + "allocation failed (%ld bytes). Journal is "
16184 + "too large for available memory. Usually "
16185 + "this is due to a journal that is too large.",
16186 + sizeof (struct reiserfs_journal_cnode) * num_cnodes);
16187 + goto free_and_return;
16190 + init_journal_hash(sb);
16191 + jl = journal->j_current_jl;
16193 + /*
16194 + * get_list_bitmap() may call flush_commit_list() which
16195 + * requires the lock. Calling flush_commit_list() shouldn't happen
16196 + * this early but I like to be paranoid.
16197 + */
16198 + reiserfs_write_lock(sb);
16199 + jl->j_list_bitmap = get_list_bitmap(sb, jl);
16200 + reiserfs_write_unlock(sb);
16201 + if (!jl->j_list_bitmap) {
16202 + reiserfs_warning(sb, "journal-2005",
16203 + "get_list_bitmap failed for journal list 0");
16204 + goto free_and_return;
16207 + ret = journal_read(sb);
16208 + if (ret < 0) {
16209 + reiserfs_warning(sb, "reiserfs-2006",
16210 + "Replay Failure, unable to mount");
16211 + goto free_and_return;
16214 + INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
16215 + journal->j_work_sb = sb;
16216 + return 0;
16217 +free_and_return:
16218 + free_journal_ram(sb);
16219 + return 1;
16223 + * test for a polite end of the current transaction. Used by file_write,
16224 + * and should be used by delete to make sure they don't write more than
16225 + * can fit inside a single transaction
16226 + */
16227 +int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
16228 + int new_alloc)
16230 + struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
16231 + time64_t now = ktime_get_seconds();
16232 + /* cannot restart while nested */
16233 + BUG_ON(!th->t_trans_id);
16234 + if (th->t_refcount > 1)
16235 + return 0;
16236 + if (journal->j_must_wait > 0 ||
16237 + (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
16238 + atomic_read(&journal->j_jlock) ||
16239 + (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
16240 + journal->j_cnode_free < (journal->j_trans_max * 3)) {
16241 + return 1;
16244 + journal->j_len_alloc += new_alloc;
16245 + th->t_blocks_allocated += new_alloc;
16246 + return 0;
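+/*
+ * Hypothetical caller pattern (a sketch; the reiserfs write and delete
+ * paths do the equivalent): politely restart the transaction rather
+ * than let it overflow:
+ *
+ *	if (journal_transaction_should_end(th, new_alloc)) {
+ *		retval = journal_end(th);
+ *		if (!retval)
+ *			retval = journal_begin(th, sb, new_alloc);
+ *	}
+ */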
16249 +/* this must be called inside a transaction */
16250 +void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
16252 + struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
16253 + BUG_ON(!th->t_trans_id);
16254 + journal->j_must_wait = 1;
16255 + set_bit(J_WRITERS_BLOCKED, &journal->j_state);
16256 + return;
16259 +/* this must be called without a transaction started */
16260 +void reiserfs_allow_writes(struct super_block *s)
16262 + struct reiserfs_journal *journal = SB_JOURNAL(s);
16263 + clear_bit(J_WRITERS_BLOCKED, &journal->j_state);
16264 + wake_up(&journal->j_join_wait);
16267 +/* this must be called without a transaction started */
16268 +void reiserfs_wait_on_write_block(struct super_block *s)
16270 + struct reiserfs_journal *journal = SB_JOURNAL(s);
16271 + wait_event(journal->j_join_wait,
16272 + !test_bit(J_WRITERS_BLOCKED, &journal->j_state));
16275 +static void queue_log_writer(struct super_block *s)
16277 + wait_queue_entry_t wait;
16278 + struct reiserfs_journal *journal = SB_JOURNAL(s);
16279 + set_bit(J_WRITERS_QUEUED, &journal->j_state);
16281 + /*
16282 + * we don't want to use wait_event here because
16283 + * we only want to wait once.
16284 + */
16285 + init_waitqueue_entry(&wait, current);
16286 + add_wait_queue(&journal->j_join_wait, &wait);
16287 + set_current_state(TASK_UNINTERRUPTIBLE);
16288 + if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
16289 + int depth = reiserfs_write_unlock_nested(s);
16290 + schedule();
16291 + reiserfs_write_lock_nested(s, depth);
16293 + __set_current_state(TASK_RUNNING);
16294 + remove_wait_queue(&journal->j_join_wait, &wait);
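+/*
+ * Note on the open-coded wait above: wait_event() would loop until the
+ * condition became true, but a queued writer only needs to back off
+ * once; it sleeps at most one time and then rechecks the journal state
+ * itself, so a single wake_up() from wake_queued_writers() suffices.
+ */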
16297 +static void wake_queued_writers(struct super_block *s)
16299 + struct reiserfs_journal *journal = SB_JOURNAL(s);
16300 + if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))
16301 + wake_up(&journal->j_join_wait);
16304 +static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
16306 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16307 + unsigned long bcount = journal->j_bcount;
16308 + while (1) {
16309 + int depth;
16311 + depth = reiserfs_write_unlock_nested(sb);
16312 + schedule_timeout_uninterruptible(1);
16313 + reiserfs_write_lock_nested(sb, depth);
16315 + journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
16316 + while ((atomic_read(&journal->j_wcount) > 0 ||
16317 + atomic_read(&journal->j_jlock)) &&
16318 + journal->j_trans_id == trans_id) {
16319 + queue_log_writer(sb);
16321 + if (journal->j_trans_id != trans_id)
16322 + break;
16323 + if (bcount == journal->j_bcount)
16324 + break;
16325 + bcount = journal->j_bcount;
16330 + * join == true if you must join an existing transaction.
16331 + * join == false if you can deal with waiting for others to finish
16333 + * this will block until the transaction is joinable. send the number of
16334 + * blocks you expect to use in nblocks.
16336 +static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
16337 + struct super_block *sb, unsigned long nblocks,
16338 + int join)
16340 + time64_t now = ktime_get_seconds();
16341 + unsigned int old_trans_id;
16342 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16343 + struct reiserfs_transaction_handle myth;
16344 + int retval;
16345 + int depth;
16347 + reiserfs_check_lock_depth(sb, "journal_begin");
16348 + BUG_ON(nblocks > journal->j_trans_max);
16350 + PROC_INFO_INC(sb, journal.journal_being);
16351 + /* set here for journal_join */
16352 + th->t_refcount = 1;
16353 + th->t_super = sb;
16355 +relock:
16356 + lock_journal(sb);
16357 + if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
16358 + unlock_journal(sb);
16359 + retval = journal->j_errno;
16360 + goto out_fail;
16362 + journal->j_bcount++;
16364 + if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
16365 + unlock_journal(sb);
16366 + depth = reiserfs_write_unlock_nested(sb);
16367 + reiserfs_wait_on_write_block(sb);
16368 + reiserfs_write_lock_nested(sb, depth);
16369 + PROC_INFO_INC(sb, journal.journal_relock_writers);
16370 + goto relock;
16372 + now = ktime_get_seconds();
16374 + /*
16375 + * if there is no room in the journal OR
16376 + * if this transaction is too old, and we weren't called joinable,
16378 + * wait for it to finish before beginning. We don't sleep if there
16379 + * aren't other writers.
16379 + */
16381 + if ((!join && journal->j_must_wait > 0) ||
16382 + (!join
16383 + && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch)
16384 + || (!join && atomic_read(&journal->j_wcount) > 0
16385 + && journal->j_trans_start_time > 0
16386 + && (now - journal->j_trans_start_time) >
16387 + journal->j_max_trans_age) || (!join
16388 + && atomic_read(&journal->j_jlock))
16389 + || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
16391 + old_trans_id = journal->j_trans_id;
16392 + /* allow others to finish this transaction */
16393 + unlock_journal(sb);
16395 + if (!join && (journal->j_len_alloc + nblocks + 2) >=
16396 + journal->j_max_batch &&
16397 + ((journal->j_len + nblocks + 2) * 100) <
16398 + (journal->j_len_alloc * 75)) {
16399 + if (atomic_read(&journal->j_wcount) > 10) {
16400 + queue_log_writer(sb);
16401 + goto relock;
16404 + /*
16405 + * don't mess with joining the transaction if all we
16406 + * have to do is wait for someone else to do a commit
16407 + */
16408 + if (atomic_read(&journal->j_jlock)) {
16409 + while (journal->j_trans_id == old_trans_id &&
16410 + atomic_read(&journal->j_jlock)) {
16411 + queue_log_writer(sb);
16413 + goto relock;
16415 + retval = journal_join(&myth, sb);
16416 + if (retval)
16417 + goto out_fail;
16419 + /* someone might have ended the transaction while we joined */
16420 + if (old_trans_id != journal->j_trans_id) {
16421 + retval = do_journal_end(&myth, 0);
16422 + } else {
16423 + retval = do_journal_end(&myth, COMMIT_NOW);
16426 + if (retval)
16427 + goto out_fail;
16429 + PROC_INFO_INC(sb, journal.journal_relock_wcount);
16430 + goto relock;
16432 + /* we are the first writer, set trans_id */
16433 + if (journal->j_trans_start_time == 0) {
16434 + journal->j_trans_start_time = ktime_get_seconds();
16436 + atomic_inc(&journal->j_wcount);
16437 + journal->j_len_alloc += nblocks;
16438 + th->t_blocks_logged = 0;
16439 + th->t_blocks_allocated = nblocks;
16440 + th->t_trans_id = journal->j_trans_id;
16441 + unlock_journal(sb);
16442 + INIT_LIST_HEAD(&th->t_list);
16443 + return 0;
16445 +out_fail:
16446 + memset(th, 0, sizeof(*th));
16447 + /*
16448 + * Re-set th->t_super, so we can properly keep track of how many
16449 + * persistent transactions there are. We need to do this so if this
16450 + * call is part of a failed restart_transaction, we can free it later
16451 + */
16452 + th->t_super = sb;
16453 + return retval;
16456 +struct reiserfs_transaction_handle *
16457 +reiserfs_persistent_transaction(struct super_block *s, int nblocks)
16461 + int ret;
16462 + struct reiserfs_transaction_handle *th;
16464 + /*
16465 + * if we're nesting into an existing transaction, it will be
16466 + * persistent on its own
16467 + */
16468 + if (reiserfs_transaction_running(s)) {
16469 + th = current->journal_info;
16470 + th->t_refcount++;
16471 + BUG_ON(th->t_refcount < 2);
16473 + return th;
16475 + th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
16476 + if (!th)
16477 + return NULL;
16478 + ret = journal_begin(th, s, nblocks);
16479 + if (ret) {
16480 + kfree(th);
16481 + return NULL;
16484 + SB_JOURNAL(s)->j_persistent_trans++;
16485 + return th;
16488 +int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
16490 + struct super_block *s = th->t_super;
16491 + int ret = 0;
16492 + if (th->t_trans_id)
16493 + ret = journal_end(th);
16494 + else
16495 + ret = -EIO;
16496 + if (th->t_refcount == 0) {
16497 + SB_JOURNAL(s)->j_persistent_trans--;
16498 + kfree(th);
16500 + return ret;
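
A minimal usage sketch of the persistent-transaction pair above, not part of the patch itself; the buffer bh and the 10-block reservation are illustrative assumptions, and NULL from reiserfs_persistent_transaction() means the handle could not be started:

    struct reiserfs_transaction_handle *th;

    th = reiserfs_persistent_transaction(s, 10);    /* reserve 10 blocks */
    if (!th)
            return -ENOMEM;                         /* journal_begin failed */
    reiserfs_prepare_for_journal(s, bh, 1);         /* clean bh and lock it */
    journal_mark_dirty(th, bh);                     /* log bh in this trans */
    return reiserfs_end_persistent_transaction(th);
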
16503 +static int journal_join(struct reiserfs_transaction_handle *th,
16504 + struct super_block *sb)
16506 + struct reiserfs_transaction_handle *cur_th = current->journal_info;
16508 + /*
16509 + * this keeps do_journal_end from NULLing out the
16510 + * current->journal_info pointer
16511 + */
16512 + th->t_handle_save = cur_th;
16513 + BUG_ON(cur_th && cur_th->t_refcount > 1);
16514 + return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN);
16517 +int journal_join_abort(struct reiserfs_transaction_handle *th,
16518 + struct super_block *sb)
16520 + struct reiserfs_transaction_handle *cur_th = current->journal_info;
16522 + /*
16523 + * this keeps do_journal_end from NULLing out the
16524 + * current->journal_info pointer
16525 + */
16526 + th->t_handle_save = cur_th;
16527 + BUG_ON(cur_th && cur_th->t_refcount > 1);
16528 + return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT);
16531 +int journal_begin(struct reiserfs_transaction_handle *th,
16532 + struct super_block *sb, unsigned long nblocks)
16534 + struct reiserfs_transaction_handle *cur_th = current->journal_info;
16535 + int ret;
16537 + th->t_handle_save = NULL;
16538 + if (cur_th) {
16539 + /* we are nesting into the current transaction */
16540 + if (cur_th->t_super == sb) {
16541 + BUG_ON(!cur_th->t_refcount);
16542 + cur_th->t_refcount++;
16543 + memcpy(th, cur_th, sizeof(*th));
16544 + if (th->t_refcount <= 1)
16545 + reiserfs_warning(sb, "reiserfs-2005",
16546 + "BAD: refcount <= 1, but "
16547 + "journal_info != 0");
16548 + return 0;
16549 + } else {
16550 + /*
16551 + * we've ended up with a handle from a different
16552 + * filesystem. save it and restore on journal_end.
16553 + * This should never really happen...
16554 + */
16555 + reiserfs_warning(sb, "clm-2100",
16556 + "nesting info a different FS");
16557 + th->t_handle_save = current->journal_info;
16558 + current->journal_info = th;
16560 + } else {
16561 + current->journal_info = th;
16563 + ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
16564 + BUG_ON(current->journal_info != th);
16566 + /*
16567 + * I guess this boils down to being the reciprocal of clm-2100 above.
16568 + * If do_journal_begin_r fails, we need to put it back, since
16569 + * journal_end won't be called to do it. */
16570 + if (ret)
16571 + current->journal_info = th->t_handle_save;
16572 + else
16573 + BUG_ON(!th->t_refcount);
16575 + return ret;
16579 + * puts bh into the current transaction. If it was already there, it
16580 + * removes the old pointers from the hash and puts new ones in (to make
16581 + * sure replay happens in the right order).
16583 + * if it was dirty, cleans it and files it onto the clean list. I can't
16584 + * let it be dirty again until the transaction is committed.
16586 + * if j_len is bigger than j_len_alloc, it pushes j_len_alloc to j_len + JOURNAL_PER_BALANCE_CNT.
16587 + */
16588 +int journal_mark_dirty(struct reiserfs_transaction_handle *th,
16589 + struct buffer_head *bh)
16591 + struct super_block *sb = th->t_super;
16592 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16593 + struct reiserfs_journal_cnode *cn = NULL;
16594 + int count_already_incd = 0;
16595 + int prepared = 0;
16596 + BUG_ON(!th->t_trans_id);
16598 + PROC_INFO_INC(sb, journal.mark_dirty);
16599 + if (th->t_trans_id != journal->j_trans_id) {
16600 + reiserfs_panic(th->t_super, "journal-1577",
16601 + "handle trans id %ld != current trans id %ld",
16602 + th->t_trans_id, journal->j_trans_id);
16605 + prepared = test_clear_buffer_journal_prepared(bh);
16606 + clear_buffer_journal_restore_dirty(bh);
16607 + /* already in this transaction, we are done */
16608 + if (buffer_journaled(bh)) {
16609 + PROC_INFO_INC(sb, journal.mark_dirty_already);
16610 + return 0;
16613 + /*
16614 + * this must be turned into a panic instead of a warning. We can't
16615 + * allow a dirty or journal_dirty or locked buffer to be logged, as
16616 + * some changes could get to disk too early. NOT GOOD.
16617 + */
16618 + if (!prepared || buffer_dirty(bh)) {
16619 + reiserfs_warning(sb, "journal-1777",
16620 + "buffer %llu bad state "
16621 + "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
16622 + (unsigned long long)bh->b_blocknr,
16623 + prepared ? ' ' : '!',
16624 + buffer_locked(bh) ? ' ' : '!',
16625 + buffer_dirty(bh) ? ' ' : '!',
16626 + buffer_journal_dirty(bh) ? ' ' : '!');
16629 + if (atomic_read(&journal->j_wcount) <= 0) {
16630 + reiserfs_warning(sb, "journal-1409",
16631 + "returning because j_wcount was %d",
16632 + atomic_read(&journal->j_wcount));
16633 + return 1;
16635 + /*
16636 + * this error means I've screwed up, and we've overflowed
16637 + * the transaction. Nothing can be done here, except make the
16638 + * FS readonly or panic.
16639 + */
16640 + if (journal->j_len >= journal->j_trans_max) {
16641 + reiserfs_panic(th->t_super, "journal-1413",
16642 + "j_len (%lu) is too big",
16643 + journal->j_len);
16646 + if (buffer_journal_dirty(bh)) {
16647 + count_already_incd = 1;
16648 + PROC_INFO_INC(sb, journal.mark_dirty_notjournal);
16649 + clear_buffer_journal_dirty(bh);
16652 + if (journal->j_len > journal->j_len_alloc) {
16653 + journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT;
16656 + set_buffer_journaled(bh);
16658 + /* now put this guy on the end */
16659 + if (!cn) {
16660 + cn = get_cnode(sb);
16661 + if (!cn) {
16662 + reiserfs_panic(sb, "journal-4", "get_cnode failed!");
16665 + if (th->t_blocks_logged == th->t_blocks_allocated) {
16666 + th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT;
16667 + journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT;
16669 + th->t_blocks_logged++;
16670 + journal->j_len++;
16672 + cn->bh = bh;
16673 + cn->blocknr = bh->b_blocknr;
16674 + cn->sb = sb;
16675 + cn->jlist = NULL;
16676 + insert_journal_hash(journal->j_hash_table, cn);
16677 + if (!count_already_incd) {
16678 + get_bh(bh);
16681 + cn->next = NULL;
16682 + cn->prev = journal->j_last;
16683 + cn->bh = bh;
16684 + if (journal->j_last) {
16685 + journal->j_last->next = cn;
16686 + journal->j_last = cn;
16687 + } else {
16688 + journal->j_first = cn;
16689 + journal->j_last = cn;
16691 + reiserfs_schedule_old_flush(sb);
16692 + return 0;
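
journal_mark_dirty() above warns (journal-1777) unless the buffer was prepared and is no longer dirty. A hedged sketch of the expected call order, using only functions from this patch; bh stands in for any metadata buffer covered by the running handle th:

    reiserfs_prepare_for_journal(sb, bh, 1);  /* clears dirty, locks the bh */
    /* ... modify bh->b_data under the transaction ... */
    journal_mark_dirty(th, bh);               /* file bh in the current trans */
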
16695 +int journal_end(struct reiserfs_transaction_handle *th)
16697 + struct super_block *sb = th->t_super;
16698 + if (!current->journal_info && th->t_refcount > 1)
16699 + reiserfs_warning(sb, "REISER-NESTING",
16700 + "th NULL, refcount %d", th->t_refcount);
16702 + if (!th->t_trans_id) {
16703 + WARN_ON(1);
16704 + return -EIO;
16707 + th->t_refcount--;
16708 + if (th->t_refcount > 0) {
16709 + struct reiserfs_transaction_handle *cur_th =
16710 + current->journal_info;
16712 + /*
16713 + * we aren't allowed to close a nested transaction on a
16714 + * different filesystem from the one in the task struct
16715 + */
16716 + BUG_ON(cur_th->t_super != th->t_super);
16718 + if (th != cur_th) {
16719 + memcpy(current->journal_info, th, sizeof(*th));
16720 + th->t_trans_id = 0;
16722 + return 0;
16723 + } else {
16724 + return do_journal_end(th, 0);
16729 + * removes from the current transaction, releasing and decrementing any counters.
16730 + * also files the removed buffer directly onto the clean list
16732 + * called by journal_mark_freed when a block has been deleted
16734 + * returns 1 if it cleaned and released the buffer. 0 otherwise
16735 + */
16736 +static int remove_from_transaction(struct super_block *sb,
16737 + b_blocknr_t blocknr, int already_cleaned)
16739 + struct buffer_head *bh;
16740 + struct reiserfs_journal_cnode *cn;
16741 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16742 + int ret = 0;
16744 + cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
16745 + if (!cn || !cn->bh) {
16746 + return ret;
16748 + bh = cn->bh;
16749 + if (cn->prev) {
16750 + cn->prev->next = cn->next;
16752 + if (cn->next) {
16753 + cn->next->prev = cn->prev;
16755 + if (cn == journal->j_first) {
16756 + journal->j_first = cn->next;
16758 + if (cn == journal->j_last) {
16759 + journal->j_last = cn->prev;
16761 + remove_journal_hash(sb, journal->j_hash_table, NULL,
16762 + bh->b_blocknr, 0);
16763 + clear_buffer_journaled(bh); /* don't log this one */
16765 + if (!already_cleaned) {
16766 + clear_buffer_journal_dirty(bh);
16767 + clear_buffer_dirty(bh);
16768 + clear_buffer_journal_test(bh);
16769 + put_bh(bh);
16770 + if (atomic_read(&bh->b_count) < 0) {
16771 + reiserfs_warning(sb, "journal-1752",
16772 + "b_count < 0");
16774 + ret = 1;
16776 + journal->j_len--;
16777 + journal->j_len_alloc--;
16778 + free_cnode(sb, cn);
16779 + return ret;
16783 + * for any cnode in a journal list, it can only be dirtied if all the
16784 + * transactions that include it are committed to disk.
16785 + * this checks through each transaction, and returns 1 if you are allowed
16786 + * to dirty, and 0 if you aren't
16788 + * it is called by dirty_journal_list, which is called after
16789 + * flush_commit_list has gotten all the log blocks for a given
16790 + * transaction on disk
16792 + */
16793 +static int can_dirty(struct reiserfs_journal_cnode *cn)
16795 + struct super_block *sb = cn->sb;
16796 + b_blocknr_t blocknr = cn->blocknr;
16797 + struct reiserfs_journal_cnode *cur = cn->hprev;
16798 + int can_dirty = 1;
16800 + /*
16801 + * first test hprev. These are all newer than cn, so any node here
16802 + * with the same block number and dev means this node can't be sent
16803 + * to disk right now.
16804 + */
16805 + while (cur && can_dirty) {
16806 + if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
16807 + cur->blocknr == blocknr) {
16808 + can_dirty = 0;
16810 + cur = cur->hprev;
16812 + /*
16813 + * then test hnext. These are all older than cn. As long as they
16814 + * are committed to the log, it is safe to write cn to disk
16815 + */
16816 + cur = cn->hnext;
16817 + while (cur && can_dirty) {
16818 + if (cur->jlist && cur->jlist->j_len > 0 &&
16819 + atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh &&
16820 + cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
16821 + can_dirty = 0;
16823 + cur = cur->hnext;
16825 + return can_dirty;
16829 + * syncs the commit blocks, but does not force the real buffers to disk
16830 + * will wait until the current transaction is done/committed before returning
16831 + */
16832 +int journal_end_sync(struct reiserfs_transaction_handle *th)
16834 + struct super_block *sb = th->t_super;
16835 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16837 + BUG_ON(!th->t_trans_id);
16838 + /* you must not sync while nested; very, very bad */
16839 + BUG_ON(th->t_refcount > 1);
16840 + if (journal->j_len == 0) {
16841 + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
16842 + 1);
16843 + journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
16845 + return do_journal_end(th, COMMIT_NOW | WAIT);
16848 +/* writeback the pending async commits to disk */
16849 +static void flush_async_commits(struct work_struct *work)
16851 + struct reiserfs_journal *journal =
16852 + container_of(work, struct reiserfs_journal, j_work.work);
16853 + struct super_block *sb = journal->j_work_sb;
16854 + struct reiserfs_journal_list *jl;
16855 + struct list_head *entry;
16857 + reiserfs_write_lock(sb);
16858 + if (!list_empty(&journal->j_journal_list)) {
16859 + /* last entry is the youngest, commit it and you get everything */
16860 + entry = journal->j_journal_list.prev;
16861 + jl = JOURNAL_LIST_ENTRY(entry);
16862 + flush_commit_list(sb, jl, 1);
16864 + reiserfs_write_unlock(sb);
16868 + * flushes any old transactions to disk
16869 + * ends the current transaction if it is too old
16870 + */
16871 +void reiserfs_flush_old_commits(struct super_block *sb)
16873 + time64_t now;
16874 + struct reiserfs_transaction_handle th;
16875 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16877 + now = ktime_get_seconds();
16878 + /*
16879 + * safety check so we don't flush while we are replaying the log during
16880 + * mount
16881 + */
16882 + if (list_empty(&journal->j_journal_list))
16883 + return;
16885 + /*
16886 + * check the current transaction. If there are no writers, and it is
16887 + * too old, finish it, and force the commit blocks to disk
16888 + */
16889 + if (atomic_read(&journal->j_wcount) <= 0 &&
16890 + journal->j_trans_start_time > 0 &&
16891 + journal->j_len > 0 &&
16892 + (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
16893 + if (!journal_join(&th, sb)) {
16894 + reiserfs_prepare_for_journal(sb,
16895 + SB_BUFFER_WITH_SB(sb),
16896 + 1);
16897 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
16899 + /*
16900 + * we're only being called from kreiserfsd, it makes
16901 + * no sense to do an async commit so that kreiserfsd
16902 + * can do it later
16903 + */
16904 + do_journal_end(&th, COMMIT_NOW | WAIT);
16910 + * returns 0 if do_journal_end should return right away, returns 1 if
16911 + * do_journal_end should finish the commit
16913 + * if the current transaction is too old, but still has writers, this will
16914 + * wait on j_join_wait until all the writers are done. By the time it
16915 + * wakes up, the transaction it was called on has already ended, so it just
16916 + * flushes the commit list and returns 0.
16918 + * Won't batch when flush or commit_now is set. Also won't batch when
16919 + * others are waiting on j_join_wait.
16921 + * Note, we can't allow the journal_end to proceed while there are still
16922 + * writers in the log.
16923 + */
16924 +static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
16927 + time64_t now;
16928 + int flush = flags & FLUSH_ALL;
16929 + int commit_now = flags & COMMIT_NOW;
16930 + int wait_on_commit = flags & WAIT;
16931 + struct reiserfs_journal_list *jl;
16932 + struct super_block *sb = th->t_super;
16933 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16935 + BUG_ON(!th->t_trans_id);
16937 + if (th->t_trans_id != journal->j_trans_id) {
16938 + reiserfs_panic(th->t_super, "journal-1577",
16939 + "handle trans id %ld != current trans id %ld",
16940 + th->t_trans_id, journal->j_trans_id);
16943 + journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
16944 + /* <= 0 is allowed. unmounting might not call begin */
16945 + if (atomic_read(&journal->j_wcount) > 0)
16946 + atomic_dec(&journal->j_wcount);
16948 + /*
16949 + * BUG: deal with the case where j_len is 0, but blocks that were
16950 + * previously freed still need to be released. That will be dealt with
16951 + * by the next transaction that actually writes something, but it
16952 + * should really be taken care of in this trans
16953 + */
16954 + BUG_ON(journal->j_len == 0);
16956 + /*
16957 + * if wcount > 0, and we are called to with flush or commit_now,
16958 + * we wait on j_join_wait. We will wake up when the last writer has
16959 + * finished the transaction, and started it on its way to the disk.
16960 + * Then, we flush the commit or journal list, and just return 0
16961 + * because the rest of journal end was already done for this
16962 + * transaction.
16963 + */
16964 + if (atomic_read(&journal->j_wcount) > 0) {
16965 + if (flush || commit_now) {
16966 + unsigned trans_id;
16968 + jl = journal->j_current_jl;
16969 + trans_id = jl->j_trans_id;
16970 + if (wait_on_commit)
16971 + jl->j_state |= LIST_COMMIT_PENDING;
16972 + atomic_set(&journal->j_jlock, 1);
16973 + if (flush) {
16974 + journal->j_next_full_flush = 1;
16976 + unlock_journal(sb);
16978 + /*
16979 + * sleep while the current transaction is
16980 + * still j_jlocked
16981 + */
16982 + while (journal->j_trans_id == trans_id) {
16983 + if (atomic_read(&journal->j_jlock)) {
16984 + queue_log_writer(sb);
16985 + } else {
16986 + lock_journal(sb);
16987 + if (journal->j_trans_id == trans_id) {
16988 + atomic_set(&journal->j_jlock, 1);
16991 + unlock_journal(sb);
16994 + BUG_ON(journal->j_trans_id == trans_id);
16996 + if (commit_now
16997 + && journal_list_still_alive(sb, trans_id)
16998 + && wait_on_commit) {
16999 + flush_commit_list(sb, jl, 1);
17001 + return 0;
17003 + unlock_journal(sb);
17004 + return 0;
17007 + /* deal with old transactions where we are the last writers */
17008 + now = ktime_get_seconds();
17009 + if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
17010 + commit_now = 1;
17011 + journal->j_next_async_flush = 1;
17013 + /* don't batch when someone is waiting on j_join_wait */
17014 + /* don't batch when syncing the commit or flushing the whole trans */
17015 + if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock))
17016 + && !flush && !commit_now && (journal->j_len < journal->j_max_batch)
17017 + && journal->j_len_alloc < journal->j_max_batch
17018 + && journal->j_cnode_free > (journal->j_trans_max * 3)) {
17019 + journal->j_bcount++;
17020 + unlock_journal(sb);
17021 + return 0;
17024 + if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) {
17025 + reiserfs_panic(sb, "journal-003",
17026 + "j_start (%ld) is too high",
17027 + journal->j_start);
17029 + return 1;
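
For orientation, these are the flag combinations that reach check_journal_end() via do_journal_end() elsewhere in this file; this is a summary of calls already present in the patch, not new API:

    do_journal_end(th, 0);                   /* plain end, may batch         */
    do_journal_end(th, COMMIT_NOW);          /* push the commit, don't wait  */
    do_journal_end(th, COMMIT_NOW | WAIT);   /* what journal_end_sync() does */
    do_journal_end(th, FLUSH_ALL | COMMIT_NOW | WAIT); /* trans_id overflow  */
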
17033 + * Does all the work that makes deleting blocks safe.
17034 + * when deleting a block marked BH_JNew, just remove it from the current
17035 + * transaction, clean its buffer_head and move on.
17037 + * otherwise:
17038 + * set a bit for the block in the journal bitmap. That will prevent it from
17039 + * being allocated for unformatted nodes before this transaction has finished.
17041 + * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.
17042 + * That will prevent any old transactions with this block from trying to flush
17043 + * to the real location. Since we aren't removing the cnode from the
17044 + * journal_list_hash, the block can't be reallocated yet.
17046 + * Then remove it from the current transaction, decrementing any counters and
17047 + * filing it on the clean list.
17048 + */
17049 +int journal_mark_freed(struct reiserfs_transaction_handle *th,
17050 + struct super_block *sb, b_blocknr_t blocknr)
17052 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
17053 + struct reiserfs_journal_cnode *cn = NULL;
17054 + struct buffer_head *bh = NULL;
17055 + struct reiserfs_list_bitmap *jb = NULL;
17056 + int cleaned = 0;
17057 + BUG_ON(!th->t_trans_id);
17059 + cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
17060 + if (cn && cn->bh) {
17061 + bh = cn->bh;
17062 + get_bh(bh);
17064 + /* if it is journal new, we just remove it from this transaction */
17065 + if (bh && buffer_journal_new(bh)) {
17066 + clear_buffer_journal_new(bh);
17067 + clear_prepared_bits(bh);
17068 + reiserfs_clean_and_file_buffer(bh);
17069 + cleaned = remove_from_transaction(sb, blocknr, cleaned);
17070 + } else {
17071 + /*
17072 + * set the bit for this block in the journal bitmap
17073 + * for this transaction
17074 + */
17075 + jb = journal->j_current_jl->j_list_bitmap;
17076 + if (!jb) {
17077 + reiserfs_panic(sb, "journal-1702",
17078 + "journal_list_bitmap is NULL");
17080 + set_bit_in_list_bitmap(sb, blocknr, jb);
17082 + /* Note, the entire while loop is not allowed to schedule. */
17084 + if (bh) {
17085 + clear_prepared_bits(bh);
17086 + reiserfs_clean_and_file_buffer(bh);
17088 + cleaned = remove_from_transaction(sb, blocknr, cleaned);
17090 + /*
17091 + * find all older transactions with this block,
17092 + * make sure they don't try to write it out
17093 + */
17094 + cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
17095 + blocknr);
17096 + while (cn) {
17097 + if (sb == cn->sb && blocknr == cn->blocknr) {
17098 + set_bit(BLOCK_FREED, &cn->state);
17099 + if (cn->bh) {
17100 + /*
17101 + * remove_from_transaction will brelse
17102 + * the buffer if it was in the current
17103 + * trans
17104 + */
17105 + if (!cleaned) {
17106 + clear_buffer_journal_dirty(cn->bh);
17108 + clear_buffer_dirty(cn->bh);
17109 + clear_buffer_journal_test(cn->bh);
17111 + cleaned = 1;
17112 + put_bh(cn->bh);
17113 + if (atomic_read(&cn->bh->b_count) < 0) {
17115 + reiserfs_warning(sb,
17116 + "journal-2138",
17117 + "cn->bh->b_count < 0");
17120 + /*
17121 + * since we are clearing the bh,
17122 + * we MUST dec nonzerolen
17123 + */
17124 + if (cn->jlist) {
17125 + atomic_dec(&cn->jlist->j_nonzerolen);
17128 + cn->bh = NULL;
17131 + cn = cn->hnext;
17135 + if (bh)
17136 + release_buffer_page(bh); /* get_hash grabs the buffer */
17137 + return 0;
17140 +void reiserfs_update_inode_transaction(struct inode *inode)
17142 + struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
17143 + REISERFS_I(inode)->i_jl = journal->j_current_jl;
17144 + REISERFS_I(inode)->i_trans_id = journal->j_trans_id;
17148 + * returns -1 on error, 0 if no commits/barriers were done and 1
17149 + * if a transaction was actually committed and the barrier was done
17150 + */
17151 +static int __commit_trans_jl(struct inode *inode, unsigned long id,
17152 + struct reiserfs_journal_list *jl)
17154 + struct reiserfs_transaction_handle th;
17155 + struct super_block *sb = inode->i_sb;
17156 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
17157 + int ret = 0;
17159 + /*
17160 + * is it from the current transaction,
17161 + * or from an unknown transaction?
17162 + */
17163 + if (id == journal->j_trans_id) {
17164 + jl = journal->j_current_jl;
17165 + /*
17166 + * try to let other writers come in and
17167 + * grow this transaction
17168 + */
17169 + let_transaction_grow(sb, id);
17170 + if (journal->j_trans_id != id) {
17171 + goto flush_commit_only;
17174 + ret = journal_begin(&th, sb, 1);
17175 + if (ret)
17176 + return ret;
17178 + /* someone might have ended this transaction while we joined */
17179 + if (journal->j_trans_id != id) {
17180 + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
17181 + 1);
17182 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
17183 + ret = journal_end(&th);
17184 + goto flush_commit_only;
17187 + ret = journal_end_sync(&th);
17188 + if (!ret)
17189 + ret = 1;
17191 + } else {
17192 + /*
17193 + * this gets tricky, we have to make sure the journal list in
17194 + * the inode still exists. We know the list is still around
17195 + * if we've got a larger transaction id than the oldest list
17196 + */
17197 +flush_commit_only:
17198 + if (journal_list_still_alive(inode->i_sb, id)) {
17199 + /*
17200 + * we only set ret to 1 when we know for sure
17201 + * the barrier hasn't been started yet on the commit
17202 + * block.
17203 + */
17204 + if (atomic_read(&jl->j_commit_left) > 1)
17205 + ret = 1;
17206 + flush_commit_list(sb, jl, 1);
17207 + if (journal->j_errno)
17208 + ret = journal->j_errno;
17211 + /* otherwise the list is gone, and long since committed */
17212 + return ret;
17215 +int reiserfs_commit_for_inode(struct inode *inode)
17217 + unsigned int id = REISERFS_I(inode)->i_trans_id;
17218 + struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
17220 + /*
17221 + * for the whole inode, assume an unset id means it was
17222 + * changed in the current transaction. That is more conservative
17223 + */
17224 + if (!id || !jl) {
17225 + reiserfs_update_inode_transaction(inode);
17226 + id = REISERFS_I(inode)->i_trans_id;
17227 + /* jl will be updated in __commit_trans_jl */
17230 + return __commit_trans_jl(inode, id, jl);
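
reiserfs_commit_for_inode() inherits the __commit_trans_jl() convention above (negative errno on failure, 0 or 1 on success). A hypothetical fsync-style caller, shown only to illustrate the contract:

    static int example_sync_inode(struct inode *inode)
    {
            int err = reiserfs_commit_for_inode(inode);

            return err < 0 ? err : 0;   /* 0 and 1 both mean "committed" */
    }
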
17233 +void reiserfs_restore_prepared_buffer(struct super_block *sb,
17234 + struct buffer_head *bh)
17236 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
17237 + PROC_INFO_INC(sb, journal.restore_prepared);
17238 + if (!bh) {
17239 + return;
17241 + if (test_clear_buffer_journal_restore_dirty(bh) &&
17242 + buffer_journal_dirty(bh)) {
17243 + struct reiserfs_journal_cnode *cn;
17244 + reiserfs_write_lock(sb);
17245 + cn = get_journal_hash_dev(sb,
17246 + journal->j_list_hash_table,
17247 + bh->b_blocknr);
17248 + if (cn && can_dirty(cn)) {
17249 + set_buffer_journal_test(bh);
17250 + mark_buffer_dirty(bh);
17252 + reiserfs_write_unlock(sb);
17254 + clear_buffer_journal_prepared(bh);
17257 +extern struct tree_balance *cur_tb;
17259 + * before we can change a metadata block, we have to make sure it won't
17260 + * be written to disk while we are altering it. So, we must:
17261 + * clean it
17262 + * wait on it.
17263 + */
17264 +int reiserfs_prepare_for_journal(struct super_block *sb,
17265 + struct buffer_head *bh, int wait)
17267 + PROC_INFO_INC(sb, journal.prepare);
17269 + if (!trylock_buffer(bh)) {
17270 + if (!wait)
17271 + return 0;
17272 + lock_buffer(bh);
17274 + set_buffer_journal_prepared(bh);
17275 + if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
17276 + clear_buffer_journal_test(bh);
17277 + set_buffer_journal_restore_dirty(bh);
17279 + unlock_buffer(bh);
17280 + return 1;
17284 + * long and ugly. If flush, will not return until all commit
17285 + * blocks and all real buffers in the trans are on disk.
17286 + * If no_async, won't return until all commit blocks are on disk.
17288 + * keep reading, there are comments as you go along
17290 + * If the journal is aborted, we just clean up. Things like flushing
17291 + * journal lists, etc just won't happen.
17292 + */
17293 +static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
17295 + struct super_block *sb = th->t_super;
17296 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
17297 + struct reiserfs_journal_cnode *cn, *next, *jl_cn;
17298 + struct reiserfs_journal_cnode *last_cn = NULL;
17299 + struct reiserfs_journal_desc *desc;
17300 + struct reiserfs_journal_commit *commit;
17301 + struct buffer_head *c_bh; /* commit bh */
17302 + struct buffer_head *d_bh; /* desc bh */
17303 + int cur_write_start = 0; /* start index of current log write */
17304 + int i;
17305 + int flush;
17306 + int wait_on_commit;
17307 + struct reiserfs_journal_list *jl, *temp_jl;
17308 + struct list_head *entry, *safe;
17309 + unsigned long jindex;
17310 + unsigned int commit_trans_id;
17311 + int trans_half;
17312 + int depth;
17314 + BUG_ON(th->t_refcount > 1);
17315 + BUG_ON(!th->t_trans_id);
17316 + BUG_ON(!th->t_super);
17318 + /*
17319 + * protect flush_older_commits from doing mistakes if the
17320 + * transaction ID counter gets overflowed.
17321 + */
17322 + if (th->t_trans_id == ~0U)
17323 + flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
17324 + flush = flags & FLUSH_ALL;
17325 + wait_on_commit = flags & WAIT;
17327 + current->journal_info = th->t_handle_save;
17328 + reiserfs_check_lock_depth(sb, "journal end");
17329 + if (journal->j_len == 0) {
17330 + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
17331 + 1);
17332 + journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
17335 + lock_journal(sb);
17336 + if (journal->j_next_full_flush) {
17337 + flags |= FLUSH_ALL;
17338 + flush = 1;
17340 + if (journal->j_next_async_flush) {
17341 + flags |= COMMIT_NOW | WAIT;
17342 + wait_on_commit = 1;
17345 + /*
17346 + * check_journal_end locks the journal, and unlocks it if it does
17347 + * not return 1. It tells us if we should continue with the
17348 + * journal_end, or just return
17349 + */
17350 + if (!check_journal_end(th, flags)) {
17351 + reiserfs_schedule_old_flush(sb);
17352 + wake_queued_writers(sb);
17353 + reiserfs_async_progress_wait(sb);
17354 + goto out;
17357 + /* check_journal_end might set these, check again */
17358 + if (journal->j_next_full_flush) {
17359 + flush = 1;
17362 + /*
17363 + * j must wait means we have to flush the log blocks, and the
17364 + * real blocks for this transaction
17365 + */
17366 + if (journal->j_must_wait > 0) {
17367 + flush = 1;
17369 +#ifdef REISERFS_PREALLOCATE
17370 + /*
17371 + * quota ops might need to nest, setup the journal_info pointer
17372 + * for them and raise the refcount so that it is > 0.
17373 + */
17374 + current->journal_info = th;
17375 + th->t_refcount++;
17377 + /* it should not involve new blocks into the transaction */
17378 + reiserfs_discard_all_prealloc(th);
17380 + th->t_refcount--;
17381 + current->journal_info = th->t_handle_save;
17382 +#endif
17384 + /* setup description block */
17385 + d_bh =
17386 + journal_getblk(sb,
17387 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
17388 + journal->j_start);
17389 + set_buffer_uptodate(d_bh);
17390 + desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
17391 + memset(d_bh->b_data, 0, d_bh->b_size);
17392 + memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
17393 + set_desc_trans_id(desc, journal->j_trans_id);
17395 + /*
17396 + * setup commit block. Don't write (keep it clean too) this one
17397 + * until after everyone else is written
17398 + */
17399 + c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
17400 + ((journal->j_start + journal->j_len +
17401 + 1) % SB_ONDISK_JOURNAL_SIZE(sb)));
17402 + commit = (struct reiserfs_journal_commit *)c_bh->b_data;
17403 + memset(c_bh->b_data, 0, c_bh->b_size);
17404 + set_commit_trans_id(commit, journal->j_trans_id);
17405 + set_buffer_uptodate(c_bh);
17407 + /* init this journal list */
17408 + jl = journal->j_current_jl;
17410 + /*
17411 + * we lock the commit before doing anything because
17412 + * we want to make sure nobody tries to run flush_commit_list until
17413 + * the new transaction is fully setup, and we've already flushed the
17414 + * ordered bh list
17415 + */
17416 + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
17418 + /* save the transaction id in case we need to commit it later */
17419 + commit_trans_id = jl->j_trans_id;
17421 + atomic_set(&jl->j_older_commits_done, 0);
17422 + jl->j_trans_id = journal->j_trans_id;
17423 + jl->j_timestamp = journal->j_trans_start_time;
17424 + jl->j_commit_bh = c_bh;
17425 + jl->j_start = journal->j_start;
17426 + jl->j_len = journal->j_len;
17427 + atomic_set(&jl->j_nonzerolen, journal->j_len);
17428 + atomic_set(&jl->j_commit_left, journal->j_len + 2);
17429 + jl->j_realblock = NULL;
17431 + /*
17432 + * The ENTIRE FOR LOOP MUST not cause schedule to occur.
17433 + * for each real block, add it to the journal list hash,
17434 + * copy into real block index array in the commit or desc block
17435 + */
17436 + trans_half = journal_trans_half(sb->s_blocksize);
17437 + for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
17438 + if (buffer_journaled(cn->bh)) {
17439 + jl_cn = get_cnode(sb);
17440 + if (!jl_cn) {
17441 + reiserfs_panic(sb, "journal-1676",
17442 + "get_cnode returned NULL");
17444 + if (i == 0) {
17445 + jl->j_realblock = jl_cn;
17447 + jl_cn->prev = last_cn;
17448 + jl_cn->next = NULL;
17449 + if (last_cn) {
17450 + last_cn->next = jl_cn;
17452 + last_cn = jl_cn;
17453 + /*
17454 + * make sure the block we are trying to log
17455 + * is not a block of journal or reserved area
17456 + */
17457 + if (is_block_in_log_or_reserved_area
17458 + (sb, cn->bh->b_blocknr)) {
17459 + reiserfs_panic(sb, "journal-2332",
17460 + "Trying to log block %lu, "
17461 + "which is a log block",
17462 + cn->bh->b_blocknr);
17464 + jl_cn->blocknr = cn->bh->b_blocknr;
17465 + jl_cn->state = 0;
17466 + jl_cn->sb = sb;
17467 + jl_cn->bh = cn->bh;
17468 + jl_cn->jlist = jl;
17469 + insert_journal_hash(journal->j_list_hash_table, jl_cn);
17470 + if (i < trans_half) {
17471 + desc->j_realblock[i] =
17472 + cpu_to_le32(cn->bh->b_blocknr);
17473 + } else {
17474 + commit->j_realblock[i - trans_half] =
17475 + cpu_to_le32(cn->bh->b_blocknr);
17477 + } else {
17478 + i--;
17481 + set_desc_trans_len(desc, journal->j_len);
17482 + set_desc_mount_id(desc, journal->j_mount_id);
17483 + set_desc_trans_id(desc, journal->j_trans_id);
17484 + set_commit_trans_len(commit, journal->j_len);
17486 + /*
17487 + * special check in case all buffers in the journal
17488 + * were marked for not logging
17489 + */
17490 + BUG_ON(journal->j_len == 0);
17492 + /*
17493 + * we're about to dirty all the log blocks, mark the description block
17494 + * dirty now too. Don't mark the commit block dirty until all the
17495 + * others are on disk
17496 + */
17497 + mark_buffer_dirty(d_bh);
17499 + /*
17500 + * first data block is j_start + 1, so add one to
17501 + * cur_write_start wherever you use it
17502 + */
17503 + cur_write_start = journal->j_start;
17504 + cn = journal->j_first;
17505 + jindex = 1; /* start at one so we don't get the desc again */
17506 + while (cn) {
17507 + clear_buffer_journal_new(cn->bh);
17508 + /* copy all the real blocks into log area. dirty log blocks */
17509 + if (buffer_journaled(cn->bh)) {
17510 + struct buffer_head *tmp_bh;
17511 + char *addr;
17512 + struct page *page;
17513 + tmp_bh =
17514 + journal_getblk(sb,
17515 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
17516 + ((cur_write_start +
17517 + jindex) %
17518 + SB_ONDISK_JOURNAL_SIZE(sb)));
17519 + set_buffer_uptodate(tmp_bh);
17520 + page = cn->bh->b_page;
17521 + addr = kmap(page);
17522 + memcpy(tmp_bh->b_data,
17523 + addr + offset_in_page(cn->bh->b_data),
17524 + cn->bh->b_size);
17525 + kunmap(page);
17526 + mark_buffer_dirty(tmp_bh);
17527 + jindex++;
17528 + set_buffer_journal_dirty(cn->bh);
17529 + clear_buffer_journaled(cn->bh);
17530 + } else {
17531 + /*
17532 + * JDirty cleared sometime during transaction.
17533 + * don't log this one
17534 + */
17535 + reiserfs_warning(sb, "journal-2048",
17536 + "BAD, buffer in journal hash, "
17537 + "but not JDirty!");
17538 + brelse(cn->bh);
17540 + next = cn->next;
17541 + free_cnode(sb, cn);
17542 + cn = next;
17543 + reiserfs_cond_resched(sb);
17546 + /*
17547 + * we are done with both the c_bh and d_bh, but
17548 + * c_bh must be written after all other commit blocks,
17549 + * so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
17550 + */
17552 + journal->j_current_jl = alloc_journal_list(sb);
17554 + /* now it is safe to insert this transaction on the main list */
17555 + list_add_tail(&jl->j_list, &journal->j_journal_list);
17556 + list_add_tail(&jl->j_working_list, &journal->j_working_list);
17557 + journal->j_num_work_lists++;
17559 + /* reset journal values for the next transaction */
17560 + journal->j_start =
17561 + (journal->j_start + journal->j_len +
17562 + 2) % SB_ONDISK_JOURNAL_SIZE(sb);
17563 + atomic_set(&journal->j_wcount, 0);
17564 + journal->j_bcount = 0;
17565 + journal->j_last = NULL;
17566 + journal->j_first = NULL;
17567 + journal->j_len = 0;
17568 + journal->j_trans_start_time = 0;
17569 + /* check for trans_id overflow */
17570 + if (++journal->j_trans_id == 0)
17571 + journal->j_trans_id = 10;
17572 + journal->j_current_jl->j_trans_id = journal->j_trans_id;
17573 + journal->j_must_wait = 0;
17574 + journal->j_len_alloc = 0;
17575 + journal->j_next_full_flush = 0;
17576 + journal->j_next_async_flush = 0;
17577 + init_journal_hash(sb);
17579 + /*
17580 + * make sure reiserfs_add_jh sees the new current_jl before we
17581 + * write out the tails
17582 + */
17583 + smp_mb();
17585 + /*
17586 + * tail conversion targets have to hit the disk before we end the
17587 + * transaction. Otherwise a later transaction might repack the tail
17588 + * before this transaction commits, leaving the data block unflushed
17589 + * and clean; if we crash before the later transaction commits, the
17590 + * data block is lost.
17591 + */
17592 + if (!list_empty(&jl->j_tail_bh_list)) {
17593 + depth = reiserfs_write_unlock_nested(sb);
17594 + write_ordered_buffers(&journal->j_dirty_buffers_lock,
17595 + journal, jl, &jl->j_tail_bh_list);
17596 + reiserfs_write_lock_nested(sb, depth);
17598 + BUG_ON(!list_empty(&jl->j_tail_bh_list));
17599 + mutex_unlock(&jl->j_commit_mutex);
17601 + /*
17602 + * honor the flush wishes from the caller, simple commits can
17603 + * be done outside the journal lock, they are done below
17605 + * if we don't flush the commit list right now, we put it into
17606 + * the work queue so the people waiting on the async progress work
17607 + * queue don't wait for this proc to flush journal lists and such.
17608 + */
17609 + if (flush) {
17610 + flush_commit_list(sb, jl, 1);
17611 + flush_journal_list(sb, jl, 1);
17612 + } else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
17613 + /*
17614 + * Avoid queueing work when sb is being shut down. Transaction
17615 + * will be flushed on journal shutdown.
17616 + */
17617 + if (sb->s_flags & SB_ACTIVE)
17618 + queue_delayed_work(REISERFS_SB(sb)->commit_wq,
17619 + &journal->j_work, HZ / 10);
17622 + /*
17623 + * if the next transaction has any chance of wrapping, flush
17624 + * transactions that might get overwritten. If any journal lists
17625 + * are very old flush them as well.
17626 + */
17627 +first_jl:
17628 + list_for_each_safe(entry, safe, &journal->j_journal_list) {
17629 + temp_jl = JOURNAL_LIST_ENTRY(entry);
17630 + if (journal->j_start <= temp_jl->j_start) {
17631 + if ((journal->j_start + journal->j_trans_max + 1) >=
17632 + temp_jl->j_start) {
17633 + flush_used_journal_lists(sb, temp_jl);
17634 + goto first_jl;
17635 + } else if ((journal->j_start +
17636 + journal->j_trans_max + 1) <
17637 + SB_ONDISK_JOURNAL_SIZE(sb)) {
17638 + /*
17639 + * if we don't cross into the next
17640 + * transaction and we don't wrap, there is
17641 + * no way we can overlap any later transactions,
17642 + * so break now
17643 + */
17644 + break;
17646 + } else if ((journal->j_start +
17647 + journal->j_trans_max + 1) >
17648 + SB_ONDISK_JOURNAL_SIZE(sb)) {
17649 + if (((journal->j_start + journal->j_trans_max + 1) %
17650 + SB_ONDISK_JOURNAL_SIZE(sb)) >=
17651 + temp_jl->j_start) {
17652 + flush_used_journal_lists(sb, temp_jl);
17653 + goto first_jl;
17654 + } else {
17655 + /*
17656 + * we don't overlap anything from our start
17657 + * to the end of the log, and our wrapped
17658 + * portion doesn't overlap anything at
17659 + * the start of the log. We can break
17660 + */
17661 + break;
17666 + journal->j_current_jl->j_list_bitmap =
17667 + get_list_bitmap(sb, journal->j_current_jl);
17669 + if (!(journal->j_current_jl->j_list_bitmap)) {
17670 + reiserfs_panic(sb, "journal-1996",
17671 + "could not get a list bitmap");
17674 + atomic_set(&journal->j_jlock, 0);
17675 + unlock_journal(sb);
17676 + /* wake up anybody waiting to join. */
17677 + clear_bit(J_WRITERS_QUEUED, &journal->j_state);
17678 + wake_up(&journal->j_join_wait);
17680 + if (!flush && wait_on_commit &&
17681 + journal_list_still_alive(sb, commit_trans_id)) {
17682 + flush_commit_list(sb, jl, 1);
17684 +out:
17685 + reiserfs_check_lock_depth(sb, "journal end2");
17687 + memset(th, 0, sizeof(*th));
17688 + /*
17689 + * Re-set th->t_super, so we can properly keep track of how many
17690 + * persistent transactions there are. We need to do this so if this
17691 + * call is part of a failed restart_transaction, we can free it later
17692 + */
17693 + th->t_super = sb;
17695 + return journal->j_errno;
17698 +/* Set the file system read only and refuse new transactions */
17699 +void reiserfs_abort_journal(struct super_block *sb, int errno)
17701 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
17702 + if (test_bit(J_ABORTED, &journal->j_state))
17703 + return;
17705 + if (!journal->j_errno)
17706 + journal->j_errno = errno;
17708 + sb->s_flags |= SB_RDONLY;
17709 + set_bit(J_ABORTED, &journal->j_state);
17711 +#ifdef CONFIG_REISERFS_CHECK
17712 + dump_stack();
17713 +#endif
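
Before the diff moves on to lbalance.c: the batching predicate whose body opens this journal.c hunk (journal_transaction_should_end() in mainline reiserfs) is consumed by callers roughly as below. restart_transaction() lives outside this hunk and is assumed here purely for illustration:

    /* end an over-full or aged transaction before allocating more blocks */
    if (journal_transaction_should_end(th, blocks_needed)) {
            err = restart_transaction(th, inode, blocks_needed); /* assumed */
            if (err)
                    return err;
    }
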
17715 diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
17716 new file mode 100644
17717 index 000000000000..7f868569d4d0
17718 --- /dev/null
17719 +++ b/fs/reiserfs/lbalance.c
17720 @@ -0,0 +1,1426 @@
17722 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
17723 + */
17725 +#include <linux/uaccess.h>
17726 +#include <linux/string.h>
17727 +#include <linux/time.h>
17728 +#include "reiserfs.h"
17729 +#include <linux/buffer_head.h>
17732 + * copy copy_count entries from source directory item to dest buffer
17733 + * (creating new item if needed)
17734 + */
17735 +static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
17736 + struct buffer_head *source, int last_first,
17737 + int item_num, int from, int copy_count)
17739 + struct buffer_head *dest = dest_bi->bi_bh;
17740 + /*
17741 + * either the number of target item, or if we must create a
17742 + * new item, the number of the item we will create it next to
17743 + */
17744 + int item_num_in_dest;
17746 + struct item_head *ih;
17747 + struct reiserfs_de_head *deh;
17748 + int copy_records_len; /* length of all records in item to be copied */
17749 + char *records;
17751 + ih = item_head(source, item_num);
17753 + RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item");
17755 + /*
17756 + * length of all records to be copied and first byte of
17757 + * the last of them
17758 + */
17759 + deh = B_I_DEH(source, ih);
17760 + if (copy_count) {
17761 + copy_records_len = (from ? deh_location(&deh[from - 1]) :
17762 + ih_item_len(ih)) -
17763 + deh_location(&deh[from + copy_count - 1]);
17764 + records =
17765 + source->b_data + ih_location(ih) +
17766 + deh_location(&deh[from + copy_count - 1]);
17767 + } else {
17768 + copy_records_len = 0;
17769 + records = NULL;
17772 + /* when copy last to first, dest buffer can contain 0 items */
17773 + item_num_in_dest =
17774 + (last_first ==
17775 + LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest)
17776 + - 1);
17778 + /*
17779 + * if there are no items in dest or the first/last item in
17780 + * dest is not item of the same directory
17781 + */
17782 + if ((item_num_in_dest == -1) ||
17783 + (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) ||
17784 + (last_first == LAST_TO_FIRST
17785 + && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key,
17786 + leaf_key(dest,
17787 + item_num_in_dest))))
17789 + /* create new item in dest */
17790 + struct item_head new_ih;
17792 + /* form item header */
17793 + memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE);
17794 + put_ih_version(&new_ih, KEY_FORMAT_3_5);
17795 + /* calculate item len */
17796 + put_ih_item_len(&new_ih,
17797 + DEH_SIZE * copy_count + copy_records_len);
17798 + put_ih_entry_count(&new_ih, 0);
17800 + if (last_first == LAST_TO_FIRST) {
17801 + /* form key by the following way */
17802 + if (from < ih_entry_count(ih)) {
17803 + set_le_ih_k_offset(&new_ih,
17804 + deh_offset(&deh[from]));
17805 + } else {
17806 + /*
17807 + * no entries will be copied to this
17808 + * item in this function
17809 + */
17810 + set_le_ih_k_offset(&new_ih, U32_MAX);
17811 + /*
17812 + * this item is not yet valid, but we
17813 + * want I_IS_DIRECTORY_ITEM to return 1
17814 + * for it, so we -1
17815 + */
17817 + set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key,
17818 + TYPE_DIRENTRY);
17821 + /* insert item into dest buffer */
17822 + leaf_insert_into_buf(dest_bi,
17823 + (last_first ==
17824 + LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest),
17825 + &new_ih, NULL, 0);
17826 + } else {
17827 + /* prepare space for entries */
17828 + leaf_paste_in_buffer(dest_bi,
17829 + (last_first ==
17830 + FIRST_TO_LAST) ? (B_NR_ITEMS(dest) -
17831 + 1) : 0, MAX_US_INT,
17832 + DEH_SIZE * copy_count + copy_records_len,
17833 + records, 0);
17836 + item_num_in_dest =
17837 + (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0;
17839 + leaf_paste_entries(dest_bi, item_num_in_dest,
17840 + (last_first ==
17841 + FIRST_TO_LAST) ? ih_entry_count(item_head(dest,
17842 + item_num_in_dest))
17843 + : 0, copy_count, deh + from, records,
17844 + DEH_SIZE * copy_count + copy_records_len);
17848 + * Copy the first (if last_first == FIRST_TO_LAST) or last
17849 + * (last_first == LAST_TO_FIRST) item or part of it or nothing
17850 + * (see the return 0 below) from SOURCE to the end (if last_first)
17851 + * or beginning (!last_first) of the DEST
17852 + */
17853 +/* returns 1 if anything was copied, else 0 */
17854 +static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
17855 + struct buffer_head *src, int last_first,
17856 + int bytes_or_entries)
17858 + struct buffer_head *dest = dest_bi->bi_bh;
17859 + /* number of items in the source and destination buffers */
17860 + int dest_nr_item, src_nr_item;
17861 + struct item_head *ih;
17862 + struct item_head *dih;
17864 + dest_nr_item = B_NR_ITEMS(dest);
17866 + /*
17867 + * if ( DEST is empty or first item of SOURCE and last item of
17868 + * DEST are the items of different objects or of different types )
17869 + * then there is no need to treat this item differently from the
17870 + * other items that we copy, so we return
17871 + */
17872 + if (last_first == FIRST_TO_LAST) {
17873 + ih = item_head(src, 0);
17874 + dih = item_head(dest, dest_nr_item - 1);
17876 + /* there is nothing to merge */
17877 + if (!dest_nr_item
17878 + || (!op_is_left_mergeable(&ih->ih_key, src->b_size)))
17879 + return 0;
17881 + RFALSE(!ih_item_len(ih),
17882 + "vs-10010: item can not have empty length");
17884 + if (is_direntry_le_ih(ih)) {
17885 + if (bytes_or_entries == -1)
17886 + /* copy all entries to dest */
17887 + bytes_or_entries = ih_entry_count(ih);
17888 + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0,
17889 + bytes_or_entries);
17890 + return 1;
17893 + /*
17894 + * copy part of the body of the first item of SOURCE
17895 + * to the end of the body of the last item of the DEST
17896 + * part defined by 'bytes_or_entries'; if bytes_or_entries
17897 + * == -1 copy whole body; don't create new item header
17898 + */
17899 + if (bytes_or_entries == -1)
17900 + bytes_or_entries = ih_item_len(ih);
17902 +#ifdef CONFIG_REISERFS_CHECK
17903 + else {
17904 + if (bytes_or_entries == ih_item_len(ih)
17905 + && is_indirect_le_ih(ih))
17906 + if (get_ih_free_space(ih))
17907 + reiserfs_panic(sb_from_bi(dest_bi),
17908 + "vs-10020",
17909 + "last unformatted node "
17910 + "must be filled "
17911 + "entirely (%h)", ih);
17913 +#endif
17915 + /*
17916 + * merge first item (or its part) of src buffer with the last
17917 + * item of dest buffer. Both are of the same file
17918 + */
17919 + leaf_paste_in_buffer(dest_bi,
17920 + dest_nr_item - 1, ih_item_len(dih),
17921 + bytes_or_entries, ih_item_body(src, ih), 0);
17923 + if (is_indirect_le_ih(dih)) {
17924 + RFALSE(get_ih_free_space(dih),
17925 + "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space",
17926 + ih);
17927 + if (bytes_or_entries == ih_item_len(ih))
17928 + set_ih_free_space(dih, get_ih_free_space(ih));
17931 + return 1;
17934 + /* copy boundary item to right (last_first == LAST_TO_FIRST) */
17936 + /*
17937 + * (DEST is empty or last item of SOURCE and first item of DEST
17938 + * are the items of different object or of different types)
17939 + */
17940 + src_nr_item = B_NR_ITEMS(src);
17941 + ih = item_head(src, src_nr_item - 1);
17942 + dih = item_head(dest, 0);
17944 + if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size))
17945 + return 0;
17947 + if (is_direntry_le_ih(ih)) {
17948 + /*
17949 + * bytes_or_entries = entries number in last
17950 + * item body of SOURCE
17951 + */
17952 + if (bytes_or_entries == -1)
17953 + bytes_or_entries = ih_entry_count(ih);
17955 + leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
17956 + src_nr_item - 1,
17957 + ih_entry_count(ih) - bytes_or_entries,
17958 + bytes_or_entries);
17959 + return 1;
17962 + /*
17963 + * copy part of the body of the last item of SOURCE to the
17964 + * begin of the body of the first item of the DEST; part defined
17965 + * by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body;
17966 + * change first item key of the DEST; don't create new item header
17967 + */
17969 + RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih),
17970 + "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)",
17971 + ih);
17973 + if (bytes_or_entries == -1) {
17974 + /* bytes_or_entries = length of last item body of SOURCE */
17975 + bytes_or_entries = ih_item_len(ih);
17977 + RFALSE(le_ih_k_offset(dih) !=
17978 + le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size),
17979 + "vs-10050: items %h and %h do not match", ih, dih);
17981 + /* change first item key of the DEST */
17982 + set_le_ih_k_offset(dih, le_ih_k_offset(ih));
17984 + /* item becomes non-mergeable */
17985 + /* or mergeable if left item was */
17986 + set_le_ih_k_type(dih, le_ih_k_type(ih));
17987 + } else {
17988 + /* merge to right only part of item */
17989 + RFALSE(ih_item_len(ih) <= bytes_or_entries,
17990 + "vs-10060: no so much bytes %lu (needed %lu)",
17991 + (unsigned long)ih_item_len(ih),
17992 + (unsigned long)bytes_or_entries);
17994 + /* change first item key of the DEST */
17995 + if (is_direct_le_ih(dih)) {
17996 + RFALSE(le_ih_k_offset(dih) <=
17997 + (unsigned long)bytes_or_entries,
17998 + "vs-10070: dih %h, bytes_or_entries(%d)", dih,
17999 + bytes_or_entries);
18000 + set_le_ih_k_offset(dih,
18001 + le_ih_k_offset(dih) -
18002 + bytes_or_entries);
18003 + } else {
18004 + RFALSE(le_ih_k_offset(dih) <=
18005 + (bytes_or_entries / UNFM_P_SIZE) * dest->b_size,
18006 + "vs-10080: dih %h, bytes_or_entries(%d)",
18007 + dih,
18008 + (bytes_or_entries / UNFM_P_SIZE) * dest->b_size);
18009 + set_le_ih_k_offset(dih,
18010 + le_ih_k_offset(dih) -
18011 + ((bytes_or_entries / UNFM_P_SIZE) *
18012 + dest->b_size));
18016 + leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries,
18017 + ih_item_body(src,
18018 + ih) + ih_item_len(ih) - bytes_or_entries,
18019 + 0);
18020 + return 1;
18024 + * copy cpy_num items from buffer src to buffer dest
18025 + * last_first == FIRST_TO_LAST means that we copy cpy_num items beginning
18026 + * from the first-th item in src to the tail of dest
18027 + * last_first == LAST_TO_FIRST means that we copy cpy_num items beginning
18028 + * from the first-th item in src to the head of dest
18029 + */
18030 +static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
18031 + struct buffer_head *src, int last_first,
18032 + int first, int cpy_num)
18034 + struct buffer_head *dest;
18035 + int nr, free_space;
18036 + int dest_before;
18037 + int last_loc, last_inserted_loc, location;
18038 + int i, j;
18039 + struct block_head *blkh;
18040 + struct item_head *ih;
18042 + RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST,
18043 + "vs-10090: bad last_first parameter %d", last_first);
18044 + RFALSE(B_NR_ITEMS(src) - first < cpy_num,
18045 + "vs-10100: too few items in source %d, required %d from %d",
18046 + B_NR_ITEMS(src), cpy_num, first);
18047 + RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items");
18048 + RFALSE(!dest_bi, "vs-10120: dest_bi must not be NULL");
18050 + dest = dest_bi->bi_bh;
18052 + RFALSE(!dest, "vs-10130: dest buffer must not be NULL");
18054 + if (cpy_num == 0)
18055 + return;
18057 + blkh = B_BLK_HEAD(dest);
18058 + nr = blkh_nr_item(blkh);
18059 + free_space = blkh_free_space(blkh);
18061 + /*
18062 + * we will insert items before 0-th or nr-th item in dest buffer.
18063 + * It depends on the last_first parameter
18064 + */
18065 + dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr;
18067 + /* location of head of first new item */
18068 + ih = item_head(dest, dest_before);
18070 + RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE,
18071 + "vs-10140: not enough free space for headers %d (needed %d)",
18072 + B_FREE_SPACE(dest), cpy_num * IH_SIZE);
18074 + /* prepare space for headers */
18075 + memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE);
18077 + /* copy item headers */
18078 + memcpy(ih, item_head(src, first), cpy_num * IH_SIZE);
18080 + free_space -= (IH_SIZE * cpy_num);
18081 + set_blkh_free_space(blkh, free_space);
18083 + /* location of unmovable item */
18084 + j = location = (dest_before == 0) ? dest->b_size : ih_location(ih - 1);
18085 + for (i = dest_before; i < nr + cpy_num; i++) {
18086 + location -= ih_item_len(ih + i - dest_before);
18087 + put_ih_location(ih + i - dest_before, location);
18090 + /* prepare space for items */
18091 + last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]);
18092 + last_inserted_loc = ih_location(&ih[cpy_num - 1]);
18094 + /* check free space */
18095 + RFALSE(free_space < j - last_inserted_loc,
18096 + "vs-10150: not enough free space for items %d (needed %d)",
18097 + free_space, j - last_inserted_loc);
18099 + memmove(dest->b_data + last_loc,
18100 + dest->b_data + last_loc + j - last_inserted_loc,
18101 + last_inserted_loc - last_loc);
18103 + /* copy items */
18104 + memcpy(dest->b_data + last_inserted_loc,
18105 + item_body(src, (first + cpy_num - 1)),
18106 + j - last_inserted_loc);
18108 + /* sizes, item number */
18109 + set_blkh_nr_item(blkh, nr + cpy_num);
18110 + set_blkh_free_space(blkh, free_space - (j - last_inserted_loc));
18112 + do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0);
18114 + if (dest_bi->bi_parent) {
18115 + struct disk_child *t_dc;
18116 + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
18117 + RFALSE(dc_block_number(t_dc) != dest->b_blocknr,
18118 + "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu",
18119 + (long unsigned)dest->b_blocknr,
18120 + (long unsigned)dc_block_number(t_dc));
18121 + put_dc_size(t_dc,
18122 + dc_size(t_dc) + (j - last_inserted_loc +
18123 + IH_SIZE * cpy_num));
18125 + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
18126 + 0);
18131 + * This function splits the (liquid) item into two items (useful when
18132 + * shifting part of an item into another node).
18133 + */
18134 +static void leaf_item_bottle(struct buffer_info *dest_bi,
18135 + struct buffer_head *src, int last_first,
18136 + int item_num, int cpy_bytes)
18138 + struct buffer_head *dest = dest_bi->bi_bh;
18139 + struct item_head *ih;
18141 + RFALSE(cpy_bytes == -1,
18142 + "vs-10170: bytes == - 1 means: do not split item");
18144 + if (last_first == FIRST_TO_LAST) {
18145 + /*
18146 + * if the item in position item_num in buffer SOURCE
18147 + * is a directory item
18148 + */
18149 + ih = item_head(src, item_num);
18150 + if (is_direntry_le_ih(ih))
18151 + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
18152 + item_num, 0, cpy_bytes);
18153 + else {
18154 + struct item_head n_ih;
18156 + /*
18157 + * copy part of the body of the item number 'item_num'
18158 + * of SOURCE to the end of the DEST part defined by
18159 + * 'cpy_bytes'; create new item header; change old
18160 + * item_header (????); n_ih = new item_header;
18161 + */
18162 + memcpy(&n_ih, ih, IH_SIZE);
18163 + put_ih_item_len(&n_ih, cpy_bytes);
18164 + if (is_indirect_le_ih(ih)) {
18165 + RFALSE(cpy_bytes == ih_item_len(ih)
18166 + && get_ih_free_space(ih),
18167 + "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)",
18168 + (long unsigned)get_ih_free_space(ih));
18169 + set_ih_free_space(&n_ih, 0);
18172 + RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size),
18173 + "vs-10190: bad mergeability of item %h", ih);
18174 + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
18175 + leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih,
18176 + item_body(src, item_num), 0);
18178 + } else {
18179 + /*
18180 + * if the item in position item_num in buffer
18181 + * SOURCE is a directory item
18182 + */
18183 + ih = item_head(src, item_num);
18184 + if (is_direntry_le_ih(ih))
18185 + leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
18186 + item_num,
18187 + ih_entry_count(ih) - cpy_bytes,
18188 + cpy_bytes);
18189 + else {
18190 + struct item_head n_ih;
18192 + /*
18193 + * copy part of the body of the item number 'item_num'
18194 + * of SOURCE to the beginning of the DEST part defined by
18195 + * 'cpy_bytes'; create new item header;
18196 + * n_ih = new item_header;
18197 + */
18198 + memcpy(&n_ih.ih_key, &ih->ih_key, KEY_SIZE);
18200 + /* Endian safe, both le */
18201 + n_ih.ih_version = ih->ih_version;
18203 + if (is_direct_le_ih(ih)) {
18204 + set_le_ih_k_offset(&n_ih,
18205 + le_ih_k_offset(ih) +
18206 + ih_item_len(ih) - cpy_bytes);
18207 + set_le_ih_k_type(&n_ih, TYPE_DIRECT);
18208 + set_ih_free_space(&n_ih, MAX_US_INT);
18209 + } else {
18210 + /* indirect item */
18211 + RFALSE(!cpy_bytes && get_ih_free_space(ih),
18212 + "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended");
18213 + set_le_ih_k_offset(&n_ih,
18214 + le_ih_k_offset(ih) +
18215 + (ih_item_len(ih) -
18216 + cpy_bytes) / UNFM_P_SIZE *
18217 + dest->b_size);
18218 + set_le_ih_k_type(&n_ih, TYPE_INDIRECT);
18219 + set_ih_free_space(&n_ih, get_ih_free_space(ih));
18222 + /* set item length */
18223 + put_ih_item_len(&n_ih, cpy_bytes);
18225 + /* Endian safe, both le */
18226 + n_ih.ih_version = ih->ih_version;
18228 + leaf_insert_into_buf(dest_bi, 0, &n_ih,
18229 + item_body(src, item_num) +
18230 + ih_item_len(ih) - cpy_bytes, 0);
18236 + * If cpy_bytes equals minus one then copy cpy_num whole items from SOURCE
18237 + * to DEST. If cpy_bytes is not equal to minus one then copy cpy_num-1 whole
18238 + * items from SOURCE to DEST. From the last item copy cpy_bytes bytes for a
18239 + * regular item and cpy_bytes directory entries for a directory item.
18240 + */
18241 +static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
18242 + int last_first, int cpy_num, int cpy_bytes)
18244 + struct buffer_head *dest;
18245 + int pos, i, src_nr_item, bytes;
18247 + dest = dest_bi->bi_bh;
18248 + RFALSE(!dest || !src, "vs-10210: !dest || !src");
18249 + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
18250 + "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST");
18251 + RFALSE(B_NR_ITEMS(src) < cpy_num,
18252 + "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src),
18253 + cpy_num);
18254 + RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num);
18256 + if (cpy_num == 0)
18257 + return 0;
18259 + if (last_first == FIRST_TO_LAST) {
18260 + /* copy items to left */
18261 + pos = 0;
18262 + if (cpy_num == 1)
18263 + bytes = cpy_bytes;
18264 + else
18265 + bytes = -1;
18267 + /*
18268 + * copy the first item, or part of it, or nothing to the end of
18269 + * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes))
18270 + */
18271 + i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes);
18272 + cpy_num -= i;
18273 + if (cpy_num == 0)
18274 + return i;
18275 + pos += i;
18276 + if (cpy_bytes == -1)
18277 + /*
18278 + * copy first cpy_num items starting from position
18279 + * 'pos' of SOURCE to end of DEST
18280 + */
18281 + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
18282 + pos, cpy_num);
18283 + else {
18284 + /*
18285 + * copy first cpy_num-1 items starting from position
18286 + * 'pos' of the SOURCE to the end of the DEST
18287 + */
18288 + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
18289 + pos, cpy_num - 1);
18291 + /*
18292 + * copy part of the item which number is
18293 + * cpy_num+pos-1 to the end of the DEST
18294 + */
18295 + leaf_item_bottle(dest_bi, src, FIRST_TO_LAST,
18296 + cpy_num + pos - 1, cpy_bytes);
18298 + } else {
18299 + /* copy items to right */
18300 + src_nr_item = B_NR_ITEMS(src);
18301 + if (cpy_num == 1)
18302 + bytes = cpy_bytes;
18303 + else
18304 + bytes = -1;
18306 + /*
18307 + * copy the last item, or part of it, or nothing to the
18308 + * begin of the DEST
18309 + * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes));
18310 + */
18311 + i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes);
18313 + cpy_num -= i;
18314 + if (cpy_num == 0)
18315 + return i;
18317 + pos = src_nr_item - cpy_num - i;
18318 + if (cpy_bytes == -1) {
18319 + /*
18320 + * starting from position 'pos' copy last cpy_num
18321 + * items of SOURCE to begin of DEST
18322 + */
18323 + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
18324 + pos, cpy_num);
18325 + } else {
18326 + /*
18327 + * copy last cpy_num-1 items starting from position
18328 + * 'pos+1' of the SOURCE to the begin of the DEST;
18329 + */
18330 + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
18331 + pos + 1, cpy_num - 1);
18333 + /*
18334 + * copy part of the item which number is pos to
18335 + * the begin of the DEST
18336 + */
18337 + leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos,
18338 + cpy_bytes);
18341 + return i;
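
The cpy_bytes convention above is easy to misread, so here is a standalone user-space sketch (not part of the patch) of how leaf_copy_items() splits the work: cpy_bytes == -1 means whole items only, anything else means cpy_num - 1 whole items plus a partial last one. The plain int lengths are simplified stand-ins for the on-disk item bodies.

#include <stdio.h>

static int bytes_to_copy(const int *item_len, int cpy_num, int cpy_bytes)
{
        int i, total = 0;

        if (cpy_bytes == -1) {
                for (i = 0; i < cpy_num; i++)
                        total += item_len[i];   /* whole items only */
        } else {
                for (i = 0; i < cpy_num - 1; i++)
                        total += item_len[i];   /* whole items ... */
                total += cpy_bytes;             /* ... plus a partial last one */
        }
        return total;
}

int main(void)
{
        int lens[] = { 100, 200, 300 };

        printf("%d\n", bytes_to_copy(lens, 3, -1)); /* 600: all whole */
        printf("%d\n", bytes_to_copy(lens, 3, 50)); /* 350: 100+200+50 */
        return 0;
}
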
18345 + * there are three types of copying: from S[0] to L[0], from S[0] to R[0],
18346 + * from R[0] to L[0]. for each of these we have to define parent and
18347 + * positions of destination and source buffers
18348 + */
18349 +static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
18350 + struct buffer_info *dest_bi,
18351 + struct buffer_info *src_bi,
18352 + int *first_last,
18353 + struct buffer_head *Snew)
18355 + memset(dest_bi, 0, sizeof(struct buffer_info));
18356 + memset(src_bi, 0, sizeof(struct buffer_info));
18358 + /* define dest, src, dest parent, dest position */
18359 + switch (shift_mode) {
18360 + case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */
18361 + src_bi->tb = tb;
18362 + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
18363 + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
18365 + /* src->b_item_order */
18366 + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
18367 + dest_bi->tb = tb;
18368 + dest_bi->bi_bh = tb->L[0];
18369 + dest_bi->bi_parent = tb->FL[0];
18370 + dest_bi->bi_position = get_left_neighbor_position(tb, 0);
18371 + *first_last = FIRST_TO_LAST;
18372 + break;
18374 + case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */
18375 + src_bi->tb = tb;
18376 + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
18377 + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
18378 + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
18379 + dest_bi->tb = tb;
18380 + dest_bi->bi_bh = tb->R[0];
18381 + dest_bi->bi_parent = tb->FR[0];
18382 + dest_bi->bi_position = get_right_neighbor_position(tb, 0);
18383 + *first_last = LAST_TO_FIRST;
18384 + break;
18386 + case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */
18387 + src_bi->tb = tb;
18388 + src_bi->bi_bh = tb->R[0];
18389 + src_bi->bi_parent = tb->FR[0];
18390 + src_bi->bi_position = get_right_neighbor_position(tb, 0);
18391 + dest_bi->tb = tb;
18392 + dest_bi->bi_bh = tb->L[0];
18393 + dest_bi->bi_parent = tb->FL[0];
18394 + dest_bi->bi_position = get_left_neighbor_position(tb, 0);
18395 + *first_last = FIRST_TO_LAST;
18396 + break;
18398 + case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */
18399 + src_bi->tb = tb;
18400 + src_bi->bi_bh = tb->L[0];
18401 + src_bi->bi_parent = tb->FL[0];
18402 + src_bi->bi_position = get_left_neighbor_position(tb, 0);
18403 + dest_bi->tb = tb;
18404 + dest_bi->bi_bh = tb->R[0];
18405 + dest_bi->bi_parent = tb->FR[0];
18406 + dest_bi->bi_position = get_right_neighbor_position(tb, 0);
18407 + *first_last = LAST_TO_FIRST;
18408 + break;
18410 + case LEAF_FROM_S_TO_SNEW:
18411 + src_bi->tb = tb;
18412 + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
18413 + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
18414 + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
18415 + dest_bi->tb = tb;
18416 + dest_bi->bi_bh = Snew;
18417 + dest_bi->bi_parent = NULL;
18418 + dest_bi->bi_position = 0;
18419 + *first_last = LAST_TO_FIRST;
18420 + break;
18422 + default:
18423 + reiserfs_panic(sb_from_bi(src_bi), "vs-10250",
18424 + "shift type is unknown (%d)", shift_mode);
18426 + RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh,
18427 + "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly",
18428 + shift_mode, src_bi->bi_bh, dest_bi->bi_bh);
18432 + * copy mov_num items and mov_bytes of the (mov_num-1)th item to
18433 + * neighbor. Delete them from source
18434 + */
18435 +int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
18436 + int mov_bytes, struct buffer_head *Snew)
18438 + int ret_value;
18439 + struct buffer_info dest_bi, src_bi;
18440 + int first_last;
18442 + leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi,
18443 + &first_last, Snew);
18445 + ret_value =
18446 + leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num,
18447 + mov_bytes);
18449 + leaf_delete_items(&src_bi, first_last,
18450 + (first_last ==
18451 + FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) -
18452 + mov_num), mov_num, mov_bytes);
18454 + return ret_value;
18458 + * Shift shift_num items (and shift_bytes of last shifted item if
18459 + * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key
18460 + */
18461 +int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
18463 + struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path);
18464 + int i;
18466 + /*
18467 + * move shift_num (and shift_bytes bytes) items from S[0]
18468 + * to left neighbor L[0]
18469 + */
18470 + i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL);
18472 + if (shift_num) {
18473 + /* number of items in S[0] == 0 */
18474 + if (B_NR_ITEMS(S0) == 0) {
18476 + RFALSE(shift_bytes != -1,
18477 + "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)",
18478 + shift_bytes);
18479 +#ifdef CONFIG_REISERFS_CHECK
18480 + if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) {
18481 + print_cur_tb("vs-10275");
18482 + reiserfs_panic(tb->tb_sb, "vs-10275",
18483 + "balance condition corrupted "
18484 + "(%c)", tb->tb_mode);
18486 +#endif
18488 + if (PATH_H_POSITION(tb->tb_path, 1) == 0)
18489 + replace_key(tb, tb->CFL[0], tb->lkey[0],
18490 + PATH_H_PPARENT(tb->tb_path, 0), 0);
18492 + } else {
18493 + /* replace lkey in CFL[0] by 0-th key from S[0]; */
18494 + replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0);
18496 + RFALSE((shift_bytes != -1 &&
18497 + !(is_direntry_le_ih(item_head(S0, 0))
18498 + && !ih_entry_count(item_head(S0, 0)))) &&
18499 + (!op_is_left_mergeable
18500 + (leaf_key(S0, 0), S0->b_size)),
18501 + "vs-10280: item must be mergeable");
18505 + return i;
18508 +/* CLEANING STOPPED HERE */
18511 + * Shift shift_num (shift_bytes) items from S[0] to the right neighbor,
18512 + * and replace the delimiting key
18513 + */
18514 +int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes)
18516 + int ret_value;
18518 + /*
18519 + * move shift_num (and shift_bytes) items from S[0] to
18520 + * right neighbor R[0]
18521 + */
18522 + ret_value =
18523 + leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL);
18525 + /* replace rkey in CFR[0] by the 0-th key from R[0] */
18526 + if (shift_num) {
18527 + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
18531 + return ret_value;
18534 +static void leaf_delete_items_entirely(struct buffer_info *bi,
18535 + int first, int del_num);
18537 + * If del_bytes == -1, delete del_num whole items from buffer CUR,
18538 + * starting at position 'first'.
18539 + * Otherwise:
18540 + * If last_first == 0, delete del_num-1 whole items starting at
18541 + * position 'first', then cut del_bytes from the body of the first
18542 + * remaining item; don't delete the first item header.
18543 + * If last_first == 1, delete del_num-1 whole items starting at
18544 + * position 'first+1', then cut del_bytes from the body of the last
18545 + * item; don't delete the last item header.
18547 +void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
18548 + int first, int del_num, int del_bytes)
18550 + struct buffer_head *bh;
18551 + int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh);
18553 + RFALSE(!bh, "10155: bh is not defined");
18554 + RFALSE(del_num < 0, "10160: del_num can not be < 0. del_num==%d",
18555 + del_num);
18556 + RFALSE(first < 0
18557 + || first + del_num > item_amount,
18558 + "10165: invalid number of first item to be deleted (%d) or "
18559 + "no so much items (%d) to delete (only %d)", first,
18560 + first + del_num, item_amount);
18562 + if (del_num == 0)
18563 + return;
18565 + if (first == 0 && del_num == item_amount && del_bytes == -1) {
18566 + make_empty_node(cur_bi);
18567 + do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0);
18568 + return;
18571 + if (del_bytes == -1)
18572 + /* delete del_num items beginning from item in position first */
18573 + leaf_delete_items_entirely(cur_bi, first, del_num);
18574 + else {
18575 + if (last_first == FIRST_TO_LAST) {
18576 + /*
18577 + * delete del_num-1 items beginning from
18578 + * item in position first
18579 + */
18580 + leaf_delete_items_entirely(cur_bi, first, del_num - 1);
18582 + /*
18583 + * delete the part of the first item of the bh
18584 + * do not delete item header
18585 + */
18586 + leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes);
18587 + } else {
18588 + struct item_head *ih;
18589 + int len;
18591 + /*
18592 + * delete del_num-1 items beginning from
18593 + * item in position first+1
18594 + */
18595 + leaf_delete_items_entirely(cur_bi, first + 1,
18596 + del_num - 1);
18598 + ih = item_head(bh, B_NR_ITEMS(bh) - 1);
18599 + if (is_direntry_le_ih(ih))
18600 + /* the last item is directory */
18601 + /*
18602 + * len = numbers of directory entries
18603 + * in this item
18604 + */
18605 + len = ih_entry_count(ih);
18606 + else
18607 + /* len = body len of item */
18608 + len = ih_item_len(ih);
18610 + /*
18611 + * delete the part of the last item of the bh
18612 + * do not delete item header
18613 + */
18614 + leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1,
18615 + len - del_bytes, del_bytes);
18620 +/* insert item into the leaf node in position before */
18621 +void leaf_insert_into_buf(struct buffer_info *bi, int before,
18622 + struct item_head * const inserted_item_ih,
18623 + const char * const inserted_item_body,
18624 + int zeros_number)
18626 + struct buffer_head *bh = bi->bi_bh;
18627 + int nr, free_space;
18628 + struct block_head *blkh;
18629 + struct item_head *ih;
18630 + int i;
18631 + int last_loc, unmoved_loc;
18632 + char *to;
18634 + blkh = B_BLK_HEAD(bh);
18635 + nr = blkh_nr_item(blkh);
18636 + free_space = blkh_free_space(blkh);
18638 + /* check free space */
18639 + RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE,
18640 + "vs-10170: not enough free space in block %z, new item %h",
18641 + bh, inserted_item_ih);
18642 + RFALSE(zeros_number > ih_item_len(inserted_item_ih),
18643 + "vs-10172: zero number == %d, item length == %d",
18644 + zeros_number, ih_item_len(inserted_item_ih));
18646 + /* get the item before which the new item must be inserted */
18647 + ih = item_head(bh, before);
18649 + /* prepare space for the body of new item */
18650 + last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size;
18651 + unmoved_loc = before ? ih_location(ih - 1) : bh->b_size;
18653 + memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih),
18654 + bh->b_data + last_loc, unmoved_loc - last_loc);
18656 + to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih);
18657 + memset(to, 0, zeros_number);
18658 + to += zeros_number;
18660 + /* copy body to prepared space */
18661 + if (inserted_item_body)
18662 + memmove(to, inserted_item_body,
18663 + ih_item_len(inserted_item_ih) - zeros_number);
18664 + else
18665 + memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number);
18667 + /* insert item header */
18668 + memmove(ih + 1, ih, IH_SIZE * (nr - before));
18669 + memmove(ih, inserted_item_ih, IH_SIZE);
18671 + /* change locations */
18672 + for (i = before; i < nr + 1; i++) {
18673 + unmoved_loc -= ih_item_len(&ih[i - before]);
18674 + put_ih_location(&ih[i - before], unmoved_loc);
18677 + /* sizes, free space, item number */
18678 + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
18679 + set_blkh_free_space(blkh,
18680 + free_space - (IH_SIZE +
18681 + ih_item_len(inserted_item_ih)));
18682 + do_balance_mark_leaf_dirty(bi->tb, bh, 1);
18684 + if (bi->bi_parent) {
18685 + struct disk_child *t_dc;
18686 + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
18687 + put_dc_size(t_dc,
18688 + dc_size(t_dc) + (IH_SIZE +
18689 + ih_item_len(inserted_item_ih)));
18690 + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
18695 + * paste paste_size bytes to affected_item_num-th item.
18696 + * When the item is a directory, this only prepares space for new entries
18697 + */
18698 +void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
18699 + int pos_in_item, int paste_size,
18700 + const char *body, int zeros_number)
18702 + struct buffer_head *bh = bi->bi_bh;
18703 + int nr, free_space;
18704 + struct block_head *blkh;
18705 + struct item_head *ih;
18706 + int i;
18707 + int last_loc, unmoved_loc;
18709 + blkh = B_BLK_HEAD(bh);
18710 + nr = blkh_nr_item(blkh);
18711 + free_space = blkh_free_space(blkh);
18713 + /* check free space */
18714 + RFALSE(free_space < paste_size,
18715 + "vs-10175: not enough free space: needed %d, available %d",
18716 + paste_size, free_space);
18718 +#ifdef CONFIG_REISERFS_CHECK
18719 + if (zeros_number > paste_size) {
18720 + struct super_block *sb = NULL;
18721 + if (bi && bi->tb)
18722 + sb = bi->tb->tb_sb;
18723 + print_cur_tb("10177");
18724 + reiserfs_panic(sb, "vs-10177",
18725 + "zeros_number == %d, paste_size == %d",
18726 + zeros_number, paste_size);
18728 +#endif /* CONFIG_REISERFS_CHECK */
18730 + /* item to be appended */
18731 + ih = item_head(bh, affected_item_num);
18733 + last_loc = ih_location(&ih[nr - affected_item_num - 1]);
18734 + unmoved_loc = affected_item_num ? ih_location(ih - 1) : bh->b_size;
18736 + /* prepare space */
18737 + memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc,
18738 + unmoved_loc - last_loc);
18740 + /* change locations */
18741 + for (i = affected_item_num; i < nr; i++)
18742 + put_ih_location(&ih[i - affected_item_num],
18743 + ih_location(&ih[i - affected_item_num]) -
18744 + paste_size);
18746 + if (body) {
18747 + if (!is_direntry_le_ih(ih)) {
18748 + if (!pos_in_item) {
18749 + /* shift data to right */
18750 + memmove(bh->b_data + ih_location(ih) +
18751 + paste_size,
18752 + bh->b_data + ih_location(ih),
18753 + ih_item_len(ih));
18754 + /* paste data in the head of item */
18755 + memset(bh->b_data + ih_location(ih), 0,
18756 + zeros_number);
18757 + memcpy(bh->b_data + ih_location(ih) +
18758 + zeros_number, body,
18759 + paste_size - zeros_number);
18760 + } else {
18761 + memset(bh->b_data + unmoved_loc - paste_size, 0,
18762 + zeros_number);
18763 + memcpy(bh->b_data + unmoved_loc - paste_size +
18764 + zeros_number, body,
18765 + paste_size - zeros_number);
18768 + } else
18769 + memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size);
18771 + put_ih_item_len(ih, ih_item_len(ih) + paste_size);
18773 + /* change free space */
18774 + set_blkh_free_space(blkh, free_space - paste_size);
18776 + do_balance_mark_leaf_dirty(bi->tb, bh, 0);
18778 + if (bi->bi_parent) {
18779 + struct disk_child *t_dc =
18780 + B_N_CHILD(bi->bi_parent, bi->bi_position);
18781 + put_dc_size(t_dc, dc_size(t_dc) + paste_size);
18782 + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
18787 + * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item
18788 + * does not have free space, so it moves DEHs and remaining records as
18789 + * necessary. Return value is size of removed part of directory item
18790 + * in bytes.
18791 + */
18792 +static int leaf_cut_entries(struct buffer_head *bh,
18793 + struct item_head *ih, int from, int del_count)
18795 + char *item;
18796 + struct reiserfs_de_head *deh;
18797 + int prev_record_offset; /* offset of the (from-1)th record */
18798 + char *prev_record; /* pointer to that record */
18799 + int cut_records_len; /* length of all removed records */
18800 + int i;
18802 + /*
18803 + * make sure that item is directory and there are enough entries to
18804 + * remove
18805 + */
18806 + RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item");
18807 + RFALSE(ih_entry_count(ih) < from + del_count,
18808 + "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d",
18809 + ih_entry_count(ih), from, del_count);
18811 + if (del_count == 0)
18812 + return 0;
18814 + /* first byte of item */
18815 + item = bh->b_data + ih_location(ih);
18817 + /* entry head array */
18818 + deh = B_I_DEH(bh, ih);
18820 + /*
18821 + * first byte of remaining entries, those are BEFORE cut entries
18822 + * (prev_record) and length of all removed records (cut_records_len)
18823 + */
18824 + prev_record_offset =
18825 + (from ? deh_location(&deh[from - 1]) : ih_item_len(ih));
18826 + cut_records_len = prev_record_offset /*from_record */ -
18827 + deh_location(&deh[from + del_count - 1]);
18828 + prev_record = item + prev_record_offset;
18830 + /* adjust locations of remaining entries */
18831 + for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--)
18832 + put_deh_location(&deh[i],
18833 + deh_location(&deh[i]) -
18834 + (DEH_SIZE * del_count));
18836 + for (i = 0; i < from; i++)
18837 + put_deh_location(&deh[i],
18838 + deh_location(&deh[i]) - (DEH_SIZE * del_count +
18839 + cut_records_len));
18841 + put_ih_entry_count(ih, ih_entry_count(ih) - del_count);
18843 + /* shift the entry head array and the entries that are AFTER the removed entries */
18844 + memmove((char *)(deh + from),
18845 + deh + from + del_count,
18846 + prev_record - cut_records_len - (char *)(deh + from +
18847 + del_count));
18849 + /* shift the records that are BEFORE the removed entries */
18850 + memmove(prev_record - cut_records_len - DEH_SIZE * del_count,
18851 + prev_record, item + ih_item_len(ih) - prev_record);
18853 + return DEH_SIZE * del_count + cut_records_len;
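
The return value of leaf_cut_entries() is just header bytes plus record bytes. A standalone arithmetic sketch with made-up locations (DEH_SIZE here is an assumed stand-in); record locations descend from the item end, so the cut span is the (from-1)th record's offset minus the location of the last cut entry.

#include <stdio.h>

#define DEH_SIZE 16                     /* assumed size of one entry head */

int main(void)
{
        /* deh_location of each entry's record, descending */
        int loc[] = { 200, 160, 120, 80 };
        int item_len = 240, from = 1, del_count = 2;

        int prev = from ? loc[from - 1] : item_len;             /* 200 */
        int cut_records_len = prev - loc[from + del_count - 1]; /* 80 */

        printf("removed %d bytes\n", DEH_SIZE * del_count + cut_records_len);
        return 0;
}
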
18857 + * when cut item is part of regular file
18858 + * pos_in_item - first byte that must be cut
18859 + * cut_size - number of bytes to be cut beginning from pos_in_item
18861 + * when cut item is part of directory
18862 + * pos_in_item - number of first deleted entry
18863 + * cut_size - count of deleted entries
18864 + */
18865 +void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
18866 + int pos_in_item, int cut_size)
18868 + int nr;
18869 + struct buffer_head *bh = bi->bi_bh;
18870 + struct block_head *blkh;
18871 + struct item_head *ih;
18872 + int last_loc, unmoved_loc;
18873 + int i;
18875 + blkh = B_BLK_HEAD(bh);
18876 + nr = blkh_nr_item(blkh);
18878 + /* item head of truncated item */
18879 + ih = item_head(bh, cut_item_num);
18881 + if (is_direntry_le_ih(ih)) {
18882 + /* first cut entry () */
18883 + cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size);
18884 + if (pos_in_item == 0) {
18885 + /* change key */
18886 + RFALSE(cut_item_num,
18887 + "when 0-th enrty of item is cut, that item must be first in the node, not %d-th",
18888 + cut_item_num);
18889 + /* change item key by key of first entry in the item */
18890 + set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih)));
18892 + } else {
18893 + /* item is direct or indirect */
18894 + RFALSE(is_statdata_le_ih(ih), "10195: item is stat data");
18895 + RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih),
18896 + "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)",
18897 + (long unsigned)pos_in_item, (long unsigned)cut_size,
18898 + (long unsigned)ih_item_len(ih));
18900 + /* shift item body to left if cut is from the head of item */
18901 + if (pos_in_item == 0) {
18902 + memmove(bh->b_data + ih_location(ih),
18903 + bh->b_data + ih_location(ih) + cut_size,
18904 + ih_item_len(ih) - cut_size);
18906 + /* change key of item */
18907 + if (is_direct_le_ih(ih))
18908 + set_le_ih_k_offset(ih,
18909 + le_ih_k_offset(ih) +
18910 + cut_size);
18911 + else {
18912 + set_le_ih_k_offset(ih,
18913 + le_ih_k_offset(ih) +
18914 + (cut_size / UNFM_P_SIZE) *
18915 + bh->b_size);
18916 + RFALSE(ih_item_len(ih) == cut_size
18917 + && get_ih_free_space(ih),
18918 + "10205: invalid ih_free_space (%h)", ih);
18923 + /* location of the last item */
18924 + last_loc = ih_location(&ih[nr - cut_item_num - 1]);
18926 + /* location of the item, which is remaining at the same place */
18927 + unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size;
18929 + /* shift */
18930 + memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc,
18931 + unmoved_loc - last_loc - cut_size);
18933 + /* change item length */
18934 + put_ih_item_len(ih, ih_item_len(ih) - cut_size);
18936 + if (is_indirect_le_ih(ih)) {
18937 + if (pos_in_item)
18938 + set_ih_free_space(ih, 0);
18941 + /* change locations */
18942 + for (i = cut_item_num; i < nr; i++)
18943 + put_ih_location(&ih[i - cut_item_num],
18944 + ih_location(&ih[i - cut_item_num]) + cut_size);
18946 + /* size, free space */
18947 + set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size);
18949 + do_balance_mark_leaf_dirty(bi->tb, bh, 0);
18951 + if (bi->bi_parent) {
18952 + struct disk_child *t_dc;
18953 + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
18954 + put_dc_size(t_dc, dc_size(t_dc) - cut_size);
18955 + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
18959 +/* delete del_num items from buffer starting from the first'th item */
18960 +static void leaf_delete_items_entirely(struct buffer_info *bi,
18961 + int first, int del_num)
18963 + struct buffer_head *bh = bi->bi_bh;
18964 + int nr;
18965 + int i, j;
18966 + int last_loc, last_removed_loc;
18967 + struct block_head *blkh;
18968 + struct item_head *ih;
18970 + RFALSE(bh == NULL, "10210: buffer is 0");
18971 + RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num);
18973 + if (del_num == 0)
18974 + return;
18976 + blkh = B_BLK_HEAD(bh);
18977 + nr = blkh_nr_item(blkh);
18979 + RFALSE(first < 0 || first + del_num > nr,
18980 + "10220: first=%d, number=%d, there is %d items", first, del_num,
18981 + nr);
18983 + if (first == 0 && del_num == nr) {
18984 + /* this does not work */
18985 + make_empty_node(bi);
18987 + do_balance_mark_leaf_dirty(bi->tb, bh, 0);
18988 + return;
18991 + ih = item_head(bh, first);
18993 + /* location of unmovable item */
18994 + j = (first == 0) ? bh->b_size : ih_location(ih - 1);
18996 + /* delete items */
18997 + last_loc = ih_location(&ih[nr - 1 - first]);
18998 + last_removed_loc = ih_location(&ih[del_num - 1]);
19000 + memmove(bh->b_data + last_loc + j - last_removed_loc,
19001 + bh->b_data + last_loc, last_removed_loc - last_loc);
19003 + /* delete item headers */
19004 + memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE);
19006 + /* change item location */
19007 + for (i = first; i < nr - del_num; i++)
19008 + put_ih_location(&ih[i - first],
19009 + ih_location(&ih[i - first]) + (j -
19010 + last_removed_loc));
19012 + /* sizes, item number */
19013 + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
19014 + set_blkh_free_space(blkh,
19015 + blkh_free_space(blkh) + (j - last_removed_loc +
19016 + IH_SIZE * del_num));
19018 + do_balance_mark_leaf_dirty(bi->tb, bh, 0);
19020 + if (bi->bi_parent) {
19021 + struct disk_child *t_dc =
19022 + B_N_CHILD(bi->bi_parent, bi->bi_position);
19023 + put_dc_size(t_dc,
19024 + dc_size(t_dc) - (j - last_removed_loc +
19025 + IH_SIZE * del_num));
19026 + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
19031 + * paste new_entry_count entries (new_dehs, records) into position
19032 + * 'before' of the item_num-th item
19033 + */
19034 +void leaf_paste_entries(struct buffer_info *bi,
19035 + int item_num,
19036 + int before,
19037 + int new_entry_count,
19038 + struct reiserfs_de_head *new_dehs,
19039 + const char *records, int paste_size)
19041 + struct item_head *ih;
19042 + char *item;
19043 + struct reiserfs_de_head *deh;
19044 + char *insert_point;
19045 + int i;
19046 + struct buffer_head *bh = bi->bi_bh;
19048 + if (new_entry_count == 0)
19049 + return;
19051 + ih = item_head(bh, item_num);
19053 + /*
19054 + * make sure, that item is directory, and there are enough
19055 + * records in it
19056 + */
19057 + RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item");
19058 + RFALSE(ih_entry_count(ih) < before,
19059 + "10230: there are no entry we paste entries before. entry_count = %d, before = %d",
19060 + ih_entry_count(ih), before);
19062 + /* first byte of dest item */
19063 + item = bh->b_data + ih_location(ih);
19065 + /* entry head array */
19066 + deh = B_I_DEH(bh, ih);
19068 + /* new records will be pasted at this point */
19069 + insert_point =
19070 + item +
19071 + (before ? deh_location(&deh[before - 1])
19072 + : (ih_item_len(ih) - paste_size));
19074 + /* adjust locations of records that will be AFTER new records */
19075 + for (i = ih_entry_count(ih) - 1; i >= before; i--)
19076 + put_deh_location(&deh[i],
19077 + deh_location(&deh[i]) +
19078 + (DEH_SIZE * new_entry_count));
19080 + /* adjust locations of records that will be BEFORE new records */
19081 + for (i = 0; i < before; i++)
19082 + put_deh_location(&deh[i],
19083 + deh_location(&deh[i]) + paste_size);
19085 + put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count);
19087 + /* prepare space for pasted records */
19088 + memmove(insert_point + paste_size, insert_point,
19089 + item + (ih_item_len(ih) - paste_size) - insert_point);
19091 + /* copy new records */
19092 + memcpy(insert_point + DEH_SIZE * new_entry_count, records,
19093 + paste_size - DEH_SIZE * new_entry_count);
19095 + /* prepare space for new entry heads */
19096 + deh += before;
19097 + memmove((char *)(deh + new_entry_count), deh,
19098 + insert_point - (char *)deh);
19100 + /* copy new entry heads */
19101 + deh = (struct reiserfs_de_head *)((char *)deh);
19102 + memcpy(deh, new_dehs, DEH_SIZE * new_entry_count);
19104 + /* set locations of new records */
19105 + for (i = 0; i < new_entry_count; i++) {
19106 + put_deh_location(&deh[i],
19107 + deh_location(&deh[i]) +
19108 + (-deh_location
19109 + (&new_dehs[new_entry_count - 1]) +
19110 + insert_point + DEH_SIZE * new_entry_count -
19111 + item));
19114 + /* change the item key if necessary (when we paste before the 0-th entry) */
19115 + if (!before) {
19116 + set_le_ih_k_offset(ih, deh_offset(new_dehs));
19118 +#ifdef CONFIG_REISERFS_CHECK
19120 + int prev, next;
19121 + /* check record locations */
19122 + deh = B_I_DEH(bh, ih);
19123 + for (i = 0; i < ih_entry_count(ih); i++) {
19124 + next =
19125 + (i <
19126 + ih_entry_count(ih) -
19127 + 1) ? deh_location(&deh[i + 1]) : 0;
19128 + prev = (i != 0) ? deh_location(&deh[i - 1]) : 0;
19130 + if (prev && prev <= deh_location(&deh[i]))
19131 + reiserfs_error(sb_from_bi(bi), "vs-10240",
19132 + "directory item (%h) "
19133 + "corrupted (prev %a, "
19134 + "cur(%d) %a)",
19135 + ih, deh + i - 1, i, deh + i);
19136 + if (next && next >= deh_location(&deh[i]))
19137 + reiserfs_error(sb_from_bi(bi), "vs-10250",
19138 + "directory item (%h) "
19139 + "corrupted (cur(%d) %a, "
19140 + "next %a)",
19141 + ih, i, deh + i, deh + i + 1);
19144 +#endif
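
A standalone sketch of the two location-adjustment loops in leaf_paste_entries(): entries after the paste point move only by the inserted entry heads, entries before it by the full paste_size, since the records sit at the tail of the item. All values below are made up.

#include <stdio.h>

#define DEH_SIZE 16                     /* assumed entry-head size */

int main(void)
{
        int loc[] = { 200, 160, 120 };  /* deh_location, descending */
        int before = 1, new_count = 1, rec_len = 24;
        int paste_size = DEH_SIZE * new_count + rec_len;
        int i;

        for (i = 2; i >= before; i--)   /* entries after the new ones */
                loc[i] += DEH_SIZE * new_count;
        for (i = 0; i < before; i++)    /* entries before the new ones */
                loc[i] += paste_size;

        for (i = 0; i < 3; i++)
                printf("loc[%d]=%d\n", i, loc[i]); /* 240, 176, 136 */
        return 0;
}
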
19147 diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
19148 new file mode 100644
19149 index 000000000000..46bd7bd63a71
19150 --- /dev/null
19151 +++ b/fs/reiserfs/lock.c
19152 @@ -0,0 +1,101 @@
19153 +// SPDX-License-Identifier: GPL-2.0
19154 +#include "reiserfs.h"
19155 +#include <linux/mutex.h>
19158 + * The previous reiserfs locking scheme was heavily based on
19159 + * the tricky properties of the Bkl:
19161 + * - it was acquired recursively by the same task
19162 + * - performance relied on the release-while-schedule() property
19164 + * Now that we replace it by a mutex, we still want to keep the same
19165 + * recursive property to avoid big changes in the code structure.
19166 + * We use our own lock_owner here because the owner field on a mutex
19167 + * is only available in SMP or mutex debugging, also we only need this field
19168 + * for this mutex, no need for a system wide mutex facility.
19170 + * Also this lock is often released before a call that could block because
19171 + * reiserfs performance was partially based on the release-while-schedule()
19172 + * property of the Bkl.
19173 + */
19174 +void reiserfs_write_lock(struct super_block *s)
19176 + struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
19178 + if (sb_i->lock_owner != current) {
19179 + mutex_lock(&sb_i->lock);
19180 + sb_i->lock_owner = current;
19183 + /* No need to protect it, only the current task touches it */
19184 + sb_i->lock_depth++;
19187 +void reiserfs_write_unlock(struct super_block *s)
19189 + struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
19191 + /*
19192 + * Are we unlocking without even holding the lock?
19193 + * Such a situation must raise a BUG() if we don't want
19194 + * to corrupt the data.
19195 + */
19196 + BUG_ON(sb_i->lock_owner != current);
19198 + if (--sb_i->lock_depth == -1) {
19199 + sb_i->lock_owner = NULL;
19200 + mutex_unlock(&sb_i->lock);
19204 +int __must_check reiserfs_write_unlock_nested(struct super_block *s)
19206 + struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
19207 + int depth;
19209 + /* this can happen when the lock isn't always held */
19210 + if (sb_i->lock_owner != current)
19211 + return -1;
19213 + depth = sb_i->lock_depth;
19215 + sb_i->lock_depth = -1;
19216 + sb_i->lock_owner = NULL;
19217 + mutex_unlock(&sb_i->lock);
19219 + return depth;
19222 +void reiserfs_write_lock_nested(struct super_block *s, int depth)
19224 + struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
19226 + /* this can happen when the lock isn't always held */
19227 + if (depth == -1)
19228 + return;
19230 + mutex_lock(&sb_i->lock);
19231 + sb_i->lock_owner = current;
19232 + sb_i->lock_depth = depth;
19236 + * Utility function to warn if it is called without the superblock
19237 + * write lock held. caller is the string naming the call site
19238 + */
19239 +void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
19241 + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
19243 + WARN_ON(sb_i->lock_depth < 0);
19246 +#ifdef CONFIG_REISERFS_CHECK
19247 +void reiserfs_lock_check_recursive(struct super_block *sb)
19249 + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
19251 + WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n");
19253 +#endif
19254 diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
19255 new file mode 100644
19256 index 000000000000..7e7b531fcc49
19257 --- /dev/null
19258 +++ b/fs/reiserfs/namei.c
19259 @@ -0,0 +1,1725 @@
19261 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
19263 + * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility
19265 + * Trivial Changes:
19266 + * Rights granted to Hans Reiser to redistribute under other terms providing
19267 + * he accepts all liability including but not limited to patent, fitness
19268 + * for purpose, and direct or indirect claims arising from failure to perform.
19270 + * NO WARRANTY
19271 + */
19273 +#include <linux/time.h>
19274 +#include <linux/bitops.h>
19275 +#include <linux/slab.h>
19276 +#include "reiserfs.h"
19277 +#include "acl.h"
19278 +#include "xattr.h"
19279 +#include <linux/quotaops.h>
19281 +#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); }
19282 +#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
19285 + * directory item contains array of entry headers. This performs
19286 + * binary search through that array
19287 + */
19288 +static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
19290 + struct item_head *ih = de->de_ih;
19291 + struct reiserfs_de_head *deh = de->de_deh;
19292 + int rbound, lbound, j;
19294 + lbound = 0;
19295 + rbound = ih_entry_count(ih) - 1;
19297 + for (j = (rbound + lbound) / 2; lbound <= rbound;
19298 + j = (rbound + lbound) / 2) {
19299 + if (off < deh_offset(deh + j)) {
19300 + rbound = j - 1;
19301 + continue;
19303 + if (off > deh_offset(deh + j)) {
19304 + lbound = j + 1;
19305 + continue;
19307 + /* the name was not found, but the third key component matched */
19308 + de->de_entry_num = j;
19309 + return NAME_FOUND;
19312 + de->de_entry_num = lbound;
19313 + return NAME_NOT_FOUND;
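
A standalone sketch of the search above: entry offsets inside a directory item are sorted ascending, and on a miss lbound is left at the slot where the searched-for offset would be inserted, which is exactly what de->de_entry_num receives.

#include <stdio.h>

static int bin_search(const int *offs, int count, int off, int *pos)
{
        int lbound = 0, rbound = count - 1, j;

        for (j = (rbound + lbound) / 2; lbound <= rbound;
             j = (rbound + lbound) / 2) {
                if (off < offs[j]) {
                        rbound = j - 1;
                        continue;
                }
                if (off > offs[j]) {
                        lbound = j + 1;
                        continue;
                }
                *pos = j;       /* exact match on the offset */
                return 1;       /* NAME_FOUND analogue */
        }
        *pos = lbound;          /* insertion slot on a miss */
        return 0;               /* NAME_NOT_FOUND analogue */
}

int main(void)
{
        int offs[] = { 8, 16, 32, 64 };
        int pos;

        printf("%d %d\n", bin_search(offs, 4, 32, &pos), pos); /* 1 2 */
        printf("%d %d\n", bin_search(offs, 4, 20, &pos), pos); /* 0 2 */
        return 0;
}
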
19317 + * set the fields of de to point at what the path points to
19318 + */
19319 +static inline void set_de_item_location(struct reiserfs_dir_entry *de,
19320 + struct treepath *path)
19322 + de->de_bh = get_last_bh(path);
19323 + de->de_ih = tp_item_head(path);
19324 + de->de_deh = B_I_DEH(de->de_bh, de->de_ih);
19325 + de->de_item_num = PATH_LAST_POSITION(path);
19329 + * de_bh, de_ih, de_deh (points to first element of array), de_item_num is set
19330 + */
19331 +inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de)
19333 + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
19335 + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
19337 + de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num);
19338 + de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0);
19339 + de->de_name = ih_item_body(de->de_bh, de->de_ih) + deh_location(deh);
19340 + if (de->de_name[de->de_namelen - 1] == 0)
19341 + de->de_namelen = strlen(de->de_name);
19344 +/* what entry points to */
19345 +static inline void set_de_object_key(struct reiserfs_dir_entry *de)
19347 + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
19348 + de->de_dir_id = deh_dir_id(&de->de_deh[de->de_entry_num]);
19349 + de->de_objectid = deh_objectid(&de->de_deh[de->de_entry_num]);
19352 +static inline void store_de_entry_key(struct reiserfs_dir_entry *de)
19354 + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
19356 + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
19358 + /* store key of the found entry */
19359 + de->de_entry_key.version = KEY_FORMAT_3_5;
19360 + de->de_entry_key.on_disk_key.k_dir_id =
19361 + le32_to_cpu(de->de_ih->ih_key.k_dir_id);
19362 + de->de_entry_key.on_disk_key.k_objectid =
19363 + le32_to_cpu(de->de_ih->ih_key.k_objectid);
19364 + set_cpu_key_k_offset(&de->de_entry_key, deh_offset(deh));
19365 + set_cpu_key_k_type(&de->de_entry_key, TYPE_DIRENTRY);
19369 + * We assign a key to each directory item, and place multiple entries in a
19370 + * single directory item. A directory item has a key equal to the key of
19371 + * the first directory entry in it.
19373 + * This function first calls search_by_key, then, if item whose first entry
19374 + * matches is not found it looks for the entry inside directory item found
19375 + * by search_by_key. Fills the path to the entry, and to the entry position
19376 + * in the item
19377 + */
19378 +/* The function is NOT SCHEDULE-SAFE! */
19379 +int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
19380 + struct treepath *path, struct reiserfs_dir_entry *de)
19382 + int retval;
19384 + retval = search_item(sb, key, path);
19385 + switch (retval) {
19386 + case ITEM_NOT_FOUND:
19387 + if (!PATH_LAST_POSITION(path)) {
19388 + reiserfs_error(sb, "vs-7000", "search_by_key "
19389 + "returned item position == 0");
19390 + pathrelse(path);
19391 + return IO_ERROR;
19393 + PATH_LAST_POSITION(path)--;
19394 + break;
19396 + case ITEM_FOUND:
19397 + break;
19399 + case IO_ERROR:
19400 + return retval;
19402 + default:
19403 + pathrelse(path);
19404 + reiserfs_error(sb, "vs-7002", "no path to here");
19405 + return IO_ERROR;
19408 + set_de_item_location(de, path);
19410 +#ifdef CONFIG_REISERFS_CHECK
19411 + if (!is_direntry_le_ih(de->de_ih) ||
19412 + COMP_SHORT_KEYS(&de->de_ih->ih_key, key)) {
19413 + print_block(de->de_bh, 0, -1, -1);
19414 + reiserfs_panic(sb, "vs-7005", "found item %h is not directory "
19415 + "item or does not belong to the same directory "
19416 + "as key %K", de->de_ih, key);
19418 +#endif /* CONFIG_REISERFS_CHECK */
19420 + /*
19421 + * binary search in directory item by third component of the
19422 + * key. sets de->de_entry_num of de
19423 + */
19424 + retval = bin_search_in_dir_item(de, cpu_key_k_offset(key));
19425 + path->pos_in_item = de->de_entry_num;
19426 + if (retval != NAME_NOT_FOUND) {
19427 + /*
19428 + * ugly, but rename needs de_bh, de_deh, de_name,
19429 + * de_namelen, de_objectid set
19430 + */
19431 + set_de_name_and_namelen(de);
19432 + set_de_object_key(de);
19434 + return retval;
19437 +/* Keyed 32-bit hash function using TEA in a Davies-Meyer construction */
19440 + * The third component is hashed, and you can choose from more than
19441 + * one hash function. Per directory hashes are not yet implemented
19442 + * but are thought about. This function should be moved to hashes.c
19443 + * Jedi, please do so. -Hans
19444 + */
19445 +static __u32 get_third_component(struct super_block *s,
19446 + const char *name, int len)
19448 + __u32 res;
19450 + if (!len || (len == 1 && name[0] == '.'))
19451 + return DOT_OFFSET;
19452 + if (len == 2 && name[0] == '.' && name[1] == '.')
19453 + return DOT_DOT_OFFSET;
19455 + res = REISERFS_SB(s)->s_hash_function(name, len);
19457 + /* take bits from 7-th to 30-th including both bounds */
19458 + res = GET_HASH_VALUE(res);
19459 + if (res == 0)
19460 + /*
19461 + * needed so that no name sorts before "." and "..", which have hash
19462 + * value == 0 and generation counters 1 and 2 respectively
19463 + */
19464 + res = 128;
19465 + return res + MAX_GENERATION_NUMBER;
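
A standalone sketch of how a directory offset packs the hash with a generation number, inferred from the "bits 7..30" comment above and the MAX_GENERATION_NUMBER arithmetic; the exact masks live in the reiserfs headers, so treat these as assumptions.

#include <stdio.h>
#include <stdint.h>

#define MAX_GEN            127              /* 7 low bits, assumed */
#define HASH_MASK          0x7fffff80u      /* bits 7..30, assumed */
#define GET_HASH(off)      ((off) & HASH_MASK)
#define GET_GEN(off)       ((off) & MAX_GEN)
#define SET_GEN(off, gen)  (GET_HASH(off) | (gen))

int main(void)
{
        uint32_t h = 0x12345678u & HASH_MASK; /* hashed name, low bits cleared */
        uint32_t off = SET_GEN(h, 5);         /* 6th name with this hash */

        printf("hash=%#x gen=%u\n", (unsigned)GET_HASH(off),
               (unsigned)GET_GEN(off));
        return 0;
}
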
19468 +static int reiserfs_match(struct reiserfs_dir_entry *de,
19469 + const char *name, int namelen)
19471 + int retval = NAME_NOT_FOUND;
19473 + if ((namelen == de->de_namelen) &&
19474 + !memcmp(de->de_name, name, de->de_namelen))
19475 + retval =
19476 + (de_visible(de->de_deh + de->de_entry_num) ? NAME_FOUND :
19477 + NAME_FOUND_INVISIBLE);
19479 + return retval;
19482 +/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */
19484 +/* used when hash collisions exist */
19486 +static int linear_search_in_dir_item(struct cpu_key *key,
19487 + struct reiserfs_dir_entry *de,
19488 + const char *name, int namelen)
19490 + struct reiserfs_de_head *deh = de->de_deh;
19491 + int retval;
19492 + int i;
19494 + i = de->de_entry_num;
19496 + if (i == ih_entry_count(de->de_ih) ||
19497 + GET_HASH_VALUE(deh_offset(deh + i)) !=
19498 + GET_HASH_VALUE(cpu_key_k_offset(key))) {
19499 + i--;
19502 + RFALSE(de->de_deh != B_I_DEH(de->de_bh, de->de_ih),
19503 + "vs-7010: array of entry headers not found");
19505 + deh += i;
19507 + for (; i >= 0; i--, deh--) {
19508 + /* hash value does not match, no need to check whole name */
19509 + if (GET_HASH_VALUE(deh_offset(deh)) !=
19510 + GET_HASH_VALUE(cpu_key_k_offset(key))) {
19511 + return NAME_NOT_FOUND;
19514 + /* mark that this generation number is used */
19515 + if (de->de_gen_number_bit_string)
19516 + set_bit(GET_GENERATION_NUMBER(deh_offset(deh)),
19517 + de->de_gen_number_bit_string);
19519 + /* calculate pointer to name and namelen */
19520 + de->de_entry_num = i;
19521 + set_de_name_and_namelen(de);
19523 + /*
19524 + * de's de_name, de_namelen, de_recordlen are set.
19525 + * Fill the rest.
19526 + */
19527 + if ((retval =
19528 + reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) {
19530 + /* key of pointed object */
19531 + set_de_object_key(de);
19533 + store_de_entry_key(de);
19535 + /* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */
19536 + return retval;
19540 + if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0)
19541 + /*
19542 + * we have reached the leftmost entry in the node. In general we
19543 + * have to go to the left neighbor, but if generation counter
19544 + * is 0 already, we know for sure, that there is no name with
19545 + * the same hash value
19546 + */
19547 + /*
19548 + * FIXME: this works correctly only because the hash value can not
19549 + * be 0. Btw, in case of Yura's hash it is probably possible,
19550 + * so, this is a bug
19551 + */
19552 + return NAME_NOT_FOUND;
19554 + RFALSE(de->de_item_num,
19555 + "vs-7015: two diritems of the same directory in one node?");
19557 + return GOTO_PREVIOUS_ITEM;
19561 + * may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND
19562 + * FIXME: should add something like IOERROR
19563 + */
19564 +static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen,
19565 + struct treepath *path_to_entry,
19566 + struct reiserfs_dir_entry *de)
19568 + struct cpu_key key_to_search;
19569 + int retval;
19571 + if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize))
19572 + return NAME_NOT_FOUND;
19574 + /* we will search for this key in the tree */
19575 + make_cpu_key(&key_to_search, dir,
19576 + get_third_component(dir->i_sb, name, namelen),
19577 + TYPE_DIRENTRY, 3);
19579 + while (1) {
19580 + retval =
19581 + search_by_entry_key(dir->i_sb, &key_to_search,
19582 + path_to_entry, de);
19583 + if (retval == IO_ERROR) {
19584 + reiserfs_error(dir->i_sb, "zam-7001", "io error");
19585 + return IO_ERROR;
19588 + /* compare names for all entries having given hash value */
19589 + retval =
19590 + linear_search_in_dir_item(&key_to_search, de, name,
19591 + namelen);
19592 + /*
19593 + * there is no need to scan directory anymore.
19594 + * Given entry found or does not exist
19595 + */
19596 + if (retval != GOTO_PREVIOUS_ITEM) {
19597 + path_to_entry->pos_in_item = de->de_entry_num;
19598 + return retval;
19601 + /*
19602 + * there is left neighboring item of this directory
19603 + * and given entry can be there
19604 + */
19605 + set_cpu_key_k_offset(&key_to_search,
19606 + le_ih_k_offset(de->de_ih) - 1);
19607 + pathrelse(path_to_entry);
19609 + } /* while (1) */
19612 +static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
19613 + unsigned int flags)
19615 + int retval;
19616 + struct inode *inode = NULL;
19617 + struct reiserfs_dir_entry de;
19618 + INITIALIZE_PATH(path_to_entry);
19620 + if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
19621 + return ERR_PTR(-ENAMETOOLONG);
19623 + reiserfs_write_lock(dir->i_sb);
19625 + de.de_gen_number_bit_string = NULL;
19626 + retval =
19627 + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
19628 + &path_to_entry, &de);
19629 + pathrelse(&path_to_entry);
19630 + if (retval == NAME_FOUND) {
19631 + inode = reiserfs_iget(dir->i_sb,
19632 + (struct cpu_key *)&de.de_dir_id);
19633 + if (!inode || IS_ERR(inode)) {
19634 + reiserfs_write_unlock(dir->i_sb);
19635 + return ERR_PTR(-EACCES);
19638 + /*
19639 + * Propagate the private flag so we know we're
19640 + * in the priv tree. Also clear xattr support
19641 + * since we don't have xattrs on xattr files.
19642 + */
19643 + if (IS_PRIVATE(dir))
19644 + reiserfs_init_priv_inode(inode);
19646 + reiserfs_write_unlock(dir->i_sb);
19647 + if (retval == IO_ERROR) {
19648 + return ERR_PTR(-EIO);
19651 + return d_splice_alias(inode, dentry);
19655 + * looks up the dentry of the parent directory for child.
19656 + * taken from ext2_get_parent
19657 + */
19658 +struct dentry *reiserfs_get_parent(struct dentry *child)
19660 + int retval;
19661 + struct inode *inode = NULL;
19662 + struct reiserfs_dir_entry de;
19663 + INITIALIZE_PATH(path_to_entry);
19664 + struct inode *dir = d_inode(child);
19666 + if (dir->i_nlink == 0) {
19667 + return ERR_PTR(-ENOENT);
19669 + de.de_gen_number_bit_string = NULL;
19671 + reiserfs_write_lock(dir->i_sb);
19672 + retval = reiserfs_find_entry(dir, "..", 2, &path_to_entry, &de);
19673 + pathrelse(&path_to_entry);
19674 + if (retval != NAME_FOUND) {
19675 + reiserfs_write_unlock(dir->i_sb);
19676 + return ERR_PTR(-ENOENT);
19678 + inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&de.de_dir_id);
19679 + reiserfs_write_unlock(dir->i_sb);
19681 + return d_obtain_alias(inode);
19684 +/* add entry to the directory (entry can be hidden).
19686 +insert definition of when hidden directories are used here -Hans
19688 + Does not mark the dir inode dirty; do that after a successful call to it */
19690 +static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
19691 + struct inode *dir, const char *name, int namelen,
19692 + struct inode *inode, int visible)
19694 + struct cpu_key entry_key;
19695 + struct reiserfs_de_head *deh;
19696 + INITIALIZE_PATH(path);
19697 + struct reiserfs_dir_entry de;
19698 + DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1);
19699 + int gen_number;
19701 + /*
19702 + * 48 bytes now and we avoid kmalloc if we
19703 + * create a file with a short name
19704 + */
19705 + char small_buf[32 + DEH_SIZE];
19707 + char *buffer;
19708 + int buflen, paste_size;
19709 + int retval;
19711 + BUG_ON(!th->t_trans_id);
19713 + /* each entry has unique key. compose it */
19714 + make_cpu_key(&entry_key, dir,
19715 + get_third_component(dir->i_sb, name, namelen),
19716 + TYPE_DIRENTRY, 3);
19718 + /* get memory for composing the entry */
19719 + buflen = DEH_SIZE + ROUND_UP(namelen);
19720 + if (buflen > sizeof(small_buf)) {
19721 + buffer = kmalloc(buflen, GFP_NOFS);
19722 + if (!buffer)
19723 + return -ENOMEM;
19724 + } else
19725 + buffer = small_buf;
19727 + paste_size =
19728 + (get_inode_sd_version(dir) ==
19729 + STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen;
19731 + /*
19732 + * fill buffer : directory entry head, name[, dir objectid | ,
19733 + * stat data | ,stat data, dir objectid ]
19734 + */
19735 + deh = (struct reiserfs_de_head *)buffer;
19736 + deh->deh_location = 0; /* JDM Endian safe if 0 */
19737 + put_deh_offset(deh, cpu_key_k_offset(&entry_key));
19738 + deh->deh_state = 0; /* JDM Endian safe if 0 */
19739 + /* put key (ino analog) to de */
19741 + /* safe: k_dir_id is le */
19742 + deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id;
19743 + /* safe: k_objectid is le */
19744 + deh->deh_objectid = INODE_PKEY(inode)->k_objectid;
19746 + /* copy name */
19747 + memcpy((char *)(deh + 1), name, namelen);
19748 + /* pad with 0s to the 4 byte boundary */
19749 + padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen);
19751 + /*
19752 + * entry is ready to be pasted into tree, set 'visibility'
19753 + * and 'stat data in entry' attributes
19754 + */
19755 + mark_de_without_sd(deh);
19756 + visible ? mark_de_visible(deh) : mark_de_hidden(deh);
19758 + /* find the proper place for the new entry */
19759 + memset(bit_string, 0, sizeof(bit_string));
19760 + de.de_gen_number_bit_string = bit_string;
19761 + retval = reiserfs_find_entry(dir, name, namelen, &path, &de);
19762 + if (retval != NAME_NOT_FOUND) {
19763 + if (buffer != small_buf)
19764 + kfree(buffer);
19765 + pathrelse(&path);
19767 + if (retval == IO_ERROR) {
19768 + return -EIO;
19771 + if (retval != NAME_FOUND) {
19772 + reiserfs_error(dir->i_sb, "zam-7002",
19773 + "reiserfs_find_entry() returned "
19774 + "unexpected value (%d)", retval);
19777 + return -EEXIST;
19780 + gen_number =
19781 + find_first_zero_bit(bit_string,
19782 + MAX_GENERATION_NUMBER + 1);
19783 + if (gen_number > MAX_GENERATION_NUMBER) {
19784 + /* there is no free generation number */
19785 + reiserfs_warning(dir->i_sb, "reiserfs-7010",
19786 + "Congratulations! we have got hash function "
19787 + "screwed up");
19788 + if (buffer != small_buf)
19789 + kfree(buffer);
19790 + pathrelse(&path);
19791 + return -EBUSY;
19793 + /* adjust the offset of the directory entry */
19794 + put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number));
19795 + set_cpu_key_k_offset(&entry_key, deh_offset(deh));
19797 + /* update max-hash-collisions counter in reiserfs_sb_info */
19798 + PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number);
19800 + /* we need to re-search for the insertion point */
19801 + if (gen_number != 0) {
19802 + if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) !=
19803 + NAME_NOT_FOUND) {
19804 + reiserfs_warning(dir->i_sb, "vs-7032",
19805 + "entry with this key (%K) already "
19806 + "exists", &entry_key);
19808 + if (buffer != small_buf)
19809 + kfree(buffer);
19810 + pathrelse(&path);
19811 + return -EBUSY;
19815 + /* perform the insertion of the entry that we have prepared */
19816 + retval =
19817 + reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer,
19818 + paste_size);
19819 + if (buffer != small_buf)
19820 + kfree(buffer);
19821 + if (retval) {
19822 + reiserfs_check_path(&path);
19823 + return retval;
19826 + dir->i_size += paste_size;
19827 + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
19828 + if (!S_ISDIR(inode->i_mode) && visible)
19829 + /* reiserfs_mkdir or reiserfs_rename will do that by itself */
19830 + reiserfs_update_sd(th, dir);
19832 + reiserfs_check_path(&path);
19833 + return 0;
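
A standalone sketch of the entry buffer that reiserfs_add_entry() composes: an entry head followed by the name, zero-padded to a 4-byte boundary. DEH_SIZE and ROUND_UP here are assumed stand-ins matching the padd_item() call above.

#include <stdio.h>
#include <string.h>

#define DEH_SIZE 16                     /* assumed on-disk entry-head size */
#define ROUND_UP(n) (((n) + 3) & ~3)

int main(void)
{
        const char *name = "hello";             /* namelen 5 -> padded to 8 */
        int namelen = (int)strlen(name);
        char buf[DEH_SIZE + ROUND_UP(5)];

        memset(buf, 0, sizeof(buf));            /* zeroed head + padding */
        memcpy(buf + DEH_SIZE, name, namelen);  /* name follows the head */
        printf("buflen=%zu\n", sizeof(buf));    /* 16 + 8 = 24 */
        return 0;
}
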
19837 + * quota utility function, call if you've had to abort after calling
19838 + * new_inode_init, and have not called reiserfs_new_inode yet.
19839 + * This should only be called on inodes that do not have stat data
19840 + * inserted into the tree yet.
19841 + */
19842 +static int drop_new_inode(struct inode *inode)
19844 + dquot_drop(inode);
19845 + make_bad_inode(inode);
19846 + inode->i_flags |= S_NOQUOTA;
19847 + iput(inode);
19848 + return 0;
19852 + * utility function that does setup for reiserfs_new_inode.
19853 + * dquot_initialize needs lots of credits so it's better to have it
19854 + * outside of a transaction, so we had to pull some bits of
19855 + * reiserfs_new_inode out into this func.
19856 + */
19857 +static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
19859 + /*
19860 + * Make inode invalid - just in case we are going to drop it before
19861 + * the initialization happens
19862 + */
19863 + INODE_PKEY(inode)->k_objectid = 0;
19865 + /*
19866 + * the quota init calls have to know who to charge the quota to, so
19867 + * we have to set uid and gid here
19868 + */
19869 + inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
19870 + return dquot_initialize(inode);
19873 +static int reiserfs_create(struct mnt_idmap *idmap, struct inode *dir,
19874 + struct dentry *dentry, umode_t mode, bool excl)
19876 + int retval;
19877 + struct inode *inode;
19878 + /*
19879 + * We need blocks for transaction + (user+group)*(quotas
19880 + * for new inode + update of quota for directory owner)
19881 + */
19882 + int jbegin_count =
19883 + JOURNAL_PER_BALANCE_CNT * 2 +
19884 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
19885 + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
19886 + struct reiserfs_transaction_handle th;
19887 + struct reiserfs_security_handle security;
19889 + retval = dquot_initialize(dir);
19890 + if (retval)
19891 + return retval;
19893 + if (!(inode = new_inode(dir->i_sb))) {
19894 + return -ENOMEM;
19896 + retval = new_inode_init(inode, dir, mode);
19897 + if (retval) {
19898 + drop_new_inode(inode);
19899 + return retval;
19902 + jbegin_count += reiserfs_cache_default_acl(dir);
19903 + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
19904 + if (retval < 0) {
19905 + drop_new_inode(inode);
19906 + return retval;
19908 + jbegin_count += retval;
19909 + reiserfs_write_lock(dir->i_sb);
19911 + retval = journal_begin(&th, dir->i_sb, jbegin_count);
19912 + if (retval) {
19913 + drop_new_inode(inode);
19914 + goto out_failed;
19917 + retval =
19918 + reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
19919 + inode, &security);
19920 + if (retval)
19921 + goto out_failed;
19923 + inode->i_op = &reiserfs_file_inode_operations;
19924 + inode->i_fop = &reiserfs_file_operations;
19925 + inode->i_mapping->a_ops = &reiserfs_address_space_operations;
19927 + retval =
19928 + reiserfs_add_entry(&th, dir, dentry->d_name.name,
19929 + dentry->d_name.len, inode, 1 /*visible */ );
19930 + if (retval) {
19931 + int err;
19932 + drop_nlink(inode);
19933 + reiserfs_update_sd(&th, inode);
19934 + err = journal_end(&th);
19935 + if (err)
19936 + retval = err;
19937 + unlock_new_inode(inode);
19938 + iput(inode);
19939 + goto out_failed;
19941 + reiserfs_update_inode_transaction(inode);
19942 + reiserfs_update_inode_transaction(dir);
19944 + d_instantiate_new(dentry, inode);
19945 + retval = journal_end(&th);
19947 +out_failed:
19948 + reiserfs_write_unlock(dir->i_sb);
19949 + reiserfs_security_free(&security);
19950 + return retval;
19953 +static int reiserfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
19954 + struct dentry *dentry, umode_t mode, dev_t rdev)
19956 + int retval;
19957 + struct inode *inode;
19958 + struct reiserfs_transaction_handle th;
19959 + struct reiserfs_security_handle security;
19960 + /*
19961 + * We need blocks for transaction + (user+group)*(quotas
19962 + * for new inode + update of quota for directory owner)
19963 + */
19964 + int jbegin_count =
19965 + JOURNAL_PER_BALANCE_CNT * 3 +
19966 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
19967 + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
19969 + retval = dquot_initialize(dir);
19970 + if (retval)
19971 + return retval;
19973 + if (!(inode = new_inode(dir->i_sb))) {
19974 + return -ENOMEM;
19976 + retval = new_inode_init(inode, dir, mode);
19977 + if (retval) {
19978 + drop_new_inode(inode);
19979 + return retval;
19982 + jbegin_count += reiserfs_cache_default_acl(dir);
19983 + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
19984 + if (retval < 0) {
19985 + drop_new_inode(inode);
19986 + return retval;
19988 + jbegin_count += retval;
19989 + reiserfs_write_lock(dir->i_sb);
19991 + retval = journal_begin(&th, dir->i_sb, jbegin_count);
19992 + if (retval) {
19993 + drop_new_inode(inode);
19994 + goto out_failed;
19997 + retval =
19998 + reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
19999 + inode, &security);
20000 + if (retval) {
20001 + goto out_failed;
20004 + inode->i_op = &reiserfs_special_inode_operations;
20005 + init_special_inode(inode, inode->i_mode, rdev);
20007 + /* FIXME: needed for block and char devices only */
20008 + reiserfs_update_sd(&th, inode);
20010 + reiserfs_update_inode_transaction(inode);
20011 + reiserfs_update_inode_transaction(dir);
20013 + retval =
20014 + reiserfs_add_entry(&th, dir, dentry->d_name.name,
20015 + dentry->d_name.len, inode, 1 /*visible */ );
20016 + if (retval) {
20017 + int err;
20018 + drop_nlink(inode);
20019 + reiserfs_update_sd(&th, inode);
20020 + err = journal_end(&th);
20021 + if (err)
20022 + retval = err;
20023 + unlock_new_inode(inode);
20024 + iput(inode);
20025 + goto out_failed;
20028 + d_instantiate_new(dentry, inode);
20029 + retval = journal_end(&th);
20031 +out_failed:
20032 + reiserfs_write_unlock(dir->i_sb);
20033 + reiserfs_security_free(&security);
20034 + return retval;
20037 +static int reiserfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
20038 + struct dentry *dentry, umode_t mode)
20040 + int retval;
20041 + struct inode *inode;
20042 + struct reiserfs_transaction_handle th;
20043 + struct reiserfs_security_handle security;
20044 + /*
20045 + * We need blocks for transaction + (user+group)*(quotas
20046 + * for new inode + update of quota for directory owner)
20047 + */
20048 + int jbegin_count =
20049 + JOURNAL_PER_BALANCE_CNT * 3 +
20050 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
20051 + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
20053 + retval = dquot_initialize(dir);
20054 + if (retval)
20055 + return retval;
20057 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
20058 + /*
20059 + * set flag that new packing locality created and new blocks
20060 + * for the content of that directory are not displaced yet
20061 + */
20062 + REISERFS_I(dir)->new_packing_locality = 1;
20063 +#endif
20064 + mode = S_IFDIR | mode;
20065 + if (!(inode = new_inode(dir->i_sb))) {
20066 + return -ENOMEM;
20068 + retval = new_inode_init(inode, dir, mode);
20069 + if (retval) {
20070 + drop_new_inode(inode);
20071 + return retval;
20074 + jbegin_count += reiserfs_cache_default_acl(dir);
20075 + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
20076 + if (retval < 0) {
20077 + drop_new_inode(inode);
20078 + return retval;
20080 + jbegin_count += retval;
20081 + reiserfs_write_lock(dir->i_sb);
20083 + retval = journal_begin(&th, dir->i_sb, jbegin_count);
20084 + if (retval) {
20085 + drop_new_inode(inode);
20086 + goto out_failed;
20089 + /*
20090 + * inc the link count now, so another writer doesn't overflow
20091 + * it while we sleep later on.
20092 + */
20093 + INC_DIR_INODE_NLINK(dir)
20095 + retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */,
20096 + old_format_only(dir->i_sb) ?
20097 + EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
20098 + dentry, inode, &security);
20099 + if (retval) {
20100 + DEC_DIR_INODE_NLINK(dir)
20101 + goto out_failed;
20104 + reiserfs_update_inode_transaction(inode);
20105 + reiserfs_update_inode_transaction(dir);
20107 + inode->i_op = &reiserfs_dir_inode_operations;
20108 + inode->i_fop = &reiserfs_dir_operations;
20110 + /* note, _this_ add_entry will not update dir's stat data */
20111 + retval =
20112 + reiserfs_add_entry(&th, dir, dentry->d_name.name,
20113 + dentry->d_name.len, inode, 1 /*visible */ );
20114 + if (retval) {
20115 + int err;
20116 + clear_nlink(inode);
20117 + DEC_DIR_INODE_NLINK(dir);
20118 + reiserfs_update_sd(&th, inode);
20119 + err = journal_end(&th);
20120 + if (err)
20121 + retval = err;
20122 + unlock_new_inode(inode);
20123 + iput(inode);
20124 + goto out_failed;
20126 + /* the above add_entry did not update dir's stat data */
20127 + reiserfs_update_sd(&th, dir);
20129 + d_instantiate_new(dentry, inode);
20130 + retval = journal_end(&th);
20131 +out_failed:
20132 + reiserfs_write_unlock(dir->i_sb);
20133 + reiserfs_security_free(&security);
20134 + return retval;
20137 +static inline int reiserfs_empty_dir(struct inode *inode)
20139 + /*
20140 + * we can cheat because an old format dir cannot have
20141 + * EMPTY_DIR_SIZE, and a new format dir cannot have
20142 + * EMPTY_DIR_SIZE_V1. So, if the inode is either size,
20143 + * regardless of disk format version, the directory is empty.
20144 + */
20145 + if (inode->i_size != EMPTY_DIR_SIZE &&
20146 + inode->i_size != EMPTY_DIR_SIZE_V1) {
20147 + return 0;
20149 + return 1;
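The cheat works because the two on-disk formats pad the "." and ".." names differently, so the two "empty" sizes can never coincide. Roughly (the real constants live in reiserfs.h; the values below are assumptions for illustration only):

/* illustrative only -- check EMPTY_DIR_SIZE{,_V1} in reiserfs.h */
#define DEH_SIZE_ASSUMED 16                     /* per-entry header, assumed */
#define EMPTY_V1 (2 * DEH_SIZE_ASSUMED + 1 + 2) /* "." and ".." unpadded: 35 */
#define EMPTY_V2 (2 * DEH_SIZE_ASSUMED + 8 + 8) /* names rounded up to 8: 48 */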
20152 +static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
20154 + int retval, err;
20155 + struct inode *inode;
20156 + struct reiserfs_transaction_handle th;
20157 + int jbegin_count;
20158 + INITIALIZE_PATH(path);
20159 + struct reiserfs_dir_entry de;
20161 + /*
20162 + * we will be doing 2 balancings and update 2 stat data, we
20163 + * change quotas of the owner of the directory and of the owner
20164 + * of the parent directory. The quota structure is possibly
20165 + * deleted only on last iput => outside of this transaction
20166 + */
20167 + jbegin_count =
20168 + JOURNAL_PER_BALANCE_CNT * 2 + 2 +
20169 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
20171 + retval = dquot_initialize(dir);
20172 + if (retval)
20173 + return retval;
20175 + reiserfs_write_lock(dir->i_sb);
20176 + retval = journal_begin(&th, dir->i_sb, jbegin_count);
20177 + if (retval)
20178 + goto out_rmdir;
20180 + de.de_gen_number_bit_string = NULL;
20181 + if ((retval =
20182 + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
20183 + &path, &de)) == NAME_NOT_FOUND) {
20184 + retval = -ENOENT;
20185 + goto end_rmdir;
20186 + } else if (retval == IO_ERROR) {
20187 + retval = -EIO;
20188 + goto end_rmdir;
20191 + inode = d_inode(dentry);
20193 + reiserfs_update_inode_transaction(inode);
20194 + reiserfs_update_inode_transaction(dir);
20196 + if (de.de_objectid != inode->i_ino) {
20197 + /*
20198 + * FIXME: compare key of an object and a key found in the entry
20199 + */
20200 + retval = -EIO;
20201 + goto end_rmdir;
20203 + if (!reiserfs_empty_dir(inode)) {
20204 + retval = -ENOTEMPTY;
20205 + goto end_rmdir;
20208 + /* cut entry from dir directory */
20209 + retval = reiserfs_cut_from_item(&th, &path, &de.de_entry_key,
20210 + dir, NULL, /* page */
20211 + 0 /*new file size - not used here */ );
20212 + if (retval < 0)
20213 + goto end_rmdir;
20215 + if (inode->i_nlink != 2 && inode->i_nlink != 1)
20216 + reiserfs_error(inode->i_sb, "reiserfs-7040",
20217 + "empty directory has nlink != 2 (%d)",
20218 + inode->i_nlink);
20220 + clear_nlink(inode);
20221 + inode_set_mtime_to_ts(dir,
20222 + inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
20223 + reiserfs_update_sd(&th, inode);
20225 + DEC_DIR_INODE_NLINK(dir)
20226 + dir->i_size -= (DEH_SIZE + de.de_entrylen);
20227 + reiserfs_update_sd(&th, dir);
20229 + /* prevent empty directory from getting lost */
20230 + add_save_link(&th, inode, 0 /* not truncate */ );
20232 + retval = journal_end(&th);
20233 + reiserfs_check_path(&path);
20234 +out_rmdir:
20235 + reiserfs_write_unlock(dir->i_sb);
20236 + return retval;
20238 +end_rmdir:
20239 + /*
20240 + * we must release path, because we did not call
20241 + * reiserfs_cut_from_item, or reiserfs_cut_from_item does not
20242 + * release path if operation was not complete
20243 + */
20244 + pathrelse(&path);
20245 + err = journal_end(&th);
20246 + reiserfs_write_unlock(dir->i_sb);
20247 + return err ? err : retval;
20250 +static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
20252 + int retval, err;
20253 + struct inode *inode;
20254 + struct reiserfs_dir_entry de;
20255 + INITIALIZE_PATH(path);
20256 + struct reiserfs_transaction_handle th;
20257 + int jbegin_count;
20258 + unsigned long savelink;
20260 + retval = dquot_initialize(dir);
20261 + if (retval)
20262 + return retval;
20264 + inode = d_inode(dentry);
20266 + /*
20267 + * in this transaction we can be doing at max two balancings and
20268 + * update two stat datas, we change quotas of the owner of the
20269 + * directory and of the owner of the parent directory. The quota
20270 + * structure is possibly deleted only on iput => outside of
20271 + * this transaction
20272 + */
20273 + jbegin_count =
20274 + JOURNAL_PER_BALANCE_CNT * 2 + 2 +
20275 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
20277 + reiserfs_write_lock(dir->i_sb);
20278 + retval = journal_begin(&th, dir->i_sb, jbegin_count);
20279 + if (retval)
20280 + goto out_unlink;
20282 + de.de_gen_number_bit_string = NULL;
20283 + if ((retval =
20284 + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
20285 + &path, &de)) == NAME_NOT_FOUND) {
20286 + retval = -ENOENT;
20287 + goto end_unlink;
20288 + } else if (retval == IO_ERROR) {
20289 + retval = -EIO;
20290 + goto end_unlink;
20293 + reiserfs_update_inode_transaction(inode);
20294 + reiserfs_update_inode_transaction(dir);
20296 + if (de.de_objectid != inode->i_ino) {
20297 + /*
20298 + * FIXME: compare key of an object and a key found in the entry
20299 + */
20300 + retval = -EIO;
20301 + goto end_unlink;
20304 + if (!inode->i_nlink) {
20305 + reiserfs_warning(inode->i_sb, "reiserfs-7042",
20306 + "deleting nonexistent file (%lu), %d",
20307 + inode->i_ino, inode->i_nlink);
20308 + set_nlink(inode, 1);
20311 + drop_nlink(inode);
20313 + /*
20314 + * we schedule before doing the add_save_link call, save the link
20315 + * count so we don't race
20316 + */
20317 + savelink = inode->i_nlink;
20319 + retval =
20320 + reiserfs_cut_from_item(&th, &path, &de.de_entry_key, dir, NULL,
20321 + 0);
20322 + if (retval < 0) {
20323 + inc_nlink(inode);
20324 + goto end_unlink;
20326 + inode_set_ctime_current(inode);
20327 + reiserfs_update_sd(&th, inode);
20329 + dir->i_size -= (de.de_entrylen + DEH_SIZE);
20330 + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
20331 + reiserfs_update_sd(&th, dir);
20333 + if (!savelink)
20334 + /* prevent file from getting lost */
20335 + add_save_link(&th, inode, 0 /* not truncate */ );
20337 + retval = journal_end(&th);
20338 + reiserfs_check_path(&path);
20339 + reiserfs_write_unlock(dir->i_sb);
20340 + return retval;
20342 +end_unlink:
20343 + pathrelse(&path);
20344 + err = journal_end(&th);
20345 + reiserfs_check_path(&path);
20346 + if (err)
20347 + retval = err;
20348 +out_unlink:
20349 + reiserfs_write_unlock(dir->i_sb);
20350 + return retval;
20353 +static int reiserfs_symlink(struct mnt_idmap *idmap,
20354 + struct inode *parent_dir, struct dentry *dentry,
20355 + const char *symname)
20357 + int retval;
20358 + struct inode *inode;
20359 + char *name;
20360 + int item_len;
20361 + struct reiserfs_transaction_handle th;
20362 + struct reiserfs_security_handle security;
20363 + int mode = S_IFLNK | S_IRWXUGO;
20364 + /*
20365 + * We need blocks for transaction + (user+group)*(quotas for
20366 + * new inode + update of quota for directory owner)
20367 + */
20368 + int jbegin_count =
20369 + JOURNAL_PER_BALANCE_CNT * 3 +
20370 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
20371 + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
20373 + retval = dquot_initialize(parent_dir);
20374 + if (retval)
20375 + return retval;
20377 + if (!(inode = new_inode(parent_dir->i_sb))) {
20378 + return -ENOMEM;
20380 + retval = new_inode_init(inode, parent_dir, mode);
20381 + if (retval) {
20382 + drop_new_inode(inode);
20383 + return retval;
20386 + retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
20387 + &security);
20388 + if (retval < 0) {
20389 + drop_new_inode(inode);
20390 + return retval;
20392 + jbegin_count += retval;
20394 + reiserfs_write_lock(parent_dir->i_sb);
20395 + item_len = ROUND_UP(strlen(symname));
20396 + if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) {
20397 + retval = -ENAMETOOLONG;
20398 + drop_new_inode(inode);
20399 + goto out_failed;
20402 + name = kmalloc(item_len, GFP_NOFS);
20403 + if (!name) {
20404 + drop_new_inode(inode);
20405 + retval = -ENOMEM;
20406 + goto out_failed;
20408 + memcpy(name, symname, strlen(symname));
20409 + padd_item(name, item_len, strlen(symname));
20411 + retval = journal_begin(&th, parent_dir->i_sb, jbegin_count);
20412 + if (retval) {
20413 + drop_new_inode(inode);
20414 + kfree(name);
20415 + goto out_failed;
20418 + retval =
20419 + reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname),
20420 + dentry, inode, &security);
20421 + kfree(name);
20422 + if (retval) { /* reiserfs_new_inode iputs for us */
20423 + goto out_failed;
20426 + reiserfs_update_inode_transaction(inode);
20427 + reiserfs_update_inode_transaction(parent_dir);
20429 + inode->i_op = &reiserfs_symlink_inode_operations;
20430 + inode_nohighmem(inode);
20431 + inode->i_mapping->a_ops = &reiserfs_address_space_operations;
20433 + retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
20434 + dentry->d_name.len, inode, 1 /*visible */ );
20435 + if (retval) {
20436 + int err;
20437 + drop_nlink(inode);
20438 + reiserfs_update_sd(&th, inode);
20439 + err = journal_end(&th);
20440 + if (err)
20441 + retval = err;
20442 + unlock_new_inode(inode);
20443 + iput(inode);
20444 + goto out_failed;
20447 + d_instantiate_new(dentry, inode);
20448 + retval = journal_end(&th);
20449 +out_failed:
20450 + reiserfs_write_unlock(parent_dir->i_sb);
20451 + reiserfs_security_free(&security);
20452 + return retval;
20455 +static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
20456 + struct dentry *dentry)
20458 + int retval;
20459 + struct inode *inode = d_inode(old_dentry);
20460 + struct reiserfs_transaction_handle th;
20461 + /*
20462 + * We need blocks for transaction + update of quotas for
20463 + * the owners of the directory
20464 + */
20465 + int jbegin_count =
20466 + JOURNAL_PER_BALANCE_CNT * 3 +
20467 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
20469 + retval = dquot_initialize(dir);
20470 + if (retval)
20471 + return retval;
20473 + reiserfs_write_lock(dir->i_sb);
20474 + if (inode->i_nlink >= REISERFS_LINK_MAX) {
20475 + /* FIXME: sd_nlink is 32 bit for new files */
20476 + reiserfs_write_unlock(dir->i_sb);
20477 + return -EMLINK;
20480 + /* inc before scheduling so reiserfs_unlink knows we are here */
20481 + inc_nlink(inode);
20483 + retval = journal_begin(&th, dir->i_sb, jbegin_count);
20484 + if (retval) {
20485 + drop_nlink(inode);
20486 + reiserfs_write_unlock(dir->i_sb);
20487 + return retval;
20490 + /* create new entry */
20491 + retval =
20492 + reiserfs_add_entry(&th, dir, dentry->d_name.name,
20493 + dentry->d_name.len, inode, 1 /*visible */ );
20495 + reiserfs_update_inode_transaction(inode);
20496 + reiserfs_update_inode_transaction(dir);
20498 + if (retval) {
20499 + int err;
20500 + drop_nlink(inode);
20501 + err = journal_end(&th);
20502 + reiserfs_write_unlock(dir->i_sb);
20503 + return err ? err : retval;
20506 + inode_set_ctime_current(inode);
20507 + reiserfs_update_sd(&th, inode);
20509 + ihold(inode);
20510 + d_instantiate(dentry, inode);
20511 + retval = journal_end(&th);
20512 + reiserfs_write_unlock(dir->i_sb);
20513 + return retval;
20516 +/* de contains information pointing to an entry which may have been moved or reused */
20517 +static int de_still_valid(const char *name, int len,
20518 + struct reiserfs_dir_entry *de)
20520 + struct reiserfs_dir_entry tmp = *de;
20522 + /* recalculate pointer to name and name length */
20523 + set_de_name_and_namelen(&tmp);
20524 + /* FIXME: could check more */
20525 + if (tmp.de_namelen != len || memcmp(name, de->de_name, len))
20526 + return 0;
20527 + return 1;
20530 +static int entry_points_to_object(const char *name, int len,
20531 + struct reiserfs_dir_entry *de,
20532 + struct inode *inode)
20534 + if (!de_still_valid(name, len, de))
20535 + return 0;
20537 + if (inode) {
20538 + if (!de_visible(de->de_deh + de->de_entry_num))
20539 + reiserfs_panic(inode->i_sb, "vs-7042",
20540 + "entry must be visible");
20541 + return (de->de_objectid == inode->i_ino) ? 1 : 0;
20544 + /* this must be the just-added hidden entry */
20545 + if (de_visible(de->de_deh + de->de_entry_num))
20546 + reiserfs_panic(NULL, "vs-7043", "entry must be hidden");
20548 + return 1;
20551 +/* sets key of objectid the entry has to point to */
20552 +static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de,
20553 + struct reiserfs_key *key)
20555 + /* JDM These operations are endian safe - both are le */
20556 + de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id;
20557 + de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid;
20561 + * process, that is going to call fix_nodes/do_balance must hold only
20562 + * one path. If it holds 2 or more, it can get into endless waiting in
20563 + * get_empty_nodes or its clones
20564 + */
20565 +static int reiserfs_rename(struct mnt_idmap *idmap,
20566 + struct inode *old_dir, struct dentry *old_dentry,
20567 + struct inode *new_dir, struct dentry *new_dentry,
20568 + unsigned int flags)
20570 + int retval;
20571 + INITIALIZE_PATH(old_entry_path);
20572 + INITIALIZE_PATH(new_entry_path);
20573 + INITIALIZE_PATH(dot_dot_entry_path);
20574 + struct item_head new_entry_ih, old_entry_ih, dot_dot_ih;
20575 + struct reiserfs_dir_entry old_de, new_de, dot_dot_de;
20576 + struct inode *old_inode, *new_dentry_inode;
20577 + struct reiserfs_transaction_handle th;
20578 + int jbegin_count;
20579 + unsigned long savelink = 1;
20580 + bool update_dir_parent = false;
20582 + if (flags & ~RENAME_NOREPLACE)
20583 + return -EINVAL;
20585 + /*
20586 + * three balancings: (1) old name removal, (2) new name insertion
20587 + * and (3) maybe "save" link insertion
20588 + * stat data updates: (1) old directory,
20589 + * (2) new directory and (3) maybe old object stat data (when it is
20590 + * directory) and (4) maybe stat data of object to which new entry
20591 + * pointed initially and (5) maybe block containing ".." of
20592 + * renamed directory
20593 + * quota updates: two parent directories
20594 + */
20595 + jbegin_count =
20596 + JOURNAL_PER_BALANCE_CNT * 3 + 5 +
20597 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
20599 + retval = dquot_initialize(old_dir);
20600 + if (retval)
20601 + return retval;
20602 + retval = dquot_initialize(new_dir);
20603 + if (retval)
20604 + return retval;
20606 + old_inode = d_inode(old_dentry);
20607 + new_dentry_inode = d_inode(new_dentry);
20609 + /*
20610 + * make sure that oldname still exists and points to an object we
20611 + * are going to rename
20612 + */
20613 + old_de.de_gen_number_bit_string = NULL;
20614 + reiserfs_write_lock(old_dir->i_sb);
20615 + retval =
20616 + reiserfs_find_entry(old_dir, old_dentry->d_name.name,
20617 + old_dentry->d_name.len, &old_entry_path,
20618 + &old_de);
20619 + pathrelse(&old_entry_path);
20620 + if (retval == IO_ERROR) {
20621 + reiserfs_write_unlock(old_dir->i_sb);
20622 + return -EIO;
20625 + if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) {
20626 + reiserfs_write_unlock(old_dir->i_sb);
20627 + return -ENOENT;
20630 + if (S_ISDIR(old_inode->i_mode)) {
20631 + /*
20632 + * make sure that directory being renamed has correct ".."
20633 + * and that its new parent directory has not too many links
20634 + * already
20635 + */
20636 + if (new_dentry_inode) {
20637 + if (!reiserfs_empty_dir(new_dentry_inode)) {
20638 + reiserfs_write_unlock(old_dir->i_sb);
20639 + return -ENOTEMPTY;
20643 + if (old_dir != new_dir) {
20644 + /*
20645 + * directory is renamed, its parent directory will be
20646 + * changed, so find ".." entry
20647 + */
20648 + dot_dot_de.de_gen_number_bit_string = NULL;
20649 + retval =
20650 + reiserfs_find_entry(old_inode, "..", 2,
20651 + &dot_dot_entry_path,
20652 + &dot_dot_de);
20653 + pathrelse(&dot_dot_entry_path);
20654 + if (retval != NAME_FOUND) {
20655 + reiserfs_write_unlock(old_dir->i_sb);
20656 + return -EIO;
20659 + /* inode number of .. must equal old_dir->i_ino */
20660 + if (dot_dot_de.de_objectid != old_dir->i_ino) {
20661 + reiserfs_write_unlock(old_dir->i_sb);
20662 + return -EIO;
20664 + update_dir_parent = true;
20668 + retval = journal_begin(&th, old_dir->i_sb, jbegin_count);
20669 + if (retval) {
20670 + reiserfs_write_unlock(old_dir->i_sb);
20671 + return retval;
20674 + /* add new entry (or find the existing one) */
20675 + retval =
20676 + reiserfs_add_entry(&th, new_dir, new_dentry->d_name.name,
20677 + new_dentry->d_name.len, old_inode, 0);
20678 + if (retval == -EEXIST) {
20679 + if (!new_dentry_inode) {
20680 + reiserfs_panic(old_dir->i_sb, "vs-7050",
20681 + "new entry is found, new inode == 0");
20683 + } else if (retval) {
20684 + int err = journal_end(&th);
20685 + reiserfs_write_unlock(old_dir->i_sb);
20686 + return err ? err : retval;
20689 + reiserfs_update_inode_transaction(old_dir);
20690 + reiserfs_update_inode_transaction(new_dir);
20692 + /*
20693 + * this makes it so an fsync on an open fd for the old name will
20694 + * commit the rename operation
20695 + */
20696 + reiserfs_update_inode_transaction(old_inode);
20698 + if (new_dentry_inode)
20699 + reiserfs_update_inode_transaction(new_dentry_inode);
20701 + while (1) {
20702 + /*
20703 + * look for old name using corresponding entry key
20704 + * (found by reiserfs_find_entry)
20705 + */
20706 + if ((retval =
20707 + search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key,
20708 + &old_entry_path,
20709 + &old_de)) != NAME_FOUND) {
20710 + pathrelse(&old_entry_path);
20711 + journal_end(&th);
20712 + reiserfs_write_unlock(old_dir->i_sb);
20713 + return -EIO;
20716 + copy_item_head(&old_entry_ih, tp_item_head(&old_entry_path));
20718 + reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1);
20720 + /* look for new name by reiserfs_find_entry */
20721 + new_de.de_gen_number_bit_string = NULL;
20722 + retval =
20723 + reiserfs_find_entry(new_dir, new_dentry->d_name.name,
20724 + new_dentry->d_name.len, &new_entry_path,
20725 + &new_de);
20726 + /*
20727 + * reiserfs_find_entry should not return IO_ERROR here,
20728 + * because it was called with essentially the same parameters from
20729 + * reiserfs_add_entry above, and we'll catch any i/o errors
20730 + * before we get here.
20731 + */
20732 + if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) {
20733 + pathrelse(&new_entry_path);
20734 + pathrelse(&old_entry_path);
20735 + journal_end(&th);
20736 + reiserfs_write_unlock(old_dir->i_sb);
20737 + return -EIO;
20740 + copy_item_head(&new_entry_ih, tp_item_head(&new_entry_path));
20742 + reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1);
20744 + if (update_dir_parent) {
20745 + if ((retval =
20746 + search_by_entry_key(new_dir->i_sb,
20747 + &dot_dot_de.de_entry_key,
20748 + &dot_dot_entry_path,
20749 + &dot_dot_de)) != NAME_FOUND) {
20750 + pathrelse(&dot_dot_entry_path);
20751 + pathrelse(&new_entry_path);
20752 + pathrelse(&old_entry_path);
20753 + journal_end(&th);
20754 + reiserfs_write_unlock(old_dir->i_sb);
20755 + return -EIO;
20757 + copy_item_head(&dot_dot_ih,
20758 + tp_item_head(&dot_dot_entry_path));
20759 + /* node containing ".." gets into transaction */
20760 + reiserfs_prepare_for_journal(old_inode->i_sb,
20761 + dot_dot_de.de_bh, 1);
20763 + /*
20764 + * we should check seals here, not do
20765 + * this stuff, yes? Then, having
20766 + * gathered everything into RAM we
20767 + * should lock the buffers, yes? -Hans
20768 + */
20769 + /*
20770 + * probably. our rename needs to hold more
20771 + * than one path at once. The seals would
20772 + * have to be written to deal with multi-path
20773 + * issues -chris
20774 + */
20775 + /*
20776 + * sanity checking before doing the rename - avoid races that many
20777 + * of the above checks could have opened by scheduling. We have to be
20778 + * sure our items haven't been shifted by another process.
20779 + */
20780 + if (item_moved(&new_entry_ih, &new_entry_path) ||
20781 + !entry_points_to_object(new_dentry->d_name.name,
20782 + new_dentry->d_name.len,
20783 + &new_de, new_dentry_inode) ||
20784 + item_moved(&old_entry_ih, &old_entry_path) ||
20785 + !entry_points_to_object(old_dentry->d_name.name,
20786 + old_dentry->d_name.len,
20787 + &old_de, old_inode)) {
20788 + reiserfs_restore_prepared_buffer(old_inode->i_sb,
20789 + new_de.de_bh);
20790 + reiserfs_restore_prepared_buffer(old_inode->i_sb,
20791 + old_de.de_bh);
20792 + if (update_dir_parent)
20793 + reiserfs_restore_prepared_buffer(old_inode->
20794 + i_sb,
20795 + dot_dot_de.
20796 + de_bh);
20797 + continue;
20799 + if (update_dir_parent) {
20800 + if (item_moved(&dot_dot_ih, &dot_dot_entry_path) ||
20801 + !entry_points_to_object("..", 2, &dot_dot_de,
20802 + old_dir)) {
20803 + reiserfs_restore_prepared_buffer(old_inode->
20804 + i_sb,
20805 + old_de.de_bh);
20806 + reiserfs_restore_prepared_buffer(old_inode->
20807 + i_sb,
20808 + new_de.de_bh);
20809 + reiserfs_restore_prepared_buffer(old_inode->
20810 + i_sb,
20811 + dot_dot_de.
20812 + de_bh);
20813 + continue;
20817 + RFALSE(update_dir_parent &&
20818 + !buffer_journal_prepared(dot_dot_de.de_bh), "");
20820 + break;
20823 + /*
20824 + * ok, all the changes can be done in one fell swoop when we
20825 + * have claimed all the buffers needed.
20826 + */
20828 + mark_de_visible(new_de.de_deh + new_de.de_entry_num);
20829 + set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode));
20830 + journal_mark_dirty(&th, new_de.de_bh);
20832 + mark_de_hidden(old_de.de_deh + old_de.de_entry_num);
20833 + journal_mark_dirty(&th, old_de.de_bh);
20834 + /*
20835 + * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch
20836 + * which adds ctime update of renamed object
20837 + */
20838 + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
20840 + if (new_dentry_inode) {
20841 + /* adjust link number of the victim */
20842 + if (S_ISDIR(new_dentry_inode->i_mode)) {
20843 + clear_nlink(new_dentry_inode);
20844 + } else {
20845 + drop_nlink(new_dentry_inode);
20847 + savelink = new_dentry_inode->i_nlink;
20850 + if (update_dir_parent) {
20851 + /* adjust ".." of renamed directory */
20852 + set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir));
20853 + journal_mark_dirty(&th, dot_dot_de.de_bh);
20855 + if (S_ISDIR(old_inode->i_mode)) {
20856 + /*
20857 + * there (in new_dir) was no directory, so it got new link
20858 + * (".." of renamed directory)
20859 + */
20860 + if (!new_dentry_inode)
20861 + INC_DIR_INODE_NLINK(new_dir);
20863 + /* old directory lost one link - ".." of renamed directory */
20864 + DEC_DIR_INODE_NLINK(old_dir);
20866 + /*
20867 + * looks like in 2.3.99pre3 brelse is atomic,
20868 + * so we can use pathrelse
20869 + */
20870 + pathrelse(&new_entry_path);
20871 + pathrelse(&dot_dot_entry_path);
20873 + /*
20874 + * FIXME: this reiserfs_cut_from_item's return value may confuse
20875 + * callers, but it will panic if it is not able to find the
20876 + * entry. This needs one more cleanup.
20877 + */
20878 + if (reiserfs_cut_from_item
20879 + (&th, &old_entry_path, &old_de.de_entry_key, old_dir, NULL,
20880 + 0) < 0)
20881 + reiserfs_error(old_dir->i_sb, "vs-7060",
20882 + "couldn't not cut old name. Fsck later?");
20884 + old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
20886 + reiserfs_update_sd(&th, old_dir);
20887 + reiserfs_update_sd(&th, new_dir);
20888 + reiserfs_update_sd(&th, old_inode);
20890 + if (new_dentry_inode) {
20891 + if (savelink == 0)
20892 + add_save_link(&th, new_dentry_inode,
20893 + 0 /* not truncate */ );
20894 + reiserfs_update_sd(&th, new_dentry_inode);
20897 + retval = journal_end(&th);
20898 + reiserfs_write_unlock(old_dir->i_sb);
20899 + return retval;
20902 +static const struct inode_operations reiserfs_priv_dir_inode_operations = {
20903 + .create = reiserfs_create,
20904 + .lookup = reiserfs_lookup,
20905 + .link = reiserfs_link,
20906 + .unlink = reiserfs_unlink,
20907 + .symlink = reiserfs_symlink,
20908 + .mkdir = reiserfs_mkdir,
20909 + .rmdir = reiserfs_rmdir,
20910 + .mknod = reiserfs_mknod,
20911 + .rename = reiserfs_rename,
20912 + .setattr = reiserfs_setattr,
20913 + .permission = reiserfs_permission,
20914 + .fileattr_get = reiserfs_fileattr_get,
20915 + .fileattr_set = reiserfs_fileattr_set,
20918 +static const struct inode_operations reiserfs_priv_symlink_inode_operations = {
20919 + .get_link = page_get_link,
20920 + .setattr = reiserfs_setattr,
20921 + .permission = reiserfs_permission,
20924 +static const struct inode_operations reiserfs_priv_special_inode_operations = {
20925 + .setattr = reiserfs_setattr,
20926 + .permission = reiserfs_permission,
20929 +void reiserfs_init_priv_inode(struct inode *inode)
20931 + inode->i_flags |= S_PRIVATE;
20932 + inode->i_opflags &= ~IOP_XATTR;
20934 + if (S_ISREG(inode->i_mode))
20935 + inode->i_op = &reiserfs_priv_file_inode_operations;
20936 + else if (S_ISDIR(inode->i_mode))
20937 + inode->i_op = &reiserfs_priv_dir_inode_operations;
20938 + else if (S_ISLNK(inode->i_mode))
20939 + inode->i_op = &reiserfs_priv_symlink_inode_operations;
20940 + else
20941 + inode->i_op = &reiserfs_priv_special_inode_operations;
20944 +/* directories can handle most operations... */
20945 +const struct inode_operations reiserfs_dir_inode_operations = {
20946 + .create = reiserfs_create,
20947 + .lookup = reiserfs_lookup,
20948 + .link = reiserfs_link,
20949 + .unlink = reiserfs_unlink,
20950 + .symlink = reiserfs_symlink,
20951 + .mkdir = reiserfs_mkdir,
20952 + .rmdir = reiserfs_rmdir,
20953 + .mknod = reiserfs_mknod,
20954 + .rename = reiserfs_rename,
20955 + .setattr = reiserfs_setattr,
20956 + .listxattr = reiserfs_listxattr,
20957 + .permission = reiserfs_permission,
20958 + .get_inode_acl = reiserfs_get_acl,
20959 + .set_acl = reiserfs_set_acl,
20960 + .fileattr_get = reiserfs_fileattr_get,
20961 + .fileattr_set = reiserfs_fileattr_set,
20965 + * symlink operations.. same as page_symlink_inode_operations, with xattr
20966 + * stuff added
20967 + */
20968 +const struct inode_operations reiserfs_symlink_inode_operations = {
20969 + .get_link = page_get_link,
20970 + .setattr = reiserfs_setattr,
20971 + .listxattr = reiserfs_listxattr,
20972 + .permission = reiserfs_permission,
20976 + * special file operations.. just xattr/acl stuff
20977 + */
20978 +const struct inode_operations reiserfs_special_inode_operations = {
20979 + .setattr = reiserfs_setattr,
20980 + .listxattr = reiserfs_listxattr,
20981 + .permission = reiserfs_permission,
20982 + .get_inode_acl = reiserfs_get_acl,
20983 + .set_acl = reiserfs_set_acl,
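Before moving on to objectid.c: the while (1) loop in reiserfs_rename above is the clearest instance of the optimistic pattern this file uses wherever it must hold several paths at once: look everything up, journal-prepare the buffers, then re-verify that nothing moved while the lookups may have slept, and retry from scratch if it did. Stripped of the reiserfs specifics, the shape is the following user-space sketch (all names here are stand-ins, not kernel API):

#include <stdbool.h>
#include <stdio.h>

/* toy stand-ins for lookup/prepare/validate (assumed, not kernel API) */
struct snapshot { int version; };
static int tree_version = 42;

static struct snapshot lookup(void) { return (struct snapshot){ tree_version }; }
static void prepare_for_journal(struct snapshot *s) { (void)s; }
static void restore_prepared(struct snapshot *s) { (void)s; }
static bool still_valid(const struct snapshot *s)
{
	/* plays the role of item_moved()/entry_points_to_object() */
	return s->version == tree_version;
}

int main(void)
{
	struct snapshot s;

	for (;;) {
		s = lookup();               /* may sleep; tree may shift meanwhile */
		prepare_for_journal(&s);    /* claim the buffers we will dirty */
		if (still_valid(&s))
			break;              /* everything pinned and still valid */
		restore_prepared(&s);       /* undo the claim and retry */
	}
	printf("all buffers claimed; changes now applied in one fell swoop\n");
	return 0;
}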
20985 diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
20986 new file mode 100644
20987 index 000000000000..34baf5c0f265
20988 --- /dev/null
20989 +++ b/fs/reiserfs/objectid.c
20990 @@ -0,0 +1,216 @@
20992 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
20993 + */
20995 +#include <linux/string.h>
20996 +#include <linux/time.h>
20997 +#include <linux/uuid.h>
20998 +#include "reiserfs.h"
21000 +/* find where objectid map starts */
21001 +#define objectid_map(s,rs) (old_format_only (s) ? \
21002 + (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\
21003 + (__le32 *)((rs) + 1))
21005 +#ifdef CONFIG_REISERFS_CHECK
21007 +static void check_objectid_map(struct super_block *s, __le32 * map)
21009 + if (le32_to_cpu(map[0]) != 1)
21010 + reiserfs_panic(s, "vs-15010", "map corrupted: %lx",
21011 + (long unsigned int)le32_to_cpu(map[0]));
21013 + /* FIXME: add something else here */
21016 +#else
21017 +static void check_objectid_map(struct super_block *s, __le32 * map)
21020 +#endif
21023 + * When we allocate objectids we allocate the first unused objectid.
21024 + * Each sequence of objectids in use (the odd sequences) is followed
21025 + * by a sequence of objectids not in use (the even sequences). We
21026 + * only need to record the last objectid in each of these sequences
21027 + * (both the odd and even sequences) in order to fully define the
21028 + * boundaries of the sequences. A consequence of allocating the first
21029 + * objectid not in use is that under most conditions this scheme is
21030 + * extremely compact. The exception is immediately after a sequence
21031 + * of operations which deletes a large number of objects of
21032 + * non-sequential objectids, and even then it will become compact
21033 + * again as soon as more objects are created. Note that many
21034 + * interesting optimizations of layout could result from complicating
21035 + * objectid assignment, but we have deferred making them for now.
21036 + */
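The boundary encoding described above is compact enough to demonstrate in a few lines. A user-space sketch of the allocation half, with the array layout taken from the comment (map[2k] starts a run of used ids, map[2k+1] the free run that follows; the collapse step mirrors the map[1] == map[2] check in the function below):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t map[16] = { 1, 2, 5, 6 }; /* id 1 used; 2-4 free; 5 used; >= 6 free */
static int cursize = 4;

static uint32_t alloc_objectid(void)
{
	uint32_t id = map[1];

	map[1]++;                              /* take the first unused id */
	if (cursize > 2 && map[1] == map[2]) { /* free run emptied: collapse */
		memmove(map + 1, map + 3, (cursize - 3) * sizeof(uint32_t));
		cursize -= 2;
	}
	return id;
}

int main(void)
{
	/* prints 2, 3, 4 (map collapses to {1, 6}), then 6 */
	for (int i = 0; i < 4; i++)
		printf("allocated %u, cursize %d\n", alloc_objectid(), cursize);
	return 0;
}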
21038 +/* get unique object identifier */
21039 +__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
21041 + struct super_block *s = th->t_super;
21042 + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
21043 + __le32 *map = objectid_map(s, rs);
21044 + __u32 unused_objectid;
21046 + BUG_ON(!th->t_trans_id);
21048 + check_objectid_map(s, map);
21050 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
21051 + /* comment needed -Hans */
21052 + unused_objectid = le32_to_cpu(map[1]);
21053 + if (unused_objectid == U32_MAX) {
21054 + reiserfs_warning(s, "reiserfs-15100", "no more object ids");
21055 + reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s));
21056 + return 0;
21059 + /*
21060 + * This incrementation allocates the first unused objectid. That
21061 + * is to say, the first entry on the objectid map is the first
21062 + * unused objectid, and by incrementing it we use it. See below
21063 + * where we check to see if we eliminated a sequence of unused
21064 + * objectids....
21065 + */
21066 + map[1] = cpu_to_le32(unused_objectid + 1);
21068 + /*
21069 + * Now we check to see if we eliminated the last remaining member of
21070 + * the first even sequence (and can eliminate the sequence by
21071 + * eliminating its last objectid from oids), and can collapse the
21072 + * first two odd sequences into one sequence. If so, then the net
21073 + * result is to eliminate a pair of objectids from oids. We do this
21074 + * by shifting the entire map to the left.
21075 + */
21076 + if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) {
21077 + memmove(map + 1, map + 3,
21078 + (sb_oid_cursize(rs) - 3) * sizeof(__u32));
21079 + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
21082 + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
21083 + return unused_objectid;
21086 +/* makes object identifier unused */
21087 +void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
21088 + __u32 objectid_to_release)
21090 + struct super_block *s = th->t_super;
21091 + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
21092 + __le32 *map = objectid_map(s, rs);
21093 + int i = 0;
21095 + BUG_ON(!th->t_trans_id);
21096 + /*return; */
21097 + check_objectid_map(s, map);
21099 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
21100 + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
21102 + /*
21103 + * start at the beginning of the objectid map (i = 0) and go to
21104 + * the end of it (i = disk_sb->s_oid_cursize). Linear search is
21105 + * what we use, though it is possible that binary search would be
21106 + * more efficient after performing lots of deletions (which is
21107 + * when oids is large.) We only check even i's.
21108 + */
21109 + while (i < sb_oid_cursize(rs)) {
21110 + if (objectid_to_release == le32_to_cpu(map[i])) {
21111 + /* This incrementation unallocates the objectid. */
21112 + le32_add_cpu(&map[i], 1);
21114 + /*
21115 + * Did we unallocate the last member of an
21116 + * odd sequence, and can shrink oids?
21117 + */
21118 + if (map[i] == map[i + 1]) {
21119 + /* shrink objectid map */
21120 + memmove(map + i, map + i + 2,
21121 + (sb_oid_cursize(rs) - i -
21122 + 2) * sizeof(__u32));
21123 + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
21125 + RFALSE(sb_oid_cursize(rs) < 2 ||
21126 + sb_oid_cursize(rs) > sb_oid_maxsize(rs),
21127 + "vs-15005: objectid map corrupted cur_size == %d (max == %d)",
21128 + sb_oid_cursize(rs), sb_oid_maxsize(rs));
21130 + return;
21133 + if (objectid_to_release > le32_to_cpu(map[i]) &&
21134 + objectid_to_release < le32_to_cpu(map[i + 1])) {
21135 + /* size of objectid map is not changed */
21136 + if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) {
21137 + le32_add_cpu(&map[i + 1], -1);
21138 + return;
21141 + /*
21142 + * JDM comparing two little-endian values for
21143 + * equality -- safe
21144 + */
21145 + /*
21146 + * objectid map must be expanded, but
21147 + * there is no space
21148 + */
21149 + if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) {
21150 + PROC_INFO_INC(s, leaked_oid);
21151 + return;
21154 + /* expand the objectid map */
21155 + memmove(map + i + 3, map + i + 1,
21156 + (sb_oid_cursize(rs) - i - 1) * sizeof(__u32));
21157 + map[i + 1] = cpu_to_le32(objectid_to_release);
21158 + map[i + 2] = cpu_to_le32(objectid_to_release + 1);
21159 + set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2);
21160 + return;
21162 + i += 2;
21165 + reiserfs_error(s, "vs-15011", "tried to free an already free object id (%lu)",
21166 + (long unsigned)objectid_to_release);
21169 +int reiserfs_convert_objectid_map_v1(struct super_block *s)
21171 + struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s);
21172 + int cur_size = sb_oid_cursize(disk_sb);
21173 + int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2;
21174 + int old_max = sb_oid_maxsize(disk_sb);
21175 + struct reiserfs_super_block_v1 *disk_sb_v1;
21176 + __le32 *objectid_map;
21177 + int i;
21179 + disk_sb_v1 =
21180 + (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
21181 + objectid_map = (__le32 *) (disk_sb_v1 + 1);
21183 + if (cur_size > new_size) {
21184 + /*
21185 + * mark everyone used that was listed as free at
21186 + * the end of the objectid map
21187 + */
21188 + objectid_map[new_size - 1] = objectid_map[cur_size - 1];
21189 + set_sb_oid_cursize(disk_sb, new_size);
21191 + /* move the smaller objectid map past the end of the new super */
21192 + for (i = new_size - 1; i >= 0; i--) {
21193 + objectid_map[i + (old_max - new_size)] = objectid_map[i];
21196 + /* set the max size so we don't overflow later */
21197 + set_sb_oid_maxsize(disk_sb, new_size);
21199 + /* Zero out label and generate random UUID */
21200 + memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label));
21201 + generate_random_uuid(disk_sb->s_uuid);
21203 + /* finally, zero out the unused chunk of the new super */
21204 + memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused));
21205 + return 0;
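For a sense of scale, the even-rounded map size that reiserfs_convert_objectid_map_v1 computes for a 4 KiB block works out as below (SB_SIZE is assumed to be 204 bytes here purely for the arithmetic; the real value comes from reiserfs.h):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const int blocksize = 4096;
	const int sb_size = 204;   /* assumed sizeof(struct reiserfs_super_block) */
	int new_size = (blocksize - sb_size) / (int)sizeof(uint32_t) / 2 * 2;

	printf("objectid map entries after conversion: %d\n", new_size); /* 972 */
	return 0;
}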
21207 diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
21208 new file mode 100644
21209 index 000000000000..84a194b77f19
21210 --- /dev/null
21211 +++ b/fs/reiserfs/prints.c
21212 @@ -0,0 +1,792 @@
21214 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
21215 + */
21217 +#include <linux/time.h>
21218 +#include <linux/fs.h>
21219 +#include "reiserfs.h"
21220 +#include <linux/string.h>
21221 +#include <linux/buffer_head.h>
21223 +#include <linux/stdarg.h>
21225 +static char error_buf[1024];
21226 +static char fmt_buf[1024];
21227 +static char off_buf[80];
21229 +static char *reiserfs_cpu_offset(struct cpu_key *key)
21231 + if (cpu_key_k_type(key) == TYPE_DIRENTRY)
21232 + sprintf(off_buf, "%llu(%llu)",
21233 + (unsigned long long)
21234 + GET_HASH_VALUE(cpu_key_k_offset(key)),
21235 + (unsigned long long)
21236 + GET_GENERATION_NUMBER(cpu_key_k_offset(key)));
21237 + else
21238 + sprintf(off_buf, "0x%Lx",
21239 + (unsigned long long)cpu_key_k_offset(key));
21240 + return off_buf;
21243 +static char *le_offset(struct reiserfs_key *key)
21245 + int version;
21247 + version = le_key_version(key);
21248 + if (le_key_k_type(version, key) == TYPE_DIRENTRY)
21249 + sprintf(off_buf, "%llu(%llu)",
21250 + (unsigned long long)
21251 + GET_HASH_VALUE(le_key_k_offset(version, key)),
21252 + (unsigned long long)
21253 + GET_GENERATION_NUMBER(le_key_k_offset(version, key)));
21254 + else
21255 + sprintf(off_buf, "0x%Lx",
21256 + (unsigned long long)le_key_k_offset(version, key));
21257 + return off_buf;
21260 +static char *cpu_type(struct cpu_key *key)
21262 + if (cpu_key_k_type(key) == TYPE_STAT_DATA)
21263 + return "SD";
21264 + if (cpu_key_k_type(key) == TYPE_DIRENTRY)
21265 + return "DIR";
21266 + if (cpu_key_k_type(key) == TYPE_DIRECT)
21267 + return "DIRECT";
21268 + if (cpu_key_k_type(key) == TYPE_INDIRECT)
21269 + return "IND";
21270 + return "UNKNOWN";
21273 +static char *le_type(struct reiserfs_key *key)
21275 + int version;
21277 + version = le_key_version(key);
21279 + if (le_key_k_type(version, key) == TYPE_STAT_DATA)
21280 + return "SD";
21281 + if (le_key_k_type(version, key) == TYPE_DIRENTRY)
21282 + return "DIR";
21283 + if (le_key_k_type(version, key) == TYPE_DIRECT)
21284 + return "DIRECT";
21285 + if (le_key_k_type(version, key) == TYPE_INDIRECT)
21286 + return "IND";
21287 + return "UNKNOWN";
21290 +/* %k */
21291 +static int scnprintf_le_key(char *buf, size_t size, struct reiserfs_key *key)
21293 + if (key)
21294 + return scnprintf(buf, size, "[%d %d %s %s]",
21295 + le32_to_cpu(key->k_dir_id),
21296 + le32_to_cpu(key->k_objectid), le_offset(key),
21297 + le_type(key));
21298 + else
21299 + return scnprintf(buf, size, "[NULL]");
21302 +/* %K */
21303 +static int scnprintf_cpu_key(char *buf, size_t size, struct cpu_key *key)
21305 + if (key)
21306 + return scnprintf(buf, size, "[%d %d %s %s]",
21307 + key->on_disk_key.k_dir_id,
21308 + key->on_disk_key.k_objectid,
21309 + reiserfs_cpu_offset(key), cpu_type(key));
21310 + else
21311 + return scnprintf(buf, size, "[NULL]");
21314 +static int scnprintf_de_head(char *buf, size_t size,
21315 + struct reiserfs_de_head *deh)
21317 + if (deh)
21318 + return scnprintf(buf, size,
21319 + "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]",
21320 + deh_offset(deh), deh_dir_id(deh),
21321 + deh_objectid(deh), deh_location(deh),
21322 + deh_state(deh));
21323 + else
21324 + return scnprintf(buf, size, "[NULL]");
21328 +static int scnprintf_item_head(char *buf, size_t size, struct item_head *ih)
21330 + if (ih) {
21331 + char *p = buf;
21332 + char * const end = buf + size;
21334 + p += scnprintf(p, end - p, "%s",
21335 + (ih_version(ih) == KEY_FORMAT_3_6) ?
21336 + "*3.6* " : "*3.5*");
21338 + p += scnprintf_le_key(p, end - p, &ih->ih_key);
21340 + p += scnprintf(p, end - p,
21341 + ", item_len %d, item_location %d, free_space(entry_count) %d",
21342 + ih_item_len(ih), ih_location(ih),
21343 + ih_free_space(ih));
21344 + return p - buf;
21345 + } else
21346 + return scnprintf(buf, size, "[NULL]");
21349 +static int scnprintf_direntry(char *buf, size_t size,
21350 + struct reiserfs_dir_entry *de)
21352 + char name[20];
21354 + memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen);
21355 + name[de->de_namelen > 19 ? 19 : de->de_namelen] = 0;
21356 + return scnprintf(buf, size, "\"%s\"==>[%d %d]",
21357 + name, de->de_dir_id, de->de_objectid);
21360 +static int scnprintf_block_head(char *buf, size_t size, struct buffer_head *bh)
21362 + return scnprintf(buf, size,
21363 + "level=%d, nr_items=%d, free_space=%d rdkey ",
21364 + B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh));
21367 +static int scnprintf_buffer_head(char *buf, size_t size, struct buffer_head *bh)
21369 + return scnprintf(buf, size,
21370 + "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
21371 + bh->b_bdev, bh->b_size,
21372 + (unsigned long long)bh->b_blocknr,
21373 + atomic_read(&(bh->b_count)),
21374 + bh->b_state, bh->b_page,
21375 + buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE",
21376 + buffer_dirty(bh) ? "DIRTY" : "CLEAN",
21377 + buffer_locked(bh) ? "LOCKED" : "UNLOCKED");
21380 +static int scnprintf_disk_child(char *buf, size_t size, struct disk_child *dc)
21382 + return scnprintf(buf, size, "[dc_number=%d, dc_size=%u]",
21383 + dc_block_number(dc), dc_size(dc));
21386 +static char *is_there_reiserfs_struct(char *fmt, int *what)
21388 + char *k = fmt;
21390 + while ((k = strchr(k, '%')) != NULL) {
21391 + if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' ||
21392 + k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') {
21393 + *what = k[1];
21394 + break;
21396 + k++;
21398 + return k;
21402 + * When debugging reiserfs we used to print out a lot of different
21403 + * variables, like keys, item headers, buffer heads etc. Values of
21404 + * most fields matter, so it took a long time just to write an
21405 + * appropriate printk. With this reiserfs_warning you can use format
21406 + * specifications for complex structures like you used to do with
21407 + * printfs for integers, doubles and pointers. For instance, to print
21408 + * out a key structure you just write:
21409 + * reiserfs_warning ("bad key %k", key);
21410 + * instead of
21411 + * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid,
21412 + * key->k_offset, key->k_uniqueness);
21413 + */
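In practice a caller just mixes the extra specifiers with the standard ones and passes the structure by pointer; the namei.c hunk above already does exactly this:

/* from reiserfs_add_entry earlier in this patch -- %K consumes a struct cpu_key * */
reiserfs_warning(dir->i_sb, "vs-7032",
		 "entry with this key (%K) already exists", &entry_key);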
21414 +static DEFINE_SPINLOCK(error_lock);
21415 +static void prepare_error_buf(const char *fmt, va_list args)
21417 + char *fmt1 = fmt_buf;
21418 + char *k;
21419 + char *p = error_buf;
21420 + char * const end = &error_buf[sizeof(error_buf)];
21421 + int what;
21423 + spin_lock(&error_lock);
21425 + if (WARN_ON(strscpy(fmt_buf, fmt, sizeof(fmt_buf)) < 0)) {
21426 + strscpy(error_buf, "format string too long", end - error_buf);
21427 + goto out_unlock;
21430 + while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) {
21431 + *k = 0;
21433 + p += vscnprintf(p, end - p, fmt1, args);
21435 + switch (what) {
21436 + case 'k':
21437 + p += scnprintf_le_key(p, end - p,
21438 + va_arg(args, struct reiserfs_key *));
21439 + break;
21440 + case 'K':
21441 + p += scnprintf_cpu_key(p, end - p,
21442 + va_arg(args, struct cpu_key *));
21443 + break;
21444 + case 'h':
21445 + p += scnprintf_item_head(p, end - p,
21446 + va_arg(args, struct item_head *));
21447 + break;
21448 + case 't':
21449 + p += scnprintf_direntry(p, end - p,
21450 + va_arg(args, struct reiserfs_dir_entry *));
21451 + break;
21452 + case 'y':
21453 + p += scnprintf_disk_child(p, end - p,
21454 + va_arg(args, struct disk_child *));
21455 + break;
21456 + case 'z':
21457 + p += scnprintf_block_head(p, end - p,
21458 + va_arg(args, struct buffer_head *));
21459 + break;
21460 + case 'b':
21461 + p += scnprintf_buffer_head(p, end - p,
21462 + va_arg(args, struct buffer_head *));
21463 + break;
21464 + case 'a':
21465 + p += scnprintf_de_head(p, end - p,
21466 + va_arg(args, struct reiserfs_de_head *));
21467 + break;
21470 + fmt1 = k + 2;
21472 + p += vscnprintf(p, end - p, fmt1, args);
21473 +out_unlock:
21474 + spin_unlock(&error_lock);
21479 + * in addition to usual conversion specifiers this accepts reiserfs
21480 + * specific conversion specifiers:
21481 + * %k to print little endian key,
21482 + * %K to print cpu key,
21483 + * %h to print item_head,
21484 + * %t to print directory entry, %y to print disk_child,
21485 + * %z to print block head (arg must be struct buffer_head *),
21486 + * %b to print buffer_head, %a to print reiserfs_de_head
21487 + */
21489 +#define do_reiserfs_warning(fmt)\
21491 + va_list args;\
21492 + va_start( args, fmt );\
21493 + prepare_error_buf( fmt, args );\
21494 + va_end( args );\
21497 +void __reiserfs_warning(struct super_block *sb, const char *id,
21498 + const char *function, const char *fmt, ...)
21500 + do_reiserfs_warning(fmt);
21501 + if (sb)
21502 + printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: "
21503 + "%s\n", sb->s_id, id ? id : "", id ? " " : "",
21504 + function, error_buf);
21505 + else
21506 + printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n",
21507 + id ? id : "", id ? " " : "", function, error_buf);
21510 +/* No newline.. reiserfs_info calls can be followed by printk's */
21511 +void reiserfs_info(struct super_block *sb, const char *fmt, ...)
21513 + do_reiserfs_warning(fmt);
21514 + if (sb)
21515 + printk(KERN_NOTICE "REISERFS (device %s): %s",
21516 + sb->s_id, error_buf);
21517 + else
21518 + printk(KERN_NOTICE "REISERFS %s:", error_buf);
21521 +/* No newline.. reiserfs_printk calls can be followed by printk's */
21522 +static void reiserfs_printk(const char *fmt, ...)
21524 + do_reiserfs_warning(fmt);
21525 + printk("%s", error_buf); /* never re-interpret '%' in the buffer */
21528 +void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
21530 +#ifdef CONFIG_REISERFS_CHECK
21531 + do_reiserfs_warning(fmt);
21532 + if (s)
21533 + printk(KERN_DEBUG "REISERFS debug (device %s): %s\n",
21534 + s->s_id, error_buf);
21535 + else
21536 + printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf);
21537 +#endif
21541 + * The format:
21543 + * maintainer-errorid: [function-name:] message
21545 + * where errorid is unique to the maintainer and function-name is
21546 + * optional, is recommended, so that anyone can easily find the bug
21547 + * with a simple grep for the short, easy-to-type string
21548 + * maintainer-errorid. Don't bother with reusing errorids, there are
21549 + * lots of numbers out there.
21551 + * Example:
21553 + * reiserfs_panic(
21554 + * p_sb, "reiser-29: reiserfs_new_blocknrs: "
21555 + * "one of search_start or rn(%d) is equal to MAX_B_NUM,"
21556 + * "which means that we are optimizing location based on the "
21557 + * "bogus location of a temp buffer (%p).",
21558 + * rn, bh
21559 + * );
21561 + * Regular panic()s sometimes clear the screen before the message can
21562 + * be read, thus the need for the while loop.
21564 + * Numbering scheme for panic used by Vladimir and Anatoly (Hans completely
21565 + * ignores this scheme, and considers it pointless complexity):
21567 + * panics in reiserfs_fs.h have numbers from 1000 to 1999
21568 + * super.c 2000 to 2999
21569 + * preserve.c (unused) 3000 to 3999
21570 + * bitmap.c 4000 to 4999
21571 + * stree.c 5000 to 5999
21572 + * prints.c 6000 to 6999
21573 + * namei.c 7000 to 7999
21574 + * fix_nodes.c 8000 to 8999
21575 + * dir.c 9000 to 9999
21576 + * lbalance.c 10000 to 10999
21577 + * ibalance.c 11000 to 11999 not ready
21578 + * do_balan.c 12000 to 12999
21579 + * inode.c 13000 to 13999
21580 + * file.c 14000 to 14999
21581 + * objectid.c 15000 - 15999
21582 + * buffer.c 16000 - 16999
21583 + * symlink.c 17000 - 17999
21585 + */
21587 +void __reiserfs_panic(struct super_block *sb, const char *id,
21588 + const char *function, const char *fmt, ...)
21590 + do_reiserfs_warning(fmt);
21592 +#ifdef CONFIG_REISERFS_CHECK
21593 + dump_stack();
21594 +#endif
21595 + if (sb)
21596 + printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n",
21597 + sb->s_id, id ? id : "", id ? " " : "",
21598 + function, error_buf);
21599 + else
21600 + printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n",
21601 + id ? id : "", id ? " " : "", function, error_buf);
21602 + BUG();
21605 +void __reiserfs_error(struct super_block *sb, const char *id,
21606 + const char *function, const char *fmt, ...)
21608 + do_reiserfs_warning(fmt);
21610 + BUG_ON(sb == NULL);
21612 + if (reiserfs_error_panic(sb))
21613 + __reiserfs_panic(sb, id, function, error_buf);
21615 + if (id && id[0])
21616 + printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n",
21617 + sb->s_id, id, function, error_buf);
21618 + else
21619 + printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n",
21620 + sb->s_id, function, error_buf);
21622 + if (sb_rdonly(sb))
21623 + return;
21625 + reiserfs_info(sb, "Remounting filesystem read-only\n");
21626 + sb->s_flags |= SB_RDONLY;
21627 + reiserfs_abort_journal(sb, -EIO);
21630 +void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
21632 + do_reiserfs_warning(fmt);
21634 + if (reiserfs_error_panic(sb)) {
21635 + panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id,
21636 + error_buf);
21639 + if (reiserfs_is_journal_aborted(SB_JOURNAL(sb)))
21640 + return;
21642 + printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id,
21643 + error_buf);
21645 + sb->s_flags |= SB_RDONLY;
21646 + reiserfs_abort_journal(sb, errno);
21650 + * this prints internal nodes (4 keys/items in line) (dc_number,
21651 + * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number,
21652 + * dc_size)...
21653 + */
21654 +static int print_internal(struct buffer_head *bh, int first, int last)
21656 + struct reiserfs_key *key;
21657 + struct disk_child *dc;
21658 + int i;
21659 + int from, to;
21661 + if (!B_IS_KEYS_LEVEL(bh))
21662 + return 1;
21664 + check_internal(bh);
21666 + if (first == -1) {
21667 + from = 0;
21668 + to = B_NR_ITEMS(bh);
21669 + } else {
21670 + from = first;
21671 + to = min_t(int, last, B_NR_ITEMS(bh));
21674 + reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh);
21676 + dc = B_N_CHILD(bh, from);
21677 + reiserfs_printk("PTR %d: %y ", from, dc);
21679 + for (i = from, key = internal_key(bh, from), dc++; i < to;
21680 + i++, key++, dc++) {
21681 + reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc);
21682 + if (i && i % 4 == 0)
21683 + printk("\n");
21685 + printk("\n");
21686 + return 0;
21689 +static int print_leaf(struct buffer_head *bh, int print_mode, int first,
21690 + int last)
21692 + struct block_head *blkh;
21693 + struct item_head *ih;
21694 + int i, nr;
21695 + int from, to;
21697 + if (!B_IS_ITEMS_LEVEL(bh))
21698 + return 1;
21700 + check_leaf(bh);
21702 + blkh = B_BLK_HEAD(bh);
21703 + ih = item_head(bh, 0);
21704 + nr = blkh_nr_item(blkh);
21706 + printk
21707 + ("\n===================================================================\n");
21708 + reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh);
21710 + if (!(print_mode & PRINT_LEAF_ITEMS)) {
21711 + reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n",
21712 + &(ih->ih_key), &((ih + nr - 1)->ih_key));
21713 + return 0;
21716 + if (first < 0 || first > nr - 1)
21717 + from = 0;
21718 + else
21719 + from = first;
21721 + if (last < 0 || last > nr)
21722 + to = nr;
21723 + else
21724 + to = last;
21726 + ih += from;
21727 + printk
21728 + ("-------------------------------------------------------------------------------\n");
21729 + printk
21730 + ("|##| type | key | ilen | free_space | version | loc |\n");
21731 + for (i = from; i < to; i++, ih++) {
21732 + printk
21733 + ("-------------------------------------------------------------------------------\n");
21734 + reiserfs_printk("|%2d| %h |\n", i, ih);
21735 + if (print_mode & PRINT_LEAF_ITEMS)
21736 + op_print_item(ih, ih_item_body(bh, ih));
21739 + printk
21740 + ("===================================================================\n");
21742 + return 0;
21745 +char *reiserfs_hashname(int code)
21747 + if (code == YURA_HASH)
21748 + return "rupasov";
21749 + if (code == TEA_HASH)
21750 + return "tea";
21751 + if (code == R5_HASH)
21752 + return "r5";
21754 + return "unknown";
21757 +/* return 1 if this is not super block */
21758 +static int print_super_block(struct buffer_head *bh)
21760 + struct reiserfs_super_block *rs =
21761 + (struct reiserfs_super_block *)(bh->b_data);
21762 + int skipped, data_blocks;
21763 + char *version;
21765 + if (is_reiserfs_3_5(rs)) {
21766 + version = "3.5";
21767 + } else if (is_reiserfs_3_6(rs)) {
21768 + version = "3.6";
21769 + } else if (is_reiserfs_jr(rs)) {
21770 + version = ((sb_version(rs) == REISERFS_VERSION_2) ?
21771 + "3.6" : "3.5");
21772 + } else {
21773 + return 1;
21776 + printk("%pg\'s super block is in block %llu\n", bh->b_bdev,
21777 + (unsigned long long)bh->b_blocknr);
21778 + printk("Reiserfs version %s\n", version);
21779 + printk("Block count %u\n", sb_block_count(rs));
21780 + printk("Blocksize %d\n", sb_blocksize(rs));
21781 + printk("Free blocks %u\n", sb_free_blocks(rs));
21782 + /*
21783 + * FIXME: this would be confusing if
21784 + * someone stores reiserfs super block in some data block ;)
21785 +// skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs);
21786 + */
21787 + skipped = bh->b_blocknr;
21788 + data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) -
21789 + (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) +
21790 + 1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs);
21791 + printk
21792 + ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n"
21793 + "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs),
21794 + (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) :
21795 + sb_reserved_for_journal(rs)), data_blocks);
21796 + printk("Root block %u\n", sb_root_block(rs));
21797 + printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs));
21798 + printk("Journal dev %d\n", sb_jp_journal_dev(rs));
21799 + printk("Journal orig size %d\n", sb_jp_journal_size(rs));
21800 + printk("FS state %d\n", sb_fs_state(rs));
21801 + printk("Hash function \"%s\"\n",
21802 + reiserfs_hashname(sb_hash_function_code(rs)));
21804 + printk("Tree height %d\n", sb_tree_height(rs));
21805 + return 0;
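The data_blocks arithmetic above is whole-device accounting: whatever is not
skipped boot area, super block, bitmaps, journal, or free space must hold
data. A worked instance with made-up but plausible numbers (8 GiB device,
4 KiB blocks, super block in block 16, 64 bitmap blocks, the default
8192-block journal plus its header, one million blocks free):

    data_blocks = 2097152 - 16 - 1 - 64 - (8192 + 1) - 1000000
                = 1088878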
21808 +static int print_desc_block(struct buffer_head *bh)
21810 + struct reiserfs_journal_desc *desc;
21812 + if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8))
21813 + return 1;
21815 + desc = (struct reiserfs_journal_desc *)(bh->b_data);
21816 + printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)",
21817 + (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc),
21818 + get_desc_mount_id(desc), get_desc_trans_len(desc));
21820 + return 0;
21822 +/* ..., int print_mode, int first, int last) */
21823 +void print_block(struct buffer_head *bh, ...)
21825 + va_list args;
21826 + int mode, first, last;
21828 + if (!bh) {
21829 + printk("print_block: buffer is NULL\n");
21830 + return;
21833 + va_start(args, bh);
21835 + mode = va_arg(args, int);
21836 + first = va_arg(args, int);
21837 + last = va_arg(args, int);
21838 + if (print_leaf(bh, mode, first, last))
21839 + if (print_internal(bh, first, last))
21840 + if (print_super_block(bh))
21841 + if (print_desc_block(bh))
21842 + printk
21843 + ("Block %llu contains unformatted data\n",
21844 + (unsigned long long)bh->b_blocknr);
21846 + va_end(args);
21849 +static char print_tb_buf[2048];
21851 +/* this stores initial state of tree balance in the print_tb_buf */
21852 +void store_print_tb(struct tree_balance *tb)
21854 + int h = 0;
21855 + int i;
21856 + struct buffer_head *tbSh, *tbFh;
21858 + if (!tb)
21859 + return;
21861 + sprintf(print_tb_buf, "\n"
21862 + "BALANCING %d\n"
21863 + "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n"
21864 + "=====================================================================\n"
21865 + "* h * S * L * R * F * FL * FR * CFL * CFR *\n",
21866 + REISERFS_SB(tb->tb_sb)->s_do_balance,
21867 + tb->tb_mode, PATH_LAST_POSITION(tb->tb_path),
21868 + tb->tb_path->pos_in_item);
21870 + for (h = 0; h < ARRAY_SIZE(tb->insert_size); h++) {
21871 + if (PATH_H_PATH_OFFSET(tb->tb_path, h) <=
21872 + tb->tb_path->path_length
21873 + && PATH_H_PATH_OFFSET(tb->tb_path,
21874 + h) > ILLEGAL_PATH_ELEMENT_OFFSET) {
21875 + tbSh = PATH_H_PBUFFER(tb->tb_path, h);
21876 + tbFh = PATH_H_PPARENT(tb->tb_path, h);
21877 + } else {
21878 + tbSh = NULL;
21879 + tbFh = NULL;
21881 + sprintf(print_tb_buf + strlen(print_tb_buf),
21882 + "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n",
21883 + h,
21884 + (tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL),
21885 + (tbSh) ? atomic_read(&tbSh->b_count) : -1,
21886 + (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL),
21887 + (tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1,
21888 + (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL),
21889 + (tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1,
21890 + (tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL),
21891 + (tb->FL[h]) ? (long long)(tb->FL[h]->
21892 + b_blocknr) : (-1LL),
21893 + (tb->FR[h]) ? (long long)(tb->FR[h]->
21894 + b_blocknr) : (-1LL),
21895 + (tb->CFL[h]) ? (long long)(tb->CFL[h]->
21896 + b_blocknr) : (-1LL),
21897 + (tb->CFR[h]) ? (long long)(tb->CFR[h]->
21898 + b_blocknr) : (-1LL));
21901 + sprintf(print_tb_buf + strlen(print_tb_buf),
21902 + "=====================================================================\n"
21903 + "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n"
21904 + "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n",
21905 + tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],
21906 + tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0],
21907 + tb->sbytes[0], tb->snum[1], tb->sbytes[1],
21908 + tb->cur_blknum, tb->lkey[0], tb->rkey[0]);
21910 + /* this prints balance parameters for non-leaf levels */
21911 + h = 0;
21912 + do {
21913 + h++;
21914 + sprintf(print_tb_buf + strlen(print_tb_buf),
21915 + "* %d * %4d * %2d * * %2d * * %2d *\n",
21916 + h, tb->insert_size[h], tb->lnum[h], tb->rnum[h],
21917 + tb->blknum[h]);
21918 + } while (tb->insert_size[h]);
21920 + sprintf(print_tb_buf + strlen(print_tb_buf),
21921 + "=====================================================================\n"
21922 + "FEB list: ");
21924 + /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */
21925 + h = 0;
21926 + for (i = 0; i < ARRAY_SIZE(tb->FEB); i++)
21927 + sprintf(print_tb_buf + strlen(print_tb_buf),
21928 + "%p (%llu %d)%s", tb->FEB[i],
21929 + tb->FEB[i] ? (unsigned long long)tb->FEB[i]->
21930 + b_blocknr : 0ULL,
21931 + tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0,
21932 + (i == ARRAY_SIZE(tb->FEB) - 1) ? "\n" : ", ");
21934 + sprintf(print_tb_buf + strlen(print_tb_buf),
21935 + "======================== the end ====================================\n");
21938 +void print_cur_tb(char *mes)
21940 + printk("%s\n%s", mes, print_tb_buf);
21943 +static void check_leaf_block_head(struct buffer_head *bh)
21945 + struct block_head *blkh;
21946 + int nr;
21948 + blkh = B_BLK_HEAD(bh);
21949 + nr = blkh_nr_item(blkh);
21950 + if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE)
21951 + reiserfs_panic(NULL, "vs-6010", "invalid item number %z",
21952 + bh);
21953 + if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr)
21954 + reiserfs_panic(NULL, "vs-6020", "invalid free space %z",
21955 + bh);
21959 +static void check_internal_block_head(struct buffer_head *bh)
21961 + if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT))
21962 + reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh);
21964 + if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE)
21965 + reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh);
21967 + if (B_FREE_SPACE(bh) !=
21968 + bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) -
21969 + DC_SIZE * (B_NR_ITEMS(bh) + 1))
21970 + reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh);
21974 +void check_leaf(struct buffer_head *bh)
21976 + int i;
21977 + struct item_head *ih;
21979 + if (!bh)
21980 + return;
21981 + check_leaf_block_head(bh);
21982 + for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++)
21983 + op_check_item(ih, ih_item_body(bh, ih));
21986 +void check_internal(struct buffer_head *bh)
21988 + if (!bh)
21989 + return;
21990 + check_internal_block_head(bh);
21993 +void print_statistics(struct super_block *s)
21996 + /*
21997 + printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \
21998 + bmap with search %d, without %d, dir2ind %d, ind2dir %d\n",
21999 + REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes,
22000 + REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search,
22001 + REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct);
22002 + */
22005 diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
22006 new file mode 100644
22007 index 000000000000..5c68a4a52d78
22008 --- /dev/null
22009 +++ b/fs/reiserfs/procfs.c
22010 @@ -0,0 +1,490 @@
22011 +/* -*- linux-c -*- */
22013 +/* fs/reiserfs/procfs.c */
22016 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
22017 + */
22019 +/* proc info support a la one created by Sizif@Botik.RU for PGC */
22021 +#include <linux/module.h>
22022 +#include <linux/time.h>
22023 +#include <linux/seq_file.h>
22024 +#include <linux/uaccess.h>
22025 +#include "reiserfs.h"
22026 +#include <linux/init.h>
22027 +#include <linux/proc_fs.h>
22028 +#include <linux/blkdev.h>
22031 + * LOCKING:
22033 + * These guys are evicted from procfs as the very first step in ->kill_sb().
22035 + */
22037 +static int show_version(struct seq_file *m, void *unused)
22039 + struct super_block *sb = m->private;
22040 + char *format;
22042 + if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) {
22043 + format = "3.6";
22044 + } else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) {
22045 + format = "3.5";
22046 + } else {
22047 + format = "unknown";
22050 + seq_printf(m, "%s format\twith checks %s\n", format,
22051 +#if defined( CONFIG_REISERFS_CHECK )
22052 + "on"
22053 +#else
22054 + "off"
22055 +#endif
22056 + );
22057 + return 0;
22060 +#define SF( x ) ( r -> x )
22061 +#define SFP( x ) SF( s_proc_info_data.x )
22062 +#define SFPL( x ) SFP( x[ level ] )
22063 +#define SFPF( x ) SFP( scan_bitmap.x )
22064 +#define SFPJ( x ) SFP( journal.x )
22066 +#define D2C( x ) le16_to_cpu( x )
22067 +#define D4C( x ) le32_to_cpu( x )
22068 +#define DF( x ) D2C( rs -> s_v1.x )
22069 +#define DFL( x ) D4C( rs -> s_v1.x )
22071 +#define objectid_map( s, rs ) (old_format_only (s) ? \
22072 + (__le32 *)((struct reiserfs_super_block_v1 *)rs + 1) : \
22073 + (__le32 *)(rs + 1))
22074 +#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] )
22076 +#define DJF( x ) le32_to_cpu( rs -> x )
22077 +#define DJP( x ) le32_to_cpu( jp -> x )
22078 +#define JF( x ) ( r -> s_journal -> x )
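These one-letter macros exist only to keep the seq_printf() argument lists
below readable. Expanding one by hand (definitions as above):

    SFPL(balance_at)
      -> SFP(balance_at[level])
      -> SF(s_proc_info_data.balance_at[level])
      -> r->s_proc_info_data.balance_at[level]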
22080 +static int show_super(struct seq_file *m, void *unused)
22082 + struct super_block *sb = m->private;
22083 + struct reiserfs_sb_info *r = REISERFS_SB(sb);
22085 + seq_printf(m, "state: \t%s\n"
22086 + "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
22087 + "gen. counter: \t%i\n"
22088 + "s_disk_reads: \t%i\n"
22089 + "s_disk_writes: \t%i\n"
22090 + "s_fix_nodes: \t%i\n"
22091 + "s_do_balance: \t%i\n"
22092 + "s_unneeded_left_neighbor: \t%i\n"
22093 + "s_good_search_by_key_reada: \t%i\n"
22094 + "s_bmaps: \t%i\n"
22095 + "s_bmaps_without_search: \t%i\n"
22096 + "s_direct2indirect: \t%i\n"
22097 + "s_indirect2direct: \t%i\n"
22098 + "\n"
22099 + "max_hash_collisions: \t%i\n"
22100 + "breads: \t%lu\n"
22101 + "bread_misses: \t%lu\n"
22102 + "search_by_key: \t%lu\n"
22103 + "search_by_key_fs_changed: \t%lu\n"
22104 + "search_by_key_restarted: \t%lu\n"
22105 + "insert_item_restarted: \t%lu\n"
22106 + "paste_into_item_restarted: \t%lu\n"
22107 + "cut_from_item_restarted: \t%lu\n"
22108 + "delete_solid_item_restarted: \t%lu\n"
22109 + "delete_item_restarted: \t%lu\n"
22110 + "leaked_oid: \t%lu\n"
22111 + "leaves_removable: \t%lu\n",
22112 + SF(s_mount_state) == REISERFS_VALID_FS ?
22113 + "REISERFS_VALID_FS" : "REISERFS_ERROR_FS",
22114 + reiserfs_r5_hash(sb) ? "FORCE_R5 " : "",
22115 + reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "",
22116 + reiserfs_tea_hash(sb) ? "FORCE_TEA " : "",
22117 + reiserfs_hash_detect(sb) ? "DETECT_HASH " : "",
22118 + reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ",
22119 + reiserfs_no_unhashed_relocation(sb) ?
22120 + "NO_UNHASHED_RELOCATION " : "",
22121 + reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "",
22122 + reiserfs_test4(sb) ? "TEST4 " : "",
22123 + have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ?
22124 + "SMALL_TAILS " : "NO_TAILS ",
22125 + replay_only(sb) ? "REPLAY_ONLY " : "",
22126 + convert_reiserfs(sb) ? "CONV " : "",
22127 + atomic_read(&r->s_generation_counter),
22128 + SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes),
22129 + SF(s_do_balance), SF(s_unneeded_left_neighbor),
22130 + SF(s_good_search_by_key_reada), SF(s_bmaps),
22131 + SF(s_bmaps_without_search), SF(s_direct2indirect),
22132 + SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads),
22133 + SFP(bread_miss), SFP(search_by_key),
22134 + SFP(search_by_key_fs_changed), SFP(search_by_key_restarted),
22135 + SFP(insert_item_restarted), SFP(paste_into_item_restarted),
22136 + SFP(cut_from_item_restarted),
22137 + SFP(delete_solid_item_restarted), SFP(delete_item_restarted),
22138 + SFP(leaked_oid), SFP(leaves_removable));
22140 + return 0;
22143 +static int show_per_level(struct seq_file *m, void *unused)
22145 + struct super_block *sb = m->private;
22146 + struct reiserfs_sb_info *r = REISERFS_SB(sb);
22147 + int level;
22149 + seq_printf(m, "level\t"
22150 + " balances"
22151 + " [sbk: reads"
22152 + " fs_changed"
22153 + " restarted]"
22154 + " free space"
22155 + " items"
22156 + " can_remove"
22157 + " lnum"
22158 + " rnum"
22159 + " lbytes"
22160 + " rbytes"
22161 + " get_neig"
22162 + " get_neig_res" " need_l_neig" " need_r_neig" "\n");
22164 + for (level = 0; level < MAX_HEIGHT; ++level) {
22165 + seq_printf(m, "%i\t"
22166 + " %12lu"
22167 + " %12lu"
22168 + " %12lu"
22169 + " %12lu"
22170 + " %12lu"
22171 + " %12lu"
22172 + " %12lu"
22173 + " %12li"
22174 + " %12li"
22175 + " %12li"
22176 + " %12li"
22177 + " %12lu"
22178 + " %12lu"
22179 + " %12lu"
22180 + " %12lu"
22181 + "\n",
22182 + level,
22183 + SFPL(balance_at),
22184 + SFPL(sbk_read_at),
22185 + SFPL(sbk_fs_changed),
22186 + SFPL(sbk_restarted),
22187 + SFPL(free_at),
22188 + SFPL(items_at),
22189 + SFPL(can_node_be_removed),
22190 + SFPL(lnum),
22191 + SFPL(rnum),
22192 + SFPL(lbytes),
22193 + SFPL(rbytes),
22194 + SFPL(get_neighbors),
22195 + SFPL(get_neighbors_restart),
22196 + SFPL(need_l_neighbor), SFPL(need_r_neighbor)
22197 + );
22199 + return 0;
22202 +static int show_bitmap(struct seq_file *m, void *unused)
22204 + struct super_block *sb = m->private;
22205 + struct reiserfs_sb_info *r = REISERFS_SB(sb);
22207 + seq_printf(m, "free_block: %lu\n"
22208 + " scan_bitmap:"
22209 + " wait"
22210 + " bmap"
22211 + " retry"
22212 + " stolen"
22213 + " journal_hint"
22214 + "journal_nohint"
22215 + "\n"
22216 + " %14lu"
22217 + " %14lu"
22218 + " %14lu"
22219 + " %14lu"
22220 + " %14lu"
22221 + " %14lu"
22222 + " %14lu"
22223 + "\n",
22224 + SFP(free_block),
22225 + SFPF(call),
22226 + SFPF(wait),
22227 + SFPF(bmap),
22228 + SFPF(retry),
22229 + SFPF(stolen),
22230 + SFPF(in_journal_hint), SFPF(in_journal_nohint));
22232 + return 0;
22235 +static int show_on_disk_super(struct seq_file *m, void *unused)
22237 + struct super_block *sb = m->private;
22238 + struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
22239 + struct reiserfs_super_block *rs = sb_info->s_rs;
22240 + int hash_code = DFL(s_hash_function_code);
22241 + __u32 flags = DJF(s_flags);
22243 + seq_printf(m, "block_count: \t%i\n"
22244 + "free_blocks: \t%i\n"
22245 + "root_block: \t%i\n"
22246 + "blocksize: \t%i\n"
22247 + "oid_maxsize: \t%i\n"
22248 + "oid_cursize: \t%i\n"
22249 + "umount_state: \t%i\n"
22250 + "magic: \t%10.10s\n"
22251 + "fs_state: \t%i\n"
22252 + "hash: \t%s\n"
22253 + "tree_height: \t%i\n"
22254 + "bmap_nr: \t%i\n"
22255 + "version: \t%i\n"
22256 + "flags: \t%x[%s]\n"
22257 + "reserved_for_journal: \t%i\n",
22258 + DFL(s_block_count),
22259 + DFL(s_free_blocks),
22260 + DFL(s_root_block),
22261 + DF(s_blocksize),
22262 + DF(s_oid_maxsize),
22263 + DF(s_oid_cursize),
22264 + DF(s_umount_state),
22265 + rs->s_v1.s_magic,
22266 + DF(s_fs_state),
22267 + hash_code == TEA_HASH ? "tea" :
22268 + (hash_code == YURA_HASH) ? "rupasov" :
22269 + (hash_code == R5_HASH) ? "r5" :
22270 + (hash_code == UNSET_HASH) ? "unset" : "unknown",
22271 + DF(s_tree_height),
22272 + DF(s_bmap_nr),
22273 + DF(s_version), flags, (flags & reiserfs_attrs_cleared)
22274 + ? "attrs_cleared" : "", DF(s_reserved_for_journal));
22276 + return 0;
22279 +static int show_oidmap(struct seq_file *m, void *unused)
22281 + struct super_block *sb = m->private;
22282 + struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
22283 + struct reiserfs_super_block *rs = sb_info->s_rs;
22284 + unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize);
22285 + unsigned long total_used = 0;
22286 + int i;
22288 + for (i = 0; i < mapsize; ++i) {
22289 + __u32 right;
22291 + right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1);
22292 + seq_printf(m, "%s: [ %x .. %x )\n",
22293 + (i & 1) ? "free" : "used", MAP(i), right);
22294 + if (!(i & 1)) {
22295 + total_used += right - MAP(i);
22298 +#if defined( REISERFS_USE_OIDMAPF )
22299 + if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) {
22300 + loff_t size = file_inode(sb_info->oidmap.mapf)->i_size;
22301 + total_used += size / sizeof(reiserfs_oidinterval_d_t);
22303 +#endif
22304 + seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n",
22305 + mapsize,
22306 + mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used);
22307 + return 0;
22310 +static time64_t ktime_mono_to_real_seconds(time64_t mono)
22312 + ktime_t kt = ktime_set(mono, NSEC_PER_SEC/2);
22314 + return ktime_divns(ktime_mono_to_real(kt), NSEC_PER_SEC);
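The helper above exists because j_trans_start_time is CLOCK_MONOTONIC while
the /proc output wants wall-clock seconds; seeding the ktime with
NSEC_PER_SEC/2 makes the final division round to the nearest second instead
of truncating. As plain arithmetic (a sketch; offset_ns stands for the
monotonic-to-real offset that ktime_mono_to_real() applies):

    real_seconds = (mono_s * NSEC_PER_SEC + NSEC_PER_SEC/2 + offset_ns)
                   / NSEC_PER_SEC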
22317 +static int show_journal(struct seq_file *m, void *unused)
22319 + struct super_block *sb = m->private;
22320 + struct reiserfs_sb_info *r = REISERFS_SB(sb);
22321 + struct reiserfs_super_block *rs = r->s_rs;
22322 + struct journal_params *jp = &rs->s_v1.s_journal;
22324 + seq_printf(m, /* on-disk fields */
22325 + "jp_journal_1st_block: \t%i\n"
22326 + "jp_journal_dev: \t%pg[%x]\n"
22327 + "jp_journal_size: \t%i\n"
22328 + "jp_journal_trans_max: \t%i\n"
22329 + "jp_journal_magic: \t%i\n"
22330 + "jp_journal_max_batch: \t%i\n"
22331 + "jp_journal_max_commit_age: \t%i\n"
22332 + "jp_journal_max_trans_age: \t%i\n"
22333 + /* incore fields */
22334 + "j_1st_reserved_block: \t%i\n"
22335 + "j_state: \t%li\n"
22336 + "j_trans_id: \t%u\n"
22337 + "j_mount_id: \t%lu\n"
22338 + "j_start: \t%lu\n"
22339 + "j_len: \t%lu\n"
22340 + "j_len_alloc: \t%lu\n"
22341 + "j_wcount: \t%i\n"
22342 + "j_bcount: \t%lu\n"
22343 + "j_first_unflushed_offset: \t%lu\n"
22344 + "j_last_flush_trans_id: \t%u\n"
22345 + "j_trans_start_time: \t%lli\n"
22346 + "j_list_bitmap_index: \t%i\n"
22347 + "j_must_wait: \t%i\n"
22348 + "j_next_full_flush: \t%i\n"
22349 + "j_next_async_flush: \t%i\n"
22350 + "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n"
22351 + /* reiserfs_proc_info_data_t.journal fields */
22352 + "in_journal: \t%12lu\n"
22353 + "in_journal_bitmap: \t%12lu\n"
22354 + "in_journal_reusable: \t%12lu\n"
22355 + "lock_journal: \t%12lu\n"
22356 + "lock_journal_wait: \t%12lu\n"
22357 + "journal_begin: \t%12lu\n"
22358 + "journal_relock_writers: \t%12lu\n"
22359 + "journal_relock_wcount: \t%12lu\n"
22360 + "mark_dirty: \t%12lu\n"
22361 + "mark_dirty_already: \t%12lu\n"
22362 + "mark_dirty_notjournal: \t%12lu\n"
22363 + "restore_prepared: \t%12lu\n"
22364 + "prepare: \t%12lu\n"
22365 + "prepare_retry: \t%12lu\n",
22366 + DJP(jp_journal_1st_block),
22367 + file_bdev(SB_JOURNAL(sb)->j_bdev_file),
22368 + DJP(jp_journal_dev),
22369 + DJP(jp_journal_size),
22370 + DJP(jp_journal_trans_max),
22371 + DJP(jp_journal_magic),
22372 + DJP(jp_journal_max_batch),
22373 + SB_JOURNAL(sb)->j_max_commit_age,
22374 + DJP(jp_journal_max_trans_age),
22375 + JF(j_1st_reserved_block),
22376 + JF(j_state),
22377 + JF(j_trans_id),
22378 + JF(j_mount_id),
22379 + JF(j_start),
22380 + JF(j_len),
22381 + JF(j_len_alloc),
22382 + atomic_read(&r->s_journal->j_wcount),
22383 + JF(j_bcount),
22384 + JF(j_first_unflushed_offset),
22385 + JF(j_last_flush_trans_id),
22386 + ktime_mono_to_real_seconds(JF(j_trans_start_time)),
22387 + JF(j_list_bitmap_index),
22388 + JF(j_must_wait),
22389 + JF(j_next_full_flush),
22390 + JF(j_next_async_flush),
22391 + JF(j_cnode_used),
22392 + JF(j_cnode_free),
22393 + SFPJ(in_journal),
22394 + SFPJ(in_journal_bitmap),
22395 + SFPJ(in_journal_reusable),
22396 + SFPJ(lock_journal),
22397 + SFPJ(lock_journal_wait),
22398 + SFPJ(journal_being),
22399 + SFPJ(journal_relock_writers),
22400 + SFPJ(journal_relock_wcount),
22401 + SFPJ(mark_dirty),
22402 + SFPJ(mark_dirty_already),
22403 + SFPJ(mark_dirty_notjournal),
22404 + SFPJ(restore_prepared), SFPJ(prepare), SFPJ(prepare_retry)
22405 + );
22406 + return 0;
22409 +static struct proc_dir_entry *proc_info_root = NULL;
22410 +static const char proc_info_root_name[] = "fs/reiserfs";
22412 +static void add_file(struct super_block *sb, char *name,
22413 + int (*func) (struct seq_file *, void *))
22415 + proc_create_single_data(name, 0, REISERFS_SB(sb)->procdir, func, sb);
22418 +int reiserfs_proc_info_init(struct super_block *sb)
22420 + char b[BDEVNAME_SIZE];
22421 + char *s;
22423 + /* Some block devices use /'s */
22424 + strscpy(b, sb->s_id, BDEVNAME_SIZE);
22425 + s = strchr(b, '/');
22426 + if (s)
22427 + *s = '!';
22429 + spin_lock_init(&__PINFO(sb).lock);
22430 + REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb);
22431 + if (REISERFS_SB(sb)->procdir) {
22432 + add_file(sb, "version", show_version);
22433 + add_file(sb, "super", show_super);
22434 + add_file(sb, "per-level", show_per_level);
22435 + add_file(sb, "bitmap", show_bitmap);
22436 + add_file(sb, "on-disk-super", show_on_disk_super);
22437 + add_file(sb, "oidmap", show_oidmap);
22438 + add_file(sb, "journal", show_journal);
22439 + return 0;
22441 + reiserfs_warning(sb, "cannot create /proc/%s/%s",
22442 + proc_info_root_name, b);
22443 + return 1;
22446 +int reiserfs_proc_info_done(struct super_block *sb)
22448 + struct proc_dir_entry *de = REISERFS_SB(sb)->procdir;
22449 + if (de) {
22450 + char b[BDEVNAME_SIZE];
22451 + char *s;
22453 + /* Some block devices use /'s */
22454 + strscpy(b, sb->s_id, BDEVNAME_SIZE);
22455 + s = strchr(b, '/');
22456 + if (s)
22457 + *s = '!';
22459 + remove_proc_subtree(b, proc_info_root);
22460 + REISERFS_SB(sb)->procdir = NULL;
22462 + return 0;
22465 +int reiserfs_proc_info_global_init(void)
22467 + if (proc_info_root == NULL) {
22468 + proc_info_root = proc_mkdir(proc_info_root_name, NULL);
22469 + if (!proc_info_root) {
22470 + reiserfs_warning(NULL, "cannot create /proc/%s",
22471 + proc_info_root_name);
22472 + return 1;
22475 + return 0;
22478 +int reiserfs_proc_info_global_done(void)
22480 + if (proc_info_root != NULL) {
22481 + proc_info_root = NULL;
22482 + remove_proc_entry(proc_info_root_name, NULL);
22484 + return 0;
22487 + * Revision 1.1.8.2 2001/07/15 17:08:42 god
22488 + * . use get_super() in procfs.c
22489 + * . remove remove_save_link() from reiserfs_do_truncate()
22491 + * I accept terms and conditions stated in the Legal Agreement
22492 + * (available at http://www.namesys.com/legalese.html)
22494 + * Revision 1.1.8.1 2001/07/11 16:48:50 god
22495 + * proc info support
22497 + * I accept terms and conditions stated in the Legal Agreement
22498 + * (available at http://www.namesys.com/legalese.html)
22500 + */
22501 diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
22502 new file mode 100644
22503 index 000000000000..12fc20af8e17
22504 --- /dev/null
22505 +++ b/fs/reiserfs/reiserfs.h
22506 @@ -0,0 +1,3419 @@
22507 +/* SPDX-License-Identifier: GPL-2.0 */
22509 + * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for
22510 + * licensing and copyright details
22511 + */
22513 +#include <linux/reiserfs_fs.h>
22515 +#include <linux/slab.h>
22516 +#include <linux/interrupt.h>
22517 +#include <linux/sched.h>
22518 +#include <linux/bug.h>
22519 +#include <linux/workqueue.h>
22520 +#include <linux/unaligned.h>
22521 +#include <linux/bitops.h>
22522 +#include <linux/proc_fs.h>
22523 +#include <linux/buffer_head.h>
22525 +/* the 32 bit compat definitions with int argument */
22526 +#define REISERFS_IOC32_UNPACK _IOW(0xCD, 1, int)
22527 +#define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION
22528 +#define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION
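These mirror the native ioctls for 32-bit callers on 64-bit kernels; the
unpack command is number 1 on ioctl code 0xCD, as the _IOW() above shows.
A hedged userspace sketch (assuming the native REISERFS_IOC_UNPACK from the
uapi <linux/reiserfs_fs.h>, and that the kernel side wants a nonzero
argument before it unpacks):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/reiserfs_fs.h>  /* REISERFS_IOC_UNPACK */

    /* hypothetical: ask that a tail-packed file be stored unpacked */
    int unpack_file(const char *path)
    {
            int fd = open(path, O_RDWR);
            int ret;

            if (fd < 0)
                    return -1;
            ret = ioctl(fd, REISERFS_IOC_UNPACK, 1);
            close(fd);
            return ret;
    }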
22530 +struct reiserfs_journal_list;
22532 +/* bitmasks for i_flags field in reiserfs-specific part of inode */
22533 +typedef enum {
22534 + /*
22535 + * this says what format of key do all items (but stat data) of
22536 + * an object have. If this is set, that format is 3.6 otherwise - 3.5
22537 + */
22538 + i_item_key_version_mask = 0x0001,
22540 + /*
22541 + * If this is unset, object has 3.5 stat data, otherwise,
22542 + * it has 3.6 stat data with 64bit size, 32bit nlink etc.
22543 + */
22544 + i_stat_data_version_mask = 0x0002,
22546 + /* file might need tail packing on close */
22547 + i_pack_on_close_mask = 0x0004,
22549 + /* don't pack tail of file */
22550 + i_nopack_mask = 0x0008,
22552 + /*
22553 + * If either of these are set, "safe link" was created for this
22554 + * file during truncate or unlink. Safe link is used to avoid
22555 + * leakage of disk space on crash with some files open, but unlinked.
22556 + */
22557 + i_link_saved_unlink_mask = 0x0010,
22558 + i_link_saved_truncate_mask = 0x0020,
22560 + i_has_xattr_dir = 0x0040,
22561 + i_data_log = 0x0080,
22562 +} reiserfs_inode_flags;
22564 +struct reiserfs_inode_info {
22565 + __u32 i_key[4]; /* key is still 4 32 bit integers */
22567 + /*
22568 + * transient inode flags that are never stored on disk. Bitmasks
22569 + * for this field are defined above.
22570 + */
22571 + __u32 i_flags;
22573 + /* offset of first byte stored in direct item. */
22574 + __u32 i_first_direct_byte;
22576 + /* copy of persistent inode flags read from sd_attrs. */
22577 + __u32 i_attrs;
22579 + /* first unused block of a sequence of unused blocks */
22580 + int i_prealloc_block;
22581 + int i_prealloc_count; /* length of that sequence */
22583 + /* per-transaction list of inodes which have preallocated blocks */
22584 + struct list_head i_prealloc_list;
22586 + /*
22587 + * new_packing_locality is created; new blocks for the contents
22588 + * of this directory should be displaced
22589 + */
22590 + unsigned new_packing_locality:1;
22592 + /*
22593 + * we use these for fsync or O_SYNC to decide which transaction
22594 + * needs to be committed in order for this inode to be properly
22595 + * flushed
22596 + */
22597 + unsigned int i_trans_id;
22599 + struct reiserfs_journal_list *i_jl;
22600 + atomic_t openers;
22601 + struct mutex tailpack;
22602 +#ifdef CONFIG_REISERFS_FS_XATTR
22603 + struct rw_semaphore i_xattr_sem;
22604 +#endif
22605 +#ifdef CONFIG_QUOTA
22606 + struct dquot __rcu *i_dquot[MAXQUOTAS];
22607 +#endif
22609 + struct inode vfs_inode;
22612 +typedef enum {
22613 + reiserfs_attrs_cleared = 0x00000001,
22614 +} reiserfs_super_block_flags;
22617 + * struct reiserfs_super_block accessors/mutators since this is a disk
22618 + * structure, it will always be in little endian format.
22619 + */
22620 +#define sb_block_count(sbp) (le32_to_cpu((sbp)->s_v1.s_block_count))
22621 +#define set_sb_block_count(sbp,v) ((sbp)->s_v1.s_block_count = cpu_to_le32(v))
22622 +#define sb_free_blocks(sbp) (le32_to_cpu((sbp)->s_v1.s_free_blocks))
22623 +#define set_sb_free_blocks(sbp,v) ((sbp)->s_v1.s_free_blocks = cpu_to_le32(v))
22624 +#define sb_root_block(sbp) (le32_to_cpu((sbp)->s_v1.s_root_block))
22625 +#define set_sb_root_block(sbp,v) ((sbp)->s_v1.s_root_block = cpu_to_le32(v))
22627 +#define sb_jp_journal_1st_block(sbp) \
22628 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_1st_block))
22629 +#define set_sb_jp_journal_1st_block(sbp,v) \
22630 + ((sbp)->s_v1.s_journal.jp_journal_1st_block = cpu_to_le32(v))
22631 +#define sb_jp_journal_dev(sbp) \
22632 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_dev))
22633 +#define set_sb_jp_journal_dev(sbp,v) \
22634 + ((sbp)->s_v1.s_journal.jp_journal_dev = cpu_to_le32(v))
22635 +#define sb_jp_journal_size(sbp) \
22636 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_size))
22637 +#define set_sb_jp_journal_size(sbp,v) \
22638 + ((sbp)->s_v1.s_journal.jp_journal_size = cpu_to_le32(v))
22639 +#define sb_jp_journal_trans_max(sbp) \
22640 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_trans_max))
22641 +#define set_sb_jp_journal_trans_max(sbp,v) \
22642 + ((sbp)->s_v1.s_journal.jp_journal_trans_max = cpu_to_le32(v))
22643 +#define sb_jp_journal_magic(sbp) \
22644 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_magic))
22645 +#define set_sb_jp_journal_magic(sbp,v) \
22646 + ((sbp)->s_v1.s_journal.jp_journal_magic = cpu_to_le32(v))
22647 +#define sb_jp_journal_max_batch(sbp) \
22648 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_batch))
22649 +#define set_sb_jp_journal_max_batch(sbp,v) \
22650 + ((sbp)->s_v1.s_journal.jp_journal_max_batch = cpu_to_le32(v))
22651 +#define sb_jp_jourmal_max_commit_age(sbp) \
22652 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_commit_age))
22653 +#define set_sb_jp_journal_max_commit_age(sbp,v) \
22654 + ((sbp)->s_v1.s_journal.jp_journal_max_commit_age = cpu_to_le32(v))
22656 +#define sb_blocksize(sbp) (le16_to_cpu((sbp)->s_v1.s_blocksize))
22657 +#define set_sb_blocksize(sbp,v) ((sbp)->s_v1.s_blocksize = cpu_to_le16(v))
22658 +#define sb_oid_maxsize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_maxsize))
22659 +#define set_sb_oid_maxsize(sbp,v) ((sbp)->s_v1.s_oid_maxsize = cpu_to_le16(v))
22660 +#define sb_oid_cursize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_cursize))
22661 +#define set_sb_oid_cursize(sbp,v) ((sbp)->s_v1.s_oid_cursize = cpu_to_le16(v))
22662 +#define sb_umount_state(sbp) (le16_to_cpu((sbp)->s_v1.s_umount_state))
22663 +#define set_sb_umount_state(sbp,v) ((sbp)->s_v1.s_umount_state = cpu_to_le16(v))
22664 +#define sb_fs_state(sbp) (le16_to_cpu((sbp)->s_v1.s_fs_state))
22665 +#define set_sb_fs_state(sbp,v) ((sbp)->s_v1.s_fs_state = cpu_to_le16(v))
22666 +#define sb_hash_function_code(sbp) \
22667 + (le32_to_cpu((sbp)->s_v1.s_hash_function_code))
22668 +#define set_sb_hash_function_code(sbp,v) \
22669 + ((sbp)->s_v1.s_hash_function_code = cpu_to_le32(v))
22670 +#define sb_tree_height(sbp) (le16_to_cpu((sbp)->s_v1.s_tree_height))
22671 +#define set_sb_tree_height(sbp,v) ((sbp)->s_v1.s_tree_height = cpu_to_le16(v))
22672 +#define sb_bmap_nr(sbp) (le16_to_cpu((sbp)->s_v1.s_bmap_nr))
22673 +#define set_sb_bmap_nr(sbp,v) ((sbp)->s_v1.s_bmap_nr = cpu_to_le16(v))
22674 +#define sb_version(sbp) (le16_to_cpu((sbp)->s_v1.s_version))
22675 +#define set_sb_version(sbp,v) ((sbp)->s_v1.s_version = cpu_to_le16(v))
22677 +#define sb_mnt_count(sbp) (le16_to_cpu((sbp)->s_mnt_count))
22678 +#define set_sb_mnt_count(sbp, v) ((sbp)->s_mnt_count = cpu_to_le16(v))
22680 +#define sb_reserved_for_journal(sbp) \
22681 + (le16_to_cpu((sbp)->s_v1.s_reserved_for_journal))
22682 +#define set_sb_reserved_for_journal(sbp,v) \
22683 + ((sbp)->s_v1.s_reserved_for_journal = cpu_to_le16(v))
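Every get/set pair converts exactly at the disk boundary, so the buffered
super block stays little-endian no matter the host CPU and an image moves
cleanly between, say, x86 and a big-endian machine. A two-line usage sketch
(hypothetical call site):

    __u32 free = sb_free_blocks(rs);     /* LE on disk -> CPU order */
    set_sb_free_blocks(rs, free - 1);    /* CPU order -> LE on disk */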
22685 +/* LOGGING -- */
22688 + * These all interrelate for performance.
22690 + * If the journal block count is smaller than n transactions, you lose speed.
22691 + * I don't know what n is yet, I'm guessing 8-16.
22693 + * typical transaction size depends on the application, how often fsync is
22694 + * called, and how many metadata blocks you dirty in a 30 second period.
22695 + * The more small files (<16k) you use, the larger your transactions will
22696 + * be.
22698 + * If your journal fills faster than dirty buffers get flushed to disk, it
22699 + * must flush them before allowing the journal to wrap, which slows things
22700 + * down. If you need high speed meta data updates, the journal should be
22701 + * big enough to prevent wrapping before dirty meta blocks get to disk.
22703 + * If the batch max is smaller than the transaction max, you'll waste space
22704 + * at the end of the journal because journal_end sets the next transaction
22705 + * to start at 0 if the next transaction has any chance of wrapping.
22707 + * The larger the batch max age, the better the speed, and the more meta
22708 + * data changes you'll lose after a crash.
22709 + */
22711 +/* don't mess with these for a while */
22712 +/* we have a node size define somewhere in reiserfs_fs.h. -Hans */
22713 +#define JOURNAL_BLOCK_SIZE 4096 /* BUG gotta get rid of this */
22714 +#define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */
22715 +#define JOURNAL_HASH_SIZE 8192
22717 +/* number of copies of the bitmaps to have floating. Must be >= 2 */
22718 +#define JOURNAL_NUM_BITMAPS 5
22721 + * One of these for every block in every transaction
22722 + * Each one is in two hash tables. First, a hash of the current transaction,
22723 + * and after journal_end, a hash of all the in memory transactions.
22724 + * next and prev are used by the current transaction (journal_hash).
22725 + * hnext and hprev are used by journal_list_hash. If a block is in more
22726 + * than one transaction, the journal_list_hash links it in multiple times.
22727 + * This allows flush_journal_list to remove just the cnode belonging to a
22728 + * given transaction.
22729 + */
22730 +struct reiserfs_journal_cnode {
22731 + struct buffer_head *bh; /* real buffer head */
22732 + struct super_block *sb; /* dev of real buffer head */
22734 + /* block number of real buffer head, == 0 when buffer on disk */
22735 + __u32 blocknr;
22737 + unsigned long state;
22739 + /* journal list this cnode lives in */
22740 + struct reiserfs_journal_list *jlist;
22742 + struct reiserfs_journal_cnode *next; /* next in transaction list */
22743 + struct reiserfs_journal_cnode *prev; /* prev in transaction list */
22744 + struct reiserfs_journal_cnode *hprev; /* prev in hash list */
22745 + struct reiserfs_journal_cnode *hnext; /* next in hash list */
22748 +struct reiserfs_bitmap_node {
22749 + int id;
22750 + char *data;
22751 + struct list_head list;
22754 +struct reiserfs_list_bitmap {
22755 + struct reiserfs_journal_list *journal_list;
22756 + struct reiserfs_bitmap_node **bitmaps;
22760 + * one of these for each transaction. The most important part here is the
22761 + * j_realblock. this list of cnodes is used to hash all the blocks in all
22762 + * the commits, to mark all the real buffer heads dirty once all the commits
22763 + * hit the disk, and to make sure every real block in a transaction is on
22764 + * disk before allowing the log area to be overwritten
22765 + */
22766 +struct reiserfs_journal_list {
22767 + unsigned long j_start;
22768 + unsigned long j_state;
22769 + unsigned long j_len;
22770 + atomic_t j_nonzerolen;
22771 + atomic_t j_commit_left;
22773 + /* all commits older than this on disk */
22774 + atomic_t j_older_commits_done;
22776 + struct mutex j_commit_mutex;
22777 + unsigned int j_trans_id;
22778 + time64_t j_timestamp; /* write-only but useful for crash dump analysis */
22779 + struct reiserfs_list_bitmap *j_list_bitmap;
22780 + struct buffer_head *j_commit_bh; /* commit buffer head */
22781 + struct reiserfs_journal_cnode *j_realblock;
22782 + struct reiserfs_journal_cnode *j_freedlist; /* list of buffers that were freed during this trans. free each of these on flush */
22783 + /* time ordered list of all active transactions */
22784 + struct list_head j_list;
22786 + /*
22787 + * time ordered list of all transactions we haven't tried
22788 + * to flush yet
22789 + */
22790 + struct list_head j_working_list;
22792 + /* list of tail conversion targets in need of flush before commit */
22793 + struct list_head j_tail_bh_list;
22795 + /* list of data=ordered buffers in need of flush before commit */
22796 + struct list_head j_bh_list;
22797 + int j_refcount;
22800 +struct reiserfs_journal {
22801 + struct buffer_head **j_ap_blocks; /* journal blocks on disk */
22802 + /* newest journal block */
22803 + struct reiserfs_journal_cnode *j_last;
22805 + /* oldest journal block. start here for traverse */
22806 + struct reiserfs_journal_cnode *j_first;
22808 + struct file *j_bdev_file;
22810 + /* first block on s_dev of reserved area journal */
22811 + int j_1st_reserved_block;
22813 + unsigned long j_state;
22814 + unsigned int j_trans_id;
22815 + unsigned long j_mount_id;
22817 + /* start of current waiting commit (index into j_ap_blocks) */
22818 + unsigned long j_start;
22819 + unsigned long j_len; /* length of current waiting commit */
22821 + /* number of buffers requested by journal_begin() */
22822 + unsigned long j_len_alloc;
22824 + atomic_t j_wcount; /* count of writers for current commit */
22826 + /* batch count. allows turning X transactions into 1 */
22827 + unsigned long j_bcount;
22829 + /* first unflushed transactions offset */
22830 + unsigned long j_first_unflushed_offset;
22832 + /* last fully flushed journal timestamp */
22833 + unsigned j_last_flush_trans_id;
22835 + struct buffer_head *j_header_bh;
22837 + time64_t j_trans_start_time; /* time this transaction started */
22838 + struct mutex j_mutex;
22839 + struct mutex j_flush_mutex;
22841 + /* wait for current transaction to finish before starting new one */
22842 + wait_queue_head_t j_join_wait;
22844 + atomic_t j_jlock; /* lock for j_join_wait */
22845 + int j_list_bitmap_index; /* number of next list bitmap to use */
22847 + /* no more journal begins allowed. MUST sleep on j_join_wait */
22848 + int j_must_wait;
22850 + /* next journal_end will flush all journal list */
22851 + int j_next_full_flush;
22853 + /* next journal_end will flush all async commits */
22854 + int j_next_async_flush;
22856 + int j_cnode_used; /* number of cnodes on the used list */
22857 + int j_cnode_free; /* number of cnodes on the free list */
22859 + /* max number of blocks in a transaction. */
22860 + unsigned int j_trans_max;
22862 + /* max number of blocks to batch into a trans */
22863 + unsigned int j_max_batch;
22865 + /* in seconds, how old can an async commit be */
22866 + unsigned int j_max_commit_age;
22868 + /* in seconds, how old can a transaction be */
22869 + unsigned int j_max_trans_age;
22871 + /* the default for the max commit age */
22872 + unsigned int j_default_max_commit_age;
22874 + struct reiserfs_journal_cnode *j_cnode_free_list;
22876 + /* orig pointer returned from vmalloc */
22877 + struct reiserfs_journal_cnode *j_cnode_free_orig;
22879 + struct reiserfs_journal_list *j_current_jl;
22880 + int j_free_bitmap_nodes;
22881 + int j_used_bitmap_nodes;
22883 + int j_num_lists; /* total number of active transactions */
22884 + int j_num_work_lists; /* number that need attention from kreiserfsd */
22886 + /* debugging to make sure things are flushed in order */
22887 + unsigned int j_last_flush_id;
22889 + /* debugging to make sure things are committed in order */
22890 + unsigned int j_last_commit_id;
22892 + struct list_head j_bitmap_nodes;
22893 + struct list_head j_dirty_buffers;
22894 + spinlock_t j_dirty_buffers_lock; /* protects j_dirty_buffers */
22896 + /* list of all active transactions */
22897 + struct list_head j_journal_list;
22899 + /* lists that haven't been touched by writeback attempts */
22900 + struct list_head j_working_list;
22902 + /* hash table for real buffer heads in current trans */
22903 + struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE];
22905 + /* hash table for all the real buffer heads in all the transactions */
22906 + struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE];
22908 + /* array of bitmaps to record the deleted blocks */
22909 + struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS];
22911 + /* list of inodes which have preallocated blocks */
22912 + struct list_head j_prealloc_list;
22913 + int j_persistent_trans;
22914 + unsigned long j_max_trans_size;
22915 + unsigned long j_max_batch_size;
22917 + int j_errno;
22919 + /* when flushing ordered buffers, throttle new ordered writers */
22920 + struct delayed_work j_work;
22921 + struct super_block *j_work_sb;
22922 + atomic_t j_async_throttle;
22925 +enum journal_state_bits {
22926 + J_WRITERS_BLOCKED = 1, /* set when new writers not allowed */
22927 + J_WRITERS_QUEUED, /* set when log is full due to too many writers */
22928 + J_ABORTED, /* set when log is aborted */
22931 +/* ick. magic string to find desc blocks in the journal */
22932 +#define JOURNAL_DESC_MAGIC "ReIsErLB"
22934 +typedef __u32(*hashf_t) (const signed char *, int);
22936 +struct reiserfs_bitmap_info {
22937 + __u32 free_count;
22940 +struct proc_dir_entry;
22942 +#if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO )
22943 +typedef unsigned long int stat_cnt_t;
22944 +typedef struct reiserfs_proc_info_data {
22945 + spinlock_t lock;
22946 + int exiting;
22947 + int max_hash_collisions;
22949 + stat_cnt_t breads;
22950 + stat_cnt_t bread_miss;
22951 + stat_cnt_t search_by_key;
22952 + stat_cnt_t search_by_key_fs_changed;
22953 + stat_cnt_t search_by_key_restarted;
22955 + stat_cnt_t insert_item_restarted;
22956 + stat_cnt_t paste_into_item_restarted;
22957 + stat_cnt_t cut_from_item_restarted;
22958 + stat_cnt_t delete_solid_item_restarted;
22959 + stat_cnt_t delete_item_restarted;
22961 + stat_cnt_t leaked_oid;
22962 + stat_cnt_t leaves_removable;
22964 + /*
22965 + * balances per level.
22966 + * Use explicit 5 as MAX_HEIGHT is not visible yet.
22967 + */
22968 + stat_cnt_t balance_at[5]; /* XXX */
22969 + /* sbk == search_by_key */
22970 + stat_cnt_t sbk_read_at[5]; /* XXX */
22971 + stat_cnt_t sbk_fs_changed[5];
22972 + stat_cnt_t sbk_restarted[5];
22973 + stat_cnt_t items_at[5]; /* XXX */
22974 + stat_cnt_t free_at[5]; /* XXX */
22975 + stat_cnt_t can_node_be_removed[5]; /* XXX */
22976 + long int lnum[5]; /* XXX */
22977 + long int rnum[5]; /* XXX */
22978 + long int lbytes[5]; /* XXX */
22979 + long int rbytes[5]; /* XXX */
22980 + stat_cnt_t get_neighbors[5];
22981 + stat_cnt_t get_neighbors_restart[5];
22982 + stat_cnt_t need_l_neighbor[5];
22983 + stat_cnt_t need_r_neighbor[5];
22985 + stat_cnt_t free_block;
22986 + struct __scan_bitmap_stats {
22987 + stat_cnt_t call;
22988 + stat_cnt_t wait;
22989 + stat_cnt_t bmap;
22990 + stat_cnt_t retry;
22991 + stat_cnt_t in_journal_hint;
22992 + stat_cnt_t in_journal_nohint;
22993 + stat_cnt_t stolen;
22994 + } scan_bitmap;
22995 + struct __journal_stats {
22996 + stat_cnt_t in_journal;
22997 + stat_cnt_t in_journal_bitmap;
22998 + stat_cnt_t in_journal_reusable;
22999 + stat_cnt_t lock_journal;
23000 + stat_cnt_t lock_journal_wait;
23001 + stat_cnt_t journal_being;
23002 + stat_cnt_t journal_relock_writers;
23003 + stat_cnt_t journal_relock_wcount;
23004 + stat_cnt_t mark_dirty;
23005 + stat_cnt_t mark_dirty_already;
23006 + stat_cnt_t mark_dirty_notjournal;
23007 + stat_cnt_t restore_prepared;
23008 + stat_cnt_t prepare;
23009 + stat_cnt_t prepare_retry;
23010 + } journal;
23011 +} reiserfs_proc_info_data_t;
23012 +#else
23013 +typedef struct reiserfs_proc_info_data {
23014 +} reiserfs_proc_info_data_t;
23015 +#endif
23017 +/* Number of quota types we support */
23018 +#define REISERFS_MAXQUOTAS 2
23020 +/* reiserfs union of in-core super block data */
23021 +struct reiserfs_sb_info {
23022 + /* Buffer containing the super block */
23023 + struct buffer_head *s_sbh;
23025 + /* Pointer to the on-disk super block in the buffer */
23026 + struct reiserfs_super_block *s_rs;
23027 + struct reiserfs_bitmap_info *s_ap_bitmap;
23029 + /* pointer to journal information */
23030 + struct reiserfs_journal *s_journal;
23032 + unsigned short s_mount_state; /* reiserfs state (valid, invalid) */
23034 + /* Serialize writers access, replace the old bkl */
23035 + struct mutex lock;
23037 + /* Owner of the lock (can be recursive) */
23038 + struct task_struct *lock_owner;
23040 + /* Depth of the lock, start from -1 like the bkl */
23041 + int lock_depth;
23043 + struct workqueue_struct *commit_wq;
23045 + /* Comment? -Hans */
23046 + void (*end_io_handler) (struct buffer_head *, int);
23048 + /*
23049 + * pointer to function which is used to sort names in directory.
23050 + * Set on mount
23051 + */
23052 + hashf_t s_hash_function;
23054 + /* reiserfs's mount options are set here */
23055 + unsigned long s_mount_opt;
23057 + /* This is a structure that describes block allocator options */
23058 + struct {
23059 + /* Bitfield for enable/disable kind of options */
23060 + unsigned long bits;
23062 + /*
23063 + * size started from which we consider file
23064 + * to be a large one (in blocks)
23065 + */
23066 + unsigned long large_file_size;
23068 + int border; /* percentage of disk, border takes */
23070 + /*
23071 + * Minimal file size (in blocks) starting
23072 + * from which we do preallocations
23073 + */
23074 + int preallocmin;
23076 + /*
23077 + * Number of blocks we try to prealloc when file
23078 + * reaches preallocmin size (in blocks) or prealloc_list
23079 + * is empty.
23080 + */
23081 + int preallocsize;
23082 + } s_alloc_options;
23084 + /* Comment? -Hans */
23085 + wait_queue_head_t s_wait;
23086 + /* increased by one every time the tree gets re-balanced */
23087 + atomic_t s_generation_counter;
23089 + /* File system properties. Currently holds on-disk FS format */
23090 + unsigned long s_properties;
23092 + /* session statistics */
23093 + int s_disk_reads;
23094 + int s_disk_writes;
23095 + int s_fix_nodes;
23096 + int s_do_balance;
23097 + int s_unneeded_left_neighbor;
23098 + int s_good_search_by_key_reada;
23099 + int s_bmaps;
23100 + int s_bmaps_without_search;
23101 + int s_direct2indirect;
23102 + int s_indirect2direct;
23104 + /*
23105 + * set up when it's ok for reiserfs_read_inode2() to read from
23106 + * disk inode with nlink==0. Currently this is only used during
23107 + * finish_unfinished() processing at mount time
23108 + */
23109 + int s_is_unlinked_ok;
23111 + reiserfs_proc_info_data_t s_proc_info_data;
23112 + struct proc_dir_entry *procdir;
23114 + /* amount of blocks reserved for further allocations */
23115 + int reserved_blocks;
23118 + /* this lock on now only used to protect reserved_blocks variable */
23119 + spinlock_t bitmap_lock;
23120 + struct dentry *priv_root; /* root of /.reiserfs_priv */
23121 + struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */
23122 + int j_errno;
23124 + int work_queued; /* non-zero delayed work is queued */
23125 + struct delayed_work old_work; /* old transactions flush delayed work */
23126 + spinlock_t old_work_lock; /* protects old_work and work_queued */
23128 +#ifdef CONFIG_QUOTA
23129 + char *s_qf_names[REISERFS_MAXQUOTAS];
23130 + int s_jquota_fmt;
23131 +#endif
23132 + char *s_jdev; /* Stored jdev for mount option showing */
23133 +#ifdef CONFIG_REISERFS_CHECK
23135 + /*
23136 + * Detects whether more than one copy of tb exists per superblock
23137 + * as a means of checking whether do_balance is executing
23138 + * concurrently against another tree reader/writer on a same
23139 + * mount point.
23140 + */
23141 + struct tree_balance *cur_tb;
23142 +#endif
23145 +/* Definitions of reiserfs on-disk properties: */
23146 +#define REISERFS_3_5 0
23147 +#define REISERFS_3_6 1
23148 +#define REISERFS_OLD_FORMAT 2
23150 +/* Mount options */
23151 +enum reiserfs_mount_options {
23152 + /* large tails will be created in a session */
23153 + REISERFS_LARGETAIL,
23154 + /*
23155 + * small (for files less than block size) tails will
23156 + * be created in a session
23157 + */
23158 + REISERFS_SMALLTAIL,
23160 + /* replay journal and return 0. Use by fsck */
23161 + REPLAYONLY,
23163 + /*
23164 + * -o conv: causes conversion of old format super block to the
23165 + * new format. If not specified - old partition will be dealt
23166 + * with in a manner of 3.5.x
23167 + */
23168 + REISERFS_CONVERT,
23170 + /*
23171 + * -o hash={tea, rupasov, r5, detect} is meant for properly mounting
23172 + * reiserfs disks from 3.5.19 or earlier. 99% of the time, this
23173 + * option is not required. If the normal autodetection code can't
23174 + * determine which hash to use (because both hashes had the same
23175 + * value for a file), use this option to force a specific hash.
23176 + * It won't allow you to override the existing hash on the FS, so
23177 + * if you have a tea hash disk, and mount with -o hash=rupasov,
23178 + * the mount will fail.
23179 + */
23180 + FORCE_TEA_HASH, /* try to force tea hash on mount */
23181 + FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */
23182 + FORCE_R5_HASH, /* try to force r5 hash on mount */
23183 + FORCE_HASH_DETECT, /* try to detect hash function on mount */
23185 + REISERFS_DATA_LOG,
23186 + REISERFS_DATA_ORDERED,
23187 + REISERFS_DATA_WRITEBACK,
23189 + /*
23190 + * used for testing experimental features; makes benchmarking new
23191 + * features with and without them more convenient; should never be
23192 + * used in any code shipped to users (ideally)
23193 + */
23195 + REISERFS_NO_BORDER,
23196 + REISERFS_NO_UNHASHED_RELOCATION,
23197 + REISERFS_HASHED_RELOCATION,
23198 + REISERFS_ATTRS,
23199 + REISERFS_XATTRS_USER,
23200 + REISERFS_POSIXACL,
23201 + REISERFS_EXPOSE_PRIVROOT,
23202 + REISERFS_BARRIER_NONE,
23203 + REISERFS_BARRIER_FLUSH,
23205 + /* Actions on error */
23206 + REISERFS_ERROR_PANIC,
23207 + REISERFS_ERROR_RO,
23208 + REISERFS_ERROR_CONTINUE,
23210 + REISERFS_USRQUOTA, /* User quota option specified */
23211 + REISERFS_GRPQUOTA, /* Group quota option specified */
23213 + REISERFS_TEST1,
23214 + REISERFS_TEST2,
23215 + REISERFS_TEST3,
23216 + REISERFS_TEST4,
23217 + REISERFS_UNSUPPORTED_OPT,
23220 +#define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH))
23221 +#define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH))
23222 +#define reiserfs_tea_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_TEA_HASH))
23223 +#define reiserfs_hash_detect(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_HASH_DETECT))
23224 +#define reiserfs_no_border(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_BORDER))
23225 +#define reiserfs_no_unhashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION))
23226 +#define reiserfs_hashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_HASHED_RELOCATION))
23227 +#define reiserfs_test4(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_TEST4))
23229 +#define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL))
23230 +#define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL))
23231 +#define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY))
23232 +#define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS))
23233 +#define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5))
23234 +#define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT))
23235 +#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG))
23236 +#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED))
23237 +#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
23238 +#define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER))
23239 +#define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL))
23240 +#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT))
23241 +#define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s))
23242 +#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE))
23243 +#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH))
23245 +#define reiserfs_error_panic(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_PANIC))
23246 +#define reiserfs_error_ro(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_RO))
23248 +void reiserfs_file_buffer(struct buffer_head *bh, int list);
23249 +extern struct file_system_type reiserfs_fs_type;
23250 +int reiserfs_resize(struct super_block *, unsigned long);
23252 +#define CARRY_ON 0
23253 +#define SCHEDULE_OCCURRED 1
23255 +#define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh)
23256 +#define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal)
23257 +#define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
23258 +#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free)
23259 +#define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap)
23261 +#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh)
23263 +#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal)))
23264 +static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal
23265 + *journal)
23267 + return test_bit(J_ABORTED, &journal->j_state);
23271 + * Locking primitives. The write lock is a per superblock
23272 + * special mutex that has properties close to the Big Kernel Lock
23273 + * which was used in the previous locking scheme.
23274 + */
23275 +void reiserfs_write_lock(struct super_block *s);
23276 +void reiserfs_write_unlock(struct super_block *s);
23277 +int __must_check reiserfs_write_unlock_nested(struct super_block *s);
23278 +void reiserfs_write_lock_nested(struct super_block *s, int depth);
23280 +#ifdef CONFIG_REISERFS_CHECK
23281 +void reiserfs_lock_check_recursive(struct super_block *s);
23282 +#else
23283 +static inline void reiserfs_lock_check_recursive(struct super_block *s) { }
23284 +#endif
23287 + * Several mutexes depend on the write lock.
23288 + * However, sometimes we want to relax the write lock while we hold
23289 + * these mutexes, mirroring the release/reacquire-on-schedule()
23290 + * behaviour of the Bkl that was used before.
23291 + * Reiserfs performance and locking were based on this scheme.
23292 + * Now that the write lock is a mutex and not the bkl anymore, doing so
23293 + * may result in a deadlock:
23295 + * A acquire write_lock
23296 + * A acquire j_commit_mutex
23297 + * A release write_lock and wait for something
23298 + * B acquire write_lock
23299 + * B can't acquire j_commit_mutex and sleep
23300 + * A can't acquire write lock anymore
23301 + * deadlock
23303 + * What we do here is avoid such deadlocks by playing the same game
23304 + * as the Bkl: if we can't acquire a mutex that depends on the write lock,
23305 + * we release the write lock, wait a bit and then retry.
23307 + * The mutexes concerned by this hack are:
23308 + * - The commit mutex of a journal list
23309 + * - The flush mutex
23310 + * - The journal lock
23311 + * - The inode mutex
23312 + */
23313 +static inline void reiserfs_mutex_lock_safe(struct mutex *m,
23314 + struct super_block *s)
23315 +{
23316 + int depth;
23318 + depth = reiserfs_write_unlock_nested(s);
23319 + mutex_lock(m);
23320 + reiserfs_write_lock_nested(s, depth);
23321 +}
23323 +static inline void
23324 +reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass,
23325 + struct super_block *s)
23326 +{
23327 + int depth;
23329 + depth = reiserfs_write_unlock_nested(s);
23330 + mutex_lock_nested(m, subclass);
23331 + reiserfs_write_lock_nested(s, depth);
23332 +}
23334 +static inline void
23335 +reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s)
23336 +{
23337 + int depth;
23338 + depth = reiserfs_write_unlock_nested(s);
23339 + down_read(sem);
23340 + reiserfs_write_lock_nested(s, depth);
23341 +}
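+/*
+ * A minimal usage sketch (assuming 'jl' is a struct reiserfs_journal_list;
+ * its j_commit_mutex is one of the mutexes listed above as depending on
+ * the write lock):
+ *
+ *	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
+ *	... do the commit work ...
+ *	mutex_unlock(&jl->j_commit_mutex);
+ *
+ * The helper releases the write lock before sleeping on the mutex, so a
+ * write lock holder can never be blocked behind this caller.
+ */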
23343 +/*
23344 + * When we schedule, we usually want to also release the write lock,
23345 + * matching the previous BKL-based locking scheme of reiserfs.
23346 + */
23347 +static inline void reiserfs_cond_resched(struct super_block *s)
23348 +{
23349 + if (need_resched()) {
23350 + int depth;
23352 + depth = reiserfs_write_unlock_nested(s);
23353 + schedule();
23354 + reiserfs_write_lock_nested(s, depth);
23355 + }
23356 +}
23358 +struct fid;
23360 +/*
23361 + * in reading the #defines, it may help to understand that they employ
23362 + * the following abbreviations:
23364 + * B = Buffer
23365 + * I = Item header
23366 + * H = Height within the tree (should be changed to LEV)
23367 + * N = Number of the item in the node
23368 + * STAT = stat data
23369 + * DEH = Directory Entry Header
23370 + * EC = Entry Count
23371 + * E = Entry number
23372 + * UL = Unsigned Long
23373 + * BLKH = BLocK Header
23374 + * UNFM = UNForMatted node
23375 + * DC = Disk Child
23376 + * P = Path
23378 + * These #defines are named by concatenating these abbreviations,
23379 + * with the arguments coming first and the return value coming last
23380 + * in the macro name.
23381 + */
23383 +#define USE_INODE_GENERATION_COUNTER
23385 +#define REISERFS_PREALLOCATE
23386 +#define DISPLACE_NEW_PACKING_LOCALITIES
23387 +#define PREALLOCATION_SIZE 9
23389 +/* n must be a power of 2 */
23390 +#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u))
23392 +/*
23393 + * to be OK for alpha and others we have to align structures to an
23394 + * 8 byte boundary.
23395 + * FIXME: do not change this to anything else: there is code which relies on it
23396 + */
23397 +#define ROUND_UP(x) _ROUND_UP(x,8LL)
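+/*
+ * Worked example: _ROUND_UP(x,n) adds n-1 and clears the low bits, so
+ * ROUND_UP(1) == 8, ROUND_UP(8) == 8 and ROUND_UP(13) == 16.
+ */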
23399 +/*
23400 + * debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug
23401 + * messages.
23402 + */
23403 +#define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */
23405 +void __reiserfs_warning(struct super_block *s, const char *id,
23406 + const char *func, const char *fmt, ...);
23407 +#define reiserfs_warning(s, id, fmt, args...) \
23408 + __reiserfs_warning(s, id, __func__, fmt, ##args)
23409 +/* assertions handling */
23411 +/* always check a condition and panic if it's false. */
23412 +#define __RASSERT(cond, scond, format, args...) \
23413 +do { \
23414 + if (!(cond)) \
23415 + reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \
23416 + __FILE__ ":%i:%s: " format "\n", \
23417 + __LINE__, __func__ , ##args); \
23418 +} while (0)
23420 +#define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args)
23422 +#if defined( CONFIG_REISERFS_CHECK )
23423 +#define RFALSE(cond, format, args...) __RASSERT(!(cond), "!(" #cond ")", format, ##args)
23424 +#else
23425 +#define RFALSE( cond, format, args... ) do {;} while( 0 )
23426 +#endif
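+/*
+ * Usage sketch (hypothetical check): RASSERT panics whenever its
+ * condition is false; RFALSE panics when its condition is *true* and
+ * compiles away entirely without CONFIG_REISERFS_CHECK, e.g.
+ *
+ *	RFALSE(ih_item_len(ih) > MAX_ITEM_LEN(bh->b_size),
+ *	       "item is too long (%d)", ih_item_len(ih));
+ */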
23428 +#define CONSTF __attribute_const__
23429 +/*
23430 + * Disk Data Structures
23431 + */
23433 +/***************************************************************************
23434 + * SUPER BLOCK *
23435 + ***************************************************************************/
23437 +/*
23438 + * Structure of super block on disk, a version of which in RAM is often
23439 + * accessed as REISERFS_SB(s)->s_rs. The version in RAM is part of a larger
23440 + * structure containing fields never written to disk.
23441 + */
23442 +#define UNSET_HASH 0 /* Detect hash on disk */
23443 +#define TEA_HASH 1
23444 +#define YURA_HASH 2
23445 +#define R5_HASH 3
23446 +#define DEFAULT_HASH R5_HASH
23448 +struct journal_params {
23449 + /* where does the journal start on its device */
23450 + __le32 jp_journal_1st_block;
23452 + /* journal device st_rdev */
23453 + __le32 jp_journal_dev;
23455 + /* size of the journal */
23456 + __le32 jp_journal_size;
23458 + /* max number of blocks in a transaction. */
23459 + __le32 jp_journal_trans_max;
23461 + /*
23462 + * random value made on fs creation
23463 + * (this was sb_journal_block_count)
23464 + */
23465 + __le32 jp_journal_magic;
23467 + /* max number of blocks to batch into a trans */
23468 + __le32 jp_journal_max_batch;
23470 + /* in seconds, how old can an async commit be */
23471 + __le32 jp_journal_max_commit_age;
23473 + /* in seconds, how old can a transaction be */
23474 + __le32 jp_journal_max_trans_age;
23475 +};
23477 +/* this is the super from 3.5.X, where X >= 10 */
23478 +struct reiserfs_super_block_v1 {
23479 + __le32 s_block_count; /* blocks count */
23480 + __le32 s_free_blocks; /* free blocks count */
23481 + __le32 s_root_block; /* root block number */
23482 + struct journal_params s_journal;
23483 + __le16 s_blocksize; /* block size */
23485 + /* max size of object id array, see get_objectid() commentary */
23486 + __le16 s_oid_maxsize;
23487 + __le16 s_oid_cursize; /* current size of object id array */
23489 + /* set to 1 when the filesystem was cleanly unmounted, to 2 when it was not */
23490 + __le16 s_umount_state;
23492 + /*
23493 + * reiserfs magic string indicates that file system is reiserfs:
23494 + * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs"
23495 + */
23496 + char s_magic[10];
23498 + /*
23499 + * it is used by fsck to mark which
23500 + * phase of rebuilding is done
23501 + */
23502 + __le16 s_fs_state;
23503 + /*
23504 + * indicates what hash function is being used
23505 + * to sort names in a directory
23506 + */
23507 + __le32 s_hash_function_code;
23508 + __le16 s_tree_height; /* height of disk tree */
23510 + /*
23511 + * number of bitmap blocks needed to address
23512 + * each block of the file system
23513 + */
23514 + __le16 s_bmap_nr;
23516 + /*
23517 + * this field is only reliable on filesystems with a non-standard journal
23518 + */
23519 + __le16 s_version;
23521 + /*
23522 + * size in blocks of the journal area on the main device, which we need
23523 + * to keep after making a fs with a non-standard journal
23524 + */
23525 + __le16 s_reserved_for_journal;
23526 +} __attribute__ ((__packed__));
23528 +#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1))
23530 +/* this is the on disk super block */
23531 +struct reiserfs_super_block {
23532 + struct reiserfs_super_block_v1 s_v1;
23533 + __le32 s_inode_generation;
23535 + /* Right now used only by inode-attributes, if enabled */
23536 + __le32 s_flags;
23538 + unsigned char s_uuid[16]; /* filesystem unique identifier */
23539 + unsigned char s_label[16]; /* filesystem volume label */
23540 + __le16 s_mnt_count; /* Count of mounts since last fsck */
23541 + __le16 s_max_mnt_count; /* Maximum mounts before check */
23542 + __le32 s_lastcheck; /* Timestamp of last fsck */
23543 + __le32 s_check_interval; /* Interval between checks */
23545 + /*
23546 + * zero filled by mkreiserfs and reiserfs_convert_objectid_map_v1()
23547 + * so any additions must be updated there as well. */
23548 + char s_unused[76];
23549 +} __attribute__ ((__packed__));
23551 +#define SB_SIZE (sizeof(struct reiserfs_super_block))
23553 +#define REISERFS_VERSION_1 0
23554 +#define REISERFS_VERSION_2 2
23556 +/* on-disk super block fields converted to cpu form */
23557 +#define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs)
23558 +#define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1))
23559 +#define SB_BLOCKSIZE(s) \
23560 + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_blocksize))
23561 +#define SB_BLOCK_COUNT(s) \
23562 + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_block_count))
23563 +#define SB_FREE_BLOCKS(s) \
23564 + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks))
23565 +#define SB_REISERFS_MAGIC(s) \
23566 + (SB_V1_DISK_SUPER_BLOCK(s)->s_magic)
23567 +#define SB_ROOT_BLOCK(s) \
23568 + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_root_block))
23569 +#define SB_TREE_HEIGHT(s) \
23570 + le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height))
23571 +#define SB_REISERFS_STATE(s) \
23572 + le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state))
23573 +#define SB_VERSION(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_version))
23574 +#define SB_BMAP_NR(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr))
23576 +#define PUT_SB_BLOCK_COUNT(s, val) \
23577 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_block_count = cpu_to_le32(val); } while (0)
23578 +#define PUT_SB_FREE_BLOCKS(s, val) \
23579 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks = cpu_to_le32(val); } while (0)
23580 +#define PUT_SB_ROOT_BLOCK(s, val) \
23581 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_root_block = cpu_to_le32(val); } while (0)
23582 +#define PUT_SB_TREE_HEIGHT(s, val) \
23583 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height = cpu_to_le16(val); } while (0)
23584 +#define PUT_SB_REISERFS_STATE(s, val) \
23585 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state = cpu_to_le16(val); } while (0)
23586 +#define PUT_SB_VERSION(s, val) \
23587 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_version = cpu_to_le16(val); } while (0)
23588 +#define PUT_SB_BMAP_NR(s, val) \
23589 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr = cpu_to_le16 (val); } while (0)
23591 +#define SB_ONDISK_JP(s) (&SB_V1_DISK_SUPER_BLOCK(s)->s_journal)
23592 +#define SB_ONDISK_JOURNAL_SIZE(s) \
23593 + le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_size))
23594 +#define SB_ONDISK_JOURNAL_1st_BLOCK(s) \
23595 + le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_1st_block))
23596 +#define SB_ONDISK_JOURNAL_DEVICE(s) \
23597 + le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_dev))
23598 +#define SB_ONDISK_RESERVED_FOR_JOURNAL(s) \
23599 + le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_reserved_for_journal))
23601 +#define is_block_in_log_or_reserved_area(s, block) \
23602 + ((block) >= SB_JOURNAL_1st_RESERVED_BLOCK(s) \
23603 + && (block) < SB_JOURNAL_1st_RESERVED_BLOCK(s) + \
23604 + ((!is_reiserfs_jr(SB_DISK_SUPER_BLOCK(s)) ? \
23605 + SB_ONDISK_JOURNAL_SIZE(s) + 1 : SB_ONDISK_RESERVED_FOR_JOURNAL(s))))
23607 +int is_reiserfs_3_5(struct reiserfs_super_block *rs);
23608 +int is_reiserfs_3_6(struct reiserfs_super_block *rs);
23609 +int is_reiserfs_jr(struct reiserfs_super_block *rs);
23611 +/*
23612 + * ReiserFS leaves the first 64k unused, so that partition labels have
23613 + * enough space. If someone wants to write a fancy bootloader that
23614 + * needs more than 64k, let us know, and this will be increased in size.
23615 + * This number must be larger than the largest block size on any
23616 + * platform, or code will break. -Hans
23617 + */
23618 +#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024)
23619 +#define REISERFS_FIRST_BLOCK unused_define
23620 +#define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES
23622 +/* the spot for the super in versions 3.5 - 3.5.10 (inclusive) */
23623 +#define REISERFS_OLD_DISK_OFFSET_IN_BYTES (8 * 1024)
23625 +/* reiserfs internal error code (used by search_by_key and fix_nodes) */
23626 +#define CARRY_ON 0
23627 +#define REPEAT_SEARCH -1
23628 +#define IO_ERROR -2
23629 +#define NO_DISK_SPACE -3
23630 +#define NO_BALANCING_NEEDED (-4)
23631 +#define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5)
23632 +#define QUOTA_EXCEEDED -6
23634 +typedef __u32 b_blocknr_t;
23635 +typedef __le32 unp_t;
23637 +struct unfm_nodeinfo {
23638 + unp_t unfm_nodenum;
23639 + unsigned short unfm_freespace;
23640 +};
23642 +/* there are two formats of keys: 3.5 and 3.6 */
23643 +#define KEY_FORMAT_3_5 0
23644 +#define KEY_FORMAT_3_6 1
23646 +/* there are two stat data formats */
23647 +#define STAT_DATA_V1 0
23648 +#define STAT_DATA_V2 1
23650 +static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode)
23651 +{
23652 + return container_of(inode, struct reiserfs_inode_info, vfs_inode);
23653 +}
23655 +static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb)
23656 +{
23657 + return sb->s_fs_info;
23658 +}
23660 +/*
23661 + * Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16
23662 + * which overflows on large file systems.
23663 + */
23664 +static inline __u32 reiserfs_bmap_count(struct super_block *sb)
23665 +{
23666 + return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1;
23667 +}
23669 +static inline int bmap_would_wrap(unsigned bmap_nr)
23670 +{
23671 + return bmap_nr > ((1LL << 16) - 1);
23672 +}
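+/*
+ * Worked example (assuming 4 KiB blocks): one bitmap block maps
+ * 4096 * 8 = 32768 blocks, so a filesystem of 2^31 blocks (8 TiB)
+ * needs reiserfs_bmap_count() == 65536 bitmap blocks -- exactly the
+ * point at which the on-disk u16 s_bmap_nr would have wrapped to 0.
+ */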
23674 +extern const struct xattr_handler * const reiserfs_xattr_handlers[];
23676 +/*
23677 + * this gives the key version of all the items (except the stat data)
23678 + * that the object consists of
23679 + */
23680 +#define get_inode_item_key_version( inode ) \
23681 + ((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? KEY_FORMAT_3_6 : KEY_FORMAT_3_5)
23683 +#define set_inode_item_key_version( inode, version ) \
23684 + ({ if((version)==KEY_FORMAT_3_6) \
23685 + REISERFS_I(inode)->i_flags |= i_item_key_version_mask; \
23686 + else \
23687 + REISERFS_I(inode)->i_flags &= ~i_item_key_version_mask; })
23689 +#define get_inode_sd_version(inode) \
23690 + ((REISERFS_I(inode)->i_flags & i_stat_data_version_mask) ? STAT_DATA_V2 : STAT_DATA_V1)
23692 +#define set_inode_sd_version(inode, version) \
23693 + ({ if((version)==STAT_DATA_V2) \
23694 + REISERFS_I(inode)->i_flags |= i_stat_data_version_mask; \
23695 + else \
23696 + REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; })
23698 +/*
23699 + * This is an aggressive tail suppression policy, I am hoping it
23700 + * improves our benchmarks. The principle behind it is that percentage
23701 + * space saving is what matters, not absolute space saving. This is
23702 + * non-intuitive, but it helps to understand it if you consider that the
23703 + * cost to access 4 blocks is not much more than the cost to access 1
23704 + * block, if you have to do a seek and rotate. A tail risks a
23705 + * non-linear disk access that is significant as a percentage of total
23706 + * time cost for a 4 block file and saves an amount of space that is
23707 + * less significant as a percentage of space, or so goes the hypothesis.
23708 + * -Hans
23709 + */
23710 +#define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \
23711 +( \
23712 + (!(n_tail_size)) || \
23713 + (((n_tail_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) || \
23714 + ( (n_file_size) >= (n_block_size) * 4 ) || \
23715 + ( ( (n_file_size) >= (n_block_size) * 3 ) && \
23716 + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/4) ) || \
23717 + ( ( (n_file_size) >= (n_block_size) * 2 ) && \
23718 + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/2) ) || \
23719 + ( ( (n_file_size) >= (n_block_size) ) && \
23720 + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \
23721 +)
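+/*
+ * Illustration (assuming 4 KiB blocks): a file of 4 blocks or more
+ * never keeps its tail in a direct item, a file under one block keeps
+ * it unless the tail alone exceeds MAX_DIRECT_ITEM_LEN(4096), and in
+ * between the acceptable tail shrinks as the file grows, per the
+ * percentage argument above.
+ */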
23723 +/*
23724 + * Another strategy for tails: only create a tail if the whole file
23725 + * would fit into one DIRECT item.
23726 + * The primary intention here is to increase performance by decreasing
23727 + * seeking.
23728 + */
23729 +#define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \
23730 +( \
23731 + (!(n_tail_size)) || \
23732 + (((n_file_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) ) \
23733 +)
23735 +/*
23736 + * values for s_umount_state field
23737 + */
23738 +#define REISERFS_VALID_FS 1
23739 +#define REISERFS_ERROR_FS 2
23741 +/*
23742 + * there are 5 item types currently
23743 + */
23744 +#define TYPE_STAT_DATA 0
23745 +#define TYPE_INDIRECT 1
23746 +#define TYPE_DIRECT 2
23747 +#define TYPE_DIRENTRY 3
23748 +#define TYPE_MAXTYPE 3
23749 +#define TYPE_ANY 15 /* FIXME: comment is required */
23751 +/***************************************************************************
23752 + * KEY & ITEM HEAD *
23753 + ***************************************************************************/
23755 +/* directories use this key as well as old files */
23756 +struct offset_v1 {
23757 + __le32 k_offset;
23758 + __le32 k_uniqueness;
23759 +} __attribute__ ((__packed__));
23761 +struct offset_v2 {
23762 + __le64 v;
23763 +} __attribute__ ((__packed__));
23765 +static inline __u16 offset_v2_k_type(const struct offset_v2 *v2)
23766 +{
23767 + __u8 type = le64_to_cpu(v2->v) >> 60;
23768 + return (type <= TYPE_MAXTYPE) ? type : TYPE_ANY;
23769 +}
23771 +static inline void set_offset_v2_k_type(struct offset_v2 *v2, int type)
23772 +{
23773 + v2->v =
23774 + (v2->v & cpu_to_le64(~0ULL >> 4)) | cpu_to_le64((__u64) type << 60);
23775 +}
23777 +static inline loff_t offset_v2_k_offset(const struct offset_v2 *v2)
23778 +{
23779 + return le64_to_cpu(v2->v) & (~0ULL >> 4);
23780 +}
23782 +static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset)
23783 +{
23784 + offset &= (~0ULL >> 4);
23785 + v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset);
23786 +}
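+/*
+ * Packing sketch: the type lives in the top 4 bits of the __le64 and
+ * the offset in the low 60 bits, so after
+ *
+ *	struct offset_v2 v2 = { .v = 0 };
+ *	set_offset_v2_k_type(&v2, TYPE_INDIRECT);
+ *	set_offset_v2_k_offset(&v2, 4096);
+ *
+ * offset_v2_k_type(&v2) yields TYPE_INDIRECT and
+ * offset_v2_k_offset(&v2) yields 4096.
+ */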
23788 +/*
23789 + * Key of an item determines its location in the S+tree, and
23790 + * is composed of 4 components
23791 + */
23792 +struct reiserfs_key {
23793 + /* packing locality: by default parent directory object id */
23794 + __le32 k_dir_id;
23796 + __le32 k_objectid; /* object identifier */
23797 + union {
23798 + struct offset_v1 k_offset_v1;
23799 + struct offset_v2 k_offset_v2;
23800 + } __attribute__ ((__packed__)) u;
23801 +} __attribute__ ((__packed__));
23803 +struct in_core_key {
23804 + /* packing locality: by default parent directory object id */
23805 + __u32 k_dir_id;
23806 + __u32 k_objectid; /* object identifier */
23807 + __u64 k_offset;
23808 + __u8 k_type;
23809 +};
23811 +struct cpu_key {
23812 + struct in_core_key on_disk_key;
23813 + int version;
23814 + /* 3 in all cases but direct2indirect and indirect2direct conversion */
23815 + int key_length;
23816 +};
23818 +/*
23819 + * Our function for comparing keys can compare keys of different
23820 + * lengths. It takes as a parameter the length of the keys it is to
23821 + * compare. These defines are used in determining what is to be passed
23822 + * to it as that parameter.
23823 + */
23824 +#define REISERFS_FULL_KEY_LEN 4
23825 +#define REISERFS_SHORT_KEY_LEN 2
23827 +/* The result of the key compare */
23828 +#define FIRST_GREATER 1
23829 +#define SECOND_GREATER -1
23830 +#define KEYS_IDENTICAL 0
23831 +#define KEY_FOUND 1
23832 +#define KEY_NOT_FOUND 0
23834 +#define KEY_SIZE (sizeof(struct reiserfs_key))
23836 +/* return values for search_by_key and clones */
23837 +#define ITEM_FOUND 1
23838 +#define ITEM_NOT_FOUND 0
23839 +#define ENTRY_FOUND 1
23840 +#define ENTRY_NOT_FOUND 0
23841 +#define DIRECTORY_NOT_FOUND -1
23842 +#define REGULAR_FILE_FOUND -2
23843 +#define DIRECTORY_FOUND -3
23844 +#define BYTE_FOUND 1
23845 +#define BYTE_NOT_FOUND 0
23846 +#define FILE_NOT_FOUND -1
23848 +#define POSITION_FOUND 1
23849 +#define POSITION_NOT_FOUND 0
23851 +/* return values for reiserfs_find_entry and search_by_entry_key */
23852 +#define NAME_FOUND 1
23853 +#define NAME_NOT_FOUND 0
23854 +#define GOTO_PREVIOUS_ITEM 2
23855 +#define NAME_FOUND_INVISIBLE 3
23857 +/*
23858 + * Everything in the filesystem is stored as a set of items. The
23859 + * item head contains the key of the item, its free space (for
23860 + * indirect items) and specifies the location of the item itself
23861 + * within the block.
23862 + */
23864 +struct item_head {
23865 + /*
23866 + * Everything in the tree is found by searching for it based on
23867 + * its key.
23868 + */
23869 + struct reiserfs_key ih_key;
23870 + union {
23871 + /*
23872 + * The free space in the last unformatted node of an
23873 + * indirect item if this is an indirect item. This
23874 + * equals 0xFFFF iff this is a direct item or stat data
23875 + * item. Note that the key, not this field, is used to
23876 + * determine the item type, and thus which field this
23877 + * union contains.
23878 + */
23879 + __le16 ih_free_space_reserved;
23881 + /*
23882 + * Iff this is a directory item, this field equals the
23883 + * number of directory entries in the directory item.
23884 + */
23885 + __le16 ih_entry_count;
23886 + } __attribute__ ((__packed__)) u;
23887 + __le16 ih_item_len; /* total size of the item body */
23889 + /* an offset to the item body within the block */
23890 + __le16 ih_item_location;
23892 + /*
23893 + * 0 for all old items, 2 for new ones. Highest bit is set by fsck
23894 + * temporarily, and cleared once it is done
23895 + */
23896 + __le16 ih_version;
23897 +} __attribute__ ((__packed__));
23898 +/* size of item header */
23899 +#define IH_SIZE (sizeof(struct item_head))
23901 +#define ih_free_space(ih) le16_to_cpu((ih)->u.ih_free_space_reserved)
23902 +#define ih_version(ih) le16_to_cpu((ih)->ih_version)
23903 +#define ih_entry_count(ih) le16_to_cpu((ih)->u.ih_entry_count)
23904 +#define ih_location(ih) le16_to_cpu((ih)->ih_item_location)
23905 +#define ih_item_len(ih) le16_to_cpu((ih)->ih_item_len)
23907 +#define put_ih_free_space(ih, val) do { (ih)->u.ih_free_space_reserved = cpu_to_le16(val); } while(0)
23908 +#define put_ih_version(ih, val) do { (ih)->ih_version = cpu_to_le16(val); } while (0)
23909 +#define put_ih_entry_count(ih, val) do { (ih)->u.ih_entry_count = cpu_to_le16(val); } while (0)
23910 +#define put_ih_location(ih, val) do { (ih)->ih_item_location = cpu_to_le16(val); } while (0)
23911 +#define put_ih_item_len(ih, val) do { (ih)->ih_item_len = cpu_to_le16(val); } while (0)
23913 +#define unreachable_item(ih) (ih_version(ih) & (1 << 15))
23915 +#define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih))
23916 +#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val)))
23918 +/*
23919 + * these operate on indirect items, where you've got an array of ints
23920 + * at a possibly unaligned location. These are a noop on ia32
23922 + * p is the array of __u32, i is the index into the array, v is the value
23923 + * to store there.
23924 + */
23925 +#define get_block_num(p, i) get_unaligned_le32((p) + (i))
23926 +#define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i))
23928 +/* in the old version the uniqueness field shows the key type */
23929 +#define V1_SD_UNIQUENESS 0
23930 +#define V1_INDIRECT_UNIQUENESS 0xfffffffe
23931 +#define V1_DIRECT_UNIQUENESS 0xffffffff
23932 +#define V1_DIRENTRY_UNIQUENESS 500
23933 +#define V1_ANY_UNIQUENESS 555 /* FIXME: comment is required */
23935 +/* here are conversion routines */
23936 +static inline int uniqueness2type(__u32 uniqueness) CONSTF;
23937 +static inline int uniqueness2type(__u32 uniqueness)
23938 +{
23939 + switch ((int)uniqueness) {
23940 + case V1_SD_UNIQUENESS:
23941 + return TYPE_STAT_DATA;
23942 + case V1_INDIRECT_UNIQUENESS:
23943 + return TYPE_INDIRECT;
23944 + case V1_DIRECT_UNIQUENESS:
23945 + return TYPE_DIRECT;
23946 + case V1_DIRENTRY_UNIQUENESS:
23947 + return TYPE_DIRENTRY;
23948 + case V1_ANY_UNIQUENESS:
23949 + default:
23950 + return TYPE_ANY;
23951 + }
23952 +}
23954 +static inline __u32 type2uniqueness(int type) CONSTF;
23955 +static inline __u32 type2uniqueness(int type)
23956 +{
23957 + switch (type) {
23958 + case TYPE_STAT_DATA:
23959 + return V1_SD_UNIQUENESS;
23960 + case TYPE_INDIRECT:
23961 + return V1_INDIRECT_UNIQUENESS;
23962 + case TYPE_DIRECT:
23963 + return V1_DIRECT_UNIQUENESS;
23964 + case TYPE_DIRENTRY:
23965 + return V1_DIRENTRY_UNIQUENESS;
23966 + case TYPE_ANY:
23967 + default:
23968 + return V1_ANY_UNIQUENESS;
23969 + }
23970 +}
23972 +/*
23973 + * key is a pointer to the on-disk key, which is stored in le; the result is cpu.
23974 + * there is no way to get the version of an object from its key, so the
23975 + * version must be provided to these helpers
23976 + */
23977 +static inline loff_t le_key_k_offset(int version,
23978 + const struct reiserfs_key *key)
23979 +{
23980 + return (version == KEY_FORMAT_3_5) ?
23981 + le32_to_cpu(key->u.k_offset_v1.k_offset) :
23982 + offset_v2_k_offset(&(key->u.k_offset_v2));
23983 +}
23985 +static inline loff_t le_ih_k_offset(const struct item_head *ih)
23986 +{
23987 + return le_key_k_offset(ih_version(ih), &(ih->ih_key));
23988 +}
23990 +static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key)
23991 +{
23992 + if (version == KEY_FORMAT_3_5) {
23993 + loff_t val = le32_to_cpu(key->u.k_offset_v1.k_uniqueness);
23994 + return uniqueness2type(val);
23995 + } else
23996 + return offset_v2_k_type(&(key->u.k_offset_v2));
23997 +}
23999 +static inline loff_t le_ih_k_type(const struct item_head *ih)
24000 +{
24001 + return le_key_k_type(ih_version(ih), &(ih->ih_key));
24002 +}
24004 +static inline void set_le_key_k_offset(int version, struct reiserfs_key *key,
24005 + loff_t offset)
24006 +{
24007 + if (version == KEY_FORMAT_3_5)
24008 + key->u.k_offset_v1.k_offset = cpu_to_le32(offset);
24009 + else
24010 + set_offset_v2_k_offset(&key->u.k_offset_v2, offset);
24011 +}
24013 +static inline void add_le_key_k_offset(int version, struct reiserfs_key *key,
24014 + loff_t offset)
24015 +{
24016 + set_le_key_k_offset(version, key,
24017 + le_key_k_offset(version, key) + offset);
24018 +}
24020 +static inline void add_le_ih_k_offset(struct item_head *ih, loff_t offset)
24021 +{
24022 + add_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
24023 +}
24025 +static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset)
24026 +{
24027 + set_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
24028 +}
24030 +static inline void set_le_key_k_type(int version, struct reiserfs_key *key,
24031 + int type)
24032 +{
24033 + if (version == KEY_FORMAT_3_5) {
24034 + type = type2uniqueness(type);
24035 + key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type);
24036 + } else
24037 + set_offset_v2_k_type(&key->u.k_offset_v2, type);
24038 +}
24040 +static inline void set_le_ih_k_type(struct item_head *ih, int type)
24041 +{
24042 + set_le_key_k_type(ih_version(ih), &(ih->ih_key), type);
24043 +}
24045 +static inline int is_direntry_le_key(int version, struct reiserfs_key *key)
24046 +{
24047 + return le_key_k_type(version, key) == TYPE_DIRENTRY;
24048 +}
24050 +static inline int is_direct_le_key(int version, struct reiserfs_key *key)
24051 +{
24052 + return le_key_k_type(version, key) == TYPE_DIRECT;
24053 +}
24055 +static inline int is_indirect_le_key(int version, struct reiserfs_key *key)
24056 +{
24057 + return le_key_k_type(version, key) == TYPE_INDIRECT;
24058 +}
24060 +static inline int is_statdata_le_key(int version, struct reiserfs_key *key)
24061 +{
24062 + return le_key_k_type(version, key) == TYPE_STAT_DATA;
24063 +}
24065 +/* item header has version. */
24066 +static inline int is_direntry_le_ih(struct item_head *ih)
24067 +{
24068 + return is_direntry_le_key(ih_version(ih), &ih->ih_key);
24069 +}
24071 +static inline int is_direct_le_ih(struct item_head *ih)
24072 +{
24073 + return is_direct_le_key(ih_version(ih), &ih->ih_key);
24074 +}
24076 +static inline int is_indirect_le_ih(struct item_head *ih)
24077 +{
24078 + return is_indirect_le_key(ih_version(ih), &ih->ih_key);
24079 +}
24081 +static inline int is_statdata_le_ih(struct item_head *ih)
24082 +{
24083 + return is_statdata_le_key(ih_version(ih), &ih->ih_key);
24084 +}
24086 +/* key is pointer to cpu key, result is cpu */
24087 +static inline loff_t cpu_key_k_offset(const struct cpu_key *key)
24088 +{
24089 + return key->on_disk_key.k_offset;
24090 +}
24092 +static inline loff_t cpu_key_k_type(const struct cpu_key *key)
24093 +{
24094 + return key->on_disk_key.k_type;
24095 +}
24097 +static inline void set_cpu_key_k_offset(struct cpu_key *key, loff_t offset)
24098 +{
24099 + key->on_disk_key.k_offset = offset;
24100 +}
24102 +static inline void set_cpu_key_k_type(struct cpu_key *key, int type)
24103 +{
24104 + key->on_disk_key.k_type = type;
24105 +}
24107 +static inline void cpu_key_k_offset_dec(struct cpu_key *key)
24108 +{
24109 + key->on_disk_key.k_offset--;
24110 +}
24112 +#define is_direntry_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRENTRY)
24113 +#define is_direct_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRECT)
24114 +#define is_indirect_cpu_key(key) (cpu_key_k_type (key) == TYPE_INDIRECT)
24115 +#define is_statdata_cpu_key(key) (cpu_key_k_type (key) == TYPE_STAT_DATA)
24117 +/* are these used ? */
24118 +#define is_direntry_cpu_ih(ih) (is_direntry_cpu_key (&((ih)->ih_key)))
24119 +#define is_direct_cpu_ih(ih) (is_direct_cpu_key (&((ih)->ih_key)))
24120 +#define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key)))
24121 +#define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key)))
24123 +#define I_K_KEY_IN_ITEM(ih, key, n_blocksize) \
24124 + (!COMP_SHORT_KEYS(ih, key) && \
24125 + I_OFF_BYTE_IN_ITEM(ih, k_offset(key), n_blocksize))
24127 +/* maximal length of item */
24128 +#define MAX_ITEM_LEN(block_size) (block_size - BLKH_SIZE - IH_SIZE)
24129 +#define MIN_ITEM_LEN 1
24131 +/* object identifier for root dir */
24132 +#define REISERFS_ROOT_OBJECTID 2
24133 +#define REISERFS_ROOT_PARENT_OBJECTID 1
24135 +extern struct reiserfs_key root_key;
24137 +/*
24138 + * Picture represents a leaf of the S+tree
24139 + * ______________________________________________________
24140 + * | | Array of | | |
24141 + * |Block | Object-Item | F r e e | Objects- |
24142 + * | head | Headers | S p a c e | Items |
24143 + * |______|_______________|___________________|___________|
24144 + */
24146 +/*
24147 + * Header of a disk block. More precisely, header of a formatted leaf
24148 + * or internal node, and not the header of an unformatted node.
24149 + */
24150 +struct block_head {
24151 + __le16 blk_level; /* Level of a block in the tree. */
24152 + __le16 blk_nr_item; /* Number of keys/items in a block. */
24153 + __le16 blk_free_space; /* Block free space in bytes. */
24154 + __le16 blk_reserved;
24155 + /* dump this in v4/planA */
24157 + /* kept only for compatibility */
24158 + struct reiserfs_key blk_right_delim_key;
24159 +};
24161 +#define BLKH_SIZE (sizeof(struct block_head))
24162 +#define blkh_level(p_blkh) (le16_to_cpu((p_blkh)->blk_level))
24163 +#define blkh_nr_item(p_blkh) (le16_to_cpu((p_blkh)->blk_nr_item))
24164 +#define blkh_free_space(p_blkh) (le16_to_cpu((p_blkh)->blk_free_space))
24165 +#define blkh_reserved(p_blkh) (le16_to_cpu((p_blkh)->blk_reserved))
24166 +#define set_blkh_level(p_blkh,val) ((p_blkh)->blk_level = cpu_to_le16(val))
24167 +#define set_blkh_nr_item(p_blkh,val) ((p_blkh)->blk_nr_item = cpu_to_le16(val))
24168 +#define set_blkh_free_space(p_blkh,val) ((p_blkh)->blk_free_space = cpu_to_le16(val))
24169 +#define set_blkh_reserved(p_blkh,val) ((p_blkh)->blk_reserved = cpu_to_le16(val))
24170 +#define blkh_right_delim_key(p_blkh) ((p_blkh)->blk_right_delim_key)
24171 +#define set_blkh_right_delim_key(p_blkh,val) ((p_blkh)->blk_right_delim_key = val)
24173 +/* values for blk_level field of the struct block_head */
24175 +/*
24176 + * When a node is removed from the tree, its blk_level is set to FREE_LEVEL.
24177 + * It is then used to see whether the node is still in the tree
24178 + */
24179 +#define FREE_LEVEL 0
24181 +#define DISK_LEAF_NODE_LEVEL 1 /* Leaf node level. */
24183 +/*
24184 + * Given the buffer head of a formatted node, resolve to the
24185 + * block head of that node.
24186 + */
24187 +#define B_BLK_HEAD(bh) ((struct block_head *)((bh)->b_data))
24188 +/* Number of items that are in buffer. */
24189 +#define B_NR_ITEMS(bh) (blkh_nr_item(B_BLK_HEAD(bh)))
24190 +#define B_LEVEL(bh) (blkh_level(B_BLK_HEAD(bh)))
24191 +#define B_FREE_SPACE(bh) (blkh_free_space(B_BLK_HEAD(bh)))
24193 +#define PUT_B_NR_ITEMS(bh, val) do { set_blkh_nr_item(B_BLK_HEAD(bh), val); } while (0)
24194 +#define PUT_B_LEVEL(bh, val) do { set_blkh_level(B_BLK_HEAD(bh), val); } while (0)
24195 +#define PUT_B_FREE_SPACE(bh, val) do { set_blkh_free_space(B_BLK_HEAD(bh), val); } while (0)
24197 +/* Get right delimiting key. -- little endian */
24198 +#define B_PRIGHT_DELIM_KEY(bh) (&(blkh_right_delim_key(B_BLK_HEAD(bh))))
24200 +/* Does the buffer contain a disk leaf. */
24201 +#define B_IS_ITEMS_LEVEL(bh) (B_LEVEL(bh) == DISK_LEAF_NODE_LEVEL)
24203 +/* Does the buffer contain a disk internal node */
24204 +#define B_IS_KEYS_LEVEL(bh) (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \
24205 + && B_LEVEL(bh) <= MAX_HEIGHT)
24207 +/***************************************************************************
24208 + * STAT DATA *
24209 + ***************************************************************************/
24211 +/*
24212 + * old stat data is 32 bytes long. We are going to distinguish the new
24213 + * one by a different size
24214 + */
24215 +struct stat_data_v1 {
24216 + __le16 sd_mode; /* file type, permissions */
24217 + __le16 sd_nlink; /* number of hard links */
24218 + __le16 sd_uid; /* owner */
24219 + __le16 sd_gid; /* group */
24220 + __le32 sd_size; /* file size */
24221 + __le32 sd_atime; /* time of last access */
24222 + __le32 sd_mtime; /* time file was last modified */
24224 + /*
24225 + * time inode (stat data) was last changed
24226 + * (except changes to sd_atime and sd_mtime)
24227 + */
24228 + __le32 sd_ctime;
24229 + union {
24230 + __le32 sd_rdev;
24231 + __le32 sd_blocks; /* number of blocks file uses */
24232 + } __attribute__ ((__packed__)) u;
24234 + /*
24235 + * first byte of file which is stored in a direct item: except that if
24236 + * it equals 1 it is a symlink and if it equals ~(__u32)0 there is no
24237 + * direct item. The existence of this field really grates on me.
24238 + * Let's replace it with a macro based on sd_size and our tail
24239 + * suppression policy. Someday. -Hans
24240 + */
24241 + __le32 sd_first_direct_byte;
24242 +} __attribute__ ((__packed__));
24244 +#define SD_V1_SIZE (sizeof(struct stat_data_v1))
24245 +#define stat_data_v1(ih) (ih_version (ih) == KEY_FORMAT_3_5)
24246 +#define sd_v1_mode(sdp) (le16_to_cpu((sdp)->sd_mode))
24247 +#define set_sd_v1_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v))
24248 +#define sd_v1_nlink(sdp) (le16_to_cpu((sdp)->sd_nlink))
24249 +#define set_sd_v1_nlink(sdp,v) ((sdp)->sd_nlink = cpu_to_le16(v))
24250 +#define sd_v1_uid(sdp) (le16_to_cpu((sdp)->sd_uid))
24251 +#define set_sd_v1_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le16(v))
24252 +#define sd_v1_gid(sdp) (le16_to_cpu((sdp)->sd_gid))
24253 +#define set_sd_v1_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le16(v))
24254 +#define sd_v1_size(sdp) (le32_to_cpu((sdp)->sd_size))
24255 +#define set_sd_v1_size(sdp,v) ((sdp)->sd_size = cpu_to_le32(v))
24256 +#define sd_v1_atime(sdp) (le32_to_cpu((sdp)->sd_atime))
24257 +#define set_sd_v1_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v))
24258 +#define sd_v1_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime))
24259 +#define set_sd_v1_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v))
24260 +#define sd_v1_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime))
24261 +#define set_sd_v1_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v))
24262 +#define sd_v1_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev))
24263 +#define set_sd_v1_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v))
24264 +#define sd_v1_blocks(sdp) (le32_to_cpu((sdp)->u.sd_blocks))
24265 +#define set_sd_v1_blocks(sdp,v) ((sdp)->u.sd_blocks = cpu_to_le32(v))
24266 +#define sd_v1_first_direct_byte(sdp) \
24267 + (le32_to_cpu((sdp)->sd_first_direct_byte))
24268 +#define set_sd_v1_first_direct_byte(sdp,v) \
24269 + ((sdp)->sd_first_direct_byte = cpu_to_le32(v))
24271 +/* inode flags stored in sd_attrs (nee sd_reserved) */
24273 +/*
24274 + * we want common flags to have the same values as in ext2,
24275 + * so chattr(1) will work without problems
24276 + */
24277 +#define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL
24278 +#define REISERFS_APPEND_FL FS_APPEND_FL
24279 +#define REISERFS_SYNC_FL FS_SYNC_FL
24280 +#define REISERFS_NOATIME_FL FS_NOATIME_FL
24281 +#define REISERFS_NODUMP_FL FS_NODUMP_FL
24282 +#define REISERFS_SECRM_FL FS_SECRM_FL
24283 +#define REISERFS_UNRM_FL FS_UNRM_FL
24284 +#define REISERFS_COMPR_FL FS_COMPR_FL
24285 +#define REISERFS_NOTAIL_FL FS_NOTAIL_FL
24287 +/* persistent flags that file inherits from the parent directory */
24288 +#define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL | \
24289 + REISERFS_SYNC_FL | \
24290 + REISERFS_NOATIME_FL | \
24291 + REISERFS_NODUMP_FL | \
24292 + REISERFS_SECRM_FL | \
24293 + REISERFS_COMPR_FL | \
24294 + REISERFS_NOTAIL_FL )
24296 +/*
24297 + * Stat Data on disk (reiserfs version of UFS disk inode minus the
24298 + * address blocks)
24299 + */
24300 +struct stat_data {
24301 + __le16 sd_mode; /* file type, permissions */
24302 + __le16 sd_attrs; /* persistent inode flags */
24303 + __le32 sd_nlink; /* number of hard links */
24304 + __le64 sd_size; /* file size */
24305 + __le32 sd_uid; /* owner */
24306 + __le32 sd_gid; /* group */
24307 + __le32 sd_atime; /* time of last access */
24308 + __le32 sd_mtime; /* time file was last modified */
24310 + /*
24311 + * time inode (stat data) was last changed
24312 + * (except changes to sd_atime and sd_mtime)
24313 + */
24314 + __le32 sd_ctime;
24315 + __le32 sd_blocks;
24316 + union {
24317 + __le32 sd_rdev;
24318 + __le32 sd_generation;
24319 + } __attribute__ ((__packed__)) u;
24320 +} __attribute__ ((__packed__));
24322 +/* this is 44 bytes long */
24323 +#define SD_SIZE (sizeof(struct stat_data))
24324 +#define SD_V2_SIZE SD_SIZE
24325 +#define stat_data_v2(ih) (ih_version (ih) == KEY_FORMAT_3_6)
24326 +#define sd_v2_mode(sdp) (le16_to_cpu((sdp)->sd_mode))
24327 +#define set_sd_v2_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v))
24328 +/* sd_reserved */
24329 +/* set_sd_reserved */
24330 +#define sd_v2_nlink(sdp) (le32_to_cpu((sdp)->sd_nlink))
24331 +#define set_sd_v2_nlink(sdp,v) ((sdp)->sd_nlink = cpu_to_le32(v))
24332 +#define sd_v2_size(sdp) (le64_to_cpu((sdp)->sd_size))
24333 +#define set_sd_v2_size(sdp,v) ((sdp)->sd_size = cpu_to_le64(v))
24334 +#define sd_v2_uid(sdp) (le32_to_cpu((sdp)->sd_uid))
24335 +#define set_sd_v2_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le32(v))
24336 +#define sd_v2_gid(sdp) (le32_to_cpu((sdp)->sd_gid))
24337 +#define set_sd_v2_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le32(v))
24338 +#define sd_v2_atime(sdp) (le32_to_cpu((sdp)->sd_atime))
24339 +#define set_sd_v2_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v))
24340 +#define sd_v2_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime))
24341 +#define set_sd_v2_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v))
24342 +#define sd_v2_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime))
24343 +#define set_sd_v2_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v))
24344 +#define sd_v2_blocks(sdp) (le32_to_cpu((sdp)->sd_blocks))
24345 +#define set_sd_v2_blocks(sdp,v) ((sdp)->sd_blocks = cpu_to_le32(v))
24346 +#define sd_v2_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev))
24347 +#define set_sd_v2_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v))
24348 +#define sd_v2_generation(sdp) (le32_to_cpu((sdp)->u.sd_generation))
24349 +#define set_sd_v2_generation(sdp,v) ((sdp)->u.sd_generation = cpu_to_le32(v))
24350 +#define sd_v2_attrs(sdp) (le16_to_cpu((sdp)->sd_attrs))
24351 +#define set_sd_v2_attrs(sdp,v) ((sdp)->sd_attrs = cpu_to_le16(v))
24353 +/***************************************************************************
24354 + * DIRECTORY STRUCTURE *
24355 + ***************************************************************************/
24356 +/*
24357 + * Picture represents the structure of directory items
24358 + * ________________________________________________
24359 + * | Array of | | | | | |
24360 + * | directory |N-1| N-2 | .... | 1st |0th|
24361 + * | entry headers | | | | | |
24362 + * |_______________|___|_____|________|_______|___|
24363 + * <---- directory entries ------>
24365 + * First directory item has k_offset component 1. We store "." and ".."
24366 + * in one item, always, we never split "." and ".." into differing
24367 + * items. This makes, among other things, the code for removing
24368 + * directories simpler.
24369 + */
24370 +#define SD_OFFSET 0
24371 +#define SD_UNIQUENESS 0
24372 +#define DOT_OFFSET 1
24373 +#define DOT_DOT_OFFSET 2
24374 +#define DIRENTRY_UNIQUENESS 500
24376 +#define FIRST_ITEM_OFFSET 1
24378 +/*
24379 + * Q: How do we get the key of the object an entry points to from the entry?
24380 + *
24381 + * A: Each directory entry has a header with deh_dir_id and deh_objectid
24382 + * fields; together they form the key of the object the entry points to
24383 + */
24385 +/*
24386 + * NOT IMPLEMENTED:
24387 + * Directory will someday contain stat data of object
24388 + */
24390 +struct reiserfs_de_head {
24391 + __le32 deh_offset; /* third component of the directory entry key */
24393 + /*
24394 + * objectid of the parent directory of the object that is referenced
24395 + * by the directory entry
24396 + */
24397 + __le32 deh_dir_id;
24399 + /* objectid of the object, that is referenced by directory entry */
24400 + __le32 deh_objectid;
24401 + __le16 deh_location; /* offset of name in the whole item */
24403 + /*
24404 + * whether 1) the entry contains stat data (for the future), and
24405 + * 2) the entry is hidden (unlinked)
24406 + */
24407 + __le16 deh_state;
24408 +} __attribute__ ((__packed__));
24409 +#define DEH_SIZE sizeof(struct reiserfs_de_head)
24410 +#define deh_offset(p_deh) (le32_to_cpu((p_deh)->deh_offset))
24411 +#define deh_dir_id(p_deh) (le32_to_cpu((p_deh)->deh_dir_id))
24412 +#define deh_objectid(p_deh) (le32_to_cpu((p_deh)->deh_objectid))
24413 +#define deh_location(p_deh) (le16_to_cpu((p_deh)->deh_location))
24414 +#define deh_state(p_deh) (le16_to_cpu((p_deh)->deh_state))
24416 +#define put_deh_offset(p_deh,v) ((p_deh)->deh_offset = cpu_to_le32((v)))
24417 +#define put_deh_dir_id(p_deh,v) ((p_deh)->deh_dir_id = cpu_to_le32((v)))
24418 +#define put_deh_objectid(p_deh,v) ((p_deh)->deh_objectid = cpu_to_le32((v)))
24419 +#define put_deh_location(p_deh,v) ((p_deh)->deh_location = cpu_to_le16((v)))
24420 +#define put_deh_state(p_deh,v) ((p_deh)->deh_state = cpu_to_le16((v)))
24422 +/* empty directory contains two entries "." and ".." and their headers */
24423 +#define EMPTY_DIR_SIZE \
24424 +(DEH_SIZE * 2 + ROUND_UP (sizeof(".") - 1) + ROUND_UP (sizeof("..") - 1))
24426 +/* old format directories have this size when empty */
24427 +#define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3)
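+/*
+ * Worked numbers: DEH_SIZE is 16 bytes (three __le32 plus two __le16),
+ * so EMPTY_DIR_SIZE = 2 * 16 + ROUND_UP(1) + ROUND_UP(2) = 48 bytes,
+ * while the v1 format stores the two names unpadded:
+ * EMPTY_DIR_SIZE_V1 = 2 * 16 + 3 = 35 bytes.
+ */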
24429 +#define DEH_Statdata 0 /* not used now */
24430 +#define DEH_Visible 2
24432 +/* 64 bit systems (and the S/390) need to be aligned explicitly -jdm */
24433 +#if BITS_PER_LONG == 64 || defined(__s390__) || defined(__hppa__)
24434 +# define ADDR_UNALIGNED_BITS (3)
24435 +#endif
24437 +/*
24438 + * These are only used to manipulate deh_state.
24439 + * Because of this, we'll use the ext2_ bit routines,
24440 + * since they are little endian
24441 + */
24442 +#ifdef ADDR_UNALIGNED_BITS
24444 +# define aligned_address(addr) ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1)))
24445 +# define unaligned_offset(addr) (((int)((long)(addr) & ((1 << ADDR_UNALIGNED_BITS) - 1))) << 3)
24447 +# define set_bit_unaligned(nr, addr) \
24448 + __test_and_set_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
24449 +# define clear_bit_unaligned(nr, addr) \
24450 + __test_and_clear_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
24451 +# define test_bit_unaligned(nr, addr) \
24452 + test_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
24454 +#else
24456 +# define set_bit_unaligned(nr, addr) __test_and_set_bit_le(nr, addr)
24457 +# define clear_bit_unaligned(nr, addr) __test_and_clear_bit_le(nr, addr)
24458 +# define test_bit_unaligned(nr, addr) test_bit_le(nr, addr)
24460 +#endif
24462 +#define mark_de_with_sd(deh) set_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
24463 +#define mark_de_without_sd(deh) clear_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
24464 +#define mark_de_visible(deh) set_bit_unaligned (DEH_Visible, &((deh)->deh_state))
24465 +#define mark_de_hidden(deh) clear_bit_unaligned (DEH_Visible, &((deh)->deh_state))
24467 +#define de_with_sd(deh) test_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
24468 +#define de_visible(deh) test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
24469 +#define de_hidden(deh) !test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
24471 +extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
24472 + __le32 par_dirid, __le32 par_objid);
24473 +extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
24474 + __le32 par_dirid, __le32 par_objid);
24476 +/* two entries per block (at least) */
24477 +#define REISERFS_MAX_NAME(block_size) 255
24479 +/*
24480 + * this structure is used for operations on directory entries. It is
24481 + * not a disk structure.
24482 + *
24483 + * When reiserfs_find_entry or search_by_entry_key find a directory
24484 + * entry, they return a filled reiserfs_dir_entry structure
24485 + */
24486 +struct reiserfs_dir_entry {
24487 + struct buffer_head *de_bh;
24488 + int de_item_num;
24489 + struct item_head *de_ih;
24490 + int de_entry_num;
24491 + struct reiserfs_de_head *de_deh;
24492 + int de_entrylen;
24493 + int de_namelen;
24494 + char *de_name;
24495 + unsigned long *de_gen_number_bit_string;
24497 + __u32 de_dir_id;
24498 + __u32 de_objectid;
24500 + struct cpu_key de_entry_key;
24501 +};
24503 +/*
24504 + * these defines are useful when a particular member of
24505 + * a reiserfs_dir_entry is needed
24506 + */
24508 +/* pointer to file name, stored in entry */
24509 +#define B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh) \
24510 + (ih_item_body(bh, ih) + deh_location(deh))
24512 +/* length of name */
24513 +#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \
24514 +(I_DEH_N_ENTRY_LENGTH (ih, deh, entry_num) - (de_with_sd (deh) ? SD_SIZE : 0))
24516 +/* hash value occupies bits from 7 up to 30 */
24517 +#define GET_HASH_VALUE(offset) ((offset) & 0x7fffff80LL)
24518 +/* generation number occupies 7 bits starting from 0 up to 6 */
24519 +#define GET_GENERATION_NUMBER(offset) ((offset) & 0x7fLL)
24520 +#define MAX_GENERATION_NUMBER 127
24522 +#define SET_GENERATION_NUMBER(offset,gen_number) (GET_HASH_VALUE(offset)|(gen_number))
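+/*
+ * Example: names that hash to the same value get offsets sharing bits
+ * 7..30 and differing only in the generation bits, i.e.
+ * SET_GENERATION_NUMBER(hash, 0), SET_GENERATION_NUMBER(hash, 1), ...
+ * so at most MAX_GENERATION_NUMBER + 1 = 128 colliding names can exist
+ * per hash value in one directory.
+ */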
24524 +/*
24525 + * Picture represents an internal node of the reiserfs tree
24526 + * ______________________________________________________
24527 + * | | Array of | Array of | Free |
24528 + * |block | keys | pointers | space |
24529 + * | head | N | N+1 | |
24530 + * |______|_______________|___________________|___________|
24531 + */
24533 +/***************************************************************************
24534 + * DISK CHILD *
24535 + ***************************************************************************/
24536 +/*
24537 + * Disk child pointer:
24538 + * The pointer from an internal node of the tree to a node that is on disk.
24539 + */
24540 +struct disk_child {
24541 + __le32 dc_block_number; /* Disk child's block number. */
24542 + __le16 dc_size; /* Disk child's used space. */
24543 + __le16 dc_reserved;
24544 +};
24546 +#define DC_SIZE (sizeof(struct disk_child))
24547 +#define dc_block_number(dc_p) (le32_to_cpu((dc_p)->dc_block_number))
24548 +#define dc_size(dc_p) (le16_to_cpu((dc_p)->dc_size))
24549 +#define put_dc_block_number(dc_p, val) do { (dc_p)->dc_block_number = cpu_to_le32(val); } while(0)
24550 +#define put_dc_size(dc_p, val) do { (dc_p)->dc_size = cpu_to_le16(val); } while(0)
24552 +/* Get disk child by buffer header and position in the tree node. */
24553 +#define B_N_CHILD(bh, n_pos) ((struct disk_child *)\
24554 +((bh)->b_data + BLKH_SIZE + B_NR_ITEMS(bh) * KEY_SIZE + DC_SIZE * (n_pos)))
24556 +/* Get disk child number by buffer header and position in the tree node. */
24557 +#define B_N_CHILD_NUM(bh, n_pos) (dc_block_number(B_N_CHILD(bh, n_pos)))
24558 +#define PUT_B_N_CHILD_NUM(bh, n_pos, val) \
24559 + (put_dc_block_number(B_N_CHILD(bh, n_pos), val))
24561 + /* maximal value of field child_size in structure disk_child */
24562 + /* child size is the combined size of all items and their headers */
24563 +#define MAX_CHILD_SIZE(bh) ((int)( (bh)->b_size - BLKH_SIZE ))
24565 +/* amount of used space in buffer (not including block head) */
24566 +#define B_CHILD_SIZE(cur) (MAX_CHILD_SIZE(cur)-(B_FREE_SPACE(cur)))
24568 +/* max and min number of keys in internal node */
24569 +#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) )
24570 +#define MIN_NR_KEY(bh) (MAX_NR_KEY(bh)/2)
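+/*
+ * Worked example (assuming 4 KiB blocks): BLKH_SIZE = 24, KEY_SIZE = 16
+ * and DC_SIZE = 8, so MAX_NR_KEY = (4096 - 24 - 8) / (16 + 8) = 169
+ * keys, i.e. at most 170 disk child pointers per internal node, and
+ * MIN_NR_KEY = 84.
+ */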
24572 +/***************************************************************************
24573 + * PATH STRUCTURES AND DEFINES *
24574 + ***************************************************************************/
24576 +/*
24577 + * search_by_key fills up the path from the root to the leaf as it descends
24578 + * the tree looking for the key. It uses reiserfs_bread to try to find
24579 + * buffers in the cache given their block number. If it does not find
24580 + * them in the cache it reads them from disk. For each node search_by_key
24581 + * finds using reiserfs_bread it then uses bin_search to look through that
24582 + * node. bin_search will find the position of the block_number of the next
24583 + * node if it is looking through an internal node. If it is looking through
24584 + * a leaf node bin_search will find the position of the item which has key
24585 + * either equal to given key, or which is the maximal key less than the
24586 + * given key.
24587 + */
24589 +struct path_element {
24590 + /* Pointer to the buffer at the path in the tree. */
24591 + struct buffer_head *pe_buffer;
24592 + /* Position in the tree node which is placed in the buffer above. */
24593 + int pe_position;
24594 +};
24596 +/*
24597 + * maximal height of a tree. don't change this without
24598 + * changing JOURNAL_PER_BALANCE_CNT
24599 + */
24600 +#define MAX_HEIGHT 5
24602 +/* Must equal MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */
24603 +#define EXTENDED_MAX_HEIGHT 7
24605 +/* Must be equal to at least 2. */
24606 +#define FIRST_PATH_ELEMENT_OFFSET 2
24608 +/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */
24609 +#define ILLEGAL_PATH_ELEMENT_OFFSET 1
24611 +/* this MUST be MAX_HEIGHT + 1. See about FEB below */
24612 +#define MAX_FEB_SIZE 6
24614 +/*
24615 + * We need to keep track of who the ancestors of nodes are. When we
24616 + * perform a search we record which nodes were visited while
24617 + * descending the tree looking for the node we searched for. This list
24618 + * of nodes is called the path. This information is used while
24619 + * performing balancing. Note that this path information may become
24620 + * invalid, and this means we must check it when using it to see if it
24621 + * is still valid. You'll need to read search_by_key and the comments
24622 + * in it, especially about decrement_counters_in_path(), to understand
24623 + * this structure.
24625 + * Paths make the code so much harder to work with and debug.... An
24626 + * enormous number of bugs are due to them, and trying to write or modify
24627 + * code that uses them just makes my head hurt. They are based on an
24628 + * excessive effort to avoid disturbing the precious VFS code.:-( The
24629 + * gods only know how we are going to SMP the code that uses them.
24630 + * znodes are the way!
24631 + */
24633 +#define PATH_READA 0x1 /* do read ahead */
24634 +#define PATH_READA_BACK 0x2 /* read backwards */
24636 +struct treepath {
24637 + int path_length; /* Length of the path_elements array below. */
24638 + int reada;
24639 + /* Array of the path elements. */
24640 + struct path_element path_elements[EXTENDED_MAX_HEIGHT];
24641 + int pos_in_item;
24642 +};
24644 +#define pos_in_item(path) ((path)->pos_in_item)
24646 +#define INITIALIZE_PATH(var) \
24647 +struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
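+/*
+ * Typical usage sketch (given a struct super_block *sb and a struct
+ * cpu_key key; search_by_key() and pathrelse() are declared elsewhere
+ * in this header):
+ *
+ *	INITIALIZE_PATH(path);
+ *	if (search_by_key(sb, &key, &path, DISK_LEAF_NODE_LEVEL) == ITEM_FOUND)
+ *		ih = tp_item_head(&path);
+ *	pathrelse(&path);
+ *
+ * The path pins the buffers it visited, so it must be released before
+ * the caller returns.
+ */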
24649 +/* Get path element by path and path position. */
24650 +#define PATH_OFFSET_PELEMENT(path, n_offset) ((path)->path_elements + (n_offset))
24652 +/* Get buffer header at the path by path and path position. */
24653 +#define PATH_OFFSET_PBUFFER(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_buffer)
24655 +/* Get position in the element at the path by path and path position. */
24656 +#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position)
24658 +#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length))
24660 +/*
24661 + * you know, to the person who didn't write this the macro name does not
24662 + * at first suggest what it does. Maybe POSITION_FROM_PATH_END? Or
24663 + * maybe we should just focus on dumping paths... -Hans
24664 + */
24665 +#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length))
24667 +/*
24668 + * in do_balance leaf has h == 0 in contrast with path structure,
24669 + * where root has level == 0. That is why we need these defines
24670 + */
24672 +/* tb->S[h] */
24673 +#define PATH_H_PBUFFER(path, h) \
24674 + PATH_OFFSET_PBUFFER(path, path->path_length - (h))
24676 +/* tb->F[h] or tb->S[0]->b_parent */
24677 +#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER(path, (h) + 1)
24679 +#define PATH_H_POSITION(path, h) \
24680 + PATH_OFFSET_POSITION(path, path->path_length - (h))
24682 +/* tb->S[h]->b_item_order */
24683 +#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1)
24685 +#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h))
24687 +static inline void *reiserfs_node_data(const struct buffer_head *bh)
24688 +{
24689 + return bh->b_data + sizeof(struct block_head);
24690 +}
24692 +/* get key from internal node */
24693 +static inline struct reiserfs_key *internal_key(struct buffer_head *bh,
24694 + int item_num)
24695 +{
24696 + struct reiserfs_key *key = reiserfs_node_data(bh);
24698 + return &key[item_num];
24699 +}
24701 +/* get the item header from leaf node */
24702 +static inline struct item_head *item_head(const struct buffer_head *bh,
24703 + int item_num)
24704 +{
24705 + struct item_head *ih = reiserfs_node_data(bh);
24707 + return &ih[item_num];
24708 +}
24710 +/* get the key from leaf node */
24711 +static inline struct reiserfs_key *leaf_key(const struct buffer_head *bh,
24712 + int item_num)
24713 +{
24714 + return &item_head(bh, item_num)->ih_key;
24715 +}
24717 +static inline void *ih_item_body(const struct buffer_head *bh,
24718 + const struct item_head *ih)
24719 +{
24720 + return bh->b_data + ih_location(ih);
24721 +}
24723 +/* get item body from leaf node */
24724 +static inline void *item_body(const struct buffer_head *bh, int item_num)
24725 +{
24726 + return ih_item_body(bh, item_head(bh, item_num));
24727 +}
24729 +static inline struct item_head *tp_item_head(const struct treepath *path)
24730 +{
24731 + return item_head(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
24732 +}
24734 +static inline void *tp_item_body(const struct treepath *path)
24735 +{
24736 + return item_body(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
24737 +}
24739 +#define get_last_bh(path) PATH_PLAST_BUFFER(path)
24740 +#define get_item_pos(path) PATH_LAST_POSITION(path)
24741 +#define item_moved(ih,path) comp_items(ih, path)
24742 +#define path_changed(ih,path) comp_items (ih, path)
24744 +/* array of the entry headers */
24745 + /* get item body */
24746 +#define B_I_DEH(bh, ih) ((struct reiserfs_de_head *)(ih_item_body(bh, ih)))
24748 +/*
24749 + * length of the directory entry in directory item. This define
24750 + * calculates length of i-th directory entry using directory entry
24751 + * locations from dir entry head. When it calculates length of 0-th
24752 + * directory entry, it uses length of whole item in place of entry
24753 + * location of the non-existent following entry in the calculation.
24754 + * See picture above.
24755 + */
24756 +static inline int entry_length(const struct buffer_head *bh,
24757 + const struct item_head *ih, int pos_in_item)
24758 +{
24759 + struct reiserfs_de_head *deh;
24761 + deh = B_I_DEH(bh, ih) + pos_in_item;
24762 + if (pos_in_item)
24763 + return deh_location(deh - 1) - deh_location(deh);
24765 + return ih_item_len(ih) - deh_location(deh);
24766 +}
24768 +/***************************************************************************
24769 + * MISC *
24770 + ***************************************************************************/
24772 +/* Size of pointer to the unformatted node. */
24773 +#define UNFM_P_SIZE (sizeof(unp_t))
24774 +#define UNFM_P_SHIFT 2
24776 +/* in the in-core inode the key is stored in le form */
24777 +#define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key))
24779 +#define MAX_UL_INT 0xffffffff
24780 +#define MAX_INT 0x7ffffff
24781 +#define MAX_US_INT 0xffff
24783 +/* reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset */
24784 +static inline loff_t max_reiserfs_offset(struct inode *inode)
24785 +{
24786 + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5)
24787 + return (loff_t) U32_MAX;
24789 + return (loff_t) ((~(__u64) 0) >> 4);
24790 +}
24792 +#define MAX_KEY_OBJECTID MAX_UL_INT
24794 +#define MAX_B_NUM MAX_UL_INT
24795 +#define MAX_FC_NUM MAX_US_INT
24797 +/* the purpose is to detect overflow of an unsigned short */
24798 +#define REISERFS_LINK_MAX (MAX_US_INT - 1000)
24800 +/*
24801 + * The following defines are used in reiserfs_insert_item
24802 + * and reiserfs_append_item
24803 + */
24804 +#define REISERFS_KERNEL_MEM 0 /* kernel memory mode */
24805 +#define REISERFS_USER_MEM 1 /* user memory mode */
24807 +#define fs_generation(s) (REISERFS_SB(s)->s_generation_counter)
24808 +#define get_generation(s) atomic_read (&fs_generation(s))
24809 +#define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen)
24810 +#define __fs_changed(gen,s) (gen != get_generation (s))
24811 +#define fs_changed(gen,s) \
24812 +({ \
24813 + reiserfs_cond_resched(s); \
24814 + __fs_changed(gen, s); \
24815 +})
24817 +/***************************************************************************
24818 + * FIXATE NODES *
24819 + ***************************************************************************/
24821 +#define VI_TYPE_LEFT_MERGEABLE 1
24822 +#define VI_TYPE_RIGHT_MERGEABLE 2
24824 +/*
24825 + * To make any changes in the tree we always first find the node that
24826 + * contains the item to be changed/deleted, or the place to insert a new
24827 + * item. We call this node S. To do balancing we need to decide what
24828 + * we will shift to left/right neighbor, or to a new node, where new
24829 + * item will be etc. To make this analysis simpler we build virtual
24830 + * node. Virtual node is an array of items, that will replace items of
24831 + * node S. (For instance if we are going to delete an item, virtual
24832 + * node does not contain it). Virtual node keeps information about
24833 + * item sizes and types, mergeability of first and last items, sizes
24834 + * of all entries in directory item. We use this array of items when
24835 + * calculating what we can shift to neighbors and how many nodes we
24836 + * have to have if we do not do any shifting, if we shift to the left/right
24837 + * neighbor or to both.
24838 + */
24839 +struct virtual_item {
24840 + int vi_index; /* index in the array of item operations */
24841 + unsigned short vi_type; /* left/right mergeability */
24843 + /* length of item that it will have after balancing */
24844 + unsigned short vi_item_len;
24846 + struct item_head *vi_ih;
24847 + const char *vi_item; /* body of item (old or new) */
24848 + const void *vi_new_data; /* 0 always except in paste mode */
24849 + void *vi_uarea; /* item specific area */
24850 +};
24852 +struct virtual_node {
24853 + /* this is a pointer to the free space in the buffer */
24854 + char *vn_free_ptr;
24856 + unsigned short vn_nr_item; /* number of items in virtual node */
24858 + /*
24859 + * size of node, that node would have if it has
24860 + * unlimited size and no balancing is performed
24861 + */
24862 + short vn_size;
24864 + /* mode of balancing (paste, insert, delete, cut) */
24865 + short vn_mode;
24867 + short vn_affected_item_num;
24868 + short vn_pos_in_item;
24870 + /* item header of inserted item, 0 for other modes */
24871 + struct item_head *vn_ins_ih;
24872 + const void *vn_data;
24874 + /* array of items (including a new one, excluding item to be deleted) */
24875 + struct virtual_item *vn_vi;
24876 +};
24878 +/* used by directory items when creating virtual nodes */
24879 +struct direntry_uarea {
24880 + int flags;
24881 + __u16 entry_count;
24882 + __u16 entry_sizes[];
24883 +} __attribute__ ((__packed__));
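+/*
+ * A worked size note: with the packed layout above the fixed part of
+ * struct direntry_uarea is 6 bytes (a 4-byte flags word plus a 2-byte
+ * entry_count), and each directory entry adds one 2-byte element to
+ * entry_sizes[].
+ */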
24885 +/***************************************************************************
24886 + * TREE BALANCE *
24887 + ***************************************************************************/
24889 +/*
24890 + * This temporary structure is used in tree balance algorithms, and
24891 + * constructed as we go to the extent that its various parts are
24892 + * needed. It contains arrays of nodes that can potentially be
24893 + * involved in the balancing of node S, and parameters that define how
24894 + * each of the nodes must be balanced. Note that in these algorithms
24895 + * for balancing the worst case is to need to balance the current node
24896 + * S and the left and right neighbors and all of their parents plus
24897 + * create a new node. We implement S1 balancing for the leaf nodes
24898 + * and S0 balancing for the internal nodes (S1 and S0 are defined in
24899 + * our papers.)
24900 + */
24902 +/* size of the array of buffers to free at end of do_balance */
24903 +#define MAX_FREE_BLOCK 7
24905 +/* maximum number of FEB blocknrs on a single level */
24906 +#define MAX_AMOUNT_NEEDED 2
24908 +/* someday somebody will prefix every field in this struct with tb_ */
24909 +struct tree_balance {
24910 + int tb_mode;
24911 + int need_balance_dirty;
24912 + struct super_block *tb_sb;
24913 + struct reiserfs_transaction_handle *transaction_handle;
24914 + struct treepath *tb_path;
24916 + /* array of left neighbors of nodes in the path */
24917 + struct buffer_head *L[MAX_HEIGHT];
24919 + /* array of right neighbors of nodes in the path */
24920 + struct buffer_head *R[MAX_HEIGHT];
24922 + /* array of fathers of the left neighbors */
24923 + struct buffer_head *FL[MAX_HEIGHT];
24925 + /* array of fathers of the right neighbors */
24926 + struct buffer_head *FR[MAX_HEIGHT];
24927 + /* array of common parents of center node and its left neighbor */
24928 + struct buffer_head *CFL[MAX_HEIGHT];
24930 + /* array of common parents of center node and its right neighbor */
24931 + struct buffer_head *CFR[MAX_HEIGHT];
24933 + /*
24934 + * array of empty buffers. Number of buffers in array equals
24935 + * cur_blknum.
24936 + */
24937 + struct buffer_head *FEB[MAX_FEB_SIZE];
24938 + struct buffer_head *used[MAX_FEB_SIZE];
24939 + struct buffer_head *thrown[MAX_FEB_SIZE];
24941 + /*
24942 + * array of number of items which must be shifted to the left in
24943 + * order to balance the current node; for leaves includes item that
24944 + * will be partially shifted; for internal nodes, it is the number
24945 + * of child pointers rather than items. It includes the new item
24946 + * being created. The code sometimes subtracts one to get the
24947 + * number of wholly shifted items for other purposes.
24948 + */
24949 + int lnum[MAX_HEIGHT];
24951 + /* substitute right for left in comment above */
24952 + int rnum[MAX_HEIGHT];
24954 + /*
24955 + * array indexed by height h mapping the key delimiting L[h] and
24956 + * S[h] to its item number within the node CFL[h]
24957 + */
24958 + int lkey[MAX_HEIGHT];
24960 + /* substitute r for l in comment above */
24961 + int rkey[MAX_HEIGHT];
24963 + /*
24964 + * the number of bytes by which we are trying to add or remove from
24965 + * S[h]. A negative value means removing.
24966 + */
24967 + int insert_size[MAX_HEIGHT];
24969 + /*
24970 + * number of nodes that will replace node S[h] after balancing
24971 + * on the level h of the tree. If 0 then S is being deleted,
24972 + * if 1 then S is remaining and no new nodes are being created,
24973 + * if 2 or 3 then 1 or 2 new nodes are being created
24974 + */
24975 + int blknum[MAX_HEIGHT];
24977 + /* fields that are used only for balancing leaves of the tree */
24979 + /* number of empty blocks having been already allocated */
24980 + int cur_blknum;
24982 + /* number of items that fall into left most node when S[0] splits */
24983 + int s0num;
24985 + /*
24986 + * number of bytes which can flow to the left neighbor from the left
24987 + * most liquid item that cannot be shifted from S[0] entirely
24988 + * if -1 then nothing will be partially shifted
24989 + */
24990 + int lbytes;
24992 + /*
24993 + * number of bytes which will flow to the right neighbor from the right
24994 + * most liquid item that cannot be shifted from S[0] entirely
24995 + * if -1 then nothing will be partially shifted
24996 + */
24997 + int rbytes;
25000 + /*
25001 + * index into the array of item headers in
25002 + * S[0] of the affected item
25003 + */
25004 + int item_pos;
25006 + /* new nodes allocated to hold what could not fit into S */
25007 + struct buffer_head *S_new[2];
25009 + /*
25010 + * number of items that will be placed into nodes in S_new
25011 + * when S[0] splits
25012 + */
25013 + int snum[2];
25015 + /*
25016 + * number of bytes which flow to nodes in S_new when S[0] splits
25017 + * note: if S[0] splits into 3 nodes, then items do not need to be cut
25018 + */
25019 + int sbytes[2];
25021 + int pos_in_item;
25022 + int zeroes_num;
25024 + /*
25025 + * buffers which are to be freed after do_balance finishes
25026 + * by unfix_nodes
25027 + */
25028 + struct buffer_head *buf_to_free[MAX_FREE_BLOCK];
25030 + /*
25031 + * kmalloced memory. Used to create virtual node and keep
25032 + * map of dirtied bitmap blocks
25033 + */
25034 + char *vn_buf;
25036 + int vn_buf_size; /* size of the vn_buf */
25038 + /* VN starts after bitmap of bitmap blocks */
25039 + struct virtual_node *tb_vn;
25041 + /*
25042 + * saved value of `reiserfs_generation' counter; see
25043 + * FILESYSTEM_CHANGED() macro in reiserfs_fs.h
25044 + */
25045 + int fs_gen;
25047 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
25048 + /*
25049 + * key pointer, to pass to block allocator or
25050 + * another low-level subsystem
25051 + */
25052 + struct in_core_key key;
25053 +#endif
25054 +};
25056 +/* These are modes of balancing */
25058 +/* When inserting an item. */
25059 +#define M_INSERT 'i'
25060 +/*
25061 + * When inserting into (directories only) or appending onto an already
25062 + * existent item.
25063 + */
25064 +#define M_PASTE 'p'
25065 +/* When deleting an item. */
25066 +#define M_DELETE 'd'
25067 +/* When truncating an item or removing an entry from a (directory) item. */
25068 +#define M_CUT 'c'
25070 +/* used when balancing on leaf level skipped (in reiserfsck) */
25071 +#define M_INTERNAL 'n'
25073 +/*
25074 + * When further balancing is not needed, then do_balance does not need
25075 + * to be called.
25076 + */
25077 +#define M_SKIP_BALANCING 's'
25078 +#define M_CONVERT 'v'
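+/*
+ * A minimal dispatch sketch (illustrative only; the real consumers of
+ * these mode characters are fix_nodes() and do_balance(), declared
+ * further below):
+ */
+static inline const char *balance_mode_name(int mode)
+{
+ switch (mode) {
+ case M_INSERT: return "insert";
+ case M_PASTE: return "paste";
+ case M_DELETE: return "delete";
+ case M_CUT: return "cut";
+ case M_INTERNAL: return "internal";
+ default: return "skip/convert";
+ }
+}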
25080 +/* modes of leaf_move_items */
25081 +#define LEAF_FROM_S_TO_L 0
25082 +#define LEAF_FROM_S_TO_R 1
25083 +#define LEAF_FROM_R_TO_L 2
25084 +#define LEAF_FROM_L_TO_R 3
25085 +#define LEAF_FROM_S_TO_SNEW 4
25087 +#define FIRST_TO_LAST 0
25088 +#define LAST_TO_FIRST 1
25090 +/*
25091 + * used in do_balance for passing parent-of-node information that has
25092 + * been obtained from the tb struct
25093 + */
25094 +struct buffer_info {
25095 + struct tree_balance *tb;
25096 + struct buffer_head *bi_bh;
25097 + struct buffer_head *bi_parent;
25098 + int bi_position;
25099 +};
25101 +static inline struct super_block *sb_from_tb(struct tree_balance *tb)
25102 +{
25103 + return tb ? tb->tb_sb : NULL;
25104 +}
25106 +static inline struct super_block *sb_from_bi(struct buffer_info *bi)
25107 +{
25108 + return bi ? sb_from_tb(bi->tb) : NULL;
25109 +}
25111 +/*
25112 + * there are 4 types of items: stat data, directory item, indirect, direct.
25113 + * +-------------------+------------+--------------+------------+
25114 + * | | k_offset | k_uniqueness | mergeable? |
25115 + * +-------------------+------------+--------------+------------+
25116 + * | stat data | 0 | 0 | no |
25117 + * +-------------------+------------+--------------+------------+
25118 + * | 1st directory item| DOT_OFFSET | DIRENTRY_ .. | no |
25119 + * | non 1st directory | hash value | UNIQUENESS | yes |
25120 + * | item | | | |
25121 + * +-------------------+------------+--------------+------------+
25122 + * | indirect item | offset + 1 |TYPE_INDIRECT | [1] |
25123 + * +-------------------+------------+--------------+------------+
25124 + * | direct item | offset + 1 |TYPE_DIRECT | [2] |
25125 + * +-------------------+------------+--------------+------------+
25127 + * [1] if this is not the first indirect item of the object
25128 + * [2] if this is not the first direct item of the object
25129 + */
25131 +struct item_operations {
25132 + int (*bytes_number) (struct item_head * ih, int block_size);
25133 + void (*decrement_key) (struct cpu_key *);
25134 + int (*is_left_mergeable) (struct reiserfs_key * ih,
25135 + unsigned long bsize);
25136 + void (*print_item) (struct item_head *, char *item);
25137 + void (*check_item) (struct item_head *, char *item);
25139 + int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi,
25140 + int is_affected, int insert_size);
25141 + int (*check_left) (struct virtual_item * vi, int free,
25142 + int start_skip, int end_skip);
25143 + int (*check_right) (struct virtual_item * vi, int free);
25144 + int (*part_size) (struct virtual_item * vi, int from, int to);
25145 + int (*unit_num) (struct virtual_item * vi);
25146 + void (*print_vi) (struct virtual_item * vi);
25147 +};
25149 +extern struct item_operations *item_ops[TYPE_ANY + 1];
25151 +#define op_bytes_number(ih,bsize) item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize)
25152 +#define op_is_left_mergeable(key,bsize) item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize)
25153 +#define op_print_item(ih,item) item_ops[le_ih_k_type (ih)]->print_item (ih, item)
25154 +#define op_check_item(ih,item) item_ops[le_ih_k_type (ih)]->check_item (ih, item)
25155 +#define op_create_vi(vn,vi,is_affected,insert_size) item_ops[le_ih_k_type ((vi)->vi_ih)]->create_vi (vn,vi,is_affected,insert_size)
25156 +#define op_check_left(vi,free,start_skip,end_skip) item_ops[(vi)->vi_index]->check_left (vi, free, start_skip, end_skip)
25157 +#define op_check_right(vi,free) item_ops[(vi)->vi_index]->check_right (vi, free)
25158 +#define op_part_size(vi,from,to) item_ops[(vi)->vi_index]->part_size (vi, from, to)
25159 +#define op_unit_num(vi) item_ops[(vi)->vi_index]->unit_num (vi)
25160 +#define op_print_vi(vi) item_ops[(vi)->vi_index]->print_vi (vi)
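+/*
+ * A minimal usage sketch (illustrative): the op_ macros above dispatch
+ * through item_ops[] by item type, so for an item header ih in a node
+ * of block size bsize one would write, e.g.:
+ *
+ * int bytes = op_bytes_number(ih, bsize);
+ * op_print_item(ih, item_body);
+ */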
25162 +#define COMP_SHORT_KEYS comp_short_keys
25164 +/* number of blocks pointed to by the indirect item */
25165 +#define I_UNFM_NUM(ih) (ih_item_len(ih) / UNFM_P_SIZE)
25167 +/*
25168 + * the used space within the unformatted node corresponding
25169 + * to pos within the item pointed to by ih
25170 + */
25171 +#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size))
25173 +/*
25174 + * number of bytes contained by the direct item or the
25175 + * unformatted nodes the indirect item points to
25176 + */
25178 +/* following defines use reiserfs buffer header and item header */
25180 +/* get stat-data */
25181 +#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) )
25183 +/* this is 3976 for size==4096 */
25184 +#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE)
25186 +/*
25187 + * indirect items consist of entries which contain blocknrs, pos
25188 + * indicates which entry, and B_I_POS_UNFM_POINTER resolves to the
25189 + * blocknr contained by the entry pos points to
25190 + */
25191 +#define B_I_POS_UNFM_POINTER(bh, ih, pos) \
25192 + le32_to_cpu(*(((unp_t *)ih_item_body(bh, ih)) + (pos)))
25193 +#define PUT_B_I_POS_UNFM_POINTER(bh, ih, pos, val) \
25194 + (*(((unp_t *)ih_item_body(bh, ih)) + (pos)) = cpu_to_le32(val))
25196 +struct reiserfs_iget_args {
25197 + __u32 objectid;
25198 + __u32 dirid;
25199 +};
25201 +/***************************************************************************
25202 + * FUNCTION DECLARATIONS *
25203 + ***************************************************************************/
25205 +#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
25207 +#define journal_trans_half(blocksize) \
25208 + ((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32))
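+/*
+ * A worked example, assuming the 12-byte reiserfs_journal_desc header
+ * defined below: journal_trans_half(4096) = (4096 - 12 - 12) / 4 =
+ * 1018 block numbers per description block.
+ */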
25210 +/* journal.c -- see journal.c for all the comments here */
25212 +/* first block written in a commit. */
25213 +struct reiserfs_journal_desc {
25214 + __le32 j_trans_id; /* id of commit */
25216 + /* length of commit. len +1 is the commit block */
25217 + __le32 j_len;
25219 + __le32 j_mount_id; /* mount id of this trans */
25220 + __le32 j_realblock[]; /* real locations for each block */
25221 +};
25223 +#define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id)
25224 +#define get_desc_trans_len(d) le32_to_cpu((d)->j_len)
25225 +#define get_desc_mount_id(d) le32_to_cpu((d)->j_mount_id)
25227 +#define set_desc_trans_id(d,val) do { (d)->j_trans_id = cpu_to_le32 (val); } while (0)
25228 +#define set_desc_trans_len(d,val) do { (d)->j_len = cpu_to_le32 (val); } while (0)
25229 +#define set_desc_mount_id(d,val) do { (d)->j_mount_id = cpu_to_le32 (val); } while (0)
25231 +/* last block written in a commit */
25232 +struct reiserfs_journal_commit {
25233 + __le32 j_trans_id; /* must match j_trans_id from the desc block */
25234 + __le32 j_len; /* ditto */
25235 + __le32 j_realblock[]; /* real locations for each block */
25236 +};
25238 +#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id)
25239 +#define get_commit_trans_len(c) le32_to_cpu((c)->j_len)
25240 +#define get_commit_mount_id(c) le32_to_cpu((c)->j_mount_id)
25242 +#define set_commit_trans_id(c,val) do { (c)->j_trans_id = cpu_to_le32 (val); } while (0)
25243 +#define set_commit_trans_len(c,val) do { (c)->j_len = cpu_to_le32 (val); } while (0)
25245 +/*
25246 + * this header block gets written whenever a transaction is considered
25247 + * fully flushed, and is more recent than the last fully flushed transaction.
25248 + * fully flushed means all the log blocks and all the real blocks are on
25249 + * disk, and this transaction does not need to be replayed.
25250 + */
25251 +struct reiserfs_journal_header {
25252 + /* id of last fully flushed transaction */
25253 + __le32 j_last_flush_trans_id;
25255 + /* offset in the log of where to start replay after a crash */
25256 + __le32 j_first_unflushed_offset;
25258 + __le32 j_mount_id;
25259 + /* 12 */ struct journal_params jh_journal;
25260 +};
25262 +/* biggest tunable defines are right here */
25263 +#define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */
25265 +/* biggest possible single transaction, don't change for now (8/3/99) */
25266 +#define JOURNAL_TRANS_MAX_DEFAULT 1024
25267 +#define JOURNAL_TRANS_MIN_DEFAULT 256
25270 + * max blocks to batch into one transaction,
25271 + * don't make this any bigger than 900
25272 + */
25273 +#define JOURNAL_MAX_BATCH_DEFAULT 900
25274 +#define JOURNAL_MIN_RATIO 2
25275 +#define JOURNAL_MAX_COMMIT_AGE 30
25276 +#define JOURNAL_MAX_TRANS_AGE 30
25277 +#define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9)
25278 +#define JOURNAL_BLOCKS_PER_OBJECT(sb) (JOURNAL_PER_BALANCE_CNT * 3 + \
25279 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(sb) + \
25280 + REISERFS_QUOTA_TRANS_BLOCKS(sb)))
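+/*
+ * Worked numbers, assuming MAX_HEIGHT == 5 as defined earlier in this
+ * header: JOURNAL_PER_BALANCE_CNT = 3 * (5 - 2) + 9 = 18, so one
+ * object needs 18 * 3 = 54 journal blocks plus the quota reservations
+ * computed below.
+ */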
25282 +#ifdef CONFIG_QUOTA
25283 +#define REISERFS_QUOTA_OPTS ((1 << REISERFS_USRQUOTA) | (1 << REISERFS_GRPQUOTA))
25284 +/* We need to update data and inode (atime) */
25285 +#define REISERFS_QUOTA_TRANS_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? 2 : 0)
25286 +/* 1 balancing, 1 bitmap, 1 data per write + stat data update */
25287 +#define REISERFS_QUOTA_INIT_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
25288 +(DQUOT_INIT_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_INIT_REWRITE+1) : 0)
25289 +/* same as with INIT */
25290 +#define REISERFS_QUOTA_DEL_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
25291 +(DQUOT_DEL_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_DEL_REWRITE+1) : 0)
25292 +#else
25293 +#define REISERFS_QUOTA_TRANS_BLOCKS(s) 0
25294 +#define REISERFS_QUOTA_INIT_BLOCKS(s) 0
25295 +#define REISERFS_QUOTA_DEL_BLOCKS(s) 0
25296 +#endif
25298 +/*
25299 + * both of these can be as low as 1, or as high as you want. The min is the
25300 + * number of 4k bitmap nodes preallocated on mount. New nodes are allocated
25301 + * as needed, and released when transactions are committed. On release, if
25302 + * the current number of nodes is > max, the node is freed, otherwise,
25303 + * it is put on a free list for faster use later.
25304 + */
25305 +#define REISERFS_MIN_BITMAP_NODES 10
25306 +#define REISERFS_MAX_BITMAP_NODES 100
25308 +/* these are based on journal hash size of 8192 */
25309 +#define JBH_HASH_SHIFT 13
25310 +#define JBH_HASH_MASK 8191
25312 +#define _jhashfn(sb,block) \
25313 + (((unsigned long)sb>>L1_CACHE_SHIFT) ^ \
25314 + (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
25315 +#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
25317 +/* We need these to make journal.c code more readable */
25318 +#define journal_find_get_block(s, block) __find_get_block(\
25319 + file_bdev(SB_JOURNAL(s)->j_bdev_file), block, s->s_blocksize)
25320 +#define journal_getblk(s, block) __getblk(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
25321 + block, s->s_blocksize)
25322 +#define journal_bread(s, block) __bread(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
25323 + block, s->s_blocksize)
25325 +enum reiserfs_bh_state_bits {
25326 + BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */
25327 + BH_JDirty_wait,
25328 + /*
25329 + * disk block was taken off free list before being in a
25330 + * finished transaction, or written to disk. Can be reused immed.
25331 + */
25332 + BH_JNew,
25333 + BH_JPrepared,
25334 + BH_JRestore_dirty,
25335 + BH_JTest, /* debugging only will go away */
25336 +};
25338 +BUFFER_FNS(JDirty, journaled);
25339 +TAS_BUFFER_FNS(JDirty, journaled);
25340 +BUFFER_FNS(JDirty_wait, journal_dirty);
25341 +TAS_BUFFER_FNS(JDirty_wait, journal_dirty);
25342 +BUFFER_FNS(JNew, journal_new);
25343 +TAS_BUFFER_FNS(JNew, journal_new);
25344 +BUFFER_FNS(JPrepared, journal_prepared);
25345 +TAS_BUFFER_FNS(JPrepared, journal_prepared);
25346 +BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
25347 +TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
25348 +BUFFER_FNS(JTest, journal_test);
25349 +TAS_BUFFER_FNS(JTest, journal_test);
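+/*
+ * A hedged note on the expansions above: BUFFER_FNS(JDirty, journaled)
+ * generates buffer_journaled(bh), set_buffer_journaled(bh) and
+ * clear_buffer_journaled(bh), while TAS_BUFFER_FNS adds the atomic
+ * test_set_buffer_journaled(bh) / test_clear_buffer_journaled(bh)
+ * variants (see <linux/buffer_head.h>).
+ */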
25351 +/* transaction handle which is passed around for all journal calls */
25352 +struct reiserfs_transaction_handle {
25353 + /*
25354 + * super for this FS when journal_begin was called. saves calls to
25355 + * reiserfs_get_super also used by nested transactions to make
25356 + * sure they are nesting on the right FS _must_ be first
25357 + * in the handle
25358 + */
25359 + struct super_block *t_super;
25361 + int t_refcount;
25362 + int t_blocks_logged; /* number of blocks this writer has logged */
25363 + int t_blocks_allocated; /* number of blocks this writer allocated */
25365 + /* sanity check, equals the current trans id */
25366 + unsigned int t_trans_id;
25368 + void *t_handle_save; /* save existing current->journal_info */
25370 + /*
25371 + * if new block allocation occurres, that block
25372 + * should be displaced from others
25373 + */
25374 + unsigned displace_new_blocks:1;
25376 + struct list_head t_list;
25377 +};
25379 +/*
25380 + * used to keep track of ordered and tail writes, attached to the buffer
25381 + * head through b_journal_head.
25382 + */
25383 +struct reiserfs_jh {
25384 + struct reiserfs_journal_list *jl;
25385 + struct buffer_head *bh;
25386 + struct list_head list;
25387 +};
25389 +void reiserfs_free_jh(struct buffer_head *bh);
25390 +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
25391 +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
25392 +int journal_mark_dirty(struct reiserfs_transaction_handle *,
25393 + struct buffer_head *bh);
25395 +static inline int reiserfs_file_data_log(struct inode *inode)
25396 +{
25397 + if (reiserfs_data_log(inode->i_sb) ||
25398 + (REISERFS_I(inode)->i_flags & i_data_log))
25399 + return 1;
25400 + return 0;
25401 +}
25403 +static inline int reiserfs_transaction_running(struct super_block *s)
25404 +{
25405 + struct reiserfs_transaction_handle *th = current->journal_info;
25406 + if (th && th->t_super == s)
25407 + return 1;
25408 + if (th && th->t_super == NULL)
25409 + BUG();
25410 + return 0;
25411 +}
25413 +static inline int reiserfs_transaction_free_space(struct reiserfs_transaction_handle *th)
25414 +{
25415 + return th->t_blocks_allocated - th->t_blocks_logged;
25416 +}
25418 +struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
25419 + super_block
25420 + *,
25421 + int count);
25422 +int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
25423 +void reiserfs_vfs_truncate_file(struct inode *inode);
25424 +int reiserfs_commit_page(struct inode *inode, struct page *page,
25425 + unsigned from, unsigned to);
25426 +void reiserfs_flush_old_commits(struct super_block *);
25427 +int reiserfs_commit_for_inode(struct inode *);
25428 +int reiserfs_inode_needs_commit(struct inode *);
25429 +void reiserfs_update_inode_transaction(struct inode *);
25430 +void reiserfs_wait_on_write_block(struct super_block *s);
25431 +void reiserfs_block_writes(struct reiserfs_transaction_handle *th);
25432 +void reiserfs_allow_writes(struct super_block *s);
25433 +void reiserfs_check_lock_depth(struct super_block *s, char *caller);
25434 +int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh,
25435 + int wait);
25436 +void reiserfs_restore_prepared_buffer(struct super_block *,
25437 + struct buffer_head *bh);
25438 +int journal_init(struct super_block *, const char *j_dev_name, int old_format,
25439 + unsigned int);
25440 +int journal_release(struct reiserfs_transaction_handle *, struct super_block *);
25441 +int journal_release_error(struct reiserfs_transaction_handle *,
25442 + struct super_block *);
25443 +int journal_end(struct reiserfs_transaction_handle *);
25444 +int journal_end_sync(struct reiserfs_transaction_handle *);
25445 +int journal_mark_freed(struct reiserfs_transaction_handle *,
25446 + struct super_block *, b_blocknr_t blocknr);
25447 +int journal_transaction_should_end(struct reiserfs_transaction_handle *, int);
25448 +int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr,
25449 + int bit_nr, int searchall, b_blocknr_t *next);
25450 +int journal_begin(struct reiserfs_transaction_handle *,
25451 + struct super_block *sb, unsigned long);
25452 +int journal_join_abort(struct reiserfs_transaction_handle *,
25453 + struct super_block *sb);
25454 +void reiserfs_abort_journal(struct super_block *sb, int errno);
25455 +void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
25456 +int reiserfs_allocate_list_bitmaps(struct super_block *s,
25457 + struct reiserfs_list_bitmap *, unsigned int);
25459 +void reiserfs_schedule_old_flush(struct super_block *s);
25460 +void reiserfs_cancel_old_flush(struct super_block *s);
25461 +void add_save_link(struct reiserfs_transaction_handle *th,
25462 + struct inode *inode, int truncate);
25463 +int remove_save_link(struct inode *inode, int truncate);
25465 +/* objectid.c */
25466 +__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th);
25467 +void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
25468 + __u32 objectid_to_release);
25469 +int reiserfs_convert_objectid_map_v1(struct super_block *);
25471 +/* stree.c */
25472 +int B_IS_IN_TREE(const struct buffer_head *);
25473 +extern void copy_item_head(struct item_head *to,
25474 + const struct item_head *from);
25476 +/* first key is in le form, second - cpu */
25477 +extern int comp_short_keys(const struct reiserfs_key *le_key,
25478 + const struct cpu_key *cpu_key);
25479 +extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from);
25481 +/* both are in le form */
25482 +extern int comp_le_keys(const struct reiserfs_key *,
25483 + const struct reiserfs_key *);
25484 +extern int comp_short_le_keys(const struct reiserfs_key *,
25485 + const struct reiserfs_key *);
25487 +/* get key version from on disk key - kludge */
25488 +static inline int le_key_version(const struct reiserfs_key *key)
25489 +{
25490 + int type;
25492 + type = offset_v2_k_type(&(key->u.k_offset_v2));
25493 + if (type != TYPE_DIRECT && type != TYPE_INDIRECT
25494 + && type != TYPE_DIRENTRY)
25495 + return KEY_FORMAT_3_5;
25497 + return KEY_FORMAT_3_6;
25498 +}
25501 +static inline void copy_key(struct reiserfs_key *to,
25502 + const struct reiserfs_key *from)
25503 +{
25504 + memcpy(to, from, KEY_SIZE);
25505 +}
25507 +int comp_items(const struct item_head *stored_ih, const struct treepath *path);
25508 +const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
25509 + const struct super_block *sb);
25510 +int search_by_key(struct super_block *, const struct cpu_key *,
25511 + struct treepath *, int);
25512 +#define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL)
25513 +int search_for_position_by_key(struct super_block *sb,
25514 + const struct cpu_key *cpu_key,
25515 + struct treepath *search_path);
25516 +extern void decrement_bcount(struct buffer_head *bh);
25517 +void decrement_counters_in_path(struct treepath *search_path);
25518 +void pathrelse(struct treepath *search_path);
25519 +int reiserfs_check_path(struct treepath *p);
25520 +void pathrelse_and_restore(struct super_block *s, struct treepath *search_path);
25522 +int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
25523 + struct treepath *path,
25524 + const struct cpu_key *key,
25525 + struct item_head *ih,
25526 + struct inode *inode, const char *body);
25528 +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
25529 + struct treepath *path,
25530 + const struct cpu_key *key,
25531 + struct inode *inode,
25532 + const char *body, int paste_size);
25534 +int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
25535 + struct treepath *path,
25536 + struct cpu_key *key,
25537 + struct inode *inode,
25538 + struct page *page, loff_t new_file_size);
25540 +int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
25541 + struct treepath *path,
25542 + const struct cpu_key *key,
25543 + struct inode *inode, struct buffer_head *un_bh);
25545 +void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
25546 + struct inode *inode, struct reiserfs_key *key);
25547 +int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
25548 + struct inode *inode);
25549 +int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
25550 + struct inode *inode, struct page *,
25551 + int update_timestamps);
25553 +#define i_block_size(inode) ((inode)->i_sb->s_blocksize)
25554 +#define file_size(inode) ((inode)->i_size)
25555 +#define tail_size(inode) (file_size (inode) & (i_block_size (inode) - 1))
25557 +#define tail_has_to_be_packed(inode) (have_large_tails ((inode)->i_sb)?\
25558 +!STORE_TAIL_IN_UNFM_S1(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):have_small_tails ((inode)->i_sb)?!STORE_TAIL_IN_UNFM_S2(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):0 )
25560 +void padd_item(char *item, int total_length, int length);
25562 +/* inode.c */
25563 +/* args for the create parameter of reiserfs_get_block */
25564 +#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */
25565 +#define GET_BLOCK_CREATE 1 /* add anything you need to find block */
25566 +#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */
25567 +#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */
25568 +#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */
25569 +#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */
25571 +void reiserfs_read_locked_inode(struct inode *inode,
25572 + struct reiserfs_iget_args *args);
25573 +int reiserfs_find_actor(struct inode *inode, void *p);
25574 +int reiserfs_init_locked_inode(struct inode *inode, void *p);
25575 +void reiserfs_evict_inode(struct inode *inode);
25576 +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc);
25577 +int reiserfs_get_block(struct inode *inode, sector_t block,
25578 + struct buffer_head *bh_result, int create);
25579 +struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
25580 + int fh_len, int fh_type);
25581 +struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
25582 + int fh_len, int fh_type);
25583 +int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
25584 + struct inode *parent);
25586 +int reiserfs_truncate_file(struct inode *, int update_timestamps);
25587 +void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset,
25588 + int type, int key_length);
25589 +void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
25590 + int version,
25591 + loff_t offset, int type, int length, int entry_count);
25592 +struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key);
25594 +struct reiserfs_security_handle;
25595 +int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
25596 + struct inode *dir, umode_t mode,
25597 + const char *symname, loff_t i_size,
25598 + struct dentry *dentry, struct inode *inode,
25599 + struct reiserfs_security_handle *security);
25601 +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
25602 + struct inode *inode, loff_t size);
25604 +static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
25605 + struct inode *inode)
25606 +{
25607 + reiserfs_update_sd_size(th, inode, inode->i_size);
25608 +}
25610 +void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
25611 +int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
25612 + struct iattr *attr);
25614 +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
25616 +/* namei.c */
25617 +void reiserfs_init_priv_inode(struct inode *inode);
25618 +void set_de_name_and_namelen(struct reiserfs_dir_entry *de);
25619 +int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
25620 + struct treepath *path, struct reiserfs_dir_entry *de);
25621 +struct dentry *reiserfs_get_parent(struct dentry *);
25623 +#ifdef CONFIG_REISERFS_PROC_INFO
25624 +int reiserfs_proc_info_init(struct super_block *sb);
25625 +int reiserfs_proc_info_done(struct super_block *sb);
25626 +int reiserfs_proc_info_global_init(void);
25627 +int reiserfs_proc_info_global_done(void);
25629 +#define PROC_EXP( e ) e
25631 +#define __PINFO( sb ) REISERFS_SB(sb) -> s_proc_info_data
25632 +#define PROC_INFO_MAX( sb, field, value ) \
25633 + __PINFO( sb ).field = \
25634 + max( REISERFS_SB( sb ) -> s_proc_info_data.field, value )
25635 +#define PROC_INFO_INC( sb, field ) ( ++ ( __PINFO( sb ).field ) )
25636 +#define PROC_INFO_ADD( sb, field, val ) ( __PINFO( sb ).field += ( val ) )
25637 +#define PROC_INFO_BH_STAT( sb, bh, level ) \
25638 + PROC_INFO_INC( sb, sbk_read_at[ ( level ) ] ); \
25639 + PROC_INFO_ADD( sb, free_at[ ( level ) ], B_FREE_SPACE( bh ) ); \
25640 + PROC_INFO_ADD( sb, items_at[ ( level ) ], B_NR_ITEMS( bh ) )
25641 +#else
25642 +static inline int reiserfs_proc_info_init(struct super_block *sb)
25643 +{
25644 + return 0;
25645 +}
25647 +static inline int reiserfs_proc_info_done(struct super_block *sb)
25648 +{
25649 + return 0;
25650 +}
25652 +static inline int reiserfs_proc_info_global_init(void)
25653 +{
25654 + return 0;
25655 +}
25657 +static inline int reiserfs_proc_info_global_done(void)
25658 +{
25659 + return 0;
25660 +}
25662 +#define PROC_EXP( e )
25663 +#define VOID_V ( ( void ) 0 )
25664 +#define PROC_INFO_MAX( sb, field, value ) VOID_V
25665 +#define PROC_INFO_INC( sb, field ) VOID_V
25666 +#define PROC_INFO_ADD( sb, field, val ) VOID_V
25667 +#define PROC_INFO_BH_STAT(sb, bh, n_node_level) VOID_V
25668 +#endif
25670 +/* dir.c */
25671 +extern const struct inode_operations reiserfs_dir_inode_operations;
25672 +extern const struct inode_operations reiserfs_symlink_inode_operations;
25673 +extern const struct inode_operations reiserfs_special_inode_operations;
25674 +extern const struct file_operations reiserfs_dir_operations;
25675 +int reiserfs_readdir_inode(struct inode *, struct dir_context *);
25677 +/* tail_conversion.c */
25678 +int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
25679 + struct treepath *, struct buffer_head *, loff_t);
25680 +int indirect2direct(struct reiserfs_transaction_handle *, struct inode *,
25681 + struct page *, struct treepath *, const struct cpu_key *,
25682 + loff_t, char *);
25683 +void reiserfs_unmap_buffer(struct buffer_head *);
25685 +/* file.c */
25686 +extern const struct inode_operations reiserfs_file_inode_operations;
25687 +extern const struct inode_operations reiserfs_priv_file_inode_operations;
25688 +extern const struct file_operations reiserfs_file_operations;
25689 +extern const struct address_space_operations reiserfs_address_space_operations;
25691 +/* fix_nodes.c */
25693 +int fix_nodes(int n_op_mode, struct tree_balance *tb,
25694 + struct item_head *ins_ih, const void *);
25695 +void unfix_nodes(struct tree_balance *);
25697 +/* prints.c */
25698 +void __reiserfs_panic(struct super_block *s, const char *id,
25699 + const char *function, const char *fmt, ...)
25700 + __attribute__ ((noreturn));
25701 +#define reiserfs_panic(s, id, fmt, args...) \
25702 + __reiserfs_panic(s, id, __func__, fmt, ##args)
25703 +void __reiserfs_error(struct super_block *s, const char *id,
25704 + const char *function, const char *fmt, ...);
25705 +#define reiserfs_error(s, id, fmt, args...) \
25706 + __reiserfs_error(s, id, __func__, fmt, ##args)
25707 +void reiserfs_info(struct super_block *s, const char *fmt, ...);
25708 +void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...);
25709 +void print_indirect_item(struct buffer_head *bh, int item_num);
25710 +void store_print_tb(struct tree_balance *tb);
25711 +void print_cur_tb(char *mes);
25712 +void print_de(struct reiserfs_dir_entry *de);
25713 +void print_bi(struct buffer_info *bi, char *mes);
25714 +#define PRINT_LEAF_ITEMS 1 /* print all items */
25715 +#define PRINT_DIRECTORY_ITEMS 2 /* print directory items */
25716 +#define PRINT_DIRECT_ITEMS 4 /* print contents of direct items */
25717 +void print_block(struct buffer_head *bh, ...);
25718 +void print_bmap(struct super_block *s, int silent);
25719 +void print_bmap_block(int i, char *data, int size, int silent);
25720 +/*void print_super_block (struct super_block * s, char * mes);*/
25721 +void print_objectid_map(struct super_block *s);
25722 +void print_block_head(struct buffer_head *bh, char *mes);
25723 +void check_leaf(struct buffer_head *bh);
25724 +void check_internal(struct buffer_head *bh);
25725 +void print_statistics(struct super_block *s);
25726 +char *reiserfs_hashname(int code);
25728 +/* lbalance.c */
25729 +int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
25730 + int mov_bytes, struct buffer_head *Snew);
25731 +int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes);
25732 +int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes);
25733 +void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first,
25734 + int del_num, int del_bytes);
25735 +void leaf_insert_into_buf(struct buffer_info *bi, int before,
25736 + struct item_head * const inserted_item_ih,
25737 + const char * const inserted_item_body,
25738 + int zeros_number);
25739 +void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
25740 + int pos_in_item, int paste_size,
25741 + const char * const body, int zeros_number);
25742 +void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
25743 + int pos_in_item, int cut_size);
25744 +void leaf_paste_entries(struct buffer_info *bi, int item_num, int before,
25745 + int new_entry_count, struct reiserfs_de_head *new_dehs,
25746 + const char *records, int paste_size);
25747 +/* ibalance.c */
25748 +int balance_internal(struct tree_balance *, int, int, struct item_head *,
25749 + struct buffer_head **);
25751 +/* do_balance.c */
25752 +void do_balance_mark_leaf_dirty(struct tree_balance *tb,
25753 + struct buffer_head *bh, int flag);
25754 +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
25755 +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
25757 +void do_balance(struct tree_balance *tb, struct item_head *ih,
25758 + const char *body, int flag);
25759 +void reiserfs_invalidate_buffer(struct tree_balance *tb,
25760 + struct buffer_head *bh);
25762 +int get_left_neighbor_position(struct tree_balance *tb, int h);
25763 +int get_right_neighbor_position(struct tree_balance *tb, int h);
25764 +void replace_key(struct tree_balance *tb, struct buffer_head *, int,
25765 + struct buffer_head *, int);
25766 +void make_empty_node(struct buffer_info *);
25767 +struct buffer_head *get_FEB(struct tree_balance *);
25769 +/* bitmap.c */
25771 +/*
25772 + * structure contains hints for block allocator, and it is a container for
25773 + * arguments, such as node, search path, transaction_handle, etc.
25774 + */
25775 +struct __reiserfs_blocknr_hint {
25776 + /* inode passed to allocator, if we allocate unf. nodes */
25777 + struct inode *inode;
25779 + sector_t block; /* file offset, in blocks */
25780 + struct in_core_key key;
25782 + /*
25783 + * search path, used by allocator to determine search_start by
25784 + * various ways
25785 + */
25786 + struct treepath *path;
25788 + /*
25789 + * transaction handle is needed to log super blocks
25790 + * and bitmap blocks changes
25791 + */
25792 + struct reiserfs_transaction_handle *th;
25794 + b_blocknr_t beg, end;
25796 + /*
25797 + * a field used to transfer search start value (block number)
25798 + * between different block allocator procedures
25799 + * (determine_search_start() and others)
25800 + */
25801 + b_blocknr_t search_start;
25803 + /*
25804 + * is set in determine_prealloc_size() function,
25805 + * used by the underlying function that does the actual allocation
25806 + */
25807 + int prealloc_size;
25809 + /*
25810 + * the allocator uses different policies for getting disk
25811 + * space for formatted/unformatted blocks with/without preallocation
25812 + */
25813 + unsigned formatted_node:1;
25814 + unsigned preallocate:1;
25817 +typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
25819 +int reiserfs_parse_alloc_options(struct super_block *, char *);
25820 +void reiserfs_init_alloc_options(struct super_block *s);
25822 +/*
25823 + * given a directory, this will tell you what packing locality
25824 + * to use for a new object underneath it. The locality is returned
25825 + * in disk byte order (le).
25826 + */
25827 +__le32 reiserfs_choose_packing(struct inode *dir);
25829 +void show_alloc_options(struct seq_file *seq, struct super_block *s);
25830 +int reiserfs_init_bitmap_cache(struct super_block *sb);
25831 +void reiserfs_free_bitmap_cache(struct super_block *sb);
25832 +void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info);
25833 +struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, unsigned int bitmap);
25834 +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
25835 +void reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *,
25836 + b_blocknr_t, int for_unformatted);
25837 +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t *, int,
25838 + int);
25839 +static inline int reiserfs_new_form_blocknrs(struct tree_balance *tb,
25840 + b_blocknr_t * new_blocknrs,
25841 + int amount_needed)
25842 +{
25843 + reiserfs_blocknr_hint_t hint = {
25844 + .th = tb->transaction_handle,
25845 + .path = tb->tb_path,
25846 + .inode = NULL,
25847 + .key = tb->key,
25848 + .block = 0,
25849 + .formatted_node = 1
25850 + };
25851 + return reiserfs_allocate_blocknrs(&hint, new_blocknrs, amount_needed,
25852 + 0);
25853 +}
25855 +static inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle
25856 + *th, struct inode *inode,
25857 + b_blocknr_t * new_blocknrs,
25858 + struct treepath *path,
25859 + sector_t block)
25860 +{
25861 + reiserfs_blocknr_hint_t hint = {
25862 + .th = th,
25863 + .path = path,
25864 + .inode = inode,
25865 + .block = block,
25866 + .formatted_node = 0,
25867 + .preallocate = 0
25868 + };
25869 + return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
25870 +}
25872 +#ifdef REISERFS_PREALLOCATE
25873 +static inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle
25874 + *th, struct inode *inode,
25875 + b_blocknr_t * new_blocknrs,
25876 + struct treepath *path,
25877 + sector_t block)
25878 +{
25879 + reiserfs_blocknr_hint_t hint = {
25880 + .th = th,
25881 + .path = path,
25882 + .inode = inode,
25883 + .block = block,
25884 + .formatted_node = 0,
25885 + .preallocate = 1
25886 + };
25887 + return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
25888 +}
25890 +void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
25891 + struct inode *inode);
25892 +void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th);
25893 +#endif
25895 +/* hashes.c */
25896 +__u32 keyed_hash(const signed char *msg, int len);
25897 +__u32 yura_hash(const signed char *msg, int len);
25898 +__u32 r5_hash(const signed char *msg, int len);
25900 +#define reiserfs_set_le_bit __set_bit_le
25901 +#define reiserfs_test_and_set_le_bit __test_and_set_bit_le
25902 +#define reiserfs_clear_le_bit __clear_bit_le
25903 +#define reiserfs_test_and_clear_le_bit __test_and_clear_bit_le
25904 +#define reiserfs_test_le_bit test_bit_le
25905 +#define reiserfs_find_next_zero_le_bit find_next_zero_bit_le
25907 +/*
25908 + * sometimes reiserfs_truncate may need to allocate a few new blocks
25909 + * to perform indirect2direct conversion. People probably used to
25910 + * think, that truncate should work without problems on a filesystem
25911 + * without free disk space. They may complain that they can not
25912 + * truncate due to lack of free disk space. This spare space allows us
25913 + * to not worry about it. 500 is probably too much, but it should be
25914 + * absolutely safe
25915 + */
25916 +#define SPARE_SPACE 500
25918 +/* prototypes from ioctl.c */
25919 +int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
25920 +int reiserfs_fileattr_set(struct mnt_idmap *idmap,
25921 + struct dentry *dentry, struct fileattr *fa);
25922 +long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
25923 +long reiserfs_compat_ioctl(struct file *filp,
25924 + unsigned int cmd, unsigned long arg);
25925 +int reiserfs_unpack(struct inode *inode);
25926 diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
25927 new file mode 100644
25928 index 000000000000..7b498a0d060b
25929 --- /dev/null
25930 +++ b/fs/reiserfs/resize.c
25931 @@ -0,0 +1,230 @@
25932 +/*
25933 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
25934 + */
25936 +/*
25937 + * Written by Alexander Zarochentcev.
25938 + *
25939 + * The kernel part of the (on-line) reiserfs resizer.
25940 + */
25942 +#include <linux/kernel.h>
25943 +#include <linux/mm.h>
25944 +#include <linux/vmalloc.h>
25945 +#include <linux/string.h>
25946 +#include <linux/errno.h>
25947 +#include "reiserfs.h"
25948 +#include <linux/buffer_head.h>
25950 +int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
25951 +{
25952 + int err = 0;
25953 + struct reiserfs_super_block *sb;
25954 + struct reiserfs_bitmap_info *bitmap;
25955 + struct reiserfs_bitmap_info *info;
25956 + struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s);
25957 + struct buffer_head *bh;
25958 + struct reiserfs_transaction_handle th;
25959 + unsigned int bmap_nr_new, bmap_nr;
25960 + unsigned int block_r_new, block_r;
25962 + struct reiserfs_list_bitmap *jb;
25963 + struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS];
25965 + unsigned long int block_count, free_blocks;
25966 + int i;
25967 + int copy_size;
25968 + int depth;
25970 + sb = SB_DISK_SUPER_BLOCK(s);
25972 + if (SB_BLOCK_COUNT(s) >= block_count_new) {
25973 + printk("can\'t shrink filesystem on-line\n");
25974 + return -EINVAL;
25975 + }
25977 + /* check the device size */
25978 + depth = reiserfs_write_unlock_nested(s);
25979 + bh = sb_bread(s, block_count_new - 1);
25980 + reiserfs_write_lock_nested(s, depth);
25981 + if (!bh) {
25982 + printk("reiserfs_resize: can\'t read last block\n");
25983 + return -EINVAL;
25984 + }
25985 + bforget(bh);
25987 + /*
25988 + * old disk layout detection; those partitions can be mounted, but
25989 + * cannot be resized
25990 + */
25991 + if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size
25992 + != REISERFS_DISK_OFFSET_IN_BYTES) {
25993 + printk
25994 + ("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n");
25995 + return -ENOTSUPP;
25996 + }
25998 + /* count used bits in last bitmap block */
25999 + block_r = SB_BLOCK_COUNT(s) -
26000 + (reiserfs_bmap_count(s) - 1) * s->s_blocksize * 8;
26002 + /* count bitmap blocks in new fs */
26003 + bmap_nr_new = block_count_new / (s->s_blocksize * 8);
26004 + block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8;
26005 + if (block_r_new)
26006 + bmap_nr_new++;
26007 + else
26008 + block_r_new = s->s_blocksize * 8;
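+ /*
+ * A worked example (illustrative, 4 KiB blocks): one bitmap block
+ * maps 4096 * 8 = 32768 blocks, so block_count_new = 100000 gives
+ * bmap_nr_new = 100000 / 32768 = 3 with block_r_new = 1696 bits
+ * used in the last bitmap block, hence bmap_nr_new is bumped to 4.
+ */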
26010 + /* save old values */
26011 + block_count = SB_BLOCK_COUNT(s);
26012 + bmap_nr = reiserfs_bmap_count(s);
26014 + /* resizing of reiserfs bitmaps (journal and real), if needed */
26015 + if (bmap_nr_new > bmap_nr) {
26016 + /* reallocate journal bitmaps */
26017 + if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) {
26018 + printk
26019 + ("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
26020 + return -ENOMEM;
26021 + }
26022 + /*
26023 + * the new journal bitmaps are zero filled, now we copy in
26024 + * the bitmap node pointers from the old journal bitmap
26025 + * structs, and then transfer the new data structures
26026 + * into the journal struct.
26028 + * using the copy_size var below allows this code to work for
26029 + * both shrinking and expanding the FS.
26030 + */
26031 + copy_size = min(bmap_nr_new, bmap_nr);
26032 + copy_size =
26033 + copy_size * sizeof(struct reiserfs_list_bitmap_node *);
26034 + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
26035 + struct reiserfs_bitmap_node **node_tmp;
26036 + jb = SB_JOURNAL(s)->j_list_bitmap + i;
26037 + memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size);
26039 + /*
26040 + * just in case vfree schedules on us, copy the new
26041 + * pointer into the journal struct before freeing the
26042 + * old one
26043 + */
26044 + node_tmp = jb->bitmaps;
26045 + jb->bitmaps = jbitmap[i].bitmaps;
26046 + vfree(node_tmp);
26047 + }
26049 + /*
26050 + * allocate additional bitmap blocks, reallocate
26051 + * array of bitmap block pointers
26052 + */
26053 + bitmap =
26054 + vzalloc(array_size(bmap_nr_new,
26055 + sizeof(struct reiserfs_bitmap_info)));
26056 + if (!bitmap) {
26057 + /*
26058 + * Journal bitmaps are still supersized, but the
26059 + * memory isn't leaked, so I guess it's ok
26060 + */
26061 + printk("reiserfs_resize: unable to allocate memory.\n");
26062 + return -ENOMEM;
26063 + }
26064 + for (i = 0; i < bmap_nr; i++)
26065 + bitmap[i] = old_bitmap[i];
26067 + /*
26068 + * This doesn't go through the journal, but it doesn't have to.
26069 + * The changes are still atomic: We're synced up when the
26070 + * journal transaction begins, and the new bitmaps don't
26071 + * matter if the transaction fails.
26072 + */
26073 + for (i = bmap_nr; i < bmap_nr_new; i++) {
26074 + int depth;
26075 + /*
26076 + * don't use read_bitmap_block since it will cache
26077 + * the uninitialized bitmap
26078 + */
26079 + depth = reiserfs_write_unlock_nested(s);
26080 + bh = sb_bread(s, i * s->s_blocksize * 8);
26081 + reiserfs_write_lock_nested(s, depth);
26082 + if (!bh) {
26083 + vfree(bitmap);
26084 + return -EIO;
26085 + }
26086 + memset(bh->b_data, 0, sb_blocksize(sb));
26087 + reiserfs_set_le_bit(0, bh->b_data);
26088 + reiserfs_cache_bitmap_metadata(s, bh, bitmap + i);
26090 + set_buffer_uptodate(bh);
26091 + mark_buffer_dirty(bh);
26092 + depth = reiserfs_write_unlock_nested(s);
26093 + sync_dirty_buffer(bh);
26094 + reiserfs_write_lock_nested(s, depth);
26095 + /* update bitmap_info stuff */
26096 + bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
26097 + brelse(bh);
26098 + }
26099 + /* free old bitmap blocks array */
26100 + SB_AP_BITMAP(s) = bitmap;
26101 + vfree(old_bitmap);
26102 + }
26104 + /*
26105 + * begin transaction, if there was an error, it's fine. Yes, we have
26106 + * incorrect bitmaps now, but none of it is ever going to touch the
26107 + * disk anyway.
26108 + */
26109 + err = journal_begin(&th, s, 10);
26110 + if (err)
26111 + return err;
26113 + /* Extend old last bitmap block - new blocks have been made available */
26114 + info = SB_AP_BITMAP(s) + bmap_nr - 1;
26115 + bh = reiserfs_read_bitmap_block(s, bmap_nr - 1);
26116 + if (!bh) {
26117 + int jerr = journal_end(&th);
26118 + if (jerr)
26119 + return jerr;
26120 + return -EIO;
26121 + }
26123 + reiserfs_prepare_for_journal(s, bh, 1);
26124 + for (i = block_r; i < s->s_blocksize * 8; i++)
26125 + reiserfs_clear_le_bit(i, bh->b_data);
26126 + info->free_count += s->s_blocksize * 8 - block_r;
26128 + journal_mark_dirty(&th, bh);
26129 + brelse(bh);
26131 + /* Correct new last bitmap block - It may not be full */
26132 + info = SB_AP_BITMAP(s) + bmap_nr_new - 1;
26133 + bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1);
26134 + if (!bh) {
26135 + int jerr = journal_end(&th);
26136 + if (jerr)
26137 + return jerr;
26138 + return -EIO;
26139 + }
26141 + reiserfs_prepare_for_journal(s, bh, 1);
26142 + for (i = block_r_new; i < s->s_blocksize * 8; i++)
26143 + reiserfs_set_le_bit(i, bh->b_data);
26144 + journal_mark_dirty(&th, bh);
26145 + brelse(bh);
26147 + info->free_count -= s->s_blocksize * 8 - block_r_new;
26148 + /* update super */
26149 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
26150 + free_blocks = SB_FREE_BLOCKS(s);
26151 + PUT_SB_FREE_BLOCKS(s,
26152 + free_blocks + (block_count_new - block_count -
26153 + (bmap_nr_new - bmap_nr)));
26154 + PUT_SB_BLOCK_COUNT(s, block_count_new);
26155 + PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new);
26157 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
26159 + SB_JOURNAL(s)->j_must_wait = 1;
26160 + return journal_end(&th);
26161 +}
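+/*
+ * A hedged usage note: in mainline reiserfs this function is reached
+ * from reiserfs_remount() when the filesystem is remounted with a
+ * larger size, e.g. "mount -o remount,resize=1048576 /dev/sdXN /mnt".
+ */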
26162 diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
26163 new file mode 100644
26164 index 000000000000..5faf702f8d15
26165 --- /dev/null
26166 +++ b/fs/reiserfs/stree.c
26167 @@ -0,0 +1,2280 @@
26168 +/*
26169 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
26170 + */
26172 +/*
26173 + * Written by Anatoly P. Pinchuk pap@namesys.botik.ru
26174 + * Program Systems Institute
26175 + * Pereslavl-Zalessky Russia
26176 + */
26178 +#include <linux/time.h>
26179 +#include <linux/string.h>
26180 +#include <linux/pagemap.h>
26181 +#include <linux/bio.h>
26182 +#include "reiserfs.h"
26183 +#include <linux/buffer_head.h>
26184 +#include <linux/quotaops.h>
26186 +/* Does the buffer contain a disk block which is in the tree. */
26187 +inline int B_IS_IN_TREE(const struct buffer_head *bh)
26188 +{
26190 + RFALSE(B_LEVEL(bh) > MAX_HEIGHT,
26191 + "PAP-1010: block (%b) has too big level (%z)", bh, bh);
26193 + return (B_LEVEL(bh) != FREE_LEVEL);
26194 +}
26196 +/* to get item head in le form */
26197 +inline void copy_item_head(struct item_head *to,
26198 + const struct item_head *from)
26199 +{
26200 + memcpy(to, from, IH_SIZE);
26201 +}
26203 +/*
26204 + * k1 is pointer to on-disk structure which is stored in little-endian
26205 + * form. k2 is pointer to cpu variable. For key of items of the same
26206 + * object this returns 0.
26207 + * Returns: -1 if key1 < key2
26208 + * 0 if key1 == key2
26209 + * 1 if key1 > key2
26210 + */
26211 +inline int comp_short_keys(const struct reiserfs_key *le_key,
26212 + const struct cpu_key *cpu_key)
26213 +{
26214 + __u32 n;
26215 + n = le32_to_cpu(le_key->k_dir_id);
26216 + if (n < cpu_key->on_disk_key.k_dir_id)
26217 + return -1;
26218 + if (n > cpu_key->on_disk_key.k_dir_id)
26219 + return 1;
26220 + n = le32_to_cpu(le_key->k_objectid);
26221 + if (n < cpu_key->on_disk_key.k_objectid)
26222 + return -1;
26223 + if (n > cpu_key->on_disk_key.k_objectid)
26224 + return 1;
26225 + return 0;
26226 +}
26228 +/*
26229 + * k1 is pointer to on-disk structure which is stored in little-endian
26230 + * form. k2 is pointer to cpu variable.
26231 + * Compare keys using all 4 key fields.
26232 + * Returns: -1 if key1 < key2,
26233 + * 0 if key1 == key2, 1 if key1 > key2
26234 + */
26235 +static inline int comp_keys(const struct reiserfs_key *le_key,
26236 + const struct cpu_key *cpu_key)
26237 +{
26238 + int retval;
26240 + retval = comp_short_keys(le_key, cpu_key);
26241 + if (retval)
26242 + return retval;
26243 + if (le_key_k_offset(le_key_version(le_key), le_key) <
26244 + cpu_key_k_offset(cpu_key))
26245 + return -1;
26246 + if (le_key_k_offset(le_key_version(le_key), le_key) >
26247 + cpu_key_k_offset(cpu_key))
26248 + return 1;
26250 + if (cpu_key->key_length == 3)
26251 + return 0;
26253 + /* this part is needed only when tail conversion is in progress */
26254 + if (le_key_k_type(le_key_version(le_key), le_key) <
26255 + cpu_key_k_type(cpu_key))
26256 + return -1;
26258 + if (le_key_k_type(le_key_version(le_key), le_key) >
26259 + cpu_key_k_type(cpu_key))
26260 + return 1;
26262 + return 0;
26263 +}
26265 +inline int comp_short_le_keys(const struct reiserfs_key *key1,
26266 + const struct reiserfs_key *key2)
26267 +{
26268 + __u32 *k1_u32, *k2_u32;
26269 + int key_length = REISERFS_SHORT_KEY_LEN;
26271 + k1_u32 = (__u32 *) key1;
26272 + k2_u32 = (__u32 *) key2;
26273 + for (; key_length--; ++k1_u32, ++k2_u32) {
26274 + if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32))
26275 + return -1;
26276 + if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32))
26277 + return 1;
26278 + }
26279 + return 0;
26280 +}
26282 +inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from)
26283 +{
26284 + int version;
26285 + to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id);
26286 + to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid);
26288 + /* find out version of the key */
26289 + version = le_key_version(from);
26290 + to->version = version;
26291 + to->on_disk_key.k_offset = le_key_k_offset(version, from);
26292 + to->on_disk_key.k_type = le_key_k_type(version, from);
26293 +}
26295 +/*
26296 + * this does not say which one is bigger, it only returns 1 if keys
26297 + * are not equal, 0 otherwise
26298 + */
26299 +inline int comp_le_keys(const struct reiserfs_key *k1,
26300 + const struct reiserfs_key *k2)
26301 +{
26302 + return memcmp(k1, k2, sizeof(struct reiserfs_key));
26303 +}
26305 +/**************************************************************************
26306 + * Binary search toolkit function *
26307 + * Search for an item in the array by the item key *
26308 + * Returns: 1 if found, 0 if not found; *
26309 + * *pos = number of the searched element if found, else the *
26310 + * number of the first element that is larger than key. *
26311 + **************************************************************************/
26312 +/*
26313 + * For those not familiar with binary search: lbound is the leftmost item
26314 + * that it could be, rbound the rightmost item that it could be. We examine
26315 + * the item halfway between lbound and rbound, and that tells us either
26316 + * that we can increase lbound, or decrease rbound, or that we have found it,
26317 + * or if lbound > rbound that there are no possible items, and we have not
26318 + * found it. With each examination we cut the number of possible items it
26319 + * could be by one more than half rounded down, or we find it.
26320 + */
26321 +static inline int bin_search(const void *key, /* Key to search for. */
26322 + const void *base, /* First item in the array. */
26323 + int num, /* Number of items in the array. */
26324 + /*
26325 + * Item size in the array searched. Lest the
26326 + * reader be confused, note that this is crafted
26327 + * as a general function, and when it is applied
26328 + * specifically to the array of item headers in a
26329 + * node, width is actually the item header size
26330 + * not the item size.
26331 + */
26332 + int width,
26333 + int *pos /* Number of the searched for element. */
26336 + int rbound, lbound, j;
26338 + for (j = ((rbound = num - 1) + (lbound = 0)) / 2;
26339 + lbound <= rbound; j = (rbound + lbound) / 2)
26340 + switch (comp_keys
26341 + ((struct reiserfs_key *)((char *)base + j * width),
26342 + (struct cpu_key *)key)) {
26343 + case -1:
26344 + lbound = j + 1;
26345 + continue;
26346 + case 1:
26347 + rbound = j - 1;
26348 + continue;
26349 + case 0:
26350 + *pos = j;
26351 + return ITEM_FOUND; /* Key found in the array. */
26354 + /*
26355 + * bin_search did not find the given key; it returns the position of
26356 + * the minimal key that is greater than the given one.
26357 + */
26358 + *pos = lbound;
26359 + return ITEM_NOT_FOUND;
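
Editor's note: the halving loop above, extracted as a standalone routine over a sorted int array. Like bin_search it reports either the match position or the index of the first element greater than the key (the insertion point). Hypothetical toy_* names, not part of the patch:

#include <stdio.h>

/* Returns 1 and *pos = match index if found; otherwise returns 0 and
 * *pos = index of the first element larger than key. */
static int toy_bin_search(int key, const int *base, int num, int *pos)
{
    int lbound = 0, rbound = num - 1;

    while (lbound <= rbound) {
        int j = (lbound + rbound) / 2;
        if (base[j] < key)
            lbound = j + 1;
        else if (base[j] > key)
            rbound = j - 1;
        else {
            *pos = j;
            return 1;
        }
    }
    *pos = lbound;   /* not found: insertion point */
    return 0;
}

int main(void)
{
    int a[] = { 2, 4, 8, 16 }, pos;
    printf("%d %d\n", toy_bin_search(8, a, 4, &pos), pos);  /* 1 2 */
    printf("%d %d\n", toy_bin_search(5, a, 4, &pos), pos);  /* 0 2 */
    return 0;
}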
26363 +/* Minimal possible key. It is never in the tree. */
26364 +const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
26366 +/* Maximal possible key. It is never in the tree. */
26367 +static const struct reiserfs_key MAX_KEY = {
26368 + cpu_to_le32(0xffffffff),
26369 + cpu_to_le32(0xffffffff),
26370 + {{cpu_to_le32(0xffffffff),
26371 + cpu_to_le32(0xffffffff)},}
26375 + * Get delimiting key of the buffer by looking for it in the buffers in the
26376 + * path, starting from the bottom of the path, and going upwards. We must
26377 + * check the path's validity at each step. If the key is not in the path,
26378 + * there is no delimiting key in the tree (buffer is first or last buffer
26379 + * in tree), and in this case we return a special key, either MIN_KEY or
26380 + * MAX_KEY.
26381 + */
26382 +static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path,
26383 + const struct super_block *sb)
26385 + int position, path_offset = chk_path->path_length;
26386 + struct buffer_head *parent;
26388 + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
26389 + "PAP-5010: invalid offset in the path");
26391 + /* While not higher in path than first element. */
26392 + while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
26394 + RFALSE(!buffer_uptodate
26395 + (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
26396 + "PAP-5020: parent is not uptodate");
26398 + /* Parent at the path is not in the tree now. */
26399 + if (!B_IS_IN_TREE
26400 + (parent =
26401 + PATH_OFFSET_PBUFFER(chk_path, path_offset)))
26402 + return &MAX_KEY;
26403 + /* Check whether position in the parent is correct. */
26404 + if ((position =
26405 + PATH_OFFSET_POSITION(chk_path,
26406 + path_offset)) >
26407 + B_NR_ITEMS(parent))
26408 + return &MAX_KEY;
26409 + /* Check whether parent at the path really points to the child. */
26410 + if (B_N_CHILD_NUM(parent, position) !=
26411 + PATH_OFFSET_PBUFFER(chk_path,
26412 + path_offset + 1)->b_blocknr)
26413 + return &MAX_KEY;
26414 + /*
26415 + * Return delimiting key if position in the parent
26416 + * is not equal to zero.
26417 + */
26418 + if (position)
26419 + return internal_key(parent, position - 1);
26421 + /* Return MIN_KEY if we are in the root of the buffer tree. */
26422 + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
26423 + b_blocknr == SB_ROOT_BLOCK(sb))
26424 + return &MIN_KEY;
26425 + return &MAX_KEY;
26428 +/* Get delimiting key of the buffer at the path and its right neighbor. */
26429 +inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
26430 + const struct super_block *sb)
26432 + int position, path_offset = chk_path->path_length;
26433 + struct buffer_head *parent;
26435 + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
26436 + "PAP-5030: invalid offset in the path");
26438 + while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
26440 + RFALSE(!buffer_uptodate
26441 + (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
26442 + "PAP-5040: parent is not uptodate");
26444 + /* Parent at the path is not in the tree now. */
26445 + if (!B_IS_IN_TREE
26446 + (parent =
26447 + PATH_OFFSET_PBUFFER(chk_path, path_offset)))
26448 + return &MIN_KEY;
26449 + /* Check whether position in the parent is correct. */
26450 + if ((position =
26451 + PATH_OFFSET_POSITION(chk_path,
26452 + path_offset)) >
26453 + B_NR_ITEMS(parent))
26454 + return &MIN_KEY;
26455 + /*
26456 + * Check whether parent at the path really points
26457 + * to the child.
26458 + */
26459 + if (B_N_CHILD_NUM(parent, position) !=
26460 + PATH_OFFSET_PBUFFER(chk_path,
26461 + path_offset + 1)->b_blocknr)
26462 + return &MIN_KEY;
26464 + /*
26465 + * Return delimiting key if position in the parent
26466 + * is not the last one.
26467 + */
26468 + if (position != B_NR_ITEMS(parent))
26469 + return internal_key(parent, position);
26472 + /* Return MAX_KEY if we are in the root of the buffer tree. */
26473 + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
26474 + b_blocknr == SB_ROOT_BLOCK(sb))
26475 + return &MAX_KEY;
26476 + return &MIN_KEY;
26480 + * Check whether a key is contained in the tree rooted from a buffer at a path.
26481 + * This works by looking at the left and right delimiting keys for the buffer
26482 + * in the last path_element in the path. These delimiting keys are stored
26483 + * at least one level above that buffer in the tree. If the buffer is the
26484 + * first or last node in the tree order then one of the delimiting keys may
26485 + * be absent, and in this case get_lkey and get_rkey return a special key
26486 + * which is MIN_KEY or MAX_KEY.
26487 + */
26488 +static inline int key_in_buffer(
26489 + /* Path which should be checked. */
26490 + struct treepath *chk_path,
26491 + /* Key which should be checked. */
26492 + const struct cpu_key *key,
26493 + struct super_block *sb
26497 + RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET
26498 + || chk_path->path_length > MAX_HEIGHT,
26499 + "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
26500 + key, chk_path->path_length);
26501 + RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev,
26502 + "PAP-5060: device must not be NODEV");
26504 + if (comp_keys(get_lkey(chk_path, sb), key) == 1)
26505 + /* left delimiting key is bigger than the key we look for */
26506 + return 0;
26507 + /* if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */
26508 + if (comp_keys(get_rkey(chk_path, sb), key) != 1)
26509 + /* key must be less than the right delimiting key */
26510 + return 0;
26511 + return 1;
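
Editor's note: key_in_buffer therefore reduces to a half-open interval test: the key belongs to the buffer iff lkey <= key < rkey. A sketch of that invariant with plain integers standing in for keys (hypothetical toy_* names, not part of the patch):

#include <stdio.h>

/* Sentinels playing the role of MIN_KEY / MAX_KEY. */
#define TOY_MIN_KEY 0
#define TOY_MAX_KEY 0x7fffffff

/* 1 iff lkey <= key < rkey, i.e. key lies inside this buffer's range. */
static int toy_key_in_buffer(int lkey, int key, int rkey)
{
    if (lkey > key)      /* left delimiting key is bigger */
        return 0;
    if (rkey <= key)     /* key must be less than right delimiting key */
        return 0;
    return 1;
}

int main(void)
{
    printf("%d\n", toy_key_in_buffer(10, 15, 20));          /* 1 */
    printf("%d\n", toy_key_in_buffer(TOY_MIN_KEY, 5, 5));   /* 0 */
    return 0;
}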
26514 +int reiserfs_check_path(struct treepath *p)
26516 + RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
26517 + "path not properly relsed");
26518 + return 0;
26522 + * Drop the reference to each buffer in a path and restore the
26523 + * dirty bits that were cleared when preparing the buffer for the log.
26524 + * This version should only be called from fix_nodes()
26525 + */
26526 +void pathrelse_and_restore(struct super_block *sb,
26527 + struct treepath *search_path)
26529 + int path_offset = search_path->path_length;
26531 + RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
26532 + "clm-4000: invalid path offset");
26534 + while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
26535 + struct buffer_head *bh;
26536 + bh = PATH_OFFSET_PBUFFER(search_path, path_offset--);
26537 + reiserfs_restore_prepared_buffer(sb, bh);
26538 + brelse(bh);
26540 + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
26543 +/* Drop the reference to each buffer in a path */
26544 +void pathrelse(struct treepath *search_path)
26546 + int path_offset = search_path->path_length;
26548 + RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
26549 + "PAP-5090: invalid path offset");
26551 + while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET)
26552 + brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--));
26554 + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
26557 +static int has_valid_deh_location(struct buffer_head *bh, struct item_head *ih)
26559 + struct reiserfs_de_head *deh;
26560 + int i;
26562 + deh = B_I_DEH(bh, ih);
26563 + for (i = 0; i < ih_entry_count(ih); i++) {
26564 + if (deh_location(&deh[i]) > ih_item_len(ih)) {
26565 + reiserfs_warning(NULL, "reiserfs-5094",
26566 + "directory entry location seems wrong %h",
26567 + &deh[i]);
26568 + return 0;
26572 + return 1;
26575 +static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
26577 + struct block_head *blkh;
26578 + struct item_head *ih;
26579 + int used_space;
26580 + int prev_location;
26581 + int i;
26582 + int nr;
26584 + blkh = (struct block_head *)buf;
26585 + if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
26586 + reiserfs_warning(NULL, "reiserfs-5080",
26587 + "this should be caught earlier");
26588 + return 0;
26591 + nr = blkh_nr_item(blkh);
26592 + if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
26593 + /* item number is too big or too small */
26594 + reiserfs_warning(NULL, "reiserfs-5081",
26595 + "nr_item seems wrong: %z", bh);
26596 + return 0;
26598 + ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
26599 + used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih));
26601 + /* free space does not match the calculated amount of used space */
26602 + if (used_space != blocksize - blkh_free_space(blkh)) {
26603 + reiserfs_warning(NULL, "reiserfs-5082",
26604 + "free space seems wrong: %z", bh);
26605 + return 0;
26607 + /*
26608 + * FIXME: this check in is_leaf will hurt performance too much - we
26609 + * may want to return 1 here
26610 + */
26612 + /* check tables of item heads */
26613 + ih = (struct item_head *)(buf + BLKH_SIZE);
26614 + prev_location = blocksize;
26615 + for (i = 0; i < nr; i++, ih++) {
26616 + if (le_ih_k_type(ih) == TYPE_ANY) {
26617 + reiserfs_warning(NULL, "reiserfs-5083",
26618 + "wrong item type for item %h",
26619 + ih);
26620 + return 0;
26622 + if (ih_location(ih) >= blocksize
26623 + || ih_location(ih) < IH_SIZE * nr) {
26624 + reiserfs_warning(NULL, "reiserfs-5084",
26625 + "item location seems wrong: %h",
26626 + ih);
26627 + return 0;
26629 + if (ih_item_len(ih) < 1
26630 + || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) {
26631 + reiserfs_warning(NULL, "reiserfs-5085",
26632 + "item length seems wrong: %h",
26633 + ih);
26634 + return 0;
26636 + if (prev_location - ih_location(ih) != ih_item_len(ih)) {
26637 + reiserfs_warning(NULL, "reiserfs-5086",
26638 + "item location seems wrong "
26639 + "(second one): %h", ih);
26640 + return 0;
26642 + if (is_direntry_le_ih(ih)) {
26643 + if (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE)) {
26644 + reiserfs_warning(NULL, "reiserfs-5093",
26645 + "item entry count seems wrong %h",
26646 + ih);
26647 + return 0;
26649 + return has_valid_deh_location(bh, ih);
26651 + prev_location = ih_location(ih);
26654 + /* one may imagine many more checks */
26655 + return 1;
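
Editor's note: the per-item checks above encode the leaf layout invariant: item bodies are packed downward from the end of the block, so each item's location plus its length must equal the previous item's location. A standalone sketch of that walk over a hypothetical (location, length) table (toy_* names, not part of the patch):

#include <stdio.h>

struct toy_ih { int location; int len; };

/* Returns 1 if item bodies tile the top of the block contiguously. */
static int toy_check_leaf(const struct toy_ih *ih, int nr, int blocksize)
{
    int prev_location = blocksize;
    int i;

    for (i = 0; i < nr; i++) {
        if (ih[i].location >= blocksize || ih[i].len < 1)
            return 0;
        if (prev_location - ih[i].location != ih[i].len)
            return 0;   /* hole or overlap between item bodies */
        prev_location = ih[i].location;
    }
    return 1;
}

int main(void)
{
    struct toy_ih ok[]  = { { 4000, 96 }, { 3900, 100 } };
    struct toy_ih bad[] = { { 4000, 96 }, { 3800, 100 } };  /* 100-byte hole */
    printf("%d %d\n", toy_check_leaf(ok, 2, 4096),
                      toy_check_leaf(bad, 2, 4096));        /* 1 0 */
    return 0;
}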
26658 +/* returns 1 if buf looks like an internal node, 0 otherwise */
26659 +static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
26661 + struct block_head *blkh;
26662 + int nr;
26663 + int used_space;
26665 + blkh = (struct block_head *)buf;
26666 + nr = blkh_level(blkh);
26667 + if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
26668 + /* this level is not possible for internal nodes */
26669 + reiserfs_warning(NULL, "reiserfs-5087",
26670 + "this should be caught earlier");
26671 + return 0;
26674 + nr = blkh_nr_item(blkh);
26675 + /* for internal which is not root we might check min number of keys */
26676 + if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
26677 + reiserfs_warning(NULL, "reiserfs-5088",
26678 + "number of key seems wrong: %z", bh);
26679 + return 0;
26682 + used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
26683 + if (used_space != blocksize - blkh_free_space(blkh)) {
26684 + reiserfs_warning(NULL, "reiserfs-5089",
26685 + "free space seems wrong: %z", bh);
26686 + return 0;
26689 + /* one may imagine many more checks */
26690 + return 1;
26694 + * make sure that bh contains formatted node of reiserfs tree of
26695 + * 'level'-th level
26696 + */
26697 +static int is_tree_node(struct buffer_head *bh, int level)
26699 + if (B_LEVEL(bh) != level) {
26700 + reiserfs_warning(NULL, "reiserfs-5090", "node level %d does "
26701 + "not match to the expected one %d",
26702 + B_LEVEL(bh), level);
26703 + return 0;
26705 + if (level == DISK_LEAF_NODE_LEVEL)
26706 + return is_leaf(bh->b_data, bh->b_size, bh);
26708 + return is_internal(bh->b_data, bh->b_size, bh);
26711 +#define SEARCH_BY_KEY_READA 16
26714 + * The function is NOT SCHEDULE-SAFE!
26715 + * It might unlock the write lock if we needed to wait for a block
26716 + * to be read. Note that in this case it won't recover the lock to avoid
26717 + * high contention resulting from too many lock requests, especially
26718 + * because the caller (search_by_key) will perform other schedule-unsafe
26719 + * operations just after calling this function.
26721 + * @return depth of lock to be restored after read completes
26722 + */
26723 +static int search_by_key_reada(struct super_block *s,
26724 + struct buffer_head **bh,
26725 + b_blocknr_t *b, int num)
26727 + int i, j;
26728 + int depth = -1;
26730 + for (i = 0; i < num; i++) {
26731 + bh[i] = sb_getblk(s, b[i]);
26733 + /*
26734 + * We are going to read some blocks on which we
26735 + * have a reference. It's safe, though we might be
26736 + * reading blocks concurrently changed if we release
26737 + * the lock. But it's still fine because we check later
26738 + * if the tree changed
26739 + */
26740 + for (j = 0; j < i; j++) {
26741 + /*
26742 + * note, this needs attention if we are getting rid of the BKL
26743 + * you have to make sure the prepared bit isn't set on this
26744 + * buffer
26745 + */
26746 + if (!buffer_uptodate(bh[j])) {
26747 + if (depth == -1)
26748 + depth = reiserfs_write_unlock_nested(s);
26749 + bh_readahead(bh[j], REQ_RAHEAD);
26751 + brelse(bh[j]);
26753 + return depth;
26757 + * This function fills up the path from the root to the leaf as it
26758 + * descends the tree looking for the key. It uses reiserfs_bread to
26759 + * try to find buffers in the cache given their block number. If it
26760 + * does not find them in the cache it reads them from disk. For each
26761 + * node search_by_key finds using reiserfs_bread it then uses
26762 + * bin_search to look through that node. bin_search will find the
26763 + * position of the block_number of the next node if it is looking
26764 + * through an internal node. If it is looking through a leaf node
26765 + * bin_search will find the position of the item which has key either
26766 + * equal to given key, or which is the maximal key less than the given
26767 + * key. search_by_key returns a path that must be checked for the
26768 + * correctness of the top of the path but need not be checked for the
26769 + * correctness of the bottom of the path
26770 + */
26772 + * search_by_key - search for key (and item) in stree
26773 + * @sb: superblock
26774 + * @key: pointer to key to search for
26775 + * @search_path: Allocated and initialized struct treepath; Returned filled
26776 + * on success.
26777 + * @stop_level: How far down the tree to search, Use DISK_LEAF_NODE_LEVEL to
26778 + * stop at leaf level.
26780 + * The function is NOT SCHEDULE-SAFE!
26781 + */
26782 +int search_by_key(struct super_block *sb, const struct cpu_key *key,
26783 + struct treepath *search_path, int stop_level)
26785 + b_blocknr_t block_number;
26786 + int expected_level;
26787 + struct buffer_head *bh;
26788 + struct path_element *last_element;
26789 + int node_level, retval;
26790 + int fs_gen;
26791 + struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
26792 + b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA];
26793 + int reada_count = 0;
26795 +#ifdef CONFIG_REISERFS_CHECK
26796 + int repeat_counter = 0;
26797 +#endif
26799 + PROC_INFO_INC(sb, search_by_key);
26801 + /*
26802 + * As we add each node to a path we increase its count. This means
26803 + * that we must be careful to release all nodes in a path before we
26804 + * either discard the path struct or re-use the path struct, as we
26805 + * do here.
26806 + */
26808 + pathrelse(search_path);
26810 + /*
26811 + * With each iteration of this loop we search through the items in the
26812 + * current node, and calculate the next current node (next path element)
26813 + * for the next iteration of this loop.
26814 + */
26815 + block_number = SB_ROOT_BLOCK(sb);
26816 + expected_level = -1;
26817 + while (1) {
26819 +#ifdef CONFIG_REISERFS_CHECK
26820 + if (!(++repeat_counter % 50000))
26821 + reiserfs_warning(sb, "PAP-5100",
26822 + "%s: there were %d iterations of "
26823 + "while loop looking for key %K",
26824 + current->comm, repeat_counter,
26825 + key);
26826 +#endif
26828 + /* prep path to have another element added to it. */
26829 + last_element =
26830 + PATH_OFFSET_PELEMENT(search_path,
26831 + ++search_path->path_length);
26832 + fs_gen = get_generation(sb);
26834 + /*
26835 + * Read the next tree node, and set the last element
26836 + * in the path to have a pointer to it.
26837 + */
26838 + if ((bh = last_element->pe_buffer =
26839 + sb_getblk(sb, block_number))) {
26841 + /*
26842 + * We'll need to drop the lock if we encounter any
26843 + * buffers that need to be read. If all of them are
26844 + * already up to date, we don't need to drop the lock.
26845 + */
26846 + int depth = -1;
26848 + if (!buffer_uptodate(bh) && reada_count > 1)
26849 + depth = search_by_key_reada(sb, reada_bh,
26850 + reada_blocks, reada_count);
26852 + if (!buffer_uptodate(bh) && depth == -1)
26853 + depth = reiserfs_write_unlock_nested(sb);
26855 + bh_read_nowait(bh, 0);
26856 + wait_on_buffer(bh);
26858 + if (depth != -1)
26859 + reiserfs_write_lock_nested(sb, depth);
26860 + if (!buffer_uptodate(bh))
26861 + goto io_error;
26862 + } else {
26863 +io_error:
26864 + search_path->path_length--;
26865 + pathrelse(search_path);
26866 + return IO_ERROR;
26868 + reada_count = 0;
26869 + if (expected_level == -1)
26870 + expected_level = SB_TREE_HEIGHT(sb);
26871 + expected_level--;
26873 + /*
26874 + * It is possible that schedule occurred. We must check
26875 + * whether the key to search is still in the tree rooted
26876 + * from the current buffer. If not then repeat search
26877 + * from the root.
26878 + */
26879 + if (fs_changed(fs_gen, sb) &&
26880 + (!B_IS_IN_TREE(bh) ||
26881 + B_LEVEL(bh) != expected_level ||
26882 + !key_in_buffer(search_path, key, sb))) {
26883 + PROC_INFO_INC(sb, search_by_key_fs_changed);
26884 + PROC_INFO_INC(sb, search_by_key_restarted);
26885 + PROC_INFO_INC(sb,
26886 + sbk_restarted[expected_level - 1]);
26887 + pathrelse(search_path);
26889 + /*
26890 + * Get the root block number so that we can
26891 + * repeat the search starting from the root.
26892 + */
26893 + block_number = SB_ROOT_BLOCK(sb);
26894 + expected_level = -1;
26896 + /* repeat search from the root */
26897 + continue;
26900 + /*
26901 + * only check that the key is in the buffer if key is not
26902 + * equal to the MAX_KEY. The latter case is only possible in
26903 + * "finish_unfinished()" processing during mount.
26904 + */
26905 + RFALSE(comp_keys(&MAX_KEY, key) &&
26906 + !key_in_buffer(search_path, key, sb),
26907 + "PAP-5130: key is not in the buffer");
26908 +#ifdef CONFIG_REISERFS_CHECK
26909 + if (REISERFS_SB(sb)->cur_tb) {
26910 + print_cur_tb("5140");
26911 + reiserfs_panic(sb, "PAP-5140",
26912 + "schedule occurred in do_balance!");
26914 +#endif
26916 + /*
26917 + * make sure, that the node contents look like a node of
26918 + * certain level
26919 + */
26920 + if (!is_tree_node(bh, expected_level)) {
26921 + reiserfs_error(sb, "vs-5150",
26922 + "invalid format found in block %ld. "
26923 + "Fsck?", bh->b_blocknr);
26924 + pathrelse(search_path);
26925 + return IO_ERROR;
26928 + /* ok, we have acquired next formatted node in the tree */
26929 + node_level = B_LEVEL(bh);
26931 + PROC_INFO_BH_STAT(sb, bh, node_level - 1);
26933 + RFALSE(node_level < stop_level,
26934 + "vs-5152: tree level (%d) is less than stop level (%d)",
26935 + node_level, stop_level);
26937 + retval = bin_search(key, item_head(bh, 0),
26938 + B_NR_ITEMS(bh),
26939 + (node_level ==
26940 + DISK_LEAF_NODE_LEVEL) ? IH_SIZE :
26941 + KEY_SIZE,
26942 + &last_element->pe_position);
26943 + if (node_level == stop_level) {
26944 + return retval;
26947 + /* we are not in the stop level */
26948 + /*
26949 + * item has been found, so we choose the pointer which
26950 + * is to the right of the found one
26951 + */
26952 + if (retval == ITEM_FOUND)
26953 + last_element->pe_position++;
26955 + /*
26956 + * if item was not found we choose the position which is to
26957 + * the left of the found item. This requires no code,
26958 + * bin_search did it already.
26959 + */
26961 + /*
26962 + * So we have chosen a position in the current node which is
26963 + * an internal node. Now we calculate child block number by
26964 + * position in the node.
26965 + */
26966 + block_number =
26967 + B_N_CHILD_NUM(bh, last_element->pe_position);
26969 + /*
26970 + * if we are going to read leaf nodes, try for read
26971 + * ahead as well
26972 + */
26973 + if ((search_path->reada & PATH_READA) &&
26974 + node_level == DISK_LEAF_NODE_LEVEL + 1) {
26975 + int pos = last_element->pe_position;
26976 + int limit = B_NR_ITEMS(bh);
26977 + struct reiserfs_key *le_key;
26979 + if (search_path->reada & PATH_READA_BACK)
26980 + limit = 0;
26981 + while (reada_count < SEARCH_BY_KEY_READA) {
26982 + if (pos == limit)
26983 + break;
26984 + reada_blocks[reada_count++] =
26985 + B_N_CHILD_NUM(bh, pos);
26986 + if (search_path->reada & PATH_READA_BACK)
26987 + pos--;
26988 + else
26989 + pos++;
26991 + /*
26992 + * check to make sure we're in the same object
26993 + */
26994 + le_key = internal_key(bh, pos);
26995 + if (le32_to_cpu(le_key->k_objectid) !=
26996 + key->on_disk_key.k_objectid) {
26997 + break;
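
Editor's note: search_by_key is thus a standard top-down B+ tree descent: binary-search the current node, follow the chosen child pointer, repeat until stop_level. A compressed sketch of that control flow over a hypothetical in-memory node structure, with no buffers, generations, readahead, or locking (toy_* names, not part of the patch):

#include <stdio.h>

struct toy_node {
    int level;                  /* 1 == leaf, as DISK_LEAF_NODE_LEVEL */
    int nr;                     /* number of keys */
    int keys[4];
    struct toy_node *child[5];  /* nr + 1 children on internal nodes */
};

/* Descend from root to the leaf that may contain key. */
static struct toy_node *toy_descend(struct toy_node *node, int key)
{
    while (node->level > 1) {
        int pos = 0;
        /* linear stand-in for bin_search: first key > search key */
        while (pos < node->nr && node->keys[pos] <= key)
            pos++;
        node = node->child[pos];
    }
    return node;
}

int main(void)
{
    struct toy_node leaf_lo = { 1, 2, { 3, 5 }, { 0 } };
    struct toy_node leaf_hi = { 1, 2, { 9, 12 }, { 0 } };
    struct toy_node root = { 2, 1, { 9 }, { &leaf_lo, &leaf_hi, 0, 0, 0 } };
    printf("%d\n", toy_descend(&root, 10) == &leaf_hi);  /* 1 */
    return 0;
}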
27005 + * Form the path to an item and position in this item which contains
27006 + * file byte defined by key. If there is no such item
27007 + * corresponding to the key, we point the path to the item with
27008 + * maximal key less than key, and *pos_in_item is set to one
27009 + * past the last entry/byte in the item. If searching for entry in a
27010 + * directory item, and it is not found, *pos_in_item is set to one
27011 + * entry more than the entry with maximal key which is less than the
27012 + * sought key.
27014 + * Note that if there is no entry in this same node which is one more,
27015 + * then we point to an imaginary entry. For direct items, the
27016 + * position is in units of bytes, for indirect items the position is
27017 + * in units of blocknr entries, for directory items the position is in
27018 + * units of directory entries.
27019 + */
27020 +/* The function is NOT SCHEDULE-SAFE! */
27021 +int search_for_position_by_key(struct super_block *sb,
27022 + /* Key to search (cpu variable) */
27023 + const struct cpu_key *p_cpu_key,
27024 + /* Filled up by this function. */
27025 + struct treepath *search_path)
27027 + struct item_head *p_le_ih; /* pointer to on-disk structure */
27028 + int blk_size;
27029 + loff_t item_offset, offset;
27030 + struct reiserfs_dir_entry de;
27031 + int retval;
27033 + /* If searching for directory entry. */
27034 + if (is_direntry_cpu_key(p_cpu_key))
27035 + return search_by_entry_key(sb, p_cpu_key, search_path,
27036 + &de);
27038 + /* If not searching for directory entry. */
27040 + /* If item is found. */
27041 + retval = search_item(sb, p_cpu_key, search_path);
27042 + if (retval == IO_ERROR)
27043 + return retval;
27044 + if (retval == ITEM_FOUND) {
27046 + RFALSE(!ih_item_len
27047 + (item_head
27048 + (PATH_PLAST_BUFFER(search_path),
27049 + PATH_LAST_POSITION(search_path))),
27050 + "PAP-5165: item length equals zero");
27052 + pos_in_item(search_path) = 0;
27053 + return POSITION_FOUND;
27056 + RFALSE(!PATH_LAST_POSITION(search_path),
27057 + "PAP-5170: position equals zero");
27059 + /* Item is not found. Set path to the previous item. */
27060 + p_le_ih =
27061 + item_head(PATH_PLAST_BUFFER(search_path),
27062 + --PATH_LAST_POSITION(search_path));
27063 + blk_size = sb->s_blocksize;
27065 + if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key))
27066 + return FILE_NOT_FOUND;
27068 + /* FIXME: quite ugly this far */
27070 + item_offset = le_ih_k_offset(p_le_ih);
27071 + offset = cpu_key_k_offset(p_cpu_key);
27073 + /* Needed byte is contained in the item pointed to by the path. */
27074 + if (item_offset <= offset &&
27075 + item_offset + op_bytes_number(p_le_ih, blk_size) > offset) {
27076 + pos_in_item(search_path) = offset - item_offset;
27077 + if (is_indirect_le_ih(p_le_ih)) {
27078 + pos_in_item(search_path) /= blk_size;
27080 + return POSITION_FOUND;
27083 + /*
27084 + * Needed byte is not contained in the item pointed to by the
27085 + * path. Set pos_in_item out of the item.
27086 + */
27087 + if (is_indirect_le_ih(p_le_ih))
27088 + pos_in_item(search_path) =
27089 + ih_item_len(p_le_ih) / UNFM_P_SIZE;
27090 + else
27091 + pos_in_item(search_path) = ih_item_len(p_le_ih);
27093 + return POSITION_NOT_FOUND;
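
Editor's note: the offset arithmetic above is easier to see with numbers. Assuming a 4096-byte block, an indirect item keyed at file offset 1 covering 3 unformatted blocks spans bytes [1, 12289), and a byte offset inside it maps to a pointer index by dividing the byte distance by the block size. A worked sketch (values are illustrative only):

#include <stdio.h>

int main(void)
{
    long blk_size    = 4096;
    long item_offset = 1;        /* key offset of the indirect item */
    long nr_ptrs     = 3;        /* unformatted node pointers in the item */
    long item_bytes  = nr_ptrs * blk_size;
    long offset      = 9000;     /* byte we are searching for */

    if (item_offset <= offset && offset < item_offset + item_bytes) {
        long pos_in_item = (offset - item_offset) / blk_size;
        printf("pointer index %ld\n", pos_in_item);   /* prints 2 */
    }
    return 0;
}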
27096 +/* Compare given item and item pointed to by the path. */
27097 +int comp_items(const struct item_head *stored_ih, const struct treepath *path)
27099 + struct buffer_head *bh = PATH_PLAST_BUFFER(path);
27100 + struct item_head *ih;
27102 + /* Last buffer at the path is not in the tree. */
27103 + if (!B_IS_IN_TREE(bh))
27104 + return 1;
27106 + /* Last path position is invalid. */
27107 + if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh))
27108 + return 1;
27110 + /* we need only to know, whether it is the same item */
27111 + ih = tp_item_head(path);
27112 + return memcmp(stored_ih, ih, IH_SIZE);
27115 +/* prepare for delete or cut of direct item */
27116 +static inline int prepare_for_direct_item(struct treepath *path,
27117 + struct item_head *le_ih,
27118 + struct inode *inode,
27119 + loff_t new_file_length, int *cut_size)
27121 + loff_t round_len;
27123 + if (new_file_length == max_reiserfs_offset(inode)) {
27124 + /* item has to be deleted */
27125 + *cut_size = -(IH_SIZE + ih_item_len(le_ih));
27126 + return M_DELETE;
27128 + /* new file gets truncated */
27129 + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) {
27130 + round_len = ROUND_UP(new_file_length);
27131 + /* this was new_file_length < le_ih ... */
27132 + if (round_len < le_ih_k_offset(le_ih)) {
27133 + *cut_size = -(IH_SIZE + ih_item_len(le_ih));
27134 + return M_DELETE; /* Delete this item. */
27136 + /* Calculate first position and size for cutting from item. */
27137 + pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1);
27138 + *cut_size = -(ih_item_len(le_ih) - pos_in_item(path));
27140 + return M_CUT; /* Cut from this item. */
27143 + /* old file: items may have any length */
27145 + if (new_file_length < le_ih_k_offset(le_ih)) {
27146 + *cut_size = -(IH_SIZE + ih_item_len(le_ih));
27147 + return M_DELETE; /* Delete this item. */
27150 + /* Calculate first position and size for cutting from item. */
27151 + *cut_size = -(ih_item_len(le_ih) -
27152 + (pos_in_item(path) =
27153 + new_file_length + 1 - le_ih_k_offset(le_ih)));
27154 + return M_CUT; /* Cut from this item. */
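
Editor's note: for a direct item the decision above is pure arithmetic: if the new length ends before the item starts, the whole item goes (M_DELETE); otherwise the item is cut after new_length + 1 - offset bytes (M_CUT). A worked sketch assuming 1-based key offsets as in the code (toy_* names, not part of the patch):

#include <stdio.h>

enum { TOY_DELETE, TOY_CUT };

/* offset: 1-based file offset of the item's first byte; len: item length. */
static int toy_prepare_direct(long new_len, long offset, long len,
                              long *pos, long *cut_size)
{
    if (new_len < offset) {
        *cut_size = -len;          /* whole item disappears */
        return TOY_DELETE;
    }
    *pos = new_len + 1 - offset;   /* bytes to keep == cut position */
    *cut_size = -(len - *pos);
    return TOY_CUT;
}

int main(void)
{
    long pos = 0, cut = 0;
    int mode = toy_prepare_direct(100, 97, 10, &pos, &cut);
    /* item covers offsets 97..106; keep 97..100 (4 bytes), cut 6 */
    printf("mode=%d pos=%ld cut=%ld\n", mode, pos, cut); /* mode=1 pos=4 cut=-6 */
    return 0;
}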
27157 +static inline int prepare_for_direntry_item(struct treepath *path,
27158 + struct item_head *le_ih,
27159 + struct inode *inode,
27160 + loff_t new_file_length,
27161 + int *cut_size)
27163 + if (le_ih_k_offset(le_ih) == DOT_OFFSET &&
27164 + new_file_length == max_reiserfs_offset(inode)) {
27165 + RFALSE(ih_entry_count(le_ih) != 2,
27166 + "PAP-5220: incorrect empty directory item (%h)", le_ih);
27167 + *cut_size = -(IH_SIZE + ih_item_len(le_ih));
27168 + /* Delete the directory item containing "." and ".." entry. */
27169 + return M_DELETE;
27172 + if (ih_entry_count(le_ih) == 1) {
27173 + /*
27174 + * Delete the directory item, since there is only one record
27175 + * in this item
27176 + */
27177 + *cut_size = -(IH_SIZE + ih_item_len(le_ih));
27178 + return M_DELETE;
27181 + /* Cut one record from the directory item. */
27182 + *cut_size =
27183 + -(DEH_SIZE +
27184 + entry_length(get_last_bh(path), le_ih, pos_in_item(path)));
27185 + return M_CUT;
27188 +#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1)
27191 + * If the path points to a directory or direct item, calculate mode
27192 + * and the size cut, for balance.
27193 + * If the path points to an indirect item, remove some number of its
27194 + * unformatted nodes.
27195 + * In case of file truncate calculate whether this item must be
27196 + * deleted/truncated or last unformatted node of this item will be
27197 + * converted to a direct item.
27198 + * This function returns a determination of what balance mode the
27199 + * calling function should employ.
27200 + */
27201 +static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th,
27202 + struct inode *inode,
27203 + struct treepath *path,
27204 + const struct cpu_key *item_key,
27205 + /*
27206 + * Number of unformatted nodes
27207 + * which were removed from end
27208 + * of the file.
27209 + */
27210 + int *removed,
27211 + int *cut_size,
27212 + /* MAX_KEY_OFFSET in case of delete. */
27213 + unsigned long long new_file_length
27216 + struct super_block *sb = inode->i_sb;
27217 + struct item_head *p_le_ih = tp_item_head(path);
27218 + struct buffer_head *bh = PATH_PLAST_BUFFER(path);
27220 + BUG_ON(!th->t_trans_id);
27222 + /* Stat_data item. */
27223 + if (is_statdata_le_ih(p_le_ih)) {
27225 + RFALSE(new_file_length != max_reiserfs_offset(inode),
27226 + "PAP-5210: mode must be M_DELETE");
27228 + *cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
27229 + return M_DELETE;
27232 + /* Directory item. */
27233 + if (is_direntry_le_ih(p_le_ih))
27234 + return prepare_for_direntry_item(path, p_le_ih, inode,
27235 + new_file_length,
27236 + cut_size);
27238 + /* Direct item. */
27239 + if (is_direct_le_ih(p_le_ih))
27240 + return prepare_for_direct_item(path, p_le_ih, inode,
27241 + new_file_length, cut_size);
27243 + /* Case of an indirect item. */
27245 + int blk_size = sb->s_blocksize;
27246 + struct item_head s_ih;
27247 + int need_re_search;
27248 + int delete = 0;
27249 + int result = M_CUT;
27250 + int pos = 0;
27252 + if ( new_file_length == max_reiserfs_offset (inode) ) {
27253 + /*
27254 + * prepare_for_delete_or_cut() is called by
27255 + * reiserfs_delete_item()
27256 + */
27257 + new_file_length = 0;
27258 + delete = 1;
27261 + do {
27262 + need_re_search = 0;
27263 + *cut_size = 0;
27264 + bh = PATH_PLAST_BUFFER(path);
27265 + copy_item_head(&s_ih, tp_item_head(path));
27266 + pos = I_UNFM_NUM(&s_ih);
27268 + while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) {
27269 + __le32 *unfm;
27270 + __u32 block;
27272 + /*
27273 + * Each unformatted block deletion may involve
27274 + * one additional bitmap block into the transaction,
27275 + * thereby the initial journal space reservation
27276 + * might not be enough.
27277 + */
27278 + if (!delete && (*cut_size) != 0 &&
27279 + reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD)
27280 + break;
27282 + unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1;
27283 + block = get_block_num(unfm, 0);
27285 + if (block != 0) {
27286 + reiserfs_prepare_for_journal(sb, bh, 1);
27287 + put_block_num(unfm, 0, 0);
27288 + journal_mark_dirty(th, bh);
27289 + reiserfs_free_block(th, inode, block, 1);
27292 + reiserfs_cond_resched(sb);
27294 + if (item_moved (&s_ih, path)) {
27295 + need_re_search = 1;
27296 + break;
27299 + pos --;
27300 + (*removed)++;
27301 + (*cut_size) -= UNFM_P_SIZE;
27303 + if (pos == 0) {
27304 + (*cut_size) -= IH_SIZE;
27305 + result = M_DELETE;
27306 + break;
27309 + /*
27310 + * a trick. If the buffer has been logged, this will
27311 + * do nothing. If we've broken the loop without logging
27312 + * it, it will restore the buffer
27313 + */
27314 + reiserfs_restore_prepared_buffer(sb, bh);
27315 + } while (need_re_search &&
27316 + search_for_position_by_key(sb, item_key, path) == POSITION_FOUND);
27317 + pos_in_item(path) = pos * UNFM_P_SIZE;
27319 + if (*cut_size == 0) {
27320 + /*
27321 + * Nothing was cut. Maybe convert the last unformatted node to a
27322 + * direct item?
27323 + */
27324 + result = M_CONVERT;
27326 + return result;
27330 +/* Calculate number of bytes which will be deleted or cut during balance */
27331 +static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
27333 + int del_size;
27334 + struct item_head *p_le_ih = tp_item_head(tb->tb_path);
27336 + if (is_statdata_le_ih(p_le_ih))
27337 + return 0;
27339 + del_size =
27340 + (mode ==
27341 + M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0];
27342 + if (is_direntry_le_ih(p_le_ih)) {
27343 + /*
27344 + * return EMPTY_DIR_SIZE; We delete empty directories only.
27345 + * We can't use EMPTY_DIR_SIZE, as old format dirs have a
27346 + * different empty size. ick. FIXME, is this right?
27347 + */
27348 + return del_size;
27351 + if (is_indirect_le_ih(p_le_ih))
27352 + del_size = (del_size / UNFM_P_SIZE) *
27353 + (PATH_PLAST_BUFFER(tb->tb_path)->b_size);
27354 + return del_size;
27357 +static void init_tb_struct(struct reiserfs_transaction_handle *th,
27358 + struct tree_balance *tb,
27359 + struct super_block *sb,
27360 + struct treepath *path, int size)
27363 + BUG_ON(!th->t_trans_id);
27365 + memset(tb, '\0', sizeof(struct tree_balance));
27366 + tb->transaction_handle = th;
27367 + tb->tb_sb = sb;
27368 + tb->tb_path = path;
27369 + PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
27370 + PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
27371 + tb->insert_size[0] = size;
27374 +void padd_item(char *item, int total_length, int length)
27376 + int i;
27378 + for (i = total_length; i > length;)
27379 + item[--i] = 0;
27382 +#ifdef REISERQUOTA_DEBUG
27383 +char key2type(struct reiserfs_key *ih)
27385 + if (is_direntry_le_key(2, ih))
27386 + return 'd';
27387 + if (is_direct_le_key(2, ih))
27388 + return 'D';
27389 + if (is_indirect_le_key(2, ih))
27390 + return 'i';
27391 + if (is_statdata_le_key(2, ih))
27392 + return 's';
27393 + return 'u';
27396 +char head2type(struct item_head *ih)
27398 + if (is_direntry_le_ih(ih))
27399 + return 'd';
27400 + if (is_direct_le_ih(ih))
27401 + return 'D';
27402 + if (is_indirect_le_ih(ih))
27403 + return 'i';
27404 + if (is_statdata_le_ih(ih))
27405 + return 's';
27406 + return 'u';
27408 +#endif
27411 + * Delete object item.
27412 + * th - active transaction handle
27413 + * path - path to the deleted item
27414 + * item_key - key to search for the deleted item
27415 + * inode - used for updating i_blocks and quotas
27416 + * un_bh - NULL or unformatted node pointer
27417 + */
27418 +int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
27419 + struct treepath *path, const struct cpu_key *item_key,
27420 + struct inode *inode, struct buffer_head *un_bh)
27422 + struct super_block *sb = inode->i_sb;
27423 + struct tree_balance s_del_balance;
27424 + struct item_head s_ih;
27425 + struct item_head *q_ih;
27426 + int quota_cut_bytes;
27427 + int ret_value, del_size, removed;
27428 + int depth;
27430 +#ifdef CONFIG_REISERFS_CHECK
27431 + char mode;
27432 +#endif
27434 + BUG_ON(!th->t_trans_id);
27436 + init_tb_struct(th, &s_del_balance, sb, path,
27437 + 0 /*size is unknown */ );
27439 + while (1) {
27440 + removed = 0;
27442 +#ifdef CONFIG_REISERFS_CHECK
27443 + mode =
27444 +#endif
27445 + prepare_for_delete_or_cut(th, inode, path,
27446 + item_key, &removed,
27447 + &del_size,
27448 + max_reiserfs_offset(inode));
27450 + RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
27452 + copy_item_head(&s_ih, tp_item_head(path));
27453 + s_del_balance.insert_size[0] = del_size;
27455 + ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
27456 + if (ret_value != REPEAT_SEARCH)
27457 + break;
27459 + PROC_INFO_INC(sb, delete_item_restarted);
27461 + /* file system changed, repeat search */
27462 + ret_value =
27463 + search_for_position_by_key(sb, item_key, path);
27464 + if (ret_value == IO_ERROR)
27465 + break;
27466 + if (ret_value == FILE_NOT_FOUND) {
27467 + reiserfs_warning(sb, "vs-5340",
27468 + "no items of the file %K found",
27469 + item_key);
27470 + break;
27472 + } /* while (1) */
27474 + if (ret_value != CARRY_ON) {
27475 + unfix_nodes(&s_del_balance);
27476 + return 0;
27479 + /* reiserfs_delete_item returns item length when success */
27480 + ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
27481 + q_ih = tp_item_head(path);
27482 + quota_cut_bytes = ih_item_len(q_ih);
27484 + /*
27485 + * hack so the quota code doesn't have to guess if the file has a
27486 + * tail. On tail insert, we allocate quota for 1 unformatted node.
27487 + * We test the offset because the tail might have been
27488 + * split into multiple items, and we only want to decrement for
27489 + * the unfm node once
27490 + */
27491 + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) {
27492 + if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) {
27493 + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
27494 + } else {
27495 + quota_cut_bytes = 0;
27499 + if (un_bh) {
27500 + int off;
27501 + char *data;
27503 + /*
27504 + * We are in direct2indirect conversion, so move tail contents
27505 + * to the unformatted node
27506 + */
27507 + /*
27508 + * note, we do the copy before preparing the buffer because we
27509 + * don't care about the contents of the unformatted node yet.
27510 + * the only thing we really care about is that the direct
27511 + * item's data is in the unformatted node.
27513 + * Otherwise, we would have to call
27514 + * reiserfs_prepare_for_journal on the unformatted node,
27515 + * which might schedule, meaning we'd have to loop all the
27516 + * way back up to the start of the while loop.
27518 + * The unformatted node must be dirtied later on. We can't be
27519 + * sure here if the entire tail has been deleted yet.
27521 + * un_bh is from the page cache (all unformatted nodes are
27522 + * from the page cache) and might be a highmem page. So, we
27523 + * can't use un_bh->b_data.
27524 + * -clm
27525 + */
27527 + data = kmap_atomic(un_bh->b_page);
27528 + off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_SIZE - 1));
27529 + memcpy(data + off,
27530 + ih_item_body(PATH_PLAST_BUFFER(path), &s_ih),
27531 + ret_value);
27532 + kunmap_atomic(data);
27535 + /* Perform balancing after all resources have been collected at once. */
27536 + do_balance(&s_del_balance, NULL, NULL, M_DELETE);
27538 +#ifdef REISERQUOTA_DEBUG
27539 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
27540 + "reiserquota delete_item(): freeing %u, id=%u type=%c",
27541 + quota_cut_bytes, inode->i_uid, head2type(&s_ih));
27542 +#endif
27543 + depth = reiserfs_write_unlock_nested(inode->i_sb);
27544 + dquot_free_space_nodirty(inode, quota_cut_bytes);
27545 + reiserfs_write_lock_nested(inode->i_sb, depth);
27547 + /* Return deleted body length */
27548 + return ret_value;
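
Editor's note: the while(1) around fix_nodes is the optimistic-retry pattern used throughout this file: try to pin all resources, and if the tree moved underneath (REPEAT_SEARCH) redo the lookup and try again. Stripped of reiserfs specifics, the shape is roughly this (toy_* stand-ins, not part of the patch):

#include <stdio.h>

enum { CARRY_ON, REPEAT_SEARCH };

static int attempts;
/* Hypothetical stand-ins: pin resources / repeat the lookup. */
static int toy_fix_nodes(void) { return attempts++ < 2 ? REPEAT_SEARCH : CARRY_ON; }
static int toy_research(void)  { return 0; }   /* 0 == item still found */

int main(void)
{
    int rc;

    for (;;) {
        rc = toy_fix_nodes();        /* try to collect all resources */
        if (rc != REPEAT_SEARCH)
            break;                   /* CARRY_ON or hard error */
        if (toy_research() != 0)     /* tree changed: search again */
            return 1;
    }
    if (rc == CARRY_ON)
        printf("balance after %d restarts\n", attempts - 1);  /* 2 */
    return 0;
}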
27552 + * Summary Of Mechanisms For Handling Collisions Between Processes:
27554 + * deletion of the body of the object is performed by iput(), with the
27555 + * result that if multiple processes are operating on a file, the
27556 + * deletion of the body of the file is deferred until the last process
27557 + * that has an open inode performs its iput().
27559 + * writes and truncates are protected from collisions by use of
27560 + * semaphores.
27562 + * creates, linking, and mknod are protected from collisions with other
27563 + * processes by making the reiserfs_add_entry() the last step in the
27564 + * creation, and then rolling back all changes if there was a collision.
27565 + * - Hans
27568 +/* this deletes item which never gets split */
27569 +void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
27570 + struct inode *inode, struct reiserfs_key *key)
27572 + struct super_block *sb = th->t_super;
27573 + struct tree_balance tb;
27574 + INITIALIZE_PATH(path);
27575 + int item_len = 0;
27576 + int tb_init = 0;
27577 + struct cpu_key cpu_key = {};
27578 + int retval;
27579 + int quota_cut_bytes = 0;
27581 + BUG_ON(!th->t_trans_id);
27583 + le_key2cpu_key(&cpu_key, key);
27585 + while (1) {
27586 + retval = search_item(th->t_super, &cpu_key, &path);
27587 + if (retval == IO_ERROR) {
27588 + reiserfs_error(th->t_super, "vs-5350",
27589 + "i/o failure occurred trying "
27590 + "to delete %K", &cpu_key);
27591 + break;
27593 + if (retval != ITEM_FOUND) {
27594 + pathrelse(&path);
27595 + /*
27596 + * No need for a warning if there is just no free
27597 + * space to insert the '..' item into the
27598 + * newly-created subdir
27599 + */
27600 + if (!
27601 + ((unsigned long long)
27602 + GET_HASH_VALUE(le_key_k_offset
27603 + (le_key_version(key), key)) == 0
27604 + && (unsigned long long)
27605 + GET_GENERATION_NUMBER(le_key_k_offset
27606 + (le_key_version(key),
27607 + key)) == 1))
27608 + reiserfs_warning(th->t_super, "vs-5355",
27609 + "%k not found", key);
27610 + break;
27612 + if (!tb_init) {
27613 + tb_init = 1;
27614 + item_len = ih_item_len(tp_item_head(&path));
27615 + init_tb_struct(th, &tb, th->t_super, &path,
27616 + -(IH_SIZE + item_len));
27618 + quota_cut_bytes = ih_item_len(tp_item_head(&path));
27620 + retval = fix_nodes(M_DELETE, &tb, NULL, NULL);
27621 + if (retval == REPEAT_SEARCH) {
27622 + PROC_INFO_INC(th->t_super, delete_solid_item_restarted);
27623 + continue;
27626 + if (retval == CARRY_ON) {
27627 + do_balance(&tb, NULL, NULL, M_DELETE);
27628 + /*
27629 + * Should we count quota for item? (we don't
27630 + * count quotas for save-links)
27631 + */
27632 + if (inode) {
27633 + int depth;
27634 +#ifdef REISERQUOTA_DEBUG
27635 + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
27636 + "reiserquota delete_solid_item(): freeing %u id=%u type=%c",
27637 + quota_cut_bytes, inode->i_uid,
27638 + key2type(key));
27639 +#endif
27640 + depth = reiserfs_write_unlock_nested(sb);
27641 + dquot_free_space_nodirty(inode,
27642 + quota_cut_bytes);
27643 + reiserfs_write_lock_nested(sb, depth);
27645 + break;
27648 + /* IO_ERROR, NO_DISK_SPACE, etc */
27649 + reiserfs_warning(th->t_super, "vs-5360",
27650 + "could not delete %K due to fix_nodes failure",
27651 + &cpu_key);
27652 + unfix_nodes(&tb);
27653 + break;
27656 + reiserfs_check_path(&path);
27659 +int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
27660 + struct inode *inode)
27662 + int err;
27663 + inode->i_size = 0;
27664 + BUG_ON(!th->t_trans_id);
27666 + /* for directory this deletes item containing "." and ".." */
27667 + err =
27668 + reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ );
27669 + if (err)
27670 + return err;
27672 +#if defined( USE_INODE_GENERATION_COUNTER )
27673 + if (!old_format_only(th->t_super)) {
27674 + __le32 *inode_generation;
27676 + inode_generation =
27677 + &REISERFS_SB(th->t_super)->s_rs->s_inode_generation;
27678 + le32_add_cpu(inode_generation, 1);
27680 +/* USE_INODE_GENERATION_COUNTER */
27681 +#endif
27682 + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
27684 + return err;
27687 +static void unmap_buffers(struct page *page, loff_t pos)
27689 + struct buffer_head *bh;
27690 + struct buffer_head *head;
27691 + struct buffer_head *next;
27692 + unsigned long tail_index;
27693 + unsigned long cur_index;
27695 + if (page) {
27696 + if (page_has_buffers(page)) {
27697 + tail_index = pos & (PAGE_SIZE - 1);
27698 + cur_index = 0;
27699 + head = page_buffers(page);
27700 + bh = head;
27701 + do {
27702 + next = bh->b_this_page;
27704 + /*
27705 + * we want to unmap the buffers that contain
27706 + * the tail, and all the buffers after it
27707 + * (since the tail must be at the end of the
27708 + * file). We don't want to unmap file data
27709 + * before the tail, since it might be dirty
27710 + * and waiting to reach disk
27711 + */
27712 + cur_index += bh->b_size;
27713 + if (cur_index > tail_index) {
27714 + reiserfs_unmap_buffer(bh);
27716 + bh = next;
27717 + } while (bh != head);
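
Editor's note: the cur_index/tail_index walk above simply selects the page buffers that end at or past the tail position. With hypothetical 1024-byte buffers in a 4096-byte page and a tail at page offset 2500, buffers 2 and 3 get unmapped; a worked sketch (values illustrative only):

#include <stdio.h>

int main(void)
{
    unsigned long page_size = 4096, bh_size = 1024;
    unsigned long tail_index = 2500 & (page_size - 1);
    unsigned long cur_index = 0;
    unsigned i, nr = (unsigned)(page_size / bh_size);

    for (i = 0; i < nr; i++) {
        cur_index += bh_size;          /* end offset of buffer i */
        if (cur_index > tail_index)
            printf("unmap buffer %u\n", i);   /* prints 2 and 3 */
    }
    return 0;
}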
27722 +static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
27723 + struct inode *inode,
27724 + struct page *page,
27725 + struct treepath *path,
27726 + const struct cpu_key *item_key,
27727 + loff_t new_file_size, char *mode)
27729 + struct super_block *sb = inode->i_sb;
27730 + int block_size = sb->s_blocksize;
27731 + int cut_bytes;
27732 + BUG_ON(!th->t_trans_id);
27733 + BUG_ON(new_file_size != inode->i_size);
27735 + /*
27736 + * the page being sent in could be NULL if there was an i/o error
27737 + * reading in the last block. The user will hit problems trying to
27738 + * read the file, but for now we just skip the indirect2direct
27739 + */
27740 + if (atomic_read(&inode->i_count) > 1 ||
27741 + !tail_has_to_be_packed(inode) ||
27742 + !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) {
27743 + /* leave tail in an unformatted node */
27744 + *mode = M_SKIP_BALANCING;
27745 + cut_bytes =
27746 + block_size - (new_file_size & (block_size - 1));
27747 + pathrelse(path);
27748 + return cut_bytes;
27751 + /* Perform the conversion to a direct_item. */
27752 + return indirect2direct(th, inode, page, path, item_key,
27753 + new_file_size, mode);
27757 + * We did an indirect_to_direct conversion and inserted the direct
27758 + * item successfully, but there was no disk space left to cut the unfm
27759 + * pointer being converted. Therefore we have to delete the inserted
27760 + * direct item(s)
27761 + */
27762 +static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
27763 + struct inode *inode, struct treepath *path)
27765 + struct cpu_key tail_key;
27766 + int tail_len;
27767 + int removed;
27768 + BUG_ON(!th->t_trans_id);
27770 + make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);
27771 + tail_key.key_length = 4;
27773 + tail_len =
27774 + (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1;
27775 + while (tail_len) {
27776 + /* look for the last byte of the tail */
27777 + if (search_for_position_by_key(inode->i_sb, &tail_key, path) ==
27778 + POSITION_NOT_FOUND)
27779 + reiserfs_panic(inode->i_sb, "vs-5615",
27780 + "found invalid item");
27781 + RFALSE(path->pos_in_item !=
27782 + ih_item_len(tp_item_head(path)) - 1,
27783 + "vs-5616: appended bytes found");
27784 + PATH_LAST_POSITION(path)--;
27786 + removed =
27787 + reiserfs_delete_item(th, path, &tail_key, inode,
27788 + NULL /*unbh not needed */ );
27789 + RFALSE(removed <= 0
27790 + || removed > tail_len,
27791 + "vs-5617: there was tail %d bytes, removed item length %d bytes",
27792 + tail_len, removed);
27793 + tail_len -= removed;
27794 + set_cpu_key_k_offset(&tail_key,
27795 + cpu_key_k_offset(&tail_key) - removed);
27797 + reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct "
27798 + "conversion has been rolled back due to "
27799 + "lack of disk space");
27800 + mark_inode_dirty(inode);
27803 +/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
27804 +int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
27805 + struct treepath *path,
27806 + struct cpu_key *item_key,
27807 + struct inode *inode,
27808 + struct page *page, loff_t new_file_size)
27810 + struct super_block *sb = inode->i_sb;
27811 + /*
27812 + * Every function which is going to call do_balance must first
27813 + * create a tree_balance structure. Then it must fill up this
27814 + * structure by using the init_tb_struct and fix_nodes functions.
27815 + * After that we can make tree balancing.
27816 + */
27817 + struct tree_balance s_cut_balance;
27818 + struct item_head *p_le_ih;
27819 + int cut_size = 0; /* Amount to be cut. */
27820 + int ret_value = CARRY_ON;
27821 + int removed = 0; /* Number of the removed unformatted nodes. */
27822 + int is_inode_locked = 0;
27823 + char mode; /* Mode of the balance. */
27824 + int retval2 = -1;
27825 + int quota_cut_bytes;
27826 + loff_t tail_pos = 0;
27827 + int depth;
27829 + BUG_ON(!th->t_trans_id);
27831 + init_tb_struct(th, &s_cut_balance, inode->i_sb, path,
27832 + cut_size);
27834 + /*
27835 + * Repeat this loop until we either cut the item without needing
27836 + * to balance, or fix_nodes completes without a schedule occurring
27837 + */
27838 + while (1) {
27839 + /*
27840 + * Determine the balance mode, position of the first byte to
27841 + * be cut, and size to be cut. In case of the indirect item
27842 + * free unformatted nodes which are pointed to by the cut
27843 + * pointers.
27844 + */
27846 + mode =
27847 + prepare_for_delete_or_cut(th, inode, path,
27848 + item_key, &removed,
27849 + &cut_size, new_file_size);
27850 + if (mode == M_CONVERT) {
27851 + /*
27852 + * convert last unformatted node to direct item or
27853 + * leave tail in the unformatted node
27854 + */
27855 + RFALSE(ret_value != CARRY_ON,
27856 + "PAP-5570: can not convert twice");
27858 + ret_value =
27859 + maybe_indirect_to_direct(th, inode, page,
27860 + path, item_key,
27861 + new_file_size, &mode);
27862 + if (mode == M_SKIP_BALANCING)
27863 + /* tail has been left in the unformatted node */
27864 + return ret_value;
27866 + is_inode_locked = 1;
27868 + /*
27869 + * removing of last unformatted node will
27870 + * change value we have to return to truncate.
27871 + * Save it
27872 + */
27873 + retval2 = ret_value;
27875 + /*
27876 + * So, we have performed the first part of the
27877 + * conversion:
27878 + * inserting the new direct item. Now we are
27879 + * removing the last unformatted node pointer.
27880 + * Set key to search for it.
27881 + */
27882 + set_cpu_key_k_type(item_key, TYPE_INDIRECT);
27883 + item_key->key_length = 4;
27884 + new_file_size -=
27885 + (new_file_size & (sb->s_blocksize - 1));
27886 + tail_pos = new_file_size;
27887 + set_cpu_key_k_offset(item_key, new_file_size + 1);
27888 + if (search_for_position_by_key
27889 + (sb, item_key,
27890 + path) == POSITION_NOT_FOUND) {
27891 + print_block(PATH_PLAST_BUFFER(path), 3,
27892 + PATH_LAST_POSITION(path) - 1,
27893 + PATH_LAST_POSITION(path) + 1);
27894 + reiserfs_panic(sb, "PAP-5580", "item to "
27895 + "convert does not exist (%K)",
27896 + item_key);
27898 + continue;
27900 + if (cut_size == 0) {
27901 + pathrelse(path);
27902 + return 0;
27905 + s_cut_balance.insert_size[0] = cut_size;
27907 + ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL);
27908 + if (ret_value != REPEAT_SEARCH)
27909 + break;
27911 + PROC_INFO_INC(sb, cut_from_item_restarted);
27913 + ret_value =
27914 + search_for_position_by_key(sb, item_key, path);
27915 + if (ret_value == POSITION_FOUND)
27916 + continue;
27918 + reiserfs_warning(sb, "PAP-5610", "item %K not found",
27919 + item_key);
27920 + unfix_nodes(&s_cut_balance);
27921 + return (ret_value == IO_ERROR) ? -EIO : -ENOENT;
27922 + } /* while */
27924 + /* check fix_nodes results (IO_ERROR or NO_DISK_SPACE) */
27925 + if (ret_value != CARRY_ON) {
27926 + if (is_inode_locked) {
27927 + /*
27928 + * FIXME: this seems to be not needed: we are always
27929 + * able to cut item
27930 + */
27931 + indirect_to_direct_roll_back(th, inode, path);
27933 + if (ret_value == NO_DISK_SPACE)
27934 + reiserfs_warning(sb, "reiserfs-5092",
27935 + "NO_DISK_SPACE");
27936 + unfix_nodes(&s_cut_balance);
27937 + return -EIO;
27940 + /* go ahead and perform balancing */
27942 + RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode");
27944 + /* Calculate number of bytes that need to be cut from the item. */
27945 + quota_cut_bytes =
27946 + (mode ==
27947 + M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance.
27948 + insert_size[0];
27949 + if (retval2 == -1)
27950 + ret_value = calc_deleted_bytes_number(&s_cut_balance, mode);
27951 + else
27952 + ret_value = retval2;
27954 + /*
27955 + * For direct items, we only change the quota when deleting the last
27956 + * item.
27957 + */
27958 + p_le_ih = tp_item_head(s_cut_balance.tb_path);
27959 + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) {
27960 + if (mode == M_DELETE &&
27961 + (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) ==
27962 + 1) {
27963 + /* FIXME: this is to keep 3.5 happy */
27964 + REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
27965 + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
27966 + } else {
27967 + quota_cut_bytes = 0;
27970 +#ifdef CONFIG_REISERFS_CHECK
27971 + if (is_inode_locked) {
27972 + struct item_head *le_ih =
27973 + tp_item_head(s_cut_balance.tb_path);
27974 + /*
27975 + * we are going to complete indirect2direct conversion. Make
27976 + * sure, that we exactly remove last unformatted node pointer
27977 + * of the item
27978 + */
27979 + if (!is_indirect_le_ih(le_ih))
27980 + reiserfs_panic(sb, "vs-5652",
27981 + "item must be indirect %h", le_ih);
27983 + if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
27984 + reiserfs_panic(sb, "vs-5653", "completing "
27985 + "indirect2direct conversion indirect "
27986 + "item %h being deleted must be of "
27987 + "4 byte long", le_ih);
27989 + if (mode == M_CUT
27990 + && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
27991 + reiserfs_panic(sb, "vs-5654", "can not complete "
27992 + "indirect2direct conversion of %h "
27993 + "(CUT, insert_size==%d)",
27994 + le_ih, s_cut_balance.insert_size[0]);
27996 + /*
27997 + * it would be useful to make sure, that right neighboring
27998 + * item is direct item of this file
27999 + */
28001 +#endif
28003 + do_balance(&s_cut_balance, NULL, NULL, mode);
28004 + if (is_inode_locked) {
28005 + /*
28006 + * we've done an indirect->direct conversion. when the
28007 + * data block was freed, it was removed from the list of
28008 + * blocks that must be flushed before the transaction
28009 + * commits, make sure to unmap and invalidate it
28010 + */
28011 + unmap_buffers(page, tail_pos);
28012 + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
28014 +#ifdef REISERQUOTA_DEBUG
28015 + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
28016 + "reiserquota cut_from_item(): freeing %u id=%u type=%c",
28017 + quota_cut_bytes, inode->i_uid, '?');
28018 +#endif
28019 + depth = reiserfs_write_unlock_nested(sb);
28020 + dquot_free_space_nodirty(inode, quota_cut_bytes);
28021 + reiserfs_write_lock_nested(sb, depth);
28022 + return ret_value;
28025 +static void truncate_directory(struct reiserfs_transaction_handle *th,
28026 + struct inode *inode)
28028 + BUG_ON(!th->t_trans_id);
28029 + if (inode->i_nlink)
28030 + reiserfs_error(inode->i_sb, "vs-5655", "link count != 0");
28032 + set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET);
28033 + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY);
28034 + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
28035 + reiserfs_update_sd(th, inode);
28036 + set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET);
28037 + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA);
28041 + * Truncate file to the new size. Note, this must be called with a
28042 + * transaction already started
28043 + */
28044 +int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
28045 + struct inode *inode, /* ->i_size contains new size */
28046 + struct page *page, /* up to date for last block */
28047 + /*
28048 + * when it is called by file_release to convert
28049 + * the tail - no timestamps should be updated
28050 + */
28051 + int update_timestamps
28054 + INITIALIZE_PATH(s_search_path); /* Path to the current object item. */
28055 + struct item_head *p_le_ih; /* Pointer to an item header. */
28057 + /* Key to search for a previous file item. */
28058 + struct cpu_key s_item_key;
28059 + loff_t file_size, /* Old file size. */
28060 + new_file_size; /* New file size. */
28061 + int deleted; /* Number of deleted or truncated bytes. */
28062 + int retval;
28063 + int err = 0;
28065 + BUG_ON(!th->t_trans_id);
28066 + if (!
28067 + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
28068 + || S_ISLNK(inode->i_mode)))
28069 + return 0;
28071 + /* deletion of directory - no need to update timestamps */
28072 + if (S_ISDIR(inode->i_mode)) {
28073 + truncate_directory(th, inode);
28074 + return 0;
28077 + /* Get new file size. */
28078 + new_file_size = inode->i_size;
28080 + /* FIXME: note, that key type is unimportant here */
28081 + make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode),
28082 + TYPE_DIRECT, 3);
28084 + retval =
28085 + search_for_position_by_key(inode->i_sb, &s_item_key,
28086 + &s_search_path);
28087 + if (retval == IO_ERROR) {
28088 + reiserfs_error(inode->i_sb, "vs-5657",
28089 + "i/o failure occurred trying to truncate %K",
28090 + &s_item_key);
28091 + err = -EIO;
28092 + goto out;
28094 + if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
28095 + reiserfs_error(inode->i_sb, "PAP-5660",
28096 + "wrong result %d of search for %K", retval,
28097 + &s_item_key);
28099 + err = -EIO;
28100 + goto out;
28103 + s_search_path.pos_in_item--;
28105 + /* Get real file size (total length of all file items) */
28106 + p_le_ih = tp_item_head(&s_search_path);
28107 + if (is_statdata_le_ih(p_le_ih))
28108 + file_size = 0;
28109 + else {
28110 + loff_t offset = le_ih_k_offset(p_le_ih);
28111 + int bytes =
28112 + op_bytes_number(p_le_ih, inode->i_sb->s_blocksize);
28114 + /*
28115 + * this may not match the real file size: the file would only
28116 + * have this size if the last direct item had no padding zeros
28117 + * and the last unformatted node had no free space
28118 + */
28119 + file_size = offset + bytes - 1;
28121 + /*
28122 + * if we are doing a full truncate or delete,
28123 + * kick in the read-ahead code
28124 + */
28125 + if (new_file_size == 0)
28126 + s_search_path.reada = PATH_READA | PATH_READA_BACK;
28128 + if (file_size == 0 || file_size < new_file_size) {
28129 + goto update_and_out;
28132 + /* Update key to search for the last file item. */
28133 + set_cpu_key_k_offset(&s_item_key, file_size);
28135 + do {
28136 + /* Cut or delete file item. */
28137 + deleted =
28138 + reiserfs_cut_from_item(th, &s_search_path, &s_item_key,
28139 + inode, page, new_file_size);
28140 + if (deleted < 0) {
28141 + reiserfs_warning(inode->i_sb, "vs-5665",
28142 + "reiserfs_cut_from_item failed");
28143 + reiserfs_check_path(&s_search_path);
28144 + return 0;
28147 + RFALSE(deleted > file_size,
28148 + "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
28149 + deleted, file_size, &s_item_key);
28151 + /* Change key to search the last file item. */
28152 + file_size -= deleted;
28154 + set_cpu_key_k_offset(&s_item_key, file_size);
28156 + /*
28157 + * While there are bytes to truncate and the previous
28158 + * file item is present in the tree.
28159 + */
28161 + /*
28162 + * This loop could take a really long time, and could log
28163 + * many more blocks than a transaction can hold. So, we do
28164 + * a polite journal end here, and if the transaction needs
28165 + * ending, we make sure the file is consistent before ending
28166 + * the current trans and starting a new one
28167 + */
28168 + if (journal_transaction_should_end(th, 0) ||
28169 + reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
28170 + pathrelse(&s_search_path);
28172 + if (update_timestamps) {
28173 + inode_set_mtime_to_ts(inode,
28174 + current_time(inode));
28175 + inode_set_ctime_current(inode);
28177 + reiserfs_update_sd(th, inode);
28179 + err = journal_end(th);
28180 + if (err)
28181 + goto out;
28182 + err = journal_begin(th, inode->i_sb,
28183 + JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4);
28184 + if (err)
28185 + goto out;
28186 + reiserfs_update_inode_transaction(inode);
28188 + } while (file_size > ROUND_UP(new_file_size) &&
28189 + search_for_position_by_key(inode->i_sb, &s_item_key,
28190 + &s_search_path) == POSITION_FOUND);
28192 + RFALSE(file_size > ROUND_UP(new_file_size),
28193 + "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d",
28194 + new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
28196 +update_and_out:
28197 + if (update_timestamps) {
28198 + /* this is truncate, not file closing */
28199 + inode_set_mtime_to_ts(inode, current_time(inode));
28200 + inode_set_ctime_current(inode);
28202 + reiserfs_update_sd(th, inode);
28204 +out:
28205 + pathrelse(&s_search_path);
28206 + return err;
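The chunked-deletion loop above is a general journaling pattern: do one bounded unit of work, and when the transaction runs low on room, bring the file to a consistent state, end the transaction, and open a fresh one. A minimal stand-alone sketch of that pattern; txn_begin, txn_end and txn_nearly_full are hypothetical stand-ins, not the reiserfs journal API:

#include <stdbool.h>
#include <stdio.h>

/* stand-ins for a real journaling API */
static bool txn_nearly_full(int used) { return used >= 4; }
static void txn_end(void)   { puts("commit"); }
static void txn_begin(void) { puts("begin"); }

/* delete `total` units, restarting the transaction politely */
static void chunked_delete(int total)
{
    int used = 0;

    txn_begin();
    while (total > 0) {
        total--;                /* one bounded unit of work */
        used++;
        if (txn_nearly_full(used)) {
            /* the object must be consistent before committing */
            txn_end();
            txn_begin();
            used = 0;
        }
    }
    txn_end();
}

int main(void)
{
    chunked_delete(10);
    return 0;
}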
28209 +#ifdef CONFIG_REISERFS_CHECK
28210 +/* this makes sure that we __append__, not overwrite or add holes */
28211 +static void check_research_for_paste(struct treepath *path,
28212 + const struct cpu_key *key)
28214 + struct item_head *found_ih = tp_item_head(path);
28216 + if (is_direct_le_ih(found_ih)) {
28217 + if (le_ih_k_offset(found_ih) +
28218 + op_bytes_number(found_ih,
28219 + get_last_bh(path)->b_size) !=
28220 + cpu_key_k_offset(key)
28221 + || op_bytes_number(found_ih,
28222 + get_last_bh(path)->b_size) !=
28223 + pos_in_item(path))
28224 + reiserfs_panic(NULL, "PAP-5720", "found direct item "
28225 + "%h or position (%d) does not match "
28226 + "to key %K", found_ih,
28227 + pos_in_item(path), key);
28229 + if (is_indirect_le_ih(found_ih)) {
28230 + if (le_ih_k_offset(found_ih) +
28231 + op_bytes_number(found_ih,
28232 + get_last_bh(path)->b_size) !=
28233 + cpu_key_k_offset(key)
28234 + || I_UNFM_NUM(found_ih) != pos_in_item(path)
28235 + || get_ih_free_space(found_ih) != 0)
28236 + reiserfs_panic(NULL, "PAP-5730", "found indirect "
28237 + "item (%h) or position (%d) does not "
28238 + "match to key (%K)",
28239 + found_ih, pos_in_item(path), key);
28242 +#endif /* CONFIG_REISERFS_CHECK */
28245 + * Paste bytes into the existing item.
28246 + * Returns the number of bytes pasted into the item.
28247 + */
28248 +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
28249 + /* Path to the pasted item. */
28250 + struct treepath *search_path,
28251 + /* Key to search for the needed item. */
28252 + const struct cpu_key *key,
28253 + /* Inode item belongs to */
28254 + struct inode *inode,
28255 + /* Pointer to the bytes to paste. */
28256 + const char *body,
28257 + /* Size of pasted bytes. */
28258 + int pasted_size)
28260 + struct super_block *sb = inode->i_sb;
28261 + struct tree_balance s_paste_balance;
28262 + int retval;
28263 + int fs_gen;
28264 + int depth;
28266 + BUG_ON(!th->t_trans_id);
28268 + fs_gen = get_generation(inode->i_sb);
28270 +#ifdef REISERQUOTA_DEBUG
28271 + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
28272 + "reiserquota paste_into_item(): allocating %u id=%u type=%c",
28273 + pasted_size, inode->i_uid,
28274 + key2type(&key->on_disk_key));
28275 +#endif
28277 + depth = reiserfs_write_unlock_nested(sb);
28278 + retval = dquot_alloc_space_nodirty(inode, pasted_size);
28279 + reiserfs_write_lock_nested(sb, depth);
28280 + if (retval) {
28281 + pathrelse(search_path);
28282 + return retval;
28284 + init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
28285 + pasted_size);
28286 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
28287 + s_paste_balance.key = key->on_disk_key;
28288 +#endif
28290 + /* DQUOT_* can schedule, must check before the fix_nodes */
28291 + if (fs_changed(fs_gen, inode->i_sb)) {
28292 + goto search_again;
28295 + while ((retval =
28296 + fix_nodes(M_PASTE, &s_paste_balance, NULL,
28297 + body)) == REPEAT_SEARCH) {
28298 +search_again:
28299 + /* file system changed while we were in the fix_nodes */
28300 + PROC_INFO_INC(th->t_super, paste_into_item_restarted);
28301 + retval =
28302 + search_for_position_by_key(th->t_super, key,
28303 + search_path);
28304 + if (retval == IO_ERROR) {
28305 + retval = -EIO;
28306 + goto error_out;
28308 + if (retval == POSITION_FOUND) {
28309 + reiserfs_warning(inode->i_sb, "PAP-5710",
28310 + "entry or pasted byte (%K) exists",
28311 + key);
28312 + retval = -EEXIST;
28313 + goto error_out;
28315 +#ifdef CONFIG_REISERFS_CHECK
28316 + check_research_for_paste(search_path, key);
28317 +#endif
28320 + /*
28321 + * Perform balancing after all resources are collected by fix_nodes,
28322 + * and accessing them will not risk triggering schedule.
28323 + */
28324 + if (retval == CARRY_ON) {
28325 + do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE);
28326 + return 0;
28328 + retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
28329 +error_out:
28330 + /* this also releases the path */
28331 + unfix_nodes(&s_paste_balance);
28332 +#ifdef REISERQUOTA_DEBUG
28333 + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
28334 + "reiserquota paste_into_item(): freeing %u id=%u type=%c",
28335 + pasted_size, inode->i_uid,
28336 + key2type(&key->on_disk_key));
28337 +#endif
28338 + depth = reiserfs_write_unlock_nested(sb);
28339 + dquot_free_space_nodirty(inode, pasted_size);
28340 + reiserfs_write_lock_nested(sb, depth);
28341 + return retval;
28345 + * Insert new item into the buffer at the path.
28346 + * th - active transaction handle
28347 + * path - path to the inserted item
28348 + * ih - pointer to the item header to insert
28349 + * body - pointer to the bytes to insert
28350 + */
28351 +int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
28352 + struct treepath *path, const struct cpu_key *key,
28353 + struct item_head *ih, struct inode *inode,
28354 + const char *body)
28356 + struct tree_balance s_ins_balance;
28357 + int retval;
28358 + int fs_gen = 0;
28359 + int quota_bytes = 0;
28361 + BUG_ON(!th->t_trans_id);
28363 + if (inode) { /* Do we count quotas for item? */
28364 + int depth;
28365 + fs_gen = get_generation(inode->i_sb);
28366 + quota_bytes = ih_item_len(ih);
28368 + /*
28369 + * hack so the quota code doesn't have to guess
28370 + * if the file has a tail: links are always tails,
28371 + * so there's no guessing needed
28372 + */
28373 + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih))
28374 + quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE;
28375 +#ifdef REISERQUOTA_DEBUG
28376 + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
28377 + "reiserquota insert_item(): allocating %u id=%u type=%c",
28378 + quota_bytes, inode->i_uid, head2type(ih));
28379 +#endif
28380 + /*
28381 + * We can't dirty the inode here; it would be written
28382 + * immediately, but the appropriate stat item isn't inserted yet...
28383 + */
28384 + depth = reiserfs_write_unlock_nested(inode->i_sb);
28385 + retval = dquot_alloc_space_nodirty(inode, quota_bytes);
28386 + reiserfs_write_lock_nested(inode->i_sb, depth);
28387 + if (retval) {
28388 + pathrelse(path);
28389 + return retval;
28392 + init_tb_struct(th, &s_ins_balance, th->t_super, path,
28393 + IH_SIZE + ih_item_len(ih));
28394 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
28395 + s_ins_balance.key = key->on_disk_key;
28396 +#endif
28397 + /*
28398 + * DQUOT_* can schedule, must check to be sure calling
28399 + * fix_nodes is safe
28400 + */
28401 + if (inode && fs_changed(fs_gen, inode->i_sb)) {
28402 + goto search_again;
28405 + while ((retval =
28406 + fix_nodes(M_INSERT, &s_ins_balance, ih,
28407 + body)) == REPEAT_SEARCH) {
28408 +search_again:
28409 + /* file system changed while we were in the fix_nodes */
28410 + PROC_INFO_INC(th->t_super, insert_item_restarted);
28411 + retval = search_item(th->t_super, key, path);
28412 + if (retval == IO_ERROR) {
28413 + retval = -EIO;
28414 + goto error_out;
28416 + if (retval == ITEM_FOUND) {
28417 + reiserfs_warning(th->t_super, "PAP-5760",
28418 + "key %K already exists in the tree",
28419 + key);
28420 + retval = -EEXIST;
28421 + goto error_out;
28425 + /* perform balancing once all resources have been collected by fix_nodes */
28426 + if (retval == CARRY_ON) {
28427 + do_balance(&s_ins_balance, ih, body, M_INSERT);
28428 + return 0;
28431 + retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
28432 +error_out:
28433 + /* also releases the path */
28434 + unfix_nodes(&s_ins_balance);
28435 +#ifdef REISERQUOTA_DEBUG
28436 + if (inode)
28437 + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
28438 + "reiserquota insert_item(): freeing %u id=%u type=%c",
28439 + quota_bytes, inode->i_uid, head2type(ih));
28440 +#endif
28441 + if (inode) {
28442 + int depth = reiserfs_write_unlock_nested(inode->i_sb);
28443 + dquot_free_space_nodirty(inode, quota_bytes);
28444 + reiserfs_write_lock_nested(inode->i_sb, depth);
28446 + return retval;
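reiserfs_paste_into_item() and reiserfs_insert_item() above share one optimistic-concurrency shape: snapshot a generation counter, perform calls that may sleep, and re-run the search whenever the tree changed underneath (REPEAT_SEARCH). A toy sketch of that shape under simplified assumptions; search and fix_nodes here are stand-ins, not the kernel functions:

#include <stdio.h>

static int generation;          /* bumped whenever the "tree" changes */

static void search(void) { puts("search_by_key"); }

/* stand-in for fix_nodes(): fails once to force a re-search */
static int fix_nodes(void)
{
    static int raced;
    if (!raced++)
        return -1;              /* REPEAT_SEARCH */
    return 0;                   /* CARRY_ON */
}

int main(void)
{
    int gen = generation;

    search();
    generation++;               /* a sleeping call (e.g. quota alloc) raced */
    if (gen != generation)      /* fs_changed(): the path may be stale */
        search();
    while (fix_nodes() != 0)
        search();               /* tree changed under us: search again */
    puts("do_balance");
    return 0;
}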
28448 diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
28449 new file mode 100644
28450 index 000000000000..ab76468da02d
28451 --- /dev/null
28452 +++ b/fs/reiserfs/super.c
28453 @@ -0,0 +1,2646 @@
28455 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
28457 + * Trivial changes by Alan Cox to add the LFS fixes
28459 + * Trivial Changes:
28460 + * Rights granted to Hans Reiser to redistribute under other terms providing
28461 + * he accepts all liability including but not limited to patent, fitness
28462 + * for purpose, and direct or indirect claims arising from failure to perform.
28464 + * NO WARRANTY
28465 + */
28467 +#include <linux/module.h>
28468 +#include <linux/slab.h>
28469 +#include <linux/vmalloc.h>
28470 +#include <linux/time.h>
28471 +#include <linux/uaccess.h>
28472 +#include "reiserfs.h"
28473 +#include "acl.h"
28474 +#include "xattr.h"
28475 +#include <linux/init.h>
28476 +#include <linux/blkdev.h>
28477 +#include <linux/backing-dev.h>
28478 +#include <linux/buffer_head.h>
28479 +#include <linux/exportfs.h>
28480 +#include <linux/quotaops.h>
28481 +#include <linux/vfs.h>
28482 +#include <linux/mount.h>
28483 +#include <linux/namei.h>
28484 +#include <linux/crc32.h>
28485 +#include <linux/seq_file.h>
28487 +struct file_system_type reiserfs_fs_type;
28489 +static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING;
28490 +static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING;
28491 +static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING;
28493 +int is_reiserfs_3_5(struct reiserfs_super_block *rs)
28495 + return !strncmp(rs->s_v1.s_magic, reiserfs_3_5_magic_string,
28496 + strlen(reiserfs_3_5_magic_string));
28499 +int is_reiserfs_3_6(struct reiserfs_super_block *rs)
28501 + return !strncmp(rs->s_v1.s_magic, reiserfs_3_6_magic_string,
28502 + strlen(reiserfs_3_6_magic_string));
28505 +int is_reiserfs_jr(struct reiserfs_super_block *rs)
28507 + return !strncmp(rs->s_v1.s_magic, reiserfs_jr_magic_string,
28508 + strlen(reiserfs_jr_magic_string));
28511 +static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
28513 + return (is_reiserfs_3_5(rs) || is_reiserfs_3_6(rs) ||
28514 + is_reiserfs_jr(rs));
28517 +static int reiserfs_remount(struct super_block *s, int *flags, char *data);
28518 +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
28520 +static int reiserfs_sync_fs(struct super_block *s, int wait)
28522 + struct reiserfs_transaction_handle th;
28524 + /*
28525 + * Writeback quota in non-journalled quota case - journalled quota has
28526 + * no dirty dquots
28527 + */
28528 + dquot_writeback_dquots(s, -1);
28529 + reiserfs_write_lock(s);
28530 + if (!journal_begin(&th, s, 1))
28531 + if (!journal_end_sync(&th))
28532 + reiserfs_flush_old_commits(s);
28533 + reiserfs_write_unlock(s);
28534 + return 0;
28537 +static void flush_old_commits(struct work_struct *work)
28539 + struct reiserfs_sb_info *sbi;
28540 + struct super_block *s;
28542 + sbi = container_of(work, struct reiserfs_sb_info, old_work.work);
28543 + s = sbi->s_journal->j_work_sb;
28545 + /*
28546 + * We need s_umount for protecting quota writeback. We have to use
28547 + * trylock as reiserfs_cancel_old_flush() may be waiting for this work
28548 + * to complete with s_umount held.
28549 + */
28550 + if (!down_read_trylock(&s->s_umount)) {
28551 + /* Requeue work if we are not cancelling it */
28552 + spin_lock(&sbi->old_work_lock);
28553 + if (sbi->work_queued == 1)
28554 + queue_delayed_work(system_long_wq, &sbi->old_work, HZ);
28555 + spin_unlock(&sbi->old_work_lock);
28556 + return;
28558 + spin_lock(&sbi->old_work_lock);
28559 + /* Avoid clobbering the cancel state... */
28560 + if (sbi->work_queued == 1)
28561 + sbi->work_queued = 0;
28562 + spin_unlock(&sbi->old_work_lock);
28564 + reiserfs_sync_fs(s, 1);
28565 + up_read(&s->s_umount);
28568 +void reiserfs_schedule_old_flush(struct super_block *s)
28570 + struct reiserfs_sb_info *sbi = REISERFS_SB(s);
28571 + unsigned long delay;
28573 + /*
28574 + * Avoid scheduling a flush when the sb is being shut down. It can
28575 + * race with journal shutdown and free still-queued delayed work.
28576 + */
28577 + if (sb_rdonly(s) || !(s->s_flags & SB_ACTIVE))
28578 + return;
28580 + spin_lock(&sbi->old_work_lock);
28581 + if (!sbi->work_queued) {
28582 + delay = msecs_to_jiffies(dirty_writeback_interval * 10);
28583 + queue_delayed_work(system_long_wq, &sbi->old_work, delay);
28584 + sbi->work_queued = 1;
28586 + spin_unlock(&sbi->old_work_lock);
28589 +void reiserfs_cancel_old_flush(struct super_block *s)
28591 + struct reiserfs_sb_info *sbi = REISERFS_SB(s);
28593 + spin_lock(&sbi->old_work_lock);
28594 + /* Make sure no new flushes will be queued */
28595 + sbi->work_queued = 2;
28596 + spin_unlock(&sbi->old_work_lock);
28597 + cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
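work_queued above acts as a small state machine guarded by old_work_lock: 0 = idle, 1 = queued, 2 = cancelling, and nothing may requeue once cancelling is set. A compact single-threaded sketch of the same idea (the enum names are illustrative; the kernel uses plain integers and a spinlock):

#include <stdio.h>

enum flush_state { IDLE = 0, QUEUED = 1, CANCELLING = 2 };

static enum flush_state state = IDLE;   /* protected by a lock in real code */

static void schedule_flush(void)
{
    if (state == IDLE) {        /* only queue when idle */
        state = QUEUED;
        puts("queued");
    }
}

static void flush_worker(void)
{
    if (state == QUEUED)        /* don't clobber the CANCELLING state */
        state = IDLE;
    puts("flushed");
}

static void cancel_flush(void)
{
    state = CANCELLING;         /* no new flushes may be queued */
    puts("cancelled");
}

int main(void)
{
    schedule_flush();
    flush_worker();
    cancel_flush();
    schedule_flush();           /* ignored: state != IDLE */
    return 0;
}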
28600 +static int reiserfs_freeze(struct super_block *s)
28602 + struct reiserfs_transaction_handle th;
28604 + reiserfs_cancel_old_flush(s);
28606 + reiserfs_write_lock(s);
28607 + if (!sb_rdonly(s)) {
28608 + int err = journal_begin(&th, s, 1);
28609 + if (err) {
28610 + reiserfs_block_writes(&th);
28611 + } else {
28612 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
28613 + 1);
28614 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
28615 + reiserfs_block_writes(&th);
28616 + journal_end_sync(&th);
28619 + reiserfs_write_unlock(s);
28620 + return 0;
28623 +static int reiserfs_unfreeze(struct super_block *s)
28625 + struct reiserfs_sb_info *sbi = REISERFS_SB(s);
28627 + reiserfs_allow_writes(s);
28628 + spin_lock(&sbi->old_work_lock);
28629 + /* Allow old_work to run again */
28630 + sbi->work_queued = 0;
28631 + spin_unlock(&sbi->old_work_lock);
28632 + return 0;
28635 +extern const struct in_core_key MAX_IN_CORE_KEY;
28638 + * this is used to delete a "save link" when there are no items left
28639 + * of the file it points to. That can happen either if the unlink
28640 + * completed but the "save link" removal did not, or if the file had
28641 + * both an unlink and a truncate pending and the unlink completed
28642 + * first (because the key of the "save link" protecting the unlink is
28643 + * bigger than the key of the "save link" protecting the truncate),
28644 + * so no items were left for the truncate to complete on
28645 + */
28646 +static int remove_save_link_only(struct super_block *s,
28647 + struct reiserfs_key *key, int oid_free)
28649 + struct reiserfs_transaction_handle th;
28650 + int err;
28652 + /* we are going to do one balancing */
28653 + err = journal_begin(&th, s, JOURNAL_PER_BALANCE_CNT);
28654 + if (err)
28655 + return err;
28657 + reiserfs_delete_solid_item(&th, NULL, key);
28658 + if (oid_free)
28659 + /* removals are protected by direct items */
28660 + reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid));
28662 + return journal_end(&th);
28665 +#ifdef CONFIG_QUOTA
28666 +static int reiserfs_quota_on_mount(struct super_block *, int);
28667 +#endif
28670 + * Look for uncompleted unlinks and truncates and complete them
28672 + * Called with the superblock write locked. If quotas are enabled, we have
28673 + * to release/retake the lock, lest we call dquot_quota_on_mount(), proceed to
28674 + * schedule_on_each_cpu() in invalidate_bdev() and deadlock waiting for the per
28675 + * cpu worklets to complete flush_async_commits() that in turn wait for the
28676 + * superblock write lock.
28677 + */
28678 +static int finish_unfinished(struct super_block *s)
28680 + INITIALIZE_PATH(path);
28681 + struct cpu_key max_cpu_key, obj_key;
28682 + struct reiserfs_key save_link_key, last_inode_key;
28683 + int retval = 0;
28684 + struct item_head *ih;
28685 + struct buffer_head *bh;
28686 + int item_pos;
28687 + char *item;
28688 + int done;
28689 + struct inode *inode;
28690 + int truncate;
28691 +#ifdef CONFIG_QUOTA
28692 + int i;
28693 + int ms_active_set;
28694 + int quota_enabled[REISERFS_MAXQUOTAS];
28695 +#endif
28697 + /* compose key to look for "save" links */
28698 + max_cpu_key.version = KEY_FORMAT_3_5;
28699 + max_cpu_key.on_disk_key.k_dir_id = ~0U;
28700 + max_cpu_key.on_disk_key.k_objectid = ~0U;
28701 + set_cpu_key_k_offset(&max_cpu_key, ~0U);
28702 + max_cpu_key.key_length = 3;
28704 + memset(&last_inode_key, 0, sizeof(last_inode_key));
28706 +#ifdef CONFIG_QUOTA
28707 + /* Needed for iput() to work correctly and not trash data */
28708 + if (s->s_flags & SB_ACTIVE) {
28709 + ms_active_set = 0;
28710 + } else {
28711 + ms_active_set = 1;
28712 + s->s_flags |= SB_ACTIVE;
28714 + /* Turn on quotas so that they are updated correctly */
28715 + for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
28716 + quota_enabled[i] = 1;
28717 + if (REISERFS_SB(s)->s_qf_names[i]) {
28718 + int ret;
28720 + if (sb_has_quota_active(s, i)) {
28721 + quota_enabled[i] = 0;
28722 + continue;
28724 + reiserfs_write_unlock(s);
28725 + ret = reiserfs_quota_on_mount(s, i);
28726 + reiserfs_write_lock(s);
28727 + if (ret < 0)
28728 + reiserfs_warning(s, "reiserfs-2500",
28729 + "cannot turn on journaled "
28730 + "quota: error %d", ret);
28733 +#endif
28735 + done = 0;
28736 + REISERFS_SB(s)->s_is_unlinked_ok = 1;
28737 + while (!retval) {
28738 + int depth;
28739 + retval = search_item(s, &max_cpu_key, &path);
28740 + if (retval != ITEM_NOT_FOUND) {
28741 + reiserfs_error(s, "vs-2140",
28742 + "search_by_key returned %d", retval);
28743 + break;
28746 + bh = get_last_bh(&path);
28747 + item_pos = get_item_pos(&path);
28748 + if (item_pos != B_NR_ITEMS(bh)) {
28749 + reiserfs_warning(s, "vs-2060",
28750 + "wrong position found");
28751 + break;
28753 + item_pos--;
28754 + ih = item_head(bh, item_pos);
28756 + if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID)
28757 + /* there are no "save" links anymore */
28758 + break;
28760 + save_link_key = ih->ih_key;
28761 + if (is_indirect_le_ih(ih))
28762 + truncate = 1;
28763 + else
28764 + truncate = 0;
28766 + /* reiserfs_iget needs k_dirid and k_objectid only */
28767 + item = ih_item_body(bh, ih);
28768 + obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item);
28769 + obj_key.on_disk_key.k_objectid =
28770 + le32_to_cpu(ih->ih_key.k_objectid);
28771 + obj_key.on_disk_key.k_offset = 0;
28772 + obj_key.on_disk_key.k_type = 0;
28774 + pathrelse(&path);
28776 + inode = reiserfs_iget(s, &obj_key);
28777 + if (IS_ERR_OR_NULL(inode)) {
28778 + /*
28779 + * the unlink almost completed; it just did not
28780 + * manage to remove the "save" link and release the objectid
28781 + */
28782 + reiserfs_warning(s, "vs-2180", "iget failed for %K",
28783 + &obj_key);
28784 + retval = remove_save_link_only(s, &save_link_key, 1);
28785 + continue;
28788 + if (!truncate && inode->i_nlink) {
28789 + /* file is not unlinked */
28790 + reiserfs_warning(s, "vs-2185",
28791 + "file %K is not unlinked",
28792 + &obj_key);
28793 + retval = remove_save_link_only(s, &save_link_key, 0);
28794 + continue;
28796 + depth = reiserfs_write_unlock_nested(inode->i_sb);
28797 + dquot_initialize(inode);
28798 + reiserfs_write_lock_nested(inode->i_sb, depth);
28800 + if (truncate && S_ISDIR(inode->i_mode)) {
28801 + /*
28802 + * We got a truncate request for a dir, which
28803 + * is impossible. The only imaginable way is to
28804 + * execute an unfinished truncate request, then boot
28805 + * into an old kernel, remove the file, and create a dir
28806 + * with the same key.
28807 + */
28808 + reiserfs_warning(s, "green-2101",
28809 + "impossible truncate on a "
28810 + "directory %k. Please report",
28811 + INODE_PKEY(inode));
28812 + retval = remove_save_link_only(s, &save_link_key, 0);
28813 + truncate = 0;
28814 + iput(inode);
28815 + continue;
28818 + if (truncate) {
28819 + REISERFS_I(inode)->i_flags |=
28820 + i_link_saved_truncate_mask;
28821 + /*
28822 + * an uncompleted truncate was found. The new size was
28823 + * committed together with the "save" link
28824 + */
28825 + reiserfs_info(s, "Truncating %k to %lld ..",
28826 + INODE_PKEY(inode), inode->i_size);
28828 + /* don't update modification time */
28829 + reiserfs_truncate_file(inode, 0);
28831 + retval = remove_save_link(inode, truncate);
28832 + } else {
28833 + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
28834 + /* an uncompleted unlink (rmdir) was found */
28835 + reiserfs_info(s, "Removing %k..", INODE_PKEY(inode));
28836 + if (memcmp(&last_inode_key, INODE_PKEY(inode),
28837 + sizeof(last_inode_key))){
28838 + last_inode_key = *INODE_PKEY(inode);
28839 + /* removal gets completed in iput */
28840 + retval = 0;
28841 + } else {
28842 + reiserfs_warning(s, "super-2189", "Dead loop "
28843 + "in finish_unfinished "
28844 + "detected, just remove "
28845 + "save link\n");
28846 + retval = remove_save_link_only(s,
28847 + &save_link_key, 0);
28851 + iput(inode);
28852 + printk("done\n");
28853 + done++;
28855 + REISERFS_SB(s)->s_is_unlinked_ok = 0;
28857 +#ifdef CONFIG_QUOTA
28858 + /* Turn quotas off */
28859 + reiserfs_write_unlock(s);
28860 + for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
28861 + if (sb_dqopt(s)->files[i] && quota_enabled[i])
28862 + dquot_quota_off(s, i);
28864 + reiserfs_write_lock(s);
28865 + if (ms_active_set)
28866 + /* Restore the flag back */
28867 + s->s_flags &= ~SB_ACTIVE;
28868 +#endif
28869 + pathrelse(&path);
28870 + if (done)
28871 + reiserfs_info(s, "There were %d uncompleted unlinks/truncates. "
28872 + "Completed\n", done);
28873 + return retval;
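finish_unfinished() above locates save links by searching for the maximal possible key and stepping back one item; since every save link carries dir_id == MAX_KEY_OBJECTID, they all cluster at the very end of the sorted tree. A toy version of that end-of-array scan; the sentinel value and array stand in for the on-disk tree:

#include <stdio.h>

#define SENTINEL 0xffffu        /* plays the role of MAX_KEY_OBJECTID */

int main(void)
{
    /* sorted "tree": ordinary keys first, save links at the end */
    unsigned keys[] = { 7, 42, 99, SENTINEL, SENTINEL };
    int n = sizeof(keys) / sizeof(keys[0]);

    /* repeatedly look at the last item; stop at the first real key */
    while (n > 0 && keys[n - 1] == SENTINEL) {
        printf("processing save link #%d\n", n - 1);
        n--;                    /* in the fs this is a delete + re-search */
    }
    puts("no save links left");
    return 0;
}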
28877 + * to protect a file being unlinked from getting lost, we "save"-link files
28878 + * being unlinked. This link is deleted in the same transaction as the last
28879 + * item of the file. While mounting the filesystem we scan all these links
28880 + * and remove the files which almost got lost
28881 + */
28882 +void add_save_link(struct reiserfs_transaction_handle *th,
28883 + struct inode *inode, int truncate)
28885 + INITIALIZE_PATH(path);
28886 + int retval;
28887 + struct cpu_key key;
28888 + struct item_head ih;
28889 + __le32 link;
28891 + BUG_ON(!th->t_trans_id);
28893 + /* file can only get one "save link" of each kind */
28894 + RFALSE(truncate &&
28895 + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask),
28896 + "saved link already exists for truncated inode %lx",
28897 + (long)inode->i_ino);
28898 + RFALSE(!truncate &&
28899 + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask),
28900 + "saved link already exists for unlinked inode %lx",
28901 + (long)inode->i_ino);
28903 + /* setup key of "save" link */
28904 + key.version = KEY_FORMAT_3_5;
28905 + key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID;
28906 + key.on_disk_key.k_objectid = inode->i_ino;
28907 + if (!truncate) {
28908 + /* unlink, rmdir, rename */
28909 + set_cpu_key_k_offset(&key, 1 + inode->i_sb->s_blocksize);
28910 + set_cpu_key_k_type(&key, TYPE_DIRECT);
28912 + /* item head of "save" link */
28913 + make_le_item_head(&ih, &key, key.version,
28914 + 1 + inode->i_sb->s_blocksize, TYPE_DIRECT,
28915 + 4 /*length */ , 0xffff /*free space */ );
28916 + } else {
28917 + /* truncate */
28918 + if (S_ISDIR(inode->i_mode))
28919 + reiserfs_warning(inode->i_sb, "green-2102",
28920 + "Adding a truncate savelink for "
28921 + "a directory %k! Please report",
28922 + INODE_PKEY(inode));
28923 + set_cpu_key_k_offset(&key, 1);
28924 + set_cpu_key_k_type(&key, TYPE_INDIRECT);
28926 + /* item head of "save" link */
28927 + make_le_item_head(&ih, &key, key.version, 1, TYPE_INDIRECT,
28928 + 4 /*length */ , 0 /*free space */ );
28930 + key.key_length = 3;
28932 + /* look for its place in the tree */
28933 + retval = search_item(inode->i_sb, &key, &path);
28934 + if (retval != ITEM_NOT_FOUND) {
28935 + if (retval != -ENOSPC)
28936 + reiserfs_error(inode->i_sb, "vs-2100",
28937 + "search_by_key (%K) returned %d", &key,
28938 + retval);
28939 + pathrelse(&path);
28940 + return;
28943 + /* body of "save" link */
28944 + link = INODE_PKEY(inode)->k_dir_id;
28946 + /* put "save" link into tree, don't charge quota to anyone */
28947 + retval =
28948 + reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link);
28949 + if (retval) {
28950 + if (retval != -ENOSPC)
28951 + reiserfs_error(inode->i_sb, "vs-2120",
28952 + "insert_item returned %d", retval);
28953 + } else {
28954 + if (truncate)
28955 + REISERFS_I(inode)->i_flags |=
28956 + i_link_saved_truncate_mask;
28957 + else
28958 + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
28962 +/* this opens a transaction, unlike add_save_link */
28963 +int remove_save_link(struct inode *inode, int truncate)
28965 + struct reiserfs_transaction_handle th;
28966 + struct reiserfs_key key;
28967 + int err;
28969 + /* we are going to do one balancing only */
28970 + err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
28971 + if (err)
28972 + return err;
28974 + /* setup key of "save" link */
28975 + key.k_dir_id = cpu_to_le32(MAX_KEY_OBJECTID);
28976 + key.k_objectid = INODE_PKEY(inode)->k_objectid;
28977 + if (!truncate) {
28978 + /* unlink, rmdir, rename */
28979 + set_le_key_k_offset(KEY_FORMAT_3_5, &key,
28980 + 1 + inode->i_sb->s_blocksize);
28981 + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_DIRECT);
28982 + } else {
28983 + /* truncate */
28984 + set_le_key_k_offset(KEY_FORMAT_3_5, &key, 1);
28985 + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_INDIRECT);
28988 + if ((truncate &&
28989 + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask)) ||
28990 + (!truncate &&
28991 + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask)))
28992 + /* don't take quota bytes from anywhere */
28993 + reiserfs_delete_solid_item(&th, NULL, &key);
28994 + if (!truncate) {
28995 + reiserfs_release_objectid(&th, inode->i_ino);
28996 + REISERFS_I(inode)->i_flags &= ~i_link_saved_unlink_mask;
28997 + } else
28998 + REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask;
29000 + return journal_end(&th);
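The save-link key convention used by add_save_link() and remove_save_link() above fits in a few lines: dir_id is pinned to MAX_KEY_OBJECTID so the links sort last, objectid is the inode's, and the offset/type pair distinguishes unlink (blocksize + 1, direct) from truncate (1, indirect), which is also why the unlink link's key compares greater. A stand-alone illustration with simplified types, not the on-disk structs:

#include <stdint.h>
#include <stdio.h>

#define MAX_KEY_OBJECTID 0xffffffffu    /* sorts after every real file */

struct save_link_key {                  /* simplified, not the on-disk layout */
    uint32_t dir_id;
    uint32_t objectid;
    uint64_t offset;
    int      is_indirect;               /* indirect item => truncate link */
};

static struct save_link_key make_save_link(uint32_t ino, int truncate,
                                           unsigned blocksize)
{
    struct save_link_key k = {
        .dir_id      = MAX_KEY_OBJECTID,
        .objectid    = ino,
        .offset      = truncate ? 1 : 1 + blocksize,
        .is_indirect = truncate,
    };
    return k;
}

int main(void)
{
    struct save_link_key u = make_save_link(42, 0, 4096);
    struct save_link_key t = make_save_link(42, 1, 4096);

    /* the unlink key sorts above the truncate key for the same inode */
    printf("unlink offset %llu > truncate offset %llu\n",
           (unsigned long long)u.offset, (unsigned long long)t.offset);
    return 0;
}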
29003 +static void reiserfs_kill_sb(struct super_block *s)
29005 + if (REISERFS_SB(s)) {
29006 + reiserfs_proc_info_done(s);
29007 + /*
29008 + * Force any pending inode evictions to occur now. Any
29009 + * inodes to be removed that have extended attributes
29010 + * associated with them need to clean them up before
29011 + * we can release the extended attribute root dentries.
29012 + * shrink_dcache_for_umount will BUG if we don't release
29013 + * those before it's called so ->put_super is too late.
29014 + */
29015 + shrink_dcache_sb(s);
29017 + dput(REISERFS_SB(s)->xattr_root);
29018 + REISERFS_SB(s)->xattr_root = NULL;
29019 + dput(REISERFS_SB(s)->priv_root);
29020 + REISERFS_SB(s)->priv_root = NULL;
29023 + kill_block_super(s);
29026 +#ifdef CONFIG_QUOTA
29027 +static int reiserfs_quota_off(struct super_block *sb, int type);
29029 +static void reiserfs_quota_off_umount(struct super_block *s)
29031 + int type;
29033 + for (type = 0; type < REISERFS_MAXQUOTAS; type++)
29034 + reiserfs_quota_off(s, type);
29036 +#else
29037 +static inline void reiserfs_quota_off_umount(struct super_block *s)
29040 +#endif
29042 +static void reiserfs_put_super(struct super_block *s)
29044 + struct reiserfs_transaction_handle th;
29045 + th.t_trans_id = 0;
29047 + reiserfs_quota_off_umount(s);
29049 + reiserfs_write_lock(s);
29051 + /*
29052 + * change file system state to current state if it was mounted
29053 + * with read-write permissions
29054 + */
29055 + if (!sb_rdonly(s)) {
29056 + if (!journal_begin(&th, s, 10)) {
29057 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
29058 + 1);
29059 + set_sb_umount_state(SB_DISK_SUPER_BLOCK(s),
29060 + REISERFS_SB(s)->s_mount_state);
29061 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
29065 + /*
29066 + * note, journal_release checks for readonly mount, and can
29067 + * decide not to do a journal_end
29068 + */
29069 + journal_release(&th, s);
29071 + reiserfs_free_bitmap_cache(s);
29073 + brelse(SB_BUFFER_WITH_SB(s));
29075 + print_statistics(s);
29077 + if (REISERFS_SB(s)->reserved_blocks != 0) {
29078 + reiserfs_warning(s, "green-2005", "reserved blocks left %d",
29079 + REISERFS_SB(s)->reserved_blocks);
29082 + reiserfs_write_unlock(s);
29083 + mutex_destroy(&REISERFS_SB(s)->lock);
29084 + destroy_workqueue(REISERFS_SB(s)->commit_wq);
29085 + kfree(REISERFS_SB(s)->s_jdev);
29086 + kfree(s->s_fs_info);
29087 + s->s_fs_info = NULL;
29090 +static struct kmem_cache *reiserfs_inode_cachep;
29092 +static struct inode *reiserfs_alloc_inode(struct super_block *sb)
29094 + struct reiserfs_inode_info *ei;
29095 + ei = alloc_inode_sb(sb, reiserfs_inode_cachep, GFP_KERNEL);
29096 + if (!ei)
29097 + return NULL;
29098 + atomic_set(&ei->openers, 0);
29099 + mutex_init(&ei->tailpack);
29100 +#ifdef CONFIG_QUOTA
29101 + memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
29102 +#endif
29104 + return &ei->vfs_inode;
29107 +static void reiserfs_free_inode(struct inode *inode)
29109 + kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
29112 +static void init_once(void *foo)
29114 + struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
29116 + INIT_LIST_HEAD(&ei->i_prealloc_list);
29117 + inode_init_once(&ei->vfs_inode);
29120 +static int __init init_inodecache(void)
29122 + reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
29123 + sizeof(struct
29124 + reiserfs_inode_info),
29125 + 0, (SLAB_RECLAIM_ACCOUNT|
29126 + SLAB_ACCOUNT),
29127 + init_once);
29128 + if (reiserfs_inode_cachep == NULL)
29129 + return -ENOMEM;
29130 + return 0;
29133 +static void destroy_inodecache(void)
29135 + /*
29136 + * Make sure all delayed rcu free inodes are flushed before we
29137 + * destroy cache.
29138 + */
29139 + rcu_barrier();
29140 + kmem_cache_destroy(reiserfs_inode_cachep);
29143 +/* we don't mark inodes dirty, we just log them */
29144 +static void reiserfs_dirty_inode(struct inode *inode, int flags)
29146 + struct reiserfs_transaction_handle th;
29148 + int err = 0;
29150 + if (sb_rdonly(inode->i_sb)) {
29151 + reiserfs_warning(inode->i_sb, "clm-6006",
29152 + "writing inode %lu on readonly FS",
29153 + inode->i_ino);
29154 + return;
29156 + reiserfs_write_lock(inode->i_sb);
29158 + /*
29159 + * this is really only used for atime updates, so they don't have
29160 + * to be included in O_SYNC or fsync
29161 + */
29162 + err = journal_begin(&th, inode->i_sb, 1);
29163 + if (err)
29164 + goto out;
29166 + reiserfs_update_sd(&th, inode);
29167 + journal_end(&th);
29169 +out:
29170 + reiserfs_write_unlock(inode->i_sb);
29173 +static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
29175 + struct super_block *s = root->d_sb;
29176 + struct reiserfs_journal *journal = SB_JOURNAL(s);
29177 + long opts = REISERFS_SB(s)->s_mount_opt;
29179 + if (opts & (1 << REISERFS_LARGETAIL))
29180 + seq_puts(seq, ",tails=on");
29181 + else if (!(opts & (1 << REISERFS_SMALLTAIL)))
29182 + seq_puts(seq, ",notail");
29183 + /* tails=small is default so we don't show it */
29185 + if (!(opts & (1 << REISERFS_BARRIER_FLUSH)))
29186 + seq_puts(seq, ",barrier=none");
29187 + /* barrier=flush is default so we don't show it */
29189 + if (opts & (1 << REISERFS_ERROR_CONTINUE))
29190 + seq_puts(seq, ",errors=continue");
29191 + else if (opts & (1 << REISERFS_ERROR_PANIC))
29192 + seq_puts(seq, ",errors=panic");
29193 + /* errors=ro is default so we don't show it */
29195 + if (opts & (1 << REISERFS_DATA_LOG))
29196 + seq_puts(seq, ",data=journal");
29197 + else if (opts & (1 << REISERFS_DATA_WRITEBACK))
29198 + seq_puts(seq, ",data=writeback");
29199 + /* data=ordered is default so we don't show it */
29201 + if (opts & (1 << REISERFS_ATTRS))
29202 + seq_puts(seq, ",attrs");
29204 + if (opts & (1 << REISERFS_XATTRS_USER))
29205 + seq_puts(seq, ",user_xattr");
29207 + if (opts & (1 << REISERFS_EXPOSE_PRIVROOT))
29208 + seq_puts(seq, ",expose_privroot");
29210 + if (opts & (1 << REISERFS_POSIXACL))
29211 + seq_puts(seq, ",acl");
29213 + if (REISERFS_SB(s)->s_jdev)
29214 + seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev);
29216 + if (journal->j_max_commit_age != journal->j_default_max_commit_age)
29217 + seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
29219 +#ifdef CONFIG_QUOTA
29220 + if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
29221 + seq_show_option(seq, "usrjquota",
29222 + REISERFS_SB(s)->s_qf_names[USRQUOTA]);
29223 + else if (opts & (1 << REISERFS_USRQUOTA))
29224 + seq_puts(seq, ",usrquota");
29225 + if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
29226 + seq_show_option(seq, "grpjquota",
29227 + REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
29228 + else if (opts & (1 << REISERFS_GRPQUOTA))
29229 + seq_puts(seq, ",grpquota");
29230 + if (REISERFS_SB(s)->s_jquota_fmt) {
29231 + if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_OLD)
29232 + seq_puts(seq, ",jqfmt=vfsold");
29233 + else if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_V0)
29234 + seq_puts(seq, ",jqfmt=vfsv0");
29236 +#endif
29238 + /* Block allocator options */
29239 + if (opts & (1 << REISERFS_NO_BORDER))
29240 + seq_puts(seq, ",block-allocator=noborder");
29241 + if (opts & (1 << REISERFS_NO_UNHASHED_RELOCATION))
29242 + seq_puts(seq, ",block-allocator=no_unhashed_relocation");
29243 + if (opts & (1 << REISERFS_HASHED_RELOCATION))
29244 + seq_puts(seq, ",block-allocator=hashed_relocation");
29245 + if (opts & (1 << REISERFS_TEST4))
29246 + seq_puts(seq, ",block-allocator=test4");
29247 + show_alloc_options(seq, s);
29248 + return 0;
29251 +#ifdef CONFIG_QUOTA
29252 +static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
29253 + size_t, loff_t);
29254 +static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t,
29255 + loff_t);
29257 +static struct dquot __rcu **reiserfs_get_dquots(struct inode *inode)
29259 + return REISERFS_I(inode)->i_dquot;
29261 +#endif
29263 +static const struct super_operations reiserfs_sops = {
29264 + .alloc_inode = reiserfs_alloc_inode,
29265 + .free_inode = reiserfs_free_inode,
29266 + .write_inode = reiserfs_write_inode,
29267 + .dirty_inode = reiserfs_dirty_inode,
29268 + .evict_inode = reiserfs_evict_inode,
29269 + .put_super = reiserfs_put_super,
29270 + .sync_fs = reiserfs_sync_fs,
29271 + .freeze_fs = reiserfs_freeze,
29272 + .unfreeze_fs = reiserfs_unfreeze,
29273 + .statfs = reiserfs_statfs,
29274 + .remount_fs = reiserfs_remount,
29275 + .show_options = reiserfs_show_options,
29276 +#ifdef CONFIG_QUOTA
29277 + .quota_read = reiserfs_quota_read,
29278 + .quota_write = reiserfs_quota_write,
29279 + .get_dquots = reiserfs_get_dquots,
29280 +#endif
29283 +#ifdef CONFIG_QUOTA
29284 +#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
29286 +static int reiserfs_write_dquot(struct dquot *);
29287 +static int reiserfs_acquire_dquot(struct dquot *);
29288 +static int reiserfs_release_dquot(struct dquot *);
29289 +static int reiserfs_mark_dquot_dirty(struct dquot *);
29290 +static int reiserfs_write_info(struct super_block *, int);
29291 +static int reiserfs_quota_on(struct super_block *, int, int, const struct path *);
29293 +static const struct dquot_operations reiserfs_quota_operations = {
29294 + .write_dquot = reiserfs_write_dquot,
29295 + .acquire_dquot = reiserfs_acquire_dquot,
29296 + .release_dquot = reiserfs_release_dquot,
29297 + .mark_dirty = reiserfs_mark_dquot_dirty,
29298 + .write_info = reiserfs_write_info,
29299 + .alloc_dquot = dquot_alloc,
29300 + .destroy_dquot = dquot_destroy,
29301 + .get_next_id = dquot_get_next_id,
29304 +static const struct quotactl_ops reiserfs_qctl_operations = {
29305 + .quota_on = reiserfs_quota_on,
29306 + .quota_off = reiserfs_quota_off,
29307 + .quota_sync = dquot_quota_sync,
29308 + .get_state = dquot_get_state,
29309 + .set_info = dquot_set_dqinfo,
29310 + .get_dqblk = dquot_get_dqblk,
29311 + .set_dqblk = dquot_set_dqblk,
29313 +#endif
29315 +static const struct export_operations reiserfs_export_ops = {
29316 + .encode_fh = reiserfs_encode_fh,
29317 + .fh_to_dentry = reiserfs_fh_to_dentry,
29318 + .fh_to_parent = reiserfs_fh_to_parent,
29319 + .get_parent = reiserfs_get_parent,
29323 + * this struct is used in reiserfs_getopt() to describe the values of
29324 + * those mount options that take values rather than being toggles.
29325 + */
29326 +typedef struct {
29327 + char *value;
29328 + /*
29329 + * bitmask to set on the mount_options bitmask
29330 + * when this value is found; 0 if no bits are to be changed.
29331 + */
29332 + int setmask;
29333 + /*
29334 + * bitmask to clear on the mount_options bitmask
29335 + * when this value is found; 0 if no bits are to be changed.
29336 + * This is applied BEFORE setmask
29337 + */
29338 + int clrmask;
29339 +} arg_desc_t;
29341 +/* Set this bit in arg_required to allow empty arguments */
29342 +#define REISERFS_OPT_ALLOWEMPTY 31
29345 + * this struct is used in reiserfs_getopt() for describing the
29346 + * set of reiserfs mount options
29347 + */
29348 +typedef struct {
29349 + char *option_name;
29351 + /* 0 if an argument is not required, non-zero otherwise */
29352 + int arg_required;
29354 + /* list of values accepted by an option */
29355 + const arg_desc_t *values;
29357 + /*
29358 + * bitmask to set on the mount_options bitmask
29359 + * when this value is found; 0 if no bits are to be changed.
29360 + */
29361 + int setmask;
29363 + /*
29364 + * bitmask to clear on the mount_options bitmask
29365 + * when this value is found; 0 if no bits are to be changed.
29366 + * This is applied BEFORE setmask
29367 + */
29368 + int clrmask;
29369 +} opt_desc_t;
29371 +/* possible values for -o data= */
29372 +static const arg_desc_t logging_mode[] = {
29373 + {"ordered", 1 << REISERFS_DATA_ORDERED,
29374 + (1 << REISERFS_DATA_LOG | 1 << REISERFS_DATA_WRITEBACK)},
29375 + {"journal", 1 << REISERFS_DATA_LOG,
29376 + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)},
29377 + {"writeback", 1 << REISERFS_DATA_WRITEBACK,
29378 + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)},
29379 + {.value = NULL}
29382 +/* possible values for -o barrier= */
29383 +static const arg_desc_t barrier_mode[] = {
29384 + {"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH},
29385 + {"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE},
29386 + {.value = NULL}
29390 + * possible values for "-o block-allocator=" and bits which are to be set in
29391 + * s_mount_opt of reiserfs specific part of in-core super block
29392 + */
29393 +static const arg_desc_t balloc[] = {
29394 + {"noborder", 1 << REISERFS_NO_BORDER, 0},
29395 + {"border", 0, 1 << REISERFS_NO_BORDER},
29396 + {"no_unhashed_relocation", 1 << REISERFS_NO_UNHASHED_RELOCATION, 0},
29397 + {"hashed_relocation", 1 << REISERFS_HASHED_RELOCATION, 0},
29398 + {"test4", 1 << REISERFS_TEST4, 0},
29399 + {"notest4", 0, 1 << REISERFS_TEST4},
29400 + {NULL, 0, 0}
29403 +static const arg_desc_t tails[] = {
29404 + {"on", 1 << REISERFS_LARGETAIL, 1 << REISERFS_SMALLTAIL},
29405 + {"off", 0, (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
29406 + {"small", 1 << REISERFS_SMALLTAIL, 1 << REISERFS_LARGETAIL},
29407 + {NULL, 0, 0}
29410 +static const arg_desc_t error_actions[] = {
29411 + {"panic", 1 << REISERFS_ERROR_PANIC,
29412 + (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)},
29413 + {"ro-remount", 1 << REISERFS_ERROR_RO,
29414 + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)},
29415 +#ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG
29416 + {"continue", 1 << REISERFS_ERROR_CONTINUE,
29417 + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)},
29418 +#endif
29419 + {NULL, 0, 0},
29423 + * process exactly one option from the list. *cur - string containing the
29424 + * mount options
29425 + * opts - array of accepted options
29426 + * opt_arg - if the option is found, requires an argument and the argument is
29427 + * specified in the input - a pointer to the argument is stored here
29428 + * bit_flags - if the option requires a certain bit to be set - it is set here
29429 + * returns -1 if an unknown option is found, opt->arg_required otherwise
29430 + */
29431 +static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
29432 + char **opt_arg, unsigned long *bit_flags)
29434 + char *p;
29435 + /*
29436 + * foo=bar,
29437 + * ^ ^ ^
29438 + * | | +-- option_end
29439 + * | +-- arg_start
29440 + * +-- option_start
29441 + */
29442 + const opt_desc_t *opt;
29443 + const arg_desc_t *arg;
29445 + p = *cur;
29447 + /* assume argument cannot contain commas */
29448 + *cur = strchr(p, ',');
29449 + if (*cur) {
29450 + *(*cur) = '\0';
29451 + (*cur)++;
29454 + if (!strncmp(p, "alloc=", 6)) {
29455 + /*
29456 + * Ugly special case; probably we should redo the options
29457 + * parser so that it can understand several arguments for
29458 + * some options, and also fill several bitfields
29459 + * with option values.
29460 + */
29461 + if (reiserfs_parse_alloc_options(s, p + 6)) {
29462 + return -1;
29463 + } else {
29464 + return 0;
29468 + /* for every option in the list */
29469 + for (opt = opts; opt->option_name; opt++) {
29470 + if (!strncmp(p, opt->option_name, strlen(opt->option_name))) {
29471 + if (bit_flags) {
29472 + if (opt->clrmask ==
29473 + (1 << REISERFS_UNSUPPORTED_OPT))
29474 + reiserfs_warning(s, "super-6500",
29475 + "%s not supported.\n",
29476 + p);
29477 + else
29478 + *bit_flags &= ~opt->clrmask;
29479 + if (opt->setmask ==
29480 + (1 << REISERFS_UNSUPPORTED_OPT))
29481 + reiserfs_warning(s, "super-6501",
29482 + "%s not supported.\n",
29483 + p);
29484 + else
29485 + *bit_flags |= opt->setmask;
29487 + break;
29490 + if (!opt->option_name) {
29491 + reiserfs_warning(s, "super-6502",
29492 + "unknown mount option \"%s\"", p);
29493 + return -1;
29496 + p += strlen(opt->option_name);
29497 + switch (*p) {
29498 + case '=':
29499 + if (!opt->arg_required) {
29500 + reiserfs_warning(s, "super-6503",
29501 + "the option \"%s\" does not "
29502 + "require an argument\n",
29503 + opt->option_name);
29504 + return -1;
29506 + break;
29508 + case 0:
29509 + if (opt->arg_required) {
29510 + reiserfs_warning(s, "super-6504",
29511 + "the option \"%s\" requires an "
29512 + "argument\n", opt->option_name);
29513 + return -1;
29515 + break;
29516 + default:
29517 + reiserfs_warning(s, "super-6505",
29518 + "head of option \"%s\" is only correct\n",
29519 + opt->option_name);
29520 + return -1;
29523 + /*
29524 + * move to the argument, or to next option if argument is not
29525 + * required
29526 + */
29527 + p++;
29529 + if (opt->arg_required
29530 + && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY))
29531 + && !strlen(p)) {
29532 + /* this catches "option=," if not allowed */
29533 + reiserfs_warning(s, "super-6506",
29534 + "empty argument for \"%s\"\n",
29535 + opt->option_name);
29536 + return -1;
29539 + if (!opt->values) {
29540 + /* opt->values == NULL; *opt_arg contains a pointer to the argument */
29541 + *opt_arg = p;
29542 + return opt->arg_required & ~(1 << REISERFS_OPT_ALLOWEMPTY);
29545 + /* values possible for this option are listed in opt->values */
29546 + for (arg = opt->values; arg->value; arg++) {
29547 + if (!strcmp(p, arg->value)) {
29548 + if (bit_flags) {
29549 + *bit_flags &= ~arg->clrmask;
29550 + *bit_flags |= arg->setmask;
29552 + return opt->arg_required;
29556 + reiserfs_warning(s, "super-6506",
29557 + "bad value \"%s\" for option \"%s\"\n", p,
29558 + opt->option_name);
29559 + return -1;
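The arg_desc_t tables consumed by reiserfs_getopt() above encode each option value as a mask pair, with clrmask applied before setmask so mutually exclusive modes displace one another. A minimal stand-alone parser over one such table; the bit numbers are invented for the example:

#include <stdio.h>
#include <string.h>

struct arg_desc {                       /* same shape as arg_desc_t above */
    const char *value;
    unsigned setmask;
    unsigned clrmask;
};

enum { BARRIER_FLUSH, BARRIER_NONE };   /* illustrative bit numbers */

static const struct arg_desc barrier_mode[] = {
    { "none",  1 << BARRIER_NONE,  1 << BARRIER_FLUSH },
    { "flush", 1 << BARRIER_FLUSH, 1 << BARRIER_NONE },
    { NULL, 0, 0 }
};

static int apply_value(const struct arg_desc *tbl, const char *v,
                       unsigned long *flags)
{
    for (; tbl->value; tbl++) {
        if (strcmp(v, tbl->value))
            continue;
        *flags &= ~tbl->clrmask;        /* clear BEFORE set */
        *flags |= tbl->setmask;
        return 0;
    }
    return -1;                          /* bad value for the option */
}

int main(void)
{
    unsigned long flags = 1 << BARRIER_FLUSH;

    apply_value(barrier_mode, "none", &flags);
    printf("flags = %#lx\n", flags);    /* flush bit gone, none bit set */
    return 0;
}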
29562 +/* returns 0 if something is wrong in the option string, 1 otherwise */
29563 +static int reiserfs_parse_options(struct super_block *s,
29565 + /* string given via mount's -o */
29566 + char *options,
29568 + /*
29569 + * after the parsing phase, contains the
29570 + * collection of bitflags defining what
29571 + * mount options were selected.
29572 + */
29573 + unsigned long *mount_options,
29575 + /* strtol-ed from NNN of resize=NNN */
29576 + unsigned long *blocks,
29577 + char **jdev_name,
29578 + unsigned int *commit_max_age,
29579 + char **qf_names,
29580 + unsigned int *qfmt)
29582 + int c;
29583 + char *arg = NULL;
29584 + char *pos;
29585 + opt_desc_t opts[] = {
29586 + /*
29587 + * Compatibility stuff, so that -o notail for old
29588 + * setups still works
29589 + */
29590 + {"tails",.arg_required = 't',.values = tails},
29591 + {"notail",.clrmask =
29592 + (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
29593 + {"conv",.setmask = 1 << REISERFS_CONVERT},
29594 + {"attrs",.setmask = 1 << REISERFS_ATTRS},
29595 + {"noattrs",.clrmask = 1 << REISERFS_ATTRS},
29596 + {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT},
29597 +#ifdef CONFIG_REISERFS_FS_XATTR
29598 + {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER},
29599 + {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER},
29600 +#else
29601 + {"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
29602 + {"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
29603 +#endif
29604 +#ifdef CONFIG_REISERFS_FS_POSIX_ACL
29605 + {"acl",.setmask = 1 << REISERFS_POSIXACL},
29606 + {"noacl",.clrmask = 1 << REISERFS_POSIXACL},
29607 +#else
29608 + {"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
29609 + {"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
29610 +#endif
29611 + {.option_name = "nolog"},
29612 + {"replayonly",.setmask = 1 << REPLAYONLY},
29613 + {"block-allocator",.arg_required = 'a',.values = balloc},
29614 + {"data",.arg_required = 'd',.values = logging_mode},
29615 + {"barrier",.arg_required = 'b',.values = barrier_mode},
29616 + {"resize",.arg_required = 'r',.values = NULL},
29617 + {"jdev",.arg_required = 'j',.values = NULL},
29618 + {"nolargeio",.arg_required = 'w',.values = NULL},
29619 + {"commit",.arg_required = 'c',.values = NULL},
29620 + {"usrquota",.setmask = 1 << REISERFS_USRQUOTA},
29621 + {"grpquota",.setmask = 1 << REISERFS_GRPQUOTA},
29622 + {"noquota",.clrmask = 1 << REISERFS_USRQUOTA | 1 << REISERFS_GRPQUOTA},
29623 + {"errors",.arg_required = 'e',.values = error_actions},
29624 + {"usrjquota",.arg_required =
29625 + 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
29626 + {"grpjquota",.arg_required =
29627 + 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
29628 + {"jqfmt",.arg_required = 'f',.values = NULL},
29629 + {.option_name = NULL}
29630 + };
29632 + *blocks = 0;
29633 + if (!options || !*options)
29634 + /*
29635 + * use default configuration: create tails, journaling on, no
29636 + * conversion to newest format
29637 + */
29638 + return 1;
29640 + for (pos = options; pos;) {
29641 + c = reiserfs_getopt(s, &pos, opts, &arg, mount_options);
29642 + if (c == -1)
29643 + /* wrong option is given */
29644 + return 0;
29646 + if (c == 'r') {
29647 + char *p;
29649 + p = NULL;
29650 + /* "resize=NNN" or "resize=auto" */
29652 + if (!strcmp(arg, "auto")) {
29653 + /* From JFS code, to auto-get the size. */
29654 + *blocks = sb_bdev_nr_blocks(s);
29655 + } else {
29656 + *blocks = simple_strtoul(arg, &p, 0);
29657 + if (*p != '\0') {
29658 + /* NNN does not look like a number */
29659 + reiserfs_warning(s, "super-6507",
29660 + "bad value %s for "
29661 + "-oresize\n", arg);
29662 + return 0;
29667 + if (c == 'c') {
29668 + char *p = NULL;
29669 + unsigned long val = simple_strtoul(arg, &p, 0);
29670 + /* commit=NNN (time in seconds) */
29671 + if (*p != '\0' || val >= (unsigned int)-1) {
29672 + reiserfs_warning(s, "super-6508",
29673 + "bad value %s for -ocommit\n",
29674 + arg);
29675 + return 0;
29677 + *commit_max_age = (unsigned int)val;
29680 + if (c == 'w') {
29681 + reiserfs_warning(s, "super-6509", "nolargeio option "
29682 + "is no longer supported");
29683 + return 0;
29686 + if (c == 'j') {
29687 + if (arg && *arg && jdev_name) {
29688 + /* Hm, already assigned? */
29689 + if (*jdev_name) {
29690 + reiserfs_warning(s, "super-6510",
29691 + "journal device was "
29692 + "already specified to "
29693 + "be %s", *jdev_name);
29694 + return 0;
29696 + *jdev_name = arg;
29699 +#ifdef CONFIG_QUOTA
29700 + if (c == 'u' || c == 'g') {
29701 + int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
29703 + if (sb_any_quota_loaded(s) &&
29704 + (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
29705 + reiserfs_warning(s, "super-6511",
29706 + "cannot change journaled "
29707 + "quota options when quota "
29708 + "turned on.");
29709 + return 0;
29711 + if (qf_names[qtype] !=
29712 + REISERFS_SB(s)->s_qf_names[qtype])
29713 + kfree(qf_names[qtype]);
29714 + qf_names[qtype] = NULL;
29715 + if (*arg) { /* Some filename specified? */
29716 + if (REISERFS_SB(s)->s_qf_names[qtype]
29717 + && strcmp(REISERFS_SB(s)->s_qf_names[qtype],
29718 + arg)) {
29719 + reiserfs_warning(s, "super-6512",
29720 + "%s quota file "
29721 + "already specified.",
29722 + QTYPE2NAME(qtype));
29723 + return 0;
29725 + if (strchr(arg, '/')) {
29726 + reiserfs_warning(s, "super-6513",
29727 + "quotafile must be "
29728 + "on filesystem root.");
29729 + return 0;
29731 + qf_names[qtype] = kstrdup(arg, GFP_KERNEL);
29732 + if (!qf_names[qtype]) {
29733 + reiserfs_warning(s, "reiserfs-2502",
29734 + "not enough memory "
29735 + "for storing "
29736 + "quotafile name.");
29737 + return 0;
29739 + if (qtype == USRQUOTA)
29740 + *mount_options |= 1 << REISERFS_USRQUOTA;
29741 + else
29742 + *mount_options |= 1 << REISERFS_GRPQUOTA;
29743 + } else {
29744 + if (qtype == USRQUOTA)
29745 + *mount_options &= ~(1 << REISERFS_USRQUOTA);
29746 + else
29747 + *mount_options &= ~(1 << REISERFS_GRPQUOTA);
29750 + if (c == 'f') {
29751 + if (!strcmp(arg, "vfsold"))
29752 + *qfmt = QFMT_VFS_OLD;
29753 + else if (!strcmp(arg, "vfsv0"))
29754 + *qfmt = QFMT_VFS_V0;
29755 + else {
29756 + reiserfs_warning(s, "super-6514",
29757 + "unknown quota format "
29758 + "specified.");
29759 + return 0;
29761 + if (sb_any_quota_loaded(s) &&
29762 + *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
29763 + reiserfs_warning(s, "super-6515",
29764 + "cannot change journaled "
29765 + "quota options when quota "
29766 + "turned on.");
29767 + return 0;
29770 +#else
29771 + if (c == 'u' || c == 'g' || c == 'f') {
29772 + reiserfs_warning(s, "reiserfs-2503", "journaled "
29773 + "quota options not supported.");
29774 + return 0;
29776 +#endif
29779 +#ifdef CONFIG_QUOTA
29780 + if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt
29781 + && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) {
29782 + reiserfs_warning(s, "super-6515",
29783 + "journaled quota format not specified.");
29784 + return 0;
29786 + if ((!(*mount_options & (1 << REISERFS_USRQUOTA)) &&
29787 + sb_has_quota_loaded(s, USRQUOTA)) ||
29788 + (!(*mount_options & (1 << REISERFS_GRPQUOTA)) &&
29789 + sb_has_quota_loaded(s, GRPQUOTA))) {
29790 + reiserfs_warning(s, "super-6516", "quota options must "
29791 + "be present when quota is turned on.");
29792 + return 0;
29794 +#endif
29796 + return 1;
29799 +static void switch_data_mode(struct super_block *s, unsigned long mode)
29801 + REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
29802 + (1 << REISERFS_DATA_ORDERED) |
29803 + (1 << REISERFS_DATA_WRITEBACK));
29804 + REISERFS_SB(s)->s_mount_opt |= (1 << mode);
29807 +static void handle_data_mode(struct super_block *s, unsigned long mount_options)
29809 + if (mount_options & (1 << REISERFS_DATA_LOG)) {
29810 + if (!reiserfs_data_log(s)) {
29811 + switch_data_mode(s, REISERFS_DATA_LOG);
29812 + reiserfs_info(s, "switching to journaled data mode\n");
29814 + } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
29815 + if (!reiserfs_data_ordered(s)) {
29816 + switch_data_mode(s, REISERFS_DATA_ORDERED);
29817 + reiserfs_info(s, "switching to ordered data mode\n");
29819 + } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
29820 + if (!reiserfs_data_writeback(s)) {
29821 + switch_data_mode(s, REISERFS_DATA_WRITEBACK);
29822 + reiserfs_info(s, "switching to writeback data mode\n");
29827 +static void handle_barrier_mode(struct super_block *s, unsigned long bits)
29829 + int flush = (1 << REISERFS_BARRIER_FLUSH);
29830 + int none = (1 << REISERFS_BARRIER_NONE);
29831 + int all_barrier = flush | none;
29833 + if (bits & all_barrier) {
29834 + REISERFS_SB(s)->s_mount_opt &= ~all_barrier;
29835 + if (bits & flush) {
29836 + REISERFS_SB(s)->s_mount_opt |= flush;
29837 + printk("reiserfs: enabling write barrier flush mode\n");
29838 + } else if (bits & none) {
29839 + REISERFS_SB(s)->s_mount_opt |= none;
29840 + printk("reiserfs: write barriers turned off\n");
29845 +static void handle_attrs(struct super_block *s)
29847 + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
29849 + if (reiserfs_attrs(s)) {
29850 + if (old_format_only(s)) {
29851 + reiserfs_warning(s, "super-6517", "cannot support "
29852 + "attributes on 3.5.x disk format");
29853 + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
29854 + return;
29856 + if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) {
29857 + reiserfs_warning(s, "super-6518", "cannot support "
29858 + "attributes until flag is set in "
29859 + "super-block");
29860 + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
29865 +#ifdef CONFIG_QUOTA
29866 +static void handle_quota_files(struct super_block *s, char **qf_names,
29867 + unsigned int *qfmt)
29869 + int i;
29871 + for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
29872 + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
29873 + kfree(REISERFS_SB(s)->s_qf_names[i]);
29874 + REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
29876 + if (*qfmt)
29877 + REISERFS_SB(s)->s_jquota_fmt = *qfmt;
29879 +#endif
29881 +static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
29883 + struct reiserfs_super_block *rs;
29884 + struct reiserfs_transaction_handle th;
29885 + unsigned long blocks;
29886 + unsigned long mount_options = REISERFS_SB(s)->s_mount_opt;
29887 + unsigned long safe_mask = 0;
29888 + unsigned int commit_max_age = (unsigned int)-1;
29889 + struct reiserfs_journal *journal = SB_JOURNAL(s);
29890 + int err;
29891 + char *qf_names[REISERFS_MAXQUOTAS];
29892 + unsigned int qfmt = 0;
29893 +#ifdef CONFIG_QUOTA
29894 + int i;
29895 +#endif
29897 + sync_filesystem(s);
29898 + reiserfs_write_lock(s);
29900 +#ifdef CONFIG_QUOTA
29901 + memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
29902 +#endif
29904 + rs = SB_DISK_SUPER_BLOCK(s);
29906 + if (!reiserfs_parse_options
29907 + (s, arg, &mount_options, &blocks, NULL, &commit_max_age,
29908 + qf_names, &qfmt)) {
29909 +#ifdef CONFIG_QUOTA
29910 + for (i = 0; i < REISERFS_MAXQUOTAS; i++)
29911 + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
29912 + kfree(qf_names[i]);
29913 +#endif
29914 + err = -EINVAL;
29915 + goto out_err_unlock;
29917 +#ifdef CONFIG_QUOTA
29918 + handle_quota_files(s, qf_names, &qfmt);
29919 +#endif
29921 + handle_attrs(s);
29923 + /* Add options that are safe here */
29924 + safe_mask |= 1 << REISERFS_SMALLTAIL;
29925 + safe_mask |= 1 << REISERFS_LARGETAIL;
29926 + safe_mask |= 1 << REISERFS_NO_BORDER;
29927 + safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION;
29928 + safe_mask |= 1 << REISERFS_HASHED_RELOCATION;
29929 + safe_mask |= 1 << REISERFS_TEST4;
29930 + safe_mask |= 1 << REISERFS_ATTRS;
29931 + safe_mask |= 1 << REISERFS_XATTRS_USER;
29932 + safe_mask |= 1 << REISERFS_POSIXACL;
29933 + safe_mask |= 1 << REISERFS_BARRIER_FLUSH;
29934 + safe_mask |= 1 << REISERFS_BARRIER_NONE;
29935 + safe_mask |= 1 << REISERFS_ERROR_RO;
29936 + safe_mask |= 1 << REISERFS_ERROR_CONTINUE;
29937 + safe_mask |= 1 << REISERFS_ERROR_PANIC;
29938 + safe_mask |= 1 << REISERFS_USRQUOTA;
29939 + safe_mask |= 1 << REISERFS_GRPQUOTA;
29941 + /*
29942 + * Update the bitmask, taking care to keep
29943 + * the bits we're not allowed to change here
29944 + */
29945 + REISERFS_SB(s)->s_mount_opt =
29946 + (REISERFS_SB(s)->
29947 + s_mount_opt & ~safe_mask) | (mount_options & safe_mask);
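The masked assignment above is the classic read-modify-write idiom for applying only whitelisted option bits: bits outside safe_mask survive untouched, bits inside it come from the newly parsed options. A minimal standalone sketch of the idiom (the hex values are illustrative, not real reiserfs option bits):

#include <stdio.h>

int main(void)
{
	unsigned long opts      = 0xF0F0; /* current s_mount_opt */
	unsigned long requested = 0x0FFF; /* options parsed at remount */
	unsigned long safe_mask = 0x00FF; /* bits we may change */

	/* keep bits outside safe_mask, take requested bits inside it */
	opts = (opts & ~safe_mask) | (requested & safe_mask);
	printf("0x%lX\n", opts); /* prints 0xF0FF */
	return 0;
}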
29949 + if (commit_max_age != 0 && commit_max_age != (unsigned int)-1) {
29950 + journal->j_max_commit_age = commit_max_age;
29951 + journal->j_max_trans_age = commit_max_age;
29952 + } else if (commit_max_age == 0) {
29953 + /* 0 means restore defaults. */
29954 + journal->j_max_commit_age = journal->j_default_max_commit_age;
29955 + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
29958 + if (blocks) {
29959 + err = reiserfs_resize(s, blocks);
29960 + if (err != 0)
29961 + goto out_err_unlock;
29964 + if (*mount_flags & SB_RDONLY) {
29965 + reiserfs_write_unlock(s);
29966 + reiserfs_xattr_init(s, *mount_flags);
29967 + /* remount read-only */
29968 + if (sb_rdonly(s))
29969 + /* it is read-only already */
29970 + goto out_ok_unlocked;
29972 + err = dquot_suspend(s, -1);
29973 + if (err < 0)
29974 + goto out_err;
29976 + /* try to remount file system with read-only permissions */
29977 + if (sb_umount_state(rs) == REISERFS_VALID_FS
29978 + || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
29979 + goto out_ok_unlocked;
29982 + reiserfs_write_lock(s);
29984 + err = journal_begin(&th, s, 10);
29985 + if (err)
29986 + goto out_err_unlock;
29988 + /* Mounting a rw partition read-only. */
29989 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
29990 + set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state);
29991 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
29992 + } else {
29993 + /* remount read-write */
29994 + if (!sb_rdonly(s)) {
29995 + reiserfs_write_unlock(s);
29996 + reiserfs_xattr_init(s, *mount_flags);
29997 + goto out_ok_unlocked; /* We are read-write already */
30000 + if (reiserfs_is_journal_aborted(journal)) {
30001 + err = journal->j_errno;
30002 + goto out_err_unlock;
30005 + handle_data_mode(s, mount_options);
30006 + handle_barrier_mode(s, mount_options);
30007 + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
30009 + /* now it is safe to call journal_begin */
30010 + s->s_flags &= ~SB_RDONLY;
30011 + err = journal_begin(&th, s, 10);
30012 + if (err)
30013 + goto out_err_unlock;
30016 + /* Mounting a read-only partition read-write. */
30016 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
30017 + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
30018 + s->s_flags &= ~SB_RDONLY;
30019 + set_sb_umount_state(rs, REISERFS_ERROR_FS);
30020 + if (!old_format_only(s))
30021 + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
30022 + /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
30023 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
30024 + REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS;
30026 + /* this will force a full flush of all journal lists */
30027 + SB_JOURNAL(s)->j_must_wait = 1;
30028 + err = journal_end(&th);
30029 + if (err)
30030 + goto out_err_unlock;
30032 + reiserfs_write_unlock(s);
30033 + if (!(*mount_flags & SB_RDONLY)) {
30034 + dquot_resume(s, -1);
30035 + reiserfs_write_lock(s);
30036 + finish_unfinished(s);
30037 + reiserfs_write_unlock(s);
30038 + reiserfs_xattr_init(s, *mount_flags);
30041 +out_ok_unlocked:
30042 + return 0;
30044 +out_err_unlock:
30045 + reiserfs_write_unlock(s);
30046 +out_err:
30047 + return err;
30050 +static int read_super_block(struct super_block *s, int offset)
30052 + struct buffer_head *bh;
30053 + struct reiserfs_super_block *rs;
30054 + int fs_blocksize;
30056 + bh = sb_bread(s, offset / s->s_blocksize);
30057 + if (!bh) {
30058 + reiserfs_warning(s, "sh-2006",
30059 + "bread failed (dev %s, block %lu, size %lu)",
30060 + s->s_id, offset / s->s_blocksize,
30061 + s->s_blocksize);
30062 + return 1;
30065 + rs = (struct reiserfs_super_block *)bh->b_data;
30066 + if (!is_any_reiserfs_magic_string(rs)) {
30067 + brelse(bh);
30068 + return 1;
30070 + /*
30071 + * ok, reiserfs signature (old or new) found at the given offset
30072 + */
30073 + fs_blocksize = sb_blocksize(rs);
30074 + brelse(bh);
30075 + sb_set_blocksize(s, fs_blocksize);
30077 + bh = sb_bread(s, offset / s->s_blocksize);
30078 + if (!bh) {
30079 + reiserfs_warning(s, "sh-2007",
30080 + "bread failed (dev %s, block %lu, size %lu)",
30081 + s->s_id, offset / s->s_blocksize,
30082 + s->s_blocksize);
30083 + return 1;
30086 + rs = (struct reiserfs_super_block *)bh->b_data;
30087 + if (sb_blocksize(rs) != s->s_blocksize) {
30088 + reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
30089 + "filesystem on (dev %s, block %llu, size %lu)",
30090 + s->s_id,
30091 + (unsigned long long)bh->b_blocknr,
30092 + s->s_blocksize);
30093 + brelse(bh);
30094 + return 1;
30097 + if (rs->s_v1.s_root_block == cpu_to_le32(-1)) {
30098 + brelse(bh);
30099 + reiserfs_warning(s, "super-6519", "Unfinished reiserfsck "
30100 + "--rebuild-tree run detected. Please run\n"
30101 + "reiserfsck --rebuild-tree and wait for a "
30102 + "completion. If that fails\n"
30103 + "get newer reiserfsprogs package");
30104 + return 1;
30107 + reiserfs_warning(NULL, "", "reiserfs filesystem is deprecated and "
30108 + "scheduled to be removed from the kernel in 2025");
30109 + SB_BUFFER_WITH_SB(s) = bh;
30110 + SB_DISK_SUPER_BLOCK(s) = rs;
30112 + /*
30113 + * the magic is that of a non-standard journal filesystem, so look at
30114 + * s_version to find which format is in use
30115 + */
30116 + if (is_reiserfs_jr(rs)) {
30117 + if (sb_version(rs) == REISERFS_VERSION_2)
30118 + reiserfs_info(s, "found reiserfs format \"3.6\""
30119 + " with non-standard journal\n");
30120 + else if (sb_version(rs) == REISERFS_VERSION_1)
30121 + reiserfs_info(s, "found reiserfs format \"3.5\""
30122 + " with non-standard journal\n");
30123 + else {
30124 + reiserfs_warning(s, "sh-2012", "found unknown "
30125 + "format \"%u\" of reiserfs with "
30126 + "non-standard magic", sb_version(rs));
30127 + return 1;
30129 + } else
30130 + /*
30131 + * s_version of standard format may contain incorrect
30132 + * information, so we just look at the magic string
30133 + */
30134 + reiserfs_info(s,
30135 + "found reiserfs format \"%s\" with standard journal\n",
30136 + is_reiserfs_3_5(rs) ? "3.5" : "3.6");
30138 + s->s_op = &reiserfs_sops;
30139 + s->s_export_op = &reiserfs_export_ops;
30140 +#ifdef CONFIG_QUOTA
30141 + s->s_qcop = &reiserfs_qctl_operations;
30142 + s->dq_op = &reiserfs_quota_operations;
30143 + s->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
30144 +#endif
30146 + /*
30147 + * the new format is limited by the 32-bit-wide i_blocks field; we
30148 + * want to be one full block below that.
30149 + */
30150 + s->s_maxbytes = (512LL << 32) - s->s_blocksize;
30151 + return 0;
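The s_maxbytes arithmetic just above deserves a worked example: i_blocks is a 32-bit count of 512-byte sectors, so the hard cap is 512 << 32 = 2^41 bytes (2 TiB), and the code backs off by one block. A quick standalone check, assuming an example 4 KiB block size:

#include <stdio.h>

int main(void)
{
	long long blocksize = 4096; /* assumed example block size */
	/* i_blocks counts 512-byte sectors in 32 bits: cap at 2^41 bytes */
	long long maxbytes = (512LL << 32) - blocksize;

	printf("%lld\n", maxbytes); /* 2199023251456, i.e. 2 TiB - 4 KiB */
	return 0;
}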
30154 +/* after journal replay, reread all bitmap and super blocks */
30155 +static int reread_meta_blocks(struct super_block *s)
30157 + if (bh_read(SB_BUFFER_WITH_SB(s), 0) < 0) {
30158 + reiserfs_warning(s, "reiserfs-2504", "error reading the super");
30159 + return 1;
30162 + return 0;
30165 +/* hash detection stuff */
30168 + * if the root directory is empty, we set the default (Yura's) hash
30169 + * and warn about it
30170 + * FIXME: we look at only one name in a directory. If tea and yura
30171 + * hash to the same value, we ask the user to send a report to the
30172 + * mailing list
30173 + */
30174 +static __u32 find_hash_out(struct super_block *s)
30176 + int retval;
30177 + struct inode *inode;
30178 + struct cpu_key key;
30179 + INITIALIZE_PATH(path);
30180 + struct reiserfs_dir_entry de;
30181 + struct reiserfs_de_head *deh;
30182 + __u32 hash = DEFAULT_HASH;
30183 + __u32 deh_hashval, teahash, r5hash, yurahash;
30185 + inode = d_inode(s->s_root);
30187 + make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3);
30188 + retval = search_by_entry_key(s, &key, &path, &de);
30189 + if (retval == IO_ERROR) {
30190 + pathrelse(&path);
30191 + return UNSET_HASH;
30193 + if (retval == NAME_NOT_FOUND)
30194 + de.de_entry_num--;
30196 + set_de_name_and_namelen(&de);
30197 + deh = de.de_deh + de.de_entry_num;
30199 + if (deh_offset(deh) == DOT_DOT_OFFSET) {
30200 + /* allow override in this case */
30201 + if (reiserfs_rupasov_hash(s))
30202 + hash = YURA_HASH;
30203 + reiserfs_info(s, "FS seems to be empty, autodetect is using the default hash\n");
30204 + goto out;
30207 + deh_hashval = GET_HASH_VALUE(deh_offset(deh));
30208 + r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen));
30209 + teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen));
30210 + yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen));
30212 + if ((teahash == r5hash && deh_hashval == r5hash) ||
30213 + (teahash == yurahash && deh_hashval == yurahash) ||
30214 + (r5hash == yurahash && deh_hashval == yurahash)) {
30215 + reiserfs_warning(s, "reiserfs-2506",
30216 + "Unable to automatically detect hash "
30217 + "function. Please mount with -o "
30218 + "hash={tea,rupasov,r5}");
30219 + hash = UNSET_HASH;
30220 + goto out;
30223 + if (deh_hashval == yurahash)
30224 + hash = YURA_HASH;
30225 + else if (deh_hashval == teahash)
30226 + hash = TEA_HASH;
30227 + else if (deh_hashval == r5hash)
30228 + hash = R5_HASH;
30229 + else {
30230 + reiserfs_warning(s, "reiserfs-2506",
30231 + "Unrecognised hash function");
30232 + hash = UNSET_HASH;
30234 +out:
30235 + pathrelse(&path);
30236 + return hash;
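The three candidate hashes compared above are ordinary string hashes over the entry name; r5 is the smallest. A standalone sketch of an r5-style hash plus the generation-bit masking used in the comparison (the len-driven loop and the 0x7fffff80 mask are assumptions of this sketch; the low 7 bits of a directory offset hold the generation number, so only the masked value is compared):

#include <stdio.h>

/* r5-style directory-name hash, as a userspace sketch */
static unsigned int r5_hash(const signed char *msg, int len)
{
	unsigned int a = 0;

	while (len--) {
		a += *msg << 4;
		a += *msg >> 4;
		a *= 11;
		msg++;
	}
	return a;
}

int main(void)
{
	unsigned int h = r5_hash((const signed char *)"lost+found", 10);

	/* compare hashes only after masking off the generation bits,
	 * as GET_HASH_VALUE() does above */
	printf("hash 0x%08x masked 0x%08x\n", h, h & 0x7fffff80);
	return 0;
}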
30239 +/* finds out which hash the names are sorted with */
30240 +static int what_hash(struct super_block *s)
30242 + __u32 code;
30244 + code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s));
30246 + /*
30247 + * reiserfs_hash_detect() == true if any of the hash mount options
30248 + * were used. We must check them to make sure the user isn't
30249 + * using a bad hash value
30250 + */
30251 + if (code == UNSET_HASH || reiserfs_hash_detect(s))
30252 + code = find_hash_out(s);
30254 + if (code != UNSET_HASH && reiserfs_hash_detect(s)) {
30255 + /*
30256 + * detection has found the hash, and we must check against the
30257 + * mount options
30258 + */
30259 + if (reiserfs_rupasov_hash(s) && code != YURA_HASH) {
30260 + reiserfs_warning(s, "reiserfs-2507",
30261 + "Error, %s hash detected, "
30262 + "unable to force rupasov hash",
30263 + reiserfs_hashname(code));
30264 + code = UNSET_HASH;
30265 + } else if (reiserfs_tea_hash(s) && code != TEA_HASH) {
30266 + reiserfs_warning(s, "reiserfs-2508",
30267 + "Error, %s hash detected, "
30268 + "unable to force tea hash",
30269 + reiserfs_hashname(code));
30270 + code = UNSET_HASH;
30271 + } else if (reiserfs_r5_hash(s) && code != R5_HASH) {
30272 + reiserfs_warning(s, "reiserfs-2509",
30273 + "Error, %s hash detected, "
30274 + "unable to force r5 hash",
30275 + reiserfs_hashname(code));
30276 + code = UNSET_HASH;
30278 + } else {
30279 + /*
30280 + * find_hash_out was not called or
30281 + * could not determine the hash
30282 + */
30283 + if (reiserfs_rupasov_hash(s)) {
30284 + code = YURA_HASH;
30285 + } else if (reiserfs_tea_hash(s)) {
30286 + code = TEA_HASH;
30287 + } else if (reiserfs_r5_hash(s)) {
30288 + code = R5_HASH;
30292 + /*
30293 + * if we are mounted RW, and we have a new valid hash code, update
30294 + * the super
30295 + */
30296 + if (code != UNSET_HASH &&
30297 + !sb_rdonly(s) &&
30298 + code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) {
30299 + set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code);
30301 + return code;
30304 +/* return pointer to appropriate function */
30305 +static hashf_t hash_function(struct super_block *s)
30307 + switch (what_hash(s)) {
30308 + case TEA_HASH:
30309 + reiserfs_info(s, "Using tea hash to sort names\n");
30310 + return keyed_hash;
30311 + case YURA_HASH:
30312 + reiserfs_info(s, "Using rupasov hash to sort names\n");
30313 + return yura_hash;
30314 + case R5_HASH:
30315 + reiserfs_info(s, "Using r5 hash to sort names\n");
30316 + return r5_hash;
30318 + return NULL;
30321 +/* this is used to set up the correct value for old partitions */
30322 +static int function2code(hashf_t func)
30324 + if (func == keyed_hash)
30325 + return TEA_HASH;
30326 + if (func == yura_hash)
30327 + return YURA_HASH;
30328 + if (func == r5_hash)
30329 + return R5_HASH;
30331 + BUG(); /* should never happen */
30333 + return 0;
30336 +#define SWARN(silent, s, id, ...) \
30337 + if (!(silent)) \
30338 + reiserfs_warning(s, id, __VA_ARGS__)
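Note that SWARN expands to a bare if, so an else following a SWARN call would bind to the macro's if rather than the caller's. A hedged sketch of the conventional do { } while (0) hardening; SWARN_SAFE and warn() are illustrative names, not part of this patch:

#include <stdio.h>

/* illustrative stand-in for reiserfs_warning() */
#define warn(id, msg) fprintf(stderr, "%s: %s\n", id, msg)

/* wrapping the body in do { } while (0) keeps a trailing else
 * bound to the caller's if, not to the macro's own if */
#define SWARN_SAFE(silent, id, msg) \
	do { if (!(silent)) warn(id, msg); } while (0)

int main(void)
{
	int silent = 0;

	if (silent)
		SWARN_SAFE(silent, "demo-1", "never reached");
	else
		SWARN_SAFE(silent, "demo-2", "else binds to the outer if");
	return 0;
}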
30340 +static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
30342 + struct inode *root_inode;
30343 + struct reiserfs_transaction_handle th;
30344 + int old_format = 0;
30345 + unsigned long blocks;
30346 + unsigned int commit_max_age = 0;
30347 + int jinit_done = 0;
30348 + struct reiserfs_iget_args args;
30349 + struct reiserfs_super_block *rs;
30350 + char *jdev_name;
30351 + struct reiserfs_sb_info *sbi;
30352 + int errval = -EINVAL;
30353 + char *qf_names[REISERFS_MAXQUOTAS] = {};
30354 + unsigned int qfmt = 0;
30356 + sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
30357 + if (!sbi)
30358 + return -ENOMEM;
30359 + s->s_fs_info = sbi;
30360 + /* Set default values for options: non-aggressive tails, RO on errors */
30361 + sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
30362 + sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO);
30363 + sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
30364 + /* no preallocation minimum, be smart in reiserfs_file_write instead */
30365 + sbi->s_alloc_options.preallocmin = 0;
30366 + /* Preallocate by 16 blocks (17-1) at once */
30367 + sbi->s_alloc_options.preallocsize = 17;
30368 + /* setup default block allocator options */
30369 + reiserfs_init_alloc_options(s);
30371 + spin_lock_init(&sbi->old_work_lock);
30372 + INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits);
30373 + mutex_init(&sbi->lock);
30374 + sbi->lock_depth = -1;
30376 + sbi->commit_wq = alloc_workqueue("reiserfs/%s", WQ_MEM_RECLAIM, 0,
30377 + s->s_id);
30378 + if (!sbi->commit_wq) {
30379 + SWARN(silent, s, "", "Cannot allocate commit workqueue");
30380 + errval = -ENOMEM;
30381 + goto error_unlocked;
30384 + jdev_name = NULL;
30385 + if (reiserfs_parse_options
30386 + (s, (char *)data, &sbi->s_mount_opt, &blocks, &jdev_name,
30387 + &commit_max_age, qf_names, &qfmt) == 0) {
30388 + goto error_unlocked;
30390 + if (jdev_name && jdev_name[0]) {
30391 + sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
30392 + if (!sbi->s_jdev) {
30393 + SWARN(silent, s, "", "Cannot allocate memory for "
30394 + "journal device name");
30395 + goto error_unlocked;
30398 +#ifdef CONFIG_QUOTA
30399 + handle_quota_files(s, qf_names, &qfmt);
30400 +#endif
30402 + if (blocks) {
30403 + SWARN(silent, s, "jmacd-7", "resize option for remount only");
30404 + goto error_unlocked;
30407 + /*
30408 + * try old format (undistributed bitmap, super block in the 8th 1k
30409 + * block of a device)
30410 + */
30411 + if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES))
30412 + old_format = 1;
30414 + /*
30415 + * try new format (64-th 1k block), which can contain reiserfs
30416 + * super block
30417 + */
30418 + else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
30419 + SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
30420 + s->s_id);
30421 + goto error_unlocked;
30424 + s->s_time_min = 0;
30425 + s->s_time_max = U32_MAX;
30427 + rs = SB_DISK_SUPER_BLOCK(s);
30428 + /*
30429 + * Let's do a basic sanity check to verify that the underlying device
30430 + * is not smaller than the filesystem. If the check fails then abort
30431 + * and scream, because bad stuff will happen otherwise.
30432 + */
30433 + if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) {
30434 + SWARN(silent, s, "", "Filesystem cannot be "
30435 + "mounted because it is bigger than the device");
30436 + SWARN(silent, s, "", "You may need to run fsck "
30437 + "or increase size of your LVM partition");
30438 + SWARN(silent, s, "", "Or maybe you forgot to "
30439 + "reboot after fdisk when it told you to");
30440 + goto error_unlocked;
30443 + sbi->s_mount_state = SB_REISERFS_STATE(s);
30444 + sbi->s_mount_state = REISERFS_VALID_FS;
30446 + if ((errval = reiserfs_init_bitmap_cache(s))) {
30447 + SWARN(silent, s, "jmacd-8", "unable to read bitmap");
30448 + goto error_unlocked;
30451 + errval = -EINVAL;
30452 +#ifdef CONFIG_REISERFS_CHECK
30453 + SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
30454 + SWARN(silent, s, "", "- it is slow mode for debugging.");
30455 +#endif
30457 + /* make data=ordered the default */
30458 + if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
30459 + !reiserfs_data_writeback(s)) {
30460 + sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
30463 + if (reiserfs_data_log(s)) {
30464 + reiserfs_info(s, "using journaled data mode\n");
30465 + } else if (reiserfs_data_ordered(s)) {
30466 + reiserfs_info(s, "using ordered data mode\n");
30467 + } else {
30468 + reiserfs_info(s, "using writeback data mode\n");
30470 + if (reiserfs_barrier_flush(s)) {
30471 + printk("reiserfs: using flush barriers\n");
30474 + if (journal_init(s, jdev_name, old_format, commit_max_age)) {
30475 + SWARN(silent, s, "sh-2022",
30476 + "unable to initialize journal space");
30477 + goto error_unlocked;
30478 + } else {
30479 + /*
30480 + * once this is set, journal_release must be called
30481 + * if we error out of the mount
30482 + */
30483 + jinit_done = 1;
30486 + if (reread_meta_blocks(s)) {
30487 + SWARN(silent, s, "jmacd-9",
30488 + "unable to reread meta blocks after journal init");
30489 + goto error_unlocked;
30492 + if (replay_only(s))
30493 + goto error_unlocked;
30495 + s->s_xattr = reiserfs_xattr_handlers;
30497 + if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) {
30498 + SWARN(silent, s, "clm-7000",
30499 + "Detected readonly device, marking FS readonly");
30500 + s->s_flags |= SB_RDONLY;
30502 + args.objectid = REISERFS_ROOT_OBJECTID;
30503 + args.dirid = REISERFS_ROOT_PARENT_OBJECTID;
30504 + root_inode =
30505 + iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor,
30506 + reiserfs_init_locked_inode, (void *)&args);
30507 + if (!root_inode) {
30508 + SWARN(silent, s, "jmacd-10", "get root inode failed");
30509 + goto error_unlocked;
30512 + /*
30513 + * This path was assumed to be called with the BKL held in the old days.
30514 + * Now we have inherited the big reiserfs lock from it and many
30515 + * reiserfs helpers called in the mount path and elsewhere require
30516 + * this lock to be held even if it's not always necessary. Let's be
30517 + * conservative and hold it early. The window can be reduced after
30518 + * careful review of the code.
30519 + */
30520 + reiserfs_write_lock(s);
30522 + if (root_inode->i_state & I_NEW) {
30523 + reiserfs_read_locked_inode(root_inode, &args);
30524 + unlock_new_inode(root_inode);
30527 + if (!S_ISDIR(root_inode->i_mode) || !inode_get_bytes(root_inode) ||
30528 + !root_inode->i_size) {
30529 + SWARN(silent, s, "", "corrupt root inode, run fsck");
30530 + iput(root_inode);
30531 + errval = -EUCLEAN;
30532 + goto error;
30535 + s->s_root = d_make_root(root_inode);
30536 + if (!s->s_root)
30537 + goto error;
30538 + /* define and initialize hash function */
30539 + sbi->s_hash_function = hash_function(s);
30540 + if (sbi->s_hash_function == NULL) {
30541 + dput(s->s_root);
30542 + s->s_root = NULL;
30543 + goto error;
30546 + if (is_reiserfs_3_5(rs)
30547 + || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1))
30548 + set_bit(REISERFS_3_5, &sbi->s_properties);
30549 + else if (old_format)
30550 + set_bit(REISERFS_OLD_FORMAT, &sbi->s_properties);
30551 + else
30552 + set_bit(REISERFS_3_6, &sbi->s_properties);
30554 + if (!sb_rdonly(s)) {
30556 + errval = journal_begin(&th, s, 1);
30557 + if (errval) {
30558 + dput(s->s_root);
30559 + s->s_root = NULL;
30560 + goto error;
30562 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
30564 + set_sb_umount_state(rs, REISERFS_ERROR_FS);
30565 + set_sb_fs_state(rs, 0);
30567 + /*
30568 + * Clear out s_bmap_nr if it would wrap. We can handle this
30569 + * case, but older revisions can't. This will cause the
30570 + * file system to fail mount on those older implementations,
30571 + * avoiding corruption. -jeffm
30572 + */
30573 + if (bmap_would_wrap(reiserfs_bmap_count(s)) &&
30574 + sb_bmap_nr(rs) != 0) {
30575 + reiserfs_warning(s, "super-2030", "This file system "
30576 + "claims to use %u bitmap blocks in "
30577 + "its super block, but requires %u. "
30578 + "Clearing to zero.", sb_bmap_nr(rs),
30579 + reiserfs_bmap_count(s));
30581 + set_sb_bmap_nr(rs, 0);
30584 + if (old_format_only(s)) {
30585 + /*
30586 + * filesystem of format 3.5 either with standard
30587 + * or non-standard journal
30588 + */
30589 + if (convert_reiserfs(s)) {
30590 + /* and -o conv is given */
30591 + if (!silent)
30592 + reiserfs_info(s,
30593 + "converting 3.5 filesystem to the 3.6 format");
30595 + if (is_reiserfs_3_5(rs))
30596 + /*
30597 + * put magic string of 3.6 format.
30598 + * 2.2 will not be able to
30599 + * mount this filesystem anymore
30600 + */
30601 + memcpy(rs->s_v1.s_magic,
30602 + reiserfs_3_6_magic_string,
30603 + sizeof
30604 + (reiserfs_3_6_magic_string));
30606 + set_sb_version(rs, REISERFS_VERSION_2);
30607 + reiserfs_convert_objectid_map_v1(s);
30608 + set_bit(REISERFS_3_6, &sbi->s_properties);
30609 + clear_bit(REISERFS_3_5, &sbi->s_properties);
30610 + } else if (!silent) {
30611 + reiserfs_info(s, "using 3.5.x disk format\n");
30613 + } else
30614 + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
30617 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
30618 + errval = journal_end(&th);
30619 + if (errval) {
30620 + dput(s->s_root);
30621 + s->s_root = NULL;
30622 + goto error;
30625 + reiserfs_write_unlock(s);
30626 + if ((errval = reiserfs_lookup_privroot(s)) ||
30627 + (errval = reiserfs_xattr_init(s, s->s_flags))) {
30628 + dput(s->s_root);
30629 + s->s_root = NULL;
30630 + goto error_unlocked;
30632 + reiserfs_write_lock(s);
30634 + /*
30635 + * look for files which were to be removed in the previous session
30636 + */
30637 + finish_unfinished(s);
30638 + } else {
30639 + if (old_format_only(s) && !silent) {
30640 + reiserfs_info(s, "using 3.5.x disk format\n");
30643 + reiserfs_write_unlock(s);
30644 + if ((errval = reiserfs_lookup_privroot(s)) ||
30645 + (errval = reiserfs_xattr_init(s, s->s_flags))) {
30646 + dput(s->s_root);
30647 + s->s_root = NULL;
30648 + goto error_unlocked;
30650 + reiserfs_write_lock(s);
30652 + /*
30653 + * mark hash in super block: it could be unset. overwrite should be ok
30654 + */
30655 + set_sb_hash_function_code(rs, function2code(sbi->s_hash_function));
30657 + handle_attrs(s);
30659 + reiserfs_proc_info_init(s);
30661 + init_waitqueue_head(&(sbi->s_wait));
30662 + spin_lock_init(&sbi->bitmap_lock);
30664 + reiserfs_write_unlock(s);
30666 + return (0);
30668 +error:
30669 + reiserfs_write_unlock(s);
30671 +error_unlocked:
30672 + /* kill the commit thread, free journal ram */
30673 + if (jinit_done) {
30674 + reiserfs_write_lock(s);
30675 + journal_release_error(NULL, s);
30676 + reiserfs_write_unlock(s);
30679 + if (sbi->commit_wq)
30680 + destroy_workqueue(sbi->commit_wq);
30682 + reiserfs_cancel_old_flush(s);
30684 + reiserfs_free_bitmap_cache(s);
30685 + if (SB_BUFFER_WITH_SB(s))
30686 + brelse(SB_BUFFER_WITH_SB(s));
30687 +#ifdef CONFIG_QUOTA
30689 + int j;
30690 + for (j = 0; j < REISERFS_MAXQUOTAS; j++)
30691 + kfree(qf_names[j]);
30693 +#endif
30694 + kfree(sbi->s_jdev);
30695 + kfree(sbi);
30697 + s->s_fs_info = NULL;
30698 + return errval;
30701 +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
30703 + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb);
30705 + buf->f_namelen = (REISERFS_MAX_NAME(dentry->d_sb->s_blocksize));
30706 + buf->f_bfree = sb_free_blocks(rs);
30707 + buf->f_bavail = buf->f_bfree;
30708 + buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
30709 + buf->f_bsize = dentry->d_sb->s_blocksize;
30710 + /* changed to accommodate gcc folks. */
30711 + buf->f_type = REISERFS_SUPER_MAGIC;
30712 + buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
30713 + buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
30714 + sizeof(rs->s_uuid)/2);
30716 + return 0;
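The fsid above is derived by CRC-ing each half of the 16-byte on-disk UUID into one 32-bit word. A userspace sketch of the same split; crc32_le_sketch is a hypothetical stand-in for the kernel's crc32_le() and differs in initialization details, so treat the exact output values as illustrative:

#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-in for the kernel's crc32_le() */
static uint32_t crc32_le_sketch(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	uint8_t uuid[16] = { 0xC0, 0xFF, 0xEE }; /* assumed sample s_uuid */
	uint32_t fsid[2];

	/* one 32-bit word per UUID half, as in reiserfs_statfs() above */
	fsid[0] = crc32_le_sketch(0, uuid, sizeof(uuid) / 2);
	fsid[1] = crc32_le_sketch(0, uuid + sizeof(uuid) / 2, sizeof(uuid) / 2);
	printf("%08x:%08x\n", fsid[0], fsid[1]);
	return 0;
}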
30719 +#ifdef CONFIG_QUOTA
30720 +static int reiserfs_write_dquot(struct dquot *dquot)
30722 + struct reiserfs_transaction_handle th;
30723 + int ret, err;
30724 + int depth;
30726 + reiserfs_write_lock(dquot->dq_sb);
30727 + ret =
30728 + journal_begin(&th, dquot->dq_sb,
30729 + REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
30730 + if (ret)
30731 + goto out;
30732 + depth = reiserfs_write_unlock_nested(dquot->dq_sb);
30733 + ret = dquot_commit(dquot);
30734 + reiserfs_write_lock_nested(dquot->dq_sb, depth);
30735 + err = journal_end(&th);
30736 + if (!ret && err)
30737 + ret = err;
30738 +out:
30739 + reiserfs_write_unlock(dquot->dq_sb);
30740 + return ret;
30743 +static int reiserfs_acquire_dquot(struct dquot *dquot)
30745 + struct reiserfs_transaction_handle th;
30746 + int ret, err;
30747 + int depth;
30749 + reiserfs_write_lock(dquot->dq_sb);
30750 + ret =
30751 + journal_begin(&th, dquot->dq_sb,
30752 + REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
30753 + if (ret)
30754 + goto out;
30755 + depth = reiserfs_write_unlock_nested(dquot->dq_sb);
30756 + ret = dquot_acquire(dquot);
30757 + reiserfs_write_lock_nested(dquot->dq_sb, depth);
30758 + err = journal_end(&th);
30759 + if (!ret && err)
30760 + ret = err;
30761 +out:
30762 + reiserfs_write_unlock(dquot->dq_sb);
30763 + return ret;
30766 +static int reiserfs_release_dquot(struct dquot *dquot)
30768 + struct reiserfs_transaction_handle th;
30769 + int ret, err;
30771 + reiserfs_write_lock(dquot->dq_sb);
30772 + ret =
30773 + journal_begin(&th, dquot->dq_sb,
30774 + REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
30775 + reiserfs_write_unlock(dquot->dq_sb);
30776 + if (ret) {
30777 + /* Release dquot anyway to avoid endless cycle in dqput() */
30778 + dquot_release(dquot);
30779 + goto out;
30781 + ret = dquot_release(dquot);
30782 + reiserfs_write_lock(dquot->dq_sb);
30783 + err = journal_end(&th);
30784 + if (!ret && err)
30785 + ret = err;
30786 + reiserfs_write_unlock(dquot->dq_sb);
30787 +out:
30788 + return ret;
30791 +static int reiserfs_mark_dquot_dirty(struct dquot *dquot)
30793 + /* Are we journaling quotas? */
30794 + if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
30795 + REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
30796 + dquot_mark_dquot_dirty(dquot);
30797 + return reiserfs_write_dquot(dquot);
30798 + } else
30799 + return dquot_mark_dquot_dirty(dquot);
30802 +static int reiserfs_write_info(struct super_block *sb, int type)
30804 + struct reiserfs_transaction_handle th;
30805 + int ret, err;
30806 + int depth;
30808 + /* Data block + inode block */
30809 + reiserfs_write_lock(sb);
30810 + ret = journal_begin(&th, sb, 2);
30811 + if (ret)
30812 + goto out;
30813 + depth = reiserfs_write_unlock_nested(sb);
30814 + ret = dquot_commit_info(sb, type);
30815 + reiserfs_write_lock_nested(sb, depth);
30816 + err = journal_end(&th);
30817 + if (!ret && err)
30818 + ret = err;
30819 +out:
30820 + reiserfs_write_unlock(sb);
30821 + return ret;
30825 + * Turn on quotas at mount time - we need to find the quota file and such...
30826 + */
30827 +static int reiserfs_quota_on_mount(struct super_block *sb, int type)
30829 + return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
30830 + REISERFS_SB(sb)->s_jquota_fmt, type);
30834 + * Standard function to be called on quota_on
30835 + */
30836 +static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
30837 + const struct path *path)
30839 + int err;
30840 + struct inode *inode;
30841 + struct reiserfs_transaction_handle th;
30842 + int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA;
30844 + reiserfs_write_lock(sb);
30845 + if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) {
30846 + err = -EINVAL;
30847 + goto out;
30850 + /* Quotafile not on the same filesystem? */
30851 + if (path->dentry->d_sb != sb) {
30852 + err = -EXDEV;
30853 + goto out;
30855 + inode = d_inode(path->dentry);
30856 + /*
30857 + * We must not pack tails for quota files on reiserfs for quota
30858 + * IO to work
30859 + */
30860 + if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
30861 + err = reiserfs_unpack(inode);
30862 + if (err) {
30863 + reiserfs_warning(sb, "super-6520",
30864 + "Unpacking tail of quota file failed"
30865 + " (%d). Cannot turn on quotas.", err);
30866 + err = -EINVAL;
30867 + goto out;
30869 + mark_inode_dirty(inode);
30871 + /* Journaling quota? */
30872 + if (REISERFS_SB(sb)->s_qf_names[type]) {
30873 + /* Quotafile not of fs root? */
30874 + if (path->dentry->d_parent != sb->s_root)
30875 + reiserfs_warning(sb, "super-6521",
30876 + "Quota file not on filesystem root. "
30877 + "Journalled quota will not work.");
30880 + /*
30881 + * When we journal data on quota file, we have to flush journal to see
30882 + * all updates to the file when we bypass pagecache...
30883 + */
30884 + if (reiserfs_file_data_log(inode)) {
30885 + /* Just start temporary transaction and finish it */
30886 + err = journal_begin(&th, sb, 1);
30887 + if (err)
30888 + goto out;
30889 + err = journal_end_sync(&th);
30890 + if (err)
30891 + goto out;
30893 + reiserfs_write_unlock(sb);
30894 + err = dquot_quota_on(sb, type, format_id, path);
30895 + if (!err) {
30896 + inode_lock(inode);
30897 + REISERFS_I(inode)->i_attrs |= REISERFS_IMMUTABLE_FL |
30898 + REISERFS_NOATIME_FL;
30899 + inode_set_flags(inode, S_IMMUTABLE | S_NOATIME,
30900 + S_IMMUTABLE | S_NOATIME);
30901 + inode_unlock(inode);
30902 + mark_inode_dirty(inode);
30904 + return err;
30905 +out:
30906 + reiserfs_write_unlock(sb);
30907 + return err;
30910 +static int reiserfs_quota_off(struct super_block *sb, int type)
30912 + int err;
30913 + struct inode *inode = sb_dqopt(sb)->files[type];
30915 + if (!inode || !igrab(inode))
30916 + goto out;
30918 + err = dquot_quota_off(sb, type);
30919 + if (err)
30920 + goto out_put;
30922 + inode_lock(inode);
30923 + REISERFS_I(inode)->i_attrs &= ~(REISERFS_IMMUTABLE_FL |
30924 + REISERFS_NOATIME_FL);
30925 + inode_set_flags(inode, 0, S_IMMUTABLE | S_NOATIME);
30926 + inode_unlock(inode);
30927 + mark_inode_dirty(inode);
30928 +out_put:
30929 + iput(inode);
30930 + return err;
30931 +out:
30932 + return dquot_quota_off(sb, type);
30936 + * Read data from quotafile - avoid pagecache and such because we cannot afford
30937 + * acquiring the locks... As quota files are never truncated and quota code
30938 + * itself serializes the operations (and no one else should touch the files)
30939 + * we don't have to be afraid of races
30940 + */
30941 +static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
30942 + size_t len, loff_t off)
30944 + struct inode *inode = sb_dqopt(sb)->files[type];
30945 + unsigned long blk = off >> sb->s_blocksize_bits;
30946 + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
30947 + size_t toread;
30948 + struct buffer_head tmp_bh, *bh;
30949 + loff_t i_size = i_size_read(inode);
30951 + if (off > i_size)
30952 + return 0;
30953 + if (off + len > i_size)
30954 + len = i_size - off;
30955 + toread = len;
30956 + while (toread > 0) {
30957 + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
30958 + tmp_bh.b_state = 0;
30959 + /*
30960 + * Quota files are without tails so we can safely
30961 + * use this function
30962 + */
30963 + reiserfs_write_lock(sb);
30964 + err = reiserfs_get_block(inode, blk, &tmp_bh, 0);
30965 + reiserfs_write_unlock(sb);
30966 + if (err)
30967 + return err;
30968 + if (!buffer_mapped(&tmp_bh)) /* A hole? */
30969 + memset(data, 0, tocopy);
30970 + else {
30971 + bh = sb_bread(sb, tmp_bh.b_blocknr);
30972 + if (!bh)
30973 + return -EIO;
30974 + memcpy(data, bh->b_data + offset, tocopy);
30975 + brelse(bh);
30977 + offset = 0;
30978 + toread -= tocopy;
30979 + data += tocopy;
30980 + blk++;
30982 + return len;
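The blk/offset/tocopy arithmetic above is the generic partial-block walk: the first iteration copies from offset to the end of its block, every later iteration starts at offset 0. A self-contained userspace sketch with a deliberately tiny block size (all names and sizes here are illustrative):

#include <stdio.h>
#include <string.h>

#define BLKSIZE 8 /* tiny block size so the walk is visible */

/* same blk/offset/tocopy arithmetic as reiserfs_quota_read(), in userspace */
static void read_range(const char *disk, char *out, size_t len, size_t off)
{
	size_t blk = off / BLKSIZE;
	size_t offset = off % BLKSIZE;

	while (len > 0) {
		size_t tocopy = BLKSIZE - offset < len ? BLKSIZE - offset : len;

		memcpy(out, disk + blk * BLKSIZE + offset, tocopy);
		offset = 0; /* later blocks are read from their start */
		len -= tocopy;
		out += tocopy;
		blk++;
	}
}

int main(void)
{
	char disk[] = "0123456789abcdefghijklmn";
	char out[8] = { 0 };

	read_range(disk, out, 6, 5); /* crosses the block 0 / block 1 edge */
	printf("%.6s\n", out);       /* prints 56789a */
	return 0;
}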
30986 + * Write to quotafile (we know the transaction is already started and has
30987 + * enough credits)
30988 + */
30989 +static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
30990 + const char *data, size_t len, loff_t off)
30992 + struct inode *inode = sb_dqopt(sb)->files[type];
30993 + unsigned long blk = off >> sb->s_blocksize_bits;
30994 + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
30995 + int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL;
30996 + size_t towrite = len;
30997 + struct buffer_head tmp_bh, *bh;
30999 + if (!current->journal_info) {
31000 + printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n",
31001 + (unsigned long long)off, (unsigned long long)len);
31002 + return -EIO;
31004 + while (towrite > 0) {
31005 + tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite);
31006 + tmp_bh.b_state = 0;
31007 + reiserfs_write_lock(sb);
31008 + err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
31009 + reiserfs_write_unlock(sb);
31010 + if (err)
31011 + goto out;
31012 + if (offset || tocopy != sb->s_blocksize)
31013 + bh = sb_bread(sb, tmp_bh.b_blocknr);
31014 + else
31015 + bh = sb_getblk(sb, tmp_bh.b_blocknr);
31016 + if (!bh) {
31017 + err = -EIO;
31018 + goto out;
31020 + lock_buffer(bh);
31021 + memcpy(bh->b_data + offset, data, tocopy);
31022 + flush_dcache_page(bh->b_page);
31023 + set_buffer_uptodate(bh);
31024 + unlock_buffer(bh);
31025 + reiserfs_write_lock(sb);
31026 + reiserfs_prepare_for_journal(sb, bh, 1);
31027 + journal_mark_dirty(current->journal_info, bh);
31028 + if (!journal_quota)
31029 + reiserfs_add_ordered_list(inode, bh);
31030 + reiserfs_write_unlock(sb);
31031 + brelse(bh);
31032 + offset = 0;
31033 + towrite -= tocopy;
31034 + data += tocopy;
31035 + blk++;
31037 +out:
31038 + if (len == towrite)
31039 + return err;
31040 + if (inode->i_size < off + len - towrite)
31041 + i_size_write(inode, off + len - towrite);
31042 + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
31043 + mark_inode_dirty(inode);
31044 + return len - towrite;
31047 +#endif
31049 +static struct dentry *get_super_block(struct file_system_type *fs_type,
31050 + int flags, const char *dev_name,
31051 + void *data)
31053 + return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
31056 +static int __init init_reiserfs_fs(void)
31058 + int ret;
31060 + ret = init_inodecache();
31061 + if (ret)
31062 + return ret;
31064 + reiserfs_proc_info_global_init();
31066 + ret = register_filesystem(&reiserfs_fs_type);
31067 + if (ret)
31068 + goto out;
31070 + return 0;
31071 +out:
31072 + reiserfs_proc_info_global_done();
31073 + destroy_inodecache();
31075 + return ret;
31078 +static void __exit exit_reiserfs_fs(void)
31080 + reiserfs_proc_info_global_done();
31081 + unregister_filesystem(&reiserfs_fs_type);
31082 + destroy_inodecache();
31085 +struct file_system_type reiserfs_fs_type = {
31086 + .owner = THIS_MODULE,
31087 + .name = "reiserfs",
31088 + .mount = get_super_block,
31089 + .kill_sb = reiserfs_kill_sb,
31090 + .fs_flags = FS_REQUIRES_DEV,
31092 +MODULE_ALIAS_FS("reiserfs");
31094 +MODULE_DESCRIPTION("ReiserFS journaled filesystem");
31095 +MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>");
31096 +MODULE_LICENSE("GPL");
31098 +module_init(init_reiserfs_fs);
31099 +module_exit(exit_reiserfs_fs);
31100 diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
31101 new file mode 100644
31102 index 000000000000..2cec61af2a9e
31103 --- /dev/null
31104 +++ b/fs/reiserfs/tail_conversion.c
31105 @@ -0,0 +1,318 @@
31106 +// SPDX-License-Identifier: GPL-2.0
31108 + * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright
31109 + * details
31110 + */
31112 +#include <linux/time.h>
31113 +#include <linux/pagemap.h>
31114 +#include <linux/buffer_head.h>
31115 +#include "reiserfs.h"
31118 + * access to the tail: a reader must first make sure no conversion is
31119 + * running; direct2indirect and indirect2direct cannot run concurrently
31120 + */
31123 + * Converts direct items to an unformatted node. Panics if the file has
31124 + * no tail. Returns -ENOSPC if there is no disk space for the conversion
31125 + */
31127 + * path points to the first direct item of the file regardless of how
31128 + * many of them there are
31129 + */
31130 +int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
31131 + struct treepath *path, struct buffer_head *unbh,
31132 + loff_t tail_offset)
31134 + struct super_block *sb = inode->i_sb;
31135 + struct buffer_head *up_to_date_bh;
31136 + struct item_head *p_le_ih = tp_item_head(path);
31137 + unsigned long total_tail = 0;
31139 + /* Key to search for the last byte of the converted item. */
31140 + struct cpu_key end_key;
31142 + /*
31143 + * new indirect item to be inserted or key
31144 + * of unfm pointer to be pasted
31145 + */
31146 + struct item_head ind_ih;
31147 + int blk_size;
31148 + /* returned value for reiserfs_insert_item and clones */
31149 + int retval;
31150 + /* Handle on an unformatted node that will be inserted in the tree. */
31151 + unp_t unfm_ptr;
31153 + BUG_ON(!th->t_trans_id);
31155 + REISERFS_SB(sb)->s_direct2indirect++;
31157 + blk_size = sb->s_blocksize;
31159 + /*
31160 + * and key to search for append or insert pointer to the new
31161 + * unformatted node.
31162 + */
31163 + copy_item_head(&ind_ih, p_le_ih);
31164 + set_le_ih_k_offset(&ind_ih, tail_offset);
31165 + set_le_ih_k_type(&ind_ih, TYPE_INDIRECT);
31167 + /* Set the key to search for the place for new unfm pointer */
31168 + make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4);
31170 + /* FIXME: we could avoid this */
31171 + if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) {
31172 + reiserfs_error(sb, "PAP-14030",
31173 + "pasted or inserted byte exists in "
31174 + "the tree %K. Use fsck to repair.", &end_key);
31175 + pathrelse(path);
31176 + return -EIO;
31179 + p_le_ih = tp_item_head(path);
31181 + unfm_ptr = cpu_to_le32(unbh->b_blocknr);
31183 + if (is_statdata_le_ih(p_le_ih)) {
31184 + /* Insert new indirect item. */
31185 + set_ih_free_space(&ind_ih, 0); /* delete at nearest future */
31186 + put_ih_item_len(&ind_ih, UNFM_P_SIZE);
31187 + PATH_LAST_POSITION(path)++;
31188 + retval =
31189 + reiserfs_insert_item(th, path, &end_key, &ind_ih, inode,
31190 + (char *)&unfm_ptr);
31191 + } else {
31192 + /* Paste into last indirect item of an object. */
31193 + retval = reiserfs_paste_into_item(th, path, &end_key, inode,
31194 + (char *)&unfm_ptr,
31195 + UNFM_P_SIZE);
31197 + if (retval) {
31198 + return retval;
31200 + /*
31201 + * note: from here there are two keys which have matching first
31202 + * three key components. They only differ by the fourth one.
31203 + */
31205 + /* Set the key to search for the direct items of the file */
31206 + make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT,
31207 + 4);
31209 + /*
31210 + * Move bytes from the direct items to the new unformatted node
31211 + * and delete them.
31212 + */
31213 + while (1) {
31214 + int tail_size;
31216 + /*
31217 + * end_key.k_offset is set so, that we will always have found
31218 + * last item of the file
31219 + */
31220 + if (search_for_position_by_key(sb, &end_key, path) ==
31221 + POSITION_FOUND)
31222 + reiserfs_panic(sb, "PAP-14050",
31223 + "direct item (%K) not found", &end_key);
31224 + p_le_ih = tp_item_head(path);
31225 + RFALSE(!is_direct_le_ih(p_le_ih),
31226 + "vs-14055: direct item expected(%K), found %h",
31227 + &end_key, p_le_ih);
31228 + tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1))
31229 + + ih_item_len(p_le_ih) - 1;
31231 + /*
31232 + * we only send the unbh pointer if the buffer is not
31233 + * up to date. this avoids overwriting good data from
31234 + * writepage() with old data from the disk or buffer cache
31235 + * Special case: unbh->b_page will be NULL if we are coming
31236 + * through DIRECT_IO handler here.
31237 + */
31238 + if (!unbh->b_page || buffer_uptodate(unbh)
31239 + || PageUptodate(unbh->b_page)) {
31240 + up_to_date_bh = NULL;
31241 + } else {
31242 + up_to_date_bh = unbh;
31244 + retval = reiserfs_delete_item(th, path, &end_key, inode,
31245 + up_to_date_bh);
31247 + total_tail += retval;
31249 + /* done: file does not have direct items anymore */
31250 + if (tail_size == retval)
31251 + break;
31254 + /*
31255 + * if we've copied bytes from disk into the page, we need to zero
31256 + * out the unused part of the block (it was not up to date before)
31257 + */
31258 + if (up_to_date_bh) {
31259 + unsigned pgoff =
31260 + (tail_offset + total_tail - 1) & (PAGE_SIZE - 1);
31261 + char *kaddr = kmap_atomic(up_to_date_bh->b_page);
31262 + memset(kaddr + pgoff, 0, blk_size - total_tail);
31263 + kunmap_atomic(kaddr);
31266 + REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
31268 + return 0;
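The tail_size computation in the conversion loop above masks the 1-based key offset into its block and adds the item length. A worked example under assumed numbers (4 KiB blocks, a direct item keyed at byte 4097 carrying 200 bytes):

#include <stdio.h>

int main(void)
{
	unsigned long blk_size = 4096;
	/* assumed: a direct item keyed at byte 4097 (1-based, i.e. the
	 * first byte of block 1) carrying 200 tail bytes */
	unsigned long ih_offset = 4097, ih_len = 200;
	unsigned long tail_size = (ih_offset & (blk_size - 1)) + ih_len - 1;

	printf("%lu\n", tail_size); /* 200: tail bytes within the block */
	return 0;
}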
31271 +/* stolen from fs/buffer.c */
31272 +void reiserfs_unmap_buffer(struct buffer_head *bh)
31274 + lock_buffer(bh);
31275 + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
31276 + BUG();
31278 + clear_buffer_dirty(bh);
31279 + /*
31280 + * Remove the buffer from whatever list it belongs to. We are mostly
31281 + * interested in removing it from per-sb j_dirty_buffers list, to avoid
31282 + * BUG() on attempt to write not mapped buffer
31283 + */
31284 + if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) {
31285 + struct inode *inode = bh->b_folio->mapping->host;
31286 + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
31287 + spin_lock(&j->j_dirty_buffers_lock);
31288 + list_del_init(&bh->b_assoc_buffers);
31289 + reiserfs_free_jh(bh);
31290 + spin_unlock(&j->j_dirty_buffers_lock);
31292 + clear_buffer_mapped(bh);
31293 + clear_buffer_req(bh);
31294 + clear_buffer_new(bh);
31295 + bh->b_bdev = NULL;
31296 + unlock_buffer(bh);
31300 + * this first locks the inode (neither reads nor sync are permitted),
31301 + * reads the tail through the page cache and inserts a direct item.
31302 + * When the direct item is inserted successfully, the inode is left
31303 + * locked. The return value is always what we expect (the number of
31304 + * cut bytes). But when the tail remains in the unformatted node, we
31305 + * set mode to SKIP_BALANCING and unlock the inode
31306 + */
31307 +int indirect2direct(struct reiserfs_transaction_handle *th,
31308 + struct inode *inode, struct page *page,
31309 + struct treepath *path, /* path to the indirect item. */
31310 + const struct cpu_key *item_key, /* Key to look for
31311 + * unformatted node
31312 + * pointer to be cut. */
31313 + loff_t n_new_file_size, /* New file size. */
31314 + char *mode)
31316 + struct super_block *sb = inode->i_sb;
31317 + struct item_head s_ih;
31318 + unsigned long block_size = sb->s_blocksize;
31319 + char *tail;
31320 + int tail_len, round_tail_len;
31321 + loff_t pos, pos1; /* position of first byte of the tail */
31322 + struct cpu_key key;
31324 + BUG_ON(!th->t_trans_id);
31326 + REISERFS_SB(sb)->s_indirect2direct++;
31328 + *mode = M_SKIP_BALANCING;
31330 + /* store item head path points to. */
31331 + copy_item_head(&s_ih, tp_item_head(path));
31333 + tail_len = (n_new_file_size & (block_size - 1));
31334 + if (get_inode_sd_version(inode) == STAT_DATA_V2)
31335 + round_tail_len = ROUND_UP(tail_len);
31336 + else
31337 + round_tail_len = tail_len;
31339 + pos =
31340 + le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE -
31341 + 1) * sb->s_blocksize;
31342 + pos1 = pos;
31344 + /*
31345 + * we are protected by i_mutex. The tail cannot disappear, nor can
31346 + * an append happen either:
31347 + * we are in truncate or packing the tail in file_release
31348 + */
31350 + tail = (char *)kmap(page); /* this can schedule */
31352 + if (path_changed(&s_ih, path)) {
31353 + /* re-search indirect item */
31354 + if (search_for_position_by_key(sb, item_key, path)
31355 + == POSITION_NOT_FOUND)
31356 + reiserfs_panic(sb, "PAP-5520",
31357 + "item to be converted %K does not exist",
31358 + item_key);
31359 + copy_item_head(&s_ih, tp_item_head(path));
31360 +#ifdef CONFIG_REISERFS_CHECK
31361 + pos = le_ih_k_offset(&s_ih) - 1 +
31362 + (ih_item_len(&s_ih) / UNFM_P_SIZE -
31363 + 1) * sb->s_blocksize;
31364 + if (pos != pos1)
31365 + reiserfs_panic(sb, "vs-5530", "tail position "
31366 + "changed while we were reading it");
31367 +#endif
31370 + /* Set direct item header to insert. */
31371 + make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode),
31372 + pos1 + 1, TYPE_DIRECT, round_tail_len,
31373 + 0xffff /*ih_free_space */ );
31375 + /*
31376 + * we want a pointer to the first byte of the tail in the page.
31377 + * the page was locked and this part of the page was up to date when
31378 + * indirect2direct was called, so we know the bytes are still valid
31379 + */
31380 + tail = tail + (pos & (PAGE_SIZE - 1));
31382 + PATH_LAST_POSITION(path)++;
31384 + key = *item_key;
31385 + set_cpu_key_k_type(&key, TYPE_DIRECT);
31386 + key.key_length = 4;
31387 + /* Insert tail as new direct item in the tree */
31388 + if (reiserfs_insert_item(th, path, &key, &s_ih, inode,
31389 + tail ? tail : NULL) < 0) {
31390 + /*
31391 + * No disk space. So we cannot convert the last unformatted node
31392 + * to a direct item. In this case we used to adjust the
31393 + * indirect item's ih_free_space. Now ih_free_space is not
31394 + * used, and it would be ideal to write zeros to the corresponding
31395 + * unformatted node. For now i_size serves as the guard against
31396 + * going past the end of the file
31397 + */
31398 + kunmap(page);
31399 + return block_size - round_tail_len;
31401 + kunmap(page);
31403 + /* make sure to get the i_blocks changes from reiserfs_insert_item */
31404 + reiserfs_update_sd(th, inode);
31406 + /*
31407 + * note: we have now the same as in above direct2indirect
31408 + * conversion: there are two keys which have matching first three
31409 + * key components. They only differ by the fourth one.
31410 + */
31412 + /*
31413 + * We have inserted new direct item and must remove last
31414 + * unformatted node.
31415 + */
31416 + *mode = M_CUT;
31418 + /* we store position of first direct item in the in-core inode */
31419 + /* mark_file_with_tail (inode, pos1 + 1); */
31420 + REISERFS_I(inode)->i_first_direct_byte = pos1 + 1;
31422 + return block_size - round_tail_len;
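The pos computation near the top of indirect2direct() locates the first byte of the last unformatted block the indirect item points to, which is where the tail bytes live. A worked example under assumed numbers (UNFM_P_SIZE of 4 bytes per pointer, 4 KiB blocks, an item at key offset 1 holding three pointers):

#include <stdio.h>

#define UNFM_P_SIZE 4 /* bytes per unformatted-node pointer (32-bit) */

int main(void)
{
	unsigned long blocksize = 4096;
	/* assumed: an indirect item at key offset 1 (file start) holding
	 * three block pointers, i.e. ih_item_len == 12 */
	unsigned long ih_offset = 1, ih_len = 12;

	/* byte position of the first byte of the last pointed-to block,
	 * which is where the tail to be converted lives */
	unsigned long pos = ih_offset - 1 +
			    (ih_len / UNFM_P_SIZE - 1) * blocksize;

	printf("%lu\n", pos); /* 8192 */
	return 0;
}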
31424 diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
31425 new file mode 100644
31426 index 000000000000..998035a6388e
31427 --- /dev/null
31428 +++ b/fs/reiserfs/xattr.c
31429 @@ -0,0 +1,1039 @@
31430 +// SPDX-License-Identifier: GPL-2.0
31432 + * linux/fs/reiserfs/xattr.c
31434 + * Copyright (c) 2002 by Jeff Mahoney, <jeffm@suse.com>
31436 + */
31439 + * In order to implement EA/ACLs in a clean, backwards compatible manner,
31440 + * they are implemented as files in a "private" directory.
31441 + * Each EA is in its own file, with the directory layout like so (/ is assumed
31442 + * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory,
31443 + * directories named using the capital-hex form of the objectid and
31444 + * generation number are used. Inside each directory are individual files
31445 + * named with the name of the extended attribute.
31447 + * So, for objectid 12648430, we could have:
31448 + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access
31449 + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default
31450 + * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type
31451 + * .. or similar.
31453 + * The file contents are the text of the EA. The size is known based on the
31454 + * stat data describing the file.
31456 + * In the case of system.posix_acl_access and system.posix_acl_default, since
31457 + * these are special cases for filesystem ACLs, they are interpreted by the
31458 + * kernel, in addition, they are negatively and positively cached and attached
31459 + * to the inode so that unnecessary lookups are avoided.
31461 + * Locking works like so:
31462 + * Directory components (xattr root, xattr dir) are protected by their i_mutex.
31463 + * The xattrs themselves are protected by the xattr_sem.
31464 + */
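For the objectid in the example above, the per-inode directory name falls out of the same "%X.%X" format that open_xa_dir() builds further down; a quick standalone check (buffer size and names here are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int objectid = 12648430, generation = 0;
	char namebuf[32];

	/* same "%X.%X" scheme open_xa_dir() uses below */
	snprintf(namebuf, sizeof(namebuf), "%X.%X", objectid, generation);
	printf("/.reiserfs_priv/xattrs/%s/user.Content-Type\n", namebuf);
	/* prints /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type */
	return 0;
}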
31466 +#include "reiserfs.h"
31467 +#include <linux/capability.h>
31468 +#include <linux/dcache.h>
31469 +#include <linux/namei.h>
31470 +#include <linux/errno.h>
31471 +#include <linux/gfp.h>
31472 +#include <linux/fs.h>
31473 +#include <linux/file.h>
31474 +#include <linux/pagemap.h>
31475 +#include <linux/xattr.h>
31476 +#include "xattr.h"
31477 +#include "acl.h"
31478 +#include <linux/uaccess.h>
31479 +#include <net/checksum.h>
31480 +#include <linux/stat.h>
31481 +#include <linux/quotaops.h>
31482 +#include <linux/security.h>
31483 +#include <linux/posix_acl_xattr.h>
31486 +#define PRIVROOT_NAME ".reiserfs_priv"
31487 +#define XAROOT_NAME "xattrs"
31491 + * Helpers for inode ops. We do this so that we don't have all the VFS
31492 + * overhead and also for proper i_mutex annotation.
31493 + * dir->i_mutex must be held for all of them.
31494 + */
31495 +#ifdef CONFIG_REISERFS_FS_XATTR
31496 +static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
31498 + BUG_ON(!inode_is_locked(dir));
31499 + return dir->i_op->create(&nop_mnt_idmap, dir, dentry, mode, true);
31501 +#endif
31503 +static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
31505 + BUG_ON(!inode_is_locked(dir));
31506 + return dir->i_op->mkdir(&nop_mnt_idmap, dir, dentry, mode);
31510 + * We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr
31511 + * mutation ops aren't called during rename or splice, which are the
31512 + * only other users of I_MUTEX_CHILD. It violates the ordering, but that's
31513 + * better than allocating another subclass just for this code.
31514 + */
31515 +static int xattr_unlink(struct inode *dir, struct dentry *dentry)
31517 + int error;
31519 + BUG_ON(!inode_is_locked(dir));
31521 + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
31522 + error = dir->i_op->unlink(dir, dentry);
31523 + inode_unlock(d_inode(dentry));
31525 + if (!error)
31526 + d_delete(dentry);
31527 + return error;
31530 +static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
31532 + int error;
31534 + BUG_ON(!inode_is_locked(dir));
31536 + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
31537 + error = dir->i_op->rmdir(dir, dentry);
31538 + if (!error)
31539 + d_inode(dentry)->i_flags |= S_DEAD;
31540 + inode_unlock(d_inode(dentry));
31541 + if (!error)
31542 + d_delete(dentry);
31544 + return error;
31547 +#define xattr_may_create(flags) (!flags || flags & XATTR_CREATE)
31549 +static struct dentry *open_xa_root(struct super_block *sb, int flags)
31551 + struct dentry *privroot = REISERFS_SB(sb)->priv_root;
31552 + struct dentry *xaroot;
31554 + if (d_really_is_negative(privroot))
31555 + return ERR_PTR(-EOPNOTSUPP);
31557 + inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR);
31559 + xaroot = dget(REISERFS_SB(sb)->xattr_root);
31560 + if (!xaroot)
31561 + xaroot = ERR_PTR(-EOPNOTSUPP);
31562 + else if (d_really_is_negative(xaroot)) {
31563 + int err = -ENODATA;
31565 + if (xattr_may_create(flags))
31566 + err = xattr_mkdir(d_inode(privroot), xaroot, 0700);
31567 + if (err) {
31568 + dput(xaroot);
31569 + xaroot = ERR_PTR(err);
31573 + inode_unlock(d_inode(privroot));
31574 + return xaroot;
31577 +static struct dentry *open_xa_dir(const struct inode *inode, int flags)
31579 + struct dentry *xaroot, *xadir;
31580 + char namebuf[17];
31582 + xaroot = open_xa_root(inode->i_sb, flags);
31583 + if (IS_ERR(xaroot))
31584 + return xaroot;
31586 + snprintf(namebuf, sizeof(namebuf), "%X.%X",
31587 + le32_to_cpu(INODE_PKEY(inode)->k_objectid),
31588 + inode->i_generation);
31590 + inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR);
31592 + xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
31593 + if (!IS_ERR(xadir) && d_really_is_negative(xadir)) {
31594 + int err = -ENODATA;
31596 + if (xattr_may_create(flags))
31597 + err = xattr_mkdir(d_inode(xaroot), xadir, 0700);
31598 + if (err) {
31599 + dput(xadir);
31600 + xadir = ERR_PTR(err);
31604 + inode_unlock(d_inode(xaroot));
31605 + dput(xaroot);
31606 + return xadir;
31610 + * The following are side effects of other operations that aren't explicitly
31611 + * modifying extended attributes. This includes operations such as permissions
31612 + * or ownership changes, object deletions, etc.
31613 + */
31614 +struct reiserfs_dentry_buf {
31615 + struct dir_context ctx;
31616 + struct dentry *xadir;
31617 + int count;
31618 + int err;
31619 + struct dentry *dentries[8];
31622 +static bool
31623 +fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
31624 + loff_t offset, u64 ino, unsigned int d_type)
31626 + struct reiserfs_dentry_buf *dbuf =
31627 + container_of(ctx, struct reiserfs_dentry_buf, ctx);
31628 + struct dentry *dentry;
31630 + WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir)));
31632 + if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
31633 + return false;
31635 + if (name[0] == '.' && (namelen < 2 ||
31636 + (namelen == 2 && name[1] == '.')))
31637 + return true;
31639 + dentry = lookup_one_len(name, dbuf->xadir, namelen);
31640 + if (IS_ERR(dentry)) {
31641 + dbuf->err = PTR_ERR(dentry);
31642 + return false;
31643 + } else if (d_really_is_negative(dentry)) {
31644 + /* A directory entry exists, but no file? */
31645 + reiserfs_error(dentry->d_sb, "xattr-20003",
31646 + "Corrupted directory: xattr %pd listed but "
31647 + "not found for file %pd.\n",
31648 + dentry, dbuf->xadir);
31649 + dput(dentry);
31650 + dbuf->err = -EIO;
31651 + return false;
31654 + dbuf->dentries[dbuf->count++] = dentry;
31655 + return true;
31658 +static void
31659 +cleanup_dentry_buf(struct reiserfs_dentry_buf *buf)
31661 + int i;
31663 + for (i = 0; i < buf->count; i++)
31664 + if (buf->dentries[i])
31665 + dput(buf->dentries[i]);
31668 +static int reiserfs_for_each_xattr(struct inode *inode,
31669 + int (*action)(struct dentry *, void *),
31670 + void *data)
31672 + struct dentry *dir;
31673 + int i, err = 0;
31674 + struct reiserfs_dentry_buf buf = {
31675 + .ctx.actor = fill_with_dentries,
31676 + };
31678 + /* Skip out, an xattr has no xattrs associated with it */
31679 + if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
31680 + return 0;
31682 + dir = open_xa_dir(inode, XATTR_REPLACE);
31683 + if (IS_ERR(dir)) {
31684 + err = PTR_ERR(dir);
31685 + goto out;
31686 + } else if (d_really_is_negative(dir)) {
31687 + err = 0;
31688 + goto out_dir;
31691 + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
31693 + buf.xadir = dir;
31694 + while (1) {
31695 + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
31696 + if (err)
31697 + break;
31698 + if (buf.err) {
31699 + err = buf.err;
31700 + break;
31702 + if (!buf.count)
31703 + break;
31704 + for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
31705 + struct dentry *dentry = buf.dentries[i];
31707 + if (!d_is_dir(dentry))
31708 + err = action(dentry, data);
31710 + dput(dentry);
31711 + buf.dentries[i] = NULL;
31713 + if (err)
31714 + break;
31715 + buf.count = 0;
31717 + inode_unlock(d_inode(dir));
31719 + cleanup_dentry_buf(&buf);
31721 + if (!err) {
31722 + /*
31723 + * We start a transaction here to avoid an ABBA situation
31724 + * between the xattr root's i_mutex and the journal lock.
31725 + * This doesn't incur much additional overhead since the
31726 + * new transaction will just nest inside the
31727 + * outer transaction.
31728 + */
31729 + int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
31730 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
31731 + struct reiserfs_transaction_handle th;
31733 + reiserfs_write_lock(inode->i_sb);
31734 + err = journal_begin(&th, inode->i_sb, blocks);
31735 + reiserfs_write_unlock(inode->i_sb);
31736 + if (!err) {
31737 + int jerror;
31739 + inode_lock_nested(d_inode(dir->d_parent),
31740 + I_MUTEX_XATTR);
31741 + err = action(dir, data);
31742 + reiserfs_write_lock(inode->i_sb);
31743 + jerror = journal_end(&th);
31744 + reiserfs_write_unlock(inode->i_sb);
31745 + inode_unlock(d_inode(dir->d_parent));
31746 + err = jerror ?: err;
31749 +out_dir:
31750 + dput(dir);
31751 +out:
31752 + /*
31753 + * -ENODATA: this object doesn't have any xattrs
31754 + * -EOPNOTSUPP: this file system doesn't have xattrs enabled on disk.
31755 + * Neither is an error.
31756 + */
31757 + if (err == -ENODATA || err == -EOPNOTSUPP)
31758 + err = 0;
31759 + return err;
31762 +static int delete_one_xattr(struct dentry *dentry, void *data)
31764 + struct inode *dir = d_inode(dentry->d_parent);
31766 + /* This is the xattr dir, handle specially. */
31767 + if (d_is_dir(dentry))
31768 + return xattr_rmdir(dir, dentry);
31770 + return xattr_unlink(dir, dentry);
31773 +static int chown_one_xattr(struct dentry *dentry, void *data)
31775 + struct iattr *attrs = data;
31776 + int ia_valid = attrs->ia_valid;
31777 + int err;
31779 + /*
31780 + * We only want the ownership bits. Otherwise, we'll do
31781 + * things like change a directory to a regular file if
31782 + * ATTR_MODE is set.
31783 + */
31784 + attrs->ia_valid &= (ATTR_UID|ATTR_GID);
31785 + err = reiserfs_setattr(&nop_mnt_idmap, dentry, attrs);
31786 + attrs->ia_valid = ia_valid;
31788 + return err;
31791 +/* No i_mutex, but the inode is unconnected. */
31792 +int reiserfs_delete_xattrs(struct inode *inode)
31794 + int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL);
31796 + if (err)
31797 + reiserfs_warning(inode->i_sb, "jdm-20004",
31798 + "Couldn't delete all xattrs (%d)\n", err);
31799 + return err;
31802 +/* inode->i_mutex: down */
31803 +int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
31805 + int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs);
31807 + if (err)
31808 + reiserfs_warning(inode->i_sb, "jdm-20007",
31809 + "Couldn't chown all xattrs (%d)\n", err);
31810 + return err;
31813 +#ifdef CONFIG_REISERFS_FS_XATTR
31815 + * Returns a dentry corresponding to a specific extended attribute file
31816 + * for the inode, creating it if the flags allow. Otherwise a positive
31817 + * or negative dentry, or an error, is returned.
31818 + */
31819 +static struct dentry *xattr_lookup(struct inode *inode, const char *name,
31820 + int flags)
31822 + struct dentry *xadir, *xafile;
31823 + int err = 0;
31825 + xadir = open_xa_dir(inode, flags);
31826 + if (IS_ERR(xadir))
31827 + return ERR_CAST(xadir);
31829 + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
31830 + xafile = lookup_one_len(name, xadir, strlen(name));
31831 + if (IS_ERR(xafile)) {
31832 + err = PTR_ERR(xafile);
31833 + goto out;
31836 + if (d_really_is_positive(xafile) && (flags & XATTR_CREATE))
31837 + err = -EEXIST;
31839 + if (d_really_is_negative(xafile)) {
31840 + err = -ENODATA;
31841 + if (xattr_may_create(flags))
31842 + err = xattr_create(d_inode(xadir), xafile,
31843 + 0700|S_IFREG);
31846 + if (err)
31847 + dput(xafile);
31848 +out:
31849 + inode_unlock(d_inode(xadir));
31850 + dput(xadir);
31851 + if (err)
31852 + return ERR_PTR(err);
31853 + return xafile;
31856 +/* Internal operations on file data */
31857 +static inline void reiserfs_put_page(struct page *page)
31859 + kunmap(page);
31860 + put_page(page);
31863 +static struct page *reiserfs_get_page(struct inode *dir, size_t n)
31865 + struct address_space *mapping = dir->i_mapping;
31866 + struct page *page;
31867 + /*
31868 + * We can deadlock if we try to free dentries,
31869 + * and an unlink/rmdir has just occurred - GFP_NOFS avoids this
31870 + */
31871 + mapping_set_gfp_mask(mapping, GFP_NOFS);
31872 + page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL);
31873 + if (!IS_ERR(page))
31874 + kmap(page);
31875 + return page;
31878 +static inline __u32 xattr_hash(const char *msg, int len)
31880 + /*
31881 + * csum_partial() gives different results for little-endian and
31882 + * big-endian hosts. Images created on little-endian hosts and
31883 + * mounted on big-endian hosts (and vice versa) will see csum mismatches
31884 + * when trying to fetch xattrs. Treating the hash as __wsum_t would
31885 + * lower the frequency of mismatch. This is an endianness bug in
31886 + * reiserfs. The return statement would result in a sparse warning. Do
31887 + * not fix the sparse warning so as to not hide a reminder of the bug.
31888 + */
31889 + return csum_partial(msg, len, 0);
31892 +int reiserfs_commit_write(struct file *f, struct page *page,
31893 + unsigned from, unsigned to);
31895 +static void update_ctime(struct inode *inode)
31897 + struct timespec64 now = current_time(inode);
31898 + struct timespec64 ctime = inode_get_ctime(inode);
31900 + if (inode_unhashed(inode) || !inode->i_nlink ||
31901 + timespec64_equal(&ctime, &now))
31902 + return;
31904 + inode_set_ctime_to_ts(inode, now);
31905 + mark_inode_dirty(inode);
31908 +static int lookup_and_delete_xattr(struct inode *inode, const char *name)
31910 + int err = 0;
31911 + struct dentry *dentry, *xadir;
31913 + xadir = open_xa_dir(inode, XATTR_REPLACE);
31914 + if (IS_ERR(xadir))
31915 + return PTR_ERR(xadir);
31917 + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
31918 + dentry = lookup_one_len(name, xadir, strlen(name));
31919 + if (IS_ERR(dentry)) {
31920 + err = PTR_ERR(dentry);
31921 + goto out_dput;
31924 + if (d_really_is_positive(dentry)) {
31925 + err = xattr_unlink(d_inode(xadir), dentry);
31926 + update_ctime(inode);
31929 + dput(dentry);
31930 +out_dput:
31931 + inode_unlock(d_inode(xadir));
31932 + dput(xadir);
31933 + return err;
31937 +/* Generic extended attribute operations that can be used by xa plugins */
31940 + * inode->i_mutex: down
31941 + */
31942 +int
31943 +reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
31944 + struct inode *inode, const char *name,
31945 + const void *buffer, size_t buffer_size, int flags)
31947 + int err = 0;
31948 + struct dentry *dentry;
31949 + struct page *page;
31950 + char *data;
31951 + size_t file_pos = 0;
31952 + size_t buffer_pos = 0;
31953 + size_t new_size;
31954 + __u32 xahash = 0;
31956 + if (get_inode_sd_version(inode) == STAT_DATA_V1)
31957 + return -EOPNOTSUPP;
31959 + if (!buffer) {
31960 + err = lookup_and_delete_xattr(inode, name);
31961 + return err;
31964 + dentry = xattr_lookup(inode, name, flags);
31965 + if (IS_ERR(dentry))
31966 + return PTR_ERR(dentry);
31968 + down_write(&REISERFS_I(inode)->i_xattr_sem);
31970 + xahash = xattr_hash(buffer, buffer_size);
31971 + while (buffer_pos < buffer_size || buffer_pos == 0) {
31972 + size_t chunk;
31973 + size_t skip = 0;
31974 + size_t page_offset = (file_pos & (PAGE_SIZE - 1));
31976 + if (buffer_size - buffer_pos > PAGE_SIZE)
31977 + chunk = PAGE_SIZE;
31978 + else
31979 + chunk = buffer_size - buffer_pos;
31981 + page = reiserfs_get_page(d_inode(dentry), file_pos);
31982 + if (IS_ERR(page)) {
31983 + err = PTR_ERR(page);
31984 + goto out_unlock;
31987 + lock_page(page);
31988 + data = page_address(page);
31990 + if (file_pos == 0) {
31991 + struct reiserfs_xattr_header *rxh;
31993 + skip = file_pos = sizeof(struct reiserfs_xattr_header);
31994 + if (chunk + skip > PAGE_SIZE)
31995 + chunk = PAGE_SIZE - skip;
31996 + rxh = (struct reiserfs_xattr_header *)data;
31997 + rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC);
31998 + rxh->h_hash = cpu_to_le32(xahash);
32001 + reiserfs_write_lock(inode->i_sb);
32002 + err = __reiserfs_write_begin(page, page_offset, chunk + skip);
32003 + if (!err) {
32004 + if (buffer)
32005 + memcpy(data + skip, buffer + buffer_pos, chunk);
32006 + err = reiserfs_commit_write(NULL, page, page_offset,
32007 + page_offset + chunk +
32008 + skip);
32010 + reiserfs_write_unlock(inode->i_sb);
32011 + unlock_page(page);
32012 + reiserfs_put_page(page);
32013 + buffer_pos += chunk;
32014 + file_pos += chunk;
32015 + skip = 0;
32016 + if (err || buffer_size == 0 || !buffer)
32017 + break;
32020 + new_size = buffer_size + sizeof(struct reiserfs_xattr_header);
32021 + if (!err && new_size < i_size_read(d_inode(dentry))) {
32022 + struct iattr newattrs = {
32023 + .ia_ctime = current_time(inode),
32024 + .ia_size = new_size,
32025 + .ia_valid = ATTR_SIZE | ATTR_CTIME,
32026 + };
32028 + inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR);
32029 + inode_dio_wait(d_inode(dentry));
32031 + err = reiserfs_setattr(&nop_mnt_idmap, dentry, &newattrs);
32032 + inode_unlock(d_inode(dentry));
32033 + } else
32034 + update_ctime(inode);
32035 +out_unlock:
32036 + up_write(&REISERFS_I(inode)->i_xattr_sem);
32037 + dput(dentry);
32038 + return err;
32041 +/* We need to start a transaction to maintain lock ordering */
32042 +int reiserfs_xattr_set(struct inode *inode, const char *name,
32043 + const void *buffer, size_t buffer_size, int flags)
32046 + struct reiserfs_transaction_handle th;
32047 + int error, error2;
32048 + size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size);
32050 + /* Check before we start a transaction and then do nothing. */
32051 + if (!d_really_is_positive(REISERFS_SB(inode->i_sb)->priv_root))
32052 + return -EOPNOTSUPP;
32054 + if (!(flags & XATTR_REPLACE))
32055 + jbegin_count += reiserfs_xattr_jcreate_nblocks(inode);
32057 + reiserfs_write_lock(inode->i_sb);
32058 + error = journal_begin(&th, inode->i_sb, jbegin_count);
32059 + reiserfs_write_unlock(inode->i_sb);
32060 + if (error) {
32061 + return error;
32064 + error = reiserfs_xattr_set_handle(&th, inode, name,
32065 + buffer, buffer_size, flags);
32067 + reiserfs_write_lock(inode->i_sb);
32068 + error2 = journal_end(&th);
32069 + reiserfs_write_unlock(inode->i_sb);
32070 + if (error == 0)
32071 + error = error2;
32073 + return error;
32077 + * inode->i_mutex: down
32078 + */
32079 +int
32080 +reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
32081 + size_t buffer_size)
32083 + ssize_t err = 0;
32084 + struct dentry *dentry;
32085 + size_t isize;
32086 + size_t file_pos = 0;
32087 + size_t buffer_pos = 0;
32088 + struct page *page;
32089 + __u32 hash = 0;
32091 + if (name == NULL)
32092 + return -EINVAL;
32094 + /*
32095 + * We can't have xattrs attached to v1 items since they don't have
32096 + * generation numbers
32097 + */
32098 + if (get_inode_sd_version(inode) == STAT_DATA_V1)
32099 + return -EOPNOTSUPP;
32101 + /*
32102 + * priv_root needn't be initialized during mount, so allow initial
32103 + * lookups to succeed.
32104 + */
32105 + if (!REISERFS_SB(inode->i_sb)->priv_root)
32106 + return 0;
32108 + dentry = xattr_lookup(inode, name, XATTR_REPLACE);
32109 + if (IS_ERR(dentry)) {
32110 + err = PTR_ERR(dentry);
32111 + goto out;
32114 + down_read(&REISERFS_I(inode)->i_xattr_sem);
32116 + isize = i_size_read(d_inode(dentry));
32118 + /* Just return the size needed */
32119 + if (buffer == NULL) {
32120 + err = isize - sizeof(struct reiserfs_xattr_header);
32121 + goto out_unlock;
32124 + if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) {
32125 + err = -ERANGE;
32126 + goto out_unlock;
32129 + while (file_pos < isize) {
32130 + size_t chunk;
32131 + char *data;
32132 + size_t skip = 0;
32134 + if (isize - file_pos > PAGE_SIZE)
32135 + chunk = PAGE_SIZE;
32136 + else
32137 + chunk = isize - file_pos;
32139 + page = reiserfs_get_page(d_inode(dentry), file_pos);
32140 + if (IS_ERR(page)) {
32141 + err = PTR_ERR(page);
32142 + goto out_unlock;
32145 + lock_page(page);
32146 + data = page_address(page);
32147 + if (file_pos == 0) {
32148 + struct reiserfs_xattr_header *rxh =
32149 + (struct reiserfs_xattr_header *)data;
32150 + skip = file_pos = sizeof(struct reiserfs_xattr_header);
32151 + chunk -= skip;
32152 + /* Magic doesn't match up.. */
32153 + if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) {
32154 + unlock_page(page);
32155 + reiserfs_put_page(page);
32156 + reiserfs_warning(inode->i_sb, "jdm-20001",
32157 + "Invalid magic for xattr (%s) "
32158 + "associated with %k", name,
32159 + INODE_PKEY(inode));
32160 + err = -EIO;
32161 + goto out_unlock;
32163 + hash = le32_to_cpu(rxh->h_hash);
32165 + memcpy(buffer + buffer_pos, data + skip, chunk);
32166 + unlock_page(page);
32167 + reiserfs_put_page(page);
32168 + file_pos += chunk;
32169 + buffer_pos += chunk;
32170 + skip = 0;
32172 + err = isize - sizeof(struct reiserfs_xattr_header);
32174 + if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) !=
32175 + hash) {
32176 + reiserfs_warning(inode->i_sb, "jdm-20002",
32177 + "Invalid hash for xattr (%s) associated "
32178 + "with %k", name, INODE_PKEY(inode));
32179 + err = -EIO;
32182 +out_unlock:
32183 + up_read(&REISERFS_I(inode)->i_xattr_sem);
32184 + dput(dentry);
32186 +out:
32187 + return err;
32191 + * In order to implement different sets of xattr operations for each xattr
32192 + * prefix with the generic xattr API, a filesystem should create a
32193 + * null-terminated array of struct xattr_handler (one for each prefix) and
32194 + * hang a pointer to it off of the s_xattr field of the superblock.
32196 + * The generic_fooxattr() functions will use this list to dispatch xattr
32197 + * operations to the correct xattr_handler.
32198 + */
32199 +#define for_each_xattr_handler(handlers, handler) \
32200 + for ((handler) = *(handlers)++; \
32201 + (handler) != NULL; \
32202 + (handler) = *(handlers)++)
32204 +static inline bool reiserfs_posix_acl_list(const char *name,
32205 + struct dentry *dentry)
32207 + return (posix_acl_type(name) >= 0) &&
32208 + IS_POSIXACL(d_backing_inode(dentry));
32211 +/* This is the implementation for the xattr plugin infrastructure */
32212 +static inline bool reiserfs_xattr_list(const struct xattr_handler * const *handlers,
32213 + const char *name, struct dentry *dentry)
32215 + if (handlers) {
32216 + const struct xattr_handler *xah = NULL;
32218 + for_each_xattr_handler(handlers, xah) {
32219 + const char *prefix = xattr_prefix(xah);
32221 + if (strncmp(prefix, name, strlen(prefix)))
32222 + continue;
32224 + if (!xattr_handler_can_list(xah, dentry))
32225 + return false;
32227 + return true;
32231 + return reiserfs_posix_acl_list(name, dentry);
32234 +struct listxattr_buf {
32235 + struct dir_context ctx;
32236 + size_t size;
32237 + size_t pos;
32238 + char *buf;
32239 + struct dentry *dentry;
32242 +static bool listxattr_filler(struct dir_context *ctx, const char *name,
32243 + int namelen, loff_t offset, u64 ino,
32244 + unsigned int d_type)
32246 + struct listxattr_buf *b =
32247 + container_of(ctx, struct listxattr_buf, ctx);
32248 + size_t size;
32250 + if (name[0] != '.' ||
32251 + (namelen != 1 && (name[1] != '.' || namelen != 2))) {
32252 + if (!reiserfs_xattr_list(b->dentry->d_sb->s_xattr, name,
32253 + b->dentry))
32254 + return true;
32255 + size = namelen + 1;
32256 + if (b->buf) {
32257 + if (b->pos + size > b->size) {
32258 + b->pos = -ERANGE;
32259 + return false;
32261 + memcpy(b->buf + b->pos, name, namelen);
32262 + b->buf[b->pos + namelen] = 0;
32264 + b->pos += size;
32266 + return true;
32270 + * Inode operation listxattr()
32272 + * We totally ignore the generic listxattr here because it would be stupid
32273 + * not to. Since the xattrs are organized in a directory, we can just
32274 + * readdir to find them.
32275 + */
32276 +ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
32278 + struct dentry *dir;
32279 + int err = 0;
32280 + struct listxattr_buf buf = {
32281 + .ctx.actor = listxattr_filler,
32282 + .dentry = dentry,
32283 + .buf = buffer,
32284 + .size = buffer ? size : 0,
32285 + };
32287 + if (d_really_is_negative(dentry))
32288 + return -EINVAL;
32290 + if (get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
32291 + return -EOPNOTSUPP;
32293 + dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE);
32294 + if (IS_ERR(dir)) {
32295 + err = PTR_ERR(dir);
32296 + if (err == -ENODATA)
32297 + err = 0; /* Not an error if there aren't any xattrs */
32298 + goto out;
32301 + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
32302 + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
32303 + inode_unlock(d_inode(dir));
32305 + if (!err)
32306 + err = buf.pos;
32308 + dput(dir);
32309 +out:
32310 + return err;
32313 +static int create_privroot(struct dentry *dentry)
32315 + int err;
32316 + struct inode *inode = d_inode(dentry->d_parent);
32318 + WARN_ON_ONCE(!inode_is_locked(inode));
32320 + err = xattr_mkdir(inode, dentry, 0700);
32321 + if (err || d_really_is_negative(dentry)) {
32322 + reiserfs_warning(dentry->d_sb, "jdm-20006",
32323 + "xattrs/ACLs enabled and couldn't "
32324 + "find/create .reiserfs_priv. "
32325 + "Failing mount.");
32326 + return -EOPNOTSUPP;
32329 + reiserfs_init_priv_inode(d_inode(dentry));
32330 + reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
32331 + "storage.\n", PRIVROOT_NAME);
32333 + return 0;
32336 +#else
32337 +int __init reiserfs_xattr_register_handlers(void) { return 0; }
32338 +void reiserfs_xattr_unregister_handlers(void) {}
32339 +static int create_privroot(struct dentry *dentry) { return 0; }
32340 +#endif
32342 +/* Actual operations that are exported to VFS-land */
32343 +const struct xattr_handler * const reiserfs_xattr_handlers[] = {
32344 +#ifdef CONFIG_REISERFS_FS_XATTR
32345 + &reiserfs_xattr_user_handler,
32346 + &reiserfs_xattr_trusted_handler,
32347 +#endif
32348 +#ifdef CONFIG_REISERFS_FS_SECURITY
32349 + &reiserfs_xattr_security_handler,
32350 +#endif
32351 + NULL
32354 +static int xattr_mount_check(struct super_block *s)
32356 + /*
32357 + * We need generation numbers to ensure that the oid mapping is correct;
32358 + * v3.5 filesystems don't have them.
32359 + */
32360 + if (old_format_only(s)) {
32361 + if (reiserfs_xattrs_optional(s)) {
32362 + /*
32363 + * Old format filesystem, but optional xattrs have
32364 + * been enabled. Error out.
32365 + */
32366 + reiserfs_warning(s, "jdm-2005",
32367 + "xattrs/ACLs not supported "
32368 + "on pre-v3.6 format filesystems. "
32369 + "Failing mount.");
32370 + return -EOPNOTSUPP;
32374 + return 0;
32377 +int reiserfs_permission(struct mnt_idmap *idmap, struct inode *inode,
32378 + int mask)
32380 + /*
32381 + * We don't do permission checks on the internal objects.
32382 + * Permissions are determined by the "owning" object.
32383 + */
32384 + if (IS_PRIVATE(inode))
32385 + return 0;
32387 + return generic_permission(&nop_mnt_idmap, inode, mask);
32390 +static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags)
32392 + return -EPERM;
32395 +static const struct dentry_operations xattr_lookup_poison_ops = {
32396 + .d_revalidate = xattr_hide_revalidate,
32399 +int reiserfs_lookup_privroot(struct super_block *s)
32401 + struct dentry *dentry;
32402 + int err = 0;
32404 + /* If we don't have the privroot located yet - go find it */
32405 + inode_lock(d_inode(s->s_root));
32406 + dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
32407 + strlen(PRIVROOT_NAME));
32408 + if (!IS_ERR(dentry)) {
32409 + REISERFS_SB(s)->priv_root = dentry;
32410 + d_set_d_op(dentry, &xattr_lookup_poison_ops);
32411 + if (d_really_is_positive(dentry))
32412 + reiserfs_init_priv_inode(d_inode(dentry));
32413 + } else
32414 + err = PTR_ERR(dentry);
32415 + inode_unlock(d_inode(s->s_root));
32417 + return err;
32421 + * We need to take a copy of the mount flags since things like
32422 + * SB_RDONLY don't get set until *after* we're called.
32423 + * mount_flags != mount_options
32424 + */
32425 +int reiserfs_xattr_init(struct super_block *s, int mount_flags)
32427 + int err = 0;
32428 + struct dentry *privroot = REISERFS_SB(s)->priv_root;
32430 + err = xattr_mount_check(s);
32431 + if (err)
32432 + goto error;
32434 + if (d_really_is_negative(privroot) && !(mount_flags & SB_RDONLY)) {
32435 + inode_lock(d_inode(s->s_root));
32436 + err = create_privroot(REISERFS_SB(s)->priv_root);
32437 + inode_unlock(d_inode(s->s_root));
32440 + if (d_really_is_positive(privroot)) {
32441 + inode_lock(d_inode(privroot));
32442 + if (!REISERFS_SB(s)->xattr_root) {
32443 + struct dentry *dentry;
32445 + dentry = lookup_one_len(XAROOT_NAME, privroot,
32446 + strlen(XAROOT_NAME));
32447 + if (!IS_ERR(dentry))
32448 + REISERFS_SB(s)->xattr_root = dentry;
32449 + else
32450 + err = PTR_ERR(dentry);
32452 + inode_unlock(d_inode(privroot));
32455 +error:
32456 + if (err) {
32457 + clear_bit(REISERFS_XATTRS_USER, &REISERFS_SB(s)->s_mount_opt);
32458 + clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt);
32461 + /* The super_block SB_POSIXACL must mirror the (no)acl mount option. */
32462 + if (reiserfs_posixacl(s))
32463 + s->s_flags |= SB_POSIXACL;
32464 + else
32465 + s->s_flags &= ~SB_POSIXACL;
32467 + return err;
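# The code above stores each extended attribute as a small private file whose
# first eight bytes are a header (magic number plus a hash of the value),
# followed by the raw value. A minimal userspace sketch of that layout, with
# fake_csum() as a placeholder for the kernel's endian-dependent csum_partial():

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define REISERFS_XATTR_MAGIC 0x52465841    /* "RFXA", as in the uapi header */

struct xattr_header {                      /* mirrors reiserfs_xattr_header */
        uint32_t h_magic;                  /* stored little-endian on disk */
        uint32_t h_hash;                   /* hash of the value that follows */
};

static uint32_t fake_csum(const void *buf, size_t len)
{
        /* placeholder only; the kernel uses csum_partial(msg, len, 0) */
        const uint8_t *p = buf;
        uint32_t sum = 0;

        while (len--)
                sum += *p++;
        return sum;
}

int main(void)
{
        const char value[] = "demo-value";
        size_t vlen = sizeof(value) - 1;
        struct xattr_header hdr = {
                .h_magic = REISERFS_XATTR_MAGIC,
                .h_hash  = fake_csum(value, vlen),
        };
        unsigned char *file = malloc(sizeof(hdr) + vlen);

        if (!file)
                return 1;
        memcpy(file, &hdr, sizeof(hdr));           /* header at offset 0 */
        memcpy(file + sizeof(hdr), value, vlen);   /* value right after it */
        printf("xattr file: %zu header bytes + %zu value bytes\n",
               sizeof(hdr), vlen);
        free(file);
        return 0;
}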
32469 diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
32470 new file mode 100644
32471 index 000000000000..5868a4e990e3
32472 --- /dev/null
32473 +++ b/fs/reiserfs/xattr.h
32474 @@ -0,0 +1,117 @@
32475 +/* SPDX-License-Identifier: GPL-2.0 */
32476 +#include <linux/reiserfs_xattr.h>
32477 +#include <linux/init.h>
32478 +#include <linux/list.h>
32479 +#include <linux/rwsem.h>
32480 +#include <linux/xattr.h>
32482 +struct inode;
32483 +struct dentry;
32484 +struct iattr;
32485 +struct super_block;
32487 +int reiserfs_xattr_register_handlers(void) __init;
32488 +void reiserfs_xattr_unregister_handlers(void);
32489 +int reiserfs_xattr_init(struct super_block *sb, int mount_flags);
32490 +int reiserfs_lookup_privroot(struct super_block *sb);
32491 +int reiserfs_delete_xattrs(struct inode *inode);
32492 +int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs);
32493 +int reiserfs_permission(struct mnt_idmap *idmap,
32494 + struct inode *inode, int mask);
32496 +#ifdef CONFIG_REISERFS_FS_XATTR
32497 +#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
32498 +ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
32500 +int reiserfs_xattr_get(struct inode *, const char *, void *, size_t);
32501 +int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int);
32502 +int reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *,
32503 + struct inode *, const char *, const void *,
32504 + size_t, int);
32506 +extern const struct xattr_handler reiserfs_xattr_user_handler;
32507 +extern const struct xattr_handler reiserfs_xattr_trusted_handler;
32508 +extern const struct xattr_handler reiserfs_xattr_security_handler;
32509 +#ifdef CONFIG_REISERFS_FS_SECURITY
32510 +int reiserfs_security_init(struct inode *dir, struct inode *inode,
32511 + const struct qstr *qstr,
32512 + struct reiserfs_security_handle *sec);
32513 +int reiserfs_security_write(struct reiserfs_transaction_handle *th,
32514 + struct inode *inode,
32515 + struct reiserfs_security_handle *sec);
32516 +void reiserfs_security_free(struct reiserfs_security_handle *sec);
32517 +#endif
32519 +static inline int reiserfs_xattrs_initialized(struct super_block *sb)
32521 + return REISERFS_SB(sb)->priv_root && REISERFS_SB(sb)->xattr_root;
32524 +#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
32525 +static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size)
32527 + loff_t ret = 0;
32528 + if (reiserfs_file_data_log(inode)) {
32529 + ret = _ROUND_UP(xattr_size(size), inode->i_sb->s_blocksize);
32530 + ret >>= inode->i_sb->s_blocksize_bits;
32532 + return ret;
32536 + * We may have to create up to 3 objects: xattr root, xattr dir, xattr file.
32537 + * Let's try to be smart about it.
32538 + * xattr root: We cache it. If it's not cached, we may need to create it.
32539 + * xattr dir: If anything has been loaded for this inode, we can set a flag
32540 + * saying so.
32541 + * xattr file: Since we don't cache xattrs, we can't tell. We always include
32542 + * blocks for it.
32544 + * However, since root and dir can be created between calls - YOU MUST SAVE
32545 + * THIS VALUE.
32546 + */
32547 +static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode)
32549 + size_t nblocks = JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
32551 + if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) {
32552 + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
32553 + if (d_really_is_negative(REISERFS_SB(inode->i_sb)->xattr_root))
32554 + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
32557 + return nblocks;
32560 +static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
32562 + init_rwsem(&REISERFS_I(inode)->i_xattr_sem);
32565 +#else
32567 +#define reiserfs_listxattr NULL
32569 +static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
32572 +#endif /* CONFIG_REISERFS_FS_XATTR */
32574 +#ifndef CONFIG_REISERFS_FS_SECURITY
32575 +static inline int reiserfs_security_init(struct inode *dir,
32576 + struct inode *inode,
32577 + const struct qstr *qstr,
32578 + struct reiserfs_security_handle *sec)
32580 + return 0;
32582 +static inline int
32583 +reiserfs_security_write(struct reiserfs_transaction_handle *th,
32584 + struct inode *inode,
32585 + struct reiserfs_security_handle *sec)
32587 + return 0;
32589 +static inline void reiserfs_security_free(struct reiserfs_security_handle *sec)
32591 +#endif
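# reiserfs_xattr_nblocks() above is just a round-up of the header-plus-value
# size to whole filesystem blocks. A small sketch of that arithmetic with
# hypothetical sizes (the kernel only charges these blocks when data
# journaling is enabled for the inode):

#include <stdio.h>

#define XATTR_HEADER_SIZE 8     /* sizeof(struct reiserfs_xattr_header) */

static unsigned long xattr_nblocks(unsigned long value_size,
                                   unsigned long blocksize)
{
        unsigned long total = value_size + XATTR_HEADER_SIZE;

        /* _ROUND_UP(total, blocksize) >> blocksize_bits, as a ceil-divide */
        return (total + blocksize - 1) / blocksize;
}

int main(void)
{
        /* a 5000-byte value plus the header needs two 4 KiB blocks */
        printf("%lu\n", xattr_nblocks(5000, 4096));
        return 0;
}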
32592 diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
32593 new file mode 100644
32594 index 000000000000..064264992b49
32595 --- /dev/null
32596 +++ b/fs/reiserfs/xattr_acl.c
32597 @@ -0,0 +1,411 @@
32598 +// SPDX-License-Identifier: GPL-2.0
32599 +#include <linux/capability.h>
32600 +#include <linux/fs.h>
32601 +#include <linux/posix_acl.h>
32602 +#include "reiserfs.h"
32603 +#include <linux/errno.h>
32604 +#include <linux/pagemap.h>
32605 +#include <linux/xattr.h>
32606 +#include <linux/slab.h>
32607 +#include <linux/posix_acl_xattr.h>
32608 +#include "xattr.h"
32609 +#include "acl.h"
32610 +#include <linux/uaccess.h>
32612 +static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
32613 + struct inode *inode, int type,
32614 + struct posix_acl *acl);
32617 +int
32618 +reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
32619 + struct posix_acl *acl, int type)
32621 + int error, error2;
32622 + struct reiserfs_transaction_handle th;
32623 + size_t jcreate_blocks;
32624 + int size = acl ? posix_acl_xattr_size(acl->a_count) : 0;
32625 + int update_mode = 0;
32626 + struct inode *inode = d_inode(dentry);
32627 + umode_t mode = inode->i_mode;
32629 + /*
32630 + * Pessimism: We can't assume that anything from the xattr root up
32631 + * has been created.
32632 + */
32634 + jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) +
32635 + reiserfs_xattr_nblocks(inode, size) * 2;
32637 + reiserfs_write_lock(inode->i_sb);
32638 + error = journal_begin(&th, inode->i_sb, jcreate_blocks);
32639 + reiserfs_write_unlock(inode->i_sb);
32640 + if (error == 0) {
32641 + if (type == ACL_TYPE_ACCESS && acl) {
32642 + error = posix_acl_update_mode(&nop_mnt_idmap, inode,
32643 + &mode, &acl);
32644 + if (error)
32645 + goto unlock;
32646 + update_mode = 1;
32648 + error = __reiserfs_set_acl(&th, inode, type, acl);
32649 + if (!error && update_mode)
32650 + inode->i_mode = mode;
32651 +unlock:
32652 + reiserfs_write_lock(inode->i_sb);
32653 + error2 = journal_end(&th);
32654 + reiserfs_write_unlock(inode->i_sb);
32655 + if (error2)
32656 + error = error2;
32659 + return error;
32663 + * Convert from filesystem to in-memory representation.
32664 + */
32665 +static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size)
32667 + const char *end = (char *)value + size;
32668 + int n, count;
32669 + struct posix_acl *acl;
32671 + if (!value)
32672 + return NULL;
32673 + if (size < sizeof(reiserfs_acl_header))
32674 + return ERR_PTR(-EINVAL);
32675 + if (((reiserfs_acl_header *) value)->a_version !=
32676 + cpu_to_le32(REISERFS_ACL_VERSION))
32677 + return ERR_PTR(-EINVAL);
32678 + value = (char *)value + sizeof(reiserfs_acl_header);
32679 + count = reiserfs_acl_count(size);
32680 + if (count < 0)
32681 + return ERR_PTR(-EINVAL);
32682 + if (count == 0)
32683 + return NULL;
32684 + acl = posix_acl_alloc(count, GFP_NOFS);
32685 + if (!acl)
32686 + return ERR_PTR(-ENOMEM);
32687 + for (n = 0; n < count; n++) {
32688 + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value;
32689 + if ((char *)value + sizeof(reiserfs_acl_entry_short) > end)
32690 + goto fail;
32691 + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
32692 + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
32693 + switch (acl->a_entries[n].e_tag) {
32694 + case ACL_USER_OBJ:
32695 + case ACL_GROUP_OBJ:
32696 + case ACL_MASK:
32697 + case ACL_OTHER:
32698 + value = (char *)value +
32699 + sizeof(reiserfs_acl_entry_short);
32700 + break;
32702 + case ACL_USER:
32703 + value = (char *)value + sizeof(reiserfs_acl_entry);
32704 + if ((char *)value > end)
32705 + goto fail;
32706 + acl->a_entries[n].e_uid =
32707 + make_kuid(&init_user_ns,
32708 + le32_to_cpu(entry->e_id));
32709 + break;
32710 + case ACL_GROUP:
32711 + value = (char *)value + sizeof(reiserfs_acl_entry);
32712 + if ((char *)value > end)
32713 + goto fail;
32714 + acl->a_entries[n].e_gid =
32715 + make_kgid(&init_user_ns,
32716 + le32_to_cpu(entry->e_id));
32717 + break;
32719 + default:
32720 + goto fail;
32723 + if (value != end)
32724 + goto fail;
32725 + return acl;
32727 +fail:
32728 + posix_acl_release(acl);
32729 + return ERR_PTR(-EINVAL);
32733 + * Convert from in-memory to filesystem representation.
32734 + */
32735 +static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
32737 + reiserfs_acl_header *ext_acl;
32738 + char *e;
32739 + int n;
32741 + *size = reiserfs_acl_size(acl->a_count);
32742 + ext_acl = kmalloc(sizeof(reiserfs_acl_header) +
32743 + acl->a_count *
32744 + sizeof(reiserfs_acl_entry),
32745 + GFP_NOFS);
32746 + if (!ext_acl)
32747 + return ERR_PTR(-ENOMEM);
32748 + ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION);
32749 + e = (char *)ext_acl + sizeof(reiserfs_acl_header);
32750 + for (n = 0; n < acl->a_count; n++) {
32751 + const struct posix_acl_entry *acl_e = &acl->a_entries[n];
32752 + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e;
32753 + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
32754 + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
32755 + switch (acl->a_entries[n].e_tag) {
32756 + case ACL_USER:
32757 + entry->e_id = cpu_to_le32(
32758 + from_kuid(&init_user_ns, acl_e->e_uid));
32759 + e += sizeof(reiserfs_acl_entry);
32760 + break;
32761 + case ACL_GROUP:
32762 + entry->e_id = cpu_to_le32(
32763 + from_kgid(&init_user_ns, acl_e->e_gid));
32764 + e += sizeof(reiserfs_acl_entry);
32765 + break;
32767 + case ACL_USER_OBJ:
32768 + case ACL_GROUP_OBJ:
32769 + case ACL_MASK:
32770 + case ACL_OTHER:
32771 + e += sizeof(reiserfs_acl_entry_short);
32772 + break;
32774 + default:
32775 + goto fail;
32778 + return (char *)ext_acl;
32780 +fail:
32781 + kfree(ext_acl);
32782 + return ERR_PTR(-EINVAL);
32786 + * Inode operation get_posix_acl().
32788 + * inode->i_mutex: down
32789 + * BKL held [before 2.5.x]
32790 + */
32791 +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu)
32793 + char *name, *value;
32794 + struct posix_acl *acl;
32795 + int size;
32796 + int retval;
32798 + if (rcu)
32799 + return ERR_PTR(-ECHILD);
32801 + switch (type) {
32802 + case ACL_TYPE_ACCESS:
32803 + name = XATTR_NAME_POSIX_ACL_ACCESS;
32804 + break;
32805 + case ACL_TYPE_DEFAULT:
32806 + name = XATTR_NAME_POSIX_ACL_DEFAULT;
32807 + break;
32808 + default:
32809 + BUG();
32812 + size = reiserfs_xattr_get(inode, name, NULL, 0);
32813 + if (size < 0) {
32814 + if (size == -ENODATA || size == -ENOSYS)
32815 + return NULL;
32816 + return ERR_PTR(size);
32819 + value = kmalloc(size, GFP_NOFS);
32820 + if (!value)
32821 + return ERR_PTR(-ENOMEM);
32823 + retval = reiserfs_xattr_get(inode, name, value, size);
32824 + if (retval == -ENODATA || retval == -ENOSYS) {
32825 + /*
32826 + * This shouldn't actually happen as it should have
32827 + * been caught above, but just in case.
32828 + */
32829 + acl = NULL;
32830 + } else if (retval < 0) {
32831 + acl = ERR_PTR(retval);
32832 + } else {
32833 + acl = reiserfs_posix_acl_from_disk(value, retval);
32836 + kfree(value);
32837 + return acl;
32841 + * Inode operation set_posix_acl().
32843 + * inode->i_mutex: down
32844 + * BKL held [before 2.5.x]
32845 + */
32846 +static int
32847 +__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
32848 + int type, struct posix_acl *acl)
32850 + char *name;
32851 + void *value = NULL;
32852 + size_t size = 0;
32853 + int error;
32855 + switch (type) {
32856 + case ACL_TYPE_ACCESS:
32857 + name = XATTR_NAME_POSIX_ACL_ACCESS;
32858 + break;
32859 + case ACL_TYPE_DEFAULT:
32860 + name = XATTR_NAME_POSIX_ACL_DEFAULT;
32861 + if (!S_ISDIR(inode->i_mode))
32862 + return acl ? -EACCES : 0;
32863 + break;
32864 + default:
32865 + return -EINVAL;
32868 + if (acl) {
32869 + value = reiserfs_posix_acl_to_disk(acl, &size);
32870 + if (IS_ERR(value))
32871 + return (int)PTR_ERR(value);
32874 + error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0);
32876 + /*
32877 + * Ensure that the inode gets dirtied if we're only using
32878 + * the mode bits and an old ACL didn't exist. We don't need
32879 + * to check if the inode is hashed here since we won't get
32880 + * called by reiserfs_inherit_default_acl().
32881 + */
32882 + if (error == -ENODATA) {
32883 + error = 0;
32884 + if (type == ACL_TYPE_ACCESS) {
32885 + inode_set_ctime_current(inode);
32886 + mark_inode_dirty(inode);
32890 + kfree(value);
32892 + if (!error)
32893 + set_cached_acl(inode, type, acl);
32895 + return error;
32899 + * dir->i_mutex: locked,
32900 + * inode is new and not released into the wild yet
32901 + */
32902 +int
32903 +reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
32904 + struct inode *dir, struct dentry *dentry,
32905 + struct inode *inode)
32907 + struct posix_acl *default_acl, *acl;
32908 + int err = 0;
32910 + /* ACLs only get applied to files and directories */
32911 + if (S_ISLNK(inode->i_mode))
32912 + return 0;
32914 + /*
32915 + * ACLs can only be used on "new" objects, so if it's an old object
32916 + * there is nothing to inherit from
32917 + */
32918 + if (get_inode_sd_version(dir) == STAT_DATA_V1)
32919 + goto apply_umask;
32921 + /*
32922 + * Don't apply ACLs to objects in the .reiserfs_priv tree. This
32923 + * would be useless since permissions are ignored, and a pain because
32924 + * it introduces locking cycles.
32925 + */
32926 + if (IS_PRIVATE(inode))
32927 + goto apply_umask;
32929 + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
32930 + if (err)
32931 + return err;
32933 + if (default_acl) {
32934 + err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
32935 + default_acl);
32936 + posix_acl_release(default_acl);
32938 + if (acl) {
32939 + if (!err)
32940 + err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS,
32941 + acl);
32942 + posix_acl_release(acl);
32945 + return err;
32947 +apply_umask:
32948 + /* no ACL, apply umask */
32949 + inode->i_mode &= ~current_umask();
32950 + return err;
32953 +/* This is used to cache the default acl before a new object is created.
32954 + * The biggest reason for this is to get an idea of how many blocks will
32955 + * actually be required for the create operation if we must inherit an ACL.
32956 + * An ACL write can add up to 3 object creations and an additional file write
32957 + * so we'd prefer not to reserve that many blocks in the journal if we can.
32958 + * It also has the advantage of not loading the ACL with a transaction open;
32959 + * this may seem silly, but if the owner of the directory is doing the
32960 + * creation, the ACL may not be loaded since the permissions wouldn't require
32961 + * it.
32962 + * We return the number of blocks required for the transaction.
32963 + */
32964 +int reiserfs_cache_default_acl(struct inode *inode)
32966 + struct posix_acl *acl;
32967 + int nblocks = 0;
32969 + if (IS_PRIVATE(inode))
32970 + return 0;
32972 + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
32974 + if (acl && !IS_ERR(acl)) {
32975 + int size = reiserfs_acl_size(acl->a_count);
32977 + /* Other xattrs can be created during inode creation. We don't
32978 + * want to claim too many blocks, so we check to see if we
32979 + * need to create the tree to the xattrs, and then we
32980 + * just want two files. */
32981 + nblocks = reiserfs_xattr_jcreate_nblocks(inode);
32982 + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
32984 + REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
32986 + /* We need to account for writes + bitmaps for two files */
32987 + nblocks += reiserfs_xattr_nblocks(inode, size) * 4;
32988 + posix_acl_release(acl);
32991 + return nblocks;
32995 + * Called under i_mutex
32996 + */
32997 +int reiserfs_acl_chmod(struct dentry *dentry)
32999 + struct inode *inode = d_inode(dentry);
33001 + if (IS_PRIVATE(inode))
33002 + return 0;
33003 + if (get_inode_sd_version(inode) == STAT_DATA_V1 ||
33004 + !reiserfs_posixacl(inode->i_sb))
33005 + return 0;
33007 + return posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode);
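# reiserfs_posix_acl_from_disk()/..._to_disk() above use a variable-length
# record: a 4-byte version header, then one entry per ACE, where only
# ACL_USER and ACL_GROUP entries carry the extra 4-byte e_id. A sketch of
# that size calculation, using the standard POSIX ACL tag values:

#include <stdint.h>
#include <stdio.h>

#define ACL_USER_OBJ  0x01
#define ACL_USER      0x02
#define ACL_GROUP_OBJ 0x04
#define ACL_GROUP     0x08
#define ACL_MASK      0x10
#define ACL_OTHER     0x20

struct acl_entry_short { uint16_t e_tag, e_perm; };                /* 4 bytes */
struct acl_entry       { uint16_t e_tag, e_perm; uint32_t e_id; }; /* 8 bytes */

static size_t entry_size(uint16_t tag)
{
        /* only named user/group entries store an id, as in the hunk above */
        return (tag == ACL_USER || tag == ACL_GROUP)
                ? sizeof(struct acl_entry)
                : sizeof(struct acl_entry_short);
}

int main(void)
{
        const uint16_t tags[] = { ACL_USER_OBJ, ACL_USER, ACL_GROUP_OBJ,
                                  ACL_MASK, ACL_OTHER };
        size_t size = 4;        /* reiserfs_acl_header: one __le32 version */
        size_t i;

        for (i = 0; i < sizeof(tags) / sizeof(tags[0]); i++)
                size += entry_size(tags[i]);
        printf("5-entry access ACL occupies %zu bytes on disk\n", size);
        return 0;
}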
33009 diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
33010 new file mode 100644
33011 index 000000000000..078dd8cc312f
33012 --- /dev/null
33013 +++ b/fs/reiserfs/xattr_security.c
33014 @@ -0,0 +1,127 @@
33015 +// SPDX-License-Identifier: GPL-2.0
33016 +#include "reiserfs.h"
33017 +#include <linux/errno.h>
33018 +#include <linux/fs.h>
33019 +#include <linux/pagemap.h>
33020 +#include <linux/xattr.h>
33021 +#include <linux/slab.h>
33022 +#include "xattr.h"
33023 +#include <linux/security.h>
33024 +#include <linux/uaccess.h>
33026 +static int
33027 +security_get(const struct xattr_handler *handler, struct dentry *unused,
33028 + struct inode *inode, const char *name, void *buffer, size_t size)
33030 + if (IS_PRIVATE(inode))
33031 + return -EPERM;
33033 + return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
33034 + buffer, size);
33037 +static int
33038 +security_set(const struct xattr_handler *handler,
33039 + struct mnt_idmap *idmap, struct dentry *unused,
33040 + struct inode *inode, const char *name, const void *buffer,
33041 + size_t size, int flags)
33043 + if (IS_PRIVATE(inode))
33044 + return -EPERM;
33046 + return reiserfs_xattr_set(inode,
33047 + xattr_full_name(handler, name),
33048 + buffer, size, flags);
33051 +static bool security_list(struct dentry *dentry)
33053 + return !IS_PRIVATE(d_inode(dentry));
33056 +static int
33057 +reiserfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
33058 + void *fs_info)
33060 + struct reiserfs_security_handle *sec = fs_info;
33062 + sec->value = kmemdup(xattr_array->value, xattr_array->value_len,
33063 + GFP_KERNEL);
33064 + if (!sec->value)
33065 + return -ENOMEM;
33067 + sec->name = xattr_array->name;
33068 + sec->length = xattr_array->value_len;
33069 + return 0;
33072 +/* Initializes the security context for a new inode and returns the number
33073 + * of blocks needed for the transaction. If successful, reiserfs_security
33074 + * must be released using reiserfs_security_free when the caller is done. */
33075 +int reiserfs_security_init(struct inode *dir, struct inode *inode,
33076 + const struct qstr *qstr,
33077 + struct reiserfs_security_handle *sec)
33079 + int blocks = 0;
33080 + int error;
33082 + sec->name = NULL;
33083 + sec->value = NULL;
33084 + sec->length = 0;
33086 + /* Don't add selinux attributes on xattrs - they'll never get used */
33087 + if (IS_PRIVATE(dir))
33088 + return 0;
33090 + error = security_inode_init_security(inode, dir, qstr,
33091 + &reiserfs_initxattrs, sec);
33092 + if (error) {
33093 + sec->name = NULL;
33094 + sec->value = NULL;
33095 + sec->length = 0;
33096 + return error;
33099 + if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
33100 + blocks = reiserfs_xattr_jcreate_nblocks(inode) +
33101 + reiserfs_xattr_nblocks(inode, sec->length);
33102 + /* We don't want to count the directories twice if we have
33103 + * a default ACL. */
33104 + REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
33106 + return blocks;
33109 +int reiserfs_security_write(struct reiserfs_transaction_handle *th,
33110 + struct inode *inode,
33111 + struct reiserfs_security_handle *sec)
33113 + char xattr_name[XATTR_NAME_MAX + 1] = XATTR_SECURITY_PREFIX;
33114 + int error;
33116 + if (XATTR_SECURITY_PREFIX_LEN + strlen(sec->name) > XATTR_NAME_MAX)
33117 + return -EINVAL;
33119 + strlcat(xattr_name, sec->name, sizeof(xattr_name));
33121 + error = reiserfs_xattr_set_handle(th, inode, xattr_name, sec->value,
33122 + sec->length, XATTR_CREATE);
33123 + if (error == -ENODATA || error == -EOPNOTSUPP)
33124 + error = 0;
33126 + return error;
33129 +void reiserfs_security_free(struct reiserfs_security_handle *sec)
33131 + kfree(sec->value);
33132 + sec->name = NULL;
33133 + sec->value = NULL;
33136 +const struct xattr_handler reiserfs_xattr_security_handler = {
33137 + .prefix = XATTR_SECURITY_PREFIX,
33138 + .get = security_get,
33139 + .set = security_set,
33140 + .list = security_list,
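# reiserfs_security_write() above prepends "security." to the LSM-supplied
# suffix and rejects names longer than XATTR_NAME_MAX. The same check in a
# freestanding userspace sketch:

#include <stdio.h>
#include <string.h>

#define XATTR_NAME_MAX 255
#define XATTR_SECURITY_PREFIX "security."

static int build_name(char *out, size_t outlen, const char *suffix)
{
        if (strlen(XATTR_SECURITY_PREFIX) + strlen(suffix) > XATTR_NAME_MAX)
                return -1;      /* the kernel returns -EINVAL here */
        snprintf(out, outlen, XATTR_SECURITY_PREFIX "%s", suffix);
        return 0;
}

int main(void)
{
        char name[XATTR_NAME_MAX + 1];

        if (build_name(name, sizeof(name), "selinux") == 0)
                printf("%s\n", name);   /* prints: security.selinux */
        return 0;
}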
33142 diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
33143 new file mode 100644
33144 index 000000000000..0c0c74d8db0e
33145 --- /dev/null
33146 +++ b/fs/reiserfs/xattr_trusted.c
33147 @@ -0,0 +1,46 @@
33148 +// SPDX-License-Identifier: GPL-2.0
33149 +#include "reiserfs.h"
33150 +#include <linux/capability.h>
33151 +#include <linux/errno.h>
33152 +#include <linux/fs.h>
33153 +#include <linux/pagemap.h>
33154 +#include <linux/xattr.h>
33155 +#include "xattr.h"
33156 +#include <linux/uaccess.h>
33158 +static int
33159 +trusted_get(const struct xattr_handler *handler, struct dentry *unused,
33160 + struct inode *inode, const char *name, void *buffer, size_t size)
33162 + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
33163 + return -EPERM;
33165 + return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
33166 + buffer, size);
33169 +static int
33170 +trusted_set(const struct xattr_handler *handler,
33171 + struct mnt_idmap *idmap, struct dentry *unused,
33172 + struct inode *inode, const char *name, const void *buffer,
33173 + size_t size, int flags)
33175 + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
33176 + return -EPERM;
33178 + return reiserfs_xattr_set(inode,
33179 + xattr_full_name(handler, name),
33180 + buffer, size, flags);
33183 +static bool trusted_list(struct dentry *dentry)
33185 + return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry));
33188 +const struct xattr_handler reiserfs_xattr_trusted_handler = {
33189 + .prefix = XATTR_TRUSTED_PREFIX,
33190 + .get = trusted_get,
33191 + .set = trusted_set,
33192 + .list = trusted_list,
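# The trusted_* handlers above gate the "trusted." namespace on
# CAP_SYS_ADMIN. Exercising that from userspace with the standard xattr
# syscalls; the path is hypothetical and assumes a mounted reiserfs:

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
        const char *path = "/mnt/reiserfs/file";    /* hypothetical mount */

        if (setxattr(path, "trusted.example", "1", 1, 0) != 0)
                perror("setxattr");     /* EPERM without CAP_SYS_ADMIN */
        return 0;
}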
33194 diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
33195 new file mode 100644
33196 index 000000000000..88195181e1d7
33197 --- /dev/null
33198 +++ b/fs/reiserfs/xattr_user.c
33199 @@ -0,0 +1,43 @@
33200 +// SPDX-License-Identifier: GPL-2.0
33201 +#include "reiserfs.h"
33202 +#include <linux/errno.h>
33203 +#include <linux/fs.h>
33204 +#include <linux/pagemap.h>
33205 +#include <linux/xattr.h>
33206 +#include "xattr.h"
33207 +#include <linux/uaccess.h>
33209 +static int
33210 +user_get(const struct xattr_handler *handler, struct dentry *unused,
33211 + struct inode *inode, const char *name, void *buffer, size_t size)
33213 + if (!reiserfs_xattrs_user(inode->i_sb))
33214 + return -EOPNOTSUPP;
33215 + return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
33216 + buffer, size);
33219 +static int
33220 +user_set(const struct xattr_handler *handler, struct mnt_idmap *idmap,
33221 + struct dentry *unused,
33222 + struct inode *inode, const char *name, const void *buffer,
33223 + size_t size, int flags)
33225 + if (!reiserfs_xattrs_user(inode->i_sb))
33226 + return -EOPNOTSUPP;
33227 + return reiserfs_xattr_set(inode,
33228 + xattr_full_name(handler, name),
33229 + buffer, size, flags);
33232 +static bool user_list(struct dentry *dentry)
33234 + return reiserfs_xattrs_user(dentry->d_sb);
33237 +const struct xattr_handler reiserfs_xattr_user_handler = {
33238 + .prefix = XATTR_USER_PREFIX,
33239 + .get = user_get,
33240 + .set = user_set,
33241 + .list = user_list,
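# Listing attributes from userspace ends up in reiserfs_listxattr() further
# up: the kernel readdirs the hidden per-inode directory and filters each
# name through the handler list above. A sketch, again with a hypothetical
# path:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
        const char *path = "/mnt/reiserfs/file";    /* hypothetical mount */
        char buf[1024];
        ssize_t off, len = listxattr(path, buf, sizeof(buf));

        if (len < 0) {
                perror("listxattr");
                return 1;
        }
        /* the buffer holds NUL-separated names, e.g. "user.comment" */
        for (off = 0; off < len; off += strlen(buf + off) + 1)
                puts(buf + off);
        return 0;
}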
33243 diff --git a/include/uapi/linux/reiserfs_fs.h b/include/uapi/linux/reiserfs_fs.h
33244 new file mode 100644
33245 index 000000000000..5bb921409f2b
33246 --- /dev/null
33247 +++ b/include/uapi/linux/reiserfs_fs.h
33248 @@ -0,0 +1,27 @@
33249 +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
33251 + * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for licensing and copyright details
33252 + */
33253 +#ifndef _LINUX_REISER_FS_H
33254 +#define _LINUX_REISER_FS_H
33256 +#include <linux/types.h>
33257 +#include <linux/magic.h>
33260 + * include/uapi/linux/reiserfs_fs.h
33262 + * Reiser File System constants and structures
33264 + */
33266 +/* ioctl's command */
33267 +#define REISERFS_IOC_UNPACK _IOW(0xCD,1,long)
33268 +/* define the following flags to be the same as in ext2, so that chattr(1),
33269 + lsattr(1) will work with us. */
33270 +#define REISERFS_IOC_GETFLAGS FS_IOC_GETFLAGS
33271 +#define REISERFS_IOC_SETFLAGS FS_IOC_SETFLAGS
33272 +#define REISERFS_IOC_GETVERSION FS_IOC_GETVERSION
33273 +#define REISERFS_IOC_SETVERSION FS_IOC_SETVERSION
33275 +#endif /* _LINUX_REISER_FS_H */
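# REISERFS_IOC_UNPACK above is the one filesystem-specific ioctl: it unpacks
# a tail-packed file so its blocks can be mapped directly (bootloaders such
# as LILO historically relied on this). A hedged usage sketch; the argument
# value and path are illustrative, and the restored uapi header is assumed
# to be installed:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/reiserfs_fs.h>

int main(void)
{
        int fd = open("/mnt/reiserfs/file", O_RDONLY);  /* hypothetical */

        if (fd < 0 || ioctl(fd, REISERFS_IOC_UNPACK, 1L) != 0)
                perror("REISERFS_IOC_UNPACK");
        if (fd >= 0)
                close(fd);
        return 0;
}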
33276 diff --git a/include/uapi/linux/reiserfs_xattr.h b/include/uapi/linux/reiserfs_xattr.h
33277 new file mode 100644
33278 index 000000000000..503ad018ce5b
33279 --- /dev/null
33280 +++ b/include/uapi/linux/reiserfs_xattr.h
33281 @@ -0,0 +1,25 @@
33282 +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
33284 + File: linux/reiserfs_xattr.h
33287 +#ifndef _LINUX_REISERFS_XATTR_H
33288 +#define _LINUX_REISERFS_XATTR_H
33290 +#include <linux/types.h>
33292 +/* Magic value in header */
33293 +#define REISERFS_XATTR_MAGIC 0x52465841 /* "RFXA" */
33295 +struct reiserfs_xattr_header {
33296 + __le32 h_magic; /* magic number for identification */
33297 + __le32 h_hash; /* hash of the value */
33300 +struct reiserfs_security_handle {
33301 + const char *name;
33302 + void *value;
33303 + __kernel_size_t length;
33306 +#endif /* _LINUX_REISERFS_XATTR_H */
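# The header above fixes an 8-byte, two-field layout. A compile-time check
# of that layout, mirroring the uapi struct with plain stdint types:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct reiserfs_xattr_header {  /* mirrors the uapi definition above */
        uint32_t h_magic;
        uint32_t h_hash;
};

static_assert(sizeof(struct reiserfs_xattr_header) == 8,
              "header must stay 8 bytes");
static_assert(offsetof(struct reiserfs_xattr_header, h_hash) == 4,
              "hash must sit at offset 4");

int main(void)
{
        return 0;
}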
33307 diff --git a/scripts/selinux/mdp/mdp.c b/scripts/selinux/mdp/mdp.c
33308 index ea7fbe595971..52365921c043 100644
33309 --- a/scripts/selinux/mdp/mdp.c
33310 +++ b/scripts/selinux/mdp/mdp.c
33311 @@ -167,6 +167,9 @@ int main(int argc, char *argv[])
33312 #ifdef CONFIG_JFS_SECURITY
33313 FS_USE("xattr", "jfs");
33314 #endif
33315 +#ifdef CONFIG_REISERFS_FS_SECURITY
33316 + FS_USE("xattr", "reiserfs");
33317 +#endif
33318 #ifdef CONFIG_JFFS2_FS_SECURITY
33319 FS_USE("xattr", "jffs2");
33320 #endif
33321 diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h
33322 index f37614cc2c1b..e7da92489167 100644
33323 --- a/tools/objtool/noreturns.h
33324 +++ b/tools/objtool/noreturns.h
33325 @@ -11,6 +11,7 @@ NORETURN(__ia32_sys_exit)
33326 NORETURN(__ia32_sys_exit_group)
33327 NORETURN(__kunit_abort)
33328 NORETURN(__module_put_and_kthread_exit)
33329 +NORETURN(__reiserfs_panic)
33330 NORETURN(__stack_chk_fail)
33331 NORETURN(__tdx_hypercall_failed)
33332 NORETURN(__ubsan_handle_builtin_unreachable)
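# __reiserfs_panic is listed in noreturns.h so objtool's stack validation
# knows control never comes back from it. An illustrative (non-kernel)
# example of why the annotation matters for a panic-style helper:

#include <stdio.h>
#include <stdlib.h>

__attribute__((noreturn))
static void my_panic(const char *msg)
{
        fprintf(stderr, "panic: %s\n", msg);
        abort();        /* never returns, matching the annotation */
}

static int checked_div(int a, int b)
{
        if (b == 0)
                my_panic("division by zero");
        /* no "missing return" warning: the compiler knows my_panic exits */
        return a / b;
}

int main(void)
{
        printf("%d\n", checked_div(10, 2));
        return 0;
}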
33333 diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c
33334 index 8eb6aa606a0d..c773334bbcc9 100644
33335 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c
33336 +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c
33337 @@ -27,7 +27,7 @@ static const char *const known_fs[] = {
33338 "ipathfs", "iso9660", "jffs2", "jfs", "minix", "mqueue", "msdos",
33339 "nfs", "nfs4", "nfsd", "nilfs2", "nsfs", "ntfs", "ntfs3", "ocfs2",
33340 "ocfs2_dlmfs", "ocxlflash", "omfs", "openpromfs", "overlay", "pipefs",
33341 - "proc", "pstore", "pvfs2", "qnx4", "qnx6", "ramfs",
33342 + "proc", "pstore", "pvfs2", "qnx4", "qnx6", "ramfs", "reiserfs",
33343 "resctrl", "romfs", "rootfs", "rpc_pipefs", "s390_hypfs", "secretmem",
33344 "securityfs", "selinuxfs", "smackfs", "smb3", "sockfs", "spufs",
33345 "squashfs", "sysfs", "sysv", "tmpfs", "tracefs", "ubifs", "udf",