* improve similar pkg suggestions and share code in core-functions for all scripts...
[t2sde.git] / package / kernel / linux / restore-reiserfs.patch
blobe23228de16a71f7c39a754e8b219a4362e8b3768
1 # --- T2-COPYRIGHT-BEGIN ---
2 # t2/package/*/linux/restore-reiserfs.patch
3 # Copyright (C) 2025 The T2 SDE Project
4 # SPDX-License-Identifier: GPL-2.0 or patched project license
5 # --- T2-COPYRIGHT-END ---
7 diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
8 index 9ab2a3d6f2b4..92bffcc6747a 100644
9 --- a/Documentation/filesystems/porting.rst
10 +++ b/Documentation/filesystems/porting.rst
11 @@ -177,7 +177,7 @@ settles down a bit.
12 **mandatory**
14 s_export_op is now required for exporting a filesystem.
15 -isofs, ext2, ext3, fat
16 +isofs, ext2, ext3, reiserfs, fat
17 can be used as examples of very different filesystems.
19 ---
20 diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
21 index 243f1f1b554a..e4be1378ba26 100644
22 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
23 +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
24 @@ -375,7 +375,7 @@ Code Seq# Include File Comments
25 0xCB 00-1F CBM serial IEC bus in development:
26 <mailto:michael.klein@puffin.lb.shuttle.de>
27 0xCC 00-0F drivers/misc/ibmvmc.h pseries VMC driver
28 -0xCD 01 linux/reiserfs_fs.h Dead since 6.13
29 +0xCD 01 linux/reiserfs_fs.h
30 0xCE 01-02 uapi/linux/cxl_mem.h Compute Express Link Memory Devices
31 0xCF 02 fs/smb/client/cifs_ioctl.h
32 0xDB 00-0F drivers/char/mwave/mwavepub.h
33 diff --git a/MAINTAINERS b/MAINTAINERS
34 index f90a5a415218..2b3797676c9e 100644
35 --- a/MAINTAINERS
36 +++ b/MAINTAINERS
37 @@ -19605,6 +19605,11 @@ F: Documentation/devicetree/bindings/regmap/
38 F: drivers/base/regmap/
39 F: include/linux/regmap.h
41 +REISERFS FILE SYSTEM
42 +L: reiserfs-devel@vger.kernel.org
43 +S: Obsolete
44 +F: fs/reiserfs/
46 REMOTE PROCESSOR (REMOTEPROC) SUBSYSTEM
47 M: Bjorn Andersson <andersson@kernel.org>
48 M: Mathieu Poirier <mathieu.poirier@linaro.org>
49 diff --git a/arch/alpha/configs/defconfig b/arch/alpha/configs/defconfig
50 index 3280bd9e6578..1816c1dc22b1 100644
51 --- a/arch/alpha/configs/defconfig
52 +++ b/arch/alpha/configs/defconfig
53 @@ -51,6 +51,7 @@ CONFIG_SERIAL_8250_CONSOLE=y
54 CONFIG_RTC_CLASS=y
55 CONFIG_RTC_DRV_CMOS=y
56 CONFIG_EXT2_FS=y
57 +CONFIG_REISERFS_FS=m
58 CONFIG_ISO9660_FS=y
59 CONFIG_MSDOS_FS=y
60 CONFIG_VFAT_FS=y
61 diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
62 index 38916ac4bce4..e1cb170c2bf0 100644
63 --- a/arch/arm/configs/pxa_defconfig
64 +++ b/arch/arm/configs/pxa_defconfig
65 @@ -583,6 +583,10 @@ CONFIG_EXT2_FS_SECURITY=y
66 CONFIG_EXT3_FS=y
67 CONFIG_EXT3_FS_POSIX_ACL=y
68 CONFIG_EXT3_FS_SECURITY=y
69 +CONFIG_REISERFS_FS=m
70 +CONFIG_REISERFS_FS_XATTR=y
71 +CONFIG_REISERFS_FS_POSIX_ACL=y
72 +CONFIG_REISERFS_FS_SECURITY=y
73 CONFIG_XFS_FS=m
74 CONFIG_AUTOFS_FS=m
75 CONFIG_FUSE_FS=m
76 diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
77 index c705247e7b5b..a70aec9a05c4 100644
78 --- a/arch/m68k/configs/amiga_defconfig
79 +++ b/arch/m68k/configs/amiga_defconfig
80 @@ -449,6 +449,7 @@ CONFIG_RTC_DRV_RP5C01=m
81 # CONFIG_IOMMU_SUPPORT is not set
82 CONFIG_DAX=m
83 CONFIG_EXT4_FS=y
84 +CONFIG_REISERFS_FS=m
85 CONFIG_JFS_FS=m
86 CONFIG_OCFS2_FS=m
87 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
88 diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
89 index 6d62b9187a58..312853f3d26a 100644
90 --- a/arch/m68k/configs/apollo_defconfig
91 +++ b/arch/m68k/configs/apollo_defconfig
92 @@ -406,6 +406,7 @@ CONFIG_RTC_DRV_GENERIC=m
93 # CONFIG_IOMMU_SUPPORT is not set
94 CONFIG_DAX=m
95 CONFIG_EXT4_FS=y
96 +CONFIG_REISERFS_FS=m
97 CONFIG_JFS_FS=m
98 CONFIG_OCFS2_FS=m
99 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
100 diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
101 index c3c644df852d..0853e4358de9 100644
102 --- a/arch/m68k/configs/atari_defconfig
103 +++ b/arch/m68k/configs/atari_defconfig
104 @@ -426,6 +426,7 @@ CONFIG_RTC_DRV_GENERIC=m
105 # CONFIG_IOMMU_SUPPORT is not set
106 CONFIG_DAX=m
107 CONFIG_EXT4_FS=y
108 +CONFIG_REISERFS_FS=m
109 CONFIG_JFS_FS=m
110 CONFIG_OCFS2_FS=m
111 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
112 diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
113 index 20261f819691..f738202d1f36 100644
114 --- a/arch/m68k/configs/bvme6000_defconfig
115 +++ b/arch/m68k/configs/bvme6000_defconfig
116 @@ -398,6 +398,7 @@ CONFIG_RTC_DRV_GENERIC=m
117 # CONFIG_IOMMU_SUPPORT is not set
118 CONFIG_DAX=m
119 CONFIG_EXT4_FS=y
120 +CONFIG_REISERFS_FS=m
121 CONFIG_JFS_FS=m
122 CONFIG_OCFS2_FS=m
123 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
124 diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
125 index ce4fe93a0f70..74f74e03ccc9 100644
126 --- a/arch/m68k/configs/hp300_defconfig
127 +++ b/arch/m68k/configs/hp300_defconfig
128 @@ -408,6 +408,7 @@ CONFIG_RTC_DRV_GENERIC=m
129 # CONFIG_IOMMU_SUPPORT is not set
130 CONFIG_DAX=m
131 CONFIG_EXT4_FS=y
132 +CONFIG_REISERFS_FS=m
133 CONFIG_JFS_FS=m
134 CONFIG_OCFS2_FS=m
135 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
136 diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
137 index 040ae75f47c3..14c8f1b374aa 100644
138 --- a/arch/m68k/configs/mac_defconfig
139 +++ b/arch/m68k/configs/mac_defconfig
140 @@ -425,6 +425,7 @@ CONFIG_RTC_DRV_GENERIC=m
141 # CONFIG_IOMMU_SUPPORT is not set
142 CONFIG_DAX=m
143 CONFIG_EXT4_FS=y
144 +CONFIG_REISERFS_FS=m
145 CONFIG_JFS_FS=m
146 CONFIG_OCFS2_FS=m
147 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
148 diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
149 index f8edc9082724..41c8112c6d0d 100644
150 --- a/arch/m68k/configs/multi_defconfig
151 +++ b/arch/m68k/configs/multi_defconfig
152 @@ -511,6 +511,7 @@ CONFIG_RTC_DRV_GENERIC=m
153 # CONFIG_IOMMU_SUPPORT is not set
154 CONFIG_DAX=m
155 CONFIG_EXT4_FS=y
156 +CONFIG_REISERFS_FS=m
157 CONFIG_JFS_FS=m
158 CONFIG_OCFS2_FS=m
159 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
160 diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
161 index 71fc71bb660e..e72d37ee90a7 100644
162 --- a/arch/m68k/configs/mvme147_defconfig
163 +++ b/arch/m68k/configs/mvme147_defconfig
164 @@ -397,6 +397,7 @@ CONFIG_RTC_DRV_GENERIC=m
165 # CONFIG_IOMMU_SUPPORT is not set
166 CONFIG_DAX=m
167 CONFIG_EXT4_FS=y
168 +CONFIG_REISERFS_FS=m
169 CONFIG_JFS_FS=m
170 CONFIG_OCFS2_FS=m
171 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
172 diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
173 index 41072e68028e..733f1fc9a50a 100644
174 --- a/arch/m68k/configs/mvme16x_defconfig
175 +++ b/arch/m68k/configs/mvme16x_defconfig
176 @@ -398,6 +398,7 @@ CONFIG_RTC_DRV_GENERIC=m
177 # CONFIG_IOMMU_SUPPORT is not set
178 CONFIG_DAX=m
179 CONFIG_EXT4_FS=y
180 +CONFIG_REISERFS_FS=m
181 CONFIG_JFS_FS=m
182 CONFIG_OCFS2_FS=m
183 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
184 diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
185 index e4c30e2b9bbb..3efe25435561 100644
186 --- a/arch/m68k/configs/q40_defconfig
187 +++ b/arch/m68k/configs/q40_defconfig
188 @@ -415,6 +415,7 @@ CONFIG_RTC_DRV_GENERIC=m
189 # CONFIG_IOMMU_SUPPORT is not set
190 CONFIG_DAX=m
191 CONFIG_EXT4_FS=y
192 +CONFIG_REISERFS_FS=m
193 CONFIG_JFS_FS=m
194 CONFIG_OCFS2_FS=m
195 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
196 diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
197 index 980843a9ea1e..1b8ea0e7acb4 100644
198 --- a/arch/m68k/configs/sun3_defconfig
199 +++ b/arch/m68k/configs/sun3_defconfig
200 @@ -396,6 +396,7 @@ CONFIG_RTC_DRV_GENERIC=m
201 # CONFIG_IOMMU_SUPPORT is not set
202 CONFIG_DAX=m
203 CONFIG_EXT4_FS=y
204 +CONFIG_REISERFS_FS=m
205 CONFIG_JFS_FS=m
206 CONFIG_OCFS2_FS=m
207 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
208 diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
209 index 38681cc6b598..5bda93f6a200 100644
210 --- a/arch/m68k/configs/sun3x_defconfig
211 +++ b/arch/m68k/configs/sun3x_defconfig
212 @@ -396,6 +396,7 @@ CONFIG_RTC_DRV_GENERIC=m
213 # CONFIG_IOMMU_SUPPORT is not set
214 CONFIG_DAX=m
215 CONFIG_EXT4_FS=y
216 +CONFIG_REISERFS_FS=m
217 CONFIG_JFS_FS=m
218 CONFIG_OCFS2_FS=m
219 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
220 diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig
221 index d871623955c5..0311380160f4 100644
222 --- a/arch/sh/configs/landisk_defconfig
223 +++ b/arch/sh/configs/landisk_defconfig
224 @@ -95,6 +95,7 @@ CONFIG_USB_SISUSBVGA=m
225 CONFIG_EXT2_FS=y
226 CONFIG_EXT3_FS=y
227 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
228 +CONFIG_REISERFS_FS=y
229 CONFIG_ISO9660_FS=m
230 CONFIG_MSDOS_FS=y
231 CONFIG_VFAT_FS=y
232 diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig
233 index 99bc0e889287..c1032559ecd4 100644
234 --- a/arch/sh/configs/titan_defconfig
235 +++ b/arch/sh/configs/titan_defconfig
236 @@ -220,6 +220,7 @@ CONFIG_EXT2_FS=y
237 CONFIG_EXT3_FS=y
238 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
239 # CONFIG_EXT3_FS_XATTR is not set
240 +CONFIG_REISERFS_FS=m
241 CONFIG_XFS_FS=m
242 CONFIG_FUSE_FS=m
243 CONFIG_ISO9660_FS=m
244 diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig
245 index 9c9c77f1255a..e543cbac8792 100644
246 --- a/arch/um/configs/i386_defconfig
247 +++ b/arch/um/configs/i386_defconfig
248 @@ -61,6 +61,7 @@ CONFIG_UML_NET_DAEMON=y
249 CONFIG_UML_NET_MCAST=y
250 CONFIG_UML_NET_SLIRP=y
251 CONFIG_EXT4_FS=y
252 +CONFIG_REISERFS_FS=y
253 CONFIG_QUOTA=y
254 CONFIG_AUTOFS_FS=m
255 CONFIG_ISO9660_FS=m
256 diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig
257 index 03b10d3f6816..939cb12318ca 100644
258 --- a/arch/um/configs/x86_64_defconfig
259 +++ b/arch/um/configs/x86_64_defconfig
260 @@ -59,6 +59,7 @@ CONFIG_UML_NET_DAEMON=y
261 CONFIG_UML_NET_MCAST=y
262 CONFIG_UML_NET_SLIRP=y
263 CONFIG_EXT4_FS=y
264 +CONFIG_REISERFS_FS=y
265 CONFIG_QUOTA=y
266 CONFIG_AUTOFS_FS=m
267 CONFIG_ISO9660_FS=m
268 diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
269 index a97f2c40c640..ed209f4f2798 100644
270 --- a/drivers/block/Kconfig
271 +++ b/drivers/block/Kconfig
272 @@ -130,7 +130,7 @@ config BLK_DEV_UBD_SYNC
273 kernel command line option. Alternatively, you can say Y here to
274 turn on synchronous operation by default for all block devices.
276 - If you're running a journalling file system (like xfs, for
277 + If you're running a journalling file system (like reiserfs, for
278 example) in your virtual machine, you will want to say Y here. If
279 you care for the safety of the data in your virtual machine, Y is a
280 wise choice too. In all other cases (for example, if you're just
281 diff --git a/fs/Kconfig b/fs/Kconfig
282 index 64d420e3c475..aae170fc2795 100644
283 --- a/fs/Kconfig
284 +++ b/fs/Kconfig
285 @@ -43,6 +43,7 @@ config FS_MBCACHE
286 default y if EXT4_FS=y
287 default m if EXT2_FS_XATTR || EXT4_FS
289 +source "fs/reiserfs/Kconfig"
290 source "fs/jfs/Kconfig"
292 source "fs/xfs/Kconfig"
293 diff --git a/fs/Makefile b/fs/Makefile
294 index 15df0a923d3a..61679fd587b7 100644
295 --- a/fs/Makefile
296 +++ b/fs/Makefile
297 @@ -61,6 +61,7 @@ obj-$(CONFIG_DLM) += dlm/
299 # Do not add any filesystems before this line
300 obj-$(CONFIG_NETFS_SUPPORT) += netfs/
301 +obj-$(CONFIG_REISERFS_FS) += reiserfs/
302 obj-$(CONFIG_EXT4_FS) += ext4/
303 # We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
304 # ext2 driver, which doesn't know about journalling! Explicitly request ext2
305 diff --git a/fs/buffer.c b/fs/buffer.c
306 index b158cb7a5038..bb4a31b9559d 100644
307 --- a/fs/buffer.c
308 +++ b/fs/buffer.c
309 @@ -855,7 +855,8 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
310 * done a sync(). Just drop the buffers from the inode list.
312 * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which
313 - * assumes that all the buffers are against the blockdev.
314 + * assumes that all the buffers are against the blockdev. Not true
315 + * for reiserfs.
317 void invalidate_inode_buffers(struct inode *inode)
319 diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
320 index 818083a36bef..4c925e55dbcd 100644
321 --- a/fs/quota/Kconfig
322 +++ b/fs/quota/Kconfig
323 @@ -9,13 +9,14 @@ config QUOTA
324 help
325 If you say Y here, you will be able to set per user limits for disk
326 usage (also called disk quotas). Currently, it works for the
327 - ext2, ext3, ext4, f2fs, jfs and ocfs2 file systems. Note that gfs2
328 - and xfs use their own quota system. Ext3 and ext4 also support
329 - journaled quotas for which you don't need to run quotacheck(8) after
330 - an unclean shutdown. For further details, read the Quota mini-HOWTO,
331 - available from <https://www.tldp.org/docs.html#howto>, or the
332 - documentation provided with the quota tools. Probably the quota
333 - support is only useful for multi user systems. If unsure, say N.
334 + ext2, ext3, ext4, f2fs, jfs, ocfs2 and reiserfs file systems.
335 + Note that gfs2 and xfs use their own quota system.
336 + Ext3, ext4 and reiserfs also support journaled quotas for which
337 + you don't need to run quotacheck(8) after an unclean shutdown.
338 + For further details, read the Quota mini-HOWTO, available from
339 + <https://www.tldp.org/docs.html#howto>, or the documentation provided
340 + with the quota tools. Probably the quota support is only useful for
341 + multi user systems. If unsure, say N.
343 config QUOTA_NETLINK_INTERFACE
344 bool "Report quota messages through netlink interface"
345 diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
346 new file mode 100644
347 index 000000000000..0e6fe26458fe
348 --- /dev/null
349 +++ b/fs/reiserfs/Kconfig
350 @@ -0,0 +1,91 @@
351 +# SPDX-License-Identifier: GPL-2.0-only
352 +config REISERFS_FS
353 + tristate "Reiserfs support (deprecated)"
354 + select BUFFER_HEAD
355 + select CRC32
356 + select LEGACY_DIRECT_IO
357 + help
358 + Reiserfs is deprecated and scheduled to be removed from the kernel
359 + in 2025. If you are still using it, please migrate to another
360 + filesystem or tell us your usecase for reiserfs.
362 + Reiserfs stores not just filenames but the files themselves in a
363 + balanced tree. Uses journalling.
365 + Balanced trees are more efficient than traditional file system
366 + architectural foundations.
368 + In general, ReiserFS is as fast as ext2, but is very efficient with
369 + large directories and small files. Additional patches are needed
370 + for NFS and quotas, please see
371 + <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
373 + It is more easily extended to have features currently found in
374 + database and keyword search systems than block allocation based file
375 + systems are. The next version will be so extended, and will support
376 + plugins consistent with our motto ``It takes more than a license to
377 + make source code open.''
379 + Read <https://reiser4.wiki.kernel.org/index.php/Main_Page>
380 + to learn more about reiserfs.
382 + Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
384 + If you like it, you can pay us to add new features to it that you
385 + need, buy a support contract, or pay us to port it to another OS.
387 +config REISERFS_CHECK
388 + bool "Enable reiserfs debug mode"
389 + depends on REISERFS_FS
390 + help
391 + If you set this to Y, then ReiserFS will perform every check it can
392 + possibly imagine of its internal consistency throughout its
393 + operation. It will also go substantially slower. More than once we
394 + have forgotten that this was on, and then gone despondent over the
395 + latest benchmarks.:-) Use of this option allows our team to go all
396 + out in checking for consistency when debugging without fear of its
397 + effect on end users. If you are on the verge of sending in a bug
398 + report, say Y and you might get a useful error message. Almost
399 + everyone should say N.
401 +config REISERFS_PROC_INFO
402 + bool "Stats in /proc/fs/reiserfs"
403 + depends on REISERFS_FS && PROC_FS
404 + help
405 + Create under /proc/fs/reiserfs a hierarchy of files, displaying
406 + various ReiserFS statistics and internal data at the expense of
407 + making your kernel or module slightly larger (+8 KB). This also
408 + increases the amount of kernel memory required for each mount.
409 + Almost everyone but ReiserFS developers and people fine-tuning
410 + reiserfs or tracing problems should say N.
412 +config REISERFS_FS_XATTR
413 + bool "ReiserFS extended attributes"
414 + depends on REISERFS_FS
415 + help
416 + Extended attributes are name:value pairs associated with inodes by
417 + the kernel or by users (see the attr(5) manual page for details).
419 + If unsure, say N.
421 +config REISERFS_FS_POSIX_ACL
422 + bool "ReiserFS POSIX Access Control Lists"
423 + depends on REISERFS_FS_XATTR
424 + select FS_POSIX_ACL
425 + help
426 + Posix Access Control Lists (ACLs) support permissions for users and
427 + groups beyond the owner/group/world scheme.
429 + If you don't know what Access Control Lists are, say N
431 +config REISERFS_FS_SECURITY
432 + bool "ReiserFS Security Labels"
433 + depends on REISERFS_FS_XATTR
434 + help
435 + Security labels support alternative access control models
436 + implemented by security modules like SELinux. This option
437 + enables an extended attribute handler for file security
438 + labels in the ReiserFS filesystem.
440 + If you are not using a security module that requires using
441 + extended attributes for file security labels, say N.
442 diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
443 new file mode 100644
444 index 000000000000..bd29c58ccbd8
445 --- /dev/null
446 +++ b/fs/reiserfs/Makefile
447 @@ -0,0 +1,30 @@
448 +# SPDX-License-Identifier: GPL-2.0
450 +# Makefile for the linux reiser-filesystem routines.
453 +obj-$(CONFIG_REISERFS_FS) += reiserfs.o
455 +reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
456 + super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
457 + hashes.o tail_conversion.o journal.o resize.o \
458 + item_ops.o ioctl.o xattr.o lock.o
460 +ifeq ($(CONFIG_REISERFS_PROC_INFO),y)
461 +reiserfs-objs += procfs.o
462 +endif
464 +ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
465 +reiserfs-objs += xattr_user.o xattr_trusted.o
466 +endif
468 +ifeq ($(CONFIG_REISERFS_FS_SECURITY),y)
469 +reiserfs-objs += xattr_security.o
470 +endif
472 +ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y)
473 +reiserfs-objs += xattr_acl.o
474 +endif
476 +TAGS:
477 + etags *.c
478 diff --git a/fs/reiserfs/README b/fs/reiserfs/README
479 new file mode 100644
480 index 000000000000..11e9ecf24b63
481 --- /dev/null
482 +++ b/fs/reiserfs/README
483 @@ -0,0 +1,151 @@
484 +[LICENSING]
486 +ReiserFS is hereby licensed under the GNU General
487 +Public License version 2.
489 +Source code files that contain the phrase "licensing governed by
490 +reiserfs/README" are "governed files" throughout this file. Governed
491 +files are licensed under the GPL. The portions of them owned by Hans
492 +Reiser, or authorized to be licensed by him, have been in the past,
493 +and likely will be in the future, licensed to other parties under
494 +other licenses. If you add your code to governed files, and don't
495 +want it to be owned by Hans Reiser, put your copyright label on that
496 +code so the poor blight and his customers can keep things straight.
497 +All portions of governed files not labeled otherwise are owned by Hans
498 +Reiser, and by adding your code to it, widely distributing it to
499 +others or sending us a patch, and leaving the sentence in stating that
500 +licensing is governed by the statement in this file, you accept this.
501 +It will be a kindness if you identify whether Hans Reiser is allowed
502 +to license code labeled as owned by you on your behalf other than
503 +under the GPL, because he wants to know if it is okay to do so and put
504 +a check in the mail to you (for non-trivial improvements) when he
505 +makes his next sale. He makes no guarantees as to the amount if any,
506 +though he feels motivated to motivate contributors, and you can surely
507 +discuss this with him before or after contributing. You have the
508 +right to decline to allow him to license your code contribution other
509 +than under the GPL.
511 +Further licensing options are available for commercial and/or other
512 +interests directly from Hans Reiser: hans@reiser.to. If you interpret
513 +the GPL as not allowing those additional licensing options, you read
514 +it wrongly, and Richard Stallman agrees with me, when carefully read
515 +you can see that those restrictions on additional terms do not apply
516 +to the owner of the copyright, and my interpretation of this shall
517 +govern for this license.
519 +Finally, nothing in this license shall be interpreted to allow you to
520 +fail to fairly credit me, or to remove my credits, without my
521 +permission, unless you are an end user not redistributing to others.
522 +If you have doubts about how to properly do that, or about what is
523 +fair, ask. (Last I spoke with him Richard was contemplating how best
524 +to address the fair crediting issue in the next GPL version.)
526 +[END LICENSING]
528 +Reiserfs is a file system based on balanced tree algorithms, which is
529 +described at https://reiser4.wiki.kernel.org/index.php/Main_Page
531 +Stop reading here. Go there, then return.
533 +Send bug reports to yura@namesys.botik.ru.
535 +mkreiserfs and other utilities are in reiserfs/utils, or wherever your
536 +Linux provider put them. There is some disagreement about how useful
537 +it is for users to get their fsck and mkreiserfs out of sync with the
538 +version of reiserfs that is in their kernel, with many important
539 +distributors wanting them out of sync.:-) Please try to remember to
540 +recompile and reinstall fsck and mkreiserfs with every update of
541 +reiserfs, this is a common source of confusion. Note that some of the
542 +utilities cannot be compiled without accessing the balancing code
543 +which is in the kernel code, and relocating the utilities may require
544 +you to specify where that code can be found.
546 +Yes, if you update your reiserfs kernel module you do have to
547 +recompile your kernel, most of the time. The errors you get will be
548 +quite cryptic if your forget to do so.
550 +Real users, as opposed to folks who want to hack and then understand
551 +what went wrong, will want REISERFS_CHECK off.
553 +Hideous Commercial Pitch: Spread your development costs across other OS
554 +vendors. Select from the best in the world, not the best in your
555 +building, by buying from third party OS component suppliers. Leverage
556 +the software component development power of the internet. Be the most
557 +aggressive in taking advantage of the commercial possibilities of
558 +decentralized internet development, and add value through your branded
559 +integration that you sell as an operating system. Let your competitors
560 +be the ones to compete against the entire internet by themselves. Be
561 +hip, get with the new economic trend, before your competitors do. Send
562 +email to hans@reiser.to.
564 +To understand the code, after reading the website, start reading the
565 +code by reading reiserfs_fs.h first.
567 +Hans Reiser was the project initiator, primary architect, source of all
568 +funding for the first 5.5 years, and one of the programmers. He owns
569 +the copyright.
571 +Vladimir Saveljev was one of the programmers, and he worked long hours
572 +writing the cleanest code. He always made the effort to be the best he
573 +could be, and to make his code the best that it could be. What resulted
574 +was quite remarkable. I don't think that money can ever motivate someone
575 +to work the way he did, he is one of the most selfless men I know.
577 +Yura helps with benchmarking, coding hashes, and block pre-allocation
578 +code.
580 +Anatoly Pinchuk is a former member of our team who worked closely with
581 +Vladimir throughout the project's development. He wrote a quite
582 +substantial portion of the total code. He realized that there was a
583 +space problem with packing tails of files for files larger than a node
584 +that start on a node aligned boundary (there are reasons to want to node
585 +align files), and he invented and implemented indirect items and
586 +unformatted nodes as the solution.
588 +Konstantin Shvachko was taking part in the early days.
590 +Mikhail Gilula was a brilliant innovator that has shown much generosity.
592 +Grigory Zaigralin was an extremely effective system administrator for
593 +our group.
595 +Igor Krasheninnikov was wonderful at hardware procurement, repair, and
596 +network installation.
598 +Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a
599 +textbook he got the algorithm from in the code. Note that his analysis
600 +of how we could use the hashing code in making 32 bit NFS cookies work
601 +was probably more important than the actual algorithm. Colin Plumb also
602 +contributed to it.
604 +Chris Mason dived right into our code, and in just a few months produced
605 +the journaling code that dramatically increased the value of ReiserFS.
606 +He is just an amazing programmer.
608 +Igor Zagorovsky is writing much of the new item handler and extent code
609 +for our next major release.
611 +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
612 +resizer, and is hard at work on implementing allocate on flush. SGI
613 +implemented allocate on flush before us for XFS, and generously took
614 +the time to convince me we should do it also. They are great people,
615 +and a great company.
617 +Yuri Shevchuk and Nikita Danilov are doing squid cache optimization.
619 +Vitaly Fertman is doing fsck.
621 +Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably
622 +the endian safe patches which allow ReiserFS to run on any platform
623 +supported by the Linux kernel.
625 +SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the
626 +Alpha PC Company made it possible for me to not have a day job
627 +anymore, and to dramatically increase our staffing. Ecila funded
628 +hypertext feature development, MP3.com funded journaling, SuSE funded
629 +core development, IntegratedLinux.com funded squid web cache
630 +appliances, bigstorage.com funded HSM, and the alpha PC company funded
631 +the alpha port. Many of these tasks were helped by sponsors other
632 +than the ones just named. SuSE has helped in much more than just
633 +funding....
635 diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
636 new file mode 100644
637 index 000000000000..2571b1a8be84
638 --- /dev/null
639 +++ b/fs/reiserfs/acl.h
640 @@ -0,0 +1,78 @@
641 +/* SPDX-License-Identifier: GPL-2.0 */
642 +#include <linux/init.h>
643 +#include <linux/posix_acl.h>
645 +#define REISERFS_ACL_VERSION 0x0001
647 +typedef struct {
648 + __le16 e_tag;
649 + __le16 e_perm;
650 + __le32 e_id;
651 +} reiserfs_acl_entry;
653 +typedef struct {
654 + __le16 e_tag;
655 + __le16 e_perm;
656 +} reiserfs_acl_entry_short;
658 +typedef struct {
659 + __le32 a_version;
660 +} reiserfs_acl_header;
662 +static inline size_t reiserfs_acl_size(int count)
664 + if (count <= 4) {
665 + return sizeof(reiserfs_acl_header) +
666 + count * sizeof(reiserfs_acl_entry_short);
667 + } else {
668 + return sizeof(reiserfs_acl_header) +
669 + 4 * sizeof(reiserfs_acl_entry_short) +
670 + (count - 4) * sizeof(reiserfs_acl_entry);
674 +static inline int reiserfs_acl_count(size_t size)
676 + ssize_t s;
677 + size -= sizeof(reiserfs_acl_header);
678 + s = size - 4 * sizeof(reiserfs_acl_entry_short);
679 + if (s < 0) {
680 + if (size % sizeof(reiserfs_acl_entry_short))
681 + return -1;
682 + return size / sizeof(reiserfs_acl_entry_short);
683 + } else {
684 + if (s % sizeof(reiserfs_acl_entry))
685 + return -1;
686 + return s / sizeof(reiserfs_acl_entry) + 4;
690 +#ifdef CONFIG_REISERFS_FS_POSIX_ACL
691 +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu);
692 +int reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
693 + struct posix_acl *acl, int type);
694 +int reiserfs_acl_chmod(struct dentry *dentry);
695 +int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
696 + struct inode *dir, struct dentry *dentry,
697 + struct inode *inode);
698 +int reiserfs_cache_default_acl(struct inode *dir);
700 +#else
702 +#define reiserfs_cache_default_acl(inode) 0
703 +#define reiserfs_get_acl NULL
704 +#define reiserfs_set_acl NULL
706 +static inline int reiserfs_acl_chmod(struct dentry *dentry)
708 + return 0;
711 +static inline int
712 +reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
713 + const struct inode *dir, struct dentry *dentry,
714 + struct inode *inode)
716 + return 0;
718 +#endif
719 diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
720 new file mode 100644
721 index 000000000000..bf708ac287b4
722 --- /dev/null
723 +++ b/fs/reiserfs/bitmap.c
724 @@ -0,0 +1,1476 @@
726 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
727 + */
728 +/* Reiserfs block (de)allocator, bitmap-based. */
730 +#include <linux/time.h>
731 +#include "reiserfs.h"
732 +#include <linux/errno.h>
733 +#include <linux/buffer_head.h>
734 +#include <linux/kernel.h>
735 +#include <linux/pagemap.h>
736 +#include <linux/vmalloc.h>
737 +#include <linux/quotaops.h>
738 +#include <linux/seq_file.h>
740 +#define PREALLOCATION_SIZE 9
742 +/* different reiserfs block allocator options */
744 +#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits)
746 +#define _ALLOC_concentrating_formatted_nodes 0
747 +#define _ALLOC_displacing_large_files 1
748 +#define _ALLOC_displacing_new_packing_localities 2
749 +#define _ALLOC_old_hashed_relocation 3
750 +#define _ALLOC_new_hashed_relocation 4
751 +#define _ALLOC_skip_busy 5
752 +#define _ALLOC_displace_based_on_dirid 6
753 +#define _ALLOC_hashed_formatted_nodes 7
754 +#define _ALLOC_old_way 8
755 +#define _ALLOC_hundredth_slices 9
756 +#define _ALLOC_dirid_groups 10
757 +#define _ALLOC_oid_groups 11
758 +#define _ALLOC_packing_groups 12
760 +#define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
761 +#define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
762 +#define displacing_new_packing_localities(s) test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s))
764 +#define SET_OPTION(optname) \
765 + do { \
766 + reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \
767 + set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \
768 + } while(0)
769 +#define TEST_OPTION(optname, s) \
770 + test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s))
772 +static inline void get_bit_address(struct super_block *s,
773 + b_blocknr_t block,
774 + unsigned int *bmap_nr,
775 + unsigned int *offset)
777 + /*
778 + * It is in the bitmap block number equal to the block
779 + * number divided by the number of bits in a block.
780 + */
781 + *bmap_nr = block >> (s->s_blocksize_bits + 3);
782 + /* Within that bitmap block it is located at bit offset *offset. */
783 + *offset = block & ((s->s_blocksize << 3) - 1);
786 +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
788 + unsigned int bmap, offset;
789 + unsigned int bmap_count = reiserfs_bmap_count(s);
791 + if (block == 0 || block >= SB_BLOCK_COUNT(s)) {
792 + reiserfs_error(s, "vs-4010",
793 + "block number is out of range %lu (%u)",
794 + block, SB_BLOCK_COUNT(s));
795 + return 0;
798 + get_bit_address(s, block, &bmap, &offset);
800 + /*
801 + * Old format filesystem? Unlikely, but the bitmaps are all
802 + * up front so we need to account for it.
803 + */
804 + if (unlikely(test_bit(REISERFS_OLD_FORMAT,
805 + &REISERFS_SB(s)->s_properties))) {
806 + b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1;
807 + if (block >= bmap1 &&
808 + block <= bmap1 + bmap_count) {
809 + reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) "
810 + "can't be freed or reused",
811 + block, bmap_count);
812 + return 0;
814 + } else {
815 + if (offset == 0) {
816 + reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) "
817 + "can't be freed or reused",
818 + block, bmap_count);
819 + return 0;
823 + if (bmap >= bmap_count) {
824 + reiserfs_error(s, "vs-4030", "bitmap for requested block "
825 + "is out of range: block=%lu, bitmap_nr=%u",
826 + block, bmap);
827 + return 0;
830 + if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) {
831 + reiserfs_error(s, "vs-4050", "this is root block (%u), "
832 + "it must be busy", SB_ROOT_BLOCK(s));
833 + return 0;
836 + return 1;
840 + * Searches in journal structures for a given block number (bmap, off).
841 + * If block is found in reiserfs journal it suggests next free block
842 + * candidate to test.
843 + */
844 +static inline int is_block_in_journal(struct super_block *s, unsigned int bmap,
845 + int off, int *next)
847 + b_blocknr_t tmp;
849 + if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) {
850 + if (tmp) { /* hint supplied */
851 + *next = tmp;
852 + PROC_INFO_INC(s, scan_bitmap.in_journal_hint);
853 + } else {
854 + (*next) = off + 1; /* inc offset to avoid looping. */
855 + PROC_INFO_INC(s, scan_bitmap.in_journal_nohint);
857 + PROC_INFO_INC(s, scan_bitmap.retry);
858 + return 1;
860 + return 0;
864 + * Searches for a window of zero bits with given minimum and maximum
865 + * lengths in one bitmap block
866 + */
867 +static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
868 + unsigned int bmap_n, int *beg, int boundary,
869 + int min, int max, int unfm)
871 + struct super_block *s = th->t_super;
872 + struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n];
873 + struct buffer_head *bh;
874 + int end, next;
875 + int org = *beg;
877 + BUG_ON(!th->t_trans_id);
878 + RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of "
879 + "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1);
880 + PROC_INFO_INC(s, scan_bitmap.bmap);
882 + if (!bi) {
883 + reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer "
884 + "for bitmap %d", bmap_n);
885 + return 0;
888 + bh = reiserfs_read_bitmap_block(s, bmap_n);
889 + if (bh == NULL)
890 + return 0;
892 + while (1) {
893 +cont:
894 + if (bi->free_count < min) {
895 + brelse(bh);
896 + return 0; /* No free blocks in this bitmap */
899 + /* search for a first zero bit -- beginning of a window */
900 + *beg = reiserfs_find_next_zero_le_bit
901 + ((unsigned long *)(bh->b_data), boundary, *beg);
903 + /*
904 + * search for a zero bit fails or the rest of bitmap block
905 + * cannot contain a zero window of minimum size
906 + */
907 + if (*beg + min > boundary) {
908 + brelse(bh);
909 + return 0;
912 + if (unfm && is_block_in_journal(s, bmap_n, *beg, beg))
913 + continue;
914 + /* first zero bit found; we check next bits */
915 + for (end = *beg + 1;; end++) {
916 + if (end >= *beg + max || end >= boundary
917 + || reiserfs_test_le_bit(end, bh->b_data)) {
918 + next = end;
919 + break;
922 + /*
923 + * finding the other end of zero bit window requires
924 + * looking into journal structures (in case of
925 + * searching for free blocks for unformatted nodes)
926 + */
927 + if (unfm && is_block_in_journal(s, bmap_n, end, &next))
928 + break;
931 + /*
932 + * now (*beg) points to beginning of zero bits window,
933 + * (end) points to one bit after the window end
934 + */
936 + /* found window of proper size */
937 + if (end - *beg >= min) {
938 + int i;
939 + reiserfs_prepare_for_journal(s, bh, 1);
940 + /*
941 + * try to set all blocks used checking are
942 + * they still free
943 + */
944 + for (i = *beg; i < end; i++) {
945 + /* Don't check in journal again. */
946 + if (reiserfs_test_and_set_le_bit
947 + (i, bh->b_data)) {
948 + /*
949 + * bit was set by another process while
950 + * we slept in prepare_for_journal()
951 + */
952 + PROC_INFO_INC(s, scan_bitmap.stolen);
954 + /*
955 + * we can continue with smaller set
956 + * of allocated blocks, if length of
957 + * this set is more or equal to `min'
958 + */
959 + if (i >= *beg + min) {
960 + end = i;
961 + break;
964 + /*
965 + * otherwise we clear all bit
966 + * were set ...
967 + */
968 + while (--i >= *beg)
969 + reiserfs_clear_le_bit
970 + (i, bh->b_data);
971 + reiserfs_restore_prepared_buffer(s, bh);
972 + *beg = org;
974 + /*
975 + * Search again in current block
976 + * from beginning
977 + */
978 + goto cont;
981 + bi->free_count -= (end - *beg);
982 + journal_mark_dirty(th, bh);
983 + brelse(bh);
985 + /* free block count calculation */
986 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
987 + 1);
988 + PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg));
989 + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
991 + return end - (*beg);
992 + } else {
993 + *beg = next;
998 +static int bmap_hash_id(struct super_block *s, u32 id)
1000 + char *hash_in = NULL;
1001 + unsigned long hash;
1002 + unsigned bm;
1004 + if (id <= 2) {
1005 + bm = 1;
1006 + } else {
1007 + hash_in = (char *)(&id);
1008 + hash = keyed_hash(hash_in, 4);
1009 + bm = hash % reiserfs_bmap_count(s);
1010 + if (!bm)
1011 + bm = 1;
1013 + /* this can only be true when SB_BMAP_NR = 1 */
1014 + if (bm >= reiserfs_bmap_count(s))
1015 + bm = 0;
1016 + return bm;
1020 + * hashes the id and then returns > 0 if the block group for the
1021 + * corresponding hash is full
1022 + */
1023 +static inline int block_group_used(struct super_block *s, u32 id)
1025 + int bm = bmap_hash_id(s, id);
1026 + struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm];
1028 + /*
1029 + * If we don't have cached information on this bitmap block, we're
1030 + * going to have to load it later anyway. Loading it here allows us
1031 + * to make a better decision. This favors long-term performance gain
1032 + * with a better on-disk layout vs. a short term gain of skipping the
1033 + * read and potentially having a bad placement.
1034 + */
1035 + if (info->free_count == UINT_MAX) {
1036 + struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm);
1037 + brelse(bh);
1040 + if (info->free_count > ((s->s_blocksize << 3) * 60 / 100)) {
1041 + return 0;
1043 + return 1;
1047 + * the packing is returned in disk byte order
1048 + */
1049 +__le32 reiserfs_choose_packing(struct inode * dir)
1051 + __le32 packing;
1052 + if (TEST_OPTION(packing_groups, dir->i_sb)) {
1053 + u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id);
1054 + /*
1055 + * some versions of reiserfsck expect packing locality 1 to be
1056 + * special
1057 + */
1058 + if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir))
1059 + packing = INODE_PKEY(dir)->k_objectid;
1060 + else
1061 + packing = INODE_PKEY(dir)->k_dir_id;
1062 + } else
1063 + packing = INODE_PKEY(dir)->k_objectid;
1064 + return packing;
1068 + * Tries to find contiguous zero bit window (given size) in given region of
1069 + * bitmap and place new blocks there. Returns number of allocated blocks.
1070 + */
1071 +static int scan_bitmap(struct reiserfs_transaction_handle *th,
1072 + b_blocknr_t * start, b_blocknr_t finish,
1073 + int min, int max, int unfm, sector_t file_block)
1075 + int nr_allocated = 0;
1076 + struct super_block *s = th->t_super;
1077 + unsigned int bm, off;
1078 + unsigned int end_bm, end_off;
1079 + unsigned int off_max = s->s_blocksize << 3;
1081 + BUG_ON(!th->t_trans_id);
1082 + PROC_INFO_INC(s, scan_bitmap.call);
1084 + /* No point in looking for more free blocks */
1085 + if (SB_FREE_BLOCKS(s) <= 0)
1086 + return 0;
1088 + get_bit_address(s, *start, &bm, &off);
1089 + get_bit_address(s, finish, &end_bm, &end_off);
1090 + if (bm > reiserfs_bmap_count(s))
1091 + return 0;
1092 + if (end_bm > reiserfs_bmap_count(s))
1093 + end_bm = reiserfs_bmap_count(s);
1095 + /*
1096 + * When the bitmap is more than 10% free, anyone can allocate.
1097 + * When it's less than 10% free, only files that already use the
1098 + * bitmap are allowed. Once we pass 80% full, this restriction
1099 + * is lifted.
1101 + * We do this so that files that grow later still have space close to
1102 + * their original allocation. This improves locality, and presumably
1103 + * performance as a result.
1105 + * This is only an allocation policy and does not make up for getting a
1106 + * bad hint. Decent hinting must be implemented for this to work well.
1107 + */
1108 + if (TEST_OPTION(skip_busy, s)
1109 + && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) {
1110 + for (; bm < end_bm; bm++, off = 0) {
1111 + if ((off && (!unfm || (file_block != 0)))
1112 + || SB_AP_BITMAP(s)[bm].free_count >
1113 + (s->s_blocksize << 3) / 10)
1114 + nr_allocated =
1115 + scan_bitmap_block(th, bm, &off, off_max,
1116 + min, max, unfm);
1117 + if (nr_allocated)
1118 + goto ret;
1120 + /* we know from above that start is a reasonable number */
1121 + get_bit_address(s, *start, &bm, &off);
1124 + for (; bm < end_bm; bm++, off = 0) {
1125 + nr_allocated =
1126 + scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
1127 + if (nr_allocated)
1128 + goto ret;
1131 + nr_allocated =
1132 + scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm);
1134 +ret:
1135 + *start = bm * off_max + off;
1136 + return nr_allocated;
1140 +static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
1141 + struct inode *inode, b_blocknr_t block,
1142 + int for_unformatted)
1144 + struct super_block *s = th->t_super;
1145 + struct reiserfs_super_block *rs;
1146 + struct buffer_head *sbh, *bmbh;
1147 + struct reiserfs_bitmap_info *apbi;
1148 + unsigned int nr, offset;
1150 + BUG_ON(!th->t_trans_id);
1151 + PROC_INFO_INC(s, free_block);
1152 + rs = SB_DISK_SUPER_BLOCK(s);
1153 + sbh = SB_BUFFER_WITH_SB(s);
1154 + apbi = SB_AP_BITMAP(s);
1156 + get_bit_address(s, block, &nr, &offset);
1158 + if (nr >= reiserfs_bmap_count(s)) {
1159 + reiserfs_error(s, "vs-4075", "block %lu is out of range",
1160 + block);
1161 + return;
1164 + bmbh = reiserfs_read_bitmap_block(s, nr);
1165 + if (!bmbh)
1166 + return;
1168 + reiserfs_prepare_for_journal(s, bmbh, 1);
1170 + /* clear bit for the given block in bit map */
1171 + if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) {
1172 + reiserfs_error(s, "vs-4080",
1173 + "block %lu: bit already cleared", block);
1175 + apbi[nr].free_count++;
1176 + journal_mark_dirty(th, bmbh);
1177 + brelse(bmbh);
1179 + reiserfs_prepare_for_journal(s, sbh, 1);
1180 + /* update super block */
1181 + set_sb_free_blocks(rs, sb_free_blocks(rs) + 1);
1183 + journal_mark_dirty(th, sbh);
1184 + if (for_unformatted) {
1185 + int depth = reiserfs_write_unlock_nested(s);
1186 + dquot_free_block_nodirty(inode, 1);
1187 + reiserfs_write_lock_nested(s, depth);
1191 +void reiserfs_free_block(struct reiserfs_transaction_handle *th,
1192 + struct inode *inode, b_blocknr_t block,
1193 + int for_unformatted)
1195 + struct super_block *s = th->t_super;
1197 + BUG_ON(!th->t_trans_id);
1198 + RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
1199 + if (!is_reusable(s, block, 1))
1200 + return;
1202 + if (block > sb_block_count(REISERFS_SB(s)->s_rs)) {
1203 + reiserfs_error(th->t_super, "bitmap-4072",
1204 + "Trying to free block outside file system "
1205 + "boundaries (%lu > %lu)",
1206 + block, sb_block_count(REISERFS_SB(s)->s_rs));
1207 + return;
1209 + /* mark it before we clear it, just in case */
1210 + journal_mark_freed(th, s, block);
1211 + _reiserfs_free_block(th, inode, block, for_unformatted);
1214 +/* preallocated blocks don't need to be run through journal_mark_freed */
1215 +static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th,
1216 + struct inode *inode, b_blocknr_t block)
1218 + BUG_ON(!th->t_trans_id);
1219 + RFALSE(!th->t_super,
1220 + "vs-4060: trying to free block on nonexistent device");
1221 + if (!is_reusable(th->t_super, block, 1))
1222 + return;
1223 + _reiserfs_free_block(th, inode, block, 1);
1226 +static void __discard_prealloc(struct reiserfs_transaction_handle *th,
1227 + struct reiserfs_inode_info *ei)
1229 + unsigned long save = ei->i_prealloc_block;
1230 + int dirty = 0;
1231 + struct inode *inode = &ei->vfs_inode;
1233 + BUG_ON(!th->t_trans_id);
1234 +#ifdef CONFIG_REISERFS_CHECK
1235 + if (ei->i_prealloc_count < 0)
1236 + reiserfs_error(th->t_super, "zam-4001",
1237 + "inode has negative prealloc blocks count.");
1238 +#endif
1239 + while (ei->i_prealloc_count > 0) {
1240 + b_blocknr_t block_to_free;
1242 + /*
1243 + * reiserfs_free_prealloc_block can drop the write lock,
1244 + * which could allow another caller to free the same block.
1245 + * We can protect against it by modifying the prealloc
1246 + * state before calling it.
1247 + */
1248 + block_to_free = ei->i_prealloc_block++;
1249 + ei->i_prealloc_count--;
1250 + reiserfs_free_prealloc_block(th, inode, block_to_free);
1251 + dirty = 1;
1253 + if (dirty)
1254 + reiserfs_update_sd(th, inode);
1255 + ei->i_prealloc_block = save;
1256 + list_del_init(&ei->i_prealloc_list);
1259 +/* FIXME: It should be inline function */
1260 +void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
1261 + struct inode *inode)
1263 + struct reiserfs_inode_info *ei = REISERFS_I(inode);
1265 + BUG_ON(!th->t_trans_id);
1266 + if (ei->i_prealloc_count)
1267 + __discard_prealloc(th, ei);
1270 +void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th)
1272 + struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list;
1274 + BUG_ON(!th->t_trans_id);
1275 + while (!list_empty(plist)) {
1276 + struct reiserfs_inode_info *ei;
1277 + ei = list_entry(plist->next, struct reiserfs_inode_info,
1278 + i_prealloc_list);
1279 +#ifdef CONFIG_REISERFS_CHECK
1280 + if (!ei->i_prealloc_count) {
1281 + reiserfs_error(th->t_super, "zam-4001",
1282 + "inode is in prealloc list but has "
1283 + "no preallocated blocks.");
1285 +#endif
1286 + __discard_prealloc(th, ei);
1290 +void reiserfs_init_alloc_options(struct super_block *s)
1292 + set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
1293 + set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
1294 + set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s));
1297 +/* block allocator related options are parsed here */
1298 +int reiserfs_parse_alloc_options(struct super_block *s, char *options)
1300 + char *this_char, *value;
1302 + /* clear default settings */
1303 + REISERFS_SB(s)->s_alloc_options.bits = 0;
1305 + while ((this_char = strsep(&options, ":")) != NULL) {
1306 + if ((value = strchr(this_char, '=')) != NULL)
1307 + *value++ = 0;
1309 + if (!strcmp(this_char, "concentrating_formatted_nodes")) {
1310 + int temp;
1311 + SET_OPTION(concentrating_formatted_nodes);
1312 + temp = (value
1313 + && *value) ? simple_strtoul(value, &value,
1314 + 0) : 10;
1315 + if (temp <= 0 || temp > 100) {
1316 + REISERFS_SB(s)->s_alloc_options.border = 10;
1317 + } else {
1318 + REISERFS_SB(s)->s_alloc_options.border =
1319 + 100 / temp;
1321 + continue;
1323 + if (!strcmp(this_char, "displacing_large_files")) {
1324 + SET_OPTION(displacing_large_files);
1325 + REISERFS_SB(s)->s_alloc_options.large_file_size =
1326 + (value
1327 + && *value) ? simple_strtoul(value, &value, 0) : 16;
1328 + continue;
1330 + if (!strcmp(this_char, "displacing_new_packing_localities")) {
1331 + SET_OPTION(displacing_new_packing_localities);
1332 + continue;
1335 + if (!strcmp(this_char, "old_hashed_relocation")) {
1336 + SET_OPTION(old_hashed_relocation);
1337 + continue;
1340 + if (!strcmp(this_char, "new_hashed_relocation")) {
1341 + SET_OPTION(new_hashed_relocation);
1342 + continue;
1345 + if (!strcmp(this_char, "dirid_groups")) {
1346 + SET_OPTION(dirid_groups);
1347 + continue;
1349 + if (!strcmp(this_char, "oid_groups")) {
1350 + SET_OPTION(oid_groups);
1351 + continue;
1353 + if (!strcmp(this_char, "packing_groups")) {
1354 + SET_OPTION(packing_groups);
1355 + continue;
1357 + if (!strcmp(this_char, "hashed_formatted_nodes")) {
1358 + SET_OPTION(hashed_formatted_nodes);
1359 + continue;
1362 + if (!strcmp(this_char, "skip_busy")) {
1363 + SET_OPTION(skip_busy);
1364 + continue;
1367 + if (!strcmp(this_char, "hundredth_slices")) {
1368 + SET_OPTION(hundredth_slices);
1369 + continue;
1372 + if (!strcmp(this_char, "old_way")) {
1373 + SET_OPTION(old_way);
1374 + continue;
1377 + if (!strcmp(this_char, "displace_based_on_dirid")) {
1378 + SET_OPTION(displace_based_on_dirid);
1379 + continue;
1382 + if (!strcmp(this_char, "preallocmin")) {
1383 + REISERFS_SB(s)->s_alloc_options.preallocmin =
1384 + (value
1385 + && *value) ? simple_strtoul(value, &value, 0) : 4;
1386 + continue;
1389 + if (!strcmp(this_char, "preallocsize")) {
1390 + REISERFS_SB(s)->s_alloc_options.preallocsize =
1391 + (value
1392 + && *value) ? simple_strtoul(value, &value,
1393 + 0) :
1394 + PREALLOCATION_SIZE;
1395 + continue;
1398 + reiserfs_warning(s, "zam-4001", "unknown option - %s",
1399 + this_char);
1400 + return 1;
1403 + reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
1404 + return 0;
1407 +static void print_sep(struct seq_file *seq, int *first)
1409 + if (!*first)
1410 + seq_puts(seq, ":");
1411 + else
1412 + *first = 0;
1415 +void show_alloc_options(struct seq_file *seq, struct super_block *s)
1417 + int first = 1;
1419 + if (SB_ALLOC_OPTS(s) == ((1 << _ALLOC_skip_busy) |
1420 + (1 << _ALLOC_dirid_groups) | (1 << _ALLOC_packing_groups)))
1421 + return;
1423 + seq_puts(seq, ",alloc=");
1425 + if (TEST_OPTION(concentrating_formatted_nodes, s)) {
1426 + print_sep(seq, &first);
1427 + if (REISERFS_SB(s)->s_alloc_options.border != 10) {
1428 + seq_printf(seq, "concentrating_formatted_nodes=%d",
1429 + 100 / REISERFS_SB(s)->s_alloc_options.border);
1430 + } else
1431 + seq_puts(seq, "concentrating_formatted_nodes");
1433 + if (TEST_OPTION(displacing_large_files, s)) {
1434 + print_sep(seq, &first);
1435 + if (REISERFS_SB(s)->s_alloc_options.large_file_size != 16) {
1436 + seq_printf(seq, "displacing_large_files=%lu",
1437 + REISERFS_SB(s)->s_alloc_options.large_file_size);
1438 + } else
1439 + seq_puts(seq, "displacing_large_files");
1441 + if (TEST_OPTION(displacing_new_packing_localities, s)) {
1442 + print_sep(seq, &first);
1443 + seq_puts(seq, "displacing_new_packing_localities");
1445 + if (TEST_OPTION(old_hashed_relocation, s)) {
1446 + print_sep(seq, &first);
1447 + seq_puts(seq, "old_hashed_relocation");
1449 + if (TEST_OPTION(new_hashed_relocation, s)) {
1450 + print_sep(seq, &first);
1451 + seq_puts(seq, "new_hashed_relocation");
1453 + if (TEST_OPTION(dirid_groups, s)) {
1454 + print_sep(seq, &first);
1455 + seq_puts(seq, "dirid_groups");
1457 + if (TEST_OPTION(oid_groups, s)) {
1458 + print_sep(seq, &first);
1459 + seq_puts(seq, "oid_groups");
1461 + if (TEST_OPTION(packing_groups, s)) {
1462 + print_sep(seq, &first);
1463 + seq_puts(seq, "packing_groups");
1465 + if (TEST_OPTION(hashed_formatted_nodes, s)) {
1466 + print_sep(seq, &first);
1467 + seq_puts(seq, "hashed_formatted_nodes");
1469 + if (TEST_OPTION(skip_busy, s)) {
1470 + print_sep(seq, &first);
1471 + seq_puts(seq, "skip_busy");
1473 + if (TEST_OPTION(hundredth_slices, s)) {
1474 + print_sep(seq, &first);
1475 + seq_puts(seq, "hundredth_slices");
1477 + if (TEST_OPTION(old_way, s)) {
1478 + print_sep(seq, &first);
1479 + seq_puts(seq, "old_way");
1481 + if (TEST_OPTION(displace_based_on_dirid, s)) {
1482 + print_sep(seq, &first);
1483 + seq_puts(seq, "displace_based_on_dirid");
1485 + if (REISERFS_SB(s)->s_alloc_options.preallocmin != 0) {
1486 + print_sep(seq, &first);
1487 + seq_printf(seq, "preallocmin=%d",
1488 + REISERFS_SB(s)->s_alloc_options.preallocmin);
1490 + if (REISERFS_SB(s)->s_alloc_options.preallocsize != 17) {
1491 + print_sep(seq, &first);
1492 + seq_printf(seq, "preallocsize=%d",
1493 + REISERFS_SB(s)->s_alloc_options.preallocsize);
1497 +static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint)
1499 + char *hash_in;
1501 + if (hint->formatted_node) {
1502 + hash_in = (char *)&hint->key.k_dir_id;
1503 + } else {
1504 + if (!hint->inode) {
1505 + /*hint->search_start = hint->beg;*/
1506 + hash_in = (char *)&hint->key.k_dir_id;
1507 + } else
1508 + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
1509 + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
1510 + else
1511 + hash_in =
1512 + (char *)(&INODE_PKEY(hint->inode)->k_objectid);
1515 + hint->search_start =
1516 + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
1520 + * Relocation based on dirid, hashing them into a given bitmap block
1521 + * files. Formatted nodes are unaffected, a separate policy covers them
1522 + */
1523 +static void dirid_groups(reiserfs_blocknr_hint_t * hint)
1525 + unsigned long hash;
1526 + __u32 dirid = 0;
1527 + int bm = 0;
1528 + struct super_block *sb = hint->th->t_super;
1530 + if (hint->inode)
1531 + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
1532 + else if (hint->formatted_node)
1533 + dirid = hint->key.k_dir_id;
1535 + if (dirid) {
1536 + bm = bmap_hash_id(sb, dirid);
1537 + hash = bm * (sb->s_blocksize << 3);
1538 + /* give a portion of the block group to metadata */
1539 + if (hint->inode)
1540 + hash += sb->s_blocksize / 2;
1541 + hint->search_start = hash;
1546 + * Relocation based on oid, hashing them into a given bitmap block
1547 + * files. Formatted nodes are unaffected, a separate policy covers them
1548 + */
1549 +static void oid_groups(reiserfs_blocknr_hint_t * hint)
1551 + if (hint->inode) {
1552 + unsigned long hash;
1553 + __u32 oid;
1554 + __u32 dirid;
1555 + int bm;
1557 + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
1559 + /*
1560 + * keep the root dir and it's first set of subdirs close to
1561 + * the start of the disk
1562 + */
1563 + if (dirid <= 2)
1564 + hash = (hint->inode->i_sb->s_blocksize << 3);
1565 + else {
1566 + oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
1567 + bm = bmap_hash_id(hint->inode->i_sb, oid);
1568 + hash = bm * (hint->inode->i_sb->s_blocksize << 3);
1570 + hint->search_start = hash;
1575 + * returns 1 if it finds an indirect item and gets valid hint info
1576 + * from it, otherwise 0
1577 + */
1578 +static int get_left_neighbor(reiserfs_blocknr_hint_t * hint)
1580 + struct treepath *path;
1581 + struct buffer_head *bh;
1582 + struct item_head *ih;
1583 + int pos_in_item;
1584 + __le32 *item;
1585 + int ret = 0;
1587 + /*
1588 + * reiserfs code can call this function w/o pointer to path
1589 + * structure supplied; then we rely on supplied search_start
1590 + */
1591 + if (!hint->path)
1592 + return 0;
1594 + path = hint->path;
1595 + bh = get_last_bh(path);
1596 + RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor");
1597 + ih = tp_item_head(path);
1598 + pos_in_item = path->pos_in_item;
1599 + item = tp_item_body(path);
1601 + hint->search_start = bh->b_blocknr;
1603 + /*
1604 + * for indirect item: go to left and look for the first non-hole entry
1605 + * in the indirect item
1606 + */
1607 + if (!hint->formatted_node && is_indirect_le_ih(ih)) {
1608 + if (pos_in_item == I_UNFM_NUM(ih))
1609 + pos_in_item--;
1610 + while (pos_in_item >= 0) {
1611 + int t = get_block_num(item, pos_in_item);
1612 + if (t) {
1613 + hint->search_start = t;
1614 + ret = 1;
1615 + break;
1617 + pos_in_item--;
1621 + /* does result value fit into specified region? */
1622 + return ret;
1626 + * should be, if formatted node, then try to put on first part of the device
1627 + * specified as number of percent with mount option device, else try to put
1628 + * on last of device. This is not to say it is good code to do so,
1629 + * but the effect should be measured.
1630 + */
1631 +static inline void set_border_in_hint(struct super_block *s,
1632 + reiserfs_blocknr_hint_t * hint)
1634 + b_blocknr_t border =
1635 + SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border;
1637 + if (hint->formatted_node)
1638 + hint->end = border - 1;
1639 + else
1640 + hint->beg = border;
1643 +static inline void displace_large_file(reiserfs_blocknr_hint_t * hint)
1645 + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
1646 + hint->search_start =
1647 + hint->beg +
1648 + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id),
1649 + 4) % (hint->end - hint->beg);
1650 + else
1651 + hint->search_start =
1652 + hint->beg +
1653 + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid),
1654 + 4) % (hint->end - hint->beg);
1657 +static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint)
1659 + char *hash_in;
1661 + if (!hint->inode)
1662 + hash_in = (char *)&hint->key.k_dir_id;
1663 + else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
1664 + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
1665 + else
1666 + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
1668 + hint->search_start =
1669 + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
1672 +static inline int
1673 +this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *
1674 + hint)
1676 + return hint->block ==
1677 + REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size;
1680 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1681 +static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint)
1683 + struct in_core_key *key = &hint->key;
1685 + hint->th->displace_new_blocks = 0;
1686 + hint->search_start =
1687 + hint->beg + keyed_hash((char *)(&key->k_objectid),
1688 + 4) % (hint->end - hint->beg);
1690 +#endif
1692 +static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint)
1694 + b_blocknr_t border;
1695 + u32 hash_in;
1697 + if (hint->formatted_node || hint->inode == NULL) {
1698 + return 0;
1701 + hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
1702 + border =
1703 + hint->beg + (u32) keyed_hash(((char *)(&hash_in)),
1704 + 4) % (hint->end - hint->beg - 1);
1705 + if (border > hint->search_start)
1706 + hint->search_start = border;
1708 + return 1;
1711 +static inline int old_way(reiserfs_blocknr_hint_t * hint)
1713 + b_blocknr_t border;
1715 + if (hint->formatted_node || hint->inode == NULL) {
1716 + return 0;
1719 + border =
1720 + hint->beg +
1721 + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end -
1722 + hint->beg);
1723 + if (border > hint->search_start)
1724 + hint->search_start = border;
1726 + return 1;
1729 +static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint)
1731 + struct in_core_key *key = &hint->key;
1732 + b_blocknr_t slice_start;
1734 + slice_start =
1735 + (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100);
1736 + if (slice_start > hint->search_start
1737 + || slice_start + (hint->end / 100) <= hint->search_start) {
1738 + hint->search_start = slice_start;
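+/*
+ * In other words: the device is treated as 100 equal slices, the
+ * directory id is hashed to pick one of them, and search_start is
+ * pulled into that slice only when it currently lies outside of it.
+ */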
1742 +static void determine_search_start(reiserfs_blocknr_hint_t * hint,
1743 + int amount_needed)
1745 + struct super_block *s = hint->th->t_super;
1746 + int unfm_hint;
1748 + hint->beg = 0;
1749 + hint->end = SB_BLOCK_COUNT(s) - 1;
1751 + /* This is the former border algorithm, now with a tunable border offset */
1752 + if (concentrating_formatted_nodes(s))
1753 + set_border_in_hint(s, hint);
1755 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1756 + /*
1757 + * whenever we create a new directory, we displace it. At first
1758 + * we will hash for location, later we might look for a moderately
1759 + * empty place for it
1760 + */
1761 + if (displacing_new_packing_localities(s)
1762 + && hint->th->displace_new_blocks) {
1763 + displace_new_packing_locality(hint);
1765 + /*
1766 + * we do not continue determine_search_start,
1767 + * if new packing locality is being displaced
1768 + */
1769 + return;
1771 +#endif
1773 + /*
1774 + * all persons should feel encouraged to add more special cases
1775 + * here and test them
1776 + */
1778 + if (displacing_large_files(s) && !hint->formatted_node
1779 + && this_blocknr_allocation_would_make_it_a_large_file(hint)) {
1780 + displace_large_file(hint);
1781 + return;
1784 + /*
1785 + * if none of our special cases is relevant, use the left
1786 + * neighbor in the tree order of the new node we are allocating for
1787 + */
1788 + if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) {
1789 + hash_formatted_node(hint);
1790 + return;
1793 + unfm_hint = get_left_neighbor(hint);
1795 + /*
1796 + * Mimic the old block allocator's behaviour: if the VFS allowed
1797 + * preallocation, new blocks are displaced based on the directory ID.
1798 + * Also, if the suggested search_start is less than the last
1799 + * preallocated block, we start searching from that block, assuming
1800 + * that HDD dataflow is faster in the forward direction
1801 + */
1802 + if (TEST_OPTION(old_way, s)) {
1803 + if (!hint->formatted_node) {
1804 + if (!reiserfs_hashed_relocation(s))
1805 + old_way(hint);
1806 + else if (!reiserfs_no_unhashed_relocation(s))
1807 + old_hashed_relocation(hint);
1809 + if (hint->inode
1810 + && hint->search_start <
1811 + REISERFS_I(hint->inode)->i_prealloc_block)
1812 + hint->search_start =
1813 + REISERFS_I(hint->inode)->i_prealloc_block;
1815 + return;
1818 + /* This is an approach proposed by Hans */
1819 + if (TEST_OPTION(hundredth_slices, s)
1820 + && !(displacing_large_files(s) && !hint->formatted_node)) {
1821 + hundredth_slices(hint);
1822 + return;
1825 + /* old_hashed_relocation only works on unformatted */
1826 + if (!unfm_hint && !hint->formatted_node &&
1827 + TEST_OPTION(old_hashed_relocation, s)) {
1828 + old_hashed_relocation(hint);
1831 + /* new_hashed_relocation works with both formatted/unformatted nodes */
1832 + if ((!unfm_hint || hint->formatted_node) &&
1833 + TEST_OPTION(new_hashed_relocation, s)) {
1834 + new_hashed_relocation(hint);
1837 + /* dirid grouping works only on unformatted nodes */
1838 + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
1839 + dirid_groups(hint);
1841 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1842 + if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
1843 + dirid_groups(hint);
1845 +#endif
1847 + /* oid grouping works only on unformatted nodes */
1848 + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) {
1849 + oid_groups(hint);
1851 + return;
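+/*
+ * In short, determine_search_start() is a priority cascade: displaced
+ * packing localities, large-file displacement and hashed formatted
+ * nodes each return early; otherwise the left-neighbor hint from
+ * get_left_neighbor() is refined by whichever relocation options
+ * (old_way, hundredth_slices, the hashed relocations, dirid/oid
+ * groups) are enabled on this mount.
+ */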
1854 +static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint)
1856 + /* make minimum size a mount option and benchmark both ways */
1857 + /* we preallocate blocks only for regular files, specific size */
1858 + /* benchmark preallocating always and see what happens */
1860 + hint->prealloc_size = 0;
1862 + if (!hint->formatted_node && hint->preallocate) {
1863 + if (S_ISREG(hint->inode->i_mode) && !IS_PRIVATE(hint->inode)
1864 + && hint->inode->i_size >=
1865 + REISERFS_SB(hint->th->t_super)->s_alloc_options.
1866 + preallocmin * hint->inode->i_sb->s_blocksize)
1867 + hint->prealloc_size =
1868 + REISERFS_SB(hint->th->t_super)->s_alloc_options.
1869 + preallocsize - 1;
1871 + return CARRY_ON;
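+/*
+ * Example (hypothetical option values): with preallocmin of 4 and a
+ * 4 KiB block size, only regular, non-private files of at least 16 KiB
+ * get preallocation, and each such allocation then asks scan_bitmap()
+ * for up to preallocsize - 1 extra blocks.
+ */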
1874 +static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint,
1875 + b_blocknr_t * new_blocknrs,
1876 + b_blocknr_t start,
1877 + b_blocknr_t finish, int min,
1878 + int amount_needed,
1879 + int prealloc_size)
1881 + int rest = amount_needed;
1882 + int nr_allocated;
1884 + while (rest > 0 && start <= finish) {
1885 + nr_allocated = scan_bitmap(hint->th, &start, finish, min,
1886 + rest + prealloc_size,
1887 + !hint->formatted_node, hint->block);
1889 + if (nr_allocated == 0) /* no new blocks allocated, return */
1890 + break;
1892 + /* fill free_blocknrs array first */
1893 + while (rest > 0 && nr_allocated > 0) {
1894 + *new_blocknrs++ = start++;
1895 + rest--;
1896 + nr_allocated--;
1899 + /* do we also have something to fill the prealloc. array with? */
1900 + if (nr_allocated > 0) {
1901 + /*
1902 + * it means prealloc_size was greater than 0 and
1903 + * we do preallocation
1904 + */
1905 + list_add(&REISERFS_I(hint->inode)->i_prealloc_list,
1906 + &SB_JOURNAL(hint->th->t_super)->
1907 + j_prealloc_list);
1908 + REISERFS_I(hint->inode)->i_prealloc_block = start;
1909 + REISERFS_I(hint->inode)->i_prealloc_count =
1910 + nr_allocated;
1911 + break;
1915 + return (amount_needed - rest);
1918 +static inline int blocknrs_and_prealloc_arrays_from_search_start
1919 + (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs,
1920 + int amount_needed) {
1921 + struct super_block *s = hint->th->t_super;
1922 + b_blocknr_t start = hint->search_start;
1923 + b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
1924 + int passno = 0;
1925 + int nr_allocated = 0;
1926 + int depth;
1928 + determine_prealloc_size(hint);
1929 + if (!hint->formatted_node) {
1930 + int quota_ret;
1931 +#ifdef REISERQUOTA_DEBUG
1932 + reiserfs_debug(s, REISERFS_DEBUG_CODE,
1933 + "reiserquota: allocating %d blocks id=%u",
1934 + amount_needed, hint->inode->i_uid);
1935 +#endif
1936 + depth = reiserfs_write_unlock_nested(s);
1937 + quota_ret =
1938 + dquot_alloc_block_nodirty(hint->inode, amount_needed);
1939 + if (quota_ret) { /* Quota exceeded? */
1940 + reiserfs_write_lock_nested(s, depth);
1941 + return QUOTA_EXCEEDED;
1943 + if (hint->preallocate && hint->prealloc_size) {
1944 +#ifdef REISERQUOTA_DEBUG
1945 + reiserfs_debug(s, REISERFS_DEBUG_CODE,
1946 + "reiserquota: allocating (prealloc) %d blocks id=%u",
1947 + hint->prealloc_size, hint->inode->i_uid);
1948 +#endif
1949 + quota_ret = dquot_prealloc_block_nodirty(hint->inode,
1950 + hint->prealloc_size);
1951 + if (quota_ret)
1952 + hint->preallocate = hint->prealloc_size = 0;
1954 + /* for unformatted nodes, force large allocations */
1955 + reiserfs_write_lock_nested(s, depth);
1958 + do {
1959 + switch (passno++) {
1960 + case 0: /* Search from hint->search_start to end of disk */
1961 + start = hint->search_start;
1962 + finish = SB_BLOCK_COUNT(s) - 1;
1963 + break;
1964 + case 1: /* Search from hint->beg to hint->search_start */
1965 + start = hint->beg;
1966 + finish = hint->search_start;
1967 + break;
1968 + case 2: /* Last chance: Search from 0 to hint->beg */
1969 + start = 0;
1970 + finish = hint->beg;
1971 + break;
1972 + default:
1973 + /* We've tried searching everywhere, not enough space */
1974 + /* Free the blocks */
1975 + if (!hint->formatted_node) {
1976 +#ifdef REISERQUOTA_DEBUG
1977 + reiserfs_debug(s, REISERFS_DEBUG_CODE,
1978 + "reiserquota: freeing (nospace) %d blocks id=%u",
1979 + amount_needed +
1980 + hint->prealloc_size -
1981 + nr_allocated,
1982 + hint->inode->i_uid);
1983 +#endif
1984 + /* Free not allocated blocks */
1985 + depth = reiserfs_write_unlock_nested(s);
1986 + dquot_free_block_nodirty(hint->inode,
1987 + amount_needed + hint->prealloc_size -
1988 + nr_allocated);
1989 + reiserfs_write_lock_nested(s, depth);
1991 + while (nr_allocated--)
1992 + reiserfs_free_block(hint->th, hint->inode,
1993 + new_blocknrs[nr_allocated],
1994 + !hint->formatted_node);
1996 + return NO_DISK_SPACE;
1998 + } while ((nr_allocated += allocate_without_wrapping_disk(hint,
1999 + new_blocknrs +
2000 + nr_allocated,
2001 + start, finish,
2002 + 1,
2003 + amount_needed -
2004 + nr_allocated,
2005 + hint->
2006 + prealloc_size))
2007 + < amount_needed);
2008 + if (!hint->formatted_node &&
2009 + amount_needed + hint->prealloc_size >
2010 + nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) {
2011 + /* Some of the preallocation blocks were not allocated */
2012 +#ifdef REISERQUOTA_DEBUG
2013 + reiserfs_debug(s, REISERFS_DEBUG_CODE,
2014 + "reiserquota: freeing (failed prealloc) %d blocks id=%u",
2015 + amount_needed + hint->prealloc_size -
2016 + nr_allocated -
2017 + REISERFS_I(hint->inode)->i_prealloc_count,
2018 + hint->inode->i_uid);
2019 +#endif
2021 + depth = reiserfs_write_unlock_nested(s);
2022 + dquot_free_block_nodirty(hint->inode, amount_needed +
2023 + hint->prealloc_size - nr_allocated -
2024 + REISERFS_I(hint->inode)->
2025 + i_prealloc_count);
2026 + reiserfs_write_lock_nested(s, depth);
2029 + return CARRY_ON;
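+/*
+ * Illustration of the pass order above (hypothetical numbers): with a
+ * 1,000,000-block device, hint->beg at 100,000 and a search start of
+ * 600,000, pass 0 scans 600,000..999,999, pass 1 scans
+ * 100,000..600,000, and the last-chance pass 2 scans 0..99,999 before
+ * the function gives up with NO_DISK_SPACE.
+ */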
2032 +/* grab new blocknrs from preallocated list */
2033 +/* return amount still needed after using them */
2034 +static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint,
2035 + b_blocknr_t * new_blocknrs,
2036 + int amount_needed)
2038 + struct inode *inode = hint->inode;
2040 + if (REISERFS_I(inode)->i_prealloc_count > 0) {
2041 + while (amount_needed) {
2043 + *new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++;
2044 + REISERFS_I(inode)->i_prealloc_count--;
2046 + amount_needed--;
2048 + if (REISERFS_I(inode)->i_prealloc_count <= 0) {
2049 + list_del(&REISERFS_I(inode)->i_prealloc_list);
2050 + break;
2054 + /* return amount still needed after using preallocated blocks */
2055 + return amount_needed;
2058 +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint,
2059 + b_blocknr_t *new_blocknrs,
2060 + int amount_needed,
2061 + /* Amount of blocks we have already reserved */
2062 + int reserved_by_us)
2064 + int initial_amount_needed = amount_needed;
2065 + int ret;
2066 + struct super_block *s = hint->th->t_super;
2068 + /* Check if there is enough space, taking into account reserved space */
2069 + if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks <
2070 + amount_needed - reserved_by_us)
2071 + return NO_DISK_SPACE;
2072 + /* should this be if !hint->inode && hint->preallocate? */
2073 + /* do you mean hint->formatted_node can be removed ? - Zam */
2074 + /*
2075 + * hint->formatted_node cannot be removed because we try to access
2076 + * inode information here, and there is often no inode associated with
2077 + * metadata allocations - green
2078 + */
2080 + if (!hint->formatted_node && hint->preallocate) {
2081 + amount_needed = use_preallocated_list_if_available
2082 + (hint, new_blocknrs, amount_needed);
2084 + /*
2085 + * We have all the block numbers we need from the
2086 + * prealloc list
2087 + */
2088 + if (amount_needed == 0)
2089 + return CARRY_ON;
2090 + new_blocknrs += (initial_amount_needed - amount_needed);
2093 + /* find search start and save it in hint structure */
2094 + determine_search_start(hint, amount_needed);
2095 + if (hint->search_start >= SB_BLOCK_COUNT(s))
2096 + hint->search_start = SB_BLOCK_COUNT(s) - 1;
2098 + /* allocation itself; fill new_blocknrs and preallocation arrays */
2099 + ret = blocknrs_and_prealloc_arrays_from_search_start
2100 + (hint, new_blocknrs, amount_needed);
2102 + /*
2103 + * We used prealloc. list to fill (partially) new_blocknrs array.
2104 + * If the final allocation fails we need to return blocks to the
2105 + * prealloc. list or just free them. -- Zam (I chose the second
2106 + * variant)
2107 + */
2108 + if (ret != CARRY_ON) {
2109 + while (amount_needed++ < initial_amount_needed) {
2110 + reiserfs_free_block(hint->th, hint->inode,
2111 + *(--new_blocknrs), 1);
2114 + return ret;
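+/*
+ * A minimal caller sketch (hypothetical variables, kept inside a
+ * comment so it is clearly illustrative and not part of the patch):
+ *
+ *	reiserfs_blocknr_hint_t hint = {0};
+ *	b_blocknr_t blocknr;
+ *
+ *	hint.th = th;			// running transaction handle
+ *	hint.inode = inode;		// NULL for metadata allocations
+ *	hint.block = file_block;	// logical block being mapped
+ *	hint.formatted_node = 0;	// allocating a data block
+ *	hint.preallocate = 1;		// allow preallocation
+ *	if (reiserfs_allocate_blocknrs(&hint, &blocknr, 1, 0) == CARRY_ON)
+ *		... blocknr holds the newly allocated block ...
+ */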
2117 +void reiserfs_cache_bitmap_metadata(struct super_block *sb,
2118 + struct buffer_head *bh,
2119 + struct reiserfs_bitmap_info *info)
2121 + unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size);
2123 + /* The first bit must ALWAYS be 1 */
2124 + if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data))
2125 + reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is "
2126 + "corrupted: first bit must be 1", bh->b_blocknr);
2128 + info->free_count = 0;
2130 + while (--cur >= (unsigned long *)bh->b_data) {
2131 + /* 0 and ~0 are special, we can optimize for them */
2132 + if (*cur == 0)
2133 + info->free_count += BITS_PER_LONG;
2134 + else if (*cur != ~0L) /* A mix, investigate */
2135 + info->free_count += BITS_PER_LONG - hweight_long(*cur);
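+/*
+ * For illustration: on a 64-bit build a bitmap word of 0 contributes
+ * 64 free blocks, a word of ~0 contributes none, and a mixed word such
+ * as 0x00000000ffffffffUL contributes 64 - hweight_long() = 32.
+ */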
2139 +struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
2140 + unsigned int bitmap)
2142 + b_blocknr_t block = (sb->s_blocksize << 3) * bitmap;
2143 + struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap;
2144 + struct buffer_head *bh;
2146 + /*
2147 + * Way old format filesystems had the bitmaps packed up front.
2148 + * I doubt there are any of these left, but just in case...
2149 + */
2150 + if (unlikely(test_bit(REISERFS_OLD_FORMAT,
2151 + &REISERFS_SB(sb)->s_properties)))
2152 + block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap;
2153 + else if (bitmap == 0)
2154 + block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
2156 + bh = sb_bread(sb, block);
2157 + if (bh == NULL)
2158 + reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
2159 + "reading failed", __func__, block);
2160 + else {
2161 + if (buffer_locked(bh)) {
2162 + int depth;
2163 + PROC_INFO_INC(sb, scan_bitmap.wait);
2164 + depth = reiserfs_write_unlock_nested(sb);
2165 + __wait_on_buffer(bh);
2166 + reiserfs_write_lock_nested(sb, depth);
2168 + BUG_ON(!buffer_uptodate(bh));
2169 + BUG_ON(atomic_read(&bh->b_count) == 0);
2171 + if (info->free_count == UINT_MAX)
2172 + reiserfs_cache_bitmap_metadata(sb, bh, info);
2175 + return bh;
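+/*
+ * Layout note (assuming a 4 KiB block size): on the current format,
+ * bitmap n normally lives at block n * 32768; the exception handled
+ * above is bitmap 0, which sits just past the 64 KiB disk offset, i.e.
+ * at block 17.
+ */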
2178 +int reiserfs_init_bitmap_cache(struct super_block *sb)
2180 + struct reiserfs_bitmap_info *bitmap;
2181 + unsigned int bmap_nr = reiserfs_bmap_count(sb);
2183 + bitmap = vmalloc(array_size(bmap_nr, sizeof(*bitmap)));
2184 + if (bitmap == NULL)
2185 + return -ENOMEM;
2187 + memset(bitmap, 0xff, sizeof(*bitmap) * bmap_nr);
2189 + SB_AP_BITMAP(sb) = bitmap;
2191 + return 0;
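+/*
+ * Note that the 0xff fill above deliberately sets every
+ * info->free_count to UINT_MAX, which reiserfs_read_bitmap_block()
+ * treats as "metadata not cached yet" and repopulates on first read.
+ */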
2194 +void reiserfs_free_bitmap_cache(struct super_block *sb)
2196 + if (SB_AP_BITMAP(sb)) {
2197 + vfree(SB_AP_BITMAP(sb));
2198 + SB_AP_BITMAP(sb) = NULL;
2201 diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
2202 new file mode 100644
2203 index 000000000000..79ee2b436685
2204 --- /dev/null
2205 +++ b/fs/reiserfs/dir.c
2206 @@ -0,0 +1,346 @@
2208 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
2209 + */
2211 +#include <linux/string.h>
2212 +#include <linux/errno.h>
2213 +#include <linux/fs.h>
2214 +#include "reiserfs.h"
2215 +#include <linux/stat.h>
2216 +#include <linux/buffer_head.h>
2217 +#include <linux/slab.h>
2218 +#include <linux/uaccess.h>
2220 +extern const struct reiserfs_key MIN_KEY;
2222 +static int reiserfs_readdir(struct file *, struct dir_context *);
2223 +static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
2224 + int datasync);
2226 +const struct file_operations reiserfs_dir_operations = {
2227 + .llseek = generic_file_llseek,
2228 + .read = generic_read_dir,
2229 + .iterate_shared = reiserfs_readdir,
2230 + .fsync = reiserfs_dir_fsync,
2231 + .unlocked_ioctl = reiserfs_ioctl,
2232 +#ifdef CONFIG_COMPAT
2233 + .compat_ioctl = reiserfs_compat_ioctl,
2234 +#endif
2237 +static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
2238 + int datasync)
2240 + struct inode *inode = filp->f_mapping->host;
2241 + int err;
2243 + err = file_write_and_wait_range(filp, start, end);
2244 + if (err)
2245 + return err;
2247 + inode_lock(inode);
2248 + reiserfs_write_lock(inode->i_sb);
2249 + err = reiserfs_commit_for_inode(inode);
2250 + reiserfs_write_unlock(inode->i_sb);
2251 + inode_unlock(inode);
2252 + if (err < 0)
2253 + return err;
2254 + return 0;
2257 +#define store_ih(where,what) copy_item_head (where, what)
2259 +static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
2261 + struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
2262 + return (d_really_is_positive(privroot) &&
2263 + deh->deh_objectid == INODE_PKEY(d_inode(privroot))->k_objectid);
2266 +int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
2269 + /* key of current position in the directory (key of directory entry) */
2270 + struct cpu_key pos_key;
2272 + INITIALIZE_PATH(path_to_entry);
2273 + struct buffer_head *bh;
2274 + int item_num, entry_num;
2275 + const struct reiserfs_key *rkey;
2276 + struct item_head *ih, tmp_ih;
2277 + int search_res;
2278 + char *local_buf;
2279 + loff_t next_pos;
2280 + char small_buf[32]; /* avoid kmalloc if we can */
2281 + struct reiserfs_dir_entry de;
2282 + int ret = 0;
2283 + int depth;
2285 + reiserfs_write_lock(inode->i_sb);
2287 + reiserfs_check_lock_depth(inode->i_sb, "readdir");
2289 + /*
2290 + * form the key to search for the next directory entry, using
2291 + * the f_pos field of the file structure
2292 + */
2293 + make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
2294 + next_pos = cpu_key_k_offset(&pos_key);
2296 + path_to_entry.reada = PATH_READA;
2297 + while (1) {
2298 +research:
2299 + /*
2300 + * search for the directory item containing the entry with
2301 + * the specified key
2302 + */
2303 + search_res =
2304 + search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry,
2305 + &de);
2306 + if (search_res == IO_ERROR) {
2307 + /*
2308 + * FIXME: we could just skip part of directory
2309 + * which could not be read
2310 + */
2311 + ret = -EIO;
2312 + goto out;
2314 + entry_num = de.de_entry_num;
2315 + bh = de.de_bh;
2316 + item_num = de.de_item_num;
2317 + ih = de.de_ih;
2318 + store_ih(&tmp_ih, ih);
2320 + /* we must have found an item, that is, an item of this directory */
2321 + RFALSE(COMP_SHORT_KEYS(&ih->ih_key, &pos_key),
2322 + "vs-9000: found item %h does not match to dir we readdir %K",
2323 + ih, &pos_key);
2324 + RFALSE(item_num > B_NR_ITEMS(bh) - 1,
2325 + "vs-9005 item_num == %d, item amount == %d",
2326 + item_num, B_NR_ITEMS(bh));
2328 + /*
2329 + * and the entry number must not exceed the number of
2330 + * entries in the item
2331 + */
2332 + RFALSE(ih_entry_count(ih) < entry_num,
2333 + "vs-9010: entry number is too big %d (%d)",
2334 + entry_num, ih_entry_count(ih));
2336 + /*
2337 + * go through all entries in the directory item beginning
2338 + * from the entry that has been found
2339 + */
2340 + if (search_res == POSITION_FOUND
2341 + || entry_num < ih_entry_count(ih)) {
2342 + struct reiserfs_de_head *deh =
2343 + B_I_DEH(bh, ih) + entry_num;
2345 + for (; entry_num < ih_entry_count(ih);
2346 + entry_num++, deh++) {
2347 + int d_reclen;
2348 + char *d_name;
2349 + ino_t d_ino;
2350 + loff_t cur_pos = deh_offset(deh);
2352 + /* it is a hidden entry */
2353 + if (!de_visible(deh))
2354 + continue;
2355 + d_reclen = entry_length(bh, ih, entry_num);
2356 + d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh);
2358 + if (d_reclen <= 0 ||
2359 + d_name + d_reclen > bh->b_data + bh->b_size) {
2360 + /*
2361 + * There is corrupted data in the entry;
2362 + * we'd better stop here
2363 + */
2364 + pathrelse(&path_to_entry);
2365 + ret = -EIO;
2366 + goto out;
2369 + if (!d_name[d_reclen - 1])
2370 + d_reclen = strlen(d_name);
2372 + /* too big to send back to VFS */
2373 + if (d_reclen >
2374 + REISERFS_MAX_NAME(inode->i_sb->
2375 + s_blocksize)) {
2376 + continue;
2379 + /* Ignore the .reiserfs_priv entry */
2380 + if (is_privroot_deh(inode, deh))
2381 + continue;
2383 + ctx->pos = deh_offset(deh);
2384 + d_ino = deh_objectid(deh);
2385 + if (d_reclen <= 32) {
2386 + local_buf = small_buf;
2387 + } else {
2388 + local_buf = kmalloc(d_reclen,
2389 + GFP_NOFS);
2390 + if (!local_buf) {
2391 + pathrelse(&path_to_entry);
2392 + ret = -ENOMEM;
2393 + goto out;
2395 + if (item_moved(&tmp_ih, &path_to_entry)) {
2396 + kfree(local_buf);
2397 + goto research;
2401 + /*
2402 + * Note that we copy the name to user space via a
2403 + * temporary buffer (local_buf) because filldir
2404 + * can block if the user-space buffer is swapped
2405 + * out; while it sleeps the entry can move
2406 + * somewhere else
2407 + */
2408 + memcpy(local_buf, d_name, d_reclen);
2410 + /*
2411 + * Since filldir might sleep, we can release
2412 + * the write lock here for other waiters
2413 + */
2414 + depth = reiserfs_write_unlock_nested(inode->i_sb);
2415 + if (!dir_emit
2416 + (ctx, local_buf, d_reclen, d_ino,
2417 + DT_UNKNOWN)) {
2418 + reiserfs_write_lock_nested(inode->i_sb, depth);
2419 + if (local_buf != small_buf) {
2420 + kfree(local_buf);
2422 + goto end;
2424 + reiserfs_write_lock_nested(inode->i_sb, depth);
2425 + if (local_buf != small_buf) {
2426 + kfree(local_buf);
2429 + /* deh_offset(deh) may be invalid now. */
2430 + next_pos = cur_pos + 1;
2432 + if (item_moved(&tmp_ih, &path_to_entry)) {
2433 + set_cpu_key_k_offset(&pos_key,
2434 + next_pos);
2435 + goto research;
2437 + } /* for */
2440 + /* end of directory has been reached */
2441 + if (item_num != B_NR_ITEMS(bh) - 1)
2442 + goto end;
2444 + /*
2445 + * the item we went through is the last item of the node. Using the
2446 + * right delimiting key, check whether this is the directory's end
2447 + */
2448 + rkey = get_rkey(&path_to_entry, inode->i_sb);
2449 + if (!comp_le_keys(rkey, &MIN_KEY)) {
2450 + /*
2451 + * set pos_key to the smallest key greater than the
2452 + * key of the last entry in the item
2453 + */
2454 + set_cpu_key_k_offset(&pos_key, next_pos);
2455 + continue;
2458 + /* end of directory has been reached */
2459 + if (COMP_SHORT_KEYS(rkey, &pos_key)) {
2460 + goto end;
2463 + /* directory continues in the right neighboring block */
2464 + set_cpu_key_k_offset(&pos_key,
2465 + le_key_k_offset(KEY_FORMAT_3_5, rkey));
2467 + } /* while */
2469 +end:
2470 + ctx->pos = next_pos;
2471 + pathrelse(&path_to_entry);
2472 + reiserfs_check_path(&path_to_entry);
2473 +out:
2474 + reiserfs_write_unlock(inode->i_sb);
2475 + return ret;
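+/*
+ * Note that f_pos for a reiserfs directory is not a linear index but
+ * the deh_offset() of the next entry (a hash-derived key offset),
+ * which is why the loop above re-searches the tree by key on every
+ * iteration and after every item_moved() race.
+ */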
2478 +static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
2480 + return reiserfs_readdir_inode(file_inode(file), ctx);
2484 + * compose directory item containing "." and ".." entries (entries are
2485 + * not aligned to a 4-byte boundary)
2486 + */
2487 +void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
2488 + __le32 par_dirid, __le32 par_objid)
2490 + struct reiserfs_de_head *dot, *dotdot;
2492 + memset(body, 0, EMPTY_DIR_SIZE_V1);
2493 + dot = (struct reiserfs_de_head *)body;
2494 + dotdot = dot + 1;
2496 + /* direntry header of "." */
2497 + put_deh_offset(dot, DOT_OFFSET);
2498 + /* these two are from make_le_item_head, and are LE */
2499 + dot->deh_dir_id = dirid;
2500 + dot->deh_objectid = objid;
2501 + dot->deh_state = 0; /* Endian safe if 0 */
2502 + put_deh_location(dot, EMPTY_DIR_SIZE_V1 - strlen("."));
2503 + mark_de_visible(dot);
2505 + /* direntry header of ".." */
2506 + put_deh_offset(dotdot, DOT_DOT_OFFSET);
2507 + /* key of ".." for the root directory */
2508 + /* these two are from the inode, and are LE */
2509 + dotdot->deh_dir_id = par_dirid;
2510 + dotdot->deh_objectid = par_objid;
2511 + dotdot->deh_state = 0; /* Endian safe if 0 */
2512 + put_deh_location(dotdot, deh_location(dot) - strlen(".."));
2513 + mark_de_visible(dotdot);
2515 + /* copy ".." and "." */
2516 + memcpy(body + deh_location(dot), ".", 1);
2517 + memcpy(body + deh_location(dotdot), "..", 2);
2520 +/* compose directory item containing "." and ".." entries */
2521 +void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
2522 + __le32 par_dirid, __le32 par_objid)
2524 + struct reiserfs_de_head *dot, *dotdot;
2526 + memset(body, 0, EMPTY_DIR_SIZE);
2527 + dot = (struct reiserfs_de_head *)body;
2528 + dotdot = dot + 1;
2530 + /* direntry header of "." */
2531 + put_deh_offset(dot, DOT_OFFSET);
2532 + /* these two are from make_le_item_head, and are LE */
2533 + dot->deh_dir_id = dirid;
2534 + dot->deh_objectid = objid;
2535 + dot->deh_state = 0; /* Endian safe if 0 */
2536 + put_deh_location(dot, EMPTY_DIR_SIZE - ROUND_UP(strlen(".")));
2537 + mark_de_visible(dot);
2539 + /* direntry header of ".." */
2540 + put_deh_offset(dotdot, DOT_DOT_OFFSET);
2541 + /* key of ".." for the root directory */
2542 + /* these two are from the inode, and are LE */
2543 + dotdot->deh_dir_id = par_dirid;
2544 + dotdot->deh_objectid = par_objid;
2545 + dotdot->deh_state = 0; /* Endian safe if 0 */
2546 + put_deh_location(dotdot, deh_location(dot) - ROUND_UP(strlen("..")));
2547 + mark_de_visible(dotdot);
2549 + /* copy ".." and "." */
2550 + memcpy(body + deh_location(dot), ".", 1);
2551 + memcpy(body + deh_location(dotdot), "..", 2);
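+/*
+ * Resulting layout sketch: two struct reiserfs_de_head records sit at
+ * the front of the item; the name "." is stored at the very tail of
+ * the EMPTY_DIR_SIZE body and ".." just below it, each padded to a
+ * 4-byte boundary by ROUND_UP() in this v2 variant.
+ */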
2553 diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
2554 new file mode 100644
2555 index 000000000000..5129efc6f2e6
2556 --- /dev/null
2557 +++ b/fs/reiserfs/do_balan.c
2558 @@ -0,0 +1,1900 @@
2560 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
2561 + */
2564 + * Now we have all the buffers that must be used in balancing the tree.
2565 + * Further calculations cannot cause schedule(), and thus the buffer
2566 + * tree will be stable until the balancing is finished. Balance the
2567 + * tree according to the analysis made before, using the buffers
2568 + * obtained in all of the above.
2569 + */
2571 +#include <linux/uaccess.h>
2572 +#include <linux/time.h>
2573 +#include "reiserfs.h"
2574 +#include <linux/buffer_head.h>
2575 +#include <linux/kernel.h>
2577 +static inline void buffer_info_init_left(struct tree_balance *tb,
2578 + struct buffer_info *bi)
2580 + bi->tb = tb;
2581 + bi->bi_bh = tb->L[0];
2582 + bi->bi_parent = tb->FL[0];
2583 + bi->bi_position = get_left_neighbor_position(tb, 0);
2586 +static inline void buffer_info_init_right(struct tree_balance *tb,
2587 + struct buffer_info *bi)
2589 + bi->tb = tb;
2590 + bi->bi_bh = tb->R[0];
2591 + bi->bi_parent = tb->FR[0];
2592 + bi->bi_position = get_right_neighbor_position(tb, 0);
2595 +static inline void buffer_info_init_tbS0(struct tree_balance *tb,
2596 + struct buffer_info *bi)
2598 + bi->tb = tb;
2599 + bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
2600 + bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
2601 + bi->bi_position = PATH_H_POSITION(tb->tb_path, 1);
2604 +static inline void buffer_info_init_bh(struct tree_balance *tb,
2605 + struct buffer_info *bi,
2606 + struct buffer_head *bh)
2608 + bi->tb = tb;
2609 + bi->bi_bh = bh;
2610 + bi->bi_parent = NULL;
2611 + bi->bi_position = 0;
2614 +inline void do_balance_mark_leaf_dirty(struct tree_balance *tb,
2615 + struct buffer_head *bh, int flag)
2617 + journal_mark_dirty(tb->transaction_handle, bh);
2620 +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
2621 +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
2624 + * summary:
2625 + * if deleting something ( tb->insert_size[0] < 0 )
2626 + * return(balance_leaf_when_delete()); (flag d handled here)
2627 + * else
2628 + * if lnum is larger than 0 we put items into the left node
2629 + * if rnum is larger than 0 we put items into the right node
2630 + * if snum1 is larger than 0 we put items into the new node s1
2631 + * if snum2 is larger than 0 we put items into the new node s2
2632 + * Note that all *num* count new items being created.
2633 + */
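+/*
+ * Illustration (hypothetical values): with lnum[0] == 2, rnum[0] == 0
+ * and both snum values 0, two items (or one whole item plus lbytes of
+ * a split one) move from S[0] into L[0], and the new item is then
+ * inserted or pasted wherever item_pos points after the shift; the
+ * negative insert_size case is handled by balance_leaf_when_delete().
+ */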
2635 +static void balance_leaf_when_delete_del(struct tree_balance *tb)
2637 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2638 + int item_pos = PATH_LAST_POSITION(tb->tb_path);
2639 + struct buffer_info bi;
2640 +#ifdef CONFIG_REISERFS_CHECK
2641 + struct item_head *ih = item_head(tbS0, item_pos);
2642 +#endif
2644 + RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
2645 + "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
2646 + -tb->insert_size[0], ih);
2648 + buffer_info_init_tbS0(tb, &bi);
2649 + leaf_delete_items(&bi, 0, item_pos, 1, -1);
2651 + if (!item_pos && tb->CFL[0]) {
2652 + if (B_NR_ITEMS(tbS0)) {
2653 + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
2654 + } else {
2655 + if (!PATH_H_POSITION(tb->tb_path, 1))
2656 + replace_key(tb, tb->CFL[0], tb->lkey[0],
2657 + PATH_H_PPARENT(tb->tb_path, 0), 0);
2661 + RFALSE(!item_pos && !tb->CFL[0],
2662 + "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0],
2663 + tb->L[0]);
2666 +/* cut item in S[0] */
2667 +static void balance_leaf_when_delete_cut(struct tree_balance *tb)
2669 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2670 + int item_pos = PATH_LAST_POSITION(tb->tb_path);
2671 + struct item_head *ih = item_head(tbS0, item_pos);
2672 + int pos_in_item = tb->tb_path->pos_in_item;
2673 + struct buffer_info bi;
2674 + buffer_info_init_tbS0(tb, &bi);
2676 + if (is_direntry_le_ih(ih)) {
2677 + /*
2678 + * UFS unlink semantics are such that you can only
2679 + * delete one directory entry at a time.
2681 + * when we cut a directory tb->insert_size[0] means
2682 + * number of entries to be cut (always 1)
2683 + */
2684 + tb->insert_size[0] = -1;
2685 + leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
2686 + -tb->insert_size[0]);
2688 + RFALSE(!item_pos && !pos_in_item && !tb->CFL[0],
2689 + "PAP-12030: can not change delimiting key. CFL[0]=%p",
2690 + tb->CFL[0]);
2692 + if (!item_pos && !pos_in_item && tb->CFL[0])
2693 + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
2694 + } else {
2695 + leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
2696 + -tb->insert_size[0]);
2698 + RFALSE(!ih_item_len(ih),
2699 + "PAP-12035: cut must leave non-zero dynamic "
2700 + "length of item");
2704 +static int balance_leaf_when_delete_left(struct tree_balance *tb)
2706 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2707 + int n = B_NR_ITEMS(tbS0);
2709 + /* L[0] must be joined with S[0] */
2710 + if (tb->lnum[0] == -1) {
2711 + /* R[0] must be also joined with S[0] */
2712 + if (tb->rnum[0] == -1) {
2713 + if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) {
2714 + /*
2715 + * all contents of all the
2716 + * 3 buffers will be in L[0]
2717 + */
2718 + if (PATH_H_POSITION(tb->tb_path, 1) == 0 &&
2719 + 1 < B_NR_ITEMS(tb->FR[0]))
2720 + replace_key(tb, tb->CFL[0],
2721 + tb->lkey[0], tb->FR[0], 1);
2723 + leaf_move_items(LEAF_FROM_S_TO_L, tb, n, -1,
2724 + NULL);
2725 + leaf_move_items(LEAF_FROM_R_TO_L, tb,
2726 + B_NR_ITEMS(tb->R[0]), -1,
2727 + NULL);
2729 + reiserfs_invalidate_buffer(tb, tbS0);
2730 + reiserfs_invalidate_buffer(tb, tb->R[0]);
2732 + return 0;
2735 + /* all contents of all the 3 buffers will be in R[0] */
2736 + leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, NULL);
2737 + leaf_move_items(LEAF_FROM_L_TO_R, tb,
2738 + B_NR_ITEMS(tb->L[0]), -1, NULL);
2740 + /* right_delimiting_key is correct in R[0] */
2741 + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
2743 + reiserfs_invalidate_buffer(tb, tbS0);
2744 + reiserfs_invalidate_buffer(tb, tb->L[0]);
2746 + return -1;
2749 + RFALSE(tb->rnum[0] != 0,
2750 + "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
2751 + /* all contents of L[0] and S[0] will be in L[0] */
2752 + leaf_shift_left(tb, n, -1);
2754 + reiserfs_invalidate_buffer(tb, tbS0);
2756 + return 0;
2759 + /*
2760 + * a part of the contents of S[0] will be in L[0] and
2761 + * the rest of S[0] will be in R[0]
2762 + */
2764 + RFALSE((tb->lnum[0] + tb->rnum[0] < n) ||
2765 + (tb->lnum[0] + tb->rnum[0] > n + 1),
2766 + "PAP-12050: rnum(%d) and lnum(%d) and item "
2767 + "number(%d) in S[0] are not consistent",
2768 + tb->rnum[0], tb->lnum[0], n);
2769 + RFALSE((tb->lnum[0] + tb->rnum[0] == n) &&
2770 + (tb->lbytes != -1 || tb->rbytes != -1),
2771 + "PAP-12055: bad rbytes (%d)/lbytes (%d) "
2772 + "parameters when items are not split",
2773 + tb->rbytes, tb->lbytes);
2774 + RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) &&
2775 + (tb->lbytes < 1 || tb->rbytes != -1),
2776 + "PAP-12060: bad rbytes (%d)/lbytes (%d) "
2777 + "parameters when items are split",
2778 + tb->rbytes, tb->lbytes);
2780 + leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
2781 + leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
2783 + reiserfs_invalidate_buffer(tb, tbS0);
2785 + return 0;
2789 + * Balance leaf node in case of delete or cut: insert_size[0] < 0
2791 + * lnum, rnum can have values >= -1
2792 + * -1 means that the neighbor must be joined with S
2793 + * 0 means that nothing should be done with the neighbor
2794 + * >0 means to shift entirely or partly the specified number of items
2795 + * to the neighbor
2796 + */
2797 +static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
2799 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2800 + struct buffer_info bi;
2801 + int n;
2803 + RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
2804 + "vs- 12000: level: wrong FR %z", tb->FR[0]);
2805 + RFALSE(tb->blknum[0] > 1,
2806 + "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]);
2807 + RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0),
2808 + "PAP-12010: tree can not be empty");
2810 + buffer_info_init_tbS0(tb, &bi);
2812 + /* Delete or truncate the item */
2814 + BUG_ON(flag != M_DELETE && flag != M_CUT);
2815 + if (flag == M_DELETE)
2816 + balance_leaf_when_delete_del(tb);
2817 + else /* M_CUT */
2818 + balance_leaf_when_delete_cut(tb);
2821 + /*
2822 + * the rule is that no shifting occurs unless by shifting
2823 + * a node can be freed
2824 + */
2825 + n = B_NR_ITEMS(tbS0);
2828 + /* L[0] takes part in balancing */
2829 + if (tb->lnum[0])
2830 + return balance_leaf_when_delete_left(tb);
2832 + if (tb->rnum[0] == -1) {
2833 + /* all contents of R[0] and S[0] will be in R[0] */
2834 + leaf_shift_right(tb, n, -1);
2835 + reiserfs_invalidate_buffer(tb, tbS0);
2836 + return 0;
2839 + RFALSE(tb->rnum[0],
2840 + "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
2841 + return 0;
2844 +static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
2845 + struct item_head *const ih,
2846 + const char * const body)
2848 + int ret;
2849 + struct buffer_info bi;
2850 + int n = B_NR_ITEMS(tb->L[0]);
2851 + unsigned body_shift_bytes = 0;
2853 + if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
2854 + /* part of new item falls into L[0] */
2855 + int new_item_len, shift;
2857 + ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
2859 + /* Calculate item length to insert to S[0] */
2860 + new_item_len = ih_item_len(ih) - tb->lbytes;
2862 + /* Calculate and check item length to insert to L[0] */
2863 + put_ih_item_len(ih, ih_item_len(ih) - new_item_len);
2865 + RFALSE(ih_item_len(ih) <= 0,
2866 + "PAP-12080: there is nothing to insert into L[0]: "
2867 + "ih_item_len=%d", ih_item_len(ih));
2869 + /* Insert new item into L[0] */
2870 + buffer_info_init_left(tb, &bi);
2871 + leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
2872 + min_t(int, tb->zeroes_num, ih_item_len(ih)));
2874 + /*
2875 + * Calculate key component, item length and body to
2876 + * insert into S[0]
2877 + */
2878 + shift = 0;
2879 + if (is_indirect_le_ih(ih))
2880 + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
2882 + add_le_ih_k_offset(ih, tb->lbytes << shift);
2884 + put_ih_item_len(ih, new_item_len);
2885 + if (tb->lbytes > tb->zeroes_num) {
2886 + body_shift_bytes = tb->lbytes - tb->zeroes_num;
2887 + tb->zeroes_num = 0;
2888 + } else
2889 + tb->zeroes_num -= tb->lbytes;
2891 + RFALSE(ih_item_len(ih) <= 0,
2892 + "PAP-12085: there is nothing to insert into S[0]: "
2893 + "ih_item_len=%d", ih_item_len(ih));
2894 + } else {
2895 + /* the new item falls into L[0] in its entirety */
2896 + /* Shift lnum[0]-1 items to L[0] */
2897 + ret = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes);
2899 + /* Insert new item into L[0] */
2900 + buffer_info_init_left(tb, &bi);
2901 + leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
2902 + tb->zeroes_num);
2903 + tb->insert_size[0] = 0;
2904 + tb->zeroes_num = 0;
2906 + return body_shift_bytes;
2909 +static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
2910 + struct item_head * const ih,
2911 + const char * const body)
2913 + int n = B_NR_ITEMS(tb->L[0]);
2914 + struct buffer_info bi;
2916 + RFALSE(tb->zeroes_num,
2917 + "PAP-12090: invalid parameter in case of a directory");
2919 + /* directory item */
2920 + if (tb->lbytes > tb->pos_in_item) {
2921 + /* new directory entry falls into L[0] */
2922 + struct item_head *pasted;
2923 + int ret, l_pos_in_item = tb->pos_in_item;
2925 + /*
2926 + * Shift lnum[0] - 1 items in whole.
2927 + * Shift lbytes - 1 entries from given directory item
2928 + */
2929 + ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1);
2930 + if (ret && !tb->item_pos) {
2931 + pasted = item_head(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1);
2932 + l_pos_in_item += ih_entry_count(pasted) -
2933 + (tb->lbytes - 1);
2936 + /* Append given directory entry to directory item */
2937 + buffer_info_init_left(tb, &bi);
2938 + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
2939 + l_pos_in_item, tb->insert_size[0],
2940 + body, tb->zeroes_num);
2942 + /*
2943 + * the previous call prepared space for pasting the new entry;
2944 + * the call below pastes that entry
2945 + */
2947 + /*
2948 + * when we have merged the directory item, pos_in_item
2949 + * has been changed too
2950 + */
2952 + /* paste new directory entry. 1 is entry number */
2953 + leaf_paste_entries(&bi, n + tb->item_pos - ret,
2954 + l_pos_in_item, 1,
2955 + (struct reiserfs_de_head *) body,
2956 + body + DEH_SIZE, tb->insert_size[0]);
2957 + tb->insert_size[0] = 0;
2958 + } else {
2959 + /* new directory entry doesn't fall into L[0] */
2960 + /*
2961 + * Shift lnum[0]-1 items in whole. Shift lbytes
2962 + * directory entries from directory item number lnum[0]
2963 + */
2964 + leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
2967 + /* Calculate new position to append in item body */
2968 + tb->pos_in_item -= tb->lbytes;
2971 +static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb,
2972 + struct item_head * const ih,
2973 + const char * const body)
2975 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2976 + int n = B_NR_ITEMS(tb->L[0]);
2977 + struct buffer_info bi;
2978 + int body_shift_bytes = 0;
2980 + if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
2981 + balance_leaf_paste_left_shift_dirent(tb, ih, body);
2982 + return 0;
2985 + RFALSE(tb->lbytes <= 0,
2986 + "PAP-12095: there is nothing to shift to L[0]. "
2987 + "lbytes=%d", tb->lbytes);
2988 + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
2989 + "PAP-12100: incorrect position to paste: "
2990 + "item_len=%d, pos_in_item=%d",
2991 + ih_item_len(item_head(tbS0, tb->item_pos)), tb->pos_in_item);
2993 + /* the appended item will be entirely in L[0] */
2994 + if (tb->lbytes >= tb->pos_in_item) {
2995 + struct item_head *tbS0_pos_ih, *tbL0_ih;
2996 + struct item_head *tbS0_0_ih;
2997 + struct reiserfs_key *left_delim_key;
2998 + int ret, l_n, version, temp_l;
3000 + tbS0_pos_ih = item_head(tbS0, tb->item_pos);
3001 + tbS0_0_ih = item_head(tbS0, 0);
3003 + /*
3004 + * this number of bytes must be appended
3005 + * to the last item of L[h]
3006 + */
3007 + l_n = tb->lbytes - tb->pos_in_item;
3009 + /* Calculate new insert_size[0] */
3010 + tb->insert_size[0] -= l_n;
3012 + RFALSE(tb->insert_size[0] <= 0,
3013 + "PAP-12105: there is nothing to paste into "
3014 + "L[0]. insert_size=%d", tb->insert_size[0]);
3016 + ret = leaf_shift_left(tb, tb->lnum[0],
3017 + ih_item_len(tbS0_pos_ih));
3019 + tbL0_ih = item_head(tb->L[0], n + tb->item_pos - ret);
3021 + /* Append to body of item in L[0] */
3022 + buffer_info_init_left(tb, &bi);
3023 + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
3024 + ih_item_len(tbL0_ih), l_n, body,
3025 + min_t(int, l_n, tb->zeroes_num));
3027 + /*
3028 + * 0-th item in S0 can be only of DIRECT type
3029 + * when l_n != 0
3030 + */
3031 + temp_l = l_n;
3033 + RFALSE(ih_item_len(tbS0_0_ih),
3034 + "PAP-12106: item length must be 0");
3035 + RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
3036 + leaf_key(tb->L[0], n + tb->item_pos - ret)),
3037 + "PAP-12107: items must be of the same file");
3039 + if (is_indirect_le_ih(tbL0_ih)) {
3040 + int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
3041 + temp_l = l_n << shift;
3043 + /* update key of first item in S0 */
3044 + version = ih_version(tbS0_0_ih);
3045 + add_le_key_k_offset(version, &tbS0_0_ih->ih_key, temp_l);
3047 + /* update left delimiting key */
3048 + left_delim_key = internal_key(tb->CFL[0], tb->lkey[0]);
3049 + add_le_key_k_offset(version, left_delim_key, temp_l);
3051 + /*
3052 + * Calculate new body, position in item and
3053 + * insert_size[0]
3054 + */
3055 + if (l_n > tb->zeroes_num) {
3056 + body_shift_bytes = l_n - tb->zeroes_num;
3057 + tb->zeroes_num = 0;
3058 + } else
3059 + tb->zeroes_num -= l_n;
3060 + tb->pos_in_item = 0;
3062 + RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
3063 + leaf_key(tb->L[0],
3064 + B_NR_ITEMS(tb->L[0]) - 1)) ||
3065 + !op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size) ||
3066 + !op_is_left_mergeable(left_delim_key, tbS0->b_size),
3067 + "PAP-12120: item must be merge-able with left "
3068 + "neighboring item");
3069 + } else {
3070 + /* only part of the appended item will be in L[0] */
3072 + /* Calculate position in item for append in S[0] */
3073 + tb->pos_in_item -= tb->lbytes;
3075 + RFALSE(tb->pos_in_item <= 0,
3076 + "PAP-12125: no place for paste. pos_in_item=%d",
3077 + tb->pos_in_item);
3079 + /*
3080 + * Shift lnum[0] - 1 items in whole.
3081 + * Shift lbytes - 1 byte from item number lnum[0]
3082 + */
3083 + leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
3085 + return body_shift_bytes;
3089 +/* the appended item will fall into L[0] in its entirety */
3090 +static void balance_leaf_paste_left_whole(struct tree_balance *tb,
3091 + struct item_head * const ih,
3092 + const char * const body)
3094 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3095 + int n = B_NR_ITEMS(tb->L[0]);
3096 + struct buffer_info bi;
3097 + struct item_head *pasted;
3098 + int ret;
3100 + /* if we paste into the first item of S[0] and it is left mergeable */
3101 + if (!tb->item_pos &&
3102 + op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size)) {
3103 + /*
3104 + * then increment pos_in_item by the size of the
3105 + * last item in L[0]
3106 + */
3107 + pasted = item_head(tb->L[0], n - 1);
3108 + if (is_direntry_le_ih(pasted))
3109 + tb->pos_in_item += ih_entry_count(pasted);
3110 + else
3111 + tb->pos_in_item += ih_item_len(pasted);
3114 + /*
3115 + * Shift lnum[0] - 1 items in whole.
3116 + * Shift lbytes - 1 byte from item number lnum[0]
3117 + */
3118 + ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
3120 + /* Append to body of item in L[0] */
3121 + buffer_info_init_left(tb, &bi);
3122 + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, tb->pos_in_item,
3123 + tb->insert_size[0], body, tb->zeroes_num);
3125 + /* if appended item is directory, paste entry */
3126 + pasted = item_head(tb->L[0], n + tb->item_pos - ret);
3127 + if (is_direntry_le_ih(pasted))
3128 + leaf_paste_entries(&bi, n + tb->item_pos - ret,
3129 + tb->pos_in_item, 1,
3130 + (struct reiserfs_de_head *)body,
3131 + body + DEH_SIZE, tb->insert_size[0]);
3133 + /*
3134 + * if appended item is indirect item, put unformatted node
3135 + * into un list
3136 + */
3137 + if (is_indirect_le_ih(pasted))
3138 + set_ih_free_space(pasted, 0);
3140 + tb->insert_size[0] = 0;
3141 + tb->zeroes_num = 0;
3144 +static unsigned int balance_leaf_paste_left(struct tree_balance *tb,
3145 + struct item_head * const ih,
3146 + const char * const body)
3148 + /* we must shift the part of the appended item */
3149 + if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1)
3150 + return balance_leaf_paste_left_shift(tb, ih, body);
3151 + else
3152 + balance_leaf_paste_left_whole(tb, ih, body);
3153 + return 0;
3156 +/* Shift lnum[0] items from S[0] to the left neighbor L[0] */
3157 +static unsigned int balance_leaf_left(struct tree_balance *tb,
3158 + struct item_head * const ih,
3159 + const char * const body, int flag)
3161 + if (tb->lnum[0] <= 0)
3162 + return 0;
3164 + /* new item, or part of it, falls into L[0]; shift it too */
3165 + if (tb->item_pos < tb->lnum[0]) {
3166 + BUG_ON(flag != M_INSERT && flag != M_PASTE);
3168 + if (flag == M_INSERT)
3169 + return balance_leaf_insert_left(tb, ih, body);
3170 + else /* M_PASTE */
3171 + return balance_leaf_paste_left(tb, ih, body);
3172 + } else
3173 + /* new item doesn't fall into L[0] */
3174 + leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
3175 + return 0;
3179 +static void balance_leaf_insert_right(struct tree_balance *tb,
3180 + struct item_head * const ih,
3181 + const char * const body)
3184 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3185 + int n = B_NR_ITEMS(tbS0);
3186 + struct buffer_info bi;
3188 + /* new item or part of it doesn't fall into R[0] */
3189 + if (n - tb->rnum[0] >= tb->item_pos) {
3190 + leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
3191 + return;
3194 + /* new item or its part falls to R[0] */
3196 + /* part of new item falls into R[0] */
3197 + if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {
3198 + loff_t old_key_comp, old_len, r_zeroes_number;
3199 + const char *r_body;
3200 + int shift;
3201 + loff_t offset;
3203 + leaf_shift_right(tb, tb->rnum[0] - 1, -1);
3205 + /* Remember key component and item length */
3206 + old_key_comp = le_ih_k_offset(ih);
3207 + old_len = ih_item_len(ih);
3209 + /*
3210 + * Calculate key component and item length to insert
3211 + * into R[0]
3212 + */
3213 + shift = 0;
3214 + if (is_indirect_le_ih(ih))
3215 + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
3216 + offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << shift);
3217 + set_le_ih_k_offset(ih, offset);
3218 + put_ih_item_len(ih, tb->rbytes);
3220 + /* Insert part of the item into R[0] */
3221 + buffer_info_init_right(tb, &bi);
3222 + if ((old_len - tb->rbytes) > tb->zeroes_num) {
3223 + r_zeroes_number = 0;
3224 + r_body = body + (old_len - tb->rbytes) - tb->zeroes_num;
3225 + } else {
3226 + r_body = body;
3227 + r_zeroes_number = tb->zeroes_num -
3228 + (old_len - tb->rbytes);
3229 + tb->zeroes_num -= r_zeroes_number;
3232 + leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
3234 + /* Replace right delimiting key by first key in R[0] */
3235 + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
3237 + /*
3238 + * Calculate key component and item length to
3239 + * insert into S[0]
3240 + */
3241 + set_le_ih_k_offset(ih, old_key_comp);
3242 + put_ih_item_len(ih, old_len - tb->rbytes);
3244 + tb->insert_size[0] -= tb->rbytes;
3246 + } else {
3247 + /* whole new item falls into R[0] */
3249 + /* Shift rnum[0]-1 items to R[0] */
3250 + leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
3252 + /* Insert new item into R[0] */
3253 + buffer_info_init_right(tb, &bi);
3254 + leaf_insert_into_buf(&bi, tb->item_pos - n + tb->rnum[0] - 1,
3255 + ih, body, tb->zeroes_num);
3257 + if (tb->item_pos - n + tb->rnum[0] - 1 == 0)
3258 + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
3260 + tb->zeroes_num = tb->insert_size[0] = 0;
3265 +static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
3266 + struct item_head * const ih,
3267 + const char * const body)
3269 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3270 + struct buffer_info bi;
3271 + int entry_count;
3273 + RFALSE(tb->zeroes_num,
3274 + "PAP-12145: invalid parameter in case of a directory");
3275 + entry_count = ih_entry_count(item_head(tbS0, tb->item_pos));
3277 + /* new directory entry falls into R[0] */
3278 + if (entry_count - tb->rbytes < tb->pos_in_item) {
3279 + int paste_entry_position;
3281 + RFALSE(tb->rbytes - 1 >= entry_count || !tb->insert_size[0],
3282 + "PAP-12150: not enough entries to shift to R[0]: "
3283 + "rbytes=%d, entry_count=%d", tb->rbytes, entry_count);
3285 + /*
3286 + * Shift rnum[0]-1 items in whole.
3287 + * Shift rbytes-1 directory entries from directory
3288 + * item number rnum[0]
3289 + */
3290 + leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1);
3292 + /* Paste given directory entry to directory item */
3293 + paste_entry_position = tb->pos_in_item - entry_count +
3294 + tb->rbytes - 1;
3295 + buffer_info_init_right(tb, &bi);
3296 + leaf_paste_in_buffer(&bi, 0, paste_entry_position,
3297 + tb->insert_size[0], body, tb->zeroes_num);
3299 + /* paste entry */
3300 + leaf_paste_entries(&bi, 0, paste_entry_position, 1,
3301 + (struct reiserfs_de_head *) body,
3302 + body + DEH_SIZE, tb->insert_size[0]);
3304 + /* change delimiting keys */
3305 + if (paste_entry_position == 0)
3306 + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
3308 + tb->insert_size[0] = 0;
3309 + tb->pos_in_item++;
3310 + } else {
3311 + /* new directory entry doesn't fall into R[0] */
3312 + leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
3316 +static void balance_leaf_paste_right_shift(struct tree_balance *tb,
3317 + struct item_head * const ih,
3318 + const char * const body)
3320 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3321 + int n_shift, n_rem, r_zeroes_number, version;
3322 + unsigned long temp_rem;
3323 + const char *r_body;
3324 + struct buffer_info bi;
3326 + /* we append to directory item */
3327 + if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
3328 + balance_leaf_paste_right_shift_dirent(tb, ih, body);
3329 + return;
3332 + /* regular object */
3334 + /*
3335 + * Calculate number of bytes which must be shifted
3336 + * from appended item
3337 + */
3338 + n_shift = tb->rbytes - tb->insert_size[0];
3339 + if (n_shift < 0)
3340 + n_shift = 0;
3342 + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
3343 + "PAP-12155: invalid position to paste. ih_item_len=%d, "
3344 + "pos_in_item=%d", tb->pos_in_item,
3345 + ih_item_len(item_head(tbS0, tb->item_pos)));
3347 + leaf_shift_right(tb, tb->rnum[0], n_shift);
3349 + /*
3350 + * Calculate number of bytes which must remain in body
3351 + * after appending to R[0]
3352 + */
3353 + n_rem = tb->insert_size[0] - tb->rbytes;
3354 + if (n_rem < 0)
3355 + n_rem = 0;
3357 + temp_rem = n_rem;
3359 + version = ih_version(item_head(tb->R[0], 0));
3361 + if (is_indirect_le_key(version, leaf_key(tb->R[0], 0))) {
3362 + int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
3363 + temp_rem = n_rem << shift;
3366 + add_le_key_k_offset(version, leaf_key(tb->R[0], 0), temp_rem);
3367 + add_le_key_k_offset(version, internal_key(tb->CFR[0], tb->rkey[0]),
3368 + temp_rem);
3370 + do_balance_mark_internal_dirty(tb, tb->CFR[0], 0);
3372 + /* Append part of body into R[0] */
3373 + buffer_info_init_right(tb, &bi);
3374 + if (n_rem > tb->zeroes_num) {
3375 + r_zeroes_number = 0;
3376 + r_body = body + n_rem - tb->zeroes_num;
3377 + } else {
3378 + r_body = body;
3379 + r_zeroes_number = tb->zeroes_num - n_rem;
3380 + tb->zeroes_num -= r_zeroes_number;
3383 + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
3384 + r_body, r_zeroes_number);
3386 + if (is_indirect_le_ih(item_head(tb->R[0], 0)))
3387 + set_ih_free_space(item_head(tb->R[0], 0), 0);
3389 + tb->insert_size[0] = n_rem;
3390 + if (!n_rem)
3391 + tb->pos_in_item++;
3394 +static void balance_leaf_paste_right_whole(struct tree_balance *tb,
3395 + struct item_head * const ih,
3396 + const char * const body)
3398 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3399 + int n = B_NR_ITEMS(tbS0);
3400 + struct item_head *pasted;
3401 + struct buffer_info bi;
3403 + buffer_info_init_right(tb, &bi);
3404 + leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
3406 + /* append item in R[0] */
3407 + if (tb->pos_in_item >= 0) {
3408 + buffer_info_init_right(tb, &bi);
3409 + leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->rnum[0],
3410 + tb->pos_in_item, tb->insert_size[0], body,
3411 + tb->zeroes_num);
3414 + /* paste new entry, if item is directory item */
3415 + pasted = item_head(tb->R[0], tb->item_pos - n + tb->rnum[0]);
3416 + if (is_direntry_le_ih(pasted) && tb->pos_in_item >= 0) {
3417 + leaf_paste_entries(&bi, tb->item_pos - n + tb->rnum[0],
3418 + tb->pos_in_item, 1,
3419 + (struct reiserfs_de_head *)body,
3420 + body + DEH_SIZE, tb->insert_size[0]);
3422 + if (!tb->pos_in_item) {
3424 + RFALSE(tb->item_pos - n + tb->rnum[0],
3425 + "PAP-12165: directory item must be first "
3426 + "item of node when pasting is in 0th position");
3428 + /* update delimiting keys */
3429 + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
3433 + if (is_indirect_le_ih(pasted))
3434 + set_ih_free_space(pasted, 0);
3435 + tb->zeroes_num = tb->insert_size[0] = 0;
3438 +static void balance_leaf_paste_right(struct tree_balance *tb,
3439 + struct item_head * const ih,
3440 + const char * const body)
3442 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3443 + int n = B_NR_ITEMS(tbS0);
3445 + /* new item doesn't fall into R[0] */
3446 + if (n - tb->rnum[0] > tb->item_pos) {
3447 + leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
3448 + return;
3451 + /* pasted item or part of it falls to R[0] */
3453 + if (tb->item_pos == n - tb->rnum[0] && tb->rbytes != -1)
3454 + /* we must shift the part of the appended item */
3455 + balance_leaf_paste_right_shift(tb, ih, body);
3456 + else
3457 + /* pasted item in whole falls into R[0] */
3458 + balance_leaf_paste_right_whole(tb, ih, body);
3461 +/* shift rnum[0] items from S[0] to the right neighbor R[0] */
3462 +static void balance_leaf_right(struct tree_balance *tb,
3463 + struct item_head * const ih,
3464 + const char * const body, int flag)
3466 + if (tb->rnum[0] <= 0)
3467 + return;
3469 + BUG_ON(flag != M_INSERT && flag != M_PASTE);
3471 + if (flag == M_INSERT)
3472 + balance_leaf_insert_right(tb, ih, body);
3473 + else /* M_PASTE */
3474 + balance_leaf_paste_right(tb, ih, body);
3477 +static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
3478 + struct item_head * const ih,
3479 + const char * const body,
3480 + struct item_head *insert_key,
3481 + struct buffer_head **insert_ptr,
3482 + int i)
3484 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3485 + int n = B_NR_ITEMS(tbS0);
3486 + struct buffer_info bi;
3487 + int shift;
3489 + /* new item, or part of it, doesn't fall into S_new[i] */
3490 + if (n - tb->snum[i] >= tb->item_pos) {
3491 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
3492 + tb->snum[i], tb->sbytes[i], tb->S_new[i]);
3493 + return;
3496 + /* new item or its part falls into the first new node S_new[i] */
3498 + /* part of new item falls into S_new[i] */
3499 + if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) {
3500 + int old_key_comp, old_len, r_zeroes_number;
3501 + const char *r_body;
3503 + /* Move snum[i]-1 items from S[0] to S_new[i] */
3504 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1,
3505 + tb->S_new[i]);
3507 + /* Remember key component and item length */
3508 + old_key_comp = le_ih_k_offset(ih);
3509 + old_len = ih_item_len(ih);
3511 + /*
3512 + * Calculate key component and item length to insert
3513 + * into S_new[i]
3514 + */
3515 + shift = 0;
3516 + if (is_indirect_le_ih(ih))
3517 + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
3518 + set_le_ih_k_offset(ih,
3519 + le_ih_k_offset(ih) +
3520 + ((old_len - tb->sbytes[i]) << shift));
3522 + put_ih_item_len(ih, tb->sbytes[i]);
3524 + /* Insert part of the item into S_new[i] before 0-th item */
3525 + buffer_info_init_bh(tb, &bi, tb->S_new[i]);
3527 + if ((old_len - tb->sbytes[i]) > tb->zeroes_num) {
3528 + r_zeroes_number = 0;
3529 + r_body = body + (old_len - tb->sbytes[i]) -
3530 + tb->zeroes_num;
3531 + } else {
3532 + r_body = body;
3533 + r_zeroes_number = tb->zeroes_num - (old_len -
3534 + tb->sbytes[i]);
3535 + tb->zeroes_num -= r_zeroes_number;
3538 + leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
3540 + /*
3541 + * Calculate key component and item length to
3542 + * insert into S[i]
3543 + */
3544 + set_le_ih_k_offset(ih, old_key_comp);
3545 + put_ih_item_len(ih, old_len - tb->sbytes[i]);
3546 + tb->insert_size[0] -= tb->sbytes[i];
3547 + } else {
3548 + /* whole new item falls into S_new[i] */
3550 + /*
3551 + * Shift snum[i] - 1 items to S_new[i]
3552 + * (sbytes[i] of split item)
3553 + */
3554 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
3555 + tb->snum[i] - 1, tb->sbytes[i], tb->S_new[i]);
3557 + /* Insert new item into S_new[i] */
3558 + buffer_info_init_bh(tb, &bi, tb->S_new[i]);
3559 + leaf_insert_into_buf(&bi, tb->item_pos - n + tb->snum[i] - 1,
3560 + ih, body, tb->zeroes_num);
3562 + tb->zeroes_num = tb->insert_size[0] = 0;
3566 +/* we append to directory item */
3567 +static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
3568 + struct item_head * const ih,
3569 + const char * const body,
3570 + struct item_head *insert_key,
3571 + struct buffer_head **insert_ptr,
3572 + int i)
3574 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3575 + struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
3576 + int entry_count = ih_entry_count(aux_ih);
3577 + struct buffer_info bi;
3579 + if (entry_count - tb->sbytes[i] < tb->pos_in_item &&
3580 + tb->pos_in_item <= entry_count) {
3581 + /* new directory entry falls into S_new[i] */
3583 + RFALSE(!tb->insert_size[0],
3584 + "PAP-12215: insert_size is already 0");
3585 + RFALSE(tb->sbytes[i] - 1 >= entry_count,
3586 + "PAP-12220: there are no so much entries (%d), only %d",
3587 + tb->sbytes[i] - 1, entry_count);
3589 + /*
3590 + * Shift snum[i]-1 items in whole.
3591 + * Shift sbytes[i] directory entries
3592 + * from directory item number snum[i]
3593 + */
3594 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
3595 + tb->sbytes[i] - 1, tb->S_new[i]);
3597 + /*
3598 + * Paste given directory entry to
3599 + * directory item
3600 + */
3601 + buffer_info_init_bh(tb, &bi, tb->S_new[i]);
3602 + leaf_paste_in_buffer(&bi, 0, tb->pos_in_item - entry_count +
3603 + tb->sbytes[i] - 1, tb->insert_size[0],
3604 + body, tb->zeroes_num);
3606 + /* paste new directory entry */
3607 + leaf_paste_entries(&bi, 0, tb->pos_in_item - entry_count +
3608 + tb->sbytes[i] - 1, 1,
3609 + (struct reiserfs_de_head *) body,
3610 + body + DEH_SIZE, tb->insert_size[0]);
3612 + tb->insert_size[0] = 0;
3613 + tb->pos_in_item++;
3614 + } else {
3615 + /* new directory entry doesn't fall into S_new[i] */
3616 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
3617 + tb->sbytes[i], tb->S_new[i]);
3622 +static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
3623 + struct item_head * const ih,
3624 + const char * const body,
3625 + struct item_head *insert_key,
3626 + struct buffer_head **insert_ptr,
3627 + int i)
3629 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3630 + struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
3631 + int n_shift, n_rem, r_zeroes_number, shift;
3632 + const char *r_body;
3633 + struct item_head *tmp;
3634 + struct buffer_info bi;
3636 + RFALSE(ih, "PAP-12210: ih must be 0");
3638 + if (is_direntry_le_ih(aux_ih)) {
3639 + balance_leaf_new_nodes_paste_dirent(tb, ih, body, insert_key,
3640 + insert_ptr, i);
3641 + return;
3644 + /* regular object */
3647 + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)) ||
3648 + tb->insert_size[0] <= 0,
3649 + "PAP-12225: item too short or insert_size <= 0");
3651 + /*
3652 + * Calculate number of bytes which must be shifted from appended item
3653 + */
3654 + n_shift = tb->sbytes[i] - tb->insert_size[0];
3655 + if (n_shift < 0)
3656 + n_shift = 0;
3657 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], n_shift,
3658 + tb->S_new[i]);
3660 + /*
3661 + * Calculate number of bytes which must remain in body after
3662 + * append to S_new[i]
3663 + */
3664 + n_rem = tb->insert_size[0] - tb->sbytes[i];
3665 + if (n_rem < 0)
3666 + n_rem = 0;
3668 + /* Append part of body into S_new[i] */
3669 + buffer_info_init_bh(tb, &bi, tb->S_new[i]);
3670 + if (n_rem > tb->zeroes_num) {
3671 + r_zeroes_number = 0;
3672 + r_body = body + n_rem - tb->zeroes_num;
3673 + } else {
3674 + r_body = body;
3675 + r_zeroes_number = tb->zeroes_num - n_rem;
3676 + tb->zeroes_num -= r_zeroes_number;
3679 + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
3680 + r_body, r_zeroes_number);
3682 + tmp = item_head(tb->S_new[i], 0);
3683 + shift = 0;
3684 + if (is_indirect_le_ih(tmp)) {
3685 + set_ih_free_space(tmp, 0);
3686 + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
3688 + add_le_ih_k_offset(tmp, n_rem << shift);
3690 + tb->insert_size[0] = n_rem;
3691 + if (!n_rem)
3692 + tb->pos_in_item++;
3695 +static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
3696 + struct item_head * const ih,
3697 + const char * const body,
3698 + struct item_head *insert_key,
3699 + struct buffer_head **insert_ptr,
3700 + int i)
3703 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3704 + int n = B_NR_ITEMS(tbS0);
3705 + int leaf_mi;
3706 + struct item_head *pasted;
3707 + struct buffer_info bi;
3709 +#ifdef CONFIG_REISERFS_CHECK
3710 + struct item_head *ih_check = item_head(tbS0, tb->item_pos);
3712 + if (!is_direntry_le_ih(ih_check) &&
3713 + (tb->pos_in_item != ih_item_len(ih_check) ||
3714 + tb->insert_size[0] <= 0))
3715 + reiserfs_panic(tb->tb_sb,
3716 + "PAP-12235",
3717 + "pos_in_item must be equal to ih_item_len");
3718 +#endif
3720 + leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
3721 + tb->sbytes[i], tb->S_new[i]);
3723 + RFALSE(leaf_mi,
3724 + "PAP-12240: unexpected value returned by leaf_move_items (%d)",
3725 + leaf_mi);
3727 + /* paste into item */
3728 + buffer_info_init_bh(tb, &bi, tb->S_new[i]);
3729 + leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->snum[i],
3730 + tb->pos_in_item, tb->insert_size[0],
3731 + body, tb->zeroes_num);
3733 + pasted = item_head(tb->S_new[i], tb->item_pos - n +
3734 + tb->snum[i]);
3735 + if (is_direntry_le_ih(pasted))
3736 + leaf_paste_entries(&bi, tb->item_pos - n + tb->snum[i],
3737 + tb->pos_in_item, 1,
3738 + (struct reiserfs_de_head *)body,
3739 + body + DEH_SIZE, tb->insert_size[0]);
3741 + /* if we paste to indirect item update ih_free_space */
3742 + if (is_indirect_le_ih(pasted))
3743 + set_ih_free_space(pasted, 0);
3745 + tb->zeroes_num = tb->insert_size[0] = 0;
3748 +static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
3749 + struct item_head * const ih,
3750 + const char * const body,
3751 + struct item_head *insert_key,
3752 + struct buffer_head **insert_ptr,
3753 + int i)
3755 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3756 + int n = B_NR_ITEMS(tbS0);
3758 + /* pasted item doesn't fall into S_new[i] */
3759 + if (n - tb->snum[i] > tb->item_pos) {
3760 + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
3761 + tb->snum[i], tb->sbytes[i], tb->S_new[i]);
3762 + return;
3765 + /* pasted item or part of it falls into S_new[i] */
3767 + if (tb->item_pos == n - tb->snum[i] && tb->sbytes[i] != -1)
3768 + /* we must shift part of the appended item */
3769 + balance_leaf_new_nodes_paste_shift(tb, ih, body, insert_key,
3770 + insert_ptr, i);
3771 + else
3772 + /* item falls wholly into S_new[i] */
3773 + balance_leaf_new_nodes_paste_whole(tb, ih, body, insert_key,
3774 + insert_ptr, i);
3777 +/* Fill new nodes that appear in place of S[0] */
3778 +static void balance_leaf_new_nodes(struct tree_balance *tb,
3779 + struct item_head * const ih,
3780 + const char * const body,
3781 + struct item_head *insert_key,
3782 + struct buffer_head **insert_ptr,
3783 + int flag)
3785 + int i;
3786 + for (i = tb->blknum[0] - 2; i >= 0; i--) {
3787 + BUG_ON(flag != M_INSERT && flag != M_PASTE);
3789 + RFALSE(!tb->snum[i],
3790 + "PAP-12200: snum[%d] == %d. Must be > 0", i,
3791 + tb->snum[i]);
3793 + /* here we shift from S to S_new nodes */
3795 + tb->S_new[i] = get_FEB(tb);
3797 + /* initialize block type and tree level */
3798 + set_blkh_level(B_BLK_HEAD(tb->S_new[i]), DISK_LEAF_NODE_LEVEL);
3800 + if (flag == M_INSERT)
3801 + balance_leaf_new_nodes_insert(tb, ih, body, insert_key,
3802 + insert_ptr, i);
3803 + else /* M_PASTE */
3804 + balance_leaf_new_nodes_paste(tb, ih, body, insert_key,
3805 + insert_ptr, i);
3807 + memcpy(insert_key + i, leaf_key(tb->S_new[i], 0), KEY_SIZE);
3808 + insert_ptr[i] = tb->S_new[i];
3810 + RFALSE(!buffer_journaled(tb->S_new[i])
3811 + || buffer_journal_dirty(tb->S_new[i])
3812 + || buffer_dirty(tb->S_new[i]),
3813 + "PAP-12247: S_new[%d] : (%b)",
3814 + i, tb->S_new[i]);
3818 +static void balance_leaf_finish_node_insert(struct tree_balance *tb,
3819 + struct item_head * const ih,
3820 + const char * const body)
3822 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3823 + struct buffer_info bi;
3824 + buffer_info_init_tbS0(tb, &bi);
3825 + leaf_insert_into_buf(&bi, tb->item_pos, ih, body, tb->zeroes_num);
3827 + /* If we insert the first key change the delimiting key */
3828 + if (tb->item_pos == 0) {
3829 + if (tb->CFL[0]) /* can be 0 in reiserfsck */
3830 + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
3835 +static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
3836 + struct item_head * const ih,
3837 + const char * const body)
3839 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3840 + struct item_head *pasted = item_head(tbS0, tb->item_pos);
3841 + struct buffer_info bi;
3843 + if (tb->pos_in_item >= 0 && tb->pos_in_item <= ih_entry_count(pasted)) {
3844 + RFALSE(!tb->insert_size[0],
3845 + "PAP-12260: insert_size is 0 already");
3847 + /* prepare space */
3848 + buffer_info_init_tbS0(tb, &bi);
3849 + leaf_paste_in_buffer(&bi, tb->item_pos, tb->pos_in_item,
3850 + tb->insert_size[0], body, tb->zeroes_num);
3852 + /* paste entry */
3853 + leaf_paste_entries(&bi, tb->item_pos, tb->pos_in_item, 1,
3854 + (struct reiserfs_de_head *)body,
3855 + body + DEH_SIZE, tb->insert_size[0]);
3857 + if (!tb->item_pos && !tb->pos_in_item) {
3858 + RFALSE(!tb->CFL[0] || !tb->L[0],
3859 + "PAP-12270: CFL[0]/L[0] must be specified");
3860 + if (tb->CFL[0])
3861 + replace_key(tb, tb->CFL[0], tb->lkey[0],
3862 + tbS0, 0);
3865 + tb->insert_size[0] = 0;
3869 +static void balance_leaf_finish_node_paste(struct tree_balance *tb,
3870 + struct item_head * const ih,
3871 + const char * const body)
3873 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3874 + struct buffer_info bi;
3875 + struct item_head *pasted = item_head(tbS0, tb->item_pos);
3877 + /* for a directory item, the new entry may already be pasted */
3878 + if (is_direntry_le_ih(pasted)) {
3879 + balance_leaf_finish_node_paste_dirent(tb, ih, body);
3880 + return;
3883 + /* regular object */
3885 + if (tb->pos_in_item == ih_item_len(pasted)) {
3886 + RFALSE(tb->insert_size[0] <= 0,
3887 + "PAP-12275: insert size must not be %d",
3888 + tb->insert_size[0]);
3889 + buffer_info_init_tbS0(tb, &bi);
3890 + leaf_paste_in_buffer(&bi, tb->item_pos,
3891 + tb->pos_in_item, tb->insert_size[0], body,
3892 + tb->zeroes_num);
3894 + if (is_indirect_le_ih(pasted))
3895 + set_ih_free_space(pasted, 0);
3897 + tb->insert_size[0] = 0;
3899 +#ifdef CONFIG_REISERFS_CHECK
3900 + else if (tb->insert_size[0]) {
3901 + print_cur_tb("12285");
3902 + reiserfs_panic(tb->tb_sb, "PAP-12285",
3903 + "insert_size must be 0 (%d)", tb->insert_size[0]);
3905 +#endif
3909 + * if the affected item was not wholly shifted then we
3910 + * perform all necessary operations on that part or whole
3911 + * of the affected item which remains in S
3912 + */
3913 +static void balance_leaf_finish_node(struct tree_balance *tb,
3914 + struct item_head * const ih,
3915 + const char * const body, int flag)
3917 + /* if we must insert or append into buffer S[0] */
3918 + if (0 <= tb->item_pos && tb->item_pos < tb->s0num) {
3919 + if (flag == M_INSERT)
3920 + balance_leaf_finish_node_insert(tb, ih, body);
3921 + else /* M_PASTE */
3922 + balance_leaf_finish_node_paste(tb, ih, body);
3926 +/**
3927 + * balance_leaf - reiserfs tree balancing algorithm
3928 + * @tb: tree balance state
3929 + * @ih: item header of inserted item (little endian)
3930 + * @body: body of inserted item or bytes to paste
3931 + * @flag: i - insert, d - delete, c - cut, p - paste (see do_balance)
3932 + * passed back:
3933 + * @insert_key: key to insert new nodes
3934 + * @insert_ptr: array of nodes to insert at the next level
3936 + * In our processing of one level we sometimes determine what must be
3937 + * inserted into the next higher level. This insertion consists of a
3938 + * key or two keys and their corresponding pointers.
3939 + */
3940 +static int balance_leaf(struct tree_balance *tb, struct item_head *ih,
3941 + const char *body, int flag,
3942 + struct item_head *insert_key,
3943 + struct buffer_head **insert_ptr)
3945 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
3947 + PROC_INFO_INC(tb->tb_sb, balance_at[0]);
3949 + /* Handle balancing when insert_size[0] < 0 (delete or cut) */
3950 + if (tb->insert_size[0] < 0)
3951 + return balance_leaf_when_delete(tb, flag);
3953 + tb->item_pos = PATH_LAST_POSITION(tb->tb_path),
3954 + tb->pos_in_item = tb->tb_path->pos_in_item,
3955 + tb->zeroes_num = 0;
3956 + if (flag == M_INSERT && !body)
3957 + tb->zeroes_num = ih_item_len(ih);
3959 + /*
3960 + * for indirect item pos_in_item is measured in unformatted node
3961 + * pointers. Recalculate to bytes
3962 + */
3963 + if (flag != M_INSERT
3964 + && is_indirect_le_ih(item_head(tbS0, tb->item_pos)))
3965 + tb->pos_in_item *= UNFM_P_SIZE;
3967 + body += balance_leaf_left(tb, ih, body, flag);
3969 + /* tb->lnum[0] > 0 */
3970 + /* Calculate new item position */
3971 + tb->item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0));
3973 + balance_leaf_right(tb, ih, body, flag);
3975 + /* tb->rnum[0] > 0 */
3976 + RFALSE(tb->blknum[0] > 3,
3977 + "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]);
3978 + RFALSE(tb->blknum[0] < 0,
3979 + "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]);
3981 + /*
3982 + * while adding to a node we may discover that it is possible to
3983 + * split it in two, merging the left part into the left neighbor and
3984 + * the right part into the right neighbor, eliminating the node
3985 + */
3986 + if (tb->blknum[0] == 0) { /* node S[0] is empty now */
3988 + RFALSE(!tb->lnum[0] || !tb->rnum[0],
3989 + "PAP-12190: lnum and rnum must not be zero");
3990 + /*
3991 + * if insertion was done before the 0-th position in R[0], the
3992 + * right delimiting key of tb->L[0] and the left delimiting key
3993 + * are not set correctly
3994 + */
3995 + if (tb->CFL[0]) {
3996 + if (!tb->CFR[0])
3997 + reiserfs_panic(tb->tb_sb, "vs-12195",
3998 + "CFR not initialized");
3999 + copy_key(internal_key(tb->CFL[0], tb->lkey[0]),
4000 + internal_key(tb->CFR[0], tb->rkey[0]));
4001 + do_balance_mark_internal_dirty(tb, tb->CFL[0], 0);
4004 + reiserfs_invalidate_buffer(tb, tbS0);
4005 + return 0;
4008 + balance_leaf_new_nodes(tb, ih, body, insert_key, insert_ptr, flag);
4010 + balance_leaf_finish_node(tb, ih, body, flag);
4012 +#ifdef CONFIG_REISERFS_CHECK
4013 + if (flag == M_PASTE && tb->insert_size[0]) {
4014 + print_cur_tb("12290");
4015 + reiserfs_panic(tb->tb_sb,
4016 + "PAP-12290", "insert_size is still not 0 (%d)",
4017 + tb->insert_size[0]);
4019 +#endif
4021 + /* Leaf level of the tree is balanced (end of balance_leaf) */
4022 + return 0;
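+/*
+ * Illustrative sketch (assumed example values, not from the original
+ * source): how balance_leaf() recomputes item_pos after the left
+ * shift.  Suppose S[0] holds 10 items, tb->item_pos == 5,
+ * tb->lnum[0] == 3 and tb->lbytes == 2 (the boundary item is split).
+ * Only lnum[0] - 1 items leave S[0] entirely, so:
+ *
+ *   item_pos -= lnum[0] - (lbytes != -1 ? 1 : 0);  /* 5 - (3 - 1) == 3 */
+ *
+ * the insertion point moves from position 5 to position 3 among the
+ * items remaining in S[0].
+ */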
4025 +/* Make empty node */
4026 +void make_empty_node(struct buffer_info *bi)
4028 + struct block_head *blkh;
4030 + RFALSE(bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL");
4032 + blkh = B_BLK_HEAD(bi->bi_bh);
4033 + set_blkh_nr_item(blkh, 0);
4034 + set_blkh_free_space(blkh, MAX_CHILD_SIZE(bi->bi_bh));
4036 + if (bi->bi_parent)
4037 + B_N_CHILD(bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */
4040 +/* Get first empty buffer */
4041 +struct buffer_head *get_FEB(struct tree_balance *tb)
4043 + int i;
4044 + struct buffer_info bi;
4046 + for (i = 0; i < MAX_FEB_SIZE; i++)
4047 + if (tb->FEB[i] != NULL)
4048 + break;
4050 + if (i == MAX_FEB_SIZE)
4051 + reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty");
4053 + buffer_info_init_bh(tb, &bi, tb->FEB[i]);
4054 + make_empty_node(&bi);
4055 + set_buffer_uptodate(tb->FEB[i]);
4056 + tb->used[i] = tb->FEB[i];
4057 + tb->FEB[i] = NULL;
4059 + return tb->used[i];
4062 +/* This is now used because reiserfs_free_block has to be able to schedule. */
4063 +static void store_thrown(struct tree_balance *tb, struct buffer_head *bh)
4065 + int i;
4067 + if (buffer_dirty(bh))
4068 + reiserfs_warning(tb->tb_sb, "reiserfs-12320",
4069 + "called with dirty buffer");
4070 + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++)
4071 + if (!tb->thrown[i]) {
4072 + tb->thrown[i] = bh;
4073 + get_bh(bh); /* free_thrown puts this */
4074 + return;
4076 + reiserfs_warning(tb->tb_sb, "reiserfs-12321",
4077 + "too many thrown buffers");
4080 +static void free_thrown(struct tree_balance *tb)
4082 + int i;
4083 + b_blocknr_t blocknr;
4084 + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) {
4085 + if (tb->thrown[i]) {
4086 + blocknr = tb->thrown[i]->b_blocknr;
4087 + if (buffer_dirty(tb->thrown[i]))
4088 + reiserfs_warning(tb->tb_sb, "reiserfs-12322",
4089 + "called with dirty buffer %d",
4090 + blocknr);
4091 + brelse(tb->thrown[i]); /* incremented in store_thrown */
4092 + reiserfs_free_block(tb->transaction_handle, NULL,
4093 + blocknr, 0);
4098 +void reiserfs_invalidate_buffer(struct tree_balance *tb, struct buffer_head *bh)
4100 + struct block_head *blkh;
4101 + blkh = B_BLK_HEAD(bh);
4102 + set_blkh_level(blkh, FREE_LEVEL);
4103 + set_blkh_nr_item(blkh, 0);
4105 + clear_buffer_dirty(bh);
4106 + store_thrown(tb, bh);
4109 +/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src. */
4110 +void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest,
4111 + struct buffer_head *src, int n_src)
4114 + RFALSE(dest == NULL || src == NULL,
4115 + "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)",
4116 + src, dest);
4117 + RFALSE(!B_IS_KEYS_LEVEL(dest),
4118 + "vs-12310: invalid level (%z) for destination buffer. dest must be leaf",
4119 + dest);
4120 + RFALSE(n_dest < 0 || n_src < 0,
4121 + "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest);
4122 + RFALSE(n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src),
4123 + "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big",
4124 + n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest));
4126 + if (B_IS_ITEMS_LEVEL(src))
4127 + /* source buffer contains leaf node */
4128 + memcpy(internal_key(dest, n_dest), item_head(src, n_src),
4129 + KEY_SIZE);
4130 + else
4131 + memcpy(internal_key(dest, n_dest), internal_key(src, n_src),
4132 + KEY_SIZE);
4134 + do_balance_mark_internal_dirty(tb, dest, 0);
4137 +int get_left_neighbor_position(struct tree_balance *tb, int h)
4139 + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
4141 + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FL[h] == NULL,
4142 + "vs-12325: FL[%d](%p) or F[%d](%p) does not exist",
4143 + h, tb->FL[h], h, PATH_H_PPARENT(tb->tb_path, h));
4145 + if (Sh_position == 0)
4146 + return B_NR_ITEMS(tb->FL[h]);
4147 + else
4148 + return Sh_position - 1;
4151 +int get_right_neighbor_position(struct tree_balance *tb, int h)
4153 + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
4155 + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FR[h] == NULL,
4156 + "vs-12330: F[%d](%p) or FR[%d](%p) does not exist",
4157 + h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]);
4159 + if (Sh_position == B_NR_ITEMS(PATH_H_PPARENT(tb->tb_path, h)))
4160 + return 0;
4161 + else
4162 + return Sh_position + 1;
4165 +#ifdef CONFIG_REISERFS_CHECK
4167 +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
4168 +static void check_internal_node(struct super_block *s, struct buffer_head *bh,
4169 + char *mes)
4171 + struct disk_child *dc;
4172 + int i;
4174 + RFALSE(!bh, "PAP-12336: bh == 0");
4176 + if (!bh || !B_IS_IN_TREE(bh))
4177 + return;
4179 + RFALSE(!buffer_dirty(bh) &&
4180 + !(buffer_journaled(bh) || buffer_journal_dirty(bh)),
4181 + "PAP-12337: buffer (%b) must be dirty", bh);
4182 + dc = B_N_CHILD(bh, 0);
4184 + for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) {
4185 + if (!is_reusable(s, dc_block_number(dc), 1)) {
4186 + print_cur_tb(mes);
4187 + reiserfs_panic(s, "PAP-12338",
4188 + "invalid child pointer %y in %b",
4189 + dc, bh);
4194 +static int locked_or_not_in_tree(struct tree_balance *tb,
4195 + struct buffer_head *bh, char *which)
4197 + if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) ||
4198 + !B_IS_IN_TREE(bh)) {
4199 + reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh);
4200 + return 1;
4202 + return 0;
4205 +static int check_before_balancing(struct tree_balance *tb)
4207 + int retval = 0;
4209 + if (REISERFS_SB(tb->tb_sb)->cur_tb) {
4210 + reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
4211 + "occurred based on cur_tb not being null at "
4212 + "this point in code. do_balance cannot properly "
4213 + "handle concurrent tree accesses on a same "
4214 + "mount point.");
4217 + /*
4218 + * double check that buffers that we will modify are unlocked.
4219 + * (fix_nodes should already have prepped all of these for us).
4220 + */
4221 + if (tb->lnum[0]) {
4222 + retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]");
4223 + retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]");
4224 + retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]");
4225 + check_leaf(tb->L[0]);
4227 + if (tb->rnum[0]) {
4228 + retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]");
4229 + retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]");
4230 + retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]");
4231 + check_leaf(tb->R[0]);
4233 + retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path),
4234 + "S[0]");
4235 + check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
4237 + return retval;
4240 +static void check_after_balance_leaf(struct tree_balance *tb)
4242 + if (tb->lnum[0]) {
4243 + if (B_FREE_SPACE(tb->L[0]) !=
4244 + MAX_CHILD_SIZE(tb->L[0]) -
4245 + dc_size(B_N_CHILD
4246 + (tb->FL[0], get_left_neighbor_position(tb, 0)))) {
4247 + print_cur_tb("12221");
4248 + reiserfs_panic(tb->tb_sb, "PAP-12355",
4249 + "shift to left was incorrect");
4252 + if (tb->rnum[0]) {
4253 + if (B_FREE_SPACE(tb->R[0]) !=
4254 + MAX_CHILD_SIZE(tb->R[0]) -
4255 + dc_size(B_N_CHILD
4256 + (tb->FR[0], get_right_neighbor_position(tb, 0)))) {
4257 + print_cur_tb("12222");
4258 + reiserfs_panic(tb->tb_sb, "PAP-12360",
4259 + "shift to right was incorrect");
4262 + if (PATH_H_PBUFFER(tb->tb_path, 1) &&
4263 + (B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)) !=
4264 + (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
4265 + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
4266 + PATH_H_POSITION(tb->tb_path, 1)))))) {
4267 + int left = B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0));
4268 + int right = (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
4269 + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
4270 + PATH_H_POSITION(tb->tb_path,
4271 + 1))));
4272 + print_cur_tb("12223");
4273 + reiserfs_warning(tb->tb_sb, "reiserfs-12363",
4274 + "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; "
4275 + "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d",
4276 + left,
4277 + MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)),
4278 + PATH_H_PBUFFER(tb->tb_path, 1),
4279 + PATH_H_POSITION(tb->tb_path, 1),
4280 + dc_size(B_N_CHILD
4281 + (PATH_H_PBUFFER(tb->tb_path, 1),
4282 + PATH_H_POSITION(tb->tb_path, 1))),
4283 + right);
4284 + reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect");
4288 +static void check_leaf_level(struct tree_balance *tb)
4290 + check_leaf(tb->L[0]);
4291 + check_leaf(tb->R[0]);
4292 + check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
4295 +static void check_internal_levels(struct tree_balance *tb)
4297 + int h;
4299 + /* check all internal nodes */
4300 + for (h = 1; tb->insert_size[h]; h++) {
4301 + check_internal_node(tb->tb_sb, PATH_H_PBUFFER(tb->tb_path, h),
4302 + "BAD BUFFER ON PATH");
4303 + if (tb->lnum[h])
4304 + check_internal_node(tb->tb_sb, tb->L[h], "BAD L");
4305 + if (tb->rnum[h])
4306 + check_internal_node(tb->tb_sb, tb->R[h], "BAD R");
4311 +#endif
4314 + * Now we have all of the buffers that must be used in balancing of
4315 + * the tree. We rely on the assumption that schedule() will not occur
4316 + * while do_balance works. (Only interrupt handlers are acceptable.)
4317 + * We balance the tree according to the analysis made before this,
4318 + * using buffers already obtained. For SMP support it will someday be
4319 + * necessary to add ordered locking of tb.
4320 + */
4323 + * Some interesting rules of balancing:
4324 + * we delete a maximum of two nodes per level per balancing: we never
4325 + * delete R, when we delete two of three nodes L, S, R then we move
4326 + * them into R.
4328 + * we only delete L if we are deleting two nodes, if we delete only
4329 + * one node we delete S
4331 + * if we shift leaves then we shift as much as we can: this is a
4332 + * deliberate policy of extremism in node packing which results in
4333 + * higher average utilization after repeated random balance operations
4334 + * at the cost of more memory copies and more balancing as a result of
4335 + * small insertions to full nodes.
4337 + * if we shift internal nodes we try to evenly balance the node
4338 + * utilization, with consequent less balancing at the cost of lower
4339 + * utilization.
4341 + * one could argue that the policy for directories in leaves should be
4342 + * that of internal nodes, but we will wait until another day to
4343 + * evaluate this.... It would be nice to someday measure and prove
4344 + * these assumptions as to what is optimal....
4345 + */
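+/*
+ * A worked illustration of the deletion policy above (assumed example,
+ * not from the original source): if the contents of L, S and R all fit
+ * into one node, L and S are emptied into R and both are freed -- two
+ * nodes deleted, R kept.  If only S becomes removable, its items are
+ * divided between L and R and S alone is freed.  R itself is never the
+ * node that is deleted.
+ */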
4347 +static inline void do_balance_starts(struct tree_balance *tb)
4349 + /* use print_cur_tb() to see initial state of struct tree_balance */
4351 + /* store_print_tb (tb); */
4353 + /* do not delete, just comment it out */
4354 + /*
4355 + print_tb(flag, PATH_LAST_POSITION(tb->tb_path),
4356 + tb->tb_path->pos_in_item, tb, "check");
4357 + */
4358 + RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
4359 +#ifdef CONFIG_REISERFS_CHECK
4360 + REISERFS_SB(tb->tb_sb)->cur_tb = tb;
4361 +#endif
4364 +static inline void do_balance_completed(struct tree_balance *tb)
4367 +#ifdef CONFIG_REISERFS_CHECK
4368 + check_leaf_level(tb);
4369 + check_internal_levels(tb);
4370 + REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
4371 +#endif
4373 + /*
4374 + * reiserfs_free_block is no longer schedule safe. So, we need to
4375 + * put the buffers we want freed on the thrown list during do_balance,
4376 + * and then free them now
4377 + */
4379 + REISERFS_SB(tb->tb_sb)->s_do_balance++;
4381 + /* release all nodes held to perform the balancing */
4382 + unfix_nodes(tb);
4384 + free_thrown(tb);
4388 + * do_balance - balance the tree
4390 + * @tb: tree_balance structure
4391 + * @ih: item header of inserted item
4392 + * @body: body of inserted item or bytes to paste
4393 + * @flag: 'i' - insert, 'd' - delete, 'c' - cut, 'p' paste
4395 + * Cut means delete part of an item (includes removing an entry from a
4396 + * directory).
4398 + * Delete means delete whole item.
4400 + * Insert means add a new item into the tree.
4402 + * Paste means to append to the end of an existing file or to
4403 + * insert a directory entry.
4404 + */
4405 +void do_balance(struct tree_balance *tb, struct item_head *ih,
4406 + const char *body, int flag)
4408 + int child_pos; /* position of a child node in its parent */
4409 + int h; /* level of the tree being processed */
4411 + /*
4412 + * in our processing of one level we sometimes determine what
4413 + * must be inserted into the next higher level. This insertion
4414 + * consists of a key or two keys and their corresponding
4415 + * pointers
4416 + */
4417 + struct item_head insert_key[2];
4419 + /* inserted node-ptrs for the next level */
4420 + struct buffer_head *insert_ptr[2];
4422 + tb->tb_mode = flag;
4423 + tb->need_balance_dirty = 0;
4425 + if (FILESYSTEM_CHANGED_TB(tb)) {
4426 + reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has "
4427 + "changed");
4429 + /* if we have no real work to do */
4430 + if (!tb->insert_size[0]) {
4431 + reiserfs_warning(tb->tb_sb, "PAP-12350",
4432 + "insert_size == 0, mode == %c", flag);
4433 + unfix_nodes(tb);
4434 + return;
4437 + atomic_inc(&fs_generation(tb->tb_sb));
4438 + do_balance_starts(tb);
4440 + /*
4441 + * balance_leaf returns 0 except if combining L R and S into
4442 + * one node. see balance_internal() for explanation of this
4443 + * line of code.
4444 + */
4445 + child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) +
4446 + balance_leaf(tb, ih, body, flag, insert_key, insert_ptr);
4448 +#ifdef CONFIG_REISERFS_CHECK
4449 + check_after_balance_leaf(tb);
4450 +#endif
4452 + /* Balance internal level of the tree. */
4453 + for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++)
4454 + child_pos = balance_internal(tb, h, child_pos, insert_key,
4455 + insert_ptr);
4457 + do_balance_completed(tb);
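+/*
+ * Illustrative call sketch (assumed caller context, not from the
+ * original source): 'tb' must already have been prepared by
+ * fix_nodes() with the matching mode.
+ *
+ *   do_balance(&tb, &ih, body, M_INSERT);   /* insert the item ih/body */
+ *   do_balance(&tb, NULL, NULL, M_DELETE);  /* delete item at the path */
+ *
+ * M_INSERT and M_PASTE supply an item head and body; M_DELETE and
+ * M_CUT enter through the insert_size[0] < 0 path handled by
+ * balance_leaf_when_delete().
+ */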
4459 diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
4460 new file mode 100644
4461 index 000000000000..8eb3ad3e8ae9
4462 --- /dev/null
4463 +++ b/fs/reiserfs/file.c
4464 @@ -0,0 +1,270 @@
4466 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
4467 + */
4469 +#include <linux/time.h>
4470 +#include "reiserfs.h"
4471 +#include "acl.h"
4472 +#include "xattr.h"
4473 +#include <linux/uaccess.h>
4474 +#include <linux/pagemap.h>
4475 +#include <linux/swap.h>
4476 +#include <linux/writeback.h>
4477 +#include <linux/blkdev.h>
4478 +#include <linux/buffer_head.h>
4479 +#include <linux/quotaops.h>
4482 + * We pack the tails of files on file close, not at the time they are written.
4483 + * This implies an unnecessary copy of the tail and an unnecessary indirect item
4484 + * insertion/balancing, for files that are written in one write.
4485 + * It avoids unnecessary tail packings (balances) for files that are written in
4486 + * multiple writes and are small enough to have tails.
4488 + * file_release is called by the VFS layer when the file is closed. If
4489 + * this is the last open file descriptor, and the file
4490 + * small enough to have a tail, and the tail is currently in an
4491 + * unformatted node, the tail is converted back into a direct item.
4493 + * We use reiserfs_truncate_file to pack the tail, since it already has
4494 + * all the conditions coded.
4495 + */
4496 +static int reiserfs_file_release(struct inode *inode, struct file *filp)
4499 + struct reiserfs_transaction_handle th;
4500 + int err;
4501 + int jbegin_failure = 0;
4503 + BUG_ON(!S_ISREG(inode->i_mode));
4505 + if (!atomic_dec_and_mutex_lock(&REISERFS_I(inode)->openers,
4506 + &REISERFS_I(inode)->tailpack))
4507 + return 0;
4509 + /* fast out for when nothing needs to be done */
4510 + if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
4511 + !tail_has_to_be_packed(inode)) &&
4512 + REISERFS_I(inode)->i_prealloc_count <= 0) {
4513 + mutex_unlock(&REISERFS_I(inode)->tailpack);
4514 + return 0;
4517 + reiserfs_write_lock(inode->i_sb);
4518 + /*
4519 + * freeing preallocation only involves relogging blocks that
4520 + * are already in the current transaction. preallocation gets
4521 + * freed at the end of each transaction, so it is impossible for
4522 + * us to log any additional blocks (including quota blocks)
4523 + */
4524 + err = journal_begin(&th, inode->i_sb, 1);
4525 + if (err) {
4526 + /*
4527 + * uh oh, we can't allow the inode to go away while there
4528 + * are still preallocated blocks pending. Try to join the
4529 + * aborted transaction
4530 + */
4531 + jbegin_failure = err;
4532 + err = journal_join_abort(&th, inode->i_sb);
4534 + if (err) {
4535 + /*
4536 + * hmpf, our choices here aren't good. We can pin
4537 + * the inode which will disallow unmount from ever
4538 + * happening, we can do nothing, which will corrupt
4539 + * random memory on unmount, or we can forcibly
4540 + * remove the file from the preallocation list, which
4541 + * will leak blocks on disk. Let's pin the inode
4542 + * and let the admin know what is going on.
4543 + */
4544 + igrab(inode);
4545 + reiserfs_warning(inode->i_sb, "clm-9001",
4546 + "pinning inode %lu because the "
4547 + "preallocation can't be freed",
4548 + inode->i_ino);
4549 + goto out;
4552 + reiserfs_update_inode_transaction(inode);
4554 +#ifdef REISERFS_PREALLOCATE
4555 + reiserfs_discard_prealloc(&th, inode);
4556 +#endif
4557 + err = journal_end(&th);
4559 + /* copy back the error code from journal_begin */
4560 + if (!err)
4561 + err = jbegin_failure;
4563 + if (!err &&
4564 + (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
4565 + tail_has_to_be_packed(inode)) {
4567 + /*
4568 + * if regular file is released by last holder and it has been
4569 + * appended (we append by unformatted node only) or its direct
4570 + * item(s) had to be converted, then it may have to be
4571 + * indirect2direct converted
4572 + */
4573 + err = reiserfs_truncate_file(inode, 0);
4575 +out:
4576 + reiserfs_write_unlock(inode->i_sb);
4577 + mutex_unlock(&REISERFS_I(inode)->tailpack);
4578 + return err;
4581 +static int reiserfs_file_open(struct inode *inode, struct file *file)
4583 + int err = dquot_file_open(inode, file);
4585 + /* somebody might be tailpacking on final close; wait for it */
4586 + if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
4587 + mutex_lock(&REISERFS_I(inode)->tailpack);
4588 + atomic_inc(&REISERFS_I(inode)->openers);
4589 + mutex_unlock(&REISERFS_I(inode)->tailpack);
4591 + return err;
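+/*
+ * Note on the open/release pairing above (descriptive sketch, based on
+ * the code in this file only): 'openers' counts open file descriptors
+ * and 'tailpack' serializes against tail packing.
+ * reiserfs_file_release() uses atomic_dec_and_mutex_lock(), so the
+ * final closer reaches zero and takes the mutex atomically; open may
+ * therefore not simply bump 0 -> 1, but must take the same mutex
+ * first, waiting out any tail pack still running on final close.
+ */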
4594 +void reiserfs_vfs_truncate_file(struct inode *inode)
4596 + mutex_lock(&REISERFS_I(inode)->tailpack);
4597 + reiserfs_truncate_file(inode, 1);
4598 + mutex_unlock(&REISERFS_I(inode)->tailpack);
4601 +/* Sync a reiserfs file. */
4604 + * FIXME: sync_mapping_buffers() never has anything to sync. Can
4605 + * be removed...
4606 + */
4608 +static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
4609 + int datasync)
4611 + struct inode *inode = filp->f_mapping->host;
4612 + int err;
4613 + int barrier_done;
4615 + err = file_write_and_wait_range(filp, start, end);
4616 + if (err)
4617 + return err;
4619 + inode_lock(inode);
4620 + BUG_ON(!S_ISREG(inode->i_mode));
4621 + err = sync_mapping_buffers(inode->i_mapping);
4622 + reiserfs_write_lock(inode->i_sb);
4623 + barrier_done = reiserfs_commit_for_inode(inode);
4624 + reiserfs_write_unlock(inode->i_sb);
4625 + if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
4626 + blkdev_issue_flush(inode->i_sb->s_bdev);
4627 + inode_unlock(inode);
4628 + if (barrier_done < 0)
4629 + return barrier_done;
4630 + return (err < 0) ? -EIO : 0;
4633 +/* taken from fs/buffer.c:__block_commit_write */
4634 +int reiserfs_commit_page(struct inode *inode, struct page *page,
4635 + unsigned from, unsigned to)
4637 + unsigned block_start, block_end;
4638 + int partial = 0;
4639 + unsigned blocksize;
4640 + struct buffer_head *bh, *head;
4641 + unsigned long i_size_index = inode->i_size >> PAGE_SHIFT;
4642 + int new;
4643 + int logit = reiserfs_file_data_log(inode);
4644 + struct super_block *s = inode->i_sb;
4645 + int bh_per_page = PAGE_SIZE / s->s_blocksize;
4646 + struct reiserfs_transaction_handle th;
4647 + int ret = 0;
4649 + th.t_trans_id = 0;
4650 + blocksize = i_blocksize(inode);
4652 + if (logit) {
4653 + reiserfs_write_lock(s);
4654 + ret = journal_begin(&th, s, bh_per_page + 1);
4655 + if (ret)
4656 + goto drop_write_lock;
4657 + reiserfs_update_inode_transaction(inode);
4659 + for (bh = head = page_buffers(page), block_start = 0;
4660 + bh != head || !block_start;
4661 + block_start = block_end, bh = bh->b_this_page) {
4663 + new = buffer_new(bh);
4664 + clear_buffer_new(bh);
4665 + block_end = block_start + blocksize;
4666 + if (block_end <= from || block_start >= to) {
4667 + if (!buffer_uptodate(bh))
4668 + partial = 1;
4669 + } else {
4670 + set_buffer_uptodate(bh);
4671 + if (logit) {
4672 + reiserfs_prepare_for_journal(s, bh, 1);
4673 + journal_mark_dirty(&th, bh);
4674 + } else if (!buffer_dirty(bh)) {
4675 + mark_buffer_dirty(bh);
4676 + /*
4677 + * do data=ordered on any page past the end
4678 + * of file and any buffer marked BH_New.
4679 + */
4680 + if (reiserfs_data_ordered(inode->i_sb) &&
4681 + (new || page->index >= i_size_index)) {
4682 + reiserfs_add_ordered_list(inode, bh);
4687 + if (logit) {
4688 + ret = journal_end(&th);
4689 +drop_write_lock:
4690 + reiserfs_write_unlock(s);
4692 + /*
4693 + * If this is a partial write which happened to make all buffers
4694 + * uptodate then we can optimize away a bogus read_folio() for
4695 + * the next read(). Here we 'discover' whether the page went
4696 + * uptodate as a result of this (potentially partial) write.
4697 + */
4698 + if (!partial)
4699 + SetPageUptodate(page);
4700 + return ret;
4703 +const struct file_operations reiserfs_file_operations = {
4704 + .unlocked_ioctl = reiserfs_ioctl,
4705 +#ifdef CONFIG_COMPAT
4706 + .compat_ioctl = reiserfs_compat_ioctl,
4707 +#endif
4708 + .mmap = generic_file_mmap,
4709 + .open = reiserfs_file_open,
4710 + .release = reiserfs_file_release,
4711 + .fsync = reiserfs_sync_file,
4712 + .read_iter = generic_file_read_iter,
4713 + .write_iter = generic_file_write_iter,
4714 + .splice_read = filemap_splice_read,
4715 + .splice_write = iter_file_splice_write,
4716 + .llseek = generic_file_llseek,
4719 +const struct inode_operations reiserfs_file_inode_operations = {
4720 + .setattr = reiserfs_setattr,
4721 + .listxattr = reiserfs_listxattr,
4722 + .permission = reiserfs_permission,
4723 + .get_inode_acl = reiserfs_get_acl,
4724 + .set_acl = reiserfs_set_acl,
4725 + .fileattr_get = reiserfs_fileattr_get,
4726 + .fileattr_set = reiserfs_fileattr_set,
4729 +const struct inode_operations reiserfs_priv_file_inode_operations = {
4730 + .setattr = reiserfs_setattr,
4731 + .permission = reiserfs_permission,
4732 + .fileattr_get = reiserfs_fileattr_get,
4733 + .fileattr_set = reiserfs_fileattr_set,
4735 diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
4736 new file mode 100644
4737 index 000000000000..6c13a8d9a73c
4738 --- /dev/null
4739 +++ b/fs/reiserfs/fix_node.c
4740 @@ -0,0 +1,2822 @@
4742 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
4743 + */
4745 +#include <linux/time.h>
4746 +#include <linux/slab.h>
4747 +#include <linux/string.h>
4748 +#include "reiserfs.h"
4749 +#include <linux/buffer_head.h>
4752 + * To make any change in the tree we find the node that contains the
4753 + * item to be changed/deleted, or the position in a node where a new
4754 + * item is to be inserted. We call this node S. To do balancing we
4755 + * need to decide what we will shift to the left/right neighbor or to
4756 + * a new node, where the new item will go, and so on. To make this
4757 + * analysis simpler we build a virtual node: an array of items that
4758 + * will replace the items of node S. (For instance, if we are going
4759 + * to delete an item, the virtual node does not contain it.) The
4760 + * virtual node keeps information about item sizes and types,
4761 + * mergeability of the first and last items, and the sizes of all
4762 + * entries in a directory item. We use this array when calculating
4763 + * what we can shift to the neighbors and how many nodes we need if
4764 + * we do no shifting, shift to the left/right neighbor, or to both.
4765 + */
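+/*
+ * Illustration of the virtual node layout (assumed example values, not
+ * from the original source): for M_INSERT into a leaf holding items
+ * I0 I1 I2 with vn_affected_item_num == 1, the virtual node describes
+ *
+ *   vn_vi[0] = I0, vn_vi[1] = <new item>, vn_vi[2] = I1, vn_vi[3] = I2
+ *
+ * so vn_nr_item == B_NR_ITEMS(S) + 1; for M_DELETE the affected item
+ * is omitted instead and vn_nr_item == B_NR_ITEMS(S) - 1.
+ */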
4768 + * Takes item number in virtual node, returns number of item
4769 + * that it has in source buffer
4770 + */
4771 +static inline int old_item_num(int new_num, int affected_item_num, int mode)
4773 + if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
4774 + return new_num;
4776 + if (mode == M_INSERT) {
4778 + RFALSE(new_num == 0,
4779 + "vs-8005: for INSERT mode and item number of inserted item");
4781 + return new_num - 1;
4784 + RFALSE(mode != M_DELETE,
4785 + "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
4786 + mode);
4787 + /* delete mode */
4788 + return new_num + 1;
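+/*
+ * Worked example of the mapping above (assumed values, not from the
+ * original source): with affected_item_num == 2,
+ *
+ *   mode      new_num:  0  1  2  3  4
+ *   M_INSERT           0  1  -  2  3   (new_num 2 is the inserted item)
+ *   M_DELETE           0  1  3  4  5   (source item 2 was deleted)
+ *   M_PASTE            0  1  2  3  4   (item count unchanged)
+ */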
4791 +static void create_virtual_node(struct tree_balance *tb, int h)
4793 + struct item_head *ih;
4794 + struct virtual_node *vn = tb->tb_vn;
4795 + int new_num;
4796 + struct buffer_head *Sh; /* this comes from tb->S[h] */
4798 + Sh = PATH_H_PBUFFER(tb->tb_path, h);
4800 + /* size of changed node */
4801 + vn->vn_size =
4802 + MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
4804 + /* for internal nodes the array of virtual items is not created */
4805 + if (h) {
4806 + vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
4807 + return;
4810 + /* number of items in virtual node */
4811 + vn->vn_nr_item =
4812 + B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
4813 + ((vn->vn_mode == M_DELETE) ? 1 : 0);
4815 + /* first virtual item */
4816 + vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
4817 + memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
4818 + vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
4820 + /* first item in the node */
4821 + ih = item_head(Sh, 0);
4823 + /* define the mergeability for 0-th item (if it is not being deleted) */
4824 + if (op_is_left_mergeable(&ih->ih_key, Sh->b_size)
4825 + && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
4826 + vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
4828 + /*
4829 + * go through all items that remain in the virtual
4830 + * node (except for the new (inserted) one)
4831 + */
4832 + for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
4833 + int j;
4834 + struct virtual_item *vi = vn->vn_vi + new_num;
4835 + int is_affected =
4836 + ((new_num != vn->vn_affected_item_num) ? 0 : 1);
4838 + if (is_affected && vn->vn_mode == M_INSERT)
4839 + continue;
4841 + /* get item number in source node */
4842 + j = old_item_num(new_num, vn->vn_affected_item_num,
4843 + vn->vn_mode);
4845 + vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
4846 + vi->vi_ih = ih + j;
4847 + vi->vi_item = ih_item_body(Sh, ih + j);
4848 + vi->vi_uarea = vn->vn_free_ptr;
4850 + /*
4851 + * FIXME: there is no check that item operation did not
4852 + * consume too much memory
4853 + */
4854 + vn->vn_free_ptr +=
4855 + op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
4856 + if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
4857 + reiserfs_panic(tb->tb_sb, "vs-8030",
4858 + "virtual node space consumed");
4860 + if (!is_affected)
4861 + /* this is not being changed */
4862 + continue;
4864 + if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
4865 + vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
4866 + /* pointer to data which is going to be pasted */
4867 + vi->vi_new_data = vn->vn_data;
4871 + /* virtual inserted item is not defined yet */
4872 + if (vn->vn_mode == M_INSERT) {
4873 + struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
4875 + RFALSE(vn->vn_ins_ih == NULL,
4876 + "vs-8040: item header of inserted item is not specified");
4877 + vi->vi_item_len = tb->insert_size[0];
4878 + vi->vi_ih = vn->vn_ins_ih;
4879 + vi->vi_item = vn->vn_data;
4880 + vi->vi_uarea = vn->vn_free_ptr;
4882 + op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
4883 + tb->insert_size[0]);
4886 + /*
4887 + * to set the right merge flag we take the right delimiting key
4888 + * and check whether it is a mergeable item
4889 + */
4890 + if (tb->CFR[0]) {
4891 + struct reiserfs_key *key;
4893 + key = internal_key(tb->CFR[0], tb->rkey[0]);
4894 + if (op_is_left_mergeable(key, Sh->b_size)
4895 + && (vn->vn_mode != M_DELETE
4896 + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
4897 + vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
4898 + VI_TYPE_RIGHT_MERGEABLE;
4900 +#ifdef CONFIG_REISERFS_CHECK
4901 + if (op_is_left_mergeable(key, Sh->b_size) &&
4902 + !(vn->vn_mode != M_DELETE
4903 + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
4904 + /*
4905 + * we delete last item and it could be merged
4906 + * with right neighbor's first item
4907 + */
4908 + if (!
4909 + (B_NR_ITEMS(Sh) == 1
4910 + && is_direntry_le_ih(item_head(Sh, 0))
4911 + && ih_entry_count(item_head(Sh, 0)) == 1)) {
4912 + /*
4913 + * node contains more than 1 item, or item
4914 + * is not directory item, or this item
4915 + * contains more than 1 entry
4916 + */
4917 + print_block(Sh, 0, -1, -1);
4918 + reiserfs_panic(tb->tb_sb, "vs-8045",
4919 + "rdkey %k, affected item==%d "
4920 + "(mode==%c) Must be %c",
4921 + key, vn->vn_affected_item_num,
4922 + vn->vn_mode, M_DELETE);
4925 +#endif
4931 + * Using virtual node check, how many items can be
4932 + * shifted to left neighbor
4933 + */
4934 +static void check_left(struct tree_balance *tb, int h, int cur_free)
4936 + int i;
4937 + struct virtual_node *vn = tb->tb_vn;
4938 + struct virtual_item *vi;
4939 + int d_size, ih_size;
4941 + RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
4943 + /* internal level */
4944 + if (h > 0) {
4945 + tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
4946 + return;
4949 + /* leaf level */
4951 + if (!cur_free || !vn->vn_nr_item) {
4952 + /* no free space or nothing to move */
4953 + tb->lnum[h] = 0;
4954 + tb->lbytes = -1;
4955 + return;
4958 + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
4959 + "vs-8055: parent does not exist or invalid");
4961 + vi = vn->vn_vi;
4962 + if ((unsigned int)cur_free >=
4963 + (vn->vn_size -
4964 + ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
4965 + /* all contents of S[0] fits into L[0] */
4967 + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
4968 + "vs-8055: invalid mode or balance condition failed");
4970 + tb->lnum[0] = vn->vn_nr_item;
4971 + tb->lbytes = -1;
4972 + return;
4975 + d_size = 0, ih_size = IH_SIZE;
4977 + /* first item may be merged with the last item in the left neighbor */
4978 + if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
4979 + d_size = -((int)IH_SIZE), ih_size = 0;
4981 + tb->lnum[0] = 0;
4982 + for (i = 0; i < vn->vn_nr_item;
4983 + i++, ih_size = IH_SIZE, d_size = 0, vi++) {
4984 + d_size += vi->vi_item_len;
4985 + if (cur_free >= d_size) {
4986 + /* the item can be shifted entirely */
4987 + cur_free -= d_size;
4988 + tb->lnum[0]++;
4989 + continue;
4992 + /* the item cannot be shifted entirely, try to split it */
4993 + /*
4994 + * check whether L[0] can hold ih and at least one byte
4995 + * of the item body
4996 + */
4998 + /* cannot shift even a part of the current item */
4999 + if (cur_free <= ih_size) {
5000 + tb->lbytes = -1;
5001 + return;
5003 + cur_free -= ih_size;
5005 + tb->lbytes = op_check_left(vi, cur_free, 0, 0);
5006 + if (tb->lbytes != -1)
5007 + /* count partially shifted item */
5008 + tb->lnum[0]++;
5010 + break;
5013 + return;
5017 + * Using virtual node check, how many items can be
5018 + * shifted to right neighbor
5019 + */
5020 +static void check_right(struct tree_balance *tb, int h, int cur_free)
5022 + int i;
5023 + struct virtual_node *vn = tb->tb_vn;
5024 + struct virtual_item *vi;
5025 + int d_size, ih_size;
5027 + RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
5029 + /* internal level */
5030 + if (h > 0) {
5031 + tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
5032 + return;
5035 + /* leaf level */
5037 + if (!cur_free || !vn->vn_nr_item) {
5038 + /* no free space or nothing to move */
5039 + tb->rnum[h] = 0;
5040 + tb->rbytes = -1;
5041 + return;
5044 + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
5045 + "vs-8075: parent does not exist or invalid");
5047 + vi = vn->vn_vi + vn->vn_nr_item - 1;
5048 + if ((unsigned int)cur_free >=
5049 + (vn->vn_size -
5050 + ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
5051 + /* all contents of S[0] fits into R[0] */
5053 + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
5054 + "vs-8080: invalid mode or balance condition failed");
5056 + tb->rnum[h] = vn->vn_nr_item;
5057 + tb->rbytes = -1;
5058 + return;
5061 + d_size = 0, ih_size = IH_SIZE;
5063 + /* last item may be merged with the first item in the right neighbor */
5064 + if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
5065 + d_size = -(int)IH_SIZE, ih_size = 0;
5067 + tb->rnum[0] = 0;
5068 + for (i = vn->vn_nr_item - 1; i >= 0;
5069 + i--, d_size = 0, ih_size = IH_SIZE, vi--) {
5070 + d_size += vi->vi_item_len;
5071 + if (cur_free >= d_size) {
5072 + /* the item can be shifted entirely */
5073 + cur_free -= d_size;
5074 + tb->rnum[0]++;
5075 + continue;
5078 + /*
5079 + * check whether R[0] can hold ih and at least one
5080 + * byte of the item body
5081 + */
5083 + /* cannot shift even a part of the current item */
5084 + if (cur_free <= ih_size) {
5085 + tb->rbytes = -1;
5086 + return;
5089 + /*
5090 + * R[0] can hold the header of the item and at least
5091 + * one byte of its body
5092 + */
5093 + cur_free -= ih_size; /* cur_free is still > 0 */
5095 + tb->rbytes = op_check_right(vi, cur_free);
5096 + if (tb->rbytes != -1)
5097 + /* count partially shifted item */
5098 + tb->rnum[0]++;
5100 + break;
5103 + return;
5107 + * from - number of items which are shifted to the left neighbor entirely
5108 + * to - number of items which are shifted to the right neighbor entirely
5109 + * from_bytes - number of bytes of boundary item (or directory entries)
5110 + * which are shifted to left neighbor
5111 + * to_bytes - number of bytes of boundary item (or directory entries)
5112 + * which are shifted to right neighbor
5113 + */
5114 +static int get_num_ver(int mode, struct tree_balance *tb, int h,
5115 + int from, int from_bytes,
5116 + int to, int to_bytes, short *snum012, int flow)
5118 + int i;
5119 + int units;
5120 + struct virtual_node *vn = tb->tb_vn;
5121 + int total_node_size, max_node_size, current_item_size;
5122 + int needed_nodes;
5124 + /* position of item we start filling node from */
5125 + int start_item;
5127 + /* position of item we finish filling node by */
5128 + int end_item;
5130 + /*
5131 + * number of first bytes (entries for directory) of start_item-th item
5132 + * we do not include into node that is being filled
5133 + */
5134 + int start_bytes;
5136 + /*
5137 + * number of last bytes (entries for directory) of end_item-th item
5138 + * we do node include into node that is being filled
5139 + */
5140 + int end_bytes;
5142 + /*
5143 + * these are positions in virtual item of items, that are split
5144 + * between S[0] and S1new and S1new and S2new
5145 + */
5146 + int split_item_positions[2];
5148 + split_item_positions[0] = -1;
5149 + split_item_positions[1] = -1;
5151 + /*
5152 + * We only create additional nodes if we are in insert or paste mode
5153 + * or we are in replace mode at the internal level. If h is 0 and
5154 + * the mode is M_REPLACE then in fix_nodes we change the mode to
5155 + * paste or insert before we get here in the code.
5156 + */
5157 + RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
5158 + "vs-8100: insert_size < 0 in overflow");
5160 + max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
5162 + /*
5163 + * snum012 [0-2] - number of items, that lay
5164 + * to S[0], first new node and second new node
5165 + */
5166 + snum012[3] = -1; /* s1bytes */
5167 + snum012[4] = -1; /* s2bytes */
5169 + /* internal level */
5170 + if (h > 0) {
5171 + i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
5172 + if (i == max_node_size)
5173 + return 1;
5174 + return (i / max_node_size + 1);
5177 + /* leaf level */
5178 + needed_nodes = 1;
5179 + total_node_size = 0;
5181 + /* start from 'from'-th item */
5182 + start_item = from;
5183 + /* skip its first 'start_bytes' units */
5184 + start_bytes = ((from_bytes != -1) ? from_bytes : 0);
5186 + /* last included item is the 'end_item'-th one */
5187 + end_item = vn->vn_nr_item - to - 1;
5188 + /* do not count last 'end_bytes' units of 'end_item'-th item */
5189 + end_bytes = (to_bytes != -1) ? to_bytes : 0;
5191 + /*
5192 + * go through all items beginning with the start_item-th item
5193 + * and ending by the end_item-th item. Do not count first
5194 + * 'start_bytes' units of 'start_item'-th item and last
5195 + * 'end_bytes' of 'end_item'-th item
5196 + */
5197 + for (i = start_item; i <= end_item; i++) {
5198 + struct virtual_item *vi = vn->vn_vi + i;
5199 + int skip_from_end = ((i == end_item) ? end_bytes : 0);
5201 + RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
5203 + /* get size of current item */
5204 + current_item_size = vi->vi_item_len;
5206 + /*
5207 + * do not take in calculation head part (from_bytes)
5208 + * of from-th item
5209 + */
5210 + current_item_size -=
5211 + op_part_size(vi, 0 /*from start */ , start_bytes);
5213 + /* do not take in calculation tail part of last item */
5214 + current_item_size -=
5215 + op_part_size(vi, 1 /*from end */ , skip_from_end);
5217 + /* if the item fits into the current node entirely */
5218 + if (total_node_size + current_item_size <= max_node_size) {
5219 + snum012[needed_nodes - 1]++;
5220 + total_node_size += current_item_size;
5221 + start_bytes = 0;
5222 + continue;
5225 + /*
5226 + * virtual item length is longer than the max item size in a
5227 + * node. This is impossible for a direct item
5228 + */
5229 + if (current_item_size > max_node_size) {
5230 + RFALSE(is_direct_le_ih(vi->vi_ih),
5231 + "vs-8110: "
5232 + "direct item length is %d. It can not be longer than %d",
5233 + current_item_size, max_node_size);
5234 + /* we will try to split it */
5235 + flow = 1;
5238 + /* as we do not split items, take new node and continue */
5239 + if (!flow) {
5240 + needed_nodes++;
5241 + i--;
5242 + total_node_size = 0;
5243 + continue;
5246 + /*
5247 + * calculate number of item units which fit into node being
5248 + * filled
5249 + */
5251 + int free_space;
5253 + free_space = max_node_size - total_node_size - IH_SIZE;
5254 + units =
5255 + op_check_left(vi, free_space, start_bytes,
5256 + skip_from_end);
5257 + /*
5258 + * nothing fits into current node, take new
5259 + * node and continue
5260 + */
5261 + if (units == -1) {
5262 + needed_nodes++, i--, total_node_size = 0;
5263 + continue;
5267 + /* something fits into the current node */
5268 + start_bytes += units;
5269 + snum012[needed_nodes - 1 + 3] = units;
5271 + if (needed_nodes > 2)
5272 + reiserfs_warning(tb->tb_sb, "vs-8111",
5273 + "split_item_position is out of range");
5274 + snum012[needed_nodes - 1]++;
5275 + split_item_positions[needed_nodes - 1] = i;
5276 + needed_nodes++;
5277 + /* continue from the same item with start_bytes != -1 */
5278 + start_item = i;
5279 + i--;
5280 + total_node_size = 0;
5283 + /*
5284 + * snum012[4] (if it is not -1) contains the number of units that
5285 + * are to be in S1new, snum012[3] those to be in S0. They are
5286 + * supposed to be s1bytes and s2bytes correspondingly, so recalculate
5287 + */
5288 + if (snum012[4] > 0) {
5289 + int split_item_num;
5290 + int bytes_to_r, bytes_to_l;
5291 + int bytes_to_S1new;
5293 + split_item_num = split_item_positions[1];
5294 + bytes_to_l =
5295 + ((from == split_item_num
5296 + && from_bytes != -1) ? from_bytes : 0);
5297 + bytes_to_r =
5298 + ((end_item == split_item_num
5299 + && end_bytes != -1) ? end_bytes : 0);
5300 + bytes_to_S1new =
5301 + ((split_item_positions[0] ==
5302 + split_item_positions[1]) ? snum012[3] : 0);
5304 + /* s2bytes */
5305 + snum012[4] =
5306 + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
5307 + bytes_to_r - bytes_to_l - bytes_to_S1new;
5309 + if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
5310 + vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
5311 + reiserfs_warning(tb->tb_sb, "vs-8115",
5312 + "not directory or indirect item");
5315 + /* now we know S2bytes, calculate S1bytes */
5316 + if (snum012[3] > 0) {
5317 + int split_item_num;
5318 + int bytes_to_r, bytes_to_l;
5319 + int bytes_to_S2new;
5321 + split_item_num = split_item_positions[0];
5322 + bytes_to_l =
5323 + ((from == split_item_num
5324 + && from_bytes != -1) ? from_bytes : 0);
5325 + bytes_to_r =
5326 + ((end_item == split_item_num
5327 + && end_bytes != -1) ? end_bytes : 0);
5328 + bytes_to_S2new =
5329 + ((split_item_positions[0] == split_item_positions[1]
5330 + && snum012[4] != -1) ? snum012[4] : 0);
5332 + /* s1bytes */
5333 + snum012[3] =
5334 + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
5335 + bytes_to_r - bytes_to_l - bytes_to_S2new;
5338 + return needed_nodes;
5343 + * Set parameters for balancing.
5344 + * Writes the results of the balancing analysis into structure tb,
5345 + * where they will later be used by the functions that actually do the balancing.
5346 + * Parameters:
5347 + * tb tree_balance structure;
5348 + * h current level of the node;
5349 + * lnum number of items from S[h] that must be shifted to L[h];
5350 + * rnum number of items from S[h] that must be shifted to R[h];
5351 + *	blk_num	number of blocks that S[h] will be split into;
5352 + *	s012	number of items that fall into the split nodes.
5353 + * lbytes number of bytes which flow to the left neighbor from the
5354 + * item that is not shifted entirely
5355 + * rbytes number of bytes which flow to the right neighbor from the
5356 + * item that is not shifted entirely
5357 + * s1bytes number of bytes which flow to the first new node when
5358 + * S[0] splits (this number is contained in s012 array)
5359 + */
5361 +static void set_parameters(struct tree_balance *tb, int h, int lnum,
5362 + int rnum, int blk_num, short *s012, int lb, int rb)
5365 + tb->lnum[h] = lnum;
5366 + tb->rnum[h] = rnum;
5367 + tb->blknum[h] = blk_num;
5369 + /* only for leaf level */
5370 + if (h == 0) {
5371 + if (s012 != NULL) {
5372 + tb->s0num = *s012++;
5373 + tb->snum[0] = *s012++;
5374 + tb->snum[1] = *s012++;
5375 + tb->sbytes[0] = *s012++;
5376 + tb->sbytes[1] = *s012;
5378 + tb->lbytes = lb;
5379 + tb->rbytes = rb;
5381 + PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
5382 + PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
5384 + PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
5385 + PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
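+/*
+ * Illustration of how the s012 array is consumed above (leaf level
+ * only): the five shorts are read in order as s0num, snum[0],
+ * snum[1], sbytes[0] and sbytes[1]. For example, a hypothetical call
+ * set_parameters(tb, 0, 3, 0, 1, s012, 100, -1) would mean: shift two
+ * whole items plus 100 bytes of a third to the left neighbor (lnum
+ * counts the partially shifted item), shift nothing right, and keep
+ * S[0] in one block.
+ */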
5389 + * check if node disappears if we shift tb->lnum[0] items to left
5390 + * neighbor and tb->rnum[0] to the right one.
5391 + */
5392 +static int is_leaf_removable(struct tree_balance *tb)
5394 + struct virtual_node *vn = tb->tb_vn;
5395 + int to_left, to_right;
5396 + int size;
5397 + int remain_items;
5399 + /*
5400 + * number of items that will be shifted to left (right) neighbor
5401 + * entirely
5402 + */
5403 + to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
5404 + to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
5405 + remain_items = vn->vn_nr_item;
5407 + /* how many items remain in S[0] after shiftings to neighbors */
5408 + remain_items -= (to_left + to_right);
5410 + /* all content of node can be shifted to neighbors */
5411 + if (remain_items < 1) {
5412 + set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
5413 + NULL, -1, -1);
5414 + return 1;
5417 + /* S[0] is not removable */
5418 + if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
5419 + return 0;
5421 + /* check whether we can divide 1 remaining item between neighbors */
5423 + /* get size of remaining item (in item units) */
5424 + size = op_unit_num(&vn->vn_vi[to_left]);
5426 + if (tb->lbytes + tb->rbytes >= size) {
5427 + set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
5428 + tb->lbytes, -1);
5429 + return 1;
5432 + return 0;
5435 +/* check whether L, S, R can be joined in one node */
5436 +static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
5438 + struct virtual_node *vn = tb->tb_vn;
5439 + int ih_size;
5440 + struct buffer_head *S0;
5442 + S0 = PATH_H_PBUFFER(tb->tb_path, 0);
5444 + ih_size = 0;
5445 + if (vn->vn_nr_item) {
5446 + if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
5447 + ih_size += IH_SIZE;
5449 + if (vn->vn_vi[vn->vn_nr_item - 1].
5450 + vi_type & VI_TYPE_RIGHT_MERGEABLE)
5451 + ih_size += IH_SIZE;
5452 + } else {
5453 + /* there was only one item and it will be deleted */
5454 + struct item_head *ih;
5456 + RFALSE(B_NR_ITEMS(S0) != 1,
5457 + "vs-8125: item number must be 1: it is %d",
5458 + B_NR_ITEMS(S0));
5460 + ih = item_head(S0, 0);
5461 + if (tb->CFR[0]
5462 + && !comp_short_le_keys(&ih->ih_key,
5463 + internal_key(tb->CFR[0],
5464 + tb->rkey[0])))
5465 +			/*
5466 +			 * The directory must be in a correct state here: that
5467 +			 * is, somewhere to the left the first directory item
5468 +			 * must exist. The item being deleted cannot be that
5469 +			 * first one, because its right neighbor is an item of
5470 +			 * the same directory (and the first item always gets
5471 +			 * deleted in the last turn). So the neighbors of the
5472 +			 * deleted item can be merged, and we can save
5473 +			 * ih_size
5474 +			 */
5475 + if (is_direntry_le_ih(ih)) {
5476 + ih_size = IH_SIZE;
5478 + /*
5479 + * we might check that left neighbor exists
5480 + * and is of the same directory
5481 + */
5482 + RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
5483 + "vs-8130: first directory item can not be removed until directory is not empty");
5488 + if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
5489 + set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
5490 + PROC_INFO_INC(tb->tb_sb, leaves_removable);
5491 + return 1;
5493 + return 0;
5497 +/* when we do not split an item, lnum and rnum are numbers of entire items */
5498 +#define SET_PAR_SHIFT_LEFT \
5499 +if (h)\
5501 + int to_l;\
5503 + to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
5504 + (MAX_NR_KEY(Sh) + 1 - lpar);\
5506 + set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
5508 +else \
5510 + if (lset==LEFT_SHIFT_FLOW)\
5511 + set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
5512 + tb->lbytes, -1);\
5513 + else\
5514 + set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
5515 + -1, -1);\
5518 +#define SET_PAR_SHIFT_RIGHT \
5519 +if (h)\
5521 + int to_r;\
5523 + to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
5525 + set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
5527 +else \
5529 + if (rset==RIGHT_SHIFT_FLOW)\
5530 + set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
5531 + -1, tb->rbytes);\
5532 + else\
5533 + set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
5534 + -1, -1);\
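+/*
+ * Note: both SET_PAR_SHIFT_* macros expand in place inside the
+ * balance-checking functions below and rely on their locals (h, tb,
+ * vn, Sh, lpar/rpar, lnver/rnver, lset/rset, snum012) being in
+ * scope, which is why they are macros rather than functions.
+ */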
5537 +static void free_buffers_in_tb(struct tree_balance *tb)
5539 + int i;
5541 + pathrelse(tb->tb_path);
5543 + for (i = 0; i < MAX_HEIGHT; i++) {
5544 + brelse(tb->L[i]);
5545 + brelse(tb->R[i]);
5546 + brelse(tb->FL[i]);
5547 + brelse(tb->FR[i]);
5548 + brelse(tb->CFL[i]);
5549 + brelse(tb->CFR[i]);
5551 + tb->L[i] = NULL;
5552 + tb->R[i] = NULL;
5553 + tb->FL[i] = NULL;
5554 + tb->FR[i] = NULL;
5555 + tb->CFL[i] = NULL;
5556 + tb->CFR[i] = NULL;
5561 + * Get new buffers for storing new nodes that are created while balancing.
5562 + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
5563 + * CARRY_ON - schedule didn't occur while the function worked;
5564 + * NO_DISK_SPACE - no disk space.
5565 + */
5566 +/* The function is NOT SCHEDULE-SAFE! */
5567 +static int get_empty_nodes(struct tree_balance *tb, int h)
5569 + struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h);
5570 + b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
5571 + int counter, number_of_freeblk;
5572 + int amount_needed; /* number of needed empty blocks */
5573 + int retval = CARRY_ON;
5574 + struct super_block *sb = tb->tb_sb;
5576 + /*
5577 +	 * number_of_freeblk is the number of empty blocks which have been
5578 +	 * acquired for use by the balancing algorithm minus the number of
5579 +	 * empty blocks used in the previous levels of the analysis.
5580 +	 * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule
5581 +	 * occurs after empty blocks are acquired, and the balancing analysis
5582 +	 * is then restarted. amount_needed is the number needed by this
5583 +	 * level (h) of the balancing analysis.
5585 +	 * Note that for systems with many processes writing, it would give
5586 +	 * a better layout to calculate the total number needed by all
5587 +	 * levels and then to run reiserfs_new_blocks to get all of them at
5588 +	 * once.
5589 + */
5591 + /*
5592 +	 * Initialize number_of_freeblk to the amount acquired prior to the
5593 + * restart of the analysis or 0 if not restarted, then subtract the
5594 + * amount needed by all of the levels of the tree below h.
5595 + */
5596 + /* blknum includes S[h], so we subtract 1 in this calculation */
5597 + for (counter = 0, number_of_freeblk = tb->cur_blknum;
5598 + counter < h; counter++)
5599 + number_of_freeblk -=
5600 + (tb->blknum[counter]) ? (tb->blknum[counter] -
5601 + 1) : 0;
5603 + /* Allocate missing empty blocks. */
5604 + /* if Sh == 0 then we are getting a new root */
5605 + amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
5606 + /*
5607 + * Amount_needed = the amount that we need more than the
5608 + * amount that we have.
5609 + */
5610 + if (amount_needed > number_of_freeblk)
5611 + amount_needed -= number_of_freeblk;
5612 + else /* If we have enough already then there is nothing to do. */
5613 + return CARRY_ON;
5615 +	/*
5616 +	 * No need to check quota - quota is not allocated for blocks
5617 +	 * used for formatted nodes
5618 +	 */
5619 + if (reiserfs_new_form_blocknrs(tb, blocknrs,
5620 + amount_needed) == NO_DISK_SPACE)
5621 + return NO_DISK_SPACE;
5623 + /* for each blocknumber we just got, get a buffer and stick it on FEB */
5624 + for (blocknr = blocknrs, counter = 0;
5625 + counter < amount_needed; blocknr++, counter++) {
5627 + RFALSE(!*blocknr,
5628 + "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
5630 + new_bh = sb_getblk(sb, *blocknr);
5631 + RFALSE(buffer_dirty(new_bh) ||
5632 + buffer_journaled(new_bh) ||
5633 + buffer_journal_dirty(new_bh),
5634 + "PAP-8140: journaled or dirty buffer %b for the new block",
5635 + new_bh);
5637 + /* Put empty buffers into the array. */
5638 + RFALSE(tb->FEB[tb->cur_blknum],
5639 + "PAP-8141: busy slot for new buffer");
5641 + set_buffer_journal_new(new_bh);
5642 + tb->FEB[tb->cur_blknum++] = new_bh;
5645 + if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
5646 + retval = REPEAT_SEARCH;
5648 + return retval;
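+/*
+ * Sketch of how the blocks acquired above are consumed: each new
+ * block is wrapped in a buffer head and parked in the tb->FEB[]
+ * array of empty buffers, and the balancing code later takes its
+ * new nodes from that array instead of allocating while the tree
+ * is being modified.
+ */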
5652 + * Get free space of the left neighbor, which is stored in the parent
5653 + * node of the left neighbor.
5654 + */
5655 +static int get_lfree(struct tree_balance *tb, int h)
5657 + struct buffer_head *l, *f;
5658 + int order;
5660 + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
5661 + (l = tb->FL[h]) == NULL)
5662 + return 0;
5664 + if (f == l)
5665 + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
5666 + else {
5667 + order = B_NR_ITEMS(l);
5668 + f = l;
5671 + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
5675 + * Get free space of the right neighbor,
5676 + * which is stored in the parent node of the right neighbor.
5677 + */
5678 +static int get_rfree(struct tree_balance *tb, int h)
5680 + struct buffer_head *r, *f;
5681 + int order;
5683 + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
5684 + (r = tb->FR[h]) == NULL)
5685 + return 0;
5687 + if (f == r)
5688 + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
5689 + else {
5690 + order = 0;
5691 + f = r;
5694 + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
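+/*
+ * In both helpers above the free space is derived from the parent's
+ * disk_child entry (dc_size) rather than from the neighbor's own
+ * block header, so the neighbor block itself does not have to be
+ * read from disk to answer the question.
+ */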
5698 +/* Check whether left neighbor is in memory. */
5699 +static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
5701 + struct buffer_head *father, *left;
5702 + struct super_block *sb = tb->tb_sb;
5703 + b_blocknr_t left_neighbor_blocknr;
5704 + int left_neighbor_position;
5706 + /* Father of the left neighbor does not exist. */
5707 + if (!tb->FL[h])
5708 + return 0;
5710 + /* Calculate father of the node to be balanced. */
5711 + father = PATH_H_PBUFFER(tb->tb_path, h + 1);
5713 + RFALSE(!father ||
5714 + !B_IS_IN_TREE(father) ||
5715 + !B_IS_IN_TREE(tb->FL[h]) ||
5716 + !buffer_uptodate(father) ||
5717 + !buffer_uptodate(tb->FL[h]),
5718 + "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
5719 + father, tb->FL[h]);
5721 + /*
5722 + * Get position of the pointer to the left neighbor
5723 + * into the left father.
5724 + */
5725 + left_neighbor_position = (father == tb->FL[h]) ?
5726 + tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
5727 + /* Get left neighbor block number. */
5728 + left_neighbor_blocknr =
5729 + B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
5730 + /* Look for the left neighbor in the cache. */
5731 + if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {
5733 + RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
5734 + "vs-8170: left neighbor (%b %z) is not in the tree",
5735 + left, left);
5736 + put_bh(left);
5737 + return 1;
5740 + return 0;
5743 +#define LEFT_PARENTS 'l'
5744 +#define RIGHT_PARENTS 'r'
5746 +static void decrement_key(struct cpu_key *key)
5748 + /* call item specific function for this key */
5749 + item_ops[cpu_key_k_type(key)]->decrement_key(key);
5753 + * Calculate far left/right parent of the left/right neighbor of the
5754 + * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor
5755 + * of the parent F[h].
5756 + * Calculate left/right common parent of the current node and L[h]/R[h].
5757 + * Calculate left/right delimiting key position.
5758 + * Returns: PATH_INCORRECT - path in the tree is not correct
5759 + * SCHEDULE_OCCURRED - schedule occurred while the function worked
5760 + * CARRY_ON - schedule didn't occur while the function
5761 + * worked
5762 + */
5763 +static int get_far_parent(struct tree_balance *tb,
5764 + int h,
5765 + struct buffer_head **pfather,
5766 + struct buffer_head **pcom_father, char c_lr_par)
5768 + struct buffer_head *parent;
5769 + INITIALIZE_PATH(s_path_to_neighbor_father);
5770 + struct treepath *path = tb->tb_path;
5771 + struct cpu_key s_lr_father_key;
5772 + int counter,
5773 + position = INT_MAX,
5774 + first_last_position = 0,
5775 + path_offset = PATH_H_PATH_OFFSET(path, h);
5777 + /*
5778 + * Starting from F[h] go upwards in the tree, and look for the common
5779 +	 * ancestor of F[h] and its left/right neighbor, which should be obtained.
5780 + */
5782 + counter = path_offset;
5784 + RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
5785 + "PAP-8180: invalid path length");
5787 + for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
5788 + /*
5789 + * Check whether parent of the current buffer in the path
5790 + * is really parent in the tree.
5791 + */
5792 + if (!B_IS_IN_TREE
5793 + (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
5794 + return REPEAT_SEARCH;
5796 + /* Check whether position in the parent is correct. */
5797 + if ((position =
5798 + PATH_OFFSET_POSITION(path,
5799 + counter - 1)) >
5800 + B_NR_ITEMS(parent))
5801 + return REPEAT_SEARCH;
5803 + /*
5804 + * Check whether parent at the path really points
5805 + * to the child.
5806 + */
5807 + if (B_N_CHILD_NUM(parent, position) !=
5808 + PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
5809 + return REPEAT_SEARCH;
5811 + /*
5812 + * Return delimiting key if position in the parent is not
5813 + * equal to first/last one.
5814 + */
5815 + if (c_lr_par == RIGHT_PARENTS)
5816 + first_last_position = B_NR_ITEMS(parent);
5817 + if (position != first_last_position) {
5818 + *pcom_father = parent;
5819 + get_bh(*pcom_father);
5820 + /*(*pcom_father = parent)->b_count++; */
5821 + break;
5825 + /* if we are in the root of the tree, then there is no common father */
5826 + if (counter == FIRST_PATH_ELEMENT_OFFSET) {
5827 + /*
5828 + * Check whether first buffer in the path is the
5829 + * root of the tree.
5830 + */
5831 + if (PATH_OFFSET_PBUFFER
5832 + (tb->tb_path,
5833 + FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
5834 + SB_ROOT_BLOCK(tb->tb_sb)) {
5835 + *pfather = *pcom_father = NULL;
5836 + return CARRY_ON;
5838 + return REPEAT_SEARCH;
5841 + RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
5842 + "PAP-8185: (%b %z) level too small",
5843 + *pcom_father, *pcom_father);
5845 + /* Check whether the common parent is locked. */
5847 + if (buffer_locked(*pcom_father)) {
5849 + /* Release the write lock while the buffer is busy */
5850 + int depth = reiserfs_write_unlock_nested(tb->tb_sb);
5851 + __wait_on_buffer(*pcom_father);
5852 + reiserfs_write_lock_nested(tb->tb_sb, depth);
5853 + if (FILESYSTEM_CHANGED_TB(tb)) {
5854 + brelse(*pcom_father);
5855 + return REPEAT_SEARCH;
5859 + /*
5860 + * So, we got common parent of the current node and its
5861 + * left/right neighbor. Now we are getting the parent of the
5862 + * left/right neighbor.
5863 + */
5865 + /* Form key to get parent of the left/right neighbor. */
5866 + le_key2cpu_key(&s_lr_father_key,
5867 + internal_key(*pcom_father,
5868 + (c_lr_par ==
5869 + LEFT_PARENTS) ? (tb->lkey[h - 1] =
5870 + position -
5871 + 1) : (tb->rkey[h -
5872 + 1] =
5873 + position)));
5875 + if (c_lr_par == LEFT_PARENTS)
5876 + decrement_key(&s_lr_father_key);
5878 + if (search_by_key
5879 + (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
5880 + h + 1) == IO_ERROR)
5881 + /* path is released */
5882 + return IO_ERROR;
5884 + if (FILESYSTEM_CHANGED_TB(tb)) {
5885 + pathrelse(&s_path_to_neighbor_father);
5886 + brelse(*pcom_father);
5887 + return REPEAT_SEARCH;
5890 + *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
5892 + RFALSE(B_LEVEL(*pfather) != h + 1,
5893 + "PAP-8190: (%b %z) level too small", *pfather, *pfather);
5894 + RFALSE(s_path_to_neighbor_father.path_length <
5895 + FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
5897 + s_path_to_neighbor_father.path_length--;
5898 + pathrelse(&s_path_to_neighbor_father);
5899 + return CARRY_ON;
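+/*
+ * Note the path_length-- just before pathrelse() above: it excludes
+ * the last buffer from the release, so the reference returned via
+ * *pfather stays held while the rest of the temporary path is freed.
+ */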
5903 + * Get parents of neighbors of node in the path(S[path_offset]) and
5904 + * common parents of S[path_offset] and L[path_offset]/R[path_offset]:
5905 + * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset],
5906 + * CFR[path_offset].
5907 + * Calculate numbers of left and right delimiting keys position:
5908 + * lkey[path_offset], rkey[path_offset].
5909 + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked
5910 + * CARRY_ON - schedule didn't occur while the function worked
5911 + */
5912 +static int get_parents(struct tree_balance *tb, int h)
5914 + struct treepath *path = tb->tb_path;
5915 + int position,
5916 + ret,
5917 + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
5918 + struct buffer_head *curf, *curcf;
5920 + /* Current node is the root of the tree or will be root of the tree */
5921 + if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
5922 + /*
5923 + * The root can not have parents.
5924 + * Release nodes which previously were obtained as
5925 + * parents of the current node neighbors.
5926 + */
5927 + brelse(tb->FL[h]);
5928 + brelse(tb->CFL[h]);
5929 + brelse(tb->FR[h]);
5930 + brelse(tb->CFR[h]);
5931 + tb->FL[h] = NULL;
5932 + tb->CFL[h] = NULL;
5933 + tb->FR[h] = NULL;
5934 + tb->CFR[h] = NULL;
5935 + return CARRY_ON;
5938 + /* Get parent FL[path_offset] of L[path_offset]. */
5939 + position = PATH_OFFSET_POSITION(path, path_offset - 1);
5940 + if (position) {
5941 + /* Current node is not the first child of its parent. */
5942 + curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
5943 + curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
5944 + get_bh(curf);
5945 + get_bh(curf);
5946 + tb->lkey[h] = position - 1;
5947 + } else {
5948 + /*
5949 + * Calculate current parent of L[path_offset], which is the
5950 + * left neighbor of the current node. Calculate current
5951 + * common parent of L[path_offset] and the current node.
5952 + * Note that CFL[path_offset] not equal FL[path_offset] and
5953 + * CFL[path_offset] not equal F[path_offset].
5954 + * Calculate lkey[path_offset].
5955 + */
5956 + if ((ret = get_far_parent(tb, h + 1, &curf,
5957 + &curcf,
5958 + LEFT_PARENTS)) != CARRY_ON)
5959 + return ret;
5962 + brelse(tb->FL[h]);
5963 + tb->FL[h] = curf; /* New initialization of FL[h]. */
5964 + brelse(tb->CFL[h]);
5965 + tb->CFL[h] = curcf; /* New initialization of CFL[h]. */
5967 + RFALSE((curf && !B_IS_IN_TREE(curf)) ||
5968 + (curcf && !B_IS_IN_TREE(curcf)),
5969 + "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
5971 + /* Get parent FR[h] of R[h]. */
5973 + /* Current node is the last child of F[h]. FR[h] != F[h]. */
5974 + if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
5975 + /*
5976 + * Calculate current parent of R[h], which is the right
5977 + * neighbor of F[h]. Calculate current common parent of
5978 + * R[h] and current node. Note that CFR[h] not equal
5979 + * FR[path_offset] and CFR[h] not equal F[h].
5980 + */
5981 + if ((ret =
5982 + get_far_parent(tb, h + 1, &curf, &curcf,
5983 + RIGHT_PARENTS)) != CARRY_ON)
5984 + return ret;
5985 + } else {
5986 + /* Current node is not the last child of its parent F[h]. */
5987 + curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
5988 + curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
5989 + get_bh(curf);
5990 + get_bh(curf);
5991 + tb->rkey[h] = position;
5994 + brelse(tb->FR[h]);
5995 + /* New initialization of FR[path_offset]. */
5996 + tb->FR[h] = curf;
5998 + brelse(tb->CFR[h]);
5999 + /* New initialization of CFR[path_offset]. */
6000 + tb->CFR[h] = curcf;
6002 + RFALSE((curf && !B_IS_IN_TREE(curf)) ||
6003 + (curcf && !B_IS_IN_TREE(curcf)),
6004 + "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);
6006 + return CARRY_ON;
6010 + * it is possible to remove a node as a result of shifting to
6011 + * neighbors even when we insert or paste an item.
6012 + */
6013 +static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
6014 + struct tree_balance *tb, int h)
6016 + struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
6017 + int levbytes = tb->insert_size[h];
6018 + struct item_head *ih;
6019 + struct reiserfs_key *r_key = NULL;
6021 + ih = item_head(Sh, 0);
6022 + if (tb->CFR[h])
6023 + r_key = internal_key(tb->CFR[h], tb->rkey[h]);
6025 + if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
6026 + /* shifting may merge items which might save space */
6028 + ((!h
6029 + && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0)
6031 + ((!h && r_key
6032 + && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
6033 + + ((h) ? KEY_SIZE : 0)) {
6034 + /* node can not be removed */
6035 + if (sfree >= levbytes) {
6036 + /* new item fits into node S[h] without any shifting */
6037 + if (!h)
6038 + tb->s0num =
6039 + B_NR_ITEMS(Sh) +
6040 + ((mode == M_INSERT) ? 1 : 0);
6041 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6042 + return NO_BALANCING_NEEDED;
6045 + PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
6046 + return !NO_BALANCING_NEEDED;
6050 + * Check whether current node S[h] is balanced when increasing its size by
6051 + * Inserting or Pasting.
6052 + * Calculate parameters for balancing for current level h.
6053 + * Parameters:
6054 + * tb tree_balance structure;
6055 + * h current level of the node;
6056 + * inum item number in S[h];
6057 + * mode i - insert, p - paste;
6058 + * Returns: 1 - schedule occurred;
6059 + * 0 - balancing for higher levels needed;
6060 + * -1 - no balancing for higher levels needed;
6061 + * -2 - no disk space.
6062 + */
6063 +/* ip means Inserting or Pasting */
6064 +static int ip_check_balance(struct tree_balance *tb, int h)
6066 + struct virtual_node *vn = tb->tb_vn;
6067 + /*
6068 +	 * Number of bytes that must be inserted into the buffer which
6069 +	 * contains the node being balanced (value is negative if bytes are deleted).
6070 + * The mnemonic is that the attempted change in node space used
6071 + * level is levbytes bytes.
6072 + */
6073 + int levbytes;
6074 + int ret;
6076 + int lfree, sfree, rfree /* free space in L, S and R */ ;
6078 + /*
6079 +	 * nver is short for number of vertices, and lnver is the number if
6080 +	 * we shift to the left, rnver is the number if we shift to the
6081 +	 * right, and lrnver is the number if we shift in both directions.
6082 +	 * The goal is to minimize first the number of vertices, and second,
6083 +	 * the number of vertices whose contents are changed by shifting,
6084 +	 * and third the number of uncached vertices whose contents are
6085 +	 * changed by shifting and must be read from disk.
6086 + */
6087 + int nver, lnver, rnver, lrnver;
6089 + /*
6090 + * used at leaf level only, S0 = S[0] is the node being balanced,
6091 + * sInum [ I = 0,1,2 ] is the number of items that will
6092 + * remain in node SI after balancing. S1 and S2 are new
6093 + * nodes that might be created.
6094 + */
6096 + /*
6097 +	 * we perform 8 calls to get_num_ver(). For each call we
6098 +	 * calculate five parameters, where the 4th parameter is s1bytes
6099 +	 * and the 5th is s2bytes
6101 +	 * s0num, s1num, s2num for 8 cases
6102 +	 * 0,1 - do not shift, and do not shift but bottle
6103 +	 * 2   - shift only whole items to the left
6104 +	 * 3   - shift to the left and bottle as much as possible
6105 +	 * 4,5 - shift to the right (whole items, and as much as possible)
6106 +	 * 6,7 - shift in both directions (whole items, and as much as possible)
6107 + */
6108 + short snum012[40] = { 0, };
6110 + /* Sh is the node whose balance is currently being checked */
6111 + struct buffer_head *Sh;
6113 + Sh = PATH_H_PBUFFER(tb->tb_path, h);
6114 + levbytes = tb->insert_size[h];
6116 + /* Calculate balance parameters for creating new root. */
6117 + if (!Sh) {
6118 + if (!h)
6119 + reiserfs_panic(tb->tb_sb, "vs-8210",
6120 + "S[0] can not be 0");
6121 + switch (ret = get_empty_nodes(tb, h)) {
6122 + /* no balancing for higher levels needed */
6123 + case CARRY_ON:
6124 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6125 + return NO_BALANCING_NEEDED;
6127 + case NO_DISK_SPACE:
6128 + case REPEAT_SEARCH:
6129 + return ret;
6130 + default:
6131 + reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
6132 + "return value of get_empty_nodes");
6136 + /* get parents of S[h] neighbors. */
6137 + ret = get_parents(tb, h);
6138 + if (ret != CARRY_ON)
6139 + return ret;
6141 + sfree = B_FREE_SPACE(Sh);
6143 + /* get free space of neighbors */
6144 + rfree = get_rfree(tb, h);
6145 + lfree = get_lfree(tb, h);
6147 + /* and new item fits into node S[h] without any shifting */
6148 + if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
6149 + NO_BALANCING_NEEDED)
6150 + return NO_BALANCING_NEEDED;
6152 + create_virtual_node(tb, h);
6154 + /*
6155 + * determine maximal number of items we can shift to the left
6156 + * neighbor (in tb structure) and the maximal number of bytes
6157 + * that can flow to the left neighbor from the left most liquid
6158 + * item that cannot be shifted from S[0] entirely (returned value)
6159 + */
6160 + check_left(tb, h, lfree);
6162 + /*
6163 + * determine maximal number of items we can shift to the right
6164 + * neighbor (in tb structure) and the maximal number of bytes
6165 + * that can flow to the right neighbor from the right most liquid
6166 + * item that cannot be shifted from S[0] entirely (returned value)
6167 + */
6168 + check_right(tb, h, rfree);
6170 + /*
6171 + * all contents of internal node S[h] can be moved into its
6172 + * neighbors, S[h] will be removed after balancing
6173 + */
6174 + if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
6175 + int to_r;
6177 + /*
6178 + * Since we are working on internal nodes, and our internal
6179 + * nodes have fixed size entries, then we can balance by the
6180 + * number of items rather than the space they consume. In this
6181 + * routine we set the left node equal to the right node,
6182 + * allowing a difference of less than or equal to 1 child
6183 + * pointer.
6184 + */
6185 + to_r =
6186 + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
6187 + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
6188 + tb->rnum[h]);
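+		/*
+		 * The arithmetic above can be read as follows: with
+		 * A = MAX_NR_KEY(Sh) + 1 - tb->lnum[h] child pointers already
+		 * in L[h], B = MAX_NR_KEY(Sh) + 1 - tb->rnum[h] already in
+		 * R[h], and N = vn->vn_nr_item + 1 pointers in S[h],
+		 * to_r = (A + B + N) / 2 - B, so R[h] ends up with
+		 * (A + B + N) / 2 pointers and L[h] with the rest - the
+		 * difference of at most one child pointer noted above.
+		 */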
6189 + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
6190 + -1, -1);
6191 + return CARRY_ON;
6194 + /*
6195 + * this checks balance condition, that any two neighboring nodes
6196 + * can not fit in one node
6197 + */
6198 + RFALSE(h &&
6199 + (tb->lnum[h] >= vn->vn_nr_item + 1 ||
6200 + tb->rnum[h] >= vn->vn_nr_item + 1),
6201 + "vs-8220: tree is not balanced on internal level");
6202 + RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
6203 + (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
6204 + "vs-8225: tree is not balanced on leaf level");
6206 + /*
6207 + * all contents of S[0] can be moved into its neighbors
6208 + * S[0] will be removed after balancing.
6209 + */
6210 + if (!h && is_leaf_removable(tb))
6211 + return CARRY_ON;
6213 + /*
6214 + * why do we perform this check here rather than earlier??
6215 + * Answer: we can win 1 node in some cases above. Moreover we
6216 +	 * checked it above, when we checked that S[0] is not removable
6217 + * in principle
6218 + */
6220 + /* new item fits into node S[h] without any shifting */
6221 + if (sfree >= levbytes) {
6222 + if (!h)
6223 + tb->s0num = vn->vn_nr_item;
6224 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6225 + return NO_BALANCING_NEEDED;
6229 + int lpar, rpar, nset, lset, rset, lrset;
6230 + /* regular overflowing of the node */
6232 + /*
6233 + * get_num_ver works in 2 modes (FLOW & NO_FLOW)
6234 + * lpar, rpar - number of items we can shift to left/right
6235 + * neighbor (including splitting item)
6236 + * nset, lset, rset, lrset - shows, whether flowing items
6237 + * give better packing
6238 + */
6239 +#define FLOW 1
6240 +#define NO_FLOW	0	/* do not do any splitting */
6242 + /* we choose one of the following */
6243 +#define NOTHING_SHIFT_NO_FLOW 0
6244 +#define NOTHING_SHIFT_FLOW 5
6245 +#define LEFT_SHIFT_NO_FLOW 10
6246 +#define LEFT_SHIFT_FLOW 15
6247 +#define RIGHT_SHIFT_NO_FLOW 20
6248 +#define RIGHT_SHIFT_FLOW 25
6249 +#define LR_SHIFT_NO_FLOW 30
6250 +#define LR_SHIFT_FLOW 35
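+		/*
+		 * So snum012[40] holds 8 cases x 5 shorts: each base offset
+		 * above points at a {s0num, s1num, s2num, s1bytes, s2bytes}
+		 * group that get_num_ver() fills and set_parameters() later
+		 * consumes via snum012 + nset/lset/rset/lrset.
+		 */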
6252 + lpar = tb->lnum[h];
6253 + rpar = tb->rnum[h];
6255 + /*
6256 + * calculate number of blocks S[h] must be split into when
6257 + * nothing is shifted to the neighbors, as well as number of
6258 + * items in each part of the split node (s012 numbers),
6259 + * and number of bytes (s1bytes) of the shared drop which
6260 +		 * flows to S1 if any
6261 + */
6262 + nset = NOTHING_SHIFT_NO_FLOW;
6263 + nver = get_num_ver(vn->vn_mode, tb, h,
6264 + 0, -1, h ? vn->vn_nr_item : 0, -1,
6265 + snum012, NO_FLOW);
6267 + if (!h) {
6268 + int nver1;
6270 + /*
6271 + * note, that in this case we try to bottle
6272 + * between S[0] and S1 (S1 - the first new node)
6273 + */
6274 + nver1 = get_num_ver(vn->vn_mode, tb, h,
6275 + 0, -1, 0, -1,
6276 + snum012 + NOTHING_SHIFT_FLOW, FLOW);
6277 + if (nver > nver1)
6278 + nset = NOTHING_SHIFT_FLOW, nver = nver1;
6281 + /*
6282 + * calculate number of blocks S[h] must be split into when
6283 + * l_shift_num first items and l_shift_bytes of the right
6284 + * most liquid item to be shifted are shifted to the left
6285 + * neighbor, as well as number of items in each part of the
6286 +		 * split node (s012 numbers), and number of bytes
6287 +		 * (s1bytes) of the shared drop which flows to S1 if any
6288 + */
6289 + lset = LEFT_SHIFT_NO_FLOW;
6290 + lnver = get_num_ver(vn->vn_mode, tb, h,
6291 + lpar - ((h || tb->lbytes == -1) ? 0 : 1),
6292 + -1, h ? vn->vn_nr_item : 0, -1,
6293 + snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
6294 + if (!h) {
6295 + int lnver1;
6297 + lnver1 = get_num_ver(vn->vn_mode, tb, h,
6298 + lpar -
6299 + ((tb->lbytes != -1) ? 1 : 0),
6300 + tb->lbytes, 0, -1,
6301 + snum012 + LEFT_SHIFT_FLOW, FLOW);
6302 + if (lnver > lnver1)
6303 + lset = LEFT_SHIFT_FLOW, lnver = lnver1;
6306 + /*
6307 + * calculate number of blocks S[h] must be split into when
6308 + * r_shift_num first items and r_shift_bytes of the left most
6309 + * liquid item to be shifted are shifted to the right neighbor,
6310 +		 * as well as number of items in each part of the split
6311 +		 * node (s012 numbers), and number of bytes (s1bytes) of the
6312 +		 * shared drop which flows to S1 if any
6313 + */
6314 + rset = RIGHT_SHIFT_NO_FLOW;
6315 + rnver = get_num_ver(vn->vn_mode, tb, h,
6316 + 0, -1,
6317 + h ? (vn->vn_nr_item - rpar) : (rpar -
6318 + ((tb->
6319 + rbytes !=
6320 + -1) ? 1 :
6321 + 0)), -1,
6322 + snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
6323 + if (!h) {
6324 + int rnver1;
6326 + rnver1 = get_num_ver(vn->vn_mode, tb, h,
6327 + 0, -1,
6328 + (rpar -
6329 + ((tb->rbytes != -1) ? 1 : 0)),
6330 + tb->rbytes,
6331 + snum012 + RIGHT_SHIFT_FLOW, FLOW);
6333 + if (rnver > rnver1)
6334 + rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
6337 + /*
6338 + * calculate number of blocks S[h] must be split into when
6339 + * items are shifted in both directions, as well as number
6340 +		 * of items in each part of the split node (s012 numbers),
6341 +		 * and number of bytes (s1bytes) of the shared drop which
6342 +		 * flows to S1 if any
6343 + */
6344 + lrset = LR_SHIFT_NO_FLOW;
6345 + lrnver = get_num_ver(vn->vn_mode, tb, h,
6346 + lpar - ((h || tb->lbytes == -1) ? 0 : 1),
6347 + -1,
6348 + h ? (vn->vn_nr_item - rpar) : (rpar -
6349 + ((tb->
6350 + rbytes !=
6351 + -1) ? 1 :
6352 + 0)), -1,
6353 + snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
6354 + if (!h) {
6355 + int lrnver1;
6357 + lrnver1 = get_num_ver(vn->vn_mode, tb, h,
6358 + lpar -
6359 + ((tb->lbytes != -1) ? 1 : 0),
6360 + tb->lbytes,
6361 + (rpar -
6362 + ((tb->rbytes != -1) ? 1 : 0)),
6363 + tb->rbytes,
6364 + snum012 + LR_SHIFT_FLOW, FLOW);
6365 + if (lrnver > lrnver1)
6366 + lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
6369 + /*
6370 + * Our general shifting strategy is:
6371 +		 * 1) to minimize the number of new nodes;
6372 +		 * 2) to minimize the number of neighbors involved in shifting;
6373 +		 * 3) to minimize the number of disk reads;
6374 + */
6376 + /* we can win TWO or ONE nodes by shifting in both directions */
6377 + if (lrnver < lnver && lrnver < rnver) {
6378 + RFALSE(h &&
6379 + (tb->lnum[h] != 1 ||
6380 + tb->rnum[h] != 1 ||
6381 + lrnver != 1 || rnver != 2 || lnver != 2
6382 + || h != 1), "vs-8230: bad h");
6383 + if (lrset == LR_SHIFT_FLOW)
6384 + set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
6385 + lrnver, snum012 + lrset,
6386 + tb->lbytes, tb->rbytes);
6387 + else
6388 + set_parameters(tb, h,
6389 + tb->lnum[h] -
6390 + ((tb->lbytes == -1) ? 0 : 1),
6391 + tb->rnum[h] -
6392 + ((tb->rbytes == -1) ? 0 : 1),
6393 + lrnver, snum012 + lrset, -1, -1);
6395 + return CARRY_ON;
6398 + /*
6399 + * if shifting doesn't lead to better packing
6400 + * then don't shift
6401 + */
6402 + if (nver == lrnver) {
6403 + set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
6404 + -1);
6405 + return CARRY_ON;
6408 + /*
6409 + * now we know that for better packing shifting in only one
6410 + * direction either to the left or to the right is required
6411 + */
6413 + /*
6414 + * if shifting to the left is better than
6415 + * shifting to the right
6416 + */
6417 + if (lnver < rnver) {
6418 + SET_PAR_SHIFT_LEFT;
6419 + return CARRY_ON;
6422 + /*
6423 + * if shifting to the right is better than
6424 + * shifting to the left
6425 + */
6426 + if (lnver > rnver) {
6427 + SET_PAR_SHIFT_RIGHT;
6428 + return CARRY_ON;
6431 + /*
6432 + * now shifting in either direction gives the same number
6433 + * of nodes and we can make use of the cached neighbors
6434 + */
6435 + if (is_left_neighbor_in_cache(tb, h)) {
6436 + SET_PAR_SHIFT_LEFT;
6437 + return CARRY_ON;
6440 + /*
6441 +		 * shift to the right regardless of whether the
6442 +		 * right neighbor is in cache or not
6443 + */
6444 + SET_PAR_SHIFT_RIGHT;
6445 + return CARRY_ON;
6450 + * Check whether current node S[h] is balanced when Decreasing its size by
6451 + * Deleting or Cutting for INTERNAL node of S+tree.
6452 + * Calculate parameters for balancing for current level h.
6453 + * Parameters:
6454 + * tb tree_balance structure;
6455 + * h current level of the node;
6456 + * inum item number in S[h];
6457 + *	mode	d - delete, c - cut;
6458 + * Returns: 1 - schedule occurred;
6459 + * 0 - balancing for higher levels needed;
6460 + * -1 - no balancing for higher levels needed;
6461 + * -2 - no disk space.
6463 + * Note: Items of internal nodes have fixed size, so the balance condition for
6464 + * the internal part of the S+tree is the same as for B-trees.
6465 + */
6466 +static int dc_check_balance_internal(struct tree_balance *tb, int h)
6468 + struct virtual_node *vn = tb->tb_vn;
6470 + /*
6471 + * Sh is the node whose balance is currently being checked,
6472 + * and Fh is its father.
6473 + */
6474 + struct buffer_head *Sh, *Fh;
6475 + int ret;
6476 + int lfree, rfree /* free space in L and R */ ;
6478 + Sh = PATH_H_PBUFFER(tb->tb_path, h);
6479 + Fh = PATH_H_PPARENT(tb->tb_path, h);
6481 + /*
6482 + * using tb->insert_size[h], which is negative in this case,
6483 + * create_virtual_node calculates:
6484 + * new_nr_item = number of items node would have if operation is
6485 + * performed without balancing (new_nr_item);
6486 + */
6487 + create_virtual_node(tb, h);
6489 + if (!Fh) { /* S[h] is the root. */
6490 + /* no balancing for higher levels needed */
6491 + if (vn->vn_nr_item > 0) {
6492 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6493 + return NO_BALANCING_NEEDED;
6495 + /*
6496 + * new_nr_item == 0.
6497 + * Current root will be deleted resulting in
6498 + * decrementing the tree height.
6499 + */
6500 + set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
6501 + return CARRY_ON;
6504 + if ((ret = get_parents(tb, h)) != CARRY_ON)
6505 + return ret;
6507 + /* get free space of neighbors */
6508 + rfree = get_rfree(tb, h);
6509 + lfree = get_lfree(tb, h);
6511 + /* determine maximal number of items we can fit into neighbors */
6512 + check_left(tb, h, lfree);
6513 + check_right(tb, h, rfree);
6515 + /*
6516 + * Balance condition for the internal node is valid.
6517 + * In this case we balance only if it leads to better packing.
6518 + */
6519 + if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {
6520 + /*
6521 + * Here we join S[h] with one of its neighbors,
6522 + * which is impossible with greater values of new_nr_item.
6523 + */
6524 + if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {
6525 + /* All contents of S[h] can be moved to L[h]. */
6526 + if (tb->lnum[h] >= vn->vn_nr_item + 1) {
6527 + int n;
6528 + int order_L;
6530 + order_L =
6531 + ((n =
6532 + PATH_H_B_ITEM_ORDER(tb->tb_path,
6533 + h)) ==
6534 + 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
6535 + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
6536 + (DC_SIZE + KEY_SIZE);
6537 + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
6538 + -1);
6539 + return CARRY_ON;
6542 + /* All contents of S[h] can be moved to R[h]. */
6543 + if (tb->rnum[h] >= vn->vn_nr_item + 1) {
6544 + int n;
6545 + int order_R;
6547 + order_R =
6548 + ((n =
6549 + PATH_H_B_ITEM_ORDER(tb->tb_path,
6550 + h)) ==
6551 + B_NR_ITEMS(Fh)) ? 0 : n + 1;
6552 + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
6553 + (DC_SIZE + KEY_SIZE);
6554 + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
6555 + -1);
6556 + return CARRY_ON;
6560 + /*
6561 + * All contents of S[h] can be moved to the neighbors
6562 + * (L[h] & R[h]).
6563 + */
6564 + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
6565 + int to_r;
6567 + to_r =
6568 + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
6569 + tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
6570 + (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
6571 + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
6572 + 0, NULL, -1, -1);
6573 + return CARRY_ON;
6576 + /* Balancing does not lead to better packing. */
6577 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6578 + return NO_BALANCING_NEEDED;
6581 + /*
6582 +	 * Current node contains an insufficient number of items.
6583 + * Balancing is required.
6584 + */
6585 + /* Check whether we can merge S[h] with left neighbor. */
6586 + if (tb->lnum[h] >= vn->vn_nr_item + 1)
6587 + if (is_left_neighbor_in_cache(tb, h)
6588 + || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
6589 + int n;
6590 + int order_L;
6592 + order_L =
6593 + ((n =
6594 + PATH_H_B_ITEM_ORDER(tb->tb_path,
6595 + h)) ==
6596 + 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
6597 + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
6598 + KEY_SIZE);
6599 + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
6600 + return CARRY_ON;
6603 + /* Check whether we can merge S[h] with right neighbor. */
6604 + if (tb->rnum[h] >= vn->vn_nr_item + 1) {
6605 + int n;
6606 + int order_R;
6608 + order_R =
6609 + ((n =
6610 + PATH_H_B_ITEM_ORDER(tb->tb_path,
6611 + h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
6612 + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
6613 + KEY_SIZE);
6614 + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
6615 + return CARRY_ON;
6618 + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
6619 + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
6620 + int to_r;
6622 + to_r =
6623 + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
6624 + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
6625 + tb->rnum[h]);
6626 + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
6627 + -1, -1);
6628 + return CARRY_ON;
6631 +	/* For internal nodes try to borrow an item from a neighbor */
6632 + RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
6634 +	/* Borrow one or two items from the cached neighbor */
6635 + if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
6636 + int from_l;
6638 + from_l =
6639 + (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
6640 + 1) / 2 - (vn->vn_nr_item + 1);
6641 + set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
6642 + return CARRY_ON;
6645 + set_parameters(tb, h, 0,
6646 + -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
6647 + 1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
6648 + return CARRY_ON;
6652 + * Check whether current node S[h] is balanced when Decreasing its size by
6653 + * Deleting or Truncating for LEAF node of S+tree.
6654 + * Calculate parameters for balancing for current level h.
6655 + * Parameters:
6656 + * tb tree_balance structure;
6657 + * h current level of the node;
6658 + * inum item number in S[h];
6659 + *	mode	d - delete, c - cut;
6660 + * Returns: 1 - schedule occurred;
6661 + * 0 - balancing for higher levels needed;
6662 + * -1 - no balancing for higher levels needed;
6663 + * -2 - no disk space.
6664 + */
6665 +static int dc_check_balance_leaf(struct tree_balance *tb, int h)
6667 + struct virtual_node *vn = tb->tb_vn;
6669 + /*
6670 +	 * Number of bytes that must be deleted from the buffer which
6671 +	 * contains the node being balanced (value is negative since
6672 +	 * bytes are deleted). The mnemonic is that the
6673 + * attempted change in node space used level is levbytes bytes.
6674 + */
6675 + int levbytes;
6677 + /* the maximal item size */
6678 + int maxsize, ret;
6680 + /*
6681 + * S0 is the node whose balance is currently being checked,
6682 + * and F0 is its father.
6683 + */
6684 + struct buffer_head *S0, *F0;
6685 + int lfree, rfree /* free space in L and R */ ;
6687 + S0 = PATH_H_PBUFFER(tb->tb_path, 0);
6688 + F0 = PATH_H_PPARENT(tb->tb_path, 0);
6690 + levbytes = tb->insert_size[h];
6692 + maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */
6694 + if (!F0) { /* S[0] is the root now. */
6696 + RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
6697 + "vs-8240: attempt to create empty buffer tree");
6699 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6700 + return NO_BALANCING_NEEDED;
6703 + if ((ret = get_parents(tb, h)) != CARRY_ON)
6704 + return ret;
6706 + /* get free space of neighbors */
6707 + rfree = get_rfree(tb, h);
6708 + lfree = get_lfree(tb, h);
6710 + create_virtual_node(tb, h);
6712 +	/* if 3 leaves can be merged into one, set parameters and return */
6713 + if (are_leaves_removable(tb, lfree, rfree))
6714 + return CARRY_ON;
6716 + /*
6717 + * determine maximal number of items we can shift to the left/right
6718 + * neighbor and the maximal number of bytes that can flow to the
6719 + * left/right neighbor from the left/right most liquid item that
6720 + * cannot be shifted from S[0] entirely
6721 + */
6722 + check_left(tb, h, lfree);
6723 + check_right(tb, h, rfree);
6725 + /* check whether we can merge S with left neighbor. */
6726 + if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
6727 + if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */
6728 + !tb->FR[h]) {
6730 + RFALSE(!tb->FL[h],
6731 + "vs-8245: dc_check_balance_leaf: FL[h] must exist");
6733 + /* set parameter to merge S[0] with its left neighbor */
6734 + set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
6735 + return CARRY_ON;
6738 + /* check whether we can merge S[0] with right neighbor. */
6739 + if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
6740 + set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
6741 + return CARRY_ON;
6744 + /*
6745 + * All contents of S[0] can be moved to the neighbors (L[0] & R[0]).
6746 + * Set parameters and return
6747 + */
6748 + if (is_leaf_removable(tb))
6749 + return CARRY_ON;
6751 + /* Balancing is not required. */
6752 + tb->s0num = vn->vn_nr_item;
6753 + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
6754 + return NO_BALANCING_NEEDED;
6758 + * Check whether current node S[h] is balanced when Decreasing its size by
6759 + * Deleting or Cutting.
6760 + * Calculate parameters for balancing for current level h.
6761 + * Parameters:
6762 + * tb tree_balance structure;
6763 + * h current level of the node;
6764 + * inum item number in S[h];
6765 + * mode d - delete, c - cut.
6766 + * Returns: 1 - schedule occurred;
6767 + * 0 - balancing for higher levels needed;
6768 + * -1 - no balancing for higher levels needed;
6769 + * -2 - no disk space.
6770 + */
6771 +static int dc_check_balance(struct tree_balance *tb, int h)
6773 + RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
6774 + "vs-8250: S is not initialized");
6776 + if (h)
6777 + return dc_check_balance_internal(tb, h);
6778 + else
6779 + return dc_check_balance_leaf(tb, h);
6783 + * Check whether current node S[h] is balanced.
6784 + * Calculate parameters for balancing for current level h.
6785 + * Parameters:
6787 + * tb tree_balance structure:
6789 + * tb is a large structure that must be read about in the header
6790 + * file at the same time as this procedure if the reader is
6791 + * to successfully understand this procedure
6793 + * h current level of the node;
6794 + * inum item number in S[h];
6795 + * mode i - insert, p - paste, d - delete, c - cut.
6796 + * Returns: 1 - schedule occurred;
6797 + * 0 - balancing for higher levels needed;
6798 + * -1 - no balancing for higher levels needed;
6799 + * -2 - no disk space.
6800 + */
6801 +static int check_balance(int mode,
6802 + struct tree_balance *tb,
6803 + int h,
6804 + int inum,
6805 + int pos_in_item,
6806 + struct item_head *ins_ih, const void *data)
6808 + struct virtual_node *vn;
6810 + vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
6811 + vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
6812 + vn->vn_mode = mode;
6813 + vn->vn_affected_item_num = inum;
6814 + vn->vn_pos_in_item = pos_in_item;
6815 + vn->vn_ins_ih = ins_ih;
6816 + vn->vn_data = data;
6818 + RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
6819 + "vs-8255: ins_ih can not be 0 in insert mode");
6821 + /* Calculate balance parameters when size of node is increasing. */
6822 + if (tb->insert_size[h] > 0)
6823 + return ip_check_balance(tb, h);
6825 + /* Calculate balance parameters when size of node is decreasing. */
6826 + return dc_check_balance(tb, h);
6829 +/* Check whether the parent at the path is really the parent of the current node. */
6830 +static int get_direct_parent(struct tree_balance *tb, int h)
6832 + struct buffer_head *bh;
6833 + struct treepath *path = tb->tb_path;
6834 + int position,
6835 + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
6837 + /* We are in the root or in the new root. */
6838 + if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
6840 + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
6841 + "PAP-8260: invalid offset in the path");
6843 + if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
6844 + b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
6845 + /* Root is not changed. */
6846 + PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
6847 + PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
6848 + return CARRY_ON;
6850 + /* Root is changed and we must recalculate the path. */
6851 + return REPEAT_SEARCH;
6854 + /* Parent in the path is not in the tree. */
6855 + if (!B_IS_IN_TREE
6856 + (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
6857 + return REPEAT_SEARCH;
6859 + if ((position =
6860 + PATH_OFFSET_POSITION(path,
6861 + path_offset - 1)) > B_NR_ITEMS(bh))
6862 + return REPEAT_SEARCH;
6864 + /* Parent in the path is not parent of the current node in the tree. */
6865 + if (B_N_CHILD_NUM(bh, position) !=
6866 + PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
6867 + return REPEAT_SEARCH;
6869 + if (buffer_locked(bh)) {
6870 + int depth = reiserfs_write_unlock_nested(tb->tb_sb);
6871 + __wait_on_buffer(bh);
6872 + reiserfs_write_lock_nested(tb->tb_sb, depth);
6873 + if (FILESYSTEM_CHANGED_TB(tb))
6874 + return REPEAT_SEARCH;
6877 + /*
6878 + * Parent in the path is unlocked and really parent
6879 + * of the current node.
6880 + */
6881 + return CARRY_ON;
6885 + * Using lnum[h] and rnum[h] we should determine what neighbors
6886 + * of S[h] we need in order to balance S[h], and get them
6887 + * if necessary.
6888 + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
6889 + * CARRY_ON - schedule didn't occur while the function worked;
6890 + */
6891 +static int get_neighbors(struct tree_balance *tb, int h)
6893 + int child_position,
6894 + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
6895 + unsigned long son_number;
6896 + struct super_block *sb = tb->tb_sb;
6897 + struct buffer_head *bh;
6898 + int depth;
6900 + PROC_INFO_INC(sb, get_neighbors[h]);
6902 + if (tb->lnum[h]) {
6903 + /* We need left neighbor to balance S[h]. */
6904 + PROC_INFO_INC(sb, need_l_neighbor[h]);
6905 + bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
6907 + RFALSE(bh == tb->FL[h] &&
6908 + !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
6909 + "PAP-8270: invalid position in the parent");
6911 + child_position =
6912 + (bh ==
6913 + tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
6914 + FL[h]);
6915 + son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
6916 + depth = reiserfs_write_unlock_nested(tb->tb_sb);
6917 + bh = sb_bread(sb, son_number);
6918 + reiserfs_write_lock_nested(tb->tb_sb, depth);
6919 + if (!bh)
6920 + return IO_ERROR;
6921 + if (FILESYSTEM_CHANGED_TB(tb)) {
6922 + brelse(bh);
6923 + PROC_INFO_INC(sb, get_neighbors_restart[h]);
6924 + return REPEAT_SEARCH;
6927 + RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
6928 + child_position > B_NR_ITEMS(tb->FL[h]) ||
6929 + B_N_CHILD_NUM(tb->FL[h], child_position) !=
6930 + bh->b_blocknr, "PAP-8275: invalid parent");
6931 + RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
6932 + RFALSE(!h &&
6933 + B_FREE_SPACE(bh) !=
6934 + MAX_CHILD_SIZE(bh) -
6935 + dc_size(B_N_CHILD(tb->FL[0], child_position)),
6936 + "PAP-8290: invalid child size of left neighbor");
6938 + brelse(tb->L[h]);
6939 + tb->L[h] = bh;
6942 + /* We need right neighbor to balance S[path_offset]. */
6943 + if (tb->rnum[h]) {
6944 + PROC_INFO_INC(sb, need_r_neighbor[h]);
6945 + bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
6947 + RFALSE(bh == tb->FR[h] &&
6948 + PATH_OFFSET_POSITION(tb->tb_path,
6949 + path_offset) >=
6950 + B_NR_ITEMS(bh),
6951 + "PAP-8295: invalid position in the parent");
6953 + child_position =
6954 + (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
6955 + son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
6956 + depth = reiserfs_write_unlock_nested(tb->tb_sb);
6957 + bh = sb_bread(sb, son_number);
6958 + reiserfs_write_lock_nested(tb->tb_sb, depth);
6959 + if (!bh)
6960 + return IO_ERROR;
6961 + if (FILESYSTEM_CHANGED_TB(tb)) {
6962 + brelse(bh);
6963 + PROC_INFO_INC(sb, get_neighbors_restart[h]);
6964 + return REPEAT_SEARCH;
6966 + brelse(tb->R[h]);
6967 + tb->R[h] = bh;
6969 + RFALSE(!h
6970 + && B_FREE_SPACE(bh) !=
6971 + MAX_CHILD_SIZE(bh) -
6972 + dc_size(B_N_CHILD(tb->FR[0], child_position)),
6973 + "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
6974 + B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
6975 + dc_size(B_N_CHILD(tb->FR[0], child_position)));
6978 + return CARRY_ON;
6981 +static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
6983 + int max_num_of_items;
6984 + int max_num_of_entries;
6985 + unsigned long blocksize = sb->s_blocksize;
6987 +#define MIN_NAME_LEN 1
6989 + max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
6990 + max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
6991 + (DEH_SIZE + MIN_NAME_LEN);
6993 + return sizeof(struct virtual_node) +
6994 + max(max_num_of_items * sizeof(struct virtual_item),
6995 + sizeof(struct virtual_item) +
6996 + struct_size_t(struct direntry_uarea, entry_sizes,
6997 + max_num_of_entries));
7001 + * maybe we should fail the balancing we are going to perform when
7002 + * kmalloc fails several times. But for now it will loop until
7003 + * kmalloc gets the required memory
7004 + */
7005 +static int get_mem_for_virtual_node(struct tree_balance *tb)
7007 + int check_fs = 0;
7008 + int size;
7009 + char *buf;
7011 + size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
7013 + /* we have to allocate more memory for virtual node */
7014 + if (size > tb->vn_buf_size) {
7015 + if (tb->vn_buf) {
7016 + /* free memory allocated before */
7017 + kfree(tb->vn_buf);
7018 + /* this is not needed if kfree is atomic */
7019 + check_fs = 1;
7022 +		/* the virtual node now requires more memory */
7023 + tb->vn_buf_size = size;
7025 + /* get memory for virtual item */
7026 + buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
7027 + if (!buf) {
7028 + /*
7029 + * getting memory with GFP_KERNEL priority may involve
7030 + * balancing now (due to indirect_to_direct conversion
7031 + * on dcache shrinking). So, release path and collected
7032 + * resources here
7033 + */
7034 + free_buffers_in_tb(tb);
7035 + buf = kmalloc(size, GFP_NOFS);
7036 + if (!buf) {
7037 + tb->vn_buf_size = 0;
7039 + tb->vn_buf = buf;
7040 + schedule();
7041 + return REPEAT_SEARCH;
7044 + tb->vn_buf = buf;
7047 + if (check_fs && FILESYSTEM_CHANGED_TB(tb))
7048 + return REPEAT_SEARCH;
7050 + return CARRY_ON;
7053 +#ifdef CONFIG_REISERFS_CHECK
7054 +static void tb_buffer_sanity_check(struct super_block *sb,
7055 + struct buffer_head *bh,
7056 + const char *descr, int level)
7058 + if (bh) {
7059 + if (atomic_read(&(bh->b_count)) <= 0)
7061 + reiserfs_panic(sb, "jmacd-1", "negative or zero "
7062 + "reference counter for buffer %s[%d] "
7063 + "(%b)", descr, level, bh);
7065 + if (!buffer_uptodate(bh))
7066 + reiserfs_panic(sb, "jmacd-2", "buffer is not up "
7067 + "to date %s[%d] (%b)",
7068 + descr, level, bh);
7070 + if (!B_IS_IN_TREE(bh))
7071 + reiserfs_panic(sb, "jmacd-3", "buffer is not "
7072 + "in tree %s[%d] (%b)",
7073 + descr, level, bh);
7075 + if (bh->b_bdev != sb->s_bdev)
7076 + reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
7077 + "device %s[%d] (%b)",
7078 + descr, level, bh);
7080 + if (bh->b_size != sb->s_blocksize)
7081 + reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
7082 + "blocksize %s[%d] (%b)",
7083 + descr, level, bh);
7085 + if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
7086 + reiserfs_panic(sb, "jmacd-6", "buffer block "
7087 + "number too high %s[%d] (%b)",
7088 + descr, level, bh);
7091 +#else
7092 +static void tb_buffer_sanity_check(struct super_block *sb,
7093 + struct buffer_head *bh,
7094 + const char *descr, int level)
7097 +#endif
7099 +static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
7101 + return reiserfs_prepare_for_journal(s, bh, 0);
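+/*
+ * Note on the helper above: reiserfs_prepare_for_journal() is called
+ * with wait == 0, so it returns non-zero only when the buffer could
+ * be prepared without blocking; a zero result is what makes the
+ * caller below treat the buffer as locked and retry.
+ */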
7104 +static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
7106 + struct buffer_head *locked;
7107 +#ifdef CONFIG_REISERFS_CHECK
7108 + int repeat_counter = 0;
7109 +#endif
7110 + int i;
7112 + do {
7114 + locked = NULL;
7116 + for (i = tb->tb_path->path_length;
7117 + !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
7118 + if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
7119 + /*
7120 + * if I understand correctly, we can only
7121 + * be sure the last buffer in the path is
7122 + * in the tree --clm
7123 + */
7124 +#ifdef CONFIG_REISERFS_CHECK
7125 + if (PATH_PLAST_BUFFER(tb->tb_path) ==
7126 + PATH_OFFSET_PBUFFER(tb->tb_path, i))
7127 + tb_buffer_sanity_check(tb->tb_sb,
7128 + PATH_OFFSET_PBUFFER
7129 + (tb->tb_path,
7130 + i), "S",
7131 + tb->tb_path->
7132 + path_length - i);
7133 +#endif
7134 + if (!clear_all_dirty_bits(tb->tb_sb,
7135 + PATH_OFFSET_PBUFFER
7136 + (tb->tb_path,
7137 + i))) {
7138 + locked =
7139 + PATH_OFFSET_PBUFFER(tb->tb_path,
7140 + i);
7145 + for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
7146 + i++) {
7148 + if (tb->lnum[i]) {
7150 + if (tb->L[i]) {
7151 + tb_buffer_sanity_check(tb->tb_sb,
7152 + tb->L[i],
7153 + "L", i);
7154 + if (!clear_all_dirty_bits
7155 + (tb->tb_sb, tb->L[i]))
7156 + locked = tb->L[i];
7159 + if (!locked && tb->FL[i]) {
7160 + tb_buffer_sanity_check(tb->tb_sb,
7161 + tb->FL[i],
7162 + "FL", i);
7163 + if (!clear_all_dirty_bits
7164 + (tb->tb_sb, tb->FL[i]))
7165 + locked = tb->FL[i];
7168 + if (!locked && tb->CFL[i]) {
7169 + tb_buffer_sanity_check(tb->tb_sb,
7170 + tb->CFL[i],
7171 + "CFL", i);
7172 + if (!clear_all_dirty_bits
7173 + (tb->tb_sb, tb->CFL[i]))
7174 + locked = tb->CFL[i];
7179 + if (!locked && (tb->rnum[i])) {
7181 + if (tb->R[i]) {
7182 + tb_buffer_sanity_check(tb->tb_sb,
7183 + tb->R[i],
7184 + "R", i);
7185 + if (!clear_all_dirty_bits
7186 + (tb->tb_sb, tb->R[i]))
7187 + locked = tb->R[i];
7190 + if (!locked && tb->FR[i]) {
7191 + tb_buffer_sanity_check(tb->tb_sb,
7192 + tb->FR[i],
7193 + "FR", i);
7194 + if (!clear_all_dirty_bits
7195 + (tb->tb_sb, tb->FR[i]))
7196 + locked = tb->FR[i];
7199 + if (!locked && tb->CFR[i]) {
7200 + tb_buffer_sanity_check(tb->tb_sb,
7201 + tb->CFR[i],
7202 + "CFR", i);
7203 + if (!clear_all_dirty_bits
7204 + (tb->tb_sb, tb->CFR[i]))
7205 + locked = tb->CFR[i];
7210 + /*
7211 + * as far as I can tell, this is not required. The FEB list
7212 + * seems to be full of newly allocated nodes, which will
7213 + * never be locked, dirty, or anything else.
7214 +	 * To be safe, I'm putting the checks and waits in.
7215 + * For the moment, they are needed to keep the code in
7216 + * journal.c from complaining about the buffer.
7217 + * That code is inside CONFIG_REISERFS_CHECK as well. --clm
7218 + */
7219 + for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
7220 + if (tb->FEB[i]) {
7221 + if (!clear_all_dirty_bits
7222 + (tb->tb_sb, tb->FEB[i]))
7223 + locked = tb->FEB[i];
7227 + if (locked) {
7228 + int depth;
7229 +#ifdef CONFIG_REISERFS_CHECK
7230 + repeat_counter++;
7231 + if ((repeat_counter % 10000) == 0) {
7232 + reiserfs_warning(tb->tb_sb, "reiserfs-8200",
7233 + "too many iterations waiting "
7234 + "for buffer to unlock "
7235 + "(%b)", locked);
7237 + /* Don't loop forever. Try to recover from possible error. */
7239 + return (FILESYSTEM_CHANGED_TB(tb)) ?
7240 + REPEAT_SEARCH : CARRY_ON;
7242 +#endif
7243 + depth = reiserfs_write_unlock_nested(tb->tb_sb);
7244 + __wait_on_buffer(locked);
7245 + reiserfs_write_lock_nested(tb->tb_sb, depth);
7246 + if (FILESYSTEM_CHANGED_TB(tb))
7247 + return REPEAT_SEARCH;
7250 + } while (locked);
7252 + return CARRY_ON;
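The loop above repeats a discipline that recurs throughout this patch: never
sleep on a buffer while holding the write lock; drop the lock, wait, re-take
it, and then use the filesystem generation counter to learn whether the tree
moved underneath. A compact user-space sketch of that discipline, assuming
pthread primitives as the lock stand-in (nothing here is reiserfs code):

	#include <pthread.h>

	static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
	static unsigned long fs_generation;	/* bumped on every tree change */

	/* stand-in for __wait_on_buffer(): may sleep for a long time */
	static void wait_for_buffer(void *locked)
	{
		(void)locked;
	}

	/* returns 1 when the caller must repeat its search (the analogue of
	 * REPEAT_SEARCH), 0 when it may carry on with its cached state */
	static int wait_and_revalidate(void *locked, unsigned long my_generation)
	{
		pthread_mutex_unlock(&tree_lock); /* cf. reiserfs_write_unlock_nested */
		wait_for_buffer(locked);
		pthread_mutex_lock(&tree_lock);   /* cf. reiserfs_write_lock_nested */

		return fs_generation != my_generation; /* cf. FILESYSTEM_CHANGED_TB */
	}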
7256 + * Prepare for balancing, that is
7257 + * get all necessary parents, and neighbors;
7258 + * analyze what and where should be moved;
7259 + * get sufficient number of new nodes;
7260 + * Balancing will start only after all the needed resources have been collected.
7262 + * When ported to SMP kernels, only at the last moment after all needed nodes
7263 + * are collected in cache, will the resources be locked using the usual
7264 + * textbook ordered lock acquisition algorithms. Note that ensuring that
7265 + * this code neither write locks what it does not need to write lock nor locks
7266 + * out of order will be a pain in the butt that could have been avoided.
7267 + * Grumble grumble. -Hans
7269 + * fix is meant in the sense of render unchanging
7271 + * Latency might be improved by first gathering a list of what buffers
7272 + * are needed and then getting as many of them in parallel as possible? -Hans
7274 + * Parameters:
7275 + * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
7276 + * tb tree_balance structure;
7277 + * inum item number in S[h];
7278 + * pos_in_item - comment this if you can
7279 + * ins_ih item head of item being inserted
7280 + * data inserted item or data to be pasted
7281 + * Returns: 1 - schedule occurred while the function worked;
7282 + * 0 - schedule didn't occur while the function worked;
7283 + * -1 - if no_disk_space
7284 + */
7286 +int fix_nodes(int op_mode, struct tree_balance *tb,
7287 + struct item_head *ins_ih, const void *data)
7289 + int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
7290 + int pos_in_item;
7292 + /*
7293 + * we set wait_tb_buffers_run when we have to restore any dirty
7294 +	 * bits cleared during wait_tb_buffers_until_unlocked
7295 + */
7296 + int wait_tb_buffers_run = 0;
7297 + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
7299 + ++REISERFS_SB(tb->tb_sb)->s_fix_nodes;
7301 + pos_in_item = tb->tb_path->pos_in_item;
7303 + tb->fs_gen = get_generation(tb->tb_sb);
7305 + /*
7306 + * we prepare and log the super here so it will already be in the
7307 + * transaction when do_balance needs to change it.
7308 + * This way do_balance won't have to schedule when trying to prepare
7309 + * the super for logging
7310 + */
7311 + reiserfs_prepare_for_journal(tb->tb_sb,
7312 + SB_BUFFER_WITH_SB(tb->tb_sb), 1);
7313 + journal_mark_dirty(tb->transaction_handle,
7314 + SB_BUFFER_WITH_SB(tb->tb_sb));
7315 + if (FILESYSTEM_CHANGED_TB(tb))
7316 + return REPEAT_SEARCH;
7318 +	/* this is possible, e.g. in indirect_to_direct conversion */
7319 + if (buffer_locked(tbS0)) {
7320 + int depth = reiserfs_write_unlock_nested(tb->tb_sb);
7321 + __wait_on_buffer(tbS0);
7322 + reiserfs_write_lock_nested(tb->tb_sb, depth);
7323 + if (FILESYSTEM_CHANGED_TB(tb))
7324 + return REPEAT_SEARCH;
7326 +#ifdef CONFIG_REISERFS_CHECK
7327 + if (REISERFS_SB(tb->tb_sb)->cur_tb) {
7328 + print_cur_tb("fix_nodes");
7329 + reiserfs_panic(tb->tb_sb, "PAP-8305",
7330 + "there is pending do_balance");
7333 + if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
7334 + reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
7335 + "not uptodate at the beginning of fix_nodes "
7336 + "or not in tree (mode %c)",
7337 + tbS0, tbS0, op_mode);
7339 + /* Check parameters. */
7340 + switch (op_mode) {
7341 + case M_INSERT:
7342 + if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
7343 + reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
7344 + "item number %d (in S0 - %d) in case "
7345 + "of insert", item_num,
7346 + B_NR_ITEMS(tbS0));
7347 + break;
7348 + case M_PASTE:
7349 + case M_DELETE:
7350 + case M_CUT:
7351 + if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
7352 + print_block(tbS0, 0, -1, -1);
7353 + reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
7354 + "item number(%d); mode = %c "
7355 + "insert_size = %d",
7356 + item_num, op_mode,
7357 + tb->insert_size[0]);
7359 + break;
7360 + default:
7361 + reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
7362 + "of operation");
7364 +#endif
7366 + if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
7367 + /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */
7368 + return REPEAT_SEARCH;
7370 + /* Starting from the leaf level; for all levels h of the tree. */
7371 + for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
7372 + ret = get_direct_parent(tb, h);
7373 + if (ret != CARRY_ON)
7374 + goto repeat;
7376 + ret = check_balance(op_mode, tb, h, item_num,
7377 + pos_in_item, ins_ih, data);
7378 + if (ret != CARRY_ON) {
7379 + if (ret == NO_BALANCING_NEEDED) {
7380 + /* No balancing for higher levels needed. */
7381 + ret = get_neighbors(tb, h);
7382 + if (ret != CARRY_ON)
7383 + goto repeat;
7384 + if (h != MAX_HEIGHT - 1)
7385 + tb->insert_size[h + 1] = 0;
7386 + /*
7387 + * ok, analysis and resource gathering
7388 + * are complete
7389 + */
7390 + break;
7392 + goto repeat;
7395 + ret = get_neighbors(tb, h);
7396 + if (ret != CARRY_ON)
7397 + goto repeat;
7399 + /*
7400 + * No disk space, or schedule occurred and analysis may be
7401 + * invalid and needs to be redone.
7402 + */
7403 + ret = get_empty_nodes(tb, h);
7404 + if (ret != CARRY_ON)
7405 + goto repeat;
7407 + /*
7408 + * We have a positive insert size but no nodes exist on this
7409 + * level, this means that we are creating a new root.
7410 + */
7411 + if (!PATH_H_PBUFFER(tb->tb_path, h)) {
7413 + RFALSE(tb->blknum[h] != 1,
7414 + "PAP-8350: creating new empty root");
7416 + if (h < MAX_HEIGHT - 1)
7417 + tb->insert_size[h + 1] = 0;
7418 + } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
7419 + /*
7420 + * The tree needs to be grown, so this node S[h]
7421 + * which is the root node is split into two nodes,
7422 + * and a new node (S[h+1]) will be created to
7423 + * become the root node.
7424 + */
7425 + if (tb->blknum[h] > 1) {
7427 + RFALSE(h == MAX_HEIGHT - 1,
7428 + "PAP-8355: attempt to create too high of a tree");
7430 + tb->insert_size[h + 1] =
7431 + (DC_SIZE +
7432 + KEY_SIZE) * (tb->blknum[h] - 1) +
7433 + DC_SIZE;
7434 + } else if (h < MAX_HEIGHT - 1)
7435 + tb->insert_size[h + 1] = 0;
7436 + } else
7437 + tb->insert_size[h + 1] =
7438 + (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
7441 + ret = wait_tb_buffers_until_unlocked(tb);
7442 + if (ret == CARRY_ON) {
7443 + if (FILESYSTEM_CHANGED_TB(tb)) {
7444 + wait_tb_buffers_run = 1;
7445 + ret = REPEAT_SEARCH;
7446 + goto repeat;
7447 + } else {
7448 + return CARRY_ON;
7450 + } else {
7451 + wait_tb_buffers_run = 1;
7452 + goto repeat;
7455 +repeat:
7456 + /*
7457 +	 * fix_nodes was unable to finish its work because the filesystem
7458 +	 * changed under us, we ran out of free disk space, or an i/o
7459 +	 * failure occurred. In the first case the search will be
7460 +	 * repeated. For now, free all resources acquired so far except
7461 +	 * for the newly allocated nodes
7462 + */
7464 + int i;
7466 + /* Release path buffers. */
7467 + if (wait_tb_buffers_run) {
7468 + pathrelse_and_restore(tb->tb_sb, tb->tb_path);
7469 + } else {
7470 + pathrelse(tb->tb_path);
7472 + /* brelse all resources collected for balancing */
7473 + for (i = 0; i < MAX_HEIGHT; i++) {
7474 + if (wait_tb_buffers_run) {
7475 + reiserfs_restore_prepared_buffer(tb->tb_sb,
7476 + tb->L[i]);
7477 + reiserfs_restore_prepared_buffer(tb->tb_sb,
7478 + tb->R[i]);
7479 + reiserfs_restore_prepared_buffer(tb->tb_sb,
7480 + tb->FL[i]);
7481 + reiserfs_restore_prepared_buffer(tb->tb_sb,
7482 + tb->FR[i]);
7483 + reiserfs_restore_prepared_buffer(tb->tb_sb,
7484 + tb->
7485 + CFL[i]);
7486 + reiserfs_restore_prepared_buffer(tb->tb_sb,
7487 + tb->
7488 + CFR[i]);
7491 + brelse(tb->L[i]);
7492 + brelse(tb->R[i]);
7493 + brelse(tb->FL[i]);
7494 + brelse(tb->FR[i]);
7495 + brelse(tb->CFL[i]);
7496 + brelse(tb->CFR[i]);
7498 + tb->L[i] = NULL;
7499 + tb->R[i] = NULL;
7500 + tb->FL[i] = NULL;
7501 + tb->FR[i] = NULL;
7502 + tb->CFL[i] = NULL;
7503 + tb->CFR[i] = NULL;
7506 + if (wait_tb_buffers_run) {
7507 + for (i = 0; i < MAX_FEB_SIZE; i++) {
7508 + if (tb->FEB[i])
7509 + reiserfs_restore_prepared_buffer
7510 + (tb->tb_sb, tb->FEB[i]);
7513 + return ret;
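The return codes above imply a calling convention: retry the whole search
while fix_nodes reports REPEAT_SEARCH, balance on CARRY_ON, and otherwise
release everything. The skeleton below is a distilled sketch of that state
machine with invented stub names, not a verbatim reiserfs call site:

	#include <stdio.h>

	enum { CARRY_ON_, REPEAT_SEARCH_, NO_DISK_SPACE_ };

	static int rebuild_path(void)	{ return 0; }	/* re-run the tree search */
	static int fix_nodes_sketch(void)
	{
		static int calls;
		return calls++ ? CARRY_ON_ : REPEAT_SEARCH_; /* fail once, then ok */
	}
	static void do_balance_sketch(void)	{ puts("balance"); }
	static void unfix_nodes_sketch(void)	{ puts("release resources"); }

	int main(void)
	{
		int ret;

		do {
			rebuild_path();		/* path may be stale after a repeat */
			ret = fix_nodes_sketch(); /* gather parents, neighbors, nodes */
		} while (ret == REPEAT_SEARCH_);

		if (ret == CARRY_ON_)
			do_balance_sketch();	/* consumes the gathered resources */
		else
			unfix_nodes_sketch();	/* e.g. NO_DISK_SPACE_: clean up */
		return 0;
	}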
7518 +void unfix_nodes(struct tree_balance *tb)
7520 + int i;
7522 + /* Release path buffers. */
7523 + pathrelse_and_restore(tb->tb_sb, tb->tb_path);
7525 + /* brelse all resources collected for balancing */
7526 + for (i = 0; i < MAX_HEIGHT; i++) {
7527 + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
7528 + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
7529 + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
7530 + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
7531 + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
7532 + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
7534 + brelse(tb->L[i]);
7535 + brelse(tb->R[i]);
7536 + brelse(tb->FL[i]);
7537 + brelse(tb->FR[i]);
7538 + brelse(tb->CFL[i]);
7539 + brelse(tb->CFR[i]);
7542 + /* deal with list of allocated (used and unused) nodes */
7543 + for (i = 0; i < MAX_FEB_SIZE; i++) {
7544 + if (tb->FEB[i]) {
7545 + b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
7546 + /*
7547 +			 * de-allocate the block, which was not used by
7548 +			 * balancing, and bforget the buffer for it
7549 + */
7550 + brelse(tb->FEB[i]);
7551 + reiserfs_free_block(tb->transaction_handle, NULL,
7552 + blocknr, 0);
7554 + if (tb->used[i]) {
7555 +			/* release buffers used as new nodes, including a new root */
7556 + brelse(tb->used[i]);
7560 + kfree(tb->vn_buf);
7563 diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
7564 new file mode 100644
7565 index 000000000000..7a26c4fe6c46
7566 --- /dev/null
7567 +++ b/fs/reiserfs/hashes.c
7568 @@ -0,0 +1,177 @@
7571 + * Keyed 32-bit hash function using TEA in a Davies-Meyer construction
7572 + * H0 = Key
7573 + * Hi = E Mi(Hi-1) + Hi-1
7575 + * (see Applied Cryptography, 2nd edition, p448).
7577 + * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
7579 + * Jeremy has agreed to the contents of reiserfs/README. -Hans
7580 + * Yura's function is added (04/07/2000)
7581 + */
7583 +#include <linux/kernel.h>
7584 +#include "reiserfs.h"
7585 +#include <asm/types.h>
7587 +#define DELTA 0x9E3779B9
7588 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
7589 +#define PARTROUNDS 6 /* 6 gets complete mixing */
7591 +/* a, b, c, d - data; h0, h1 - accumulated hash */
7592 +#define TEACORE(rounds) \
7593 + do { \
7594 + u32 sum = 0; \
7595 + int n = rounds; \
7596 + u32 b0, b1; \
7598 + b0 = h0; \
7599 + b1 = h1; \
7601 + do \
7602 + { \
7603 + sum += DELTA; \
7604 + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
7605 + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
7606 + } while(--n); \
7608 + h0 += b0; \
7609 + h1 += b1; \
7610 + } while(0)
7612 +u32 keyed_hash(const signed char *msg, int len)
7614 + u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 };
7616 + u32 h0 = k[0], h1 = k[1];
7617 + u32 a, b, c, d;
7618 + u32 pad;
7619 + int i;
7621 + /* assert(len >= 0 && len < 256); */
7623 + pad = (u32) len | ((u32) len << 8);
7624 + pad |= pad << 16;
7626 + while (len >= 16) {
7627 + a = (u32) msg[0] |
7628 + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
7629 + b = (u32) msg[4] |
7630 + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
7631 + c = (u32) msg[8] |
7632 + (u32) msg[9] << 8 |
7633 + (u32) msg[10] << 16 | (u32) msg[11] << 24;
7634 + d = (u32) msg[12] |
7635 + (u32) msg[13] << 8 |
7636 + (u32) msg[14] << 16 | (u32) msg[15] << 24;
7638 + TEACORE(PARTROUNDS);
7640 + len -= 16;
7641 + msg += 16;
7644 + if (len >= 12) {
7645 + a = (u32) msg[0] |
7646 + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
7647 + b = (u32) msg[4] |
7648 + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
7649 + c = (u32) msg[8] |
7650 + (u32) msg[9] << 8 |
7651 + (u32) msg[10] << 16 | (u32) msg[11] << 24;
7653 + d = pad;
7654 + for (i = 12; i < len; i++) {
7655 + d <<= 8;
7656 + d |= msg[i];
7658 + } else if (len >= 8) {
7659 + a = (u32) msg[0] |
7660 + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
7661 + b = (u32) msg[4] |
7662 + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
7664 + c = d = pad;
7665 + for (i = 8; i < len; i++) {
7666 + c <<= 8;
7667 + c |= msg[i];
7669 + } else if (len >= 4) {
7670 + a = (u32) msg[0] |
7671 + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
7673 + b = c = d = pad;
7674 + for (i = 4; i < len; i++) {
7675 + b <<= 8;
7676 + b |= msg[i];
7678 + } else {
7679 + a = b = c = d = pad;
7680 + for (i = 0; i < len; i++) {
7681 + a <<= 8;
7682 + a |= msg[i];
7686 + TEACORE(FULLROUNDS);
7688 +/* return 0;*/
7689 + return h0 ^ h1;
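To make the byte-packing above concrete: each 32-bit word is assembled
little-endian from the message, and a short tail is padded with a word derived
from the length. A small user-space fragment that reproduces the first word
and the pad word for a 5-byte name (nothing here is reiserfs code):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	static uint32_t le32_from(const signed char *p)
	{
		/* same little-endian packing as keyed_hash(), including the
		 * sign-extension quirk of 'signed char' for bytes >= 0x80 */
		return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
		       (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
	}

	int main(void)
	{
		const signed char name[] = "linux";
		uint32_t len = (uint32_t)strlen((const char *)name);
		uint32_t pad = len | (len << 8);

		pad |= pad << 16;	/* len = 5 gives pad = 0x05050505 */
		/* "linu" becomes word a; the 5th byte is shifted into b,
		 * which starts out as the pad word */
		printf("a = %08x, pad = %08x\n", le32_from(name), pad);
		return 0;
	}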
7693 + * What follows in this file is copyright 2000 by Hans Reiser, and the
7694 + * licensing of what follows is governed by reiserfs/README
7695 + */
7696 +u32 yura_hash(const signed char *msg, int len)
7698 + int j, pow;
7699 + u32 a, c;
7700 + int i;
7702 + for (pow = 1, i = 1; i < len; i++)
7703 + pow = pow * 10;
7705 + if (len == 1)
7706 + a = msg[0] - 48;
7707 + else
7708 + a = (msg[0] - 48) * pow;
7710 + for (i = 1; i < len; i++) {
7711 + c = msg[i] - 48;
7712 + for (pow = 1, j = i; j < len - 1; j++)
7713 + pow = pow * 10;
7714 + a = a + c * pow;
7717 + for (; i < 40; i++) {
7718 + c = '0' - 48;
7719 + for (pow = 1, j = i; j < len - 1; j++)
7720 + pow = pow * 10;
7721 + a = a + c * pow;
7724 + for (; i < 256; i++) {
7725 + c = i;
7726 + for (pow = 1, j = i; j < len - 1; j++)
7727 + pow = pow * 10;
7728 + a = a + c * pow;
7731 + a = a << 7;
7732 + return a;
7735 +u32 r5_hash(const signed char *msg, int len)
7737 + u32 a = 0;
7738 + while (*msg) {
7739 + a += *msg << 4;
7740 + a += *msg >> 4;
7741 + a *= 11;
7742 + msg++;
7744 + return a;
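One detail worth noting: r5_hash() receives len but never uses it; it walks
the name until the first NUL byte. For illustration only, a length-bounded
variant is sketched below. It is not the on-disk r5 hash and must not replace
it, since existing directories depend on the exact function above:

	#include <stdint.h>

	static uint32_t r5_hash_bounded(const signed char *msg, int len)
	{
		uint32_t a = 0;
		int i;

		for (i = 0; i < len; i++) {
			a += msg[i] << 4;	/* same mixing as r5_hash() */
			a += msg[i] >> 4;
			a *= 11;
		}
		return a;
	}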
7746 diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
7747 new file mode 100644
7748 index 000000000000..5db6f45b3fed
7749 --- /dev/null
7750 +++ b/fs/reiserfs/ibalance.c
7751 @@ -0,0 +1,1161 @@
7753 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
7754 + */
7756 +#include <linux/uaccess.h>
7757 +#include <linux/string.h>
7758 +#include <linux/time.h>
7759 +#include "reiserfs.h"
7760 +#include <linux/buffer_head.h>
7762 +/* this is the one and only function that is used outside (in do_balance.c) */
7763 +int balance_internal(struct tree_balance *,
7764 + int, int, struct item_head *, struct buffer_head **);
7767 + * modes of internal_shift_left, internal_shift_right and
7768 + * internal_insert_childs
7769 + */
7770 +#define INTERNAL_SHIFT_FROM_S_TO_L 0
7771 +#define INTERNAL_SHIFT_FROM_R_TO_S 1
7772 +#define INTERNAL_SHIFT_FROM_L_TO_S 2
7773 +#define INTERNAL_SHIFT_FROM_S_TO_R 3
7774 +#define INTERNAL_INSERT_TO_S 4
7775 +#define INTERNAL_INSERT_TO_L 5
7776 +#define INTERNAL_INSERT_TO_R 6
7778 +static void internal_define_dest_src_infos(int shift_mode,
7779 + struct tree_balance *tb,
7780 + int h,
7781 + struct buffer_info *dest_bi,
7782 + struct buffer_info *src_bi,
7783 + int *d_key, struct buffer_head **cf)
7785 + memset(dest_bi, 0, sizeof(struct buffer_info));
7786 + memset(src_bi, 0, sizeof(struct buffer_info));
7787 + /* define dest, src, dest parent, dest position */
7788 + switch (shift_mode) {
7790 + /* used in internal_shift_left */
7791 + case INTERNAL_SHIFT_FROM_S_TO_L:
7792 + src_bi->tb = tb;
7793 + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
7794 + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
7795 + src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
7796 + dest_bi->tb = tb;
7797 + dest_bi->bi_bh = tb->L[h];
7798 + dest_bi->bi_parent = tb->FL[h];
7799 + dest_bi->bi_position = get_left_neighbor_position(tb, h);
7800 + *d_key = tb->lkey[h];
7801 + *cf = tb->CFL[h];
7802 + break;
7803 + case INTERNAL_SHIFT_FROM_L_TO_S:
7804 + src_bi->tb = tb;
7805 + src_bi->bi_bh = tb->L[h];
7806 + src_bi->bi_parent = tb->FL[h];
7807 + src_bi->bi_position = get_left_neighbor_position(tb, h);
7808 + dest_bi->tb = tb;
7809 + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
7810 + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
7811 +		/* dest position is the analog of dest->b_item_order */
7812 + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
7813 + *d_key = tb->lkey[h];
7814 + *cf = tb->CFL[h];
7815 + break;
7817 + /* used in internal_shift_left */
7818 + case INTERNAL_SHIFT_FROM_R_TO_S:
7819 + src_bi->tb = tb;
7820 + src_bi->bi_bh = tb->R[h];
7821 + src_bi->bi_parent = tb->FR[h];
7822 + src_bi->bi_position = get_right_neighbor_position(tb, h);
7823 + dest_bi->tb = tb;
7824 + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
7825 + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
7826 + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
7827 + *d_key = tb->rkey[h];
7828 + *cf = tb->CFR[h];
7829 + break;
7831 + case INTERNAL_SHIFT_FROM_S_TO_R:
7832 + src_bi->tb = tb;
7833 + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
7834 + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
7835 + src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
7836 + dest_bi->tb = tb;
7837 + dest_bi->bi_bh = tb->R[h];
7838 + dest_bi->bi_parent = tb->FR[h];
7839 + dest_bi->bi_position = get_right_neighbor_position(tb, h);
7840 + *d_key = tb->rkey[h];
7841 + *cf = tb->CFR[h];
7842 + break;
7844 + case INTERNAL_INSERT_TO_L:
7845 + dest_bi->tb = tb;
7846 + dest_bi->bi_bh = tb->L[h];
7847 + dest_bi->bi_parent = tb->FL[h];
7848 + dest_bi->bi_position = get_left_neighbor_position(tb, h);
7849 + break;
7851 + case INTERNAL_INSERT_TO_S:
7852 + dest_bi->tb = tb;
7853 + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
7854 + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
7855 + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
7856 + break;
7858 + case INTERNAL_INSERT_TO_R:
7859 + dest_bi->tb = tb;
7860 + dest_bi->bi_bh = tb->R[h];
7861 + dest_bi->bi_parent = tb->FR[h];
7862 + dest_bi->bi_position = get_right_neighbor_position(tb, h);
7863 + break;
7865 + default:
7866 + reiserfs_panic(tb->tb_sb, "ibalance-1",
7867 + "shift type is unknown (%d)",
7868 + shift_mode);
7873 + * Insert count node pointers into buffer cur before position to + 1.
7874 + * Insert count items into buffer cur before position to.
7875 + * Items and node pointers are specified by inserted and bh respectively.
7876 + */
7877 +static void internal_insert_childs(struct buffer_info *cur_bi,
7878 + int to, int count,
7879 + struct item_head *inserted,
7880 + struct buffer_head **bh)
7882 + struct buffer_head *cur = cur_bi->bi_bh;
7883 + struct block_head *blkh;
7884 + int nr;
7885 + struct reiserfs_key *ih;
7886 + struct disk_child new_dc[2];
7887 + struct disk_child *dc;
7888 + int i;
7890 + if (count <= 0)
7891 + return;
7893 + blkh = B_BLK_HEAD(cur);
7894 + nr = blkh_nr_item(blkh);
7896 + RFALSE(count > 2, "too many children (%d) are to be inserted", count);
7897 + RFALSE(B_FREE_SPACE(cur) < count * (KEY_SIZE + DC_SIZE),
7898 + "no enough free space (%d), needed %d bytes",
7899 + B_FREE_SPACE(cur), count * (KEY_SIZE + DC_SIZE));
7901 + /* prepare space for count disk_child */
7902 + dc = B_N_CHILD(cur, to + 1);
7904 + memmove(dc + count, dc, (nr + 1 - (to + 1)) * DC_SIZE);
7906 +	/* copy the to-be-inserted disk children */
7907 + for (i = 0; i < count; i++) {
7908 + put_dc_size(&new_dc[i],
7909 + MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i]));
7910 + put_dc_block_number(&new_dc[i], bh[i]->b_blocknr);
7912 + memcpy(dc, new_dc, DC_SIZE * count);
7914 + /* prepare space for count items */
7915 + ih = internal_key(cur, ((to == -1) ? 0 : to));
7917 + memmove(ih + count, ih,
7918 + (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE);
7920 + /* copy item headers (keys) */
7921 + memcpy(ih, inserted, KEY_SIZE);
7922 + if (count > 1)
7923 + memcpy(ih + 1, inserted + 1, KEY_SIZE);
7925 + /* sizes, item number */
7926 + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + count);
7927 + set_blkh_free_space(blkh,
7928 + blkh_free_space(blkh) - count * (DC_SIZE +
7929 + KEY_SIZE));
7931 + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
7933 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
7934 + check_internal(cur);
7935 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
7937 + if (cur_bi->bi_parent) {
7938 + struct disk_child *t_dc =
7939 + B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
7940 + put_dc_size(t_dc,
7941 + dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE)));
7942 + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
7943 + 0);
7945 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
7946 + check_internal(cur_bi->bi_parent);
7947 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
7953 + * Delete del_num items and node pointers from buffer cur starting from
7954 + * the first_i'th item and first_p'th pointers respectively.
7955 + */
7956 +static void internal_delete_pointers_items(struct buffer_info *cur_bi,
7957 + int first_p,
7958 + int first_i, int del_num)
7960 + struct buffer_head *cur = cur_bi->bi_bh;
7961 + int nr;
7962 + struct block_head *blkh;
7963 + struct reiserfs_key *key;
7964 + struct disk_child *dc;
7966 + RFALSE(cur == NULL, "buffer is 0");
7967 + RFALSE(del_num < 0,
7968 + "negative number of items (%d) can not be deleted", del_num);
7969 + RFALSE(first_p < 0 || first_p + del_num > B_NR_ITEMS(cur) + 1
7970 + || first_i < 0,
7971 + "first pointer order (%d) < 0 or "
7972 + "no so many pointers (%d), only (%d) or "
7973 + "first key order %d < 0", first_p, first_p + del_num,
7974 + B_NR_ITEMS(cur) + 1, first_i);
7975 + if (del_num == 0)
7976 + return;
7978 + blkh = B_BLK_HEAD(cur);
7979 + nr = blkh_nr_item(blkh);
7981 + if (first_p == 0 && del_num == nr + 1) {
7982 + RFALSE(first_i != 0,
7983 + "1st deleted key must have order 0, not %d", first_i);
7984 + make_empty_node(cur_bi);
7985 + return;
7988 + RFALSE(first_i + del_num > B_NR_ITEMS(cur),
7989 + "first_i = %d del_num = %d "
7990 + "no so many keys (%d) in the node (%b)(%z)",
7991 + first_i, del_num, first_i + del_num, cur, cur);
7993 + /* deleting */
7994 + dc = B_N_CHILD(cur, first_p);
7996 + memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE);
7997 + key = internal_key(cur, first_i);
7998 + memmove(key, key + del_num,
7999 + (nr - first_i - del_num) * KEY_SIZE + (nr + 1 -
8000 + del_num) * DC_SIZE);
8002 + /* sizes, item number */
8003 + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
8004 + set_blkh_free_space(blkh,
8005 + blkh_free_space(blkh) +
8006 + (del_num * (KEY_SIZE + DC_SIZE)));
8008 + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
8009 + /*&&&&&&&&&&&&&&&&&&&&&&& */
8010 + check_internal(cur);
8011 + /*&&&&&&&&&&&&&&&&&&&&&&& */
8013 + if (cur_bi->bi_parent) {
8014 + struct disk_child *t_dc;
8015 + t_dc = B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
8016 + put_dc_size(t_dc,
8017 + dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE)));
8019 + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
8020 + 0);
8021 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8022 + check_internal(cur_bi->bi_parent);
8023 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8027 +/* delete n node pointers and items starting from given position */
8028 +static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n)
8030 + int i_from;
8032 + i_from = (from == 0) ? from : from - 1;
8034 + /*
8035 + * delete n pointers starting from `from' position in CUR;
8036 + * delete n keys starting from 'i_from' position in CUR;
8037 + */
8038 + internal_delete_pointers_items(cur_bi, from, i_from, n);
8042 + * copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer
8043 + * dest
8044 + * last_first == FIRST_TO_LAST means that we copy first items
8045 + * from src to tail of dest
8046 + * last_first == LAST_TO_FIRST means that we copy last items
8047 + * from src to head of dest
8048 + */
8049 +static void internal_copy_pointers_items(struct buffer_info *dest_bi,
8050 + struct buffer_head *src,
8051 + int last_first, int cpy_num)
8053 + /*
8054 + * ATTENTION! Number of node pointers in DEST is equal to number
8055 +	 * of items in DEST, as the delimiting key has already been
8056 +	 * inserted into buffer dest.
8057 + */
8058 + struct buffer_head *dest = dest_bi->bi_bh;
8059 + int nr_dest, nr_src;
8060 + int dest_order, src_order;
8061 + struct block_head *blkh;
8062 + struct reiserfs_key *key;
8063 + struct disk_child *dc;
8065 + nr_src = B_NR_ITEMS(src);
8067 + RFALSE(dest == NULL || src == NULL,
8068 + "src (%p) or dest (%p) buffer is 0", src, dest);
8069 + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
8070 + "invalid last_first parameter (%d)", last_first);
8071 + RFALSE(nr_src < cpy_num - 1,
8072 + "no so many items (%d) in src (%d)", cpy_num, nr_src);
8073 + RFALSE(cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num);
8074 + RFALSE(cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest),
8075 + "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)",
8076 + cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest));
8078 + if (cpy_num == 0)
8079 + return;
8081 +	/* copying */
8082 + blkh = B_BLK_HEAD(dest);
8083 + nr_dest = blkh_nr_item(blkh);
8085 +	/* pick copy direction: head of src to tail of dest, or vice versa */
8086 +	if (last_first == LAST_TO_FIRST) {
8087 +		dest_order = 0;
8088 +		src_order = nr_src - cpy_num + 1;
8089 +	} else {
8090 +		dest_order = nr_dest;
8091 +		src_order = 0;
8092 +	}
8093 + /* prepare space for cpy_num pointers */
8094 + dc = B_N_CHILD(dest, dest_order);
8096 + memmove(dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE);
8098 + /* insert pointers */
8099 + memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num);
8101 + /* prepare space for cpy_num - 1 item headers */
8102 + key = internal_key(dest, dest_order);
8103 + memmove(key + cpy_num - 1, key,
8104 + KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest +
8105 + cpy_num));
8107 + /* insert headers */
8108 + memcpy(key, internal_key(src, src_order), KEY_SIZE * (cpy_num - 1));
8110 + /* sizes, item number */
8111 + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1));
8112 + set_blkh_free_space(blkh,
8113 + blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) +
8114 + DC_SIZE * cpy_num));
8116 + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
8118 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8119 + check_internal(dest);
8120 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8122 + if (dest_bi->bi_parent) {
8123 + struct disk_child *t_dc;
8124 + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
8125 + put_dc_size(t_dc,
8126 + dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) +
8127 + DC_SIZE * cpy_num));
8129 + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
8130 + 0);
8131 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8132 + check_internal(dest_bi->bi_parent);
8133 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8139 + * Copy cpy_num node pointers and cpy_num - 1 items from buffer src to
8140 + * buffer dest.
8141 + * Delete cpy_num - del_par items and node pointers from buffer src.
8142 + * last_first == FIRST_TO_LAST means, that we copy/delete first items from src.
8143 + * last_first == LAST_TO_FIRST means, that we copy/delete last items from src.
8144 + */
8145 +static void internal_move_pointers_items(struct buffer_info *dest_bi,
8146 + struct buffer_info *src_bi,
8147 + int last_first, int cpy_num,
8148 + int del_par)
8150 + int first_pointer;
8151 + int first_item;
8153 + internal_copy_pointers_items(dest_bi, src_bi->bi_bh, last_first,
8154 + cpy_num);
8156 + if (last_first == FIRST_TO_LAST) { /* shift_left occurs */
8157 + first_pointer = 0;
8158 + first_item = 0;
8159 + /*
8160 + * delete cpy_num - del_par pointers and keys starting for
8161 + * pointers with first_pointer, for key - with first_item
8162 + */
8163 + internal_delete_pointers_items(src_bi, first_pointer,
8164 + first_item, cpy_num - del_par);
8165 + } else { /* shift_right occurs */
8166 + int i, j;
8168 +		j = B_NR_ITEMS(src_bi->bi_bh);
8169 +		if (cpy_num - del_par == j + 1)
8170 +			i = 0;
8171 +		else
8172 +			i = j - cpy_num + del_par;
8173 + internal_delete_pointers_items(src_bi,
8174 + j + 1 - cpy_num + del_par, i,
8175 + cpy_num - del_par);
8179 +/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */
8180 +static void internal_insert_key(struct buffer_info *dest_bi,
8181 + /* insert key before key with n_dest number */
8182 + int dest_position_before,
8183 + struct buffer_head *src, int src_position)
8185 + struct buffer_head *dest = dest_bi->bi_bh;
8186 + int nr;
8187 + struct block_head *blkh;
8188 + struct reiserfs_key *key;
8190 + RFALSE(dest == NULL || src == NULL,
8191 + "source(%p) or dest(%p) buffer is 0", src, dest);
8192 + RFALSE(dest_position_before < 0 || src_position < 0,
8193 + "source(%d) or dest(%d) key number less than 0",
8194 + src_position, dest_position_before);
8195 + RFALSE(dest_position_before > B_NR_ITEMS(dest) ||
8196 + src_position >= B_NR_ITEMS(src),
8197 + "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))",
8198 + dest_position_before, B_NR_ITEMS(dest),
8199 + src_position, B_NR_ITEMS(src));
8200 + RFALSE(B_FREE_SPACE(dest) < KEY_SIZE,
8201 + "no enough free space (%d) in dest buffer", B_FREE_SPACE(dest));
8203 + blkh = B_BLK_HEAD(dest);
8204 + nr = blkh_nr_item(blkh);
8206 + /* prepare space for inserting key */
8207 + key = internal_key(dest, dest_position_before);
8208 + memmove(key + 1, key,
8209 + (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE);
8211 + /* insert key */
8212 + memcpy(key, internal_key(src, src_position), KEY_SIZE);
8214 + /* Change dirt, free space, item number fields. */
8216 + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
8217 + set_blkh_free_space(blkh, blkh_free_space(blkh) - KEY_SIZE);
8219 + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
8221 + if (dest_bi->bi_parent) {
8222 + struct disk_child *t_dc;
8223 + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
8224 + put_dc_size(t_dc, dc_size(t_dc) + KEY_SIZE);
8226 + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
8227 + 0);
8232 + * Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
8233 + * Copy pointer_amount node pointers and pointer_amount - 1 items from
8234 + * buffer src to buffer dest.
8235 + * Replace d_key'th key in buffer cfl.
8236 + * Delete pointer_amount items and node pointers from buffer src.
8237 + */
8238 +/* this can be invoked both to shift from S to L and from R to S */
8239 +static void internal_shift_left(
8240 + /*
8241 +	 * INTERNAL_SHIFT_FROM_S_TO_L | INTERNAL_SHIFT_FROM_R_TO_S
8242 + */
8243 + int mode,
8244 + struct tree_balance *tb,
8245 + int h, int pointer_amount)
8247 + struct buffer_info dest_bi, src_bi;
8248 + struct buffer_head *cf;
8249 + int d_key_position;
8251 + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
8252 + &d_key_position, &cf);
8254 + /*printk("pointer_amount = %d\n",pointer_amount); */
8256 + if (pointer_amount) {
8257 + /*
8258 + * insert delimiting key from common father of dest and
8259 + * src to node dest into position B_NR_ITEM(dest)
8260 + */
8261 + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
8262 + d_key_position);
8264 + if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) {
8265 + if (src_bi.bi_position /*src->b_item_order */ == 0)
8266 + replace_key(tb, cf, d_key_position,
8267 +				    src_bi.bi_parent /*src->b_parent */, 0);
8269 + } else
8270 + replace_key(tb, cf, d_key_position, src_bi.bi_bh,
8271 + pointer_amount - 1);
8273 + /* last parameter is del_parameter */
8274 + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
8275 + pointer_amount, 0);
8280 + * Insert delimiting key to L[h].
8281 + * Copy n node pointers and n - 1 items from buffer S[h] to L[h].
8282 + * Delete n - 1 items and node pointers from buffer S[h].
8283 + */
8284 +/* it always shifts from S[h] to L[h] */
8285 +static void internal_shift1_left(struct tree_balance *tb,
8286 + int h, int pointer_amount)
8288 + struct buffer_info dest_bi, src_bi;
8289 + struct buffer_head *cf;
8290 + int d_key_position;
8292 + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
8293 + &dest_bi, &src_bi, &d_key_position, &cf);
8295 + /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */
8296 + if (pointer_amount > 0)
8297 + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
8298 + d_key_position);
8300 + /* last parameter is del_parameter */
8301 + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
8302 + pointer_amount, 1);
8306 + * Insert d_key'th (delimiting) key from buffer cfr to head of dest.
8307 + * Copy n node pointers and n - 1 items from buffer src to buffer dest.
8308 + * Replace d_key'th key in buffer cfr.
8309 + * Delete n items and node pointers from buffer src.
8310 + */
8311 +static void internal_shift_right(
8312 + /*
8313 +	 * INTERNAL_SHIFT_FROM_S_TO_R | INTERNAL_SHIFT_FROM_L_TO_S
8314 + */
8315 + int mode,
8316 + struct tree_balance *tb,
8317 + int h, int pointer_amount)
8319 + struct buffer_info dest_bi, src_bi;
8320 + struct buffer_head *cf;
8321 + int d_key_position;
8322 + int nr;
8324 + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
8325 + &d_key_position, &cf);
8327 + nr = B_NR_ITEMS(src_bi.bi_bh);
8329 + if (pointer_amount > 0) {
8330 + /*
8331 + * insert delimiting key from common father of dest
8332 + * and src to dest node into position 0
8333 + */
8334 + internal_insert_key(&dest_bi, 0, cf, d_key_position);
8335 + if (nr == pointer_amount - 1) {
8336 + RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ ||
8337 + dest_bi.bi_bh != tb->R[h],
8338 + "src (%p) must be == tb->S[h](%p) when it disappears",
8339 + src_bi.bi_bh, PATH_H_PBUFFER(tb->tb_path, h));
8340 +			/* when S[h] disappears replace the left delimiting key as well */
8341 + if (tb->CFL[h])
8342 + replace_key(tb, cf, d_key_position, tb->CFL[h],
8343 + tb->lkey[h]);
8344 + } else
8345 + replace_key(tb, cf, d_key_position, src_bi.bi_bh,
8346 + nr - pointer_amount);
8349 + /* last parameter is del_parameter */
8350 + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
8351 + pointer_amount, 0);
8355 + * Insert delimiting key to R[h].
8356 + * Copy n node pointers and n - 1 items from buffer S[h] to R[h].
8357 + * Delete n - 1 items and node pointers from buffer S[h].
8358 + */
8359 +/* it always shifts from S[h] to R[h] */
8360 +static void internal_shift1_right(struct tree_balance *tb,
8361 + int h, int pointer_amount)
8363 + struct buffer_info dest_bi, src_bi;
8364 + struct buffer_head *cf;
8365 + int d_key_position;
8367 + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
8368 + &dest_bi, &src_bi, &d_key_position, &cf);
8370 + /* insert rkey from CFR[h] to right neighbor R[h] */
8371 + if (pointer_amount > 0)
8372 + internal_insert_key(&dest_bi, 0, cf, d_key_position);
8374 + /* last parameter is del_parameter */
8375 + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
8376 + pointer_amount, 1);
8380 + * Delete insert_num node pointers together with their left items
8381 + * and balance current node.
8382 + */
8383 +static void balance_internal_when_delete(struct tree_balance *tb,
8384 + int h, int child_pos)
8386 + int insert_num;
8387 + int n;
8388 + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
8389 + struct buffer_info bi;
8391 + insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE));
8393 + /* delete child-node-pointer(s) together with their left item(s) */
8394 + bi.tb = tb;
8395 + bi.bi_bh = tbSh;
8396 + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
8397 + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
8399 + internal_delete_childs(&bi, child_pos, -insert_num);
8401 + RFALSE(tb->blknum[h] > 1,
8402 + "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]);
8404 + n = B_NR_ITEMS(tbSh);
8406 + if (tb->lnum[h] == 0 && tb->rnum[h] == 0) {
8407 + if (tb->blknum[h] == 0) {
8408 + /* node S[h] (root of the tree) is empty now */
8409 + struct buffer_head *new_root;
8411 + RFALSE(n
8412 + || B_FREE_SPACE(tbSh) !=
8413 + MAX_CHILD_SIZE(tbSh) - DC_SIZE,
8414 + "buffer must have only 0 keys (%d)", n);
8415 + RFALSE(bi.bi_parent, "root has parent (%p)",
8416 + bi.bi_parent);
8418 + /* choose a new root */
8419 + if (!tb->L[h - 1] || !B_NR_ITEMS(tb->L[h - 1]))
8420 + new_root = tb->R[h - 1];
8421 + else
8422 + new_root = tb->L[h - 1];
8423 + /*
8424 + * switch super block's tree root block
8425 + * number to the new value */
8426 + PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr);
8427 + /*REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; */
8428 + PUT_SB_TREE_HEIGHT(tb->tb_sb,
8429 + SB_TREE_HEIGHT(tb->tb_sb) - 1);
8431 + do_balance_mark_sb_dirty(tb,
8432 + REISERFS_SB(tb->tb_sb)->s_sbh,
8433 + 1);
8434 + /*&&&&&&&&&&&&&&&&&&&&&& */
8435 + /* use check_internal if new root is an internal node */
8436 + if (h > 1)
8437 + check_internal(new_root);
8438 + /*&&&&&&&&&&&&&&&&&&&&&& */
8440 + /* do what is needed for buffer thrown from tree */
8441 + reiserfs_invalidate_buffer(tb, tbSh);
8442 + return;
8444 + return;
8447 + /* join S[h] with L[h] */
8448 + if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) {
8450 + RFALSE(tb->rnum[h] != 0,
8451 + "invalid tb->rnum[%d]==%d when joining S[h] with L[h]",
8452 + h, tb->rnum[h]);
8454 + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1);
8455 + reiserfs_invalidate_buffer(tb, tbSh);
8457 + return;
8460 + /* join S[h] with R[h] */
8461 + if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) {
8462 + RFALSE(tb->lnum[h] != 0,
8463 + "invalid tb->lnum[%d]==%d when joining S[h] with R[h]",
8464 + h, tb->lnum[h]);
8466 + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1);
8468 + reiserfs_invalidate_buffer(tb, tbSh);
8469 + return;
8472 + /* borrow from left neighbor L[h] */
8473 + if (tb->lnum[h] < 0) {
8474 + RFALSE(tb->rnum[h] != 0,
8475 + "wrong tb->rnum[%d]==%d when borrow from L[h]", h,
8476 + tb->rnum[h]);
8477 + internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h,
8478 + -tb->lnum[h]);
8479 + return;
8482 + /* borrow from right neighbor R[h] */
8483 + if (tb->rnum[h] < 0) {
8484 + RFALSE(tb->lnum[h] != 0,
8485 + "invalid tb->lnum[%d]==%d when borrow from R[h]",
8486 + h, tb->lnum[h]);
8487 + internal_shift_left(INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]); /*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]); */
8488 + return;
8491 + /* split S[h] into two parts and put them into neighbors */
8492 + if (tb->lnum[h] > 0) {
8493 + RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1,
8494 + "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them",
8495 + h, tb->lnum[h], h, tb->rnum[h], n);
8497 + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); /*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]); */
8498 + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
8499 + tb->rnum[h]);
8501 + reiserfs_invalidate_buffer(tb, tbSh);
8503 + return;
8505 + reiserfs_panic(tb->tb_sb, "ibalance-2",
8506 + "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
8507 + h, tb->lnum[h], h, tb->rnum[h]);
8510 +/* Replace delimiting key of buffers L[h] and S[h] by the given key.*/
8511 +static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key)
8513 + RFALSE(tb->L[h] == NULL || tb->CFL[h] == NULL,
8514 + "L[h](%p) and CFL[h](%p) must exist in replace_lkey",
8515 + tb->L[h], tb->CFL[h]);
8517 + if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0)
8518 + return;
8520 + memcpy(internal_key(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE);
8522 + do_balance_mark_internal_dirty(tb, tb->CFL[h], 0);
8525 +/* Replace delimiting key of buffers S[h] and R[h] by the given key.*/
8526 +static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key)
8528 + RFALSE(tb->R[h] == NULL || tb->CFR[h] == NULL,
8529 + "R[h](%p) and CFR[h](%p) must exist in replace_rkey",
8530 + tb->R[h], tb->CFR[h]);
8531 + RFALSE(B_NR_ITEMS(tb->R[h]) == 0,
8532 + "R[h] can not be empty if it exists (item number=%d)",
8533 + B_NR_ITEMS(tb->R[h]));
8535 + memcpy(internal_key(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE);
8537 + do_balance_mark_internal_dirty(tb, tb->CFR[h], 0);
8542 + * if inserting/pasting {
8543 + * child_pos is the position of the node-pointer in S[h] that
8544 + * pointed to S[h-1] before balancing of the h-1 level;
8545 + * this means that new pointers and items must be inserted AFTER
8546 + * child_pos
8547 + * } else {
8548 + * it is the position of the leftmost pointer that must be deleted
8549 + * (together with its corresponding key to the left of the pointer)
8550 + * as a result of the previous level's balancing.
8551 + * }
8552 + */
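A quick sanity check on the units before the code (the KEY_SIZE = 16 and
DC_SIZE = 8 values are the usual on-disk ones and are assumed here): when
balancing of level h-1 split a node in two, fix_nodes set

	insert_size[h] = (DC_SIZE + KEY_SIZE) * (blknum[h-1] - 1) = 24

so the division below yields insert_num = 24 / 24 = 1, i.e. exactly one new
(key, child pointer) pair to insert after child_pos. A single-pointer deletion
would arrive as insert_size[h] = -24, giving insert_num = -1 and sending us
into balance_internal_when_delete().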
8554 +int balance_internal(struct tree_balance *tb,
8555 + int h, /* level of the tree */
8556 + int child_pos,
8557 + /* key for insertion on higher level */
8558 + struct item_head *insert_key,
8559 + /* node for insertion on higher level */
8560 + struct buffer_head **insert_ptr)
8562 + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
8563 + struct buffer_info bi;
8565 + /*
8566 + * we return this: it is 0 if there is no S[h],
8567 + * else it is tb->S[h]->b_item_order
8568 + */
8569 + int order;
8570 + int insert_num, n, k;
8571 + struct buffer_head *S_new;
8572 + struct item_head new_insert_key;
8573 + struct buffer_head *new_insert_ptr = NULL;
8574 + struct item_head *new_insert_key_addr = insert_key;
8576 + RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h);
8578 + PROC_INFO_INC(tb->tb_sb, balance_at[h]);
8580 +	/* order == tb->S[h]->b_item_order when S[h] exists */
8581 +	order = tbSh ? PATH_H_POSITION(tb->tb_path, h + 1) : 0;
8584 + /*
8585 + * Using insert_size[h] calculate the number insert_num of items
8586 + * that must be inserted to or deleted from S[h].
8587 + */
8588 + insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE));
8590 +	/* Check whether insert_num is proper */
8591 + RFALSE(insert_num < -2 || insert_num > 2,
8592 + "incorrect number of items inserted to the internal node (%d)",
8593 + insert_num);
8594 + RFALSE(h > 1 && (insert_num > 1 || insert_num < -1),
8595 + "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level",
8596 + insert_num, h);
8598 + /* Make balance in case insert_num < 0 */
8599 + if (insert_num < 0) {
8600 + balance_internal_when_delete(tb, h, child_pos);
8601 + return order;
8604 + k = 0;
8605 + if (tb->lnum[h] > 0) {
8606 + /*
8607 + * shift lnum[h] items from S[h] to the left neighbor L[h].
8608 + * check how many of new items fall into L[h] or CFL[h] after
8609 + * shifting
8610 + */
8611 + n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */
8612 + if (tb->lnum[h] <= child_pos) {
8613 + /* new items don't fall into L[h] or CFL[h] */
8614 + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
8615 + tb->lnum[h]);
8616 + child_pos -= tb->lnum[h];
8617 + } else if (tb->lnum[h] > child_pos + insert_num) {
8618 + /* all new items fall into L[h] */
8619 + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
8620 + tb->lnum[h] - insert_num);
8621 + /* insert insert_num keys and node-pointers into L[h] */
8622 + bi.tb = tb;
8623 + bi.bi_bh = tb->L[h];
8624 + bi.bi_parent = tb->FL[h];
8625 + bi.bi_position = get_left_neighbor_position(tb, h);
8626 + internal_insert_childs(&bi,
8627 + /*tb->L[h], tb->S[h-1]->b_next */
8628 + n + child_pos + 1,
8629 + insert_num, insert_key,
8630 + insert_ptr);
8632 + insert_num = 0;
8633 + } else {
8634 + struct disk_child *dc;
8636 + /*
8637 + * some items fall into L[h] or CFL[h],
8638 + * but some don't fall
8639 + */
8640 + internal_shift1_left(tb, h, child_pos + 1);
8641 + /* calculate number of new items that fall into L[h] */
8642 + k = tb->lnum[h] - child_pos - 1;
8643 + bi.tb = tb;
8644 + bi.bi_bh = tb->L[h];
8645 + bi.bi_parent = tb->FL[h];
8646 + bi.bi_position = get_left_neighbor_position(tb, h);
8647 + internal_insert_childs(&bi,
8648 + /*tb->L[h], tb->S[h-1]->b_next, */
8649 + n + child_pos + 1, k,
8650 + insert_key, insert_ptr);
8652 + replace_lkey(tb, h, insert_key + k);
8654 + /*
8655 + * replace the first node-ptr in S[h] by
8656 + * node-ptr to insert_ptr[k]
8657 + */
8658 + dc = B_N_CHILD(tbSh, 0);
8659 + put_dc_size(dc,
8660 + MAX_CHILD_SIZE(insert_ptr[k]) -
8661 + B_FREE_SPACE(insert_ptr[k]));
8662 + put_dc_block_number(dc, insert_ptr[k]->b_blocknr);
8664 + do_balance_mark_internal_dirty(tb, tbSh, 0);
8666 + k++;
8667 + insert_key += k;
8668 + insert_ptr += k;
8669 + insert_num -= k;
8670 + child_pos = 0;
8673 + /* tb->lnum[h] > 0 */
8674 + if (tb->rnum[h] > 0) {
8675 + /*shift rnum[h] items from S[h] to the right neighbor R[h] */
8676 + /*
8677 + * check how many of new items fall into R or CFR
8678 + * after shifting
8679 + */
8680 + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */
8681 + if (n - tb->rnum[h] >= child_pos)
8682 + /* new items fall into S[h] */
8683 + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
8684 + tb->rnum[h]);
8685 + else if (n + insert_num - tb->rnum[h] < child_pos) {
8686 + /* all new items fall into R[h] */
8687 + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
8688 + tb->rnum[h] - insert_num);
8690 + /* insert insert_num keys and node-pointers into R[h] */
8691 + bi.tb = tb;
8692 + bi.bi_bh = tb->R[h];
8693 + bi.bi_parent = tb->FR[h];
8694 + bi.bi_position = get_right_neighbor_position(tb, h);
8695 + internal_insert_childs(&bi,
8696 + /*tb->R[h],tb->S[h-1]->b_next */
8697 + child_pos - n - insert_num +
8698 + tb->rnum[h] - 1,
8699 + insert_num, insert_key,
8700 + insert_ptr);
8701 + insert_num = 0;
8702 + } else {
8703 + struct disk_child *dc;
8705 + /* one of the items falls into CFR[h] */
8706 + internal_shift1_right(tb, h, n - child_pos + 1);
8707 + /* calculate number of new items that fall into R[h] */
8708 + k = tb->rnum[h] - n + child_pos - 1;
8709 + bi.tb = tb;
8710 + bi.bi_bh = tb->R[h];
8711 + bi.bi_parent = tb->FR[h];
8712 + bi.bi_position = get_right_neighbor_position(tb, h);
8713 + internal_insert_childs(&bi,
8714 + /*tb->R[h], tb->R[h]->b_child, */
8715 + 0, k, insert_key + 1,
8716 + insert_ptr + 1);
8718 + replace_rkey(tb, h, insert_key + insert_num - k - 1);
8720 + /*
8721 + * replace the first node-ptr in R[h] by
8722 + * node-ptr insert_ptr[insert_num-k-1]
8723 + */
8724 + dc = B_N_CHILD(tb->R[h], 0);
8725 + put_dc_size(dc,
8726 + MAX_CHILD_SIZE(insert_ptr
8727 + [insert_num - k - 1]) -
8728 + B_FREE_SPACE(insert_ptr
8729 + [insert_num - k - 1]));
8730 + put_dc_block_number(dc,
8731 + insert_ptr[insert_num - k -
8732 + 1]->b_blocknr);
8734 + do_balance_mark_internal_dirty(tb, tb->R[h], 0);
8736 + insert_num -= (k + 1);
8740 + /** Fill new node that appears instead of S[h] **/
8741 + RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level");
8742 + RFALSE(tb->blknum[h] < 0, "blknum can not be < 0");
8744 + if (!tb->blknum[h]) { /* node S[h] is empty now */
8745 + RFALSE(!tbSh, "S[h] is equal NULL");
8747 + /* do what is needed for buffer thrown from tree */
8748 + reiserfs_invalidate_buffer(tb, tbSh);
8749 + return order;
8752 + if (!tbSh) {
8753 + /* create new root */
8754 + struct disk_child *dc;
8755 + struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1);
8756 + struct block_head *blkh;
8758 + if (tb->blknum[h] != 1)
8759 + reiserfs_panic(NULL, "ibalance-3", "One new node "
8760 + "required for creating the new root");
8761 + /* S[h] = empty buffer from the list FEB. */
8762 + tbSh = get_FEB(tb);
8763 + blkh = B_BLK_HEAD(tbSh);
8764 + set_blkh_level(blkh, h + 1);
8766 + /* Put the unique node-pointer to S[h] that points to S[h-1]. */
8768 + dc = B_N_CHILD(tbSh, 0);
8769 + put_dc_block_number(dc, tbSh_1->b_blocknr);
8770 + put_dc_size(dc,
8771 + (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1)));
8773 + tb->insert_size[h] -= DC_SIZE;
8774 + set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE);
8776 + do_balance_mark_internal_dirty(tb, tbSh, 0);
8778 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8779 + check_internal(tbSh);
8780 + /*&&&&&&&&&&&&&&&&&&&&&&&& */
8782 + /* put new root into path structure */
8783 + PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) =
8784 + tbSh;
8786 + /* Change root in structure super block. */
8787 + PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr);
8788 + PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1);
8789 + do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
8792 + if (tb->blknum[h] == 2) {
8793 + int snum;
8794 + struct buffer_info dest_bi, src_bi;
8796 + /* S_new = free buffer from list FEB */
8797 + S_new = get_FEB(tb);
8799 + set_blkh_level(B_BLK_HEAD(S_new), h + 1);
8801 + dest_bi.tb = tb;
8802 + dest_bi.bi_bh = S_new;
8803 + dest_bi.bi_parent = NULL;
8804 + dest_bi.bi_position = 0;
8805 + src_bi.tb = tb;
8806 + src_bi.bi_bh = tbSh;
8807 + src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
8808 + src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
8810 + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */
8811 + snum = (insert_num + n + 1) / 2;
8812 + if (n - snum >= child_pos) {
8813 + /* new items don't fall into S_new */
8814 + /* store the delimiting key for the next level */
8815 + /* new_insert_key = (n - snum)'th key in S[h] */
8816 + memcpy(&new_insert_key, internal_key(tbSh, n - snum),
8817 + KEY_SIZE);
8818 + /* last parameter is del_par */
8819 + internal_move_pointers_items(&dest_bi, &src_bi,
8820 + LAST_TO_FIRST, snum, 0);
8821 + } else if (n + insert_num - snum < child_pos) {
8822 + /* all new items fall into S_new */
8823 + /* store the delimiting key for the next level */
8824 + /*
8825 +			 * new_insert_key = (n + insert_num - snum)'th
8826 + * key in S[h]
8827 + */
8828 + memcpy(&new_insert_key,
8829 + internal_key(tbSh, n + insert_num - snum),
8830 + KEY_SIZE);
8831 + /* last parameter is del_par */
8832 + internal_move_pointers_items(&dest_bi, &src_bi,
8833 + LAST_TO_FIRST,
8834 + snum - insert_num, 0);
8836 + /*
8837 + * insert insert_num keys and node-pointers
8838 + * into S_new
8839 + */
8840 + internal_insert_childs(&dest_bi,
8841 + /*S_new,tb->S[h-1]->b_next, */
8842 + child_pos - n - insert_num +
8843 + snum - 1,
8844 + insert_num, insert_key,
8845 + insert_ptr);
8847 + insert_num = 0;
8848 + } else {
8849 + struct disk_child *dc;
8851 + /* some items fall into S_new, but some don't fall */
8852 + /* last parameter is del_par */
8853 + internal_move_pointers_items(&dest_bi, &src_bi,
8854 + LAST_TO_FIRST,
8855 + n - child_pos + 1, 1);
8856 + /* calculate number of new items that fall into S_new */
8857 + k = snum - n + child_pos - 1;
8859 + internal_insert_childs(&dest_bi, /*S_new, */ 0, k,
8860 + insert_key + 1, insert_ptr + 1);
8862 + /* new_insert_key = insert_key[insert_num - k - 1] */
8863 + memcpy(&new_insert_key, insert_key + insert_num - k - 1,
8864 + KEY_SIZE);
8865 + /*
8866 + * replace first node-ptr in S_new by node-ptr
8867 + * to insert_ptr[insert_num-k-1]
8868 + */
8870 + dc = B_N_CHILD(S_new, 0);
8871 + put_dc_size(dc,
8872 + (MAX_CHILD_SIZE
8873 + (insert_ptr[insert_num - k - 1]) -
8874 + B_FREE_SPACE(insert_ptr
8875 + [insert_num - k - 1])));
8876 + put_dc_block_number(dc,
8877 + insert_ptr[insert_num - k -
8878 + 1]->b_blocknr);
8880 + do_balance_mark_internal_dirty(tb, S_new, 0);
8882 + insert_num -= (k + 1);
8884 + /* new_insert_ptr = node_pointer to S_new */
8885 + new_insert_ptr = S_new;
8887 + RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new)
8888 + || buffer_dirty(S_new), "cm-00001: bad S_new (%b)",
8889 + S_new);
8891 + /* S_new is released in unfix_nodes */
8894 + n = B_NR_ITEMS(tbSh); /*number of items in S[h] */
8896 + if (0 <= child_pos && child_pos <= n && insert_num > 0) {
8897 + bi.tb = tb;
8898 + bi.bi_bh = tbSh;
8899 + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
8900 + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
8901 + internal_insert_childs(&bi, /*tbSh, */
8902 + /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next : tb->S[h]->b_child->b_next, */
8903 + child_pos, insert_num, insert_key,
8904 + insert_ptr);
8907 + insert_ptr[0] = new_insert_ptr;
8908 + if (new_insert_ptr)
8909 + memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE);
8911 + return order;
8913 diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
8914 new file mode 100644
8915 index 000000000000..d39ee5f6c075
8916 --- /dev/null
8917 +++ b/fs/reiserfs/inode.c
8918 @@ -0,0 +1,3416 @@
8920 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
8921 + */
8923 +#include <linux/time.h>
8924 +#include <linux/fs.h>
8925 +#include "reiserfs.h"
8926 +#include "acl.h"
8927 +#include "xattr.h"
8928 +#include <linux/exportfs.h>
8929 +#include <linux/pagemap.h>
8930 +#include <linux/highmem.h>
8931 +#include <linux/slab.h>
8932 +#include <linux/uaccess.h>
8933 +#include <linux/unaligned.h>
8934 +#include <linux/buffer_head.h>
8935 +#include <linux/mpage.h>
8936 +#include <linux/writeback.h>
8937 +#include <linux/quotaops.h>
8938 +#include <linux/swap.h>
8939 +#include <linux/uio.h>
8940 +#include <linux/bio.h>
8942 +int reiserfs_commit_write(struct file *f, struct page *page,
8943 + unsigned from, unsigned to);
8945 +void reiserfs_evict_inode(struct inode *inode)
8947 + /*
8948 + * We need blocks for transaction + (user+group) quota
8949 + * update (possibly delete)
8950 + */
8951 + int jbegin_count =
8952 + JOURNAL_PER_BALANCE_CNT * 2 +
8953 + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
8954 + struct reiserfs_transaction_handle th;
8955 + int err;
8957 + if (!inode->i_nlink && !is_bad_inode(inode))
8958 + dquot_initialize(inode);
8960 + truncate_inode_pages_final(&inode->i_data);
8961 + if (inode->i_nlink)
8962 + goto no_delete;
8964 + /*
8965 +	 * The objectid == 0 case happens when we abort creating a new
8966 +	 * inode for some reason, e.g. lack of space;
8967 +	 * this also handles the bad_inode case
8968 + */
8969 + if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {
8971 + reiserfs_delete_xattrs(inode);
8973 + reiserfs_write_lock(inode->i_sb);
8975 + if (journal_begin(&th, inode->i_sb, jbegin_count))
8976 + goto out;
8977 + reiserfs_update_inode_transaction(inode);
8979 + reiserfs_discard_prealloc(&th, inode);
8981 + err = reiserfs_delete_object(&th, inode);
8983 + /*
8984 + * Do quota update inside a transaction for journaled quotas.
8985 + * We must do that after delete_object so that quota updates
8986 + * go into the same transaction as stat data deletion
8987 + */
8988 + if (!err) {
8989 + int depth = reiserfs_write_unlock_nested(inode->i_sb);
8990 + dquot_free_inode(inode);
8991 + reiserfs_write_lock_nested(inode->i_sb, depth);
8994 + if (journal_end(&th))
8995 + goto out;
8997 + /*
8998 + * check return value from reiserfs_delete_object after
8999 + * ending the transaction
9000 + */
9001 + if (err)
9002 + goto out;
9004 + /*
9005 + * all items of file are deleted, so we can remove
9006 + * "save" link
9007 + * we can't do anything about an error here
9008 + */
9009 + remove_save_link(inode, 0 /* not truncate */);
9010 +out:
9011 + reiserfs_write_unlock(inode->i_sb);
9012 + } else {
9013 + /* no object items are in the tree */
9017 + /* note this must go after the journal_end to prevent deadlock */
9018 + clear_inode(inode);
9020 + dquot_drop(inode);
9021 + inode->i_blocks = 0;
9022 + return;
9024 +no_delete:
9025 + clear_inode(inode);
9026 + dquot_drop(inode);
9029 +static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
9030 + __u32 objectid, loff_t offset, int type, int length)
9032 + key->version = version;
9034 + key->on_disk_key.k_dir_id = dirid;
9035 + key->on_disk_key.k_objectid = objectid;
9036 + set_cpu_key_k_offset(key, offset);
9037 + set_cpu_key_k_type(key, type);
9038 + key->key_length = length;
9042 + * take base of inode_key (it comes from inode always) (dirid, objectid)
9043 + * and version from an inode, set offset and type of key
9044 + */
9045 +void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
9046 + int type, int length)
9048 + _make_cpu_key(key, get_inode_item_key_version(inode),
9049 + le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
9050 + le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
9051 + length);
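For orientation: reiserfs addresses a file body by 1-based byte offset, so the key for the 'block'-th logical block is built from block * blocksize + 1, which is exactly what the make_cpu_key() call in _get_block_create_0() below computes. A minimal standalone sketch of that arithmetic in plain userspace C; toy_key is a stand-in for struct cpu_key:

/* Standalone illustration of the key-offset arithmetic; toy_key is a
 * stand-in for struct cpu_key, not kernel code. */
#include <stdint.h>
#include <stdio.h>

struct toy_key {
        uint32_t dirid;
        uint32_t objectid;
        uint64_t offset;        /* 1-based byte offset into the object */
};

int main(void)
{
        uint64_t block = 10;            /* the 'block'-th logical block */
        uint64_t blocksize = 4096;      /* sb->s_blocksize */
        struct toy_key key = {
                .dirid = 42, .objectid = 1000,
                /* first byte of that block, counted from 1 */
                .offset = block * blocksize + 1,
        };

        printf("block %llu starts at key offset %llu\n",
               (unsigned long long)block, (unsigned long long)key.offset);
        return 0;
}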
9054 +/* when key is 0, do not set version and short key */
9055 +inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
9056 + int version,
9057 + loff_t offset, int type, int length,
9058 + int entry_count /*or ih_free_space */ )
9060 + if (key) {
9061 + ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
9062 + ih->ih_key.k_objectid =
9063 + cpu_to_le32(key->on_disk_key.k_objectid);
9065 + put_ih_version(ih, version);
9066 + set_le_ih_k_offset(ih, offset);
9067 + set_le_ih_k_type(ih, type);
9068 + put_ih_item_len(ih, length);
9069 + /* set_ih_free_space (ih, 0); */
9070 + /*
9071 + * for directory items it is entry count, for directs and stat
9072 + * datas - 0xffff, for indirects - 0
9073 + */
9074 + put_ih_entry_count(ih, entry_count);
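As the comment above notes, the entry_count/ih_free_space field is overloaded per item type. A hedged userspace sketch of just that convention; the TOY_* names are local stand-ins, not the kernel's item type constants:

/* Illustration of the overloaded entry_count/ih_free_space field;
 * the TOY_* constants are local stand-ins. */
#include <stdio.h>

enum toy_type { TOY_STAT_DATA, TOY_DIRECT, TOY_INDIRECT, TOY_DIRENTRY };

static unsigned entry_count_for(enum toy_type t, unsigned dir_entries)
{
        switch (t) {
        case TOY_DIRENTRY:
                return dir_entries;     /* real entry count */
        case TOY_INDIRECT:
                return 0;
        default:                        /* direct items and stat data */
                return 0xffff;
        }
}

int main(void)
{
        printf("dir item with 2 entries -> %u\n",
               entry_count_for(TOY_DIRENTRY, 2));
        printf("indirect item           -> %u\n",
               entry_count_for(TOY_INDIRECT, 0));
        printf("stat data / direct      -> 0x%x\n",
               entry_count_for(TOY_STAT_DATA, 0));
        return 0;
}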
9078 + * FIXME: we might cache recently accessed indirect item
9079 + * Ugh. Not too eager for that....
9080 + * I cut the code until such time as I see a convincing argument (benchmark).
9081 + * I don't want a bloated inode struct..., and I don't like code complexity....
9082 + */
9085 + * cutting the code is fine, since it really isn't in use yet and is easy
9086 + * to add back in. But, Vladimir has a really good idea here. Think
9087 + * about what happens for reading a file. For each page,
9088 + * The VFS layer calls reiserfs_read_folio, who searches the tree to find
9089 + * an indirect item. This indirect item has X number of pointers, where
9090 + * X is a big number if we've done the block allocation right. But,
9091 + * we only use one or two of these pointers during each call to read_folio,
9092 + * needlessly re-searching the tree later on.
9094 + * The size of the cache could be dynamic based on the size of the file.
9096 + * I'd also like to see us cache the location of the stat data item, since
9097 + * we are needlessly re-searching for that frequently.
9099 + * --chris
9100 + */
9103 + * If this page has a file tail in it, and
9104 + * it was read in by get_block_create_0, the page data is valid,
9105 + * but tail is still sitting in a direct item, and we can't write to
9106 + * it. So, look through this page, and check all the mapped buffers
9107 + * to make sure they have valid block numbers. Any that don't have
9108 + * one need to be unmapped, so that __block_write_begin will correctly call
9109 + * reiserfs_get_block to convert the tail into an unformatted node
9110 + */
9111 +static inline void fix_tail_page_for_writing(struct page *page)
9113 + struct buffer_head *head, *next, *bh;
9115 + if (page && page_has_buffers(page)) {
9116 + head = page_buffers(page);
9117 + bh = head;
9118 + do {
9119 + next = bh->b_this_page;
9120 + if (buffer_mapped(bh) && bh->b_blocknr == 0) {
9121 + reiserfs_unmap_buffer(bh);
9123 + bh = next;
9124 + } while (bh != head);
9129 + * reiserfs_get_block does not need to allocate a block only if it has been
9130 + * done already or non-hole position has been found in the indirect item
9131 + */
9132 +static inline int allocation_needed(int retval, b_blocknr_t allocated,
9133 + struct item_head *ih,
9134 + __le32 * item, int pos_in_item)
9136 + if (allocated)
9137 + return 0;
9138 + if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
9139 + get_block_num(item, pos_in_item))
9140 + return 0;
9141 + return 1;
9144 +static inline int indirect_item_found(int retval, struct item_head *ih)
9146 + return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
9149 +static inline void set_block_dev_mapped(struct buffer_head *bh,
9150 + b_blocknr_t block, struct inode *inode)
9152 + map_bh(bh, inode->i_sb, block);
9156 + * files which were created in the earlier version cannot be
9157 + * larger than 2 GB
9158 + */
9159 +static int file_capable(struct inode *inode, sector_t block)
9161 + /* it is new file. */
9162 + if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||
9163 + /* old file, but 'block' is inside of 2gb */
9164 + block < (1 << (31 - inode->i_sb->s_blocksize_bits)))
9165 + return 1;
9167 + return 0;
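Concretely, an old-format key stores the byte offset in 32 bits, so the highest addressable block is 1 << (31 - blocksize_bits), a 2 GB cap for every block size. A standalone illustration of the limit checked by file_capable():

/* Standalone illustration of the old-format 2 GB size limit. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        for (unsigned bits = 10; bits <= 12; bits++) {  /* 1K..4K blocks */
                uint64_t max_blocks = 1ULL << (31 - bits);
                printf("blocksize %u: old-format limit = %llu blocks (%llu bytes)\n",
                       1u << bits, (unsigned long long)max_blocks,
                       (unsigned long long)(max_blocks << bits));
        }
        return 0;
}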
9170 +static int restart_transaction(struct reiserfs_transaction_handle *th,
9171 + struct inode *inode, struct treepath *path)
9173 + struct super_block *s = th->t_super;
9174 + int err;
9176 + BUG_ON(!th->t_trans_id);
9177 + BUG_ON(!th->t_refcount);
9179 + pathrelse(path);
9181 + /* we cannot restart while nested */
9182 + if (th->t_refcount > 1) {
9183 + return 0;
9185 + reiserfs_update_sd(th, inode);
9186 + err = journal_end(th);
9187 + if (!err) {
9188 + err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
9189 + if (!err)
9190 + reiserfs_update_inode_transaction(inode);
9192 + return err;
9196 + * Called by get_block when create == 0. Returns the block number
9197 + * for the 'block'-th logical block of the file. When it hits a direct
9198 + * item it either returns 0 (when called from bmap) or reads the
9199 + * direct item into a piece of the page (bh_result).
9202 + */
9203 +static int _get_block_create_0(struct inode *inode, sector_t block,
9204 + struct buffer_head *bh_result, int args)
9206 + INITIALIZE_PATH(path);
9207 + struct cpu_key key;
9208 + struct buffer_head *bh;
9209 + struct item_head *ih, tmp_ih;
9210 + b_blocknr_t blocknr;
9211 + char *p;
9212 + int chars;
9213 + int ret;
9214 + int result;
9215 + int done = 0;
9216 + unsigned long offset;
9218 + /* prepare the key to look for the 'block'-th block of file */
9219 + make_cpu_key(&key, inode,
9220 + (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
9221 + 3);
9223 + result = search_for_position_by_key(inode->i_sb, &key, &path);
9224 + if (result != POSITION_FOUND) {
9225 + pathrelse(&path);
9226 + if (result == IO_ERROR)
9227 + return -EIO;
9228 + /*
9229 + * We do not return -ENOENT if there is a hole but page is
9230 + * uptodate, because it means that there is some mmapped data
9231 + * associated with it that is yet to be written to disk.
9232 + */
9233 + if ((args & GET_BLOCK_NO_HOLE)
9234 + && !PageUptodate(bh_result->b_page)) {
9235 + return -ENOENT;
9237 + return 0;
9240 + bh = get_last_bh(&path);
9241 + ih = tp_item_head(&path);
9242 + if (is_indirect_le_ih(ih)) {
9243 + __le32 *ind_item = (__le32 *) ih_item_body(bh, ih);
9245 + /*
9246 + * FIXME: here we could cache indirect item or part of it in
9247 + * the inode to avoid search_by_key in case of subsequent
9248 + * access to file
9249 + */
9250 + blocknr = get_block_num(ind_item, path.pos_in_item);
9251 + ret = 0;
9252 + if (blocknr) {
9253 + map_bh(bh_result, inode->i_sb, blocknr);
9254 + if (path.pos_in_item ==
9255 + ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
9256 + set_buffer_boundary(bh_result);
9258 + } else
9259 + /*
9260 + * We do not return -ENOENT if there is a hole but
9261 + * page is uptodate, because it means that there is
9262 + * some mmapped data associated with it that is
9263 + * yet to be written to disk.
9264 + */
9265 + if ((args & GET_BLOCK_NO_HOLE)
9266 + && !PageUptodate(bh_result->b_page)) {
9267 + ret = -ENOENT;
9270 + pathrelse(&path);
9271 + return ret;
9273 + /* requested data are in direct item(s) */
9274 + if (!(args & GET_BLOCK_READ_DIRECT)) {
9275 + /*
9276 + * we are called by bmap. FIXME: we can not map block of file
9277 + * when it is stored in direct item(s)
9278 + */
9279 + pathrelse(&path);
9280 + return -ENOENT;
9283 + /*
9284 + * if we've got a direct item, and the buffer or page was uptodate,
9285 + * we don't want to pull data off disk again. skip to the
9286 + * end, where we map the buffer and return
9287 + */
9288 + if (buffer_uptodate(bh_result)) {
9289 + goto finished;
9290 + } else
9291 + /*
9292 + * grab_tail_page can trigger calls to reiserfs_get_block on
9293 + * up to date pages without any buffers. If the page is up
9294 + * to date, we don't want to read old data off disk. Set the up
9295 + * to date bit on the buffer instead and jump to the end
9296 + */
9297 + if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
9298 + set_buffer_uptodate(bh_result);
9299 + goto finished;
9301 + /* read file tail into part of page */
9302 + offset = (cpu_key_k_offset(&key) - 1) & (PAGE_SIZE - 1);
9303 + copy_item_head(&tmp_ih, ih);
9305 + /*
9306 + * we only want to kmap if we are reading the tail into the page.
9307 + * this is not the common case, so we don't kmap until we are
9308 + * sure we need to. But, this means the item might move if
9309 + * kmap schedules
9310 + */
9311 + p = (char *)kmap(bh_result->b_page);
9312 + p += offset;
9313 + memset(p, 0, inode->i_sb->s_blocksize);
9314 + do {
9315 + if (!is_direct_le_ih(ih)) {
9316 + BUG();
9318 + /*
9319 + * make sure we don't read more bytes than actually exist in
9320 + * the file. This can happen in odd cases where i_size isn't
9321 + * correct, and when direct item padding results in a few
9322 + * extra bytes at the end of the direct item
9323 + */
9324 + if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
9325 + break;
9326 + if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
9327 + chars =
9328 + inode->i_size - (le_ih_k_offset(ih) - 1) -
9329 + path.pos_in_item;
9330 + done = 1;
9331 + } else {
9332 + chars = ih_item_len(ih) - path.pos_in_item;
9334 + memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars);
9336 + if (done)
9337 + break;
9339 + p += chars;
9341 + /*
9342 + * we are done if the direct item we read is not the last item of
9343 + * the node. FIXME: we could try to check the right delimiting key
9344 + * to see whether direct item continues in the right
9345 + * neighbor or rely on i_size
9346 + */
9347 + if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
9348 + break;
9350 + /* update key to look for the next piece */
9351 + set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
9352 + result = search_for_position_by_key(inode->i_sb, &key, &path);
9353 + if (result != POSITION_FOUND)
9354 + /* i/o error most likely */
9355 + break;
9356 + bh = get_last_bh(&path);
9357 + ih = tp_item_head(&path);
9358 + } while (1);
9360 + flush_dcache_page(bh_result->b_page);
9361 + kunmap(bh_result->b_page);
9363 +finished:
9364 + pathrelse(&path);
9366 + if (result == IO_ERROR)
9367 + return -EIO;
9369 + /*
9370 + * this buffer has valid data, but isn't valid for io. mapping it to
9371 + * block #0 tells the rest of reiserfs it just has a tail in it
9372 + */
9373 + map_bh(bh_result, inode->i_sb, 0);
9374 + set_buffer_uptodate(bh_result);
9375 + return 0;
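The block-#0 mapping set up here is the same convention fix_tail_page_for_writing() tests for earlier in this file: valid data, but nothing to do i/o against. A standalone sketch of the predicate; toy_bh is a stand-in for struct buffer_head:

/* Stand-alone sketch of the "mapped but block 0 == tail data"
 * convention; toy_bh is a stand-in for struct buffer_head. */
#include <stdbool.h>
#include <stdio.h>

struct toy_bh {
        bool mapped;
        unsigned long blocknr;
};

/* true when the buffer holds tail bytes copied from a direct item,
 * i.e. it has valid data but no disk block to do i/o against */
static bool holds_unconverted_tail(const struct toy_bh *bh)
{
        return bh->mapped && bh->blocknr == 0;
}

int main(void)
{
        struct toy_bh tail = { .mapped = true, .blocknr = 0 };
        struct toy_bh normal = { .mapped = true, .blocknr = 12345 };

        printf("tail: %d, normal: %d\n",
               holds_unconverted_tail(&tail),
               holds_unconverted_tail(&normal));
        return 0;
}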
9379 + * this is called to create file map. So, _get_block_create_0 will not
9380 + * read direct item
9381 + */
9382 +static int reiserfs_bmap(struct inode *inode, sector_t block,
9383 + struct buffer_head *bh_result, int create)
9385 + if (!file_capable(inode, block))
9386 + return -EFBIG;
9388 + reiserfs_write_lock(inode->i_sb);
9389 + /* do not read the direct item */
9390 + _get_block_create_0(inode, block, bh_result, 0);
9391 + reiserfs_write_unlock(inode->i_sb);
9392 + return 0;
9396 + * special version of get_block that is only used by grab_tail_page right
9397 + * now. It is sent to __block_write_begin, and when you try to get a
9398 + * block past the end of the file (or a block from a hole) it returns
9399 + * -ENOENT instead of a valid buffer. __block_write_begin expects to
9400 + * be able to do i/o on the buffers returned, unless an error value
9401 + * is also returned.
9403 + * So, this allows __block_write_begin to be used for reading a single block
9404 + * in a page. Where it does not produce a valid page for holes, or past the
9405 + * end of the file. This turns out to be exactly what we need for reading
9406 + * tails for conversion.
9408 + * The point of the wrapper is forcing a certain value for create, even
9409 + * though the VFS layer is calling this function with create==1. If you
9410 + * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
9411 + * don't use this function.
9413 +static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
9414 + struct buffer_head *bh_result,
9415 + int create)
9417 + return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
9421 + * This is special helper for reiserfs_get_block in case we are executing
9422 + * direct_IO request.
9423 + */
9424 +static int reiserfs_get_blocks_direct_io(struct inode *inode,
9425 + sector_t iblock,
9426 + struct buffer_head *bh_result,
9427 + int create)
9429 + int ret;
9431 + bh_result->b_page = NULL;
9433 + /*
9434 + * We set the b_size before reiserfs_get_block call since it is
9435 + * referenced in convert_tail_for_hole() that may be called from
9436 + * reiserfs_get_block()
9437 + */
9438 + bh_result->b_size = i_blocksize(inode);
9440 + ret = reiserfs_get_block(inode, iblock, bh_result,
9441 + create | GET_BLOCK_NO_DANGLE);
9442 + if (ret)
9443 + goto out;
9445 + /* don't allow direct io onto tail pages */
9446 + if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
9447 + /*
9448 + * make sure future calls to the direct io funcs for this
9449 + * offset in the file fail by unmapping the buffer
9450 + */
9451 + clear_buffer_mapped(bh_result);
9452 + ret = -EINVAL;
9455 + /*
9456 + * Possible unpacked tail. Flush the data before pages have
9457 + * disappeared
9458 + */
9459 + if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
9460 + int err;
9462 + reiserfs_write_lock(inode->i_sb);
9464 + err = reiserfs_commit_for_inode(inode);
9465 + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
9467 + reiserfs_write_unlock(inode->i_sb);
9469 + if (err < 0)
9470 + ret = err;
9472 +out:
9473 + return ret;
9477 + * helper function for when reiserfs_get_block is called for a hole
9478 + * but the file tail is still in a direct item
9479 + * bh_result is the buffer head for the hole
9480 + * tail_offset is the offset of the start of the tail in the file
9482 + * This calls prepare_write, which will start a new transaction
9483 + * you should not be in a transaction, or have any paths held when you
9484 + * call this.
9485 + */
9486 +static int convert_tail_for_hole(struct inode *inode,
9487 + struct buffer_head *bh_result,
9488 + loff_t tail_offset)
9490 + unsigned long index;
9491 + unsigned long tail_end;
9492 + unsigned long tail_start;
9493 + struct page *tail_page;
9494 + struct page *hole_page = bh_result->b_page;
9495 + int retval = 0;
9497 + if ((tail_offset & (bh_result->b_size - 1)) != 1)
9498 + return -EIO;
9500 + /* always try to read until the end of the block */
9501 + tail_start = tail_offset & (PAGE_SIZE - 1);
9502 + tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
9504 + index = tail_offset >> PAGE_SHIFT;
9505 + /*
9506 + * hole_page can be zero in case of direct_io, we are sure
9507 + * that we cannot get here if we write with O_DIRECT into tail page
9508 + */
9509 + if (!hole_page || index != hole_page->index) {
9510 + tail_page = grab_cache_page(inode->i_mapping, index);
9511 + retval = -ENOMEM;
9512 + if (!tail_page) {
9513 + goto out;
9515 + } else {
9516 + tail_page = hole_page;
9519 + /*
9520 + * we don't have to make sure the conversion did not happen while
9521 + * we were locking the page because anyone that could convert
9522 + * must first take i_mutex.
9524 + * We must fix the tail page for writing because it might have buffers
9525 + * that are mapped, but have a block number of 0. This indicates tail
9526 + * data that has been read directly into the page, and
9527 + * __block_write_begin won't trigger a get_block in this case.
9528 + */
9529 + fix_tail_page_for_writing(tail_page);
9530 + retval = __reiserfs_write_begin(tail_page, tail_start,
9531 + tail_end - tail_start);
9532 + if (retval)
9533 + goto unlock;
9535 + /* tail conversion might change the data in the page */
9536 + flush_dcache_page(tail_page);
9538 + retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
9540 +unlock:
9541 + if (tail_page != hole_page) {
9542 + unlock_page(tail_page);
9543 + put_page(tail_page);
9545 +out:
9546 + return retval;
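The tail window arithmetic above can be checked by hand. A standalone rerun of the same expressions, assuming 1K blocks and 4K pages; tail_offset is the 1-based file offset of the first tail byte:

/* Standalone check of the tail window arithmetic in
 * convert_tail_for_hole(); constants are illustrative. */
#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL
#define TOY_PAGE_SHIFT 12

int main(void)
{
        uint64_t tail_offset = 5 * 1024 + 1;    /* tail starts at byte 5121 */
        unsigned long b_size = 1024;            /* 1K blocks, 4K pages */

        unsigned long tail_start = tail_offset & (TOY_PAGE_SIZE - 1);
        unsigned long tail_end = (tail_start | (b_size - 1)) + 1;
        unsigned long index = tail_offset >> TOY_PAGE_SHIFT;

        /* (tail_offset & (b_size - 1)) must be 1: in 1-based offsets,
         * a tail begins on a block boundary */
        printf("in-page start %lu, end %lu, page index %lu, aligned %d\n",
               tail_start, tail_end, index,
               (tail_offset & (b_size - 1)) == 1);
        return 0;
}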
9549 +static inline int _allocate_block(struct reiserfs_transaction_handle *th,
9550 + sector_t block,
9551 + struct inode *inode,
9552 + b_blocknr_t * allocated_block_nr,
9553 + struct treepath *path, int flags)
9555 + BUG_ON(!th->t_trans_id);
9557 +#ifdef REISERFS_PREALLOCATE
9558 + if (!(flags & GET_BLOCK_NO_IMUX)) {
9559 + return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
9560 + path, block);
9562 +#endif
9563 + return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
9564 + block);
9567 +int reiserfs_get_block(struct inode *inode, sector_t block,
9568 + struct buffer_head *bh_result, int create)
9570 + int repeat, retval = 0;
9571 + /* b_blocknr_t is an (unsigned) 32 bit int */
9572 + b_blocknr_t allocated_block_nr = 0;
9573 + INITIALIZE_PATH(path);
9574 + int pos_in_item;
9575 + struct cpu_key key;
9576 + struct buffer_head *bh, *unbh = NULL;
9577 + struct item_head *ih, tmp_ih;
9578 + __le32 *item;
9579 + int done;
9580 + int fs_gen;
9581 + struct reiserfs_transaction_handle *th = NULL;
9582 + /*
9583 + * space reserved in transaction batch:
9584 + * . 3 balancings in direct->indirect conversion
9585 + * . 1 block involved into reiserfs_update_sd()
9586 + * XXX in practically impossible worst case direct2indirect()
9587 + * can incur (much) more than 3 balancings.
9588 + * quota update for user, group
9589 + */
9590 + int jbegin_count =
9591 + JOURNAL_PER_BALANCE_CNT * 3 + 1 +
9592 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
9593 + int version;
9594 + int dangle = 1;
9595 + loff_t new_offset =
9596 + (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
9598 + reiserfs_write_lock(inode->i_sb);
9599 + version = get_inode_item_key_version(inode);
9601 + if (!file_capable(inode, block)) {
9602 + reiserfs_write_unlock(inode->i_sb);
9603 + return -EFBIG;
9606 + /*
9607 + * if !create, we aren't changing the FS, so we don't need to
9608 + * log anything, so we don't need to start a transaction
9609 + */
9610 + if (!(create & GET_BLOCK_CREATE)) {
9611 + int ret;
9612 + /* find number of block-th logical block of the file */
9613 + ret = _get_block_create_0(inode, block, bh_result,
9614 + create | GET_BLOCK_READ_DIRECT);
9615 + reiserfs_write_unlock(inode->i_sb);
9616 + return ret;
9619 + /*
9620 + * if we're already in a transaction, make sure to close
9621 + * any new transactions we start in this func
9622 + */
9623 + if ((create & GET_BLOCK_NO_DANGLE) ||
9624 + reiserfs_transaction_running(inode->i_sb))
9625 + dangle = 0;
9627 + /*
9628 + * If file is of such a size, that it might have a tail and
9629 + * tails are enabled we should mark it as possibly needing
9630 + * tail packing on close
9631 + */
9632 + if ((have_large_tails(inode->i_sb)
9633 + && inode->i_size < i_block_size(inode) * 4)
9634 + || (have_small_tails(inode->i_sb)
9635 + && inode->i_size < i_block_size(inode)))
9636 + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
9638 + /* set the key of the first byte in the 'block'-th block of file */
9639 + make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
9640 + if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
9641 +start_trans:
9642 + th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
9643 + if (!th) {
9644 + retval = -ENOMEM;
9645 + goto failure;
9647 + reiserfs_update_inode_transaction(inode);
9649 +research:
9651 + retval = search_for_position_by_key(inode->i_sb, &key, &path);
9652 + if (retval == IO_ERROR) {
9653 + retval = -EIO;
9654 + goto failure;
9657 + bh = get_last_bh(&path);
9658 + ih = tp_item_head(&path);
9659 + item = tp_item_body(&path);
9660 + pos_in_item = path.pos_in_item;
9662 + fs_gen = get_generation(inode->i_sb);
9663 + copy_item_head(&tmp_ih, ih);
9665 + if (allocation_needed
9666 + (retval, allocated_block_nr, ih, item, pos_in_item)) {
9667 + /* we have to allocate block for the unformatted node */
9668 + if (!th) {
9669 + pathrelse(&path);
9670 + goto start_trans;
9673 + repeat =
9674 + _allocate_block(th, block, inode, &allocated_block_nr,
9675 + &path, create);
9677 + /*
9678 + * restart the transaction to give the journal a chance to free
9679 + * some blocks. releases the path, so we have to go back to
9680 + * research if we succeed on the second try
9681 + */
9682 + if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
9683 + SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
9684 + retval = restart_transaction(th, inode, &path);
9685 + if (retval)
9686 + goto failure;
9687 + repeat =
9688 + _allocate_block(th, block, inode,
9689 + &allocated_block_nr, NULL, create);
9691 + if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
9692 + goto research;
9694 + if (repeat == QUOTA_EXCEEDED)
9695 + retval = -EDQUOT;
9696 + else
9697 + retval = -ENOSPC;
9698 + goto failure;
9701 + if (fs_changed(fs_gen, inode->i_sb)
9702 + && item_moved(&tmp_ih, &path)) {
9703 + goto research;
9707 + if (indirect_item_found(retval, ih)) {
9708 + b_blocknr_t unfm_ptr;
9709 + /*
9710 + * 'block'-th block is in the file already (there is
9711 + * corresponding cell in some indirect item). But it may be
9712 + * a zero unformatted node pointer (a hole)
9713 + */
9714 + unfm_ptr = get_block_num(item, pos_in_item);
9715 + if (unfm_ptr == 0) {
9716 + /* use allocated block to plug the hole */
9717 + reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
9718 + if (fs_changed(fs_gen, inode->i_sb)
9719 + && item_moved(&tmp_ih, &path)) {
9720 + reiserfs_restore_prepared_buffer(inode->i_sb,
9721 + bh);
9722 + goto research;
9724 + set_buffer_new(bh_result);
9725 + if (buffer_dirty(bh_result)
9726 + && reiserfs_data_ordered(inode->i_sb))
9727 + reiserfs_add_ordered_list(inode, bh_result);
9728 + put_block_num(item, pos_in_item, allocated_block_nr);
9729 + unfm_ptr = allocated_block_nr;
9730 + journal_mark_dirty(th, bh);
9731 + reiserfs_update_sd(th, inode);
9733 + set_block_dev_mapped(bh_result, unfm_ptr, inode);
9734 + pathrelse(&path);
9735 + retval = 0;
9736 + if (!dangle && th)
9737 + retval = reiserfs_end_persistent_transaction(th);
9739 + reiserfs_write_unlock(inode->i_sb);
9741 + /*
9742 + * the item was found, so new blocks were not added to the file;
9743 + * there is no need to make sure the inode is updated with this
9744 + * transaction
9745 + */
9746 + return retval;
9749 + if (!th) {
9750 + pathrelse(&path);
9751 + goto start_trans;
9754 + /*
9755 + * desired position is not found or is in the direct item. We have
9756 + * to append the file with holes up to the 'block'-th block, converting
9757 + * direct items to an indirect one if necessary
9758 + */
9759 + done = 0;
9760 + do {
9761 + if (is_statdata_le_ih(ih)) {
9762 + __le32 unp = 0;
9763 + struct cpu_key tmp_key;
9765 + /* indirect item has to be inserted */
9766 + make_le_item_head(&tmp_ih, &key, version, 1,
9767 + TYPE_INDIRECT, UNFM_P_SIZE,
9768 + 0 /* free_space */ );
9770 + /*
9771 + * we are going to add 'block'-th block to the file.
9772 + * Use allocated block for that
9773 + */
9774 + if (cpu_key_k_offset(&key) == 1) {
9775 + unp = cpu_to_le32(allocated_block_nr);
9776 + set_block_dev_mapped(bh_result,
9777 + allocated_block_nr, inode);
9778 + set_buffer_new(bh_result);
9779 + done = 1;
9781 + tmp_key = key; /* ;) */
9782 + set_cpu_key_k_offset(&tmp_key, 1);
9783 + PATH_LAST_POSITION(&path)++;
9785 + retval =
9786 + reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
9787 + inode, (char *)&unp);
9788 + if (retval) {
9789 + reiserfs_free_block(th, inode,
9790 + allocated_block_nr, 1);
9791 + /*
9792 + * retval == -ENOSPC, -EDQUOT or -EIO
9793 + * or -EEXIST
9794 + */
9795 + goto failure;
9797 + } else if (is_direct_le_ih(ih)) {
9798 + /* direct item has to be converted */
9799 + loff_t tail_offset;
9801 + tail_offset =
9802 + ((le_ih_k_offset(ih) -
9803 + 1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
9805 + /*
9806 + * direct item we just found fits into block we have
9807 + * to map. Convert it into unformatted node: use
9808 + * bh_result for the conversion
9809 + */
9810 + if (tail_offset == cpu_key_k_offset(&key)) {
9811 + set_block_dev_mapped(bh_result,
9812 + allocated_block_nr, inode);
9813 + unbh = bh_result;
9814 + done = 1;
9815 + } else {
9816 + /*
9817 + * we have to pad file tail stored in direct
9818 + * item(s) up to block size and convert it
9819 + * to unformatted node. FIXME: this should
9820 + * also get into page cache
9821 + */
9823 + pathrelse(&path);
9824 + /*
9825 + * ugly, but we can only end the transaction if
9826 + * we aren't nested
9827 + */
9828 + BUG_ON(!th->t_refcount);
9829 + if (th->t_refcount == 1) {
9830 + retval =
9831 + reiserfs_end_persistent_transaction
9832 + (th);
9833 + th = NULL;
9834 + if (retval)
9835 + goto failure;
9838 + retval =
9839 + convert_tail_for_hole(inode, bh_result,
9840 + tail_offset);
9841 + if (retval) {
9842 + if (retval != -ENOSPC)
9843 + reiserfs_error(inode->i_sb,
9844 + "clm-6004",
9845 + "convert tail failed "
9846 + "inode %lu, error %d",
9847 + inode->i_ino,
9848 + retval);
9849 + if (allocated_block_nr) {
9850 + /*
9851 + * the bitmap, the super,
9852 + * and the stat data == 3
9853 + */
9854 + if (!th)
9855 + th = reiserfs_persistent_transaction(inode->i_sb, 3);
9856 + if (th)
9857 + reiserfs_free_block(th,
9858 + inode,
9859 + allocated_block_nr,
9860 + 1);
9862 + goto failure;
9864 + goto research;
9866 + retval =
9867 + direct2indirect(th, inode, &path, unbh,
9868 + tail_offset);
9869 + if (retval) {
9870 + reiserfs_unmap_buffer(unbh);
9871 + reiserfs_free_block(th, inode,
9872 + allocated_block_nr, 1);
9873 + goto failure;
9875 + /*
9876 + * it is important the set_buffer_uptodate is done
9877 + * after the direct2indirect. The buffer might
9878 + * contain valid data newer than the data on disk
9879 + * (read by read_folio, changed, and then sent here by
9880 + * writepage). direct2indirect needs to know if unbh
9881 + * was already up to date, so it can decide if the
9882 + * data in unbh needs to be replaced with data from
9883 + * the disk
9884 + */
9885 + set_buffer_uptodate(unbh);
9887 + /*
9888 + * unbh->b_page == NULL in case of DIRECT_IO request,
9889 + * this means buffer will disappear shortly, so it
9890 + * should not be added to the tail list
9891 + */
9892 + if (unbh->b_page) {
9893 + /*
9894 + * we've converted the tail, so we must
9895 + * flush unbh before the transaction commits
9896 + */
9897 + reiserfs_add_tail_list(inode, unbh);
9899 + /*
9900 + * mark it dirty now to prevent commit_write
9901 + * from adding this buffer to the inode's
9902 + * dirty buffer list
9903 + */
9904 + /*
9905 + * AKPM: changed __mark_buffer_dirty to
9906 + * mark_buffer_dirty(). It's still atomic,
9907 + * but it sets the page dirty too, which makes
9908 + * it eligible for writeback at any time by the
9909 + * VM (which was also the case with
9910 + * __mark_buffer_dirty())
9911 + */
9912 + mark_buffer_dirty(unbh);
9914 + } else {
9915 + /*
9916 + * append indirect item with holes if needed, when
9917 + * appending pointer to 'block'-th block use block,
9918 + * which is already allocated
9919 + */
9920 + struct cpu_key tmp_key;
9921 + /*
9922 + * We use this in case we need to allocate
9923 + * only one block which is a fastpath
9924 + */
9925 + unp_t unf_single = 0;
9926 + unp_t *un;
9927 + __u64 max_to_insert =
9928 + MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
9929 + UNFM_P_SIZE;
9930 + __u64 blocks_needed;
9932 + RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
9933 + "vs-804: invalid position for append");
9934 + /*
9935 + * indirect item has to be appended,
9936 + * set up key of that position
9937 + * (key type is unimportant)
9938 + */
9939 + make_cpu_key(&tmp_key, inode,
9940 + le_key_k_offset(version,
9941 + &ih->ih_key) +
9942 + op_bytes_number(ih,
9943 + inode->i_sb->s_blocksize),
9944 + TYPE_INDIRECT, 3);
9946 + RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
9947 + "green-805: invalid offset");
9948 + blocks_needed =
9949 + 1 +
9950 + ((cpu_key_k_offset(&key) -
9951 + cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
9952 + s_blocksize_bits);
9954 + if (blocks_needed == 1) {
9955 + un = &unf_single;
9956 + } else {
9957 + un = kcalloc(min(blocks_needed, max_to_insert),
9958 + UNFM_P_SIZE, GFP_NOFS);
9959 + if (!un) {
9960 + un = &unf_single;
9961 + blocks_needed = 1;
9962 + max_to_insert = 0;
9965 + if (blocks_needed <= max_to_insert) {
9966 + /*
9967 + * we are going to add target block to
9968 + * the file. Use allocated block for that
9969 + */
9970 + un[blocks_needed - 1] =
9971 + cpu_to_le32(allocated_block_nr);
9972 + set_block_dev_mapped(bh_result,
9973 + allocated_block_nr, inode);
9974 + set_buffer_new(bh_result);
9975 + done = 1;
9976 + } else {
9977 + /* paste hole to the indirect item */
9978 + /*
9979 + * If kcalloc failed, max_to_insert becomes
9980 + * zero and it means we only have space for
9981 + * one block
9982 + */
9983 + blocks_needed =
9984 + max_to_insert ? max_to_insert : 1;
9986 + retval =
9987 + reiserfs_paste_into_item(th, &path, &tmp_key, inode,
9988 + (char *)un,
9989 + UNFM_P_SIZE *
9990 + blocks_needed);
9992 + if (blocks_needed != 1)
9993 + kfree(un);
9995 + if (retval) {
9996 + reiserfs_free_block(th, inode,
9997 + allocated_block_nr, 1);
9998 + goto failure;
10000 + if (!done) {
10001 + /*
10002 + * We need to mark new file size in case
10003 + * this function will be interrupted/aborted
10004 + * later on. And we may do this only for
10005 + * holes.
10006 + */
10007 + inode->i_size +=
10008 + inode->i_sb->s_blocksize * blocks_needed;
10012 + if (done == 1)
10013 + break;
10015 + /*
10016 + * this loop could log more blocks than we had originally
10017 + * asked for. So, we have to allow the transaction to end
10018 + * if it is too big or too full. Update the inode so things
10019 + * are consistent if we crash before the function returns
10020 + * release the path so that anybody waiting on the path before
10021 + * ending their transaction will be able to continue.
10022 + */
10023 + if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
10024 + retval = restart_transaction(th, inode, &path);
10025 + if (retval)
10026 + goto failure;
10028 + /*
10029 + * inserting indirect pointers for a hole can take a
10030 + * long time. reschedule if needed and also release the write
10031 + * lock for others.
10032 + */
10033 + reiserfs_cond_resched(inode->i_sb);
10035 + retval = search_for_position_by_key(inode->i_sb, &key, &path);
10036 + if (retval == IO_ERROR) {
10037 + retval = -EIO;
10038 + goto failure;
10040 + if (retval == POSITION_FOUND) {
10041 + reiserfs_warning(inode->i_sb, "vs-825",
10042 + "%K should not be found", &key);
10043 + retval = -EEXIST;
10044 + if (allocated_block_nr)
10045 + reiserfs_free_block(th, inode,
10046 + allocated_block_nr, 1);
10047 + pathrelse(&path);
10048 + goto failure;
10050 + bh = get_last_bh(&path);
10051 + ih = tp_item_head(&path);
10052 + item = tp_item_body(&path);
10053 + pos_in_item = path.pos_in_item;
10054 + } while (1);
10056 + retval = 0;
10058 +failure:
10059 + if (th && (!dangle || (retval && !th->t_trans_id))) {
10060 + int err;
10061 + if (th->t_trans_id)
10062 + reiserfs_update_sd(th, inode);
10063 + err = reiserfs_end_persistent_transaction(th);
10064 + if (err)
10065 + retval = err;
10068 + reiserfs_write_unlock(inode->i_sb);
10069 + reiserfs_check_path(&path);
10070 + return retval;
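The hole-filling loop above pastes at most MAX_ITEM_LEN(blocksize) / UNFM_P_SIZE pointers per iteration and derives blocks_needed from the key distance. A standalone illustration of that math; the 24-byte block-head and item-head overheads are assumptions for the example, not the kernel's exact constants:

/* Standalone illustration of the hole-filling math in
 * reiserfs_get_block(); header sizes are assumed for the example. */
#include <stdint.h>
#include <stdio.h>

#define UNFM_P_SIZE 4   /* one unformatted-node pointer */

int main(void)
{
        unsigned long blocksize = 4096, blocksize_bits = 12;
        unsigned long max_item_len = blocksize - 24 - 24;       /* assumed */
        uint64_t max_to_insert = max_item_len / UNFM_P_SIZE;

        uint64_t key_off = 100 * blocksize + 1;         /* block being mapped */
        uint64_t item_end_off = 4 * blocksize + 1;      /* end of indirect item */
        uint64_t blocks_needed =
            1 + ((key_off - item_end_off) >> blocksize_bits);

        printf("can paste up to %llu pointers per item, need %llu\n",
               (unsigned long long)max_to_insert,
               (unsigned long long)blocks_needed);
        return 0;
}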
10073 +static void reiserfs_readahead(struct readahead_control *rac)
10075 + mpage_readahead(rac, reiserfs_get_block);
10079 + * Compute real number of used bytes by file
10080 + * Following three functions can go away when we'll have enough space in
10081 + * stat item
10082 + */
10083 +static int real_space_diff(struct inode *inode, int sd_size)
10085 + int bytes;
10086 + loff_t blocksize = inode->i_sb->s_blocksize;
10088 + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
10089 + return sd_size;
10091 + /*
10092 + * End of file is also in full block with indirect reference, so round
10093 + * up to the next block.
10095 + * there is just no way to know if the tail is actually packed
10096 + * in the file, so we have to assume it isn't. When we pack the
10097 + * tail, we add 4 bytes to pretend there really is an unformatted
10098 + * node pointer
10099 + */
10100 + bytes =
10101 + ((inode->i_size +
10102 + (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
10103 + sd_size;
10104 + return bytes;
10107 +static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
10108 + int sd_size)
10110 + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
10111 + return inode->i_size +
10112 + (loff_t) (real_space_diff(inode, sd_size));
10114 + return ((loff_t) real_space_diff(inode, sd_size)) +
10115 + (((loff_t) blocks) << 9);
10118 +/* Compute number of blocks used by file in ReiserFS counting */
10119 +static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
10121 + loff_t bytes = inode_get_bytes(inode);
10122 + loff_t real_space = real_space_diff(inode, sd_size);
10124 + /* keeps fsck and non-quota versions of reiserfs happy */
10125 + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
10126 + bytes += (loff_t) 511;
10129 + /*
10130 + * files from before the quota patch might have i_blocks such that
10131 + * bytes < real_space. Deal with that here to prevent it from
10132 + * going negative.
10133 + */
10134 + if (bytes < real_space)
10135 + return 0;
10136 + return (bytes - real_space) >> 9;
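Taken together, the accounting helpers charge one 4-byte unformatted pointer per block plus the stat data, then convert i_bytes back into 512-byte sectors for quota. A standalone rerun of the formulas above; the 44-byte v2 stat data size is an assumption for illustration:

/* Standalone version of the byte accounting in real_space_diff() and
 * to_fake_used_blocks(); the stat data size is assumed. */
#include <stdint.h>
#include <stdio.h>

#define UNFM_P_SIZE 4
#define TOY_SD_SIZE 44  /* assumed size of the v2 stat data */

int main(void)
{
        uint64_t i_size = 10000, blocksize = 4096;

        uint64_t real_space =
            ((i_size + blocksize - 1) / blocksize) * UNFM_P_SIZE + TOY_SD_SIZE;
        /* pretend the file occupies 3 full blocks on disk */
        uint64_t i_bytes = real_space + 3 * blocksize;
        uint64_t fake_blocks = (i_bytes - real_space) >> 9;

        printf("overhead %llu bytes, fake block count %llu sectors\n",
               (unsigned long long)real_space,
               (unsigned long long)fake_blocks);
        return 0;
}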
10140 + * BAD: new directories have stat data of new type and all other items
10141 + * of old type. The version stored in the inode describes the body
10142 + * items, so in update_stat_data we cannot rely on the inode and
10143 + * have to check the item version directly
10144 + */
10146 +/* called by read_locked_inode */
10147 +static void init_inode(struct inode *inode, struct treepath *path)
10149 + struct buffer_head *bh;
10150 + struct item_head *ih;
10151 + __u32 rdev;
10153 + bh = PATH_PLAST_BUFFER(path);
10154 + ih = tp_item_head(path);
10156 + copy_key(INODE_PKEY(inode), &ih->ih_key);
10158 + INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
10159 + REISERFS_I(inode)->i_flags = 0;
10160 + REISERFS_I(inode)->i_prealloc_block = 0;
10161 + REISERFS_I(inode)->i_prealloc_count = 0;
10162 + REISERFS_I(inode)->i_trans_id = 0;
10163 + REISERFS_I(inode)->i_jl = NULL;
10164 + reiserfs_init_xattr_rwsem(inode);
10166 + if (stat_data_v1(ih)) {
10167 + struct stat_data_v1 *sd =
10168 + (struct stat_data_v1 *)ih_item_body(bh, ih);
10169 + unsigned long blocks;
10171 + set_inode_item_key_version(inode, KEY_FORMAT_3_5);
10172 + set_inode_sd_version(inode, STAT_DATA_V1);
10173 + inode->i_mode = sd_v1_mode(sd);
10174 + set_nlink(inode, sd_v1_nlink(sd));
10175 + i_uid_write(inode, sd_v1_uid(sd));
10176 + i_gid_write(inode, sd_v1_gid(sd));
10177 + inode->i_size = sd_v1_size(sd);
10178 + inode_set_atime(inode, sd_v1_atime(sd), 0);
10179 + inode_set_mtime(inode, sd_v1_mtime(sd), 0);
10180 + inode_set_ctime(inode, sd_v1_ctime(sd), 0);
10182 + inode->i_blocks = sd_v1_blocks(sd);
10183 + inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
10184 + blocks = (inode->i_size + 511) >> 9;
10185 + blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
10187 + /*
10188 + * there was a bug in <=3.5.23 when i_blocks could take
10189 + * negative values. Starting from 3.5.17 this value could
10190 + * even be stored in stat data. For such files we set
10191 + * i_blocks based on file size. Just 2 notes: this can be
10192 + * wrong for sparse files. The on-disk value will only be
10193 + * updated if the file's inode ever changes
10194 + */
10195 + if (inode->i_blocks > blocks) {
10196 + inode->i_blocks = blocks;
10199 + rdev = sd_v1_rdev(sd);
10200 + REISERFS_I(inode)->i_first_direct_byte =
10201 + sd_v1_first_direct_byte(sd);
10203 + /*
10204 + * an early bug in the quota code can give us an odd
10205 + * number for the block count. This is incorrect, fix it here.
10206 + */
10207 + if (inode->i_blocks & 1) {
10208 + inode->i_blocks++;
10210 + inode_set_bytes(inode,
10211 + to_real_used_space(inode, inode->i_blocks,
10212 + SD_V1_SIZE));
10213 + /*
10214 + * nopack is initially zero for v1 objects. For v2 objects,
10215 + * nopack is initialised from sd_attrs
10216 + */
10217 + REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
10218 + } else {
10219 + /*
10220 + * new stat data found, but object may have old items
10221 + * (directories and symlinks)
10222 + */
10223 + struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih);
10225 + inode->i_mode = sd_v2_mode(sd);
10226 + set_nlink(inode, sd_v2_nlink(sd));
10227 + i_uid_write(inode, sd_v2_uid(sd));
10228 + inode->i_size = sd_v2_size(sd);
10229 + i_gid_write(inode, sd_v2_gid(sd));
10230 + inode_set_mtime(inode, sd_v2_mtime(sd), 0);
10231 + inode_set_atime(inode, sd_v2_atime(sd), 0);
10232 + inode_set_ctime(inode, sd_v2_ctime(sd), 0);
10233 + inode->i_blocks = sd_v2_blocks(sd);
10234 + rdev = sd_v2_rdev(sd);
10235 + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
10236 + inode->i_generation =
10237 + le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
10238 + else
10239 + inode->i_generation = sd_v2_generation(sd);
10241 + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
10242 + set_inode_item_key_version(inode, KEY_FORMAT_3_5);
10243 + else
10244 + set_inode_item_key_version(inode, KEY_FORMAT_3_6);
10245 + REISERFS_I(inode)->i_first_direct_byte = 0;
10246 + set_inode_sd_version(inode, STAT_DATA_V2);
10247 + inode_set_bytes(inode,
10248 + to_real_used_space(inode, inode->i_blocks,
10249 + SD_V2_SIZE));
10250 + /*
10251 + * read persistent inode attributes from sd and initialise
10252 + * generic inode flags from them
10253 + */
10254 + REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
10255 + sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
10258 + pathrelse(path);
10259 + if (S_ISREG(inode->i_mode)) {
10260 + inode->i_op = &reiserfs_file_inode_operations;
10261 + inode->i_fop = &reiserfs_file_operations;
10262 + inode->i_mapping->a_ops = &reiserfs_address_space_operations;
10263 + } else if (S_ISDIR(inode->i_mode)) {
10264 + inode->i_op = &reiserfs_dir_inode_operations;
10265 + inode->i_fop = &reiserfs_dir_operations;
10266 + } else if (S_ISLNK(inode->i_mode)) {
10267 + inode->i_op = &reiserfs_symlink_inode_operations;
10268 + inode_nohighmem(inode);
10269 + inode->i_mapping->a_ops = &reiserfs_address_space_operations;
10270 + } else {
10271 + inode->i_blocks = 0;
10272 + inode->i_op = &reiserfs_special_inode_operations;
10273 + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
10277 +/* update new stat data with inode fields */
10278 +static void inode2sd(void *sd, struct inode *inode, loff_t size)
10280 + struct stat_data *sd_v2 = (struct stat_data *)sd;
10282 + set_sd_v2_mode(sd_v2, inode->i_mode);
10283 + set_sd_v2_nlink(sd_v2, inode->i_nlink);
10284 + set_sd_v2_uid(sd_v2, i_uid_read(inode));
10285 + set_sd_v2_size(sd_v2, size);
10286 + set_sd_v2_gid(sd_v2, i_gid_read(inode));
10287 + set_sd_v2_mtime(sd_v2, inode_get_mtime_sec(inode));
10288 + set_sd_v2_atime(sd_v2, inode_get_atime_sec(inode));
10289 + set_sd_v2_ctime(sd_v2, inode_get_ctime_sec(inode));
10290 + set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
10291 + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
10292 + set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
10293 + else
10294 + set_sd_v2_generation(sd_v2, inode->i_generation);
10295 + set_sd_v2_attrs(sd_v2, REISERFS_I(inode)->i_attrs);
10298 +/* used to copy inode's fields to old stat data */
10299 +static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
10301 + struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
10303 + set_sd_v1_mode(sd_v1, inode->i_mode);
10304 + set_sd_v1_uid(sd_v1, i_uid_read(inode));
10305 + set_sd_v1_gid(sd_v1, i_gid_read(inode));
10306 + set_sd_v1_nlink(sd_v1, inode->i_nlink);
10307 + set_sd_v1_size(sd_v1, size);
10308 + set_sd_v1_atime(sd_v1, inode_get_atime_sec(inode));
10309 + set_sd_v1_ctime(sd_v1, inode_get_ctime_sec(inode));
10310 + set_sd_v1_mtime(sd_v1, inode_get_mtime_sec(inode));
10312 + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
10313 + set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
10314 + else
10315 + set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
10317 + /* Sigh. i_first_direct_byte is back */
10318 + set_sd_v1_first_direct_byte(sd_v1,
10319 + REISERFS_I(inode)->i_first_direct_byte);
10323 + * NOTE, you must prepare the buffer head before sending it here,
10324 + * and then log it after the call
10325 + */
10326 +static void update_stat_data(struct treepath *path, struct inode *inode,
10327 + loff_t size)
10329 + struct buffer_head *bh;
10330 + struct item_head *ih;
10332 + bh = PATH_PLAST_BUFFER(path);
10333 + ih = tp_item_head(path);
10335 + if (!is_statdata_le_ih(ih))
10336 + reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
10337 + INODE_PKEY(inode), ih);
10339 + /* path points to old stat data */
10340 + if (stat_data_v1(ih)) {
10341 + inode2sd_v1(ih_item_body(bh, ih), inode, size);
10342 + } else {
10343 + inode2sd(ih_item_body(bh, ih), inode, size);
10346 + return;
10349 +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
10350 + struct inode *inode, loff_t size)
10352 + struct cpu_key key;
10353 + INITIALIZE_PATH(path);
10354 + struct buffer_head *bh;
10355 + int fs_gen;
10356 + struct item_head *ih, tmp_ih;
10357 + int retval;
10359 + BUG_ON(!th->t_trans_id);
10361 + /* key type is unimportant */
10362 + make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);
10364 + for (;;) {
10365 + int pos;
10366 + /* look for the object's stat data */
10367 + retval = search_item(inode->i_sb, &key, &path);
10368 + if (retval == IO_ERROR) {
10369 + reiserfs_error(inode->i_sb, "vs-13050",
10370 + "i/o failure occurred trying to "
10371 + "update %K stat data", &key);
10372 + return;
10374 + if (retval == ITEM_NOT_FOUND) {
10375 + pos = PATH_LAST_POSITION(&path);
10376 + pathrelse(&path);
10377 + if (inode->i_nlink == 0) {
10378 + /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
10379 + return;
10381 + reiserfs_warning(inode->i_sb, "vs-13060",
10382 + "stat data of object %k (nlink == %d) "
10383 + "not found (pos %d)",
10384 + INODE_PKEY(inode), inode->i_nlink,
10385 + pos);
10386 + reiserfs_check_path(&path);
10387 + return;
10390 + /*
10391 + * sigh, prepare_for_journal might schedule. When it
10392 + * schedules the FS might change. We have to detect that,
10393 + * and loop back to the search if the stat data item has moved
10394 + */
10395 + bh = get_last_bh(&path);
10396 + ih = tp_item_head(&path);
10397 + copy_item_head(&tmp_ih, ih);
10398 + fs_gen = get_generation(inode->i_sb);
10399 + reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
10401 + /* Stat_data item has been moved after scheduling. */
10402 + if (fs_changed(fs_gen, inode->i_sb)
10403 + && item_moved(&tmp_ih, &path)) {
10404 + reiserfs_restore_prepared_buffer(inode->i_sb, bh);
10405 + continue;
10407 + break;
10409 + update_stat_data(&path, inode, size);
10410 + journal_mark_dirty(th, bh);
10411 + pathrelse(&path);
10412 + return;
10416 + * reiserfs_read_locked_inode is called to read the inode off disk, and it
10417 + * does a make_bad_inode when things go wrong. But, we need to make sure
10418 + * and clear the key in the private portion of the inode, otherwise a
10419 + * corresponding iput might try to delete whatever object the inode last
10420 + * represented.
10421 + */
10422 +static void reiserfs_make_bad_inode(struct inode *inode)
10424 + memset(INODE_PKEY(inode), 0, KEY_SIZE);
10425 + make_bad_inode(inode);
10429 + * initially this function was derived from minix or ext2's analog and
10430 + * evolved as the prototype did
10431 + */
10432 +int reiserfs_init_locked_inode(struct inode *inode, void *p)
10434 + struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
10435 + inode->i_ino = args->objectid;
10436 + INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
10437 + return 0;
10441 + * looks for stat data in the tree, and fills in the stat data
10442 + * fields of the in-core inode
10443 + */
10444 +void reiserfs_read_locked_inode(struct inode *inode,
10445 + struct reiserfs_iget_args *args)
10447 + INITIALIZE_PATH(path_to_sd);
10448 + struct cpu_key key;
10449 + unsigned long dirino;
10450 + int retval;
10452 + dirino = args->dirid;
10454 + /*
10455 + * set version 1, version 2 could be used too, because stat data
10456 + * key is the same in both versions
10457 + */
10458 + _make_cpu_key(&key, KEY_FORMAT_3_5, dirino, inode->i_ino, 0, 0, 3);
10460 + /* look for the object's stat data */
10461 + retval = search_item(inode->i_sb, &key, &path_to_sd);
10462 + if (retval == IO_ERROR) {
10463 + reiserfs_error(inode->i_sb, "vs-13070",
10464 + "i/o failure occurred trying to find "
10465 + "stat data of %K", &key);
10466 + reiserfs_make_bad_inode(inode);
10467 + return;
10470 + /* a stale NFS handle can trigger this without it being an error */
10471 + if (retval != ITEM_FOUND) {
10472 + pathrelse(&path_to_sd);
10473 + reiserfs_make_bad_inode(inode);
10474 + clear_nlink(inode);
10475 + return;
10478 + init_inode(inode, &path_to_sd);
10480 + /*
10481 + * It is possible that knfsd is trying to access inode of a file
10482 + * that is being removed from the disk by some other thread. As we
10483 + * update sd on unlink all that is required is to check for nlink
10484 + * here. This bug was first found by Sizif when debugging
10485 + * SquidNG/Butterfly, forgotten, and found again after Philippe
10486 + * Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
10488 + * A more logical fix would require changes in fs/inode.c:iput() to
10489 + * remove inode from hash-table _after_ fs cleaned disk stuff up and
10490 + * in iget() to return NULL if I_FREEING inode is found in
10491 + * hash-table.
10492 + */
10494 + /*
10495 + * Currently there is one place where it's ok to meet inode with
10496 + * nlink==0: processing of open-unlinked and half-truncated files
10497 + * during mount (fs/reiserfs/super.c:finish_unfinished()).
10498 + */
10499 + if ((inode->i_nlink == 0) &&
10500 + !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
10501 + reiserfs_warning(inode->i_sb, "vs-13075",
10502 + "dead inode read from disk %K. "
10503 + "This is likely to be race with knfsd. Ignore",
10504 + &key);
10505 + reiserfs_make_bad_inode(inode);
10508 + /* init_inode should be releasing the path */
10509 + reiserfs_check_path(&path_to_sd);
10511 + /*
10512 + * Stat data v1 doesn't support ACLs.
10513 + */
10514 + if (get_inode_sd_version(inode) == STAT_DATA_V1)
10515 + cache_no_acl(inode);
10519 + * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
10521 + * @inode: inode from hash table to check
10522 + * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
10524 + * This function is called by iget5_locked() to distinguish reiserfs inodes
10525 + * having the same inode numbers. Such inodes can only exist due to some
10526 + * error condition. One of them should be bad. Inodes with identical
10527 + * inode numbers (objectids) are distinguished by parent directory ids.
10529 + */
10530 +int reiserfs_find_actor(struct inode *inode, void *opaque)
10532 + struct reiserfs_iget_args *args;
10534 + args = opaque;
10535 + /* args is already in CPU order */
10536 + return (inode->i_ino == args->objectid) &&
10537 + (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
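Since the inode hash is keyed by objectid alone, the extra dirid comparison is what tells duplicate objectids apart. A standalone sketch of that rule, with toy stand-ins for the inode and the iget5_locked() cookie:

/* Stand-alone sketch of the disambiguation rule in
 * reiserfs_find_actor(); toy structs stand in for the kernel's. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_inode { uint32_t objectid, dirid; };
struct toy_args  { uint32_t objectid, dirid; };

static bool toy_find_actor(const struct toy_inode *inode,
                           const struct toy_args *args)
{
        return inode->objectid == args->objectid &&
               inode->dirid == args->dirid;
}

int main(void)
{
        struct toy_inode a = { .objectid = 7, .dirid = 2 };
        struct toy_inode b = { .objectid = 7, .dirid = 9 };     /* stale twin */
        struct toy_args want = { .objectid = 7, .dirid = 2 };

        printf("a matches: %d, b matches: %d\n",
               toy_find_actor(&a, &want), toy_find_actor(&b, &want));
        return 0;
}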
10540 +struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
10542 + struct inode *inode;
10543 + struct reiserfs_iget_args args;
10544 + int depth;
10546 + args.objectid = key->on_disk_key.k_objectid;
10547 + args.dirid = key->on_disk_key.k_dir_id;
10548 + depth = reiserfs_write_unlock_nested(s);
10549 + inode = iget5_locked(s, key->on_disk_key.k_objectid,
10550 + reiserfs_find_actor, reiserfs_init_locked_inode,
10551 + (void *)(&args));
10552 + reiserfs_write_lock_nested(s, depth);
10553 + if (!inode)
10554 + return ERR_PTR(-ENOMEM);
10556 + if (inode->i_state & I_NEW) {
10557 + reiserfs_read_locked_inode(inode, &args);
10558 + unlock_new_inode(inode);
10561 + if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
10562 + /* either due to i/o error or a stale NFS handle */
10563 + iput(inode);
10564 + inode = NULL;
10566 + return inode;
10569 +static struct dentry *reiserfs_get_dentry(struct super_block *sb,
10570 + u32 objectid, u32 dir_id, u32 generation)
10573 + struct cpu_key key;
10574 + struct inode *inode;
10576 + key.on_disk_key.k_objectid = objectid;
10577 + key.on_disk_key.k_dir_id = dir_id;
10578 + reiserfs_write_lock(sb);
10579 + inode = reiserfs_iget(sb, &key);
10580 + if (inode && !IS_ERR(inode) && generation != 0 &&
10581 + generation != inode->i_generation) {
10582 + iput(inode);
10583 + inode = NULL;
10585 + reiserfs_write_unlock(sb);
10587 + return d_obtain_alias(inode);
10590 +struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
10591 + int fh_len, int fh_type)
10593 + /*
10594 + * fhtype happens to reflect the number of u32s encoded.
10595 + * due to a bug in earlier code, fhtype might indicate there
10596 + * are more u32s than actually fit.
10597 + * so if fhtype seems to be more than len, reduce fhtype.
10598 + * Valid types are:
10599 + * 2 - objectid + dir_id - legacy support
10600 + * 3 - objectid + dir_id + generation
10601 + * 4 - objectid + dir_id + objectid and dirid of parent - legacy
10602 + * 5 - objectid + dir_id + generation + objectid and dirid of parent
10603 + * 6 - as above plus generation of directory
10604 + * 6 does not fit in NFSv2 handles
10605 + */
10606 + if (fh_type > fh_len) {
10607 + if (fh_type != 6 || fh_len != 5)
10608 + reiserfs_warning(sb, "reiserfs-13077",
10609 + "nfsd/reiserfs, fhtype=%d, len=%d - odd",
10610 + fh_type, fh_len);
10611 + fh_type = fh_len;
10613 + if (fh_len < 2)
10614 + return NULL;
10616 + return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
10617 + (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
10620 +struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
10621 + int fh_len, int fh_type)
10623 + if (fh_type > fh_len)
10624 + fh_type = fh_len;
10625 + if (fh_type < 4)
10626 + return NULL;
10628 + return reiserfs_get_dentry(sb,
10629 + (fh_type >= 5) ? fid->raw[3] : fid->raw[2],
10630 + (fh_type >= 5) ? fid->raw[4] : fid->raw[3],
10631 + (fh_type == 6) ? fid->raw[5] : 0);
10634 +int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
10635 + struct inode *parent)
10637 + int maxlen = *lenp;
10639 + if (parent && (maxlen < 5)) {
10640 + *lenp = 5;
10641 + return FILEID_INVALID;
10642 + } else if (maxlen < 3) {
10643 + *lenp = 3;
10644 + return FILEID_INVALID;
10647 + data[0] = inode->i_ino;
10648 + data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
10649 + data[2] = inode->i_generation;
10650 + *lenp = 3;
10651 + if (parent) {
10652 + data[3] = parent->i_ino;
10653 + data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
10654 + *lenp = 5;
10655 + if (maxlen >= 6) {
10656 + data[5] = parent->i_generation;
10657 + *lenp = 6;
10660 + return *lenp;
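Laid out flat, a type-6 handle is the inode triple followed by the parent triple, matching the type table in reiserfs_fh_to_dentry() above. A standalone sketch of that layout with made-up values:

/* Stand-alone sketch of the NFS file handle layout produced by
 * reiserfs_encode_fh(); all values are made up. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t fh[6];
        int len;

        fh[0] = 1234;   /* inode->i_ino */
        fh[1] = 56;     /* k_dir_id */
        fh[2] = 2;      /* i_generation */
        fh[3] = 99;     /* parent->i_ino */
        fh[4] = 7;      /* parent k_dir_id */
        fh[5] = 1;      /* parent generation */
        len = 6;        /* fh_type == 6, does not fit in NFSv2 handles */

        for (int i = 0; i < len; i++)
                printf("raw[%d] = %u\n", i, fh[i]);
        return 0;
}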
10664 + * looks for stat data, then copies fields to it, marks the buffer
10665 + * containing stat data as dirty
10666 + */
10668 + * reiserfs inodes are never really dirty, since the dirty inode call
10669 + * always logs them. This call allows the VFS inode marking routines
10670 + * to properly mark inodes for datasync and such, but only actually
10671 + * does something when called for a synchronous update.
10672 + */
10673 +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
10675 + struct reiserfs_transaction_handle th;
10676 + int jbegin_count = 1;
10678 + if (sb_rdonly(inode->i_sb))
10679 + return -EROFS;
10680 + /*
10681 + * memory pressure can sometimes initiate write_inode calls with
10682 + * sync == 1,
10683 + * these cases are just when the system needs ram, not when the
10684 + * inode needs to reach disk for safety, and they can safely be
10685 + * ignored because the altered inode has already been logged.
10686 + */
10687 + if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
10688 + reiserfs_write_lock(inode->i_sb);
10689 + if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
10690 + reiserfs_update_sd(&th, inode);
10691 + journal_end_sync(&th);
10693 + reiserfs_write_unlock(inode->i_sb);
10695 + return 0;
10699 + * stat data of new object is inserted already, this inserts the item
10700 + * containing "." and ".." entries
10701 + */
10702 +static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
10703 + struct inode *inode,
10704 + struct item_head *ih, struct treepath *path,
10705 + struct inode *dir)
10707 + struct super_block *sb = th->t_super;
10708 + char empty_dir[EMPTY_DIR_SIZE];
10709 + char *body = empty_dir;
10710 + struct cpu_key key;
10711 + int retval;
10713 + BUG_ON(!th->t_trans_id);
10715 + _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
10716 + le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
10717 + TYPE_DIRENTRY, 3 /*key length */ );
10719 + /*
10720 + * compose item head for new item. Directories consist of items of
10721 + * old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
10722 + * is done by reiserfs_new_inode
10723 + */
10724 + if (old_format_only(sb)) {
10725 + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
10726 + TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
10728 + make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
10729 + ih->ih_key.k_objectid,
10730 + INODE_PKEY(dir)->k_dir_id,
10731 + INODE_PKEY(dir)->k_objectid);
10732 + } else {
10733 + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
10734 + TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
10736 + make_empty_dir_item(body, ih->ih_key.k_dir_id,
10737 + ih->ih_key.k_objectid,
10738 + INODE_PKEY(dir)->k_dir_id,
10739 + INODE_PKEY(dir)->k_objectid);
10742 + /* look for place in the tree for new item */
10743 + retval = search_item(sb, &key, path);
10744 + if (retval == IO_ERROR) {
10745 + reiserfs_error(sb, "vs-13080",
10746 + "i/o failure occurred creating new directory");
10747 + return -EIO;
10749 + if (retval == ITEM_FOUND) {
10750 + pathrelse(path);
10751 + reiserfs_warning(sb, "vs-13070",
10752 + "object with this key exists (%k)",
10753 + &(ih->ih_key));
10754 + return -EEXIST;
10757 + /* insert item, that is empty directory item */
10758 + return reiserfs_insert_item(th, path, &key, ih, inode, body);
10762 + * stat data of object has been inserted, this inserts the item
10763 + * containing the body of symlink
10764 + */
10765 +static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th,
10766 + struct inode *inode,
10767 + struct item_head *ih,
10768 + struct treepath *path, const char *symname,
10769 + int item_len)
10771 + struct super_block *sb = th->t_super;
10772 + struct cpu_key key;
10773 + int retval;
10775 + BUG_ON(!th->t_trans_id);
10777 + _make_cpu_key(&key, KEY_FORMAT_3_5,
10778 + le32_to_cpu(ih->ih_key.k_dir_id),
10779 + le32_to_cpu(ih->ih_key.k_objectid),
10780 + 1, TYPE_DIRECT, 3 /*key length */ );
10782 + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
10783 + 0 /*free_space */ );
10785 + /* look for place in the tree for new item */
10786 + retval = search_item(sb, &key, path);
10787 + if (retval == IO_ERROR) {
10788 + reiserfs_error(sb, "vs-13080",
10789 + "i/o failure occurred creating new symlink");
10790 + return -EIO;
10792 + if (retval == ITEM_FOUND) {
10793 + pathrelse(path);
10794 + reiserfs_warning(sb, "vs-13080",
10795 + "object with this key exists (%k)",
10796 + &(ih->ih_key));
10797 + return -EEXIST;
10800 + /* insert item, that is body of symlink */
10801 + return reiserfs_insert_item(th, path, &key, ih, inode, symname);
10805 + * inserts the stat data into the tree, and then calls
10806 + * reiserfs_new_directory (to insert ".", ".." item if new object is
10807 + * directory) or reiserfs_new_symlink (to insert symlink body if new
10808 + * object is symlink) or nothing (if new object is regular file)
10810 + * NOTE! uid and gid must already be set in the inode. If we return
10811 + * non-zero due to an error, we have to drop the quota previously allocated
10812 + * for the fresh inode. This can only be done outside a transaction, so
10813 + * if we return non-zero, we also end the transaction.
10815 + * @th: active transaction handle
10816 + * @dir: parent directory for new inode
10817 + * @mode: mode of new inode
10818 + * @symname: symlink contents if inode is symlink
10819 + * @i_size: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname)
10820 + * for symlinks
10821 + * @inode: inode to be filled
10822 + * @security: optional security context to associate with this inode
10823 + */
10824 +int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
10825 + struct inode *dir, umode_t mode, const char *symname,
10826 + /* 0 for regular, EMPTY_DIR_SIZE for dirs,
10827 + strlen(symname) for symlinks */
10828 + loff_t i_size, struct dentry *dentry,
10829 + struct inode *inode,
10830 + struct reiserfs_security_handle *security)
10832 + struct super_block *sb = dir->i_sb;
10833 + struct reiserfs_iget_args args;
10834 + INITIALIZE_PATH(path_to_key);
10835 + struct cpu_key key;
10836 + struct item_head ih;
10837 + struct stat_data sd;
10838 + int retval;
10839 + int err;
10840 + int depth;
10842 + BUG_ON(!th->t_trans_id);
10844 + depth = reiserfs_write_unlock_nested(sb);
10845 + err = dquot_alloc_inode(inode);
10846 + reiserfs_write_lock_nested(sb, depth);
10847 + if (err)
10848 + goto out_end_trans;
10849 + if (!dir->i_nlink) {
10850 + err = -EPERM;
10851 + goto out_bad_inode;
10854 + /* item head of new item */
10855 + ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
10856 + ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
10857 + if (!ih.ih_key.k_objectid) {
10858 + err = -ENOMEM;
10859 + goto out_bad_inode;
10861 + args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
10862 + if (old_format_only(sb))
10863 + make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
10864 + TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
10865 + else
10866 + make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
10867 + TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
10868 + memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE);
10869 + args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
10871 + depth = reiserfs_write_unlock_nested(inode->i_sb);
10872 + err = insert_inode_locked4(inode, args.objectid,
10873 + reiserfs_find_actor, &args);
10874 + reiserfs_write_lock_nested(inode->i_sb, depth);
10875 + if (err) {
10876 + err = -EINVAL;
10877 + goto out_bad_inode;
10880 + if (old_format_only(sb))
10881 + /*
10882 + * not a perfect generation count, as object ids can be reused,
10883 + * but this is as good as reiserfs can do right now.
10884 + * note that the private part of inode isn't filled in yet,
10885 + * we have to use the directory.
10886 + */
10887 + inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
10888 + else
10889 +#if defined( USE_INODE_GENERATION_COUNTER )
10890 + inode->i_generation =
10891 + le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
10892 +#else
10893 + inode->i_generation = ++event;
10894 +#endif
10896 + /* fill stat data */
10897 + set_nlink(inode, (S_ISDIR(mode) ? 2 : 1));
10899 + /* uid and gid must already be set by the caller for quota init */
10901 + simple_inode_init_ts(inode);
10902 + inode->i_size = i_size;
10903 + inode->i_blocks = 0;
10904 + inode->i_bytes = 0;
10905 + REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
10906 + U32_MAX /* NO_BYTES_IN_DIRECT_ITEM */;
10908 + INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
10909 + REISERFS_I(inode)->i_flags = 0;
10910 + REISERFS_I(inode)->i_prealloc_block = 0;
10911 + REISERFS_I(inode)->i_prealloc_count = 0;
10912 + REISERFS_I(inode)->i_trans_id = 0;
10913 + REISERFS_I(inode)->i_jl = NULL;
10914 + REISERFS_I(inode)->i_attrs =
10915 + REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
10916 + sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
10917 + reiserfs_init_xattr_rwsem(inode);
10919 + /* key to search for correct place for new stat data */
10920 + _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
10921 + le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
10922 + TYPE_STAT_DATA, 3 /*key length */ );
10924 + /* find proper place for inserting of stat data */
10925 + retval = search_item(sb, &key, &path_to_key);
10926 + if (retval == IO_ERROR) {
10927 + err = -EIO;
10928 + goto out_bad_inode;
10930 + if (retval == ITEM_FOUND) {
10931 + pathrelse(&path_to_key);
10932 + err = -EEXIST;
10933 + goto out_bad_inode;
10935 + if (old_format_only(sb)) {
10936 + /* i_uid or i_gid is too big to be stored in stat data v3.5 */
10937 + if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) {
10938 + pathrelse(&path_to_key);
10939 + err = -EINVAL;
10940 + goto out_bad_inode;
10942 + inode2sd_v1(&sd, inode, inode->i_size);
10943 + } else {
10944 + inode2sd(&sd, inode, inode->i_size);
10946 + /*
10947 + * store in the in-core inode the stat data key and the version
10948 + * all object items will have (directory items will have the old
10949 + * offset format, other new objects will consist of new items)
10950 + */
10951 + if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
10952 + set_inode_item_key_version(inode, KEY_FORMAT_3_5);
10953 + else
10954 + set_inode_item_key_version(inode, KEY_FORMAT_3_6);
10955 + if (old_format_only(sb))
10956 + set_inode_sd_version(inode, STAT_DATA_V1);
10957 + else
10958 + set_inode_sd_version(inode, STAT_DATA_V2);
10960 + /* insert the stat data into the tree */
10961 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
10962 + if (REISERFS_I(dir)->new_packing_locality)
10963 + th->displace_new_blocks = 1;
10964 +#endif
10965 + retval =
10966 + reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
10967 + (char *)(&sd));
10968 + if (retval) {
10969 + err = retval;
10970 + reiserfs_check_path(&path_to_key);
10971 + goto out_bad_inode;
10973 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
10974 + if (!th->displace_new_blocks)
10975 + REISERFS_I(dir)->new_packing_locality = 0;
10976 +#endif
10977 + if (S_ISDIR(mode)) {
10978 + /* insert item with "." and ".." */
10979 + retval =
10980 + reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
10983 + if (S_ISLNK(mode)) {
10984 + /* insert body of symlink */
10985 + if (!old_format_only(sb))
10986 + i_size = ROUND_UP(i_size);
10987 + retval =
10988 + reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
10989 + i_size);
10991 + if (retval) {
10992 + err = retval;
10993 + reiserfs_check_path(&path_to_key);
10994 + journal_end(th);
10995 + goto out_inserted_sd;
10998 + /*
10999 + * Mark it private if we're creating the privroot
11000 + * or something under it.
11001 + */
11002 + if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root)
11003 + reiserfs_init_priv_inode(inode);
11005 + if (reiserfs_posixacl(inode->i_sb)) {
11006 + reiserfs_write_unlock(inode->i_sb);
11007 + retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
11008 + reiserfs_write_lock(inode->i_sb);
11009 + if (retval) {
11010 + err = retval;
11011 + reiserfs_check_path(&path_to_key);
11012 + journal_end(th);
11013 + goto out_inserted_sd;
11015 + } else if (inode->i_sb->s_flags & SB_POSIXACL) {
11016 + reiserfs_warning(inode->i_sb, "jdm-13090",
11017 + "ACLs aren't enabled in the fs, "
11018 + "but vfs thinks they are!");
11021 + if (security->name) {
11022 + reiserfs_write_unlock(inode->i_sb);
11023 + retval = reiserfs_security_write(th, inode, security);
11024 + reiserfs_write_lock(inode->i_sb);
11025 + if (retval) {
11026 + err = retval;
11027 + reiserfs_check_path(&path_to_key);
11028 + retval = journal_end(th);
11029 + if (retval)
11030 + err = retval;
11031 + goto out_inserted_sd;
11035 + reiserfs_update_sd(th, inode);
11036 + reiserfs_check_path(&path_to_key);
11038 + return 0;
11040 +out_bad_inode:
11041 + /* Invalidate the object, nothing was inserted yet */
11042 + INODE_PKEY(inode)->k_objectid = 0;
11044 + /* Quota change must be inside a transaction for journaling */
11045 + depth = reiserfs_write_unlock_nested(inode->i_sb);
11046 + dquot_free_inode(inode);
11047 + reiserfs_write_lock_nested(inode->i_sb, depth);
11049 +out_end_trans:
11050 + journal_end(th);
11051 + /*
11052 + * dquot_drop() needs more credits and can run outside a
11053 + * transaction, so it is better to call it outside
11054 + */
11055 + depth = reiserfs_write_unlock_nested(inode->i_sb);
11056 + dquot_drop(inode);
11057 + reiserfs_write_lock_nested(inode->i_sb, depth);
11058 + inode->i_flags |= S_NOQUOTA;
11059 + make_bad_inode(inode);
11061 +out_inserted_sd:
11062 + clear_nlink(inode);
11063 + th->t_trans_id = 0; /* so the caller can't use this handle later */
11064 + if (inode->i_state & I_NEW)
11065 + unlock_new_inode(inode);
11066 + iput(inode);
11067 + return err;
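+/*
+ * Rough sketch (identifiers illustrative, error handling elided) of how
+ * a caller drives reiserfs_new_inode(); this mirrors the general shape
+ * of reiserfs_create() in namei.c:
+ *
+ *	struct reiserfs_transaction_handle th;
+ *	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+ *	retval = reiserfs_new_inode(&th, dir, mode, NULL, 0,
+ *				    dentry, inode, &security);
+ *	retval = reiserfs_add_entry(&th, dir, dentry->d_name.name,
+ *				    dentry->d_name.len, inode, 1);
+ *	d_instantiate_new(dentry, inode);
+ *	retval = journal_end(&th);
+ *
+ * Note that on a reiserfs_new_inode() failure the transaction has
+ * already been ended, per the NOTE above, so the caller must not end
+ * it again.
+ */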
11071 + * finds the tail page in the page cache,
11072 + * reads the last block in.
11074 + * On success, page_result is set to a locked, pinned page, and bh_result
11075 + * is set to an up-to-date buffer for the last block in the file. Returns 0.
11077 + * Tail conversion is not done, so bh_result might not be valid for writing;
11078 + * check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
11079 + * trying to write the block.
11081 + * On failure, nonzero is returned, and page_result and bh_result are untouched.
11082 + */
11083 +static int grab_tail_page(struct inode *inode,
11084 + struct page **page_result,
11085 + struct buffer_head **bh_result)
11088 + /*
11089 + * we want the page with the last byte in the file,
11090 + * not the page that will hold the next byte for appending
11091 + */
11092 + unsigned long index = (inode->i_size - 1) >> PAGE_SHIFT;
11093 + unsigned long pos = 0;
11094 + unsigned long start = 0;
11095 + unsigned long blocksize = inode->i_sb->s_blocksize;
11096 + unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1);
11097 + struct buffer_head *bh;
11098 + struct buffer_head *head;
11099 + struct folio *folio;
11100 + int error;
11102 + /*
11103 + * we know that we are only called with inode->i_size > 0.
11104 + * we also know that a file tail can never be as big as a block.
11105 + * If i_size % blocksize == 0, our file is currently block aligned
11106 + * and it won't need converting or zeroing after a truncate.
11107 + */
11108 + if ((offset & (blocksize - 1)) == 0) {
11109 + return -ENOENT;
11111 + folio = __filemap_get_folio(inode->i_mapping, index,
11112 + FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
11113 + mapping_gfp_mask(inode->i_mapping));
11114 + if (IS_ERR(folio))
11115 + return PTR_ERR(folio);
11116 + /* start within the page of the last block in the file */
11117 + start = (offset / blocksize) * blocksize;
11119 + error = __block_write_begin(folio, start, offset - start,
11120 + reiserfs_get_block_create_0);
11121 + if (error)
11122 + goto unlock;
11124 + head = folio_buffers(folio);
11125 + bh = head;
11126 + do {
11127 + if (pos >= start) {
11128 + break;
11130 + bh = bh->b_this_page;
11131 + pos += blocksize;
11132 + } while (bh != head);
11134 + if (!buffer_uptodate(bh)) {
11135 + /*
11136 + * note, this should never happen, prepare_write should be
11137 + * taking care of this for us. If the buffer isn't up to
11138 + * date, I've screwed up the code to find the buffer, or the
11139 + * code to call prepare_write
11140 + */
11141 + reiserfs_error(inode->i_sb, "clm-6000",
11142 + "error reading block %lu", bh->b_blocknr);
11143 + error = -EIO;
11144 + goto unlock;
11146 + *bh_result = bh;
11147 + *page_result = &folio->page;
11149 + return error;
11151 +unlock:
11152 + folio_unlock(folio);
11153 + folio_put(folio);
11154 + return error;
11158 + * vfs version of truncate file. Must NOT be called with
11159 + * a transaction already started.
11161 + * some code taken from block_truncate_page
11162 + */
11163 +int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
11165 + struct reiserfs_transaction_handle th;
11166 + /* we want the offset for the first byte after the end of the file */
11167 + unsigned long offset = inode->i_size & (PAGE_SIZE - 1);
11168 + unsigned blocksize = inode->i_sb->s_blocksize;
11169 + unsigned length;
11170 + struct page *page = NULL;
11171 + int error;
11172 + struct buffer_head *bh = NULL;
11173 + int err2;
11175 + reiserfs_write_lock(inode->i_sb);
11177 + if (inode->i_size > 0) {
11178 + error = grab_tail_page(inode, &page, &bh);
11179 + if (error) {
11180 + /*
11181 + * -ENOENT means we truncated past the end of the
11182 + * file, and get_block_create_0 could not find a
11183 + * block to read in, which is ok.
11184 + */
11185 + if (error != -ENOENT)
11186 + reiserfs_error(inode->i_sb, "clm-6001",
11187 + "grab_tail_page failed %d",
11188 + error);
11189 + page = NULL;
11190 + bh = NULL;
11194 + /*
11195 + * so, if page != NULL, we have a buffer head for the offset at
11196 + * the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
11197 + * then we have an unformatted node. Otherwise, we have a direct item,
11198 + * and no zeroing is required on disk. We zero after the truncate,
11199 + * because the truncate might pack the item anyway
11200 + * (it will unmap bh if it packs).
11202 + * it is enough to reserve space in the transaction for 2 balancings:
11203 + * one for "save" link adding and another for the first
11204 + * cut_from_item. 1 is for update_sd
11205 + */
11206 + error = journal_begin(&th, inode->i_sb,
11207 + JOURNAL_PER_BALANCE_CNT * 2 + 1);
11208 + if (error)
11209 + goto out;
11210 + reiserfs_update_inode_transaction(inode);
11211 + if (update_timestamps)
11212 + /*
11213 + * we are doing real truncate: if the system crashes
11214 + * before the last transaction of truncating gets committed
11215 + * - on reboot the file either appears truncated properly
11216 + * or not truncated at all
11217 + */
11218 + add_save_link(&th, inode, 1);
11219 + err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
11220 + error = journal_end(&th);
11221 + if (error)
11222 + goto out;
11224 + /* check reiserfs_do_truncate after ending the transaction */
11225 + if (err2) {
11226 + error = err2;
11227 + goto out;
11230 + if (update_timestamps) {
11231 + error = remove_save_link(inode, 1 /* truncate */);
11232 + if (error)
11233 + goto out;
11236 + if (page) {
11237 + length = offset & (blocksize - 1);
11238 + /* if we are not on a block boundary */
11239 + if (length) {
11240 + length = blocksize - length;
11241 + zero_user(page, offset, length);
11242 + if (buffer_mapped(bh) && bh->b_blocknr != 0) {
11243 + mark_buffer_dirty(bh);
11246 + unlock_page(page);
11247 + put_page(page);
11250 + reiserfs_write_unlock(inode->i_sb);
11252 + return 0;
11253 +out:
11254 + if (page) {
11255 + unlock_page(page);
11256 + put_page(page);
11259 + reiserfs_write_unlock(inode->i_sb);
11261 + return error;
11264 +static int map_block_for_writepage(struct inode *inode,
11265 + struct buffer_head *bh_result,
11266 + unsigned long block)
11268 + struct reiserfs_transaction_handle th;
11269 + int fs_gen;
11270 + struct item_head tmp_ih;
11271 + struct item_head *ih;
11272 + struct buffer_head *bh;
11273 + __le32 *item;
11274 + struct cpu_key key;
11275 + INITIALIZE_PATH(path);
11276 + int pos_in_item;
11277 + int jbegin_count = JOURNAL_PER_BALANCE_CNT;
11278 + loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1;
11279 + int retval;
11280 + int use_get_block = 0;
11281 + int bytes_copied = 0;
11282 + int copy_size;
11283 + int trans_running = 0;
11285 + /*
11286 + * catch places below that try to log something without
11287 + * starting a trans
11288 + */
11289 + th.t_trans_id = 0;
11291 + if (!buffer_uptodate(bh_result)) {
11292 + return -EIO;
11295 + kmap(bh_result->b_page);
11296 +start_over:
11297 + reiserfs_write_lock(inode->i_sb);
11298 + make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
11300 +research:
11301 + retval = search_for_position_by_key(inode->i_sb, &key, &path);
11302 + if (retval != POSITION_FOUND) {
11303 + use_get_block = 1;
11304 + goto out;
11307 + bh = get_last_bh(&path);
11308 + ih = tp_item_head(&path);
11309 + item = tp_item_body(&path);
11310 + pos_in_item = path.pos_in_item;
11312 + /* we've found an unformatted node */
11313 + if (indirect_item_found(retval, ih)) {
11314 + if (bytes_copied > 0) {
11315 + reiserfs_warning(inode->i_sb, "clm-6002",
11316 + "bytes_copied %d", bytes_copied);
11318 + if (!get_block_num(item, pos_in_item)) {
11319 + /* crap, we are writing to a hole */
11320 + use_get_block = 1;
11321 + goto out;
11323 + set_block_dev_mapped(bh_result,
11324 + get_block_num(item, pos_in_item), inode);
11325 + } else if (is_direct_le_ih(ih)) {
11326 + char *p;
11327 + p = page_address(bh_result->b_page);
11328 + p += (byte_offset - 1) & (PAGE_SIZE - 1);
11329 + copy_size = ih_item_len(ih) - pos_in_item;
11331 + fs_gen = get_generation(inode->i_sb);
11332 + copy_item_head(&tmp_ih, ih);
11334 + if (!trans_running) {
11335 + /* vs-3050 is gone, no need to drop the path */
11336 + retval = journal_begin(&th, inode->i_sb, jbegin_count);
11337 + if (retval)
11338 + goto out;
11339 + reiserfs_update_inode_transaction(inode);
11340 + trans_running = 1;
11341 + if (fs_changed(fs_gen, inode->i_sb)
11342 + && item_moved(&tmp_ih, &path)) {
11343 + reiserfs_restore_prepared_buffer(inode->i_sb,
11344 + bh);
11345 + goto research;
11349 + reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
11351 + if (fs_changed(fs_gen, inode->i_sb)
11352 + && item_moved(&tmp_ih, &path)) {
11353 + reiserfs_restore_prepared_buffer(inode->i_sb, bh);
11354 + goto research;
11357 + memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied,
11358 + copy_size);
11360 + journal_mark_dirty(&th, bh);
11361 + bytes_copied += copy_size;
11362 + set_block_dev_mapped(bh_result, 0, inode);
11364 + /* are there still bytes left? */
11365 + if (bytes_copied < bh_result->b_size &&
11366 + (byte_offset + bytes_copied) < inode->i_size) {
11367 + set_cpu_key_k_offset(&key,
11368 + cpu_key_k_offset(&key) +
11369 + copy_size);
11370 + goto research;
11372 + } else {
11373 + reiserfs_warning(inode->i_sb, "clm-6003",
11374 + "bad item inode %lu", inode->i_ino);
11375 + retval = -EIO;
11376 + goto out;
11378 + retval = 0;
11380 +out:
11381 + pathrelse(&path);
11382 + if (trans_running) {
11383 + int err = journal_end(&th);
11384 + if (err)
11385 + retval = err;
11386 + trans_running = 0;
11388 + reiserfs_write_unlock(inode->i_sb);
11390 + /* this is where we fill in holes in the file. */
11391 + if (use_get_block) {
11392 + retval = reiserfs_get_block(inode, block, bh_result,
11393 + GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
11394 + | GET_BLOCK_NO_DANGLE);
11395 + if (!retval) {
11396 + if (!buffer_mapped(bh_result)
11397 + || bh_result->b_blocknr == 0) {
11398 + /* get_block failed to find a mapped unformatted node. */
11399 + use_get_block = 0;
11400 + goto start_over;
11404 + kunmap(bh_result->b_page);
11406 + if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
11407 + /*
11408 + * we've copied data from the page into the direct item, so the
11409 + * buffer in the page is now clean, mark it to reflect that.
11410 + */
11411 + lock_buffer(bh_result);
11412 + clear_buffer_dirty(bh_result);
11413 + unlock_buffer(bh_result);
11415 + return retval;
11419 + * mason@suse.com: updated in 2.5.54 to follow the same general io
11420 + * start/recovery path as __block_write_full_folio, along with special
11421 + * code to handle reiserfs tails.
11422 + */
11423 +static int reiserfs_write_folio(struct folio *folio,
11424 + struct writeback_control *wbc, void *data)
11426 + struct inode *inode = folio->mapping->host;
11427 + unsigned long end_index = inode->i_size >> PAGE_SHIFT;
11428 + int error = 0;
11429 + unsigned long block;
11430 + sector_t last_block;
11431 + struct buffer_head *head, *bh;
11432 + int partial = 0;
11433 + int nr = 0;
11434 + int checked = folio_test_checked(folio);
11435 + struct reiserfs_transaction_handle th;
11436 + struct super_block *s = inode->i_sb;
11437 + int bh_per_page = PAGE_SIZE / s->s_blocksize;
11438 + th.t_trans_id = 0;
11440 + /* no logging allowed when nonblocking or from PF_MEMALLOC */
11441 + if (checked && (current->flags & PF_MEMALLOC)) {
11442 + folio_redirty_for_writepage(wbc, folio);
11443 + folio_unlock(folio);
11444 + return 0;
11447 + /*
11448 + * The folio dirty bit is cleared before writepage is called, which
11449 + * means we have to tell create_empty_buffers to make dirty buffers.
11450 + * The folio really should be up to date at this point, so tossing
11451 + * in the BH_Uptodate is just a sanity check.
11452 + */
11453 + head = folio_buffers(folio);
11454 + if (!head)
11455 + head = create_empty_buffers(folio, s->s_blocksize,
11456 + (1 << BH_Dirty) | (1 << BH_Uptodate));
11458 + /*
11459 + * last folio in the file, zero out any contents past the
11460 + * last byte in the file
11461 + */
11462 + if (folio->index >= end_index) {
11463 + unsigned last_offset;
11465 + last_offset = inode->i_size & (PAGE_SIZE - 1);
11466 + /* no file contents in this folio */
11467 + if (folio->index >= end_index + 1 || !last_offset) {
11468 + folio_unlock(folio);
11469 + return 0;
11471 + folio_zero_segment(folio, last_offset, folio_size(folio));
11473 + bh = head;
11474 + block = folio->index << (PAGE_SHIFT - s->s_blocksize_bits);
11475 + last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
11476 + /* first map all the buffers, logging any direct items we find */
11477 + do {
11478 + if (block > last_block) {
11479 + /*
11480 + * This can happen when the block size is less than
11481 + * the folio size. The corresponding bytes in the folio
11482 + * were zero filled above
11483 + */
11484 + clear_buffer_dirty(bh);
11485 + set_buffer_uptodate(bh);
11486 + } else if ((checked || buffer_dirty(bh)) &&
11487 + (!buffer_mapped(bh) || bh->b_blocknr == 0)) {
11488 + /*
11489 + * not mapped yet, or it points to a direct item, search
11490 + * the btree for the mapping info, and log any direct
11491 + * items found
11492 + */
11493 + if ((error = map_block_for_writepage(inode, bh, block))) {
11494 + goto fail;
11497 + bh = bh->b_this_page;
11498 + block++;
11499 + } while (bh != head);
11501 + /*
11502 + * we start the transaction after map_block_for_writepage,
11503 + * because it can create holes in the file (an unbounded operation).
11504 + * starting it here, we can make a reliable estimate for how many
11505 + * blocks we're going to log
11506 + */
11507 + if (checked) {
11508 + folio_clear_checked(folio);
11509 + reiserfs_write_lock(s);
11510 + error = journal_begin(&th, s, bh_per_page + 1);
11511 + if (error) {
11512 + reiserfs_write_unlock(s);
11513 + goto fail;
11515 + reiserfs_update_inode_transaction(inode);
11517 + /* now go through and lock any dirty buffers on the folio */
11518 + do {
11519 + get_bh(bh);
11520 + if (!buffer_mapped(bh))
11521 + continue;
11522 + if (buffer_mapped(bh) && bh->b_blocknr == 0)
11523 + continue;
11525 + if (checked) {
11526 + reiserfs_prepare_for_journal(s, bh, 1);
11527 + journal_mark_dirty(&th, bh);
11528 + continue;
11530 + /*
11531 + * from this point on, we know the buffer is mapped to a
11532 + * real block and not a direct item
11533 + */
11534 + if (wbc->sync_mode != WB_SYNC_NONE) {
11535 + lock_buffer(bh);
11536 + } else {
11537 + if (!trylock_buffer(bh)) {
11538 + folio_redirty_for_writepage(wbc, folio);
11539 + continue;
11542 + if (test_clear_buffer_dirty(bh)) {
11543 + mark_buffer_async_write(bh);
11544 + } else {
11545 + unlock_buffer(bh);
11547 + } while ((bh = bh->b_this_page) != head);
11549 + if (checked) {
11550 + error = journal_end(&th);
11551 + reiserfs_write_unlock(s);
11552 + if (error)
11553 + goto fail;
11555 + BUG_ON(folio_test_writeback(folio));
11556 + folio_start_writeback(folio);
11557 + folio_unlock(folio);
11559 + /*
11560 + * since any buffer might be the only dirty buffer on the folio,
11561 + * the first submit_bh can bring the folio out of writeback.
11562 + * be careful with the buffers.
11563 + */
11564 + do {
11565 + struct buffer_head *next = bh->b_this_page;
11566 + if (buffer_async_write(bh)) {
11567 + submit_bh(REQ_OP_WRITE, bh);
11568 + nr++;
11570 + put_bh(bh);
11571 + bh = next;
11572 + } while (bh != head);
11574 + error = 0;
11575 +done:
11576 + if (nr == 0) {
11577 + /*
11578 + * if this folio only had a direct item, it is very possible for
11579 + * no io to be required without there being an error. Or,
11580 + * someone else could have locked them and sent them down the
11581 + * pipe without locking the folio
11582 + */
11583 + bh = head;
11584 + do {
11585 + if (!buffer_uptodate(bh)) {
11586 + partial = 1;
11587 + break;
11589 + bh = bh->b_this_page;
11590 + } while (bh != head);
11591 + if (!partial)
11592 + folio_mark_uptodate(folio);
11593 + folio_end_writeback(folio);
11595 + return error;
11597 +fail:
11598 + /*
11599 + * catches various errors, we need to make sure any valid dirty blocks
11600 + * get to the media. The folio is currently locked and not marked for
11601 + * writeback
11602 + */
11603 + folio_clear_uptodate(folio);
11604 + bh = head;
11605 + do {
11606 + get_bh(bh);
11607 + if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
11608 + lock_buffer(bh);
11609 + mark_buffer_async_write(bh);
11610 + } else {
11611 + /*
11612 + * clear any dirty bits that might have come from
11613 + * getting attached to a dirty folio
11614 + */
11615 + clear_buffer_dirty(bh);
11617 + bh = bh->b_this_page;
11618 + } while (bh != head);
11619 + BUG_ON(folio_test_writeback(folio));
11620 + folio_start_writeback(folio);
11621 + folio_unlock(folio);
11622 + do {
11623 + struct buffer_head *next = bh->b_this_page;
11624 + if (buffer_async_write(bh)) {
11625 + clear_buffer_dirty(bh);
11626 + submit_bh(REQ_OP_WRITE, bh);
11627 + nr++;
11629 + put_bh(bh);
11630 + bh = next;
11631 + } while (bh != head);
11632 + goto done;
11635 +static int reiserfs_read_folio(struct file *f, struct folio *folio)
11637 + return block_read_full_folio(folio, reiserfs_get_block);
11640 +static int reiserfs_writepages(struct address_space *mapping,
11641 + struct writeback_control *wbc)
11643 + reiserfs_wait_on_write_block(mapping->host->i_sb);
11644 + return write_cache_pages(mapping, wbc, reiserfs_write_folio, NULL);
11647 +static void reiserfs_truncate_failed_write(struct inode *inode)
11649 + truncate_inode_pages(inode->i_mapping, inode->i_size);
11650 + reiserfs_truncate_file(inode, 0);
11653 +static int reiserfs_write_begin(struct file *file,
11654 + struct address_space *mapping,
11655 + loff_t pos, unsigned len,
11656 + struct folio **foliop, void **fsdata)
11658 + struct inode *inode;
11659 + struct folio *folio;
11660 + pgoff_t index;
11661 + int ret;
11662 + int old_ref = 0;
11664 + inode = mapping->host;
11665 + index = pos >> PAGE_SHIFT;
11666 + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
11667 + mapping_gfp_mask(mapping));
11668 + if (IS_ERR(folio))
11669 + return PTR_ERR(folio);
11670 + *foliop = folio;
11672 + reiserfs_wait_on_write_block(inode->i_sb);
11673 + fix_tail_page_for_writing(&folio->page);
11674 + if (reiserfs_transaction_running(inode->i_sb)) {
11675 + struct reiserfs_transaction_handle *th;
11676 + th = (struct reiserfs_transaction_handle *)current->
11677 + journal_info;
11678 + BUG_ON(!th->t_refcount);
11679 + BUG_ON(!th->t_trans_id);
11680 + old_ref = th->t_refcount;
11681 + th->t_refcount++;
11683 + ret = __block_write_begin(folio, pos, len, reiserfs_get_block);
11684 + if (ret && reiserfs_transaction_running(inode->i_sb)) {
11685 + struct reiserfs_transaction_handle *th = current->journal_info;
11686 + /*
11687 + * this gets a little ugly. If reiserfs_get_block returned an
11688 + * error and left a transaction running, we've got to close
11689 + * it, and we've got to free the handle if it was a persistent
11690 + * transaction.
11692 + * But, if we had nested into an existing transaction, we need
11693 + * to just drop the ref count on the handle.
11695 + * If old_ref == 0, the transaction is from reiserfs_get_block,
11696 + * and it was a persistent trans. Otherwise, it was nested
11697 + * above.
11698 + */
11699 + if (th->t_refcount > old_ref) {
11700 + if (old_ref)
11701 + th->t_refcount--;
11702 + else {
11703 + int err;
11704 + reiserfs_write_lock(inode->i_sb);
11705 + err = reiserfs_end_persistent_transaction(th);
11706 + reiserfs_write_unlock(inode->i_sb);
11707 + if (err)
11708 + ret = err;
11712 + if (ret) {
11713 + folio_unlock(folio);
11714 + folio_put(folio);
11715 + /* Truncate allocated blocks */
11716 + reiserfs_truncate_failed_write(inode);
11718 + return ret;
11721 +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
11723 + struct inode *inode = page->mapping->host;
11724 + int ret;
11725 + int old_ref = 0;
11726 + int depth;
11728 + depth = reiserfs_write_unlock_nested(inode->i_sb);
11729 + reiserfs_wait_on_write_block(inode->i_sb);
11730 + reiserfs_write_lock_nested(inode->i_sb, depth);
11732 + fix_tail_page_for_writing(page);
11733 + if (reiserfs_transaction_running(inode->i_sb)) {
11734 + struct reiserfs_transaction_handle *th;
11735 + th = (struct reiserfs_transaction_handle *)current->
11736 + journal_info;
11737 + BUG_ON(!th->t_refcount);
11738 + BUG_ON(!th->t_trans_id);
11739 + old_ref = th->t_refcount;
11740 + th->t_refcount++;
11743 + ret = __block_write_begin(page_folio(page), from, len, reiserfs_get_block);
11744 + if (ret && reiserfs_transaction_running(inode->i_sb)) {
11745 + struct reiserfs_transaction_handle *th = current->journal_info;
11746 + /*
11747 + * this gets a little ugly. If reiserfs_get_block returned an
11748 + * error and left a transaction running, we've got to close
11749 + * it, and we've got to free the handle if it was a persistent
11750 + * transaction.
11752 + * But, if we had nested into an existing transaction, we need
11753 + * to just drop the ref count on the handle.
11755 + * If old_ref == 0, the transaction is from reiserfs_get_block,
11756 + * and it was a persistent trans. Otherwise, it was nested
11757 + * above.
11758 + */
11759 + if (th->t_refcount > old_ref) {
11760 + if (old_ref)
11761 + th->t_refcount--;
11762 + else {
11763 + int err;
11764 + reiserfs_write_lock(inode->i_sb);
11765 + err = reiserfs_end_persistent_transaction(th);
11766 + reiserfs_write_unlock(inode->i_sb);
11767 + if (err)
11768 + ret = err;
11772 + return ret;
11776 +static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
11778 + return generic_block_bmap(as, block, reiserfs_bmap);
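+/*
+ * Illustrative only: ->bmap is the hook behind the legacy FIBMAP ioctl.
+ * A privileged userspace query for the file's first logical block might
+ * look roughly like this (fd is hypothetical):
+ *
+ *	int blk = 0;
+ *	if (ioctl(fd, FIBMAP, &blk) == 0)
+ *		printf("physical block %d\n", blk);
+ *
+ * A result of 0 means no mapped block - a hole, or a tail packed into a
+ * direct item, which reiserfs_bmap deliberately does not map.
+ */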
11781 +static int reiserfs_write_end(struct file *file, struct address_space *mapping,
11782 + loff_t pos, unsigned len, unsigned copied,
11783 + struct folio *folio, void *fsdata)
11785 + struct inode *inode = folio->mapping->host;
11786 + int ret = 0;
11787 + int update_sd = 0;
11788 + struct reiserfs_transaction_handle *th;
11789 + unsigned start;
11790 + bool locked = false;
11792 + reiserfs_wait_on_write_block(inode->i_sb);
11793 + if (reiserfs_transaction_running(inode->i_sb))
11794 + th = current->journal_info;
11795 + else
11796 + th = NULL;
11798 + start = pos & (PAGE_SIZE - 1);
11799 + if (unlikely(copied < len)) {
11800 + if (!folio_test_uptodate(folio))
11801 + copied = 0;
11803 + folio_zero_new_buffers(folio, start + copied, start + len);
11805 + flush_dcache_folio(folio);
11807 + reiserfs_commit_page(inode, &folio->page, start, start + copied);
11809 + /*
11810 + * generic_commit_write does this for us, but does not update the
11811 + * transaction tracking stuff when the size changes. So, we have
11812 + * to do the i_size updates here.
11813 + */
11814 + if (pos + copied > inode->i_size) {
11815 + struct reiserfs_transaction_handle myth;
11816 + reiserfs_write_lock(inode->i_sb);
11817 + locked = true;
11818 + /*
11819 + * If the file has grown beyond the boundary where it
11820 + * can have a tail, unmark it as needing tail
11821 + * packing
11822 + */
11823 + if ((have_large_tails(inode->i_sb)
11824 + && inode->i_size > i_block_size(inode) * 4)
11825 + || (have_small_tails(inode->i_sb)
11826 + && inode->i_size > i_block_size(inode)))
11827 + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
11829 + ret = journal_begin(&myth, inode->i_sb, 1);
11830 + if (ret)
11831 + goto journal_error;
11833 + reiserfs_update_inode_transaction(inode);
11834 + inode->i_size = pos + copied;
11835 + /*
11836 + * this will just nest into our transaction. It's important
11837 + * to use mark_inode_dirty so the inode gets pushed around on
11838 + * the dirty lists, and so that O_SYNC works as expected
11839 + */
11840 + mark_inode_dirty(inode);
11841 + reiserfs_update_sd(&myth, inode);
11842 + update_sd = 1;
11843 + ret = journal_end(&myth);
11844 + if (ret)
11845 + goto journal_error;
11847 + if (th) {
11848 + if (!locked) {
11849 + reiserfs_write_lock(inode->i_sb);
11850 + locked = true;
11852 + if (!update_sd)
11853 + mark_inode_dirty(inode);
11854 + ret = reiserfs_end_persistent_transaction(th);
11855 + if (ret)
11856 + goto out;
11859 +out:
11860 + if (locked)
11861 + reiserfs_write_unlock(inode->i_sb);
11862 + folio_unlock(folio);
11863 + folio_put(folio);
11865 + if (pos + len > inode->i_size)
11866 + reiserfs_truncate_failed_write(inode);
11868 + return ret == 0 ? copied : ret;
11870 +journal_error:
11871 + reiserfs_write_unlock(inode->i_sb);
11872 + locked = false;
11873 + if (th) {
11874 + if (!update_sd)
11875 + reiserfs_update_sd(th, inode);
11876 + ret = reiserfs_end_persistent_transaction(th);
11878 + goto out;
11881 +int reiserfs_commit_write(struct file *f, struct page *page,
11882 + unsigned from, unsigned to)
11884 + struct inode *inode = page->mapping->host;
11885 + loff_t pos = ((loff_t) page->index << PAGE_SHIFT) + to;
11886 + int ret = 0;
11887 + int update_sd = 0;
11888 + struct reiserfs_transaction_handle *th = NULL;
11889 + int depth;
11891 + depth = reiserfs_write_unlock_nested(inode->i_sb);
11892 + reiserfs_wait_on_write_block(inode->i_sb);
11893 + reiserfs_write_lock_nested(inode->i_sb, depth);
11895 + if (reiserfs_transaction_running(inode->i_sb)) {
11896 + th = current->journal_info;
11898 + reiserfs_commit_page(inode, page, from, to);
11900 + /*
11901 + * generic_commit_write does this for us, but does not update the
11902 + * transaction tracking stuff when the size changes. So, we have
11903 + * to do the i_size updates here.
11904 + */
11905 + if (pos > inode->i_size) {
11906 + struct reiserfs_transaction_handle myth;
11907 + /*
11908 + * If the file has grown beyond the boundary where it
11909 + * can have a tail, unmark it as needing tail
11910 + * packing
11911 + */
11912 + if ((have_large_tails(inode->i_sb)
11913 + && inode->i_size > i_block_size(inode) * 4)
11914 + || (have_small_tails(inode->i_sb)
11915 + && inode->i_size > i_block_size(inode)))
11916 + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
11918 + ret = journal_begin(&myth, inode->i_sb, 1);
11919 + if (ret)
11920 + goto journal_error;
11922 + reiserfs_update_inode_transaction(inode);
11923 + inode->i_size = pos;
11924 + /*
11925 + * this will just nest into our transaction. It's important
11926 + * to use mark_inode_dirty so the inode gets pushed around
11927 + * on the dirty lists, and so that O_SYNC works as expected
11928 + */
11929 + mark_inode_dirty(inode);
11930 + reiserfs_update_sd(&myth, inode);
11931 + update_sd = 1;
11932 + ret = journal_end(&myth);
11933 + if (ret)
11934 + goto journal_error;
11936 + if (th) {
11937 + if (!update_sd)
11938 + mark_inode_dirty(inode);
11939 + ret = reiserfs_end_persistent_transaction(th);
11940 + if (ret)
11941 + goto out;
11944 +out:
11945 + return ret;
11947 +journal_error:
11948 + if (th) {
11949 + if (!update_sd)
11950 + reiserfs_update_sd(th, inode);
11951 + ret = reiserfs_end_persistent_transaction(th);
11954 + return ret;
11957 +void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
11959 + if (reiserfs_attrs(inode->i_sb)) {
11960 + if (sd_attrs & REISERFS_SYNC_FL)
11961 + inode->i_flags |= S_SYNC;
11962 + else
11963 + inode->i_flags &= ~S_SYNC;
11964 + if (sd_attrs & REISERFS_IMMUTABLE_FL)
11965 + inode->i_flags |= S_IMMUTABLE;
11966 + else
11967 + inode->i_flags &= ~S_IMMUTABLE;
11968 + if (sd_attrs & REISERFS_APPEND_FL)
11969 + inode->i_flags |= S_APPEND;
11970 + else
11971 + inode->i_flags &= ~S_APPEND;
11972 + if (sd_attrs & REISERFS_NOATIME_FL)
11973 + inode->i_flags |= S_NOATIME;
11974 + else
11975 + inode->i_flags &= ~S_NOATIME;
11976 + if (sd_attrs & REISERFS_NOTAIL_FL)
11977 + REISERFS_I(inode)->i_flags |= i_nopack_mask;
11978 + else
11979 + REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
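+/*
+ * For context (sketch, not from the original patch): these REISERFS_*_FL
+ * bits share values with the common FS_*_FL flags, so they arrive here
+ * from the usual chattr(1)/FS_IOC_SETFLAGS path via
+ * reiserfs_fileattr_set(), e.g.
+ *
+ *	chattr +i file	- REISERFS_IMMUTABLE_FL, mapped to S_IMMUTABLE
+ *	chattr +t file	- REISERFS_NOTAIL_FL, mapped to i_nopack_mask
+ */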
11984 + * decide if this buffer needs to stay around for data logging or ordered
11985 + * write purposes
11986 + */
11987 +static int invalidate_folio_can_drop(struct inode *inode, struct buffer_head *bh)
11989 + int ret = 1;
11990 + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
11992 + lock_buffer(bh);
11993 + spin_lock(&j->j_dirty_buffers_lock);
11994 + if (!buffer_mapped(bh)) {
11995 + goto free_jh;
11997 + /*
11998 + * the page is locked, and the only places that log a data buffer
11999 + * also lock the page.
12000 + */
12001 + if (reiserfs_file_data_log(inode)) {
12002 + /*
12003 + * very conservative, leave the buffer pinned if
12004 + * anyone might need it.
12005 + */
12006 + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
12007 + ret = 0;
12009 + } else if (buffer_dirty(bh)) {
12010 + struct reiserfs_journal_list *jl;
12011 + struct reiserfs_jh *jh = bh->b_private;
12013 + /*
12014 + * why is this safe?
12015 + * reiserfs_setattr updates i_size in the on-disk
12016 + * stat data before allowing vmtruncate to be called.
12018 + * If the buffer was put onto the ordered list for this
12019 + * transaction, we know for sure either this transaction
12020 + * or an older one already has updated i_size on disk,
12021 + * and this ordered data won't be referenced in the file
12022 + * if we crash.
12024 + * if the buffer was put onto the ordered list for an older
12025 + * transaction, we need to leave it around
12026 + */
12027 + if (jh && (jl = jh->jl)
12028 + && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
12029 + ret = 0;
12031 +free_jh:
12032 + if (ret && bh->b_private) {
12033 + reiserfs_free_jh(bh);
12035 + spin_unlock(&j->j_dirty_buffers_lock);
12036 + unlock_buffer(bh);
12037 + return ret;
12040 +/* clm -- taken from fs/buffer.c:block_invalidate_folio */
12041 +static void reiserfs_invalidate_folio(struct folio *folio, size_t offset,
12042 + size_t length)
12044 + struct buffer_head *head, *bh, *next;
12045 + struct inode *inode = folio->mapping->host;
12046 + unsigned int curr_off = 0;
12047 + unsigned int stop = offset + length;
12048 + int partial_page = (offset || length < folio_size(folio));
12049 + int ret = 1;
12051 + BUG_ON(!folio_test_locked(folio));
12053 + if (!partial_page)
12054 + folio_clear_checked(folio);
12056 + head = folio_buffers(folio);
12057 + if (!head)
12058 + goto out;
12060 + bh = head;
12061 + do {
12062 + unsigned int next_off = curr_off + bh->b_size;
12063 + next = bh->b_this_page;
12065 + if (next_off > stop)
12066 + goto out;
12068 + /*
12069 + * is this block fully invalidated?
12070 + */
12071 + if (offset <= curr_off) {
12072 + if (invalidate_folio_can_drop(inode, bh))
12073 + reiserfs_unmap_buffer(bh);
12074 + else
12075 + ret = 0;
12077 + curr_off = next_off;
12078 + bh = next;
12079 + } while (bh != head);
12081 + /*
12082 + * We release buffers only if the entire page is being invalidated.
12083 + * The get_block cached value has been unconditionally invalidated,
12084 + * so real IO is not possible anymore.
12085 + */
12086 + if (!partial_page && ret) {
12087 + ret = filemap_release_folio(folio, 0);
12088 + /* maybe should BUG_ON(!ret); - neilb */
12090 +out:
12091 + return;
12094 +static bool reiserfs_dirty_folio(struct address_space *mapping,
12095 + struct folio *folio)
12097 + if (reiserfs_file_data_log(mapping->host)) {
12098 + folio_set_checked(folio);
12099 + return filemap_dirty_folio(mapping, folio);
12101 + return block_dirty_folio(mapping, folio);
12105 + * Returns true if the folio's buffers were dropped. The folio is locked.
12107 + * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
12108 + * in the buffers at folio_buffers(folio).
12110 + * even in -o notail mode, we can't be sure an old mount without -o notail
12111 + * didn't create files with tails.
12112 + */
12113 +static bool reiserfs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
12115 + struct inode *inode = folio->mapping->host;
12116 + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
12117 + struct buffer_head *head;
12118 + struct buffer_head *bh;
12119 + bool ret = true;
12121 + WARN_ON(folio_test_checked(folio));
12122 + spin_lock(&j->j_dirty_buffers_lock);
12123 + head = folio_buffers(folio);
12124 + bh = head;
12125 + do {
12126 + if (bh->b_private) {
12127 + if (!buffer_dirty(bh) && !buffer_locked(bh)) {
12128 + reiserfs_free_jh(bh);
12129 + } else {
12130 + ret = false;
12131 + break;
12134 + bh = bh->b_this_page;
12135 + } while (bh != head);
12136 + if (ret)
12137 + ret = try_to_free_buffers(folio);
12138 + spin_unlock(&j->j_dirty_buffers_lock);
12139 + return ret;
12143 + * We thank Mingming Cao for helping us understand in great detail what
12144 + * to do in this section of the code.
12145 + */
12146 +static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
12148 + struct file *file = iocb->ki_filp;
12149 + struct inode *inode = file->f_mapping->host;
12150 + size_t count = iov_iter_count(iter);
12151 + ssize_t ret;
12153 + ret = blockdev_direct_IO(iocb, inode, iter,
12154 + reiserfs_get_blocks_direct_io);
12156 + /*
12157 + * In case of error, an extending write may have instantiated a few
12158 + * blocks outside i_size. Trim these off again.
12159 + */
12160 + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
12161 + loff_t isize = i_size_read(inode);
12162 + loff_t end = iocb->ki_pos + count;
12164 + if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
12165 + truncate_setsize(inode, isize);
12166 + reiserfs_vfs_truncate_file(inode);
12170 + return ret;
12173 +int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
12174 + struct iattr *attr)
12176 + struct inode *inode = d_inode(dentry);
12177 + unsigned int ia_valid;
12178 + int error;
12180 + error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
12181 + if (error)
12182 + return error;
12184 + /* must be turned off for recursive notify_change calls */
12185 + ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
12187 + if (is_quota_modification(&nop_mnt_idmap, inode, attr)) {
12188 + error = dquot_initialize(inode);
12189 + if (error)
12190 + return error;
12192 + reiserfs_write_lock(inode->i_sb);
12193 + if (attr->ia_valid & ATTR_SIZE) {
12194 + /*
12195 + * version 2 items will be caught by the s_maxbytes check
12196 + * done for us in vmtruncate
12197 + */
12198 + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
12199 + attr->ia_size > MAX_NON_LFS) {
12200 + reiserfs_write_unlock(inode->i_sb);
12201 + error = -EFBIG;
12202 + goto out;
12205 + inode_dio_wait(inode);
12207 + /* fill in hole pointers in the expanding truncate case. */
12208 + if (attr->ia_size > inode->i_size) {
12209 + loff_t pos = attr->ia_size;
12211 + if ((pos & (inode->i_sb->s_blocksize - 1)) == 0)
12212 + pos++;
12213 + error = generic_cont_expand_simple(inode, pos);
12214 + if (REISERFS_I(inode)->i_prealloc_count > 0) {
12215 + int err;
12216 + struct reiserfs_transaction_handle th;
12217 + /* we're changing at most 2 bitmaps, inode + super */
12218 + err = journal_begin(&th, inode->i_sb, 4);
12219 + if (!err) {
12220 + reiserfs_discard_prealloc(&th, inode);
12221 + err = journal_end(&th);
12223 + if (err)
12224 + error = err;
12226 + if (error) {
12227 + reiserfs_write_unlock(inode->i_sb);
12228 + goto out;
12230 + /*
12231 + * file size is changed, ctime and mtime are
12232 + * to be updated
12233 + */
12234 + attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
12237 + reiserfs_write_unlock(inode->i_sb);
12239 + if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) ||
12240 + ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) &&
12241 + (get_inode_sd_version(inode) == STAT_DATA_V1)) {
12242 + /* stat data of format v3.5 has 16 bit uid and gid */
12243 + error = -EINVAL;
12244 + goto out;
12247 + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
12248 + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
12249 + struct reiserfs_transaction_handle th;
12250 + int jbegin_count =
12251 + 2 *
12252 + (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
12253 + REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
12254 + 2;
12256 + error = reiserfs_chown_xattrs(inode, attr);
12258 + if (error)
12259 + return error;
12261 + /*
12262 + * (user+group)*(old+new) structure - we count quota
12263 + * info and inode write (sb, inode)
12264 + */
12265 + reiserfs_write_lock(inode->i_sb);
12266 + error = journal_begin(&th, inode->i_sb, jbegin_count);
12267 + reiserfs_write_unlock(inode->i_sb);
12268 + if (error)
12269 + goto out;
12270 + error = dquot_transfer(&nop_mnt_idmap, inode, attr);
12271 + reiserfs_write_lock(inode->i_sb);
12272 + if (error) {
12273 + journal_end(&th);
12274 + reiserfs_write_unlock(inode->i_sb);
12275 + goto out;
12278 + /*
12279 + * Update corresponding info in inode so that everything
12280 + * is in one transaction
12281 + */
12282 + if (attr->ia_valid & ATTR_UID)
12283 + inode->i_uid = attr->ia_uid;
12284 + if (attr->ia_valid & ATTR_GID)
12285 + inode->i_gid = attr->ia_gid;
12286 + mark_inode_dirty(inode);
12287 + error = journal_end(&th);
12288 + reiserfs_write_unlock(inode->i_sb);
12289 + if (error)
12290 + goto out;
12293 + if ((attr->ia_valid & ATTR_SIZE) &&
12294 + attr->ia_size != i_size_read(inode)) {
12295 + error = inode_newsize_ok(inode, attr->ia_size);
12296 + if (!error) {
12297 + /*
12298 + * Could race against reiserfs_file_release
12299 + * if called from NFS, so take tailpack mutex.
12300 + */
12301 + mutex_lock(&REISERFS_I(inode)->tailpack);
12302 + truncate_setsize(inode, attr->ia_size);
12303 + reiserfs_truncate_file(inode, 1);
12304 + mutex_unlock(&REISERFS_I(inode)->tailpack);
12308 + if (!error) {
12309 + setattr_copy(&nop_mnt_idmap, inode, attr);
12310 + mark_inode_dirty(inode);
12313 + if (!error && reiserfs_posixacl(inode->i_sb)) {
12314 + if (attr->ia_valid & ATTR_MODE)
12315 + error = reiserfs_acl_chmod(dentry);
12318 +out:
12319 + return error;
12322 +const struct address_space_operations reiserfs_address_space_operations = {
12323 + .writepages = reiserfs_writepages,
12324 + .read_folio = reiserfs_read_folio,
12325 + .readahead = reiserfs_readahead,
12326 + .release_folio = reiserfs_release_folio,
12327 + .invalidate_folio = reiserfs_invalidate_folio,
12328 + .write_begin = reiserfs_write_begin,
12329 + .write_end = reiserfs_write_end,
12330 + .bmap = reiserfs_aop_bmap,
12331 + .direct_IO = reiserfs_direct_IO,
12332 + .dirty_folio = reiserfs_dirty_folio,
12333 + .migrate_folio = buffer_migrate_folio,
12335 diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
12336 new file mode 100644
12337 index 000000000000..dd33f8cc6eda
12338 --- /dev/null
12339 +++ b/fs/reiserfs/ioctl.c
12340 @@ -0,0 +1,221 @@
12342 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
12343 + */
12345 +#include <linux/capability.h>
12346 +#include <linux/fs.h>
12347 +#include <linux/mount.h>
12348 +#include "reiserfs.h"
12349 +#include <linux/time.h>
12350 +#include <linux/uaccess.h>
12351 +#include <linux/pagemap.h>
12352 +#include <linux/compat.h>
12353 +#include <linux/fileattr.h>
12355 +int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
12357 + struct inode *inode = d_inode(dentry);
12359 + if (!reiserfs_attrs(inode->i_sb))
12360 + return -ENOTTY;
12362 + fileattr_fill_flags(fa, REISERFS_I(inode)->i_attrs);
12364 + return 0;
12367 +int reiserfs_fileattr_set(struct mnt_idmap *idmap,
12368 + struct dentry *dentry, struct fileattr *fa)
12370 + struct inode *inode = d_inode(dentry);
12371 + unsigned int flags = fa->flags;
12372 + int err;
12374 + reiserfs_write_lock(inode->i_sb);
12376 + err = -ENOTTY;
12377 + if (!reiserfs_attrs(inode->i_sb))
12378 + goto unlock;
12380 + err = -EOPNOTSUPP;
12381 + if (fileattr_has_fsx(fa))
12382 + goto unlock;
12384 + /*
12385 + * Is it a quota file? Do not allow the user to mess with it
12386 + */
12387 + err = -EPERM;
12388 + if (IS_NOQUOTA(inode))
12389 + goto unlock;
12391 + if ((flags & REISERFS_NOTAIL_FL) && S_ISREG(inode->i_mode)) {
12392 + err = reiserfs_unpack(inode);
12393 + if (err)
12394 + goto unlock;
12396 + sd_attrs_to_i_attrs(flags, inode);
12397 + REISERFS_I(inode)->i_attrs = flags;
12398 + inode_set_ctime_current(inode);
12399 + mark_inode_dirty(inode);
12400 + err = 0;
12401 +unlock:
12402 + reiserfs_write_unlock(inode->i_sb);
12404 + return err;
12408 + * reiserfs_ioctl - handler for ioctl for inode
12409 + * supported commands:
12410 + * 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
12411 + * and prevent packing the file (argument arg has to
12412 + * be non-zero)
12413 + * 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
12414 + * 3) That's all for a while ...
12415 + */
12416 +long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
12418 + struct inode *inode = file_inode(filp);
12419 + int err = 0;
12421 + reiserfs_write_lock(inode->i_sb);
12423 + switch (cmd) {
12424 + case REISERFS_IOC_UNPACK:
12425 + if (S_ISREG(inode->i_mode)) {
12426 + if (arg)
12427 + err = reiserfs_unpack(inode);
12428 + } else
12429 + err = -ENOTTY;
12430 + break;
12431 + /*
12432 + * following two cases are taken from fs/ext2/ioctl.c by Remy
12433 + * Card (card@masi.ibp.fr)
12434 + */
12435 + case REISERFS_IOC_GETVERSION:
12436 + err = put_user(inode->i_generation, (int __user *)arg);
12437 + break;
12438 + case REISERFS_IOC_SETVERSION:
12439 + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) {
12440 + err = -EPERM;
12441 + break;
12443 + err = mnt_want_write_file(filp);
12444 + if (err)
12445 + break;
12446 + if (get_user(inode->i_generation, (int __user *)arg)) {
12447 + err = -EFAULT;
12448 + goto setversion_out;
12450 + inode_set_ctime_current(inode);
12451 + mark_inode_dirty(inode);
12452 +setversion_out:
12453 + mnt_drop_write_file(filp);
12454 + break;
12455 + default:
12456 + err = -ENOTTY;
12459 + reiserfs_write_unlock(inode->i_sb);
12461 + return err;
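+/*
+ * Illustrative userspace usage (hypothetical fd, error handling elided),
+ * assuming the REISERFS_IOC_* definitions from the uapi header:
+ *
+ *	int gen;
+ *	ioctl(fd, REISERFS_IOC_GETVERSION, &gen);	- read i_generation
+ *	gen++;
+ *	ioctl(fd, REISERFS_IOC_SETVERSION, &gen);	- owner or
+ *							  CAP_FOWNER only
+ */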
12464 +#ifdef CONFIG_COMPAT
12465 +long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
12466 + unsigned long arg)
12468 + /*
12469 + * These are just misnamed, they actually
12470 + * get/put an int from/to user space
12471 + */
12472 + switch (cmd) {
12473 + case REISERFS_IOC32_UNPACK:
12474 + cmd = REISERFS_IOC_UNPACK;
12475 + break;
12476 + case REISERFS_IOC32_GETVERSION:
12477 + cmd = REISERFS_IOC_GETVERSION;
12478 + break;
12479 + case REISERFS_IOC32_SETVERSION:
12480 + cmd = REISERFS_IOC_SETVERSION;
12481 + break;
12482 + default:
12483 + return -ENOIOCTLCMD;
12486 + return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
12488 +#endif
12490 +int reiserfs_commit_write(struct file *f, struct page *page,
12491 + unsigned from, unsigned to);
12493 + * reiserfs_unpack
12494 + * Tries to convert the file tail from a direct item into an indirect one.
12495 + * It sets the nopack attribute in REISERFS_I(inode)->i_flags
12496 + */
12497 +int reiserfs_unpack(struct inode *inode)
12499 + int retval = 0;
12500 + int index;
12501 + struct page *page;
12502 + struct address_space *mapping;
12503 + unsigned long write_from;
12504 + unsigned long blocksize = inode->i_sb->s_blocksize;
12506 + if (inode->i_size == 0) {
12507 + REISERFS_I(inode)->i_flags |= i_nopack_mask;
12508 + return 0;
12510 + /* ioctl already done */
12511 + if (REISERFS_I(inode)->i_flags & i_nopack_mask) {
12512 + return 0;
12515 + /* we need to make sure nobody is changing the file size beneath us */
12517 + int depth = reiserfs_write_unlock_nested(inode->i_sb);
12519 + inode_lock(inode);
12520 + reiserfs_write_lock_nested(inode->i_sb, depth);
12523 + reiserfs_write_lock(inode->i_sb);
12525 + write_from = inode->i_size & (blocksize - 1);
12526 + /* if we are on a block boundary, we are already unpacked. */
12527 + if (write_from == 0) {
12528 + REISERFS_I(inode)->i_flags |= i_nopack_mask;
12529 + goto out;
12532 + /*
12533 + * we unpack by finding the page with the tail, and calling
12534 + * __reiserfs_write_begin on that page. This will force a
12535 + * reiserfs_get_block to unpack the tail for us.
12536 + */
12537 + index = inode->i_size >> PAGE_SHIFT;
12538 + mapping = inode->i_mapping;
12539 + page = grab_cache_page(mapping, index);
12540 + retval = -ENOMEM;
12541 + if (!page) {
12542 + goto out;
12544 + retval = __reiserfs_write_begin(page, write_from, 0);
12545 + if (retval)
12546 + goto out_unlock;
12548 + /* conversion can change page contents, must flush */
12549 + flush_dcache_page(page);
12550 + retval = reiserfs_commit_write(NULL, page, write_from, write_from);
12551 + REISERFS_I(inode)->i_flags |= i_nopack_mask;
12553 +out_unlock:
12554 + unlock_page(page);
12555 + put_page(page);
12557 +out:
12558 + inode_unlock(inode);
12559 + reiserfs_write_unlock(inode->i_sb);
12560 + return retval;
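+/*
+ * Sketch of the corresponding userspace call (path hypothetical, error
+ * handling elided); a non-zero argument both unpacks the tail and sets
+ * the nopack flag, as described above:
+ *
+ *	int fd = open("/mnt/data/file", O_RDONLY);
+ *	ioctl(fd, REISERFS_IOC_UNPACK, 1);
+ */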
12562 diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
12563 new file mode 100644
12564 index 000000000000..5011c10287c6
12565 --- /dev/null
12566 +++ b/fs/reiserfs/item_ops.c
12567 @@ -0,0 +1,737 @@
12569 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
12570 + */
12572 +#include <linux/time.h>
12573 +#include "reiserfs.h"
12576 + * this contains item handlers for old item types: sd, direct,
12577 + * indirect, directory
12578 + */
12581 + * and where are the comments? how about saying where we can find an
12582 + * explanation of each item handler method? -Hans
12583 + */
12585 +/* stat data functions */
12586 +static int sd_bytes_number(struct item_head *ih, int block_size)
12588 + return 0;
12591 +static void sd_decrement_key(struct cpu_key *key)
12593 + key->on_disk_key.k_objectid--;
12594 + set_cpu_key_k_type(key, TYPE_ANY);
12595 + set_cpu_key_k_offset(key, (loff_t)(~0ULL >> 1));
12598 +static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize)
12600 + return 0;
12603 +static void sd_print_item(struct item_head *ih, char *item)
12605 + printk("\tmode | size | nlinks | first direct | mtime\n");
12606 + if (stat_data_v1(ih)) {
12607 + struct stat_data_v1 *sd = (struct stat_data_v1 *)item;
12609 + printk("\t0%-6o | %6u | %2u | %d | %u\n", sd_v1_mode(sd),
12610 + sd_v1_size(sd), sd_v1_nlink(sd),
12611 + sd_v1_first_direct_byte(sd),
12612 + sd_v1_mtime(sd));
12613 + } else {
12614 + struct stat_data *sd = (struct stat_data *)item;
12616 + printk("\t0%-6o | %6llu | %2u | %d | %u\n", sd_v2_mode(sd),
12617 + (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
12618 + sd_v2_rdev(sd), sd_v2_mtime(sd));
12622 +static void sd_check_item(struct item_head *ih, char *item)
12624 + /* unused */
12627 +static int sd_create_vi(struct virtual_node *vn,
12628 + struct virtual_item *vi,
12629 + int is_affected, int insert_size)
12631 + vi->vi_index = TYPE_STAT_DATA;
12632 + return 0;
12635 +static int sd_check_left(struct virtual_item *vi, int free,
12636 + int start_skip, int end_skip)
12638 + BUG_ON(start_skip || end_skip);
12639 + return -1;
12642 +static int sd_check_right(struct virtual_item *vi, int free)
12644 + return -1;
12647 +static int sd_part_size(struct virtual_item *vi, int first, int count)
12649 + BUG_ON(count);
12650 + return 0;
12653 +static int sd_unit_num(struct virtual_item *vi)
12655 + return vi->vi_item_len - IH_SIZE;
12658 +static void sd_print_vi(struct virtual_item *vi)
12660 + reiserfs_warning(NULL, "reiserfs-16100",
12661 + "STATDATA, index %d, type 0x%x, %h",
12662 + vi->vi_index, vi->vi_type, vi->vi_ih);
12665 +static struct item_operations stat_data_ops = {
12666 + .bytes_number = sd_bytes_number,
12667 + .decrement_key = sd_decrement_key,
12668 + .is_left_mergeable = sd_is_left_mergeable,
12669 + .print_item = sd_print_item,
12670 + .check_item = sd_check_item,
12672 + .create_vi = sd_create_vi,
12673 + .check_left = sd_check_left,
12674 + .check_right = sd_check_right,
12675 + .part_size = sd_part_size,
12676 + .unit_num = sd_unit_num,
12677 + .print_vi = sd_print_vi
12680 +/* direct item functions */
12681 +static int direct_bytes_number(struct item_head *ih, int block_size)
12683 + return ih_item_len(ih);
12686 +/* FIXME: this should probably switch to indirect as well */
12687 +static void direct_decrement_key(struct cpu_key *key)
12689 + cpu_key_k_offset_dec(key);
12690 + if (cpu_key_k_offset(key) == 0)
12691 + set_cpu_key_k_type(key, TYPE_STAT_DATA);
12694 +static int direct_is_left_mergeable(struct reiserfs_key *key,
12695 + unsigned long bsize)
12697 + int version = le_key_version(key);
12698 + return ((le_key_k_offset(version, key) & (bsize - 1)) != 1);
12701 +static void direct_print_item(struct item_head *ih, char *item)
12703 + int j = 0;
12705 +/* return; */
12706 + printk("\"");
12707 + while (j < ih_item_len(ih))
12708 + printk("%c", item[j++]);
12709 + printk("\"\n");
12712 +static void direct_check_item(struct item_head *ih, char *item)
12714 + /* unused */
12717 +static int direct_create_vi(struct virtual_node *vn,
12718 + struct virtual_item *vi,
12719 + int is_affected, int insert_size)
12721 + vi->vi_index = TYPE_DIRECT;
12722 + return 0;
12725 +static int direct_check_left(struct virtual_item *vi, int free,
12726 + int start_skip, int end_skip)
12728 + int bytes;
12730 + bytes = free - free % 8;
12731 + return bytes ? : -1;
12734 +static int direct_check_right(struct virtual_item *vi, int free)
12736 + return direct_check_left(vi, free, 0, 0);
12739 +static int direct_part_size(struct virtual_item *vi, int first, int count)
12741 + return count;
12744 +static int direct_unit_num(struct virtual_item *vi)
12746 + return vi->vi_item_len - IH_SIZE;
12749 +static void direct_print_vi(struct virtual_item *vi)
12751 + reiserfs_warning(NULL, "reiserfs-16101",
12752 + "DIRECT, index %d, type 0x%x, %h",
12753 + vi->vi_index, vi->vi_type, vi->vi_ih);
12756 +static struct item_operations direct_ops = {
12757 + .bytes_number = direct_bytes_number,
12758 + .decrement_key = direct_decrement_key,
12759 + .is_left_mergeable = direct_is_left_mergeable,
12760 + .print_item = direct_print_item,
12761 + .check_item = direct_check_item,
12763 + .create_vi = direct_create_vi,
12764 + .check_left = direct_check_left,
12765 + .check_right = direct_check_right,
12766 + .part_size = direct_part_size,
12767 + .unit_num = direct_unit_num,
12768 + .print_vi = direct_print_vi
12771 +/* indirect item functions */
12772 +static int indirect_bytes_number(struct item_head *ih, int block_size)
12774 + return ih_item_len(ih) / UNFM_P_SIZE * block_size;
12777 +/* decrease offset, if it becomes 0, change type to stat data */
12778 +static void indirect_decrement_key(struct cpu_key *key)
12780 + cpu_key_k_offset_dec(key);
12781 + if (cpu_key_k_offset(key) == 0)
12782 + set_cpu_key_k_type(key, TYPE_STAT_DATA);
12785 +/* if it is not first item of the body, then it is mergeable */
12786 +static int indirect_is_left_mergeable(struct reiserfs_key *key,
12787 + unsigned long bsize)
12789 + int version = le_key_version(key);
12790 + return (le_key_k_offset(version, key) != 1);
12793 +/* printing of indirect item */
12794 +static void start_new_sequence(__u32 * start, int *len, __u32 new)
12796 + *start = new;
12797 + *len = 1;
12800 +static int sequence_finished(__u32 start, int *len, __u32 new)
12802 + if (start == INT_MAX)
12803 + return 1;
12805 + if (start == 0 && new == 0) {
12806 + (*len)++;
12807 + return 0;
12809 + if (start != 0 && (start + *len) == new) {
12810 + (*len)++;
12811 + return 0;
12813 + return 1;
12816 +static void print_sequence(__u32 start, int len)
12818 + if (start == INT_MAX)
12819 + return;
12821 + if (len == 1)
12822 + printk(" %d", start);
12823 + else
12824 + printk(" %d(%d)", start, len);
12827 +static void indirect_print_item(struct item_head *ih, char *item)
12829 + int j;
12830 + __le32 *unp;
12831 + __u32 prev = INT_MAX;
12832 + int num = 0;
12834 + unp = (__le32 *) item;
12836 + if (ih_item_len(ih) % UNFM_P_SIZE)
12837 + reiserfs_warning(NULL, "reiserfs-16102", "invalid item len");
12839 + printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih));
12840 + for (j = 0; j < I_UNFM_NUM(ih); j++) {
12841 + if (sequence_finished(prev, &num, get_block_num(unp, j))) {
12842 + print_sequence(prev, num);
12843 + start_new_sequence(&prev, &num, get_block_num(unp, j));
12846 + print_sequence(prev, num);
12847 + printk("]\n");
12850 +static void indirect_check_item(struct item_head *ih, char *item)
12852 + /* unused */
12855 +static int indirect_create_vi(struct virtual_node *vn,
12856 + struct virtual_item *vi,
12857 + int is_affected, int insert_size)
12859 + vi->vi_index = TYPE_INDIRECT;
12860 + return 0;
12863 +static int indirect_check_left(struct virtual_item *vi, int free,
12864 + int start_skip, int end_skip)
12866 + int bytes;
12868 + bytes = free - free % UNFM_P_SIZE;
12869 + return bytes ? : -1;
12872 +static int indirect_check_right(struct virtual_item *vi, int free)
12874 + return indirect_check_left(vi, free, 0, 0);
12878 + * return size in bytes of 'units' units. If first == 0 - calculate
12879 + * from the head (left), otherwise - from tail (right)
12880 + */
12881 +static int indirect_part_size(struct virtual_item *vi, int first, int units)
12883 + /* unit of indirect item is byte (yet) */
12884 + return units;
12887 +static int indirect_unit_num(struct virtual_item *vi)
12889 + /* unit of indirect item is byte (yet) */
12890 + return vi->vi_item_len - IH_SIZE;
12893 +static void indirect_print_vi(struct virtual_item *vi)
12895 + reiserfs_warning(NULL, "reiserfs-16103",
12896 + "INDIRECT, index %d, type 0x%x, %h",
12897 + vi->vi_index, vi->vi_type, vi->vi_ih);
12900 +static struct item_operations indirect_ops = {
12901 + .bytes_number = indirect_bytes_number,
12902 + .decrement_key = indirect_decrement_key,
12903 + .is_left_mergeable = indirect_is_left_mergeable,
12904 + .print_item = indirect_print_item,
12905 + .check_item = indirect_check_item,
12907 + .create_vi = indirect_create_vi,
12908 + .check_left = indirect_check_left,
12909 + .check_right = indirect_check_right,
12910 + .part_size = indirect_part_size,
12911 + .unit_num = indirect_unit_num,
12912 + .print_vi = indirect_print_vi
12915 +/* direntry functions */
12916 +static int direntry_bytes_number(struct item_head *ih, int block_size)
12918 + reiserfs_warning(NULL, "vs-16090",
12919 + "bytes number is asked for direntry");
12920 + return 0;
12923 +static void direntry_decrement_key(struct cpu_key *key)
12925 + cpu_key_k_offset_dec(key);
12926 + if (cpu_key_k_offset(key) == 0)
12927 + set_cpu_key_k_type(key, TYPE_STAT_DATA);
12930 +static int direntry_is_left_mergeable(struct reiserfs_key *key,
12931 + unsigned long bsize)
12933 + if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET)
12934 + return 0;
12935 + return 1;
12939 +static void direntry_print_item(struct item_head *ih, char *item)
12941 + int i;
12942 + int namelen;
12943 + struct reiserfs_de_head *deh;
12944 + char *name;
12945 + static char namebuf[80];
12947 + printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name",
12948 + "Key of pointed object", "Hash", "Gen number", "Status");
12950 + deh = (struct reiserfs_de_head *)item;
12952 + for (i = 0; i < ih_entry_count(ih); i++, deh++) {
12953 + namelen =
12954 + (i ? (deh_location(deh - 1)) : ih_item_len(ih)) -
12955 + deh_location(deh);
12956 + name = item + deh_location(deh);
12957 + if (name[namelen - 1] == 0)
12958 + namelen = strlen(name);
12960 + scnprintf(namebuf, sizeof(namebuf), "\"%.*s\"",
12961 + (int)sizeof(namebuf)-3, name);
12963 + printk("%d: %-15s%-15d%-15d%-15lld%-15lld(%s)\n",
12964 + i, namebuf,
12965 + deh_dir_id(deh), deh_objectid(deh),
12966 + GET_HASH_VALUE(deh_offset(deh)),
12967 + GET_GENERATION_NUMBER((deh_offset(deh))),
12968 + (de_hidden(deh)) ? "HIDDEN" : "VISIBLE");
12972 +static void direntry_check_item(struct item_head *ih, char *item)
12974 + int i;
12975 + struct reiserfs_de_head *deh;
12977 + /* unused */
12978 + deh = (struct reiserfs_de_head *)item;
12979 + for (i = 0; i < ih_entry_count(ih); i++, deh++) {
12984 +#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1
12987 + * function returns old entry number in directory item in real node
12988 + * using new entry number in virtual item in virtual node
12989 + */
12990 +static inline int old_entry_num(int is_affected, int virtual_entry_num,
12991 + int pos_in_item, int mode)
12993 + if (mode == M_INSERT || mode == M_DELETE)
12994 + return virtual_entry_num;
12996 + if (!is_affected)
12997 + /* cut or paste is applied to another item */
12998 + return virtual_entry_num;
13000 + if (virtual_entry_num < pos_in_item)
13001 + return virtual_entry_num;
13003 + if (mode == M_CUT)
13004 + return virtual_entry_num + 1;
13006 + RFALSE(mode != M_PASTE || virtual_entry_num == 0,
13007 + "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'",
13008 + mode);
13010 + return virtual_entry_num - 1;
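+/*
+ * Example: with M_CUT at pos_in_item 2 in the affected item, virtual
+ * entries 0 and 1 map to real entries 0 and 1, while virtual entry 2
+ * maps to real entry 3 (real entry 2 is the one being cut). With
+ * M_PASTE the shift goes the other way: virtual entries past the paste
+ * point map one lower, since the pasted entry has no real counterpart
+ * yet.
+ */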
13014 + * Create an array of sizes of directory entries for virtual
13015 + * item. Return space used by an item. FIXME: no control over
13016 + * consuming of space used by this item handler
13017 + */
13018 +static int direntry_create_vi(struct virtual_node *vn,
13019 + struct virtual_item *vi,
13020 + int is_affected, int insert_size)
13022 + struct direntry_uarea *dir_u = vi->vi_uarea;
13023 + int i, j;
13024 + int size = sizeof(struct direntry_uarea);
13025 + struct reiserfs_de_head *deh;
13027 + vi->vi_index = TYPE_DIRENTRY;
13029 + BUG_ON(!(vi->vi_ih) || !vi->vi_item);
13031 + dir_u->flags = 0;
13032 + if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET)
13033 + dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM;
13035 + deh = (struct reiserfs_de_head *)(vi->vi_item);
13037 + /* the virtual directory item will have this many entries after the operation */
13038 + dir_u->entry_count = ih_entry_count(vi->vi_ih) +
13039 + ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 :
13040 + (vn->vn_mode == M_PASTE ? 1 : 0)) : 0);
13042 + for (i = 0; i < dir_u->entry_count; i++) {
13043 + j = old_entry_num(is_affected, i, vn->vn_pos_in_item,
13044 + vn->vn_mode);
13045 + dir_u->entry_sizes[i] =
13046 + (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) -
13047 + deh_location(&deh[j]) + DEH_SIZE;
13050 + size += (dir_u->entry_count * sizeof(short));
13052 + /* set size of pasted entry */
13053 + if (is_affected && vn->vn_mode == M_PASTE)
13054 + dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size;
13056 +#ifdef CONFIG_REISERFS_CHECK
13057 + /* compare total size of entries with item length */
13059 + int k, l;
13061 + l = 0;
13062 + for (k = 0; k < dir_u->entry_count; k++)
13063 + l += dir_u->entry_sizes[k];
13065 + if (l + IH_SIZE != vi->vi_item_len +
13066 + ((is_affected
13067 + && (vn->vn_mode == M_PASTE
13068 + || vn->vn_mode == M_CUT)) ? insert_size : 0)) {
13069 + reiserfs_panic(NULL, "vs-8025", "(mode==%c, "
13070 + "insert_size==%d), invalid length of "
13071 + "directory item",
13072 + vn->vn_mode, insert_size);
13075 +#endif
13077 + return size;
13082 + * return number of entries which may fit into specified amount of
13083 + * free space, or -1 if free space is not enough even for 1 entry
13084 + */
13085 +static int direntry_check_left(struct virtual_item *vi, int free,
13086 + int start_skip, int end_skip)
13088 + int i;
13089 + int entries = 0;
13090 + struct direntry_uarea *dir_u = vi->vi_uarea;
13092 + for (i = start_skip; i < dir_u->entry_count - end_skip; i++) {
13093 + /* i-th entry doesn't fit into the remaining free space */
13094 + if (dir_u->entry_sizes[i] > free)
13095 + break;
13097 + free -= dir_u->entry_sizes[i];
13098 + entries++;
13101 + if (entries == dir_u->entry_count) {
13102 + reiserfs_panic(NULL, "item_ops-1",
13103 + "free space %d, entry_count %d", free,
13104 + dir_u->entry_count);
13107 + /* "." and ".." can not be separated from each other */
13108 + if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
13109 + && entries < 2)
13110 + entries = 0;
13112 + return entries ? : -1;
13115 +static int direntry_check_right(struct virtual_item *vi, int free)
13117 + int i;
13118 + int entries = 0;
13119 + struct direntry_uarea *dir_u = vi->vi_uarea;
13121 + for (i = dir_u->entry_count - 1; i >= 0; i--) {
13122 + /* i-th entry doesn't fit into the remaining free space */
13123 + if (dir_u->entry_sizes[i] > free)
13124 + break;
13126 + free -= dir_u->entry_sizes[i];
13127 + entries++;
13129 + BUG_ON(entries == dir_u->entry_count);
13131 + /* "." and ".." can not be separated from each other */
13132 + if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
13133 + && entries > dir_u->entry_count - 2)
13134 + entries = dir_u->entry_count - 2;
13136 + return entries ? : -1;
13139 +/* sum of entry sizes between from-th and to-th entries including both edges */
13140 +static int direntry_part_size(struct virtual_item *vi, int first, int count)
13142 + int i, retval;
13143 + int from, to;
13144 + struct direntry_uarea *dir_u = vi->vi_uarea;
13146 + retval = 0;
13147 + if (first == 0)
13148 + from = 0;
13149 + else
13150 + from = dir_u->entry_count - count;
13151 + to = from + count - 1;
13153 + for (i = from; i <= to; i++)
13154 + retval += dir_u->entry_sizes[i];
13156 + return retval;
13159 +static int direntry_unit_num(struct virtual_item *vi)
13161 + struct direntry_uarea *dir_u = vi->vi_uarea;
13163 + return dir_u->entry_count;
13166 +static void direntry_print_vi(struct virtual_item *vi)
13168 + int i;
13169 + struct direntry_uarea *dir_u = vi->vi_uarea;
13171 + reiserfs_warning(NULL, "reiserfs-16104",
13172 + "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x",
13173 + vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags);
13174 + printk("%d entries: ", dir_u->entry_count);
13175 + for (i = 0; i < dir_u->entry_count; i++)
13176 + printk("%d ", dir_u->entry_sizes[i]);
13177 + printk("\n");
13180 +static struct item_operations direntry_ops = {
13181 + .bytes_number = direntry_bytes_number,
13182 + .decrement_key = direntry_decrement_key,
13183 + .is_left_mergeable = direntry_is_left_mergeable,
13184 + .print_item = direntry_print_item,
13185 + .check_item = direntry_check_item,
13187 + .create_vi = direntry_create_vi,
13188 + .check_left = direntry_check_left,
13189 + .check_right = direntry_check_right,
13190 + .part_size = direntry_part_size,
13191 + .unit_num = direntry_unit_num,
13192 + .print_vi = direntry_print_vi
13195 +/* Error catching functions to catch errors caused by incorrect item types. */
13196 +static int errcatch_bytes_number(struct item_head *ih, int block_size)
13198 + reiserfs_warning(NULL, "green-16001",
13199 + "Invalid item type observed, run fsck ASAP");
13200 + return 0;
13203 +static void errcatch_decrement_key(struct cpu_key *key)
13205 + reiserfs_warning(NULL, "green-16002",
13206 + "Invalid item type observed, run fsck ASAP");
13209 +static int errcatch_is_left_mergeable(struct reiserfs_key *key,
13210 + unsigned long bsize)
13212 + reiserfs_warning(NULL, "green-16003",
13213 + "Invalid item type observed, run fsck ASAP");
13214 + return 0;
13217 +static void errcatch_print_item(struct item_head *ih, char *item)
13219 + reiserfs_warning(NULL, "green-16004",
13220 + "Invalid item type observed, run fsck ASAP");
13223 +static void errcatch_check_item(struct item_head *ih, char *item)
13225 + reiserfs_warning(NULL, "green-16005",
13226 + "Invalid item type observed, run fsck ASAP");
13229 +static int errcatch_create_vi(struct virtual_node *vn,
13230 + struct virtual_item *vi,
13231 + int is_affected, int insert_size)
13233 + reiserfs_warning(NULL, "green-16006",
13234 + "Invalid item type observed, run fsck ASAP");
13235 + /*
13236 + * We might return -1 here as well, but it wouldn't help:
13237 + * create_virtual_node(), from which this operation is called,
13238 + * returns void.
13239 + */
13240 + return 0;
13243 +static int errcatch_check_left(struct virtual_item *vi, int free,
13244 + int start_skip, int end_skip)
13246 + reiserfs_warning(NULL, "green-16007",
13247 + "Invalid item type observed, run fsck ASAP");
13248 + return -1;
13251 +static int errcatch_check_right(struct virtual_item *vi, int free)
13253 + reiserfs_warning(NULL, "green-16008",
13254 + "Invalid item type observed, run fsck ASAP");
13255 + return -1;
13258 +static int errcatch_part_size(struct virtual_item *vi, int first, int count)
13260 + reiserfs_warning(NULL, "green-16009",
13261 + "Invalid item type observed, run fsck ASAP");
13262 + return 0;
13265 +static int errcatch_unit_num(struct virtual_item *vi)
13267 + reiserfs_warning(NULL, "green-16010",
13268 + "Invalid item type observed, run fsck ASAP");
13269 + return 0;
13272 +static void errcatch_print_vi(struct virtual_item *vi)
13274 + reiserfs_warning(NULL, "green-16011",
13275 + "Invalid item type observed, run fsck ASAP");
13278 +static struct item_operations errcatch_ops = {
13279 + .bytes_number = errcatch_bytes_number,
13280 + .decrement_key = errcatch_decrement_key,
13281 + .is_left_mergeable = errcatch_is_left_mergeable,
13282 + .print_item = errcatch_print_item,
13283 + .check_item = errcatch_check_item,
13285 + .create_vi = errcatch_create_vi,
13286 + .check_left = errcatch_check_left,
13287 + .check_right = errcatch_check_right,
13288 + .part_size = errcatch_part_size,
13289 + .unit_num = errcatch_unit_num,
13290 + .print_vi = errcatch_print_vi
13293 +#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
13294 +#error Item types must use disk-format assigned values.
13295 +#endif
13297 +struct item_operations *item_ops[TYPE_ANY + 1] = {
13298 + &stat_data_ops,
13299 + &indirect_ops,
13300 + &direct_ops,
13301 + &direntry_ops,
13302 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
13303 + &errcatch_ops /* This is to catch errors with invalid type (15th entry for TYPE_ANY) */
13305 diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
13306 new file mode 100644
13307 index 000000000000..e477ee0ff35d
13308 --- /dev/null
13309 +++ b/fs/reiserfs/journal.c
13310 @@ -0,0 +1,4404 @@
13311 +// SPDX-License-Identifier: GPL-2.0
13313 + * Write ahead logging implementation copyright Chris Mason 2000
13315 + * The background commits make this code very interrelated, and
13316 + * overly complex. I need to rethink things a bit....The major players:
13318 + * journal_begin -- call with the number of blocks you expect to log.
13319 + * If the current transaction is too
13320 + * old, it will block until the current transaction is
13321 + * finished, and then start a new one.
13322 + * Usually, your transaction will get joined in with
13323 + * previous ones for speed.
13325 + * journal_join -- same as journal_begin, but won't block on the current
13326 + * transaction regardless of age. Don't ever call
13327 + * this. Ever. There are only two places it should be
13328 + * called from, and they are both inside this file.
13330 + * journal_mark_dirty -- adds blocks into this transaction. clears any flags
13331 + * that might make them get sent to disk
13332 + * and then marks them BH_JDirty. Puts the buffer head
13333 + * into the current transaction hash.
13335 + * journal_end -- if the current transaction is batchable, it does nothing
13336 + * otherwise, it could do an async/synchronous commit, or
13337 + * a full flush of all log and real blocks in the
13338 + * transaction.
13340 + * flush_old_commits -- if the current transaction is too old, it is ended and
13341 + * commit blocks are sent to disk. Forces commit blocks
13342 + * to disk for all backgrounded commits that have been
13343 + * around too long.
13344 + * -- Note, if you call this as an immediate flush from
13345 + * within kupdate, it will ignore the immediate flag
13346 + */
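+/*
+ * A minimal sketch of the usual calling sequence for a metadata update,
+ * assuming the in-tree signatures of these helpers:
+ *
+ *	struct reiserfs_transaction_handle th;
+ *	journal_begin(&th, sb, JOURNAL_PER_BALANCE_CNT);
+ *	...modify the buffer bh...
+ *	journal_mark_dirty(&th, bh);
+ *	journal_end(&th);
+ */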
13348 +#include <linux/time.h>
13349 +#include <linux/semaphore.h>
13350 +#include <linux/vmalloc.h>
13351 +#include "reiserfs.h"
13352 +#include <linux/kernel.h>
13353 +#include <linux/errno.h>
13354 +#include <linux/fcntl.h>
13355 +#include <linux/stat.h>
13356 +#include <linux/string.h>
13357 +#include <linux/buffer_head.h>
13358 +#include <linux/workqueue.h>
13359 +#include <linux/writeback.h>
13360 +#include <linux/blkdev.h>
13361 +#include <linux/backing-dev.h>
13362 +#include <linux/uaccess.h>
13363 +#include <linux/slab.h>
13366 +/* gets a struct reiserfs_journal_list * from a list head */
13367 +#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
13368 + j_list))
13370 +/* must be correct to keep the desc and commit structs at 4k */
13371 +#define JOURNAL_TRANS_HALF 1018
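+/*
+ * rough math, assuming 4k blocks: three 4-byte header fields plus a
+ * 12-byte magic leave 4096 - 24 = 4072 bytes, i.e. 1018 four-byte
+ * block-number slots per desc/commit block.
+ */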
12372 +#define BUFNR 64 /* read ahead */
13374 +/* cnode stat bits. Move these into reiserfs_fs.h */
13376 +/* this block was freed, and can't be written. */
13377 +#define BLOCK_FREED 2
13378 +/* this block was freed during this transaction, and can't be written */
13379 +#define BLOCK_FREED_HOLDER 3
13381 +/* used in flush_journal_list */
13382 +#define BLOCK_NEEDS_FLUSH 4
13383 +#define BLOCK_DIRTIED 5
13385 +/* journal list state bits */
13386 +#define LIST_TOUCHED 1
13387 +#define LIST_DIRTY 2
13388 +#define LIST_COMMIT_PENDING 4 /* someone will commit this list */
13390 +/* flags for do_journal_end */
13391 +#define FLUSH_ALL 1 /* flush commit and real blocks */
13392 +#define COMMIT_NOW 2 /* end and commit this transaction */
13393 +#define WAIT 4 /* wait for the log blocks to hit the disk */
13395 +static int do_journal_end(struct reiserfs_transaction_handle *, int flags);
13396 +static int flush_journal_list(struct super_block *s,
13397 + struct reiserfs_journal_list *jl, int flushall);
13398 +static int flush_commit_list(struct super_block *s,
13399 + struct reiserfs_journal_list *jl, int flushall);
13400 +static int can_dirty(struct reiserfs_journal_cnode *cn);
13401 +static int journal_join(struct reiserfs_transaction_handle *th,
13402 + struct super_block *sb);
13403 +static void release_journal_dev(struct reiserfs_journal *journal);
13404 +static void dirty_one_transaction(struct super_block *s,
13405 + struct reiserfs_journal_list *jl);
13406 +static void flush_async_commits(struct work_struct *work);
13407 +static void queue_log_writer(struct super_block *s);
13409 +/* values for join in do_journal_begin_r */
13410 +enum {
13411 + JBEGIN_REG = 0, /* regular journal begin */
13412 + /* join the running transaction if at all possible */
13413 + JBEGIN_JOIN = 1,
13414 + /* called from cleanup code, ignores aborted flag */
13415 + JBEGIN_ABORT = 2,
13418 +static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
13419 + struct super_block *sb,
13420 + unsigned long nblocks, int join);
13422 +static void init_journal_hash(struct super_block *sb)
13424 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13425 + memset(journal->j_hash_table, 0,
13426 + JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
13430 + * clears BH_Dirty and sticks the buffer on the clean list. Called because
13431 + * I can't allow refile_buffer to make schedule happen after I've freed a
13432 + * block. Look at remove_from_transaction and journal_mark_freed for
13433 + * more details.
13434 + */
13435 +static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
13437 + if (bh) {
13438 + clear_buffer_dirty(bh);
13439 + clear_buffer_journal_test(bh);
13441 + return 0;
13444 +static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
13445 + *sb)
13447 + struct reiserfs_bitmap_node *bn;
13448 + static int id;
13450 + bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS);
13451 + if (!bn) {
13452 + return NULL;
13454 + bn->data = kzalloc(sb->s_blocksize, GFP_NOFS);
13455 + if (!bn->data) {
13456 + kfree(bn);
13457 + return NULL;
13459 + bn->id = id++;
13460 + INIT_LIST_HEAD(&bn->list);
13461 + return bn;
13464 +static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
13466 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13467 + struct reiserfs_bitmap_node *bn = NULL;
13468 + struct list_head *entry = journal->j_bitmap_nodes.next;
13470 + journal->j_used_bitmap_nodes++;
13471 +repeat:
13473 + if (entry != &journal->j_bitmap_nodes) {
13474 + bn = list_entry(entry, struct reiserfs_bitmap_node, list);
13475 + list_del(entry);
13476 + memset(bn->data, 0, sb->s_blocksize);
13477 + journal->j_free_bitmap_nodes--;
13478 + return bn;
13480 + bn = allocate_bitmap_node(sb);
13481 + if (!bn) {
13482 + yield();
13483 + goto repeat;
13485 + return bn;
13487 +static inline void free_bitmap_node(struct super_block *sb,
13488 + struct reiserfs_bitmap_node *bn)
13490 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13491 + journal->j_used_bitmap_nodes--;
13492 + if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
13493 + kfree(bn->data);
13494 + kfree(bn);
13495 + } else {
13496 + list_add(&bn->list, &journal->j_bitmap_nodes);
13497 + journal->j_free_bitmap_nodes++;
13501 +static void allocate_bitmap_nodes(struct super_block *sb)
13503 + int i;
13504 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13505 + struct reiserfs_bitmap_node *bn = NULL;
13506 + for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) {
13507 + bn = allocate_bitmap_node(sb);
13508 + if (bn) {
13509 + list_add(&bn->list, &journal->j_bitmap_nodes);
13510 + journal->j_free_bitmap_nodes++;
13511 + } else {
13512 + /* this is ok, we'll try again when more are needed */
13513 + break;
13518 +static int set_bit_in_list_bitmap(struct super_block *sb,
13519 + b_blocknr_t block,
13520 + struct reiserfs_list_bitmap *jb)
13522 + unsigned int bmap_nr = block / (sb->s_blocksize << 3);
13523 + unsigned int bit_nr = block % (sb->s_blocksize << 3);
13525 + if (!jb->bitmaps[bmap_nr]) {
13526 + jb->bitmaps[bmap_nr] = get_bitmap_node(sb);
13528 + set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data);
13529 + return 0;
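+/*
+ * e.g. with a 4k block size each bitmap node covers 4096 * 8 = 32768
+ * blocks, so block 100000 lands in bitmap node 3 at bit 1696.
+ */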
13532 +static void cleanup_bitmap_list(struct super_block *sb,
13533 + struct reiserfs_list_bitmap *jb)
13535 + int i;
13536 + if (jb->bitmaps == NULL)
13537 + return;
13539 + for (i = 0; i < reiserfs_bmap_count(sb); i++) {
13540 + if (jb->bitmaps[i]) {
13541 + free_bitmap_node(sb, jb->bitmaps[i]);
13542 + jb->bitmaps[i] = NULL;
13548 + * only call this on FS unmount.
13549 + */
13550 +static int free_list_bitmaps(struct super_block *sb,
13551 + struct reiserfs_list_bitmap *jb_array)
13553 + int i;
13554 + struct reiserfs_list_bitmap *jb;
13555 + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
13556 + jb = jb_array + i;
13557 + jb->journal_list = NULL;
13558 + cleanup_bitmap_list(sb, jb);
13559 + vfree(jb->bitmaps);
13560 + jb->bitmaps = NULL;
13562 + return 0;
13565 +static int free_bitmap_nodes(struct super_block *sb)
13567 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13568 + struct list_head *next = journal->j_bitmap_nodes.next;
13569 + struct reiserfs_bitmap_node *bn;
13571 + while (next != &journal->j_bitmap_nodes) {
13572 + bn = list_entry(next, struct reiserfs_bitmap_node, list);
13573 + list_del(next);
13574 + kfree(bn->data);
13575 + kfree(bn);
13576 + next = journal->j_bitmap_nodes.next;
13577 + journal->j_free_bitmap_nodes--;
13580 + return 0;
13584 + * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
13585 + * jb_array is the array to be filled in.
13586 + */
13587 +int reiserfs_allocate_list_bitmaps(struct super_block *sb,
13588 + struct reiserfs_list_bitmap *jb_array,
13589 + unsigned int bmap_nr)
13591 + int i;
13592 + int failed = 0;
13593 + struct reiserfs_list_bitmap *jb;
13594 + int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *);
13596 + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
13597 + jb = jb_array + i;
13598 + jb->journal_list = NULL;
13599 + jb->bitmaps = vzalloc(mem);
13600 + if (!jb->bitmaps) {
13601 + reiserfs_warning(sb, "clm-2000", "unable to "
13602 + "allocate bitmaps for journal lists");
13603 + failed = 1;
13604 + break;
13607 + if (failed) {
13608 + free_list_bitmaps(sb, jb_array);
13609 + return -1;
13611 + return 0;
13615 + * find an available list bitmap. If you can't find one, flush a commit list
13616 + * and try again
13617 + */
13618 +static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
13619 + struct reiserfs_journal_list
13620 + *jl)
13622 + int i, j;
13623 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13624 + struct reiserfs_list_bitmap *jb = NULL;
13626 + for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
13627 + i = journal->j_list_bitmap_index;
13628 + journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
13629 + jb = journal->j_list_bitmap + i;
13630 + if (journal->j_list_bitmap[i].journal_list) {
13631 + flush_commit_list(sb,
13632 + journal->j_list_bitmap[i].
13633 + journal_list, 1);
13634 + if (!journal->j_list_bitmap[i].journal_list) {
13635 + break;
13637 + } else {
13638 + break;
13641 + /* double check to make sure it was flushed correctly */
13642 + if (jb->journal_list)
13643 + return NULL;
13644 + jb->journal_list = jl;
13645 + return jb;
13649 + * allocates a new chunk of X nodes, and links them all together as a list.
13650 + * Uses the cnode->next and cnode->prev pointers
13651 + * returns NULL on failure
13652 + */
13653 +static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
13655 + struct reiserfs_journal_cnode *head;
13656 + int i;
13657 + if (num_cnodes <= 0) {
13658 + return NULL;
13660 + head = vzalloc(array_size(num_cnodes,
13661 + sizeof(struct reiserfs_journal_cnode)));
13662 + if (!head) {
13663 + return NULL;
13665 + head[0].prev = NULL;
13666 + head[0].next = head + 1;
13667 + for (i = 1; i < num_cnodes; i++) {
13668 + head[i].prev = head + (i - 1);
13669 + head[i].next = head + (i + 1); /* the last entry is fixed up after the loop */
13671 + head[num_cnodes - 1].next = NULL;
13672 + return head;
13675 +/* pulls a cnode off the free list, or returns NULL on failure */
13676 +static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
13678 + struct reiserfs_journal_cnode *cn;
13679 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13681 + reiserfs_check_lock_depth(sb, "get_cnode");
13683 + if (journal->j_cnode_free <= 0) {
13684 + return NULL;
13686 + journal->j_cnode_used++;
13687 + journal->j_cnode_free--;
13688 + cn = journal->j_cnode_free_list;
13689 + if (!cn) {
13690 + return cn;
13692 + if (cn->next) {
13693 + cn->next->prev = NULL;
13695 + journal->j_cnode_free_list = cn->next;
13696 + memset(cn, 0, sizeof(struct reiserfs_journal_cnode));
13697 + return cn;
13701 + * returns a cnode to the free list
13702 + */
13703 +static void free_cnode(struct super_block *sb,
13704 + struct reiserfs_journal_cnode *cn)
13706 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13708 + reiserfs_check_lock_depth(sb, "free_cnode");
13710 + journal->j_cnode_used--;
13711 + journal->j_cnode_free++;
13712 + /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */
13713 + cn->next = journal->j_cnode_free_list;
13714 + if (journal->j_cnode_free_list) {
13715 + journal->j_cnode_free_list->prev = cn;
13717 + cn->prev = NULL; /* not needed with the memset, but I might kill the memset, and forget to do this */
13718 + journal->j_cnode_free_list = cn;
13721 +static void clear_prepared_bits(struct buffer_head *bh)
13723 + clear_buffer_journal_prepared(bh);
13724 + clear_buffer_journal_restore_dirty(bh);
13728 + * return a cnode with same dev, block number and size in table,
13729 + * or null if not found
13730 + */
13731 +static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
13732 + super_block
13733 + *sb,
13734 + struct
13735 + reiserfs_journal_cnode
13736 + **table,
13737 + long bl)
13739 + struct reiserfs_journal_cnode *cn;
13740 + cn = journal_hash(table, sb, bl);
13741 + while (cn) {
13742 + if (cn->blocknr == bl && cn->sb == sb)
13743 + return cn;
13744 + cn = cn->hnext;
13746 + return (struct reiserfs_journal_cnode *)0;
13750 + * this actually means 'can this block be reallocated yet?'. If you set
13751 + * search_all, a block can only be allocated if it is not in the current
13752 + * transaction, was not freed by the current transaction, and has no chance
13753 + * of ever being overwritten by a replay after crashing.
13755 + * If you don't set search_all, a block can only be allocated if it is not
13756 + * in the current transaction. Since deleting a block removes it from the
13757 + * current transaction, this case should never happen. If you don't set
13758 + * search_all, make sure you never write the block without logging it.
13760 + * next_zero_bit is a suggestion about the next block to try for find_forward.
13761 + * when bl is rejected because it is set in a journal list bitmap, we search
13762 + * for the next zero bit in the bitmap that rejected bl. Then, we return
13763 + * that through next_zero_bit for find_forward to try.
13765 + * Just because we return something in next_zero_bit does not mean we won't
13766 + * reject it on the next call to reiserfs_in_journal
13767 + */
13768 +int reiserfs_in_journal(struct super_block *sb,
13769 + unsigned int bmap_nr, int bit_nr, int search_all,
13770 + b_blocknr_t * next_zero_bit)
13772 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
13773 + struct reiserfs_list_bitmap *jb;
13774 + int i;
13775 + unsigned long bl;
13777 + *next_zero_bit = 0; /* always start this at zero. */
13779 + PROC_INFO_INC(sb, journal.in_journal);
13780 + /*
13781 + * If we aren't doing a search_all, this is a metablock, and it
13782 + * will be logged before use. if we crash before the transaction
13783 + * that freed it commits, this transaction won't have committed
13784 + * either, and the block will never be written
13785 + */
13786 + if (search_all) {
13787 + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
13788 + PROC_INFO_INC(sb, journal.in_journal_bitmap);
13789 + jb = journal->j_list_bitmap + i;
13790 + if (jb->journal_list && jb->bitmaps[bmap_nr] &&
13791 + test_bit(bit_nr,
13792 + (unsigned long *)jb->bitmaps[bmap_nr]->
13793 + data)) {
13794 + *next_zero_bit =
13795 + find_next_zero_bit((unsigned long *)
13796 + (jb->bitmaps[bmap_nr]->
13797 + data),
13798 + sb->s_blocksize << 3,
13799 + bit_nr + 1);
13800 + return 1;
13805 + bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr;
13806 + /* is it in any old transactions? */
13807 + if (search_all
13808 + && (get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) {
13809 + return 1;
13812 + /* is it in the current transaction. This should never happen */
13813 + if ((get_journal_hash_dev(sb, journal->j_hash_table, bl))) {
13814 + BUG();
13815 + return 1;
13818 + PROC_INFO_INC(sb, journal.in_journal_reusable);
13819 + /* safe for reuse */
13820 + return 0;
13823 +/* insert cn into table */
13824 +static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
13825 + struct reiserfs_journal_cnode *cn)
13827 + struct reiserfs_journal_cnode *cn_orig;
13829 + cn_orig = journal_hash(table, cn->sb, cn->blocknr);
13830 + cn->hnext = cn_orig;
13831 + cn->hprev = NULL;
13832 + if (cn_orig) {
13833 + cn_orig->hprev = cn;
13835 + journal_hash(table, cn->sb, cn->blocknr) = cn;
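+/*
+ * journal_hash() expands to the hash bucket itself (an lvalue), which is
+ * why it can appear on the left-hand side of the assignment above.
+ */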
13838 +/* lock the current transaction */
13839 +static inline void lock_journal(struct super_block *sb)
13841 + PROC_INFO_INC(sb, journal.lock_journal);
13843 + reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
13846 +/* unlock the current transaction */
13847 +static inline void unlock_journal(struct super_block *sb)
13849 + mutex_unlock(&SB_JOURNAL(sb)->j_mutex);
13852 +static inline void get_journal_list(struct reiserfs_journal_list *jl)
13854 + jl->j_refcount++;
13857 +static inline void put_journal_list(struct super_block *s,
13858 + struct reiserfs_journal_list *jl)
13860 + if (jl->j_refcount < 1) {
13861 + reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d",
13862 + jl->j_trans_id, jl->j_refcount);
13864 + if (--jl->j_refcount == 0)
13865 + kfree(jl);
13869 + * this used to be much more involved, and I'm keeping it just in case
13870 + * things get ugly again. it gets called by flush_commit_list, and
13871 + * cleans up any data stored about blocks freed during a transaction.
13872 + */
13873 +static void cleanup_freed_for_journal_list(struct super_block *sb,
13874 + struct reiserfs_journal_list *jl)
13877 + struct reiserfs_list_bitmap *jb = jl->j_list_bitmap;
13878 + if (jb) {
13879 + cleanup_bitmap_list(sb, jb);
13881 + jl->j_list_bitmap->journal_list = NULL;
13882 + jl->j_list_bitmap = NULL;
13885 +static int journal_list_still_alive(struct super_block *s,
13886 + unsigned int trans_id)
13888 + struct reiserfs_journal *journal = SB_JOURNAL(s);
13889 + struct list_head *entry = &journal->j_journal_list;
13890 + struct reiserfs_journal_list *jl;
13892 + if (!list_empty(entry)) {
13893 + jl = JOURNAL_LIST_ENTRY(entry->next);
13894 + if (jl->j_trans_id <= trans_id) {
13895 + return 1;
13898 + return 0;
13902 + * If page->mapping was null, we failed to truncate this page for
13903 + * some reason. Most likely because it was truncated after being
13904 + * logged via data=journal.
13906 + * This does a check to see if the buffer belongs to one of these
13907 + * lost pages before doing the final put_bh. If page->mapping was
13908 + * null, it tries to free buffers on the page, which should make the
13909 + * final put_page drop the page from the lru.
13910 + */
13911 +static void release_buffer_page(struct buffer_head *bh)
13913 + struct folio *folio = bh->b_folio;
13914 + if (!folio->mapping && folio_trylock(folio)) {
13915 + folio_get(folio);
13916 + put_bh(bh);
13917 + if (!folio->mapping)
13918 + try_to_free_buffers(folio);
13919 + folio_unlock(folio);
13920 + folio_put(folio);
13921 + } else {
13922 + put_bh(bh);
13926 +static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
13928 + if (buffer_journaled(bh)) {
13929 + reiserfs_warning(NULL, "clm-2084",
13930 + "pinned buffer %lu:%pg sent to disk",
13931 + bh->b_blocknr, bh->b_bdev);
13933 + if (uptodate)
13934 + set_buffer_uptodate(bh);
13935 + else
13936 + clear_buffer_uptodate(bh);
13938 + unlock_buffer(bh);
13939 + release_buffer_page(bh);
13942 +static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate)
13944 + if (uptodate)
13945 + set_buffer_uptodate(bh);
13946 + else
13947 + clear_buffer_uptodate(bh);
13948 + unlock_buffer(bh);
13949 + put_bh(bh);
13952 +static void submit_logged_buffer(struct buffer_head *bh)
13954 + get_bh(bh);
13955 + bh->b_end_io = reiserfs_end_buffer_io_sync;
13956 + clear_buffer_journal_new(bh);
13957 + clear_buffer_dirty(bh);
13958 + if (!test_clear_buffer_journal_test(bh))
13959 + BUG();
13960 + if (!buffer_uptodate(bh))
13961 + BUG();
13962 + submit_bh(REQ_OP_WRITE, bh);
13965 +static void submit_ordered_buffer(struct buffer_head *bh)
13967 + get_bh(bh);
13968 + bh->b_end_io = reiserfs_end_ordered_io;
13969 + clear_buffer_dirty(bh);
13970 + if (!buffer_uptodate(bh))
13971 + BUG();
13972 + submit_bh(REQ_OP_WRITE, bh);
13975 +#define CHUNK_SIZE 32
13976 +struct buffer_chunk {
13977 + struct buffer_head *bh[CHUNK_SIZE];
13978 + int nr;
13981 +static void write_chunk(struct buffer_chunk *chunk)
13983 + int i;
13984 + for (i = 0; i < chunk->nr; i++) {
13985 + submit_logged_buffer(chunk->bh[i]);
13987 + chunk->nr = 0;
13990 +static void write_ordered_chunk(struct buffer_chunk *chunk)
13992 + int i;
13993 + for (i = 0; i < chunk->nr; i++) {
13994 + submit_ordered_buffer(chunk->bh[i]);
13996 + chunk->nr = 0;
13999 +static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
14000 + spinlock_t * lock, void (fn) (struct buffer_chunk *))
14002 + int ret = 0;
14003 + BUG_ON(chunk->nr >= CHUNK_SIZE);
14004 + chunk->bh[chunk->nr++] = bh;
14005 + if (chunk->nr >= CHUNK_SIZE) {
14006 + ret = 1;
14007 + if (lock) {
14008 + spin_unlock(lock);
14009 + fn(chunk);
14010 + spin_lock(lock);
14011 + } else {
14012 + fn(chunk);
14015 + return ret;
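+/*
+ * Buffers are batched CHUNK_SIZE at a time so submission runs with the
+ * spinlock dropped; the return value of 1 tells the caller that the
+ * chunk was flushed and the lock was released and retaken.
+ */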
14018 +static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
14019 +static struct reiserfs_jh *alloc_jh(void)
14021 + struct reiserfs_jh *jh;
14022 + while (1) {
14023 + jh = kmalloc(sizeof(*jh), GFP_NOFS);
14024 + if (jh) {
14025 + atomic_inc(&nr_reiserfs_jh);
14026 + return jh;
14028 + yield();
14033 + * we want to free the jh when the buffer has been written
14034 + * and waited on
14035 + */
14036 +void reiserfs_free_jh(struct buffer_head *bh)
14038 + struct reiserfs_jh *jh;
14040 + jh = bh->b_private;
14041 + if (jh) {
14042 + bh->b_private = NULL;
14043 + jh->bh = NULL;
14044 + list_del_init(&jh->list);
14045 + kfree(jh);
14046 + if (atomic_read(&nr_reiserfs_jh) <= 0)
14047 + BUG();
14048 + atomic_dec(&nr_reiserfs_jh);
14049 + put_bh(bh);
14053 +static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
14054 + int tail)
14056 + struct reiserfs_jh *jh;
14058 + if (bh->b_private) {
14059 + spin_lock(&j->j_dirty_buffers_lock);
14060 + if (!bh->b_private) {
14061 + spin_unlock(&j->j_dirty_buffers_lock);
14062 + goto no_jh;
14064 + jh = bh->b_private;
14065 + list_del_init(&jh->list);
14066 + } else {
14067 +no_jh:
14068 + get_bh(bh);
14069 + jh = alloc_jh();
14070 + spin_lock(&j->j_dirty_buffers_lock);
14071 + /*
14072 + * buffer must be locked for __add_jh, should be able to have
14073 + * two adds at the same time
14074 + */
14075 + BUG_ON(bh->b_private);
14076 + jh->bh = bh;
14077 + bh->b_private = jh;
14079 + jh->jl = j->j_current_jl;
14080 + if (tail)
14081 + list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
14082 + else {
14083 + list_add_tail(&jh->list, &jh->jl->j_bh_list);
14085 + spin_unlock(&j->j_dirty_buffers_lock);
14086 + return 0;
14089 +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh)
14091 + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
14093 +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh)
14095 + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
14098 +#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
14099 +static int write_ordered_buffers(spinlock_t * lock,
14100 + struct reiserfs_journal *j,
14101 + struct reiserfs_journal_list *jl,
14102 + struct list_head *list)
14104 + struct buffer_head *bh;
14105 + struct reiserfs_jh *jh;
14106 + int ret = j->j_errno;
14107 + struct buffer_chunk chunk;
14108 + struct list_head tmp;
14109 + INIT_LIST_HEAD(&tmp);
14111 + chunk.nr = 0;
14112 + spin_lock(lock);
14113 + while (!list_empty(list)) {
14114 + jh = JH_ENTRY(list->next);
14115 + bh = jh->bh;
14116 + get_bh(bh);
14117 + if (!trylock_buffer(bh)) {
14118 + if (!buffer_dirty(bh)) {
14119 + list_move(&jh->list, &tmp);
14120 + goto loop_next;
14122 + spin_unlock(lock);
14123 + if (chunk.nr)
14124 + write_ordered_chunk(&chunk);
14125 + wait_on_buffer(bh);
14126 + cond_resched();
14127 + spin_lock(lock);
14128 + goto loop_next;
14130 + /*
14131 + * in theory, dirty non-uptodate buffers should never get here,
14132 + * but the upper layer io error paths still have a few quirks.
14133 + * Handle them here as gracefully as we can
14134 + */
14135 + if (!buffer_uptodate(bh) && buffer_dirty(bh)) {
14136 + clear_buffer_dirty(bh);
14137 + ret = -EIO;
14139 + if (buffer_dirty(bh)) {
14140 + list_move(&jh->list, &tmp);
14141 + add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
14142 + } else {
14143 + reiserfs_free_jh(bh);
14144 + unlock_buffer(bh);
14146 +loop_next:
14147 + put_bh(bh);
14148 + cond_resched_lock(lock);
14150 + if (chunk.nr) {
14151 + spin_unlock(lock);
14152 + write_ordered_chunk(&chunk);
14153 + spin_lock(lock);
14155 + while (!list_empty(&tmp)) {
14156 + jh = JH_ENTRY(tmp.prev);
14157 + bh = jh->bh;
14158 + get_bh(bh);
14159 + reiserfs_free_jh(bh);
14161 + if (buffer_locked(bh)) {
14162 + spin_unlock(lock);
14163 + wait_on_buffer(bh);
14164 + spin_lock(lock);
14166 + if (!buffer_uptodate(bh)) {
14167 + ret = -EIO;
14169 + /*
14170 + * ugly interaction with invalidate_folio here.
14171 + * reiserfs_invalidate_folio will pin any buffer that has a
14172 + * valid journal head from an older transaction. If someone
14173 + * else sets our buffer dirty after we write it in the first
14174 + * loop, and then someone truncates the page away, nobody
14175 + * will ever write the buffer. We're safe if we write the
14176 + * page one last time after freeing the journal header.
14177 + */
14178 + if (buffer_dirty(bh) && unlikely(bh->b_folio->mapping == NULL)) {
14179 + spin_unlock(lock);
14180 + write_dirty_buffer(bh, 0);
14181 + spin_lock(lock);
14183 + put_bh(bh);
14184 + cond_resched_lock(lock);
14186 + spin_unlock(lock);
14187 + return ret;
14190 +static int flush_older_commits(struct super_block *s,
14191 + struct reiserfs_journal_list *jl)
14193 + struct reiserfs_journal *journal = SB_JOURNAL(s);
14194 + struct reiserfs_journal_list *other_jl;
14195 + struct reiserfs_journal_list *first_jl;
14196 + struct list_head *entry;
14197 + unsigned int trans_id = jl->j_trans_id;
14198 + unsigned int other_trans_id;
14200 +find_first:
14201 + /*
14202 + * first we walk backwards to find the oldest uncommitted transaction
14203 + */
14204 + first_jl = jl;
14205 + entry = jl->j_list.prev;
14206 + while (1) {
14207 + other_jl = JOURNAL_LIST_ENTRY(entry);
14208 + if (entry == &journal->j_journal_list ||
14209 + atomic_read(&other_jl->j_older_commits_done))
14210 + break;
14212 + first_jl = other_jl;
14213 + entry = other_jl->j_list.prev;
14216 + /* if we didn't find any older uncommitted transactions, return now */
14217 + if (first_jl == jl) {
14218 + return 0;
14221 + entry = &first_jl->j_list;
14222 + while (1) {
14223 + other_jl = JOURNAL_LIST_ENTRY(entry);
14224 + other_trans_id = other_jl->j_trans_id;
14226 + if (other_trans_id < trans_id) {
14227 + if (atomic_read(&other_jl->j_commit_left) != 0) {
14228 + flush_commit_list(s, other_jl, 0);
14230 + /* list we were called with is gone, return */
14231 + if (!journal_list_still_alive(s, trans_id))
14232 + return 1;
14234 + /*
14235 + * the one we just flushed is gone, this means
14236 + * all older lists are also gone, so first_jl
14237 + * is no longer valid either. Go back to the
14238 + * beginning.
14239 + */
14240 + if (!journal_list_still_alive
14241 + (s, other_trans_id)) {
14242 + goto find_first;
14245 + entry = entry->next;
14246 + if (entry == &journal->j_journal_list)
14247 + return 0;
14248 + } else {
14249 + return 0;
14252 + return 0;
14255 +static int reiserfs_async_progress_wait(struct super_block *s)
14257 + struct reiserfs_journal *j = SB_JOURNAL(s);
14259 + if (atomic_read(&j->j_async_throttle)) {
14260 + int depth;
14262 + depth = reiserfs_write_unlock_nested(s);
14263 + wait_var_event_timeout(&j->j_async_throttle,
14264 + atomic_read(&j->j_async_throttle) == 0,
14265 + HZ / 10);
14266 + reiserfs_write_lock_nested(s, depth);
14269 + return 0;
14273 + * if this journal list still has commit blocks unflushed, send them to disk.
14275 + * log areas must be flushed in order (transaction 2 can't commit before
14276 + * transaction 1) Before the commit block can be written, every other log
14277 + * block must be safely on disk
14278 + */
14279 +static int flush_commit_list(struct super_block *s,
14280 + struct reiserfs_journal_list *jl, int flushall)
14282 + int i;
14283 + b_blocknr_t bn;
14284 + struct buffer_head *tbh = NULL;
14285 + unsigned int trans_id = jl->j_trans_id;
14286 + struct reiserfs_journal *journal = SB_JOURNAL(s);
14287 + int retval = 0;
14288 + int write_len;
14289 + int depth;
14291 + reiserfs_check_lock_depth(s, "flush_commit_list");
14293 + if (atomic_read(&jl->j_older_commits_done)) {
14294 + return 0;
14297 + /*
14298 + * before we can put our commit blocks on disk, we have to make
14299 + * sure everyone older than us is on disk too
14300 + */
14301 + BUG_ON(jl->j_len <= 0);
14302 + BUG_ON(trans_id == journal->j_trans_id);
14304 + get_journal_list(jl);
14305 + if (flushall) {
14306 + if (flush_older_commits(s, jl) == 1) {
14307 + /*
14308 + * list disappeared during flush_older_commits.
14309 + * return
14310 + */
14311 + goto put_jl;
14315 + /* make sure nobody is trying to flush this one at the same time */
14316 + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
14318 + if (!journal_list_still_alive(s, trans_id)) {
14319 + mutex_unlock(&jl->j_commit_mutex);
14320 + goto put_jl;
14322 + BUG_ON(jl->j_trans_id == 0);
14324 + /* this commit is done, exit */
14325 + if (atomic_read(&jl->j_commit_left) <= 0) {
14326 + if (flushall) {
14327 + atomic_set(&jl->j_older_commits_done, 1);
14329 + mutex_unlock(&jl->j_commit_mutex);
14330 + goto put_jl;
14333 + if (!list_empty(&jl->j_bh_list)) {
14334 + int ret;
14336 + /*
14337 + * We might sleep in numerous places inside
14338 + * write_ordered_buffers. Relax the write lock.
14339 + */
14340 + depth = reiserfs_write_unlock_nested(s);
14341 + ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
14342 + journal, jl, &jl->j_bh_list);
14343 + if (ret < 0 && retval == 0)
14344 + retval = ret;
14345 + reiserfs_write_lock_nested(s, depth);
14347 + BUG_ON(!list_empty(&jl->j_bh_list));
14348 + /*
14349 + * for the description block and all the log blocks, submit any buffers
14350 + * that haven't already reached the disk. Try to write at least 256
14351 + * log blocks. later on, we will only wait on blocks that correspond
14352 + * to this transaction, but while we're unplugging we might as well
14353 + * get a chunk of data on there.
14354 + */
14355 + atomic_inc(&journal->j_async_throttle);
14356 + write_len = jl->j_len + 1;
14357 + if (write_len < 256)
14358 + write_len = 256;
14359 + for (i = 0 ; i < write_len ; i++) {
14360 + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
14361 + SB_ONDISK_JOURNAL_SIZE(s);
14362 + tbh = journal_find_get_block(s, bn);
14363 + if (tbh) {
14364 + if (buffer_dirty(tbh)) {
14365 + depth = reiserfs_write_unlock_nested(s);
14366 + write_dirty_buffer(tbh, 0);
14367 + reiserfs_write_lock_nested(s, depth);
14369 + put_bh(tbh) ;
14372 + if (atomic_dec_and_test(&journal->j_async_throttle))
14373 + wake_up_var(&journal->j_async_throttle);
14375 + for (i = 0; i < (jl->j_len + 1); i++) {
14376 + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
14377 + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
14378 + tbh = journal_find_get_block(s, bn);
14380 + depth = reiserfs_write_unlock_nested(s);
14381 + __wait_on_buffer(tbh);
14382 + reiserfs_write_lock_nested(s, depth);
14383 + /*
14384 + * since write_dirty_buffer() above skips locked buffers, one
14385 + * may still be in flight. Double check here
14386 + */
14387 + /* redundant, sync_dirty_buffer() checks */
14388 + if (buffer_dirty(tbh)) {
14389 + depth = reiserfs_write_unlock_nested(s);
14390 + sync_dirty_buffer(tbh);
14391 + reiserfs_write_lock_nested(s, depth);
14393 + if (unlikely(!buffer_uptodate(tbh))) {
14394 +#ifdef CONFIG_REISERFS_CHECK
14395 + reiserfs_warning(s, "journal-601",
14396 + "buffer write failed");
14397 +#endif
14398 + retval = -EIO;
14400 + /* once for journal_find_get_block */
14401 + put_bh(tbh);
14402 + /* once due to original getblk in do_journal_end */
14403 + put_bh(tbh);
14404 + atomic_dec(&jl->j_commit_left);
14407 + BUG_ON(atomic_read(&jl->j_commit_left) != 1);
14409 + /*
14410 + * If there was a write error in the journal - we can't commit
14411 + * this transaction - it will be invalid and, if successful,
14412 + * will just end up propagating the write error out to
14413 + * the file system.
14414 + */
14415 + if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
14416 + if (buffer_dirty(jl->j_commit_bh))
14417 + BUG();
14418 + mark_buffer_dirty(jl->j_commit_bh) ;
14419 + depth = reiserfs_write_unlock_nested(s);
14420 + if (reiserfs_barrier_flush(s))
14421 + __sync_dirty_buffer(jl->j_commit_bh,
14422 + REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
14423 + else
14424 + sync_dirty_buffer(jl->j_commit_bh);
14425 + reiserfs_write_lock_nested(s, depth);
14428 + /*
14429 + * If there was a write error in the journal - we can't commit this
14430 + * transaction - it will be invalid and, if successful, will just end
14431 + * up propagating the write error out to the filesystem.
14432 + */
14433 + if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
14434 +#ifdef CONFIG_REISERFS_CHECK
14435 + reiserfs_warning(s, "journal-615", "buffer write failed");
14436 +#endif
14437 + retval = -EIO;
14439 + bforget(jl->j_commit_bh);
14440 + if (journal->j_last_commit_id != 0 &&
14441 + (jl->j_trans_id - journal->j_last_commit_id) != 1) {
14442 + reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu",
14443 + journal->j_last_commit_id, jl->j_trans_id);
14445 + journal->j_last_commit_id = jl->j_trans_id;
14447 + /*
14448 + * now, every commit block is on the disk. It is safe to allow
14449 + * blocks freed during this transaction to be reallocated
14450 + */
14451 + cleanup_freed_for_journal_list(s, jl);
14453 + retval = retval ? retval : journal->j_errno;
14455 + /* mark the metadata dirty */
14456 + if (!retval)
14457 + dirty_one_transaction(s, jl);
14458 + atomic_dec(&jl->j_commit_left);
14460 + if (flushall) {
14461 + atomic_set(&jl->j_older_commits_done, 1);
14463 + mutex_unlock(&jl->j_commit_mutex);
14464 +put_jl:
14465 + put_journal_list(s, jl);
14467 + if (retval)
14468 + reiserfs_abort(s, retval, "Journal write error in %s",
14469 + __func__);
14470 + return retval;
14474 + * flush_journal_list frequently needs to find a newer transaction for a
14475 + * given block. This does that, or returns NULL if it can't find anything
14476 + */
14477 +static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
14478 + reiserfs_journal_cnode
14479 + *cn)
14481 + struct super_block *sb = cn->sb;
14482 + b_blocknr_t blocknr = cn->blocknr;
14484 + cn = cn->hprev;
14485 + while (cn) {
14486 + if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) {
14487 + return cn->jlist;
14489 + cn = cn->hprev;
14491 + return NULL;
14494 +static void remove_journal_hash(struct super_block *,
14495 + struct reiserfs_journal_cnode **,
14496 + struct reiserfs_journal_list *, unsigned long,
14497 + int);
14500 + * once all the real blocks have been flushed, it is safe to remove them
14501 + * from the journal list for this transaction. Aside from freeing the
14502 + * cnode, this also allows the block to be reallocated for data blocks
14503 + * if it had been deleted.
14504 + */
14505 +static void remove_all_from_journal_list(struct super_block *sb,
14506 + struct reiserfs_journal_list *jl,
14507 + int debug)
14509 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
14510 + struct reiserfs_journal_cnode *cn, *last;
14511 + cn = jl->j_realblock;
14513 + /*
14514 + * which is better, to lock once around the whole loop, or
14515 + * to lock for each call to remove_journal_hash?
14516 + */
14517 + while (cn) {
14518 + if (cn->blocknr != 0) {
14519 + if (debug) {
14520 + reiserfs_warning(sb, "reiserfs-2201",
14521 + "block %u, bh is %d, state %ld",
14522 + cn->blocknr, cn->bh ? 1 : 0,
14523 + cn->state);
14525 + cn->state = 0;
14526 + remove_journal_hash(sb, journal->j_list_hash_table,
14527 + jl, cn->blocknr, 1);
14529 + last = cn;
14530 + cn = cn->next;
14531 + free_cnode(sb, last);
14533 + jl->j_realblock = NULL;
14537 + * if this timestamp is greater than the timestamp we wrote last to the
14538 + * header block, write it to the header block. once this is done, I can
14539 + * safely say the log area for this transaction won't ever be replayed,
14540 + * and I can start releasing blocks in this transaction for reuse as data
14541 + * blocks. called by flush_journal_list, before it calls
14542 + * remove_all_from_journal_list
14543 + */
14544 +static int _update_journal_header_block(struct super_block *sb,
14545 + unsigned long offset,
14546 + unsigned int trans_id)
14548 + struct reiserfs_journal_header *jh;
14549 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
14550 + int depth;
14552 + if (reiserfs_is_journal_aborted(journal))
14553 + return -EIO;
14555 + if (trans_id >= journal->j_last_flush_trans_id) {
14556 + if (buffer_locked((journal->j_header_bh))) {
14557 + depth = reiserfs_write_unlock_nested(sb);
14558 + __wait_on_buffer(journal->j_header_bh);
14559 + reiserfs_write_lock_nested(sb, depth);
14560 + if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
14561 +#ifdef CONFIG_REISERFS_CHECK
14562 + reiserfs_warning(sb, "journal-699",
14563 + "buffer write failed");
14564 +#endif
14565 + return -EIO;
14568 + journal->j_last_flush_trans_id = trans_id;
14569 + journal->j_first_unflushed_offset = offset;
14570 + jh = (struct reiserfs_journal_header *)(journal->j_header_bh->
14571 + b_data);
14572 + jh->j_last_flush_trans_id = cpu_to_le32(trans_id);
14573 + jh->j_first_unflushed_offset = cpu_to_le32(offset);
14574 + jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
14576 + set_buffer_dirty(journal->j_header_bh);
14577 + depth = reiserfs_write_unlock_nested(sb);
14579 + if (reiserfs_barrier_flush(sb))
14580 + __sync_dirty_buffer(journal->j_header_bh,
14581 + REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
14582 + else
14583 + sync_dirty_buffer(journal->j_header_bh);
14585 + reiserfs_write_lock_nested(sb, depth);
14586 + if (!buffer_uptodate(journal->j_header_bh)) {
14587 + reiserfs_warning(sb, "journal-837",
14588 + "IO error during journal replay");
14589 + return -EIO;
14592 + return 0;
14595 +static int update_journal_header_block(struct super_block *sb,
14596 + unsigned long offset,
14597 + unsigned int trans_id)
14599 + return _update_journal_header_block(sb, offset, trans_id);
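+/*
+ * Worked example (illustrative, with assumed numbers): for an on-disk
+ * journal of SB_ONDISK_JOURNAL_SIZE(sb) = 8192 blocks, flushing a
+ * transaction whose description block sits at offset 100 with j_len = 10
+ * moves the header's j_first_unflushed_offset to (100 + 10 + 2) % 8192 =
+ * 112, one block past the commit block and thus the next transaction's
+ * description block. Replay after a crash resumes from that offset.
+ */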
14603 +** flush any and all journal lists older than you are
14604 +** can only be called from flush_journal_list
14606 +static int flush_older_journal_lists(struct super_block *sb,
14607 + struct reiserfs_journal_list *jl)
14609 + struct list_head *entry;
14610 + struct reiserfs_journal_list *other_jl;
14611 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
14612 + unsigned int trans_id = jl->j_trans_id;
14614 + /*
14615 + * we know we are the only ones flushing things, no extra race
14616 + * protection is required.
14617 + */
14618 +restart:
14619 + entry = journal->j_journal_list.next;
14620 + /* Did we wrap? */
14621 + if (entry == &journal->j_journal_list)
14622 + return 0;
14623 + other_jl = JOURNAL_LIST_ENTRY(entry);
14624 + if (other_jl->j_trans_id < trans_id) {
14625 + BUG_ON(other_jl->j_refcount <= 0);
14626 + /* do not flush all */
14627 + flush_journal_list(sb, other_jl, 0);
14629 + /* other_jl is now deleted from the list */
14630 + goto restart;
14632 + return 0;
14635 +static void del_from_work_list(struct super_block *s,
14636 + struct reiserfs_journal_list *jl)
14638 + struct reiserfs_journal *journal = SB_JOURNAL(s);
14639 + if (!list_empty(&jl->j_working_list)) {
14640 + list_del_init(&jl->j_working_list);
14641 + journal->j_num_work_lists--;
14646 + * flush a journal list, both commit and real blocks
14648 + * always set flushall to 1, unless you are calling from inside
14649 + * flush_journal_list
14651 + * IMPORTANT. This can only be called while there are no journal writers,
14652 + * and the journal is locked. That means it can only be called from
14653 + * do_journal_end, or by journal_release
14654 + */
14655 +static int flush_journal_list(struct super_block *s,
14656 + struct reiserfs_journal_list *jl, int flushall)
14658 + struct reiserfs_journal_list *pjl;
14659 + struct reiserfs_journal_cnode *cn;
14660 + int count;
14661 + int was_jwait = 0;
14662 + int was_dirty = 0;
14663 + struct buffer_head *saved_bh;
14664 + unsigned long j_len_saved = jl->j_len;
14665 + struct reiserfs_journal *journal = SB_JOURNAL(s);
14666 + int err = 0;
14667 + int depth;
14669 + BUG_ON(j_len_saved <= 0);
14671 + if (atomic_read(&journal->j_wcount) != 0) {
14672 + reiserfs_warning(s, "clm-2048", "called with wcount %d",
14673 + atomic_read(&journal->j_wcount));
14676 + /* if flushall == 0, the lock is already held */
14677 + if (flushall) {
14678 + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
14679 + } else if (mutex_trylock(&journal->j_flush_mutex)) {
14680 + BUG();
14683 + count = 0;
14684 + if (j_len_saved > journal->j_trans_max) {
14685 + reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu",
14686 + j_len_saved, jl->j_trans_id);
14687 + return 0;
14690 + /* if all the work is already done, get out of here */
14691 + if (atomic_read(&jl->j_nonzerolen) <= 0 &&
14692 + atomic_read(&jl->j_commit_left) <= 0) {
14693 + goto flush_older_and_return;
14696 + /*
14697 + * start by putting the commit list on disk. This will also flush
14698 + * the commit lists of any older transactions
14699 + */
14700 + flush_commit_list(s, jl, 1);
14702 + if (!(jl->j_state & LIST_DIRTY)
14703 + && !reiserfs_is_journal_aborted(journal))
14704 + BUG();
14706 + /* are we done now? */
14707 + if (atomic_read(&jl->j_nonzerolen) <= 0 &&
14708 + atomic_read(&jl->j_commit_left) <= 0) {
14709 + goto flush_older_and_return;
14712 + /*
14713 + * loop through each cnode, see if we need to write it,
14714 + * or wait on a more recent transaction, or just ignore it
14715 + */
14716 + if (atomic_read(&journal->j_wcount) != 0) {
14717 + reiserfs_panic(s, "journal-844", "journal list is flushing, "
14718 + "wcount is not 0");
14720 + cn = jl->j_realblock;
14721 + while (cn) {
14722 + was_jwait = 0;
14723 + was_dirty = 0;
14724 + saved_bh = NULL;
14725 + /* blocknr of 0 is no longer in the hash, ignore it */
14726 + if (cn->blocknr == 0) {
14727 + goto free_cnode;
14730 + /*
14731 + * This transaction failed commit.
14732 + * Don't write out to the disk
14733 + */
14734 + if (!(jl->j_state & LIST_DIRTY))
14735 + goto free_cnode;
14737 + pjl = find_newer_jl_for_cn(cn);
14738 + /*
14739 + * the order is important here. We check pjl to make sure we
14740 + * don't clear BH_JDirty_wait if we aren't the one writing this
14741 + * block to disk
14742 + */
14743 + if (!pjl && cn->bh) {
14744 + saved_bh = cn->bh;
14746 + /*
14747 + * we do this to make sure nobody releases the
14748 + * buffer while we are working with it
14749 + */
14750 + get_bh(saved_bh);
14752 + if (buffer_journal_dirty(saved_bh)) {
14753 + BUG_ON(!can_dirty(cn));
14754 + was_jwait = 1;
14755 + was_dirty = 1;
14756 + } else if (can_dirty(cn)) {
14757 + /*
14758 + * everything with !pjl && jwait
14759 + * should be writable
14760 + */
14761 + BUG();
14765 + /*
14766 + * if someone has this block in a newer transaction, just make
14767 + * sure they are committed, and don't try writing it to disk
14768 + */
14769 + if (pjl) {
14770 + if (atomic_read(&pjl->j_commit_left))
14771 + flush_commit_list(s, pjl, 1);
14772 + goto free_cnode;
14775 + /*
14776 + * bh == NULL when the block got to disk on its own, OR,
14777 + * the block got freed in a future transaction
14778 + */
14779 + if (saved_bh == NULL) {
14780 + goto free_cnode;
14783 + /*
14784 + * this should never happen. kupdate_one_transaction has
14785 + * this list locked while it works, so we should never see a
14786 + * buffer here that is not marked JDirty_wait
14787 + */
14788 + if ((!was_jwait) && !buffer_locked(saved_bh)) {
14789 + reiserfs_warning(s, "journal-813",
14790 + "BAD! buffer %llu %cdirty %cjwait, "
14791 + "not in a newer transaction",
14792 + (unsigned long long)saved_bh->
14793 + b_blocknr, was_dirty ? ' ' : '!',
14794 + was_jwait ? ' ' : '!');
14796 + if (was_dirty) {
14797 + /*
14798 + * we inc again because saved_bh gets decremented
14799 + * at free_cnode
14800 + */
14801 + get_bh(saved_bh);
14802 + set_bit(BLOCK_NEEDS_FLUSH, &cn->state);
14803 + lock_buffer(saved_bh);
14804 + BUG_ON(cn->blocknr != saved_bh->b_blocknr);
14805 + if (buffer_dirty(saved_bh))
14806 + submit_logged_buffer(saved_bh);
14807 + else
14808 + unlock_buffer(saved_bh);
14809 + count++;
14810 + } else {
14811 + reiserfs_warning(s, "clm-2082",
14812 + "Unable to flush buffer %llu in %s",
14813 + (unsigned long long)saved_bh->
14814 + b_blocknr, __func__);
14816 +free_cnode:
14817 + cn = cn->next;
14818 + if (saved_bh) {
14819 + /*
14820 + * we incremented this to keep others from
14821 + * taking the buffer head away
14822 + */
14823 + put_bh(saved_bh);
14824 + if (atomic_read(&saved_bh->b_count) < 0) {
14825 + reiserfs_warning(s, "journal-945",
14826 + "saved_bh->b_count < 0");
14830 + if (count > 0) {
14831 + cn = jl->j_realblock;
14832 + while (cn) {
14833 + if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
14834 + if (!cn->bh) {
14835 + reiserfs_panic(s, "journal-1011",
14836 + "cn->bh is NULL");
14839 + depth = reiserfs_write_unlock_nested(s);
14840 + __wait_on_buffer(cn->bh);
14841 + reiserfs_write_lock_nested(s, depth);
14843 + if (!cn->bh) {
14844 + reiserfs_panic(s, "journal-1012",
14845 + "cn->bh is NULL");
14847 + if (unlikely(!buffer_uptodate(cn->bh))) {
14848 +#ifdef CONFIG_REISERFS_CHECK
14849 + reiserfs_warning(s, "journal-949",
14850 + "buffer write failed");
14851 +#endif
14852 + err = -EIO;
14854 + /*
14855 + * note, we must clear the JDirty_wait bit
14856 + * after the up to date check, otherwise we
14857 + * race against our flushpage routine
14858 + */
14859 + BUG_ON(!test_clear_buffer_journal_dirty
14860 + (cn->bh));
14862 + /* drop one ref for us */
14863 + put_bh(cn->bh);
14864 + /* drop one ref for journal_mark_dirty */
14865 + release_buffer_page(cn->bh);
14867 + cn = cn->next;
14871 + if (err)
14872 + reiserfs_abort(s, -EIO,
14873 + "Write error while pushing transaction to disk in %s",
14874 + __func__);
14875 +flush_older_and_return:
14877 + /*
14878 + * before we can update the journal header block, we _must_ flush all
14879 + * real blocks from all older transactions to disk. This is because
14880 + * once the header block is updated, this transaction will not be
14881 + * replayed after a crash
14882 + */
14883 + if (flushall) {
14884 + flush_older_journal_lists(s, jl);
14887 + err = journal->j_errno;
14888 + /*
14889 + * before we can remove everything from the hash tables for this
14890 + * transaction, we must make sure it can never be replayed
14892 + * since we are only called from do_journal_end, we know for sure there
14893 + * are no allocations going on while we are flushing journal lists. So,
14894 + * we only need to update the journal header block for the last list
14895 + * being flushed
14896 + */
14897 + if (!err && flushall) {
14898 + err =
14899 + update_journal_header_block(s,
14900 + (jl->j_start + jl->j_len +
14901 + 2) % SB_ONDISK_JOURNAL_SIZE(s),
14902 + jl->j_trans_id);
14903 + if (err)
14904 + reiserfs_abort(s, -EIO,
14905 + "Write error while updating journal header in %s",
14906 + __func__);
14908 + remove_all_from_journal_list(s, jl, 0);
14909 + list_del_init(&jl->j_list);
14910 + journal->j_num_lists--;
14911 + del_from_work_list(s, jl);
14913 + if (journal->j_last_flush_id != 0 &&
14914 + (jl->j_trans_id - journal->j_last_flush_id) != 1) {
14915 + reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu",
14916 + journal->j_last_flush_id, jl->j_trans_id);
14918 + journal->j_last_flush_id = jl->j_trans_id;
14920 + /*
14921 + * not strictly required since we are freeing the list, but it should
14922 + * help find code using dead lists later on
14923 + */
14924 + jl->j_len = 0;
14925 + atomic_set(&jl->j_nonzerolen, 0);
14926 + jl->j_start = 0;
14927 + jl->j_realblock = NULL;
14928 + jl->j_commit_bh = NULL;
14929 + jl->j_trans_id = 0;
14930 + jl->j_state = 0;
14931 + put_journal_list(s, jl);
14932 + if (flushall)
14933 + mutex_unlock(&journal->j_flush_mutex);
14934 + return err;
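+/*
+ * Locking sketch for the flushall contract above (a hypothetical call
+ * sequence): the outermost caller flushes with the mutex taken for it,
+ * while recursive flushes reuse the already-held mutex:
+ *
+ *	flush_journal_list(s, jl, 1);        // takes j_flush_mutex
+ *	  -> flush_older_journal_lists(s, jl)
+ *	       -> flush_journal_list(s, other_jl, 0);  // mutex already held
+ */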
14937 +static int write_one_transaction(struct super_block *s,
14938 + struct reiserfs_journal_list *jl,
14939 + struct buffer_chunk *chunk)
14941 + struct reiserfs_journal_cnode *cn;
14942 + int ret = 0;
14944 + jl->j_state |= LIST_TOUCHED;
14945 + del_from_work_list(s, jl);
14946 + if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
14947 + return 0;
14950 + cn = jl->j_realblock;
14951 + while (cn) {
14952 + /*
14953 + * if the blocknr == 0, this has been cleared from the hash,
14954 + * skip it
14955 + */
14956 + if (cn->blocknr == 0) {
14957 + goto next;
14959 + if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
14960 + struct buffer_head *tmp_bh;
14961 + /*
14962 + * we can race against journal_mark_freed when we try
14963 + * to lock_buffer(cn->bh), so we have to inc the buffer
14964 + * count, and recheck things after locking
14965 + */
14966 + tmp_bh = cn->bh;
14967 + get_bh(tmp_bh);
14968 + lock_buffer(tmp_bh);
14969 + if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
14970 + if (!buffer_journal_dirty(tmp_bh) ||
14971 + buffer_journal_prepared(tmp_bh))
14972 + BUG();
14973 + add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
14974 + ret++;
14975 + } else {
14976 + /* note, cn->bh might be null now */
14977 + unlock_buffer(tmp_bh);
14979 + put_bh(tmp_bh);
14981 +next:
14982 + cn = cn->next;
14983 + cond_resched();
14985 + return ret;
14988 +/* used by flush_commit_list */
14989 +static void dirty_one_transaction(struct super_block *s,
14990 + struct reiserfs_journal_list *jl)
14992 + struct reiserfs_journal_cnode *cn;
14993 + struct reiserfs_journal_list *pjl;
14995 + jl->j_state |= LIST_DIRTY;
14996 + cn = jl->j_realblock;
14997 + while (cn) {
14998 + /*
14999 + * look for a more recent transaction that logged this
15000 + * buffer. Only the most recent transaction with a buffer in
15001 + * it is allowed to send that buffer to disk
15002 + */
15003 + pjl = find_newer_jl_for_cn(cn);
15004 + if (!pjl && cn->blocknr && cn->bh
15005 + && buffer_journal_dirty(cn->bh)) {
15006 + BUG_ON(!can_dirty(cn));
15007 + /*
15008 + * if the buffer is prepared, it will either be logged
15009 + * or restored. If restored, we need to make sure
15010 + * it actually gets marked dirty
15011 + */
15012 + clear_buffer_journal_new(cn->bh);
15013 + if (buffer_journal_prepared(cn->bh)) {
15014 + set_buffer_journal_restore_dirty(cn->bh);
15015 + } else {
15016 + set_buffer_journal_test(cn->bh);
15017 + mark_buffer_dirty(cn->bh);
15020 + cn = cn->next;
15024 +static int kupdate_transactions(struct super_block *s,
15025 + struct reiserfs_journal_list *jl,
15026 + struct reiserfs_journal_list **next_jl,
15027 + unsigned int *next_trans_id,
15028 + int num_blocks, int num_trans)
15030 + int ret = 0;
15031 + int written = 0;
15032 + int transactions_flushed = 0;
15033 + unsigned int orig_trans_id = jl->j_trans_id;
15034 + struct buffer_chunk chunk;
15035 + struct list_head *entry;
15036 + struct reiserfs_journal *journal = SB_JOURNAL(s);
15037 + chunk.nr = 0;
15039 + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
15040 + if (!journal_list_still_alive(s, orig_trans_id)) {
15041 + goto done;
15044 + /*
15045 + * we've got j_flush_mutex held, nobody is going to delete any
15046 + * of these lists out from underneath us
15047 + */
15048 + while ((num_trans && transactions_flushed < num_trans) ||
15049 + (!num_trans && written < num_blocks)) {
15051 + if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
15052 + atomic_read(&jl->j_commit_left)
15053 + || !(jl->j_state & LIST_DIRTY)) {
15054 + del_from_work_list(s, jl);
15055 + break;
15057 + ret = write_one_transaction(s, jl, &chunk);
15059 + if (ret < 0)
15060 + goto done;
15061 + transactions_flushed++;
15062 + written += ret;
15063 + entry = jl->j_list.next;
15065 + /* did we wrap? */
15066 + if (entry == &journal->j_journal_list) {
15067 + break;
15069 + jl = JOURNAL_LIST_ENTRY(entry);
15071 + /* don't bother with older transactions */
15072 + if (jl->j_trans_id <= orig_trans_id)
15073 + break;
15075 + if (chunk.nr) {
15076 + write_chunk(&chunk);
15079 +done:
15080 + mutex_unlock(&journal->j_flush_mutex);
15081 + return ret;
15085 + * o_sync and fsync heavy applications tend to use
15086 + * all the journal list slots with tiny transactions. These
15087 + * trigger lots and lots of calls to update the header block, which
15088 + * adds seeks and slows things down.
15090 + * This function tries to clear out a large chunk of the journal lists
15091 + * at once, which makes everything faster since only the newest journal
15092 + * list updates the header block
15093 + */
15094 +static int flush_used_journal_lists(struct super_block *s,
15095 + struct reiserfs_journal_list *jl)
15097 + unsigned long len = 0;
15098 + unsigned long cur_len;
15099 + int i;
15100 + int limit = 256;
15101 + struct reiserfs_journal_list *tjl;
15102 + struct reiserfs_journal_list *flush_jl;
15103 + unsigned int trans_id;
15104 + struct reiserfs_journal *journal = SB_JOURNAL(s);
15106 + flush_jl = tjl = jl;
15108 + /* in data logging mode, try harder to flush a lot of blocks */
15109 + if (reiserfs_data_log(s))
15110 + limit = 1024;
15111 + /* flush for 256 transactions or limit blocks, whichever comes first */
15112 + for (i = 0; i < 256 && len < limit; i++) {
15113 + if (atomic_read(&tjl->j_commit_left) ||
15114 + tjl->j_trans_id < jl->j_trans_id) {
15115 + break;
15117 + cur_len = atomic_read(&tjl->j_nonzerolen);
15118 + if (cur_len > 0) {
15119 + tjl->j_state &= ~LIST_TOUCHED;
15121 + len += cur_len;
15122 + flush_jl = tjl;
15123 + if (tjl->j_list.next == &journal->j_journal_list)
15124 + break;
15125 + tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
15127 + get_journal_list(jl);
15128 + get_journal_list(flush_jl);
15130 + /*
15131 + * try to find a group of blocks we can flush across all the
15132 + * transactions, but only bother if we've actually spanned
15133 + * across multiple lists
15134 + */
15135 + if (flush_jl != jl)
15136 + kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
15138 + flush_journal_list(s, flush_jl, 1);
15139 + put_journal_list(s, flush_jl);
15140 + put_journal_list(s, jl);
15141 + return 0;
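+/*
+ * Worked example for the batching above (assumed numbers): with data
+ * logging enabled the block limit is 1024, so the scan walks forward
+ * from jl through at most 256 newer lists, summing their j_nonzerolen,
+ * and stops once 1024 blocks are gathered; everything up to flush_jl is
+ * then flushed through a single flush_journal_list() call, updating the
+ * header block once instead of once per transaction.
+ */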
15145 + * removes any nodes in table with a matching block and superblock.
15146 + * only touches the hnext and hprev pointers.
15147 + */
15148 +static void remove_journal_hash(struct super_block *sb,
15149 + struct reiserfs_journal_cnode **table,
15150 + struct reiserfs_journal_list *jl,
15151 + unsigned long block, int remove_freed)
15153 + struct reiserfs_journal_cnode *cur;
15154 + struct reiserfs_journal_cnode **head;
15156 + head = &(journal_hash(table, sb, block));
15157 + if (!head) {
15158 + return;
15160 + cur = *head;
15161 + while (cur) {
15162 + if (cur->blocknr == block && cur->sb == sb
15163 + && (jl == NULL || jl == cur->jlist)
15164 + && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) {
15165 + if (cur->hnext) {
15166 + cur->hnext->hprev = cur->hprev;
15168 + if (cur->hprev) {
15169 + cur->hprev->hnext = cur->hnext;
15170 + } else {
15171 + *head = cur->hnext;
15173 + cur->blocknr = 0;
15174 + cur->sb = NULL;
15175 + cur->state = 0;
15176 + /*
15177 + * anybody who clears the cur->bh will also
15178 + * dec the nonzerolen
15179 + */
15180 + if (cur->bh && cur->jlist)
15181 + atomic_dec(&cur->jlist->j_nonzerolen);
15182 + cur->bh = NULL;
15183 + cur->jlist = NULL;
15185 + cur = cur->hnext;
15189 +static void free_journal_ram(struct super_block *sb)
15191 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
15192 + kfree(journal->j_current_jl);
15193 + journal->j_num_lists--;
15195 + vfree(journal->j_cnode_free_orig);
15196 + free_list_bitmaps(sb, journal->j_list_bitmap);
15197 + free_bitmap_nodes(sb); /* must be after free_list_bitmaps */
15198 + if (journal->j_header_bh) {
15199 + brelse(journal->j_header_bh);
15201 + /*
15202 + * j_header_bh is on the journal dev, make sure
15203 + * not to release the journal dev until we brelse j_header_bh
15204 + */
15205 + release_journal_dev(journal);
15206 + vfree(journal);
15210 + * call on unmount. Only set error to 1 if you haven't made your way out
15211 + * of read_super() yet. Any other caller must keep error at 0.
15212 + */
15213 +static int do_journal_release(struct reiserfs_transaction_handle *th,
15214 + struct super_block *sb, int error)
15216 + struct reiserfs_transaction_handle myth;
15217 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
15219 + /*
15220 + * we only want to flush out transactions if we were
15221 + * called with error == 0
15222 + */
15223 + if (!error && !sb_rdonly(sb)) {
15224 + /* end the current trans */
15225 + BUG_ON(!th->t_trans_id);
15226 + do_journal_end(th, FLUSH_ALL);
15228 + /*
15229 + * make sure something gets logged to force
15230 + * our way into the flush code
15231 + */
15232 + if (!journal_join(&myth, sb)) {
15233 + reiserfs_prepare_for_journal(sb,
15234 + SB_BUFFER_WITH_SB(sb),
15235 + 1);
15236 + journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
15237 + do_journal_end(&myth, FLUSH_ALL);
15241 + /* this also catches errors during the do_journal_end above */
15242 + if (!error && reiserfs_is_journal_aborted(journal)) {
15243 + memset(&myth, 0, sizeof(myth));
15244 + if (!journal_join_abort(&myth, sb)) {
15245 + reiserfs_prepare_for_journal(sb,
15246 + SB_BUFFER_WITH_SB(sb),
15247 + 1);
15248 + journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
15249 + do_journal_end(&myth, FLUSH_ALL);
15254 + /*
15255 + * We must release the write lock here because
15256 + * the workqueue job (flush_async_commit) needs this lock
15257 + */
15258 + reiserfs_write_unlock(sb);
15260 + /*
15261 + * Cancel flushing of old commits. Note that neither of these works
15262 + * will be requeued because superblock is being shutdown and doesn't
15263 + * have SB_ACTIVE set.
15264 + */
15265 + reiserfs_cancel_old_flush(sb);
15266 + /* wait for all commits to finish */
15267 + cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
15269 + free_journal_ram(sb);
15271 + reiserfs_write_lock(sb);
15273 + return 0;
15276 +/* call on unmount. flush all journal trans, release all alloc'd ram */
15277 +int journal_release(struct reiserfs_transaction_handle *th,
15278 + struct super_block *sb)
15280 + return do_journal_release(th, sb, 0);
15283 +/* only call from an error condition inside reiserfs_read_super! */
15284 +int journal_release_error(struct reiserfs_transaction_handle *th,
15285 + struct super_block *sb)
15287 + return do_journal_release(th, sb, 1);
15291 + * compares description block with commit block.
15292 + * returns 1 if they differ, 0 if they are the same
15293 + */
15294 +static int journal_compare_desc_commit(struct super_block *sb,
15295 + struct reiserfs_journal_desc *desc,
15296 + struct reiserfs_journal_commit *commit)
15298 + if (get_commit_trans_id(commit) != get_desc_trans_id(desc) ||
15299 + get_commit_trans_len(commit) != get_desc_trans_len(desc) ||
15300 + get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max ||
15301 + get_commit_trans_len(commit) <= 0) {
15302 + return 1;
15304 + return 0;
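+/*
+ * On-disk layout assumed by the check above (sketch): a transaction of
+ * length n starting at journal offset o occupies, modulo the journal
+ * size,
+ *
+ *	o           description block (trans_id, len, mount_id)
+ *	o+1 .. o+n  the n logged blocks
+ *	o+n+1       commit block (must echo the desc trans_id and len)
+ *
+ * so replay can stop at the first desc/commit mismatch it finds.
+ */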
15308 + * returns 0 if it did not find a description block
15309 + * returns -1 if it found a corrupt commit block
15310 + * returns 1 if both desc and commit were valid
15311 + * NOTE: only called during fs mount
15312 + */
15313 +static int journal_transaction_is_valid(struct super_block *sb,
15314 + struct buffer_head *d_bh,
15315 + unsigned int *oldest_invalid_trans_id,
15316 + unsigned long *newest_mount_id)
15318 + struct reiserfs_journal_desc *desc;
15319 + struct reiserfs_journal_commit *commit;
15320 + struct buffer_head *c_bh;
15321 + unsigned long offset;
15323 + if (!d_bh)
15324 + return 0;
15326 + desc = (struct reiserfs_journal_desc *)d_bh->b_data;
15327 + if (get_desc_trans_len(desc) > 0
15328 + && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) {
15329 + if (oldest_invalid_trans_id && *oldest_invalid_trans_id
15330 + && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
15331 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15332 + "journal-986: transaction "
15333 + "is valid returning because trans_id %d is greater than "
15334 + "oldest_invalid %lu",
15335 + get_desc_trans_id(desc),
15336 + *oldest_invalid_trans_id);
15337 + return 0;
15339 + if (newest_mount_id
15340 + && *newest_mount_id > get_desc_mount_id(desc)) {
15341 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15342 + "journal-1087: transaction "
15343 + "is valid returning because mount_id %d is less than "
15344 + "newest_mount_id %lu",
15345 + get_desc_mount_id(desc),
15346 + *newest_mount_id);
15347 + return -1;
15349 + if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) {
15350 + reiserfs_warning(sb, "journal-2018",
15351 + "Bad transaction length %d "
15352 + "encountered, ignoring transaction",
15353 + get_desc_trans_len(desc));
15354 + return -1;
15356 + offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
15358 + /*
15359 + * ok, we have a journal description block,
15360 + * let's see if the transaction was valid
15361 + */
15362 + c_bh =
15363 + journal_bread(sb,
15364 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15365 + ((offset + get_desc_trans_len(desc) +
15366 + 1) % SB_ONDISK_JOURNAL_SIZE(sb)));
15367 + if (!c_bh)
15368 + return 0;
15369 + commit = (struct reiserfs_journal_commit *)c_bh->b_data;
15370 + if (journal_compare_desc_commit(sb, desc, commit)) {
15371 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15372 + "journal_transaction_is_valid, commit offset %ld had bad "
15373 + "time %d or length %d",
15374 + c_bh->b_blocknr -
15375 + SB_ONDISK_JOURNAL_1st_BLOCK(sb),
15376 + get_commit_trans_id(commit),
15377 + get_commit_trans_len(commit));
15378 + brelse(c_bh);
15379 + if (oldest_invalid_trans_id) {
15380 + *oldest_invalid_trans_id =
15381 + get_desc_trans_id(desc);
15382 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15383 + "journal-1004: "
15384 + "transaction_is_valid setting oldest invalid trans_id "
15385 + "to %d",
15386 + get_desc_trans_id(desc));
15388 + return -1;
15390 + brelse(c_bh);
15391 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15392 + "journal-1006: found valid "
15393 + "transaction start offset %llu, len %d id %d",
15394 + d_bh->b_blocknr -
15395 + SB_ONDISK_JOURNAL_1st_BLOCK(sb),
15396 + get_desc_trans_len(desc),
15397 + get_desc_trans_id(desc));
15398 + return 1;
15399 + } else {
15400 + return 0;
15404 +static void brelse_array(struct buffer_head **heads, int num)
15406 + int i;
15407 + for (i = 0; i < num; i++) {
15408 + brelse(heads[i]);
15413 + * given the start block and values for the oldest acceptable transactions,
15414 + * this either reads in and replays a transaction, or returns because the
15415 + * transaction is invalid, or too old.
15416 + * NOTE: only called during fs mount
15417 + */
15418 +static int journal_read_transaction(struct super_block *sb,
15419 + unsigned long cur_dblock,
15420 + unsigned long oldest_start,
15421 + unsigned int oldest_trans_id,
15422 + unsigned long newest_mount_id)
15424 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
15425 + struct reiserfs_journal_desc *desc;
15426 + struct reiserfs_journal_commit *commit;
15427 + unsigned int trans_id = 0;
15428 + struct buffer_head *c_bh;
15429 + struct buffer_head *d_bh;
15430 + struct buffer_head **log_blocks = NULL;
15431 + struct buffer_head **real_blocks = NULL;
15432 + unsigned int trans_offset;
15433 + int i;
15434 + int trans_half;
15436 + d_bh = journal_bread(sb, cur_dblock);
15437 + if (!d_bh)
15438 + return 1;
15439 + desc = (struct reiserfs_journal_desc *)d_bh->b_data;
15440 + trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
15441 + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
15442 + "journal_read_transaction, offset %llu, len %d mount_id %d",
15443 + d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
15444 + get_desc_trans_len(desc), get_desc_mount_id(desc));
15445 + if (get_desc_trans_id(desc) < oldest_trans_id) {
15446 + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
15447 + "journal_read_trans skipping because %lu is too old",
15448 + cur_dblock -
15449 + SB_ONDISK_JOURNAL_1st_BLOCK(sb));
15450 + brelse(d_bh);
15451 + return 1;
15453 + if (get_desc_mount_id(desc) != newest_mount_id) {
15454 + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
15455 + "journal_read_trans skipping because %d is != "
15456 + "newest_mount_id %lu", get_desc_mount_id(desc),
15457 + newest_mount_id);
15458 + brelse(d_bh);
15459 + return 1;
15461 + c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15462 + ((trans_offset + get_desc_trans_len(desc) + 1) %
15463 + SB_ONDISK_JOURNAL_SIZE(sb)));
15464 + if (!c_bh) {
15465 + brelse(d_bh);
15466 + return 1;
15468 + commit = (struct reiserfs_journal_commit *)c_bh->b_data;
15469 + if (journal_compare_desc_commit(sb, desc, commit)) {
15470 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15471 + "journal_read_transaction, "
15472 + "commit offset %llu had bad time %d or length %d",
15473 + c_bh->b_blocknr -
15474 + SB_ONDISK_JOURNAL_1st_BLOCK(sb),
15475 + get_commit_trans_id(commit),
15476 + get_commit_trans_len(commit));
15477 + brelse(c_bh);
15478 + brelse(d_bh);
15479 + return 1;
15482 + if (bdev_read_only(sb->s_bdev)) {
15483 + reiserfs_warning(sb, "clm-2076",
15484 + "device is readonly, unable to replay log");
15485 + brelse(c_bh);
15486 + brelse(d_bh);
15487 + return -EROFS;
15490 + trans_id = get_desc_trans_id(desc);
15491 + /*
15492 + * now we know we've got a good transaction, and it was
15493 + * inside the valid time ranges
15494 + */
15495 + log_blocks = kmalloc_array(get_desc_trans_len(desc),
15496 + sizeof(struct buffer_head *),
15497 + GFP_NOFS);
15498 + real_blocks = kmalloc_array(get_desc_trans_len(desc),
15499 + sizeof(struct buffer_head *),
15500 + GFP_NOFS);
15501 + if (!log_blocks || !real_blocks) {
15502 + brelse(c_bh);
15503 + brelse(d_bh);
15504 + kfree(log_blocks);
15505 + kfree(real_blocks);
15506 + reiserfs_warning(sb, "journal-1169",
15507 + "kmalloc failed, unable to mount FS");
15508 + return -1;
15510 + /* get all the buffer heads */
15511 + trans_half = journal_trans_half(sb->s_blocksize);
15512 + for (i = 0; i < get_desc_trans_len(desc); i++) {
15513 + log_blocks[i] =
15514 + journal_getblk(sb,
15515 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15516 + (trans_offset + 1 +
15517 + i) % SB_ONDISK_JOURNAL_SIZE(sb));
15518 + if (i < trans_half) {
15519 + real_blocks[i] =
15520 + sb_getblk(sb,
15521 + le32_to_cpu(desc->j_realblock[i]));
15522 + } else {
15523 + real_blocks[i] =
15524 + sb_getblk(sb,
15525 + le32_to_cpu(commit->
15526 + j_realblock[i - trans_half]));
15528 + if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
15529 + reiserfs_warning(sb, "journal-1207",
15530 + "REPLAY FAILURE fsck required! "
15531 + "Block to replay is outside of "
15532 + "filesystem");
15533 + goto abort_replay;
15535 + /* make sure we don't try to replay onto log or reserved area */
15536 + if (is_block_in_log_or_reserved_area
15537 + (sb, real_blocks[i]->b_blocknr)) {
15538 + reiserfs_warning(sb, "journal-1204",
15539 + "REPLAY FAILURE fsck required! "
15540 + "Trying to replay onto a log block");
15541 +abort_replay:
15542 + brelse_array(log_blocks, i);
15543 + brelse_array(real_blocks, i);
15544 + brelse(c_bh);
15545 + brelse(d_bh);
15546 + kfree(log_blocks);
15547 + kfree(real_blocks);
15548 + return -1;
15551 + /* read in the log blocks, memcpy to the corresponding real block */
15552 + bh_read_batch(get_desc_trans_len(desc), log_blocks);
15553 + for (i = 0; i < get_desc_trans_len(desc); i++) {
15555 + wait_on_buffer(log_blocks[i]);
15556 + if (!buffer_uptodate(log_blocks[i])) {
15557 + reiserfs_warning(sb, "journal-1212",
15558 + "REPLAY FAILURE fsck required! "
15559 + "buffer write failed");
15560 + brelse_array(log_blocks + i,
15561 + get_desc_trans_len(desc) - i);
15562 + brelse_array(real_blocks, get_desc_trans_len(desc));
15563 + brelse(c_bh);
15564 + brelse(d_bh);
15565 + kfree(log_blocks);
15566 + kfree(real_blocks);
15567 + return -1;
15569 + memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
15570 + real_blocks[i]->b_size);
15571 + set_buffer_uptodate(real_blocks[i]);
15572 + brelse(log_blocks[i]);
15574 + /* flush out the real blocks */
15575 + for (i = 0; i < get_desc_trans_len(desc); i++) {
15576 + set_buffer_dirty(real_blocks[i]);
15577 + write_dirty_buffer(real_blocks[i], 0);
15579 + for (i = 0; i < get_desc_trans_len(desc); i++) {
15580 + wait_on_buffer(real_blocks[i]);
15581 + if (!buffer_uptodate(real_blocks[i])) {
15582 + reiserfs_warning(sb, "journal-1226",
15583 + "REPLAY FAILURE, fsck required! "
15584 + "buffer write failed");
15585 + brelse_array(real_blocks + i,
15586 + get_desc_trans_len(desc) - i);
15587 + brelse(c_bh);
15588 + brelse(d_bh);
15589 + kfree(log_blocks);
15590 + kfree(real_blocks);
15591 + return -1;
15593 + brelse(real_blocks[i]);
15595 + cur_dblock =
15596 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15597 + ((trans_offset + get_desc_trans_len(desc) +
15598 + 2) % SB_ONDISK_JOURNAL_SIZE(sb));
15599 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15600 + "journal-1095: setting journal " "start to offset %ld",
15601 + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
15603 + /*
15604 + * init starting values for the first transaction, in case
15605 + * this is the last transaction to be replayed.
15606 + */
15607 + journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
15608 + journal->j_last_flush_trans_id = trans_id;
15609 + journal->j_trans_id = trans_id + 1;
15610 + /* check for trans_id overflow */
15611 + if (journal->j_trans_id == 0)
15612 + journal->j_trans_id = 10;
15613 + brelse(c_bh);
15614 + brelse(d_bh);
15615 + kfree(log_blocks);
15616 + kfree(real_blocks);
15617 + return 0;
15621 + * This function reads blocks of bufsize size starting from block, up to
15622 + * max_block (but no more than BUFNR blocks at a time). This proved to improve
15623 + * mounting speed on self-rebuilding raid5 arrays at least.
15624 + * Right now it is only used from journal code. But later we might use it
15625 + * from other places.
15626 + * Note: Do not use journal_getblk/sb_getblk functions here!
15627 + */
15628 +static struct buffer_head *reiserfs_breada(struct block_device *dev,
15629 + b_blocknr_t block, int bufsize,
15630 + b_blocknr_t max_block)
15632 + struct buffer_head *bhlist[BUFNR];
15633 + unsigned int blocks = BUFNR;
15634 + struct buffer_head *bh;
15635 + int i, j;
15637 + bh = __getblk(dev, block, bufsize);
15638 + if (!bh || buffer_uptodate(bh))
15639 + return (bh);
15641 + if (block + BUFNR > max_block) {
15642 + blocks = max_block - block;
15644 + bhlist[0] = bh;
15645 + j = 1;
15646 + for (i = 1; i < blocks; i++) {
15647 + bh = __getblk(dev, block + i, bufsize);
15648 + if (!bh)
15649 + break;
15650 + if (buffer_uptodate(bh)) {
15651 + brelse(bh);
15652 + break;
15653 + } else
15654 + bhlist[j++] = bh;
15656 + bh = bhlist[0];
15657 + bh_read_nowait(bh, 0);
15658 + bh_readahead_batch(j - 1, &bhlist[1], 0);
15659 + for (i = 1; i < j; i++)
15660 + brelse(bhlist[i]);
15661 + wait_on_buffer(bh);
15662 + if (buffer_uptodate(bh))
15663 + return bh;
15664 + brelse(bh);
15665 + return NULL;
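+/*
+ * Usage sketch: the log scan in journal_read() below calls this helper
+ * for each candidate description block, so on a cold cache a single
+ * call batches up to BUFNR contiguous log reads instead of issuing one
+ * synchronous read per block.
+ */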
15669 + * read and replay the log
15670 + * on a clean unmount, the journal header's next unflushed pointer will
15671 + * point to an invalid transaction. This tests that before finding all the
15672 + * transactions in the log, which makes normal mount times fast.
15674 + * After a crash, this starts with the next unflushed transaction, and
15675 + * replays until it finds one too old, or invalid.
15677 + * On exit, it sets things up so the first transaction will work correctly.
15678 + * NOTE: only called during fs mount
15679 + */
15680 +static int journal_read(struct super_block *sb)
15682 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
15683 + struct reiserfs_journal_desc *desc;
15684 + unsigned int oldest_trans_id = 0;
15685 + unsigned int oldest_invalid_trans_id = 0;
15686 + time64_t start;
15687 + unsigned long oldest_start = 0;
15688 + unsigned long cur_dblock = 0;
15689 + unsigned long newest_mount_id = 9;
15690 + struct buffer_head *d_bh;
15691 + struct reiserfs_journal_header *jh;
15692 + int valid_journal_header = 0;
15693 + int replay_count = 0;
15694 + int continue_replay = 1;
15695 + int ret;
15697 + cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
15698 + reiserfs_info(sb, "checking transaction log (%pg)\n",
15699 + file_bdev(journal->j_bdev_file));
15700 + start = ktime_get_seconds();
15702 + /*
15703 + * step 1, read in the journal header block. Check the transaction
15704 + * it says is the first unflushed, and if that transaction is not
15705 + * valid, replay is done
15706 + */
15707 + journal->j_header_bh = journal_bread(sb,
15708 + SB_ONDISK_JOURNAL_1st_BLOCK(sb)
15709 + + SB_ONDISK_JOURNAL_SIZE(sb));
15710 + if (!journal->j_header_bh) {
15711 + return 1;
15713 + jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
15714 + if (le32_to_cpu(jh->j_first_unflushed_offset) <
15715 + SB_ONDISK_JOURNAL_SIZE(sb)
15716 + && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
15717 + oldest_start =
15718 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15719 + le32_to_cpu(jh->j_first_unflushed_offset);
15720 + oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
15721 + newest_mount_id = le32_to_cpu(jh->j_mount_id);
15722 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15723 + "journal-1153: found in "
15724 + "header: first_unflushed_offset %d, last_flushed_trans_id "
15725 + "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
15726 + le32_to_cpu(jh->j_last_flush_trans_id));
15727 + valid_journal_header = 1;
15729 + /*
15730 + * now, we try to read the first unflushed offset. If it
15731 + * is not valid, there is nothing more we can do, and it
15732 + * makes no sense to read through the whole log.
15733 + */
15734 + d_bh =
15735 + journal_bread(sb,
15736 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15737 + le32_to_cpu(jh->j_first_unflushed_offset));
15738 + ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL);
15739 + if (!ret) {
15740 + continue_replay = 0;
15742 + brelse(d_bh);
15743 + goto start_log_replay;
15746 + /*
15747 + * ok, there are transactions that need to be replayed. start
15748 + * with the first log block, find all the valid transactions, and
15749 + * pick out the oldest.
15750 + */
15751 + while (continue_replay
15752 + && cur_dblock <
15753 + (SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15754 + SB_ONDISK_JOURNAL_SIZE(sb))) {
15755 + /*
15756 + * Note that the block size of the primary fs device and
15757 + * the journal device must be the same
15758 + */
15759 + d_bh =
15760 + reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock,
15761 + sb->s_blocksize,
15762 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
15763 + SB_ONDISK_JOURNAL_SIZE(sb));
15764 + ret =
15765 + journal_transaction_is_valid(sb, d_bh,
15766 + &oldest_invalid_trans_id,
15767 + &newest_mount_id);
15768 + if (ret == 1) {
15769 + desc = (struct reiserfs_journal_desc *)d_bh->b_data;
15770 + if (oldest_start == 0) { /* init all oldest_ values */
15771 + oldest_trans_id = get_desc_trans_id(desc);
15772 + oldest_start = d_bh->b_blocknr;
15773 + newest_mount_id = get_desc_mount_id(desc);
15774 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15775 + "journal-1179: Setting "
15776 + "oldest_start to offset %llu, trans_id %lu",
15777 + oldest_start -
15778 + SB_ONDISK_JOURNAL_1st_BLOCK
15779 + (sb), oldest_trans_id);
15780 + } else if (oldest_trans_id > get_desc_trans_id(desc)) {
15781 + /* one we just read was older */
15782 + oldest_trans_id = get_desc_trans_id(desc);
15783 + oldest_start = d_bh->b_blocknr;
15784 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15785 + "journal-1180: Resetting "
15786 + "oldest_start to offset %lu, trans_id %lu",
15787 + oldest_start -
15788 + SB_ONDISK_JOURNAL_1st_BLOCK
15789 + (sb), oldest_trans_id);
15791 + if (newest_mount_id < get_desc_mount_id(desc)) {
15792 + newest_mount_id = get_desc_mount_id(desc);
15793 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15794 + "journal-1299: Setting "
15795 + "newest_mount_id to %d",
15796 + get_desc_mount_id(desc));
15798 + cur_dblock += get_desc_trans_len(desc) + 2;
15799 + } else {
15800 + cur_dblock++;
15802 + brelse(d_bh);
15805 +start_log_replay:
15806 + cur_dblock = oldest_start;
15807 + if (oldest_trans_id) {
15808 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15809 + "journal-1206: Starting replay "
15810 + "from offset %llu, trans_id %lu",
15811 + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
15812 + oldest_trans_id);
15815 + replay_count = 0;
15816 + while (continue_replay && oldest_trans_id > 0) {
15817 + ret =
15818 + journal_read_transaction(sb, cur_dblock, oldest_start,
15819 + oldest_trans_id, newest_mount_id);
15820 + if (ret < 0) {
15821 + return ret;
15822 + } else if (ret != 0) {
15823 + break;
15825 + cur_dblock =
15826 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start;
15827 + replay_count++;
15828 + if (cur_dblock == oldest_start)
15829 + break;
15832 + if (oldest_trans_id == 0) {
15833 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
15834 + "journal-1225: No valid " "transactions found");
15836 + /*
15837 + * j_start does not get set correctly if we don't replay any
15838 + * transactions. if we had a valid journal_header, set j_start
15839 + * to the first unflushed transaction value, copy the trans_id
15840 + * from the header
15841 + */
15842 + if (valid_journal_header && replay_count == 0) {
15843 + journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
15844 + journal->j_trans_id =
15845 + le32_to_cpu(jh->j_last_flush_trans_id) + 1;
15846 + /* check for trans_id overflow */
15847 + if (journal->j_trans_id == 0)
15848 + journal->j_trans_id = 10;
15849 + journal->j_last_flush_trans_id =
15850 + le32_to_cpu(jh->j_last_flush_trans_id);
15851 + journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
15852 + } else {
15853 + journal->j_mount_id = newest_mount_id + 1;
15855 + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
15856 + "newest_mount_id to %lu", journal->j_mount_id);
15857 + journal->j_first_unflushed_offset = journal->j_start;
15858 + if (replay_count > 0) {
15859 + reiserfs_info(sb,
15860 + "replayed %d transactions in %lu seconds\n",
15861 + replay_count, ktime_get_seconds() - start);
15863 + /* needed to satisfy the locking in _update_journal_header_block */
15864 + reiserfs_write_lock(sb);
15865 + if (!bdev_read_only(sb->s_bdev) &&
15866 + _update_journal_header_block(sb, journal->j_start,
15867 + journal->j_last_flush_trans_id)) {
15868 + reiserfs_write_unlock(sb);
15869 + /*
15870 + * replay failed, caller must call free_journal_ram and abort
15871 + * the mount
15872 + */
15873 + return -1;
15875 + reiserfs_write_unlock(sb);
15876 + return 0;
15879 +static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
15881 + struct reiserfs_journal_list *jl;
15882 + jl = kzalloc(sizeof(struct reiserfs_journal_list),
15883 + GFP_NOFS | __GFP_NOFAIL);
15884 + INIT_LIST_HEAD(&jl->j_list);
15885 + INIT_LIST_HEAD(&jl->j_working_list);
15886 + INIT_LIST_HEAD(&jl->j_tail_bh_list);
15887 + INIT_LIST_HEAD(&jl->j_bh_list);
15888 + mutex_init(&jl->j_commit_mutex);
15889 + SB_JOURNAL(s)->j_num_lists++;
15890 + get_journal_list(jl);
15891 + return jl;
15894 +static void journal_list_init(struct super_block *sb)
15896 + SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
15899 +static void release_journal_dev(struct reiserfs_journal *journal)
15901 + if (journal->j_bdev_file) {
15902 + bdev_fput(journal->j_bdev_file);
15903 + journal->j_bdev_file = NULL;
15907 +static int journal_init_dev(struct super_block *super,
15908 + struct reiserfs_journal *journal,
15909 + const char *jdev_name)
15911 + blk_mode_t blkdev_mode = BLK_OPEN_READ;
15912 + void *holder = journal;
15913 + int result;
15914 + dev_t jdev;
15916 + result = 0;
15918 + journal->j_bdev_file = NULL;
15919 + jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
15920 + new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
15922 + if (!bdev_read_only(super->s_bdev))
15923 + blkdev_mode |= BLK_OPEN_WRITE;
15925 + /* there is no "jdev" option and journal is on separate device */
15926 + if ((!jdev_name || !jdev_name[0])) {
15927 + if (jdev == super->s_dev)
15928 + holder = NULL;
15929 + journal->j_bdev_file = bdev_file_open_by_dev(jdev, blkdev_mode,
15930 + holder, NULL);
15931 + if (IS_ERR(journal->j_bdev_file)) {
15932 + result = PTR_ERR(journal->j_bdev_file);
15933 + journal->j_bdev_file = NULL;
15934 + reiserfs_warning(super, "sh-458",
15935 + "cannot init journal device unknown-block(%u,%u): %i",
15936 + MAJOR(jdev), MINOR(jdev), result);
15937 + return result;
15938 + } else if (jdev != super->s_dev)
15939 + set_blocksize(journal->j_bdev_file, super->s_blocksize);
15941 + return 0;
15944 + journal->j_bdev_file = bdev_file_open_by_path(jdev_name, blkdev_mode,
15945 + holder, NULL);
15946 + if (IS_ERR(journal->j_bdev_file)) {
15947 + result = PTR_ERR(journal->j_bdev_file);
15948 + journal->j_bdev_file = NULL;
15949 + reiserfs_warning(super, "sh-457",
15950 + "journal_init_dev: Cannot open '%s': %i",
15951 + jdev_name, result);
15952 + return result;
15955 + set_blocksize(journal->j_bdev_file, super->s_blocksize);
15956 + reiserfs_info(super,
15957 + "journal_init_dev: journal device: %pg\n",
15958 + file_bdev(journal->j_bdev_file));
15959 + return 0;
15963 + * When creating/tuning a file system, the user can assign some
15964 + * journal params within boundaries which depend on the ratio
15965 + * blocksize/standard_blocksize.
15967 + * For blocks >= standard_blocksize transaction size should
15968 + * be not less than JOURNAL_TRANS_MIN_DEFAULT, and not more
15969 + * than JOURNAL_TRANS_MAX_DEFAULT.
15971 + * For blocks < standard_blocksize these boundaries should be
15972 + * decreased proportionally.
15973 + */
15974 +#define REISERFS_STANDARD_BLKSIZE (4096)
15976 +static int check_advise_trans_params(struct super_block *sb,
15977 + struct reiserfs_journal *journal)
15979 + if (journal->j_trans_max) {
15980 + /* Non-default journal params. Do sanity check for them. */
15981 + int ratio = 1;
15982 + if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
15983 + ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
15985 + if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
15986 + journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio ||
15987 + SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max <
15988 + JOURNAL_MIN_RATIO) {
15989 + reiserfs_warning(sb, "sh-462",
15990 + "bad transaction max size (%u). "
15991 + "FSCK?", journal->j_trans_max);
15992 + return 1;
15994 + if (journal->j_max_batch != (journal->j_trans_max) *
15995 + JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) {
15996 + reiserfs_warning(sb, "sh-463",
15997 + "bad transaction max batch (%u). "
15998 + "FSCK?", journal->j_max_batch);
15999 + return 1;
16001 + } else {
16002 + /*
16003 + * Default journal params.
16004 + * The file system was created by an old version
16005 + * of mkreiserfs, so some fields contain zeros,
16006 + * and we need to advise proper values for them
16007 + */
16008 + if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
16009 + reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
16010 + sb->s_blocksize);
16011 + return 1;
16013 + journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
16014 + journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
16015 + journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
16017 + return 0;
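+/*
+ * Worked example (assuming the default constants, e.g. a trans_max
+ * default of 1024 and a trans_min default of 256): on a filesystem with
+ * a 1024-byte blocksize, ratio = 4096 / 1024 = 4, so a non-default
+ * j_trans_max must fall within [256 / 4, 1024 / 4] = [64, 256] and the
+ * journal must still hold at least JOURNAL_MIN_RATIO transactions of
+ * that size.
+ */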
16020 +/* must be called once on fs mount. calls journal_read for you */
16021 +int journal_init(struct super_block *sb, const char *j_dev_name,
16022 + int old_format, unsigned int commit_max_age)
16024 + int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2;
16025 + struct buffer_head *bhjh;
16026 + struct reiserfs_super_block *rs;
16027 + struct reiserfs_journal_header *jh;
16028 + struct reiserfs_journal *journal;
16029 + struct reiserfs_journal_list *jl;
16030 + int ret;
16032 + journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
16033 + if (!journal) {
16034 + reiserfs_warning(sb, "journal-1256",
16035 + "unable to get memory for journal structure");
16036 + return 1;
16038 + INIT_LIST_HEAD(&journal->j_bitmap_nodes);
16039 + INIT_LIST_HEAD(&journal->j_prealloc_list);
16040 + INIT_LIST_HEAD(&journal->j_working_list);
16041 + INIT_LIST_HEAD(&journal->j_journal_list);
16042 + journal->j_persistent_trans = 0;
16043 + if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
16044 + reiserfs_bmap_count(sb)))
16045 + goto free_and_return;
16047 + allocate_bitmap_nodes(sb);
16049 + /* reserved for journal area support */
16050 + SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ?
16051 + REISERFS_OLD_DISK_OFFSET_IN_BYTES
16052 + / sb->s_blocksize +
16053 + reiserfs_bmap_count(sb) +
16054 + 1 :
16055 + REISERFS_DISK_OFFSET_IN_BYTES /
16056 + sb->s_blocksize + 2);
16058 + /*
16059 + * Sanity check to see if the standard journal fits
16060 + * within the first bitmap (relevant for small blocksizes)
16061 + */
16062 + if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
16063 + (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
16064 + SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
16065 + reiserfs_warning(sb, "journal-1393",
16066 + "journal does not fit for area addressed "
16067 + "by first of bitmap blocks. It starts at "
16068 + "%u and its size is %u. Block size %ld",
16069 + SB_JOURNAL_1st_RESERVED_BLOCK(sb),
16070 + SB_ONDISK_JOURNAL_SIZE(sb),
16071 + sb->s_blocksize);
16072 + goto free_and_return;
16075 + /*
16076 + * Sanity check to see if journal first block is correct.
16077 + * If journal first block is invalid it can cause
16078 + * zeroing important superblock members.
16079 + */
16080 + if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
16081 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) < SB_JOURNAL_1st_RESERVED_BLOCK(sb)) {
16082 + reiserfs_warning(sb, "journal-1393",
16083 + "journal 1st super block is invalid: 1st reserved block %d, but actual 1st block is %d",
16084 + SB_JOURNAL_1st_RESERVED_BLOCK(sb),
16085 + SB_ONDISK_JOURNAL_1st_BLOCK(sb));
16086 + goto free_and_return;
16089 + if (journal_init_dev(sb, journal, j_dev_name) != 0) {
16090 + reiserfs_warning(sb, "sh-462",
16091 + "unable to initialize journal device");
16092 + goto free_and_return;
16095 + rs = SB_DISK_SUPER_BLOCK(sb);
16097 + /* read journal header */
16098 + bhjh = journal_bread(sb,
16099 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
16100 + SB_ONDISK_JOURNAL_SIZE(sb));
16101 + if (!bhjh) {
16102 + reiserfs_warning(sb, "sh-459",
16103 + "unable to read journal header");
16104 + goto free_and_return;
16106 + jh = (struct reiserfs_journal_header *)(bhjh->b_data);
16108 + /* make sure that the journal matches the super block */
16109 + if (is_reiserfs_jr(rs)
16110 + && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
16111 + sb_jp_journal_magic(rs))) {
16112 + reiserfs_warning(sb, "sh-460",
16113 + "journal header magic %x (device %pg) does "
16114 + "not match to magic found in super block %x",
16115 + jh->jh_journal.jp_journal_magic,
16116 + file_bdev(journal->j_bdev_file),
16117 + sb_jp_journal_magic(rs));
16118 + brelse(bhjh);
16119 + goto free_and_return;
16122 + journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max);
16123 + journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch);
16124 + journal->j_max_commit_age =
16125 + le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
16126 + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
16128 + if (check_advise_trans_params(sb, journal) != 0)
16129 + goto free_and_return;
16130 + journal->j_default_max_commit_age = journal->j_max_commit_age;
16132 + if (commit_max_age != 0) {
16133 + journal->j_max_commit_age = commit_max_age;
16134 + journal->j_max_trans_age = commit_max_age;
16137 + reiserfs_info(sb, "journal params: device %pg, size %u, "
16138 + "journal first block %u, max trans len %u, max batch %u, "
16139 + "max commit age %u, max trans age %u\n",
16140 + file_bdev(journal->j_bdev_file),
16141 + SB_ONDISK_JOURNAL_SIZE(sb),
16142 + SB_ONDISK_JOURNAL_1st_BLOCK(sb),
16143 + journal->j_trans_max,
16144 + journal->j_max_batch,
16145 + journal->j_max_commit_age, journal->j_max_trans_age);
16147 + brelse(bhjh);
16149 + journal->j_list_bitmap_index = 0;
16150 + journal_list_init(sb);
16152 + memset(journal->j_list_hash_table, 0,
16153 + JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
16155 + INIT_LIST_HEAD(&journal->j_dirty_buffers);
16156 + spin_lock_init(&journal->j_dirty_buffers_lock);
16158 + journal->j_start = 0;
16159 + journal->j_len = 0;
16160 + journal->j_len_alloc = 0;
16161 + atomic_set(&journal->j_wcount, 0);
16162 + atomic_set(&journal->j_async_throttle, 0);
16163 + journal->j_bcount = 0;
16164 + journal->j_trans_start_time = 0;
16165 + journal->j_last = NULL;
16166 + journal->j_first = NULL;
16167 + init_waitqueue_head(&journal->j_join_wait);
16168 + mutex_init(&journal->j_mutex);
16169 + mutex_init(&journal->j_flush_mutex);
16171 + journal->j_trans_id = 10;
16172 + journal->j_mount_id = 10;
16173 + journal->j_state = 0;
16174 + atomic_set(&journal->j_jlock, 0);
16175 + journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
16176 + journal->j_cnode_free_orig = journal->j_cnode_free_list;
16177 + journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
16178 + journal->j_cnode_used = 0;
16179 + journal->j_must_wait = 0;
16181 + if (journal->j_cnode_free == 0) {
16182 + reiserfs_warning(sb, "journal-2004", "Journal cnode memory "
16183 + "allocation failed (%ld bytes). Journal is "
16184 + "too large for available memory. Usually "
16185 + "this is due to a journal that is too large.",
16186 + sizeof (struct reiserfs_journal_cnode) * num_cnodes);
16187 + goto free_and_return;
16190 + init_journal_hash(sb);
16191 + jl = journal->j_current_jl;
16193 + /*
16194 + * get_list_bitmap() may call flush_commit_list() which
16195 + * requires the lock. Calling flush_commit_list() shouldn't happen
16196 + * this early but I like to be paranoid.
16197 + */
16198 + reiserfs_write_lock(sb);
16199 + jl->j_list_bitmap = get_list_bitmap(sb, jl);
16200 + reiserfs_write_unlock(sb);
16201 + if (!jl->j_list_bitmap) {
16202 + reiserfs_warning(sb, "journal-2005",
16203 + "get_list_bitmap failed for journal list 0");
16204 + goto free_and_return;
16207 + ret = journal_read(sb);
16208 + if (ret < 0) {
16209 + reiserfs_warning(sb, "reiserfs-2006",
16210 + "Replay Failure, unable to mount");
16211 + goto free_and_return;
16214 + INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
16215 + journal->j_work_sb = sb;
16216 + return 0;
16217 +free_and_return:
16218 + free_journal_ram(sb);
16219 + return 1;
16223 + * test for a polite end of the current transaction. Used by file_write,
16224 + * and should be used by delete to make sure they don't write more than
16225 + * can fit inside a single transaction
16226 + */
16227 +int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
16228 + int new_alloc)
16230 + struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
16231 + time64_t now = ktime_get_seconds();
16232 + /* cannot restart while nested */
16233 + BUG_ON(!th->t_trans_id);
16234 + if (th->t_refcount > 1)
16235 + return 0;
16236 + if (journal->j_must_wait > 0 ||
16237 + (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
16238 + atomic_read(&journal->j_jlock) ||
16239 + (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
16240 + journal->j_cnode_free < (journal->j_trans_max * 3)) {
16241 + return 1;
16244 + journal->j_len_alloc += new_alloc;
16245 + th->t_blocks_allocated += new_alloc;
16246 + return 0;
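+/*
+ * Hypothetical caller pattern (a sketch; the reiserfs write and delete
+ * paths do the equivalent): politely restart the transaction rather
+ * than let it overflow:
+ *
+ *	if (journal_transaction_should_end(th, new_alloc)) {
+ *		retval = journal_end(th);
+ *		if (!retval)
+ *			retval = journal_begin(th, sb, new_alloc);
+ *	}
+ */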
16249 +/* this must be called inside a transaction */
16250 +void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
16252 + struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
16253 + BUG_ON(!th->t_trans_id);
16254 + journal->j_must_wait = 1;
16255 + set_bit(J_WRITERS_BLOCKED, &journal->j_state);
16256 + return;
16259 +/* this must be called without a transaction started */
16260 +void reiserfs_allow_writes(struct super_block *s)
16262 + struct reiserfs_journal *journal = SB_JOURNAL(s);
16263 + clear_bit(J_WRITERS_BLOCKED, &journal->j_state);
16264 + wake_up(&journal->j_join_wait);
16267 +/* this must be called without a transaction started */
16268 +void reiserfs_wait_on_write_block(struct super_block *s)
16270 + struct reiserfs_journal *journal = SB_JOURNAL(s);
16271 + wait_event(journal->j_join_wait,
16272 + !test_bit(J_WRITERS_BLOCKED, &journal->j_state));
16275 +static void queue_log_writer(struct super_block *s)
16277 + wait_queue_entry_t wait;
16278 + struct reiserfs_journal *journal = SB_JOURNAL(s);
16279 + set_bit(J_WRITERS_QUEUED, &journal->j_state);
16281 + /*
16282 + * we don't want to use wait_event here because
16283 + * we only want to wait once.
16284 + */
16285 + init_waitqueue_entry(&wait, current);
16286 + add_wait_queue(&journal->j_join_wait, &wait);
16287 + set_current_state(TASK_UNINTERRUPTIBLE);
16288 + if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
16289 + int depth = reiserfs_write_unlock_nested(s);
16290 + schedule();
16291 + reiserfs_write_lock_nested(s, depth);
16293 + __set_current_state(TASK_RUNNING);
16294 + remove_wait_queue(&journal->j_join_wait, &wait);
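+/*
+ * Note on the open-coded wait above: wait_event() would loop until the
+ * condition became true, but a queued writer only needs to back off
+ * once; it sleeps at most one time and then rechecks the journal state
+ * itself, so a single wake_up() from wake_queued_writers() suffices.
+ */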
16297 +static void wake_queued_writers(struct super_block *s)
16299 + struct reiserfs_journal *journal = SB_JOURNAL(s);
16300 + if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))
16301 + wake_up(&journal->j_join_wait);
16304 +static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
16306 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16307 + unsigned long bcount = journal->j_bcount;
16308 + while (1) {
16309 + int depth;
16311 + depth = reiserfs_write_unlock_nested(sb);
16312 + schedule_timeout_uninterruptible(1);
16313 + reiserfs_write_lock_nested(sb, depth);
16315 + journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
16316 + while ((atomic_read(&journal->j_wcount) > 0 ||
16317 + atomic_read(&journal->j_jlock)) &&
16318 + journal->j_trans_id == trans_id) {
16319 + queue_log_writer(sb);
16321 + if (journal->j_trans_id != trans_id)
16322 + break;
16323 + if (bcount == journal->j_bcount)
16324 + break;
16325 + bcount = journal->j_bcount;
16330 + * join == true if you must join an existing transaction.
16331 + * join == false if you can deal with waiting for others to finish
16333 + * this will block until the transaction is joinable. send the number of
16334 + * blocks you expect to use in nblocks.
16336 +static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
16337 + struct super_block *sb, unsigned long nblocks,
16338 + int join)
16340 + time64_t now = ktime_get_seconds();
16341 + unsigned int old_trans_id;
16342 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16343 + struct reiserfs_transaction_handle myth;
16344 + int retval;
16345 + int depth;
16347 + reiserfs_check_lock_depth(sb, "journal_begin");
16348 + BUG_ON(nblocks > journal->j_trans_max);
16350 + PROC_INFO_INC(sb, journal.journal_being);
16351 + /* set here for journal_join */
16352 + th->t_refcount = 1;
16353 + th->t_super = sb;
16355 +relock:
16356 + lock_journal(sb);
16357 + if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
16358 + unlock_journal(sb);
16359 + retval = journal->j_errno;
16360 + goto out_fail;
16362 + journal->j_bcount++;
16364 + if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
16365 + unlock_journal(sb);
16366 + depth = reiserfs_write_unlock_nested(sb);
16367 + reiserfs_wait_on_write_block(sb);
16368 + reiserfs_write_lock_nested(sb, depth);
16369 + PROC_INFO_INC(sb, journal.journal_relock_writers);
16370 + goto relock;
16372 + now = ktime_get_seconds();
16374 + /*
16375 + * if there is no room in the journal OR
16376 + * if this transaction is too old, and we weren't called joinable,
16378 + * wait for it to finish before beginning. We don't sleep if there
16379 + * aren't other writers.
16379 + */
16381 + if ((!join && journal->j_must_wait > 0) ||
16382 + (!join
16383 + && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch)
16384 + || (!join && atomic_read(&journal->j_wcount) > 0
16385 + && journal->j_trans_start_time > 0
16386 + && (now - journal->j_trans_start_time) >
16387 + journal->j_max_trans_age) || (!join
16388 + && atomic_read(&journal->j_jlock))
16389 + || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
16391 + old_trans_id = journal->j_trans_id;
16392 + /* allow others to finish this transaction */
16393 + unlock_journal(sb);
16395 + if (!join && (journal->j_len_alloc + nblocks + 2) >=
16396 + journal->j_max_batch &&
16397 + ((journal->j_len + nblocks + 2) * 100) <
16398 + (journal->j_len_alloc * 75)) {
16399 + if (atomic_read(&journal->j_wcount) > 10) {
16400 + queue_log_writer(sb);
16401 + goto relock;
16404 + /*
16405 + * don't mess with joining the transaction if all we
16406 + * have to do is wait for someone else to do a commit
16407 + */
16408 + if (atomic_read(&journal->j_jlock)) {
16409 + while (journal->j_trans_id == old_trans_id &&
16410 + atomic_read(&journal->j_jlock)) {
16411 + queue_log_writer(sb);
16413 + goto relock;
16415 + retval = journal_join(&myth, sb);
16416 + if (retval)
16417 + goto out_fail;
16419 + /* someone might have ended the transaction while we joined */
16420 + if (old_trans_id != journal->j_trans_id) {
16421 + retval = do_journal_end(&myth, 0);
16422 + } else {
16423 + retval = do_journal_end(&myth, COMMIT_NOW);
16426 + if (retval)
16427 + goto out_fail;
16429 + PROC_INFO_INC(sb, journal.journal_relock_wcount);
16430 + goto relock;
16432 + /* we are the first writer, set trans_id */
16433 + if (journal->j_trans_start_time == 0) {
16434 + journal->j_trans_start_time = ktime_get_seconds();
16436 + atomic_inc(&journal->j_wcount);
16437 + journal->j_len_alloc += nblocks;
16438 + th->t_blocks_logged = 0;
16439 + th->t_blocks_allocated = nblocks;
16440 + th->t_trans_id = journal->j_trans_id;
16441 + unlock_journal(sb);
16442 + INIT_LIST_HEAD(&th->t_list);
16443 + return 0;
16445 +out_fail:
16446 + memset(th, 0, sizeof(*th));
16447 + /*
16448 + * Re-set th->t_super, so we can properly keep track of how many
16449 + * persistent transactions there are. We need to do this so if this
16450 + * call is part of a failed restart_transaction, we can free it later
16451 + */
16452 + th->t_super = sb;
16453 + return retval;
16456 +struct reiserfs_transaction_handle *
16457 +reiserfs_persistent_transaction(struct super_block *s, int nblocks)
16461 + int ret;
16462 + struct reiserfs_transaction_handle *th;
16464 + /*
16465 + * if we're nesting into an existing transaction, it will be
16466 + * persistent on its own
16467 + */
16468 + if (reiserfs_transaction_running(s)) {
16469 + th = current->journal_info;
16470 + th->t_refcount++;
16471 + BUG_ON(th->t_refcount < 2);
16473 + return th;
16475 + th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
16476 + if (!th)
16477 + return NULL;
16478 + ret = journal_begin(th, s, nblocks);
16479 + if (ret) {
16480 + kfree(th);
16481 + return NULL;
16484 + SB_JOURNAL(s)->j_persistent_trans++;
16485 + return th;
16488 +int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
16490 + struct super_block *s = th->t_super;
16491 + int ret = 0;
16492 + if (th->t_trans_id)
16493 + ret = journal_end(th);
16494 + else
16495 + ret = -EIO;
16496 + if (th->t_refcount == 0) {
16497 + SB_JOURNAL(s)->j_persistent_trans--;
16498 + kfree(th);
16500 + return ret;
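
A minimal usage sketch of the persistent-transaction pair above, not part of the patch itself; the buffer bh and the 10-block reservation are illustrative assumptions, and NULL from reiserfs_persistent_transaction() means the handle could not be started:

    struct reiserfs_transaction_handle *th;

    th = reiserfs_persistent_transaction(s, 10);    /* reserve 10 blocks */
    if (!th)
            return -ENOMEM;                         /* journal_begin failed */
    reiserfs_prepare_for_journal(s, bh, 1);         /* clean bh and lock it */
    journal_mark_dirty(th, bh);                     /* log bh in this trans */
    return reiserfs_end_persistent_transaction(th);
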
16503 +static int journal_join(struct reiserfs_transaction_handle *th,
16504 + struct super_block *sb)
16506 + struct reiserfs_transaction_handle *cur_th = current->journal_info;
16508 + /*
16509 + * this keeps do_journal_end from NULLing out the
16510 + * current->journal_info pointer
16511 + */
16512 + th->t_handle_save = cur_th;
16513 + BUG_ON(cur_th && cur_th->t_refcount > 1);
16514 + return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN);
16517 +int journal_join_abort(struct reiserfs_transaction_handle *th,
16518 + struct super_block *sb)
16520 + struct reiserfs_transaction_handle *cur_th = current->journal_info;
16522 + /*
16523 + * this keeps do_journal_end from NULLing out the
16524 + * current->journal_info pointer
16525 + */
16526 + th->t_handle_save = cur_th;
16527 + BUG_ON(cur_th && cur_th->t_refcount > 1);
16528 + return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT);
16531 +int journal_begin(struct reiserfs_transaction_handle *th,
16532 + struct super_block *sb, unsigned long nblocks)
16534 + struct reiserfs_transaction_handle *cur_th = current->journal_info;
16535 + int ret;
16537 + th->t_handle_save = NULL;
16538 + if (cur_th) {
16539 + /* we are nesting into the current transaction */
16540 + if (cur_th->t_super == sb) {
16541 + BUG_ON(!cur_th->t_refcount);
16542 + cur_th->t_refcount++;
16543 + memcpy(th, cur_th, sizeof(*th));
16544 + if (th->t_refcount <= 1)
16545 + reiserfs_warning(sb, "reiserfs-2005",
16546 + "BAD: refcount <= 1, but "
16547 + "journal_info != 0");
16548 + return 0;
16549 + } else {
16550 + /*
16551 + * we've ended up with a handle from a different
16552 + * filesystem. save it and restore on journal_end.
16553 + * This should never really happen...
16554 + */
16555 + reiserfs_warning(sb, "clm-2100",
16556 + "nesting info a different FS");
16557 + th->t_handle_save = current->journal_info;
16558 + current->journal_info = th;
16560 + } else {
16561 + current->journal_info = th;
16563 + ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
16564 + BUG_ON(current->journal_info != th);
16566 + /*
16567 + * I guess this boils down to being the reciprocal of clm-2100 above.
16568 + * If do_journal_begin_r fails, we need to put it back, since
16569 + * journal_end won't be called to do it. */
16570 + if (ret)
16571 + current->journal_info = th->t_handle_save;
16572 + else
16573 + BUG_ON(!th->t_refcount);
16575 + return ret;
16579 + * puts bh into the current transaction. If it was already there, it
16580 + * removes the old pointers from the hash and puts new ones in (to make
16581 + * sure replay happens in the right order).
16583 + * if it was dirty, cleans it and files it onto the clean list. I can't
16584 + * let it be dirty again until the transaction is committed.
16586 + * if j_len is bigger than j_len_alloc, it pushes j_len_alloc to j_len + JOURNAL_PER_BALANCE_CNT.
16587 + */
16588 +int journal_mark_dirty(struct reiserfs_transaction_handle *th,
16589 + struct buffer_head *bh)
16591 + struct super_block *sb = th->t_super;
16592 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16593 + struct reiserfs_journal_cnode *cn = NULL;
16594 + int count_already_incd = 0;
16595 + int prepared = 0;
16596 + BUG_ON(!th->t_trans_id);
16598 + PROC_INFO_INC(sb, journal.mark_dirty);
16599 + if (th->t_trans_id != journal->j_trans_id) {
16600 + reiserfs_panic(th->t_super, "journal-1577",
16601 + "handle trans id %ld != current trans id %ld",
16602 + th->t_trans_id, journal->j_trans_id);
16605 + prepared = test_clear_buffer_journal_prepared(bh);
16606 + clear_buffer_journal_restore_dirty(bh);
16607 + /* already in this transaction, we are done */
16608 + if (buffer_journaled(bh)) {
16609 + PROC_INFO_INC(sb, journal.mark_dirty_already);
16610 + return 0;
16613 + /*
16614 + * this must be turned into a panic instead of a warning. We can't
16615 + * allow a dirty or journal_dirty or locked buffer to be logged, as
16616 + * some changes could get to disk too early. NOT GOOD.
16617 + */
16618 + if (!prepared || buffer_dirty(bh)) {
16619 + reiserfs_warning(sb, "journal-1777",
16620 + "buffer %llu bad state "
16621 + "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
16622 + (unsigned long long)bh->b_blocknr,
16623 + prepared ? ' ' : '!',
16624 + buffer_locked(bh) ? ' ' : '!',
16625 + buffer_dirty(bh) ? ' ' : '!',
16626 + buffer_journal_dirty(bh) ? ' ' : '!');
16629 + if (atomic_read(&journal->j_wcount) <= 0) {
16630 + reiserfs_warning(sb, "journal-1409",
16631 + "returning because j_wcount was %d",
16632 + atomic_read(&journal->j_wcount));
16633 + return 1;
16635 + /*
16636 + * this error means I've screwed up, and we've overflowed
16637 + * the transaction. Nothing can be done here, except make the
16638 + * FS readonly or panic.
16639 + */
16640 + if (journal->j_len >= journal->j_trans_max) {
16641 + reiserfs_panic(th->t_super, "journal-1413",
16642 + "j_len (%lu) is too big",
16643 + journal->j_len);
16646 + if (buffer_journal_dirty(bh)) {
16647 + count_already_incd = 1;
16648 + PROC_INFO_INC(sb, journal.mark_dirty_notjournal);
16649 + clear_buffer_journal_dirty(bh);
16652 + if (journal->j_len > journal->j_len_alloc) {
16653 + journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT;
16656 + set_buffer_journaled(bh);
16658 + /* now put this guy on the end */
16659 + if (!cn) {
16660 + cn = get_cnode(sb);
16661 + if (!cn) {
16662 + reiserfs_panic(sb, "journal-4", "get_cnode failed!");
16665 + if (th->t_blocks_logged == th->t_blocks_allocated) {
16666 + th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT;
16667 + journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT;
16669 + th->t_blocks_logged++;
16670 + journal->j_len++;
16672 + cn->bh = bh;
16673 + cn->blocknr = bh->b_blocknr;
16674 + cn->sb = sb;
16675 + cn->jlist = NULL;
16676 + insert_journal_hash(journal->j_hash_table, cn);
16677 + if (!count_already_incd) {
16678 + get_bh(bh);
16681 + cn->next = NULL;
16682 + cn->prev = journal->j_last;
16683 + cn->bh = bh;
16684 + if (journal->j_last) {
16685 + journal->j_last->next = cn;
16686 + journal->j_last = cn;
16687 + } else {
16688 + journal->j_first = cn;
16689 + journal->j_last = cn;
16691 + reiserfs_schedule_old_flush(sb);
16692 + return 0;
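
journal_mark_dirty() above warns (journal-1777) unless the buffer was prepared and is no longer dirty. A hedged sketch of the expected call order, using only functions from this patch; bh stands in for any metadata buffer covered by the running handle th:

    reiserfs_prepare_for_journal(sb, bh, 1);  /* clears dirty, locks the bh */
    /* ... modify bh->b_data under the transaction ... */
    journal_mark_dirty(th, bh);               /* file bh in the current trans */
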
16695 +int journal_end(struct reiserfs_transaction_handle *th)
16697 + struct super_block *sb = th->t_super;
16698 + if (!current->journal_info && th->t_refcount > 1)
16699 + reiserfs_warning(sb, "REISER-NESTING",
16700 + "th NULL, refcount %d", th->t_refcount);
16702 + if (!th->t_trans_id) {
16703 + WARN_ON(1);
16704 + return -EIO;
16707 + th->t_refcount--;
16708 + if (th->t_refcount > 0) {
16709 + struct reiserfs_transaction_handle *cur_th =
16710 + current->journal_info;
16712 + /*
16713 + * we aren't allowed to close a nested transaction on a
16714 + * different filesystem from the one in the task struct
16715 + */
16716 + BUG_ON(cur_th->t_super != th->t_super);
16718 + if (th != cur_th) {
16719 + memcpy(current->journal_info, th, sizeof(*th));
16720 + th->t_trans_id = 0;
16722 + return 0;
16723 + } else {
16724 + return do_journal_end(th, 0);
16729 + * removes from the current transaction, releasing and decrementing any counters.
16730 + * also files the removed buffer directly onto the clean list
16732 + * called by journal_mark_freed when a block has been deleted
16734 + * returns 1 if it cleaned and released the buffer. 0 otherwise
16735 + */
16736 +static int remove_from_transaction(struct super_block *sb,
16737 + b_blocknr_t blocknr, int already_cleaned)
16739 + struct buffer_head *bh;
16740 + struct reiserfs_journal_cnode *cn;
16741 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16742 + int ret = 0;
16744 + cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
16745 + if (!cn || !cn->bh) {
16746 + return ret;
16748 + bh = cn->bh;
16749 + if (cn->prev) {
16750 + cn->prev->next = cn->next;
16752 + if (cn->next) {
16753 + cn->next->prev = cn->prev;
16755 + if (cn == journal->j_first) {
16756 + journal->j_first = cn->next;
16758 + if (cn == journal->j_last) {
16759 + journal->j_last = cn->prev;
16761 + remove_journal_hash(sb, journal->j_hash_table, NULL,
16762 + bh->b_blocknr, 0);
16763 + clear_buffer_journaled(bh); /* don't log this one */
16765 + if (!already_cleaned) {
16766 + clear_buffer_journal_dirty(bh);
16767 + clear_buffer_dirty(bh);
16768 + clear_buffer_journal_test(bh);
16769 + put_bh(bh);
16770 + if (atomic_read(&bh->b_count) < 0) {
16771 + reiserfs_warning(sb, "journal-1752",
16772 + "b_count < 0");
16774 + ret = 1;
16776 + journal->j_len--;
16777 + journal->j_len_alloc--;
16778 + free_cnode(sb, cn);
16779 + return ret;
16783 + * for any cnode in a journal list, it can only be dirtied if all the
16784 + * transactions that include it are committed to disk.
16785 + * this checks through each transaction, and returns 1 if you are allowed
16786 + * to dirty, and 0 if you aren't
16788 + * it is called by dirty_journal_list, which is called after
16789 + * flush_commit_list has gotten all the log blocks for a given
16790 + * transaction on disk
16792 + */
16793 +static int can_dirty(struct reiserfs_journal_cnode *cn)
16795 + struct super_block *sb = cn->sb;
16796 + b_blocknr_t blocknr = cn->blocknr;
16797 + struct reiserfs_journal_cnode *cur = cn->hprev;
16798 + int can_dirty = 1;
16800 + /*
16801 + * first test hprev. These are all newer than cn, so any node here
16802 + * with the same block number and dev means this node can't be sent
16803 + * to disk right now.
16804 + */
16805 + while (cur && can_dirty) {
16806 + if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
16807 + cur->blocknr == blocknr) {
16808 + can_dirty = 0;
16810 + cur = cur->hprev;
16812 + /*
16813 + * then test hnext. These are all older than cn. As long as they
16814 + * are committed to the log, it is safe to write cn to disk
16815 + */
16816 + cur = cn->hnext;
16817 + while (cur && can_dirty) {
16818 + if (cur->jlist && cur->jlist->j_len > 0 &&
16819 + atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh &&
16820 + cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
16821 + can_dirty = 0;
16823 + cur = cur->hnext;
16825 + return can_dirty;
16829 + * syncs the commit blocks, but does not force the real buffers to disk
16830 + * will wait until the current transaction is done/committed before returning
16831 + */
16832 +int journal_end_sync(struct reiserfs_transaction_handle *th)
16834 + struct super_block *sb = th->t_super;
16835 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16837 + BUG_ON(!th->t_trans_id);
16838 + /* you must not sync while nested; very, very bad */
16839 + BUG_ON(th->t_refcount > 1);
16840 + if (journal->j_len == 0) {
16841 + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
16842 + 1);
16843 + journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
16845 + return do_journal_end(th, COMMIT_NOW | WAIT);
16848 +/* writeback the pending async commits to disk */
16849 +static void flush_async_commits(struct work_struct *work)
16851 + struct reiserfs_journal *journal =
16852 + container_of(work, struct reiserfs_journal, j_work.work);
16853 + struct super_block *sb = journal->j_work_sb;
16854 + struct reiserfs_journal_list *jl;
16855 + struct list_head *entry;
16857 + reiserfs_write_lock(sb);
16858 + if (!list_empty(&journal->j_journal_list)) {
16859 + /* last entry is the youngest, commit it and you get everything */
16860 + entry = journal->j_journal_list.prev;
16861 + jl = JOURNAL_LIST_ENTRY(entry);
16862 + flush_commit_list(sb, jl, 1);
16864 + reiserfs_write_unlock(sb);
16868 + * flushes any old transactions to disk
16869 + * ends the current transaction if it is too old
16870 + */
16871 +void reiserfs_flush_old_commits(struct super_block *sb)
16873 + time64_t now;
16874 + struct reiserfs_transaction_handle th;
16875 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16877 + now = ktime_get_seconds();
16878 + /*
16879 + * safety check so we don't flush while we are replaying the log during
16880 + * mount
16881 + */
16882 + if (list_empty(&journal->j_journal_list))
16883 + return;
16885 + /*
16886 + * check the current transaction. If there are no writers, and it is
16887 + * too old, finish it, and force the commit blocks to disk
16888 + */
16889 + if (atomic_read(&journal->j_wcount) <= 0 &&
16890 + journal->j_trans_start_time > 0 &&
16891 + journal->j_len > 0 &&
16892 + (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
16893 + if (!journal_join(&th, sb)) {
16894 + reiserfs_prepare_for_journal(sb,
16895 + SB_BUFFER_WITH_SB(sb),
16896 + 1);
16897 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
16899 + /*
16900 + * we're only being called from kreiserfsd, it makes
16901 + * no sense to do an async commit so that kreiserfsd
16902 + * can do it later
16903 + */
16904 + do_journal_end(&th, COMMIT_NOW | WAIT);
16910 + * returns 0 if do_journal_end should return right away, returns 1 if
16911 + * do_journal_end should finish the commit
16913 + * if the current transaction is too old, but still has writers, this will
16914 + * wait on j_join_wait until all the writers are done. By the time it
16915 + * wakes up, the transaction it was called on has already ended, so it just
16916 + * flushes the commit list and returns 0.
16918 + * Won't batch when flush or commit_now is set. Also won't batch when
16919 + * others are waiting on j_join_wait.
16921 + * Note, we can't allow the journal_end to proceed while there are still
16922 + * writers in the log.
16923 + */
16924 +static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
16927 + time64_t now;
16928 + int flush = flags & FLUSH_ALL;
16929 + int commit_now = flags & COMMIT_NOW;
16930 + int wait_on_commit = flags & WAIT;
16931 + struct reiserfs_journal_list *jl;
16932 + struct super_block *sb = th->t_super;
16933 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
16935 + BUG_ON(!th->t_trans_id);
16937 + if (th->t_trans_id != journal->j_trans_id) {
16938 + reiserfs_panic(th->t_super, "journal-1577",
16939 + "handle trans id %ld != current trans id %ld",
16940 + th->t_trans_id, journal->j_trans_id);
16943 + journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
16944 + /* <= 0 is allowed. unmounting might not call begin */
16945 + if (atomic_read(&journal->j_wcount) > 0)
16946 + atomic_dec(&journal->j_wcount);
16948 + /*
16949 + * BUG: deal with the case where j_len is 0, but blocks that were
16950 + * previously freed still need to be released. That will be dealt with
16951 + * by the next transaction that actually writes something, but it
16952 + * should really be taken care of in this trans
16953 + */
16954 + BUG_ON(journal->j_len == 0);
16956 + /*
16957 + * if wcount > 0, and we are called to with flush or commit_now,
16958 + * we wait on j_join_wait. We will wake up when the last writer has
16959 + * finished the transaction, and started it on its way to the disk.
16960 + * Then, we flush the commit or journal list, and just return 0
16961 + * because the rest of journal end was already done for this
16962 + * transaction.
16963 + */
16964 + if (atomic_read(&journal->j_wcount) > 0) {
16965 + if (flush || commit_now) {
16966 + unsigned trans_id;
16968 + jl = journal->j_current_jl;
16969 + trans_id = jl->j_trans_id;
16970 + if (wait_on_commit)
16971 + jl->j_state |= LIST_COMMIT_PENDING;
16972 + atomic_set(&journal->j_jlock, 1);
16973 + if (flush) {
16974 + journal->j_next_full_flush = 1;
16976 + unlock_journal(sb);
16978 + /*
16979 + * sleep while the current transaction is
16980 + * still j_jlocked
16981 + */
16982 + while (journal->j_trans_id == trans_id) {
16983 + if (atomic_read(&journal->j_jlock)) {
16984 + queue_log_writer(sb);
16985 + } else {
16986 + lock_journal(sb);
16987 + if (journal->j_trans_id == trans_id) {
16988 + atomic_set(&journal->j_jlock, 1);
16991 + unlock_journal(sb);
16994 + BUG_ON(journal->j_trans_id == trans_id);
16996 + if (commit_now
16997 + && journal_list_still_alive(sb, trans_id)
16998 + && wait_on_commit) {
16999 + flush_commit_list(sb, jl, 1);
17001 + return 0;
17003 + unlock_journal(sb);
17004 + return 0;
17007 + /* deal with old transactions where we are the last writers */
17008 + now = ktime_get_seconds();
17009 + if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
17010 + commit_now = 1;
17011 + journal->j_next_async_flush = 1;
17013 + /* don't batch when someone is waiting on j_join_wait */
17014 + /* don't batch when syncing the commit or flushing the whole trans */
17015 + if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock))
17016 + && !flush && !commit_now && (journal->j_len < journal->j_max_batch)
17017 + && journal->j_len_alloc < journal->j_max_batch
17018 + && journal->j_cnode_free > (journal->j_trans_max * 3)) {
17019 + journal->j_bcount++;
17020 + unlock_journal(sb);
17021 + return 0;
17024 + if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) {
17025 + reiserfs_panic(sb, "journal-003",
17026 + "j_start (%ld) is too high",
17027 + journal->j_start);
17029 + return 1;
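
For orientation, these are the flag combinations that reach check_journal_end() via do_journal_end() elsewhere in this file; this is a summary of calls already present in the patch, not new API:

    do_journal_end(th, 0);                   /* plain end, may batch         */
    do_journal_end(th, COMMIT_NOW);          /* push the commit, don't wait  */
    do_journal_end(th, COMMIT_NOW | WAIT);   /* what journal_end_sync() does */
    do_journal_end(th, FLUSH_ALL | COMMIT_NOW | WAIT); /* trans_id overflow  */
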
17033 + * Does all the work that makes deleting blocks safe.
17034 + * when deleting a block marked BH_JNew, just remove it from the current
17035 + * transaction, clean its buffer_head and move on.
17037 + * otherwise:
17038 + * set a bit for the block in the journal bitmap. That will prevent it from
17039 + * being allocated for unformatted nodes before this transaction has finished.
17041 + * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.
17042 + * That will prevent any old transactions with this block from trying to flush
17043 + * to the real location. Since we aren't removing the cnode from the
17044 + * journal_list_hash, the block can't be reallocated yet.
17046 + * Then remove it from the current transaction, decrementing any counters and
17047 + * filing it on the clean list.
17048 + */
17049 +int journal_mark_freed(struct reiserfs_transaction_handle *th,
17050 + struct super_block *sb, b_blocknr_t blocknr)
17052 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
17053 + struct reiserfs_journal_cnode *cn = NULL;
17054 + struct buffer_head *bh = NULL;
17055 + struct reiserfs_list_bitmap *jb = NULL;
17056 + int cleaned = 0;
17057 + BUG_ON(!th->t_trans_id);
17059 + cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
17060 + if (cn && cn->bh) {
17061 + bh = cn->bh;
17062 + get_bh(bh);
17064 + /* if it is journal new, we just remove it from this transaction */
17065 + if (bh && buffer_journal_new(bh)) {
17066 + clear_buffer_journal_new(bh);
17067 + clear_prepared_bits(bh);
17068 + reiserfs_clean_and_file_buffer(bh);
17069 + cleaned = remove_from_transaction(sb, blocknr, cleaned);
17070 + } else {
17071 + /*
17072 + * set the bit for this block in the journal bitmap
17073 + * for this transaction
17074 + */
17075 + jb = journal->j_current_jl->j_list_bitmap;
17076 + if (!jb) {
17077 + reiserfs_panic(sb, "journal-1702",
17078 + "journal_list_bitmap is NULL");
17080 + set_bit_in_list_bitmap(sb, blocknr, jb);
17082 + /* Note, the entire while loop is not allowed to schedule. */
17084 + if (bh) {
17085 + clear_prepared_bits(bh);
17086 + reiserfs_clean_and_file_buffer(bh);
17088 + cleaned = remove_from_transaction(sb, blocknr, cleaned);
17090 + /*
17091 + * find all older transactions with this block,
17092 + * make sure they don't try to write it out
17093 + */
17094 + cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
17095 + blocknr);
17096 + while (cn) {
17097 + if (sb == cn->sb && blocknr == cn->blocknr) {
17098 + set_bit(BLOCK_FREED, &cn->state);
17099 + if (cn->bh) {
17100 + /*
17101 + * remove_from_transaction will brelse
17102 + * the buffer if it was in the current
17103 + * trans
17104 + */
17105 + if (!cleaned) {
17106 + clear_buffer_journal_dirty(cn->bh);
17108 + clear_buffer_dirty(cn->bh);
17109 + clear_buffer_journal_test(cn->bh);
17111 + cleaned = 1;
17112 + put_bh(cn->bh);
17113 + if (atomic_read(&cn->bh->b_count) < 0) {
17115 + reiserfs_warning(sb,
17116 + "journal-2138",
17117 + "cn->bh->b_count < 0");
17120 + /*
17121 + * since we are clearing the bh,
17122 + * we MUST dec nonzerolen
17123 + */
17124 + if (cn->jlist) {
17125 + atomic_dec(&cn->jlist->j_nonzerolen);
17128 + cn->bh = NULL;
17131 + cn = cn->hnext;
17135 + if (bh)
17136 + release_buffer_page(bh); /* get_hash grabs the buffer */
17137 + return 0;
17140 +void reiserfs_update_inode_transaction(struct inode *inode)
17142 + struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
17143 + REISERFS_I(inode)->i_jl = journal->j_current_jl;
17144 + REISERFS_I(inode)->i_trans_id = journal->j_trans_id;
17148 + * returns -1 on error, 0 if no commits/barriers were done and 1
17149 + * if a transaction was actually committed and the barrier was done
17150 + */
17151 +static int __commit_trans_jl(struct inode *inode, unsigned long id,
17152 + struct reiserfs_journal_list *jl)
17154 + struct reiserfs_transaction_handle th;
17155 + struct super_block *sb = inode->i_sb;
17156 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
17157 + int ret = 0;
17159 + /*
17160 + * is it from the current transaction,
17161 + * or from an unknown transaction?
17162 + */
17163 + if (id == journal->j_trans_id) {
17164 + jl = journal->j_current_jl;
17165 + /*
17166 + * try to let other writers come in and
17167 + * grow this transaction
17168 + */
17169 + let_transaction_grow(sb, id);
17170 + if (journal->j_trans_id != id) {
17171 + goto flush_commit_only;
17174 + ret = journal_begin(&th, sb, 1);
17175 + if (ret)
17176 + return ret;
17178 + /* someone might have ended this transaction while we joined */
17179 + if (journal->j_trans_id != id) {
17180 + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
17181 + 1);
17182 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
17183 + ret = journal_end(&th);
17184 + goto flush_commit_only;
17187 + ret = journal_end_sync(&th);
17188 + if (!ret)
17189 + ret = 1;
17191 + } else {
17192 + /*
17193 + * this gets tricky, we have to make sure the journal list in
17194 + * the inode still exists. We know the list is still around
17195 + * if we've got a larger transaction id than the oldest list
17196 + */
17197 +flush_commit_only:
17198 + if (journal_list_still_alive(inode->i_sb, id)) {
17199 + /*
17200 + * we only set ret to 1 when we know for sure
17201 + * the barrier hasn't been started yet on the commit
17202 + * block.
17203 + */
17204 + if (atomic_read(&jl->j_commit_left) > 1)
17205 + ret = 1;
17206 + flush_commit_list(sb, jl, 1);
17207 + if (journal->j_errno)
17208 + ret = journal->j_errno;
17211 + /* otherwise the list is gone, and long since committed */
17212 + return ret;
17215 +int reiserfs_commit_for_inode(struct inode *inode)
17217 + unsigned int id = REISERFS_I(inode)->i_trans_id;
17218 + struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
17220 + /*
17221 + * for the whole inode, assume an unset id means it was
17222 + * changed in the current transaction. That is more conservative
17223 + */
17224 + if (!id || !jl) {
17225 + reiserfs_update_inode_transaction(inode);
17226 + id = REISERFS_I(inode)->i_trans_id;
17227 + /* jl will be updated in __commit_trans_jl */
17230 + return __commit_trans_jl(inode, id, jl);
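
reiserfs_commit_for_inode() inherits the __commit_trans_jl() convention above (negative errno on failure, 0 or 1 on success). A hypothetical fsync-style caller, shown only to illustrate the contract:

    static int example_sync_inode(struct inode *inode)
    {
            int err = reiserfs_commit_for_inode(inode);

            return err < 0 ? err : 0;   /* 0 and 1 both mean "committed" */
    }
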
17233 +void reiserfs_restore_prepared_buffer(struct super_block *sb,
17234 + struct buffer_head *bh)
17236 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
17237 + PROC_INFO_INC(sb, journal.restore_prepared);
17238 + if (!bh) {
17239 + return;
17241 + if (test_clear_buffer_journal_restore_dirty(bh) &&
17242 + buffer_journal_dirty(bh)) {
17243 + struct reiserfs_journal_cnode *cn;
17244 + reiserfs_write_lock(sb);
17245 + cn = get_journal_hash_dev(sb,
17246 + journal->j_list_hash_table,
17247 + bh->b_blocknr);
17248 + if (cn && can_dirty(cn)) {
17249 + set_buffer_journal_test(bh);
17250 + mark_buffer_dirty(bh);
17252 + reiserfs_write_unlock(sb);
17254 + clear_buffer_journal_prepared(bh);
17257 +extern struct tree_balance *cur_tb;
17259 + * before we can change a metadata block, we have to make sure it won't
17260 + * be written to disk while we are altering it. So, we must:
17261 + * clean it
17262 + * wait on it.
17263 + */
17264 +int reiserfs_prepare_for_journal(struct super_block *sb,
17265 + struct buffer_head *bh, int wait)
17267 + PROC_INFO_INC(sb, journal.prepare);
17269 + if (!trylock_buffer(bh)) {
17270 + if (!wait)
17271 + return 0;
17272 + lock_buffer(bh);
17274 + set_buffer_journal_prepared(bh);
17275 + if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
17276 + clear_buffer_journal_test(bh);
17277 + set_buffer_journal_restore_dirty(bh);
17279 + unlock_buffer(bh);
17280 + return 1;
17284 + * long and ugly. If flush, will not return until all commit
17285 + * blocks and all real buffers in the trans are on disk.
17286 + * If no_async, won't return until all commit blocks are on disk.
17288 + * keep reading, there are comments as you go along
17290 + * If the journal is aborted, we just clean up. Things like flushing
17291 + * journal lists, etc just won't happen.
17292 + */
17293 +static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
17295 + struct super_block *sb = th->t_super;
17296 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
17297 + struct reiserfs_journal_cnode *cn, *next, *jl_cn;
17298 + struct reiserfs_journal_cnode *last_cn = NULL;
17299 + struct reiserfs_journal_desc *desc;
17300 + struct reiserfs_journal_commit *commit;
17301 + struct buffer_head *c_bh; /* commit bh */
17302 + struct buffer_head *d_bh; /* desc bh */
17303 + int cur_write_start = 0; /* start index of current log write */
17304 + int i;
17305 + int flush;
17306 + int wait_on_commit;
17307 + struct reiserfs_journal_list *jl, *temp_jl;
17308 + struct list_head *entry, *safe;
17309 + unsigned long jindex;
17310 + unsigned int commit_trans_id;
17311 + int trans_half;
17312 + int depth;
17314 + BUG_ON(th->t_refcount > 1);
17315 + BUG_ON(!th->t_trans_id);
17316 + BUG_ON(!th->t_super);
17318 + /*
17319 + * protect flush_older_commits from doing mistakes if the
17320 + * transaction ID counter gets overflowed.
17321 + */
17322 + if (th->t_trans_id == ~0U)
17323 + flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
17324 + flush = flags & FLUSH_ALL;
17325 + wait_on_commit = flags & WAIT;
17327 + current->journal_info = th->t_handle_save;
17328 + reiserfs_check_lock_depth(sb, "journal end");
17329 + if (journal->j_len == 0) {
17330 + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
17331 + 1);
17332 + journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
17335 + lock_journal(sb);
17336 + if (journal->j_next_full_flush) {
17337 + flags |= FLUSH_ALL;
17338 + flush = 1;
17340 + if (journal->j_next_async_flush) {
17341 + flags |= COMMIT_NOW | WAIT;
17342 + wait_on_commit = 1;
17345 + /*
17346 + * check_journal_end locks the journal, and unlocks it if it does
17347 + * not return 1. It tells us if we should continue with the
17348 + * journal_end, or just return
17349 + */
17350 + if (!check_journal_end(th, flags)) {
17351 + reiserfs_schedule_old_flush(sb);
17352 + wake_queued_writers(sb);
17353 + reiserfs_async_progress_wait(sb);
17354 + goto out;
17357 + /* check_journal_end might set these, check again */
17358 + if (journal->j_next_full_flush) {
17359 + flush = 1;
17362 + /*
17363 + * j must wait means we have to flush the log blocks, and the
17364 + * real blocks for this transaction
17365 + */
17366 + if (journal->j_must_wait > 0) {
17367 + flush = 1;
17369 +#ifdef REISERFS_PREALLOCATE
17370 + /*
17371 + * quota ops might need to nest, setup the journal_info pointer
17372 + * for them and raise the refcount so that it is > 0.
17373 + */
17374 + current->journal_info = th;
17375 + th->t_refcount++;
17377 + /* it should not involve new blocks into the transaction */
17378 + reiserfs_discard_all_prealloc(th);
17380 + th->t_refcount--;
17381 + current->journal_info = th->t_handle_save;
17382 +#endif
17384 + /* setup description block */
17385 + d_bh =
17386 + journal_getblk(sb,
17387 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
17388 + journal->j_start);
17389 + set_buffer_uptodate(d_bh);
17390 + desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
17391 + memset(d_bh->b_data, 0, d_bh->b_size);
17392 + memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
17393 + set_desc_trans_id(desc, journal->j_trans_id);
17395 + /*
17396 + * setup commit block. Don't write (keep it clean too) this one
17397 + * until after everyone else is written
17398 + */
17399 + c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
17400 + ((journal->j_start + journal->j_len +
17401 + 1) % SB_ONDISK_JOURNAL_SIZE(sb)));
17402 + commit = (struct reiserfs_journal_commit *)c_bh->b_data;
17403 + memset(c_bh->b_data, 0, c_bh->b_size);
17404 + set_commit_trans_id(commit, journal->j_trans_id);
17405 + set_buffer_uptodate(c_bh);
17407 + /* init this journal list */
17408 + jl = journal->j_current_jl;
17410 + /*
17411 + * we lock the commit before doing anything because
17412 + * we want to make sure nobody tries to run flush_commit_list until
17413 + * the new transaction is fully setup, and we've already flushed the
17414 + * ordered bh list
17415 + */
17416 + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
17418 + /* save the transaction id in case we need to commit it later */
17419 + commit_trans_id = jl->j_trans_id;
17421 + atomic_set(&jl->j_older_commits_done, 0);
17422 + jl->j_trans_id = journal->j_trans_id;
17423 + jl->j_timestamp = journal->j_trans_start_time;
17424 + jl->j_commit_bh = c_bh;
17425 + jl->j_start = journal->j_start;
17426 + jl->j_len = journal->j_len;
17427 + atomic_set(&jl->j_nonzerolen, journal->j_len);
17428 + atomic_set(&jl->j_commit_left, journal->j_len + 2);
17429 + jl->j_realblock = NULL;
17431 + /*
17432 + * The ENTIRE FOR LOOP MUST not cause schedule to occur.
17433 + * for each real block, add it to the journal list hash,
17434 + * copy into real block index array in the commit or desc block
17435 + */
17436 + trans_half = journal_trans_half(sb->s_blocksize);
17437 + for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
17438 + if (buffer_journaled(cn->bh)) {
17439 + jl_cn = get_cnode(sb);
17440 + if (!jl_cn) {
17441 + reiserfs_panic(sb, "journal-1676",
17442 + "get_cnode returned NULL");
17444 + if (i == 0) {
17445 + jl->j_realblock = jl_cn;
17447 + jl_cn->prev = last_cn;
17448 + jl_cn->next = NULL;
17449 + if (last_cn) {
17450 + last_cn->next = jl_cn;
17452 + last_cn = jl_cn;
17453 + /*
17454 + * make sure the block we are trying to log
17455 + * is not a block of journal or reserved area
17456 + */
17457 + if (is_block_in_log_or_reserved_area
17458 + (sb, cn->bh->b_blocknr)) {
17459 + reiserfs_panic(sb, "journal-2332",
17460 + "Trying to log block %lu, "
17461 + "which is a log block",
17462 + cn->bh->b_blocknr);
17464 + jl_cn->blocknr = cn->bh->b_blocknr;
17465 + jl_cn->state = 0;
17466 + jl_cn->sb = sb;
17467 + jl_cn->bh = cn->bh;
17468 + jl_cn->jlist = jl;
17469 + insert_journal_hash(journal->j_list_hash_table, jl_cn);
17470 + if (i < trans_half) {
17471 + desc->j_realblock[i] =
17472 + cpu_to_le32(cn->bh->b_blocknr);
17473 + } else {
17474 + commit->j_realblock[i - trans_half] =
17475 + cpu_to_le32(cn->bh->b_blocknr);
17477 + } else {
17478 + i--;
17481 + set_desc_trans_len(desc, journal->j_len);
17482 + set_desc_mount_id(desc, journal->j_mount_id);
17483 + set_desc_trans_id(desc, journal->j_trans_id);
17484 + set_commit_trans_len(commit, journal->j_len);
17486 + /*
17487 + * special check in case all buffers in the journal
17488 + * were marked for not logging
17489 + */
17490 + BUG_ON(journal->j_len == 0);
17492 + /*
17493 + * we're about to dirty all the log blocks, mark the description block
17494 + * dirty now too. Don't mark the commit block dirty until all the
17495 + * others are on disk
17496 + */
17497 + mark_buffer_dirty(d_bh);
17499 + /*
17500 + * first data block is j_start + 1, so add one to
17501 + * cur_write_start wherever you use it
17502 + */
17503 + cur_write_start = journal->j_start;
17504 + cn = journal->j_first;
17505 + jindex = 1; /* start at one so we don't get the desc again */
17506 + while (cn) {
17507 + clear_buffer_journal_new(cn->bh);
17508 + /* copy all the real blocks into log area. dirty log blocks */
17509 + if (buffer_journaled(cn->bh)) {
17510 + struct buffer_head *tmp_bh;
17511 + char *addr;
17512 + struct page *page;
17513 + tmp_bh =
17514 + journal_getblk(sb,
17515 + SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
17516 + ((cur_write_start +
17517 + jindex) %
17518 + SB_ONDISK_JOURNAL_SIZE(sb)));
17519 + set_buffer_uptodate(tmp_bh);
17520 + page = cn->bh->b_page;
17521 + addr = kmap(page);
17522 + memcpy(tmp_bh->b_data,
17523 + addr + offset_in_page(cn->bh->b_data),
17524 + cn->bh->b_size);
17525 + kunmap(page);
17526 + mark_buffer_dirty(tmp_bh);
17527 + jindex++;
17528 + set_buffer_journal_dirty(cn->bh);
17529 + clear_buffer_journaled(cn->bh);
17530 + } else {
17531 + /*
17532 + * JDirty cleared sometime during transaction.
17533 + * don't log this one
17534 + */
17535 + reiserfs_warning(sb, "journal-2048",
17536 + "BAD, buffer in journal hash, "
17537 + "but not JDirty!");
17538 + brelse(cn->bh);
17540 + next = cn->next;
17541 + free_cnode(sb, cn);
17542 + cn = next;
17543 + reiserfs_cond_resched(sb);
17546 + /*
17547 + * we are done with both the c_bh and d_bh, but
17548 + * c_bh must be written after all other commit blocks,
17549 + * so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
17550 + */
17552 + journal->j_current_jl = alloc_journal_list(sb);
17554 + /* now it is safe to insert this transaction on the main list */
17555 + list_add_tail(&jl->j_list, &journal->j_journal_list);
17556 + list_add_tail(&jl->j_working_list, &journal->j_working_list);
17557 + journal->j_num_work_lists++;
17559 + /* reset journal values for the next transaction */
17560 + journal->j_start =
17561 + (journal->j_start + journal->j_len +
17562 + 2) % SB_ONDISK_JOURNAL_SIZE(sb);
17563 + atomic_set(&journal->j_wcount, 0);
17564 + journal->j_bcount = 0;
17565 + journal->j_last = NULL;
17566 + journal->j_first = NULL;
17567 + journal->j_len = 0;
17568 + journal->j_trans_start_time = 0;
17569 + /* check for trans_id overflow */
17570 + if (++journal->j_trans_id == 0)
17571 + journal->j_trans_id = 10;
17572 + journal->j_current_jl->j_trans_id = journal->j_trans_id;
17573 + journal->j_must_wait = 0;
17574 + journal->j_len_alloc = 0;
17575 + journal->j_next_full_flush = 0;
17576 + journal->j_next_async_flush = 0;
17577 + init_journal_hash(sb);
17579 + /*
17580 + * make sure reiserfs_add_jh sees the new current_jl before we
17581 + * write out the tails
17582 + */
17583 + smp_mb();
17585 + /*
17586 + * tail conversion targets have to hit the disk before we end the
17587 + * transaction. Otherwise a later transaction might repack the tail
17588 + * before this transaction commits, leaving the data block unflushed
17589 + * and clean; if we crash before the later transaction commits, the
17590 + * data block is lost.
17591 + */
17592 + if (!list_empty(&jl->j_tail_bh_list)) {
17593 + depth = reiserfs_write_unlock_nested(sb);
17594 + write_ordered_buffers(&journal->j_dirty_buffers_lock,
17595 + journal, jl, &jl->j_tail_bh_list);
17596 + reiserfs_write_lock_nested(sb, depth);
17598 + BUG_ON(!list_empty(&jl->j_tail_bh_list));
17599 + mutex_unlock(&jl->j_commit_mutex);
17601 + /*
17602 + * honor the flush wishes from the caller, simple commits can
17603 + * be done outside the journal lock, they are done below
17605 + * if we don't flush the commit list right now, we put it into
17606 + * the work queue so the people waiting on the async progress work
17607 + * queue don't wait for this proc to flush journal lists and such.
17608 + */
17609 + if (flush) {
17610 + flush_commit_list(sb, jl, 1);
17611 + flush_journal_list(sb, jl, 1);
17612 + } else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
17613 + /*
17614 + * Avoid queueing work when sb is being shut down. Transaction
17615 + * will be flushed on journal shutdown.
17616 + */
17617 + if (sb->s_flags & SB_ACTIVE)
17618 + queue_delayed_work(REISERFS_SB(sb)->commit_wq,
17619 + &journal->j_work, HZ / 10);
17622 + /*
17623 + * if the next transaction has any chance of wrapping, flush
17624 + * transactions that might get overwritten. If any journal lists
17625 + * are very old flush them as well.
17626 + */
17627 +first_jl:
17628 + list_for_each_safe(entry, safe, &journal->j_journal_list) {
17629 + temp_jl = JOURNAL_LIST_ENTRY(entry);
17630 + if (journal->j_start <= temp_jl->j_start) {
17631 + if ((journal->j_start + journal->j_trans_max + 1) >=
17632 + temp_jl->j_start) {
17633 + flush_used_journal_lists(sb, temp_jl);
17634 + goto first_jl;
17635 + } else if ((journal->j_start +
17636 + journal->j_trans_max + 1) <
17637 + SB_ONDISK_JOURNAL_SIZE(sb)) {
17638 + /*
17639 + * if we don't cross into the next
17640 + * transaction and we don't wrap, there is
17641 + * no way we can overlap any later transactions,
17642 + * so break now
17643 + */
17644 + break;
17646 + } else if ((journal->j_start +
17647 + journal->j_trans_max + 1) >
17648 + SB_ONDISK_JOURNAL_SIZE(sb)) {
17649 + if (((journal->j_start + journal->j_trans_max + 1) %
17650 + SB_ONDISK_JOURNAL_SIZE(sb)) >=
17651 + temp_jl->j_start) {
17652 + flush_used_journal_lists(sb, temp_jl);
17653 + goto first_jl;
17654 + } else {
17655 + /*
17656 + * we don't overlap anything from our start
17657 + * to the end of the log, and our wrapped
17658 + * portion doesn't overlap anything at
17659 + * the start of the log. We can break
17660 + */
17661 + break;
17666 + journal->j_current_jl->j_list_bitmap =
17667 + get_list_bitmap(sb, journal->j_current_jl);
17669 + if (!(journal->j_current_jl->j_list_bitmap)) {
17670 + reiserfs_panic(sb, "journal-1996",
17671 + "could not get a list bitmap");
17674 + atomic_set(&journal->j_jlock, 0);
17675 + unlock_journal(sb);
17676 + /* wake up anybody waiting to join. */
17677 + clear_bit(J_WRITERS_QUEUED, &journal->j_state);
17678 + wake_up(&journal->j_join_wait);
17680 + if (!flush && wait_on_commit &&
17681 + journal_list_still_alive(sb, commit_trans_id)) {
17682 + flush_commit_list(sb, jl, 1);
17684 +out:
17685 + reiserfs_check_lock_depth(sb, "journal end2");
17687 + memset(th, 0, sizeof(*th));
17688 + /*
17689 + * Re-set th->t_super, so we can properly keep track of how many
17690 + * persistent transactions there are. We need to do this so if this
17691 + * call is part of a failed restart_transaction, we can free it later
17692 + */
17693 + th->t_super = sb;
17695 + return journal->j_errno;
17698 +/* Set the file system read only and refuse new transactions */
17699 +void reiserfs_abort_journal(struct super_block *sb, int errno)
17701 + struct reiserfs_journal *journal = SB_JOURNAL(sb);
17702 + if (test_bit(J_ABORTED, &journal->j_state))
17703 + return;
17705 + if (!journal->j_errno)
17706 + journal->j_errno = errno;
17708 + sb->s_flags |= SB_RDONLY;
17709 + set_bit(J_ABORTED, &journal->j_state);
17711 +#ifdef CONFIG_REISERFS_CHECK
17712 + dump_stack();
17713 +#endif
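
Before the diff moves on to lbalance.c: the batching predicate whose body opens this journal.c hunk (journal_transaction_should_end() in mainline reiserfs) is consumed by callers roughly as below. restart_transaction() lives outside this hunk and is assumed here purely for illustration:

    /* end an over-full or aged transaction before allocating more blocks */
    if (journal_transaction_should_end(th, blocks_needed)) {
            err = restart_transaction(th, inode, blocks_needed); /* assumed */
            if (err)
                    return err;
    }
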
17715 diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
17716 new file mode 100644
17717 index 000000000000..7f868569d4d0
17718 --- /dev/null
17719 +++ b/fs/reiserfs/lbalance.c
17720 @@ -0,0 +1,1426 @@
17722 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
17723 + */
17725 +#include <linux/uaccess.h>
17726 +#include <linux/string.h>
17727 +#include <linux/time.h>
17728 +#include "reiserfs.h"
17729 +#include <linux/buffer_head.h>
17732 + * copy copy_count entries from source directory item to dest buffer
17733 + * (creating new item if needed)
17734 + */
17735 +static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
17736 + struct buffer_head *source, int last_first,
17737 + int item_num, int from, int copy_count)
17739 + struct buffer_head *dest = dest_bi->bi_bh;
17740 + /*
17741 + * either the number of target item, or if we must create a
17742 + * new item, the number of the item we will create it next to
17743 + */
17744 + int item_num_in_dest;
17746 + struct item_head *ih;
17747 + struct reiserfs_de_head *deh;
17748 + int copy_records_len; /* length of all records in item to be copied */
17749 + char *records;
17751 + ih = item_head(source, item_num);
17753 + RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item");
17755 + /*
17756 + * length of all records to be copied and first byte of
17757 + * the last of them
17758 + */
17759 + deh = B_I_DEH(source, ih);
17760 + if (copy_count) {
17761 + copy_records_len = (from ? deh_location(&deh[from - 1]) :
17762 + ih_item_len(ih)) -
17763 + deh_location(&deh[from + copy_count - 1]);
17764 + records =
17765 + source->b_data + ih_location(ih) +
17766 + deh_location(&deh[from + copy_count - 1]);
17767 + } else {
17768 + copy_records_len = 0;
17769 + records = NULL;
17772 + /* when copy last to first, dest buffer can contain 0 items */
17773 + item_num_in_dest =
17774 + (last_first ==
17775 + LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest)
17776 + - 1);
17778 + /*
17779 + * if there are no items in dest or the first/last item in
17780 + * dest is not item of the same directory
17781 + */
17782 + if ((item_num_in_dest == -1) ||
17783 + (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) ||
17784 + (last_first == LAST_TO_FIRST
17785 + && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key,
17786 + leaf_key(dest,
17787 + item_num_in_dest))))
17789 + /* create new item in dest */
17790 + struct item_head new_ih;
17792 + /* form item header */
17793 + memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE);
17794 + put_ih_version(&new_ih, KEY_FORMAT_3_5);
17795 + /* calculate item len */
17796 + put_ih_item_len(&new_ih,
17797 + DEH_SIZE * copy_count + copy_records_len);
17798 + put_ih_entry_count(&new_ih, 0);
17800 + if (last_first == LAST_TO_FIRST) {
17801 + /* form key by the following way */
17802 + if (from < ih_entry_count(ih)) {
17803 + set_le_ih_k_offset(&new_ih,
17804 + deh_offset(&deh[from]));
17805 + } else {
17806 + /*
17807 + * no entries will be copied to this
17808 + * item in this function
17809 + */
17810 + set_le_ih_k_offset(&new_ih, U32_MAX);
17811 + /*
17812 + * this item is not yet valid, but we
17813 + * want I_IS_DIRECTORY_ITEM to return 1
17814 + * for it, so we -1
17815 + */
17817 + set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key,
17818 + TYPE_DIRENTRY);
17821 + /* insert item into dest buffer */
17822 + leaf_insert_into_buf(dest_bi,
17823 + (last_first ==
17824 + LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest),
17825 + &new_ih, NULL, 0);
17826 + } else {
17827 + /* prepare space for entries */
17828 + leaf_paste_in_buffer(dest_bi,
17829 + (last_first ==
17830 + FIRST_TO_LAST) ? (B_NR_ITEMS(dest) -
17831 + 1) : 0, MAX_US_INT,
17832 + DEH_SIZE * copy_count + copy_records_len,
17833 + records, 0);
17836 + item_num_in_dest =
17837 + (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0;
17839 + leaf_paste_entries(dest_bi, item_num_in_dest,
17840 + (last_first ==
17841 + FIRST_TO_LAST) ? ih_entry_count(item_head(dest,
17842 + item_num_in_dest))
17843 + : 0, copy_count, deh + from, records,
17844 + DEH_SIZE * copy_count + copy_records_len);
17848 + * Copy the first (if last_first == FIRST_TO_LAST) or last
17849 + * (last_first == LAST_TO_FIRST) item or part of it or nothing
17850 + * (see the return 0 below) from SOURCE to the end (if last_first)
17851 + * or beginning (!last_first) of the DEST
17852 + */
17853 +/* returns 1 if anything was copied, else 0 */
17854 +static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
17855 + struct buffer_head *src, int last_first,
17856 + int bytes_or_entries)
17858 + struct buffer_head *dest = dest_bi->bi_bh;
17859 + /* number of items in the source and destination buffers */
17860 + int dest_nr_item, src_nr_item;
17861 + struct item_head *ih;
17862 + struct item_head *dih;
17864 + dest_nr_item = B_NR_ITEMS(dest);
17866 + /*
17867 + * if ( DEST is empty or first item of SOURCE and last item of
17868 + * DEST are the items of different objects or of different types )
17869 + * then there is no need to treat this item differently from the
17870 + * other items that we copy, so we return
17871 + */
17872 + if (last_first == FIRST_TO_LAST) {
17873 + ih = item_head(src, 0);
17874 + dih = item_head(dest, dest_nr_item - 1);
17876 + /* there is nothing to merge */
17877 + if (!dest_nr_item
17878 + || (!op_is_left_mergeable(&ih->ih_key, src->b_size)))
17879 + return 0;
17881 + RFALSE(!ih_item_len(ih),
17882 + "vs-10010: item can not have empty length");
17884 + if (is_direntry_le_ih(ih)) {
17885 + if (bytes_or_entries == -1)
17886 + /* copy all entries to dest */
17887 + bytes_or_entries = ih_entry_count(ih);
17888 + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0,
17889 + bytes_or_entries);
17890 + return 1;
17893 + /*
17894 + * copy part of the body of the first item of SOURCE
17895 + * to the end of the body of the last item of the DEST
17896 + * part defined by 'bytes_or_entries'; if bytes_or_entries
17897 + * == -1 copy whole body; don't create new item header
17898 + */
17899 + if (bytes_or_entries == -1)
17900 + bytes_or_entries = ih_item_len(ih);
17902 +#ifdef CONFIG_REISERFS_CHECK
17903 + else {
17904 + if (bytes_or_entries == ih_item_len(ih)
17905 + && is_indirect_le_ih(ih))
17906 + if (get_ih_free_space(ih))
17907 + reiserfs_panic(sb_from_bi(dest_bi),
17908 + "vs-10020",
17909 + "last unformatted node "
17910 + "must be filled "
17911 + "entirely (%h)", ih);
17913 +#endif
17915 + /*
17916 + * merge first item (or its part) of src buffer with the last
17917 + * item of dest buffer. Both are of the same file
17918 + */
17919 + leaf_paste_in_buffer(dest_bi,
17920 + dest_nr_item - 1, ih_item_len(dih),
17921 + bytes_or_entries, ih_item_body(src, ih), 0);
17923 + if (is_indirect_le_ih(dih)) {
17924 + RFALSE(get_ih_free_space(dih),
17925 + "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space",
17926 + ih);
17927 + if (bytes_or_entries == ih_item_len(ih))
17928 + set_ih_free_space(dih, get_ih_free_space(ih));
17931 + return 1;
17934 + /* copy boundary item to right (last_first == LAST_TO_FIRST) */
17936 + /*
17937 + * (DEST is empty or last item of SOURCE and first item of DEST
17938 + * are the items of different object or of different types)
17939 + */
17940 + src_nr_item = B_NR_ITEMS(src);
17941 + ih = item_head(src, src_nr_item - 1);
17942 + dih = item_head(dest, 0);
17944 + if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size))
17945 + return 0;
17947 + if (is_direntry_le_ih(ih)) {
17948 + /*
17949 + * bytes_or_entries = entries number in last
17950 + * item body of SOURCE
17951 + */
17952 + if (bytes_or_entries == -1)
17953 + bytes_or_entries = ih_entry_count(ih);
17955 + leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
17956 + src_nr_item - 1,
17957 + ih_entry_count(ih) - bytes_or_entries,
17958 + bytes_or_entries);
17959 + return 1;
17962 + /*
17963 + * copy part of the body of the last item of SOURCE to the
17964 + * begin of the body of the first item of the DEST; part defined
17965 + * by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body;
17966 + * change first item key of the DEST; don't create new item header
17967 + */
17969 + RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih),
17970 + "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)",
17971 + ih);
17973 + if (bytes_or_entries == -1) {
17974 + /* bytes_or_entries = length of last item body of SOURCE */
17975 + bytes_or_entries = ih_item_len(ih);
17977 + RFALSE(le_ih_k_offset(dih) !=
17978 + le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size),
17979 + "vs-10050: items %h and %h do not match", ih, dih);
17981 + /* change first item key of the DEST */
17982 + set_le_ih_k_offset(dih, le_ih_k_offset(ih));
17984 + /* item becomes non-mergeable */
17985 + /* or mergeable if left item was */
17986 + set_le_ih_k_type(dih, le_ih_k_type(ih));
17987 + } else {
17988 + /* merge to right only part of item */
17989 + RFALSE(ih_item_len(ih) <= bytes_or_entries,
17990 + "vs-10060: no so much bytes %lu (needed %lu)",
17991 + (unsigned long)ih_item_len(ih),
17992 + (unsigned long)bytes_or_entries);
17994 + /* change first item key of the DEST */
17995 + if (is_direct_le_ih(dih)) {
17996 + RFALSE(le_ih_k_offset(dih) <=
17997 + (unsigned long)bytes_or_entries,
17998 + "vs-10070: dih %h, bytes_or_entries(%d)", dih,
17999 + bytes_or_entries);
18000 + set_le_ih_k_offset(dih,
18001 + le_ih_k_offset(dih) -
18002 + bytes_or_entries);
18003 + } else {
18004 + RFALSE(le_ih_k_offset(dih) <=
18005 + (bytes_or_entries / UNFM_P_SIZE) * dest->b_size,
18006 + "vs-10080: dih %h, bytes_or_entries(%d)",
18007 + dih,
18008 + (bytes_or_entries / UNFM_P_SIZE) * dest->b_size);
18009 + set_le_ih_k_offset(dih,
18010 + le_ih_k_offset(dih) -
18011 + ((bytes_or_entries / UNFM_P_SIZE) *
18012 + dest->b_size));
18016 + leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries,
18017 + ih_item_body(src,
18018 + ih) + ih_item_len(ih) - bytes_or_entries,
18019 + 0);
18020 + return 1;
18024 + * copy cpy_num items from buffer src to buffer dest
18025 + * last_first == FIRST_TO_LAST means that we copy cpy_num items beginning
18026 + * from the first-th item in src to the tail of dest
18027 + * last_first == LAST_TO_FIRST means that we copy cpy_num items beginning
18028 + * from the first-th item in src to the head of dest
18029 + */
18030 +static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
18031 + struct buffer_head *src, int last_first,
18032 + int first, int cpy_num)
18034 + struct buffer_head *dest;
18035 + int nr, free_space;
18036 + int dest_before;
18037 + int last_loc, last_inserted_loc, location;
18038 + int i, j;
18039 + struct block_head *blkh;
18040 + struct item_head *ih;
18042 + RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST,
18043 + "vs-10090: bad last_first parameter %d", last_first);
18044 + RFALSE(B_NR_ITEMS(src) - first < cpy_num,
18045 + "vs-10100: too few items in source %d, required %d from %d",
18046 + B_NR_ITEMS(src), cpy_num, first);
18047 + RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items");
18048 + RFALSE(!dest_bi, "vs-10120: dest_bi must not be NULL");
18050 + dest = dest_bi->bi_bh;
18052 + RFALSE(!dest, "vs-10130: dest buffer must not be NULL");
18054 + if (cpy_num == 0)
18055 + return;
18057 + blkh = B_BLK_HEAD(dest);
18058 + nr = blkh_nr_item(blkh);
18059 + free_space = blkh_free_space(blkh);
18061 + /*
18062 + * we will insert items before 0-th or nr-th item in dest buffer.
18063 + * It depends on the last_first parameter
18064 + */
18065 + dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr;
18067 + /* location of head of first new item */
18068 + ih = item_head(dest, dest_before);
18070 + RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE,
18071 + "vs-10140: not enough free space for headers %d (needed %d)",
18072 + B_FREE_SPACE(dest), cpy_num * IH_SIZE);
18074 + /* prepare space for headers */
18075 + memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE);
18077 + /* copy item headers */
18078 + memcpy(ih, item_head(src, first), cpy_num * IH_SIZE);
18080 + free_space -= (IH_SIZE * cpy_num);
18081 + set_blkh_free_space(blkh, free_space);
18083 + /* location of unmovable item */
18084 + j = location = (dest_before == 0) ? dest->b_size : ih_location(ih - 1);
18085 + for (i = dest_before; i < nr + cpy_num; i++) {
18086 + location -= ih_item_len(ih + i - dest_before);
18087 + put_ih_location(ih + i - dest_before, location);
18090 + /* prepare space for items */
18091 + last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]);
18092 + last_inserted_loc = ih_location(&ih[cpy_num - 1]);
18094 + /* check free space */
18095 + RFALSE(free_space < j - last_inserted_loc,
18096 + "vs-10150: not enough free space for items %d (needed %d)",
18097 + free_space, j - last_inserted_loc);
18099 + memmove(dest->b_data + last_loc,
18100 + dest->b_data + last_loc + j - last_inserted_loc,
18101 + last_inserted_loc - last_loc);
18103 + /* copy items */
18104 + memcpy(dest->b_data + last_inserted_loc,
18105 + item_body(src, (first + cpy_num - 1)),
18106 + j - last_inserted_loc);
18108 + /* sizes, item number */
18109 + set_blkh_nr_item(blkh, nr + cpy_num);
18110 + set_blkh_free_space(blkh, free_space - (j - last_inserted_loc));
18112 + do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0);
18114 + if (dest_bi->bi_parent) {
18115 + struct disk_child *t_dc;
18116 + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
18117 + RFALSE(dc_block_number(t_dc) != dest->b_blocknr,
18118 + "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu",
18119 + (long unsigned)dest->b_blocknr,
18120 + (long unsigned)dc_block_number(t_dc));
18121 + put_dc_size(t_dc,
18122 + dc_size(t_dc) + (j - last_inserted_loc +
18123 + IH_SIZE * cpy_num));
18125 + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
18126 + 0);
18131 + * This function splits the (liquid) item into two items (useful when
18132 + * shifting part of an item into another node).
18133 + */
18134 +static void leaf_item_bottle(struct buffer_info *dest_bi,
18135 + struct buffer_head *src, int last_first,
18136 + int item_num, int cpy_bytes)
18138 + struct buffer_head *dest = dest_bi->bi_bh;
18139 + struct item_head *ih;
18141 + RFALSE(cpy_bytes == -1,
18142 + "vs-10170: bytes == - 1 means: do not split item");
18144 + if (last_first == FIRST_TO_LAST) {
18145 + /*
18146 + * if the item in position item_num in buffer SOURCE
18147 + * is a directory item
18148 + */
18149 + ih = item_head(src, item_num);
18150 + if (is_direntry_le_ih(ih))
18151 + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
18152 + item_num, 0, cpy_bytes);
18153 + else {
18154 + struct item_head n_ih;
18156 + /*
18157 + * copy part of the body of the item number 'item_num'
18158 + * of SOURCE to the end of the DEST part defined by
18159 + * 'cpy_bytes'; create new item header; change old
18160 + * item_header (????); n_ih = new item_header;
18161 + */
18162 + memcpy(&n_ih, ih, IH_SIZE);
18163 + put_ih_item_len(&n_ih, cpy_bytes);
18164 + if (is_indirect_le_ih(ih)) {
18165 + RFALSE(cpy_bytes == ih_item_len(ih)
18166 + && get_ih_free_space(ih),
18167 + "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)",
18168 + (long unsigned)get_ih_free_space(ih));
18169 + set_ih_free_space(&n_ih, 0);
18172 + RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size),
18173 + "vs-10190: bad mergeability of item %h", ih);
18174 + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
18175 + leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih,
18176 + item_body(src, item_num), 0);
18178 + } else {
18179 + /*
18180 + * if the item in position item_num in buffer
18181 + * SOURCE is a directory item
18182 + */
18183 + ih = item_head(src, item_num);
18184 + if (is_direntry_le_ih(ih))
18185 + leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
18186 + item_num,
18187 + ih_entry_count(ih) - cpy_bytes,
18188 + cpy_bytes);
18189 + else {
18190 + struct item_head n_ih;
18192 + /*
18193 + * copy part of the body of the item number 'item_num'
18194 + * of SOURCE to the beginning of the DEST part defined by
18195 + * 'cpy_bytes'; create new item header;
18196 + * n_ih = new item_header;
18197 + */
18198 + memcpy(&n_ih.ih_key, &ih->ih_key, KEY_SIZE);
18200 + /* Endian safe, both le */
18201 + n_ih.ih_version = ih->ih_version;
18203 + if (is_direct_le_ih(ih)) {
18204 + set_le_ih_k_offset(&n_ih,
18205 + le_ih_k_offset(ih) +
18206 + ih_item_len(ih) - cpy_bytes);
18207 + set_le_ih_k_type(&n_ih, TYPE_DIRECT);
18208 + set_ih_free_space(&n_ih, MAX_US_INT);
18209 + } else {
18210 + /* indirect item */
18211 + RFALSE(!cpy_bytes && get_ih_free_space(ih),
18212 + "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended");
18213 + set_le_ih_k_offset(&n_ih,
18214 + le_ih_k_offset(ih) +
18215 + (ih_item_len(ih) -
18216 + cpy_bytes) / UNFM_P_SIZE *
18217 + dest->b_size);
18218 + set_le_ih_k_type(&n_ih, TYPE_INDIRECT);
18219 + set_ih_free_space(&n_ih, get_ih_free_space(ih));
18222 + /* set item length */
18223 + put_ih_item_len(&n_ih, cpy_bytes);
18225 + /* Endian safe, both le */
18226 + n_ih.ih_version = ih->ih_version;
18228 + leaf_insert_into_buf(dest_bi, 0, &n_ih,
18229 + item_body(src, item_num) +
18230 + ih_item_len(ih) - cpy_bytes, 0);
18236 + * If cpy_bytes equals minus one then copy cpy_num whole items from SOURCE
18237 + * to DEST. If cpy_bytes is not equal to minus one then copy cpy_num-1 whole
18238 + * items from SOURCE to DEST. From the last item copy cpy_bytes bytes for a
18239 + * regular item and cpy_bytes directory entries for a directory item.
18240 + */
18241 +static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
18242 + int last_first, int cpy_num, int cpy_bytes)
18244 + struct buffer_head *dest;
18245 + int pos, i, src_nr_item, bytes;
18247 + dest = dest_bi->bi_bh;
18248 + RFALSE(!dest || !src, "vs-10210: !dest || !src");
18249 + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
18250 + "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST");
18251 + RFALSE(B_NR_ITEMS(src) < cpy_num,
18252 + "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src),
18253 + cpy_num);
18254 + RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num);
18256 + if (cpy_num == 0)
18257 + return 0;
18259 + if (last_first == FIRST_TO_LAST) {
18260 + /* copy items to left */
18261 + pos = 0;
18262 + if (cpy_num == 1)
18263 + bytes = cpy_bytes;
18264 + else
18265 + bytes = -1;
18267 + /*
18268 + * copy the first item, or part of it, or nothing to the end of
18269 + * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes))
18270 + */
18271 + i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes);
18272 + cpy_num -= i;
18273 + if (cpy_num == 0)
18274 + return i;
18275 + pos += i;
18276 + if (cpy_bytes == -1)
18277 + /*
18278 + * copy first cpy_num items starting from position
18279 + * 'pos' of SOURCE to end of DEST
18280 + */
18281 + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
18282 + pos, cpy_num);
18283 + else {
18284 + /*
18285 + * copy first cpy_num-1 items starting from position
18286 + * 'pos' of the SOURCE to the end of the DEST
18287 + */
18288 + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
18289 + pos, cpy_num - 1);
18291 + /*
18292 + * copy part of the item which number is
18293 + * cpy_num+pos-1 to the end of the DEST
18294 + */
18295 + leaf_item_bottle(dest_bi, src, FIRST_TO_LAST,
18296 + cpy_num + pos - 1, cpy_bytes);
18298 + } else {
18299 + /* copy items to right */
18300 + src_nr_item = B_NR_ITEMS(src);
18301 + if (cpy_num == 1)
18302 + bytes = cpy_bytes;
18303 + else
18304 + bytes = -1;
18306 + /*
18307 + * copy the last item, or part of it, or nothing to the
18308 + * begin of the DEST
18309 + * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes));
18310 + */
18311 + i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes);
18313 + cpy_num -= i;
18314 + if (cpy_num == 0)
18315 + return i;
18317 + pos = src_nr_item - cpy_num - i;
18318 + if (cpy_bytes == -1) {
18319 + /*
18320 + * starting from position 'pos' copy last cpy_num
18321 + * items of SOURCE to begin of DEST
18322 + */
18323 + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
18324 + pos, cpy_num);
18325 + } else {
18326 + /*
18327 + * copy last cpy_num-1 items starting from position
18328 + * 'pos+1' of the SOURCE to the begin of the DEST;
18329 + */
18330 + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
18331 + pos + 1, cpy_num - 1);
18333 + /*
18334 + * copy part of the item which number is pos to
18335 + * the begin of the DEST
18336 + */
18337 + leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos,
18338 + cpy_bytes);
18341 + return i;
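
The cpy_bytes convention above is easy to misread, so here is a standalone user-space sketch (not part of the patch) of how leaf_copy_items() splits the work: cpy_bytes == -1 means whole items only, anything else means cpy_num - 1 whole items plus a partial last one. The plain int lengths are simplified stand-ins for the on-disk item bodies.

#include <stdio.h>

static int bytes_to_copy(const int *item_len, int cpy_num, int cpy_bytes)
{
        int i, total = 0;

        if (cpy_bytes == -1) {
                for (i = 0; i < cpy_num; i++)
                        total += item_len[i];   /* whole items only */
        } else {
                for (i = 0; i < cpy_num - 1; i++)
                        total += item_len[i];   /* whole items ... */
                total += cpy_bytes;             /* ... plus a partial last one */
        }
        return total;
}

int main(void)
{
        int lens[] = { 100, 200, 300 };

        printf("%d\n", bytes_to_copy(lens, 3, -1)); /* 600: all whole */
        printf("%d\n", bytes_to_copy(lens, 3, 50)); /* 350: 100+200+50 */
        return 0;
}
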
18345 + * there are three types of copying: from S[0] to L[0], from S[0] to R[0],
18346 + * from R[0] to L[0]. for each of these we have to define parent and
18347 + * positions of destination and source buffers
18348 + */
18349 +static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
18350 + struct buffer_info *dest_bi,
18351 + struct buffer_info *src_bi,
18352 + int *first_last,
18353 + struct buffer_head *Snew)
18355 + memset(dest_bi, 0, sizeof(struct buffer_info));
18356 + memset(src_bi, 0, sizeof(struct buffer_info));
18358 + /* define dest, src, dest parent, dest position */
18359 + switch (shift_mode) {
18360 + case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */
18361 + src_bi->tb = tb;
18362 + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
18363 + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
18365 + /* src->b_item_order */
18366 + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
18367 + dest_bi->tb = tb;
18368 + dest_bi->bi_bh = tb->L[0];
18369 + dest_bi->bi_parent = tb->FL[0];
18370 + dest_bi->bi_position = get_left_neighbor_position(tb, 0);
18371 + *first_last = FIRST_TO_LAST;
18372 + break;
18374 + case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */
18375 + src_bi->tb = tb;
18376 + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
18377 + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
18378 + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
18379 + dest_bi->tb = tb;
18380 + dest_bi->bi_bh = tb->R[0];
18381 + dest_bi->bi_parent = tb->FR[0];
18382 + dest_bi->bi_position = get_right_neighbor_position(tb, 0);
18383 + *first_last = LAST_TO_FIRST;
18384 + break;
18386 + case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */
18387 + src_bi->tb = tb;
18388 + src_bi->bi_bh = tb->R[0];
18389 + src_bi->bi_parent = tb->FR[0];
18390 + src_bi->bi_position = get_right_neighbor_position(tb, 0);
18391 + dest_bi->tb = tb;
18392 + dest_bi->bi_bh = tb->L[0];
18393 + dest_bi->bi_parent = tb->FL[0];
18394 + dest_bi->bi_position = get_left_neighbor_position(tb, 0);
18395 + *first_last = FIRST_TO_LAST;
18396 + break;
18398 + case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */
18399 + src_bi->tb = tb;
18400 + src_bi->bi_bh = tb->L[0];
18401 + src_bi->bi_parent = tb->FL[0];
18402 + src_bi->bi_position = get_left_neighbor_position(tb, 0);
18403 + dest_bi->tb = tb;
18404 + dest_bi->bi_bh = tb->R[0];
18405 + dest_bi->bi_parent = tb->FR[0];
18406 + dest_bi->bi_position = get_right_neighbor_position(tb, 0);
18407 + *first_last = LAST_TO_FIRST;
18408 + break;
18410 + case LEAF_FROM_S_TO_SNEW:
18411 + src_bi->tb = tb;
18412 + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
18413 + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
18414 + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
18415 + dest_bi->tb = tb;
18416 + dest_bi->bi_bh = Snew;
18417 + dest_bi->bi_parent = NULL;
18418 + dest_bi->bi_position = 0;
18419 + *first_last = LAST_TO_FIRST;
18420 + break;
18422 + default:
18423 + reiserfs_panic(sb_from_bi(src_bi), "vs-10250",
18424 + "shift type is unknown (%d)", shift_mode);
18426 + RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh,
18427 + "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly",
18428 + shift_mode, src_bi->bi_bh, dest_bi->bi_bh);
18432 + * copy mov_num items and mov_bytes of the (mov_num-1)th item to
18433 + * neighbor. Delete them from source
18434 + */
18435 +int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
18436 + int mov_bytes, struct buffer_head *Snew)
18438 + int ret_value;
18439 + struct buffer_info dest_bi, src_bi;
18440 + int first_last;
18442 + leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi,
18443 + &first_last, Snew);
18445 + ret_value =
18446 + leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num,
18447 + mov_bytes);
18449 + leaf_delete_items(&src_bi, first_last,
18450 + (first_last ==
18451 + FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) -
18452 + mov_num), mov_num, mov_bytes);
18454 + return ret_value;
18458 + * Shift shift_num items (and shift_bytes of last shifted item if
18459 + * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key
18460 + */
18461 +int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
18463 + struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path);
18464 + int i;
18466 + /*
18467 + * move shift_num (and shift_bytes bytes) items from S[0]
18468 + * to left neighbor L[0]
18469 + */
18470 + i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL);
18472 + if (shift_num) {
18473 + /* number of items in S[0] == 0 */
18474 + if (B_NR_ITEMS(S0) == 0) {
18476 + RFALSE(shift_bytes != -1,
18477 + "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)",
18478 + shift_bytes);
18479 +#ifdef CONFIG_REISERFS_CHECK
18480 + if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) {
18481 + print_cur_tb("vs-10275");
18482 + reiserfs_panic(tb->tb_sb, "vs-10275",
18483 + "balance condition corrupted "
18484 + "(%c)", tb->tb_mode);
18486 +#endif
18488 + if (PATH_H_POSITION(tb->tb_path, 1) == 0)
18489 + replace_key(tb, tb->CFL[0], tb->lkey[0],
18490 + PATH_H_PPARENT(tb->tb_path, 0), 0);
18492 + } else {
18493 + /* replace lkey in CFL[0] by 0-th key from S[0]; */
18494 + replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0);
18496 + RFALSE((shift_bytes != -1 &&
18497 + !(is_direntry_le_ih(item_head(S0, 0))
18498 + && !ih_entry_count(item_head(S0, 0)))) &&
18499 + (!op_is_left_mergeable
18500 + (leaf_key(S0, 0), S0->b_size)),
18501 + "vs-10280: item must be mergeable");
18505 + return i;
18508 +/* CLEANING STOPPED HERE */
18511 + * Shift shift_num (shift_bytes) items from S[0] to the right neighbor,
18512 + * and replace the delimiting key
18513 + */
18514 +int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes)
18516 + int ret_value;
18518 + /*
18519 + * move shift_num (and shift_bytes) items from S[0] to
18520 + * right neighbor R[0]
18521 + */
18522 + ret_value =
18523 + leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL);
18525 + /* replace rkey in CFR[0] by the 0-th key from R[0] */
18526 + if (shift_num) {
18527 + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
18531 + return ret_value;
18534 +static void leaf_delete_items_entirely(struct buffer_info *bi,
18535 + int first, int del_num);
18537 + * If del_bytes == -1, delete del_num whole items from buffer CUR,
18538 + * starting at position 'first'.
18539 + * Otherwise:
18540 + * If last_first == 0, delete del_num-1 whole items starting at
18541 + * position 'first', then cut del_bytes from the body of the first
18542 + * remaining item; don't delete the first item header.
18543 + * If last_first == 1, delete del_num-1 whole items starting at
18544 + * position 'first+1', then cut del_bytes from the body of the last
18545 + * item; don't delete the last item header.
18547 +void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
18548 + int first, int del_num, int del_bytes)
18550 + struct buffer_head *bh;
18551 + int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh);
18553 + RFALSE(!bh, "10155: bh is not defined");
18554 + RFALSE(del_num < 0, "10160: del_num can not be < 0. del_num==%d",
18555 + del_num);
18556 + RFALSE(first < 0
18557 + || first + del_num > item_amount,
18558 + "10165: invalid number of first item to be deleted (%d) or "
18559 + "no so much items (%d) to delete (only %d)", first,
18560 + first + del_num, item_amount);
18562 + if (del_num == 0)
18563 + return;
18565 + if (first == 0 && del_num == item_amount && del_bytes == -1) {
18566 + make_empty_node(cur_bi);
18567 + do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0);
18568 + return;
18571 + if (del_bytes == -1)
18572 + /* delete del_num items beginning from item in position first */
18573 + leaf_delete_items_entirely(cur_bi, first, del_num);
18574 + else {
18575 + if (last_first == FIRST_TO_LAST) {
18576 + /*
18577 + * delete del_num-1 items beginning from
18578 + * item in position first
18579 + */
18580 + leaf_delete_items_entirely(cur_bi, first, del_num - 1);
18582 + /*
18583 + * delete the part of the first item of the bh
18584 + * do not delete item header
18585 + */
18586 + leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes);
18587 + } else {
18588 + struct item_head *ih;
18589 + int len;
18591 + /*
18592 + * delete del_num-1 items beginning from
18593 + * item in position first+1
18594 + */
18595 + leaf_delete_items_entirely(cur_bi, first + 1,
18596 + del_num - 1);
18598 + ih = item_head(bh, B_NR_ITEMS(bh) - 1);
18599 + if (is_direntry_le_ih(ih))
18600 + /* the last item is directory */
18601 + /*
18602 + * len = numbers of directory entries
18603 + * in this item
18604 + */
18605 + len = ih_entry_count(ih);
18606 + else
18607 + /* len = body len of item */
18608 + len = ih_item_len(ih);
18610 + /*
18611 + * delete the part of the last item of the bh
18612 + * do not delete item header
18613 + */
18614 + leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1,
18615 + len - del_bytes, del_bytes);
18620 +/* insert item into the leaf node in position before */
18621 +void leaf_insert_into_buf(struct buffer_info *bi, int before,
18622 + struct item_head * const inserted_item_ih,
18623 + const char * const inserted_item_body,
18624 + int zeros_number)
18626 + struct buffer_head *bh = bi->bi_bh;
18627 + int nr, free_space;
18628 + struct block_head *blkh;
18629 + struct item_head *ih;
18630 + int i;
18631 + int last_loc, unmoved_loc;
18632 + char *to;
18634 + blkh = B_BLK_HEAD(bh);
18635 + nr = blkh_nr_item(blkh);
18636 + free_space = blkh_free_space(blkh);
18638 + /* check free space */
18639 + RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE,
18640 + "vs-10170: not enough free space in block %z, new item %h",
18641 + bh, inserted_item_ih);
18642 + RFALSE(zeros_number > ih_item_len(inserted_item_ih),
18643 + "vs-10172: zero number == %d, item length == %d",
18644 + zeros_number, ih_item_len(inserted_item_ih));
18646 + /* get the item before which the new item must be inserted */
18647 + ih = item_head(bh, before);
18649 + /* prepare space for the body of new item */
18650 + last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size;
18651 + unmoved_loc = before ? ih_location(ih - 1) : bh->b_size;
18653 + memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih),
18654 + bh->b_data + last_loc, unmoved_loc - last_loc);
18656 + to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih);
18657 + memset(to, 0, zeros_number);
18658 + to += zeros_number;
18660 + /* copy body to prepared space */
18661 + if (inserted_item_body)
18662 + memmove(to, inserted_item_body,
18663 + ih_item_len(inserted_item_ih) - zeros_number);
18664 + else
18665 + memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number);
18667 + /* insert item header */
18668 + memmove(ih + 1, ih, IH_SIZE * (nr - before));
18669 + memmove(ih, inserted_item_ih, IH_SIZE);
18671 + /* change locations */
18672 + for (i = before; i < nr + 1; i++) {
18673 + unmoved_loc -= ih_item_len(&ih[i - before]);
18674 + put_ih_location(&ih[i - before], unmoved_loc);
18677 + /* sizes, free space, item number */
18678 + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
18679 + set_blkh_free_space(blkh,
18680 + free_space - (IH_SIZE +
18681 + ih_item_len(inserted_item_ih)));
18682 + do_balance_mark_leaf_dirty(bi->tb, bh, 1);
18684 + if (bi->bi_parent) {
18685 + struct disk_child *t_dc;
18686 + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
18687 + put_dc_size(t_dc,
18688 + dc_size(t_dc) + (IH_SIZE +
18689 + ih_item_len(inserted_item_ih)));
18690 + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
18695 + * paste paste_size bytes to affected_item_num-th item.
18696 + * When the item is a directory, this only prepares space for new entries
18697 + */
18698 +void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
18699 + int pos_in_item, int paste_size,
18700 + const char *body, int zeros_number)
18702 + struct buffer_head *bh = bi->bi_bh;
18703 + int nr, free_space;
18704 + struct block_head *blkh;
18705 + struct item_head *ih;
18706 + int i;
18707 + int last_loc, unmoved_loc;
18709 + blkh = B_BLK_HEAD(bh);
18710 + nr = blkh_nr_item(blkh);
18711 + free_space = blkh_free_space(blkh);
18713 + /* check free space */
18714 + RFALSE(free_space < paste_size,
18715 + "vs-10175: not enough free space: needed %d, available %d",
18716 + paste_size, free_space);
18718 +#ifdef CONFIG_REISERFS_CHECK
18719 + if (zeros_number > paste_size) {
18720 + struct super_block *sb = NULL;
18721 + if (bi && bi->tb)
18722 + sb = bi->tb->tb_sb;
18723 + print_cur_tb("10177");
18724 + reiserfs_panic(sb, "vs-10177",
18725 + "zeros_number == %d, paste_size == %d",
18726 + zeros_number, paste_size);
18728 +#endif /* CONFIG_REISERFS_CHECK */
18730 + /* item to be appended */
18731 + ih = item_head(bh, affected_item_num);
18733 + last_loc = ih_location(&ih[nr - affected_item_num - 1]);
18734 + unmoved_loc = affected_item_num ? ih_location(ih - 1) : bh->b_size;
18736 + /* prepare space */
18737 + memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc,
18738 + unmoved_loc - last_loc);
18740 + /* change locations */
18741 + for (i = affected_item_num; i < nr; i++)
18742 + put_ih_location(&ih[i - affected_item_num],
18743 + ih_location(&ih[i - affected_item_num]) -
18744 + paste_size);
18746 + if (body) {
18747 + if (!is_direntry_le_ih(ih)) {
18748 + if (!pos_in_item) {
18749 + /* shift data to right */
18750 + memmove(bh->b_data + ih_location(ih) +
18751 + paste_size,
18752 + bh->b_data + ih_location(ih),
18753 + ih_item_len(ih));
18754 + /* paste data in the head of item */
18755 + memset(bh->b_data + ih_location(ih), 0,
18756 + zeros_number);
18757 + memcpy(bh->b_data + ih_location(ih) +
18758 + zeros_number, body,
18759 + paste_size - zeros_number);
18760 + } else {
18761 + memset(bh->b_data + unmoved_loc - paste_size, 0,
18762 + zeros_number);
18763 + memcpy(bh->b_data + unmoved_loc - paste_size +
18764 + zeros_number, body,
18765 + paste_size - zeros_number);
18768 + } else
18769 + memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size);
18771 + put_ih_item_len(ih, ih_item_len(ih) + paste_size);
18773 + /* change free space */
18774 + set_blkh_free_space(blkh, free_space - paste_size);
18776 + do_balance_mark_leaf_dirty(bi->tb, bh, 0);
18778 + if (bi->bi_parent) {
18779 + struct disk_child *t_dc =
18780 + B_N_CHILD(bi->bi_parent, bi->bi_position);
18781 + put_dc_size(t_dc, dc_size(t_dc) + paste_size);
18782 + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
18787 + * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item
18788 + * does not have free space, so it moves DEHs and remaining records as
18789 + * necessary. Return value is size of removed part of directory item
18790 + * in bytes.
18791 + */
18792 +static int leaf_cut_entries(struct buffer_head *bh,
18793 + struct item_head *ih, int from, int del_count)
18795 + char *item;
18796 + struct reiserfs_de_head *deh;
18797 + int prev_record_offset; /* offset of the (from-1)th record */
18798 + char *prev_record; /* pointer to that record */
18799 + int cut_records_len; /* length of all removed records */
18800 + int i;
18802 + /*
18803 + * make sure that item is directory and there are enough entries to
18804 + * remove
18805 + */
18806 + RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item");
18807 + RFALSE(ih_entry_count(ih) < from + del_count,
18808 + "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d",
18809 + ih_entry_count(ih), from, del_count);
18811 + if (del_count == 0)
18812 + return 0;
18814 + /* first byte of item */
18815 + item = bh->b_data + ih_location(ih);
18817 + /* entry head array */
18818 + deh = B_I_DEH(bh, ih);
18820 + /*
18821 + * first byte of remaining entries, those are BEFORE cut entries
18822 + * (prev_record) and length of all removed records (cut_records_len)
18823 + */
18824 + prev_record_offset =
18825 + (from ? deh_location(&deh[from - 1]) : ih_item_len(ih));
18826 + cut_records_len = prev_record_offset /*from_record */ -
18827 + deh_location(&deh[from + del_count - 1]);
18828 + prev_record = item + prev_record_offset;
18830 + /* adjust locations of remaining entries */
18831 + for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--)
18832 + put_deh_location(&deh[i],
18833 + deh_location(&deh[i]) -
18834 + (DEH_SIZE * del_count));
18836 + for (i = 0; i < from; i++)
18837 + put_deh_location(&deh[i],
18838 + deh_location(&deh[i]) - (DEH_SIZE * del_count +
18839 + cut_records_len));
18841 + put_ih_entry_count(ih, ih_entry_count(ih) - del_count);
18843 + /* shift the entry head array and the entries that are AFTER the removed entries */
18844 + memmove((char *)(deh + from),
18845 + deh + from + del_count,
18846 + prev_record - cut_records_len - (char *)(deh + from +
18847 + del_count));
18849 + /* shift the records that are BEFORE the removed entries */
18850 + memmove(prev_record - cut_records_len - DEH_SIZE * del_count,
18851 + prev_record, item + ih_item_len(ih) - prev_record);
18853 + return DEH_SIZE * del_count + cut_records_len;
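
The return value of leaf_cut_entries() is just header bytes plus record bytes. A standalone arithmetic sketch with made-up locations (DEH_SIZE here is an assumed stand-in); record locations descend from the item end, so the cut span is the (from-1)th record's offset minus the location of the last cut entry.

#include <stdio.h>

#define DEH_SIZE 16                     /* assumed size of one entry head */

int main(void)
{
        /* deh_location of each entry's record, descending */
        int loc[] = { 200, 160, 120, 80 };
        int item_len = 240, from = 1, del_count = 2;

        int prev = from ? loc[from - 1] : item_len;             /* 200 */
        int cut_records_len = prev - loc[from + del_count - 1]; /* 80 */

        printf("removed %d bytes\n", DEH_SIZE * del_count + cut_records_len);
        return 0;
}
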
18857 + * when cut item is part of regular file
18858 + * pos_in_item - first byte that must be cut
18859 + * cut_size - number of bytes to be cut beginning from pos_in_item
18861 + * when cut item is part of directory
18862 + * pos_in_item - number of first deleted entry
18863 + * cut_size - count of deleted entries
18864 + */
18865 +void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
18866 + int pos_in_item, int cut_size)
18868 + int nr;
18869 + struct buffer_head *bh = bi->bi_bh;
18870 + struct block_head *blkh;
18871 + struct item_head *ih;
18872 + int last_loc, unmoved_loc;
18873 + int i;
18875 + blkh = B_BLK_HEAD(bh);
18876 + nr = blkh_nr_item(blkh);
18878 + /* item head of truncated item */
18879 + ih = item_head(bh, cut_item_num);
18881 + if (is_direntry_le_ih(ih)) {
18882 + /* first cut entry () */
18883 + cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size);
18884 + if (pos_in_item == 0) {
18885 + /* change key */
18886 + RFALSE(cut_item_num,
18887 + "when 0-th enrty of item is cut, that item must be first in the node, not %d-th",
18888 + cut_item_num);
18889 + /* change item key by key of first entry in the item */
18890 + set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih)));
18892 + } else {
18893 + /* item is direct or indirect */
18894 + RFALSE(is_statdata_le_ih(ih), "10195: item is stat data");
18895 + RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih),
18896 + "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)",
18897 + (long unsigned)pos_in_item, (long unsigned)cut_size,
18898 + (long unsigned)ih_item_len(ih));
18900 + /* shift item body to left if cut is from the head of item */
18901 + if (pos_in_item == 0) {
18902 + memmove(bh->b_data + ih_location(ih),
18903 + bh->b_data + ih_location(ih) + cut_size,
18904 + ih_item_len(ih) - cut_size);
18906 + /* change key of item */
18907 + if (is_direct_le_ih(ih))
18908 + set_le_ih_k_offset(ih,
18909 + le_ih_k_offset(ih) +
18910 + cut_size);
18911 + else {
18912 + set_le_ih_k_offset(ih,
18913 + le_ih_k_offset(ih) +
18914 + (cut_size / UNFM_P_SIZE) *
18915 + bh->b_size);
18916 + RFALSE(ih_item_len(ih) == cut_size
18917 + && get_ih_free_space(ih),
18918 + "10205: invalid ih_free_space (%h)", ih);
18923 + /* location of the last item */
18924 + last_loc = ih_location(&ih[nr - cut_item_num - 1]);
18926 + /* location of the item, which is remaining at the same place */
18927 + unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size;
18929 + /* shift */
18930 + memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc,
18931 + unmoved_loc - last_loc - cut_size);
18933 + /* change item length */
18934 + put_ih_item_len(ih, ih_item_len(ih) - cut_size);
18936 + if (is_indirect_le_ih(ih)) {
18937 + if (pos_in_item)
18938 + set_ih_free_space(ih, 0);
18941 + /* change locations */
18942 + for (i = cut_item_num; i < nr; i++)
18943 + put_ih_location(&ih[i - cut_item_num],
18944 + ih_location(&ih[i - cut_item_num]) + cut_size);
18946 + /* size, free space */
18947 + set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size);
18949 + do_balance_mark_leaf_dirty(bi->tb, bh, 0);
18951 + if (bi->bi_parent) {
18952 + struct disk_child *t_dc;
18953 + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
18954 + put_dc_size(t_dc, dc_size(t_dc) - cut_size);
18955 + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
18959 +/* delete del_num items from buffer starting from the first'th item */
18960 +static void leaf_delete_items_entirely(struct buffer_info *bi,
18961 + int first, int del_num)
18963 + struct buffer_head *bh = bi->bi_bh;
18964 + int nr;
18965 + int i, j;
18966 + int last_loc, last_removed_loc;
18967 + struct block_head *blkh;
18968 + struct item_head *ih;
18970 + RFALSE(bh == NULL, "10210: buffer is 0");
18971 + RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num);
18973 + if (del_num == 0)
18974 + return;
18976 + blkh = B_BLK_HEAD(bh);
18977 + nr = blkh_nr_item(blkh);
18979 + RFALSE(first < 0 || first + del_num > nr,
18980 + "10220: first=%d, number=%d, there is %d items", first, del_num,
18981 + nr);
18983 + if (first == 0 && del_num == nr) {
18984 + /* this does not work */
18985 + make_empty_node(bi);
18987 + do_balance_mark_leaf_dirty(bi->tb, bh, 0);
18988 + return;
18991 + ih = item_head(bh, first);
18993 + /* location of unmovable item */
18994 + j = (first == 0) ? bh->b_size : ih_location(ih - 1);
18996 + /* delete items */
18997 + last_loc = ih_location(&ih[nr - 1 - first]);
18998 + last_removed_loc = ih_location(&ih[del_num - 1]);
19000 + memmove(bh->b_data + last_loc + j - last_removed_loc,
19001 + bh->b_data + last_loc, last_removed_loc - last_loc);
19003 + /* delete item headers */
19004 + memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE);
19006 + /* change item location */
19007 + for (i = first; i < nr - del_num; i++)
19008 + put_ih_location(&ih[i - first],
19009 + ih_location(&ih[i - first]) + (j -
19010 + last_removed_loc));
19012 + /* sizes, item number */
19013 + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
19014 + set_blkh_free_space(blkh,
19015 + blkh_free_space(blkh) + (j - last_removed_loc +
19016 + IH_SIZE * del_num));
19018 + do_balance_mark_leaf_dirty(bi->tb, bh, 0);
19020 + if (bi->bi_parent) {
19021 + struct disk_child *t_dc =
19022 + B_N_CHILD(bi->bi_parent, bi->bi_position);
19023 + put_dc_size(t_dc,
19024 + dc_size(t_dc) - (j - last_removed_loc +
19025 + IH_SIZE * del_num));
19026 + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
19031 + * paste new_entry_count entries (new_dehs, records) into position
19032 + * 'before' of the item_num-th item
19033 + */
19034 +void leaf_paste_entries(struct buffer_info *bi,
19035 + int item_num,
19036 + int before,
19037 + int new_entry_count,
19038 + struct reiserfs_de_head *new_dehs,
19039 + const char *records, int paste_size)
19041 + struct item_head *ih;
19042 + char *item;
19043 + struct reiserfs_de_head *deh;
19044 + char *insert_point;
19045 + int i;
19046 + struct buffer_head *bh = bi->bi_bh;
19048 + if (new_entry_count == 0)
19049 + return;
19051 + ih = item_head(bh, item_num);
19053 + /*
19054 + * make sure, that item is directory, and there are enough
19055 + * records in it
19056 + */
19057 + RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item");
19058 + RFALSE(ih_entry_count(ih) < before,
19059 + "10230: there are no entry we paste entries before. entry_count = %d, before = %d",
19060 + ih_entry_count(ih), before);
19062 + /* first byte of dest item */
19063 + item = bh->b_data + ih_location(ih);
19065 + /* entry head array */
19066 + deh = B_I_DEH(bh, ih);
19068 + /* new records will be pasted at this point */
19069 + insert_point =
19070 + item +
19071 + (before ? deh_location(&deh[before - 1])
19072 + : (ih_item_len(ih) - paste_size));
19074 + /* adjust locations of records that will be AFTER new records */
19075 + for (i = ih_entry_count(ih) - 1; i >= before; i--)
19076 + put_deh_location(&deh[i],
19077 + deh_location(&deh[i]) +
19078 + (DEH_SIZE * new_entry_count));
19080 + /* adjust locations of records that will be BEFORE new records */
19081 + for (i = 0; i < before; i++)
19082 + put_deh_location(&deh[i],
19083 + deh_location(&deh[i]) + paste_size);
19085 + put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count);
19087 + /* prepare space for pasted records */
19088 + memmove(insert_point + paste_size, insert_point,
19089 + item + (ih_item_len(ih) - paste_size) - insert_point);
19091 + /* copy new records */
19092 + memcpy(insert_point + DEH_SIZE * new_entry_count, records,
19093 + paste_size - DEH_SIZE * new_entry_count);
19095 + /* prepare space for new entry heads */
19096 + deh += before;
19097 + memmove((char *)(deh + new_entry_count), deh,
19098 + insert_point - (char *)deh);
19100 + /* copy new entry heads */
19101 + deh = (struct reiserfs_de_head *)((char *)deh);
19102 + memcpy(deh, new_dehs, DEH_SIZE * new_entry_count);
19104 + /* set locations of new records */
19105 + for (i = 0; i < new_entry_count; i++) {
19106 + put_deh_location(&deh[i],
19107 + deh_location(&deh[i]) +
19108 + (-deh_location
19109 + (&new_dehs[new_entry_count - 1]) +
19110 + insert_point + DEH_SIZE * new_entry_count -
19111 + item));
19114 + /* change the item key if necessary (when we paste before the 0-th entry) */
19115 + if (!before) {
19116 + set_le_ih_k_offset(ih, deh_offset(new_dehs));
19118 +#ifdef CONFIG_REISERFS_CHECK
19120 + int prev, next;
19121 + /* check record locations */
19122 + deh = B_I_DEH(bh, ih);
19123 + for (i = 0; i < ih_entry_count(ih); i++) {
19124 + next =
19125 + (i <
19126 + ih_entry_count(ih) -
19127 + 1) ? deh_location(&deh[i + 1]) : 0;
19128 + prev = (i != 0) ? deh_location(&deh[i - 1]) : 0;
19130 + if (prev && prev <= deh_location(&deh[i]))
19131 + reiserfs_error(sb_from_bi(bi), "vs-10240",
19132 + "directory item (%h) "
19133 + "corrupted (prev %a, "
19134 + "cur(%d) %a)",
19135 + ih, deh + i - 1, i, deh + i);
19136 + if (next && next >= deh_location(&deh[i]))
19137 + reiserfs_error(sb_from_bi(bi), "vs-10250",
19138 + "directory item (%h) "
19139 + "corrupted (cur(%d) %a, "
19140 + "next %a)",
19141 + ih, i, deh + i, deh + i + 1);
19144 +#endif
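
A standalone sketch of the two location-adjustment loops in leaf_paste_entries(): entries after the paste point move only by the inserted entry heads, entries before it by the full paste_size, since the records sit at the tail of the item. All values below are made up.

#include <stdio.h>

#define DEH_SIZE 16                     /* assumed entry-head size */

int main(void)
{
        int loc[] = { 200, 160, 120 };  /* deh_location, descending */
        int before = 1, new_count = 1, rec_len = 24;
        int paste_size = DEH_SIZE * new_count + rec_len;
        int i;

        for (i = 2; i >= before; i--)   /* entries after the new ones */
                loc[i] += DEH_SIZE * new_count;
        for (i = 0; i < before; i++)    /* entries before the new ones */
                loc[i] += paste_size;

        for (i = 0; i < 3; i++)
                printf("loc[%d]=%d\n", i, loc[i]); /* 240, 176, 136 */
        return 0;
}
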
19147 diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
19148 new file mode 100644
19149 index 000000000000..46bd7bd63a71
19150 --- /dev/null
19151 +++ b/fs/reiserfs/lock.c
19152 @@ -0,0 +1,101 @@
19153 +// SPDX-License-Identifier: GPL-2.0
19154 +#include "reiserfs.h"
19155 +#include <linux/mutex.h>
19158 + * The previous reiserfs locking scheme was heavily based on
19159 + * the tricky properties of the Bkl:
19161 + * - it was acquired recursively by the same task
19162 + * - performance relied on the release-while-schedule() property
19164 + * Now that we replace it by a mutex, we still want to keep the same
19165 + * recursive property to avoid big changes in the code structure.
19166 + * We use our own lock_owner here because the owner field on a mutex
19167 + * is only available in SMP or mutex debugging, also we only need this field
19168 + * for this mutex, no need for a system wide mutex facility.
19170 + * Also this lock is often released before a call that could block because
19171 + * reiserfs performance was partially based on the release-while-schedule()
19172 + * property of the Bkl.
19173 + */
19174 +void reiserfs_write_lock(struct super_block *s)
19176 + struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
19178 + if (sb_i->lock_owner != current) {
19179 + mutex_lock(&sb_i->lock);
19180 + sb_i->lock_owner = current;
19183 + /* No need to protect it, only the current task touches it */
19184 + sb_i->lock_depth++;
19187 +void reiserfs_write_unlock(struct super_block *s)
19189 + struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
19191 + /*
19192 + * Are we unlocking without even holding the lock?
19193 + * Such a situation must raise a BUG() if we don't want
19194 + * to corrupt the data.
19195 + */
19196 + BUG_ON(sb_i->lock_owner != current);
19198 + if (--sb_i->lock_depth == -1) {
19199 + sb_i->lock_owner = NULL;
19200 + mutex_unlock(&sb_i->lock);
19204 +int __must_check reiserfs_write_unlock_nested(struct super_block *s)
19206 + struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
19207 + int depth;
19209 + /* this can happen when the lock isn't always held */
19210 + if (sb_i->lock_owner != current)
19211 + return -1;
19213 + depth = sb_i->lock_depth;
19215 + sb_i->lock_depth = -1;
19216 + sb_i->lock_owner = NULL;
19217 + mutex_unlock(&sb_i->lock);
19219 + return depth;
19222 +void reiserfs_write_lock_nested(struct super_block *s, int depth)
19224 + struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
19226 + /* this can happen when the lock isn't always held */
19227 + if (depth == -1)
19228 + return;
19230 + mutex_lock(&sb_i->lock);
19231 + sb_i->lock_owner = current;
19232 + sb_i->lock_depth = depth;
19236 + * Utility function to warn if it is called without the superblock
19237 + * write lock held. caller is the string naming the call site
19238 + */
19239 +void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
19241 + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
19243 + WARN_ON(sb_i->lock_depth < 0);
19246 +#ifdef CONFIG_REISERFS_CHECK
19247 +void reiserfs_lock_check_recursive(struct super_block *sb)
19249 + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
19251 + WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n");
19253 +#endif
19254 diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
19255 new file mode 100644
19256 index 000000000000..7e7b531fcc49
19257 --- /dev/null
19258 +++ b/fs/reiserfs/namei.c
19259 @@ -0,0 +1,1725 @@
19261 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
19263 + * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility
19265 + * Trivial Changes:
19266 + * Rights granted to Hans Reiser to redistribute under other terms providing
19267 + * he accepts all liability including but not limited to patent, fitness
19268 + * for purpose, and direct or indirect claims arising from failure to perform.
19270 + * NO WARRANTY
19271 + */
19273 +#include <linux/time.h>
19274 +#include <linux/bitops.h>
19275 +#include <linux/slab.h>
19276 +#include "reiserfs.h"
19277 +#include "acl.h"
19278 +#include "xattr.h"
19279 +#include <linux/quotaops.h>
19281 +#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); }
19282 +#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
19285 + * directory item contains array of entry headers. This performs
19286 + * binary search through that array
19287 + */
19288 +static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
19290 + struct item_head *ih = de->de_ih;
19291 + struct reiserfs_de_head *deh = de->de_deh;
19292 + int rbound, lbound, j;
19294 + lbound = 0;
19295 + rbound = ih_entry_count(ih) - 1;
19297 + for (j = (rbound + lbound) / 2; lbound <= rbound;
19298 + j = (rbound + lbound) / 2) {
19299 + if (off < deh_offset(deh + j)) {
19300 + rbound = j - 1;
19301 + continue;
19303 + if (off > deh_offset(deh + j)) {
19304 + lbound = j + 1;
19305 + continue;
19307 + /* the name was not found, but the third key component matched */
19308 + de->de_entry_num = j;
19309 + return NAME_FOUND;
19312 + de->de_entry_num = lbound;
19313 + return NAME_NOT_FOUND;
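
A standalone sketch of the search above: entry offsets inside a directory item are sorted ascending, and on a miss lbound is left at the slot where the searched-for offset would be inserted, which is exactly what de->de_entry_num receives.

#include <stdio.h>

static int bin_search(const int *offs, int count, int off, int *pos)
{
        int lbound = 0, rbound = count - 1, j;

        for (j = (rbound + lbound) / 2; lbound <= rbound;
             j = (rbound + lbound) / 2) {
                if (off < offs[j]) {
                        rbound = j - 1;
                        continue;
                }
                if (off > offs[j]) {
                        lbound = j + 1;
                        continue;
                }
                *pos = j;       /* exact match on the offset */
                return 1;       /* NAME_FOUND analogue */
        }
        *pos = lbound;          /* insertion slot on a miss */
        return 0;               /* NAME_NOT_FOUND analogue */
}

int main(void)
{
        int offs[] = { 8, 16, 32, 64 };
        int pos;

        printf("%d %d\n", bin_search(offs, 4, 32, &pos), pos); /* 1 2 */
        printf("%d %d\n", bin_search(offs, 4, 20, &pos), pos); /* 0 2 */
        return 0;
}
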
19317 + * set the fields of de to point at what the path points to
19318 + */
19319 +static inline void set_de_item_location(struct reiserfs_dir_entry *de,
19320 + struct treepath *path)
19322 + de->de_bh = get_last_bh(path);
19323 + de->de_ih = tp_item_head(path);
19324 + de->de_deh = B_I_DEH(de->de_bh, de->de_ih);
19325 + de->de_item_num = PATH_LAST_POSITION(path);
19329 + * de_bh, de_ih, de_deh (points to first element of array), de_item_num is set
19330 + */
19331 +inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de)
19333 + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
19335 + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
19337 + de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num);
19338 + de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0);
19339 + de->de_name = ih_item_body(de->de_bh, de->de_ih) + deh_location(deh);
19340 + if (de->de_name[de->de_namelen - 1] == 0)
19341 + de->de_namelen = strlen(de->de_name);
19344 +/* what entry points to */
19345 +static inline void set_de_object_key(struct reiserfs_dir_entry *de)
19347 + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
19348 + de->de_dir_id = deh_dir_id(&de->de_deh[de->de_entry_num]);
19349 + de->de_objectid = deh_objectid(&de->de_deh[de->de_entry_num]);
19352 +static inline void store_de_entry_key(struct reiserfs_dir_entry *de)
19354 + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
19356 + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
19358 + /* store key of the found entry */
19359 + de->de_entry_key.version = KEY_FORMAT_3_5;
19360 + de->de_entry_key.on_disk_key.k_dir_id =
19361 + le32_to_cpu(de->de_ih->ih_key.k_dir_id);
19362 + de->de_entry_key.on_disk_key.k_objectid =
19363 + le32_to_cpu(de->de_ih->ih_key.k_objectid);
19364 + set_cpu_key_k_offset(&de->de_entry_key, deh_offset(deh));
19365 + set_cpu_key_k_type(&de->de_entry_key, TYPE_DIRENTRY);
19369 + * We assign a key to each directory item, and place multiple entries in a
19370 + * single directory item. A directory item has a key equal to the key of
19371 + * the first directory entry in it.
19373 + * This function first calls search_by_key, then, if item whose first entry
19374 + * matches is not found it looks for the entry inside directory item found
19375 + * by search_by_key. Fills the path to the entry, and to the entry position
19376 + * in the item
19377 + */
19378 +/* The function is NOT SCHEDULE-SAFE! */
19379 +int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
19380 + struct treepath *path, struct reiserfs_dir_entry *de)
19382 + int retval;
19384 + retval = search_item(sb, key, path);
19385 + switch (retval) {
19386 + case ITEM_NOT_FOUND:
19387 + if (!PATH_LAST_POSITION(path)) {
19388 + reiserfs_error(sb, "vs-7000", "search_by_key "
19389 + "returned item position == 0");
19390 + pathrelse(path);
19391 + return IO_ERROR;
19393 + PATH_LAST_POSITION(path)--;
19394 + break;
19396 + case ITEM_FOUND:
19397 + break;
19399 + case IO_ERROR:
19400 + return retval;
19402 + default:
19403 + pathrelse(path);
19404 + reiserfs_error(sb, "vs-7002", "no path to here");
19405 + return IO_ERROR;
19408 + set_de_item_location(de, path);
19410 +#ifdef CONFIG_REISERFS_CHECK
19411 + if (!is_direntry_le_ih(de->de_ih) ||
19412 + COMP_SHORT_KEYS(&de->de_ih->ih_key, key)) {
19413 + print_block(de->de_bh, 0, -1, -1);
19414 + reiserfs_panic(sb, "vs-7005", "found item %h is not directory "
19415 + "item or does not belong to the same directory "
19416 + "as key %K", de->de_ih, key);
19418 +#endif /* CONFIG_REISERFS_CHECK */
19420 + /*
19421 + * binary search in directory item by third component of the
19422 + * key. sets de->de_entry_num of de
19423 + */
19424 + retval = bin_search_in_dir_item(de, cpu_key_k_offset(key));
19425 + path->pos_in_item = de->de_entry_num;
19426 + if (retval != NAME_NOT_FOUND) {
19427 + /*
19428 + * ugly, but rename needs de_bh, de_deh, de_name,
19429 + * de_namelen, de_objectid set
19430 + */
19431 + set_de_name_and_namelen(de);
19432 + set_de_object_key(de);
19434 + return retval;
19437 +/* Keyed 32-bit hash function using TEA in a Davies-Meyer construction */
19440 + * The third component is hashed, and you can choose from more than
19441 + * one hash function. Per directory hashes are not yet implemented
19442 + * but are thought about. This function should be moved to hashes.c
19443 + * Jedi, please do so. -Hans
19444 + */
19445 +static __u32 get_third_component(struct super_block *s,
19446 + const char *name, int len)
19448 + __u32 res;
19450 + if (!len || (len == 1 && name[0] == '.'))
19451 + return DOT_OFFSET;
19452 + if (len == 2 && name[0] == '.' && name[1] == '.')
19453 + return DOT_DOT_OFFSET;
19455 + res = REISERFS_SB(s)->s_hash_function(name, len);
19457 + /* take bits from 7-th to 30-th including both bounds */
19458 + res = GET_HASH_VALUE(res);
19459 + if (res == 0)
19460 + /*
19461 + * needed so that no name sorts before "." and "..", which have hash
19462 + * value == 0 and generation counters 1 and 2 respectively
19463 + */
19464 + res = 128;
19465 + return res + MAX_GENERATION_NUMBER;
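
A standalone sketch of how a directory offset packs the hash with a generation number, inferred from the "bits 7..30" comment above and the MAX_GENERATION_NUMBER arithmetic; the exact masks live in the reiserfs headers, so treat these as assumptions.

#include <stdio.h>
#include <stdint.h>

#define MAX_GEN            127              /* 7 low bits, assumed */
#define HASH_MASK          0x7fffff80u      /* bits 7..30, assumed */
#define GET_HASH(off)      ((off) & HASH_MASK)
#define GET_GEN(off)       ((off) & MAX_GEN)
#define SET_GEN(off, gen)  (GET_HASH(off) | (gen))

int main(void)
{
        uint32_t h = 0x12345678u & HASH_MASK; /* hashed name, low bits cleared */
        uint32_t off = SET_GEN(h, 5);         /* 6th name with this hash */

        printf("hash=%#x gen=%u\n", (unsigned)GET_HASH(off),
               (unsigned)GET_GEN(off));
        return 0;
}
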
19468 +static int reiserfs_match(struct reiserfs_dir_entry *de,
19469 + const char *name, int namelen)
19471 + int retval = NAME_NOT_FOUND;
19473 + if ((namelen == de->de_namelen) &&
19474 + !memcmp(de->de_name, name, de->de_namelen))
19475 + retval =
19476 + (de_visible(de->de_deh + de->de_entry_num) ? NAME_FOUND :
19477 + NAME_FOUND_INVISIBLE);
19479 + return retval;
19482 +/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */
19484 +/* used when hash collisions exist */
19486 +static int linear_search_in_dir_item(struct cpu_key *key,
19487 + struct reiserfs_dir_entry *de,
19488 + const char *name, int namelen)
19490 + struct reiserfs_de_head *deh = de->de_deh;
19491 + int retval;
19492 + int i;
19494 + i = de->de_entry_num;
19496 + if (i == ih_entry_count(de->de_ih) ||
19497 + GET_HASH_VALUE(deh_offset(deh + i)) !=
19498 + GET_HASH_VALUE(cpu_key_k_offset(key))) {
19499 + i--;
19502 + RFALSE(de->de_deh != B_I_DEH(de->de_bh, de->de_ih),
19503 + "vs-7010: array of entry headers not found");
19505 + deh += i;
19507 + for (; i >= 0; i--, deh--) {
19508 + /* hash value does not match, no need to check whole name */
19509 + if (GET_HASH_VALUE(deh_offset(deh)) !=
19510 + GET_HASH_VALUE(cpu_key_k_offset(key))) {
19511 + return NAME_NOT_FOUND;
19514 + /* mark that this generation number is used */
19515 + if (de->de_gen_number_bit_string)
19516 + set_bit(GET_GENERATION_NUMBER(deh_offset(deh)),
19517 + de->de_gen_number_bit_string);
19519 + /* calculate pointer to name and namelen */
19520 + de->de_entry_num = i;
19521 + set_de_name_and_namelen(de);
19523 + /*
19524 + * de's de_name, de_namelen, de_recordlen are set.
19525 + * Fill the rest.
19526 + */
19527 + if ((retval =
19528 + reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) {
19530 + /* key of pointed object */
19531 + set_de_object_key(de);
19533 + store_de_entry_key(de);
19535 + /* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */
19536 + return retval;
19540 + if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0)
19541 + /*
19542 + * we have reached the leftmost entry in the node. In general we
19543 + * have to go to the left neighbor, but if generation counter
19544 + * is 0 already, we know for sure, that there is no name with
19545 + * the same hash value
19546 + */
19547 + /*
19548 + * FIXME: this works correctly only because the hash value can not
19549 + * be 0. Btw, in case of Yura's hash it is probably possible,
19550 + * so, this is a bug
19551 + */
19552 + return NAME_NOT_FOUND;
19554 + RFALSE(de->de_item_num,
19555 + "vs-7015: two diritems of the same directory in one node?");
19557 + return GOTO_PREVIOUS_ITEM;
19561 + * may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND
19562 + * FIXME: should add something like IOERROR
19563 + */
19564 +static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen,
19565 + struct treepath *path_to_entry,
19566 + struct reiserfs_dir_entry *de)
19568 + struct cpu_key key_to_search;
19569 + int retval;
19571 + if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize))
19572 + return NAME_NOT_FOUND;
19574 + /* we will search for this key in the tree */
19575 + make_cpu_key(&key_to_search, dir,
19576 + get_third_component(dir->i_sb, name, namelen),
19577 + TYPE_DIRENTRY, 3);
19579 + while (1) {
19580 + retval =
19581 + search_by_entry_key(dir->i_sb, &key_to_search,
19582 + path_to_entry, de);
19583 + if (retval == IO_ERROR) {
19584 + reiserfs_error(dir->i_sb, "zam-7001", "io error");
19585 + return IO_ERROR;
19588 + /* compare names for all entries having given hash value */
19589 + retval =
19590 + linear_search_in_dir_item(&key_to_search, de, name,
19591 + namelen);
19592 + /*
19593 + * there is no need to scan directory anymore.
19594 + * Given entry found or does not exist
19595 + */
19596 + if (retval != GOTO_PREVIOUS_ITEM) {
19597 + path_to_entry->pos_in_item = de->de_entry_num;
19598 + return retval;
19601 + /*
19602 + * there is left neighboring item of this directory
19603 + * and given entry can be there
19604 + */
19605 + set_cpu_key_k_offset(&key_to_search,
19606 + le_ih_k_offset(de->de_ih) - 1);
19607 + pathrelse(path_to_entry);
19609 + } /* while (1) */
19612 +static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
19613 + unsigned int flags)
19615 + int retval;
19616 + struct inode *inode = NULL;
19617 + struct reiserfs_dir_entry de;
19618 + INITIALIZE_PATH(path_to_entry);
19620 + if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
19621 + return ERR_PTR(-ENAMETOOLONG);
19623 + reiserfs_write_lock(dir->i_sb);
19625 + de.de_gen_number_bit_string = NULL;
19626 + retval =
19627 + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
19628 + &path_to_entry, &de);
19629 + pathrelse(&path_to_entry);
19630 + if (retval == NAME_FOUND) {
19631 + inode = reiserfs_iget(dir->i_sb,
19632 + (struct cpu_key *)&de.de_dir_id);
19633 + if (!inode || IS_ERR(inode)) {
19634 + reiserfs_write_unlock(dir->i_sb);
19635 + return ERR_PTR(-EACCES);
19638 + /*
19639 + * Propagate the private flag so we know we're
19640 + * in the priv tree. Also clear xattr support
19641 + * since we don't have xattrs on xattr files.
19642 + */
19643 + if (IS_PRIVATE(dir))
19644 + reiserfs_init_priv_inode(inode);
19646 + reiserfs_write_unlock(dir->i_sb);
19647 + if (retval == IO_ERROR) {
19648 + return ERR_PTR(-EIO);
19651 + return d_splice_alias(inode, dentry);
19655 + * looks up the dentry of the parent directory for child.
19656 + * taken from ext2_get_parent
19657 + */
19658 +struct dentry *reiserfs_get_parent(struct dentry *child)
19660 + int retval;
19661 + struct inode *inode = NULL;
19662 + struct reiserfs_dir_entry de;
19663 + INITIALIZE_PATH(path_to_entry);
19664 + struct inode *dir = d_inode(child);
19666 + if (dir->i_nlink == 0) {
19667 + return ERR_PTR(-ENOENT);
19669 + de.de_gen_number_bit_string = NULL;
19671 + reiserfs_write_lock(dir->i_sb);
19672 + retval = reiserfs_find_entry(dir, "..", 2, &path_to_entry, &de);
19673 + pathrelse(&path_to_entry);
19674 + if (retval != NAME_FOUND) {
19675 + reiserfs_write_unlock(dir->i_sb);
19676 + return ERR_PTR(-ENOENT);
19678 + inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&de.de_dir_id);
19679 + reiserfs_write_unlock(dir->i_sb);
19681 + return d_obtain_alias(inode);
19684 +/* add entry to the directory (entry can be hidden).
19686 +insert definition of when hidden directories are used here -Hans
19688 + Does not mark the dir inode dirty; do that after a successful call to it */
19690 +static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
19691 + struct inode *dir, const char *name, int namelen,
19692 + struct inode *inode, int visible)
19694 + struct cpu_key entry_key;
19695 + struct reiserfs_de_head *deh;
19696 + INITIALIZE_PATH(path);
19697 + struct reiserfs_dir_entry de;
19698 + DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1);
19699 + int gen_number;
19701 + /*
19702 + * 48 bytes now and we avoid kmalloc if we
19703 + * create a file with a short name
19704 + */
19705 + char small_buf[32 + DEH_SIZE];
19707 + char *buffer;
19708 + int buflen, paste_size;
19709 + int retval;
19711 + BUG_ON(!th->t_trans_id);
19713 + /* each entry has unique key. compose it */
19714 + make_cpu_key(&entry_key, dir,
19715 + get_third_component(dir->i_sb, name, namelen),
19716 + TYPE_DIRENTRY, 3);
19718 + /* get memory for composing the entry */
19719 + buflen = DEH_SIZE + ROUND_UP(namelen);
19720 + if (buflen > sizeof(small_buf)) {
19721 + buffer = kmalloc(buflen, GFP_NOFS);
19722 + if (!buffer)
19723 + return -ENOMEM;
19724 + } else
19725 + buffer = small_buf;
19727 + paste_size =
19728 + (get_inode_sd_version(dir) ==
19729 + STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen;
19731 + /*
19732 + * fill buffer : directory entry head, name[, dir objectid | ,
19733 + * stat data | ,stat data, dir objectid ]
19734 + */
19735 + deh = (struct reiserfs_de_head *)buffer;
19736 + deh->deh_location = 0; /* JDM Endian safe if 0 */
19737 + put_deh_offset(deh, cpu_key_k_offset(&entry_key));
19738 + deh->deh_state = 0; /* JDM Endian safe if 0 */
19739 + /* put key (ino analog) to de */
19741 + /* safe: k_dir_id is le */
19742 + deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id;
19743 + /* safe: k_objectid is le */
19744 + deh->deh_objectid = INODE_PKEY(inode)->k_objectid;
19746 + /* copy name */
19747 + memcpy((char *)(deh + 1), name, namelen);
19748 + /* pad with 0s to the 4 byte boundary */
19749 + padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen);
19751 + /*
19752 + * entry is ready to be pasted into tree, set 'visibility'
19753 + * and 'stat data in entry' attributes
19754 + */
19755 + mark_de_without_sd(deh);
19756 + visible ? mark_de_visible(deh) : mark_de_hidden(deh);
19758 + /* find the proper place for the new entry */
19759 + memset(bit_string, 0, sizeof(bit_string));
19760 + de.de_gen_number_bit_string = bit_string;
19761 + retval = reiserfs_find_entry(dir, name, namelen, &path, &de);
19762 + if (retval != NAME_NOT_FOUND) {
19763 + if (buffer != small_buf)
19764 + kfree(buffer);
19765 + pathrelse(&path);
19767 + if (retval == IO_ERROR) {
19768 + return -EIO;
19771 + if (retval != NAME_FOUND) {
19772 + reiserfs_error(dir->i_sb, "zam-7002",
19773 + "reiserfs_find_entry() returned "
19774 + "unexpected value (%d)", retval);
19777 + return -EEXIST;
19780 + gen_number =
19781 + find_first_zero_bit(bit_string,
19782 + MAX_GENERATION_NUMBER + 1);
19783 + if (gen_number > MAX_GENERATION_NUMBER) {
19784 + /* there is no free generation number */
19785 + reiserfs_warning(dir->i_sb, "reiserfs-7010",
19786 + "Congratulations! we have got hash function "
19787 + "screwed up");
19788 + if (buffer != small_buf)
19789 + kfree(buffer);
19790 + pathrelse(&path);
19791 + return -EBUSY;
19793 + /* adjust the offset of the directory entry */
19794 + put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number));
19795 + set_cpu_key_k_offset(&entry_key, deh_offset(deh));
19797 + /* update max-hash-collisions counter in reiserfs_sb_info */
19798 + PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number);
19800 + /* we need to re-search for the insertion point */
19801 + if (gen_number != 0) {
19802 + if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) !=
19803 + NAME_NOT_FOUND) {
19804 + reiserfs_warning(dir->i_sb, "vs-7032",
19805 + "entry with this key (%K) already "
19806 + "exists", &entry_key);
19808 + if (buffer != small_buf)
19809 + kfree(buffer);
19810 + pathrelse(&path);
19811 + return -EBUSY;
19815 + /* perform the insertion of the entry that we have prepared */
19816 + retval =
19817 + reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer,
19818 + paste_size);
19819 + if (buffer != small_buf)
19820 + kfree(buffer);
19821 + if (retval) {
19822 + reiserfs_check_path(&path);
19823 + return retval;
19826 + dir->i_size += paste_size;
19827 + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
19828 + if (!S_ISDIR(inode->i_mode) && visible)
19829 + /* reiserfs_mkdir or reiserfs_rename will do that by itself */
19830 + reiserfs_update_sd(th, dir);
19832 + reiserfs_check_path(&path);
19833 + return 0;
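
A standalone sketch of the entry buffer that reiserfs_add_entry() composes: an entry head followed by the name, zero-padded to a 4-byte boundary. DEH_SIZE and ROUND_UP here are assumed stand-ins matching the padd_item() call above.

#include <stdio.h>
#include <string.h>

#define DEH_SIZE 16                     /* assumed on-disk entry-head size */
#define ROUND_UP(n) (((n) + 3) & ~3)

int main(void)
{
        const char *name = "hello";             /* namelen 5 -> padded to 8 */
        int namelen = (int)strlen(name);
        char buf[DEH_SIZE + ROUND_UP(5)];

        memset(buf, 0, sizeof(buf));            /* zeroed head + padding */
        memcpy(buf + DEH_SIZE, name, namelen);  /* name follows the head */
        printf("buflen=%zu\n", sizeof(buf));    /* 16 + 8 = 24 */
        return 0;
}
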
19837 + * quota utility function, call if you've had to abort after calling
19838 + * new_inode_init, and have not called reiserfs_new_inode yet.
19839 + * This should only be called on inodes that do not have stat data
19840 + * inserted into the tree yet.
19841 + */
19842 +static int drop_new_inode(struct inode *inode)
19844 + dquot_drop(inode);
19845 + make_bad_inode(inode);
19846 + inode->i_flags |= S_NOQUOTA;
19847 + iput(inode);
19848 + return 0;
19852 + * utility function that does setup for reiserfs_new_inode.
19853 + * dquot_initialize needs lots of credits so it's better to have it
19854 + * outside of a transaction, so we had to pull some bits of
19855 + * reiserfs_new_inode out into this func.
19856 + */
19857 +static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
19859 + /*
19860 + * Make inode invalid - just in case we are going to drop it before
19861 + * the initialization happens
19862 + */
19863 + INODE_PKEY(inode)->k_objectid = 0;
19865 + /*
19866 + * the quota init calls have to know who to charge the quota to, so
19867 + * we have to set uid and gid here
19868 + */
19869 + inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
19870 + return dquot_initialize(inode);
19873 +static int reiserfs_create(struct mnt_idmap *idmap, struct inode *dir,
19874 + struct dentry *dentry, umode_t mode, bool excl)
19876 + int retval;
19877 + struct inode *inode;
19878 + /*
19879 + * We need blocks for transaction + (user+group)*(quotas
19880 + * for new inode + update of quota for directory owner)
19881 + */
19882 + int jbegin_count =
19883 + JOURNAL_PER_BALANCE_CNT * 2 +
19884 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
19885 + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
19886 + struct reiserfs_transaction_handle th;
19887 + struct reiserfs_security_handle security;
19889 + retval = dquot_initialize(dir);
19890 + if (retval)
19891 + return retval;
19893 + if (!(inode = new_inode(dir->i_sb))) {
19894 + return -ENOMEM;
19896 + retval = new_inode_init(inode, dir, mode);
19897 + if (retval) {
19898 + drop_new_inode(inode);
19899 + return retval;
19902 + jbegin_count += reiserfs_cache_default_acl(dir);
19903 + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
19904 + if (retval < 0) {
19905 + drop_new_inode(inode);
19906 + return retval;
19908 + jbegin_count += retval;
19909 + reiserfs_write_lock(dir->i_sb);
19911 + retval = journal_begin(&th, dir->i_sb, jbegin_count);
19912 + if (retval) {
19913 + drop_new_inode(inode);
19914 + goto out_failed;
19917 + retval =
19918 + reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
19919 + inode, &security);
19920 + if (retval)
19921 + goto out_failed;
19923 + inode->i_op = &reiserfs_file_inode_operations;
19924 + inode->i_fop = &reiserfs_file_operations;
19925 + inode->i_mapping->a_ops = &reiserfs_address_space_operations;
19927 + retval =
19928 + reiserfs_add_entry(&th, dir, dentry->d_name.name,
19929 + dentry->d_name.len, inode, 1 /*visible */ );
19930 + if (retval) {
19931 + int err;
19932 + drop_nlink(inode);
19933 + reiserfs_update_sd(&th, inode);
19934 + err = journal_end(&th);
19935 + if (err)
19936 + retval = err;
19937 + unlock_new_inode(inode);
19938 + iput(inode);
19939 + goto out_failed;
19941 + reiserfs_update_inode_transaction(inode);
19942 + reiserfs_update_inode_transaction(dir);
19944 + d_instantiate_new(dentry, inode);
19945 + retval = journal_end(&th);
19947 +out_failed:
19948 + reiserfs_write_unlock(dir->i_sb);
19949 + reiserfs_security_free(&security);
19950 + return retval;
19953 +static int reiserfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
19954 + struct dentry *dentry, umode_t mode, dev_t rdev)
19956 + int retval;
19957 + struct inode *inode;
19958 + struct reiserfs_transaction_handle th;
19959 + struct reiserfs_security_handle security;
19960 + /*
19961 + * We need blocks for transaction + (user+group)*(quotas
19962 + * for new inode + update of quota for directory owner)
19963 + */
19964 + int jbegin_count =
19965 + JOURNAL_PER_BALANCE_CNT * 3 +
19966 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
19967 + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
19969 + retval = dquot_initialize(dir);
19970 + if (retval)
19971 + return retval;
19973 + if (!(inode = new_inode(dir->i_sb))) {
19974 + return -ENOMEM;
19976 + retval = new_inode_init(inode, dir, mode);
19977 + if (retval) {
19978 + drop_new_inode(inode);
19979 + return retval;
19982 + jbegin_count += reiserfs_cache_default_acl(dir);
19983 + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
19984 + if (retval < 0) {
19985 + drop_new_inode(inode);
19986 + return retval;
19988 + jbegin_count += retval;
19989 + reiserfs_write_lock(dir->i_sb);
19991 + retval = journal_begin(&th, dir->i_sb, jbegin_count);
19992 + if (retval) {
19993 + drop_new_inode(inode);
19994 + goto out_failed;
19997 + retval =
19998 + reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
19999 + inode, &security);
20000 + if (retval) {
20001 + goto out_failed;
20004 + inode->i_op = &reiserfs_special_inode_operations;
20005 + init_special_inode(inode, inode->i_mode, rdev);
20007 + /* FIXME: needed for block and char devices only */
20008 + reiserfs_update_sd(&th, inode);
20010 + reiserfs_update_inode_transaction(inode);
20011 + reiserfs_update_inode_transaction(dir);
20013 + retval =
20014 + reiserfs_add_entry(&th, dir, dentry->d_name.name,
20015 + dentry->d_name.len, inode, 1 /*visible */ );
20016 + if (retval) {
20017 + int err;
20018 + drop_nlink(inode);
20019 + reiserfs_update_sd(&th, inode);
20020 + err = journal_end(&th);
20021 + if (err)
20022 + retval = err;
20023 + unlock_new_inode(inode);
20024 + iput(inode);
20025 + goto out_failed;
20028 + d_instantiate_new(dentry, inode);
20029 + retval = journal_end(&th);
20031 +out_failed:
20032 + reiserfs_write_unlock(dir->i_sb);
20033 + reiserfs_security_free(&security);
20034 + return retval;
20037 +static int reiserfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
20038 + struct dentry *dentry, umode_t mode)
20040 + int retval;
20041 + struct inode *inode;
20042 + struct reiserfs_transaction_handle th;
20043 + struct reiserfs_security_handle security;
20044 + /*
20045 + * We need blocks for transaction + (user+group)*(quotas
20046 + * for new inode + update of quota for directory owner)
20047 + */
20048 + int jbegin_count =
20049 + JOURNAL_PER_BALANCE_CNT * 3 +
20050 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
20051 + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
20053 + retval = dquot_initialize(dir);
20054 + if (retval)
20055 + return retval;
20057 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
20058 + /*
20059 + * set flag that new packing locality created and new blocks
20060 + * for the content of that directory are not displaced yet
20061 + */
20062 + REISERFS_I(dir)->new_packing_locality = 1;
20063 +#endif
20064 + mode = S_IFDIR | mode;
20065 + if (!(inode = new_inode(dir->i_sb))) {
20066 + return -ENOMEM;
20068 + retval = new_inode_init(inode, dir, mode);
20069 + if (retval) {
20070 + drop_new_inode(inode);
20071 + return retval;
20074 + jbegin_count += reiserfs_cache_default_acl(dir);
20075 + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
20076 + if (retval < 0) {
20077 + drop_new_inode(inode);
20078 + return retval;
20080 + jbegin_count += retval;
20081 + reiserfs_write_lock(dir->i_sb);
20083 + retval = journal_begin(&th, dir->i_sb, jbegin_count);
20084 + if (retval) {
20085 + drop_new_inode(inode);
20086 + goto out_failed;
20089 + /*
20090 + * inc the link count now, so another writer doesn't overflow
20091 + * it while we sleep later on.
20092 + */
20093 + INC_DIR_INODE_NLINK(dir)
20095 + retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */,
20096 + old_format_only(dir->i_sb) ?
20097 + EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
20098 + dentry, inode, &security);
20099 + if (retval) {
20100 + DEC_DIR_INODE_NLINK(dir)
20101 + goto out_failed;
20104 + reiserfs_update_inode_transaction(inode);
20105 + reiserfs_update_inode_transaction(dir);
20107 + inode->i_op = &reiserfs_dir_inode_operations;
20108 + inode->i_fop = &reiserfs_dir_operations;
20110 + /* note, _this_ add_entry will not update dir's stat data */
20111 + retval =
20112 + reiserfs_add_entry(&th, dir, dentry->d_name.name,
20113 + dentry->d_name.len, inode, 1 /*visible */ );
20114 + if (retval) {
20115 + int err;
20116 + clear_nlink(inode);
20117 + DEC_DIR_INODE_NLINK(dir);
20118 + reiserfs_update_sd(&th, inode);
20119 + err = journal_end(&th);
20120 + if (err)
20121 + retval = err;
20122 + unlock_new_inode(inode);
20123 + iput(inode);
20124 + goto out_failed;
20126 + /* the above add_entry did not update dir's stat data */
20127 + reiserfs_update_sd(&th, dir);
20129 + d_instantiate_new(dentry, inode);
20130 + retval = journal_end(&th);
20131 +out_failed:
20132 + reiserfs_write_unlock(dir->i_sb);
20133 + reiserfs_security_free(&security);
20134 + return retval;
20137 +static inline int reiserfs_empty_dir(struct inode *inode)
20139 + /*
20140 + * we can cheat because an old format dir cannot have
20141 + * EMPTY_DIR_SIZE, and a new format dir cannot have
20142 + * EMPTY_DIR_SIZE_V1. So, if the inode is either size,
20143 + * regardless of disk format version, the directory is empty.
20144 + */
20145 + if (inode->i_size != EMPTY_DIR_SIZE &&
20146 + inode->i_size != EMPTY_DIR_SIZE_V1) {
20147 + return 0;
20149 + return 1;
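The cheat works because the two on-disk formats pad the "." and ".." names differently, so the two "empty" sizes can never coincide. Roughly (the real constants live in reiserfs.h; the values below are assumptions for illustration only):

/* illustrative only -- check EMPTY_DIR_SIZE{,_V1} in reiserfs.h */
#define DEH_SIZE_ASSUMED 16                     /* per-entry header, assumed */
#define EMPTY_V1 (2 * DEH_SIZE_ASSUMED + 1 + 2) /* "." and ".." unpadded: 35 */
#define EMPTY_V2 (2 * DEH_SIZE_ASSUMED + 8 + 8) /* names rounded up to 8: 48 */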
20152 +static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
20154 + int retval, err;
20155 + struct inode *inode;
20156 + struct reiserfs_transaction_handle th;
20157 + int jbegin_count;
20158 + INITIALIZE_PATH(path);
20159 + struct reiserfs_dir_entry de;
20161 + /*
20162 + * we will be doing 2 balancings and update 2 stat data, we
20163 + * change quotas of the owner of the directory and of the owner
20164 + * of the parent directory. The quota structure is possibly
20165 + * deleted only on last iput => outside of this transaction
20166 + */
20167 + jbegin_count =
20168 + JOURNAL_PER_BALANCE_CNT * 2 + 2 +
20169 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
20171 + retval = dquot_initialize(dir);
20172 + if (retval)
20173 + return retval;
20175 + reiserfs_write_lock(dir->i_sb);
20176 + retval = journal_begin(&th, dir->i_sb, jbegin_count);
20177 + if (retval)
20178 + goto out_rmdir;
20180 + de.de_gen_number_bit_string = NULL;
20181 + if ((retval =
20182 + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
20183 + &path, &de)) == NAME_NOT_FOUND) {
20184 + retval = -ENOENT;
20185 + goto end_rmdir;
20186 + } else if (retval == IO_ERROR) {
20187 + retval = -EIO;
20188 + goto end_rmdir;
20191 + inode = d_inode(dentry);
20193 + reiserfs_update_inode_transaction(inode);
20194 + reiserfs_update_inode_transaction(dir);
20196 + if (de.de_objectid != inode->i_ino) {
20197 + /*
20198 + * FIXME: compare key of an object and a key found in the entry
20199 + */
20200 + retval = -EIO;
20201 + goto end_rmdir;
20203 + if (!reiserfs_empty_dir(inode)) {
20204 + retval = -ENOTEMPTY;
20205 + goto end_rmdir;
20208 + /* cut entry from dir directory */
20209 + retval = reiserfs_cut_from_item(&th, &path, &de.de_entry_key,
20210 + dir, NULL, /* page */
20211 + 0 /*new file size - not used here */ );
20212 + if (retval < 0)
20213 + goto end_rmdir;
20215 + if (inode->i_nlink != 2 && inode->i_nlink != 1)
20216 + reiserfs_error(inode->i_sb, "reiserfs-7040",
20217 + "empty directory has nlink != 2 (%d)",
20218 + inode->i_nlink);
20220 + clear_nlink(inode);
20221 + inode_set_mtime_to_ts(dir,
20222 + inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
20223 + reiserfs_update_sd(&th, inode);
20225 + DEC_DIR_INODE_NLINK(dir)
20226 + dir->i_size -= (DEH_SIZE + de.de_entrylen);
20227 + reiserfs_update_sd(&th, dir);
20229 + /* prevent empty directory from getting lost */
20230 + add_save_link(&th, inode, 0 /* not truncate */ );
20232 + retval = journal_end(&th);
20233 + reiserfs_check_path(&path);
20234 +out_rmdir:
20235 + reiserfs_write_unlock(dir->i_sb);
20236 + return retval;
20238 +end_rmdir:
20239 + /*
20240 + * we must release path, because we did not call
20241 + * reiserfs_cut_from_item, or reiserfs_cut_from_item does not
20242 + * release path if operation was not complete
20243 + */
20244 + pathrelse(&path);
20245 + err = journal_end(&th);
20246 + reiserfs_write_unlock(dir->i_sb);
20247 + return err ? err : retval;
20250 +static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
20252 + int retval, err;
20253 + struct inode *inode;
20254 + struct reiserfs_dir_entry de;
20255 + INITIALIZE_PATH(path);
20256 + struct reiserfs_transaction_handle th;
20257 + int jbegin_count;
20258 + unsigned long savelink;
20260 + retval = dquot_initialize(dir);
20261 + if (retval)
20262 + return retval;
20264 + inode = d_inode(dentry);
20266 + /*
20267 + * in this transaction we can be doing at max two balancings and
20268 + * update two stat datas, we change quotas of the owner of the
20269 + * directory and of the owner of the parent directory. The quota
20270 + * structure is possibly deleted only on iput => outside of
20271 + * this transaction
20272 + */
20273 + jbegin_count =
20274 + JOURNAL_PER_BALANCE_CNT * 2 + 2 +
20275 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
20277 + reiserfs_write_lock(dir->i_sb);
20278 + retval = journal_begin(&th, dir->i_sb, jbegin_count);
20279 + if (retval)
20280 + goto out_unlink;
20282 + de.de_gen_number_bit_string = NULL;
20283 + if ((retval =
20284 + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
20285 + &path, &de)) == NAME_NOT_FOUND) {
20286 + retval = -ENOENT;
20287 + goto end_unlink;
20288 + } else if (retval == IO_ERROR) {
20289 + retval = -EIO;
20290 + goto end_unlink;
20293 + reiserfs_update_inode_transaction(inode);
20294 + reiserfs_update_inode_transaction(dir);
20296 + if (de.de_objectid != inode->i_ino) {
20297 + /*
20298 + * FIXME: compare key of an object and a key found in the entry
20299 + */
20300 + retval = -EIO;
20301 + goto end_unlink;
20304 + if (!inode->i_nlink) {
20305 + reiserfs_warning(inode->i_sb, "reiserfs-7042",
20306 + "deleting nonexistent file (%lu), %d",
20307 + inode->i_ino, inode->i_nlink);
20308 + set_nlink(inode, 1);
20311 + drop_nlink(inode);
20313 + /*
20314 + * we schedule before doing the add_save_link call, save the link
20315 + * count so we don't race
20316 + */
20317 + savelink = inode->i_nlink;
20319 + retval =
20320 + reiserfs_cut_from_item(&th, &path, &de.de_entry_key, dir, NULL,
20321 + 0);
20322 + if (retval < 0) {
20323 + inc_nlink(inode);
20324 + goto end_unlink;
20326 + inode_set_ctime_current(inode);
20327 + reiserfs_update_sd(&th, inode);
20329 + dir->i_size -= (de.de_entrylen + DEH_SIZE);
20330 + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
20331 + reiserfs_update_sd(&th, dir);
20333 + if (!savelink)
20334 + /* prevent file from getting lost */
20335 + add_save_link(&th, inode, 0 /* not truncate */ );
20337 + retval = journal_end(&th);
20338 + reiserfs_check_path(&path);
20339 + reiserfs_write_unlock(dir->i_sb);
20340 + return retval;
20342 +end_unlink:
20343 + pathrelse(&path);
20344 + err = journal_end(&th);
20345 + reiserfs_check_path(&path);
20346 + if (err)
20347 + retval = err;
20348 +out_unlink:
20349 + reiserfs_write_unlock(dir->i_sb);
20350 + return retval;
20353 +static int reiserfs_symlink(struct mnt_idmap *idmap,
20354 + struct inode *parent_dir, struct dentry *dentry,
20355 + const char *symname)
20357 + int retval;
20358 + struct inode *inode;
20359 + char *name;
20360 + int item_len;
20361 + struct reiserfs_transaction_handle th;
20362 + struct reiserfs_security_handle security;
20363 + int mode = S_IFLNK | S_IRWXUGO;
20364 + /*
20365 + * We need blocks for transaction + (user+group)*(quotas for
20366 + * new inode + update of quota for directory owner)
20367 + */
20368 + int jbegin_count =
20369 + JOURNAL_PER_BALANCE_CNT * 3 +
20370 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
20371 + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
20373 + retval = dquot_initialize(parent_dir);
20374 + if (retval)
20375 + return retval;
20377 + if (!(inode = new_inode(parent_dir->i_sb))) {
20378 + return -ENOMEM;
20380 + retval = new_inode_init(inode, parent_dir, mode);
20381 + if (retval) {
20382 + drop_new_inode(inode);
20383 + return retval;
20386 + retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
20387 + &security);
20388 + if (retval < 0) {
20389 + drop_new_inode(inode);
20390 + return retval;
20392 + jbegin_count += retval;
20394 + reiserfs_write_lock(parent_dir->i_sb);
20395 + item_len = ROUND_UP(strlen(symname));
20396 + if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) {
20397 + retval = -ENAMETOOLONG;
20398 + drop_new_inode(inode);
20399 + goto out_failed;
20402 + name = kmalloc(item_len, GFP_NOFS);
20403 + if (!name) {
20404 + drop_new_inode(inode);
20405 + retval = -ENOMEM;
20406 + goto out_failed;
20408 + memcpy(name, symname, strlen(symname));
20409 + padd_item(name, item_len, strlen(symname));
20411 + retval = journal_begin(&th, parent_dir->i_sb, jbegin_count);
20412 + if (retval) {
20413 + drop_new_inode(inode);
20414 + kfree(name);
20415 + goto out_failed;
20418 + retval =
20419 + reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname),
20420 + dentry, inode, &security);
20421 + kfree(name);
20422 + if (retval) { /* reiserfs_new_inode iputs for us */
20423 + goto out_failed;
20426 + reiserfs_update_inode_transaction(inode);
20427 + reiserfs_update_inode_transaction(parent_dir);
20429 + inode->i_op = &reiserfs_symlink_inode_operations;
20430 + inode_nohighmem(inode);
20431 + inode->i_mapping->a_ops = &reiserfs_address_space_operations;
20433 + retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
20434 + dentry->d_name.len, inode, 1 /*visible */ );
20435 + if (retval) {
20436 + int err;
20437 + drop_nlink(inode);
20438 + reiserfs_update_sd(&th, inode);
20439 + err = journal_end(&th);
20440 + if (err)
20441 + retval = err;
20442 + unlock_new_inode(inode);
20443 + iput(inode);
20444 + goto out_failed;
20447 + d_instantiate_new(dentry, inode);
20448 + retval = journal_end(&th);
20449 +out_failed:
20450 + reiserfs_write_unlock(parent_dir->i_sb);
20451 + reiserfs_security_free(&security);
20452 + return retval;
20455 +static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
20456 + struct dentry *dentry)
20458 + int retval;
20459 + struct inode *inode = d_inode(old_dentry);
20460 + struct reiserfs_transaction_handle th;
20461 + /*
20462 + * We need blocks for transaction + update of quotas for
20463 + * the owners of the directory
20464 + */
20465 + int jbegin_count =
20466 + JOURNAL_PER_BALANCE_CNT * 3 +
20467 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
20469 + retval = dquot_initialize(dir);
20470 + if (retval)
20471 + return retval;
20473 + reiserfs_write_lock(dir->i_sb);
20474 + if (inode->i_nlink >= REISERFS_LINK_MAX) {
20475 + /* FIXME: sd_nlink is 32 bit for new files */
20476 + reiserfs_write_unlock(dir->i_sb);
20477 + return -EMLINK;
20480 + /* inc before scheduling so reiserfs_unlink knows we are here */
20481 + inc_nlink(inode);
20483 + retval = journal_begin(&th, dir->i_sb, jbegin_count);
20484 + if (retval) {
20485 + drop_nlink(inode);
20486 + reiserfs_write_unlock(dir->i_sb);
20487 + return retval;
20490 + /* create new entry */
20491 + retval =
20492 + reiserfs_add_entry(&th, dir, dentry->d_name.name,
20493 + dentry->d_name.len, inode, 1 /*visible */ );
20495 + reiserfs_update_inode_transaction(inode);
20496 + reiserfs_update_inode_transaction(dir);
20498 + if (retval) {
20499 + int err;
20500 + drop_nlink(inode);
20501 + err = journal_end(&th);
20502 + reiserfs_write_unlock(dir->i_sb);
20503 + return err ? err : retval;
20506 + inode_set_ctime_current(inode);
20507 + reiserfs_update_sd(&th, inode);
20509 + ihold(inode);
20510 + d_instantiate(dentry, inode);
20511 + retval = journal_end(&th);
20512 + reiserfs_write_unlock(dir->i_sb);
20513 + return retval;
20516 +/* de contains information pointing to an entry which may have been moved or reused */
20517 +static int de_still_valid(const char *name, int len,
20518 + struct reiserfs_dir_entry *de)
20520 + struct reiserfs_dir_entry tmp = *de;
20522 + /* recalculate pointer to name and name length */
20523 + set_de_name_and_namelen(&tmp);
20524 + /* FIXME: could check more */
20525 + if (tmp.de_namelen != len || memcmp(name, de->de_name, len))
20526 + return 0;
20527 + return 1;
20530 +static int entry_points_to_object(const char *name, int len,
20531 + struct reiserfs_dir_entry *de,
20532 + struct inode *inode)
20534 + if (!de_still_valid(name, len, de))
20535 + return 0;
20537 + if (inode) {
20538 + if (!de_visible(de->de_deh + de->de_entry_num))
20539 + reiserfs_panic(inode->i_sb, "vs-7042",
20540 + "entry must be visible");
20541 + return (de->de_objectid == inode->i_ino) ? 1 : 0;
20544 + /* this must be the just-added hidden entry */
20545 + if (de_visible(de->de_deh + de->de_entry_num))
20546 + reiserfs_panic(NULL, "vs-7043", "entry must be hidden");
20548 + return 1;
20551 +/* sets key of objectid the entry has to point to */
20552 +static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de,
20553 + struct reiserfs_key *key)
20555 + /* JDM These operations are endian safe - both are le */
20556 + de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id;
20557 + de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid;
20561 + * process, that is going to call fix_nodes/do_balance must hold only
20562 + * one path. If it holds 2 or more, it can get into endless waiting in
20563 + * get_empty_nodes or its clones
20564 + */
20565 +static int reiserfs_rename(struct mnt_idmap *idmap,
20566 + struct inode *old_dir, struct dentry *old_dentry,
20567 + struct inode *new_dir, struct dentry *new_dentry,
20568 + unsigned int flags)
20570 + int retval;
20571 + INITIALIZE_PATH(old_entry_path);
20572 + INITIALIZE_PATH(new_entry_path);
20573 + INITIALIZE_PATH(dot_dot_entry_path);
20574 + struct item_head new_entry_ih, old_entry_ih, dot_dot_ih;
20575 + struct reiserfs_dir_entry old_de, new_de, dot_dot_de;
20576 + struct inode *old_inode, *new_dentry_inode;
20577 + struct reiserfs_transaction_handle th;
20578 + int jbegin_count;
20579 + unsigned long savelink = 1;
20580 + bool update_dir_parent = false;
20582 + if (flags & ~RENAME_NOREPLACE)
20583 + return -EINVAL;
20585 + /*
20586 + * three balancings: (1) old name removal, (2) new name insertion
20587 + * and (3) maybe "save" link insertion
20588 + * stat data updates: (1) old directory,
20589 + * (2) new directory and (3) maybe old object stat data (when it is
20590 + * directory) and (4) maybe stat data of object to which new entry
20591 + * pointed initially and (5) maybe block containing ".." of
20592 + * renamed directory
20593 + * quota updates: two parent directories
20594 + */
20595 + jbegin_count =
20596 + JOURNAL_PER_BALANCE_CNT * 3 + 5 +
20597 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
20599 + retval = dquot_initialize(old_dir);
20600 + if (retval)
20601 + return retval;
20602 + retval = dquot_initialize(new_dir);
20603 + if (retval)
20604 + return retval;
20606 + old_inode = d_inode(old_dentry);
20607 + new_dentry_inode = d_inode(new_dentry);
20609 + /*
20610 + * make sure that oldname still exists and points to an object we
20611 + * are going to rename
20612 + */
20613 + old_de.de_gen_number_bit_string = NULL;
20614 + reiserfs_write_lock(old_dir->i_sb);
20615 + retval =
20616 + reiserfs_find_entry(old_dir, old_dentry->d_name.name,
20617 + old_dentry->d_name.len, &old_entry_path,
20618 + &old_de);
20619 + pathrelse(&old_entry_path);
20620 + if (retval == IO_ERROR) {
20621 + reiserfs_write_unlock(old_dir->i_sb);
20622 + return -EIO;
20625 + if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) {
20626 + reiserfs_write_unlock(old_dir->i_sb);
20627 + return -ENOENT;
20630 + if (S_ISDIR(old_inode->i_mode)) {
20631 + /*
20632 + * make sure that directory being renamed has correct ".."
20633 + * and that its new parent directory has not too many links
20634 + * already
20635 + */
20636 + if (new_dentry_inode) {
20637 + if (!reiserfs_empty_dir(new_dentry_inode)) {
20638 + reiserfs_write_unlock(old_dir->i_sb);
20639 + return -ENOTEMPTY;
20643 + if (old_dir != new_dir) {
20644 + /*
20645 + * directory is renamed, its parent directory will be
20646 + * changed, so find ".." entry
20647 + */
20648 + dot_dot_de.de_gen_number_bit_string = NULL;
20649 + retval =
20650 + reiserfs_find_entry(old_inode, "..", 2,
20651 + &dot_dot_entry_path,
20652 + &dot_dot_de);
20653 + pathrelse(&dot_dot_entry_path);
20654 + if (retval != NAME_FOUND) {
20655 + reiserfs_write_unlock(old_dir->i_sb);
20656 + return -EIO;
20659 + /* inode number of .. must equal old_dir->i_ino */
20660 + if (dot_dot_de.de_objectid != old_dir->i_ino) {
20661 + reiserfs_write_unlock(old_dir->i_sb);
20662 + return -EIO;
20664 + update_dir_parent = true;
20668 + retval = journal_begin(&th, old_dir->i_sb, jbegin_count);
20669 + if (retval) {
20670 + reiserfs_write_unlock(old_dir->i_sb);
20671 + return retval;
20674 + /* add new entry (or find the existing one) */
20675 + retval =
20676 + reiserfs_add_entry(&th, new_dir, new_dentry->d_name.name,
20677 + new_dentry->d_name.len, old_inode, 0);
20678 + if (retval == -EEXIST) {
20679 + if (!new_dentry_inode) {
20680 + reiserfs_panic(old_dir->i_sb, "vs-7050",
20681 + "new entry is found, new inode == 0");
20683 + } else if (retval) {
20684 + int err = journal_end(&th);
20685 + reiserfs_write_unlock(old_dir->i_sb);
20686 + return err ? err : retval;
20689 + reiserfs_update_inode_transaction(old_dir);
20690 + reiserfs_update_inode_transaction(new_dir);
20692 + /*
20693 + * this makes it so an fsync on an open fd for the old name will
20694 + * commit the rename operation
20695 + */
20696 + reiserfs_update_inode_transaction(old_inode);
20698 + if (new_dentry_inode)
20699 + reiserfs_update_inode_transaction(new_dentry_inode);
20701 + while (1) {
20702 + /*
20703 + * look for old name using corresponding entry key
20704 + * (found by reiserfs_find_entry)
20705 + */
20706 + if ((retval =
20707 + search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key,
20708 + &old_entry_path,
20709 + &old_de)) != NAME_FOUND) {
20710 + pathrelse(&old_entry_path);
20711 + journal_end(&th);
20712 + reiserfs_write_unlock(old_dir->i_sb);
20713 + return -EIO;
20716 + copy_item_head(&old_entry_ih, tp_item_head(&old_entry_path));
20718 + reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1);
20720 + /* look for new name by reiserfs_find_entry */
20721 + new_de.de_gen_number_bit_string = NULL;
20722 + retval =
20723 + reiserfs_find_entry(new_dir, new_dentry->d_name.name,
20724 + new_dentry->d_name.len, &new_entry_path,
20725 + &new_de);
20726 + /*
20727 + * reiserfs_find_entry should not return IO_ERROR here,
20728 + * because it was called with essentially the same parameters from
20729 + * reiserfs_add_entry above, and we'll catch any i/o errors
20730 + * before we get here.
20731 + */
20732 + if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) {
20733 + pathrelse(&new_entry_path);
20734 + pathrelse(&old_entry_path);
20735 + journal_end(&th);
20736 + reiserfs_write_unlock(old_dir->i_sb);
20737 + return -EIO;
20740 + copy_item_head(&new_entry_ih, tp_item_head(&new_entry_path));
20742 + reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1);
20744 + if (update_dir_parent) {
20745 + if ((retval =
20746 + search_by_entry_key(new_dir->i_sb,
20747 + &dot_dot_de.de_entry_key,
20748 + &dot_dot_entry_path,
20749 + &dot_dot_de)) != NAME_FOUND) {
20750 + pathrelse(&dot_dot_entry_path);
20751 + pathrelse(&new_entry_path);
20752 + pathrelse(&old_entry_path);
20753 + journal_end(&th);
20754 + reiserfs_write_unlock(old_dir->i_sb);
20755 + return -EIO;
20757 + copy_item_head(&dot_dot_ih,
20758 + tp_item_head(&dot_dot_entry_path));
20759 + /* node containing ".." gets into transaction */
20760 + reiserfs_prepare_for_journal(old_inode->i_sb,
20761 + dot_dot_de.de_bh, 1);
20763 + /*
20764 + * we should check seals here, not do
20765 + * this stuff, yes? Then, having
20766 + * gathered everything into RAM we
20767 + * should lock the buffers, yes? -Hans
20768 + */
20769 + /*
20770 + * probably. our rename needs to hold more
20771 + * than one path at once. The seals would
20772 + * have to be written to deal with multi-path
20773 + * issues -chris
20774 + */
20775 + /*
20776 + * sanity checking before doing the rename - avoid races that many
20777 + * of the above checks could have opened by scheduling. We have to be
20778 + * sure our items haven't been shifted by another process.
20779 + */
20780 + if (item_moved(&new_entry_ih, &new_entry_path) ||
20781 + !entry_points_to_object(new_dentry->d_name.name,
20782 + new_dentry->d_name.len,
20783 + &new_de, new_dentry_inode) ||
20784 + item_moved(&old_entry_ih, &old_entry_path) ||
20785 + !entry_points_to_object(old_dentry->d_name.name,
20786 + old_dentry->d_name.len,
20787 + &old_de, old_inode)) {
20788 + reiserfs_restore_prepared_buffer(old_inode->i_sb,
20789 + new_de.de_bh);
20790 + reiserfs_restore_prepared_buffer(old_inode->i_sb,
20791 + old_de.de_bh);
20792 + if (update_dir_parent)
20793 + reiserfs_restore_prepared_buffer(old_inode->
20794 + i_sb,
20795 + dot_dot_de.
20796 + de_bh);
20797 + continue;
20799 + if (update_dir_parent) {
20800 + if (item_moved(&dot_dot_ih, &dot_dot_entry_path) ||
20801 + !entry_points_to_object("..", 2, &dot_dot_de,
20802 + old_dir)) {
20803 + reiserfs_restore_prepared_buffer(old_inode->
20804 + i_sb,
20805 + old_de.de_bh);
20806 + reiserfs_restore_prepared_buffer(old_inode->
20807 + i_sb,
20808 + new_de.de_bh);
20809 + reiserfs_restore_prepared_buffer(old_inode->
20810 + i_sb,
20811 + dot_dot_de.
20812 + de_bh);
20813 + continue;
20817 + RFALSE(update_dir_parent &&
20818 + !buffer_journal_prepared(dot_dot_de.de_bh), "");
20820 + break;
20823 + /*
20824 + * ok, all the changes can be done in one fell swoop when we
20825 + * have claimed all the buffers needed.
20826 + */
20828 + mark_de_visible(new_de.de_deh + new_de.de_entry_num);
20829 + set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode));
20830 + journal_mark_dirty(&th, new_de.de_bh);
20832 + mark_de_hidden(old_de.de_deh + old_de.de_entry_num);
20833 + journal_mark_dirty(&th, old_de.de_bh);
20834 + /*
20835 + * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch
20836 + * which adds ctime update of renamed object
20837 + */
20838 + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
20840 + if (new_dentry_inode) {
20841 + /* adjust link number of the victim */
20842 + if (S_ISDIR(new_dentry_inode->i_mode)) {
20843 + clear_nlink(new_dentry_inode);
20844 + } else {
20845 + drop_nlink(new_dentry_inode);
20847 + savelink = new_dentry_inode->i_nlink;
20850 + if (update_dir_parent) {
20851 + /* adjust ".." of renamed directory */
20852 + set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir));
20853 + journal_mark_dirty(&th, dot_dot_de.de_bh);
20855 + if (S_ISDIR(old_inode->i_mode)) {
20856 + /*
20857 + * there (in new_dir) was no directory, so it got new link
20858 + * (".." of renamed directory)
20859 + */
20860 + if (!new_dentry_inode)
20861 + INC_DIR_INODE_NLINK(new_dir);
20863 + /* old directory lost one link - ".." of renamed directory */
20864 + DEC_DIR_INODE_NLINK(old_dir);
20866 + /*
20867 + * looks like in 2.3.99pre3 brelse is atomic,
20868 + * so we can use pathrelse
20869 + */
20870 + pathrelse(&new_entry_path);
20871 + pathrelse(&dot_dot_entry_path);
20873 + /*
20874 + * FIXME: this reiserfs_cut_from_item's return value may confuse
20875 + * callers, but it will panic if it is not able to find the
20876 + * entry. This needs one more cleanup.
20877 + */
20878 + if (reiserfs_cut_from_item
20879 + (&th, &old_entry_path, &old_de.de_entry_key, old_dir, NULL,
20880 + 0) < 0)
20881 + reiserfs_error(old_dir->i_sb, "vs-7060",
20882 + "couldn't not cut old name. Fsck later?");
20884 + old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
20886 + reiserfs_update_sd(&th, old_dir);
20887 + reiserfs_update_sd(&th, new_dir);
20888 + reiserfs_update_sd(&th, old_inode);
20890 + if (new_dentry_inode) {
20891 + if (savelink == 0)
20892 + add_save_link(&th, new_dentry_inode,
20893 + 0 /* not truncate */ );
20894 + reiserfs_update_sd(&th, new_dentry_inode);
20897 + retval = journal_end(&th);
20898 + reiserfs_write_unlock(old_dir->i_sb);
20899 + return retval;
20902 +static const struct inode_operations reiserfs_priv_dir_inode_operations = {
20903 + .create = reiserfs_create,
20904 + .lookup = reiserfs_lookup,
20905 + .link = reiserfs_link,
20906 + .unlink = reiserfs_unlink,
20907 + .symlink = reiserfs_symlink,
20908 + .mkdir = reiserfs_mkdir,
20909 + .rmdir = reiserfs_rmdir,
20910 + .mknod = reiserfs_mknod,
20911 + .rename = reiserfs_rename,
20912 + .setattr = reiserfs_setattr,
20913 + .permission = reiserfs_permission,
20914 + .fileattr_get = reiserfs_fileattr_get,
20915 + .fileattr_set = reiserfs_fileattr_set,
20918 +static const struct inode_operations reiserfs_priv_symlink_inode_operations = {
20919 + .get_link = page_get_link,
20920 + .setattr = reiserfs_setattr,
20921 + .permission = reiserfs_permission,
20924 +static const struct inode_operations reiserfs_priv_special_inode_operations = {
20925 + .setattr = reiserfs_setattr,
20926 + .permission = reiserfs_permission,
20929 +void reiserfs_init_priv_inode(struct inode *inode)
20931 + inode->i_flags |= S_PRIVATE;
20932 + inode->i_opflags &= ~IOP_XATTR;
20934 + if (S_ISREG(inode->i_mode))
20935 + inode->i_op = &reiserfs_priv_file_inode_operations;
20936 + else if (S_ISDIR(inode->i_mode))
20937 + inode->i_op = &reiserfs_priv_dir_inode_operations;
20938 + else if (S_ISLNK(inode->i_mode))
20939 + inode->i_op = &reiserfs_priv_symlink_inode_operations;
20940 + else
20941 + inode->i_op = &reiserfs_priv_special_inode_operations;
20944 +/* directories can handle most operations... */
20945 +const struct inode_operations reiserfs_dir_inode_operations = {
20946 + .create = reiserfs_create,
20947 + .lookup = reiserfs_lookup,
20948 + .link = reiserfs_link,
20949 + .unlink = reiserfs_unlink,
20950 + .symlink = reiserfs_symlink,
20951 + .mkdir = reiserfs_mkdir,
20952 + .rmdir = reiserfs_rmdir,
20953 + .mknod = reiserfs_mknod,
20954 + .rename = reiserfs_rename,
20955 + .setattr = reiserfs_setattr,
20956 + .listxattr = reiserfs_listxattr,
20957 + .permission = reiserfs_permission,
20958 + .get_inode_acl = reiserfs_get_acl,
20959 + .set_acl = reiserfs_set_acl,
20960 + .fileattr_get = reiserfs_fileattr_get,
20961 + .fileattr_set = reiserfs_fileattr_set,
20965 + * symlink operations.. same as page_symlink_inode_operations, with xattr
20966 + * stuff added
20967 + */
20968 +const struct inode_operations reiserfs_symlink_inode_operations = {
20969 + .get_link = page_get_link,
20970 + .setattr = reiserfs_setattr,
20971 + .listxattr = reiserfs_listxattr,
20972 + .permission = reiserfs_permission,
20976 + * special file operations.. just xattr/acl stuff
20977 + */
20978 +const struct inode_operations reiserfs_special_inode_operations = {
20979 + .setattr = reiserfs_setattr,
20980 + .listxattr = reiserfs_listxattr,
20981 + .permission = reiserfs_permission,
20982 + .get_inode_acl = reiserfs_get_acl,
20983 + .set_acl = reiserfs_set_acl,
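Before moving on to objectid.c: the while (1) loop in reiserfs_rename above is the clearest instance of the optimistic pattern this file uses wherever it must hold several paths at once: look everything up, journal-prepare the buffers, then re-verify that nothing moved while the lookups may have slept, and retry from scratch if it did. Stripped of the reiserfs specifics, the shape is the following user-space sketch (all names here are stand-ins, not kernel API):

#include <stdbool.h>
#include <stdio.h>

/* toy stand-ins for lookup/prepare/validate (assumed, not kernel API) */
struct snapshot { int version; };
static int tree_version = 42;

static struct snapshot lookup(void) { return (struct snapshot){ tree_version }; }
static void prepare_for_journal(struct snapshot *s) { (void)s; }
static void restore_prepared(struct snapshot *s) { (void)s; }
static bool still_valid(const struct snapshot *s)
{
	/* plays the role of item_moved()/entry_points_to_object() */
	return s->version == tree_version;
}

int main(void)
{
	struct snapshot s;

	for (;;) {
		s = lookup();               /* may sleep; tree may shift meanwhile */
		prepare_for_journal(&s);    /* claim the buffers we will dirty */
		if (still_valid(&s))
			break;              /* everything pinned and still valid */
		restore_prepared(&s);       /* undo the claim and retry */
	}
	printf("all buffers claimed; changes now applied in one fell swoop\n");
	return 0;
}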
20985 diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
20986 new file mode 100644
20987 index 000000000000..34baf5c0f265
20988 --- /dev/null
20989 +++ b/fs/reiserfs/objectid.c
20990 @@ -0,0 +1,216 @@
20992 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
20993 + */
20995 +#include <linux/string.h>
20996 +#include <linux/time.h>
20997 +#include <linux/uuid.h>
20998 +#include "reiserfs.h"
21000 +/* find where objectid map starts */
21001 +#define objectid_map(s,rs) (old_format_only (s) ? \
21002 + (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\
21003 + (__le32 *)((rs) + 1))
21005 +#ifdef CONFIG_REISERFS_CHECK
21007 +static void check_objectid_map(struct super_block *s, __le32 * map)
21009 + if (le32_to_cpu(map[0]) != 1)
21010 + reiserfs_panic(s, "vs-15010", "map corrupted: %lx",
21011 + (long unsigned int)le32_to_cpu(map[0]));
21013 + /* FIXME: add something else here */
21016 +#else
21017 +static void check_objectid_map(struct super_block *s, __le32 * map)
21020 +#endif
21023 + * When we allocate objectids we allocate the first unused objectid.
21024 + * Each sequence of objectids in use (the odd sequences) is followed
21025 + * by a sequence of objectids not in use (the even sequences). We
21026 + * only need to record the last objectid in each of these sequences
21027 + * (both the odd and even sequences) in order to fully define the
21028 + * boundaries of the sequences. A consequence of allocating the first
21029 + * objectid not in use is that under most conditions this scheme is
21030 + * extremely compact. The exception is immediately after a sequence
21031 + * of operations which deletes a large number of objects of
21032 + * non-sequential objectids, and even then it will become compact
21033 + * again as soon as more objects are created. Note that many
21034 + * interesting optimizations of layout could result from complicating
21035 + * objectid assignment, but we have deferred making them for now.
21036 + */
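The boundary encoding described above is compact enough to demonstrate in a few lines. A user-space sketch of the allocation half, with the array layout taken from the comment (map[2k] starts a run of used ids, map[2k+1] the free run that follows; the collapse step mirrors the map[1] == map[2] check in the function below):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t map[16] = { 1, 2, 5, 6 }; /* id 1 used; 2-4 free; 5 used; >= 6 free */
static int cursize = 4;

static uint32_t alloc_objectid(void)
{
	uint32_t id = map[1];

	map[1]++;                              /* take the first unused id */
	if (cursize > 2 && map[1] == map[2]) { /* free run emptied: collapse */
		memmove(map + 1, map + 3, (cursize - 3) * sizeof(uint32_t));
		cursize -= 2;
	}
	return id;
}

int main(void)
{
	/* prints 2, 3, 4 (map collapses to {1, 6}), then 6 */
	for (int i = 0; i < 4; i++)
		printf("allocated %u, cursize %d\n", alloc_objectid(), cursize);
	return 0;
}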
21038 +/* get unique object identifier */
21039 +__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
21041 + struct super_block *s = th->t_super;
21042 + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
21043 + __le32 *map = objectid_map(s, rs);
21044 + __u32 unused_objectid;
21046 + BUG_ON(!th->t_trans_id);
21048 + check_objectid_map(s, map);
21050 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
21051 + /* comment needed -Hans */
21052 + unused_objectid = le32_to_cpu(map[1]);
21053 + if (unused_objectid == U32_MAX) {
21054 + reiserfs_warning(s, "reiserfs-15100", "no more object ids");
21055 + reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s));
21056 + return 0;
21059 + /*
21060 + * This incrementation allocates the first unused objectid. That
21061 + * is to say, the first entry on the objectid map is the first
21062 + * unused objectid, and by incrementing it we use it. See below
21063 + * where we check to see if we eliminated a sequence of unused
21064 + * objectids....
21065 + */
21066 + map[1] = cpu_to_le32(unused_objectid + 1);
21068 + /*
21069 + * Now we check to see if we eliminated the last remaining member of
21070 + * the first even sequence (and can eliminate the sequence by
21071 + * eliminating its last objectid from oids), and can collapse the
21072 + * first two odd sequences into one sequence. If so, then the net
21073 + * result is to eliminate a pair of objectids from oids. We do this
21074 + * by shifting the entire map to the left.
21075 + */
21076 + if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) {
21077 + memmove(map + 1, map + 3,
21078 + (sb_oid_cursize(rs) - 3) * sizeof(__u32));
21079 + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
21082 + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
21083 + return unused_objectid;
21086 +/* makes object identifier unused */
21087 +void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
21088 + __u32 objectid_to_release)
21090 + struct super_block *s = th->t_super;
21091 + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
21092 + __le32 *map = objectid_map(s, rs);
21093 + int i = 0;
21095 + BUG_ON(!th->t_trans_id);
21096 + /*return; */
21097 + check_objectid_map(s, map);
21099 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
21100 + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
21102 + /*
21103 + * start at the beginning of the objectid map (i = 0) and go to
21104 + * the end of it (i = disk_sb->s_oid_cursize). Linear search is
21105 + * what we use, though it is possible that binary search would be
21106 + * more efficient after performing lots of deletions (which is
21107 + * when oids is large.) We only check even i's.
21108 + */
21109 + while (i < sb_oid_cursize(rs)) {
21110 + if (objectid_to_release == le32_to_cpu(map[i])) {
21111 + /* This incrementation unallocates the objectid. */
21112 + le32_add_cpu(&map[i], 1);
21114 + /*
21115 + * Did we unallocate the last member of an
21116 + * odd sequence, and can shrink oids?
21117 + */
21118 + if (map[i] == map[i + 1]) {
21119 + /* shrink objectid map */
21120 + memmove(map + i, map + i + 2,
21121 + (sb_oid_cursize(rs) - i -
21122 + 2) * sizeof(__u32));
21123 + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
21125 + RFALSE(sb_oid_cursize(rs) < 2 ||
21126 + sb_oid_cursize(rs) > sb_oid_maxsize(rs),
21127 + "vs-15005: objectid map corrupted cur_size == %d (max == %d)",
21128 + sb_oid_cursize(rs), sb_oid_maxsize(rs));
21130 + return;
21133 + if (objectid_to_release > le32_to_cpu(map[i]) &&
21134 + objectid_to_release < le32_to_cpu(map[i + 1])) {
21135 + /* size of objectid map is not changed */
21136 + if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) {
21137 + le32_add_cpu(&map[i + 1], -1);
21138 + return;
21141 + /*
21142 + * JDM comparing two little-endian values for
21143 + * equality -- safe
21144 + */
21145 + /*
21146 + * objectid map must be expanded, but
21147 + * there is no space
21148 + */
21149 + if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) {
21150 + PROC_INFO_INC(s, leaked_oid);
21151 + return;
21154 + /* expand the objectid map */
21155 + memmove(map + i + 3, map + i + 1,
21156 + (sb_oid_cursize(rs) - i - 1) * sizeof(__u32));
21157 + map[i + 1] = cpu_to_le32(objectid_to_release);
21158 + map[i + 2] = cpu_to_le32(objectid_to_release + 1);
21159 + set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2);
21160 + return;
21162 + i += 2;
21165 + reiserfs_error(s, "vs-15011", "tried to free an already free object id (%lu)",
21166 + (long unsigned)objectid_to_release);
21169 +int reiserfs_convert_objectid_map_v1(struct super_block *s)
21171 + struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s);
21172 + int cur_size = sb_oid_cursize(disk_sb);
21173 + int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2;
21174 + int old_max = sb_oid_maxsize(disk_sb);
21175 + struct reiserfs_super_block_v1 *disk_sb_v1;
21176 + __le32 *objectid_map;
21177 + int i;
21179 + disk_sb_v1 =
21180 + (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
21181 + objectid_map = (__le32 *) (disk_sb_v1 + 1);
21183 + if (cur_size > new_size) {
21184 + /*
21185 + * mark everyone used that was listed as free at
21186 + * the end of the objectid map
21187 + */
21188 + objectid_map[new_size - 1] = objectid_map[cur_size - 1];
21189 + set_sb_oid_cursize(disk_sb, new_size);
21191 + /* move the smaller objectid map past the end of the new super */
21192 + for (i = new_size - 1; i >= 0; i--) {
21193 + objectid_map[i + (old_max - new_size)] = objectid_map[i];
21196 + /* set the max size so we don't overflow later */
21197 + set_sb_oid_maxsize(disk_sb, new_size);
21199 + /* Zero out label and generate random UUID */
21200 + memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label));
21201 + generate_random_uuid(disk_sb->s_uuid);
21203 + /* finally, zero out the unused chunk of the new super */
21204 + memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused));
21205 + return 0;
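For a sense of scale, the even-rounded map size that reiserfs_convert_objectid_map_v1 computes for a 4 KiB block works out as below (SB_SIZE is assumed to be 204 bytes here purely for the arithmetic; the real value comes from reiserfs.h):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const int blocksize = 4096;
	const int sb_size = 204;   /* assumed sizeof(struct reiserfs_super_block) */
	int new_size = (blocksize - sb_size) / (int)sizeof(uint32_t) / 2 * 2;

	printf("objectid map entries after conversion: %d\n", new_size); /* 972 */
	return 0;
}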
21207 diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
21208 new file mode 100644
21209 index 000000000000..84a194b77f19
21210 --- /dev/null
21211 +++ b/fs/reiserfs/prints.c
21212 @@ -0,0 +1,792 @@
21214 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
21215 + */
21217 +#include <linux/time.h>
21218 +#include <linux/fs.h>
21219 +#include "reiserfs.h"
21220 +#include <linux/string.h>
21221 +#include <linux/buffer_head.h>
21223 +#include <linux/stdarg.h>
21225 +static char error_buf[1024];
21226 +static char fmt_buf[1024];
21227 +static char off_buf[80];
21229 +static char *reiserfs_cpu_offset(struct cpu_key *key)
21231 + if (cpu_key_k_type(key) == TYPE_DIRENTRY)
21232 + sprintf(off_buf, "%llu(%llu)",
21233 + (unsigned long long)
21234 + GET_HASH_VALUE(cpu_key_k_offset(key)),
21235 + (unsigned long long)
21236 + GET_GENERATION_NUMBER(cpu_key_k_offset(key)));
21237 + else
21238 + sprintf(off_buf, "0x%Lx",
21239 + (unsigned long long)cpu_key_k_offset(key));
21240 + return off_buf;
21243 +static char *le_offset(struct reiserfs_key *key)
21245 + int version;
21247 + version = le_key_version(key);
21248 + if (le_key_k_type(version, key) == TYPE_DIRENTRY)
21249 + sprintf(off_buf, "%llu(%llu)",
21250 + (unsigned long long)
21251 + GET_HASH_VALUE(le_key_k_offset(version, key)),
21252 + (unsigned long long)
21253 + GET_GENERATION_NUMBER(le_key_k_offset(version, key)));
21254 + else
21255 + sprintf(off_buf, "0x%Lx",
21256 + (unsigned long long)le_key_k_offset(version, key));
21257 + return off_buf;
21260 +static char *cpu_type(struct cpu_key *key)
21262 + if (cpu_key_k_type(key) == TYPE_STAT_DATA)
21263 + return "SD";
21264 + if (cpu_key_k_type(key) == TYPE_DIRENTRY)
21265 + return "DIR";
21266 + if (cpu_key_k_type(key) == TYPE_DIRECT)
21267 + return "DIRECT";
21268 + if (cpu_key_k_type(key) == TYPE_INDIRECT)
21269 + return "IND";
21270 + return "UNKNOWN";
21273 +static char *le_type(struct reiserfs_key *key)
21275 + int version;
21277 + version = le_key_version(key);
21279 + if (le_key_k_type(version, key) == TYPE_STAT_DATA)
21280 + return "SD";
21281 + if (le_key_k_type(version, key) == TYPE_DIRENTRY)
21282 + return "DIR";
21283 + if (le_key_k_type(version, key) == TYPE_DIRECT)
21284 + return "DIRECT";
21285 + if (le_key_k_type(version, key) == TYPE_INDIRECT)
21286 + return "IND";
21287 + return "UNKNOWN";
21290 +/* %k */
21291 +static int scnprintf_le_key(char *buf, size_t size, struct reiserfs_key *key)
21293 + if (key)
21294 + return scnprintf(buf, size, "[%d %d %s %s]",
21295 + le32_to_cpu(key->k_dir_id),
21296 + le32_to_cpu(key->k_objectid), le_offset(key),
21297 + le_type(key));
21298 + else
21299 + return scnprintf(buf, size, "[NULL]");
21302 +/* %K */
21303 +static int scnprintf_cpu_key(char *buf, size_t size, struct cpu_key *key)
21305 + if (key)
21306 + return scnprintf(buf, size, "[%d %d %s %s]",
21307 + key->on_disk_key.k_dir_id,
21308 + key->on_disk_key.k_objectid,
21309 + reiserfs_cpu_offset(key), cpu_type(key));
21310 + else
21311 + return scnprintf(buf, size, "[NULL]");
21314 +static int scnprintf_de_head(char *buf, size_t size,
21315 + struct reiserfs_de_head *deh)
21317 + if (deh)
21318 + return scnprintf(buf, size,
21319 + "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]",
21320 + deh_offset(deh), deh_dir_id(deh),
21321 + deh_objectid(deh), deh_location(deh),
21322 + deh_state(deh));
21323 + else
21324 + return scnprintf(buf, size, "[NULL]");
21328 +static int scnprintf_item_head(char *buf, size_t size, struct item_head *ih)
21330 + if (ih) {
21331 + char *p = buf;
21332 + char * const end = buf + size;
21334 + p += scnprintf(p, end - p, "%s",
21335 + (ih_version(ih) == KEY_FORMAT_3_6) ?
21336 + "*3.6* " : "*3.5*");
21338 + p += scnprintf_le_key(p, end - p, &ih->ih_key);
21340 + p += scnprintf(p, end - p,
21341 + ", item_len %d, item_location %d, free_space(entry_count) %d",
21342 + ih_item_len(ih), ih_location(ih),
21343 + ih_free_space(ih));
21344 + return p - buf;
21345 + } else
21346 + return scnprintf(buf, size, "[NULL]");
21349 +static int scnprintf_direntry(char *buf, size_t size,
21350 + struct reiserfs_dir_entry *de)
21352 + char name[20];
21354 + memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen);
21355 + name[de->de_namelen > 19 ? 19 : de->de_namelen] = 0;
21356 + return scnprintf(buf, size, "\"%s\"==>[%d %d]",
21357 + name, de->de_dir_id, de->de_objectid);
21360 +static int scnprintf_block_head(char *buf, size_t size, struct buffer_head *bh)
21362 + return scnprintf(buf, size,
21363 + "level=%d, nr_items=%d, free_space=%d rdkey ",
21364 + B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh));
21367 +static int scnprintf_buffer_head(char *buf, size_t size, struct buffer_head *bh)
21369 + return scnprintf(buf, size,
21370 + "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
21371 + bh->b_bdev, bh->b_size,
21372 + (unsigned long long)bh->b_blocknr,
21373 + atomic_read(&(bh->b_count)),
21374 + bh->b_state, bh->b_page,
21375 + buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE",
21376 + buffer_dirty(bh) ? "DIRTY" : "CLEAN",
21377 + buffer_locked(bh) ? "LOCKED" : "UNLOCKED");
21380 +static int scnprintf_disk_child(char *buf, size_t size, struct disk_child *dc)
21382 + return scnprintf(buf, size, "[dc_number=%d, dc_size=%u]",
21383 + dc_block_number(dc), dc_size(dc));
21386 +static char *is_there_reiserfs_struct(char *fmt, int *what)
21388 + char *k = fmt;
21390 + while ((k = strchr(k, '%')) != NULL) {
21391 + if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' ||
21392 + k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') {
21393 + *what = k[1];
21394 + break;
21396 + k++;
21398 + return k;
21402 + * When debugging reiserfs we used to print out a lot of different
21403 + * variables, like keys, item headers, buffer heads etc. Values of
21404 + * most fields matter, so it took a long time just to write an
21405 + * appropriate printk. With this reiserfs_warning you can use format
21406 + * specifications for complex structures like you used to do with
21407 + * printfs for integers, doubles and pointers. For instance, to print
21408 + * out a key structure you just write:
21409 + * reiserfs_warning ("bad key %k", key);
21410 + * instead of
21411 + * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid,
21412 + * key->k_offset, key->k_uniqueness);
21413 + */
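In practice a caller just mixes the extra specifiers with the standard ones and passes the structure by pointer; the namei.c hunk above already does exactly this:

/* from reiserfs_add_entry earlier in this patch -- %K consumes a struct cpu_key * */
reiserfs_warning(dir->i_sb, "vs-7032",
		 "entry with this key (%K) already exists", &entry_key);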
21414 +static DEFINE_SPINLOCK(error_lock);
21415 +static void prepare_error_buf(const char *fmt, va_list args)
21417 + char *fmt1 = fmt_buf;
21418 + char *k;
21419 + char *p = error_buf;
21420 + char * const end = &error_buf[sizeof(error_buf)];
21421 + int what;
21423 + spin_lock(&error_lock);
21425 + if (WARN_ON(strscpy(fmt_buf, fmt, sizeof(fmt_buf)) < 0)) {
21426 + strscpy(error_buf, "format string too long", end - error_buf);
21427 + goto out_unlock;
21430 + while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) {
21431 + *k = 0;
21433 + p += vscnprintf(p, end - p, fmt1, args);
21435 + switch (what) {
21436 + case 'k':
21437 + p += scnprintf_le_key(p, end - p,
21438 + va_arg(args, struct reiserfs_key *));
21439 + break;
21440 + case 'K':
21441 + p += scnprintf_cpu_key(p, end - p,
21442 + va_arg(args, struct cpu_key *));
21443 + break;
21444 + case 'h':
21445 + p += scnprintf_item_head(p, end - p,
21446 + va_arg(args, struct item_head *));
21447 + break;
21448 + case 't':
21449 + p += scnprintf_direntry(p, end - p,
21450 + va_arg(args, struct reiserfs_dir_entry *));
21451 + break;
21452 + case 'y':
21453 + p += scnprintf_disk_child(p, end - p,
21454 + va_arg(args, struct disk_child *));
21455 + break;
21456 + case 'z':
21457 + p += scnprintf_block_head(p, end - p,
21458 + va_arg(args, struct buffer_head *));
21459 + break;
21460 + case 'b':
21461 + p += scnprintf_buffer_head(p, end - p,
21462 + va_arg(args, struct buffer_head *));
21463 + break;
21464 + case 'a':
21465 + p += scnprintf_de_head(p, end - p,
21466 + va_arg(args, struct reiserfs_de_head *));
21467 + break;
21470 + fmt1 = k + 2;
21472 + p += vscnprintf(p, end - p, fmt1, args);
21473 +out_unlock:
21474 + spin_unlock(&error_lock);
21479 + * in addition to usual conversion specifiers this accepts reiserfs
21480 + * specific conversion specifiers:
21481 + * %k to print little endian key,
21482 + * %K to print cpu key,
21483 + * %h to print item_head,
21484 + * %t to print directory entry, %y to print disk_child,
21485 + * %z to print block head (arg must be struct buffer_head *),
21486 + * %b to print buffer_head, %a to print reiserfs_de_head
21487 + */
21489 +#define do_reiserfs_warning(fmt)\
21491 + va_list args;\
21492 + va_start( args, fmt );\
21493 + prepare_error_buf( fmt, args );\
21494 + va_end( args );\
21497 +void __reiserfs_warning(struct super_block *sb, const char *id,
21498 + const char *function, const char *fmt, ...)
21500 + do_reiserfs_warning(fmt);
21501 + if (sb)
21502 + printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: "
21503 + "%s\n", sb->s_id, id ? id : "", id ? " " : "",
21504 + function, error_buf);
21505 + else
21506 + printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n",
21507 + id ? id : "", id ? " " : "", function, error_buf);
21510 +/* No newline.. reiserfs_info calls can be followed by printk's */
21511 +void reiserfs_info(struct super_block *sb, const char *fmt, ...)
21513 + do_reiserfs_warning(fmt);
21514 + if (sb)
21515 + printk(KERN_NOTICE "REISERFS (device %s): %s",
21516 + sb->s_id, error_buf);
21517 + else
21518 + printk(KERN_NOTICE "REISERFS %s:", error_buf);
21521 +/* No newline.. reiserfs_printk calls can be followed by printk's */
21522 +static void reiserfs_printk(const char *fmt, ...)
21524 + do_reiserfs_warning(fmt);
21525 + printk("%s", error_buf); /* never re-interpret '%' in the buffer */
21528 +void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
21530 +#ifdef CONFIG_REISERFS_CHECK
21531 + do_reiserfs_warning(fmt);
21532 + if (s)
21533 + printk(KERN_DEBUG "REISERFS debug (device %s): %s\n",
21534 + s->s_id, error_buf);
21535 + else
21536 + printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf);
21537 +#endif
21541 + * The format:
21543 + * maintainer-errorid: [function-name:] message
21545 + * where errorid is unique to the maintainer and function-name is
21546 + * optional, is recommended, so that anyone can easily find the bug
21547 + * with a simple grep for the short, easy-to-type string
21548 + * maintainer-errorid. Don't bother with reusing errorids, there are
21549 + * lots of numbers out there.
21551 + * Example:
21553 + * reiserfs_panic(
21554 + * p_sb, "reiser-29: reiserfs_new_blocknrs: "
21555 + * "one of search_start or rn(%d) is equal to MAX_B_NUM,"
21556 + * "which means that we are optimizing location based on the "
21557 + * "bogus location of a temp buffer (%p).",
21558 + * rn, bh
21559 + * );
21561 + * Regular panic()s sometimes clear the screen before the message can
21562 + * be read, thus the need for the while loop.
21564 + * Numbering scheme for panic used by Vladimir and Anatoly (Hans completely
21565 + * ignores this scheme, and considers it pointless complexity):
21567 + * panics in reiserfs_fs.h have numbers from 1000 to 1999
21568 + * super.c 2000 to 2999
21569 + * preserve.c (unused) 3000 to 3999
21570 + * bitmap.c 4000 to 4999
21571 + * stree.c 5000 to 5999
21572 + * prints.c 6000 to 6999
21573 + * namei.c 7000 to 7999
21574 + * fix_nodes.c 8000 to 8999
21575 + * dir.c 9000 to 9999
21576 + * lbalance.c 10000 to 10999
21577 + * ibalance.c 11000 to 11999 not ready
21578 + * do_balan.c 12000 to 12999
21579 + * inode.c 13000 to 13999
21580 + * file.c 14000 to 14999
21581 + * objectid.c 15000 - 15999
21582 + * buffer.c 16000 - 16999
21583 + * symlink.c 17000 - 17999
21585 + */
21587 +void __reiserfs_panic(struct super_block *sb, const char *id,
21588 + const char *function, const char *fmt, ...)
21590 + do_reiserfs_warning(fmt);
21592 +#ifdef CONFIG_REISERFS_CHECK
21593 + dump_stack();
21594 +#endif
21595 + if (sb)
21596 + printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n",
21597 + sb->s_id, id ? id : "", id ? " " : "",
21598 + function, error_buf);
21599 + else
21600 + printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n",
21601 + id ? id : "", id ? " " : "", function, error_buf);
21602 + BUG();
21605 +void __reiserfs_error(struct super_block *sb, const char *id,
21606 + const char *function, const char *fmt, ...)
21608 + do_reiserfs_warning(fmt);
21610 + BUG_ON(sb == NULL);
21612 + if (reiserfs_error_panic(sb))
21613 + __reiserfs_panic(sb, id, function, error_buf);
21615 + if (id && id[0])
21616 + printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n",
21617 + sb->s_id, id, function, error_buf);
21618 + else
21619 + printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n",
21620 + sb->s_id, function, error_buf);
21622 + if (sb_rdonly(sb))
21623 + return;
21625 + reiserfs_info(sb, "Remounting filesystem read-only\n");
21626 + sb->s_flags |= SB_RDONLY;
21627 + reiserfs_abort_journal(sb, -EIO);
21630 +void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
21632 + do_reiserfs_warning(fmt);
21634 + if (reiserfs_error_panic(sb)) {
21635 + panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id,
21636 + error_buf);
21639 + if (reiserfs_is_journal_aborted(SB_JOURNAL(sb)))
21640 + return;
21642 + printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id,
21643 + error_buf);
21645 + sb->s_flags |= SB_RDONLY;
21646 + reiserfs_abort_journal(sb, errno);
21650 + * this prints internal nodes (4 keys/items in line) (dc_number,
21651 + * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number,
21652 + * dc_size)...
21653 + */
21654 +static int print_internal(struct buffer_head *bh, int first, int last)
21656 + struct reiserfs_key *key;
21657 + struct disk_child *dc;
21658 + int i;
21659 + int from, to;
21661 + if (!B_IS_KEYS_LEVEL(bh))
21662 + return 1;
21664 + check_internal(bh);
21666 + if (first == -1) {
21667 + from = 0;
21668 + to = B_NR_ITEMS(bh);
21669 + } else {
21670 + from = first;
21671 + to = min_t(int, last, B_NR_ITEMS(bh));
21674 + reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh);
21676 + dc = B_N_CHILD(bh, from);
21677 + reiserfs_printk("PTR %d: %y ", from, dc);
21679 + for (i = from, key = internal_key(bh, from), dc++; i < to;
21680 + i++, key++, dc++) {
21681 + reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc);
21682 + if (i && i % 4 == 0)
21683 + printk("\n");
21685 + printk("\n");
21686 + return 0;
21689 +static int print_leaf(struct buffer_head *bh, int print_mode, int first,
21690 + int last)
21692 + struct block_head *blkh;
21693 + struct item_head *ih;
21694 + int i, nr;
21695 + int from, to;
21697 + if (!B_IS_ITEMS_LEVEL(bh))
21698 + return 1;
21700 + check_leaf(bh);
21702 + blkh = B_BLK_HEAD(bh);
21703 + ih = item_head(bh, 0);
21704 + nr = blkh_nr_item(blkh);
21706 + printk
21707 + ("\n===================================================================\n");
21708 + reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh);
21710 + if (!(print_mode & PRINT_LEAF_ITEMS)) {
21711 + reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n",
21712 + &(ih->ih_key), &((ih + nr - 1)->ih_key));
21713 + return 0;
21716 + if (first < 0 || first > nr - 1)
21717 + from = 0;
21718 + else
21719 + from = first;
21721 + if (last < 0 || last > nr)
21722 + to = nr;
21723 + else
21724 + to = last;
21726 + ih += from;
21727 + printk
21728 + ("-------------------------------------------------------------------------------\n");
21729 + printk
21730 + ("|##| type | key | ilen | free_space | version | loc |\n");
21731 + for (i = from; i < to; i++, ih++) {
21732 + printk
21733 + ("-------------------------------------------------------------------------------\n");
21734 + reiserfs_printk("|%2d| %h |\n", i, ih);
21735 + if (print_mode & PRINT_LEAF_ITEMS)
21736 + op_print_item(ih, ih_item_body(bh, ih));
21739 + printk
21740 + ("===================================================================\n");
21742 + return 0;
21745 +char *reiserfs_hashname(int code)
21747 + if (code == YURA_HASH)
21748 + return "rupasov";
21749 + if (code == TEA_HASH)
21750 + return "tea";
21751 + if (code == R5_HASH)
21752 + return "r5";
21754 + return "unknown";
21757 +/* return 1 if this is not super block */
21758 +static int print_super_block(struct buffer_head *bh)
21760 + struct reiserfs_super_block *rs =
21761 + (struct reiserfs_super_block *)(bh->b_data);
21762 + int skipped, data_blocks;
21763 + char *version;
21765 + if (is_reiserfs_3_5(rs)) {
21766 + version = "3.5";
21767 + } else if (is_reiserfs_3_6(rs)) {
21768 + version = "3.6";
21769 + } else if (is_reiserfs_jr(rs)) {
21770 + version = ((sb_version(rs) == REISERFS_VERSION_2) ?
21771 + "3.6" : "3.5");
21772 + } else {
21773 + return 1;
21776 + printk("%pg\'s super block is in block %llu\n", bh->b_bdev,
21777 + (unsigned long long)bh->b_blocknr);
21778 + printk("Reiserfs version %s\n", version);
21779 + printk("Block count %u\n", sb_block_count(rs));
21780 + printk("Blocksize %d\n", sb_blocksize(rs));
21781 + printk("Free blocks %u\n", sb_free_blocks(rs));
21782 + /*
21783 + * FIXME: this would be confusing if
21784 + * someone stores reiserfs super block in some data block ;)
21785 +// skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs);
21786 + */
21787 + skipped = bh->b_blocknr;
21788 + data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) -
21789 + (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) +
21790 + 1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs);
21791 + printk
21792 + ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n"
21793 + "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs),
21794 + (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) :
21795 + sb_reserved_for_journal(rs)), data_blocks);
21796 + printk("Root block %u\n", sb_root_block(rs));
21797 + printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs));
21798 + printk("Journal dev %d\n", sb_jp_journal_dev(rs));
21799 + printk("Journal orig size %d\n", sb_jp_journal_size(rs));
21800 + printk("FS state %d\n", sb_fs_state(rs));
21801 + printk("Hash function \"%s\"\n",
21802 + reiserfs_hashname(sb_hash_function_code(rs)));
21804 + printk("Tree height %d\n", sb_tree_height(rs));
21805 + return 0;
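The data_blocks arithmetic above is whole-device accounting: whatever is not
skipped boot area, super block, bitmaps, journal, or free space must hold
data. A worked instance with made-up but plausible numbers (8 GiB device,
4 KiB blocks, super block in block 16, 64 bitmap blocks, the default
8192-block journal plus its header, one million blocks free):

    data_blocks = 2097152 - 16 - 1 - 64 - (8192 + 1) - 1000000
                = 1088878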
21808 +static int print_desc_block(struct buffer_head *bh)
21810 + struct reiserfs_journal_desc *desc;
21812 + if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8))
21813 + return 1;
21815 + desc = (struct reiserfs_journal_desc *)(bh->b_data);
21816 + printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)",
21817 + (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc),
21818 + get_desc_mount_id(desc), get_desc_trans_len(desc));
21820 + return 0;
21822 +/* ..., int print_mode, int first, int last) */
21823 +void print_block(struct buffer_head *bh, ...)
21825 + va_list args;
21826 + int mode, first, last;
21828 + if (!bh) {
21829 + printk("print_block: buffer is NULL\n");
21830 + return;
21833 + va_start(args, bh);
21835 + mode = va_arg(args, int);
21836 + first = va_arg(args, int);
21837 + last = va_arg(args, int);
21838 + if (print_leaf(bh, mode, first, last))
21839 + if (print_internal(bh, first, last))
21840 + if (print_super_block(bh))
21841 + if (print_desc_block(bh))
21842 + printk
21843 + ("Block %llu contains unformatted data\n",
21844 + (unsigned long long)bh->b_blocknr);
21846 + va_end(args);
21849 +static char print_tb_buf[2048];
21851 +/* this stores initial state of tree balance in the print_tb_buf */
21852 +void store_print_tb(struct tree_balance *tb)
21854 + int h = 0;
21855 + int i;
21856 + struct buffer_head *tbSh, *tbFh;
21858 + if (!tb)
21859 + return;
21861 + sprintf(print_tb_buf, "\n"
21862 + "BALANCING %d\n"
21863 + "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n"
21864 + "=====================================================================\n"
21865 + "* h * S * L * R * F * FL * FR * CFL * CFR *\n",
21866 + REISERFS_SB(tb->tb_sb)->s_do_balance,
21867 + tb->tb_mode, PATH_LAST_POSITION(tb->tb_path),
21868 + tb->tb_path->pos_in_item);
21870 + for (h = 0; h < ARRAY_SIZE(tb->insert_size); h++) {
21871 + if (PATH_H_PATH_OFFSET(tb->tb_path, h) <=
21872 + tb->tb_path->path_length
21873 + && PATH_H_PATH_OFFSET(tb->tb_path,
21874 + h) > ILLEGAL_PATH_ELEMENT_OFFSET) {
21875 + tbSh = PATH_H_PBUFFER(tb->tb_path, h);
21876 + tbFh = PATH_H_PPARENT(tb->tb_path, h);
21877 + } else {
21878 + tbSh = NULL;
21879 + tbFh = NULL;
21881 + sprintf(print_tb_buf + strlen(print_tb_buf),
21882 + "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n",
21883 + h,
21884 + (tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL),
21885 + (tbSh) ? atomic_read(&tbSh->b_count) : -1,
21886 + (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL),
21887 + (tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1,
21888 + (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL),
21889 + (tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1,
21890 + (tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL),
21891 + (tb->FL[h]) ? (long long)(tb->FL[h]->
21892 + b_blocknr) : (-1LL),
21893 + (tb->FR[h]) ? (long long)(tb->FR[h]->
21894 + b_blocknr) : (-1LL),
21895 + (tb->CFL[h]) ? (long long)(tb->CFL[h]->
21896 + b_blocknr) : (-1LL),
21897 + (tb->CFR[h]) ? (long long)(tb->CFR[h]->
21898 + b_blocknr) : (-1LL));
21901 + sprintf(print_tb_buf + strlen(print_tb_buf),
21902 + "=====================================================================\n"
21903 + "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n"
21904 + "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n",
21905 + tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],
21906 + tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0],
21907 + tb->sbytes[0], tb->snum[1], tb->sbytes[1],
21908 + tb->cur_blknum, tb->lkey[0], tb->rkey[0]);
21910 + /* this prints balance parameters for non-leaf levels */
21911 + h = 0;
21912 + do {
21913 + h++;
21914 + sprintf(print_tb_buf + strlen(print_tb_buf),
21915 + "* %d * %4d * %2d * * %2d * * %2d *\n",
21916 + h, tb->insert_size[h], tb->lnum[h], tb->rnum[h],
21917 + tb->blknum[h]);
21918 + } while (tb->insert_size[h]);
21920 + sprintf(print_tb_buf + strlen(print_tb_buf),
21921 + "=====================================================================\n"
21922 + "FEB list: ");
21924 + /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */
21925 + h = 0;
21926 + for (i = 0; i < ARRAY_SIZE(tb->FEB); i++)
21927 + sprintf(print_tb_buf + strlen(print_tb_buf),
21928 + "%p (%llu %d)%s", tb->FEB[i],
21929 + tb->FEB[i] ? (unsigned long long)tb->FEB[i]->
21930 + b_blocknr : 0ULL,
21931 + tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0,
21932 + (i == ARRAY_SIZE(tb->FEB) - 1) ? "\n" : ", ");
21934 + sprintf(print_tb_buf + strlen(print_tb_buf),
21935 + "======================== the end ====================================\n");
21938 +void print_cur_tb(char *mes)
21940 + printk("%s\n%s", mes, print_tb_buf);
21943 +static void check_leaf_block_head(struct buffer_head *bh)
21945 + struct block_head *blkh;
21946 + int nr;
21948 + blkh = B_BLK_HEAD(bh);
21949 + nr = blkh_nr_item(blkh);
21950 + if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE)
21951 + reiserfs_panic(NULL, "vs-6010", "invalid item number %z",
21952 + bh);
21953 + if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr)
21954 + reiserfs_panic(NULL, "vs-6020", "invalid free space %z",
21955 + bh);
21959 +static void check_internal_block_head(struct buffer_head *bh)
21961 + if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT))
21962 + reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh);
21964 + if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE)
21965 + reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh);
21967 + if (B_FREE_SPACE(bh) !=
21968 + bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) -
21969 + DC_SIZE * (B_NR_ITEMS(bh) + 1))
21970 + reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh);
21974 +void check_leaf(struct buffer_head *bh)
21976 + int i;
21977 + struct item_head *ih;
21979 + if (!bh)
21980 + return;
21981 + check_leaf_block_head(bh);
21982 + for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++)
21983 + op_check_item(ih, ih_item_body(bh, ih));
21986 +void check_internal(struct buffer_head *bh)
21988 + if (!bh)
21989 + return;
21990 + check_internal_block_head(bh);
21993 +void print_statistics(struct super_block *s)
21996 + /*
21997 + printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \
21998 + bmap with search %d, without %d, dir2ind %d, ind2dir %d\n",
21999 + REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes,
22000 + REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search,
22001 + REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct);
22002 + */
22005 diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
22006 new file mode 100644
22007 index 000000000000..5c68a4a52d78
22008 --- /dev/null
22009 +++ b/fs/reiserfs/procfs.c
22010 @@ -0,0 +1,490 @@
22011 +/* -*- linux-c -*- */
22013 +/* fs/reiserfs/procfs.c */
22016 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
22017 + */
22019 +/* proc info support a la one created by Sizif@Botik.RU for PGC */
22021 +#include <linux/module.h>
22022 +#include <linux/time.h>
22023 +#include <linux/seq_file.h>
22024 +#include <linux/uaccess.h>
22025 +#include "reiserfs.h"
22026 +#include <linux/init.h>
22027 +#include <linux/proc_fs.h>
22028 +#include <linux/blkdev.h>
22031 + * LOCKING:
22033 + * These guys are evicted from procfs as the very first step in ->kill_sb().
22035 + */
22037 +static int show_version(struct seq_file *m, void *unused)
22039 + struct super_block *sb = m->private;
22040 + char *format;
22042 + if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) {
22043 + format = "3.6";
22044 + } else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) {
22045 + format = "3.5";
22046 + } else {
22047 + format = "unknown";
22050 + seq_printf(m, "%s format\twith checks %s\n", format,
22051 +#if defined( CONFIG_REISERFS_CHECK )
22052 + "on"
22053 +#else
22054 + "off"
22055 +#endif
22056 + );
22057 + return 0;
22060 +#define SF( x ) ( r -> x )
22061 +#define SFP( x ) SF( s_proc_info_data.x )
22062 +#define SFPL( x ) SFP( x[ level ] )
22063 +#define SFPF( x ) SFP( scan_bitmap.x )
22064 +#define SFPJ( x ) SFP( journal.x )
22066 +#define D2C( x ) le16_to_cpu( x )
22067 +#define D4C( x ) le32_to_cpu( x )
22068 +#define DF( x ) D2C( rs -> s_v1.x )
22069 +#define DFL( x ) D4C( rs -> s_v1.x )
22071 +#define objectid_map( s, rs ) (old_format_only (s) ? \
22072 + (__le32 *)((struct reiserfs_super_block_v1 *)rs + 1) : \
22073 + (__le32 *)(rs + 1))
22074 +#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] )
22076 +#define DJF( x ) le32_to_cpu( rs -> x )
22077 +#define DJP( x ) le32_to_cpu( jp -> x )
22078 +#define JF( x ) ( r -> s_journal -> x )
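These one-letter macros exist only to keep the seq_printf() argument lists
below readable. Expanding one by hand (definitions as above):

    SFPL(balance_at)
      -> SFP(balance_at[level])
      -> SF(s_proc_info_data.balance_at[level])
      -> r->s_proc_info_data.balance_at[level]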
22080 +static int show_super(struct seq_file *m, void *unused)
22082 + struct super_block *sb = m->private;
22083 + struct reiserfs_sb_info *r = REISERFS_SB(sb);
22085 + seq_printf(m, "state: \t%s\n"
22086 + "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
22087 + "gen. counter: \t%i\n"
22088 + "s_disk_reads: \t%i\n"
22089 + "s_disk_writes: \t%i\n"
22090 + "s_fix_nodes: \t%i\n"
22091 + "s_do_balance: \t%i\n"
22092 + "s_unneeded_left_neighbor: \t%i\n"
22093 + "s_good_search_by_key_reada: \t%i\n"
22094 + "s_bmaps: \t%i\n"
22095 + "s_bmaps_without_search: \t%i\n"
22096 + "s_direct2indirect: \t%i\n"
22097 + "s_indirect2direct: \t%i\n"
22098 + "\n"
22099 + "max_hash_collisions: \t%i\n"
22100 + "breads: \t%lu\n"
22101 + "bread_misses: \t%lu\n"
22102 + "search_by_key: \t%lu\n"
22103 + "search_by_key_fs_changed: \t%lu\n"
22104 + "search_by_key_restarted: \t%lu\n"
22105 + "insert_item_restarted: \t%lu\n"
22106 + "paste_into_item_restarted: \t%lu\n"
22107 + "cut_from_item_restarted: \t%lu\n"
22108 + "delete_solid_item_restarted: \t%lu\n"
22109 + "delete_item_restarted: \t%lu\n"
22110 + "leaked_oid: \t%lu\n"
22111 + "leaves_removable: \t%lu\n",
22112 + SF(s_mount_state) == REISERFS_VALID_FS ?
22113 + "REISERFS_VALID_FS" : "REISERFS_ERROR_FS",
22114 + reiserfs_r5_hash(sb) ? "FORCE_R5 " : "",
22115 + reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "",
22116 + reiserfs_tea_hash(sb) ? "FORCE_TEA " : "",
22117 + reiserfs_hash_detect(sb) ? "DETECT_HASH " : "",
22118 + reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ",
22119 + reiserfs_no_unhashed_relocation(sb) ?
22120 + "NO_UNHASHED_RELOCATION " : "",
22121 + reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "",
22122 + reiserfs_test4(sb) ? "TEST4 " : "",
22123 + have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ?
22124 + "SMALL_TAILS " : "NO_TAILS ",
22125 + replay_only(sb) ? "REPLAY_ONLY " : "",
22126 + convert_reiserfs(sb) ? "CONV " : "",
22127 + atomic_read(&r->s_generation_counter),
22128 + SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes),
22129 + SF(s_do_balance), SF(s_unneeded_left_neighbor),
22130 + SF(s_good_search_by_key_reada), SF(s_bmaps),
22131 + SF(s_bmaps_without_search), SF(s_direct2indirect),
22132 + SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads),
22133 + SFP(bread_miss), SFP(search_by_key),
22134 + SFP(search_by_key_fs_changed), SFP(search_by_key_restarted),
22135 + SFP(insert_item_restarted), SFP(paste_into_item_restarted),
22136 + SFP(cut_from_item_restarted),
22137 + SFP(delete_solid_item_restarted), SFP(delete_item_restarted),
22138 + SFP(leaked_oid), SFP(leaves_removable));
22140 + return 0;
22143 +static int show_per_level(struct seq_file *m, void *unused)
22145 + struct super_block *sb = m->private;
22146 + struct reiserfs_sb_info *r = REISERFS_SB(sb);
22147 + int level;
22149 + seq_printf(m, "level\t"
22150 + " balances"
22151 + " [sbk: reads"
22152 + " fs_changed"
22153 + " restarted]"
22154 + " free space"
22155 + " items"
22156 + " can_remove"
22157 + " lnum"
22158 + " rnum"
22159 + " lbytes"
22160 + " rbytes"
22161 + " get_neig"
22162 + " get_neig_res" " need_l_neig" " need_r_neig" "\n");
22164 + for (level = 0; level < MAX_HEIGHT; ++level) {
22165 + seq_printf(m, "%i\t"
22166 + " %12lu"
22167 + " %12lu"
22168 + " %12lu"
22169 + " %12lu"
22170 + " %12lu"
22171 + " %12lu"
22172 + " %12lu"
22173 + " %12li"
22174 + " %12li"
22175 + " %12li"
22176 + " %12li"
22177 + " %12lu"
22178 + " %12lu"
22179 + " %12lu"
22180 + " %12lu"
22181 + "\n",
22182 + level,
22183 + SFPL(balance_at),
22184 + SFPL(sbk_read_at),
22185 + SFPL(sbk_fs_changed),
22186 + SFPL(sbk_restarted),
22187 + SFPL(free_at),
22188 + SFPL(items_at),
22189 + SFPL(can_node_be_removed),
22190 + SFPL(lnum),
22191 + SFPL(rnum),
22192 + SFPL(lbytes),
22193 + SFPL(rbytes),
22194 + SFPL(get_neighbors),
22195 + SFPL(get_neighbors_restart),
22196 + SFPL(need_l_neighbor), SFPL(need_r_neighbor)
22197 + );
22199 + return 0;
22202 +static int show_bitmap(struct seq_file *m, void *unused)
22204 + struct super_block *sb = m->private;
22205 + struct reiserfs_sb_info *r = REISERFS_SB(sb);
22207 + seq_printf(m, "free_block: %lu\n"
22208 + " scan_bitmap:"
22209 + " wait"
22210 + " bmap"
22211 + " retry"
22212 + " stolen"
22213 + " journal_hint"
22214 + "journal_nohint"
22215 + "\n"
22216 + " %14lu"
22217 + " %14lu"
22218 + " %14lu"
22219 + " %14lu"
22220 + " %14lu"
22221 + " %14lu"
22222 + " %14lu"
22223 + "\n",
22224 + SFP(free_block),
22225 + SFPF(call),
22226 + SFPF(wait),
22227 + SFPF(bmap),
22228 + SFPF(retry),
22229 + SFPF(stolen),
22230 + SFPF(in_journal_hint), SFPF(in_journal_nohint));
22232 + return 0;
22235 +static int show_on_disk_super(struct seq_file *m, void *unused)
22237 + struct super_block *sb = m->private;
22238 + struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
22239 + struct reiserfs_super_block *rs = sb_info->s_rs;
22240 + int hash_code = DFL(s_hash_function_code);
22241 + __u32 flags = DJF(s_flags);
22243 + seq_printf(m, "block_count: \t%i\n"
22244 + "free_blocks: \t%i\n"
22245 + "root_block: \t%i\n"
22246 + "blocksize: \t%i\n"
22247 + "oid_maxsize: \t%i\n"
22248 + "oid_cursize: \t%i\n"
22249 + "umount_state: \t%i\n"
22250 + "magic: \t%10.10s\n"
22251 + "fs_state: \t%i\n"
22252 + "hash: \t%s\n"
22253 + "tree_height: \t%i\n"
22254 + "bmap_nr: \t%i\n"
22255 + "version: \t%i\n"
22256 + "flags: \t%x[%s]\n"
22257 + "reserved_for_journal: \t%i\n",
22258 + DFL(s_block_count),
22259 + DFL(s_free_blocks),
22260 + DFL(s_root_block),
22261 + DF(s_blocksize),
22262 + DF(s_oid_maxsize),
22263 + DF(s_oid_cursize),
22264 + DF(s_umount_state),
22265 + rs->s_v1.s_magic,
22266 + DF(s_fs_state),
22267 + hash_code == TEA_HASH ? "tea" :
22268 + (hash_code == YURA_HASH) ? "rupasov" :
22269 + (hash_code == R5_HASH) ? "r5" :
22270 + (hash_code == UNSET_HASH) ? "unset" : "unknown",
22271 + DF(s_tree_height),
22272 + DF(s_bmap_nr),
22273 + DF(s_version), flags, (flags & reiserfs_attrs_cleared)
22274 + ? "attrs_cleared" : "", DF(s_reserved_for_journal));
22276 + return 0;
22279 +static int show_oidmap(struct seq_file *m, void *unused)
22281 + struct super_block *sb = m->private;
22282 + struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
22283 + struct reiserfs_super_block *rs = sb_info->s_rs;
22284 + unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize);
22285 + unsigned long total_used = 0;
22286 + int i;
22288 + for (i = 0; i < mapsize; ++i) {
22289 + __u32 right;
22291 + right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1);
22292 + seq_printf(m, "%s: [ %x .. %x )\n",
22293 + (i & 1) ? "free" : "used", MAP(i), right);
22294 + if (!(i & 1)) {
22295 + total_used += right - MAP(i);
22298 +#if defined( REISERFS_USE_OIDMAPF )
22299 + if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) {
22300 + loff_t size = file_inode(sb_info->oidmap.mapf)->i_size;
22301 + total_used += size / sizeof(reiserfs_oidinterval_d_t);
22303 +#endif
22304 + seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n",
22305 + mapsize,
22306 + mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used);
22307 + return 0;
22310 +static time64_t ktime_mono_to_real_seconds(time64_t mono)
22312 + ktime_t kt = ktime_set(mono, NSEC_PER_SEC/2);
22314 + return ktime_divns(ktime_mono_to_real(kt), NSEC_PER_SEC);
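The helper above exists because j_trans_start_time is CLOCK_MONOTONIC while
the /proc output wants wall-clock seconds; seeding the ktime with
NSEC_PER_SEC/2 makes the final division round to the nearest second instead
of truncating. As plain arithmetic (a sketch; offset_ns stands for the
monotonic-to-real offset that ktime_mono_to_real() applies):

    real_seconds = (mono_s * NSEC_PER_SEC + NSEC_PER_SEC/2 + offset_ns)
                   / NSEC_PER_SEC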
22317 +static int show_journal(struct seq_file *m, void *unused)
22319 + struct super_block *sb = m->private;
22320 + struct reiserfs_sb_info *r = REISERFS_SB(sb);
22321 + struct reiserfs_super_block *rs = r->s_rs;
22322 + struct journal_params *jp = &rs->s_v1.s_journal;
22324 + seq_printf(m, /* on-disk fields */
22325 + "jp_journal_1st_block: \t%i\n"
22326 + "jp_journal_dev: \t%pg[%x]\n"
22327 + "jp_journal_size: \t%i\n"
22328 + "jp_journal_trans_max: \t%i\n"
22329 + "jp_journal_magic: \t%i\n"
22330 + "jp_journal_max_batch: \t%i\n"
22331 + "jp_journal_max_commit_age: \t%i\n"
22332 + "jp_journal_max_trans_age: \t%i\n"
22333 + /* incore fields */
22334 + "j_1st_reserved_block: \t%i\n"
22335 + "j_state: \t%li\n"
22336 + "j_trans_id: \t%u\n"
22337 + "j_mount_id: \t%lu\n"
22338 + "j_start: \t%lu\n"
22339 + "j_len: \t%lu\n"
22340 + "j_len_alloc: \t%lu\n"
22341 + "j_wcount: \t%i\n"
22342 + "j_bcount: \t%lu\n"
22343 + "j_first_unflushed_offset: \t%lu\n"
22344 + "j_last_flush_trans_id: \t%u\n"
22345 + "j_trans_start_time: \t%lli\n"
22346 + "j_list_bitmap_index: \t%i\n"
22347 + "j_must_wait: \t%i\n"
22348 + "j_next_full_flush: \t%i\n"
22349 + "j_next_async_flush: \t%i\n"
22350 + "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n"
22351 + /* reiserfs_proc_info_data_t.journal fields */
22352 + "in_journal: \t%12lu\n"
22353 + "in_journal_bitmap: \t%12lu\n"
22354 + "in_journal_reusable: \t%12lu\n"
22355 + "lock_journal: \t%12lu\n"
22356 + "lock_journal_wait: \t%12lu\n"
22357 + "journal_begin: \t%12lu\n"
22358 + "journal_relock_writers: \t%12lu\n"
22359 + "journal_relock_wcount: \t%12lu\n"
22360 + "mark_dirty: \t%12lu\n"
22361 + "mark_dirty_already: \t%12lu\n"
22362 + "mark_dirty_notjournal: \t%12lu\n"
22363 + "restore_prepared: \t%12lu\n"
22364 + "prepare: \t%12lu\n"
22365 + "prepare_retry: \t%12lu\n",
22366 + DJP(jp_journal_1st_block),
22367 + file_bdev(SB_JOURNAL(sb)->j_bdev_file),
22368 + DJP(jp_journal_dev),
22369 + DJP(jp_journal_size),
22370 + DJP(jp_journal_trans_max),
22371 + DJP(jp_journal_magic),
22372 + DJP(jp_journal_max_batch),
22373 + SB_JOURNAL(sb)->j_max_commit_age,
22374 + DJP(jp_journal_max_trans_age),
22375 + JF(j_1st_reserved_block),
22376 + JF(j_state),
22377 + JF(j_trans_id),
22378 + JF(j_mount_id),
22379 + JF(j_start),
22380 + JF(j_len),
22381 + JF(j_len_alloc),
22382 + atomic_read(&r->s_journal->j_wcount),
22383 + JF(j_bcount),
22384 + JF(j_first_unflushed_offset),
22385 + JF(j_last_flush_trans_id),
22386 + ktime_mono_to_real_seconds(JF(j_trans_start_time)),
22387 + JF(j_list_bitmap_index),
22388 + JF(j_must_wait),
22389 + JF(j_next_full_flush),
22390 + JF(j_next_async_flush),
22391 + JF(j_cnode_used),
22392 + JF(j_cnode_free),
22393 + SFPJ(in_journal),
22394 + SFPJ(in_journal_bitmap),
22395 + SFPJ(in_journal_reusable),
22396 + SFPJ(lock_journal),
22397 + SFPJ(lock_journal_wait),
22398 + SFPJ(journal_being),
22399 + SFPJ(journal_relock_writers),
22400 + SFPJ(journal_relock_wcount),
22401 + SFPJ(mark_dirty),
22402 + SFPJ(mark_dirty_already),
22403 + SFPJ(mark_dirty_notjournal),
22404 + SFPJ(restore_prepared), SFPJ(prepare), SFPJ(prepare_retry)
22405 + );
22406 + return 0;
22409 +static struct proc_dir_entry *proc_info_root = NULL;
22410 +static const char proc_info_root_name[] = "fs/reiserfs";
22412 +static void add_file(struct super_block *sb, char *name,
22413 + int (*func) (struct seq_file *, void *))
22415 + proc_create_single_data(name, 0, REISERFS_SB(sb)->procdir, func, sb);
22418 +int reiserfs_proc_info_init(struct super_block *sb)
22420 + char b[BDEVNAME_SIZE];
22421 + char *s;
22423 + /* Some block devices use /'s */
22424 + strscpy(b, sb->s_id, BDEVNAME_SIZE);
22425 + s = strchr(b, '/');
22426 + if (s)
22427 + *s = '!';
22429 + spin_lock_init(&__PINFO(sb).lock);
22430 + REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb);
22431 + if (REISERFS_SB(sb)->procdir) {
22432 + add_file(sb, "version", show_version);
22433 + add_file(sb, "super", show_super);
22434 + add_file(sb, "per-level", show_per_level);
22435 + add_file(sb, "bitmap", show_bitmap);
22436 + add_file(sb, "on-disk-super", show_on_disk_super);
22437 + add_file(sb, "oidmap", show_oidmap);
22438 + add_file(sb, "journal", show_journal);
22439 + return 0;
22441 + reiserfs_warning(sb, "cannot create /proc/%s/%s",
22442 + proc_info_root_name, b);
22443 + return 1;
22446 +int reiserfs_proc_info_done(struct super_block *sb)
22448 + struct proc_dir_entry *de = REISERFS_SB(sb)->procdir;
22449 + if (de) {
22450 + char b[BDEVNAME_SIZE];
22451 + char *s;
22453 + /* Some block devices use /'s */
22454 + strscpy(b, sb->s_id, BDEVNAME_SIZE);
22455 + s = strchr(b, '/');
22456 + if (s)
22457 + *s = '!';
22459 + remove_proc_subtree(b, proc_info_root);
22460 + REISERFS_SB(sb)->procdir = NULL;
22462 + return 0;
22465 +int reiserfs_proc_info_global_init(void)
22467 + if (proc_info_root == NULL) {
22468 + proc_info_root = proc_mkdir(proc_info_root_name, NULL);
22469 + if (!proc_info_root) {
22470 + reiserfs_warning(NULL, "cannot create /proc/%s",
22471 + proc_info_root_name);
22472 + return 1;
22475 + return 0;
22478 +int reiserfs_proc_info_global_done(void)
22480 + if (proc_info_root != NULL) {
22481 + proc_info_root = NULL;
22482 + remove_proc_entry(proc_info_root_name, NULL);
22484 + return 0;
22487 + * Revision 1.1.8.2 2001/07/15 17:08:42 god
22488 + * . use get_super() in procfs.c
22489 + * . remove remove_save_link() from reiserfs_do_truncate()
22491 + * I accept terms and conditions stated in the Legal Agreement
22492 + * (available at http://www.namesys.com/legalese.html)
22494 + * Revision 1.1.8.1 2001/07/11 16:48:50 god
22495 + * proc info support
22497 + * I accept terms and conditions stated in the Legal Agreement
22498 + * (available at http://www.namesys.com/legalese.html)
22500 + */
22501 diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
22502 new file mode 100644
22503 index 000000000000..12fc20af8e17
22504 --- /dev/null
22505 +++ b/fs/reiserfs/reiserfs.h
22506 @@ -0,0 +1,3419 @@
22507 +/* SPDX-License-Identifier: GPL-2.0 */
22509 + * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for
22510 + * licensing and copyright details
22511 + */
22513 +#include <linux/reiserfs_fs.h>
22515 +#include <linux/slab.h>
22516 +#include <linux/interrupt.h>
22517 +#include <linux/sched.h>
22518 +#include <linux/bug.h>
22519 +#include <linux/workqueue.h>
22520 +#include <linux/unaligned.h>
22521 +#include <linux/bitops.h>
22522 +#include <linux/proc_fs.h>
22523 +#include <linux/buffer_head.h>
22525 +/* the 32 bit compat definitions with int argument */
22526 +#define REISERFS_IOC32_UNPACK _IOW(0xCD, 1, int)
22527 +#define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION
22528 +#define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION
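These mirror the native ioctls for 32-bit callers on 64-bit kernels; the
unpack command is number 1 on ioctl code 0xCD, as the _IOW() above shows.
A hedged userspace sketch (assuming the native REISERFS_IOC_UNPACK from the
uapi <linux/reiserfs_fs.h>, and that the kernel side wants a nonzero
argument before it unpacks):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/reiserfs_fs.h>  /* REISERFS_IOC_UNPACK */

    /* hypothetical: ask that a tail-packed file be stored unpacked */
    int unpack_file(const char *path)
    {
            int fd = open(path, O_RDWR);
            int ret;

            if (fd < 0)
                    return -1;
            ret = ioctl(fd, REISERFS_IOC_UNPACK, 1);
            close(fd);
            return ret;
    }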
22530 +struct reiserfs_journal_list;
22532 +/* bitmasks for i_flags field in reiserfs-specific part of inode */
22533 +typedef enum {
22534 + /*
22535 + * this says what format of key do all items (but stat data) of
22536 + * an object have. If this is set, that format is 3.6 otherwise - 3.5
22537 + */
22538 + i_item_key_version_mask = 0x0001,
22540 + /*
22541 + * If this is unset, object has 3.5 stat data, otherwise,
22542 + * it has 3.6 stat data with 64bit size, 32bit nlink etc.
22543 + */
22544 + i_stat_data_version_mask = 0x0002,
22546 + /* file might need tail packing on close */
22547 + i_pack_on_close_mask = 0x0004,
22549 + /* don't pack tail of file */
22550 + i_nopack_mask = 0x0008,
22552 + /*
22553 + * If either of these are set, "safe link" was created for this
22554 + * file during truncate or unlink. Safe link is used to avoid
22555 + * leakage of disk space on crash with some files open, but unlinked.
22556 + */
22557 + i_link_saved_unlink_mask = 0x0010,
22558 + i_link_saved_truncate_mask = 0x0020,
22560 + i_has_xattr_dir = 0x0040,
22561 + i_data_log = 0x0080,
22562 +} reiserfs_inode_flags;
22564 +struct reiserfs_inode_info {
22565 + __u32 i_key[4]; /* key is still 4 32 bit integers */
22567 + /*
22568 + * transient inode flags that are never stored on disk. Bitmasks
22569 + * for this field are defined above.
22570 + */
22571 + __u32 i_flags;
22573 + /* offset of first byte stored in direct item. */
22574 + __u32 i_first_direct_byte;
22576 + /* copy of persistent inode flags read from sd_attrs. */
22577 + __u32 i_attrs;
22579 + /* first unused block of a sequence of unused blocks */
22580 + int i_prealloc_block;
22581 + int i_prealloc_count; /* length of that sequence */
22583 + /* per-transaction list of inodes which have preallocated blocks */
22584 + struct list_head i_prealloc_list;
22586 + /*
22587 + * new_packing_locality is created; new blocks for the contents
22588 + * of this directory should be displaced
22589 + */
22590 + unsigned new_packing_locality:1;
22592 + /*
22593 + * we use these for fsync or O_SYNC to decide which transaction
22594 + * needs to be committed in order for this inode to be properly
22595 + * flushed
22596 + */
22597 + unsigned int i_trans_id;
22599 + struct reiserfs_journal_list *i_jl;
22600 + atomic_t openers;
22601 + struct mutex tailpack;
22602 +#ifdef CONFIG_REISERFS_FS_XATTR
22603 + struct rw_semaphore i_xattr_sem;
22604 +#endif
22605 +#ifdef CONFIG_QUOTA
22606 + struct dquot __rcu *i_dquot[MAXQUOTAS];
22607 +#endif
22609 + struct inode vfs_inode;
22612 +typedef enum {
22613 + reiserfs_attrs_cleared = 0x00000001,
22614 +} reiserfs_super_block_flags;
22617 + * struct reiserfs_super_block accessors/mutators since this is a disk
22618 + * structure, it will always be in little endian format.
22619 + */
22620 +#define sb_block_count(sbp) (le32_to_cpu((sbp)->s_v1.s_block_count))
22621 +#define set_sb_block_count(sbp,v) ((sbp)->s_v1.s_block_count = cpu_to_le32(v))
22622 +#define sb_free_blocks(sbp) (le32_to_cpu((sbp)->s_v1.s_free_blocks))
22623 +#define set_sb_free_blocks(sbp,v) ((sbp)->s_v1.s_free_blocks = cpu_to_le32(v))
22624 +#define sb_root_block(sbp) (le32_to_cpu((sbp)->s_v1.s_root_block))
22625 +#define set_sb_root_block(sbp,v) ((sbp)->s_v1.s_root_block = cpu_to_le32(v))
22627 +#define sb_jp_journal_1st_block(sbp) \
22628 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_1st_block))
22629 +#define set_sb_jp_journal_1st_block(sbp,v) \
22630 + ((sbp)->s_v1.s_journal.jp_journal_1st_block = cpu_to_le32(v))
22631 +#define sb_jp_journal_dev(sbp) \
22632 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_dev))
22633 +#define set_sb_jp_journal_dev(sbp,v) \
22634 + ((sbp)->s_v1.s_journal.jp_journal_dev = cpu_to_le32(v))
22635 +#define sb_jp_journal_size(sbp) \
22636 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_size))
22637 +#define set_sb_jp_journal_size(sbp,v) \
22638 + ((sbp)->s_v1.s_journal.jp_journal_size = cpu_to_le32(v))
22639 +#define sb_jp_journal_trans_max(sbp) \
22640 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_trans_max))
22641 +#define set_sb_jp_journal_trans_max(sbp,v) \
22642 + ((sbp)->s_v1.s_journal.jp_journal_trans_max = cpu_to_le32(v))
22643 +#define sb_jp_journal_magic(sbp) \
22644 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_magic))
22645 +#define set_sb_jp_journal_magic(sbp,v) \
22646 + ((sbp)->s_v1.s_journal.jp_journal_magic = cpu_to_le32(v))
22647 +#define sb_jp_journal_max_batch(sbp) \
22648 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_batch))
22649 +#define set_sb_jp_journal_max_batch(sbp,v) \
22650 + ((sbp)->s_v1.s_journal.jp_journal_max_batch = cpu_to_le32(v))
22651 +#define sb_jp_jourmal_max_commit_age(sbp) \
22652 + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_commit_age))
22653 +#define set_sb_jp_journal_max_commit_age(sbp,v) \
22654 + ((sbp)->s_v1.s_journal.jp_journal_max_commit_age = cpu_to_le32(v))
22656 +#define sb_blocksize(sbp) (le16_to_cpu((sbp)->s_v1.s_blocksize))
22657 +#define set_sb_blocksize(sbp,v) ((sbp)->s_v1.s_blocksize = cpu_to_le16(v))
22658 +#define sb_oid_maxsize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_maxsize))
22659 +#define set_sb_oid_maxsize(sbp,v) ((sbp)->s_v1.s_oid_maxsize = cpu_to_le16(v))
22660 +#define sb_oid_cursize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_cursize))
22661 +#define set_sb_oid_cursize(sbp,v) ((sbp)->s_v1.s_oid_cursize = cpu_to_le16(v))
22662 +#define sb_umount_state(sbp) (le16_to_cpu((sbp)->s_v1.s_umount_state))
22663 +#define set_sb_umount_state(sbp,v) ((sbp)->s_v1.s_umount_state = cpu_to_le16(v))
22664 +#define sb_fs_state(sbp) (le16_to_cpu((sbp)->s_v1.s_fs_state))
22665 +#define set_sb_fs_state(sbp,v) ((sbp)->s_v1.s_fs_state = cpu_to_le16(v))
22666 +#define sb_hash_function_code(sbp) \
22667 + (le32_to_cpu((sbp)->s_v1.s_hash_function_code))
22668 +#define set_sb_hash_function_code(sbp,v) \
22669 + ((sbp)->s_v1.s_hash_function_code = cpu_to_le32(v))
22670 +#define sb_tree_height(sbp) (le16_to_cpu((sbp)->s_v1.s_tree_height))
22671 +#define set_sb_tree_height(sbp,v) ((sbp)->s_v1.s_tree_height = cpu_to_le16(v))
22672 +#define sb_bmap_nr(sbp) (le16_to_cpu((sbp)->s_v1.s_bmap_nr))
22673 +#define set_sb_bmap_nr(sbp,v) ((sbp)->s_v1.s_bmap_nr = cpu_to_le16(v))
22674 +#define sb_version(sbp) (le16_to_cpu((sbp)->s_v1.s_version))
22675 +#define set_sb_version(sbp,v) ((sbp)->s_v1.s_version = cpu_to_le16(v))
22677 +#define sb_mnt_count(sbp) (le16_to_cpu((sbp)->s_mnt_count))
22678 +#define set_sb_mnt_count(sbp, v) ((sbp)->s_mnt_count = cpu_to_le16(v))
22680 +#define sb_reserved_for_journal(sbp) \
22681 + (le16_to_cpu((sbp)->s_v1.s_reserved_for_journal))
22682 +#define set_sb_reserved_for_journal(sbp,v) \
22683 + ((sbp)->s_v1.s_reserved_for_journal = cpu_to_le16(v))
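Every get/set pair converts exactly at the disk boundary, so the buffered
super block stays little-endian no matter the host CPU and an image moves
cleanly between, say, x86 and a big-endian machine. A two-line usage sketch
(hypothetical call site):

    __u32 free = sb_free_blocks(rs);     /* LE on disk -> CPU order */
    set_sb_free_blocks(rs, free - 1);    /* CPU order -> LE on disk */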
22685 +/* LOGGING -- */
22688 + * These all interrelate for performance.
22690 + * If the journal block count is smaller than n transactions, you lose speed.
22691 + * I don't know what n is yet, I'm guessing 8-16.
22693 + * typical transaction size depends on the application, how often fsync is
22694 + * called, and how many metadata blocks you dirty in a 30 second period.
22695 + * The more small files (<16k) you use, the larger your transactions will
22696 + * be.
22698 + * If your journal fills faster than dirty buffers get flushed to disk, it
22699 + * must flush them before allowing the journal to wrap, which slows things
22700 + * down. If you need high speed meta data updates, the journal should be
22701 + * big enough to prevent wrapping before dirty meta blocks get to disk.
22703 + * If the batch max is smaller than the transaction max, you'll waste space
22704 + * at the end of the journal because journal_end sets the next transaction
22705 + * to start at 0 if the next transaction has any chance of wrapping.
22707 + * The larger the batch max age, the better the speed, and the more meta
22708 + * data changes you'll lose after a crash.
22709 + */
22711 +/* don't mess with these for a while */
22712 +/* we have a node size define somewhere in reiserfs_fs.h. -Hans */
22713 +#define JOURNAL_BLOCK_SIZE 4096 /* BUG gotta get rid of this */
22714 +#define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */
22715 +#define JOURNAL_HASH_SIZE 8192
22717 +/* number of copies of the bitmaps to have floating. Must be >= 2 */
22718 +#define JOURNAL_NUM_BITMAPS 5
22721 + * One of these for every block in every transaction
22722 + * Each one is in two hash tables. First, a hash of the current transaction,
22723 + * and after journal_end, a hash of all the in memory transactions.
22724 + * next and prev are used by the current transaction (journal_hash).
22725 + * hnext and hprev are used by journal_list_hash. If a block is in more
22726 + * than one transaction, the journal_list_hash links it in multiple times.
22727 + * This allows flush_journal_list to remove just the cnode belonging to a
22728 + * given transaction.
22729 + */
22730 +struct reiserfs_journal_cnode {
22731 + struct buffer_head *bh; /* real buffer head */
22732 + struct super_block *sb; /* dev of real buffer head */
22734 + /* block number of real buffer head, == 0 when buffer on disk */
22735 + __u32 blocknr;
22737 + unsigned long state;
22739 + /* journal list this cnode lives in */
22740 + struct reiserfs_journal_list *jlist;
22742 + struct reiserfs_journal_cnode *next; /* next in transaction list */
22743 + struct reiserfs_journal_cnode *prev; /* prev in transaction list */
22744 + struct reiserfs_journal_cnode *hprev; /* prev in hash list */
22745 + struct reiserfs_journal_cnode *hnext; /* next in hash list */
22748 +struct reiserfs_bitmap_node {
22749 + int id;
22750 + char *data;
22751 + struct list_head list;
22754 +struct reiserfs_list_bitmap {
22755 + struct reiserfs_journal_list *journal_list;
22756 + struct reiserfs_bitmap_node **bitmaps;
22760 + * one of these for each transaction. The most important part here is the
22761 + * j_realblock. this list of cnodes is used to hash all the blocks in all
22762 + * the commits, to mark all the real buffer heads dirty once all the commits
22763 + * hit the disk, and to make sure every real block in a transaction is on
22764 + * disk before allowing the log area to be overwritten
22765 + */
22766 +struct reiserfs_journal_list {
22767 + unsigned long j_start;
22768 + unsigned long j_state;
22769 + unsigned long j_len;
22770 + atomic_t j_nonzerolen;
22771 + atomic_t j_commit_left;
22773 + /* all commits older than this on disk */
22774 + atomic_t j_older_commits_done;
22776 + struct mutex j_commit_mutex;
22777 + unsigned int j_trans_id;
22778 + time64_t j_timestamp; /* write-only but useful for crash dump analysis */
22779 + struct reiserfs_list_bitmap *j_list_bitmap;
22780 + struct buffer_head *j_commit_bh; /* commit buffer head */
22781 + struct reiserfs_journal_cnode *j_realblock;
22782 + struct reiserfs_journal_cnode *j_freedlist; /* list of buffers that were freed during this trans. free each of these on flush */
22783 + /* time ordered list of all active transactions */
22784 + struct list_head j_list;
22786 + /*
22787 + * time ordered list of all transactions we haven't tried
22788 + * to flush yet
22789 + */
22790 + struct list_head j_working_list;
22792 + /* list of tail conversion targets in need of flush before commit */
22793 + struct list_head j_tail_bh_list;
22795 + /* list of data=ordered buffers in need of flush before commit */
22796 + struct list_head j_bh_list;
22797 + int j_refcount;
22800 +struct reiserfs_journal {
22801 + struct buffer_head **j_ap_blocks; /* journal blocks on disk */
22802 + /* newest journal block */
22803 + struct reiserfs_journal_cnode *j_last;
22805 + /* oldest journal block. start here for traverse */
22806 + struct reiserfs_journal_cnode *j_first;
22808 + struct file *j_bdev_file;
22810 + /* first block on s_dev of reserved area journal */
22811 + int j_1st_reserved_block;
22813 + unsigned long j_state;
22814 + unsigned int j_trans_id;
22815 + unsigned long j_mount_id;
22817 + /* start of current waiting commit (index into j_ap_blocks) */
22818 + unsigned long j_start;
22819 + unsigned long j_len; /* length of current waiting commit */
22821 + /* number of buffers requested by journal_begin() */
22822 + unsigned long j_len_alloc;
22824 + atomic_t j_wcount; /* count of writers for current commit */
22826 + /* batch count. allows turning X transactions into 1 */
22827 + unsigned long j_bcount;
22829 + /* first unflushed transactions offset */
22830 + unsigned long j_first_unflushed_offset;
22832 + /* last fully flushed journal timestamp */
22833 + unsigned j_last_flush_trans_id;
22835 + struct buffer_head *j_header_bh;
22837 + time64_t j_trans_start_time; /* time this transaction started */
22838 + struct mutex j_mutex;
22839 + struct mutex j_flush_mutex;
22841 + /* wait for current transaction to finish before starting new one */
22842 + wait_queue_head_t j_join_wait;
22844 + atomic_t j_jlock; /* lock for j_join_wait */
22845 + int j_list_bitmap_index; /* number of next list bitmap to use */
22847 + /* no more journal begins allowed. MUST sleep on j_join_wait */
22848 + int j_must_wait;
22850 + /* next journal_end will flush all journal list */
22851 + int j_next_full_flush;
22853 + /* next journal_end will flush all async commits */
22854 + int j_next_async_flush;
22856 + int j_cnode_used; /* number of cnodes on the used list */
22857 + int j_cnode_free; /* number of cnodes on the free list */
22859 + /* max number of blocks in a transaction. */
22860 + unsigned int j_trans_max;
22862 + /* max number of blocks to batch into a trans */
22863 + unsigned int j_max_batch;
22865 + /* in seconds, how old can an async commit be */
22866 + unsigned int j_max_commit_age;
22868 + /* in seconds, how old can a transaction be */
22869 + unsigned int j_max_trans_age;
22871 + /* the default for the max commit age */
22872 + unsigned int j_default_max_commit_age;
22874 + struct reiserfs_journal_cnode *j_cnode_free_list;
22876 + /* orig pointer returned from vmalloc */
22877 + struct reiserfs_journal_cnode *j_cnode_free_orig;
22879 + struct reiserfs_journal_list *j_current_jl;
22880 + int j_free_bitmap_nodes;
22881 + int j_used_bitmap_nodes;
22883 + int j_num_lists; /* total number of active transactions */
22884 + int j_num_work_lists; /* number that need attention from kreiserfsd */
22886 + /* debugging to make sure things are flushed in order */
22887 + unsigned int j_last_flush_id;
22889 + /* debugging to make sure things are committed in order */
22890 + unsigned int j_last_commit_id;
22892 + struct list_head j_bitmap_nodes;
22893 + struct list_head j_dirty_buffers;
22894 + spinlock_t j_dirty_buffers_lock; /* protects j_dirty_buffers */
22896 + /* list of all active transactions */
22897 + struct list_head j_journal_list;
22899 + /* lists that haven't been touched by writeback attempts */
22900 + struct list_head j_working_list;
22902 + /* hash table for real buffer heads in current trans */
22903 + struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE];
22905 + /* hash table for all the real buffer heads in all the transactions */
22906 + struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE];
22908 + /* array of bitmaps to record the deleted blocks */
22909 + struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS];
22911 + /* list of inodes which have preallocated blocks */
22912 + struct list_head j_prealloc_list;
22913 + int j_persistent_trans;
22914 + unsigned long j_max_trans_size;
22915 + unsigned long j_max_batch_size;
22917 + int j_errno;
22919 + /* when flushing ordered buffers, throttle new ordered writers */
22920 + struct delayed_work j_work;
22921 + struct super_block *j_work_sb;
22922 + atomic_t j_async_throttle;
22925 +enum journal_state_bits {
22926 + J_WRITERS_BLOCKED = 1, /* set when new writers not allowed */
22927 + J_WRITERS_QUEUED, /* set when log is full due to too many writers */
22928 + J_ABORTED, /* set when log is aborted */
22931 +/* ick. magic string to find desc blocks in the journal */
22932 +#define JOURNAL_DESC_MAGIC "ReIsErLB"
22934 +typedef __u32(*hashf_t) (const signed char *, int);
22936 +struct reiserfs_bitmap_info {
22937 + __u32 free_count;
22940 +struct proc_dir_entry;
22942 +#if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO )
22943 +typedef unsigned long int stat_cnt_t;
22944 +typedef struct reiserfs_proc_info_data {
22945 + spinlock_t lock;
22946 + int exiting;
22947 + int max_hash_collisions;
22949 + stat_cnt_t breads;
22950 + stat_cnt_t bread_miss;
22951 + stat_cnt_t search_by_key;
22952 + stat_cnt_t search_by_key_fs_changed;
22953 + stat_cnt_t search_by_key_restarted;
22955 + stat_cnt_t insert_item_restarted;
22956 + stat_cnt_t paste_into_item_restarted;
22957 + stat_cnt_t cut_from_item_restarted;
22958 + stat_cnt_t delete_solid_item_restarted;
22959 + stat_cnt_t delete_item_restarted;
22961 + stat_cnt_t leaked_oid;
22962 + stat_cnt_t leaves_removable;
22964 + /*
22965 + * balances per level.
22966 + * Use explicit 5 as MAX_HEIGHT is not visible yet.
22967 + */
22968 + stat_cnt_t balance_at[5]; /* XXX */
22969 + /* sbk == search_by_key */
22970 + stat_cnt_t sbk_read_at[5]; /* XXX */
22971 + stat_cnt_t sbk_fs_changed[5];
22972 + stat_cnt_t sbk_restarted[5];
22973 + stat_cnt_t items_at[5]; /* XXX */
22974 + stat_cnt_t free_at[5]; /* XXX */
22975 + stat_cnt_t can_node_be_removed[5]; /* XXX */
22976 + long int lnum[5]; /* XXX */
22977 + long int rnum[5]; /* XXX */
22978 + long int lbytes[5]; /* XXX */
22979 + long int rbytes[5]; /* XXX */
22980 + stat_cnt_t get_neighbors[5];
22981 + stat_cnt_t get_neighbors_restart[5];
22982 + stat_cnt_t need_l_neighbor[5];
22983 + stat_cnt_t need_r_neighbor[5];
22985 + stat_cnt_t free_block;
22986 + struct __scan_bitmap_stats {
22987 + stat_cnt_t call;
22988 + stat_cnt_t wait;
22989 + stat_cnt_t bmap;
22990 + stat_cnt_t retry;
22991 + stat_cnt_t in_journal_hint;
22992 + stat_cnt_t in_journal_nohint;
22993 + stat_cnt_t stolen;
22994 + } scan_bitmap;
22995 + struct __journal_stats {
22996 + stat_cnt_t in_journal;
22997 + stat_cnt_t in_journal_bitmap;
22998 + stat_cnt_t in_journal_reusable;
22999 + stat_cnt_t lock_journal;
23000 + stat_cnt_t lock_journal_wait;
23001 + stat_cnt_t journal_being;
23002 + stat_cnt_t journal_relock_writers;
23003 + stat_cnt_t journal_relock_wcount;
23004 + stat_cnt_t mark_dirty;
23005 + stat_cnt_t mark_dirty_already;
23006 + stat_cnt_t mark_dirty_notjournal;
23007 + stat_cnt_t restore_prepared;
23008 + stat_cnt_t prepare;
23009 + stat_cnt_t prepare_retry;
23010 + } journal;
23011 +} reiserfs_proc_info_data_t;
23012 +#else
23013 +typedef struct reiserfs_proc_info_data {
23014 +} reiserfs_proc_info_data_t;
23015 +#endif
23017 +/* Number of quota types we support */
23018 +#define REISERFS_MAXQUOTAS 2
23020 +/* reiserfs union of in-core super block data */
23021 +struct reiserfs_sb_info {
23022 + /* Buffer containing the super block */
23023 + struct buffer_head *s_sbh;
23025 + /* Pointer to the on-disk super block in the buffer */
23026 + struct reiserfs_super_block *s_rs;
23027 + struct reiserfs_bitmap_info *s_ap_bitmap;
23029 + /* pointer to journal information */
23030 + struct reiserfs_journal *s_journal;
23032 + unsigned short s_mount_state; /* reiserfs state (valid, invalid) */
23034 + /* Serialize writers access, replace the old bkl */
23035 + struct mutex lock;
23037 + /* Owner of the lock (can be recursive) */
23038 + struct task_struct *lock_owner;
23040 + /* Depth of the lock, start from -1 like the bkl */
23041 + int lock_depth;
23043 + struct workqueue_struct *commit_wq;
23045 + /* Comment? -Hans */
23046 + void (*end_io_handler) (struct buffer_head *, int);
23048 + /*
23049 + * pointer to function which is used to sort names in directory.
23050 + * Set on mount
23051 + */
23052 + hashf_t s_hash_function;
23054 + /* reiserfs's mount options are set here */
23055 + unsigned long s_mount_opt;
23057 + /* This is a structure that describes block allocator options */
23058 + struct {
23059 + /* Bitfield for enable/disable kind of options */
23060 + unsigned long bits;
23062 + /*
23063 + * size started from which we consider file
23064 + * to be a large one (in blocks)
23065 + */
23066 + unsigned long large_file_size;
23068 + int border; /* percentage of disk, border takes */
23070 + /*
23071 + * Minimal file size (in blocks) starting
23072 + * from which we do preallocations
23073 + */
23074 + int preallocmin;
23076 + /*
23077 + * Number of blocks we try to prealloc when file
23078 + * reaches preallocmin size (in blocks) or prealloc_list
23079 + * is empty.
23080 + */
23081 + int preallocsize;
23082 + } s_alloc_options;
23084 + /* Comment? -Hans */
23085 + wait_queue_head_t s_wait;
23086 + /* increased by one every time the tree gets re-balanced */
23087 + atomic_t s_generation_counter;
23089 + /* File system properties. Currently holds on-disk FS format */
23090 + unsigned long s_properties;
23092 + /* session statistics */
23093 + int s_disk_reads;
23094 + int s_disk_writes;
23095 + int s_fix_nodes;
23096 + int s_do_balance;
23097 + int s_unneeded_left_neighbor;
23098 + int s_good_search_by_key_reada;
23099 + int s_bmaps;
23100 + int s_bmaps_without_search;
23101 + int s_direct2indirect;
23102 + int s_indirect2direct;
23104 + /*
23105 + * set up when it's ok for reiserfs_read_inode2() to read from
23106 + * disk inode with nlink==0. Currently this is only used during
23107 + * finish_unfinished() processing at mount time
23108 + */
23109 + int s_is_unlinked_ok;
23111 + reiserfs_proc_info_data_t s_proc_info_data;
23112 + struct proc_dir_entry *procdir;
23114 + /* amount of blocks reserved for further allocations */
23115 + int reserved_blocks;
23118 + /* this lock on now only used to protect reserved_blocks variable */
23119 + spinlock_t bitmap_lock;
23120 + struct dentry *priv_root; /* root of /.reiserfs_priv */
23121 + struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */
23122 + int j_errno;
23124 + int work_queued; /* non-zero delayed work is queued */
23125 + struct delayed_work old_work; /* old transactions flush delayed work */
23126 + spinlock_t old_work_lock; /* protects old_work and work_queued */
23128 +#ifdef CONFIG_QUOTA
23129 + char *s_qf_names[REISERFS_MAXQUOTAS];
23130 + int s_jquota_fmt;
23131 +#endif
23132 + char *s_jdev; /* Stored jdev for mount option showing */
23133 +#ifdef CONFIG_REISERFS_CHECK
23135 + /*
23136 + * Detects whether more than one copy of tb exists per superblock
23137 + * as a means of checking whether do_balance is executing
23138 + * concurrently against another tree reader/writer on a same
23139 + * mount point.
23140 + */
23141 + struct tree_balance *cur_tb;
23142 +#endif
23145 +/* Definitions of reiserfs on-disk properties: */
23146 +#define REISERFS_3_5 0
23147 +#define REISERFS_3_6 1
23148 +#define REISERFS_OLD_FORMAT 2
23150 +/* Mount options */
23151 +enum reiserfs_mount_options {
23152 + /* large tails will be created in a session */
23153 + REISERFS_LARGETAIL,
23154 + /*
23155 + * small (for files less than block size) tails will
23156 + * be created in a session
23157 + */
23158 + REISERFS_SMALLTAIL,
23160 + /* replay journal and return 0. Use by fsck */
23161 + REPLAYONLY,
23163 + /*
23164 + * -o conv: causes conversion of old format super block to the
23165 + * new format. If not specified - old partition will be dealt
23166 + * with in a manner of 3.5.x
23167 + */
23168 + REISERFS_CONVERT,
23170 + /*
23171 + * -o hash={tea, rupasov, r5, detect} is meant for properly mounting
23172 + * reiserfs disks from 3.5.19 or earlier. 99% of the time, this
23173 + * option is not required. If the normal autodetection code can't
23174 + * determine which hash to use (because both hashes had the same
23175 + * value for a file), use this option to force a specific hash.
23176 + * It won't allow you to override the existing hash on the FS, so
23177 + * if you have a tea hash disk, and mount with -o hash=rupasov,
23178 + * the mount will fail.
23179 + */
23180 + FORCE_TEA_HASH, /* try to force tea hash on mount */
23181 + FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */
23182 + FORCE_R5_HASH, /* try to force r5 hash on mount */
23183 + FORCE_HASH_DETECT, /* try to detect hash function on mount */
23185 + REISERFS_DATA_LOG,
23186 + REISERFS_DATA_ORDERED,
23187 + REISERFS_DATA_WRITEBACK,
23189 + /*
23190 + * used for testing experimental features; makes benchmarking new
23191 + * features with and without them more convenient; should never be
23192 + * used in any code shipped to users (ideally)
23193 + */
23195 + REISERFS_NO_BORDER,
23196 + REISERFS_NO_UNHASHED_RELOCATION,
23197 + REISERFS_HASHED_RELOCATION,
23198 + REISERFS_ATTRS,
23199 + REISERFS_XATTRS_USER,
23200 + REISERFS_POSIXACL,
23201 + REISERFS_EXPOSE_PRIVROOT,
23202 + REISERFS_BARRIER_NONE,
23203 + REISERFS_BARRIER_FLUSH,
23205 + /* Actions on error */
23206 + REISERFS_ERROR_PANIC,
23207 + REISERFS_ERROR_RO,
23208 + REISERFS_ERROR_CONTINUE,
23210 + REISERFS_USRQUOTA, /* User quota option specified */
23211 + REISERFS_GRPQUOTA, /* Group quota option specified */
23213 + REISERFS_TEST1,
23214 + REISERFS_TEST2,
23215 + REISERFS_TEST3,
23216 + REISERFS_TEST4,
23217 + REISERFS_UNSUPPORTED_OPT,
23220 +#define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH))
23221 +#define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH))
23222 +#define reiserfs_tea_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_TEA_HASH))
23223 +#define reiserfs_hash_detect(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_HASH_DETECT))
23224 +#define reiserfs_no_border(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_BORDER))
23225 +#define reiserfs_no_unhashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION))
23226 +#define reiserfs_hashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_HASHED_RELOCATION))
23227 +#define reiserfs_test4(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_TEST4))
23229 +#define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL))
23230 +#define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL))
23231 +#define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY))
23232 +#define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS))
23233 +#define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5))
23234 +#define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT))
23235 +#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG))
23236 +#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED))
23237 +#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
23238 +#define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER))
23239 +#define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL))
23240 +#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT))
23241 +#define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s))
23242 +#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE))
23243 +#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH))
23245 +#define reiserfs_error_panic(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_PANIC))
23246 +#define reiserfs_error_ro(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_RO))
23248 +void reiserfs_file_buffer(struct buffer_head *bh, int list);
23249 +extern struct file_system_type reiserfs_fs_type;
23250 +int reiserfs_resize(struct super_block *, unsigned long);
23252 +#define CARRY_ON 0
23253 +#define SCHEDULE_OCCURRED 1
23255 +#define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh)
23256 +#define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal)
23257 +#define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
23258 +#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free)
23259 +#define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap)
23261 +#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh)
23263 +#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal)))
23264 +static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal
23265 + *journal)
23267 + return test_bit(J_ABORTED, &journal->j_state);
23271 + * Locking primitives. The write lock is a per superblock
23272 + * special mutex that has properties close to the Big Kernel Lock
23273 + * which was used in the previous locking scheme.
23274 + */
23275 +void reiserfs_write_lock(struct super_block *s);
23276 +void reiserfs_write_unlock(struct super_block *s);
23277 +int __must_check reiserfs_write_unlock_nested(struct super_block *s);
23278 +void reiserfs_write_lock_nested(struct super_block *s, int depth);
23280 +#ifdef CONFIG_REISERFS_CHECK
23281 +void reiserfs_lock_check_recursive(struct super_block *s);
23282 +#else
23283 +static inline void reiserfs_lock_check_recursive(struct super_block *s) { }
23284 +#endif
23287 + * Several mutexes depend on the write lock.
23288 + * However, sometimes we want to relax the write lock while we hold
23289 + * these mutexes, mirroring the release/reacquire-on-schedule()
23290 + * behaviour of the Bkl that was used before.
23291 + * Reiserfs performance and locking were based on this scheme.
23292 + * Now that the write lock is a mutex and not the bkl anymore, doing so
23293 + * may result in a deadlock:
23295 + * A acquire write_lock
23296 + * A acquire j_commit_mutex
23297 + * A release write_lock and wait for something
23298 + * B acquire write_lock
23299 + * B can't acquire j_commit_mutex and sleep
23300 + * A can't acquire write lock anymore
23301 + * deadlock
23303 + * What we do here is avoid such deadlocks by playing the same game
23304 + * as the Bkl: if we can't acquire a mutex that depends on the write lock,
23305 + * we release the write lock, wait a bit and then retry.
23307 + * The mutexes concerned by this hack are:
23308 + * - The commit mutex of a journal list
23309 + * - The flush mutex
23310 + * - The journal lock
23311 + * - The inode mutex
23312 + */
23313 +static inline void reiserfs_mutex_lock_safe(struct mutex *m,
23314 + struct super_block *s)
23315 +{
23316 + int depth;
23318 + depth = reiserfs_write_unlock_nested(s);
23319 + mutex_lock(m);
23320 + reiserfs_write_lock_nested(s, depth);
23321 +}
23323 +static inline void
23324 +reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass,
23325 + struct super_block *s)
23326 +{
23327 + int depth;
23329 + depth = reiserfs_write_unlock_nested(s);
23330 + mutex_lock_nested(m, subclass);
23331 + reiserfs_write_lock_nested(s, depth);
23332 +}
23334 +static inline void
23335 +reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s)
23336 +{
23337 + int depth;
23338 + depth = reiserfs_write_unlock_nested(s);
23339 + down_read(sem);
23340 + reiserfs_write_lock_nested(s, depth);
23341 +}
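+/*
+ * A minimal usage sketch (assuming 'jl' is a struct reiserfs_journal_list;
+ * its j_commit_mutex is one of the mutexes listed above as depending on
+ * the write lock):
+ *
+ *	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
+ *	... do the commit work ...
+ *	mutex_unlock(&jl->j_commit_mutex);
+ *
+ * The helper releases the write lock before sleeping on the mutex, so a
+ * write lock holder can never be blocked behind this caller.
+ */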
23343 +/*
23344 + * When we schedule, we usually want to also release the write lock,
23345 + * matching the previous BKL-based locking scheme of reiserfs.
23346 + */
23347 +static inline void reiserfs_cond_resched(struct super_block *s)
23348 +{
23349 + if (need_resched()) {
23350 + int depth;
23352 + depth = reiserfs_write_unlock_nested(s);
23353 + schedule();
23354 + reiserfs_write_lock_nested(s, depth);
23355 + }
23356 +}
23358 +struct fid;
23360 +/*
23361 + * in reading the #defines, it may help to understand that they employ
23362 + * the following abbreviations:
23364 + * B = Buffer
23365 + * I = Item header
23366 + * H = Height within the tree (should be changed to LEV)
23367 + * N = Number of the item in the node
23368 + * STAT = stat data
23369 + * DEH = Directory Entry Header
23370 + * EC = Entry Count
23371 + * E = Entry number
23372 + * UL = Unsigned Long
23373 + * BLKH = BLocK Header
23374 + * UNFM = UNForMatted node
23375 + * DC = Disk Child
23376 + * P = Path
23378 + * These #defines are named by concatenating these abbreviations,
23379 + * with the arguments coming first and the return value coming last
23380 + * in the macro name.
23381 + */
23383 +#define USE_INODE_GENERATION_COUNTER
23385 +#define REISERFS_PREALLOCATE
23386 +#define DISPLACE_NEW_PACKING_LOCALITIES
23387 +#define PREALLOCATION_SIZE 9
23389 +/* n must be a power of 2 */
23390 +#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u))
23392 +/*
23393 + * to be OK for alpha and others we have to align structures to an
23394 + * 8 byte boundary.
23395 + * FIXME: do not change this to anything else: there is code which relies on it
23396 + */
23397 +#define ROUND_UP(x) _ROUND_UP(x,8LL)
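+/*
+ * Worked example: _ROUND_UP(x,n) adds n-1 and clears the low bits, so
+ * ROUND_UP(1) == 8, ROUND_UP(8) == 8 and ROUND_UP(13) == 16.
+ */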
23399 +/*
23400 + * debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug
23401 + * messages.
23402 + */
23403 +#define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */
23405 +void __reiserfs_warning(struct super_block *s, const char *id,
23406 + const char *func, const char *fmt, ...);
23407 +#define reiserfs_warning(s, id, fmt, args...) \
23408 + __reiserfs_warning(s, id, __func__, fmt, ##args)
23409 +/* assertions handling */
23411 +/* always check a condition and panic if it's false. */
23412 +#define __RASSERT(cond, scond, format, args...) \
23413 +do { \
23414 + if (!(cond)) \
23415 + reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \
23416 + __FILE__ ":%i:%s: " format "\n", \
23417 + __LINE__, __func__ , ##args); \
23418 +} while (0)
23420 +#define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args)
23422 +#if defined( CONFIG_REISERFS_CHECK )
23423 +#define RFALSE(cond, format, args...) __RASSERT(!(cond), "!(" #cond ")", format, ##args)
23424 +#else
23425 +#define RFALSE( cond, format, args... ) do {;} while( 0 )
23426 +#endif
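+/*
+ * Usage sketch (hypothetical check): RASSERT panics whenever its
+ * condition is false; RFALSE panics when its condition is *true* and
+ * compiles away entirely without CONFIG_REISERFS_CHECK, e.g.
+ *
+ *	RFALSE(ih_item_len(ih) > MAX_ITEM_LEN(bh->b_size),
+ *	       "item is too long (%d)", ih_item_len(ih));
+ */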
23428 +#define CONSTF __attribute_const__
23429 +/*
23430 + * Disk Data Structures
23431 + */
23433 +/***************************************************************************
23434 + * SUPER BLOCK *
23435 + ***************************************************************************/
23437 +/*
23438 + * Structure of super block on disk, a version of which in RAM is often
23439 + * accessed as REISERFS_SB(s)->s_rs. The version in RAM is part of a larger
23440 + * structure containing fields never written to disk.
23441 + */
23442 +#define UNSET_HASH 0 /* Detect hash on disk */
23443 +#define TEA_HASH 1
23444 +#define YURA_HASH 2
23445 +#define R5_HASH 3
23446 +#define DEFAULT_HASH R5_HASH
23448 +struct journal_params {
23449 + /* where does the journal start on its device */
23450 + __le32 jp_journal_1st_block;
23452 + /* journal device st_rdev */
23453 + __le32 jp_journal_dev;
23455 + /* size of the journal */
23456 + __le32 jp_journal_size;
23458 + /* max number of blocks in a transaction. */
23459 + __le32 jp_journal_trans_max;
23461 + /*
23462 + * random value made on fs creation
23463 + * (this was sb_journal_block_count)
23464 + */
23465 + __le32 jp_journal_magic;
23467 + /* max number of blocks to batch into a trans */
23468 + __le32 jp_journal_max_batch;
23470 + /* in seconds, how old can an async commit be */
23471 + __le32 jp_journal_max_commit_age;
23473 + /* in seconds, how old can a transaction be */
23474 + __le32 jp_journal_max_trans_age;
23475 +};
23477 +/* this is the super from 3.5.X, where X >= 10 */
23478 +struct reiserfs_super_block_v1 {
23479 + __le32 s_block_count; /* blocks count */
23480 + __le32 s_free_blocks; /* free blocks count */
23481 + __le32 s_root_block; /* root block number */
23482 + struct journal_params s_journal;
23483 + __le16 s_blocksize; /* block size */
23485 + /* max size of object id array, see get_objectid() commentary */
23486 + __le16 s_oid_maxsize;
23487 + __le16 s_oid_cursize; /* current size of object id array */
23489 + /* set to 1 when the filesystem was cleanly unmounted, to 2 when it was not */
23490 + __le16 s_umount_state;
23492 + /*
23493 + * reiserfs magic string indicates that file system is reiserfs:
23494 + * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs"
23495 + */
23496 + char s_magic[10];
23498 + /*
23499 + * it is used by fsck to mark which
23500 + * phase of rebuilding is done
23501 + */
23502 + __le16 s_fs_state;
23503 + /*
23504 + * indicates what hash function is being used
23505 + * to sort names in a directory
23506 + */
23507 + __le32 s_hash_function_code;
23508 + __le16 s_tree_height; /* height of disk tree */
23510 + /*
23511 + * number of bitmap blocks needed to address
23512 + * each block of the file system
23513 + */
23514 + __le16 s_bmap_nr;
23516 + /*
23517 + * this field is only reliable on filesystems with a non-standard journal
23518 + */
23519 + __le16 s_version;
23521 + /*
23522 + * size in blocks of the journal area on the main device, which we need
23523 + * to keep after making a fs with a non-standard journal
23524 + */
23525 + __le16 s_reserved_for_journal;
23526 +} __attribute__ ((__packed__));
23528 +#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1))
23530 +/* this is the on disk super block */
23531 +struct reiserfs_super_block {
23532 + struct reiserfs_super_block_v1 s_v1;
23533 + __le32 s_inode_generation;
23535 + /* Right now used only by inode-attributes, if enabled */
23536 + __le32 s_flags;
23538 + unsigned char s_uuid[16]; /* filesystem unique identifier */
23539 + unsigned char s_label[16]; /* filesystem volume label */
23540 + __le16 s_mnt_count; /* Count of mounts since last fsck */
23541 + __le16 s_max_mnt_count; /* Maximum mounts before check */
23542 + __le32 s_lastcheck; /* Timestamp of last fsck */
23543 + __le32 s_check_interval; /* Interval between checks */
23545 + /*
23546 + * zero filled by mkreiserfs and reiserfs_convert_objectid_map_v1()
23547 + * so any additions must be updated there as well. */
23548 + char s_unused[76];
23549 +} __attribute__ ((__packed__));
23551 +#define SB_SIZE (sizeof(struct reiserfs_super_block))
23553 +#define REISERFS_VERSION_1 0
23554 +#define REISERFS_VERSION_2 2
23556 +/* on-disk super block fields converted to cpu form */
23557 +#define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs)
23558 +#define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1))
23559 +#define SB_BLOCKSIZE(s) \
23560 + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_blocksize))
23561 +#define SB_BLOCK_COUNT(s) \
23562 + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_block_count))
23563 +#define SB_FREE_BLOCKS(s) \
23564 + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks))
23565 +#define SB_REISERFS_MAGIC(s) \
23566 + (SB_V1_DISK_SUPER_BLOCK(s)->s_magic)
23567 +#define SB_ROOT_BLOCK(s) \
23568 + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_root_block))
23569 +#define SB_TREE_HEIGHT(s) \
23570 + le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height))
23571 +#define SB_REISERFS_STATE(s) \
23572 + le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state))
23573 +#define SB_VERSION(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_version))
23574 +#define SB_BMAP_NR(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr))
23576 +#define PUT_SB_BLOCK_COUNT(s, val) \
23577 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_block_count = cpu_to_le32(val); } while (0)
23578 +#define PUT_SB_FREE_BLOCKS(s, val) \
23579 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks = cpu_to_le32(val); } while (0)
23580 +#define PUT_SB_ROOT_BLOCK(s, val) \
23581 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_root_block = cpu_to_le32(val); } while (0)
23582 +#define PUT_SB_TREE_HEIGHT(s, val) \
23583 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height = cpu_to_le16(val); } while (0)
23584 +#define PUT_SB_REISERFS_STATE(s, val) \
23585 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state = cpu_to_le16(val); } while (0)
23586 +#define PUT_SB_VERSION(s, val) \
23587 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_version = cpu_to_le16(val); } while (0)
23588 +#define PUT_SB_BMAP_NR(s, val) \
23589 + do { SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr = cpu_to_le16 (val); } while (0)
23591 +#define SB_ONDISK_JP(s) (&SB_V1_DISK_SUPER_BLOCK(s)->s_journal)
23592 +#define SB_ONDISK_JOURNAL_SIZE(s) \
23593 + le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_size))
23594 +#define SB_ONDISK_JOURNAL_1st_BLOCK(s) \
23595 + le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_1st_block))
23596 +#define SB_ONDISK_JOURNAL_DEVICE(s) \
23597 + le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_dev))
23598 +#define SB_ONDISK_RESERVED_FOR_JOURNAL(s) \
23599 + le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_reserved_for_journal))
23601 +#define is_block_in_log_or_reserved_area(s, block) \
23602 + ((block) >= SB_JOURNAL_1st_RESERVED_BLOCK(s) \
23603 + && (block) < SB_JOURNAL_1st_RESERVED_BLOCK(s) + \
23604 + ((!is_reiserfs_jr(SB_DISK_SUPER_BLOCK(s)) ? \
23605 + SB_ONDISK_JOURNAL_SIZE(s) + 1 : SB_ONDISK_RESERVED_FOR_JOURNAL(s))))
23607 +int is_reiserfs_3_5(struct reiserfs_super_block *rs);
23608 +int is_reiserfs_3_6(struct reiserfs_super_block *rs);
23609 +int is_reiserfs_jr(struct reiserfs_super_block *rs);
23611 +/*
23612 + * ReiserFS leaves the first 64k unused, so that partition labels have
23613 + * enough space. If someone wants to write a fancy bootloader that
23614 + * needs more than 64k, let us know, and this will be increased in size.
23615 + * This number must be larger than the largest block size on any
23616 + * platform, or code will break. -Hans
23617 + */
23618 +#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024)
23619 +#define REISERFS_FIRST_BLOCK unused_define
23620 +#define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES
23622 +/* the spot for the super in versions 3.5 - 3.5.10 (inclusive) */
23623 +#define REISERFS_OLD_DISK_OFFSET_IN_BYTES (8 * 1024)
23625 +/* reiserfs internal error code (used by search_by_key and fix_nodes) */
23626 +#define CARRY_ON 0
23627 +#define REPEAT_SEARCH -1
23628 +#define IO_ERROR -2
23629 +#define NO_DISK_SPACE -3
23630 +#define NO_BALANCING_NEEDED (-4)
23631 +#define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5)
23632 +#define QUOTA_EXCEEDED -6
23634 +typedef __u32 b_blocknr_t;
23635 +typedef __le32 unp_t;
23637 +struct unfm_nodeinfo {
23638 + unp_t unfm_nodenum;
23639 + unsigned short unfm_freespace;
23640 +};
23642 +/* there are two formats of keys: 3.5 and 3.6 */
23643 +#define KEY_FORMAT_3_5 0
23644 +#define KEY_FORMAT_3_6 1
23646 +/* there are two stat data formats */
23647 +#define STAT_DATA_V1 0
23648 +#define STAT_DATA_V2 1
23650 +static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode)
23651 +{
23652 + return container_of(inode, struct reiserfs_inode_info, vfs_inode);
23653 +}
23655 +static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb)
23656 +{
23657 + return sb->s_fs_info;
23658 +}
23660 +/*
23661 + * Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16
23662 + * which overflows on large file systems.
23663 + */
23664 +static inline __u32 reiserfs_bmap_count(struct super_block *sb)
23665 +{
23666 + return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1;
23667 +}
23669 +static inline int bmap_would_wrap(unsigned bmap_nr)
23670 +{
23671 + return bmap_nr > ((1LL << 16) - 1);
23672 +}
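+/*
+ * Worked example (assuming 4 KiB blocks): one bitmap block maps
+ * 4096 * 8 = 32768 blocks, so a filesystem of 2^31 blocks (8 TiB)
+ * needs reiserfs_bmap_count() == 65536 bitmap blocks -- exactly the
+ * point at which the on-disk u16 s_bmap_nr would have wrapped to 0.
+ */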
23674 +extern const struct xattr_handler * const reiserfs_xattr_handlers[];
23676 +/*
23677 + * this gives the key version of all the items (except the stat data)
23678 + * that the object consists of
23679 + */
23680 +#define get_inode_item_key_version( inode ) \
23681 + ((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? KEY_FORMAT_3_6 : KEY_FORMAT_3_5)
23683 +#define set_inode_item_key_version( inode, version ) \
23684 + ({ if((version)==KEY_FORMAT_3_6) \
23685 + REISERFS_I(inode)->i_flags |= i_item_key_version_mask; \
23686 + else \
23687 + REISERFS_I(inode)->i_flags &= ~i_item_key_version_mask; })
23689 +#define get_inode_sd_version(inode) \
23690 + ((REISERFS_I(inode)->i_flags & i_stat_data_version_mask) ? STAT_DATA_V2 : STAT_DATA_V1)
23692 +#define set_inode_sd_version(inode, version) \
23693 + ({ if((version)==STAT_DATA_V2) \
23694 + REISERFS_I(inode)->i_flags |= i_stat_data_version_mask; \
23695 + else \
23696 + REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; })
23698 +/*
23699 + * This is an aggressive tail suppression policy, I am hoping it
23700 + * improves our benchmarks. The principle behind it is that percentage
23701 + * space saving is what matters, not absolute space saving. This is
23702 + * non-intuitive, but it helps to understand it if you consider that the
23703 + * cost to access 4 blocks is not much more than the cost to access 1
23704 + * block, if you have to do a seek and rotate. A tail risks a
23705 + * non-linear disk access that is significant as a percentage of total
23706 + * time cost for a 4 block file and saves an amount of space that is
23707 + * less significant as a percentage of space, or so goes the hypothesis.
23708 + * -Hans
23709 + */
23710 +#define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \
23711 +( \
23712 + (!(n_tail_size)) || \
23713 + (((n_tail_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) || \
23714 + ( (n_file_size) >= (n_block_size) * 4 ) || \
23715 + ( ( (n_file_size) >= (n_block_size) * 3 ) && \
23716 + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/4) ) || \
23717 + ( ( (n_file_size) >= (n_block_size) * 2 ) && \
23718 + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/2) ) || \
23719 + ( ( (n_file_size) >= (n_block_size) ) && \
23720 + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \
23721 +)
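+/*
+ * Illustration (assuming 4 KiB blocks): a file of 4 blocks or more
+ * never keeps its tail in a direct item, a file under one block keeps
+ * it unless the tail alone exceeds MAX_DIRECT_ITEM_LEN(4096), and in
+ * between the acceptable tail shrinks as the file grows, per the
+ * percentage argument above.
+ */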
23723 +/*
23724 + * Another strategy for tails: only create a tail if the whole file
23725 + * would fit into one DIRECT item.
23726 + * The primary intention here is to increase performance by decreasing
23727 + * seeking.
23728 + */
23729 +#define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \
23730 +( \
23731 + (!(n_tail_size)) || \
23732 + (((n_file_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) ) \
23733 +)
23735 +/*
23736 + * values for s_umount_state field
23737 + */
23738 +#define REISERFS_VALID_FS 1
23739 +#define REISERFS_ERROR_FS 2
23741 +/*
23742 + * there are 5 item types currently
23743 + */
23744 +#define TYPE_STAT_DATA 0
23745 +#define TYPE_INDIRECT 1
23746 +#define TYPE_DIRECT 2
23747 +#define TYPE_DIRENTRY 3
23748 +#define TYPE_MAXTYPE 3
23749 +#define TYPE_ANY 15 /* FIXME: comment is required */
23751 +/***************************************************************************
23752 + * KEY & ITEM HEAD *
23753 + ***************************************************************************/
23755 +/* directories use this key as well as old files */
23756 +struct offset_v1 {
23757 + __le32 k_offset;
23758 + __le32 k_uniqueness;
23759 +} __attribute__ ((__packed__));
23761 +struct offset_v2 {
23762 + __le64 v;
23763 +} __attribute__ ((__packed__));
23765 +static inline __u16 offset_v2_k_type(const struct offset_v2 *v2)
23766 +{
23767 + __u8 type = le64_to_cpu(v2->v) >> 60;
23768 + return (type <= TYPE_MAXTYPE) ? type : TYPE_ANY;
23769 +}
23771 +static inline void set_offset_v2_k_type(struct offset_v2 *v2, int type)
23772 +{
23773 + v2->v =
23774 + (v2->v & cpu_to_le64(~0ULL >> 4)) | cpu_to_le64((__u64) type << 60);
23775 +}
23777 +static inline loff_t offset_v2_k_offset(const struct offset_v2 *v2)
23778 +{
23779 + return le64_to_cpu(v2->v) & (~0ULL >> 4);
23780 +}
23782 +static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset)
23783 +{
23784 + offset &= (~0ULL >> 4);
23785 + v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset);
23786 +}
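+/*
+ * Packing sketch: the type lives in the top 4 bits of the __le64 and
+ * the offset in the low 60 bits, so after
+ *
+ *	struct offset_v2 v2 = { .v = 0 };
+ *	set_offset_v2_k_type(&v2, TYPE_INDIRECT);
+ *	set_offset_v2_k_offset(&v2, 4096);
+ *
+ * offset_v2_k_type(&v2) yields TYPE_INDIRECT and
+ * offset_v2_k_offset(&v2) yields 4096.
+ */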
23788 +/*
23789 + * Key of an item determines its location in the S+tree, and
23790 + * is composed of 4 components
23791 + */
23792 +struct reiserfs_key {
23793 + /* packing locality: by default parent directory object id */
23794 + __le32 k_dir_id;
23796 + __le32 k_objectid; /* object identifier */
23797 + union {
23798 + struct offset_v1 k_offset_v1;
23799 + struct offset_v2 k_offset_v2;
23800 + } __attribute__ ((__packed__)) u;
23801 +} __attribute__ ((__packed__));
23803 +struct in_core_key {
23804 + /* packing locality: by default parent directory object id */
23805 + __u32 k_dir_id;
23806 + __u32 k_objectid; /* object identifier */
23807 + __u64 k_offset;
23808 + __u8 k_type;
23809 +};
23811 +struct cpu_key {
23812 + struct in_core_key on_disk_key;
23813 + int version;
23814 + /* 3 in all cases but direct2indirect and indirect2direct conversion */
23815 + int key_length;
23816 +};
23818 +/*
23819 + * Our function for comparing keys can compare keys of different
23820 + * lengths. It takes as a parameter the length of the keys it is to
23821 + * compare. These defines are used in determining what is to be passed
23822 + * to it as that parameter.
23823 + */
23824 +#define REISERFS_FULL_KEY_LEN 4
23825 +#define REISERFS_SHORT_KEY_LEN 2
23827 +/* The result of the key compare */
23828 +#define FIRST_GREATER 1
23829 +#define SECOND_GREATER -1
23830 +#define KEYS_IDENTICAL 0
23831 +#define KEY_FOUND 1
23832 +#define KEY_NOT_FOUND 0
23834 +#define KEY_SIZE (sizeof(struct reiserfs_key))
23836 +/* return values for search_by_key and clones */
23837 +#define ITEM_FOUND 1
23838 +#define ITEM_NOT_FOUND 0
23839 +#define ENTRY_FOUND 1
23840 +#define ENTRY_NOT_FOUND 0
23841 +#define DIRECTORY_NOT_FOUND -1
23842 +#define REGULAR_FILE_FOUND -2
23843 +#define DIRECTORY_FOUND -3
23844 +#define BYTE_FOUND 1
23845 +#define BYTE_NOT_FOUND 0
23846 +#define FILE_NOT_FOUND -1
23848 +#define POSITION_FOUND 1
23849 +#define POSITION_NOT_FOUND 0
23851 +/* return values for reiserfs_find_entry and search_by_entry_key */
23852 +#define NAME_FOUND 1
23853 +#define NAME_NOT_FOUND 0
23854 +#define GOTO_PREVIOUS_ITEM 2
23855 +#define NAME_FOUND_INVISIBLE 3
23857 +/*
23858 + * Everything in the filesystem is stored as a set of items. The
23859 + * item head contains the key of the item, its free space (for
23860 + * indirect items) and specifies the location of the item itself
23861 + * within the block.
23862 + */
23864 +struct item_head {
23865 + /*
23866 + * Everything in the tree is found by searching for it based on
23867 + * its key.
23868 + */
23869 + struct reiserfs_key ih_key;
23870 + union {
23871 + /*
23872 + * The free space in the last unformatted node of an
23873 + * indirect item if this is an indirect item. This
23874 + * equals 0xFFFF iff this is a direct item or stat data
23875 + * item. Note that the key, not this field, is used to
23876 + * determine the item type, and thus which field this
23877 + * union contains.
23878 + */
23879 + __le16 ih_free_space_reserved;
23881 + /*
23882 + * Iff this is a directory item, this field equals the
23883 + * number of directory entries in the directory item.
23884 + */
23885 + __le16 ih_entry_count;
23886 + } __attribute__ ((__packed__)) u;
23887 + __le16 ih_item_len; /* total size of the item body */
23889 + /* an offset to the item body within the block */
23890 + __le16 ih_item_location;
23892 + /*
23893 + * 0 for all old items, 2 for new ones. Highest bit is set by fsck
23894 + * temporarily, and cleared once it is done
23895 + */
23896 + __le16 ih_version;
23897 +} __attribute__ ((__packed__));
23898 +/* size of item header */
23899 +#define IH_SIZE (sizeof(struct item_head))
23901 +#define ih_free_space(ih) le16_to_cpu((ih)->u.ih_free_space_reserved)
23902 +#define ih_version(ih) le16_to_cpu((ih)->ih_version)
23903 +#define ih_entry_count(ih) le16_to_cpu((ih)->u.ih_entry_count)
23904 +#define ih_location(ih) le16_to_cpu((ih)->ih_item_location)
23905 +#define ih_item_len(ih) le16_to_cpu((ih)->ih_item_len)
23907 +#define put_ih_free_space(ih, val) do { (ih)->u.ih_free_space_reserved = cpu_to_le16(val); } while(0)
23908 +#define put_ih_version(ih, val) do { (ih)->ih_version = cpu_to_le16(val); } while (0)
23909 +#define put_ih_entry_count(ih, val) do { (ih)->u.ih_entry_count = cpu_to_le16(val); } while (0)
23910 +#define put_ih_location(ih, val) do { (ih)->ih_item_location = cpu_to_le16(val); } while (0)
23911 +#define put_ih_item_len(ih, val) do { (ih)->ih_item_len = cpu_to_le16(val); } while (0)
23913 +#define unreachable_item(ih) (ih_version(ih) & (1 << 15))
23915 +#define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih))
23916 +#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val)))
23918 +/*
23919 + * these operate on indirect items, where you've got an array of ints
23920 + * at a possibly unaligned location. These are a noop on ia32
23922 + * p is the array of __u32, i is the index into the array, v is the value
23923 + * to store there.
23924 + */
23925 +#define get_block_num(p, i) get_unaligned_le32((p) + (i))
23926 +#define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i))
23928 +/* in the old version the uniqueness field shows the key type */
23929 +#define V1_SD_UNIQUENESS 0
23930 +#define V1_INDIRECT_UNIQUENESS 0xfffffffe
23931 +#define V1_DIRECT_UNIQUENESS 0xffffffff
23932 +#define V1_DIRENTRY_UNIQUENESS 500
23933 +#define V1_ANY_UNIQUENESS 555 /* FIXME: comment is required */
23935 +/* here are conversion routines */
23936 +static inline int uniqueness2type(__u32 uniqueness) CONSTF;
23937 +static inline int uniqueness2type(__u32 uniqueness)
23938 +{
23939 + switch ((int)uniqueness) {
23940 + case V1_SD_UNIQUENESS:
23941 + return TYPE_STAT_DATA;
23942 + case V1_INDIRECT_UNIQUENESS:
23943 + return TYPE_INDIRECT;
23944 + case V1_DIRECT_UNIQUENESS:
23945 + return TYPE_DIRECT;
23946 + case V1_DIRENTRY_UNIQUENESS:
23947 + return TYPE_DIRENTRY;
23948 + case V1_ANY_UNIQUENESS:
23949 + default:
23950 + return TYPE_ANY;
23951 + }
23952 +}
23954 +static inline __u32 type2uniqueness(int type) CONSTF;
23955 +static inline __u32 type2uniqueness(int type)
23956 +{
23957 + switch (type) {
23958 + case TYPE_STAT_DATA:
23959 + return V1_SD_UNIQUENESS;
23960 + case TYPE_INDIRECT:
23961 + return V1_INDIRECT_UNIQUENESS;
23962 + case TYPE_DIRECT:
23963 + return V1_DIRECT_UNIQUENESS;
23964 + case TYPE_DIRENTRY:
23965 + return V1_DIRENTRY_UNIQUENESS;
23966 + case TYPE_ANY:
23967 + default:
23968 + return V1_ANY_UNIQUENESS;
23969 + }
23970 +}
23972 +/*
23973 + * key is a pointer to the on-disk key, which is stored in le; the result is cpu.
23974 + * there is no way to get the version of an object from its key, so the
23975 + * version must be provided to these helpers
23976 + */
23977 +static inline loff_t le_key_k_offset(int version,
23978 + const struct reiserfs_key *key)
23979 +{
23980 + return (version == KEY_FORMAT_3_5) ?
23981 + le32_to_cpu(key->u.k_offset_v1.k_offset) :
23982 + offset_v2_k_offset(&(key->u.k_offset_v2));
23983 +}
23985 +static inline loff_t le_ih_k_offset(const struct item_head *ih)
23986 +{
23987 + return le_key_k_offset(ih_version(ih), &(ih->ih_key));
23988 +}
23990 +static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key)
23991 +{
23992 + if (version == KEY_FORMAT_3_5) {
23993 + loff_t val = le32_to_cpu(key->u.k_offset_v1.k_uniqueness);
23994 + return uniqueness2type(val);
23995 + } else
23996 + return offset_v2_k_type(&(key->u.k_offset_v2));
23997 +}
23999 +static inline loff_t le_ih_k_type(const struct item_head *ih)
24000 +{
24001 + return le_key_k_type(ih_version(ih), &(ih->ih_key));
24002 +}
24004 +static inline void set_le_key_k_offset(int version, struct reiserfs_key *key,
24005 + loff_t offset)
24006 +{
24007 + if (version == KEY_FORMAT_3_5)
24008 + key->u.k_offset_v1.k_offset = cpu_to_le32(offset);
24009 + else
24010 + set_offset_v2_k_offset(&key->u.k_offset_v2, offset);
24011 +}
24013 +static inline void add_le_key_k_offset(int version, struct reiserfs_key *key,
24014 + loff_t offset)
24015 +{
24016 + set_le_key_k_offset(version, key,
24017 + le_key_k_offset(version, key) + offset);
24018 +}
24020 +static inline void add_le_ih_k_offset(struct item_head *ih, loff_t offset)
24021 +{
24022 + add_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
24023 +}
24025 +static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset)
24026 +{
24027 + set_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
24028 +}
24030 +static inline void set_le_key_k_type(int version, struct reiserfs_key *key,
24031 + int type)
24032 +{
24033 + if (version == KEY_FORMAT_3_5) {
24034 + type = type2uniqueness(type);
24035 + key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type);
24036 + } else
24037 + set_offset_v2_k_type(&key->u.k_offset_v2, type);
24038 +}
24040 +static inline void set_le_ih_k_type(struct item_head *ih, int type)
24041 +{
24042 + set_le_key_k_type(ih_version(ih), &(ih->ih_key), type);
24043 +}
24045 +static inline int is_direntry_le_key(int version, struct reiserfs_key *key)
24046 +{
24047 + return le_key_k_type(version, key) == TYPE_DIRENTRY;
24048 +}
24050 +static inline int is_direct_le_key(int version, struct reiserfs_key *key)
24051 +{
24052 + return le_key_k_type(version, key) == TYPE_DIRECT;
24053 +}
24055 +static inline int is_indirect_le_key(int version, struct reiserfs_key *key)
24056 +{
24057 + return le_key_k_type(version, key) == TYPE_INDIRECT;
24058 +}
24060 +static inline int is_statdata_le_key(int version, struct reiserfs_key *key)
24061 +{
24062 + return le_key_k_type(version, key) == TYPE_STAT_DATA;
24063 +}
24065 +/* item header has version. */
24066 +static inline int is_direntry_le_ih(struct item_head *ih)
24067 +{
24068 + return is_direntry_le_key(ih_version(ih), &ih->ih_key);
24069 +}
24071 +static inline int is_direct_le_ih(struct item_head *ih)
24072 +{
24073 + return is_direct_le_key(ih_version(ih), &ih->ih_key);
24074 +}
24076 +static inline int is_indirect_le_ih(struct item_head *ih)
24077 +{
24078 + return is_indirect_le_key(ih_version(ih), &ih->ih_key);
24079 +}
24081 +static inline int is_statdata_le_ih(struct item_head *ih)
24082 +{
24083 + return is_statdata_le_key(ih_version(ih), &ih->ih_key);
24084 +}
24086 +/* key is pointer to cpu key, result is cpu */
24087 +static inline loff_t cpu_key_k_offset(const struct cpu_key *key)
24088 +{
24089 + return key->on_disk_key.k_offset;
24090 +}
24092 +static inline loff_t cpu_key_k_type(const struct cpu_key *key)
24093 +{
24094 + return key->on_disk_key.k_type;
24095 +}
24097 +static inline void set_cpu_key_k_offset(struct cpu_key *key, loff_t offset)
24098 +{
24099 + key->on_disk_key.k_offset = offset;
24100 +}
24102 +static inline void set_cpu_key_k_type(struct cpu_key *key, int type)
24103 +{
24104 + key->on_disk_key.k_type = type;
24105 +}
24107 +static inline void cpu_key_k_offset_dec(struct cpu_key *key)
24108 +{
24109 + key->on_disk_key.k_offset--;
24110 +}
24112 +#define is_direntry_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRENTRY)
24113 +#define is_direct_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRECT)
24114 +#define is_indirect_cpu_key(key) (cpu_key_k_type (key) == TYPE_INDIRECT)
24115 +#define is_statdata_cpu_key(key) (cpu_key_k_type (key) == TYPE_STAT_DATA)
24117 +/* are these used ? */
24118 +#define is_direntry_cpu_ih(ih) (is_direntry_cpu_key (&((ih)->ih_key)))
24119 +#define is_direct_cpu_ih(ih) (is_direct_cpu_key (&((ih)->ih_key)))
24120 +#define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key)))
24121 +#define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key)))
24123 +#define I_K_KEY_IN_ITEM(ih, key, n_blocksize) \
24124 + (!COMP_SHORT_KEYS(ih, key) && \
24125 + I_OFF_BYTE_IN_ITEM(ih, k_offset(key), n_blocksize))
24127 +/* maximal length of item */
24128 +#define MAX_ITEM_LEN(block_size) (block_size - BLKH_SIZE - IH_SIZE)
24129 +#define MIN_ITEM_LEN 1
24131 +/* object identifier for root dir */
24132 +#define REISERFS_ROOT_OBJECTID 2
24133 +#define REISERFS_ROOT_PARENT_OBJECTID 1
24135 +extern struct reiserfs_key root_key;
24137 +/*
24138 + * Picture represents a leaf of the S+tree
24139 + * ______________________________________________________
24140 + * | | Array of | | |
24141 + * |Block | Object-Item | F r e e | Objects- |
24142 + * | head | Headers | S p a c e | Items |
24143 + * |______|_______________|___________________|___________|
24144 + */
24146 +/*
24147 + * Header of a disk block. More precisely, header of a formatted leaf
24148 + * or internal node, and not the header of an unformatted node.
24149 + */
24150 +struct block_head {
24151 + __le16 blk_level; /* Level of a block in the tree. */
24152 + __le16 blk_nr_item; /* Number of keys/items in a block. */
24153 + __le16 blk_free_space; /* Block free space in bytes. */
24154 + __le16 blk_reserved;
24155 + /* dump this in v4/planA */
24157 + /* kept only for compatibility */
24158 + struct reiserfs_key blk_right_delim_key;
24159 +};
24161 +#define BLKH_SIZE (sizeof(struct block_head))
24162 +#define blkh_level(p_blkh) (le16_to_cpu((p_blkh)->blk_level))
24163 +#define blkh_nr_item(p_blkh) (le16_to_cpu((p_blkh)->blk_nr_item))
24164 +#define blkh_free_space(p_blkh) (le16_to_cpu((p_blkh)->blk_free_space))
24165 +#define blkh_reserved(p_blkh) (le16_to_cpu((p_blkh)->blk_reserved))
24166 +#define set_blkh_level(p_blkh,val) ((p_blkh)->blk_level = cpu_to_le16(val))
24167 +#define set_blkh_nr_item(p_blkh,val) ((p_blkh)->blk_nr_item = cpu_to_le16(val))
24168 +#define set_blkh_free_space(p_blkh,val) ((p_blkh)->blk_free_space = cpu_to_le16(val))
24169 +#define set_blkh_reserved(p_blkh,val) ((p_blkh)->blk_reserved = cpu_to_le16(val))
24170 +#define blkh_right_delim_key(p_blkh) ((p_blkh)->blk_right_delim_key)
24171 +#define set_blkh_right_delim_key(p_blkh,val) ((p_blkh)->blk_right_delim_key = val)
24173 +/* values for blk_level field of the struct block_head */
24175 +/*
24176 + * When a node is removed from the tree, its blk_level is set to FREE_LEVEL.
24177 + * It is then used to see whether the node is still in the tree
24178 + */
24179 +#define FREE_LEVEL 0
24181 +#define DISK_LEAF_NODE_LEVEL 1 /* Leaf node level. */
24183 +/*
24184 + * Given the buffer head of a formatted node, resolve to the
24185 + * block head of that node.
24186 + */
24187 +#define B_BLK_HEAD(bh) ((struct block_head *)((bh)->b_data))
24188 +/* Number of items that are in buffer. */
24189 +#define B_NR_ITEMS(bh) (blkh_nr_item(B_BLK_HEAD(bh)))
24190 +#define B_LEVEL(bh) (blkh_level(B_BLK_HEAD(bh)))
24191 +#define B_FREE_SPACE(bh) (blkh_free_space(B_BLK_HEAD(bh)))
24193 +#define PUT_B_NR_ITEMS(bh, val) do { set_blkh_nr_item(B_BLK_HEAD(bh), val); } while (0)
24194 +#define PUT_B_LEVEL(bh, val) do { set_blkh_level(B_BLK_HEAD(bh), val); } while (0)
24195 +#define PUT_B_FREE_SPACE(bh, val) do { set_blkh_free_space(B_BLK_HEAD(bh), val); } while (0)
24197 +/* Get right delimiting key. -- little endian */
24198 +#define B_PRIGHT_DELIM_KEY(bh) (&(blkh_right_delim_key(B_BLK_HEAD(bh))))
24200 +/* Does the buffer contain a disk leaf. */
24201 +#define B_IS_ITEMS_LEVEL(bh) (B_LEVEL(bh) == DISK_LEAF_NODE_LEVEL)
24203 +/* Does the buffer contain a disk internal node */
24204 +#define B_IS_KEYS_LEVEL(bh) (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \
24205 + && B_LEVEL(bh) <= MAX_HEIGHT)
24207 +/***************************************************************************
24208 + * STAT DATA *
24209 + ***************************************************************************/
24211 +/*
24212 + * old stat data is 32 bytes long. We are going to distinguish the new
24213 + * one by a different size
24214 + */
24215 +struct stat_data_v1 {
24216 + __le16 sd_mode; /* file type, permissions */
24217 + __le16 sd_nlink; /* number of hard links */
24218 + __le16 sd_uid; /* owner */
24219 + __le16 sd_gid; /* group */
24220 + __le32 sd_size; /* file size */
24221 + __le32 sd_atime; /* time of last access */
24222 + __le32 sd_mtime; /* time file was last modified */
24224 + /*
24225 + * time inode (stat data) was last changed
24226 + * (except changes to sd_atime and sd_mtime)
24227 + */
24228 + __le32 sd_ctime;
24229 + union {
24230 + __le32 sd_rdev;
24231 + __le32 sd_blocks; /* number of blocks file uses */
24232 + } __attribute__ ((__packed__)) u;
24234 + /*
24235 + * first byte of file which is stored in a direct item: except that if
24236 + * it equals 1 it is a symlink and if it equals ~(__u32)0 there is no
24237 + * direct item. The existence of this field really grates on me.
24238 + * Let's replace it with a macro based on sd_size and our tail
24239 + * suppression policy. Someday. -Hans
24240 + */
24241 + __le32 sd_first_direct_byte;
24242 +} __attribute__ ((__packed__));
24244 +#define SD_V1_SIZE (sizeof(struct stat_data_v1))
24245 +#define stat_data_v1(ih) (ih_version (ih) == KEY_FORMAT_3_5)
24246 +#define sd_v1_mode(sdp) (le16_to_cpu((sdp)->sd_mode))
24247 +#define set_sd_v1_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v))
24248 +#define sd_v1_nlink(sdp) (le16_to_cpu((sdp)->sd_nlink))
24249 +#define set_sd_v1_nlink(sdp,v) ((sdp)->sd_nlink = cpu_to_le16(v))
24250 +#define sd_v1_uid(sdp) (le16_to_cpu((sdp)->sd_uid))
24251 +#define set_sd_v1_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le16(v))
24252 +#define sd_v1_gid(sdp) (le16_to_cpu((sdp)->sd_gid))
24253 +#define set_sd_v1_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le16(v))
24254 +#define sd_v1_size(sdp) (le32_to_cpu((sdp)->sd_size))
24255 +#define set_sd_v1_size(sdp,v) ((sdp)->sd_size = cpu_to_le32(v))
24256 +#define sd_v1_atime(sdp) (le32_to_cpu((sdp)->sd_atime))
24257 +#define set_sd_v1_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v))
24258 +#define sd_v1_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime))
24259 +#define set_sd_v1_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v))
24260 +#define sd_v1_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime))
24261 +#define set_sd_v1_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v))
24262 +#define sd_v1_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev))
24263 +#define set_sd_v1_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v))
24264 +#define sd_v1_blocks(sdp) (le32_to_cpu((sdp)->u.sd_blocks))
24265 +#define set_sd_v1_blocks(sdp,v) ((sdp)->u.sd_blocks = cpu_to_le32(v))
24266 +#define sd_v1_first_direct_byte(sdp) \
24267 + (le32_to_cpu((sdp)->sd_first_direct_byte))
24268 +#define set_sd_v1_first_direct_byte(sdp,v) \
24269 + ((sdp)->sd_first_direct_byte = cpu_to_le32(v))
24271 +/* inode flags stored in sd_attrs (nee sd_reserved) */
24273 +/*
24274 + * we want common flags to have the same values as in ext2,
24275 + * so chattr(1) will work without problems
24276 + */
24277 +#define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL
24278 +#define REISERFS_APPEND_FL FS_APPEND_FL
24279 +#define REISERFS_SYNC_FL FS_SYNC_FL
24280 +#define REISERFS_NOATIME_FL FS_NOATIME_FL
24281 +#define REISERFS_NODUMP_FL FS_NODUMP_FL
24282 +#define REISERFS_SECRM_FL FS_SECRM_FL
24283 +#define REISERFS_UNRM_FL FS_UNRM_FL
24284 +#define REISERFS_COMPR_FL FS_COMPR_FL
24285 +#define REISERFS_NOTAIL_FL FS_NOTAIL_FL
24287 +/* persistent flags that file inherits from the parent directory */
24288 +#define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL | \
24289 + REISERFS_SYNC_FL | \
24290 + REISERFS_NOATIME_FL | \
24291 + REISERFS_NODUMP_FL | \
24292 + REISERFS_SECRM_FL | \
24293 + REISERFS_COMPR_FL | \
24294 + REISERFS_NOTAIL_FL )
24296 +/*
24297 + * Stat Data on disk (reiserfs version of UFS disk inode minus the
24298 + * address blocks)
24299 + */
24300 +struct stat_data {
24301 + __le16 sd_mode; /* file type, permissions */
24302 + __le16 sd_attrs; /* persistent inode flags */
24303 + __le32 sd_nlink; /* number of hard links */
24304 + __le64 sd_size; /* file size */
24305 + __le32 sd_uid; /* owner */
24306 + __le32 sd_gid; /* group */
24307 + __le32 sd_atime; /* time of last access */
24308 + __le32 sd_mtime; /* time file was last modified */
24310 + /*
24311 + * time inode (stat data) was last changed
24312 + * (except changes to sd_atime and sd_mtime)
24313 + */
24314 + __le32 sd_ctime;
24315 + __le32 sd_blocks;
24316 + union {
24317 + __le32 sd_rdev;
24318 + __le32 sd_generation;
24319 + } __attribute__ ((__packed__)) u;
24320 +} __attribute__ ((__packed__));
24322 +/* this is 44 bytes long */
24323 +#define SD_SIZE (sizeof(struct stat_data))
24324 +#define SD_V2_SIZE SD_SIZE
24325 +#define stat_data_v2(ih) (ih_version (ih) == KEY_FORMAT_3_6)
24326 +#define sd_v2_mode(sdp) (le16_to_cpu((sdp)->sd_mode))
24327 +#define set_sd_v2_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v))
24328 +/* sd_reserved */
24329 +/* set_sd_reserved */
24330 +#define sd_v2_nlink(sdp) (le32_to_cpu((sdp)->sd_nlink))
24331 +#define set_sd_v2_nlink(sdp,v) ((sdp)->sd_nlink = cpu_to_le32(v))
24332 +#define sd_v2_size(sdp) (le64_to_cpu((sdp)->sd_size))
24333 +#define set_sd_v2_size(sdp,v) ((sdp)->sd_size = cpu_to_le64(v))
24334 +#define sd_v2_uid(sdp) (le32_to_cpu((sdp)->sd_uid))
24335 +#define set_sd_v2_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le32(v))
24336 +#define sd_v2_gid(sdp) (le32_to_cpu((sdp)->sd_gid))
24337 +#define set_sd_v2_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le32(v))
24338 +#define sd_v2_atime(sdp) (le32_to_cpu((sdp)->sd_atime))
24339 +#define set_sd_v2_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v))
24340 +#define sd_v2_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime))
24341 +#define set_sd_v2_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v))
24342 +#define sd_v2_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime))
24343 +#define set_sd_v2_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v))
24344 +#define sd_v2_blocks(sdp) (le32_to_cpu((sdp)->sd_blocks))
24345 +#define set_sd_v2_blocks(sdp,v) ((sdp)->sd_blocks = cpu_to_le32(v))
24346 +#define sd_v2_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev))
24347 +#define set_sd_v2_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v))
24348 +#define sd_v2_generation(sdp) (le32_to_cpu((sdp)->u.sd_generation))
24349 +#define set_sd_v2_generation(sdp,v) ((sdp)->u.sd_generation = cpu_to_le32(v))
24350 +#define sd_v2_attrs(sdp) (le16_to_cpu((sdp)->sd_attrs))
24351 +#define set_sd_v2_attrs(sdp,v) ((sdp)->sd_attrs = cpu_to_le16(v))
24353 +/***************************************************************************
24354 + * DIRECTORY STRUCTURE *
24355 + ***************************************************************************/
24356 +/*
24357 + * Picture represents the structure of directory items
24358 + * ________________________________________________
24359 + * | Array of | | | | | |
24360 + * | directory |N-1| N-2 | .... | 1st |0th|
24361 + * | entry headers | | | | | |
24362 + * |_______________|___|_____|________|_______|___|
24363 + * <---- directory entries ------>
24365 + * First directory item has k_offset component 1. We store "." and ".."
24366 + * in one item, always, we never split "." and ".." into differing
24367 + * items. This makes, among other things, the code for removing
24368 + * directories simpler.
24369 + */
24370 +#define SD_OFFSET 0
24371 +#define SD_UNIQUENESS 0
24372 +#define DOT_OFFSET 1
24373 +#define DOT_DOT_OFFSET 2
24374 +#define DIRENTRY_UNIQUENESS 500
24376 +#define FIRST_ITEM_OFFSET 1
24378 +/*
24379 + * Q: How do we get the key of the object an entry points to from the entry?
24380 + *
24381 + * A: Each directory entry has a header with deh_dir_id and deh_objectid
24382 + * fields; together they form the key of the object the entry points to
24383 + */
24385 +/*
24386 + * NOT IMPLEMENTED:
24387 + * Directory will someday contain stat data of object
24388 + */
24390 +struct reiserfs_de_head {
24391 + __le32 deh_offset; /* third component of the directory entry key */
24393 + /*
24394 + * objectid of the parent directory of the object that is referenced
24395 + * by the directory entry
24396 + */
24397 + __le32 deh_dir_id;
24399 + /* objectid of the object, that is referenced by directory entry */
24400 + __le32 deh_objectid;
24401 + __le16 deh_location; /* offset of name in the whole item */
24403 + /*
24404 + * whether 1) the entry contains stat data (for the future), and
24405 + * 2) the entry is hidden (unlinked)
24406 + */
24407 + __le16 deh_state;
24408 +} __attribute__ ((__packed__));
24409 +#define DEH_SIZE sizeof(struct reiserfs_de_head)
24410 +#define deh_offset(p_deh) (le32_to_cpu((p_deh)->deh_offset))
24411 +#define deh_dir_id(p_deh) (le32_to_cpu((p_deh)->deh_dir_id))
24412 +#define deh_objectid(p_deh) (le32_to_cpu((p_deh)->deh_objectid))
24413 +#define deh_location(p_deh) (le16_to_cpu((p_deh)->deh_location))
24414 +#define deh_state(p_deh) (le16_to_cpu((p_deh)->deh_state))
24416 +#define put_deh_offset(p_deh,v) ((p_deh)->deh_offset = cpu_to_le32((v)))
24417 +#define put_deh_dir_id(p_deh,v) ((p_deh)->deh_dir_id = cpu_to_le32((v)))
24418 +#define put_deh_objectid(p_deh,v) ((p_deh)->deh_objectid = cpu_to_le32((v)))
24419 +#define put_deh_location(p_deh,v) ((p_deh)->deh_location = cpu_to_le16((v)))
24420 +#define put_deh_state(p_deh,v) ((p_deh)->deh_state = cpu_to_le16((v)))
24422 +/* empty directory contains two entries "." and ".." and their headers */
24423 +#define EMPTY_DIR_SIZE \
24424 +(DEH_SIZE * 2 + ROUND_UP (sizeof(".") - 1) + ROUND_UP (sizeof("..") - 1))
24426 +/* old format directories have this size when empty */
24427 +#define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3)
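+/*
+ * Worked numbers: DEH_SIZE is 16 bytes (three __le32 plus two __le16),
+ * so EMPTY_DIR_SIZE = 2 * 16 + ROUND_UP(1) + ROUND_UP(2) = 48 bytes,
+ * while the v1 format stores the two names unpadded:
+ * EMPTY_DIR_SIZE_V1 = 2 * 16 + 3 = 35 bytes.
+ */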
24429 +#define DEH_Statdata 0 /* not used now */
24430 +#define DEH_Visible 2
24432 +/* 64 bit systems (and the S/390) need to be aligned explicitly -jdm */
24433 +#if BITS_PER_LONG == 64 || defined(__s390__) || defined(__hppa__)
24434 +# define ADDR_UNALIGNED_BITS (3)
24435 +#endif
24437 +/*
24438 + * These are only used to manipulate deh_state.
24439 + * Because of this, we'll use the ext2_ bit routines,
24440 + * since they are little endian
24441 + */
24442 +#ifdef ADDR_UNALIGNED_BITS
24444 +# define aligned_address(addr) ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1)))
24445 +# define unaligned_offset(addr) (((int)((long)(addr) & ((1 << ADDR_UNALIGNED_BITS) - 1))) << 3)
24447 +# define set_bit_unaligned(nr, addr) \
24448 + __test_and_set_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
24449 +# define clear_bit_unaligned(nr, addr) \
24450 + __test_and_clear_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
24451 +# define test_bit_unaligned(nr, addr) \
24452 + test_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
24454 +#else
24456 +# define set_bit_unaligned(nr, addr) __test_and_set_bit_le(nr, addr)
24457 +# define clear_bit_unaligned(nr, addr) __test_and_clear_bit_le(nr, addr)
24458 +# define test_bit_unaligned(nr, addr) test_bit_le(nr, addr)
24460 +#endif
24462 +#define mark_de_with_sd(deh) set_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
24463 +#define mark_de_without_sd(deh) clear_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
24464 +#define mark_de_visible(deh) set_bit_unaligned (DEH_Visible, &((deh)->deh_state))
24465 +#define mark_de_hidden(deh) clear_bit_unaligned (DEH_Visible, &((deh)->deh_state))
24467 +#define de_with_sd(deh) test_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
24468 +#define de_visible(deh) test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
24469 +#define de_hidden(deh) !test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
24471 +extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
24472 + __le32 par_dirid, __le32 par_objid);
24473 +extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
24474 + __le32 par_dirid, __le32 par_objid);
24476 +/* two entries per block (at least) */
24477 +#define REISERFS_MAX_NAME(block_size) 255
24479 +/*
24480 + * this structure is used for operations on directory entries. It is
24481 + * not a disk structure.
24482 + *
24483 + * When reiserfs_find_entry or search_by_entry_key find a directory
24484 + * entry, they return a filled reiserfs_dir_entry structure
24485 + */
24486 +struct reiserfs_dir_entry {
24487 + struct buffer_head *de_bh;
24488 + int de_item_num;
24489 + struct item_head *de_ih;
24490 + int de_entry_num;
24491 + struct reiserfs_de_head *de_deh;
24492 + int de_entrylen;
24493 + int de_namelen;
24494 + char *de_name;
24495 + unsigned long *de_gen_number_bit_string;
24497 + __u32 de_dir_id;
24498 + __u32 de_objectid;
24500 + struct cpu_key de_entry_key;
24501 +};
24503 +/*
24504 + * these defines are useful when a particular member of
24505 + * a reiserfs_dir_entry is needed
24506 + */
24508 +/* pointer to file name, stored in entry */
24509 +#define B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh) \
24510 + (ih_item_body(bh, ih) + deh_location(deh))
24512 +/* length of name */
24513 +#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \
24514 +(I_DEH_N_ENTRY_LENGTH (ih, deh, entry_num) - (de_with_sd (deh) ? SD_SIZE : 0))
24516 +/* hash value occupies bits from 7 up to 30 */
24517 +#define GET_HASH_VALUE(offset) ((offset) & 0x7fffff80LL)
24518 +/* generation number occupies 7 bits starting from 0 up to 6 */
24519 +#define GET_GENERATION_NUMBER(offset) ((offset) & 0x7fLL)
24520 +#define MAX_GENERATION_NUMBER 127
24522 +#define SET_GENERATION_NUMBER(offset,gen_number) (GET_HASH_VALUE(offset)|(gen_number))
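+/*
+ * Example: names that hash to the same value get offsets sharing bits
+ * 7..30 and differing only in the generation bits, i.e.
+ * SET_GENERATION_NUMBER(hash, 0), SET_GENERATION_NUMBER(hash, 1), ...
+ * so at most MAX_GENERATION_NUMBER + 1 = 128 colliding names can exist
+ * per hash value in one directory.
+ */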
24524 +/*
24525 + * Picture represents an internal node of the reiserfs tree
24526 + * ______________________________________________________
24527 + * | | Array of | Array of | Free |
24528 + * |block | keys | pointers | space |
24529 + * | head | N | N+1 | |
24530 + * |______|_______________|___________________|___________|
24531 + */
24533 +/***************************************************************************
24534 + * DISK CHILD *
24535 + ***************************************************************************/
24536 +/*
24537 + * Disk child pointer:
24538 + * The pointer from an internal node of the tree to a node that is on disk.
24539 + */
24540 +struct disk_child {
24541 + __le32 dc_block_number; /* Disk child's block number. */
24542 + __le16 dc_size; /* Disk child's used space. */
24543 + __le16 dc_reserved;
24544 +};
24546 +#define DC_SIZE (sizeof(struct disk_child))
24547 +#define dc_block_number(dc_p) (le32_to_cpu((dc_p)->dc_block_number))
24548 +#define dc_size(dc_p) (le16_to_cpu((dc_p)->dc_size))
24549 +#define put_dc_block_number(dc_p, val) do { (dc_p)->dc_block_number = cpu_to_le32(val); } while(0)
24550 +#define put_dc_size(dc_p, val) do { (dc_p)->dc_size = cpu_to_le16(val); } while(0)
24552 +/* Get disk child by buffer header and position in the tree node. */
24553 +#define B_N_CHILD(bh, n_pos) ((struct disk_child *)\
24554 +((bh)->b_data + BLKH_SIZE + B_NR_ITEMS(bh) * KEY_SIZE + DC_SIZE * (n_pos)))
24556 +/* Get disk child number by buffer header and position in the tree node. */
24557 +#define B_N_CHILD_NUM(bh, n_pos) (dc_block_number(B_N_CHILD(bh, n_pos)))
24558 +#define PUT_B_N_CHILD_NUM(bh, n_pos, val) \
24559 + (put_dc_block_number(B_N_CHILD(bh, n_pos), val))
24561 + /* maximal value of field child_size in structure disk_child */
24562 + /* child size is the combined size of all items and their headers */
24563 +#define MAX_CHILD_SIZE(bh) ((int)( (bh)->b_size - BLKH_SIZE ))
24565 +/* amount of used space in buffer (not including block head) */
24566 +#define B_CHILD_SIZE(cur) (MAX_CHILD_SIZE(cur)-(B_FREE_SPACE(cur)))
24568 +/* max and min number of keys in internal node */
24569 +#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) )
24570 +#define MIN_NR_KEY(bh) (MAX_NR_KEY(bh)/2)
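+/*
+ * Worked example (assuming 4 KiB blocks): BLKH_SIZE = 24, KEY_SIZE = 16
+ * and DC_SIZE = 8, so MAX_NR_KEY = (4096 - 24 - 8) / (16 + 8) = 169
+ * keys, i.e. at most 170 disk child pointers per internal node, and
+ * MIN_NR_KEY = 84.
+ */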
24572 +/***************************************************************************
24573 + * PATH STRUCTURES AND DEFINES *
24574 + ***************************************************************************/
24576 +/*
24577 + * search_by_key fills up the path from the root to the leaf as it descends
24578 + * the tree looking for the key. It uses reiserfs_bread to try to find
24579 + * buffers in the cache given their block number. If it does not find
24580 + * them in the cache it reads them from disk. For each node search_by_key
24581 + * finds using reiserfs_bread it then uses bin_search to look through that
24582 + * node. bin_search will find the position of the block_number of the next
24583 + * node if it is looking through an internal node. If it is looking through
24584 + * a leaf node bin_search will find the position of the item which has key
24585 + * either equal to given key, or which is the maximal key less than the
24586 + * given key.
24587 + */
24589 +struct path_element {
24590 + /* Pointer to the buffer at the path in the tree. */
24591 + struct buffer_head *pe_buffer;
24592 + /* Position in the tree node which is placed in the buffer above. */
24593 + int pe_position;
24594 +};
24596 +/*
24597 + * maximal height of a tree. don't change this without
24598 + * changing JOURNAL_PER_BALANCE_CNT
24599 + */
24600 +#define MAX_HEIGHT 5
24602 +/* Must equal MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */
24603 +#define EXTENDED_MAX_HEIGHT 7
24605 +/* Must be equal to at least 2. */
24606 +#define FIRST_PATH_ELEMENT_OFFSET 2
24608 +/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */
24609 +#define ILLEGAL_PATH_ELEMENT_OFFSET 1
24611 +/* this MUST be MAX_HEIGHT + 1. See about FEB below */
24612 +#define MAX_FEB_SIZE 6
24614 +/*
24615 + * We need to keep track of who the ancestors of nodes are. When we
24616 + * perform a search we record which nodes were visited while
24617 + * descending the tree looking for the node we searched for. This list
24618 + * of nodes is called the path. This information is used while
24619 + * performing balancing. Note that this path information may become
24620 + * invalid, and this means we must check it when using it to see if it
24621 + * is still valid. You'll need to read search_by_key and the comments
24622 + * in it, especially about decrement_counters_in_path(), to understand
24623 + * this structure.
24625 + * Paths make the code so much harder to work with and debug.... An
24626 + * enormous number of bugs are due to them, and trying to write or modify
24627 + * code that uses them just makes my head hurt. They are based on an
24628 + * excessive effort to avoid disturbing the precious VFS code.:-( The
24629 + * gods only know how we are going to SMP the code that uses them.
24630 + * znodes are the way!
24631 + */
24633 +#define PATH_READA 0x1 /* do read ahead */
24634 +#define PATH_READA_BACK 0x2 /* read backwards */
24636 +struct treepath {
24637 + int path_length; /* Length of the path_elements array below. */
24638 + int reada;
24639 + /* Array of the path elements. */
24640 + struct path_element path_elements[EXTENDED_MAX_HEIGHT];
24641 + int pos_in_item;
24642 +};
24644 +#define pos_in_item(path) ((path)->pos_in_item)
24646 +#define INITIALIZE_PATH(var) \
24647 +struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
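+/*
+ * Typical usage sketch (given a struct super_block *sb and a struct
+ * cpu_key key; search_by_key() and pathrelse() are declared elsewhere
+ * in this header):
+ *
+ *	INITIALIZE_PATH(path);
+ *	if (search_by_key(sb, &key, &path, DISK_LEAF_NODE_LEVEL) == ITEM_FOUND)
+ *		ih = tp_item_head(&path);
+ *	pathrelse(&path);
+ *
+ * The path pins the buffers it visited, so it must be released before
+ * the caller returns.
+ */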
24649 +/* Get path element by path and path position. */
24650 +#define PATH_OFFSET_PELEMENT(path, n_offset) ((path)->path_elements + (n_offset))
24652 +/* Get buffer header at the path by path and path position. */
24653 +#define PATH_OFFSET_PBUFFER(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_buffer)
24655 +/* Get position in the element at the path by path and path position. */
24656 +#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position)
24658 +#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length))
24660 +/*
24661 + * you know, to the person who didn't write this the macro name does not
24662 + * at first suggest what it does. Maybe POSITION_FROM_PATH_END? Or
24663 + * maybe we should just focus on dumping paths... -Hans
24664 + */
24665 +#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length))
24667 +/*
24668 + * in do_balance leaf has h == 0 in contrast with path structure,
24669 + * where root has level == 0. That is why we need these defines
24670 + */
24672 +/* tb->S[h] */
24673 +#define PATH_H_PBUFFER(path, h) \
24674 + PATH_OFFSET_PBUFFER(path, path->path_length - (h))
24676 +/* tb->F[h] or tb->S[0]->b_parent */
24677 +#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER(path, (h) + 1)
24679 +#define PATH_H_POSITION(path, h) \
24680 + PATH_OFFSET_POSITION(path, path->path_length - (h))
24682 +/* tb->S[h]->b_item_order */
24683 +#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1)
24685 +#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h))
24687 +static inline void *reiserfs_node_data(const struct buffer_head *bh)
24688 +{
24689 + return bh->b_data + sizeof(struct block_head);
24690 +}
24692 +/* get key from internal node */
24693 +static inline struct reiserfs_key *internal_key(struct buffer_head *bh,
24694 + int item_num)
24695 +{
24696 + struct reiserfs_key *key = reiserfs_node_data(bh);
24698 + return &key[item_num];
24699 +}
24701 +/* get the item header from leaf node */
24702 +static inline struct item_head *item_head(const struct buffer_head *bh,
24703 + int item_num)
24704 +{
24705 + struct item_head *ih = reiserfs_node_data(bh);
24707 + return &ih[item_num];
24708 +}
24710 +/* get the key from leaf node */
24711 +static inline struct reiserfs_key *leaf_key(const struct buffer_head *bh,
24712 + int item_num)
24713 +{
24714 + return &item_head(bh, item_num)->ih_key;
24715 +}
24717 +static inline void *ih_item_body(const struct buffer_head *bh,
24718 + const struct item_head *ih)
24719 +{
24720 + return bh->b_data + ih_location(ih);
24721 +}
24723 +/* get item body from leaf node */
24724 +static inline void *item_body(const struct buffer_head *bh, int item_num)
24725 +{
24726 + return ih_item_body(bh, item_head(bh, item_num));
24727 +}
24729 +static inline struct item_head *tp_item_head(const struct treepath *path)
24730 +{
24731 + return item_head(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
24732 +}
24734 +static inline void *tp_item_body(const struct treepath *path)
24735 +{
24736 + return item_body(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
24737 +}
24739 +#define get_last_bh(path) PATH_PLAST_BUFFER(path)
24740 +#define get_item_pos(path) PATH_LAST_POSITION(path)
24741 +#define item_moved(ih,path) comp_items(ih, path)
24742 +#define path_changed(ih,path) comp_items (ih, path)
24744 +/* array of the entry headers */
24745 + /* get item body */
24746 +#define B_I_DEH(bh, ih) ((struct reiserfs_de_head *)(ih_item_body(bh, ih)))
24748 +/*
24749 + * length of the directory entry in directory item. This define
24750 + * calculates length of i-th directory entry using directory entry
24751 + * locations from dir entry head. When it calculates length of 0-th
24752 + * directory entry, it uses length of whole item in place of entry
24753 + * location of the non-existent following entry in the calculation.
24754 + * See picture above.
24755 + */
24756 +static inline int entry_length(const struct buffer_head *bh,
24757 + const struct item_head *ih, int pos_in_item)
24758 +{
24759 + struct reiserfs_de_head *deh;
24761 + deh = B_I_DEH(bh, ih) + pos_in_item;
24762 + if (pos_in_item)
24763 + return deh_location(deh - 1) - deh_location(deh);
24765 + return ih_item_len(ih) - deh_location(deh);
24766 +}
24768 +/***************************************************************************
24769 + * MISC *
24770 + ***************************************************************************/
24772 +/* Size of pointer to the unformatted node. */
24773 +#define UNFM_P_SIZE (sizeof(unp_t))
24774 +#define UNFM_P_SHIFT 2
24776 +/* in the in-core inode the key is stored in le form */
24777 +#define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key))
24779 +#define MAX_UL_INT 0xffffffff
24780 +#define MAX_INT 0x7ffffff
24781 +#define MAX_US_INT 0xffff
24783 +/* reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset */
24784 +static inline loff_t max_reiserfs_offset(struct inode *inode)
24785 +{
24786 + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5)
24787 + return (loff_t) U32_MAX;
24789 + return (loff_t) ((~(__u64) 0) >> 4);
24790 +}
24792 +#define MAX_KEY_OBJECTID MAX_UL_INT
24794 +#define MAX_B_NUM MAX_UL_INT
24795 +#define MAX_FC_NUM MAX_US_INT
24797 +/* the purpose is to detect overflow of an unsigned short */
24798 +#define REISERFS_LINK_MAX (MAX_US_INT - 1000)
24800 +/*
24801 + * The following defines are used in reiserfs_insert_item
24802 + * and reiserfs_append_item
24803 + */
24804 +#define REISERFS_KERNEL_MEM 0 /* kernel memory mode */
24805 +#define REISERFS_USER_MEM 1 /* user memory mode */
24807 +#define fs_generation(s) (REISERFS_SB(s)->s_generation_counter)
24808 +#define get_generation(s) atomic_read (&fs_generation(s))
24809 +#define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen)
24810 +#define __fs_changed(gen,s) (gen != get_generation (s))
24811 +#define fs_changed(gen,s) \
24812 +({ \
24813 + reiserfs_cond_resched(s); \
24814 + __fs_changed(gen, s); \
24815 +})
24817 +/***************************************************************************
24818 + * FIXATE NODES *
24819 + ***************************************************************************/
24821 +#define VI_TYPE_LEFT_MERGEABLE 1
24822 +#define VI_TYPE_RIGHT_MERGEABLE 2
24824 +/*
24825 + * To make any changes in the tree we always first find the node that
24826 + * contains the item to be changed/deleted, or the place to insert a new
24827 + * item. We call this node S. To do balancing we need to decide what
24828 + * we will shift to left/right neighbor, or to a new node, where new
24829 + * item will be etc. To make this analysis simpler we build virtual
24830 + * node. Virtual node is an array of items, that will replace items of
24831 + * node S. (For instance if we are going to delete an item, virtual
24832 + * node does not contain it). Virtual node keeps information about
24833 + * item sizes and types, mergeability of first and last items, sizes
24834 + * of all entries in directory item. We use this array of items when
24835 + * calculating what we can shift to neighbors and how many nodes we
24836 + * have to have if we do not do any shifting, if we shift to the left/right
24837 + * neighbor or to both.
24838 + */
24839 +struct virtual_item {
24840 + int vi_index; /* index in the array of item operations */
24841 + unsigned short vi_type; /* left/right mergeability */
24843 + /* length of item that it will have after balancing */
24844 + unsigned short vi_item_len;
24846 + struct item_head *vi_ih;
24847 + const char *vi_item; /* body of item (old or new) */
24848 + const void *vi_new_data; /* 0 always except in paste mode */
24849 + void *vi_uarea; /* item specific area */
24850 +};
24852 +struct virtual_node {
24853 + /* this is a pointer to the free space in the buffer */
24854 + char *vn_free_ptr;
24856 + unsigned short vn_nr_item; /* number of items in virtual node */
24858 + /*
24859 + * size of node, that node would have if it has
24860 + * unlimited size and no balancing is performed
24861 + */
24862 + short vn_size;
24864 + /* mode of balancing (paste, insert, delete, cut) */
24865 + short vn_mode;
24867 + short vn_affected_item_num;
24868 + short vn_pos_in_item;
24870 + /* item header of inserted item, 0 for other modes */
24871 + struct item_head *vn_ins_ih;
24872 + const void *vn_data;
24874 + /* array of items (including a new one, excluding item to be deleted) */
24875 + struct virtual_item *vn_vi;
24876 +};
24878 +/* used by directory items when creating virtual nodes */
24879 +struct direntry_uarea {
24880 + int flags;
24881 + __u16 entry_count;
24882 + __u16 entry_sizes[];
24883 +} __attribute__ ((__packed__));
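+/*
+ * A worked size note: with the packed layout above the fixed part of
+ * struct direntry_uarea is 6 bytes (a 4-byte flags word plus a 2-byte
+ * entry_count), and each directory entry adds one 2-byte element to
+ * entry_sizes[].
+ */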
24885 +/***************************************************************************
24886 + * TREE BALANCE *
24887 + ***************************************************************************/
24889 +/*
24890 + * This temporary structure is used in tree balance algorithms, and
24891 + * constructed as we go to the extent that its various parts are
24892 + * needed. It contains arrays of nodes that can potentially be
24893 + * involved in the balancing of node S, and parameters that define how
24894 + * each of the nodes must be balanced. Note that in these algorithms
24895 + * for balancing the worst case is to need to balance the current node
24896 + * S and the left and right neighbors and all of their parents plus
24897 + * create a new node. We implement S1 balancing for the leaf nodes
24898 + * and S0 balancing for the internal nodes (S1 and S0 are defined in
24899 + * our papers.)
24900 + */
24902 +/* size of the array of buffers to free at end of do_balance */
24903 +#define MAX_FREE_BLOCK 7
24905 +/* maximum number of FEB blocknrs on a single level */
24906 +#define MAX_AMOUNT_NEEDED 2
24908 +/* someday somebody will prefix every field in this struct with tb_ */
24909 +struct tree_balance {
24910 + int tb_mode;
24911 + int need_balance_dirty;
24912 + struct super_block *tb_sb;
24913 + struct reiserfs_transaction_handle *transaction_handle;
24914 + struct treepath *tb_path;
24916 + /* array of left neighbors of nodes in the path */
24917 + struct buffer_head *L[MAX_HEIGHT];
24919 + /* array of right neighbors of nodes in the path */
24920 + struct buffer_head *R[MAX_HEIGHT];
24922 + /* array of fathers of the left neighbors */
24923 + struct buffer_head *FL[MAX_HEIGHT];
24925 + /* array of fathers of the right neighbors */
24926 + struct buffer_head *FR[MAX_HEIGHT];
24927 + /* array of common parents of center node and its left neighbor */
24928 + struct buffer_head *CFL[MAX_HEIGHT];
24930 + /* array of common parents of center node and its right neighbor */
24931 + struct buffer_head *CFR[MAX_HEIGHT];
24933 + /*
24934 + * array of empty buffers. Number of buffers in array equals
24935 + * cur_blknum.
24936 + */
24937 + struct buffer_head *FEB[MAX_FEB_SIZE];
24938 + struct buffer_head *used[MAX_FEB_SIZE];
24939 + struct buffer_head *thrown[MAX_FEB_SIZE];
24941 + /*
24942 + * array of number of items which must be shifted to the left in
24943 + * order to balance the current node; for leaves includes item that
24944 + * will be partially shifted; for internal nodes, it is the number
24945 + * of child pointers rather than items. It includes the new item
24946 + * being created. The code sometimes subtracts one to get the
24947 + * number of wholly shifted items for other purposes.
24948 + */
24949 + int lnum[MAX_HEIGHT];
24951 + /* substitute right for left in comment above */
24952 + int rnum[MAX_HEIGHT];
24954 + /*
24955 + * array indexed by height h mapping the key delimiting L[h] and
24956 + * S[h] to its item number within the node CFL[h]
24957 + */
24958 + int lkey[MAX_HEIGHT];
24960 + /* substitute r for l in comment above */
24961 + int rkey[MAX_HEIGHT];
24963 + /*
24964 + * the number of bytes by which we are trying to add or remove from
24965 + * S[h]. A negative value means removing.
24966 + */
24967 + int insert_size[MAX_HEIGHT];
24969 + /*
24970 + * number of nodes that will replace node S[h] after balancing
24971 + * on the level h of the tree. If 0 then S is being deleted,
24972 + * if 1 then S is remaining and no new nodes are being created,
24973 + * if 2 or 3 then 1 or 2 new nodes are being created
24974 + */
24975 + int blknum[MAX_HEIGHT];
24977 + /* fields that are used only for balancing leaves of the tree */
24979 + /* number of empty blocks having been already allocated */
24980 + int cur_blknum;
24982 + /* number of items that fall into left most node when S[0] splits */
24983 + int s0num;
24985 + /*
24986 + * number of bytes which can flow to the left neighbor from the left
24987 + * most liquid item that cannot be shifted from S[0] entirely
24988 + * if -1 then nothing will be partially shifted
24989 + */
24990 + int lbytes;
24992 + /*
24993 + * number of bytes which will flow to the right neighbor from the right
24994 + * most liquid item that cannot be shifted from S[0] entirely
24995 + * if -1 then nothing will be partially shifted
24996 + */
24997 + int rbytes;
25000 + /*
25001 + * index into the array of item headers in
25002 + * S[0] of the affected item
25003 + */
25004 + int item_pos;
25006 + /* new nodes allocated to hold what could not fit into S */
25007 + struct buffer_head *S_new[2];
25009 + /*
25010 + * number of items that will be placed into nodes in S_new
25011 + * when S[0] splits
25012 + */
25013 + int snum[2];
25015 + /*
25016 + * number of bytes which flow to nodes in S_new when S[0] splits
25017 + * note: if S[0] splits into 3 nodes, then items do not need to be cut
25018 + */
25019 + int sbytes[2];
25021 + int pos_in_item;
25022 + int zeroes_num;
25024 + /*
25025 + * buffers which are to be freed after do_balance finishes
25026 + * by unfix_nodes
25027 + */
25028 + struct buffer_head *buf_to_free[MAX_FREE_BLOCK];
25030 + /*
25031 + * kmalloced memory. Used to create virtual node and keep
25032 + * map of dirtied bitmap blocks
25033 + */
25034 + char *vn_buf;
25036 + int vn_buf_size; /* size of the vn_buf */
25038 + /* VN starts after bitmap of bitmap blocks */
25039 + struct virtual_node *tb_vn;
25041 + /*
25042 + * saved value of `reiserfs_generation' counter; see
25043 + * FILESYSTEM_CHANGED() macro in reiserfs_fs.h
25044 + */
25045 + int fs_gen;
25047 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
25048 + /*
25049 + * key pointer, to pass to block allocator or
25050 + * another low-level subsystem
25051 + */
25052 + struct in_core_key key;
25053 +#endif
25054 +};
25056 +/* These are modes of balancing */
25058 +/* When inserting an item. */
25059 +#define M_INSERT 'i'
25060 +/*
25061 + * When inserting into (directories only) or appending onto an already
25062 + * existent item.
25063 + */
25064 +#define M_PASTE 'p'
25065 +/* When deleting an item. */
25066 +#define M_DELETE 'd'
25067 +/* When truncating an item or removing an entry from a (directory) item. */
25068 +#define M_CUT 'c'
25070 +/* used when balancing on leaf level skipped (in reiserfsck) */
25071 +#define M_INTERNAL 'n'
25073 +/*
25074 + * When further balancing is not needed, then do_balance does not need
25075 + * to be called.
25076 + */
25077 +#define M_SKIP_BALANCING 's'
25078 +#define M_CONVERT 'v'
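+/*
+ * A minimal dispatch sketch (illustrative only; the real consumers of
+ * these mode characters are fix_nodes() and do_balance(), declared
+ * further below):
+ */
+static inline const char *balance_mode_name(int mode)
+{
+ switch (mode) {
+ case M_INSERT: return "insert";
+ case M_PASTE: return "paste";
+ case M_DELETE: return "delete";
+ case M_CUT: return "cut";
+ case M_INTERNAL: return "internal";
+ default: return "skip/convert";
+ }
+}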
25080 +/* modes of leaf_move_items */
25081 +#define LEAF_FROM_S_TO_L 0
25082 +#define LEAF_FROM_S_TO_R 1
25083 +#define LEAF_FROM_R_TO_L 2
25084 +#define LEAF_FROM_L_TO_R 3
25085 +#define LEAF_FROM_S_TO_SNEW 4
25087 +#define FIRST_TO_LAST 0
25088 +#define LAST_TO_FIRST 1
25090 +/*
25091 + * used in do_balance for passing parent-of-node information that has
25092 + * been obtained from the tb struct
25093 + */
25094 +struct buffer_info {
25095 + struct tree_balance *tb;
25096 + struct buffer_head *bi_bh;
25097 + struct buffer_head *bi_parent;
25098 + int bi_position;
25099 +};
25101 +static inline struct super_block *sb_from_tb(struct tree_balance *tb)
25102 +{
25103 + return tb ? tb->tb_sb : NULL;
25104 +}
25106 +static inline struct super_block *sb_from_bi(struct buffer_info *bi)
25107 +{
25108 + return bi ? sb_from_tb(bi->tb) : NULL;
25109 +}
25111 +/*
25112 + * there are 4 types of items: stat data, directory item, indirect, direct.
25113 + * +-------------------+------------+--------------+------------+
25114 + * | | k_offset | k_uniqueness | mergeable? |
25115 + * +-------------------+------------+--------------+------------+
25116 + * | stat data | 0 | 0 | no |
25117 + * +-------------------+------------+--------------+------------+
25118 + * | 1st directory item| DOT_OFFSET | DIRENTRY_ .. | no |
25119 + * | non 1st directory | hash value | UNIQUENESS | yes |
25120 + * | item | | | |
25121 + * +-------------------+------------+--------------+------------+
25122 + * | indirect item | offset + 1 |TYPE_INDIRECT | [1] |
25123 + * +-------------------+------------+--------------+------------+
25124 + * | direct item | offset + 1 |TYPE_DIRECT | [2] |
25125 + * +-------------------+------------+--------------+------------+
25127 + * [1] if this is not the first indirect item of the object
25128 + * [2] if this is not the first direct item of the object
25129 + */
25131 +struct item_operations {
25132 + int (*bytes_number) (struct item_head * ih, int block_size);
25133 + void (*decrement_key) (struct cpu_key *);
25134 + int (*is_left_mergeable) (struct reiserfs_key * ih,
25135 + unsigned long bsize);
25136 + void (*print_item) (struct item_head *, char *item);
25137 + void (*check_item) (struct item_head *, char *item);
25139 + int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi,
25140 + int is_affected, int insert_size);
25141 + int (*check_left) (struct virtual_item * vi, int free,
25142 + int start_skip, int end_skip);
25143 + int (*check_right) (struct virtual_item * vi, int free);
25144 + int (*part_size) (struct virtual_item * vi, int from, int to);
25145 + int (*unit_num) (struct virtual_item * vi);
25146 + void (*print_vi) (struct virtual_item * vi);
25147 +};
25149 +extern struct item_operations *item_ops[TYPE_ANY + 1];
25151 +#define op_bytes_number(ih,bsize) item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize)
25152 +#define op_is_left_mergeable(key,bsize) item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize)
25153 +#define op_print_item(ih,item) item_ops[le_ih_k_type (ih)]->print_item (ih, item)
25154 +#define op_check_item(ih,item) item_ops[le_ih_k_type (ih)]->check_item (ih, item)
25155 +#define op_create_vi(vn,vi,is_affected,insert_size) item_ops[le_ih_k_type ((vi)->vi_ih)]->create_vi (vn,vi,is_affected,insert_size)
25156 +#define op_check_left(vi,free,start_skip,end_skip) item_ops[(vi)->vi_index]->check_left (vi, free, start_skip, end_skip)
25157 +#define op_check_right(vi,free) item_ops[(vi)->vi_index]->check_right (vi, free)
25158 +#define op_part_size(vi,from,to) item_ops[(vi)->vi_index]->part_size (vi, from, to)
25159 +#define op_unit_num(vi) item_ops[(vi)->vi_index]->unit_num (vi)
25160 +#define op_print_vi(vi) item_ops[(vi)->vi_index]->print_vi (vi)
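+/*
+ * A minimal usage sketch (illustrative): the op_ macros above dispatch
+ * through item_ops[] by item type, so for an item header ih in a node
+ * of block size bsize one would write, e.g.:
+ *
+ * int bytes = op_bytes_number(ih, bsize);
+ * op_print_item(ih, item_body);
+ */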
25162 +#define COMP_SHORT_KEYS comp_short_keys
25164 +/* number of blocks pointed to by the indirect item */
25165 +#define I_UNFM_NUM(ih) (ih_item_len(ih) / UNFM_P_SIZE)
25167 +/*
25168 + * the used space within the unformatted node corresponding
25169 + * to pos within the item pointed to by ih
25170 + */
25171 +#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size))
25173 +/*
25174 + * number of bytes contained by the direct item or the
25175 + * unformatted nodes the indirect item points to
25176 + */
25178 +/* following defines use reiserfs buffer header and item header */
25180 +/* get stat-data */
25181 +#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) )
25183 +/* this is 3976 for size==4096 */
25184 +#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE)
25186 +/*
25187 + * indirect items consist of entries which contain blocknrs, pos
25188 + * indicates which entry, and B_I_POS_UNFM_POINTER resolves to the
25189 + * blocknr contained by the entry pos points to
25190 + */
25191 +#define B_I_POS_UNFM_POINTER(bh, ih, pos) \
25192 + le32_to_cpu(*(((unp_t *)ih_item_body(bh, ih)) + (pos)))
25193 +#define PUT_B_I_POS_UNFM_POINTER(bh, ih, pos, val) \
25194 + (*(((unp_t *)ih_item_body(bh, ih)) + (pos)) = cpu_to_le32(val))
25196 +struct reiserfs_iget_args {
25197 + __u32 objectid;
25198 + __u32 dirid;
25199 +};
25201 +/***************************************************************************
25202 + * FUNCTION DECLARATIONS *
25203 + ***************************************************************************/
25205 +#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
25207 +#define journal_trans_half(blocksize) \
25208 + ((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32))
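+/*
+ * A worked example, assuming the 12-byte reiserfs_journal_desc header
+ * defined below: journal_trans_half(4096) = (4096 - 12 - 12) / 4 =
+ * 1018 block numbers per description block.
+ */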
25210 +/* journal.c -- see journal.c for all the comments here */
25212 +/* first block written in a commit. */
25213 +struct reiserfs_journal_desc {
25214 + __le32 j_trans_id; /* id of commit */
25216 + /* length of commit. len +1 is the commit block */
25217 + __le32 j_len;
25219 + __le32 j_mount_id; /* mount id of this trans */
25220 + __le32 j_realblock[]; /* real locations for each block */
25221 +};
25223 +#define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id)
25224 +#define get_desc_trans_len(d) le32_to_cpu((d)->j_len)
25225 +#define get_desc_mount_id(d) le32_to_cpu((d)->j_mount_id)
25227 +#define set_desc_trans_id(d,val) do { (d)->j_trans_id = cpu_to_le32 (val); } while (0)
25228 +#define set_desc_trans_len(d,val) do { (d)->j_len = cpu_to_le32 (val); } while (0)
25229 +#define set_desc_mount_id(d,val) do { (d)->j_mount_id = cpu_to_le32 (val); } while (0)
25231 +/* last block written in a commit */
25232 +struct reiserfs_journal_commit {
25233 + __le32 j_trans_id; /* must match j_trans_id from the desc block */
25234 + __le32 j_len; /* ditto */
25235 + __le32 j_realblock[]; /* real locations for each block */
25236 +};
25238 +#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id)
25239 +#define get_commit_trans_len(c) le32_to_cpu((c)->j_len)
25240 +#define get_commit_mount_id(c) le32_to_cpu((c)->j_mount_id)
25242 +#define set_commit_trans_id(c,val) do { (c)->j_trans_id = cpu_to_le32 (val); } while (0)
25243 +#define set_commit_trans_len(c,val) do { (c)->j_len = cpu_to_le32 (val); } while (0)
25245 +/*
25246 + * this header block gets written whenever a transaction is considered
25247 + * fully flushed, and is more recent than the last fully flushed transaction.
25248 + * fully flushed means all the log blocks and all the real blocks are on
25249 + * disk, and this transaction does not need to be replayed.
25250 + */
25251 +struct reiserfs_journal_header {
25252 + /* id of last fully flushed transaction */
25253 + __le32 j_last_flush_trans_id;
25255 + /* offset in the log of where to start replay after a crash */
25256 + __le32 j_first_unflushed_offset;
25258 + __le32 j_mount_id;
25259 + /* 12 */ struct journal_params jh_journal;
25260 +};
25262 +/* biggest tunable defines are right here */
25263 +#define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */
25265 +/* biggest possible single transaction, don't change for now (8/3/99) */
25266 +#define JOURNAL_TRANS_MAX_DEFAULT 1024
25267 +#define JOURNAL_TRANS_MIN_DEFAULT 256
25270 + * max blocks to batch into one transaction,
25271 + * don't make this any bigger than 900
25272 + */
25273 +#define JOURNAL_MAX_BATCH_DEFAULT 900
25274 +#define JOURNAL_MIN_RATIO 2
25275 +#define JOURNAL_MAX_COMMIT_AGE 30
25276 +#define JOURNAL_MAX_TRANS_AGE 30
25277 +#define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9)
25278 +#define JOURNAL_BLOCKS_PER_OBJECT(sb) (JOURNAL_PER_BALANCE_CNT * 3 + \
25279 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(sb) + \
25280 + REISERFS_QUOTA_TRANS_BLOCKS(sb)))
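+/*
+ * Worked numbers, assuming MAX_HEIGHT == 5 as defined earlier in this
+ * header: JOURNAL_PER_BALANCE_CNT = 3 * (5 - 2) + 9 = 18, so one
+ * object needs 18 * 3 = 54 journal blocks plus the quota reservations
+ * computed below.
+ */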
25282 +#ifdef CONFIG_QUOTA
25283 +#define REISERFS_QUOTA_OPTS ((1 << REISERFS_USRQUOTA) | (1 << REISERFS_GRPQUOTA))
25284 +/* We need to update data and inode (atime) */
25285 +#define REISERFS_QUOTA_TRANS_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? 2 : 0)
25286 +/* 1 balancing, 1 bitmap, 1 data per write + stat data update */
25287 +#define REISERFS_QUOTA_INIT_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
25288 +(DQUOT_INIT_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_INIT_REWRITE+1) : 0)
25289 +/* same as with INIT */
25290 +#define REISERFS_QUOTA_DEL_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
25291 +(DQUOT_DEL_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_DEL_REWRITE+1) : 0)
25292 +#else
25293 +#define REISERFS_QUOTA_TRANS_BLOCKS(s) 0
25294 +#define REISERFS_QUOTA_INIT_BLOCKS(s) 0
25295 +#define REISERFS_QUOTA_DEL_BLOCKS(s) 0
25296 +#endif
25298 +/*
25299 + * both of these can be as low as 1, or as high as you want. The min is the
25300 + * number of 4k bitmap nodes preallocated on mount. New nodes are allocated
25301 + * as needed, and released when transactions are committed. On release, if
25302 + * the current number of nodes is > max, the node is freed, otherwise,
25303 + * it is put on a free list for faster use later.
25304 + */
25305 +#define REISERFS_MIN_BITMAP_NODES 10
25306 +#define REISERFS_MAX_BITMAP_NODES 100
25308 +/* these are based on journal hash size of 8192 */
25309 +#define JBH_HASH_SHIFT 13
25310 +#define JBH_HASH_MASK 8191
25312 +#define _jhashfn(sb,block) \
25313 + (((unsigned long)sb>>L1_CACHE_SHIFT) ^ \
25314 + (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
25315 +#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
25317 +/* We need these to make journal.c code more readable */
25318 +#define journal_find_get_block(s, block) __find_get_block(\
25319 + file_bdev(SB_JOURNAL(s)->j_bdev_file), block, s->s_blocksize)
25320 +#define journal_getblk(s, block) __getblk(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
25321 + block, s->s_blocksize)
25322 +#define journal_bread(s, block) __bread(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
25323 + block, s->s_blocksize)
25325 +enum reiserfs_bh_state_bits {
25326 + BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */
25327 + BH_JDirty_wait,
25328 + /*
25329 + * disk block was taken off free list before being in a
25330 + * finished transaction, or written to disk. Can be reused immed.
25331 + */
25332 + BH_JNew,
25333 + BH_JPrepared,
25334 + BH_JRestore_dirty,
25335 + BH_JTest, /* debugging only will go away */
25336 +};
25338 +BUFFER_FNS(JDirty, journaled);
25339 +TAS_BUFFER_FNS(JDirty, journaled);
25340 +BUFFER_FNS(JDirty_wait, journal_dirty);
25341 +TAS_BUFFER_FNS(JDirty_wait, journal_dirty);
25342 +BUFFER_FNS(JNew, journal_new);
25343 +TAS_BUFFER_FNS(JNew, journal_new);
25344 +BUFFER_FNS(JPrepared, journal_prepared);
25345 +TAS_BUFFER_FNS(JPrepared, journal_prepared);
25346 +BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
25347 +TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
25348 +BUFFER_FNS(JTest, journal_test);
25349 +TAS_BUFFER_FNS(JTest, journal_test);
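+/*
+ * A hedged note on the expansions above: BUFFER_FNS(JDirty, journaled)
+ * generates buffer_journaled(bh), set_buffer_journaled(bh) and
+ * clear_buffer_journaled(bh), while TAS_BUFFER_FNS adds the atomic
+ * test_set_buffer_journaled(bh) / test_clear_buffer_journaled(bh)
+ * variants (see <linux/buffer_head.h>).
+ */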
25351 +/* transaction handle which is passed around for all journal calls */
25352 +struct reiserfs_transaction_handle {
25353 + /*
25354 + * super for this FS when journal_begin was called. saves calls to
25355 + * reiserfs_get_super also used by nested transactions to make
25356 + * sure they are nesting on the right FS _must_ be first
25357 + * in the handle
25358 + */
25359 + struct super_block *t_super;
25361 + int t_refcount;
25362 + int t_blocks_logged; /* number of blocks this writer has logged */
25363 + int t_blocks_allocated; /* number of blocks this writer allocated */
25365 + /* sanity check, equals the current trans id */
25366 + unsigned int t_trans_id;
25368 + void *t_handle_save; /* save existing current->journal_info */
25370 + /*
25371 + * if new block allocation occurres, that block
25372 + * should be displaced from others
25373 + */
25374 + unsigned displace_new_blocks:1;
25376 + struct list_head t_list;
25377 +};
25379 +/*
25380 + * used to keep track of ordered and tail writes, attached to the buffer
25381 + * head through b_journal_head.
25382 + */
25383 +struct reiserfs_jh {
25384 + struct reiserfs_journal_list *jl;
25385 + struct buffer_head *bh;
25386 + struct list_head list;
25387 +};
25389 +void reiserfs_free_jh(struct buffer_head *bh);
25390 +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
25391 +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
25392 +int journal_mark_dirty(struct reiserfs_transaction_handle *,
25393 + struct buffer_head *bh);
25395 +static inline int reiserfs_file_data_log(struct inode *inode)
25396 +{
25397 + if (reiserfs_data_log(inode->i_sb) ||
25398 + (REISERFS_I(inode)->i_flags & i_data_log))
25399 + return 1;
25400 + return 0;
25401 +}
25403 +static inline int reiserfs_transaction_running(struct super_block *s)
25404 +{
25405 + struct reiserfs_transaction_handle *th = current->journal_info;
25406 + if (th && th->t_super == s)
25407 + return 1;
25408 + if (th && th->t_super == NULL)
25409 + BUG();
25410 + return 0;
25411 +}
25413 +static inline int reiserfs_transaction_free_space(struct reiserfs_transaction_handle *th)
25414 +{
25415 + return th->t_blocks_allocated - th->t_blocks_logged;
25416 +}
25418 +struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
25419 + super_block
25420 + *,
25421 + int count);
25422 +int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
25423 +void reiserfs_vfs_truncate_file(struct inode *inode);
25424 +int reiserfs_commit_page(struct inode *inode, struct page *page,
25425 + unsigned from, unsigned to);
25426 +void reiserfs_flush_old_commits(struct super_block *);
25427 +int reiserfs_commit_for_inode(struct inode *);
25428 +int reiserfs_inode_needs_commit(struct inode *);
25429 +void reiserfs_update_inode_transaction(struct inode *);
25430 +void reiserfs_wait_on_write_block(struct super_block *s);
25431 +void reiserfs_block_writes(struct reiserfs_transaction_handle *th);
25432 +void reiserfs_allow_writes(struct super_block *s);
25433 +void reiserfs_check_lock_depth(struct super_block *s, char *caller);
25434 +int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh,
25435 + int wait);
25436 +void reiserfs_restore_prepared_buffer(struct super_block *,
25437 + struct buffer_head *bh);
25438 +int journal_init(struct super_block *, const char *j_dev_name, int old_format,
25439 + unsigned int);
25440 +int journal_release(struct reiserfs_transaction_handle *, struct super_block *);
25441 +int journal_release_error(struct reiserfs_transaction_handle *,
25442 + struct super_block *);
25443 +int journal_end(struct reiserfs_transaction_handle *);
25444 +int journal_end_sync(struct reiserfs_transaction_handle *);
25445 +int journal_mark_freed(struct reiserfs_transaction_handle *,
25446 + struct super_block *, b_blocknr_t blocknr);
25447 +int journal_transaction_should_end(struct reiserfs_transaction_handle *, int);
25448 +int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr,
25449 + int bit_nr, int searchall, b_blocknr_t *next);
25450 +int journal_begin(struct reiserfs_transaction_handle *,
25451 + struct super_block *sb, unsigned long);
25452 +int journal_join_abort(struct reiserfs_transaction_handle *,
25453 + struct super_block *sb);
25454 +void reiserfs_abort_journal(struct super_block *sb, int errno);
25455 +void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
25456 +int reiserfs_allocate_list_bitmaps(struct super_block *s,
25457 + struct reiserfs_list_bitmap *, unsigned int);
25459 +void reiserfs_schedule_old_flush(struct super_block *s);
25460 +void reiserfs_cancel_old_flush(struct super_block *s);
25461 +void add_save_link(struct reiserfs_transaction_handle *th,
25462 + struct inode *inode, int truncate);
25463 +int remove_save_link(struct inode *inode, int truncate);
25465 +/* objectid.c */
25466 +__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th);
25467 +void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
25468 + __u32 objectid_to_release);
25469 +int reiserfs_convert_objectid_map_v1(struct super_block *);
25471 +/* stree.c */
25472 +int B_IS_IN_TREE(const struct buffer_head *);
25473 +extern void copy_item_head(struct item_head *to,
25474 + const struct item_head *from);
25476 +/* first key is in le form, second - cpu */
25477 +extern int comp_short_keys(const struct reiserfs_key *le_key,
25478 + const struct cpu_key *cpu_key);
25479 +extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from);
25481 +/* both are in le form */
25482 +extern int comp_le_keys(const struct reiserfs_key *,
25483 + const struct reiserfs_key *);
25484 +extern int comp_short_le_keys(const struct reiserfs_key *,
25485 + const struct reiserfs_key *);
25487 +/* get key version from on disk key - kludge */
25488 +static inline int le_key_version(const struct reiserfs_key *key)
25489 +{
25490 + int type;
25492 + type = offset_v2_k_type(&(key->u.k_offset_v2));
25493 + if (type != TYPE_DIRECT && type != TYPE_INDIRECT
25494 + && type != TYPE_DIRENTRY)
25495 + return KEY_FORMAT_3_5;
25497 + return KEY_FORMAT_3_6;
25498 +}
25501 +static inline void copy_key(struct reiserfs_key *to,
25502 + const struct reiserfs_key *from)
25503 +{
25504 + memcpy(to, from, KEY_SIZE);
25505 +}
25507 +int comp_items(const struct item_head *stored_ih, const struct treepath *path);
25508 +const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
25509 + const struct super_block *sb);
25510 +int search_by_key(struct super_block *, const struct cpu_key *,
25511 + struct treepath *, int);
25512 +#define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL)
25513 +int search_for_position_by_key(struct super_block *sb,
25514 + const struct cpu_key *cpu_key,
25515 + struct treepath *search_path);
25516 +extern void decrement_bcount(struct buffer_head *bh);
25517 +void decrement_counters_in_path(struct treepath *search_path);
25518 +void pathrelse(struct treepath *search_path);
25519 +int reiserfs_check_path(struct treepath *p);
25520 +void pathrelse_and_restore(struct super_block *s, struct treepath *search_path);
25522 +int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
25523 + struct treepath *path,
25524 + const struct cpu_key *key,
25525 + struct item_head *ih,
25526 + struct inode *inode, const char *body);
25528 +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
25529 + struct treepath *path,
25530 + const struct cpu_key *key,
25531 + struct inode *inode,
25532 + const char *body, int paste_size);
25534 +int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
25535 + struct treepath *path,
25536 + struct cpu_key *key,
25537 + struct inode *inode,
25538 + struct page *page, loff_t new_file_size);
25540 +int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
25541 + struct treepath *path,
25542 + const struct cpu_key *key,
25543 + struct inode *inode, struct buffer_head *un_bh);
25545 +void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
25546 + struct inode *inode, struct reiserfs_key *key);
25547 +int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
25548 + struct inode *inode);
25549 +int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
25550 + struct inode *inode, struct page *,
25551 + int update_timestamps);
25553 +#define i_block_size(inode) ((inode)->i_sb->s_blocksize)
25554 +#define file_size(inode) ((inode)->i_size)
25555 +#define tail_size(inode) (file_size (inode) & (i_block_size (inode) - 1))
25557 +#define tail_has_to_be_packed(inode) (have_large_tails ((inode)->i_sb)?\
25558 +!STORE_TAIL_IN_UNFM_S1(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):have_small_tails ((inode)->i_sb)?!STORE_TAIL_IN_UNFM_S2(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):0 )
25560 +void padd_item(char *item, int total_length, int length);
25562 +/* inode.c */
25563 +/* args for the create parameter of reiserfs_get_block */
25564 +#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */
25565 +#define GET_BLOCK_CREATE 1 /* add anything you need to find block */
25566 +#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */
25567 +#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */
25568 +#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */
25569 +#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */
25571 +void reiserfs_read_locked_inode(struct inode *inode,
25572 + struct reiserfs_iget_args *args);
25573 +int reiserfs_find_actor(struct inode *inode, void *p);
25574 +int reiserfs_init_locked_inode(struct inode *inode, void *p);
25575 +void reiserfs_evict_inode(struct inode *inode);
25576 +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc);
25577 +int reiserfs_get_block(struct inode *inode, sector_t block,
25578 + struct buffer_head *bh_result, int create);
25579 +struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
25580 + int fh_len, int fh_type);
25581 +struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
25582 + int fh_len, int fh_type);
25583 +int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
25584 + struct inode *parent);
25586 +int reiserfs_truncate_file(struct inode *, int update_timestamps);
25587 +void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset,
25588 + int type, int key_length);
25589 +void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
25590 + int version,
25591 + loff_t offset, int type, int length, int entry_count);
25592 +struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key);
25594 +struct reiserfs_security_handle;
25595 +int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
25596 + struct inode *dir, umode_t mode,
25597 + const char *symname, loff_t i_size,
25598 + struct dentry *dentry, struct inode *inode,
25599 + struct reiserfs_security_handle *security);
25601 +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
25602 + struct inode *inode, loff_t size);
25604 +static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
25605 + struct inode *inode)
25606 +{
25607 + reiserfs_update_sd_size(th, inode, inode->i_size);
25608 +}
25610 +void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
25611 +int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
25612 + struct iattr *attr);
25614 +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
25616 +/* namei.c */
25617 +void reiserfs_init_priv_inode(struct inode *inode);
25618 +void set_de_name_and_namelen(struct reiserfs_dir_entry *de);
25619 +int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
25620 + struct treepath *path, struct reiserfs_dir_entry *de);
25621 +struct dentry *reiserfs_get_parent(struct dentry *);
25623 +#ifdef CONFIG_REISERFS_PROC_INFO
25624 +int reiserfs_proc_info_init(struct super_block *sb);
25625 +int reiserfs_proc_info_done(struct super_block *sb);
25626 +int reiserfs_proc_info_global_init(void);
25627 +int reiserfs_proc_info_global_done(void);
25629 +#define PROC_EXP( e ) e
25631 +#define __PINFO( sb ) REISERFS_SB(sb) -> s_proc_info_data
25632 +#define PROC_INFO_MAX( sb, field, value ) \
25633 + __PINFO( sb ).field = \
25634 + max( REISERFS_SB( sb ) -> s_proc_info_data.field, value )
25635 +#define PROC_INFO_INC( sb, field ) ( ++ ( __PINFO( sb ).field ) )
25636 +#define PROC_INFO_ADD( sb, field, val ) ( __PINFO( sb ).field += ( val ) )
25637 +#define PROC_INFO_BH_STAT( sb, bh, level ) \
25638 + PROC_INFO_INC( sb, sbk_read_at[ ( level ) ] ); \
25639 + PROC_INFO_ADD( sb, free_at[ ( level ) ], B_FREE_SPACE( bh ) ); \
25640 + PROC_INFO_ADD( sb, items_at[ ( level ) ], B_NR_ITEMS( bh ) )
25641 +#else
25642 +static inline int reiserfs_proc_info_init(struct super_block *sb)
25643 +{
25644 + return 0;
25645 +}
25647 +static inline int reiserfs_proc_info_done(struct super_block *sb)
25648 +{
25649 + return 0;
25650 +}
25652 +static inline int reiserfs_proc_info_global_init(void)
25653 +{
25654 + return 0;
25655 +}
25657 +static inline int reiserfs_proc_info_global_done(void)
25658 +{
25659 + return 0;
25660 +}
25662 +#define PROC_EXP( e )
25663 +#define VOID_V ( ( void ) 0 )
25664 +#define PROC_INFO_MAX( sb, field, value ) VOID_V
25665 +#define PROC_INFO_INC( sb, field ) VOID_V
25666 +#define PROC_INFO_ADD( sb, field, val ) VOID_V
25667 +#define PROC_INFO_BH_STAT(sb, bh, n_node_level) VOID_V
25668 +#endif
25670 +/* dir.c */
25671 +extern const struct inode_operations reiserfs_dir_inode_operations;
25672 +extern const struct inode_operations reiserfs_symlink_inode_operations;
25673 +extern const struct inode_operations reiserfs_special_inode_operations;
25674 +extern const struct file_operations reiserfs_dir_operations;
25675 +int reiserfs_readdir_inode(struct inode *, struct dir_context *);
25677 +/* tail_conversion.c */
25678 +int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
25679 + struct treepath *, struct buffer_head *, loff_t);
25680 +int indirect2direct(struct reiserfs_transaction_handle *, struct inode *,
25681 + struct page *, struct treepath *, const struct cpu_key *,
25682 + loff_t, char *);
25683 +void reiserfs_unmap_buffer(struct buffer_head *);
25685 +/* file.c */
25686 +extern const struct inode_operations reiserfs_file_inode_operations;
25687 +extern const struct inode_operations reiserfs_priv_file_inode_operations;
25688 +extern const struct file_operations reiserfs_file_operations;
25689 +extern const struct address_space_operations reiserfs_address_space_operations;
25691 +/* fix_nodes.c */
25693 +int fix_nodes(int n_op_mode, struct tree_balance *tb,
25694 + struct item_head *ins_ih, const void *);
25695 +void unfix_nodes(struct tree_balance *);
25697 +/* prints.c */
25698 +void __reiserfs_panic(struct super_block *s, const char *id,
25699 + const char *function, const char *fmt, ...)
25700 + __attribute__ ((noreturn));
25701 +#define reiserfs_panic(s, id, fmt, args...) \
25702 + __reiserfs_panic(s, id, __func__, fmt, ##args)
25703 +void __reiserfs_error(struct super_block *s, const char *id,
25704 + const char *function, const char *fmt, ...);
25705 +#define reiserfs_error(s, id, fmt, args...) \
25706 + __reiserfs_error(s, id, __func__, fmt, ##args)
25707 +void reiserfs_info(struct super_block *s, const char *fmt, ...);
25708 +void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...);
25709 +void print_indirect_item(struct buffer_head *bh, int item_num);
25710 +void store_print_tb(struct tree_balance *tb);
25711 +void print_cur_tb(char *mes);
25712 +void print_de(struct reiserfs_dir_entry *de);
25713 +void print_bi(struct buffer_info *bi, char *mes);
25714 +#define PRINT_LEAF_ITEMS 1 /* print all items */
25715 +#define PRINT_DIRECTORY_ITEMS 2 /* print directory items */
25716 +#define PRINT_DIRECT_ITEMS 4 /* print contents of direct items */
25717 +void print_block(struct buffer_head *bh, ...);
25718 +void print_bmap(struct super_block *s, int silent);
25719 +void print_bmap_block(int i, char *data, int size, int silent);
25720 +/*void print_super_block (struct super_block * s, char * mes);*/
25721 +void print_objectid_map(struct super_block *s);
25722 +void print_block_head(struct buffer_head *bh, char *mes);
25723 +void check_leaf(struct buffer_head *bh);
25724 +void check_internal(struct buffer_head *bh);
25725 +void print_statistics(struct super_block *s);
25726 +char *reiserfs_hashname(int code);
25728 +/* lbalance.c */
25729 +int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
25730 + int mov_bytes, struct buffer_head *Snew);
25731 +int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes);
25732 +int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes);
25733 +void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first,
25734 + int del_num, int del_bytes);
25735 +void leaf_insert_into_buf(struct buffer_info *bi, int before,
25736 + struct item_head * const inserted_item_ih,
25737 + const char * const inserted_item_body,
25738 + int zeros_number);
25739 +void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
25740 + int pos_in_item, int paste_size,
25741 + const char * const body, int zeros_number);
25742 +void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
25743 + int pos_in_item, int cut_size);
25744 +void leaf_paste_entries(struct buffer_info *bi, int item_num, int before,
25745 + int new_entry_count, struct reiserfs_de_head *new_dehs,
25746 + const char *records, int paste_size);
25747 +/* ibalance.c */
25748 +int balance_internal(struct tree_balance *, int, int, struct item_head *,
25749 + struct buffer_head **);
25751 +/* do_balance.c */
25752 +void do_balance_mark_leaf_dirty(struct tree_balance *tb,
25753 + struct buffer_head *bh, int flag);
25754 +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
25755 +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
25757 +void do_balance(struct tree_balance *tb, struct item_head *ih,
25758 + const char *body, int flag);
25759 +void reiserfs_invalidate_buffer(struct tree_balance *tb,
25760 + struct buffer_head *bh);
25762 +int get_left_neighbor_position(struct tree_balance *tb, int h);
25763 +int get_right_neighbor_position(struct tree_balance *tb, int h);
25764 +void replace_key(struct tree_balance *tb, struct buffer_head *, int,
25765 + struct buffer_head *, int);
25766 +void make_empty_node(struct buffer_info *);
25767 +struct buffer_head *get_FEB(struct tree_balance *);
25769 +/* bitmap.c */
25771 +/*
25772 + * structure contains hints for block allocator, and it is a container for
25773 + * arguments, such as node, search path, transaction_handle, etc.
25774 + */
25775 +struct __reiserfs_blocknr_hint {
25776 + /* inode passed to allocator, if we allocate unf. nodes */
25777 + struct inode *inode;
25779 + sector_t block; /* file offset, in blocks */
25780 + struct in_core_key key;
25782 + /*
25783 + * search path, used by allocator to determine search_start by
25784 + * various ways
25785 + */
25786 + struct treepath *path;
25788 + /*
25789 + * transaction handle is needed to log super blocks
25790 + * and bitmap blocks changes
25791 + */
25792 + struct reiserfs_transaction_handle *th;
25794 + b_blocknr_t beg, end;
25796 + /*
25797 + * a field used to transfer search start value (block number)
25798 + * between different block allocator procedures
25799 + * (determine_search_start() and others)
25800 + */
25801 + b_blocknr_t search_start;
25803 + /*
25804 + * is set in determine_prealloc_size() function,
25805 + * used by the underlying function that does the actual allocation
25806 + */
25807 + int prealloc_size;
25809 + /*
25810 + * the allocator uses different policies for getting disk
25811 + * space for formatted/unformatted blocks with/without preallocation
25812 + */
25813 + unsigned formatted_node:1;
25814 + unsigned preallocate:1;
25817 +typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
25819 +int reiserfs_parse_alloc_options(struct super_block *, char *);
25820 +void reiserfs_init_alloc_options(struct super_block *s);
25822 +/*
25823 + * given a directory, this will tell you what packing locality
25824 + * to use for a new object underneath it. The locality is returned
25825 + * in disk byte order (le).
25826 + */
25827 +__le32 reiserfs_choose_packing(struct inode *dir);
25829 +void show_alloc_options(struct seq_file *seq, struct super_block *s);
25830 +int reiserfs_init_bitmap_cache(struct super_block *sb);
25831 +void reiserfs_free_bitmap_cache(struct super_block *sb);
25832 +void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info);
25833 +struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, unsigned int bitmap);
25834 +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
25835 +void reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *,
25836 + b_blocknr_t, int for_unformatted);
25837 +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t *, int,
25838 + int);
25839 +static inline int reiserfs_new_form_blocknrs(struct tree_balance *tb,
25840 + b_blocknr_t * new_blocknrs,
25841 + int amount_needed)
25842 +{
25843 + reiserfs_blocknr_hint_t hint = {
25844 + .th = tb->transaction_handle,
25845 + .path = tb->tb_path,
25846 + .inode = NULL,
25847 + .key = tb->key,
25848 + .block = 0,
25849 + .formatted_node = 1
25850 + };
25851 + return reiserfs_allocate_blocknrs(&hint, new_blocknrs, amount_needed,
25852 + 0);
25853 +}
25855 +static inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle
25856 + *th, struct inode *inode,
25857 + b_blocknr_t * new_blocknrs,
25858 + struct treepath *path,
25859 + sector_t block)
25860 +{
25861 + reiserfs_blocknr_hint_t hint = {
25862 + .th = th,
25863 + .path = path,
25864 + .inode = inode,
25865 + .block = block,
25866 + .formatted_node = 0,
25867 + .preallocate = 0
25868 + };
25869 + return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
25870 +}
25872 +#ifdef REISERFS_PREALLOCATE
25873 +static inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle
25874 + *th, struct inode *inode,
25875 + b_blocknr_t * new_blocknrs,
25876 + struct treepath *path,
25877 + sector_t block)
25878 +{
25879 + reiserfs_blocknr_hint_t hint = {
25880 + .th = th,
25881 + .path = path,
25882 + .inode = inode,
25883 + .block = block,
25884 + .formatted_node = 0,
25885 + .preallocate = 1
25886 + };
25887 + return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
25888 +}
25890 +void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
25891 + struct inode *inode);
25892 +void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th);
25893 +#endif
25895 +/* hashes.c */
25896 +__u32 keyed_hash(const signed char *msg, int len);
25897 +__u32 yura_hash(const signed char *msg, int len);
25898 +__u32 r5_hash(const signed char *msg, int len);
25900 +#define reiserfs_set_le_bit __set_bit_le
25901 +#define reiserfs_test_and_set_le_bit __test_and_set_bit_le
25902 +#define reiserfs_clear_le_bit __clear_bit_le
25903 +#define reiserfs_test_and_clear_le_bit __test_and_clear_bit_le
25904 +#define reiserfs_test_le_bit test_bit_le
25905 +#define reiserfs_find_next_zero_le_bit find_next_zero_bit_le
25907 +/*
25908 + * sometimes reiserfs_truncate may need to allocate a few new blocks
25909 + * to perform indirect2direct conversion. People probably used to
25910 + * think, that truncate should work without problems on a filesystem
25911 + * without free disk space. They may complain that they can not
25912 + * truncate due to lack of free disk space. This spare space allows us
25913 + * to not worry about it. 500 is probably too much, but it should be
25914 + * absolutely safe
25915 + */
25916 +#define SPARE_SPACE 500
25918 +/* prototypes from ioctl.c */
25919 +int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
25920 +int reiserfs_fileattr_set(struct mnt_idmap *idmap,
25921 + struct dentry *dentry, struct fileattr *fa);
25922 +long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
25923 +long reiserfs_compat_ioctl(struct file *filp,
25924 + unsigned int cmd, unsigned long arg);
25925 +int reiserfs_unpack(struct inode *inode);
25926 diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
25927 new file mode 100644
25928 index 000000000000..7b498a0d060b
25929 --- /dev/null
25930 +++ b/fs/reiserfs/resize.c
25931 @@ -0,0 +1,230 @@
25932 +/*
25933 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
25934 + */
25936 +/*
25937 + * Written by Alexander Zarochentcev.
25938 + *
25939 + * The kernel part of the (on-line) reiserfs resizer.
25940 + */
25942 +#include <linux/kernel.h>
25943 +#include <linux/mm.h>
25944 +#include <linux/vmalloc.h>
25945 +#include <linux/string.h>
25946 +#include <linux/errno.h>
25947 +#include "reiserfs.h"
25948 +#include <linux/buffer_head.h>
25950 +int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
25951 +{
25952 + int err = 0;
25953 + struct reiserfs_super_block *sb;
25954 + struct reiserfs_bitmap_info *bitmap;
25955 + struct reiserfs_bitmap_info *info;
25956 + struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s);
25957 + struct buffer_head *bh;
25958 + struct reiserfs_transaction_handle th;
25959 + unsigned int bmap_nr_new, bmap_nr;
25960 + unsigned int block_r_new, block_r;
25962 + struct reiserfs_list_bitmap *jb;
25963 + struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS];
25965 + unsigned long int block_count, free_blocks;
25966 + int i;
25967 + int copy_size;
25968 + int depth;
25970 + sb = SB_DISK_SUPER_BLOCK(s);
25972 + if (SB_BLOCK_COUNT(s) >= block_count_new) {
25973 + printk("can\'t shrink filesystem on-line\n");
25974 + return -EINVAL;
25975 + }
25977 + /* check the device size */
25978 + depth = reiserfs_write_unlock_nested(s);
25979 + bh = sb_bread(s, block_count_new - 1);
25980 + reiserfs_write_lock_nested(s, depth);
25981 + if (!bh) {
25982 + printk("reiserfs_resize: can\'t read last block\n");
25983 + return -EINVAL;
25984 + }
25985 + bforget(bh);
25987 + /*
25988 + * old disk layout detection; those partitions can be mounted, but
25989 + * cannot be resized
25990 + */
25991 + if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size
25992 + != REISERFS_DISK_OFFSET_IN_BYTES) {
25993 + printk
25994 + ("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n");
25995 + return -ENOTSUPP;
25996 + }
25998 + /* count used bits in last bitmap block */
25999 + block_r = SB_BLOCK_COUNT(s) -
26000 + (reiserfs_bmap_count(s) - 1) * s->s_blocksize * 8;
26002 + /* count bitmap blocks in new fs */
26003 + bmap_nr_new = block_count_new / (s->s_blocksize * 8);
26004 + block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8;
26005 + if (block_r_new)
26006 + bmap_nr_new++;
26007 + else
26008 + block_r_new = s->s_blocksize * 8;
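+ /*
+ * A worked example (illustrative, 4 KiB blocks): one bitmap block
+ * maps 4096 * 8 = 32768 blocks, so block_count_new = 100000 gives
+ * bmap_nr_new = 100000 / 32768 = 3 with block_r_new = 1696 bits
+ * used in the last bitmap block, hence bmap_nr_new is bumped to 4.
+ */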
26010 + /* save old values */
26011 + block_count = SB_BLOCK_COUNT(s);
26012 + bmap_nr = reiserfs_bmap_count(s);
26014 + /* resizing of reiserfs bitmaps (journal and real), if needed */
26015 + if (bmap_nr_new > bmap_nr) {
26016 + /* reallocate journal bitmaps */
26017 + if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) {
26018 + printk
26019 + ("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
26020 + return -ENOMEM;
26021 + }
26022 + /*
26023 + * the new journal bitmaps are zero filled, now we copy in
26024 + * the bitmap node pointers from the old journal bitmap
26025 + * structs, and then transfer the new data structures
26026 + * into the journal struct.
26028 + * using the copy_size var below allows this code to work for
26029 + * both shrinking and expanding the FS.
26030 + */
26031 + copy_size = min(bmap_nr_new, bmap_nr);
26032 + copy_size =
26033 + copy_size * sizeof(struct reiserfs_list_bitmap_node *);
26034 + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
26035 + struct reiserfs_bitmap_node **node_tmp;
26036 + jb = SB_JOURNAL(s)->j_list_bitmap + i;
26037 + memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size);
26039 + /*
26040 + * just in case vfree schedules on us, copy the new
26041 + * pointer into the journal struct before freeing the
26042 + * old one
26043 + */
26044 + node_tmp = jb->bitmaps;
26045 + jb->bitmaps = jbitmap[i].bitmaps;
26046 + vfree(node_tmp);
26047 + }
26049 + /*
26050 + * allocate additional bitmap blocks, reallocate
26051 + * array of bitmap block pointers
26052 + */
26053 + bitmap =
26054 + vzalloc(array_size(bmap_nr_new,
26055 + sizeof(struct reiserfs_bitmap_info)));
26056 + if (!bitmap) {
26057 + /*
26058 + * Journal bitmaps are still supersized, but the
26059 + * memory isn't leaked, so I guess it's ok
26060 + */
26061 + printk("reiserfs_resize: unable to allocate memory.\n");
26062 + return -ENOMEM;
26063 + }
26064 + for (i = 0; i < bmap_nr; i++)
26065 + bitmap[i] = old_bitmap[i];
26067 + /*
26068 + * This doesn't go through the journal, but it doesn't have to.
26069 + * The changes are still atomic: We're synced up when the
26070 + * journal transaction begins, and the new bitmaps don't
26071 + * matter if the transaction fails.
26072 + */
26073 + for (i = bmap_nr; i < bmap_nr_new; i++) {
26074 + int depth;
26075 + /*
26076 + * don't use read_bitmap_block since it will cache
26077 + * the uninitialized bitmap
26078 + */
26079 + depth = reiserfs_write_unlock_nested(s);
26080 + bh = sb_bread(s, i * s->s_blocksize * 8);
26081 + reiserfs_write_lock_nested(s, depth);
26082 + if (!bh) {
26083 + vfree(bitmap);
26084 + return -EIO;
26085 + }
26086 + memset(bh->b_data, 0, sb_blocksize(sb));
26087 + reiserfs_set_le_bit(0, bh->b_data);
26088 + reiserfs_cache_bitmap_metadata(s, bh, bitmap + i);
26090 + set_buffer_uptodate(bh);
26091 + mark_buffer_dirty(bh);
26092 + depth = reiserfs_write_unlock_nested(s);
26093 + sync_dirty_buffer(bh);
26094 + reiserfs_write_lock_nested(s, depth);
26095 + /* update bitmap_info stuff */
26096 + bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
26097 + brelse(bh);
26098 + }
26099 + /* free old bitmap blocks array */
26100 + SB_AP_BITMAP(s) = bitmap;
26101 + vfree(old_bitmap);
26102 + }
26104 + /*
26105 + * begin transaction, if there was an error, it's fine. Yes, we have
26106 + * incorrect bitmaps now, but none of it is ever going to touch the
26107 + * disk anyway.
26108 + */
26109 + err = journal_begin(&th, s, 10);
26110 + if (err)
26111 + return err;
26113 + /* Extend old last bitmap block - new blocks have been made available */
26114 + info = SB_AP_BITMAP(s) + bmap_nr - 1;
26115 + bh = reiserfs_read_bitmap_block(s, bmap_nr - 1);
26116 + if (!bh) {
26117 + int jerr = journal_end(&th);
26118 + if (jerr)
26119 + return jerr;
26120 + return -EIO;
26121 + }
26123 + reiserfs_prepare_for_journal(s, bh, 1);
26124 + for (i = block_r; i < s->s_blocksize * 8; i++)
26125 + reiserfs_clear_le_bit(i, bh->b_data);
26126 + info->free_count += s->s_blocksize * 8 - block_r;
26128 + journal_mark_dirty(&th, bh);
26129 + brelse(bh);
26131 + /* Correct new last bitmap block - It may not be full */
26132 + info = SB_AP_BITMAP(s) + bmap_nr_new - 1;
26133 + bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1);
26134 + if (!bh) {
26135 + int jerr = journal_end(&th);
26136 + if (jerr)
26137 + return jerr;
26138 + return -EIO;
26139 + }
26141 + reiserfs_prepare_for_journal(s, bh, 1);
26142 + for (i = block_r_new; i < s->s_blocksize * 8; i++)
26143 + reiserfs_set_le_bit(i, bh->b_data);
26144 + journal_mark_dirty(&th, bh);
26145 + brelse(bh);
26147 + info->free_count -= s->s_blocksize * 8 - block_r_new;
26148 + /* update super */
26149 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
26150 + free_blocks = SB_FREE_BLOCKS(s);
26151 + PUT_SB_FREE_BLOCKS(s,
26152 + free_blocks + (block_count_new - block_count -
26153 + (bmap_nr_new - bmap_nr)));
26154 + PUT_SB_BLOCK_COUNT(s, block_count_new);
26155 + PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new);
26157 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
26159 + SB_JOURNAL(s)->j_must_wait = 1;
26160 + return journal_end(&th);
26161 +}
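+/*
+ * A hedged usage note: in mainline reiserfs this function is reached
+ * from reiserfs_remount() when the filesystem is remounted with a
+ * larger size, e.g. "mount -o remount,resize=1048576 /dev/sdXN /mnt".
+ */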
26162 diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
26163 new file mode 100644
26164 index 000000000000..5faf702f8d15
26165 --- /dev/null
26166 +++ b/fs/reiserfs/stree.c
26167 @@ -0,0 +1,2280 @@
26168 +/*
26169 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
26170 + */
26172 +/*
26173 + * Written by Anatoly P. Pinchuk pap@namesys.botik.ru
26174 + * Program Systems Institute
26175 + * Pereslavl-Zalessky Russia
26176 + */
26178 +#include <linux/time.h>
26179 +#include <linux/string.h>
26180 +#include <linux/pagemap.h>
26181 +#include <linux/bio.h>
26182 +#include "reiserfs.h"
26183 +#include <linux/buffer_head.h>
26184 +#include <linux/quotaops.h>
26186 +/* Does the buffer contain a disk block which is in the tree. */
26187 +inline int B_IS_IN_TREE(const struct buffer_head *bh)
26188 +{
26190 + RFALSE(B_LEVEL(bh) > MAX_HEIGHT,
26191 + "PAP-1010: block (%b) has too big level (%z)", bh, bh);
26193 + return (B_LEVEL(bh) != FREE_LEVEL);
26194 +}
26196 +/* to get item head in le form */
26197 +inline void copy_item_head(struct item_head *to,
26198 + const struct item_head *from)
26199 +{
26200 + memcpy(to, from, IH_SIZE);
26201 +}
26203 +/*
26204 + * k1 is pointer to on-disk structure which is stored in little-endian
26205 + * form. k2 is pointer to cpu variable. For key of items of the same
26206 + * object this returns 0.
26207 + * Returns: -1 if key1 < key2
26208 + * 0 if key1 == key2
26209 + * 1 if key1 > key2
26210 + */
26211 +inline int comp_short_keys(const struct reiserfs_key *le_key,
26212 + const struct cpu_key *cpu_key)
26213 +{
26214 + __u32 n;
26215 + n = le32_to_cpu(le_key->k_dir_id);
26216 + if (n < cpu_key->on_disk_key.k_dir_id)
26217 + return -1;
26218 + if (n > cpu_key->on_disk_key.k_dir_id)
26219 + return 1;
26220 + n = le32_to_cpu(le_key->k_objectid);
26221 + if (n < cpu_key->on_disk_key.k_objectid)
26222 + return -1;
26223 + if (n > cpu_key->on_disk_key.k_objectid)
26224 + return 1;
26225 + return 0;
26226 +}
26228 +/*
26229 + * k1 is pointer to on-disk structure which is stored in little-endian
26230 + * form. k2 is pointer to cpu variable.
26231 + * Compare keys using all 4 key fields.
26232 + * Returns: -1 if key1 < key2,
26233 + * 0 if key1 == key2, 1 if key1 > key2
26234 + */
26235 +static inline int comp_keys(const struct reiserfs_key *le_key,
26236 + const struct cpu_key *cpu_key)
26237 +{
26238 + int retval;
26240 + retval = comp_short_keys(le_key, cpu_key);
26241 + if (retval)
26242 + return retval;
26243 + if (le_key_k_offset(le_key_version(le_key), le_key) <
26244 + cpu_key_k_offset(cpu_key))
26245 + return -1;
26246 + if (le_key_k_offset(le_key_version(le_key), le_key) >
26247 + cpu_key_k_offset(cpu_key))
26248 + return 1;
26250 + if (cpu_key->key_length == 3)
26251 + return 0;
26253 + /* this part is needed only when tail conversion is in progress */
26254 + if (le_key_k_type(le_key_version(le_key), le_key) <
26255 + cpu_key_k_type(cpu_key))
26256 + return -1;
26258 + if (le_key_k_type(le_key_version(le_key), le_key) >
26259 + cpu_key_k_type(cpu_key))
26260 + return 1;
26262 + return 0;
26263 +}
26265 +inline int comp_short_le_keys(const struct reiserfs_key *key1,
26266 + const struct reiserfs_key *key2)
26267 +{
26268 + __u32 *k1_u32, *k2_u32;
26269 + int key_length = REISERFS_SHORT_KEY_LEN;
26271 + k1_u32 = (__u32 *) key1;
26272 + k2_u32 = (__u32 *) key2;
26273 + for (; key_length--; ++k1_u32, ++k2_u32) {
26274 + if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32))
26275 + return -1;
26276 + if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32))
26277 + return 1;
26278 + }
26279 + return 0;
26280 +}
26282 +inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from)
26283 +{
26284 + int version;
26285 + to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id);
26286 + to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid);
26288 + /* find out version of the key */
26289 + version = le_key_version(from);
26290 + to->version = version;
26291 + to->on_disk_key.k_offset = le_key_k_offset(version, from);
26292 + to->on_disk_key.k_type = le_key_k_type(version, from);
26293 +}
26295 +/*
26296 + * this does not say which one is bigger, it only returns 1 if keys
26297 + * are not equal, 0 otherwise
26298 + */
26299 +inline int comp_le_keys(const struct reiserfs_key *k1,
26300 + const struct reiserfs_key *k2)
26301 +{
26302 + return memcmp(k1, k2, sizeof(struct reiserfs_key));
26303 +}
26305 +/**************************************************************************
26306 + * Binary search toolkit function *
26307 + * Search for an item in the array by the item key *
26308 + * Returns: 1 if found, 0 if not found; *
26309 + * *pos = number of the searched element if found, else the *
26310 + * number of the first element that is larger than key. *
26311 + **************************************************************************/
26312 +/*
26313 + * For those not familiar with binary search: lbound is the leftmost item
26314 + * that it could be, rbound the rightmost item that it could be. We examine
26315 + * the item halfway between lbound and rbound, and that tells us either
26316 + * that we can increase lbound, or decrease rbound, or that we have found it,
26317 + * or if lbound > rbound that there are no possible items, and we have not
26318 + * found it. With each examination we cut the number of possible items it
26319 + * could be by one more than half rounded down, or we find it.
26320 + */
26321 +static inline int bin_search(const void *key, /* Key to search for. */
26322 + const void *base, /* First item in the array. */
26323 + int num, /* Number of items in the array. */
26324 + /*
26325 + * Item size in the array searched. Lest the
26326 + * reader be confused, note that this is crafted
26327 + * as a general function, and when it is applied
26328 + * specifically to the array of item headers in a
26329 + * node, width is actually the item header size
26330 + * not the item size.
26331 + */
26332 + int width,
26333 + int *pos /* Number of the searched for element. */
26336 + int rbound, lbound, j;
26338 + for (j = ((rbound = num - 1) + (lbound = 0)) / 2;
26339 + lbound <= rbound; j = (rbound + lbound) / 2)
26340 + switch (comp_keys
26341 + ((struct reiserfs_key *)((char *)base + j * width),
26342 + (struct cpu_key *)key)) {
26343 + case -1:
26344 + lbound = j + 1;
26345 + continue;
26346 + case 1:
26347 + rbound = j - 1;
26348 + continue;
26349 + case 0:
26350 + *pos = j;
26351 + return ITEM_FOUND; /* Key found in the array. */
26354 + /*
26355 + * bin_search did not find the given key; it returns the position of
26356 + * the minimal key that is greater than the given one.
26357 + */
26358 + *pos = lbound;
26359 + return ITEM_NOT_FOUND;
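
Editor's note: the halving loop above, extracted as a standalone routine over a sorted int array. Like bin_search it reports either the match position or the index of the first element greater than the key (the insertion point). Hypothetical toy_* names, not part of the patch:

#include <stdio.h>

/* Returns 1 and *pos = match index if found; otherwise returns 0 and
 * *pos = index of the first element larger than key. */
static int toy_bin_search(int key, const int *base, int num, int *pos)
{
    int lbound = 0, rbound = num - 1;

    while (lbound <= rbound) {
        int j = (lbound + rbound) / 2;
        if (base[j] < key)
            lbound = j + 1;
        else if (base[j] > key)
            rbound = j - 1;
        else {
            *pos = j;
            return 1;
        }
    }
    *pos = lbound;   /* not found: insertion point */
    return 0;
}

int main(void)
{
    int a[] = { 2, 4, 8, 16 }, pos;
    printf("%d %d\n", toy_bin_search(8, a, 4, &pos), pos);  /* 1 2 */
    printf("%d %d\n", toy_bin_search(5, a, 4, &pos), pos);  /* 0 2 */
    return 0;
}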
26363 +/* Minimal possible key. It is never in the tree. */
26364 +const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
26366 +/* Maximal possible key. It is never in the tree. */
26367 +static const struct reiserfs_key MAX_KEY = {
26368 + cpu_to_le32(0xffffffff),
26369 + cpu_to_le32(0xffffffff),
26370 + {{cpu_to_le32(0xffffffff),
26371 + cpu_to_le32(0xffffffff)},}
26375 + * Get delimiting key of the buffer by looking for it in the buffers in the
26376 + * path, starting from the bottom of the path, and going upwards. We must
26377 + * check the path's validity at each step. If the key is not in the path,
26378 + * there is no delimiting key in the tree (buffer is first or last buffer
26379 + * in tree), and in this case we return a special key, either MIN_KEY or
26380 + * MAX_KEY.
26381 + */
26382 +static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path,
26383 + const struct super_block *sb)
26385 + int position, path_offset = chk_path->path_length;
26386 + struct buffer_head *parent;
26388 + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
26389 + "PAP-5010: invalid offset in the path");
26391 + /* While not higher in path than first element. */
26392 + while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
26394 + RFALSE(!buffer_uptodate
26395 + (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
26396 + "PAP-5020: parent is not uptodate");
26398 + /* Parent at the path is not in the tree now. */
26399 + if (!B_IS_IN_TREE
26400 + (parent =
26401 + PATH_OFFSET_PBUFFER(chk_path, path_offset)))
26402 + return &MAX_KEY;
26403 + /* Check whether position in the parent is correct. */
26404 + if ((position =
26405 + PATH_OFFSET_POSITION(chk_path,
26406 + path_offset)) >
26407 + B_NR_ITEMS(parent))
26408 + return &MAX_KEY;
26409 + /* Check whether parent at the path really points to the child. */
26410 + if (B_N_CHILD_NUM(parent, position) !=
26411 + PATH_OFFSET_PBUFFER(chk_path,
26412 + path_offset + 1)->b_blocknr)
26413 + return &MAX_KEY;
26414 + /*
26415 + * Return delimiting key if position in the parent
26416 + * is not equal to zero.
26417 + */
26418 + if (position)
26419 + return internal_key(parent, position - 1);
26421 + /* Return MIN_KEY if we are in the root of the buffer tree. */
26422 + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
26423 + b_blocknr == SB_ROOT_BLOCK(sb))
26424 + return &MIN_KEY;
26425 + return &MAX_KEY;
26428 +/* Get delimiting key of the buffer at the path and its right neighbor. */
26429 +inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
26430 + const struct super_block *sb)
26432 + int position, path_offset = chk_path->path_length;
26433 + struct buffer_head *parent;
26435 + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
26436 + "PAP-5030: invalid offset in the path");
26438 + while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
26440 + RFALSE(!buffer_uptodate
26441 + (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
26442 + "PAP-5040: parent is not uptodate");
26444 + /* Parent at the path is not in the tree now. */
26445 + if (!B_IS_IN_TREE
26446 + (parent =
26447 + PATH_OFFSET_PBUFFER(chk_path, path_offset)))
26448 + return &MIN_KEY;
26449 + /* Check whether position in the parent is correct. */
26450 + if ((position =
26451 + PATH_OFFSET_POSITION(chk_path,
26452 + path_offset)) >
26453 + B_NR_ITEMS(parent))
26454 + return &MIN_KEY;
26455 + /*
26456 + * Check whether parent at the path really points
26457 + * to the child.
26458 + */
26459 + if (B_N_CHILD_NUM(parent, position) !=
26460 + PATH_OFFSET_PBUFFER(chk_path,
26461 + path_offset + 1)->b_blocknr)
26462 + return &MIN_KEY;
26464 + /*
26465 + * Return delimiting key if position in the parent
26466 + * is not the last one.
26467 + */
26468 + if (position != B_NR_ITEMS(parent))
26469 + return internal_key(parent, position);
26472 + /* Return MAX_KEY if we are in the root of the buffer tree. */
26473 + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
26474 + b_blocknr == SB_ROOT_BLOCK(sb))
26475 + return &MAX_KEY;
26476 + return &MIN_KEY;
26480 + * Check whether a key is contained in the tree rooted from a buffer at a path.
26481 + * This works by looking at the left and right delimiting keys for the buffer
26482 + * in the last path_element in the path. These delimiting keys are stored
26483 + * at least one level above that buffer in the tree. If the buffer is the
26484 + * first or last node in the tree order then one of the delimiting keys may
26485 + * be absent, and in this case get_lkey and get_rkey return a special key
26486 + * which is MIN_KEY or MAX_KEY.
26487 + */
26488 +static inline int key_in_buffer(
26489 + /* Path which should be checked. */
26490 + struct treepath *chk_path,
26491 + /* Key which should be checked. */
26492 + const struct cpu_key *key,
26493 + struct super_block *sb
26497 + RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET
26498 + || chk_path->path_length > MAX_HEIGHT,
26499 + "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
26500 + key, chk_path->path_length);
26501 + RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev,
26502 + "PAP-5060: device must not be NODEV");
26504 + if (comp_keys(get_lkey(chk_path, sb), key) == 1)
26505 + /* left delimiting key is bigger than the key we look for */
26506 + return 0;
26507 + /* if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */
26508 + if (comp_keys(get_rkey(chk_path, sb), key) != 1)
26509 + /* key must be less than the right delimiting key */
26510 + return 0;
26511 + return 1;
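
Editor's note: key_in_buffer therefore reduces to a half-open interval test: the key belongs to the buffer iff lkey <= key < rkey. A sketch of that invariant with plain integers standing in for keys (hypothetical toy_* names, not part of the patch):

#include <stdio.h>

/* Sentinels playing the role of MIN_KEY / MAX_KEY. */
#define TOY_MIN_KEY 0
#define TOY_MAX_KEY 0x7fffffff

/* 1 iff lkey <= key < rkey, i.e. key lies inside this buffer's range. */
static int toy_key_in_buffer(int lkey, int key, int rkey)
{
    if (lkey > key)      /* left delimiting key is bigger */
        return 0;
    if (rkey <= key)     /* key must be less than right delimiting key */
        return 0;
    return 1;
}

int main(void)
{
    printf("%d\n", toy_key_in_buffer(10, 15, 20));          /* 1 */
    printf("%d\n", toy_key_in_buffer(TOY_MIN_KEY, 5, 5));   /* 0 */
    return 0;
}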
26514 +int reiserfs_check_path(struct treepath *p)
26516 + RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
26517 + "path not properly relsed");
26518 + return 0;
26522 + * Drop the reference to each buffer in a path and restore the
26523 + * dirty bits that were cleared when preparing the buffer for the log.
26524 + * This version should only be called from fix_nodes()
26525 + */
26526 +void pathrelse_and_restore(struct super_block *sb,
26527 + struct treepath *search_path)
26529 + int path_offset = search_path->path_length;
26531 + RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
26532 + "clm-4000: invalid path offset");
26534 + while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
26535 + struct buffer_head *bh;
26536 + bh = PATH_OFFSET_PBUFFER(search_path, path_offset--);
26537 + reiserfs_restore_prepared_buffer(sb, bh);
26538 + brelse(bh);
26540 + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
26543 +/* Drop the reference to each buffer in a path */
26544 +void pathrelse(struct treepath *search_path)
26546 + int path_offset = search_path->path_length;
26548 + RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
26549 + "PAP-5090: invalid path offset");
26551 + while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET)
26552 + brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--));
26554 + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
26557 +static int has_valid_deh_location(struct buffer_head *bh, struct item_head *ih)
26559 + struct reiserfs_de_head *deh;
26560 + int i;
26562 + deh = B_I_DEH(bh, ih);
26563 + for (i = 0; i < ih_entry_count(ih); i++) {
26564 + if (deh_location(&deh[i]) > ih_item_len(ih)) {
26565 + reiserfs_warning(NULL, "reiserfs-5094",
26566 + "directory entry location seems wrong %h",
26567 + &deh[i]);
26568 + return 0;
26572 + return 1;
26575 +static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
26577 + struct block_head *blkh;
26578 + struct item_head *ih;
26579 + int used_space;
26580 + int prev_location;
26581 + int i;
26582 + int nr;
26584 + blkh = (struct block_head *)buf;
26585 + if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
26586 + reiserfs_warning(NULL, "reiserfs-5080",
26587 + "this should be caught earlier");
26588 + return 0;
26591 + nr = blkh_nr_item(blkh);
26592 + if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
26593 + /* item number is too big or too small */
26594 + reiserfs_warning(NULL, "reiserfs-5081",
26595 + "nr_item seems wrong: %z", bh);
26596 + return 0;
26598 + ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
26599 + used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih));
26601 + /* free space does not match the calculated amount of used space */
26602 + if (used_space != blocksize - blkh_free_space(blkh)) {
26603 + reiserfs_warning(NULL, "reiserfs-5082",
26604 + "free space seems wrong: %z", bh);
26605 + return 0;
26607 + /*
26608 + * FIXME: this check in is_leaf will hurt performance too much - we
26609 + * may want to return 1 here
26610 + */
26612 + /* check tables of item heads */
26613 + ih = (struct item_head *)(buf + BLKH_SIZE);
26614 + prev_location = blocksize;
26615 + for (i = 0; i < nr; i++, ih++) {
26616 + if (le_ih_k_type(ih) == TYPE_ANY) {
26617 + reiserfs_warning(NULL, "reiserfs-5083",
26618 + "wrong item type for item %h",
26619 + ih);
26620 + return 0;
26622 + if (ih_location(ih) >= blocksize
26623 + || ih_location(ih) < IH_SIZE * nr) {
26624 + reiserfs_warning(NULL, "reiserfs-5084",
26625 + "item location seems wrong: %h",
26626 + ih);
26627 + return 0;
26629 + if (ih_item_len(ih) < 1
26630 + || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) {
26631 + reiserfs_warning(NULL, "reiserfs-5085",
26632 + "item length seems wrong: %h",
26633 + ih);
26634 + return 0;
26636 + if (prev_location - ih_location(ih) != ih_item_len(ih)) {
26637 + reiserfs_warning(NULL, "reiserfs-5086",
26638 + "item location seems wrong "
26639 + "(second one): %h", ih);
26640 + return 0;
26642 + if (is_direntry_le_ih(ih)) {
26643 + if (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE)) {
26644 + reiserfs_warning(NULL, "reiserfs-5093",
26645 + "item entry count seems wrong %h",
26646 + ih);
26647 + return 0;
26649 + return has_valid_deh_location(bh, ih);
26651 + prev_location = ih_location(ih);
26654 + /* one may imagine many more checks */
26655 + return 1;
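
Editor's note: the per-item checks above encode the leaf layout invariant: item bodies are packed downward from the end of the block, so each item's location plus its length must equal the previous item's location. A standalone sketch of that walk over a hypothetical (location, length) table (toy_* names, not part of the patch):

#include <stdio.h>

struct toy_ih { int location; int len; };

/* Returns 1 if item bodies tile the top of the block contiguously. */
static int toy_check_leaf(const struct toy_ih *ih, int nr, int blocksize)
{
    int prev_location = blocksize;
    int i;

    for (i = 0; i < nr; i++) {
        if (ih[i].location >= blocksize || ih[i].len < 1)
            return 0;
        if (prev_location - ih[i].location != ih[i].len)
            return 0;   /* hole or overlap between item bodies */
        prev_location = ih[i].location;
    }
    return 1;
}

int main(void)
{
    struct toy_ih ok[]  = { { 4000, 96 }, { 3900, 100 } };
    struct toy_ih bad[] = { { 4000, 96 }, { 3800, 100 } };  /* 100-byte hole */
    printf("%d %d\n", toy_check_leaf(ok, 2, 4096),
                      toy_check_leaf(bad, 2, 4096));        /* 1 0 */
    return 0;
}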
26658 +/* returns 1 if buf looks like an internal node, 0 otherwise */
26659 +static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
26661 + struct block_head *blkh;
26662 + int nr;
26663 + int used_space;
26665 + blkh = (struct block_head *)buf;
26666 + nr = blkh_level(blkh);
26667 + if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
26668 + /* this level is not possible for internal nodes */
26669 + reiserfs_warning(NULL, "reiserfs-5087",
26670 + "this should be caught earlier");
26671 + return 0;
26674 + nr = blkh_nr_item(blkh);
26675 + /* for internal which is not root we might check min number of keys */
26676 + if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
26677 + reiserfs_warning(NULL, "reiserfs-5088",
26678 + "number of key seems wrong: %z", bh);
26679 + return 0;
26682 + used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
26683 + if (used_space != blocksize - blkh_free_space(blkh)) {
26684 + reiserfs_warning(NULL, "reiserfs-5089",
26685 + "free space seems wrong: %z", bh);
26686 + return 0;
26689 + /* one may imagine many more checks */
26690 + return 1;
26694 + * make sure that bh contains formatted node of reiserfs tree of
26695 + * 'level'-th level
26696 + */
26697 +static int is_tree_node(struct buffer_head *bh, int level)
26699 + if (B_LEVEL(bh) != level) {
26700 + reiserfs_warning(NULL, "reiserfs-5090", "node level %d does "
26701 + "not match to the expected one %d",
26702 + B_LEVEL(bh), level);
26703 + return 0;
26705 + if (level == DISK_LEAF_NODE_LEVEL)
26706 + return is_leaf(bh->b_data, bh->b_size, bh);
26708 + return is_internal(bh->b_data, bh->b_size, bh);
26711 +#define SEARCH_BY_KEY_READA 16
26714 + * The function is NOT SCHEDULE-SAFE!
26715 + * It might unlock the write lock if we needed to wait for a block
26716 + * to be read. Note that in this case it won't recover the lock to avoid
26717 + * high contention resulting from too many lock requests, especially
26718 + * because the caller (search_by_key) will perform other schedule-unsafe
26719 + * operations just after calling this function.
26721 + * @return depth of lock to be restored after read completes
26722 + */
26723 +static int search_by_key_reada(struct super_block *s,
26724 + struct buffer_head **bh,
26725 + b_blocknr_t *b, int num)
26727 + int i, j;
26728 + int depth = -1;
26730 + for (i = 0; i < num; i++) {
26731 + bh[i] = sb_getblk(s, b[i]);
26733 + /*
26734 + * We are going to read some blocks on which we
26735 + * have a reference. It's safe, though we might be
26736 + * reading blocks concurrently changed if we release
26737 + * the lock. But it's still fine because we check later
26738 + * if the tree changed
26739 + */
26740 + for (j = 0; j < i; j++) {
26741 + /*
26742 + * note, this needs attention if we are getting rid of the BKL
26743 + * you have to make sure the prepared bit isn't set on this
26744 + * buffer
26745 + */
26746 + if (!buffer_uptodate(bh[j])) {
26747 + if (depth == -1)
26748 + depth = reiserfs_write_unlock_nested(s);
26749 + bh_readahead(bh[j], REQ_RAHEAD);
26751 + brelse(bh[j]);
26753 + return depth;
26757 + * This function fills up the path from the root to the leaf as it
26758 + * descends the tree looking for the key. It uses reiserfs_bread to
26759 + * try to find buffers in the cache given their block number. If it
26760 + * does not find them in the cache it reads them from disk. For each
26761 + * node search_by_key finds using reiserfs_bread it then uses
26762 + * bin_search to look through that node. bin_search will find the
26763 + * position of the block_number of the next node if it is looking
26764 + * through an internal node. If it is looking through a leaf node
26765 + * bin_search will find the position of the item which has key either
26766 + * equal to given key, or which is the maximal key less than the given
26767 + * key. search_by_key returns a path that must be checked for the
26768 + * correctness of the top of the path but need not be checked for the
26769 + * correctness of the bottom of the path
26770 + */
26772 + * search_by_key - search for key (and item) in stree
26773 + * @sb: superblock
26774 + * @key: pointer to key to search for
26775 + * @search_path: Allocated and initialized struct treepath; Returned filled
26776 + * on success.
26777 + * @stop_level: How far down the tree to search, Use DISK_LEAF_NODE_LEVEL to
26778 + * stop at leaf level.
26780 + * The function is NOT SCHEDULE-SAFE!
26781 + */
26782 +int search_by_key(struct super_block *sb, const struct cpu_key *key,
26783 + struct treepath *search_path, int stop_level)
26785 + b_blocknr_t block_number;
26786 + int expected_level;
26787 + struct buffer_head *bh;
26788 + struct path_element *last_element;
26789 + int node_level, retval;
26790 + int fs_gen;
26791 + struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
26792 + b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA];
26793 + int reada_count = 0;
26795 +#ifdef CONFIG_REISERFS_CHECK
26796 + int repeat_counter = 0;
26797 +#endif
26799 + PROC_INFO_INC(sb, search_by_key);
26801 + /*
26802 + * As we add each node to a path we increase its count. This means
26803 + * that we must be careful to release all nodes in a path before we
26804 + * either discard the path struct or re-use the path struct, as we
26805 + * do here.
26806 + */
26808 + pathrelse(search_path);
26810 + /*
26811 + * With each iteration of this loop we search through the items in the
26812 + * current node, and calculate the next current node (next path element)
26813 + * for the next iteration of this loop.
26814 + */
26815 + block_number = SB_ROOT_BLOCK(sb);
26816 + expected_level = -1;
26817 + while (1) {
26819 +#ifdef CONFIG_REISERFS_CHECK
26820 + if (!(++repeat_counter % 50000))
26821 + reiserfs_warning(sb, "PAP-5100",
26822 + "%s: there were %d iterations of "
26823 + "while loop looking for key %K",
26824 + current->comm, repeat_counter,
26825 + key);
26826 +#endif
26828 + /* prep path to have another element added to it. */
26829 + last_element =
26830 + PATH_OFFSET_PELEMENT(search_path,
26831 + ++search_path->path_length);
26832 + fs_gen = get_generation(sb);
26834 + /*
26835 + * Read the next tree node, and set the last element
26836 + * in the path to have a pointer to it.
26837 + */
26838 + if ((bh = last_element->pe_buffer =
26839 + sb_getblk(sb, block_number))) {
26841 + /*
26842 + * We'll need to drop the lock if we encounter any
26843 + * buffers that need to be read. If all of them are
26844 + * already up to date, we don't need to drop the lock.
26845 + */
26846 + int depth = -1;
26848 + if (!buffer_uptodate(bh) && reada_count > 1)
26849 + depth = search_by_key_reada(sb, reada_bh,
26850 + reada_blocks, reada_count);
26852 + if (!buffer_uptodate(bh) && depth == -1)
26853 + depth = reiserfs_write_unlock_nested(sb);
26855 + bh_read_nowait(bh, 0);
26856 + wait_on_buffer(bh);
26858 + if (depth != -1)
26859 + reiserfs_write_lock_nested(sb, depth);
26860 + if (!buffer_uptodate(bh))
26861 + goto io_error;
26862 + } else {
26863 +io_error:
26864 + search_path->path_length--;
26865 + pathrelse(search_path);
26866 + return IO_ERROR;
26868 + reada_count = 0;
26869 + if (expected_level == -1)
26870 + expected_level = SB_TREE_HEIGHT(sb);
26871 + expected_level--;
26873 + /*
26874 + * It is possible that schedule occurred. We must check
26875 + * whether the key to search is still in the tree rooted
26876 + * from the current buffer. If not then repeat search
26877 + * from the root.
26878 + */
26879 + if (fs_changed(fs_gen, sb) &&
26880 + (!B_IS_IN_TREE(bh) ||
26881 + B_LEVEL(bh) != expected_level ||
26882 + !key_in_buffer(search_path, key, sb))) {
26883 + PROC_INFO_INC(sb, search_by_key_fs_changed);
26884 + PROC_INFO_INC(sb, search_by_key_restarted);
26885 + PROC_INFO_INC(sb,
26886 + sbk_restarted[expected_level - 1]);
26887 + pathrelse(search_path);
26889 + /*
26890 + * Get the root block number so that we can
26891 + * repeat the search starting from the root.
26892 + */
26893 + block_number = SB_ROOT_BLOCK(sb);
26894 + expected_level = -1;
26896 + /* repeat search from the root */
26897 + continue;
26900 + /*
26901 + * only check that the key is in the buffer if key is not
26902 + * equal to the MAX_KEY. The latter case is only possible in
26903 + * "finish_unfinished()" processing during mount.
26904 + */
26905 + RFALSE(comp_keys(&MAX_KEY, key) &&
26906 + !key_in_buffer(search_path, key, sb),
26907 + "PAP-5130: key is not in the buffer");
26908 +#ifdef CONFIG_REISERFS_CHECK
26909 + if (REISERFS_SB(sb)->cur_tb) {
26910 + print_cur_tb("5140");
26911 + reiserfs_panic(sb, "PAP-5140",
26912 + "schedule occurred in do_balance!");
26914 +#endif
26916 + /*
26917 + * make sure, that the node contents look like a node of
26918 + * certain level
26919 + */
26920 + if (!is_tree_node(bh, expected_level)) {
26921 + reiserfs_error(sb, "vs-5150",
26922 + "invalid format found in block %ld. "
26923 + "Fsck?", bh->b_blocknr);
26924 + pathrelse(search_path);
26925 + return IO_ERROR;
26928 + /* ok, we have acquired next formatted node in the tree */
26929 + node_level = B_LEVEL(bh);
26931 + PROC_INFO_BH_STAT(sb, bh, node_level - 1);
26933 + RFALSE(node_level < stop_level,
26934 + "vs-5152: tree level (%d) is less than stop level (%d)",
26935 + node_level, stop_level);
26937 + retval = bin_search(key, item_head(bh, 0),
26938 + B_NR_ITEMS(bh),
26939 + (node_level ==
26940 + DISK_LEAF_NODE_LEVEL) ? IH_SIZE :
26941 + KEY_SIZE,
26942 + &last_element->pe_position);
26943 + if (node_level == stop_level) {
26944 + return retval;
26947 + /* we are not in the stop level */
26948 + /*
26949 + * item has been found, so we choose the pointer which
26950 + * is to the right of the found one
26951 + */
26952 + if (retval == ITEM_FOUND)
26953 + last_element->pe_position++;
26955 + /*
26956 + * if item was not found we choose the position which is to
26957 + * the left of the found item. This requires no code,
26958 + * bin_search did it already.
26959 + */
26961 + /*
26962 + * So we have chosen a position in the current node which is
26963 + * an internal node. Now we calculate child block number by
26964 + * position in the node.
26965 + */
26966 + block_number =
26967 + B_N_CHILD_NUM(bh, last_element->pe_position);
26969 + /*
26970 + * if we are going to read leaf nodes, try for read
26971 + * ahead as well
26972 + */
26973 + if ((search_path->reada & PATH_READA) &&
26974 + node_level == DISK_LEAF_NODE_LEVEL + 1) {
26975 + int pos = last_element->pe_position;
26976 + int limit = B_NR_ITEMS(bh);
26977 + struct reiserfs_key *le_key;
26979 + if (search_path->reada & PATH_READA_BACK)
26980 + limit = 0;
26981 + while (reada_count < SEARCH_BY_KEY_READA) {
26982 + if (pos == limit)
26983 + break;
26984 + reada_blocks[reada_count++] =
26985 + B_N_CHILD_NUM(bh, pos);
26986 + if (search_path->reada & PATH_READA_BACK)
26987 + pos--;
26988 + else
26989 + pos++;
26991 + /*
26992 + * check to make sure we're in the same object
26993 + */
26994 + le_key = internal_key(bh, pos);
26995 + if (le32_to_cpu(le_key->k_objectid) !=
26996 + key->on_disk_key.k_objectid) {
26997 + break;
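
Editor's note: search_by_key is thus a standard top-down B+ tree descent: binary-search the current node, follow the chosen child pointer, repeat until stop_level. A compressed sketch of that control flow over a hypothetical in-memory node structure, with no buffers, generations, readahead, or locking (toy_* names, not part of the patch):

#include <stdio.h>

struct toy_node {
    int level;                  /* 1 == leaf, as DISK_LEAF_NODE_LEVEL */
    int nr;                     /* number of keys */
    int keys[4];
    struct toy_node *child[5];  /* nr + 1 children on internal nodes */
};

/* Descend from root to the leaf that may contain key. */
static struct toy_node *toy_descend(struct toy_node *node, int key)
{
    while (node->level > 1) {
        int pos = 0;
        /* linear stand-in for bin_search: first key > search key */
        while (pos < node->nr && node->keys[pos] <= key)
            pos++;
        node = node->child[pos];
    }
    return node;
}

int main(void)
{
    struct toy_node leaf_lo = { 1, 2, { 3, 5 }, { 0 } };
    struct toy_node leaf_hi = { 1, 2, { 9, 12 }, { 0 } };
    struct toy_node root = { 2, 1, { 9 }, { &leaf_lo, &leaf_hi, 0, 0, 0 } };
    printf("%d\n", toy_descend(&root, 10) == &leaf_hi);  /* 1 */
    return 0;
}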
27005 + * Form the path to an item and position in this item which contains
27006 + * file byte defined by key. If there is no such item
27007 + * corresponding to the key, we point the path to the item with
27008 + * maximal key less than key, and *pos_in_item is set to one
27009 + * past the last entry/byte in the item. If searching for entry in a
27010 + * directory item, and it is not found, *pos_in_item is set to one
27011 + * entry more than the entry with maximal key which is less than the
27012 + * sought key.
27014 + * Note that if there is no entry in this same node which is one more,
27015 + * then we point to an imaginary entry. For direct items, the
27016 + * position is in units of bytes, for indirect items the position is
27017 + * in units of blocknr entries, for directory items the position is in
27018 + * units of directory entries.
27019 + */
27020 +/* The function is NOT SCHEDULE-SAFE! */
27021 +int search_for_position_by_key(struct super_block *sb,
27022 + /* Key to search (cpu variable) */
27023 + const struct cpu_key *p_cpu_key,
27024 + /* Filled up by this function. */
27025 + struct treepath *search_path)
27027 + struct item_head *p_le_ih; /* pointer to on-disk structure */
27028 + int blk_size;
27029 + loff_t item_offset, offset;
27030 + struct reiserfs_dir_entry de;
27031 + int retval;
27033 + /* If searching for directory entry. */
27034 + if (is_direntry_cpu_key(p_cpu_key))
27035 + return search_by_entry_key(sb, p_cpu_key, search_path,
27036 + &de);
27038 + /* If not searching for directory entry. */
27040 + /* If item is found. */
27041 + retval = search_item(sb, p_cpu_key, search_path);
27042 + if (retval == IO_ERROR)
27043 + return retval;
27044 + if (retval == ITEM_FOUND) {
27046 + RFALSE(!ih_item_len
27047 + (item_head
27048 + (PATH_PLAST_BUFFER(search_path),
27049 + PATH_LAST_POSITION(search_path))),
27050 + "PAP-5165: item length equals zero");
27052 + pos_in_item(search_path) = 0;
27053 + return POSITION_FOUND;
27056 + RFALSE(!PATH_LAST_POSITION(search_path),
27057 + "PAP-5170: position equals zero");
27059 + /* Item is not found. Set path to the previous item. */
27060 + p_le_ih =
27061 + item_head(PATH_PLAST_BUFFER(search_path),
27062 + --PATH_LAST_POSITION(search_path));
27063 + blk_size = sb->s_blocksize;
27065 + if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key))
27066 + return FILE_NOT_FOUND;
27068 + /* FIXME: quite ugly this far */
27070 + item_offset = le_ih_k_offset(p_le_ih);
27071 + offset = cpu_key_k_offset(p_cpu_key);
27073 + /* Needed byte is contained in the item pointed to by the path. */
27074 + if (item_offset <= offset &&
27075 + item_offset + op_bytes_number(p_le_ih, blk_size) > offset) {
27076 + pos_in_item(search_path) = offset - item_offset;
27077 + if (is_indirect_le_ih(p_le_ih)) {
27078 + pos_in_item(search_path) /= blk_size;
27080 + return POSITION_FOUND;
27083 + /*
27084 + * Needed byte is not contained in the item pointed to by the
27085 + * path. Set pos_in_item out of the item.
27086 + */
27087 + if (is_indirect_le_ih(p_le_ih))
27088 + pos_in_item(search_path) =
27089 + ih_item_len(p_le_ih) / UNFM_P_SIZE;
27090 + else
27091 + pos_in_item(search_path) = ih_item_len(p_le_ih);
27093 + return POSITION_NOT_FOUND;
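
Editor's note: the offset arithmetic above is easier to see with numbers. Assuming a 4096-byte block, an indirect item keyed at file offset 1 covering 3 unformatted blocks spans bytes [1, 12289), and a byte offset inside it maps to a pointer index by dividing the byte distance by the block size. A worked sketch (values are illustrative only):

#include <stdio.h>

int main(void)
{
    long blk_size    = 4096;
    long item_offset = 1;        /* key offset of the indirect item */
    long nr_ptrs     = 3;        /* unformatted node pointers in the item */
    long item_bytes  = nr_ptrs * blk_size;
    long offset      = 9000;     /* byte we are searching for */

    if (item_offset <= offset && offset < item_offset + item_bytes) {
        long pos_in_item = (offset - item_offset) / blk_size;
        printf("pointer index %ld\n", pos_in_item);   /* prints 2 */
    }
    return 0;
}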
27096 +/* Compare given item and item pointed to by the path. */
27097 +int comp_items(const struct item_head *stored_ih, const struct treepath *path)
27099 + struct buffer_head *bh = PATH_PLAST_BUFFER(path);
27100 + struct item_head *ih;
27102 + /* Last buffer at the path is not in the tree. */
27103 + if (!B_IS_IN_TREE(bh))
27104 + return 1;
27106 + /* Last path position is invalid. */
27107 + if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh))
27108 + return 1;
27110 + /* we need only to know, whether it is the same item */
27111 + ih = tp_item_head(path);
27112 + return memcmp(stored_ih, ih, IH_SIZE);
27115 +/* prepare for delete or cut of direct item */
27116 +static inline int prepare_for_direct_item(struct treepath *path,
27117 + struct item_head *le_ih,
27118 + struct inode *inode,
27119 + loff_t new_file_length, int *cut_size)
27121 + loff_t round_len;
27123 + if (new_file_length == max_reiserfs_offset(inode)) {
27124 + /* item has to be deleted */
27125 + *cut_size = -(IH_SIZE + ih_item_len(le_ih));
27126 + return M_DELETE;
27128 + /* new file gets truncated */
27129 + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) {
27130 + round_len = ROUND_UP(new_file_length);
27131 + /* this was new_file_length < le_ih ... */
27132 + if (round_len < le_ih_k_offset(le_ih)) {
27133 + *cut_size = -(IH_SIZE + ih_item_len(le_ih));
27134 + return M_DELETE; /* Delete this item. */
27136 + /* Calculate first position and size for cutting from item. */
27137 + pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1);
27138 + *cut_size = -(ih_item_len(le_ih) - pos_in_item(path));
27140 + return M_CUT; /* Cut from this item. */
27143 + /* old file: items may have any length */
27145 + if (new_file_length < le_ih_k_offset(le_ih)) {
27146 + *cut_size = -(IH_SIZE + ih_item_len(le_ih));
27147 + return M_DELETE; /* Delete this item. */
27150 + /* Calculate first position and size for cutting from item. */
27151 + *cut_size = -(ih_item_len(le_ih) -
27152 + (pos_in_item(path) =
27153 + new_file_length + 1 - le_ih_k_offset(le_ih)));
27154 + return M_CUT; /* Cut from this item. */
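
Editor's note: for a direct item the decision above is pure arithmetic: if the new length ends before the item starts, the whole item goes (M_DELETE); otherwise the item is cut after new_length + 1 - offset bytes (M_CUT). A worked sketch assuming 1-based key offsets as in the code (toy_* names, not part of the patch):

#include <stdio.h>

enum { TOY_DELETE, TOY_CUT };

/* offset: 1-based file offset of the item's first byte; len: item length. */
static int toy_prepare_direct(long new_len, long offset, long len,
                              long *pos, long *cut_size)
{
    if (new_len < offset) {
        *cut_size = -len;          /* whole item disappears */
        return TOY_DELETE;
    }
    *pos = new_len + 1 - offset;   /* bytes to keep == cut position */
    *cut_size = -(len - *pos);
    return TOY_CUT;
}

int main(void)
{
    long pos = 0, cut = 0;
    int mode = toy_prepare_direct(100, 97, 10, &pos, &cut);
    /* item covers offsets 97..106; keep 97..100 (4 bytes), cut 6 */
    printf("mode=%d pos=%ld cut=%ld\n", mode, pos, cut); /* mode=1 pos=4 cut=-6 */
    return 0;
}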
27157 +static inline int prepare_for_direntry_item(struct treepath *path,
27158 + struct item_head *le_ih,
27159 + struct inode *inode,
27160 + loff_t new_file_length,
27161 + int *cut_size)
27163 + if (le_ih_k_offset(le_ih) == DOT_OFFSET &&
27164 + new_file_length == max_reiserfs_offset(inode)) {
27165 + RFALSE(ih_entry_count(le_ih) != 2,
27166 + "PAP-5220: incorrect empty directory item (%h)", le_ih);
27167 + *cut_size = -(IH_SIZE + ih_item_len(le_ih));
27168 + /* Delete the directory item containing "." and ".." entry. */
27169 + return M_DELETE;
27172 + if (ih_entry_count(le_ih) == 1) {
27173 + /*
27174 + * Delete the directory item, since there is only one record
27175 + * in this item
27176 + */
27177 + *cut_size = -(IH_SIZE + ih_item_len(le_ih));
27178 + return M_DELETE;
27181 + /* Cut one record from the directory item. */
27182 + *cut_size =
27183 + -(DEH_SIZE +
27184 + entry_length(get_last_bh(path), le_ih, pos_in_item(path)));
27185 + return M_CUT;
27188 +#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1)
27191 + * If the path points to a directory or direct item, calculate mode
27192 + * and the size cut, for balance.
27193 + * If the path points to an indirect item, remove some number of its
27194 + * unformatted nodes.
27195 + * In case of file truncate calculate whether this item must be
27196 + * deleted/truncated or last unformatted node of this item will be
27197 + * converted to a direct item.
27198 + * This function returns a determination of what balance mode the
27199 + * calling function should employ.
27200 + */
27201 +static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th,
27202 + struct inode *inode,
27203 + struct treepath *path,
27204 + const struct cpu_key *item_key,
27205 + /*
27206 + * Number of unformatted nodes
27207 + * which were removed from end
27208 + * of the file.
27209 + */
27210 + int *removed,
27211 + int *cut_size,
27212 + /* MAX_KEY_OFFSET in case of delete. */
27213 + unsigned long long new_file_length
27216 + struct super_block *sb = inode->i_sb;
27217 + struct item_head *p_le_ih = tp_item_head(path);
27218 + struct buffer_head *bh = PATH_PLAST_BUFFER(path);
27220 + BUG_ON(!th->t_trans_id);
27222 + /* Stat_data item. */
27223 + if (is_statdata_le_ih(p_le_ih)) {
27225 + RFALSE(new_file_length != max_reiserfs_offset(inode),
27226 + "PAP-5210: mode must be M_DELETE");
27228 + *cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
27229 + return M_DELETE;
27232 + /* Directory item. */
27233 + if (is_direntry_le_ih(p_le_ih))
27234 + return prepare_for_direntry_item(path, p_le_ih, inode,
27235 + new_file_length,
27236 + cut_size);
27238 + /* Direct item. */
27239 + if (is_direct_le_ih(p_le_ih))
27240 + return prepare_for_direct_item(path, p_le_ih, inode,
27241 + new_file_length, cut_size);
27243 + /* Case of an indirect item. */
27245 + int blk_size = sb->s_blocksize;
27246 + struct item_head s_ih;
27247 + int need_re_search;
27248 + int delete = 0;
27249 + int result = M_CUT;
27250 + int pos = 0;
27252 + if ( new_file_length == max_reiserfs_offset (inode) ) {
27253 + /*
27254 + * prepare_for_delete_or_cut() is called by
27255 + * reiserfs_delete_item()
27256 + */
27257 + new_file_length = 0;
27258 + delete = 1;
27261 + do {
27262 + need_re_search = 0;
27263 + *cut_size = 0;
27264 + bh = PATH_PLAST_BUFFER(path);
27265 + copy_item_head(&s_ih, tp_item_head(path));
27266 + pos = I_UNFM_NUM(&s_ih);
27268 + while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) {
27269 + __le32 *unfm;
27270 + __u32 block;
27272 + /*
27273 + * Each unformatted block deletion may involve
27274 + * one additional bitmap block into the transaction,
27275 + * thereby the initial journal space reservation
27276 + * might not be enough.
27277 + */
27278 + if (!delete && (*cut_size) != 0 &&
27279 + reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD)
27280 + break;
27282 + unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1;
27283 + block = get_block_num(unfm, 0);
27285 + if (block != 0) {
27286 + reiserfs_prepare_for_journal(sb, bh, 1);
27287 + put_block_num(unfm, 0, 0);
27288 + journal_mark_dirty(th, bh);
27289 + reiserfs_free_block(th, inode, block, 1);
27292 + reiserfs_cond_resched(sb);
27294 + if (item_moved (&s_ih, path)) {
27295 + need_re_search = 1;
27296 + break;
27299 + pos --;
27300 + (*removed)++;
27301 + (*cut_size) -= UNFM_P_SIZE;
27303 + if (pos == 0) {
27304 + (*cut_size) -= IH_SIZE;
27305 + result = M_DELETE;
27306 + break;
27309 + /*
27310 + * a trick. If the buffer has been logged, this will
27311 + * do nothing. If we've broken the loop without logging
27312 + * it, it will restore the buffer
27313 + */
27314 + reiserfs_restore_prepared_buffer(sb, bh);
27315 + } while (need_re_search &&
27316 + search_for_position_by_key(sb, item_key, path) == POSITION_FOUND);
27317 + pos_in_item(path) = pos * UNFM_P_SIZE;
27319 + if (*cut_size == 0) {
27320 + /*
27321 + * Nothing was cut. Maybe convert the last unformatted node to a
27322 + * direct item?
27323 + */
27324 + result = M_CONVERT;
27326 + return result;
27330 +/* Calculate number of bytes which will be deleted or cut during balance */
27331 +static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
27333 + int del_size;
27334 + struct item_head *p_le_ih = tp_item_head(tb->tb_path);
27336 + if (is_statdata_le_ih(p_le_ih))
27337 + return 0;
27339 + del_size =
27340 + (mode ==
27341 + M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0];
27342 + if (is_direntry_le_ih(p_le_ih)) {
27343 + /*
27344 + * return EMPTY_DIR_SIZE; We delete empty directories only.
27345 + * We can't use EMPTY_DIR_SIZE, as old format dirs have a
27346 + * different empty size. ick. FIXME, is this right?
27347 + */
27348 + return del_size;
27351 + if (is_indirect_le_ih(p_le_ih))
27352 + del_size = (del_size / UNFM_P_SIZE) *
27353 + (PATH_PLAST_BUFFER(tb->tb_path)->b_size);
27354 + return del_size;
27357 +static void init_tb_struct(struct reiserfs_transaction_handle *th,
27358 + struct tree_balance *tb,
27359 + struct super_block *sb,
27360 + struct treepath *path, int size)
27363 + BUG_ON(!th->t_trans_id);
27365 + memset(tb, '\0', sizeof(struct tree_balance));
27366 + tb->transaction_handle = th;
27367 + tb->tb_sb = sb;
27368 + tb->tb_path = path;
27369 + PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
27370 + PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
27371 + tb->insert_size[0] = size;
27374 +void padd_item(char *item, int total_length, int length)
27376 + int i;
27378 + for (i = total_length; i > length;)
27379 + item[--i] = 0;
27382 +#ifdef REISERQUOTA_DEBUG
27383 +char key2type(struct reiserfs_key *ih)
27385 + if (is_direntry_le_key(2, ih))
27386 + return 'd';
27387 + if (is_direct_le_key(2, ih))
27388 + return 'D';
27389 + if (is_indirect_le_key(2, ih))
27390 + return 'i';
27391 + if (is_statdata_le_key(2, ih))
27392 + return 's';
27393 + return 'u';
27396 +char head2type(struct item_head *ih)
27398 + if (is_direntry_le_ih(ih))
27399 + return 'd';
27400 + if (is_direct_le_ih(ih))
27401 + return 'D';
27402 + if (is_indirect_le_ih(ih))
27403 + return 'i';
27404 + if (is_statdata_le_ih(ih))
27405 + return 's';
27406 + return 'u';
27408 +#endif
27411 + * Delete object item.
27412 + * th - active transaction handle
27413 + * path - path to the deleted item
27414 + * item_key - key to search for the deleted item
27415 + * inode - used for updating i_blocks and quotas
27416 + * un_bh - NULL or unformatted node pointer
27417 + */
27418 +int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
27419 + struct treepath *path, const struct cpu_key *item_key,
27420 + struct inode *inode, struct buffer_head *un_bh)
27422 + struct super_block *sb = inode->i_sb;
27423 + struct tree_balance s_del_balance;
27424 + struct item_head s_ih;
27425 + struct item_head *q_ih;
27426 + int quota_cut_bytes;
27427 + int ret_value, del_size, removed;
27428 + int depth;
27430 +#ifdef CONFIG_REISERFS_CHECK
27431 + char mode;
27432 +#endif
27434 + BUG_ON(!th->t_trans_id);
27436 + init_tb_struct(th, &s_del_balance, sb, path,
27437 + 0 /*size is unknown */ );
27439 + while (1) {
27440 + removed = 0;
27442 +#ifdef CONFIG_REISERFS_CHECK
27443 + mode =
27444 +#endif
27445 + prepare_for_delete_or_cut(th, inode, path,
27446 + item_key, &removed,
27447 + &del_size,
27448 + max_reiserfs_offset(inode));
27450 + RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
27452 + copy_item_head(&s_ih, tp_item_head(path));
27453 + s_del_balance.insert_size[0] = del_size;
27455 + ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
27456 + if (ret_value != REPEAT_SEARCH)
27457 + break;
27459 + PROC_INFO_INC(sb, delete_item_restarted);
27461 + /* file system changed, repeat search */
27462 + ret_value =
27463 + search_for_position_by_key(sb, item_key, path);
27464 + if (ret_value == IO_ERROR)
27465 + break;
27466 + if (ret_value == FILE_NOT_FOUND) {
27467 + reiserfs_warning(sb, "vs-5340",
27468 + "no items of the file %K found",
27469 + item_key);
27470 + break;
27472 + } /* while (1) */
27474 + if (ret_value != CARRY_ON) {
27475 + unfix_nodes(&s_del_balance);
27476 + return 0;
27479 + /* reiserfs_delete_item returns item length when success */
27480 + ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
27481 + q_ih = tp_item_head(path);
27482 + quota_cut_bytes = ih_item_len(q_ih);
27484 + /*
27485 + * hack so the quota code doesn't have to guess if the file has a
27486 + * tail. On tail insert, we allocate quota for 1 unformatted node.
27487 + * We test the offset because the tail might have been
27488 + * split into multiple items, and we only want to decrement for
27489 + * the unfm node once
27490 + */
27491 + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) {
27492 + if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) {
27493 + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
27494 + } else {
27495 + quota_cut_bytes = 0;
27499 + if (un_bh) {
27500 + int off;
27501 + char *data;
27503 + /*
27504 + * We are in direct2indirect conversion, so move tail contents
27505 + * to the unformatted node
27506 + */
27507 + /*
27508 + * note, we do the copy before preparing the buffer because we
27509 + * don't care about the contents of the unformatted node yet.
27510 + * the only thing we really care about is that the direct
27511 + * item's data is in the unformatted node.
27513 + * Otherwise, we would have to call
27514 + * reiserfs_prepare_for_journal on the unformatted node,
27515 + * which might schedule, meaning we'd have to loop all the
27516 + * way back up to the start of the while loop.
27518 + * The unformatted node must be dirtied later on. We can't be
27519 + * sure here if the entire tail has been deleted yet.
27521 + * un_bh is from the page cache (all unformatted nodes are
27522 + * from the page cache) and might be a highmem page. So, we
27523 + * can't use un_bh->b_data.
27524 + * -clm
27525 + */
27527 + data = kmap_atomic(un_bh->b_page);
27528 + off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_SIZE - 1));
27529 + memcpy(data + off,
27530 + ih_item_body(PATH_PLAST_BUFFER(path), &s_ih),
27531 + ret_value);
27532 + kunmap_atomic(data);
27535 + /* Perform balancing after all resources have been collected at once. */
27536 + do_balance(&s_del_balance, NULL, NULL, M_DELETE);
27538 +#ifdef REISERQUOTA_DEBUG
27539 + reiserfs_debug(sb, REISERFS_DEBUG_CODE,
27540 + "reiserquota delete_item(): freeing %u, id=%u type=%c",
27541 + quota_cut_bytes, inode->i_uid, head2type(&s_ih));
27542 +#endif
27543 + depth = reiserfs_write_unlock_nested(inode->i_sb);
27544 + dquot_free_space_nodirty(inode, quota_cut_bytes);
27545 + reiserfs_write_lock_nested(inode->i_sb, depth);
27547 + /* Return deleted body length */
27548 + return ret_value;
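
Editor's note: the while(1) around fix_nodes is the optimistic-retry pattern used throughout this file: try to pin all resources, and if the tree moved underneath (REPEAT_SEARCH) redo the lookup and try again. Stripped of reiserfs specifics, the shape is roughly this (toy_* stand-ins, not part of the patch):

#include <stdio.h>

enum { CARRY_ON, REPEAT_SEARCH };

static int attempts;
/* Hypothetical stand-ins: pin resources / repeat the lookup. */
static int toy_fix_nodes(void) { return attempts++ < 2 ? REPEAT_SEARCH : CARRY_ON; }
static int toy_research(void)  { return 0; }   /* 0 == item still found */

int main(void)
{
    int rc;

    for (;;) {
        rc = toy_fix_nodes();        /* try to collect all resources */
        if (rc != REPEAT_SEARCH)
            break;                   /* CARRY_ON or hard error */
        if (toy_research() != 0)     /* tree changed: search again */
            return 1;
    }
    if (rc == CARRY_ON)
        printf("balance after %d restarts\n", attempts - 1);  /* 2 */
    return 0;
}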
27552 + * Summary Of Mechanisms For Handling Collisions Between Processes:
27554 + * deletion of the body of the object is performed by iput(), with the
27555 + * result that if multiple processes are operating on a file, the
27556 + * deletion of the body of the file is deferred until the last process
27557 + * that has an open inode performs its iput().
27559 + * writes and truncates are protected from collisions by use of
27560 + * semaphores.
27562 + * creates, linking, and mknod are protected from collisions with other
27563 + * processes by making the reiserfs_add_entry() the last step in the
27564 + * creation, and then rolling back all changes if there was a collision.
27565 + * - Hans
27568 +/* this deletes item which never gets split */
27569 +void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
27570 + struct inode *inode, struct reiserfs_key *key)
27572 + struct super_block *sb = th->t_super;
27573 + struct tree_balance tb;
27574 + INITIALIZE_PATH(path);
27575 + int item_len = 0;
27576 + int tb_init = 0;
27577 + struct cpu_key cpu_key = {};
27578 + int retval;
27579 + int quota_cut_bytes = 0;
27581 + BUG_ON(!th->t_trans_id);
27583 + le_key2cpu_key(&cpu_key, key);
27585 + while (1) {
27586 + retval = search_item(th->t_super, &cpu_key, &path);
27587 + if (retval == IO_ERROR) {
27588 + reiserfs_error(th->t_super, "vs-5350",
27589 + "i/o failure occurred trying "
27590 + "to delete %K", &cpu_key);
27591 + break;
27593 + if (retval != ITEM_FOUND) {
27594 + pathrelse(&path);
27595 + /*
27596 + * No need for a warning if there is just no free
27597 + * space to insert the '..' item into the
27598 + * newly-created subdir
27599 + */
27600 + if (!
27601 + ((unsigned long long)
27602 + GET_HASH_VALUE(le_key_k_offset
27603 + (le_key_version(key), key)) == 0
27604 + && (unsigned long long)
27605 + GET_GENERATION_NUMBER(le_key_k_offset
27606 + (le_key_version(key),
27607 + key)) == 1))
27608 + reiserfs_warning(th->t_super, "vs-5355",
27609 + "%k not found", key);
27610 + break;
27612 + if (!tb_init) {
27613 + tb_init = 1;
27614 + item_len = ih_item_len(tp_item_head(&path));
27615 + init_tb_struct(th, &tb, th->t_super, &path,
27616 + -(IH_SIZE + item_len));
27618 + quota_cut_bytes = ih_item_len(tp_item_head(&path));
27620 + retval = fix_nodes(M_DELETE, &tb, NULL, NULL);
27621 + if (retval == REPEAT_SEARCH) {
27622 + PROC_INFO_INC(th->t_super, delete_solid_item_restarted);
27623 + continue;
27626 + if (retval == CARRY_ON) {
27627 + do_balance(&tb, NULL, NULL, M_DELETE);
27628 + /*
27629 + * Should we count quota for item? (we don't
27630 + * count quotas for save-links)
27631 + */
27632 + if (inode) {
27633 + int depth;
27634 +#ifdef REISERQUOTA_DEBUG
27635 + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
27636 + "reiserquota delete_solid_item(): freeing %u id=%u type=%c",
27637 + quota_cut_bytes, inode->i_uid,
27638 + key2type(key));
27639 +#endif
27640 + depth = reiserfs_write_unlock_nested(sb);
27641 + dquot_free_space_nodirty(inode,
27642 + quota_cut_bytes);
27643 + reiserfs_write_lock_nested(sb, depth);
27645 + break;
27648 + /* IO_ERROR, NO_DISK_SPACE, etc */
27649 + reiserfs_warning(th->t_super, "vs-5360",
27650 + "could not delete %K due to fix_nodes failure",
27651 + &cpu_key);
27652 + unfix_nodes(&tb);
27653 + break;
27656 + reiserfs_check_path(&path);
27659 +int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
27660 + struct inode *inode)
27662 + int err;
27663 + inode->i_size = 0;
27664 + BUG_ON(!th->t_trans_id);
27666 + /* for directory this deletes item containing "." and ".." */
27667 + err =
27668 + reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ );
27669 + if (err)
27670 + return err;
27672 +#if defined( USE_INODE_GENERATION_COUNTER )
27673 + if (!old_format_only(th->t_super)) {
27674 + __le32 *inode_generation;
27676 + inode_generation =
27677 + &REISERFS_SB(th->t_super)->s_rs->s_inode_generation;
27678 + le32_add_cpu(inode_generation, 1);
27680 +/* USE_INODE_GENERATION_COUNTER */
27681 +#endif
27682 + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
27684 + return err;
27687 +static void unmap_buffers(struct page *page, loff_t pos)
27689 + struct buffer_head *bh;
27690 + struct buffer_head *head;
27691 + struct buffer_head *next;
27692 + unsigned long tail_index;
27693 + unsigned long cur_index;
27695 + if (page) {
27696 + if (page_has_buffers(page)) {
27697 + tail_index = pos & (PAGE_SIZE - 1);
27698 + cur_index = 0;
27699 + head = page_buffers(page);
27700 + bh = head;
27701 + do {
27702 + next = bh->b_this_page;
27704 + /*
27705 + * we want to unmap the buffers that contain
27706 + * the tail, and all the buffers after it
27707 + * (since the tail must be at the end of the
27708 + * file). We don't want to unmap file data
27709 + * before the tail, since it might be dirty
27710 + * and waiting to reach disk
27711 + */
27712 + cur_index += bh->b_size;
27713 + if (cur_index > tail_index) {
27714 + reiserfs_unmap_buffer(bh);
27716 + bh = next;
27717 + } while (bh != head);
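
Editor's note: the cur_index/tail_index walk above simply selects the page buffers that end at or past the tail position. With hypothetical 1024-byte buffers in a 4096-byte page and a tail at page offset 2500, buffers 2 and 3 get unmapped; a worked sketch (values illustrative only):

#include <stdio.h>

int main(void)
{
    unsigned long page_size = 4096, bh_size = 1024;
    unsigned long tail_index = 2500 & (page_size - 1);
    unsigned long cur_index = 0;
    unsigned i, nr = (unsigned)(page_size / bh_size);

    for (i = 0; i < nr; i++) {
        cur_index += bh_size;          /* end offset of buffer i */
        if (cur_index > tail_index)
            printf("unmap buffer %u\n", i);   /* prints 2 and 3 */
    }
    return 0;
}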
27722 +static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
27723 + struct inode *inode,
27724 + struct page *page,
27725 + struct treepath *path,
27726 + const struct cpu_key *item_key,
27727 + loff_t new_file_size, char *mode)
27729 + struct super_block *sb = inode->i_sb;
27730 + int block_size = sb->s_blocksize;
27731 + int cut_bytes;
27732 + BUG_ON(!th->t_trans_id);
27733 + BUG_ON(new_file_size != inode->i_size);
27735 + /*
27736 + * the page being sent in could be NULL if there was an i/o error
27737 + * reading in the last block. The user will hit problems trying to
27738 + * read the file, but for now we just skip the indirect2direct
27739 + */
27740 + if (atomic_read(&inode->i_count) > 1 ||
27741 + !tail_has_to_be_packed(inode) ||
27742 + !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) {
27743 + /* leave tail in an unformatted node */
27744 + *mode = M_SKIP_BALANCING;
27745 + cut_bytes =
27746 + block_size - (new_file_size & (block_size - 1));
27747 + pathrelse(path);
27748 + return cut_bytes;
27751 + /* Perform the conversion to a direct_item. */
27752 + return indirect2direct(th, inode, page, path, item_key,
27753 + new_file_size, mode);
27757 + * We did an indirect_to_direct conversion and inserted the direct
27758 + * item successfully, but there was no disk space left to cut the unfm
27759 + * pointer being converted. Therefore we have to delete the inserted
27760 + * direct item(s)
27761 + */
27762 +static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
27763 + struct inode *inode, struct treepath *path)
27765 + struct cpu_key tail_key;
27766 + int tail_len;
27767 + int removed;
27768 + BUG_ON(!th->t_trans_id);
27770 + make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);
27771 + tail_key.key_length = 4;
27773 + tail_len =
27774 + (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1;
27775 + while (tail_len) {
27776 + /* look for the last byte of the tail */
27777 + if (search_for_position_by_key(inode->i_sb, &tail_key, path) ==
27778 + POSITION_NOT_FOUND)
27779 + reiserfs_panic(inode->i_sb, "vs-5615",
27780 + "found invalid item");
27781 + RFALSE(path->pos_in_item !=
27782 + ih_item_len(tp_item_head(path)) - 1,
27783 + "vs-5616: appended bytes found");
27784 + PATH_LAST_POSITION(path)--;
27786 + removed =
27787 + reiserfs_delete_item(th, path, &tail_key, inode,
27788 + NULL /*unbh not needed */ );
27789 + RFALSE(removed <= 0
27790 + || removed > tail_len,
27791 + "vs-5617: there was tail %d bytes, removed item length %d bytes",
27792 + tail_len, removed);
27793 + tail_len -= removed;
27794 + set_cpu_key_k_offset(&tail_key,
27795 + cpu_key_k_offset(&tail_key) - removed);
27797 + reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct "
27798 + "conversion has been rolled back due to "
27799 + "lack of disk space");
27800 + mark_inode_dirty(inode);
27803 +/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
27804 +int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
27805 + struct treepath *path,
27806 + struct cpu_key *item_key,
27807 + struct inode *inode,
27808 + struct page *page, loff_t new_file_size)
27810 + struct super_block *sb = inode->i_sb;
27811 + /*
27812 + * Every function which is going to call do_balance must first
27813 + * create a tree_balance structure. Then it must fill up this
27814 + * structure by using the init_tb_struct and fix_nodes functions.
27815 + * After that we can make tree balancing.
27816 + */
27817 + struct tree_balance s_cut_balance;
27818 + struct item_head *p_le_ih;
27819 + int cut_size = 0; /* Amount to be cut. */
27820 + int ret_value = CARRY_ON;
27821 + int removed = 0; /* Number of the removed unformatted nodes. */
27822 + int is_inode_locked = 0;
27823 + char mode; /* Mode of the balance. */
27824 + int retval2 = -1;
27825 + int quota_cut_bytes;
27826 + loff_t tail_pos = 0;
27827 + int depth;
27829 + BUG_ON(!th->t_trans_id);
27831 + init_tb_struct(th, &s_cut_balance, inode->i_sb, path,
27832 + cut_size);
27834 + /*
27835 + * Repeat this loop until we either cut the item without needing
27836 + * to balance, or fix_nodes completes without a schedule occurring
27837 + */
27838 + while (1) {
27839 + /*
27840 + * Determine the balance mode, position of the first byte to
27841 + * be cut, and size to be cut. In case of the indirect item
27842 + * free unformatted nodes which are pointed to by the cut
27843 + * pointers.
27844 + */
27846 + mode =
27847 + prepare_for_delete_or_cut(th, inode, path,
27848 + item_key, &removed,
27849 + &cut_size, new_file_size);
27850 + if (mode == M_CONVERT) {
27851 + /*
27852 + * convert last unformatted node to direct item or
27853 + * leave tail in the unformatted node
27854 + */
27855 + RFALSE(ret_value != CARRY_ON,
27856 + "PAP-5570: can not convert twice");
27858 + ret_value =
27859 + maybe_indirect_to_direct(th, inode, page,
27860 + path, item_key,
27861 + new_file_size, &mode);
27862 + if (mode == M_SKIP_BALANCING)
27863 + /* tail has been left in the unformatted node */
27864 + return ret_value;
27866 + is_inode_locked = 1;
27868 + /*
27869 + * removing of last unformatted node will
27870 + * change value we have to return to truncate.
27871 + * Save it
27872 + */
27873 + retval2 = ret_value;
27875 + /*
27876 + * So, we have performed the first part of the
27877 + * conversion:
27878 + * inserting the new direct item. Now we are
27879 + * removing the last unformatted node pointer.
27880 + * Set key to search for it.
27881 + */
27882 + set_cpu_key_k_type(item_key, TYPE_INDIRECT);
27883 + item_key->key_length = 4;
27884 + new_file_size -=
27885 + (new_file_size & (sb->s_blocksize - 1));
27886 + tail_pos = new_file_size;
27887 + set_cpu_key_k_offset(item_key, new_file_size + 1);
27888 + if (search_for_position_by_key
27889 + (sb, item_key,
27890 + path) == POSITION_NOT_FOUND) {
27891 + print_block(PATH_PLAST_BUFFER(path), 3,
27892 + PATH_LAST_POSITION(path) - 1,
27893 + PATH_LAST_POSITION(path) + 1);
27894 + reiserfs_panic(sb, "PAP-5580", "item to "
27895 + "convert does not exist (%K)",
27896 + item_key);
27898 + continue;
27900 + if (cut_size == 0) {
27901 + pathrelse(path);
27902 + return 0;
27905 + s_cut_balance.insert_size[0] = cut_size;
27907 + ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL);
27908 + if (ret_value != REPEAT_SEARCH)
27909 + break;
27911 + PROC_INFO_INC(sb, cut_from_item_restarted);
27913 + ret_value =
27914 + search_for_position_by_key(sb, item_key, path);
27915 + if (ret_value == POSITION_FOUND)
27916 + continue;
27918 + reiserfs_warning(sb, "PAP-5610", "item %K not found",
27919 + item_key);
27920 + unfix_nodes(&s_cut_balance);
27921 + return (ret_value == IO_ERROR) ? -EIO : -ENOENT;
27922 + } /* while */
27924 + /* check fix_nodes results (IO_ERROR or NO_DISK_SPACE) */
27925 + if (ret_value != CARRY_ON) {
27926 + if (is_inode_locked) {
27927 + /*
27928 + * FIXME: this seems to be not needed: we are always
27929 + * able to cut item
27930 + */
27931 + indirect_to_direct_roll_back(th, inode, path);
27933 + if (ret_value == NO_DISK_SPACE)
27934 + reiserfs_warning(sb, "reiserfs-5092",
27935 + "NO_DISK_SPACE");
27936 + unfix_nodes(&s_cut_balance);
27937 + return -EIO;
27940 + /* go ahead and perform balancing */
27942 + RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode");
27944 + /* Calculate number of bytes that need to be cut from the item. */
27945 + quota_cut_bytes =
27946 + (mode ==
27947 + M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance.
27948 + insert_size[0];
27949 + if (retval2 == -1)
27950 + ret_value = calc_deleted_bytes_number(&s_cut_balance, mode);
27951 + else
27952 + ret_value = retval2;
27954 + /*
27955 + * For direct items, we only change the quota when deleting the last
27956 + * item.
27957 + */
27958 + p_le_ih = tp_item_head(s_cut_balance.tb_path);
27959 + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) {
27960 + if (mode == M_DELETE &&
27961 + (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) ==
27962 + 1) {
27963 + /* FIXME: this is to keep 3.5 happy */
27964 + REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
27965 + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
27966 + } else {
27967 + quota_cut_bytes = 0;
27970 +#ifdef CONFIG_REISERFS_CHECK
27971 + if (is_inode_locked) {
27972 + struct item_head *le_ih =
27973 + tp_item_head(s_cut_balance.tb_path);
27974 + /*
27975 + * we are going to complete indirect2direct conversion. Make
27976 + * sure, that we exactly remove last unformatted node pointer
27977 + * of the item
27978 + */
27979 + if (!is_indirect_le_ih(le_ih))
27980 + reiserfs_panic(sb, "vs-5652",
27981 + "item must be indirect %h", le_ih);
27983 + if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
27984 + reiserfs_panic(sb, "vs-5653", "completing "
27985 + "indirect2direct conversion indirect "
27986 + "item %h being deleted must be of "
27987 + "4 byte long", le_ih);
27989 + if (mode == M_CUT
27990 + && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
27991 + reiserfs_panic(sb, "vs-5654", "can not complete "
27992 + "indirect2direct conversion of %h "
27993 + "(CUT, insert_size==%d)",
27994 + le_ih, s_cut_balance.insert_size[0]);
27996 + /*
27997 + * it would be useful to make sure, that right neighboring
27998 + * item is direct item of this file
27999 + */
28001 +#endif
28003 + do_balance(&s_cut_balance, NULL, NULL, mode);
28004 + if (is_inode_locked) {
28005 + /*
28006 + * we've done an indirect->direct conversion. when the
28007 + * data block was freed, it was removed from the list of
28008 + * blocks that must be flushed before the transaction
28009 + * commits, make sure to unmap and invalidate it
28010 + */
28011 + unmap_buffers(page, tail_pos);
28012 + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
28014 +#ifdef REISERQUOTA_DEBUG
28015 + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
28016 + "reiserquota cut_from_item(): freeing %u id=%u type=%c",
28017 + quota_cut_bytes, inode->i_uid, '?');
28018 +#endif
28019 + depth = reiserfs_write_unlock_nested(sb);
28020 + dquot_free_space_nodirty(inode, quota_cut_bytes);
28021 + reiserfs_write_lock_nested(sb, depth);
28022 + return ret_value;
28025 +static void truncate_directory(struct reiserfs_transaction_handle *th,
28026 + struct inode *inode)
28028 + BUG_ON(!th->t_trans_id);
28029 + if (inode->i_nlink)
28030 + reiserfs_error(inode->i_sb, "vs-5655", "link count != 0");
28032 + set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET);
28033 + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY);
28034 + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
28035 + reiserfs_update_sd(th, inode);
28036 + set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET);
28037 + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA);
28041 + * Truncate file to the new size. Note, this must be called with a
28042 + * transaction already started
28043 + */
28044 +int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
28045 + struct inode *inode, /* ->i_size contains new size */
28046 + struct page *page, /* up to date for last block */
28047 + /*
28048 + * when it is called by file_release to convert
28049 + * the tail - no timestamps should be updated
28050 + */
28051 + int update_timestamps
28054 + INITIALIZE_PATH(s_search_path); /* Path to the current object item. */
28055 + struct item_head *p_le_ih; /* Pointer to an item header. */
28057 + /* Key to search for a previous file item. */
28058 + struct cpu_key s_item_key;
28059 + loff_t file_size, /* Old file size. */
28060 + new_file_size; /* New file size. */
28061 + int deleted; /* Number of deleted or truncated bytes. */
28062 + int retval;
28063 + int err = 0;
28065 + BUG_ON(!th->t_trans_id);
28066 + if (!
28067 + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
28068 + || S_ISLNK(inode->i_mode)))
28069 + return 0;
28071 + /* deletion of directory - no need to update timestamps */
28072 + if (S_ISDIR(inode->i_mode)) {
28073 + truncate_directory(th, inode);
28074 + return 0;
28077 + /* Get new file size. */
28078 + new_file_size = inode->i_size;
28080 + /* FIXME: note, that key type is unimportant here */
28081 + make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode),
28082 + TYPE_DIRECT, 3);
28084 + retval =
28085 + search_for_position_by_key(inode->i_sb, &s_item_key,
28086 + &s_search_path);
28087 + if (retval == IO_ERROR) {
28088 + reiserfs_error(inode->i_sb, "vs-5657",
28089 + "i/o failure occurred trying to truncate %K",
28090 + &s_item_key);
28091 + err = -EIO;
28092 + goto out;
28094 + if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
28095 + reiserfs_error(inode->i_sb, "PAP-5660",
28096 + "wrong result %d of search for %K", retval,
28097 + &s_item_key);
28099 + err = -EIO;
28100 + goto out;
28103 + s_search_path.pos_in_item--;
28105 + /* Get real file size (total length of all file items) */
28106 + p_le_ih = tp_item_head(&s_search_path);
28107 + if (is_statdata_le_ih(p_le_ih))
28108 + file_size = 0;
28109 + else {
28110 + loff_t offset = le_ih_k_offset(p_le_ih);
28111 + int bytes =
28112 + op_bytes_number(p_le_ih, inode->i_sb->s_blocksize);
28114 + /*
28115 + * this may not match the real file size: the file would only
28116 + * have this size if the last direct item had no padding zeros
28117 + * and the last unformatted node had no free space
28118 + */
28119 + file_size = offset + bytes - 1;
28121 + /*
28122 + * if we are doing a full truncate or delete,
28123 + * kick in the read-ahead code
28124 + */
28125 + if (new_file_size == 0)
28126 + s_search_path.reada = PATH_READA | PATH_READA_BACK;
28128 + if (file_size == 0 || file_size < new_file_size) {
28129 + goto update_and_out;
28132 + /* Update key to search for the last file item. */
28133 + set_cpu_key_k_offset(&s_item_key, file_size);
28135 + do {
28136 + /* Cut or delete file item. */
28137 + deleted =
28138 + reiserfs_cut_from_item(th, &s_search_path, &s_item_key,
28139 + inode, page, new_file_size);
28140 + if (deleted < 0) {
28141 + reiserfs_warning(inode->i_sb, "vs-5665",
28142 + "reiserfs_cut_from_item failed");
28143 + reiserfs_check_path(&s_search_path);
28144 + return 0;
28147 + RFALSE(deleted > file_size,
28148 + "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
28149 + deleted, file_size, &s_item_key);
28151 + /* Change key to search the last file item. */
28152 + file_size -= deleted;
28154 + set_cpu_key_k_offset(&s_item_key, file_size);
28156 + /*
28157 + * While there are bytes to truncate and the previous
28158 + * file item is present in the tree.
28159 + */
28161 + /*
28162 + * This loop could take a really long time, and could log
28163 + * many more blocks than a transaction can hold. So, we do
28164 + * a polite journal end here, and if the transaction needs
28165 + * ending, we make sure the file is consistent before ending
28166 + * the current trans and starting a new one
28167 + */
28168 + if (journal_transaction_should_end(th, 0) ||
28169 + reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
28170 + pathrelse(&s_search_path);
28172 + if (update_timestamps) {
28173 + inode_set_mtime_to_ts(inode,
28174 + current_time(inode));
28175 + inode_set_ctime_current(inode);
28177 + reiserfs_update_sd(th, inode);
28179 + err = journal_end(th);
28180 + if (err)
28181 + goto out;
28182 + err = journal_begin(th, inode->i_sb,
28183 + JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4);
28184 + if (err)
28185 + goto out;
28186 + reiserfs_update_inode_transaction(inode);
28188 + } while (file_size > ROUND_UP(new_file_size) &&
28189 + search_for_position_by_key(inode->i_sb, &s_item_key,
28190 + &s_search_path) == POSITION_FOUND);
28192 + RFALSE(file_size > ROUND_UP(new_file_size),
28193 + "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d",
28194 + new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
28196 +update_and_out:
28197 + if (update_timestamps) {
28198 + /* this is truncate, not file closing */
28199 + inode_set_mtime_to_ts(inode, current_time(inode));
28200 + inode_set_ctime_current(inode);
28202 + reiserfs_update_sd(th, inode);
28204 +out:
28205 + pathrelse(&s_search_path);
28206 + return err;
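The chunked-deletion loop above is a general journaling pattern: do one bounded unit of work, and when the transaction runs low on room, bring the file to a consistent state, end the transaction, and open a fresh one. A minimal stand-alone sketch of that pattern; txn_begin, txn_end and txn_nearly_full are hypothetical stand-ins, not the reiserfs journal API:

#include <stdbool.h>
#include <stdio.h>

/* stand-ins for a real journaling API */
static bool txn_nearly_full(int used) { return used >= 4; }
static void txn_end(void)   { puts("commit"); }
static void txn_begin(void) { puts("begin"); }

/* delete `total` units, restarting the transaction politely */
static void chunked_delete(int total)
{
    int used = 0;

    txn_begin();
    while (total > 0) {
        total--;                /* one bounded unit of work */
        used++;
        if (txn_nearly_full(used)) {
            /* the object must be consistent before committing */
            txn_end();
            txn_begin();
            used = 0;
        }
    }
    txn_end();
}

int main(void)
{
    chunked_delete(10);
    return 0;
}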
28209 +#ifdef CONFIG_REISERFS_CHECK
28210 +/* this makes sure that we __append__, not overwrite or add holes */
28211 +static void check_research_for_paste(struct treepath *path,
28212 + const struct cpu_key *key)
28214 + struct item_head *found_ih = tp_item_head(path);
28216 + if (is_direct_le_ih(found_ih)) {
28217 + if (le_ih_k_offset(found_ih) +
28218 + op_bytes_number(found_ih,
28219 + get_last_bh(path)->b_size) !=
28220 + cpu_key_k_offset(key)
28221 + || op_bytes_number(found_ih,
28222 + get_last_bh(path)->b_size) !=
28223 + pos_in_item(path))
28224 + reiserfs_panic(NULL, "PAP-5720", "found direct item "
28225 + "%h or position (%d) does not match "
28226 + "to key %K", found_ih,
28227 + pos_in_item(path), key);
28229 + if (is_indirect_le_ih(found_ih)) {
28230 + if (le_ih_k_offset(found_ih) +
28231 + op_bytes_number(found_ih,
28232 + get_last_bh(path)->b_size) !=
28233 + cpu_key_k_offset(key)
28234 + || I_UNFM_NUM(found_ih) != pos_in_item(path)
28235 + || get_ih_free_space(found_ih) != 0)
28236 + reiserfs_panic(NULL, "PAP-5730", "found indirect "
28237 + "item (%h) or position (%d) does not "
28238 + "match to key (%K)",
28239 + found_ih, pos_in_item(path), key);
28242 +#endif /* CONFIG_REISERFS_CHECK */
28245 + * Paste bytes into the existing item.
28246 + * Returns the number of bytes pasted into the item.
28247 + */
28248 +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
28249 + /* Path to the pasted item. */
28250 + struct treepath *search_path,
28251 + /* Key to search for the needed item. */
28252 + const struct cpu_key *key,
28253 + /* Inode item belongs to */
28254 + struct inode *inode,
28255 + /* Pointer to the bytes to paste. */
28256 + const char *body,
28257 + /* Size of pasted bytes. */
28258 + int pasted_size)
28260 + struct super_block *sb = inode->i_sb;
28261 + struct tree_balance s_paste_balance;
28262 + int retval;
28263 + int fs_gen;
28264 + int depth;
28266 + BUG_ON(!th->t_trans_id);
28268 + fs_gen = get_generation(inode->i_sb);
28270 +#ifdef REISERQUOTA_DEBUG
28271 + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
28272 + "reiserquota paste_into_item(): allocating %u id=%u type=%c",
28273 + pasted_size, inode->i_uid,
28274 + key2type(&key->on_disk_key));
28275 +#endif
28277 + depth = reiserfs_write_unlock_nested(sb);
28278 + retval = dquot_alloc_space_nodirty(inode, pasted_size);
28279 + reiserfs_write_lock_nested(sb, depth);
28280 + if (retval) {
28281 + pathrelse(search_path);
28282 + return retval;
28284 + init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
28285 + pasted_size);
28286 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
28287 + s_paste_balance.key = key->on_disk_key;
28288 +#endif
28290 + /* DQUOT_* can schedule, must check before the fix_nodes */
28291 + if (fs_changed(fs_gen, inode->i_sb)) {
28292 + goto search_again;
28295 + while ((retval =
28296 + fix_nodes(M_PASTE, &s_paste_balance, NULL,
28297 + body)) == REPEAT_SEARCH) {
28298 +search_again:
28299 + /* file system changed while we were in the fix_nodes */
28300 + PROC_INFO_INC(th->t_super, paste_into_item_restarted);
28301 + retval =
28302 + search_for_position_by_key(th->t_super, key,
28303 + search_path);
28304 + if (retval == IO_ERROR) {
28305 + retval = -EIO;
28306 + goto error_out;
28308 + if (retval == POSITION_FOUND) {
28309 + reiserfs_warning(inode->i_sb, "PAP-5710",
28310 + "entry or pasted byte (%K) exists",
28311 + key);
28312 + retval = -EEXIST;
28313 + goto error_out;
28315 +#ifdef CONFIG_REISERFS_CHECK
28316 + check_research_for_paste(search_path, key);
28317 +#endif
28320 + /*
28321 + * Perform balancing after all resources are collected by fix_nodes,
28322 + * and accessing them will not risk triggering schedule.
28323 + */
28324 + if (retval == CARRY_ON) {
28325 + do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE);
28326 + return 0;
28328 + retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
28329 +error_out:
28330 + /* this also releases the path */
28331 + unfix_nodes(&s_paste_balance);
28332 +#ifdef REISERQUOTA_DEBUG
28333 + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
28334 + "reiserquota paste_into_item(): freeing %u id=%u type=%c",
28335 + pasted_size, inode->i_uid,
28336 + key2type(&key->on_disk_key));
28337 +#endif
28338 + depth = reiserfs_write_unlock_nested(sb);
28339 + dquot_free_space_nodirty(inode, pasted_size);
28340 + reiserfs_write_lock_nested(sb, depth);
28341 + return retval;
28345 + * Insert new item into the buffer at the path.
28346 + * th - active transaction handle
28347 + * path - path to the inserted item
28348 + * ih - pointer to the item header to insert
28349 + * body - pointer to the bytes to insert
28350 + */
28351 +int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
28352 + struct treepath *path, const struct cpu_key *key,
28353 + struct item_head *ih, struct inode *inode,
28354 + const char *body)
28356 + struct tree_balance s_ins_balance;
28357 + int retval;
28358 + int fs_gen = 0;
28359 + int quota_bytes = 0;
28361 + BUG_ON(!th->t_trans_id);
28363 + if (inode) { /* Do we count quotas for item? */
28364 + int depth;
28365 + fs_gen = get_generation(inode->i_sb);
28366 + quota_bytes = ih_item_len(ih);
28368 + /*
28369 + * hack so the quota code doesn't have to guess
28370 + * if the file has a tail: links are always tails,
28371 + * so there's no guessing needed
28372 + */
28373 + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih))
28374 + quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE;
28375 +#ifdef REISERQUOTA_DEBUG
28376 + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
28377 + "reiserquota insert_item(): allocating %u id=%u type=%c",
28378 + quota_bytes, inode->i_uid, head2type(ih));
28379 +#endif
28380 + /*
28381 + * We can't dirty the inode here; it would be written
28382 + * immediately, but the appropriate stat item isn't inserted yet...
28383 + */
28384 + depth = reiserfs_write_unlock_nested(inode->i_sb);
28385 + retval = dquot_alloc_space_nodirty(inode, quota_bytes);
28386 + reiserfs_write_lock_nested(inode->i_sb, depth);
28387 + if (retval) {
28388 + pathrelse(path);
28389 + return retval;
28392 + init_tb_struct(th, &s_ins_balance, th->t_super, path,
28393 + IH_SIZE + ih_item_len(ih));
28394 +#ifdef DISPLACE_NEW_PACKING_LOCALITIES
28395 + s_ins_balance.key = key->on_disk_key;
28396 +#endif
28397 + /*
28398 + * DQUOT_* can schedule, must check to be sure calling
28399 + * fix_nodes is safe
28400 + */
28401 + if (inode && fs_changed(fs_gen, inode->i_sb)) {
28402 + goto search_again;
28405 + while ((retval =
28406 + fix_nodes(M_INSERT, &s_ins_balance, ih,
28407 + body)) == REPEAT_SEARCH) {
28408 +search_again:
28409 + /* file system changed while we were in the fix_nodes */
28410 + PROC_INFO_INC(th->t_super, insert_item_restarted);
28411 + retval = search_item(th->t_super, key, path);
28412 + if (retval == IO_ERROR) {
28413 + retval = -EIO;
28414 + goto error_out;
28416 + if (retval == ITEM_FOUND) {
28417 + reiserfs_warning(th->t_super, "PAP-5760",
28418 + "key %K already exists in the tree",
28419 + key);
28420 + retval = -EEXIST;
28421 + goto error_out;
28425 + /* perform balancing once all resources have been collected by fix_nodes */
28426 + if (retval == CARRY_ON) {
28427 + do_balance(&s_ins_balance, ih, body, M_INSERT);
28428 + return 0;
28431 + retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
28432 +error_out:
28433 + /* also releases the path */
28434 + unfix_nodes(&s_ins_balance);
28435 +#ifdef REISERQUOTA_DEBUG
28436 + if (inode)
28437 + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
28438 + "reiserquota insert_item(): freeing %u id=%u type=%c",
28439 + quota_bytes, inode->i_uid, head2type(ih));
28440 +#endif
28441 + if (inode) {
28442 + int depth = reiserfs_write_unlock_nested(inode->i_sb);
28443 + dquot_free_space_nodirty(inode, quota_bytes);
28444 + reiserfs_write_lock_nested(inode->i_sb, depth);
28446 + return retval;
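reiserfs_paste_into_item() and reiserfs_insert_item() above share one optimistic-concurrency shape: snapshot a generation counter, perform calls that may sleep, and re-run the search whenever the tree changed underneath (REPEAT_SEARCH). A toy sketch of that shape under simplified assumptions; search and fix_nodes here are stand-ins, not the kernel functions:

#include <stdio.h>

static int generation;          /* bumped whenever the "tree" changes */

static void search(void) { puts("search_by_key"); }

/* stand-in for fix_nodes(): fails once to force a re-search */
static int fix_nodes(void)
{
    static int raced;
    if (!raced++)
        return -1;              /* REPEAT_SEARCH */
    return 0;                   /* CARRY_ON */
}

int main(void)
{
    int gen = generation;

    search();
    generation++;               /* a sleeping call (e.g. quota alloc) raced */
    if (gen != generation)      /* fs_changed(): the path may be stale */
        search();
    while (fix_nodes() != 0)
        search();               /* tree changed under us: search again */
    puts("do_balance");
    return 0;
}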
28448 diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
28449 new file mode 100644
28450 index 000000000000..ab76468da02d
28451 --- /dev/null
28452 +++ b/fs/reiserfs/super.c
28453 @@ -0,0 +1,2646 @@
28455 + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
28457 + * Trivial changes by Alan Cox to add the LFS fixes
28459 + * Trivial Changes:
28460 + * Rights granted to Hans Reiser to redistribute under other terms providing
28461 + * he accepts all liability including but not limited to patent, fitness
28462 + * for purpose, and direct or indirect claims arising from failure to perform.
28464 + * NO WARRANTY
28465 + */
28467 +#include <linux/module.h>
28468 +#include <linux/slab.h>
28469 +#include <linux/vmalloc.h>
28470 +#include <linux/time.h>
28471 +#include <linux/uaccess.h>
28472 +#include "reiserfs.h"
28473 +#include "acl.h"
28474 +#include "xattr.h"
28475 +#include <linux/init.h>
28476 +#include <linux/blkdev.h>
28477 +#include <linux/backing-dev.h>
28478 +#include <linux/buffer_head.h>
28479 +#include <linux/exportfs.h>
28480 +#include <linux/quotaops.h>
28481 +#include <linux/vfs.h>
28482 +#include <linux/mount.h>
28483 +#include <linux/namei.h>
28484 +#include <linux/crc32.h>
28485 +#include <linux/seq_file.h>
28487 +struct file_system_type reiserfs_fs_type;
28489 +static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING;
28490 +static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING;
28491 +static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING;
28493 +int is_reiserfs_3_5(struct reiserfs_super_block *rs)
28495 + return !strncmp(rs->s_v1.s_magic, reiserfs_3_5_magic_string,
28496 + strlen(reiserfs_3_5_magic_string));
28499 +int is_reiserfs_3_6(struct reiserfs_super_block *rs)
28501 + return !strncmp(rs->s_v1.s_magic, reiserfs_3_6_magic_string,
28502 + strlen(reiserfs_3_6_magic_string));
28505 +int is_reiserfs_jr(struct reiserfs_super_block *rs)
28507 + return !strncmp(rs->s_v1.s_magic, reiserfs_jr_magic_string,
28508 + strlen(reiserfs_jr_magic_string));
28511 +static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
28513 + return (is_reiserfs_3_5(rs) || is_reiserfs_3_6(rs) ||
28514 + is_reiserfs_jr(rs));
28517 +static int reiserfs_remount(struct super_block *s, int *flags, char *data);
28518 +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
28520 +static int reiserfs_sync_fs(struct super_block *s, int wait)
28522 + struct reiserfs_transaction_handle th;
28524 + /*
28525 + * Writeback quota in non-journalled quota case - journalled quota has
28526 + * no dirty dquots
28527 + */
28528 + dquot_writeback_dquots(s, -1);
28529 + reiserfs_write_lock(s);
28530 + if (!journal_begin(&th, s, 1))
28531 + if (!journal_end_sync(&th))
28532 + reiserfs_flush_old_commits(s);
28533 + reiserfs_write_unlock(s);
28534 + return 0;
28537 +static void flush_old_commits(struct work_struct *work)
28539 + struct reiserfs_sb_info *sbi;
28540 + struct super_block *s;
28542 + sbi = container_of(work, struct reiserfs_sb_info, old_work.work);
28543 + s = sbi->s_journal->j_work_sb;
28545 + /*
28546 + * We need s_umount for protecting quota writeback. We have to use
28547 + * trylock as reiserfs_cancel_old_flush() may be waiting for this work
28548 + * to complete with s_umount held.
28549 + */
28550 + if (!down_read_trylock(&s->s_umount)) {
28551 + /* Requeue work if we are not cancelling it */
28552 + spin_lock(&sbi->old_work_lock);
28553 + if (sbi->work_queued == 1)
28554 + queue_delayed_work(system_long_wq, &sbi->old_work, HZ);
28555 + spin_unlock(&sbi->old_work_lock);
28556 + return;
28558 + spin_lock(&sbi->old_work_lock);
28559 + /* Avoid clobbering the cancel state... */
28560 + if (sbi->work_queued == 1)
28561 + sbi->work_queued = 0;
28562 + spin_unlock(&sbi->old_work_lock);
28564 + reiserfs_sync_fs(s, 1);
28565 + up_read(&s->s_umount);
28568 +void reiserfs_schedule_old_flush(struct super_block *s)
28570 + struct reiserfs_sb_info *sbi = REISERFS_SB(s);
28571 + unsigned long delay;
28573 + /*
28574 + * Avoid scheduling a flush when the sb is being shut down. It can
28575 + * race with journal shutdown and free still-queued delayed work.
28576 + */
28577 + if (sb_rdonly(s) || !(s->s_flags & SB_ACTIVE))
28578 + return;
28580 + spin_lock(&sbi->old_work_lock);
28581 + if (!sbi->work_queued) {
28582 + delay = msecs_to_jiffies(dirty_writeback_interval * 10);
28583 + queue_delayed_work(system_long_wq, &sbi->old_work, delay);
28584 + sbi->work_queued = 1;
28586 + spin_unlock(&sbi->old_work_lock);
28589 +void reiserfs_cancel_old_flush(struct super_block *s)
28591 + struct reiserfs_sb_info *sbi = REISERFS_SB(s);
28593 + spin_lock(&sbi->old_work_lock);
28594 + /* Make sure no new flushes will be queued */
28595 + sbi->work_queued = 2;
28596 + spin_unlock(&sbi->old_work_lock);
28597 + cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
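work_queued above acts as a small state machine guarded by old_work_lock: 0 = idle, 1 = queued, 2 = cancelling, and nothing may requeue once cancelling is set. A compact single-threaded sketch of the same idea (the enum names are illustrative; the kernel uses plain integers and a spinlock):

#include <stdio.h>

enum flush_state { IDLE = 0, QUEUED = 1, CANCELLING = 2 };

static enum flush_state state = IDLE;   /* protected by a lock in real code */

static void schedule_flush(void)
{
    if (state == IDLE) {        /* only queue when idle */
        state = QUEUED;
        puts("queued");
    }
}

static void flush_worker(void)
{
    if (state == QUEUED)        /* don't clobber the CANCELLING state */
        state = IDLE;
    puts("flushed");
}

static void cancel_flush(void)
{
    state = CANCELLING;         /* no new flushes may be queued */
    puts("cancelled");
}

int main(void)
{
    schedule_flush();
    flush_worker();
    cancel_flush();
    schedule_flush();           /* ignored: state != IDLE */
    return 0;
}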
28600 +static int reiserfs_freeze(struct super_block *s)
28602 + struct reiserfs_transaction_handle th;
28604 + reiserfs_cancel_old_flush(s);
28606 + reiserfs_write_lock(s);
28607 + if (!sb_rdonly(s)) {
28608 + int err = journal_begin(&th, s, 1);
28609 + if (err) {
28610 + reiserfs_block_writes(&th);
28611 + } else {
28612 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
28613 + 1);
28614 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
28615 + reiserfs_block_writes(&th);
28616 + journal_end_sync(&th);
28619 + reiserfs_write_unlock(s);
28620 + return 0;
28623 +static int reiserfs_unfreeze(struct super_block *s)
28625 + struct reiserfs_sb_info *sbi = REISERFS_SB(s);
28627 + reiserfs_allow_writes(s);
28628 + spin_lock(&sbi->old_work_lock);
28629 + /* Allow old_work to run again */
28630 + sbi->work_queued = 0;
28631 + spin_unlock(&sbi->old_work_lock);
28632 + return 0;
28635 +extern const struct in_core_key MAX_IN_CORE_KEY;
28638 + * this is used to delete a "save link" when there are no items left
28639 + * of the file it points to. That can happen either if the unlink
28640 + * completed but the "save link" removal did not, or if the file had
28641 + * both an unlink and a truncate pending and the unlink completed
28642 + * first (because the key of the "save link" protecting the unlink is
28643 + * bigger than the key of the "save link" protecting the truncate),
28644 + * so no items were left for the truncate to complete on
28645 + */
28646 +static int remove_save_link_only(struct super_block *s,
28647 + struct reiserfs_key *key, int oid_free)
28649 + struct reiserfs_transaction_handle th;
28650 + int err;
28652 + /* we are going to do one balancing */
28653 + err = journal_begin(&th, s, JOURNAL_PER_BALANCE_CNT);
28654 + if (err)
28655 + return err;
28657 + reiserfs_delete_solid_item(&th, NULL, key);
28658 + if (oid_free)
28659 + /* removals are protected by direct items */
28660 + reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid));
28662 + return journal_end(&th);
28665 +#ifdef CONFIG_QUOTA
28666 +static int reiserfs_quota_on_mount(struct super_block *, int);
28667 +#endif
28670 + * Look for uncompleted unlinks and truncates and complete them
28672 + * Called with the superblock write locked. If quotas are enabled, we have
28673 + * to release/retake the lock, lest we call dquot_quota_on_mount(), proceed to
28674 + * schedule_on_each_cpu() in invalidate_bdev() and deadlock waiting for the per
28675 + * cpu worklets to complete flush_async_commits() that in turn wait for the
28676 + * superblock write lock.
28677 + */
28678 +static int finish_unfinished(struct super_block *s)
28680 + INITIALIZE_PATH(path);
28681 + struct cpu_key max_cpu_key, obj_key;
28682 + struct reiserfs_key save_link_key, last_inode_key;
28683 + int retval = 0;
28684 + struct item_head *ih;
28685 + struct buffer_head *bh;
28686 + int item_pos;
28687 + char *item;
28688 + int done;
28689 + struct inode *inode;
28690 + int truncate;
28691 +#ifdef CONFIG_QUOTA
28692 + int i;
28693 + int ms_active_set;
28694 + int quota_enabled[REISERFS_MAXQUOTAS];
28695 +#endif
28697 + /* compose key to look for "save" links */
28698 + max_cpu_key.version = KEY_FORMAT_3_5;
28699 + max_cpu_key.on_disk_key.k_dir_id = ~0U;
28700 + max_cpu_key.on_disk_key.k_objectid = ~0U;
28701 + set_cpu_key_k_offset(&max_cpu_key, ~0U);
28702 + max_cpu_key.key_length = 3;
28704 + memset(&last_inode_key, 0, sizeof(last_inode_key));
28706 +#ifdef CONFIG_QUOTA
28707 + /* Needed for iput() to work correctly and not trash data */
28708 + if (s->s_flags & SB_ACTIVE) {
28709 + ms_active_set = 0;
28710 + } else {
28711 + ms_active_set = 1;
28712 + s->s_flags |= SB_ACTIVE;
28714 + /* Turn on quotas so that they are updated correctly */
28715 + for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
28716 + quota_enabled[i] = 1;
28717 + if (REISERFS_SB(s)->s_qf_names[i]) {
28718 + int ret;
28720 + if (sb_has_quota_active(s, i)) {
28721 + quota_enabled[i] = 0;
28722 + continue;
28724 + reiserfs_write_unlock(s);
28725 + ret = reiserfs_quota_on_mount(s, i);
28726 + reiserfs_write_lock(s);
28727 + if (ret < 0)
28728 + reiserfs_warning(s, "reiserfs-2500",
28729 + "cannot turn on journaled "
28730 + "quota: error %d", ret);
28733 +#endif
28735 + done = 0;
28736 + REISERFS_SB(s)->s_is_unlinked_ok = 1;
28737 + while (!retval) {
28738 + int depth;
28739 + retval = search_item(s, &max_cpu_key, &path);
28740 + if (retval != ITEM_NOT_FOUND) {
28741 + reiserfs_error(s, "vs-2140",
28742 + "search_by_key returned %d", retval);
28743 + break;
28746 + bh = get_last_bh(&path);
28747 + item_pos = get_item_pos(&path);
28748 + if (item_pos != B_NR_ITEMS(bh)) {
28749 + reiserfs_warning(s, "vs-2060",
28750 + "wrong position found");
28751 + break;
28753 + item_pos--;
28754 + ih = item_head(bh, item_pos);
28756 + if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID)
28757 + /* there are no "save" links anymore */
28758 + break;
28760 + save_link_key = ih->ih_key;
28761 + if (is_indirect_le_ih(ih))
28762 + truncate = 1;
28763 + else
28764 + truncate = 0;
28766 + /* reiserfs_iget needs k_dirid and k_objectid only */
28767 + item = ih_item_body(bh, ih);
28768 + obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item);
28769 + obj_key.on_disk_key.k_objectid =
28770 + le32_to_cpu(ih->ih_key.k_objectid);
28771 + obj_key.on_disk_key.k_offset = 0;
28772 + obj_key.on_disk_key.k_type = 0;
28774 + pathrelse(&path);
28776 + inode = reiserfs_iget(s, &obj_key);
28777 + if (IS_ERR_OR_NULL(inode)) {
28778 + /*
28779 + * the unlink almost completed; it just did not
28780 + * manage to remove the "save" link and release the objectid
28781 + */
28782 + reiserfs_warning(s, "vs-2180", "iget failed for %K",
28783 + &obj_key);
28784 + retval = remove_save_link_only(s, &save_link_key, 1);
28785 + continue;
28788 + if (!truncate && inode->i_nlink) {
28789 + /* file is not unlinked */
28790 + reiserfs_warning(s, "vs-2185",
28791 + "file %K is not unlinked",
28792 + &obj_key);
28793 + retval = remove_save_link_only(s, &save_link_key, 0);
28794 + continue;
28796 + depth = reiserfs_write_unlock_nested(inode->i_sb);
28797 + dquot_initialize(inode);
28798 + reiserfs_write_lock_nested(inode->i_sb, depth);
28800 + if (truncate && S_ISDIR(inode->i_mode)) {
28801 + /*
28802 + * We got a truncate request for a dir, which
28803 + * is impossible. The only imaginable way is to
28804 + * execute an unfinished truncate request, then boot
28805 + * into an old kernel, remove the file, and create a dir
28806 + * with the same key.
28807 + */
28808 + reiserfs_warning(s, "green-2101",
28809 + "impossible truncate on a "
28810 + "directory %k. Please report",
28811 + INODE_PKEY(inode));
28812 + retval = remove_save_link_only(s, &save_link_key, 0);
28813 + truncate = 0;
28814 + iput(inode);
28815 + continue;
28818 + if (truncate) {
28819 + REISERFS_I(inode)->i_flags |=
28820 + i_link_saved_truncate_mask;
28821 + /*
28822 + * an uncompleted truncate was found. The new size was
28823 + * committed together with the "save" link
28824 + */
28825 + reiserfs_info(s, "Truncating %k to %lld ..",
28826 + INODE_PKEY(inode), inode->i_size);
28828 + /* don't update modification time */
28829 + reiserfs_truncate_file(inode, 0);
28831 + retval = remove_save_link(inode, truncate);
28832 + } else {
28833 + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
28834 + /* an uncompleted unlink (rmdir) was found */
28835 + reiserfs_info(s, "Removing %k..", INODE_PKEY(inode));
28836 + if (memcmp(&last_inode_key, INODE_PKEY(inode),
28837 + sizeof(last_inode_key))){
28838 + last_inode_key = *INODE_PKEY(inode);
28839 + /* removal gets completed in iput */
28840 + retval = 0;
28841 + } else {
28842 + reiserfs_warning(s, "super-2189", "Dead loop "
28843 + "in finish_unfinished "
28844 + "detected, just remove "
28845 + "save link\n");
28846 + retval = remove_save_link_only(s,
28847 + &save_link_key, 0);
28851 + iput(inode);
28852 + printk("done\n");
28853 + done++;
28855 + REISERFS_SB(s)->s_is_unlinked_ok = 0;
28857 +#ifdef CONFIG_QUOTA
28858 + /* Turn quotas off */
28859 + reiserfs_write_unlock(s);
28860 + for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
28861 + if (sb_dqopt(s)->files[i] && quota_enabled[i])
28862 + dquot_quota_off(s, i);
28864 + reiserfs_write_lock(s);
28865 + if (ms_active_set)
28866 + /* Restore the flag back */
28867 + s->s_flags &= ~SB_ACTIVE;
28868 +#endif
28869 + pathrelse(&path);
28870 + if (done)
28871 + reiserfs_info(s, "There were %d uncompleted unlinks/truncates. "
28872 + "Completed\n", done);
28873 + return retval;
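finish_unfinished() above locates save links by searching for the maximal possible key and stepping back one item; since every save link carries dir_id == MAX_KEY_OBJECTID, they all cluster at the very end of the sorted tree. A toy version of that end-of-array scan; the sentinel value and array stand in for the on-disk tree:

#include <stdio.h>

#define SENTINEL 0xffffu        /* plays the role of MAX_KEY_OBJECTID */

int main(void)
{
    /* sorted "tree": ordinary keys first, save links at the end */
    unsigned keys[] = { 7, 42, 99, SENTINEL, SENTINEL };
    int n = sizeof(keys) / sizeof(keys[0]);

    /* repeatedly look at the last item; stop at the first real key */
    while (n > 0 && keys[n - 1] == SENTINEL) {
        printf("processing save link #%d\n", n - 1);
        n--;                    /* in the fs this is a delete + re-search */
    }
    puts("no save links left");
    return 0;
}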
28877 + * to protect a file being unlinked from getting lost, we "save"-link files
28878 + * being unlinked. This link is deleted in the same transaction as the last
28879 + * item of the file. While mounting the filesystem we scan all these links
28880 + * and remove the files which almost got lost
28881 + */
28882 +void add_save_link(struct reiserfs_transaction_handle *th,
28883 + struct inode *inode, int truncate)
28885 + INITIALIZE_PATH(path);
28886 + int retval;
28887 + struct cpu_key key;
28888 + struct item_head ih;
28889 + __le32 link;
28891 + BUG_ON(!th->t_trans_id);
28893 + /* file can only get one "save link" of each kind */
28894 + RFALSE(truncate &&
28895 + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask),
28896 + "saved link already exists for truncated inode %lx",
28897 + (long)inode->i_ino);
28898 + RFALSE(!truncate &&
28899 + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask),
28900 + "saved link already exists for unlinked inode %lx",
28901 + (long)inode->i_ino);
28903 + /* setup key of "save" link */
28904 + key.version = KEY_FORMAT_3_5;
28905 + key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID;
28906 + key.on_disk_key.k_objectid = inode->i_ino;
28907 + if (!truncate) {
28908 + /* unlink, rmdir, rename */
28909 + set_cpu_key_k_offset(&key, 1 + inode->i_sb->s_blocksize);
28910 + set_cpu_key_k_type(&key, TYPE_DIRECT);
28912 + /* item head of "save" link */
28913 + make_le_item_head(&ih, &key, key.version,
28914 + 1 + inode->i_sb->s_blocksize, TYPE_DIRECT,
28915 + 4 /*length */ , 0xffff /*free space */ );
28916 + } else {
28917 + /* truncate */
28918 + if (S_ISDIR(inode->i_mode))
28919 + reiserfs_warning(inode->i_sb, "green-2102",
28920 + "Adding a truncate savelink for "
28921 + "a directory %k! Please report",
28922 + INODE_PKEY(inode));
28923 + set_cpu_key_k_offset(&key, 1);
28924 + set_cpu_key_k_type(&key, TYPE_INDIRECT);
28926 + /* item head of "save" link */
28927 + make_le_item_head(&ih, &key, key.version, 1, TYPE_INDIRECT,
28928 + 4 /*length */ , 0 /*free space */ );
28930 + key.key_length = 3;
28932 + /* look for its place in the tree */
28933 + retval = search_item(inode->i_sb, &key, &path);
28934 + if (retval != ITEM_NOT_FOUND) {
28935 + if (retval != -ENOSPC)
28936 + reiserfs_error(inode->i_sb, "vs-2100",
28937 + "search_by_key (%K) returned %d", &key,
28938 + retval);
28939 + pathrelse(&path);
28940 + return;
28943 + /* body of "save" link */
28944 + link = INODE_PKEY(inode)->k_dir_id;
28946 + /* put "save" link into tree, don't charge quota to anyone */
28947 + retval =
28948 + reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link);
28949 + if (retval) {
28950 + if (retval != -ENOSPC)
28951 + reiserfs_error(inode->i_sb, "vs-2120",
28952 + "insert_item returned %d", retval);
28953 + } else {
28954 + if (truncate)
28955 + REISERFS_I(inode)->i_flags |=
28956 + i_link_saved_truncate_mask;
28957 + else
28958 + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
28962 +/* this opens a transaction, unlike add_save_link */
28963 +int remove_save_link(struct inode *inode, int truncate)
28965 + struct reiserfs_transaction_handle th;
28966 + struct reiserfs_key key;
28967 + int err;
28969 + /* we are going to do one balancing only */
28970 + err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
28971 + if (err)
28972 + return err;
28974 + /* setup key of "save" link */
28975 + key.k_dir_id = cpu_to_le32(MAX_KEY_OBJECTID);
28976 + key.k_objectid = INODE_PKEY(inode)->k_objectid;
28977 + if (!truncate) {
28978 + /* unlink, rmdir, rename */
28979 + set_le_key_k_offset(KEY_FORMAT_3_5, &key,
28980 + 1 + inode->i_sb->s_blocksize);
28981 + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_DIRECT);
28982 + } else {
28983 + /* truncate */
28984 + set_le_key_k_offset(KEY_FORMAT_3_5, &key, 1);
28985 + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_INDIRECT);
28988 + if ((truncate &&
28989 + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask)) ||
28990 + (!truncate &&
28991 + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask)))
28992 + /* don't take quota bytes from anywhere */
28993 + reiserfs_delete_solid_item(&th, NULL, &key);
28994 + if (!truncate) {
28995 + reiserfs_release_objectid(&th, inode->i_ino);
28996 + REISERFS_I(inode)->i_flags &= ~i_link_saved_unlink_mask;
28997 + } else
28998 + REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask;
29000 + return journal_end(&th);
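The save-link key convention used by add_save_link() and remove_save_link() above fits in a few lines: dir_id is pinned to MAX_KEY_OBJECTID so the links sort last, objectid is the inode's, and the offset/type pair distinguishes unlink (blocksize + 1, direct) from truncate (1, indirect), which is also why the unlink link's key compares greater. A stand-alone illustration with simplified types, not the on-disk structs:

#include <stdint.h>
#include <stdio.h>

#define MAX_KEY_OBJECTID 0xffffffffu    /* sorts after every real file */

struct save_link_key {                  /* simplified, not the on-disk layout */
    uint32_t dir_id;
    uint32_t objectid;
    uint64_t offset;
    int      is_indirect;               /* indirect item => truncate link */
};

static struct save_link_key make_save_link(uint32_t ino, int truncate,
                                           unsigned blocksize)
{
    struct save_link_key k = {
        .dir_id      = MAX_KEY_OBJECTID,
        .objectid    = ino,
        .offset      = truncate ? 1 : 1 + blocksize,
        .is_indirect = truncate,
    };
    return k;
}

int main(void)
{
    struct save_link_key u = make_save_link(42, 0, 4096);
    struct save_link_key t = make_save_link(42, 1, 4096);

    /* the unlink key sorts above the truncate key for the same inode */
    printf("unlink offset %llu > truncate offset %llu\n",
           (unsigned long long)u.offset, (unsigned long long)t.offset);
    return 0;
}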
29003 +static void reiserfs_kill_sb(struct super_block *s)
29005 + if (REISERFS_SB(s)) {
29006 + reiserfs_proc_info_done(s);
29007 + /*
29008 + * Force any pending inode evictions to occur now. Any
29009 + * inodes to be removed that have extended attributes
29010 + * associated with them need to clean them up before
29011 + * we can release the extended attribute root dentries.
29012 + * shrink_dcache_for_umount will BUG if we don't release
29013 + * those before it's called so ->put_super is too late.
29014 + */
29015 + shrink_dcache_sb(s);
29017 + dput(REISERFS_SB(s)->xattr_root);
29018 + REISERFS_SB(s)->xattr_root = NULL;
29019 + dput(REISERFS_SB(s)->priv_root);
29020 + REISERFS_SB(s)->priv_root = NULL;
29023 + kill_block_super(s);
29026 +#ifdef CONFIG_QUOTA
29027 +static int reiserfs_quota_off(struct super_block *sb, int type);
29029 +static void reiserfs_quota_off_umount(struct super_block *s)
29031 + int type;
29033 + for (type = 0; type < REISERFS_MAXQUOTAS; type++)
29034 + reiserfs_quota_off(s, type);
29036 +#else
29037 +static inline void reiserfs_quota_off_umount(struct super_block *s)
29040 +#endif
29042 +static void reiserfs_put_super(struct super_block *s)
29044 + struct reiserfs_transaction_handle th;
29045 + th.t_trans_id = 0;
29047 + reiserfs_quota_off_umount(s);
29049 + reiserfs_write_lock(s);
29051 + /*
29052 + * change file system state to current state if it was mounted
29053 + * with read-write permissions
29054 + */
29055 + if (!sb_rdonly(s)) {
29056 + if (!journal_begin(&th, s, 10)) {
29057 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
29058 + 1);
29059 + set_sb_umount_state(SB_DISK_SUPER_BLOCK(s),
29060 + REISERFS_SB(s)->s_mount_state);
29061 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
29065 + /*
29066 + * note, journal_release checks for readonly mount, and can
29067 + * decide not to do a journal_end
29068 + */
29069 + journal_release(&th, s);
29071 + reiserfs_free_bitmap_cache(s);
29073 + brelse(SB_BUFFER_WITH_SB(s));
29075 + print_statistics(s);
29077 + if (REISERFS_SB(s)->reserved_blocks != 0) {
29078 + reiserfs_warning(s, "green-2005", "reserved blocks left %d",
29079 + REISERFS_SB(s)->reserved_blocks);
29082 + reiserfs_write_unlock(s);
29083 + mutex_destroy(&REISERFS_SB(s)->lock);
29084 + destroy_workqueue(REISERFS_SB(s)->commit_wq);
29085 + kfree(REISERFS_SB(s)->s_jdev);
29086 + kfree(s->s_fs_info);
29087 + s->s_fs_info = NULL;
29090 +static struct kmem_cache *reiserfs_inode_cachep;
29092 +static struct inode *reiserfs_alloc_inode(struct super_block *sb)
29094 + struct reiserfs_inode_info *ei;
29095 + ei = alloc_inode_sb(sb, reiserfs_inode_cachep, GFP_KERNEL);
29096 + if (!ei)
29097 + return NULL;
29098 + atomic_set(&ei->openers, 0);
29099 + mutex_init(&ei->tailpack);
29100 +#ifdef CONFIG_QUOTA
29101 + memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
29102 +#endif
29104 + return &ei->vfs_inode;
29107 +static void reiserfs_free_inode(struct inode *inode)
29109 + kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
29112 +static void init_once(void *foo)
29114 + struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
29116 + INIT_LIST_HEAD(&ei->i_prealloc_list);
29117 + inode_init_once(&ei->vfs_inode);
29120 +static int __init init_inodecache(void)
29122 + reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
29123 + sizeof(struct
29124 + reiserfs_inode_info),
29125 + 0, (SLAB_RECLAIM_ACCOUNT|
29126 + SLAB_ACCOUNT),
29127 + init_once);
29128 + if (reiserfs_inode_cachep == NULL)
29129 + return -ENOMEM;
29130 + return 0;
29133 +static void destroy_inodecache(void)
29135 + /*
29136 + * Make sure all delayed rcu free inodes are flushed before we
29137 + * destroy cache.
29138 + */
29139 + rcu_barrier();
29140 + kmem_cache_destroy(reiserfs_inode_cachep);
29143 +/* we don't mark inodes dirty, we just log them */
29144 +static void reiserfs_dirty_inode(struct inode *inode, int flags)
29146 + struct reiserfs_transaction_handle th;
29148 + int err = 0;
29150 + if (sb_rdonly(inode->i_sb)) {
29151 + reiserfs_warning(inode->i_sb, "clm-6006",
29152 + "writing inode %lu on readonly FS",
29153 + inode->i_ino);
29154 + return;
29156 + reiserfs_write_lock(inode->i_sb);
29158 + /*
29159 + * this is really only used for atime updates, so they don't have
29160 + * to be included in O_SYNC or fsync
29161 + */
29162 + err = journal_begin(&th, inode->i_sb, 1);
29163 + if (err)
29164 + goto out;
29166 + reiserfs_update_sd(&th, inode);
29167 + journal_end(&th);
29169 +out:
29170 + reiserfs_write_unlock(inode->i_sb);
29173 +static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
29175 + struct super_block *s = root->d_sb;
29176 + struct reiserfs_journal *journal = SB_JOURNAL(s);
29177 + long opts = REISERFS_SB(s)->s_mount_opt;
29179 + if (opts & (1 << REISERFS_LARGETAIL))
29180 + seq_puts(seq, ",tails=on");
29181 + else if (!(opts & (1 << REISERFS_SMALLTAIL)))
29182 + seq_puts(seq, ",notail");
29183 + /* tails=small is default so we don't show it */
29185 + if (!(opts & (1 << REISERFS_BARRIER_FLUSH)))
29186 + seq_puts(seq, ",barrier=none");
29187 + /* barrier=flush is default so we don't show it */
29189 + if (opts & (1 << REISERFS_ERROR_CONTINUE))
29190 + seq_puts(seq, ",errors=continue");
29191 + else if (opts & (1 << REISERFS_ERROR_PANIC))
29192 + seq_puts(seq, ",errors=panic");
29193 + /* errors=ro is default so we don't show it */
29195 + if (opts & (1 << REISERFS_DATA_LOG))
29196 + seq_puts(seq, ",data=journal");
29197 + else if (opts & (1 << REISERFS_DATA_WRITEBACK))
29198 + seq_puts(seq, ",data=writeback");
29199 + /* data=ordered is default so we don't show it */
29201 + if (opts & (1 << REISERFS_ATTRS))
29202 + seq_puts(seq, ",attrs");
29204 + if (opts & (1 << REISERFS_XATTRS_USER))
29205 + seq_puts(seq, ",user_xattr");
29207 + if (opts & (1 << REISERFS_EXPOSE_PRIVROOT))
29208 + seq_puts(seq, ",expose_privroot");
29210 + if (opts & (1 << REISERFS_POSIXACL))
29211 + seq_puts(seq, ",acl");
29213 + if (REISERFS_SB(s)->s_jdev)
29214 + seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev);
29216 + if (journal->j_max_commit_age != journal->j_default_max_commit_age)
29217 + seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
29219 +#ifdef CONFIG_QUOTA
29220 + if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
29221 + seq_show_option(seq, "usrjquota",
29222 + REISERFS_SB(s)->s_qf_names[USRQUOTA]);
29223 + else if (opts & (1 << REISERFS_USRQUOTA))
29224 + seq_puts(seq, ",usrquota");
29225 + if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
29226 + seq_show_option(seq, "grpjquota",
29227 + REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
29228 + else if (opts & (1 << REISERFS_GRPQUOTA))
29229 + seq_puts(seq, ",grpquota");
29230 + if (REISERFS_SB(s)->s_jquota_fmt) {
29231 + if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_OLD)
29232 + seq_puts(seq, ",jqfmt=vfsold");
29233 + else if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_V0)
29234 + seq_puts(seq, ",jqfmt=vfsv0");
29236 +#endif
29238 + /* Block allocator options */
29239 + if (opts & (1 << REISERFS_NO_BORDER))
29240 + seq_puts(seq, ",block-allocator=noborder");
29241 + if (opts & (1 << REISERFS_NO_UNHASHED_RELOCATION))
29242 + seq_puts(seq, ",block-allocator=no_unhashed_relocation");
29243 + if (opts & (1 << REISERFS_HASHED_RELOCATION))
29244 + seq_puts(seq, ",block-allocator=hashed_relocation");
29245 + if (opts & (1 << REISERFS_TEST4))
29246 + seq_puts(seq, ",block-allocator=test4");
29247 + show_alloc_options(seq, s);
29248 + return 0;
29251 +#ifdef CONFIG_QUOTA
29252 +static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
29253 + size_t, loff_t);
29254 +static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t,
29255 + loff_t);
29257 +static struct dquot __rcu **reiserfs_get_dquots(struct inode *inode)
29259 + return REISERFS_I(inode)->i_dquot;
29261 +#endif
29263 +static const struct super_operations reiserfs_sops = {
29264 + .alloc_inode = reiserfs_alloc_inode,
29265 + .free_inode = reiserfs_free_inode,
29266 + .write_inode = reiserfs_write_inode,
29267 + .dirty_inode = reiserfs_dirty_inode,
29268 + .evict_inode = reiserfs_evict_inode,
29269 + .put_super = reiserfs_put_super,
29270 + .sync_fs = reiserfs_sync_fs,
29271 + .freeze_fs = reiserfs_freeze,
29272 + .unfreeze_fs = reiserfs_unfreeze,
29273 + .statfs = reiserfs_statfs,
29274 + .remount_fs = reiserfs_remount,
29275 + .show_options = reiserfs_show_options,
29276 +#ifdef CONFIG_QUOTA
29277 + .quota_read = reiserfs_quota_read,
29278 + .quota_write = reiserfs_quota_write,
29279 + .get_dquots = reiserfs_get_dquots,
29280 +#endif
29283 +#ifdef CONFIG_QUOTA
29284 +#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
29286 +static int reiserfs_write_dquot(struct dquot *);
29287 +static int reiserfs_acquire_dquot(struct dquot *);
29288 +static int reiserfs_release_dquot(struct dquot *);
29289 +static int reiserfs_mark_dquot_dirty(struct dquot *);
29290 +static int reiserfs_write_info(struct super_block *, int);
29291 +static int reiserfs_quota_on(struct super_block *, int, int, const struct path *);
29293 +static const struct dquot_operations reiserfs_quota_operations = {
29294 + .write_dquot = reiserfs_write_dquot,
29295 + .acquire_dquot = reiserfs_acquire_dquot,
29296 + .release_dquot = reiserfs_release_dquot,
29297 + .mark_dirty = reiserfs_mark_dquot_dirty,
29298 + .write_info = reiserfs_write_info,
29299 + .alloc_dquot = dquot_alloc,
29300 + .destroy_dquot = dquot_destroy,
29301 + .get_next_id = dquot_get_next_id,
29304 +static const struct quotactl_ops reiserfs_qctl_operations = {
29305 + .quota_on = reiserfs_quota_on,
29306 + .quota_off = reiserfs_quota_off,
29307 + .quota_sync = dquot_quota_sync,
29308 + .get_state = dquot_get_state,
29309 + .set_info = dquot_set_dqinfo,
29310 + .get_dqblk = dquot_get_dqblk,
29311 + .set_dqblk = dquot_set_dqblk,
29313 +#endif
29315 +static const struct export_operations reiserfs_export_ops = {
29316 + .encode_fh = reiserfs_encode_fh,
29317 + .fh_to_dentry = reiserfs_fh_to_dentry,
29318 + .fh_to_parent = reiserfs_fh_to_parent,
29319 + .get_parent = reiserfs_get_parent,
29323 + * this struct is used in reiserfs_getopt() to describe the values of
29324 + * those mount options that take values rather than being toggles.
29325 + */
29326 +typedef struct {
29327 + char *value;
29328 + /*
29329 + * bitmask to set on the mount_options bitmask
29330 + * when this value is found; 0 if no bits are to be changed.
29331 + */
29332 + int setmask;
29333 + /*
29334 + * bitmask to clear on the mount_options bitmask
29335 + * when this value is found; 0 if no bits are to be changed.
29336 + * This is applied BEFORE setmask
29337 + */
29338 + int clrmask;
29339 +} arg_desc_t;
29341 +/* Set this bit in arg_required to allow empty arguments */
29342 +#define REISERFS_OPT_ALLOWEMPTY 31
29345 + * this struct is used in reiserfs_getopt() for describing the
29346 + * set of reiserfs mount options
29347 + */
29348 +typedef struct {
29349 + char *option_name;
29351 + /* 0 if an argument is not required, non-zero otherwise */
29352 + int arg_required;
29354 + /* list of values accepted by an option */
29355 + const arg_desc_t *values;
29357 + /*
29358 + * bitmask to set on the mount_options bitmask
29359 + * when this value is found; 0 if no bits are to be changed.
29360 + */
29361 + int setmask;
29363 + /*
29364 + * bitmask to clear on the mount_options bitmask
29365 + * when this value is found; 0 if no bits are to be changed.
29366 + * This is applied BEFORE setmask
29367 + */
29368 + int clrmask;
29369 +} opt_desc_t;
29371 +/* possible values for -o data= */
29372 +static const arg_desc_t logging_mode[] = {
29373 + {"ordered", 1 << REISERFS_DATA_ORDERED,
29374 + (1 << REISERFS_DATA_LOG | 1 << REISERFS_DATA_WRITEBACK)},
29375 + {"journal", 1 << REISERFS_DATA_LOG,
29376 + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)},
29377 + {"writeback", 1 << REISERFS_DATA_WRITEBACK,
29378 + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)},
29379 + {.value = NULL}
29382 +/* possible values for -o barrier= */
29383 +static const arg_desc_t barrier_mode[] = {
29384 + {"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH},
29385 + {"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE},
29386 + {.value = NULL}
29390 + * possible values for "-o block-allocator=" and bits which are to be set in
29391 + * s_mount_opt of reiserfs specific part of in-core super block
29392 + */
29393 +static const arg_desc_t balloc[] = {
29394 + {"noborder", 1 << REISERFS_NO_BORDER, 0},
29395 + {"border", 0, 1 << REISERFS_NO_BORDER},
29396 + {"no_unhashed_relocation", 1 << REISERFS_NO_UNHASHED_RELOCATION, 0},
29397 + {"hashed_relocation", 1 << REISERFS_HASHED_RELOCATION, 0},
29398 + {"test4", 1 << REISERFS_TEST4, 0},
29399 + {"notest4", 0, 1 << REISERFS_TEST4},
29400 + {NULL, 0, 0}
29403 +static const arg_desc_t tails[] = {
29404 + {"on", 1 << REISERFS_LARGETAIL, 1 << REISERFS_SMALLTAIL},
29405 + {"off", 0, (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
29406 + {"small", 1 << REISERFS_SMALLTAIL, 1 << REISERFS_LARGETAIL},
29407 + {NULL, 0, 0}
29410 +static const arg_desc_t error_actions[] = {
29411 + {"panic", 1 << REISERFS_ERROR_PANIC,
29412 + (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)},
29413 + {"ro-remount", 1 << REISERFS_ERROR_RO,
29414 + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)},
29415 +#ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG
29416 + {"continue", 1 << REISERFS_ERROR_CONTINUE,
29417 + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)},
29418 +#endif
29419 + {NULL, 0, 0},
29423 + * process exactly one option from the list. *cur - string containing the
29424 + * mount options
29425 + * opts - array of accepted options
29426 + * opt_arg - if the option is found, requires an argument and the argument is
29427 + * specified in the input - a pointer to the argument is stored here
29428 + * bit_flags - if the option requires a certain bit to be set - it is set here
29429 + * returns -1 if an unknown option is found, opt->arg_required otherwise
29430 + */
29431 +static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
29432 + char **opt_arg, unsigned long *bit_flags)
29434 + char *p;
29435 + /*
29436 + * foo=bar,
29437 + * ^ ^ ^
29438 + * | | +-- option_end
29439 + * | +-- arg_start
29440 + * +-- option_start
29441 + */
29442 + const opt_desc_t *opt;
29443 + const arg_desc_t *arg;
29445 + p = *cur;
29447 + /* assume argument cannot contain commas */
29448 + *cur = strchr(p, ',');
29449 + if (*cur) {
29450 + *(*cur) = '\0';
29451 + (*cur)++;
29454 + if (!strncmp(p, "alloc=", 6)) {
29455 + /*
29456 + * Ugly special case; probably we should redo the options
29457 + * parser so that it can understand several arguments for
29458 + * some options, and also fill several bitfields
29459 + * with option values.
29460 + */
29461 + if (reiserfs_parse_alloc_options(s, p + 6)) {
29462 + return -1;
29463 + } else {
29464 + return 0;
29468 + /* for every option in the list */
29469 + for (opt = opts; opt->option_name; opt++) {
29470 + if (!strncmp(p, opt->option_name, strlen(opt->option_name))) {
29471 + if (bit_flags) {
29472 + if (opt->clrmask ==
29473 + (1 << REISERFS_UNSUPPORTED_OPT))
29474 + reiserfs_warning(s, "super-6500",
29475 + "%s not supported.\n",
29476 + p);
29477 + else
29478 + *bit_flags &= ~opt->clrmask;
29479 + if (opt->setmask ==
29480 + (1 << REISERFS_UNSUPPORTED_OPT))
29481 + reiserfs_warning(s, "super-6501",
29482 + "%s not supported.\n",
29483 + p);
29484 + else
29485 + *bit_flags |= opt->setmask;
29487 + break;
29490 + if (!opt->option_name) {
29491 + reiserfs_warning(s, "super-6502",
29492 + "unknown mount option \"%s\"", p);
29493 + return -1;
29496 + p += strlen(opt->option_name);
29497 + switch (*p) {
29498 + case '=':
29499 + if (!opt->arg_required) {
29500 + reiserfs_warning(s, "super-6503",
29501 + "the option \"%s\" does not "
29502 + "require an argument\n",
29503 + opt->option_name);
29504 + return -1;
29506 + break;
29508 + case 0:
29509 + if (opt->arg_required) {
29510 + reiserfs_warning(s, "super-6504",
29511 + "the option \"%s\" requires an "
29512 + "argument\n", opt->option_name);
29513 + return -1;
29515 + break;
29516 + default:
29517 + reiserfs_warning(s, "super-6505",
29518 + "head of option \"%s\" is only correct\n",
29519 + opt->option_name);
29520 + return -1;
29523 + /*
29524 + * move to the argument, or to next option if argument is not
29525 + * required
29526 + */
29527 + p++;
29529 + if (opt->arg_required
29530 + && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY))
29531 + && !strlen(p)) {
29532 + /* this catches "option=," if not allowed */
29533 + reiserfs_warning(s, "super-6506",
29534 + "empty argument for \"%s\"\n",
29535 + opt->option_name);
29536 + return -1;
29539 + if (!opt->values) {
29540 + /* opt->values == NULL; *opt_arg contains a pointer to the argument */
29541 + *opt_arg = p;
29542 + return opt->arg_required & ~(1 << REISERFS_OPT_ALLOWEMPTY);
29545 + /* values possible for this option are listed in opt->values */
29546 + for (arg = opt->values; arg->value; arg++) {
29547 + if (!strcmp(p, arg->value)) {
29548 + if (bit_flags) {
29549 + *bit_flags &= ~arg->clrmask;
29550 + *bit_flags |= arg->setmask;
29552 + return opt->arg_required;
29556 + reiserfs_warning(s, "super-6506",
29557 + "bad value \"%s\" for option \"%s\"\n", p,
29558 + opt->option_name);
29559 + return -1;
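The arg_desc_t tables consumed by reiserfs_getopt() above encode each option value as a mask pair, with clrmask applied before setmask so mutually exclusive modes displace one another. A minimal stand-alone parser over one such table; the bit numbers are invented for the example:

#include <stdio.h>
#include <string.h>

struct arg_desc {                       /* same shape as arg_desc_t above */
    const char *value;
    unsigned setmask;
    unsigned clrmask;
};

enum { BARRIER_FLUSH, BARRIER_NONE };   /* illustrative bit numbers */

static const struct arg_desc barrier_mode[] = {
    { "none",  1 << BARRIER_NONE,  1 << BARRIER_FLUSH },
    { "flush", 1 << BARRIER_FLUSH, 1 << BARRIER_NONE },
    { NULL, 0, 0 }
};

static int apply_value(const struct arg_desc *tbl, const char *v,
                       unsigned long *flags)
{
    for (; tbl->value; tbl++) {
        if (strcmp(v, tbl->value))
            continue;
        *flags &= ~tbl->clrmask;        /* clear BEFORE set */
        *flags |= tbl->setmask;
        return 0;
    }
    return -1;                          /* bad value for the option */
}

int main(void)
{
    unsigned long flags = 1 << BARRIER_FLUSH;

    apply_value(barrier_mode, "none", &flags);
    printf("flags = %#lx\n", flags);    /* flush bit gone, none bit set */
    return 0;
}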
29562 +/* returns 0 if something is wrong in the option string, 1 otherwise */
29563 +static int reiserfs_parse_options(struct super_block *s,
29565 + /* string given via mount's -o */
29566 + char *options,
29568 + /*
29569 + * after the parsing phase, contains the
29570 + * collection of bitflags defining what
29571 + * mount options were selected.
29572 + */
29573 + unsigned long *mount_options,
29575 + /* strtol-ed from NNN of resize=NNN */
29576 + unsigned long *blocks,
29577 + char **jdev_name,
29578 + unsigned int *commit_max_age,
29579 + char **qf_names,
29580 + unsigned int *qfmt)
29582 + int c;
29583 + char *arg = NULL;
29584 + char *pos;
29585 + opt_desc_t opts[] = {
29586 + /*
29587 + * Compatibility stuff, so that -o notail for old
29588 + * setups still works
29589 + */
29590 + {"tails",.arg_required = 't',.values = tails},
29591 + {"notail",.clrmask =
29592 + (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
29593 + {"conv",.setmask = 1 << REISERFS_CONVERT},
29594 + {"attrs",.setmask = 1 << REISERFS_ATTRS},
29595 + {"noattrs",.clrmask = 1 << REISERFS_ATTRS},
29596 + {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT},
29597 +#ifdef CONFIG_REISERFS_FS_XATTR
29598 + {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER},
29599 + {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER},
29600 +#else
29601 + {"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
29602 + {"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
29603 +#endif
29604 +#ifdef CONFIG_REISERFS_FS_POSIX_ACL
29605 + {"acl",.setmask = 1 << REISERFS_POSIXACL},
29606 + {"noacl",.clrmask = 1 << REISERFS_POSIXACL},
29607 +#else
29608 + {"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
29609 + {"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
29610 +#endif
29611 + {.option_name = "nolog"},
29612 + {"replayonly",.setmask = 1 << REPLAYONLY},
29613 + {"block-allocator",.arg_required = 'a',.values = balloc},
29614 + {"data",.arg_required = 'd',.values = logging_mode},
29615 + {"barrier",.arg_required = 'b',.values = barrier_mode},
29616 + {"resize",.arg_required = 'r',.values = NULL},
29617 + {"jdev",.arg_required = 'j',.values = NULL},
29618 + {"nolargeio",.arg_required = 'w',.values = NULL},
29619 + {"commit",.arg_required = 'c',.values = NULL},
29620 + {"usrquota",.setmask = 1 << REISERFS_USRQUOTA},
29621 + {"grpquota",.setmask = 1 << REISERFS_GRPQUOTA},
29622 + {"noquota",.clrmask = 1 << REISERFS_USRQUOTA | 1 << REISERFS_GRPQUOTA},
29623 + {"errors",.arg_required = 'e',.values = error_actions},
29624 + {"usrjquota",.arg_required =
29625 + 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
29626 + {"grpjquota",.arg_required =
29627 + 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
29628 + {"jqfmt",.arg_required = 'f',.values = NULL},
29629 + {.option_name = NULL}
29630 + };
29632 + *blocks = 0;
29633 + if (!options || !*options)
29634 + /*
29635 + * use default configuration: create tails, journaling on, no
29636 + * conversion to newest format
29637 + */
29638 + return 1;
29640 + for (pos = options; pos;) {
29641 + c = reiserfs_getopt(s, &pos, opts, &arg, mount_options);
29642 + if (c == -1)
29643 + /* wrong option is given */
29644 + return 0;
29646 + if (c == 'r') {
29647 + char *p;
29649 + p = NULL;
29650 + /* "resize=NNN" or "resize=auto" */
29652 + if (!strcmp(arg, "auto")) {
29653 + /* From JFS code, to auto-get the size. */
29654 + *blocks = sb_bdev_nr_blocks(s);
29655 + } else {
29656 + *blocks = simple_strtoul(arg, &p, 0);
29657 + if (*p != '\0') {
29658 + /* NNN does not look like a number */
29659 + reiserfs_warning(s, "super-6507",
29660 + "bad value %s for "
29661 + "-oresize\n", arg);
29662 + return 0;
29667 + if (c == 'c') {
29668 + char *p = NULL;
29669 + unsigned long val = simple_strtoul(arg, &p, 0);
29670 + /* commit=NNN (time in seconds) */
29671 + if (*p != '\0' || val >= (unsigned int)-1) {
29672 + reiserfs_warning(s, "super-6508",
29673 + "bad value %s for -ocommit\n",
29674 + arg);
29675 + return 0;
29677 + *commit_max_age = (unsigned int)val;
29680 + if (c == 'w') {
29681 + reiserfs_warning(s, "super-6509", "nolargeio option "
29682 + "is no longer supported");
29683 + return 0;
29686 + if (c == 'j') {
29687 + if (arg && *arg && jdev_name) {
29688 + /* Hm, already assigned? */
29689 + if (*jdev_name) {
29690 + reiserfs_warning(s, "super-6510",
29691 + "journal device was "
29692 + "already specified to "
29693 + "be %s", *jdev_name);
29694 + return 0;
29696 + *jdev_name = arg;
29699 +#ifdef CONFIG_QUOTA
29700 + if (c == 'u' || c == 'g') {
29701 + int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
29703 + if (sb_any_quota_loaded(s) &&
29704 + (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
29705 + reiserfs_warning(s, "super-6511",
29706 + "cannot change journaled "
29707 + "quota options when quota "
29708 + "turned on.");
29709 + return 0;
29711 + if (qf_names[qtype] !=
29712 + REISERFS_SB(s)->s_qf_names[qtype])
29713 + kfree(qf_names[qtype]);
29714 + qf_names[qtype] = NULL;
29715 + if (*arg) { /* Some filename specified? */
29716 + if (REISERFS_SB(s)->s_qf_names[qtype]
29717 + && strcmp(REISERFS_SB(s)->s_qf_names[qtype],
29718 + arg)) {
29719 + reiserfs_warning(s, "super-6512",
29720 + "%s quota file "
29721 + "already specified.",
29722 + QTYPE2NAME(qtype));
29723 + return 0;
29725 + if (strchr(arg, '/')) {
29726 + reiserfs_warning(s, "super-6513",
29727 + "quotafile must be "
29728 + "on filesystem root.");
29729 + return 0;
29731 + qf_names[qtype] = kstrdup(arg, GFP_KERNEL);
29732 + if (!qf_names[qtype]) {
29733 + reiserfs_warning(s, "reiserfs-2502",
29734 + "not enough memory "
29735 + "for storing "
29736 + "quotafile name.");
29737 + return 0;
29739 + if (qtype == USRQUOTA)
29740 + *mount_options |= 1 << REISERFS_USRQUOTA;
29741 + else
29742 + *mount_options |= 1 << REISERFS_GRPQUOTA;
29743 + } else {
29744 + if (qtype == USRQUOTA)
29745 + *mount_options &= ~(1 << REISERFS_USRQUOTA);
29746 + else
29747 + *mount_options &= ~(1 << REISERFS_GRPQUOTA);
29750 + if (c == 'f') {
29751 + if (!strcmp(arg, "vfsold"))
29752 + *qfmt = QFMT_VFS_OLD;
29753 + else if (!strcmp(arg, "vfsv0"))
29754 + *qfmt = QFMT_VFS_V0;
29755 + else {
29756 + reiserfs_warning(s, "super-6514",
29757 + "unknown quota format "
29758 + "specified.");
29759 + return 0;
29761 + if (sb_any_quota_loaded(s) &&
29762 + *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
29763 + reiserfs_warning(s, "super-6515",
29764 + "cannot change journaled "
29765 + "quota options when quota "
29766 + "turned on.");
29767 + return 0;
29770 +#else
29771 + if (c == 'u' || c == 'g' || c == 'f') {
29772 + reiserfs_warning(s, "reiserfs-2503", "journaled "
29773 + "quota options not supported.");
29774 + return 0;
29776 +#endif
29779 +#ifdef CONFIG_QUOTA
29780 + if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt
29781 + && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) {
29782 + reiserfs_warning(s, "super-6515",
29783 + "journaled quota format not specified.");
29784 + return 0;
29786 + if ((!(*mount_options & (1 << REISERFS_USRQUOTA)) &&
29787 + sb_has_quota_loaded(s, USRQUOTA)) ||
29788 + (!(*mount_options & (1 << REISERFS_GRPQUOTA)) &&
29789 + sb_has_quota_loaded(s, GRPQUOTA))) {
29790 + reiserfs_warning(s, "super-6516", "quota options must "
29791 + "be present when quota is turned on.");
29792 + return 0;
29794 +#endif
29796 + return 1;
29799 +static void switch_data_mode(struct super_block *s, unsigned long mode)
29801 + REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
29802 + (1 << REISERFS_DATA_ORDERED) |
29803 + (1 << REISERFS_DATA_WRITEBACK));
29804 + REISERFS_SB(s)->s_mount_opt |= (1 << mode);
29807 +static void handle_data_mode(struct super_block *s, unsigned long mount_options)
29809 + if (mount_options & (1 << REISERFS_DATA_LOG)) {
29810 + if (!reiserfs_data_log(s)) {
29811 + switch_data_mode(s, REISERFS_DATA_LOG);
29812 + reiserfs_info(s, "switching to journaled data mode\n");
29814 + } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
29815 + if (!reiserfs_data_ordered(s)) {
29816 + switch_data_mode(s, REISERFS_DATA_ORDERED);
29817 + reiserfs_info(s, "switching to ordered data mode\n");
29819 + } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
29820 + if (!reiserfs_data_writeback(s)) {
29821 + switch_data_mode(s, REISERFS_DATA_WRITEBACK);
29822 + reiserfs_info(s, "switching to writeback data mode\n");
29827 +static void handle_barrier_mode(struct super_block *s, unsigned long bits)
29829 + int flush = (1 << REISERFS_BARRIER_FLUSH);
29830 + int none = (1 << REISERFS_BARRIER_NONE);
29831 + int all_barrier = flush | none;
29833 + if (bits & all_barrier) {
29834 + REISERFS_SB(s)->s_mount_opt &= ~all_barrier;
29835 + if (bits & flush) {
29836 + REISERFS_SB(s)->s_mount_opt |= flush;
29837 + printk("reiserfs: enabling write barrier flush mode\n");
29838 + } else if (bits & none) {
29839 + REISERFS_SB(s)->s_mount_opt |= none;
29840 + printk("reiserfs: write barriers turned off\n");
29845 +static void handle_attrs(struct super_block *s)
29847 + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
29849 + if (reiserfs_attrs(s)) {
29850 + if (old_format_only(s)) {
29851 + reiserfs_warning(s, "super-6517", "cannot support "
29852 + "attributes on 3.5.x disk format");
29853 + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
29854 + return;
29856 + if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) {
29857 + reiserfs_warning(s, "super-6518", "cannot support "
29858 + "attributes until flag is set in "
29859 + "super-block");
29860 + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
29865 +#ifdef CONFIG_QUOTA
29866 +static void handle_quota_files(struct super_block *s, char **qf_names,
29867 + unsigned int *qfmt)
29869 + int i;
29871 + for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
29872 + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
29873 + kfree(REISERFS_SB(s)->s_qf_names[i]);
29874 + REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
29876 + if (*qfmt)
29877 + REISERFS_SB(s)->s_jquota_fmt = *qfmt;
29879 +#endif
29881 +static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
29883 + struct reiserfs_super_block *rs;
29884 + struct reiserfs_transaction_handle th;
29885 + unsigned long blocks;
29886 + unsigned long mount_options = REISERFS_SB(s)->s_mount_opt;
29887 + unsigned long safe_mask = 0;
29888 + unsigned int commit_max_age = (unsigned int)-1;
29889 + struct reiserfs_journal *journal = SB_JOURNAL(s);
29890 + int err;
29891 + char *qf_names[REISERFS_MAXQUOTAS];
29892 + unsigned int qfmt = 0;
29893 +#ifdef CONFIG_QUOTA
29894 + int i;
29895 +#endif
29897 + sync_filesystem(s);
29898 + reiserfs_write_lock(s);
29900 +#ifdef CONFIG_QUOTA
29901 + memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
29902 +#endif
29904 + rs = SB_DISK_SUPER_BLOCK(s);
29906 + if (!reiserfs_parse_options
29907 + (s, arg, &mount_options, &blocks, NULL, &commit_max_age,
29908 + qf_names, &qfmt)) {
29909 +#ifdef CONFIG_QUOTA
29910 + for (i = 0; i < REISERFS_MAXQUOTAS; i++)
29911 + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
29912 + kfree(qf_names[i]);
29913 +#endif
29914 + err = -EINVAL;
29915 + goto out_err_unlock;
29917 +#ifdef CONFIG_QUOTA
29918 + handle_quota_files(s, qf_names, &qfmt);
29919 +#endif
29921 + handle_attrs(s);
29923 + /* Add options that are safe here */
29924 + safe_mask |= 1 << REISERFS_SMALLTAIL;
29925 + safe_mask |= 1 << REISERFS_LARGETAIL;
29926 + safe_mask |= 1 << REISERFS_NO_BORDER;
29927 + safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION;
29928 + safe_mask |= 1 << REISERFS_HASHED_RELOCATION;
29929 + safe_mask |= 1 << REISERFS_TEST4;
29930 + safe_mask |= 1 << REISERFS_ATTRS;
29931 + safe_mask |= 1 << REISERFS_XATTRS_USER;
29932 + safe_mask |= 1 << REISERFS_POSIXACL;
29933 + safe_mask |= 1 << REISERFS_BARRIER_FLUSH;
29934 + safe_mask |= 1 << REISERFS_BARRIER_NONE;
29935 + safe_mask |= 1 << REISERFS_ERROR_RO;
29936 + safe_mask |= 1 << REISERFS_ERROR_CONTINUE;
29937 + safe_mask |= 1 << REISERFS_ERROR_PANIC;
29938 + safe_mask |= 1 << REISERFS_USRQUOTA;
29939 + safe_mask |= 1 << REISERFS_GRPQUOTA;
29941 + /*
29942 + * Update the bitmask, taking care to keep
29943 + * the bits we're not allowed to change here
29944 + */
29945 + REISERFS_SB(s)->s_mount_opt =
29946 + (REISERFS_SB(s)->
29947 + s_mount_opt & ~safe_mask) | (mount_options & safe_mask);
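The masked assignment above is the classic read-modify-write idiom for applying only whitelisted option bits: bits outside safe_mask survive untouched, bits inside it come from the newly parsed options. A minimal standalone sketch of the idiom (the hex values are illustrative, not real reiserfs option bits):

#include <stdio.h>

int main(void)
{
	unsigned long opts      = 0xF0F0; /* current s_mount_opt */
	unsigned long requested = 0x0FFF; /* options parsed at remount */
	unsigned long safe_mask = 0x00FF; /* bits we may change */

	/* keep bits outside safe_mask, take requested bits inside it */
	opts = (opts & ~safe_mask) | (requested & safe_mask);
	printf("0x%lX\n", opts); /* prints 0xF0FF */
	return 0;
}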
29949 + if (commit_max_age != 0 && commit_max_age != (unsigned int)-1) {
29950 + journal->j_max_commit_age = commit_max_age;
29951 + journal->j_max_trans_age = commit_max_age;
29952 + } else if (commit_max_age == 0) {
29953 + /* 0 means restore defaults. */
29954 + journal->j_max_commit_age = journal->j_default_max_commit_age;
29955 + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
29958 + if (blocks) {
29959 + err = reiserfs_resize(s, blocks);
29960 + if (err != 0)
29961 + goto out_err_unlock;
29964 + if (*mount_flags & SB_RDONLY) {
29965 + reiserfs_write_unlock(s);
29966 + reiserfs_xattr_init(s, *mount_flags);
29967 + /* remount read-only */
29968 + if (sb_rdonly(s))
29969 + /* it is read-only already */
29970 + goto out_ok_unlocked;
29972 + err = dquot_suspend(s, -1);
29973 + if (err < 0)
29974 + goto out_err;
29976 + /* try to remount file system with read-only permissions */
29977 + if (sb_umount_state(rs) == REISERFS_VALID_FS
29978 + || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
29979 + goto out_ok_unlocked;
29982 + reiserfs_write_lock(s);
29984 + err = journal_begin(&th, s, 10);
29985 + if (err)
29986 + goto out_err_unlock;
29988 + /* Mounting a rw partition read-only. */
29989 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
29990 + set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state);
29991 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
29992 + } else {
29993 + /* remount read-write */
29994 + if (!sb_rdonly(s)) {
29995 + reiserfs_write_unlock(s);
29996 + reiserfs_xattr_init(s, *mount_flags);
29997 + goto out_ok_unlocked; /* We are read-write already */
30000 + if (reiserfs_is_journal_aborted(journal)) {
30001 + err = journal->j_errno;
30002 + goto out_err_unlock;
30005 + handle_data_mode(s, mount_options);
30006 + handle_barrier_mode(s, mount_options);
30007 + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
30009 + /* now it is safe to call journal_begin */
30010 + s->s_flags &= ~SB_RDONLY;
30011 + err = journal_begin(&th, s, 10);
30012 + if (err)
30013 + goto out_err_unlock;
30016 + /* Mounting a read-only partition read-write. */
30016 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
30017 + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
30018 + s->s_flags &= ~SB_RDONLY;
30019 + set_sb_umount_state(rs, REISERFS_ERROR_FS);
30020 + if (!old_format_only(s))
30021 + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
30022 + /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
30023 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
30024 + REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS;
30026 + /* this will force a full flush of all journal lists */
30027 + SB_JOURNAL(s)->j_must_wait = 1;
30028 + err = journal_end(&th);
30029 + if (err)
30030 + goto out_err_unlock;
30032 + reiserfs_write_unlock(s);
30033 + if (!(*mount_flags & SB_RDONLY)) {
30034 + dquot_resume(s, -1);
30035 + reiserfs_write_lock(s);
30036 + finish_unfinished(s);
30037 + reiserfs_write_unlock(s);
30038 + reiserfs_xattr_init(s, *mount_flags);
30041 +out_ok_unlocked:
30042 + return 0;
30044 +out_err_unlock:
30045 + reiserfs_write_unlock(s);
30046 +out_err:
30047 + return err;
30050 +static int read_super_block(struct super_block *s, int offset)
30052 + struct buffer_head *bh;
30053 + struct reiserfs_super_block *rs;
30054 + int fs_blocksize;
30056 + bh = sb_bread(s, offset / s->s_blocksize);
30057 + if (!bh) {
30058 + reiserfs_warning(s, "sh-2006",
30059 + "bread failed (dev %s, block %lu, size %lu)",
30060 + s->s_id, offset / s->s_blocksize,
30061 + s->s_blocksize);
30062 + return 1;
30065 + rs = (struct reiserfs_super_block *)bh->b_data;
30066 + if (!is_any_reiserfs_magic_string(rs)) {
30067 + brelse(bh);
30068 + return 1;
30070 + /*
30071 + * ok, reiserfs signature (old or new) found at the given offset
30072 + */
30073 + fs_blocksize = sb_blocksize(rs);
30074 + brelse(bh);
30075 + sb_set_blocksize(s, fs_blocksize);
30077 + bh = sb_bread(s, offset / s->s_blocksize);
30078 + if (!bh) {
30079 + reiserfs_warning(s, "sh-2007",
30080 + "bread failed (dev %s, block %lu, size %lu)",
30081 + s->s_id, offset / s->s_blocksize,
30082 + s->s_blocksize);
30083 + return 1;
30086 + rs = (struct reiserfs_super_block *)bh->b_data;
30087 + if (sb_blocksize(rs) != s->s_blocksize) {
30088 + reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
30089 + "filesystem on (dev %s, block %llu, size %lu)",
30090 + s->s_id,
30091 + (unsigned long long)bh->b_blocknr,
30092 + s->s_blocksize);
30093 + brelse(bh);
30094 + return 1;
30097 + if (rs->s_v1.s_root_block == cpu_to_le32(-1)) {
30098 + brelse(bh);
30099 + reiserfs_warning(s, "super-6519", "Unfinished reiserfsck "
30100 + "--rebuild-tree run detected. Please run\n"
30101 + "reiserfsck --rebuild-tree and wait for a "
30102 + "completion. If that fails\n"
30103 + "get newer reiserfsprogs package");
30104 + return 1;
30107 + reiserfs_warning(NULL, "", "reiserfs filesystem is deprecated and "
30108 + "scheduled to be removed from the kernel in 2025");
30109 + SB_BUFFER_WITH_SB(s) = bh;
30110 + SB_DISK_SUPER_BLOCK(s) = rs;
30112 + /*
30113 + * the magic is that of a non-standard journal filesystem, so look at
30114 + * s_version to find which format is in use
30115 + */
30116 + if (is_reiserfs_jr(rs)) {
30117 + if (sb_version(rs) == REISERFS_VERSION_2)
30118 + reiserfs_info(s, "found reiserfs format \"3.6\""
30119 + " with non-standard journal\n");
30120 + else if (sb_version(rs) == REISERFS_VERSION_1)
30121 + reiserfs_info(s, "found reiserfs format \"3.5\""
30122 + " with non-standard journal\n");
30123 + else {
30124 + reiserfs_warning(s, "sh-2012", "found unknown "
30125 + "format \"%u\" of reiserfs with "
30126 + "non-standard magic", sb_version(rs));
30127 + return 1;
30129 + } else
30130 + /*
30131 + * s_version of standard format may contain incorrect
30132 + * information, so we just look at the magic string
30133 + */
30134 + reiserfs_info(s,
30135 + "found reiserfs format \"%s\" with standard journal\n",
30136 + is_reiserfs_3_5(rs) ? "3.5" : "3.6");
30138 + s->s_op = &reiserfs_sops;
30139 + s->s_export_op = &reiserfs_export_ops;
30140 +#ifdef CONFIG_QUOTA
30141 + s->s_qcop = &reiserfs_qctl_operations;
30142 + s->dq_op = &reiserfs_quota_operations;
30143 + s->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
30144 +#endif
30146 + /*
30147 + * the new format is limited by the 32-bit-wide i_blocks field; we
30148 + * want to be one full block below that.
30149 + */
30150 + s->s_maxbytes = (512LL << 32) - s->s_blocksize;
30151 + return 0;
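The s_maxbytes arithmetic just above deserves a worked example: i_blocks is a 32-bit count of 512-byte sectors, so the hard cap is 512 << 32 = 2^41 bytes (2 TiB), and the code backs off by one block. A quick standalone check, assuming an example 4 KiB block size:

#include <stdio.h>

int main(void)
{
	long long blocksize = 4096; /* assumed example block size */
	/* i_blocks counts 512-byte sectors in 32 bits: cap at 2^41 bytes */
	long long maxbytes = (512LL << 32) - blocksize;

	printf("%lld\n", maxbytes); /* 2199023251456, i.e. 2 TiB - 4 KiB */
	return 0;
}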
30154 +/* after journal replay, reread all bitmap and super blocks */
30155 +static int reread_meta_blocks(struct super_block *s)
30157 + if (bh_read(SB_BUFFER_WITH_SB(s), 0) < 0) {
30158 + reiserfs_warning(s, "reiserfs-2504", "error reading the super");
30159 + return 1;
30162 + return 0;
30165 +/* hash detection stuff */
30168 + * if the root directory is empty, we set the default (Yura's) hash
30169 + * and warn about it
30170 + * FIXME: we look at only one name in a directory. If tea and yura
30171 + * hash to the same value, we ask the user to send a report to the
30172 + * mailing list
30173 + */
30174 +static __u32 find_hash_out(struct super_block *s)
30176 + int retval;
30177 + struct inode *inode;
30178 + struct cpu_key key;
30179 + INITIALIZE_PATH(path);
30180 + struct reiserfs_dir_entry de;
30181 + struct reiserfs_de_head *deh;
30182 + __u32 hash = DEFAULT_HASH;
30183 + __u32 deh_hashval, teahash, r5hash, yurahash;
30185 + inode = d_inode(s->s_root);
30187 + make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3);
30188 + retval = search_by_entry_key(s, &key, &path, &de);
30189 + if (retval == IO_ERROR) {
30190 + pathrelse(&path);
30191 + return UNSET_HASH;
30193 + if (retval == NAME_NOT_FOUND)
30194 + de.de_entry_num--;
30196 + set_de_name_and_namelen(&de);
30197 + deh = de.de_deh + de.de_entry_num;
30199 + if (deh_offset(deh) == DOT_DOT_OFFSET) {
30200 + /* allow override in this case */
30201 + if (reiserfs_rupasov_hash(s))
30202 + hash = YURA_HASH;
30203 + reiserfs_info(s, "FS seems to be empty, autodetect is using the default hash\n");
30204 + goto out;
30207 + deh_hashval = GET_HASH_VALUE(deh_offset(deh));
30208 + r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen));
30209 + teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen));
30210 + yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen));
30212 + if ((teahash == r5hash && deh_hashval == r5hash) ||
30213 + (teahash == yurahash && deh_hashval == yurahash) ||
30214 + (r5hash == yurahash && deh_hashval == yurahash)) {
30215 + reiserfs_warning(s, "reiserfs-2506",
30216 + "Unable to automatically detect hash "
30217 + "function. Please mount with -o "
30218 + "hash={tea,rupasov,r5}");
30219 + hash = UNSET_HASH;
30220 + goto out;
30223 + if (deh_hashval == yurahash)
30224 + hash = YURA_HASH;
30225 + else if (deh_hashval == teahash)
30226 + hash = TEA_HASH;
30227 + else if (deh_hashval == r5hash)
30228 + hash = R5_HASH;
30229 + else {
30230 + reiserfs_warning(s, "reiserfs-2506",
30231 + "Unrecognised hash function");
30232 + hash = UNSET_HASH;
30234 +out:
30235 + pathrelse(&path);
30236 + return hash;
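The three candidate hashes compared above are ordinary string hashes over the entry name; r5 is the smallest. A standalone sketch of an r5-style hash plus the generation-bit masking used in the comparison (the len-driven loop and the 0x7fffff80 mask are assumptions of this sketch; the low 7 bits of a directory offset hold the generation number, so only the masked value is compared):

#include <stdio.h>

/* r5-style directory-name hash, as a userspace sketch */
static unsigned int r5_hash(const signed char *msg, int len)
{
	unsigned int a = 0;

	while (len--) {
		a += *msg << 4;
		a += *msg >> 4;
		a *= 11;
		msg++;
	}
	return a;
}

int main(void)
{
	unsigned int h = r5_hash((const signed char *)"lost+found", 10);

	/* compare hashes only after masking off the generation bits,
	 * as GET_HASH_VALUE() does above */
	printf("hash 0x%08x masked 0x%08x\n", h, h & 0x7fffff80);
	return 0;
}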
30239 +/* finds out which hash the names are sorted with */
30240 +static int what_hash(struct super_block *s)
30242 + __u32 code;
30244 + code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s));
30246 + /*
30247 + * reiserfs_hash_detect() == true if any of the hash mount options
30248 + * were used. We must check them to make sure the user isn't
30249 + * using a bad hash value
30250 + */
30251 + if (code == UNSET_HASH || reiserfs_hash_detect(s))
30252 + code = find_hash_out(s);
30254 + if (code != UNSET_HASH && reiserfs_hash_detect(s)) {
30255 + /*
30256 + * detection has found the hash, and we must check against the
30257 + * mount options
30258 + */
30259 + if (reiserfs_rupasov_hash(s) && code != YURA_HASH) {
30260 + reiserfs_warning(s, "reiserfs-2507",
30261 + "Error, %s hash detected, "
30262 + "unable to force rupasov hash",
30263 + reiserfs_hashname(code));
30264 + code = UNSET_HASH;
30265 + } else if (reiserfs_tea_hash(s) && code != TEA_HASH) {
30266 + reiserfs_warning(s, "reiserfs-2508",
30267 + "Error, %s hash detected, "
30268 + "unable to force tea hash",
30269 + reiserfs_hashname(code));
30270 + code = UNSET_HASH;
30271 + } else if (reiserfs_r5_hash(s) && code != R5_HASH) {
30272 + reiserfs_warning(s, "reiserfs-2509",
30273 + "Error, %s hash detected, "
30274 + "unable to force r5 hash",
30275 + reiserfs_hashname(code));
30276 + code = UNSET_HASH;
30278 + } else {
30279 + /*
30280 + * find_hash_out was not called or
30281 + * could not determine the hash
30282 + */
30283 + if (reiserfs_rupasov_hash(s)) {
30284 + code = YURA_HASH;
30285 + } else if (reiserfs_tea_hash(s)) {
30286 + code = TEA_HASH;
30287 + } else if (reiserfs_r5_hash(s)) {
30288 + code = R5_HASH;
30292 + /*
30293 + * if we are mounted RW, and we have a new valid hash code, update
30294 + * the super
30295 + */
30296 + if (code != UNSET_HASH &&
30297 + !sb_rdonly(s) &&
30298 + code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) {
30299 + set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code);
30301 + return code;
30304 +/* return pointer to appropriate function */
30305 +static hashf_t hash_function(struct super_block *s)
30307 + switch (what_hash(s)) {
30308 + case TEA_HASH:
30309 + reiserfs_info(s, "Using tea hash to sort names\n");
30310 + return keyed_hash;
30311 + case YURA_HASH:
30312 + reiserfs_info(s, "Using rupasov hash to sort names\n");
30313 + return yura_hash;
30314 + case R5_HASH:
30315 + reiserfs_info(s, "Using r5 hash to sort names\n");
30316 + return r5_hash;
30318 + return NULL;
30321 +/* this is used to set up the correct value for old partitions */
30322 +static int function2code(hashf_t func)
30324 + if (func == keyed_hash)
30325 + return TEA_HASH;
30326 + if (func == yura_hash)
30327 + return YURA_HASH;
30328 + if (func == r5_hash)
30329 + return R5_HASH;
30331 + BUG(); /* should never happen */
30333 + return 0;
30336 +#define SWARN(silent, s, id, ...) \
30337 + if (!(silent)) \
30338 + reiserfs_warning(s, id, __VA_ARGS__)
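Note that SWARN expands to a bare if, so an else following a SWARN call would bind to the macro's if rather than the caller's. A hedged sketch of the conventional do { } while (0) hardening; SWARN_SAFE and warn() are illustrative names, not part of this patch:

#include <stdio.h>

/* illustrative stand-in for reiserfs_warning() */
#define warn(id, msg) fprintf(stderr, "%s: %s\n", id, msg)

/* wrapping the body in do { } while (0) keeps a trailing else
 * bound to the caller's if, not to the macro's own if */
#define SWARN_SAFE(silent, id, msg) \
	do { if (!(silent)) warn(id, msg); } while (0)

int main(void)
{
	int silent = 0;

	if (silent)
		SWARN_SAFE(silent, "demo-1", "never reached");
	else
		SWARN_SAFE(silent, "demo-2", "else binds to the outer if");
	return 0;
}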
30340 +static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
30342 + struct inode *root_inode;
30343 + struct reiserfs_transaction_handle th;
30344 + int old_format = 0;
30345 + unsigned long blocks;
30346 + unsigned int commit_max_age = 0;
30347 + int jinit_done = 0;
30348 + struct reiserfs_iget_args args;
30349 + struct reiserfs_super_block *rs;
30350 + char *jdev_name;
30351 + struct reiserfs_sb_info *sbi;
30352 + int errval = -EINVAL;
30353 + char *qf_names[REISERFS_MAXQUOTAS] = {};
30354 + unsigned int qfmt = 0;
30356 + sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
30357 + if (!sbi)
30358 + return -ENOMEM;
30359 + s->s_fs_info = sbi;
30360 + /* Set default values for options: non-aggressive tails, RO on errors */
30361 + sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
30362 + sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO);
30363 + sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
30364 + /* no preallocation minimum, be smart in reiserfs_file_write instead */
30365 + sbi->s_alloc_options.preallocmin = 0;
30366 + /* Preallocate by 16 blocks (17-1) at once */
30367 + sbi->s_alloc_options.preallocsize = 17;
30368 + /* setup default block allocator options */
30369 + reiserfs_init_alloc_options(s);
30371 + spin_lock_init(&sbi->old_work_lock);
30372 + INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits);
30373 + mutex_init(&sbi->lock);
30374 + sbi->lock_depth = -1;
30376 + sbi->commit_wq = alloc_workqueue("reiserfs/%s", WQ_MEM_RECLAIM, 0,
30377 + s->s_id);
30378 + if (!sbi->commit_wq) {
30379 + SWARN(silent, s, "", "Cannot allocate commit workqueue");
30380 + errval = -ENOMEM;
30381 + goto error_unlocked;
30384 + jdev_name = NULL;
30385 + if (reiserfs_parse_options
30386 + (s, (char *)data, &sbi->s_mount_opt, &blocks, &jdev_name,
30387 + &commit_max_age, qf_names, &qfmt) == 0) {
30388 + goto error_unlocked;
30390 + if (jdev_name && jdev_name[0]) {
30391 + sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
30392 + if (!sbi->s_jdev) {
30393 + SWARN(silent, s, "", "Cannot allocate memory for "
30394 + "journal device name");
30395 + goto error_unlocked;
30398 +#ifdef CONFIG_QUOTA
30399 + handle_quota_files(s, qf_names, &qfmt);
30400 +#endif
30402 + if (blocks) {
30403 + SWARN(silent, s, "jmacd-7", "resize option for remount only");
30404 + goto error_unlocked;
30407 + /*
30408 + * try old format (undistributed bitmap, super block in the 8th 1k
30409 + * block of a device)
30410 + */
30411 + if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES))
30412 + old_format = 1;
30414 + /*
30415 + * try new format (64-th 1k block), which can contain reiserfs
30416 + * super block
30417 + */
30418 + else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
30419 + SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
30420 + s->s_id);
30421 + goto error_unlocked;
30424 + s->s_time_min = 0;
30425 + s->s_time_max = U32_MAX;
30427 + rs = SB_DISK_SUPER_BLOCK(s);
30428 + /*
30429 + * Let's do a basic sanity check to verify that the underlying device
30430 + * is not smaller than the filesystem. If the check fails then abort
30431 + * and scream, because bad stuff will happen otherwise.
30432 + */
30433 + if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) {
30434 + SWARN(silent, s, "", "Filesystem cannot be "
30435 + "mounted because it is bigger than the device");
30436 + SWARN(silent, s, "", "You may need to run fsck "
30437 + "or increase size of your LVM partition");
30438 + SWARN(silent, s, "", "Or maybe you forgot to "
30439 + "reboot after fdisk when it told you to");
30440 + goto error_unlocked;
30443 + sbi->s_mount_state = SB_REISERFS_STATE(s);
30444 + sbi->s_mount_state = REISERFS_VALID_FS;
30446 + if ((errval = reiserfs_init_bitmap_cache(s))) {
30447 + SWARN(silent, s, "jmacd-8", "unable to read bitmap");
30448 + goto error_unlocked;
30451 + errval = -EINVAL;
30452 +#ifdef CONFIG_REISERFS_CHECK
30453 + SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
30454 + SWARN(silent, s, "", "- it is slow mode for debugging.");
30455 +#endif
30457 + /* make data=ordered the default */
30458 + if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
30459 + !reiserfs_data_writeback(s)) {
30460 + sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
30463 + if (reiserfs_data_log(s)) {
30464 + reiserfs_info(s, "using journaled data mode\n");
30465 + } else if (reiserfs_data_ordered(s)) {
30466 + reiserfs_info(s, "using ordered data mode\n");
30467 + } else {
30468 + reiserfs_info(s, "using writeback data mode\n");
30470 + if (reiserfs_barrier_flush(s)) {
30471 + printk("reiserfs: using flush barriers\n");
30474 + if (journal_init(s, jdev_name, old_format, commit_max_age)) {
30475 + SWARN(silent, s, "sh-2022",
30476 + "unable to initialize journal space");
30477 + goto error_unlocked;
30478 + } else {
30479 + /*
30480 + * once this is set, journal_release must be called
30481 + * if we error out of the mount
30482 + */
30483 + jinit_done = 1;
30486 + if (reread_meta_blocks(s)) {
30487 + SWARN(silent, s, "jmacd-9",
30488 + "unable to reread meta blocks after journal init");
30489 + goto error_unlocked;
30492 + if (replay_only(s))
30493 + goto error_unlocked;
30495 + s->s_xattr = reiserfs_xattr_handlers;
30497 + if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) {
30498 + SWARN(silent, s, "clm-7000",
30499 + "Detected readonly device, marking FS readonly");
30500 + s->s_flags |= SB_RDONLY;
30502 + args.objectid = REISERFS_ROOT_OBJECTID;
30503 + args.dirid = REISERFS_ROOT_PARENT_OBJECTID;
30504 + root_inode =
30505 + iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor,
30506 + reiserfs_init_locked_inode, (void *)&args);
30507 + if (!root_inode) {
30508 + SWARN(silent, s, "jmacd-10", "get root inode failed");
30509 + goto error_unlocked;
30512 + /*
30513 + * This path was assumed to be called with the BKL held in the old days.
30514 + * Now we have inherited the big reiserfs lock from it and many
30515 + * reiserfs helpers called in the mount path and elsewhere require
30516 + * this lock to be held even if it's not always necessary. Let's be
30517 + * conservative and hold it early. The window can be reduced after
30518 + * careful review of the code.
30519 + */
30520 + reiserfs_write_lock(s);
30522 + if (root_inode->i_state & I_NEW) {
30523 + reiserfs_read_locked_inode(root_inode, &args);
30524 + unlock_new_inode(root_inode);
30527 + if (!S_ISDIR(root_inode->i_mode) || !inode_get_bytes(root_inode) ||
30528 + !root_inode->i_size) {
30529 + SWARN(silent, s, "", "corrupt root inode, run fsck");
30530 + iput(root_inode);
30531 + errval = -EUCLEAN;
30532 + goto error;
30535 + s->s_root = d_make_root(root_inode);
30536 + if (!s->s_root)
30537 + goto error;
30538 + /* define and initialize hash function */
30539 + sbi->s_hash_function = hash_function(s);
30540 + if (sbi->s_hash_function == NULL) {
30541 + dput(s->s_root);
30542 + s->s_root = NULL;
30543 + goto error;
30546 + if (is_reiserfs_3_5(rs)
30547 + || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1))
30548 + set_bit(REISERFS_3_5, &sbi->s_properties);
30549 + else if (old_format)
30550 + set_bit(REISERFS_OLD_FORMAT, &sbi->s_properties);
30551 + else
30552 + set_bit(REISERFS_3_6, &sbi->s_properties);
30554 + if (!sb_rdonly(s)) {
30556 + errval = journal_begin(&th, s, 1);
30557 + if (errval) {
30558 + dput(s->s_root);
30559 + s->s_root = NULL;
30560 + goto error;
30562 + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
30564 + set_sb_umount_state(rs, REISERFS_ERROR_FS);
30565 + set_sb_fs_state(rs, 0);
30567 + /*
30568 + * Clear out s_bmap_nr if it would wrap. We can handle this
30569 + * case, but older revisions can't. This will cause the
30570 + * file system to fail mount on those older implementations,
30571 + * avoiding corruption. -jeffm
30572 + */
30573 + if (bmap_would_wrap(reiserfs_bmap_count(s)) &&
30574 + sb_bmap_nr(rs) != 0) {
30575 + reiserfs_warning(s, "super-2030", "This file system "
30576 + "claims to use %u bitmap blocks in "
30577 + "its super block, but requires %u. "
30578 + "Clearing to zero.", sb_bmap_nr(rs),
30579 + reiserfs_bmap_count(s));
30581 + set_sb_bmap_nr(rs, 0);
30584 + if (old_format_only(s)) {
30585 + /*
30586 + * filesystem of format 3.5 either with standard
30587 + * or non-standard journal
30588 + */
30589 + if (convert_reiserfs(s)) {
30590 + /* and -o conv is given */
30591 + if (!silent)
30592 + reiserfs_info(s,
30593 + "converting 3.5 filesystem to the 3.6 format");
30595 + if (is_reiserfs_3_5(rs))
30596 + /*
30597 + * put magic string of 3.6 format.
30598 + * 2.2 will not be able to
30599 + * mount this filesystem anymore
30600 + */
30601 + memcpy(rs->s_v1.s_magic,
30602 + reiserfs_3_6_magic_string,
30603 + sizeof
30604 + (reiserfs_3_6_magic_string));
30606 + set_sb_version(rs, REISERFS_VERSION_2);
30607 + reiserfs_convert_objectid_map_v1(s);
30608 + set_bit(REISERFS_3_6, &sbi->s_properties);
30609 + clear_bit(REISERFS_3_5, &sbi->s_properties);
30610 + } else if (!silent) {
30611 + reiserfs_info(s, "using 3.5.x disk format\n");
30613 + } else
30614 + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
30617 + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
30618 + errval = journal_end(&th);
30619 + if (errval) {
30620 + dput(s->s_root);
30621 + s->s_root = NULL;
30622 + goto error;
30625 + reiserfs_write_unlock(s);
30626 + if ((errval = reiserfs_lookup_privroot(s)) ||
30627 + (errval = reiserfs_xattr_init(s, s->s_flags))) {
30628 + dput(s->s_root);
30629 + s->s_root = NULL;
30630 + goto error_unlocked;
30632 + reiserfs_write_lock(s);
30634 + /*
30635 + * look for files which were to be removed in the previous session
30636 + */
30637 + finish_unfinished(s);
30638 + } else {
30639 + if (old_format_only(s) && !silent) {
30640 + reiserfs_info(s, "using 3.5.x disk format\n");
30643 + reiserfs_write_unlock(s);
30644 + if ((errval = reiserfs_lookup_privroot(s)) ||
30645 + (errval = reiserfs_xattr_init(s, s->s_flags))) {
30646 + dput(s->s_root);
30647 + s->s_root = NULL;
30648 + goto error_unlocked;
30650 + reiserfs_write_lock(s);
30652 + /*
30653 + * mark hash in super block: it could be unset. overwrite should be ok
30654 + */
30655 + set_sb_hash_function_code(rs, function2code(sbi->s_hash_function));
30657 + handle_attrs(s);
30659 + reiserfs_proc_info_init(s);
30661 + init_waitqueue_head(&(sbi->s_wait));
30662 + spin_lock_init(&sbi->bitmap_lock);
30664 + reiserfs_write_unlock(s);
30666 + return (0);
30668 +error:
30669 + reiserfs_write_unlock(s);
30671 +error_unlocked:
30672 + /* kill the commit thread, free journal ram */
30673 + if (jinit_done) {
30674 + reiserfs_write_lock(s);
30675 + journal_release_error(NULL, s);
30676 + reiserfs_write_unlock(s);
30679 + if (sbi->commit_wq)
30680 + destroy_workqueue(sbi->commit_wq);
30682 + reiserfs_cancel_old_flush(s);
30684 + reiserfs_free_bitmap_cache(s);
30685 + if (SB_BUFFER_WITH_SB(s))
30686 + brelse(SB_BUFFER_WITH_SB(s));
30687 +#ifdef CONFIG_QUOTA
30689 + int j;
30690 + for (j = 0; j < REISERFS_MAXQUOTAS; j++)
30691 + kfree(qf_names[j]);
30693 +#endif
30694 + kfree(sbi->s_jdev);
30695 + kfree(sbi);
30697 + s->s_fs_info = NULL;
30698 + return errval;
30701 +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
30703 + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb);
30705 + buf->f_namelen = (REISERFS_MAX_NAME(dentry->d_sb->s_blocksize));
30706 + buf->f_bfree = sb_free_blocks(rs);
30707 + buf->f_bavail = buf->f_bfree;
30708 + buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
30709 + buf->f_bsize = dentry->d_sb->s_blocksize;
30710 + /* changed to accommodate gcc folks. */
30711 + buf->f_type = REISERFS_SUPER_MAGIC;
30712 + buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
30713 + buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
30714 + sizeof(rs->s_uuid)/2);
30716 + return 0;
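The fsid above is derived by CRC-ing each half of the 16-byte on-disk UUID into one 32-bit word. A userspace sketch of the same split; crc32_le_sketch is a hypothetical stand-in for the kernel's crc32_le() and differs in initialization details, so treat the exact output values as illustrative:

#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-in for the kernel's crc32_le() */
static uint32_t crc32_le_sketch(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	uint8_t uuid[16] = { 0xC0, 0xFF, 0xEE }; /* assumed sample s_uuid */
	uint32_t fsid[2];

	/* one 32-bit word per UUID half, as in reiserfs_statfs() above */
	fsid[0] = crc32_le_sketch(0, uuid, sizeof(uuid) / 2);
	fsid[1] = crc32_le_sketch(0, uuid + sizeof(uuid) / 2, sizeof(uuid) / 2);
	printf("%08x:%08x\n", fsid[0], fsid[1]);
	return 0;
}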
30719 +#ifdef CONFIG_QUOTA
30720 +static int reiserfs_write_dquot(struct dquot *dquot)
30722 + struct reiserfs_transaction_handle th;
30723 + int ret, err;
30724 + int depth;
30726 + reiserfs_write_lock(dquot->dq_sb);
30727 + ret =
30728 + journal_begin(&th, dquot->dq_sb,
30729 + REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
30730 + if (ret)
30731 + goto out;
30732 + depth = reiserfs_write_unlock_nested(dquot->dq_sb);
30733 + ret = dquot_commit(dquot);
30734 + reiserfs_write_lock_nested(dquot->dq_sb, depth);
30735 + err = journal_end(&th);
30736 + if (!ret && err)
30737 + ret = err;
30738 +out:
30739 + reiserfs_write_unlock(dquot->dq_sb);
30740 + return ret;
30743 +static int reiserfs_acquire_dquot(struct dquot *dquot)
30745 + struct reiserfs_transaction_handle th;
30746 + int ret, err;
30747 + int depth;
30749 + reiserfs_write_lock(dquot->dq_sb);
30750 + ret =
30751 + journal_begin(&th, dquot->dq_sb,
30752 + REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
30753 + if (ret)
30754 + goto out;
30755 + depth = reiserfs_write_unlock_nested(dquot->dq_sb);
30756 + ret = dquot_acquire(dquot);
30757 + reiserfs_write_lock_nested(dquot->dq_sb, depth);
30758 + err = journal_end(&th);
30759 + if (!ret && err)
30760 + ret = err;
30761 +out:
30762 + reiserfs_write_unlock(dquot->dq_sb);
30763 + return ret;
30766 +static int reiserfs_release_dquot(struct dquot *dquot)
30768 + struct reiserfs_transaction_handle th;
30769 + int ret, err;
30771 + reiserfs_write_lock(dquot->dq_sb);
30772 + ret =
30773 + journal_begin(&th, dquot->dq_sb,
30774 + REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
30775 + reiserfs_write_unlock(dquot->dq_sb);
30776 + if (ret) {
30777 + /* Release dquot anyway to avoid endless cycle in dqput() */
30778 + dquot_release(dquot);
30779 + goto out;
30781 + ret = dquot_release(dquot);
30782 + reiserfs_write_lock(dquot->dq_sb);
30783 + err = journal_end(&th);
30784 + if (!ret && err)
30785 + ret = err;
30786 + reiserfs_write_unlock(dquot->dq_sb);
30787 +out:
30788 + return ret;
30791 +static int reiserfs_mark_dquot_dirty(struct dquot *dquot)
30793 + /* Are we journaling quotas? */
30794 + if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
30795 + REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
30796 + dquot_mark_dquot_dirty(dquot);
30797 + return reiserfs_write_dquot(dquot);
30798 + } else
30799 + return dquot_mark_dquot_dirty(dquot);
30802 +static int reiserfs_write_info(struct super_block *sb, int type)
30804 + struct reiserfs_transaction_handle th;
30805 + int ret, err;
30806 + int depth;
30808 + /* Data block + inode block */
30809 + reiserfs_write_lock(sb);
30810 + ret = journal_begin(&th, sb, 2);
30811 + if (ret)
30812 + goto out;
30813 + depth = reiserfs_write_unlock_nested(sb);
30814 + ret = dquot_commit_info(sb, type);
30815 + reiserfs_write_lock_nested(sb, depth);
30816 + err = journal_end(&th);
30817 + if (!ret && err)
30818 + ret = err;
30819 +out:
30820 + reiserfs_write_unlock(sb);
30821 + return ret;
30825 + * Turn on quotas at mount time - we need to find the quota file and such...
30826 + */
30827 +static int reiserfs_quota_on_mount(struct super_block *sb, int type)
30829 + return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
30830 + REISERFS_SB(sb)->s_jquota_fmt, type);
30834 + * Standard function to be called on quota_on
30835 + */
30836 +static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
30837 + const struct path *path)
30839 + int err;
30840 + struct inode *inode;
30841 + struct reiserfs_transaction_handle th;
30842 + int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA;
30844 + reiserfs_write_lock(sb);
30845 + if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) {
30846 + err = -EINVAL;
30847 + goto out;
30850 + /* Quotafile not on the same filesystem? */
30851 + if (path->dentry->d_sb != sb) {
30852 + err = -EXDEV;
30853 + goto out;
30855 + inode = d_inode(path->dentry);
30856 + /*
30857 + * We must not pack tails for quota files on reiserfs for quota
30858 + * IO to work
30859 + */
30860 + if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
30861 + err = reiserfs_unpack(inode);
30862 + if (err) {
30863 + reiserfs_warning(sb, "super-6520",
30864 + "Unpacking tail of quota file failed"
30865 + " (%d). Cannot turn on quotas.", err);
30866 + err = -EINVAL;
30867 + goto out;
30869 + mark_inode_dirty(inode);
30871 + /* Journaling quota? */
30872 + if (REISERFS_SB(sb)->s_qf_names[type]) {
30873 + /* Quotafile not of fs root? */
30874 + if (path->dentry->d_parent != sb->s_root)
30875 + reiserfs_warning(sb, "super-6521",
30876 + "Quota file not on filesystem root. "
30877 + "Journalled quota will not work.");
30880 + /*
30881 + * When we journal data on quota file, we have to flush journal to see
30882 + * all updates to the file when we bypass pagecache...
30883 + */
30884 + if (reiserfs_file_data_log(inode)) {
30885 + /* Just start temporary transaction and finish it */
30886 + err = journal_begin(&th, sb, 1);
30887 + if (err)
30888 + goto out;
30889 + err = journal_end_sync(&th);
30890 + if (err)
30891 + goto out;
30893 + reiserfs_write_unlock(sb);
30894 + err = dquot_quota_on(sb, type, format_id, path);
30895 + if (!err) {
30896 + inode_lock(inode);
30897 + REISERFS_I(inode)->i_attrs |= REISERFS_IMMUTABLE_FL |
30898 + REISERFS_NOATIME_FL;
30899 + inode_set_flags(inode, S_IMMUTABLE | S_NOATIME,
30900 + S_IMMUTABLE | S_NOATIME);
30901 + inode_unlock(inode);
30902 + mark_inode_dirty(inode);
30904 + return err;
30905 +out:
30906 + reiserfs_write_unlock(sb);
30907 + return err;
30910 +static int reiserfs_quota_off(struct super_block *sb, int type)
30912 + int err;
30913 + struct inode *inode = sb_dqopt(sb)->files[type];
30915 + if (!inode || !igrab(inode))
30916 + goto out;
30918 + err = dquot_quota_off(sb, type);
30919 + if (err)
30920 + goto out_put;
30922 + inode_lock(inode);
30923 + REISERFS_I(inode)->i_attrs &= ~(REISERFS_IMMUTABLE_FL |
30924 + REISERFS_NOATIME_FL);
30925 + inode_set_flags(inode, 0, S_IMMUTABLE | S_NOATIME);
30926 + inode_unlock(inode);
30927 + mark_inode_dirty(inode);
30928 +out_put:
30929 + iput(inode);
30930 + return err;
30931 +out:
30932 + return dquot_quota_off(sb, type);
30936 + * Read data from quotafile - avoid pagecache and such because we cannot afford
30937 + * acquiring the locks... As quota files are never truncated and quota code
30938 + * itself serializes the operations (and no one else should touch the files)
30939 + * we don't have to be afraid of races
30940 + */
30941 +static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
30942 + size_t len, loff_t off)
30944 + struct inode *inode = sb_dqopt(sb)->files[type];
30945 + unsigned long blk = off >> sb->s_blocksize_bits;
30946 + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
30947 + size_t toread;
30948 + struct buffer_head tmp_bh, *bh;
30949 + loff_t i_size = i_size_read(inode);
30951 + if (off > i_size)
30952 + return 0;
30953 + if (off + len > i_size)
30954 + len = i_size - off;
30955 + toread = len;
30956 + while (toread > 0) {
30957 + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
30958 + tmp_bh.b_state = 0;
30959 + /*
30960 + * Quota files are without tails so we can safely
30961 + * use this function
30962 + */
30963 + reiserfs_write_lock(sb);
30964 + err = reiserfs_get_block(inode, blk, &tmp_bh, 0);
30965 + reiserfs_write_unlock(sb);
30966 + if (err)
30967 + return err;
30968 + if (!buffer_mapped(&tmp_bh)) /* A hole? */
30969 + memset(data, 0, tocopy);
30970 + else {
30971 + bh = sb_bread(sb, tmp_bh.b_blocknr);
30972 + if (!bh)
30973 + return -EIO;
30974 + memcpy(data, bh->b_data + offset, tocopy);
30975 + brelse(bh);
30977 + offset = 0;
30978 + toread -= tocopy;
30979 + data += tocopy;
30980 + blk++;
30982 + return len;
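The blk/offset/tocopy arithmetic above is the generic partial-block walk: the first iteration copies from offset to the end of its block, every later iteration starts at offset 0. A self-contained userspace sketch with a deliberately tiny block size (all names and sizes here are illustrative):

#include <stdio.h>
#include <string.h>

#define BLKSIZE 8 /* tiny block size so the walk is visible */

/* same blk/offset/tocopy arithmetic as reiserfs_quota_read(), in userspace */
static void read_range(const char *disk, char *out, size_t len, size_t off)
{
	size_t blk = off / BLKSIZE;
	size_t offset = off % BLKSIZE;

	while (len > 0) {
		size_t tocopy = BLKSIZE - offset < len ? BLKSIZE - offset : len;

		memcpy(out, disk + blk * BLKSIZE + offset, tocopy);
		offset = 0; /* later blocks are read from their start */
		len -= tocopy;
		out += tocopy;
		blk++;
	}
}

int main(void)
{
	char disk[] = "0123456789abcdefghijklmn";
	char out[8] = { 0 };

	read_range(disk, out, 6, 5); /* crosses the block 0 / block 1 edge */
	printf("%.6s\n", out);       /* prints 56789a */
	return 0;
}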
30986 + * Write to quotafile (we know the transaction is already started and has
30987 + * enough credits)
30988 + */
30989 +static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
30990 + const char *data, size_t len, loff_t off)
30992 + struct inode *inode = sb_dqopt(sb)->files[type];
30993 + unsigned long blk = off >> sb->s_blocksize_bits;
30994 + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
30995 + int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL;
30996 + size_t towrite = len;
30997 + struct buffer_head tmp_bh, *bh;
30999 + if (!current->journal_info) {
31000 + printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n",
31001 + (unsigned long long)off, (unsigned long long)len);
31002 + return -EIO;
31004 + while (towrite > 0) {
31005 + tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite);
31006 + tmp_bh.b_state = 0;
31007 + reiserfs_write_lock(sb);
31008 + err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
31009 + reiserfs_write_unlock(sb);
31010 + if (err)
31011 + goto out;
31012 + if (offset || tocopy != sb->s_blocksize)
31013 + bh = sb_bread(sb, tmp_bh.b_blocknr);
31014 + else
31015 + bh = sb_getblk(sb, tmp_bh.b_blocknr);
31016 + if (!bh) {
31017 + err = -EIO;
31018 + goto out;
31020 + lock_buffer(bh);
31021 + memcpy(bh->b_data + offset, data, tocopy);
31022 + flush_dcache_page(bh->b_page);
31023 + set_buffer_uptodate(bh);
31024 + unlock_buffer(bh);
31025 + reiserfs_write_lock(sb);
31026 + reiserfs_prepare_for_journal(sb, bh, 1);
31027 + journal_mark_dirty(current->journal_info, bh);
31028 + if (!journal_quota)
31029 + reiserfs_add_ordered_list(inode, bh);
31030 + reiserfs_write_unlock(sb);
31031 + brelse(bh);
31032 + offset = 0;
31033 + towrite -= tocopy;
31034 + data += tocopy;
31035 + blk++;
31037 +out:
31038 + if (len == towrite)
31039 + return err;
31040 + if (inode->i_size < off + len - towrite)
31041 + i_size_write(inode, off + len - towrite);
31042 + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
31043 + mark_inode_dirty(inode);
31044 + return len - towrite;
31047 +#endif
31049 +static struct dentry *get_super_block(struct file_system_type *fs_type,
31050 + int flags, const char *dev_name,
31051 + void *data)
31053 + return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
31056 +static int __init init_reiserfs_fs(void)
31058 + int ret;
31060 + ret = init_inodecache();
31061 + if (ret)
31062 + return ret;
31064 + reiserfs_proc_info_global_init();
31066 + ret = register_filesystem(&reiserfs_fs_type);
31067 + if (ret)
31068 + goto out;
31070 + return 0;
31071 +out:
31072 + reiserfs_proc_info_global_done();
31073 + destroy_inodecache();
31075 + return ret;
31078 +static void __exit exit_reiserfs_fs(void)
31080 + reiserfs_proc_info_global_done();
31081 + unregister_filesystem(&reiserfs_fs_type);
31082 + destroy_inodecache();
31085 +struct file_system_type reiserfs_fs_type = {
31086 + .owner = THIS_MODULE,
31087 + .name = "reiserfs",
31088 + .mount = get_super_block,
31089 + .kill_sb = reiserfs_kill_sb,
31090 + .fs_flags = FS_REQUIRES_DEV,
31092 +MODULE_ALIAS_FS("reiserfs");
31094 +MODULE_DESCRIPTION("ReiserFS journaled filesystem");
31095 +MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>");
31096 +MODULE_LICENSE("GPL");
31098 +module_init(init_reiserfs_fs);
31099 +module_exit(exit_reiserfs_fs);
31100 diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
31101 new file mode 100644
31102 index 000000000000..2cec61af2a9e
31103 --- /dev/null
31104 +++ b/fs/reiserfs/tail_conversion.c
31105 @@ -0,0 +1,318 @@
31106 +// SPDX-License-Identifier: GPL-2.0
31108 + * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright
31109 + * details
31110 + */
31112 +#include <linux/time.h>
31113 +#include <linux/pagemap.h>
31114 +#include <linux/buffer_head.h>
31115 +#include "reiserfs.h"
31118 + * access to the tail: a reader must first make sure no conversion is
31119 + * running; direct2indirect and indirect2direct cannot run concurrently
31120 + */
31123 + * Converts direct items to an unformatted node. Panics if the file has
31124 + * no tail. Returns -ENOSPC if there is no disk space for the conversion
31125 + */
31127 + * path points to the first direct item of the file regardless of how
31128 + * many of them there are
31129 + */
31130 +int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
31131 + struct treepath *path, struct buffer_head *unbh,
31132 + loff_t tail_offset)
31134 + struct super_block *sb = inode->i_sb;
31135 + struct buffer_head *up_to_date_bh;
31136 + struct item_head *p_le_ih = tp_item_head(path);
31137 + unsigned long total_tail = 0;
31139 + /* Key to search for the last byte of the converted item. */
31140 + struct cpu_key end_key;
31142 + /*
31143 + * new indirect item to be inserted or key
31144 + * of unfm pointer to be pasted
31145 + */
31146 + struct item_head ind_ih;
31147 + int blk_size;
31148 + /* returned value for reiserfs_insert_item and clones */
31149 + int retval;
31150 + /* Handle on an unformatted node that will be inserted in the tree. */
31151 + unp_t unfm_ptr;
31153 + BUG_ON(!th->t_trans_id);
31155 + REISERFS_SB(sb)->s_direct2indirect++;
31157 + blk_size = sb->s_blocksize;
31159 + /*
31160 + * and key to search for append or insert pointer to the new
31161 + * unformatted node.
31162 + */
31163 + copy_item_head(&ind_ih, p_le_ih);
31164 + set_le_ih_k_offset(&ind_ih, tail_offset);
31165 + set_le_ih_k_type(&ind_ih, TYPE_INDIRECT);
31167 + /* Set the key to search for the place for new unfm pointer */
31168 + make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4);
31170 + /* FIXME: we could avoid this */
31171 + if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) {
31172 + reiserfs_error(sb, "PAP-14030",
31173 + "pasted or inserted byte exists in "
31174 + "the tree %K. Use fsck to repair.", &end_key);
31175 + pathrelse(path);
31176 + return -EIO;
31179 + p_le_ih = tp_item_head(path);
31181 + unfm_ptr = cpu_to_le32(unbh->b_blocknr);
31183 + if (is_statdata_le_ih(p_le_ih)) {
31184 + /* Insert new indirect item. */
31185 + set_ih_free_space(&ind_ih, 0); /* delete at nearest future */
31186 + put_ih_item_len(&ind_ih, UNFM_P_SIZE);
31187 + PATH_LAST_POSITION(path)++;
31188 + retval =
31189 + reiserfs_insert_item(th, path, &end_key, &ind_ih, inode,
31190 + (char *)&unfm_ptr);
31191 + } else {
31192 + /* Paste into last indirect item of an object. */
31193 + retval = reiserfs_paste_into_item(th, path, &end_key, inode,
31194 + (char *)&unfm_ptr,
31195 + UNFM_P_SIZE);
31197 + if (retval) {
31198 + return retval;
31200 + /*
31201 + * note: from here there are two keys which have matching first
31202 + * three key components. They only differ by the fourth one.
31203 + */
31205 + /* Set the key to search for the direct items of the file */
31206 + make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT,
31207 + 4);
31209 + /*
31210 + * Move bytes from the direct items to the new unformatted node
31211 + * and delete them.
31212 + */
31213 + while (1) {
31214 + int tail_size;
31216 + /*
31217 + * end_key.k_offset is set so, that we will always have found
31218 + * last item of the file
31219 + */
31220 + if (search_for_position_by_key(sb, &end_key, path) ==
31221 + POSITION_FOUND)
31222 + reiserfs_panic(sb, "PAP-14050",
31223 + "direct item (%K) not found", &end_key);
31224 + p_le_ih = tp_item_head(path);
31225 + RFALSE(!is_direct_le_ih(p_le_ih),
31226 + "vs-14055: direct item expected(%K), found %h",
31227 + &end_key, p_le_ih);
31228 + tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1))
31229 + + ih_item_len(p_le_ih) - 1;
31231 + /*
31232 + * we only send the unbh pointer if the buffer is not
31233 + * up to date. this avoids overwriting good data from
31234 + * writepage() with old data from the disk or buffer cache
31235 + * Special case: unbh->b_page will be NULL if we are coming
31236 + * through DIRECT_IO handler here.
31237 + */
31238 + if (!unbh->b_page || buffer_uptodate(unbh)
31239 + || PageUptodate(unbh->b_page)) {
31240 + up_to_date_bh = NULL;
31241 + } else {
31242 + up_to_date_bh = unbh;
31244 + retval = reiserfs_delete_item(th, path, &end_key, inode,
31245 + up_to_date_bh);
31247 + total_tail += retval;
31249 + /* done: file does not have direct items anymore */
31250 + if (tail_size == retval)
31251 + break;
31254 + /*
31255 + * if we've copied bytes from disk into the page, we need to zero
31256 + * out the unused part of the block (it was not up to date before)
31257 + */
31258 + if (up_to_date_bh) {
31259 + unsigned pgoff =
31260 + (tail_offset + total_tail - 1) & (PAGE_SIZE - 1);
31261 + char *kaddr = kmap_atomic(up_to_date_bh->b_page);
31262 + memset(kaddr + pgoff, 0, blk_size - total_tail);
31263 + kunmap_atomic(kaddr);
31266 + REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
31268 + return 0;
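The tail_size computation in the conversion loop above masks the 1-based key offset into its block and adds the item length. A worked example under assumed numbers (4 KiB blocks, a direct item keyed at byte 4097 carrying 200 bytes):

#include <stdio.h>

int main(void)
{
	unsigned long blk_size = 4096;
	/* assumed: a direct item keyed at byte 4097 (1-based, i.e. the
	 * first byte of block 1) carrying 200 tail bytes */
	unsigned long ih_offset = 4097, ih_len = 200;
	unsigned long tail_size = (ih_offset & (blk_size - 1)) + ih_len - 1;

	printf("%lu\n", tail_size); /* 200: tail bytes within the block */
	return 0;
}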
31271 +/* stolen from fs/buffer.c */
31272 +void reiserfs_unmap_buffer(struct buffer_head *bh)
31274 + lock_buffer(bh);
31275 + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
31276 + BUG();
31278 + clear_buffer_dirty(bh);
31279 + /*
31280 + * Remove the buffer from whatever list it belongs to. We are mostly
31281 + * interested in removing it from per-sb j_dirty_buffers list, to avoid
31282 + * BUG() on attempt to write not mapped buffer
31283 + */
31284 + if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) {
31285 + struct inode *inode = bh->b_folio->mapping->host;
31286 + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
31287 + spin_lock(&j->j_dirty_buffers_lock);
31288 + list_del_init(&bh->b_assoc_buffers);
31289 + reiserfs_free_jh(bh);
31290 + spin_unlock(&j->j_dirty_buffers_lock);
31292 + clear_buffer_mapped(bh);
31293 + clear_buffer_req(bh);
31294 + clear_buffer_new(bh);
31295 + bh->b_bdev = NULL;
31296 + unlock_buffer(bh);
31300 + * this first locks the inode (neither reads nor sync are permitted),
31301 + * reads the tail through the page cache and inserts a direct item.
31302 + * When the direct item is inserted successfully, the inode is left
31303 + * locked. The return value is always what we expect (the number of
31304 + * cut bytes). But when the tail remains in the unformatted node, we
31305 + * set mode to SKIP_BALANCING and unlock the inode
31306 + */
31307 +int indirect2direct(struct reiserfs_transaction_handle *th,
31308 + struct inode *inode, struct page *page,
31309 + struct treepath *path, /* path to the indirect item. */
31310 + const struct cpu_key *item_key, /* Key to look for
31311 + * unformatted node
31312 + * pointer to be cut. */
31313 + loff_t n_new_file_size, /* New file size. */
31314 + char *mode)
31316 + struct super_block *sb = inode->i_sb;
31317 + struct item_head s_ih;
31318 + unsigned long block_size = sb->s_blocksize;
31319 + char *tail;
31320 + int tail_len, round_tail_len;
31321 + loff_t pos, pos1; /* position of first byte of the tail */
31322 + struct cpu_key key;
31324 + BUG_ON(!th->t_trans_id);
31326 + REISERFS_SB(sb)->s_indirect2direct++;
31328 + *mode = M_SKIP_BALANCING;
31330 + /* store item head path points to. */
31331 + copy_item_head(&s_ih, tp_item_head(path));
31333 + tail_len = (n_new_file_size & (block_size - 1));
31334 + if (get_inode_sd_version(inode) == STAT_DATA_V2)
31335 + round_tail_len = ROUND_UP(tail_len);
31336 + else
31337 + round_tail_len = tail_len;
31339 + pos =
31340 + le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE -
31341 + 1) * sb->s_blocksize;
31342 + pos1 = pos;
31344 + /*
31345 + * we are protected by i_mutex. The tail cannot disappear, nor can
31346 + * an append happen either:
31347 + * we are in truncate or packing the tail in file_release
31348 + */
31350 + tail = (char *)kmap(page); /* this can schedule */
31352 + if (path_changed(&s_ih, path)) {
31353 + /* re-search indirect item */
31354 + if (search_for_position_by_key(sb, item_key, path)
31355 + == POSITION_NOT_FOUND)
31356 + reiserfs_panic(sb, "PAP-5520",
31357 + "item to be converted %K does not exist",
31358 + item_key);
31359 + copy_item_head(&s_ih, tp_item_head(path));
31360 +#ifdef CONFIG_REISERFS_CHECK
31361 + pos = le_ih_k_offset(&s_ih) - 1 +
31362 + (ih_item_len(&s_ih) / UNFM_P_SIZE -
31363 + 1) * sb->s_blocksize;
31364 + if (pos != pos1)
31365 + reiserfs_panic(sb, "vs-5530", "tail position "
31366 + "changed while we were reading it");
31367 +#endif
31370 + /* Set direct item header to insert. */
31371 + make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode),
31372 + pos1 + 1, TYPE_DIRECT, round_tail_len,
31373 + 0xffff /*ih_free_space */ );
31375 + /*
31376 + * we want a pointer to the first byte of the tail in the page.
31377 + * the page was locked and this part of the page was up to date when
31378 + * indirect2direct was called, so we know the bytes are still valid
31379 + */
31380 + tail = tail + (pos & (PAGE_SIZE - 1));
31382 + PATH_LAST_POSITION(path)++;
31384 + key = *item_key;
31385 + set_cpu_key_k_type(&key, TYPE_DIRECT);
31386 + key.key_length = 4;
31387 + /* Insert tail as new direct item in the tree */
31388 + if (reiserfs_insert_item(th, path, &key, &s_ih, inode,
31389 + tail ? tail : NULL) < 0) {
31390 + /*
31391 + * No disk space. So we cannot convert the last unformatted node
31392 + * to a direct item. In this case we used to adjust the
31393 + * indirect item's ih_free_space. Now ih_free_space is not
31394 + * used, and it would be ideal to write zeros to the corresponding
31395 + * unformatted node. For now i_size serves as the guard against
31396 + * going past the end of the file
31397 + */
31398 + kunmap(page);
31399 + return block_size - round_tail_len;
31401 + kunmap(page);
31403 + /* make sure to get the i_blocks changes from reiserfs_insert_item */
31404 + reiserfs_update_sd(th, inode);
31406 + /*
31407 + * note: we have now the same as in above direct2indirect
31408 + * conversion: there are two keys which have matching first three
31409 + * key components. They only differ by the fourth one.
31410 + */
31412 + /*
31413 + * We have inserted new direct item and must remove last
31414 + * unformatted node.
31415 + */
31416 + *mode = M_CUT;
31418 + /* we store position of first direct item in the in-core inode */
31419 + /* mark_file_with_tail (inode, pos1 + 1); */
31420 + REISERFS_I(inode)->i_first_direct_byte = pos1 + 1;
31422 + return block_size - round_tail_len;
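The pos computation near the top of indirect2direct() locates the first byte of the last unformatted block the indirect item points to, which is where the tail bytes live. A worked example under assumed numbers (UNFM_P_SIZE of 4 bytes per pointer, 4 KiB blocks, an item at key offset 1 holding three pointers):

#include <stdio.h>

#define UNFM_P_SIZE 4 /* bytes per unformatted-node pointer (32-bit) */

int main(void)
{
	unsigned long blocksize = 4096;
	/* assumed: an indirect item at key offset 1 (file start) holding
	 * three block pointers, i.e. ih_item_len == 12 */
	unsigned long ih_offset = 1, ih_len = 12;

	/* byte position of the first byte of the last pointed-to block,
	 * which is where the tail to be converted lives */
	unsigned long pos = ih_offset - 1 +
			    (ih_len / UNFM_P_SIZE - 1) * blocksize;

	printf("%lu\n", pos); /* 8192 */
	return 0;
}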
31424 diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
31425 new file mode 100644
31426 index 000000000000..998035a6388e
31427 --- /dev/null
31428 +++ b/fs/reiserfs/xattr.c
31429 @@ -0,0 +1,1039 @@
31430 +// SPDX-License-Identifier: GPL-2.0
31432 + * linux/fs/reiserfs/xattr.c
31434 + * Copyright (c) 2002 by Jeff Mahoney, <jeffm@suse.com>
31436 + */
31439 + * In order to implement EA/ACLs in a clean, backwards compatible manner,
31440 + * they are implemented as files in a "private" directory.
31441 + * Each EA is in its own file, with the directory layout like so (/ is assumed
31442 + * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory,
31443 + * directories named using the capital-hex form of the objectid and
31444 + * generation number are used. Inside each directory are individual files
31445 + * named with the name of the extended attribute.
31447 + * So, for objectid 12648430, we could have:
31448 + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access
31449 + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default
31450 + * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type
31451 + * .. or similar.
31453 + * The file contents are the text of the EA. The size is known based on the
31454 + * stat data describing the file.
31456 + * In the case of system.posix_acl_access and system.posix_acl_default, since
31457 + * these are special cases for filesystem ACLs, they are interpreted by the
31458 + * kernel, in addition, they are negatively and positively cached and attached
31459 + * to the inode so that unnecessary lookups are avoided.
31461 + * Locking works like so:
31462 + * Directory components (xattr root, xattr dir) are protected by their i_mutex.
31463 + * The xattrs themselves are protected by the xattr_sem.
31464 + */
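For the objectid in the example above, the per-inode directory name falls out of the same "%X.%X" format that open_xa_dir() builds further down; a quick standalone check (buffer size and names here are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int objectid = 12648430, generation = 0;
	char namebuf[32];

	/* same "%X.%X" scheme open_xa_dir() uses below */
	snprintf(namebuf, sizeof(namebuf), "%X.%X", objectid, generation);
	printf("/.reiserfs_priv/xattrs/%s/user.Content-Type\n", namebuf);
	/* prints /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type */
	return 0;
}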
31466 +#include "reiserfs.h"
31467 +#include <linux/capability.h>
31468 +#include <linux/dcache.h>
31469 +#include <linux/namei.h>
31470 +#include <linux/errno.h>
31471 +#include <linux/gfp.h>
31472 +#include <linux/fs.h>
31473 +#include <linux/file.h>
31474 +#include <linux/pagemap.h>
31475 +#include <linux/xattr.h>
31476 +#include "xattr.h"
31477 +#include "acl.h"
31478 +#include <linux/uaccess.h>
31479 +#include <net/checksum.h>
31480 +#include <linux/stat.h>
31481 +#include <linux/quotaops.h>
31482 +#include <linux/security.h>
31483 +#include <linux/posix_acl_xattr.h>
31486 +#define PRIVROOT_NAME ".reiserfs_priv"
31487 +#define XAROOT_NAME "xattrs"
31491 + * Helpers for inode ops. We do this so that we don't have all the VFS
31492 + * overhead and also for proper i_mutex annotation.
31493 + * dir->i_mutex must be held for all of them.
31494 + */
31495 +#ifdef CONFIG_REISERFS_FS_XATTR
31496 +static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
31498 + BUG_ON(!inode_is_locked(dir));
31499 + return dir->i_op->create(&nop_mnt_idmap, dir, dentry, mode, true);
31501 +#endif
31503 +static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
31505 + BUG_ON(!inode_is_locked(dir));
31506 + return dir->i_op->mkdir(&nop_mnt_idmap, dir, dentry, mode);
31510 + * We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr
31511 + * mutation ops aren't called during rename or splice, which are the
31512 + * only other users of I_MUTEX_CHILD. It violates the ordering, but that's
31513 + * better than allocating another subclass just for this code.
31514 + */
31515 +static int xattr_unlink(struct inode *dir, struct dentry *dentry)
31517 + int error;
31519 + BUG_ON(!inode_is_locked(dir));
31521 + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
31522 + error = dir->i_op->unlink(dir, dentry);
31523 + inode_unlock(d_inode(dentry));
31525 + if (!error)
31526 + d_delete(dentry);
31527 + return error;
31530 +static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
31532 + int error;
31534 + BUG_ON(!inode_is_locked(dir));
31536 + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
31537 + error = dir->i_op->rmdir(dir, dentry);
31538 + if (!error)
31539 + d_inode(dentry)->i_flags |= S_DEAD;
31540 + inode_unlock(d_inode(dentry));
31541 + if (!error)
31542 + d_delete(dentry);
31544 + return error;
31547 +#define xattr_may_create(flags) (!flags || flags & XATTR_CREATE)
31549 +static struct dentry *open_xa_root(struct super_block *sb, int flags)
31551 + struct dentry *privroot = REISERFS_SB(sb)->priv_root;
31552 + struct dentry *xaroot;
31554 + if (d_really_is_negative(privroot))
31555 + return ERR_PTR(-EOPNOTSUPP);
31557 + inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR);
31559 + xaroot = dget(REISERFS_SB(sb)->xattr_root);
31560 + if (!xaroot)
31561 + xaroot = ERR_PTR(-EOPNOTSUPP);
31562 + else if (d_really_is_negative(xaroot)) {
31563 + int err = -ENODATA;
31565 + if (xattr_may_create(flags))
31566 + err = xattr_mkdir(d_inode(privroot), xaroot, 0700);
31567 + if (err) {
31568 + dput(xaroot);
31569 + xaroot = ERR_PTR(err);
31573 + inode_unlock(d_inode(privroot));
31574 + return xaroot;
31577 +static struct dentry *open_xa_dir(const struct inode *inode, int flags)
31579 + struct dentry *xaroot, *xadir;
31580 + char namebuf[17];
31582 + xaroot = open_xa_root(inode->i_sb, flags);
31583 + if (IS_ERR(xaroot))
31584 + return xaroot;
31586 + snprintf(namebuf, sizeof(namebuf), "%X.%X",
31587 + le32_to_cpu(INODE_PKEY(inode)->k_objectid),
31588 + inode->i_generation);
31590 + inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR);
31592 + xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
31593 + if (!IS_ERR(xadir) && d_really_is_negative(xadir)) {
31594 + int err = -ENODATA;
31596 + if (xattr_may_create(flags))
31597 + err = xattr_mkdir(d_inode(xaroot), xadir, 0700);
31598 + if (err) {
31599 + dput(xadir);
31600 + xadir = ERR_PTR(err);
31604 + inode_unlock(d_inode(xaroot));
31605 + dput(xaroot);
31606 + return xadir;
31610 + * The following are side effects of other operations that aren't explicitly
31611 + * modifying extended attributes. This includes operations such as permissions
31612 + * or ownership changes, object deletions, etc.
31613 + */
31614 +struct reiserfs_dentry_buf {
31615 + struct dir_context ctx;
31616 + struct dentry *xadir;
31617 + int count;
31618 + int err;
31619 + struct dentry *dentries[8];
31622 +static bool
31623 +fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
31624 + loff_t offset, u64 ino, unsigned int d_type)
31626 + struct reiserfs_dentry_buf *dbuf =
31627 + container_of(ctx, struct reiserfs_dentry_buf, ctx);
31628 + struct dentry *dentry;
31630 + WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir)));
31632 + if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
31633 + return false;
31635 + if (name[0] == '.' && (namelen < 2 ||
31636 + (namelen == 2 && name[1] == '.')))
31637 + return true;
31639 + dentry = lookup_one_len(name, dbuf->xadir, namelen);
31640 + if (IS_ERR(dentry)) {
31641 + dbuf->err = PTR_ERR(dentry);
31642 + return false;
31643 + } else if (d_really_is_negative(dentry)) {
31644 + /* A directory entry exists, but no file? */
31645 + reiserfs_error(dentry->d_sb, "xattr-20003",
31646 + "Corrupted directory: xattr %pd listed but "
31647 + "not found for file %pd.\n",
31648 + dentry, dbuf->xadir);
31649 + dput(dentry);
31650 + dbuf->err = -EIO;
31651 + return false;
31654 + dbuf->dentries[dbuf->count++] = dentry;
31655 + return true;
31658 +static void
31659 +cleanup_dentry_buf(struct reiserfs_dentry_buf *buf)
31661 + int i;
31663 + for (i = 0; i < buf->count; i++)
31664 + if (buf->dentries[i])
31665 + dput(buf->dentries[i]);
31668 +static int reiserfs_for_each_xattr(struct inode *inode,
31669 + int (*action)(struct dentry *, void *),
31670 + void *data)
31672 + struct dentry *dir;
31673 + int i, err = 0;
31674 + struct reiserfs_dentry_buf buf = {
31675 + .ctx.actor = fill_with_dentries,
31676 + };
31678 + /* Skip out, an xattr has no xattrs associated with it */
31679 + if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
31680 + return 0;
31682 + dir = open_xa_dir(inode, XATTR_REPLACE);
31683 + if (IS_ERR(dir)) {
31684 + err = PTR_ERR(dir);
31685 + goto out;
31686 + } else if (d_really_is_negative(dir)) {
31687 + err = 0;
31688 + goto out_dir;
31691 + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
31693 + buf.xadir = dir;
31694 + while (1) {
31695 + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
31696 + if (err)
31697 + break;
31698 + if (buf.err) {
31699 + err = buf.err;
31700 + break;
31702 + if (!buf.count)
31703 + break;
31704 + for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
31705 + struct dentry *dentry = buf.dentries[i];
31707 + if (!d_is_dir(dentry))
31708 + err = action(dentry, data);
31710 + dput(dentry);
31711 + buf.dentries[i] = NULL;
31713 + if (err)
31714 + break;
31715 + buf.count = 0;
31717 + inode_unlock(d_inode(dir));
31719 + cleanup_dentry_buf(&buf);
31721 + if (!err) {
31722 + /*
31723 + * We start a transaction here to avoid an ABBA situation
31724 + * between the xattr root's i_mutex and the journal lock.
31725 + * This doesn't incur much additional overhead since the
31726 + * new transaction will just nest inside the
31727 + * outer transaction.
31728 + */
31729 + int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
31730 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
31731 + struct reiserfs_transaction_handle th;
31733 + reiserfs_write_lock(inode->i_sb);
31734 + err = journal_begin(&th, inode->i_sb, blocks);
31735 + reiserfs_write_unlock(inode->i_sb);
31736 + if (!err) {
31737 + int jerror;
31739 + inode_lock_nested(d_inode(dir->d_parent),
31740 + I_MUTEX_XATTR);
31741 + err = action(dir, data);
31742 + reiserfs_write_lock(inode->i_sb);
31743 + jerror = journal_end(&th);
31744 + reiserfs_write_unlock(inode->i_sb);
31745 + inode_unlock(d_inode(dir->d_parent));
31746 + err = jerror ?: err;
31749 +out_dir:
31750 + dput(dir);
31751 +out:
31752 + /*
31753 + * -ENODATA: this object doesn't have any xattrs
31754 + * -EOPNOTSUPP: this file system doesn't have xattrs enabled on disk.
31755 + * Neither is an error.
31756 + */
31757 + if (err == -ENODATA || err == -EOPNOTSUPP)
31758 + err = 0;
31759 + return err;
31762 +static int delete_one_xattr(struct dentry *dentry, void *data)
31764 + struct inode *dir = d_inode(dentry->d_parent);
31766 + /* This is the xattr dir, handle specially. */
31767 + if (d_is_dir(dentry))
31768 + return xattr_rmdir(dir, dentry);
31770 + return xattr_unlink(dir, dentry);
31773 +static int chown_one_xattr(struct dentry *dentry, void *data)
31775 + struct iattr *attrs = data;
31776 + int ia_valid = attrs->ia_valid;
31777 + int err;
31779 + /*
31780 + * We only want the ownership bits. Otherwise, we'll do
31781 + * things like change a directory to a regular file if
31782 + * ATTR_MODE is set.
31783 + */
31784 + attrs->ia_valid &= (ATTR_UID|ATTR_GID);
31785 + err = reiserfs_setattr(&nop_mnt_idmap, dentry, attrs);
31786 + attrs->ia_valid = ia_valid;
31788 + return err;
31791 +/* No i_mutex, but the inode is unconnected. */
31792 +int reiserfs_delete_xattrs(struct inode *inode)
31794 + int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL);
31796 + if (err)
31797 + reiserfs_warning(inode->i_sb, "jdm-20004",
31798 + "Couldn't delete all xattrs (%d)\n", err);
31799 + return err;
31802 +/* inode->i_mutex: down */
31803 +int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
31805 + int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs);
31807 + if (err)
31808 + reiserfs_warning(inode->i_sb, "jdm-20007",
31809 + "Couldn't chown all xattrs (%d)\n", err);
31810 + return err;
31813 +#ifdef CONFIG_REISERFS_FS_XATTR
31815 + * Returns a dentry corresponding to a specific extended attribute file
31816 + * for the inode, creating it if the flags allow. Otherwise a positive
31817 + * or negative dentry, or an error, is returned.
31818 + */
31819 +static struct dentry *xattr_lookup(struct inode *inode, const char *name,
31820 + int flags)
31822 + struct dentry *xadir, *xafile;
31823 + int err = 0;
31825 + xadir = open_xa_dir(inode, flags);
31826 + if (IS_ERR(xadir))
31827 + return ERR_CAST(xadir);
31829 + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
31830 + xafile = lookup_one_len(name, xadir, strlen(name));
31831 + if (IS_ERR(xafile)) {
31832 + err = PTR_ERR(xafile);
31833 + goto out;
31836 + if (d_really_is_positive(xafile) && (flags & XATTR_CREATE))
31837 + err = -EEXIST;
31839 + if (d_really_is_negative(xafile)) {
31840 + err = -ENODATA;
31841 + if (xattr_may_create(flags))
31842 + err = xattr_create(d_inode(xadir), xafile,
31843 + 0700|S_IFREG);
31846 + if (err)
31847 + dput(xafile);
31848 +out:
31849 + inode_unlock(d_inode(xadir));
31850 + dput(xadir);
31851 + if (err)
31852 + return ERR_PTR(err);
31853 + return xafile;
31856 +/* Internal operations on file data */
31857 +static inline void reiserfs_put_page(struct page *page)
31859 + kunmap(page);
31860 + put_page(page);
31863 +static struct page *reiserfs_get_page(struct inode *dir, size_t n)
31865 + struct address_space *mapping = dir->i_mapping;
31866 + struct page *page;
31867 + /*
31868 + * We can deadlock if we try to free dentries,
31869 + * and an unlink/rmdir has just occurred - GFP_NOFS avoids this
31870 + */
31871 + mapping_set_gfp_mask(mapping, GFP_NOFS);
31872 + page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL);
31873 + if (!IS_ERR(page))
31874 + kmap(page);
31875 + return page;
31878 +static inline __u32 xattr_hash(const char *msg, int len)
31880 + /*
31881 + * csum_partial() gives different results for little-endian and
31882 + * big-endian hosts. Images created on little-endian hosts and
31883 + * mounted on big-endian hosts (and vice versa) will see csum mismatches
31884 + * when trying to fetch xattrs. Treating the hash as __wsum_t would
31885 + * lower the frequency of mismatch. This is an endianness bug in
31886 + * reiserfs. The return statement would result in a sparse warning. Do
31887 + * not fix the sparse warning so as to not hide a reminder of the bug.
31888 + */
31889 + return csum_partial(msg, len, 0);
31892 +int reiserfs_commit_write(struct file *f, struct page *page,
31893 + unsigned from, unsigned to);
31895 +static void update_ctime(struct inode *inode)
31897 + struct timespec64 now = current_time(inode);
31898 + struct timespec64 ctime = inode_get_ctime(inode);
31900 + if (inode_unhashed(inode) || !inode->i_nlink ||
31901 + timespec64_equal(&ctime, &now))
31902 + return;
31904 + inode_set_ctime_to_ts(inode, now);
31905 + mark_inode_dirty(inode);
31908 +static int lookup_and_delete_xattr(struct inode *inode, const char *name)
31910 + int err = 0;
31911 + struct dentry *dentry, *xadir;
31913 + xadir = open_xa_dir(inode, XATTR_REPLACE);
31914 + if (IS_ERR(xadir))
31915 + return PTR_ERR(xadir);
31917 + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
31918 + dentry = lookup_one_len(name, xadir, strlen(name));
31919 + if (IS_ERR(dentry)) {
31920 + err = PTR_ERR(dentry);
31921 + goto out_dput;
31924 + if (d_really_is_positive(dentry)) {
31925 + err = xattr_unlink(d_inode(xadir), dentry);
31926 + update_ctime(inode);
31929 + dput(dentry);
31930 +out_dput:
31931 + inode_unlock(d_inode(xadir));
31932 + dput(xadir);
31933 + return err;
31937 +/* Generic extended attribute operations that can be used by xa plugins */
31940 + * inode->i_mutex: down
31941 + */
31942 +int
31943 +reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
31944 + struct inode *inode, const char *name,
31945 + const void *buffer, size_t buffer_size, int flags)
31947 + int err = 0;
31948 + struct dentry *dentry;
31949 + struct page *page;
31950 + char *data;
31951 + size_t file_pos = 0;
31952 + size_t buffer_pos = 0;
31953 + size_t new_size;
31954 + __u32 xahash = 0;
31956 + if (get_inode_sd_version(inode) == STAT_DATA_V1)
31957 + return -EOPNOTSUPP;
31959 + if (!buffer) {
31960 + err = lookup_and_delete_xattr(inode, name);
31961 + return err;
31964 + dentry = xattr_lookup(inode, name, flags);
31965 + if (IS_ERR(dentry))
31966 + return PTR_ERR(dentry);
31968 + down_write(&REISERFS_I(inode)->i_xattr_sem);
31970 + xahash = xattr_hash(buffer, buffer_size);
31971 + while (buffer_pos < buffer_size || buffer_pos == 0) {
31972 + size_t chunk;
31973 + size_t skip = 0;
31974 + size_t page_offset = (file_pos & (PAGE_SIZE - 1));
31976 + if (buffer_size - buffer_pos > PAGE_SIZE)
31977 + chunk = PAGE_SIZE;
31978 + else
31979 + chunk = buffer_size - buffer_pos;
31981 + page = reiserfs_get_page(d_inode(dentry), file_pos);
31982 + if (IS_ERR(page)) {
31983 + err = PTR_ERR(page);
31984 + goto out_unlock;
31987 + lock_page(page);
31988 + data = page_address(page);
31990 + if (file_pos == 0) {
31991 + struct reiserfs_xattr_header *rxh;
31993 + skip = file_pos = sizeof(struct reiserfs_xattr_header);
31994 + if (chunk + skip > PAGE_SIZE)
31995 + chunk = PAGE_SIZE - skip;
31996 + rxh = (struct reiserfs_xattr_header *)data;
31997 + rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC);
31998 + rxh->h_hash = cpu_to_le32(xahash);
32001 + reiserfs_write_lock(inode->i_sb);
32002 + err = __reiserfs_write_begin(page, page_offset, chunk + skip);
32003 + if (!err) {
32004 + if (buffer)
32005 + memcpy(data + skip, buffer + buffer_pos, chunk);
32006 + err = reiserfs_commit_write(NULL, page, page_offset,
32007 + page_offset + chunk +
32008 + skip);
32010 + reiserfs_write_unlock(inode->i_sb);
32011 + unlock_page(page);
32012 + reiserfs_put_page(page);
32013 + buffer_pos += chunk;
32014 + file_pos += chunk;
32015 + skip = 0;
32016 + if (err || buffer_size == 0 || !buffer)
32017 + break;
32020 + new_size = buffer_size + sizeof(struct reiserfs_xattr_header);
32021 + if (!err && new_size < i_size_read(d_inode(dentry))) {
32022 + struct iattr newattrs = {
32023 + .ia_ctime = current_time(inode),
32024 + .ia_size = new_size,
32025 + .ia_valid = ATTR_SIZE | ATTR_CTIME,
32026 + };
32028 + inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR);
32029 + inode_dio_wait(d_inode(dentry));
32031 + err = reiserfs_setattr(&nop_mnt_idmap, dentry, &newattrs);
32032 + inode_unlock(d_inode(dentry));
32033 + } else
32034 + update_ctime(inode);
32035 +out_unlock:
32036 + up_write(&REISERFS_I(inode)->i_xattr_sem);
32037 + dput(dentry);
32038 + return err;
32041 +/* We need to start a transaction to maintain lock ordering */
32042 +int reiserfs_xattr_set(struct inode *inode, const char *name,
32043 + const void *buffer, size_t buffer_size, int flags)
32046 + struct reiserfs_transaction_handle th;
32047 + int error, error2;
32048 + size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size);
32050 + /* Check before we start a transaction and then do nothing. */
32051 + if (!d_really_is_positive(REISERFS_SB(inode->i_sb)->priv_root))
32052 + return -EOPNOTSUPP;
32054 + if (!(flags & XATTR_REPLACE))
32055 + jbegin_count += reiserfs_xattr_jcreate_nblocks(inode);
32057 + reiserfs_write_lock(inode->i_sb);
32058 + error = journal_begin(&th, inode->i_sb, jbegin_count);
32059 + reiserfs_write_unlock(inode->i_sb);
32060 + if (error) {
32061 + return error;
32064 + error = reiserfs_xattr_set_handle(&th, inode, name,
32065 + buffer, buffer_size, flags);
32067 + reiserfs_write_lock(inode->i_sb);
32068 + error2 = journal_end(&th);
32069 + reiserfs_write_unlock(inode->i_sb);
32070 + if (error == 0)
32071 + error = error2;
32073 + return error;
32077 + * inode->i_mutex: down
32078 + */
32079 +int
32080 +reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
32081 + size_t buffer_size)
32083 + ssize_t err = 0;
32084 + struct dentry *dentry;
32085 + size_t isize;
32086 + size_t file_pos = 0;
32087 + size_t buffer_pos = 0;
32088 + struct page *page;
32089 + __u32 hash = 0;
32091 + if (name == NULL)
32092 + return -EINVAL;
32094 + /*
32095 + * We can't have xattrs attached to v1 items since they don't have
32096 + * generation numbers
32097 + */
32098 + if (get_inode_sd_version(inode) == STAT_DATA_V1)
32099 + return -EOPNOTSUPP;
32101 + /*
32102 + * priv_root needn't be initialized during mount, so allow initial
32103 + * lookups to succeed.
32104 + */
32105 + if (!REISERFS_SB(inode->i_sb)->priv_root)
32106 + return 0;
32108 + dentry = xattr_lookup(inode, name, XATTR_REPLACE);
32109 + if (IS_ERR(dentry)) {
32110 + err = PTR_ERR(dentry);
32111 + goto out;
32114 + down_read(&REISERFS_I(inode)->i_xattr_sem);
32116 + isize = i_size_read(d_inode(dentry));
32118 + /* Just return the size needed */
32119 + if (buffer == NULL) {
32120 + err = isize - sizeof(struct reiserfs_xattr_header);
32121 + goto out_unlock;
32124 + if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) {
32125 + err = -ERANGE;
32126 + goto out_unlock;
32129 + while (file_pos < isize) {
32130 + size_t chunk;
32131 + char *data;
32132 + size_t skip = 0;
32134 + if (isize - file_pos > PAGE_SIZE)
32135 + chunk = PAGE_SIZE;
32136 + else
32137 + chunk = isize - file_pos;
32139 + page = reiserfs_get_page(d_inode(dentry), file_pos);
32140 + if (IS_ERR(page)) {
32141 + err = PTR_ERR(page);
32142 + goto out_unlock;
32145 + lock_page(page);
32146 + data = page_address(page);
32147 + if (file_pos == 0) {
32148 + struct reiserfs_xattr_header *rxh =
32149 + (struct reiserfs_xattr_header *)data;
32150 + skip = file_pos = sizeof(struct reiserfs_xattr_header);
32151 + chunk -= skip;
32152 + /* Magic doesn't match up.. */
32153 + if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) {
32154 + unlock_page(page);
32155 + reiserfs_put_page(page);
32156 + reiserfs_warning(inode->i_sb, "jdm-20001",
32157 + "Invalid magic for xattr (%s) "
32158 + "associated with %k", name,
32159 + INODE_PKEY(inode));
32160 + err = -EIO;
32161 + goto out_unlock;
32163 + hash = le32_to_cpu(rxh->h_hash);
32165 + memcpy(buffer + buffer_pos, data + skip, chunk);
32166 + unlock_page(page);
32167 + reiserfs_put_page(page);
32168 + file_pos += chunk;
32169 + buffer_pos += chunk;
32170 + skip = 0;
32172 + err = isize - sizeof(struct reiserfs_xattr_header);
32174 + if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) !=
32175 + hash) {
32176 + reiserfs_warning(inode->i_sb, "jdm-20002",
32177 + "Invalid hash for xattr (%s) associated "
32178 + "with %k", name, INODE_PKEY(inode));
32179 + err = -EIO;
32182 +out_unlock:
32183 + up_read(&REISERFS_I(inode)->i_xattr_sem);
32184 + dput(dentry);
32186 +out:
32187 + return err;
32191 + * In order to implement different sets of xattr operations for each xattr
32192 + * prefix with the generic xattr API, a filesystem should create a
32193 + * null-terminated array of struct xattr_handler (one for each prefix) and
32194 + * hang a pointer to it off of the s_xattr field of the superblock.
32196 + * The generic_fooxattr() functions will use this list to dispatch xattr
32197 + * operations to the correct xattr_handler.
32198 + */
32199 +#define for_each_xattr_handler(handlers, handler) \
32200 + for ((handler) = *(handlers)++; \
32201 + (handler) != NULL; \
32202 + (handler) = *(handlers)++)
32204 +static inline bool reiserfs_posix_acl_list(const char *name,
32205 + struct dentry *dentry)
32207 + return (posix_acl_type(name) >= 0) &&
32208 + IS_POSIXACL(d_backing_inode(dentry));
32211 +/* This is the implementation for the xattr plugin infrastructure */
32212 +static inline bool reiserfs_xattr_list(const struct xattr_handler * const *handlers,
32213 + const char *name, struct dentry *dentry)
32215 + if (handlers) {
32216 + const struct xattr_handler *xah = NULL;
32218 + for_each_xattr_handler(handlers, xah) {
32219 + const char *prefix = xattr_prefix(xah);
32221 + if (strncmp(prefix, name, strlen(prefix)))
32222 + continue;
32224 + if (!xattr_handler_can_list(xah, dentry))
32225 + return false;
32227 + return true;
32231 + return reiserfs_posix_acl_list(name, dentry);
32234 +struct listxattr_buf {
32235 + struct dir_context ctx;
32236 + size_t size;
32237 + size_t pos;
32238 + char *buf;
32239 + struct dentry *dentry;
32242 +static bool listxattr_filler(struct dir_context *ctx, const char *name,
32243 + int namelen, loff_t offset, u64 ino,
32244 + unsigned int d_type)
32246 + struct listxattr_buf *b =
32247 + container_of(ctx, struct listxattr_buf, ctx);
32248 + size_t size;
32250 + if (name[0] != '.' ||
32251 + (namelen != 1 && (name[1] != '.' || namelen != 2))) {
32252 + if (!reiserfs_xattr_list(b->dentry->d_sb->s_xattr, name,
32253 + b->dentry))
32254 + return true;
32255 + size = namelen + 1;
32256 + if (b->buf) {
32257 + if (b->pos + size > b->size) {
32258 + b->pos = -ERANGE;
32259 + return false;
32261 + memcpy(b->buf + b->pos, name, namelen);
32262 + b->buf[b->pos + namelen] = 0;
32264 + b->pos += size;
32266 + return true;
32270 + * Inode operation listxattr()
32272 + * We totally ignore the generic listxattr here because it would be stupid
32273 + * not to. Since the xattrs are organized in a directory, we can just
32274 + * readdir to find them.
32275 + */
32276 +ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
32278 + struct dentry *dir;
32279 + int err = 0;
32280 + struct listxattr_buf buf = {
32281 + .ctx.actor = listxattr_filler,
32282 + .dentry = dentry,
32283 + .buf = buffer,
32284 + .size = buffer ? size : 0,
32285 + };
32287 + if (d_really_is_negative(dentry))
32288 + return -EINVAL;
32290 + if (get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
32291 + return -EOPNOTSUPP;
32293 + dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE);
32294 + if (IS_ERR(dir)) {
32295 + err = PTR_ERR(dir);
32296 + if (err == -ENODATA)
32297 + err = 0; /* Not an error if there aren't any xattrs */
32298 + goto out;
32301 + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
32302 + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
32303 + inode_unlock(d_inode(dir));
32305 + if (!err)
32306 + err = buf.pos;
32308 + dput(dir);
32309 +out:
32310 + return err;
32313 +static int create_privroot(struct dentry *dentry)
32315 + int err;
32316 + struct inode *inode = d_inode(dentry->d_parent);
32318 + WARN_ON_ONCE(!inode_is_locked(inode));
32320 + err = xattr_mkdir(inode, dentry, 0700);
32321 + if (err || d_really_is_negative(dentry)) {
32322 + reiserfs_warning(dentry->d_sb, "jdm-20006",
32323 + "xattrs/ACLs enabled and couldn't "
32324 + "find/create .reiserfs_priv. "
32325 + "Failing mount.");
32326 + return -EOPNOTSUPP;
32329 + reiserfs_init_priv_inode(d_inode(dentry));
32330 + reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
32331 + "storage.\n", PRIVROOT_NAME);
32333 + return 0;
32336 +#else
32337 +int __init reiserfs_xattr_register_handlers(void) { return 0; }
32338 +void reiserfs_xattr_unregister_handlers(void) {}
32339 +static int create_privroot(struct dentry *dentry) { return 0; }
32340 +#endif
32342 +/* Actual operations that are exported to VFS-land */
32343 +const struct xattr_handler * const reiserfs_xattr_handlers[] = {
32344 +#ifdef CONFIG_REISERFS_FS_XATTR
32345 + &reiserfs_xattr_user_handler,
32346 + &reiserfs_xattr_trusted_handler,
32347 +#endif
32348 +#ifdef CONFIG_REISERFS_FS_SECURITY
32349 + &reiserfs_xattr_security_handler,
32350 +#endif
32351 + NULL
32354 +static int xattr_mount_check(struct super_block *s)
32356 + /*
32357 + * We need generation numbers to ensure that the oid mapping is correct;
32358 + * v3.5 filesystems don't have them.
32359 + */
32360 + if (old_format_only(s)) {
32361 + if (reiserfs_xattrs_optional(s)) {
32362 + /*
32363 + * Old format filesystem, but optional xattrs have
32364 + * been enabled. Error out.
32365 + */
32366 + reiserfs_warning(s, "jdm-2005",
32367 + "xattrs/ACLs not supported "
32368 + "on pre-v3.6 format filesystems. "
32369 + "Failing mount.");
32370 + return -EOPNOTSUPP;
32374 + return 0;
32377 +int reiserfs_permission(struct mnt_idmap *idmap, struct inode *inode,
32378 + int mask)
32380 + /*
32381 + * We don't do permission checks on the internal objects.
32382 + * Permissions are determined by the "owning" object.
32383 + */
32384 + if (IS_PRIVATE(inode))
32385 + return 0;
32387 + return generic_permission(&nop_mnt_idmap, inode, mask);
32390 +static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags)
32392 + return -EPERM;
32395 +static const struct dentry_operations xattr_lookup_poison_ops = {
32396 + .d_revalidate = xattr_hide_revalidate,
32399 +int reiserfs_lookup_privroot(struct super_block *s)
32401 + struct dentry *dentry;
32402 + int err = 0;
32404 + /* If we don't have the privroot located yet - go find it */
32405 + inode_lock(d_inode(s->s_root));
32406 + dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
32407 + strlen(PRIVROOT_NAME));
32408 + if (!IS_ERR(dentry)) {
32409 + REISERFS_SB(s)->priv_root = dentry;
32410 + d_set_d_op(dentry, &xattr_lookup_poison_ops);
32411 + if (d_really_is_positive(dentry))
32412 + reiserfs_init_priv_inode(d_inode(dentry));
32413 + } else
32414 + err = PTR_ERR(dentry);
32415 + inode_unlock(d_inode(s->s_root));
32417 + return err;
32421 + * We need to take a copy of the mount flags since things like
32422 + * SB_RDONLY don't get set until *after* we're called.
32423 + * mount_flags != mount_options
32424 + */
32425 +int reiserfs_xattr_init(struct super_block *s, int mount_flags)
32427 + int err = 0;
32428 + struct dentry *privroot = REISERFS_SB(s)->priv_root;
32430 + err = xattr_mount_check(s);
32431 + if (err)
32432 + goto error;
32434 + if (d_really_is_negative(privroot) && !(mount_flags & SB_RDONLY)) {
32435 + inode_lock(d_inode(s->s_root));
32436 + err = create_privroot(REISERFS_SB(s)->priv_root);
32437 + inode_unlock(d_inode(s->s_root));
32440 + if (d_really_is_positive(privroot)) {
32441 + inode_lock(d_inode(privroot));
32442 + if (!REISERFS_SB(s)->xattr_root) {
32443 + struct dentry *dentry;
32445 + dentry = lookup_one_len(XAROOT_NAME, privroot,
32446 + strlen(XAROOT_NAME));
32447 + if (!IS_ERR(dentry))
32448 + REISERFS_SB(s)->xattr_root = dentry;
32449 + else
32450 + err = PTR_ERR(dentry);
32452 + inode_unlock(d_inode(privroot));
32455 +error:
32456 + if (err) {
32457 + clear_bit(REISERFS_XATTRS_USER, &REISERFS_SB(s)->s_mount_opt);
32458 + clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt);
32461 + /* The super_block SB_POSIXACL must mirror the (no)acl mount option. */
32462 + if (reiserfs_posixacl(s))
32463 + s->s_flags |= SB_POSIXACL;
32464 + else
32465 + s->s_flags &= ~SB_POSIXACL;
32467 + return err;
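# The code above stores each extended attribute as a small private file whose
# first eight bytes are a header (magic number plus a hash of the value),
# followed by the raw value. A minimal userspace sketch of that layout, with
# fake_csum() as a placeholder for the kernel's endian-dependent csum_partial():

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define REISERFS_XATTR_MAGIC 0x52465841    /* "RFXA", as in the uapi header */

struct xattr_header {                      /* mirrors reiserfs_xattr_header */
        uint32_t h_magic;                  /* stored little-endian on disk */
        uint32_t h_hash;                   /* hash of the value that follows */
};

static uint32_t fake_csum(const void *buf, size_t len)
{
        /* placeholder only; the kernel uses csum_partial(msg, len, 0) */
        const uint8_t *p = buf;
        uint32_t sum = 0;

        while (len--)
                sum += *p++;
        return sum;
}

int main(void)
{
        const char value[] = "demo-value";
        size_t vlen = sizeof(value) - 1;
        struct xattr_header hdr = {
                .h_magic = REISERFS_XATTR_MAGIC,
                .h_hash  = fake_csum(value, vlen),
        };
        unsigned char *file = malloc(sizeof(hdr) + vlen);

        if (!file)
                return 1;
        memcpy(file, &hdr, sizeof(hdr));           /* header at offset 0 */
        memcpy(file + sizeof(hdr), value, vlen);   /* value right after it */
        printf("xattr file: %zu header bytes + %zu value bytes\n",
               sizeof(hdr), vlen);
        free(file);
        return 0;
}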
32469 diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
32470 new file mode 100644
32471 index 000000000000..5868a4e990e3
32472 --- /dev/null
32473 +++ b/fs/reiserfs/xattr.h
32474 @@ -0,0 +1,117 @@
32475 +/* SPDX-License-Identifier: GPL-2.0 */
32476 +#include <linux/reiserfs_xattr.h>
32477 +#include <linux/init.h>
32478 +#include <linux/list.h>
32479 +#include <linux/rwsem.h>
32480 +#include <linux/xattr.h>
32482 +struct inode;
32483 +struct dentry;
32484 +struct iattr;
32485 +struct super_block;
32487 +int reiserfs_xattr_register_handlers(void) __init;
32488 +void reiserfs_xattr_unregister_handlers(void);
32489 +int reiserfs_xattr_init(struct super_block *sb, int mount_flags);
32490 +int reiserfs_lookup_privroot(struct super_block *sb);
32491 +int reiserfs_delete_xattrs(struct inode *inode);
32492 +int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs);
32493 +int reiserfs_permission(struct mnt_idmap *idmap,
32494 + struct inode *inode, int mask);
32496 +#ifdef CONFIG_REISERFS_FS_XATTR
32497 +#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
32498 +ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
32500 +int reiserfs_xattr_get(struct inode *, const char *, void *, size_t);
32501 +int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int);
32502 +int reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *,
32503 + struct inode *, const char *, const void *,
32504 + size_t, int);
32506 +extern const struct xattr_handler reiserfs_xattr_user_handler;
32507 +extern const struct xattr_handler reiserfs_xattr_trusted_handler;
32508 +extern const struct xattr_handler reiserfs_xattr_security_handler;
32509 +#ifdef CONFIG_REISERFS_FS_SECURITY
32510 +int reiserfs_security_init(struct inode *dir, struct inode *inode,
32511 + const struct qstr *qstr,
32512 + struct reiserfs_security_handle *sec);
32513 +int reiserfs_security_write(struct reiserfs_transaction_handle *th,
32514 + struct inode *inode,
32515 + struct reiserfs_security_handle *sec);
32516 +void reiserfs_security_free(struct reiserfs_security_handle *sec);
32517 +#endif
32519 +static inline int reiserfs_xattrs_initialized(struct super_block *sb)
32521 + return REISERFS_SB(sb)->priv_root && REISERFS_SB(sb)->xattr_root;
32524 +#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
32525 +static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size)
32527 + loff_t ret = 0;
32528 + if (reiserfs_file_data_log(inode)) {
32529 + ret = _ROUND_UP(xattr_size(size), inode->i_sb->s_blocksize);
32530 + ret >>= inode->i_sb->s_blocksize_bits;
32532 + return ret;
32536 + * We may have to create up to 3 objects: xattr root, xattr dir, xattr file.
32537 + * Let's try to be smart about it.
32538 + * xattr root: We cache it. If it's not cached, we may need to create it.
32539 + * xattr dir: If anything has been loaded for this inode, we can set a flag
32540 + * saying so.
32541 + * xattr file: Since we don't cache xattrs, we can't tell. We always include
32542 + * blocks for it.
32544 + * However, since root and dir can be created between calls - YOU MUST SAVE
32545 + * THIS VALUE.
32546 + */
32547 +static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode)
32549 + size_t nblocks = JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
32551 + if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) {
32552 + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
32553 + if (d_really_is_negative(REISERFS_SB(inode->i_sb)->xattr_root))
32554 + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
32557 + return nblocks;
32560 +static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
32562 + init_rwsem(&REISERFS_I(inode)->i_xattr_sem);
32565 +#else
32567 +#define reiserfs_listxattr NULL
32569 +static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
32572 +#endif /* CONFIG_REISERFS_FS_XATTR */
32574 +#ifndef CONFIG_REISERFS_FS_SECURITY
32575 +static inline int reiserfs_security_init(struct inode *dir,
32576 + struct inode *inode,
32577 + const struct qstr *qstr,
32578 + struct reiserfs_security_handle *sec)
32580 + return 0;
32582 +static inline int
32583 +reiserfs_security_write(struct reiserfs_transaction_handle *th,
32584 + struct inode *inode,
32585 + struct reiserfs_security_handle *sec)
32587 + return 0;
32589 +static inline void reiserfs_security_free(struct reiserfs_security_handle *sec)
32591 +#endif
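# reiserfs_xattr_nblocks() above is just a round-up of the header-plus-value
# size to whole filesystem blocks. A small sketch of that arithmetic with
# hypothetical sizes (the kernel only charges these blocks when data
# journaling is enabled for the inode):

#include <stdio.h>

#define XATTR_HEADER_SIZE 8     /* sizeof(struct reiserfs_xattr_header) */

static unsigned long xattr_nblocks(unsigned long value_size,
                                   unsigned long blocksize)
{
        unsigned long total = value_size + XATTR_HEADER_SIZE;

        /* _ROUND_UP(total, blocksize) >> blocksize_bits, as a ceil-divide */
        return (total + blocksize - 1) / blocksize;
}

int main(void)
{
        /* a 5000-byte value plus the header needs two 4 KiB blocks */
        printf("%lu\n", xattr_nblocks(5000, 4096));
        return 0;
}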
32592 diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
32593 new file mode 100644
32594 index 000000000000..064264992b49
32595 --- /dev/null
32596 +++ b/fs/reiserfs/xattr_acl.c
32597 @@ -0,0 +1,411 @@
32598 +// SPDX-License-Identifier: GPL-2.0
32599 +#include <linux/capability.h>
32600 +#include <linux/fs.h>
32601 +#include <linux/posix_acl.h>
32602 +#include "reiserfs.h"
32603 +#include <linux/errno.h>
32604 +#include <linux/pagemap.h>
32605 +#include <linux/xattr.h>
32606 +#include <linux/slab.h>
32607 +#include <linux/posix_acl_xattr.h>
32608 +#include "xattr.h"
32609 +#include "acl.h"
32610 +#include <linux/uaccess.h>
32612 +static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
32613 + struct inode *inode, int type,
32614 + struct posix_acl *acl);
32617 +int
32618 +reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
32619 + struct posix_acl *acl, int type)
32621 + int error, error2;
32622 + struct reiserfs_transaction_handle th;
32623 + size_t jcreate_blocks;
32624 + int size = acl ? posix_acl_xattr_size(acl->a_count) : 0;
32625 + int update_mode = 0;
32626 + struct inode *inode = d_inode(dentry);
32627 + umode_t mode = inode->i_mode;
32629 + /*
32630 + * Pessimism: We can't assume that anything from the xattr root up
32631 + * has been created.
32632 + */
32634 + jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) +
32635 + reiserfs_xattr_nblocks(inode, size) * 2;
32637 + reiserfs_write_lock(inode->i_sb);
32638 + error = journal_begin(&th, inode->i_sb, jcreate_blocks);
32639 + reiserfs_write_unlock(inode->i_sb);
32640 + if (error == 0) {
32641 + if (type == ACL_TYPE_ACCESS && acl) {
32642 + error = posix_acl_update_mode(&nop_mnt_idmap, inode,
32643 + &mode, &acl);
32644 + if (error)
32645 + goto unlock;
32646 + update_mode = 1;
32648 + error = __reiserfs_set_acl(&th, inode, type, acl);
32649 + if (!error && update_mode)
32650 + inode->i_mode = mode;
32651 +unlock:
32652 + reiserfs_write_lock(inode->i_sb);
32653 + error2 = journal_end(&th);
32654 + reiserfs_write_unlock(inode->i_sb);
32655 + if (error2)
32656 + error = error2;
32659 + return error;
32663 + * Convert from filesystem to in-memory representation.
32664 + */
32665 +static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size)
32667 + const char *end = (char *)value + size;
32668 + int n, count;
32669 + struct posix_acl *acl;
32671 + if (!value)
32672 + return NULL;
32673 + if (size < sizeof(reiserfs_acl_header))
32674 + return ERR_PTR(-EINVAL);
32675 + if (((reiserfs_acl_header *) value)->a_version !=
32676 + cpu_to_le32(REISERFS_ACL_VERSION))
32677 + return ERR_PTR(-EINVAL);
32678 + value = (char *)value + sizeof(reiserfs_acl_header);
32679 + count = reiserfs_acl_count(size);
32680 + if (count < 0)
32681 + return ERR_PTR(-EINVAL);
32682 + if (count == 0)
32683 + return NULL;
32684 + acl = posix_acl_alloc(count, GFP_NOFS);
32685 + if (!acl)
32686 + return ERR_PTR(-ENOMEM);
32687 + for (n = 0; n < count; n++) {
32688 + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value;
32689 + if ((char *)value + sizeof(reiserfs_acl_entry_short) > end)
32690 + goto fail;
32691 + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
32692 + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
32693 + switch (acl->a_entries[n].e_tag) {
32694 + case ACL_USER_OBJ:
32695 + case ACL_GROUP_OBJ:
32696 + case ACL_MASK:
32697 + case ACL_OTHER:
32698 + value = (char *)value +
32699 + sizeof(reiserfs_acl_entry_short);
32700 + break;
32702 + case ACL_USER:
32703 + value = (char *)value + sizeof(reiserfs_acl_entry);
32704 + if ((char *)value > end)
32705 + goto fail;
32706 + acl->a_entries[n].e_uid =
32707 + make_kuid(&init_user_ns,
32708 + le32_to_cpu(entry->e_id));
32709 + break;
32710 + case ACL_GROUP:
32711 + value = (char *)value + sizeof(reiserfs_acl_entry);
32712 + if ((char *)value > end)
32713 + goto fail;
32714 + acl->a_entries[n].e_gid =
32715 + make_kgid(&init_user_ns,
32716 + le32_to_cpu(entry->e_id));
32717 + break;
32719 + default:
32720 + goto fail;
32723 + if (value != end)
32724 + goto fail;
32725 + return acl;
32727 +fail:
32728 + posix_acl_release(acl);
32729 + return ERR_PTR(-EINVAL);
32733 + * Convert from in-memory to filesystem representation.
32734 + */
32735 +static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
32737 + reiserfs_acl_header *ext_acl;
32738 + char *e;
32739 + int n;
32741 + *size = reiserfs_acl_size(acl->a_count);
32742 + ext_acl = kmalloc(sizeof(reiserfs_acl_header) +
32743 + acl->a_count *
32744 + sizeof(reiserfs_acl_entry),
32745 + GFP_NOFS);
32746 + if (!ext_acl)
32747 + return ERR_PTR(-ENOMEM);
32748 + ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION);
32749 + e = (char *)ext_acl + sizeof(reiserfs_acl_header);
32750 + for (n = 0; n < acl->a_count; n++) {
32751 + const struct posix_acl_entry *acl_e = &acl->a_entries[n];
32752 + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e;
32753 + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
32754 + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
32755 + switch (acl->a_entries[n].e_tag) {
32756 + case ACL_USER:
32757 + entry->e_id = cpu_to_le32(
32758 + from_kuid(&init_user_ns, acl_e->e_uid));
32759 + e += sizeof(reiserfs_acl_entry);
32760 + break;
32761 + case ACL_GROUP:
32762 + entry->e_id = cpu_to_le32(
32763 + from_kgid(&init_user_ns, acl_e->e_gid));
32764 + e += sizeof(reiserfs_acl_entry);
32765 + break;
32767 + case ACL_USER_OBJ:
32768 + case ACL_GROUP_OBJ:
32769 + case ACL_MASK:
32770 + case ACL_OTHER:
32771 + e += sizeof(reiserfs_acl_entry_short);
32772 + break;
32774 + default:
32775 + goto fail;
32778 + return (char *)ext_acl;
32780 +fail:
32781 + kfree(ext_acl);
32782 + return ERR_PTR(-EINVAL);
32786 + * Inode operation get_posix_acl().
32788 + * inode->i_mutex: down
32789 + * BKL held [before 2.5.x]
32790 + */
32791 +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu)
32793 + char *name, *value;
32794 + struct posix_acl *acl;
32795 + int size;
32796 + int retval;
32798 + if (rcu)
32799 + return ERR_PTR(-ECHILD);
32801 + switch (type) {
32802 + case ACL_TYPE_ACCESS:
32803 + name = XATTR_NAME_POSIX_ACL_ACCESS;
32804 + break;
32805 + case ACL_TYPE_DEFAULT:
32806 + name = XATTR_NAME_POSIX_ACL_DEFAULT;
32807 + break;
32808 + default:
32809 + BUG();
32812 + size = reiserfs_xattr_get(inode, name, NULL, 0);
32813 + if (size < 0) {
32814 + if (size == -ENODATA || size == -ENOSYS)
32815 + return NULL;
32816 + return ERR_PTR(size);
32819 + value = kmalloc(size, GFP_NOFS);
32820 + if (!value)
32821 + return ERR_PTR(-ENOMEM);
32823 + retval = reiserfs_xattr_get(inode, name, value, size);
32824 + if (retval == -ENODATA || retval == -ENOSYS) {
32825 + /*
32826 + * This shouldn't actually happen as it should have
32827 + * been caught above, but just in case.
32828 + */
32829 + acl = NULL;
32830 + } else if (retval < 0) {
32831 + acl = ERR_PTR(retval);
32832 + } else {
32833 + acl = reiserfs_posix_acl_from_disk(value, retval);
32836 + kfree(value);
32837 + return acl;
32841 + * Inode operation set_posix_acl().
32843 + * inode->i_mutex: down
32844 + * BKL held [before 2.5.x]
32845 + */
32846 +static int
32847 +__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
32848 + int type, struct posix_acl *acl)
32850 + char *name;
32851 + void *value = NULL;
32852 + size_t size = 0;
32853 + int error;
32855 + switch (type) {
32856 + case ACL_TYPE_ACCESS:
32857 + name = XATTR_NAME_POSIX_ACL_ACCESS;
32858 + break;
32859 + case ACL_TYPE_DEFAULT:
32860 + name = XATTR_NAME_POSIX_ACL_DEFAULT;
32861 + if (!S_ISDIR(inode->i_mode))
32862 + return acl ? -EACCES : 0;
32863 + break;
32864 + default:
32865 + return -EINVAL;
32868 + if (acl) {
32869 + value = reiserfs_posix_acl_to_disk(acl, &size);
32870 + if (IS_ERR(value))
32871 + return (int)PTR_ERR(value);
32874 + error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0);
32876 + /*
32877 + * Ensure that the inode gets dirtied if we're only using
32878 + * the mode bits and an old ACL didn't exist. We don't need
32879 + * to check if the inode is hashed here since we won't get
32880 + * called by reiserfs_inherit_default_acl().
32881 + */
32882 + if (error == -ENODATA) {
32883 + error = 0;
32884 + if (type == ACL_TYPE_ACCESS) {
32885 + inode_set_ctime_current(inode);
32886 + mark_inode_dirty(inode);
32890 + kfree(value);
32892 + if (!error)
32893 + set_cached_acl(inode, type, acl);
32895 + return error;
32899 + * dir->i_mutex: locked,
32900 + * inode is new and not released into the wild yet
32901 + */
32902 +int
32903 +reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
32904 + struct inode *dir, struct dentry *dentry,
32905 + struct inode *inode)
32907 + struct posix_acl *default_acl, *acl;
32908 + int err = 0;
32910 + /* ACLs only get applied to files and directories */
32911 + if (S_ISLNK(inode->i_mode))
32912 + return 0;
32914 + /*
32915 + * ACLs can only be used on "new" objects, so if it's an old object
32916 + * there is nothing to inherit from
32917 + */
32918 + if (get_inode_sd_version(dir) == STAT_DATA_V1)
32919 + goto apply_umask;
32921 + /*
32922 + * Don't apply ACLs to objects in the .reiserfs_priv tree. This
32923 + * would be useless since permissions are ignored, and a pain because
32924 + * it introduces locking cycles.
32925 + */
32926 + if (IS_PRIVATE(inode))
32927 + goto apply_umask;
32929 + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
32930 + if (err)
32931 + return err;
32933 + if (default_acl) {
32934 + err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
32935 + default_acl);
32936 + posix_acl_release(default_acl);
32938 + if (acl) {
32939 + if (!err)
32940 + err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS,
32941 + acl);
32942 + posix_acl_release(acl);
32945 + return err;
32947 +apply_umask:
32948 + /* no ACL, apply umask */
32949 + inode->i_mode &= ~current_umask();
32950 + return err;
32953 +/* This is used to cache the default acl before a new object is created.
32954 + * The biggest reason for this is to get an idea of how many blocks will
32955 + * actually be required for the create operation if we must inherit an ACL.
32956 + * An ACL write can add up to 3 object creations and an additional file write
32957 + * so we'd prefer not to reserve that many blocks in the journal if we can.
32958 + * It also has the advantage of not loading the ACL with a transaction open;
32959 + * this may seem silly, but if the owner of the directory is doing the
32960 + * creation, the ACL may not be loaded since the permissions wouldn't require
32961 + * it.
32962 + * We return the number of blocks required for the transaction.
32963 + */
32964 +int reiserfs_cache_default_acl(struct inode *inode)
32966 + struct posix_acl *acl;
32967 + int nblocks = 0;
32969 + if (IS_PRIVATE(inode))
32970 + return 0;
32972 + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
32974 + if (acl && !IS_ERR(acl)) {
32975 + int size = reiserfs_acl_size(acl->a_count);
32977 + /* Other xattrs can be created during inode creation. We don't
32978 + * want to claim too many blocks, so we check to see if we
32979 + * need to create the tree to the xattrs, and then we
32980 + * just want two files. */
32981 + nblocks = reiserfs_xattr_jcreate_nblocks(inode);
32982 + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
32984 + REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
32986 + /* We need to account for writes + bitmaps for two files */
32987 + nblocks += reiserfs_xattr_nblocks(inode, size) * 4;
32988 + posix_acl_release(acl);
32991 + return nblocks;
32995 + * Called under i_mutex
32996 + */
32997 +int reiserfs_acl_chmod(struct dentry *dentry)
32999 + struct inode *inode = d_inode(dentry);
33001 + if (IS_PRIVATE(inode))
33002 + return 0;
33003 + if (get_inode_sd_version(inode) == STAT_DATA_V1 ||
33004 + !reiserfs_posixacl(inode->i_sb))
33005 + return 0;
33007 + return posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode);
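# reiserfs_posix_acl_from_disk()/..._to_disk() above use a variable-length
# record: a 4-byte version header, then one entry per ACE, where only
# ACL_USER and ACL_GROUP entries carry the extra 4-byte e_id. A sketch of
# that size calculation, using the standard POSIX ACL tag values:

#include <stdint.h>
#include <stdio.h>

#define ACL_USER_OBJ  0x01
#define ACL_USER      0x02
#define ACL_GROUP_OBJ 0x04
#define ACL_GROUP     0x08
#define ACL_MASK      0x10
#define ACL_OTHER     0x20

struct acl_entry_short { uint16_t e_tag, e_perm; };                /* 4 bytes */
struct acl_entry       { uint16_t e_tag, e_perm; uint32_t e_id; }; /* 8 bytes */

static size_t entry_size(uint16_t tag)
{
        /* only named user/group entries store an id, as in the hunk above */
        return (tag == ACL_USER || tag == ACL_GROUP)
                ? sizeof(struct acl_entry)
                : sizeof(struct acl_entry_short);
}

int main(void)
{
        const uint16_t tags[] = { ACL_USER_OBJ, ACL_USER, ACL_GROUP_OBJ,
                                  ACL_MASK, ACL_OTHER };
        size_t size = 4;        /* reiserfs_acl_header: one __le32 version */
        size_t i;

        for (i = 0; i < sizeof(tags) / sizeof(tags[0]); i++)
                size += entry_size(tags[i]);
        printf("5-entry access ACL occupies %zu bytes on disk\n", size);
        return 0;
}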
33009 diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
33010 new file mode 100644
33011 index 000000000000..078dd8cc312f
33012 --- /dev/null
33013 +++ b/fs/reiserfs/xattr_security.c
33014 @@ -0,0 +1,127 @@
33015 +// SPDX-License-Identifier: GPL-2.0
33016 +#include "reiserfs.h"
33017 +#include <linux/errno.h>
33018 +#include <linux/fs.h>
33019 +#include <linux/pagemap.h>
33020 +#include <linux/xattr.h>
33021 +#include <linux/slab.h>
33022 +#include "xattr.h"
33023 +#include <linux/security.h>
33024 +#include <linux/uaccess.h>
33026 +static int
33027 +security_get(const struct xattr_handler *handler, struct dentry *unused,
33028 + struct inode *inode, const char *name, void *buffer, size_t size)
33030 + if (IS_PRIVATE(inode))
33031 + return -EPERM;
33033 + return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
33034 + buffer, size);
33037 +static int
33038 +security_set(const struct xattr_handler *handler,
33039 + struct mnt_idmap *idmap, struct dentry *unused,
33040 + struct inode *inode, const char *name, const void *buffer,
33041 + size_t size, int flags)
33043 + if (IS_PRIVATE(inode))
33044 + return -EPERM;
33046 + return reiserfs_xattr_set(inode,
33047 + xattr_full_name(handler, name),
33048 + buffer, size, flags);
33051 +static bool security_list(struct dentry *dentry)
33053 + return !IS_PRIVATE(d_inode(dentry));
33056 +static int
33057 +reiserfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
33058 + void *fs_info)
33060 + struct reiserfs_security_handle *sec = fs_info;
33062 + sec->value = kmemdup(xattr_array->value, xattr_array->value_len,
33063 + GFP_KERNEL);
33064 + if (!sec->value)
33065 + return -ENOMEM;
33067 + sec->name = xattr_array->name;
33068 + sec->length = xattr_array->value_len;
33069 + return 0;
33072 +/* Initializes the security context for a new inode and returns the number
33073 + * of blocks needed for the transaction. If successful, reiserfs_security
33074 + * must be released using reiserfs_security_free when the caller is done. */
33075 +int reiserfs_security_init(struct inode *dir, struct inode *inode,
33076 + const struct qstr *qstr,
33077 + struct reiserfs_security_handle *sec)
33079 + int blocks = 0;
33080 + int error;
33082 + sec->name = NULL;
33083 + sec->value = NULL;
33084 + sec->length = 0;
33086 + /* Don't add selinux attributes on xattrs - they'll never get used */
33087 + if (IS_PRIVATE(dir))
33088 + return 0;
33090 + error = security_inode_init_security(inode, dir, qstr,
33091 + &reiserfs_initxattrs, sec);
33092 + if (error) {
33093 + sec->name = NULL;
33094 + sec->value = NULL;
33095 + sec->length = 0;
33096 + return error;
33099 + if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
33100 + blocks = reiserfs_xattr_jcreate_nblocks(inode) +
33101 + reiserfs_xattr_nblocks(inode, sec->length);
33102 + /* We don't want to count the directories twice if we have
33103 + * a default ACL. */
33104 + REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
33106 + return blocks;
33109 +int reiserfs_security_write(struct reiserfs_transaction_handle *th,
33110 + struct inode *inode,
33111 + struct reiserfs_security_handle *sec)
33113 + char xattr_name[XATTR_NAME_MAX + 1] = XATTR_SECURITY_PREFIX;
33114 + int error;
33116 + if (XATTR_SECURITY_PREFIX_LEN + strlen(sec->name) > XATTR_NAME_MAX)
33117 + return -EINVAL;
33119 + strlcat(xattr_name, sec->name, sizeof(xattr_name));
33121 + error = reiserfs_xattr_set_handle(th, inode, xattr_name, sec->value,
33122 + sec->length, XATTR_CREATE);
33123 + if (error == -ENODATA || error == -EOPNOTSUPP)
33124 + error = 0;
33126 + return error;
33129 +void reiserfs_security_free(struct reiserfs_security_handle *sec)
33131 + kfree(sec->value);
33132 + sec->name = NULL;
33133 + sec->value = NULL;
33136 +const struct xattr_handler reiserfs_xattr_security_handler = {
33137 + .prefix = XATTR_SECURITY_PREFIX,
33138 + .get = security_get,
33139 + .set = security_set,
33140 + .list = security_list,
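# reiserfs_security_write() above prepends "security." to the LSM-supplied
# suffix and rejects names longer than XATTR_NAME_MAX. The same check in a
# freestanding userspace sketch:

#include <stdio.h>
#include <string.h>

#define XATTR_NAME_MAX 255
#define XATTR_SECURITY_PREFIX "security."

static int build_name(char *out, size_t outlen, const char *suffix)
{
        if (strlen(XATTR_SECURITY_PREFIX) + strlen(suffix) > XATTR_NAME_MAX)
                return -1;      /* the kernel returns -EINVAL here */
        snprintf(out, outlen, XATTR_SECURITY_PREFIX "%s", suffix);
        return 0;
}

int main(void)
{
        char name[XATTR_NAME_MAX + 1];

        if (build_name(name, sizeof(name), "selinux") == 0)
                printf("%s\n", name);   /* prints: security.selinux */
        return 0;
}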
33142 diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
33143 new file mode 100644
33144 index 000000000000..0c0c74d8db0e
33145 --- /dev/null
33146 +++ b/fs/reiserfs/xattr_trusted.c
33147 @@ -0,0 +1,46 @@
33148 +// SPDX-License-Identifier: GPL-2.0
33149 +#include "reiserfs.h"
33150 +#include <linux/capability.h>
33151 +#include <linux/errno.h>
33152 +#include <linux/fs.h>
33153 +#include <linux/pagemap.h>
33154 +#include <linux/xattr.h>
33155 +#include "xattr.h"
33156 +#include <linux/uaccess.h>
33158 +static int
33159 +trusted_get(const struct xattr_handler *handler, struct dentry *unused,
33160 + struct inode *inode, const char *name, void *buffer, size_t size)
33162 + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
33163 + return -EPERM;
33165 + return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
33166 + buffer, size);
33169 +static int
33170 +trusted_set(const struct xattr_handler *handler,
33171 + struct mnt_idmap *idmap, struct dentry *unused,
33172 + struct inode *inode, const char *name, const void *buffer,
33173 + size_t size, int flags)
33175 + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
33176 + return -EPERM;
33178 + return reiserfs_xattr_set(inode,
33179 + xattr_full_name(handler, name),
33180 + buffer, size, flags);
33183 +static bool trusted_list(struct dentry *dentry)
33185 + return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry));
33188 +const struct xattr_handler reiserfs_xattr_trusted_handler = {
33189 + .prefix = XATTR_TRUSTED_PREFIX,
33190 + .get = trusted_get,
33191 + .set = trusted_set,
33192 + .list = trusted_list,
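# The trusted_* handlers above gate the "trusted." namespace on
# CAP_SYS_ADMIN. Exercising that from userspace with the standard xattr
# syscalls; the path is hypothetical and assumes a mounted reiserfs:

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
        const char *path = "/mnt/reiserfs/file";    /* hypothetical mount */

        if (setxattr(path, "trusted.example", "1", 1, 0) != 0)
                perror("setxattr");     /* EPERM without CAP_SYS_ADMIN */
        return 0;
}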
33194 diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
33195 new file mode 100644
33196 index 000000000000..88195181e1d7
33197 --- /dev/null
33198 +++ b/fs/reiserfs/xattr_user.c
33199 @@ -0,0 +1,43 @@
33200 +// SPDX-License-Identifier: GPL-2.0
33201 +#include "reiserfs.h"
33202 +#include <linux/errno.h>
33203 +#include <linux/fs.h>
33204 +#include <linux/pagemap.h>
33205 +#include <linux/xattr.h>
33206 +#include "xattr.h"
33207 +#include <linux/uaccess.h>
33209 +static int
33210 +user_get(const struct xattr_handler *handler, struct dentry *unused,
33211 + struct inode *inode, const char *name, void *buffer, size_t size)
33213 + if (!reiserfs_xattrs_user(inode->i_sb))
33214 + return -EOPNOTSUPP;
33215 + return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
33216 + buffer, size);
33219 +static int
33220 +user_set(const struct xattr_handler *handler, struct mnt_idmap *idmap,
33221 + struct dentry *unused,
33222 + struct inode *inode, const char *name, const void *buffer,
33223 + size_t size, int flags)
33225 + if (!reiserfs_xattrs_user(inode->i_sb))
33226 + return -EOPNOTSUPP;
33227 + return reiserfs_xattr_set(inode,
33228 + xattr_full_name(handler, name),
33229 + buffer, size, flags);
33232 +static bool user_list(struct dentry *dentry)
33234 + return reiserfs_xattrs_user(dentry->d_sb);
33237 +const struct xattr_handler reiserfs_xattr_user_handler = {
33238 + .prefix = XATTR_USER_PREFIX,
33239 + .get = user_get,
33240 + .set = user_set,
33241 + .list = user_list,
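# Listing attributes from userspace ends up in reiserfs_listxattr() further
# up: the kernel readdirs the hidden per-inode directory and filters each
# name through the handler list above. A sketch, again with a hypothetical
# path:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
        const char *path = "/mnt/reiserfs/file";    /* hypothetical mount */
        char buf[1024];
        ssize_t off, len = listxattr(path, buf, sizeof(buf));

        if (len < 0) {
                perror("listxattr");
                return 1;
        }
        /* the buffer holds NUL-separated names, e.g. "user.comment" */
        for (off = 0; off < len; off += strlen(buf + off) + 1)
                puts(buf + off);
        return 0;
}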
33243 diff --git a/include/uapi/linux/reiserfs_fs.h b/include/uapi/linux/reiserfs_fs.h
33244 new file mode 100644
33245 index 000000000000..5bb921409f2b
33246 --- /dev/null
33247 +++ b/include/uapi/linux/reiserfs_fs.h
33248 @@ -0,0 +1,27 @@
33249 +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
33251 + * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for licensing and copyright details
33252 + */
33253 +#ifndef _LINUX_REISER_FS_H
33254 +#define _LINUX_REISER_FS_H
33256 +#include <linux/types.h>
33257 +#include <linux/magic.h>
33260 + * include/uapi/linux/reiserfs_fs.h
33262 + * Reiser File System constants and structures
33264 + */
33266 +/* ioctl's command */
33267 +#define REISERFS_IOC_UNPACK _IOW(0xCD,1,long)
33268 +/* define the following flags to be the same as in ext2, so that chattr(1),
33269 + lsattr(1) will work with us. */
33270 +#define REISERFS_IOC_GETFLAGS FS_IOC_GETFLAGS
33271 +#define REISERFS_IOC_SETFLAGS FS_IOC_SETFLAGS
33272 +#define REISERFS_IOC_GETVERSION FS_IOC_GETVERSION
33273 +#define REISERFS_IOC_SETVERSION FS_IOC_SETVERSION
33275 +#endif /* _LINUX_REISER_FS_H */
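# REISERFS_IOC_UNPACK above is the one filesystem-specific ioctl: it unpacks
# a tail-packed file so its blocks can be mapped directly (bootloaders such
# as LILO historically relied on this). A hedged usage sketch; the argument
# value and path are illustrative, and the restored uapi header is assumed
# to be installed:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/reiserfs_fs.h>

int main(void)
{
        int fd = open("/mnt/reiserfs/file", O_RDONLY);  /* hypothetical */

        if (fd < 0 || ioctl(fd, REISERFS_IOC_UNPACK, 1L) != 0)
                perror("REISERFS_IOC_UNPACK");
        if (fd >= 0)
                close(fd);
        return 0;
}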
33276 diff --git a/include/uapi/linux/reiserfs_xattr.h b/include/uapi/linux/reiserfs_xattr.h
33277 new file mode 100644
33278 index 000000000000..503ad018ce5b
33279 --- /dev/null
33280 +++ b/include/uapi/linux/reiserfs_xattr.h
33281 @@ -0,0 +1,25 @@
33282 +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
33284 + File: linux/reiserfs_xattr.h
33287 +#ifndef _LINUX_REISERFS_XATTR_H
33288 +#define _LINUX_REISERFS_XATTR_H
33290 +#include <linux/types.h>
33292 +/* Magic value in header */
33293 +#define REISERFS_XATTR_MAGIC 0x52465841 /* "RFXA" */
33295 +struct reiserfs_xattr_header {
33296 + __le32 h_magic; /* magic number for identification */
33297 + __le32 h_hash; /* hash of the value */
33300 +struct reiserfs_security_handle {
33301 + const char *name;
33302 + void *value;
33303 + __kernel_size_t length;
33306 +#endif /* _LINUX_REISERFS_XATTR_H */
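# The header above fixes an 8-byte, two-field layout. A compile-time check
# of that layout, mirroring the uapi struct with plain stdint types:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct reiserfs_xattr_header {  /* mirrors the uapi definition above */
        uint32_t h_magic;
        uint32_t h_hash;
};

static_assert(sizeof(struct reiserfs_xattr_header) == 8,
              "header must stay 8 bytes");
static_assert(offsetof(struct reiserfs_xattr_header, h_hash) == 4,
              "hash must sit at offset 4");

int main(void)
{
        return 0;
}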
33307 diff --git a/scripts/selinux/mdp/mdp.c b/scripts/selinux/mdp/mdp.c
33308 index ea7fbe595971..52365921c043 100644
33309 --- a/scripts/selinux/mdp/mdp.c
33310 +++ b/scripts/selinux/mdp/mdp.c
33311 @@ -167,6 +167,9 @@ int main(int argc, char *argv[])
33312 #ifdef CONFIG_JFS_SECURITY
33313 FS_USE("xattr", "jfs");
33314 #endif
33315 +#ifdef CONFIG_REISERFS_FS_SECURITY
33316 + FS_USE("xattr", "reiserfs");
33317 +#endif
33318 #ifdef CONFIG_JFFS2_FS_SECURITY
33319 FS_USE("xattr", "jffs2");
33320 #endif
33321 diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h
33322 index f37614cc2c1b..e7da92489167 100644
33323 --- a/tools/objtool/noreturns.h
33324 +++ b/tools/objtool/noreturns.h
33325 @@ -11,6 +11,7 @@ NORETURN(__ia32_sys_exit)
33326 NORETURN(__ia32_sys_exit_group)
33327 NORETURN(__kunit_abort)
33328 NORETURN(__module_put_and_kthread_exit)
33329 +NORETURN(__reiserfs_panic)
33330 NORETURN(__stack_chk_fail)
33331 NORETURN(__tdx_hypercall_failed)
33332 NORETURN(__ubsan_handle_builtin_unreachable)
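# __reiserfs_panic is listed in noreturns.h so objtool's stack validation
# knows control never comes back from it. An illustrative (non-kernel)
# example of why the annotation matters for a panic-style helper:

#include <stdio.h>
#include <stdlib.h>

__attribute__((noreturn))
static void my_panic(const char *msg)
{
        fprintf(stderr, "panic: %s\n", msg);
        abort();        /* never returns, matching the annotation */
}

static int checked_div(int a, int b)
{
        if (b == 0)
                my_panic("division by zero");
        /* no "missing return" warning: the compiler knows my_panic exits */
        return a / b;
}

int main(void)
{
        printf("%d\n", checked_div(10, 2));
        return 0;
}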
33333 diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c
33334 index 8eb6aa606a0d..c773334bbcc9 100644
33335 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c
33336 +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c
33337 @@ -27,7 +27,7 @@ static const char *const known_fs[] = {
33338 "ipathfs", "iso9660", "jffs2", "jfs", "minix", "mqueue", "msdos",
33339 "nfs", "nfs4", "nfsd", "nilfs2", "nsfs", "ntfs", "ntfs3", "ocfs2",
33340 "ocfs2_dlmfs", "ocxlflash", "omfs", "openpromfs", "overlay", "pipefs",
33341 - "proc", "pstore", "pvfs2", "qnx4", "qnx6", "ramfs",
33342 + "proc", "pstore", "pvfs2", "qnx4", "qnx6", "ramfs", "reiserfs",
33343 "resctrl", "romfs", "rootfs", "rpc_pipefs", "s390_hypfs", "secretmem",
33344 "securityfs", "selinuxfs", "smackfs", "smb3", "sockfs", "spufs",
33345 "squashfs", "sysfs", "sysv", "tmpfs", "tracefs", "ubifs", "udf",