Update git submodules
[mediawiki.git] / includes / historyblob / DiffHistoryBlob.php
blob6f7ac43590a7bb555d758710a947433fac29a19a
1 <?php
2 /**
3 * Efficient concatenated text storage.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
20 * @file
23 /**
24 * Diff-based history compression
25 * Requires xdiff and zlib
27 * WARNING: Objects of this class are serialized and permanently stored in the DB.
28 * Do not change the name or visibility of any property!
30 class DiffHistoryBlob implements HistoryBlob {
31 /** @var string[] Uncompressed item cache */
32 public $mItems = [];
34 /** @var int Total uncompressed size */
35 public $mSize = 0;
37 /**
38 * @var array|null Array of diffs. If a diff D from A to B is notated D = B - A,
39 * and Z is an empty string:
41 * { item[map[i]] - item[map[i-1]] where i > 0
42 * diff[i] = {
43 * { item[map[i]] - Z where i = 0
45 public $mDiffs;
47 /** @var array The diff map, see above */
48 public $mDiffMap;
50 /** @var string The key for getText()
52 public $mDefaultKey;
54 /** @var string|null Compressed storage */
55 public $mCompressed;
57 /** @var bool True if the object is locked against further writes */
58 public $mFrozen = false;
60 /**
61 * @var int The maximum uncompressed size before the object becomes sad
62 * Should be less than max_allowed_packet
64 public $mMaxSize = 10000000;
66 /** @var int The maximum number of text items before the object becomes sad */
67 public $mMaxCount = 100;
69 /** Constants from xdiff.h */
70 private const XDL_BDOP_INS = 1;
71 private const XDL_BDOP_CPY = 2;
72 private const XDL_BDOP_INSB = 3;
74 public function __construct() {
75 if ( !function_exists( 'gzdeflate' ) ) {
76 throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" );
80 /**
81 * @throws MWException
82 * @param string $text
83 * @return string
85 public function addItem( $text ) {
86 if ( $this->mFrozen ) {
87 throw new MWException( __METHOD__ . ": Cannot add more items after sleep/wakeup" );
90 $this->mItems[] = $text;
91 $this->mSize += strlen( $text );
92 $this->mDiffs = null; // later
93 return (string)( count( $this->mItems ) - 1 );
96 /**
97 * @param string $key
98 * @return string
100 public function getItem( $key ) {
101 return $this->mItems[(int)$key];
105 * @param string $text
107 public function setText( $text ) {
108 $this->mDefaultKey = $this->addItem( $text );
112 * @return string
114 public function getText() {
115 return $this->getItem( $this->mDefaultKey );
119 * @throws MWException
121 private function compress() {
122 if ( !function_exists( 'xdiff_string_rabdiff' ) ) {
123 throw new MWException( "Need xdiff support to write DiffHistoryBlob\n" );
125 if ( isset( $this->mDiffs ) ) {
126 // Already compressed
127 return;
129 if ( $this->mItems === [] ) {
130 return;
133 // Create two diff sequences: one for main text and one for small text
134 $sequences = [
135 'small' => [
136 'tail' => '',
137 'diffs' => [],
138 'map' => [],
140 'main' => [
141 'tail' => '',
142 'diffs' => [],
143 'map' => [],
146 $smallFactor = 0.5;
148 $mItemsCount = count( $this->mItems );
149 for ( $i = 0; $i < $mItemsCount; $i++ ) {
150 $text = $this->mItems[$i];
151 if ( $i == 0 ) {
152 $seqName = 'main';
153 } else {
154 $mainTail = $sequences['main']['tail'];
155 if ( strlen( $text ) < strlen( $mainTail ) * $smallFactor ) {
156 $seqName = 'small';
157 } else {
158 $seqName = 'main';
162 $tail = $sequences[$seqName]['tail'];
163 $diff = $this->diff( $tail, $text );
164 $sequences[$seqName]['diffs'][] = $diff;
165 $sequences[$seqName]['map'][] = $i;
166 $sequences[$seqName]['tail'] = $text;
169 // Knit the sequences together
170 $tail = '';
171 $this->mDiffs = [];
172 $this->mDiffMap = [];
173 foreach ( $sequences as $seq ) {
174 if ( $seq['diffs'] === [] ) {
175 continue;
177 if ( $tail === '' ) {
178 $this->mDiffs[] = $seq['diffs'][0];
179 } else {
180 $head = $this->patch( '', $seq['diffs'][0] );
181 $this->mDiffs[] = $this->diff( $tail, $head );
183 $this->mDiffMap[] = $seq['map'][0];
184 $diffsCount = count( $seq['diffs'] );
185 for ( $i = 1; $i < $diffsCount; $i++ ) {
186 $this->mDiffs[] = $seq['diffs'][$i];
187 $this->mDiffMap[] = $seq['map'][$i];
189 $tail = $seq['tail'];
194 * @param string $t1
195 * @param string $t2
196 * @return string
198 private function diff( $t1, $t2 ) {
199 return xdiff_string_rabdiff( $t1, $t2 );
203 * @param string $base
204 * @param string $diff
205 * @return bool|string
207 private function patch( $base, $diff ) {
208 if ( function_exists( 'xdiff_string_bpatch' ) ) {
209 return xdiff_string_bpatch( $base, $diff );
212 # Pure PHP implementation
214 $header = unpack( 'Vofp/Vcsize', substr( $diff, 0, 8 ) );
216 # Check the checksum
217 $ofp = $this->xdiffAdler32( $base );
218 if ( $ofp !== substr( $diff, 0, 4 ) ) {
219 wfDebug( __METHOD__ . ": incorrect base checksum" );
220 return false;
222 if ( $header['csize'] != strlen( $base ) ) {
223 wfDebug( __METHOD__ . ": incorrect base length" );
224 return false;
227 $p = 8;
228 $out = '';
229 while ( $p < strlen( $diff ) ) {
230 $x = unpack( 'Cop', substr( $diff, $p, 1 ) );
231 $op = $x['op'];
232 ++$p;
233 switch ( $op ) {
234 case self::XDL_BDOP_INS:
235 $x = unpack( 'Csize', substr( $diff, $p, 1 ) );
236 $p++;
237 $out .= substr( $diff, $p, $x['size'] );
238 $p += $x['size'];
239 break;
240 case self::XDL_BDOP_INSB:
241 $x = unpack( 'Vcsize', substr( $diff, $p, 4 ) );
242 $p += 4;
243 $out .= substr( $diff, $p, $x['csize'] );
244 $p += $x['csize'];
245 break;
246 case self::XDL_BDOP_CPY:
247 $x = unpack( 'Voff/Vcsize', substr( $diff, $p, 8 ) );
248 $p += 8;
249 $out .= substr( $base, $x['off'], $x['csize'] );
250 break;
251 default:
252 wfDebug( __METHOD__ . ": invalid op" );
253 return false;
256 return $out;
260 * Compute a binary "Adler-32" checksum as defined by LibXDiff, i.e. with
261 * the bytes backwards and initialised with 0 instead of 1. See T36428.
263 * @param string $s
264 * @return string
266 public function xdiffAdler32( $s ) {
267 static $init;
268 if ( $init === null ) {
269 $init = str_repeat( "\xf0", 205 ) . "\xee" . str_repeat( "\xf0", 67 ) . "\x02";
272 // The real Adler-32 checksum of $init is zero, so it initialises the
273 // state to zero, as it is at the start of LibXDiff's checksum
274 // algorithm. Appending the subject string then simulates LibXDiff.
275 return strrev( hash( 'adler32', $init . $s, true ) );
278 private function uncompress() {
279 if ( !$this->mDiffs ) {
280 return;
282 $tail = '';
283 $mDiffsCount = count( $this->mDiffs );
284 for ( $diffKey = 0; $diffKey < $mDiffsCount; $diffKey++ ) {
285 $textKey = $this->mDiffMap[$diffKey];
286 $text = $this->patch( $tail, $this->mDiffs[$diffKey] );
287 $this->mItems[$textKey] = $text;
288 $tail = $text;
293 * @return array
295 public function __sleep() {
296 $this->compress();
297 if ( $this->mItems === [] ) {
298 $info = false;
299 } else {
300 // Take forward differences to improve the compression ratio for sequences
301 $map = '';
302 $prev = 0;
303 foreach ( $this->mDiffMap as $i ) {
304 if ( $map !== '' ) {
305 $map .= ',';
307 $map .= $i - $prev;
308 $prev = $i;
310 $info = [
311 'diffs' => $this->mDiffs,
312 'map' => $map
315 if ( isset( $this->mDefaultKey ) ) {
316 $info['default'] = $this->mDefaultKey;
318 $this->mCompressed = gzdeflate( serialize( $info ) );
319 return [ 'mCompressed' ];
322 public function __wakeup() {
323 // addItem() doesn't work if mItems is partially filled from mDiffs
324 $this->mFrozen = true;
325 $info = HistoryBlobUtils::unserializeArray( gzinflate( $this->mCompressed ) );
326 $this->mCompressed = null;
328 if ( !$info ) {
329 // Empty object
330 return;
333 if ( isset( $info['default'] ) ) {
334 $this->mDefaultKey = $info['default'];
336 $this->mDiffs = $info['diffs'];
337 if ( isset( $info['base'] ) ) {
338 // Old format
339 $this->mDiffMap = range( 0, count( $this->mDiffs ) - 1 );
340 array_unshift( $this->mDiffs,
341 pack( 'VVCV', 0, 0, self::XDL_BDOP_INSB, strlen( $info['base'] ) ) .
342 $info['base'] );
343 } else {
344 // New format
345 $map = explode( ',', $info['map'] );
346 $cur = 0;
347 $this->mDiffMap = [];
348 foreach ( $map as $i ) {
349 $cur += $i;
350 $this->mDiffMap[] = $cur;
353 $this->uncompress();
357 * Helper function for compression jobs
358 * Returns true until the object is "full" and ready to be committed
360 * @return bool
362 public function isHappy() {
363 return $this->mSize < $this->mMaxSize
364 && count( $this->mItems ) < $this->mMaxCount;