resources/src/mediawiki.String.js

   1 ( function () {
   2         /**
   3          * Calculate the byte length of a string (accounting for UTF-8).
   4          *
   5          * @author Jan Paul Posma, 2011
   6          * @author Timo Tijhof, 2012
   7          * @author David Chan, 2013
   8          * @memberof module:mediawiki.String
   9          * @param {string} str
  10          * @return {number}
  11          */
  12         function byteLength( str ) {
  13                 // This basically figures out how many bytes a UTF-16 string (which is what js sees)
  14                 // will take in UTF-8 by replacing a 2 byte character with 2 *'s, etc, and counting that.
  15                 // Note, surrogate (\uD800-\uDFFF) characters are counted as 2 bytes, since there's two of them
  16                 // and the actual character takes 4 bytes in UTF-8 (2*2=4). Might not work perfectly in
  17                 // edge cases such as illegal sequences, but that should never happen.
  18
  19                 // https://en.wikipedia.org/wiki/UTF-8#Description
  20                 // The mapping from UTF-16 code units to UTF-8 bytes is as follows:
  21                 // > Range 0000-007F: codepoints that become 1 byte of UTF-8
  22                 // > Range 0080-07FF: codepoints that become 2 bytes of UTF-8
  23                 // > Range 0800-D7FF: codepoints that become 3 bytes of UTF-8
  24                 // > Range D800-DFFF: Surrogates (each pair becomes 4 bytes of UTF-8)
  25                 // > Range E000-FFFF: codepoints that become 3 bytes of UTF-8 (continued)
  26
  27                 return str
  28                         .replace( /[\u0080-\u07FF\uD800-\uDFFF]/g, '**' )
  29                         .replace( /[\u0800-\uD7FF\uE000-\uFFFF]/g, '***' )
  30                         .length;
  31         }
  32
  33         /**
  34          * Calculate the character length of a string (accounting for UTF-16 surrogates).
  35          *
  36          * @memberof module:mediawiki.String
  37          * @param {string} str
  38          * @return {number}
  39          */
  40         function codePointLength( str ) {
  41                 return str
  42                         // Low surrogate + high surrogate pairs represent one character (codepoint) each
  43                         .replace( /[\uD800-\uDBFF][\uDC00-\uDFFF]/g, '*' )
  44                         .length;
  45         }
  46
  47         /**
  48          * Like {@link https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String/charAt String.charAt()},
  49          * but return the pair of UTF-16 surrogates for characters outside of BMP.
  50          *
  51          * @memberof module:mediawiki.String
  52          * @param {string} string
  53          * @param {number} offset Offset to extract the character
  54          * @param {boolean} [backwards] Use backwards direction to detect UTF-16 surrogates,
  55          *                              defaults to false
  56          * @return {string}
  57          */
  58         function charAt( string, offset, backwards ) {
  59                 // We don't need to check for offsets at the beginning or end of string,
  60                 // String#slice will simply return a shorter (or empty) substring.
  61                 const maybePair = backwards ?
  62                         string.slice( offset - 1, offset + 1 ) :
  63                         string.slice( offset, offset + 2 );
  64                 if ( /^[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( maybePair ) ) {
  65                         return maybePair;
  66                 } else {
  67                         return string.charAt( offset );
  68                 }
  69         }
  70
  71         /**
  72          * Lowercase the first character. Support UTF-16 surrogates for characters outside of BMP.
  73          *
  74          * @memberof module:mediawiki.String
  75          * @param {string} string
  76          * @return {string}
  77          */
  78         function lcFirst( string ) {
  79                 const firstChar = charAt( string, 0 );
  80                 return firstChar.toLowerCase() + string.slice( firstChar.length );
  81         }
  82
  83         /**
  84          * Uppercase the first character. Support UTF-16 surrogates for characters outside of BMP.
  85          *
  86          * @memberof module:mediawiki.String
  87          * @param {string} string
  88          * @return {string}
  89          */
  90         function ucFirst( string ) {
  91                 const firstChar = charAt( string, 0 );
  92                 return firstChar.toUpperCase() + string.slice( firstChar.length );
  93         }
  94
  95         function trimLength( safeVal, newVal, length, lengthFn ) {
  96                 const oldVal = safeVal;
  97
  98                 // Run the hook if one was provided, but only on the length
  99                 // assessment. The value itself is not to be affected by the hook.
 100                 if ( lengthFn( newVal ) <= length ) {
 101                         // Limit was not reached, just remember the new value
 102                         // and let the user continue.
 103                         return {
 104                                 newVal: newVal,
 105                                 trimmed: false
 106                         };
 107                 }
 108
 109                 // Current input is longer than the active limit.
 110                 // Figure out what was added and limit the addition.
 111                 let startMatches = 0;
 112                 let endMatches = 0;
 113
 114                 // It is important that we keep the search within the range of
 115                 // the shortest string's length.
 116                 // Imagine a user adds text that matches the end of the old value
 117                 // (e.g. "foo" -> "foofoo"). startMatches would be 3, but without
 118                 // limiting both searches to the shortest length, endMatches would
 119                 // also be 3.
 120                 const matchesLen = Math.min( newVal.length, oldVal.length );
 121
 122                 // Count same characters from the left, first.
 123                 // (if "foo" -> "foofoo", assume addition was at the end).
 124                 while ( startMatches < matchesLen ) {
 125                         const oldChar = charAt( oldVal, startMatches, false );
 126                         const newChar = charAt( newVal, startMatches, false );
 127                         if ( oldChar !== newChar ) {
 128                                 break;
 129                         }
 130                         startMatches += oldChar.length;
 131                 }
 132
 133                 while ( endMatches < ( matchesLen - startMatches ) ) {
 134                         const oldChar = charAt( oldVal, oldVal.length - 1 - endMatches, true );
 135                         const newChar = charAt( newVal, newVal.length - 1 - endMatches, true );
 136                         if ( oldChar !== newChar ) {
 137                                 break;
 138                         }
 139                         endMatches += oldChar.length;
 140                 }
 141
 142                 const inpParts = [
 143                         // Same start
 144                         newVal.slice( 0, startMatches ),
 145                         // Inserted content
 146                         newVal.slice( startMatches, newVal.length - endMatches ),
 147                         // Same end
 148                         newVal.slice( newVal.length - endMatches )
 149                 ];
 150
 151                 // Chop off characters from the end of the "inserted content" string
 152                 // until the limit is statisfied.
 153                 // Make sure to stop when there is nothing to slice (T43450).
 154                 while ( lengthFn( inpParts.join( '' ) ) > length && inpParts[ 1 ].length > 0 ) {
 155                         // Do not chop off halves of surrogate pairs
 156                         const chopOff = /[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inpParts[ 1 ] ) ? 2 : 1;
 157                         inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -chopOff );
 158                 }
 159
 160                 return {
 161                         newVal: inpParts.join( '' ),
 162                         // For pathological lengthFn() that always returns a length greater than the limit, we might have
 163                         // ended up not trimming - check for this case to avoid infinite loops
 164                         trimmed: newVal !== inpParts.join( '' )
 165                 };
 166         }
 167
 168         /**
 169          * @typedef {Object} module:mediawiki.String~StringTrimmed
 170          * @property {string} newVal a trimmed version of the string
 171          * @property {boolean} trimmed whether the string is different from the original version.
 172          */
 173
 174         /**
 175          * Utility function to trim down a string, based on byteLimit
 176          * and given a safe start position. It supports insertion anywhere
 177          * in the string, so "foo" to "fobaro" if limit is 4 will result in
 178          * "fobo", not "foba". Basically emulating the native maxlength by
 179          * reconstructing where the insertion occurred.
 180          *
 181          * @memberof module:mediawiki.String
 182          * @param {string} safeVal Known value that was previously returned by this
 183          * function, if none, pass empty string.
 184          * @param {string} newVal New value that may have to be trimmed down.
 185          * @param {number} byteLimit Number of bytes the value may be in size.
 186          * @param {Function} [filterFunction] Function to call on the string before assessing the length.
 187          * @return {module:mediawiki.String~StringTrimmed}
 188          */
 189         function trimByteLength( safeVal, newVal, byteLimit, filterFunction ) {
 190                 let lengthFn;
 191                 if ( filterFunction ) {
 192                         lengthFn = function ( val ) {
 193                                 return byteLength( filterFunction( val ) );
 194                         };
 195                 } else {
 196                         lengthFn = byteLength;
 197                 }
 198
 199                 return trimLength( safeVal, newVal, byteLimit, lengthFn );
 200         }
 201
 202         /**
 203          * Utility function to trim down a string, based on codePointLimit
 204          * and given a safe start position. It supports insertion anywhere
 205          * in the string, so "foo" to "fobaro" if limit is 4 will result in
 206          * "fobo", not "foba". Basically emulating the native maxlength by
 207          * reconstructing where the insertion occurred.
 208          *
 209          * @memberof module:mediawiki.String
 210          * @param {string} safeVal Known value that was previously returned by this
 211          * function, if none, pass empty string.
 212          * @param {string} newVal New value that may have to be trimmed down.
 213          * @param {number} codePointLimit Number of characters the value may be in size.
 214          * @param {Function} [filterFunction] Function to call on the string before assessing the length.
 215          * @return {module:mediawiki.String~StringTrimmed}
 216          */
 217         function trimCodePointLength( safeVal, newVal, codePointLimit, filterFunction ) {
 218                 let lengthFn;
 219                 if ( filterFunction ) {
 220                         lengthFn = function ( val ) {
 221                                 return codePointLength( filterFunction( val ) );
 222                         };
 223                 } else {
 224                         lengthFn = codePointLength;
 225                 }
 226
 227                 return trimLength( safeVal, newVal, codePointLimit, lengthFn );
 228         }
 229
 230         /**
 231          * Module providing string utility functions.
 232          *
 233          * @exports mediawiki.String
 234          */
 235         module.exports = {
 236                 byteLength,
 237                 codePointLength,
 238                 charAt,
 239                 lcFirst,
 240                 ucFirst,
 241                 trimByteLength,
 242                 trimCodePointLength
 243         };
 244
 245 }() );