packages/docs-shared/lib/WordCount/createWordCountInfo.spec.ts

   1 import { createWordCountInfo } from './createWordCountInfo'
   2
   3 const countWords = (text: string) => createWordCountInfo(text).wordCount
   4
   5 test('Simple sentence', () => {
   6   const text = 'The quick brown fox jumps over the lazy dog.'
   7   const { wordCount, characterCount, nonWhitespaceCharacterCount } = createWordCountInfo(text)
   8   expect(wordCount).toBe(9)
   9   expect(characterCount).toBe(44)
  10   expect(nonWhitespaceCharacterCount).toBe(36)
  11 })
  12
  13 test('Different separators', () => {
  14   const text =
  15     'Is this a question? Here I am *emphasising* this word. Here: are some comma,separated,words and comma, separated, words'
  16   const { wordCount, characterCount, nonWhitespaceCharacterCount } = createWordCountInfo(text)
  17
  18   expect(wordCount).toBe(20)
  19   expect(characterCount).toBe(119)
  20   expect(nonWhitespaceCharacterCount).toBe(102)
  21 })
  22
  23 test('Em dashes delimit words', () => {
  24   const text = 'Btw—hello—world—foo'
  25
  26   const { wordCount, characterCount, nonWhitespaceCharacterCount } = createWordCountInfo(text)
  27
  28   expect(wordCount).toBe(4)
  29   expect(characterCount).toBe(19)
  30   expect(nonWhitespaceCharacterCount).toBe(19)
  31 })
  32
  33 test('Special characters and symbols', () => {
  34   const text = 'The cost is $5.99 @store #bigDeal'
  35
  36   const { wordCount, characterCount, nonWhitespaceCharacterCount } = createWordCountInfo(text)
  37
  38   /** Matches other online word counters and Docs products*/
  39   expect(wordCount).toBe(7)
  40   expect(characterCount).toBe(33)
  41   expect(nonWhitespaceCharacterCount).toBe(28)
  42 })
  43
  44 test('Numbers in words', () => {
  45   const text = 'The chemical formula for water is H2O and the pandemic is COVID-19'
  46   const { wordCount } = createWordCountInfo(text)
  47
  48   expect(wordCount).toBe(12)
  49 })
  50
  51 test('Accented Characters and diacritics', () => {
  52   const text = 'Café culture is thriving in the city, with many façades reflecting a historical charm.'
  53   const { wordCount } = createWordCountInfo(text)
  54
  55   expect(wordCount).toBe(14)
  56 })
  57
  58 test('Long text and paragraphs', () => {
  59   // simulate a large gap in text
  60   const longText = 'This is a long paragraph. ' + ' '.repeat(1000) + 'Another sentence.'
  61   const { wordCount } = createWordCountInfo(longText)
  62
  63   expect(wordCount).toBe(7)
  64 })
  65
  66 test('Consecutive punctuation marks', () => {
  67   const text = 'Wait... what?!!! This is incredible...'
  68   const { wordCount } = createWordCountInfo(text)
  69
  70   expect(wordCount).toBe(5)
  71 })
  72
  73 test('URLs and email addresses', () => {
  74   const text = 'Please visit our site at https://example.com or contact us at info@example.com'
  75   const { wordCount } = createWordCountInfo(text)
  76
  77   /** Matches other online word counters and Docs products*/
  78   expect(wordCount).toBe(15)
  79 })
  80
  81 test('Abbreviations and acronyms', () => {
  82   const text = 'I live in the U.S.A. and work for NASA.'
  83   const { wordCount } = createWordCountInfo(text)
  84
  85   /** Matches other online word counters and Docs products*/
  86   expect(wordCount).toBe(11)
  87 })
  88
  89 test('Hyphens do not delimit words', () => {
  90   const text = 'Btw-hello-world-foo'
  91
  92   const { wordCount, characterCount, nonWhitespaceCharacterCount } = createWordCountInfo(text)
  93
  94   expect(wordCount).toBe(1)
  95   expect(characterCount).toBe(19)
  96   expect(nonWhitespaceCharacterCount).toBe(19)
  97 })
  98
  99 test('Whitespace', () => {
 100   const text = 'The    quick     brown     fox\njumps\tover the lazy dog'
 101   const { wordCount, characterCount, nonWhitespaceCharacterCount } = createWordCountInfo(text)
 102
 103   expect(wordCount).toBe(9)
 104   expect(characterCount).toBe(54)
 105   expect(nonWhitespaceCharacterCount).toBe(35)
 106 })
 107
 108 test('Leading and trailing whitespace', () => {
 109   expect(createWordCountInfo('               The quick brown fox jumps over the lazy dog           ').wordCount).toBe(9)
 110 })
 111
 112 test('The\u00a0quick\u00a0brown\u00a0fox\u00a0jumps\u00a0over\u00a0the\u00a0lazy\u00a0dog', () => {
 113   const text = 'The\u00a0quick\u00a0brown\u00a0fox\u00a0jumps\u00a0over\u00a0the\u00a0lazy\u00a0dog'
 114
 115   const { wordCount, characterCount, nonWhitespaceCharacterCount } = createWordCountInfo(text)
 116
 117   expect(wordCount).toBe(9)
 118   expect(characterCount).toBe(43)
 119   expect(nonWhitespaceCharacterCount).toBe(35)
 120 })
 121
 122 test('contractions with apostrophe should not count as two separate words', () => {
 123   const text = "shouldn't couldn't wouldn't"
 124
 125   const { wordCount, characterCount, nonWhitespaceCharacterCount } = createWordCountInfo(text)
 126
 127   expect(wordCount).toBe(3)
 128   expect(characterCount).toBe(27)
 129   expect(nonWhitespaceCharacterCount).toBe(25)
 130 })
 131
 132 test('Spanish', () => {
 133   expect(
 134     createWordCountInfo('¿Qué opinas de las nuevas reformas? Me gustan los cambios, pero a veces son complicados.')
 135       .wordCount,
 136   ).toBe(15)
 137 })
 138
 139 test('Arabic sentence', () => {
 140   const sentence = 'أنا أحب تعلم اللغات! وأنت؟'
 141   expect(countWords(sentence)).toBe(5)
 142 })
 143
 144 // Example test for Korean text
 145 test('Korean sentence', () => {
 146   const sentence = '나는 언어를 배우는 것을 좋아해요! 당신은요?'
 147   expect(countWords(sentence)).toBe(6)
 148 })
 149
 150 test('Hindi sentence', () => {
 151   const sentence = 'मुझे भाषाएँ सीखना पसंद है। आप कैसे हैं?'
 152   expect(countWords(sentence)).toBe(8)
 153 })
 154
 155 test('Portuguese sentence', () => {
 156   const sentence = 'Eu gosto de aprender línguas. E você, gosta?'
 157   expect(countWords(sentence)).toBe(8)
 158 })
 159
 160 test('Bengali sentence', () => {
 161   const sentence = 'আমি ভাষা শেখা পছন্দ করি। আপনি কেমন আছেন?'
 162   expect(countWords(sentence)).toBe(8)
 163 })
 164
 165 test('Russian sentence', () => {
 166   const sentence = 'Мне нравится учить языки. А вам?'
 167   expect(countWords(sentence)).toBe(6)
 168 })
 169
 170 test('French sentence', () => {
 171   const sentence = 'J’aime apprendre des langues ! Et toi ?'
 172   expect(countWords(sentence)).toBe(6)
 173 })
 174
 175 test('German sentence', () => {
 176   const sentence = 'Ich mag Sprachen lernen. Und du?'
 177   expect(countWords(sentence)).toBe(6)
 178 })
 179
 180 test('Vietnamese sentence', () => {
 181   const sentence = 'Tôi thích học các ngôn ngữ! Bạn thì sao?'
 182   expect(countWords(sentence)).toBe(9)
 183 })
 184
 185 test('Urdu sentence', () => {
 186   const sentence = 'مجھے زبانیں سیکھنا پسند ہے۔ آپ کیسے ہیں؟'
 187   expect(countWords(sentence)).toBe(8)
 188 })
 189
 190 test('Turkish sentence', () => {
 191   const sentence = 'Dilleri öğrenmeyi seviyorum! Sen ne düşünüyorsun?'
 192   expect(countWords(sentence)).toBe(6) // "Dilleri", "öğrenmeyi", "seviyorum", "Sen", "ne", "düşünüyorsun"
 193 })
 194
 195 test('Italian sentence', () => {
 196   const sentence = 'Mi piace imparare le lingue. E tu?'
 197   expect(countWords(sentence)).toBe(7)
 198 })
 199
 200 test('Persian (Farsi) sentence', () => {
 201   const sentence = 'من یادگیری زبان‌ها را دوست دارم! شما چطور؟'
 202   expect(countWords(sentence)).toBe(8)
 203 })
 204
 205 test('Polish sentence', () => {
 206   const sentence = 'Lubię uczyć się języków. A ty?'
 207   expect(countWords(sentence)).toBe(6)
 208 })
 209
 210 test('Tamil sentence', () => {
 211   const sentence = 'நான் மொழிகள் கற்க விரும்புகிறேன்! நீங்கள் எப்படி?'
 212   expect(countWords(sentence)).toBe(6)
 213 })
 214
 215 // // Non-Whitespace-Separated Languages
 216 test('Mandarin Chinese sentence', () => {
 217   const sentence = '我喜欢学习语言。你呢？'
 218   expect(countWords(sentence)).toBe(9)
 219   expect(countWords('天地玄黄，宇宙洪荒。日月盈昃，辰宿列张。寒来暑往，秋收冬藏。')).toBe(24)
 220 })
 221
 222 test('Japanese', () => {
 223   const text = '速い茶色の狐が怠け者の犬の上を飛び越えます。'
 224   const text2 = 'しい改革についてどう思いますか？'
 225   const text3 = '変化は好きですが、時々複雑です。'
 226   const text4 = 'しい改革についてどう思いますか？'
 227
 228   expect(countWords(text)).toBe(21)
 229   expect(countWords(text2)).toBe(15)
 230   expect(countWords(text3)).toBe(14)
 231   expect(countWords(text4)).toBe(15)
 232 })
 233
 234 test('Thai sentence', () => {
 235   const sentence = 'ฉันชอบเรียนภาษาไทย! คุณล่ะ?'
 236   expect(countWords(sentence)).toBe(20)
 237 })
 238
 239 test('Emojis including emoji sequences', () => {
 240   const text =
 241     'The quick brown fox jumps over the lazy dog 🦊. Look at this cool rocket and astronaut 1️⃣ 🚀👨‍🚀! 🌈✨🚀 😎'
 242
 243   const { wordCount, characterCount, nonWhitespaceCharacterCount } = createWordCountInfo(text)
 244   expect(wordCount).toBe(21)
 245   expect(characterCount).toBe(97)
 246   expect(nonWhitespaceCharacterCount).toBe(77)
 247 })