xapian-core/languages/indonesian.sbl

   1 // Alias: id
   2
   3 // An implementation of the "Porter Stemmer for Bahasa Indonesia" from:
   4 // http://www.illc.uva.nl/Research/Publications/Reports/MoL-2003-02.text.pdf
   5
   6 integers (
   7     // The paper defines measure as the number of vowels in the word.  We
   8     // count this initially, then adjust the count each time we remove a
   9     // prefix or suffix.
  10     measure
  11
  12     // Numeric code for the type of prefix removed:
  13     //
  14     // 0 other/none
  15     // 1 'di' or 'meng' or 'ter'
  16     // 2 'per'
  17     // 3 'ke' or 'peng'
  18     // 4 'ber'
  19     //
  20     // Some of these have variant forms, so e.g. "meng" includes "men", "me",
  21     // "meny", "mem".
  22     //
  23     // Note that the value of prefix is only used in remove_suffix (and
  24     // routines it calls) so we don't need to worry about
  25     // remove_second_order_prefix overwriting a value of prefix set by
  26     // remove_first_order_prefix since remove_suffix gets called between
  27     // the two.
  28     prefix
  29 )
  30
  31 groupings ( vowel )
  32
  33 routines (
  34     remove_particle
  35     remove_possessive_pronoun
  36     remove_first_order_prefix
  37     remove_second_order_prefix
  38     remove_suffix
  39     KER
  40     SUFFIX_KAN_OK
  41     SUFFIX_AN_OK
  42     SUFFIX_I_OK
  43     VOWEL
  44 )
  45
  46 externals ( stem )
  47
  48 stringescapes {}
  49
  50 backwardmode (
  51
  52     define remove_particle as (
  53         [substring] among (
  54             'kah' 'lah' 'pun' (delete $measure-=1)
  55         )
  56     )
  57
  58     define remove_possessive_pronoun as (
  59         [substring] among (
  60             'ku' 'mu' 'nya' (delete $measure-=1)
  61         )
  62     )
  63
  64     // prefix not in {ke, peng, per}
  65     define SUFFIX_KAN_OK as (
  66         // On page 29, the example "kompas Q.31" says "Both Nazief and Porter
  67         // stemmer converted the word peledakan (blast, explotion) to ledak (to
  68         // blast, to explode)".  However, the algorithm as described doesn't
  69         // behave in this way - grammatically the prefix pe- occurs as a
  70         // variation of both the first-order derivational prefix peng- and the
  71         // second-order derivational prefix per-, but table 2.5 doesn't include
  72         // "pe", only table 2.6 does, so "peledakan" is handled (incorrectly)
  73         // as having prefix "per" not "peng", and so we remove derivational
  74         // suffix "kan" rather than "an" to give stem leda.  (Porter-style
  75         // stemmers remove the longest suffix they can amongst those available,
  76         // which this paper notes in the last paragraph on page 15).
  77         //
  78         // We resolve this by amending the condition on suffix "kan" to
  79         // "prefix ∉ {ke, peng, per}", which seems to make the stemmer's
  80         // behaviour match all the examples in the paper except for one:
  81         // "perbaikan" is shown in table 3.4 as stemming to "bai", but with
  82         // this change it now stems to "baik".  The table notes that "baik" is
  83         // the actual root so this deviation is an improvement.  In a sample
  84         // vocabulary derived from the most common words in id.wikipedia.org,
  85         // this change only affects 0.12% of words (76 out of 64,587, including
  86         // "peledakan" and "perbaikan").
  87         $prefix != 3 and $prefix != 2
  88     )
  89
  90     // prefix not in {di, meng, ter}
  91     define SUFFIX_AN_OK as ( $prefix != 1 )
  92
  93     define SUFFIX_I_OK as (
  94         // prefix not in {ke, peng, ber}
  95         $prefix <= 2
  96
  97         // The rest of the condition from the paper is:
  98         //   V|K...c₁c₁, c₁ ≠ s, c₂ ≠ i
  99         //
 100         // The meaning of this is unclear in several ways, and none of the
 101         // examples given of the stemmer's behaviour in the paper help to
 102         // resolve these issues.
 103         //
 104         // Notice that c₂ isn't actually used - the most obvious explanation
 105         // seems to be that "c₁c₁" should read "c₁c₂", or maybe "c₂c₁".
 106         //
 107         // Elsewhere the paper defines V... as meaning "the stem starts with
 108         // a vowel" and K... as meaning "the stem starts with a consonant".
 109         //
 110         // In other places where it says X|Y... it seems the | binds more
 111         // tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ).  That seems a bit
 112         // odd as the first letter must be either a vowel or a consonant, so
 113         // that really just means "ends cᵢcⱼ".  However, nowhere in the paper
 114         // uses or defines a notation such as ...X, which may explain this
 115         // seemingly redundant way of specifying this.
 116         //
 117         // The conditions elsewhere on prefix removal (e.g. V...) are clearly
 118         // on the stem left after the prefix is removed.  None of the other
 119         // rules for suffix removal have conditions on the stem, but for
 120         // consistency with the prefix rules we might expect that the cᵢcⱼ
 121         // test is on what's left *after* removing the "i" suffix.
 122         //
 123         // However, studying Indonesian wordlists and discussion with a native
 124         // speaker leads us to conclude that the purpose of this check is to
 125         // protect words of foreign origin (e.g. "televisi", "organisasi",
 126         // "komunikasi") from stemming, and the common feature of these is
 127         // that the word ends "-si", so we conclude that the condition here
 128         // should be read as "word does not end -si", and this is what we
 129         // have implemented.
 130         not 's'
 131     )
 132
 133     define remove_suffix as (
 134         [substring] among (
 135             'kan' SUFFIX_KAN_OK 'an' SUFFIX_AN_OK 'i' SUFFIX_I_OK
 136                 (delete $measure-=1)
 137         )
 138     )
 139 )
 140
 141 define vowel 'aeiou'
 142
 143 define VOWEL as ( vowel )
 144
 145 define KER as ( non-vowel 'er' )
 146
 147 define remove_first_order_prefix as (
 148     [substring] among (
 149         'di' 'meng' 'men' 'me' 'ter' (delete $prefix=1 $measure-=1)
 150         'ke' 'peng' 'pen' (delete $prefix=3 $measure-=1)
 151         'meny' VOWEL ($prefix=1 <-'s' $measure-=1)
 152         'peny' VOWEL ($prefix=3 <-'s' $measure-=1)
 153         'mem' ($prefix=1 $measure-=1 vowel and <-'p' or delete)
 154         'pem' ($prefix=3 $measure-=1 vowel and <-'p' or delete)
 155     )
 156 )
 157
 158 define remove_second_order_prefix as (
 159     // The paper has the condition on removal of prefix "bel" and "pel" as
 160     // just "ajar" not "ajar..." but it seems that the latter must be what
 161     // is intended so that e.g. "pelajaran" stems to "ajar" not "lajar".
 162     // This change only affects a very small number of words (11 out of
 163     // 64,587) and only for the better.
 164     [substring] among (
 165         'per' 'pe' (delete $prefix=2 $measure-=1)
 166         'pelajar' (<-'ajar' $measure-=1)
 167         'ber' (delete $prefix=4 $measure-=1)
 168         'belajar' (<-'ajar' $prefix=4 $measure-=1)
 169         'be' KER (delete $prefix=4 $measure-=1)
 170     )
 171 )
 172
 173 define stem as (
 174     $measure = 0
 175     do ( repeat ( gopast vowel $measure+=1 ) )
 176     $measure > 2
 177     $prefix = 0
 178     backwards (
 179         do remove_particle
 180         $measure > 2
 181         do remove_possessive_pronoun
 182     )
 183     $measure > 2
 184     test (
 185         remove_first_order_prefix
 186         do (
 187             test ($measure > 2 backwards remove_suffix)
 188             $measure > 2 remove_second_order_prefix
 189         )
 190     ) or (
 191         do remove_second_order_prefix
 192         do ($measure > 2 backwards remove_suffix)
 193     )
 194 )