basis/porter-stemmer/porter-stemmer.factor

   1 IN: porter-stemmer
   2 USING: kernel math parser sequences combinators splitting ;
   3
   4 : consonant? ( i str -- ? )
   5     2dup nth dup "aeiou" member? [
   6         3drop f
   7     ] [
   8         CHAR: y = [
   9             over zero?
  10             [ 2drop t ] [ [ 1- ] dip consonant? not ] if
  11         ] [
  12             2drop t
  13         ] if
  14     ] if ;
  15
  16 : skip-vowels ( i str -- i str )
  17     2dup bounds-check? [
  18         2dup consonant? [ [ 1+ ] dip skip-vowels ] unless
  19     ] when ;
  20
  21 : skip-consonants ( i str -- i str )
  22     2dup bounds-check? [
  23         2dup consonant? [ [ 1+ ] dip skip-consonants ] when
  24     ] when ;
  25
  26 : (consonant-seq) ( n i str -- n )
  27     skip-vowels
  28     2dup bounds-check? [
  29         [ 1+ ] [ 1+ ] [ ] tri* skip-consonants [ 1+ ] dip
  30         (consonant-seq)
  31     ] [
  32         2drop
  33     ] if ;
  34
  35 : consonant-seq ( str -- n )
  36     0 0 rot skip-consonants (consonant-seq) ;
  37
  38 : stem-vowel? ( str -- ? )
  39     [ length ] keep [ consonant? ] curry all? not ;
  40
  41 : double-consonant? ( i str -- ? )
  42     over 1 < [
  43         2drop f
  44     ] [
  45         2dup nth [ over 1- over nth ] dip = [
  46             consonant?
  47         ] [
  48             2drop f
  49         ] if
  50     ] if ;
  51
  52 : consonant-end? ( n seq -- ? )
  53     [ length swap - ] keep consonant? ;
  54
  55 : last-is? ( str possibilities -- ? ) [ peek ] dip member? ;
  56
  57 : cvc? ( str -- ? )
  58     {
  59         { [ dup length 3 < ] [ drop f ] }
  60         { [ 1 over consonant-end? not ] [ drop f ] }
  61         { [ 2 over consonant-end? ] [ drop f ] }
  62         { [ 3 over consonant-end? not ] [ drop f ] }
  63         [ "wxy" last-is? not ]
  64     } cond ;
  65
  66 : r ( str oldsuffix newsuffix -- str )
  67     pick consonant-seq 0 > [ nip ] [ drop ] if append ;
  68
  69 : step1a ( str -- newstr )
  70     dup peek CHAR: s = [
  71         {
  72             { [ "sses" ?tail ] [ "ss" append ] }
  73             { [ "ies" ?tail ] [ "i" append ] }
  74             { [ dup "ss" tail? ] [ ] }
  75             { [ "s" ?tail ] [ ] }
  76             [ ]
  77         } cond
  78     ] when ;
  79
  80 : -eed ( str -- str )
  81     dup consonant-seq 0 > "ee" "eed" ? append ;
  82
  83 : -ed ( str -- str ? )
  84     dup stem-vowel? [ [ "ed" append ] unless ] keep ;
  85
  86 : -ing ( str -- str ? )
  87     dup stem-vowel? [ [ "ing" append ] unless ] keep ;
  88
  89 : -ed/ing ( str -- str )
  90     {
  91         { [ "at" ?tail ] [ "ate" append ] }
  92         { [ "bl" ?tail ] [ "ble" append ] }
  93         { [ "iz" ?tail ] [ "ize" append ] }
  94         {
  95             [ dup length 1- over double-consonant? ]
  96             [ dup "lsz" last-is? [ but-last-slice ] unless ]
  97         }
  98         {
  99             [ t ]
 100             [
 101                 dup consonant-seq 1 = over cvc? and
 102                 [ "e" append ] when
 103             ]
 104         }
 105     } cond ;
 106
 107 : step1b ( str -- newstr )
 108     {
 109         { [ "eed" ?tail ] [ -eed ] }
 110         {
 111             [
 112                 {
 113                     { [ "ed" ?tail ] [ -ed ] }
 114                     { [ "ing" ?tail ] [ -ing ] }
 115                     [ f ]
 116                 } cond
 117             ] [ -ed/ing ]
 118         }
 119         [ ]
 120     } cond ;
 121
 122 : step1c ( str -- newstr )
 123     dup but-last-slice stem-vowel? [
 124         "y" ?tail [ "i" append ] when
 125     ] when ;
 126
 127 : step2 ( str -- newstr )
 128     {
 129         { [ "ational" ?tail ] [ "ational" "ate"  r ] }
 130         { [ "tional"  ?tail ] [ "tional"  "tion" r ] }
 131         { [ "enci"    ?tail ] [ "enci"    "ence" r ] }
 132         { [ "anci"    ?tail ] [ "anci"    "ance" r ] }
 133         { [ "izer"    ?tail ] [ "izer"    "ize"  r ] }
 134         { [ "bli"     ?tail ] [ "bli"     "ble"  r ] }
 135         { [ "alli"    ?tail ] [ "alli"    "al"   r ] }
 136         { [ "entli"   ?tail ] [ "entli"   "ent"  r ] }
 137         { [ "eli"     ?tail ] [ "eli"     "e"    r ] }
 138         { [ "ousli"   ?tail ] [ "ousli"   "ous"  r ] }
 139         { [ "ization" ?tail ] [ "ization" "ize"  r ] }
 140         { [ "ation"   ?tail ] [ "ation"   "ate"  r ] }
 141         { [ "ator"    ?tail ] [ "ator"    "ate"  r ] }
 142         { [ "alism"   ?tail ] [ "alism"   "al"   r ] }
 143         { [ "iveness" ?tail ] [ "iveness" "ive"  r ] }
 144         { [ "fulness" ?tail ] [ "fulness" "ful"  r ] }
 145         { [ "ousness" ?tail ] [ "ousness" "ous"  r ] }
 146         { [ "aliti"   ?tail ] [ "aliti"   "al"   r ] }
 147         { [ "iviti"   ?tail ] [ "iviti"   "ive"  r ] }
 148         { [ "biliti"  ?tail ] [ "biliti"  "ble"  r ] }
 149         { [ "logi"    ?tail ] [ "logi"    "log"  r ] }
 150         [ ]
 151     } cond ;
 152
 153 : step3 ( str -- newstr )
 154     {
 155         { [ "icate" ?tail ] [ "icate" "ic" r ] }
 156         { [ "ative" ?tail ] [ "ative" ""   r ] }
 157         { [ "alize" ?tail ] [ "alize" "al" r ] }
 158         { [ "iciti" ?tail ] [ "iciti" "ic" r ] }
 159         { [ "ical"  ?tail ] [ "ical"  "ic" r ] }
 160         { [ "ful"   ?tail ] [ "ful"   ""   r ] }
 161         { [ "ness"  ?tail ] [ "ness"  ""   r ] }
 162         [ ]
 163     } cond ;
 164
 165 : -ion ( str -- newstr )
 166     [
 167         "ion"
 168     ] [
 169         dup "st" last-is? [ "ion" append ] unless
 170     ] if-empty ;
 171
 172 : step4 ( str -- newstr )
 173     dup {
 174         { [ "al"    ?tail ] [ ] }
 175         { [ "ance"  ?tail ] [ ] }
 176         { [ "ence"  ?tail ] [ ] }
 177         { [ "er"    ?tail ] [ ] }
 178         { [ "ic"    ?tail ] [ ] }
 179         { [ "able"  ?tail ] [ ] }
 180         { [ "ible"  ?tail ] [ ] }
 181         { [ "ant"   ?tail ] [ ] }
 182         { [ "ement" ?tail ] [ ] }
 183         { [ "ment"  ?tail ] [ ] }
 184         { [ "ent"   ?tail ] [ ] }
 185         { [ "ion"   ?tail ] [ -ion ] }
 186         { [ "ou"    ?tail ] [ ] }
 187         { [ "ism"   ?tail ] [ ] }
 188         { [ "ate"   ?tail ] [ ] }
 189         { [ "iti"   ?tail ] [ ] }
 190         { [ "ous"   ?tail ] [ ] }
 191         { [ "ive"   ?tail ] [ ] }
 192         { [ "ize"   ?tail ] [ ] }
 193         [ ]
 194     } cond dup consonant-seq 1 > [ nip ] [ drop ] if ;
 195
 196 : remove-e? ( str -- ? )
 197     dup consonant-seq dup 1 >
 198     [ 2drop t ]
 199     [ 1 = [ but-last-slice cvc? not ] [ drop f ] if ] if ;
 200
 201 : remove-e ( str -- newstr )
 202     dup peek CHAR: e = [
 203         dup remove-e? [ but-last-slice ] when
 204     ] when ;
 205
 206 : ll->l ( str -- newstr )
 207     {
 208         { [ dup peek CHAR: l = not ] [ ] }
 209         { [ dup length 1- over double-consonant? not ] [ ] }
 210         { [ dup consonant-seq 1 > ] [ but-last-slice ] }
 211         [ ]
 212     } cond ;
 213
 214 : step5 ( str -- newstr ) remove-e ll->l ;
 215
 216 : stem ( str -- newstr )
 217     dup length 2 <= [
 218         step1a step1b step1c step2 step3 step4 step5 "" like
 219     ] unless ;