transex3/source/wtratree.cxx

   1 /*************************************************************************
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * Copyright 2008 by Sun Microsystems, Inc.
   6  *
   7  * OpenOffice.org - a multi-platform office productivity suite
   8  *
   9  * $RCSfile: wtratree.cxx,v $
  10  * $Revision: 1.5 $
  11  *
  12  * This file is part of OpenOffice.org.
  13  *
  14  * OpenOffice.org is free software: you can redistribute it and/or modify
  15  * it under the terms of the GNU Lesser General Public License version 3
  16  * only, as published by the Free Software Foundation.
  17  *
  18  * OpenOffice.org is distributed in the hope that it will be useful,
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21  * GNU Lesser General Public License version 3 for more details
  22  * (a copy is included in the LICENSE file that accompanied this code).
  23  *
  24  * You should have received a copy of the GNU Lesser General Public License
  25  * version 3 along with OpenOffice.org.  If not, see
  26  * <http://www.openoffice.org/license.html>
  27  * for a copy of the LGPLv3 License.
  28  *
  29  ************************************************************************/
  30
  31 // MARKER(update_precomp.py): autogen include statement, do not remove
  32 #include "precompiled_transex3.hxx"
  33
  34
  35 #include "wtratree.hxx"
  36
  37
  38
  39 /** @ATTENTION
  40     For reasons of speed, class WordTransTree works with two simple
  41     char arrays, sOutput and sInput, instead of secure containers or
  42     streams. So be extremely careful, when changing this code!!!
  43 **/
  44
  45
  46
  47 // NOT FULLY DECLARED SERVICES
  48 #include <string.h>
  49 #include <stdio.h>
  50 #include <ctype.h>
  51 #include "wtranode.hxx"
  52
  53
  54 const BRANCH_T  BR_END                  = 0;
  55 const BRANCH_T  BR_NONALPHA     = 1;
  56 const BRANCH_T  BR_HOTKEY       = 2;
  57 const BRANCH_T  BR_BACKSLASH    = 3;
  58 const BRANCH_T  BR_ALPHABASE    = 4;    /// @ATTENTION  All branches not valid for words must be smaller than this value!
  59 const BRANCH_T  BR_AE           = 30;
  60 const BRANCH_T  BR_OE           = 31;
  61 const BRANCH_T  BR_UE           = 32;
  62 const BRANCH_T  BR_SZ           = 33;
  63 const BRANCH_T  BR_MAX          = 34;   /// @ATTENTION  Must be updated always!
  64
  65 const BRANCH_T  BR_START                = 0;
  66
  67
  68
  69
  70
  71 WordTransTree::WordTransTree(CharSet  i_nWorkingCharSet)
  72     :   sInput(0),
  73         nInputLength(0),
  74         pInputEnd(0),
  75         sOutput(0),
  76         nOutputMaxLength(0),
  77         dpParsingTreeTop(0),
  78         pUnknownAlpha(0),
  79         // cChar2Branch
  80         c_AE(u_char('\xC4')), c_OE(u_char('\xD6')), c_UE(u_char('\xDC')),
  81         c_ae(u_char('\xE4')), c_oe(u_char('\xF6')), c_ue(u_char('\xFC')),
  82         pInputCurTokenStart(0),
  83         pInputPosition(0),
  84         pOutputPosition(0),
  85         pCurParseNode(0),
  86         eCurResult(OK),
  87         cCurHotkey(0),
  88         cCurHotkeySign(u_char('~'))
  89 {
  90     // Initialize parsing tree:
  91     pUnknownAlpha = new WTT_Node(BR_ALPHABASE,0,0);     // This will be deleted as part of the parsing tree.
  92     for ( UINT8 i = BR_ALPHABASE; i < C_NR_OF_BRANCHES; i++)
  93     {
  94         pUnknownAlpha->SetBranch(i,pUnknownAlpha);
  95     }  // end for
  96
  97     dpParsingTreeTop = new WTT_Node(BR_START,0,pUnknownAlpha);
  98
  99     WTT_Node * dpNonAlpha = new WTT_Node(BR_NONALPHA,0,0);
 100
 101     dpNonAlpha->SetBranch(BR_NONALPHA,dpNonAlpha);
 102     dpParsingTreeTop->SetBranch(BR_NONALPHA,dpNonAlpha);
 103
 104     WTT_Node * dpBackslash = new WTT_Node(BR_BACKSLASH,dpNonAlpha,dpNonAlpha);
 105     dpBackslash->SetBranch(BR_END,0);
 106
 107     dpParsingTreeTop->SetBranch(BR_BACKSLASH,dpBackslash);
 108     dpNonAlpha->SetBranch(BR_BACKSLASH,dpBackslash);
 109
 110
 111     // Initialize character set:
 112     SetCharSet(i_nWorkingCharSet);
 113
 114     if (C_BR_ALPHABASE != BR_ALPHABASE || C_NR_OF_BRANCHES != BR_MAX)
 115     {
 116         fprintf(stderr, "Assertion failed: file %s line %d.", __FILE__,  __LINE__);
 117         exit(1);
 118     }
 119 }
 120
 121 void
 122 WordTransTree::SetCharSet(CharSet i_nWorkingCharSet)
 123 {
 124     ByteString sConvert("\xC4\xD6\xDC\xE4\xF6\xFC\xDF");
 125     const u_char * pConvert = (const u_char * ) ( sConvert.Convert(RTL_TEXTENCODING_MS_1252, i_nWorkingCharSet).GetBuffer() );
 126
 127     INT16 i = 0;
 128     for ( ; i < C_NR_OF_POSSIBLE_CHARS; ++i )
 129     {
 130         cChar2Branch[i] = BR_NONALPHA;
 131     }  // end for
 132     for ( i = 'a'; i <= 'z'; ++i )
 133     {
 134         cChar2Branch[i] = BR_ALPHABASE + i - 'a';
 135     }  // end for
 136     for ( i = 'A'; i <= 'Z'; ++i )
 137     {
 138         cChar2Branch[i] = BR_ALPHABASE + i - 'A';
 139     }  // end for
 140     cChar2Branch[pConvert[0]] = BR_AE;
 141     cChar2Branch[pConvert[1]] = BR_OE;
 142     cChar2Branch[pConvert[2]] = BR_UE;
 143     cChar2Branch[pConvert[3]] = BR_AE;
 144     cChar2Branch[pConvert[4]] = BR_OE;
 145     cChar2Branch[pConvert[5]] = BR_UE;
 146     cChar2Branch[pConvert[6]] = BR_SZ;
 147
 148     cChar2Branch[u_char('~')] = BR_HOTKEY;
 149     cChar2Branch[u_char('&')] = BR_HOTKEY;
 150
 151
 152     c_AE = pConvert[0];
 153     c_OE = pConvert[1];
 154     c_UE = pConvert[2];
 155     c_ae = pConvert[3];
 156     c_oe = pConvert[4];
 157     c_ue = pConvert[5];
 158 }
 159
 160 WordTransTree::~WordTransTree()
 161 {
 162     delete dpParsingTreeTop;
 163     if (sOutput != 0)
 164         delete [] sOutput;
 165 }
 166
 167 void
 168 WordTransTree::AddWordPair(     const ByteString &              i_sOldString,
 169                             const ByteString &          i_sReplaceString )
 170 {
 171     if (i_sOldString.Len() == 0)
 172         return;
 173
 174     pCurParseNode = dpParsingTreeTop;
 175     WTT_Node * pBranch = 0;
 176     char cBranch = 0;
 177
 178     for ( constr pOld = i_sOldString.GetBuffer();
 179           *pOld != 0;
 180           pOld++ )
 181     {
 182         cBranch = CalculateBranch(*pOld);
 183         pBranch = pCurParseNode->GetNextNode(cBranch);
 184         if (pBranch == 0 || pBranch == pUnknownAlpha)
 185         {
 186             pBranch = new WTT_Node(cBranch,0,pUnknownAlpha);
 187             pCurParseNode->SetBranch(cBranch,pBranch);
 188         }
 189         pCurParseNode = pBranch;
 190     }   // end for
 191     pCurParseNode->SetAsTokenToReplace(i_sReplaceString);
 192 }
 193
 194 void
 195 WordTransTree::InitTransformation( const char * i_sInput,
 196                                    UINT32               i_nInputLength,
 197                                    UINT32               i_nOutputMaxLength )
 198 {
 199     sInput = (const u_char *)i_sInput;
 200     nInputLength = i_nInputLength;
 201     pInputEnd = &sInput[i_nInputLength];
 202
 203     pInputCurTokenStart = sInput;
 204     pInputPosition = sInput;
 205
 206     if (nOutputMaxLength < i_nOutputMaxLength)
 207     {
 208         if (sOutput != 0)
 209             delete [] sOutput;
 210         sOutput = new unsigned char[i_nOutputMaxLength];
 211         nOutputMaxLength = i_nOutputMaxLength;
 212     }
 213     pOutputPosition = sOutput;
 214 }
 215
 216 /**     pInputCurTokenStart and CurParseNode are updated just when
 217     starting this function. After its end they must not be changed
 218     till this functon is called again.
 219     Outside this function pInputPositon and pOutputPosition are both
 220     on the first not transformed char in their respective array.
 221 **/
 222 WordTransTree::E_Result
 223 WordTransTree::TransformNextToken()
 224 {
 225     pInputCurTokenStart = pInputPosition;
 226     pCurParseNode = dpParsingTreeTop;
 227     cCurHotkey = 0;
 228     eCurResult = OK;
 229
 230     WTT_Node * pBranch = 0;
 231     UINT8 cBranch = 0;
 232
 233     for ( pCurParseNode = dpParsingTreeTop;
 234           pInputPosition != pInputEnd;
 235           ++pInputPosition )
 236     {
 237         cBranch = CalculateBranch(*pInputPosition);
 238         pBranch = pCurParseNode->GetNextNode( cBranch );
 239         if (pBranch != 0)
 240         {
 241             pCurParseNode = pBranch;
 242         }
 243         else
 244         {
 245             if (cBranch == BR_HOTKEY)   // current letter is '~' or '&'.
 246             {
 247                 // Logic of the following. There are 9 possible cases -
 248                 // A = alphabetic letter, NA = non alphabetic, TB = token begin,
 249                 // Eot = end of text:
 250                 //       1.     A~A          set hotkey to following letter, continue
 251                 //       2.     A~NA         token end
 252                 //       3.     A~Eot        token end
 253                 //       4.     NA~A         token end
 254                 //       5.     NA~NA        continue
 255                 //       6.     A~Eof        continue
 256                 //       7.     TB~A         set hotkey to following letter, continue
 257                 //       8.     TB~NA        continue
 258                 //       9.     TB~Eot       continue
 259
 260                 // bNext and Prev are true, if there are alphabetic letters:
 261                 BOOL bNext =  pInputPosition + 1 != pInputEnd
 262                                     ?   CalculateBranch(pInputPosition[1]) >= BR_ALPHABASE
 263                                     :   FALSE;
 264                 BOOL bPrev = pCurParseNode->Value() >= BR_ALPHABASE;
 265
 266                 if ( bNext && (bPrev || pCurParseNode == dpParsingTreeTop) )
 267                 {   // case 1. and 7.
 268                     Handle_Hotkey();
 269                     continue;
 270                 }
 271                 else if  (!bPrev && !bNext)
 272                 {   // case 5.,6.,8.,9.
 273                     continue;
 274                 }
 275
 276                 // Case 2.,3.,4. :
 277                 //      so this should be handled as an end of a token.
 278             }
 279             if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
 280             {
 281                 Handle_TokenToKeep();
 282                 return eCurResult;
 283             }
 284             else
 285             {
 286                 Handle_TokenToTransform();
 287                 return eCurResult;
 288             }   // endif (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
 289         }       // endif (pBranch == 0) else
 290     }   // end for
 291
 292     // If here, the text end is reached
 293     if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
 294     {
 295         Handle_TokenToKeep();
 296         return eCurResult;
 297     }
 298     else
 299     {
 300         Handle_TokenToTransform();
 301         return eCurResult;
 302     }
 303 }
 304
 305 ByteString
 306 WordTransTree::CurReplacingString() const
 307 {
 308     return pCurParseNode->ReplaceString();
 309 }
 310
 311 void
 312 WordTransTree::Handle_Hotkey()
 313 {
 314     if (cCurHotkey == 0)        // Avoid to replace the first found hotkey by
 315                             //   a later one - though this shouldn't happen anyway.
 316     {
 317         cCurHotkey = (pInputPosition+1) != pInputEnd ? pInputPosition[1] : 0;
 318         cCurHotkeySign = *pInputPosition;
 319     }
 320 }
 321
 322 void
 323 WordTransTree::Handle_TokenToKeep()
 324 {
 325     UINT32 nTokenLength = pInputPosition-pInputCurTokenStart;
 326
 327     memcpy(pOutputPosition,pInputCurTokenStart,nTokenLength);
 328
 329     pOutputPosition += nTokenLength;
 330     *pOutputPosition = '\0';
 331 }
 332
 333 void
 334 WordTransTree::Handle_TokenToTransform()
 335 {
 336     BOOL bHaveHotkey = CalculateBranch(cCurHotkey) >= BR_ALPHABASE;
 337     const ByteString & rReplace = pCurParseNode->ReplaceString();
 338
 339     // Find position of hotkey in replace-string:
 340     USHORT nHotkeyPos = bHaveHotkey
 341                             ?   rReplace.Search(char(cCurHotkey))
 342                             :   STRING_NOTFOUND;
 343     if (nHotkeyPos == STRING_NOTFOUND && bHaveHotkey)
 344     {
 345         if (cCurHotkey < 128)
 346         {
 347             if (islower(cCurHotkey))
 348                 nHotkeyPos = rReplace.Search(toupper(char(cCurHotkey)));
 349             else
 350                 nHotkeyPos = rReplace.Search(tolower(char(cCurHotkey)));
 351         }
 352         else    // cCurHotkey >= 128
 353         {
 354             if (cCurHotkey == c_ae)
 355                 nHotkeyPos = rReplace.Search(char(c_AE));
 356             else if (cCurHotkey == c_oe)
 357                 nHotkeyPos = rReplace.Search(char(c_OE));
 358             else if (cCurHotkey == c_ue)
 359                 nHotkeyPos = rReplace.Search(char(c_UE));
 360             else if (cCurHotkey == c_AE)
 361                 nHotkeyPos = rReplace.Search(char(c_ae));
 362             else if (cCurHotkey == c_OE)
 363                 nHotkeyPos = rReplace.Search(char(c_oe));
 364             else if (cCurHotkey == c_UE)
 365                 nHotkeyPos = rReplace.Search(char(c_ue));
 366         }       // endif (cCurHotkey < 128) else
 367
 368         if (nHotkeyPos == STRING_NOTFOUND)
 369         {
 370             eCurResult = HOTKEY_LOST;
 371             bHaveHotkey = FALSE;
 372         }
 373     }   // endif (nHotkeyPos == STRING_NOT_FOUND && bHaveHotkey)
 374
 375
 376     UINT32 nOutputTokenLength = rReplace.Len() + (bHaveHotkey ? 1 : 0);
 377
 378     if (bHaveHotkey)
 379     {
 380         memcpy( pOutputPosition,
 381                 pCurParseNode->ReplaceString().GetBuffer(),
 382                 nHotkeyPos );
 383         *(pOutputPosition + nHotkeyPos) = cCurHotkeySign;
 384         memcpy( pOutputPosition + nHotkeyPos + 1,
 385                 pCurParseNode->ReplaceString().GetBuffer() + nHotkeyPos,
 386                 nOutputTokenLength - nHotkeyPos - 1);
 387     }
 388     else
 389     {
 390         memcpy( pOutputPosition,
 391                 pCurParseNode->ReplaceString().GetBuffer(),
 392                 nOutputTokenLength );
 393     }
 394
 395     // Convert first letter into upper if necessary:
 396     u_char cInStart = CalculateBranch(*pInputCurTokenStart) == BR_HOTKEY
 397                             ?   pInputCurTokenStart[1]
 398                             :   pInputCurTokenStart[0] ;
 399     u_char * pOutStart = nHotkeyPos == 0
 400                             ?   pOutputPosition + 1
 401                             :   pOutputPosition ;
 402     if (isupper(cInStart) || cInStart > 127)
 403     {   // Possibly cInStart is upper character:
 404         if (isupper(cInStart) || cInStart == c_AE || cInStart == c_OE || cInStart == c_UE)
 405         {       // Surely cInStart is upper character:
 406             u_char cOutStart = *pOutStart;
 407             if (cOutStart < 128)
 408                 *pOutStart = toupper(cOutStart);
 409             else if (cOutStart == c_ae)
 410                 *pOutStart = c_AE;
 411             else if (cOutStart == c_oe)
 412                 *pOutStart = c_OE;
 413             else if (cOutStart == c_ue)
 414                 *pOutStart = c_UE;
 415         }
 416     }   // endif (isupper(cInStart) || cInStart > 127)
 417
 418     pOutputPosition += nOutputTokenLength;
 419     *pOutputPosition = '\0';
 420 }
 421