1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: wtratree.cxx,v $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 // MARKER(update_precomp.py): autogen include statement, do not remove
32 #include "precompiled_transex3.hxx"
35 #include "wtratree.hxx"
40 For reasons of speed, class WordTransTree works with two simple
41 char arrays, sOutput and sInput, instead of secure containers or
42 streams. So be extremely careful when changing this code!!!
47 // NOT FULLY DECLARED SERVICES
51 #include "wtranode.hxx"
// Branch indices used by the parsing tree and by the cChar2Branch table.
// BR_END: the branch slot that the constructor resets to 0 on the backslash node.
54 const BRANCH_T BR_END
= 0;
// BR_NONALPHA: the value SetCharSet() writes in its first loop over the table,
// before letters and hotkey signs get their own branches.
55 const BRANCH_T BR_NONALPHA
= 1;
// BR_HOTKEY: assigned to the hotkey signs '~' and '&' in SetCharSet().
56 const BRANCH_T BR_HOTKEY
= 2;
// BR_BACKSLASH: value of the backslash node created in the constructor.
57 const BRANCH_T BR_BACKSLASH
= 3;
// BR_ALPHABASE: branch of 'a'/'A'; SetCharSet() maps the remaining ASCII letters to
// BR_ALPHABASE + (letter - 'a'), so 'z'/'Z' end up on branch 29.
58 const BRANCH_T BR_ALPHABASE
= 4; /// @ATTENTION All branches not valid for words must be smaller than this value!
// BR_AE, BR_OE, BR_UE, BR_SZ: branches of the special characters that SetCharSet()
// converts from the MS-1252 bytes C4/E4, D6/F6, DC/FC and DF; the upper- and
// lower-case byte of each pair share one branch.
59 const BRANCH_T BR_AE
= 30;
60 const BRANCH_T BR_OE
= 31;
61 const BRANCH_T BR_UE
= 32;
62 const BRANCH_T BR_SZ
= 33;
// BR_MAX: the constructor checks that this equals C_NR_OF_BRANCHES.
63 const BRANCH_T BR_MAX
= 34; /// @ATTENTION Must be updated always!
// BR_START: value given to the top node of the parsing tree.
65 const BRANCH_T BR_START
= 0;
// Sets up a WordTransTree for the character set i_nWorkingCharSet: builds the
// initial parsing tree, fills the character table via SetCharSet() and finally
// cross-checks the BR_* constants of this file against C_BR_ALPHABASE and
// C_NR_OF_BRANCHES.
71 WordTransTree::WordTransTree(CharSet i_nWorkingCharSet
)
// Upper- and lower-case umlaut constants, given as single-byte values
// (0xC4, 0xD6, 0xDC and 0xE4, 0xF6, 0xFC).
80 c_AE(u_char('\xC4')), c_OE(u_char('\xD6')), c_UE(u_char('\xDC')),
81 c_ae(u_char('\xE4')), c_oe(u_char('\xF6')), c_ue(u_char('\xFC')),
82 pInputCurTokenStart(0),
// '~' is the hotkey sign until Handle_Hotkey() records the one found in the input.
88 cCurHotkeySign(u_char('~'))
90 // Initialize parsing tree:
91 pUnknownAlpha
= new WTT_Node(BR_ALPHABASE
,0,0); // This will be deleted as part of the parsing tree.
// Every branch of pUnknownAlpha from BR_ALPHABASE upwards points back to
// pUnknownAlpha itself.
92 for ( UINT8 i
= BR_ALPHABASE
; i
< C_NR_OF_BRANCHES
; i
++)
94 pUnknownAlpha
->SetBranch(i
,pUnknownAlpha
);
// Top node of the tree; pUnknownAlpha is passed as its third constructor argument.
// NOTE(review): the 2nd/3rd WTT_Node arguments look like default successors for
// non-letter and letter branches - confirm in wtranode.hxx.
97 dpParsingTreeTop
= new WTT_Node(BR_START
,0,pUnknownAlpha
);
// Node for non-alphabetic characters: it loops to itself on BR_NONALPHA and is
// entered from the top node on BR_NONALPHA.
99 WTT_Node
* dpNonAlpha
= new WTT_Node(BR_NONALPHA
,0,0);
101 dpNonAlpha
->SetBranch(BR_NONALPHA
,dpNonAlpha
);
102 dpParsingTreeTop
->SetBranch(BR_NONALPHA
,dpNonAlpha
);
// Backslash node: constructed with dpNonAlpha for both node arguments, its BR_END
// branch is cleared, and it is reachable on BR_BACKSLASH from the top node and
// from dpNonAlpha.
104 WTT_Node
* dpBackslash
= new WTT_Node(BR_BACKSLASH
,dpNonAlpha
,dpNonAlpha
);
105 dpBackslash
->SetBranch(BR_END
,0);
107 dpParsingTreeTop
->SetBranch(BR_BACKSLASH
,dpBackslash
);
108 dpNonAlpha
->SetBranch(BR_BACKSLASH
,dpBackslash
);
111 // Initialize character set:
112 SetCharSet(i_nWorkingCharSet
);
// Consistency check: a mismatch between the BR_* constants of this file and the
// C_* constants is reported on stderr.
114 if (C_BR_ALPHABASE
!= BR_ALPHABASE
|| C_NR_OF_BRANCHES
!= BR_MAX
)
116 fprintf(stderr
, "Assertion failed: file %s line %d.", __FILE__
, __LINE__
);
// Fills cChar2Branch, the table that maps a character value to a branch index,
// for the character set i_nWorkingCharSet.
122 WordTransTree::SetCharSet(CharSet i_nWorkingCharSet
)
// The seven special characters are written as MS-1252 bytes and converted to the
// working character set before they are used as table indices further down.
// NOTE(review): pConvert is only safe to use if Convert() returns a reference to
// sConvert rather than a temporary - confirm against the ByteString declaration.
// NOTE(review): indexing pConvert[0..6] assumes the converted string still has one
// byte per character, i.e. a single-byte working character set - confirm.
124 ByteString
sConvert("\xC4\xD6\xDC\xE4\xF6\xFC\xDF");
125 const u_char
* pConvert
= (const u_char
* ) ( sConvert
.Convert(RTL_TEXTENCODING_MS_1252
, i_nWorkingCharSet
).GetBuffer() );
// Default: table entries are set to the non-alphabetic branch.
128 for ( ; i
< C_NR_OF_POSSIBLE_CHARS
; ++i
)
130 cChar2Branch
[i
] = BR_NONALPHA
;
// ASCII letters: lower and upper case share one branch per letter.
132 for ( i
= 'a'; i
<= 'z'; ++i
)
134 cChar2Branch
[i
] = BR_ALPHABASE
+ i
- 'a';
136 for ( i
= 'A'; i
<= 'Z'; ++i
)
138 cChar2Branch
[i
] = BR_ALPHABASE
+ i
- 'A';
// Converted special characters: entries 0-2 and 3-5 of pConvert map pairwise to
// the same branches (BR_AE, BR_OE, BR_UE); entry 6 maps to BR_SZ.
140 cChar2Branch
[pConvert
[0]] = BR_AE
;
141 cChar2Branch
[pConvert
[1]] = BR_OE
;
142 cChar2Branch
[pConvert
[2]] = BR_UE
;
143 cChar2Branch
[pConvert
[3]] = BR_AE
;
144 cChar2Branch
[pConvert
[4]] = BR_OE
;
145 cChar2Branch
[pConvert
[5]] = BR_UE
;
146 cChar2Branch
[pConvert
[6]] = BR_SZ
;
// Both '~' and '&' are treated as hotkey signs.
148 cChar2Branch
[u_char('~')] = BR_HOTKEY
;
149 cChar2Branch
[u_char('&')] = BR_HOTKEY
;
// Releases the parsing tree through its top node.
// NOTE(review): presumably WTT_Node's destructor frees the sub-nodes (the
// constructor comment says pUnknownAlpha is deleted as part of the tree) -
// confirm in wtranode.hxx.
160 WordTransTree::~WordTransTree()
162 delete dpParsingTreeTop
;
// Registers i_sReplaceString as the replacement for the word i_sOldString by
// walking the parsing tree along the characters of i_sOldString and creating the
// nodes that do not exist yet.
168 WordTransTree::AddWordPair( const ByteString
& i_sOldString
,
169 const ByteString
& i_sReplaceString
)
// An empty search string is checked for before the tree is touched.
171 if (i_sOldString
.Len() == 0)
// Start at the top of the tree.
174 pCurParseNode
= dpParsingTreeTop
;
175 WTT_Node
* pBranch
= 0;
// One step per character of the old string.
178 for ( constr pOld
= i_sOldString
.GetBuffer();
182 cBranch
= CalculateBranch(*pOld
);
183 pBranch
= pCurParseNode
->GetNextNode(cBranch
);
// A missing branch, or one that still points at the shared pUnknownAlpha node,
// gets a node of its own.
184 if (pBranch
== 0 || pBranch
== pUnknownAlpha
)
186 pBranch
= new WTT_Node(cBranch
,0,pUnknownAlpha
);
187 pCurParseNode
->SetBranch(cBranch
,pBranch
);
189 pCurParseNode
= pBranch
;
// The node reached last is marked as a token to be replaced by i_sReplaceString.
191 pCurParseNode
->SetAsTokenToReplace(i_sReplaceString
);
// Prepares a transformation run over the i_nInputLength characters at i_sInput
// and makes sure the output buffer has room for i_nOutputMaxLength characters.
195 WordTransTree::InitTransformation( const char * i_sInput
,
196 UINT32 i_nInputLength
,
197 UINT32 i_nOutputMaxLength
)
// The input is not copied; only pointers into the caller's buffer are kept.
199 sInput
= (const u_char
*)i_sInput
;
200 nInputLength
= i_nInputLength
;
// pInputEnd points one past the last input character.
201 pInputEnd
= &sInput
[i_nInputLength
];
203 pInputCurTokenStart
= sInput
;
204 pInputPosition
= sInput
;
// A new output buffer is allocated only if the current capacity is too small.
// NOTE(review): make sure the previous sOutput buffer is released before this
// allocation - that statement is not visible here.
206 if (nOutputMaxLength
< i_nOutputMaxLength
)
210 sOutput
= new unsigned char[i_nOutputMaxLength
];
211 nOutputMaxLength
= i_nOutputMaxLength
;
// Writing starts at the beginning of the output buffer.
213 pOutputPosition
= sOutput
;
216 /** pInputCurTokenStart and pCurParseNode are updated just when
217 starting this function. After its end they must not be changed
218 till this function is called again.
219 Outside this function pInputPosition and pOutputPosition are both
220 on the first not yet transformed char in their respective array. */
// Parses the next token of the input, starting at pInputPosition, by walking the
// parsing tree character by character. The finished token is handed to
// Handle_TokenToKeep() or Handle_TokenToTransform(), depending on the TokenType()
// of the node that was reached.
222 WordTransTree::E_Result
223 WordTransTree::TransformNextToken()
// A new token starts where the previous one ended; parsing restarts at the top node.
225 pInputCurTokenStart
= pInputPosition
;
226 pCurParseNode
= dpParsingTreeTop
;
230 WTT_Node
* pBranch
= 0;
// Walk the input until its end is reached.
233 for ( pCurParseNode
= dpParsingTreeTop
;
234 pInputPosition
!= pInputEnd
;
// Branch of the current character and the node it leads to.
237 cBranch
= CalculateBranch(*pInputPosition
);
238 pBranch
= pCurParseNode
->GetNextNode( cBranch
);
241 pCurParseNode
= pBranch
;
245 if (cBranch
== BR_HOTKEY
) // current letter is '~' or '&'.
247 // Logic of the following. There are 9 possible cases -
248 // A = alphabetic letter, NA = non alphabetic, TB = token begin,
249 // Eot = end of text:
250 // 1. A~A set hotkey to following letter, continue
252 // 3. A~Eot token end
256 // 7. TB~A set hotkey to following letter, continue
258 // 9. TB~Eot continue
260 // bNext and Prev are true, if there are alphabetic letters:
// The look-ahead is guarded so that pInputPosition[1] is only read when it lies
// before pInputEnd.
261 BOOL bNext
= pInputPosition
+ 1 != pInputEnd
262 ? CalculateBranch(pInputPosition
[1]) >= BR_ALPHABASE
264 BOOL bPrev
= pCurParseNode
->Value() >= BR_ALPHABASE
;
// A letter follows and the sign sits inside a word or at the token begin.
266 if ( bNext
&& (bPrev
|| pCurParseNode
== dpParsingTreeTop
) )
271 else if (!bPrev
&& !bNext
)
272 { // case 5.,6.,8.,9.
277 // so this should be handled as an end of a token.
// End of token inside the text: keep the token or replace it.
279 if (pCurParseNode
->TokenType() == WTT_Node::token_to_keep
)
281 Handle_TokenToKeep();
286 Handle_TokenToTransform();
288 } // endif (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
289 } // endif (pBranch == 0) else
292 // If here, the text end is reached
293 if (pCurParseNode
->TokenType() == WTT_Node::token_to_keep
)
295 Handle_TokenToKeep();
300 Handle_TokenToTransform();
// Returns the replace string stored in the node the parser currently stands on
// (pCurParseNode).
306 WordTransTree::CurReplacingString() const
308 return pCurParseNode
->ReplaceString();
// Records the hotkey of the current token. pInputPosition is expected to stand on
// the hotkey sign: the character after it becomes cCurHotkey (0 if the sign is the
// last input character) and the sign itself becomes cCurHotkeySign.
312 WordTransTree::Handle_Hotkey()
314 if (cCurHotkey
== 0) // Avoid to replace the first found hotkey by
315 // a later one - though this shouldn't happen anyway.
317 cCurHotkey
= (pInputPosition
+1) != pInputEnd
? pInputPosition
[1] : 0;
318 cCurHotkeySign
= *pInputPosition
;
// Copies the current input token, i.e. the characters from pInputCurTokenStart up
// to (not including) pInputPosition, unchanged to the output and terminates the
// output with '\0'.
// NOTE(review): no check against nOutputMaxLength is visible here; the output
// buffer must have room for the token plus the terminator - confirm at the caller.
323 WordTransTree::Handle_TokenToKeep()
325 UINT32 nTokenLength
= pInputPosition
-pInputCurTokenStart
;
327 memcpy(pOutputPosition
,pInputCurTokenStart
,nTokenLength
);
// The terminator is written at the new output position, which is not advanced past it.
329 pOutputPosition
+= nTokenLength
;
330 *pOutputPosition
= '\0';
// Writes the replace string of the current parse node to the output. If the input
// token carried a hotkey, the hotkey sign is re-inserted in front of the matching
// letter of the replace string; if that letter cannot be found, eCurResult is set
// to HOTKEY_LOST. An upper-case first letter of the input token is carried over to
// the first letter of the output.
334 WordTransTree::Handle_TokenToTransform()
// A hotkey only counts if the recorded character maps to a letter branch.
336 BOOL bHaveHotkey
= CalculateBranch(cCurHotkey
) >= BR_ALPHABASE
;
337 const ByteString
& rReplace
= pCurParseNode
->ReplaceString();
339 // Find position of hotkey in replace-string:
340 USHORT nHotkeyPos
= bHaveHotkey
341 ? rReplace
.Search(char(cCurHotkey
))
// Not found in the same case: retry with the opposite case - ASCII letters via
// toupper/tolower, umlauts via the c_* constants.
343 if (nHotkeyPos
== STRING_NOTFOUND
&& bHaveHotkey
)
345 if (cCurHotkey
< 128)
347 if (islower(cCurHotkey
))
348 nHotkeyPos
= rReplace
.Search(toupper(char(cCurHotkey
)));
350 nHotkeyPos
= rReplace
.Search(tolower(char(cCurHotkey
)));
352 else // cCurHotkey >= 128
354 if (cCurHotkey
== c_ae
)
355 nHotkeyPos
= rReplace
.Search(char(c_AE
));
356 else if (cCurHotkey
== c_oe
)
357 nHotkeyPos
= rReplace
.Search(char(c_OE
));
358 else if (cCurHotkey
== c_ue
)
359 nHotkeyPos
= rReplace
.Search(char(c_UE
));
360 else if (cCurHotkey
== c_AE
)
361 nHotkeyPos
= rReplace
.Search(char(c_ae
));
362 else if (cCurHotkey
== c_OE
)
363 nHotkeyPos
= rReplace
.Search(char(c_oe
));
364 else if (cCurHotkey
== c_UE
)
365 nHotkeyPos
= rReplace
.Search(char(c_ue
));
366 } // endif (cCurHotkey < 128) else
// Still not found: record that the hotkey could not be placed.
368 if (nHotkeyPos
== STRING_NOTFOUND
)
370 eCurResult
= HOTKEY_LOST
;
373 } // endif (nHotkeyPos == STRING_NOT_FOUND && bHaveHotkey)
// One extra output character is counted for the hotkey sign.
376 UINT32 nOutputTokenLength
= rReplace
.Len() + (bHaveHotkey
? 1 : 0);
// Copy with hotkey sign: replace string up to the hotkey letter, the sign at
// offset nHotkeyPos, then the rest of the replace string behind it.
380 memcpy( pOutputPosition
,
381 pCurParseNode
->ReplaceString().GetBuffer(),
383 *(pOutputPosition
+ nHotkeyPos
) = cCurHotkeySign
;
384 memcpy( pOutputPosition
+ nHotkeyPos
+ 1,
385 pCurParseNode
->ReplaceString().GetBuffer() + nHotkeyPos
,
386 nOutputTokenLength
- nHotkeyPos
- 1);
// Copy without hotkey sign: the replace string is copied in one piece.
390 memcpy( pOutputPosition
,
391 pCurParseNode
->ReplaceString().GetBuffer(),
392 nOutputTokenLength
);
395 // Convert first letter into upper if necessary:
// cInStart is the first letter of the input token; a leading hotkey sign is skipped.
396 u_char cInStart
= CalculateBranch(*pInputCurTokenStart
) == BR_HOTKEY
397 ? pInputCurTokenStart
[1]
398 : pInputCurTokenStart
[0] ;
// pOutStart is the first letter of the output; a hotkey sign at offset 0 is skipped.
399 u_char
* pOutStart
= nHotkeyPos
== 0
400 ? pOutputPosition
+ 1
402 if (isupper(cInStart
) || cInStart
> 127)
403 { // Possibly cInStart is upper character:
404 if (isupper(cInStart
) || cInStart
== c_AE
|| cInStart
== c_OE
|| cInStart
== c_UE
)
405 { // Surely cInStart is upper character:
406 u_char cOutStart
= *pOutStart
;
408 *pOutStart
= toupper(cOutStart
);
409 else if (cOutStart
== c_ae
)
411 else if (cOutStart
== c_oe
)
413 else if (cOutStart
== c_ue
)
416 } // endif (isupper(cInStart) || cInStart > 127)
// Advance the output position and keep the output terminated with '\0'.
418 pOutputPosition
+= nOutputTokenLength
;
419 *pOutputPosition
= '\0';