1 // Implementation of the CStem class’s word functions.
2 // Copyright © 2009 The University of Chicago
5 #include <Q3TextStream>
6 #include "MiniLexicon.h"
10 #include "StringFunc.h"
12 // Get the number of stems
// Computes the number of stems for this word, starting from
// m_NumberOfStems. If that stored value is below 2 while m_Stem2Loc is
// non-zero, the local count is raised to 2.
// NOTE(review): the braces and the return statement are not visible in
// this listing; presumably `count` is the value returned -- confirm.
17 int CStem::GetNumberOfStems() const
19 int count
= m_NumberOfStems
;
// A non-zero second-stem location implies at least two stems.
20 if (count
< 2 && m_Stem2Loc
) { count
= 2; }
25 // Find out if this word has a prefix
28 // bool - true if a prefix has been
// Reports whether a prefix location is recorded: yields true when
// m_PrefixLoc is greater than zero.
// NOTE(review): the fall-through return is not visible in this listing;
// presumably it returns false -- confirm.
31 bool CStem::HasAPrefix() const
33 if (m_PrefixLoc
> 0) return true;
38 // Increment the marked location of the suffix
// Advances m_SuffixLoc by one, but only when it is currently non-zero;
// a zero m_SuffixLoc is left untouched.
40 void CStem::IncrementSuffixLocs()
42 if (m_SuffixLoc
) m_SuffixLoc
++;
46 // Find out if this word has a marked
50 // bool - true if there is a
// Const query returning bool.
// NOTE(review): the body of this function is not present in this listing,
// so its exact test cannot be documented here -- presumably it mirrors
// HasAPrefix() using the suffix location; confirm against the full source.
53 bool CStem::HasASuffix() const
66 // Shift the stem/suffix boundary n spaces
70 // n - number of spaces to shift, a
71 // negative value is legal and shifts
// Moves the stem/suffix cut by n positions by adjusting the cut-point
// stored in m_Pieces[m_StemLoc].
//   n - signed shift amount; the trailing `else` below is labelled
//       "shift to the left", so the first branch handles the other sign.
// NOTE(review): the condition that selects between the two branches, the
// braces, and the bodies of several `if`s are not visible in this listing.
74 void CStem::ShiftStemSuffixBoundary (int n
) // how many positions to the right
76 // QString f = "fixador";
77 // Q_ASSERT (CStringSurrogate(m_Key,0,GetKeyLength()) != CStringSurrogate(f.unicode(),0,f.length()) );
78 CStringSurrogate ssSuffix
;
// Case: no suffix location recorded (action taken is not visible here).
82 if ( m_SuffixLoc
== 0)
// Fetch the current suffix piece so its length can be compared with n.
88 ssSuffix
= GetPiece( m_SuffixLoc
);
// Case: suffix shorter than the requested shift (action not visible here).
90 if ( ssSuffix
.GetLength() < (int) n
)
// Case: the shift consumes the suffix exactly -- the split is cleared.
95 if ( ssSuffix
.GetLength() == (int) n
) // changed Oct 2002
97 ClearRootSuffixSplit();
// Otherwise move the cut stored for the stem piece by n.
102 m_Pieces
[m_StemLoc
] += n
;
106 else // shift to the left...
// With no suffix recorded yet, stem and suffix slots are set to 1 and 2
// and a new cut is made at GetKeyLength() + n.
108 if ( m_SuffixLoc
== 0 )
110 m_StemLoc
= 1; // this is too simple -- prefixes --
111 m_SuffixLoc
= m_StemLoc
+ 1;
112 CutRightBeforeHere ( GetKeyLength() + n
); // sept 28 2002
// Existing cut: move it by n (n is added as-is in this branch too).
116 m_Pieces
[m_StemLoc
] += n
;
120 // Shift the prefix/stem boundary n spaces
124 // n - number of spaces to shift, a
125 // negative value is legal and shifts
// Moves the prefix/stem cut by n positions.
//   n - signed shift amount; the trailing `else` below is labelled
//       "negative number means shift to the right".
// NOTE(review): the branch-selecting condition, the braces, and the bodies
// of several `if`s are not visible in this listing.
// NOTE(review): the first branch adjusts m_Pieces[m_StemLoc] while the
// `else` branch adjusts m_Pieces[m_PrefixLoc]; confirm that the differing
// index is intentional.
128 void CStem::ShiftPrefixStemBoundary (int n
) // how many positions to the left
131 CStringSurrogate ssPrefix
;
// Case: no prefix location recorded (action taken is not visible here).
135 if ( m_PrefixLoc
== 0)
// Fetch the current prefix piece so its length can be compared with n.
141 ssPrefix
= GetPiece( m_PrefixLoc
);
// Case: prefix shorter than the requested shift (action not visible here).
143 if ( ssPrefix
.GetLength() < (int) n
)
// Case: the shift consumes the prefix exactly -- the split is cleared.
148 if ( ssPrefix
.GetLength() == (int) n
) // changed Oct 2002
150 ClearPrefixStemSplit();
// Otherwise pull the stored cut back by n.
155 m_Pieces
[m_StemLoc
] -= n
;
159 else // negative number means shift to the right
// With no prefix recorded yet, a new cut is made at position -n.
161 if ( m_PrefixLoc
== 0 )
165 CutRightBeforeHere ( -1 *n
); // sept 28 2002
// Existing cut: advance the prefix cut by -n.
169 m_Pieces
[m_PrefixLoc
] += -1 * n
;
174 // Get the marked stem of the word
177 // CStringSurrogate - a surrogate string
// Returns a surrogate for this word's stem.
// If m_strStem has a non-zero key length, a surrogate over m_strStem is
// returned; the remaining returns yield either the piece at m_StemLoc or
// an empty CStringSurrogate.
// NOTE(review): the condition choosing between the last two returns is not
// visible in this listing. Also note this accessor is not declared const,
// unlike GetSuffix() and GetPrefix().
180 CStringSurrogate
CStem::GetStem( )
// An explicitly stored stem string takes precedence.
182 if (m_strStem
.GetKeyLength() > 0) {
183 return CStringSurrogate(m_strStem
);
186 return GetPiece( m_StemLoc
);
189 return CStringSurrogate();
193 // Get the marked suffix of the word
196 // CStringSurrogate - a surrogate string
// Returns a surrogate for this word's suffix.
// If m_strSuffix has a non-zero key length, a surrogate over m_strSuffix
// is returned; the remaining returns yield either the piece at m_SuffixLoc
// or an empty CStringSurrogate.
// NOTE(review): the condition choosing between the last two returns is not
// visible in this listing.
199 CStringSurrogate
CStem::GetSuffix() const
// An explicitly stored suffix string takes precedence.
202 if (m_strSuffix
.GetKeyLength() > 0) {
203 return CStringSurrogate (m_strSuffix
);
206 return GetPiece( m_SuffixLoc
);
209 return CStringSurrogate();
213 // Get the marked prefix of the word
216 // CStringSurrogate - a surrogate string
// Returns a surrogate for this word's prefix: either the piece at
// m_PrefixLoc or an empty CStringSurrogate.
// NOTE(review): the condition choosing between the two returns is not
// visible in this listing.
219 CStringSurrogate
CStem::GetPrefix() const
222 return GetPiece( m_PrefixLoc
);
224 return CStringSurrogate();
228 // Attach a new prefix signature
231 // pSig = pointer to new signature
// Replaces the stored prefix signature pointer with pSig.
//   pSig - the signature to store (stored as given; no null check is made).
// If a different, non-null signature is currently stored, this word is
// first detached from it via DetachWord(this, eDo_Not_Call_Words).
233 void CStem::AttachPrefixSignature(CSignature
* pSig
)
// Only notify the old signature when it exists and is actually changing.
235 if (m_pPrefixSignature
!= 0 && m_pPrefixSignature
!= pSig
)
236 m_pPrefixSignature
->DetachWord(this,
237 CSignature::eDo_Not_Call_Words
);
238 m_pPrefixSignature
= pSig
;
241 // Attach a new suffix signature
244 // pSig = pointer to new signature
// Replaces the stored suffix signature pointer with pSig.
//   pSig - the signature to store (stored as given; no null check is made).
// If a different, non-null signature is currently stored, this word is
// first detached from it via DetachWord(this, eDo_Not_Call_Words).
246 void CStem::AttachSuffixSignature(CSignature
* pSig
)
// Only notify the old signature when it exists and is actually changing.
248 if (m_pSuffixSignature
!= 0 && m_pSuffixSignature
!= pSig
)
249 m_pSuffixSignature
->DetachWord(this,
250 CSignature::eDo_Not_Call_Words
);
251 m_pSuffixSignature
= pSig
;
254 // Attach a new stem and attach this word
258 // pStem - point to new stem
// Re-homes this word onto pStem (suffixal analysis).
//   pStem - the stem to attach to; it is dereferenced without a visible
//           null check.
// When a different stem is currently attached, this word is removed from
// that stem's word list and m_pSuffix is detached from it. Afterwards the
// word is added to pStem and the word/corpus counters are incremented
// through m_pStem.
// NOTE(review): the assignment of m_pStem to pStem is not visible in this
// listing (lines are missing between the detach and AddWord); the counter
// updates rely on it -- confirm.
// NOTE(review): unlike AttachWordAndPrefixalStem, the result of AddWord is
// not tested before the counters are bumped -- confirm this is intended.
260 void CStem::AttachWordAndSuffixalStem(CStem
* pStem
)
// Detach from the previous stem only if there is one and it differs.
262 if (m_pStem
&& m_pStem
!= pStem
)
264 m_pStem
->RemoveWordFromWordPtrList( this );
265 m_pStem
->DetachSuffix ( m_pSuffix
);
270 pStem
->AddWord (this); // it checks whether this is on pStem's list yet.
271 m_pStem
->IncrementWordCount();
// Corpus count is raised by this word's count minus one.
272 m_pStem
->IncrementCorpusCount( GetCorpusCount() - 1 );
// Re-homes this word onto pStem (prefixal analysis).
//   pStem - the stem to attach to; it is dereferenced without a visible
//           null check.
// When a different stem is currently attached, this word is removed from
// that stem's word list and m_pPrefix is detached from it. The word/corpus
// counters are then incremented through m_pStem, guarded by the result of
// pStem->AddWord(this).
// NOTE(review): the assignment of m_pStem to pStem and the braces are not
// visible in this listing -- confirm where m_pStem is updated.
278 void CStem::AttachWordAndPrefixalStem(CStem
* pStem
)
// Detach from the previous stem only if there is one and it differs.
280 if (m_pStem
&& m_pStem
!= pStem
)
282 m_pStem
->RemoveWordFromWordPtrList( this );
283 m_pStem
->DetachPrefix( m_pPrefix
); //todo this is causing a problem which
284 //I can't identify -- but I think it should be there. jg
// Counters are only touched when AddWord reports a truthy result.
290 if( pStem
->AddWord (this) )
292 m_pStem
->IncrementWordCount();
// Corpus count is raised by this word's count minus one.
293 m_pStem
->IncrementCorpusCount( GetCorpusCount() - 1 );
299 // Attach a new stem and new prefix and
300 // attach this word to both
303 // pNewStem - the new stem
304 // pNewPrefix - the new prefix
// Switches this word to a new stem and a new prefix.
//   pNewStem   - stem to attach via AttachWordAndPrefixalStem
//   pNewPrefix - prefix that replaces m_pPrefix
// Sequence visible here: the old prefix has this word's corpus count
// subtracted and the current stem (pointer and key string) removed; the
// word is attached to pNewStem; m_pPrefix is replaced; stem and prefix are
// linked to each other; the new prefix's use and corpus counts are bumped.
// NOTE(review): no guard on m_pPrefix / m_pStem being non-null is visible
// in this listing (lines are missing before the first call) -- confirm.
306 void CStem::AttachWordStemAndPrefix(CStem
* pNewStem
, CPrefix
* pNewPrefix
)
309 // Sending message to the old Prefix that it's being dropped.
312 m_pPrefix
->IncrementCorpusCount ( -GetCorpusCount() );
313 m_pPrefix
->RemoveFromStemPtrList ( m_pStem
);
314 m_pPrefix
->RemoveStemString ( m_pStem
->GetKey() );
317 AttachWordAndPrefixalStem(pNewStem
); // Also increments stem counts
319 m_pPrefix
= pNewPrefix
;
// Cross-link the (now current) stem and the new prefix.
320 m_pStem
->AddPrefix( m_pPrefix
);
321 m_pPrefix
->AddStem( m_pStem
);
323 m_pPrefix
->IncrementUseCount(); // July 2003
// Corpus count is raised by this word's count minus one.
324 m_pPrefix
->IncrementCorpusCount( GetCorpusCount() - 1 );
330 // Attach a new stem and new suffix and
331 // attach this word to both
334 // pNewStem - the new stem
335 // pNewSuffix - the new suffix
// Switches this word to a new stem and a new suffix.
//   pNewStem   - stem to attach via AttachWordAndSuffixalStem
//   pNewSuffix - suffix that replaces m_pSuffix
// Sequence visible here: the old suffix has this word's corpus count
// subtracted and the current stem (pointer and key string) removed; the
// word is attached to pNewStem; m_pSuffix is replaced; stem and suffix are
// linked to each other; the new suffix's use and corpus counts are bumped.
// NOTE(review): no guard on m_pSuffix / m_pStem being non-null is visible
// in this listing (lines are missing before the first call) -- confirm.
337 void CStem::AttachWordStemAndSuffix(CStem
* pNewStem
, CSuffix
* pNewSuffix
)
340 // Sending message to the old Suffix that it's being dropped.
343 m_pSuffix
->IncrementCorpusCount ( -GetCorpusCount() );
344 m_pSuffix
->RemoveFromStemPtrList ( m_pStem
);
345 m_pSuffix
->RemoveStemString ( m_pStem
->GetKey() );
348 AttachWordAndSuffixalStem(pNewStem
); // Also increments stem counts
350 m_pSuffix
= pNewSuffix
;
// Cross-link the (now current) stem and the new suffix.
351 m_pStem
->AddSuffix( m_pSuffix
);
352 m_pSuffix
->AddStem( m_pStem
);
354 m_pSuffix
->IncrementUseCount(); // July 2003
// Corpus count is raised by this word's count minus one.
355 m_pSuffix
->IncrementCorpusCount( GetCorpusCount() - 1 );
362 // Find out if this is a valid word
365 // bool - true if the word is valid
// Sanity-checks this word's internal state.
// Asserts (Q_ASSERT) that m_StemType is non-negative, that the first cut
// is positive whenever there are two or more pieces, and that the last cut
// equals the key length. Returns false when m_StemType is negative;
// otherwise walks pieces 1..m_PieceCount and returns true.
// NOTE(review): the declaration of `appnd` and whatever is done with it
// after the loop are not visible in this listing; the existing TODO on the
// final return suggests the loop result is not actually checked.
367 bool CStem::IsValid() const
369 Q_ASSERT (m_StemType
>= 0);
370 Q_ASSERT (m_PieceCount
< 2 || m_Pieces
[1] > 0 );
371 Q_ASSERT ( m_Pieces
[m_PieceCount
] == GetKeyLength() );
// Runtime counterpart of the first assertion.
372 if ( m_StemType
< 0 ) return false;
// Pieces are 1-based; each one is passed to LxStrCpy with its own length.
376 for (int i
= 1; i
<= m_PieceCount
; i
++)
378 CStringSurrogate SS
= GetPiece(i
);
379 LxStrCpy( &SS
, appnd
, SS
.GetLength() );
383 return true; // TODO: ???? what's being tested
387 // Delete the affix/stem separations
// Drops this word's affix/stem separation by calling
// SimplifyParseStructure().
// NOTE(review): further statements may follow in the full source; lines
// after this call are not visible in this listing.
389 void CStem::DeleteFactorization()
391 SimplifyParseStructure();
398 // Clear the pointers to stem, suffix, and
// Resets pointer members; the two visible statements set
// m_pSuffixSignature and m_pPrefixSignature to NULL. The pointed-to
// signatures are not notified or deleted by the visible statements.
// NOTE(review): lines before these assignments are missing from this
// listing; presumably other pointers are cleared there too -- confirm.
401 void CStem::ClearPointers()
406 m_pSuffixSignature
= NULL
;
407 m_pPrefixSignature
= NULL
;
411 // Get the successor frequency
414 // n - the position before the break
417 // int - the successor frequency
// Successor-frequency query for position n.
//   n - position in the key; values greater than GetKeyLength() yield -1.
// The trie lookup that would compute the value for in-range n is commented
// out below.
// NOTE(review): what is returned for in-range n is not visible in this
// listing -- confirm.
419 int CStem::SF(int n
) const // TODO: more descriptive name
421 CStringSurrogate ssPrefix
;
422 if( n
<= GetKeyLength() )
425 // TODO: uncomment lines when CNode and CLexicon are available
426 // CNode* pNode = m_Lexicon->GetWords()->GetTrie()->SearchForPrefix ( ssPrefix, Result );
427 // return pNode->GetWidth( );
// Out-of-range position.
430 else return -1; //TODO: is this change OK??
// Removes the prefix piece from this word.
// Requires (Q_ASSERT) that the prefix occupies piece slot 1. A new key
// buffer and a new cut-point array are allocated, filled from the old ones
// (cut-points are re-based by subtracting the old m_Pieces[1]), and swapped
// in; m_LengthOfPieceVector is decremented and a non-zero m_Stem2Loc is
// shifted down by one.
// Fix: m_Pieces is an int array (it is replaced here by a `new int[...]`
// buffer), so it is now released with `delete []` rather than scalar
// `delete`, which was a mismatched deallocation.
// NOTE(review): `Length` already excludes the prefix, yet the LxStrCpy
// count argument subtracts PrefixLength a second time -- confirm against
// LxStrCpy's parameter meaning.
// NOTE(review): NewPieces is sized m_PieceCount - 1 while the loop writes
// indices 0..m_LengthOfPieceVector-1 and reads m_Pieces up to index
// m_LengthOfPieceVector; confirm these bounds cannot overrun either array.
// NOTE(review): several lines (braces, other member updates) are not
// visible in this listing.
436 void CStem::DeletePrefix()
439 Q_ASSERT ( m_PrefixLoc
== 1 );
441 int PrefixLength
= ThisPieceLength (m_PrefixLoc
);
442 int Length
= GetKeyLength() - PrefixLength
;
// New key buffer: remaining characters plus one extra slot.
444 m_AllocatedLength
= GetKeyLength() - PrefixLength
+1 ;
445 QChar
* NewKey
= new QChar
[ m_AllocatedLength
];
446 LxStrCpy(m_Key
, NewKey
, Length
-PrefixLength
, PrefixLength
);
450 int* NewPieces
= new int [ m_PieceCount
- 1 ];
// Shift every cut one slot down and re-base it on the old first cut.
451 for (int i
= 1; i
<= m_LengthOfPieceVector
; i
++)
453 NewPieces
[i
-1] = m_Pieces
[i
] - m_Pieces
[1];
// Array form of delete: the buffer was obtained with new[].
456 delete [] m_Pieces
; m_Pieces
= NewPieces
;
457 delete [] m_Key
; m_Key
= NewKey
;
459 m_LengthOfPieceVector
--;
// A recorded second-stem slot moves down with the removed piece.
462 if (m_Stem2Loc
) { m_Stem2Loc
--; }
467 // Get a CParse copy of this word in broken
471 // CParse - <stem>.+.<suffix>
// Builds the displayed "broken" form of this word in m_BrokenForm and
// returns it by value (a CParse copy of *m_BrokenForm).
// m_BrokenForm is allocated on demand when it is NULL. One path appends
// stem, `plus`, and suffix; another appends the whole key
// (m_Key[0..m_KeyLength)).
// NOTE(review): the condition choosing between the two append paths, and
// any clearing of a previously built m_BrokenForm, are not visible in this
// listing -- confirm.
473 CParse
CStem::DisplayBrokenForm()
// Lazily create the cached parse object.
476 if ( m_BrokenForm
== NULL
)
478 m_BrokenForm
= new CParse();
// Analysed form: <stem> plus <suffix>.
481 m_BrokenForm
->Append ( GetStem() );
482 m_BrokenForm
->Append ( plus
);
483 m_BrokenForm
->Append ( GetSuffix() );
// Unanalysed form: the full key as a single piece.
487 m_BrokenForm
->Append ( CStringSurrogate(m_Key
,0,m_KeyLength
) );
491 return *m_BrokenForm
;
// Writes this word's record to a text stream.
//   outf   - destination stream
//   index  - not used in any statement visible in this listing
//   filter - passed through to the Display() calls on pieces and signatures
// Visible output, in order: corpus count, prefix location, stem location,
// suffix location, then a signature rendered with '.' as separator and
// followed by a space (prefix signature when there is no suffix location
// but a prefix location; suffix signature when there is a suffix location).
// NOTE(review): the declaration and output of `pieces`, field separators,
// and the no-prefix/no-suffix case are not visible in this listing.
495 void CStem::OutputWord(Q3TextStream
& outf
, int index
,
496 QMap
<QString
, QString
>* filter
)
// Concatenate the filtered display form of every piece (1-based).
500 for (int j
= 1; j
<= Size(); ++j
) {
501 pieces
+= GetPiece(j
).Display(filter
);
525 outf
<< GetCorpusCount();
529 outf
<< GetPrefixLoc();
533 outf
<< GetStemLoc();
537 outf
<< GetSuffixLoc();
// Pick which signature to print based on which affix location is set.
540 if (GetSuffixLoc() == 0) {
541 if (GetPrefixLoc() == 0)
543 else if (CSignature
* sig
= m_pPrefixSignature
)
544 outf
<< sig
->Display('.', filter
) << ' ';
547 } else if (CSignature
* sig
= m_pSuffixSignature
) {
548 outf
<< sig
->Display('.', filter
) << ' ';
// Reports whether this word has an analysis attached: returns TRUE as soon
// as either the suffix signature or the prefix signature pointer is set.
// NOTE(review): the fall-through return is not visible in this listing;
// presumably it returns FALSE -- confirm.
556 bool CStem::IsAnalyzed()
558 if (m_pSuffixSignature
) return TRUE
;
559 if (m_pPrefixSignature
) return TRUE
;