CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / SparseVector.cpp
blob70d76198cada1904ad4d3b9841b575d635780310
1 // Implementation of CSparseVector methods
2 // Copyright © 2009 The University of Chicago
3 #include "SparseVector.h"
5 struct BiData
7 int Integer;
8 float m_Values;
11 int CompareBiData (const void* , const void* );
13 CSparseVector::CSparseVector()
15 m_Length = 0;
16 m_NumberOfSlots = 10;
17 m_Values = new float [m_NumberOfSlots];
18 m_Dimension = new int [m_NumberOfSlots];
20 for (int i = 0; i < m_NumberOfSlots; i++)
22 m_Values[i] = 0;
23 m_Dimension[i] = 0;
25 m_NormalizedFlag = 0;
29 CSparseVector::CSparseVector ( int n)
31 int MinimumNumberOfSlots = 10;
32 if (n > MinimumNumberOfSlots)
34 m_NumberOfSlots = n ;
35 } else
37 m_NumberOfSlots = MinimumNumberOfSlots;
40 m_Length = n;
41 m_Values = new float [m_NumberOfSlots];
42 m_Dimension = new int [m_NumberOfSlots];
44 for (int i = 0; i < m_NumberOfSlots; i++)
46 m_Values[i] = 0;
47 m_Dimension[i] = 0;
49 m_NormalizedFlag = 0;
52 CSparseVector:: CSparseVector (CSparseVector& rhs)
55 m_NumberOfSlots = rhs.GetNumberOfSlots();
56 m_Length = rhs.GetLength();
58 int pos = 0;
59 while ( GetNextDimension ( m_Dimension[pos], m_Values[pos], pos ) )
61 // blank intentionally
64 m_NormalizedFlag = rhs.GetNormalizedFlag();
67 CSparseVector:: ~CSparseVector()
69 if (m_Values) {delete [] m_Values; }
70 if (m_Dimension) {delete [] m_Dimension;}
72 /* -colin-
73 void CSparseVector::Clear()
75 if (m_Length > 0)
77 for (int i = 0; i < m_NumberOfSlots; i++)
79 m_Values[i] = 0;
80 m_Dimension[i] = 0;
83 m_NormalizedFlag = 0;
85 */ /*
86 void CSparseVector::SetLength (int n)
88 m_Length = n;
89 m_NumberOfSlots = n;
90 if (m_Values)
92 delete m_Values;
94 m_Values = new float [n];
98 int CSparseVector:: GetLength() {return m_Length;}
100 /* -colin-
101 float CSparseVector::operator[] (int n) // uses internal index, not external --
103 if (n < 0 || n >= m_Length ) {return -1;}
104 else { return m_Values[n]; };
107 -colin- */
108 int CSparseVector::GetNumberOfSlots() {return m_NumberOfSlots;}
110 int CSparseVector::GetNextDimension (int& DimOut, float& Value, int& Position)
112 if ( Position < 0 ) { return 0; }
113 if ( Position >= m_Length ) { return 0; }
114 DimOut = m_Dimension[ Position ];
115 Value = m_Values [ Position ];
116 Position++;
117 return 1;
120 /* -colin-
121 float CSparseVector::GetAt (int dim) // dim is the value to the outside world
123 int n = ContainsDim(dim);
125 //if (n == -1 ) { return -1; }
127 if ( n== -1) { return 0; }
128 else { return m_Values[n]; };
131 void CSparseVector::operator= (CSparseVector& rhs)
134 m_Length = rhs.GetLength();
136 if ( rhs.GetLength() > m_NumberOfSlots )
138 if (m_Dimension) delete m_Dimension;
139 if (m_Values) delete m_Values;
140 m_NumberOfSlots = rhs.GetLength();
141 m_Dimension = new int [ m_NumberOfSlots ];
142 m_Values = new float [ m_NumberOfSlots ];
144 for (int i = 0; i<m_Length; i++)
146 m_Dimension[i] = rhs.m_Dimension[i];
147 m_Values[i] = rhs.m_Values[i];
152 int CSparseVector::operator== (CSparseVector& L2)
154 if (m_Length != L2.GetLength() ) {return 0;}
155 for (int i = 0; i < m_Length; i++)
157 if (m_Dimension[i] != L2.m_Dimension[i]) { return 0; }
158 if (m_Values[i] != L2[i] ) { return 0; }
160 return 1;
163 int CSparseVector::Delete (int Dim)
165 int InternalDim = -1;
166 if (m_Length == 0)
168 return -1;
170 for (int i = 0; i < m_Length; i++)
172 if ( m_Dimension[i] == Dim)
174 InternalDim = i;
175 break;
178 if (InternalDim < 0)
180 return -1;
182 for ( i = InternalDim; i< m_Length -1; i++)
184 m_Dimension[i] = m_Dimension[i+1];
185 m_Values[i] = m_Values[i+1];
187 m_Length--;
188 return 0;
191 int CSparseVector::GetTopDimension()
193 return m_Dimension[m_Length -1];
196 void CSparseVector::SetAt (int dimen, float value) // insert New at the
197 // right spot.
198 { int index,
199 RightSpot;
201 if ( dimen < 0 )
203 return;
205 // Find out if the dimension is already represented in this vector...
207 //--------------------------------------------------
208 index = ContainsDim (dimen);
209 if (index >= 0 ) // if it is already represented, fine...
211 m_Values[index] = value;
212 return;
215 //--------------------------------------------------
216 // Otherwise, we must add a slot to represent it.
218 m_Length++;
220 if (m_Length <= m_NumberOfSlots)
222 for (int i = 0; i < m_Length-1 && dimen > m_Dimension[i]; i++ )
224 // nothing here on purpose
226 RightSpot = i;
227 for (i = m_Length - 1; i > RightSpot; i--)
229 m_Dimension[i] = m_Dimension[i-1];
230 m_Values[i] = m_Values [i-1];
232 m_Values [RightSpot] = value;
233 m_Dimension [RightSpot] = dimen;
236 else
239 m_NumberOfSlots *= 2;
240 float * Newm_Values = new float[ m_NumberOfSlots ];
241 int * Newm_Dimension = new int [ m_NumberOfSlots ];
243 for (int i = 0; i< m_Length - 1 && dimen > m_Dimension[i] ; i++)
245 Newm_Dimension[i] = m_Dimension[i];
246 Newm_Values[i] = m_Values[i];
249 Newm_Values[i] = value;
250 Newm_Dimension[i] = dimen;
252 for (i++; i< m_Length; i++)
254 Newm_Dimension[i] = m_Dimension[i-1];
255 Newm_Values[i] = m_Values [i-1];
258 if (m_Values)
260 delete[] m_Dimension;
261 delete [] m_Values;
263 m_Values = Newm_Values;
264 m_Dimension = Newm_Dimension;
267 m_NormalizedFlag = 0;
270 void CSparseVector::operator() (int n, float val)
272 SetAt (n,val);
273 m_NormalizedFlag =0;
276 void CSparseVector::IncrementAt(int n, float val )
278 int dim = ContainsDim (n);
279 if ( dim >= 0 )
281 m_Values[dim] += val;
282 return;
283 } else // we assume that a value that was not present is taken to be zero.
285 SetAt (n, val);
287 m_NormalizedFlag =0;
291 int Overlap (CSparseVector& List1, CSparseVector& List2)
293 int overlap = 0;
294 int i = 0,
295 j = 0;
296 float Val1=0,
297 Val2=0;
299 while ( i < List1.GetLength() && j < List2.GetLength() )
301 Val1 = List1[i];
302 Val2 = List2[j];
303 if (Val1 == Val2)
305 overlap++; i++; j++;
307 else
309 if (Val1 < Val2 )
311 i++;
313 else
315 j++;
319 return overlap;
321 void CSparseVector::ClearOut()
323 for (int i = 0; i < m_NumberOfSlots; i++)
325 m_Values[i] = 0;
326 m_Dimension[i] = 0;
329 m_NormalizedFlag = 0;
330 m_Length = 0;
333 int CSparseVector::ContainsDim (int n)
334 { // take advantage of log Size: do this smarter. Do
335 // this in log n time, not n/2!
337 if (m_Length == 0) {return -1;}
339 for (int i = 0; i < m_Length; i++)
341 if ( m_Dimension[i] == n)
343 return i;
346 return -1;
352 ostream& CSparseVector::OutputToStream (CWordCollection& rWords, ostream& stream,int threshold)
355 BiData* SortArray = new BiData[ m_Length ];
357 for (int i = 0; i<m_Length; i++) {
359 SortArray[i].Integer = m_Dimension[i];
360 SortArray[i].m_Values = m_Values[i];
363 qsort ( (void*) SortArray, m_Length, sizeof (BiData ), CompareBiData );
365 for ( i = 0; i<m_Length ; i++)
367 if ( SortArray[i].m_Values < threshold ) {continue;}
368 if (SortArray[i].Integer > (int) rWords.GetCount() ||
369 SortArray[i].Integer < 0 )
370 {continue;}
371 stream
372 << setw(6) << SortArray[i].Integer
373 << setw(16) << rWords[ SortArray[i].Integer ]-> GetKey()
374 << setw (7) << SortArray[i].m_Values
375 << endl;
379 delete SortArray;
380 return stream;
385 int CompareBiData (const void* BiData1, const void* BiData2)
388 float S1 = (float) int( ((BiData*) BiData1)->m_Values );
389 float S2 = (float) int( ((BiData*) BiData2)->m_Values );
390 if (S2 > S1) {return 1;}
391 if (S2 < S1) {return -1;}
392 return 0;
396 void CSparseVector::MakeLogFreq (CWordCollection* Words)
397 { int WordNumber;
399 for (int i = 0; i < m_Length; i++ )
401 WordNumber = m_Dimension[i];
402 Q_ASSERT ( Words->GetAt (WordNumber) ->GetCorpusCount() > 0 );
403 m_Values[i] = float
404 ( log ( m_Values[i] ) -
405 log ( Words->GetAt (WordNumber)->GetCorpusCount() ) );
407 m_NormalizedFlag = 0;
410 void CSparseVector::Normalize()
411 { float Total = 0;
412 for (int i = 0; i < m_Length; i++)
414 Total += m_Values[i];
416 for (i=0; i < m_Length; i++)
418 m_Values[i] /= Total;
420 m_NormalizedFlag = 1;
422 -colin- */
423 int CSparseVector::GetNormalizedFlag() {return m_NormalizedFlag;}
424 /* -colin-
425 float CSparseVector::InnerProduct ( CSparseVector* Vector2)
427 float Total = 0,
428 ThatValue= 0;
429 int ThisDim=0;
431 for (int i = 0; i < m_Length; i++)
433 ThisDim = m_Dimension[i];
434 ThatValue = Vector2->GetAt(ThisDim);
435 if (ThatValue >= 0)
437 Total += m_Values[i] * ThatValue;
440 return Total;
442 -colin- */