Compute lucene-style scores for our hits.
[beagle.git] / Filters / FilterPPT.cs
blob7bcd911ba89f4d00f025ae6f74a87cddd72504b6
1 /* -*- Mode: csharp; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 //
3 // FilterPPT.cs
4 //
5 // Copyright (C) 2004 Novell, Inc.
6 //
8 //
9 // Permission is hereby granted, free of charge, to any person obtaining a
10 // copy of this software and associated documentation files (the "Software"),
11 // to deal in the Software without restriction, including without limitation
12 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 // and/or sell copies of the Software, and to permit persons to whom the
14 // Software is furnished to do so, subject to the following conditions:
16 // The above copyright notice and this permission notice shall be included in
17 // all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 // DEALINGS IN THE SOFTWARE.
29 using System;
30 using System.Collections;
31 using System.IO;
32 using System.Text;
33 using Gsf;
35 using Beagle.Daemon;
36 using Beagle.Util;
38 internal class RecordType
/// <summary>
/// Numeric record type codes. ParseElement casts the 16-bit value read
/// from a record header to this enum. Members prefixed with "Escher"
/// carry values of 0xf000 and above; all others are plain decimal codes.
/// Note that ExObjRefAtom and OEShapeAtom share the value 3009.
/// </summary>
public enum TypeCode {
	Unknown				= 0,

	// ---- 1000 range ----
	Document			= 1000,
	DocumentAtom			= 1001,
	EndDocument			= 1002,
	Slide				= 1006,
	SlideAtom			= 1007,
	Notes				= 1008,
	NotesAtom			= 1009,
	Environment			= 1010,
	SlidePersistAtom		= 1011,
	SSlideLayoutAtom		= 1015,
	MainMaster			= 1016,
	SSSlideInfoAtom			= 1017,
	SlideViewInfo			= 1018,
	GuideAtom			= 1019,
	ViewInfo			= 1020,
	ViewInfoAtom			= 1021,
	SlideViewInfoAtom		= 1022,
	VBAInfo				= 1023,
	VBAInfoAtom			= 1024,
	SSDocInfoAtom			= 1025,
	Summary				= 1026,
	DocRoutingSlip			= 1030,
	OutlineViewInfo			= 1031,
	SorterViewInfo			= 1032,
	ExObjList			= 1033,
	ExObjListAtom			= 1034,
	PPDrawingGroup			= 1035, //FIXME: Office Art File Format Docu
	PPDrawing			= 1036, //FIXME: Office Art File Format Docu
	NamedShows			= 1040, // don't know if container
	NamedShow			= 1041,
	NamedShowSlides			= 1042, // don't know if container

	// ---- 2000 range ----
	List				= 2000,
	FontCollection			= 2005,
	BookmarkCollection		= 2019,
	SoundCollAtom			= 2021,
	Sound				= 2022,
	SoundData			= 2023,
	BookmarkSeedAtom		= 2025,
	ColorSchemeAtom			= 2032,

	// ---- 3000 range ----
	ExObjRefAtom			= 3009,
	OEShapeAtom			= 3009,
	OEPlaceholderAtom		= 3011,
	GPointAtom			= 3024,
	GRatioAtom			= 3031,
	OutlineTextRefAtom		= 3998,
	TextHeaderAtom			= 3999,

	// ---- 4000 range ----
	TextCharsAtom			= 4000,
	StyleTextPropAtom		= 4001,
	BaseTextPropAtom		= 4002,
	TxMasterStyleAtom		= 4003,
	TxCFStyleAtom			= 4004,
	TxPFStyleAtom			= 4005,
	TextRulerAtom			= 4006,
	TextBookmarkAtom		= 4007,
	TextBytesAtom			= 4008,
	TxSIStyleAtom			= 4009,
	TextSpecInfoAtom		= 4010,
	DefaultRulerAtom		= 4011,
	FontEntityAtom			= 4023,
	FontEmbeddedData		= 4024,
	CString				= 4026,
	MetaFile			= 4033,
	ExOleObjAtom			= 4035,
	SrKinsoku			= 4040,
	HandOut				= 4041,
	ExEmbed				= 4044,
	ExEmbedAtom			= 4045,
	ExLink				= 4046,
	BookmarkEntityAtom		= 4048,
	ExLinkAtom			= 4049,
	SrKinsokuAtom			= 4050,
	ExHyperlinkAtom			= 4051,
	ExHyperlink			= 4055,
	SlideNumberMCAtom		= 4056,
	HeadersFooters			= 4057,
	HeadersFootersAtom		= 4058,
	TxInteractiveInfoAtom		= 4063,
	CharFormatAtom			= 4066,
	ParaFormatAtom			= 4067,
	RecolorInfoAtom			= 4071,
	ExQuickTimeMovie		= 4074,
	ExQuickTimeMovieData		= 4075,
	ExControl			= 4078,
	SlideListWithText		= 4080,
	InteractiveInfo			= 4082,
	InteractiveInfoAtom		= 4083,
	UserEditAtom			= 4085,
	CurrentUserAtom			= 4086,
	DateTimeMCAtom			= 4087,
	GenericDateMCAtom		= 4088,
	FooterMCAtom			= 4090,
	ExControlAtom			= 4091,
	ExMediaAtom			= 4100,
	ExVideo				= 4101,
	ExAviMovie			= 4102,
	ExMCIMovie			= 4103,
	ExMIDIAudio			= 4109,
	ExCDAudio			= 4110,
	ExWAVAudioEmbedded		= 4111,
	ExWAVAudioLink			= 4112,
	ExOleObjStg			= 4113,
	ExCDAudioAtom			= 4114,
	ExWAVAudioEmbeddedAtom		= 4115,
	AnimationInfoAtom		= 4116,
	RTFDateTimeMCAtom		= 4117,

	// ---- 5000 / 6000 / 10000 ranges ----
	ProgTags			= 5000, // don't know if container
	ProgStringTag			= 5001,
	ProgBinaryTag			= 5002,
	BinaryTagData			= 5003,
	PrintOptions			= 6000,
	PersistPtrFullBlock		= 6001, // don't know if container
	PersistPtrIncrementalBlock	= 6002, // don't know if container
	GScalingAtom			= 10001,
	GRColorAtom			= 10002,

	// ---- Escher records (0xf000 and above) ----
	EscherDggContainer		= 0xf000, /* Drawing Group Container */
	EscherDgg			= 0xf006,
	EscherCLSID			= 0xf016,
	EscherOPT			= 0xf00b,
	EscherBStoreContainer		= 0xf001,
	EscherBSE			= 0xf007,
	EscherBlip_START		= 0xf018, /* Blip types are between */
	EscherBlip_END			= 0xf117, /* these two values */
	EscherDgContainer		= 0xf002, /* Drawing Container */
	EscherDg			= 0xf008,
	EscherRegroupItems		= 0xf118,
	EscherColorScheme		= 0xf120, /* bug in docs */
	EscherSpgrContainer		= 0xf003,
	EscherSpContainer		= 0xf004,
	EscherSpgr			= 0xf009,
	EscherSp			= 0xf00a,
	EscherTextbox			= 0xf00c,
	EscherClientTextbox		= 0xf00d,
	EscherAnchor			= 0xf00e,
	EscherChildAnchor		= 0xf00f,
	EscherClientAnchor		= 0xf010,
	EscherClientData		= 0xf011,
	EscherSolverContainer		= 0xf005,
	EscherConnectorRule		= 0xf012, /* bug in docs */
	EscherAlignRule			= 0xf013,
	EscherArcRule			= 0xf014,
	EscherClientRule		= 0xf015,
	EscherCalloutRule		= 0xf017,
	EscherSelection			= 0xf119,
	EscherColorMRU			= 0xf11a,
	EscherDeletedPspl		= 0xf11d, /* bug in docs */
	EscherSplitMenuColors		= 0xf11e,
	EscherOleObject			= 0xf11f,
	EscherUserDefined		= 0xf122,
}
/// <summary>Numeric code identifying this record type.</summary>
public TypeCode typecode;

/// <summary>Human-readable name of the record type.</summary>
public string name;

/// <summary>True when ParseElement should recurse into the record's payload.</summary>
public bool is_container;

// NOTE(review): do_read, min_record_size and max_record_size are stored
// but never consulted by the code visible in this file; every table entry
// sets them to true, -1 and -1. Presumably -1 means "no limit" - confirm.
public bool do_read;
public int min_record_size;
public int max_record_size;

/// <summary>
/// Creates one descriptor. The constructor is private, so instances are
/// only created inside this class.
/// </summary>
private RecordType (TypeCode typecode,
		    string name,
		    bool is_container,
		    bool do_read,
		    int min_record_size,
		    int max_record_size)
{
	this.typecode        = typecode;
	this.name            = name;
	this.is_container    = is_container;
	this.do_read         = do_read;
	this.min_record_size = min_record_size;
	this.max_record_size = max_record_size;
}
// Table entry with is_container = false. Every entry in the table uses
// do_read = true and -1 for both size limits, so only the flag varies.
static RecordType Atom (TypeCode typecode, string name)
{
	return new RecordType (typecode, name, false, true, -1, -1);
}

// Table entry with is_container = true (ParseElement recurses into it).
static RecordType Container (TypeCode typecode, string name)
{
	return new RecordType (typecode, name, true, true, -1, -1);
}

// Descriptor table searched by Find (). Order matters: index 0 must stay
// the "Unknown" entry (Find's fallback), and for codes that appear twice
// (3009) the first entry is the one Find returns.
static RecordType[] types = {
	Atom      (TypeCode.Unknown, "Unknown"),
	Container (TypeCode.Document, "Document"),
	Atom      (TypeCode.DocumentAtom, "DocumentAtom"),
	Atom      (TypeCode.EndDocument, "EndDocument"),
	Container (TypeCode.Slide, "Slide"),
	Atom      (TypeCode.SlideAtom, "SlideAtom"),
	Container (TypeCode.Notes, "Notes"),
	Atom      (TypeCode.NotesAtom, "NotesAtom"),
	Container (TypeCode.Environment, "Environment"),
	Atom      (TypeCode.SlidePersistAtom, "SlidePersistAtom"),
	Atom      (TypeCode.SSlideLayoutAtom, "SSlideLayoutAtom"),
	Container (TypeCode.MainMaster, "MainMaster"),
	Atom      (TypeCode.SSSlideInfoAtom, "SSSlideInfoAtom"),
	Container (TypeCode.SlideViewInfo, "SlideViewInfo"),
	Atom      (TypeCode.GuideAtom, "GuideAtom"),
	Container (TypeCode.ViewInfo, "ViewInfo"),
	Atom      (TypeCode.ViewInfoAtom, "ViewInfoAtom"),
	Atom      (TypeCode.SlideViewInfoAtom, "SlideViewInfoAtom"),
	Container (TypeCode.VBAInfo, "VBAInfo"),
	Atom      (TypeCode.VBAInfoAtom, "VBAInfoAtom"),
	Atom      (TypeCode.SSDocInfoAtom, "SSDocInfoAtom"),
	Container (TypeCode.Summary, "Summary"),
	Atom      (TypeCode.DocRoutingSlip, "DocRoutingSlip"),
	Container (TypeCode.OutlineViewInfo, "OutlineViewInfo"),
	Container (TypeCode.SorterViewInfo, "SorterViewInfo"),
	Container (TypeCode.ExObjList, "ExObjList"),
	Atom      (TypeCode.ExObjListAtom, "ExObjListAtom"),
	Container (TypeCode.PPDrawingGroup, "PPDrawingGroup"), //FIXME: Office Art File Format Docu
	Container (TypeCode.PPDrawing, "PPDrawing"), //FIXME: Office Art File Format Docu
	Atom      (TypeCode.NamedShows, "NamedShows"), // don't know if container
	Container (TypeCode.NamedShow, "NamedShow"),
	Atom      (TypeCode.NamedShowSlides, "NamedShowSlides"), // don't know if container
	Container (TypeCode.List, "List"),
	Container (TypeCode.FontCollection, "FontCollection"),
	Container (TypeCode.BookmarkCollection, "BookmarkCollection"),
	Atom      (TypeCode.SoundCollAtom, "SoundCollAtom"),
	Container (TypeCode.Sound, "Sound"),
	Atom      (TypeCode.SoundData, "SoundData"),
	Atom      (TypeCode.BookmarkSeedAtom, "BookmarkSeedAtom"),
	Atom      (TypeCode.ColorSchemeAtom, "ColorSchemeAtom"),
	Atom      (TypeCode.ExObjRefAtom, "ExObjRefAtom"),
	Atom      (TypeCode.OEShapeAtom, "OEShapeAtom"),
	Atom      (TypeCode.OEPlaceholderAtom, "OEPlaceholderAtom"),
	Atom      (TypeCode.GPointAtom, "GPointAtom"),
	Atom      (TypeCode.GRatioAtom, "GRatioAtom"),
	Atom      (TypeCode.OutlineTextRefAtom, "OutlineTextRefAtom"),
	Atom      (TypeCode.TextHeaderAtom, "TextHeaderAtom"),
	Atom      (TypeCode.TextCharsAtom, "TextCharsAtom"),
	Atom      (TypeCode.StyleTextPropAtom, "StyleTextPropAtom"),
	Atom      (TypeCode.BaseTextPropAtom, "BaseTextPropAtom"),
	Atom      (TypeCode.TxMasterStyleAtom, "TxMasterStyleAtom"),
	Atom      (TypeCode.TxCFStyleAtom, "TxCFStyleAtom"),
	Atom      (TypeCode.TxPFStyleAtom, "TxPFStyleAtom"),
	Atom      (TypeCode.TextRulerAtom, "TextRulerAtom"),
	Atom      (TypeCode.TextBookmarkAtom, "TextBookmarkAtom"),
	Atom      (TypeCode.TextBytesAtom, "TextBytesAtom"),
	Atom      (TypeCode.TxSIStyleAtom, "TxSIStyleAtom"),
	Atom      (TypeCode.TextSpecInfoAtom, "TextSpecInfoAtom"),
	Atom      (TypeCode.DefaultRulerAtom, "DefaultRulerAtom"),
	Atom      (TypeCode.FontEntityAtom, "FontEntityAtom"),
	Atom      (TypeCode.FontEmbeddedData, "FontEmbeddedData"),
	Atom      (TypeCode.CString, "CString"),
	Atom      (TypeCode.MetaFile, "MetaFile"),
	Atom      (TypeCode.ExOleObjAtom, "ExOleObjAtom"),
	Container (TypeCode.SrKinsoku, "SrKinsoku"),
	Container (TypeCode.HandOut, "HandOut"),
	Container (TypeCode.ExEmbed, "ExEmbed"),
	Atom      (TypeCode.ExEmbedAtom, "ExEmbedAtom"),
	Container (TypeCode.ExLink, "ExLink"),
	Atom      (TypeCode.BookmarkEntityAtom, "BookmarkEntityAtom"),
	Atom      (TypeCode.ExLinkAtom, "ExLinkAtom"),
	Atom      (TypeCode.SrKinsokuAtom, "SrKinsokuAtom"),
	Atom      (TypeCode.ExHyperlinkAtom, "ExHyperlinkAtom"),
	Container (TypeCode.ExHyperlink, "ExHyperlink"),
	Atom      (TypeCode.SlideNumberMCAtom, "SlideNumberMCAtom"),
	Container (TypeCode.HeadersFooters, "HeadersFooters"),
	Atom      (TypeCode.HeadersFootersAtom, "HeadersFootersAtom"),
	Atom      (TypeCode.TxInteractiveInfoAtom, "TxInteractiveInfoAtom"),
	Atom      (TypeCode.CharFormatAtom, "CharFormatAtom"),
	Atom      (TypeCode.ParaFormatAtom, "ParaFormatAtom"),
	Atom      (TypeCode.RecolorInfoAtom, "RecolorInfoAtom"),
	Container (TypeCode.ExQuickTimeMovie, "ExQuickTimeMovie"),
	Atom      (TypeCode.ExQuickTimeMovieData, "ExQuickTimeMovieData"),
	Container (TypeCode.ExControl, "ExControl"),
	Container (TypeCode.SlideListWithText, "SlideListWithText"),
	Container (TypeCode.InteractiveInfo, "InteractiveInfo"),
	Atom      (TypeCode.InteractiveInfoAtom, "InteractiveInfoAtom"),
	Atom      (TypeCode.UserEditAtom, "UserEditAtom"),
	Atom      (TypeCode.CurrentUserAtom, "CurrentUserAtom"),
	Atom      (TypeCode.DateTimeMCAtom, "DateTimeMCAtom"),
	Atom      (TypeCode.GenericDateMCAtom, "GenericDateMCAtom"),
	Atom      (TypeCode.FooterMCAtom, "FooterMCAtom"),
	Atom      (TypeCode.ExControlAtom, "ExControlAtom"),
	Atom      (TypeCode.ExMediaAtom, "ExMediaAtom"),
	Container (TypeCode.ExVideo, "ExVideo"),
	Container (TypeCode.ExAviMovie, "ExAviMovie"),
	Container (TypeCode.ExMCIMovie, "ExMCIMovie"),
	Container (TypeCode.ExMIDIAudio, "ExMIDIAudio"),
	Container (TypeCode.ExCDAudio, "ExCDAudio"),
	Container (TypeCode.ExWAVAudioEmbedded, "ExWAVAudioEmbedded"),
	Container (TypeCode.ExWAVAudioLink, "ExWAVAudioLink"),
	Atom      (TypeCode.ExOleObjStg, "ExOleObjStg"),
	Atom      (TypeCode.ExCDAudioAtom, "ExCDAudioAtom"),
	Atom      (TypeCode.ExWAVAudioEmbeddedAtom, "ExWAVAudioEmbeddedAtom"),
	Atom      (TypeCode.AnimationInfoAtom, "AnimationInfoAtom"),
	Atom      (TypeCode.RTFDateTimeMCAtom, "RTFDateTimeMCAtom"),
	Atom      (TypeCode.ProgTags, "ProgTags"), // don't know if container
	Container (TypeCode.ProgStringTag, "ProgStringTag"),
	Container (TypeCode.ProgBinaryTag, "ProgBinaryTag"),
	Atom      (TypeCode.BinaryTagData, "BinaryTagData"),
	Atom      (TypeCode.PrintOptions, "PrintOptions"),
	Atom      (TypeCode.PersistPtrFullBlock, "PersistPtrFullBlock"), // don't know if container
	Atom      (TypeCode.PersistPtrIncrementalBlock, "PersistPtrIncrementalBlock"),
	Atom      (TypeCode.GScalingAtom, "GScalingAtom"),
	Atom      (TypeCode.GRColorAtom, "GRColorAtom"),

	Container (TypeCode.EscherDggContainer, "EscherDggContainer"),
	Atom      (TypeCode.EscherDgg, "EscherDgg"),
	Atom      (TypeCode.EscherCLSID, "EscherCLSID"),
	Atom      (TypeCode.EscherOPT, "EscherOPT"),
	Container (TypeCode.EscherBStoreContainer, "EscherBStoreContainer"),
	Atom      (TypeCode.EscherBSE, "EscherBSE"),
	Atom      (TypeCode.EscherBlip_START, "EscherBlip_START"),
	Atom      (TypeCode.EscherBlip_END, "EscherBlip_END"),
	Container (TypeCode.EscherDgContainer, "EscherDgContainer"),
	Atom      (TypeCode.EscherDg, "EscherDg"),
	Atom      (TypeCode.EscherRegroupItems, "EscherRegroupItems"),
	Atom      (TypeCode.EscherColorScheme, "EscherColorScheme"),
	Container (TypeCode.EscherSpgrContainer, "EscherSpgrContainer"),
	Container (TypeCode.EscherSpContainer, "EscherSpContainer"),
	Atom      (TypeCode.EscherSpgr, "EscherSpgr"),
	Atom      (TypeCode.EscherSp, "EscherSp"),
	Atom      (TypeCode.EscherTextbox, "EscherTextbox"),
	Container (TypeCode.EscherClientTextbox, "EscherClientTextbox"),
	Atom      (TypeCode.EscherAnchor, "EscherAnchor"),
	Atom      (TypeCode.EscherChildAnchor, "EscherChildAnchor"),
	Atom      (TypeCode.EscherClientAnchor, "EscherClientAnchor"),
	Container (TypeCode.EscherClientData, "EscherClientData"),
	Container (TypeCode.EscherSolverContainer, "EscherSolverContainer"),
	Atom      (TypeCode.EscherConnectorRule, "EscherConnectorRule"),
	Atom      (TypeCode.EscherAlignRule, "EscherAlignRule"),
	Atom      (TypeCode.EscherArcRule, "EscherArcRule"),
	Atom      (TypeCode.EscherClientRule, "EscherClientRule"),
	Atom      (TypeCode.EscherCalloutRule, "EscherCalloutRule"),
	Atom      (TypeCode.EscherSelection, "EscherSelection"),
	Atom      (TypeCode.EscherColorMRU, "EscherColorMRU"),
	Atom      (TypeCode.EscherDeletedPspl, "EscherDeletedPspl"),
	Atom      (TypeCode.EscherSplitMenuColors, "EscherSplitMenuColors"),
	Atom      (TypeCode.EscherOleObject, "EscherOleObject"),
	Atom      (TypeCode.EscherUserDefined, "EscherUserDefined")
};
/// <summary>
/// Looks up the descriptor for a record type code.
/// </summary>
/// <param name="typecode">Code to search for.</param>
/// <returns>
/// The first table entry whose typecode matches, or the first table
/// entry (the "Unknown" descriptor) when nothing matches.
/// </returns>
public static RecordType Find (TypeCode typecode)
{
	foreach (RecordType candidate in types) {
		if (candidate.typecode == typecode)
			return candidate;
	}

	// No match: fall back to the entry at index 0.
	return types[0];
}
373 namespace Beagle.Filters {
375 public class FilterPPT : Beagle.Daemon.Filter {
// Kind of text announced by a TextHeaderAtom; ParseElement casts the
// 32-bit value read from that atom directly to this enum, so the numeric
// values below must not change.
private enum TextType {
	Invalid		= -1,
	Title		= 0,
	Body		= 1,
	Notes		= 2,
	NotUsed		= 3,
	Other		= 4,
	CenterBody	= 5,
	CenterTitle	= 6,
	HalfBody	= 7,
	QuarterBody	= 8
}
// OLE container being indexed. Assigned in DoOpen; DoPull and
// DoPullProperties bail out while it is still null.
Infile file;

// Text type taken from the most recently parsed TextHeaderAtom.
TextType textType;

// Full path of the file being indexed; used in log messages.
string FileName;

/// <summary>
/// Registers the PowerPoint MIME type, resets the per-file state and
/// turns snippet mode on.
/// </summary>
public FilterPPT ()
{
	this.file = null;
	this.FileName = null;
	this.textType = TextType.Invalid;

	AddSupportedFlavor (FilterFlavor.NewFromMimeType ("application/vnd.ms-powerpoint"));
	SnippetMode = true;
}
/// <summary>
/// Parses the record that starts at the current stream position: an
/// 8-byte header (type code at offset 2, payload length at offset 4)
/// followed by the payload. Containers are descended into recursively;
/// TextBytesAtom / TextCharsAtom payloads are appended to the text pool;
/// TextHeaderAtom updates the current text type; everything else is
/// skipped.
/// </summary>
/// <param name="stream">Stream positioned at a record header.</param>
/// <returns>
/// Number of bytes the record occupies (payload length + 8), or 0 when
/// the record could not be read or its length field is unusable.
/// </returns>
private int ParseElement (Input stream)
{
	byte [] data = stream.Read (8);
	if (data == null)
		return 0;

	RecordType.TypeCode opcode = (RecordType.TypeCode) GetInt16 (data, 2);
	int length = GetInt32 (data, 4);

	// The length is an unsigned 32-bit field read into a signed int.
	// A negative value (or one for which "length + 8" would wrap) can
	// only come from a corrupt or absurdly large record; seeking by it
	// would move the stream backwards and could loop forever.
	if (length < 0 || length > Int32.MaxValue - 8)
		return 0;

	RecordType type = RecordType.Find (opcode);

	// Process the container tree
	if (type.is_container) {
		int length_remaining = length;

		if (opcode == RecordType.TypeCode.MainMaster) {
			// Ignore MainMaster container as it contains
			// just a master-slide view and no user data.
			stream.Seek (length_remaining, SeekOrigin.Current);
		} else {
			while (length_remaining > 0) {
				int elem_length = ParseElement (stream);
				if (elem_length == 0)
					return 0;
				length_remaining -= elem_length;
			}
		}
	} else if (length != 0) {
		bool is_bytes = (opcode == RecordType.TypeCode.TextBytesAtom);
		bool is_chars = (opcode == RecordType.TypeCode.TextCharsAtom);

		if ((is_bytes || is_chars) && textType != TextType.NotUsed) {
			data = stream.Read (length);
			if (data == null)
				return 0;

			string text = is_bytes
				? DecodeTextBytes (data)
				: System.Text.Encoding.Unicode.GetString (data);

			// Replace all ^M and ^K with whitespace; with them in
			// place the contents were not appended to the text pool
			// properly.
			StringBuilder strData = new StringBuilder (text);
			strData.Replace ('\r', ' ');
			strData.Replace ((char) 0x0B, (char) 0x20);

			if (textType == TextType.Title ||
			    textType == TextType.CenterBody ||
			    textType == TextType.CenterTitle)
				HotUp ();
			AppendText (strData.ToString ());
			if (IsHot)
				HotDown ();
			else
				AppendStructuralBreak ();
		} else if (opcode == RecordType.TypeCode.TextHeaderAtom) {
			if (length < 4) {
				// Too short to hold a text type: skip it instead
				// of reading into the next record.
				stream.Seek (length, SeekOrigin.Current);
				textType = TextType.Invalid;
			} else {
				data = stream.Read (4);
				if (data == null)
					return 0;
				textType = (TextType) GetInt32 (data, 0);

				// Consume whatever follows the text type so the
				// stream stays aligned on the next record.
				if (length > 4)
					stream.Seek (length - 4, SeekOrigin.Current);
			}
		} else {
			stream.Seek (length, SeekOrigin.Current);
		}
	}

	// length = RecordHeader.recLen
	// 8 = sizeof (RecordHeader)
	// Every Atom/container is preceded by a RecordHeader
	return length + 8;
}

// Decodes a TextBytesAtom payload. Each byte is the low byte of a UTF-16
// code unit whose high byte is zero, so it maps 1:1 onto a char. Decoding
// the payload as UTF-8 would garble every byte in the 0x80-0xFF range.
private static string DecodeTextBytes (byte [] data)
{
	char [] chars = new char [data.Length];
	for (int i = 0; i < data.Length; i++)
		chars [i] = (char) data [i];
	return new string (chars);
}
/// <summary>
/// Reads the two OLE summary streams and publishes the properties of
/// interest as Beagle properties. A missing stream is logged and its
/// properties are skipped.
/// </summary>
/// <param name="sumStream">SummaryInformation stream, or null.</param>
/// <param name="docSumStream">DocumentSummaryInformation stream, or null.</param>
private void ExtractMetaData (Input sumStream, Input docSumStream)
{
	DocMetaData sumMeta = null;
	if (sumStream == null)
		Logger.Log.Error ("SummaryInformationStream not found in {0}", FileName);
	else
		sumMeta = Msole.MetadataReadReal (sumStream);

	DocMetaData docSumMeta = null;
	if (docSumStream == null)
		Logger.Log.Error ("DocumentSummaryInformationStream not found in {0}", FileName);
	else
		docSumMeta = Msole.MetadataReadReal (docSumStream);

	if (sumMeta != null) {
		CopyMetaProperty (sumMeta, "dc:title", "dc:title");
		CopyMetaProperty (sumMeta, "dc:subject", "dc:subject");
		CopyMetaProperty (sumMeta, "dc:description", "dc:description");
		CopyMetaProperty (sumMeta, "gsf:keywords", "fixme:keywords");
		CopyMetaProperty (sumMeta, "gsf:creator", "fixme:author");
	}

	if (docSumMeta != null) {
		CopyMetaProperty (docSumMeta, "gsf:company", "fixme:company");
		CopyMetaProperty (docSumMeta, "gsf:slide-count", "fixme:slide-count");
	}
}

// Publishes metadata entry metaName under the Beagle key propertyName,
// but only when the entry exists and its string value is non-empty.
private void CopyMetaProperty (DocMetaData meta, string metaName, string propertyName)
{
	DocProp prop = meta.GetProp (metaName);
	if (prop == null)
		return;

	string str = Gsf.Global.GetPropValStr (prop);
	if (str != null && str.Length > 0)
		AddProperty (Beagle.Property.New (propertyName, str));
}
/// <summary>
/// Locates the SummaryInformation and DocumentSummaryInformation streams
/// among the children of the OLE file and hands them to ExtractMetaData.
/// Calls Finished () when no file is open or when an exception occurs.
/// </summary>
override protected void DoPullProperties ()
{
	if (file == null) {
		Finished ();
		return;
	}

	Input sumStream = null;
	Input docSumStream = null;

	try {
		// FIXME: Should try to use Encoding instead of
		// string.IndexOf ()... Hacky stuff ;-)
		int childCount = file.NumChildren ();
		for (int i = 0; i < childCount && (sumStream == null || docSumStream == null); i++) {
			string str = file.NameByIndex (i);
			if (str == null)
				continue;

			// "DocumentSummaryInformation" contains
			// "SummaryInformation", so the longer name has to be
			// tested first; otherwise the document summary is
			// mistaken for the summary whenever it is listed first.
			if (str.IndexOf ("DocumentSummaryInformation") > -1) {
				if (docSumStream == null)
					docSumStream = file.ChildByIndex (i);
			} else if (str.IndexOf ("SummaryInformation") > -1) {
				if (sumStream == null)
					sumStream = file.ChildByIndex (i);
			}
		}

		ExtractMetaData (sumStream, docSumStream);
	} catch (Exception e) {
		Logger.Log.Error ("Exception {0} occurred duing DoPullProperties.", e.Message);
		Finished ();
	}
}
/// <summary>
/// Extracts the text of the "PowerPoint Document" stream by parsing its
/// records one after another, then calls Finished ().
/// </summary>
override protected void DoPull ()
{
	if (file == null) {
		Finished ();
		return;
	}

	Input stream = null;
	try {
		stream = file.ChildByName ("PowerPoint Document");

		if (stream != null) {
			// The parsing was getting terminated when "EndDocument"
			// container was parsed. We need to continue our
			// parsing till the end of the file, since, some of the
			// slides do persist after the actual "Document"
			// container.
			// PPTs exported from OO.o actually write almost all the
			// slides after the "Document" container, and certain
			// other PPTs have some slides after it too.
			while (!stream.Eof) {
				// 0 means the record could not be read. The stream
				// may not have advanced (e.g. fewer than 8 bytes
				// are left), so retrying would spin forever.
				if (ParseElement (stream) == 0)
					break;
			}
		} else {
			Logger.Log.Error ("Ole stream not found in {0}. Content extraction skipped.", FileName);
		}
		Finished ();
	} catch (Exception e) {
		Logger.Log.Error ("Exception {0} occurred during DoPull.", e.Message);
		Finished ();
	}
}
/// <summary>
/// Opens the file as an OLE container and, for dual-storage files,
/// switches to the "PP97_DUALSTORAGE" sub-storage. Calls Finished ()
/// when the file cannot be opened or looks like a PPT 95/4.0 file.
/// </summary>
/// <param name="info">File to open.</param>
override protected void DoOpen (FileInfo info)
{
	FileName = info.FullName;

	try {
		Gsf.Global.Init ();

		Input source = Input.MmapNew (info.FullName);
		if (source != null) {
			source = source.Uncompress ();
			file = new InfileMSOle (source);
		}

		bool opened = (source != null && file != null);
		if (!opened) {
			Logger.Log.Error ("Unable to open [{0}] ",info.FullName);
			Finished ();
			return;
		}
	} catch (Exception) {
		Logger.Log.Error ("Unable to open "+info.FullName);
		Finished ();
		return;
	}

	// PPT 95/97-2000 format contains a "PP97_DUALSTORAGE", which is required
	// to index PPT 97-2000 files.
	// We don't support PPT 95 files, however, we happily accept patches ;-)
	try {
		Input dualStorage = file.ChildByName ("PP97_DUALSTORAGE");
		if (dualStorage != null) {
			// "PP97_DUALSTORAGE" is a storage containing some streams
			if (dualStorage.Handle != IntPtr.Zero)
				file = (Gsf.Infile) GLib.Object.GetObject (dualStorage.Handle);
			return;
		}

		// A "Header" child, or no "PowerPoint Document" child, is
		// treated as a PPT 95/4.0 file. The second lookup only runs
		// when the first one finds nothing.
		bool unsupported = file.ChildByName ("Header") != null;
		if (!unsupported)
			unsupported = file.ChildByName ("PowerPoint Document") == null;

		if (unsupported) {
			Logger.Log.Error ("{0} is a PPT 95/4.0 file. Beagle does not support PPT 95 files. Skipping...", FileName);
			Finished ();
		}
	} catch (Exception) {
		Logger.Log.Error ("Unable to open OleFile stream of "+info.FullName);
		Finished ();
	}
}
660 // FIXME: These are utility functions and can be useful
661 // outside this filter as well.
/// <summary>
/// Reads a little-endian 32-bit integer from data, starting at offset.
/// </summary>
/// <param name="data">Buffer holding at least offset + 4 bytes.</param>
/// <param name="offset">Index of the least significant byte.</param>
/// <returns>The four bytes combined into a signed int.</returns>
public static int GetInt32 (byte [] data, int offset) {
	int value = 0;
	// Walk from the most significant byte down, shifting as we go.
	for (int i = 3; i >= 0; i--)
		value = (value << 8) | data [offset + i];
	return value;
}
/// <summary>
/// Reads a little-endian unsigned 16-bit value from data, starting at
/// offset.
/// </summary>
/// <param name="data">Buffer holding at least offset + 2 bytes.</param>
/// <param name="offset">Index of the low byte.</param>
/// <returns>A value in the range 0..65535.</returns>
public static int GetInt16 (byte [] data, int offset) {
	int low = data [offset];
	int high = data [offset + 1];
	return (high << 8) | low;
}