1 //----------------------------------------------------------------------
2 // This software is part of the OpenBeOS distribution and is covered
4 //---------------------------------------------------------------------
7 MIME sniffer rule parser declarations
9 #ifndef _SNIFFER_PARSER_H
10 #define _SNIFFER_PARSER_H
12 #include <SupportDefs.h>
13 #include <sniffer/CharStream.h>
14 #include <sniffer/Err.h>
15 #include <sniffer/Range.h>
16 #include <sniffer/Rule.h>
23 //! MIME Sniffer related classes
33 //------------------------------------------------------------------------------
34 // The mighty parsing function ;-)
35 //------------------------------------------------------------------------------
37 status_t
parse(const char *rule
, Rule
*result
, BString
*parseError
= NULL
);
39 //------------------------------------------------------------------------------
40 // Classes used internally by the parser
41 //------------------------------------------------------------------------------
44 typedef enum TokenType
{
59 /*! \brief Returns a NULL-terminated string contating the
60 name of the given token type
62 const char* tokenTypeToString(TokenType type
);
64 //! Base token class returned by TokenStream
65 /*! Each token represents a single chunk of relevant information
66 in a given rule. For example, the floating point number "1.2e-35",
67 originally represented as a 7-character string, is added to the
68 token stream as a single FloatToken object.
72 Token(TokenType type
= EmptyToken
, const ssize_t pos
= -1);
74 TokenType
Type() const;
75 virtual const std::string
& String() const;
76 virtual int32
Int() const;
77 virtual double Float() const;
79 bool operator==(Token
&ref
) const;
85 //! String token class
86 /*! Single-quoted strings, double-quoted strings, unquoted strings, and
87 hex literals are all converted to StringToken objects by the scanner
88 and from then on treated uniformly.
90 class StringToken
: public Token
{
92 StringToken(const std::string
&str
, const ssize_t pos
);
93 virtual ~StringToken();
94 virtual const std::string
& String() const;
99 //! Integer token class
/*! Signed or unsigned integer literals are converted to IntToken objects,
101 which may then be treated as either ints or floats (since a priority
102 of "1" would be valid, but scanned as an int instead of a float).
104 class IntToken
: public Token
{
106 IntToken(const int32 value
, const ssize_t pos
);
108 virtual int32
Int() const;
109 virtual double Float() const;
114 //! Floating point token class
115 /*! Signed or unsigned, extended or non-extended notation floating point
116 numbers are converted to FloatToken objects by the scanner.
118 class FloatToken
: public Token
{
120 FloatToken(const double value
, const ssize_t pos
);
121 virtual ~FloatToken();
122 virtual double Float() const;
127 //! Manages a stream of Token objects
128 /*! Provides Get() and Unget() operations, some handy shortcut operations (Read()
129 and CondRead()), and handles memory management with respect to all the
130 Token objects in the stream (i.e. never delete a Token object returned by Get()).
132 Also, the scanner portion of the parser is implemented in the TokenStream's
137 TokenStream(const std::string
&string
);
141 status_t
SetTo(const std::string
&string
);
143 status_t
InitCheck() const;
148 void Read(TokenType type
);
149 bool CondRead(TokenType type
);
152 ssize_t
EndPos() const;
154 bool IsEmpty() const;
157 void AddToken(TokenType type
, ssize_t pos
);
158 void AddString(const std::string
&str
, ssize_t pos
);
159 void AddInt(const char *str
, ssize_t pos
);
160 void AddFloat(const char *str
, ssize_t pos
);
162 std::vector
<Token
*> fTokenList
;
168 TokenStream(const TokenStream
&ref
);
169 TokenStream
& operator=(const TokenStream
&ref
);
172 //! Handles parsing a sniffer rule, yielding either a parsed rule or a descriptive error message.
173 /*! A MIME sniffer rule is valid if it is well-formed with respect to the
174 following grammar and fulfills some further conditions listed thereafter:
177 Rule ::= LWS Priority LWS ConjList LWS
178 ConjList ::= DisjList (LWS DisjList)*
179 DisjList ::= "(" LWS PatternList LWS ")"
180 | "(" LWS RPatternList LWS ")"
181 | Range LWS "(" LWS PatternList LWS ")"
182 RPatternList ::= [Flag LWS] RPattern (LWS "|" LWS [Flag LWS] RPattern)*
183 PatternList ::= [Flag LWS] Pattern (LWS "|" LWS [Flag LWS] Pattern)*
185 RPattern ::= LWS Range LWS Pattern
186 Pattern ::= PString [ LWS "&" LWS Mask ]
187 Range ::= "[" LWS SDecimal [LWS ":" LWS SDecimal] LWS "]"
191 PString ::= HexLiteral | QuotedString | UnquotedString
193 HexLiteral ::= "0x" HexPair HexPair*
194 HexPair ::= HexChar HexChar
196 QuotedString ::= SingleQuotedString | DoubleQuotedString
SingleQuotedString ::= "'" SQChar+ "'"
DoubleQuotedString ::= '"' DQChar+ '"'
200 UnquotedString ::= EscapedChar UChar*
201 EscapedChar ::= OctalEscape | HexEscape | "\" Char
202 OctalEscape ::= "\" [[OctHiChar] OctChar] OctChar
203 HexEscape ::= "\x" HexPair
207 SDecimal ::= [Sign] Decimal
208 Decimal ::= DecChar DecChar*
209 Float ::= Fixed [("E" | "e") SDecimal]
210 Fixed ::= SDecimal ["." [Decimal]] | [Sign] "." Decimal
213 PunctuationChar ::= "(" | ")" | "[" | "]" | "|" | "&" | ":"
214 OctHiChar ::= "0" | "1" | "2" | "3"
215 OctChar ::= OctHiChar | "4" | "5" | "6" | "7"
216 DecChar ::= OctChar | "8" | "9"
HexChar ::= DecChar | "a" | "b" | "c" | "d" | "e" | "f" | "A" | "B" | "C" | "D" | "E" | "F"
Char ::= <any character>
221 SQChar ::= <Char except "\", "'"> | EscapedChar
222 DQChar ::= <Char except "\", '"'> | EscapedChar
223 UChar ::= <Char except "\", LWSChar, and PunctuationChar> | EscapedChar
226 LWSChar ::= " " | TAB | LF
230 - If a mask is specified for a pattern, this mask must have the same
231 length as the pattern string.
232 - 0.0 <= Priority <= 1.0
233 - 0 <= Range begin <= Range end
236 - If a case-insensitive flag ("-i") appears in front of any Pattern or RPattern
237 in a DisjList, case-insensitivity is applied to the entire DisjList.
241 The file must start with the string "ABCD". The priority of the rule
243 - 0.8 [0:3] ('ABCD' | 'abcd')
244 The file must contain the string "ABCD" or "abcd" starting somewhere in
245 the first four bytes. The rule priority is 0.8.
246 - 0.5 ([0:3] 'ABCD' | [0:3] 'abcd' | [13] 'EFGH')
247 The file must contain the string "ABCD" or "abcd" starting somewhere in
248 the first four bytes or the string "EFGH" at position 13. The rule
250 - 0.8 [0:3] ('ABCD' & 0xff00ffff | 'abcd' & 0xffff00ff)
The file must contain the string "A.CD" or "ab.d" (where "." is an
252 arbitrary character) starting somewhere in the first four bytes. The
253 rule priority is 0.8.
254 - 0.3 [10] ('mnop') ('abc') [20] ('xyz')
255 The file must contain the string 'abc' at the beginning of the file,
256 the string 'mnop' starting at position 10, and the string 'xyz'
257 starting at position 20. The rule priority is 0.3.
259 The file must contain the string 'ab', 'aB', 'Ab', or 'AB' at the
260 beginning of the file. The rule priority is 0.2.
263 - 0.20 ([0]"//" | [0]"/\*" | [0:32]"#include" | [0:32]"#ifndef"
266 - 0.70 ("8BPS \000\000\000\000" & 0xffffffff0000ffffffff )
268 - 0.40 [0:64]( -i "<HTML" | "<HEAD" | "<TITLE" | "<BODY"
269 | "<TABLE" | "<!--" | "<META" | "<CENTER")
277 status_t
Parse(const char *rule
, Rule
*result
, BString
*parseError
= NULL
);
279 std::string
ErrorMessage(Err
*err
, const char *rule
);
281 // Things that get done a lot :-)
282 void ThrowEndOfStreamError();
283 inline void ThrowOutOfMemError(ssize_t pos
);
284 void ThrowUnexpectedTokenError(TokenType expected
, const Token
*found
);
285 void ThrowUnexpectedTokenError(TokenType expected1
, TokenType expected2
, const Token
*found
);
288 void ParseRule(Rule
*result
);
289 double ParsePriority();
290 std::vector
<DisjList
*>* ParseConjList();
291 DisjList
* ParseDisjList();
293 DisjList
* ParsePatternList(Range range
);
294 DisjList
* ParseRPatternList();
295 RPattern
* ParseRPattern();
296 Pattern
* ParsePattern();
303 }; // namespace Sniffer
304 }; // namespace Storage
305 }; // namespace BPrivate
307 #endif // _SNIFFER_PARSER_H