vfs: check userland buffers before reading them.
[haiku.git] / headers / private / storage / sniffer / Parser.h
blob805236f4faf4bd3e03eb92444e3e6bcf56dcf5f2
1 //----------------------------------------------------------------------
2 // This software is part of the OpenBeOS distribution and is covered
3 // by the MIT License.
4 //---------------------------------------------------------------------
5 /*!
6 \file sniffer/Parser.h
7 MIME sniffer rule parser declarations
8 */
9 #ifndef _SNIFFER_PARSER_H
10 #define _SNIFFER_PARSER_H
12 #include <SupportDefs.h>
13 #include <sniffer/CharStream.h>
14 #include <sniffer/Err.h>
15 #include <sniffer/Range.h>
16 #include <sniffer/Rule.h>
17 #include <List.h>
18 #include <string>
19 #include <vector>
21 class BString;
23 //! MIME Sniffer related classes
24 namespace BPrivate {
25 namespace Storage {
26 namespace Sniffer {
// Forward declarations of the rule-tree types named in the parser's
// signatures in this header (Rule* parameters; DisjList*, RPattern* and
// Pattern* return values).
28 class Rule;
29 class DisjList;
30 class RPattern;
31 class Pattern;
33 //------------------------------------------------------------------------------
34 // The mighty parsing function ;-)
35 //------------------------------------------------------------------------------
//! Free-function parsing entry point.
//! Takes the rule text in \a rule, a Rule to fill in via \a result, and an
//! optional BString (\a parseError, defaults to NULL) for an error description.
//! The parameter list is the same as Parser::Parse(); presumably this is a
//! convenience wrapper around it -- confirm in the implementation.
37 status_t parse(const char *rule, Rule *result, BString *parseError = NULL);
39 //------------------------------------------------------------------------------
40 // Classes used internally by the parser
41 //------------------------------------------------------------------------------
43 //! Types of tokens
// The character/lexeme notes below are inferred from the enumerator names and
// the grammar documented with the Parser class; confirm against the scanner.
44 typedef enum TokenType {
45 EmptyToken,  //!< Default type used by Token's constructor.
46 LeftParen,  //!< "("
47 RightParen,  //!< ")"
48 LeftBracket,  //!< "["
49 RightBracket,  //!< "]"
50 Colon,  //!< ":"
51 Divider,  //!< presumably "|" (the remaining punctuation character)
52 Ampersand,  //!< "&"
53 CaseInsensitiveFlag,  //!< presumably the "-i" flag
54 CharacterString,  //!< String value (see StringToken)
55 Integer,  //!< Integer value (see IntToken)
56 FloatingPoint  //!< Floating point value (see FloatToken)
57 } TokenType;
59 /*! \brief Returns a NULL-terminated string containing the
60 name of the given token type
// NOTE(review): ownership and lifetime of the returned string are not visible
// in this header -- confirm in the implementation before storing the pointer.
62 const char* tokenTypeToString(TokenType type);
64 //! Base token class returned by TokenStream
65 /*! Each token represents a single chunk of relevant information
66 in a given rule. For example, the floating point number "1.2e-35",
67 originally represented as a 7-character string, is added to the
68 token stream as a single FloatToken object.
// Polymorphic base of StringToken, IntToken and FloatToken. The value
// accessors String(), Int() and Float() are virtual so that each derived class
// can override the ones it supports; what the base versions do is not visible
// in this header.
70 class Token {
71 public:
72 Token(TokenType type = EmptyToken, const ssize_t pos = -1);  //!< Defaults: EmptyToken type, position -1.
73 virtual ~Token();  //!< Virtual: TokenStream holds tokens as Token* and derived classes exist.
74 TokenType Type() const;  //!< Token type accessor (presumably returns fType).
75 virtual const std::string& String() const;  //!< String value; overridden by StringToken.
76 virtual int32 Int() const;  //!< Integer value; overridden by IntToken.
77 virtual double Float() const;  //!< Floating point value; overridden by IntToken and FloatToken.
78 ssize_t Pos() const;  //!< Position accessor (presumably returns fPos).
// NOTE(review): the parameter is a non-const reference, so a const Token
// cannot be passed as the right-hand operand. Switching to const Token&
// would need the matching change in the implementation file.
79 bool operator==(Token &ref) const;
80 protected:
81 TokenType fType;  //!< Type tag passed to the constructor.
82 ssize_t fPos;  //!< Position passed to the constructor (presumably an offset into the rule string).
85 //! String token class
86 /*! Single-quoted strings, double-quoted strings, unquoted strings, and
87 hex literals are all converted to StringToken objects by the scanner
88 and from then on treated uniformly.
// Token carrying a std::string value; overrides only String().
90 class StringToken : public Token {
91 public:
92 StringToken(const std::string &str, const ssize_t pos);  //!< Takes the string value and its position.
93 virtual ~StringToken();
94 virtual const std::string& String() const;  //!< Overrides Token::String() (presumably returns fString).
95 protected:
96 std::string fString;  //!< Owned copy of the token's string value.
99 //! Integer token class
100 /*! Signed or unsigned integer literals are converted to IntToken objects,
101 which may then be treated as either ints or floats (since a priority
102 of "1" would be valid, but scanned as an int instead of a float).
// Token carrying an int32 value. Overrides both Int() and Float(), so an
// integer literal can also be read where a float is expected.
104 class IntToken : public Token {
105 public:
106 IntToken(const int32 value, const ssize_t pos);  //!< Takes the integer value and its position.
107 virtual ~IntToken();
108 virtual int32 Int() const;  //!< Overrides Token::Int() (presumably returns fValue).
109 virtual double Float() const;  //!< Overrides Token::Float() (presumably fValue as a double).
110 protected:
111 int32 fValue;  //!< The token's integer value.
114 //! Floating point token class
115 /*! Signed or unsigned, extended or non-extended notation floating point
116 numbers are converted to FloatToken objects by the scanner.
// Token carrying a double value; overrides only Float().
118 class FloatToken : public Token {
119 public:
120 FloatToken(const double value, const ssize_t pos);  //!< Takes the floating point value and its position.
121 virtual ~FloatToken();
122 virtual double Float() const;  //!< Overrides Token::Float() (presumably returns fValue).
123 protected:
124 double fValue;  //!< The token's floating point value.
127 //! Manages a stream of Token objects
128 /*! Provides Get() and Unget() operations, some handy shortcut operations (Read()
129 and CondRead()), and handles memory management with respect to all the
130 Token objects in the stream (i.e. never delete a Token object returned by Get()).
132 Also, the scanner portion of the parser is implemented in the TokenStream's
133 SetTo() function.
// Holds Token objects as raw pointers in fTokenList. Tokens returned by Get()
// remain owned by the stream and must not be deleted by the caller.
135 class TokenStream {
136 public:
// NOTE(review): single-argument constructor is not marked explicit, so a
// std::string converts implicitly to a TokenStream.
137 TokenStream(const std::string &string);
138 TokenStream();
139 ~TokenStream();
141 status_t SetTo(const std::string &string);  //!< Scans \a string into tokens (the scanner lives here).
142 void Unset();
143 status_t InitCheck() const;  //!< Status query (presumably reports fCStatus).
145 const Token* Get();  //!< Returns a token owned by the stream; do not delete it.
146 void Unget();  //!< Counterpart to Get().
// Read() returns void, so a mismatch is presumably reported by throwing --
// confirm in the implementation. CondRead() reports its outcome as a bool.
148 void Read(TokenType type);
149 bool CondRead(TokenType type);
151 ssize_t Pos() const;
152 ssize_t EndPos() const;
154 bool IsEmpty() const;
156 private:
// Helpers that append a token to the stream; AddInt() and AddFloat() receive
// the literal as text (presumably converting it to a numeric value).
157 void AddToken(TokenType type, ssize_t pos);
158 void AddString(const std::string &str, ssize_t pos);
159 void AddInt(const char *str, ssize_t pos);
160 void AddFloat(const char *str, ssize_t pos);
162 std::vector<Token*> fTokenList;  //!< Tokens held by the stream as raw pointers.
163 status_t fCStatus;
// NOTE(review): fPos and fStrLen are int while Pos() and EndPos() return
// ssize_t.
164 int fPos;
165 int fStrLen;
// Copy constructor and assignment operator are declared private, presumably
// to disallow copying (the stream holds raw Token pointers).
168 TokenStream(const TokenStream &ref);
169 TokenStream& operator=(const TokenStream &ref);
172 //! Handles parsing a sniffer rule, yielding either a parsed rule or a descriptive error message.
173 /*! A MIME sniffer rule is valid if it is well-formed with respect to the
174 following grammar and fulfills some further conditions listed thereafter:
176 <code>
177 Rule ::= LWS Priority LWS ConjList LWS
178 ConjList ::= DisjList (LWS DisjList)*
179 DisjList ::= "(" LWS PatternList LWS ")"
180 | "(" LWS RPatternList LWS ")"
181 | Range LWS "(" LWS PatternList LWS ")"
182 RPatternList ::= [Flag LWS] RPattern (LWS "|" LWS [Flag LWS] RPattern)*
183 PatternList ::= [Flag LWS] Pattern (LWS "|" LWS [Flag LWS] Pattern)*
185 RPattern ::= LWS Range LWS Pattern
186 Pattern ::= PString [ LWS "&" LWS Mask ]
187 Range ::= "[" LWS SDecimal [LWS ":" LWS SDecimal] LWS "]"
189 Priority ::= Float
190 Mask ::= PString
191 PString ::= HexLiteral | QuotedString | UnquotedString
193 HexLiteral ::= "0x" HexPair HexPair*
194 HexPair ::= HexChar HexChar
196 QuotedString ::= SingleQuotedString | DoubleQuotedString
197 SingleQuotedString ::= "'" SQChar+ "'"
198 DoubleQuotedString ::= '"' DQChar+ '"'
200 UnquotedString ::= EscapedChar UChar*
201 EscapedChar ::= OctalEscape | HexEscape | "\" Char
202 OctalEscape ::= "\" [[OctHiChar] OctChar] OctChar
203 HexEscape ::= "\x" HexPair
205 Flag ::= "-i"
207 SDecimal ::= [Sign] Decimal
208 Decimal ::= DecChar DecChar*
209 Float ::= Fixed [("E" | "e") SDecimal]
210 Fixed ::= SDecimal ["." [Decimal]] | [Sign] "." Decimal
211 Sign ::= "+" | "-"
213 PunctuationChar ::= "(" | ")" | "[" | "]" | "|" | "&" | ":"
214 OctHiChar ::= "0" | "1" | "2" | "3"
215 OctChar ::= OctHiChar | "4" | "5" | "6" | "7"
216 DecChar ::= OctChar | "8" | "9"
217 HexChar ::= DecChar | "a" | "b" | "c" | "d" | "e" | "f" | "A" | "B" | "C"
218 | "D" | "E" | "F"
220 Char ::= <any character>
221 SQChar ::= <Char except "\", "'"> | EscapedChar
222 DQChar ::= <Char except "\", '"'> | EscapedChar
223 UChar ::= <Char except "\", LWSChar, and PunctuationChar> | EscapedChar
225 LWS ::= LWSChar*
226 LWSChar ::= " " | TAB | LF
227 </code>
229 Conditions:
230 - If a mask is specified for a pattern, this mask must have the same
231 length as the pattern string.
232 - 0.0 <= Priority <= 1.0
233 - 0 <= Range begin <= Range end
235 Notes:
236 - If a case-insensitive flag ("-i") appears in front of any Pattern or RPattern
237 in a DisjList, case-insensitivity is applied to the entire DisjList.
239 Examples:
240 - 1.0 ('ABCD')
241 The file must start with the string "ABCD". The priority of the rule
242 is 1.0 (maximal).
243 - 0.8 [0:3] ('ABCD' | 'abcd')
244 The file must contain the string "ABCD" or "abcd" starting somewhere in
245 the first four bytes. The rule priority is 0.8.
246 - 0.5 ([0:3] 'ABCD' | [0:3] 'abcd' | [13] 'EFGH')
247 The file must contain the string "ABCD" or "abcd" starting somewhere in
248 the first four bytes or the string "EFGH" at position 13. The rule
249 priority is 0.5.
250 - 0.8 [0:3] ('ABCD' & 0xff00ffff | 'abcd' & 0xffff00ff)
251 The file must contain the string "A.CD" or "ab.d" (where "." is an
252 arbitrary character) starting somewhere in the first four bytes. The
253 rule priority is 0.8.
254 - 0.3 [10] ('mnop') ('abc') [20] ('xyz')
255 The file must contain the string 'abc' at the beginning of the file,
256 the string 'mnop' starting at position 10, and the string 'xyz'
257 starting at position 20. The rule priority is 0.3.
258 - 200e-3 (-i 'ab')
259 The file must contain the string 'ab', 'aB', 'Ab', or 'AB' at the
260 beginning of the file. The rule priority is 0.2.
262 Real examples:
263 - 0.20 ([0]"//" | [0]"/\*" | [0:32]"#include" | [0:32]"#ifndef"
264 | [0:32]"#ifdef")
265 text/x-source-code
266 - 0.70 ("8BPS \000\000\000\000" & 0xffffffff0000ffffffff )
267 image/x-photoshop
268 - 0.40 [0:64]( -i "&lt;HTML" | "&lt;HEAD" | "&lt;TITLE" | "&lt;BODY"
269 | "&lt;TABLE" | "&lt;!--" | "&lt;META" | "&lt;CENTER")
270 text/html
// Parser front end. Parse() is the only public operation. The private Parse*
// functions are named after nonterminals of the rule grammar (Rule, Priority,
// ConjList, DisjList, Range, PatternList, RPatternList, RPattern, Pattern).
273 class Parser {
274 public:
275 Parser();
276 ~Parser();
// Takes the rule text, the Rule to fill in, and an optional BString for an
// error description (defaults to NULL).
277 status_t Parse(const char *rule, Rule *result, BString *parseError = NULL);
278 private:
279 std::string ErrorMessage(Err *err, const char *rule);  //!< Builds a message string from \a err and the rule text.
281 // Things that get done a lot :-)
282 void ThrowEndOfStreamError();
// NOTE(review): declared inline, but no definition is visible in this
// header; an inline function must be defined in every translation unit
// that calls it.
283 inline void ThrowOutOfMemError(ssize_t pos);
284 void ThrowUnexpectedTokenError(TokenType expected, const Token *found);
285 void ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found);
287 // Parsing functions
// Several of these return raw pointers (and ParseConjList() a pointer to a
// vector of raw pointers); ownership of the results is not visible here.
288 void ParseRule(Rule *result);
289 double ParsePriority();
290 std::vector<DisjList*>* ParseConjList();
291 DisjList* ParseDisjList();
292 Range ParseRange();
293 DisjList* ParsePatternList(Range range);
294 DisjList* ParseRPatternList();
295 RPattern* ParseRPattern();
296 Pattern* ParsePattern();
298 TokenStream stream;  //!< Token source. NOTE(review): lacks the f prefix used by the other members.
300 Err *fOutOfMemErr;  //!< presumably the error used by ThrowOutOfMemError() -- confirm.
303 }; // namespace Sniffer
304 }; // namespace Storage
305 }; // namespace BPrivate
307 #endif // _SNIFFER_PARSER_H