headers/private/storage/sniffer/Parser.h

   1 //----------------------------------------------------------------------
   2 //  This software is part of the OpenBeOS distribution and is covered
   3 //  by the MIT License.
   4 //---------------------------------------------------------------------
   5 /*!
   6         \file sniffer/Parser.h
   7         MIME sniffer rule parser declarations
   8 */
   9 #ifndef _SNIFFER_PARSER_H
  10 #define _SNIFFER_PARSER_H
  11
  12 #include <SupportDefs.h>
  13 #include <sniffer/CharStream.h>
  14 #include <sniffer/Err.h>
  15 #include <sniffer/Range.h>
  16 #include <sniffer/Rule.h>
  17 #include <List.h>
  18 #include <string>
  19 #include <vector>
  20
  21 class BString;
  22
  23 //! MIME Sniffer related classes
  24 namespace BPrivate {
  25 namespace Storage {
  26 namespace Sniffer {
  27
  28 class Rule;      // parse target: filled in by parse() and Parser::Parse()/ParseRule()
  29 class DisjList;  // returned by ParseDisjList(), ParsePatternList(), ParseRPatternList()
  30 class RPattern;  // returned by ParseRPattern()
  31 class Pattern;   // returned by ParsePattern()
  32
  33 //------------------------------------------------------------------------------
  34 // The mighty parsing function ;-)
  35 //------------------------------------------------------------------------------
  36
     /*! \brief Parses the sniffer rule text \a rule into \a result.

         The accepted syntax is the grammar documented on the Parser class
         further down in this header.

         \param rule        C string holding the rule text.
         \param result      Rule object that receives the parsed rule.
         \param parseError  Optional (defaults to NULL). Presumably receives a
                            descriptive error message when parsing fails --
                            confirm against the implementation.
         \return A status_t result code. NOTE(review): presumably B_OK on
                 success; the exact error codes are set by the implementation,
                 which is not visible from this header.
     */
  37 status_t parse(const char *rule, Rule *result, BString *parseError = NULL);
  38
  39 //------------------------------------------------------------------------------
  40 // Classes used internally by the parser
  41 //------------------------------------------------------------------------------
  42
  43 //! Types of tokens
     /*! The symbols noted next to the punctuation types are taken from the
         grammar documented on the Parser class in this header; the scanner
         that actually assigns these types lives in TokenStream::SetTo().
     */
  44 typedef enum TokenType {
  45         EmptyToken,             //!< Default type used by Token's constructor
  46         LeftParen,              //!< "("
  47         RightParen,             //!< ")"
  48         LeftBracket,            //!< "[" (opens a Range)
  49         RightBracket,           //!< "]" (closes a Range)
  50         Colon,                  //!< ":" (separates Range begin and end)
  51         Divider,                //!< "|" (separates alternative patterns)
  52         Ampersand,              //!< "&" (introduces a pattern mask)
  53         CaseInsensitiveFlag,    //!< "-i"
  54         CharacterString,        //!< String payload; see StringToken
  55         Integer,                //!< Integer payload; see IntToken
  56         FloatingPoint           //!< Floating point payload; see FloatToken
  57 } TokenType;
  58
  59 /*! \brief Returns a NULL-terminated string containing the
  60                    name of the given token type

         \param type  The token type whose name is requested.
  61 */
  62 const char* tokenTypeToString(TokenType type);
  63
  64 //! Base token class returned by TokenStream
  65 /*! Each token represents a single chunk of relevant information
  66     in a given rule. For example, the floating point number "1.2e-35",
  67     originally represented as a 7-character string, is added to the
  68     token stream as a single FloatToken object.

         The base class itself stores only a type and a position. String(),
         Int() and Float() are virtual so that StringToken, IntToken and
         FloatToken can supply the corresponding payload.
         NOTE(review): what the base-class versions of those accessors do for
         a token that carries no such payload is decided in the implementation
         file, not in this header -- confirm before relying on it.
  69 */
  70 class Token {
  71 public:
             //! \param type Token type; defaults to EmptyToken.
             //! \param pos  Token position; defaults to -1. Presumably the
             //!             offset of the token in the rule string -- confirm.
  72         Token(TokenType type = EmptyToken, const ssize_t pos = -1);
             //! Virtual, since tokens are handled through Token pointers
             //! (see TokenStream::Get()).
  73         virtual ~Token();
             //! Accessor for the token type.
  74         TokenType Type() const;
             //! String payload; overridden by StringToken.
  75         virtual const std::string& String() const;
             //! Integer payload; overridden by IntToken.
  76         virtual int32 Int() const;
             //! Floating point payload; overridden by IntToken and FloatToken.
  77         virtual double Float() const;
             //! Accessor for the token position.
  78         ssize_t Pos() const;
             //! NOTE(review): the parameter is a non-const reference, so this
             //! operator cannot be applied to the const Token objects handed
             //! out by TokenStream::Get() without a cast. Changing it requires
             //! touching the definition as well.
  79         bool operator==(Token &ref) const;
  80 protected:
  81         TokenType fType;        //!< Type given to the constructor
  82         ssize_t fPos;           //!< Position given to the constructor
  83 };
  84
  85 //! String token class
  86 /*! Single-quoted strings, double-quoted strings, unquoted strings, and
  87         hex literals are all converted to StringToken objects by the scanner
  88         and from then on treated uniformly.

             Only String() is overridden; Int() and Float() keep the behavior
             inherited from Token.
  89 */
  90 class StringToken : public Token {
  91 public:
             //! \param str String payload of the token.
             //! \param pos Token position (no default, unlike Token's constructor).
  92         StringToken(const std::string &str, const ssize_t pos);
  93         virtual ~StringToken();
             //! Overrides Token::String() to expose the string payload.
  94         virtual const std::string& String() const;
  95 protected:
  96         std::string fString;    //!< String payload
  97 };
  98
  99 //! Integer token class
 100 /*! Signed or unsigned integer literals are converted to IntToken objects,
 101     which may then be treated as either ints or floats (since a priority
 102     of "1" would be valid, but scanned as an int instead of a float).

         This is why both Int() and Float() are overridden here.
 103 */
 104 class IntToken : public Token {
 105 public:
             //! \param value Integer payload of the token.
             //! \param pos   Token position.
 106         IntToken(const int32 value, const ssize_t pos);
 107         virtual ~IntToken();
             //! Overrides Token::Int() to expose the integer payload.
 108         virtual int32 Int() const;
             //! Overrides Token::Float() so an integer literal can be read
             //! where a floating point value is expected.
 109         virtual double Float() const;
 110 protected:
 111         int32 fValue;           //!< Integer payload
 112 };
 113
 114 //! Floating point token class
 115 /*! Signed or unsigned, extended or non-extended notation floating point
 116     numbers are converted to FloatToken objects by the scanner.

         Only Float() is overridden; unlike IntToken there is no Int()
         override, so Int() keeps the behavior inherited from Token.
 117 */
 118 class FloatToken : public Token {
 119 public:
             //! \param value Floating point payload of the token.
             //! \param pos   Token position.
 120         FloatToken(const double value, const ssize_t pos);
 121         virtual ~FloatToken();
             //! Overrides Token::Float() to expose the floating point payload.
 122         virtual double Float() const;
 123 protected:
 124         double fValue;          //!< Floating point payload
 125 };
 126
 127 //! Manages a stream of Token objects
 128 /*! Provides Get() and Unget() operations, some handy shortcut operations (Read()
 129     and CondRead()), and handles memory management with respect to all the
 130     Token objects in the stream (i.e. never delete a Token object returned by Get()).
 131
 132     Also, the scanner portion of the parser is implemented in the TokenStream's
 133     SetTo() function.

         NOTE(review): the single-argument constructor is not marked explicit,
         so a std::string converts implicitly to a TokenStream wherever a
         const TokenStream& is accepted -- confirm whether that is intended.
 134 */
 135 class TokenStream {
 136 public:
             //! Constructs a stream from the rule text \a string. Presumably
             //! equivalent to default construction followed by SetTo() -- confirm.
 137         TokenStream(const std::string &string);
 138         TokenStream();
 139         ~TokenStream();
 140
             //! Scans \a string into tokens (the scanner lives here, see the
             //! class description) and returns a status code.
 141         status_t SetTo(const std::string &string);
             //! Counterpart of SetTo(); presumably discards the current tokens
             //! -- confirm in the implementation.
 142         void Unset();
             //! Status query; presumably reports fCStatus -- confirm.
 143         status_t InitCheck() const;
 144
             //! Hands out a token from the stream. The stream keeps ownership:
             //! never delete the returned object.
 145         const Token* Get();
             //! Counterpart of Get(); presumably steps the stream back by one
             //! token -- confirm.
 146         void Unget();
 147
             //! Shortcut operation for a token of the given \a type. It returns
             //! nothing, so a mismatch is presumably reported by throwing --
             //! confirm in the implementation.
 148         void Read(TokenType type);
             //! Conditional variant of Read() that reports its outcome as a
             //! bool. Presumably consumes the token only when its type matches
             //! -- confirm.
 149         bool CondRead(TokenType type);
 150
 151         ssize_t Pos() const;
 152         ssize_t EndPos() const;
 153
 154         bool IsEmpty() const;
 155
 156 private:
             // Helpers that append a token of the respective kind. AddInt() and
             // AddFloat() take the literal as text, so the numeric conversion
             // presumably happens inside them -- confirm.
 157         void AddToken(TokenType type, ssize_t pos);
 158         void AddString(const std::string &str, ssize_t pos);
 159         void AddInt(const char *str, ssize_t pos);
 160         void AddFloat(const char *str, ssize_t pos);
 161
             // Tokens owned by the stream (see the class description).
 162         std::vector<Token*> fTokenList;
 163         status_t fCStatus;
             // NOTE(review): these are plain int while Pos() and EndPos()
             // return ssize_t -- confirm the conversion is intentional.
 164         int fPos;
 165         int fStrLen;
 166
 167
             // Copy construction and assignment are declared private, so code
             // outside this class cannot copy a TokenStream.
 168         TokenStream(const TokenStream &ref);
 169         TokenStream& operator=(const TokenStream &ref);
 170 };
 171
 172 //! Handles parsing a sniffer rule, yielding either a parsed rule or a descriptive error message.
 173 /*! A MIME sniffer rule is valid if it is well-formed with respect to the
 174         following grammar and fulfills some further conditions listed thereafter:
 175
 176         <code>
 177         Rule                    ::= LWS Priority LWS ConjList LWS
 178         ConjList                ::= DisjList (LWS DisjList)*
 179         DisjList                ::= "(" LWS PatternList LWS ")"
 180                                                 | "(" LWS RPatternList LWS ")"
 181                                                 | Range LWS "(" LWS PatternList LWS ")"
 182         RPatternList    ::= [Flag LWS] RPattern (LWS "|" LWS [Flag LWS] RPattern)*
 183         PatternList             ::= [Flag LWS] Pattern (LWS "|" LWS [Flag LWS] Pattern)*
 184
 185         RPattern                ::= LWS Range LWS Pattern
 186         Pattern                 ::= PString [ LWS "&" LWS Mask ]
 187         Range                   ::=     "[" LWS SDecimal [LWS ":" LWS SDecimal] LWS "]"
 188
 189         Priority                ::= Float
 190         Mask                    ::= PString
 191         PString                 ::= HexLiteral | QuotedString | UnquotedString
 192
 193         HexLiteral              ::= "0x" HexPair HexPair*
 194         HexPair                 ::= HexChar HexChar
 195
 196         QuotedString    ::= SingleQuotedString | DoubleQuotedString
 197         SingleQuotedString ::= "'" SQChar+ "'"
 198         DoubleQuotedString ::= '"' DQChar+ '"'
 199
 200         UnquotedString  ::= EscapedChar UChar*
 201         EscapedChar             ::= OctalEscape | HexEscape | "\" Char
 202         OctalEscape             ::= "\" [[OctHiChar] OctChar] OctChar
 203         HexEscape               ::= "\x" HexPair
 204
 205         Flag                    ::= "-i"
 206
 207         SDecimal                ::= [Sign] Decimal
 208         Decimal                 ::= DecChar DecChar*
 209         Float                   ::= Fixed [("E" | "e") SDecimal]
 210         Fixed                   ::= SDecimal ["." [Decimal]] | [Sign] "." Decimal
 211         Sign                    ::= "+" | "-"
 212
 213         PunctuationChar ::= "(" | ")" | "[" | "]" | "|" | "&" | ":"
 214         OctHiChar               ::= "0" | "1" | "2" | "3"
 215         OctChar                 ::= OctHiChar | "4" | "5" | "6" | "7"
 216         DecChar                 ::= OctChar | "8" | "9"
 217         HexChar                 ::= DecChar | "a" | "b" | "c" | "d" | "e" | "f" | "A" | "B" | "C"
 218                                                 | "D" | "E" | "F"
 219
 220         Char                    ::= <any character>
 221         SQChar                  ::= <Char except "\", "'"> | EscapedChar
 222         DQChar                  ::= <Char except "\", '"'> | EscapedChar
 223         UChar                   ::= <Char except "\", LWSChar,  and PunctuationChar> | EscapedChar
 224
 225         LWS                             ::= LWSChar*
 226         LWSChar                 ::= " " | TAB | LF
 227         </code>
 228
 229         Conditions:
 230         - If a mask is specified for a pattern, this mask must have the same
 231           length as the pattern string.
 232         - 0.0 <= Priority <= 1.0
 233         - 0 <= Range begin <= Range end
 234
 235         Notes:
 236         - If a case-insensitive flag ("-i") appears in front of any Pattern or RPattern
 237           in a DisjList, case-insensitivity is applied to the entire DisjList.
 238
 239         Examples:
 240         - 1.0 ('ABCD')
 241           The file must start with the string "ABCD". The priority of the rule
 242           is 1.0 (maximal).
 243         - 0.8 [0:3] ('ABCD' | 'abcd')
 244           The file must contain the string "ABCD" or "abcd" starting somewhere in
 245           the first four bytes. The rule priority is 0.8.
 246         - 0.5 ([0:3] 'ABCD' | [0:3] 'abcd' | [13] 'EFGH')
 247           The file must contain the string "ABCD" or "abcd" starting somewhere in
 248           the first four bytes or the string "EFGH" at position 13. The rule
 249           priority is 0.5.
 250         - 0.8 [0:3] ('ABCD' & 0xff00ffff | 'abcd' & 0xffff00ff)
 251           The file must contain the string "A.CD" or "ab.d" (where "." is an
 252           arbitrary character) starting somewhere in the first four bytes. The
 253           rule priority is 0.8.
 254         - 0.3 [10] ('mnop') ('abc') [20] ('xyz')
 255           The file must contain the string 'abc' at the beginning of the file,
 256           the string 'mnop' starting at position 10, and the string 'xyz'
 257           starting at position 20. The rule priority is 0.3.
 258         - 200e-3 (-i 'ab')
 259           The file must contain the string 'ab', 'aB', 'Ab', or 'AB' at the
 260           beginning of the file. The rule priority is 0.2.
 261
 262         Real examples:
 263         - 0.20 ([0]"//" | [0]"/\*" | [0:32]"#include" | [0:32]"#ifndef"
 264                 | [0:32]"#ifdef")
 265           text/x-source-code
 266         - 0.70 ("8BPS  \000\000\000\000" & 0xffffffff0000ffffffff )
 267           image/x-photoshop
 268         - 0.40 [0:64]( -i "&lt;HTML" | "&lt;HEAD" | "&lt;TITLE" | "&lt;BODY"
 269                         | "&lt;TABLE" | "&lt;!--" | "&lt;META" | "&lt;CENTER")
 270           text/html
 271
 272 */
 273 class Parser {
 274 public:
 275         Parser();
 276         ~Parser();
             //! Parses the rule text \a rule into \a result.
             //! \param rule        C string holding the rule text.
             //! \param result      Rule object that receives the parsed rule.
             //! \param parseError  Optional (defaults to NULL); presumably
             //!                    receives the descriptive error message on
             //!                    failure -- confirm in the implementation.
             //! \return A status_t result code (exact values are set by the
             //!         implementation).
 277         status_t Parse(const char *rule, Rule *result, BString *parseError = NULL);
 278 private:
             // Builds an error description from \a err and the rule text.
 279         std::string ErrorMessage(Err *err, const char *rule);
 280
 281         // Things that get done a lot :-)
             // Error helpers shared by the parsing functions below.
             // NOTE(review): ThrowOutOfMemError() is declared inline but its
             // definition is not in this header; it must be defined in every
             // translation unit that calls it.
 282         void ThrowEndOfStreamError();
 283         inline void ThrowOutOfMemError(ssize_t pos);
 284         void ThrowUnexpectedTokenError(TokenType expected, const Token *found);
 285         void ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found);
 286
 287         // Parsing functions
             // Named after the grammar's nonterminals: Rule, Priority, ConjList,
             // DisjList, Range, PatternList, RPatternList, RPattern and Pattern.
             // NOTE(review): several of them return raw pointers; who owns the
             // returned objects is decided in the implementation -- confirm
             // before changing any caller.
 288         void ParseRule(Rule *result);
 289         double ParsePriority();
 290         std::vector<DisjList*>* ParseConjList();
 291         DisjList* ParseDisjList();
 292         Range ParseRange();
 293         DisjList* ParsePatternList(Range range);
 294         DisjList* ParseRPatternList();
 295         RPattern* ParseRPattern();
 296         Pattern* ParsePattern();
 297
             // Token stream for the parsing functions. Note that it lacks the
             // 'f' prefix used by the other data members in this header.
 298         TokenStream stream;
 299
             // Presumably an error object allocated up front so that an
             // out-of-memory condition can still be reported -- confirm.
 300         Err *fOutOfMemErr;
 301 };
 302
 303 };      // namespace Sniffer
 304 };      // namespace Storage
 305 };      // namespace BPrivate
 306
 307 #endif  // _SNIFFER_PARSER_H
 308
 309
 310
 311