2 * Implements the lexical analyzer, which converts source code into lexical tokens.
4 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
6 * Copyright: Copyright (C) 1999-2022 by The D Language Foundation, All Rights Reserved
7 * Authors: $(LINK2 https://www.digitalmars.com, Walter Bright)
8 * License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
10 * Documentation: https://dlang.org/phobos/dmd_lexer.html
11 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
16 import core
.stdc
.ctype
;
17 import core
.stdc
.errno
;
18 import core
.stdc
.stdarg
;
19 import core
.stdc
.stdio
;
20 import core
.stdc
.stdlib
: getenv
;
21 import core
.stdc
.string
;
22 import core
.stdc
.time
;
28 import dmd
.identifier
;
29 import dmd
.root
.array
;
30 import dmd
.root
.ctfloat
;
31 import dmd
.common
.outbuffer
;
34 import dmd
.root
.string
;
46 /***********************************************************
50 private __gshared OutBuffer stringbuffer
;
52 Loc scanloc
; // for error messages
53 Loc prevloc
; // location of token before current
55 const(char)* p
; // current character
60 bool Ccompile
; /// true if compiling ImportC
62 // The following are valid only if (Ccompile == true)
63 ubyte boolsize
; /// size of a C _Bool, default 1
64 ubyte shortsize
; /// size of a C short, default 2
65 ubyte intsize
; /// size of a C int, default 4
66 ubyte longsize
; /// size of C long, 4 or 8
67 ubyte long_longsize
; /// size of a C long long, default 8
68 ubyte long_doublesize
; /// size of C long double, 8 or D real.sizeof
69 ubyte wchar_tsize
; /// size of C wchar_t, 2 or 4
73 const(char)* base
; // pointer to start of buffer
74 const(char)* end
; // pointer to last element of buffer
75 const(char)* line
; // start of current line
77 bool doDocComment
; // collect doc comment information
78 bool anyToken
; // seen at least one token
79 bool commentToken
; // comments are TOK.comment's
80 bool tokenizeNewlines
; // newlines are turned into TOK.endOfLine's
84 bool whitespaceToken
; // tokenize whitespaces
87 int inTokenStringConstant
; // can be larger than 1 when in nested q{} strings
88 int lastDocLine
; // last line of previous doc comment
95 /*********************
96 * Creates a Lexer for the source code base[begoffset..endoffset+1].
97 * The last character, base[endoffset], must be null (0) or EOF (0x1A).
100 * filename = used for error messages
101 * base = source code, must be terminated by a null (0) or EOF (0x1A) character
102 * begoffset = starting offset into base[]
103 * endoffset = the last offset to read into base[]
104 * doDocComment = handle documentation comments
105 * commentToken = comments become TOK.comment's
107 this(const(char)* filename
, const(char)* base
, size_t begoffset
,
108 size_t endoffset
, bool doDocComment
, bool commentToken
) pure
110 scanloc
= Loc(filename
, 1, 1);
111 // debug printf("Lexer::Lexer(%p)\n", base);
112 // debug printf("lexer.filename = %s\n", filename);
115 this.end
= base
+ endoffset
;
116 p
= base
+ begoffset
;
118 this.doDocComment
= doDocComment
;
119 this.commentToken
= commentToken
;
120 this.tokenizeNewlines
= false;
121 this.inTokenStringConstant
= 0;
122 this.lastDocLine
= 0;
124 /* If first line starts with '#!', ignore the line
126 if (p
&& p
[0] == '#' && p
[1] == '!')
151 this(const(char)* filename
, const(char)* base
, size_t begoffset
, size_t endoffset
,
152 bool doDocComment
, bool commentToken
, bool whitespaceToken
)
154 this(filename
, base
, begoffset
, endoffset
, doDocComment
, commentToken
);
155 this.whitespaceToken
= whitespaceToken
;
158 bool empty() const pure @property @nogc @safe
160 return front() == TOK
.endOfFile
;
163 TOK
front() const pure @property @nogc @safe
174 /// Returns: a newly allocated `Token`.
175 Token
* allocateToken() pure nothrow @safe
179 Token
* t
= tokenFreelist
;
180 tokenFreelist
= t
.next
;
187 /// Frees the given token by returning it to the freelist.
188 private void releaseToken(Token
* token
) pure nothrow @nogc @safe
192 token
.next
= tokenFreelist
;
193 tokenFreelist
= token
;
196 final TOK
nextToken()
201 Token
* t
= token
.next
;
202 memcpy(&token
, t
, Token
.sizeof
);
209 //printf(token.toChars());
213 /***********************
214 * Look ahead at next token's value.
218 return peek(&token
).value
;
221 /***********************
222 * Look 2 tokens ahead at value.
224 final TOK
peekNext2()
226 Token
* t
= peek(&token
);
227 return peek(t
).value
;
230 /****************************
231 * Turn next token in buffer into a token.
233 * t = the token to set the resulting Token to
235 final void scan(Token
* t
)
237 const lastLine
= scanloc
.linnum
;
239 t
.blockComment
= null;
240 t
.lineComment
= null;
245 //printf("p = %p, *p = '%c'\n",p,*p);
251 t
.value
= TOK
.endOfFile
; // end of file
252 // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
255 // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
256 while ((cast(size_t
)p
) % uint.sizeof
)
259 goto LendSkipFourSpaces
;
262 while (*(cast(uint*)p
) == 0x20202020) // ' ' == 0x20
264 // Skip over any remaining space on the line.
272 t
.value
= TOK
.whitespace
;
276 continue; // skip white space
285 t
.value
= TOK
.whitespace
;
289 continue; // skip white space
292 if (*p
!= '\n') // if CR stands by itself
295 if (tokenizeNewlines
)
297 t
.value
= TOK
.endOfLine
;
298 tokenizeNewlines
= false;
306 t
.value
= TOK
.whitespace
;
310 continue; // skip white space
314 if (tokenizeNewlines
)
316 t
.value
= TOK
.endOfLine
;
317 tokenizeNewlines
= false;
324 t
.value
= TOK
.whitespace
;
328 continue; // skip white space
330 if (!isZeroSecond(p
[1])) // if numeric literal does not continue
334 t
.value
= TOK
.int32Literal
;
339 case '1': .. case '9':
340 if (!isDigitSecond(p
[1])) // if numeric literal does not continue
342 t
.unsvalue
= *p
- '0';
344 t
.value
= TOK
.int32Literal
;
352 if (issinglechar(p
[1]) && p
[2] == '\'')
354 t
.unsvalue
= p
[1]; // simple one character literal
355 t
.value
= TOK
.charLiteral
;
360 clexerCharConstant(*t
, 0);
364 t
.value
= charConstant(t
);
373 if (p
[1] == '\'') // C wide character constant
376 if (c
== 'L') // convert L to u or U
377 c
= (wchar_tsize
== 4) ?
'u' : 'U';
379 clexerCharConstant(*t
, c
);
382 else if (p
[1] == '\"') // C wide string literal
386 escapeStringConstant(t
);
387 t
.postfix
= c
== 'L' ?
(wchar_tsize
== 2 ?
'w' : 'd') :
392 else if (p
[1] == '8' && p
[2] == '\"') // C UTF-8 string literal
395 escapeStringConstant(t
);
401 if (Ccompile || p
[1] != '"')
408 wysiwygStringConstant(t
);
416 delimitedStringConstant(t
);
419 else if (p
[1] == '{')
422 tokenStringConstant(t
);
428 escapeStringConstant(t
);
446 /*case 'q': case 'r':*/
492 const u
= decodeUTF();
495 error("char 0x%04x not allowed in identifier", u
);
500 Identifier id
= Identifier
.idPool(cast(char*)t
.ptr
, cast(uint)(p
- t
.ptr
));
502 t
.value
= cast(TOK
)id
.getValue();
506 /* Different keywords for C and D
510 if (t
.value
!= TOK
.identifier
)
512 t
.value
= Ckeywords
[t
.value
]; // filter out D keywords
515 else if (t
.value
>= FirstCKeyword
)
516 t
.value
= TOK
.identifier
; // filter out C keywords
518 else if (*t
.ptr
== '_') // if special identifier token
520 // Lazy initialization
521 TimeStampInfo
.initialize(t
.loc
);
525 t
.ustring
= TimeStampInfo
.date
.ptr
;
528 else if (id
== Id
.TIME
)
530 t
.ustring
= TimeStampInfo
.time
.ptr
;
533 else if (id
== Id
.VENDOR
)
535 t
.ustring
= global
.vendor
.xarraydup
.ptr
;
538 else if (id
== Id
.TIMESTAMP
)
540 t
.ustring
= TimeStampInfo
.timestamp
.ptr
;
542 t
.value
= TOK
.string_
;
544 t
.len
= cast(uint)strlen(t
.ustring
);
546 else if (id
== Id
.VERSIONX
)
548 t
.value
= TOK
.int64Literal
;
549 t
.unsvalue
= global
.versionNumber();
551 else if (id
== Id
.EOFX
)
553 t
.value
= TOK
.endOfFile
;
554 // Advance scanner to end of file
555 while (!(*p
== 0 ||
*p
== 0x1A))
559 //printf("t.value = %d\n",t.value);
568 t
.value
= TOK
.divAssign
;
593 error("unterminated /* */ comment");
596 t
.value
= TOK
.endOfFile
;
601 const u
= decodeUTF();
602 if (u
== PS || u
== LS
)
611 if (p
[-2] == '*' && p
- 3 != t
.ptr
)
617 t
.value
= TOK
.comment
;
620 else if (doDocComment
&& t
.ptr
[2] == '*' && p
- 4 != t
.ptr
)
622 // if /** but not /**/
623 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
624 lastDocLine
= scanloc
.linnum
;
627 case '/': // do // style comments
646 t
.value
= TOK
.comment
;
649 if (doDocComment
&& t
.ptr
[2] == '/')
651 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
652 lastDocLine
= scanloc
.linnum
;
656 t
.value
= TOK
.endOfFile
;
661 const u
= decodeUTF();
662 if (u
== PS || u
== LS
)
678 t
.value
= TOK
.comment
;
681 if (doDocComment
&& t
.ptr
[2] == '/')
683 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
684 lastDocLine
= scanloc
.linnum
;
729 error("unterminated /+ +/ comment");
732 t
.value
= TOK
.endOfFile
;
737 uint u
= decodeUTF();
738 if (u
== PS || u
== LS
)
749 t
.value
= TOK
.comment
;
752 if (doDocComment
&& t
.ptr
[2] == '+' && p
- 4 != t
.ptr
)
754 // if /++ but not /++/
755 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
756 lastDocLine
= scanloc
.linnum
;
770 /* Note that we don't allow ._1 and ._ as being
771 * valid floating point numbers.
776 else if (p
[0] == '.')
781 t
.value
= TOK
.dotDotDot
;
797 t
.value
= TOK
.andAssign
;
802 t
.value
= TOK
.andAnd
;
812 t
.value
= TOK
.orAssign
;
827 t
.value
= TOK
.minAssign
;
832 t
.value
= TOK
.minusMinus
;
847 t
.value
= TOK
.addAssign
;
852 t
.value
= TOK
.plusPlus
;
862 t
.value
= TOK
.lessOrEqual
; // <=
870 t
.value
= TOK
.leftShiftAssign
; // <<=
873 t
.value
= TOK
.leftShift
; // <<
875 else if (*p
== ':' && Ccompile
)
878 t
.value
= TOK
.leftBracket
; // <:
880 else if (*p
== '%' && Ccompile
)
883 t
.value
= TOK
.leftCurly
; // <%
886 t
.value
= TOK
.lessThan
; // <
893 t
.value
= TOK
.greaterOrEqual
; // >=
901 t
.value
= TOK
.rightShiftAssign
; // >>=
909 t
.value
= TOK
.unsignedRightShiftAssign
; // >>>=
912 t
.value
= TOK
.unsignedRightShift
; // >>>
915 t
.value
= TOK
.rightShift
; // >>
918 t
.value
= TOK
.greaterThan
; // >
925 t
.value
= TOK
.notEqual
; // !=
928 t
.value
= TOK
.not; // !
935 t
.value
= TOK
.equal
; // ==
940 t
.value
= TOK
.goesTo
; // =>
943 t
.value
= TOK
.assign
; // =
950 t
.value
= TOK
.concatenateAssign
; // ~=
953 t
.value
= TOK
.tilde
; // ~
963 t
.value
= TOK
.powAssign
; // ^^=
966 t
.value
= TOK
.pow
; // ^^
971 t
.value
= TOK
.xorAssign
; // ^=
974 t
.value
= TOK
.xor; // ^
978 t
.value
= TOK
.leftParenthesis
;
982 t
.value
= TOK
.rightParenthesis
;
986 t
.value
= TOK
.leftBracket
;
990 t
.value
= TOK
.rightBracket
;
994 t
.value
= TOK
.leftCurly
;
998 t
.value
= TOK
.rightCurly
;
1002 t
.value
= TOK
.question
;
1006 t
.value
= TOK
.comma
;
1010 t
.value
= TOK
.semicolon
;
1017 t
.value
= TOK
.colonColon
;
1019 else if (*p
== '>' && Ccompile
)
1022 t
.value
= TOK
.rightBracket
;
1025 t
.value
= TOK
.colon
;
1029 t
.value
= TOK
.dollar
;
1040 t
.value
= TOK
.mulAssign
;
1050 t
.value
= TOK
.modAssign
;
1052 else if (*p
== '>' && Ccompile
)
1055 t
.value
= TOK
.rightCurly
;
1057 else if (*p
== ':' && Ccompile
)
1059 goto case '#'; // %: means #
1066 // https://issues.dlang.org/show_bug.cgi?id=22825
1067 // Special token sequences are terminated by newlines,
1068 // and should not be skipped over.
1069 this.tokenizeNewlines
= true;
1071 if (parseSpecialTokenSequence())
1073 t
.value
= TOK
.pound
;
1082 // Check for start of unicode identifier
1085 if (c
== PS || c
== LS
)
1089 if (tokenizeNewlines
)
1091 t
.value
= TOK
.endOfLine
;
1092 tokenizeNewlines
= false;
1098 if (c
< 0x80 && isprint(c
))
1099 error("character '%c' is not a valid token", c
);
1101 error("character 0x%02x is not a valid token", c
);
1109 final Token
* peek(Token
* ct
)
1116 t
= allocateToken();
1123 /*********************************
1124 * tk is on the opening (.
1125 * Look ahead and return token that is past the closing ).
1127 final Token
* peekPastParen(Token
* tk
)
1129 //printf("peekPastParen()\n");
1138 case TOK
.leftParenthesis
:
1141 case TOK
.rightParenthesis
:
1150 case TOK
.rightCurly
:
1151 if (--curlynest
>= 0)
1167 /*******************************************
1168 * Parse escape sequence.
1170 private uint escapeSequence()
1172 return Lexer
.escapeSequence(token
.loc
, p
, Ccompile
);
1176 * Parse the given string literal escape sequence into a single character.
1177 * D https://dlang.org/spec/lex.html#escape_sequences
1180 * loc = location to use for error messages
1181 * sequence = pointer to string with escape sequence to parse. Updated to
1182 * point past the end of the escape sequence
1183 * Ccompile = true for compile C11 escape sequences
1185 * the escape sequence as a single character
1187 private static dchar escapeSequence(const ref Loc loc
, ref const(char)* sequence
, bool Ccompile
)
1189 const(char)* p
= sequence
; // cache sequence reference on stack
1190 scope(exit
) sequence
= p
;
1235 if (ishex(cast(char)c
))
1241 if (isdigit(cast(char)c
))
1243 else if (islower(c
))
1251 if (!ishex(cast(char)c
))
1253 .error(loc
, "escape hex sequence has %d hex digits instead of %d", n
, ndigits
);
1257 if (ndigits
!= 2 && !utf_isValidDchar(v
))
1259 .error(loc
, "invalid UTF character \\U%08x", v
);
1260 v
= '?'; // recover with valid UTF character
1266 .error(loc
, "undefined escape hex sequence \\%c%c", sequence
[0], c
);
1274 // named character entity
1275 for (const idstart
= ++p
; 1; p
++)
1280 c
= HtmlNamedEntity(idstart
, p
- idstart
);
1283 .error(loc
, "unnamed character entity &%.*s;", cast(int)(p
- idstart
), idstart
);
1289 if (isalpha(*p
) ||
(p
!= idstart
&& isdigit(*p
)))
1291 .error(loc
, "unterminated named entity &%.*s;", cast(int)(p
- idstart
+ 1), idstart
);
1304 if (isoctal(cast(char)c
))
1310 v
= v
* 8 + (c
- '0');
1313 while (++n
< 3 && isoctal(cast(char)c
));
1316 .error(loc
, "escape octal sequence \\%03o is larger than \\377", c
);
1320 .error(loc
, "undefined escape sequence \\%c", c
);
1329 Lex a wysiwyg string. `p` must be pointing to the first character before the
1330 contents of the string literal. The character pointed to by `p` will be used as
1331 the terminating character (i.e. backtick or double-quote).
1333 result = pointer to the token that accepts the result
1335 private void wysiwygStringConstant(Token
* result
)
1337 result
.value
= TOK
.string_
;
1339 auto terminator
= p
[0];
1341 stringbuffer
.setsize(0);
1354 c
= '\n'; // treat EndOfLine as \n character
1359 error("unterminated string constant starting at %s", start
.toChars());
1361 // rewind `p` so it points to the EOF character
1365 if (c
== terminator
)
1367 result
.setString(stringbuffer
);
1368 stringPostfix(result
);
1374 const u
= decodeUTF();
1376 if (u
== PS || u
== LS
)
1378 stringbuffer
.writeUTF8(u
);
1383 stringbuffer
.writeByte(c
);
1388 Lex a delimited string. Some examples of delimited strings are:
1390 q"(foo(xxx))" // "foo(xxx)"
1391 q"[foo$(LPAREN)]" // "foo$(LPAREN)"
1397 It is assumed that `p` points to the opening double-quote '"'.
1399 result = pointer to the token that accepts the result
1401 private void delimitedStringConstant(Token
* result
)
1403 result
.value
= TOK
.string_
;
1405 dchar delimleft
= 0;
1406 dchar delimright
= 0;
1408 uint nestcount
= ~0; // dead assignment, needed to suppress warning
1409 Identifier hereid
= null;
1413 stringbuffer
.setsize(0);
1417 //printf("c = '%c'\n", c);
1431 stringbuffer
.writeUTF8(c
);
1438 c
= '\n'; // treat EndOfLine as \n character
1442 error("unterminated delimited string constant starting at %s", start
.toChars());
1444 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1453 if (c
== PS || c
== LS
)
1471 else if (isalpha(c
) || c
== '_' ||
(c
>= 0x80 && isUniAlpha(c
)))
1473 // Start of identifier; must be a heredoc
1476 scan(&tok
); // read in heredoc identifier
1477 if (tok
.value
!= TOK
.identifier
)
1479 error("identifier expected for heredoc, not %s", tok
.toChars());
1485 //printf("hereid = '%s'\n", hereid.toChars());
1495 error("delimiter cannot be whitespace");
1502 error("heredoc rest of line should be blank");
1510 else if (c
== delimright
)
1517 else if (c
== delimright
)
1519 if (startline
&& (isalpha(c
) || c
== '_' ||
(c
>= 0x80 && isUniAlpha(c
))) && hereid
)
1524 scan(&tok
); // read in possible heredoc identifier
1525 //printf("endid = '%s'\n", tok.ident.toChars());
1526 if (tok
.value
== TOK
.identifier
&& tok
.ident
is hereid
)
1528 /* should check that rest of line is blank
1534 stringbuffer
.writeUTF8(c
);
1542 error("delimited string must end in `%s\"`", hereid
.toChars());
1543 else if (isspace(delimright
))
1544 error("delimited string must end in `\"`");
1546 error("delimited string must end in `%c\"`", delimright
);
1547 result
.setString(stringbuffer
);
1548 stringPostfix(result
);
1552 Lex a token string. Some examples of token strings are:
1554 q{ foo(xxx) } // " foo(xxx) "
1555 q{foo$(LPAREN)} // "foo$(LPAREN)"
1556 q{{foo}"}"} // "{foo}"}""
1558 It is assumed that `p` points to the opening curly-brace.
1560 result = pointer to the token that accepts the result
1562 private void tokenStringConstant(Token
* result
)
1564 result
.value
= TOK
.string_
;
1567 const start
= loc();
1569 inTokenStringConstant
++;
1570 scope(exit
) inTokenStringConstant
--;
1580 case TOK
.rightCurly
:
1583 result
.setString(pstart
, p
- 1 - pstart
);
1584 stringPostfix(result
);
1589 error("unterminated token string constant starting at %s", start
.toChars());
1599 Scan a quoted string while building the processed string value by
1600 handling escape sequences. The result is returned in the given `t` token.
1601 This function assumes that `p` currently points to the opening quote
1604 t = the token to set the resulting string to
1606 * D https://dlang.org/spec/lex.html#double_quoted_strings
1609 private void escapeStringConstant(Token
* t
)
1611 t
.value
= TOK
.string_
;
1613 const start
= loc();
1614 const tc
= *p
++; // opening quote
1615 stringbuffer
.setsize(0);
1631 c
= escapeSequence();
1632 stringbuffer
.writeUTF8(c
);
1635 c
= escapeSequence();
1647 c
= '\n'; // treat EndOfLine as \n character
1656 t
.setString(stringbuffer
);
1662 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1665 error("unterminated string constant starting at %s", start
.toChars());
1673 if (c
== LS || c
== PS
)
1681 stringbuffer
.writeUTF8(c
);
1686 stringbuffer
.writeByte(c
);
1690 /**************************************
1692 * https://dlang.org/spec/lex.html#characterliteral
1694 private TOK
charConstant(Token
* t
)
1696 TOK tk
= TOK
.charLiteral
;
1697 //printf("Lexer::charConstant\n");
1706 t
.unsvalue
= escapeSequence();
1707 tk
= TOK
.wcharLiteral
;
1711 t
.unsvalue
= escapeSequence();
1712 tk
= TOK
.dcharLiteral
;
1715 t
.unsvalue
= escapeSequence();
1727 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1731 error("unterminated character constant");
1740 if (c
== LS || c
== PS
)
1742 if (c
< 0xD800 ||
(c
>= 0xE000 && c
< 0xFFFE))
1743 tk
= TOK
.wcharLiteral
;
1745 tk
= TOK
.dcharLiteral
;
1752 while (*p
!= '\'' && *p
!= 0x1A && *p
!= 0 && *p
!= '\n' &&
1753 *p
!= '\r' && *p
!= ';' && *p
!= ')' && *p
!= ']' && *p
!= '}')
1759 if (c
== LS || c
== PS
)
1770 error("character constant has multiple characters");
1774 error("unterminated character constant");
1782 /***************************************
1783 * Lex C character constant.
1784 * Parser is on the opening quote.
1786 * t = token to fill in
1787 * prefix = one of `u`, `U` or 0.
1791 private void clexerCharConstant(ref Token t
, char prefix
)
1793 escapeStringConstant(&t
);
1794 const(char)[] str = t
.ustring
[0 .. t
.len
];
1795 const n
= str.length
;
1799 error(loc
, "empty character constant");
1800 t
.value
= TOK
.semicolon
;
1808 if (n
== 1) // fast case
1813 error(loc
, "max number of chars in character literal is 4, had %d",
1818 (cast(char*)&u
)[n
- 1 - i
] = c
;
1825 auto msg
= utf_decodeChar(str, idx
, d1
);
1827 if (idx
< n
&& !msg
)
1828 msg
= utf_decodeChar(str, idx
, d2
);
1830 error(loc
, "%s", msg
);
1832 error(loc
, "max number of chars in 16 bit character literal is 2, had %d",
1834 else if (d1
> 0x1_0000)
1835 error(loc
, "%d does not fit in 16 bits", d1
);
1836 else if (d2
> 0x1_0000)
1837 error(loc
, "%d does not fit in 16 bits", d2
);
1840 u
= (d1
<< 16) | d2
;
1846 auto msg
= utf_decodeChar(str, idx
, d
);
1848 error(loc
, "%s", msg
);
1850 error(loc
, "max number of chars in 32 bit character literal is 1, had %d",
1858 t
.value
= n
== 1 ? TOK
.charLiteral
: TOK
.int32Literal
;
1862 /***************************************
1863 * Get postfix of string literal.
1865 private void stringPostfix(Token
* t
) pure @nogc
1881 /**************************************
1883 * If it's an integer, store it in tok.TKutok.Vlong.
1884 * integers can be decimal, octal or hex
1885 * Handle the suffixes U, UL, LU, L, etc.
1886 * If it's double, store it in tok.TKutok.Vdouble.
1891 private TOK
number(Token
* t
)
1895 uinteger_t n
= 0; // unsigned >=64 bit integer type
1898 bool overflow
= false;
1899 bool anyBinaryDigitsNoSingleUS
= false;
1900 bool anyHexDigitsNoSingleUS
= false;
1901 char errorDigit
= 0;
1922 errorDigit
= cast(char) c
;
1933 error("binary constants not allowed");
1939 goto Ldone
; // if ".."
1940 if (isalpha(p
[1]) || p
[1] == '_' || p
[1] & 0x80)
1942 if (Ccompile
&& (p
[1] == 'f' || p
[1] == 'F' || p
[1] == 'l' || p
[1] == 'L'))
1943 goto Lreal
; // if `0.f` or `0.L`
1944 goto Ldone
; // if ".identifier" or ".unicode"
1946 goto Lreal
; // '.' is part of current token
1953 error("embedded `_` not allowed");
1998 if (c
== 'e' || c
== 'E' || c
== 'f' || c
== 'F')
2012 goto Ldone
; // if ".."
2013 if (base
<= 10 && n
> 0 && (isalpha(p
[1]) || p
[1] == '_' || p
[1] & 0x80))
2015 if (Ccompile
&& base
== 10 &&
2016 (p
[1] == 'e' || p
[1] == 'E' || p
[1] == 'f' || p
[1] == 'F' || p
[1] == 'l' || p
[1] == 'L'))
2017 goto Lreal
; // if `1.e6` or `1.f` or `1.L`
2018 goto Ldone
; // if ".identifier" or ".unicode"
2020 if (base
== 16 && (!ishex(p
[1]) || p
[1] == '_' || p
[1] & 0x80))
2021 goto Ldone
; // if ".identifier" or ".unicode"
2023 goto Ldone
; // if ".identifier" or ".unicode"
2024 goto Lreal
; // otherwise as part of a floating point literal
2039 // got a digit here, set any necessary flags, check for errors
2040 anyHexDigitsNoSingleUS
= true;
2041 anyBinaryDigitsNoSingleUS
= true;
2042 if (!errorDigit
&& d
>= base
)
2044 errorDigit
= cast(char) c
;
2046 // Avoid expensive overflow check if we aren't at risk of overflow
2047 if (n
<= 0x0FFF_FFFF_FFFF_FFFFUL
)
2051 import core
.checkedint
: mulu
, addu
;
2053 n
= mulu(n
, base
, overflow
);
2054 n
= addu(n
, d
, overflow
);
2060 error("%s digit expected, not `%c`", base
== 2 ?
"binary".ptr
:
2061 base
== 8 ?
"octal".ptr
:
2062 "decimal".ptr
, errorDigit
);
2065 if (overflow
&& !err
)
2067 error("integer overflow");
2070 if ((base
== 2 && !anyBinaryDigitsNoSingleUS
) ||
2071 (base
== 16 && !anyHexDigitsNoSingleUS
))
2072 error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p
- start
), start
, 2, start
);
2077 return cnumber(base
, n
);
2082 decimal
= 1, // decimal
2083 unsigned
= 2, // u or U suffix
2084 long_
= 4, // L suffix
2087 FLAGS flags
= (base
== 10) ? FLAGS
.decimal
: FLAGS
.none
;
2088 // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2101 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
2107 if ((flags
& f
) && !err
)
2109 error("unrecognized token");
2112 flags
= cast(FLAGS
)(flags | f
);
2119 if (base
== 8 && n
>= 8)
2122 // can't translate invalid octal value, just show a generic message
2123 error("octal literals larger than 7 are no longer supported");
2125 error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
2126 n
, cast(int)(p
- psuffix
), psuffix
, n
, cast(int)(p
- psuffix
), psuffix
);
2132 /* Octal or Hexadecimal constant.
2133 * First that fits: int, uint, long, ulong
2135 if (n
& 0x8000000000000000L
)
2136 result
= TOK
.uns64Literal
;
2137 else if (n
& 0xFFFFFFFF00000000L
)
2138 result
= TOK
.int64Literal
;
2139 else if (n
& 0x80000000)
2140 result
= TOK
.uns32Literal
;
2142 result
= TOK
.int32Literal
;
2145 /* First that fits: int, long, long long
2147 if (n
& 0x8000000000000000L
)
2149 result
= TOK
.uns64Literal
;
2151 else if (n
& 0xFFFFFFFF80000000L
)
2152 result
= TOK
.int64Literal
;
2154 result
= TOK
.int32Literal
;
2156 case FLAGS
.unsigned
:
2157 case FLAGS
.decimal | FLAGS
.unsigned
:
2158 /* First that fits: uint, ulong
2160 if (n
& 0xFFFFFFFF00000000L
)
2161 result
= TOK
.uns64Literal
;
2163 result
= TOK
.uns32Literal
;
2165 case FLAGS
.decimal | FLAGS
.long_
:
2166 if (n
& 0x8000000000000000L
)
2170 error("signed integer overflow");
2173 result
= TOK
.uns64Literal
;
2176 result
= TOK
.int64Literal
;
2179 if (n
& 0x8000000000000000L
)
2180 result
= TOK
.uns64Literal
;
2182 result
= TOK
.int64Literal
;
2184 case FLAGS
.unsigned | FLAGS
.long_
:
2185 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.long_
:
2186 result
= TOK
.uns64Literal
;
2191 printf("%x\n", flags
);
2198 /**************************************
2199 * Lex C integer-suffix
2201 * base = number base
2202 * n = raw integer value
2206 private TOK
cnumber(int base
, uinteger_t n
)
2209 * Parse trailing suffixes:
2216 octalhex
= 1, // octal or hexadecimal
2217 decimal
= 2, // decimal
2218 unsigned
= 4, // u or U suffix
2219 long_
= 8, // l or L suffix
2220 llong
= 0x10 // ll or LL
2222 FLAGS flags
= (base
== 10) ? FLAGS
.decimal
: FLAGS
.octalhex
;
2241 f
= FLAGS
.long_ | FLAGS
.llong
;
2250 if ((flags
& f
) && !err
)
2252 error("duplicate integer suffixes");
2255 flags
= cast(FLAGS
)(flags | f
);
2258 TOK result
= TOK
.int32Literal
; // default
2261 /* Since D doesn't have a variable sized `long` or `unsigned long` type,
2262 * this code deviates from C by picking D int, uint, long, or ulong instead
2265 case FLAGS
.octalhex
:
2266 /* Octal or Hexadecimal constant.
2267 * First that fits: int, unsigned, long, unsigned long,
2268 * long long, unsigned long long
2270 if (n
& 0x8000000000000000L
)
2271 result
= TOK
.uns64Literal
; // unsigned long
2272 else if (n
& 0xFFFFFFFF00000000L
)
2273 result
= TOK
.int64Literal
; // long
2274 else if (n
& 0x80000000)
2275 result
= TOK
.uns32Literal
;
2277 result
= TOK
.int32Literal
;
2281 /* First that fits: int, long, long long
2283 if (n
& 0x8000000000000000L
)
2284 result
= TOK
.uns64Literal
; // unsigned long
2285 else if (n
& 0xFFFFFFFF80000000L
)
2286 result
= TOK
.int64Literal
; // long
2288 result
= TOK
.int32Literal
;
2291 case FLAGS
.octalhex | FLAGS
.unsigned
:
2292 case FLAGS
.decimal | FLAGS
.unsigned
:
2293 /* First that fits: unsigned, unsigned long, unsigned long long
2295 if (n
& 0xFFFFFFFF00000000L
)
2296 result
= TOK
.uns64Literal
; // unsigned long
2298 result
= TOK
.uns32Literal
;
2301 case FLAGS
.decimal | FLAGS
.long_
:
2302 /* First that fits: long, long long
2304 if (longsize
== 4 || long_longsize
== 4)
2306 if (n
& 0xFFFFFFFF_80000000L)
2307 result
= TOK
.int64Literal
;
2309 result
= TOK
.int32Literal
; // long
2313 result
= TOK
.int64Literal
; // long
2317 case FLAGS
.octalhex | FLAGS
.long_
:
2318 /* First that fits: long, unsigned long, long long,
2319 * unsigned long long
2321 if (longsize
== 4 || long_longsize
== 4)
2323 if (n
& 0x8000000000000000L
)
2324 result
= TOK
.uns64Literal
;
2325 else if (n
& 0xFFFFFFFF00000000L
)
2326 result
= TOK
.int64Literal
;
2327 else if (n
& 0x80000000)
2328 result
= TOK
.uns32Literal
; // unsigned long
2330 result
= TOK
.int32Literal
; // long
2334 if (n
& 0x80000000_00000000L)
2335 result
= TOK
.uns64Literal
; // unsigned long
2337 result
= TOK
.int64Literal
; // long
2341 case FLAGS
.octalhex | FLAGS
.unsigned | FLAGS
.long_
:
2342 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.long_
:
2343 /* First that fits: unsigned long, unsigned long long
2345 if (longsize
== 4 || long_longsize
== 4)
2347 if (n
& 0xFFFFFFFF00000000L
)
2348 result
= TOK
.uns64Literal
;
2350 result
= TOK
.uns32Literal
; // unsigned long
2354 result
= TOK
.uns64Literal
; // unsigned long
2358 case FLAGS
.octalhex | FLAGS
.long_ | FLAGS
.llong
:
2359 /* First that fits: long long, unsigned long long
2361 if (n
& 0x8000000000000000L
)
2362 result
= TOK
.uns64Literal
;
2364 result
= TOK
.int64Literal
;
2367 case FLAGS
.decimal | FLAGS
.long_ | FLAGS
.llong
:
2370 result
= TOK
.int64Literal
;
2373 case FLAGS
.octalhex | FLAGS
.long_ | FLAGS
.unsigned | FLAGS
.llong
:
2374 case FLAGS
.decimal | FLAGS
.long_ | FLAGS
.unsigned | FLAGS
.llong
:
2375 result
= TOK
.uns64Literal
;
2379 debug printf("%x\n",flags
);
2385 /**************************************
2386 * Read in characters, converting them to real.
2388 * Exponent overflow not detected.
2389 * Too much requested precision is not detected.
2391 private TOK
inreal(Token
* t
)
2393 //printf("Lexer::inreal()\n");
2396 assert(*p
== '.' ||
isdigit(*p
));
2398 bool isWellformedString
= true;
2399 stringbuffer
.setsize(0);
2407 if (c
== 'x' || c
== 'X')
2413 // Digits to left of '.'
2421 if (isdigit(c
) ||
(hex
&& isxdigit(c
)) || c
== '_')
2428 // Digits to right of '.'
2431 if (isdigit(c
) ||
(hex
&& isxdigit(c
)) || c
== '_')
2438 if (c
== 'e' || c
== 'E' ||
(hex
&& (c
== 'p' || c
== 'P')))
2441 if (c
== '-' || c
== '+')
2445 bool anyexp
= false;
2457 error("embedded `_` in numeric literals not allowed");
2463 error("missing exponent");
2464 isWellformedString
= false;
2471 error("exponent required for hex float");
2472 isWellformedString
= false;
2478 stringbuffer
.writeByte(*pstart
);
2481 stringbuffer
.writeByte(0);
2482 auto sbufptr
= cast(const(char)*)stringbuffer
[].ptr
;
2484 bool isOutOfRange
= false;
2485 t
.floatvalue
= (isWellformedString ? CTFloat
.parse(sbufptr
, &isOutOfRange
) : CTFloat
.zero
);
2490 if (isWellformedString
&& !isOutOfRange
)
2491 isOutOfRange
= Port
.isFloat32LiteralOutOfRange(sbufptr
);
2492 result
= TOK
.float32Literal
;
2496 if (isWellformedString
&& !isOutOfRange
)
2497 isOutOfRange
= Port
.isFloat64LiteralOutOfRange(sbufptr
);
2498 result
= TOK
.float64Literal
;
2502 error("use 'L' suffix instead of 'l'");
2506 if (Ccompile
&& long_doublesize
== 8)
2508 result
= TOK
.float80Literal
;
2511 if ((*p
== 'i' ||
*p
== 'I') && !Ccompile
)
2514 error("use 'i' suffix instead of 'I'");
2518 case TOK
.float32Literal
:
2519 result
= TOK
.imaginary32Literal
;
2521 case TOK
.float64Literal
:
2522 result
= TOK
.imaginary64Literal
;
2524 case TOK
.float80Literal
:
2525 result
= TOK
.imaginary80Literal
;
2531 const isLong
= (result
== TOK
.float80Literal || result
== TOK
.imaginary80Literal
);
2532 if (isOutOfRange
&& !isLong
&& (!Ccompile || hex
))
2534 /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex
2536 const char* suffix
= (result
== TOK
.float32Literal || result
== TOK
.imaginary32Literal
) ?
"f" : "";
2537 error(scanloc
, "number `%s%s` is not representable", sbufptr
, suffix
);
2543 case TOK
.float32Literal
:
2544 case TOK
.float64Literal
:
2545 case TOK
.float80Literal
:
2546 case TOK
.imaginary32Literal
:
2547 case TOK
.imaginary64Literal
:
2548 case TOK
.imaginary80Literal
:
2557 final Loc
loc() pure @nogc
2559 scanloc
.charnum
= cast(uint)(1 + p
- line
);
2561 scanloc
.fileOffset
= cast(uint)(p
- base
);
2565 final void error(const(char)* format
, ...)
2568 va_start(args
, format
);
2569 .verror(token
.loc
, format
, args
);
// Reports an error at an explicitly supplied location.
// Params:
//   loc    = location to attach to the diagnostic
//   format = printf-style format string
//   ...    = arguments for `format`, collected with `va_start` and forwarded
//            to the module-level `verror` (note the leading `.`)
2573 final void error(const ref Loc loc
, const(char)* format
, ...)
2576 va_start(args
, format
);
2577 .verror(loc
, format
, args
);
// Reports a deprecation at the location of the current token (`token.loc`).
// Params:
//   format = printf-style format string
//   ...    = arguments for `format`, collected with `va_start` and forwarded
//            to the module-level `vdeprecation` (note the leading `.`)
2581 final void deprecation(const(char)* format
, ...)
2584 va_start(args
, format
);
2585 .vdeprecation(token
.loc
, format
, args
);
2589 /***************************************
2590 * Parse special token sequence:
2592 * true if the special token sequence was handled
2594 * https://dlang.org/spec/lex.html#special-token-sequence
// Dispatches on the token `n` that follows a `#`:
//   - identifier `line`      -> handled by `poundLine(n, false)` (i.e. not linemarker format)
//   - some other identifier  -> a warning that the C preprocessor directive is not supported
//                               (the exact guard around the warning is not visible in this excerpt)
//   - keyword `if`           -> an error pointing the user at `version` / `static if`
// How `n` and `locx` are obtained, and the return statements, are not visible here.
2596 bool parseSpecialTokenSequence()
2600 if (n
.value
== TOK
.identifier
)
2602 if (n
.ident
== Id
.line
)
2604 poundLine(n
, false);
2610 warning(locx
, "C preprocessor directive `#%s` is not supported", n
.ident
.toChars());
2613 else if (n
.value
== TOK
.if_
)
2615 error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
2620 /*********************************************
2621 * Parse line/file preprocessor directive:
2622 * #line linnum [filespec]
2623 * Allow __LINE__ for linnum, and __FILE__ for filespec.
2624 * Accept linemarker format:
2625 * # linnum [filespec] {flags}
2626 * There can be zero or more flags, which are one of the digits 1..4, and
2627 * must be in ascending order. The flags are ignored.
2629 * tok = token we're on, which is linnum of linemarker
2630 * linemarker = true if line marker format and lexer is on linnum
2632 * linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
// Parses the operands of a `#line` directive or a `# linnum` linemarker and, unless the
// lexer is inside a token string constant, stores the resulting line number and file
// name into `scanloc`.
// Params:
//   tok        = token the lexer is currently on (used and advanced while parsing)
//   linemarker = true for the linemarker format, which additionally allows flag digits 1..4
// Several control-flow lines (loop, switch/case labels, braces) are not visible in this
// excerpt, so the notes below describe only the statements that are.
2634 final void poundLine(ref Token tok
, bool linemarker
)
// Defaults: keep the current line number and no explicit file name.
2636 auto linnum
= this.scanloc
.linnum
;
2637 const(char)* filespec
= null;
// The line number must be an integer literal ...
2642 if (tok
.value
== TOK
.int32Literal || tok
.value
== TOK
.int64Literal
)
// ... that survives a round trip through `int`; otherwise it is reported as out of range.
2644 const lin
= cast(int)(tok
.unsvalue
);
2645 if (lin
!= tok
.unsvalue
)
// NOTE(review): `%lld` is a signed conversion but the argument is cast to `ulong`;
// `%llu` looks like the matching specifier - confirm before changing.
2647 error(tok
.loc
, "line number `%lld` out of range", cast(ulong)tok
.unsvalue
);
2654 else if (tok
.value
== TOK
.line
) // #line __LINE__
2659 error(tok
.loc
, "positive integer argument expected following `#line`");
2660 if (tok
.value
!= TOK
.endOfLine
)
// The new position is committed only when not lexing inside a token string constant.
2671 if (!inTokenStringConstant
)
2673 this.scanloc
.linnum
= linnum
;
2675 this.scanloc
.filename
= filespec
;
// A file name (or flags) has already been seen when `filespec || flags` is true; the
// branch bodies for that case are not visible here.
2679 if (filespec || flags
)
// Copies the current file name so `filespec` owns its own string.
2681 filespec
= mem
.xstrdup(scanloc
.filename
);
2684 if (filespec || flags
)
// Only a plain double-quoted string without a postfix is checked for here.
2686 if (tok
.ptr
[0] != '"' || tok
.postfix
!= 0)
2688 filespec
= tok
.ustring
;
2690 case TOK
.int32Literal
:
// In linemarker format, integers 1..4 are flags; they are only recorded as "seen".
2693 if (linemarker
&& tok
.unsvalue
>= 1 && tok
.unsvalue
<= 4)
2695 flags
= true; // linemarker flags seen
// Error reporting: the message depends on whether a file name was parsed and on the format.
2704 if (filespec
is null)
2705 error(tok
.loc
, "invalid filename for `#line` directive");
2706 else if (linemarker
)
2707 error(tok
.loc
, "invalid flag for line marker directive");
2709 error(tok
.loc
, "found `%s` when expecting new line following `#line` directive", tok
.toChars());
2710 if (tok
.value
!= TOK
.endOfLine
)
2714 /***************************************
2715 * Scan forward to start of next line.
// Advances the scan position to the start of the next line.
// Visible behavior in this excerpt:
//   - one case returns without advancing `p`
//   - multi-byte input is decoded with `decodeUTF()` and compared against the Unicode
//     paragraph/line separators `PS` / `LS`
//   - `tokenizeNewlines` is reset to false
// The surrounding loop and the per-character cases are not visible here.
2717 final void skipToNextLine()
2725 return; // do not advance p
2740 const u
= decodeUTF();
2741 if (u
== PS || u
== LS
)
2753 tokenizeNewlines
= false;
2756 /********************************************
2757 * Decode UTF character.
2758 * Issue error messages for invalid sequences.
2759 * Return decoded character, advance p to last character in UTF sequence.
// Decodes one UTF-8 encoded character and reports malformed sequences through `error`.
// Declared to return `uint`; the return statement is not visible in this excerpt.
2761 private uint decodeUTF()
2765 // Check length of remaining string up to 4 UTF-8 characters
// `len` grows from 1 to at most 4 and stops early at a 0 byte, so the slice handed to
// the decoder never extends past a terminating 0.
2767 for (len
= 1; len
< 4 && s
[len
]; len
++)
// `utf_decodeChar` returns a message slice; it is printed below with `%.*s`
// (explicit length + pointer). The condition guarding the error call is not visible here.
2772 const msg
= utf_decodeChar(s
[0 .. len
], idx
, u
);
2776 error("%.*s", cast(int)msg
.length
, msg
.ptr
);
2781 /***************************************************
2782 * Parse doc comment embedded between t.ptr and p.
2783 * Remove trailing blanks and tabs from lines.
2784 * Replace all newlines with \n.
2785 * Remove leading comment character from each line.
2786 * Decide if it's a lineComment or a blockComment.
2787 * Append to previous one for this token.
2789 * If newParagraph is true, an extra newline will be
2790 * added between adjoining doc comments.
// Extracts the doc comment text lying between `t.ptr` and the scan pointer `p`,
// canonicalizes it into a buffer, and stores it on the token.
// Params:
//   t            = token whose `lineComment` / `blockComment` field receives the text
//   lineComment  = nonzero when the comment may be a line comment (see the `dc` selection below)
//   newParagraph = forwarded to `combineComments` when merging with an earlier comment
// Many short lines (declarations of `buf`, `s`, `c`, `linestart`, loop bodies, braces)
// are not visible in this excerpt; the notes below describe only what is.
2792 private void getDocComment(Token
* t
, uint lineComment
, bool newParagraph
) pure
2794 /* ct tells us which kind of comment it is: '/', '*', or '+'
// The third character of the token text is the comment-type character.
2796 const ct
= t
.ptr
[2];
2797 /* Start of comment text skips over / * *, / + +, or / / /
2799 const(char)* q
= t
.ptr
+ 3; // start of comment text
// The comment ends at the current scan position.
2800 const(char)* qend
= p
;
2801 if (ct
== '*' || ct
== '+')
2803 /* Scan over initial row of ****'s or ++++'s or ////'s
2805 for (; q
< qend
; q
++)
2810 /* Remove leading spaces until start of the comment
2815 while (q
< qend
&& (*q
== ' ' ||
*q
== '\t'))
2823 if (q
< qend
&& *q
== '\n')
2827 else if (*q
== '\n')
2833 /* Remove trailing row of ****'s or ++++'s
// Note this loop shrinks the range from the back by decrementing `qend`.
2837 for (; q
< qend
; qend
--)
2843 /* Comment is now [q .. qend].
2844 * Canonicalize it into buf[].
// Nested helper: walks back over trailing ' ' and '\t' in `s`. What `s` refers to and
// how the shortened length is applied are not visible in this excerpt.
2848 void trimTrailingWhitespace()
2851 auto len
= s
.length
;
2852 while (len
&& (s
[len
- 1] == ' ' || s
[len
- 1] == '\t'))
// Main canonicalization pass over the comment text.
2857 for (; q
< qend
; q
++)
// A comment-type character at the start of a line is treated as decoration.
2864 if (linestart
&& c
== ct
)
2867 /* Trim preceding whitespace up to preceding \n
2869 trimTrailingWhitespace();
2878 continue; // skip the \r
// Bytes 128,168 and 128,169 are the trailing bytes of the UTF-8 encodings of
// U+2028 / U+2029; presumably the lead byte is checked on a line not visible here.
2884 if (q
[1] == 128 && (q
[2] == 168 || q
[2] == 169))
2893 c
= '\n'; // replace all newlines with \n
2897 /* Trim trailing whitespace
2899 trimTrailingWhitespace();
2904 /* Trim trailing whitespace (if the last line does not have newline)
2906 trimTrailingWhitespace();
2908 // Always end with a newline
2910 if (s
.length
== 0 || s
[$ - 1] != '\n')
2911 buf
.writeByte('\n');
2913 // It's a line comment if the start of the doc comment comes
2914 // after other non-whitespace on the same line.
// `dc` points at the token field that will hold the result.
2915 auto dc
= (lineComment
&& anyToken
) ?
&t
.lineComment
: &t
.blockComment
;
2916 // Combine with previous doc comment, if any
// Two outcomes are visible: merge with the existing text via `combineComments`, or take
// the buffer's contents directly; the condition choosing between them is not visible here.
2918 *dc
= combineComments(*dc
, buf
[], newParagraph
).toDString();
2920 *dc
= buf
.extractSlice(true);
2923 /********************************************
2924 * Combine two document comments into one,
2925 * separated by an extra newline if newParagraph is true.
// Concatenates two doc comments into one newly allocated C string.
// Params:
//   c1           = first comment
//   c2           = second comment, placed at the end of the result
//   newParagraph = when true, one extra '\n' is budgeted between the two parts
// The result buffer is sized `retSize + 1` and obtained from `mem.xmalloc_noscan`.
// Early exits, the conditions guarding the '\n' writes, and the return statement are
// not visible in this excerpt.
2927 static const(char)* combineComments(const(char)[] c1
, const(char)[] c2
, bool newParagraph
) pure
2929 //debug printf("Lexer::combineComments('%*.s', '%*.s', '%i')\n", cast(int) c1.length, c1.ptr, cast(int) c2.length, c2.ptr, newParagraph);
2930 const(int) newParagraphSize
= newParagraph ?
1 : 0; // Size of the combining '\n'
// `insertNewLine` budgets a separator for the case where `c1` is non-empty and does
// not already end in '\n' (the assignment itself is on a line not visible here).
2936 int insertNewLine
= 0;
2937 if (c1
.length
&& c1
[$ - 1] != '\n')
// Total payload length; one more byte is allocated on top of this.
2939 const retSize
= c1
.length
+ insertNewLine
+ newParagraphSize
+ c2
.length
;
2940 auto p
= cast(char*)mem
.xmalloc_noscan(retSize
+ 1);
// Layout: [c1][optional '\n'][optional paragraph '\n'][c2]
2941 p
[0 .. c1
.length
] = c1
[];
2943 p
[c1
.length
] = '\n';
2945 p
[c1
.length
+ insertNewLine
] = '\n';
// `c2` is copied so that it ends exactly at `retSize`.
2946 p
[retSize
- c2
.length
.. retSize
] = c2
[];
2951 /**************************
2952 * `p` should be at start of next line
2954 private void endOfLine() pure @nogc @safe
2962 /******************************* Private *****************************************/
/// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
private struct TimeStampInfo
{
    /// Set by the first call to `initialize`; later calls return immediately.
    private __gshared bool initdone = false;

    // Note: Those properties need to be guarded by a call to `initialize`
    // The API isn't safe, and quite brittle, but it was left this way
    // over performance concerns.
    // This is currently only called once, from the lexer.
    __gshared char[11 + 1] date;      /// "Mmm dd yyyy" plus terminating 0
    __gshared char[8 + 1] time;       /// "hh:mm:ss" plus terminating 0
    __gshared char[24 + 1] timestamp; /// "Www Mmm dd hh:mm:ss yyyy" plus terminating 0

    /**
     * Fills `date`, `time` and `timestamp` once per process.
     *
     * The time is taken from the environment variable `SOURCE_DATE_EPOCH` when it
     * is set (reproducible builds), otherwise from the C library clock.
     *
     * Params:
     *  loc = location used for the diagnostic when `SOURCE_DATE_EPOCH` is not a
     *        valid number
     */
    public static void initialize(const ref Loc loc) nothrow
    {
        if (initdone)
            return;

        initdone = true;
        time_t ct;
        // https://issues.dlang.org/show_bug.cgi?id=20444
        if (auto p = getenv("SOURCE_DATE_EPOCH"))
        {
            if (!ct.parseDigits(p.toDString()))
                error(loc, "value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p);
        }
        else
            .time(&ct); // leading `.`: the C function, not the `time` buffer above
        // ctime() yields "Www Mmm dd hh:mm:ss yyyy\n"; the offsets below slice that layout.
        const p = ctime(&ct);
        assert(p);
        // The precisions cap every copy at the buffer size minus the terminating 0.
        sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);
        sprintf(&time[0], "%.8s", p + 11);
        sprintf(&timestamp[0], "%.24s", p);
    }
}
// Unicode code points that `skipToNextLine` compares against the result of `decodeUTF()`.
3002 private enum LS
= 0x2028; // UTF line separator
3003 private enum PS
= 0x2029; // UTF paragraph separator
3005 /********************************************
3006 * Do our own char maps
// Character classification table, built by an immediately-invoked function literal.
// Each entry is a bit set of `CM*` flags for the character with that index:
//   - '0'..'7'                 get `CMoctal`
//   - alphanumerics and '_'    get `CMidchar`
//   - further cases add `CMzerosecond`, `CMdigitsecond` and `CMsinglechar`; the `switch`
//     labels selecting those characters are mostly not visible in this excerpt.
// The table is read by the `is*` predicates that follow it in this file.
3008 private static immutable cmtable
= ()
3011 foreach (const c
; 0 .. table
.length
)
3013 if ('0' <= c
&& c
<= '7')
3014 table
[c
] |
= CMoctal
;
3017 if (c_isalnum(c
) || c
== '_')
3018 table
[c
] |
= CMidchar
;
3024 table
[c
] |
= CMzerosecond
;
3027 case '0': .. case '9':
3036 table
[c
] |
= CMzerosecond | CMdigitsecond
;
3054 table
[c
] |
= CMsinglechar
;
// Bit flags stored in `cmtable`; each is a distinct single bit so they can be OR-ed together.
// (`CMoctal` and `CMhex`, used elsewhere in this file, are declared on lines not visible here.)
3065 enum CMidchar
= 0x4;
3066 enum CMzerosecond
= 0x8;
3067 enum CMdigitsecond
= 0x10;
3068 enum CMsinglechar
= 0x20;
/// Returns: true if the `CMoctal` flag is set for `c` in `cmtable`.
private bool isoctal(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMoctal) != 0;
}
/// Returns: true if the `CMhex` flag is set for `c` in `cmtable`.
private bool ishex(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMhex) != 0;
}
/// Returns: true if the `CMidchar` flag is set for `c` in `cmtable`.
private bool isidchar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMidchar) != 0;
}
/// Returns: true if the `CMzerosecond` flag is set for `c` in `cmtable`.
private bool isZeroSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMzerosecond) != 0;
}
/// Returns: true if the `CMdigitsecond` flag is set for `c` in `cmtable`.
private bool isDigitSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMdigitsecond) != 0;
}
/// Returns: true if the `CMsinglechar` flag is set for `c` in `cmtable`.
private bool issinglechar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMsinglechar) != 0;
}
/// Returns: true if `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`).
private bool c_isxdigit(const int c) pure @nogc @safe
{
    if ('0' <= c && c <= '9')
        return true;
    if ('a' <= c && c <= 'f')
        return true;
    return 'A' <= c && c <= 'F';
}
/// Returns: true if `c` is an ASCII letter or decimal digit (`0-9`, `a-z`, `A-Z`).
private bool c_isalnum(const int c) pure @nogc @safe
{
    if ('0' <= c && c <= '9')
        return true;
    if ('a' <= c && c <= 'z')
        return true;
    return 'A' <= c && c <= 'Z';
}
3115 /******************************* Unittest *****************************************/
// Test scaffolding for valid escape sequences.
// `assertDiagnosticHandler` is installed as the global `diagnosticHandler` for the
// duration of the test; its body is not visible in this excerpt.
3120 nothrow bool assertDiagnosticHandler(const ref Loc loc
, Color headerColor
, const(char)* header
,
3121 const(char)* format
, va_list ap
, const(char)* p1
, const(char)* p2
)
3125 diagnosticHandler
= &assertDiagnosticHandler
;
// Helper: lexes `sequence` (the text after the backslash) with `Lexer.escapeSequence`
// and checks both the decoded value and that the whole input was consumed.
3127 static void test(T
)(string sequence
, T expected
, bool Ccompile
= false)
3129 auto p
= cast(const(char)*)sequence
.ptr
;
3130 assert(expected
== Lexer
.escapeSequence(Loc
.initial
, p
, Ccompile
));
// `p` is advanced by `escapeSequence`; it must end exactly at the end of the input.
3131 assert(p
== sequence
.ptr
+ sequence
.length
);
// Octal, \u, \U and named-entity escapes (further cases are on lines not shown here).
test(`357`, '\357');

test(`u1234`, '\u1234');
test(`uf0e4`, '\uf0e4');

test(`U0001f603`, '\U0001f603');

// Named character entity: the input is the entity text itself, not a bare quote.
test(`&quot;`, '"');

// Restore the default diagnostic handling for subsequent tests.
diagnosticHandler = null;
// Diagnostic handler used by the invalid-escape-sequence tests: it checks that the
// diagnostic is classified as an error and that its formatted text equals `expected`
// (a variable declared on a line not visible in this excerpt).
3176 nothrow bool expectDiagnosticHandler(const ref Loc loc
, Color headerColor
, const(char)* header
,
3177 const(char)* format
, va_list ap
, const(char)* p1
, const(char)* p2
)
3179 assert(cast(Classification
)headerColor
== Classification
.error
);
// NOTE(review): `vsprintf` writes into a fixed 100-byte stack buffer without a bound;
// this assumes every expected message is shorter than that - confirm when adding cases.
3182 char[100] buffer
= void;
// `vsprintf` returns the number of characters written, which is used as the slice length.
3183 auto actual
= buffer
[0 .. vsprintf(buffer
.ptr
, format
, ap
)];
3184 assert(expected
== actual
);
// Route all diagnostics through the checking handler for the rest of this test.
3188 diagnosticHandler
= &expectDiagnosticHandler
;
// Helper for invalid escape sequences.
// Params:
//   sequence             = text following the backslash
//   expectedError        = exact diagnostic text the handler must see
//   expectedReturnValue  = value `Lexer.escapeSequence` must return despite the error
//   expectedScanLength   = number of characters of `sequence` that must be consumed
//   Ccompile             = forwarded to `Lexer.escapeSequence`
3190 void test(string sequence
, string expectedError
, dchar expectedReturnValue
, uint expectedScanLength
, bool Ccompile
= false)
// Remember the global error count so it can be restored at the end.
3192 uint errors
= global
.errors
;
3194 expected
= expectedError
;
3195 auto p
= cast(const(char)*)sequence
.ptr
;
3196 auto actualReturnValue
= Lexer
.escapeSequence(Loc
.initial
, p
, Ccompile
);
3198 assert(expectedReturnValue
== actualReturnValue
);
// `p` was advanced by `escapeSequence`; the difference is how much input it consumed.
3200 auto actualScanLength
= p
- sequence
.ptr
;
3201 assert(expectedScanLength
== actualScanLength
);
// Undo the error count bump caused by the expected diagnostic.
3202 global
.errors
= errors
;
// Unknown single-character escapes are returned verbatim.
test("c", `undefined escape sequence \c`, 'c', 1);
test("!", `undefined escape sequence \!`, '!', 1);
// With Ccompile = true a named entity is expected to be rejected at the `&`.
test("&quot;", `undefined escape sequence \&`, '&', 1, true);

// Too few hex digits for \x, \u and \U.
test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

test("u1" , `escape hex sequence has 1 hex digits instead of 4`, 0x1, 2);
test("u12" , `escape hex sequence has 2 hex digits instead of 4`, 0x12, 3);
test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

test("U0" , `escape hex sequence has 1 hex digits instead of 8`, 0x0, 2);
test("U00" , `escape hex sequence has 2 hex digits instead of 8`, 0x00, 3);
test("U000" , `escape hex sequence has 3 hex digits instead of 8`, 0x000, 4);
test("U0000" , `escape hex sequence has 4 hex digits instead of 8`, 0x0000, 5);
test("U0001f" , `escape hex sequence has 5 hex digits instead of 8`, 0x0001f, 6);
test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`, 0x0001f6, 7);
test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

// Surrogates and values above U+10FFFF are not valid characters.
test("ud800" , `invalid UTF character \U0000d800`, '?', 5);
test("udfff" , `invalid UTF character \U0000dfff`, '?', 5);
test("U00110000", `invalid UTF character \U00110000`, '?', 9);

// A non-hex character directly after the introducer.
test("xg0" , `undefined escape hex sequence \xg`, 'g', 2);
test("ug000" , `undefined escape hex sequence \ug`, 'g', 2);
test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

// Named entities: unknown name, and a known name missing its terminating `;`.
test("&BAD;", `unnamed character entity &BAD;` , '?', 5);
test("&quot", `unterminated named entity &quot;`, '?', 5);
test("&quot", `unterminated named entity &quot;`, '?', 5);

// Octal value that does not fit in one byte.
test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);

// Restore the default diagnostic handling for subsequent tests.
diagnosticHandler = null;
// Smoke test: lexing the text "int" must yield `TOK.int32` once and then
// `TOK.endOfFile` on every further call to `nextToken()`.
3242 //printf("lexer.unittest\n");
3243 /* Not much here, just trying things out.
3245 string text
= "int"; // We rely on the implicit null-terminator
3246 scope Lexer lex1
= new Lexer(null, text
.ptr
, 0, text
.length
, 0, 0);
3248 tok
= lex1
.nextToken();
3249 //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.int32);
3250 assert(tok
== TOK
.int32
);
// Reading past the end must keep returning end-of-file rather than failing.
3251 tok
= lex1
.nextToken();
3252 assert(tok
== TOK
.endOfFile
);
3253 tok
= lex1
.nextToken();
3254 assert(tok
== TOK
.endOfFile
);
3255 tok
= lex1
.nextToken();
3256 assert(tok
== TOK
.endOfFile
);
3261 // We don't want to see Lexer error output during these tests.
3262 uint errors
= global
.startGagging();
3263 scope(exit
) global
.endGagging(errors
);
3265 // Test malformed input: even malformed input should end in a TOK.endOfFile.
3266 static immutable char[][] testcases
=
3267 [ // Testcase must end with 0 or 0x1A.
3268 [0], // not malformed, but pathological
3271 ['{', '{', 'q', '{', 0],
3279 foreach (testcase
; testcases
)
3281 scope Lexer lex2
= new Lexer(null, testcase
.ptr
, 0, testcase
.length
-1, 0, 0);
3282 TOK tok
= lex2
.nextToken();
3283 size_t iterations
= 1;
3284 while ((tok
!= TOK
.endOfFile
) && (iterations
++ < testcase
.length
))
3286 tok
= lex2
.nextToken();
3288 assert(tok
== TOK
.endOfFile
);
3289 tok
= lex2
.nextToken();
3290 assert(tok
== TOK
.endOfFile
);