2 * Implements the lexical analyzer, which converts source code into lexical tokens.
4 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
6 * Copyright: Copyright (C) 1999-2021 by The D Language Foundation, All Rights Reserved
7 * Authors: $(LINK2 http://www.digitalmars.com, Walter Bright)
8 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
10 * Documentation: https://dlang.org/phobos/dmd_lexer.html
11 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
16 import core
.stdc
.ctype
;
17 import core
.stdc
.errno
;
18 import core
.stdc
.stdarg
;
19 import core
.stdc
.stdio
;
20 import core
.stdc
.stdlib
: getenv
;
21 import core
.stdc
.string
;
22 import core
.stdc
.time
;
28 import dmd
.identifier
;
29 import dmd
.root
.array
;
30 import dmd
.root
.ctfloat
;
31 import dmd
.common
.outbuffer
;
34 import dmd
.root
.string
;
41 private enum LS
= 0x2028; // UTF line separator
42 private enum PS
= 0x2029; // UTF paragraph separator
44 /********************************************
45 * Do our own char maps
47 private static immutable cmtable
= () {
49 foreach (const c
; 0 .. table
.length
)
51 if ('0' <= c
&& c
<= '7')
55 if (c_isalnum(c
) || c
== '_')
62 table
[c
] |
= CMzerosecond
;
65 case '0': .. case '9':
74 table
[c
] |
= CMzerosecond | CMdigitsecond
;
92 table
[c
] |
= CMsinglechar
;
104 enum CMzerosecond
= 0x8;
105 enum CMdigitsecond
= 0x10;
106 enum CMsinglechar
= 0x20;
109 private bool isoctal(const char c
) pure @nogc @safe
111 return (cmtable
[c
] & CMoctal
) != 0;
114 private bool ishex(const char c
) pure @nogc @safe
116 return (cmtable
[c
] & CMhex
) != 0;
119 private bool isidchar(const char c
) pure @nogc @safe
121 return (cmtable
[c
] & CMidchar
) != 0;
124 private bool isZeroSecond(const char c
) pure @nogc @safe
126 return (cmtable
[c
] & CMzerosecond
) != 0;
129 private bool isDigitSecond(const char c
) pure @nogc @safe
131 return (cmtable
[c
] & CMdigitsecond
) != 0;
134 private bool issinglechar(const char c
) pure @nogc @safe
136 return (cmtable
[c
] & CMsinglechar
) != 0;
139 private bool c_isxdigit(const int c
) pure @nogc @safe
141 return (( c
>= '0' && c
<= '9') ||
142 ( c
>= 'a' && c
<= 'f') ||
143 ( c
>= 'A' && c
<= 'F'));
146 private bool c_isalnum(const int c
) pure @nogc @safe
148 return (( c
>= '0' && c
<= '9') ||
149 ( c
>= 'a' && c
<= 'z') ||
150 ( c
>= 'A' && c
<= 'Z'));
155 //printf("lexer.unittest\n");
156 /* Not much here, just trying things out.
158 string text
= "int"; // We rely on the implicit null-terminator
159 scope Lexer lex1
= new Lexer(null, text
.ptr
, 0, text
.length
, 0, 0);
161 tok
= lex1
.nextToken();
162 //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.int32);
163 assert(tok
== TOK
.int32
);
164 tok
= lex1
.nextToken();
165 assert(tok
== TOK
.endOfFile
);
166 tok
= lex1
.nextToken();
167 assert(tok
== TOK
.endOfFile
);
168 tok
= lex1
.nextToken();
169 assert(tok
== TOK
.endOfFile
);
174 // We don't want to see Lexer error output during these tests.
175 uint errors
= global
.startGagging();
176 scope(exit
) global
.endGagging(errors
);
178 // Test malformed input: even malformed input should end in a TOK.endOfFile.
179 static immutable char[][] testcases
=
180 [ // Testcase must end with 0 or 0x1A.
181 [0], // not malformed, but pathological
184 ['{', '{', 'q', '{', 0],
192 foreach (testcase
; testcases
)
194 scope Lexer lex2
= new Lexer(null, testcase
.ptr
, 0, testcase
.length
-1, 0, 0);
195 TOK tok
= lex2
.nextToken();
196 size_t iterations
= 1;
197 while ((tok
!= TOK
.endOfFile
) && (iterations
++ < testcase
.length
))
199 tok
= lex2
.nextToken();
201 assert(tok
== TOK
.endOfFile
);
202 tok
= lex2
.nextToken();
203 assert(tok
== TOK
.endOfFile
);
212 /***********************************************************
216 private __gshared OutBuffer stringbuffer
;
218 Loc scanloc
; // for error messages
219 Loc prevloc
; // location of token before current
221 const(char)* p
; // current character
226 bool Ccompile
; /// true if compiling ImportC
228 // The following are valid only if (Ccompile == true)
229 ubyte longsize
; /// size of C long, 4 or 8
230 ubyte long_doublesize
; /// size of C long double, 8 or D real.sizeof
231 ubyte wchar_tsize
; /// size of C wchar_t, 2 or 4
233 structalign_t packalign
; /// current state of #pragma pack alignment (ImportC)
237 const(char)* base
; // pointer to start of buffer
238 const(char)* end
; // pointer to last element of buffer
239 const(char)* line
; // start of current line
241 bool doDocComment
; // collect doc comment information
242 bool anyToken
; // seen at least one token
243 bool commentToken
; // comments are TOK.comment's
244 int inTokenStringConstant
; // can be larger than 1 when in nested q{} strings
245 int lastDocLine
; // last line of previous doc comment
247 Token
* tokenFreelist
;
249 // ImportC #pragma pack stack
250 Array
!Identifier
* records
; // identifiers (or null)
251 Array
!structalign_t
* packs
; // parallel alignment values
256 /*********************
257 * Creates a Lexer for the source code base[begoffset..endoffset+1].
258 * The last character, base[endoffset], must be null (0) or EOF (0x1A).
261 * filename = used for error messages
262 * base = source code, must be terminated by a null (0) or EOF (0x1A) character
263 * begoffset = starting offset into base[]
264 * endoffset = the last offset to read into base[]
265 * doDocComment = handle documentation comments
266 * commentToken = comments become TOK.comment's
268 this(const(char)* filename
, const(char)* base
, size_t begoffset
,
269 size_t endoffset
, bool doDocComment
, bool commentToken
) pure
271 scanloc
= Loc(filename
, 1, 1);
272 //printf("Lexer::Lexer(%p,%d)\n",base,length);
273 //printf("lexer.filename = %s\n", filename);
276 this.end
= base
+ endoffset
;
277 p
= base
+ begoffset
;
279 this.doDocComment
= doDocComment
;
280 this.commentToken
= commentToken
;
281 this.inTokenStringConstant
= 0;
282 this.lastDocLine
= 0;
283 this.packalign
.setDefault();
285 /* If first line starts with '#!', ignore the line
287 if (p
&& p
[0] == '#' && p
[1] == '!')
310 /// Returns: a newly allocated `Token`.
311 Token
* allocateToken() pure nothrow @safe
315 Token
* t
= tokenFreelist
;
316 tokenFreelist
= t
.next
;
323 /// Frees the given token by returning it to the freelist.
324 private void releaseToken(Token
* token
) pure nothrow @nogc @safe
328 token
.next
= tokenFreelist
;
329 tokenFreelist
= token
;
332 final TOK
nextToken()
337 Token
* t
= token
.next
;
338 memcpy(&token
, t
, Token
.sizeof
);
345 //printf(token.toChars());
349 /***********************
350 * Look ahead at next token's value.
354 return peek(&token
).value
;
357 /***********************
358 * Look 2 tokens ahead at value.
360 final TOK
peekNext2()
362 Token
* t
= peek(&token
);
363 return peek(t
).value
;
366 /****************************
367 * Turn next token in buffer into a token.
369 final void scan(Token
* t
)
371 const lastLine
= scanloc
.linnum
;
373 t
.blockComment
= null;
374 t
.lineComment
= null;
379 //printf("p = %p, *p = '%c'\n",p,*p);
385 t
.value
= TOK
.endOfFile
; // end of file
386 // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
393 continue; // skip white space
396 if (*p
!= '\n') // if CR stands by itself
401 continue; // skip white space
406 while (*(cast(uint*)p
) == 0x20202020) //' ' == 0x20
410 continue; // skip white space
412 if (!isZeroSecond(p
[1])) // if numeric literal does not continue
416 t
.value
= TOK
.int32Literal
;
421 case '1': .. case '9':
422 if (!isDigitSecond(p
[1])) // if numeric literal does not continue
424 t
.unsvalue
= *p
- '0';
426 t
.value
= TOK
.int32Literal
;
434 if (issinglechar(p
[1]) && p
[2] == '\'')
436 t
.unsvalue
= p
[1]; // simple one character literal
437 t
.value
= Ccompile ? TOK
.int32Literal
: TOK
.charLiteral
;
442 clexerCharConstant(*t
, 0);
446 t
.value
= charConstant(t
);
455 if (p
[1] == '\'') // C wide character constant
458 if (c
== 'L') // convert L to u or U
459 c
= (wchar_tsize
== 4) ?
'u' : 'U';
461 clexerCharConstant(*t
, c
);
464 else if (p
[1] == '\"') // C wide string literal
468 escapeStringConstant(t
);
469 t
.postfix
= c
== 'L' ?
(wchar_tsize
== 2 ?
'w' : 'd') :
482 wysiwygStringConstant(t
);
490 t
.value
= hexStringConstant(t
);
491 hexString
.write(start
[0 .. p
- start
]);
492 error("Built-in hex string literals are obsolete, use `std.conv.hexString!%s` instead.", hexString
.extractChars());
498 delimitedStringConstant(t
);
501 else if (p
[1] == '{')
504 tokenStringConstant(t
);
510 escapeStringConstant(t
);
528 /*case 'q': case 'r':*/
574 const u
= decodeUTF();
577 error("char 0x%04x not allowed in identifier", u
);
582 Identifier id
= Identifier
.idPool(cast(char*)t
.ptr
, cast(uint)(p
- t
.ptr
));
584 t
.value
= cast(TOK
)id
.getValue();
588 /* Different keywords for C and D
592 if (t
.value
!= TOK
.identifier
)
594 t
.value
= Ckeywords
[t
.value
]; // filter out D keywords
597 else if (t
.value
>= FirstCKeyword
)
598 t
.value
= TOK
.identifier
; // filter out C keywords
600 else if (*t
.ptr
== '_') // if special identifier token
602 // Lazy initialization
603 TimeStampInfo
.initialize(t
.loc
);
607 t
.ustring
= TimeStampInfo
.date
.ptr
;
610 else if (id
== Id
.TIME
)
612 t
.ustring
= TimeStampInfo
.time
.ptr
;
615 else if (id
== Id
.VENDOR
)
617 t
.ustring
= global
.vendor
.xarraydup
.ptr
;
620 else if (id
== Id
.TIMESTAMP
)
622 t
.ustring
= TimeStampInfo
.timestamp
.ptr
;
624 t
.value
= TOK
.string_
;
626 t
.len
= cast(uint)strlen(t
.ustring
);
628 else if (id
== Id
.VERSIONX
)
630 t
.value
= TOK
.int64Literal
;
631 t
.unsvalue
= global
.versionNumber();
633 else if (id
== Id
.EOFX
)
635 t
.value
= TOK
.endOfFile
;
636 // Advance scanner to end of file
637 while (!(*p
== 0 ||
*p
== 0x1A))
641 //printf("t.value = %d\n",t.value);
650 t
.value
= TOK
.divAssign
;
675 error("unterminated /* */ comment");
678 t
.value
= TOK
.endOfFile
;
683 const u
= decodeUTF();
684 if (u
== PS || u
== LS
)
693 if (p
[-2] == '*' && p
- 3 != t
.ptr
)
699 t
.value
= TOK
.comment
;
702 else if (doDocComment
&& t
.ptr
[2] == '*' && p
- 4 != t
.ptr
)
704 // if /** but not /**/
705 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
706 lastDocLine
= scanloc
.linnum
;
709 case '/': // do // style comments
728 t
.value
= TOK
.comment
;
731 if (doDocComment
&& t
.ptr
[2] == '/')
733 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
734 lastDocLine
= scanloc
.linnum
;
738 t
.value
= TOK
.endOfFile
;
743 const u
= decodeUTF();
744 if (u
== PS || u
== LS
)
756 t
.value
= TOK
.comment
;
759 if (doDocComment
&& t
.ptr
[2] == '/')
761 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
762 lastDocLine
= scanloc
.linnum
;
806 error("unterminated /+ +/ comment");
809 t
.value
= TOK
.endOfFile
;
814 uint u
= decodeUTF();
815 if (u
== PS || u
== LS
)
826 t
.value
= TOK
.comment
;
829 if (doDocComment
&& t
.ptr
[2] == '+' && p
- 4 != t
.ptr
)
831 // if /++ but not /++/
832 getDocComment(t
, lastLine
== startLoc
.linnum
, startLoc
.linnum
- lastDocLine
> 1);
833 lastDocLine
= scanloc
.linnum
;
846 /* Note that we don't allow ._1 and ._ as being
847 * valid floating point numbers.
852 else if (p
[0] == '.')
857 t
.value
= TOK
.dotDotDot
;
873 t
.value
= TOK
.andAssign
;
878 t
.value
= TOK
.andAnd
;
888 t
.value
= TOK
.orAssign
;
903 t
.value
= TOK
.minAssign
;
908 t
.value
= TOK
.minusMinus
;
923 t
.value
= TOK
.addAssign
;
928 t
.value
= TOK
.plusPlus
;
938 t
.value
= TOK
.lessOrEqual
; // <=
946 t
.value
= TOK
.leftShiftAssign
; // <<=
949 t
.value
= TOK
.leftShift
; // <<
951 else if (*p
== ':' && Ccompile
)
954 t
.value
= TOK
.leftBracket
; // <:
956 else if (*p
== '%' && Ccompile
)
959 t
.value
= TOK
.leftCurly
; // <%
962 t
.value
= TOK
.lessThan
; // <
969 t
.value
= TOK
.greaterOrEqual
; // >=
977 t
.value
= TOK
.rightShiftAssign
; // >>=
985 t
.value
= TOK
.unsignedRightShiftAssign
; // >>>=
988 t
.value
= TOK
.unsignedRightShift
; // >>>
991 t
.value
= TOK
.rightShift
; // >>
994 t
.value
= TOK
.greaterThan
; // >
1001 t
.value
= TOK
.notEqual
; // !=
1004 t
.value
= TOK
.not; // !
1011 t
.value
= TOK
.equal
; // ==
1016 t
.value
= TOK
.goesTo
; // =>
1019 t
.value
= TOK
.assign
; // =
1026 t
.value
= TOK
.concatenateAssign
; // ~=
1029 t
.value
= TOK
.tilde
; // ~
1039 t
.value
= TOK
.powAssign
; // ^^=
1042 t
.value
= TOK
.pow
; // ^^
1047 t
.value
= TOK
.xorAssign
; // ^=
1050 t
.value
= TOK
.xor; // ^
1054 t
.value
= TOK
.leftParenthesis
;
1058 t
.value
= TOK
.rightParenthesis
;
1062 t
.value
= TOK
.leftBracket
;
1066 t
.value
= TOK
.rightBracket
;
1070 t
.value
= TOK
.leftCurly
;
1074 t
.value
= TOK
.rightCurly
;
1078 t
.value
= TOK
.question
;
1082 t
.value
= TOK
.comma
;
1086 t
.value
= TOK
.semicolon
;
1093 t
.value
= TOK
.colonColon
;
1095 else if (*p
== '>' && Ccompile
)
1098 t
.value
= TOK
.rightBracket
;
1101 t
.value
= TOK
.colon
;
1105 t
.value
= TOK
.dollar
;
1116 t
.value
= TOK
.mulAssign
;
1126 t
.value
= TOK
.modAssign
;
1128 else if (*p
== '>' && Ccompile
)
1131 t
.value
= TOK
.rightCurly
;
1133 else if (*p
== ':' && Ccompile
)
1135 goto case '#'; // %: means #
1145 if (Ccompile
&& n
.value
== TOK
.int32Literal
)
1150 if (n
.value
== TOK
.identifier
)
1152 if (n
.ident
== Id
.line
)
1154 poundLine(n
, false);
1157 else if (n
.ident
== Id
.__pragma
&& Ccompile
)
1159 pragmaDirective(scanloc
);
1165 warning(locx
, "C preprocessor directive `#%s` is not supported", n
.ident
.toChars());
1168 else if (n
.value
== TOK
.if_
)
1170 error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
1172 t
.value
= TOK
.pound
;
1181 // Check for start of unicode identifier
1184 if (c
== PS || c
== LS
)
1191 if (c
< 0x80 && isprint(c
))
1192 error("character '%c' is not a valid token", c
);
1194 error("character 0x%02x is not a valid token", c
);
1202 final Token
* peek(Token
* ct
)
1209 t
= allocateToken();
1216 /*********************************
1217 * tk is on the opening (.
1218 * Look ahead and return token that is past the closing ).
1220 final Token
* peekPastParen(Token
* tk
)
1222 //printf("peekPastParen()\n");
1231 case TOK
.leftParenthesis
:
1234 case TOK
.rightParenthesis
:
1243 case TOK
.rightCurly
:
1244 if (--curlynest
>= 0)
1260 /*******************************************
1261 * Parse escape sequence.
1263 private uint escapeSequence()
1265 return Lexer
.escapeSequence(token
.loc
, p
, Ccompile
);
1269 * Parse the given string literal escape sequence into a single character.
1270 * D https://dlang.org/spec/lex.html#escape_sequences
1273 * loc = location to use for error messages
1274 * sequence = pointer to string with escape sequence to parse. Updated to
1275 * point past the end of the escape sequence
1276 * Ccompile = true to parse C11 escape sequences
1278 * the escape sequence as a single character
1280 private static dchar escapeSequence(const ref Loc loc
, ref const(char)* sequence
, bool Ccompile
)
1282 const(char)* p
= sequence
; // cache sequence reference on stack
1283 scope(exit
) sequence
= p
;
1328 if (ishex(cast(char)c
))
1334 if (isdigit(cast(char)c
))
1336 else if (islower(c
))
1344 if (!ishex(cast(char)c
))
1346 .error(loc
, "escape hex sequence has %d hex digits instead of %d", n
, ndigits
);
1350 if (ndigits
!= 2 && !utf_isValidDchar(v
))
1352 .error(loc
, "invalid UTF character \\U%08x", v
);
1353 v
= '?'; // recover with valid UTF character
1359 .error(loc
, "undefined escape hex sequence \\%c%c", sequence
[0], c
);
1367 // named character entity
1368 for (const idstart
= ++p
; 1; p
++)
1373 c
= HtmlNamedEntity(idstart
, p
- idstart
);
1376 .error(loc
, "unnamed character entity &%.*s;", cast(int)(p
- idstart
), idstart
);
1382 if (isalpha(*p
) ||
(p
!= idstart
&& isdigit(*p
)))
1384 .error(loc
, "unterminated named entity &%.*s;", cast(int)(p
- idstart
+ 1), idstart
);
1397 if (isoctal(cast(char)c
))
1403 v
= v
* 8 + (c
- '0');
1406 while (++n
< 3 && isoctal(cast(char)c
));
1409 .error(loc
, "escape octal sequence \\%03o is larger than \\377", c
);
1413 .error(loc
, "undefined escape sequence \\%c", c
);
1422 Lex a wysiwyg string. `p` must be pointing to the first character before the
1423 contents of the string literal. The character pointed to by `p` will be used as
1424 the terminating character (i.e. backtick or double-quote).
1426 result = pointer to the token that accepts the result
1428 private void wysiwygStringConstant(Token
* result
)
1430 result
.value
= TOK
.string_
;
1432 auto terminator
= p
[0];
1434 stringbuffer
.setsize(0);
1447 c
= '\n'; // treat EndOfLine as \n character
1452 error("unterminated string constant starting at %s", start
.toChars());
1454 // rewind `p` so it points to the EOF character
1458 if (c
== terminator
)
1460 result
.setString(stringbuffer
);
1461 stringPostfix(result
);
1467 const u
= decodeUTF();
1469 if (u
== PS || u
== LS
)
1471 stringbuffer
.writeUTF8(u
);
1476 stringbuffer
.writeByte(c
);
1480 /**************************************
1484 private TOK
hexStringConstant(Token
* t
)
1488 uint v
= ~0; // dead assignment, needed to suppress warning
1490 stringbuffer
.setsize(0);
1500 continue; // skip white space
1503 continue; // ignore '\r' if followed by '\n'
1504 // Treat isolated '\r' as if it were a '\n'
1511 error("unterminated string constant starting at %s", start
.toChars());
1513 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1515 return TOK
.hexadecimalString
;
1519 error("odd number (%d) of hex characters in hex string", n
);
1520 stringbuffer
.writeByte(v
);
1522 t
.setString(stringbuffer
);
1524 return TOK
.hexadecimalString
;
1526 if (c
>= '0' && c
<= '9')
1528 else if (c
>= 'a' && c
<= 'f')
1530 else if (c
>= 'A' && c
<= 'F')
1535 const u
= decodeUTF();
1537 if (u
== PS || u
== LS
)
1540 error("non-hex character \\u%04x in hex string", u
);
1543 error("non-hex character '%c' in hex string", c
);
1547 stringbuffer
.writeByte(v
);
1555 assert(0); // see bug 15731
1559 Lex a delimited string. Some examples of delimited strings are:
1561 q"(foo(xxx))" // "foo(xxx)"
1562 q"[foo$(LPAREN)]" // "foo$(LPAREN)"
1568 It is assumed that `p` points to the opening double-quote '"'.
1570 result = pointer to the token that accepts the result
1572 private void delimitedStringConstant(Token
* result
)
1574 result
.value
= TOK
.string_
;
1576 dchar delimleft
= 0;
1577 dchar delimright
= 0;
1579 uint nestcount
= ~0; // dead assignment, needed to suppress warning
1580 Identifier hereid
= null;
1584 stringbuffer
.setsize(0);
1588 //printf("c = '%c'\n", c);
1602 stringbuffer
.writeUTF8(c
);
1609 c
= '\n'; // treat EndOfLine as \n character
1613 error("unterminated delimited string constant starting at %s", start
.toChars());
1615 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1624 if (c
== PS || c
== LS
)
1642 else if (isalpha(c
) || c
== '_' ||
(c
>= 0x80 && isUniAlpha(c
)))
1644 // Start of identifier; must be a heredoc
1647 scan(&tok
); // read in heredoc identifier
1648 if (tok
.value
!= TOK
.identifier
)
1650 error("identifier expected for heredoc, not %s", tok
.toChars());
1656 //printf("hereid = '%s'\n", hereid.toChars());
1666 error("delimiter cannot be whitespace");
1673 error("heredoc rest of line should be blank");
1681 else if (c
== delimright
)
1688 else if (c
== delimright
)
1690 if (startline
&& (isalpha(c
) || c
== '_' ||
(c
>= 0x80 && isUniAlpha(c
))) && hereid
)
1695 scan(&tok
); // read in possible heredoc identifier
1696 //printf("endid = '%s'\n", tok.ident.toChars());
1697 if (tok
.value
== TOK
.identifier
&& tok
.ident
is hereid
)
1699 /* should check that rest of line is blank
1705 stringbuffer
.writeUTF8(c
);
1713 error("delimited string must end in %s\"", hereid
.toChars());
1715 error("delimited string must end in %c\"", delimright
);
1716 result
.setString(stringbuffer
);
1717 stringPostfix(result
);
1721 Lex a token string. Some examples of token strings are:
1723 q{ foo(xxx) } // " foo(xxx) "
1724 q{foo$(LPAREN)} // "foo$(LPAREN)"
1725 q{{foo}"}"} // "{foo}"}""
1727 It is assumed that `p` points to the opening curly-brace.
1729 result = pointer to the token that accepts the result
1731 private void tokenStringConstant(Token
* result
)
1733 result
.value
= TOK
.string_
;
1736 const start
= loc();
1738 inTokenStringConstant
++;
1739 scope(exit
) inTokenStringConstant
--;
1749 case TOK
.rightCurly
:
1752 result
.setString(pstart
, p
- 1 - pstart
);
1753 stringPostfix(result
);
1758 error("unterminated token string constant starting at %s", start
.toChars());
1768 Scan a quoted string while building the processed string value by
1769 handling escape sequences. The result is returned in the given `t` token.
1770 This function assumes that `p` currently points to the opening quote
1773 t = the token to set the resulting string to
1775 * D https://dlang.org/spec/lex.html#double_quoted_strings
1778 private void escapeStringConstant(Token
* t
)
1780 t
.value
= TOK
.string_
;
1782 const start
= loc();
1783 const tc
= *p
++; // opening quote
1784 stringbuffer
.setsize(0);
1800 c
= escapeSequence();
1801 stringbuffer
.writeUTF8(c
);
1804 c
= escapeSequence();
1816 c
= '\n'; // treat EndOfLine as \n character
1825 t
.setString(stringbuffer
);
1831 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1834 error("unterminated string constant starting at %s", start
.toChars());
1842 if (c
== LS || c
== PS
)
1850 stringbuffer
.writeUTF8(c
);
1855 stringbuffer
.writeByte(c
);
1859 /**************************************
1861 * https://dlang.org/spec/lex.html#characterliteral
1863 private TOK
charConstant(Token
* t
)
1865 TOK tk
= TOK
.charLiteral
;
1866 //printf("Lexer::charConstant\n");
1875 t
.unsvalue
= escapeSequence();
1876 tk
= TOK
.wcharLiteral
;
1880 t
.unsvalue
= escapeSequence();
1881 tk
= TOK
.dcharLiteral
;
1884 t
.unsvalue
= escapeSequence();
1896 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1900 error("unterminated character constant");
1909 if (c
== LS || c
== PS
)
1911 if (c
< 0xD800 ||
(c
>= 0xE000 && c
< 0xFFFE))
1912 tk
= TOK
.wcharLiteral
;
1914 tk
= TOK
.dcharLiteral
;
1921 while (*p
!= '\'' && *p
!= 0x1A && *p
!= 0 && *p
!= '\n' &&
1922 *p
!= '\r' && *p
!= ';' && *p
!= ')' && *p
!= ']' && *p
!= '}')
1928 if (c
== LS || c
== PS
)
1939 error("character constant has multiple characters");
1943 error("unterminated character constant");
1951 /***************************************
1952 * Lex C character constant.
1953 * Parser is on the opening quote.
1955 * t = token to fill in
1956 * prefix = one of `u`, `U` or 0.
1960 private void clexerCharConstant(ref Token t
, char prefix
)
1962 escapeStringConstant(&t
);
1963 const(char)[] str = t
.ustring
[0 .. t
.len
];
1964 const n
= str.length
;
1968 error(loc
, "empty character constant");
1969 t
.value
= TOK
.semicolon
;
1977 if (n
== 1) // fast case
1982 error(loc
, "max number of chars in character literal is 4, had %d",
1987 (cast(char*)&u
)[n
- 1 - i
] = c
;
1994 auto msg
= utf_decodeChar(str, idx
, d1
);
1996 if (idx
< n
&& !msg
)
1997 msg
= utf_decodeChar(str, idx
, d2
);
1999 error(loc
, "%s", msg
);
2001 error(loc
, "max number of chars in 16 bit character literal is 2, had %d",
2003 else if (d1
> 0x1_0000)
2004 error(loc
, "%d does not fit in 16 bits", d1
);
2005 else if (d2
> 0x1_0000)
2006 error(loc
, "%d does not fit in 16 bits", d2
);
2009 u
= (d1
<< 16) | d2
;
2015 auto msg
= utf_decodeChar(str, idx
, d
);
2017 error(loc
, "%s", msg
);
2019 error(loc
, "max number of chars in 32 bit character literal is 1, had %d",
2027 t
.value
= TOK
.int32Literal
;
2031 /***************************************
2032 * Get postfix of string literal.
2034 private void stringPostfix(Token
* t
) pure @nogc
2050 /**************************************
2052 * If it's an integer, store it in tok.TKutok.Vlong.
2053 * integers can be decimal, octal or hex
2054 * Handle the suffixes U, UL, LU, L, etc.
2055 * If it's double, store it in tok.TKutok.Vdouble.
2060 private TOK
number(Token
* t
)
2064 uinteger_t n
= 0; // unsigned >=64 bit integer type
2067 bool overflow
= false;
2068 bool anyBinaryDigitsNoSingleUS
= false;
2069 bool anyHexDigitsNoSingleUS
= false;
2091 error("octal digit expected, not `%c`", c
);
2102 error("binary constants not allowed");
2108 goto Ldone
; // if ".."
2109 if (isalpha(p
[1]) || p
[1] == '_' || p
[1] & 0x80)
2110 goto Ldone
; // if ".identifier" or ".unicode"
2111 goto Lreal
; // '.' is part of current token
2118 error("embedded `_` not allowed");
2163 if (c
== 'e' || c
== 'E' || c
== 'f' || c
== 'F')
2177 goto Ldone
; // if ".."
2178 if (base
<= 10 && n
> 0 && (isalpha(p
[1]) || p
[1] == '_' || p
[1] & 0x80))
2179 goto Ldone
; // if ".identifier" or ".unicode"
2180 if (base
== 16 && (!ishex(p
[1]) || p
[1] == '_' || p
[1] & 0x80))
2181 goto Ldone
; // if ".identifier" or ".unicode"
2183 goto Ldone
; // if ".identifier" or ".unicode"
2184 goto Lreal
; // otherwise as part of a floating point literal
2199 // got a digit here, set any necessary flags, check for errors
2200 anyHexDigitsNoSingleUS
= true;
2201 anyBinaryDigitsNoSingleUS
= true;
2202 if (!err
&& d
>= base
)
2204 error("%s digit expected, not `%c`", base
== 2 ?
"binary".ptr
:
2205 base
== 8 ?
"octal".ptr
:
2209 // Avoid expensive overflow check if we aren't at risk of overflow
2210 if (n
<= 0x0FFF_FFFF_FFFF_FFFFUL
)
2214 import core
.checkedint
: mulu
, addu
;
2216 n
= mulu(n
, base
, overflow
);
2217 n
= addu(n
, d
, overflow
);
2221 if (overflow
&& !err
)
2223 error("integer overflow");
2226 if ((base
== 2 && !anyBinaryDigitsNoSingleUS
) ||
2227 (base
== 16 && !anyHexDigitsNoSingleUS
))
2228 error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p
- start
), start
, 2, start
);
2233 return cnumber(base
, n
);
2238 decimal
= 1, // decimal
2239 unsigned
= 2, // u or U suffix
2240 long_
= 4, // L suffix
2243 FLAGS flags
= (base
== 10) ? FLAGS
.decimal
: FLAGS
.none
;
2244 // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2257 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
2263 if ((flags
& f
) && !err
)
2265 error("unrecognized token");
2268 flags
= cast(FLAGS
)(flags | f
);
2275 if (base
== 8 && n
>= 8)
2278 // can't translate invalid octal value, just show a generic message
2279 error("octal literals larger than 7 are no longer supported");
2281 error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!%llo%.*s` instead",
2282 n
, cast(int)(p
- psuffix
), psuffix
, n
, cast(int)(p
- psuffix
), psuffix
);
2288 /* Octal or Hexadecimal constant.
2289 * First that fits: int, uint, long, ulong
2291 if (n
& 0x8000000000000000L
)
2292 result
= TOK
.uns64Literal
;
2293 else if (n
& 0xFFFFFFFF00000000L
)
2294 result
= TOK
.int64Literal
;
2295 else if (n
& 0x80000000)
2296 result
= TOK
.uns32Literal
;
2298 result
= TOK
.int32Literal
;
2301 /* First that fits: int, long, long long
2303 if (n
& 0x8000000000000000L
)
2305 result
= TOK
.uns64Literal
;
2307 else if (n
& 0xFFFFFFFF80000000L
)
2308 result
= TOK
.int64Literal
;
2310 result
= TOK
.int32Literal
;
2312 case FLAGS
.unsigned
:
2313 case FLAGS
.decimal | FLAGS
.unsigned
:
2314 /* First that fits: uint, ulong
2316 if (n
& 0xFFFFFFFF00000000L
)
2317 result
= TOK
.uns64Literal
;
2319 result
= TOK
.uns32Literal
;
2321 case FLAGS
.decimal | FLAGS
.long_
:
2322 if (n
& 0x8000000000000000L
)
2326 error("signed integer overflow");
2329 result
= TOK
.uns64Literal
;
2332 result
= TOK
.int64Literal
;
2335 if (n
& 0x8000000000000000L
)
2336 result
= TOK
.uns64Literal
;
2338 result
= TOK
.int64Literal
;
2340 case FLAGS
.unsigned | FLAGS
.long_
:
2341 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.long_
:
2342 result
= TOK
.uns64Literal
;
2347 printf("%x\n", flags
);
2354 /**************************************
2355 * Lex C integer-suffix
2357 * base = number base
2358 * n = raw integer value
2362 private TOK
cnumber(int base
, uinteger_t n
)
2365 * Parse trailing suffixes:
2372 octalhex
= 1, // octal or hexadecimal
2373 decimal
= 2, // decimal
2374 unsigned
= 4, // u or U suffix
2375 long_
= 8, // l or L suffix
2376 llong
= 0x10 // ll or LL
2378 FLAGS flags
= (base
== 10) ? FLAGS
.decimal
: FLAGS
.octalhex
;
2397 f
= FLAGS
.long_ | FLAGS
.llong
;
2406 if ((flags
& f
) && !err
)
2408 error("duplicate integer suffixes");
2411 flags
= cast(FLAGS
)(flags | f
);
2416 error("integer overflow");
2419 TOK result
= TOK
.int32Literal
; // default
2422 /* Since D doesn't have a variable sized `long` or `unsigned long` type,
2423 * this code deviates from C by picking D int, uint, long, or ulong instead
2426 case FLAGS
.octalhex
:
2427 /* Octal or Hexadecimal constant.
2428 * First that fits: int, unsigned, long, unsigned long,
2429 * long long, unsigned long long
2433 if (n
& 0x8000000000000000L
)
2434 result
= TOK
.uns64Literal
;
2435 else if (n
& 0xFFFFFFFF00000000L
)
2436 result
= TOK
.int64Literal
;
2437 else if (n
& 0x80000000)
2438 result
= TOK
.uns32Literal
;
2440 result
= TOK
.int32Literal
;
2444 if (n
& 0x8000000000000000L
)
2445 result
= TOK
.uns64Literal
; // unsigned long
2446 else if (n
& 0xFFFFFFFF00000000L
)
2447 result
= TOK
.int64Literal
; // long
2448 else if (n
& 0x80000000)
2449 result
= TOK
.uns32Literal
;
2451 result
= TOK
.int32Literal
;
2456 /* First that fits: int, long, long long
2460 if (n
& 0x8000000000000000L
)
2461 result
= TOK
.uns64Literal
;
2462 else if (n
& 0xFFFFFFFF80000000L
)
2463 result
= TOK
.int64Literal
;
2465 result
= TOK
.int32Literal
;
2469 if (n
& 0x8000000000000000L
)
2470 result
= TOK
.uns64Literal
; // unsigned long
2471 else if (n
& 0xFFFFFFFF80000000L
)
2472 result
= TOK
.int64Literal
; // long
2474 result
= TOK
.int32Literal
;
2478 case FLAGS
.octalhex | FLAGS
.unsigned
:
2479 case FLAGS
.decimal | FLAGS
.unsigned
:
2480 /* First that fits: unsigned, unsigned long, unsigned long long
2484 if (n
& 0xFFFFFFFF00000000L
)
2485 result
= TOK
.uns64Literal
;
2487 result
= TOK
.uns32Literal
;
2491 if (n
& 0xFFFFFFFF00000000L
)
2492 result
= TOK
.uns64Literal
; // unsigned long
2494 result
= TOK
.uns32Literal
;
2498 case FLAGS
.decimal | FLAGS
.long_
:
2499 /* First that fits: long, long long
2503 if (n
& 0x8000000000000000L
)
2505 else if (n
& 0xFFFFFFFF_80000000L)
2506 result
= TOK
.int64Literal
;
2508 result
= TOK
.int32Literal
; // long
2512 if (n
& 0x8000000000000000L
)
2515 result
= TOK
.int64Literal
; // long
2519 case FLAGS
.octalhex | FLAGS
.long_
:
2520 /* First that fits: long, unsigned long, long long,
2521 * unsigned long long
2525 if (n
& 0x8000000000000000L
)
2526 result
= TOK
.uns64Literal
;
2527 else if (n
& 0xFFFFFFFF00000000L
)
2528 result
= TOK
.int64Literal
;
2529 else if (n
& 0x80000000)
2530 result
= TOK
.uns32Literal
; // unsigned long
2532 result
= TOK
.int32Literal
; // long
2536 if (n
& 0x80000000_00000000L)
2537 result
= TOK
.uns64Literal
; // unsigned long
2539 result
= TOK
.int64Literal
; // long
2543 case FLAGS
.octalhex | FLAGS
.unsigned | FLAGS
.long_
:
2544 case FLAGS
.decimal | FLAGS
.unsigned | FLAGS
.long_
:
2545 /* First that fits: unsigned long, unsigned long long
2549 if (n
& 0xFFFFFFFF00000000L
)
2550 result
= TOK
.uns64Literal
;
2552 result
= TOK
.uns32Literal
; // unsigned long
2556 result
= TOK
.uns64Literal
; // unsigned long
2560 case FLAGS
.octalhex | FLAGS
.long_ | FLAGS
.llong
:
2561 /* First that fits: long long, unsigned long long
2563 if (n
& 0x8000000000000000L
)
2564 result
= TOK
.uns64Literal
;
2566 result
= TOK
.int64Literal
;
2569 case FLAGS
.decimal | FLAGS
.long_ | FLAGS
.llong
:
2572 result
= TOK
.int64Literal
;
2575 case FLAGS
.octalhex | FLAGS
.long_ | FLAGS
.unsigned | FLAGS
.llong
:
2576 case FLAGS
.decimal | FLAGS
.long_ | FLAGS
.unsigned | FLAGS
.llong
:
2577 result
= TOK
.uns64Literal
;
2581 debug printf("%x\n",flags
);
2587 /**************************************
2588 * Read in characters, converting them to real.
2590 * Exponent overflow not detected.
2591 * Too much requested precision is not detected.
// Scans a floating-point literal beginning at p and records its value in
// t.floatvalue; result ends up as one of the float32/float64/float80 literal
// kinds or the matching imaginary kind.
// What the code below shows:
//  - p must point at a digit or a dot on entry (asserted).
//  - An x or X marker selects hexadecimal digits, and a hex literal must
//    carry a p/P exponent; decimal literals use e/E.
//  - The characters of the literal are copied into stringbuffer and parsed
//    with CTFloat.parse. A literal flagged as malformed is not parsed and
//    yields CTFloat.zero.
//  - 32- and 64-bit results are additionally range-checked through Port;
//    out-of-range values are reported as not representable, while the 80-bit
//    kinds are exempt from that report.
2593 private TOK
inreal(Token
* t
)
2595 //printf("Lexer::inreal()\n");
2598 assert(*p
== '.' ||
isdigit(*p
));
2600 bool isWellformedString
= true;
2601 stringbuffer
.setsize(0);
2609 if (c
== 'x' || c
== 'X')
2615 // Digits to left of '.'
2623 if (isdigit(c
) ||
(hex
&& isxdigit(c
)) || c
== '_')
2630 // Digits to right of '.'
2633 if (isdigit(c
) ||
(hex
&& isxdigit(c
)) || c
== '_')
2640 if (c
== 'e' || c
== 'E' ||
(hex
&& (c
== 'p' || c
== 'P')))
2643 if (c
== '-' || c
== '+')
2647 bool anyexp
= false;
2659 error("embedded `_` in numeric literals not allowed");
2665 error("missing exponent");
2666 isWellformedString
= false;
2673 error("exponent required for hex float");
2674 isWellformedString
= false;
2680 stringbuffer
.writeByte(*pstart
);
// Terminate the collected text with a NUL so it can be used as a C string.
2683 stringbuffer
.writeByte(0);
2684 auto sbufptr
= cast(const(char)*)stringbuffer
[].ptr
;
2686 bool isOutOfRange
= false;
// Only a well-formed literal is parsed; otherwise the value is zero.
2687 t
.floatvalue
= (isWellformedString ? CTFloat
.parse(sbufptr
, &isOutOfRange
) : CTFloat
.zero
);
2692 if (isWellformedString
&& !isOutOfRange
)
2693 isOutOfRange
= Port
.isFloat32LiteralOutOfRange(sbufptr
);
2694 result
= TOK
.float32Literal
;
2698 if (isWellformedString
&& !isOutOfRange
)
2699 isOutOfRange
= Port
.isFloat64LiteralOutOfRange(sbufptr
);
2700 result
= TOK
.float64Literal
;
2704 error("use 'L' suffix instead of 'l'");
2708 if (Ccompile
&& long_doublesize
== 8)
2710 result
= TOK
.float80Literal
;
2713 if ((*p
== 'i' ||
*p
== 'I') && !Ccompile
)
2716 error("use 'i' suffix instead of 'I'");
2720 case TOK
.float32Literal
:
2721 result
= TOK
.imaginary32Literal
;
2723 case TOK
.float64Literal
:
2724 result
= TOK
.imaginary64Literal
;
2726 case TOK
.float80Literal
:
2727 result
= TOK
.imaginary80Literal
;
// The not-representable error below is skipped for the 80-bit kinds.
2733 const isLong
= (result
== TOK
.float80Literal || result
== TOK
.imaginary80Literal
);
2734 if (isOutOfRange
&& !isLong
)
2736 const char* suffix
= (result
== TOK
.float32Literal || result
== TOK
.imaginary32Literal
) ?
"f" : "";
2737 error(scanloc
, "number `%s%s` is not representable", sbufptr
, suffix
);
2743 case TOK
.float32Literal
:
2744 case TOK
.float64Literal
:
2745 case TOK
.float80Literal
:
2746 case TOK
.imaginary32Literal
:
2747 case TOK
.imaginary64Literal
:
2748 case TOK
.imaginary80Literal
:
// Refreshes the cached scan location from the current read pointer p:
// charnum becomes 1 + (p - line), i.e. a 1-based offset from line, and
// fileOffset becomes the distance of p from base.
// NOTE(review): presumably the updated scanloc is what gets returned - confirm.
2757 final Loc
loc() pure @nogc
2759 scanloc
.charnum
= cast(uint)(1 + p
- line
);
2761 scanloc
.fileOffset
= cast(uint)(p
- base
);
// Reports an error located at token.loc.
// format is a printf-style format string; it and the variadic arguments
// (gathered into args with va_start) are forwarded to the module-level verror.
2765 final void error(const(char)* format
, ...)
2768 va_start(args
, format
);
2769 .verror(token
.loc
, format
, args
);
// Reports an error at the caller-supplied location loc instead of token.loc.
// format is a printf-style format string; it and the variadic arguments
// (gathered into args with va_start) are forwarded to the module-level verror.
2773 final void error(const ref Loc loc
, const(char)* format
, ...)
2776 va_start(args
, format
);
2777 .verror(loc
, format
, args
);
// Reports a deprecation located at token.loc.
// format is a printf-style format string; it and the variadic arguments
// (gathered into args with va_start) are forwarded to the module-level
// vdeprecation.
2781 final void deprecation(const(char)* format
, ...)
2784 va_start(args
, format
);
2785 .vdeprecation(token
.loc
, format
, args
);
2789 /*********************************************
2790 * Parse line/file preprocessor directive:
2791 * #line linnum [filespec]
2792 * Allow __LINE__ for linnum, and __FILE__ for filespec.
2793 * Accept linemarker format:
2794 * # linnum [filespec] {flags}
2795 * There can be zero or more flags, which are one of the digits 1..4, and
2796 * must be in ascending order. The flags are ignored.
2798 * tok = token we're on, which is linnum of linemarker
2799 * linemarker = true if line marker format and lexer is on linnum
2801 * linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
// Processes the remainder of a #line directive or a linemarker, with the lexer
// positioned on the line number token tok.
//  - linnum starts as the current scanloc.linnum and filespec as null; loc is
//    captured up front and is the location used by the two syntax errors at
//    the end of the function.
//  - An int32/int64 literal supplies the line number; a TOK.line token
//    (#line __LINE__) is accepted as well.
//  - The file specification is either the bare word __FILE__, which copies
//    scanloc.filename, or text collected in stringbuffer; both are duplicated
//    with mem.xstrdup.
//  - Digits following the filespec are linemarker flags; two digits in a row
//    jump to the error path.
2803 private void poundLine(ref Token tok
, bool linemarker
)
2805 auto linnum
= this.scanloc
.linnum
;
2806 const(char)* filespec
= null;
2807 const loc
= this.loc();
2812 if (tok
.value
== TOK
.int32Literal || tok
.value
== TOK
.int64Literal
)
// lin is the literal minus one narrowed to int; the comparison below detects
// values that were changed by that narrowing.
2814 const lin
= cast(int)(tok
.unsvalue
- 1);
2815 if (lin
!= tok
.unsvalue
- 1)
2816 error("line number `%lld` out of range", cast(ulong)tok
.unsvalue
);
2820 else if (tok
.value
== TOK
.line
) // #line __LINE__
// scanloc.linnum is only rewritten when inTokenStringConstant is false.
2833 if (!inTokenStringConstant
)
2835 this.scanloc
.linnum
= linnum
;
2837 this.scanloc
.filename
= filespec
;
2853 continue; // skip white space
2855 if (filespec || flags
)
// The bare word __FILE__ (8 bytes) stands for the current file name.
2857 if (memcmp(p
, "__FILE__".ptr
, 8) == 0)
2860 filespec
= mem
.xstrdup(scanloc
.filename
);
2865 if (filespec || flags
)
2867 stringbuffer
.setsize(0);
// NUL-terminate the collected text before duplicating it as the filespec.
2881 stringbuffer
.writeByte(0);
2882 filespec
= mem
.xstrdup(cast(const(char)*)stringbuffer
[].ptr
);
2888 uint u
= decodeUTF();
2889 if (u
== PS || u
== LS
)
2892 stringbuffer
.writeByte(c
);
2904 flags
= true; // linemarker flags seen
2906 if ('0' <= *p
&& *p
<= '9')
2907 goto Lerr
; // only one digit allowed
2913 uint u
= decodeUTF();
2914 if (u
== PS || u
== LS
)
// Two wordings of the syntax error: one for the linemarker form, one for #line.
2922 error(loc
, "# integer [\"filespec\"] { 1 | 2 | 3 | 4 }\\n expected");
2924 error(loc
, "#line integer [\"filespec\"]\\n expected");
2927 /*********************************************
2928 * C11 6.10.6 Pragma directive
2929 * # pragma pp-tokens(opt) new-line
2930 * The C preprocessor sometimes leaves pragma directives in
2931 * the preprocessed output. Ignore them.
2932 * Upon return, p is at start of next line.
// Entry point for a pragma directive found in the input.
// When token n is the identifier pack, the directive is handed to pragmaPack
// together with loc, and that call is the return statement.
2934 private void pragmaDirective(const ref Loc loc
)
2938 if (n
.value
== TOK
.identifier
&& n
.ident
== Id
.pack
)
2939 return pragmaPack(loc
);
2946 * https://gcc.gnu.org/onlinedocs/gcc-4.4.4/gcc/Structure_002dPacking-Pragmas.html
2947 * https://docs.microsoft.com/en-us/cpp/preprocessor/pack
2948 * Scanner is on the `pack`
2950 * startloc = location to use for error messages
// Parses the parenthesised argument list of a pack pragma and updates the
// current packalign accordingly. startloc is copied into loc and is used for
// every diagnostic issued here.
// Forms recognised by the visible checks on token n:
//  - show: warns with the current pack value, or that it is the default.
//  - push, optionally followed by an identifier and/or an integer: saves an
//    entry on the records and packs arrays.
//  - pop, followed by any number of comma separated identifiers or integers:
//    an identifier is searched from the top of records downward, and on a
//    match packalign is restored from packs and both arrays are cut to len - 1.
//  - a plain integer, or an empty list, which resets packalign to its default.
// Anything else is reported as an unrecognized pack pragma.
2952 private void pragmaPack(const ref Loc startloc
)
2954 const loc
= startloc
;
2957 if (n
.value
!= TOK
.leftParenthesis
)
2959 error(loc
, "left parenthesis expected to follow `#pragma pack`");
2966 if (n
.value
!= TOK
.rightParenthesis
)
2968 error(loc
, "right parenthesis expected to close `#pragma pack(`");
// Validates the alignment held in t (a power of two from 1 up to ushort.max)
// and installs it in packalign. Note that the value is still stored after a
// failed check has been reported.
2973 void setPackAlign(ref const Token t
)
2975 const n
= t
.unsvalue
;
2976 if (n
< 1 || n
& (n
- 1) ||
ushort.max
< n
)
2977 error(loc
, "pack must be an integer positive power of 2, not 0x%llx", cast(ulong)n
);
2978 packalign
.set(cast(uint)n
);
2979 packalign
.setPack(true);
// Storage for the push/pop stack: identifiers in records, alignments in packs.
2986 records
= new Array
!Identifier
;
2987 packs
= new Array
!structalign_t
;
2990 /* # pragma pack ( show )
2992 if (n
.value
== TOK
.identifier
&& n
.ident
== Id
.show
)
2994 if (packalign
.isDefault())
2995 warning(startloc
, "current pack attribute is default");
2997 warning(startloc
, "current pack attribute is %d", packalign
.get());
2999 return closingParen();
3001 /* # pragma pack ( push )
3002 * # pragma pack ( push , identifier )
3003 * # pragma pack ( push , integer )
3004 * # pragma pack ( push , identifier , integer )
3006 if (n
.value
== TOK
.identifier
&& n
.ident
== Id
.push)
3009 Identifier record
= null;
3010 if (n
.value
== TOK
.comma
)
3013 if (n
.value
== TOK
.identifier
)
3017 if (n
.value
== TOK
.comma
)
3020 if (n
.value
== TOK
.int32Literal
)
3026 error(loc
, "alignment value expected, not `%s`", n
.toChars());
3029 else if (n
.value
== TOK
.int32Literal
)
3035 error(loc
, "alignment value expected, not `%s`", n
.toChars());
3037 this.records
.push(record
);
3038 this.packs
.push(packalign
);
3039 return closingParen();
3041 /* # pragma pack ( pop )
3042 * # pragma pack ( pop PopList )
3044 * , IdentifierOrInteger
3045 * , IdentifierOrInteger PopList
3046 * IdentifierOrInteger:
3050 if (n
.value
== TOK
.identifier
&& n
.ident
== Id
.pop)
3053 while (n
.value
== TOK
.comma
)
3056 if (n
.value
== TOK
.identifier
)
3058 for (size_t len
= this.records
.length
; len
; --len
)
3060 if ((*this.records
)[len
- 1] == n
.ident
)
3062 packalign
= (*this.packs
)[len
- 1];
3063 this.records
.setDim(len
- 1);
3064 this.packs
.setDim(len
- 1);
3070 else if (n
.value
== TOK
.int32Literal
)
3073 this.records
.push(null);
3074 this.packs
.push(packalign
);
3078 return closingParen();
3080 /* # pragma pack ( integer )
3082 if (n
.value
== TOK
.int32Literal
)
3086 return closingParen();
3088 /* # pragma pack ( )
3090 if (n
.value
== TOK
.rightParenthesis
)
3092 packalign
.setDefault();
3093 return closingParen();
3096 error(loc
, "unrecognized `#pragma pack(%s)`", n
.toChars());
3100 /***************************************
3101 * Scan forward to start of next line.
// Advances the scan position to the beginning of the following line.
// One path returns without moving p at all. Multi-byte input is decoded with
// decodeUTF and compared against PS and LS (the paragraph and line separator
// code points), presumably so that they also count as line ends - confirm.
3103 private void skipToNextLine()
3111 return; // do not advance p
3126 const u
= decodeUTF();
3127 if (u
== PS || u
== LS
)
3141 /********************************************
3142 * Decode UTF character.
3143 * Issue error messages for invalid sequences.
3144 * Return decoded character, advance p to last character in UTF sequence.
// Decodes one UTF-8 encoded character and returns it as a uint.
// The loop measures how many bytes are available at s, at most 4 and stopping
// at a zero byte; that slice is passed to utf_decodeChar together with idx
// and u. A message obtained from utf_decodeChar is reported through error
// using a length-limited %.*s format.
3146 private uint decodeUTF()
3150 // Check length of remaining string up to 4 UTF-8 characters
3152 for (len
= 1; len
< 4 && s
[len
]; len
++)
3157 const msg
= utf_decodeChar(s
[0 .. len
], idx
, u
);
3161 error("%.*s", cast(int)msg
.length
, msg
.ptr
);
3166 /***************************************************
3167 * Parse doc comment embedded between t.ptr and p.
3168 * Remove trailing blanks and tabs from lines.
3169 * Replace all newlines with \n.
3170 * Remove leading comment character from each line.
3171 * Decide if it's a lineComment or a blockComment.
3172 * Append to previous one for this token.
3174 * If newParagraph is true, an extra newline will be
3175 * added between adjoining doc comments.
// Builds the documentation text for the comment spanning t.ptr .. p and
// attaches it to token t.
//  - ct, the third character of the comment, identifies the comment flavour;
//    the text proper starts three characters in.
//  - The text is canonicalised into buf: trailing blanks and tabs are trimmed,
//    newlines are replaced by a single line-feed character, and the result
//    always ends with a newline.
//  - The result goes to t.lineComment when lineComment and anyToken are both
//    set, otherwise to t.blockComment. It is either merged with the comment
//    already stored there through combineComments (honouring newParagraph)
//    or taken directly from buf with extractSlice.
3177 private void getDocComment(Token
* t
, uint lineComment
, bool newParagraph
) pure
3179 /* ct tells us which kind of comment it is: '/', '*', or '+'
3181 const ct
= t
.ptr
[2];
3182 /* Start of comment text skips over / * *, / + +, or / / /
3184 const(char)* q
= t
.ptr
+ 3; // start of comment text
3185 const(char)* qend
= p
;
3186 if (ct
== '*' || ct
== '+')
3188 /* Scan over initial row of ****'s or ++++'s or ////'s
3190 for (; q
< qend
; q
++)
3195 /* Remove leading spaces until start of the comment
3200 while (q
< qend
&& (*q
== ' ' ||
*q
== '\t'))
3208 if (q
< qend
&& *q
== '\n')
3212 else if (*q
== '\n')
3218 /* Remove trailing row of ****'s or ++++'s
3222 for (; q
< qend
; qend
--)
3228 /* Comment is now [q .. qend].
3229 * Canonicalize it into buf[].
3233 void trimTrailingWhitespace()
3236 auto len
= s
.length
;
3237 while (len
&& (s
[len
- 1] == ' ' || s
[len
- 1] == '\t'))
3242 for (; q
< qend
; q
++)
3249 if (linestart
&& c
== ct
)
3252 /* Trim preceding whitespace up to preceding \n
3254 trimTrailingWhitespace();
3263 continue; // skip the \r
// 128 followed by 168 or 169 are the trailing UTF-8 bytes of U+2028 (LS) and
// U+2029 (PS); presumably this follows a check for the lead byte - confirm.
3269 if (q
[1] == 128 && (q
[2] == 168 || q
[2] == 169))
3278 c
= '\n'; // replace all newlines with \n
3282 /* Trim trailing whitespace
3284 trimTrailingWhitespace();
3289 /* Trim trailing whitespace (if the last line does not have newline)
3291 trimTrailingWhitespace();
3293 // Always end with a newline
3295 if (s
.length
== 0 || s
[$ - 1] != '\n')
3296 buf
.writeByte('\n');
3298 // It's a line comment if the start of the doc comment comes
3299 // after other non-whitespace on the same line.
3300 auto dc
= (lineComment
&& anyToken
) ?
&t
.lineComment
: &t
.blockComment
;
3301 // Combine with previous doc comment, if any
3303 *dc
= combineComments(*dc
, buf
[], newParagraph
).toDString();
3305 *dc
= buf
.extractSlice(true);
3308 /********************************************
3309 * Combine two document comments into one,
3310 * separated by an extra newline if newParagraph is true.
// Joins the two comment texts c1 and c2 into one character buffer.
// The buffer comes from mem.xmalloc_noscan and is retSize + 1 bytes long,
// where retSize = c1.length + insertNewLine + newParagraphSize + c2.length.
// c1 is copied to the front and c2 occupies the last c2.length bytes of the
// first retSize bytes. The bytes in between hold line-feed separators:
// insertNewLine relates to c1 not already ending in a newline, and
// newParagraphSize is 1 exactly when newParagraph is true.
// NOTE(review): the extra final byte is presumably a NUL terminator - confirm.
3312 static const(char)* combineComments(const(char)[] c1
, const(char)[] c2
, bool newParagraph
) pure
3314 //printf("Lexer::combineComments('%s', '%s', '%i')\n", c1, c2, newParagraph);
3315 const(int) newParagraphSize
= newParagraph ?
1 : 0; // Size of the combining '\n'
3321 int insertNewLine
= 0;
3322 if (c1
.length
&& c1
[$ - 1] != '\n')
3324 const retSize
= c1
.length
+ insertNewLine
+ newParagraphSize
+ c2
.length
;
3325 auto p
= cast(char*)mem
.xmalloc_noscan(retSize
+ 1);
3326 p
[0 .. c1
.length
] = c1
[];
3328 p
[c1
.length
] = '\n';
3330 p
[c1
.length
+ insertNewLine
] = '\n';
3331 p
[retSize
- c2
.length
.. retSize
] = c2
[];
3336 /**************************
3337 * `p` should be at start of next line
3339 private void endOfLine() pure @nogc @safe
3346 /// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
// The three buffers below are filled by `initialize` from the text returned
// by C `ctime`; each is sized for its formatted text plus a terminating NUL.
3347 private struct TimeStampInfo
3349 private __gshared
bool initdone
= false;
3351 // Note: Those properties need to be guarded by a call to `initialize`
3352 // The API isn't safe, and quite brittle, but it was left this way
3353 // over performance concerns.
3354 // This is currently only called once, from the lexer.
// 11 characters: 6 from offset 4 of the ctime text, a space, 4 from offset 20.
3355 __gshared
char[11 + 1] date
;
// 8 characters taken from offset 11 of the ctime text.
3356 __gshared
char[8 + 1] time
;
// The first 24 characters of the ctime text.
3357 __gshared
char[24 + 1] timestamp
;
// Fills date, time and timestamp.
// Params:
//   loc = location used for the error about a malformed SOURCE_DATE_EPOCH
// When the SOURCE_DATE_EPOCH environment variable is set, its value is parsed
// as digits into ct; a value that does not parse is reported as an error.
3359 public static void initialize(const ref Loc loc
) nothrow
3366 // https://issues.dlang.org/show_bug.cgi?id=20444
3367 if (auto p
= getenv("SOURCE_DATE_EPOCH"))
3369 if (!ct
.parseDigits(p
.toDString()))
3370 error(loc
, "Value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p
);
3374 const p
= ctime(&ct
);
3376 sprintf(&date
[0], "%.6s %.4s", p
+ 4, p
+ 20);
3377 sprintf(&time
[0], "%.8s", p
+ 11);
// Destination is the timestamp buffer, matching the two calls above.
3378 sprintf(&timestamp
[0], "%.24s", p
);
// Unit test for Lexer.escapeSequence on well-formed escape sequences.
// assertDiagnosticHandler is installed as diagnosticHandler while the checks
// run and the handler is reset to null at the end.
// The test helper passes the text of sequence (without a leading backslash)
// to Lexer.escapeSequence at Loc.initial, asserts that the decoded value
// equals expected, and asserts that p was advanced to the end of sequence,
// i.e. that the whole input was consumed.
3385 nothrow bool assertDiagnosticHandler(const ref Loc loc
, Color headerColor
, const(char)* header
,
3386 const(char)* format
, va_list ap
, const(char)* p1
, const(char)* p2
)
3390 diagnosticHandler
= &assertDiagnosticHandler
;
3392 static void test(T
)(string sequence
, T expected
, bool Ccompile
= false)
3394 auto p
= cast(const(char)*)sequence
.ptr
;
3395 assert(expected
== Lexer
.escapeSequence(Loc
.initial
, p
, Ccompile
));
3396 assert(p
== sequence
.ptr
+ sequence
.length
);
3421 test(`357`, '\357');
3423 test(`u1234`, '\u1234');
3424 test(`uf0e4`, '\uf0e4');
3426 test(`U0001f603`, '\U0001f603');
3428 test(`"`, '"');
3432 diagnosticHandler
= null;
// Unit test for the error paths of Lexer.escapeSequence.
// expectDiagnosticHandler is installed as diagnosticHandler while the checks
// run: it asserts that the diagnostic is classified as an error, formats the
// message into a 100-byte stack buffer and compares it with expected.
3440 nothrow bool expectDiagnosticHandler(const ref Loc loc
, Color headerColor
, const(char)* header
,
3441 const(char)* format
, va_list ap
, const(char)* p1
, const(char)* p2
)
3443 assert(cast(Classification
)headerColor
== Classification
.error
);
3446 char[100] buffer
= void;
3447 auto actual
= buffer
[0 .. vsprintf(buffer
.ptr
, format
, ap
)];
3448 assert(expected
== actual
);
3452 diagnosticHandler
= &expectDiagnosticHandler
;
// Runs one failing escape sequence.
//   sequence            = text following the backslash
//   expectedError       = exact diagnostic text the handler must receive
//   expectedReturnValue = character escapeSequence must still return
//   expectedScanLength  = number of bytes p must have advanced
//   Ccompile            = forwarded to escapeSequence
// global.errors is saved on entry and restored on exit so the expected
// diagnostics do not leak into the global error count.
3454 void test(string sequence
, string expectedError
, dchar expectedReturnValue
, uint expectedScanLength
, bool Ccompile
= false)
3456 uint errors
= global
.errors
;
3458 expected
= expectedError
;
3459 auto p
= cast(const(char)*)sequence
.ptr
;
3460 auto actualReturnValue
= Lexer
.escapeSequence(Loc
.initial
, p
, Ccompile
);
3462 assert(expectedReturnValue
== actualReturnValue
);
3464 auto actualScanLength
= p
- sequence
.ptr
;
3465 assert(expectedScanLength
== actualScanLength
);
3466 global
.errors
= errors
;
3469 test("c", `undefined escape sequence \c`, 'c', 1);
3470 test("!", `undefined escape sequence \!`, '!', 1);
// With Ccompile set the ampersand is an undefined escape: only 1 byte is scanned.
3471 test("&quot;", `undefined escape sequence \&`, '&', 1, true);
3473 test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);
3475 test("u1" , `escape hex sequence has 1 hex digits instead of 4`, 0x1, 2);
3476 test("u12" , `escape hex sequence has 2 hex digits instead of 4`, 0x12, 3);
3477 test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);
3479 test("U0" , `escape hex sequence has 1 hex digits instead of 8`, 0x0, 2);
3480 test("U00" , `escape hex sequence has 2 hex digits instead of 8`, 0x00, 3);
3481 test("U000" , `escape hex sequence has 3 hex digits instead of 8`, 0x000, 4);
3482 test("U0000" , `escape hex sequence has 4 hex digits instead of 8`, 0x0000, 5);
3483 test("U0001f" , `escape hex sequence has 5 hex digits instead of 8`, 0x0001f, 6);
3484 test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`, 0x0001f6, 7);
3485 test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);
3487 test("ud800" , `invalid UTF character \U0000d800`, '?', 5);
3488 test("udfff" , `invalid UTF character \U0000dfff`, '?', 5);
3489 test("U00110000", `invalid UTF character \U00110000`, '?', 9);
3491 test("xg0" , `undefined escape hex sequence \xg`, 'g', 2);
3492 test("ug000" , `undefined escape hex sequence \ug`, 'g', 2);
3493 test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);
3495 test("&BAD;", `unnamed character entity &BAD;` , '?', 5);
// A named entity without its closing semicolon: all 5 bytes are scanned.
// NOTE(review): the expected message text was restored from an entity-decoded
// listing; confirm it against the message emitted by escapeSequence.
3496 test("&quot", `unterminated named entity &quot;`, '?', 5);
3497 test("&quot", `unterminated named entity &quot;`, '?', 5);
3499 test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);
3501 diagnosticHandler
= null;