Newer
Older
dub_jkp / source / dub / internal / dyaml / scanner.d
@WebFreak001 WebFreak001 on 4 Feb 2023 67 KB fix typo(s)
  1.  
  2. // Copyright Ferdinand Majerech 2011-2014.
  3. // Distributed under the Boost Software License, Version 1.0.
  4. // (See accompanying file LICENSE_1_0.txt or copy at
  5. // http://www.boost.org/LICENSE_1_0.txt)
  6.  
  7. /// YAML scanner.
  8. /// Code based on PyYAML: http://www.pyyaml.org
  9. module dub.internal.dyaml.scanner;
  10.  
  11.  
  12. import core.stdc.string;
  13.  
  14. import std.algorithm;
  15. import std.array;
  16. import std.conv;
  17. import std.ascii : isAlphaNum, isDigit, isHexDigit;
  18. import std.exception;
  19. import std.string;
  20. import std.typecons;
  21. import std.traits : Unqual;
  22. import std.utf;
  23.  
  24. import dub.internal.dyaml.escapes;
  25. import dub.internal.dyaml.exception;
  26. import dub.internal.dyaml.queue;
  27. import dub.internal.dyaml.reader;
  28. import dub.internal.dyaml.style;
  29. import dub.internal.dyaml.token;
  30.  
  31. package:
  32. /// Scanner produces tokens of the following types:
  33. /// STREAM-START
  34. /// STREAM-END
  35. /// DIRECTIVE(name, value)
  36. /// DOCUMENT-START
  37. /// DOCUMENT-END
  38. /// BLOCK-SEQUENCE-START
  39. /// BLOCK-MAPPING-START
  40. /// BLOCK-END
  41. /// FLOW-SEQUENCE-START
  42. /// FLOW-MAPPING-START
  43. /// FLOW-SEQUENCE-END
  44. /// FLOW-MAPPING-END
  45. /// BLOCK-ENTRY
  46. /// FLOW-ENTRY
  47. /// KEY
  48. /// VALUE
  49. /// ALIAS(value)
  50. /// ANCHOR(value)
  51. /// TAG(value)
  52. /// SCALAR(value, plain, style)
  53.  
// Character-class predicates used throughout the scanner. Each is an
// instantiation of std.algorithm's `among`, which returns a non-zero
// 1-based index when the character is in the set; callers use the result
// in boolean context. '\0' appears in several sets because the Reader
// uses it as its end-of-stream sentinel.

/// Line break (or end of stream).
alias isBreak = among!('\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Space, line break, or end of stream.
alias isBreakOrSpace = among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Space, tab, line break, or end of stream.
alias isWhiteSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Whitespace that is not a line break (space or tab).
alias isNonLinebreakWhitespace = among!(' ', '\t');

/// Characters that cannot start a plain scalar (indicators, whitespace,
/// breaks, end of stream). See checkPlain() for the exceptions.
alias isNonScalarStartCharacter = among!('-', '?', ':', ',', '[', ']', '{', '}',
    '#', '&', '*', '!', '|', '>', '\'', '"', '%', '@', '`', ' ', '\t', '\0', '\n',
    '\r', '\u0085', '\u2028', '\u2029');

/// Non-alphanumeric characters permitted in a URI (tag handles/suffixes).
alias isURIChar = among!('-', ';', '/', '?', ':', '@', '&', '=', '+', '$', ',',
    '_', '.', '!', '~', '*', '\'', '(', ')', '[', ']', '%');

/// Space or line break.
alias isNSChar = among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029');

/// Line break characters only (no end-of-stream sentinel).
alias isBChar = among!('\n', '\r', '\u0085', '\u2028', '\u2029');

/// Characters that end or need special handling inside a flow scalar.
alias isFlowScalarBreakSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029', '\'', '"', '\\');

/// Valid character of an anchor/alias name: any non-whitespace character
/// except flow indicators and the BOM.
alias isNSAnchorName = c => !c.isWhiteSpace && !c.among!('[', ']', '{', '}', ',', '\uFEFF');
  76.  
/// Marked exception thrown at scanner errors.
///
/// Carries the position(s) in the input where scanning failed.
///
/// See_Also: MarkedYAMLException
class ScannerException : MarkedYAMLException
{
    // Constructors (message/context + mark variants) come from the mixin.
    mixin MarkedExceptionCtors;
}
  84.  
  85. /// Generates tokens from data provided by a Reader.
  86. struct Scanner
  87. {
  88. private:
    /// A simple key is a key that is not denoted by the '?' indicator.
    /// For example:
    ///   ---
    ///   block simple key: value
    ///   ? not a simple key:
    ///   : { flow simple key: value }
    /// We emit the KEY token before all keys, so when we find a potential simple
    /// key, we try to locate the corresponding ':' indicator. Simple keys should be
    /// limited to a single line and 1024 characters.
    ///
    /// 16 bytes on 64-bit.
    static struct SimpleKey
    {
        /// Character index in reader where the key starts.
        uint charIndex = uint.max;
        /// Index of the key token from start (first token scanned being 0).
        uint tokenIndex;
        /// Line the key starts at.
        uint line;
        /// Column the key starts at.
        ushort column;
        /// Is this required to be a simple key?
        bool required;
        /// Is this struct "null" (invalid)?.
        bool isNull;
    }

    /// Block chomping types (how trailing line breaks of a block scalar are treated).
    enum Chomping
    {
        /// Strip all trailing line breaks. '-' indicator.
        strip,
        /// Line break of the last line is preserved, others discarded. Default.
        clip,
        /// All trailing line breaks are preserved. '+' indicator.
        keep
    }

    /// Reader used to read from a file/stream.
    Reader reader_;
    /// Are we done scanning?
    bool done_;

    /// Level of nesting in flow context. If 0, we're in block context.
    uint flowLevel_;
    /// Current indentation level.
    int indent_ = -1;
    /// Past indentation levels. Used as a stack.
    Appender!(int[]) indents_;

    /// Processed tokens not yet emitted. Used as a queue.
    Queue!Token tokens_;

    /// Number of tokens emitted through the getToken method.
    uint tokensTaken_;

    /// Can a simple key start at the current position? A simple key may start:
    /// - at the beginning of the line, not counting indentation spaces
    ///   (in block context),
    /// - after '{', '[', ',' (in the flow context),
    /// - after '?', ':', '-' (in the block context).
    /// In the block context, this flag also signifies if a block collection
    /// may start at the current position.
    bool allowSimpleKey_ = true;

    /// Possible simple keys indexed by flow levels.
    SimpleKey[] possibleSimpleKeys_;
  156.  
public:
    /// Construct a Scanner using specified Reader.
    ///
    /// Immediately queues the initial STREAM-START token.
    this(Reader reader) @safe nothrow
    {
        reader_ = reader;
        fetchStreamStart();
    }

    /// Advance to the next token, removing the current one from the queue.
    void popFront() @safe
    {
        ++tokensTaken_;
        tokens_.pop();
    }

    /// Return the current token, but do not remove it from the queue.
    const(Token) front() @safe
    {
        // `empty` also drives scanning of further tokens as needed.
        enforce(!empty, "No token left to peek");
        return tokens_.peek();
    }

    /// Return whether there are any more tokens left.
    ///
    /// Scans ahead (queueing tokens) until the question can be answered
    /// definitively.
    bool empty() @safe
    {
        while (needMoreTokens())
        {
            fetchToken();
        }
        return tokens_.empty;
    }

    /// Set file name (used in marks attached to tokens and error messages).
    void name(string name) @safe pure nothrow @nogc
    {
        reader_.name = name;
    }
  195.  
  196. private:
  197. /// Most scanning error messages have the same format; so build them with this
  198. /// function.
  199. string expected(T)(string expected, T found)
  200. {
  201. return text("expected ", expected, ", but found ", found);
  202. }
  203.  
    /// Determine whether or not we need to fetch more tokens before peeking/getting a token.
    bool needMoreTokens() @safe pure
    {
        // Nothing more will ever be fetched once the stream end was reached.
        if(done_) { return false; }
        if(tokens_.empty) { return true; }

        // The current token may be a potential simple key, so we need to look further.
        stalePossibleSimpleKeys();
        return nextPossibleSimpleKey() == tokensTaken_;
    }
  214.  
    /// Fetch a token, adding it to tokens_.
    ///
    /// Throws: ScannerException if the next character cannot start any token.
    void fetchToken() @safe
    {
        // Eat whitespaces and comments until we reach the next token.
        scanToNextToken();

        // Remove obsolete possible simple keys.
        stalePossibleSimpleKeys();

        // Compare current indentation and column. It may add some tokens
        // and decrease the current indentation level.
        unwindIndent(reader_.column);

        // Get the next character.
        const dchar c = reader_.peekByte();

        // Fetch the token.
        if(c == '\0') { return fetchStreamEnd(); }
        if(checkDirective()) { return fetchDirective(); }
        if(checkDocumentStart()) { return fetchDocumentStart(); }
        if(checkDocumentEnd()) { return fetchDocumentEnd(); }
        // Order of the following checks is NOT significant.
        switch(c)
        {
            case '[': return fetchFlowSequenceStart();
            case '{': return fetchFlowMappingStart();
            case ']': return fetchFlowSequenceEnd();
            case '}': return fetchFlowMappingEnd();
            case ',': return fetchFlowEntry();
            case '!': return fetchTag();
            case '\'': return fetchSingle();
            case '\"': return fetchDouble();
            case '*': return fetchAlias();
            case '&': return fetchAnchor();
            // '?', ':' and '-' only start a token in flow context or when
            // followed by whitespace; otherwise they may begin a plain scalar.
            case '?': if(checkKey()) { return fetchKey(); } goto default;
            case ':': if(checkValue()) { return fetchValue(); } goto default;
            case '-': if(checkBlockEntry()) { return fetchBlockEntry(); } goto default;
            // Block scalars are only allowed in block context.
            case '|': if(flowLevel_ == 0) { return fetchLiteral(); } break;
            case '>': if(flowLevel_ == 0) { return fetchFolded(); } break;
            default: if(checkPlain()) { return fetchPlain(); }
        }

        throw new ScannerException("While scanning for the next token, found character " ~
                                   "\'%s\', index %s that cannot start any token"
                                   .format(c, to!int(c)), reader_.mark);
    }
  261.  
  262.  
  263. /// Return the token number of the nearest possible simple key.
  264. uint nextPossibleSimpleKey() @safe pure nothrow @nogc
  265. {
  266. uint minTokenNumber = uint.max;
  267. foreach(k, ref simpleKey; possibleSimpleKeys_)
  268. {
  269. if(simpleKey.isNull) { continue; }
  270. minTokenNumber = min(minTokenNumber, simpleKey.tokenIndex);
  271. }
  272. return minTokenNumber;
  273. }
  274.  
    /// Remove entries that are no longer possible simple keys.
    ///
    /// According to the YAML specification, simple keys
    /// - should be limited to a single line,
    /// - should be no longer than 1024 characters.
    /// Disabling this will allow simple keys of any length and
    /// height (may cause problems if indentation is broken though).
    ///
    /// Throws: ScannerException if a REQUIRED key went stale, i.e. its ':'
    ///         was never found within the allowed span.
    void stalePossibleSimpleKeys() @safe pure
    {
        foreach(level, ref key; possibleSimpleKeys_)
        {
            if(key.isNull) { continue; }
            // Stale once we've left the key's line or moved more than 1024
            // characters past its start.
            if(key.line != reader_.line || reader_.charIndex - key.charIndex > 1024)
            {
                enforce(!key.required,
                        new ScannerException("While scanning a simple key",
                            Mark(reader_.name, key.line, key.column),
                            "could not find expected ':'", reader_.mark));
                key.isNull = true;
            }
        }
    }
  297.  
    /// Check if the next token starts a possible simple key and if so, save its position.
    ///
    /// This function is called for ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
    void savePossibleSimpleKey() @safe pure
    {
        // Check if a simple key is required at the current position.
        const required = (flowLevel_ == 0 && indent_ == reader_.column);
        assert(allowSimpleKey_ || !required, "A simple key is required only if it is " ~
               "the first token in the current line. Therefore it is always allowed.");

        if(!allowSimpleKey_) { return; }

        // The next token might be a simple key, so save its number and position.
        removePossibleSimpleKey();
        // Token number = tokens already emitted + tokens still queued.
        const tokenCount = tokensTaken_ + cast(uint)tokens_.length;

        const line = reader_.line;
        const column = reader_.column;
        const key = SimpleKey(cast(uint)reader_.charIndex, tokenCount, line,
                              cast(ushort)min(column, ushort.max), required);

        // Grow the per-flow-level array on demand.
        if(possibleSimpleKeys_.length <= flowLevel_)
        {
            const oldLength = possibleSimpleKeys_.length;
            possibleSimpleKeys_.length = flowLevel_ + 1;
            // No need to initialize the last element; it is assigned below.
            possibleSimpleKeys_[oldLength .. flowLevel_] = SimpleKey.init;
        }
        possibleSimpleKeys_[flowLevel_] = key;
    }
  328.  
    /// Remove the saved possible key position at the current flow level.
    ///
    /// Throws: ScannerException if the removed key was required (a required
    ///         simple key must be followed by ':').
    void removePossibleSimpleKey() @safe pure
    {
        if(possibleSimpleKeys_.length <= flowLevel_) { return; }

        if(!possibleSimpleKeys_[flowLevel_].isNull)
        {
            const key = possibleSimpleKeys_[flowLevel_];
            enforce(!key.required,
                    new ScannerException("While scanning a simple key",
                        Mark(reader_.name, key.line, key.column),
                        "could not find expected ':'", reader_.mark));
            possibleSimpleKeys_[flowLevel_].isNull = true;
        }
    }
  344.  
  345. /// Decrease indentation, removing entries in indents_.
  346. ///
  347. /// Params: column = Current column in the file/stream.
  348. void unwindIndent(const int column) @safe
  349. {
  350. if(flowLevel_ > 0)
  351. {
  352. // In flow context, tokens should respect indentation.
  353. // The condition should be `indent >= column` according to the spec.
  354. // But this condition will prohibit intuitively correct
  355. // constructions such as
  356. // key : {
  357. // }
  358.  
  359. // In the flow context, indentation is ignored. We make the scanner less
  360. // restrictive than what the specification requires.
  361. // if(pedantic_ && flowLevel_ > 0 && indent_ > column)
  362. // {
  363. // throw new ScannerException("Invalid indentation or unclosed '[' or '{'",
  364. // reader_.mark)
  365. // }
  366. return;
  367. }
  368.  
  369. // In block context, we may need to issue the BLOCK-END tokens.
  370. while(indent_ > column)
  371. {
  372. indent_ = indents_.data.back;
  373. assert(indents_.data.length);
  374. indents_.shrinkTo(indents_.data.length - 1);
  375. tokens_.push(blockEndToken(reader_.mark, reader_.mark));
  376. }
  377. }
  378.  
  379. /// Increase indentation if needed.
  380. ///
  381. /// Params: column = Current column in the file/stream.
  382. ///
  383. /// Returns: true if the indentation was increased, false otherwise.
  384. bool addIndent(int column) @safe
  385. {
  386. if(indent_ >= column){return false;}
  387. indents_ ~= indent_;
  388. indent_ = column;
  389. return true;
  390. }
  391.  
  392.  
    /// Add STREAM-START token (always the first token produced).
    void fetchStreamStart() @safe nothrow
    {
        tokens_.push(streamStartToken(reader_.mark, reader_.mark, reader_.encoding));
    }

    /// Add STREAM-END token and mark scanning as done.
    void fetchStreamEnd() @safe
    {
        // Set indentation to -1 so all open blocks emit BLOCK-END.
        unwindIndent(-1);
        removePossibleSimpleKey();
        allowSimpleKey_ = false;
        possibleSimpleKeys_.destroy;

        tokens_.push(streamEndToken(reader_.mark, reader_.mark));
        done_ = true;
    }

    /// Add DIRECTIVE token.
    void fetchDirective() @safe
    {
        // Set indentation to -1 (directives appear at column 0).
        unwindIndent(-1);
        // Reset simple keys.
        removePossibleSimpleKey();
        allowSimpleKey_ = false;

        auto directive = scanDirective();
        tokens_.push(directive);
    }

    /// Add DOCUMENT-START or DOCUMENT-END token.
    void fetchDocumentIndicator(TokenID id)()
        if(id == TokenID.documentStart || id == TokenID.documentEnd)
    {
        // Set indentation to -1.
        unwindIndent(-1);
        // Reset simple keys. Note that there can't be a block collection after '---'.
        removePossibleSimpleKey();
        allowSimpleKey_ = false;

        Mark startMark = reader_.mark;
        // Skip the 3-character indicator ('---' or '...').
        reader_.forward(3);
        tokens_.push(simpleToken!id(startMark, reader_.mark));
    }

    /// Aliases to add DOCUMENT-START or DOCUMENT-END token.
    alias fetchDocumentStart = fetchDocumentIndicator!(TokenID.documentStart);
    alias fetchDocumentEnd = fetchDocumentIndicator!(TokenID.documentEnd);
  443.  
    /// Add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
    void fetchFlowCollectionStart(TokenID id)() @safe
    {
        // '[' and '{' may start a simple key.
        savePossibleSimpleKey();
        // Simple keys are allowed after '[' and '{'.
        allowSimpleKey_ = true;
        ++flowLevel_;

        Mark startMark = reader_.mark;
        reader_.forward();
        tokens_.push(simpleToken!id(startMark, reader_.mark));
    }

    /// Aliases to add FLOW-SEQUENCE-START or FLOW-MAPPING-START token.
    alias fetchFlowSequenceStart = fetchFlowCollectionStart!(TokenID.flowSequenceStart);
    alias fetchFlowMappingStart = fetchFlowCollectionStart!(TokenID.flowMappingStart);

    /// Add FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
    void fetchFlowCollectionEnd(TokenID id)()
    {
        // Reset possible simple key on the current level.
        removePossibleSimpleKey();
        // No simple keys after ']' and '}'.
        allowSimpleKey_ = false;
        --flowLevel_;

        Mark startMark = reader_.mark;
        reader_.forward();
        tokens_.push(simpleToken!id(startMark, reader_.mark));
    }

    /// Aliases to add FLOW-SEQUENCE-END or FLOW-MAPPING-END token.
    alias fetchFlowSequenceEnd = fetchFlowCollectionEnd!(TokenID.flowSequenceEnd);
    alias fetchFlowMappingEnd = fetchFlowCollectionEnd!(TokenID.flowMappingEnd);

    /// Add FLOW-ENTRY token.
    void fetchFlowEntry() @safe
    {
        // Reset possible simple key on the current level.
        removePossibleSimpleKey();
        // Simple keys are allowed after ','.
        allowSimpleKey_ = true;

        Mark startMark = reader_.mark;
        reader_.forward();
        tokens_.push(flowEntryToken(startMark, reader_.mark));
    }
  492.  
    /// Additional checks used in block context in fetchBlockEntry and fetchKey.
    ///
    /// Params: type = String representing the token type we might need to add.
    ///         id   = Token type we might need to add.
    ///
    /// Throws: ScannerException if a key cannot start at the current position.
    void blockChecks(string type, TokenID id)()
    {
        enum context = type ~ " keys are not allowed here";
        // Are we allowed to start a key (not necessarily a simple one)?
        enforce(allowSimpleKey_, new ScannerException(context, reader_.mark));

        // If this deepens the indentation, the collection start token is due.
        if(addIndent(reader_.column))
        {
            tokens_.push(simpleToken!id(reader_.mark, reader_.mark));
        }
    }

    /// Add BLOCK-ENTRY token. Might add BLOCK-SEQUENCE-START in the process.
    void fetchBlockEntry() @safe
    {
        if(flowLevel_ == 0) { blockChecks!("Sequence", TokenID.blockSequenceStart)(); }

        // It's an error for the block entry to occur in the flow context,
        // but we let the parser detect this.

        // Reset possible simple key on the current level.
        removePossibleSimpleKey();
        // Simple keys are allowed after '-'.
        allowSimpleKey_ = true;

        Mark startMark = reader_.mark;
        reader_.forward();
        tokens_.push(blockEntryToken(startMark, reader_.mark));
    }

    /// Add KEY token. Might add BLOCK-MAPPING-START in the process.
    void fetchKey() @safe
    {
        if(flowLevel_ == 0) { blockChecks!("Mapping", TokenID.blockMappingStart)(); }

        // Reset possible simple key on the current level.
        removePossibleSimpleKey();
        // Simple keys are allowed after '?' in the block context.
        allowSimpleKey_ = (flowLevel_ == 0);

        Mark startMark = reader_.mark;
        reader_.forward();
        tokens_.push(keyToken(startMark, reader_.mark));
    }
  541.  
  542. /// Add VALUE token. Might add KEY and/or BLOCK-MAPPING-START in the process.
  543. void fetchValue() @safe
  544. {
  545. //Do we determine a simple key?
  546. if(possibleSimpleKeys_.length > flowLevel_ &&
  547. !possibleSimpleKeys_[flowLevel_].isNull)
  548. {
  549. const key = possibleSimpleKeys_[flowLevel_];
  550. possibleSimpleKeys_[flowLevel_].isNull = true;
  551. Mark keyMark = Mark(reader_.name, key.line, key.column);
  552. const idx = key.tokenIndex - tokensTaken_;
  553.  
  554. assert(idx >= 0);
  555.  
  556. // Add KEY.
  557. // Manually inserting since tokens are immutable (need linked list).
  558. tokens_.insert(keyToken(keyMark, keyMark), idx);
  559.  
  560. // If this key starts a new block mapping, we need to add BLOCK-MAPPING-START.
  561. if(flowLevel_ == 0 && addIndent(key.column))
  562. {
  563. tokens_.insert(blockMappingStartToken(keyMark, keyMark), idx);
  564. }
  565.  
  566. // There cannot be two simple keys in a row.
  567. allowSimpleKey_ = false;
  568. }
  569. // Part of a complex key
  570. else
  571. {
  572. // We can start a complex value if and only if we can start a simple key.
  573. enforce(flowLevel_ > 0 || allowSimpleKey_,
  574. new ScannerException("Mapping values are not allowed here", reader_.mark));
  575.  
  576. // If this value starts a new block mapping, we need to add
  577. // BLOCK-MAPPING-START. It'll be detected as an error later by the parser.
  578. if(flowLevel_ == 0 && addIndent(reader_.column))
  579. {
  580. tokens_.push(blockMappingStartToken(reader_.mark, reader_.mark));
  581. }
  582.  
  583. // Reset possible simple key on the current level.
  584. removePossibleSimpleKey();
  585. // Simple keys are allowed after ':' in the block context.
  586. allowSimpleKey_ = (flowLevel_ == 0);
  587. }
  588.  
  589. // Add VALUE.
  590. Mark startMark = reader_.mark;
  591. reader_.forward();
  592. tokens_.push(valueToken(startMark, reader_.mark));
  593. }
  594.  
    /// Add ALIAS or ANCHOR token.
    void fetchAnchor_(TokenID id)() @safe
        if(id == TokenID.alias_ || id == TokenID.anchor)
    {
        // ALIAS/ANCHOR could be a simple key.
        savePossibleSimpleKey();
        // No simple keys after ALIAS/ANCHOR.
        allowSimpleKey_ = false;

        auto anchor = scanAnchor(id);
        tokens_.push(anchor);
    }

    /// Aliases to add ALIAS or ANCHOR token.
    alias fetchAlias = fetchAnchor_!(TokenID.alias_);
    alias fetchAnchor = fetchAnchor_!(TokenID.anchor);

    /// Add TAG token.
    void fetchTag() @safe
    {
        // TAG could start a simple key.
        savePossibleSimpleKey();
        // No simple keys after TAG.
        allowSimpleKey_ = false;

        tokens_.push(scanTag());
    }

    /// Add block SCALAR token.
    void fetchBlockScalar(ScalarStyle style)() @safe
        if(style == ScalarStyle.literal || style == ScalarStyle.folded)
    {
        // Reset possible simple key on the current level.
        removePossibleSimpleKey();
        // A simple key may follow a block scalar.
        allowSimpleKey_ = true;

        auto blockScalar = scanBlockScalar(style);
        tokens_.push(blockScalar);
    }

    /// Aliases to add literal or folded block scalar.
    alias fetchLiteral = fetchBlockScalar!(ScalarStyle.literal);
    alias fetchFolded = fetchBlockScalar!(ScalarStyle.folded);

    /// Add quoted flow SCALAR token.
    void fetchFlowScalar(ScalarStyle quotes)()
    {
        // A flow scalar could be a simple key.
        savePossibleSimpleKey();
        // No simple keys after flow scalars.
        allowSimpleKey_ = false;

        // Scan and add SCALAR.
        auto scalar = scanFlowScalar(quotes);
        tokens_.push(scalar);
    }

    /// Aliases to add single or double quoted flow scalar.
    alias fetchSingle = fetchFlowScalar!(ScalarStyle.singleQuoted);
    alias fetchDouble = fetchFlowScalar!(ScalarStyle.doubleQuoted);

    /// Add plain SCALAR token.
    void fetchPlain() @safe
    {
        // A plain scalar could be a simple key.
        savePossibleSimpleKey();
        // No simple keys after plain scalars. But note that scanPlain() will
        // change this flag if the scan is finished at the beginning of the line.
        allowSimpleKey_ = false;
        // Scan and add SCALAR. May change allowSimpleKey_.
        auto plain = scanPlain();

        tokens_.push(plain);
    }
  670.  
pure:

    /// Check if the next token is DIRECTIVE: ^ '%' ...
    bool checkDirective() @safe
    {
        return reader_.peekByte() == '%' && reader_.column == 0;
    }

    /// Check if the next token is DOCUMENT-START: ^ '---' (' '|'\n')
    bool checkDocumentStart() @safe
    {
        // Check one char first, then all 3, to prevent reading outside the buffer.
        return reader_.column == 0 &&
               reader_.peekByte() == '-' &&
               reader_.prefix(3) == "---" &&
               reader_.peek(3).isWhiteSpace;
    }

    /// Check if the next token is DOCUMENT-END: ^ '...' (' '|'\n')
    bool checkDocumentEnd() @safe
    {
        // Check one char first, then all 3, to prevent reading outside the buffer.
        return reader_.column == 0 &&
               reader_.peekByte() == '.' &&
               reader_.prefix(3) == "..." &&
               reader_.peek(3).isWhiteSpace;
    }

    /// Check if the next token is BLOCK-ENTRY: '-' (' '|'\n')
    bool checkBlockEntry() @safe
    {
        // The '-' itself was already seen by the caller; look at what follows.
        return !!reader_.peek(1).isWhiteSpace;
    }

    /// Check if the next token is KEY(flow context): '?'
    ///
    /// or KEY(block context): '?' (' '|'\n')
    bool checkKey() @safe
    {
        return (flowLevel_ > 0 || reader_.peek(1).isWhiteSpace);
    }

    /// Check if the next token is VALUE(flow context): ':'
    ///
    /// or VALUE(block context): ':' (' '|'\n')
    bool checkValue() @safe
    {
        return flowLevel_ > 0 || reader_.peek(1).isWhiteSpace;
    }

    /// Check if the next token is a plain scalar.
    ///
    /// A plain scalar may start with any non-space character except:
    ///   '-', '?', ':', ',', '[', ']', '{', '}',
    ///   '#', '&', '*', '!', '|', '>', '\'', '\"',
    ///   '%', '@', '`'.
    ///
    /// It may also start with
    ///   '-', '?', ':'
    /// if it is followed by a non-space character.
    ///
    /// Note that we limit the last rule to the block context (except the
    /// '-' character) because we want the flow context to be space
    /// independent.
    bool checkPlain() @safe
    {
        const c = reader_.peek();
        if(!c.isNonScalarStartCharacter)
        {
            return true;
        }
        return !reader_.peek(1).isWhiteSpace &&
               (c == '-' || (flowLevel_ == 0 && (c == '?' || c == ':')));
    }
  745.  
    /// Move to the next non-space character.
    void findNextNonSpace() @safe
    {
        while(reader_.peekByte() == ' ') { reader_.forward(); }
    }

    /// Scan a string of alphanumeric or "-_" characters.
    ///
    /// Assumes that the caller is building a slice in Reader, and puts the scanned
    /// characters into that slice.
    ///
    /// Throws: ScannerException if not even one such character is found.
    void scanAlphaNumericToSlice(string name)(const Mark startMark)
    {
        size_t length;
        dchar c = reader_.peek();
        while(c.isAlphaNum || c.among!('-', '_')) { c = reader_.peek(++length); }

        enforce(length > 0, new ScannerException("While scanning " ~ name,
                startMark, expected("alphanumeric, '-' or '_'", c), reader_.mark));

        reader_.sliceBuilder.write(reader_.get(length));
    }

    /// Scan an anchor or alias name.
    ///
    /// Assumes that the caller is building a slice in Reader, and puts the scanned
    /// characters into that slice.
    ///
    /// Throws: ScannerException if the name is empty.
    void scanAnchorAliasToSlice(const Mark startMark) @safe
    {
        size_t length;
        dchar c = reader_.peek();
        while (c.isNSAnchorName)
        {
            c = reader_.peek(++length);
        }

        enforce(length > 0, new ScannerException("While scanning an anchor or alias",
                startMark, expected("a printable character besides '[', ']', '{', '}' and ','", c), reader_.mark));

        reader_.sliceBuilder.write(reader_.get(length));
    }

    /// Scan and throw away all characters until next line break.
    void scanToNextBreak() @safe
    {
        while(!reader_.peek().isBreak) { reader_.forward(); }
    }

    /// Scan all characters until next line break.
    ///
    /// Assumes that the caller is building a slice in Reader, and puts the scanned
    /// characters into that slice.
    void scanToNextBreakToSlice() @safe
    {
        uint length;
        while(!reader_.peek(length).isBreak)
        {
            ++length;
        }
        reader_.sliceBuilder.write(reader_.get(length));
    }
  806.  
  807.  
    /// Move to next token in the file/stream.
    ///
    /// We ignore spaces, line breaks and comments.
    /// If we find a line break in the block context, we set
    /// `allowSimpleKey_` on.
    ///
    /// We do not yet support BOM inside the stream as the
    /// specification requires. Any such mark will be considered as a part
    /// of the document.
    void scanToNextToken() @safe
    {
        // TODO(PyYAML): We need to make tab handling rules more sane. A good rule is:
        //   Tabs cannot precede tokens
        //   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        //   KEY(block), VALUE(block), BLOCK-ENTRY
        // So the checking code is
        //   if <TAB>:
        //       allowSimpleKey_ = false
        // We also need to add the check for `allowSimpleKey_ == true` to
        // `unwindIndent` before issuing BLOCK-END.
        // Scanners for block, flow, and plain scalars need to be modified.

        for(;;)
        {
            // All whitespace in flow context is ignored, even whitespace
            // not allowed in other contexts.
            if (flowLevel_ > 0)
            {
                while(reader_.peekByte().isNonLinebreakWhitespace) { reader_.forward(); }
            }
            else
            {
                findNextNonSpace();
            }
            // A comment runs to the end of its line.
            if(reader_.peekByte() == '#') { scanToNextBreak(); }
            if(scanLineBreak() != '\0')
            {
                // Consumed a line break; in block context a simple key may
                // start at the beginning of the next line.
                if(flowLevel_ == 0) { allowSimpleKey_ = true; }
            }
            else
            {
                // No more spaces/comments/breaks: positioned at the next token.
                break;
            }
        }
    }
  853.  
    /// Scan directive token.
    ///
    /// Handles "%YAML", "%TAG" and reserved directives (the latter are
    /// skipped to the end of the line).
    Token scanDirective() @safe
    {
        Mark startMark = reader_.mark;
        // Skip the '%'.
        reader_.forward();

        // Scan directive name
        reader_.sliceBuilder.begin();
        scanDirectiveNameToSlice(startMark);
        const name = reader_.sliceBuilder.finish();

        reader_.sliceBuilder.begin();

        // Index where tag handle ends and suffix starts in a tag directive value.
        uint tagHandleEnd = uint.max;
        if(name == "YAML") { scanYAMLDirectiveValueToSlice(startMark); }
        else if(name == "TAG") { tagHandleEnd = scanTagDirectiveValueToSlice(startMark); }
        char[] value = reader_.sliceBuilder.finish();

        Mark endMark = reader_.mark;

        DirectiveType directive;
        if(name == "YAML") { directive = DirectiveType.yaml; }
        else if(name == "TAG") { directive = DirectiveType.tag; }
        else
        {
            // Unknown/reserved directive: record it and skip its content.
            directive = DirectiveType.reserved;
            scanToNextBreak();
        }

        scanDirectiveIgnoredLine(startMark);

        return directiveToken(startMark, endMark, value, directive, tagHandleEnd);
    }
  889.  
    /// Scan name of a directive token.
    ///
    /// Assumes that the caller is building a slice in Reader, and puts the scanned
    /// characters into that slice.
    ///
    /// Throws: ScannerException if the name is empty or not followed by
    ///         whitespace/line break/end of stream.
    void scanDirectiveNameToSlice(const Mark startMark) @safe
    {
        // Scan directive name.
        scanAlphaNumericToSlice!"a directive"(startMark);

        enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                new ScannerException("While scanning a directive", startMark,
                    expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark));
    }

    /// Scan value of a YAML directive token. Returns major, minor version separated by '.'.
    ///
    /// Assumes that the caller is building a slice in Reader, and puts the scanned
    /// characters into that slice.
    ///
    /// Throws: ScannerException on a malformed version number.
    void scanYAMLDirectiveValueToSlice(const Mark startMark) @safe
    {
        findNextNonSpace();

        scanYAMLDirectiveNumberToSlice(startMark);

        enforce(reader_.peekByte() == '.',
                new ScannerException("While scanning a directive", startMark,
                    expected("digit or '.'", reader_.peek()), reader_.mark));
        // Skip the '.'.
        reader_.forward();

        reader_.sliceBuilder.write('.');
        scanYAMLDirectiveNumberToSlice(startMark);

        // The version must be terminated by whitespace/break/end of stream.
        enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
                new ScannerException("While scanning a directive", startMark,
                    expected("digit or '.'", reader_.peek()), reader_.mark));
    }

    /// Scan a number from a YAML directive.
    ///
    /// Assumes that the caller is building a slice in Reader, and puts the scanned
    /// characters into that slice.
    ///
    /// Throws: ScannerException if the first character is not a digit.
    void scanYAMLDirectiveNumberToSlice(const Mark startMark) @safe
    {
        enforce(isDigit(reader_.peek()),
                new ScannerException("While scanning a directive", startMark,
                    expected("digit", reader_.peek()), reader_.mark));

        // Already found the first digit in the enforce(), so set length to 1.
        uint length = 1;
        while(reader_.peek(length).isDigit) { ++length; }

        reader_.sliceBuilder.write(reader_.get(length));
    }
  944.  
  945. /// Scan value of a tag directive.
  946. ///
  947. /// Assumes that the caller is building a slice in Reader, and puts the scanned
  948. /// characters into that slice.
  949. ///
  950. /// Returns: Length of tag handle (which is before tag prefix) in scanned data
  951. uint scanTagDirectiveValueToSlice(const Mark startMark) @safe
  952. {
  953. findNextNonSpace();
  954. const startLength = reader_.sliceBuilder.length;
  955. scanTagDirectiveHandleToSlice(startMark);
  956. const handleLength = cast(uint)(reader_.sliceBuilder.length - startLength);
  957. findNextNonSpace();
  958. scanTagDirectivePrefixToSlice(startMark);
  959.  
  960. return handleLength;
  961. }
  962.  
  963. /// Scan handle of a tag directive.
  964. ///
  965. /// Assumes that the caller is building a slice in Reader, and puts the scanned
  966. /// characters into that slice.
  967. void scanTagDirectiveHandleToSlice(const Mark startMark) @safe
  968. {
  969. scanTagHandleToSlice!"directive"(startMark);
  970. enforce(reader_.peekByte() == ' ',
  971. new ScannerException("While scanning a directive handle", startMark,
  972. expected("' '", reader_.peek()), reader_.mark));
  973. }
  974.  
  975. /// Scan prefix of a tag directive.
  976. ///
  977. /// Assumes that the caller is building a slice in Reader, and puts the scanned
  978. /// characters into that slice.
  979. void scanTagDirectivePrefixToSlice(const Mark startMark) @safe
  980. {
  981. scanTagURIToSlice!"directive"(startMark);
  982. enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
  983. new ScannerException("While scanning a directive prefix", startMark,
  984. expected("' '", reader_.peek()), reader_.mark));
  985. }
  986.  
  987. /// Scan (and ignore) ignored line after a directive.
  988. void scanDirectiveIgnoredLine(const Mark startMark) @safe
  989. {
  990. findNextNonSpace();
  991. if(reader_.peekByte() == '#') { scanToNextBreak(); }
  992. enforce(reader_.peek().isBreak,
  993. new ScannerException("While scanning a directive", startMark,
  994. expected("comment or a line break", reader_.peek()), reader_.mark));
  995. scanLineBreak();
  996. }
  997.  
  998.  
  999. /// Scan an alias or an anchor.
  1000. ///
  1001. /// The specification does not restrict characters for anchors and
  1002. /// aliases. This may lead to problems, for instance, the document:
  1003. /// [ *alias, value ]
  1004. /// can be interpreted in two ways, as
  1005. /// [ "value" ]
  1006. /// and
  1007. /// [ *alias , "value" ]
  1008. /// Therefore we restrict aliases to ASCII alphanumeric characters.
  1009. Token scanAnchor(const TokenID id) @safe
  1010. {
  1011. const startMark = reader_.mark;
  1012. reader_.forward(); // The */& character was only peeked, so we drop it now
  1013.  
  1014. reader_.sliceBuilder.begin();
  1015. scanAnchorAliasToSlice(startMark);
  1016. // On error, value is discarded as we return immediately
  1017. char[] value = reader_.sliceBuilder.finish();
  1018.  
  1019. assert(!reader_.peek().isNSAnchorName, "Anchor/alias name not fully scanned");
  1020.  
  1021. if(id == TokenID.alias_)
  1022. {
  1023. return aliasToken(startMark, reader_.mark, value);
  1024. }
  1025. if(id == TokenID.anchor)
  1026. {
  1027. return anchorToken(startMark, reader_.mark, value);
  1028. }
  1029. assert(false, "This code should never be reached");
  1030. }
  1031.  
/// Scan a tag token.
///
/// Handles verbatim tags ("!<...>"), tags with a handle ("!handle!suffix" or
/// "!suffix"), and the lone non-specific "!" tag.
Token scanTag() @safe
{
    const startMark = reader_.mark;
    // Character right after the initial '!'.
    dchar c = reader_.peek(1);

    reader_.sliceBuilder.begin();
    scope(failure) { reader_.sliceBuilder.finish(); }
    // Index where tag handle ends and tag suffix starts in the tag value
    // (slice) we will produce.
    uint handleEnd;

    if(c == '<')
    {
        // Verbatim tag: "!<" URI ">".
        reader_.forward(2);

        handleEnd = 0;
        scanTagURIToSlice!"tag"(startMark);
        enforce(reader_.peekByte() == '>',
            new ScannerException("While scanning a tag", startMark,
                expected("'>'", reader_.peek()), reader_.mark));
        reader_.forward();
    }
    else if(c.isWhiteSpace)
    {
        // A lone '!' is the non-specific tag.
        reader_.forward();
        handleEnd = 0;
        reader_.sliceBuilder.write('!');
    }
    else
    {
        // Look ahead for a second '!' to decide whether this tag uses a
        // named handle ("!handle!suffix") or the primary handle ("!suffix").
        uint length = 1;
        bool useHandle;

        while(!c.isBreakOrSpace)
        {
            if(c == '!')
            {
                useHandle = true;
                break;
            }
            ++length;
            c = reader_.peek(length);
        }

        if(useHandle)
        {
            scanTagHandleToSlice!"tag"(startMark);
            handleEnd = cast(uint)reader_.sliceBuilder.length;
        }
        else
        {
            // Primary handle: consume the leading '!' and write it.
            reader_.forward();
            reader_.sliceBuilder.write('!');
            handleEnd = cast(uint)reader_.sliceBuilder.length;
        }

        scanTagURIToSlice!"tag"(startMark);
    }

    // A tag must be followed by a space or a line break.
    enforce(reader_.peek().isBreakOrSpace,
        new ScannerException("While scanning a tag", startMark, expected("' '", reader_.peek()),
            reader_.mark));

    char[] slice = reader_.sliceBuilder.finish();
    return tagToken(startMark, reader_.mark, slice, handleEnd);
}
  1099.  
/// Scan a block scalar token (literal '|' or folded '>') with specified style.
///
/// Reads the header (chomping/indentation indicators), determines the scalar's
/// indentation, then scans the scalar content line by line, applying folding
/// and chomping rules. Trailing-break handling relies on SliceBuilder
/// transactions that can roll back breaks written to the slice.
Token scanBlockScalar(const ScalarStyle style) @safe
{
    const startMark = reader_.mark;

    // Scan the header: skip the '|'/'>' indicator first.
    reader_.forward();

    const indicators = scanBlockScalarIndicators(startMark);

    const chomping   = indicators[0];
    const increment  = indicators[1];
    scanBlockScalarIgnoredLine(startMark);

    // Determine the indentation level and go to the first non-empty line.
    Mark endMark;
    uint indent = max(1, indent_ + 1);

    reader_.sliceBuilder.begin();
    alias Transaction = SliceBuilder.Transaction;
    // Used to strip the last line breaks written to the slice at the end of the
    // scalar, which may be needed based on chomping.
    Transaction breaksTransaction = Transaction(&reader_.sliceBuilder);
    // Read the first indentation/line breaks before the scalar.
    size_t startLen = reader_.sliceBuilder.length;
    if(increment == int.min)
    {
        // No explicit indentation indicator: auto-detect from the content.
        auto indentation = scanBlockScalarIndentationToSlice();
        endMark = indentation[1];
        indent  = max(indent, indentation[0]);
    }
    else
    {
        // Explicit indentation indicator, relative to the parent indent.
        indent += increment - 1;
        endMark = scanBlockScalarBreaksToSlice(indent);
    }

    // int.max means there's no line break (int.max is outside UTF-32).
    dchar lineBreak = cast(dchar)int.max;

    // Scan the inner part of the block scalar.
    while(reader_.column == indent && reader_.peekByte() != '\0')
    {
        breaksTransaction.commit();
        const bool leadingNonSpace = !reader_.peekByte().among!(' ', '\t');
        // This is where the 'interesting' non-whitespace data gets read.
        scanToNextBreakToSlice();
        lineBreak = scanLineBreak();


        // This transaction serves to rollback data read in the
        // scanBlockScalarBreaksToSlice() call.
        breaksTransaction = Transaction(&reader_.sliceBuilder);
        startLen = reader_.sliceBuilder.length;
        // The line breaks should actually be written _after_ the if() block
        // below. We work around that by inserting
        endMark = scanBlockScalarBreaksToSlice(indent);

        // This will not run during the last iteration (see the if() vs the
        // while()), hence breaksTransaction rollback (which happens after this
        // loop) will never roll back data written in this if() block.
        if(reader_.column == indent && reader_.peekByte() != '\0')
        {
            // Unfortunately, folding rules are ambiguous.

            // This is the folding according to the specification:
            if(style == ScalarStyle.folded && lineBreak == '\n' &&
               leadingNonSpace && !reader_.peekByte().among!(' ', '\t'))
            {
                // No breaks were scanned; no need to insert the space in the
                // middle of slice.
                if(startLen == reader_.sliceBuilder.length)
                {
                    reader_.sliceBuilder.write(' ');
                }
            }
            else
            {
                // We need to insert in the middle of the slice in case any line
                // breaks were scanned.
                reader_.sliceBuilder.insert(lineBreak, startLen);
            }

            ////this is Clark Evans's interpretation (also in the spec
            ////examples):
            //
            //if(style == ScalarStyle.folded && lineBreak == '\n')
            //{
            //    if(startLen == endLen)
            //    {
            //        if(!" \t"d.canFind(reader_.peekByte()))
            //        {
            //            reader_.sliceBuilder.write(' ');
            //        }
            //        else
            //        {
            //            chunks ~= lineBreak;
            //        }
            //    }
            //}
            //else
            //{
            //    reader_.sliceBuilder.insertBack(lineBreak, endLen - startLen);
            //}
        }
        else
        {
            break;
        }
    }

    // If chomping is Keep, we keep (commit) the last scanned line breaks
    // (which are at the end of the scalar). Otherwise we remove them (end the
    // transaction).
    if(chomping == Chomping.keep)  { breaksTransaction.commit(); }
    else                           { breaksTransaction.end();    }
    if(chomping != Chomping.strip && lineBreak != int.max)
    {
        // If chomping is Keep, we keep the line break but the first line break
        // that isn't stripped (since chomping isn't Strip in this branch) must
        // be inserted _before_ the other line breaks.
        if(chomping == Chomping.keep)
        {
            reader_.sliceBuilder.insert(lineBreak, startLen);
        }
        // If chomping is not Keep, breaksTransaction was cancelled so we can
        // directly write the first line break (as it isn't stripped - chomping
        // is not Strip)
        else
        {
            reader_.sliceBuilder.write(lineBreak);
        }
    }

    char[] slice = reader_.sliceBuilder.finish();
    return scalarToken(startMark, endMark, slice, style);
}
  1237.  
  1238. /// Scan chomping and indentation indicators of a scalar token.
  1239. Tuple!(Chomping, int) scanBlockScalarIndicators(const Mark startMark) @safe
  1240. {
  1241. auto chomping = Chomping.clip;
  1242. int increment = int.min;
  1243. dchar c = reader_.peek();
  1244.  
  1245. /// Indicators can be in any order.
  1246. if(getChomping(c, chomping))
  1247. {
  1248. getIncrement(c, increment, startMark);
  1249. }
  1250. else
  1251. {
  1252. const gotIncrement = getIncrement(c, increment, startMark);
  1253. if(gotIncrement) { getChomping(c, chomping); }
  1254. }
  1255.  
  1256. enforce(c.among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'),
  1257. new ScannerException("While scanning a block scalar", startMark,
  1258. expected("chomping or indentation indicator", c), reader_.mark));
  1259.  
  1260. return tuple(chomping, increment);
  1261. }
  1262.  
  1263. /// Get chomping indicator, if detected. Return false otherwise.
  1264. ///
  1265. /// Used in scanBlockScalarIndicators.
  1266. ///
  1267. /// Params:
  1268. ///
  1269. /// c = The character that may be a chomping indicator.
  1270. /// chomping = Write the chomping value here, if detected.
  1271. bool getChomping(ref dchar c, ref Chomping chomping) @safe
  1272. {
  1273. if(!c.among!('+', '-')) { return false; }
  1274. chomping = c == '+' ? Chomping.keep : Chomping.strip;
  1275. reader_.forward();
  1276. c = reader_.peek();
  1277. return true;
  1278. }
  1279.  
  1280. /// Get increment indicator, if detected. Return false otherwise.
  1281. ///
  1282. /// Used in scanBlockScalarIndicators.
  1283. ///
  1284. /// Params:
  1285. ///
  1286. /// c = The character that may be an increment indicator.
  1287. /// If an increment indicator is detected, this will be updated to
  1288. /// the next character in the Reader.
  1289. /// increment = Write the increment value here, if detected.
  1290. /// startMark = Mark for error messages.
  1291. bool getIncrement(ref dchar c, ref int increment, const Mark startMark) @safe
  1292. {
  1293. if(!c.isDigit) { return false; }
  1294. // Convert a digit to integer.
  1295. increment = c - '0';
  1296. assert(increment < 10 && increment >= 0, "Digit has invalid value");
  1297.  
  1298. enforce(increment > 0,
  1299. new ScannerException("While scanning a block scalar", startMark,
  1300. expected("indentation indicator in range 1-9", "0"), reader_.mark));
  1301.  
  1302. reader_.forward();
  1303. c = reader_.peek();
  1304. return true;
  1305. }
  1306.  
  1307. /// Scan (and ignore) ignored line in a block scalar.
  1308. void scanBlockScalarIgnoredLine(const Mark startMark) @safe
  1309. {
  1310. findNextNonSpace();
  1311. if(reader_.peekByte()== '#') { scanToNextBreak(); }
  1312.  
  1313. enforce(reader_.peek().isBreak,
  1314. new ScannerException("While scanning a block scalar", startMark,
  1315. expected("comment or line break", reader_.peek()), reader_.mark));
  1316.  
  1317. scanLineBreak();
  1318. }
  1319.  
  1320. /// Scan indentation in a block scalar, returning line breaks, max indent and end mark.
  1321. ///
  1322. /// Assumes that the caller is building a slice in Reader, and puts the scanned
  1323. /// characters into that slice.
  1324. Tuple!(uint, Mark) scanBlockScalarIndentationToSlice() @safe
  1325. {
  1326. uint maxIndent;
  1327. Mark endMark = reader_.mark;
  1328.  
  1329. while(reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029'))
  1330. {
  1331. if(reader_.peekByte() != ' ')
  1332. {
  1333. reader_.sliceBuilder.write(scanLineBreak());
  1334. endMark = reader_.mark;
  1335. continue;
  1336. }
  1337. reader_.forward();
  1338. maxIndent = max(reader_.column, maxIndent);
  1339. }
  1340.  
  1341. return tuple(maxIndent, endMark);
  1342. }
  1343.  
  1344. /// Scan line breaks at lower or specified indentation in a block scalar.
  1345. ///
  1346. /// Assumes that the caller is building a slice in Reader, and puts the scanned
  1347. /// characters into that slice.
  1348. Mark scanBlockScalarBreaksToSlice(const uint indent) @safe
  1349. {
  1350. Mark endMark = reader_.mark;
  1351.  
  1352. for(;;)
  1353. {
  1354. while(reader_.column < indent && reader_.peekByte() == ' ') { reader_.forward(); }
  1355. if(!reader_.peek().among!('\n', '\r', '\u0085', '\u2028', '\u2029')) { break; }
  1356. reader_.sliceBuilder.write(scanLineBreak());
  1357. endMark = reader_.mark;
  1358. }
  1359.  
  1360. return endMark;
  1361. }
  1362.  
  1363. /// Scan a quoted flow scalar token with specified quotes.
  1364. Token scanFlowScalar(const ScalarStyle quotes) @safe
  1365. {
  1366. const startMark = reader_.mark;
  1367. const quote = reader_.get();
  1368.  
  1369. reader_.sliceBuilder.begin();
  1370.  
  1371. scanFlowScalarNonSpacesToSlice(quotes, startMark);
  1372.  
  1373. while(reader_.peek() != quote)
  1374. {
  1375. scanFlowScalarSpacesToSlice(startMark);
  1376. scanFlowScalarNonSpacesToSlice(quotes, startMark);
  1377. }
  1378. reader_.forward();
  1379.  
  1380. auto slice = reader_.sliceBuilder.finish();
  1381. return scalarToken(startMark, reader_.mark, slice, quotes);
  1382. }
  1383.  
/// Scan non-space characters in a flow scalar.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice. Returns when a space, break, or closing quote
/// is next (the caller handles those).
void scanFlowScalarNonSpacesToSlice(const ScalarStyle quotes, const Mark startMark)
    @safe
{
    for(;;)
    {
        dchar c = reader_.peek();

        // Copy the run of characters up to the next break/space/special
        // character (as classified by isFlowScalarBreakSpace) verbatim.
        size_t numCodePoints;
        while(!reader_.peek(numCodePoints).isFlowScalarBreakSpace) { ++numCodePoints; }

        if (numCodePoints > 0) { reader_.sliceBuilder.write(reader_.get(numCodePoints)); }

        c = reader_.peek();
        // '' inside a single-quoted scalar is an escaped single quote.
        if(quotes == ScalarStyle.singleQuoted && c == '\'' && reader_.peek(1) == '\'')
        {
            reader_.forward(2);
            reader_.sliceBuilder.write('\'');
        }
        // Quote/backslash characters that are literal in this quoting style.
        else if((quotes == ScalarStyle.doubleQuoted && c == '\'') ||
                (quotes == ScalarStyle.singleQuoted && c.among!('"', '\\')))
        {
            reader_.forward();
            reader_.sliceBuilder.write(c);
        }
        // Backslash escapes are only recognized in double-quoted scalars.
        else if(quotes == ScalarStyle.doubleQuoted && c == '\\')
        {
            reader_.forward();
            c = reader_.peek();
            if(c.among!(escapes))
            {
                reader_.forward();
                // Escaping has been moved to Parser as it can't be done in
                // place (in a slice) in case of '\P' and '\L' (very uncommon,
                // but we don't want to break the spec)
                char[2] escapeSequence = ['\\', cast(char)c];
                reader_.sliceBuilder.write(escapeSequence);
            }
            else if(c.among!(escapeHexCodeList))
            {
                // Hex escape (e.g. \x, \u, \U) followed by a fixed number of
                // hexadecimal digits determined by the escape character.
                const hexLength = dub.internal.dyaml.escapes.escapeHexLength(c);
                reader_.forward();

                foreach(i; 0 .. hexLength) {
                    enforce(reader_.peek(i).isHexDigit,
                        new ScannerException("While scanning a double quoted scalar", startMark,
                            expected("escape sequence of hexadecimal numbers",
                                reader_.peek(i)), reader_.mark));
                }
                char[] hex = reader_.get(hexLength);

                enforce((hex.length > 0) && (hex.length <= 8),
                    new ScannerException("While scanning a double quoted scalar", startMark,
                        "overflow when parsing an escape sequence of " ~
                        "hexadecimal numbers.", reader_.mark));

                // The escape is written out verbatim; the Parser decodes it.
                char[2] escapeStart = ['\\', cast(char) c];
                reader_.sliceBuilder.write(escapeStart);
                reader_.sliceBuilder.write(hex);

            }
            // An escaped line break continues the scalar on the next line.
            else if(c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
            {
                scanLineBreak();
                scanFlowScalarBreaksToSlice(startMark);
            }
            else
            {
                throw new ScannerException("While scanning a double quoted scalar", startMark,
                    text("found unsupported escape character ", c),
                    reader_.mark);
            }
        }
        // A space, break or the closing quote follows: caller handles it.
        else { return; }
    }
}
  1463.  
/// Scan space characters in a flow scalar.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// spaces into that slice. A lone '\n' break is folded into a single space;
/// other breaks and any additional breaks are written as-is.
void scanFlowScalarSpacesToSlice(const Mark startMark) @safe
{
    // Increase length as long as we see whitespace.
    size_t length;
    while(reader_.peekByte(length).among!(' ', '\t')) { ++length; }
    auto whitespaces = reader_.prefixBytes(length);

    // Can check the last byte without striding because '\0' is ASCII
    const c = reader_.peek(length);
    enforce(c != '\0',
        new ScannerException("While scanning a quoted scalar", startMark,
            "found unexpected end of buffer", reader_.mark));

    // Spaces not followed by a line break.
    if(!c.among!('\n', '\r', '\u0085', '\u2028', '\u2029'))
    {
        reader_.forward(length);
        reader_.sliceBuilder.write(whitespaces);
        return;
    }

    // There's a line break after the spaces.
    reader_.forward(length);
    const lineBreak = scanLineBreak();

    // A '\n' break may be folded into a space below; others are kept.
    if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }

    // If we have extra line breaks after the first, scan them into the
    // slice.
    const bool extraBreaks = scanFlowScalarBreaksToSlice(startMark);

    // No extra breaks, one normal line break. Replace it with a space.
    if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
}
  1502.  
  1503. /// Scan line breaks in a flow scalar.
  1504. ///
  1505. /// Assumes that the caller is building a slice in Reader, and puts the scanned
  1506. /// line breaks into that slice.
  1507. bool scanFlowScalarBreaksToSlice(const Mark startMark) @safe
  1508. {
  1509. // True if at least one line break was found.
  1510. bool anyBreaks;
  1511. for(;;)
  1512. {
  1513. // Instead of checking indentation, we check for document separators.
  1514. const prefix = reader_.prefix(3);
  1515. enforce(!(prefix == "---" || prefix == "...") ||
  1516. !reader_.peek(3).isWhiteSpace,
  1517. new ScannerException("While scanning a quoted scalar", startMark,
  1518. "found unexpected document separator", reader_.mark));
  1519.  
  1520. // Skip any whitespaces.
  1521. while(reader_.peekByte().among!(' ', '\t')) { reader_.forward(); }
  1522.  
  1523. // Encountered a non-whitespace non-linebreak character, so we're done.
  1524. if(!reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029')) { break; }
  1525.  
  1526. const lineBreak = scanLineBreak();
  1527. anyBreaks = true;
  1528. reader_.sliceBuilder.write(lineBreak);
  1529. }
  1530. return anyBreaks;
  1531. }
  1532.  
/// Scan plain scalar token (no block, no quotes).
Token scanPlain() @safe
{
    // We keep track of the allowSimpleKey_ flag here.
    // Indentation rules are loosened for the flow context
    const startMark = reader_.mark;
    Mark endMark = startMark;
    const indent = indent_ + 1;

    // We allow zero indentation for scalars, but then we need to check for
    // document separators at the beginning of the line.
    // if(indent == 0) { indent = 1; }

    reader_.sliceBuilder.begin();

    alias Transaction = SliceBuilder.Transaction;
    // Holds trailing whitespace so it can be rolled back if the scalar
    // ends before more content follows.
    Transaction spacesTransaction;
    // Stop at a comment.
    while(reader_.peekByte() != '#')
    {
        // Scan the entire plain scalar.
        size_t length;
        dchar c = reader_.peek(length);
        // Find the end of the run of scalar characters: stop at whitespace,
        // at ": " outside flow context, or at a flow indicator inside one.
        for(;;)
        {
            const cNext = reader_.peek(length + 1);
            if(c.isWhiteSpace ||
               (flowLevel_ == 0 && c == ':' && cNext.isWhiteSpace) ||
               (flowLevel_ > 0 && c.among!(',', ':', '?', '[', ']', '{', '}')))
            {
                break;
            }
            ++length;
            c = cNext;
        }

        // It's not clear what we should do with ':' in the flow context.
        enforce(flowLevel_ == 0 || c != ':' ||
           reader_.peek(length + 1).isWhiteSpace ||
           reader_.peek(length + 1).among!(',', '[', ']', '{', '}'),
            new ScannerException("While scanning a plain scalar", startMark,
                "found unexpected ':' . Please check " ~
                "http://pyyaml.org/wiki/YAMLColonInFlowContext for details.",
                reader_.mark));

        if(length == 0) { break; }

        // A plain scalar disallows a simple key until the next line break.
        allowSimpleKey_ = false;

        reader_.sliceBuilder.write(reader_.get(length));

        endMark = reader_.mark;

        spacesTransaction.commit();
        spacesTransaction = Transaction(&reader_.sliceBuilder);

        // Stop when no whitespace followed the content (scalar ends), or
        // when indentation drops below this scalar's level (block context).
        const startLength = reader_.sliceBuilder.length;
        scanPlainSpacesToSlice();
        if(startLength == reader_.sliceBuilder.length ||
           (flowLevel_ == 0 && reader_.column < indent))
        {
            break;
        }
    }

    // Discard any trailing whitespace written after the last content.
    spacesTransaction.end();
    char[] slice = reader_.sliceBuilder.finish();

    return scalarToken(startMark, endMark, slice, ScalarStyle.plain);
}
  1603.  
/// Scan spaces in a plain scalar.
///
/// Assumes that the caller is building a slice in Reader, and puts the spaces
/// into that slice. A lone '\n' break is folded into a single space. Returns
/// without committing scanned breaks when a document separator follows.
void scanPlainSpacesToSlice() @safe
{
    // The specification is really confusing about tabs in plain scalars.
    // We just forbid them completely. Do not use tabs in YAML!

    // Get as many plain spaces as there are.
    size_t length;
    while(reader_.peekByte(length) == ' ') { ++length; }
    char[] whitespaces = reader_.prefixBytes(length);
    reader_.forward(length);

    const dchar c = reader_.peek();
    if(!c.isNSChar)
    {
        // We have spaces, but no newline.
        if(whitespaces.length > 0) { reader_.sliceBuilder.write(whitespaces); }
        return;
    }

    // Newline after the spaces (if any)
    const lineBreak = scanLineBreak();
    // After a line break, a simple key is allowed again.
    allowSimpleKey_ = true;

    // True when the reader is positioned at a "---"/"..." document separator.
    static bool end(Reader reader_) @safe pure
    {
        const prefix = reader_.prefix(3);
        return ("---" == prefix || "..." == prefix)
                && reader_.peek(3).among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029');
    }

    if(end(reader_)) { return; }

    bool extraBreaks;

    alias Transaction = SliceBuilder.Transaction;
    auto transaction = Transaction(&reader_.sliceBuilder);
    // A '\n' break may be folded into a single space below; others are kept.
    if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); }
    while(reader_.peek().isNSChar)
    {
        if(reader_.peekByte() == ' ') { reader_.forward(); }
        else
        {
            const lBreak = scanLineBreak();
            extraBreaks  = true;
            reader_.sliceBuilder.write(lBreak);

            // Returning here discards the scanned breaks: the transaction
            // is never committed.
            if(end(reader_)) { return; }
        }
    }
    transaction.commit();

    // No line breaks, only a space.
    if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); }
}
  1662.  
/// Scan handle of a tag token: a lone '!', or '!' followed by a run of
/// alphanumeric/'-'/'_' characters terminated by another '!'.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
void scanTagHandleToSlice(string name)(const Mark startMark)
{
    dchar c = reader_.peek();
    enum contextMsg = "While scanning a " ~ name;
    // A handle always starts with '!'.
    enforce(c == '!',
        new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));

    uint length = 1;
    c = reader_.peek(length);
    if(c != ' ')
    {
        // Named handle: the name must be terminated by a second '!'.
        while(c.isAlphaNum || c.among!('-', '_'))
        {
            ++length;
            c = reader_.peek(length);
        }
        enforce(c == '!',
            new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark));
        ++length;
    }

    reader_.sliceBuilder.write(reader_.get(length));
}
  1690.  
/// Scan URI in a tag token.
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice. Throws if no URI characters were scanned.
void scanTagURIToSlice(string name)(const Mark startMark)
{
    // Note: we do not check if URI is well-formed.
    dchar c = reader_.peek();
    const startLen = reader_.sliceBuilder.length;
    {
        uint length;
        while(c.isAlphaNum || c.isURIChar)
        {
            if(c == '%')
            {
                // Flush pending characters, then decode the %XX escapes in
                // place via scanURIEscapesToSlice.
                auto chars = reader_.get(length);
                reader_.sliceBuilder.write(chars);
                length = 0;
                scanURIEscapesToSlice!name(startMark);
            }
            else { ++length; }
            c = reader_.peek(length);
        }
        // Flush any remaining pending characters.
        if(length > 0)
        {
            auto chars = reader_.get(length);
            reader_.sliceBuilder.write(chars);
            length = 0;
        }
    }
    // OK if we scanned something, error otherwise.
    enum contextMsg = "While parsing a " ~ name;
    enforce(reader_.sliceBuilder.length > startLen,
        new ScannerException(contextMsg, startMark, expected("URI", c), reader_.mark));
}
  1726.  
// Not @nogc yet because std.utf.decode is not @nogc
/// Scan URI escape sequences ('%' followed by two hex digits, repeated).
///
/// Assumes that the caller is building a slice in Reader, and puts the scanned
/// characters into that slice.
void scanURIEscapesToSlice(string name)(const Mark startMark)
{
    import core.exception : UnicodeException;
    // URI escapes encode a UTF-8 string. We store UTF-8 code units here for
    // decoding into UTF-32.
    Appender!string buffer;


    enum contextMsg = "While scanning a " ~ name;
    while(reader_.peekByte() == '%')
    {
        reader_.forward();
        // Each '%' must be followed by exactly two hexadecimal digits.
        char[2] nextByte = [reader_.peekByte(), reader_.peekByte(1)];

        enforce(nextByte[0].isHexDigit && nextByte[1].isHexDigit,
            new ScannerException(contextMsg, startMark,
                expected("URI escape sequence of 2 hexadecimal " ~
                    "numbers", nextByte), reader_.mark));

        // Parse the hex pair into one UTF-8 code unit.
        buffer ~= nextByte[].to!ubyte(16);

        reader_.forward(2);
    }
    try
    {
        // Decode the accumulated UTF-8 code units and write the resulting
        // code points into the slice.
        foreach (dchar chr; buffer.data)
        {
            reader_.sliceBuilder.write(chr);
        }
    }
    catch (UnicodeException)
    {
        throw new ScannerException(contextMsg, startMark,
            "Invalid UTF-8 data encoded in URI escape sequence",
            reader_.mark);
    }
}
  1769.  
  1770.  
  1771. /// Scan a line break, if any.
  1772. ///
  1773. /// Transforms:
  1774. /// '\r\n' : '\n'
  1775. /// '\r' : '\n'
  1776. /// '\n' : '\n'
  1777. /// '\u0085' : '\n'
  1778. /// '\u2028' : '\u2028'
  1779. /// '\u2029 : '\u2029'
  1780. /// no break : '\0'
  1781. dchar scanLineBreak() @safe
  1782. {
  1783. // Fast path for ASCII line breaks.
  1784. const b = reader_.peekByte();
  1785. if(b < 0x80)
  1786. {
  1787. if(b == '\n' || b == '\r')
  1788. {
  1789. if(reader_.prefix(2) == "\r\n") { reader_.forward(2); }
  1790. else { reader_.forward(); }
  1791. return '\n';
  1792. }
  1793. return '\0';
  1794. }
  1795.  
  1796. const c = reader_.peek();
  1797. if(c == '\x85')
  1798. {
  1799. reader_.forward();
  1800. return '\n';
  1801. }
  1802. if(c == '\u2028' || c == '\u2029')
  1803. {
  1804. reader_.forward();
  1805. return c;
  1806. }
  1807. return '\0';
  1808. }
  1809. }