Nested comments

Jan 19 at 8:15 PM
Edited Jan 19 at 8:20 PM
The parser for SQL supports many features of TSQL, but there is one pesky detail that I had to solve - nested comments. Apparently, in TSQL you can do something like this:
/* comment
/* nested comment
still comment (this would not work in most languages)
To take care of this, I extended the CommentTerminal class (CommentTerminal.cs) as follows:

#region License
/* **********************************************************************************
 * Copyright (c) Roman Ivantsov
 * This source code is subject to terms and conditions of the MIT License
 * for Irony. A copy of the license can be found in the License.txt file
 * at the root of this distribution. 
 * By using this source code in any fashion, you are agreeing to be bound by the terms of the 
 * MIT License.
 * You must not remove this notice from this software.
 * **********************************************************************************/

using System;
using System.Collections.Generic;
using System.Text;

namespace Irony.Parsing {

  public class CommentTerminal : Terminal {
    public CommentTerminal(string name, string startSymbol, params string[] endSymbols) : base(name, TokenCategory.Comment) {
      this.StartSymbol = startSymbol;
      this.EndSymbols = new StringList();
      Priority = TerminalPriority.High; //assign max priority

        public CommentTerminal(string name, string startSymbol, bool canBeNested, params string[] endSymbols) : base(name, TokenCategory.Comment)
            this.StartSymbol = startSymbol;
            this.EndSymbols = new StringList();
            Priority = TerminalPriority.High; //assign max priority
            _canBeNested = canBeNested;

        public string StartSymbol;
    public StringList EndSymbols;
    private char[] _endSymbolsFirsts;
        private char[] _startAndEndSymbolsFirsts;
        private bool _isLineComment; //true if NewLine is one of EndSymbols; if yes, EOF is also considered a valid end symbol
    private bool _canBeNested = false;

    #region overrides
    public override void Init(GrammarData grammarData) {
      //_endSymbolsFirsts char array is used for fast search for end symbols using String's method IndexOfAny(...)
      _endSymbolsFirsts = new char[EndSymbols.Count];
            _startAndEndSymbolsFirsts = new char[EndSymbols.Count + 1];
            for (int i = 0; i < EndSymbols.Count; i++) {
        string sym = EndSymbols[i];
        _endSymbolsFirsts[i] = sym[0];
                _startAndEndSymbolsFirsts[i] = sym[0];
        _isLineComment |= sym.Contains("\n");
        if (!_isLineComment)
            _startAndEndSymbolsFirsts[EndSymbols.Count] = StartSymbol[0];
      if (this.EditorInfo == null) {
        TokenType ttype = _isLineComment ? TokenType.LineComment : TokenType.Comment;
        this.EditorInfo = new TokenEditorInfo(ttype, TokenColor.Comment, TokenTriggers.None);

    public override Token TryMatch(ParsingContext context, ISourceStream source) {
      Token result;
      if (context.VsLineScanState.Value != 0) {
        // we are continuing in line mode - restore internal env (none in this case)
        context.VsLineScanState.Value = 0;
      } else {
        //we are starting from scratch
        if (!BeginMatch(context, source)) return null;
      result = CompleteMatch(context, source);
      if (result != null) return result;
      //if it is LineComment, it is ok to hit EOF without final line-break; just return all until end.
      if (_isLineComment)
        return source.CreateToken(this.OutputTerminal);
      if (context.Mode == ParseMode.VsLineScan)
        return CreateIncompleteToken(context, source);
      return context.CreateErrorToken(Resources.ErrUnclosedComment);

    private Token CreateIncompleteToken(ParsingContext context, ISourceStream source) {
      source.PreviewPosition = source.Text.Length;
      Token result = source.CreateToken(this.OutputTerminal);
      result.Flags |= TokenFlags.IsIncomplete;
      context.VsLineScanState.TerminalIndex = this.MultilineIndex;
      return result; 

    private bool BeginMatch(ParsingContext context, ISourceStream source) {
      //Check starting symbol
      if (!source.MatchSymbol(StartSymbol)) return false;
      source.PreviewPosition += StartSymbol.Length;
      return true; 
    private Token CompleteMatch(ParsingContext context, ISourceStream source) {
            //Find end symbol
            int nestLevel = 0;
      while (!source.EOF()) {
        int firstCharPos;
        if (EndSymbols.Count == 1 && !_canBeNested)
          firstCharPos = source.Text.IndexOf(EndSymbols[0], source.PreviewPosition);
        else if(!_canBeNested)
          firstCharPos = source.Text.IndexOfAny(_endSymbolsFirsts, source.PreviewPosition);
            firstCharPos = source.Text.IndexOfAny(_startAndEndSymbolsFirsts, source.PreviewPosition);
          if (firstCharPos < 0) {
          source.PreviewPosition = source.Text.Length;
          return null; //indicating error
        //We found a character that might start an end symbol; let's see if it is true.
        source.PreviewPosition = firstCharPos;
                if (_canBeNested && source.MatchSymbol(StartSymbol))
                    foreach (string endSymbol in EndSymbols)
                        if (source.MatchSymbol(endSymbol))
                            //We found end symbol; eat end symbol only if it is not line comment.
                            // For line comment, leave LF symbol there, it might be important to have a separate LF token
                            if (!_isLineComment)
                                source.PreviewPosition += endSymbol.Length;
                            if (!_canBeNested || nestLevel == 0)
                                return source.CreateToken(this.OutputTerminal);
                    }//foreach endSymbol
        source.PreviewPosition++; //move to the next char and try again    
      return null; //might happen if we found a start char of end symbol, but not the full endSymbol

    public override IList<string> GetFirsts() {
      return new string[] { StartSymbol };
  }//CommentTerminal class

Using the new constructor parameter, you can now allow a nested comment as such:
var comment = new CommentTerminal("comment", "/*", true, "*/");
The same goes for MDX.
Hope this helps someone.