How to make a parser for BBCode (or XML, or any similar markup language)

Jan 17, 2012 at 3:23 PM

Hi,

I'm trying to use Irony to make a BBCode parser, but I don't know much about parsers and such... I looked at the samples provided with Irony, but I didn't find anything that looks like a markup language (except Wiki, but it doesn't have a structure similar to XML or BBCode so it's not very helpful). How should I go about defining the grammar for such a language?

Here's what I tried so far. It's my first attempt at this kind of thing, so forgive me if it doesn't make any sense...

    /// <summary>
    /// Irony grammar for BBCode markup: a sequence of plain-text runs and
    /// bracketed tags (<c>[name]</c> or <c>[name=param]</c> ... <c>[/name]</c>).
    /// </summary>
    [Language("BBCode", "1.0", "BBCode markup grammar")]
    public class BBCodeGrammar : Grammar
    {
        // Punctuation shared by every tag rule built in MakeTagRule.
        private readonly KeyTerm _openingBracket;
        private readonly KeyTerm _closingBracket;
        private readonly KeyTerm _slash;
        private readonly KeyTerm _equals;

        /// <summary>
        /// Builds the terminals, the per-tag rules and the root rule of the grammar.
        /// </summary>
        public BBCodeGrammar()
        {
            this.GrammarComments = "A grammar to parse BBCode";

            _openingBracket = ToTerm("[");
            _closingBracket = ToTerm("]");
            _slash = ToTerm("/");
            _equals = ToTerm("=");

            var bbElement = new NonTerminal("bbElement");
            var bbText = new NonTerminal("bbText");

            // Terminals
            var text = new BBCodeText();

            // Formatting
            var bold = new BBCodeInlineTag("b", false);
            MakeTagRule(bold, bbText, null);
            var italic = new BBCodeInlineTag("i", false);
            MakeTagRule(italic, bbText, null);
            var underlined = new BBCodeInlineTag("u", false);
            MakeTagRule(underlined, bbText, null);
            var striked = new BBCodeInlineTag("s", false);
            MakeTagRule(striked, bbText, null);
            var size = new BBCodeInlineTag("size", true);
            MakeTagRule(size, bbText, text);
            var color = new BBCodeInlineTag("color", true);
            MakeTagRule(color, bbText, text);
            var inlineCode = new BBCodeInlineTag("CODEINLINE", true);
            MakeTagRule(inlineCode, text, text);

            // Links, images and attachments
            var url = new BBCodeInlineTag("url", null);
            MakeTagRule(url, bbText, text);
            var email = new BBCodeInlineTag("email", true);
            MakeTagRule(email, bbText, text);
            var image = new BBCodeInlineTag("img", false);
            MakeTagRule(image, text, null);
            var attachment = new BBCodeInlineTag("attach", false);
            MakeTagRule(attachment, text, null);

            // Blocks
            var codeBlock = new BBCodeBlockTag("code", null);
            MakeTagRule(codeBlock, bbText, text);
            var blockQuote = new BBCodeBlockTag("quote", null);
            MakeTagRule(blockQuote, bbText, text);
            var preformatted = new BBCodeBlockTag("pre", false);
            MakeTagRule(preformatted, bbText, null);

            // List
            // The list tag and its item sequence need separate non-terminals:
            // MakeStarRule assigns the Rule of the non-terminal it is given, so
            // passing the tag itself would produce a rule that MakeTagRule then
            // overwrites with an expression referring back to the tag.
            var list = new BBCodeListTag();
            var listItem = new NonTerminal("listItem");
            var listItems = new NonTerminal("listItems");
            listItem.Rule = _openingBracket + ToTerm("*") + _closingBracket + bbText;
            listItems.Rule = MakeStarRule(listItems, listItem);
            MakeTagRule(list, listItems, text);

            // Rules
            bbElement.Rule = text | bold | italic | underlined | striked | size | color | inlineCode |
                             url | email | image | attachment |
                             codeBlock | blockQuote | preformatted | list;
            bbText.Rule = MakeStarRule(bbText, bbElement);

            Root = bbText;
        }

        /// <summary>
        /// Assigns <paramref name="tag"/> the rule
        /// <c>[name(=param)?] content [/name]</c>.
        /// </summary>
        /// <param name="tag">Tag non-terminal whose Rule is set; its Name is used as the tag keyword.</param>
        /// <param name="content">Term allowed between the opening and closing tags.</param>
        /// <param name="paramContent">
        /// Term allowed after <c>=</c> in the opening tag, or null for no parameter.
        /// It is only used when <c>tag.HasParameter</c> is true (parameter required)
        /// or null (parameter optional); otherwise the opening tag takes no parameter.
        /// </param>
        private void MakeTagRule(BBCodeTag tag, BnfTerm content, BnfTerm paramContent)
        {
            BnfExpression param = Empty;
            if (paramContent != null)
            {
                if (tag.HasParameter == true)
                    param = _equals + paramContent;
                else if (tag.HasParameter == null)
                    param = Empty | (_equals + paramContent);
            }
            BnfExpression openingTag = _openingBracket + tag.Name + param + _closingBracket;
            BnfExpression closingTag = _openingBracket + _slash + tag.Name + _closingBracket;
            tag.Rule = openingTag + content + closingTag;
        }
    }

    /// <summary>Categories assigned to the BBCode grammar terms.</summary>
    enum BBTermType
    {
        /// <summary>Plain text run (used by BBCodeText).</summary>
        Text = 0,

        /// <summary>List item.</summary>
        ListItem = 1,

        /// <summary>Inline tag (used by BBCodeInlineTag).</summary>
        Inline = 2,

        /// <summary>Block tag (used by BBCodeBlockTag).</summary>
        Block = 3,

        /// <summary>List tag (used by BBCodeListTag).</summary>
        List = 4
    }

    /// <summary>
    /// Base class for BBCode terminals: a Terminal that also carries a
    /// <see cref="BBTermType"/> fixed at construction time.
    /// </summary>
    abstract class BBCodeTerminalBase : Terminal
    {
        /// <param name="name">Terminal name, passed to the Terminal base constructor.</param>
        /// <param name="type">Category reported by <see cref="Type"/>.</param>
        protected BBCodeTerminalBase(string name, BBTermType type)
            : base(name)
        {
            Type = type;
        }

        /// <summary>Category of this terminal, as given to the constructor.</summary>
        public BBTermType Type { get; private set; }
    }

    /// <summary>
    /// Terminal matching a run of plain text: every character from the current
    /// position up to (not including) the next '[' , '\r' or '\n', or to the
    /// end of the input when none of those follows.
    /// </summary>
    class BBCodeText : BBCodeTerminalBase
    {
        public BBCodeText() : base("text", BBTermType.Text)
        {
        }

        /// <summary>
        /// Returns null: a text run has no fixed set of starting strings.
        /// </summary>
        // NOTE(review): presumably Irony treats a terminal without firsts as a
        // candidate at every position - confirm against the Irony scanner.
        public override IList<string> GetFirsts()
        {
            return null;
        }

        // Characters that end a text run.
        private static readonly char[] _stopChars = new[] { '\r', '\n', '[' };

        /// <summary>
        /// Tries to read a text run starting at the current source position.
        /// </summary>
        /// <returns>
        /// A token covering the run, or null when the input is exhausted or the
        /// current character is itself a stop character (empty run).
        /// </returns>
        public override Token TryMatch(ParsingContext context, ISourceStream source)
        {
            int start = source.Location.Position;
            string input = source.Text;

            // Nothing left to read; also keeps IndexOfAny's start index in range.
            if (start >= input.Length) return null;

            // Search from the current character itself: starting one past it
            // would swallow a leading '[' (or newline) into the text token and
            // make the empty-run check below unreachable.
            int stopIndex = input.IndexOfAny(_stopChars, start);
            if (stopIndex == start) return null;
            if (stopIndex < 0) stopIndex = input.Length;

            source.PreviewPosition = stopIndex;
            return source.CreateToken(this.OutputTerminal);
        }
    }

    /// <summary>
    /// Base class for BBCode tag non-terminals. Besides the NonTerminal name it
    /// records the tag category and whether the opening tag takes a parameter.
    /// </summary>
    abstract class BBCodeTag : NonTerminal
    {
        /// <param name="name">Tag name, passed to the NonTerminal base constructor.</param>
        /// <param name="hasParameter">Value reported by <see cref="HasParameter"/>.</param>
        /// <param name="type">Category reported by <see cref="Type"/>.</param>
        protected BBCodeTag(string name, bool? hasParameter, BBTermType type)
            : base(name)
        {
            Type = type;
            HasParameter = hasParameter;
        }

        /// <summary>Category of this tag, as given to the constructor.</summary>
        public BBTermType Type { get; private set; }

        /// <summary>
        /// Parameter flag given to the constructor; BBCodeGrammar.MakeTagRule
        /// reads true as "parameter required" and null as "parameter optional".
        /// </summary>
        public bool? HasParameter { get; private set; }
    }

    /// <summary>A BBCode tag categorised as <see cref="BBTermType.Inline"/>.</summary>
    class BBCodeInlineTag : BBCodeTag
    {
        /// <param name="name">Tag name.</param>
        /// <param name="hasParameter">Parameter flag forwarded to <see cref="BBCodeTag"/>.</param>
        public BBCodeInlineTag(string name, bool? hasParameter)
            : base(name, hasParameter, BBTermType.Inline)
        {
        }
    }

    /// <summary>A BBCode tag categorised as <see cref="BBTermType.Block"/>.</summary>
    class BBCodeBlockTag : BBCodeTag
    {
        /// <param name="name">Tag name.</param>
        /// <param name="hasParameter">Parameter flag forwarded to <see cref="BBCodeTag"/>.</param>
        public BBCodeBlockTag(string name, bool? hasParameter) : base(name, hasParameter, BBTermType.Block)
        {
        }
    }

    /// <summary>
    /// The "list" tag: categorised as <see cref="BBTermType.List"/>, with a
    /// null parameter flag.
    /// </summary>
    class BBCodeListTag : BBCodeTag
    {
        public BBCodeListTag()
            : base("list", null, BBTermType.List)
        {
        }
    }

Needless to say, it doesn't work... I'm getting a few errors in Grammar Explorer (things like "Shift-reduce conflict. State S2, lookaheads [[]. Selected shift as preferred action.", which I don't understand at all), and the parser output only gives me a series of "bbElement" nodes with text terminals.

Could someone tell me what I'm doing wrong (probably many things, I guess)? Am I even going in the right direction?

Coordinator
Feb 8, 2012 at 1:01 AM

BBCode sounds familiar; I think somebody mentioned it before (that is, somebody built a parser for it), but I'm not sure. Try searching the discussions.

I can't help you with your grammar; it seems way too complicated, and I don't know anything about the target language. I would suggest first reading a bit about LR/LALR parsing before you embark on building parsers; at the very least, you should clearly understand what a shift-reduce conflict is. Sorry, Irony makes it easier to build an LALR parser, but you still have to know what one is.

Roman

Feb 8, 2012 at 8:16 AM

Hi Roman,

Thanks for your answer. I had searched the discussions before, but there's nothing about BBCode. I guess I should read about parsers in general before trying to do this... but anyway, writing a parser wasn't my final goal, so eventually I used a parser made by someone else.

Regards,
Thomas