﻿// Converts HTML to a DOM Tree

// Copyright 2011 Nathanael Schmolze
// The following code may be used for any purpose, and modified at your every whim,
// as long as you keep this notice at the top of the code, and promise to thank
// me if we cross paths in the next life.

// Regarding modifications: This code isn't 100% bug proof; it'll probably be perfect
// when html is perfect :-/.  So, if you make any improvements that you think others
// may benefit from, please email me the changes at opensourcejunkie@gmail.com.  I
// would have made this library open source to facilitate such improvement, but I
// didn't want to frighten anyone away with the legalese of the LGPL.  So, it's my
// gift to the world - just make sure to give back.



// name: convertHtmlToDom
// desc: converts the provided string of html to a DOM tree, and returns the
//       resulting DocumentFragment object.  The html is tokenized by a small
//       hand-written lexer and parsed by recursive descent using this grammar:
//           html        ::= { <tagContents> }
//           tagContents ::= { <tagSet> | text }
//           tagSet      ::= <tagBegin> [ <tagContents> <tagEnd> ]
//           tagBegin    ::= "<" identifier { <attribute> } ( ">" | "/>" )
//           attribute   ::= identifier [ "=" """ [ quoted_string ] """ ]
//           tagEnd      ::= "</" identifier ">"
//       Only double-quoted attribute values are recognised, every tag must be
//       closed (or self-closed with "/>"), and text is handed to
//       createTextNode() exactly as written.
// parm: html - string containing well-formatted html
//       doc  - the document object (usually window.document) used to create the
//              nodes; when omitted the global document is used if one exists
// retn: the DocumentFragment that holds the parsed nodes
// thrw: BadHTMLSyntaxException    - the parser met a token it was not expecting
//       BadHTMLSemanticsException - an end tag does not match its begin tag
//       TypeError                 - no document object is available
function convertHtmlToDom(html, doc) {
    'use strict';

    //|| Variables ||//
    // Every helper below is a closure-local function, so the converter no longer
    // depends on what `this` is bound to and adds nothing to the global object.
    var owner_document = doc || (typeof document !== 'undefined' ? document : null);
    var position = 0;                 // index of the next unread character of html
    var line_number = 1;              // line of the last character consumed
    var char_number = 0;              // column of the last character consumed
    var state_text = true;            // html begins outside of an element node
    var state_quoted_string = false;  // true between an attribute's double quotes
    var current_token = null;         // one-token lookahead shared by the parser
    var syntax_error_messages = [
        'Expecting an identifier',
        'Expecting a left angle bracket (\'<\')',
        'Expecting a left angle bracket and forward slash (\'</\')',
        'Expecting a right angle bracket (\'>\')',
        'Expecting a right angle bracket (\'>\') or a forward slash and right angle bracket (\'/>\')',
        'Expecting an equals sign (\'=\')',
        'Expecting a double quote (\'"\')',
        'Expecting a quoted string (\'"hello, world!"\')',
        'Expecting text (\'hello, world!\') or a left angle bracket (\'<\')'
    ];

    if (!owner_document) {
        throw new TypeError('convertHtmlToDom requires a document object');
    }


    //|| Objects ||//
    // object to hold string literal with a matching token type/symbol
    function LToken(p_literal, p_symbol) {
        this.literal = p_literal;
        this.symbol = p_symbol;
    }


    //|| Error Handlers ||//

    // syntax error handler: maps error_num to its message and throws
    function throwSyntaxError(error_num) {
        var error_message = syntax_error_messages[error_num] || 'Syntax error';
        throw new BadHTMLSyntaxException(line_number, char_number, error_message);
    }

    // semantics error handler: begin and end tag names disagree
    function throwSemanticsError(begin_name, end_name) {
        var error_message = 'The end tag "</' + end_name + '>" does not match the beginning tag "<'
                          + begin_name + '>"';
        throw new BadHTMLSemanticsException(line_number, char_number, error_message);
    }


    //|| Lexical Analyzer ||//
    // The lexer only ever looks ahead (html.charAt(position + n)) and never steps
    // backwards, so a newline is counted exactly once for error reporting.

    // consumes and returns the next character, keeping line/char numbers in step
    function advanceChar() {
        var c = html.charAt(position);
        position++;
        if (c === '\n') {
            line_number++;
            char_number = 0;
        } else {
            char_number++;
        }
        return c;
    }

    // checks if provided character is whitespace character
    function isWhitespace(c) {
        return c === ' ' || c === '\r' || c === '\t' || c === '\n';
    }

    function isNotDoubleQuote(c) {
        return c !== '"';
    }

    function isNotLeftAngle(c) {
        return c !== '<';
    }

    // true while c (the character AT `position`) still belongs to a tag or
    // attribute name
    function isIdentifierChar(c) {
        if (c === '<' || c === '>' || c === '=' || c === '"' || isWhitespace(c)) {
            return false;
        }
        // a slash ends the name only when it begins the '/>' symbol, so that
        // "<br/>" is read as 'br' followed by '/>' rather than as a tag named 'br/'
        return !(c === '/' && html.charAt(position + 1) === '>');
    }

    // consumes characters for as long as keep_going(next character) holds and
    // returns what was consumed
    function readWhile(keep_going) {
        var buffer = '';
        while (position < html.length && keep_going(html.charAt(position))) {
            buffer += advanceChar();
        }
        return buffer;
    }

    // returns an LToken object representing the next lexeme
    function getToken() {
        var c;

        //:: Whitespace between the parts of a tag carries no meaning
        if (!state_text && !state_quoted_string) {
            readWhile(isWhitespace);
        }

        //:: If @ End Of String
        if (position >= html.length) {
            return new LToken('', 'eos');
        }

        c = html.charAt(position);

        //:: Quoted String
        //   checked before any symbol so that a value such as ">" or "=x" is
        //   kept as attribute text instead of being split into symbols
        if (state_quoted_string) {
            if (c === '"') {
                advanceChar();
                state_quoted_string = false;  // toggle string reading off
                return new LToken('"', '"');
            }
            return new LToken(readWhile(isNotDoubleQuote), 'quoted_string');
        }

        //:: Symbols '<' && '</'
        if (c === '<') {
            advanceChar();
            state_text = false;
            if (html.charAt(position) === '/') {
                advanceChar();
                return new LToken('</', '</');
            }
            return new LToken('<', '<');
        }

        //:: Text
        //   everything up to the next '<', including '>', '=' and quotes
        if (state_text) {
            return new LToken(readWhile(isNotLeftAngle), 'text');
        }

        //:: Symbol '>'
        if (c === '>') {
            advanceChar();
            state_text = true;
            return new LToken('>', '>');
        }

        //:: Symbol '/>'
        if (c === '/' && html.charAt(position + 1) === '>') {
            advanceChar();
            advanceChar();
            state_text = true;
            return new LToken('/>', '/>');
        }

        //:: Symbol '='
        if (c === '=') {
            advanceChar();
            return new LToken('=', '=');
        }

        //:: Symbol '"'
        if (c === '"') {
            advanceChar();
            state_quoted_string = true;   // toggle string reading on
            return new LToken('"', '"');
        }

        //:: Identifier (tag or attribute name)
        //   c is none of the stop characters here, so at least one character is
        //   consumed and the lexer always makes progress
        return new LToken(readWhile(isIdentifierChar), 'identifier');
    }

    // moves the parser's lookahead to the next token
    function nextToken() {
        current_token = getToken();
    }


    //|| Syntactical Analyzer ||//
    //   (in Backus Naur form)

    //|| attribute
    //   identifier [ "=" """ [ quoted_string ] """ ]
    function attribute(element) {
        var attribute_name;

        //:: identifier
        if (current_token.symbol !== 'identifier') {
            throwSyntaxError(0);
        }
        attribute_name = current_token.literal;
        nextToken();

        //:: "="
        if (current_token.symbol === '=') {
            nextToken();
        } else if (current_token.symbol === 'identifier'
                || current_token.symbol === '>'
                || current_token.symbol === '/>') {
            // attribute with no value; the value `true` is what this converter
            // has always stored for it
            element.setAttribute(attribute_name, true);
            return;
        } else {
            throwSyntaxError(5);
        }

        //:: """
        if (current_token.symbol !== '"') {
            throwSyntaxError(6);
        }
        nextToken();

        //:: quoted_string
        if (current_token.symbol === 'quoted_string') {
            element.setAttribute(attribute_name, current_token.literal);
            nextToken();
        } else if (current_token.symbol === '"') {
            // empty string: the lookahead already is the closing quote
            element.setAttribute(attribute_name, '');
        } else {
            throwSyntaxError(7);
        }

        //:: """
        if (current_token.symbol !== '"') {
            throwSyntaxError(6);
        }
        nextToken();
    }

    //|| tagBegin
    //   "<" identifier { <attributes> } (">" | "/>")
    //   returns the new element and whether the tag closed itself with "/>"
    function tagBegin(parent_node) {
        var element;
        var self_closing;

        //:: "<"
        if (current_token.symbol !== '<') {
            throwSyntaxError(1);
        }
        nextToken();

        //:: identifier
        if (current_token.symbol !== 'identifier') {
            throwSyntaxError(0);
        }
        element = owner_document.createElement(current_token.literal);
        parent_node.appendChild(element);
        nextToken();

        //:: { <attribute> }
        //   attribute() throws on anything that is not an identifier, so this
        //   loop can only be left on '>' or '/>'
        while (current_token.symbol !== '>' && current_token.symbol !== '/>') {
            attribute(element);
        }

        //:: ">" | "/>"
        self_closing = (current_token.symbol === '/>');
        nextToken();

        return { element: element, self_closing: self_closing };
    }

    //|| tagEnd
    //   "</" identifier ">"
    function tagEnd(element) {
        var begin_name;
        var end_name;

        //:: "</"
        if (current_token.symbol !== '</') {
            throwSyntaxError(2);
        }
        nextToken();

        //:: identifier
        if (current_token.symbol !== 'identifier') {
            throwSyntaxError(0);
        }
        // compare begin tag's identifier with end tag's identifier
        begin_name = element.tagName.toLowerCase();
        end_name = current_token.literal.toLowerCase();
        if (begin_name !== end_name) {
            throwSemanticsError(begin_name, end_name);
        }
        nextToken();

        //:: ">"
        if (current_token.symbol !== '>') {
            throwSyntaxError(3);
        }
        nextToken();
    }

    //|| tagSet
    //   <tagBegin> [ <tagContents> <tagEnd> ]
    function tagSet(parent_node) {
        var begun = tagBegin(parent_node);

        if (!begun.self_closing) {
            tagContents(begun.element);
            tagEnd(begun.element);
        }
    }

    //|| tagContents
    //   { <tagSet> | text }
    function tagContents(parent_node) {
        while (current_token.symbol !== '</' && current_token.symbol !== 'eos') {
            if (current_token.symbol === '<') {
                tagSet(parent_node);
            } else if (current_token.symbol === 'text') {
                parent_node.appendChild(owner_document.createTextNode(current_token.literal));
                nextToken();
            } else {
                throwSyntaxError(8);
            }
        }
    }

    //|| HTML
    //   html ::= { <tagContents> }
    function parseHtml() {
        var fragment = owner_document.createDocumentFragment();

        nextToken();
        while (current_token.symbol !== 'eos') {
            tagContents(fragment);

            // an end tag at the top level has no begin tag to match (error)
            if (current_token.symbol === '</') {
                throwSyntaxError(1);
            }
        }

        return fragment;
    }

    //:: Return root
    return parseHtml();
}



// exceptions

// name: LexicalAnalyzerBugException
// desc: thrown when the lexical analyzer reaches a state that it should never be
//       able to reach.  Inherits from Error so that `instanceof Error` holds,
//       the exception prints as "name: message", and a stack trace is recorded.
function LexicalAnalyzerBugException() {
    this.name = 'LexicalAnalyzerBugException';
    this.message = 'You should never see this error.  If you do, it means that there is a problem '
                 + 'with the lexical analyzer in DomConverter.js.  Please report any bugs (or any '
                 + 'bug fixes) to opensourcejunkie@gmail.com';

    // record where the exception was created; captureStackTrace is not available
    // in every engine, so fall back to borrowing the stack of a plain Error
    if (typeof Error.captureStackTrace === 'function') {
        Error.captureStackTrace(this, LexicalAnalyzerBugException);
    } else {
        this.stack = (new Error(this.message)).stack;
    }
}
LexicalAnalyzerBugException.prototype = Object.create(Error.prototype);
LexicalAnalyzerBugException.prototype.constructor = LexicalAnalyzerBugException;
// name: BadHTMLSyntaxException
// desc: thrown when the html given to convertHtmlToDom does not follow the
//       expected syntax.  Inherits from Error so that `instanceof Error` holds
//       and a stack trace is recorded.
// parm: line  - line number the converter had reached
//       char  - character number within that line
//       error - short description of what was expected
function BadHTMLSyntaxException(line, char, error) {
    this.name = 'BadHTMLSyntaxException';
    this.message = 'The HTML provided to convertHtmlToDom is malformatted.  If '
                 + 'it is not malformatted, there may be a bug in the function.  Please report any '
                 + 'bugs (or any bug fixes) to opensourcejunkie@gmail.com\n\n'
                 + '(line ' + line + ', char ' + char + '): ' + error;

    this.line = line;
    this.char = char;
    this.error = error;

    // record where the exception was created; captureStackTrace is not available
    // in every engine, so fall back to borrowing the stack of a plain Error
    if (typeof Error.captureStackTrace === 'function') {
        Error.captureStackTrace(this, BadHTMLSyntaxException);
    } else {
        this.stack = (new Error(this.message)).stack;
    }
}
BadHTMLSyntaxException.prototype = Object.create(Error.prototype);
BadHTMLSyntaxException.prototype.constructor = BadHTMLSyntaxException;
// name: BadHTMLSemanticsException
// desc: thrown when the html given to convertHtmlToDom is syntactically valid
//       but inconsistent (an end tag that does not match its begin tag).
//       Inherits from Error so that `instanceof Error` holds and a stack trace
//       is recorded.
// parm: line  - line number the converter had reached
//       char  - character number within that line
//       error - short description of the inconsistency
function BadHTMLSemanticsException(line, char, error) {
    this.name = 'BadHTMLSemanticsException';
    this.message = 'The HTML provided to convertHtmlToDom is malformatted.  If '
                 + 'it is not malformatted, there may be a bug in the function.  Please report any '
                 + 'bugs (or any bug fixes) to opensourcejunkie@gmail.com\n\n'
                 + '(line ' + line + ', char ' + char + '): ' + error;

    this.line = line;
    this.char = char;
    this.error = error;

    // record where the exception was created; captureStackTrace is not available
    // in every engine, so fall back to borrowing the stack of a plain Error
    if (typeof Error.captureStackTrace === 'function') {
        Error.captureStackTrace(this, BadHTMLSemanticsException);
    } else {
        this.stack = (new Error(this.message)).stack;
    }
}
BadHTMLSemanticsException.prototype = Object.create(Error.prototype);
BadHTMLSemanticsException.prototype.constructor = BadHTMLSemanticsException;
