erythros/System/Libraries/Html/Tokenizer.HC

// Size in bytes of a freshly created growable string (see @init_growable_string).
// @append_char_to_growable_string rounds lengths to multiples of this value minus one
// when deciding whether the string has to be reallocated.
#define GROWABLE_STRING_INCREMENT_SIZE 16

// Tokenizer state identifiers, stored in @html_tokenizer.state and
// @html_tokenizer.returnState. The names appear to follow the tokenization
// states of the WHATWG HTML parsing specification (the handlers below quote
// its wording); only a subset has a handler in this part of the file.
#define HTML_STATE_INVALID 0
#define HTML_STATE_DATA 1
#define HTML_STATE_RCDATA 2
#define HTML_STATE_RAWTEXT 3
#define HTML_STATE_SCRIPT_DATA 4
#define HTML_STATE_PLAINTEXT 5
#define HTML_STATE_TAG_OPEN 6
#define HTML_STATE_END_TAG_OPEN 7
#define HTML_STATE_TAG_NAME 8
#define HTML_STATE_RCDATA_LESS_THAN_SIGN 9
#define HTML_STATE_RCDATA_END_TAG_OPEN 10
#define HTML_STATE_RCDATA_END_TAG_NAME 11
#define HTML_STATE_RAWTEXT_LESS_THAN_SIGN 12
#define HTML_STATE_RAWTEXT_END_TAG_OPEN 13
#define HTML_STATE_RAWTEXT_END_TAG_NAME 14
#define HTML_STATE_SCRIPT_DATA_LESS_THAN_SIGN 15
#define HTML_STATE_SCRIPT_DATA_END_TAG_OPEN 16
#define HTML_STATE_SCRIPT_DATA_END_TAG_NAME 17
#define HTML_STATE_SCRIPT_DATA_ESCAPE_START 18
#define HTML_STATE_SCRIPT_DATA_ESCAPE_START_DASH 19
#define HTML_STATE_SCRIPT_DATA_ESCAPED 20
#define HTML_STATE_SCRIPT_DATA_ESCAPED_DASH 21
#define HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH 22
#define HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN 23
#define HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN 24
#define HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME 25
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START 26
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED 27
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH 28
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH 29
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN 30
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END 31
#define HTML_STATE_BEFORE_ATTRIBUTE_NAME 32
#define HTML_STATE_ATTRIBUTE_NAME 33
#define HTML_STATE_AFTER_ATTRIBUTE_NAME 34
#define HTML_STATE_BEFORE_ATTRIBUTE_VALUE 35
#define HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED 36
#define HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED 37
#define HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED 38
#define HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED 39
#define HTML_STATE_SELF_CLOSING_START_TAG 40
#define HTML_STATE_BOGUS_COMMENT 41
#define HTML_STATE_MARKUP_DECLARATION_OPEN 42
#define HTML_STATE_COMMENT_START 43
#define HTML_STATE_COMMENT_START_DASH 44
#define HTML_STATE_COMMENT 45
#define HTML_STATE_COMMENT_LESS_THAN_SIGN 46
#define HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG 47
#define HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH 48
#define HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH 49
#define HTML_STATE_COMMENT_END_DASH 50
#define HTML_STATE_COMMENT_END 51
#define HTML_STATE_COMMENT_END_BANG 52
#define HTML_STATE_DOCTYPE 53
#define HTML_STATE_BEFORE_DOCTYPE_NAME 54
#define HTML_STATE_DOCTYPE_NAME 55
#define HTML_STATE_AFTER_DOCTYPE_NAME 56
#define HTML_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD 57
#define HTML_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER 58
#define HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED 59
#define HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED 60
#define HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER 61
#define HTML_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS 62
#define HTML_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD 63
#define HTML_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER 64
#define HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED 65
#define HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED 66
#define HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER 67
#define HTML_STATE_BOGUS_DOCTYPE 68
#define HTML_STATE_CDATA_SECTION 69
#define HTML_STATE_CDATA_SECTION_BRACKET 70
#define HTML_STATE_CDATA_SECTION_END 71
#define HTML_STATE_CHARACTER_REFERENCE 72
#define HTML_STATE_NAMED_CHARACTER_REFERENCE 73
#define HTML_STATE_AMBIGUOUS_AMPERSAND 74
#define HTML_STATE_NUMERIC_CHARACTER_REFERENCE 75
#define HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE_START 76
#define HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START 77
#define HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE 78
#define HTML_STATE_DECIMAL_CHARACTER_REFERENCE 79
#define HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END 80

// One node of the document tree built by the tokenizer. It derives from
// JsonElement; @create_new_node stamps the inherited sig/type fields with
// JSON_SIG and JSON_HTML.
class @html_dom_node : JsonElement
{
    // Node this one was appended under; assigned by @emit_current_node.
    @html_dom_node* parentNode;
    // NUL-terminated tag name. End-tag tokens are created with a leading '/',
    // text nodes are named "InternalTextNode", the root is named "Document".
    U8 tagName[32];
    // Attribute name/value pairs (created with Json.CreateObject).
    JsonObject* attributes;
    // Child nodes in the order they were emitted (created with Json.CreateArray).
    JsonArray* children;
    // Growable string; @emit_current_character appends character data to it.
    U8* text;
    // Presentation fields. Only backgroundColor and color are written in this
    // part of the file (for "body", by the tag-name state); the others are
    // presumably filled in by later layout/styling code - TODO confirm.
    I64 textAlign;
    I64 width;
    I64 height;
    U32 backgroundColor;
    U32 color;
    I64 font_size;
    Bool display_block;
};

// A byte buffer with a read cursor; used both for the document being parsed
// and for the tokenizer's temporary buffer.
class @html_input_buffer
{
    // Start of the buffered bytes.
    U8* data;
    // Number of bytes considered valid in data.
    I64 size;
    // Index of the next byte to be consumed.
    I64 pos;
};

// Complete state of one tokenizer run: input cursor, state-machine position
// and the partially built document tree.
class @html_tokenizer
{
    // The document being tokenized.
    @html_input_buffer inputBuffer;
    // Current HTML_STATE_* value.
    I64 state;
    // HTML_STATE_* value to resume after a character reference (set by the data state).
    I64 returnState;
    // Character most recently read by @consume_next_input_char.
    U8 currentInputChar;
    // Attribute currently being assembled (name and value are growable strings).
    JsonKey* currentAttribute;
    // Node that newly emitted nodes are appended to.
    @html_dom_node* appendNode;
    // Node (tag or text) currently being assembled.
    @html_dom_node* currentNode;
    // Root "Document" node created by @init_tokenizer.
    @html_dom_node* originNode;
    // Not referenced in this part of the file.
    I64 nodeTreeDepth;
    // Number of characters emitted into the current text node; 0 means the
    // next emitted character starts a new text node.
    I64 dataStateCounter;
    // 512-byte scratch buffer used for character references.
    @html_input_buffer tempBuffer;
    // When TRUE, @consume_next_input_char reads from tempBuffer until it is exhausted.
    Bool consumeTempBuffer;
    // Incremented by the tag-name state each time a tag name becomes "img".
    I64 numOfImgNodes;
    // Task whose heap receives every allocation made by the tokenizer.
    CTask* mem_task;
};

// Rounds numToRound to a multiple of `multiple`.
//   - non-negative values are rounded up to the next multiple;
//   - negative values are rounded toward zero;
//   - values that already are a multiple, or any value when multiple is 0,
//     are returned unchanged.
I64 @round_value_up(I64 numToRound, I64 multiple)
{
    I64 leftover;

    // Avoid the modulo below when there is nothing to round to.
    if (!multiple)
        return numToRound;

    leftover = Abs(numToRound) % multiple;
    if (!leftover)
        return numToRound;

    if (numToRound >= 0)
        return numToRound + multiple - leftover;

    return -(Abs(numToRound) - leftover);
}

// Creates an empty growable string: a zero-filled allocation of
// GROWABLE_STRING_INCREMENT_SIZE bytes obtained via CAlloc with mem_task.
// Returns the new buffer.
U8* @init_growable_string(CTask* mem_task)
{
    U8* emptyString = CAlloc(GROWABLE_STRING_INCREMENT_SIZE, mem_task);
    return emptyString;
}

// Appends `char` to the growable string `s` and returns the string to use from
// now on. When the length crosses a rounding boundary, a larger zero-filled
// buffer is allocated with mem_task, the contents are copied over and `s` is
// freed; otherwise the character is written in place and `s` itself is
// returned. Callers must always keep the returned pointer.
U8* @append_char_to_growable_string(U8* s, I64 char, CTask* mem_task)
{
    I64 step = GROWABLE_STRING_INCREMENT_SIZE - 1;
    I64 len = StrLen(s);
    I64 roundedBefore = @round_value_up(len, step);
    I64 roundedAfter = @round_value_up(len + 1, step);
    U8* grown;

    // Same rounding bucket: the existing buffer still has room.
    if (roundedAfter <= roundedBefore) {
        s[len] = char;
        return s;
    }

    // Crossing into the next bucket: move to a buffer twice the rounded size.
    // CAlloc zero-fills, so the copy stays NUL-terminated after the append.
    grown = CAlloc(roundedAfter * 2, mem_task);
    StrCpy(grown, s);
    grown[len] = char;
    Free(s);
    return grown;
}

// Resets the tokenizer's temporary buffer to the empty string: rewinds the
// cursor, sets the size to zero and clears all 512 bytes (the size allocated
// for it in @init_tokenizer).
U0 @empty_temp_buffer(@html_tokenizer* t)
{
    t->tempBuffer.pos = 0;
    t->tempBuffer.size = 0;
    MemSet(t->tempBuffer.data, NULL, 512);
}

// Re-synchronises the temporary buffer's bookkeeping after its contents were
// rewritten: the cursor goes back to the start and the size becomes the length
// of the NUL-terminated string now stored in it.
U0 @recalculate_temp_buffer_size(@html_tokenizer* t)
{
    @html_input_buffer* scratch = &t->tempBuffer;

    scratch->pos = 0;
    scratch->size = StrLen(scratch->data);
}

// Helper: when the temporary buffer holds `entity` (compared case-insensitively),
// overwrites it with `replacement`, refreshes the buffer size and returns TRUE.
// Returns FALSE and leaves the buffer untouched otherwise.
Bool @swap_temp_buffer_if_named_reference(@html_tokenizer* t, U8* entity, U8* replacement)
{
    if (StrICmp(t->tempBuffer.data, entity))
        return FALSE;
    StrCpy(t->tempBuffer.data, replacement);
    @recalculate_temp_buffer_size(t);
    return TRUE;
}

// Replaces a named character reference held in the temporary buffer (for
// example "&copy;") with the bytes it stands for and updates the buffer size.
// References that are not in the table below are replaced with "?".
//
// "&amp;" and "&lt;" are stored as the control bytes 0x11 and 0x12 rather than
// as literal '&' / '<'. NOTE(review): presumably placeholders so that the
// re-consumed buffer is not tokenized again as markup - confirm against the
// code that renders text nodes.
U0 @replace_temp_buffer_with_named_character_reference(@html_tokenizer* t)
{
    // Entity names are case-sensitive in HTML: "&Aring;" is U+00C5 while
    // "&aring;" is U+00E5. Match the capitalised form exactly before falling
    // back to the case-insensitive table, so the lower-case entity no longer
    // renders as the upper-case letter.
    if (!StrCmp(t->tempBuffer.data, "&Aring;")) {
        StrCpy(t->tempBuffer.data, "\xc3\x85");
        @recalculate_temp_buffer_size(t);
        return;
    }

    if (@swap_temp_buffer_if_named_reference(t, "&amp;", "\x11"))
        return;
    if (@swap_temp_buffer_if_named_reference(t, "&aring;", "\xc3\xa5"))
        return;
    if (@swap_temp_buffer_if_named_reference(t, "&bull;", "\xe2\x80\xa2"))
        return;
    if (@swap_temp_buffer_if_named_reference(t, "&copy;", "\xc2\xa9"))
        return;
    if (@swap_temp_buffer_if_named_reference(t, "&emsp;", " "))
        return;
    if (@swap_temp_buffer_if_named_reference(t, "&hellip;", "..."))
        return;
    if (@swap_temp_buffer_if_named_reference(t, "&mdash;", "-"))
        return;
    if (@swap_temp_buffer_if_named_reference(t, "&nbsp;", " "))
        return;
    if (@swap_temp_buffer_if_named_reference(t, "&lt;", "\x12"))
        return;
    if (@swap_temp_buffer_if_named_reference(t, "&gt;", ">"))
        return;
    if (@swap_temp_buffer_if_named_reference(t, "&quot;", "\""))
        return;
    if (@swap_temp_buffer_if_named_reference(t, "&zerowidthspace;", ""))
        return;

    // Unknown reference: show a visible stand-in.
    StrCpy(t->tempBuffer.data, "?");
    @recalculate_temp_buffer_size(t);
}

// Lookup table mapping a character code to the value of the hexadecimal digit
// it represents. Built once when the file is loaded; every entry that is not
// 0-9, A-F or a-f stays 0.
I64 @hex_table_i;
I64 @hex_table[256];
MemSet(&@hex_table, NULL, sizeof(I64) * 256);

// Decimal digits '0'..'9' -> 0..9
for (@hex_table_i = '0'; @hex_table_i <= '9'; @hex_table_i++) {
    @hex_table[@hex_table_i] = @hex_table_i - '0';
}

// Upper-case digits 'A'..'F' -> 10..15
for (@hex_table_i = 'A'; @hex_table_i <= 'F'; @hex_table_i++) {
    @hex_table[@hex_table_i] = @hex_table_i - 'A' + 10;
}

// Lower-case digits 'a'..'f' -> 10..15
for (@hex_table_i = 'a'; @hex_table_i <= 'f'; @hex_table_i++) {
    @hex_table[@hex_table_i] = @hex_table_i - 'a' + 10;
}

// Encodes the code point `utf` as a NUL-terminated UTF-8 sequence in `out`.
//
// out: destination buffer; must have room for at least 5 bytes.
// utf: Unicode code point to encode.
//
// Returns the number of bytes written before the terminator (1 to 4). For
// values that are not encodable - negative numbers, surrogates
// (0xD800..0xDFFF) and anything above 0x10FFFF - the replacement character
// U+FFFD (EF BF BD) is written instead and 0 is returned.
I64 @utf8_encode(U8* out, I64 utf)
{
    // Reject everything that has no valid UTF-8 form first. Negative values
    // would otherwise pass the "<= 0x7F" test and be written as a truncated
    // byte, and surrogates would be emitted as an invalid 3-byte sequence.
    if (utf < 0 || utf > 0x10FFFF || (utf >= 0xD800 && utf <= 0xDFFF)) {
        out[0] = 0xEF;
        out[1] = 0xBF;
        out[2] = 0xBD;
        out[3] = 0;
        return 0;
    }

    if (utf <= 0x7F) {
        // Plain ASCII
        out[0] = utf;
        out[1] = 0;
        return 1;
    }

    if (utf <= 0x07FF) {
        // 2-byte sequence: 110xxxxx 10xxxxxx
        out[0] = (((utf >> 6) & 0x1F) | 0xC0);
        out[1] = (((utf >> 0) & 0x3F) | 0x80);
        out[2] = 0;
        return 2;
    }

    if (utf <= 0xFFFF) {
        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
        out[0] = (((utf >> 12) & 0x0F) | 0xE0);
        out[1] = (((utf >> 6) & 0x3F) | 0x80);
        out[2] = (((utf >> 0) & 0x3F) | 0x80);
        out[3] = 0;
        return 3;
    }

    // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    out[0] = (((utf >> 18) & 0x07) | 0xF0);
    out[1] = (((utf >> 12) & 0x3F) | 0x80);
    out[2] = (((utf >> 6) & 0x3F) | 0x80);
    out[3] = (((utf >> 0) & 0x3F) | 0x80);
    out[4] = 0;
    return 4;
}

// Replaces a decimal character reference held in the temporary buffer
// (expected form "&#<digits>;") with the UTF-8 encoding of that code point and
// updates the buffer size.
//
// A buffer shorter than the "&#" prefix is treated as code point 0, which
// leaves the buffer empty - the same result a reference without digits gives.
U0 @replace_temp_buffer_with_dec_character_reference(@html_tokenizer* t)
{
    I64 len = StrLen(t->tempBuffer.data);
    I64 charCode = 0;

    // Only touch the buffer when it is long enough: chopping the last byte of
    // an empty buffer would write in front of it, and skipping the two-byte
    // prefix of a shorter string would read past its terminator.
    if (len >= 2) {
        t->tempBuffer.data[len - 1] = NULL; // chop off semicolon
        charCode = Str2I64(t->tempBuffer.data + 2); // skip "&#"
    }

    @utf8_encode(t->tempBuffer.data, charCode);
    @recalculate_temp_buffer_size(t);
}

// Replaces a hexadecimal character reference held in the temporary buffer
// (expected form "&#x<hex digits>;") with the UTF-8 encoding of that code
// point and updates the buffer size.
//
// Digits are converted through @hex_table, so a character that is not a hex
// digit contributes 0. Accumulation stops as soon as the value exceeds
// 0x10FFFF; such a value is not a code point and @utf8_encode substitutes the
// replacement character. Without that stop a long digit string could shift
// bits out of the accumulator and wrap around to an unrelated valid code point.
// A buffer too short to hold "&#x" plus a terminator character is treated as
// code point 0, which leaves the buffer empty.
U0 @replace_temp_buffer_with_hex_character_reference(@html_tokenizer* t)
{
    I64 codePoint = 0;
    I64 len = StrLen(t->tempBuffer.data);
    U8* ch;

    if (len >= 4) {
        t->tempBuffer.data[len - 1] = NULL; // chop off semicolon

        ch = t->tempBuffer.data + 3; // skip "&#x"
        while (*ch && codePoint <= 0x10FFFF) {
            codePoint = (codePoint << 4) | @hex_table[*ch++];
        }
    }

    // Encode directly; going through a decimal "&#...;" string first yields
    // the same code point for every valid reference.
    @utf8_encode(t->tempBuffer.data, codePoint);
    @recalculate_temp_buffer_size(t);
}

// Replaces a numeric character reference held in the temporary buffer with the
// character it denotes. The third byte of the buffer (the one after "&#")
// selects the radix: 'x' or 'X' means hexadecimal, anything else decimal.
U0 @replace_temp_buffer_with_numeric_character_reference(@html_tokenizer* t)
{
    switch (t->tempBuffer.data[2]) {
    case 'x':
    case 'X':
        // HTML accepts both "&#x41;" and "&#X41;"; the upper-case marker used
        // to fall through to the decimal parser.
        @replace_temp_buffer_with_hex_character_reference(t);
        break;
    default:
        @replace_temp_buffer_with_dec_character_reference(t);
        break;
    }
}

// Appends `char` to the string held in the temporary buffer and bumps the
// recorded size.
//
// The buffer is the fixed 512-byte allocation made in @init_tokenizer, so at
// most 511 characters plus the terminator fit. Characters beyond that are
// dropped: the buffer is filled from document input and must not be overrun
// by an overlong reference-like sequence.
U0 @append_char_to_temp_buffer(@html_tokenizer* t, I64 char)
{
    I64 len = StrLen(t->tempBuffer.data);

    // Need room for the new character and the terminator after it.
    if (len >= 511)
        return;

    t->tempBuffer.data[len] = char;
    // Terminate explicitly: earlier replacements may have left stale bytes
    // behind the old terminator.
    t->tempBuffer.data[len + 1] = NULL;
    t->tempBuffer.size++;
}

// Allocates a zero-filled DOM node on mem_task's behalf and prepares it for use:
// the tag name is copied in, an empty attribute object, an empty child array
// and an empty growable text string are attached, and the node is stamped as a
// JSON_HTML element.
//
// tagName: NUL-terminated name; it is copied into the node's 32-byte tagName
//          field without a length check, so it has to fit.
// Returns the new node.
@html_dom_node* @create_new_node(U8* tagName, CTask* mem_task)
{
    @html_dom_node* created = CAlloc(sizeof(@html_dom_node), mem_task);

    // Identify the element to the JSON layer.
    created->sig = JSON_SIG;
    created->type = JSON_HTML;

    StrCpy(created->tagName, tagName);

    // Containers and text storage start out empty.
    created->attributes = Json.CreateObject(mem_task);
    created->children = Json.CreateArray(mem_task);
    created->text = @init_growable_string(mem_task);

    return created;
}

// Prepares tokenizer `t` to parse `size` bytes of HTML starting at `data`.
//
// t:        tokenizer to initialise; every field is given a defined value.
// data:     document bytes; only referenced, not copied.
// size:     number of bytes in data.
// mem_task: task passed to every allocation made on behalf of this tokenizer.
//
// After the call the state machine is in the data state, the temporary buffer
// is an empty 512-byte scratch area and the tree consists of a single
// "Document" root that is both the append target and the current node.
U0 @init_tokenizer(@html_tokenizer* t, U8* data, I64 size, CTask* mem_task)
{
    t->mem_task = mem_task;

    t->inputBuffer.data = data;
    t->inputBuffer.size = size;
    t->inputBuffer.pos = 0;

    t->state = HTML_STATE_DATA;
    // Fields below are given explicit values so a tokenizer that was not
    // zero-allocated does not start out with garbage in them.
    t->returnState = HTML_STATE_DATA;
    t->currentInputChar = 0;
    t->currentAttribute = NULL;
    t->nodeTreeDepth = 0;

    t->tempBuffer.data = CAlloc(512, t->mem_task);
    // The scratch buffer starts empty; its size is unrelated to the input size.
    t->tempBuffer.size = 0;
    t->tempBuffer.pos = 0;
    t->consumeTempBuffer = FALSE;

    t->originNode = @create_new_node("Document", t->mem_task);
    t->appendNode = t->originNode;
    t->currentNode = t->originNode;

    t->dataStateCounter = 0;
    t->numOfImgNodes = 0;
}

// Reads the next character into t->currentInputChar and advances the cursor
// it was read from. While consumeTempBuffer is set, characters come from the
// temporary buffer; once that buffer is exhausted the flag is cleared and
// reading continues from the input buffer. No end-of-input check is made here.
U0 @consume_next_input_char(@html_tokenizer* t)
{
    if (t->consumeTempBuffer) {
        if (t->tempBuffer.pos >= t->tempBuffer.size) {
            // Scratch data used up: fall through to the real input.
            t->consumeTempBuffer = FALSE;
        } else {
            t->currentInputChar = t->tempBuffer.data[t->tempBuffer.pos];
            t->tempBuffer.pos++;
            return;
        }
    }

    t->currentInputChar = t->inputBuffer.data[t->inputBuffer.pos];
    t->inputBuffer.pos++;
}

// Appends t->currentInputChar to the text of the current node. When no
// character has been emitted since the counter was last reset
// (dataStateCounter is 0), a new "InternalTextNode" is created first and
// becomes the current node. The counter is incremented on every call.
U0 @emit_current_character(@html_tokenizer* t)
{
    U8* grownText;

    if (t->dataStateCounter == 0)
        t->currentNode = @create_new_node("InternalTextNode", t->mem_task);

    // The append may move the string, so always store the returned pointer.
    grownText = @append_char_to_growable_string(t->currentNode->text,
        t->currentInputChar, t->mem_task);
    t->currentNode->text = grownText;

    t->dataStateCounter += 1;
}

// Returns TRUE when `node` can never have children: the tokenizer's own text
// nodes ("InternalTextNode") and the listed HTML elements that take no end
// tag. The tag name is compared case-insensitively. Returns FALSE otherwise.
Bool @node_is_self_closing(@html_dom_node* node)
{
    U8* name = node->tagName;

    // Internal text nodes
    if (!StrICmp(name, "InternalTextNode"))
        return TRUE;

    // Elements without an end tag, in alphabetical order
    if (!StrICmp(name, "area") || !StrICmp(name, "base") || !StrICmp(name, "br") || !StrICmp(name, "col"))
        return TRUE;
    if (!StrICmp(name, "embed") || !StrICmp(name, "hr") || !StrICmp(name, "img") || !StrICmp(name, "input"))
        return TRUE;
    if (!StrICmp(name, "link") || !StrICmp(name, "meta") || !StrICmp(name, "param") || !StrICmp(name, "source"))
        return TRUE;
    if (!StrICmp(name, "track") || !StrICmp(name, "wbr"))
        return TRUE;

    return FALSE;
}

// Emits t->currentNode into the document tree.
//
// End tags (tag name starting with '/'): the open element with the same name
// is searched for, starting at the append node and walking up the parent
// chain. When found, it is closed by making its parent the new append node.
// When the search reaches the root without a match the end tag is ignored and
// the append node is left unchanged.
//
// Everything else: the node is appended to the append node's children and,
// unless it is self-closing, becomes the new append node.
U0 @emit_current_node(@html_tokenizer* t)
{
    U8* endTagName;
    @html_dom_node* candidate;

    if (t->currentNode->tagName[0] == '/') {
        endTagName = t->currentNode->tagName + 1;
        candidate = t->appendNode;

        // The root is recognised by identity, not by its "Document" name: a
        // stray "</document>" in the input would otherwise match the root and
        // move the append node to the root's (NULL) parent.
        while (candidate && candidate != t->originNode && StrICmp(candidate->tagName, endTagName))
            candidate = candidate->parentNode;

        if (!candidate || candidate == t->originNode || !candidate->parentNode)
            return; // no matching open element: invalid end tag, ignore it

        t->appendNode = candidate->parentNode;
        return;
    }

    t->currentNode->parentNode = t->appendNode;
    t->appendNode->children->append(t->currentNode);
    if (!@node_is_self_closing(t->currentNode))
        t->appendNode = t->currentNode;
}

// Stores the attribute currently being assembled (name and value) as a
// JSON_STRING entry in the current node's attribute object.
//
// Does nothing when no attribute has been started yet; the after-attribute-name
// state calls this on '>' even for tags that carry no attribute at all.
U0 @set_current_attribute_on_current_node(@html_tokenizer* t)
{
    if (!t->currentAttribute)
        return;

    t->currentNode->attributes->set(t->currentAttribute->name,
        t->currentAttribute->value, JSON_STRING);
}

// Called right after a '<' was consumed. When the input continues with
// "script" (any case), advances the input cursor past the matching
// "</script>" and returns TRUE, so the caller never sees the script content.
// Returns FALSE, leaving the cursor untouched, when the tag is not a script
// tag.
//
// All reads are bounded by inputBuffer.size. A script element that is never
// closed consumes the rest of the input instead of scanning past its end.
//
// FIXME: This will work in most cases, except for when </script> tags are
// escaped in SCRIPT data.
Bool @skip_script_data(@html_tokenizer* t)
{
    U8 cmpbuf[16];
    I64 size = t->inputBuffer.size;

    // Not enough input left to hold "script".
    if (t->inputBuffer.pos + 6 > size)
        return FALSE;

    MemSet(cmpbuf, NULL, 16);
    MemCpy(cmpbuf, t->inputBuffer.data + t->inputBuffer.pos, 6);
    if (StrICmp(cmpbuf, "script"))
        return FALSE;

    t->inputBuffer.pos += 6;

    // Slide a 9-byte window over the input until it reads "</script>".
    while (t->inputBuffer.pos + 9 <= size) {
        MemSet(cmpbuf, NULL, 16);
        MemCpy(cmpbuf, t->inputBuffer.data + t->inputBuffer.pos, 9);
        if (!StrICmp(cmpbuf, "</script>")) {
            t->inputBuffer.pos += 9; // continue right after the end tag
            return TRUE;
        }
        ++t->inputBuffer.pos;
    }

    // Unterminated script: everything up to the end of input belongs to it.
    t->inputBuffer.pos = size;
    return TRUE;
}

// Data state: handles one character of ordinary document content.
//   '&'  - remember the data state as return state and enter the character
//          reference state.
//   '<'  - skip over a script element if one starts here; otherwise emit the
//          text node collected so far (if any), reset the text counter and
//          enter the tag open state.
//   else - emit the character as text.
U0 @tokenizer_html_state_data(@html_tokenizer* t)
{
    @consume_next_input_char(t);

    if (t->currentInputChar == '&') {
        t->returnState = HTML_STATE_DATA;
        t->state = HTML_STATE_CHARACTER_REFERENCE;
        return;
    }

    if (t->currentInputChar == '<') {
        // A skipped script leaves state and pending text untouched.
        if (@skip_script_data(t))
            return;
        if (t->dataStateCounter)
            @emit_current_node(t);
        t->dataStateCounter = 0;
        t->state = HTML_STATE_TAG_OPEN;
        return;
    }

    @emit_current_character(t);
}

// Tag open state: decides what a '<' introduces from the character after it.
//   '!'     - markup declaration (comment, DOCTYPE, ...).
//   '/'     - end tag.
//   letter  - start tag: create an empty-named node and reconsume the letter
//             in the tag name state.
//   '?'     - parse error; reconsume in the bogus comment state.
//   else    - parse error; the '<' was plain text after all: emit it and
//             reconsume the character in the data state.
U0 @tokenizer_html_state_tag_open(@html_tokenizer* t)
{
    @consume_next_input_char(t);
    switch (t->currentInputChar) {
    case '!':
        // Switch to the markup declaration open state.
        t->state = HTML_STATE_MARKUP_DECLARATION_OPEN;
        break;
    case '/':
        // Switch to the end tag open state.
        t->state = HTML_STATE_END_TAG_OPEN;
        break;
    case 'A' ... 'Z':
    case 'a' ... 'z':
        // Create a new start tag token, set its tag name to the empty string.
        // Reconsume in the tag name state.
        t->currentNode = @create_new_node("", t->mem_task);
        t->inputBuffer.pos--;
        t->state = HTML_STATE_TAG_NAME;
        break;
    case '?':
        // This is an unexpected-question-mark-instead-of-tag-name parse error.
        // Reconsume in the bogus comment state.
        t->inputBuffer.pos--;
        t->state = HTML_STATE_BOGUS_COMMENT;
        break;
    default:
        // This is an invalid-first-character-of-tag-name parse error. Emit a
        // U+003C LESS-THAN SIGN character token. Reconsume in the data state.
        // The character just read is emitted by the data state when it is
        // reconsumed, so the one to emit here is the '<' itself; emitting the
        // current character would output it twice and lose the '<'.
        t->currentInputChar = '<';
        @emit_current_character(t);
        t->inputBuffer.pos--;
        t->state = HTML_STATE_DATA;
        break;
    }
}

// Markup declaration open state: looks at the input following "<!" without
// going through the normal character reader.
//   "--"       - consume both dashes and enter the comment state.
//   "DOCTYPE"  - (any case) consume the keyword and enter the DOCTYPE state.
//   else       - enter the bogus comment state without consuming anything.
U0 @tokenizer_html_state_markup_declaration_open(@html_tokenizer* t)
{
    U8* upcoming = t->inputBuffer.data + t->inputBuffer.pos;
    U8 keyword[8];

    if (upcoming[0] == '-' && upcoming[1] == '-') {
        t->inputBuffer.pos += 2;
        t->state = HTML_STATE_COMMENT;
        return;
    }

    // Copy the next seven bytes into a terminated scratch string for comparison.
    MemCpy(keyword, upcoming, 7);
    keyword[7] = NULL;

    if (StrICmp(keyword, "DOCTYPE")) {
        t->state = HTML_STATE_BOGUS_COMMENT;
        return;
    }

    t->inputBuffer.pos += 7;
    t->state = HTML_STATE_DOCTYPE;
}

// DOCTYPE state: every input leads to the before-DOCTYPE-name state. A
// whitespace character is consumed; any other character (including '>') is
// put back so that state sees it again.
U0 @tokenizer_html_state_doctype(@html_tokenizer* t)
{
    U8 c;
    Bool isWhitespace;

    @consume_next_input_char(t);
    c = t->currentInputChar;
    isWhitespace = c == '\n' || c == '\r' || c == '\t' || c == ' ';

    // Missing whitespace before the name is a parse error, handled by
    // reconsuming the character in the next state.
    if (!isWhitespace)
        t->inputBuffer.pos--;

    t->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
}

// Before DOCTYPE name state. This tokenizer does not build DOCTYPE tokens, so
// the state only tracks where the declaration is:
//   whitespace - ignored.
//   '>'        - missing-doctype-name parse error; the declaration is over,
//                return to the data state.
//   else       - first character of the name; enter the DOCTYPE name state.
U0 @tokenizer_html_state_before_doctype_name(@html_tokenizer* t)
{
    @consume_next_input_char(t);
    switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
        // Ignore the character.
        break;
    case '>':
        // The '>' closes the declaration. It must not be emitted as a
        // character: doing so put a literal ">" into the document text for
        // input such as "<!DOCTYPE>".
        t->state = HTML_STATE_DATA;
        break;
    default:
        // Upper-case and all other characters alike start the name.
        t->state = HTML_STATE_DOCTYPE_NAME;
        break;
    }
}

// DOCTYPE name state: skips over the name of the document type. The name is
// not recorded anywhere, so ordinary characters are consumed and dropped.
//   whitespace - the name is over; enter the after-DOCTYPE-name state.
//   '>'        - the declaration is over; return to the data state.
U0 @tokenizer_html_state_doctype_name(@html_tokenizer* t)
{
    U8 c;

    @consume_next_input_char(t);
    c = t->currentInputChar;

    if (c == '>')
        t->state = HTML_STATE_DATA;
    else if (c == '\n' || c == '\r' || c == '\t' || c == ' ')
        t->state = HTML_STATE_AFTER_DOCTYPE_NAME;
    // Any other character is part of the name: nothing to do.
}

// After DOCTYPE name state: handles what follows the document type name.
//   whitespace  - ignored.
//   '>'         - the declaration is over; return to the data state.
//   'A'..'Z'    - go back to the DOCTYPE name state.
//   else        - reconsume the character in the bogus DOCTYPE state.
U0 @tokenizer_html_state_after_doctype_name(@html_tokenizer* t)
{
    U8 c;

    @consume_next_input_char(t);
    c = t->currentInputChar;

    if (c == '\n' || c == '\r' || c == '\t' || c == ' ')
        return;

    if (c == '>') {
        t->state = HTML_STATE_DATA;
    } else if (c >= 'A' && c <= 'Z') {
        t->state = HTML_STATE_DOCTYPE_NAME;
    } else {
        t->inputBuffer.pos--;
        t->state = HTML_STATE_BOGUS_DOCTYPE;
    }
}

// Bogus DOCTYPE state: discards characters until the closing '>' is consumed,
// then returns to the data state.
U0 @tokenizer_html_state_bogus_doctype(@html_tokenizer* t)
{
    @consume_next_input_char(t);

    // Everything except '>' is ignored.
    if (t->currentInputChar == '>')
        t->state = HTML_STATE_DATA;
}

// Bogus comment state: discards characters until the closing '>' is consumed,
// then returns to the data state. The comment text is not kept.
U0 @tokenizer_html_state_bogus_comment(@html_tokenizer* t)
{
    @consume_next_input_char(t);

    // Everything except '>' is ignored.
    if (t->currentInputChar == '>')
        t->state = HTML_STATE_DATA;
}

// Tag name state: collects the name of the tag in t->currentNode.
//   whitespace - name complete; enter the before-attribute-name state.
//   '/'        - enter the self-closing start tag state.
//   '>'        - emit the node and return to the data state.
//   else       - append the character, lower-casing 'A'..'Z'.
//
// The name lives in the node's fixed 32-byte tagName field, so at most 31
// characters are stored; further characters of an overlong name are dropped
// rather than written past the field.
//
// Side effects while appending: each time the name becomes exactly "img" the
// image counter is incremented, and each time it becomes exactly "body" the
// node receives its default colours.
U0 @tokenizer_html_state_tag_name(@html_tokenizer* t)
{
    I64 ch;
    I64 len;

    @consume_next_input_char(t);
    switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
        // Switch to the before attribute name state.
        t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
        break;
    case '/':
        // Switch to the self-closing start tag state.
        t->state = HTML_STATE_SELF_CLOSING_START_TAG;
        break;
    case '>':
        // Switch to the data state. Emit the current tag token.
        @emit_current_node(t);
        t->state = HTML_STATE_DATA;
        break;
    default:
        // Append the current input character to the tag name, using the
        // lower-case version (add 0x20) for 'A'..'Z'.
        ch = t->currentInputChar;
        if (ch >= 'A' && ch <= 'Z')
            ch += 0x20;

        len = StrLen(t->currentNode->tagName);
        if (len < 31) {
            // The node was zero-allocated, so the byte after the new
            // character is still the terminator.
            t->currentNode->tagName[len] = ch;
            if (!StrICmp(t->currentNode->tagName, "img"))
                t->numOfImgNodes++;
            if (!StrICmp(t->currentNode->tagName, "body")) {
                t->currentNode->backgroundColor = Color(255, 255, 255);
                t->currentNode->color = Color(0, 0, 0);
            }
        }
        break;
    }
}

// Before attribute name state.
//   whitespace  - ignored.
//   '/' or '>'  - reconsume in the after-attribute-name state.
//   '='         - parse error (unexpected equals sign): start a new attribute
//                 whose name begins with '=' and enter the attribute name state.
//   else        - start a new attribute with empty name and value and
//                 reconsume the character in the attribute name state.
U0 @tokenizer_html_state_before_attribute_name(@html_tokenizer* t)
{
    U8 c;

    @consume_next_input_char(t);
    c = t->currentInputChar;

    if (c == '\n' || c == '\r' || c == '\t' || c == ' ')
        return;

    if (c == '/' || c == '>') {
        t->inputBuffer.pos--;
        t->state = HTML_STATE_AFTER_ATTRIBUTE_NAME;
        return;
    }

    // Both remaining cases begin a fresh attribute.
    t->currentAttribute = CAlloc(sizeof(JsonKey), t->mem_task);
    t->currentAttribute->name = @init_growable_string(t->mem_task);
    t->currentAttribute->value = @init_growable_string(t->mem_task);

    if (c == '=') {
        // The stray '=' becomes the first character of the name.
        t->currentAttribute->name = @append_char_to_growable_string(
            t->currentAttribute->name, t->currentInputChar, t->mem_task);
    } else {
        // Let the attribute name state read this character again.
        t->inputBuffer.pos--;
    }
    t->state = HTML_STATE_ATTRIBUTE_NAME;
}

// Attribute name state: collects the name of the current attribute.
//   whitespace, '/' or '>' - reconsume in the after-attribute-name state.
//   '='                    - enter the before-attribute-value state.
//   else                   - append the character to the name, lower-casing
//                            'A'..'Z'. The characters '"', '\'' and '<' are
//                            parse errors but are appended like any other.
U0 @tokenizer_html_state_attribute_name(@html_tokenizer* t)
{
    I64 c;

    @consume_next_input_char(t);
    c = t->currentInputChar;

    if (c == '\n' || c == '\r' || c == '\t' || c == ' ' || c == '/' || c == '>') {
        t->inputBuffer.pos--;
        t->state = HTML_STATE_AFTER_ATTRIBUTE_NAME;
        return;
    }

    if (c == '=') {
        t->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE;
        return;
    }

    // Lower-case version: add 0x0020 to the character's code point.
    if (c >= 'A' && c <= 'Z')
        c += 0x20;

    t->currentAttribute->name = @append_char_to_growable_string(
        t->currentAttribute->name, c, t->mem_task);
}

// Before attribute value state: picks the value style from the first
// character after '='.
//   whitespace - ignored.
//   '"'        - enter the double-quoted value state.
//   '\''       - enter the single-quoted value state.
//   '>'        - missing-attribute-value parse error: emit the node and
//                return to the data state.
//   else       - reconsume the character in the unquoted value state.
U0 @tokenizer_html_state_before_attribute_value(@html_tokenizer* t)
{
    U8 c;

    @consume_next_input_char(t);
    c = t->currentInputChar;

    if (c == '\n' || c == '\r' || c == '\t' || c == ' ')
        return;

    if (c == '"') {
        t->state = HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED;
    } else if (c == '\'') {
        t->state = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED;
    } else if (c == '>') {
        @emit_current_node(t);
        t->state = HTML_STATE_DATA;
    } else {
        t->inputBuffer.pos--;
        t->state = HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED;
    }
}

// Attribute value (double-quoted) state.
//   '"'  - value complete: store the attribute on the current node and enter
//          the after-attribute-value (quoted) state.
//   else - append the character to the value. '&' gets no special treatment,
//          so character references inside the value are kept verbatim.
U0 @tokenizer_html_state_attribute_value_double_quoted(@html_tokenizer* t)
{
    @consume_next_input_char(t);

    if (t->currentInputChar == '"') {
        @set_current_attribute_on_current_node(t);
        t->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED;
        return;
    }

    t->currentAttribute->value = @append_char_to_growable_string(
        t->currentAttribute->value, t->currentInputChar, t->mem_task);
}

// Attribute value (unquoted) state.
//   whitespace - value complete: store the attribute on the current node and
//                enter the before-attribute-name state.
//   '>'        - value complete: store the attribute, emit the node and
//                return to the data state.
//   else       - append the character to the value. '&' gets no special
//                treatment, so character references inside the value are
//                kept verbatim.
//
// The attribute has to be stored here, as the quoted states do on their
// closing quote. Otherwise "<a href=x>" loses href entirely, and in
// "<a href=x class=y>" the first attribute is replaced by the next one before
// it was ever recorded.
U0 @tokenizer_html_state_attribute_value_unquoted(@html_tokenizer* t)
{
    @consume_next_input_char(t);
    switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
        // Switch to the before attribute name state.
        @set_current_attribute_on_current_node(t);
        t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
        break;
    case '>':
        // Switch to the data state. Emit the current tag token.
        @set_current_attribute_on_current_node(t);
        @emit_current_node(t);
        t->state = HTML_STATE_DATA;
        break;
    default:
        // Append the current input character to the current attribute's value.
        t->currentAttribute->value = @append_char_to_growable_string(
            t->currentAttribute->value, t->currentInputChar, t->mem_task);
        break;
    }
}

// Attribute value (single-quoted) state.
//   '\'' - value complete: store the attribute on the current node and enter
//          the after-attribute-value (quoted) state.
//   else - append the character to the value. '&' gets no special treatment,
//          so character references inside the value are kept verbatim.
U0 @tokenizer_html_state_attribute_value_single_quoted(@html_tokenizer* t)
{
    @consume_next_input_char(t);

    if (t->currentInputChar == '\'') {
        @set_current_attribute_on_current_node(t);
        t->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED;
        return;
    }

    t->currentAttribute->value = @append_char_to_growable_string(
        t->currentAttribute->value, t->currentInputChar, t->mem_task);
}

// After attribute value (quoted) state: handles the character following a
// closing quote.
//   whitespace - enter the before-attribute-name state.
//   '/'        - enter the self-closing start tag state.
//   '>'        - emit the node and return to the data state.
//   else       - missing-whitespace-between-attributes parse error:
//                reconsume the character in the before-attribute-name state.
U0 @tokenizer_html_state_after_attribute_value_quoted(@html_tokenizer* t)
{
    U8 c;

    @consume_next_input_char(t);
    c = t->currentInputChar;

    if (c == '/') {
        t->state = HTML_STATE_SELF_CLOSING_START_TAG;
        return;
    }

    if (c == '>') {
        @emit_current_node(t);
        t->state = HTML_STATE_DATA;
        return;
    }

    // Anything that is not whitespace is handed back to be read again.
    if (c != '\n' && c != '\r' && c != '\t' && c != ' ')
        t->inputBuffer.pos--;
    t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
}

// End tag open state: handles the character following "</".
//   letter - create an end tag node (its name starts with '/') and reconsume
//            the letter in the tag name state.
//   '>'    - missing-end-tag-name parse error: return to the data state.
//   else   - invalid-first-character-of-tag-name parse error: reconsume the
//            character in the bogus comment state.
U0 @tokenizer_html_state_end_tag_open(@html_tokenizer* t)
{
    U8 c;
    Bool isLetter;

    @consume_next_input_char(t);
    c = t->currentInputChar;
    isLetter = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');

    if (isLetter) {
        // The leading '/' in the name marks the node as an end tag.
        t->currentNode = @create_new_node("/", t->mem_task);
        t->inputBuffer.pos--;
        t->state = HTML_STATE_TAG_NAME;
    } else if (c == '>') {
        t->state = HTML_STATE_DATA;
    } else {
        t->inputBuffer.pos--;
        t->state = HTML_STATE_BOGUS_COMMENT;
    }
}

// After attribute name state.
//
// Entered once an attribute name has been read and something other than '='
// followed it. Consumes one character:
//   whitespace  ignored
//   '/'         switches to the self-closing start tag state
//   '='         switches to the before attribute value state
//   '>'         attaches the pending attribute to the current node, emits the
//               current tag token and returns to the data state
//   other       the pending attribute is complete (it has no value): attach
//               it to the current node, then start a fresh attribute with an
//               empty name and value and reconsume the character in the
//               attribute name state
U0 @tokenizer_html_state_after_attribute_name(@html_tokenizer* t)
{
    @consume_next_input_char(t);
    switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
        // Ignore the character.
        break;
    /*
    case '"':
      // Switch to the attribute value (double-quoted) state.
      t->state = HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED;
      break;
    case '\'':
      // Switch to the attribute value (single-quoted) state.
      t->state = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED;
      break;
    */
    case '/':
        // Switch to the self-closing start tag state.
        // NOTE(review): the pending attribute is not attached on this path, so
        // "<tag attr/>" appears to lose attr unless a later state attaches it
        // - confirm against the states that follow.
        t->state = HTML_STATE_SELF_CLOSING_START_TAG;
        break;
    case '=':
        // Switch to the before attribute value state.
        t->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE;
        break;
    case '>':
        // Switch to the data state. Emit the current tag token.
        @set_current_attribute_on_current_node(t);
        @emit_current_node(t);
        t->state = HTML_STATE_DATA;
        break;
    default:
        // A new attribute begins here, which means the pending one is
        // finished. Attach it first (as the '>' case does); otherwise the
        // pointer is overwritten below and the valueless attribute is lost.
        @set_current_attribute_on_current_node(t);
        // Start a new attribute in the current tag token. Set that attribute
        // name and value to the empty string. Reconsume in the attribute name
        // state.
        t->currentAttribute = CAlloc(sizeof(JsonKey), t->mem_task);
        t->currentAttribute->name = @init_growable_string(t->mem_task);
        t->currentAttribute->value = @init_growable_string(t->mem_task);
        t->inputBuffer.pos--;
        t->state = HTML_STATE_ATTRIBUTE_NAME;
        break;
    }
}

// Self-closing start tag state (entered after a '/' inside a tag).
//
// Consumes one character. A '>' emits the current tag token and returns to
// the data state; no self-closing flag is recorded here. Anything else is an
// unexpected-solidus-in-tag parse error and is reconsumed in the before
// attribute name state.
U0 @tokenizer_html_state_self_closing_start_tag(@html_tokenizer* t)
{
    @consume_next_input_char(t);

    if (t->currentInputChar == '>') {
        @emit_current_node(t);
        t->state = HTML_STATE_DATA;
    } else {
        // Hand the character back so it is read again as attribute input.
        t->inputBuffer.pos--;
        t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
    }
}

// Character reference state (entered after '&').
//
// Resets the temporary buffer to "&", consumes one character and dispatches:
//   '#'           appended to the temporary buffer; switches to the numeric
//                 character reference state
//   ASCII letter  reconsumed in the named character reference state
//   other         requests that the temporary buffer be flushed and switches
//                 to the saved return state
U0 @tokenizer_html_state_character_reference(@html_tokenizer* t)
{
    // The temporary buffer always starts out holding just the ampersand.
    @empty_temp_buffer(t);
    @append_char_to_temp_buffer(t, '&');

    @consume_next_input_char(t);

    if (t->currentInputChar == '#') {
        // "&#" introduces a numeric reference; keep the '#' in the buffer.
        @append_char_to_temp_buffer(t, '#');
        t->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE;
        return;
    }

    switch (t->currentInputChar) {
    case 'a' ... 'z':
    case 'A' ... 'Z':
        // Give the letter back; the named reference state reads it again.
        t->inputBuffer.pos--;
        t->state = HTML_STATE_NAMED_CHARACTER_REFERENCE;
        break;
    default:
        // Not a reference after all: flush what was collected and go back to
        // the return state.
        // NOTE(review): the character consumed above is neither appended to
        // the temporary buffer nor handed back (no pos-- here), although this
        // branch is described as a reconsume - confirm how consumeTempBuffer
        // is processed before changing it.
        t->consumeTempBuffer = TRUE;
        t->state = t->returnState;
        break;
    }
}

// Named character reference state.
//
// Each call consumes one character and appends it to the temporary buffer.
// The state is left only when that character is ';': the temporary buffer is
// then replaced through @replace_temp_buffer_with_named_character_reference,
// a flush of the temporary buffer is requested, and the tokenizer switches to
// the saved return state. Any other character keeps the tokenizer in this
// state, so the name accumulates one character per call.
U0 @tokenizer_html_state_named_character_reference(@html_tokenizer* t)
{
    @consume_next_input_char(t);
    @append_char_to_temp_buffer(t, t->currentInputChar);

    // Keep collecting until the terminating semicolon shows up.
    if (t->currentInputChar != ';')
        return;

    @replace_temp_buffer_with_named_character_reference(t);
    t->consumeTempBuffer = TRUE;
    t->state = t->returnState;
}

// Numeric character reference state.
//
// Each call consumes one character and appends it to the temporary buffer.
// On ';' the temporary buffer is replaced through
// @replace_temp_buffer_with_numeric_character_reference, a flush of the
// temporary buffer is requested, and the tokenizer switches to the saved
// return state. Any other character leaves the state unchanged.
U0 @tokenizer_html_state_numeric_character_reference(@html_tokenizer* t)
{
    @consume_next_input_char(t);
    @append_char_to_temp_buffer(t, t->currentInputChar);

    // Only the terminating semicolon ends the reference.
    if (t->currentInputChar != ';')
        return;

    @replace_temp_buffer_with_numeric_character_reference(t);
    t->consumeTempBuffer = TRUE;
    t->state = t->returnState;
}

// Comment start state (entered after "<!--").
//
// Consumes one character:
//   '-'    a dash right after "<!--". The tokenizer's dispatch loop has no
//          handler for a separate comment start dash state, so the dash is
//          resolved here: "<!--->" closes the comment immediately, anything
//          else is reconsumed in the comment state (whose "-->" check then
//          also recognises "<!---->")
//   '>'    abrupt-closing-of-empty-comment parse error; back to the data state
//   other  reconsumed in the comment state
// Comment contents are not stored; no comment token is emitted.
U0 @tokenizer_html_state_comment_start(@html_tokenizer* t)
{
    @consume_next_input_char(t);
    switch (t->currentInputChar) {
    case '-':
        if (t->inputBuffer.pos < t->inputBuffer.size && t->inputBuffer.data[t->inputBuffer.pos] == '>') {
            // "<!--->": consume the '>' as well and leave the comment.
            t->inputBuffer.pos++;
            t->state = HTML_STATE_DATA;
        } else {
            // Put the dash back so the comment state sees it as the possible
            // start of a closing "-->".
            t->inputBuffer.pos--;
            t->state = HTML_STATE_COMMENT;
        }
        break;
    case '>':
        // This is an abrupt-closing-of-empty-comment parse error. Switch to the
        // data state.
        t->state = HTML_STATE_DATA;
        break;
    default:
        // Reconsume in the comment state.
        t->inputBuffer.pos--;
        t->state = HTML_STATE_COMMENT;
        break;
    }
}

// Comment state.
//
// If the next three input characters are "-->", consumes them and returns to
// the data state. Otherwise consumes (and discards) a single character and
// stays in the comment state.
//
// The look-ahead is bounded by inputBuffer.size so that a comment running up
// to the end of the input never reads past the buffer.
U0 @tokenizer_html_state_comment(@html_tokenizer* t)
{
    I64 pos = t->inputBuffer.pos;

    if (pos + 2 < t->inputBuffer.size && t->inputBuffer.data[pos] == '-' && t->inputBuffer.data[pos + 1] == '-' && t->inputBuffer.data[pos + 2] == '>') {
        // Consume those three characters, and switch to the data state.
        t->inputBuffer.pos += 3;
        t->state = HTML_STATE_DATA;
        return;
    }
    @consume_next_input_char(t);
}

// Recursively prints the node tree rooted at `node`.
//
// Every node whose tag name is neither "InternalTextNode" nor "Document"
// (compared with StrICmp) is printed on its own line: t->nodeTreeDepth dashes,
// then the tag name and the parent's tag name and address. Children are
// visited in index order with t->nodeTreeDepth raised by 2 for the duration
// of the visit.
U0 @dump_node(@html_tokenizer* t, @html_dom_node* node)
{
    I64 dash;
    I64 child;

    // StrICmp is non-zero on mismatch, so both must mismatch for a printout.
    if (StrICmp(node->tagName, "InternalTextNode") && StrICmp(node->tagName, "Document")) {
        for (dash = 0; dash < t->nodeTreeDepth; dash++)
            "-";
        "<%s> : parentNode: <%s 0x%08x>\n", node->tagName,
            node->parentNode->tagName, node->parentNode;
    }

    if (node->children->length) {
        // Children are drawn one level (two dashes) deeper than this node.
        t->nodeTreeDepth += 2;
        for (child = 0; child < node->children->length; child++)
            @dump_node(t, node->children->@(child));
        t->nodeTreeDepth -= 2;
    }
}

// Prints the whole node tree of tokenizer `t`, starting at t->originNode,
// followed by a blank line.
//
// The depth counter starts at -2 because @dump_node adds 2 before descending
// into children, which puts the origin node's direct children at depth 0.
U0 @dump_node_list(@html_tokenizer* t)
{
    @html_dom_node* root = t->originNode;

    t->nodeTreeDepth = -2;
    @dump_node(t, root);
    "\n";
}

// Tokenizes an HTML buffer and returns the node tree that was built.
//
// Parameters:
//   buffer         input text; scanning stops at the first zero byte
//   size           number of bytes of `buffer` to scan at most
//   mem_task       passed to @init_tokenizer (and from there to allocations)
//   num_of_images  if not NULL, receives the tokenizer's numOfImgNodes count
//
// Returns the tokenizer's originNode.
//
// The loop calls the handler for the current tokenizer state until the input
// is exhausted. If the tokenizer reaches a state with no handler, the error
// is reported once and tokenization stops; no handler would advance the input
// or change the state, so continuing would repeat the same error forever. The
// nodes built up to that point are still returned.
@html_dom_node* @html_tokenize_and_create_node_list(U8* buffer, I64 size, CTask* mem_task,
    I64* num_of_images)
{
    @html_tokenizer t;
    Bool halted = FALSE;

    @init_tokenizer(&t, buffer, size, mem_task);
    while (!halted && t.inputBuffer.pos < t.inputBuffer.size && buffer[t.inputBuffer.pos]) {
        switch (t.state) {
        case HTML_STATE_DATA:
            @tokenizer_html_state_data(&t);
            break;
        case HTML_STATE_TAG_OPEN:
            @tokenizer_html_state_tag_open(&t);
            break;
        case HTML_STATE_MARKUP_DECLARATION_OPEN:
            @tokenizer_html_state_markup_declaration_open(&t);
            break;
        case HTML_STATE_DOCTYPE:
            @tokenizer_html_state_doctype(&t);
            break;
        case HTML_STATE_BEFORE_DOCTYPE_NAME:
            @tokenizer_html_state_before_doctype_name(&t);
            break;
        case HTML_STATE_DOCTYPE_NAME:
            @tokenizer_html_state_doctype_name(&t);
            break;
        case HTML_STATE_TAG_NAME:
            @tokenizer_html_state_tag_name(&t);
            break;
        case HTML_STATE_BEFORE_ATTRIBUTE_NAME:
            @tokenizer_html_state_before_attribute_name(&t);
            break;
        case HTML_STATE_ATTRIBUTE_NAME:
            @tokenizer_html_state_attribute_name(&t);
            break;
        case HTML_STATE_BEFORE_ATTRIBUTE_VALUE:
            @tokenizer_html_state_before_attribute_value(&t);
            break;
        case HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED:
            @tokenizer_html_state_attribute_value_double_quoted(&t);
            break;
        case HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED:
            @tokenizer_html_state_after_attribute_value_quoted(&t);
            break;
        case HTML_STATE_CHARACTER_REFERENCE:
            @tokenizer_html_state_character_reference(&t);
            break;
        case HTML_STATE_END_TAG_OPEN:
            @tokenizer_html_state_end_tag_open(&t);
            break;
        case HTML_STATE_AFTER_ATTRIBUTE_NAME:
            @tokenizer_html_state_after_attribute_name(&t);
            break;
        case HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED:
            @tokenizer_html_state_attribute_value_single_quoted(&t);
            break;
        case HTML_STATE_NAMED_CHARACTER_REFERENCE:
            @tokenizer_html_state_named_character_reference(&t);
            break;
        case HTML_STATE_NUMERIC_CHARACTER_REFERENCE:
            @tokenizer_html_state_numeric_character_reference(&t);
            break;
        case HTML_STATE_AFTER_DOCTYPE_NAME:
            @tokenizer_html_state_after_doctype_name(&t);
            break;
        case HTML_STATE_BOGUS_DOCTYPE:
            @tokenizer_html_state_bogus_doctype(&t);
            break;
        case HTML_STATE_SELF_CLOSING_START_TAG:
            @tokenizer_html_state_self_closing_start_tag(&t);
            break;
        case HTML_STATE_BOGUS_COMMENT:
            @tokenizer_html_state_bogus_comment(&t);
            break;
        case HTML_STATE_COMMENT_START:
            @tokenizer_html_state_comment_start(&t);
            break;
        case HTML_STATE_COMMENT:
            @tokenizer_html_state_comment(&t);
            break;
        case HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED:
            @tokenizer_html_state_attribute_value_unquoted(&t);
            break;
        case HTML_STATE_INVALID:
        default:
            "\n$FG,0$HTML Tokenization error: Invalid or unimplemented "
            "state\nInputBuffer position: %d\nState: %d$FD$\n\n",
                t.inputBuffer.pos, t.state;
            PressAKey;
            // Nothing can make progress from here; leave the loop instead of
            // reporting the same position and state again and again.
            halted = TRUE;
            break;
        }
    }
    @html_dom_node* node_list = t.originNode;
    // The image count is optional output; skip it when no slot was supplied.
    if (num_of_images)
        *num_of_images = t.numOfImgNodes;
    return node_list;
}