// HTML tokenizer and DOM tree builder.
//
// The tokenizer is a state machine loosely following the WHATWG HTML
// tokenization algorithm. Each state handler consumes (at most) one input
// character and either updates the node under construction or switches state.
// Finished nodes are linked into a tree rooted at a synthetic "Document" node.

#define GROWABLE_STRING_INCREMENT_SIZE 16

// Capacity in bytes (including the NUL terminator) of the tokenizer's
// temporary buffer and of @html_dom_node.tagName.
#define HTML_TEMP_BUFFER_SIZE 512
#define HTML_TAG_NAME_SIZE 32

// Stand-ins stored in text for a literal '&' and '<'. Flushed character
// references are re-read by the data state, where a raw '&' or '<' would be
// tokenized again as markup. The named reference table maps the ampersand and
// less-than entities to these same bytes.
// NOTE(review): consumers of node->text presumably translate these back to
// '&' and '<' - confirm against the renderer.
#define HTML_CHAR_PLACEHOLDER_AMP 0x11
#define HTML_CHAR_PLACEHOLDER_LT 0x12

// Tokenizer states.
#define HTML_STATE_INVALID 0
#define HTML_STATE_DATA 1
#define HTML_STATE_RCDATA 2
#define HTML_STATE_RAWTEXT 3
#define HTML_STATE_SCRIPT_DATA 4
#define HTML_STATE_PLAINTEXT 5
#define HTML_STATE_TAG_OPEN 6
#define HTML_STATE_END_TAG_OPEN 7
#define HTML_STATE_TAG_NAME 8
#define HTML_STATE_RCDATA_LESS_THAN_SIGN 9
#define HTML_STATE_RCDATA_END_TAG_OPEN 10
#define HTML_STATE_RCDATA_END_TAG_NAME 11
#define HTML_STATE_RAWTEXT_LESS_THAN_SIGN 12
#define HTML_STATE_RAWTEXT_END_TAG_OPEN 13
#define HTML_STATE_RAWTEXT_END_TAG_NAME 14
#define HTML_STATE_SCRIPT_DATA_LESS_THAN_SIGN 15
#define HTML_STATE_SCRIPT_DATA_END_TAG_OPEN 16
#define HTML_STATE_SCRIPT_DATA_END_TAG_NAME 17
#define HTML_STATE_SCRIPT_DATA_ESCAPE_START 18
#define HTML_STATE_SCRIPT_DATA_ESCAPE_START_DASH 19
#define HTML_STATE_SCRIPT_DATA_ESCAPED 20
#define HTML_STATE_SCRIPT_DATA_ESCAPED_DASH 21
#define HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH 22
#define HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN 23
#define HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN 24
#define HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME 25
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START 26
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED 27
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH 28
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH 29
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN 30
#define HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END 31
#define HTML_STATE_BEFORE_ATTRIBUTE_NAME 32
#define HTML_STATE_ATTRIBUTE_NAME 33
#define HTML_STATE_AFTER_ATTRIBUTE_NAME 34
#define HTML_STATE_BEFORE_ATTRIBUTE_VALUE 35
#define HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED 36
#define HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED 37
#define HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED 38
#define HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED 39
#define HTML_STATE_SELF_CLOSING_START_TAG 40
#define HTML_STATE_BOGUS_COMMENT 41
#define HTML_STATE_MARKUP_DECLARATION_OPEN 42
#define HTML_STATE_COMMENT_START 43
#define HTML_STATE_COMMENT_START_DASH 44
#define HTML_STATE_COMMENT 45
#define HTML_STATE_COMMENT_LESS_THAN_SIGN 46
#define HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG 47
#define HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH 48
#define HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH 49
#define HTML_STATE_COMMENT_END_DASH 50
#define HTML_STATE_COMMENT_END 51
#define HTML_STATE_COMMENT_END_BANG 52
#define HTML_STATE_DOCTYPE 53
#define HTML_STATE_BEFORE_DOCTYPE_NAME 54
#define HTML_STATE_DOCTYPE_NAME 55
#define HTML_STATE_AFTER_DOCTYPE_NAME 56
#define HTML_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD 57
#define HTML_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER 58
#define HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED 59
#define HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED 60
#define HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER 61
#define HTML_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS 62
#define HTML_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD 63
#define HTML_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER 64
#define HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED 65
#define HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED 66
#define HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER 67
#define HTML_STATE_BOGUS_DOCTYPE 68
#define HTML_STATE_CDATA_SECTION 69
#define HTML_STATE_CDATA_SECTION_BRACKET 70
#define HTML_STATE_CDATA_SECTION_END 71
#define HTML_STATE_CHARACTER_REFERENCE 72
#define HTML_STATE_NAMED_CHARACTER_REFERENCE 73
#define HTML_STATE_AMBIGUOUS_AMPERSAND 74
#define HTML_STATE_NUMERIC_CHARACTER_REFERENCE 75
#define HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE_START 76
#define HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START 77
#define HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE 78
#define HTML_STATE_DECIMAL_CHARACTER_REFERENCE 79
#define HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END 80

// One node of the DOM tree. Element nodes carry the tag name; text runs are
// stored in nodes named "InternalTextNode"; the root is named "Document".
class @html_dom_node : JsonElement {
  @html_dom_node* parentNode;
  U8 tagName[HTML_TAG_NAME_SIZE];
  JsonObject* attributes;
  JsonArray* children;
  U8* text; // growable string, see @append_char_to_growable_string
  I64 textAlign;
  I64 width;
  I64 height;
  U32 backgroundColor;
  U32 color;
  I64 font_size;
  Bool display_block;
};

// A byte buffer with a read cursor.
class @html_input_buffer {
  U8* data;
  I64 size;
  I64 pos;
};

// Complete tokenizer state for one parse.
class @html_tokenizer {
  @html_input_buffer inputBuffer;
  I64 state;
  I64 returnState; // state to resume after a character reference
  U8 currentInputChar;
  JsonKey* currentAttribute; // attribute being built; NULL when none pending
  @html_dom_node* appendNode; // node that receives newly emitted children
  @html_dom_node* currentNode; // node currently being built
  @html_dom_node* originNode; // the "Document" root
  I64 nodeTreeDepth; // indentation used by @dump_node
  I64 dataStateCounter; // characters in the pending text node (0 = none)
  @html_input_buffer tempBuffer; // character reference scratch buffer
  Bool consumeTempBuffer; // TRUE: read input from tempBuffer first
  I64 numOfImgNodes;
  CTask* mem_task;
};

// Rounds numToRound away from... upward to the next multiple of `multiple`
// for non-negative values; negative values are rounded toward zero. A
// multiple of 0 returns numToRound unchanged.
I64 @round_value_up(I64 numToRound, I64 multiple) {
  if (multiple == 0)
    return numToRound;
  I64 remainder = Abs(numToRound) % multiple;
  if (remainder == 0)
    return numToRound;
  if (numToRound < 0)
    return -(Abs(numToRound) - remainder);
  return numToRound + multiple - remainder;
}

// Allocates an empty, zero-filled growable string on mem_task.
U8* @init_growable_string(CTask* mem_task) {
  return CAlloc(GROWABLE_STRING_INCREMENT_SIZE, mem_task);
}

// Appends `char` to growable string `s` and returns the (possibly
// reallocated) string; the caller must use the returned pointer.
//
// Capacity is tracked implicitly: whenever the length crosses a multiple of
// GROWABLE_STRING_INCREMENT_SIZE - 1 a zero-filled buffer of twice the rounded
// length is allocated, so the in-place path always has a free, zeroed byte
// after the appended character.
U8* @append_char_to_growable_string(U8* s, I64 char, CTask* mem_task) {
  I64 len = StrLen(s);
  I64 oldBufSize = @round_value_up(len, GROWABLE_STRING_INCREMENT_SIZE - 1);
  I64 newBufSize = @round_value_up(len + 1, GROWABLE_STRING_INCREMENT_SIZE - 1);
  if (newBufSize > oldBufSize) {
    U8* newBuf = CAlloc(newBufSize * 2, mem_task);
    StrCpy(newBuf, s);
    newBuf[len] = char;
    Free(s);
    return newBuf;
  }
  s[len] = char;
  return s;
}

// Returns TRUE for ASCII letters and digits.
Bool @char_is_alphanumeric(I64 ch) {
  if (ch >= '0' && ch <= '9')
    return TRUE;
  if (ch >= 'A' && ch <= 'Z')
    return TRUE;
  if (ch >= 'a' && ch <= 'z')
    return TRUE;
  return FALSE;
}

// Clears the temporary buffer and rewinds its cursor.
U0 @empty_temp_buffer(@html_tokenizer* t) {
  MemSet(t->tempBuffer.data, NULL, HTML_TEMP_BUFFER_SIZE);
  t->tempBuffer.size = 0;
  t->tempBuffer.pos = 0;
}

// Re-derives the temporary buffer size from its NUL-terminated contents and
// rewinds its cursor. Call after overwriting tempBuffer.data.
U0 @recalculate_temp_buffer_size(@html_tokenizer* t) {
  t->tempBuffer.size = StrLen(t->tempBuffer.data);
  t->tempBuffer.pos = 0;
}

// Replaces a complete named character reference ("&name;") held in the
// temporary buffer with its replacement text. Names are matched
// case-insensitively; unknown names become "?".
U0 @replace_temp_buffer_with_named_character_reference(@html_tokenizer* t) {
  U8* ref = t->tempBuffer.data;
  U8* replacement = "?";

  if (!StrICmp(ref, "&amp;"))
    replacement = "\x11"; // HTML_CHAR_PLACEHOLDER_AMP
  else if (!StrICmp(ref, "&aring;"))
    replacement = "\xc3\x85";
  else if (!StrICmp(ref, "&bull;"))
    replacement = "\xe2\x80\xa2";
  else if (!StrICmp(ref, "&copy;"))
    replacement = "\xc2\xa9";
  else if (!StrICmp(ref, "&ensp;"))
    replacement = " ";
  else if (!StrICmp(ref, "&emsp;"))
    replacement = " ";
  else if (!StrICmp(ref, "&hellip;"))
    replacement = "...";
  else if (!StrICmp(ref, "&mdash;"))
    replacement = "-";
  else if (!StrICmp(ref, "&nbsp;"))
    replacement = " ";
  else if (!StrICmp(ref, "&lt;"))
    replacement = "\x12"; // HTML_CHAR_PLACEHOLDER_LT
  else if (!StrICmp(ref, "&gt;"))
    replacement = ">";
  else if (!StrICmp(ref, "&quot;"))
    replacement = "\"";
  else if (!StrICmp(ref, "&zerowidthspace;"))
    replacement = "";

  StrCpy(t->tempBuffer.data, replacement);
  @recalculate_temp_buffer_size(t);
}

// Lookup table: ASCII hex digit -> value. Every other byte maps to 0.
I64 @hex_table_i;
I64 @hex_table[256];
MemSet(&@hex_table, NULL, sizeof(I64) * 256);
for (@hex_table_i = '0'; @hex_table_i < ':'; @hex_table_i++) {
  @hex_table[@hex_table_i] = @hex_table_i - '0';
}
for (@hex_table_i = 'A'; @hex_table_i < 'G'; @hex_table_i++) {
  @hex_table[@hex_table_i] = 10 + (@hex_table_i - 'A');
}
for (@hex_table_i = 'a'; @hex_table_i < 'g'; @hex_table_i++) {
  @hex_table[@hex_table_i] = 10 + (@hex_table_i - 'a');
}

// Encodes code point `utf` as NUL-terminated UTF-8 into `out` (which must
// hold at least 5 bytes) and returns the number of bytes written before the
// terminator. For values outside 0..0x10FFFF the encoding of U+FFFD is
// written instead and 0 is returned.
I64 @utf8_encode(U8* out, I64 utf) {
  if (utf < 0 || utf > 0x10FFFF) {
    // error - use replacement character
    out[0] = 0xEF;
    out[1] = 0xBF;
    out[2] = 0xBD;
    out[3] = 0;
    return 0;
  }
  if (utf <= 0x7F) {
    // Plain ASCII
    out[0] = utf;
    out[1] = 0;
    return 1;
  }
  if (utf <= 0x07FF) {
    // 2-byte unicode
    out[0] = (((utf >> 6) & 0x1F) | 0xC0);
    out[1] = (((utf >> 0) & 0x3F) | 0x80);
    out[2] = 0;
    return 2;
  }
  if (utf <= 0xFFFF) {
    // 3-byte unicode
    out[0] = (((utf >> 12) & 0x0F) | 0xE0);
    out[1] = (((utf >> 6) & 0x3F) | 0x80);
    out[2] = (((utf >> 0) & 0x3F) | 0x80);
    out[3] = 0;
    return 3;
  }
  // 4-byte unicode
  out[0] = (((utf >> 18) & 0x07) | 0xF0);
  out[1] = (((utf >> 12) & 0x3F) | 0x80);
  out[2] = (((utf >> 6) & 0x3F) | 0x80);
  out[3] = (((utf >> 0) & 0x3F) | 0x80);
  out[4] = 0;
  return 4;
}

// Replaces the temporary buffer with the UTF-8 encoding of `code`.
// '&' and '<' are stored as their placeholder bytes so that the data state
// does not tokenize the flushed character as markup.
U0 @set_temp_buffer_to_code_point(@html_tokenizer* t, I64 code) {
  if (code == '&')
    code = HTML_CHAR_PLACEHOLDER_AMP;
  else if (code == '<')
    code = HTML_CHAR_PLACEHOLDER_LT;
  @utf8_encode(t->tempBuffer.data, code);
  @recalculate_temp_buffer_size(t);
}

// Replaces a decimal reference ("&#NNN;") in the temporary buffer with the
// character it denotes. A buffer too short to hold "&#;" becomes "?".
U0 @replace_temp_buffer_with_dec_character_reference(@html_tokenizer* t) {
  I64 len = StrLen(t->tempBuffer.data);
  if (len < 3) {
    StrCpy(t->tempBuffer.data, "?");
    @recalculate_temp_buffer_size(t);
    return;
  }
  t->tempBuffer.data[len - 1] = NULL; // chop off semicolon
  I64 charCode = Str2I64(t->tempBuffer.data + 2);
  @set_temp_buffer_to_code_point(t, charCode);
}

// Replaces a hexadecimal reference ("&#xHHH;") in the temporary buffer with
// the character it denotes. A buffer too short to hold "&#x;" becomes "?".
U0 @replace_temp_buffer_with_hex_character_reference(@html_tokenizer* t) {
  I64 code = 0;
  I64 len = StrLen(t->tempBuffer.data);
  if (len < 4) {
    StrCpy(t->tempBuffer.data, "?");
    @recalculate_temp_buffer_size(t);
    return;
  }
  t->tempBuffer.data[len - 1] = NULL; // chop off semicolon
  U8* ch = t->tempBuffer.data + 3;
  // Stop as soon as the accumulator goes negative (overflow); the encoder
  // then emits the replacement character.
  while (*ch && code >= 0) {
    code = (code << 4) | @hex_table[*ch++];
  }
  @set_temp_buffer_to_code_point(t, code);
}

// Dispatches a complete numeric reference to the hex or decimal decoder
// based on the character following "&#".
U0 @replace_temp_buffer_with_numeric_character_reference(@html_tokenizer* t) {
  switch (t->tempBuffer.data[2]) {
    case 'x':
    case 'X':
      @replace_temp_buffer_with_hex_character_reference(t);
      break;
    default:
      @replace_temp_buffer_with_dec_character_reference(t);
      break;
  }
}

// Appends `char` to the temporary buffer. The character is silently dropped
// when the buffer has no room left for it plus the NUL terminator.
U0 @append_char_to_temp_buffer(@html_tokenizer* t, I64 char) {
  I64 len = StrLen(t->tempBuffer.data);
  if (len >= HTML_TEMP_BUFFER_SIZE - 1)
    return;
  t->tempBuffer.data[len] = char;
  t->tempBuffer.data[len + 1] = NULL;
  t->tempBuffer.size++;
}

// Returns TRUE while the temporary buffer can take one more reference
// character and still have room for the terminating ';'.
Bool @temp_buffer_has_room(@html_tokenizer* t) {
  return StrLen(t->tempBuffer.data) < HTML_TEMP_BUFFER_SIZE - 2;
}

// Hands the temporary buffer back to the return state as literal text. Used
// when what followed '&' turned out not to be a character reference; the
// leading '&' is stored as its placeholder so it is not tokenized again.
U0 @flush_temp_buffer_as_text(@html_tokenizer* t) {
  if (t->tempBuffer.data[0] == '&')
    t->tempBuffer.data[0] = HTML_CHAR_PLACEHOLDER_AMP;
  @recalculate_temp_buffer_size(t);
  t->consumeTempBuffer = TRUE;
  t->state = t->returnState;
}

// Allocates a node named tagName (truncated to fit @html_dom_node.tagName)
// with empty attributes, children and text.
@html_dom_node* @create_new_node(U8* tagName, CTask* mem_task) {
  @html_dom_node* node = CAlloc(sizeof(@html_dom_node), mem_task);
  I64 len = StrLen(tagName);
  if (len > HTML_TAG_NAME_SIZE - 1)
    len = HTML_TAG_NAME_SIZE - 1;
  MemCpy(node->tagName, tagName, len); // node is zero-filled: stays terminated
  node->attributes = Json.CreateObject(mem_task);
  node->children = Json.CreateArray(mem_task);
  node->text = @init_growable_string(mem_task);
  node->sig = JSON_SIG;
  node->type = JSON_HTML;
  return node;
}

// Prepares `t` to tokenize `size` bytes at `data`, allocating the temporary
// buffer and the "Document" root node on mem_task.
U0 @init_tokenizer(@html_tokenizer* t, U8* data, I64 size, CTask* mem_task) {
  t->mem_task = mem_task;

  t->inputBuffer.data = data;
  t->inputBuffer.size = size;
  t->inputBuffer.pos = 0;

  t->state = HTML_STATE_DATA;
  t->returnState = HTML_STATE_DATA;
  t->currentInputChar = 0;
  t->currentAttribute = NULL;

  t->tempBuffer.data = CAlloc(HTML_TEMP_BUFFER_SIZE, t->mem_task);
  t->tempBuffer.size = 0;
  t->tempBuffer.pos = 0;
  t->consumeTempBuffer = FALSE;

  t->originNode = @create_new_node("Document", t->mem_task);
  t->appendNode = t->originNode;
  t->currentNode = t->originNode;

  t->nodeTreeDepth = 0;
  t->dataStateCounter = 0;
  t->numOfImgNodes = 0;
}

// Loads the next character into t->currentInputChar. Flushed temporary
// buffer content is served first; afterwards characters come from the input
// buffer. Past the end of the input the character is NUL, but the cursor
// still advances so that a following "reconsume" (pos--) stays balanced.
U0 @consume_next_input_char(@html_tokenizer* t) {
  if (t->consumeTempBuffer) {
    if (t->tempBuffer.pos < t->tempBuffer.size) {
      t->currentInputChar = t->tempBuffer.data[t->tempBuffer.pos++];
      return;
    }
    t->consumeTempBuffer = FALSE;
  }
  if (t->inputBuffer.pos < t->inputBuffer.size)
    t->currentInputChar = t->inputBuffer.data[t->inputBuffer.pos];
  else
    t->currentInputChar = 0;
  t->inputBuffer.pos++;
}

// Appends the current input character to the pending text node, creating
// the text node first when none is pending.
U0 @emit_current_character(@html_tokenizer* t) {
  if (!t->dataStateCounter) {
    @html_dom_node* node = @create_new_node("InternalTextNode", t->mem_task);
    t->currentNode = node;
  }
  t->currentNode->text = @append_char_to_growable_string(
      t->currentNode->text, t->currentInputChar, t->mem_task);
  t->dataStateCounter++;
}

// Returns TRUE for nodes that never receive children: text nodes and the
// void elements listed below.
Bool @node_is_self_closing(@html_dom_node* node) {
  U8* name = node->tagName;
  if (!StrICmp(name, "InternalTextNode"))
    return TRUE;
  if (!StrICmp(name, "area"))
    return TRUE;
  if (!StrICmp(name, "base"))
    return TRUE;
  if (!StrICmp(name, "br"))
    return TRUE;
  if (!StrICmp(name, "col"))
    return TRUE;
  if (!StrICmp(name, "embed"))
    return TRUE;
  if (!StrICmp(name, "hr"))
    return TRUE;
  if (!StrICmp(name, "img"))
    return TRUE;
  if (!StrICmp(name, "input"))
    return TRUE;
  if (!StrICmp(name, "link"))
    return TRUE;
  if (!StrICmp(name, "meta"))
    return TRUE;
  if (!StrICmp(name, "param"))
    return TRUE;
  if (!StrICmp(name, "source"))
    return TRUE;
  if (!StrICmp(name, "track"))
    return TRUE;
  if (!StrICmp(name, "wbr"))
    return TRUE;
  return FALSE;
}

// Applies the finished current node to the tree.
//
// End tags (tag name starting with '/') close the nearest open ancestor with
// the same name; an end tag with no matching open element is ignored. Any
// other node is appended to the current append node and, unless it is
// self-closing, becomes the new append node.
U0 @emit_current_node(@html_tokenizer* t) {
  @html_dom_node* origAppendNode = t->appendNode;

  if (t->currentNode->tagName[0] == '/') {
    U8* closingName = t->currentNode->tagName + 1;
    // Walk up until the element being closed is found.
    while (StrICmp(t->appendNode->tagName, closingName)) {
      if (!StrICmp(t->appendNode->tagName, "Document") ||
          !t->appendNode->parentNode) {
        // If we've traversed this far up, then the closing tag is invalid
        t->appendNode = origAppendNode;
        return;
      }
      t->appendNode = t->appendNode->parentNode;
    }
    // The root has no parent; never leave appendNode NULL.
    if (t->appendNode->parentNode)
      t->appendNode = t->appendNode->parentNode;
    return;
  }

  t->currentNode->parentNode = t->appendNode;
  t->appendNode->children->append(t->currentNode);
  if (!@node_is_self_closing(t->currentNode))
    t->appendNode = t->currentNode;
}

// Stores the pending attribute (if any) on the current node and marks it as
// no longer pending, so a repeated call is a no-op.
U0 @set_current_attribute_on_current_node(@html_tokenizer* t) {
  if (!t->currentAttribute)
    return;
  t->currentNode->attributes->set(t->currentAttribute->name,
                                  t->currentAttribute->value, JSON_STRING);
  t->currentAttribute = NULL;
}

// Starts a new pending attribute with empty name and value.
U0 @start_new_attribute(@html_tokenizer* t) {
  t->currentAttribute = CAlloc(sizeof(JsonKey), t->mem_task);
  t->currentAttribute->name = @init_growable_string(t->mem_task);
  t->currentAttribute->value = @init_growable_string(t->mem_task);
}

// Appends `ch` to the pending attribute's name (starting one if needed).
U0 @append_char_to_current_attribute_name(@html_tokenizer* t, I64 ch) {
  if (!t->currentAttribute)
    @start_new_attribute(t);
  t->currentAttribute->name = @append_char_to_growable_string(
      t->currentAttribute->name, ch, t->mem_task);
}

// Appends `ch` to the pending attribute's value (starting one if needed).
U0 @append_char_to_current_attribute_value(@html_tokenizer* t, I64 ch) {
  if (!t->currentAttribute)
    @start_new_attribute(t);
  t->currentAttribute->value = @append_char_to_growable_string(
      t->currentAttribute->value, ch, t->mem_task);
}

// Appends `ch` to the current node's tag name; characters that do not fit
// are dropped. Counts the node as an image when the name becomes "img" and
// applies default colors when it becomes "body".
U0 @append_char_to_current_tag_name(@html_tokenizer* t, I64 ch) {
  I64 len = StrLen(t->currentNode->tagName);
  if (len >= HTML_TAG_NAME_SIZE - 1)
    return;
  t->currentNode->tagName[len] = ch;
  t->currentNode->tagName[len + 1] = NULL;
  if (!StrICmp(t->currentNode->tagName, "img"))
    t->numOfImgNodes++;
  if (!StrICmp(t->currentNode->tagName, "body")) {
    t->currentNode->backgroundColor = Color(255, 255, 255);
    t->currentNode->color = Color(0, 0, 0);
  }
}

// Called right after '<' has been consumed. If the input continues with
// "script", skips everything up to and including the closing script tag (or
// to the end of input when there is none) and returns TRUE; otherwise leaves
// the input untouched and returns FALSE.
Bool @skip_script_data(@html_tokenizer* t) {
  // FIXME: This will work in most cases, except for when tags are escaped in
  // SCRIPT data.
  U8 cmpbuf[16];
  I64 size = t->inputBuffer.size;

  if (t->inputBuffer.pos + 6 > size)
    return FALSE;
  MemSet(cmpbuf, NULL, 16);
  MemCpy(cmpbuf, t->inputBuffer.data + t->inputBuffer.pos, 6);
  if (StrICmp(cmpbuf, "script"))
    return FALSE;

  t->inputBuffer.pos += 6;
  while (t->inputBuffer.pos + 9 <= size) {
    MemSet(cmpbuf, NULL, 16);
    MemCpy(cmpbuf, t->inputBuffer.data + t->inputBuffer.pos, 9);
    if (!StrICmp(cmpbuf, "</script>")) {
      t->inputBuffer.pos += 9;
      return TRUE;
    }
    ++t->inputBuffer.pos;
  }
  // Unterminated script: nothing after it is markup.
  t->inputBuffer.pos = size;
  return TRUE;
}

U0 @tokenizer_html_state_data(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '&':
      // Set the return state to the data state. Switch to the character
      // reference state.
      t->returnState = HTML_STATE_DATA;
      t->state = HTML_STATE_CHARACTER_REFERENCE;
      break;
    case '<':
      if (!@skip_script_data(t)) {
        // Flush the pending text node, then switch to the tag open state.
        if (t->dataStateCounter)
          @emit_current_node(t);
        t->dataStateCounter = 0;
        t->state = HTML_STATE_TAG_OPEN;
      }
      break;
    default:
      // Emit the current input character as a character token.
      @emit_current_character(t);
      break;
  }
}

U0 @tokenizer_html_state_tag_open(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '!':
      // Switch to the markup declaration open state.
      t->state = HTML_STATE_MARKUP_DECLARATION_OPEN;
      break;
    case '/':
      // Switch to the end tag open state.
      t->state = HTML_STATE_END_TAG_OPEN;
      break;
    case 'A' ... 'Z':
    case 'a' ... 'z':
      // Create a new start tag token, set its tag name to the empty string.
      // Reconsume in the tag name state.
      @html_dom_node* node = @create_new_node("", t->mem_task);
      t->currentNode = node;
      t->currentAttribute = NULL; // nothing pending for the new tag
      t->inputBuffer.pos--;
      t->state = HTML_STATE_TAG_NAME;
      break;
    case '?':
      // This is an unexpected-question-mark-instead-of-tag-name parse error.
      // Create a comment token whose data is the empty string. Reconsume in
      // the bogus comment state.
      t->inputBuffer.pos--;
      t->state = HTML_STATE_BOGUS_COMMENT;
      break;
    default:
      // This is an invalid-first-character-of-tag-name parse error. Emit a
      // U+003C LESS-THAN SIGN character token (as its placeholder).
      // Reconsume in the data state.
      t->inputBuffer.pos--;
      t->currentInputChar = HTML_CHAR_PLACEHOLDER_LT;
      @emit_current_character(t);
      t->state = HTML_STATE_DATA;
      break;
  }
}

U0 @tokenizer_html_state_markup_declaration_open(@html_tokenizer* t) {
  I64 pos = t->inputBuffer.pos;
  I64 size = t->inputBuffer.size;

  if (pos + 2 <= size && t->inputBuffer.data[pos] == '-' &&
      t->inputBuffer.data[pos + 1] == '-') {
    // Consume those two characters, create a comment token whose data is the
    // empty string, and switch to the comment state.
    t->inputBuffer.pos += 2;
    t->state = HTML_STATE_COMMENT;
    return;
  }

  if (pos + 7 <= size) {
    U8 buf[8];
    buf[7] = NULL;
    MemCpy(buf, t->inputBuffer.data + pos, 7);
    if (!StrICmp(buf, "DOCTYPE")) {
      // Consume those characters and switch to the DOCTYPE state.
      t->inputBuffer.pos += 7;
      t->state = HTML_STATE_DOCTYPE;
      return;
    }
  }

  t->state = HTML_STATE_BOGUS_COMMENT;
}

U0 @tokenizer_html_state_doctype(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
      // Switch to the before DOCTYPE name state.
      t->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
      break;
    default:
      // '>' or a missing-whitespace-before-doctype-name parse error:
      // reconsume in the before DOCTYPE name state.
      t->inputBuffer.pos--;
      t->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
      break;
  }
}

// DOCTYPE tokens are recognized but not recorded in the tree.
U0 @tokenizer_html_state_before_doctype_name(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
      // Ignore the character.
      break;
    case '>':
      // This is a missing-doctype-name parse error. Switch to the data state.
      t->state = HTML_STATE_DATA;
      break;
    default:
      // First character of the DOCTYPE name. Switch to the DOCTYPE name
      // state.
      t->state = HTML_STATE_DOCTYPE_NAME;
      break;
  }
}

U0 @tokenizer_html_state_doctype_name(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
      // Switch to the after DOCTYPE name state.
      t->state = HTML_STATE_AFTER_DOCTYPE_NAME;
      break;
    case '>':
      // Switch to the data state.
      t->state = HTML_STATE_DATA;
      break;
    default:
      // Part of the DOCTYPE name; the name itself is not recorded.
      break;
  }
}

U0 @tokenizer_html_state_after_doctype_name(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
      // Ignore the character.
      break;
    case 'A' ... 'Z':
      // Switch to the DOCTYPE name state.
      t->state = HTML_STATE_DOCTYPE_NAME;
      break;
    case '>':
      // Switch to the data state.
      t->state = HTML_STATE_DATA;
      break;
    default:
      // Reconsume in the bogus DOCTYPE state.
      t->inputBuffer.pos--;
      t->state = HTML_STATE_BOGUS_DOCTYPE;
      break;
  }
}

U0 @tokenizer_html_state_bogus_doctype(@html_tokenizer* t) {
  @consume_next_input_char(t);
  // Everything up to '>' is ignored; '>' returns to the data state.
  if (t->currentInputChar == '>')
    t->state = HTML_STATE_DATA;
}

U0 @tokenizer_html_state_bogus_comment(@html_tokenizer* t) {
  @consume_next_input_char(t);
  // Everything up to '>' is ignored; '>' returns to the data state.
  if (t->currentInputChar == '>')
    t->state = HTML_STATE_DATA;
}

U0 @tokenizer_html_state_tag_name(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
      // Switch to the before attribute name state.
      t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
      break;
    case '/':
      // Switch to the self-closing start tag state.
      t->state = HTML_STATE_SELF_CLOSING_START_TAG;
      break;
    case '>':
      // Switch to the data state. Emit the current tag token.
      @emit_current_node(t);
      t->state = HTML_STATE_DATA;
      break;
    case 'A' ... 'Z':
      // Append the lowercase version of the current input character (add
      // 0x0020 to the character's code point) to the tag name.
      @append_char_to_current_tag_name(t, t->currentInputChar + 0x20);
      break;
    default:
      // Append the current input character to the tag name.
      @append_char_to_current_tag_name(t, t->currentInputChar);
      break;
  }
}

U0 @tokenizer_html_state_before_attribute_name(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
      // Ignore the character.
      break;
    case '/':
    case '>':
      // Reconsume in the after attribute name state.
      t->inputBuffer.pos--;
      t->state = HTML_STATE_AFTER_ATTRIBUTE_NAME;
      break;
    case '=':
      // This is an unexpected-equals-sign-before-attribute-name parse error.
      // Start a new attribute whose name is the current input character and
      // whose value is the empty string. Switch to the attribute name state.
      @start_new_attribute(t);
      @append_char_to_current_attribute_name(t, t->currentInputChar);
      t->state = HTML_STATE_ATTRIBUTE_NAME;
      break;
    default:
      // Start a new attribute with empty name and value. Reconsume in the
      // attribute name state.
      @start_new_attribute(t);
      t->inputBuffer.pos--;
      t->state = HTML_STATE_ATTRIBUTE_NAME;
      break;
  }
}

U0 @tokenizer_html_state_attribute_name(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
    case '/':
    case '>':
      // Reconsume in the after attribute name state.
      t->inputBuffer.pos--;
      t->state = HTML_STATE_AFTER_ATTRIBUTE_NAME;
      break;
    case '=':
      // Switch to the before attribute value state.
      t->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE;
      break;
    case 'A' ... 'Z':
      // Append the lowercase version of the current input character (add
      // 0x0020 to the character's code point) to the attribute's name.
      @append_char_to_current_attribute_name(t, t->currentInputChar + 0x20);
      break;
    default:
      // Append the current input character to the attribute's name. This
      // includes '"', '\'' and '<', which are
      // unexpected-character-in-attribute-name parse errors.
      @append_char_to_current_attribute_name(t, t->currentInputChar);
      break;
  }
}

U0 @tokenizer_html_state_before_attribute_value(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
      // Ignore the character.
      break;
    case '"':
      // Switch to the attribute value (double-quoted) state.
      t->state = HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED;
      break;
    case '\'':
      // Switch to the attribute value (single-quoted) state.
      t->state = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED;
      break;
    case '>':
      // This is a missing-attribute-value parse error. Keep the attribute
      // with its empty value, emit the current tag token and switch to the
      // data state.
      @set_current_attribute_on_current_node(t);
      @emit_current_node(t);
      t->state = HTML_STATE_DATA;
      break;
    default:
      // Reconsume in the attribute value (unquoted) state.
      t->inputBuffer.pos--;
      t->state = HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED;
      break;
  }
}

// Character references inside attribute values are not decoded.
U0 @tokenizer_html_state_attribute_value_double_quoted(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '"':
      // Store the attribute. Switch to the after attribute value (quoted)
      // state.
      @set_current_attribute_on_current_node(t);
      t->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED;
      break;
    default:
      // Append the current input character to the attribute's value.
      @append_char_to_current_attribute_value(t, t->currentInputChar);
      break;
  }
}

// Character references inside attribute values are not decoded.
U0 @tokenizer_html_state_attribute_value_unquoted(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
      // Store the attribute. Switch to the before attribute name state.
      @set_current_attribute_on_current_node(t);
      t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
      break;
    case '>':
      // Store the attribute, emit the current tag token and switch to the
      // data state.
      @set_current_attribute_on_current_node(t);
      @emit_current_node(t);
      t->state = HTML_STATE_DATA;
      break;
    default:
      // Append the current input character to the attribute's value.
      @append_char_to_current_attribute_value(t, t->currentInputChar);
      break;
  }
}

// Character references inside attribute values are not decoded.
U0 @tokenizer_html_state_attribute_value_single_quoted(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '\'':
      // Store the attribute. Switch to the after attribute value (quoted)
      // state.
      @set_current_attribute_on_current_node(t);
      t->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED;
      break;
    default:
      // Append the current input character to the attribute's value.
      @append_char_to_current_attribute_value(t, t->currentInputChar);
      break;
  }
}

U0 @tokenizer_html_state_after_attribute_value_quoted(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
      // Switch to the before attribute name state.
      t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
      break;
    case '/':
      // Switch to the self-closing start tag state.
      t->state = HTML_STATE_SELF_CLOSING_START_TAG;
      break;
    case '>':
      // Switch to the data state. Emit the current tag token.
      @emit_current_node(t);
      t->state = HTML_STATE_DATA;
      break;
    default:
      // This is a missing-whitespace-between-attributes parse error.
      // Reconsume in the before attribute name state.
      t->inputBuffer.pos--;
      t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
      break;
  }
}

U0 @tokenizer_html_state_end_tag_open(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case 'A' ... 'Z':
    case 'a' ... 'z':
      // Create a new end tag token; the leading '/' in the tag name marks it
      // as an end tag. Reconsume in the tag name state.
      @html_dom_node* node = @create_new_node("/", t->mem_task);
      t->currentNode = node;
      t->currentAttribute = NULL; // nothing pending for the new tag
      t->inputBuffer.pos--;
      t->state = HTML_STATE_TAG_NAME;
      break;
    case '>':
      // This is a missing-end-tag-name parse error. Switch to the data state.
      t->state = HTML_STATE_DATA;
      break;
    default:
      // This is an invalid-first-character-of-tag-name parse error.
      // Reconsume in the bogus comment state.
      t->inputBuffer.pos--;
      t->state = HTML_STATE_BOGUS_COMMENT;
      break;
  }
}

U0 @tokenizer_html_state_after_attribute_name(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '\n':
    case '\r':
    case '\t':
    case ' ':
      // Ignore the character.
      break;
    case '/':
      // Store the value-less attribute. Switch to the self-closing start tag
      // state.
      @set_current_attribute_on_current_node(t);
      t->state = HTML_STATE_SELF_CLOSING_START_TAG;
      break;
    case '=':
      // Switch to the before attribute value state.
      t->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE;
      break;
    case '>':
      // Store the value-less attribute, emit the current tag token and
      // switch to the data state.
      @set_current_attribute_on_current_node(t);
      @emit_current_node(t);
      t->state = HTML_STATE_DATA;
      break;
    default:
      // Store the previous value-less attribute, then start a new attribute
      // with empty name and value. Reconsume in the attribute name state.
      @set_current_attribute_on_current_node(t);
      @start_new_attribute(t);
      t->inputBuffer.pos--;
      t->state = HTML_STATE_ATTRIBUTE_NAME;
      break;
  }
}

U0 @tokenizer_html_state_self_closing_start_tag(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '>':
      // Switch to the data state. Emit the current tag token.
      @emit_current_node(t);
      t->state = HTML_STATE_DATA;
      break;
    default:
      // This is an unexpected-solidus-in-tag parse error. Reconsume in the
      // before attribute name state.
      t->inputBuffer.pos--;
      t->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
      break;
  }
}

U0 @tokenizer_html_state_character_reference(@html_tokenizer* t) {
  // Set the temporary buffer to the empty string. Whatever it still held is
  // discarded, so the next character must come from the input buffer.
  @empty_temp_buffer(t);
  t->consumeTempBuffer = FALSE;
  // Append a U+0026 AMPERSAND (&) character to the temporary buffer.
  @append_char_to_temp_buffer(t, '&');
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case 'A' ... 'Z':
    case 'a' ... 'z':
      // Reconsume in the named character reference state.
      t->inputBuffer.pos--;
      t->state = HTML_STATE_NAMED_CHARACTER_REFERENCE;
      break;
    case '#':
      // Append the current input character to the temporary buffer. Switch
      // to the numeric character reference state.
      @append_char_to_temp_buffer(t, '#');
      t->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE;
      break;
    default:
      // Not a reference: the '&' is literal text. Flush it and reconsume the
      // current character in the return state.
      t->inputBuffer.pos--;
      @flush_temp_buffer_as_text(t);
      break;
  }
}

// Collects the characters of a named reference. ';' completes the reference;
// any non-alphanumeric character (or a full buffer) means the text was not a
// reference and is flushed literally.
U0 @tokenizer_html_state_named_character_reference(@html_tokenizer* t) {
  @consume_next_input_char(t);
  if (t->currentInputChar == ';') {
    @append_char_to_temp_buffer(t, ';');
    @replace_temp_buffer_with_named_character_reference(t);
    t->consumeTempBuffer = TRUE;
    t->state = t->returnState;
    return;
  }
  if (@char_is_alphanumeric(t->currentInputChar) && @temp_buffer_has_room(t)) {
    @append_char_to_temp_buffer(t, t->currentInputChar);
    return;
  }
  t->inputBuffer.pos--;
  @flush_temp_buffer_as_text(t);
}

// Collects the characters of a numeric reference. ';' completes the
// reference; any non-alphanumeric character (or a full buffer) means the
// text was not a terminated reference and is flushed literally.
U0 @tokenizer_html_state_numeric_character_reference(@html_tokenizer* t) {
  @consume_next_input_char(t);
  if (t->currentInputChar == ';') {
    @append_char_to_temp_buffer(t, ';');
    @replace_temp_buffer_with_numeric_character_reference(t);
    t->consumeTempBuffer = TRUE;
    t->state = t->returnState;
    return;
  }
  if (@char_is_alphanumeric(t->currentInputChar) && @temp_buffer_has_room(t)) {
    @append_char_to_temp_buffer(t, t->currentInputChar);
    return;
  }
  t->inputBuffer.pos--;
  @flush_temp_buffer_as_text(t);
}

U0 @tokenizer_html_state_comment_start(@html_tokenizer* t) {
  @consume_next_input_char(t);
  switch (t->currentInputChar) {
    case '-':
      // Switch to the comment start dash state.
      t->state = HTML_STATE_COMMENT_START_DASH;
      break;
    case '>':
      // This is an abrupt-closing-of-empty-comment parse error. Switch to
      // the data state.
      t->state = HTML_STATE_DATA;
      break;
    default:
      // Reconsume in the comment state.
      t->inputBuffer.pos--;
      t->state = HTML_STATE_COMMENT;
      break;
  }
}

// Skips comment content; comments are not recorded in the tree.
U0 @tokenizer_html_state_comment(@html_tokenizer* t) {
  I64 pos = t->inputBuffer.pos;
  if (pos + 3 <= t->inputBuffer.size && t->inputBuffer.data[pos] == '-' &&
      t->inputBuffer.data[pos + 1] == '-' &&
      t->inputBuffer.data[pos + 2] == '>') {
    // Consume those three characters, and switch to the data state.
    t->inputBuffer.pos += 3;
    t->state = HTML_STATE_DATA;
    return;
  }
  @consume_next_input_char(t);
}

// Prints `node` and its descendants, one element per line, indented by tree
// depth. Text nodes and the root produce no line of their own.
U0 @dump_node(@html_tokenizer* t, @html_dom_node* node) {
  I64 i;
  if (StrICmp(node->tagName, "InternalTextNode") &&
      StrICmp(node->tagName, "Document")) {
    for (i = 0; i < t->nodeTreeDepth; i++)
      "-";
    "<%s> : parentNode: <%s 0x%08x>\n", node->tagName,
        node->parentNode->tagName, node->parentNode;
  }
  if (node->children->length) {
    t->nodeTreeDepth += 2;
    for (i = 0; i < node->children->length; i++) {
      @dump_node(t, node->children->@(i));
    }
    t->nodeTreeDepth -= 2;
  }
}

// Prints the whole tree built so far.
U0 @dump_node_list(@html_tokenizer* t) {
  // Start at -2 so that the root's direct children print at depth 0.
  t->nodeTreeDepth = -2;
  @dump_node(t, t->originNode);
  "\n";
}

// Tokenizes `size` bytes of HTML at `buffer` (stopping early at a NUL byte)
// and returns the "Document" root of the resulting node tree. All
// allocations are made on mem_task. The number of img tags seen is written
// to *num_of_images when that pointer is not NULL.
@html_dom_node* @html_tokenize_and_create_node_list(U8* buffer, I64 size,
                                                    CTask* mem_task,
                                                    I64* num_of_images) {
  @html_tokenizer t;
  Bool running = TRUE;
  @init_tokenizer(&t, buffer, size, mem_task);

  while (running && t.inputBuffer.pos < t.inputBuffer.size &&
         buffer[t.inputBuffer.pos]) {
    switch (t.state) {
      case HTML_STATE_DATA:
        @tokenizer_html_state_data(&t);
        break;
      case HTML_STATE_TAG_OPEN:
        @tokenizer_html_state_tag_open(&t);
        break;
      case HTML_STATE_MARKUP_DECLARATION_OPEN:
        @tokenizer_html_state_markup_declaration_open(&t);
        break;
      case HTML_STATE_DOCTYPE:
        @tokenizer_html_state_doctype(&t);
        break;
      case HTML_STATE_BEFORE_DOCTYPE_NAME:
        @tokenizer_html_state_before_doctype_name(&t);
        break;
      case HTML_STATE_DOCTYPE_NAME:
        @tokenizer_html_state_doctype_name(&t);
        break;
      case HTML_STATE_TAG_NAME:
        @tokenizer_html_state_tag_name(&t);
        break;
      case HTML_STATE_BEFORE_ATTRIBUTE_NAME:
        @tokenizer_html_state_before_attribute_name(&t);
        break;
      case HTML_STATE_ATTRIBUTE_NAME:
        @tokenizer_html_state_attribute_name(&t);
        break;
      case HTML_STATE_BEFORE_ATTRIBUTE_VALUE:
        @tokenizer_html_state_before_attribute_value(&t);
        break;
      case HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED:
        @tokenizer_html_state_attribute_value_double_quoted(&t);
        break;
      case HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED:
        @tokenizer_html_state_after_attribute_value_quoted(&t);
        break;
      case HTML_STATE_CHARACTER_REFERENCE:
        @tokenizer_html_state_character_reference(&t);
        break;
      case HTML_STATE_END_TAG_OPEN:
        @tokenizer_html_state_end_tag_open(&t);
        break;
      case HTML_STATE_AFTER_ATTRIBUTE_NAME:
        @tokenizer_html_state_after_attribute_name(&t);
        break;
      case HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED:
        @tokenizer_html_state_attribute_value_single_quoted(&t);
        break;
      case HTML_STATE_NAMED_CHARACTER_REFERENCE:
        @tokenizer_html_state_named_character_reference(&t);
        break;
      case HTML_STATE_NUMERIC_CHARACTER_REFERENCE:
        @tokenizer_html_state_numeric_character_reference(&t);
        break;
      case HTML_STATE_AFTER_DOCTYPE_NAME:
        @tokenizer_html_state_after_doctype_name(&t);
        break;
      case HTML_STATE_BOGUS_DOCTYPE:
        @tokenizer_html_state_bogus_doctype(&t);
        break;
      case HTML_STATE_SELF_CLOSING_START_TAG:
        @tokenizer_html_state_self_closing_start_tag(&t);
        break;
      case HTML_STATE_BOGUS_COMMENT:
        @tokenizer_html_state_bogus_comment(&t);
        break;
      case HTML_STATE_COMMENT_START:
        @tokenizer_html_state_comment_start(&t);
        break;
      case HTML_STATE_COMMENT:
        @tokenizer_html_state_comment(&t);
        break;
      case HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED:
        @tokenizer_html_state_attribute_value_unquoted(&t);
        break;
      case HTML_STATE_INVALID:
      default:
        "\n$FG,0$HTML Tokenization error: Invalid or unimplemented "
        "state\nInputBuffer position: %d\nState: %d$FD$\n\n",
            t.inputBuffer.pos, t.state;
        PressAKey;
        // No handler advances the input in this state; stop instead of
        // reporting the same error forever.
        running = FALSE;
        break;
    }
  }

  // End of input. A reference that was still being collected is literal
  // text.
  if (t.state == HTML_STATE_NAMED_CHARACTER_REFERENCE ||
      t.state == HTML_STATE_NUMERIC_CHARACTER_REFERENCE)
    @flush_temp_buffer_as_text(&t);

  // Flushed reference text that the data state has not read yet still
  // belongs to the pending text node.
  while (t.consumeTempBuffer && t.tempBuffer.pos < t.tempBuffer.size) {
    t.currentInputChar = t.tempBuffer.data[t.tempBuffer.pos++];
    @emit_current_character(&t);
  }
  t.consumeTempBuffer = FALSE;

  // Text after the last tag has no following '<' to emit it.
  if (t.dataStateCounter) {
    @emit_current_node(&t);
    t.dataStateCounter = 0;
  }

  @html_dom_node* node_list = t.originNode;
  if (num_of_images)
    *num_of_images = t.numOfImgNodes;
  return node_list;
}